// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef OPENSSL_HEADER_BN_INTERNAL_H
#define OPENSSL_HEADER_BN_INTERNAL_H

#include <ring-core/base.h>

#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push, 3)
#include <intrin.h>
#pragma warning(pop)
#pragma intrinsic(_umul128)
#endif

#include "../../internal.h"

typedef crypto_word_t BN_ULONG;

#if defined(OPENSSL_64_BIT)

#if defined(BORINGSSL_HAS_UINT128)
// MSVC doesn't support two-word integers on 64-bit.
#define BN_ULLONG uint128_t
#endif

#define BN_BITS2 64
#define BN_MONT_CTX_N0_LIMBS 1
#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo), 0
#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))

#elif defined(OPENSSL_32_BIT)

#define BN_ULLONG uint64_t
#define BN_BITS2 32
// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
// needs to be two words long. Only certain 32-bit platforms actually make use
// of n0[1] and shorter R value would suffice for the others. However,
// currently only the assembly files know which is which.
#define BN_MONT_CTX_N0_LIMBS 2
#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo)
#define TOBN(hi, lo) (lo), (hi)

#else
#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
#endif


// BN_MONTGOMERY_MAX_WORDS is the maximum numer of words allowed in a |BIGNUM|
// used with Montgomery reduction. Ideally this limit would be applied to all
// |BIGNUM|s, in |bn_wexpand|, but the exactfloat library needs to create 8 MiB
// values for other operations.
// #define BN_MONTGOMERY_MAX_WORDS (8 * 1024 / sizeof(BN_ULONG))

// bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words
// long. Inputs and outputs are in Montgomery form. |n0| is a pointer to
// an |N0|.
//
// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
// If neither is fully-reduced, the output may not be either.
//
// This function allocates |num| words on the stack, so |num| should be at most
// |BN_MONTGOMERY_MAX_WORDS|.
//
// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
// off upper bits. The aarch64 implementation expects a 64-bit input and does
// not. |size_t| is the safer option but not strictly correct for x86_64. But
// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot.
//
// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
// inputs.
//
// |num| must be at least 4, at least on x86.
//
// In other forks, |bn_mul_mont| returns an |int| indicating whether it
// actually did the multiplication. All our implementations always do the
// multiplication, and forcing callers to deal with the possibility of it
// failing just leads to further problems.
OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) ||
                      (sizeof(int) == 4 && sizeof(size_t) == 8),
                      "int and size_t ABI mismatch");
#if defined(OPENSSL_X86_64)
void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont_small(
    BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
    bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#elif defined(OPENSSL_AARCH64)
void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont_small(
    BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
    // No point in optimizing for P-256 because P-256 doesn't call into
    // this on AArch64.
    bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#elif defined(OPENSSL_ARM)
void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                        const BN_ULONG *np, const BN_ULONG *n0, size_t num);
void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont_small(
    BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
    // Approximate what `bn_mul_mont` did so that the NEON version for P-256
    // when practical.
    if (num == 8) {
        // XXX: This should not be accessing `neon_available` directly.
        if (neon_available) {
            bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
            return;
        }
    }
    bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#else
void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                 const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont_small(
    BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
    bn_mul_mont(rp, ap, bp, np, n0, num);
}
#endif

static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out,
                                 BN_ULONG a, BN_ULONG b) {
#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
  *low_out = _umul128(a, b, high_out);
#else
  BN_ULLONG result = (BN_ULLONG)a * b;
  *low_out = (BN_ULONG)result;
  *high_out = (BN_ULONG)(result >> BN_BITS2);
#endif
}

#endif  // OPENSSL_HEADER_BN_INTERNAL_H