Fix typo in point_add.

Rather than writing the answer into the output, it wrote it into some awkwardly-named temporaries. Thanks to Daniel Hirche for reporting this issue! Bug: chromium:825273 (cherry picked from commit 5fca613918) Change-Id: I315087f5e7414118a6f70bf4aed76325aa4f76a0
Record whether dummy PQ padding was used.
2018-03-29 15:50:25 -04:00 · 2018-02-28 23:38:53 +00:00 · 2018-02-27 23:50:02 +00:00 · 2018-02-27 20:13:53 +00:00 · 2018-02-27 19:57:12 +00:00 · 2018-02-26 22:14:35 +00:00
1171 changed files with 26070 additions and 27874 deletions
@@ -5,6 +5,7 @@ ssl/test/runner/runner
 doc/*.html
 doc/doc.css

+util/bot/android_ndk
 util/bot/android_tools
 util/bot/cmake-linux64
 util/bot/cmake-linux64.tar.gz
@@ -79,14 +79,15 @@ for other variables which may be used to configure the build.

 ### Building for Android

-It's possible to build BoringSSL with the Android NDK using CMake. This has
-been tested with version 10d of the NDK.
+It's possible to build BoringSSL with the Android NDK using CMake. Recent
+versions of the NDK include a CMake toolchain file which works with CMake 3.6.0
+or later. This has been tested with version r16b of the NDK.

 Unpack the Android NDK somewhere and export `ANDROID_NDK` to point to the
 directory. Then make a build directory as above and run CMake like this:

    cmake -DANDROID_ABI=armeabi-v7a \
-          -DCMAKE_TOOLCHAIN_FILE=../third_party/android-cmake/android.toolchain.cmake \
+          -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
          -DANDROID_NATIVE_API_LEVEL=16 \
          -GNinja ..

@@ -94,7 +95,11 @@ Once you've run that, Ninja should produce Android-compatible binaries.  You
 can replace `armeabi-v7a` in the above with `arm64-v8a` and use API level 21 or
 higher to build aarch64 binaries.

-For other options, see [android-cmake's documentation](./third_party/android-cmake/README.md).
+For older NDK versions, BoringSSL ships a third-party CMake toolchain file. Use
+`../third_party/android-cmake/android.toolchain.cmake` for
+`CMAKE_TOOLCHAIN_FILE` instead.
+
+For other options, see the documentation in the toolchain file.

 ### Building for iOS

@@ -145,18 +150,6 @@ corresponding ARM feature.
 Note that if a feature is enabled in this way, but not actually supported at
 run-time, BoringSSL will likely crash.

-## Assembling ARMv8 with Clang
-
-In order to support the ARMv8 crypto instructions, Clang requires that the
-architecture be `armv8-a+crypto`. However, setting that as a general build flag
-would allow the compiler to assume that crypto instructions are *always*
-supported, even without testing for them.
-
-It's possible to set the architecture in an assembly file using the `.arch`
-directive, but only very recent versions of Clang support this. If
-`BORINGSSL_CLANG_SUPPORTS_DOT_ARCH` is defined then `.arch` directives will be
-used with Clang, otherwise you may need to craft acceptable assembler flags.
-
 # Running tests

 There are two sets of tests: the C/C++ tests and the blackbox tests. The former
@@ -1,5 +1,11 @@
 cmake_minimum_required (VERSION 2.8.11)

+# Report AppleClang separately from Clang. Their version numbers are different.
+# https://cmake.org/cmake/help/v3.0/policy/CMP0025.html
+if(POLICY CMP0025)
+  cmake_policy(SET CMP0025 NEW)
+endif()
+
 # Defer enabling C and CXX languages.
 project (BoringSSL NONE)

@@ -42,17 +48,18 @@ endif()

 if(CMAKE_COMPILER_IS_GNUCXX OR CLANG)
  # Note clang-cl is odd and sets both CLANG and MSVC. We base our configuration
-  # primarily on our normal Clang one because the MSVC one is mostly
-  # suppressions for an overaggressive -Wall.
-  set(C_CXX_FLAGS "-Wall -Werror -Wformat=2 -Wsign-compare -Wmissing-field-initializers -Wwrite-strings")
+  # primarily on our normal Clang one.
+  set(C_CXX_FLAGS "-Werror -Wformat=2 -Wsign-compare -Wmissing-field-initializers -Wwrite-strings")
  if(MSVC)
-    # clang-cl sets different default warnings than clang.
-    set(C_CXX_FLAGS "${C_CXX_FLAGS} -Wno-unused-parameter -fmsc-version=1900")
+    # clang-cl sets different default warnings than clang. It also treats -Wall
+    # as -Weverything, to match MSVC. Instead -W3 is the alias for -Wall.
+    # See http://llvm.org/viewvc/llvm-project?view=revision&revision=319116
+    set(C_CXX_FLAGS "${C_CXX_FLAGS} -W3 -Wno-unused-parameter -fmsc-version=1900")
    # googletest suppresses warning C4996 via a pragma, but clang-cl does not
    # honor it. Suppress it here to compensate. See https://crbug.com/772117.
    set(C_CXX_FLAGS "${C_CXX_FLAGS} -Wno-deprecated-declarations")
  else()
-    set(C_CXX_FLAGS "${C_CXX_FLAGS} -ggdb -fvisibility=hidden -fno-common")
+    set(C_CXX_FLAGS "${C_CXX_FLAGS} -Wall -ggdb -fvisibility=hidden -fno-common")
  endif()

  if(CLANG)
@@ -63,6 +70,18 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CLANG)
    set(C_CXX_FLAGS "${C_CXX_FLAGS} -Wno-free-nonheap-object")
  endif()

+  if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+     NOT "6.0.0" VERSION_GREATER CMAKE_C_COMPILER_VERSION)
+    # Clang's -Wtautological-constant-compare is far too aggressive and does not
+    # account for, say, wanting the same code to work on both 32-bit and 64-bit
+    # platforms.
+    #
+    # Note "Clang" and "AppleClang" version differently, so we check for an
+    # exact match on the COMPILER_ID. As of writing, the warning is not in any
+    # release of AppleClang yet.
+    set(C_CXX_FLAGS "${C_CXX_FLAGS} -Wno-tautological-constant-compare -Wtautological-constant-out-of-range-compare")
+  endif()
+
  if(CLANG OR NOT "7.0.0" VERSION_GREATER CMAKE_C_COMPILER_VERSION)
    set(C_CXX_FLAGS "${C_CXX_FLAGS} -Wimplicit-fallthrough")
  endif()
@@ -321,11 +340,11 @@ else()
  message(FATAL_ERROR "Unknown processor:" ${CMAKE_SYSTEM_PROCESSOR})
 endif()

-if (ANDROID AND ${ARCH} STREQUAL "arm")
-  # The Android-NDK CMake files somehow fail to set the -march flag for
-  # assembly files. Without this flag, the compiler believes that it's
+if (ANDROID AND NOT ANDROID_NDK_REVISION AND ${ARCH} STREQUAL "arm")
+  # The third-party Android-NDK CMake files somehow fail to set the -march flag
+  # for assembly files. Without this flag, the compiler believes that it's
  # building for ARMv5.
-  set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=${CMAKE_SYSTEM_PROCESSOR}")
+  set(CMAKE_ASM_FLAGS "-march=${CMAKE_SYSTEM_PROCESSOR} ${CMAKE_ASM_FLAGS}")
 endif()

 if (${ARCH} STREQUAL "x86" AND APPLE AND ${CMAKE_VERSION} VERSION_LESS "3.0")
@@ -337,8 +356,8 @@ endif()

 # Add minimal googletest targets. The provided one has many side-effects, and
 # googletest has a very straightforward build.
-add_library(gtest third_party/googletest/src/gtest-all.cc)
-target_include_directories(gtest PRIVATE third_party/googletest)
+add_library(boringssl_gtest third_party/googletest/src/gtest-all.cc)
+target_include_directories(boringssl_gtest PRIVATE third_party/googletest)

 include_directories(third_party/googletest/include)

@@ -5,10 +5,9 @@ license. This license is reproduced at the bottom of this file.
 Contributors to BoringSSL are required to follow the CLA rules for Chromium:
 https://cla.developers.google.com/clas

-Some files from Intel are under yet another license, which is also included
-underneath. Files in third_party/ have their own licenses, as described
-therein. The MIT license, for third_party/fiat, which, unlike other third_party
-directories, is compiled into non-test libraries, is included below.
+Files in third_party/ have their own licenses, as described therein. The MIT
+license, for third_party/fiat, which, unlike other third_party directories, is
+compiled into non-test libraries, is included below.

 The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the
 OpenSSL License and the original SSLeay license apply to the toolkit. See below
@@ -158,42 +157,6 @@ ISC license used for completely new code in BoringSSL:
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */


-Some files from Intel carry the following license:
-
-# Copyright (c) 2012, Intel Corporation
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# *  Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#
-# *  Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the
-#    distribution.
-#
-# *  Neither the name of the Intel Corporation nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
 The code in third_party/fiat carries the MIT license:

 Copyright (c) 2015-2016 the fiat-crypto authors (see
@@ -210,7 +210,7 @@ strings and loading algorithms, etc. All of these functions still exist in
 BoringSSL for convenience, but they do nothing and are not necessary.

 The one exception is `CRYPTO_library_init`. In `BORINGSSL_NO_STATIC_INITIALIZER`
-builds, it must be called to query CPU capabitilies before the rest of the
+builds, it must be called to query CPU capabilities before the rest of the
 library. In the default configuration, this is done with a static initializer
 and is also unnecessary.

@@ -53,7 +53,7 @@ if(NOT OPENSSL_NO_ASM)
      set(PERLASM_STYLE win32n)
      set(PERLASM_FLAGS "-DOPENSSL_IA32_SSE2")
    endif()
-    set(CMAKE_ASM_NASM_FLAGS "-g cv8")
+    set(CMAKE_ASM_NASM_FLAGS "-gcv8")

    # On Windows, we use the NASM output, specifically built with Yasm.
    set(ASM_EXT asm)
@@ -134,6 +134,7 @@ add_library(

  OBJECT

+  cpu-aarch64-fuchsia.c
  cpu-aarch64-linux.c
  cpu-arm.c
  cpu-arm-linux.c
@@ -260,6 +261,7 @@ add_executable(
  pool/pool_test.cc
  refcount_test.cc
  rsa_extra/rsa_test.cc
+  self_test.cc
  test/file_test_gtest.cc
  thread_test.cc
  x509/x509_test.cc
@@ -267,11 +269,11 @@ add_executable(
  x509v3/v3name_test.cc

  $<TARGET_OBJECTS:crypto_test_data>
-  $<TARGET_OBJECTS:gtest_main>
+  $<TARGET_OBJECTS:boringssl_gtest_main>
  $<TARGET_OBJECTS:test_support>
 )

-target_link_libraries(crypto_test crypto gtest)
+target_link_libraries(crypto_test crypto boringssl_gtest)
 if (WIN32)
  target_link_libraries(crypto_test ws2_32)
 endif()
@@ -347,40 +347,45 @@ ASN1_INTEGER *d2i_ASN1_UINTEGER(ASN1_INTEGER **a, const unsigned char **pp,

 int ASN1_INTEGER_set(ASN1_INTEGER *a, long v)
 {
-    int j, k;
-    unsigned int i;
-    unsigned char buf[sizeof(long) + 1];
-    long d;
-
-    a->type = V_ASN1_INTEGER;
-    if (a->length < (int)(sizeof(long) + 1)) {
-        if (a->data != NULL)
-            OPENSSL_free(a->data);
-        if ((a->data =
-             (unsigned char *)OPENSSL_malloc(sizeof(long) + 1)) != NULL)
-            OPENSSL_memset((char *)a->data, 0, sizeof(long) + 1);
+    if (v >= 0) {
+        return ASN1_INTEGER_set_uint64(a, (uint64_t) v);
    }
-    if (a->data == NULL) {
+
+    if (!ASN1_INTEGER_set_uint64(a, 0 - (uint64_t) v)) {
+        return 0;
+    }
+
+    a->type = V_ASN1_NEG_INTEGER;
+    return 1;
+}
+
+int ASN1_INTEGER_set_uint64(ASN1_INTEGER *out, uint64_t v)
+{
+    uint8_t *const newdata = OPENSSL_malloc(sizeof(uint64_t));
+    if (newdata == NULL) {
        OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE);
-        return (0);
-    }
-    d = v;
-    if (d < 0) {
-        d = -d;
-        a->type = V_ASN1_NEG_INTEGER;
+        return 0;
    }

-    for (i = 0; i < sizeof(long); i++) {
-        if (d == 0)
+    OPENSSL_free(out->data);
+    out->data = newdata;
+    v = CRYPTO_bswap8(v);
+    memcpy(out->data, &v, sizeof(v));
+
+    out->type = V_ASN1_INTEGER;
+
+    size_t leading_zeros;
+    for (leading_zeros = 0; leading_zeros < sizeof(uint64_t) - 1;
+         leading_zeros++) {
+        if (out->data[leading_zeros] != 0) {
            break;
-        buf[i] = (int)d & 0xff;
-        d >>= 8;
+        }
    }
-    j = 0;
-    for (k = i - 1; k >= 0; k--)
-        a->data[j++] = buf[k];
-    a->length = j;
-    return (1);
+
+    out->length = sizeof(uint64_t) - leading_zeros;
+    OPENSSL_memmove(out->data, out->data + leading_zeros, out->length);
+
+    return 1;
 }

 long ASN1_INTEGER_get(const ASN1_INTEGER *a)
@@ -61,17 +61,19 @@
 #include <openssl/err.h>
 #include <openssl/mem.h>

+#include "asn1_locl.h"
+
 static int traverse_string(const unsigned char *p, int len, int inform,
-                           int (*rfunc) (unsigned long value, void *in),
+                           int (*rfunc) (uint32_t value, void *in),
                           void *arg);
-static int in_utf8(unsigned long value, void *arg);
-static int out_utf8(unsigned long value, void *arg);
-static int type_str(unsigned long value, void *arg);
-static int cpy_asc(unsigned long value, void *arg);
-static int cpy_bmp(unsigned long value, void *arg);
-static int cpy_univ(unsigned long value, void *arg);
-static int cpy_utf8(unsigned long value, void *arg);
-static int is_printable(unsigned long value);
+static int in_utf8(uint32_t value, void *arg);
+static int out_utf8(uint32_t value, void *arg);
+static int type_str(uint32_t value, void *arg);
+static int cpy_asc(uint32_t value, void *arg);
+static int cpy_bmp(uint32_t value, void *arg);
+static int cpy_univ(uint32_t value, void *arg);
+static int cpy_utf8(uint32_t value, void *arg);
+static int is_printable(uint32_t value);

 /*
 * These functions take a string in UTF8, ASCII or multibyte form and a mask
@@ -100,7 +102,7 @@ int ASN1_mbstring_ncopy(ASN1_STRING **out, const unsigned char *in, int len,
    unsigned char *p;
    int nchar;
    char strbuf[32];
-    int (*cpyfunc) (unsigned long, void *) = NULL;
+    int (*cpyfunc) (uint32_t, void *) = NULL;
    if (len == -1)
        len = strlen((const char *)in);
    if (!mask)
@@ -253,10 +255,10 @@ int ASN1_mbstring_ncopy(ASN1_STRING **out, const unsigned char *in, int len,
 */

 static int traverse_string(const unsigned char *p, int len, int inform,
-                           int (*rfunc) (unsigned long value, void *in),
+                           int (*rfunc) (uint32_t value, void *in),
                           void *arg)
 {
-    unsigned long value;
+    uint32_t value;
    int ret;
    while (len) {
        if (inform == MBSTRING_ASC) {
@@ -267,8 +269,8 @@ static int traverse_string(const unsigned char *p, int len, int inform,
            value |= *p++;
            len -= 2;
        } else if (inform == MBSTRING_UNIV) {
-            value = ((unsigned long)*p++) << 24;
-            value |= ((unsigned long)*p++) << 16;
+            value = ((uint32_t)*p++) << 24;
+            value |= ((uint32_t)*p++) << 16;
            value |= *p++ << 8;
            value |= *p++;
            len -= 4;
@@ -292,7 +294,7 @@ static int traverse_string(const unsigned char *p, int len, int inform,

 /* Just count number of characters */

-static int in_utf8(unsigned long value, void *arg)
+static int in_utf8(uint32_t value, void *arg)
 {
    int *nchar;
    nchar = arg;
@@ -302,7 +304,7 @@ static int in_utf8(unsigned long value, void *arg)

 /* Determine size of output as a UTF8 String */

-static int out_utf8(unsigned long value, void *arg)
+static int out_utf8(uint32_t value, void *arg)
 {
    int *outlen;
    outlen = arg;
@@ -315,7 +317,7 @@ static int out_utf8(unsigned long value, void *arg)
 * "mask".
 */

-static int type_str(unsigned long value, void *arg)
+static int type_str(uint32_t value, void *arg)
 {
    unsigned long types;
    types = *((unsigned long *)arg);
@@ -335,7 +337,7 @@ static int type_str(unsigned long value, void *arg)

 /* Copy one byte per character ASCII like strings */

-static int cpy_asc(unsigned long value, void *arg)
+static int cpy_asc(uint32_t value, void *arg)
 {
    unsigned char **p, *q;
    p = arg;
@@ -347,7 +349,7 @@ static int cpy_asc(unsigned long value, void *arg)

 /* Copy two byte per character BMPStrings */

-static int cpy_bmp(unsigned long value, void *arg)
+static int cpy_bmp(uint32_t value, void *arg)
 {
    unsigned char **p, *q;
    p = arg;
@@ -360,7 +362,7 @@ static int cpy_bmp(unsigned long value, void *arg)

 /* Copy four byte per character UniversalStrings */

-static int cpy_univ(unsigned long value, void *arg)
+static int cpy_univ(uint32_t value, void *arg)
 {
    unsigned char **p, *q;
    p = arg;
@@ -375,7 +377,7 @@ static int cpy_univ(unsigned long value, void *arg)

 /* Copy to a UTF8String */

-static int cpy_utf8(unsigned long value, void *arg)
+static int cpy_utf8(uint32_t value, void *arg)
 {
    unsigned char **p;
    int ret;
@@ -387,7 +389,7 @@ static int cpy_utf8(unsigned long value, void *arg)
 }

 /* Return 1 if the character is permitted in a PrintableString */
-static int is_printable(unsigned long value)
+static int is_printable(uint32_t value)
 {
    int ch;
    if (value > 0x7f)
@@ -59,6 +59,8 @@
 #include <openssl/err.h>
 #include <openssl/mem.h>

+#include "asn1_locl.h"
+
 /* UTF8 utilities */

 /*
@@ -70,10 +72,10 @@
 * incorrectly (not minimal length).
 */

-int UTF8_getc(const unsigned char *str, int len, unsigned long *val)
+int UTF8_getc(const unsigned char *str, int len, uint32_t *val)
 {
    const unsigned char *p;
-    unsigned long value;
+    uint32_t value;
    int ret;
    if (len <= 0)
        return 0;
@@ -112,7 +114,7 @@ int UTF8_getc(const unsigned char *str, int len, unsigned long *val)
            || ((p[2] & 0xc0) != 0x80)
            || ((p[3] & 0xc0) != 0x80))
            return -3;
-        value = ((unsigned long)(*p++ & 0x7)) << 18;
+        value = ((uint32_t)(*p++ & 0x7)) << 18;
        value |= (*p++ & 0x3f) << 12;
        value |= (*p++ & 0x3f) << 6;
        value |= *p++ & 0x3f;
@@ -127,9 +129,9 @@ int UTF8_getc(const unsigned char *str, int len, unsigned long *val)
            || ((p[3] & 0xc0) != 0x80)
            || ((p[4] & 0xc0) != 0x80))
            return -3;
-        value = ((unsigned long)(*p++ & 0x3)) << 24;
-        value |= ((unsigned long)(*p++ & 0x3f)) << 18;
-        value |= ((unsigned long)(*p++ & 0x3f)) << 12;
+        value = ((uint32_t)(*p++ & 0x3)) << 24;
+        value |= ((uint32_t)(*p++ & 0x3f)) << 18;
+        value |= ((uint32_t)(*p++ & 0x3f)) << 12;
        value |= (*p++ & 0x3f) << 6;
        value |= *p++ & 0x3f;
        if (value < 0x200000)
@@ -144,10 +146,10 @@ int UTF8_getc(const unsigned char *str, int len, unsigned long *val)
            || ((p[4] & 0xc0) != 0x80)
            || ((p[5] & 0xc0) != 0x80))
            return -3;
-        value = ((unsigned long)(*p++ & 0x1)) << 30;
-        value |= ((unsigned long)(*p++ & 0x3f)) << 24;
-        value |= ((unsigned long)(*p++ & 0x3f)) << 18;
-        value |= ((unsigned long)(*p++ & 0x3f)) << 12;
+        value = ((uint32_t)(*p++ & 0x1)) << 30;
+        value |= ((uint32_t)(*p++ & 0x3f)) << 24;
+        value |= ((uint32_t)(*p++ & 0x3f)) << 18;
+        value |= ((uint32_t)(*p++ & 0x3f)) << 12;
        value |= (*p++ & 0x3f) << 6;
        value |= *p++ & 0x3f;
        if (value < 0x4000000)
@@ -167,7 +169,7 @@ int UTF8_getc(const unsigned char *str, int len, unsigned long *val)
 * most 6 characters.
 */

-int UTF8_putc(unsigned char *str, int len, unsigned long value)
+int UTF8_putc(unsigned char *str, int len, uint32_t value)
 {
    if (!str)
        len = 6;                /* Maximum we will need */
@@ -93,6 +93,9 @@ int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d);
 void asn1_item_combine_free(ASN1_VALUE **pval, const ASN1_ITEM *it,
                            int combine);

+int UTF8_getc(const unsigned char *str, int len, uint32_t *val);
+int UTF8_putc(unsigned char *str, int len, uint32_t value);
+

 #if defined(__cplusplus)
 }  /* extern C */
@@ -15,6 +15,7 @@
 #include <stdio.h>

 #include <gtest/gtest.h>
+#include <limits.h>

 #include <openssl/asn1.h>
 #include <openssl/err.h>
@@ -60,3 +61,30 @@ TEST(ASN1Test, LargeTags) {
  EXPECT_EQ(Bytes(&kZero, 1), Bytes(obj->value.asn1_string->data,
                                    obj->value.asn1_string->length));
 }
+
+TEST(ASN1Test, IntegerSetting) {
+  bssl::UniquePtr<ASN1_INTEGER> by_bn(M_ASN1_INTEGER_new());
+  bssl::UniquePtr<ASN1_INTEGER> by_long(M_ASN1_INTEGER_new());
+  bssl::UniquePtr<ASN1_INTEGER> by_uint64(M_ASN1_INTEGER_new());
+  bssl::UniquePtr<BIGNUM> bn(BN_new());
+
+  const std::vector<int64_t> kValues = {
+      LONG_MIN, -2, -1, 0, 1, 2, 0xff, 0x100, 0xffff, 0x10000, LONG_MAX,
+  };
+  for (const auto &i : kValues) {
+    SCOPED_TRACE(i);
+
+    ASSERT_EQ(1, ASN1_INTEGER_set(by_long.get(), i));
+    const uint64_t abs = i < 0 ? (0 - (uint64_t) i) : i;
+    ASSERT_TRUE(BN_set_u64(bn.get(), abs));
+    BN_set_negative(bn.get(), i < 0);
+    ASSERT_TRUE(BN_to_ASN1_INTEGER(bn.get(), by_bn.get()));
+
+    EXPECT_EQ(0, ASN1_INTEGER_cmp(by_bn.get(), by_long.get()));
+
+    if (i >= 0) {
+      ASSERT_EQ(1, ASN1_INTEGER_set_uint64(by_uint64.get(), i));
+      EXPECT_EQ(0, ASN1_INTEGER_cmp(by_bn.get(), by_uint64.get()));
+    }
+  }
+}
@@ -27,6 +27,7 @@

 #if !defined(OPENSSL_WINDOWS)
 #include <arpa/inet.h>
+#include <errno.h>
 #include <fcntl.h>
 #include <netinet/in.h>
 #include <string.h>
@@ -73,6 +73,7 @@ OPENSSL_MSVC_PRAGMA(warning(pop))
 #include <openssl/mem.h>

 #include "internal.h"
+#include "../internal.h"


 static int bio_fd_non_fatal_error(int err) {
@@ -81,6 +81,8 @@
 #include <openssl/err.h>
 #include <openssl/mem.h>

+#include "../internal.h"
+

 #define BIO_FP_READ 0x02
 #define BIO_FP_WRITE 0x04
@@ -77,8 +77,9 @@ int BN_bn2cbb_padded(CBB *out, size_t len, const BIGNUM *in) {
 static const char hextable[] = "0123456789abcdef";

 char *BN_bn2hex(const BIGNUM *bn) {
+  int width = bn_minimal_width(bn);
  char *buf = OPENSSL_malloc(1 /* leading '-' */ + 1 /* zero is non-empty */ +
-                             bn->top * BN_BYTES * 2 + 1 /* trailing NUL */);
+                             width * BN_BYTES * 2 + 1 /* trailing NUL */);
  if (buf == NULL) {
    OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE);
    return NULL;
@@ -94,7 +95,7 @@ char *BN_bn2hex(const BIGNUM *bn) {
  }

  int z = 0;
-  for (int i = bn->top - 1; i >= 0; i--) {
+  for (int i = width - 1; i >= 0; i--) {
    for (int j = BN_BITS2 - 8; j >= 0; j -= 8) {
      // strip leading zeros
      int v = ((int)(bn->d[i] >> (long)j)) & 0xff;
@@ -153,7 +154,7 @@ static int decode_hex(BIGNUM *bn, const char *in, int in_len) {
    in_len -= todo;
  }
  assert(i <= bn->dmax);
-  bn->top = i;
+  bn->width = i;
  return 1;
 }

@@ -222,7 +223,7 @@ static int bn_x2bn(BIGNUM **outp, const char *in, decode_func decode, char_test_
    goto err;
  }

-  bn_correct_top(ret);
+  bn_set_minimal_width(ret);
  if (!BN_is_zero(ret)) {
    ret->neg = neg;
  }
@@ -347,7 +348,7 @@ int BN_print(BIO *bp, const BIGNUM *a) {
    goto end;
  }

-  for (i = a->top - 1; i >= 0; i--) {
+  for (i = bn_minimal_width(a) - 1; i >= 0; i--) {
    for (j = BN_BITS2 - 4; j >= 0; j -= 4) {
      // strip leading zeros
      v = ((int)(a->d[i] >> (long)j)) & 0x0f;
@@ -889,54 +889,165 @@ TEST(CBSTest, BitString) {

 TEST(CBBTest, AddOIDFromText) {
  const struct {
-    const char *in;
-    bool ok;
-    std::vector<uint8_t> out;
-  } kTests[] = {
+    const char *text;
+    std::vector<uint8_t> der;
+  } kValidOIDs[] = {
      // Some valid values.
-      {"1.2.3.4", true, {0x2a, 0x3, 0x4}},
+      {"0.0", {0x00}},
+      {"0.2.3.4", {0x2, 0x3, 0x4}},
+      {"1.2.3.4", {0x2a, 0x3, 0x4}},
+      {"2.2.3.4", {0x52, 0x3, 0x4}},
      {"1.2.840.113554.4.1.72585",
-       true,
       {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x04, 0x01, 0x84, 0xb7, 0x09}},
      // Test edge cases around the first component.
-      {"0.39", true, {0x27}},
-      {"0.40", false, {}},
-      {"1.0", true, {0x28}},
-      {"1.39", true, {0x4f}},
-      {"1.40", false, {}},
-      {"2.0", true, {0x50}},
-      {"2.1", true, {0x51}},
-      {"2.40", true, {0x78}},
-      // The empty string is not an OID.
-      {"", false, {}},
-      // No empty components.
-      {".1.2.3.4.5", false, {}},
-      {"1..2.3.4.5", false, {}},
-      {"1.2.3.4.5.", false, {}},
-      // There must be at least two components.
-      {"1", false, {}},
-      // No extra leading zeros.
-      {"00.1.2.3.4", false, {}},
-      {"01.1.2.3.4", false, {}},
-      // Check for overflow.
-      {"1.2.4294967295", true, {0x2a, 0x8f, 0xff, 0xff, 0xff, 0x7f}},
-      {"1.2.4294967296", false, {}},
-      // 40*A + B overflows.
-      {"2.4294967215", true, {0x8f, 0xff, 0xff, 0xff, 0x7f}},
-      {"2.4294967216", false, {}},
+      {"0.39", {0x27}},
+      {"1.0", {0x28}},
+      {"1.39", {0x4f}},
+      {"2.0", {0x50}},
+      {"2.1", {0x51}},
+      {"2.40", {0x78}},
+      // Edge cases near an overflow.
+      {"1.2.18446744073709551615",
+       {0x2a, 0x81, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f}},
+      {"2.18446744073709551535",
+       {0x81, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f}},
  };
-  for (const auto &t : kTests) {
-    SCOPED_TRACE(t.in);
+
+  const char *kInvalidTexts[] = {
+      // Invalid second component.
+      "0.40",
+      "1.40",
+      // Invalid first component.
+      "3.1",
+      // The empty string is not an OID.
+      "",
+      // No empty components.
+      ".1.2.3.4.5",
+      "1..2.3.4.5",
+      "1.2.3.4.5.",
+      // There must be at least two components.
+      "1",
+      // No extra leading zeros.
+      "00.1.2.3.4",
+      "01.1.2.3.4",
+      // Overflow for both components or 40*A + B.
+      "1.2.18446744073709551616",
+      "2.18446744073709551536",
+  };
+
+  const std::vector<uint8_t> kInvalidDER[] = {
+      // The empty string is not an OID.
+      {},
+      // Non-minimal representation.
+      {0x80, 0x01},
+      // Overflow. This is the DER representation of
+      // 1.2.840.113554.4.1.72585.18446744073709551616. (The final value is
+      // 2^64.)
+      {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x04, 0x01, 0x84, 0xb7, 0x09,
+       0x82, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00},
+  };
+
+  for (const auto &t : kValidOIDs) {
+    SCOPED_TRACE(t.text);
+
    bssl::ScopedCBB cbb;
    ASSERT_TRUE(CBB_init(cbb.get(), 0));
-    int ok = CBB_add_asn1_oid_from_text(cbb.get(), t.in, strlen(t.in));
-    EXPECT_EQ(t.ok, static_cast<bool>(ok));
-    if (ok) {
-      uint8_t *out;
-      size_t len;
-      ASSERT_TRUE(CBB_finish(cbb.get(), &out, &len));
-      bssl::UniquePtr<uint8_t> free_out(out);
-      EXPECT_EQ(Bytes(t.out), Bytes(out, len));
-    }
+    ASSERT_TRUE(CBB_add_asn1_oid_from_text(cbb.get(), t.text, strlen(t.text)));
+    uint8_t *out;
+    size_t len;
+    ASSERT_TRUE(CBB_finish(cbb.get(), &out, &len));
+    bssl::UniquePtr<uint8_t> free_out(out);
+    EXPECT_EQ(Bytes(t.der), Bytes(out, len));
+
+    CBS cbs;
+    CBS_init(&cbs, t.der.data(), t.der.size());
+    bssl::UniquePtr<char> text(CBS_asn1_oid_to_text(&cbs));
+    ASSERT_TRUE(text.get());
+    EXPECT_STREQ(t.text, text.get());
+  }
+
+  for (const char *t : kInvalidTexts) {
+    SCOPED_TRACE(t);
+    bssl::ScopedCBB cbb;
+    ASSERT_TRUE(CBB_init(cbb.get(), 0));
+    EXPECT_FALSE(CBB_add_asn1_oid_from_text(cbb.get(), t, strlen(t)));
+  }
+
+  for (const auto &t : kInvalidDER) {
+    SCOPED_TRACE(Bytes(t));
+    CBS cbs;
+    CBS_init(&cbs, t.data(), t.size());
+    bssl::UniquePtr<char> text(CBS_asn1_oid_to_text(&cbs));
+    EXPECT_FALSE(text);
+  }
+}
+
+TEST(CBBTest, FlushASN1SetOf) {
+  const struct {
+    std::vector<uint8_t> in, out;
+  } kValidInputs[] = {
+    // No elements.
+    {{}, {}},
+    // One element.
+    {{0x30, 0x00}, {0x30, 0x00}},
+    // Two identical elements.
+    {{0x30, 0x00, 0x30, 0x00}, {0x30, 0x00, 0x30, 0x00}},
+    // clang-format off
+    {{0x30, 0x02, 0x00, 0x00,
+      0x30, 0x00,
+      0x01, 0x00,
+      0x30, 0x02, 0x00, 0x00,
+      0x30, 0x03, 0x00, 0x00, 0x00,
+      0x30, 0x00,
+      0x30, 0x03, 0x00, 0x00, 0x01,
+      0x30, 0x01, 0x00,
+      0x01, 0x01, 0x00},
+     {0x01, 0x00,
+      0x01, 0x01, 0x00,
+      0x30, 0x00,
+      0x30, 0x00,
+      0x30, 0x01, 0x00,
+      0x30, 0x02, 0x00, 0x00,
+      0x30, 0x02, 0x00, 0x00,
+      0x30, 0x03, 0x00, 0x00, 0x00,
+      0x30, 0x03, 0x00, 0x00, 0x01}},
+    // clang-format on
+  };
+
+  for (const auto &t : kValidInputs) {
+    SCOPED_TRACE(Bytes(t.in));
+
+    bssl::ScopedCBB cbb;
+    CBB child;
+    ASSERT_TRUE(CBB_init(cbb.get(), 0));
+    ASSERT_TRUE(CBB_add_asn1(cbb.get(), &child, CBS_ASN1_SET));
+    ASSERT_TRUE(CBB_add_bytes(&child, t.in.data(), t.in.size()));
+    ASSERT_TRUE(CBB_flush_asn1_set_of(&child));
+    EXPECT_EQ(Bytes(t.out), Bytes(CBB_data(&child), CBB_len(&child)));
+
+    // Running it again should be idempotent.
+    ASSERT_TRUE(CBB_flush_asn1_set_of(&child));
+    EXPECT_EQ(Bytes(t.out), Bytes(CBB_data(&child), CBB_len(&child)));
+
+    // The ASN.1 header remains intact.
+    ASSERT_TRUE(CBB_flush(cbb.get()));
+    EXPECT_EQ(0x31, CBB_data(cbb.get())[0]);
+  }
+
+  const std::vector<uint8_t> kInvalidInputs[] = {
+    {0x30},
+    {0x30, 0x01},
+    {0x30, 0x00, 0x30, 0x00, 0x30, 0x01},
+  };
+
+  for (const auto &t : kInvalidInputs) {
+    SCOPED_TRACE(Bytes(t));
+
+    bssl::ScopedCBB cbb;
+    CBB child;
+    ASSERT_TRUE(CBB_init(cbb.get(), 0));
+    ASSERT_TRUE(CBB_add_asn1(cbb.get(), &child, CBS_ASN1_SET));
+    ASSERT_TRUE(CBB_add_bytes(&child, t.data(), t.size()));
+    EXPECT_FALSE(CBB_flush_asn1_set_of(&child));
  }
 }
@@ -18,6 +18,7 @@
 #include <limits.h>
 #include <string.h>

+#include <openssl/buf.h>
 #include <openssl/mem.h>

 #include "../internal.h"
@@ -332,9 +333,9 @@ int CBB_add_u24_length_prefixed(CBB *cbb, CBB *out_contents) {
 // add_base128_integer encodes |v| as a big-endian base-128 integer where the
 // high bit of each byte indicates where there is more data. This is the
 // encoding used in DER for both high tag number form and OID components.
-static int add_base128_integer(CBB *cbb, uint32_t v) {
+static int add_base128_integer(CBB *cbb, uint64_t v) {
  unsigned len_len = 0;
-  unsigned copy = v;
+  uint64_t copy = v;
  while (copy > 0) {
    len_len++;
    copy >>= 7;
@@ -504,11 +505,33 @@ int CBB_add_asn1_uint64(CBB *cbb, uint64_t value) {
  return CBB_flush(cbb);
 }

+int CBB_add_asn1_octet_string(CBB *cbb, const uint8_t *data, size_t data_len) {
+  CBB child;
+  if (!CBB_add_asn1(cbb, &child, CBS_ASN1_OCTETSTRING) ||
+      !CBB_add_bytes(&child, data, data_len) ||
+      !CBB_flush(cbb)) {
+    return 0;
+  }
+
+  return 1;
+}
+
+int CBB_add_asn1_bool(CBB *cbb, int value) {
+  CBB child;
+  if (!CBB_add_asn1(cbb, &child, CBS_ASN1_BOOLEAN) ||
+      !CBB_add_u8(&child, value != 0 ? 0xff : 0) ||
+      !CBB_flush(cbb)) {
+    return 0;
+  }
+
+  return 1;
+}
+
 // parse_dotted_decimal parses one decimal component from |cbs|, where |cbs| is
 // an OID literal, e.g., "1.2.840.113554.4.1.72585". It consumes both the
 // component and the dot, so |cbs| may be passed into the function again for the
 // next value.
-static int parse_dotted_decimal(CBS *cbs, uint32_t *out) {
+static int parse_dotted_decimal(CBS *cbs, uint64_t *out) {
  *out = 0;
  int seen_digit = 0;
  for (;;) {
@@ -524,8 +547,8 @@ static int parse_dotted_decimal(CBS *cbs, uint32_t *out) {
        // Forbid stray leading zeros.
        (seen_digit && *out == 0) ||
        // Check for overflow.
-        *out > UINT32_MAX / 10 ||
-        *out * 10 > UINT32_MAX - (u - '0')) {
+        *out > UINT64_MAX / 10 ||
+        *out * 10 > UINT64_MAX - (u - '0')) {
      return 0;
    }
    *out = *out * 10 + (u - '0');
@@ -544,7 +567,7 @@ int CBB_add_asn1_oid_from_text(CBB *cbb, const char *text, size_t len) {
  CBS_init(&cbs, (const uint8_t *)text, len);

  // OIDs must have at least two components.
-  uint32_t a, b;
+  uint64_t a, b;
  if (!parse_dotted_decimal(&cbs, &a) ||
      !parse_dotted_decimal(&cbs, &b)) {
    return 0;
@@ -554,8 +577,8 @@ int CBB_add_asn1_oid_from_text(CBB *cbb, const char *text, size_t len) {
  // 0, 1, or 2 and that, when it is 0 or 1, |b| is at most 39.
  if (a > 2 ||
      (a < 2 && b > 39) ||
-      b > UINT32_MAX - 80 ||
-      !add_base128_integer(cbb, 40 * a + b)) {
+      b > UINT64_MAX - 80 ||
+      !add_base128_integer(cbb, 40u * a + b)) {
    return 0;
  }

@@ -569,3 +592,77 @@ int CBB_add_asn1_oid_from_text(CBB *cbb, const char *text, size_t len) {

  return 1;
 }
+
+// compare_set_of_element is a qsort-style comparator over |CBS| values. It
+// orders them by their bytes, ascending, as X.690 section 11.6 requires for
+// the elements of a DER SET OF.
+static int compare_set_of_element(const void *a_ptr, const void *b_ptr) {
+  const CBS *lhs = a_ptr;
+  const CBS *rhs = b_ptr;
+  const size_t lhs_len = CBS_len(lhs);
+  const size_t rhs_len = CBS_len(rhs);
+
+  // Compare the bytes both elements have in common first.
+  const size_t common_len = lhs_len < rhs_len ? lhs_len : rhs_len;
+  const int cmp = OPENSSL_memcmp(CBS_data(lhs), CBS_data(rhs), common_len);
+  if (cmp != 0) {
+    return cmp;
+  }
+
+  // The common bytes match, so the lengths decide: the shorter element sorts
+  // first. (This tie-break is not actually reachable, as no DER encoding is a
+  // prefix of another DER encoding.)
+  if (lhs_len < rhs_len) {
+    return -1;
+  }
+  return lhs_len > rhs_len ? 1 : 0;
+}
+
+// CBB_flush_asn1_set_of flushes |cbb| and then reorders the ASN.1 elements it
+// currently holds so that they appear in the order defined by
+// |compare_set_of_element|. It returns one on success and zero if flushing,
+// parsing an element, or allocating memory fails.
+int CBB_flush_asn1_set_of(CBB *cbb) {
+  if (!CBB_flush(cbb)) {
+    return 0;
+  }
+
+  // First pass: count the elements, rejecting contents that do not parse.
+  CBS cbs;
+  size_t num_children = 0;
+  CBS_init(&cbs, CBB_data(cbb), CBB_len(cbb));
+  while (CBS_len(&cbs) != 0) {
+    if (!CBS_get_any_asn1_element(&cbs, NULL, NULL, NULL)) {
+      return 0;
+    }
+    num_children++;
+  }
+
+  if (num_children < 2) {
+    return 1;  // Nothing to do. This is the common case for X.509.
+  }
+  // Guard the |num_children * sizeof(CBS)| multiplication below.
+  if (num_children > ((size_t)-1) / sizeof(CBS)) {
+    return 0;  // Overflow.
+  }
+
+  // Parse out the children and sort. We alias them into a copy of the contents
+  // so they remain valid as we rewrite |cbb|.
+  int ret = 0;
+  size_t buf_len = CBB_len(cbb);
+  uint8_t *buf = BUF_memdup(CBB_data(cbb), buf_len);
+  CBS *children = OPENSSL_malloc(num_children * sizeof(CBS));
+  if (buf == NULL || children == NULL) {
+    goto err;
+  }
+  // Second pass: record each element, this time pointing into |buf|.
+  CBS_init(&cbs, buf, buf_len);
+  for (size_t i = 0; i < num_children; i++) {
+    if (!CBS_get_any_asn1_element(&cbs, &children[i], NULL, NULL)) {
+      goto err;
+    }
+  }
+  qsort(children, num_children, sizeof(CBS), compare_set_of_element);
+
+  // Rewind |cbb| and write the contents back in the new order.
+  cbb->base->len = cbb->offset + cbb->pending_len_len;
+  for (size_t i = 0; i < num_children; i++) {
+    if (!CBB_add_bytes(cbb, CBS_data(&children[i]), CBS_len(&children[i]))) {
+      goto err;
+    }
+  }
+  // Sorting only permutes the elements, so the total length is unchanged.
+  assert(CBB_len(cbb) == buf_len);
+
+  ret = 1;
+
+err:
+  // Both pointers are released on every path that reaches this label; either
+  // may still be NULL here if its allocation failed.
+  OPENSSL_free(buf);
+  OPENSSL_free(children);
+  return ret;
+}
@@ -12,11 +12,16 @@
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

+#if !defined(__STDC_FORMAT_MACROS)
+#define __STDC_FORMAT_MACROS
+#endif
+
 #include <openssl/buf.h>
 #include <openssl/mem.h>
 #include <openssl/bytestring.h>

 #include <assert.h>
+#include <inttypes.h>
 #include <string.h>

 #include "internal.h"
@@ -175,6 +180,33 @@ int CBS_get_u24_length_prefixed(CBS *cbs, CBS *out) {
  return cbs_get_length_prefixed(cbs, out, 3);
 }

+// parse_base128_integer consumes one big-endian base-128 integer from |cbs|
+// and stores it in |*out|. DER uses this encoding for both high tag number
+// form and OID components. It returns one on success and zero if |cbs| runs
+// out of data, the value does not fit in 64 bits, or the encoding is not
+// minimal. |*out| is only written on success.
+static int parse_base128_integer(CBS *cbs, uint64_t *out) {
+  uint64_t acc = 0;
+  for (;;) {
+    uint8_t octet;
+    if (!CBS_get_u8(cbs, &octet)) {
+      return 0;
+    }
+    // Shifting in seven more bits must not push any set bit off the top.
+    if ((acc >> (64 - 7)) != 0) {
+      return 0;
+    }
+    // A leading 0x80 octet contributes nothing, so the value would not be
+    // minimally encoded.
+    if (acc == 0 && octet == 0x80) {
+      return 0;
+    }
+    acc = (acc << 7) | (octet & 0x7f);
+
+    // The final octet of a value has its high bit cleared.
+    if ((octet & 0x80) == 0) {
+      break;
+    }
+  }
+
+  *out = acc;
+  return 1;
+}
+
 static int parse_asn1_tag(CBS *cbs, unsigned *out) {
  uint8_t tag_byte;
  if (!CBS_get_u8(cbs, &tag_byte)) {
@@ -191,27 +223,15 @@ static int parse_asn1_tag(CBS *cbs, unsigned *out) {
  unsigned tag = ((unsigned)tag_byte & 0xe0) << CBS_ASN1_TAG_SHIFT;
  unsigned tag_number = tag_byte & 0x1f;
  if (tag_number == 0x1f) {
-    tag_number = 0;
-    for (;;) {
-      if (!CBS_get_u8(cbs, &tag_byte) ||
-          ((tag_number << 7) >> 7) != tag_number) {
-        return 0;
-      }
-      tag_number = (tag_number << 7) | (tag_byte & 0x7f);
-      // The tag must be represented in the minimal number of bytes.
-      if (tag_number == 0) {
-        return 0;
-      }
-      if ((tag_byte & 0x80) == 0) {
-        break;
-      }
-    }
-    if (// Check the tag number is within our supported bounds.
-        tag_number > CBS_ASN1_TAG_NUMBER_MASK ||
+    uint64_t v;
+    if (!parse_base128_integer(cbs, &v) ||
+        // Check the tag number is within our supported bounds.
+        v > CBS_ASN1_TAG_NUMBER_MASK ||
        // Small tag numbers should have used low tag number form.
-        tag_number < 0x1f) {
+        v < 0x1f) {
      return 0;
    }
+    tag_number = (unsigned)v;
  }

  tag |= tag_number;
@@ -405,6 +425,22 @@ int CBS_get_asn1_uint64(CBS *cbs, uint64_t *out) {
  return 1;
 }

+// CBS_get_asn1_bool reads an ASN.1 element tagged |CBS_ASN1_BOOLEAN| from
+// |cbs| and sets |*out| to zero or one. Only a one-byte body of 0x00 or 0xff
+// is accepted. It returns one on success and zero on error, in which case
+// |*out| is left untouched.
+int CBS_get_asn1_bool(CBS *cbs, int *out) {
+  CBS contents;
+  if (!CBS_get_asn1(cbs, &contents, CBS_ASN1_BOOLEAN)) {
+    return 0;
+  }
+  if (CBS_len(&contents) != 1) {
+    return 0;
+  }
+
+  switch (CBS_data(&contents)[0]) {
+    case 0x00:
+      *out = 0;
+      return 1;
+    case 0xff:
+      *out = 1;
+      return 1;
+    default:
+      // Any other byte is rejected.
+      return 0;
+  }
+}
+
 int CBS_get_optional_asn1(CBS *cbs, CBS *out, int *out_present, unsigned tag) {
  int present = 0;

@@ -527,3 +563,55 @@ int CBS_asn1_bitstring_has_bit(const CBS *cbs, unsigned bit) {
  return byte_num < CBS_len(cbs) &&
         (CBS_data(cbs)[byte_num] & (1 << bit_num)) != 0;
 }
+
+// add_decimal appends the decimal representation of |v|, without a trailing
+// NUL, to |out|. It returns the result of |CBB_add_bytes|.
+static int add_decimal(CBB *out, uint64_t v) {
+  char digits[DECIMAL_SIZE(uint64_t) + 1];
+  BIO_snprintf(digits, sizeof(digits), "%" PRIu64, v);
+  const size_t num_digits = strlen(digits);
+  return CBB_add_bytes(out, (const uint8_t *)digits, num_digits);
+}
+
+// CBS_asn1_oid_to_text interprets the contents of |cbs| as a sequence of
+// base-128 encoded OID components and returns them as a NUL-terminated,
+// dot-separated decimal string (e.g. "1.2.840"). |cbs| itself is not advanced;
+// a copy is parsed. It returns NULL if a component fails to parse or if
+// building the output fails.
+// NOTE(review): the returned buffer comes from |CBB_finish|; presumably the
+// caller releases it with |OPENSSL_free| — confirm against the header.
+char *CBS_asn1_oid_to_text(const CBS *cbs) {
+  CBB cbb;
+  if (!CBB_init(&cbb, 32)) {
+    goto err;
+  }
+
+  CBS copy = *cbs;
+  // The first component is 40 * value1 + value2, where value1 is 0, 1, or 2.
+  uint64_t v;
+  if (!parse_base128_integer(&copy, &v)) {
+    goto err;
+  }
+
+  if (v >= 80) {
+    // Anything from 80 upwards is printed as value1 == 2, with the remainder
+    // (which may exceed 39) as value2.
+    if (!CBB_add_bytes(&cbb, (const uint8_t *)"2.", 2) ||
+        !add_decimal(&cbb, v - 80)) {
+      goto err;
+    }
+  } else if (!add_decimal(&cbb, v / 40) ||
+             !CBB_add_u8(&cbb, '.') ||
+             !add_decimal(&cbb, v % 40)) {
+    goto err;
+  }
+
+  // Every remaining component is printed as ".<decimal>".
+  while (CBS_len(&copy) != 0) {
+    if (!parse_base128_integer(&copy, &v) ||
+        !CBB_add_u8(&cbb, '.') ||
+        !add_decimal(&cbb, v)) {
+      goto err;
+    }
+  }
+
+  // NUL-terminate the text and take the buffer out of |cbb|.
+  uint8_t *txt;
+  size_t txt_len;
+  if (!CBB_add_u8(&cbb, '\0') ||
+      !CBB_finish(&cbb, &txt, &txt_len)) {
+    goto err;
+  }
+
+  return (char *)txt;
+
+err:
+  CBB_cleanup(&cbb);
+  return NULL;
+}
@@ -171,6 +171,10 @@ my @ret;
 $code.=<<___;
 #include <openssl/arm_arch.h>

+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch  armv7-a
+
 .text
 #if defined(__thumb2__) || defined(__clang__)
 .syntax	unified
@@ -28,6 +28,7 @@
 # Denver		4.50/+82%       2.63		2.67(*)
 # X-Gene		9.50/+46%       8.82		8.89(*)
 # Mongoose		8.00/+44%	3.64		3.25
+# Kryo			8.17/+50%	4.83		4.65
 #
 # (*)	it's expected that doubling interleave factor doesn't help
 #	all processors, only those with higher NEON latency and
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -22,6 +22,7 @@ add_library(
  e_rc4.c
  e_aesgcmsiv.c
  e_aesctrhmac.c
+  e_aesccm.c
  e_chacha20poly1305.c

  tls_cbc.c
@@ -93,6 +93,8 @@ static const struct KnownAEAD kAEADs[] = {
     "aes_128_ctr_hmac_sha256.txt", false, true, 0},
    {"AES_256_CTR_HMAC_SHA256", EVP_aead_aes_256_ctr_hmac_sha256,
     "aes_256_ctr_hmac_sha256.txt", false, true, 0},
+    {"AES_128_CCM_BLUETOOTH", EVP_aead_aes_128_ccm_bluetooth,
+     "aes_128_ccm_bluetooth_tests.txt", false, false, 0},
 };

 class PerAEADTest : public testing::TestWithParam<KnownAEAD> {
@@ -651,3 +653,39 @@ TEST(AEADTest, AESGCMEmptyNonce) {
  EXPECT_EQ(ERR_LIB_CIPHER, ERR_GET_LIB(err));
  EXPECT_EQ(CIPHER_R_INVALID_NONCE_SIZE, ERR_GET_REASON(err));
 }
+
+// Seals and re-opens a 16-byte message with 65536 bytes of additional data
+// using AES-128-CCM-Bluetooth, and checks the ciphertext and tag against fixed
+// expected values.
+TEST(AEADTest, AESCCMLargeAD) {
+  static const std::vector<uint8_t> kKey(16, 'A');
+  static const std::vector<uint8_t> kNonce(13, 'N');
+  static const std::vector<uint8_t> kAD(65536, 'D');
+  static const std::vector<uint8_t> kPlaintext = {
+      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  static const std::vector<uint8_t> kCiphertext = {
+      0xa2, 0x12, 0x3f, 0x0b, 0x07, 0xd5, 0x02, 0xff,
+      0xa9, 0xcd, 0xa0, 0xf3, 0x69, 0x1c, 0x49, 0x0c};
+  static const std::vector<uint8_t> kTag = {0x4a, 0x31, 0x82, 0x96};
+
+  // Test AES-128-CCM-Bluetooth.
+  bssl::ScopedEVP_AEAD_CTX ctx;
+  ASSERT_TRUE(EVP_AEAD_CTX_init(ctx.get(), EVP_aead_aes_128_ccm_bluetooth(),
+                                kKey.data(), kKey.size(),
+                                EVP_AEAD_DEFAULT_TAG_LENGTH, nullptr));
+
+  std::vector<uint8_t> out(kCiphertext.size() + kTag.size());
+  size_t out_len;
+  // |out_len| and |out| are consumed immediately below, so a failed seal must
+  // stop the test here rather than continue with outputs that mean nothing.
+  ASSERT_TRUE(EVP_AEAD_CTX_seal(ctx.get(), out.data(), &out_len, out.size(),
+                                kNonce.data(), kNonce.size(), kPlaintext.data(),
+                                kPlaintext.size(), kAD.data(), kAD.size()));
+
+  ASSERT_EQ(out_len, kCiphertext.size() + kTag.size());
+  EXPECT_EQ(Bytes(kCiphertext), Bytes(out.data(), kCiphertext.size()));
+  EXPECT_EQ(Bytes(kTag), Bytes(out.data() + kCiphertext.size(), kTag.size()));
+
+  // Open in place: |out| is both the input and the output buffer. As above,
+  // the results are only worth checking if the call succeeded.
+  ASSERT_TRUE(EVP_AEAD_CTX_open(ctx.get(), out.data(), &out_len, out.size(),
+                                kNonce.data(), kNonce.size(), out.data(),
+                                out.size(), kAD.data(), kAD.size()));
+
+  ASSERT_EQ(out_len, kPlaintext.size());
+  EXPECT_EQ(Bytes(kPlaintext), Bytes(out.data(), kPlaintext.size()));
+}
@@ -0,0 +1,171 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/aead.h>
+
+#include <assert.h>
+
+#include <openssl/cipher.h>
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "../fipsmodule/cipher/internal.h"
+
+
+#define EVP_AEAD_AES_CCM_BLUETOOTH_TAG_LEN 4
+#define EVP_AEAD_AES_CCM_BLUETOOTH_NONCE_LEN 13
+
+#define EVP_AEAD_AES_CCM_MAX_TAG_LEN 16
+
+// aead_aes_ccm_ctx is the per-context state stored in |EVP_AEAD_CTX.aead_state|
+// by the AES-CCM AEADs in this file: the AES key schedule plus the CCM
+// parameters.
+struct aead_aes_ccm_ctx {
+  union {
+    // NOTE(review): |align| is never read in this file; presumably it exists
+    // only to force the alignment of |ks| — confirm.
+    double align;
+    AES_KEY ks;
+  } ks;
+  CCM128_CONTEXT ccm;
+};
+
+// aead_aes_ccm_bluetooth_init sets up |ctx| for AES-128-CCM with the Bluetooth
+// parameters: a 16-byte key and a tag of exactly
+// |EVP_AEAD_AES_CCM_BLUETOOTH_TAG_LEN| bytes (also selected when |tag_len| is
+// |EVP_AEAD_DEFAULT_TAG_LENGTH|). On success the allocated state is stored in
+// |ctx->aead_state| and one is returned; on failure zero is returned.
+static int aead_aes_ccm_bluetooth_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
+                                       size_t key_len, size_t tag_len) {
+  if (key_len != 16) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH);
+    return 0;  // EVP_AEAD_CTX_init should catch this.
+  }
+
+  // Resolve the default, then insist on the one supported tag length.
+  const size_t wanted_tag_len = tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH
+                                    ? EVP_AEAD_AES_CCM_BLUETOOTH_TAG_LEN
+                                    : tag_len;
+  if (wanted_tag_len != EVP_AEAD_AES_CCM_BLUETOOTH_TAG_LEN) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE);
+    return 0;
+  }
+
+  struct aead_aes_ccm_ctx *state = OPENSSL_malloc(sizeof(*state));
+  if (state == NULL) {
+    OPENSSL_PUT_ERROR(CIPHER, ERR_R_MALLOC_FAILURE);
+    return 0;
+  }
+
+  block128_f block_fn;
+  ctr128_f ctr_fn =
+      aes_ctr_set_key(&state->ks.ks, NULL, &block_fn, key, key_len);
+  ctx->tag_len = wanted_tag_len;
+
+  // The last argument is the CCM length-field size: 15 minus the nonce length.
+  const int ccm_ok = CRYPTO_ccm128_init(
+      &state->ccm, &state->ks.ks, block_fn, ctr_fn, wanted_tag_len,
+      15 - EVP_AEAD_AES_CCM_BLUETOOTH_NONCE_LEN);
+  if (!ccm_ok) {
+    OPENSSL_PUT_ERROR(CIPHER, ERR_R_INTERNAL_ERROR);
+    OPENSSL_free(state);
+    return 0;
+  }
+
+  ctx->aead_state = state;
+  return 1;
+}
+
+// aead_aes_ccm_cleanup releases the state that the init function stored in
+// |ctx->aead_state|.
+static void aead_aes_ccm_cleanup(EVP_AEAD_CTX *ctx) {
+  OPENSSL_free(ctx->aead_state);
+}
+
+// aead_aes_ccm_seal_scatter encrypts |in_len| bytes from |in| into |out| and
+// writes a |ctx->tag_len|-byte tag to |out_tag|, authenticating |ad| as well.
+// |extra_in| and |extra_in_len| are not used. It returns one and sets
+// |*out_tag_len| on success; on failure it returns zero and pushes an error.
+static int aead_aes_ccm_seal_scatter(
+    const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag,
+    size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce,
+    size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in,
+    size_t extra_in_len, const uint8_t *ad, size_t ad_len) {
+  const struct aead_aes_ccm_ctx *state = ctx->aead_state;
+  const size_t tag_len = ctx->tag_len;
+
+  // Validate the arguments in a fixed order: input size, tag buffer, nonce.
+  const size_t max_in_len = CRYPTO_ccm128_max_input(&state->ccm);
+  if (in_len > max_in_len) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
+    return 0;
+  }
+
+  if (tag_len > max_out_tag_len) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL);
+    return 0;
+  }
+
+  if (EVP_AEAD_nonce_length(ctx->aead) != nonce_len) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE);
+    return 0;
+  }
+
+  const int sealed =
+      CRYPTO_ccm128_encrypt(&state->ccm, &state->ks.ks, out, out_tag, tag_len,
+                            nonce, nonce_len, in, in_len, ad, ad_len);
+  if (!sealed) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
+    return 0;
+  }
+
+  *out_tag_len = tag_len;
+  return 1;
+}
+
+// aead_aes_ccm_open_gather decrypts |in_len| bytes from |in| into |out|,
+// recomputes the tag over the message and |ad|, and compares it with |in_tag|
+// using |CRYPTO_memcmp|. It returns one if the tag matches; otherwise it
+// returns zero and pushes an error.
+static int aead_aes_ccm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out,
+                                    const uint8_t *nonce, size_t nonce_len,
+                                    const uint8_t *in, size_t in_len,
+                                    const uint8_t *in_tag, size_t in_tag_len,
+                                    const uint8_t *ad, size_t ad_len) {
+  const struct aead_aes_ccm_ctx *state = ctx->aead_state;
+  const size_t tag_len = ctx->tag_len;
+
+  // Validate the arguments in a fixed order: input size, nonce, tag length.
+  const size_t max_in_len = CRYPTO_ccm128_max_input(&state->ccm);
+  if (in_len > max_in_len) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
+    return 0;
+  }
+
+  if (EVP_AEAD_nonce_length(ctx->aead) != nonce_len) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE);
+    return 0;
+  }
+
+  if (tag_len != in_tag_len) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
+    return 0;
+  }
+
+  // The recomputed tag goes into a local buffer sized for the largest tag.
+  uint8_t computed_tag[EVP_AEAD_AES_CCM_MAX_TAG_LEN];
+  assert(tag_len <= EVP_AEAD_AES_CCM_MAX_TAG_LEN);
+  const int opened = CRYPTO_ccm128_decrypt(&state->ccm, &state->ks.ks, out,
+                                           computed_tag, tag_len, nonce,
+                                           nonce_len, in, in_len, ad, ad_len);
+  if (!opened) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
+    return 0;
+  }
+
+  const int tag_mismatch = CRYPTO_memcmp(computed_tag, in_tag, tag_len) != 0;
+  if (tag_mismatch) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
+    return 0;
+  }
+
+  return 1;
+}
+
+// aead_aes_128_ccm_bluetooth is the method table for AES-128-CCM with the
+// Bluetooth nonce and tag lengths. Entries left NULL are not provided by this
+// file.
+static const EVP_AEAD aead_aes_128_ccm_bluetooth = {
+    16,  // presumably the key length; matches the |key_len| check in init
+    EVP_AEAD_AES_CCM_BLUETOOTH_NONCE_LEN,  // nonce length
+    EVP_AEAD_AES_CCM_BLUETOOTH_TAG_LEN,    // overhead
+    EVP_AEAD_AES_CCM_BLUETOOTH_TAG_LEN,    // max tag length
+    0,                                     // seal_scatter_supports_extra_in
+
+    aead_aes_ccm_bluetooth_init,
+    NULL /* init_with_direction */,
+    aead_aes_ccm_cleanup,
+    NULL /* open */,
+    aead_aes_ccm_seal_scatter,
+    aead_aes_ccm_open_gather,
+    NULL /* get_iv */,
+    NULL /* tag_len */,
+};
+
+// EVP_aead_aes_128_ccm_bluetooth returns a pointer to the static
+// |aead_aes_128_ccm_bluetooth| method table.
+const EVP_AEAD *EVP_aead_aes_128_ccm_bluetooth(void) {
+  return &aead_aes_128_ccm_bluetooth;
+}
@@ -57,6 +57,8 @@
 #include <openssl/cipher.h>
 #include <openssl/nid.h>

+#include "../internal.h"
+

 #define c2l(c, l)                         \
  do {                                    \
@@ -191,8 +191,7 @@ static int aead_tls_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out,
  // block from encrypting the input and split the result between |out| and
  // |out_tag|. Then feed the rest.

-  const size_t early_mac_len =
-      (block_size - (in_len % block_size) % block_size);
+  const size_t early_mac_len = (block_size - (in_len % block_size)) % block_size;
  if (early_mac_len != 0) {
    assert(len + block_size - early_mac_len == in_len);
    uint8_t buf[EVP_MAX_BLOCK_LENGTH];
@@ -0,0 +1,20 @@
+KEY: 404142434445464748494a4b4c4d4e4f
+NONCE: 101112131415161718191a1b1c
+IN: 20212223
+AD: 0001020304050607
+CT: 69915dad
+TAG: 064617ca
+
+KEY: 404142434445464748494a4b4c4d4e4f
+NONCE: 101112131415161718191a1b1c
+IN: 202122232425262728292a2b2c2d2e2f
+AD: 0001020304050607
+CT: 69915dad1e84c6376a68c2967e4dab61
+TAG: 99763ebb
+
+KEY: 404142434445464748494a4b4c4d4e4f
+NONCE: 101112131415161718191a1b1c
+IN: 202122232425262728292a2b2c2d2e2f
+AD:
+CT: 69915dad1e84c6376a68c2967e4dab61
+TAG: c4630026
@@ -0,0 +1,55 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/cpu.h>
+
+#if defined(OPENSSL_AARCH64) && defined(OPENSSL_FUCHSIA) && \
+    !defined(OPENSSL_STATIC_ARMCAP)
+
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+#include <zircon/types.h>
+
+#include <openssl/arm_arch.h>
+
+#include "internal.h"
+
+extern uint32_t OPENSSL_armcap_P;
+
+// OPENSSL_cpuid_setup queries the CPU feature bits from Zircon and sets the
+// corresponding bits in |OPENSSL_armcap_P|. Nothing is set if the query fails
+// or ASIMD is not reported.
+void OPENSSL_cpuid_setup(void) {
+  uint32_t features;
+  if (zx_system_get_features(ZX_FEATURE_KIND_CPU, &features) != ZX_OK) {
+    return;
+  }
+  if ((features & ZX_ARM64_FEATURE_ISA_ASIMD) == 0) {
+    // Matching OpenSSL, if NEON/ASIMD is missing, don't report other features
+    // either.
+    return;
+  }
+
+  // Collect every capability first, then publish them with a single update.
+  uint32_t caps = ARMV7_NEON;
+  if ((features & ZX_ARM64_FEATURE_ISA_AES) != 0) {
+    caps |= ARMV8_AES;
+  }
+  if ((features & ZX_ARM64_FEATURE_ISA_PMULL) != 0) {
+    caps |= ARMV8_PMULL;
+  }
+  if ((features & ZX_ARM64_FEATURE_ISA_SHA1) != 0) {
+    caps |= ARMV8_SHA1;
+  }
+  if ((features & ZX_ARM64_FEATURE_ISA_SHA2) != 0) {
+    caps |= ARMV8_SHA256;
+  }
+
+  OPENSSL_armcap_P |= caps;
+}
+
+#endif  // OPENSSL_AARCH64 && OPENSSL_FUCHSIA && !OPENSSL_STATIC_ARMCAP
@@ -14,7 +14,8 @@

 #include <openssl/cpu.h>

-#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_STATIC_ARMCAP)
+#if defined(OPENSSL_AARCH64) && defined(OPENSSL_LINUX) && \
+    !defined(OPENSSL_STATIC_ARMCAP)

 #include <sys/auxv.h>

@@ -8,21 +8,12 @@ if (${ARCH} STREQUAL "arm")
  )
 endif()

-if (${ARCH} STREQUAL "x86_64")
-  set(
-    CURVE25519_ARCH_SOURCES
-
-    asm/x25519-asm-x86_64.S
-  )
-endif()
-
 add_library(
  curve25519

  OBJECT

  spake25519.c
-  x25519-x86_64.c

  ${CURVE25519_ARCH_SOURCES}
 )
@@ -44,6 +44,28 @@ TEST(Ed25519Test, TestVectors) {
  });
 }

+TEST(Ed25519Test, Malleability) {
+  // https://tools.ietf.org/html/rfc8032#section-5.1.7 adds a check that s is
+  // in the range [0, order). This prevents someone from adding a multiple of
+  // order to s and obtaining a second valid signature for the same message.
+  static const uint8_t kMsg[] = {0x54, 0x65, 0x73, 0x74};
+  static const uint8_t kSig[] = {
+      0x7c, 0x38, 0xe0, 0x26, 0xf2, 0x9e, 0x14, 0xaa, 0xbd, 0x05, 0x9a,
+      0x0f, 0x2d, 0xb8, 0xb0, 0xcd, 0x78, 0x30, 0x40, 0x60, 0x9a, 0x8b,
+      0xe6, 0x84, 0xdb, 0x12, 0xf8, 0x2a, 0x27, 0x77, 0x4a, 0xb0, 0x67,
+      0x65, 0x4b, 0xce, 0x38, 0x32, 0xc2, 0xd7, 0x6f, 0x8f, 0x6f, 0x5d,
+      0xaf, 0xc0, 0x8d, 0x93, 0x39, 0xd4, 0xee, 0xf6, 0x76, 0x57, 0x33,
+      0x36, 0xa5, 0xc5, 0x1e, 0xb6, 0xf9, 0x46, 0xb3, 0x1d,
+  };
+  static const uint8_t kPub[] = {
+      0x7d, 0x4d, 0x0e, 0x7f, 0x61, 0x53, 0xa6, 0x9b, 0x62, 0x42, 0xb5,
+      0x22, 0xab, 0xbe, 0xe6, 0x85, 0xfd, 0xa4, 0x42, 0x0f, 0x88, 0x34,
+      0xb1, 0x08, 0xc3, 0xbd, 0xae, 0x36, 0x9e, 0xf5, 0x49, 0xfa,
+  };
+
+  // Verification of this signature is expected to fail.
+  EXPECT_FALSE(ED25519_verify(kMsg, sizeof(kMsg), kSig, kPub));
+}
+
 TEST(Ed25519Test, KeypairFromSeed) {
  uint8_t public_key1[32], private_key1[64];
  ED25519_keypair(public_key1, private_key1);
@@ -1,247 +0,0 @@
-/* Copyright (c) 2015, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// This code is mostly taken from the ref10 version of Ed25519 in SUPERCOP
-// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
-// public domain but this file has the ISC license just to keep licencing
-// simple.
-//
-// The field functions are shared by Ed25519 and X25519 where possible.
-
-#include <openssl/curve25519.h>
-
-#include <string.h>
-
-#include "../internal.h"
-#include "../../third_party/fiat/internal.h"
-
-
-#if defined(BORINGSSL_X25519_X86_64)
-
-typedef struct { uint64_t v[5]; } fe25519;
-
-// These functions are defined in asm/x25519-x86_64.S
-void x25519_x86_64_work_cswap(fe25519 *, uint64_t);
-void x25519_x86_64_mul(fe25519 *out, const fe25519 *a, const fe25519 *b);
-void x25519_x86_64_square(fe25519 *out, const fe25519 *a);
-void x25519_x86_64_freeze(fe25519 *);
-void x25519_x86_64_ladderstep(fe25519 *work);
-
-static void fe25519_setint(fe25519 *r, unsigned v) {
-  r->v[0] = v;
-  r->v[1] = 0;
-  r->v[2] = 0;
-  r->v[3] = 0;
-  r->v[4] = 0;
-}
-
-// Assumes input x being reduced below 2^255
-static void fe25519_pack(unsigned char r[32], const fe25519 *x) {
-  fe25519 t;
-  t = *x;
-  x25519_x86_64_freeze(&t);
-
-  r[0] = (uint8_t)(t.v[0] & 0xff);
-  r[1] = (uint8_t)((t.v[0] >> 8) & 0xff);
-  r[2] = (uint8_t)((t.v[0] >> 16) & 0xff);
-  r[3] = (uint8_t)((t.v[0] >> 24) & 0xff);
-  r[4] = (uint8_t)((t.v[0] >> 32) & 0xff);
-  r[5] = (uint8_t)((t.v[0] >> 40) & 0xff);
-  r[6] = (uint8_t)((t.v[0] >> 48));
-
-  r[6] ^= (uint8_t)((t.v[1] << 3) & 0xf8);
-  r[7] = (uint8_t)((t.v[1] >> 5) & 0xff);
-  r[8] = (uint8_t)((t.v[1] >> 13) & 0xff);
-  r[9] = (uint8_t)((t.v[1] >> 21) & 0xff);
-  r[10] = (uint8_t)((t.v[1] >> 29) & 0xff);
-  r[11] = (uint8_t)((t.v[1] >> 37) & 0xff);
-  r[12] = (uint8_t)((t.v[1] >> 45));
-
-  r[12] ^= (uint8_t)((t.v[2] << 6) & 0xc0);
-  r[13] = (uint8_t)((t.v[2] >> 2) & 0xff);
-  r[14] = (uint8_t)((t.v[2] >> 10) & 0xff);
-  r[15] = (uint8_t)((t.v[2] >> 18) & 0xff);
-  r[16] = (uint8_t)((t.v[2] >> 26) & 0xff);
-  r[17] = (uint8_t)((t.v[2] >> 34) & 0xff);
-  r[18] = (uint8_t)((t.v[2] >> 42) & 0xff);
-  r[19] = (uint8_t)((t.v[2] >> 50));
-
-  r[19] ^= (uint8_t)((t.v[3] << 1) & 0xfe);
-  r[20] = (uint8_t)((t.v[3] >> 7) & 0xff);
-  r[21] = (uint8_t)((t.v[3] >> 15) & 0xff);
-  r[22] = (uint8_t)((t.v[3] >> 23) & 0xff);
-  r[23] = (uint8_t)((t.v[3] >> 31) & 0xff);
-  r[24] = (uint8_t)((t.v[3] >> 39) & 0xff);
-  r[25] = (uint8_t)((t.v[3] >> 47));
-
-  r[25] ^= (uint8_t)((t.v[4] << 4) & 0xf0);
-  r[26] = (uint8_t)((t.v[4] >> 4) & 0xff);
-  r[27] = (uint8_t)((t.v[4] >> 12) & 0xff);
-  r[28] = (uint8_t)((t.v[4] >> 20) & 0xff);
-  r[29] = (uint8_t)((t.v[4] >> 28) & 0xff);
-  r[30] = (uint8_t)((t.v[4] >> 36) & 0xff);
-  r[31] = (uint8_t)((t.v[4] >> 44));
-}
-
-static void fe25519_unpack(fe25519 *r, const uint8_t x[32]) {
-  r->v[0] = x[0];
-  r->v[0] += (uint64_t)x[1] << 8;
-  r->v[0] += (uint64_t)x[2] << 16;
-  r->v[0] += (uint64_t)x[3] << 24;
-  r->v[0] += (uint64_t)x[4] << 32;
-  r->v[0] += (uint64_t)x[5] << 40;
-  r->v[0] += ((uint64_t)x[6] & 7) << 48;
-
-  r->v[1] = x[6] >> 3;
-  r->v[1] += (uint64_t)x[7] << 5;
-  r->v[1] += (uint64_t)x[8] << 13;
-  r->v[1] += (uint64_t)x[9] << 21;
-  r->v[1] += (uint64_t)x[10] << 29;
-  r->v[1] += (uint64_t)x[11] << 37;
-  r->v[1] += ((uint64_t)x[12] & 63) << 45;
-
-  r->v[2] = x[12] >> 6;
-  r->v[2] += (uint64_t)x[13] << 2;
-  r->v[2] += (uint64_t)x[14] << 10;
-  r->v[2] += (uint64_t)x[15] << 18;
-  r->v[2] += (uint64_t)x[16] << 26;
-  r->v[2] += (uint64_t)x[17] << 34;
-  r->v[2] += (uint64_t)x[18] << 42;
-  r->v[2] += ((uint64_t)x[19] & 1) << 50;
-
-  r->v[3] = x[19] >> 1;
-  r->v[3] += (uint64_t)x[20] << 7;
-  r->v[3] += (uint64_t)x[21] << 15;
-  r->v[3] += (uint64_t)x[22] << 23;
-  r->v[3] += (uint64_t)x[23] << 31;
-  r->v[3] += (uint64_t)x[24] << 39;
-  r->v[3] += ((uint64_t)x[25] & 15) << 47;
-
-  r->v[4] = x[25] >> 4;
-  r->v[4] += (uint64_t)x[26] << 4;
-  r->v[4] += (uint64_t)x[27] << 12;
-  r->v[4] += (uint64_t)x[28] << 20;
-  r->v[4] += (uint64_t)x[29] << 28;
-  r->v[4] += (uint64_t)x[30] << 36;
-  r->v[4] += ((uint64_t)x[31] & 127) << 44;
-}
-
-static void fe25519_invert(fe25519 *r, const fe25519 *x) {
-  fe25519 z2;
-  fe25519 z9;
-  fe25519 z11;
-  fe25519 z2_5_0;
-  fe25519 z2_10_0;
-  fe25519 z2_20_0;
-  fe25519 z2_50_0;
-  fe25519 z2_100_0;
-  fe25519 t;
-  int i;
-
-  /* 2 */ x25519_x86_64_square(&z2, x);
-  /* 4 */ x25519_x86_64_square(&t, &z2);
-  /* 8 */ x25519_x86_64_square(&t, &t);
-  /* 9 */ x25519_x86_64_mul(&z9, &t, x);
-  /* 11 */ x25519_x86_64_mul(&z11, &z9, &z2);
-  /* 22 */ x25519_x86_64_square(&t, &z11);
-  /* 2^5 - 2^0 = 31 */ x25519_x86_64_mul(&z2_5_0, &t, &z9);
-
-  /* 2^6 - 2^1 */ x25519_x86_64_square(&t, &z2_5_0);
-  /* 2^20 - 2^10 */ for (i = 1; i < 5; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^10 - 2^0 */ x25519_x86_64_mul(&z2_10_0, &t, &z2_5_0);
-
-  /* 2^11 - 2^1 */ x25519_x86_64_square(&t, &z2_10_0);
-  /* 2^20 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^20 - 2^0 */ x25519_x86_64_mul(&z2_20_0, &t, &z2_10_0);
-
-  /* 2^21 - 2^1 */ x25519_x86_64_square(&t, &z2_20_0);
-  /* 2^40 - 2^20 */ for (i = 1; i < 20; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^40 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_20_0);
-
-  /* 2^41 - 2^1 */ x25519_x86_64_square(&t, &t);
-  /* 2^50 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^50 - 2^0 */ x25519_x86_64_mul(&z2_50_0, &t, &z2_10_0);
-
-  /* 2^51 - 2^1 */ x25519_x86_64_square(&t, &z2_50_0);
-  /* 2^100 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^100 - 2^0 */ x25519_x86_64_mul(&z2_100_0, &t, &z2_50_0);
-
-  /* 2^101 - 2^1 */ x25519_x86_64_square(&t, &z2_100_0);
-  /* 2^200 - 2^100 */ for (i = 1; i < 100; i++) {
-    x25519_x86_64_square(&t, &t);
-  }
-  /* 2^200 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_100_0);
-
-  /* 2^201 - 2^1 */ x25519_x86_64_square(&t, &t);
-  /* 2^250 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^250 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_50_0);
-
-  /* 2^251 - 2^1 */ x25519_x86_64_square(&t, &t);
-  /* 2^252 - 2^2 */ x25519_x86_64_square(&t, &t);
-  /* 2^253 - 2^3 */ x25519_x86_64_square(&t, &t);
-
-  /* 2^254 - 2^4 */ x25519_x86_64_square(&t, &t);
-
-  /* 2^255 - 2^5 */ x25519_x86_64_square(&t, &t);
-  /* 2^255 - 21 */ x25519_x86_64_mul(r, &t, &z11);
-}
-
-static void mladder(fe25519 *xr, fe25519 *zr, const uint8_t s[32]) {
-  fe25519 work[5];
-
-  work[0] = *xr;
-  fe25519_setint(work + 1, 1);
-  fe25519_setint(work + 2, 0);
-  work[3] = *xr;
-  fe25519_setint(work + 4, 1);
-
-  int i, j;
-  uint8_t prevbit = 0;
-
-  j = 6;
-  for (i = 31; i >= 0; i--) {
-    while (j >= 0) {
-      const uint8_t bit = 1 & (s[i] >> j);
-      const uint64_t swap = bit ^ prevbit;
-      prevbit = bit;
-      x25519_x86_64_work_cswap(work + 1, swap);
-      x25519_x86_64_ladderstep(work);
-      j -= 1;
-    }
-    j = 7;
-  }
-
-  *xr = work[1];
-  *zr = work[2];
-}
-
-void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32],
-                  const uint8_t point[32]) {
-  uint8_t e[32];
-  OPENSSL_memcpy(e, scalar, sizeof(e));
-
-  e[0] &= 248;
-  e[31] &= 127;
-  e[31] |= 64;
-
-  fe25519 t;
-  fe25519 z;
-  fe25519_unpack(&t, point);
-  mladder(&t, &z, e);
-  fe25519_invert(&z, &z);
-  x25519_x86_64_mul(&t, &t, &z);
-  fe25519_pack(out, &t);
-}
-
-#endif  // BORINGSSL_X25519_X86_64
@@ -239,11 +239,6 @@ int DSA_generate_parameters_ex(DSA *dsa, unsigned bits, const uint8_t *seed_in,
  }
  BN_CTX_start(ctx);

-  mont = BN_MONT_CTX_new();
-  if (mont == NULL) {
-    goto err;
-  }
-
  r0 = BN_CTX_get(ctx);
  g = BN_CTX_get(ctx);
  W = BN_CTX_get(ctx);
@@ -401,8 +396,9 @@ end:
    goto err;
  }

-  if (!BN_set_word(test, h) ||
-      !BN_MONT_CTX_set(mont, p, ctx)) {
+  mont = BN_MONT_CTX_new_for_modulus(p, ctx);
+  if (mont == NULL ||
+      !BN_set_word(test, h)) {
    goto err;
  }

@@ -839,7 +835,7 @@ int DSA_size(const DSA *dsa) {
 static int dsa_sign_setup(const DSA *dsa, BN_CTX *ctx_in, BIGNUM **out_kinv,
                          BIGNUM **out_r) {
  BN_CTX *ctx;
-  BIGNUM k, kq, *kinv = NULL, *r = NULL;
+  BIGNUM k, *kinv = NULL, *r = NULL;
  int ret = 0;

  if (!dsa->p || !dsa->q || !dsa->g) {
@@ -848,7 +844,6 @@ static int dsa_sign_setup(const DSA *dsa, BN_CTX *ctx_in, BIGNUM **out_kinv,
  }

  BN_init(&k);
-  BN_init(&kq);

  ctx = ctx_in;
  if (ctx == NULL) {
@@ -859,54 +854,22 @@ static int dsa_sign_setup(const DSA *dsa, BN_CTX *ctx_in, BIGNUM **out_kinv,
  }

  r = BN_new();
-  if (r == NULL) {
-    goto err;
-  }
-
-  // Get random k
-  if (!BN_rand_range_ex(&k, 1, dsa->q)) {
-    goto err;
-  }
-
-  if (!BN_MONT_CTX_set_locked((BN_MONT_CTX **)&dsa->method_mont_p,
+  kinv = BN_new();
+  if (r == NULL || kinv == NULL ||
+      // Get random k
+      !BN_rand_range_ex(&k, 1, dsa->q) ||
+      !BN_MONT_CTX_set_locked((BN_MONT_CTX **)&dsa->method_mont_p,
                              (CRYPTO_MUTEX *)&dsa->method_mont_lock, dsa->p,
                              ctx) ||
      !BN_MONT_CTX_set_locked((BN_MONT_CTX **)&dsa->method_mont_q,
                              (CRYPTO_MUTEX *)&dsa->method_mont_lock, dsa->q,
-                              ctx)) {
-    goto err;
-  }
-
-  // Compute r = (g^k mod p) mod q
-  if (!BN_copy(&kq, &k)) {
-    goto err;
-  }
-
-  // We do not want timing information to leak the length of k,
-  // so we compute g^k using an equivalent exponent of fixed length.
-  //
-  // (This is a kludge that we need because the BN_mod_exp_mont()
-  // does not let us specify the desired timing behaviour.)
-
-  if (!BN_add(&kq, &kq, dsa->q)) {
-    goto err;
-  }
-  if (BN_num_bits(&kq) <= BN_num_bits(dsa->q) && !BN_add(&kq, &kq, dsa->q)) {
-    goto err;
-  }
-
-  if (!BN_mod_exp_mont_consttime(r, dsa->g, &kq, dsa->p, ctx,
-                                 dsa->method_mont_p)) {
-    goto err;
-  }
-  if (!BN_mod(r, r, dsa->q, ctx)) {
-    goto err;
-  }
-
-  // Compute part of 's = inv(k) (m + xr) mod q' using Fermat's Little
-  // Theorem.
-  kinv = BN_new();
-  if (kinv == NULL ||
+                              ctx) ||
+      // Compute r = (g^k mod p) mod q
+      !BN_mod_exp_mont_consttime(r, dsa->g, &k, dsa->p, ctx,
+                                 dsa->method_mont_p) ||
+      !BN_mod(r, r, dsa->q, ctx) ||
+      // Compute part of 's = inv(k) (m + xr) mod q' using Fermat's Little
+      // Theorem.
      !bn_mod_inverse_prime(kinv, &k, dsa->q, ctx, dsa->method_mont_q)) {
    goto err;
  }
@@ -930,7 +893,6 @@ err:
    BN_CTX_free(ctx);
  }
  BN_clear_free(&k);
-  BN_clear_free(&kq);
  BN_clear_free(kinv);
  return ret;
 }
@@ -52,6 +52,7 @@ SSL,254,DOWNGRADE_DETECTED
 SSL,143,DTLS_MESSAGE_TOO_BIG
 SSL,257,DUPLICATE_EXTENSION
 SSL,264,DUPLICATE_KEY_SHARE
+SSL,283,EARLY_DATA_NOT_IN_USE
 SSL,144,ECC_CERT_NOT_FOR_SIGNING
 SSL,282,EMPTY_HELLO_RETRY_REQUEST
 SSL,145,EMS_STATE_INCONSISTENT
@@ -64,6 +65,7 @@ SSL,151,EXTRA_DATA_IN_MESSAGE
 SSL,152,FRAGMENT_MISMATCH
 SSL,153,GOT_NEXT_PROTO_WITHOUT_EXTENSION
 SSL,154,HANDSHAKE_FAILURE_ON_CLIENT_HELLO
+SSL,284,HANDSHAKE_NOT_COMPLETE
 SSL,155,HTTPS_PROXY_REQUEST
 SSL,156,HTTP_REQUEST
 SSL,157,INAPPROPRIATE_FALLBACK
@@ -84,6 +86,7 @@ SSL,167,MISSING_TMP_ECDH_KEY
 SSL,168,MIXED_SPECIAL_OPERATOR_WITH_GROUPS
 SSL,169,MTU_TOO_SMALL
 SSL,170,NEGOTIATED_BOTH_NPN_AND_ALPN
+SSL,285,NEGOTIATED_TB_WITHOUT_EMS_OR_RI
 SSL,171,NESTED_GROUP
 SSL,172,NO_CERTIFICATES_RETURNED
 SSL,173,NO_CERTIFICATE_ASSIGNED
@@ -114,6 +117,7 @@ SSL,191,PATH_TOO_LONG
 SSL,192,PEER_DID_NOT_RETURN_A_CERTIFICATE
 SSL,193,PEER_ERROR_UNSUPPORTED_CERTIFICATE_TYPE
 SSL,267,PRE_SHARED_KEY_MUST_BE_LAST
+SSL,287,PRIVATE_KEY_OPERATION_FAILED
 SSL,194,PROTOCOL_IS_SHUTDOWN
 SSL,271,PSK_IDENTITY_BINDER_COUNT_MISMATCH
 SSL,195,PSK_IDENTITY_NOT_FOUND
@@ -131,6 +135,7 @@ SSL,205,RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION
 SSL,206,SCSV_RECEIVED_WHEN_RENEGOTIATING
 SSL,207,SERVERHELLO_TLSEXT
 SSL,273,SERVER_CERT_CHANGED
+SSL,286,SERVER_ECHOED_INVALID_SESSION_ID
 SSL,208,SESSION_ID_CONTEXT_UNINITIALIZED
 SSL,209,SESSION_MAY_NOT_BE_CREATED
 SSL,250,SHUTDOWN_WHILE_IN_INIT
@@ -545,6 +545,28 @@ OAEPLabel = 00112233445566778899aabbccddeeff
 Input = "Hello World"
 CheckDecrypt

+# Though we will never generate such a key, test that RSA keys where p < q work
+# properly.
+PrivateKey = RSA-Swapped
+Type = RSA
+Input = 30820275020100300d06092a864886f70d01010105000482025f3082025b02010002818100ab28f98747934779011417d5bbb4095eae6f48ed09e13081616cf390aac75b10a206a98953d402647dfef7fa363be2765a303b05ec388bd9a1d75123a1205b4ecb43c33f2e37d3e30842181d694a3acfc39afc52554946e699d97d97066596a46725ce6dea322623afcafecbd2884d9a0c5eae9c4d7da8874c29c19edb762e1902030100010281800d637ea568e169f15ab6be288f6ec55edd29425c9c6dbb941b5160fa1b89cda34ef15378b5107c016d63b0f52721e71497f876dd7f3d6b1f228c4bc20c3c12384644200e91130c9195660d1e706f55b2accf00c5e2174a1d9ee289f0e763ee58860485ec97d19d7fa2df38af5b5910b1fa52087768d288e6ec4c8d5eca23c8d3024100be757a24dc2c923692d964693b2d71ca33ccb2f946f9e5232d2090b715a97dca554068fab8876105bc9ed6dccfd0917c5e0b80339306535c3eeb787e89397bc7024100e60f5c9e52434da079b8c641791a81a96daa4d9921a07e5b48292a9fce230df7c9fc2b97b5e38834ed5caaa387a0bca35c474e989a68dd65b79a6f691a74471f0240438ccf017bc5a3260ff76291a01782204136fcd344c524ebd0f997da17a8c1a09d93f6a7d602cdfa86e79f3539cfb389f4a1079b432e1f2abc762f8a51893dc9024046604ca4e1e554c9d27283b363a888219c3a8ca25b770d303f52d8872a37eefdedfc0619d2ba57e058fc0ff71676453e73ec1c4ef26d41ccebed824754a05d6102404445374d8450e753e0a42085b56b0d6d500b3e3518536dc8f12ec8fd77aa75491835327ac0e12d73b5c3f1b09d03f6a24fe63b9c551dee6559b625435ec92429
+
+Sign = RSA-Swapped
+Digest = SHA256
+Input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+Output = 07fa4e3de9c002c41c952dc292ef5a814c4c17dc1a6cf958c4c971e8089676d6661b442270ef9295c41e5385c9628aa1bdee2cc2558b8473ba212f2ba04b9ff2264c19187b9506b1d0a1cc2751844cc8dedf555d62ce81bc0e70bfe83d0184ee964593af91b9b327c0fb272c799148cd8737d412cbf36c2ad25fd66977bf805f
+
+# Though we will never generate such a key, test that RSA keys where p and q are
+# different sizes work properly.
+PrivateKey = RSA-PrimeMismatch
+Type = RSA
+Input = 30820295020100300d06092a864886f70d01010105000482027f3082027b02010002818100c766f4fef89f5e9a8e13ed500fb38523ea94d7f8be066900eee58c913b4c6fdcb13d63d39b9108feabcefd1ffd04776403dc58f968ae817977d0809e567d8af512d604a0e9cb448fa5e402204ee519712a5ebbfd002faf8169495a782f54366b4665aac0d968bfec63c5446b6f9b13061c7f3d1f3f1b6bede8fff881b410a66f0203010001028180528c062f49485c771a0b18ca747d8a47f8941ea63c305626cb3f1f067e6861c4441c432687dbd08d484aac3b01f3ffdc3b762c719167f7cb22e565aa6acd597306ef6f7828b9720e9d440816186d940c4c5a9720dddf71fe0b59483f02a751515c8c27e43c575d6725d55f5bb77e0f977773b00afc058cfab6617ec90d0b62a9026100cb8f97c37b4fbc298b645bc3dc0526f8a4274e9a193b33c3acb76499b5b96330e4b586cbaa56368ffc12644952322253bc669496d572c0980f125fd7273739cf790d24401052b13732114d397c8c16a44716dc62d2320fb1ced99290dfd53e07022100fac51ac653609cdaba53280c6b6f209052e270be0c3c68fe8b37d6bf05fbba59026038dff2f04c58d7e2e7ae6fb1469d2de954bc22cb0d77ac1be4fb0ca1a1d39d7240c4b357de4cde4bd68b30f8077e38771af1b25c7e60e48cd7d1337402e1fc460ab57046720918b8aa4589452196669119c7ba65e602d4bdc264a9fdce7c5f2b0220773af0180bdc8bb7938fa6230191bcb1e236b7d4248d347e9242e25fc0c0874102605c4894cde334889f5b52ed8f86a2ee9c1fbe4166287e24ce44f3093bff383962f08043842f6ff3e6002104b0e29442c4a4483c5d06e2254fbe5e3930de3d0e28af10e96c6e341a4b8859382dbba24536a38ae71118e3e22413a93f298a7f744c
+
+Sign = RSA-PrimeMismatch
+Digest = SHA256
+Input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+Output = 6192b1ce630c87d02e8245fd74d4f6ecac37eef979d188c8fa48c4d355fbe814e7dd3152f42bb020d769b540d11867af5b947387b8c99158d56901ff3708e423931178213916ae1002f162c9d497aacacdcb20e6ffe7ed40138a253fc943ddf3587433df5831a3ce46aeefce358a009bf6bad12d82d77424c2755d984d7da196
+

 # EC tests

@@ -8,7 +8,7 @@ Please note that we cannot answer questions about FIPS, nor about using BoringSS

 BoringCrypto has undergone the following validations:

-1. 2017-06-15: certificate [#2964](http://csrc.nist.gov/groups/STM/cmvp/documents/140-1/1401val2017.htm#2964), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Security-Policy-20170615.docx) (in docx format).
+1. 2017-06-15: certificate [#2964](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/2964), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Security-Policy-20170615.docx) (in docx format).

 ## Running CAVP tests

@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -32,7 +39,7 @@
 # for scaling too, I [try to] avoid the latter by favoring off-by-2
 # shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
 #
-# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
+# As was shown by Dean Gaudet, the above note turned out to be
 # void. Performance improvement with off-by-2 shifts was observed on
 # intermediate implementation, which was spilling yet another register
 # to stack... Final offset*4 code below runs just a tad faster on P4,
@@ -48,8 +55,8 @@
 # better performance on most recent µ-archs...
 #
 # Third version adds AES_cbc_encrypt implementation, which resulted in
-# up to 40% performance imrovement of CBC benchmark results. 40% was
-# observed on P4 core, where "overall" imrovement coefficient, i.e. if
+# up to 40% performance improvement of CBC benchmark results. 40% was
+# observed on P4 core, where "overall" improvement coefficient, i.e. if
 # compared to PIC generated by GCC and in CBC mode, was observed to be
 # as large as 4x:-) CBC performance is virtually identical to ECB now
 # and on some platforms even better, e.g. 17.6 "small" cycles/byte on
@@ -152,7 +159,7 @@
 # combinations then attack becomes infeasible. This is why revised
 # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
 # of data is to be processed in one stroke. The current size limit of
-# 512 bytes is chosen to provide same [diminishigly low] probability
+# 512 bytes is chosen to provide same [diminishingly low] probability
 # for cache-line to remain untouched in large chunk operation with
 # large S-box as for single block operation with compact S-box and
 # surely needs more careful consideration...
@@ -164,12 +171,12 @@
 # yield execution to process performing AES just before timer fires
 # off the scheduler, immediately regain control of CPU and analyze the
 # cache state. For this attack to be efficient attacker would have to
-# effectively slow down the operation by several *orders* of magnitute,
+# effectively slow down the operation by several *orders* of magnitude,
 # by ratio of time slice to duration of handful of AES rounds, which is
 # unlikely to remain unnoticed. Not to mention that this also means
-# that he would spend correspondigly more time to collect enough
+# that he would spend correspondingly more time to collect enough
 # statistical data to mount the attack. It's probably appropriate to
-# say that if adeversary reckons that this attack is beneficial and
+# say that if adversary reckons that this attack is beneficial and
 # risks to be noticed, you probably have larger problems having him
 # mere opportunity. In other words suggested code design expects you
 # to preclude/mitigate this attack by overall system security design.
@@ -233,7 +240,7 @@ $small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
 			# contention and in hope to "collect" 5% back
 			# in real-life applications...

-$vertical_spin=0;	# shift "verticaly" defaults to 0, because of
+$vertical_spin=0;	# shift "vertically" defaults to 0, because of
 			# its proof-of-concept status...
 # Note that there is no decvert(), as well as last encryption round is
 # performed with "horizontal" shifts. This is because this "vertical"
@@ -1599,7 +1606,7 @@ sub decstep()
 	# no instructions are reordered, as performance appears
 	# optimal... or rather that all attempts to reorder didn't
 	# result in better performance [which by the way is not a
-	# bit lower than ecryption].
+	# bit lower than encryption].
 	if($i==3)   {	&mov	($key,$__key);			}
 	else        {	&mov	($out,$s[0]);			}
 			&and	($out,0xFF);
@@ -76,6 +76,11 @@ $code=<<___;
 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
 #endif

+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 AES
+@ instructions are in aesv8-armx.pl.)
+.arch  armv7-a
+
 .text
 #if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
@@ -203,7 +208,7 @@ asm_AES_encrypt:
 	adr	r3,.
 #endif
 	stmdb   sp!,{r1,r4-r12,lr}
-#ifdef	__APPLE__
+#if defined(__thumb2__) || defined(__APPLE__)
 	adr	$tbl,AES_Te
 #else
 	sub	$tbl,r3,#asm_AES_encrypt-AES_Te	@ Te
@@ -481,7 +486,7 @@ _armv4_AES_set_encrypt_key:
 	mov	lr,r1			@ bits
 	mov	$key,r2			@ key

-#ifdef	__APPLE__
+#if defined(__thumb2__) || defined(__APPLE__)
 	adr	$tbl,AES_Te+1024				@ Te4
 #else
 	sub	$tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
@@ -979,7 +984,7 @@ asm_AES_decrypt:
 	adr	r3,.
 #endif
 	stmdb   sp!,{r1,r4-r12,lr}
-#ifdef	__APPLE__
+#if defined(__thumb2__) || defined(__APPLE__)
 	adr	$tbl,AES_Td
 #else
 	sub	$tbl,r3,#asm_AES_decrypt-AES_Td	@ Td
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -590,13 +597,21 @@ $code.=<<___;
 .type	asm_AES_encrypt,\@function,3
 .hidden	asm_AES_encrypt
 asm_AES_encrypt:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15

 	# allocate frame "above" key schedule
 	lea	-63(%rdx),%rcx	# %rdx is key argument
@@ -609,6 +624,7 @@ asm_AES_encrypt:

 	mov	%rsi,16(%rsp)	# save out
 	mov	%rax,24(%rsp)	# save original stack pointer
+.cfi_cfa_expression	%rsp+24,deref,+8
 .Lenc_prologue:

 	mov	%rdx,$key
@@ -635,20 +651,29 @@ asm_AES_encrypt:

 	mov	16(%rsp),$out	# restore out
 	mov	24(%rsp),%rsi	# restore saved stack pointer
+.cfi_def_cfa	%rsi,8
 	mov	$s0,0($out)	# write output vector
 	mov	$s1,4($out)
 	mov	$s2,8($out)
 	mov	$s3,12($out)

 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lenc_epilogue:
 	ret
+.cfi_endproc
 .size	asm_AES_encrypt,.-asm_AES_encrypt
 ___

@@ -1186,13 +1211,21 @@ $code.=<<___;
 .type	asm_AES_decrypt,\@function,3
 .hidden	asm_AES_decrypt
 asm_AES_decrypt:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15

 	# allocate frame "above" key schedule
 	lea	-63(%rdx),%rcx	# %rdx is key argument
@@ -1205,6 +1238,7 @@ asm_AES_decrypt:

 	mov	%rsi,16(%rsp)	# save out
 	mov	%rax,24(%rsp)	# save original stack pointer
+.cfi_cfa_expression	%rsp+24,deref,+8
 .Ldec_prologue:

 	mov	%rdx,$key
@@ -1233,20 +1267,29 @@ asm_AES_decrypt:

 	mov	16(%rsp),$out	# restore out
 	mov	24(%rsp),%rsi	# restore saved stack pointer
+.cfi_def_cfa	%rsi,8
 	mov	$s0,0($out)	# write output vector
 	mov	$s1,4($out)
 	mov	$s2,8($out)
 	mov	$s3,12($out)

 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Ldec_epilogue:
 	ret
+.cfi_endproc
 .size	asm_AES_decrypt,.-asm_AES_decrypt
 ___
 #------------------------------------------------------------------#
@@ -1284,22 +1327,34 @@ $code.=<<___;
 .globl asm_AES_set_encrypt_key
 .type  asm_AES_set_encrypt_key,\@function,3
 asm_AES_set_encrypt_key:
+.cfi_startproc
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12			# redundant, but allows to share
+.cfi_push	%r12
 	push	%r13			# exception handler...
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	sub	\$8,%rsp
+.cfi_adjust_cfa_offset	8
 .Lenc_key_prologue:

 	call	_x86_64_AES_set_encrypt_key

 	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
 	mov	48(%rsp),%rbx
+.cfi_restore	%rbx
 	add	\$56,%rsp
+.cfi_adjust_cfa_offset	-56
 .Lenc_key_epilogue:
 	ret
+.cfi_endproc
 .size asm_AES_set_encrypt_key,.-asm_AES_set_encrypt_key

 .type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
@@ -1549,13 +1604,21 @@ $code.=<<___;
 .globl asm_AES_set_decrypt_key
 .type  asm_AES_set_decrypt_key,\@function,3
 asm_AES_set_decrypt_key:
+.cfi_startproc
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	push	%rdx			# save key schedule
+.cfi_adjust_cfa_offset	8
 .Ldec_key_prologue:

 	call	_x86_64_AES_set_encrypt_key
@@ -1609,14 +1672,22 @@ $code.=<<___;
 	xor	%rax,%rax
 .Labort:
 	mov	8(%rsp),%r15
+.cfi_restore	%r15
 	mov	16(%rsp),%r14
+.cfi_restore	%r14
 	mov	24(%rsp),%r13
+.cfi_restore	%r13
 	mov	32(%rsp),%r12
+.cfi_restore	%r12
 	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
 	mov	48(%rsp),%rbx
+.cfi_restore	%rbx
 	add	\$56,%rsp
+.cfi_adjust_cfa_offset	-56
 .Ldec_key_epilogue:
 	ret
+.cfi_endproc
 .size	asm_AES_set_decrypt_key,.-asm_AES_set_decrypt_key
 ___

@@ -1645,15 +1716,23 @@ $code.=<<___;
 .extern	OPENSSL_ia32cap_P
 .hidden	asm_AES_cbc_encrypt
 asm_AES_cbc_encrypt:
+.cfi_startproc
 	cmp	\$0,%rdx	# check length
 	je	.Lcbc_epilogue
 	pushfq
+.cfi_push	49		# %rflags
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 .Lcbc_prologue:

 	cld
@@ -1699,8 +1778,10 @@ asm_AES_cbc_encrypt:
 .Lcbc_te_ok:

 	xchg	%rsp,$key
+.cfi_def_cfa_register	$key
 	#add	\$8,%rsp	# reserve for return address!
 	mov	$key,$_rsp	# save %rsp
+.cfi_cfa_expression	$_rsp,deref,+64
 .Lcbc_fast_body:
 	mov	%rdi,$_inp	# save copy of inp
 	mov	%rsi,$_out	# save copy of out
@@ -1930,7 +2011,7 @@ asm_AES_cbc_encrypt:
 	lea	($key,%rax),%rax
 	mov	%rax,$keyend

-	# pick Te4 copy which can't "overlap" with stack frame or key scdedule
+	# pick Te4 copy which can't "overlap" with stack frame or key schedule
 	lea	2048($sbox),$sbox
 	lea	768-8(%rsp),%rax
 	sub	$sbox,%rax
@@ -2082,17 +2163,27 @@ asm_AES_cbc_encrypt:
 .align	16
 .Lcbc_exit:
 	mov	$_rsp,%rsi
+.cfi_def_cfa	%rsi,64
 	mov	(%rsi),%r15
+.cfi_restore	%r15
 	mov	8(%rsi),%r14
+.cfi_restore	%r14
 	mov	16(%rsi),%r13
+.cfi_restore	%r13
 	mov	24(%rsi),%r12
+.cfi_restore	%r12
 	mov	32(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	40(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	48(%rsi),%rsp
+.cfi_def_cfa	%rsp,16
 .Lcbc_popfq:
 	popfq
+.cfi_pop	49		# %rflags
 .Lcbc_epilogue:
 	ret
+.cfi_endproc
 .size	asm_AES_cbc_encrypt,.-asm_AES_cbc_encrypt
 ___
 }
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -43,18 +50,22 @@
 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]
+
 ######################################################################
 # Current large-block performance in cycles per byte processed with
 # 128-bit key (less is better).
 #
-#		CBC en-/decrypt	CTR	XTS	ECB
+#		CBC en-/decrypt	CTR	XTS	ECB	OCB
 # Westmere	3.77/1.37	1.37	1.52	1.27
-# * Bridge	5.07/0.98	0.99	1.09	0.91
-# Haswell	4.44/0.80	0.97	1.03	0.72
-# Skylake	2.68/0.65	0.65	0.66	0.64
-# Silvermont	5.77/3.56	3.67	4.03	3.46
-# Goldmont	3.84/1.39	1.39	1.63	1.31
-# Bulldozer	5.80/0.98	1.05	1.24	0.93
+# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
+# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
+# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
+# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
+# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
+# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23

 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
@@ -228,7 +239,7 @@ sub aesni_generate1	# fully unrolled loop
 # can schedule aes[enc|dec] every cycle optimal interleave factor
 # equals to corresponding instructions latency. 8x is optimal for
 # * Bridge, but it's unfeasible to accommodate such implementation
-# in XMM registers addreassable in 32-bit mode and therefore maximum
+# in XMM registers addressable in 32-bit mode and therefore maximum
 # of 6x is used instead...

 sub aesni_generate2
@@ -2425,7 +2436,7 @@ if ($PREFIX eq "aesni") {
 	&pxor		("xmm3","xmm3");
 	&aesenclast	("xmm2","xmm3");

-	&movdqa		("xmm3","xmm1")
+	&movdqa		("xmm3","xmm1");
 	&pslldq		("xmm1",4);
 	&pxor		("xmm3","xmm1");
 	&pslldq		("xmm1",4);
@@ -60,7 +60,7 @@
 # identical to CBC, because CBC-MAC is essentially CBC encrypt without
 # saving output. CCM CTR "stays invisible," because it's neatly
 # interleaved with CBC-MAC. This provides ~30% improvement over
-# "straghtforward" CCM implementation with CTR and CBC-MAC performed
+# "straightforward" CCM implementation with CTR and CBC-MAC performed
 # disjointly. Parallelizable modes practically achieve the theoretical
 # limit.
 #
@@ -143,14 +143,14 @@
 # asymptotic, if it can be surpassed, isn't it? What happens there?
 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
 # magic is responsible for this. Processor overlaps not only the
-# additional instructions with AES ones, but even AES instuctions
+# additional instructions with AES ones, but even AES instructions
 # processing adjacent triplets of independent blocks. In the 6x case
 # additional instructions  still claim disproportionally small amount
 # of additional cycles, but in 8x case number of instructions must be
 # a tad too high for out-of-order logic to cope with, and AES unit
 # remains underutilized... As you can see 8x interleave is hardly
 # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
-# utilizies 6x interleave because of limited register bank capacity.
+# utilizes 6x interleave because of limited register bank capacity.
 #
 # Higher interleave factors do have negative impact on Westmere
 # performance. While for ECB mode it's negligible ~1.5%, other
@@ -1182,6 +1182,7 @@ $code.=<<___;
 .type	aesni_ctr32_encrypt_blocks,\@function,5
 .align	16
 aesni_ctr32_encrypt_blocks:
+.cfi_startproc
 	cmp	\$1,$len
 	jne	.Lctr32_bulk

@@ -1204,7 +1205,9 @@ $code.=<<___;
 .align	16
 .Lctr32_bulk:
 	lea	(%rsp),$key_			# use $key_ as frame pointer
+.cfi_def_cfa_register	$key_
 	push	%rbp
+.cfi_push	%rbp
 	sub	\$$frame_size,%rsp
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
@@ -1548,7 +1551,7 @@ $code.=<<___;
 	sub	\$8,$len
 	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow

-	add	\$8,$len			# restore real remainig $len
+	add	\$8,$len			# restore real remaining $len
 	jz	.Lctr32_done			# done if ($len==0)
 	lea	-0x80($key),$key

@@ -1665,7 +1668,7 @@ $code.=<<___;
 	movups	$inout2,0x20($out)		# $len was 3, stop store

 .Lctr32_done:
-	xorps	%xmm0,%xmm0			# clear regiser bank
+	xorps	%xmm0,%xmm0			# clear register bank
 	xor	$key0,$key0
 	pxor	%xmm1,%xmm1
 	pxor	%xmm2,%xmm2
@@ -1725,9 +1728,12 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-8($key_),%rbp
+.cfi_restore	%rbp
 	lea	($key_),%rsp
+.cfi_def_cfa_register	%rsp
 .Lctr32_epilogue:
 	ret
+.cfi_endproc
 .size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
 ___
 }
@@ -1749,8 +1755,11 @@ $code.=<<___;
 .type	aesni_xts_encrypt,\@function,6
 .align	16
 aesni_xts_encrypt:
+.cfi_startproc
 	lea	(%rsp),%r11			# frame pointer
+.cfi_def_cfa_register	%r11
 	push	%rbp
+.cfi_push	%rbp
 	sub	\$$frame_size,%rsp
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
@@ -1848,7 +1857,7 @@ $code.=<<___;
 	lea	`16*6`($inp),$inp
 	pxor	$twmask,$inout5

-	 pxor	$twres,@tweak[0]		# calclulate tweaks^round[last]
+	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
 	aesenc		$rndkey1,$inout4
 	 pxor	$twres,@tweak[1]
 	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
@@ -2215,9 +2224,12 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-8(%r11),%rbp
+.cfi_restore	%rbp
 	lea	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
 .Lxts_enc_epilogue:
 	ret
+.cfi_endproc
 .size	aesni_xts_encrypt,.-aesni_xts_encrypt
 ___

@@ -2226,8 +2238,11 @@ $code.=<<___;
 .type	aesni_xts_decrypt,\@function,6
 .align	16
 aesni_xts_decrypt:
+.cfi_startproc
 	lea	(%rsp),%r11			# frame pointer
+.cfi_def_cfa_register	%r11
 	push	%rbp
+.cfi_push	%rbp
 	sub	\$$frame_size,%rsp
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
@@ -2328,7 +2343,7 @@ $code.=<<___;
 	lea	`16*6`($inp),$inp
 	pxor	$twmask,$inout5

-	 pxor	$twres,@tweak[0]		# calclulate tweaks^round[last]
+	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
 	aesdec		$rndkey1,$inout4
 	 pxor	$twres,@tweak[1]
 	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
@@ -2718,9 +2733,12 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-8(%r11),%rbp
+.cfi_restore	%rbp
 	lea	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
 .Lxts_dec_epilogue:
 	ret
+.cfi_endproc
 .size	aesni_xts_decrypt,.-aesni_xts_decrypt
 ___
 }
@@ -2745,12 +2763,18 @@ $code.=<<___;
 .type	aesni_ocb_encrypt,\@function,6
 .align	32
 aesni_ocb_encrypt:
+.cfi_startproc
 	lea	(%rsp),%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 ___
 $code.=<<___ if ($win64);
 	lea	-0xa0(%rsp),%rsp
@@ -2945,6 +2969,7 @@ $code.=<<___ if (!$win64);
 	pxor	%xmm14,%xmm14
 	pxor	%xmm15,%xmm15
 	lea	0x28(%rsp),%rax
+.cfi_def_cfa	%rax,8
 ___
 $code.=<<___ if ($win64);
 	movaps	0x00(%rsp),%xmm6
@@ -2972,13 +2997,20 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-40(%rax),%r14
+.cfi_restore	%r14
 	mov	-32(%rax),%r13
+.cfi_restore	%r13
 	mov	-24(%rax),%r12
+.cfi_restore	%r12
 	mov	-16(%rax),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rax),%rbx
+.cfi_restore	%rbx
 	lea	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
 .Locb_enc_epilogue:
 	ret
+.cfi_endproc
 .size	aesni_ocb_encrypt,.-aesni_ocb_encrypt

 .type	__ocb_encrypt6,\@abi-omnipotent
@@ -3191,12 +3223,18 @@ __ocb_encrypt1:
 .type	aesni_ocb_decrypt,\@function,6
 .align	32
 aesni_ocb_decrypt:
+.cfi_startproc
 	lea	(%rsp),%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 ___
 $code.=<<___ if ($win64);
 	lea	-0xa0(%rsp),%rsp
@@ -3413,6 +3451,7 @@ $code.=<<___ if (!$win64);
 	pxor	%xmm14,%xmm14
 	pxor	%xmm15,%xmm15
 	lea	0x28(%rsp),%rax
+.cfi_def_cfa	%rax,8
 ___
 $code.=<<___ if ($win64);
 	movaps	0x00(%rsp),%xmm6
@@ -3440,13 +3479,20 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-40(%rax),%r14
+.cfi_restore	%r14
 	mov	-32(%rax),%r13
+.cfi_restore	%r13
 	mov	-24(%rax),%r12
+.cfi_restore	%r12
 	mov	-16(%rax),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rax),%rbx
+.cfi_restore	%rbx
 	lea	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
 .Locb_dec_epilogue:
 	ret
+.cfi_endproc
 .size	aesni_ocb_decrypt,.-aesni_ocb_decrypt

 .type	__ocb_decrypt6,\@abi-omnipotent
@@ -3659,6 +3705,7 @@ $code.=<<___;
 .type	${PREFIX}_cbc_encrypt,\@function,6
 .align	16
 ${PREFIX}_cbc_encrypt:
+.cfi_startproc
 	test	$len,$len		# check length
 	jz	.Lcbc_ret

@@ -3735,7 +3782,9 @@ $code.=<<___;
 .align	16
 .Lcbc_decrypt_bulk:
 	lea	(%rsp),%r11		# frame pointer
+.cfi_def_cfa_register	%r11
 	push	%rbp
+.cfi_push	%rbp
 	sub	\$$frame_size,%rsp
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
@@ -4179,9 +4228,12 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-8(%r11),%rbp
+.cfi_restore	%rbp
 	lea	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
 .Lcbc_ret:
 	ret
+.cfi_endproc
 .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
 ___
 } 
@@ -4202,7 +4254,9 @@ $code.=<<___;
 .type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
 .align	16
 ${PREFIX}_set_decrypt_key:
+.cfi_startproc
 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
+.cfi_adjust_cfa_offset	8
 	call	__aesni_set_encrypt_key
 	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
 	test	%eax,%eax
@@ -4235,15 +4289,16 @@ ${PREFIX}_set_decrypt_key:
 	pxor	%xmm0,%xmm0
 .Ldec_key_ret:
 	add	\$8,%rsp
+.cfi_adjust_cfa_offset	-8
 	ret
+.cfi_endproc
 .LSEH_end_set_decrypt_key:
 .size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
 ___

-# This is based on submission by
-#
-#	Huang Ying <ying.huang@intel.com>
-#	Vinodh Gopal <vinodh.gopal@intel.com>
+# This is based on submission from Intel by
+#	Huang Ying
+#	Vinodh Gopal
 #	Kahraman Akdemir
 #
 # Aggressively optimized in respect to aeskeygenassist's critical path
@@ -4271,7 +4326,9 @@ $code.=<<___;
 .align	16
 ${PREFIX}_set_encrypt_key:
 __aesni_set_encrypt_key:
+.cfi_startproc
 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
+.cfi_adjust_cfa_offset	8
 	mov	\$-1,%rax
 	test	$inp,$inp
 	jz	.Lenc_key_ret
@@ -4461,7 +4518,7 @@ __aesni_set_encrypt_key:

 .align	16
 .L14rounds:
-	movups	16($inp),%xmm2			# remaning half of *userKey
+	movups	16($inp),%xmm2			# remaining half of *userKey
 	mov	\$13,$bits			# 14 rounds for 256
 	lea	16(%rax),%rax
 	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
@@ -4565,7 +4622,9 @@ __aesni_set_encrypt_key:
 	pxor	%xmm4,%xmm4
 	pxor	%xmm5,%xmm5
 	add	\$8,%rsp
+.cfi_adjust_cfa_offset	-8
 	ret
+.cfi_endproc
 .LSEH_end_set_encrypt_key:

 .align	16
@@ -58,11 +58,7 @@ $code=<<___;
 #if __ARM_MAX_ARCH__>=7
 .text
 ___
-$code.=<<___ if ($flavour =~ /64/);
-#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
-.arch  armv8-a+crypto
-#endif
-___
+$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
 $code.=<<___						if ($flavour !~ /64/);
 .arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
 .fpu	neon
@@ -933,7 +929,7 @@ if ($flavour =~ /64/) {			######## 64-bit code
 	s/^(\s+)v/$1/o		or	# strip off v prefix
 	s/\bbx\s+lr\b/ret/o;

-	# fix up remainig legacy suffixes
+	# fix up remaining legacy suffixes
 	s/\.[ui]?8//o;
 	m/\],#8/o and s/\.16b/\.8b/go;
 	s/\.[ui]?32//o and s/\.16b/\.4s/go;
@@ -992,7 +988,7 @@ if ($flavour =~ /64/) {			######## 64-bit code
 	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
 	s/\/\/\s?/@ /o;				# new->old style commentary

-	# fix up remainig new-style suffixes
+	# fix up remaining new-style suffixes
 	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
 	s/\],#[0-9]+/]!/o;

@@ -14,8 +14,7 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 #
 # Specific modes and adaptation for Linux kernel by Ard Biesheuvel
-# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
-# granted.
+# of Linaro. Permission to use under GPL terms is granted.
 # ====================================================================

 # Bit-sliced AES for ARM NEON
@@ -49,10 +48,7 @@
 #						<appro@openssl.org>

 # April-August 2013
-#
-# Add CBC, CTR and XTS subroutines, adapt for kernel use.
-#
-#					<ard.biesheuvel@linaro.org>
+# Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.

 $flavour = shift;
 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
@@ -746,7 +742,7 @@ $code.=<<___;
 _bsaes_decrypt8:
 	adr	$const,.
 	vldmia	$key!, {@XMM[9]}		@ round 0 key
-#ifdef	__APPLE__
+#if defined(__thumb2__) || defined(__APPLE__)
 	adr	$const,.LM0ISR
 #else
 	add	$const,$const,#.LM0ISR-_bsaes_decrypt8
@@ -845,7 +841,7 @@ _bsaes_const:
 _bsaes_encrypt8:
 	adr	$const,.
 	vldmia	$key!, {@XMM[9]}		@ round 0 key
-#ifdef	__APPLE__
+#if defined(__thumb2__) || defined(__APPLE__)
 	adr	$const,.LM0SR
 #else
 	sub	$const,$const,#_bsaes_encrypt8-.LM0SR
@@ -953,7 +949,7 @@ $code.=<<___;
 _bsaes_key_convert:
 	adr	$const,.
 	vld1.8	{@XMM[7]},  [$inp]!		@ load round 0 key
-#ifdef	__APPLE__
+#if defined(__thumb2__) || defined(__APPLE__)
 	adr	$const,.LM0
 #else
 	sub	$const,$const,#_bsaes_key_convert-.LM0
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 ###################################################################
 ### AES-128 [originally in CTR mode]				###
@@ -1158,15 +1165,23 @@ $code.=<<___;
 .type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
 .align	16
 bsaes_ecb_encrypt_blocks:
+.cfi_startproc
 	mov	%rsp, %rax
 .Lecb_enc_prologue:
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	lea	-0x48(%rsp),%rsp
+.cfi_adjust_cfa_offset	0x48
 ___
 $code.=<<___ if ($win64);
 	lea	-0xa0(%rsp), %rsp
@@ -1184,6 +1199,7 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	%rsp,%rbp		# backup %rsp
+.cfi_def_cfa_register	%rbp
 	mov	240($arg4),%eax		# rounds
 	mov	$arg1,$inp		# backup arguments
 	mov	$arg2,$out
@@ -1328,6 +1344,7 @@ $code.=<<___;
 	jb	.Lecb_enc_bzero

 	lea	0x78(%rbp),%rax
+.cfi_def_cfa	%rax,8
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -1345,29 +1362,45 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-48(%rax), %r15
+.cfi_restore	%r15
 	mov	-40(%rax), %r14
+.cfi_restore	%r14
 	mov	-32(%rax), %r13
+.cfi_restore	%r13
 	mov	-24(%rax), %r12
+.cfi_restore	%r12
 	mov	-16(%rax), %rbx
+.cfi_restore	%rbx
 	mov	-8(%rax), %rbp
+.cfi_restore	%rbp
 	lea	(%rax), %rsp		# restore %rsp
+.cfi_def_cfa_register	%rsp
 .Lecb_enc_epilogue:
 	ret
+.cfi_endproc
 .size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

 .globl	bsaes_ecb_decrypt_blocks
 .type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
 .align	16
 bsaes_ecb_decrypt_blocks:
+.cfi_startproc
 	mov	%rsp, %rax
 .Lecb_dec_prologue:
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	lea	-0x48(%rsp),%rsp
+.cfi_adjust_cfa_offset	0x48
 ___
 $code.=<<___ if ($win64);
 	lea	-0xa0(%rsp), %rsp
@@ -1385,6 +1418,7 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	%rsp,%rbp		# backup %rsp
+.cfi_def_cfa_register	%rbp
 	mov	240($arg4),%eax		# rounds
 	mov	$arg1,$inp		# backup arguments
 	mov	$arg2,$out
@@ -1530,6 +1564,7 @@ $code.=<<___;
 	jb	.Lecb_dec_bzero

 	lea	0x78(%rbp),%rax
+.cfi_def_cfa	%rax,8
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -1547,14 +1582,22 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-48(%rax), %r15
+.cfi_restore	%r15
 	mov	-40(%rax), %r14
+.cfi_restore	%r14
 	mov	-32(%rax), %r13
+.cfi_restore	%r13
 	mov	-24(%rax), %r12
+.cfi_restore	%r12
 	mov	-16(%rax), %rbx
+.cfi_restore	%rbx
 	mov	-8(%rax), %rbp
+.cfi_restore	%rbp
 	lea	(%rax), %rsp		# restore %rsp
+.cfi_def_cfa_register	%rsp
 .Lecb_dec_epilogue:
 	ret
+.cfi_endproc
 .size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
 ___
 }
@@ -1564,6 +1607,7 @@ $code.=<<___;
 .type	bsaes_cbc_encrypt,\@abi-omnipotent
 .align	16
 bsaes_cbc_encrypt:
+.cfi_startproc
 ___
 $code.=<<___ if ($win64);
 	mov	48(%rsp),$arg6		# pull direction flag
@@ -1577,12 +1621,19 @@ $code.=<<___;
 	mov	%rsp, %rax
 .Lcbc_dec_prologue:
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	lea	-0x48(%rsp), %rsp
+.cfi_adjust_cfa_offset	0x48
 ___
 $code.=<<___ if ($win64);
 	mov	0xa0(%rsp),$arg5	# pull ivp
@@ -1601,6 +1652,7 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	%rsp, %rbp		# backup %rsp
+.cfi_def_cfa_register	%rbp
 	mov	240($arg4), %eax	# rounds
 	mov	$arg1, $inp		# backup arguments
 	mov	$arg2, $out
@@ -1820,6 +1872,7 @@ $code.=<<___;
 	ja	.Lcbc_dec_bzero

 	lea	0x78(%rbp),%rax
+.cfi_def_cfa	%rax,8
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -1837,29 +1890,45 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-48(%rax), %r15
+.cfi_restore	%r15
 	mov	-40(%rax), %r14
+.cfi_restore	%r14
 	mov	-32(%rax), %r13
+.cfi_restore	%r13
 	mov	-24(%rax), %r12
+.cfi_restore	%r12
 	mov	-16(%rax), %rbx
+.cfi_restore	%rbx
 	mov	-8(%rax), %rbp
+.cfi_restore	%rbp
 	lea	(%rax), %rsp		# restore %rsp
+.cfi_def_cfa_register	%rsp
 .Lcbc_dec_epilogue:
 	ret
+.cfi_endproc
 .size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

 .globl	bsaes_ctr32_encrypt_blocks
 .type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
 .align	16
 bsaes_ctr32_encrypt_blocks:
+.cfi_startproc
 	mov	%rsp, %rax
 .Lctr_enc_prologue:
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	lea	-0x48(%rsp), %rsp
+.cfi_adjust_cfa_offset	0x48
 ___
 $code.=<<___ if ($win64);
 	mov	0xa0(%rsp),$arg5	# pull ivp
@@ -1878,6 +1947,7 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	%rsp, %rbp		# backup %rsp
+.cfi_def_cfa_register	%rbp
 	movdqu	($arg5), %xmm0		# load counter
 	mov	240($arg4), %eax	# rounds
 	mov	$arg1, $inp		# backup arguments
@@ -2052,6 +2122,7 @@ $code.=<<___;
 	ja	.Lctr_enc_bzero

 	lea	0x78(%rbp),%rax
+.cfi_def_cfa	%rax,8
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -2069,14 +2140,22 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-48(%rax), %r15
+.cfi_restore	%r15
 	mov	-40(%rax), %r14
+.cfi_restore	%r14
 	mov	-32(%rax), %r13
+.cfi_restore	%r13
 	mov	-24(%rax), %r12
+.cfi_restore	%r12
 	mov	-16(%rax), %rbx
+.cfi_restore	%rbx
 	mov	-8(%rax), %rbp
+.cfi_restore	%rbp
 	lea	(%rax), %rsp		# restore %rsp
+.cfi_def_cfa_register	%rsp
 .Lctr_enc_epilogue:
 	ret
+.cfi_endproc
 .size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
 ___
 ######################################################################
@@ -2092,15 +2171,23 @@ $code.=<<___;
 .type	bsaes_xts_encrypt,\@abi-omnipotent
 .align	16
 bsaes_xts_encrypt:
+.cfi_startproc
 	mov	%rsp, %rax
 .Lxts_enc_prologue:
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	lea	-0x48(%rsp), %rsp
+.cfi_adjust_cfa_offset	0x48
 ___
 $code.=<<___ if ($win64);
 	mov	0xa0(%rsp),$arg5	# pull key2
@@ -2120,6 +2207,7 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	%rsp, %rbp		# backup %rsp
+.cfi_def_cfa_register	%rbp
 	mov	$arg1, $inp		# backup arguments
 	mov	$arg2, $out
 	mov	$arg3, $len
@@ -2442,6 +2530,7 @@ $code.=<<___;
 	ja	.Lxts_enc_bzero

 	lea	0x78(%rbp),%rax
+.cfi_def_cfa	%rax,8
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -2459,29 +2548,45 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-48(%rax), %r15
+.cfi_restore	%r15
 	mov	-40(%rax), %r14
+.cfi_restore	%r14
 	mov	-32(%rax), %r13
+.cfi_restore	%r13
 	mov	-24(%rax), %r12
+.cfi_restore	%r12
 	mov	-16(%rax), %rbx
+.cfi_restore	%rbx
 	mov	-8(%rax), %rbp
+.cfi_restore	%rbp
 	lea	(%rax), %rsp		# restore %rsp
+.cfi_def_cfa_register	%rsp
 .Lxts_enc_epilogue:
 	ret
+.cfi_endproc
 .size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

 .globl	bsaes_xts_decrypt
 .type	bsaes_xts_decrypt,\@abi-omnipotent
 .align	16
 bsaes_xts_decrypt:
+.cfi_startproc
 	mov	%rsp, %rax
 .Lxts_dec_prologue:
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	lea	-0x48(%rsp), %rsp
+.cfi_adjust_cfa_offset	0x48
 ___
 $code.=<<___ if ($win64);
 	mov	0xa0(%rsp),$arg5	# pull key2
@@ -2849,6 +2954,7 @@ $code.=<<___;
 	ja	.Lxts_dec_bzero

 	lea	0x78(%rbp),%rax
+.cfi_def_cfa	%rax,8
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -2866,14 +2972,22 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	mov	-48(%rax), %r15
+.cfi_restore	%r15
 	mov	-40(%rax), %r14
+.cfi_restore	%r14
 	mov	-32(%rax), %r13
+.cfi_restore	%r13
 	mov	-24(%rax), %r12
+.cfi_restore	%r12
 	mov	-16(%rax), %rbx
+.cfi_restore	%rbx
 	mov	-8(%rax), %rbp
+.cfi_restore	%rbp
 	lea	(%rax), %rsp		# restore %rsp
+.cfi_def_cfa_register	%rsp
 .Lxts_dec_epilogue:
 	ret
+.cfi_endproc
 .size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
 ___
 }
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -16,21 +16,15 @@
 #define _GNU_SOURCE  // needed for syscall() on Linux.
 #endif

-#include <openssl/aead.h>
-#include <openssl/aes.h>
-#include <openssl/base.h>
-#include <openssl/bn.h>
 #include <openssl/crypto.h>
-#include <openssl/des.h>
-#include <openssl/ecdsa.h>
-#include <openssl/ec_key.h>
+
+#include <stdlib.h>
+
+#include <openssl/digest.h>
 #include <openssl/hmac.h>
-#include <openssl/nid.h>
-#include <openssl/rsa.h>
 #include <openssl/sha.h>

 #include "../internal.h"
-#include "rand/internal.h"

 #include "aes/aes.c"
 #include "aes/key_wrap.c"
@@ -67,15 +61,16 @@
 #include "ec/ec_montgomery.c"
 #include "ec/oct.c"
 #include "ec/p224-64.c"
-#include "ec/p256-64.c"
+#include "../../third_party/fiat/p256.c"
 #include "ec/p256-x86_64.c"
 #include "ec/simple.c"
-#include "ec/util-64.c"
+#include "ec/util.c"
 #include "ec/wnaf.c"
 #include "hmac/hmac.c"
 #include "md4/md4.c"
 #include "md5/md5.c"
 #include "modes/cbc.c"
+#include "modes/ccm.c"
 #include "modes/cfb.c"
 #include "modes/ctr.c"
 #include "modes/gcm.c"
@@ -88,205 +83,16 @@
 #include "rsa/padding.c"
 #include "rsa/rsa.c"
 #include "rsa/rsa_impl.c"
+#include "self_check/self_check.c"
 #include "sha/sha1-altivec.c"
 #include "sha/sha1.c"
 #include "sha/sha256.c"
 #include "sha/sha512.c"
+#include "tls/kdf.c"


 #if defined(BORINGSSL_FIPS)

-static void hexdump(const uint8_t *in, size_t len) {
-  for (size_t i = 0; i < len; i++) {
-    printf("%02x", in[i]);
-  }
-}
-
-static int check_test(const void *expected, const void *actual,
-                      size_t expected_len, const char *name) {
-  if (OPENSSL_memcmp(actual, expected, expected_len) != 0) {
-    printf("%s failed.\nExpected: ", name);
-    hexdump(expected, expected_len);
-    printf("\nCalculated: ");
-    hexdump(actual, expected_len);
-    printf("\n");
-    return 0;
-  }
-  return 1;
-}
-
-static int set_bignum(BIGNUM **out, const uint8_t *in, size_t len) {
-  *out = BN_bin2bn(in, len, NULL);
-  return *out != NULL;
-}
-
-static RSA *self_test_rsa_key(void) {
-  static const uint8_t kN[] = {
-      0xd3, 0x3a, 0x62, 0x9f, 0x07, 0x77, 0xb0, 0x18, 0xf3, 0xff, 0xfe, 0xcc,
-      0xc9, 0xa2, 0xc2, 0x3a, 0xa6, 0x1d, 0xd8, 0xf0, 0x26, 0x5b, 0x38, 0x90,
-      0x17, 0x48, 0x15, 0xce, 0x21, 0xcd, 0xd6, 0x62, 0x99, 0xe2, 0xd7, 0xda,
-      0x40, 0x80, 0x3c, 0xad, 0x18, 0xb7, 0x26, 0xe9, 0x30, 0x8a, 0x23, 0x3f,
-      0x68, 0x9a, 0x9c, 0x31, 0x34, 0x91, 0x99, 0x06, 0x11, 0x36, 0xb2, 0x9e,
-      0x3a, 0xd0, 0xbc, 0xb9, 0x93, 0x4e, 0xb8, 0x72, 0xa1, 0x9f, 0xb6, 0x8c,
-      0xd5, 0x17, 0x1f, 0x7e, 0xaa, 0x75, 0xbb, 0xdf, 0xa1, 0x70, 0x48, 0xc4,
-      0xec, 0x9a, 0x51, 0xed, 0x41, 0xc9, 0x74, 0xc0, 0x3e, 0x1e, 0x85, 0x2f,
-      0xbe, 0x34, 0xc7, 0x65, 0x34, 0x8b, 0x4d, 0x55, 0x4b, 0xe1, 0x45, 0x54,
-      0x0d, 0x75, 0x7e, 0x89, 0x4d, 0x0c, 0xf6, 0x33, 0xe5, 0xfc, 0xfb, 0x56,
-      0x1b, 0xf2, 0x39, 0x9d, 0xe0, 0xff, 0x55, 0xcf, 0x02, 0x05, 0xb9, 0x74,
-      0xd2, 0x91, 0xfc, 0x87, 0xe1, 0xbb, 0x97, 0x2a, 0xe4, 0xdd, 0x20, 0xc0,
-      0x38, 0x47, 0xc0, 0x76, 0x3f, 0xa1, 0x9b, 0x5c, 0x20, 0xff, 0xff, 0xc7,
-      0x49, 0x3b, 0x4c, 0xaf, 0x99, 0xa6, 0x3e, 0x82, 0x5c, 0x58, 0x27, 0xce,
-      0x01, 0x03, 0xc3, 0x16, 0x35, 0x20, 0xe9, 0xf0, 0x15, 0x7a, 0x41, 0xd5,
-      0x1f, 0x52, 0xea, 0xdf, 0xad, 0x4c, 0xbb, 0x0d, 0xcb, 0x04, 0x91, 0xb0,
-      0x95, 0xa8, 0xce, 0x25, 0xfd, 0xd2, 0x62, 0x47, 0x77, 0xee, 0x13, 0xf1,
-      0x48, 0x72, 0x9e, 0xd9, 0x2d, 0xe6, 0x5f, 0xa4, 0xc6, 0x9e, 0x5a, 0xb2,
-      0xc6, 0xa2, 0xf7, 0x0a, 0x16, 0x17, 0xae, 0x6b, 0x1c, 0x30, 0x7c, 0x63,
-      0x08, 0x83, 0xe7, 0x43, 0xec, 0x54, 0x5e, 0x2c, 0x08, 0x0b, 0x5e, 0x46,
-      0xa7, 0x10, 0x93, 0x43, 0x53, 0x4e, 0xe3, 0x16, 0x73, 0x55, 0xce, 0xf2,
-      0x94, 0xc0, 0xbe, 0xb3,
-  };
-  static const uint8_t kE[] = {0x01, 0x00, 0x01};  // 65537
-  static const uint8_t kD[] = {
-      0x2f, 0x2c, 0x1e, 0xd2, 0x3d, 0x2c, 0xb1, 0x9b, 0x21, 0x02, 0xce, 0xb8,
-      0x95, 0x5f, 0x4f, 0xd9, 0x21, 0x38, 0x11, 0x36, 0xb0, 0x9a, 0x36, 0xab,
-      0x97, 0x47, 0x75, 0xf7, 0x2e, 0xfd, 0x75, 0x1f, 0x58, 0x16, 0x9c, 0xf6,
-      0x14, 0xe9, 0x8e, 0xa3, 0x69, 0x9d, 0x9d, 0x86, 0xfe, 0x5c, 0x1b, 0x3b,
-      0x11, 0xf5, 0x55, 0x64, 0x77, 0xc4, 0xfc, 0x53, 0xaa, 0x8c, 0x78, 0x9f,
-      0x75, 0xab, 0x20, 0x3a, 0xa1, 0x77, 0x37, 0x22, 0x02, 0x8e, 0x54, 0x8a,
-      0x67, 0x1c, 0x5e, 0xe0, 0x3e, 0xd9, 0x44, 0x37, 0xd1, 0x29, 0xee, 0x56,
-      0x6c, 0x30, 0x9a, 0x93, 0x4d, 0xd9, 0xdb, 0xc5, 0x03, 0x1a, 0x75, 0xcc,
-      0x0f, 0xc2, 0x61, 0xb5, 0x6c, 0x62, 0x9f, 0xc6, 0xa8, 0xc7, 0x8a, 0x60,
-      0x17, 0x11, 0x62, 0x4c, 0xef, 0x74, 0x31, 0x97, 0xad, 0x89, 0x2d, 0xe8,
-      0x31, 0x1d, 0x8b, 0x58, 0x82, 0xe3, 0x03, 0x1a, 0x6b, 0xdf, 0x3f, 0x3e,
-      0xa4, 0x27, 0x19, 0xef, 0x46, 0x7a, 0x90, 0xdf, 0xa7, 0xe7, 0xc9, 0x66,
-      0xab, 0x41, 0x1d, 0x65, 0x78, 0x1c, 0x18, 0x40, 0x5c, 0xd6, 0x87, 0xb5,
-      0xea, 0x29, 0x44, 0xb3, 0xf5, 0xb3, 0xd2, 0x4f, 0xce, 0x88, 0x78, 0x49,
-      0x27, 0x4e, 0x0b, 0x30, 0x85, 0xfb, 0x73, 0xfd, 0x8b, 0x32, 0x15, 0xee,
-      0x1f, 0xc9, 0x0e, 0x89, 0xb9, 0x43, 0x2f, 0xe9, 0x60, 0x8d, 0xda, 0xae,
-      0x2b, 0x30, 0x99, 0xee, 0x88, 0x81, 0x20, 0x7b, 0x4a, 0xc3, 0x18, 0xf2,
-      0x94, 0x02, 0x79, 0x94, 0xaa, 0x65, 0xd9, 0x1b, 0x45, 0x2a, 0xac, 0x6e,
-      0x30, 0x48, 0x57, 0xea, 0xbe, 0x79, 0x7d, 0xfc, 0x67, 0xaa, 0x47, 0xc0,
-      0xf7, 0x52, 0xfd, 0x0b, 0x63, 0x4e, 0x3d, 0x2e, 0xcc, 0x36, 0xa0, 0xdb,
-      0x92, 0x0b, 0xa9, 0x1b, 0xeb, 0xc2, 0xd5, 0x08, 0xd3, 0x85, 0x87, 0xf8,
-      0x5d, 0x1a, 0xf6, 0xc1,
-  };
-  static const uint8_t kP[] = {
-      0xf7, 0x06, 0xa3, 0x98, 0x8a, 0x52, 0xf8, 0x63, 0x68, 0x27, 0x4f, 0x68,
-      0x7f, 0x34, 0xec, 0x8e, 0x5d, 0xf8, 0x30, 0x92, 0xb3, 0x62, 0x4c, 0xeb,
-      0xdb, 0x19, 0x6b, 0x09, 0xc5, 0xa3, 0xf0, 0xbb, 0xff, 0x0f, 0xc2, 0xd4,
-      0x9b, 0xc9, 0x54, 0x4f, 0xb9, 0xf9, 0xe1, 0x4c, 0xf0, 0xe3, 0x4c, 0x90,
-      0xda, 0x7a, 0x01, 0xc2, 0x9f, 0xc4, 0xc8, 0x8e, 0xb1, 0x1e, 0x93, 0x75,
-      0x75, 0xc6, 0x13, 0x25, 0xc3, 0xee, 0x3b, 0xcc, 0xb8, 0x72, 0x6c, 0x49,
-      0xb0, 0x09, 0xfb, 0xab, 0x44, 0xeb, 0x4d, 0x40, 0xf0, 0x61, 0x6b, 0xe5,
-      0xe6, 0xfe, 0x3e, 0x0a, 0x77, 0x26, 0x39, 0x76, 0x3d, 0x4c, 0x3e, 0x9b,
-      0x5b, 0xc0, 0xaf, 0xa2, 0x58, 0x76, 0xb0, 0xe9, 0xda, 0x7f, 0x0e, 0x78,
-      0xc9, 0x76, 0x49, 0x5c, 0xfa, 0xb3, 0xb0, 0x15, 0x4b, 0x41, 0xc7, 0x27,
-      0xa4, 0x75, 0x28, 0x5c, 0x30, 0x69, 0x50, 0x29,
-  };
-  static const uint8_t kQ[] = {
-      0xda, 0xe6, 0xd2, 0xbb, 0x44, 0xff, 0x4f, 0xdf, 0x57, 0xc1, 0x11, 0xa3,
-      0x51, 0xba, 0x17, 0x89, 0x4c, 0x01, 0xc0, 0x0c, 0x97, 0x34, 0x50, 0xcf,
-      0x32, 0x1e, 0xc0, 0xbd, 0x7b, 0x35, 0xb5, 0x6a, 0x26, 0xcc, 0xea, 0x4c,
-      0x8e, 0x87, 0x4a, 0x67, 0x8b, 0xd3, 0xe5, 0x4f, 0x3a, 0x60, 0x48, 0x59,
-      0x04, 0x93, 0x39, 0xd7, 0x7c, 0xfb, 0x19, 0x1a, 0x34, 0xd5, 0xe8, 0xaf,
-      0xe7, 0x22, 0x2c, 0x0d, 0xc2, 0x91, 0x69, 0xb6, 0xe9, 0x2a, 0xe9, 0x1c,
-      0x4c, 0x6e, 0x8f, 0x40, 0xf5, 0xa8, 0x3e, 0x82, 0x69, 0x69, 0xbe, 0x9f,
-      0x7d, 0x5c, 0x7f, 0x92, 0x78, 0x17, 0xa3, 0x6d, 0x41, 0x2d, 0x72, 0xed,
-      0x3f, 0x71, 0xfa, 0x97, 0xb4, 0x63, 0xe4, 0x4f, 0xd9, 0x46, 0x03, 0xfb,
-      0x00, 0xeb, 0x30, 0x70, 0xb9, 0x51, 0xd9, 0x0a, 0xd2, 0xf8, 0x50, 0xd4,
-      0xfb, 0x43, 0x84, 0xf8, 0xac, 0x58, 0xc3, 0x7b,
-  };
-  static const uint8_t kDModPMinusOne[] = {
-      0xf5, 0x50, 0x8f, 0x88, 0x7d, 0xdd, 0xb5, 0xb4, 0x2a, 0x8b, 0xd7, 0x4d,
-      0x23, 0xfe, 0xaf, 0xe9, 0x16, 0x22, 0xd2, 0x41, 0xed, 0x88, 0xf2, 0x70,
-      0xcb, 0x4d, 0xeb, 0xc1, 0x71, 0x97, 0xc4, 0x0b, 0x3e, 0x5a, 0x2d, 0x96,
-      0xab, 0xfa, 0xfd, 0x12, 0x8b, 0xd3, 0x3e, 0x4e, 0x05, 0x6f, 0x04, 0xeb,
-      0x59, 0x3c, 0x0e, 0xa1, 0x73, 0xbe, 0x9d, 0x99, 0x2f, 0x05, 0xf9, 0x54,
-      0x8d, 0x98, 0x1e, 0x0d, 0xc4, 0x0c, 0xc3, 0x30, 0x23, 0xff, 0xe5, 0xd0,
-      0x2b, 0xd5, 0x4e, 0x2b, 0xa0, 0xae, 0xb8, 0x32, 0x84, 0x45, 0x8b, 0x3c,
-      0x6d, 0xf0, 0x10, 0x36, 0x9e, 0x6a, 0xc4, 0x67, 0xca, 0xa9, 0xfc, 0x06,
-      0x96, 0xd0, 0xbc, 0xda, 0xd1, 0x55, 0x55, 0x8d, 0x77, 0x21, 0xf4, 0x82,
-      0x39, 0x37, 0x91, 0xd5, 0x97, 0x56, 0x78, 0xc8, 0x3c, 0xcb, 0x5e, 0xf6,
-      0xdc, 0x58, 0x48, 0xb3, 0x7c, 0x94, 0x29, 0x39,
-  };
-  static const uint8_t kDModQMinusOne[] = {
-      0x64, 0x65, 0xbd, 0x7d, 0x1a, 0x96, 0x26, 0xa1, 0xfe, 0xf3, 0x94, 0x0d,
-      0x5d, 0xec, 0x85, 0xe2, 0xf8, 0xb3, 0x4c, 0xcb, 0xf9, 0x85, 0x8b, 0x12,
-      0x9c, 0xa0, 0x32, 0x32, 0x35, 0x92, 0x5a, 0x94, 0x47, 0x1b, 0x70, 0xd2,
-      0x90, 0x04, 0x49, 0x01, 0xd8, 0xc5, 0xe4, 0xc4, 0x43, 0xb7, 0xe9, 0x36,
-      0xba, 0xbc, 0x73, 0xa8, 0xfb, 0xaf, 0x86, 0xc1, 0xd8, 0x3d, 0xcb, 0xac,
-      0xf1, 0xcb, 0x60, 0x7d, 0x27, 0x21, 0xde, 0x64, 0x7f, 0xe8, 0xa8, 0x65,
-      0xcc, 0x40, 0x60, 0xff, 0xa0, 0x2b, 0xfc, 0x0f, 0x80, 0x1d, 0x79, 0xca,
-      0x58, 0x8a, 0xd6, 0x0f, 0xed, 0x78, 0x9a, 0x02, 0x00, 0x04, 0xc2, 0x53,
-      0x41, 0xe8, 0x1a, 0xd0, 0xfd, 0x71, 0x5b, 0x43, 0xac, 0x19, 0x4a, 0xb6,
-      0x12, 0xa3, 0xcb, 0xe1, 0xc7, 0x7d, 0x5c, 0x98, 0x74, 0x4e, 0x63, 0x74,
-      0x6b, 0x91, 0x7a, 0x29, 0x3b, 0x92, 0xb2, 0x85,
-  };
-  static const uint8_t kQInverseModP[] = {
-      0xd0, 0xde, 0x19, 0xda, 0x1e, 0xa2, 0xd8, 0x8f, 0x1c, 0x92, 0x73, 0xb0,
-      0xc9, 0x90, 0xc7, 0xf5, 0xec, 0xc5, 0x89, 0x01, 0x05, 0x78, 0x11, 0x2d,
-      0x74, 0x34, 0x44, 0xad, 0xd5, 0xf7, 0xa4, 0xfe, 0x9f, 0x25, 0x4d, 0x0b,
-      0x92, 0xe3, 0xb8, 0x7d, 0xd3, 0xfd, 0xa5, 0xca, 0x95, 0x60, 0xa3, 0xf9,
-      0x55, 0x42, 0x14, 0xb2, 0x45, 0x51, 0x9f, 0x73, 0x88, 0x43, 0x8a, 0xd1,
-      0x65, 0x9e, 0xd1, 0xf7, 0x82, 0x2a, 0x2a, 0x8d, 0x70, 0x56, 0xe3, 0xef,
-      0xc9, 0x0e, 0x2a, 0x2c, 0x15, 0xaf, 0x7f, 0x97, 0x81, 0x66, 0xf3, 0xb5,
-      0x00, 0xa9, 0x26, 0xcc, 0x1e, 0xc2, 0x98, 0xdd, 0xd3, 0x37, 0x06, 0x79,
-      0xb3, 0x60, 0x58, 0x79, 0x99, 0x3f, 0xa3, 0x15, 0x1f, 0x31, 0xe3, 0x11,
-      0x88, 0x4c, 0x35, 0x57, 0xfa, 0x79, 0xd7, 0xd8, 0x72, 0xee, 0x73, 0x95,
-      0x89, 0x29, 0xc7, 0x05, 0x27, 0x68, 0x90, 0x15,
-  };
-
-  RSA *rsa = RSA_new();
-  if (rsa == NULL ||
-      !set_bignum(&rsa->n, kN, sizeof(kN)) ||
-      !set_bignum(&rsa->e, kE, sizeof(kE)) ||
-      !set_bignum(&rsa->d, kD, sizeof(kD)) ||
-      !set_bignum(&rsa->p, kP, sizeof(kP)) ||
-      !set_bignum(&rsa->q, kQ, sizeof(kQ)) ||
-      !set_bignum(&rsa->dmp1, kDModPMinusOne, sizeof(kDModPMinusOne)) ||
-      !set_bignum(&rsa->dmq1, kDModQMinusOne, sizeof(kDModQMinusOne)) ||
-      !set_bignum(&rsa->iqmp, kQInverseModP, sizeof(kQInverseModP))) {
-    RSA_free(rsa);
-    return NULL;
-  }
-
-  return rsa;
-}
-
-static EC_KEY *self_test_ecdsa_key(void) {
-  static const uint8_t kQx[] = {
-      0xc8, 0x15, 0x61, 0xec, 0xf2, 0xe5, 0x4e, 0xde, 0xfe, 0x66, 0x17,
-      0xdb, 0x1c, 0x7a, 0x34, 0xa7, 0x07, 0x44, 0xdd, 0xb2, 0x61, 0xf2,
-      0x69, 0xb8, 0x3d, 0xac, 0xfc, 0xd2, 0xad, 0xe5, 0xa6, 0x81,
-  };
-  static const uint8_t kQy[] = {
-      0xe0, 0xe2, 0xaf, 0xa3, 0xf9, 0xb6, 0xab, 0xe4, 0xc6, 0x98, 0xef,
-      0x64, 0x95, 0xf1, 0xbe, 0x49, 0xa3, 0x19, 0x6c, 0x50, 0x56, 0xac,
-      0xb3, 0x76, 0x3f, 0xe4, 0x50, 0x7e, 0xec, 0x59, 0x6e, 0x88,
-  };
-  static const uint8_t kD[] = {
-      0xc6, 0xc1, 0xaa, 0xda, 0x15, 0xb0, 0x76, 0x61, 0xf8, 0x14, 0x2c,
-      0x6c, 0xaf, 0x0f, 0xdb, 0x24, 0x1a, 0xff, 0x2e, 0xfe, 0x46, 0xc0,
-      0x93, 0x8b, 0x74, 0xf2, 0xbc, 0xc5, 0x30, 0x52, 0xb0, 0x77,
-  };
-
-  EC_KEY *ec_key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
-  BIGNUM *qx = BN_bin2bn(kQx, sizeof(kQx), NULL);
-  BIGNUM *qy = BN_bin2bn(kQy, sizeof(kQy), NULL);
-  BIGNUM *d = BN_bin2bn(kD, sizeof(kD), NULL);
-  if (ec_key == NULL || qx == NULL || qy == NULL || d == NULL ||
-      !EC_KEY_set_public_key_affine_coordinates(ec_key, qx, qy) ||
-      !EC_KEY_set_private_key(ec_key, d)) {
-    EC_KEY_free(ec_key);
-    ec_key = NULL;
-  }
-
-  BN_free(qx);
-  BN_free(qy);
-  BN_free(d);
-  return ec_key;
-}
-
 #if !defined(OPENSSL_ASAN)
 // These symbols are filled in by delocate.go. They point to the start and end
 // of the module, and the location of the integrity hash, respectively.
@@ -322,345 +128,7 @@ BORINGSSL_bcm_power_on_self_test(void) {
  }
 #endif

-  static const uint8_t kAESKey[16] = "BoringCrypto Key";
-  static const uint8_t kAESIV[16] = {0};
-  static const uint8_t kPlaintext[64] =
-      "BoringCryptoModule FIPS KAT Encryption and Decryption Plaintext!";
-  static const uint8_t kAESCBCCiphertext[64] = {
-      0x87, 0x2d, 0x98, 0xc2, 0xcc, 0x31, 0x5b, 0x41, 0xe0, 0xfa, 0x7b,
-      0x0a, 0x71, 0xc0, 0x42, 0xbf, 0x4f, 0x61, 0xd0, 0x0d, 0x58, 0x8c,
-      0xf7, 0x05, 0xfb, 0x94, 0x89, 0xd3, 0xbc, 0xaa, 0x1a, 0x50, 0x45,
-      0x1f, 0xc3, 0x8c, 0xb8, 0x98, 0x86, 0xa3, 0xe3, 0x6c, 0xfc, 0xad,
-      0x3a, 0xb5, 0x59, 0x27, 0x7d, 0x21, 0x07, 0xca, 0x4c, 0x1d, 0x55,
-      0x34, 0xdd, 0x5a, 0x2d, 0xc4, 0xb4, 0xf5, 0xa8,
-#if !defined(BORINGSSL_FIPS_BREAK_AES_CBC)
-      0x35
-#else
-      0x00
-#endif
-  };
-  static const uint8_t kAESGCMCiphertext[80] = {
-      0x4a, 0xd8, 0xe7, 0x7d, 0x78, 0xd7, 0x7d, 0x5e, 0xb2, 0x11, 0xb6, 0xc9,
-      0xa4, 0xbc, 0xb2, 0xae, 0xbe, 0x93, 0xd1, 0xb7, 0xfe, 0x65, 0xc1, 0x82,
-      0x2a, 0xb6, 0x71, 0x5f, 0x1a, 0x7c, 0xe0, 0x1b, 0x2b, 0xe2, 0x53, 0xfa,
-      0xa0, 0x47, 0xfa, 0xd7, 0x8f, 0xb1, 0x4a, 0xc4, 0xdc, 0x89, 0xf9, 0xb4,
-      0x14, 0x4d, 0xde, 0x95, 0xea, 0x29, 0x69, 0x76, 0x81, 0xa3, 0x5c, 0x33,
-      0xd8, 0x37, 0xd8, 0xfa, 0x47, 0x19, 0x46, 0x2f, 0xf1, 0x90, 0xb7, 0x61,
-      0x8f, 0x6f, 0xdd, 0x31, 0x3f, 0x6a, 0x64,
-#if !defined(BORINGSSL_FIPS_BREAK_AES_GCM)
-      0x0d
-#else
-      0x00
-#endif
-  };
-  static const DES_cblock kDESKey1 = {"BCMDESK1"};
-  static const DES_cblock kDESKey2 = {"BCMDESK2"};
-  static const DES_cblock kDESKey3 = {"BCMDESK3"};
-  static const DES_cblock kDESIV = {"BCMDESIV"};
-  static const uint8_t kDESCiphertext[64] = {
-      0xa4, 0x30, 0x7a, 0x4c, 0x1f, 0x60, 0x16, 0xd7, 0x4f, 0x41, 0xe1,
-      0xbb, 0x27, 0xc4, 0x27, 0x37, 0xd4, 0x7f, 0xb9, 0x10, 0xf8, 0xbc,
-      0xaf, 0x93, 0x91, 0xb8, 0x88, 0x24, 0xb1, 0xf6, 0xf8, 0xbd, 0x31,
-      0x96, 0x06, 0x76, 0xde, 0x32, 0xcd, 0x29, 0x29, 0xba, 0x70, 0x5f,
-      0xea, 0xc0, 0xcb, 0xde, 0xc7, 0x75, 0x90, 0xe0, 0x0f, 0x5e, 0x2c,
-      0x0d, 0x49, 0x20, 0xd5, 0x30, 0x83, 0xf8, 0x08,
-#if !defined(BORINGSSL_FIPS_BREAK_DES)
-      0x5a
-#else
-      0x00
-#endif
-  };
-  static const uint8_t kPlaintextSHA1[20] = {
-      0xc6, 0xf8, 0xc9, 0x63, 0x1c, 0x14, 0x23, 0x62, 0x9b, 0xbd,
-      0x55, 0x82, 0xf4, 0xd6, 0x1d, 0xf2, 0xab, 0x7d, 0xc8,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_1)
-      0x28
-#else
-      0x00
-#endif
-  };
-  static const uint8_t kPlaintextSHA256[32] = {
-      0x37, 0xbd, 0x70, 0x53, 0x72, 0xfc, 0xd4, 0x03, 0x79, 0x70, 0xfb,
-      0x06, 0x95, 0xb1, 0x2a, 0x82, 0x48, 0xe1, 0x3e, 0xf2, 0x33, 0xfb,
-      0xef, 0x29, 0x81, 0x22, 0x45, 0x40, 0x43, 0x70, 0xce,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_256)
-      0x0f
-#else
-      0x00
-#endif
-  };
-  static const uint8_t kPlaintextSHA512[64] = {
-      0x08, 0x6a, 0x1c, 0x84, 0x61, 0x9d, 0x8e, 0xb3, 0xc0, 0x97, 0x4e,
-      0xa1, 0x9f, 0x9c, 0xdc, 0xaf, 0x3b, 0x5c, 0x31, 0xf0, 0xf2, 0x74,
-      0xc3, 0xbd, 0x6e, 0xd6, 0x1e, 0xb2, 0xbb, 0x34, 0x74, 0x72, 0x5c,
-      0x51, 0x29, 0x8b, 0x87, 0x3a, 0xa3, 0xf2, 0x25, 0x23, 0xd4, 0x1c,
-      0x82, 0x1b, 0xfe, 0xd3, 0xc6, 0xee, 0xb5, 0xd6, 0xaf, 0x07, 0x7b,
-      0x98, 0xca, 0xa7, 0x01, 0xf3, 0x94, 0xf3, 0x68,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_512)
-      0x14
-#else
-      0x00
-#endif
-  };
-  static const uint8_t kRSASignature[256] = {
-      0x62, 0x66, 0x4b, 0xe3, 0xb1, 0xd2, 0x83, 0xf1, 0xa8, 0x56, 0x2b, 0x33,
-      0x60, 0x1e, 0xdb, 0x1e, 0x06, 0xf7, 0xa7, 0x1e, 0xa8, 0xef, 0x03, 0x4d,
-      0x0c, 0xf6, 0x83, 0x75, 0x7a, 0xf0, 0x14, 0xc7, 0xe2, 0x94, 0x3a, 0xb5,
-      0x67, 0x56, 0xa5, 0x48, 0x7f, 0x3a, 0xa5, 0xbf, 0xf7, 0x1d, 0x44, 0xa6,
-      0x34, 0xed, 0x9b, 0xd6, 0x51, 0xaa, 0x2c, 0x4e, 0xce, 0x60, 0x5f, 0xe9,
-      0x0e, 0xd5, 0xcd, 0xeb, 0x23, 0x27, 0xf8, 0xfb, 0x45, 0xe5, 0x34, 0x63,
-      0x77, 0x7f, 0x2e, 0x80, 0xcf, 0x9d, 0x2e, 0xfc, 0xe2, 0x50, 0x75, 0x29,
-      0x46, 0xf4, 0xaf, 0x91, 0xed, 0x36, 0xe1, 0x5e, 0xef, 0x66, 0xa1, 0xff,
-      0x27, 0xfc, 0x87, 0x7e, 0x60, 0x84, 0x0f, 0x54, 0x51, 0x56, 0x0f, 0x68,
-      0x99, 0xc0, 0x3f, 0xeb, 0xa5, 0xa0, 0x46, 0xb0, 0x86, 0x02, 0xb0, 0xc8,
-      0xe8, 0x46, 0x13, 0x06, 0xcd, 0xb7, 0x8a, 0xd0, 0x3b, 0x46, 0xd0, 0x14,
-      0x64, 0x53, 0x9b, 0x5b, 0x5e, 0x02, 0x45, 0xba, 0x6e, 0x7e, 0x0a, 0xb9,
-      0x9e, 0x62, 0xb7, 0xd5, 0x7a, 0x87, 0xea, 0xd3, 0x24, 0xa5, 0xef, 0xb3,
-      0xdc, 0x05, 0x9c, 0x04, 0x60, 0x4b, 0xde, 0xa8, 0x90, 0x08, 0x7b, 0x6a,
-      0x5f, 0xb4, 0x3f, 0xda, 0xc5, 0x1f, 0x6e, 0xd6, 0x15, 0xde, 0x65, 0xa4,
-      0x6e, 0x62, 0x9d, 0x8f, 0xa8, 0xbe, 0x86, 0xf6, 0x09, 0x90, 0x40, 0xa5,
-      0xf4, 0x23, 0xc5, 0xf6, 0x38, 0x86, 0x0d, 0x1c, 0xed, 0x4a, 0x0a, 0xae,
-      0xa4, 0x26, 0xc2, 0x2e, 0xd3, 0x13, 0x66, 0x61, 0xea, 0x35, 0x01, 0x0e,
-      0x13, 0xda, 0x78, 0x20, 0xae, 0x59, 0x5f, 0x9b, 0xa9, 0x6c, 0xf9, 0x1b,
-      0xdf, 0x76, 0x53, 0xc8, 0xa7, 0xf5, 0x63, 0x6d, 0xf3, 0xff, 0xfd, 0xaf,
-      0x75, 0x4b, 0xac, 0x67, 0xb1, 0x3c, 0xbf, 0x5e, 0xde, 0x73, 0x02, 0x6d,
-      0xd2, 0x0c, 0xb1,
-#if !defined(BORINGSSL_FIPS_BREAK_RSA_SIG)
-      0x64
-#else
-      0x00
-#endif
-  };
-  const uint8_t kDRBGEntropy[48] =
-      "BCM Known Answer Test DBRG Initial Entropy      ";
-  const uint8_t kDRBGPersonalization[18] = "BCMPersonalization";
-  const uint8_t kDRBGAD[16] = "BCM DRBG KAT AD ";
-  const uint8_t kDRBGOutput[64] = {
-      0x1d, 0x63, 0xdf, 0x05, 0x51, 0x49, 0x22, 0x46, 0xcd, 0x9b, 0xc5,
-      0xbb, 0xf1, 0x5d, 0x44, 0xae, 0x13, 0x78, 0xb1, 0xe4, 0x7c, 0xf1,
-      0x96, 0x33, 0x3d, 0x60, 0xb6, 0x29, 0xd4, 0xbb, 0x6b, 0x44, 0xf9,
-      0xef, 0xd9, 0xf4, 0xa2, 0xba, 0x48, 0xea, 0x39, 0x75, 0x59, 0x32,
-      0xf7, 0x31, 0x2c, 0x98, 0x14, 0x2b, 0x49, 0xdf, 0x02, 0xb6, 0x5d,
-      0x71, 0x09, 0x50, 0xdb, 0x23, 0xdb, 0xe5, 0x22,
-#if !defined(BORINGSSL_FIPS_BREAK_DRBG)
-      0x95
-#else
-      0x00
-#endif
-  };
-  const uint8_t kDRBGEntropy2[48] =
-      "BCM Known Answer Test DBRG Reseed Entropy       ";
-  const uint8_t kDRBGReseedOutput[64] = {
-      0xa4, 0x77, 0x05, 0xdb, 0x14, 0x11, 0x76, 0x71, 0x42, 0x5b, 0xd8,
-      0xd7, 0xa5, 0x4f, 0x8b, 0x39, 0xf2, 0x10, 0x4a, 0x50, 0x5b, 0xa2,
-      0xc8, 0xf0, 0xbb, 0x3e, 0xa1, 0xa5, 0x90, 0x7d, 0x54, 0xd9, 0xc6,
-      0xb0, 0x96, 0xc0, 0x2b, 0x7e, 0x9b, 0xc9, 0xa1, 0xdd, 0x78, 0x2e,
-      0xd5, 0xa8, 0x66, 0x16, 0xbd, 0x18, 0x3c, 0xf2, 0xaa, 0x7a, 0x2b,
-      0x37, 0xf9, 0xab, 0x35, 0x64, 0x15, 0x01, 0x3f, 0xc4,
-  };
-  const uint8_t kECDSASigR[32] = {
-      0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0,
-      0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02,
-      0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5,
-#if !defined(BORINGSSL_FIPS_BREAK_ECDSA_SIG)
-      0x0c,
-#else
-      0x00,
-#endif
-  };
-  const uint8_t kECDSASigS[32] = {
-      0xa5, 0x93, 0xe0, 0x23, 0x91, 0xe7, 0x4b, 0x8d, 0x77, 0x25, 0xa6,
-      0xba, 0x4d, 0xd9, 0x86, 0x77, 0xda, 0x7d, 0x8f, 0xef, 0xc4, 0x1a,
-      0xf0, 0xcc, 0x81, 0xe5, 0xea, 0x3f, 0xc2, 0x41, 0x7f, 0xd8,
-  };
-
-  AES_KEY aes_key;
-  uint8_t aes_iv[16];
-  uint8_t output[256];
-
-  // AES-CBC Encryption KAT
-  memcpy(aes_iv, kAESIV, sizeof(kAESIV));
-  if (AES_set_encrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key) != 0) {
-    goto err;
-  }
-  AES_cbc_encrypt(kPlaintext, output, sizeof(kPlaintext), &aes_key, aes_iv,
-                  AES_ENCRYPT);
-  if (!check_test(kAESCBCCiphertext, output, sizeof(kAESCBCCiphertext),
-                  "AES-CBC Encryption KAT")) {
-    goto err;
-  }
-
-  // AES-CBC Decryption KAT
-  memcpy(aes_iv, kAESIV, sizeof(kAESIV));
-  if (AES_set_decrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key) != 0) {
-    goto err;
-  }
-  AES_cbc_encrypt(kAESCBCCiphertext, output, sizeof(kAESCBCCiphertext),
-                  &aes_key, aes_iv, AES_DECRYPT);
-  if (!check_test(kPlaintext, output, sizeof(kPlaintext),
-                  "AES-CBC Decryption KAT")) {
-    goto err;
-  }
-
-  size_t out_len;
-  uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH];
-  OPENSSL_memset(nonce, 0, sizeof(nonce));
-  EVP_AEAD_CTX aead_ctx;
-  if (!EVP_AEAD_CTX_init(&aead_ctx, EVP_aead_aes_128_gcm(), kAESKey,
-                         sizeof(kAESKey), 0, NULL)) {
-    goto err;
-  }
-
-  // AES-GCM Encryption KAT
-  if (!EVP_AEAD_CTX_seal(&aead_ctx, output, &out_len, sizeof(output), nonce,
-                         EVP_AEAD_nonce_length(EVP_aead_aes_128_gcm()),
-                         kPlaintext, sizeof(kPlaintext), NULL, 0) ||
-      !check_test(kAESGCMCiphertext, output, sizeof(kAESGCMCiphertext),
-                  "AES-GCM Encryption KAT")) {
-    goto err;
-  }
-
-  // AES-GCM Decryption KAT
-  if (!EVP_AEAD_CTX_open(&aead_ctx, output, &out_len, sizeof(output), nonce,
-                         EVP_AEAD_nonce_length(EVP_aead_aes_128_gcm()),
-                         kAESGCMCiphertext, sizeof(kAESGCMCiphertext), NULL,
-                         0) ||
-      !check_test(kPlaintext, output, sizeof(kPlaintext),
-                  "AES-GCM Decryption KAT")) {
-    goto err;
-  }
-
-  EVP_AEAD_CTX_cleanup(&aead_ctx);
-
-  DES_key_schedule des1, des2, des3;
-  DES_cblock des_iv;
-  DES_set_key(&kDESKey1, &des1);
-  DES_set_key(&kDESKey2, &des2);
-  DES_set_key(&kDESKey3, &des3);
-
-  // 3DES Encryption KAT
-  memcpy(&des_iv, &kDESIV, sizeof(des_iv));
-  DES_ede3_cbc_encrypt(kPlaintext, output, sizeof(kPlaintext), &des1, &des2,
-                       &des3, &des_iv, DES_ENCRYPT);
-  if (!check_test(kDESCiphertext, output, sizeof(kDESCiphertext),
-                  "3DES Encryption KAT")) {
-    goto err;
-  }
-
-  // 3DES Decryption KAT
-  memcpy(&des_iv, &kDESIV, sizeof(des_iv));
-  DES_ede3_cbc_encrypt(kDESCiphertext, output, sizeof(kDESCiphertext), &des1,
-                       &des2, &des3, &des_iv, DES_DECRYPT);
-  if (!check_test(kPlaintext, output, sizeof(kPlaintext),
-                  "3DES Decryption KAT")) {
-    goto err;
-  }
-
-  // SHA-1 KAT
-  SHA1(kPlaintext, sizeof(kPlaintext), output);
-  if (!check_test(kPlaintextSHA1, output, sizeof(kPlaintextSHA1),
-                  "SHA-1 KAT")) {
-    goto err;
-  }
-
-  // SHA-256 KAT
-  SHA256(kPlaintext, sizeof(kPlaintext), output);
-  if (!check_test(kPlaintextSHA256, output, sizeof(kPlaintextSHA256),
-                  "SHA-256 KAT")) {
-    goto err;
-  }
-
-  // SHA-512 KAT
-  SHA512(kPlaintext, sizeof(kPlaintext), output);
-  if (!check_test(kPlaintextSHA512, output, sizeof(kPlaintextSHA512),
-                  "SHA-512 KAT")) {
-    goto err;
-  }
-
-  RSA *rsa_key = self_test_rsa_key();
-  if (rsa_key == NULL) {
-    printf("RSA KeyGen failed\n");
-    goto err;
-  }
-
-  // RSA Sign KAT
-  unsigned sig_len;
-
-  // Disable blinding for the power-on tests because it's not needed and
-  // triggers an entropy draw.
-  rsa_key->flags |= RSA_FLAG_NO_BLINDING;
-
-  if (!RSA_sign(NID_sha256, kPlaintextSHA256, sizeof(kPlaintextSHA256), output,
-                &sig_len, rsa_key) ||
-      !check_test(kRSASignature, output, sizeof(kRSASignature),
-                  "RSA Sign KAT")) {
-    goto err;
-  }
-
-  // RSA Verify KAT
-  if (!RSA_verify(NID_sha256, kPlaintextSHA256, sizeof(kPlaintextSHA256),
-                  kRSASignature, sizeof(kRSASignature), rsa_key)) {
-    printf("RSA Verify KAT failed.\n");
-    goto err;
-  }
-
-  RSA_free(rsa_key);
-
-  EC_KEY *ec_key = self_test_ecdsa_key();
-  if (ec_key == NULL) {
-    printf("ECDSA KeyGen failed\n");
-    goto err;
-  }
-
-  // ECDSA Sign/Verify PWCT
-
-  // The 'k' value for ECDSA is fixed to avoid an entropy draw.
-  ec_key->fixed_k = BN_new();
-  if (ec_key->fixed_k == NULL ||
-      !BN_set_word(ec_key->fixed_k, 42)) {
-    printf("Out of memory\n");
-    goto err;
-  }
-
-  ECDSA_SIG *sig =
-      ECDSA_do_sign(kPlaintextSHA256, sizeof(kPlaintextSHA256), ec_key);
-
-  uint8_t ecdsa_r_bytes[sizeof(kECDSASigR)];
-  uint8_t ecdsa_s_bytes[sizeof(kECDSASigS)];
-  if (sig == NULL ||
-      BN_num_bytes(sig->r) != sizeof(ecdsa_r_bytes) ||
-      !BN_bn2bin(sig->r, ecdsa_r_bytes) ||
-      BN_num_bytes(sig->s) != sizeof(ecdsa_s_bytes) ||
-      !BN_bn2bin(sig->s, ecdsa_s_bytes) ||
-      !check_test(kECDSASigR, ecdsa_r_bytes, sizeof(kECDSASigR), "ECDSA R") ||
-      !check_test(kECDSASigS, ecdsa_s_bytes, sizeof(kECDSASigS), "ECDSA S")) {
-    printf("ECDSA KAT failed.\n");
-    goto err;
-  }
-
-  ECDSA_SIG_free(sig);
-  EC_KEY_free(ec_key);
-
-  // DBRG KAT
-  CTR_DRBG_STATE drbg;
-  if (!CTR_DRBG_init(&drbg, kDRBGEntropy, kDRBGPersonalization,
-                     sizeof(kDRBGPersonalization)) ||
-      !CTR_DRBG_generate(&drbg, output, sizeof(kDRBGOutput), kDRBGAD,
-                         sizeof(kDRBGAD)) ||
-      !check_test(kDRBGOutput, output, sizeof(kDRBGOutput),
-                  "DBRG Generate KAT") ||
-      !CTR_DRBG_reseed(&drbg, kDRBGEntropy2, kDRBGAD, sizeof(kDRBGAD)) ||
-      !CTR_DRBG_generate(&drbg, output, sizeof(kDRBGReseedOutput), kDRBGAD,
-                         sizeof(kDRBGAD)) ||
-      !check_test(kDRBGReseedOutput, output, sizeof(kDRBGReseedOutput),
-                  "DRBG Reseed KAT")) {
-    goto err;
-  }
-  CTR_DRBG_clear(&drbg);
-
-  CTR_DRBG_STATE kZeroDRBG;
-  memset(&kZeroDRBG, 0, sizeof(kZeroDRBG));
-  if (!check_test(&kZeroDRBG, &drbg, sizeof(drbg), "DRBG Clear KAT")) {
+  if (!BORINGSSL_self_test()) {
    goto err;
  }

@@ -676,4 +144,5 @@ void BORINGSSL_FIPS_abort(void) {
    exit(1);
  }
 }
+
 #endif  // BORINGSSL_FIPS
@@ -100,61 +100,38 @@ int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  return ret;
 }

-int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
-  int max, min, dif;
-  BN_ULONG *ap, *bp, *rp, carry, t1, t2;
-  const BIGNUM *tmp;
-
-  if (a->top < b->top) {
-    tmp = a;
+int bn_uadd_fixed(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+  // Widths are public, so we normalize to make |a| the larger one.
+  if (a->width < b->width) {
+    const BIGNUM *tmp = a;
    a = b;
    b = tmp;
  }
-  max = a->top;
-  min = b->top;
-  dif = max - min;

+  int max = a->width;
+  int min = b->width;
  if (!bn_wexpand(r, max + 1)) {
    return 0;
  }
+  r->width = max + 1;

-  r->top = max;
-
-  ap = a->d;
-  bp = b->d;
-  rp = r->d;
-
-  carry = bn_add_words(rp, ap, bp, min);
-  rp += min;
-  ap += min;
-  bp += min;
-
-  if (carry) {
-    while (dif) {
-      dif--;
-      t1 = *(ap++);
-      t2 = t1 + 1;
-      *(rp++) = t2;
-      if (t2) {
-        carry = 0;
-        break;
-      }
-    }
-    if (carry) {
-      // carry != 0 => dif == 0
-      *rp = 1;
-      r->top++;
-    }
+  BN_ULONG carry = bn_add_words(r->d, a->d, b->d, min);
+  for (int i = min; i < max; i++) {
+    // |r| and |a| may alias, so use a temporary.
+    BN_ULONG tmp = carry + a->d[i];
+    carry = tmp < a->d[i];
+    r->d[i] = tmp;
  }

-  if (dif && rp != ap) {
-    while (dif--) {
-      // copy remaining words if ap != rp
-      *(rp++) = *(ap++);
-    }
-  }
+  r->d[max] = carry;
+  return 1;
+}

-  r->neg = 0;
+int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+  if (!bn_uadd_fixed(r, a, b)) {
+    return 0;
+  }
+  bn_set_minimal_width(r);
  return 1;
 }

@@ -182,16 +159,16 @@ int BN_add_word(BIGNUM *a, BN_ULONG w) {
    return i;
  }

-  for (i = 0; w != 0 && i < a->top; i++) {
+  for (i = 0; w != 0 && i < a->width; i++) {
    a->d[i] = l = a->d[i] + w;
    w = (w > l) ? 1 : 0;
  }

-  if (w && i == a->top) {
-    if (!bn_wexpand(a, a->top + 1)) {
+  if (w && i == a->width) {
+    if (!bn_wexpand(a, a->width + 1)) {
      return 0;
    }
-    a->top++;
+    a->width++;
    a->d[i] = w;
  }

@@ -199,7 +176,6 @@ int BN_add_word(BIGNUM *a, BN_ULONG w) {
 }

 int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
-  int max;
  int add = 0, neg = 0;
  const BIGNUM *tmp;

@@ -232,13 +208,6 @@ int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
    return 1;
  }

-  // We are actually doing a - b :-)
-
-  max = (a->top > b->top) ? a->top : b->top;
-  if (!bn_wexpand(r, max)) {
-    return 0;
-  }
-
  if (BN_ucmp(a, b) < 0) {
    if (!BN_usub(r, b, a)) {
      return 0;
@@ -259,8 +228,8 @@ int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  register BN_ULONG t1, t2, *ap, *bp, *rp;
  int i, carry;

-  max = a->top;
-  min = b->top;
+  max = bn_minimal_width(a);
+  min = bn_minimal_width(b);
  dif = max - min;

  if (dif < 0)  // hmm... should not be happening
@@ -313,9 +282,9 @@ int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
    OPENSSL_memcpy(rp, ap, sizeof(*rp) * dif);
  }

-  r->top = max;
+  r->width = max;
  r->neg = 0;
-  bn_correct_top(r);
+  bn_set_minimal_width(r);

  return 1;
 }
@@ -345,7 +314,7 @@ int BN_sub_word(BIGNUM *a, BN_ULONG w) {
    return i;
  }

-  if ((a->top == 1) && (a->d[0] < w)) {
+  if ((bn_minimal_width(a) == 1) && (a->d[0] < w)) {
    a->d[0] = w - a->d[0];
    a->neg = 1;
    return 1;
@@ -363,8 +332,8 @@ int BN_sub_word(BIGNUM *a, BN_ULONG w) {
    }
  }

-  if ((a->d[i] == 0) && (i == (a->top - 1))) {
-    a->top--;
+  if ((a->d[i] == 0) && (i == (a->width - 1))) {
+    a->width--;
  }

  return 1;
@@ -97,6 +97,10 @@ $_num="$num,#15*4";	$_bpend=$_num;
 $code=<<___;
 #include <openssl/arm_arch.h>

+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch  armv7-a
+
 .text
 #if defined(__thumb2__)
 .syntax	unified
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../../perlasm");
@@ -1,4 +1,10 @@
-#!/usr/local/bin/perl
+#! /usr/bin/env perl
+# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../../perlasm");
@@ -1,61 +1,30 @@
-#!/usr/bin/env perl
-
-##############################################################################
-#                                                                            #
-#  Copyright (c) 2012, Intel Corporation                                     #
-#                                                                            #
-#  All rights reserved.                                                      #
-#                                                                            #
-#  Redistribution and use in source and binary forms, with or without        #
-#  modification, are permitted provided that the following conditions are    #
-#  met:                                                                      #
-#                                                                            #
-#  *  Redistributions of source code must retain the above copyright         #
-#     notice, this list of conditions and the following disclaimer.          #
-#                                                                            #
-#  *  Redistributions in binary form must reproduce the above copyright      #
-#     notice, this list of conditions and the following disclaimer in the    #
-#     documentation and/or other materials provided with the                 #
-#     distribution.                                                          #
-#                                                                            #
-#  *  Neither the name of the Intel Corporation nor the names of its         #
-#     contributors may be used to endorse or promote products derived from   #
-#     this software without specific prior written permission.               #
-#                                                                            #
-#                                                                            #
-#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
-#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
-#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
-#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
-#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
-#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
-#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
-#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
-#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
-#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
-#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
-#                                                                            #
-##############################################################################
-# Developers and authors:                                                    #
-# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
-# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
-# (2) University of Haifa, Israel                                            #
-##############################################################################
-# Reference:                                                                 #
-# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
-#     Exponentiation,  Using Advanced Vector Instructions Architectures",    #
-#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
-#     pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
-# [2] S. Gueron: "Efficient Software Implementations of Modular              #
-#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
-# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE         #
-#     Proceedings of 9th International Conference on Information Technology: #
-#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
-# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
-#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
-#     on AVX2 capable x86_64 platforms",                                     #
-#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
-##############################################################################
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel
+# (2) University of Haifa, Israel
+#
+# References:
+# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
+#     Exponentiation,  Using Advanced Vector Instructions Architectures",
+#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
+#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
+# [2] S. Gueron: "Efficient Software Implementations of Modular
+#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
+# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE
+#     Proceedings of 9th International Conference on Information Technology:
+#     New Generations (ITNG 2012), pp.821-823 (2012)
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
+#     resistant 1024-bit modular exponentiation, for optimizing RSA2048
+#     on AVX2 capable x86_64 platforms",
+#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
 #
 # +13% improvement over original submission by <appro@openssl.org>
 #
@@ -232,7 +201,7 @@ $code.=<<___;
 	vmovdqu		32*8-128($ap), $ACC8

 	lea	192(%rsp), $tp0			# 64+128=192
-	vpbroadcastq	.Land_mask(%rip), $AND_MASK
+	vmovdqu	.Land_mask(%rip), $AND_MASK
 	jmp	.LOOP_GRANDE_SQR_1024

 .align	32
@@ -1082,10 +1051,10 @@ $code.=<<___;
 	vpmuludq	32*6-128($np),$Yi,$TEMP1
 	vpaddq		$TEMP1,$ACC6,$ACC6
 	vpmuludq	32*7-128($np),$Yi,$TEMP2
-	 vpblendd	\$3, $ZERO, $ACC9, $ACC9	# correct $ACC3
+	 vpblendd	\$3, $ZERO, $ACC9, $TEMP1	# correct $ACC3
 	vpaddq		$TEMP2,$ACC7,$ACC7
 	vpmuludq	32*8-128($np),$Yi,$TEMP0
-	 vpaddq		$ACC9, $ACC3, $ACC3		# correct $ACC3
+	 vpaddq		$TEMP1, $ACC3, $ACC3		# correct $ACC3
 	vpaddq		$TEMP0,$ACC8,$ACC8

 	mov	%rbx, %rax
@@ -1098,7 +1067,9 @@ $code.=<<___;
 	 vmovdqu	-8+32*2-128($ap),$TEMP2

 	mov	$r1, %rax
+	 vpblendd	\$0xfc, $ZERO, $ACC9, $ACC9	# correct $ACC3
 	imull	$n0, %eax
+	 vpaddq		$ACC9,$ACC4,$ACC4		# correct $ACC3
 	and	\$0x1fffffff, %eax

 	 imulq	16-128($ap),%rbx
@@ -1334,15 +1305,12 @@ ___
 #	But as we underutilize resources, it's possible to correct in
 #	each iteration with marginal performance loss. But then, as
 #	we do it in each iteration, we can correct less digits, and
-#	avoid performance penalties completely. Also note that we
-#	correct only three digits out of four. This works because
-#	most significant digit is subjected to less additions.
+#	avoid performance penalties completely.

 $TEMP0 = $ACC9;
 $TEMP3 = $Bi;
 $TEMP4 = $Yi;
 $code.=<<___;
-	vpermq		\$0, $AND_MASK, $AND_MASK
 	vpaddq		(%rsp), $TEMP1, $ACC0

 	vpsrlq		\$29, $ACC0, $TEMP1
@@ -1790,7 +1758,7 @@ $code.=<<___;

 .align	64
 .Land_mask:
-	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
+	.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
 .Lscatter_permd:
 	.long	0,2,4,6,7,7,7,7
 .Lgather_permd:
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -71,7 +78,7 @@ $frame=32;				# size of above frame rounded up to 16n
 	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
 	&neg	("edi");

-	# minimize cache contention by arraning 2K window between stack
+	# minimize cache contention by arranging 2K window between stack
 	# pointer and ap argument [np is also position sensitive vector,
 	# but it's assumed to be near ap, as it's allocated at ~same
 	# time].
@@ -52,8 +52,9 @@

 #include <openssl/bn.h>

-// TODO(davidben): Get this file working on Windows x64.
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
+// TODO(davidben): Get this file working on MSVC x64.
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+    (defined(__GNUC__) || defined(__clang__))

 #include "../internal.h"

@@ -537,4 +538,4 @@ void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
 #undef mul_add_c2
 #undef sqr_add_c2

-#endif  // !NO_ASM && X86_64 && __GNUC__
+#endif  // !NO_ASM && X86_64 && (__GNUC__ || __clang__)
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -296,12 +303,11 @@ $code.=<<___;
 	mov	$num,$j			# j=num
 	jmp	.Lsub
 .align	16
-.Lsub:
-	sbb	($np,$i,8),%rax
+.Lsub:	sbb	($np,$i,8),%rax
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
 	mov	8($ap,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
-	dec	$j			# doesnn't affect CF!
+	dec	$j			# doesn't affect CF!
 	jnz	.Lsub

 	sbb	\$0,%rax		# handle upmost overflow bit
@@ -732,7 +738,7 @@ $code.=<<___;
 	mov	56($ap,$i,8),@ri[3]
 	sbb	40($np,$i,8),@ri[1]
 	lea	4($i),$i		# i++
-	dec	$j			# doesnn't affect CF!
+	dec	$j			# doesn't affect CF!
 	jnz	.Lsub4x

 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -396,12 +403,11 @@ $code.=<<___;
 	mov	$num,$j			# j=num
 	jmp	.Lsub
 .align	16
-.Lsub:
-	sbb	($np,$i,8),%rax
+.Lsub:	sbb	($np,$i,8),%rax
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
 	mov	8($ap,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
-	dec	$j			# doesnn't affect CF!
+	dec	$j			# doesn't affect CF!
 	jnz	.Lsub

 	sbb	\$0,%rax		# handle upmost overflow bit
@@ -2405,7 +2411,7 @@ my $N=$STRIDE/4;		# should match cache line size
 $code.=<<___;
 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
 	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
-	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimizaton)
+	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
 	lea	128($bp),$bptr		# size optimization

 	pshufd	\$0,%xmm5,%xmm5		# broadcast index
@@ -148,13 +148,13 @@ BIGNUM *BN_copy(BIGNUM *dest, const BIGNUM *src) {
    return dest;
  }

-  if (!bn_wexpand(dest, src->top)) {
+  if (!bn_wexpand(dest, src->width)) {
    return NULL;
  }

-  OPENSSL_memcpy(dest->d, src->d, sizeof(src->d[0]) * src->top);
+  OPENSSL_memcpy(dest->d, src->d, sizeof(src->d[0]) * src->width);

-  dest->top = src->top;
+  dest->width = src->width;
  dest->neg = src->neg;
  return dest;
 }
@@ -164,14 +164,14 @@ void BN_clear(BIGNUM *bn) {
    OPENSSL_memset(bn->d, 0, bn->dmax * sizeof(bn->d[0]));
  }

-  bn->top = 0;
+  bn->width = 0;
  bn->neg = 0;
 }

 DEFINE_METHOD_FUNCTION(BIGNUM, BN_value_one) {
  static const BN_ULONG kOneLimbs[1] = { 1 };
  out->d = (BN_ULONG*) kOneLimbs;
-  out->top = 1;
+  out->width = 1;
  out->dmax = 1;
  out->neg = 0;
  out->flags = BN_FLG_STATIC_DATA;
@@ -180,61 +180,59 @@ DEFINE_METHOD_FUNCTION(BIGNUM, BN_value_one) {
 // BN_num_bits_word returns the minimum number of bits needed to represent the
 // value in |l|.
 unsigned BN_num_bits_word(BN_ULONG l) {
-  static const unsigned char bits[256] = {
-      0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
-      5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-      6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-      7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+  // |BN_num_bits| is often called on RSA prime factors. These have public bit
+  // lengths, but all bits beyond the high bit are secret, so count bits in
+  // constant time.
+  BN_ULONG x, mask;
+  int bits = (l != 0);

-#if defined(OPENSSL_64_BIT)
-  if (l & 0xffffffff00000000L) {
-    if (l & 0xffff000000000000L) {
-      if (l & 0xff00000000000000L) {
-        return (bits[(int)(l >> 56)] + 56);
-      } else {
-        return (bits[(int)(l >> 48)] + 48);
-      }
-    } else {
-      if (l & 0x0000ff0000000000L) {
-        return (bits[(int)(l >> 40)] + 40);
-      } else {
-        return (bits[(int)(l >> 32)] + 32);
-      }
-    }
-  } else
+#if BN_BITS2 > 32
+  x = l >> 32;
+  mask = 0u - x;
+  mask = (0u - (mask >> (BN_BITS2 - 1)));
+  bits += 32 & mask;
+  l ^= (x ^ l) & mask;
 #endif
-  {
-    if (l & 0xffff0000L) {
-      if (l & 0xff000000L) {
-        return (bits[(int)(l >> 24L)] + 24);
-      } else {
-        return (bits[(int)(l >> 16L)] + 16);
-      }
-    } else {
-      if (l & 0xff00L) {
-        return (bits[(int)(l >> 8)] + 8);
-      } else {
-        return (bits[(int)(l)]);
-      }
-    }
-  }
+
+  x = l >> 16;
+  mask = 0u - x;
+  mask = (0u - (mask >> (BN_BITS2 - 1)));
+  bits += 16 & mask;
+  l ^= (x ^ l) & mask;
+
+  x = l >> 8;
+  mask = 0u - x;
+  mask = (0u - (mask >> (BN_BITS2 - 1)));
+  bits += 8 & mask;
+  l ^= (x ^ l) & mask;
+
+  x = l >> 4;
+  mask = 0u - x;
+  mask = (0u - (mask >> (BN_BITS2 - 1)));
+  bits += 4 & mask;
+  l ^= (x ^ l) & mask;
+
+  x = l >> 2;
+  mask = 0u - x;
+  mask = (0u - (mask >> (BN_BITS2 - 1)));
+  bits += 2 & mask;
+  l ^= (x ^ l) & mask;
+
+  x = l >> 1;
+  mask = 0u - x;
+  mask = (0u - (mask >> (BN_BITS2 - 1)));
+  bits += 1 & mask;
+
+  return bits;
 }

 unsigned BN_num_bits(const BIGNUM *bn) {
-  const int max = bn->top - 1;
-
-  if (BN_is_zero(bn)) {
+  const int width = bn_minimal_width(bn);
+  if (width == 0) {
    return 0;
  }

-  return max*BN_BITS2 + BN_num_bits_word(bn->d[max]);
+  return (width - 1) * BN_BITS2 + BN_num_bits_word(bn->d[width - 1]);
 }

 unsigned BN_num_bytes(const BIGNUM *bn) {
@@ -242,7 +240,7 @@ unsigned BN_num_bytes(const BIGNUM *bn) {
 }

 void BN_zero(BIGNUM *bn) {
-  bn->top = bn->neg = 0;
+  bn->width = bn->neg = 0;
 }

 int BN_one(BIGNUM *bn) {
@@ -261,7 +259,7 @@ int BN_set_word(BIGNUM *bn, BN_ULONG value) {

  bn->neg = 0;
  bn->d[0] = value;
-  bn->top = 1;
+  bn->width = 1;
  return 1;
 }

@@ -280,7 +278,7 @@ int BN_set_u64(BIGNUM *bn, uint64_t value) {
  bn->neg = 0;
  bn->d[0] = (BN_ULONG)value;
  bn->d[1] = (BN_ULONG)(value >> 32);
-  bn->top = 2;
+  bn->width = 2;
  return 1;
 #else
 #error "BN_BITS2 must be 32 or 64."
@@ -293,12 +291,40 @@ int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
  }
  OPENSSL_memmove(bn->d, words, num * sizeof(BN_ULONG));
  // |bn_wexpand| verified that |num| isn't too large.
-  bn->top = (int)num;
-  bn_correct_top(bn);
+  bn->width = (int)num;
  bn->neg = 0;
  return 1;
 }

+int bn_fits_in_words(const BIGNUM *bn, size_t num) {
+  // All words beyond |num| must be zero.
+  BN_ULONG mask = 0;
+  for (size_t i = num; i < (size_t)bn->width; i++) {
+    mask |= bn->d[i];
+  }
+  return mask == 0;
+}
+
+int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn) {
+  if (bn->neg) {
+    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
+    return 0;
+  }
+
+  size_t width = (size_t)bn->width;
+  if (width > num) {
+    if (!bn_fits_in_words(bn, num)) {
+      OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
+      return 0;
+    }
+    width = num;
+  }
+
+  OPENSSL_memset(out, 0, sizeof(BN_ULONG) * num);
+  OPENSSL_memcpy(out, bn->d, sizeof(BN_ULONG) * width);
+  return 1;
+}
+
 int BN_is_negative(const BIGNUM *bn) {
  return bn->neg != 0;
 }
@@ -334,7 +360,7 @@ int bn_wexpand(BIGNUM *bn, size_t words) {
    return 0;
  }

-  OPENSSL_memcpy(a, bn->d, sizeof(BN_ULONG) * bn->top);
+  OPENSSL_memcpy(a, bn->d, sizeof(BN_ULONG) * bn->width);

  OPENSSL_free(bn->d);
  bn->d = a;
@@ -351,20 +377,46 @@ int bn_expand(BIGNUM *bn, size_t bits) {
  return bn_wexpand(bn, (bits+BN_BITS2-1)/BN_BITS2);
 }

-void bn_correct_top(BIGNUM *bn) {
-  BN_ULONG *ftl;
-  int tmp_top = bn->top;
-
-  if (tmp_top > 0) {
-    for (ftl = &(bn->d[tmp_top - 1]); tmp_top > 0; tmp_top--) {
-      if (*(ftl--)) {
-        break;
-      }
+int bn_resize_words(BIGNUM *bn, size_t words) {
+  if ((size_t)bn->width <= words) {
+    if (!bn_wexpand(bn, words)) {
+      return 0;
    }
-    bn->top = tmp_top;
+    OPENSSL_memset(bn->d + bn->width, 0,
+                   (words - bn->width) * sizeof(BN_ULONG));
+    bn->width = words;
+    return 1;
  }

-  if (bn->top == 0) {
+  // All words beyond the new width must be zero.
+  if (!bn_fits_in_words(bn, words)) {
+    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
+    return 0;
+  }
+  bn->width = words;
+  return 1;
+}
+
+void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
+                     const BN_ULONG *b, size_t num) {
+  for (size_t i = 0; i < num; i++) {
+    OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
+                           crypto_word_t_too_small);
+    r[i] = constant_time_select_w(mask, a[i], b[i]);
+  }
+}
+
+int bn_minimal_width(const BIGNUM *bn) {
+  int ret = bn->width;
+  while (ret > 0 && bn->d[ret - 1] == 0) {
+    ret--;
+  }
+  return ret;
+}
+
+void bn_set_minimal_width(BIGNUM *bn) {
+  bn->width = bn_minimal_width(bn);
+  if (bn->width == 0) {
    bn->neg = 0;
  }
 }
@@ -87,11 +87,13 @@

 #include <gtest/gtest.h>

+#include <openssl/bio.h>
 #include <openssl/bn.h>
 #include <openssl/bytestring.h>
 #include <openssl/crypto.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
+#include <openssl/rand.h>

 #include "./internal.h"
 #include "../../internal.h"
@@ -106,34 +108,62 @@ static int HexToBIGNUM(bssl::UniquePtr<BIGNUM> *out, const char *in) {
  return ret;
 }

-static bssl::UniquePtr<BIGNUM> GetBIGNUM(FileTest *t, const char *attribute) {
-  std::string hex;
-  if (!t->GetAttribute(&hex, attribute)) {
-    return nullptr;
+// A BIGNUMFileTest wraps a FileTest to give |BIGNUM| values and also allows
+// injecting oversized |BIGNUM|s.
+class BIGNUMFileTest {
+ public:
+  BIGNUMFileTest(FileTest *t, unsigned large_mask)
+      : t_(t), large_mask_(large_mask), num_bignums_(0) {}
+
+  unsigned num_bignums() const { return num_bignums_; }
+
+  bssl::UniquePtr<BIGNUM> GetBIGNUM(const char *attribute) {
+    return GetBIGNUMImpl(attribute, true /* resize */);
  }

-  bssl::UniquePtr<BIGNUM> ret;
-  if (HexToBIGNUM(&ret, hex.c_str()) != static_cast<int>(hex.size())) {
-    t->PrintLine("Could not decode '%s'.", hex.c_str());
-    return nullptr;
-  }
-  return ret;
-}
+  bool GetInt(int *out, const char *attribute) {
+    bssl::UniquePtr<BIGNUM> ret =
+        GetBIGNUMImpl(attribute, false /* don't resize */);
+    if (!ret) {
+      return false;
+    }

-static bool GetInt(FileTest *t, int *out, const char *attribute) {
-  bssl::UniquePtr<BIGNUM> ret = GetBIGNUM(t, attribute);
-  if (!ret) {
-    return false;
+    BN_ULONG word = BN_get_word(ret.get());
+    if (word > INT_MAX) {
+      return false;
+    }
+
+    *out = static_cast<int>(word);
+    return true;
  }

-  BN_ULONG word = BN_get_word(ret.get());
-  if (word > INT_MAX) {
-    return false;
+ private:
+  bssl::UniquePtr<BIGNUM> GetBIGNUMImpl(const char *attribute, bool resize) {
+    std::string hex;
+    if (!t_->GetAttribute(&hex, attribute)) {
+      return nullptr;
+    }
+
+    bssl::UniquePtr<BIGNUM> ret;
+    if (HexToBIGNUM(&ret, hex.c_str()) != static_cast<int>(hex.size())) {
+      t_->PrintLine("Could not decode '%s'.", hex.c_str());
+      return nullptr;
+    }
+    if (resize) {
+      // Test with an oversized |BIGNUM| if necessary.
+      if ((large_mask_ & (1 << num_bignums_)) &&
+          !bn_resize_words(ret.get(), ret->width * 2 + 1)) {
+        return nullptr;
+      }
+      num_bignums_++;
+    }
+    return ret;
  }

-  *out = static_cast<int>(word);
-  return true;
-}
+  FileTest *t_;
+  unsigned large_mask_;
+  unsigned num_bignums_;
+};

 static testing::AssertionResult AssertBIGNUMSEqual(
    const char *operation_expr, const char *expected_expr,
@@ -159,10 +189,10 @@ static testing::AssertionResult AssertBIGNUMSEqual(
 #define EXPECT_BIGNUMS_EQUAL(op, a, b) \
  EXPECT_PRED_FORMAT3(AssertBIGNUMSEqual, op, a, b)

-static void TestSum(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> b = GetBIGNUM(t, "B");
-  bssl::UniquePtr<BIGNUM> sum = GetBIGNUM(t, "Sum");
+static void TestSum(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> b = t->GetBIGNUM("B");
+  bssl::UniquePtr<BIGNUM> sum = t->GetBIGNUM("Sum");
  ASSERT_TRUE(a);
  ASSERT_TRUE(b);
  ASSERT_TRUE(sum);
@@ -261,9 +291,9 @@ static void TestSum(FileTest *t, BN_CTX *ctx) {
  }
 }

-static void TestLShift1(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> lshift1 = GetBIGNUM(t, "LShift1");
+static void TestLShift1(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> lshift1 = t->GetBIGNUM("LShift1");
  bssl::UniquePtr<BIGNUM> zero(BN_new());
  ASSERT_TRUE(a);
  ASSERT_TRUE(lshift1);
@@ -307,13 +337,13 @@ static void TestLShift1(FileTest *t, BN_CTX *ctx) {
  EXPECT_BIGNUMS_EQUAL("(LShift | 1) >> 1", a.get(), ret.get());
 }

-static void TestLShift(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> lshift = GetBIGNUM(t, "LShift");
+static void TestLShift(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> lshift = t->GetBIGNUM("LShift");
  ASSERT_TRUE(a);
  ASSERT_TRUE(lshift);
  int n = 0;
-  ASSERT_TRUE(GetInt(t, &n, "N"));
+  ASSERT_TRUE(t->GetInt(&n, "N"));

  bssl::UniquePtr<BIGNUM> ret(BN_new());
  ASSERT_TRUE(ret);
@@ -324,13 +354,13 @@ static void TestLShift(FileTest *t, BN_CTX *ctx) {
  EXPECT_BIGNUMS_EQUAL("A >> N", a.get(), ret.get());
 }

-static void TestRShift(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> rshift = GetBIGNUM(t, "RShift");
+static void TestRShift(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> rshift = t->GetBIGNUM("RShift");
  ASSERT_TRUE(a);
  ASSERT_TRUE(rshift);
  int n = 0;
-  ASSERT_TRUE(GetInt(t, &n, "N"));
+  ASSERT_TRUE(t->GetInt(&n, "N"));

  bssl::UniquePtr<BIGNUM> ret(BN_new());
  ASSERT_TRUE(ret);
@@ -338,9 +368,9 @@ static void TestRShift(FileTest *t, BN_CTX *ctx) {
  EXPECT_BIGNUMS_EQUAL("A >> N", rshift.get(), ret.get());
 }

-static void TestSquare(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> square = GetBIGNUM(t, "Square");
+static void TestSquare(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> square = t->GetBIGNUM("Square");
  bssl::UniquePtr<BIGNUM> zero(BN_new());
  ASSERT_TRUE(a);
  ASSERT_TRUE(square);
@@ -386,15 +416,15 @@ static void TestSquare(FileTest *t, BN_CTX *ctx) {
  }

 #if !defined(BORINGSSL_SHARED_LIBRARY)
-  if (static_cast<size_t>(a->top) <= BN_SMALL_MAX_WORDS) {
-    for (size_t num_a = a->top; num_a <= BN_SMALL_MAX_WORDS; num_a++) {
+  int a_width = bn_minimal_width(a.get());
+  if (a_width <= BN_SMALL_MAX_WORDS) {
+    for (size_t num_a = a_width; num_a <= BN_SMALL_MAX_WORDS; num_a++) {
      SCOPED_TRACE(num_a);
      size_t num_r = 2 * num_a;
      // Use newly-allocated buffers so ASan will catch out-of-bounds writes.
      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[num_a]),
          r_words(new BN_ULONG[num_r]);
-      OPENSSL_memset(a_words.get(), 0, num_a * sizeof(BN_ULONG));
-      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
+      ASSERT_TRUE(bn_copy_words(a_words.get(), num_a, a.get()));

      ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
                               a_words.get(), num_a));
@@ -411,10 +441,10 @@ static void TestSquare(FileTest *t, BN_CTX *ctx) {
 #endif
 }

-static void TestProduct(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> b = GetBIGNUM(t, "B");
-  bssl::UniquePtr<BIGNUM> product = GetBIGNUM(t, "Product");
+static void TestProduct(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> b = t->GetBIGNUM("B");
+  bssl::UniquePtr<BIGNUM> product = t->GetBIGNUM("Product");
  bssl::UniquePtr<BIGNUM> zero(BN_new());
  ASSERT_TRUE(a);
  ASSERT_TRUE(b);
@@ -444,22 +474,25 @@ static void TestProduct(FileTest *t, BN_CTX *ctx) {
  }

 #if !defined(BORINGSSL_SHARED_LIBRARY)
-  if (!BN_is_negative(product.get()) &&
-      static_cast<size_t>(a->top) <= BN_SMALL_MAX_WORDS &&
-      static_cast<size_t>(b->top) <= BN_SMALL_MAX_WORDS) {
-    for (size_t num_a = a->top; num_a <= BN_SMALL_MAX_WORDS; num_a++) {
+  BN_set_negative(a.get(), 0);
+  BN_set_negative(b.get(), 0);
+  BN_set_negative(product.get(), 0);
+
+  int a_width = bn_minimal_width(a.get());
+  int b_width = bn_minimal_width(b.get());
+  if (a_width <= BN_SMALL_MAX_WORDS && b_width <= BN_SMALL_MAX_WORDS) {
+    for (size_t num_a = static_cast<size_t>(a_width);
+         num_a <= BN_SMALL_MAX_WORDS; num_a++) {
      SCOPED_TRACE(num_a);
-      for (size_t num_b = b->top; num_b <= BN_SMALL_MAX_WORDS; num_b++) {
+      for (size_t num_b = static_cast<size_t>(b_width);
+           num_b <= BN_SMALL_MAX_WORDS; num_b++) {
        SCOPED_TRACE(num_b);
        size_t num_r = num_a + num_b;
        // Use newly-allocated buffers so ASan will catch out-of-bounds writes.
        std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[num_a]),
            b_words(new BN_ULONG[num_b]), r_words(new BN_ULONG[num_r]);
-        OPENSSL_memset(a_words.get(), 0, num_a * sizeof(BN_ULONG));
-        OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
-
-        OPENSSL_memset(b_words.get(), 0, num_b * sizeof(BN_ULONG));
-        OPENSSL_memcpy(b_words.get(), b->d, b->top * sizeof(BN_ULONG));
+        ASSERT_TRUE(bn_copy_words(a_words.get(), num_a, a.get()));
+        ASSERT_TRUE(bn_copy_words(b_words.get(), num_b, b.get()));

        ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
                                 b_words.get(), num_b));
@@ -471,11 +504,11 @@ static void TestProduct(FileTest *t, BN_CTX *ctx) {
 #endif
 }

-static void TestQuotient(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> b = GetBIGNUM(t, "B");
-  bssl::UniquePtr<BIGNUM> quotient = GetBIGNUM(t, "Quotient");
-  bssl::UniquePtr<BIGNUM> remainder = GetBIGNUM(t, "Remainder");
+static void TestQuotient(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> b = t->GetBIGNUM("B");
+  bssl::UniquePtr<BIGNUM> quotient = t->GetBIGNUM("Quotient");
+  bssl::UniquePtr<BIGNUM> remainder = t->GetBIGNUM("Remainder");
  ASSERT_TRUE(a);
  ASSERT_TRUE(b);
  ASSERT_TRUE(quotient);
@@ -519,11 +552,11 @@ static void TestQuotient(FileTest *t, BN_CTX *ctx) {
  }
 }

-static void TestModMul(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> b = GetBIGNUM(t, "B");
-  bssl::UniquePtr<BIGNUM> m = GetBIGNUM(t, "M");
-  bssl::UniquePtr<BIGNUM> mod_mul = GetBIGNUM(t, "ModMul");
+static void TestModMul(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> b = t->GetBIGNUM("B");
+  bssl::UniquePtr<BIGNUM> m = t->GetBIGNUM("M");
+  bssl::UniquePtr<BIGNUM> mod_mul = t->GetBIGNUM("ModMul");
  ASSERT_TRUE(a);
  ASSERT_TRUE(b);
  ASSERT_TRUE(m);
@@ -536,12 +569,12 @@ static void TestModMul(FileTest *t, BN_CTX *ctx) {

  if (BN_is_odd(m.get())) {
    // Reduce |a| and |b| and test the Montgomery version.
-    bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+    bssl::UniquePtr<BN_MONT_CTX> mont(
+        BN_MONT_CTX_new_for_modulus(m.get(), ctx));
    bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
    ASSERT_TRUE(mont);
    ASSERT_TRUE(a_tmp);
    ASSERT_TRUE(b_tmp);
-    ASSERT_TRUE(BN_MONT_CTX_set(mont.get(), m.get(), ctx));
    ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
    ASSERT_TRUE(BN_nnmod(b.get(), b.get(), m.get(), ctx));
    ASSERT_TRUE(BN_to_montgomery(a_tmp.get(), a.get(), mont.get(), ctx));
@@ -553,24 +586,23 @@ static void TestModMul(FileTest *t, BN_CTX *ctx) {
                         ret.get());

 #if !defined(BORINGSSL_SHARED_LIBRARY)
-    if (m->top <= BN_SMALL_MAX_WORDS) {
-      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m->top]),
-          b_words(new BN_ULONG[m->top]), r_words(new BN_ULONG[m->top]);
-      OPENSSL_memset(a_words.get(), 0, m->top * sizeof(BN_ULONG));
-      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
-      OPENSSL_memset(b_words.get(), 0, m->top * sizeof(BN_ULONG));
-      OPENSSL_memcpy(b_words.get(), b->d, b->top * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m->top, a_words.get(),
-                                         m->top, mont.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(b_words.get(), m->top, b_words.get(),
-                                         m->top, mont.get()));
+    size_t m_width = static_cast<size_t>(bn_minimal_width(m.get()));
+    if (m_width <= BN_SMALL_MAX_WORDS) {
+      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m_width]),
+          b_words(new BN_ULONG[m_width]), r_words(new BN_ULONG[m_width]);
+      ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
+      ASSERT_TRUE(bn_copy_words(b_words.get(), m_width, b.get()));
+      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
+                                         m_width, mont.get()));
+      ASSERT_TRUE(bn_to_montgomery_small(b_words.get(), m_width, b_words.get(),
+                                         m_width, mont.get()));
      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m->top, a_words.get(), m->top, b_words.get(), m->top,
+          r_words.get(), m_width, a_words.get(), m_width, b_words.get(), m_width,
          mont.get()));
      // Use the second half of |tmp| so ASan will catch out-of-bounds writes.
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m->top, r_words.get(),
-                                           m->top, mont.get()));
-      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m->top));
+      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width, r_words.get(),
+                                           m_width, mont.get()));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
      EXPECT_BIGNUMS_EQUAL("A * B (mod M) (Montgomery, words)", mod_mul.get(),
                           ret.get());
    }
@@ -578,10 +610,10 @@ static void TestModMul(FileTest *t, BN_CTX *ctx) {
  }
 }

-static void TestModSquare(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> m = GetBIGNUM(t, "M");
-  bssl::UniquePtr<BIGNUM> mod_square = GetBIGNUM(t, "ModSquare");
+static void TestModSquare(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> m = t->GetBIGNUM("M");
+  bssl::UniquePtr<BIGNUM> mod_square = t->GetBIGNUM("ModSquare");
  ASSERT_TRUE(a);
  ASSERT_TRUE(m);
  ASSERT_TRUE(mod_square);
@@ -600,11 +632,11 @@ static void TestModSquare(FileTest *t, BN_CTX *ctx) {

  if (BN_is_odd(m.get())) {
    // Reduce |a| and test the Montgomery version.
-    bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+    bssl::UniquePtr<BN_MONT_CTX> mont(
+        BN_MONT_CTX_new_for_modulus(m.get(), ctx));
    bssl::UniquePtr<BIGNUM> a_tmp(BN_new());
    ASSERT_TRUE(mont);
    ASSERT_TRUE(a_tmp);
-    ASSERT_TRUE(BN_MONT_CTX_set(mont.get(), m.get(), ctx));
    ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
    ASSERT_TRUE(BN_to_montgomery(a_tmp.get(), a.get(), mont.get(), ctx));
    ASSERT_TRUE(BN_mod_mul_montgomery(ret.get(), a_tmp.get(), a_tmp.get(),
@@ -622,32 +654,32 @@ static void TestModSquare(FileTest *t, BN_CTX *ctx) {
                         ret.get());

 #if !defined(BORINGSSL_SHARED_LIBRARY)
-    if (m->top <= BN_SMALL_MAX_WORDS) {
-      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m->top]),
-          a_copy_words(new BN_ULONG[m->top]), r_words(new BN_ULONG[m->top]);
-      OPENSSL_memset(a_words.get(), 0, m->top * sizeof(BN_ULONG));
-      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m->top, a_words.get(),
-                                         m->top, mont.get()));
+    size_t m_width = static_cast<size_t>(bn_minimal_width(m.get()));
+    if (m_width <= BN_SMALL_MAX_WORDS) {
+      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m_width]),
+          a_copy_words(new BN_ULONG[m_width]), r_words(new BN_ULONG[m_width]);
+      ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
+      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
+                                         m_width, mont.get()));
      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m->top, a_words.get(), m->top, a_words.get(), m->top,
-          mont.get()));
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m->top, r_words.get(),
-                                           m->top, mont.get()));
-      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m->top));
+          r_words.get(), m_width, a_words.get(), m_width, a_words.get(),
+          m_width, mont.get()));
+      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
+                                           r_words.get(), m_width, mont.get()));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
      EXPECT_BIGNUMS_EQUAL("A * A (mod M) (Montgomery, words)",
                           mod_square.get(), ret.get());

      // Repeat the operation with |a_copy_words|.
      OPENSSL_memcpy(a_copy_words.get(), a_words.get(),
-                     m->top * sizeof(BN_ULONG));
+                     m_width * sizeof(BN_ULONG));
      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m->top, a_words.get(), m->top, a_copy_words.get(),
-          m->top, mont.get()));
+          r_words.get(), m_width, a_words.get(), m_width, a_copy_words.get(),
+          m_width, mont.get()));
      // Use the second half of |tmp| so ASan will catch out-of-bounds writes.
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m->top, r_words.get(),
-                                           m->top, mont.get()));
-      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m->top));
+      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
+                                           r_words.get(), m_width, mont.get()));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
      EXPECT_BIGNUMS_EQUAL("A * A_copy (mod M) (Montgomery, words)",
                           mod_square.get(), ret.get());
    }
@@ -655,11 +687,11 @@ static void TestModSquare(FileTest *t, BN_CTX *ctx) {
  }
 }

-static void TestModExp(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> e = GetBIGNUM(t, "E");
-  bssl::UniquePtr<BIGNUM> m = GetBIGNUM(t, "M");
-  bssl::UniquePtr<BIGNUM> mod_exp = GetBIGNUM(t, "ModExp");
+static void TestModExp(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> e = t->GetBIGNUM("E");
+  bssl::UniquePtr<BIGNUM> m = t->GetBIGNUM("M");
+  bssl::UniquePtr<BIGNUM> mod_exp = t->GetBIGNUM("ModExp");
  ASSERT_TRUE(a);
  ASSERT_TRUE(e);
  ASSERT_TRUE(m);
@@ -682,22 +714,22 @@ static void TestModExp(FileTest *t, BN_CTX *ctx) {
                         ret.get());

 #if !defined(BORINGSSL_SHARED_LIBRARY)
-    if (m->top <= BN_SMALL_MAX_WORDS) {
-      bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+    size_t m_width = static_cast<size_t>(bn_minimal_width(m.get()));
+    if (m_width <= BN_SMALL_MAX_WORDS) {
+      bssl::UniquePtr<BN_MONT_CTX> mont(
+          BN_MONT_CTX_new_for_modulus(m.get(), ctx));
      ASSERT_TRUE(mont.get());
-      ASSERT_TRUE(BN_MONT_CTX_set(mont.get(), m.get(), ctx));
      ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
-      std::unique_ptr<BN_ULONG[]> r_words(new BN_ULONG[m->top]),
-          a_words(new BN_ULONG[m->top]);
-      OPENSSL_memset(a_words.get(), 0, m->top * sizeof(BN_ULONG));
-      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m->top, a_words.get(),
-                                         m->top, mont.get()));
-      ASSERT_TRUE(bn_mod_exp_mont_small(r_words.get(), m->top, a_words.get(),
-                                        m->top, e->d, e->top, mont.get()));
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m->top, r_words.get(),
-                                           m->top, mont.get()));
-      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m->top));
+      std::unique_ptr<BN_ULONG[]> r_words(new BN_ULONG[m_width]),
+          a_words(new BN_ULONG[m_width]);
+      ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
+      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
+                                         m_width, mont.get()));
+      ASSERT_TRUE(bn_mod_exp_mont_small(r_words.get(), m_width, a_words.get(),
+                                        m_width, e->d, e->width, mont.get()));
+      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
+                                           r_words.get(), m_width, mont.get()));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
      EXPECT_BIGNUMS_EQUAL("A ^ E (mod M) (Montgomery, words)", mod_exp.get(),
                           ret.get());
    }
@@ -705,10 +737,10 @@ static void TestModExp(FileTest *t, BN_CTX *ctx) {
  }
 }

-static void TestExp(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> e = GetBIGNUM(t, "E");
-  bssl::UniquePtr<BIGNUM> exp = GetBIGNUM(t, "Exp");
+static void TestExp(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> e = t->GetBIGNUM("E");
+  bssl::UniquePtr<BIGNUM> exp = t->GetBIGNUM("Exp");
  ASSERT_TRUE(a);
  ASSERT_TRUE(e);
  ASSERT_TRUE(exp);
@@ -719,10 +751,10 @@ static void TestExp(FileTest *t, BN_CTX *ctx) {
  EXPECT_BIGNUMS_EQUAL("A ^ E", exp.get(), ret.get());
 }

-static void TestModSqrt(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> p = GetBIGNUM(t, "P");
-  bssl::UniquePtr<BIGNUM> mod_sqrt = GetBIGNUM(t, "ModSqrt");
+static void TestModSqrt(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> p = t->GetBIGNUM("P");
+  bssl::UniquePtr<BIGNUM> mod_sqrt = t->GetBIGNUM("ModSqrt");
  bssl::UniquePtr<BIGNUM> mod_sqrt2(BN_new());
  ASSERT_TRUE(a);
  ASSERT_TRUE(p);
@@ -744,9 +776,9 @@ static void TestModSqrt(FileTest *t, BN_CTX *ctx) {
  }
 }

-static void TestNotModSquare(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> not_mod_square = GetBIGNUM(t, "NotModSquare");
-  bssl::UniquePtr<BIGNUM> p = GetBIGNUM(t, "P");
+static void TestNotModSquare(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> not_mod_square = t->GetBIGNUM("NotModSquare");
+  bssl::UniquePtr<BIGNUM> p = t->GetBIGNUM("P");
  bssl::UniquePtr<BIGNUM> ret(BN_new());
  ASSERT_TRUE(not_mod_square);
  ASSERT_TRUE(p);
@@ -761,10 +793,10 @@ static void TestNotModSquare(FileTest *t, BN_CTX *ctx) {
  ERR_clear_error();
 }

-static void TestModInv(FileTest *t, BN_CTX *ctx) {
-  bssl::UniquePtr<BIGNUM> a = GetBIGNUM(t, "A");
-  bssl::UniquePtr<BIGNUM> m = GetBIGNUM(t, "M");
-  bssl::UniquePtr<BIGNUM> mod_inv = GetBIGNUM(t, "ModInv");
+static void TestModInv(BIGNUMFileTest *t, BN_CTX *ctx) {
+  bssl::UniquePtr<BIGNUM> a = t->GetBIGNUM("A");
+  bssl::UniquePtr<BIGNUM> m = t->GetBIGNUM("M");
+  bssl::UniquePtr<BIGNUM> mod_inv = t->GetBIGNUM("ModInv");
  ASSERT_TRUE(a);
  ASSERT_TRUE(m);
  ASSERT_TRUE(mod_inv);
@@ -791,7 +823,7 @@ class BNTest : public testing::Test {
 TEST_F(BNTest, TestVectors) {
  static const struct {
    const char *name;
-    void (*func)(FileTest *t, BN_CTX *ctx);
+    void (*func)(BIGNUMFileTest *t, BN_CTX *ctx);
  } kTests[] = {
      {"Sum", TestSum},
      {"LShift1", TestLShift1},
@@ -810,13 +842,34 @@ TEST_F(BNTest, TestVectors) {
  };

  FileTestGTest("crypto/fipsmodule/bn/bn_tests.txt", [&](FileTest *t) {
+    void (*func)(BIGNUMFileTest *t, BN_CTX *ctx) = nullptr;
    for (const auto &test : kTests) {
      if (t->GetType() == test.name) {
-        test.func(t, ctx());
-        return;
+        func = test.func;
+        break;
      }
    }
-    FAIL() << "Unknown test type: " << t->GetType();
+    if (!func) {
+      FAIL() << "Unknown test type: " << t->GetType();
+      return;
+    }
+
+    // Run the test with normalize-sized |BIGNUM|s.
+    BIGNUMFileTest bn_test(t, 0);
+    BN_CTX_start(ctx());
+    func(&bn_test, ctx());
+    BN_CTX_end(ctx());
+    unsigned num_bignums = bn_test.num_bignums();
+
+    // Repeat the test with all combinations of large and small |BIGNUM|s.
+    for (unsigned large_mask = 1; large_mask < (1u << num_bignums);
+         large_mask++) {
+      SCOPED_TRACE(large_mask);
+      BIGNUMFileTest bn_test2(t, large_mask);
+      BN_CTX_start(ctx());
+      func(&bn_test2, ctx());
+      BN_CTX_end(ctx());
+    }
  });
 }

@@ -861,6 +914,15 @@ TEST_F(BNTest, BN2BinPadded) {
    EXPECT_EQ(Bytes(zeros, sizeof(out) - bytes),
              Bytes(out, sizeof(out) - bytes));
    EXPECT_EQ(Bytes(reference, bytes), Bytes(out + sizeof(out) - bytes, bytes));
+
+    // Repeat some tests with a non-minimal |BIGNUM|.
+    EXPECT_TRUE(bn_resize_words(n.get(), 32));
+
+    EXPECT_FALSE(BN_bn2bin_padded(out, bytes - 1, n.get()));
+
+    ASSERT_TRUE(BN_bn2bin_padded(out, bytes + 1, n.get()));
+    EXPECT_EQ(0u, out[0]);
+    EXPECT_EQ(Bytes(reference, bytes), Bytes(out + 1, bytes));
  }
 }

@@ -1266,11 +1328,9 @@ TEST_F(BNTest, BadModulus) {
  bssl::UniquePtr<BIGNUM> a(BN_new());
  bssl::UniquePtr<BIGNUM> b(BN_new());
  bssl::UniquePtr<BIGNUM> zero(BN_new());
-  bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
  ASSERT_TRUE(a);
  ASSERT_TRUE(b);
  ASSERT_TRUE(zero);
-  ASSERT_TRUE(mont);

  BN_zero(zero.get());

@@ -1293,13 +1353,16 @@ TEST_F(BNTest, BadModulus) {
      a.get(), BN_value_one(), BN_value_one(), zero.get(), ctx(), nullptr));
  ERR_clear_error();

-  EXPECT_FALSE(BN_MONT_CTX_set(mont.get(), zero.get(), ctx()));
+  bssl::UniquePtr<BN_MONT_CTX> mont(
+      BN_MONT_CTX_new_for_modulus(zero.get(), ctx()));
+  EXPECT_FALSE(mont);
  ERR_clear_error();

  // Some operations also may not be used with an even modulus.
  ASSERT_TRUE(BN_set_word(b.get(), 16));

-  EXPECT_FALSE(BN_MONT_CTX_set(mont.get(), b.get(), ctx()));
+  mont.reset(BN_MONT_CTX_new_for_modulus(b.get(), ctx()));
+  EXPECT_FALSE(mont);
  ERR_clear_error();

  EXPECT_FALSE(BN_mod_exp_mont(a.get(), BN_value_one(), BN_value_one(), b.get(),
@@ -1758,6 +1821,41 @@ TEST_F(BNTest, PrimeChecking) {
  EXPECT_EQ(0, is_probably_prime_2);
 }

+TEST_F(BNTest, NumBitsWord) {  // Checks |BN_num_bits_word| on fixed bit patterns and on random words.
+  constexpr BN_ULONG kOne = 1;  // Typed as |BN_ULONG| so the shifts below happen at word width.
+
+  // 2^(N-1) takes N bits. Single-bit words, for N from 1 up to BN_BITS2 - 1.
+  for (unsigned i = 1; i < BN_BITS2; i++) {
+    EXPECT_EQ(i, BN_num_bits_word(kOne << (i - 1))) << i;
+  }
+
+  // 2^N - 1 takes N bits. All-ones masks; N = 0 checks the zero word.
+  for (unsigned i = 0; i < BN_BITS2; i++) {
+    EXPECT_EQ(i, BN_num_bits_word((kOne << i) - 1)) << i;
+  }
+
+  for (unsigned i = 1; i < 100; i++) {  // 99 randomized rounds.
+    // Generate a random value of a random length. |buf[0]| picks the length;
+    uint8_t buf[1 + sizeof(BN_ULONG)];  // the remaining bytes supply the word.
+    RAND_bytes(buf, sizeof(buf));
+
+    BN_ULONG w;
+    memcpy(&w, &buf[1], sizeof(w));
+
+    const unsigned num_bits = buf[0] % (BN_BITS2 + 1);  // In [0, BN_BITS2].
+    if (num_bits == BN_BITS2) {
+      w |= kOne << (BN_BITS2 - 1);  // Full width: only the top bit needs forcing.
+    } else if (num_bits == 0) {
+      w = 0;  // The expected length is zero, so the word must be zero.
+    } else {
+      w &= (kOne << num_bits) - 1;  // Clear every bit at index |num_bits| or above.
+      w |= kOne << (num_bits - 1);  // Force the top bit so the length is exact.
+    }
+
+    EXPECT_EQ(num_bits, BN_num_bits_word(w)) << w;  // Streams |w| on failure.
+  }
+}
+
 #if !defined(BORINGSSL_SHARED_LIBRARY)
 TEST_F(BNTest, LessThanWords) {
  // kTestVectors is an array of 256-bit values in sorted order.
@@ -1848,3 +1946,162 @@ TEST_F(BNTest, LessThanWords) {
  EXPECT_EQ(0, bn_in_range_words(NULL, 0, NULL, 0));
 }
 #endif  // !BORINGSSL_SHARED_LIBRARY
+
+TEST_F(BNTest, NonMinimal) {  // Checks BN functions on values widened with |bn_resize_words|.
+  bssl::UniquePtr<BIGNUM> ten(BN_new());
+  ASSERT_TRUE(ten);
+  ASSERT_TRUE(BN_set_word(ten.get(), 10));
+
+  bssl::UniquePtr<BIGNUM> ten_copy(BN_dup(ten.get()));  // Never resized below; comparison baseline.
+  ASSERT_TRUE(ten_copy);
+
+  bssl::UniquePtr<BIGNUM> eight(BN_new());  // Smaller than |ten|.
+  ASSERT_TRUE(eight);
+  ASSERT_TRUE(BN_set_word(eight.get(), 8));
+
+  bssl::UniquePtr<BIGNUM> forty_two(BN_new());  // Larger than |ten|.
+  ASSERT_TRUE(forty_two);
+  ASSERT_TRUE(BN_set_word(forty_two.get(), 42));
+
+  bssl::UniquePtr<BIGNUM> two_exp_256(BN_new());  // 1 << 256, far larger than |ten|.
+  ASSERT_TRUE(two_exp_256);
+  ASSERT_TRUE(BN_lshift(two_exp_256.get(), BN_value_one(), 256));
+
+  bssl::UniquePtr<BIGNUM> zero(BN_new());
+  ASSERT_TRUE(zero);
+  BN_zero(zero.get());
+
+  for (size_t width = 1; width < 10; width++) {  // Widths of 1 through 9 words.
+    SCOPED_TRACE(width);
+    // Make |ten| and |zero| wider, and confirm |width| reflects the request.
+    EXPECT_TRUE(bn_resize_words(ten.get(), width));
+    EXPECT_EQ(static_cast<int>(width), ten->width);
+    EXPECT_TRUE(bn_resize_words(zero.get(), width));
+    EXPECT_EQ(static_cast<int>(width), zero->width);
+
+    EXPECT_TRUE(BN_abs_is_word(ten.get(), 10));  // Word accessors still see 10.
+    EXPECT_TRUE(BN_is_word(ten.get(), 10));
+    EXPECT_EQ(10u, BN_get_word(ten.get()));
+    uint64_t v;
+    ASSERT_TRUE(BN_get_u64(ten.get(), &v));
+    EXPECT_EQ(10u, v);
+
+    EXPECT_TRUE(BN_equal_consttime(ten.get(), ten_copy.get()));  // Equality, both argument orders.
+    EXPECT_TRUE(BN_equal_consttime(ten_copy.get(), ten.get()));
+    EXPECT_EQ(BN_cmp(ten.get(), ten_copy.get()), 0);
+    EXPECT_EQ(BN_cmp(ten_copy.get(), ten.get()), 0);
+
+    EXPECT_FALSE(BN_equal_consttime(ten.get(), eight.get()));  // Ordering against a smaller value.
+    EXPECT_LT(BN_cmp(eight.get(), ten.get()), 0);
+    EXPECT_GT(BN_cmp(ten.get(), eight.get()), 0);
+
+    EXPECT_FALSE(BN_equal_consttime(ten.get(), forty_two.get()));  // Ordering against a larger value.
+    EXPECT_GT(BN_cmp(forty_two.get(), ten.get()), 0);
+    EXPECT_LT(BN_cmp(ten.get(), forty_two.get()), 0);
+
+    EXPECT_FALSE(BN_equal_consttime(ten.get(), two_exp_256.get()));  // Ordering against 1 << 256.
+    EXPECT_GT(BN_cmp(two_exp_256.get(), ten.get()), 0);
+    EXPECT_LT(BN_cmp(ten.get(), two_exp_256.get()), 0);
+
+    EXPECT_EQ(4u, BN_num_bits(ten.get()));  // 10 = 0b1010.
+    EXPECT_EQ(1u, BN_num_bytes(ten.get()));
+    EXPECT_FALSE(BN_is_pow2(ten.get()));
+
+    bssl::UniquePtr<char> hex(BN_bn2hex(ten.get()));  // Hex output must not grow with the width.
+    EXPECT_STREQ("0a", hex.get());
+    hex.reset(BN_bn2hex(zero.get()));
+    EXPECT_STREQ("0", hex.get());
+
+    bssl::UniquePtr<BIO> bio(BIO_new(BIO_s_mem()));
+    ASSERT_TRUE(bio);
+    ASSERT_TRUE(BN_print(bio.get(), ten.get()));
+    const uint8_t *ptr;
+    size_t len;
+    ASSERT_TRUE(BIO_mem_contents(bio.get(), &ptr, &len));
+    // TODO(davidben): |BN_print| removes leading zeros within a byte, while
+    // |BN_bn2hex| rounds up to a byte, except for zero which it prints as
+    // "0". Fix this discrepancy?
+    EXPECT_EQ(Bytes("a"), Bytes(ptr, len));
+
+    bio.reset(BIO_new(BIO_s_mem()));  // New memory BIO for the |zero| output.
+    ASSERT_TRUE(bio);
+    ASSERT_TRUE(BN_print(bio.get(), zero.get()));
+    ASSERT_TRUE(BIO_mem_contents(bio.get(), &ptr, &len));
+    EXPECT_EQ(Bytes("0"), Bytes(ptr, len));
+  }
+
+  // |ten| may be resized back down to one word.
+  EXPECT_TRUE(bn_resize_words(ten.get(), 1));
+  EXPECT_EQ(1, ten->width);
+
+  // But not to zero words, which it does not fit.
+  EXPECT_FALSE(bn_resize_words(ten.get(), 0));
+
+  EXPECT_TRUE(BN_is_pow2(eight.get()));  // Power-of-two check before widening...
+  EXPECT_TRUE(bn_resize_words(eight.get(), 4));
+  EXPECT_EQ(4, eight->width);
+  EXPECT_TRUE(BN_is_pow2(eight.get()));  // ...and again at a width of four words.
+
+  // |BN_MONT_CTX| is always stored minimally and uses the same R independent of
+  // input width. |kP| is a 32-byte odd modulus (the NIST P-256 field prime).
+  static const uint8_t kP[] = {
+      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  };
+  bssl::UniquePtr<BIGNUM> p(BN_bin2bn(kP, sizeof(kP), nullptr));
+  ASSERT_TRUE(p);
+
+  bssl::UniquePtr<BN_MONT_CTX> mont(  // Built from |p| as parsed.
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+  ASSERT_TRUE(mont);
+
+  ASSERT_TRUE(bn_resize_words(p.get(), 32));  // Widen |p| to 32 words before building |mont2|.
+  bssl::UniquePtr<BN_MONT_CTX> mont2(
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+  ASSERT_TRUE(mont2);
+
+  EXPECT_EQ(mont->N.width, mont2->N.width);  // Stored modulus width must match.
+  EXPECT_EQ(0, BN_cmp(&mont->RR, &mont2->RR));  // |RR| must be identical for both contexts.
+}
+
+TEST_F(BNTest, CountLowZeroBits) {  // Checks |BN_count_low_zero_bits| before and after widening.
+  bssl::UniquePtr<BIGNUM> ten(BN_new());
+  ASSERT_TRUE(ten);
+  ASSERT_TRUE(BN_set_word(ten.get(), 10));
+
+  bssl::UniquePtr<BIGNUM> eight(BN_new());
+  ASSERT_TRUE(eight);
+  ASSERT_TRUE(BN_set_word(eight.get(), 8));
+
+  bssl::UniquePtr<BIGNUM> two_exp_256(BN_new());  // 1 << 256.
+  ASSERT_TRUE(two_exp_256);
+  ASSERT_TRUE(BN_lshift(two_exp_256.get(), BN_value_one(), 256));
+
+  bssl::UniquePtr<BIGNUM> two_exp_256_plus_4(BN_new());  // (1 << 256) + 4.
+  ASSERT_TRUE(two_exp_256_plus_4);
+  ASSERT_TRUE(BN_lshift(two_exp_256_plus_4.get(), BN_value_one(), 256));
+  ASSERT_TRUE(BN_add_word(two_exp_256_plus_4.get(), 4));
+
+  bssl::UniquePtr<BIGNUM> zero(BN_new());
+  ASSERT_TRUE(zero);
+  BN_zero(zero.get());
+
+  EXPECT_EQ(1, BN_count_low_zero_bits(ten.get()));  // 10 = 0b1010.
+  EXPECT_EQ(3, BN_count_low_zero_bits(eight.get()));  // 8 = 0b1000.
+  EXPECT_EQ(256, BN_count_low_zero_bits(two_exp_256.get()));  // Only bit 256 is set.
+  EXPECT_EQ(2, BN_count_low_zero_bits(two_exp_256_plus_4.get()));  // The added 4 sets bit 2.
+  EXPECT_EQ(0, BN_count_low_zero_bits(zero.get()));  // Zero is expected to report 0.
+
+  ASSERT_TRUE(bn_resize_words(ten.get(), 16));  // Resize every value to 16 words.
+  ASSERT_TRUE(bn_resize_words(eight.get(), 16));
+  ASSERT_TRUE(bn_resize_words(two_exp_256.get(), 16));
+  ASSERT_TRUE(bn_resize_words(two_exp_256_plus_4.get(), 16));
+  ASSERT_TRUE(bn_resize_words(zero.get(), 16));
+
+  EXPECT_EQ(1, BN_count_low_zero_bits(ten.get()));  // Same expectations as above after resizing.
+  EXPECT_EQ(3, BN_count_low_zero_bits(eight.get()));
+  EXPECT_EQ(256, BN_count_low_zero_bits(two_exp_256.get()));
+  EXPECT_EQ(2, BN_count_low_zero_bits(two_exp_256_plus_4.get()));
+  EXPECT_EQ(0, BN_count_low_zero_bits(zero.get()));
+}
@@ -5976,6 +5976,70 @@ Product = 542fb814f45924aa09a16f2a6
 A = 542fb814f45924aa09a16f2a6
 B = 1

+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a817a4d8899f571f257fa81c36a868d80e7fa2bcbda68a72ca3e31db8892b94d073e006433dd7128b7bf677d2b411532e5662cdff66d657673d58e03d4a338bae1a5513296f91d4d2b5b680527a2e12318e422ec2b7f05ea4fd3ef4780576488211dad5733685a8f0e5d2ecda549a15eebb235495e70d26b194c994cf16d98d356218d08a34d1593d90bc0d3572df0e84bdb1705c6c5e64ea4895599bb21bf219abdd4329813ecc198e708cee199c22f749bdeb0c206690e8420883f6c0661e47b29969986a7a72996ef63234c31aa39b7be37995d2898063ef5c3b672c43afbc1a065dec2671ae87e17639cfcd3148145a8323e1e9dc4f9c9daf981dd6aba4e8be01344c2eda185b87
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9ffec082c58ba611f7
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954a6ea39224eed9ef1
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a817a4d8899aa73af54a4e1825aa6714016da99d9e3d0c02eb139716db437705cd9efabf0123b0831689735f4e488f226e577d4688d30914dd50ed368939452af0a7a094c065c6718bd54f53a808585fc1728c3bd1e7c968d76c6dca32f95a8323bacad31cdd4aae544d4208262c40bcf726c2f26cf1e60341c3e1e0c8ed4542555b9bf00488680b737a245cc9b7817231f1f6f1e614cdf43ea281fb850ebbb9305b1aa441a45dfdaa1e98b9d79d9ca511be070bfa94d8cd3cc750607c93e1b451a14e32356bd48d77860b37fd2e714827e770a5648ce8579a00ba5cae034502a8b03ba754994d9e002130cfdee6bfdf078dc8f6767b927c964197664c8e32bd3d31bd461ce
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9ffec082c58ba611f7
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954a6ea3922
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a80dbb5a46feff82a92989bca577998c68ee619d9ea9972c6f139e97f5bdde635152830bedf302873508d2ed73badb82f9e32e1f4d12ea8c8b1059aa6d15f8e17d649bf41467903ab40d220d50570b5a263f637c0fcebc0ca29f8a81e2a01bf39bcb60cb9229dfd40618f706b941836bc5c291dec45ee9193e74d3a4cc5f73054ca56fd774a359f17a687268587393b76204a37cd48dcb09d3daed57a7e6d7d93a0ca3d6de8557fc4ddbfe9cb163fd10b7fe5f270dc57aa2fb88cdca2a3795015a17fd352d85fb688a38fa54883d0cab67aab08dbabd58d307c601f0f810014d78b101ff0bddb6d550b2480782406a905b9201e70ef6c1cb9765e91c10c8f5d240c
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9ffec082c58ba611f7
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef96c826c5268b0a6788e14a9e3812764dd3ebb7489e6e66058ca6ccf9c007f8c049eda369b2889cc411bca78d4f5b0e3a9e80243e87e112072b01922b595afdef4dd562e58ce917f11e69c8fe050de54fdb2d607d05f09afd6dd140e9d195b91d85269610a1e5d5036e8c9fea2d4fa693d80ecdc819b201c0aed27dfe0b92b4b3b9ecabb3b9548f0d27dc917ffb14308c4f970863e163f375852fcd9fb115640dc40534f8f51a7b903599117dca6c80924fa9a1aeb43cf5a9a3f67ae818b484feed51d7ef60b3656720891b13a983c02c281c8a0954f13b7bfaca844d2cb66de5c11ff507e39cf774c7c93b38e296a44f04e5ecf2819b57943fb0509774ddbcfeb
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9ffec082c58ba611f7
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a817a4d8899c7bedb01951b0f4fdb2c0fb64ad74707fda20027f4cee25da9b59be288d404cbd348f27600b87015d28f03cdf411f0e8c22deb9de5b3e0094f7820d78d59c90017cbd426297f8a32fb4b55b09362cf7cfb5910085acb24dbf618752b8b74c7e87f9cac44cb3b7486c43aa9b19a64d40a74eaf1de8b5f168b43d5750236aef753278c11294efd1adaddb6addb846f45fa55d7391898e8ec1c82bcf0008d9850c4c096571e8872e975dc8af1ba01bfbe8c8c27dc30cdaddd198936e4496579741a3a20e1b8e17241fe4abe5e98794e469180b742b2e1904940381f703f512885bda0340fe74e997ab269be00a3ca29bb937db2e06d8054e26dc13a5014ba51b175
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9ffec082c5
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954a6ea39224eed9ef1
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a817a4d88997cc097fe3f7ace3ffb0fcee52b45551165bb02354b229788b59128489879b1a0373e9862a17692464a2dfc5d09185a0f1c67d2359ba70b52b03f21c7b24feb96e25e1a2dc7f4723952bf203979f7c9e38790f881e2b35006157825555d4c867fce9ea0a3cc6f1c94ee308a68e33f64f286247465ffe854033e9c64f5d79d6d66dcb38ad03535b20376bf4c3cf26e07ef445192ba2baf08bb5286695a61ff6b5dc7aa1832017198d61a324b8c244572157323c7bb3a2fee226133e1b0e0f2ff067cf71fc24bf38d0e172f459b0cdf0707c5bc586390faacf428bfdeb04e850ee0c35f6807eb6ca8d3a473dcc2239541115a8b0d33ea33295ff8c13b2a
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9ffec082c5
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954a6ea3922
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a80dbb5a46fc245133c3335163cce37555d36c555182e6d9a754b9aa9305c070083d0fe806d2c5eda4a976f749d6ef40515c425e6531a7f4d11926e49907b7a8a938205e0d6fefaacb145200cbe3deec686476bcdc1f6bb3535147ecb00818f2cd666ac0dd497f0fbc087bf05c6425b7752a02e2a695655d4310f04943a6178946a74dbe4688bd1eb3f1a166aef37e39f3e1d36b6d6d422ec0db264cae8d44869f57a92952bd74a026dd7cfc672803905f029c723487d4123a7520688fc9c68b2384be32e881f64d0ed7ae555bf00e5799740dd8c6accc40f3fe573f194f4848bb05aea8a5509f2dd10fce023093f1ef20267244a990d7ffd462f4e85a4
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9ffec082c5
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef96c826c5268b0a6788e14a9e353744d86d954c06f3b84ef271b184ac9957a5f88b08b606fa6aa97afc4983a62f1e74aa3f242e14a3f4cf5ea415d1437818663556a29d117ea7df1cf1ee32f70d6d5566e25d53f892c42d3f92e481b622455fce36e400de09e2d435099695354ceee249c793b76b3c544d70164381e0420ef8b85609502afff9130729ba7851e0775dc5d8c606ba614e7607625fbc38908c88fac43e29ff9b8728f5809e63f20289246b5128016478437550a833c60edb0df43dd9a47654f2e4ef308d4a18cea57ea4b0c6d08add07f2e7adc427cf591c29dbd1f975432922e3f2b71c75e4d2557efccf626be7a0d522b658d420ae321
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9ffec082c5
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a8126efa5e7be8e75d54e5ba9405f671d624eaf8d7a115d0479f6fb773b940525fd46b69bc43c815b6bb1798813ca95790bc68032f0b9e73fc964a9922507d8aac25f859745939b828ef5ed326b226b555e5088f13531be16272a89ad41ae82c940935b5d8fe75dc520a230cc279a887bce01bae0a79356f044af13c6f4a5e53c00b2d03cfcbb0f93b26202441a207ec91576410ac1750e257906d945bfe9204b73fc417600bd191edcf2e3eb79acbf4f84dda372405b5e98397abe85c1593543cd7a5b17cb90e299f422f0ce107d86b56474e435dbbcbb5314fb579cd68d54777aa2d0ff9b6b96de62b4676edea5b09589698ed829cad22a52aaec732b79edf6af
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9f
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954a6ea39224eed9ef1
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a8126efa5e7739032d1f8bb68307f4adc912f1d9b83797606874d4f2c669fe0b263565c4898a07701585237aa444234719adb869c17142126611a9cbd6e689fabb2847bb9dc5e2dc89694621a7179df1fe7371deb9bbdf5fea0b271d86bcde2796a65331c27365fb97fa3647435c47e5c854a95718fa49072cc239d046ca0ac2bf453beb31070370d59483adb42b9876776e43fccb663887f1a999f625eb8e9c4cdd0a89099c42cdff06be29ad9ea66a957002925c9425a83c3e74096ca31324134f5d4a2b7d3b8d7fd8d72192049f79c670874f65201c068c5aac2008a7df4e5eba02d88be8ec23683513a9cffe06671a7c2fa5da7a7aa571914caba1e
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9f
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954a6ea3922
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef989b0c6121e293a808857c1bdb914ae0fec75b02d527263093a9d9b8a42289ec74dc73e0e46568a9e8ee117659597434048308c9b66fa7a539694285b1238a13d1163fbac33db147e5431af1c7aca5b1a118db4f6650ec6340491ef7a2d203b53e43d536639f980eb6e92a37bffb2149c5eb45d6718a9496f0784370674c1d29732b944a3c3885b68f0fd2a121f556dc82d1b942e7aabba780f087b9df359d86e2055248c3aabc568e93bba67d3ccca2c4240c876506d63bb05aad6fc4c77dfafff1731a46c6711bc60c4d23976268928bc63e1d133add0633c737bb508c81fa1ff3b452b49b992ebac930432d555ab8c62ae17357b1186e80689672f5a9f472c
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9f
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef96c826c5268b0a6783ab6c7314a43e85a92955a5fbfbffcd31ef0913ba93563dab2b7f54d90fa21ca827ad15b5b1fb399a303f94837536b2813cb563f793fb780e91f8333a2de7bb9f10efdb652a504d6f242e7c15362d3a6eb6e3d1a5abb03023dfe964656979765a14fe8fc36af3d785030ce549b92a91dcb8e2aa13f5b89eb8449b31961a0f77117c8cac79af95ee69f6594e557af7bb017cd885027ff7c0cb1d2f99d1ed5eacb788f645c25150e737cf1184b546bb2d55f2014a18015ffe647580df6fe4d528ce983309baeac0347ae8739e2b1f6d1a83e12e4dbfea1cd81b11b8628837432ad1906c70323529b718c8c6e398e1dfa73
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc95262b94a0aca2f9f
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef96ebae79ce1360c374bc58f225bca564b7e6561b56e0edbb3a7f5934f382b916ab38423221d656357ce0e9bf1e9b04c0678b9c555e8365a0f977c95bd8dca1fb2ad2268193531ca36cbe7f40da8e1afe097e451dc2931b323ce731c03cc027a92ed8ae105c5e9c1bd385e238d989fadbf3aa54c097a8666df8a66b7e2d016e65a2a632603f2c84290ccd7346ada28dff79dd06c7f7989689aca4f494b977f984650f91327ab9936cb92675932440f135e54e4abeecf255d7061482b4c8d91769e02fc94b8acc43325d69541903c3ef7a7a8a5bd19bf886506d42bcf0efcb6197a8d178d6a60516a5aa771ae238a342dc61df8c18c6ba1ed952d4e0c3409c14639
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc9
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954a6ea39224eed9ef1
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef96ebae79ce1360c374bc58f2210cc134828c520a58df29ae28863a158a044937809d7d84d2940efbdddb448c64da5f1f31977e7865fd5529eac82fee3e804064a6315936295f8cb26f0de16a47373f5e8365939e280a57dacb508166a583a630c75730c2fe54971e70a35e224e7a1a21e3bd8f417a47c4796d34148cae15068e19eec637bed8f32846dc5aa7e8f50599e840903a8129206fc384e0b4085f9f1e7e3bf2fc67b62b02566ce73cb4b22d471cde35b4f0cccb74283cdded5748d62286f7ea5c184c1308d520ecc7c7f1535b1132708298bf94c0967bc8f8541bb2f2b3c81f11e50f1d8cba4ce3746ad5f85e6bacbefada657c9b386b991b2
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc9
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954a6ea3922
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef96ebae79ce1360c36ad2daaf856508e861c7f68a2611a215a93e3a15f68f72bb80a4fe9f4cfb6c7f91639179342c633db0f70c9dd849b5b5767908b27e61b812659dcd1a0613433f2c0940be49010886bb384d4676bd523f9827c1a48c7649fbfa73e872a5160796813956979b0f3fd3af728dd48f8a7348090300e41b181c8acae08a3b3106b61f90b0421803e6eba0d68e9bc93d3b659fd6316ba2815cb4b3b6a74f1f3fd24b0c07f619d995ac2beada44188eb72d371a6894f90087eaabe148755409bbff60114bcfefbfe2182e6dc4218d0da75af80059bbb14e848c2e60790fb35bf1cb685cbb133b2baf3f2faefcc3f69e34102def4
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc9
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad1eb654821c4af954
+
+Product = 4f993781409d730da892c8451cc47a4c5c132a2c079f6c13a2689e9552450ed0b35c5291b82aae5614c0fc34f777940798a33b8bd5e010eb3c5c88595e8668fc8fb88ccd3d0cd5eee7c88e5b0b2be4605980fea4f8f2e42457963abe7860060482cfa2291e568ea55095ae2ada1c6bf9fda228664c9e02e7f12a8da4c355af044a537dd65dbf9c5d746c3c5f05a3d4d0515a48d9434b38fcbcc485558964fd9f212cf3c4aee9c03aebc468c25740df679d17823bfb20d96620c64b29f4013f0385cdd1a40fcbec3b06132a52aee615c4dbd880d0b030d5bc6aa06801d21fabd49774cd81ef504696d9655652db220ef9518c8ddd2bbd782e5f8cb06be77fc8d0c29f12d4ce67bb2478369710d003f0cb6f40a1341a5a5f2509d2d189084ea4346a44368a54f44c2be4c7b90c4d22976a31985927d0379b2e5d715a7e67eb3228943a07325a29316c695867e8f4ff676e00ffca0a6dfe8fe24652aef9e7f12616e8a54e367b90942f543a01dc7c1b8000ff991228ae83fe0131cfc235ba12ab2bdb33bd4ab0ba1b356bdbc6da4a70eed9fbf2c704e14ed6230eb5478dac0b02f4def1d8c076d1c0c0e2c4cdadb248de4acf961cee51dc41e545bd5a605a0860fb343c28ebf3f8814a9d5a7e0f3e9c93e742db76bc5671258d1da7758b41efead5
+A = f33cad5d3876f0b60a001e13043e41033ee78c29ed8528fd6f22a87fc65c8c650277fab430722fcf63b3984c35ac46883127d544e2f44a465647814e15c0ff595382eff8bdff3be862f8a57a51f27ab4af9899861240855380f5bb883476699ef9eff179a1b88c64cfd6648240a5fc68de054468dc91dac11aaebe696dc05b6b0de0f54bd365ad798f3c85bceaf6ddf976b72cdf69de58335520d358f90e9856de5357dd5d2686cd1a41293d8c2687ba2cb1504420ae2c07014521889172b30df89521e2f66142345115110adf3dc603b1ddba5d80dc6b42fb980e9994aba2dfca00a3df8ea9062f570ec7e0e94d2bc9
+B = 53c66ff2bc0e0d733d26f809aeedd151406ae8f44104f4e58f99e3eb54b06d542806932966bdbf30e13d81e5d6fa96f5308fc45613894b49dc7b766af02738dd89b10ca372d6232b0cbd57dcb873dea3c7598ef69b58ea5d72a0f2aaabd71025b488824a35cc33f8068ae4cd999fbb536be54e07f26df5d3bf8705281c8e94dd3712ad7c6a88f9d7b04f6f8924e18568ea07d46e58d197984824d797dd9ca1efe9763c62cc55fff69fad60d6501765dcf4926c18c027b4f9825d53cc38e99365c1b869245e66e7792f40dabeefe63e404cffc1d2ea63a9dd3fd4643afb2ddd288c6d4737abf20cec860584a7a600b4ad
+

 # Quotient tests.
 #
@@ -10507,6 +10571,12 @@ A =  -80000000000000000000000000000000000000000000000000000000000000000000000000
 E = 61803d4973ae68cfb2ba6770dbed70d36760fa42c01a16d1482eacf0d01adf7a917bc86ece58a73b920295c1291b90f49167ef856ecad149330e1fd49ec71392fb62d47270b53e6d4f3c8f044b80a5736753364896932abc6d872c4c5e135d1edb200597a93ceb262ff6c99079177cd10808b9ed20c8cd7352d80ac7f6963103
 M =  b5d257b2c50b050d42f0852eff5cfa2571157c500cd0bd9aa0b2ccdd89c531c9609d520eb81d928fb52b06da25dc713561aa0bd365ee56db9e62ac6787a85936990f44438363560f7af9e0c16f378e5b83f658252390d849401817624da97ec613a1b855fd901847352f434a777e4e32af0cb4033c7547fb6437d067fcd3d965

+# Regression test for CVE-2017-3738.
+ModExp = d360792bd8210786607817c3dda64cc38c8d0f25569597cb1f363c7919a0c3587baff01a2283edaeb04fc288ac0ab3f279b2a89ffcb452d8bdf72422a9f9780f4aa702dc964cf033149d3a339883062cab8564aebdbfac0bf68985e522c6fe545b346044690c525ca85d3f4eb3e3c25cdf541545afc84a309e9b1d7807003461
+A = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff2020202020df
+E = 2020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020FF2020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020
+M = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff2020202020ff
+

 # Exp tests.
 #
@@ -77,7 +77,7 @@ BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  }

  if (len == 0) {
-    ret->top = 0;
+    ret->width = 0;
    return ret;
  }

@@ -93,7 +93,7 @@ BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  // |bn_wexpand| must check bounds on |num_words| to write it into
  // |ret->dmax|.
  assert(num_words <= INT_MAX);
-  ret->top = (int)num_words;
+  ret->width = (int)num_words;
  ret->neg = 0;

  while (len--) {
@@ -105,9 +105,6 @@ BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
    }
  }

-  // need to call this due to clear byte at top if avoiding having the top bit
-  // set (-ve number)
-  bn_correct_top(ret);
  return ret;
 }

@@ -123,7 +120,7 @@ BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  }

  if (len == 0) {
-    ret->top = 0;
+    ret->width = 0;
    ret->neg = 0;
    return ret;
  }
@@ -134,7 +131,7 @@ BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
    BN_free(bn);
    return NULL;
  }
-  ret->top = num_words;
+  ret->width = num_words;

  // Make sure the top bytes will be zeroed.
  ret->d[num_words - 1] = 0;
@@ -142,8 +139,6 @@ BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  // We only support little-endian platforms, so we can simply memcpy the
  // internal representation.
  OPENSSL_memcpy(ret->d, in, len);
-
-  bn_correct_top(ret);
  return ret;
 }

@@ -159,88 +154,54 @@ size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) {
  return n;
 }

+static int fits_in_bytes(const uint8_t *bytes, size_t num_bytes, size_t len) {
+  uint8_t mask = 0;
+  for (size_t i = len; i < num_bytes; i++) {
+    mask |= bytes[i];
+  }
+  return mask == 0;
+}
+
 int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) {
-  // If we don't have enough space, fail out.
-  size_t num_bytes = BN_num_bytes(in);
+  const uint8_t *bytes = (const uint8_t *)in->d;
+  size_t num_bytes = in->width * BN_BYTES;
  if (len < num_bytes) {
-    return 0;
+    if (!fits_in_bytes(bytes, num_bytes, len)) {
+      return 0;
+    }
+    num_bytes = len;
  }

  // We only support little-endian platforms, so we can simply memcpy into the
  // internal representation.
-  OPENSSL_memcpy(out, in->d, num_bytes);
-
+  OPENSSL_memcpy(out, bytes, num_bytes);
  // Pad out the rest of the buffer with zeroes.
  OPENSSL_memset(out + num_bytes, 0, len - num_bytes);
-
  return 1;
 }

-// constant_time_select_ulong returns |x| if |v| is 1 and |y| if |v| is 0. Its
-// behavior is undefined if |v| takes any other value.
-static BN_ULONG constant_time_select_ulong(int v, BN_ULONG x, BN_ULONG y) {
-  BN_ULONG mask = v;
-  mask--;
-
-  return (~mask & x) | (mask & y);
-}
-
-// constant_time_le_size_t returns 1 if |x| <= |y| and 0 otherwise. |x| and |y|
-// must not have their MSBs set.
-static int constant_time_le_size_t(size_t x, size_t y) {
-  return ((x - y - 1) >> (sizeof(size_t) * 8 - 1)) & 1;
-}
-
-// read_word_padded returns the |i|'th word of |in|, if it is not out of
-// bounds. Otherwise, it returns 0. It does so without branches on the size of
-// |in|, however it necessarily does not have the same memory access pattern. If
-// the access would be out of bounds, it reads the last word of |in|. |in| must
-// not be zero.
-static BN_ULONG read_word_padded(const BIGNUM *in, size_t i) {
-  // Read |in->d[i]| if valid. Otherwise, read the last word.
-  BN_ULONG l = in->d[constant_time_select_ulong(
-      constant_time_le_size_t(in->dmax, i), in->dmax - 1, i)];
-
-  // Clamp to zero if above |d->top|.
-  return constant_time_select_ulong(constant_time_le_size_t(in->top, i), 0, l);
-}
-
 int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) {
-  // Special case for |in| = 0. Just branch as the probability is negligible.
-  if (BN_is_zero(in)) {
-    OPENSSL_memset(out, 0, len);
-    return 1;
-  }
-
-  // Check if the integer is too big. This case can exit early in non-constant
-  // time.
-  if ((size_t)in->top > (len + (BN_BYTES - 1)) / BN_BYTES) {
-    return 0;
-  }
-  if ((len % BN_BYTES) != 0) {
-    BN_ULONG l = read_word_padded(in, len / BN_BYTES);
-    if (l >> (8 * (len % BN_BYTES)) != 0) {
+  const uint8_t *bytes = (const uint8_t *)in->d;
+  size_t num_bytes = in->width * BN_BYTES;
+  if (len < num_bytes) {
+    if (!fits_in_bytes(bytes, num_bytes, len)) {
      return 0;
    }
+    num_bytes = len;
  }

-  // Write the bytes out one by one. Serialization is done without branching on
-  // the bits of |in| or on |in->top|, but if the routine would otherwise read
-  // out of bounds, the memory access pattern can't be fixed. However, for an
-  // RSA key of size a multiple of the word size, the probability of BN_BYTES
-  // leading zero octets is low.
-  //
-  // See Falko Stenzke, "Manger's Attack revisited", ICICS 2010.
-  size_t i = len;
-  while (i--) {
-    BN_ULONG l = read_word_padded(in, i / BN_BYTES);
-    *(out++) = (uint8_t)(l >> (8 * (i % BN_BYTES))) & 0xff;
+  // We only support little-endian platforms, so we can simply write the buffer
+  // in reverse.
+  for (size_t i = 0; i < num_bytes; i++) {
+    out[len - i - 1] = bytes[i];
  }
+  // Pad out the rest of the buffer with zeroes.
+  OPENSSL_memset(out, 0, len - num_bytes);
  return 1;
 }

 BN_ULONG BN_get_word(const BIGNUM *bn) {
-  switch (bn->top) {
+  switch (bn_minimal_width(bn)) {
    case 0:
      return 0;
    case 1:
@@ -251,7 +212,7 @@ BN_ULONG BN_get_word(const BIGNUM *bn) {
 }

 int BN_get_u64(const BIGNUM *bn, uint64_t *out) {
-  switch (bn->top) {
+  switch (bn_minimal_width(bn)) {
    case 0:
      *out = 0;
      return 1;
@@ -63,33 +63,43 @@
 #include "../../internal.h"


-int BN_ucmp(const BIGNUM *a, const BIGNUM *b) {
-  int i;
-  BN_ULONG t1, t2, *ap, *bp;
-
-  i = a->top - b->top;
-  if (i != 0) {
-    return i;
+static int bn_cmp_words_consttime(const BN_ULONG *a, size_t a_len,
+                                  const BN_ULONG *b, size_t b_len) {
+  OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
+                         crypto_word_t_too_small);
+  int ret = 0;
+  // Process the common words in little-endian order.
+  size_t min = a_len < b_len ? a_len : b_len;
+  for (size_t i = 0; i < min; i++) {
+    crypto_word_t eq = constant_time_eq_w(a[i], b[i]);
+    crypto_word_t lt = constant_time_lt_w(a[i], b[i]);
+    ret =
+        constant_time_select_int(eq, ret, constant_time_select_int(lt, -1, 1));
  }

-  ap = a->d;
-  bp = b->d;
-  for (i = a->top - 1; i >= 0; i--) {
-    t1 = ap[i];
-    t2 = bp[i];
-    if (t1 != t2) {
-      return (t1 > t2) ? 1 : -1;
+  // If |a| or |b| has non-zero words beyond |min|, they take precedence.
+  if (a_len < b_len) {
+    crypto_word_t mask = 0;
+    for (size_t i = a_len; i < b_len; i++) {
+      mask |= b[i];
    }
+    ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, -1);
+  } else if (b_len < a_len) {
+    crypto_word_t mask = 0;
+    for (size_t i = b_len; i < a_len; i++) {
+      mask |= a[i];
+    }
+    ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, 1);
  }

-  return 0;
+  return ret;
+}
+
+int BN_ucmp(const BIGNUM *a, const BIGNUM *b) {
+  return bn_cmp_words_consttime(a->d, a->width, b->d, b->width);
 }

 int BN_cmp(const BIGNUM *a, const BIGNUM *b) {
-  int i;
-  int gt, lt;
-  BN_ULONG t1, t2;
-
  if ((a == NULL) || (b == NULL)) {
    if (a != NULL) {
      return -1;
@@ -100,97 +110,25 @@ int BN_cmp(const BIGNUM *a, const BIGNUM *b) {
    }
  }

+  // We do not attempt to process the sign bit in constant time. Negative
+  // |BIGNUM|s should never occur in crypto, only calculators.
  if (a->neg != b->neg) {
    if (a->neg) {
      return -1;
    }
    return 1;
  }
-  if (a->neg == 0) {
-    gt = 1;
-    lt = -1;
-  } else {
-    gt = -1;
-    lt = 1;
-  }

-  if (a->top > b->top) {
-    return gt;
-  }
-  if (a->top < b->top) {
-    return lt;
-  }
-
-  for (i = a->top - 1; i >= 0; i--) {
-    t1 = a->d[i];
-    t2 = b->d[i];
-    if (t1 > t2) {
-      return gt;
-    } if (t1 < t2) {
-      return lt;
-    }
-  }
-
-  return 0;
-}
-
-int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n) {
-  int i;
-  BN_ULONG aa, bb;
-
-  aa = a[n - 1];
-  bb = b[n - 1];
-  if (aa != bb) {
-    return (aa > bb) ? 1 : -1;
-  }
-
-  for (i = n - 2; i >= 0; i--) {
-    aa = a[i];
-    bb = b[i];
-    if (aa != bb) {
-      return (aa > bb) ? 1 : -1;
-    }
-  }
-  return 0;
-}
-
-int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl) {
-  int n, i;
-  n = cl - 1;
-
-  if (dl < 0) {
-    for (i = dl; i < 0; i++) {
-      if (b[n - i] != 0) {
-        return -1;  // a < b
-      }
-    }
-  }
-  if (dl > 0) {
-    for (i = dl; i > 0; i--) {
-      if (a[n + i] != 0) {
-        return 1;  // a > b
-      }
-    }
-  }
-
-  return bn_cmp_words(a, b, cl);
+  int ret = BN_ucmp(a, b);
+  return a->neg ? -ret : ret;
 }

 int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len) {
-  OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
-                         crypto_word_t_too_small);
-  int ret = 0;
-  // Process the words in little-endian order.
-  for (size_t i = 0; i < len; i++) {
-    crypto_word_t eq = constant_time_eq_w(a[i], b[i]);
-    crypto_word_t lt = constant_time_lt_w(a[i], b[i]);
-    ret = constant_time_select_int(eq, ret, constant_time_select_int(lt, 1, 0));
-  }
-  return ret;
+  return bn_cmp_words_consttime(a, len, b, len) < 0;
 }

 int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w) {
-  switch (bn->top) {
+  switch (bn_minimal_width(bn)) {
    case 1:
      return bn->d[0] == w;
    case 0:
@@ -205,14 +143,14 @@ int BN_cmp_word(const BIGNUM *a, BN_ULONG b) {
  BN_init(&b_bn);

  b_bn.d = &b;
-  b_bn.top = b > 0;
+  b_bn.width = b > 0;
  b_bn.dmax = 1;
  b_bn.flags = BN_FLG_STATIC_DATA;
  return BN_cmp(a, &b_bn);
 }

 int BN_is_zero(const BIGNUM *bn) {
-  return bn->top == 0;
+  return bn_minimal_width(bn) == 0;
 }

 int BN_is_one(const BIGNUM *bn) {
@@ -224,31 +162,39 @@ int BN_is_word(const BIGNUM *bn, BN_ULONG w) {
 }

 int BN_is_odd(const BIGNUM *bn) {
-  return bn->top > 0 && (bn->d[0] & 1) == 1;
+  return bn->width > 0 && (bn->d[0] & 1) == 1;
 }

 int BN_is_pow2(const BIGNUM *bn) {
-  if (bn->top == 0 || bn->neg) {
+  int width = bn_minimal_width(bn);
+  if (width == 0 || bn->neg) {
    return 0;
  }

-  for (int i = 0; i < bn->top - 1; i++) {
+  for (int i = 0; i < width - 1; i++) {
    if (bn->d[i] != 0) {
      return 0;
    }
  }

-  return 0 == (bn->d[bn->top-1] & (bn->d[bn->top-1] - 1));
+  return 0 == (bn->d[width-1] & (bn->d[width-1] - 1));
 }

 int BN_equal_consttime(const BIGNUM *a, const BIGNUM *b) {
-  if (a->top != b->top) {
-    return 0;
+  BN_ULONG mask = 0;
+  // If |a| or |b| has more words than the other, all those words must be zero.
+  for (int i = a->width; i < b->width; i++) {
+    mask |= b->d[i];
  }
-
-  int limbs_are_equal =
-    CRYPTO_memcmp(a->d, b->d, (size_t)a->top * sizeof(a->d[0])) == 0;
-
-  return constant_time_select_int(constant_time_eq_int(a->neg, b->neg),
-                                  limbs_are_equal, 0);
+  for (int i = b->width; i < a->width; i++) {
+    mask |= a->d[i];
+  }
+  // Common words must match.
+  int min = a->width < b->width ? a->width : b->width;
+  for (int i = 0; i < min; i++) {
+    mask |= (a->d[i] ^ b->d[i]);
+  }
+  // The sign bit must match.
+  mask |= (a->neg ^ b->neg);
+  return mask == 0;
 }
@@ -155,18 +155,18 @@ static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out,
  //
  // These issues aren't specific to x86 and x86_64, so it might be worthwhile
  // to add more assembly language implementations.
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__GNUC__)
-  __asm__ volatile (
-    "divl %4"
-    : "=a"(*quotient_out), "=d"(*rem_out)
-    : "a"(n1), "d"(n0), "rm"(d0)
-    : "cc" );
-#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
-  __asm__ volatile (
-    "divq %4"
-    : "=a"(*quotient_out), "=d"(*rem_out)
-    : "a"(n1), "d"(n0), "rm"(d0)
-    : "cc" );
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && \
+    (defined(__GNUC__) || defined(__clang__))
+  __asm__ volatile("divl %4"
+                   : "=a"(*quotient_out), "=d"(*rem_out)
+                   : "a"(n1), "d"(n0), "rm"(d0)
+                   : "cc");
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+    (defined(__GNUC__) || defined(__clang__))
+  __asm__ volatile("divq %4"
+                   : "=a"(*quotient_out), "=d"(*rem_out)
+                   : "a"(n1), "d"(n0), "rm"(d0)
+                   : "cc");
 #else
 #if defined(BN_ULLONG)
  BN_ULLONG n = (((BN_ULLONG)n0) << BN_BITS2) | n1;
@@ -202,10 +202,16 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
  BN_ULONG d0, d1;
  int num_n, div_n;

-  // Invalid zero-padding would have particularly bad consequences
-  // so don't just rely on bn_check_top() here
-  if ((numerator->top > 0 && numerator->d[numerator->top - 1] == 0) ||
-      (divisor->top > 0 && divisor->d[divisor->top - 1] == 0)) {
+  // This function relies on the historical minimal-width |BIGNUM| invariant.
+  // It is already not constant-time (constant-time reductions should use
+  // Montgomery logic), so we shrink all inputs and intermediate values to
+  // retain the previous behavior.
+
+  // Invalid zero-padding would have particularly bad consequences.
+  int numerator_width = bn_minimal_width(numerator);
+  int divisor_width = bn_minimal_width(divisor);
+  if ((numerator_width > 0 && numerator->d[numerator_width - 1] == 0) ||
+      (divisor_width > 0 && divisor->d[divisor_width - 1] == 0)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NOT_INITIALIZED);
    return 0;
  }
@@ -234,46 +240,48 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
  if (!BN_lshift(sdiv, divisor, norm_shift)) {
    goto err;
  }
+  bn_set_minimal_width(sdiv);
  sdiv->neg = 0;
  norm_shift += BN_BITS2;
  if (!BN_lshift(snum, numerator, norm_shift)) {
    goto err;
  }
+  bn_set_minimal_width(snum);
  snum->neg = 0;

  // Since we don't want to have special-case logic for the case where snum is
  // larger than sdiv, we pad snum with enough zeroes without changing its
  // value.
-  if (snum->top <= sdiv->top + 1) {
-    if (!bn_wexpand(snum, sdiv->top + 2)) {
+  if (snum->width <= sdiv->width + 1) {
+    if (!bn_wexpand(snum, sdiv->width + 2)) {
      goto err;
    }
-    for (int i = snum->top; i < sdiv->top + 2; i++) {
+    for (int i = snum->width; i < sdiv->width + 2; i++) {
      snum->d[i] = 0;
    }
-    snum->top = sdiv->top + 2;
+    snum->width = sdiv->width + 2;
  } else {
-    if (!bn_wexpand(snum, snum->top + 1)) {
+    if (!bn_wexpand(snum, snum->width + 1)) {
      goto err;
    }
-    snum->d[snum->top] = 0;
-    snum->top++;
+    snum->d[snum->width] = 0;
+    snum->width++;
  }

-  div_n = sdiv->top;
-  num_n = snum->top;
+  div_n = sdiv->width;
+  num_n = snum->width;
  loop = num_n - div_n;
  // Lets setup a 'window' into snum
  // This is the part that corresponds to the current
  // 'area' being divided
  wnum.neg = 0;
  wnum.d = &(snum->d[loop]);
-  wnum.top = div_n;
-  // only needed when BN_ucmp messes up the values between top and max
+  wnum.width = div_n;
+  // only needed when BN_ucmp messes up the values between width and max
  wnum.dmax = snum->dmax - loop;  // so we don't step out of bounds

  // Get the top 2 words of sdiv
-  // div_n=sdiv->top;
+  // div_n=sdiv->width;
  d0 = sdiv->d[div_n - 1];
  d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];

@@ -285,7 +293,7 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
  if (!bn_wexpand(res, loop + 1)) {
    goto err;
  }
-  res->top = loop - 1;
+  res->width = loop - 1;
  resp = &(res->d[loop - 1]);

  // space for temp
@@ -293,9 +301,9 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
    goto err;
  }

-  // if res->top == 0 then clear the neg value otherwise decrease
+  // if res->width == 0 then clear the neg value otherwise decrease
  // the resp pointer
-  if (res->top == 0) {
+  if (res->width == 0) {
    res->neg = 0;
  } else {
    resp--;
@@ -371,7 +379,7 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
    *resp = q;
  }

-  bn_correct_top(snum);
+  bn_set_minimal_width(snum);

  if (rem != NULL) {
    // Keep a copy of the neg flag in numerator because if |rem| == |numerator|
@@ -385,7 +393,7 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
    }
  }

-  bn_correct_top(res);
+  bn_set_minimal_width(res);
  BN_CTX_end(ctx);
  return 1;

@@ -406,6 +414,78 @@ int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) {
  return (d->neg ? BN_sub : BN_add)(r, r, d);
 }

+// bn_mod_sub_words sets |r| to |a| - |b| (mod |m|), using |tmp| as scratch
+// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
+// |r|, |a|, and |b| may alias.
+static void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                             const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
+  // r = a - b
+  BN_ULONG borrow = bn_sub_words(r, a, b, num);
+  // tmp = a - b + m
+  bn_add_words(tmp, r, m, num);
+  bn_select_words(r, 0 - borrow, tmp /* r < 0 */, r /* r >= 0 */, num);
+}
+
+// bn_mod_add_words sets |r| to |a| + |b| (mod |m|), using |tmp| as scratch
+// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
+// |r|, |a|, and |b| may alias.
+static void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                             const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
+  // tmp = a + b. Note the result fits in |num|+1 words. We store the extra word
+  // in |carry|.
+  BN_ULONG carry = bn_add_words(tmp, a, b, num);
+  // r = a + b - m. We use |bn_sub_words| to perform the bulk of the
+  // subtraction, and then apply the borrow to |carry|.
+  carry -= bn_sub_words(r, tmp, m, num);
+  // |a| and |b| were both fully-reduced, so we know:
+  //
+  //   0 + 0 - m <= r < m + m - m
+  //          -m <= r < m
+  //
+  // If 0 <= |r| < |m|, |r| fits in |num| words and |carry| is zero. We then
+  // wish to select |r| as the answer. Otherwise -m <= r < 0 and we wish to
+  // return |r| + |m|, or |tmp|. |carry| must then be -1 or all ones. In both
+  // cases, |carry| is a suitable input to |bn_select_words|.
+  //
+  // Although |carry| may be one if |bn_add_words| returns one and
+  // |bn_sub_words| returns zero, this would give |r| > |m|, which violates our
+  // input assumptions.
+  assert(carry == 0 || carry == (BN_ULONG)-1);
+  bn_select_words(r, carry, tmp /* r < 0 */, r /* r >= 0 */, num);
+}
+
+static BIGNUM *bn_scratch_space_from_ctx(size_t width, BN_CTX *ctx) {
+  BIGNUM *ret = BN_CTX_get(ctx);
+  if (ret == NULL ||
+      !bn_wexpand(ret, width)) {
+    return NULL;
+  }
+  ret->neg = 0;
+  ret->width = width;
+  return ret;
+}
+
+// bn_resized_from_ctx returns |bn| with width at least |width| or NULL on
+// error. This is so it may be used with low-level "words" functions. If
+// necessary, it allocates a new |BIGNUM| with a lifetime of the current scope
+// in |ctx|, so the caller does not need to explicitly free it. |bn| must fit in
+// |width| words.
+static const BIGNUM *bn_resized_from_ctx(const BIGNUM *bn, size_t width,
+                                         BN_CTX *ctx) {
+  if ((size_t)bn->width >= width) {
+    // Any excess words must be zero.
+    assert(bn_fits_in_words(bn, width));
+    return bn;
+  }
+  BIGNUM *ret = bn_scratch_space_from_ctx(width, ctx);
+  if (ret == NULL ||
+      !BN_copy(ret, bn) ||
+      !bn_resize_words(ret, width)) {
+    return NULL;
+  }
+  return ret;
+}
+
 int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
               BN_CTX *ctx) {
  if (!BN_add(r, a, b)) {
@@ -416,13 +496,27 @@ int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,

 int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                     const BIGNUM *m) {
-  if (!BN_uadd(r, a, b)) {
-    return 0;
+  BN_CTX *ctx = BN_CTX_new();
+  int ok = ctx != NULL &&
+           bn_mod_add_quick_ctx(r, a, b, m, ctx);
+  BN_CTX_free(ctx);
+  return ok;
+}
+
+int bn_mod_add_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                         const BIGNUM *m, BN_CTX *ctx) {
+  BN_CTX_start(ctx);
+  a = bn_resized_from_ctx(a, m->width, ctx);
+  b = bn_resized_from_ctx(b, m->width, ctx);
+  BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx);
+  int ok = a != NULL && b != NULL && tmp != NULL &&
+           bn_wexpand(r, m->width);
+  if (ok) {
+    bn_mod_add_words(r->d, a->d, b->d, m->d, tmp->d, m->width);
+    r->width = m->width;
  }
-  if (BN_ucmp(r, m) >= 0) {
-    return BN_usub(r, r, m);
-  }
-  return 1;
+  BN_CTX_end(ctx);
+  return ok;
 }

 int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
@@ -433,17 +527,29 @@ int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
  return BN_nnmod(r, r, m, ctx);
 }

-// BN_mod_sub variant that may be used if both  a  and  b  are non-negative
-// and less than  m
+int bn_mod_sub_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                         const BIGNUM *m, BN_CTX *ctx) {
+  BN_CTX_start(ctx);
+  a = bn_resized_from_ctx(a, m->width, ctx);
+  b = bn_resized_from_ctx(b, m->width, ctx);
+  BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx);
+  int ok = a != NULL && b != NULL && tmp != NULL &&
+           bn_wexpand(r, m->width);
+  if (ok) {
+    bn_mod_sub_words(r->d, a->d, b->d, m->d, tmp->d, m->width);
+    r->width = m->width;
+  }
+  BN_CTX_end(ctx);
+  return ok;
+}
+
 int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                     const BIGNUM *m) {
-  if (!BN_sub(r, a, b)) {
-    return 0;
-  }
-  if (r->neg) {
-    return BN_add(r, r, m);
-  }
-  return 1;
+  BN_CTX *ctx = BN_CTX_new();
+  int ok = ctx != NULL &&
+           bn_mod_sub_quick_ctx(r, a, b, m, ctx);
+  BN_CTX_free(ctx);
+  return ok;
 }

 int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
@@ -504,58 +610,33 @@ int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
    abs_m->neg = 0;
  }

-  ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
+  ret = bn_mod_lshift_quick_ctx(r, r, n, (abs_m ? abs_m : m), ctx);

  BN_free(abs_m);
  return ret;
 }

-int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) {
-  if (r != a) {
-    if (BN_copy(r, a) == NULL) {
+int bn_mod_lshift_quick_ctx(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+                            BN_CTX *ctx) {
+  if (!BN_copy(r, a)) {
+    return 0;
+  }
+  for (int i = 0; i < n; i++) {
+    if (!bn_mod_lshift1_quick_ctx(r, r, m, ctx)) {
      return 0;
    }
  }
-
-  while (n > 0) {
-    int max_shift;
-
-    // 0 < r < m
-    max_shift = BN_num_bits(m) - BN_num_bits(r);
-    // max_shift >= 0
-
-    if (max_shift < 0) {
-      OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
-      return 0;
-    }
-
-    if (max_shift > n) {
-      max_shift = n;
-    }
-
-    if (max_shift) {
-      if (!BN_lshift(r, r, max_shift)) {
-        return 0;
-      }
-      n -= max_shift;
-    } else {
-      if (!BN_lshift1(r, r)) {
-        return 0;
-      }
-      --n;
-    }
-
-    // BN_num_bits(r) <= BN_num_bits(m)
-    if (BN_cmp(r, m) >= 0) {
-      if (!BN_sub(r, r, m)) {
-        return 0;
-      }
-    }
-  }
-
  return 1;
 }

+int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) {
+  BN_CTX *ctx = BN_CTX_new();
+  int ok = ctx != NULL &&
+           bn_mod_lshift_quick_ctx(r, a, n, m, ctx);
+  BN_CTX_free(ctx);
+  return ok;
+}
+
 int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
  if (!BN_lshift1(r, a)) {
    return 0;
@@ -564,15 +645,17 @@ int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
  return BN_nnmod(r, r, m, ctx);
 }

-int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) {
-  if (!BN_lshift1(r, a)) {
-    return 0;
-  }
-  if (BN_cmp(r, m) >= 0) {
-    return BN_sub(r, r, m);
-  }
+int bn_mod_lshift1_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
+                             BN_CTX *ctx) {
+  return bn_mod_add_quick_ctx(r, a, a, m, ctx);
+}

-  return 1;
+int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) {
+  BN_CTX *ctx = BN_CTX_new();
+  int ok = ctx != NULL &&
+           bn_mod_lshift1_quick_ctx(r, a, m, ctx);
+  BN_CTX_free(ctx);
+  return ok;
 }

 BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
@@ -584,7 +667,7 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
    return (BN_ULONG) - 1;
  }

-  if (a->top == 0) {
+  if (a->width == 0) {
    return 0;
  }

@@ -595,7 +678,7 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
    return (BN_ULONG) - 1;
  }

-  for (i = a->top - 1; i >= 0; i--) {
+  for (i = a->width - 1; i >= 0; i--) {
    BN_ULONG l = a->d[i];
    BN_ULONG d;
    BN_ULONG unused_rem;
@@ -604,20 +687,13 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
    a->d[i] = d;
  }

-  if ((a->top > 0) && (a->d[a->top - 1] == 0)) {
-    a->top--;
-  }
-
-  if (a->top == 0) {
-    a->neg = 0;
-  }
-
+  bn_set_minimal_width(a);
  ret >>= j;
  return ret;
 }

 BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
-#ifndef BN_ULLONG
+#ifndef BN_CAN_DIVIDE_ULLONG
  BN_ULONG ret = 0;
 #else
  BN_ULLONG ret = 0;
@@ -628,9 +704,9 @@ BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
    return (BN_ULONG) -1;
  }

-#ifndef BN_ULLONG
-  // If |w| is too long and we don't have |BN_ULLONG| then we need to fall back
-  // to using |BN_div_word|.
+#ifndef BN_CAN_DIVIDE_ULLONG
+  // If |w| is too long and we don't have |BN_ULLONG| division then we need to
+  // fall back to using |BN_div_word|.
  if (w > ((BN_ULONG)1 << BN_BITS4)) {
    BIGNUM *tmp = BN_dup(a);
    if (tmp == NULL) {
@@ -642,8 +718,8 @@ BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
  }
 #endif

-  for (i = a->top - 1; i >= 0; i--) {
-#ifndef BN_ULLONG
+  for (i = a->width - 1; i >= 0; i--) {
+#ifndef BN_CAN_DIVIDE_ULLONG
    ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;
    ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
 #else
@@ -654,7 +730,7 @@ BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
 }

 int BN_mod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) {
-  if (e == 0 || a->top == 0) {
+  if (e == 0 || a->width == 0) {
    BN_zero(r);
    return 1;
  }
@@ -662,7 +738,7 @@ int BN_mod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) {
  size_t num_words = 1 + ((e - 1) / BN_BITS2);

  // If |a| definitely has less than |e| bits, just BN_copy.
-  if ((size_t) a->top < num_words) {
+  if ((size_t) a->width < num_words) {
    return BN_copy(r, a) != NULL;
  }

@@ -683,8 +759,8 @@ int BN_mod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) {

  // Fill in the remaining fields of |r|.
  r->neg = a->neg;
-  r->top = (int) num_words;
-  bn_correct_top(r);
+  r->width = (int) num_words;
+  bn_set_minimal_width(r);
  return 1;
 }

@@ -706,27 +782,27 @@ int BN_nnmod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) {
  }

  // Clear the upper words of |r|.
-  OPENSSL_memset(&r->d[r->top], 0, (num_words - r->top) * BN_BYTES);
+  OPENSSL_memset(&r->d[r->width], 0, (num_words - r->width) * BN_BYTES);

  // Set parameters of |r|.
  r->neg = 0;
-  r->top = (int) num_words;
+  r->width = (int) num_words;

  // Now, invert every word. The idea here is that we want to compute 2^e-|x|,
  // which is actually equivalent to the twos-complement representation of |x|
  // in |e| bits, which is -x = ~x + 1.
-  for (int i = 0; i < r->top; i++) {
+  for (int i = 0; i < r->width; i++) {
    r->d[i] = ~r->d[i];
  }

  // If our exponent doesn't span the top word, we have to mask the rest.
  size_t top_word_exponent = e % BN_BITS2;
  if (top_word_exponent != 0) {
-    r->d[r->top - 1] &= (((BN_ULONG) 1) << top_word_exponent) - 1;
+    r->d[r->width - 1] &= (((BN_ULONG) 1) << top_word_exponent) - 1;
  }

-  // Keep the correct_top invariant for BN_add.
-  bn_correct_top(r);
+  // Keep the minimal-width invariant for |BIGNUM|.
+  bn_set_minimal_width(r);

  // Finally, add one, for the reason described above.
  return BN_add(r, r, BN_value_one());
@@ -622,8 +622,8 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,

  // Allocate a montgomery context if it was not supplied by the caller.
  if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new();
-    if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
+    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    if (new_mont == NULL) {
      goto err;
    }
    mont = new_mont;
@@ -666,22 +666,7 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    }
  }

-  // Set |r| to one in Montgomery form. If the high bit of |m| is set, |m| is
-  // close to R and we subtract rather than perform Montgomery reduction.
-  if (m->d[m->top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
-    if (!bn_wexpand(r, m->top)) {
-      goto err;
-    }
-    // r = 2^(top*BN_BITS2) - m
-    r->d[0] = 0 - m->d[0];
-    for (int i = 1; i < m->top; i++) {
-      r->d[i] = ~m->d[i];
-    }
-    r->top = m->top;
-    // The upper words will be zero if the corresponding words of |m| were
-    // 0xfff[...], so call |bn_correct_top|.
-    bn_correct_top(r);
-  } else if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) {
+  if (!bn_one_to_montgomery(r, mont, ctx)) {
    goto err;
  }

@@ -746,8 +731,7 @@ err:
 int bn_mod_exp_mont_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                          size_t num_a, const BN_ULONG *p, size_t num_p,
                          const BN_MONT_CTX *mont) {
-  const BN_ULONG *n = mont->N.d;
-  size_t num_n = mont->N.top;
+  size_t num_n = mont->N.width;
  if (num_n != num_a || num_n != num_r || num_n > BN_SMALL_MAX_WORDS) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
@@ -793,16 +777,7 @@ int bn_mod_exp_mont_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
    }
  }

-  // Set |r| to one in Montgomery form. If the high bit of |m| is set, |m| is
-  // close to R and we subtract rather than perform Montgomery reduction.
-  if (n[num_n - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
-    // r = 2^(top*BN_BITS2) - m
-    r[0] = 0 - n[0];
-    for (size_t i = 1; i < num_n; i++) {
-      r[i] = ~n[i];
-    }
-  } else if (!bn_from_montgomery_small(r, num_r, mont->RR.d, mont->RR.top,
-                                       mont)) {
+  if (!bn_one_to_montgomery_small(r, num_r, mont)) {
    goto err;
  }

@@ -866,7 +841,7 @@ int bn_mod_inverse_prime_mont_small(BN_ULONG *r, size_t num_r,
                                    const BN_ULONG *a, size_t num_a,
                                    const BN_MONT_CTX *mont) {
  const BN_ULONG *p = mont->N.d;
-  size_t num_p = mont->N.top;
+  size_t num_p = mont->N.width;
  if (num_p > BN_SMALL_MAX_WORDS || num_p == 0) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
@@ -901,8 +876,8 @@ static void copy_to_prebuf(const BIGNUM *b, int top, unsigned char *buf,
  const int width = 1 << window;
  BN_ULONG *table = (BN_ULONG *) buf;

-  if (top > b->top) {
-    top = b->top;  // this works because 'buf' is explicitly zeroed
+  if (top > b->width) {
+    top = b->width;  // this works because 'buf' is explicitly zeroed
  }

  for (i = 0, j = idx; i < top; i++, j += width)  {
@@ -955,8 +930,7 @@ static int copy_from_prebuf(BIGNUM *b, int top, unsigned char *buf, int idx,
    }
  }

-  b->top = top;
-  bn_correct_top(b);
+  b->width = top;
  return 1;
 }

@@ -1006,8 +980,7 @@ static int copy_from_prebuf(BIGNUM *b, int top, unsigned char *buf, int idx,
 int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                              const BIGNUM *m, BN_CTX *ctx,
                              const BN_MONT_CTX *mont) {
-  int i, bits, ret = 0, window, wvalue;
-  int top;
+  int i, ret = 0, window, wvalue;
  BN_MONT_CTX *new_mont = NULL;

  int numPowers;
@@ -1022,9 +995,10 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    return 0;
  }

-  top = m->top;
-
-  bits = BN_num_bits(p);
+  // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak
+  // whether the top bits are zero.
+  int max_bits = p->width * BN_BITS2;
+  int bits = max_bits;
  if (bits == 0) {
    // x**0 mod 1 is still zero.
    if (BN_is_one(m)) {
@@ -1036,13 +1010,17 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,

  // Allocate a montgomery context if it was not supplied by the caller.
  if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new();
-    if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
+    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    if (new_mont == NULL) {
      goto err;
    }
    mont = new_mont;
  }

+  // Use the width in |mont->N|, rather than the copy in |m|. The assembly
+  // implementation assumes it can use |top| to size R.
+  int top = mont->N.width;
+
  if (a->neg || BN_ucmp(a, m) >= 0) {
    new_a = BN_new();
    if (new_a == NULL ||
@@ -1056,15 +1034,14 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
  // If the size of the operands allow it, perform the optimized
  // RSAZ exponentiation. For further information see
  // crypto/bn/rsaz_exp.c and accompanying assembly modules.
-  if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024) &&
+  if ((16 == a->width) && (16 == p->width) && (BN_num_bits(m) == 1024) &&
      rsaz_avx2_eligible()) {
    if (!bn_wexpand(rr, 16)) {
      goto err;
    }
    RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0]);
-    rr->top = 16;
+    rr->width = 16;
    rr->neg = 0;
-    bn_correct_top(rr);
    ret = 1;
    goto err;
  }
@@ -1110,21 +1087,12 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
  // lay down tmp and am right after powers table
  tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
  am.d = tmp.d + top;
-  tmp.top = am.top = 0;
+  tmp.width = am.width = 0;
  tmp.dmax = am.dmax = top;
  tmp.neg = am.neg = 0;
  tmp.flags = am.flags = BN_FLG_STATIC_DATA;

-// prepare a^0 in Montgomery domain
-// by Shay Gueron's suggestion
-  if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
-    // 2^(top*BN_BITS2) - m
-    tmp.d[0] = 0 - m->d[0];
-    for (i = 1; i < top; i++) {
-      tmp.d[i] = ~m->d[i];
-    }
-    tmp.top = top;
-  } else if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx)) {
+  if (!bn_one_to_montgomery(&tmp, mont, ctx)) {
    goto err;
  }

@@ -1148,10 +1116,10 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,

    // BN_to_montgomery can contaminate words above .top
    // [in BN_DEBUG[_DEBUG] build]...
-    for (i = am.top; i < top; i++) {
+    for (i = am.width; i < top; i++) {
      am.d[i] = 0;
    }
-    for (i = tmp.top; i < top; i++) {
+    for (i = tmp.width; i < top; i++) {
      tmp.d[i] = 0;
    }

@@ -1161,7 +1129,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    }

    bn_scatter5(tmp.d, top, powerbuf, 0);
-    bn_scatter5(am.d, am.top, powerbuf, 1);
+    bn_scatter5(am.d, am.width, powerbuf, 1);
    bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
    bn_scatter5(tmp.d, top, powerbuf, 2);

@@ -1217,7 +1185,6 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
      }
    } else {
      const uint8_t *p_bytes = (const uint8_t *)p->d;
-      int max_bits = p->top * BN_BITS2;
      assert(bits < max_bits);
      // |p = 0| has been handled as a special case, so |max_bits| is at least
      // one word.
@@ -1229,7 +1196,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
      // here is the top bit, inclusive.
      if (bits - 4 >= max_bits - 8) {
        // Read five bits from |bits-4| through |bits|, inclusive.
-        wvalue = p_bytes[p->top * BN_BYTES - 1];
+        wvalue = p_bytes[p->width * BN_BYTES - 1];
        wvalue >>= (bits - 4) & 7;
        wvalue &= 0x1f;
        bits -= 5;
@@ -1248,8 +1215,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    }

    ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
-    tmp.top = top;
-    bn_correct_top(&tmp);
+    tmp.width = top;
    if (ret) {
      if (!BN_copy(rr, &tmp)) {
        ret = 0;
@@ -1363,8 +1329,8 @@ int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,

  // Allocate a montgomery context if it was not supplied by the caller.
  if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new();
-    if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
+    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    if (new_mont == NULL) {
      goto err;
    }
    mont = new_mont;
@@ -64,7 +64,8 @@
 // This file has two other implementations: x86 assembly language in
 // asm/bn-586.pl and x86_64 inline assembly in asm/x86_64-gcc.c.
 #if defined(OPENSSL_NO_ASM) || \
-    !(defined(OPENSSL_X86) || (defined(OPENSSL_X86_64) && defined(__GNUC__)))
+    !(defined(OPENSSL_X86) ||  \
+      (defined(OPENSSL_X86_64) && (defined(__GNUC__) || defined(__clang__))))

 #ifdef BN_ULLONG
 #define mul_add(r, a, w, c)               \
@@ -140,9 +140,12 @@ extern "C" {

 #if defined(OPENSSL_64_BIT)

-#if !defined(_MSC_VER)
+#if defined(BORINGSSL_HAS_UINT128)
 // MSVC doesn't support two-word integers on 64-bit.
 #define BN_ULLONG uint128_t
+#if defined(BORINGSSL_CAN_DIVIDE_UINT128)
+#define BN_CAN_DIVIDE_ULLONG
+#endif
 #endif

 #define BN_BITS2 64
@@ -160,6 +163,7 @@ extern "C" {
 #elif defined(OPENSSL_32_BIT)

 #define BN_ULLONG uint64_t
+#define BN_CAN_DIVIDE_ULLONG
 #define BN_BITS2 32
 #define BN_BYTES 4
 #define BN_BITS4 16
@@ -193,9 +197,13 @@ extern "C" {
 #define Hw(t) ((BN_ULONG)((t) >> BN_BITS2))
 #endif

-// bn_correct_top decrements |bn->top| until |bn->d[top-1]| is non-zero or
-// until |top| is zero. If |bn| is zero, |bn->neg| is set to zero.
-void bn_correct_top(BIGNUM *bn);
+// bn_minimal_width returns the minimal value of |bn->width| which fits the
+// value of |bn|.
+int bn_minimal_width(const BIGNUM *bn);
+
+// bn_set_minimal_width sets |bn->width| to |bn_minimal_width(bn)|. If |bn| is
+// zero, |bn->neg| is set to zero.
+void bn_set_minimal_width(BIGNUM *bn);

 // bn_wexpand ensures that |bn| has at least |words| works of space without
 // altering its value. It returns one on success or zero on allocation
@@ -206,10 +214,27 @@ int bn_wexpand(BIGNUM *bn, size_t words);
 // than a number of words.
 int bn_expand(BIGNUM *bn, size_t bits);

+// bn_resize_words adjusts |bn->width| to be |words|. It returns one on success
+// and zero on allocation error or if |bn|'s value is too large.
+OPENSSL_EXPORT int bn_resize_words(BIGNUM *bn, size_t words);
+
+// bn_select_words sets |r| to |a| if |mask| is all ones or |b| if |mask| is
+// all zeros.
+void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
+                     const BN_ULONG *b, size_t num);
+
 // bn_set_words sets |bn| to the value encoded in the |num| words in |words|,
 // least significant word first.
 int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num);

+// bn_fits_in_words returns one if |bn| may be represented in |num| words, plus
+// a sign bit, and zero otherwise.
+int bn_fits_in_words(const BIGNUM *bn, size_t num);
+
+// bn_copy_words copies the value of |bn| to |out| and returns one if the value
+// is representable in |num| words. Otherwise, it returns zero.
+int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn);
+
 // bn_mul_add_words multiples |ap| by |w|, adds the result to |rp|, and places
 // the result in |rp|. |ap| and |rp| must both be |num| words long. It returns
 // the carry word of the operation. |ap| and |rp| may be equal but otherwise may
@@ -255,16 +280,6 @@ void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[4]);
 // bn_sqr_comba4 sets |r| to |a|^2.
 void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]);

-// bn_cmp_words returns a value less than, equal to or greater than zero if
-// the, length |n|, array |a| is less than, equal to or greater than |b|.
-int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n);
-
-// bn_cmp_words returns a value less than, equal to or greater than zero if the
-// array |a| is less than, equal to or greater than |b|. The arrays can be of
-// different lengths: |cl| gives the minimum of the two lengths and |dl| gives
-// the length of |a| minus the length of |b|.
-int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
-
 // bn_less_than_words returns one if |a| < |b| and zero otherwise, where |a|
 // and |b| both are |len| words long. It runs in constant time.
 int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len);
@@ -292,7 +307,13 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num);

 uint64_t bn_mont_n0(const BIGNUM *n);
-int bn_mod_exp_base_2_vartime(BIGNUM *r, unsigned p, const BIGNUM *n);
+
+// bn_mod_exp_base_2_consttime calculates r = 2**p (mod n). |p| must be larger
+// than log_2(n); i.e. 2**p must be larger than |n|. |n| must be positive and
+// odd. |p| and the bit width of |n| are assumed public, but |n| is otherwise
+// treated as secret.
+int bn_mod_exp_base_2_consttime(BIGNUM *r, unsigned p, const BIGNUM *n,
+                                BN_CTX *ctx);

 #if defined(OPENSSL_X86_64) && defined(_MSC_VER)
 #define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
@@ -322,6 +343,61 @@ int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
 // otherwise.
 int bn_is_bit_set_words(const BN_ULONG *a, size_t num, unsigned bit);

+// bn_one_to_montgomery sets |r| to one in Montgomery form. It returns one on
+// success and zero on error. This function treats the bit width of the modulus
+// as public.
+int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx);
+
+// bn_less_than_montgomery_R returns one if |bn| is less than the Montgomery R
+// value for |mont| and zero otherwise.
+int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont);
+
+
+// Fixed-width arithmetic.
+//
+// The following functions implement non-modular arithmetic in constant-time
+// and pessimally set |r->width| to the largest possible word size.
+//
+// Note this means that, e.g., repeatedly multiplying by one will cause widths
+// to increase without bound. The corresponding public API functions minimize
+// their outputs to avoid regressing calculator consumers.
+
+// bn_uadd_fixed behaves like |BN_uadd|, but it pessimally sets
+// |r->width| = |a->width| + |b->width| + 1.
+int bn_uadd_fixed(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+
+// bn_mul_fixed behaves like |BN_mul|, but it rejects negative inputs and
+// pessimally sets |r->width| to |a->width| + |b->width|, to avoid leaking
+// information about |a| and |b|.
+int bn_mul_fixed(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+
+// bn_sqr_fixed behaves like |BN_sqr|, but it pessimally sets |r->width| to
+// 2*|a->width|, to avoid leaking information about |a|.
+int bn_sqr_fixed(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
+
+
+// Constant-time modular arithmetic.
+//
+// The following functions implement basic constant-time modular arithmetic on
+// word arrays.
+
+// bn_mod_add_quick_ctx acts like |BN_mod_add_quick| but takes a |BN_CTX|.
+int bn_mod_add_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                         const BIGNUM *m, BN_CTX *ctx);
+
+// bn_mod_sub_quick_ctx acts like |BN_mod_sub_quick| but takes a |BN_CTX|.
+int bn_mod_sub_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                         const BIGNUM *m, BN_CTX *ctx);
+
+// bn_mod_lshift1_quick_ctx acts like |BN_mod_lshift1_quick| but takes a
+// |BN_CTX|.
+int bn_mod_lshift1_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
+                             BN_CTX *ctx);
+
+// bn_mod_lshift_quick_ctx acts like |BN_mod_lshift_quick| but takes a |BN_CTX|.
+int bn_mod_lshift_quick_ctx(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+                            BN_CTX *ctx);
+

 // Low-level operations for small numbers.
 //
@@ -368,6 +444,13 @@ int bn_to_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
 int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                             size_t num_a, const BN_MONT_CTX *mont);

+// bn_one_to_montgomery_small sets |r| to one in Montgomery form. It returns one
+// on success and zero on error. |num_r| must be the length of the modulus,
+// which is |mont->N.width|. This function treats the bit width of the modulus
+// as public.
+int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
+                               const BN_MONT_CTX *mont);
+
 // bn_mod_mul_montgomery_small sets |r| to |a| * |b| mod |mont->N|. Both inputs
 // and outputs are in the Montgomery domain. |num_r| must be the length of the
 // modulus, which is |mont->N.top|. This function returns one on success and
@@ -58,7 +58,7 @@


 // least significant word
-#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
+#define BN_lsw(n) (((n)->width == 0) ? (BN_ULONG) 0 : (n)->d[0])

 int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  // In 'tab', only odd-indexed entries are relevant:
@@ -126,10 +126,6 @@
 #define OPENSSL_BN_ASM_MONT
 #endif

-static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
-                                          const BIGNUM *b,
-                                          const BN_MONT_CTX *mont, BN_CTX *ctx);
-

 BN_MONT_CTX *BN_MONT_CTX_new(void) {
  BN_MONT_CTX *ret = OPENSSL_malloc(sizeof(BN_MONT_CTX));
@@ -193,6 +189,10 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
    OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
    return 0;
  }
+  // |mont->N| is always stored minimally. Computing RR efficiently leaks the
+  // size of the modulus. While the modulus may be private in RSA (one of the
+  // primes), their sizes are public, so this is fine.
+  bn_set_minimal_width(&mont->N);

  // Find n0 such that n0 * N == -1 (mod r).
  //
@@ -200,7 +200,7 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
  // others, we could use a shorter R value and use faster |BN_ULONG|-based
  // math instead of |uint64_t|-based math, which would be double-precision.
  // However, currently only the assembler files know which is which.
-  uint64_t n0 = bn_mont_n0(mod);
+  uint64_t n0 = bn_mont_n0(&mont->N);
  mont->n0[0] = (BN_ULONG)n0;
 #if BN_MONT_CTX_N0_LIMBS == 2
  mont->n0[1] = (BN_ULONG)(n0 >> BN_BITS2);
@@ -208,19 +208,34 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
  mont->n0[1] = 0;
 #endif

+  BN_CTX *new_ctx = NULL;
+  if (ctx == NULL) {
+    new_ctx = BN_CTX_new();
+    if (new_ctx == NULL) {
+      return 0;
+    }
+    ctx = new_ctx;
+  }
+
  // Save RR = R**2 (mod N). R is the smallest power of 2**BN_BITS2 such that R
  // > mod. Even though the assembly on some 32-bit platforms works with 64-bit
  // values, using |BN_BITS2| here, rather than |BN_MONT_CTX_N0_LIMBS *
  // BN_BITS2|, is correct because R**2 will still be a multiple of the latter
  // as |BN_MONT_CTX_N0_LIMBS| is either one or two.
-  //
-  // XXX: This is not constant time with respect to |mont->N|, but it should be.
-  unsigned lgBigR = (BN_num_bits(mod) + (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
-  if (!bn_mod_exp_base_2_vartime(&mont->RR, lgBigR * 2, &mont->N)) {
-    return 0;
-  }
+  unsigned lgBigR = mont->N.width * BN_BITS2;
+  int ok = bn_mod_exp_base_2_consttime(&mont->RR, lgBigR * 2, &mont->N, ctx);
+  BN_CTX_free(new_ctx);
+  return ok;
+}

-  return 1;
+BN_MONT_CTX *BN_MONT_CTX_new_for_modulus(const BIGNUM *mod, BN_CTX *ctx) {
+  BN_MONT_CTX *mont = BN_MONT_CTX_new();
+  if (mont == NULL ||
+      !BN_MONT_CTX_set(mont, mod, ctx)) {
+    BN_MONT_CTX_free(mont);
+    return NULL;
+  }
+  return mont;
 }

 int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
@@ -234,25 +249,12 @@ int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
  }

  CRYPTO_MUTEX_lock_write(lock);
-  ctx = *pmont;
-  if (ctx) {
-    goto out;
+  if (*pmont == NULL) {
+    *pmont = BN_MONT_CTX_new_for_modulus(mod, bn_ctx);
  }
-
-  ctx = BN_MONT_CTX_new();
-  if (ctx == NULL) {
-    goto out;
-  }
-  if (!BN_MONT_CTX_set(ctx, mod, bn_ctx)) {
-    BN_MONT_CTX_free(ctx);
-    ctx = NULL;
-    goto out;
-  }
-  *pmont = ctx;
-
-out:
+  const int ok = *pmont != NULL;
  CRYPTO_MUTEX_unlock_write(lock);
-  return ctx != NULL;
+  return ok;
 }

 int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
@@ -263,7 +265,7 @@ int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
 static int bn_from_montgomery_in_place(BN_ULONG *r, size_t num_r, BN_ULONG *a,
                                       size_t num_a, const BN_MONT_CTX *mont) {
  const BN_ULONG *n = mont->N.d;
-  size_t num_n = mont->N.top;
+  size_t num_n = mont->N.width;
  if (num_r != num_n || num_a != 2 * num_n) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
@@ -304,32 +306,26 @@ static int bn_from_montgomery_in_place(BN_ULONG *r, size_t num_r, BN_ULONG *a,

 static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,
                                   const BN_MONT_CTX *mont) {
+  if (r->neg) {
+    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
+    return 0;
+  }
+
  const BIGNUM *n = &mont->N;
-  if (n->top == 0) {
-    ret->top = 0;
+  if (n->width == 0) {
+    ret->width = 0;
    return 1;
  }

-  int max = (2 * n->top);  // carry is stored separately
-  if (!bn_wexpand(r, max) ||
-      !bn_wexpand(ret, n->top)) {
+  int max = 2 * n->width;  // carry is stored separately
+  if (!bn_resize_words(r, max) ||
+      !bn_wexpand(ret, n->width)) {
    return 0;
  }
-  // Clear the top words of |r|.
-  if (max > r->top) {
-    OPENSSL_memset(r->d + r->top, 0, (max - r->top) * sizeof(BN_ULONG));
-  }
-  r->top = max;
-  ret->top = n->top;

-  if (!bn_from_montgomery_in_place(ret->d, ret->top, r->d, r->top, mont)) {
-    return 0;
-  }
-  ret->neg = r->neg;
-
-  bn_correct_top(r);
-  bn_correct_top(ret);
-  return 1;
+  ret->width = n->width;
+  ret->neg = 0;
+  return bn_from_montgomery_in_place(ret->d, ret->width, r->d, r->width, mont);
 }

 int BN_from_montgomery(BIGNUM *r, const BIGNUM *a, const BN_MONT_CTX *mont,
@@ -352,35 +348,24 @@ err:
  return ret;
 }

-int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
-                          const BN_MONT_CTX *mont, BN_CTX *ctx) {
-#if !defined(OPENSSL_BN_ASM_MONT)
-  return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx);
-#else
-  int num = mont->N.top;
-
-  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
-  if (num < (128 / BN_BITS2) ||
-      a->top != num ||
-      b->top != num) {
-    return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx);
+int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx) {
+  // If the high bit of |n| is set, R = 2^(width*BN_BITS2) < 2 * |n|, so we
+  // compute R - |n| rather than perform Montgomery reduction.
+  const BIGNUM *n = &mont->N;
+  if (n->width > 0 && (n->d[n->width - 1] >> (BN_BITS2 - 1)) != 0) {
+    if (!bn_wexpand(r, n->width)) {
+      return 0;
+    }
+    r->d[0] = 0 - n->d[0];
+    for (int i = 1; i < n->width; i++) {
+      r->d[i] = ~n->d[i];
+    }
+    r->width = n->width;
+    r->neg = 0;
+    return 1;
  }

-  if (!bn_wexpand(r, num)) {
-    return 0;
-  }
-  if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
-    // The check above ensures this won't happen.
-    assert(0);
-    OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
-    return 0;
-  }
-  r->neg = a->neg ^ b->neg;
-  r->top = num;
-  bn_correct_top(r);
-
-  return 1;
-#endif
+  return BN_from_montgomery(r, &mont->RR, mont, ctx);
 }

 static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
@@ -396,11 +381,11 @@ static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
  }

  if (a == b) {
-    if (!BN_sqr(tmp, a, ctx)) {
+    if (!bn_sqr_fixed(tmp, a, ctx)) {
      goto err;
    }
  } else {
-    if (!BN_mul(tmp, a, b, ctx)) {
+    if (!bn_mul_fixed(tmp, a, b, ctx)) {
      goto err;
    }
  }
@@ -417,15 +402,51 @@ err:
  return ret;
 }

+int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                          const BN_MONT_CTX *mont, BN_CTX *ctx) {
+  if (a->neg || b->neg) {
+    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
+    return 0;
+  }
+
+#if defined(OPENSSL_BN_ASM_MONT)
+  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
+  int num = mont->N.width;
+  if (num >= (128 / BN_BITS2) &&
+      a->width == num &&
+      b->width == num) {
+    if (!bn_wexpand(r, num)) {
+      return 0;
+    }
+    if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
+      // The check above ensures this won't happen.
+      assert(0);
+      OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
+      return 0;
+    }
+    r->neg = 0;
+    r->width = num;
+    return 1;
+  }
+#endif
+
+  return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx);
+}
+
+int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont) {
+  return !BN_is_negative(bn) &&
+         bn_fits_in_words(bn, mont->N.width);
+}
+
 int bn_to_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                           size_t num_a, const BN_MONT_CTX *mont) {
  return bn_mod_mul_montgomery_small(r, num_r, a, num_a, mont->RR.d,
-                                     mont->RR.top, mont);
+                                     mont->RR.width, mont);
 }

 int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                             size_t num_a, const BN_MONT_CTX *mont) {
-  size_t num_n = mont->N.top;
+  size_t num_n = mont->N.width;
  if (num_a > 2 * num_n || num_r != num_n || num_n > BN_SMALL_MAX_WORDS) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
@@ -439,10 +460,32 @@ int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
  return ret;
 }

+int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
+                               const BN_MONT_CTX *mont) {
+  const BN_ULONG *n = mont->N.d;
+  size_t num_n = mont->N.width;
+  if (num_n == 0 || num_r != num_n) {
+    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+    return 0;
+  }
+
+  // If the high bit of |n| is set, R = 2^(num_n*BN_BITS2) < 2 * |n|, so we
+  // compute R - |n| rather than perform Montgomery reduction.
+  if (num_n > 0 && (n[num_n - 1] >> (BN_BITS2 - 1)) != 0) {
+    r[0] = 0 - n[0];
+    for (size_t i = 1; i < num_n; i++) {
+      r[i] = ~n[i];
+    }
+    return 1;
+  }
+
+  return bn_from_montgomery_small(r, num_r, mont->RR.d, mont->RR.width, mont);
+}
+
 int bn_mod_mul_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                                size_t num_a, const BN_ULONG *b, size_t num_b,
                                const BN_MONT_CTX *mont) {
-  size_t num_n = mont->N.top;
+  size_t num_n = mont->N.width;
  if (num_r != num_n || num_a + num_b > 2 * num_n ||
      num_n > BN_SMALL_MAX_WORDS) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
@@ -71,7 +71,7 @@ uint64_t bn_mont_n0(const BIGNUM *n) {
  // |BN_MONT_CTX_N0_LIMBS| limbs of |n|.
  uint64_t n_mod_r = n->d[0];
 #if BN_MONT_CTX_N0_LIMBS == 2
-  if (n->top > 1) {
+  if (n->width > 1) {
    n_mod_r |= (uint64_t)n->d[1] << BN_BITS2;
  }
 #endif
@@ -159,10 +159,8 @@ static uint64_t bn_neg_inv_mod_r_u64(uint64_t n) {
  return v;
 }

-// bn_mod_exp_base_2_vartime calculates r = 2**p (mod n). |p| must be larger
-// than log_2(n); i.e. 2**p must be larger than |n|. |n| must be positive and
-// odd.
-int bn_mod_exp_base_2_vartime(BIGNUM *r, unsigned p, const BIGNUM *n) {
+int bn_mod_exp_base_2_consttime(BIGNUM *r, unsigned p, const BIGNUM *n,
+                                BN_CTX *ctx) {
  assert(!BN_is_zero(n));
  assert(!BN_is_negative(n));
  assert(BN_is_odd(n));
@@ -171,37 +169,17 @@ int bn_mod_exp_base_2_vartime(BIGNUM *r, unsigned p, const BIGNUM *n) {

  unsigned n_bits = BN_num_bits(n);
  assert(n_bits != 0);
+  assert(p > n_bits);
  if (n_bits == 1) {
    return 1;
  }

-  // Set |r| to the smallest power of two larger than |n|.
-  assert(p > n_bits);
-  if (!BN_set_bit(r, n_bits)) {
+  // Set |r| to the largest power of two smaller than |n|, then shift with
+  // reductions the rest of the way.
+  if (!BN_set_bit(r, n_bits - 1) ||
+      !bn_mod_lshift_quick_ctx(r, r, p - (n_bits - 1), n, ctx)) {
    return 0;
  }

-  // Unconditionally reduce |r|.
-  assert(BN_cmp(r, n) > 0);
-  if (!BN_usub(r, r, n)) {
-    return 0;
-  }
-  assert(BN_cmp(r, n) < 0);
-
-  for (unsigned i = n_bits; i < p; ++i) {
-    // This is like |BN_mod_lshift1_quick| except using |BN_usub|.
-    //
-    // TODO: Replace this with the use of a constant-time variant of
-    // |BN_mod_lshift1_quick|.
-    if (!BN_lshift1(r, r)) {
-      return 0;
-    }
-    if (BN_cmp(r, n) >= 0) {
-      if (!BN_usub(r, r, n)) {
-        return 0;
-      }
-    }
-  }
-
  return 1;
 }
@@ -61,6 +61,7 @@

 #include <openssl/err.h>
 #include <openssl/mem.h>
+#include <openssl/type_check.h>

 #include "internal.h"
 #include "../../internal.h"
@@ -70,6 +71,13 @@
 #define BN_SQR_RECURSIVE_SIZE_NORMAL BN_MUL_RECURSIVE_SIZE_NORMAL


+static void bn_abs_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                             size_t num, BN_ULONG *tmp) {
+  BN_ULONG borrow = bn_sub_words(tmp, a, b, num);
+  bn_sub_words(r, b, a, num);
+  bn_select_words(r, 0 - borrow, r /* tmp < 0 */, tmp /* tmp >= 0 */, num);
+}
+
 static void bn_mul_normal(BN_ULONG *r, const BN_ULONG *a, size_t na,
                          const BN_ULONG *b, size_t nb) {
  if (na < nb) {
@@ -279,25 +287,43 @@ BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                           int cl, int dl);
 #endif

+// bn_abs_sub_part_words computes |r| = |a| - |b|, storing the absolute value
+// and returning a mask of all ones if the result was negative and all zeros if
+// the result was positive. |cl| and |dl| follow the |bn_sub_part_words| calling
+// convention.
+//
+// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
+// is confusing. The trouble is 32-bit x86 implements |bn_sub_part_words| in
+// assembly, but we can probably just delete it?
+static BN_ULONG bn_abs_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
+                                      const BN_ULONG *b, int cl, int dl,
+                                      BN_ULONG *tmp) {
+  BN_ULONG borrow = bn_sub_part_words(tmp, a, b, cl, dl);
+  bn_sub_part_words(r, b, a, cl, -dl);
+  int r_len = cl + (dl < 0 ? -dl : dl);
+  borrow = 0 - borrow;
+  bn_select_words(r, borrow, r /* tmp < 0 */, tmp /* tmp >= 0 */, r_len);
+  return borrow;
+}
+
 // Karatsuba recursive multiplication algorithm
 // (cf. Knuth, The Art of Computer Programming, Vol. 2)

-// r is 2*n2 words in size,
-// a and b are both n2 words in size.
-// n2 must be a power of 2.
-// We multiply and return the result.
-// t must be 2*n2 words in size
-// We calculate
-// a[0]*b[0]
-// a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
-// a[1]*b[1]
-// dnX may not be positive, but n2/2+dnX has to be
+// bn_mul_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r| has
+// length 2*|n2|, |a| has length |n2| + |dna|, |b| has length |n2| + |dnb|, and
+// |t| has length 4*|n2|. |n2| must be a power of two. Finally, we must have
+// -|BN_MUL_RECURSIVE_SIZE_NORMAL|/2 <= |dna| <= 0 and
+// -|BN_MUL_RECURSIVE_SIZE_NORMAL|/2 <= |dnb| <= 0.
+//
+// TODO(davidben): Simplify and |size_t| the calling convention around lengths
+// here.
 static void bn_mul_recursive(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                             int n2, int dna, int dnb, BN_ULONG *t) {
-  int n = n2 / 2, c1, c2;
-  int tna = n + dna, tnb = n + dnb;
-  unsigned int neg, zero;
-  BN_ULONG ln, lo, *p;
+  // |n2| is a power of two.
+  assert(n2 != 0 && (n2 & (n2 - 1)) == 0);
+  // Check |dna| and |dnb| are in range.
+  assert(-BN_MUL_RECURSIVE_SIZE_NORMAL/2 <= dna && dna <= 0);
+  assert(-BN_MUL_RECURSIVE_SIZE_NORMAL/2 <= dnb && dnb <= 0);

  // Only call bn_mul_comba 8 if n2 == 8 and the
  // two arrays are complete [steve]
@@ -309,276 +335,212 @@ static void bn_mul_recursive(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
  // Else do normal multiply
  if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) {
    bn_mul_normal(r, a, n2 + dna, b, n2 + dnb);
-    if ((dna + dnb) < 0) {
+    if (dna + dnb < 0) {
      OPENSSL_memset(&r[2 * n2 + dna + dnb], 0,
                     sizeof(BN_ULONG) * -(dna + dnb));
    }
    return;
  }

-  // r=(a[0]-a[1])*(b[1]-b[0])
-  c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
-  c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
-  zero = neg = 0;
-  switch (c1 * 3 + c2) {
-    case -4:
-      bn_sub_part_words(t, &(a[n]), a, tna, tna - n);        // -
-      bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb);  // -
-      break;
-    case -3:
-      zero = 1;
-      break;
-    case -2:
-      bn_sub_part_words(t, &(a[n]), a, tna, tna - n);        // -
-      bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);  // +
-      neg = 1;
-      break;
-    case -1:
-    case 0:
-    case 1:
-      zero = 1;
-      break;
-    case 2:
-      bn_sub_part_words(t, a, &(a[n]), tna, n - tna);        // +
-      bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb);  // -
-      neg = 1;
-      break;
-    case 3:
-      zero = 1;
-      break;
-    case 4:
-      bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
-      bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
-      break;
-  }
+  // Split |a| and |b| into a0,a1 and b0,b1, where a0 and b0 have size |n|.
+  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
+  // for recursive calls.
+  // Split |r| into r0,r1,r2,r3. We must contribute a0*b0 to r0,r1, a0*b1+a1*b0
+  // to r1,r2, and a1*b1 to r2,r3. The middle term we will compute as:
+  //
+  //   a0*b1 + a1*b0 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0
+  //
+  // Note that we know |n| >= |BN_MUL_RECURSIVE_SIZE_NORMAL|/2 above, so
+  // |tna| and |tnb| are non-negative.
+  int n = n2 / 2, tna = n + dna, tnb = n + dnb;

+  // t0 = a0 - a1 and t1 = b1 - b0. The result will be multiplied, so we XOR
+  // their sign masks, giving the sign of (a0 - a1)*(b1 - b0). t0 and t1
+  // themselves store the absolute value.
+  BN_ULONG neg = bn_abs_sub_part_words(t, a, &a[n], tna, n - tna, &t[n2]);
+  neg ^= bn_abs_sub_part_words(&t[n], &b[n], b, tnb, tnb - n, &t[n2]);
+
+  // Compute:
+  // t2,t3 = t0 * t1 = |(a0 - a1)*(b1 - b0)|
+  // r0,r1 = a0 * b0
+  // r2,r3 = a1 * b1
  if (n == 4 && dna == 0 && dnb == 0) {
-    // XXX: bn_mul_comba4 could take extra args to do this well
-    if (!zero) {
-      bn_mul_comba4(&(t[n2]), t, &(t[n]));
-    } else {
-      OPENSSL_memset(&(t[n2]), 0, 8 * sizeof(BN_ULONG));
-    }
+    bn_mul_comba4(&t[n2], t, &t[n]);

    bn_mul_comba4(r, a, b);
-    bn_mul_comba4(&(r[n2]), &(a[n]), &(b[n]));
+    bn_mul_comba4(&r[n2], &a[n], &b[n]);
  } else if (n == 8 && dna == 0 && dnb == 0) {
-    // XXX: bn_mul_comba8 could take extra args to do this well
-    if (!zero) {
-      bn_mul_comba8(&(t[n2]), t, &(t[n]));
-    } else {
-      OPENSSL_memset(&(t[n2]), 0, 16 * sizeof(BN_ULONG));
-    }
+    bn_mul_comba8(&t[n2], t, &t[n]);

    bn_mul_comba8(r, a, b);
-    bn_mul_comba8(&(r[n2]), &(a[n]), &(b[n]));
+    bn_mul_comba8(&r[n2], &a[n], &b[n]);
  } else {
-    p = &(t[n2 * 2]);
-    if (!zero) {
-      bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
-    } else {
-      OPENSSL_memset(&(t[n2]), 0, n2 * sizeof(BN_ULONG));
-    }
+    BN_ULONG *p = &t[n2 * 2];
+    bn_mul_recursive(&t[n2], t, &t[n], n, 0, 0, p);
    bn_mul_recursive(r, a, b, n, 0, 0, p);
-    bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), n, dna, dnb, p);
+    bn_mul_recursive(&r[n2], &a[n], &b[n], n, dna, dnb, p);
  }

-  // t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
-  // r[10] holds (a[0]*b[0])
-  // r[32] holds (b[1]*b[1])
+  // t0,t1,c = r0,r1 + r2,r3 = a0*b0 + a1*b1
+  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);

-  c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
+  // t2,t3,c = t0,t1,c + neg*t2,t3 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0.
+  // The second term is stored as the absolute value, so we do this with a
+  // constant-time select.
+  BN_ULONG c_neg = c - bn_sub_words(&t[n2 * 2], t, &t[n2], n2);
+  BN_ULONG c_pos = c + bn_add_words(&t[n2], t, &t[n2], n2);
+  bn_select_words(&t[n2], neg, &t[n2 * 2], &t[n2], n2);
+  OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
+                         crypto_word_t_too_small);
+  c = constant_time_select_w(neg, c_neg, c_pos);

-  if (neg) {
-    // if t[32] is negative
-    c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
-  } else {
-    // Might have a carry
-    c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
+  // We now have our three components. Add them together.
+  // r1,r2,c = r1,r2 + t2,t3,c
+  c += bn_add_words(&r[n], &r[n], &t[n2], n2);
+
+  // Propagate the carry bit to the end.
+  for (int i = n + n2; i < n2 + n2; i++) {
+    BN_ULONG old = r[i];
+    r[i] = old + c;
+    c = r[i] < old;
  }

-  // t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
-  // r[10] holds (a[0]*b[0])
-  // r[32] holds (b[1]*b[1])
-  // c1 holds the carry bits
-  c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
-  if (c1) {
-    p = &(r[n + n2]);
-    lo = *p;
-    ln = lo + c1;
-    *p = ln;
-
-    // The overflow will stop before we over write
-    // words we should not overwrite
-    if (ln < (BN_ULONG)c1) {
-      do {
-        p++;
-        lo = *p;
-        ln = lo + 1;
-        *p = ln;
-      } while (ln == 0);
-    }
-  }
+  // The product should fit without carries.
+  assert(c == 0);
 }

-// n+tn is the word length
-// t needs to be n*4 is size, as does r
-// tnX may not be negative but less than n
+// bn_mul_part_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r|
+// has length 4*|n|, |a| has length |n| + |tna|, |b| has length |n| + |tnb|, and
+// |t| has length 8*|n|. |n| must be a power of two. Additionally, we must have
+// 0 <= tna < n and 0 <= tnb < n, and |tna| and |tnb| must differ by at most
+// one.
+//
+// TODO(davidben): Make this take |size_t| and perhaps the actual lengths of |a|
+// and |b|.
 static void bn_mul_part_recursive(BN_ULONG *r, const BN_ULONG *a,
                                  const BN_ULONG *b, int n, int tna, int tnb,
                                  BN_ULONG *t) {
-  int i, j, n2 = n * 2;
-  int c1, c2, neg;
-  BN_ULONG ln, lo, *p;
+  // |n| is a power of two.
+  assert(n != 0 && (n & (n - 1)) == 0);
+  // Check |tna| and |tnb| are in range.
+  assert(0 <= tna && tna < n);
+  assert(0 <= tnb && tnb < n);
+  assert(-1 <= tna - tnb && tna - tnb <= 1);

+  int n2 = n * 2;
  if (n < 8) {
    bn_mul_normal(r, a, n + tna, b, n + tnb);
+    OPENSSL_memset(r + n2 + tna + tnb, 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
    return;
  }

-  // r=(a[0]-a[1])*(b[1]-b[0])
-  c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
-  c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
-  neg = 0;
-  switch (c1 * 3 + c2) {
-    case -4:
-      bn_sub_part_words(t, &(a[n]), a, tna, tna - n);        // -
-      bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb);  // -
-      break;
-    case -3:
-      // break;
-    case -2:
-      bn_sub_part_words(t, &(a[n]), a, tna, tna - n);        // -
-      bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);  // +
-      neg = 1;
-      break;
-    case -1:
-    case 0:
-    case 1:
-      // break;
-    case 2:
-      bn_sub_part_words(t, a, &(a[n]), tna, n - tna);        // +
-      bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb);  // -
-      neg = 1;
-      break;
-    case 3:
-      // break;
-    case 4:
-      bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
-      bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
-      break;
-  }
+  // Split |a| and |b| into a0,a1 and b0,b1, where a0 and b0 have size |n|. |a1|
+  // and |b1| have size |tna| and |tnb|, respectively.
+  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
+  // for recursive calls.
+  // Split |r| into r0,r1,r2,r3. We must contribute a0*b0 to r0,r1, a0*b1+a1*b0
+  // to r1,r2, and a1*b1 to r2,r3. The middle term we will compute as:
+  //
+  //   a0*b1 + a1*b0 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0

+  // t0 = a0 - a1 and t1 = b1 - b0. The result will be multiplied, so we XOR
+  // their sign masks, giving the sign of (a0 - a1)*(b1 - b0). t0 and t1
+  // themselves store the absolute value.
+  BN_ULONG neg = bn_abs_sub_part_words(t, a, &a[n], tna, n - tna, &t[n2]);
+  neg ^= bn_abs_sub_part_words(&t[n], &b[n], b, tnb, tnb - n, &t[n2]);
+
+  // Compute:
+  // t2,t3 = t0 * t1 = |(a0 - a1)*(b1 - b0)|
+  // r0,r1 = a0 * b0
+  // r2,r3 = a1 * b1
  if (n == 8) {
-    bn_mul_comba8(&(t[n2]), t, &(t[n]));
+    bn_mul_comba8(&t[n2], t, &t[n]);
    bn_mul_comba8(r, a, b);
-    bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
-    OPENSSL_memset(&(r[n2 + tna + tnb]), 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
-  } else {
-    p = &(t[n2 * 2]);
-    bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
-    bn_mul_recursive(r, a, b, n, 0, 0, p);
-    i = n / 2;
-    // If there is only a bottom half to the number,
-    // just do it
-    if (tna > tnb) {
-      j = tna - i;
-    } else {
-      j = tnb - i;
-    }

-    if (j == 0) {
-      bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i, p);
-      OPENSSL_memset(&(r[n2 + i * 2]), 0, sizeof(BN_ULONG) * (n2 - i * 2));
-    } else if (j > 0) {
-      // eg, n == 16, i == 8 and tn == 11
-      bn_mul_part_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i, p);
-      OPENSSL_memset(&(r[n2 + tna + tnb]), 0,
-                     sizeof(BN_ULONG) * (n2 - tna - tnb));
+    bn_mul_normal(&r[n2], &a[n], tna, &b[n], tnb);
+    // |bn_mul_normal| only writes |tna| + |tnb| words. Zero the rest.
+    OPENSSL_memset(&r[n2 + tna + tnb], 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
+  } else {
+    BN_ULONG *p = &t[n2 * 2];
+    bn_mul_recursive(&t[n2], t, &t[n], n, 0, 0, p);
+    bn_mul_recursive(r, a, b, n, 0, 0, p);
+
+    OPENSSL_memset(&r[n2], 0, sizeof(BN_ULONG) * n2);
+    if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL &&
+        tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) {
+      bn_mul_normal(&r[n2], &a[n], tna, &b[n], tnb);
    } else {
-      // (j < 0) eg, n == 16, i == 8 and tn == 5
-      OPENSSL_memset(&(r[n2]), 0, sizeof(BN_ULONG) * n2);
-      if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL &&
-          tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) {
-        bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
-      } else {
-        for (;;) {
-          i /= 2;
-          // these simplified conditions work
-          // exclusively because difference
-          // between tna and tnb is 1 or 0
-          if (i < tna || i < tnb) {
-            bn_mul_part_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i,
-                                  tnb - i, p);
-            break;
-          } else if (i == tna || i == tnb) {
-            bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i,
-                             p);
-            break;
-          }
+      int i = n;
+      for (;;) {
+        i /= 2;
+        if (i < tna || i < tnb) {
+          // E.g., n == 16, i == 8 and tna == 11. |tna| and |tnb| are within one
+          // of each other, so if |tna| is larger and tna > i, then we know
+          // tnb >= i, and this call is valid.
+          bn_mul_part_recursive(&r[n2], &a[n], &b[n], i, tna - i, tnb - i, p);
+          break;
        }
+        if (i == tna || i == tnb) {
+          // If there is only a bottom half to the number, just do it. We know
+          // the larger of |tna - i| and |tnb - i| is zero. The other is zero or
+          // -1 because |tna| and |tnb| differ by at most one.
+          bn_mul_recursive(&r[n2], &a[n], &b[n], i, tna - i, tnb - i, p);
+          break;
+        }
+
+        // This loop will eventually terminate when |i| falls below
+        // |BN_MUL_RECURSIVE_SIZE_NORMAL| because we know one of |tna| and |tnb|
+        // exceeds that.
      }
    }
  }

-  // t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
-  // r[10] holds (a[0]*b[0])
-  // r[32] holds (b[1]*b[1])
+  // t0,t1,c = r0,r1 + r2,r3 = a0*b0 + a1*b1
+  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);

-  c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
+  // t2,t3,c = t0,t1,c + neg*t2,t3 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0.
+  // The second term is stored as the absolute value, so we do this with a
+  // constant-time select.
+  BN_ULONG c_neg = c - bn_sub_words(&t[n2 * 2], t, &t[n2], n2);
+  BN_ULONG c_pos = c + bn_add_words(&t[n2], t, &t[n2], n2);
+  bn_select_words(&t[n2], neg, &t[n2 * 2], &t[n2], n2);
+  OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
+                         crypto_word_t_too_small);
+  c = constant_time_select_w(neg, c_neg, c_pos);

-  if (neg) {
-    // if t[32] is negative
-    c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
-  } else {
-    // Might have a carry
-    c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
+  // We now have our three components. Add them together.
+  // r1,r2,c = r1,r2 + t2,t3,c
+  c += bn_add_words(&r[n], &r[n], &t[n2], n2);
+
+  // Propagate the carry bit to the end.
+  for (int i = n + n2; i < n2 + n2; i++) {
+    BN_ULONG old = r[i];
+    r[i] = old + c;
+    c = r[i] < old;
  }

-  // t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
-  // r[10] holds (a[0]*b[0])
-  // r[32] holds (b[1]*b[1])
-  // c1 holds the carry bits
-  c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
-  if (c1) {
-    p = &(r[n + n2]);
-    lo = *p;
-    ln = lo + c1;
-    *p = ln;
-
-    // The overflow will stop before we over write
-    // words we should not overwrite
-    if (ln < (BN_ULONG)c1) {
-      do {
-        p++;
-        lo = *p;
-        ln = lo + 1;
-        *p = ln;
-      } while (ln == 0);
-    }
-  }
+  // The product should fit without carries.
+  assert(c == 0);
 }

-int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
-  int ret = 0;
-  int top, al, bl;
-  BIGNUM *rr;
-  int i;
-  BIGNUM *t = NULL;
-  int j = 0, k;
-
-  al = a->top;
-  bl = b->top;
-
-  if ((al == 0) || (bl == 0)) {
+// bn_mul_impl implements |BN_mul| and |bn_mul_fixed|. Note this function breaks
+// |BIGNUM| invariants and may return a negative zero. This is handled by the
+// callers.
+static int bn_mul_impl(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                       BN_CTX *ctx) {
+  int al = a->width;
+  int bl = b->width;
+  if (al == 0 || bl == 0) {
    BN_zero(r);
    return 1;
  }
-  top = al + bl;

+  int ret = 0;
+  BIGNUM *rr;
  BN_CTX_start(ctx);
-  if ((r == a) || (r == b)) {
-    if ((rr = BN_CTX_get(ctx)) == NULL) {
+  if (r == a || r == b) {
+    rr = BN_CTX_get(ctx);  // |r| aliases an input, so compute into scratch.
+    if (rr == NULL) {
      goto err;
    }
  } else {
@@ -586,55 +548,55 @@ int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  }
  rr->neg = a->neg ^ b->neg;

-  i = al - bl;
+  int i = al - bl;
  if (i == 0) {
    if (al == 8) {
      if (!bn_wexpand(rr, 16)) {
        goto err;
      }
-      rr->top = 16;
+      rr->width = 16;
      bn_mul_comba8(rr->d, a->d, b->d);
      goto end;
    }
  }

+  int top = al + bl;
  static const int kMulNormalSize = 16;
  if (al >= kMulNormalSize && bl >= kMulNormalSize) {
-    if (i >= -1 && i <= 1) {
-      /* Find out the power of two lower or equal
-         to the longest of the two numbers */
+    if (-1 <= i && i <= 1) {
+      // Find the largest power of two less than or equal to the larger length.
+      int j;
      if (i >= 0) {
        j = BN_num_bits_word((BN_ULONG)al);
-      }
-      if (i == -1) {
+      } else {
        j = BN_num_bits_word((BN_ULONG)bl);
      }
      j = 1 << (j - 1);
      assert(j <= al || j <= bl);
-      k = j + j;
-      t = BN_CTX_get(ctx);
+      BIGNUM *t = BN_CTX_get(ctx);
      if (t == NULL) {
        goto err;
      }
      if (al > j || bl > j) {
-        if (!bn_wexpand(t, k * 4)) {
-          goto err;
-        }
-        if (!bn_wexpand(rr, k * 4)) {
+        // We know |al| and |bl| are at most one from each other, so if al > j,
+        // bl >= j, and vice versa. Thus we can use |bn_mul_part_recursive|.
+        assert(al >= j && bl >= j);
+        if (!bn_wexpand(t, j * 8) ||
+            !bn_wexpand(rr, j * 4)) {
          goto err;
        }
        bn_mul_part_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
      } else {
-        // al <= j || bl <= j
-        if (!bn_wexpand(t, k * 2)) {
-          goto err;
-        }
-        if (!bn_wexpand(rr, k * 2)) {
+        // al <= j && bl <= j. Additionally, we know j <= al or j <= bl, so one
+        // of al - j or bl - j is zero. The other, by the bound on |i| above, is
+        // zero or -1. Thus, we can use |bn_mul_recursive|.
+        if (!bn_wexpand(t, j * 4) ||
+            !bn_wexpand(rr, j * 2)) {
          goto err;
        }
        bn_mul_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
      }
-      rr->top = top;
+      rr->width = top;
      goto end;
    }
  }
@@ -642,11 +604,10 @@ int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  if (!bn_wexpand(rr, top)) {
    goto err;
  }
-  rr->top = top;
+  rr->width = top;
  bn_mul_normal(rr->d, a->d, al, b->d, bl);

 end:
-  bn_correct_top(rr);
  if (r != rr && !BN_copy(r, rr)) {
    goto err;
  }
@@ -657,6 +618,26 @@ err:
  return ret;
 }

+int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
+  // |bn_mul_impl| can leave |r| as a negative zero and does not trim its
+  // width; normalizing the width on success takes care of both.
+  const int ok = bn_mul_impl(r, a, b, ctx);
+  if (ok) {
+    bn_set_minimal_width(r);
+  }
+  return ok ? 1 : 0;
+}
+
+int bn_mul_fixed(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
+  // Reject negative inputs up front: the result is not normalized here, so a
+  // negative operand could otherwise produce a negative zero.
+  if (!a->neg && !b->neg) {
+    return bn_mul_impl(r, a, b, ctx);
+  }
+  OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
+  return 0;
+}
+
 int bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
                 const BN_ULONG *b, size_t num_b) {
  if (num_r != num_a + num_b) {
@@ -711,25 +692,19 @@ static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, size_t n,
  bn_add_words(r, r, tmp, max);
 }

-// r is 2*n words in size,
-// a and b are both n words in size.    (There's not actually a 'b' here ...)
-// n must be a power of 2.
-// We multiply and return the result.
-// t must be 2*n words in size
-// We calculate
-// a[0]*b[0]
-// a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
-// a[1]*b[1]
-static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2,
+// bn_sqr_recursive sets |r| to |a|^2, using |t| as scratch space. |r| has
+// length 2*|n2|, |a| has length |n2|, and |t| has length 4*|n2|. |n2| must be
+// a power of two.
+static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, size_t n2,
                             BN_ULONG *t) {
-  int n = n2 / 2;
-  int zero, c1;
-  BN_ULONG ln, lo, *p;
+  // |n2| is a power of two.
+  assert(n2 != 0 && (n2 & (n2 - 1)) == 0);

  if (n2 == 4) {
    bn_sqr_comba4(r, a);
    return;
-  } else if (n2 == 8) {
+  }
+  if (n2 == 8) {
    bn_sqr_comba8(r, a);
    return;
  }
@@ -737,63 +712,48 @@ static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2,
    bn_sqr_normal(r, a, n2, t);
    return;
  }
-  // r=(a[0]-a[1])*(a[1]-a[0])
-  c1 = bn_cmp_words(a, &(a[n]), n);
-  zero = 0;
-  if (c1 > 0) {
-    bn_sub_words(t, a, &(a[n]), n);
-  } else if (c1 < 0) {
-    bn_sub_words(t, &(a[n]), a, n);
-  } else {
-    zero = 1;
+
+  // Split |a| into a0,a1, each of size |n|.
+  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
+  // for recursive calls.
+  // Split |r| into r0,r1,r2,r3. We must contribute a0^2 to r0,r1, 2*a0*a1 to
+  // r1,r2, and a1^2 to r2,r3.
+  size_t n = n2 / 2;
+  BN_ULONG *t_recursive = &t[n2 * 2];
+
+  // t0 = |a0 - a1|.
+  bn_abs_sub_words(t, a, &a[n], n, &t[n]);
+  // t2,t3 = t0^2 = |a0 - a1|^2 = a0^2 - 2*a0*a1 + a1^2
+  bn_sqr_recursive(&t[n2], t, n, t_recursive);
+
+  // r0,r1 = a0^2
+  bn_sqr_recursive(r, a, n, t_recursive);
+
+  // r2,r3 = a1^2
+  bn_sqr_recursive(&r[n2], &a[n], n, t_recursive);
+
+  // t0,t1,c = r0,r1 + r2,r3 = a0^2 + a1^2
+  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);
+  // t2,t3,c = t0,t1,c - t2,t3 = 2*a0*a1
+  c -= bn_sub_words(&t[n2], t, &t[n2], n2);
+
+  // We now have our three components. Add them together.
+  // r1,r2,c = r1,r2 + t2,t3,c
+  c += bn_add_words(&r[n], &r[n], &t[n2], n2);
+
+  // Propagate the carry bit to the end.
+  for (size_t i = n + n2; i < n2 + n2; i++) {
+    BN_ULONG old = r[i];
+    r[i] = old + c;
+    c = r[i] < old;
  }

-  // The result will always be negative unless it is zero
-  p = &(t[n2 * 2]);
-
-  if (!zero) {
-    bn_sqr_recursive(&(t[n2]), t, n, p);
-  } else {
-    OPENSSL_memset(&(t[n2]), 0, n2 * sizeof(BN_ULONG));
-  }
-  bn_sqr_recursive(r, a, n, p);
-  bn_sqr_recursive(&(r[n2]), &(a[n]), n, p);
-
-  // t[32] holds (a[0]-a[1])*(a[1]-a[0]), it is negative or zero
-  // r[10] holds (a[0]*b[0])
-  // r[32] holds (b[1]*b[1])
-
-  c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
-
-  // t[32] is negative
-  c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
-
-  // t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1])
-  // r[10] holds (a[0]*a[0])
-  // r[32] holds (a[1]*a[1])
-  // c1 holds the carry bits
-  c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
-  if (c1) {
-    p = &(r[n + n2]);
-    lo = *p;
-    ln = lo + c1;
-    *p = ln;
-
-    // The overflow will stop before we over write
-    // words we should not overwrite
-    if (ln < (BN_ULONG)c1) {
-      do {
-        p++;
-        lo = *p;
-        ln = lo + 1;
-        *p = ln;
-      } while (ln == 0);
-    }
-  }
+  // The square should fit without carries.
+  assert(c == 0);
 }

 int BN_mul_word(BIGNUM *bn, BN_ULONG w) {
-  if (!bn->top) {
+  if (!bn->width) {
    return 1;
  }

@@ -802,37 +762,34 @@ int BN_mul_word(BIGNUM *bn, BN_ULONG w) {
    return 1;
  }

-  BN_ULONG ll = bn_mul_words(bn->d, bn->d, bn->top, w);
+  BN_ULONG ll = bn_mul_words(bn->d, bn->d, bn->width, w);
  if (ll) {
-    if (!bn_wexpand(bn, bn->top + 1)) {
+    if (!bn_wexpand(bn, bn->width + 1)) {
      return 0;
    }
-    bn->d[bn->top++] = ll;
+    bn->d[bn->width++] = ll;
  }

  return 1;
 }

-int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
-  int max, al;
-  int ret = 0;
-  BIGNUM *tmp, *rr;
-
-  al = a->top;
+int bn_sqr_fixed(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
+  int al = a->width;
  if (al <= 0) {
-    r->top = 0;
+    r->width = 0;
    r->neg = 0;
    return 1;
  }

+  int ret = 0;
  BN_CTX_start(ctx);
-  rr = (a != r) ? r : BN_CTX_get(ctx);
-  tmp = BN_CTX_get(ctx);
+  BIGNUM *rr = (a != r) ? r : BN_CTX_get(ctx);
+  BIGNUM *tmp = BN_CTX_get(ctx);
  if (!rr || !tmp) {
    goto err;
  }

-  max = 2 * al;  // Non-zero (from above)
+  int max = 2 * al;  // Non-zero (from above)
  if (!bn_wexpand(rr, max)) {
    goto err;
  }
@@ -846,13 +803,9 @@ int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
      BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL * 2];
      bn_sqr_normal(rr->d, a->d, al, t);
    } else {
-      int j, k;
-
-      j = BN_num_bits_word((BN_ULONG)al);
-      j = 1 << (j - 1);
-      k = j + j;
-      if (al == j) {
-        if (!bn_wexpand(tmp, k * 2)) {
+      // If |al| is a power of two, we can use |bn_sqr_recursive|.
+      if (al != 0 && (al & (al - 1)) == 0) {
+        if (!bn_wexpand(tmp, al * 4)) {
          goto err;
        }
        bn_sqr_recursive(rr->d, a->d, al, tmp->d);
@@ -866,13 +819,7 @@ int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
  }

  rr->neg = 0;
-  // If the most-significant half of the top word of 'a' is zero, then
-  // the square of 'a' will max-1 words.
-  if (a->d[al - 1] == (a->d[al - 1] & BN_MASK2l)) {
-    rr->top = max - 1;
-  } else {
-    rr->top = max;
-  }
+  rr->width = max;

  if (rr != r && !BN_copy(r, rr)) {
    goto err;
@@ -884,6 +831,15 @@ err:
  return ret;
 }

+int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
+  // |bn_sqr_fixed| does not trim |r|; reduce it to the minimal width on success.
+  const int ok = bn_sqr_fixed(r, a, ctx);
+  if (ok) {
+    bn_set_minimal_width(r);
+  }
+  return ok ? 1 : 0;
+}
+
 int bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) {
  if (num_r != 2 * num_a || num_a > BN_SMALL_MAX_WORDS) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
@@ -586,9 +586,8 @@ int BN_enhanced_miller_rabin_primality_test(
  }

  // Montgomery setup for computations mod A
-  mont = BN_MONT_CTX_new();
-  if (mont == NULL ||
-      !BN_MONT_CTX_set(mont, w, ctx)) {
+  mont = BN_MONT_CTX_new_for_modulus(w, ctx);
+  if (mont == NULL) {
    goto err;
  }

@@ -278,15 +278,14 @@ int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive,

 int BN_rand_range_ex(BIGNUM *r, BN_ULONG min_inclusive,
                     const BIGNUM *max_exclusive) {
-  if (!bn_wexpand(r, max_exclusive->top) ||
+  if (!bn_wexpand(r, max_exclusive->width) ||
      !bn_rand_range_words(r->d, min_inclusive, max_exclusive->d,
-                           max_exclusive->top, kDefaultAdditionalData)) {
+                           max_exclusive->width, kDefaultAdditionalData)) {
    return 0;
  }

  r->neg = 0;
-  r->top = max_exclusive->top;
-  bn_correct_top(r);
+  r->width = max_exclusive->width;
  return 1;
 }

@@ -1,44 +1,16 @@
-/*****************************************************************************
-*                                                                            *
-*  Copyright (c) 2012, Intel Corporation                                     *
-*                                                                            *
-*  All rights reserved.                                                      *
-*                                                                            *
-*  Redistribution and use in source and binary forms, with or without        *
-*  modification, are permitted provided that the following conditions are    *
-*  met:                                                                      *
-*                                                                            *
-*  *  Redistributions of source code must retain the above copyright         *
-*     notice, this list of conditions and the following disclaimer.          *
-*                                                                            *
-*  *  Redistributions in binary form must reproduce the above copyright      *
-*     notice, this list of conditions and the following disclaimer in the    *
-*     documentation and/or other materials provided with the                 *
-*     distribution.                                                          *
-*                                                                            *
-*  *  Neither the name of the Intel Corporation nor the names of its         *
-*     contributors may be used to endorse or promote products derived from   *
-*     this software without specific prior written permission.               *
-*                                                                            *
-*                                                                            *
-*  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          *
-*  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         *
-*  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        *
-*  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            *
-*  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
-*  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
-*  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
-*  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
-*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
-*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
-*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
-*                                                                            *
-******************************************************************************
-* Developers and authors:                                                    *
-* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
-* (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
-* (2) University of Haifa, Israel                                            *
-*****************************************************************************/
+/*
+ * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2012, Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+ * (1) Intel Corporation, Israel Development Center, Haifa, Israel
+ * (2) University of Haifa, Israel
+ */

 #include <openssl/base.h>

@@ -51,204 +23,209 @@
 #include "../../internal.h"


-/*
- * See crypto/bn/asm/rsaz-avx2.pl for further details.
- */
-void rsaz_1024_norm2red_avx2(void *red,const void *norm);
-void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,BN_ULONG k);
-void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,BN_ULONG k,int cnt);
-void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i);
-void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i);
-void rsaz_1024_red2norm_avx2(void *norm,const void *red);
+// See crypto/bn/asm/rsaz-avx2.pl for further details.
+void rsaz_1024_norm2red_avx2(void *red, const void *norm);
+void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, const void *n,
+                        BN_ULONG k);
+void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
+                        int cnt);
+void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
+void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
+void rsaz_1024_red2norm_avx2(void *norm, const void *red);

-alignas(64) static const BN_ULONG one[40] =
-	{1,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-alignas(64) static const BN_ULONG two80[40] =
-	{0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+// one is 1 in RSAZ's representation.
+alignas(64) static const BN_ULONG one[40] = {
+    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+// two80 is 2^80 in RSAZ's representation. Note RSAZ uses base 2^29, so this is
+// 2^(29*2 + 22) = 2^80, not 2^(64*2 + 22).
+alignas(64) static const BN_ULONG two80[40] = {
+    0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0,       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

 void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
 	const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
-	const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0)
-{
-	alignas(64) uint8_t storage[(320 * 3) + (32 * 9 * 16)]; /* 5.5KB */
-	unsigned char	*a_inv, *m, *result,
-			*table_s = storage + (320 * 3),
-			*R2      = table_s;	/* borrow */
-	int index;
-	int wvalue;
+	const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0) {
+  alignas(64) uint8_t storage[(320 * 3) + (32 * 9 * 16)];  // 5.5KB
+  unsigned char *a_inv, *m, *result, *table_s = storage + (320 * 3),
+                                     *R2 = table_s;  // borrow
+  if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) {
+    result = storage;
+    a_inv = storage + 320;
+    m = storage + (320 * 2);  // should not cross page
+  } else {
+    m = storage;  // should not cross page
+    result = storage + 320;
+    a_inv = storage + (320 * 2);
+  }

-	if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) {
-		result = storage;
-		a_inv = storage + 320;
-		m = storage + (320 * 2); /* should not cross page */
-	} else {
-		m = storage;		/* should not cross page */
-		result = storage + 320;
-		a_inv = storage + (320 * 2);
-	}
+  rsaz_1024_norm2red_avx2(m, m_norm);
+  rsaz_1024_norm2red_avx2(a_inv, base_norm);
+  rsaz_1024_norm2red_avx2(R2, RR);

-	rsaz_1024_norm2red_avx2(m, m_norm);
-	rsaz_1024_norm2red_avx2(a_inv, base_norm);
-	rsaz_1024_norm2red_avx2(R2, RR);
+  // Convert |R2| from the usual radix, giving R = 2^1024, to RSAZ's radix,
+  // giving R = 2^(36*29) = 2^1044.
+  rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
+  // R2 = 2^2048 * 2^2048 / 2^1044 = 2^3052
+  rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
+  // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2

-	rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
-	rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
+  // table[0] = 1
+  rsaz_1024_mul_avx2(result, R2, one, m, k0);
+  // table[1] = a_inv^1
+  rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);

-	/* table[0] = 1 */
-	rsaz_1024_mul_avx2(result, R2, one, m, k0);
-	/* table[1] = a_inv^1 */
-	rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 0);
+  rsaz_1024_scatter5_avx2(table_s, a_inv, 1);

-	rsaz_1024_scatter5_avx2(table_s,result,0);
-	rsaz_1024_scatter5_avx2(table_s,a_inv,1);
-
-	/* table[2] = a_inv^2 */
-	rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,2);
+  // table[2] = a_inv^2
+  rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 2);
 #if 0
-	/* this is almost 2x smaller and less than 1% slower */
-	for (index=3; index<32; index++) {
-		rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-		rsaz_1024_scatter5_avx2(table_s,result,index);
-	}
+  // This is almost 2x smaller and less than 1% slower.
+  for (int index = 3; index < 32; index++) {
+    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+    rsaz_1024_scatter5_avx2(table_s, result, index);
+  }
 #else
-	/* table[4] = a_inv^4 */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,4);
-	/* table[8] = a_inv^8 */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,8);
-	/* table[16] = a_inv^16 */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,16);
-	/* table[17] = a_inv^17 */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,17);
+  // table[4] = a_inv^4
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 4);
+  // table[8] = a_inv^8
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 8);
+  // table[16] = a_inv^16
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 16);
+  // table[17] = a_inv^17
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 17);

-	/* table[3] */
-	rsaz_1024_gather5_avx2(result,table_s,2);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,3);
-	/* table[6] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,6);
-	/* table[12] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,12);
- 	/* table[24] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,24);
-	/* table[25] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,25);
+  // table[3]
+  rsaz_1024_gather5_avx2(result, table_s, 2);
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 3);
+  // table[6]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 6);
+  // table[12]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 12);
+  // table[24]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 24);
+  // table[25]
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 25);

-	/* table[5] */
-	rsaz_1024_gather5_avx2(result,table_s,4);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,5);
-	/* table[10] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,10);
-	/* table[20] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,20);
-	/* table[21] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,21);
+  // table[5]
+  rsaz_1024_gather5_avx2(result, table_s, 4);
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 5);
+  // table[10]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 10);
+  // table[20]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 20);
+  // table[21]
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 21);

-	/* table[7] */
-	rsaz_1024_gather5_avx2(result,table_s,6);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,7);
-	/* table[14] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,14);
-	/* table[28] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,28);
-	/* table[29] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,29);
+  // table[7]
+  rsaz_1024_gather5_avx2(result, table_s, 6);
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 7);
+  // table[14]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 14);
+  // table[28]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 28);
+  // table[29]
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 29);

-	/* table[9] */
-	rsaz_1024_gather5_avx2(result,table_s,8);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,9);
-	/* table[18] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,18);
-	/* table[19] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,19);
+  // table[9]
+  rsaz_1024_gather5_avx2(result, table_s, 8);
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 9);
+  // table[18]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 18);
+  // table[19]
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 19);

-	/* table[11] */
-	rsaz_1024_gather5_avx2(result,table_s,10);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,11);
-	/* table[22] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,22);
-	/* table[23] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,23);
+  // table[11]
+  rsaz_1024_gather5_avx2(result, table_s, 10);
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 11);
+  // table[22]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 22);
+  // table[23]
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 23);

-	/* table[13] */
-	rsaz_1024_gather5_avx2(result,table_s,12);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,13);
-	/* table[26] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,26);
-	/* table[27] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,27);
+  // table[13]
+  rsaz_1024_gather5_avx2(result, table_s, 12);
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 13);
+  // table[26]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 26);
+  // table[27]
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 27);

-	/* table[15] */
-	rsaz_1024_gather5_avx2(result,table_s,14);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,15);
-	/* table[30] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,30);
-	/* table[31] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,31);
+  // table[15]
+  rsaz_1024_gather5_avx2(result, table_s, 14);
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 15);
+  // table[30]
+  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+  rsaz_1024_scatter5_avx2(table_s, result, 30);
+  // table[31]
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  rsaz_1024_scatter5_avx2(table_s, result, 31);
 #endif

-	const uint8_t *p_str = (const uint8_t *)exponent;
+  const uint8_t *p_str = (const uint8_t *)exponent;

-	/* load first window */
-	wvalue = p_str[127] >> 3;
-	rsaz_1024_gather5_avx2(result,table_s,wvalue);
+  // Load the first window: the top five bits of the 1024-bit exponent.
+  int wvalue = p_str[127] >> 3;
+  rsaz_1024_gather5_avx2(result, table_s, wvalue);

-	index = 1014;
+  int index = 1014;
+  while (index > -1) {  // Loop for the remaining 203 five-bit windows.

-	while(index > -1) {	/* loop for the remaining 127 windows */
+    rsaz_1024_sqr_avx2(result, result, m, k0, 5);

-		rsaz_1024_sqr_avx2(result, result, m, k0, 5);
+    uint16_t wvalue_16;
+    memcpy(&wvalue_16, &p_str[index / 8], sizeof(wvalue_16));
+    wvalue = wvalue_16;
+    wvalue = (wvalue >> (index % 8)) & 31;
+    index -= 5;

-		wvalue = *((const unsigned short*)&p_str[index / 8]);
-		wvalue = (wvalue>> (index%8)) & 31;
-		index-=5;
+    rsaz_1024_gather5_avx2(a_inv, table_s, wvalue);  // Borrow |a_inv|.
+    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  }

-		rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);	/* borrow a_inv */
-		rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	}
+  // Square four times.
+  rsaz_1024_sqr_avx2(result, result, m, k0, 4);

-	/* square four times */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 4);
+  wvalue = p_str[0] & 15;

-	wvalue = p_str[0] & 15;
+  rsaz_1024_gather5_avx2(a_inv, table_s, wvalue);  // Borrow |a_inv|.
+  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);

-	rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);	/* borrow a_inv */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+  // Convert from Montgomery.
+  rsaz_1024_mul_avx2(result, result, one, m, k0);

-	/* from Montgomery */
-	rsaz_1024_mul_avx2(result, result, one, m, k0);
+  rsaz_1024_red2norm_avx2(result_norm, result);

-	rsaz_1024_red2norm_avx2(result_norm, result);
-
-	OPENSSL_cleanse(storage,sizeof(storage));
+  OPENSSL_cleanse(storage, sizeof(storage));
 }

-#endif  /* OPENSSL_X86_64 */
+#endif  // OPENSSL_X86_64
@@ -1,53 +1,33 @@
-/*****************************************************************************
-*                                                                            *
-*  Copyright (c) 2012, Intel Corporation                                     *
-*                                                                            *
-*  All rights reserved.                                                      *
-*                                                                            *
-*  Redistribution and use in source and binary forms, with or without        *
-*  modification, are permitted provided that the following conditions are    *
-*  met:                                                                      *
-*                                                                            *
-*  *  Redistributions of source code must retain the above copyright         *
-*     notice, this list of conditions and the following disclaimer.          *
-*                                                                            *
-*  *  Redistributions in binary form must reproduce the above copyright      *
-*     notice, this list of conditions and the following disclaimer in the    *
-*     documentation and/or other materials provided with the                 *
-*     distribution.                                                          *
-*                                                                            *
-*  *  Neither the name of the Intel Corporation nor the names of its         *
-*     contributors may be used to endorse or promote products derived from   *
-*     this software without specific prior written permission.               *
-*                                                                            *
-*                                                                            *
-*  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          *
-*  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         *
-*  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        *
-*  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            *
-*  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
-*  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
-*  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
-*  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
-*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
-*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
-*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
-*                                                                            *
-******************************************************************************
-* Developers and authors:                                                    *
-* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
-* (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
-* (2) University of Haifa, Israel                                            *
-*****************************************************************************/
+/*
+ * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2012, Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+ * (1) Intel Corporation, Israel Development Center, Haifa, Israel
+ * (2) University of Haifa, Israel
+ */

-#ifndef RSAZ_EXP_H
-#define RSAZ_EXP_H
+#ifndef OPENSSL_HEADER_BN_RSAZ_EXP_H
+#define OPENSSL_HEADER_BN_RSAZ_EXP_H

 #include <openssl/bn.h>

-void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16],
-	const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
-	const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0);
+// RSAZ_1024_mod_exp_avx2 sets |result| to |base_norm| raised to |exponent|
+// modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have
+// the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|,
+// respectively, extracted from |m_norm|'s |BN_MONT_CTX|.
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16],
+                            const BN_ULONG exponent[16],
+                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
+                            BN_ULONG k0);
+
+// rsaz_avx2_eligible returns one if |RSAZ_1024_mod_exp_avx2| should be used and
+// zero otherwise.
 int rsaz_avx2_eligible(void);

-#endif
+#endif  // OPENSSL_HEADER_BN_RSAZ_EXP_H
@@ -75,28 +75,28 @@ int BN_lshift(BIGNUM *r, const BIGNUM *a, int n) {

  r->neg = a->neg;
  nw = n / BN_BITS2;
-  if (!bn_wexpand(r, a->top + nw + 1)) {
+  if (!bn_wexpand(r, a->width + nw + 1)) {
    return 0;
  }
  lb = n % BN_BITS2;
  rb = BN_BITS2 - lb;
  f = a->d;
  t = r->d;
-  t[a->top + nw] = 0;
+  t[a->width + nw] = 0;
  if (lb == 0) {
-    for (i = a->top - 1; i >= 0; i--) {
+    for (i = a->width - 1; i >= 0; i--) {
      t[nw + i] = f[i];
    }
  } else {
-    for (i = a->top - 1; i >= 0; i--) {
+    for (i = a->width - 1; i >= 0; i--) {
      l = f[i];
      t[nw + i + 1] |= l >> rb;
      t[nw + i] = l << lb;
    }
  }
  OPENSSL_memset(t, 0, nw * sizeof(t[0]));
-  r->top = a->top + nw + 1;
-  bn_correct_top(r);
+  r->width = a->width + nw + 1;
+  bn_set_minimal_width(r);

  return 1;
 }
@@ -107,26 +107,26 @@ int BN_lshift1(BIGNUM *r, const BIGNUM *a) {

  if (r != a) {
    r->neg = a->neg;
-    if (!bn_wexpand(r, a->top + 1)) {
+    if (!bn_wexpand(r, a->width + 1)) {
      return 0;
    }
-    r->top = a->top;
+    r->width = a->width;
  } else {
-    if (!bn_wexpand(r, a->top + 1)) {
+    if (!bn_wexpand(r, a->width + 1)) {
      return 0;
    }
  }
  ap = a->d;
  rp = r->d;
  c = 0;
-  for (i = 0; i < a->top; i++) {
+  for (i = 0; i < a->width; i++) {
    t = *(ap++);
    *(rp++) = (t << 1) | c;
    c = t >> (BN_BITS2 - 1);
  }
  if (c) {
    *rp = 1;
-    r->top++;
+    r->width++;
  }

  return 1;
@@ -142,10 +142,11 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) {
    return 0;
  }

+  int a_width = bn_minimal_width(a);
  nw = n / BN_BITS2;
  rb = n % BN_BITS2;
  lb = BN_BITS2 - rb;
-  if (nw >= a->top || a->top == 0) {
+  if (nw >= a_width || a_width == 0) {
    BN_zero(r);
    return 1;
  }
@@ -163,8 +164,8 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) {

  f = &(a->d[nw]);
  t = r->d;
-  j = a->top - nw;
-  r->top = i;
+  j = a_width - nw;
+  r->width = i;

  if (rb == 0) {
    for (i = j; i != 0; i--) {
@@ -183,7 +184,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) {
    }
  }

-  if (r->top == 0) {
+  if (r->width == 0) {
    r->neg = 0;
  }

@@ -198,7 +199,7 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a) {
    BN_zero(r);
    return 1;
  }
-  i = a->top;
+  i = bn_minimal_width(a);
  ap = a->d;
  j = i - (ap[i - 1] == 1);
  if (a != r) {
@@ -218,9 +219,9 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a) {
    rp[i] = (t >> 1) | c;
    c = t << (BN_BITS2 - 1);
  }
-  r->top = j;
+  r->width = j;

-  if (r->top == 0) {
+  if (r->width == 0) {
    r->neg = 0;
  }

@@ -234,14 +235,14 @@ int BN_set_bit(BIGNUM *a, int n) {

  int i = n / BN_BITS2;
  int j = n % BN_BITS2;
-  if (a->top <= i) {
+  if (a->width <= i) {
    if (!bn_wexpand(a, i + 1)) {
      return 0;
    }
-    for (int k = a->top; k < i + 1; k++) {
+    for (int k = a->width; k < i + 1; k++) {
      a->d[k] = 0;
    }
-    a->top = i + 1;
+    a->width = i + 1;
  }

  a->d[i] |= (((BN_ULONG)1) << j);
@@ -258,12 +259,12 @@ int BN_clear_bit(BIGNUM *a, int n) {

  i = n / BN_BITS2;
  j = n % BN_BITS2;
-  if (a->top <= i) {
+  if (a->width <= i) {
    return 0;
  }

  a->d[i] &= (~(((BN_ULONG)1) << j));
-  bn_correct_top(a);
+  bn_set_minimal_width(a);
  return 1;
 }

@@ -280,7 +281,7 @@ int BN_is_bit_set(const BIGNUM *a, int n) {
  if (n < 0) {
    return 0;
  }
-  return bn_is_bit_set_words(a->d, a->top, n);
+  return bn_is_bit_set_words(a->d, a->width, n);
 }

 int BN_mask_bits(BIGNUM *a, int n) {
@@ -290,16 +291,32 @@ int BN_mask_bits(BIGNUM *a, int n) {

  int w = n / BN_BITS2;
  int b = n % BN_BITS2;
-  if (w >= a->top) {
+  if (w >= a->width) {
    return 0;
  }
  if (b == 0) {
-    a->top = w;
+    a->width = w;
  } else {
-    a->top = w + 1;
+    a->width = w + 1;
    a->d[w] &= ~(BN_MASK2 << b);
  }

-  bn_correct_top(a);
+  bn_set_minimal_width(a);
  return 1;
 }
+
+int BN_count_low_zero_bits(const BIGNUM *bn) {  // Trailing zero bits of |bn|.
+  for (int i = 0; i < bn->width; i++) {  // Scan words from |d[0]| upwards.
+    if (bn->d[i] != 0) {  // First non-zero word contains the lowest set bit.
+      int bits = 0;
+      for (BN_ULONG w = bn->d[i]; (w & 1) == 0; w >>= 1) {  // Ends: w != 0.
+        bits++;
+      }
+      return i * BN_BITS2 + bits;  // |i| all-zero words plus |bits| in word.
+    }
+  }
+
+  // We got to the end of |bn| and saw no non-zero words. |bn| is zero, so
+  // return zero. Note the early return above makes the running time depend
+  // on the value of |bn|.
+  return 0;
+}
@@ -184,7 +184,7 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
    // November 1992.)

    // t := 2*a
-    if (!BN_mod_lshift1_quick(t, A, p)) {
+    if (!bn_mod_lshift1_quick_ctx(t, A, p, ctx)) {
      goto end;
    }

@@ -1148,7 +1148,7 @@ struct aead_aes_gcm_ctx {

 struct aead_aes_gcm_tls12_ctx {
  struct aead_aes_gcm_ctx gcm_ctx;
-  uint64_t counter;
+  uint64_t min_next_nonce;
 };

 static int aead_aes_gcm_init_impl(struct aead_aes_gcm_ctx *gcm_ctx,
@@ -1349,7 +1349,7 @@ static int aead_aes_gcm_tls12_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
    return 0;
  }

-  gcm_ctx->counter = 0;
+  gcm_ctx->min_next_nonce = 0;

  size_t actual_tag_len;
  if (!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len,
@@ -1373,23 +1373,23 @@ static int aead_aes_gcm_tls12_seal_scatter(
    size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in,
    size_t extra_in_len, const uint8_t *ad, size_t ad_len) {
  struct aead_aes_gcm_tls12_ctx *gcm_ctx = ctx->aead_state;
-  if (gcm_ctx->counter == UINT64_MAX) {
-    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE);
-    return 0;
-  }
-
  if (nonce_len != 12) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
    return 0;
  }

-  const uint64_t be_counter = CRYPTO_bswap8(gcm_ctx->counter);
-  if (OPENSSL_memcmp((uint8_t *)&be_counter, nonce + nonce_len - 8, 8) != 0) {
+  // The given nonces must be strictly monotonically increasing.
+  uint64_t given_counter;
+  OPENSSL_memcpy(&given_counter, nonce + nonce_len - sizeof(given_counter),
+                 sizeof(given_counter));
+  given_counter = CRYPTO_bswap8(given_counter);
+  if (given_counter == UINT64_MAX ||
+      given_counter < gcm_ctx->min_next_nonce) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE);
    return 0;
  }

-  gcm_ctx->counter++;
+  gcm_ctx->min_next_nonce = given_counter + 1;

  return aead_aes_gcm_seal_scatter(ctx, out, out_tag, out_tag_len,
                                   max_out_tag_len, nonce, nonce_len, in,
@@ -59,6 +59,8 @@

 #include <openssl/base.h>

+#include "../../internal.h"
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -1,44 +1,40 @@
-#!/usr/bin/env perl
-
-# Copyright (c) 2014, Intel Corporation.
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
 #
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
 #
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-# Developers and authors:
-# Shay Gueron (1, 2), and Vlad Krasnov (1)
-# (1) Intel Corporation, Israel Development Center
-# (2) University of Haifa
-
-#  Reference:
-#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
-#                           256 Bit Primes"
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel
+# (2) University of Haifa, Israel
+#
+# Reference:
+# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
+#                          256 Bit Primes"

 # Further optimization by <appro@openssl.org>:
 #
-#		this/original
-# Opteron	+12-49%
-# Bulldozer	+14-45%
-# P4		+18-46%
-# Westmere	+12-34%
-# Sandy Bridge	+9-35%
-# Ivy Bridge	+9-35%
-# Haswell	+8-37%
-# Broadwell	+18-58%
-# Atom		+15-50%
-# VIA Nano	+43-160%
+#		this/original	with/without -DECP_NISTZ256_ASM(*)
+# Opteron	+12-49%		+110-150%
+# Bulldozer	+14-45%		+175-210%
+# P4		+18-46%		n/a :-(
+# Westmere	+12-34%		+80-87%
+# Sandy Bridge	+9-35%		+110-120%
+# Ivy Bridge	+9-35%		+110-125%
+# Haswell	+8-37%		+140-160%
+# Broadwell	+18-58%		+145-210%
+# Atom		+15-50%		+130-180%
+# VIA Nano	+43-160%	+300-480%
+#
+# (*)	"without -DECP_NISTZ256_ASM" refers to build with
+#	"enable-ec_nistp_64_gcc_128";
 #
 # Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark.
+# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
+# server-side operation. Keep in mind that +100% means 2x improvement.

 $flavour = shift;
 $output  = shift;
@@ -90,8 +86,12 @@ $code.=<<___;
 .type	ecp_nistz256_neg,\@function,2
 .align	32
 ecp_nistz256_neg:
+.cfi_startproc
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
+.Lneg_body:

 	xor	$a0, $a0
 	xor	$a1, $a1
@@ -125,9 +125,15 @@ ecp_nistz256_neg:
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)

-	pop %r13
-	pop %r12
+	mov	0(%rsp),%r13
+.cfi_restore	%r13
+	mov	8(%rsp),%r12
+.cfi_restore	%r12
+	lea	16(%rsp),%rsp
+.cfi_adjust_cfa_offset	-16
+.Lneg_epilogue:
 	ret
+.cfi_endproc
 .size	ecp_nistz256_neg,.-ecp_nistz256_neg
 ___
 }
@@ -148,6 +154,7 @@ $code.=<<___;
 .type	ecp_nistz256_mul_mont,\@function,3
 .align	32
 ecp_nistz256_mul_mont:
+.cfi_startproc
 ___
 $code.=<<___	if ($addx);
 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
@@ -157,11 +164,18 @@ ___
 $code.=<<___;
 .Lmul_mont:
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lmul_body:
 ___
 $code.=<<___	if ($addx);
 	cmp	\$0x80100, %ecx
@@ -194,13 +208,23 @@ $code.=<<___	if ($addx);
 ___
 $code.=<<___;
 .Lmul_mont_done:
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbx
-	pop	%rbp
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lmul_epilogue:
 	ret
+.cfi_endproc
 .size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

 .type	__ecp_nistz256_mul_montq,\@abi-omnipotent
@@ -430,6 +454,7 @@ __ecp_nistz256_mul_montq:
 .type	ecp_nistz256_sqr_mont,\@function,2
 .align	32
 ecp_nistz256_sqr_mont:
+.cfi_startproc
 ___
 $code.=<<___	if ($addx);
 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
@@ -438,11 +463,18 @@ $code.=<<___	if ($addx);
 ___
 $code.=<<___;
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lsqr_body:
 ___
 $code.=<<___	if ($addx);
 	cmp	\$0x80100, %ecx
@@ -471,13 +503,23 @@ $code.=<<___	if ($addx);
 ___
 $code.=<<___;
 .Lsqr_mont_done:
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbx
-	pop	%rbp
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lsqr_epilogue:
 	ret
+.cfi_endproc
 .size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

 .type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
@@ -1041,10 +1083,10 @@ $code.=<<___	if ($win64);
 	movaps	0x80(%rsp), %xmm14
 	movaps	0x90(%rsp), %xmm15
 	lea	0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_select_w5:
 ___
 $code.=<<___;
 	ret
+.LSEH_end_ecp_nistz256_select_w5:
 .size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

 ################################################################################
@@ -1128,10 +1170,10 @@ $code.=<<___	if ($win64);
 	movaps	0x80(%rsp), %xmm14
 	movaps	0x90(%rsp), %xmm15
 	lea	0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_select_w7:
 ___
 $code.=<<___;
 	ret
+.LSEH_end_ecp_nistz256_select_w7:
 .size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
 ___
 }
@@ -1152,18 +1194,19 @@ ecp_nistz256_avx2_select_w5:
 ___
 $code.=<<___	if ($win64);
 	lea	-0x88(%rsp), %rax
+	mov	%rsp,%r11
 .LSEH_begin_ecp_nistz256_avx2_select_w5:
-	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
-	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
-	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
-	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 8(%rax)
-	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
-	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
-	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
-	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
-	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
-	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
-	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
+	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
+	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
+	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
+	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
+	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
+	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
+	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
+	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
+	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
 ___
 $code.=<<___;
 	vmovdqa	.LTwo(%rip), $TWO
@@ -1229,11 +1272,11 @@ $code.=<<___	if ($win64);
 	movaps	0x70(%rsp), %xmm13
 	movaps	0x80(%rsp), %xmm14
 	movaps	0x90(%rsp), %xmm15
-	lea	0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_avx2_select_w5:
+	lea	(%r11), %rsp
 ___
 $code.=<<___;
 	ret
+.LSEH_end_ecp_nistz256_avx2_select_w5:
 .size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
 ___
 }
@@ -1256,19 +1299,20 @@ ecp_nistz256_avx2_select_w7:
 	vzeroupper
 ___
 $code.=<<___	if ($win64);
+	mov	%rsp,%r11
 	lea	-0x88(%rsp), %rax
 .LSEH_begin_ecp_nistz256_avx2_select_w7:
-	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
-	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
-	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
-	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 8(%rax)
-	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
-	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
-	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
-	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
-	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
-	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
-	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
+	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
+	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
+	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
+	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
+	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
+	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
+	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
+	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
+	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
 ___
 $code.=<<___;
 	vmovdqa	.LThree(%rip), $THREE
@@ -1349,11 +1393,11 @@ $code.=<<___	if ($win64);
 	movaps	0x70(%rsp), %xmm13
 	movaps	0x80(%rsp), %xmm14
 	movaps	0x90(%rsp), %xmm15
-	lea	0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_avx2_select_w7:
+	lea	(%r11), %rsp
 ___
 $code.=<<___;
 	ret
+.LSEH_end_ecp_nistz256_avx2_select_w7:
 .size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
 ___
 } else {
@@ -1557,6 +1601,7 @@ $code.=<<___;
 .type	ecp_nistz256_point_double,\@function,2
 .align	32
 ecp_nistz256_point_double:
+.cfi_startproc
 ___
 $code.=<<___	if ($addx);
 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
@@ -1574,17 +1619,26 @@ $code.=<<___;
 .type	ecp_nistz256_point_doublex,\@function,2
 .align	32
 ecp_nistz256_point_doublex:
+.cfi_startproc
 .Lpoint_doublex:
 ___
    }
 $code.=<<___;
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	sub	\$32*5+8, %rsp
+.cfi_adjust_cfa_offset	32*5+8
+.Lpoint_double${x}_body:

 .Lpoint_double_shortcut$x:
 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
@@ -1755,14 +1809,25 @@ $code.=<<___;
 	movq	%xmm1, $r_ptr
 	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);

-	add	\$32*5+8, %rsp
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbx
-	pop	%rbp
+	lea	32*5+56(%rsp), %rsi
+.cfi_def_cfa	%rsi,8
+	mov	-48(%rsi),%r15
+.cfi_restore	%r15
+	mov	-40(%rsi),%r14
+.cfi_restore	%r14
+	mov	-32(%rsi),%r13
+.cfi_restore	%r13
+	mov	-24(%rsi),%r12
+.cfi_restore	%r12
+	mov	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	mov	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_double${x}_epilogue:
 	ret
+.cfi_endproc
 .size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
 ___
 }
@@ -1788,6 +1853,7 @@ $code.=<<___;
 .type	ecp_nistz256_point_add,\@function,3
 .align	32
 ecp_nistz256_point_add:
+.cfi_startproc
 ___
 $code.=<<___	if ($addx);
 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
@@ -1805,17 +1871,26 @@ $code.=<<___;
 .type	ecp_nistz256_point_addx,\@function,3
 .align	32
 ecp_nistz256_point_addx:
+.cfi_startproc
 .Lpoint_addx:
 ___
    }
 $code.=<<___;
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	sub	\$32*18+8, %rsp
+.cfi_adjust_cfa_offset	32*18+8
+.Lpoint_add${x}_body:

 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
 	movdqu	0x10($a_ptr), %xmm1
@@ -2124,14 +2199,25 @@ $code.=<<___;
 	movdqu	%xmm3, 0x30($r_ptr)

 .Ladd_done$x:
-	add	\$32*18+8, %rsp
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbx
-	pop	%rbp
+	lea	32*18+56(%rsp), %rsi
+.cfi_def_cfa	%rsi,8
+	mov	-48(%rsi),%r15
+.cfi_restore	%r15
+	mov	-40(%rsi),%r14
+.cfi_restore	%r14
+	mov	-32(%rsi),%r13
+.cfi_restore	%r13
+	mov	-24(%rsi),%r12
+.cfi_restore	%r12
+	mov	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	mov	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_add${x}_epilogue:
 	ret
+.cfi_endproc
 .size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
 ___
 }
@@ -2156,6 +2242,7 @@ $code.=<<___;
 .type	ecp_nistz256_point_add_affine,\@function,3
 .align	32
 ecp_nistz256_point_add_affine:
+.cfi_startproc
 ___
 $code.=<<___	if ($addx);
 	leaq	OPENSSL_ia32cap_P(%rip), %rcx
@@ -2173,17 +2260,26 @@ $code.=<<___;
 .type	ecp_nistz256_point_add_affinex,\@function,3
 .align	32
 ecp_nistz256_point_add_affinex:
+.cfi_startproc
 .Lpoint_add_affinex:
 ___
    }
 $code.=<<___;
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	sub	\$32*15+8, %rsp
+.cfi_adjust_cfa_offset	32*15+8
+.Ladd_affine${x}_body:

 	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
 	mov	$b_org, $b_ptr		# reassign
@@ -2428,14 +2524,25 @@ $code.=<<___;
 	movdqu	%xmm2, 0x20($r_ptr)
 	movdqu	%xmm3, 0x30($r_ptr)

-	add	\$32*15+8, %rsp
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbx
-	pop	%rbp
+	lea	32*15+56(%rsp), %rsi
+.cfi_def_cfa	%rsi,8
+	mov	-48(%rsi),%r15
+.cfi_restore	%r15
+	mov	-40(%rsi),%r14
+.cfi_restore	%r14
+	mov	-32(%rsi),%r13
+.cfi_restore	%r13
+	mov	-24(%rsi),%r12
+.cfi_restore	%r12
+	mov	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	mov	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Ladd_affine${x}_epilogue:
 	ret
+.cfi_endproc
 .size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
 ___
 }
@@ -2586,6 +2693,291 @@ ___
 }
 }}}

+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+
+.type	short_handler,\@abi-omnipotent	# SEH handler that recovers only R12/R13 (referenced by .LSEH_info_ecp_nistz256_neg)
+.align	16
+short_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp	# scratch space; .Lcommon_seh_tail stores call args at 32..56(rsp)

+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail	# before the body label: tail runs with rax = context->Rax
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail	# at or past the epilogue label: tail runs with rax = context->Rsp
+
+	lea	16(%rax),%rax	# step over the two 8-byte slots read just below
+
+	mov	-8(%rax),%r12	# slot directly below the adjusted rax
+	mov	-16(%rax),%r13	# next slot down
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+
+	jmp	.Lcommon_seh_tail	# shared tail lives inside full_handler
+.size	short_handler,.-short_handler
+
+.type	full_handler,\@abi-omnipotent	# SEH handler that recovers rbx, rbp, r12-r15; HandlerData[2] is the offset from Rsp to the top of the save area
+.align	16
+full_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp	# scratch space; call args 5-8 are stored at 32..56(rsp) below
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail	# before the body label: tail runs with rax = context->Rax
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail	# at or past the epilogue label: tail runs with rax = context->Rsp
+
+	mov	8(%r11),%r10d		# HandlerData[2]
+	lea	(%rax,%r10),%rax	# rax = context->Rsp + HandlerData[2]
+
+	mov	-8(%rax),%rbp	# six saved registers are read from the 48 bytes below rax
+	mov	-16(%rax),%rbx
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:	# also entered from short_handler; expects the frame address in rax
+	mov	8(%rax),%rdi	# values for context->Rdi/Rsi are read from 8 and 16 past rax
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT), as a quadword count for rep movsq
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi	# keep disp in rsi while the argument registers are reloaded
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)	# indirect call through the import declared by .extern
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp	# release the scratch space reserved at entry
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	full_handler,.-full_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_ecp_nistz256_neg
+	.rva	.LSEH_end_ecp_nistz256_neg
+	.rva	.LSEH_info_ecp_nistz256_neg
+
+	.rva	.LSEH_begin_ecp_nistz256_mul_mont
+	.rva	.LSEH_end_ecp_nistz256_mul_mont
+	.rva	.LSEH_info_ecp_nistz256_mul_mont
+
+	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
+	.rva	.LSEH_end_ecp_nistz256_sqr_mont
+	.rva	.LSEH_info_ecp_nistz256_sqr_mont
+
+	.rva	.LSEH_begin_ecp_nistz256_select_w5
+	.rva	.LSEH_end_ecp_nistz256_select_w5
+	.rva	.LSEH_info_ecp_nistz256_select_wX
+
+	.rva	.LSEH_begin_ecp_nistz256_select_w7
+	.rva	.LSEH_end_ecp_nistz256_select_w7
+	.rva	.LSEH_info_ecp_nistz256_select_wX
+___
+$code.=<<___	if ($avx>1);
+	.rva	.LSEH_begin_ecp_nistz256_avx2_select_w5
+	.rva	.LSEH_end_ecp_nistz256_avx2_select_w5
+	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX
+
+	.rva	.LSEH_begin_ecp_nistz256_avx2_select_w7
+	.rva	.LSEH_end_ecp_nistz256_avx2_select_w7
+	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX
+___
+$code.=<<___;
+	.rva	.LSEH_begin_ecp_nistz256_point_double
+	.rva	.LSEH_end_ecp_nistz256_point_double
+	.rva	.LSEH_info_ecp_nistz256_point_double
+
+	.rva	.LSEH_begin_ecp_nistz256_point_add
+	.rva	.LSEH_end_ecp_nistz256_point_add
+	.rva	.LSEH_info_ecp_nistz256_point_add
+
+	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
+	.rva	.LSEH_end_ecp_nistz256_point_add_affine
+	.rva	.LSEH_info_ecp_nistz256_point_add_affine
+___
+$code.=<<___ if ($addx);
+	.rva	.LSEH_begin_ecp_nistz256_point_doublex
+	.rva	.LSEH_end_ecp_nistz256_point_doublex
+	.rva	.LSEH_info_ecp_nistz256_point_doublex
+
+	.rva	.LSEH_begin_ecp_nistz256_point_addx
+	.rva	.LSEH_end_ecp_nistz256_point_addx
+	.rva	.LSEH_info_ecp_nistz256_point_addx
+
+	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
+	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
+	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
+___
+$code.=<<___;
+
+.section	.xdata
+.align	8
+.LSEH_info_ecp_nistz256_neg:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
+.LSEH_info_ecp_nistz256_mul_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_sqr_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_select_wX:
+	.byte	0x01,0x33,0x16,0x00
+	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
+	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
+	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
+	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
+	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
+	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
+	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
+	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
+	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
+	.align	8
+___
+$code.=<<___	if ($avx>1);
+.LSEH_info_ecp_nistz256_avx2_select_wX:
+	.byte	0x01,0x36,0x17,0x0b
+	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
+	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
+	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
+	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
+	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
+	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
+	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
+	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
+	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
+	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
+	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
+	.align	8
+___
+$code.=<<___;
+.LSEH_info_ecp_nistz256_point_double:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
+	.long	32*5+56,0
+.LSEH_info_ecp_nistz256_point_add:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
+	.long	32*18+56,0
+.LSEH_info_ecp_nistz256_point_add_affine:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
+	.long	32*15+56,0
+___
+$code.=<<___ if ($addx);
+.align	8
+.LSEH_info_ecp_nistz256_point_doublex:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
+	.long	32*5+56,0
+.LSEH_info_ecp_nistz256_point_addx:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
+	.long	32*18+56,0
+.LSEH_info_ecp_nistz256_point_add_affinex:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
+	.long	32*15+56,0
+___
+}
+
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
 close STDOUT;
@@ -215,13 +215,6 @@ static const uint8_t kP521Params[6 * 66] = {
    0xB7, 0x1E, 0x91, 0x38, 0x64, 0x09,
 };

-// MSan appears to have a bug that causes code to be miscompiled in opt mode.
-// While that is being looked at, don't run the uint128_t code under MSan.
-#if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS) && \
-    !defined(MEMORY_SANITIZER)
-#define BORINGSSL_USE_INT128_CODE
-#endif
-
 DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) {
  // 1.3.132.0.35
  static const uint8_t kOIDP521[] = {0x2b, 0x81, 0x04, 0x00, 0x23};
@@ -253,16 +246,12 @@ DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) {
  out->curves[2].param_len = 32;
  out->curves[2].params = kP256Params;
  out->curves[2].method =
-#if defined(BORINGSSL_USE_INT128_CODE)
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
    !defined(OPENSSL_SMALL)
      EC_GFp_nistz256_method();
 #else
      EC_GFp_nistp256_method();
 #endif
-#else
-      EC_GFp_mont_method();
-#endif

  // 1.3.132.0.33
  static const uint8_t kOIDP224[] = {0x2b, 0x81, 0x04, 0x00, 0x21};
@@ -273,7 +262,7 @@ DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) {
  out->curves[3].param_len = 28;
  out->curves[3].params = kP224Params;
  out->curves[3].method =
-#if defined(BORINGSSL_USE_INT128_CODE) && !defined(OPENSSL_SMALL)
+#if defined(BORINGSSL_HAS_UINT128) && !defined(OPENSSL_SMALL)
      EC_GFp_nistp224_method();
 #else
      EC_GFp_mont_method();
@@ -398,11 +387,12 @@ int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
    EC_POINT_free(copy);
    return 0;
  }
+  // Store the order in minimal form, so it can be used with |BN_ULONG| arrays.
+  bn_set_minimal_width(&group->order);

  BN_MONT_CTX_free(group->order_mont);
-  group->order_mont = BN_MONT_CTX_new();
-  if (group->order_mont == NULL ||
-      !BN_MONT_CTX_set(group->order_mont, &group->order, NULL)) {
+  group->order_mont = BN_MONT_CTX_new_for_modulus(&group->order, NULL);
+  if (group->order_mont == NULL) {
    return 0;
  }

@@ -459,9 +449,8 @@ static EC_GROUP *ec_group_new_from_data(const struct built_in_curve *curve) {
    goto err;
  }

-  group->order_mont = BN_MONT_CTX_new();
-  if (group->order_mont == NULL ||
-      !BN_MONT_CTX_set(group->order_mont, &group->order, ctx)) {
+  group->order_mont = BN_MONT_CTX_new_for_modulus(&group->order, ctx);
+  if (group->order_mont == NULL) {
    OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
    goto err;
  }
@@ -779,6 +768,15 @@ int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
  }

  if (!EC_POINT_is_on_curve(group, point, ctx)) {
+    // In the event of an error, defend against the caller not checking the
+    // return value by setting a known safe value: the base point.
+    const EC_POINT *generator = EC_GROUP_get0_generator(group);
+    // The generator can be missing if the caller is in the process of
+    // constructing an arbitrary group. In this case, we give up and hope
+    // they're checking the return value.
+    if (generator) {
+      EC_POINT_copy(point, generator);
+    }
    OPENSSL_PUT_ERROR(EC, EC_R_POINT_IS_NOT_ON_CURVE);
    return 0;
  }
@@ -817,6 +815,25 @@ int EC_POINT_invert(const EC_GROUP *group, EC_POINT *a, BN_CTX *ctx) {
  return ec_GFp_simple_invert(group, a, ctx);
 }

+static int arbitrary_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
+                                      const BIGNUM *in, BN_CTX *ctx) {
+  if (ec_bignum_to_scalar(group, out, in)) {
+    return 1;  // fast path: |in| converted directly
+  }
+
+  ERR_clear_error();  // clear the error queue after the failed fast path
+
+  // Unusual (out-of-range) input: reduce it mod the order; not constant-time.
+  const BIGNUM *order = &group->order;
+  BN_CTX_start(ctx);
+  BIGNUM *tmp = BN_CTX_get(ctx);
+  int ok = tmp != NULL &&
+           BN_nnmod(tmp, in, order, ctx) &&  // tmp = |in| reduced by the order
+           ec_bignum_to_scalar_unchecked(group, out, tmp);
+  BN_CTX_end(ctx);  // releases |tmp|; paired with BN_CTX_start above
+  return ok;
+}
+
 int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar,
                 const EC_POINT *p, const BIGNUM *p_scalar, BN_CTX *ctx) {
  // Previously, this function set |r| to the point at infinity if there was
@@ -828,30 +845,27 @@ int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar,
    return 0;
  }

-  // We cannot easily process arbitrary scalars in constant-time, and there is
-  // no need to do so. Require that scalars be the same size as the order.
-  //
-  // One could require they be fully reduced, but some consumers try to check
-  // that |order| * |pubkey| is the identity. This comes from following NIST SP
-  // 800-56A section 5.6.2.3.2. (Though all our curves have cofactor one, so
-  // this check isn't useful.)
  int ret = 0;
  EC_SCALAR g_scalar_storage, p_scalar_storage;
  EC_SCALAR *g_scalar_arg = NULL, *p_scalar_arg = NULL;
-  unsigned order_bits = BN_num_bits(&group->order);
+  BN_CTX *new_ctx = NULL;
+  if (ctx == NULL) {
+    new_ctx = BN_CTX_new();
+    if (new_ctx == NULL) {
+      goto err;
+    }
+    ctx = new_ctx;
+  }
+
  if (g_scalar != NULL) {
-    if (BN_is_negative(g_scalar) || BN_num_bits(g_scalar) > order_bits ||
-        !ec_bignum_to_scalar(group, &g_scalar_storage, g_scalar)) {
-      OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
+    if (!arbitrary_bignum_to_scalar(group, &g_scalar_storage, g_scalar, ctx)) {
      goto err;
    }
    g_scalar_arg = &g_scalar_storage;
  }

  if (p_scalar != NULL) {
-    if (BN_is_negative(p_scalar) || BN_num_bits(p_scalar) > order_bits ||
-        !ec_bignum_to_scalar(group, &p_scalar_storage, p_scalar)) {
-      OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
+    if (!arbitrary_bignum_to_scalar(group, &p_scalar_storage, p_scalar, ctx)) {
      goto err;
    }
    p_scalar_arg = &p_scalar_storage;
@@ -860,11 +874,30 @@ int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar,
  ret = ec_point_mul_scalar(group, r, g_scalar_arg, p, p_scalar_arg, ctx);

 err:
+  BN_CTX_free(new_ctx);
  OPENSSL_cleanse(&g_scalar_storage, sizeof(g_scalar_storage));
  OPENSSL_cleanse(&p_scalar_storage, sizeof(p_scalar_storage));
  return ret;
 }

+int ec_point_mul_scalar_public(const EC_GROUP *group, EC_POINT *r,
+                               const EC_SCALAR *g_scalar, const EC_POINT *p,
+                               const EC_SCALAR *p_scalar, BN_CTX *ctx) {
+  if ((g_scalar == NULL && p_scalar == NULL) ||  // need at least one scalar
+      (p == NULL) != (p_scalar == NULL))  {  // |p| and |p_scalar| come as a pair
+    OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER);
+    return 0;
+  }
+
+  if (EC_GROUP_cmp(group, r->group, NULL) != 0 ||  // |r| must use |group|
+      (p != NULL && EC_GROUP_cmp(group, p->group, NULL) != 0)) {
+    OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
+    return 0;
+  }
+
+  return group->meth->mul_public(group, r, g_scalar, p, p_scalar, ctx);
+}
+
 int ec_point_mul_scalar(const EC_GROUP *group, EC_POINT *r,
                        const EC_SCALAR *g_scalar, const EC_POINT *p,
                        const EC_SCALAR *p_scalar, BN_CTX *ctx) {
@@ -883,18 +916,6 @@ int ec_point_mul_scalar(const EC_GROUP *group, EC_POINT *r,
  return group->meth->mul(group, r, g_scalar, p, p_scalar, ctx);
 }

-int ec_point_set_Jprojective_coordinates_GFp(const EC_GROUP *group,
-                                             EC_POINT *point, const BIGNUM *x,
-                                             const BIGNUM *y, const BIGNUM *z,
-                                             BN_CTX *ctx) {
-  if (EC_GROUP_cmp(group, point->group, NULL) != 0) {
-    OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
-    return 0;
-  }
-  return ec_GFp_simple_set_Jprojective_coordinates_GFp(group, point, x, y, z,
-                                                       ctx);
-}
-
 void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag) {}

 const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group) {
@@ -927,17 +948,27 @@ size_t EC_get_builtin_curves(EC_builtin_curve *out_curves,

 int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
                         const BIGNUM *in) {
-  if (BN_is_negative(in) || in->top > group->order.top) {
+  if (!ec_bignum_to_scalar_unchecked(group, out, in)) {
+    return 0;  // the unchecked conversion already queued EC_R_INVALID_SCALAR
+  }
+  if (!bn_less_than_words(out->words, group->order.d, group->order.width)) {
+    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);  // not below the group order
+    return 0;
+  }
+  return 1;
+}
+
+int ec_bignum_to_scalar_unchecked(const EC_GROUP *group, EC_SCALAR *out,
+                                  const BIGNUM *in) {
+  if (!bn_copy_words(out->words, group->order.width, in)) {  // presumably fails when |in| is negative or too wide; confirm
     OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
     return 0;
   }
-  OPENSSL_memset(out->words, 0, group->order.top * sizeof(BN_ULONG));
-  OPENSSL_memcpy(out->words, in->d, in->top * sizeof(BN_ULONG));
   return 1;
 }

 int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
                              const uint8_t additional_data[32]) {
-  return bn_rand_range_words(out->words, 1, group->order.d, group->order.top,
+  return bn_rand_range_words(out->words, 1, group->order.d, group->order.width,
                              additional_data);
 }
@@ -233,19 +233,21 @@ int EC_KEY_is_opaque(const EC_KEY *key) {
 const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) { return key->group; }

 int EC_KEY_set_group(EC_KEY *key, const EC_GROUP *group) {
+  // If |key| already has a group, it is an error to switch to another one.
+  if (key->group != NULL) {
+    if (EC_GROUP_cmp(key->group, group, NULL) != 0) {
+      OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH);
+      return 0;
+    }
+    return 1;  // an equal group is already set; nothing to change
+  }
+
+  assert(key->priv_key == NULL);  // the key setters reject keys with no group
+  assert(key->pub_key == NULL);
+
   EC_GROUP_free(key->group);
-  // TODO(fork): duplicating the group seems wasteful but see
-  // |EC_KEY_set_conv_form|.
   key->group = EC_GROUP_dup(group);
-  if (key->group == NULL) {
-    return 0;
-  }
-  // XXX: |BN_cmp| is not constant time.
-  if (key->priv_key != NULL &&
-      BN_cmp(key->priv_key, EC_GROUP_get0_order(group)) >= 0) {
-    return 0;
-  }
-  return 1;
+  return key->group != NULL;  // 0 only when EC_GROUP_dup returned NULL
 }

 const BIGNUM *EC_KEY_get0_private_key(const EC_KEY *key) {
@@ -253,8 +255,12 @@ const BIGNUM *EC_KEY_get0_private_key(const EC_KEY *key) {
 }

 int EC_KEY_set_private_key(EC_KEY *key, const BIGNUM *priv_key) {
-  // XXX: |BN_cmp| is not constant time.
-  if (key->group != NULL &&
+  if (key->group == NULL) {
+    OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS);
+    return 0;
+  }
+
+  if (BN_is_negative(priv_key) ||
      BN_cmp(priv_key, EC_GROUP_get0_order(key->group)) >= 0) {
    OPENSSL_PUT_ERROR(EC, EC_R_WRONG_ORDER);
    return 0;
@@ -269,6 +275,16 @@ const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key) {
 }

 int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub_key) {
+  if (key->group == NULL) {  // a group must be configured before any key
+    OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS);
+    return 0;
+  }
+  // NULL |pub_key| has no group to compare, so only check non-NULL points.
+  if (pub_key != NULL && EC_GROUP_cmp(key->group, pub_key->group, NULL) != 0) {
+    OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH);
+    return 0;
+  }
+
   EC_POINT_free(key->pub_key);
   key->pub_key = EC_POINT_dup(pub_key, key->group);
   return (key->pub_key == NULL) ? 0 : 1;
 }
@@ -317,8 +333,8 @@ int EC_KEY_check_key(const EC_KEY *eckey) {
  // in case the priv_key is present :
  // check if generator * priv_key == pub_key
  if (eckey->priv_key) {
-    // XXX: |BN_cmp| is not constant time.
-    if (BN_cmp(eckey->priv_key, EC_GROUP_get0_order(eckey->group)) >= 0) {
+    if (BN_is_negative(eckey->priv_key) ||
+        BN_cmp(eckey->priv_key, EC_GROUP_get0_order(eckey->group)) >= 0) {
      OPENSSL_PUT_ERROR(EC, EC_R_WRONG_ORDER);
      goto err;
    }
@@ -372,8 +388,6 @@ int EC_KEY_check_fips(const EC_KEY *key) {

 int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x,
                                             BIGNUM *y) {
-  BN_CTX *ctx = NULL;
-  BIGNUM *tx, *ty;
  EC_POINT *point = NULL;
  int ok = 0;

@@ -381,51 +395,18 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x,
    OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER);
    return 0;
  }
-  ctx = BN_CTX_new();

-  if (ctx == NULL) {
-    return 0;
-  }
-
-  BN_CTX_start(ctx);
  point = EC_POINT_new(key->group);
-
-  if (point == NULL) {
-    goto err;
-  }
-
-  tx = BN_CTX_get(ctx);
-  ty = BN_CTX_get(ctx);
-  if (tx == NULL ||
-      ty == NULL) {
-    goto err;
-  }
-
-  if (!EC_POINT_set_affine_coordinates_GFp(key->group, point, x, y, ctx) ||
-      !EC_POINT_get_affine_coordinates_GFp(key->group, point, tx, ty, ctx)) {
-    goto err;
-  }
-
-  // Check if retrieved coordinates match originals: if not values
-  // are out of range.
-  if (BN_cmp(x, tx) || BN_cmp(y, ty)) {
-    OPENSSL_PUT_ERROR(EC, EC_R_COORDINATES_OUT_OF_RANGE);
-    goto err;
-  }
-
-  if (!EC_KEY_set_public_key(key, point)) {
-    goto err;
-  }
-
-  if (EC_KEY_check_key(key) == 0) {
+  if (point == NULL ||
+      !EC_POINT_set_affine_coordinates_GFp(key->group, point, x, y, NULL) ||
+      !EC_KEY_set_public_key(key, point) ||
+      !EC_KEY_check_key(key)) {
    goto err;
  }

  ok = 1;

 err:
-  BN_CTX_end(ctx);
-  BN_CTX_free(ctx);
  EC_POINT_free(point);
  return ok;
 }
@@ -93,7 +93,6 @@ void ec_GFp_mont_group_finish(EC_GROUP *group) {
 int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p,
                                const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  BN_CTX *new_ctx = NULL;
-  BN_MONT_CTX *mont = NULL;
  int ret = 0;

  BN_MONT_CTX_free(group->mont);
@@ -106,18 +105,12 @@ int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p,
    }
  }

-  mont = BN_MONT_CTX_new();
-  if (mont == NULL) {
-    goto err;
-  }
-  if (!BN_MONT_CTX_set(mont, p, ctx)) {
+  group->mont = BN_MONT_CTX_new_for_modulus(p, ctx);
+  if (group->mont == NULL) {
    OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
    goto err;
  }

-  group->mont = mont;
-  mont = NULL;
-
  ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);

  if (!ret) {
@@ -127,7 +120,6 @@ int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p,

 err:
  BN_CTX_free(new_ctx);
-  BN_MONT_CTX_free(mont);
  return ret;
 }

@@ -270,6 +262,7 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_mont_method) {
  out->group_set_curve = ec_GFp_mont_group_set_curve;
  out->point_get_affine_coordinates = ec_GFp_mont_point_get_affine_coordinates;
  out->mul = ec_wNAF_mul /* XXX: Not constant time. */;
+  out->mul_public = ec_wNAF_mul;
  out->field_mul = ec_GFp_mont_field_mul;
  out->field_sqr = ec_GFp_mont_field_sqr;
  out->field_encode = ec_GFp_mont_field_encode;
@@ -29,6 +29,8 @@
 #include <openssl/obj.h>

 #include "../../test/test_util.h"
+#include "../bn/internal.h"
+#include "internal.h"


 // kECKeyWithoutPublic is an ECPrivateKey with the optional publicKey field
@@ -303,9 +305,73 @@ TEST(ECTest, ArbitraryCurve) {
                                     order.get(), BN_value_one()));

  EXPECT_NE(0, EC_GROUP_cmp(group.get(), group3.get(), NULL));
+
+#if !defined(BORINGSSL_SHARED_LIBRARY)
+  // group4 has non-minimal components that do not fit in |EC_SCALAR| and the
+  // future |EC_FELEM|.
+  ASSERT_TRUE(bn_resize_words(p.get(), 32));
+  ASSERT_TRUE(bn_resize_words(a.get(), 32));
+  ASSERT_TRUE(bn_resize_words(b.get(), 32));
+  ASSERT_TRUE(bn_resize_words(gx.get(), 32));
+  ASSERT_TRUE(bn_resize_words(gy.get(), 32));
+  ASSERT_TRUE(bn_resize_words(order.get(), 32));
+
+  bssl::UniquePtr<EC_GROUP> group4(
+      EC_GROUP_new_curve_GFp(p.get(), a.get(), b.get(), ctx.get()));
+  ASSERT_TRUE(group4);
+  bssl::UniquePtr<EC_POINT> generator4(EC_POINT_new(group4.get()));
+  ASSERT_TRUE(generator4);
+  ASSERT_TRUE(EC_POINT_set_affine_coordinates_GFp(
+      group4.get(), generator4.get(), gx.get(), gy.get(), ctx.get()));
+  ASSERT_TRUE(EC_GROUP_set_generator(group4.get(), generator4.get(),
+                                     order.get(), BN_value_one()));
+
+  EXPECT_EQ(0, EC_GROUP_cmp(group.get(), group4.get(), NULL));
+#endif
 }

-class ECCurveTest : public testing::TestWithParam<EC_builtin_curve> {};
+TEST(ECTest, SetKeyWithoutGroup) {
+  bssl::UniquePtr<EC_KEY> key(EC_KEY_new());
+  ASSERT_TRUE(key);
+
+  // Private keys may not be configured without a group.
+  EXPECT_FALSE(EC_KEY_set_private_key(key.get(), BN_value_one()));
+
+  // Public keys may not be configured without a group.
+  bssl::UniquePtr<EC_GROUP> group(
+      EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
+  ASSERT_TRUE(group);
+  EXPECT_FALSE(
+      EC_KEY_set_public_key(key.get(), EC_GROUP_get0_generator(group.get())));
+}
+
+TEST(ECTest, GroupMismatch) {
+  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(NID_secp384r1));
+  ASSERT_TRUE(key);
+  bssl::UniquePtr<EC_GROUP> p256(
+      EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
+  ASSERT_TRUE(p256);
+
+  // Changing a key's group is invalid.
+  EXPECT_FALSE(EC_KEY_set_group(key.get(), p256.get()));
+
+  // Configuring a public key with the wrong group is invalid.
+  EXPECT_FALSE(
+      EC_KEY_set_public_key(key.get(), EC_GROUP_get0_generator(p256.get())));
+}
+
+class ECCurveTest : public testing::TestWithParam<EC_builtin_curve> {
+ public:
+  const EC_GROUP *group() const { return group_.get(); }
+
+  void SetUp() override {
+    group_.reset(EC_GROUP_new_by_curve_name(GetParam().nid));
+    ASSERT_TRUE(group_);
+  }
+
+ private:
+  bssl::UniquePtr<EC_GROUP> group_;
+};

 TEST_P(ECCurveTest, SetAffine) {
  // Generate an EC_KEY.
@@ -313,32 +379,44 @@ TEST_P(ECCurveTest, SetAffine) {
  ASSERT_TRUE(key);
  ASSERT_TRUE(EC_KEY_generate_key(key.get()));

-  const EC_GROUP *const group = EC_KEY_get0_group(key.get());
-  EXPECT_TRUE(
-      EC_POINT_is_on_curve(group, EC_KEY_get0_public_key(key.get()), nullptr));
+  EXPECT_TRUE(EC_POINT_is_on_curve(group(), EC_KEY_get0_public_key(key.get()),
+                                   nullptr));

  // Get the public key's coordinates.
  bssl::UniquePtr<BIGNUM> x(BN_new());
  ASSERT_TRUE(x);
  bssl::UniquePtr<BIGNUM> y(BN_new());
  ASSERT_TRUE(y);
+  bssl::UniquePtr<BIGNUM> p(BN_new());
+  ASSERT_TRUE(p);
  EXPECT_TRUE(EC_POINT_get_affine_coordinates_GFp(
-      group, EC_KEY_get0_public_key(key.get()), x.get(), y.get(), nullptr));
+      group(), EC_KEY_get0_public_key(key.get()), x.get(), y.get(), nullptr));
+  EXPECT_TRUE(
+      EC_GROUP_get_curve_GFp(group(), p.get(), nullptr, nullptr, nullptr));

  // Points on the curve should be accepted.
-  auto point = bssl::UniquePtr<EC_POINT>(EC_POINT_new(group));
+  auto point = bssl::UniquePtr<EC_POINT>(EC_POINT_new(group()));
  ASSERT_TRUE(point);
-  EXPECT_TRUE(EC_POINT_set_affine_coordinates_GFp(group, point.get(), x.get(),
+  EXPECT_TRUE(EC_POINT_set_affine_coordinates_GFp(group(), point.get(), x.get(),
                                                  y.get(), nullptr));

  // Subtract one from |y| to make the point no longer on the curve.
  EXPECT_TRUE(BN_sub(y.get(), y.get(), BN_value_one()));

  // Points not on the curve should be rejected.
-  bssl::UniquePtr<EC_POINT> invalid_point(EC_POINT_new(group));
+  bssl::UniquePtr<EC_POINT> invalid_point(EC_POINT_new(group()));
  ASSERT_TRUE(invalid_point);
-  EXPECT_FALSE(EC_POINT_set_affine_coordinates_GFp(group, invalid_point.get(),
+  EXPECT_FALSE(EC_POINT_set_affine_coordinates_GFp(group(), invalid_point.get(),
                                                   x.get(), y.get(), nullptr));
+
+  // Coordinates out of range should be rejected.
+  EXPECT_TRUE(BN_add(y.get(), y.get(), BN_value_one()));
+  EXPECT_TRUE(BN_add(y.get(), y.get(), p.get()));
+
+  EXPECT_FALSE(EC_POINT_set_affine_coordinates_GFp(group(), invalid_point.get(),
+                                                   x.get(), y.get(), nullptr));
+  EXPECT_FALSE(
+      EC_KEY_set_public_key_affine_coordinates(key.get(), x.get(), y.get()));
 }

 TEST_P(ECCurveTest, GenerateFIPS) {
@@ -353,57 +431,52 @@ TEST_P(ECCurveTest, AddingEqualPoints) {
  ASSERT_TRUE(key);
  ASSERT_TRUE(EC_KEY_generate_key(key.get()));

-  const EC_GROUP *const group = EC_KEY_get0_group(key.get());
-
-  bssl::UniquePtr<EC_POINT> p1(EC_POINT_new(group));
+  bssl::UniquePtr<EC_POINT> p1(EC_POINT_new(group()));
  ASSERT_TRUE(p1);
  ASSERT_TRUE(EC_POINT_copy(p1.get(), EC_KEY_get0_public_key(key.get())));

-  bssl::UniquePtr<EC_POINT> p2(EC_POINT_new(group));
+  bssl::UniquePtr<EC_POINT> p2(EC_POINT_new(group()));
  ASSERT_TRUE(p2);
  ASSERT_TRUE(EC_POINT_copy(p2.get(), EC_KEY_get0_public_key(key.get())));

-  bssl::UniquePtr<EC_POINT> double_p1(EC_POINT_new(group));
+  bssl::UniquePtr<EC_POINT> double_p1(EC_POINT_new(group()));
  ASSERT_TRUE(double_p1);
  bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
  ASSERT_TRUE(ctx);
-  ASSERT_TRUE(EC_POINT_dbl(group, double_p1.get(), p1.get(), ctx.get()));
+  ASSERT_TRUE(EC_POINT_dbl(group(), double_p1.get(), p1.get(), ctx.get()));

-  bssl::UniquePtr<EC_POINT> p1_plus_p2(EC_POINT_new(group));
+  bssl::UniquePtr<EC_POINT> p1_plus_p2(EC_POINT_new(group()));
  ASSERT_TRUE(p1_plus_p2);
  ASSERT_TRUE(
-      EC_POINT_add(group, p1_plus_p2.get(), p1.get(), p2.get(), ctx.get()));
+      EC_POINT_add(group(), p1_plus_p2.get(), p1.get(), p2.get(), ctx.get()));

  EXPECT_EQ(0,
-            EC_POINT_cmp(group, double_p1.get(), p1_plus_p2.get(), ctx.get()))
+            EC_POINT_cmp(group(), double_p1.get(), p1_plus_p2.get(), ctx.get()))
      << "A+A != 2A";
 }

 TEST_P(ECCurveTest, MulZero) {
-  bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(GetParam().nid));
-  ASSERT_TRUE(group);
-
-  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group()));
  ASSERT_TRUE(point);
  bssl::UniquePtr<BIGNUM> zero(BN_new());
  ASSERT_TRUE(zero);
  BN_zero(zero.get());
-  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr,
-                           nullptr, nullptr));
+  ASSERT_TRUE(EC_POINT_mul(group(), point.get(), zero.get(), nullptr, nullptr,
+                           nullptr));

-  EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get()))
+  EXPECT_TRUE(EC_POINT_is_at_infinity(group(), point.get()))
      << "g * 0 did not return point at infinity.";

  // Test that zero times an arbitrary point is also infinity. The generator is
  // used as the arbitrary point.
-  bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group.get()));
+  bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group()));
  ASSERT_TRUE(generator);
-  ASSERT_TRUE(EC_POINT_mul(group.get(), generator.get(), BN_value_one(),
-                           nullptr, nullptr, nullptr));
-  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), nullptr, generator.get(),
+  ASSERT_TRUE(EC_POINT_mul(group(), generator.get(), BN_value_one(), nullptr,
+                           nullptr, nullptr));
+  ASSERT_TRUE(EC_POINT_mul(group(), point.get(), nullptr, generator.get(),
                           zero.get(), nullptr));

-  EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get()))
+  EXPECT_TRUE(EC_POINT_is_at_infinity(group(), point.get()))
      << "p * 0 did not return point at infinity.";
 }

@@ -413,55 +486,184 @@ TEST_P(ECCurveTest, MulZero) {
 // 5.6.2.3.2. (Though all our curves have cofactor one, so this check isn't
 // useful.)
 TEST_P(ECCurveTest, MulOrder) {
-  bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(GetParam().nid));
-  ASSERT_TRUE(group);
-
  // Test that g × order = ∞.
-  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group()));
  ASSERT_TRUE(point);
-  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(),
-                           EC_GROUP_get0_order(group.get()), nullptr, nullptr,
-                           nullptr));
+  ASSERT_TRUE(EC_POINT_mul(group(), point.get(), EC_GROUP_get0_order(group()),
+                           nullptr, nullptr, nullptr));

-  EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get()))
+  EXPECT_TRUE(EC_POINT_is_at_infinity(group(), point.get()))
      << "g * order did not return point at infinity.";

  // Test that p × order = ∞, for some arbitrary p.
  bssl::UniquePtr<BIGNUM> forty_two(BN_new());
  ASSERT_TRUE(forty_two);
  ASSERT_TRUE(BN_set_word(forty_two.get(), 42));
-  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), forty_two.get(), nullptr,
+  ASSERT_TRUE(EC_POINT_mul(group(), point.get(), forty_two.get(), nullptr,
                           nullptr, nullptr));
-  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), nullptr, point.get(),
-                           EC_GROUP_get0_order(group.get()), nullptr));
+  ASSERT_TRUE(EC_POINT_mul(group(), point.get(), nullptr, point.get(),
+                           EC_GROUP_get0_order(group()), nullptr));

-  EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get()))
+  EXPECT_TRUE(EC_POINT_is_at_infinity(group(), point.get()))
      << "p * order did not return point at infinity.";
 }

+// Test that |EC_POINT_mul| works with out-of-range scalars. The operation will
+// not be constant-time, but we'll compute the right answer.
+TEST_P(ECCurveTest, MulOutOfRange) {
+  bssl::UniquePtr<BIGNUM> n_minus_one(BN_dup(EC_GROUP_get0_order(group())));
+  ASSERT_TRUE(n_minus_one);
+  ASSERT_TRUE(BN_sub_word(n_minus_one.get(), 1));
+
+  bssl::UniquePtr<BIGNUM> minus_one(BN_new());
+  ASSERT_TRUE(minus_one);
+  ASSERT_TRUE(BN_one(minus_one.get()));
+  BN_set_negative(minus_one.get(), 1);
+
+  bssl::UniquePtr<BIGNUM> seven(BN_new());
+  ASSERT_TRUE(seven);
+  ASSERT_TRUE(BN_set_word(seven.get(), 7));
+
+  bssl::UniquePtr<BIGNUM> ten_n_plus_seven(
+      BN_dup(EC_GROUP_get0_order(group())));
+  ASSERT_TRUE(ten_n_plus_seven);
+  ASSERT_TRUE(BN_mul_word(ten_n_plus_seven.get(), 10));
+  ASSERT_TRUE(BN_add_word(ten_n_plus_seven.get(), 7));
+
+  bssl::UniquePtr<EC_POINT> point1(EC_POINT_new(group())),
+      point2(EC_POINT_new(group()));
+  ASSERT_TRUE(point1);
+  ASSERT_TRUE(point2);
+
+  ASSERT_TRUE(EC_POINT_mul(group(), point1.get(), n_minus_one.get(), nullptr,
+                           nullptr, nullptr));
+  ASSERT_TRUE(EC_POINT_mul(group(), point2.get(), minus_one.get(), nullptr,
+                           nullptr, nullptr));
+  EXPECT_EQ(0, EC_POINT_cmp(group(), point1.get(), point2.get(), nullptr))
+      << "-1 * G and (n-1) * G did not give the same result";
+
+  ASSERT_TRUE(EC_POINT_mul(group(), point1.get(), seven.get(), nullptr, nullptr,
+                           nullptr));
+  ASSERT_TRUE(EC_POINT_mul(group(), point2.get(), ten_n_plus_seven.get(),
+                           nullptr, nullptr, nullptr));
+  EXPECT_EQ(0, EC_POINT_cmp(group(), point1.get(), point2.get(), nullptr))
+      << "7 * G and (10n + 7) * G did not give the same result";
+}
+
 // Test that 10×∞ + G = G.
 TEST_P(ECCurveTest, Mul) {
-  bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(GetParam().nid));
-  ASSERT_TRUE(group);
-  bssl::UniquePtr<EC_POINT> p(EC_POINT_new(group.get()));
+  bssl::UniquePtr<EC_POINT> p(EC_POINT_new(group()));
  ASSERT_TRUE(p);
-  bssl::UniquePtr<EC_POINT> result(EC_POINT_new(group.get()));
+  bssl::UniquePtr<EC_POINT> result(EC_POINT_new(group()));
  ASSERT_TRUE(result);
  bssl::UniquePtr<BIGNUM> n(BN_new());
  ASSERT_TRUE(n);
-  ASSERT_TRUE(EC_POINT_set_to_infinity(group.get(), p.get()));
+  ASSERT_TRUE(EC_POINT_set_to_infinity(group(), p.get()));
  ASSERT_TRUE(BN_set_word(n.get(), 10));

  // First check that 10×∞ = ∞.
-  ASSERT_TRUE(EC_POINT_mul(group.get(), result.get(), nullptr, p.get(), n.get(),
-                           nullptr));
-  EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), result.get()));
+  ASSERT_TRUE(
+      EC_POINT_mul(group(), result.get(), nullptr, p.get(), n.get(), nullptr));
+  EXPECT_TRUE(EC_POINT_is_at_infinity(group(), result.get()));

  // Now check that 10×∞ + G = G.
-  const EC_POINT *generator = EC_GROUP_get0_generator(group.get());
-  ASSERT_TRUE(EC_POINT_mul(group.get(), result.get(), BN_value_one(), p.get(),
+  const EC_POINT *generator = EC_GROUP_get0_generator(group());
+  ASSERT_TRUE(EC_POINT_mul(group(), result.get(), BN_value_one(), p.get(),
                           n.get(), nullptr));
-  EXPECT_EQ(0, EC_POINT_cmp(group.get(), result.get(), generator, nullptr));
+  EXPECT_EQ(0, EC_POINT_cmp(group(), result.get(), generator, nullptr));
+}
+
+TEST_P(ECCurveTest, MulNonMinimal) {
+  bssl::UniquePtr<BIGNUM> forty_two(BN_new());
+  ASSERT_TRUE(forty_two);
+  ASSERT_TRUE(BN_set_word(forty_two.get(), 42));
+
+  // Compute g × 42.
+  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group()));
+  ASSERT_TRUE(point);
+  ASSERT_TRUE(EC_POINT_mul(group(), point.get(), forty_two.get(), nullptr,
+                           nullptr, nullptr));
+
+  // Compute it again with a non-minimal 42, much wider than a scalar.
+  ASSERT_TRUE(bn_resize_words(forty_two.get(), 64));
+
+  bssl::UniquePtr<EC_POINT> point2(EC_POINT_new(group()));
+  ASSERT_TRUE(point2);
+  ASSERT_TRUE(EC_POINT_mul(group(), point2.get(), forty_two.get(), nullptr,
+                           nullptr, nullptr));
+  EXPECT_EQ(0, EC_POINT_cmp(group(), point.get(), point2.get(), nullptr));
+}
+
+// Test that EC_KEY_set_private_key rejects invalid values.
+TEST_P(ECCurveTest, SetInvalidPrivateKey) {
+  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(GetParam().nid));
+  ASSERT_TRUE(key);
+
+  bssl::UniquePtr<BIGNUM> bn(BN_new());
+  ASSERT_TRUE(BN_one(bn.get()));
+  BN_set_negative(bn.get(), 1);
+  EXPECT_FALSE(EC_KEY_set_private_key(key.get(), bn.get()))
+      << "Unexpectedly set a key of -1";
+  ERR_clear_error();
+
+  ASSERT_TRUE(
+      BN_copy(bn.get(), EC_GROUP_get0_order(EC_KEY_get0_group(key.get()))));
+  EXPECT_FALSE(EC_KEY_set_private_key(key.get(), bn.get()))
+      << "Unexpectedly set a key of the group order.";
+  ERR_clear_error();
+}
+
+TEST_P(ECCurveTest, IgnoreOct2PointReturnValue) {
+  bssl::UniquePtr<BIGNUM> forty_two(BN_new());
+  ASSERT_TRUE(forty_two);
+  ASSERT_TRUE(BN_set_word(forty_two.get(), 42));
+
+  // Compute g × 42.
+  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group()));
+  ASSERT_TRUE(point);
+  ASSERT_TRUE(EC_POINT_mul(group(), point.get(), forty_two.get(), nullptr,
+                           nullptr, nullptr));
+
+  // Serialize the point.
+  size_t serialized_len = EC_POINT_point2oct(
+      group(), point.get(), POINT_CONVERSION_UNCOMPRESSED, nullptr, 0, nullptr);
+  ASSERT_NE(0u, serialized_len);
+
+  std::vector<uint8_t> serialized(serialized_len);
+  ASSERT_EQ(
+      serialized_len,
+      EC_POINT_point2oct(group(), point.get(), POINT_CONVERSION_UNCOMPRESSED,
+                         serialized.data(), serialized_len, nullptr));
+
+  // Create a serialized point that is not on the curve.
+  serialized[serialized_len - 1]++;
+
+  ASSERT_FALSE(EC_POINT_oct2point(group(), point.get(), serialized.data(),
+                                  serialized.size(), nullptr));
+  // After a failure, |point| should have been set to the generator to defend
+  // against code that doesn't check the return value.
+  ASSERT_EQ(0, EC_POINT_cmp(group(), point.get(),
+                            EC_GROUP_get0_generator(group()), nullptr));
+}
+
+TEST_P(ECCurveTest, DoubleSpecialCase) {
+  const EC_POINT *g = EC_GROUP_get0_generator(group());
+
+  bssl::UniquePtr<EC_POINT> two_g(EC_POINT_new(group()));
+  ASSERT_TRUE(two_g);
+  ASSERT_TRUE(EC_POINT_dbl(group(), two_g.get(), g, nullptr));
+
+  bssl::UniquePtr<EC_POINT> p(EC_POINT_new(group()));
+  ASSERT_TRUE(p);
+  ASSERT_TRUE(EC_POINT_mul(group(), p.get(), BN_value_one(), g, BN_value_one(),
+                           nullptr));
+  EXPECT_EQ(0, EC_POINT_cmp(group(), p.get(), two_g.get(), nullptr));
+
+  EC_SCALAR one;
+  ASSERT_TRUE(ec_bignum_to_scalar(group(), &one, BN_value_one()));
+  ASSERT_TRUE(
+      ec_point_mul_scalar_public(group(), p.get(), &one, g, &one, nullptr));
+  EXPECT_EQ(0, EC_POINT_cmp(group(), p.get(), two_g.get(), nullptr));
 }

 static std::vector<EC_builtin_curve> AllCurves() {
@@ -91,10 +91,9 @@ extern "C" {
 OPENSSL_COMPILE_ASSERT(EC_MAX_SCALAR_WORDS <= BN_SMALL_MAX_WORDS,
                       bn_small_functions_applicable);

-// An EC_SCALAR is a |BN_num_bits(order)|-bit integer. Only the first
-// |order->top| words are used. An |EC_SCALAR| is specific to an |EC_GROUP| and
-// must not be mixed between groups. Unless otherwise specified, it is fully
-// reduced modulo the |order|.
+// An EC_SCALAR is an integer fully reduced modulo the order. Only the first
+// |order->width| words are used. An |EC_SCALAR| is specific to an |EC_GROUP|
+// and must not be mixed between groups.
 typedef union {
  // bytes is the representation of the scalar in little-endian order.
  uint8_t bytes[EC_MAX_SCALAR_BYTES];
@@ -116,6 +115,12 @@ struct ec_method_st {
  // non-null.
  int (*mul)(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
             const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
+  // mul_public performs the same computation as mul. It further assumes that
+  // the inputs are public so there is no concern about leaking their values
+  // through timing.
+  int (*mul_public)(const EC_GROUP *group, EC_POINT *r,
+                    const EC_SCALAR *g_scalar, const EC_POINT *p,
+                    const EC_SCALAR *p_scalar, BN_CTX *ctx);

  // 'field_mul' and 'field_sqr' can be used by 'add' and 'dbl' so that the
  // same implementations of point operations can be used with different
@@ -173,12 +178,15 @@ struct ec_point_st {

 EC_GROUP *ec_group_new(const EC_METHOD *meth);

-// ec_bignum_to_scalar converts |in| to an |EC_SCALAR| and writes it to |*out|.
-// |in| must be non-negative and have at most |BN_num_bits(&group->order)| bits.
-// It returns one on success and zero on error. It does not ensure |in| is fully
-// reduced.
-int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
-                        const BIGNUM *in);
+// ec_bignum_to_scalar converts |in| to an |EC_SCALAR| and writes it to
+// |*out|. It returns one on success and zero if |in| is out of range.
+OPENSSL_EXPORT int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
+                                       const BIGNUM *in);
+
+// ec_bignum_to_scalar_unchecked behaves like |ec_bignum_to_scalar| but does not
+// check |in| is fully reduced.
+int ec_bignum_to_scalar_unchecked(const EC_GROUP *group, EC_SCALAR *out,
+                                  const BIGNUM *in);

 // ec_random_nonzero_scalar sets |out| to a uniformly selected random value from
 // 1 to |group->order| - 1. It returns one on success and zero on error.
@@ -193,6 +201,24 @@ int ec_point_mul_scalar(const EC_GROUP *group, EC_POINT *r,
                        const EC_SCALAR *g_scalar, const EC_POINT *p,
                        const EC_SCALAR *p_scalar, BN_CTX *ctx);

+// ec_point_mul_scalar_public performs the same computation as
+// |ec_point_mul_scalar|. It further assumes that the inputs are public so
+// there is no concern about leaking their values through timing.
+OPENSSL_EXPORT int ec_point_mul_scalar_public(
+    const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
+    const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
+
+// ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
+// |scalar| to |out| and returns one on success or zero on internal error. |out|
+// must have room for |bits| + 1 elements. Each element will be either zero or
+// odd with an absolute value less than 2^w, and together they satisfy
+//     scalar = \sum_j out[j]*2^j
+// At most one of any w+1 consecutive digits is non-zero, with the exception
+// that the most significant digit may be only w-1 zeros away from the next
+// non-zero digit.
+int ec_compute_wNAF(const EC_GROUP *group, int8_t *out, const EC_SCALAR *scalar,
+                    size_t bits, int w);
+
 int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
                const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);

@@ -208,16 +234,9 @@ int ec_GFp_simple_point_init(EC_POINT *);
 void ec_GFp_simple_point_finish(EC_POINT *);
 int ec_GFp_simple_point_copy(EC_POINT *, const EC_POINT *);
 int ec_GFp_simple_point_set_to_infinity(const EC_GROUP *, EC_POINT *);
-int ec_GFp_simple_set_Jprojective_coordinates_GFp(const EC_GROUP *, EC_POINT *,
-                                                  const BIGNUM *x,
-                                                  const BIGNUM *y,
-                                                  const BIGNUM *z, BN_CTX *);
 int ec_GFp_simple_point_set_affine_coordinates(const EC_GROUP *, EC_POINT *,
                                               const BIGNUM *x, const BIGNUM *y,
                                               BN_CTX *);
-int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *, EC_POINT *,
-                                             const BIGNUM *x, int y_bit,
-                                             BN_CTX *);
 int ec_GFp_simple_add(const EC_GROUP *, EC_POINT *r, const EC_POINT *a,
                      const EC_POINT *b, BN_CTX *);
 int ec_GFp_simple_dbl(const EC_GROUP *, EC_POINT *r, const EC_POINT *a,
@@ -249,11 +268,6 @@ int ec_GFp_mont_field_encode(const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
 int ec_GFp_mont_field_decode(const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
                             BN_CTX *);

-int ec_point_set_Jprojective_coordinates_GFp(const EC_GROUP *group,
-                                             EC_POINT *point, const BIGNUM *x,
-                                             const BIGNUM *y, const BIGNUM *z,
-                                             BN_CTX *ctx);
-
 void ec_GFp_nistp_recode_scalar_bits(uint8_t *sign, uint8_t *digit, uint8_t in);

 const EC_METHOD *EC_GFp_nistp224_method(void);
@@ -77,11 +77,9 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
                                      const EC_POINT *point,
                                      point_conversion_form_t form,
                                      uint8_t *buf, size_t len, BN_CTX *ctx) {
-  size_t ret;
+  size_t ret = 0;
  BN_CTX *new_ctx = NULL;
  int used_ctx = 0;
-  BIGNUM *x, *y;
-  size_t field_len, i;

  if ((form != POINT_CONVERSION_COMPRESSED) &&
      (form != POINT_CONVERSION_UNCOMPRESSED)) {
@@ -94,14 +92,16 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
    goto err;
  }

-  // ret := required output buffer length
-  field_len = BN_num_bytes(&group->field);
-  ret =
-      (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2 * field_len;
+  const size_t field_len = BN_num_bytes(&group->field);
+  size_t output_len = 1 /* type byte */ + field_len;
+  if (form == POINT_CONVERSION_UNCOMPRESSED) {
+    // Uncompressed points have a second coordinate.
+    output_len += field_len;
+  }

  // if 'buf' is NULL, just return required length
  if (buf != NULL) {
-    if (len < ret) {
+    if (len < output_len) {
      OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL);
      goto err;
    }
@@ -115,8 +115,8 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,

    BN_CTX_start(ctx);
    used_ctx = 1;
-    x = BN_CTX_get(ctx);
-    y = BN_CTX_get(ctx);
+    BIGNUM *x = BN_CTX_get(ctx);
+    BIGNUM *y = BN_CTX_get(ctx);
    if (y == NULL) {
      goto err;
    }
@@ -131,7 +131,7 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
    } else {
      buf[0] = form;
    }
-    i = 1;
+    size_t i = 1;

    if (!BN_bn2bin_padded(buf + i, field_len, x)) {
      OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
@@ -147,70 +147,66 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
      i += field_len;
    }

-    if (i != ret) {
+    if (i != output_len) {
      OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
      goto err;
    }
  }

-  if (used_ctx) {
-    BN_CTX_end(ctx);
-  }
-  BN_CTX_free(new_ctx);
-  return ret;
+  ret = output_len;

 err:
  if (used_ctx) {
    BN_CTX_end(ctx);
  }
  BN_CTX_free(new_ctx);
-  return 0;
+  return ret;
 }

-
 static int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
                                   const uint8_t *buf, size_t len,
                                   BN_CTX *ctx) {
-  point_conversion_form_t form;
-  int y_bit;
  BN_CTX *new_ctx = NULL;
-  BIGNUM *x, *y;
-  size_t field_len, enc_len;
-  int ret = 0;
+  int ret = 0, used_ctx = 0;

  if (len == 0) {
    OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL);
-    return 0;
+    goto err;
  }
-  form = buf[0];
-  y_bit = form & 1;
+
+  point_conversion_form_t form = buf[0];
+  const int y_bit = form & 1;
  form = form & ~1U;
  if ((form != POINT_CONVERSION_COMPRESSED &&
       form != POINT_CONVERSION_UNCOMPRESSED) ||
      (form == POINT_CONVERSION_UNCOMPRESSED && y_bit)) {
    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING);
-    return 0;
+    goto err;
  }

-  field_len = BN_num_bytes(&group->field);
-  enc_len =
-      (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2 * field_len;
+  const size_t field_len = BN_num_bytes(&group->field);
+  size_t enc_len = 1 /* type byte */ + field_len;
+  if (form == POINT_CONVERSION_UNCOMPRESSED) {
+    // Uncompressed points have a second coordinate.
+    enc_len += field_len;
+  }

  if (len != enc_len) {
    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING);
-    return 0;
+    goto err;
  }

  if (ctx == NULL) {
    ctx = new_ctx = BN_CTX_new();
    if (ctx == NULL) {
-      return 0;
+      goto err;
    }
  }

  BN_CTX_start(ctx);
-  x = BN_CTX_get(ctx);
-  y = BN_CTX_get(ctx);
+  used_ctx = 1;
+  BIGNUM *x = BN_CTX_get(ctx);
+  BIGNUM *y = BN_CTX_get(ctx);
  if (x == NULL || y == NULL) {
    goto err;
  }
@@ -244,7 +240,9 @@ static int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
  ret = 1;

 err:
-  BN_CTX_end(ctx);
+  if (used_ctx) {
+    BN_CTX_end(ctx);
+  }
  BN_CTX_free(new_ctx);
  return ret;
 }
@@ -268,16 +266,20 @@ size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point,
  return ec_GFp_simple_point2oct(group, point, form, buf, len, ctx);
 }

-int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group,
-                                             EC_POINT *point, const BIGNUM *x,
-                                             int y_bit, BN_CTX *ctx) {
+int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group,
+                                            EC_POINT *point, const BIGNUM *x,
+                                            int y_bit, BN_CTX *ctx) {
+  if (EC_GROUP_cmp(group, point->group, NULL) != 0) {
+    OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
+    return 0;
+  }
+
  if (BN_is_negative(x) || BN_cmp(x, &group->field) >= 0) {
    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COMPRESSED_POINT);
    return 0;
  }

  BN_CTX *new_ctx = NULL;
-  BIGNUM *tmp1, *tmp2, *y;
  int ret = 0;

  ERR_clear_error();
@@ -292,10 +294,13 @@ int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group,
  y_bit = (y_bit != 0);

  BN_CTX_start(ctx);
-  tmp1 = BN_CTX_get(ctx);
-  tmp2 = BN_CTX_get(ctx);
-  y = BN_CTX_get(ctx);
-  if (y == NULL) {
+  BIGNUM *tmp1 = BN_CTX_get(ctx);
+  BIGNUM *tmp2 = BN_CTX_get(ctx);
+  BIGNUM *a = BN_CTX_get(ctx);
+  BIGNUM *b = BN_CTX_get(ctx);
+  BIGNUM *y = BN_CTX_get(ctx);
+  if (y == NULL ||
+      !EC_GROUP_get_curve_GFp(group, NULL, a, b, ctx)) {
    goto err;
  }

@@ -304,54 +309,28 @@ int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group,
  // so  y  is one of the square roots of  x^3 + a*x + b.

  // tmp1 := x^3
-  if (group->meth->field_decode == 0) {
-    // field_{sqr,mul} work on standard representation
-    if (!group->meth->field_sqr(group, tmp2, x, ctx) ||
-        !group->meth->field_mul(group, tmp1, tmp2, x, ctx)) {
-      goto err;
-    }
-  } else {
-    if (!BN_mod_sqr(tmp2, x, &group->field, ctx) ||
-        !BN_mod_mul(tmp1, tmp2, x, &group->field, ctx)) {
-      goto err;
-    }
+  if (!BN_mod_sqr(tmp2, x, &group->field, ctx) ||
+      !BN_mod_mul(tmp1, tmp2, x, &group->field, ctx)) {
+    goto err;
  }

  // tmp1 := tmp1 + a*x
  if (group->a_is_minus3) {
-    if (!BN_mod_lshift1_quick(tmp2, x, &group->field) ||
-        !BN_mod_add_quick(tmp2, tmp2, x, &group->field) ||
-        !BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) {
+    if (!bn_mod_lshift1_quick_ctx(tmp2, x, &group->field, ctx) ||
+        !bn_mod_add_quick_ctx(tmp2, tmp2, x, &group->field, ctx) ||
+        !bn_mod_sub_quick_ctx(tmp1, tmp1, tmp2, &group->field, ctx)) {
      goto err;
    }
  } else {
-    if (group->meth->field_decode) {
-      if (!group->meth->field_decode(group, tmp2, &group->a, ctx) ||
-          !BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) {
-        goto err;
-      }
-    } else {
-      // field_mul works on standard representation
-      if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) {
-        goto err;
-      }
-    }
-
-    if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) {
+    if (!BN_mod_mul(tmp2, a, x, &group->field, ctx) ||
+        !bn_mod_add_quick_ctx(tmp1, tmp1, tmp2, &group->field, ctx)) {
      goto err;
    }
  }

  // tmp1 := tmp1 + b
-  if (group->meth->field_decode) {
-    if (!group->meth->field_decode(group, tmp2, &group->b, ctx) ||
-        !BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) {
-      goto err;
-    }
-  } else {
-    if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) {
-      goto err;
-    }
+  if (!bn_mod_add_quick_ctx(tmp1, tmp1, b, &group->field, ctx)) {
+    goto err;
  }

  if (!BN_mod_sqrt(y, tmp1, &group->field, ctx)) {
@@ -392,13 +371,3 @@ err:
  BN_CTX_free(new_ctx);
  return ret;
 }
-
-int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group,
-                                            EC_POINT *point, const BIGNUM *x,
-                                            int y_bit, BN_CTX *ctx) {
-  if (EC_GROUP_cmp(group, point->group, NULL) != 0) {
-    OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
-    return 0;
-  }
-  return ec_GFp_simple_set_compressed_coordinates(group, point, x, y_bit, ctx);
-}
@@ -19,9 +19,6 @@

 #include <openssl/base.h>

-#if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS) && \
-    !defined(OPENSSL_SMALL)
-
 #include <openssl/bn.h>
 #include <openssl/ec.h>
 #include <openssl/err.h>
@@ -34,6 +31,8 @@
 #include "../../internal.h"


+#if defined(BORINGSSL_HAS_UINT128) && !defined(OPENSSL_SMALL)
+
 // Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
 // using 64-bit coefficients called 'limbs', and sometimes (for multiplication
 // results) as b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 +
@@ -1016,22 +1015,27 @@ static int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
  p224_felem_inv(z2, z1);
  p224_felem_square(tmp, z2);
  p224_felem_reduce(z1, tmp);
-  p224_felem_mul(tmp, x_in, z1);
-  p224_felem_reduce(x_in, tmp);
-  p224_felem_contract(x_out, x_in);
-  if (x != NULL && !p224_felem_to_BN(x, x_out)) {
-    OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
-    return 0;
+
+  if (x != NULL) {
+    p224_felem_mul(tmp, x_in, z1);
+    p224_felem_reduce(x_in, tmp);
+    p224_felem_contract(x_out, x_in);
+    if (!p224_felem_to_BN(x, x_out)) {
+      OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
+      return 0;
+    }
  }

-  p224_felem_mul(tmp, z1, z2);
-  p224_felem_reduce(z1, tmp);
-  p224_felem_mul(tmp, y_in, z1);
-  p224_felem_reduce(y_in, tmp);
-  p224_felem_contract(y_out, y_in);
-  if (y != NULL && !p224_felem_to_BN(y, y_out)) {
-    OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
-    return 0;
+  if (y != NULL) {
+    p224_felem_mul(tmp, z1, z2);
+    p224_felem_reduce(z1, tmp);
+    p224_felem_mul(tmp, y_in, z1);
+    p224_felem_reduce(y_in, tmp);
+    p224_felem_contract(y_out, y_in);
+    if (!p224_felem_to_BN(y, y_out)) {
+      OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
+      return 0;
+    }
  }

  return 1;
@@ -1041,28 +1045,9 @@ static int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
                                      const EC_SCALAR *g_scalar,
                                      const EC_POINT *p,
                                      const EC_SCALAR *p_scalar, BN_CTX *ctx) {
-  int ret = 0;
-  BN_CTX *new_ctx = NULL;
-  BIGNUM *x, *y, *z, *tmp_scalar;
  p224_felem p_pre_comp[17][3];
  p224_felem x_in, y_in, z_in, x_out, y_out, z_out;

-  if (ctx == NULL) {
-    ctx = BN_CTX_new();
-    new_ctx = ctx;
-    if (ctx == NULL) {
-      return 0;
-    }
-  }
-
-  BN_CTX_start(ctx);
-  if ((x = BN_CTX_get(ctx)) == NULL ||
-      (y = BN_CTX_get(ctx)) == NULL ||
-      (z = BN_CTX_get(ctx)) == NULL ||
-      (tmp_scalar = BN_CTX_get(ctx)) == NULL) {
-    goto err;
-  }
-
  if (p != NULL && p_scalar != NULL) {
    // We treat NULL scalars as 0, and NULL points as points at infinity, i.e.,
    // they contribute nothing to the linear combination.
@@ -1071,7 +1056,7 @@ static int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
    if (!p224_BN_to_felem(x_out, &p->X) ||
        !p224_BN_to_felem(y_out, &p->Y) ||
        !p224_BN_to_felem(z_out, &p->Z)) {
-      goto err;
+      return 0;
    }

    p224_felem_assign(p_pre_comp[1][0], x_out);
@@ -1101,18 +1086,13 @@ static int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
  p224_felem_contract(x_in, x_out);
  p224_felem_contract(y_in, y_out);
  p224_felem_contract(z_in, z_out);
-  if (!p224_felem_to_BN(x, x_in) ||
-      !p224_felem_to_BN(y, y_in) ||
-      !p224_felem_to_BN(z, z_in)) {
+  if (!p224_felem_to_BN(&r->X, x_in) ||
+      !p224_felem_to_BN(&r->Y, y_in) ||
+      !p224_felem_to_BN(&r->Z, z_in)) {
    OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
-    goto err;
+    return 0;
  }
-  ret = ec_point_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
-
-err:
-  BN_CTX_end(ctx);
-  BN_CTX_free(new_ctx);
-  return ret;
+  return 1;
 }

 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp224_method) {
@@ -1122,10 +1102,11 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp224_method) {
  out->point_get_affine_coordinates =
      ec_GFp_nistp224_point_get_affine_coordinates;
  out->mul = ec_GFp_nistp224_points_mul;
+  out->mul_public = ec_GFp_nistp224_points_mul;
  out->field_mul = ec_GFp_simple_field_mul;
  out->field_sqr = ec_GFp_simple_field_sqr;
  out->field_encode = NULL;
  out->field_decode = NULL;
 };

-#endif  // 64_BIT && !WINDOWS && !SMALL
+#endif  // BORINGSSL_HAS_UINT128 && !SMALL
@@ -1,24 +1,20 @@
-/* Copyright (c) 2014, Intel Corporation.
+/*
+ * Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2014, Intel Corporation. All Rights Reserved.
 *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// Developers and authors:
-// Shay Gueron (1, 2), and Vlad Krasnov (1)
-// (1) Intel Corporation, Israel Development Center
-// (2) University of Haifa
-// Reference:
-// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
-//                          256 Bit Primes"
+ * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+ * (1) Intel Corporation, Israel Development Center, Haifa, Israel
+ * (2) University of Haifa, Israel
+ *
+ * Reference:
+ * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
+ *                          256 Bit Primes"
+ */

 #include <openssl/ec.h>

@@ -205,13 +201,7 @@ static void ecp_nistz256_mod_inverse_mont(BN_ULONG r[P256_LIMBS],
 // returns one if it fits. Otherwise it returns zero.
 static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS],
                                             const BIGNUM *in) {
-  if (in->top > P256_LIMBS) {
-    return 0;
-  }
-
-  OPENSSL_memset(out, 0, sizeof(BN_ULONG) * P256_LIMBS);
-  OPENSSL_memcpy(out, in->d, sizeof(BN_ULONG) * in->top);
-  return 1;
+  return bn_copy_words(out, P256_LIMBS, in);
 }

 // r = p * p_scalar
@@ -446,6 +436,7 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
  out->group_set_curve = ec_GFp_mont_group_set_curve;
  out->point_get_affine_coordinates = ecp_nistz256_get_affine;
  out->mul = ecp_nistz256_points_mul;
+  out->mul_public = ecp_nistz256_points_mul;
  out->field_mul = ec_GFp_mont_field_mul;
  out->field_sqr = ec_GFp_mont_field_sqr;
  out->field_encode = ec_GFp_mont_field_encode;
@@ -1,16 +1,20 @@
-/* Copyright (c) 2014, Intel Corporation.
+/*
+ * Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2014, Intel Corporation. All Rights Reserved.
 *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
 *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+ * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+ * (1) Intel Corporation, Israel Development Center, Haifa, Israel
+ * (2) University of Haifa, Israel
+ *
+ * Reference:
+ * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
+ *                          256 Bit Primes"
+ */

 #ifndef OPENSSL_HEADER_EC_P256_X86_64_H
 #define OPENSSL_HEADER_EC_P256_X86_64_H
@@ -160,17 +160,16 @@ static bool PointToAffine(P256_POINT_AFFINE *out, const P256_POINT *in) {
    return false;
  }

-  OPENSSL_memset(out, 0, sizeof(P256_POINT_AFFINE));
-
  if (BN_is_zero(z.get())) {
    // The point at infinity is represented as (0, 0).
+    OPENSSL_memset(out, 0, sizeof(P256_POINT_AFFINE));
    return true;
  }

  bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
-  bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+  bssl::UniquePtr<BN_MONT_CTX> mont(
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx.get()));
  if (!ctx || !mont ||
-      !BN_MONT_CTX_set(mont.get(), p.get(), ctx.get()) ||
      // Invert Z.
      !BN_from_montgomery(z.get(), z.get(), mont.get(), ctx.get()) ||
      !BN_mod_inverse(z.get(), z.get(), p.get(), ctx.get()) ||
@@ -185,12 +184,11 @@ static bool PointToAffine(P256_POINT_AFFINE *out, const P256_POINT *in) {
      !BN_mod_mul_montgomery(y.get(), y.get(), z.get(), mont.get(),
                             ctx.get()) ||
      !BN_mod_mul_montgomery(y.get(), y.get(), z.get(), mont.get(),
-                             ctx.get())) {
+                             ctx.get()) ||
+      !bn_copy_words(out->X, P256_LIMBS, x.get()) ||
+      !bn_copy_words(out->Y, P256_LIMBS, y.get())) {
    return false;
  }
-
-  OPENSSL_memcpy(out->X, x->d, sizeof(BN_ULONG) * x->top);
-  OPENSSL_memcpy(out->Y, y->d, sizeof(BN_ULONG) * y->top);
  return true;
 }

@@ -135,9 +135,11 @@ int ec_GFp_simple_group_set_curve(EC_GROUP *group, const BIGNUM *p,
    goto err;
  }
  BN_set_negative(&group->field, 0);
+  // Store the field in minimal form, so it can be used with |BN_ULONG| arrays.
+  bn_set_minimal_width(&group->field);

  // group->a
-  if (!BN_nnmod(tmp_a, a, p, ctx)) {
+  if (!BN_nnmod(tmp_a, a, &group->field, ctx)) {
    goto err;
  }
  if (group->meth->field_encode) {
@@ -149,7 +151,7 @@ int ec_GFp_simple_group_set_curve(EC_GROUP *group, const BIGNUM *p,
  }

  // group->b
-  if (!BN_nnmod(&group->b, b, p, ctx)) {
+  if (!BN_nnmod(&group->b, b, &group->field, ctx)) {
    goto err;
  }
  if (group->meth->field_encode &&
@@ -269,9 +271,14 @@ static int set_Jprojective_coordinate_GFp(const EC_GROUP *group, BIGNUM *out,
  return BN_copy(out, in) != NULL;
 }

-int ec_GFp_simple_set_Jprojective_coordinates_GFp(
-    const EC_GROUP *group, EC_POINT *point, const BIGNUM *x, const BIGNUM *y,
-    const BIGNUM *z, BN_CTX *ctx) {
+int ec_GFp_simple_point_set_affine_coordinates(const EC_GROUP *group,
+                                               EC_POINT *point, const BIGNUM *x,
+                                               const BIGNUM *y, BN_CTX *ctx) {
+  if (x == NULL || y == NULL) {
+    OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER);
+    return 0;
+  }
+
  BN_CTX *new_ctx = NULL;
  int ret = 0;

@@ -284,7 +291,7 @@ int ec_GFp_simple_set_Jprojective_coordinates_GFp(

  if (!set_Jprojective_coordinate_GFp(group, &point->X, x, ctx) ||
      !set_Jprojective_coordinate_GFp(group, &point->Y, y, ctx) ||
-      !set_Jprojective_coordinate_GFp(group, &point->Z, z, ctx)) {
+      !BN_copy(&point->Z, &group->one)) {
    goto err;
  }

@@ -295,19 +302,6 @@ err:
  return ret;
 }

-int ec_GFp_simple_point_set_affine_coordinates(const EC_GROUP *group,
-                                               EC_POINT *point, const BIGNUM *x,
-                                               const BIGNUM *y, BN_CTX *ctx) {
-  if (x == NULL || y == NULL) {
-    // unlike for projective coordinates, we do not tolerate this
-    OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER);
-    return 0;
-  }
-
-  return ec_point_set_Jprojective_coordinates_GFp(group, point, x, y,
-                                                  BN_value_one(), ctx);
-}
-
 int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
                      const EC_POINT *b, BN_CTX *ctx) {
  int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *,
@@ -401,8 +395,8 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  }

  // n5, n6
-  if (!BN_mod_sub_quick(n5, n1, n3, p) ||
-      !BN_mod_sub_quick(n6, n2, n4, p)) {
+  if (!bn_mod_sub_quick_ctx(n5, n1, n3, p, ctx) ||
+      !bn_mod_sub_quick_ctx(n6, n2, n4, p, ctx)) {
    goto end;
  }
  // n5 = n1 - n3
@@ -424,8 +418,8 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  }

  // 'n7', 'n8'
-  if (!BN_mod_add_quick(n1, n1, n3, p) ||
-      !BN_mod_add_quick(n2, n2, n4, p)) {
+  if (!bn_mod_add_quick_ctx(n1, n1, n3, p, ctx) ||
+      !bn_mod_add_quick_ctx(n2, n2, n4, p, ctx)) {
    goto end;
  }
  // 'n7' = n1 + n3
@@ -459,14 +453,14 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  if (!field_sqr(group, n0, n6, ctx) ||
      !field_sqr(group, n4, n5, ctx) ||
      !field_mul(group, n3, n1, n4, ctx) ||
-      !BN_mod_sub_quick(&r->X, n0, n3, p)) {
+      !bn_mod_sub_quick_ctx(&r->X, n0, n3, p, ctx)) {
    goto end;
  }
  // X_r = n6^2 - n5^2 * 'n7'

  // 'n9'
-  if (!BN_mod_lshift1_quick(n0, &r->X, p) ||
-      !BN_mod_sub_quick(n0, n3, n0, p)) {
+  if (!bn_mod_lshift1_quick_ctx(n0, &r->X, p, ctx) ||
+      !bn_mod_sub_quick_ctx(n0, n3, n0, p, ctx)) {
    goto end;
  }
  // n9 = n5^2 * 'n7' - 2 * X_r
@@ -477,7 +471,7 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
    goto end;  // now n5 is n5^3
  }
  if (!field_mul(group, n1, n2, n5, ctx) ||
-      !BN_mod_sub_quick(n0, n0, n1, p)) {
+      !bn_mod_sub_quick_ctx(n0, n0, n1, p, ctx)) {
    goto end;
  }
  if (BN_is_odd(n0) && !BN_add(n0, n0, p)) {
@@ -542,31 +536,31 @@ int ec_GFp_simple_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  // n1
  if (BN_cmp(&a->Z, &group->one) == 0) {
    if (!field_sqr(group, n0, &a->X, ctx) ||
-        !BN_mod_lshift1_quick(n1, n0, p) ||
-        !BN_mod_add_quick(n0, n0, n1, p) ||
-        !BN_mod_add_quick(n1, n0, &group->a, p)) {
+        !bn_mod_lshift1_quick_ctx(n1, n0, p, ctx) ||
+        !bn_mod_add_quick_ctx(n0, n0, n1, p, ctx) ||
+        !bn_mod_add_quick_ctx(n1, n0, &group->a, p, ctx)) {
      goto err;
    }
    // n1 = 3 * X_a^2 + a_curve
  } else if (group->a_is_minus3) {
    if (!field_sqr(group, n1, &a->Z, ctx) ||
-        !BN_mod_add_quick(n0, &a->X, n1, p) ||
-        !BN_mod_sub_quick(n2, &a->X, n1, p) ||
+        !bn_mod_add_quick_ctx(n0, &a->X, n1, p, ctx) ||
+        !bn_mod_sub_quick_ctx(n2, &a->X, n1, p, ctx) ||
        !field_mul(group, n1, n0, n2, ctx) ||
-        !BN_mod_lshift1_quick(n0, n1, p) ||
-        !BN_mod_add_quick(n1, n0, n1, p)) {
+        !bn_mod_lshift1_quick_ctx(n0, n1, p, ctx) ||
+        !bn_mod_add_quick_ctx(n1, n0, n1, p, ctx)) {
      goto err;
    }
    // n1 = 3 * (X_a + Z_a^2) * (X_a - Z_a^2)
    //    = 3 * X_a^2 - 3 * Z_a^4
  } else {
    if (!field_sqr(group, n0, &a->X, ctx) ||
-        !BN_mod_lshift1_quick(n1, n0, p) ||
-        !BN_mod_add_quick(n0, n0, n1, p) ||
+        !bn_mod_lshift1_quick_ctx(n1, n0, p, ctx) ||
+        !bn_mod_add_quick_ctx(n0, n0, n1, p, ctx) ||
        !field_sqr(group, n1, &a->Z, ctx) ||
        !field_sqr(group, n1, n1, ctx) ||
        !field_mul(group, n1, n1, &group->a, ctx) ||
-        !BN_mod_add_quick(n1, n1, n0, p)) {
+        !bn_mod_add_quick_ctx(n1, n1, n0, p, ctx)) {
      goto err;
    }
    // n1 = 3 * X_a^2 + a_curve * Z_a^4
@@ -580,7 +574,7 @@ int ec_GFp_simple_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  } else if (!field_mul(group, n0, &a->Y, &a->Z, ctx)) {
    goto err;
  }
-  if (!BN_mod_lshift1_quick(&r->Z, n0, p)) {
+  if (!bn_mod_lshift1_quick_ctx(&r->Z, n0, p, ctx)) {
    goto err;
  }
  // Z_r = 2 * Y_a * Z_a
@@ -588,30 +582,30 @@ int ec_GFp_simple_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  // n2
  if (!field_sqr(group, n3, &a->Y, ctx) ||
      !field_mul(group, n2, &a->X, n3, ctx) ||
-      !BN_mod_lshift_quick(n2, n2, 2, p)) {
+      !bn_mod_lshift_quick_ctx(n2, n2, 2, p, ctx)) {
    goto err;
  }
  // n2 = 4 * X_a * Y_a^2

  // X_r
-  if (!BN_mod_lshift1_quick(n0, n2, p) ||
+  if (!bn_mod_lshift1_quick_ctx(n0, n2, p, ctx) ||
      !field_sqr(group, &r->X, n1, ctx) ||
-      !BN_mod_sub_quick(&r->X, &r->X, n0, p)) {
+      !bn_mod_sub_quick_ctx(&r->X, &r->X, n0, p, ctx)) {
    goto err;
  }
  // X_r = n1^2 - 2 * n2

  // n3
  if (!field_sqr(group, n0, n3, ctx) ||
-      !BN_mod_lshift_quick(n3, n0, 3, p)) {
+      !bn_mod_lshift_quick_ctx(n3, n0, 3, p, ctx)) {
    goto err;
  }
  // n3 = 8 * Y_a^4

  // Y_r
-  if (!BN_mod_sub_quick(n0, n2, &r->X, p) ||
+  if (!bn_mod_sub_quick_ctx(n0, n2, &r->X, p, ctx) ||
      !field_mul(group, n0, n1, n0, ctx) ||
-      !BN_mod_sub_quick(&r->Y, n0, n3, p)) {
+      !bn_mod_sub_quick_ctx(&r->Y, n0, n3, p, ctx)) {
    goto err;
  }
  // Y_r = n1 * (n2 - X_r) - n3
@@ -694,15 +688,15 @@ int ec_GFp_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point,

    // rh := (rh + a*Z^4)*X
    if (group->a_is_minus3) {
-      if (!BN_mod_lshift1_quick(tmp, Z4, p) ||
-          !BN_mod_add_quick(tmp, tmp, Z4, p) ||
-          !BN_mod_sub_quick(rh, rh, tmp, p) ||
+      if (!bn_mod_lshift1_quick_ctx(tmp, Z4, p, ctx) ||
+          !bn_mod_add_quick_ctx(tmp, tmp, Z4, p, ctx) ||
+          !bn_mod_sub_quick_ctx(rh, rh, tmp, p, ctx) ||
          !field_mul(group, rh, rh, &point->X, ctx)) {
        goto err;
      }
    } else {
      if (!field_mul(group, tmp, Z4, &group->a, ctx) ||
-          !BN_mod_add_quick(rh, rh, tmp, p) ||
+          !bn_mod_add_quick_ctx(rh, rh, tmp, p, ctx) ||
          !field_mul(group, rh, rh, &point->X, ctx)) {
        goto err;
      }
@@ -710,17 +704,17 @@ int ec_GFp_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point,

    // rh := rh + b*Z^6
    if (!field_mul(group, tmp, &group->b, Z6, ctx) ||
-        !BN_mod_add_quick(rh, rh, tmp, p)) {
+        !bn_mod_add_quick_ctx(rh, rh, tmp, p, ctx)) {
      goto err;
    }
  } else {
    // rh := (rh + a)*X
-    if (!BN_mod_add_quick(rh, rh, &group->a, p) ||
+    if (!bn_mod_add_quick_ctx(rh, rh, &group->a, p, ctx) ||
        !field_mul(group, rh, rh, &point->X, ctx)) {
      goto err;
    }
    // rh := rh + b
-    if (!BN_mod_add_quick(rh, rh, &group->b, p)) {
+    if (!bn_mod_add_quick_ctx(rh, rh, &group->b, p, ctx)) {
      goto err;
    }
  }
@@ -14,9 +14,6 @@

 #include <openssl/base.h>

-
-#if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS)
-
 #include <openssl/ec.h>

 #include "internal.h"
@@ -105,5 +102,3 @@ void ec_GFp_nistp_recode_scalar_bits(uint8_t *sign, uint8_t *digit,
  *sign = s & 1;
  *digit = d;
 }
-
-#endif  // 64_BIT && !WINDOWS
@@ -73,8 +73,10 @@
 #include <openssl/err.h>
 #include <openssl/mem.h>
 #include <openssl/thread.h>
+#include <openssl/type_check.h>

 #include "internal.h"
+#include "../bn/internal.h"
 #include "../../internal.h"


@@ -83,58 +85,21 @@
 //   http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13
 //   http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf

-// Determine the modified width-(w+1) Non-Adjacent Form (wNAF) of 'scalar'.
-// This is an array  r[]  of values that are either zero or odd with an
-// absolute value less than  2^w  satisfying
-//     scalar = \sum_j r[j]*2^j
-// where at most one of any  w+1  consecutive digits is non-zero
-// with the exception that the most significant digit may be only
-// w-1 zeros away from that next non-zero digit.
-static int8_t *compute_wNAF(const BIGNUM *scalar, int w, size_t *ret_len) {
-  int window_val;
-  int ok = 0;
-  int8_t *r = NULL;
-  int sign = 1;
-  int bit, next_bit, mask;
-  size_t len = 0, j;
-
-  if (BN_is_zero(scalar)) {
-    r = OPENSSL_malloc(1);
-    if (!r) {
-      OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
-      goto err;
-    }
-    r[0] = 0;
-    *ret_len = 1;
-    return r;
-  }
-
+int ec_compute_wNAF(const EC_GROUP *group, int8_t *out, const EC_SCALAR *scalar,
+                    size_t bits, int w) {
  // 'int8_t' can represent integers with absolute values less than 2^7.
-  if (w <= 0 || w > 7) {
+  if (w <= 0 || w > 7 || bits == 0) {
    OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
-    goto err;
+    return 0;
  }
-  bit = 1 << w;         // at most 128
-  next_bit = bit << 1;  // at most 256
-  mask = next_bit - 1;  // at most 255
+  int bit = 1 << w;         // at most 128
+  int next_bit = bit << 1;  // at most 256
+  int mask = next_bit - 1;  // at most 255

-  if (BN_is_negative(scalar)) {
-    sign = -1;
-  }
-
-  len = BN_num_bits(scalar);
-  // The modified wNAF may be one digit longer than binary representation
-  // (*ret_len will be set to the actual length, i.e. at most
-  // BN_num_bits(scalar) + 1).
-  r = OPENSSL_malloc(len + 1);
-  if (r == NULL) {
-    OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
-    goto err;
-  }
-  window_val = scalar->d[0] & mask;
-  j = 0;
-  // If j+w+1 >= len, window_val will not increase.
-  while (window_val != 0 || j + w + 1 < len) {
+  int window_val = scalar->words[0] & mask;
+  size_t j = 0;
+  // If j+w+1 >= bits, window_val will not increase.
+  while (window_val != 0 || j + w + 1 < bits) {
    int digit = 0;

    // 0 <= window_val <= 2^(w+1)
@@ -146,7 +111,7 @@ static int8_t *compute_wNAF(const BIGNUM *scalar, int w, size_t *ret_len) {
        digit = window_val - next_bit;  // -2^w < digit < 0

 #if 1  // modified wNAF
-        if (j + w + 1 >= len) {
+        if (j + w + 1 >= bits) {
          // special case for generating modified wNAFs:
          // no new bits will be added into window_val,
          // so using a positive digit here will decrease
@@ -161,7 +126,7 @@ static int8_t *compute_wNAF(const BIGNUM *scalar, int w, size_t *ret_len) {

      if (digit <= -bit || digit >= bit || !(digit & 1)) {
        OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
-        goto err;
+        return 0;
      }

      window_val -= digit;
@@ -170,52 +135,38 @@ static int8_t *compute_wNAF(const BIGNUM *scalar, int w, size_t *ret_len) {
      // for modified window NAFs, it may also be 2^w.
      if (window_val != 0 && window_val != next_bit && window_val != bit) {
        OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
-        goto err;
+        return 0;
      }
    }

-    r[j++] = sign * digit;
+    out[j++] = digit;

    window_val >>= 1;
-    window_val += bit * BN_is_bit_set(scalar, j + w);
+    window_val +=
+        bit * bn_is_bit_set_words(scalar->words, group->order.width, j + w);

    if (window_val > next_bit) {
      OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
-      goto err;
+      return 0;
    }
  }

-  if (j > len + 1) {
+  // Sanity-check the length, then fill the rest of the wNAF with zeros.
+  if (j > bits + 1) {
    OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
-    goto err;
+    return 0;
+  }
+  for (size_t i = j; i < bits + 1; i++) {
+    out[i] = 0;
  }
-  len = j;
-  ok = 1;

-err:
-  if (!ok) {
-    OPENSSL_free(r);
-    r = NULL;
-  }
-  if (ok) {
-    *ret_len = len;
-  }
-  return r;
+  return 1;
 }

-
 // TODO: table should be optimised for the wNAF-based implementation,
 //       sometimes smaller windows will give better performance
 //       (thus the boundaries should be increased)
 static size_t window_bits_for_scalar_size(size_t b) {
-  if (b >= 2000) {
-    return 6;
-  }
-
-  if (b >= 800) {
-    return 5;
-  }
-
  if (b >= 300) {
    return 4;
  }
@@ -231,25 +182,62 @@ static size_t window_bits_for_scalar_size(size_t b) {
  return 1;
 }

-int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r,
-                const EC_SCALAR *g_scalar_raw, const EC_POINT *p,
-                const EC_SCALAR *p_scalar_raw, BN_CTX *ctx) {
+// EC_WNAF_MAX_WINDOW_BITS is the largest value returned by
+// |window_bits_for_scalar_size|.
+#define EC_WNAF_MAX_WINDOW_BITS 4
+
+// compute_precomp sets |out[i]| to a newly-allocated |EC_POINT| containing
+// (2*i+1)*p, for each i with 0 <= i < |len|. It returns one on success and
+// zero on error.
+static int compute_precomp(const EC_GROUP *group, EC_POINT **out,
+                           const EC_POINT *p, size_t len, BN_CTX *ctx) {
+  out[0] = EC_POINT_new(group);
+  if (out[0] == NULL ||
+      !EC_POINT_copy(out[0], p)) {
+    return 0;
+  }
+
+  int ret = 0;
+  EC_POINT *two_p = EC_POINT_new(group);
+  if (two_p == NULL ||
+      !EC_POINT_dbl(group, two_p, p, ctx)) {
+    goto err;
+  }
+
+  for (size_t i = 1; i < len; i++) {
+    out[i] = EC_POINT_new(group);
+    if (out[i] == NULL ||
+        !EC_POINT_add(group, out[i], out[i - 1], two_p, ctx)) {
+      goto err;
+    }
+  }
+
+  ret = 1;
+
+err:
+  EC_POINT_free(two_p);
+  return ret;
+}
+
+static int lookup_precomp(const EC_GROUP *group, EC_POINT *out,
+                          EC_POINT *const *precomp, int digit, BN_CTX *ctx) {
+  if (digit < 0) {
+    digit = -digit;
+    return EC_POINT_copy(out, precomp[digit >> 1]) &&
+           EC_POINT_invert(group, out, ctx);
+  }
+
+  return EC_POINT_copy(out, precomp[digit >> 1]);
+}
+
+int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
+                const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx) {
  BN_CTX *new_ctx = NULL;
-  const EC_POINT *generator = NULL;
+  EC_POINT *precomp_storage[2 * (1 << (EC_WNAF_MAX_WINDOW_BITS - 1))] = {NULL};
+  EC_POINT **g_precomp = NULL, **p_precomp = NULL;
+  int8_t g_wNAF[EC_MAX_SCALAR_BYTES * 8 + 1];
+  int8_t p_wNAF[EC_MAX_SCALAR_BYTES * 8 + 1];
  EC_POINT *tmp = NULL;
-  size_t total_num = 0;
-  size_t i, j;
-  int k;
-  int r_is_inverted = 0;
-  int r_is_at_infinity = 1;
-  size_t *wsize = NULL;      // individual window sizes
-  int8_t **wNAF = NULL;  // individual wNAFs
-  size_t *wNAF_len = NULL;
-  size_t max_len = 0;
-  size_t num_val = 0;
-  EC_POINT **val = NULL;  // precomputation
-  EC_POINT **v;
-  EC_POINT ***val_sub = NULL;  // pointers to sub-arrays of 'val'
  int ret = 0;

  if (ctx == NULL) {
@@ -258,217 +246,109 @@ int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r,
      goto err;
    }
  }
-  BN_CTX_start(ctx);

-  // Convert from |EC_SCALAR| to |BIGNUM|. |BIGNUM| is not constant-time, but
-  // neither is the rest of this function.
-  BIGNUM *g_scalar = NULL, *p_scalar = NULL;
-  if (g_scalar_raw != NULL) {
-    g_scalar = BN_CTX_get(ctx);
-    if (g_scalar == NULL ||
-        !bn_set_words(g_scalar, g_scalar_raw->words, group->order.top)) {
-      goto err;
-    }
-  }
-  if (p_scalar_raw != NULL) {
-    p_scalar = BN_CTX_get(ctx);
-    if (p_scalar == NULL ||
-        !bn_set_words(p_scalar, p_scalar_raw->words, group->order.top)) {
-      goto err;
-    }
-  }
+  size_t bits = BN_num_bits(&group->order);
+  size_t wsize = window_bits_for_scalar_size(bits);
+  size_t wNAF_len = bits + 1;
+  size_t precomp_len = (size_t)1 << (wsize - 1);

-  // TODO: This function used to take |points| and |scalars| as arrays of
-  // |num| elements. The code below should be simplified to work in terms of |p|
-  // and |p_scalar|.
-  size_t num = p != NULL ? 1 : 0;
-  const EC_POINT **points = p != NULL ? &p : NULL;
-  BIGNUM **scalars = p != NULL ? &p_scalar : NULL;
+  OPENSSL_COMPILE_ASSERT(
+      OPENSSL_ARRAY_SIZE(g_wNAF) == OPENSSL_ARRAY_SIZE(p_wNAF),
+      g_wNAF_and_p_wNAF_are_different_sizes);

-  total_num = num;
-
-  if (g_scalar != NULL) {
-    generator = EC_GROUP_get0_generator(group);
-    if (generator == NULL) {
-      OPENSSL_PUT_ERROR(EC, EC_R_UNDEFINED_GENERATOR);
-      goto err;
-    }
-
-    ++total_num;  // treat 'g_scalar' like 'num'-th element of 'scalars'
-  }
-
-
-  wsize = OPENSSL_malloc(total_num * sizeof(wsize[0]));
-  wNAF_len = OPENSSL_malloc(total_num * sizeof(wNAF_len[0]));
-  wNAF = OPENSSL_malloc(total_num * sizeof(wNAF[0]));
-  val_sub = OPENSSL_malloc(total_num * sizeof(val_sub[0]));
-
-  // Ensure wNAF is initialised in case we end up going to err.
-  if (wNAF != NULL) {
-    OPENSSL_memset(wNAF, 0, total_num * sizeof(wNAF[0]));
-  }
-
-  if (!wsize || !wNAF_len || !wNAF || !val_sub) {
-    OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
-    goto err;
-  }
-
-  // num_val will be the total number of temporarily precomputed points
-  num_val = 0;
-
-  for (i = 0; i < total_num; i++) {
-    size_t bits;
-
-    bits = i < num ? BN_num_bits(scalars[i]) : BN_num_bits(g_scalar);
-    wsize[i] = window_bits_for_scalar_size(bits);
-    num_val += (size_t)1 << (wsize[i] - 1);
-    wNAF[i] =
-        compute_wNAF((i < num ? scalars[i] : g_scalar), wsize[i], &wNAF_len[i]);
-    if (wNAF[i] == NULL) {
-      goto err;
-    }
-    if (wNAF_len[i] > max_len) {
-      max_len = wNAF_len[i];
-    }
-  }
-
-  // All points we precompute now go into a single array 'val'. 'val_sub[i]' is
-  // a pointer to the subarray for the i-th point.
-  val = OPENSSL_malloc(num_val * sizeof(val[0]));
-  if (val == NULL) {
-    OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
-    goto err;
-  }
-  OPENSSL_memset(val, 0, num_val * sizeof(val[0]));
-
-  // allocate points for precomputation
-  v = val;
-  for (i = 0; i < total_num; i++) {
-    val_sub[i] = v;
-    for (j = 0; j < ((size_t)1 << (wsize[i] - 1)); j++) {
-      *v = EC_POINT_new(group);
-      if (*v == NULL) {
-        goto err;
-      }
-      v++;
-    }
-  }
-  if (!(v == val + num_val)) {
+  if (wNAF_len > OPENSSL_ARRAY_SIZE(g_wNAF) ||
+      2 * precomp_len > OPENSSL_ARRAY_SIZE(precomp_storage)) {
    OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
    goto err;
  }

-  if (!(tmp = EC_POINT_new(group))) {
+  // TODO(davidben): |mul_public| is for ECDSA verification which can assume
+  // non-NULL inputs, but this code is also used for |mul| which cannot. It's
+  // not constant-time, so replace the generic |mul| and remove the NULL checks.
+  size_t total_precomp = 0;
+  if (g_scalar != NULL) {
+    const EC_POINT *g = EC_GROUP_get0_generator(group);
+    if (g == NULL) {
+      OPENSSL_PUT_ERROR(EC, EC_R_UNDEFINED_GENERATOR);
+      goto err;
+    }
+    g_precomp = precomp_storage + total_precomp;
+    total_precomp += precomp_len;
+    if (!ec_compute_wNAF(group, g_wNAF, g_scalar, bits, wsize) ||
+        !compute_precomp(group, g_precomp, g, precomp_len, ctx)) {
+      goto err;
+    }
+  }
+
+  if (p_scalar != NULL) {
+    p_precomp = precomp_storage + total_precomp;
+    total_precomp += precomp_len;
+    if (!ec_compute_wNAF(group, p_wNAF, p_scalar, bits, wsize) ||
+        !compute_precomp(group, p_precomp, p, precomp_len, ctx)) {
+      goto err;
+    }
+  }
+
+  tmp = EC_POINT_new(group);
+  if (tmp == NULL ||
+      // |window_bits_for_scalar_size| assumes we do this step.
+      !EC_POINTs_make_affine(group, total_precomp, precomp_storage, ctx)) {
    goto err;
  }

-  // prepare precomputed values:
-  //    val_sub[i][0] :=     points[i]
-  //    val_sub[i][1] := 3 * points[i]
-  //    val_sub[i][2] := 5 * points[i]
-  //    ...
-  for (i = 0; i < total_num; i++) {
-    if (i < num) {
-      if (!EC_POINT_copy(val_sub[i][0], points[i])) {
-        goto err;
-      }
-    } else if (!EC_POINT_copy(val_sub[i][0], generator)) {
+  int r_is_at_infinity = 1;
+  for (size_t k = wNAF_len - 1; k < wNAF_len; k--) {
+    if (!r_is_at_infinity && !EC_POINT_dbl(group, r, r, ctx)) {
      goto err;
    }

-    if (wsize[i] > 1) {
-      if (!EC_POINT_dbl(group, tmp, val_sub[i][0], ctx)) {
-        goto err;
+    if (g_scalar != NULL) {
+      if (g_wNAF[k] != 0) {
+        if (!lookup_precomp(group, tmp, g_precomp, g_wNAF[k], ctx)) {
+          goto err;
+        }
+        if (r_is_at_infinity) {
+          if (!EC_POINT_copy(r, tmp)) {
+            goto err;
+          }
+          r_is_at_infinity = 0;
+        } else if (!EC_POINT_add(group, r, r, tmp, ctx)) {
+          goto err;
+        }
      }
-      for (j = 1; j < ((size_t)1 << (wsize[i] - 1)); j++) {
-        if (!EC_POINT_add(group, val_sub[i][j], val_sub[i][j - 1], tmp, ctx)) {
+    }
+
+    if (p_scalar != NULL) {
+      if (p_wNAF[k] != 0) {
+        if (!lookup_precomp(group, tmp, p_precomp, p_wNAF[k], ctx)) {
+          goto err;
+        }
+        if (r_is_at_infinity) {
+          if (!EC_POINT_copy(r, tmp)) {
+            goto err;
+          }
+          r_is_at_infinity = 0;
+        } else if (!EC_POINT_add(group, r, r, tmp, ctx)) {
          goto err;
        }
      }
    }
  }

-#if 1  // optional; window_bits_for_scalar_size assumes we do this step
-  if (!EC_POINTs_make_affine(group, num_val, val, ctx)) {
-    goto err;
-  }
-#endif
-
-  r_is_at_infinity = 1;
-
-  for (k = max_len - 1; k >= 0; k--) {
-    if (!r_is_at_infinity && !EC_POINT_dbl(group, r, r, ctx)) {
-      goto err;
-    }
-
-    for (i = 0; i < total_num; i++) {
-      if (wNAF_len[i] > (size_t)k) {
-        int digit = wNAF[i][k];
-        int is_neg;
-
-        if (digit) {
-          is_neg = digit < 0;
-
-          if (is_neg) {
-            digit = -digit;
-          }
-
-          if (is_neg != r_is_inverted) {
-            if (!r_is_at_infinity && !EC_POINT_invert(group, r, ctx)) {
-              goto err;
-            }
-            r_is_inverted = !r_is_inverted;
-          }
-
-          // digit > 0
-
-          if (r_is_at_infinity) {
-            if (!EC_POINT_copy(r, val_sub[i][digit >> 1])) {
-              goto err;
-            }
-            r_is_at_infinity = 0;
-          } else {
-            if (!EC_POINT_add(group, r, r, val_sub[i][digit >> 1], ctx)) {
-              goto err;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  if (r_is_at_infinity) {
-    if (!EC_POINT_set_to_infinity(group, r)) {
-      goto err;
-    }
-  } else if (r_is_inverted && !EC_POINT_invert(group, r, ctx)) {
+  if (r_is_at_infinity &&
+      !EC_POINT_set_to_infinity(group, r)) {
    goto err;
  }

  ret = 1;

 err:
-  if (ctx != NULL) {
-    BN_CTX_end(ctx);
-  }
  BN_CTX_free(new_ctx);
  EC_POINT_free(tmp);
-  OPENSSL_free(wsize);
-  OPENSSL_free(wNAF_len);
-  if (wNAF != NULL) {
-    for (i = 0; i < total_num; i++) {
-      OPENSSL_free(wNAF[i]);
-    }
-
-    OPENSSL_free(wNAF);
+  OPENSSL_cleanse(&g_wNAF, sizeof(g_wNAF));
+  OPENSSL_cleanse(&p_wNAF, sizeof(p_wNAF));
+  for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(precomp_storage); i++) {
+    EC_POINT_free(precomp_storage[i]);
  }
-  if (val != NULL) {
-    for (i = 0; i < num_val; i++) {
-      EC_POINT_free(val[i]);
-    }
-
-    OPENSSL_free(val);
-  }
-  OPENSSL_free(val_sub);
  return ret;
 }
@@ -66,10 +66,52 @@
 #include "../../internal.h"


+// EC_LOOSE_SCALAR is like |EC_SCALAR| but is bounded by 2^|BN_num_bits(order)|
+// rather than |order|.
+typedef union {
+  // bytes is the representation of the scalar in little-endian order.
+  uint8_t bytes[EC_MAX_SCALAR_BYTES];
+  BN_ULONG words[EC_MAX_SCALAR_WORDS];
+} EC_LOOSE_SCALAR;
+
+static void scalar_add_loose(const EC_GROUP *group, EC_LOOSE_SCALAR *r,
+                             const EC_LOOSE_SCALAR *a, const EC_SCALAR *b) {
+  // Add |a| and |b|, then subtract one copy of |order| if necessary. We have:
+  //   |a| + |b| < 2^BN_num_bits(order) + order
+  // so this leaves |r| < 2^BN_num_bits(order).
+  const BIGNUM *order = &group->order;
+  BN_ULONG carry = bn_add_words(r->words, a->words, b->words, order->width);
+  EC_LOOSE_SCALAR tmp;
+  BN_ULONG v =
+      bn_sub_words(tmp.words, r->words, order->d, order->width) - carry;
+  bn_select_words(r->words, 0u - v, r->words /* tmp < 0 */,
+                  tmp.words /* tmp >= 0 */, order->width);
+}
+
+static int scalar_mod_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a, const EC_SCALAR *b) {
+  const BIGNUM *order = &group->order;
+  return bn_mod_mul_montgomery_small(r->words, order->width, a->words,
+                                     order->width, b->words, order->width,
+                                     group->order_mont);
+}
+
+static int scalar_mod_mul_montgomery_loose(const EC_GROUP *group, EC_SCALAR *r,
+                                           const EC_LOOSE_SCALAR *a,
+                                           const EC_SCALAR *b) {
+  // Although |a| is loose, |bn_mod_mul_montgomery_small| only requires the
+  // product not exceed R * |order|. |b| is fully reduced and |a| <
+  // 2^BN_num_bits(order) <= R, so this holds.
+  const BIGNUM *order = &group->order;
+  return bn_mod_mul_montgomery_small(r->words, order->width, a->words,
+                                     order->width, b->words, order->width,
+                                     group->order_mont);
+}
+
 // digest_to_scalar interprets |digest_len| bytes from |digest| as a scalar for
 // ECDSA. Note this value is not fully reduced modulo the order, only the
 // correct number of bits.
-static void digest_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
+static void digest_to_scalar(const EC_GROUP *group, EC_LOOSE_SCALAR *out,
                             const uint8_t *digest, size_t digest_len) {
  const BIGNUM *order = &group->order;
  size_t num_bits = BN_num_bits(order);
@@ -85,11 +127,11 @@ static void digest_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
  // If still too long truncate remaining bits with a shift
  if (8 * digest_len > num_bits) {
    size_t shift = 8 - (num_bits & 0x7);
-    for (int i = 0; i < order->top - 1; i++) {
+    for (int i = 0; i < order->width - 1; i++) {
      out->words[i] =
          (out->words[i] >> shift) | (out->words[i + 1] << (BN_BITS2 - shift));
    }
-    out->words[order->top - 1] >>= shift;
+    out->words[order->width - 1] >>= shift;
  }
 }

@@ -195,15 +237,12 @@ int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
    goto err;
  }

-  EC_SCALAR r, s, m, u1, u2, s_inv_mont;
+  EC_SCALAR r, s, u1, u2, s_inv_mont;
+  EC_LOOSE_SCALAR m;
  const BIGNUM *order = EC_GROUP_get0_order(group);
  if (BN_is_zero(sig->r) ||
-      BN_is_negative(sig->r) ||
-      BN_ucmp(sig->r, order) >= 0 ||
      !ec_bignum_to_scalar(group, &r, sig->r) ||
      BN_is_zero(sig->s) ||
-      BN_is_negative(sig->s) ||
-      BN_ucmp(sig->s, order) >= 0 ||
      !ec_bignum_to_scalar(group, &s, sig->s)) {
    OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
    goto err;
@@ -212,26 +251,21 @@ int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
  // the products below.
  int no_inverse;
  if (!BN_mod_inverse_odd(X, &no_inverse, sig->s, order, ctx) ||
-      !ec_bignum_to_scalar(group, &s_inv_mont, X) ||
-      !bn_to_montgomery_small(s_inv_mont.words, order->top, s_inv_mont.words,
-                              order->top, group->order_mont)) {
+      // TODO(davidben): Add a words version of |BN_mod_inverse_odd| and write
+      // into |s_inv_mont| directly.
+      !ec_bignum_to_scalar_unchecked(group, &s_inv_mont, X) ||
+      !bn_to_montgomery_small(s_inv_mont.words, order->width, s_inv_mont.words,
+                              order->width, group->order_mont)) {
    goto err;
  }
-  // u1 = m * s_inv_mont mod order
-  // u2 = r * s_inv_mont mod order
+  // u1 = m * s^-1 mod order
+  // u2 = r * s^-1 mod order
  //
  // |s_inv_mont| is in Montgomery form while |m| and |r| are not, so |u1| and
-  // |u2| will be taken out of Montgomery form, as desired. Note that, although
-  // |m| is not fully reduced, |bn_mod_mul_montgomery_small| only requires the
-  // product not exceed R * |order|. |s_inv_mont| is fully reduced and |m| <
-  // 2^BN_num_bits(order) <= R, so this holds.
+  // |u2| will be taken out of Montgomery form, as desired.
  digest_to_scalar(group, &m, digest, digest_len);
-  if (!bn_mod_mul_montgomery_small(u1.words, order->top, m.words, order->top,
-                                   s_inv_mont.words, order->top,
-                                   group->order_mont) ||
-      !bn_mod_mul_montgomery_small(u2.words, order->top, r.words, order->top,
-                                   s_inv_mont.words, order->top,
-                                   group->order_mont)) {
+  if (!scalar_mod_mul_montgomery_loose(group, &u1, &m, &s_inv_mont) ||
+      !scalar_mod_mul_montgomery(group, &u2, &r, &s_inv_mont)) {
    goto err;
  }

@@ -240,7 +274,7 @@ int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
    OPENSSL_PUT_ERROR(ECDSA, ERR_R_MALLOC_FAILURE);
    goto err;
  }
-  if (!ec_point_mul_scalar(group, point, &u1, pub_key, &u2, ctx)) {
+  if (!ec_point_mul_scalar_public(group, point, &u1, pub_key, &u2, ctx)) {
    OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
    goto err;
  }
@@ -308,7 +342,7 @@ static int ecdsa_sign_setup(const EC_KEY *eckey, BN_CTX *ctx,
      SHA512_CTX sha;
      uint8_t additional_data[SHA512_DIGEST_LENGTH];
      SHA512_Init(&sha);
-      SHA512_Update(&sha, priv_key->words, order->top * sizeof(BN_ULONG));
+      SHA512_Update(&sha, priv_key->words, order->width * sizeof(BN_ULONG));
      SHA512_Update(&sha, digest, digest_len);
      SHA512_Final(additional_data, &sha);
      if (!ec_random_nonzero_scalar(group, &k, additional_data)) {
@@ -318,10 +352,10 @@ static int ecdsa_sign_setup(const EC_KEY *eckey, BN_CTX *ctx,

    // Compute k^-1. We leave it in the Montgomery domain as an optimization for
    // later operations.
-    if (!bn_to_montgomery_small(out_kinv_mont->words, order->top, k.words,
-                                order->top, group->order_mont) ||
-        !bn_mod_inverse_prime_mont_small(out_kinv_mont->words, order->top,
-                                         out_kinv_mont->words, order->top,
+    if (!bn_to_montgomery_small(out_kinv_mont->words, order->width, k.words,
+                                order->width, group->order_mont) ||
+        !bn_mod_inverse_prime_mont_small(out_kinv_mont->words, order->width,
+                                         out_kinv_mont->words, order->width,
                                         group->order_mont)) {
      goto err;
    }
@@ -368,14 +402,17 @@ ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len,
  int ok = 0;
  ECDSA_SIG *ret = ECDSA_SIG_new();
  BN_CTX *ctx = BN_CTX_new();
-  EC_SCALAR kinv_mont, priv_key, r_mont, s, tmp, m;
+  EC_SCALAR kinv_mont, priv_key, r_mont, s;
+  EC_LOOSE_SCALAR m, tmp;
  if (ret == NULL || ctx == NULL) {
    OPENSSL_PUT_ERROR(ECDSA, ERR_R_MALLOC_FAILURE);
    return NULL;
  }

  digest_to_scalar(group, &m, digest, digest_len);
-  if (!ec_bignum_to_scalar(group, &priv_key, priv_key_bn)) {
+  // TODO(davidben): Store the private key as an |EC_SCALAR| so this is obvious
+  // via the type system.
+  if (!ec_bignum_to_scalar_unchecked(group, &priv_key, priv_key_bn)) {
    goto err;
  }
  for (;;) {
@@ -385,37 +422,22 @@ ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len,
    }

    // Compute priv_key * r (mod order). Note if only one parameter is in the
-    // Montgomery domain, |bn_mod_mul_montgomery_small| will compute the answer
-    // in the normal domain.
+    // Montgomery domain, |scalar_mod_mul_montgomery| will compute the answer in
+    // the normal domain.
    if (!ec_bignum_to_scalar(group, &r_mont, ret->r) ||
-        !bn_to_montgomery_small(r_mont.words, order->top, r_mont.words,
-                                order->top, group->order_mont) ||
-        !bn_mod_mul_montgomery_small(s.words, order->top, priv_key.words,
-                                     order->top, r_mont.words, order->top,
-                                     group->order_mont)) {
+        !bn_to_montgomery_small(r_mont.words, order->width, r_mont.words,
+                                order->width, group->order_mont) ||
+        !scalar_mod_mul_montgomery(group, &s, &priv_key, &r_mont)) {
      goto err;
    }

-    // Compute s += m in constant time. Reduce one copy of |order| if necessary.
-    // Note this does not leave |s| fully reduced. We have
-    // |m| < 2^BN_num_bits(order), so subtracting |order| leaves
-    // 0 <= |s| < 2^BN_num_bits(order).
-    BN_ULONG carry = bn_add_words(s.words, s.words, m.words, order->top);
-    BN_ULONG v = bn_sub_words(tmp.words, s.words, order->d, order->top) - carry;
-    v = 0u - v;
-    for (int i = 0; i < order->top; i++) {
-      s.words[i] = constant_time_select_w(v, s.words[i], tmp.words[i]);
-    }
+    // Compute tmp = m + priv_key * r.
+    scalar_add_loose(group, &tmp, &m, &s);

    // Finally, multiply s by k^-1. That was retained in Montgomery form, so the
-    // same technique as the previous multiplication works. Although the
-    // previous step did not fully reduce |s|, |bn_mod_mul_montgomery_small|
-    // only requires the product not exceed R * |order|. |kinv_mont| is fully
-    // reduced and |s| < 2^BN_num_bits(order) <= R, so this holds.
-    if (!bn_mod_mul_montgomery_small(s.words, order->top, s.words, order->top,
-                                     kinv_mont.words, order->top,
-                                     group->order_mont) ||
-        !bn_set_words(ret->s, s.words, order->top)) {
+    // same technique as the previous multiplication works.
+    if (!scalar_mod_mul_montgomery_loose(group, &s, &tmp, &kinv_mont) ||
+        !bn_set_words(ret->s, s.words, order->width)) {
      goto err;
    }
    if (!BN_is_zero(ret->s)) {
@@ -142,6 +142,11 @@ ___
 $code=<<___;
 #include <openssl/arm_arch.h>

+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
+@ instructions are in aesv8-armx.pl.)
+.arch  armv7-a
+
 .text
 #if defined(__thumb2__) || defined(__clang__)
 .syntax	unified
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -96,14 +103,13 @@
 #
 # Does it make sense to increase Naggr? To start with it's virtually
 # impossible in 32-bit mode, because of limited register bank
-# capacity. Otherwise improvement has to be weighed agiainst slower
+# capacity. Otherwise improvement has to be weighed against slower
 # setup, as well as code size and complexity increase. As even
 # optimistic estimate doesn't promise 30% performance improvement,
 # there are currently no plans to increase Naggr.
 #
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.

 # January 2010
 #
@@ -66,11 +66,7 @@ $code=<<___;

 .text
 ___
-$code.=<<___ if ($flavour =~ /64/);
-#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
-.arch  armv8-a+crypto
-#endif
-___
+$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
 $code.=<<___				if ($flavour !~ /64/);
 .fpu	neon
 .code	32
@@ -0,0 +1,258 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include <openssl/cpu.h>
+#include <openssl/mem.h>
+
+#include "../../internal.h"
+#include "internal.h"
+
+
+struct ccm128_state {
+  union {
+    uint64_t u[2];
+    uint8_t c[16];
+  } nonce, cmac;
+};
+
+int CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, const void *key, block128_f block,
+                       ctr128_f ctr, unsigned M, unsigned L) {
+  if (M < 4 || M > 16 || (M & 1) != 0 || L < 2 || L > 8) {
+    return 0;
+  }
+  ctx->block = block;
+  ctx->ctr = ctr;
+  ctx->M = M;
+  ctx->L = L;
+  return 1;
+}
+
+size_t CRYPTO_ccm128_max_input(const CCM128_CONTEXT *ctx) {
+  return ctx->L >= sizeof(size_t) ? (size_t)-1
+                                  : (((size_t)1) << (ctx->L * 8)) - 1;
+}
+
+static int ccm128_init_state(const CCM128_CONTEXT *ctx,
+                             struct ccm128_state *state, const void *key,
+                             const uint8_t *nonce, size_t nonce_len,
+                             const uint8_t *aad, size_t aad_len,
+                             size_t plaintext_len) {
+  const block128_f block = ctx->block;
+  const unsigned M = ctx->M;
+  const unsigned L = ctx->L;
+
+  // |L| determines the expected |nonce_len| and the limit for |plaintext_len|.
+  if (plaintext_len > CRYPTO_ccm128_max_input(ctx) ||
+      nonce_len != 15 - L) {
+    return 0;
+  }
+
+  // Assemble the first block for computing the MAC.
+  OPENSSL_memset(state, 0, sizeof(*state));
+  state->nonce.c[0] = (uint8_t)((L - 1) | ((M - 2) / 2) << 3);
+  if (aad_len != 0) {
+    state->nonce.c[0] |= 0x40;  // Set AAD Flag
+  }
+  OPENSSL_memcpy(&state->nonce.c[1], nonce, nonce_len);
+  for (unsigned i = 0; i < L; i++) {
+    state->nonce.c[15 - i] = (uint8_t)(plaintext_len >> (8 * i));
+  }
+
+  (*block)(state->nonce.c, state->cmac.c, key);
+  size_t blocks = 1;
+
+  if (aad_len != 0) {
+    unsigned i;
+    // Cast to u64 to avoid the compiler complaining about invalid shifts.
+    uint64_t aad_len_u64 = aad_len;
+    if (aad_len_u64 < 0x10000 - 0x100) {
+      state->cmac.c[0] ^= (uint8_t)(aad_len_u64 >> 8);
+      state->cmac.c[1] ^= (uint8_t)aad_len_u64;
+      i = 2;
+    } else if (aad_len_u64 <= 0xffffffff) {
+      state->cmac.c[0] ^= 0xff;
+      state->cmac.c[1] ^= 0xfe;
+      state->cmac.c[2] ^= (uint8_t)(aad_len_u64 >> 24);
+      state->cmac.c[3] ^= (uint8_t)(aad_len_u64 >> 16);
+      state->cmac.c[4] ^= (uint8_t)(aad_len_u64 >> 8);
+      state->cmac.c[5] ^= (uint8_t)aad_len_u64;
+      i = 6;
+    } else {
+      state->cmac.c[0] ^= 0xff;
+      state->cmac.c[1] ^= 0xff;
+      state->cmac.c[2] ^= (uint8_t)(aad_len_u64 >> 56);
+      state->cmac.c[3] ^= (uint8_t)(aad_len_u64 >> 48);
+      state->cmac.c[4] ^= (uint8_t)(aad_len_u64 >> 40);
+      state->cmac.c[5] ^= (uint8_t)(aad_len_u64 >> 32);
+      state->cmac.c[6] ^= (uint8_t)(aad_len_u64 >> 24);
+      state->cmac.c[7] ^= (uint8_t)(aad_len_u64 >> 16);
+      state->cmac.c[8] ^= (uint8_t)(aad_len_u64 >> 8);
+      state->cmac.c[9] ^= (uint8_t)aad_len_u64;
+      i = 10;
+    }
+
+    do {
+      for (; i < 16 && aad_len != 0; i++) {
+        state->cmac.c[i] ^= *aad;
+        aad++;
+        aad_len--;
+      }
+      (*block)(state->cmac.c, state->cmac.c, key);
+      blocks++;
+      i = 0;
+    } while (aad_len != 0);
+  }
+
+  // Per RFC 3610, section 2.6, the total number of block cipher operations done
+  // must not exceed 2^61. There are two block cipher operations remaining per
+  // message block, plus one block at the end to encrypt the MAC.
+  size_t remaining_blocks = 2 * ((plaintext_len + 15) / 16) + 1;
+  if (plaintext_len + 15 < plaintext_len ||
+      remaining_blocks + blocks < blocks ||
+      // Silence Clang's unhelpful -Wtautological-constant-out-of-range-compare
+      // warning.
+      (sizeof(size_t) > 4 && remaining_blocks + blocks > UINT64_C(1) << 61)) {
+    return 0;
+  }
+
+  // Assemble the first block for encrypting and decrypting. The bottom |L|
+  // bytes are replaced with a counter and all but the encoding of |L| is
+  // cleared in the first byte.
+  state->nonce.c[0] &= 7;
+  return 1;
+}
+
+static int ccm128_encrypt(const CCM128_CONTEXT *ctx, struct ccm128_state *state,
+                          const void *key, uint8_t *out, const uint8_t *in,
+                          size_t len) {
+  // The counter for encryption begins at one.
+  for (unsigned i = 0; i < ctx->L; i++) {
+    state->nonce.c[15 - i] = 0;
+  }
+  state->nonce.c[15] = 1;
+
+  uint8_t partial_buf[16];
+  unsigned num = 0;
+  if (ctx->ctr != NULL) {
+    CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, state->nonce.c, partial_buf,
+                                &num, ctx->ctr);
+  } else {
+    CRYPTO_ctr128_encrypt(in, out, len, key, state->nonce.c, partial_buf, &num,
+                          ctx->block);
+  }
+  return 1;
+}
+
+static int ccm128_compute_mac(const CCM128_CONTEXT *ctx,
+                              struct ccm128_state *state, const void *key,
+                              uint8_t *out_tag, size_t tag_len,
+                              const uint8_t *in, size_t len) {
+  block128_f block = ctx->block;
+  if (tag_len != ctx->M) {
+    return 0;
+  }
+
+  // Incorporate |in| into the MAC.
+  union {
+    uint64_t u[2];
+    uint8_t c[16];
+  } tmp;
+  while (len >= 16) {
+    OPENSSL_memcpy(tmp.c, in, 16);
+    state->cmac.u[0] ^= tmp.u[0];
+    state->cmac.u[1] ^= tmp.u[1];
+    (*block)(state->cmac.c, state->cmac.c, key);
+    in += 16;
+    len -= 16;
+  }
+  if (len > 0) {
+    for (size_t i = 0; i < len; i++) {
+      state->cmac.c[i] ^= in[i];
+    }
+    (*block)(state->cmac.c, state->cmac.c, key);
+  }
+
+  // Encrypt the MAC with counter zero.
+  for (unsigned i = 0; i < ctx->L; i++) {
+    state->nonce.c[15 - i] = 0;
+  }
+  (*block)(state->nonce.c, tmp.c, key);
+  state->cmac.u[0] ^= tmp.u[0];
+  state->cmac.u[1] ^= tmp.u[1];
+
+  OPENSSL_memcpy(out_tag, state->cmac.c, tag_len);
+  return 1;
+}
+
+int CRYPTO_ccm128_encrypt(const CCM128_CONTEXT *ctx, const void *key,
+                          uint8_t *out, uint8_t *out_tag, size_t tag_len,
+                          const uint8_t *nonce, size_t nonce_len,
+                          const uint8_t *in, size_t len, const uint8_t *aad,
+                          size_t aad_len) {
+  struct ccm128_state state;
+  return ccm128_init_state(ctx, &state, key, nonce, nonce_len, aad, aad_len,
+                           len) &&
+         ccm128_compute_mac(ctx, &state, key, out_tag, tag_len, in, len) &&
+         ccm128_encrypt(ctx, &state, key, out, in, len);
+}
+
+int CRYPTO_ccm128_decrypt(const CCM128_CONTEXT *ctx, const void *key,
+                          uint8_t *out, uint8_t *out_tag, size_t tag_len,
+                          const uint8_t *nonce, size_t nonce_len,
+                          const uint8_t *in, size_t len, const uint8_t *aad,
+                          size_t aad_len) {
+  struct ccm128_state state;
+  return ccm128_init_state(ctx, &state, key, nonce, nonce_len, aad, aad_len,
+                           len) &&
+         ccm128_encrypt(ctx, &state, key, out, in, len) &&
+         ccm128_compute_mac(ctx, &state, key, out_tag, tag_len, out, len);
+}
@@ -66,38 +66,6 @@ extern "C" {
 #define STRICT_ALIGNMENT 0
 #endif

-#if defined(__GNUC__) && __GNUC__ >= 2
-static inline uint32_t CRYPTO_bswap4(uint32_t x) {
-  return __builtin_bswap32(x);
-}
-
-static inline uint64_t CRYPTO_bswap8(uint64_t x) {
-  return __builtin_bswap64(x);
-}
-#elif defined(_MSC_VER)
-OPENSSL_MSVC_PRAGMA(warning(push, 3))
-#include <intrin.h>
-OPENSSL_MSVC_PRAGMA(warning(pop))
-#pragma intrinsic(_byteswap_uint64, _byteswap_ulong)
-static inline uint32_t CRYPTO_bswap4(uint32_t x) {
-  return _byteswap_ulong(x);
-}
-
-static inline uint64_t CRYPTO_bswap8(uint64_t x) {
-  return _byteswap_uint64(x);
-}
-#else
-static inline uint32_t CRYPTO_bswap4(uint32_t x) {
-  x = (x >> 16) | (x << 16);
-  x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8);
-  return x;
-}
-
-static inline uint64_t CRYPTO_bswap8(uint64_t x) {
-  return CRYPTO_bswap4(x >> 32) | (((uint64_t)CRYPTO_bswap4(x)) << 32);
-}
-#endif
-
 static inline uint32_t GETU32(const void *in) {
  uint32_t v;
  OPENSSL_memcpy(&v, in, sizeof(v));
@@ -281,6 +249,42 @@ OPENSSL_EXPORT void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, uint8_t *tag,
                                      size_t len);


+// CCM.
+
+typedef struct ccm128_context {
+  block128_f block;
+  ctr128_f ctr;
+  unsigned M, L;
+} CCM128_CONTEXT;
+
+// CRYPTO_ccm128_init initialises |ctx| to use |block| (typically AES) with the
+// specified |M| and |L| parameters. It returns one on success and zero if |M|
+// or |L| is invalid.
+int CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, const void *key, block128_f block,
+                       ctr128_f ctr, unsigned M, unsigned L);
+
+// CRYPTO_ccm128_max_input returns the maximum input length accepted by |ctx|.
+size_t CRYPTO_ccm128_max_input(const CCM128_CONTEXT *ctx);
+
+// CRYPTO_ccm128_encrypt encrypts |len| bytes from |in| to |out|, writing the
+// tag to |out_tag|. |key| must be the same key that was passed to
+// |CRYPTO_ccm128_init|. It returns one on success and zero otherwise.
+int CRYPTO_ccm128_encrypt(const CCM128_CONTEXT *ctx, const void *key,
+                          uint8_t *out, uint8_t *out_tag, size_t tag_len,
+                          const uint8_t *nonce, size_t nonce_len,
+                          const uint8_t *in, size_t len, const uint8_t *aad,
+                          size_t aad_len);
+
+// CRYPTO_ccm128_decrypt decrypts |len| bytes from |in| to |out|, writing the
+// expected tag to |out_tag|. |key| must be the same key that was passed to
+// |CRYPTO_ccm128_init|. It returns one on success and zero otherwise.
+int CRYPTO_ccm128_decrypt(const CCM128_CONTEXT *ctx, const void *key,
+                          uint8_t *out, uint8_t *out_tag, size_t tag_len,
+                          const uint8_t *nonce, size_t nonce_len,
+                          const uint8_t *in, size_t len, const uint8_t *aad,
+                          size_t aad_len);
+
+
 // CBC.

 // cbc128_f is the type of a function that performs CBC-mode encryption.
@@ -74,11 +74,11 @@ static void ctr32_add(CTR_DRBG_STATE *drbg, uint32_t n) {
      CRYPTO_bswap4(CRYPTO_bswap4(drbg->counter.words[3]) + n);
 }

-static int CTR_DRBG_update(CTR_DRBG_STATE *drbg, const uint8_t *data,
+static int ctr_drbg_update(CTR_DRBG_STATE *drbg, const uint8_t *data,
                           size_t data_len) {
-  // Section 10.2.1.2. A value of |data_len| which less than
-  // |CTR_DRBG_ENTROPY_LEN| is permitted and acts the same as right-padding
-  // with zeros. This can save a copy.
+  // Per section 10.2.1.2, |data_len| must be |CTR_DRBG_ENTROPY_LEN|. Here, we
+  // allow shorter inputs and right-pad them with zeros. This is equivalent to
+  // the specified algorithm but saves a copy in |CTR_DRBG_generate|.
  if (data_len > CTR_DRBG_ENTROPY_LEN) {
    return 0;
  }
@@ -119,7 +119,7 @@ int CTR_DRBG_reseed(CTR_DRBG_STATE *drbg,
    entropy = entropy_copy;
  }

-  if (!CTR_DRBG_update(drbg, entropy, CTR_DRBG_ENTROPY_LEN)) {
+  if (!ctr_drbg_update(drbg, entropy, CTR_DRBG_ENTROPY_LEN)) {
    return 0;
  }

@@ -142,7 +142,7 @@ int CTR_DRBG_generate(CTR_DRBG_STATE *drbg, uint8_t *out, size_t out_len,
  }

  if (additional_data_len != 0 &&
-      !CTR_DRBG_update(drbg, additional_data, additional_data_len)) {
+      !ctr_drbg_update(drbg, additional_data, additional_data_len)) {
    return 0;
  }

@@ -187,7 +187,9 @@ int CTR_DRBG_generate(CTR_DRBG_STATE *drbg, uint8_t *out, size_t out_len,
    OPENSSL_memcpy(out, block, out_len);
  }

-  if (!CTR_DRBG_update(drbg, additional_data, additional_data_len)) {
+  // Right-padding |additional_data| in step 2.2 is handled implicitly by
+  // |ctr_drbg_update|, to save a copy.
+  if (!ctr_drbg_update(drbg, additional_data, additional_data_len)) {
    return 0;
  }

--- a/Show More
+++ b/Show More