Make CRYPTO_is_NEON_capable aware of the buggy CPU.

If we're to allow the buggy CPU workaround to fire when __ARM_NEON__ is set, CRYPTO_is_NEON_capable also needs to be aware of it.

Also add an API to export this value out of BoringSSL, so we can get some metrics on how prevalent this chip is.

BUG=chromium:606629
Change-Id: I97d65a47a6130689098b32ce45a8c57c468aa405
Reviewed-on: https://boringssl-review.googlesource.com/7796
Reviewed-by: Adam Langley <agl@google.com>
Don't set a default armcap state in dynamic armcap modes.
2016-04-29 14:24:33 -04:00 · 2016-04-29 14:24:26 -04:00 · 2016-03-31 22:15:51 +00:00 · 2016-03-31 22:12:46 +00:00 · 2016-03-31 22:12:09 +00:00 · 2016-03-31 20:50:33 +00:00
2027 changed files with 25135 additions and 11653 deletions
@@ -0,0 +1,7 @@
+Please do not send pull requests to the BoringSSL repository.
+
+We do, however, take contributions gladly.
+
+See https://boringssl.googlesource.com/boringssl/+/master/CONTRIBUTING.md
+
+Thanks!
@@ -31,16 +31,6 @@
  * [Go](https://golang.org/dl/) is required. If not found by CMake, the go
    executable may be configured explicitly by setting `GO_EXECUTABLE`.

-  * If you change crypto/chacha/chacha\_vec.c, you will need the
-    arm-linux-gnueabihf-gcc compiler:
-
-    ```
-    wget https://releases.linaro.org/14.11/components/toolchain/binaries/arm-linux-gnueabihf/gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf.tar.xz && \
-    echo bc4ca2ced084d2dc12424815a4442e19cb1422db87068830305d90075feb1a3b  gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf.tar.xz | sha256sum -c && \
-    tar xf gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf.tar.xz && \
-    sudo mv gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf /opt/
-    ```
-
 ## Building

 Using Ninja (note the 'N' is capitalized in the cmake invocation):
@@ -28,8 +28,8 @@ endif()

 if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  set(C_CXX_FLAGS "-Wall -Werror -Wformat=2 -Wsign-compare -Wmissing-field-initializers -ggdb -fvisibility=hidden")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_CXX_FLAGS}")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x ${C_CXX_FLAGS}")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_CXX_FLAGS} -Wmissing-prototypes")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x ${C_CXX_FLAGS} -Wmissing-declarations")
 elseif(MSVC)
  set(MSVC_DISABLED_WARNINGS_LIST
      "C4100" # 'exarg' : unreferenced formal parameter
@@ -62,7 +62,6 @@ elseif(MSVC)
              # copy constructor is inaccessible or deleted
      "C4626" # assignment operator could not be generated because a base class
              # assignment operator is inaccessible or deleted
-      "C4701" # potentially uninitialized local variable 'p' used
      "C4706" # assignment within conditional expression
      "C4710" # 'function': function not inlined
      "C4711" # function 'function' selected for inline expansion
@@ -72,10 +71,25 @@ elseif(MSVC)
      "C4996" # 'read': The POSIX name for this item is deprecated. Instead,
              # use the ISO C++ conformant name: _read.
     )
+  if(NOT(CMAKE_C_COMPILER_VERSION VERSION_LESS "19.0.23506"))
+    # MSVC 2015 Update 1.
+    set(MSVC_DISABLED_WARNINGS_LIST
+        ${MSVC_DISABLED_WARNINGS_LIST}
+        "C4464" # relative include path contains '..'
+        "C4623" # default constructor was implicitly defined as deleted
+        "C5027" # move assignment operator was implicitly defined as deleted
+       )
+    set(MSVC_LEVEL4_WARNINGS_LIST
+        # See https://connect.microsoft.com/VisualStudio/feedback/details/1217660/warning-c4265-when-using-functional-header
+        "C4265" # class has virtual functions, but destructor is not virtual
+        )
+    string(REPLACE "C" " -w4" MSVC_LEVEL4_WARNINGS_STR
+                              ${MSVC_LEVEL4_WARNINGS_LIST})
+  endif()
  string(REPLACE "C" " -wd" MSVC_DISABLED_WARNINGS_STR
                            ${MSVC_DISABLED_WARNINGS_LIST})
-  set(CMAKE_C_FLAGS   "-Wall -WX ${MSVC_DISABLED_WARNINGS_STR}")
-  set(CMAKE_CXX_FLAGS "-Wall -WX ${MSVC_DISABLED_WARNINGS_STR}")
+  set(CMAKE_C_FLAGS   "-Wall -WX ${MSVC_DISABLED_WARNINGS_STR} ${MSVC_LEVEL4_WARNINGS_STR}")
+  set(CMAKE_CXX_FLAGS "-Wall -WX ${MSVC_DISABLED_WARNINGS_STR} ${MSVC_LEVEL4_WARNINGS_STR}")
  add_definitions(-D_HAS_EXCEPTIONS=0)
  add_definitions(-DWIN32_LEAN_AND_MEAN)
  add_definitions(-DNOMINMAX)
@@ -97,8 +111,11 @@ if(FUZZ)
    message("You need to build with Clang for fuzzing to work")
  endif()

-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fsanitize-coverage=edge,indirect-calls")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fsanitize-coverage=edge,indirect-calls")
+  add_definitions(-DBORINGSSL_UNSAFE_FUZZER_MODE)
+  set(RUNNER_ARGS "-fuzzer")
+
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fsanitize-coverage=edge,indirect-calls,8bit-counters")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fsanitize-coverage=edge,indirect-calls,8bit-counters")
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address")
  link_directories(.)
 endif()
@@ -185,6 +202,7 @@ add_custom_target(
            ${CMAKE_BINARY_DIR}
    COMMAND cd ssl/test/runner
    COMMAND ${GO_EXECUTABLE} test -shim-path $<TARGET_FILE:bssl_shim>
+            ${RUNNER_ARGS}
    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
    DEPENDS all_tests bssl_shim
    ${MAYBE_USES_TERMINAL})
@@ -13,9 +13,9 @@ CC=clang CXX=clang++ cmake -GNinja -DFUZZ=1 ..
 In order for the fuzz tests to link, the linker needs to find libFuzzer. This is not commonly provided and you may need to download the [Clang source code](http://llvm.org/releases/download.html) and do the following:

 ```
-cd llvm-3.7.0.src/lib
-clang -c -g -O2 -std=c++11 Fuzzer/*.cpp -IFuzzer
-ar q libFuzzer.a *.o
+svn co http://llvm.org/svn/llvm-project/llvm/trunk/lib/Fuzzer
+clang++ -c -g -O2 -std=c++11 Fuzzer/*.cpp -IFuzzer
+ar ruv libFuzzer.a Fuzzer*.o
 ```

 Then copy `libFuzzer.a` to the top-level of your BoringSSL source directory.
@@ -23,18 +23,29 @@ Then copy `libFuzzer.a` to the top-level of your BoringSSL source directory.
 From the `build/` directory, you can then run the fuzzers. For example:

 ```
-./fuzz/cert -max_len=4000 -jobs=32 -workers=32 ../fuzz/cert_corpus/
+./fuzz/cert -max_len=3072 -jobs=32 -workers=32 ../fuzz/cert_corpus/
 ```

-The `max_len` argument is often important because, without it, libFuzzer defaults to limiting all test cases to 64 bytes, which is often insufficient for the formats that we wish to fuzz. The arguments to `jobs` and `workers` should be the number of cores that you wish to dedicate to fuzzing.
+The arguments to `jobs` and `workers` should be the number of cores that you wish to dedicate to fuzzing. By default, libFuzzer uses the largest test in the corpus (or 64 if empty) as the maximum test case length. The `max_len` argument overrides this.
+
+The recommended values of `max_len` for each test may be found in `.options` files alongside the test source. These were determined by rounding up the length of the largest case in the corpus. When writing a new fuzzer, configure `max_len` in a similar file.

 There are directories in `fuzz/` for each of the fuzzing tests which contain seed files for fuzzing. Some of the seed files were generated manually but many of them are “interesting” results generated by the fuzzing itself. (Where “interesting” means that it triggered a previously unknown path in the code.)

-Here are the recommended values of `max_len` for each test.
+## Minimising the corpuses

-| Test      | `max_len` value |
-|-----------|-----------------|
-| `privkey` | 2048            |
-| `cert`    | 3072            |
-| `server`  | 1024            |
-| `client`  | 4096            |
+When a large number of new seeds are available, it's a good idea to minimise the corpus so that different seeds that trigger the same code paths can be deduplicated.
+
+In order to minimise all the corpuses, build for fuzzing and run `./fuzz/minimise_corpuses.sh`. Note that minimisation is, oddly, often not idempotent for unknown reasons.
+
+## Fuzzer mode
+
+When `-DFUZZ=1` is passed into CMake, BoringSSL builds with `BORINGSSL_UNSAFE_FUZZER_MODE` defined. This modifies the library, particularly the TLS stack, to be more friendly to fuzzers. It will:
+
+* Replace `RAND_bytes` with a deterministic PRNG. Call `RAND_reset_for_fuzzing()` at the start of fuzzers which use `RAND_bytes` to reset the PRNG state.
+
+* Modify the TLS stack to perform all signature checks (CertificateVerify and ServerKeyExchange) and the Finished check, but always act as if the check succeeded.
+
+* Treat every cipher as the NULL cipher.
+
+This is to prevent the fuzzer from getting stuck at a cryptographic invariant in the protocol.
@@ -14,6 +14,13 @@ for the actual license texts. Actually both licenses are BSD-style Open Source
 licenses. In case of any license issues related to OpenSSL please contact
 openssl-core@openssl.org.

+The following are Google-internal bug numbers where explicit permission from
+some authors is recorded for use of their work. (This is purely for our own
+record keeping.)
+  27287199
+  27287880
+  27287883
+
  OpenSSL License
  ---------------

@@ -22,6 +22,7 @@ elseif(UNIX)
  endif()
  set(ASM_EXT S)
  enable_language(ASM)
+  set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wa,--noexecstack")
 else()
  if (CMAKE_CL_64)
    message("Using nasm")
@@ -53,39 +54,6 @@ function(perlasm dest src)
  )
 endfunction()

-if (${ARCH} STREQUAL "x86_64")
-  set(
-    CRYPTO_ARCH_SOURCES
-
-    cpu-intel.c
-  )
-endif()
-
-if (${ARCH} STREQUAL "x86")
-  set(
-    CRYPTO_ARCH_SOURCES
-
-    cpu-intel.c
-  )
-endif()
-
-if (${ARCH} STREQUAL "arm")
-  set(
-    CRYPTO_ARCH_SOURCES
-
-    cpu-arm.c
-    cpu-arm-asm.S
-  )
-endif()
-
-if (${ARCH} STREQUAL "aarch64")
-  set(
-    CRYPTO_ARCH_SOURCES
-
-    cpu-arm.c
-  )
-endif()
-
 # Level 0.1 - depends on nothing outside this set.
 add_subdirectory(stack)
 add_subdirectory(lhash)
@@ -143,6 +111,10 @@ add_subdirectory(test)
 add_library(
  crypto

+  cpu-aarch64-linux.c
+  cpu-arm.c
+  cpu-arm-linux.c
+  cpu-intel.c
  crypto.c
  directory_posix.c
  directory_win.c
@@ -156,8 +128,6 @@ add_library(
  thread_win.c
  time_support.c

-  ${CRYPTO_ARCH_SOURCES}
-
  $<TARGET_OBJECTS:stack>
  $<TARGET_OBJECTS:lhash>
  $<TARGET_OBJECTS:err>
@@ -32,7 +32,6 @@ add_library(
  f_int.c
  f_string.c
  t_bitst.c
-  t_pkey.c
  tasn_dec.c
  tasn_enc.c
  tasn_fre.c
@@ -63,6 +63,8 @@
 #include <openssl/mem.h>
 #include <openssl/time_support.h>

+#include "asn1_locl.h"
+
 int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d)
 {
    static const int min[9] = { 0, 0, 1, 1, 0, 0, 0, 0, 0 };
@@ -63,6 +63,8 @@
 #include <openssl/mem.h>
 #include <openssl/time_support.h>

+#include "asn1_locl.h"
+
 #if 0
 int i2d_ASN1_UTCTIME(ASN1_UTCTIME *a, unsigned char **pp)
 {
@@ -63,9 +63,15 @@
 #include <openssl/err.h>
 #include <openssl/mem.h>

-/* Cross-module errors from crypto/x509/i2d_pr.c */
+/* Cross-module errors from crypto/x509/i2d_pr.c. */
 OPENSSL_DECLARE_ERROR_REASON(ASN1, UNSUPPORTED_PUBLIC_KEY_TYPE);

+/* Cross-module errors from crypto/x509/algorithm.c. */
+OPENSSL_DECLARE_ERROR_REASON(ASN1, CONTEXT_NOT_INITIALISED);
+OPENSSL_DECLARE_ERROR_REASON(ASN1, DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_SIGNATURE_ALGORITHM);
+OPENSSL_DECLARE_ERROR_REASON(ASN1, WRONG_PUBLIC_KEY_TYPE);
 /*
 * Cross-module errors from crypto/x509/asn1_gen.c. TODO(davidben): Remove
 * these once asn1_gen.c is gone.
@@ -21,6 +21,11 @@
 #include "../test/scoped_types.h"


+// kTag128 is an ASN.1 structure with a universal tag with number 128.
+static const uint8_t kTag128[] = {
+    0x1f, 0x81, 0x00, 0x01, 0x00,
+};
+
 // kTag258 is an ASN.1 structure with a universal tag with number 258.
 static const uint8_t kTag258[] = {
    0x1f, 0x82, 0x02, 0x01, 0x00,
@@ -29,13 +34,38 @@ static const uint8_t kTag258[] = {
 static_assert(V_ASN1_NEG_INTEGER == 258,
              "V_ASN1_NEG_INTEGER changed. Update kTag258 to collide with it.");

-bool TestLargeTags() {
+// kTagOverflow is an ASN.1 structure with a universal tag with number 2^35-1,
+// which will not fit in an int.
+static const uint8_t kTagOverflow[] = {
+    0x1f, 0xff, 0xff, 0xff, 0xff, 0x7f, 0x01, 0x00,
+};
+
+static bool TestLargeTags() {
  const uint8_t *p = kTag258;
  ScopedASN1_TYPE obj(d2i_ASN1_TYPE(NULL, &p, sizeof(kTag258)));
  if (obj) {
    fprintf(stderr, "Parsed value with illegal tag (type = %d).\n", obj->type);
    return false;
  }
+  ERR_clear_error();
+
+  p = kTagOverflow;
+  obj.reset(d2i_ASN1_TYPE(NULL, &p, sizeof(kTagOverflow)));
+  if (obj) {
+    fprintf(stderr, "Parsed value with tag overflow (type = %d).\n", obj->type);
+    return false;
+  }
+  ERR_clear_error();
+
+  p = kTag128;
+  obj.reset(d2i_ASN1_TYPE(NULL, &p, sizeof(kTag128)));
+  if (!obj || obj->type != 128 || obj->value.asn1_string->length != 1 ||
+      obj->value.asn1_string->data[0] != 0) {
+    fprintf(stderr, "Failed to parse value with tag 128.\n");
+    ERR_print_errors_fp(stderr);
+    return false;
+  }
+
  return true;
 }

@@ -706,13 +706,12 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
                                 const unsigned char **in, long inlen,
                                 const ASN1_ITEM *it,
                                 int tag, int aclass, char opt, ASN1_TLC *ctx)
-    OPENSSL_SUPPRESS_POTENTIALLY_UNINITIALIZED_WARNINGS
 {
    int ret = 0, utype;
    long plen;
    char cst, inf, free_cont = 0;
    const unsigned char *p;
-    BUF_MEM buf;
+    BUF_MEM buf = {0, NULL, 0 };
    const unsigned char *cont = NULL;
    long len;
    if (!pval) {
@@ -786,7 +785,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
        } else {
            len = p - cont + plen;
            p += plen;
-            buf.data = NULL;
        }
    } else if (cst) {
        if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN
@@ -797,9 +795,8 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
            return 0;
        }

-        buf.length = 0;
-        buf.max = 0;
-        buf.data = NULL;
+        /* Free any returned 'buf' content */
+        free_cont = 1;
        /*
         * Should really check the internal tags are correct but some things
         * may get this wrong. The relevant specs say that constructed string
@@ -807,18 +804,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
         * So instead just check for UNIVERSAL class and ignore the tag.
         */
        if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) {
-            free_cont = 1;
            goto err;
        }
        len = buf.length;
        /* Append a final null to string */
        if (!BUF_MEM_grow_clean(&buf, len + 1)) {
            OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE);
-            return 0;
+            goto err;
        }
        buf.data[len] = 0;
        cont = (const unsigned char *)buf.data;
-        free_cont = 1;
    } else {
        cont = p;
        len = plen;
@@ -826,6 +821,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
    }

    /* We now have content length and type: translate into a structure */
+    /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */
    if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it))
        goto err;

@@ -64,7 +64,7 @@
 #include <openssl/mem.h>


-BIO *BIO_new_mem_buf(void *buf, int len) {
+BIO *BIO_new_mem_buf(const void *buf, int len) {
  BIO *ret;
  BUF_MEM *b;
  const size_t size = len < 0 ? strlen((char *)buf) : (size_t)len;
@@ -80,7 +80,8 @@ BIO *BIO_new_mem_buf(void *buf, int len) {
  }

  b = (BUF_MEM *)ret->ptr;
-  b->data = buf;
+  /* BIO_FLAGS_MEM_RDONLY ensures |b->data| is not written to. */
+  b->data = (void *)buf;
  b->length = size;
  b->max = size;

@@ -331,7 +331,7 @@ static bool TestPrintf() {

 static bool ReadASN1(bool should_succeed, const uint8_t *data, size_t data_len,
                     size_t expected_len, size_t max_len) {
-  ScopedBIO bio(BIO_new_mem_buf(const_cast<uint8_t*>(data), data_len));
+  ScopedBIO bio(BIO_new_mem_buf(data, data_len));

  uint8_t *out;
  size_t out_len;
@@ -58,7 +58,6 @@

 #include <assert.h>
 #include <errno.h>
-#include <stdio.h>
 #include <string.h>

 #if !defined(OPENSSL_WINDOWS)
@@ -542,3 +541,7 @@ int BIO_set_conn_port(BIO *bio, const char *port_str) {
 int BIO_set_nbio(BIO *bio, int on) {
  return BIO_ctrl(bio, BIO_C_SET_NBIO, on, NULL);
 }
+
+int BIO_do_connect(BIO *bio) {
+  return BIO_ctrl(bio, BIO_C_DO_STATE_MACHINE, 0, NULL);
+}
@@ -72,6 +72,8 @@
 #include <openssl/err.h>
 #include <openssl/mem.h>

+#include "internal.h"
+

 static int bio_fd_non_fatal_error(int err) {
  if (
@@ -87,47 +87,11 @@
 #define BIO_FP_WRITE 0x04
 #define BIO_FP_APPEND 0x08

-static FILE *open_file(const char *filename, const char *mode) {
-#if defined(OPENSSL_WINDOWS) && defined(CP_UTF8)
-  int sz, len_0 = (int)strlen(filename) + 1;
-  DWORD flags;
-
-  /* Basically there are three cases to cover: a) filename is pure ASCII
-   * string; b) actual UTF-8 encoded string and c) locale-ized string, i.e. one
-   * containing 8-bit characters that are meaningful in current system locale.
-   * If filename is pure ASCII or real UTF-8 encoded string,
-   * MultiByteToWideChar succeeds and _wfopen works. If filename is locale-ized
-   * string, chances are that MultiByteToWideChar fails reporting
-   * ERROR_NO_UNICODE_TRANSLATION, in which case we fall back to fopen... */
-  if ((sz = MultiByteToWideChar(CP_UTF8, (flags = MB_ERR_INVALID_CHARS),
-                                filename, len_0, NULL, 0)) > 0 ||
-      (GetLastError() == ERROR_INVALID_FLAGS &&
-       (sz = MultiByteToWideChar(CP_UTF8, (flags = 0), filename, len_0, NULL,
-                                 0)) > 0)) {
-    WCHAR wmode[8];
-    WCHAR *wfilename = _alloca(sz * sizeof(WCHAR));
-
-    if (MultiByteToWideChar(CP_UTF8, flags, filename, len_0, wfilename, sz) &&
-        MultiByteToWideChar(CP_UTF8, 0, mode, strlen(mode) + 1, wmode,
-                            sizeof(wmode) / sizeof(wmode[0])) &&
-        (file = _wfopen(wfilename, wmode)) == NULL &&
-        (errno == ENOENT ||
-         errno == EBADF)) /* UTF-8 decode succeeded, but no file, filename
-                           * could still have been locale-ized... */
-      return fopen(filename, mode);
-  } else if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
-    return fopen(filename, mode);
-  }
-#else
-  return fopen(filename, mode);
-#endif
-}
-
 BIO *BIO_new_file(const char *filename, const char *mode) {
  BIO *ret;
  FILE *file;

-  file = open_file(filename, mode);
+  file = fopen(filename, mode);
  if (file == NULL) {
    OPENSSL_PUT_SYSTEM_ERROR();

@@ -256,7 +220,7 @@ static long file_ctrl(BIO *b, int cmd, long num, void *ptr) {
        ret = 0;
        break;
      }
-      fp = open_file(ptr, p);
+      fp = fopen(ptr, p);
      if (fp == NULL) {
        OPENSSL_PUT_SYSTEM_ERROR();
        ERR_add_error_data(5, "fopen('", ptr, "','", p, "')");
@@ -67,6 +67,9 @@ typedef unsigned short u_short;
 #include <sys/types.h>
 #include <sys/socket.h>
 #else
+#pragma warning(push, 3)
+#include <winsock2.h>
+#pragma warning(pop)
 typedef int socklen_t;
 #endif

@@ -742,7 +742,7 @@ static const BIO_METHOD methods_biop = {
    bio_free,     NULL /* no bio_callback_ctrl */
 };

-const BIO_METHOD *bio_s_bio(void) { return &methods_biop; }
+static const BIO_METHOD *bio_s_bio(void) { return &methods_biop; }

 int BIO_new_bio_pair(BIO** bio1_p, size_t writebuf1,
                     BIO** bio2_p, size_t writebuf2) {
@@ -107,7 +107,7 @@ bn_mul_mont:
 #ifdef	__APPLE__
 	ldr	r0,[r0]
 #endif
-	tst	r0,#1			@ NEON available?
+	tst	r0,#ARMV7_NEON		@ NEON available?
 	ldmia	sp, {r0,r2}
 	beq	.Lialu
 	add	sp,sp,#8
@@ -427,7 +427,7 @@ $TEMP2 = $B2;
 $TEMP3 = $Y1;
 $TEMP4 = $Y2;
 $code.=<<___;
-	#we need to fix indexes 32-39 to avoid overflow
+	# we need to fix indices 32-39 to avoid overflow
 	vmovdqu		32*8(%rsp), $ACC8		# 32*8-192($tp0),
 	vmovdqu		32*9(%rsp), $ACC1		# 32*9-192($tp0)
 	vmovdqu		32*10(%rsp), $ACC2		# 32*10-192($tp0)
@@ -1576,68 +1576,128 @@ rsaz_1024_scatter5_avx2:
 .type	rsaz_1024_gather5_avx2,\@abi-omnipotent
 .align	32
 rsaz_1024_gather5_avx2:
+	vzeroupper
+	mov	%rsp,%r11
 ___
 $code.=<<___ if ($win64);
 	lea	-0x88(%rsp),%rax
-	vzeroupper
 .LSEH_begin_rsaz_1024_gather5:
 	# I can't trust assembler to use specific encoding:-(
-	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
-	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6,-0x20(%rax)
-	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7,-0x10(%rax)
-	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8,0(%rax)
-	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9,0x10(%rax)
-	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10,0x20(%rax)
-	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11,0x30(%rax)
-	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12,0x40(%rax)
-	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13,0x50(%rax)
-	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14,0x60(%rax)
-	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15,0x70(%rax)
+	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax),%rsp
+	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6,-0x20(%rax)
+	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7,-0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8,0(%rax)
+	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9,0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,0x20(%rax)
+	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11,0x30(%rax)
+	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12,0x40(%rax)
+	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13,0x50(%rax)
+	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14,0x60(%rax)
+	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15,0x70(%rax)
 ___
 $code.=<<___;
-	lea	.Lgather_table(%rip),%r11
-	mov	$power,%eax
-	and	\$3,$power
-	shr	\$2,%eax			# cache line number
-	shl	\$4,$power			# offset within cache line
+	lea	-0x100(%rsp),%rsp
+	and	\$-32, %rsp
+	lea	.Linc(%rip), %r10
+	lea	-128(%rsp),%rax			# control u-op density

-	vmovdqu		-32(%r11),%ymm7		# .Lgather_permd
-	vpbroadcastb	8(%r11,%rax), %xmm8
-	vpbroadcastb	7(%r11,%rax), %xmm9
-	vpbroadcastb	6(%r11,%rax), %xmm10
-	vpbroadcastb	5(%r11,%rax), %xmm11
-	vpbroadcastb	4(%r11,%rax), %xmm12
-	vpbroadcastb	3(%r11,%rax), %xmm13
-	vpbroadcastb	2(%r11,%rax), %xmm14
-	vpbroadcastb	1(%r11,%rax), %xmm15
+	vmovd		$power, %xmm4
+	vmovdqa		(%r10),%ymm0
+	vmovdqa		32(%r10),%ymm1
+	vmovdqa		64(%r10),%ymm5
+	vpbroadcastd	%xmm4,%ymm4

-	lea	64($inp,$power),$inp
-	mov	\$64,%r11			# size optimization
-	mov	\$9,%eax
-	jmp	.Loop_gather_1024
+	vpaddd		%ymm5, %ymm0, %ymm2
+	vpcmpeqd	%ymm4, %ymm0, %ymm0
+	vpaddd		%ymm5, %ymm1, %ymm3
+	vpcmpeqd	%ymm4, %ymm1, %ymm1
+	vmovdqa		%ymm0, 32*0+128(%rax)
+	vpaddd		%ymm5, %ymm2, %ymm0
+	vpcmpeqd	%ymm4, %ymm2, %ymm2
+	vmovdqa		%ymm1, 32*1+128(%rax)
+	vpaddd		%ymm5, %ymm3, %ymm1
+	vpcmpeqd	%ymm4, %ymm3, %ymm3
+	vmovdqa		%ymm2, 32*2+128(%rax)
+	vpaddd		%ymm5, %ymm0, %ymm2
+	vpcmpeqd	%ymm4, %ymm0, %ymm0
+	vmovdqa		%ymm3, 32*3+128(%rax)
+	vpaddd		%ymm5, %ymm1, %ymm3
+	vpcmpeqd	%ymm4, %ymm1, %ymm1
+	vmovdqa		%ymm0, 32*4+128(%rax)
+	vpaddd		%ymm5, %ymm2, %ymm8
+	vpcmpeqd	%ymm4, %ymm2, %ymm2
+	vmovdqa		%ymm1, 32*5+128(%rax)
+	vpaddd		%ymm5, %ymm3, %ymm9
+	vpcmpeqd	%ymm4, %ymm3, %ymm3
+	vmovdqa		%ymm2, 32*6+128(%rax)
+	vpaddd		%ymm5, %ymm8, %ymm10
+	vpcmpeqd	%ymm4, %ymm8, %ymm8
+	vmovdqa		%ymm3, 32*7+128(%rax)
+	vpaddd		%ymm5, %ymm9, %ymm11
+	vpcmpeqd	%ymm4, %ymm9, %ymm9
+	vpaddd		%ymm5, %ymm10, %ymm12
+	vpcmpeqd	%ymm4, %ymm10, %ymm10
+	vpaddd		%ymm5, %ymm11, %ymm13
+	vpcmpeqd	%ymm4, %ymm11, %ymm11
+	vpaddd		%ymm5, %ymm12, %ymm14
+	vpcmpeqd	%ymm4, %ymm12, %ymm12
+	vpaddd		%ymm5, %ymm13, %ymm15
+	vpcmpeqd	%ymm4, %ymm13, %ymm13
+	vpcmpeqd	%ymm4, %ymm14, %ymm14
+	vpcmpeqd	%ymm4, %ymm15, %ymm15
+
+	vmovdqa	-32(%r10),%ymm7			# .Lgather_permd
+	lea	128($inp), $inp
+	mov	\$9,$power

-.align	32
 .Loop_gather_1024:
-	vpand		-64($inp),		%xmm8,%xmm0
-	vpand		($inp),			%xmm9,%xmm1
-	vpand		64($inp),		%xmm10,%xmm2
-	vpand		($inp,%r11,2),		%xmm11,%xmm3
-	 vpor					%xmm0,%xmm1,%xmm1
-	vpand		64($inp,%r11,2),	%xmm12,%xmm4
-	 vpor					%xmm2,%xmm3,%xmm3
-	vpand		($inp,%r11,4),		%xmm13,%xmm5
-	 vpor					%xmm1,%xmm3,%xmm3
-	vpand		64($inp,%r11,4),	%xmm14,%xmm6
-	 vpor					%xmm4,%xmm5,%xmm5
-	vpand		-128($inp,%r11,8),	%xmm15,%xmm2
-	lea		($inp,%r11,8),$inp
-	 vpor					%xmm3,%xmm5,%xmm5
-	 vpor					%xmm2,%xmm6,%xmm6
-	 vpor					%xmm5,%xmm6,%xmm6
-	vpermd		%ymm6,%ymm7,%ymm6
-	vmovdqu		%ymm6,($out)
+	vmovdqa		32*0-128($inp),	%ymm0
+	vmovdqa		32*1-128($inp),	%ymm1
+	vmovdqa		32*2-128($inp),	%ymm2
+	vmovdqa		32*3-128($inp),	%ymm3
+	vpand		32*0+128(%rax),	%ymm0,	%ymm0
+	vpand		32*1+128(%rax),	%ymm1,	%ymm1
+	vpand		32*2+128(%rax),	%ymm2,	%ymm2
+	vpor		%ymm0, %ymm1, %ymm4
+	vpand		32*3+128(%rax),	%ymm3,	%ymm3
+	vmovdqa		32*4-128($inp),	%ymm0
+	vmovdqa		32*5-128($inp),	%ymm1
+	vpor		%ymm2, %ymm3, %ymm5
+	vmovdqa		32*6-128($inp),	%ymm2
+	vmovdqa		32*7-128($inp),	%ymm3
+	vpand		32*4+128(%rax),	%ymm0,	%ymm0
+	vpand		32*5+128(%rax),	%ymm1,	%ymm1
+	vpand		32*6+128(%rax),	%ymm2,	%ymm2
+	vpor		%ymm0, %ymm4, %ymm4
+	vpand		32*7+128(%rax),	%ymm3,	%ymm3
+	vpand		32*8-128($inp),	%ymm8,	%ymm0
+	vpor		%ymm1, %ymm5, %ymm5
+	vpand		32*9-128($inp),	%ymm9,	%ymm1
+	vpor		%ymm2, %ymm4, %ymm4
+	vpand		32*10-128($inp),%ymm10,	%ymm2
+	vpor		%ymm3, %ymm5, %ymm5
+	vpand		32*11-128($inp),%ymm11,	%ymm3
+	vpor		%ymm0, %ymm4, %ymm4
+	vpand		32*12-128($inp),%ymm12,	%ymm0
+	vpor		%ymm1, %ymm5, %ymm5
+	vpand		32*13-128($inp),%ymm13,	%ymm1
+	vpor		%ymm2, %ymm4, %ymm4
+	vpand		32*14-128($inp),%ymm14,	%ymm2
+	vpor		%ymm3, %ymm5, %ymm5
+	vpand		32*15-128($inp),%ymm15,	%ymm3
+	lea		32*16($inp), $inp
+	vpor		%ymm0, %ymm4, %ymm4
+	vpor		%ymm1, %ymm5, %ymm5
+	vpor		%ymm2, %ymm4, %ymm4
+	vpor		%ymm3, %ymm5, %ymm5
+
+	vpor		%ymm5, %ymm4, %ymm4
+	vextracti128	\$1, %ymm4, %xmm5	# upper half is cleared
+	vpor		%xmm4, %xmm5, %xmm5
+	vpermd		%ymm5,%ymm7,%ymm5
+	vmovdqu		%ymm5,($out)
 	lea		32($out),$out
-	dec	%eax
+	dec	$power
 	jnz	.Loop_gather_1024

 	vpxor	%ymm0,%ymm0,%ymm0
@@ -1645,20 +1705,20 @@ $code.=<<___;
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	movaps	(%rsp),%xmm6
-	movaps	0x10(%rsp),%xmm7
-	movaps	0x20(%rsp),%xmm8
-	movaps	0x30(%rsp),%xmm9
-	movaps	0x40(%rsp),%xmm10
-	movaps	0x50(%rsp),%xmm11
-	movaps	0x60(%rsp),%xmm12
-	movaps	0x70(%rsp),%xmm13
-	movaps	0x80(%rsp),%xmm14
-	movaps	0x90(%rsp),%xmm15
-	lea	0xa8(%rsp),%rsp
+	movaps	-0xa8(%r11),%xmm6
+	movaps	-0x98(%r11),%xmm7
+	movaps	-0x88(%r11),%xmm8
+	movaps	-0x78(%r11),%xmm9
+	movaps	-0x68(%r11),%xmm10
+	movaps	-0x58(%r11),%xmm11
+	movaps	-0x48(%r11),%xmm12
+	movaps	-0x38(%r11),%xmm13
+	movaps	-0x28(%r11),%xmm14
+	movaps	-0x18(%r11),%xmm15
 .LSEH_end_rsaz_1024_gather5:
 ___
 $code.=<<___;
+	lea	(%r11),%rsp
 	ret
 .size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
 ___
@@ -1692,8 +1752,10 @@ $code.=<<___;
 	.long	0,2,4,6,7,7,7,7
 .Lgather_permd:
 	.long	0,7,1,7,2,7,3,7
-.Lgather_table:
-	.byte	0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
+.Linc:
+	.long	0,0,0,0, 1,1,1,1
+	.long	2,2,2,2, 3,3,3,3
+	.long	4,4,4,4, 4,4,4,4
 .align	64
 ___

@@ -1821,18 +1883,19 @@ rsaz_se_handler:
 	.rva	rsaz_se_handler
 	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
 .LSEH_info_rsaz_1024_gather5:
-	.byte	0x01,0x33,0x16,0x00
-	.byte	0x36,0xf8,0x09,0x00	#vmovaps 0x90(rsp),xmm15
-	.byte	0x31,0xe8,0x08,0x00	#vmovaps 0x80(rsp),xmm14
-	.byte	0x2c,0xd8,0x07,0x00	#vmovaps 0x70(rsp),xmm13
-	.byte	0x27,0xc8,0x06,0x00	#vmovaps 0x60(rsp),xmm12
-	.byte	0x22,0xb8,0x05,0x00	#vmovaps 0x50(rsp),xmm11
-	.byte	0x1d,0xa8,0x04,0x00	#vmovaps 0x40(rsp),xmm10
-	.byte	0x18,0x98,0x03,0x00	#vmovaps 0x30(rsp),xmm9
-	.byte	0x13,0x88,0x02,0x00	#vmovaps 0x20(rsp),xmm8
-	.byte	0x0e,0x78,0x01,0x00	#vmovaps 0x10(rsp),xmm7
-	.byte	0x09,0x68,0x00,0x00	#vmovaps 0x00(rsp),xmm6
-	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
+	.byte	0x01,0x36,0x17,0x0b
+	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
+	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
+	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
+	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
+	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
+	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
+	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
+	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
+	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
+	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
+	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
 ___
 }

@@ -902,9 +902,76 @@ rsaz_512_mul_gather4:
 	push	%r14
 	push	%r15

-	mov	$pwr, $pwr
-	subq	\$128+24, %rsp
+	subq	\$`128+24+($win64?0xb0:0)`, %rsp
+___
+$code.=<<___	if ($win64);
+	movaps	%xmm6,0xa0(%rsp)
+	movaps	%xmm7,0xb0(%rsp)
+	movaps	%xmm8,0xc0(%rsp)
+	movaps	%xmm9,0xd0(%rsp)
+	movaps	%xmm10,0xe0(%rsp)
+	movaps	%xmm11,0xf0(%rsp)
+	movaps	%xmm12,0x100(%rsp)
+	movaps	%xmm13,0x110(%rsp)
+	movaps	%xmm14,0x120(%rsp)
+	movaps	%xmm15,0x130(%rsp)
+___
+$code.=<<___;
 .Lmul_gather4_body:
+	movd	$pwr,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
+	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
+
+	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+	paddd	%xmm`$i`,%xmm`$i+1`
+	pcmpeqd	%xmm8,%xmm`$i`
+	movdqa	%xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+	paddd	%xmm`$i`,%xmm`$i+1`
+	pcmpeqd	%xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+	pcmpeqd	%xmm8,%xmm7
+
+	movdqa	16*0($bp),%xmm8
+	movdqa	16*1($bp),%xmm9
+	movdqa	16*2($bp),%xmm10
+	movdqa	16*3($bp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	16*4($bp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	16*5($bp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	16*6($bp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	16*7($bp),%xmm15
+	leaq	128($bp), %rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	\$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
 ___
 $code.=<<___ if ($addx);
 	movl	\$0x80100,%r11d
@@ -913,45 +980,38 @@ $code.=<<___ if ($addx);
 	je	.Lmulx_gather
 ___
 $code.=<<___;
-	movl	64($bp,$pwr,4), %eax
-	movq	$out, %xmm0		# off-load arguments
-	movl	($bp,$pwr,4), %ebx
-	movq	$mod, %xmm1
-	movq	$n0, 128(%rsp)
+	movq	%xmm8,%rbx
+
+	movq	$n0, 128(%rsp)		# off-load arguments
+	movq	$out, 128+8(%rsp)
+	movq	$mod, 128+16(%rsp)

-	shlq	\$32, %rax
-	or	%rax, %rbx
 	movq	($ap), %rax
 	 movq	8($ap), %rcx
-	 leaq	128($bp,$pwr,4), %rbp
 	mulq	%rbx			# 0 iteration
 	movq	%rax, (%rsp)
 	movq	%rcx, %rax
 	movq	%rdx, %r8

 	mulq	%rbx
-	 movd	(%rbp), %xmm4
 	addq	%rax, %r8
 	movq	16($ap), %rax
 	movq	%rdx, %r9
 	adcq	\$0, %r9

 	mulq	%rbx
-	 movd	64(%rbp), %xmm5
 	addq	%rax, %r9
 	movq	24($ap), %rax
 	movq	%rdx, %r10
 	adcq	\$0, %r10

 	mulq	%rbx
-	 pslldq	\$4, %xmm5
 	addq	%rax, %r10
 	movq	32($ap), %rax
 	movq	%rdx, %r11
 	adcq	\$0, %r11

 	mulq	%rbx
-	 por	%xmm5, %xmm4
 	addq	%rax, %r11
 	movq	40($ap), %rax
 	movq	%rdx, %r12
@@ -964,14 +1024,12 @@ $code.=<<___;
 	adcq	\$0, %r13

 	mulq	%rbx
-	 leaq	128(%rbp), %rbp
 	addq	%rax, %r13
 	movq	56($ap), %rax
 	movq	%rdx, %r14
 	adcq	\$0, %r14
 	
 	mulq	%rbx
-	 movq	%xmm4, %rbx
 	addq	%rax, %r14
 	 movq	($ap), %rax
 	movq	%rdx, %r15
@@ -983,6 +1041,35 @@ $code.=<<___;

 .align	32
 .Loop_mul_gather:
+	movdqa	16*0(%rbp),%xmm8
+	movdqa	16*1(%rbp),%xmm9
+	movdqa	16*2(%rbp),%xmm10
+	movdqa	16*3(%rbp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	16*4(%rbp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	16*5(%rbp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	16*6(%rbp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	16*7(%rbp),%xmm15
+	leaq	128(%rbp), %rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	\$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,%rbx
+
 	mulq	%rbx
 	addq	%rax, %r8
 	movq	8($ap), %rax
@@ -991,7 +1078,6 @@ $code.=<<___;
 	adcq	\$0, %r8

 	mulq	%rbx
-	 movd	(%rbp), %xmm4
 	addq	%rax, %r9
 	movq	16($ap), %rax
 	adcq	\$0, %rdx
@@ -1000,7 +1086,6 @@ $code.=<<___;
 	adcq	\$0, %r9

 	mulq	%rbx
-	 movd	64(%rbp), %xmm5
 	addq	%rax, %r10
 	movq	24($ap), %rax
 	adcq	\$0, %rdx
@@ -1009,7 +1094,6 @@ $code.=<<___;
 	adcq	\$0, %r10

 	mulq	%rbx
-	 pslldq	\$4, %xmm5
 	addq	%rax, %r11
 	movq	32($ap), %rax
 	adcq	\$0, %rdx
@@ -1018,7 +1102,6 @@ $code.=<<___;
 	adcq	\$0, %r11

 	mulq	%rbx
-	 por	%xmm5, %xmm4
 	addq	%rax, %r12
 	movq	40($ap), %rax
 	adcq	\$0, %rdx
@@ -1043,7 +1126,6 @@ $code.=<<___;
 	adcq	\$0, %r14

 	mulq	%rbx
-	 movq	%xmm4, %rbx
 	addq	%rax, %r15
 	 movq	($ap), %rax
 	adcq	\$0, %rdx
@@ -1051,7 +1133,6 @@ $code.=<<___;
 	movq	%rdx, %r15	
 	adcq	\$0, %r15

-	leaq	128(%rbp), %rbp
 	leaq	8(%rdi), %rdi

 	decl	%ecx
@@ -1066,8 +1147,8 @@ $code.=<<___;
 	movq	%r14, 48(%rdi)
 	movq	%r15, 56(%rdi)

-	movq	%xmm0, $out
-	movq	%xmm1, %rbp
+	movq	128+8(%rsp), $out
+	movq	128+16(%rsp), %rbp

 	movq	(%rsp), %r8
 	movq	8(%rsp), %r9
@@ -1085,45 +1166,37 @@ $code.=<<___ if ($addx);

 .align	32
 .Lmulx_gather:
-	mov	64($bp,$pwr,4), %eax
-	movq	$out, %xmm0		# off-load arguments
-	lea	128($bp,$pwr,4), %rbp
-	mov	($bp,$pwr,4), %edx
-	movq	$mod, %xmm1
-	mov	$n0, 128(%rsp)
+	movq	%xmm8,%rdx
+
+	mov	$n0, 128(%rsp)		# off-load arguments
+	mov	$out, 128+8(%rsp)
+	mov	$mod, 128+16(%rsp)

-	shl	\$32, %rax
-	or	%rax, %rdx
 	mulx	($ap), %rbx, %r8	# 0 iteration
 	mov	%rbx, (%rsp)
 	xor	%edi, %edi		# cf=0, of=0

 	mulx	8($ap), %rax, %r9
-	 movd	(%rbp), %xmm4

 	mulx	16($ap), %rbx, %r10
-	 movd	64(%rbp), %xmm5
 	adcx	%rax, %r8

 	mulx	24($ap), %rax, %r11
-	 pslldq	\$4, %xmm5
 	adcx	%rbx, %r9

 	mulx	32($ap), %rbx, %r12
-	 por	%xmm5, %xmm4
 	adcx	%rax, %r10

 	mulx	40($ap), %rax, %r13
 	adcx	%rbx, %r11

 	mulx	48($ap), %rbx, %r14
-	 lea	128(%rbp), %rbp
 	adcx	%rax, %r12
 	
 	mulx	56($ap), %rax, %r15
-	 movq	%xmm4, %rdx
 	adcx	%rbx, %r13
 	adcx	%rax, %r14
+	.byte	0x67
 	mov	%r8, %rbx
 	adcx	%rdi, %r15		# %rdi is 0

@@ -1132,24 +1205,48 @@ $code.=<<___ if ($addx);

 .align	32
 .Loop_mulx_gather:
-	mulx	($ap), %rax, %r8
+	movdqa	16*0(%rbp),%xmm8
+	movdqa	16*1(%rbp),%xmm9
+	movdqa	16*2(%rbp),%xmm10
+	movdqa	16*3(%rbp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	16*4(%rbp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	16*5(%rbp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	16*6(%rbp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	16*7(%rbp),%xmm15
+	leaq	128(%rbp), %rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	\$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,%rdx
+
+	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
 	adcx	%rax, %rbx
 	adox	%r9, %r8

 	mulx	8($ap), %rax, %r9
-	.byte	0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00		# movd	(%rbp), %xmm4
 	adcx	%rax, %r8
 	adox	%r10, %r9

 	mulx	16($ap), %rax, %r10
-	 movd	64(%rbp), %xmm5
-	 lea	128(%rbp), %rbp
 	adcx	%rax, %r9
 	adox	%r11, %r10

 	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
-	 pslldq	\$4, %xmm5
-	 por	%xmm5, %xmm4
 	adcx	%rax, %r10
 	adox	%r12, %r11

@@ -1163,10 +1260,10 @@ $code.=<<___ if ($addx);

 	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
 	adcx	%rax, %r13
+	.byte	0x67
 	adox	%r15, %r14

 	mulx	56($ap), %rax, %r15
-	 movq	%xmm4, %rdx
 	 mov	%rbx, 64(%rsp,%rcx,8)
 	adcx	%rax, %r14
 	adox	%rdi, %r15
@@ -1185,10 +1282,10 @@ $code.=<<___ if ($addx);
 	mov	%r14, 64+48(%rsp)
 	mov	%r15, 64+56(%rsp)

-	movq	%xmm0, $out
-	movq	%xmm1, %rbp
+	mov	128(%rsp), %rdx		# pull arguments
+	mov	128+8(%rsp), $out
+	mov	128+16(%rsp), %rbp

-	mov	128(%rsp), %rdx		# pull $n0
 	mov	(%rsp), %r8
 	mov	8(%rsp), %r9
 	mov	16(%rsp), %r10
@@ -1216,6 +1313,21 @@ $code.=<<___;
 	call	__rsaz_512_subtract

 	leaq	128+24+48(%rsp), %rax
+___
+$code.=<<___	if ($win64);
+	movaps	0xa0-0xc8(%rax),%xmm6
+	movaps	0xb0-0xc8(%rax),%xmm7
+	movaps	0xc0-0xc8(%rax),%xmm8
+	movaps	0xd0-0xc8(%rax),%xmm9
+	movaps	0xe0-0xc8(%rax),%xmm10
+	movaps	0xf0-0xc8(%rax),%xmm11
+	movaps	0x100-0xc8(%rax),%xmm12
+	movaps	0x110-0xc8(%rax),%xmm13
+	movaps	0x120-0xc8(%rax),%xmm14
+	movaps	0x130-0xc8(%rax),%xmm15
+	lea	0xb0(%rax),%rax
+___
+$code.=<<___;
 	movq	-48(%rax), %r15
 	movq	-40(%rax), %r14
 	movq	-32(%rax), %r13
@@ -1245,7 +1357,7 @@ rsaz_512_mul_scatter4:
 	mov	$pwr, $pwr
 	subq	\$128+24, %rsp
 .Lmul_scatter4_body:
-	leaq	($tbl,$pwr,4), $tbl
+	leaq	($tbl,$pwr,8), $tbl
 	movq	$out, %xmm0		# off-load arguments
 	movq	$mod, %xmm1
 	movq	$tbl, %xmm2
@@ -1316,30 +1428,14 @@ $code.=<<___;

 	call	__rsaz_512_subtract

-	movl	%r8d, 64*0($inp)	# scatter
-	shrq	\$32, %r8
-	movl	%r9d, 64*2($inp)
-	shrq	\$32, %r9
-	movl	%r10d, 64*4($inp)
-	shrq	\$32, %r10
-	movl	%r11d, 64*6($inp)
-	shrq	\$32, %r11
-	movl	%r12d, 64*8($inp)
-	shrq	\$32, %r12
-	movl	%r13d, 64*10($inp)
-	shrq	\$32, %r13
-	movl	%r14d, 64*12($inp)
-	shrq	\$32, %r14
-	movl	%r15d, 64*14($inp)
-	shrq	\$32, %r15
-	movl	%r8d, 64*1($inp)
-	movl	%r9d, 64*3($inp)
-	movl	%r10d, 64*5($inp)
-	movl	%r11d, 64*7($inp)
-	movl	%r12d, 64*9($inp)
-	movl	%r13d, 64*11($inp)
-	movl	%r14d, 64*13($inp)
-	movl	%r15d, 64*15($inp)
+	movq	%r8, 128*0($inp)	# scatter
+	movq	%r9, 128*1($inp)
+	movq	%r10, 128*2($inp)
+	movq	%r11, 128*3($inp)
+	movq	%r12, 128*4($inp)
+	movq	%r13, 128*5($inp)
+	movq	%r14, 128*6($inp)
+	movq	%r15, 128*7($inp)

 	leaq	128+24+48(%rsp), %rax
 	movq	-48(%rax), %r15
@@ -1943,16 +2039,14 @@ $code.=<<___;
 .type	rsaz_512_scatter4,\@abi-omnipotent
 .align	16
 rsaz_512_scatter4:
-	leaq	($out,$power,4), $out
+	leaq	($out,$power,8), $out
 	movl	\$8, %r9d
 	jmp	.Loop_scatter
 .align	16
 .Loop_scatter:
 	movq	($inp), %rax
 	leaq	8($inp), $inp
-	movl	%eax, ($out)
-	shrq	\$32, %rax
-	movl	%eax, 64($out)
+	movq	%rax, ($out)
 	leaq	128($out), $out
 	decl	%r9d
 	jnz	.Loop_scatter
@@ -1963,22 +2057,106 @@ rsaz_512_scatter4:
 .type	rsaz_512_gather4,\@abi-omnipotent
 .align	16
 rsaz_512_gather4:
-	leaq	($inp,$power,4), $inp
+___
+$code.=<<___	if ($win64);
+.LSEH_begin_rsaz_512_gather4:
+	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
+	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
+	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
+	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
+	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
+	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
+	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
+	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
+	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
+	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
+___
+$code.=<<___;
+	movd	$power,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
+	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
+
+	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+	paddd	%xmm`$i`,%xmm`$i+1`
+	pcmpeqd	%xmm8,%xmm`$i`
+	movdqa	%xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+	paddd	%xmm`$i`,%xmm`$i+1`
+	pcmpeqd	%xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+	pcmpeqd	%xmm8,%xmm7
 	movl	\$8, %r9d
 	jmp	.Loop_gather
 .align	16
 .Loop_gather:
-	movl	($inp), %eax
-	movl	64($inp), %r8d
+	movdqa	16*0($inp),%xmm8
+	movdqa	16*1($inp),%xmm9
+	movdqa	16*2($inp),%xmm10
+	movdqa	16*3($inp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	16*4($inp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	16*5($inp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	16*6($inp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	16*7($inp),%xmm15
 	leaq	128($inp), $inp
-	shlq	\$32, %r8
-	or	%r8, %rax
-	movq	%rax, ($out)
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	\$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,($out)
 	leaq	8($out), $out
 	decl	%r9d
 	jnz	.Loop_gather
+___
+$code.=<<___	if ($win64);
+	movaps	0x00(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	add	\$0xa8,%rsp
+___
+$code.=<<___;
 	ret
+.LSEH_end_rsaz_512_gather4:
 .size	rsaz_512_gather4,.-rsaz_512_gather4
+
+.align	64
+.Linc:
+	.long	0,0, 1,1
+	.long	2,2, 2,2
 ___
 }

@@ -2026,6 +2204,18 @@ se_handler:

 	lea	128+24+48(%rax),%rax

+	lea	.Lmul_gather4_epilogue(%rip),%rbx
+	cmp	%r10,%rbx
+	jne	.Lse_not_in_mul_gather4
+
+	lea	0xb0(%rax),%rax
+
+	lea	-48-0xa8(%rax),%rsi
+	lea	512($context),%rdi
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lse_not_in_mul_gather4:
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
@@ -2077,7 +2267,7 @@ se_handler:
 	pop	%rdi
 	pop	%rsi
 	ret
-.size	sqr_handler,.-sqr_handler
+.size	se_handler,.-se_handler

 .section	.pdata
 .align	4
@@ -2101,6 +2291,10 @@ se_handler:
 	.rva	.LSEH_end_rsaz_512_mul_by_one
 	.rva	.LSEH_info_rsaz_512_mul_by_one

+	.rva	.LSEH_begin_rsaz_512_gather4
+	.rva	.LSEH_end_rsaz_512_gather4
+	.rva	.LSEH_info_rsaz_512_gather4
+
 .section	.xdata
 .align	8
 .LSEH_info_rsaz_512_sqr:
@@ -2123,6 +2317,19 @@ se_handler:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
+.LSEH_info_rsaz_512_gather4:
+	.byte	0x01,0x46,0x16,0x00
+	.byte	0x46,0xf8,0x09,0x00	# movaps %xmm15,0x90(%rsp)
+	.byte	0x3d,0xe8,0x08,0x00	# movaps %xmm14,0x80(%rsp)
+	.byte	0x34,0xd8,0x07,0x00	# movaps %xmm13,0x70(%rsp)
+	.byte	0x2e,0xc8,0x06,0x00	# movaps %xmm12,0x60(%rsp)
+	.byte	0x28,0xb8,0x05,0x00	# movaps %xmm11,0x50(%rsp)
+	.byte	0x22,0xa8,0x04,0x00	# movaps %xmm10,0x40(%rsp)
+	.byte	0x1c,0x98,0x03,0x00	# movaps %xmm9,0x30(%rsp)
+	.byte	0x16,0x88,0x02,0x00	# movaps %xmm8,0x20(%rsp)
+	.byte	0x10,0x78,0x01,0x00	# movaps %xmm7,0x10(%rsp)
+	.byte	0x0b,0x68,0x00,0x00	# movaps %xmm6,0x00(%rsp)
+	.byte	0x07,0x01,0x15,0x00	# sub    $0xa8,%rsp
 ___
 }

@@ -186,14 +186,6 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  }
 }

-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
-  BN_ULONG ret, waste;
-
-  asm("divq	%4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");
-
-  return ret;
-}
-
 BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
@@ -761,100 +761,126 @@ bn_sqr8x_mont:
 	# 4096. this is done to allow memory disambiguation logic
 	# do its job.
 	#
-	lea	-64(%rsp,$num,4),%r11
+	lea	-64(%rsp,$num,2),%r11
 	mov	($n0),$n0		# *n0
 	sub	$aptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lsqr8x_sp_alt
 	sub	%r11,%rsp		# align with $aptr
-	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 	jmp	.Lsqr8x_sp_done

 .align	32
 .Lsqr8x_sp_alt:
-	lea	4096-64(,$num,4),%r10	# 4096-frame-4*$num
-	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
+	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
 	sub	%r11,%rsp
 .Lsqr8x_sp_done:
 	and	\$-64,%rsp
-	mov	$num,%r10	
+	mov	$num,%r10
 	neg	$num

-	lea	64(%rsp,$num,2),%r11	# copy of modulus
 	mov	$n0,  32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
 .Lsqr8x_body:

-	mov	$num,$i
-	movq	%r11, %xmm2		# save pointer to modulus copy
-	shr	\$3+2,$i
-	mov	OPENSSL_ia32cap_P+8(%rip),%eax
-	jmp	.Lsqr8x_copy_n
-
-.align	32
-.Lsqr8x_copy_n:
-	movq	8*0($nptr),%xmm0
-	movq	8*1($nptr),%xmm1
-	movq	8*2($nptr),%xmm3
-	movq	8*3($nptr),%xmm4
-	lea	8*4($nptr),$nptr
-	movdqa	%xmm0,16*0(%r11)
-	movdqa	%xmm1,16*1(%r11)
-	movdqa	%xmm3,16*2(%r11)
-	movdqa	%xmm4,16*3(%r11)
-	lea	16*4(%r11),%r11
-	dec	$i
-	jnz	.Lsqr8x_copy_n
-
+	movq	$nptr, %xmm2		# save pointer to modulus
 	pxor	%xmm0,%xmm0
 	movq	$rptr,%xmm1		# save $rptr
 	movq	%r10, %xmm3		# -$num
 ___
 $code.=<<___ if ($addx);
+	mov	OPENSSL_ia32cap_P+8(%rip),%eax
 	and	\$0x80100,%eax
 	cmp	\$0x80100,%eax
 	jne	.Lsqr8x_nox

 	call	bn_sqrx8x_internal	# see x86_64-mont5 module
-
-	pxor	%xmm0,%xmm0
-	lea	48(%rsp),%rax
-	lea	64(%rsp,$num,2),%rdx
-	shr	\$3+2,$num
-	mov	40(%rsp),%rsi		# restore %rsp
-	jmp	.Lsqr8x_zero
+					# %rax	top-most carry
+					# %rbp	nptr
+					# %rcx	-8*num
+					# %r8	end of tp[2*num]
+	lea	(%r8,%rcx),%rbx
+	mov	%rcx,$num
+	mov	%rcx,%rdx
+	movq	%xmm1,$rptr
+	sar	\$3+2,%rcx		# %cf=0
+	jmp	.Lsqr8x_sub

 .align	32
 .Lsqr8x_nox:
 ___
 $code.=<<___;
 	call	bn_sqr8x_internal	# see x86_64-mont5 module
-
-	pxor	%xmm0,%xmm0
-	lea	48(%rsp),%rax
-	lea	64(%rsp,$num,2),%rdx
-	shr	\$3+2,$num
-	mov	40(%rsp),%rsi		# restore %rsp
-	jmp	.Lsqr8x_zero
+					# %rax	top-most carry
+					# %rbp	nptr
+					# %r8	-8*num
+					# %rdi	end of tp[2*num]
+	lea	(%rdi,$num),%rbx
+	mov	$num,%rcx
+	mov	$num,%rdx
+	movq	%xmm1,$rptr
+	sar	\$3+2,%rcx		# %cf=0
+	jmp	.Lsqr8x_sub

 .align	32
-.Lsqr8x_zero:
-	movdqa	%xmm0,16*0(%rax)	# wipe t
-	movdqa	%xmm0,16*1(%rax)
-	movdqa	%xmm0,16*2(%rax)
-	movdqa	%xmm0,16*3(%rax)
-	lea	16*4(%rax),%rax
-	movdqa	%xmm0,16*0(%rdx)	# wipe n
-	movdqa	%xmm0,16*1(%rdx)
-	movdqa	%xmm0,16*2(%rdx)
-	movdqa	%xmm0,16*3(%rdx)
-	lea	16*4(%rdx),%rdx
-	dec	$num
-	jnz	.Lsqr8x_zero
+.Lsqr8x_sub:
+	mov	8*0(%rbx),%r12
+	mov	8*1(%rbx),%r13
+	mov	8*2(%rbx),%r14
+	mov	8*3(%rbx),%r15
+	lea	8*4(%rbx),%rbx
+	sbb	8*0(%rbp),%r12
+	sbb	8*1(%rbp),%r13
+	sbb	8*2(%rbp),%r14
+	sbb	8*3(%rbp),%r15
+	lea	8*4(%rbp),%rbp
+	mov	%r12,8*0($rptr)
+	mov	%r13,8*1($rptr)
+	mov	%r14,8*2($rptr)
+	mov	%r15,8*3($rptr)
+	lea	8*4($rptr),$rptr
+	inc	%rcx			# preserves %cf
+	jnz	.Lsqr8x_sub
+
+	sbb	\$0,%rax		# top-most carry
+	lea	(%rbx,$num),%rbx	# rewind
+	lea	($rptr,$num),$rptr	# rewind
+
+	movq	%rax,%xmm1
+	pxor	%xmm0,%xmm0
+	pshufd	\$0,%xmm1,%xmm1
+	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lsqr8x_cond_copy
+
+.align	32
+.Lsqr8x_cond_copy:
+	movdqa	16*0(%rbx),%xmm2
+	movdqa	16*1(%rbx),%xmm3
+	lea	16*2(%rbx),%rbx
+	movdqu	16*0($rptr),%xmm4
+	movdqu	16*1($rptr),%xmm5
+	lea	16*2($rptr),$rptr
+	movdqa	%xmm0,-16*2(%rbx)	# zero tp
+	movdqa	%xmm0,-16*1(%rbx)
+	movdqa	%xmm0,-16*2(%rbx,%rdx)
+	movdqa	%xmm0,-16*1(%rbx,%rdx)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-16*2($rptr)
+	movdqu	%xmm5,-16*1($rptr)
+	add	\$32,$num
+	jnz	.Lsqr8x_cond_copy

 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
@@ -1121,64 +1147,75 @@ $code.=<<___;
 	adc	$zero,%r15		# modulo-scheduled
 	sub	0*8($tptr),$zero	# pull top-most carry
 	adc	%r15,%r14
-	mov	-8($nptr),$mi
 	sbb	%r15,%r15		# top-most carry
 	mov	%r14,-1*8($tptr)

 	cmp	16(%rsp),$bptr
 	jne	.Lmulx4x_outer

-	sub	%r14,$mi		# compare top-most words
-	sbb	$mi,$mi
-	or	$mi,%r15
-
-	neg	$num
-	xor	%rdx,%rdx
-	mov	32(%rsp),$rptr		# restore rp
 	lea	64(%rsp),$tptr
-
-	pxor	%xmm0,%xmm0
-	mov	0*8($nptr,$num),%r8
-	mov	1*8($nptr,$num),%r9
-	neg	%r8
-	jmp	.Lmulx4x_sub_entry
+	sub	$num,$nptr		# rewind $nptr
+	neg	%r15
+	mov	$num,%rdx
+	shr	\$3+2,$num		# %cf=0
+	mov	32(%rsp),$rptr		# restore rp
+	jmp	.Lmulx4x_sub

 .align	32
 .Lmulx4x_sub:
-	mov	0*8($nptr,$num),%r8
-	mov	1*8($nptr,$num),%r9
-	not	%r8
-.Lmulx4x_sub_entry:
-	mov	2*8($nptr,$num),%r10
-	not	%r9
-	and	%r15,%r8
-	mov	3*8($nptr,$num),%r11
-	not	%r10
-	and	%r15,%r9
-	not	%r11
-	and	%r15,%r10
-	and	%r15,%r11
-
-	neg	%rdx			# mov %rdx,%cf
-	adc	0*8($tptr),%r8
-	adc	1*8($tptr),%r9
-	movdqa	%xmm0,($tptr)
-	adc	2*8($tptr),%r10
-	adc	3*8($tptr),%r11
-	movdqa	%xmm0,16($tptr)
-	lea	4*8($tptr),$tptr
-	sbb	%rdx,%rdx		# mov %cf,%rdx
-
-	mov	%r8,0*8($rptr)
-	mov	%r9,1*8($rptr)
-	mov	%r10,2*8($rptr)
-	mov	%r11,3*8($rptr)
-	lea	4*8($rptr),$rptr
-
-	add	\$32,$num
+	mov	8*0($tptr),%r11
+	mov	8*1($tptr),%r12
+	mov	8*2($tptr),%r13
+	mov	8*3($tptr),%r14
+	lea	8*4($tptr),$tptr
+	sbb	8*0($nptr),%r11
+	sbb	8*1($nptr),%r12
+	sbb	8*2($nptr),%r13
+	sbb	8*3($nptr),%r14
+	lea	8*4($nptr),$nptr
+	mov	%r11,8*0($rptr)
+	mov	%r12,8*1($rptr)
+	mov	%r13,8*2($rptr)
+	mov	%r14,8*3($rptr)
+	lea	8*4($rptr),$rptr
+	dec	$num			# preserves %cf
 	jnz	.Lmulx4x_sub

+	sbb	\$0,%r15		# top-most carry
+	lea	64(%rsp),$tptr
+	sub	%rdx,$rptr		# rewind
+
+	movq	%r15,%xmm1
+	pxor	%xmm0,%xmm0
+	pshufd	\$0,%xmm1,%xmm1
 	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lmulx4x_cond_copy
+
+.align	32
+.Lmulx4x_cond_copy:
+	movdqa	16*0($tptr),%xmm2
+	movdqa	16*1($tptr),%xmm3
+	lea	16*2($tptr),$tptr
+	movdqu	16*0($rptr),%xmm4
+	movdqu	16*1($rptr),%xmm5
+	lea	16*2($rptr),$rptr
+	movdqa	%xmm0,-16*2($tptr)	# zero tp
+	movdqa	%xmm0,-16*1($tptr)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-16*2($rptr)
+	movdqu	%xmm5,-16*1($rptr)
+	sub	\$32,%rdx
+	jnz	.Lmulx4x_cond_copy
+
+	mov	%rdx,($tptr)
+
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
 	mov	-40(%rsi),%r14
@@ -266,6 +266,18 @@ int BN_set_word(BIGNUM *bn, BN_ULONG value) {
  return 1;
 }

+/* bn_set_words sets |bn| to the non-negative value held in the |num| words at
+ * |words|, least-significant word first. It returns one on success and zero if
+ * |bn_wexpand| fails to make room for |num| words. */
+int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
+  if (bn_wexpand(bn, num) == NULL) {
+    return 0;
+  }
+  /* Passing a null pointer to |memmove| is undefined even when the length is
+   * zero, and with |num| == 0 either |bn->d| or |words| may be NULL, so skip
+   * the copy entirely in that case. */
+  if (num != 0) {
+    memmove(bn->d, words, num * sizeof(BN_ULONG));
+  }
+  /* |bn_wexpand| verified that |num| isn't too large. */
+  bn->top = (int)num;
+  bn_correct_top(bn);
+  bn->neg = 0;
+  return 1;
+}
+
 int BN_is_negative(const BIGNUM *bn) {
  return bn->neg != 0;
 }
@@ -56,55 +56,126 @@

 #include <openssl/bn.h>

+#include <assert.h>
 #include <limits.h>
 #include <openssl/err.h>

 #include "internal.h"


-#define asm __asm__
+#if !defined(BN_ULLONG)
+/* bn_div_words divides a double-width |h|,|l| by |d| and returns the result,
+ * which must fit in a |BN_ULONG|. */
+static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
+  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
+  int i, count = 2;

-#if !defined(OPENSSL_NO_ASM)
-# if defined(__GNUC__) && __GNUC__>=2
-#  if defined(OPENSSL_X86)
-   /*
-    * There were two reasons for implementing this template:
-    * - GNU C generates a call to a function (__udivdi3 to be exact)
-    *   in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
-    *   understand why...);
-    * - divl doesn't only calculate quotient, but also leaves
-    *   remainder in %edx which we can definitely use here:-)
-    *
-    *					<appro@fy.chalmers.se>
-    */
-#undef div_asm
-#  define div_asm(n0,n1,d0)		\
-	({  asm volatile (			\
-		"divl	%4"			\
-		: "=a"(q), "=d"(rem)		\
-		: "a"(n1), "d"(n0), "g"(d0)	\
-		: "cc");			\
-	    q;					\
-	})
-#  define REMAINDER_IS_ALREADY_CALCULATED
-#  elif defined(OPENSSL_X86_64)
-   /*
-    * Same story here, but it's 128-bit by 64-bit division. Wow!
-    *					<appro@fy.chalmers.se>
-    */
-#  undef div_asm
-#  define div_asm(n0,n1,d0)		\
-	({  asm volatile (			\
-		"divq	%4"			\
-		: "=a"(q), "=d"(rem)		\
-		: "a"(n1), "d"(n0), "g"(d0)	\
-		: "cc");			\
-	    q;					\
-	})
-#  define REMAINDER_IS_ALREADY_CALCULATED
-#  endif /* __<cpu> */
-# endif /* __GNUC__ */
-#endif /* OPENSSL_NO_ASM */
+  if (d == 0) {
+    return BN_MASK2;
+  }
+
+  i = BN_num_bits_word(d);
+  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
+
+  i = BN_BITS2 - i;
+  if (h >= d) {
+    h -= d;
+  }
+
+  if (i) {
+    d <<= i;
+    h = (h << i) | (l >> (BN_BITS2 - i));
+    l <<= i;
+  }
+  dh = (d & BN_MASK2h) >> BN_BITS4;
+  dl = (d & BN_MASK2l);
+  for (;;) {
+    if ((h >> BN_BITS4) == dh) {
+      q = BN_MASK2l;
+    } else {
+      q = h / dh;
+    }
+
+    th = q * dh;
+    tl = dl * q;
+    for (;;) {
+      t = h - th;
+      if ((t & BN_MASK2h) ||
+          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
+        break;
+      }
+      q--;
+      th -= dh;
+      tl -= dl;
+    }
+    t = (tl >> BN_BITS4);
+    tl = (tl << BN_BITS4) & BN_MASK2h;
+    th += t;
+
+    if (l < tl) {
+      th++;
+    }
+    l -= tl;
+    if (h < th) {
+      h += d;
+      q--;
+    }
+    h -= th;
+
+    if (--count == 0) {
+      break;
+    }
+
+    ret = q << BN_BITS4;
+    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
+    l = (l & BN_MASK2l) << BN_BITS4;
+  }
+
+  ret |= q;
+  return ret;
+}
+#endif /* !defined(BN_ULLONG) */
+
+/* bn_div_rem_words divides the double-width value whose high word is |n0| and
+ * low word is |n1| by |d0|, storing the quotient in |*quotient_out| and the
+ * remainder in |*rem_out|. The quotient must fit in a |BN_ULONG|; |BN_div|
+ * only calls this when |n0| < |d0|. */
+static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out,
+                                    BN_ULONG n0, BN_ULONG n1, BN_ULONG d0) {
+  /* GCC and Clang generate function calls to |__udivdi3| and |__umoddi3| when
+   * the |BN_ULLONG|-based C code is used.
+   *
+   * GCC bugs:
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54183
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58897
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65668
+   *
+   * Clang bugs:
+   *   * https://llvm.org/bugs/show_bug.cgi?id=6397
+   *   * https://llvm.org/bugs/show_bug.cgi?id=12418
+   *
+   * These issues aren't specific to x86 and x86_64, so it might be worthwhile
+   * to add more assembly language implementations.
+   *
+   * The divisor uses the "rm" constraint rather than "g": |div| has no
+   * immediate form, so "g" would allow the compiler to choose an operand that
+   * cannot be encoded if |d0| were a compile-time constant after inlining. */
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__GNUC__)
+  __asm__ volatile (
+    "divl %4"
+    : "=a"(*quotient_out), "=d"(*rem_out)
+    : "a"(n1), "d"(n0), "rm"(d0)
+    : "cc" );
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
+  __asm__ volatile (
+    "divq %4"
+    : "=a"(*quotient_out), "=d"(*rem_out)
+    : "a"(n1), "d"(n0), "rm"(d0)
+    : "cc" );
+#else
+#if defined(BN_ULLONG)
+  BN_ULLONG n = (((BN_ULLONG)n0) << BN_BITS2) | n1;
+  *quotient_out = (BN_ULONG)(n / d0);
+#else
+  *quotient_out = bn_div_words(n0, n1, d0);
+#endif
+  /* The remainder is smaller than |d0|, so it fits in one word and equals the
+   * low word of n - quotient * d0; unsigned wrap-around discards the rest. */
+  *rem_out = n1 - (*quotient_out * d0);
+#endif
+}

 /* BN_div computes  dv := num / divisor,  rounding towards
 * zero, and sets up rm  such that  dv*divisor + rm = num  holds.
@@ -260,23 +331,10 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
      q = BN_MASK2;
    } else {
      /* n0 < d0 */
+      bn_div_rem_words(&q, &rem, n0, n1, d0);
+
 #ifdef BN_ULLONG
-      BN_ULLONG t2;
-
-#if defined(BN_ULLONG) && !defined(div_asm)
-      q = (BN_ULONG)(((((BN_ULLONG)n0) << BN_BITS2) | n1) / d0);
-#else
-      q = div_asm(n0, n1, d0);
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
-      /* rem doesn't have to be BN_ULLONG. The least we know it's less that d0,
-       * isn't it? */
-      rem = (n1 - q * d0) & BN_MASK2;
-#endif
-
-      t2 = (BN_ULLONG)d1 * q;
-
+      BN_ULLONG t2 = (BN_ULLONG)d1 * q;
      for (;;) {
        if (t2 <= ((((BN_ULLONG)rem) << BN_BITS2) | wnump[-2])) {
          break;
@@ -290,13 +348,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
      }
 #else /* !BN_ULLONG */
      BN_ULONG t2l, t2h;
-
-      q = bn_div_words(n0, n1, d0);
-
-      rem = (n1 - q * d0) & BN_MASK2;
-
      BN_UMULT_LOHI(t2l, t2h, d1, q);
-
      for (;;) {
        if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2]))) {
          break;
@@ -556,7 +608,7 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
    return 0;
  }

-  /* normalize input (so bn_div_words doesn't complain) */
+  /* normalize input for |bn_div_rem_words|. */
  j = BN_BITS2 - BN_num_bits_word(w);
  w <<= j;
  if (!BN_lshift(a, a, j)) {
@@ -564,10 +616,10 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
  }

  for (i = a->top - 1; i >= 0; i--) {
-    BN_ULONG l, d;
-
-    l = a->d[i];
-    d = bn_div_words(ret, l, w);
+    BN_ULONG l = a->d[i];
+    BN_ULONG d;
+    BN_ULONG unused_rem;
+    bn_div_rem_words(&d, &unused_rem, ret, l, w);
    ret = (l - ((d * w) & BN_MASK2)) & BN_MASK2;
    a->d[i] = d;
  }
@@ -209,6 +209,7 @@ static void BN_RECP_CTX_init(BN_RECP_CTX *recp) {
  BN_init(&recp->N);
  BN_init(&recp->Nr);
  recp->num_bits = 0;
+  recp->shift = 0;
  recp->flags = 0;
 }

@@ -787,29 +788,65 @@ err:
 * pattern as far as cache lines are concerned. The following functions are
 * used to transfer a BIGNUM from/to that table. */
 static int copy_to_prebuf(const BIGNUM *b, int top, unsigned char *buf, int idx,
-                          int width) {
-  size_t i, j;
+                          int window) {
+  int i, j;
+  const int width = 1 << window;
+  BN_ULONG *table = (BN_ULONG *) buf;

  if (top > b->top) {
    top = b->top; /* this works because 'buf' is explicitly zeroed */
  }
-  for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
-    buf[j] = ((unsigned char *)b->d)[i];
+
+  for (i = 0, j = idx; i < top; i++, j += width)  {
+    table[j] = b->d[i];
  }

  return 1;
 }

 static int copy_from_prebuf(BIGNUM *b, int top, unsigned char *buf, int idx,
-                            int width) {
-  size_t i, j;
+                            int window) {
+  int i, j;
+  const int width = 1 << window;
+  volatile BN_ULONG *table = (volatile BN_ULONG *)buf;

  if (bn_wexpand(b, top) == NULL) {
    return 0;
  }

-  for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
-    ((unsigned char *)b->d)[i] = buf[j];
+  if (window <= 3) {
+    for (i = 0; i < top; i++, table += width) {
+      BN_ULONG acc = 0;
+
+      for (j = 0; j < width; j++) {
+        acc |= table[j] & ((BN_ULONG)0 - (constant_time_eq_int(j, idx) & 1));
+      }
+
+      b->d[i] = acc;
+    }
+  } else {
+    int xstride = 1 << (window - 2);
+    BN_ULONG y0, y1, y2, y3;
+
+    i = idx >> (window - 2); /* equivalent of idx / xstride */
+    idx &= xstride - 1;      /* equivalent of idx % xstride */
+
+    y0 = (BN_ULONG)0 - (constant_time_eq_int(i, 0) & 1);
+    y1 = (BN_ULONG)0 - (constant_time_eq_int(i, 1) & 1);
+    y2 = (BN_ULONG)0 - (constant_time_eq_int(i, 2) & 1);
+    y3 = (BN_ULONG)0 - (constant_time_eq_int(i, 3) & 1);
+
+    for (i = 0; i < top; i++, table += width) {
+      BN_ULONG acc = 0;
+
+      for (j = 0; j < xstride; j++) {
+        acc |= ((table[j + 0 * xstride] & y0) | (table[j + 1 * xstride] & y1) |
+                (table[j + 2 * xstride] & y2) | (table[j + 3 * xstride] & y3)) &
+               ((BN_ULONG)0 - (constant_time_eq_int(j, idx) & 1));
+      }
+
+      b->d[i] = acc;
+    }
  }

  b->top = top;
@@ -891,8 +928,6 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    return BN_one(rr);
  }

-  BN_CTX_start(ctx);
-
  /* Allocate a montgomery context if it was not supplied by the caller. */
  if (mont == NULL) {
    new_mont = BN_MONT_CTX_new();
@@ -935,9 +970,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 #if defined(OPENSSL_BN_ASM_MONT5)
  if (window >= 5) {
    window = 5; /* ~5% improvement for RSA2048 sign, and even for RSA4096 */
-    if ((top & 7) == 0) {
-      powerbufLen += 2 * top * sizeof(m->d[0]);
-    }
+    /* reserve space for mont->N.d[] copy */
+    powerbufLen += top * sizeof(mont->N.d[0]);
  }
 #endif

@@ -1008,7 +1042,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
  /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
   * 512-bit RSA is hardly relevant, we omit it to spare size... */
  if (window == 5 && top > 1) {
-    const BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
+    const BN_ULONG *n0 = mont->n0;
+    BN_ULONG *np;

    /* BN_to_montgomery can contaminate words above .top
     * [in BN_DEBUG[_DEBUG] build]... */
@@ -1019,14 +1054,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
      tmp.d[i] = 0;
    }

-    if (top & 7) {
-      np2 = np;
-    } else {
-      BN_ULONG *np_double = am.d + top;
-      for (i = 0; i < top; i++) {
-        np_double[2 * i] = np[i];
-      }
-      np2 = np_double;
+    /* copy mont->N.d[] to improve cache locality */
+    for (np = am.d + top, i = 0; i < top; i++) {
+      np[i] = mont->N.d[i];
    }

    bn_scatter5(tmp.d, top, powerbuf, 0);
@@ -1041,7 +1071,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    }
    for (i = 3; i < 8; i += 2) {
      int j;
-      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
      bn_scatter5(tmp.d, top, powerbuf, i);
      for (j = 2 * i; j < 32; j *= 2) {
        bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -1049,13 +1079,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
      }
    }
    for (; i < 16; i += 2) {
-      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
      bn_scatter5(tmp.d, top, powerbuf, i);
      bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
      bn_scatter5(tmp.d, top, powerbuf, 2 * i);
    }
    for (; i < 32; i += 2) {
-      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
      bn_scatter5(tmp.d, top, powerbuf, i);
    }

@@ -1103,7 +1133,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        wvalue >>= (bits - 4) & 7;
        wvalue &= 0x1f;
        bits -= 5;
-        bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+        bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
      }
      while (bits >= 0) {
        /* Read five bits from |bits-4| through |bits|, inclusive. */
@@ -1112,11 +1142,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        wvalue >>= first_bit & 7;
        wvalue &= 0x1f;
        bits -= 5;
-        bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+        bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
      }
    }

-    ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
+    ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
    tmp.top = top;
    bn_correct_top(&tmp);
    if (ret) {
@@ -1128,8 +1158,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
  } else
 #endif
  {
-    if (!copy_to_prebuf(&tmp, top, powerbuf, 0, numPowers) ||
-        !copy_to_prebuf(&am, top, powerbuf, 1, numPowers)) {
+    if (!copy_to_prebuf(&tmp, top, powerbuf, 0, window) ||
+        !copy_to_prebuf(&am, top, powerbuf, 1, window)) {
      goto err;
    }

@@ -1140,13 +1170,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
     */
    if (window > 1) {
      if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx) ||
-          !copy_to_prebuf(&tmp, top, powerbuf, 2, numPowers)) {
+          !copy_to_prebuf(&tmp, top, powerbuf, 2, window)) {
        goto err;
      }
      for (i = 3; i < numPowers; i++) {
        /* Calculate a^i = a^(i-1) * a */
        if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx) ||
-            !copy_to_prebuf(&tmp, top, powerbuf, i, numPowers)) {
+            !copy_to_prebuf(&tmp, top, powerbuf, i, window)) {
          goto err;
        }
      }
@@ -1156,7 +1186,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) {
      wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
    }
-    if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, numPowers)) {
+    if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) {
      goto err;
    }

@@ -1175,7 +1205,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
      }

      /* Fetch the appropriate pre-computed value from the pre-buf */
-      if (!copy_from_prebuf(&am, top, powerbuf, wvalue, numPowers)) {
+      if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) {
        goto err;
      }

@@ -1198,7 +1228,6 @@ err:
    OPENSSL_cleanse(powerbuf, powerbufLen);
    OPENSSL_free(powerbufFree);
  }
-  BN_CTX_end(ctx);
  return (ret);
 }

@@ -202,86 +202,6 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  }
 }

-#if defined(BN_ULLONG)
-
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
-  return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
-}
-
-#else
-
-/* Divide h,l by d and return the result. */
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
-  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
-  int i, count = 2;
-
-  if (d == 0) {
-    return BN_MASK2;
-  }
-
-  i = BN_num_bits_word(d);
-  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
-
-  i = BN_BITS2 - i;
-  if (h >= d) {
-    h -= d;
-  }
-
-  if (i) {
-    d <<= i;
-    h = (h << i) | (l >> (BN_BITS2 - i));
-    l <<= i;
-  }
-  dh = (d & BN_MASK2h) >> BN_BITS4;
-  dl = (d & BN_MASK2l);
-  for (;;) {
-    if ((h >> BN_BITS4) == dh) {
-      q = BN_MASK2l;
-    } else {
-      q = h / dh;
-    }
-
-    th = q * dh;
-    tl = dl * q;
-    for (;;) {
-      t = h - th;
-      if ((t & BN_MASK2h) ||
-          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
-        break;
-      }
-      q--;
-      th -= dh;
-      tl -= dl;
-    }
-    t = (tl >> BN_BITS4);
-    tl = (tl << BN_BITS4) & BN_MASK2h;
-    th += t;
-
-    if (l < tl) {
-      th++;
-    }
-    l -= tl;
-    if (h < th) {
-      h += d;
-      q--;
-    }
-    h -= th;
-
-    if (--count == 0) {
-      break;
-    }
-
-    ret = q << BN_BITS4;
-    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
-    l = (l & BN_MASK2l) << BN_BITS4;
-  }
-
-  ret |= q;
-  return ret;
-}
-
-#endif /* !defined(BN_ULLONG) */
-
 #ifdef BN_ULLONG
 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
@@ -192,10 +192,14 @@ BIGNUM *bn_expand(BIGNUM *bn, size_t bits);
 #define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
 #endif

+
+/* bn_set_words sets |bn| to the value encoded in the |num| words in |words|,
+ * least significant word first. */
+int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num);
+
 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
 BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
 BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);

@@ -326,14 +326,12 @@ int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
  return BN_mod_mul_montgomery(ret, a, &mont->RR, mont, ctx);
 }

-#if 0
 static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,
                                   const BN_MONT_CTX *mont) {
-  const BIGNUM *n;
  BN_ULONG *ap, *np, *rp, n0, v, carry;
  int nl, max, i;

-  n = &mont->N;
+  const BIGNUM *n = &mont->N;
  nl = n->top;
  if (nl == 0) {
    ret->top = 0;
@@ -376,13 +374,13 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,

  {
    BN_ULONG *nrp;
-    size_t m;
+    uintptr_t m;

    v = bn_sub_words(rp, ap, np, nl) - carry;
    /* if subtraction result is real, then trick unconditional memcpy below to
     * perform in-place "refresh" instead of actual copy. */
-    m = (0 - (size_t)v);
-    nrp = (BN_ULONG *)(((intptr_t)rp & ~m) | ((intptr_t)ap & m));
+    m = (0u - (uintptr_t)v);
+    nrp = (BN_ULONG *)(((uintptr_t)rp & ~m) | ((uintptr_t)ap & m));

    for (i = 0, nl -= 4; i < nl; i += 4) {
      BN_ULONG t1, t2, t3, t4;
@@ -411,103 +409,25 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,

  return 1;
 }
-#endif

-#define PTR_SIZE_INT size_t
-
-static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, const BN_MONT_CTX *mont)
-	{
-	BN_ULONG *ap,*np,*rp,n0,v,carry;
-	int nl,max,i;
-
-	const BIGNUM *n = &mont->N;
-	nl=n->top;
-	if (nl == 0) { ret->top=0; return(1); }
-
-	max=(2*nl); /* carry is stored separately */
-	if (bn_wexpand(r,max) == NULL) return(0);
-
-	r->neg^=n->neg;
-	np=n->d;
-	rp=r->d;
-
-	/* clear the top words of T */
-#if 1
-	for (i=r->top; i<max; i++) /* memset? XXX */
-		rp[i]=0;
-#else
-	memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); 
-#endif
-
-	r->top=max;
-	n0=mont->n0[0];
-
-	for (carry=0, i=0; i<nl; i++, rp++)
-		{
-		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
-		v = (v+carry+rp[nl])&BN_MASK2;
-		carry |= (v != rp[nl]);
-		carry &= (v <= rp[nl]);
-		rp[nl]=v;
-		}
-
-	if (bn_wexpand(ret,nl) == NULL) return(0);
-	ret->top=nl;
-	ret->neg=r->neg;
-
-	rp=ret->d;
-	ap=&(r->d[nl]);
-
-	{
-	BN_ULONG *nrp;
-	size_t m;
-
-	v=bn_sub_words(rp,ap,np,nl)-carry;
-	/* if subtraction result is real, then
-	 * trick unconditional memcpy below to perform in-place
-	 * "refresh" instead of actual copy. */
-	m=(0-(size_t)v);
-	nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
-
-	for (i=0,nl-=4; i<nl; i+=4)
-		{
-		BN_ULONG t1,t2,t3,t4;
-		
-		t1=nrp[i+0];
-		t2=nrp[i+1];
-		t3=nrp[i+2];	ap[i+0]=0;
-		t4=nrp[i+3];	ap[i+1]=0;
-		rp[i+0]=t1;	ap[i+2]=0;
-		rp[i+1]=t2;	ap[i+3]=0;
-		rp[i+2]=t3;
-		rp[i+3]=t4;
-		}
-	for (nl+=4; i<nl; i++)
-		rp[i]=nrp[i], ap[i]=0;
-	}
-	bn_correct_top(r);
-	bn_correct_top(ret);
-
-	return(1);
-	}
-
-int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
+int BN_from_montgomery(BIGNUM *r, const BIGNUM *a, const BN_MONT_CTX *mont,
                       BN_CTX *ctx) {
-  int retn = 0;
+  int ret = 0;
  BIGNUM *t;

  BN_CTX_start(ctx);
  t = BN_CTX_get(ctx);
-  if (t == NULL) {
-    return 0;
+  if (t == NULL ||
+      !BN_copy(t, a)) {
+    goto err;
  }

-  if (BN_copy(t, a)) {
-    retn = BN_from_montgomery_word(ret, t, mont);
-  }
+  ret = BN_from_montgomery_word(r, t, mont);
+
+err:
  BN_CTX_end(ctx);

-  return retn;
+  return ret;
 }

 int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
@@ -66,7 +66,8 @@
 #define BN_SQR_RECURSIVE_SIZE_NORMAL BN_MUL_RECURSIVE_SIZE_NORMAL


-void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb) {
+static void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b,
+                          int nb) {
  BN_ULONG *rr;

  if (na < nb) {
@@ -34,6 +34,7 @@ static int is_string_type(unsigned tag) {
  switch (tag & 0x1f) {
    case CBS_ASN1_BITSTRING:
    case CBS_ASN1_OCTETSTRING:
+    case CBS_ASN1_UTF8STRING:
    case CBS_ASN1_NUMERICSTRING:
    case CBS_ASN1_PRINTABLESTRING:
    case CBS_ASN1_T16STRING:
@@ -4,7 +4,31 @@ if (${ARCH} STREQUAL "arm")
  set(
    CHACHA_ARCH_SOURCES

-    chacha_vec_arm.S
+    chacha-armv4.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "aarch64")
+  set(
+    CHACHA_ARCH_SOURCES
+
+    chacha-armv8.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "x86")
+  set(
+    CHACHA_ARCH_SOURCES
+
+    chacha-x86.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "x86_64")
+  set(
+    CHACHA_ARCH_SOURCES
+
+    chacha-x86_64.${ASM_EXT}
  )
 endif()

@@ -13,8 +37,22 @@ add_library(

  OBJECT

-  chacha_generic.c
-  chacha_vec.c
+  chacha.c

  ${CHACHA_ARCH_SOURCES}
 )
+
+add_executable(
+  chacha_test
+
+  chacha_test.cc
+  $<TARGET_OBJECTS:test_support>
+)
+
+target_link_libraries(chacha_test crypto)
+add_dependencies(all_tests chacha_test)
+
+perlasm(chacha-armv4.${ASM_EXT} asm/chacha-armv4.pl)
+perlasm(chacha-armv8.${ASM_EXT} asm/chacha-armv8.pl)
+perlasm(chacha-x86.${ASM_EXT} asm/chacha-x86.pl)
+perlasm(chacha-x86_64.${ASM_EXT} asm/chacha-x86_64.pl)
@@ -0,0 +1,769 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# January 2015
+#
+# ChaCha20 for x86.
+#
+# Performance in cycles per byte out of large buffer.
+#
+#		1xIALU/gcc	4xSSSE3
+# Pentium	17.5/+80%
+# PIII		14.2/+60%
+# P4		18.6/+84%
+# Core2		9.56/+89%	4.83
+# Westmere	9.50/+45%	3.35
+# Sandy Bridge	10.7/+47%	3.24
+# Haswell	8.22/+50%	2.89
+# Silvermont	17.8/+36%	8.53
+# Sledgehammer	10.2/+54%
+# Bulldozer	13.5/+50%	4.39(*)
+#
+# (*)  Bulldozer actually executes 4xXOP code path that delivers 3.50;
+#
+# Modified from upstream OpenSSL to remove the XOP code.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386");
+
+$xmm=$ymm=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+$ymm=$xmm;
+
+$a="eax";
+($b,$b_)=("ebx","ebp");
+($c,$c_)=("ecx","esi");
+($d,$d_)=("edx","edi");
+
+sub QUARTERROUND {
+my ($ai,$bi,$ci,$di,$i)=@_;
+my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
+my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
+
+	#       a   b   c   d
+	#
+	#       0   4   8  12 < even round
+	#       1   5   9  13
+	#       2   6  10  14
+	#       3   7  11  15
+	#       0   5  10  15 < odd round
+	#       1   6  11  12
+	#       2   7   8  13
+	#       3   4   9  14
+
+	if ($i==0) {
+            my $j=4;
+	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
+	} elsif ($i==3) {
+            my $j=0;
+	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
+	} elsif ($i==4) {
+            my $j=4;
+	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
+	} elsif ($i==7) {
+            my $j=0;
+	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
+	}
+
+	#&add	($a,$b);			# see elsewhere
+	&xor	($d,$a);
+	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
+	&rol	($d,16);
+	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
+	&add	($c,$d);
+	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
+	&xor	($b,$c);
+	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
+	&rol	($b,12);
+	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
+	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
+	&add	($a,$b);
+	&xor	($d,$a);
+	&mov	(&DWP(4*$ai,"esp"),$a);
+	&rol	($d,8);
+	&mov	($a,&DWP(4*$an,"esp"));
+	&add	($c,$d);
+	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
+	&mov	($d_,$d)			if ($di==$dn);
+	&xor	($b,$c);
+	 &add	($a,$b_)			if ($i<7);	# elsewhere
+	&rol	($b,7);
+
+	($b,$b_)=($b_,$b);
+	($c,$c_)=($c_,$c);
+	($d,$d_)=($d_,$d);
+}
+
+&static_label("ssse3_shortcut");
+&static_label("ssse3_data");
+&static_label("pic_point");
+
+&function_begin("ChaCha20_ctr32");
+	&xor	("eax","eax");
+	&cmp	("eax",&wparam(2));		# len==0?
+	&je	(&label("no_data"));
+if ($xmm) {
+	&call	(&label("pic_point"));
+&set_label("pic_point");
+	&blindpop("eax");
+	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
+	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
+	&jz	(&label("x86"));
+	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
+	&jz	(&label("x86"));
+	&jmp	(&label("ssse3_shortcut"));
+&set_label("x86");
+}
+	&mov	("esi",&wparam(3));		# key
+	&mov	("edi",&wparam(4));		# counter and nonce
+
+	&stack_push(33);
+
+	&mov	("eax",&DWP(4*0,"esi"));	# copy key
+	&mov	("ebx",&DWP(4*1,"esi"));
+	&mov	("ecx",&DWP(4*2,"esi"));
+	&mov	("edx",&DWP(4*3,"esi"));
+	&mov	(&DWP(64+4*4,"esp"),"eax");
+	&mov	(&DWP(64+4*5,"esp"),"ebx");
+	&mov	(&DWP(64+4*6,"esp"),"ecx");
+	&mov	(&DWP(64+4*7,"esp"),"edx");
+	&mov	("eax",&DWP(4*4,"esi"));
+	&mov	("ebx",&DWP(4*5,"esi"));
+	&mov	("ecx",&DWP(4*6,"esi"));
+	&mov	("edx",&DWP(4*7,"esi"));
+	&mov	(&DWP(64+4*8,"esp"),"eax");
+	&mov	(&DWP(64+4*9,"esp"),"ebx");
+	&mov	(&DWP(64+4*10,"esp"),"ecx");
+	&mov	(&DWP(64+4*11,"esp"),"edx");
+	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
+	&mov	("ebx",&DWP(4*1,"edi"));
+	&mov	("ecx",&DWP(4*2,"edi"));
+	&mov	("edx",&DWP(4*3,"edi"));
+	&sub	("eax",1);
+	&mov	(&DWP(64+4*12,"esp"),"eax");
+	&mov	(&DWP(64+4*13,"esp"),"ebx");
+	&mov	(&DWP(64+4*14,"esp"),"ecx");
+	&mov	(&DWP(64+4*15,"esp"),"edx");
+	&jmp	(&label("entry"));
+
+&set_label("outer_loop",16);
+	&mov	(&wparam(1),$b);		# save input
+	&mov	(&wparam(0),$a);		# save output
+	&mov	(&wparam(2),$c);		# save len
+&set_label("entry");
+	&mov	($a,0x61707865);
+	&mov	(&DWP(4*1,"esp"),0x3320646e);
+	&mov	(&DWP(4*2,"esp"),0x79622d32);
+	&mov	(&DWP(4*3,"esp"),0x6b206574);
+
+	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
+	&mov	($b_,&DWP(64+4*6,"esp"));
+	&mov	($c, &DWP(64+4*10,"esp"));
+	&mov	($c_,&DWP(64+4*11,"esp"));
+	&mov	($d, &DWP(64+4*13,"esp"));
+	&mov	($d_,&DWP(64+4*14,"esp"));
+	&mov	(&DWP(4*5,"esp"),$b);
+	&mov	(&DWP(4*6,"esp"),$b_);
+	&mov	(&DWP(4*10,"esp"),$c);
+	&mov	(&DWP(4*11,"esp"),$c_);
+	&mov	(&DWP(4*13,"esp"),$d);
+	&mov	(&DWP(4*14,"esp"),$d_);
+
+	&mov	($b, &DWP(64+4*7,"esp"));
+	&mov	($d_,&DWP(64+4*15,"esp"));
+	&mov	($d, &DWP(64+4*12,"esp"));
+	&mov	($b_,&DWP(64+4*4,"esp"));
+	&mov	($c, &DWP(64+4*8,"esp"));
+	&mov	($c_,&DWP(64+4*9,"esp"));
+	&add	($d,1);				# counter value
+	&mov	(&DWP(4*7,"esp"),$b);
+	&mov	(&DWP(4*15,"esp"),$d_);
+	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value
+
+	&mov	($b,10);			# loop counter
+	&jmp	(&label("loop"));
+
+&set_label("loop",16);
+	&add	($a,$b_);			# elsewhere
+	&mov	(&DWP(128,"esp"),$b);		# save loop counter
+	&mov	($b,$b_);
+	&QUARTERROUND(0, 4, 8, 12, 0);
+	&QUARTERROUND(1, 5, 9, 13, 1);
+	&QUARTERROUND(2, 6,10, 14, 2);
+	&QUARTERROUND(3, 7,11, 15, 3);
+	&QUARTERROUND(0, 5,10, 15, 4);
+	&QUARTERROUND(1, 6,11, 12, 5);
+	&QUARTERROUND(2, 7, 8, 13, 6);
+	&QUARTERROUND(3, 4, 9, 14, 7);
+	&dec	($b);
+	&jnz	(&label("loop"));
+
+	&mov	($b,&wparam(2));		# load len
+
+	&add	($a,0x61707865);		# accumulate key material
+	&add	($b_,&DWP(64+4*4,"esp"));
+	&add	($c, &DWP(64+4*8,"esp"));
+	&add	($c_,&DWP(64+4*9,"esp"));
+
+	&cmp	($b,64);
+	&jb	(&label("tail"));
+
+	&mov	($b,&wparam(1));		# load input pointer
+	&add	($d, &DWP(64+4*12,"esp"));
+	&add	($d_,&DWP(64+4*14,"esp"));
+
+	&xor	($a, &DWP(4*0,$b));		# xor with input
+	&xor	($b_,&DWP(4*4,$b));
+	&mov	(&DWP(4*0,"esp"),$a);		# off-load for later write
+	&mov	($a,&wparam(0));		# load output pointer
+	&xor	($c, &DWP(4*8,$b));
+	&xor	($c_,&DWP(4*9,$b));
+	&xor	($d, &DWP(4*12,$b));
+	&xor	($d_,&DWP(4*14,$b));
+	&mov	(&DWP(4*4,"esp"),$b_);
+	&mov	($b_,&DWP(4*0,"esp"));
+	&mov	(&DWP(4*8,"esp"),$c);
+	&mov	(&DWP(4*9,"esp"),$c_);
+	&mov	(&DWP(4*12,"esp"),$d);
+	&mov	(&DWP(4*14,"esp"),$d_);
+
+	&mov	(&DWP(4*0,$a),$b_);		# write output in order
+	&mov	($b_,&DWP(4*1,"esp"));
+	&mov	($c, &DWP(4*2,"esp"));
+	&mov	($c_,&DWP(4*3,"esp"));
+	&mov	($d, &DWP(4*5,"esp"));
+	&mov	($d_,&DWP(4*6,"esp"));
+	&add	($b_,0x3320646e);		# accumulate key material
+	&add	($c, 0x79622d32);
+	&add	($c_,0x6b206574);
+	&add	($d, &DWP(64+4*5,"esp"));
+	&add	($d_,&DWP(64+4*6,"esp"));
+	&xor	($b_,&DWP(4*1,$b));
+	&xor	($c, &DWP(4*2,$b));
+	&xor	($c_,&DWP(4*3,$b));
+	&xor	($d, &DWP(4*5,$b));
+	&xor	($d_,&DWP(4*6,$b));
+	&mov	(&DWP(4*1,$a),$b_);
+	&mov	($b_,&DWP(4*4,"esp"));
+	&mov	(&DWP(4*2,$a),$c);
+	&mov	(&DWP(4*3,$a),$c_);
+	&mov	(&DWP(4*4,$a),$b_);
+	&mov	(&DWP(4*5,$a),$d);
+	&mov	(&DWP(4*6,$a),$d_);
+
+	&mov	($c,&DWP(4*7,"esp"));
+	&mov	($d,&DWP(4*8,"esp"));
+	&mov	($d_,&DWP(4*9,"esp"));
+	&add	($c,&DWP(64+4*7,"esp"));
+	&mov	($b_, &DWP(4*10,"esp"));
+	&xor	($c,&DWP(4*7,$b));
+	&mov	($c_,&DWP(4*11,"esp"));
+	&mov	(&DWP(4*7,$a),$c);
+	&mov	(&DWP(4*8,$a),$d);
+	&mov	(&DWP(4*9,$a),$d_);
+
+	&add	($b_, &DWP(64+4*10,"esp"));
+	&add	($c_,&DWP(64+4*11,"esp"));
+	&xor	($b_, &DWP(4*10,$b));
+	&xor	($c_,&DWP(4*11,$b));
+	&mov	(&DWP(4*10,$a),$b_);
+	&mov	(&DWP(4*11,$a),$c_);
+
+	&mov	($c,&DWP(4*12,"esp"));
+	&mov	($c_,&DWP(4*14,"esp"));
+	&mov	($d, &DWP(4*13,"esp"));
+	&mov	($d_,&DWP(4*15,"esp"));
+	&add	($d, &DWP(64+4*13,"esp"));
+	&add	($d_,&DWP(64+4*15,"esp"));
+	&xor	($d, &DWP(4*13,$b));
+	&xor	($d_,&DWP(4*15,$b));
+	&lea	($b,&DWP(4*16,$b));
+	&mov	(&DWP(4*12,$a),$c);
+	&mov	($c,&wparam(2));		# len
+	&mov	(&DWP(4*13,$a),$d);
+	&mov	(&DWP(4*14,$a),$c_);
+	&mov	(&DWP(4*15,$a),$d_);
+	&lea	($a,&DWP(4*16,$a));
+	&sub	($c,64);
+	&jnz	(&label("outer_loop"));
+
+	&jmp	(&label("done"));
+
+&set_label("tail");
+	&add	($d, &DWP(64+4*12,"esp"));
+	&add	($d_,&DWP(64+4*14,"esp"));
+	&mov	(&DWP(4*0,"esp"),$a);
+	&mov	(&DWP(4*4,"esp"),$b_);
+	&mov	(&DWP(4*8,"esp"),$c);
+	&mov	(&DWP(4*9,"esp"),$c_);
+	&mov	(&DWP(4*12,"esp"),$d);
+	&mov	(&DWP(4*14,"esp"),$d_);
+
+	&mov	($b_,&DWP(4*1,"esp"));
+	&mov	($c, &DWP(4*2,"esp"));
+	&mov	($c_,&DWP(4*3,"esp"));
+	&mov	($d, &DWP(4*5,"esp"));
+	&mov	($d_,&DWP(4*6,"esp"));
+	&add	($b_,0x3320646e);		# accumulate key material
+	&add	($c, 0x79622d32);
+	&add	($c_,0x6b206574);
+	&add	($d, &DWP(64+4*5,"esp"));
+	&add	($d_,&DWP(64+4*6,"esp"));
+	&mov	(&DWP(4*1,"esp"),$b_);
+	&mov	(&DWP(4*2,"esp"),$c);
+	&mov	(&DWP(4*3,"esp"),$c_);
+	&mov	(&DWP(4*5,"esp"),$d);
+	&mov	(&DWP(4*6,"esp"),$d_);
+
+	&mov	($b_,&DWP(4*7,"esp"));
+	&mov	($c, &DWP(4*10,"esp"));
+	&mov	($c_,&DWP(4*11,"esp"));
+	&mov	($d, &DWP(4*13,"esp"));
+	&mov	($d_,&DWP(4*15,"esp"));
+	&add	($b_,&DWP(64+4*7,"esp"));
+	&add	($c, &DWP(64+4*10,"esp"));
+	&add	($c_,&DWP(64+4*11,"esp"));
+	&add	($d, &DWP(64+4*13,"esp"));
+	&add	($d_,&DWP(64+4*15,"esp"));
+	&mov	(&DWP(4*7,"esp"),$b_);
+	&mov	($b_,&wparam(1));		# load input
+	&mov	(&DWP(4*10,"esp"),$c);
+	&mov	($c,&wparam(0));		# load output
+	&mov	(&DWP(4*11,"esp"),$c_);
+	&xor	($c_,$c_);
+	&mov	(&DWP(4*13,"esp"),$d);
+	&mov	(&DWP(4*15,"esp"),$d_);
+
+	&xor	("eax","eax");
+	&xor	("edx","edx");
+&set_label("tail_loop");
+	&movb	("al",&BP(0,$c_,$b_));
+	&movb	("dl",&BP(0,"esp",$c_));
+	&lea	($c_,&DWP(1,$c_));
+	&xor	("al","dl");
+	&mov	(&BP(-1,$c,$c_),"al");
+	&dec	($b);
+	&jnz	(&label("tail_loop"));
+
+&set_label("done");
+	&stack_pop(33);
+&set_label("no_data");
+&function_end("ChaCha20_ctr32");
+
+if ($xmm) {
+my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
+my ($out,$inp,$len)=("edi","esi","ecx");
+
+sub QUARTERROUND_SSSE3 {
+my ($ai,$bi,$ci,$di,$i)=@_;
+my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
+my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
+
+	#       a   b   c   d
+	#
+	#       0   4   8  12 < even round
+	#       1   5   9  13
+	#       2   6  10  14
+	#       3   7  11  15
+	#       0   5  10  15 < odd round
+	#       1   6  11  12
+	#       2   7   8  13
+	#       3   4   9  14
+
+	if ($i==0) {
+            my $j=4;
+	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
+	} elsif ($i==3) {
+            my $j=0;
+	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
+	} elsif ($i==4) {
+            my $j=4;
+	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
+	} elsif ($i==7) {
+            my $j=0;
+	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
+	}
+
+	#&paddd	($xa,$xb);			# see elsewhere
+	#&pxor	($xd,$xa);			# see elsewhere
+	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
+	&pshufb	($xd,&QWP(0,"eax"));		# rot16
+	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
+	&paddd	($xc,$xd);
+	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
+	&pxor	($xb,$xc);
+	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
+	&movdqa	($xa_,$xb);			# borrow as temporary
+	&pslld	($xb,12);
+	&psrld	($xa_,20);
+	&por	($xb,$xa_);
+	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
+	&paddd	($xa,$xb);
+	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
+	&pxor	($xd,$xa);
+	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
+	&pshufb	($xd,&QWP(16,"eax"));		# rot8
+	&paddd	($xc,$xd);
+	&movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
+	&movdqa	($xd_,$xd)			if ($di==$dn);
+	&pxor	($xb,$xc);
+	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
+	&movdqa	($xa,$xb);			# borrow as temporary
+	&pslld	($xb,7);
+	&psrld	($xa,25);
+	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
+	&por	($xb,$xa);
+
+	($xa,$xa_)=($xa_,$xa);
+	($xb,$xb_)=($xb_,$xb);
+	($xc,$xc_)=($xc_,$xc);
+	($xd,$xd_)=($xd_,$xd);
+}
+
+&function_begin("ChaCha20_ssse3");
+&set_label("ssse3_shortcut");
+	&mov		($out,&wparam(0));
+	&mov		($inp,&wparam(1));
+	&mov		($len,&wparam(2));
+	&mov		("edx",&wparam(3));		# key
+	&mov		("ebx",&wparam(4));		# counter and nonce
+
+	&mov		("ebp","esp");
+	&stack_push	(131);
+	&and		("esp",-64);
+	&mov		(&DWP(512,"esp"),"ebp");
+
+	&lea		("eax",&DWP(&label("ssse3_data")."-".
+				    &label("pic_point"),"eax"));
+	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce
+
+	&cmp		($len,64*4);
+	&jb		(&label("1x"));
+
+	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
+	&mov		(&DWP(512+8,"esp"),"ebx");
+	&sub		($len,64*4);			# bias len
+	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization
+
+	&movdqu		("xmm7",&QWP(0,"edx"));		# key
+	&pshufd		("xmm0","xmm3",0x00);
+	&pshufd		("xmm1","xmm3",0x55);
+	&pshufd		("xmm2","xmm3",0xaa);
+	&pshufd		("xmm3","xmm3",0xff);
+	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
+	&pshufd		("xmm4","xmm7",0x00);
+	&pshufd		("xmm5","xmm7",0x55);
+	 &psubd		("xmm0",&QWP(16*4,"eax"));
+	&pshufd		("xmm6","xmm7",0xaa);
+	&pshufd		("xmm7","xmm7",0xff);
+	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
+	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
+	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
+	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
+	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
+	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
+	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
+	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
+	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
+	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
+	 &lea		("ebx",&DWP(128,"esp"));	# size optimization
+
+	&pshufd		("xmm0","xmm3",0x00);
+	&pshufd		("xmm1","xmm3",0x55);
+	&pshufd		("xmm2","xmm3",0xaa);
+	&pshufd		("xmm3","xmm3",0xff);
+	&pshufd		("xmm4","xmm7",0x00);
+	&pshufd		("xmm5","xmm7",0x55);
+	&pshufd		("xmm6","xmm7",0xaa);
+	&pshufd		("xmm7","xmm7",0xff);
+	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
+	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
+	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
+	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
+	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
+	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
+	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
+	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");
+
+	&lea		($inp,&DWP(128,$inp));		# size optimization
+	&lea		($out,&DWP(128,$out));		# size optimization
+	&jmp		(&label("outer_loop"));
+
+&set_label("outer_loop",16);
+	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
+	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
+	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
+	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
+	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
+	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
+	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
+	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
+	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
+	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
+	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
+	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
+	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
+	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
+	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
+	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
+	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
+	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
+	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
+	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
+	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
+	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
+	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
+	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
+	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
+	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
+	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
+	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
+	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
+	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
+	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
+	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
+	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
+	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value
+
+	&movdqa		($xa, &QWP(16*0-128,"ebp"));
+	&movdqa		($xd, "xmm4");
+	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
+	&movdqa		($xc, &QWP(16*8-128,"ebp"));
+	&movdqa		($xc_,&QWP(16*9-128,"ebp"));
+
+	&mov		("edx",10);			# loop counter
+	&nop		();
+
+&set_label("loop",16);
+	&paddd		($xa,$xb_);			# elsewhere
+	&movdqa		($xb,$xb_);
+	&pxor		($xd,$xa);			# elsewhere
+	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
+	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
+	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
+	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
+	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
+	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
+	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
+	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
+	&dec		("edx");
+	&jnz		(&label("loop"));
+
+	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
+	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
+	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
+	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
+	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);
+
+    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
+
+    for($i=0;$i<256;$i+=64) {
+	#&movdqa	($xa0,&QWP($i+16*0-128,"ebx"));	# it's there
+	&movdqa		($xa1,&QWP($i+16*1-128,"ebx"));
+	&movdqa		($xa2,&QWP($i+16*2-128,"ebx"));
+	&movdqa		($xa3,&QWP($i+16*3-128,"ebx"));
+
+	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
+	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
+	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
+	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));
+
+	&movdqa		($xt2,$xa0);		# "de-interlace" data
+	&punpckldq	($xa0,$xa1);
+	&movdqa		($xt3,$xa2);
+	&punpckldq	($xa2,$xa3);
+	&punpckhdq	($xt2,$xa1);
+	&punpckhdq	($xt3,$xa3);
+	&movdqa		($xa1,$xa0);
+	&punpcklqdq	($xa0,$xa2);		# "a0"
+	&movdqa		($xa3,$xt2);
+	&punpcklqdq	($xt2,$xt3);		# "a2"
+	&punpckhqdq	($xa1,$xa2);		# "a1"
+	&punpckhqdq	($xa3,$xt3);		# "a3"
+
+	#($xa2,$xt2)=($xt2,$xa2);
+
+	&movdqa		(&QWP($i+16*0-128,"ebx"),$xa0);
+	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
+	&movdqa		(&QWP($i+16*1-128,"ebx"),$xa1);
+	&movdqa		(&QWP($i+16*2-128,"ebx"),$xt2);
+	&movdqa		(&QWP($i+16*3-128,"ebx"),$xa3);
+    }
+    for($i=0;$i<256;$i+=64) {
+	my $j = 16*($i/64);
+	&movdqu		($xa0,&QWP($i+16*0-128,$inp));	# load input
+	&movdqu		($xa1,&QWP($i+16*1-128,$inp));
+	&movdqu		($xa2,&QWP($i+16*2-128,$inp));
+	&movdqu		($xa3,&QWP($i+16*3-128,$inp));
+	&pxor		($xa0,&QWP($j+64*0-128,"ebx"));
+	&pxor		($xa1,&QWP($j+64*1-128,"ebx"));
+	&pxor		($xa2,&QWP($j+64*2-128,"ebx"));
+	&pxor		($xa3,&QWP($j+64*3-128,"ebx"));
+	&movdqu		(&QWP($i+16*0-128,$out),$xa0);	# write output
+	&movdqu		(&QWP($i+16*1-128,$out),$xa1);
+	&movdqu		(&QWP($i+16*2-128,$out),$xa2);
+	&movdqu		(&QWP($i+16*3-128,$out),$xa3);
+    }
+	&lea		($inp,&DWP(256,$inp));
+	&lea		($out,&DWP(256,$out));
+	&sub		($len,64*4);
+	&jnc		(&label("outer_loop"));
+
+	&add		($len,64*4);
+	&jz		(&label("done"));
+
+	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
+	&lea		($inp,&DWP(-128,$inp));
+	&mov		("edx",&DWP(512+4,"esp"));
+	&lea		($out,&DWP(-128,$out));
+
+	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
+	&movdqu		("xmm3",&QWP(0,"ebx"));
+	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
+	&pand		("xmm3",&QWP(16*7,"eax"));
+	&por		("xmm3","xmm2");		# counter value
+{
+my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
+
+sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
+	&paddd		($a,$b);	# a += b
+	&pxor		($d,$a);	# d ^= a
+	&pshufb		($d,$rot16);	# d <<<= 16 via byte shuffle
+
+	&paddd		($c,$d);	# c += d
+	&pxor		($b,$c);	# b ^= c
+	&movdqa		($t,$b);	# copy b so both shift halves can be formed
+	&psrld		($b,20);
+	&pslld		($t,12);
+	&por		($b,$t);	# b <<<= 12
+
+	&paddd		($a,$b);	# a += b
+	&pxor		($d,$a);	# d ^= a
+	&pshufb		($d,$rot24);	# d <<<= 8 (i.e. >>> 24) via byte shuffle
+
+	&paddd		($c,$d);	# c += d
+	&pxor		($b,$c);	# b ^= c
+	&movdqa		($t,$b);	# copy b so both shift halves can be formed
+	&psrld		($b,25);
+	&pslld		($t,7);
+	&por		($b,$t);	# b <<<= 7
+}
+
+&set_label("1x");
+	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
+	&movdqu		($b,&QWP(0,"edx"));
+	&movdqu		($c,&QWP(16,"edx"));
+	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
+	&movdqa		($rot16,&QWP(0,"eax"));
+	&movdqa		($rot24,&QWP(16,"eax"));
+	&mov		(&DWP(16*3,"esp"),"ebp");
+
+	&movdqa		(&QWP(16*0,"esp"),$a);
+	&movdqa		(&QWP(16*1,"esp"),$b);
+	&movdqa		(&QWP(16*2,"esp"),$c);
+	&movdqa		(&QWP(16*3,"esp"),$d);
+	&mov		("edx",10);
+	&jmp		(&label("loop1x"));
+
+&set_label("outer1x",16);
+	&movdqa		($d,&QWP(16*5,"eax"));		# one
+	&movdqa		($a,&QWP(16*0,"esp"));
+	&movdqa		($b,&QWP(16*1,"esp"));
+	&movdqa		($c,&QWP(16*2,"esp"));
+	&paddd		($d,&QWP(16*3,"esp"));
+	&mov		("edx",10);
+	&movdqa		(&QWP(16*3,"esp"),$d);
+	&jmp		(&label("loop1x"));
+
+&set_label("loop1x",16);
+	&SSSE3ROUND();
+	&pshufd	($c,$c,0b01001110);
+	&pshufd	($b,$b,0b00111001);
+	&pshufd	($d,$d,0b10010011);
+	&nop	();
+
+	&SSSE3ROUND();
+	&pshufd	($c,$c,0b01001110);
+	&pshufd	($b,$b,0b10010011);
+	&pshufd	($d,$d,0b00111001);
+
+	&dec		("edx");
+	&jnz		(&label("loop1x"));
+
+	&paddd		($a,&QWP(16*0,"esp"));
+	&paddd		($b,&QWP(16*1,"esp"));
+	&paddd		($c,&QWP(16*2,"esp"));
+	&paddd		($d,&QWP(16*3,"esp"));
+
+	&cmp		($len,64);
+	&jb		(&label("tail"));
+
+	&movdqu		($t,&QWP(16*0,$inp));
+	&movdqu		($t1,&QWP(16*1,$inp));
+	&pxor		($a,$t);		# xor with input
+	&movdqu		($t,&QWP(16*2,$inp));
+	&pxor		($b,$t1);
+	&movdqu		($t1,&QWP(16*3,$inp));
+	&pxor		($c,$t);
+	&pxor		($d,$t1);
+	&lea		($inp,&DWP(16*4,$inp));	# inp+=64
+
+	&movdqu		(&QWP(16*0,$out),$a);	# write output
+	&movdqu		(&QWP(16*1,$out),$b);
+	&movdqu		(&QWP(16*2,$out),$c);
+	&movdqu		(&QWP(16*3,$out),$d);
+	&lea		($out,&DWP(16*4,$out));	# out+=64
+
+	&sub		($len,64);
+	&jnz		(&label("outer1x"));
+
+	&jmp		(&label("done"));
+
+&set_label("tail");
+	&movdqa		(&QWP(16*0,"esp"),$a);
+	&movdqa		(&QWP(16*1,"esp"),$b);
+	&movdqa		(&QWP(16*2,"esp"),$c);
+	&movdqa		(&QWP(16*3,"esp"),$d);
+
+	&xor		("eax","eax");
+	&xor		("edx","edx");
+	&xor		("ebp","ebp");
+
+&set_label("tail_loop");
+	&movb		("al",&BP(0,"esp","ebp"));
+	&movb		("dl",&BP(0,$inp,"ebp"));
+	&lea		("ebp",&DWP(1,"ebp"));
+	&xor		("al","dl");
+	&movb		(&BP(-1,$out,"ebp"),"al");
+	&dec		($len);
+	&jnz		(&label("tail_loop"));
+}
+&set_label("done");
+	&mov		("esp",&DWP(512,"esp"));
+&function_end("ChaCha20_ssse3");
+
+&align	(64);
+&set_label("ssse3_data");
+&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
+&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
+&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
+&data_word(0,1,2,3);
+&data_word(4,4,4,4);
+&data_word(1,0,0,0);
+&data_word(4,0,0,0);
+&data_word(0,-1,-1,-1);
+&align	(64);
+}
+&asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
@@ -21,7 +21,49 @@
 #include <openssl/cpu.h>


-#if defined(OPENSSL_WINDOWS) || (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || !defined(__SSE2__)
+#define U8TO32_LITTLE(p)                              \
+  (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \
+   ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
+
+#if !defined(OPENSSL_NO_ASM) &&                         \
+    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+     defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
+
+/* ChaCha20_ctr32 is defined in asm/chacha-*.pl. */
+void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
+                    const uint32_t key[8], const uint32_t counter[4]);
+
+void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
+                      const uint8_t key[32], const uint8_t nonce[12],
+                      uint32_t counter) {
+  uint32_t counter_nonce[4]; /* word 0: |counter|; words 1-3: |nonce| */
+  counter_nonce[0] = counter;
+  counter_nonce[1] = U8TO32_LITTLE(nonce + 0);
+  counter_nonce[2] = U8TO32_LITTLE(nonce + 4);
+  counter_nonce[3] = U8TO32_LITTLE(nonce + 8);
+
+  const uint32_t *key_ptr = (const uint32_t *)key; /* default: use in place */
+#if !defined(OPENSSL_X86) && !defined(OPENSSL_X86_64)
+  /* The assembly expects the key to be four-byte aligned; copy it if not. */
+  uint32_t key_u32[8];
+  if ((((uintptr_t)key) & 3) != 0) { /* low two address bits set */
+    key_u32[0] = U8TO32_LITTLE(key + 0);
+    key_u32[1] = U8TO32_LITTLE(key + 4);
+    key_u32[2] = U8TO32_LITTLE(key + 8);
+    key_u32[3] = U8TO32_LITTLE(key + 12);
+    key_u32[4] = U8TO32_LITTLE(key + 16);
+    key_u32[5] = U8TO32_LITTLE(key + 20);
+    key_u32[6] = U8TO32_LITTLE(key + 24);
+    key_u32[7] = U8TO32_LITTLE(key + 28);
+
+    key_ptr = key_u32; /* hand the aligned copy to the assembly instead */
+  }
+#endif
+
+  ChaCha20_ctr32(out, in, in_len, key_ptr, counter_nonce);
+}
+
+#else

 /* sigma contains the ChaCha constants, which happen to be an ASCII string. */
 static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3',
@@ -40,10 +82,6 @@ static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3',
    (p)[3] = (v >> 24) & 0xff; \
  }

-#define U8TO32_LITTLE(p)                              \
-  (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \
-   ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
-
 /* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */
 #define QUARTERROUND(a,b,c,d) \
  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
@@ -51,13 +89,6 @@ static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3',
  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);

-#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM)
-/* Defined in chacha_vec.c */
-void CRYPTO_chacha_20_neon(uint8_t *out, const uint8_t *in, size_t in_len,
-                           const uint8_t key[32], const uint8_t nonce[12],
-                           uint32_t counter);
-#endif
-
 /* chacha_core performs 20 rounds of ChaCha on the input words in
 * |input| and writes the 64 output bytes to |output|. */
 static void chacha_core(uint8_t output[64], const uint32_t input[16]) {
@@ -91,13 +122,6 @@ void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
  uint8_t buf[64];
  size_t todo, i;

-#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM)
-  if (CRYPTO_is_NEON_capable()) {
-    CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter);
-    return;
-  }
-#endif
-
  input[0] = U8TO32_LITTLE(sigma + 0);
  input[1] = U8TO32_LITTLE(sigma + 4);
  input[2] = U8TO32_LITTLE(sigma + 8);
@@ -137,4 +161,4 @@ void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
  }
 }

-#endif /* OPENSSL_WINDOWS || !OPENSSL_X86_64 && !OPENSSL_X86 || !__SSE2__ */
+#endif
@@ -0,0 +1,257 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <memory>
+
+#include <openssl/crypto.h>
+#include <openssl/chacha.h>
+
+
+static const uint8_t kKey[32] = {
+    0x98, 0xbe, 0xf1, 0x46, 0x9b, 0xe7, 0x26, 0x98, 0x37, 0xa4, 0x5b,
+    0xfb, 0xc9, 0x2a, 0x5a, 0x6a, 0xc7, 0x62, 0x50, 0x7c, 0xf9, 0x64,
+    0x43, 0xbf, 0x33, 0xb9, 0x6b, 0x1b, 0xd4, 0xc6, 0xf8, 0xf6,
+};
+
+static const uint8_t kNonce[12] = {
+    0x44, 0xe7, 0x92, 0xd6, 0x33, 0x35, 0xab, 0xb1, 0x58, 0x2e, 0x92, 0x53,
+};
+
+static const uint32_t kCounter = 42;  // counter passed to every test call
+
+static const uint8_t kInput[] = {
+    0x58, 0x28, 0xd5, 0x30, 0x36, 0x2c, 0x60, 0x55, 0x29, 0xf8, 0xe1, 0x8c,
+    0xae, 0x15, 0x15, 0x26, 0xf2, 0x3a, 0x73, 0xa0, 0xf3, 0x12, 0xa3, 0x88,
+    0x5f, 0x2b, 0x74, 0x23, 0x3d, 0xc9, 0x05, 0x23, 0xc6, 0x54, 0x49, 0x1e,
+    0x44, 0x88, 0x14, 0xd9, 0xda, 0x37, 0x15, 0xdc, 0xb7, 0xe4, 0x23, 0xb3,
+    0x9d, 0x7e, 0x16, 0x68, 0x35, 0xfc, 0x02, 0x6d, 0xcc, 0x8a, 0xe5, 0xdd,
+    0x5f, 0xe4, 0xd2, 0x56, 0x6f, 0x12, 0x9c, 0x9c, 0x7d, 0x6a, 0x38, 0x48,
+    0xbd, 0xdf, 0xd9, 0xac, 0x1b, 0xa2, 0x4d, 0xc5, 0x43, 0x04, 0x3c, 0xd7,
+    0x99, 0xe1, 0xa7, 0x13, 0x9c, 0x51, 0xc2, 0x6d, 0xf9, 0xcf, 0x07, 0x3b,
+    0xe4, 0xbf, 0x93, 0xa3, 0xa9, 0xb4, 0xc5, 0xf0, 0x1a, 0xe4, 0x8d, 0x5f,
+    0xc6, 0xc4, 0x7c, 0x69, 0x7a, 0xde, 0x1a, 0xc1, 0xc9, 0xcf, 0xc2, 0x4e,
+    0x7a, 0x25, 0x2c, 0x32, 0xe9, 0x17, 0xba, 0x68, 0xf1, 0x37, 0x5d, 0x62,
+    0x84, 0x46, 0xf5, 0x80, 0x7f, 0x1a, 0x71, 0xf7, 0xbe, 0x72, 0x4b, 0xb8,
+    0x1c, 0xfe, 0x3e, 0xbd, 0xae, 0x0d, 0x73, 0x0d, 0x87, 0x4a, 0x31, 0xc3,
+    0x3d, 0x46, 0x6f, 0xb3, 0xd7, 0x6b, 0xe3, 0xb8, 0x70, 0x17, 0x8e, 0x7a,
+    0x6a, 0x0e, 0xbf, 0xa8, 0xbc, 0x2b, 0xdb, 0xfa, 0x4f, 0xb6, 0x26, 0x20,
+    0xee, 0x63, 0xf0, 0x6d, 0x26, 0xac, 0x6a, 0x18, 0x37, 0x6e, 0x59, 0x81,
+    0xd1, 0x60, 0xe6, 0x40, 0xd5, 0x6d, 0x68, 0xba, 0x8b, 0x65, 0x4a, 0xf9,
+    0xf1, 0xae, 0x56, 0x24, 0x8f, 0xe3, 0x8e, 0xe7, 0x7e, 0x6f, 0xcf, 0x92,
+    0xdf, 0xa9, 0x75, 0x3a, 0xd6, 0x2e, 0x1c, 0xaf, 0xf2, 0xd6, 0x8b, 0x39,
+    0xad, 0xd2, 0x5d, 0xfb, 0xd7, 0xdf, 0x05, 0x57, 0x0d, 0xf7, 0xf6, 0x8f,
+    0x2d, 0x14, 0xb0, 0x4e, 0x1a, 0x3c, 0x77, 0x04, 0xcd, 0x3c, 0x5c, 0x58,
+    0x52, 0x10, 0x6f, 0xcf, 0x5c, 0x03, 0xc8, 0x5f, 0x85, 0x2b, 0x05, 0x82,
+    0x60, 0xda, 0xcc, 0xcd, 0xd6, 0x88, 0xbf, 0xc0, 0x10, 0xb3, 0x6f, 0x54,
+    0x54, 0x42, 0xbc, 0x4b, 0x77, 0x21, 0x4d, 0xee, 0x87, 0x45, 0x06, 0x4c,
+    0x60, 0x38, 0xd2, 0x7e, 0x1d, 0x30, 0x6c, 0x55, 0xf0, 0x38, 0x80, 0x1c,
+    0xde, 0x3d, 0xea, 0x68, 0x3e, 0xf6, 0x3e, 0x59, 0xcf, 0x0d, 0x08, 0xae,
+    0x8c, 0x02, 0x0b, 0xc1, 0x72, 0x6a, 0xb4, 0x6d, 0xf3, 0xf7, 0xb3, 0xef,
+    0x3a, 0xb1, 0x06, 0xf2, 0xf4, 0xd6, 0x69, 0x7b, 0x3e, 0xa2, 0x16, 0x31,
+    0x31, 0x79, 0xb6, 0x33, 0xa9, 0xca, 0x8a, 0xa8, 0xbe, 0xf3, 0xe9, 0x38,
+    0x28, 0xd1, 0xe1, 0x3b, 0x4e, 0x2e, 0x47, 0x35, 0xa4, 0x61, 0x14, 0x1e,
+    0x42, 0x2c, 0x49, 0x55, 0xea, 0xe3, 0xb3, 0xce, 0x39, 0xd3, 0xb3, 0xef,
+    0x4a, 0x4d, 0x78, 0x49, 0xbd, 0xf6, 0x7c, 0x0a, 0x2c, 0xd3, 0x26, 0xcb,
+    0xd9, 0x6a, 0xad, 0x63, 0x93, 0xa7, 0x29, 0x92, 0xdc, 0x1f, 0xaf, 0x61,
+    0x82, 0x80, 0x74, 0xb2, 0x9c, 0x4a, 0x86, 0x73, 0x50, 0xd8, 0xd1, 0xff,
+    0xee, 0x1a, 0xe2, 0xdd, 0xa2, 0x61, 0xbd, 0x10, 0xc3, 0x5f, 0x67, 0x9f,
+    0x29, 0xe4, 0xd3, 0x70, 0xe5, 0x67, 0x3a, 0xd2, 0x20, 0x00, 0xcc, 0x25,
+    0x15, 0x96, 0x54, 0x45, 0x85, 0xed, 0x82, 0x88, 0x3b, 0x9f, 0x3b, 0xc3,
+    0x04, 0xd4, 0x23, 0xb1, 0x0d, 0xdc, 0xc8, 0x26, 0x9d, 0x28, 0xb3, 0x25,
+    0x4d, 0x52, 0xe5, 0x33, 0xf3, 0xed, 0x2c, 0xb8, 0x1a, 0xcf, 0xc3, 0x52,
+    0xb4, 0x2f, 0xc7, 0x79, 0x96, 0x14, 0x7d, 0x72, 0x27, 0x72, 0x85, 0xea,
+    0x6d, 0x41, 0xa0, 0x22, 0x13, 0x6d, 0x06, 0x83, 0xa4, 0xdd, 0x0f, 0x69,
+    0xd2, 0x01, 0xcd, 0xc6, 0xb8, 0x64, 0x5c, 0x2c, 0x79, 0xd1, 0xc7, 0xd3,
+    0x31, 0xdb, 0x2c, 0xff, 0xda, 0xd0, 0x69, 0x31, 0xad, 0x83, 0x5f, 0xed,
+    0x6a, 0x97, 0xe4, 0x00, 0x43, 0xb0, 0x2e, 0x97, 0xae, 0x00, 0x5f, 0x5c,
+    0xb9, 0xe8, 0x39, 0x80, 0x10, 0xca, 0x0c, 0xfa, 0xf0, 0xb5, 0xcd, 0xaa,
+    0x27, 0x11, 0x60, 0xd9, 0x21, 0x86, 0x93, 0x91, 0x9f, 0x2d, 0x1a, 0x8e,
+    0xde, 0x0b, 0xb5, 0xcb, 0x05, 0x24, 0x30, 0x45, 0x4d, 0x11, 0x75, 0xfd,
+    0xe5, 0xa0, 0xa9, 0x4e, 0x3a, 0x8c, 0x3b, 0x52, 0x5a, 0x37, 0x18, 0x05,
+    0x4a, 0x7a, 0x09, 0x6a, 0xe6, 0xd5, 0xa9, 0xa6, 0x71, 0x47, 0x4c, 0x50,
+    0xe1, 0x3e, 0x8a, 0x21, 0x2b, 0x4f, 0x0e, 0xe3, 0xcb, 0x72, 0xc5, 0x28,
+    0x3e, 0x5a, 0x33, 0xec, 0x48, 0x92, 0x2e, 0xa1, 0x24, 0x57, 0x09, 0x0f,
+    0x01, 0x85, 0x3b, 0x34, 0x39, 0x7e, 0xc7, 0x90, 0x62, 0xe2, 0xdc, 0x5d,
+    0x0a, 0x2c, 0x51, 0x26, 0x95, 0x3a, 0x95, 0x92, 0xa5, 0x39, 0x8f, 0x0c,
+    0x83, 0x0b, 0x9d, 0x38, 0xab, 0x98, 0x2a, 0xc4, 0x01, 0xc4, 0x0d, 0x77,
+    0x13, 0xcb, 0xca, 0xf1, 0x28, 0x31, 0x52, 0x75, 0x27, 0x2c, 0xf0, 0x04,
+    0x86, 0xc8, 0xf3, 0x3d, 0xf2, 0x9d, 0x8f, 0x55, 0x52, 0x40, 0x3f, 0xaa,
+    0x22, 0x7f, 0xe7, 0x69, 0x3b, 0xee, 0x44, 0x09, 0xde, 0xff, 0xb0, 0x69,
+    0x3a, 0xae, 0x74, 0xe9, 0x9d, 0x33, 0xae, 0x8b, 0x6d, 0x60, 0x04, 0xff,
+    0x53, 0x3f, 0x88, 0xe9, 0x63, 0x9b, 0xb1, 0x6d, 0x2c, 0x22, 0x15, 0x5a,
+    0x15, 0xd9, 0xe5, 0xcb, 0x03, 0x78, 0x3c, 0xca, 0x59, 0x8c, 0xc8, 0xc2,
+    0x86, 0xff, 0xd2, 0x79, 0xd6, 0xc6, 0xec, 0x5b, 0xbb, 0xa0, 0xae, 0x01,
+    0x20, 0x09, 0x2e, 0x38, 0x5d, 0xda, 0x5d, 0xe0, 0x59, 0x4e, 0xe5, 0x8b,
+    0x84, 0x8f, 0xb6, 0xe0, 0x56, 0x9f, 0x21, 0xa1, 0xcf, 0xb2, 0x0f, 0x2c,
+    0x93, 0xf8, 0xcf, 0x37, 0xc1, 0x9f, 0x32, 0x98, 0x21, 0x65, 0x52, 0x66,
+    0x6e, 0xd3, 0x71, 0x98, 0x55, 0xb9, 0x46, 0x9f, 0x1a, 0x35, 0xc4, 0x47,
+    0x69, 0x62, 0x70, 0x4b, 0x77, 0x9e, 0xe4, 0x21, 0xe6, 0x32, 0x5a, 0x26,
+    0x05, 0xba, 0x57, 0x53, 0xd7, 0x9b, 0x55, 0x3c, 0xbb, 0x53, 0x79, 0x60,
+    0x9c, 0xc8, 0x4d, 0xf7, 0xf5, 0x1d, 0x54, 0x02, 0x91, 0x68, 0x0e, 0xaa,
+    0xca, 0x5a, 0x78, 0x0c, 0x28, 0x9a, 0xc3, 0xac, 0x49, 0xc0, 0xf4, 0x85,
+    0xee, 0x59, 0x76, 0x7e, 0x28, 0x4e, 0xf1, 0x5c, 0x63, 0xf7, 0xce, 0x0e,
+    0x2c, 0x21, 0xa0, 0x58, 0xe9, 0x01, 0xfd, 0xeb, 0xd1, 0xaf, 0xe6, 0xef,
+    0x93, 0xb3, 0x95, 0x51, 0x60, 0xa2, 0x74, 0x40, 0x15, 0xe5, 0xf4, 0x0a,
+    0xca, 0x6d, 0x9a, 0x37, 0x42, 0x4d, 0x5a, 0x58, 0x49, 0x0f, 0xe9, 0x02,
+    0xfc, 0x77, 0xd8, 0x59, 0xde, 0xdd, 0xad, 0x4b, 0x99, 0x2e, 0x64, 0x73,
+    0xad, 0x42, 0x2f, 0xf3, 0x2c, 0x0d, 0x49, 0xe4, 0x2e, 0x6c, 0xa4, 0x73,
+    0x75, 0x18, 0x14, 0x85, 0xbb, 0x64, 0xb4, 0xa1, 0xb0, 0x6e, 0x01, 0xc0,
+    0xcf, 0x17, 0x9c, 0xc5, 0x28, 0xc3, 0x2d, 0x6c, 0x17, 0x2a, 0x3d, 0x06,
+    0x5c, 0xf3, 0xb4, 0x49, 0x75, 0xad, 0x17, 0x69, 0xd4, 0xca, 0x65, 0xae,
+    0x44, 0x71, 0xa5, 0xf6, 0x0d, 0x0f, 0x8e, 0x37, 0xc7, 0x43, 0xce, 0x6b,
+    0x08, 0xe9, 0xd1, 0x34, 0x48, 0x8f, 0xc9, 0xfc, 0xf3, 0x5d, 0x2d, 0xec,
+    0x62, 0xd3, 0xf0, 0xb3, 0xfe, 0x2e, 0x40, 0x55, 0x76, 0x54, 0xc7, 0xb4,
+    0x61, 0x16, 0xcc, 0x7c, 0x1c, 0x19, 0x24, 0xe6, 0x4d, 0xd4, 0xc3, 0x77,
+    0x67, 0x1f, 0x3c, 0x74, 0x79, 0xa1, 0xf8, 0x85, 0x88, 0x1d, 0x6f, 0xa4,
+    0x7e, 0x2c, 0x21, 0x9f, 0x49, 0xf5, 0xaa, 0x4e, 0xf3, 0x4a, 0xfa, 0x9d,
+    0xbe, 0xf6, 0xce, 0xda, 0xb5, 0xab, 0x39, 0xbd, 0x16, 0x41, 0xa9, 0x4a,
+    0xac, 0x09, 0x01, 0xca,
+};
+static const uint8_t kOutput[] = {
+    0x54, 0x30, 0x6a, 0x13, 0xda, 0x59, 0x6b, 0x6d, 0x59, 0x49, 0xc8, 0xc5,
+    0xab, 0x26, 0xd4, 0x8a, 0xad, 0xc0, 0x3d, 0xaf, 0x14, 0xb9, 0x15, 0xb8,
+    0xca, 0xdf, 0x17, 0xa7, 0x03, 0xd3, 0xc5, 0x06, 0x01, 0xef, 0x21, 0xdd,
+    0xa3, 0x0b, 0x9e, 0x48, 0xb8, 0x5e, 0x0b, 0x87, 0x9f, 0x95, 0x23, 0x68,
+    0x85, 0x69, 0xd2, 0x5d, 0xaf, 0x57, 0xe9, 0x27, 0x11, 0x3d, 0x49, 0xfa,
+    0xf1, 0x08, 0xcc, 0x15, 0xec, 0x1d, 0x19, 0x16, 0x12, 0x9b, 0xc8, 0x66,
+    0x1f, 0xfa, 0x2c, 0x93, 0xf4, 0x99, 0x11, 0x27, 0x31, 0x0e, 0xd8, 0x46,
+    0x47, 0x40, 0x11, 0x70, 0x01, 0xca, 0xe8, 0x5b, 0xc5, 0x91, 0xc8, 0x3a,
+    0xdc, 0xaa, 0xf3, 0x4b, 0x80, 0xe5, 0xbc, 0x03, 0xd0, 0x89, 0x72, 0xbc,
+    0xce, 0x2a, 0x76, 0x0c, 0xf5, 0xda, 0x4c, 0x10, 0x06, 0x35, 0x41, 0xb1,
+    0xe6, 0xb4, 0xaa, 0x7a, 0xef, 0xf0, 0x62, 0x4a, 0xc5, 0x9f, 0x2c, 0xaf,
+    0xb8, 0x2f, 0xd9, 0xd1, 0x01, 0x7a, 0x36, 0x2f, 0x3e, 0x83, 0xa5, 0xeb,
+    0x81, 0x70, 0xa0, 0x57, 0x17, 0x46, 0xea, 0x9e, 0xcb, 0x0e, 0x74, 0xd3,
+    0x44, 0x57, 0x1d, 0x40, 0x06, 0xf8, 0xb7, 0xcb, 0x5f, 0xf4, 0x79, 0xbd,
+    0x11, 0x19, 0xd6, 0xee, 0xf8, 0xb0, 0xaa, 0xdd, 0x00, 0x62, 0xad, 0x3b,
+    0x88, 0x9a, 0x88, 0x5b, 0x1b, 0x07, 0xc9, 0xae, 0x9e, 0xa6, 0x94, 0xe5,
+    0x55, 0xdb, 0x45, 0x23, 0xb9, 0x2c, 0xcd, 0x29, 0xd3, 0x54, 0xc3, 0x88,
+    0x1e, 0x5f, 0x52, 0xf2, 0x09, 0x00, 0x26, 0x26, 0x1a, 0xed, 0xf5, 0xc2,
+    0xa9, 0x7d, 0xf9, 0x21, 0x5a, 0xaf, 0x6d, 0xab, 0x8e, 0x16, 0x84, 0x96,
+    0xb5, 0x4f, 0xcf, 0x1e, 0xa3, 0xaf, 0x08, 0x9f, 0x79, 0x86, 0xc3, 0xbe,
+    0x0c, 0x70, 0xcb, 0x8f, 0xf3, 0xc5, 0xf8, 0xe8, 0x4b, 0x21, 0x7d, 0x18,
+    0xa9, 0xed, 0x8b, 0xfb, 0x6b, 0x5a, 0x6f, 0x26, 0x0b, 0x56, 0x04, 0x7c,
+    0xfe, 0x0e, 0x1e, 0xc1, 0x3f, 0x82, 0xc5, 0x73, 0xbd, 0x53, 0x0c, 0xf0,
+    0xe2, 0xc9, 0xf3, 0x3d, 0x1b, 0x6d, 0xba, 0x70, 0xc1, 0x6d, 0xb6, 0x00,
+    0x28, 0xe1, 0xc4, 0x78, 0x62, 0x04, 0xda, 0x23, 0x86, 0xc3, 0xda, 0x74,
+    0x3d, 0x7c, 0xd6, 0x76, 0x29, 0xb2, 0x27, 0x2e, 0xb2, 0x35, 0x42, 0x60,
+    0x82, 0xcf, 0x30, 0x2c, 0x59, 0xe4, 0xe3, 0xd0, 0x74, 0x1f, 0x58, 0xe8,
+    0xda, 0x47, 0x45, 0x73, 0x1c, 0x05, 0x93, 0xae, 0x75, 0xbe, 0x1f, 0x81,
+    0xd8, 0xb7, 0xb3, 0xff, 0xfc, 0x8b, 0x52, 0x9e, 0xed, 0x8b, 0x37, 0x9f,
+    0xe0, 0xb8, 0xa2, 0x66, 0xe1, 0x6a, 0xc5, 0x1f, 0x1d, 0xf0, 0xde, 0x3f,
+    0x3d, 0xb0, 0x28, 0xf3, 0xaa, 0x4e, 0x4d, 0x31, 0xb0, 0x26, 0x79, 0x2b,
+    0x08, 0x0f, 0xe9, 0x2f, 0x79, 0xb3, 0xc8, 0xdd, 0xa7, 0x89, 0xa8, 0xa8,
+    0x1d, 0x59, 0x0e, 0x4f, 0x1e, 0x93, 0x1f, 0x70, 0x7f, 0x4e, 0x7e, 0xfe,
+    0xb8, 0xca, 0x63, 0xe0, 0xa6, 0x05, 0xcc, 0xd7, 0xde, 0x2a, 0x49, 0x31,
+    0x78, 0x5c, 0x5f, 0x44, 0xb2, 0x9b, 0x91, 0x99, 0x14, 0x29, 0x63, 0x09,
+    0x12, 0xdd, 0x02, 0xd9, 0x7b, 0xe9, 0xf5, 0x12, 0x07, 0xd0, 0xe7, 0xe6,
+    0xe8, 0xdd, 0xda, 0xa4, 0x73, 0xc4, 0x8e, 0xbd, 0x7b, 0xb7, 0xbb, 0xcb,
+    0x83, 0x2f, 0x43, 0xf6, 0x1c, 0x50, 0xae, 0x9b, 0x2e, 0x52, 0x80, 0x18,
+    0x85, 0xa8, 0x23, 0x52, 0x7a, 0x6a, 0xf7, 0x42, 0x36, 0xca, 0x91, 0x5a,
+    0x3d, 0x2a, 0xa0, 0x35, 0x7d, 0x70, 0xfc, 0x4c, 0x18, 0x7c, 0x57, 0x72,
+    0xcf, 0x9b, 0x29, 0xd6, 0xd0, 0xb4, 0xd7, 0xe6, 0x89, 0x70, 0x69, 0x22,
+    0x5e, 0x45, 0x09, 0x4d, 0x49, 0x87, 0x84, 0x5f, 0x8a, 0x5f, 0xe4, 0x15,
+    0xd3, 0xe3, 0x72, 0xaf, 0xb2, 0x30, 0x9c, 0xc1, 0xff, 0x8e, 0x6d, 0x2a,
+    0x76, 0x9e, 0x08, 0x03, 0x7e, 0xe0, 0xc3, 0xc2, 0x97, 0x06, 0x6b, 0x33,
+    0x2b, 0x08, 0xe3, 0xd5, 0x0b, 0xd8, 0x32, 0x67, 0x61, 0x10, 0xed, 0x6b,
+    0xed, 0x50, 0xef, 0xd7, 0x1c, 0x1b, 0xe0, 0x6d, 0xa1, 0x64, 0x19, 0x34,
+    0x2f, 0xe4, 0xe8, 0x54, 0xbf, 0x84, 0x0e, 0xdf, 0x0e, 0x8b, 0xd8, 0xdd,
+    0x77, 0x96, 0xb8, 0x54, 0xab, 0xf2, 0x95, 0x59, 0x0d, 0x0d, 0x0a, 0x15,
+    0x6e, 0x01, 0xf2, 0x24, 0xab, 0xa0, 0xd8, 0xdf, 0x38, 0xea, 0x97, 0x58,
+    0x76, 0x88, 0xbe, 0xaf, 0x45, 0xe3, 0x56, 0x4f, 0x68, 0xe8, 0x4b, 0xe7,
+    0x2b, 0x22, 0x18, 0x96, 0x82, 0x89, 0x25, 0x34, 0xd1, 0xdd, 0x08, 0xea,
+    0x7e, 0x21, 0xef, 0x57, 0x55, 0x43, 0xf7, 0xfa, 0xca, 0x1c, 0xde, 0x99,
+    0x2e, 0x8b, 0xd8, 0xc3, 0xcf, 0x89, 0x4d, 0xfc, 0x3b, 0x7d, 0x4a, 0xc9,
+    0x99, 0xc4, 0x31, 0xb6, 0x7a, 0xae, 0xf8, 0x49, 0xb2, 0x46, 0xc1, 0x60,
+    0x05, 0x75, 0xf3, 0x3d, 0xf2, 0xc9, 0x84, 0xa4, 0xb9, 0x8a, 0x87, 0x2a,
+    0x87, 0x5c, 0x0a, 0xbc, 0x51, 0x7d, 0x9a, 0xf5, 0xc9, 0x24, 0x2d, 0x5e,
+    0xe6, 0xc6, 0xe3, 0xcd, 0x7e, 0xe4, 0xaf, 0x8a, 0x6c, 0x00, 0x04, 0xc8,
+    0xd7, 0xa5, 0xad, 0xfa, 0xb2, 0x08, 0x4a, 0x26, 0x9b, 0x7c, 0xd0, 0xc6,
+    0x13, 0xb1, 0xb9, 0x65, 0x3f, 0x70, 0x30, 0xf9, 0x98, 0x9d, 0x87, 0x99,
+    0x57, 0x71, 0x3e, 0xb1, 0xc3, 0x24, 0xf0, 0xa6, 0xa2, 0x60, 0x9d, 0x66,
+    0xd2, 0x5f, 0xae, 0xe3, 0x94, 0x87, 0xea, 0xd1, 0xea, 0x0d, 0x2a, 0x77,
+    0xef, 0x31, 0xcc, 0xeb, 0xf9, 0x0c, 0xdc, 0x9c, 0x12, 0x80, 0xbb, 0xb0,
+    0x8e, 0xab, 0x9a, 0x04, 0xcd, 0x4b, 0x95, 0x4f, 0x7a, 0x0b, 0x53, 0x7c,
+    0x16, 0xcc, 0x0e, 0xb1, 0x73, 0x10, 0xdd, 0xaa, 0x76, 0x94, 0x90, 0xd9,
+    0x8b, 0x66, 0x41, 0x31, 0xed, 0x8c, 0x7d, 0x74, 0xc4, 0x33, 0xfa, 0xc3,
+    0x43, 0x8d, 0x10, 0xbc, 0x84, 0x4d, 0x0e, 0x95, 0x32, 0xdf, 0x17, 0x43,
+    0x6d, 0xd2, 0x5e, 0x12, 0xb9, 0xed, 0x33, 0xd9, 0x97, 0x6f, 0x4a, 0xcd,
+    0xc3, 0xcd, 0x81, 0x34, 0xbe, 0x7e, 0xa2, 0xd0, 0xa7, 0x91, 0x5d, 0x90,
+    0xf6, 0x5e, 0x4a, 0x25, 0x0f, 0xcc, 0x24, 0xeb, 0xe1, 0xe4, 0x62, 0x6c,
+    0x8f, 0x45, 0x36, 0x97, 0x5d, 0xda, 0x20, 0x2b, 0x86, 0x00, 0x8c, 0x94,
+    0xa9, 0x6a, 0x69, 0xb2, 0xe9, 0xbb, 0x82, 0x8e, 0x41, 0x95, 0xb4, 0xb7,
+    0xf1, 0x55, 0x52, 0x30, 0x39, 0x48, 0xb3, 0x25, 0x82, 0xa9, 0x10, 0x27,
+    0x89, 0xb5, 0xe5, 0x1f, 0xab, 0x72, 0x3c, 0x70, 0x08, 0xce, 0xe6, 0x61,
+    0xbf, 0x19, 0xc8, 0x90, 0x2b, 0x29, 0x30, 0x3e, 0xb8, 0x4c, 0x33, 0xf0,
+    0xf0, 0x15, 0x2e, 0xb7, 0x25, 0xca, 0x99, 0x4b, 0x6f, 0x4b, 0x41, 0x50,
+    0xee, 0x56, 0x99, 0xcf, 0x2b, 0xa4, 0xc4, 0x7c, 0x5c, 0xa6, 0xd4, 0x67,
+    0x04, 0x5c, 0x5d, 0x5f, 0x26, 0x9e, 0x0f, 0xe2, 0x58, 0x68, 0x4c, 0x30,
+    0xcd, 0xef, 0x46, 0xdb, 0x37, 0x6f, 0xbb, 0xc4, 0x80, 0xca, 0x8a, 0x54,
+    0x5d, 0x71, 0x9d, 0x0c, 0xe8, 0xb8, 0x2c, 0x10, 0x90, 0x44, 0xa4, 0x88,
+    0x3f, 0xbc, 0x15, 0x3c, 0xd2, 0xca, 0x0e, 0xc3, 0xe4, 0x6e, 0xef, 0xb0,
+    0xcb, 0xfd, 0x61, 0x7c, 0x27, 0xf2, 0x25, 0xea, 0x71, 0x6d, 0xf7, 0x49,
+    0x9c, 0x81, 0x27, 0xf0, 0x61, 0x33, 0xcf, 0x55, 0x68, 0xd3, 0x73, 0xa4,
+    0xed, 0x35, 0x65, 0x2a, 0xf2, 0x3e, 0xcf, 0x90, 0x98, 0x54, 0x6d, 0x95,
+    0x6a, 0x0c, 0x9c, 0x24, 0x0e, 0xb4, 0xb7, 0x9b, 0x8d, 0x6e, 0x1c, 0xbc,
+    0xeb, 0x17, 0x10, 0x86, 0xda, 0x91, 0x6d, 0x89, 0x4c, 0xeb, 0xf5, 0x50,
+    0x8f, 0x40, 0xcf, 0x4a,
+};
+
+static_assert(sizeof(kInput) == sizeof(kOutput),
+              "Input and output lengths don't match.");
+
+// TestChaCha20 returns whether |len| bytes of kInput encrypt to kOutput.
+static bool TestChaCha20(size_t len) {
+  static const size_t kOffsets[] = {
+      0,  1,  2,  8,  15, 16,  17,  31,  32,  33,  63,
+      64, 65, 95, 96, 97, 127, 128, 129, 255, 256, 257,
+  };
+  // Disjoint buffers: encrypt the first |len| bytes of kInput.
+  std::unique_ptr<uint8_t[]> out(new uint8_t[len]);
+  CRYPTO_chacha_20(out.get(), kInput, len, kKey, kNonce, kCounter);
+  if (memcmp(out.get(), kOutput, len) != 0) {
+    fprintf(stderr, "Mismatch at length %u.\n", static_cast<unsigned>(len));
+    return false;
+  }
+
+  // Aliased buffers: the input begins |gap| bytes after the output start.
+  for (size_t gap : kOffsets) {
+    out.reset(new uint8_t[len + gap]);
+    memcpy(out.get() + gap, kInput, len);
+    CRYPTO_chacha_20(out.get(), out.get() + gap, len, kKey, kNonce, kCounter);
+    if (memcmp(out.get(), kOutput, len) != 0) {
+      fprintf(stderr, "Mismatch at length %u with in-place offset %u.\n",
+              static_cast<unsigned>(len), static_cast<unsigned>(gap));
+      return false;
+    }
+  }
+  return true;
+}
+
+int main(int argc, char **argv) {
+  CRYPTO_library_init();
+
+  // Exercise every prefix of the test vector, from empty to full length.
+  const size_t kMaxLen = sizeof(kInput);
+  for (size_t n = 0; n <= kMaxLen; ++n) {
+    if (!TestChaCha20(n)) {
+      return 1;
+    }
+  }
+  printf("PASS\n");
+  return 0;
+}
@@ -1,328 +0,0 @@
-/* Copyright (c) 2014, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-/* ====================================================================
- *
- * When updating this file, also update chacha_vec_arm.S
- *
- * ==================================================================== */
-
-
-/* This implementation is by Ted Krovetz and was submitted to SUPERCOP and
- * marked as public domain. It was been altered to allow for non-aligned inputs
- * and to allow the block counter to be passed in specifically. */
-
-#include <openssl/chacha.h>
-
-#include "../internal.h"
-
-
-#if defined(ASM_GEN) ||          \
-    !defined(OPENSSL_WINDOWS) && \
-        (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) && defined(__SSE2__)
-
-#define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */
-
-/* Architecture-neutral way to specify 16-byte vector of ints              */
-typedef unsigned vec __attribute__((vector_size(16)));
-
-/* This implementation is designed for Neon, SSE and AltiVec machines. The
- * following specify how to do certain vector operations efficiently on
- * each architecture, using intrinsics.
- * This implementation supports parallel processing of multiple blocks,
- * including potentially using general-purpose registers. */
-#if __ARM_NEON__
-#include <string.h>
-#include <arm_neon.h>
-#define GPR_TOO 1
-#define VBPI 2
-#define ONE (vec) vsetq_lane_u32(1, vdupq_n_u32(0), 0)
-#define LOAD_ALIGNED(m) (vec)(*((vec *)(m)))
-#define LOAD(m) ({ \
-    memcpy(alignment_buffer, m, 16); \
-    LOAD_ALIGNED(alignment_buffer); \
-  })
-#define STORE(m, r) ({ \
-    (*((vec *)(alignment_buffer))) = (r); \
-    memcpy(m, alignment_buffer, 16); \
-  })
-#define ROTV1(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 1)
-#define ROTV2(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 2)
-#define ROTV3(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 3)
-#define ROTW16(x) (vec) vrev32q_u16((uint16x8_t)x)
-#if __clang__
-#define ROTW7(x) (x << ((vec) {7, 7, 7, 7})) ^ (x >> ((vec) {25, 25, 25, 25}))
-#define ROTW8(x) (x << ((vec) {8, 8, 8, 8})) ^ (x >> ((vec) {24, 24, 24, 24}))
-#define ROTW12(x) \
-  (x << ((vec) {12, 12, 12, 12})) ^ (x >> ((vec) {20, 20, 20, 20}))
-#else
-#define ROTW7(x) \
-  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 7), (uint32x4_t)x, 25)
-#define ROTW8(x) \
-  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 8), (uint32x4_t)x, 24)
-#define ROTW12(x) \
-  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 12), (uint32x4_t)x, 20)
-#endif
-#elif __SSE2__
-#include <emmintrin.h>
-#define GPR_TOO 0
-#if __clang__
-#define VBPI 4
-#else
-#define VBPI 3
-#endif
-#define ONE (vec) _mm_set_epi32(0, 0, 0, 1)
-#define LOAD(m) (vec) _mm_loadu_si128((const __m128i *)(m))
-#define LOAD_ALIGNED(m) (vec) _mm_load_si128((const __m128i *)(m))
-#define STORE(m, r) _mm_storeu_si128((__m128i *)(m), (__m128i)(r))
-#define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 3, 2, 1))
-#define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(1, 0, 3, 2))
-#define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(2, 1, 0, 3))
-#define ROTW7(x) \
-  (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x, 25))
-#define ROTW12(x) \
-  (vec)(_mm_slli_epi32((__m128i)x, 12) ^ _mm_srli_epi32((__m128i)x, 20))
-#if __SSSE3__
-#include <tmmintrin.h>
-#define ROTW8(x)                                                            \
-  (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, \
-                                                  11, 6, 5, 4, 7, 2, 1, 0, 3))
-#define ROTW16(x)                                                           \
-  (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, \
-                                                  10, 5, 4, 7, 6, 1, 0, 3, 2))
-#else
-#define ROTW8(x) \
-  (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x, 24))
-#define ROTW16(x) \
-  (vec)(_mm_slli_epi32((__m128i)x, 16) ^ _mm_srli_epi32((__m128i)x, 16))
-#endif
-#else
-#error-- Implementation supports only machines with neon or SSE2
-#endif
-
-#ifndef REVV_BE
-#define REVV_BE(x)  (x)
-#endif
-
-#ifndef REVW_BE
-#define REVW_BE(x)  (x)
-#endif
-
-#define BPI      (VBPI + GPR_TOO)  /* Blocks computed per loop iteration   */
-
-#define DQROUND_VECTORS(a,b,c,d)                \
-    a += b; d ^= a; d = ROTW16(d);              \
-    c += d; b ^= c; b = ROTW12(b);              \
-    a += b; d ^= a; d = ROTW8(d);               \
-    c += d; b ^= c; b = ROTW7(b);               \
-    b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);  \
-    a += b; d ^= a; d = ROTW16(d);              \
-    c += d; b ^= c; b = ROTW12(b);              \
-    a += b; d ^= a; d = ROTW8(d);               \
-    c += d; b ^= c; b = ROTW7(b);               \
-    b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
-
-#define QROUND_WORDS(a,b,c,d) \
-  a = a+b; d ^= a; d = d<<16 | d>>16; \
-  c = c+d; b ^= c; b = b<<12 | b>>20; \
-  a = a+b; d ^= a; d = d<< 8 | d>>24; \
-  c = c+d; b ^= c; b = b<< 7 | b>>25;
-
-#define WRITE_XOR(in, op, d, v0, v1, v2, v3)                   \
-	STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0));      \
-	STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1));      \
-	STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2));      \
-	STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));
-
-#if __ARM_NEON__
-/* For ARM, we can't depend on NEON support, so this function is compiled with
- * a different name, along with the generic code, and can be enabled at
- * run-time. */
-void CRYPTO_chacha_20_neon(
-#else
-void CRYPTO_chacha_20(
-#endif
-	uint8_t *out,
-	const uint8_t *in,
-	size_t inlen,
-	const uint8_t key[32],
-	const uint8_t nonce[12],
-	uint32_t counter)
-	{
-	unsigned iters, i;
-	unsigned *op = (unsigned *)out;
-	const unsigned *ip = (const unsigned *)in;
-	const unsigned *kp = (const unsigned *)key;
-#if defined(__ARM_NEON__)
-	uint32_t np[3];
-	alignas(16) uint8_t alignment_buffer[16];
-#endif
-	vec s0, s1, s2, s3;
-	alignas(16) unsigned chacha_const[] =
-		{0x61707865,0x3320646E,0x79622D32,0x6B206574};
-#if defined(__ARM_NEON__)
-	memcpy(np, nonce, 12);
-#endif
-	s0 = LOAD_ALIGNED(chacha_const);
-	s1 = LOAD(&((const vec*)kp)[0]);
-	s2 = LOAD(&((const vec*)kp)[1]);
-	s3 = (vec){
-		counter,
-		((const uint32_t*)nonce)[0],
-		((const uint32_t*)nonce)[1],
-		((const uint32_t*)nonce)[2]
-	};
-
-	for (iters = 0; iters < inlen/(BPI*64); iters++)
-		{
-#if GPR_TOO
-		register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8,
-				  x9, x10, x11, x12, x13, x14, x15;
-#endif
-#if VBPI > 2
-		vec v8,v9,v10,v11;
-#endif
-#if VBPI > 3
-		vec v12,v13,v14,v15;
-#endif
-
-		vec v0,v1,v2,v3,v4,v5,v6,v7;
-		v4 = v0 = s0; v5 = v1 = s1; v6 = v2 = s2; v3 = s3;
-		v7 = v3 + ONE;
-#if VBPI > 2
-		v8 = v4; v9 = v5; v10 = v6;
-		v11 =  v7 + ONE;
-#endif
-#if VBPI > 3
-		v12 = v8; v13 = v9; v14 = v10;
-		v15 = v11 + ONE;
-#endif
-#if GPR_TOO
-		x0 = chacha_const[0]; x1 = chacha_const[1];
-		x2 = chacha_const[2]; x3 = chacha_const[3];
-		x4 = kp[0]; x5 = kp[1]; x6  = kp[2]; x7  = kp[3];
-		x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7];
-		x12 = counter+BPI*iters+(BPI-1); x13 = np[0];
-		x14 = np[1]; x15 = np[2];
-#endif
-		for (i = CHACHA_RNDS/2; i; i--)
-			{
-			DQROUND_VECTORS(v0,v1,v2,v3)
-			DQROUND_VECTORS(v4,v5,v6,v7)
-#if VBPI > 2
-			DQROUND_VECTORS(v8,v9,v10,v11)
-#endif
-#if VBPI > 3
-			DQROUND_VECTORS(v12,v13,v14,v15)
-#endif
-#if GPR_TOO
-			QROUND_WORDS( x0, x4, x8,x12)
-			QROUND_WORDS( x1, x5, x9,x13)
-			QROUND_WORDS( x2, x6,x10,x14)
-			QROUND_WORDS( x3, x7,x11,x15)
-			QROUND_WORDS( x0, x5,x10,x15)
-			QROUND_WORDS( x1, x6,x11,x12)
-			QROUND_WORDS( x2, x7, x8,x13)
-			QROUND_WORDS( x3, x4, x9,x14)
-#endif
-			}
-
-		WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)
-		s3 += ONE;
-		WRITE_XOR(ip, op, 16, v4+s0, v5+s1, v6+s2, v7+s3)
-		s3 += ONE;
-#if VBPI > 2
-		WRITE_XOR(ip, op, 32, v8+s0, v9+s1, v10+s2, v11+s3)
-		s3 += ONE;
-#endif
-#if VBPI > 3
-		WRITE_XOR(ip, op, 48, v12+s0, v13+s1, v14+s2, v15+s3)
-		s3 += ONE;
-#endif
-		ip += VBPI*16;
-		op += VBPI*16;
-#if GPR_TOO
-		op[0]  = REVW_BE(REVW_BE(ip[0])  ^ (x0  + chacha_const[0]));
-		op[1]  = REVW_BE(REVW_BE(ip[1])  ^ (x1  + chacha_const[1]));
-		op[2]  = REVW_BE(REVW_BE(ip[2])  ^ (x2  + chacha_const[2]));
-		op[3]  = REVW_BE(REVW_BE(ip[3])  ^ (x3  + chacha_const[3]));
-		op[4]  = REVW_BE(REVW_BE(ip[4])  ^ (x4  + kp[0]));
-		op[5]  = REVW_BE(REVW_BE(ip[5])  ^ (x5  + kp[1]));
-		op[6]  = REVW_BE(REVW_BE(ip[6])  ^ (x6  + kp[2]));
-		op[7]  = REVW_BE(REVW_BE(ip[7])  ^ (x7  + kp[3]));
-		op[8]  = REVW_BE(REVW_BE(ip[8])  ^ (x8  + kp[4]));
-		op[9]  = REVW_BE(REVW_BE(ip[9])  ^ (x9  + kp[5]));
-		op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6]));
-		op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7]));
-		op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter+BPI*iters+(BPI-1)));
-		op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13 + np[0]));
-		op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[1]));
-		op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[2]));
-		s3 += ONE;
-		ip += 16;
-		op += 16;
-#endif
-		}
-
-	for (iters = inlen%(BPI*64)/64; iters != 0; iters--)
-		{
-		vec v0 = s0, v1 = s1, v2 = s2, v3 = s3;
-		for (i = CHACHA_RNDS/2; i; i--)
-			{
-			DQROUND_VECTORS(v0,v1,v2,v3);
-			}
-		WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)
-		s3 += ONE;
-		ip += 16;
-		op += 16;
-		}
-
-	inlen = inlen % 64;
-	if (inlen)
-		{
-		alignas(16) vec buf[4];
-		vec v0,v1,v2,v3;
-		v0 = s0; v1 = s1; v2 = s2; v3 = s3;
-		for (i = CHACHA_RNDS/2; i; i--)
-			{
-			DQROUND_VECTORS(v0,v1,v2,v3);
-			}
-
-		if (inlen >= 16)
-			{
-			STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0));
-			if (inlen >= 32)
-				{
-				STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1));
-				if (inlen >= 48)
-					{
-					STORE(op + 8, LOAD(ip +  8) ^
-						      REVV_BE(v2 + s2));
-					buf[3] = REVV_BE(v3 + s3);
-					}
-				else
-					buf[2] = REVV_BE(v2 + s2);
-				}
-			else
-				buf[1] = REVV_BE(v1 + s1);
-			}
-		else
-			buf[0] = REVV_BE(v0 + s0);
-
-		for (i=inlen & ~15; i<inlen; i++)
-			((char *)op)[i] = ((const char *)ip)[i] ^ ((const char *)buf)[i];
-		}
-	}
-
-#endif /* ASM_GEN || !OPENSSL_WINDOWS && (OPENSSL_X86_64 || OPENSSL_X86) && SSE2 */
@@ -1,153 +0,0 @@
-// Copyright (c) 2014, Google Inc.
-//
-// Permission to use, copy, modify, and/or distribute this software for any
-// purpose with or without fee is hereby granted, provided that the above
-// copyright notice and this permission notice appear in all copies.
-//
-// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-// This package generates chacha_vec_arm.S from chacha_vec.c. Install the
-// arm-linux-gnueabihf-gcc compiler as described in BUILDING.md. Then:
-// `(cd crypto/chacha && go run chacha_vec_arm_generate.go)`.
-
-package main
-
-import (
-	"bufio"
-	"bytes"
-	"os"
-	"os/exec"
-	"strings"
-)
-
-const defaultCompiler = "/opt/gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc"
-
-func main() {
-	compiler := defaultCompiler
-	if len(os.Args) > 1 {
-		compiler = os.Args[1]
-	}
-
-	args := []string{
-		"-O3",
-		"-mcpu=cortex-a8",
-		"-mfpu=neon",
-		"-fpic",
-		"-DASM_GEN",
-		"-I", "../../include",
-		"-S", "chacha_vec.c",
-		"-o", "-",
-	}
-
-	output, err := os.OpenFile("chacha_vec_arm.S", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
-	if err != nil {
-		panic(err)
-	}
-	defer output.Close()
-
-	output.WriteString(preamble)
-	output.WriteString(compiler)
-	output.WriteString(" ")
-	output.WriteString(strings.Join(args, " "))
-	output.WriteString("\n\n#if !defined(OPENSSL_NO_ASM)\n")
-	output.WriteString("#if defined(__arm__)\n\n")
-
-	cmd := exec.Command(compiler, args...)
-	cmd.Stderr = os.Stderr
-	asm, err := cmd.StdoutPipe()
-	if err != nil {
-		panic(err)
-	}
-	if err := cmd.Start(); err != nil {
-		panic(err)
-	}
-
-	attr28 := []byte(".eabi_attribute 28,")
-	globalDirective := []byte(".global\t")
-	newLine := []byte("\n")
-	attr28Handled := false
-
-	scanner := bufio.NewScanner(asm)
-	for scanner.Scan() {
-		line := scanner.Bytes()
-
-		if bytes.Contains(line, attr28) {
-			output.WriteString(attr28Block)
-			attr28Handled = true
-			continue
-		}
-
-		output.Write(line)
-		output.Write(newLine)
-
-		if i := bytes.Index(line, globalDirective); i >= 0 {
-			output.Write(line[:i])
-			output.WriteString(".hidden\t")
-			output.Write(line[i+len(globalDirective):])
-			output.Write(newLine)
-		}
-	}
-
-	if err := scanner.Err(); err != nil {
-		panic(err)
-	}
-
-	if !attr28Handled {
-		panic("EABI attribute 28 not seen in processing")
-	}
-
-	if err := cmd.Wait(); err != nil {
-		panic(err)
-	}
-
-	output.WriteString(trailer)
-}
-
-const preamble = `# Copyright (c) 2014, Google Inc.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-# This file contains a pre-compiled version of chacha_vec.c for ARM. This is
-# needed to support switching on NEON code at runtime. If the whole of OpenSSL
-# were to be compiled with the needed flags to build chacha_vec.c, then it
-# wouldn't be possible to run on non-NEON systems.
-#
-# This file was generated by chacha_vec_arm_generate.go using the following
-# compiler command:
-#
-#     `
-
-const attr28Block = `
-# EABI attribute 28 sets whether VFP register arguments were used to build this
-# file. If object files are inconsistent on this point, the linker will refuse
-# to link them. Thus we report whatever the compiler expects since we don't use
-# VFP arguments.
-
-#if defined(__ARM_PCS_VFP)
-	.eabi_attribute 28, 1
-#else
-	.eabi_attribute 28, 0
-#endif
-
-`
-
-const trailer = `
-#endif  /* __arm__ */
-#endif  /* !OPENSSL_NO_ASM */
-`
@@ -192,37 +192,158 @@ static int TestCleanupAfterInitFailure(const EVP_AEAD *aead) {
  return 1;
 }

-struct AEADName {
+static bool TestWithAliasedBuffers(const EVP_AEAD *aead) {
+  const size_t key_len = EVP_AEAD_key_length(aead);
+  const size_t nonce_len = EVP_AEAD_nonce_length(aead);
+  const size_t max_overhead = EVP_AEAD_max_overhead(aead);
+
+  std::vector<uint8_t> key(key_len, 'a');
+  ScopedEVP_AEAD_CTX ctx;
+  if (!EVP_AEAD_CTX_init(ctx.get(), aead, key.data(), key_len,
+                         EVP_AEAD_DEFAULT_TAG_LENGTH, nullptr)) {
+    return false;
+  }
+
+  static const uint8_t kPlaintext[260] =
+      "testing123456testing123456testing123456testing123456testing123456testing"
+      "123456testing123456testing123456testing123456testing123456testing123456t"
+      "esting123456testing123456testing123456testing123456testing123456testing1"
+      "23456testing123456testing123456testing12345";
+  const std::vector<size_t> offsets = {
+      0,  1,  2,  8,  15, 16,  17,  31,  32,  33,  63,
+      64, 65, 95, 96, 97, 127, 128, 129, 255, 256, 257,
+  };
+
+  std::vector<uint8_t> nonce(nonce_len, 'b');
+  std::vector<uint8_t> valid_encryption(sizeof(kPlaintext) + max_overhead);
+  size_t valid_encryption_len;
+  if (!EVP_AEAD_CTX_seal(
+          ctx.get(), valid_encryption.data(), &valid_encryption_len,
+          sizeof(kPlaintext) + max_overhead, nonce.data(), nonce_len,
+          kPlaintext, sizeof(kPlaintext), nullptr, 0)) {
+    fprintf(stderr, "EVP_AEAD_CTX_seal failed with disjoint buffers.\n");
+    return false;
+  }
+
+  // First test with out > in, which we expect to fail.
+  for (auto offset : offsets) {
+    if (offset == 0) {
+      // Will be tested in the next loop.
+      continue;
+    }
+
+    std::vector<uint8_t> buffer(offset + valid_encryption_len);
+    memcpy(buffer.data(), kPlaintext, sizeof(kPlaintext));
+    uint8_t *out = buffer.data() + offset;
+
+    size_t out_len;
+    if (!EVP_AEAD_CTX_seal(ctx.get(), out, &out_len,
+                           sizeof(kPlaintext) + max_overhead, nonce.data(),
+                           nonce_len, buffer.data(), sizeof(kPlaintext),
+                           nullptr, 0)) {
+      // We expect offsets where the output is greater than the input to fail.
+      ERR_clear_error();
+    } else {
+      fprintf(stderr,
+              "EVP_AEAD_CTX_seal unexpectedly succeeded for offset %u.\n",
+              static_cast<unsigned>(offset));
+      return false;
+    }
+
+    memcpy(buffer.data(), valid_encryption.data(), valid_encryption_len);
+    if (!EVP_AEAD_CTX_open(ctx.get(), out, &out_len, valid_encryption_len,
+                           nonce.data(), nonce_len, buffer.data(),
+                           valid_encryption_len, nullptr, 0)) {
+      // We expect offsets where the output is greater than the input to fail.
+      ERR_clear_error();
+    } else {
+      fprintf(stderr,
+              "EVP_AEAD_CTX_open unexpectedly succeeded for offset %u.\n",
+              static_cast<unsigned>(offset));
+      ERR_print_errors_fp(stderr);
+      return false;
+    }
+  }
+
+  // Test with out <= in, which we expect to work.
+  for (auto offset : offsets) {
+    std::vector<uint8_t> buffer(offset + valid_encryption_len);
+    uint8_t *const out = buffer.data();
+    uint8_t *const in = buffer.data() + offset;
+    memcpy(in, kPlaintext, sizeof(kPlaintext));
+
+    size_t out_len;
+    if (!EVP_AEAD_CTX_seal(ctx.get(), out, &out_len,
+                           sizeof(kPlaintext) + max_overhead, nonce.data(),
+                           nonce_len, in, sizeof(kPlaintext), nullptr, 0)) {
+      fprintf(stderr, "EVP_AEAD_CTX_seal failed for offset -%u.\n",
+              static_cast<unsigned>(offset));
+      return false;
+    }
+
+    if (out_len != valid_encryption_len ||
+        memcmp(out, valid_encryption.data(), out_len) != 0) {
+      fprintf(stderr, "EVP_AEAD_CTX_seal produced bad output for offset -%u.\n",
+              static_cast<unsigned>(offset));
+      return false;
+    }
+
+    memcpy(in, valid_encryption.data(), valid_encryption_len);
+    if (!EVP_AEAD_CTX_open(ctx.get(), out, &out_len,
+                           offset + valid_encryption_len, nonce.data(),
+                           nonce_len, in, valid_encryption_len, nullptr, 0)) {
+      fprintf(stderr, "EVP_AEAD_CTX_open failed for offset -%u.\n",
+              static_cast<unsigned>(offset));
+      return false;
+    }
+
+    if (out_len != sizeof(kPlaintext) ||
+        memcmp(out, kPlaintext, out_len) != 0) {
+      fprintf(stderr, "EVP_AEAD_CTX_open produced bad output for offset -%u.\n",
+              static_cast<unsigned>(offset));
+      return false;
+    }
+  }
+
+  return true;
+}
+
+struct KnownAEAD {
  const char name[40];
  const EVP_AEAD *(*func)(void);
+  // limited_implementation indicates that tests that assume a generic AEAD
+  // interface should not be performed. For example, the key-wrap AEADs only
+  // handle inputs that are a multiple of eight bytes in length and the
+  // SSLv3/TLS AEADs have the concept of “direction”.
+  bool limited_implementation;
 };

-static const struct AEADName kAEADs[] = {
-  { "aes-128-gcm", EVP_aead_aes_128_gcm },
-  { "aes-256-gcm", EVP_aead_aes_256_gcm },
-  { "chacha20-poly1305", EVP_aead_chacha20_poly1305 },
-  { "chacha20-poly1305-old", EVP_aead_chacha20_poly1305_old },
-  { "rc4-md5-tls", EVP_aead_rc4_md5_tls },
-  { "rc4-sha1-tls", EVP_aead_rc4_sha1_tls },
-  { "aes-128-cbc-sha1-tls", EVP_aead_aes_128_cbc_sha1_tls },
-  { "aes-128-cbc-sha1-tls-implicit-iv", EVP_aead_aes_128_cbc_sha1_tls_implicit_iv },
-  { "aes-128-cbc-sha256-tls", EVP_aead_aes_128_cbc_sha256_tls },
-  { "aes-256-cbc-sha1-tls", EVP_aead_aes_256_cbc_sha1_tls },
-  { "aes-256-cbc-sha1-tls-implicit-iv", EVP_aead_aes_256_cbc_sha1_tls_implicit_iv },
-  { "aes-256-cbc-sha256-tls", EVP_aead_aes_256_cbc_sha256_tls },
-  { "aes-256-cbc-sha384-tls", EVP_aead_aes_256_cbc_sha384_tls },
-  { "des-ede3-cbc-sha1-tls", EVP_aead_des_ede3_cbc_sha1_tls },
-  { "des-ede3-cbc-sha1-tls-implicit-iv", EVP_aead_des_ede3_cbc_sha1_tls_implicit_iv },
-  { "rc4-md5-ssl3", EVP_aead_rc4_md5_ssl3 },
-  { "rc4-sha1-ssl3", EVP_aead_rc4_sha1_ssl3 },
-  { "aes-128-cbc-sha1-ssl3", EVP_aead_aes_128_cbc_sha1_ssl3 },
-  { "aes-256-cbc-sha1-ssl3", EVP_aead_aes_256_cbc_sha1_ssl3 },
-  { "des-ede3-cbc-sha1-ssl3", EVP_aead_des_ede3_cbc_sha1_ssl3 },
-  { "aes-128-key-wrap", EVP_aead_aes_128_key_wrap },
-  { "aes-256-key-wrap", EVP_aead_aes_256_key_wrap },
-  { "aes-128-ctr-hmac-sha256", EVP_aead_aes_128_ctr_hmac_sha256 },
-  { "aes-256-ctr-hmac-sha256", EVP_aead_aes_256_ctr_hmac_sha256 },
-  { "", NULL },
+static const struct KnownAEAD kAEADs[] = {
+  { "aes-128-gcm", EVP_aead_aes_128_gcm, false },
+  { "aes-256-gcm", EVP_aead_aes_256_gcm, false },
+  { "chacha20-poly1305", EVP_aead_chacha20_poly1305, false },
+  { "chacha20-poly1305-old", EVP_aead_chacha20_poly1305_old, false },
+  { "rc4-md5-tls", EVP_aead_rc4_md5_tls, true },
+  { "rc4-sha1-tls", EVP_aead_rc4_sha1_tls, true },
+  { "aes-128-cbc-sha1-tls", EVP_aead_aes_128_cbc_sha1_tls, true },
+  { "aes-128-cbc-sha1-tls-implicit-iv", EVP_aead_aes_128_cbc_sha1_tls_implicit_iv, true },
+  { "aes-128-cbc-sha256-tls", EVP_aead_aes_128_cbc_sha256_tls, true },
+  { "aes-256-cbc-sha1-tls", EVP_aead_aes_256_cbc_sha1_tls, true },
+  { "aes-256-cbc-sha1-tls-implicit-iv", EVP_aead_aes_256_cbc_sha1_tls_implicit_iv, true },
+  { "aes-256-cbc-sha256-tls", EVP_aead_aes_256_cbc_sha256_tls, true },
+  { "aes-256-cbc-sha384-tls", EVP_aead_aes_256_cbc_sha384_tls, true },
+  { "des-ede3-cbc-sha1-tls", EVP_aead_des_ede3_cbc_sha1_tls, true },
+  { "des-ede3-cbc-sha1-tls-implicit-iv", EVP_aead_des_ede3_cbc_sha1_tls_implicit_iv, true },
+  { "rc4-md5-ssl3", EVP_aead_rc4_md5_ssl3, true },
+  { "rc4-sha1-ssl3", EVP_aead_rc4_sha1_ssl3, true },
+  { "aes-128-cbc-sha1-ssl3", EVP_aead_aes_128_cbc_sha1_ssl3, true },
+  { "aes-256-cbc-sha1-ssl3", EVP_aead_aes_256_cbc_sha1_ssl3, true },
+  { "des-ede3-cbc-sha1-ssl3", EVP_aead_des_ede3_cbc_sha1_ssl3, true },
+  { "aes-128-key-wrap", EVP_aead_aes_128_key_wrap, true },
+  { "aes-256-key-wrap", EVP_aead_aes_256_key_wrap, true },
+  { "aes-128-ctr-hmac-sha256", EVP_aead_aes_128_ctr_hmac_sha256, false },
+  { "aes-256-ctr-hmac-sha256", EVP_aead_aes_256_ctr_hmac_sha256, false },
+  { "", NULL, false },
 };

 int main(int argc, char **argv) {
@@ -233,22 +354,28 @@ int main(int argc, char **argv) {
    return 1;
  }

-  const EVP_AEAD *aead;
+  const struct KnownAEAD *known_aead;
  for (unsigned i = 0;; i++) {
-    const struct AEADName &aead_name = kAEADs[i];
-    if (aead_name.func == NULL) {
+    known_aead = &kAEADs[i];
+    if (known_aead->func == NULL) {
      fprintf(stderr, "Unknown AEAD: %s\n", argv[1]);
      return 2;
    }
-    if (strcmp(aead_name.name, argv[1]) == 0) {
-      aead = aead_name.func();
+    if (strcmp(known_aead->name, argv[1]) == 0) {
      break;
    }
  }

+  const EVP_AEAD *const aead = known_aead->func();
+
  if (!TestCleanupAfterInitFailure(aead)) {
    return 1;
  }

+  if (!known_aead->limited_implementation && !TestWithAliasedBuffers(aead)) {
+    fprintf(stderr, "Aliased buffers test failed for %s.\n", known_aead->name);
+    return 1;
+  }
+
  return FileTestMain(TestAEAD, const_cast<EVP_AEAD*>(aead), argv[2]);
 }
@@ -61,7 +61,7 @@

 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>

 #include "internal.h"

@@ -109,7 +109,7 @@ static const EVP_CIPHER *GetCipher(const std::string &name) {
 static bool TestOperation(FileTest *t,
                          const EVP_CIPHER *cipher,
                          bool encrypt,
-                          bool streaming,
+                          size_t chunk_size,
                          const std::vector<uint8_t> &key,
                          const std::vector<uint8_t> &iv,
                          const std::vector<uint8_t> &plaintext,
@@ -170,16 +170,21 @@ static bool TestOperation(FileTest *t,
    t->PrintLine("Operation failed.");
    return false;
  }
-  if (streaming) {
-    for (size_t i = 0; i < in->size(); i++) {
-      uint8_t c = (*in)[i];
+  if (chunk_size != 0) {
+    for (size_t i = 0; i < in->size();) {
+      size_t todo = chunk_size;
+      if (i + todo > in->size()) {
+        todo = in->size() - i;
+      }
+
      int len;
-      if (!EVP_CipherUpdate(ctx.get(), result.data() + result_len1, &len, &c,
-                            1)) {
+      if (!EVP_CipherUpdate(ctx.get(), result.data() + result_len1, &len,
+                            in->data() + i, todo)) {
        t->PrintLine("Operation failed.");
        return false;
      }
      result_len1 += len;
+      i += todo;
    }
  } else if (!in->empty() &&
             !EVP_CipherUpdate(ctx.get(), result.data(), &result_len1,
@@ -258,20 +263,20 @@ static bool TestCipher(FileTest *t, void *arg) {
    }
  }

-  // By default, both directions are run, unless overridden by the operation.
-  if (operation != kDecrypt) {
-    if (!TestOperation(t, cipher, true /* encrypt */, false /* single-shot */,
-                       key, iv, plaintext, ciphertext, aad, tag) ||
-        !TestOperation(t, cipher, true /* encrypt */, true /* streaming */, key,
-                       iv, plaintext, ciphertext, aad, tag)) {
+  const std::vector<size_t> chunk_sizes = {0,  1,  2,  5,  7,  8,  9,  15, 16,
+                                           17, 31, 32, 33, 63, 64, 65, 512};
+
+  for (size_t chunk_size : chunk_sizes) {
+    // By default, both directions are run, unless overridden by the operation.
+    if (operation != kDecrypt &&
+        !TestOperation(t, cipher, true /* encrypt */, chunk_size, key, iv,
+                       plaintext, ciphertext, aad, tag)) {
      return false;
    }
-  }
-  if (operation != kEncrypt) {
-    if (!TestOperation(t, cipher, false /* decrypt */, false /* single-shot */,
-                       key, iv, plaintext, ciphertext, aad, tag) ||
-        !TestOperation(t, cipher, false /* decrypt */, true /* streaming */,
-                       key, iv, plaintext, ciphertext, aad, tag)) {
+
+    if (operation != kEncrypt &&
+        !TestOperation(t, cipher, false /* decrypt */, chunk_size, key, iv,
+                       plaintext, ciphertext, aad, tag)) {
      return false;
    }
  }
@@ -54,7 +54,7 @@
 #include <openssl/cpu.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>
 #include <openssl/rand.h>
 #include <openssl/sha.h>

@@ -67,6 +67,10 @@
 #endif


+#if defined(_MSC_VER)
+#pragma warning(disable: 4702) /* Unreachable code. */
+#endif
+
 typedef struct {
  union {
    double align;
@@ -252,22 +256,6 @@ void aesni_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length,
 void aesni_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                       const AES_KEY *key, uint8_t *ivec, int enc);

-void aesni_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks,
-                                const void *key, const uint8_t *ivec);
-
-#if defined(OPENSSL_X86_64)
-size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
-                         const void *key, uint8_t ivec[16], uint64_t *Xi);
-#define AES_gcm_encrypt aesni_gcm_encrypt
-size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
-                         const void *key, uint8_t ivec[16], uint64_t *Xi);
-#define AES_gcm_decrypt aesni_gcm_decrypt
-void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
-                   size_t len);
-#define AES_GCM_ASM(gctx) \
-  (gctx->ctr == aesni_ctr32_encrypt_blocks && gctx->gcm.ghash == gcm_ghash_avx)
-#endif  /* OPENSSL_X86_64 */
-
 #else

 /* On other platforms, aesni_capable() will always return false and so the
@@ -288,8 +276,7 @@ static void aesni_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
 #endif

 static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
-                        const uint8_t *iv, int enc)
-                        OPENSSL_SUPPRESS_UNREACHABLE_CODE_WARNINGS {
+                        const uint8_t *iv, int enc) {
  int ret, mode;
  EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;

@@ -410,8 +397,7 @@ static char aesni_capable(void);

 static ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_CONTEXT *gcm_ctx,
                                block128_f *out_block, const uint8_t *key,
-                                size_t key_len)
-                                OPENSSL_SUPPRESS_UNREACHABLE_CODE_WARNINGS {
+                                size_t key_len) {
  if (aesni_capable()) {
    aesni_set_encrypt_key(key, key_len * 8, aes_key);
    if (gcm_ctx != NULL) {
@@ -651,57 +637,23 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
      }
    } else if (ctx->encrypt) {
      if (gctx->ctr) {
-        size_t bulk = 0;
-#if defined(AES_GCM_ASM)
-        if (len >= 32 && AES_GCM_ASM(gctx)) {
-          size_t res = (16 - gctx->gcm.mres) % 16;
-
-          if (!CRYPTO_gcm128_encrypt(&gctx->gcm, &gctx->ks.ks, in, out, res)) {
-            return -1;
-          }
-
-          bulk = AES_gcm_encrypt(in + res, out + res, len - res, &gctx->ks.ks,
-                                 gctx->gcm.Yi.c, gctx->gcm.Xi.u);
-          gctx->gcm.len.u[1] += bulk;
-          bulk += res;
-        }
-#endif
-        if (!CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in + bulk,
-                                         out + bulk, len - bulk, gctx->ctr)) {
+        if (!CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in, out, len,
+                                         gctx->ctr)) {
          return -1;
        }
      } else {
-        size_t bulk = 0;
-        if (!CRYPTO_gcm128_encrypt(&gctx->gcm, &gctx->ks.ks, in + bulk,
-                                   out + bulk, len - bulk)) {
+        if (!CRYPTO_gcm128_encrypt(&gctx->gcm, &gctx->ks.ks, in, out, len)) {
          return -1;
        }
      }
    } else {
      if (gctx->ctr) {
-        size_t bulk = 0;
-#if defined(AES_GCM_ASM)
-        if (len >= 16 && AES_GCM_ASM(gctx)) {
-          size_t res = (16 - gctx->gcm.mres) % 16;
-
-          if (!CRYPTO_gcm128_decrypt(&gctx->gcm, &gctx->ks.ks, in, out, res)) {
-            return -1;
-          }
-
-          bulk = AES_gcm_decrypt(in + res, out + res, len - res, &gctx->ks.ks,
-                                 gctx->gcm.Yi.c, gctx->gcm.Xi.u);
-          gctx->gcm.len.u[1] += bulk;
-          bulk += res;
-        }
-#endif
-        if (!CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in + bulk,
-                                         out + bulk, len - bulk, gctx->ctr)) {
+        if (!CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in, out, len,
+                                         gctx->ctr)) {
          return -1;
        }
      } else {
-        size_t bulk = 0;
-        if (!CRYPTO_gcm128_decrypt(&gctx->gcm, &gctx->ks.ks, in + bulk,
-                                   out + bulk, len - bulk)) {
+        if (!CRYPTO_gcm128_decrypt(&gctx->gcm, &gctx->ks.ks, in, out, len)) {
          return -1;
        }
      }
@@ -117,7 +117,7 @@ static int seal_impl(aead_poly1305_update poly1305_update,
   * 32-bits and this produces a warning because it's always false.
   * Casting to uint64_t inside the conditional is not sufficient to stop
   * the warning. */
-  if (in_len_64 >= (1ull << 32) * 64 - 64) {
+  if (in_len_64 >= (UINT64_C(1) << 32) * 64 - 64) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
    return 0;
  }
@@ -162,7 +162,7 @@ static int open_impl(aead_poly1305_update poly1305_update,
   * 32-bits and this produces a warning because it's always false.
   * Casting to uint64_t inside the conditional is not sufficient to stop
   * the warning. */
-  if (in_len_64 >= (1ull << 32) * 64 - 64) {
+  if (in_len_64 >= (UINT64_C(1) << 32) * 64 - 64) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
    return 0;
  }
@@ -56,7 +56,7 @@

 #include <openssl/cipher.h>
 #include <openssl/des.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>

 #include "internal.h"

@@ -58,7 +58,7 @@

 #include <string.h>

-#include <openssl/obj.h>
+#include <openssl/nid.h>

 #include "internal.h"

@@ -55,7 +55,7 @@
 * [including the GNU Public Licence.] */

 #include <openssl/cipher.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>

 #include "internal.h"

@@ -58,7 +58,7 @@
 #include <string.h>

 #include <openssl/cipher.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>
 #include <openssl/rc4.h>


@@ -54,10 +54,11 @@
 #include <string.h>

 #include <openssl/digest.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>
 #include <openssl/sha.h>

 #include "../internal.h"
+#include "internal.h"


 /* TODO(davidben): unsigned should be size_t. The various constant_time
@@ -44,7 +44,7 @@ static int test(const char *name, const uint8_t *key, size_t key_len,
  }

  ScopedCMAC_CTX ctx(CMAC_CTX_new());
-  if (!CMAC_Init(ctx.get(), key, key_len, EVP_aes_128_cbc(), NULL)) {
+  if (!ctx || !CMAC_Init(ctx.get(), key, key_len, EVP_aes_128_cbc(), NULL)) {
    fprintf(stderr, "%s: CMAC_Init failed.\n", name);
    return 0;
  }
@@ -65,6 +65,7 @@
 #include <openssl/mem.h>

 #include "conf_def.h"
+#include "internal.h"


 static uint32_t conf_value_hash(const CONF_VALUE *v) {
@@ -152,7 +153,7 @@ void NCONF_free(CONF *conf) {
  OPENSSL_free(conf);
 }

-CONF_VALUE *NCONF_new_section(const CONF *conf, const char *section) {
+static CONF_VALUE *NCONF_new_section(const CONF *conf, const char *section) {
  STACK_OF(CONF_VALUE) *sk = NULL;
  int ok = 0;
  CONF_VALUE *v = NULL, *old_value;
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/cpu.h>
+
+#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_STATIC_ARMCAP)
+
+#include <sys/auxv.h>
+
+#include <openssl/arm_arch.h>
+
+#include "internal.h"
+
+
+extern uint32_t OPENSSL_armcap_P;
+
+void OPENSSL_cpuid_setup(void) {
+  unsigned long hwcap = getauxval(AT_HWCAP);
+
+  /* See /usr/include/asm/hwcap.h on an aarch64 installation for the source of
+   * these values. */
+  static const unsigned long kNEON = 1 << 1;
+  static const unsigned long kAES = 1 << 3;
+  static const unsigned long kPMULL = 1 << 4;
+  static const unsigned long kSHA1 = 1 << 5;
+  static const unsigned long kSHA256 = 1 << 6;
+
+  if ((hwcap & kNEON) == 0) {
+    /* Matching OpenSSL, if NEON is missing, don't report other features
+     * either. */
+    return;
+  }
+
+  OPENSSL_armcap_P |= ARMV7_NEON;
+
+  if (hwcap & kAES) {
+    OPENSSL_armcap_P |= ARMV8_AES;
+  }
+  if (hwcap & kPMULL) {
+    OPENSSL_armcap_P |= ARMV8_PMULL;
+  }
+  if (hwcap & kSHA1) {
+    OPENSSL_armcap_P |= ARMV8_SHA1;
+  }
+  if (hwcap & kSHA256) {
+    OPENSSL_armcap_P |= ARMV8_SHA256;
+  }
+}
+
+#endif /* OPENSSL_AARCH64 && !OPENSSL_STATIC_ARMCAP */
@@ -1,32 +0,0 @@
-# Copyright (c) 2014, Google Inc.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-#if !defined(OPENSSL_NO_ASM) && defined(__arm__)
-
-.syntax unified
-.cpu cortex-a8
-.fpu neon
-.text
-.thumb
-.align 2
-.global CRYPTO_arm_neon_probe
-.hidden CRYPTO_arm_neon_probe
-.type CRYPTO_arm_neon_probe, %function
-.thumb_func
-CRYPTO_arm_neon_probe:
-  vorr q1, q1, q1
-  bx lr
-.section .note.GNU-stack,"",%progbits
-
-#endif  /* !OPENSSL_NO_ASM && __arm__ */
@@ -0,0 +1,360 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/cpu.h>
+
+#if defined(OPENSSL_ARM) && !defined(OPENSSL_STATIC_ARMCAP)
+
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <openssl/arm_arch.h>
+#include <openssl/buf.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+
+#define AT_HWCAP 16
+#define AT_HWCAP2 26
+
+#define HWCAP_NEON (1 << 12)
+
+/* See /usr/include/asm/hwcap.h on an ARM installation for the source of
+ * these values. */
+#define HWCAP2_AES (1 << 0)
+#define HWCAP2_PMULL (1 << 1)
+#define HWCAP2_SHA1 (1 << 2)
+#define HWCAP2_SHA2 (1 << 3)
+
+/* |getauxval| is not available on Android until API level 20. Link it as a weak
+ * symbol and use other methods as fallback. */
+unsigned long getauxval(unsigned long type) __attribute__((weak));
+
+static int open_eintr(const char *path, int flags) {
+  int ret;
+  do {
+    ret = open(path, flags);
+  } while (ret < 0 && errno == EINTR);
+  return ret;
+}
+
+static ssize_t read_eintr(int fd, void *out, size_t len) {
+  ssize_t ret;
+  do {
+    ret = read(fd, out, len);
+  } while (ret < 0 && errno == EINTR);
+  return ret;
+}
+
+/* read_full reads exactly |len| bytes from |fd| to |out|. On error or end of
+ * file, it returns zero. */
+static int read_full(int fd, void *out, size_t len) {
+  char *outp = out;
+  while (len > 0) {
+    ssize_t ret = read_eintr(fd, outp, len);
+    if (ret <= 0) {
+      return 0;
+    }
+    outp += ret;
+    len -= ret;
+  }
+  return 1;
+}
+
+/* read_file opens |path| and reads until end-of-file. On success, it returns
+ * one and sets |*out_ptr| and |*out_len| to a newly-allocated buffer with the
+ * contents. Otherwise, it returns zero. */
+static int read_file(char **out_ptr, size_t *out_len, const char *path) {
+  int fd = open_eintr(path, O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+
+  static const size_t kReadSize = 1024;
+  int ret = 0;
+  size_t cap = kReadSize, len = 0;
+  char *buf = OPENSSL_malloc(cap);
+  if (buf == NULL) {
+    goto err;
+  }
+
+  for (;;) {
+    if (cap - len < kReadSize) {
+      size_t new_cap = cap * 2;
+      if (new_cap < cap) {
+        goto err;
+      }
+      char *new_buf = OPENSSL_realloc(buf, new_cap);
+      if (new_buf == NULL) {
+        goto err;
+      }
+      buf = new_buf;
+      cap = new_cap;
+    }
+
+    ssize_t bytes_read = read_eintr(fd, buf + len, kReadSize);
+    if (bytes_read < 0) {
+      goto err;
+    }
+    if (bytes_read == 0) {
+      break;
+    }
+    len += bytes_read;
+  }
+
+  *out_ptr = buf;
+  *out_len = len;
+  ret = 1;
+  buf = NULL;
+
+err:
+  OPENSSL_free(buf);
+  close(fd);
+  return ret;
+}
+
+/* getauxval_proc behaves like |getauxval| but reads from /proc/self/auxv. */
+static unsigned long getauxval_proc(unsigned long type) {
+  int fd = open_eintr("/proc/self/auxv", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+
+  struct {
+    unsigned long tag;
+    unsigned long value;
+  } entry;
+
+  for (;;) {
+    if (!read_full(fd, &entry, sizeof(entry)) ||
+        (entry.tag == 0 && entry.value == 0)) {
+      break;
+    }
+    if (entry.tag == type) {
+      close(fd);
+      return entry.value;
+    }
+  }
+  close(fd);
+  return 0;
+}
+
+typedef struct {
+  const char *data;
+  size_t len;
+} STRING_PIECE;
+
+static int STRING_PIECE_equals(const STRING_PIECE *a, const char *b) {
+  size_t b_len = strlen(b);
+  return a->len == b_len && memcmp(a->data, b, b_len) == 0;
+}
+
+/* STRING_PIECE_split finds the first occurrence of |sep| in |in| and, if found,
+ * sets |*out_left| and |*out_right| to |in| split before and after it. It
+ * returns one if |sep| was found and zero otherwise. */
+static int STRING_PIECE_split(STRING_PIECE *out_left, STRING_PIECE *out_right,
+                              const STRING_PIECE *in, char sep) {
+  const char *p = memchr(in->data, sep, in->len);
+  if (p == NULL) {
+    return 0;
+  }
+  /* |out_left| or |out_right| may alias |in|, so make a copy. */
+  STRING_PIECE in_copy = *in;
+  out_left->data = in_copy.data;
+  out_left->len = p - in_copy.data;
+  out_right->data = in_copy.data + out_left->len + 1;
+  out_right->len = in_copy.len - out_left->len - 1;
+  return 1;
+}
+
+/* STRING_PIECE_trim removes leading and trailing whitespace from |s|. */
+static void STRING_PIECE_trim(STRING_PIECE *s) {
+  while (s->len != 0 && (s->data[0] == ' ' || s->data[0] == '\t')) {
+    s->data++;
+    s->len--;
+  }
+  while (s->len != 0 &&
+         (s->data[s->len - 1] == ' ' || s->data[s->len - 1] == '\t')) {
+    s->len--;
+  }
+}
+
+/* extract_cpuinfo_field extracts a /proc/cpuinfo field named |field| from
+ * |in|. Each line is split at its first ':' into a key and a value, and both
+ * are trimmed of spaces and tabs. If a line's key equals |field|, it sets
+ * |*out| to that line's value and returns one. Otherwise, it returns zero. */
+static int extract_cpuinfo_field(STRING_PIECE *out, const STRING_PIECE *in,
+                                 const char *field) {
+  /* Process |in| one line at a time. */
+  STRING_PIECE remaining = *in, line;
+  int more_lines;
+  do {
+    more_lines = STRING_PIECE_split(&line, &remaining, &remaining, '\n');
+    if (!more_lines) {
+      /* No newline is left, so whatever remains is the final, unterminated
+       * line. It must still be examined rather than silently dropped. */
+      line = remaining;
+    }
+
+    STRING_PIECE key, value;
+    if (!STRING_PIECE_split(&key, &value, &line, ':')) {
+      /* Not a "key: value" line (e.g. a blank separator line). */
+      continue;
+    }
+    STRING_PIECE_trim(&key);
+    if (STRING_PIECE_equals(&key, field)) {
+      STRING_PIECE_trim(&value);
+      *out = value;
+      return 1;
+    }
+  } while (more_lines);
+
+  return 0;
+}
+
+static int cpuinfo_field_equals(const STRING_PIECE *cpuinfo, const char *field,
+                                const char *value) {
+  STRING_PIECE extracted;
+  return extract_cpuinfo_field(&extracted, cpuinfo, field) &&
+         STRING_PIECE_equals(&extracted, value);
+}
+
+/* has_list_item treats |list| as a space-separated list of items and returns
+ * one if |item| is contained in |list| and zero otherwise. */
+static int has_list_item(const STRING_PIECE *list, const char *item) {
+  STRING_PIECE remaining = *list, feature;
+  while (STRING_PIECE_split(&feature, &remaining, &remaining, ' ')) {
+    if (STRING_PIECE_equals(&feature, item)) {
+      return 1;
+    }
+  }
+  /* The final item has no trailing separator, so it is still in |remaining|. */
+  return STRING_PIECE_equals(&remaining, item);
+}
+
+static unsigned long get_hwcap_cpuinfo(const STRING_PIECE *cpuinfo) {
+  if (cpuinfo_field_equals(cpuinfo, "CPU architecture", "8")) {
+    /* This is a 32-bit ARM binary running on a 64-bit kernel. NEON is always
+     * available on ARMv8. Linux omits required features, so reading the
+     * "Features" line does not work. (For simplicity, use strict equality. We
+     * assume everything running on future ARM architectures will have a
+     * working |getauxval|.) */
+    return HWCAP_NEON;
+  }
+
+  STRING_PIECE features;
+  if (extract_cpuinfo_field(&features, cpuinfo, "Features") &&
+      has_list_item(&features, "neon")) {
+    return HWCAP_NEON;
+  }
+  return 0;
+}
+
+static unsigned long get_hwcap2_cpuinfo(const STRING_PIECE *cpuinfo) {
+  STRING_PIECE features;
+  if (!extract_cpuinfo_field(&features, cpuinfo, "Features")) {
+    return 0;
+  }
+
+  unsigned long ret = 0;
+  if (has_list_item(&features, "aes")) {
+    ret |= HWCAP2_AES;
+  }
+  if (has_list_item(&features, "pmull")) {
+    ret |= HWCAP2_PMULL;
+  }
+  if (has_list_item(&features, "sha1")) {
+    ret |= HWCAP2_SHA1;
+  }
+  if (has_list_item(&features, "sha2")) {
+    ret |= HWCAP2_SHA2;
+  }
+  return ret;
+}
+
+/* has_broken_neon returns one if |cpuinfo| matches a CPU known to have a
+ * broken NEON unit. See https://crbug.com/341598. */
+static int has_broken_neon(const STRING_PIECE *cpuinfo) {
+  return cpuinfo_field_equals(cpuinfo, "CPU implementer", "0x51") &&
+         cpuinfo_field_equals(cpuinfo, "CPU architecture", "7") &&
+         cpuinfo_field_equals(cpuinfo, "CPU variant", "0x1") &&
+         cpuinfo_field_equals(cpuinfo, "CPU part", "0x04d") &&
+         cpuinfo_field_equals(cpuinfo, "CPU revision", "0");
+}
+
+extern uint32_t OPENSSL_armcap_P;
+
+static int g_has_broken_neon;
+
+void OPENSSL_cpuid_setup(void) {
+  char *cpuinfo_data;
+  size_t cpuinfo_len;
+  if (!read_file(&cpuinfo_data, &cpuinfo_len, "/proc/cpuinfo")) {
+    return;
+  }
+  STRING_PIECE cpuinfo;
+  cpuinfo.data = cpuinfo_data;
+  cpuinfo.len = cpuinfo_len;
+
+  /* |getauxval| is not available on Android until API level 20. If it is
+   * unavailable, read from /proc/self/auxv as a fallback. This is unreadable
+   * on some versions of Android, so further fall back to /proc/cpuinfo.
+   *
+   * See
+   * https://android.googlesource.com/platform/ndk/+/882ac8f3392858991a0e1af33b4b7387ec856bd2
+   * and b/13679666 (Google-internal) for details. */
+  unsigned long hwcap = 0;
+  if (getauxval != NULL) {
+    hwcap = getauxval(AT_HWCAP);
+  }
+  if (hwcap == 0) {
+    hwcap = getauxval_proc(AT_HWCAP);
+  }
+  if (hwcap == 0) {
+    hwcap = get_hwcap_cpuinfo(&cpuinfo);
+  }
+
+  /* Clear NEON support if known broken. */
+  g_has_broken_neon = has_broken_neon(&cpuinfo);
+  if (g_has_broken_neon) {
+    hwcap &= ~HWCAP_NEON;
+  }
+
+  /* Matching OpenSSL, only report other features if NEON is present. */
+  if (hwcap & HWCAP_NEON) {
+    OPENSSL_armcap_P |= ARMV7_NEON;
+
+    /* Some ARMv8 Android devices don't expose AT_HWCAP2. Fall back to
+     * /proc/cpuinfo. See https://crbug.com/596156. */
+    unsigned long hwcap2 = 0;
+    if (getauxval != NULL) {
+      hwcap2 = getauxval(AT_HWCAP2);
+    }
+    if (hwcap2 == 0) {
+      hwcap2 = get_hwcap2_cpuinfo(&cpuinfo);
+    }
+
+    if (hwcap2 & HWCAP2_AES) {
+      OPENSSL_armcap_P |= ARMV8_AES;
+    }
+    if (hwcap2 & HWCAP2_PMULL) {
+      OPENSSL_armcap_P |= ARMV8_PMULL;
+    }
+    if (hwcap2 & HWCAP2_SHA1) {
+      OPENSSL_armcap_P |= ARMV8_SHA1;
+    }
+    if (hwcap2 & HWCAP2_SHA2) {
+      OPENSSL_armcap_P |= ARMV8_SHA256;
+    }
+  }
+
+  OPENSSL_free(cpuinfo_data);
+}
+
+int CRYPTO_has_broken_NEON(void) { return g_has_broken_neon; }
+
+#endif /* OPENSSL_ARM && !OPENSSL_STATIC_ARMCAP */
@@ -17,52 +17,15 @@
 #if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
    !defined(OPENSSL_STATIC_ARMCAP)

-#include <inttypes.h>
-#include <string.h>
-
-#include <setjmp.h>
-#include <signal.h>
-
 #include <openssl/arm_arch.h>


-/* We can't include <sys/auxv.h> because the Android SDK version against which
- * Chromium builds is too old to have it. Instead we define all the constants
- * that we need and have a weak pointer to getauxval. */
-
-unsigned long getauxval(unsigned long type) __attribute__((weak));
-
 extern uint32_t OPENSSL_armcap_P;

 char CRYPTO_is_NEON_capable_at_runtime(void) {
  return (OPENSSL_armcap_P & ARMV7_NEON) != 0;
 }

-static char g_set_neon_called = 0;
-
-void CRYPTO_set_NEON_capable(char neon_capable) {
-  g_set_neon_called = 1;
-
-  if (neon_capable) {
-    OPENSSL_armcap_P |= ARMV7_NEON;
-  } else {
-    OPENSSL_armcap_P &= ~ARMV7_NEON;
-  }
-}
-
-char CRYPTO_is_NEON_functional(void) {
-  static const uint32_t kWantFlags = ARMV7_NEON | ARMV7_NEON_FUNCTIONAL;
-  return (OPENSSL_armcap_P & kWantFlags) == kWantFlags;
-}
-
-void CRYPTO_set_NEON_functional(char neon_functional) {
-  if (neon_functional) {
-    OPENSSL_armcap_P |= ARMV7_NEON_FUNCTIONAL;
-  } else {
-    OPENSSL_armcap_P &= ~ARMV7_NEON_FUNCTIONAL;
-  }
-}
-
 int CRYPTO_is_ARMv8_AES_capable(void) {
  return (OPENSSL_armcap_P & ARMV8_AES) != 0;
 }
@@ -71,129 +34,5 @@ int CRYPTO_is_ARMv8_PMULL_capable(void) {
  return (OPENSSL_armcap_P & ARMV8_PMULL) != 0;
 }

-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
-
-static sigjmp_buf sigill_jmp;
-
-static void sigill_handler(int signal) {
-  siglongjmp(sigill_jmp, signal);
-}
-
-void CRYPTO_arm_neon_probe(void);
-
-// probe_for_NEON returns 1 if a NEON instruction runs successfully. Because
-// getauxval doesn't exist on Android until Jelly Bean, supporting NEON on
-// older devices requires this.
-static int probe_for_NEON(void) {
-  int supported = 0;
-
-  sigset_t sigmask;
-  sigfillset(&sigmask);
-  sigdelset(&sigmask, SIGILL);
-  sigdelset(&sigmask, SIGTRAP);
-  sigdelset(&sigmask, SIGFPE);
-  sigdelset(&sigmask, SIGBUS);
-  sigdelset(&sigmask, SIGSEGV);
-
-  struct sigaction sigill_original_action, sigill_action;
-  memset(&sigill_action, 0, sizeof(sigill_action));
-  sigill_action.sa_handler = sigill_handler;
-  sigill_action.sa_mask = sigmask;
-
-  sigset_t original_sigmask;
-  sigprocmask(SIG_SETMASK, &sigmask, &original_sigmask);
-
-  if (sigsetjmp(sigill_jmp, 1 /* save signals */) == 0) {
-    sigaction(SIGILL, &sigill_action, &sigill_original_action);
-
-    // This function cannot be inline asm because GCC will refuse to compile
-    // inline NEON instructions unless building with -mfpu=neon, which would
-    // defeat the point of probing for support at runtime.
-    CRYPTO_arm_neon_probe();
-    supported = 1;
-  }
-  // Note that Android up to and including Lollipop doesn't restore the signal
-  // mask correctly after returning from a sigsetjmp. So that would need to be
-  // set again here if more probes were added.
-  // See https://android-review.googlesource.com/#/c/127624/
-
-  sigaction(SIGILL, &sigill_original_action, NULL);
-  sigprocmask(SIG_SETMASK, &original_sigmask, NULL);
-
-  return supported;
-}
-
-#else
-
-static int probe_for_NEON(void) {
-  return 0;
-}
-
-#endif  /* !OPENSSL_NO_ASM && OPENSSL_ARM */
-
-void OPENSSL_cpuid_setup(void) {
-  if (getauxval == NULL) {
-    // On ARM, but not AArch64, try a NEON instruction and see whether it works
-    // in order to probe for NEON support.
-    //
-    // Note that |CRYPTO_is_NEON_capable| can be true even if
-    // |CRYPTO_set_NEON_capable| has never been called if the code was compiled
-    // with NEON support enabled (e.g. -mfpu=neon).
-    if (!g_set_neon_called && !CRYPTO_is_NEON_capable() && probe_for_NEON()) {
-      OPENSSL_armcap_P |= ARMV7_NEON;
-    }
-    return;
-  }
-
-  static const unsigned long AT_HWCAP = 16;
-  unsigned long hwcap = getauxval(AT_HWCAP);
-
-#if defined(OPENSSL_ARM)
-  static const unsigned long kNEON = 1 << 12;
-  if ((hwcap & kNEON) == 0) {
-    return;
-  }
-
-  /* In 32-bit mode, the ARMv8 feature bits are in a different aux vector
-   * value. */
-  static const unsigned long AT_HWCAP2 = 26;
-  hwcap = getauxval(AT_HWCAP2);
-
-  /* See /usr/include/asm/hwcap.h on an ARM installation for the source of
-   * these values. */
-  static const unsigned long kAES = 1 << 0;
-  static const unsigned long kPMULL = 1 << 1;
-  static const unsigned long kSHA1 = 1 << 2;
-  static const unsigned long kSHA256 = 1 << 3;
-#elif defined(OPENSSL_AARCH64)
-  /* See /usr/include/asm/hwcap.h on an aarch64 installation for the source of
-   * these values. */
-  static const unsigned long kNEON = 1 << 1;
-  static const unsigned long kAES = 1 << 3;
-  static const unsigned long kPMULL = 1 << 4;
-  static const unsigned long kSHA1 = 1 << 5;
-  static const unsigned long kSHA256 = 1 << 6;
-
-  if ((hwcap & kNEON) == 0) {
-    return;
-  }
-#endif
-
-  OPENSSL_armcap_P |= ARMV7_NEON;
-
-  if (hwcap & kAES) {
-    OPENSSL_armcap_P |= ARMV8_AES;
-  }
-  if (hwcap & kPMULL) {
-    OPENSSL_armcap_P |= ARMV8_PMULL;
-  }
-  if (hwcap & kSHA1) {
-    OPENSSL_armcap_P |= ARMV8_SHA1;
-  }
-  if (hwcap & kSHA256) {
-    OPENSSL_armcap_P |= ARMV8_SHA256;
-  }
-}
-
 #endif  /* (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) &&
           !defined(OPENSSL_STATIC_ARMCAP) */
@@ -64,8 +64,8 @@
 #if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64))

 #include <inttypes.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>

 #if defined(OPENSSL_WINDOWS)
@@ -75,6 +75,8 @@
 #pragma warning(pop)
 #endif

+#include "internal.h"
+

 /* OPENSSL_cpuid runs the cpuid instruction. |leaf| is passed in as EAX and ECX
 * is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through
@@ -63,7 +63,7 @@ uint32_t OPENSSL_ia32cap_P[4] = {0};

 uint32_t OPENSSL_armcap_P =
 #if defined(OPENSSL_STATIC_ARMCAP_NEON) || defined(__ARM_NEON__)
-    ARMV7_NEON | ARMV7_NEON_FUNCTIONAL |
+    ARMV7_NEON |
 #endif
 #if defined(OPENSSL_STATIC_ARMCAP_AES)
    ARMV8_AES |
@@ -79,10 +79,8 @@ uint32_t OPENSSL_armcap_P =
 #endif
    0;

-#elif defined(__ARM_NEON__)
-uint32_t OPENSSL_armcap_P = ARMV7_NEON | ARMV7_NEON_FUNCTIONAL;
 #else
-uint32_t OPENSSL_armcap_P = ARMV7_NEON_FUNCTIONAL;
+uint32_t OPENSSL_armcap_P = 0;
 #endif

 #endif
@@ -140,3 +138,5 @@ int CRYPTO_malloc_init(void) {
 void ENGINE_load_builtin_engines(void) {}

 void OPENSSL_load_builtin_modules(void) {}
+
+/* FIPS_mode always returns zero. NOTE(review): presumably an OpenSSL
+ * compatibility stub, like the no-op functions above - confirm. */
+int FIPS_mode(void) { return 0; }
@@ -22,6 +22,7 @@ add_library(
  OBJECT

  curve25519.c
+  spake25519.c
  x25519-x86_64.c

  ${CURVE25519_ARCH_SOURCES}
@@ -45,3 +46,12 @@ add_executable(

 target_link_libraries(x25519_test crypto)
 add_dependencies(all_tests x25519_test)
+
+add_executable(
+  spake25519_test
+
+  spake25519_test.cc
+)
+
+target_link_libraries(spake25519_test crypto)
+add_dependencies(all_tests spake25519_test)
@@ -31,11 +31,10 @@
 #include "internal.h"


-/* fe means field element. Here the field is \Z/(2^255-19). An element t,
- * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
- * t[3]+2^102 t[4]+...+2^230 t[9]. Bounds on each t[i] vary depending on
- * context.  */
-typedef int32_t fe[10];
+static const int64_t kBottom25Bits = INT64_C(0x1ffffff);
+static const int64_t kBottom26Bits = INT64_C(0x3ffffff);
+static const int64_t kTop39Bits = INT64_C(0xfffffffffe000000);
+static const int64_t kTop38Bits = INT64_C(0xfffffffffc000000);

 static uint64_t load_3(const uint8_t *in) {
  uint64_t result;
@@ -77,17 +76,17 @@ static void fe_frombytes(fe h, const uint8_t *s) {
  int64_t carry8;
  int64_t carry9;

-  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
-  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
-  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
-  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
-  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+  carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits;
+  carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits;
+  carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits;
+  carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits;
+  carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits;

-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
-  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
-  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
-  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+  carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;
+  carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits;
+  carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;
+  carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits;
+  carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits;

  h[0] = h0;
  h[1] = h1;
@@ -135,16 +134,6 @@ static void fe_tobytes(uint8_t *s, const fe h) {
  int32_t h8 = h[8];
  int32_t h9 = h[9];
  int32_t q;
-  int32_t carry0;
-  int32_t carry1;
-  int32_t carry2;
-  int32_t carry3;
-  int32_t carry4;
-  int32_t carry5;
-  int32_t carry6;
-  int32_t carry7;
-  int32_t carry8;
-  int32_t carry9;

  q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
  q = (h0 + q) >> 26;
@@ -162,16 +151,16 @@ static void fe_tobytes(uint8_t *s, const fe h) {
  h0 += 19 * q;
  /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */

-  carry0 = h0 >> 26; h1 += carry0; h0 -= carry0 << 26;
-  carry1 = h1 >> 25; h2 += carry1; h1 -= carry1 << 25;
-  carry2 = h2 >> 26; h3 += carry2; h2 -= carry2 << 26;
-  carry3 = h3 >> 25; h4 += carry3; h3 -= carry3 << 25;
-  carry4 = h4 >> 26; h5 += carry4; h4 -= carry4 << 26;
-  carry5 = h5 >> 25; h6 += carry5; h5 -= carry5 << 25;
-  carry6 = h6 >> 26; h7 += carry6; h6 -= carry6 << 26;
-  carry7 = h7 >> 25; h8 += carry7; h7 -= carry7 << 25;
-  carry8 = h8 >> 26; h9 += carry8; h8 -= carry8 << 26;
-  carry9 = h9 >> 25;               h9 -= carry9 << 25;
+  h1 += h0 >> 26; h0 &= kBottom26Bits;
+  h2 += h1 >> 25; h1 &= kBottom25Bits;
+  h3 += h2 >> 26; h2 &= kBottom26Bits;
+  h4 += h3 >> 25; h3 &= kBottom25Bits;
+  h5 += h4 >> 26; h4 &= kBottom26Bits;
+  h6 += h5 >> 25; h5 &= kBottom25Bits;
+  h7 += h6 >> 26; h6 &= kBottom26Bits;
+  h8 += h7 >> 25; h7 &= kBottom25Bits;
+  h9 += h8 >> 26; h8 &= kBottom26Bits;
+                  h9 &= kBottom25Bits;
                  /* h10 = carry9 */

  /* Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
@@ -182,32 +171,32 @@ static void fe_tobytes(uint8_t *s, const fe h) {
  s[0] = h0 >> 0;
  s[1] = h0 >> 8;
  s[2] = h0 >> 16;
-  s[3] = (h0 >> 24) | (h1 << 2);
+  s[3] = (h0 >> 24) | ((uint32_t)(h1) << 2);
  s[4] = h1 >> 6;
  s[5] = h1 >> 14;
-  s[6] = (h1 >> 22) | (h2 << 3);
+  s[6] = (h1 >> 22) | ((uint32_t)(h2) << 3);
  s[7] = h2 >> 5;
  s[8] = h2 >> 13;
-  s[9] = (h2 >> 21) | (h3 << 5);
+  s[9] = (h2 >> 21) | ((uint32_t)(h3) << 5);
  s[10] = h3 >> 3;
  s[11] = h3 >> 11;
-  s[12] = (h3 >> 19) | (h4 << 6);
+  s[12] = (h3 >> 19) | ((uint32_t)(h4) << 6);
  s[13] = h4 >> 2;
  s[14] = h4 >> 10;
  s[15] = h4 >> 18;
  s[16] = h5 >> 0;
  s[17] = h5 >> 8;
  s[18] = h5 >> 16;
-  s[19] = (h5 >> 24) | (h6 << 1);
+  s[19] = (h5 >> 24) | ((uint32_t)(h6) << 1);
  s[20] = h6 >> 7;
  s[21] = h6 >> 15;
-  s[22] = (h6 >> 23) | (h7 << 3);
+  s[22] = (h6 >> 23) | ((uint32_t)(h7) << 3);
  s[23] = h7 >> 5;
  s[24] = h7 >> 13;
-  s[25] = (h7 >> 21) | (h8 << 4);
+  s[25] = (h7 >> 21) | ((uint32_t)(h8) << 4);
  s[26] = h8 >> 4;
  s[27] = h8 >> 12;
-  s[28] = (h8 >> 20) | (h9 << 6);
+  s[28] = (h8 >> 20) | ((uint32_t)(h9) << 6);
  s[29] = h9 >> 2;
  s[30] = h9 >> 10;
  s[31] = h9 >> 18;
@@ -447,46 +436,46 @@ static void fe_mul(fe h, const fe f, const fe g) {
   * |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
   *   i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9 */

-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;
+  carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;
  /* |h0| <= 2^25 */
  /* |h4| <= 2^25 */
  /* |h1| <= 1.71*2^59 */
  /* |h5| <= 1.71*2^59 */

-  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
-  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
+  carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits;
+  carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits;
  /* |h1| <= 2^24; from now on fits into int32 */
  /* |h5| <= 2^24; from now on fits into int32 */
  /* |h2| <= 1.41*2^60 */
  /* |h6| <= 1.41*2^60 */

-  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
-  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
+  carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits;
+  carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits;
  /* |h2| <= 2^25; from now on fits into int32 unchanged */
  /* |h6| <= 2^25; from now on fits into int32 unchanged */
  /* |h3| <= 1.71*2^59 */
  /* |h7| <= 1.71*2^59 */

-  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
-  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+  carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits;
+  carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits;
  /* |h3| <= 2^24; from now on fits into int32 unchanged */
  /* |h7| <= 2^24; from now on fits into int32 unchanged */
  /* |h4| <= 1.72*2^34 */
  /* |h8| <= 1.41*2^60 */

-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
-  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+  carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;
+  carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits;
  /* |h4| <= 2^25; from now on fits into int32 unchanged */
  /* |h8| <= 2^25; from now on fits into int32 unchanged */
  /* |h5| <= 1.01*2^24 */
  /* |h9| <= 1.71*2^59 */

-  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
+  carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits;
  /* |h9| <= 2^24; from now on fits into int32 unchanged */
  /* |h0| <= 1.1*2^39 */

-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;
  /* |h0| <= 2^25; from now on fits into int32 unchanged */
  /* |h1| <= 1.01*2^24 */

@@ -612,24 +601,24 @@ static void fe_sq(fe h, const fe f) {
  int64_t carry8;
  int64_t carry9;

-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;
+  carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;

-  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
-  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
+  carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits;
+  carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits;

-  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
-  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
+  carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits;
+  carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits;

-  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
-  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+  carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits;
+  carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits;

-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
-  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+  carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;
+  carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits;

-  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
+  carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits;

-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;

  h[0] = h0;
  h[1] = h1;
@@ -880,24 +869,24 @@ static void fe_sq2(fe h, const fe f) {
  h8 += h8;
  h9 += h9;

-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;
+  carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;

-  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
-  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
+  carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits;
+  carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits;

-  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
-  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
+  carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits;
+  carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits;

-  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
-  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+  carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits;
+  carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits;

-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
-  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+  carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;
+  carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits;

-  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
+  carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits;

-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;

  h[0] = h0;
  h[1] = h1;
@@ -974,52 +963,7 @@ static void fe_pow22523(fe out, const fe z) {
  fe_mul(out, t0, z);
 }

-/* ge means group element.
-
- * Here the group is the set of pairs (x,y) of field elements (see fe.h)
- * satisfying -x^2 + y^2 = 1 + d x^2y^2
- * where d = -121665/121666.
- *
- * Representations:
- *   ge_p2 (projective): (X:Y:Z) satisfying x=X/Z, y=Y/Z
- *   ge_p3 (extended): (X:Y:Z:T) satisfying x=X/Z, y=Y/Z, XY=ZT
- *   ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T
- *   ge_precomp (Duif): (y+x,y-x,2dxy) */
-
-typedef struct {
-  fe X;
-  fe Y;
-  fe Z;
-} ge_p2;
-
-typedef struct {
-  fe X;
-  fe Y;
-  fe Z;
-  fe T;
-} ge_p3;
-
-typedef struct {
-  fe X;
-  fe Y;
-  fe Z;
-  fe T;
-} ge_p1p1;
-
-typedef struct {
-  fe yplusx;
-  fe yminusx;
-  fe xy2d;
-} ge_precomp;
-
-typedef struct {
-  fe YplusX;
-  fe YminusX;
-  fe Z;
-  fe T2d;
-} ge_cached;
-
-static void ge_tobytes(uint8_t *s, const ge_p2 *h) {
+void x25519_ge_tobytes(uint8_t *s, const ge_p2 *h) {
  fe recip;
  fe x;
  fe y;
@@ -1049,7 +993,7 @@ static const fe d = {-10913610, 13857413, -15372611, 6949391,   114729,
 static const fe sqrtm1 = {-32595792, -7943725,  9377950,  3500415, 12389472,
                          -272473,   -25146209, -2005654, 326686,  11406482};

-static int ge_frombytes_vartime(ge_p3 *h, const uint8_t *s) {
+int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t *s) {
  fe u;
  fe v;
  fe v3;
@@ -1105,6 +1049,13 @@ static void ge_p3_0(ge_p3 *h) {
  fe_0(h->T);
 }

+/* ge_cached_0 sets |h| to the neutral element in cached form: with X = 0,
+ * Y = 1, Z = 1 and T = 0, both Y+X and Y-X are one and the T2d field is
+ * zero. */
+static void ge_cached_0(ge_cached *h) {
+  fe_1(h->YplusX);
+  fe_1(h->YminusX);
+  fe_1(h->Z);
+  fe_0(h->T2d);
+}
+
 static void ge_precomp_0(ge_precomp *h) {
  fe_1(h->yplusx);
  fe_1(h->yminusx);
@@ -1122,7 +1073,7 @@ static const fe d2 = {-21827239, -5839606,  -30745221, 13898782, 229458,
                      15978800,  -12551817, -6495438,  29715968, 9444199};

 /* r = p */
-static void ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
+void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
  fe_add(r->YplusX, p->Y, p->X);
  fe_sub(r->YminusX, p->Y, p->X);
  fe_copy(r->Z, p->Z);
@@ -1130,20 +1081,27 @@ static void ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
 }

 /* r = p */
-static void ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
+void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
  fe_mul(r->X, p->X, p->T);
  fe_mul(r->Y, p->Y, p->Z);
  fe_mul(r->Z, p->Z, p->T);
 }

 /* r = p */
-static void ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
+void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
  fe_mul(r->X, p->X, p->T);
  fe_mul(r->Y, p->Y, p->Z);
  fe_mul(r->Z, p->Z, p->T);
  fe_mul(r->T, p->X, p->Y);
 }

+/* r = p
+ *
+ * Converts |p| from the p1p1 representation to the cached representation by
+ * going through a temporary |ge_p3|. */
+static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
+  ge_p3 t;
+  x25519_ge_p1p1_to_p3(&t, p);
+  x25519_ge_p3_to_cached(r, &t);
+}
+
 /* r = 2 * p */
 static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
  fe t0;
@@ -1199,7 +1157,7 @@ static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
 }

 /* r = p + q */
-static void ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
+void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
  fe t0;

  fe_add(r->X, p->Y, p->X);
@@ -1216,7 +1174,7 @@ static void ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
 }

 /* r = p - q */
-static void ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
+void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
  fe t0;

  fe_add(r->X, p->Y, p->X);
@@ -1242,12 +1200,64 @@ static uint8_t equal(signed char b, signed char c) {
  return y;
 }

-static void cmov(ge_precomp *t, ge_precomp *u, uint8_t b) {
+static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
  fe_cmov(t->yplusx, u->yplusx, b);
  fe_cmov(t->yminusx, u->yminusx, b);
  fe_cmov(t->xy2d, u->xy2d, b);
 }

+/* x25519_ge_scalarmult_small_precomp sets |h| to the result of a scalar
+ * multiplication by the 32-byte scalar |a|, using the 15 precomputed points in
+ * |precomp_table|.
+ *
+ * |precomp_table| holds 15 entries of 64 bytes each: 32 bytes decoded as the
+ * x coordinate followed by 32 bytes decoded as the y coordinate, both with
+ * |fe_frombytes|. */
+void x25519_ge_scalarmult_small_precomp(
+    ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
+  /* precomp_table is first expanded into matching |ge_precomp|
+   * elements. */
+  ge_precomp multiples[15];
+
+  unsigned i;
+  for (i = 0; i < 15; i++) {
+    const uint8_t *bytes = &precomp_table[i*(2 * 32)];
+    fe x, y;
+    fe_frombytes(x, bytes);
+    fe_frombytes(y, bytes + 32);
+
+    /* Build (y+x, y-x, x*y*d2) from the affine coordinates. */
+    ge_precomp *out = &multiples[i];
+    fe_add(out->yplusx, y, x);
+    fe_sub(out->yminusx, y, x);
+    fe_mul(out->xy2d, x, y);
+    fe_mul(out->xy2d, out->xy2d, d2);
+  }
+
+  /* See the comment above |k25519SmallPrecomp| about the structure of the
+   * precomputed elements. This loop does 64 additions and 64 doublings to
+   * calculate the result. */
+  ge_p3_0(h);
+
+  /* |i| is unsigned, so decrementing it past zero wraps around and fails the
+   * |i < 64| test; the loop therefore runs for i = 63, 62, ..., 0. */
+  for (i = 63; i < 64; i--) {
+    unsigned j;
+    signed char index = 0;
+
+    /* Gather bits i, 64+i, 128+i and 192+i of |a| (counting from the least
+     * significant bit of a[0]) into a 4-bit table index. */
+    for (j = 0; j < 4; j++) {
+      const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
+      index |= (bit << j);
+    }
+
+    /* Start from the |ge_precomp_0| value, which is kept when |index| is zero
+     * because no table entry matches. All 15 entries are passed to |cmov|, so
+     * the set of entries touched does not depend on |index|. */
+    ge_precomp e;
+    ge_precomp_0(&e);
+
+    for (j = 1; j < 16; j++) {
+      cmov(&e, &multiples[j-1], equal(index, j));
+    }
+
+    /* h = 2 * h, computed as h + h. */
+    ge_cached cached;
+    ge_p1p1 r;
+    x25519_ge_p3_to_cached(&cached, h);
+    x25519_ge_add(&r, h, &cached);
+    x25519_ge_p1p1_to_p3(h, &r);
+
+    /* h = h + e. */
+    ge_madd(&r, h, &e);
+    x25519_ge_p1p1_to_p3(h, &r);
+  }
+}
+
 #if defined(OPENSSL_SMALL)

 /* This block of code replaces the standard base-point table with a much smaller
@@ -1341,61 +1351,14 @@ static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = {
    0x45, 0xc9, 0x8b, 0x17, 0x79, 0xe7, 0xc7, 0x90, 0x99, 0x3a, 0x18, 0x25,
 };

-static void ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
-  /* k25519SmallPrecomp is first expanded into matching |ge_precomp|
-   * elements. */
-  ge_precomp multiples[15];
-
-  unsigned i;
-  for (i = 0; i < 15; i++) {
-    const uint8_t *bytes = &k25519SmallPrecomp[i*(2 * 32)];
-    fe x, y;
-    fe_frombytes(x, bytes);
-    fe_frombytes(y, bytes + 32);
-
-    ge_precomp *out = &multiples[i];
-    fe_add(out->yplusx, y, x);
-    fe_sub(out->yminusx, y, x);
-    fe_mul(out->xy2d, x, y);
-    fe_mul(out->xy2d, out->xy2d, d2);
-  }
-
-  /* See the comment above |k25519SmallPrecomp| about the structure of the
-   * precomputed elements. This loop does 64 additions and 64 doublings to
-   * calculate the result. */
-  ge_p3_0(h);
-
-  for (i = 63; i < 64; i--) {
-    unsigned j;
-    signed char index = 0;
-
-    for (j = 0; j < 4; j++) {
-      const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
-      index |= (bit << j);
-    }
-
-    ge_precomp e;
-    ge_precomp_0(&e);
-
-    for (j = 1; j < 16; j++) {
-      cmov(&e, &multiples[j-1], equal(index, j));
-    }
-
-    ge_cached cached;
-    ge_p1p1 r;
-    ge_p3_to_cached(&cached, h);
-    ge_add(&r, h, &cached);
-    ge_p1p1_to_p3(h, &r);
-
-    ge_madd(&r, h, &e);
-    ge_p1p1_to_p3(h, &r);
-  }
+void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
+  x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
 }

 #else

 /* k25519Precomp[i][j] = (j+1)*256^i*B */
-static ge_precomp k25519Precomp[32][8] = {
+static const ge_precomp k25519Precomp[32][8] = {
    {
        {
            {25967493, -14356035, 29566456, 3660896, -12694345, 4014787,
@@ -3519,7 +3482,7 @@ static uint8_t negative(signed char b) {
 static void table_select(ge_precomp *t, int pos, signed char b) {
  ge_precomp minust;
  uint8_t bnegative = negative(b);
-  uint8_t babs = b - (((-bnegative) & b) << 1);
+  uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);

  ge_precomp_0(t);
  cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
@@ -3542,7 +3505,7 @@ static void table_select(ge_precomp *t, int pos, signed char b) {
 *
 * Preconditions:
 *   a[31] <= 127 */
-static void ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
+void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
  signed char e[64];
  signed char carry;
  ge_p1p1 r;
@@ -3571,27 +3534,88 @@ static void ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
  for (i = 1; i < 64; i += 2) {
    table_select(&t, i / 2, e[i]);
    ge_madd(&r, h, &t);
-    ge_p1p1_to_p3(h, &r);
+    x25519_ge_p1p1_to_p3(h, &r);
  }

  ge_p3_dbl(&r, h);
-  ge_p1p1_to_p2(&s, &r);
+  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
-  ge_p1p1_to_p2(&s, &r);
+  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
-  ge_p1p1_to_p2(&s, &r);
+  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
-  ge_p1p1_to_p3(h, &r);
+  x25519_ge_p1p1_to_p3(h, &r);

  for (i = 0; i < 64; i += 2) {
    table_select(&t, i / 2, e[i]);
    ge_madd(&r, h, &t);
-    ge_p1p1_to_p3(h, &r);
+    x25519_ge_p1p1_to_p3(h, &r);
  }
 }

 #endif

+/* cmov_cached conditionally copies |u| into |t|: each field of |u| is passed
+ * to |fe_cmov| together with |b|. NOTE(review): |b| is expected to be 0 or 1,
+ * as produced by |equal| - confirm against |fe_cmov|.
+ *
+ * |u| is only read, so it is const-qualified, matching the source argument of
+ * |cmov|. */
+static void cmov_cached(ge_cached *t, const ge_cached *u, uint8_t b) {
+  fe_cmov(t->YplusX, u->YplusX, b);
+  fe_cmov(t->YminusX, u->YminusX, b);
+  fe_cmov(t->Z, u->Z, b);
+  fe_cmov(t->T2d, u->T2d, b);
+}
+
+/* r = scalar * A.
+ * where scalar = scalar[0]+256*scalar[1]+...+256^31 scalar[31].
+ *
+ * The scalar is consumed four bits at a time, most significant nibble first,
+ * using a table of the multiples 0*A through 15*A. */
+void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
+  /* Ai[j] holds j*A in cached form. Ai_p2[k] holds k*A in p2 form for
+   * k = 1..7 (index 0 is never written or read); it only serves as input to
+   * the doublings that build the table. */
+  ge_p2 Ai_p2[8];
+  ge_cached Ai[16];
+  ge_p1p1 t;
+
+  ge_cached_0(&Ai[0]);
+  x25519_ge_p3_to_cached(&Ai[1], A);
+  ge_p3_to_p2(&Ai_p2[1], A);
+
+  /* Each iteration fills Ai[i] = 2 * ((i/2)*A) and Ai[i + 1] = Ai[i] + A. The
+   * p2 copies are kept only for multiples up to 7*A, the largest one that is
+   * doubled. */
+  unsigned i;
+  for (i = 2; i < 16; i += 2) {
+    ge_p2_dbl(&t, &Ai_p2[i / 2]);
+    ge_p1p1_to_cached(&Ai[i], &t);
+    if (i < 8) {
+      x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
+    }
+    x25519_ge_add(&t, A, &Ai[i]);
+    ge_p1p1_to_cached(&Ai[i + 1], &t);
+    if (i < 7) {
+      x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
+    }
+  }
+
+  ge_p2_0(r);
+  ge_p3 u;
+
+  for (i = 0; i < 256; i += 4) {
+    /* u = 16 * r. The last doubling is converted to p3 rather than p2 because
+     * |x25519_ge_add| takes a |ge_p3| as its first operand. */
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p3(&u, &t);
+
+    /* Pick the nibble for this step, walking from scalar[31] down to
+     * scalar[0]: the high nibble when |i| is a multiple of 8, otherwise the
+     * low nibble. */
+    uint8_t index = scalar[31 - i/8];
+    index >>= 4 - (i & 4);
+    index &= 0xf;
+
+    /* All 16 table entries are passed to |cmov_cached|, so the set of entries
+     * touched does not depend on |index|. */
+    unsigned j;
+    ge_cached selected;
+    ge_cached_0(&selected);
+    for (j = 0; j < 16; j++) {
+      cmov_cached(&selected, &Ai[j], equal(j, index));
+    }
+
+    /* r = u + index*A. */
+    x25519_ge_add(&t, &u, &selected);
+    x25519_ge_p1p1_to_p2(r, &t);
+  }
+}
+
 static void slide(signed char *r, const uint8_t *a) {
  int i;
  int b;
@@ -3626,7 +3650,7 @@ static void slide(signed char *r, const uint8_t *a) {
  }
 }

-static ge_precomp Bi[8] = {
+static const ge_precomp Bi[8] = {
    {
        {25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626,
         -11754271, -6079156, 2047605},
@@ -3697,8 +3721,8 @@ static ge_precomp Bi[8] = {
 * where a = a[0]+256*a[1]+...+256^31 a[31].
 * and b = b[0]+256*b[1]+...+256^31 b[31].
 * B is the Ed25519 base point (x,4/5) with x positive. */
-void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
-                                  const ge_p3 *A, const uint8_t *b) {
+static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
+                                         const ge_p3 *A, const uint8_t *b) {
  signed char aslide[256];
  signed char bslide[256];
  ge_cached Ai[8]; /* A,3A,5A,7A,9A,11A,13A,15A */
@@ -3710,30 +3734,30 @@ void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
  slide(aslide, a);
  slide(bslide, b);

-  ge_p3_to_cached(&Ai[0], A);
+  x25519_ge_p3_to_cached(&Ai[0], A);
  ge_p3_dbl(&t, A);
-  ge_p1p1_to_p3(&A2, &t);
-  ge_add(&t, &A2, &Ai[0]);
-  ge_p1p1_to_p3(&u, &t);
-  ge_p3_to_cached(&Ai[1], &u);
-  ge_add(&t, &A2, &Ai[1]);
-  ge_p1p1_to_p3(&u, &t);
-  ge_p3_to_cached(&Ai[2], &u);
-  ge_add(&t, &A2, &Ai[2]);
-  ge_p1p1_to_p3(&u, &t);
-  ge_p3_to_cached(&Ai[3], &u);
-  ge_add(&t, &A2, &Ai[3]);
-  ge_p1p1_to_p3(&u, &t);
-  ge_p3_to_cached(&Ai[4], &u);
-  ge_add(&t, &A2, &Ai[4]);
-  ge_p1p1_to_p3(&u, &t);
-  ge_p3_to_cached(&Ai[5], &u);
-  ge_add(&t, &A2, &Ai[5]);
-  ge_p1p1_to_p3(&u, &t);
-  ge_p3_to_cached(&Ai[6], &u);
-  ge_add(&t, &A2, &Ai[6]);
-  ge_p1p1_to_p3(&u, &t);
-  ge_p3_to_cached(&Ai[7], &u);
+  x25519_ge_p1p1_to_p3(&A2, &t);
+  x25519_ge_add(&t, &A2, &Ai[0]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[1], &u);
+  x25519_ge_add(&t, &A2, &Ai[1]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[2], &u);
+  x25519_ge_add(&t, &A2, &Ai[2]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[3], &u);
+  x25519_ge_add(&t, &A2, &Ai[3]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[4], &u);
+  x25519_ge_add(&t, &A2, &Ai[4]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[5], &u);
+  x25519_ge_add(&t, &A2, &Ai[5]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[6], &u);
+  x25519_ge_add(&t, &A2, &Ai[6]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[7], &u);

  ge_p2_0(r);

@@ -3747,22 +3771,22 @@ void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
    ge_p2_dbl(&t, r);

    if (aslide[i] > 0) {
-      ge_p1p1_to_p3(&u, &t);
-      ge_add(&t, &u, &Ai[aslide[i] / 2]);
+      x25519_ge_p1p1_to_p3(&u, &t);
+      x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
    } else if (aslide[i] < 0) {
-      ge_p1p1_to_p3(&u, &t);
-      ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
+      x25519_ge_p1p1_to_p3(&u, &t);
+      x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
    }

    if (bslide[i] > 0) {
-      ge_p1p1_to_p3(&u, &t);
+      x25519_ge_p1p1_to_p3(&u, &t);
      ge_madd(&t, &u, &Bi[bslide[i] / 2]);
    } else if (bslide[i] < 0) {
-      ge_p1p1_to_p3(&u, &t);
+      x25519_ge_p1p1_to_p3(&u, &t);
      ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
    }

-    ge_p1p1_to_p2(r, &t);
+    x25519_ge_p1p1_to_p2(r, &t);
  }
 }

@@ -3776,7 +3800,7 @@ void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
 *   s[0]+256*s[1]+...+256^31*s[31] = s mod l
 *   where l = 2^252 + 27742317777372353535851937790883648493.
 *   Overwrites s in place. */
-static void sc_reduce(uint8_t *s) {
+void x25519_sc_reduce(uint8_t *s) {
  int64_t s0 = 2097151 & load_3(s);
  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
@@ -4610,7 +4634,7 @@ void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) {
  az[31] |= 64;

  ge_p3 A;
-  ge_scalarmult_base(&A, az);
+  x25519_ge_scalarmult_base(&A, az);
  ge_p3_tobytes(out_public_key, &A);

  memcpy(out_private_key, seed, 32);
@@ -4633,9 +4657,9 @@ int ED25519_sign(uint8_t *out_sig, const uint8_t *message, size_t message_len,
  uint8_t nonce[SHA512_DIGEST_LENGTH];
  SHA512_Final(nonce, &hash_ctx);

-  sc_reduce(nonce);
+  x25519_sc_reduce(nonce);
  ge_p3 R;
-  ge_scalarmult_base(&R, nonce);
+  x25519_ge_scalarmult_base(&R, nonce);
  ge_p3_tobytes(out_sig, &R);

  SHA512_Init(&hash_ctx);
@@ -4645,7 +4669,7 @@ int ED25519_sign(uint8_t *out_sig, const uint8_t *message, size_t message_len,
  uint8_t hram[SHA512_DIGEST_LENGTH];
  SHA512_Final(hram, &hash_ctx);

-  sc_reduce(hram);
+  x25519_sc_reduce(hram);
  sc_muladd(out_sig + 32, hram, az, nonce);

  return 1;
@@ -4655,7 +4679,7 @@ int ED25519_verify(const uint8_t *message, size_t message_len,
                   const uint8_t signature[64], const uint8_t public_key[32]) {
  ge_p3 A;
  if ((signature[63] & 224) != 0 ||
-      ge_frombytes_vartime(&A, public_key) != 0) {
+      x25519_ge_frombytes_vartime(&A, public_key) != 0) {
    return 0;
  }

@@ -4677,13 +4701,13 @@ int ED25519_verify(const uint8_t *message, size_t message_len,
  uint8_t h[SHA512_DIGEST_LENGTH];
  SHA512_Final(h, &hash_ctx);

-  sc_reduce(h);
+  x25519_sc_reduce(h);

  ge_p2 R;
  ge_double_scalarmult_vartime(&R, h, &A, scopy);

  uint8_t rcheck[32];
-  ge_tobytes(rcheck, &R);
+  x25519_ge_tobytes(rcheck, &R);

  return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
 }
@@ -4753,17 +4777,17 @@ static void fe_mul121666(fe h, fe f) {
  int64_t carry8;
  int64_t carry9;

-  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
-  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
-  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
-  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
-  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+  carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits;
+  carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits;
+  carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits;
+  carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits;
+  carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits;

-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
-  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
-  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
-  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+  carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;
+  carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits;
+  carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;
+  carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits;
+  carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits;

  h[0] = h0;
  h[1] = h1;
@@ -4887,7 +4911,7 @@ void X25519_public_from_private(uint8_t out_public_value[32],
  e[31] |= 64;

  ge_p3 A;
-  ge_scalarmult_base(&A, e);
+  x25519_ge_scalarmult_base(&A, e);

  /* We only need the u-coordinate of the curve25519 point. The map is
   * u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y). */
@@ -37,6 +37,70 @@ void x25519_NEON(uint8_t out[32], const uint8_t scalar[32],
                 const uint8_t point[32]);
 #endif

+/* fe means field element. Here the field is \Z/(2^255-19). An element t,
+ * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
+ * t[3]+2^102 t[4]+...+2^230 t[9]. The limb weights therefore step by 26 and
+ * 25 bits alternately. Bounds on each t[i] vary depending on context.  */
+typedef int32_t fe[10];
+
+/* ge means group element.
+ *
+ * Here the group is the set of pairs (x,y) of field elements (see |fe| above)
+ * satisfying -x^2 + y^2 = 1 + d x^2y^2
+ * where d = -121665/121666.
+ *
+ * Representations:
+ *   ge_p2 (projective): (X:Y:Z) satisfying x=X/Z, y=Y/Z
+ *   ge_p3 (extended): (X:Y:Z:T) satisfying x=X/Z, y=Y/Z, XY=ZT
+ *   ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T
+ *   ge_precomp (Duif): (y+x,y-x,2dxy)
+ *   ge_cached: judging by its field names, (Y+X,Y-X,Z,2dT) for a ge_p3 point;
+ *       it is the form taken as the second operand of x25519_ge_add and
+ *       x25519_ge_sub. */
+
+typedef struct {
+  fe X;
+  fe Y;
+  fe Z;
+} ge_p2;
+
+typedef struct {
+  fe X;
+  fe Y;
+  fe Z;
+  fe T;
+} ge_p3;
+
+typedef struct {
+  fe X;
+  fe Y;
+  fe Z;
+  fe T;
+} ge_p1p1;
+
+typedef struct {
+  fe yplusx;
+  fe yminusx;
+  fe xy2d;
+} ge_precomp;
+
+typedef struct {
+  fe YplusX;
+  fe YminusX;
+  fe Z;
+  fe T2d;
+} ge_cached;
+
+/* The functions below are the group and scalar primitives shared between the
+ * Ed25519/X25519 code and the SPAKE2 code. */
+
+/* x25519_ge_tobytes encodes |h| into |s|; callers pass a 32-byte buffer. */
+void x25519_ge_tobytes(uint8_t *s, const ge_p2 *h);
+/* x25519_ge_frombytes_vartime decodes the 32-byte encoding |s| into |h|. It
+ * returns zero on success and non-zero if |s| is not a point on the curve.
+ * As the name says, it is not constant-time. */
+int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t *s);
+/* Conversions between the point representations described above. */
+void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p);
+void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p);
+void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p);
+/* x25519_ge_add sets |r| to |p| + |q|. */
+void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
+/* x25519_ge_sub sets |r| to |p| - |q|. */
+void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
+/* x25519_ge_scalarmult_small_precomp sets |h| to |a| times the point that
+ * |precomp_table| was generated for. The table holds 15 points, each stored
+ * as two 32-byte values (x then y). */
+void x25519_ge_scalarmult_small_precomp(
+    ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]);
+/* x25519_ge_scalarmult_base sets |h| to |a| times the fixed base point
+ * (NOTE(review): inferred from the name and its use for public keys). */
+void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]);
+/* x25519_ge_scalarmult sets |r| to |scalar| times |A|. */
+void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A);
+/* x25519_sc_reduce reduces |s| modulo the group order l in place. Callers in
+ * this library pass a 64-byte buffer; the reduced value is then read from its
+ * first 32 bytes. */
+void x25519_sc_reduce(uint8_t *s);
+

 #if defined(__cplusplus)
 }  /* extern C */
@@ -0,0 +1,464 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/curve25519.h>
+
+#include <string.h>
+
+#include <openssl/bytestring.h>
+#include <openssl/mem.h>
+#include <openssl/rand.h>
+#include <openssl/sha.h>
+
+#include "internal.h"
+
+
+/* The precomputation tables below are for the two fixed points, N and M,
+ * used in the SPAKE2 protocol.
+ *
+ * N:
+ *   x: 49918732221787544735331783592030787422991506689877079631459872391322455579424
+ *   y: 54629554431565467720832445949441049581317094546788069926228343916274969994000
+ *   encoded: 10e3df0ae37d8e7a99b5fe74b44672103dbddcbd06af680d71329a11693bc778
+ *
+ * M:
+ *   x: 31406539342727633121250288103050113562375374900226415211311216773867585644232
+ *   y: 21177308356423958466833845032658859666296341766942662650232962324899758529114
+ *   encoded: 5ada7e4bf6ddd9adb6626d32131c6b5c51a1e347a3478f53cfcf441b88eed12e
+ *
+ * These points and their precomputation tables are generated with the
+ * following Python code. For a description of the precomputation table,
+ * see curve25519.c in this directory.
+ *
+ * Exact copies of the source code are kept in bug 27296743.
+ *
+ * import hashlib
+ * import ed25519 as E  # http://ed25519.cr.yp.to/python/ed25519.py
+ *
+ * SEED_N = 'edwards25519 point generation seed (N)'
+ * SEED_M = 'edwards25519 point generation seed (M)'
+ *
+ * def genpoint(seed):
+ *     v = hashlib.sha256(seed).digest()
+ *     it = 1
+ *     while True:
+ *         try:
+ *             x,y = E.decodepoint(v)
+ *         except Exception, e:
+ *             print e
+ *             it += 1
+ *             v = hashlib.sha256(v).digest()
+ *             continue
+ *         print "Found in %d iterations:" % it
+ *         print "  x = %d" % x
+ *         print "  y = %d" % y
+ *         print " Encoded (hex)"
+ *         print E.encodepoint((x,y)).encode('hex')
+ *         return (x,y)
+ *
+ * def gentable(P):
+ *     t = []
+ *     for i in range(1,16):
+ *         k = (i >> 3 & 1) * (1 << 192) + \
+ *             (i >> 2 & 1) * (1 << 128) + \
+ *             (i >> 1 & 1) * (1 <<  64) + \
+ *             (i      & 1)
+ *         t.append(E.scalarmult(P, k))
+ *     return ''.join(E.encodeint(x) + E.encodeint(y) for (x,y) in t)
+ *
+ * def printtable(table, name):
+ *     print "static const uint8_t %s[15 * 2 * 32] = {" % name,
+ *     for i in range(15 * 2 * 32):
+ *         if i % 12 == 0:
+ *             print "\n   ",
+ *         print " 0x%02x," % ord(table[i]),
+ *     print "\n};"
+ *
+ * if __name__ == "__main__":
+ *     print "Searching for N"
+ *     N = genpoint(SEED_N)
+ *     print "Generating precomputation table for N"
+ *     Ntable = gentable(N)
+ *     printtable(Ntable, "kSpakeNSmallPrecomp")
+ *
+ *     print "Searching for M"
+ *     M = genpoint(SEED_M)
+ *     print "Generating precomputation table for M"
+ *     Mtable = gentable(M)
+ *     printtable(Mtable, "kSpakeMSmallPrecomp")
+ */
+static const uint8_t kSpakeNSmallPrecomp[15 * 2 * 32] = {
+    0x20, 0x1b, 0xc5, 0xb3, 0x43, 0x17, 0x71, 0x10, 0x44, 0x1e, 0x73, 0xb3,
+    0xae, 0x3f, 0xbf, 0x9f, 0xf5, 0x44, 0xc8, 0x13, 0x8f, 0xd1, 0x01, 0xc2,
+    0x8a, 0x1a, 0x6d, 0xea, 0x4d, 0x00, 0x5d, 0x6e, 0x10, 0xe3, 0xdf, 0x0a,
+    0xe3, 0x7d, 0x8e, 0x7a, 0x99, 0xb5, 0xfe, 0x74, 0xb4, 0x46, 0x72, 0x10,
+    0x3d, 0xbd, 0xdc, 0xbd, 0x06, 0xaf, 0x68, 0x0d, 0x71, 0x32, 0x9a, 0x11,
+    0x69, 0x3b, 0xc7, 0x78, 0x93, 0xf1, 0x57, 0x97, 0x6e, 0xf0, 0x6e, 0x45,
+    0x37, 0x4a, 0xf4, 0x0b, 0x18, 0x51, 0xf5, 0x4f, 0x67, 0x3c, 0xdc, 0xec,
+    0x84, 0xed, 0xd0, 0xeb, 0xca, 0xfb, 0xdb, 0xff, 0x7f, 0xeb, 0xa8, 0x23,
+    0x68, 0x87, 0x13, 0x64, 0x6a, 0x10, 0xf7, 0x45, 0xe0, 0x0f, 0x32, 0x21,
+    0x59, 0x7c, 0x0e, 0x50, 0xad, 0x56, 0xd7, 0x12, 0x69, 0x7b, 0x58, 0xf8,
+    0xb9, 0x3b, 0xa5, 0xbb, 0x4d, 0x1b, 0x87, 0x1c, 0x46, 0xa7, 0x17, 0x9d,
+    0x6d, 0x84, 0x45, 0xbe, 0x7f, 0x95, 0xd2, 0x34, 0xcd, 0x89, 0x95, 0xc0,
+    0xf0, 0xd3, 0xdf, 0x6e, 0x10, 0x4a, 0xe3, 0x7b, 0xce, 0x7f, 0x40, 0x27,
+    0xc7, 0x2b, 0xab, 0x66, 0x03, 0x59, 0xb4, 0x7b, 0xc7, 0xc7, 0xf0, 0x39,
+    0x9a, 0x33, 0x35, 0xbf, 0xcc, 0x2f, 0xf3, 0x2e, 0x68, 0x9d, 0x53, 0x5c,
+    0x88, 0x52, 0xe3, 0x77, 0x90, 0xa1, 0x27, 0x85, 0xc5, 0x74, 0x7f, 0x23,
+    0x0e, 0x93, 0x01, 0x3e, 0xe7, 0x2e, 0x2e, 0x95, 0xf3, 0x0d, 0xc2, 0x25,
+    0x25, 0x39, 0x39, 0x3d, 0x6e, 0x8e, 0x89, 0xbd, 0xe8, 0xbb, 0x67, 0x5e,
+    0x8c, 0x66, 0x8b, 0x63, 0x28, 0x1e, 0x4e, 0x74, 0x85, 0xa8, 0xaf, 0x0f,
+    0x12, 0x5d, 0xb6, 0x8a, 0x83, 0x1a, 0x77, 0x76, 0x5e, 0x62, 0x8a, 0xa7,
+    0x3c, 0xb8, 0x05, 0x57, 0x2b, 0xaf, 0x36, 0x2e, 0x10, 0x90, 0xb2, 0x39,
+    0xb4, 0x3e, 0x75, 0x6d, 0x3a, 0xa8, 0x31, 0x35, 0xc2, 0x1e, 0x8f, 0xc2,
+    0x79, 0x89, 0x35, 0x16, 0x26, 0xd1, 0xc7, 0x0b, 0x04, 0x1f, 0x1d, 0xf9,
+    0x9c, 0x05, 0xa6, 0x6b, 0xb5, 0x19, 0x5a, 0x24, 0x6d, 0x91, 0xc5, 0x31,
+    0xfd, 0xc5, 0xfa, 0xe7, 0xa6, 0xcb, 0x0e, 0x4b, 0x18, 0x0d, 0x94, 0xc7,
+    0xee, 0x1d, 0x46, 0x1f, 0x92, 0xb1, 0xb2, 0x4a, 0x2b, 0x43, 0x37, 0xfe,
+    0xc2, 0x15, 0x11, 0x89, 0xef, 0x59, 0x73, 0x3c, 0x06, 0x76, 0x78, 0xcb,
+    0xa6, 0x0d, 0x79, 0x5f, 0x28, 0x0b, 0x5b, 0x8c, 0x9e, 0xe4, 0xaa, 0x51,
+    0x9a, 0x42, 0x6f, 0x11, 0x50, 0x3d, 0x01, 0xd6, 0x21, 0xc0, 0x99, 0x5e,
+    0x1a, 0xe8, 0x81, 0x25, 0x80, 0xeb, 0xed, 0x5d, 0x37, 0x47, 0x30, 0x70,
+    0xa0, 0x4e, 0x0b, 0x43, 0x17, 0xbe, 0xb6, 0x47, 0xe7, 0x2a, 0x62, 0x9d,
+    0x5d, 0xa6, 0xc5, 0x33, 0x62, 0x9d, 0x56, 0x24, 0x9d, 0x1d, 0xb2, 0x13,
+    0xbc, 0x17, 0x66, 0x43, 0xd1, 0x68, 0xd5, 0x3b, 0x17, 0x69, 0x17, 0xa6,
+    0x06, 0x9e, 0x12, 0xb8, 0x7c, 0xd5, 0xaf, 0x3e, 0x21, 0x1b, 0x31, 0xeb,
+    0x0b, 0xa4, 0x98, 0x1c, 0xf2, 0x6a, 0x5e, 0x7c, 0x9b, 0x45, 0x8f, 0xb2,
+    0x12, 0x06, 0xd5, 0x8c, 0x1d, 0xb2, 0xa7, 0x57, 0x5f, 0x2f, 0x4f, 0xdb,
+    0x52, 0x99, 0x7c, 0x58, 0x01, 0x5f, 0xf2, 0xa5, 0xf6, 0x51, 0x86, 0x21,
+    0x2f, 0x5b, 0x8d, 0x6a, 0xae, 0x83, 0x34, 0x6d, 0x58, 0x4b, 0xef, 0xfe,
+    0xbf, 0x73, 0x5d, 0xdb, 0xc4, 0x97, 0x2a, 0x85, 0xf3, 0x6c, 0x46, 0x42,
+    0xb3, 0x90, 0xc1, 0x57, 0x97, 0x50, 0x35, 0xb1, 0x9d, 0xb7, 0xc7, 0x3c,
+    0x85, 0x6d, 0x6c, 0xfd, 0xce, 0xb0, 0xc9, 0xa2, 0x77, 0xee, 0xc3, 0x6b,
+    0x0c, 0x37, 0xfa, 0x30, 0x91, 0xd1, 0x2c, 0xb8, 0x5e, 0x7f, 0x81, 0x5f,
+    0x87, 0xfd, 0x18, 0x02, 0x5a, 0x30, 0x4e, 0x62, 0xbc, 0x65, 0xc6, 0xce,
+    0x1a, 0xcf, 0x2b, 0xaa, 0x56, 0x3e, 0x4d, 0xcf, 0xba, 0x62, 0x5f, 0x9a,
+    0xd0, 0x72, 0xff, 0xef, 0x28, 0xbd, 0xbe, 0xd8, 0x57, 0x3d, 0xf5, 0x57,
+    0x7d, 0xe9, 0x71, 0x31, 0xec, 0x98, 0x90, 0x94, 0xd9, 0x54, 0xbf, 0x84,
+    0x0b, 0xe3, 0x06, 0x47, 0x19, 0x9a, 0x13, 0x1d, 0xef, 0x9d, 0x13, 0xf3,
+    0xdb, 0xc3, 0x5c, 0x72, 0x9e, 0xed, 0x24, 0xaa, 0x64, 0xed, 0xe7, 0x0d,
+    0xa0, 0x7c, 0x73, 0xba, 0x9b, 0x86, 0xa7, 0x3b, 0x55, 0xab, 0x58, 0x30,
+    0xf1, 0x15, 0x81, 0x83, 0x2f, 0xf9, 0x62, 0x84, 0x98, 0x66, 0xf6, 0x55,
+    0x21, 0xd8, 0xf2, 0x25, 0x64, 0x71, 0x4b, 0x12, 0x76, 0x59, 0xc5, 0xaa,
+    0x93, 0x67, 0xc3, 0x86, 0x25, 0xab, 0x4e, 0x4b, 0xf6, 0xd8, 0x3f, 0x44,
+    0x2e, 0x11, 0xe0, 0xbd, 0x6a, 0xf2, 0x5d, 0xf5, 0xf9, 0x53, 0xea, 0xa4,
+    0xc8, 0xd9, 0x50, 0x33, 0x81, 0xd9, 0xa8, 0x2d, 0x91, 0x7d, 0x13, 0x2a,
+    0x11, 0xcf, 0xde, 0x3f, 0x0a, 0xd2, 0xbc, 0x33, 0xb2, 0x62, 0x53, 0xea,
+    0x77, 0x88, 0x43, 0x66, 0x27, 0x43, 0x85, 0xe9, 0x5f, 0x55, 0xf5, 0x2a,
+    0x8a, 0xac, 0xdf, 0xff, 0x9b, 0x4c, 0x96, 0x9c, 0xa5, 0x7a, 0xce, 0xd5,
+    0x79, 0x18, 0xf1, 0x0b, 0x58, 0x95, 0x7a, 0xe7, 0xd3, 0x74, 0x65, 0x0b,
+    0xa4, 0x64, 0x30, 0xe8, 0x5c, 0xfc, 0x55, 0x56, 0xee, 0x14, 0x14, 0xd3,
+    0x45, 0x3b, 0xf8, 0xde, 0x05, 0x3e, 0xb9, 0x3c, 0xd7, 0x6a, 0x52, 0x72,
+    0x5b, 0x39, 0x09, 0xbe, 0x82, 0x23, 0x10, 0x4a, 0xb7, 0xc3, 0xdc, 0x4c,
+    0x5d, 0xc9, 0xf1, 0x14, 0x83, 0xf9, 0x0b, 0x9b, 0xe9, 0x23, 0x84, 0x6a,
+    0xc4, 0x08, 0x3d, 0xda, 0x3d, 0x12, 0x95, 0x87, 0x18, 0xa4, 0x7d, 0x3f,
+    0x23, 0xde, 0xd4, 0x1e, 0xa8, 0x47, 0xc3, 0x71, 0xdb, 0xf5, 0x03, 0x6c,
+    0x57, 0xe7, 0xa4, 0x43, 0x82, 0x33, 0x7b, 0x62, 0x46, 0x7d, 0xf7, 0x10,
+    0x69, 0x18, 0x38, 0x27, 0x9a, 0x6f, 0x38, 0xac, 0xfa, 0x92, 0xc5, 0xae,
+    0x66, 0xa6, 0x73, 0x95, 0x15, 0x0e, 0x4c, 0x04, 0xb6, 0xfc, 0xf5, 0xc7,
+    0x21, 0x3a, 0x99, 0xdb, 0x0e, 0x36, 0xf0, 0x56, 0xbc, 0x75, 0xf9, 0x87,
+    0x9b, 0x11, 0x18, 0x92, 0x64, 0x1a, 0xe7, 0xc7, 0xab, 0x5a, 0xc7, 0x26,
+    0x7f, 0x13, 0x98, 0x42, 0x52, 0x43, 0xdb, 0xc8, 0x6d, 0x0b, 0xb7, 0x31,
+    0x93, 0x24, 0xd6, 0xe8, 0x24, 0x1f, 0x6f, 0x21, 0xa7, 0x8c, 0xeb, 0xdb,
+    0x83, 0xb8, 0x89, 0xe3, 0xc1, 0xd7, 0x69, 0x3b, 0x02, 0x6b, 0x54, 0x0f,
+    0x84, 0x2f, 0xb5, 0x5c, 0x17, 0x77, 0xbe, 0xe5, 0x61, 0x0d, 0xc5, 0xdf,
+    0x3b, 0xcf, 0x3e, 0x93, 0x4f, 0xf5, 0x89, 0xb9, 0x5a, 0xc5, 0x29, 0x31,
+    0xc0, 0xc2, 0xff, 0xe5, 0x3f, 0xa6, 0xac, 0x03, 0xca, 0xf5, 0xff, 0xe0,
+    0x36, 0xce, 0xf3, 0xe2, 0xb7, 0x9c, 0x02, 0xe9, 0x9e, 0xd2, 0xbc, 0x87,
+    0x2f, 0x3d, 0x9a, 0x1d, 0x8f, 0xc5, 0x72, 0xb8, 0xa2, 0x01, 0xd4, 0x68,
+    0xb1, 0x84, 0x16, 0x10, 0xf6, 0xf3, 0x52, 0x25, 0xd9, 0xdc, 0x4c, 0xdd,
+    0x0f, 0xd6, 0x4a, 0xcf, 0x60, 0x96, 0x7e, 0xcc, 0x42, 0x0f, 0x64, 0x9d,
+    0x72, 0x46, 0x04, 0x07, 0xf2, 0x5b, 0xf4, 0x07, 0xd1, 0xf4, 0x59, 0x71,
+};
+
+static const uint8_t kSpakeMSmallPrecomp[15 * 2 * 32] = {
+    0xc8, 0xa6, 0x63, 0xc5, 0x97, 0xf1, 0xee, 0x40, 0xab, 0x62, 0x42, 0xee,
+    0x25, 0x6f, 0x32, 0x6c, 0x75, 0x2c, 0xa7, 0xd3, 0xbd, 0x32, 0x3b, 0x1e,
+    0x11, 0x9c, 0xbd, 0x04, 0xa9, 0x78, 0x6f, 0x45, 0x5a, 0xda, 0x7e, 0x4b,
+    0xf6, 0xdd, 0xd9, 0xad, 0xb6, 0x62, 0x6d, 0x32, 0x13, 0x1c, 0x6b, 0x5c,
+    0x51, 0xa1, 0xe3, 0x47, 0xa3, 0x47, 0x8f, 0x53, 0xcf, 0xcf, 0x44, 0x1b,
+    0x88, 0xee, 0xd1, 0x2e, 0x03, 0x89, 0xaf, 0xc0, 0x61, 0x2d, 0x9e, 0x35,
+    0xeb, 0x0e, 0x03, 0xe0, 0xb7, 0xfb, 0xa5, 0xbc, 0x44, 0xbe, 0x0c, 0x89,
+    0x0a, 0x0f, 0xd6, 0x59, 0x47, 0x9e, 0xe6, 0x3d, 0x36, 0x9d, 0xff, 0x44,
+    0x5e, 0xac, 0xab, 0xe5, 0x3a, 0xd5, 0xb0, 0x35, 0x9f, 0x6d, 0x7f, 0xba,
+    0xc0, 0x85, 0x0e, 0xf4, 0x70, 0x3f, 0x13, 0x90, 0x4c, 0x50, 0x1a, 0xee,
+    0xc5, 0xeb, 0x69, 0xfe, 0x98, 0x42, 0x87, 0x1d, 0xce, 0x6c, 0x29, 0xaa,
+    0x2b, 0x31, 0xc2, 0x38, 0x7b, 0x6b, 0xee, 0x88, 0x0b, 0xba, 0xce, 0xa8,
+    0xca, 0x19, 0x60, 0x1b, 0x16, 0xf1, 0x25, 0x1e, 0xcf, 0x63, 0x66, 0x1e,
+    0xbb, 0x63, 0xeb, 0x7d, 0xca, 0xd2, 0xb4, 0x23, 0x5a, 0x01, 0x6f, 0x05,
+    0xd1, 0xdc, 0x41, 0x73, 0x75, 0xc0, 0xfd, 0x30, 0x91, 0x52, 0x68, 0x96,
+    0x45, 0xb3, 0x66, 0x01, 0x3b, 0x53, 0x89, 0x3c, 0x69, 0xbc, 0x6c, 0x69,
+    0xe3, 0x51, 0x8f, 0xe3, 0xd2, 0x84, 0xd5, 0x28, 0x66, 0xb5, 0xe6, 0x06,
+    0x09, 0xfe, 0x6d, 0xb0, 0x72, 0x16, 0xe0, 0x8a, 0xce, 0x61, 0x65, 0xa9,
+    0x21, 0x32, 0x48, 0xdc, 0x7a, 0x1d, 0xe1, 0x38, 0x7f, 0x8c, 0x75, 0x88,
+    0x3d, 0x08, 0xa9, 0x4a, 0x6f, 0x3d, 0x9f, 0x7f, 0x3f, 0xbd, 0x57, 0x6b,
+    0x19, 0xce, 0x3f, 0x4a, 0xc9, 0xd3, 0xf9, 0x6e, 0x72, 0x7b, 0x5b, 0x74,
+    0xea, 0xbe, 0x9c, 0x7a, 0x6d, 0x9c, 0x40, 0x49, 0xe6, 0xfb, 0x2a, 0x1a,
+    0x75, 0x70, 0xe5, 0x4e, 0xed, 0x74, 0xe0, 0x75, 0xac, 0xc0, 0xb1, 0x11,
+    0x3e, 0xf2, 0xaf, 0x88, 0x4d, 0x66, 0xb6, 0xf6, 0x15, 0x4f, 0x3c, 0x6c,
+    0x77, 0xae, 0x47, 0x51, 0x63, 0x9a, 0xfe, 0xe1, 0xb4, 0x1a, 0x12, 0xdf,
+    0xe9, 0x54, 0x8d, 0x3b, 0x30, 0x2a, 0x75, 0xe3, 0xe5, 0x29, 0xb1, 0x4c,
+    0xb0, 0x7c, 0x6d, 0xb5, 0xae, 0x85, 0xdb, 0x1e, 0x38, 0x55, 0x96, 0xa5,
+    0x5b, 0x9f, 0x15, 0x23, 0x28, 0x36, 0xb8, 0xa2, 0x41, 0xb4, 0xd7, 0x19,
+    0x91, 0x8d, 0x26, 0x3e, 0xca, 0x9c, 0x05, 0x7a, 0x2b, 0x60, 0x45, 0x86,
+    0x8b, 0xee, 0x64, 0x6f, 0x5c, 0x09, 0x4d, 0x4b, 0x5a, 0x7f, 0xb0, 0xc3,
+    0x26, 0x9d, 0x8b, 0xb8, 0x83, 0x69, 0xcf, 0x16, 0x72, 0x62, 0x3e, 0x5e,
+    0x53, 0x4f, 0x9c, 0x73, 0x76, 0xfc, 0x19, 0xef, 0xa0, 0x74, 0x3a, 0x11,
+    0x1e, 0xd0, 0x4d, 0xb7, 0x87, 0xa1, 0xd6, 0x87, 0x6c, 0x0e, 0x6c, 0x8c,
+    0xe9, 0xa0, 0x44, 0xc4, 0x72, 0x3e, 0x73, 0x17, 0x13, 0xd1, 0x4e, 0x3d,
+    0x8e, 0x1d, 0x5a, 0x8b, 0x75, 0xcb, 0x59, 0x2c, 0x47, 0x87, 0x15, 0x41,
+    0xfe, 0x08, 0xe9, 0xa6, 0x97, 0x17, 0x08, 0x26, 0x6a, 0xb5, 0xbb, 0x73,
+    0xaa, 0xb8, 0x5b, 0x65, 0x65, 0x5b, 0x30, 0x9e, 0x62, 0x59, 0x02, 0xf8,
+    0xb8, 0x0f, 0x32, 0x10, 0xc1, 0x36, 0x08, 0x52, 0x98, 0x4a, 0x1e, 0xf0,
+    0xab, 0x21, 0x5e, 0xde, 0x16, 0x0c, 0xda, 0x09, 0x99, 0x6b, 0x9e, 0xc0,
+    0x90, 0xa5, 0x5a, 0xcc, 0xb0, 0xb7, 0xbb, 0xd2, 0x8b, 0x5f, 0xd3, 0x3b,
+    0x3e, 0x8c, 0xa5, 0x71, 0x66, 0x06, 0xe3, 0x28, 0xd4, 0xf8, 0x3f, 0xe5,
+    0x27, 0xdf, 0xfe, 0x0f, 0x09, 0xb2, 0x8a, 0x09, 0x5a, 0x23, 0x61, 0x0d,
+    0x2d, 0xf5, 0x44, 0xf1, 0x5c, 0xf8, 0x82, 0x4e, 0xdc, 0x78, 0x7a, 0xab,
+    0xc3, 0x57, 0x91, 0xaf, 0x65, 0x6e, 0x71, 0xf1, 0x44, 0xbf, 0xed, 0x43,
+    0x50, 0xb4, 0x67, 0x48, 0xef, 0x5a, 0x10, 0x46, 0x81, 0xb4, 0x0c, 0xc8,
+    0x48, 0xed, 0x99, 0x7a, 0x45, 0xa5, 0x92, 0xc3, 0x69, 0xd6, 0xd7, 0x8a,
+    0x20, 0x1b, 0xeb, 0x8f, 0xb2, 0xff, 0xec, 0x6d, 0x76, 0x04, 0xf8, 0xc2,
+    0x58, 0x9b, 0xf2, 0x20, 0x53, 0xc4, 0x74, 0x91, 0x19, 0xdd, 0x2d, 0x12,
+    0x53, 0xc7, 0x6e, 0xd0, 0x02, 0x51, 0x3c, 0xa6, 0x7d, 0x80, 0x75, 0x6b,
+    0x1d, 0xdf, 0xf8, 0x6a, 0x52, 0xbb, 0x81, 0xf8, 0x30, 0x45, 0xef, 0x51,
+    0x85, 0x36, 0xbe, 0x8e, 0xcf, 0x0b, 0x9a, 0x46, 0xe8, 0x3f, 0x99, 0xfd,
+    0xf7, 0xd9, 0x3e, 0x84, 0xe5, 0xe3, 0x37, 0xcf, 0x98, 0x7f, 0xeb, 0x5e,
+    0x5a, 0x53, 0x77, 0x1c, 0x20, 0xdc, 0xf1, 0x20, 0x99, 0xec, 0x60, 0x40,
+    0x93, 0xef, 0x5c, 0x1c, 0x81, 0xe2, 0xa5, 0xad, 0x2a, 0xc2, 0xdb, 0x6b,
+    0xc1, 0x7e, 0x8f, 0xa9, 0x23, 0x5b, 0xd9, 0x0d, 0xfe, 0xa0, 0xac, 0x11,
+    0x28, 0xba, 0x8e, 0x92, 0x07, 0x2d, 0x07, 0x40, 0x83, 0x14, 0x4c, 0x35,
+    0x8d, 0xd0, 0x11, 0xff, 0x98, 0xdb, 0x00, 0x30, 0x6f, 0x65, 0xb6, 0xa0,
+    0x7f, 0x9c, 0x08, 0xb8, 0xce, 0xb3, 0xa8, 0x42, 0xd3, 0x84, 0x45, 0xe1,
+    0xe3, 0x8f, 0xa6, 0x89, 0x21, 0xd7, 0x74, 0x02, 0x4d, 0x64, 0xdf, 0x54,
+    0x15, 0x9e, 0xba, 0x12, 0x49, 0x09, 0x41, 0xf6, 0x10, 0x24, 0xa1, 0x84,
+    0x15, 0xfd, 0x68, 0x6a, 0x57, 0x66, 0xb3, 0x6d, 0x4c, 0xea, 0xbf, 0xbc,
+    0x60, 0x3f, 0x52, 0x1c, 0x44, 0x1b, 0xc0, 0x4a, 0x25, 0xe3, 0xd9, 0x4c,
+    0x9a, 0x74, 0xad, 0xfc, 0x9e, 0x8d, 0x0b, 0x18, 0x66, 0x24, 0xd1, 0x06,
+    0xac, 0x68, 0xc1, 0xae, 0x14, 0xce, 0xb1, 0xf3, 0x86, 0x9f, 0x87, 0x11,
+    0xd7, 0x9f, 0x30, 0x92, 0xdb, 0xec, 0x0b, 0x4a, 0xe8, 0xf6, 0x53, 0x36,
+    0x68, 0x12, 0x11, 0x5e, 0xe0, 0x34, 0xa4, 0xff, 0x00, 0x0a, 0x26, 0xb8,
+    0x62, 0x79, 0x9c, 0x0c, 0xd5, 0xe5, 0xf5, 0x1c, 0x1a, 0x16, 0x84, 0x4d,
+    0x8e, 0x5d, 0x31, 0x7e, 0xf7, 0xe2, 0xd3, 0xa1, 0x41, 0x90, 0x61, 0x5d,
+    0x04, 0xb2, 0x9a, 0x18, 0x9e, 0x54, 0xfb, 0xd1, 0x61, 0x95, 0x1b, 0x08,
+    0xca, 0x7c, 0x49, 0x44, 0x74, 0x1d, 0x2f, 0xca, 0xc4, 0x7a, 0xe1, 0x8b,
+    0x2f, 0xbb, 0x96, 0xee, 0x19, 0x8a, 0x5d, 0xfb, 0x3e, 0x82, 0xe7, 0x15,
+    0xdb, 0x29, 0x14, 0xee, 0xc9, 0x4d, 0x9a, 0xfb, 0x9f, 0x8a, 0xbb, 0x17,
+    0x37, 0x1b, 0x6e, 0x28, 0x6c, 0xf9, 0xff, 0xb5, 0xb5, 0x8b, 0x9d, 0x88,
+    0x20, 0x08, 0x10, 0xd7, 0xca, 0x58, 0xf6, 0xe1, 0x32, 0x91, 0x6f, 0x36,
+    0xc0, 0xad, 0xc1, 0x57, 0x5d, 0x76, 0x31, 0x43, 0xf3, 0xdd, 0xec, 0xf1,
+    0xa9, 0x79, 0xe9, 0xe9, 0x85, 0xd7, 0x91, 0xc7, 0x31, 0x62, 0x3c, 0xd2,
+    0x90, 0x2c, 0x9c, 0xa4, 0x56, 0x37, 0x7b, 0xbe, 0x40, 0x58, 0xc0, 0x81,
+    0x83, 0x22, 0xe8, 0x13, 0x79, 0x18, 0xdb, 0x3a, 0x1b, 0x31, 0x0d, 0x00,
+    0x6c, 0x22, 0x62, 0x75, 0x70, 0xd8, 0x96, 0x59, 0x99, 0x44, 0x79, 0x71,
+    0xa6, 0x76, 0x81, 0x28, 0xb2, 0x65, 0xe8, 0x47, 0x14, 0xc6, 0x39, 0x06,
+};
+
+/* spake2_state_t records how far a SPAKE2_CTX has advanced through the
+ * protocol. Each API call checks for the state it requires and refuses to run
+ * otherwise, so a context is usable for exactly one exchange. */
+enum spake2_state_t {
+  /* Freshly created. The value is zero so that the zero-filled context
+   * produced by SPAKE2_CTX_new starts here. */
+  spake2_state_init = 0,
+  /* SPAKE2_generate_msg has succeeded; SPAKE2_process_msg may be called. */
+  spake2_state_msg_generated,
+  /* SPAKE2_process_msg has succeeded; the exchange is complete. */
+  spake2_state_key_generated,
+};
+
+/* spake2_ctx_st holds the state of one party in a single SPAKE2 exchange. */
+struct spake2_ctx_st {
+  /* Random scalar picked by SPAKE2_generate_msg, already multiplied by the
+   * cofactor (eight). */
+  uint8_t private_key[32];
+  /* The encoded message this side produced; retained because it is hashed
+   * into the final key by SPAKE2_process_msg. */
+  uint8_t my_msg[32];
+  /* First 32 bytes of the password hash after x25519_sc_reduce; used as the
+   * scalar for the N/M mask. */
+  uint8_t password_scalar[32];
+  /* Unreduced SHA-512 of the password; hashed into the final key. */
+  uint8_t password_hash[SHA512_DIGEST_LENGTH];
+  /* Heap copies of the two party names, made in SPAKE2_CTX_new and released
+   * in SPAKE2_CTX_free. */
+  uint8_t *my_name;
+  size_t my_name_len;
+  uint8_t *their_name;
+  size_t their_name_len;
+  /* Which side this context plays; selects between the N and M tables and
+   * fixes the order of the key-derivation transcript. */
+  enum spake2_role_t my_role;
+  /* Protocol progress; see |spake2_state_t|. */
+  enum spake2_state_t state;
+};
+
+/* SPAKE2_CTX_new allocates a zeroed context for |my_role| and stores private
+ * copies of |my_name| and |their_name|. It returns the new context, or NULL if
+ * any allocation fails. The caller releases it with SPAKE2_CTX_free. */
+SPAKE2_CTX *SPAKE2_CTX_new(enum spake2_role_t my_role,
+                           const uint8_t *my_name, size_t my_name_len,
+                           const uint8_t *their_name, size_t their_name_len) {
+  CBS name;
+  SPAKE2_CTX *const ret = OPENSSL_malloc(sizeof(*ret));
+  if (ret == NULL) {
+    return NULL;
+  }
+
+  /* Zeroing leaves |state| at spake2_state_init and the name pointers NULL,
+   * which keeps the error path below safe. */
+  memset(ret, 0, sizeof(*ret));
+  ret->my_role = my_role;
+
+  CBS_init(&name, my_name, my_name_len);
+  if (!CBS_stow(&name, &ret->my_name, &ret->my_name_len)) {
+    goto err;
+  }
+
+  CBS_init(&name, their_name, their_name_len);
+  if (!CBS_stow(&name, &ret->their_name, &ret->their_name_len)) {
+    goto err;
+  }
+
+  return ret;
+
+err:
+  SPAKE2_CTX_free(ret);
+  return NULL;
+}
+
+/* SPAKE2_CTX_free releases |ctx| together with the name copies it owns.
+ * Passing NULL is a no-op. */
+void SPAKE2_CTX_free(SPAKE2_CTX *ctx) {
+  if (ctx == NULL) {
+    return;
+  }
+
+  OPENSSL_free(ctx->my_name);
+  OPENSSL_free(ctx->their_name);
+  /* The context carries the private scalar, the password scalar and the
+   * password hash. Wipe it so that none of them linger in freed heap memory. */
+  OPENSSL_cleanse(ctx, sizeof(SPAKE2_CTX));
+  OPENSSL_free(ctx);
+}
+
+/* left_shift_3 multiplies the 32-byte little-endian integer |n| by eight in
+ * place. The three bits shifted out of the top byte are dropped. */
+static void left_shift_3(uint8_t n[32]) {
+  uint8_t spill = 0;
+  unsigned idx = 0;
+
+  while (idx < 32) {
+    const uint8_t byte = n[idx];
+    /* The low bits come from the previous byte's top three bits. */
+    n[idx] = (uint8_t)(byte << 3) | spill;
+    spill = byte >> 5;
+    idx++;
+  }
+}
+
+/* SPAKE2_generate_msg produces this side's protocol message: a fresh random
+ * point masked by the password scalar times M (for alice) or N (for bob).
+ *
+ * On success it writes sizeof(ctx->my_msg) (32) bytes to |out|, sets
+ * |*out_len| to that length, moves |ctx| to spake2_state_msg_generated and
+ * returns one. It returns zero, writing nothing, if |ctx| is not in its
+ * initial state or |max_out_len| is too small for the message. */
+int SPAKE2_generate_msg(SPAKE2_CTX *ctx, uint8_t *out, size_t *out_len,
+                         size_t max_out_len, const uint8_t *password,
+                         size_t password_len) {
+  if (ctx->state != spake2_state_init) {
+    return 0;
+  }
+
+  if (max_out_len < sizeof(ctx->my_msg)) {
+    return 0;
+  }
+
+  uint8_t private_tmp[64];
+  RAND_bytes(private_tmp, sizeof(private_tmp));
+  x25519_sc_reduce(private_tmp);
+  /* Multiply by the cofactor (eight) so that we'll clear it when operating on
+   * the peer's point later in the protocol. */
+  left_shift_3(private_tmp);
+  memcpy(ctx->private_key, private_tmp, sizeof(ctx->private_key));
+  OPENSSL_cleanse(private_tmp, sizeof(private_tmp));
+
+  ge_p3 P;
+  x25519_ge_scalarmult_base(&P, ctx->private_key);
+
+  /* mask = h(password) * <N or M>. */
+  uint8_t password_tmp[SHA512_DIGEST_LENGTH];
+  SHA512(password, password_len, password_tmp);
+  memcpy(ctx->password_hash, password_tmp, sizeof(ctx->password_hash));
+  x25519_sc_reduce(password_tmp);
+  memcpy(ctx->password_scalar, password_tmp, sizeof(ctx->password_scalar));
+  OPENSSL_cleanse(password_tmp, sizeof(password_tmp));
+
+  ge_p3 mask;
+  x25519_ge_scalarmult_small_precomp(&mask, ctx->password_scalar,
+                                     ctx->my_role == spake2_role_alice
+                                         ? kSpakeMSmallPrecomp
+                                         : kSpakeNSmallPrecomp);
+
+  /* P* = P + mask. */
+  ge_cached mask_cached;
+  x25519_ge_p3_to_cached(&mask_cached, &mask);
+  ge_p1p1 Pstar;
+  x25519_ge_add(&Pstar, &P, &mask_cached);
+
+  /* The unmasked point and the mask are secret: given the public P*, either
+   * one would let an observer test password guesses offline. Wipe the stack
+   * copies now that P* has been formed. */
+  OPENSSL_cleanse(&P, sizeof(P));
+  OPENSSL_cleanse(&mask, sizeof(mask));
+  OPENSSL_cleanse(&mask_cached, sizeof(mask_cached));
+
+  /* Encode P* */
+  ge_p2 Pstar_proj;
+  x25519_ge_p1p1_to_p2(&Pstar_proj, &Pstar);
+  x25519_ge_tobytes(ctx->my_msg, &Pstar_proj);
+
+  memcpy(out, ctx->my_msg, sizeof(ctx->my_msg));
+  *out_len = sizeof(ctx->my_msg);
+  ctx->state = spake2_state_msg_generated;
+
+  return 1;
+}
+
+/* update_with_length_prefix hashes |len| into |sha| as an eight-byte
+ * little-endian integer and then hashes the |len| bytes at |data|. The prefix
+ * keeps consecutive variable-length fields unambiguous. */
+static void update_with_length_prefix(SHA512_CTX *sha, const uint8_t *data,
+                                      const size_t len) {
+  uint8_t prefix[8];
+  size_t remaining = len;
+  unsigned pos = 0;
+
+  while (pos < 8) {
+    prefix[pos] = remaining & 0xff;
+    remaining >>= 8;
+    pos++;
+  }
+
+  SHA512_Update(sha, prefix, sizeof(prefix));
+  SHA512_Update(sha, data, len);
+}
+
+/* SPAKE2_process_msg consumes the peer's message and derives the shared key.
+ *
+ * It removes the peer's password mask from |their_msg|, multiplies the result
+ * by this side's private scalar, and hashes the transcript (both names, both
+ * messages, the shared point and the password hash, each length-prefixed)
+ * with SHA-512. The transcript is ordered alice-then-bob on both sides.
+ *
+ * On success it copies min(|max_out_key_len|, SHA512_DIGEST_LENGTH) bytes of
+ * the digest to |out_key|, sets |*out_key_len| to that count, moves |ctx| to
+ * spake2_state_key_generated and returns one. It returns zero if
+ * SPAKE2_generate_msg has not yet succeeded on |ctx|, if |their_msg_len| is
+ * not 32, or if |their_msg| does not decode to a curve point. */
+int SPAKE2_process_msg(SPAKE2_CTX *ctx, uint8_t *out_key, size_t *out_key_len,
+                       size_t max_out_key_len, const uint8_t *their_msg,
+                       size_t their_msg_len) {
+  if (ctx->state != spake2_state_msg_generated ||
+      their_msg_len != 32) {
+    return 0;
+  }
+
+  ge_p3 Qstar;
+  if (0 != x25519_ge_frombytes_vartime(&Qstar, their_msg)) {
+    /* Point received from peer was not on the curve. */
+    return 0;
+  }
+
+  /* Unmask peer's value. */
+  ge_p3 peers_mask;
+  x25519_ge_scalarmult_small_precomp(&peers_mask, ctx->password_scalar,
+                                     ctx->my_role == spake2_role_alice
+                                         ? kSpakeNSmallPrecomp
+                                         : kSpakeMSmallPrecomp);
+
+  ge_cached peers_mask_cached;
+  x25519_ge_p3_to_cached(&peers_mask_cached, &peers_mask);
+
+  ge_p1p1 Q_compl;
+  ge_p3 Q_ext;
+  x25519_ge_sub(&Q_compl, &Qstar, &peers_mask_cached);
+  x25519_ge_p1p1_to_p3(&Q_ext, &Q_compl);
+
+  ge_p2 dh_shared;
+  x25519_ge_scalarmult(&dh_shared, ctx->private_key, &Q_ext);
+
+  uint8_t dh_shared_encoded[32];
+  x25519_ge_tobytes(dh_shared_encoded, &dh_shared);
+
+  /* The mask, the unmasked peer point and the shared point are all derived
+   * from the password or the private scalar. Wipe the stack copies once the
+   * encoded shared point exists. */
+  OPENSSL_cleanse(&peers_mask, sizeof(peers_mask));
+  OPENSSL_cleanse(&peers_mask_cached, sizeof(peers_mask_cached));
+  OPENSSL_cleanse(&Q_compl, sizeof(Q_compl));
+  OPENSSL_cleanse(&Q_ext, sizeof(Q_ext));
+  OPENSSL_cleanse(&dh_shared, sizeof(dh_shared));
+
+  SHA512_CTX sha;
+  SHA512_Init(&sha);
+  if (ctx->my_role == spake2_role_alice) {
+    update_with_length_prefix(&sha, ctx->my_name, ctx->my_name_len);
+    update_with_length_prefix(&sha, ctx->their_name, ctx->their_name_len);
+    update_with_length_prefix(&sha, ctx->my_msg, sizeof(ctx->my_msg));
+    update_with_length_prefix(&sha, their_msg, 32);
+  } else {
+    update_with_length_prefix(&sha, ctx->their_name, ctx->their_name_len);
+    update_with_length_prefix(&sha, ctx->my_name, ctx->my_name_len);
+    update_with_length_prefix(&sha, their_msg, 32);
+    update_with_length_prefix(&sha, ctx->my_msg, sizeof(ctx->my_msg));
+  }
+  update_with_length_prefix(&sha, dh_shared_encoded, sizeof(dh_shared_encoded));
+  update_with_length_prefix(&sha, ctx->password_hash,
+                            sizeof(ctx->password_hash));
+
+  uint8_t key[SHA512_DIGEST_LENGTH];
+  SHA512_Final(key, &sha);
+
+  size_t to_copy = max_out_key_len;
+  if (to_copy > sizeof(key)) {
+    to_copy = sizeof(key);
+  }
+  memcpy(out_key, key, to_copy);
+  *out_key_len = to_copy;
+  ctx->state = spake2_state_key_generated;
+
+  /* The caller now owns the only copy of the key it asked for; clear the
+   * local copy, the encoded shared point and the hash state that absorbed
+   * them. */
+  OPENSSL_cleanse(key, sizeof(key));
+  OPENSSL_cleanse(dh_shared_encoded, sizeof(dh_shared_encoded));
+  OPENSSL_cleanse(&sha, sizeof(sha));
+
+  return 1;
+}
@@ -0,0 +1,169 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <string>
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <openssl/curve25519.h>
+#include "../test/scoped_types.h"
+
+
+struct SPAKE2Run {
+  bool Run() {
+    ScopedSPAKE2_CTX alice(SPAKE2_CTX_new(
+        spake2_role_alice,
+        reinterpret_cast<const uint8_t *>(alice_names.first.data()),
+        alice_names.first.size(),
+        reinterpret_cast<const uint8_t *>(alice_names.second.data()),
+        alice_names.second.size()));
+    ScopedSPAKE2_CTX bob(SPAKE2_CTX_new(
+        spake2_role_bob,
+        reinterpret_cast<const uint8_t *>(bob_names.first.data()),
+        bob_names.first.size(),
+        reinterpret_cast<const uint8_t *>(bob_names.second.data()),
+        bob_names.second.size()));
+
+    if (!alice || !bob) {
+      return false;
+    }
+
+    uint8_t alice_msg[SPAKE2_MAX_MSG_SIZE];
+    uint8_t bob_msg[SPAKE2_MAX_MSG_SIZE];
+    size_t alice_msg_len, bob_msg_len;
+
+    if (!SPAKE2_generate_msg(
+            alice.get(), alice_msg, &alice_msg_len, sizeof(alice_msg),
+            reinterpret_cast<const uint8_t *>(alice_password.data()),
+            alice_password.size()) ||
+        !SPAKE2_generate_msg(
+            bob.get(), bob_msg, &bob_msg_len, sizeof(bob_msg),
+            reinterpret_cast<const uint8_t *>(bob_password.data()),
+            bob_password.size())) {
+      return false;
+    }
+
+    if (alice_corrupt_msg_bit >= 0 &&
+        static_cast<size_t>(alice_corrupt_msg_bit) < 8 * alice_msg_len) {
+      alice_msg[alice_corrupt_msg_bit/8] ^= 1 << (alice_corrupt_msg_bit & 7);
+    }
+
+    uint8_t alice_key[64], bob_key[64];
+    size_t alice_key_len, bob_key_len;
+
+    if (!SPAKE2_process_msg(alice.get(), alice_key, &alice_key_len,
+                            sizeof(alice_key), bob_msg, bob_msg_len) ||
+        !SPAKE2_process_msg(bob.get(), bob_key, &bob_key_len, sizeof(bob_key),
+                            alice_msg, alice_msg_len)) {
+      return false;
+    }
+
+    key_matches_ = (alice_key_len == bob_key_len &&
+                    memcmp(alice_key, bob_key, alice_key_len) == 0);
+
+    return true;
+  }
+
+  bool key_matches() const {
+    return key_matches_;
+  }
+
+  std::string alice_password = "password";
+  std::string bob_password = "password";
+  std::pair<std::string, std::string> alice_names = {"alice", "bob"};
+  std::pair<std::string, std::string> bob_names = {"bob", "alice"};
+  int alice_corrupt_msg_bit = -1;
+
+ private:
+  bool key_matches_ = false;
+};
+
+static bool TestSPAKE2() {
+  for (unsigned i = 0; i < 20; i++) {
+    SPAKE2Run spake2;
+    if (!spake2.Run()) {
+      fprintf(stderr, "TestSPAKE2: SPAKE2 failed.\n");
+      return false;
+    }
+
+    if (!spake2.key_matches()) {
+      fprintf(stderr, "Key didn't match for equal passwords.\n");
+      return false;
+    }
+  }
+
+  return true;
+}
+
+static bool TestWrongPassword() {
+  SPAKE2Run spake2;
+  spake2.bob_password = "wrong password";  /* Alice keeps "password". */
+  if (!spake2.Run()) {
+    fprintf(stderr, "TestWrongPassword: SPAKE2 failed.\n");
+    return false;
+  }
+  /* The handshake completes, but the two sides must derive different keys. */
+  if (spake2.key_matches()) {
+    fprintf(stderr, "Key matched for unequal passwords.\n");
+    return false;
+  }
+
+  return true;
+}
+
+static bool TestWrongNames() {
+  SPAKE2Run spake2;
+  spake2.alice_names.second = "charlie";  /* default is "bob" */
+  spake2.bob_names.second = "charlie";  /* default is "alice" */
+  if (!spake2.Run()) {
+    fprintf(stderr, "TestWrongNames: SPAKE2 failed.\n");
+    return false;
+  }
+  /* The handshake completes, but the two sides must derive different keys. */
+  if (spake2.key_matches()) {
+    fprintf(stderr, "Key matched for unequal names.\n");
+    return false;
+  }
+
+  return true;
+}
+
+static bool TestCorruptMessages() {
+  for (int i = 0; i < 8 * SPAKE2_MAX_MSG_SIZE; i++) {
+    SPAKE2Run spake2;
+    spake2.alice_corrupt_msg_bit = i;
+    if (spake2.Run() && spake2.key_matches()) {
+      fprintf(stderr, "Passed after corrupting Alice's message, bit %d\n", i);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/* TODO(agl): add tests with fixed vectors once SPAKE2 is nailed down. */
+
+int main(int argc, char **argv) {
+  if (!TestSPAKE2() ||
+      !TestWrongPassword() ||
+      !TestWrongNames() ||
+      !TestCorruptMessages()) {
+    return 1;
+  }
+
+  printf("PASS\n");
+  return 0;
+}
@@ -60,7 +60,6 @@
 #include <string.h>

 #include <openssl/err.h>
-#include <openssl/obj.h>
 #include <openssl/mem.h>

 #include "internal.h"
@@ -166,6 +165,7 @@ int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *engine) {
  if (ctx->digest != type) {
    if (ctx->digest && ctx->digest->ctx_size > 0) {
      OPENSSL_free(ctx->md_data);
+      ctx->md_data = NULL;
    }
    ctx->digest = type;
    if (type->ctx_size > 0) {
@@ -248,6 +248,7 @@ int i2d_DSA_SIG(const DSA_SIG *in, uint8_t **outp) {
  CBB cbb;
  if (!CBB_init(&cbb, 0) ||
      !DSA_SIG_marshal(&cbb, in)) {
+    CBB_cleanup(&cbb);
    return -1;
  }
  return CBB_finish_i2d(&cbb, outp);
@@ -275,6 +276,7 @@ int i2d_DSAPublicKey(const DSA *in, uint8_t **outp) {
  CBB cbb;
  if (!CBB_init(&cbb, 0) ||
      !DSA_marshal_public_key(&cbb, in)) {
+    CBB_cleanup(&cbb);
    return -1;
  }
  return CBB_finish_i2d(&cbb, outp);
@@ -302,6 +304,7 @@ int i2d_DSAPrivateKey(const DSA *in, uint8_t **outp) {
  CBB cbb;
  if (!CBB_init(&cbb, 0) ||
      !DSA_marshal_private_key(&cbb, in)) {
+    CBB_cleanup(&cbb);
    return -1;
  }
  return CBB_finish_i2d(&cbb, outp);
@@ -329,6 +332,7 @@ int i2d_DSAparams(const DSA *in, uint8_t **outp) {
  CBB cbb;
  if (!CBB_init(&cbb, 0) ||
      !DSA_marshal_parameters(&cbb, in)) {
+    CBB_cleanup(&cbb);
    return -1;
  }
  return CBB_finish_i2d(&cbb, outp);
@@ -1729,6 +1729,7 @@ $code.=<<___;
 	push	%r15
 	sub	\$32*5+8, %rsp

+.Lpoint_double_shortcut$x:
 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
 	mov	$a_ptr, $b_ptr			# backup copy
 	movdqu	0x10($a_ptr), %xmm1
@@ -2019,6 +2020,7 @@ $code.=<<___;
 	 mov	0x40+8*1($b_ptr), $acc6
 	 mov	0x40+8*2($b_ptr), $acc7
 	 mov	0x40+8*3($b_ptr), $acc0
+	movq	$b_ptr, %xmm1

 	lea	0x40-$bias($b_ptr), $a_ptr
 	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
@@ -2074,7 +2076,7 @@ $code.=<<___;
 	test	$acc0, $acc0
 	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
 	test	$acc1, $acc1
-	jz	.Ladd_proceed$x			# is_equal(S1,S2)?
+	jz	.Ladd_double$x			# is_equal(S1,S2)?

 	movq	%xmm0, $r_ptr			# restore $r_ptr
 	pxor	%xmm0, %xmm0
@@ -2086,6 +2088,13 @@ $code.=<<___;
 	movdqu	%xmm0, 0x50($r_ptr)
 	jmp	.Ladd_done$x

+.align	32
+.Ladd_double$x:
+	movq	%xmm1, $a_ptr			# restore $a_ptr
+	movq	%xmm0, $r_ptr			# restore $r_ptr
+	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
+	jmp	.Lpoint_double_shortcut$x
+
 .align	32
 .Ladd_proceed$x:
 	`&load_for_sqr("$R(%rsp)", "$src0")`
@@ -73,7 +73,7 @@
 #include <openssl/bn.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>

 #include "internal.h"
 #include "../internal.h"
@@ -228,10 +228,25 @@ static const struct curve_data P521 = {
 #endif

 const struct built_in_curve OPENSSL_built_in_curves[] = {
-    {NID_secp521r1, &P521, 0},
-    {NID_secp384r1, &P384, 0},
    {
-        NID_X9_62_prime256v1, &P256,
+        NID_secp521r1,
+        /* 1.3.132.0.35 */
+        {0x2b, 0x81, 0x04, 0x00, 0x23}, 5,
+        &P521,
+        NULL,
+      },
+    {
+        NID_secp384r1,
+        /* 1.3.132.0.34 */
+        {0x2b, 0x81, 0x04, 0x00, 0x22}, 5,
+        &P384,
+        NULL,
+    },
+    {
+        NID_X9_62_prime256v1,
+        /* 1.2.840.10045.3.1.7 */
+        {0x2a, 0x86, 0x48, 0xce, 0x3d, 0x03, 0x01, 0x07}, 8,
+        &P256,
 #if defined(BORINGSSL_USE_INT128_CODE)
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
    !defined(OPENSSL_SMALL)
@@ -240,18 +255,21 @@ const struct built_in_curve OPENSSL_built_in_curves[] = {
        EC_GFp_nistp256_method,
 #endif
 #else
-        0,
+        NULL,
 #endif
    },
    {
-        NID_secp224r1, &P224,
+        NID_secp224r1,
+        /* 1.3.132.0.33 */
+        {0x2b, 0x81, 0x04, 0x00, 0x21}, 5,
+        &P224,
 #if defined(BORINGSSL_USE_INT128_CODE) && !defined(OPENSSL_SMALL)
        EC_GFp_nistp224_method,
 #else
-        0,
+        NULL,
 #endif
    },
-    {NID_undef, 0, 0},
+    {NID_undef, {0}, 0, NULL, NULL},
 };

 /* built_in_curve_scalar_field_monts contains Montgomery contexts for
@@ -350,8 +368,8 @@ EC_GROUP *ec_group_new(const EC_METHOD *meth) {
  return ret;
 }

-static EC_GROUP *ec_group_new_curve_GFp(const BIGNUM *p, const BIGNUM *a,
-                                        const BIGNUM *b, BN_CTX *ctx) {
+EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a,
+                                 const BIGNUM *b, BN_CTX *ctx) {
  const EC_METHOD *meth = EC_GFp_mont_method();
  EC_GROUP *ret;

@@ -371,35 +389,49 @@ static EC_GROUP *ec_group_new_curve_GFp(const BIGNUM *p, const BIGNUM *a,
  return ret;
 }

+int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
+                           const BIGNUM *order, const BIGNUM *cofactor) {
+  if (group->curve_name != NID_undef || group->generator != NULL) {
+    /* |EC_GROUP_set_generator| may only be used with |EC_GROUP|s returned by
+     * |EC_GROUP_new_curve_GFp| and may only be used once on each group. */
+    return 0;
+  }
+  /* Copy |generator|, |order| and |cofactor| into |group|. */
+  group->generator = EC_POINT_new(group);
+  return group->generator != NULL &&
+         EC_POINT_copy(group->generator, generator) &&
+         BN_copy(&group->order, order) &&
+         BN_copy(&group->cofactor, cofactor);
+}
+
 EC_GROUP *EC_GROUP_new_arbitrary(const BIGNUM *p, const BIGNUM *a,
                                 const BIGNUM *b, const BIGNUM *gx,
                                 const BIGNUM *gy, const BIGNUM *order,
                                 const BIGNUM *cofactor) {
-  EC_GROUP *ret = NULL;
-  BN_CTX *ctx;
-
-  ctx = BN_CTX_new();
+  BN_CTX *ctx = BN_CTX_new();
  if (ctx == NULL) {
-    goto err;
+    return NULL;
  }

-  ret = ec_group_new_curve_GFp(p, a, b, ctx);
+  EC_POINT *generator = NULL;
+  EC_GROUP *ret = EC_GROUP_new_curve_GFp(p, a, b, ctx);
  if (ret == NULL) {
    goto err;
  }

-  ret->generator = EC_POINT_new(ret);
-  if (ret->generator == NULL ||
-      !EC_POINT_set_affine_coordinates_GFp(ret, ret->generator, gx, gy, ctx) ||
-      !BN_copy(&ret->order, order) ||
-      !BN_copy(&ret->cofactor, cofactor)) {
+  generator = EC_POINT_new(ret);
+  if (generator == NULL ||
+      !EC_POINT_set_affine_coordinates_GFp(ret, generator, gx, gy, ctx) ||
+      !EC_GROUP_set_generator(ret, generator, order, cofactor)) {
    goto err;
  }

+  EC_POINT_free(generator);
  BN_CTX_free(ctx);
  return ret;

 err:
+  EC_POINT_free(generator);
  EC_GROUP_free(ret);
  BN_CTX_free(ctx);
  return NULL;
@@ -438,7 +470,7 @@ static EC_GROUP *ec_group_new_from_data(unsigned built_in_index) {
      goto err;
    }
  } else {
-    if ((group = ec_group_new_curve_GFp(p, a, b, ctx)) == NULL) {
+    if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) {
      OPENSSL_PUT_ERROR(EC, ERR_R_EC_LIB);
      goto err;
    }
@@ -60,7 +60,7 @@
 #include <openssl/bn.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>

 #include "internal.h"
 #include "../bytestring/internal.h"
@@ -207,14 +207,9 @@ int EC_KEY_marshal_private_key(CBB *cbb, const EC_KEY *key,
  }

  if (!(enc_flags & EC_PKEY_NO_PARAMETERS)) {
-    int curve_nid = EC_GROUP_get_curve_name(key->group);
-    if (curve_nid == NID_undef) {
-      OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP);
-      return 0;
-    }
    CBB child;
    if (!CBB_add_asn1(&ec_private_key, &child, kParametersTag) ||
-        !OBJ_nid2cbb(&child, curve_nid) ||
+        !EC_KEY_marshal_curve_name(&child, key->group) ||
        !CBB_flush(&ec_private_key)) {
      OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR);
      return 0;
@@ -260,6 +255,9 @@ static int is_unsigned_integer(const CBS *cbs) {
  return 1;
 }

+/* kPrimeField is the encoding of the OID 1.2.840.10045.1.1 (prime-field). */
+static const uint8_t kPrimeField[] = {0x2a, 0x86, 0x48, 0xce, 0x3d, 0x01, 0x01};
+
 static int parse_explicit_prime_curve(CBS *in, CBS *out_prime, CBS *out_a,
                                      CBS *out_b, CBS *out_base_x,
                                      CBS *out_base_y, CBS *out_order) {
@@ -272,7 +270,8 @@ static int parse_explicit_prime_curve(CBS *in, CBS *out_prime, CBS *out_a,
      version != 1 ||
      !CBS_get_asn1(&params, &field_id, CBS_ASN1_SEQUENCE) ||
      !CBS_get_asn1(&field_id, &field_type, CBS_ASN1_OBJECT) ||
-      OBJ_cbs2nid(&field_type) != NID_X9_62_prime_field ||
+      CBS_len(&field_type) != sizeof(kPrimeField) ||
+      memcmp(CBS_data(&field_type), kPrimeField, sizeof(kPrimeField)) != 0 ||
      !CBS_get_asn1(&field_id, out_prime, CBS_ASN1_INTEGER) ||
      !is_unsigned_integer(out_prime) ||
      CBS_len(&field_id) != 0 ||
@@ -324,51 +323,86 @@ static int integers_equal(const CBS *a, const uint8_t *b, size_t b_len) {
  return CBS_mem_equal(&a_copy, b, b_len);
 }

-EC_GROUP *EC_KEY_parse_parameters(CBS *cbs) {
-  if (CBS_peek_asn1_tag(cbs, CBS_ASN1_SEQUENCE)) {
-    /* OpenSSL sometimes produces ECPrivateKeys with explicitly-encoded versions
-     * of named curves.
-     *
-     * TODO(davidben): Remove support for this. */
-    CBS prime, a, b, base_x, base_y, order;
-    if (!parse_explicit_prime_curve(cbs, &prime, &a, &b, &base_x, &base_y,
-                                    &order)) {
-      return NULL;
-    }
-
-    /* Look for a matching prime curve. */
-    unsigned i;
-    for (i = 0; OPENSSL_built_in_curves[i].nid != NID_undef; i++) {
-      const struct built_in_curve *curve = &OPENSSL_built_in_curves[i];
-      const unsigned param_len = curve->data->param_len;
-      /* |curve->data->data| is ordered p, a, b, x, y, order, each component
-       * zero-padded up to the field length. Although SEC 1 states that the
-       * Field-Element-to-Octet-String conversion also pads, OpenSSL mis-encodes
-       * |a| and |b|, so this comparison must allow omitting leading zeros.
-       * (This is relevant for P-521 whose |b| has a leading 0.) */
-      if (integers_equal(&prime, curve->data->data, param_len) &&
-          integers_equal(&a, curve->data->data + param_len, param_len) &&
-          integers_equal(&b, curve->data->data + param_len * 2, param_len) &&
-          integers_equal(&base_x, curve->data->data + param_len * 3,
-                         param_len) &&
-          integers_equal(&base_y, curve->data->data + param_len * 4,
-                         param_len) &&
-          integers_equal(&order, curve->data->data + param_len * 5,
-                         param_len)) {
-        return EC_GROUP_new_by_curve_name(curve->nid);
-      }
-    }
-
-    OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP);
-    return NULL;
-  }
-
+EC_GROUP *EC_KEY_parse_curve_name(CBS *cbs) {
  CBS named_curve;
  if (!CBS_get_asn1(cbs, &named_curve, CBS_ASN1_OBJECT)) {
    OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR);
    return NULL;
  }
-  return EC_GROUP_new_by_curve_name(OBJ_cbs2nid(&named_curve));
+
+  /* Look for a matching curve. */
+  unsigned i;
+  for (i = 0; OPENSSL_built_in_curves[i].nid != NID_undef; i++) {
+    const struct built_in_curve *curve = &OPENSSL_built_in_curves[i];
+    if (CBS_len(&named_curve) == curve->oid_len &&
+        memcmp(CBS_data(&named_curve), curve->oid, curve->oid_len) == 0) {
+      return EC_GROUP_new_by_curve_name(curve->nid);
+    }
+  }
+
+  OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP);
+  return NULL;
+}
+
+int EC_KEY_marshal_curve_name(CBB *cbb, const EC_GROUP *group) {
+  int nid = EC_GROUP_get_curve_name(group);
+  if (nid == NID_undef) {
+    OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP);
+    return 0;
+  }
+
+  unsigned i;
+  for (i = 0; OPENSSL_built_in_curves[i].nid != NID_undef; i++) {
+    const struct built_in_curve *curve = &OPENSSL_built_in_curves[i];
+    if (curve->nid == nid) {
+      CBB child;
+      return CBB_add_asn1(cbb, &child, CBS_ASN1_OBJECT) &&
+             CBB_add_bytes(&child, curve->oid, curve->oid_len) &&
+             CBB_flush(cbb);
+    }
+  }
+
+  OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP);
+  return 0;
+}
+
+EC_GROUP *EC_KEY_parse_parameters(CBS *cbs) {
+  if (!CBS_peek_asn1_tag(cbs, CBS_ASN1_SEQUENCE)) {
+    return EC_KEY_parse_curve_name(cbs);
+  }
+
+  /* OpenSSL sometimes produces ECPrivateKeys with explicitly-encoded versions
+   * of named curves.
+   *
+   * TODO(davidben): Remove support for this. */
+  CBS prime, a, b, base_x, base_y, order;
+  if (!parse_explicit_prime_curve(cbs, &prime, &a, &b, &base_x, &base_y,
+                                  &order)) {
+    return NULL;
+  }
+
+  /* Look for a matching prime curve. */
+  unsigned i;
+  for (i = 0; OPENSSL_built_in_curves[i].nid != NID_undef; i++) {
+    const struct built_in_curve *curve = &OPENSSL_built_in_curves[i];
+    const unsigned param_len = curve->data->param_len;
+    /* |curve->data->data| is ordered p, a, b, x, y, order, each component
+     * zero-padded up to the field length. Although SEC 1 states that the
+     * Field-Element-to-Octet-String conversion also pads, OpenSSL mis-encodes
+     * |a| and |b|, so this comparison must allow omitting leading zeros. (This
+     * is relevant for P-521 whose |b| has a leading 0.) */
+    if (integers_equal(&prime, curve->data->data, param_len) &&
+        integers_equal(&a, curve->data->data + param_len, param_len) &&
+        integers_equal(&b, curve->data->data + param_len * 2, param_len) &&
+        integers_equal(&base_x, curve->data->data + param_len * 3, param_len) &&
+        integers_equal(&base_y, curve->data->data + param_len * 4, param_len) &&
+        integers_equal(&order, curve->data->data + param_len * 5, param_len)) {
+      return EC_GROUP_new_by_curve_name(curve->nid);
+    }
+  }
+
+  OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP);
+  return NULL;
 }

 EC_KEY *d2i_ECPrivateKey(EC_KEY **out, const uint8_t **inp, long len) {
@@ -401,6 +435,7 @@ int i2d_ECPrivateKey(const EC_KEY *key, uint8_t **outp) {
  CBB cbb;
  if (!CBB_init(&cbb, 0) ||
      !EC_KEY_marshal_private_key(&cbb, key, EC_KEY_get_enc_flags(key))) {
+    CBB_cleanup(&cbb);
    return -1;
  }
  return CBB_finish_i2d(&cbb, outp);
@@ -440,15 +475,10 @@ int i2d_ECParameters(const EC_KEY *key, uint8_t **outp) {
    return -1;
  }

-  int curve_nid = EC_GROUP_get_curve_name(key->group);
-  if (curve_nid == NID_undef) {
-    OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP);
-    return -1;
-  }
-
  CBB cbb;
  if (!CBB_init(&cbb, 0) ||
-      !OBJ_nid2cbb(&cbb, curve_nid)) {
+      !EC_KEY_marshal_curve_name(&cbb, key->group)) {
+    CBB_cleanup(&cbb);
    return -1;
  }
  return CBB_finish_i2d(&cbb, outp);
@@ -365,15 +365,24 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x,
    return 0;
  }
  ctx = BN_CTX_new();
+
+  if (ctx == NULL) {
+    return 0;
+  }
+
+  BN_CTX_start(ctx);
  point = EC_POINT_new(key->group);

-  if (ctx == NULL ||
-      point == NULL) {
+  if (point == NULL) {
    goto err;
  }

  tx = BN_CTX_get(ctx);
  ty = BN_CTX_get(ctx);
+  if (tx == NULL ||
+      ty == NULL) {
+    goto err;
+  }

  if (!EC_POINT_set_affine_coordinates_GFp(key->group, point, x, y, ctx) ||
      !EC_POINT_get_affine_coordinates_GFp(key->group, point, tx, ty, ctx)) {
@@ -398,6 +407,7 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x,
  ok = 1;

 err:
+  BN_CTX_end(ctx);
  BN_CTX_free(ctx);
  EC_POINT_free(point);
  return ok;
@@ -79,23 +79,18 @@ int ec_GFp_mont_group_init(EC_GROUP *group) {

  ok = ec_GFp_simple_group_init(group);
  group->mont = NULL;
-  group->one = NULL;
  return ok;
 }

 void ec_GFp_mont_group_finish(EC_GROUP *group) {
  BN_MONT_CTX_free(group->mont);
  group->mont = NULL;
-  BN_free(group->one);
-  group->one = NULL;
  ec_GFp_simple_group_finish(group);
 }

 int ec_GFp_mont_group_copy(EC_GROUP *dest, const EC_GROUP *src) {
  BN_MONT_CTX_free(dest->mont);
  dest->mont = NULL;
-  BN_clear_free(dest->one);
-  dest->one = NULL;

  if (!ec_GFp_simple_group_copy(dest, src)) {
    return 0;
@@ -110,12 +105,6 @@ int ec_GFp_mont_group_copy(EC_GROUP *dest, const EC_GROUP *src) {
      goto err;
    }
  }
-  if (src->one != NULL) {
-    dest->one = BN_dup(src->one);
-    if (dest->one == NULL) {
-      goto err;
-    }
-  }

  return 1;

@@ -129,13 +118,10 @@ int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p,
                                const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  BN_CTX *new_ctx = NULL;
  BN_MONT_CTX *mont = NULL;
-  BIGNUM *one = NULL;
  int ret = 0;

  BN_MONT_CTX_free(group->mont);
  group->mont = NULL;
-  BN_free(group->one);
-  group->one = NULL;

  if (ctx == NULL) {
    ctx = new_ctx = BN_CTX_new();
@@ -152,29 +138,20 @@ int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p,
    OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
    goto err;
  }
-  one = BN_new();
-  if (one == NULL || !BN_to_montgomery(one, BN_value_one(), mont, ctx)) {
-    goto err;
-  }

  group->mont = mont;
  mont = NULL;
-  group->one = one;
-  one = NULL;

  ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);

  if (!ret) {
    BN_MONT_CTX_free(group->mont);
    group->mont = NULL;
-    BN_free(group->one);
-    group->one = NULL;
  }

 err:
  BN_CTX_free(new_ctx);
  BN_MONT_CTX_free(mont);
-  BN_free(one);
  return ret;
 }

@@ -218,19 +195,6 @@ int ec_GFp_mont_field_decode(const EC_GROUP *group, BIGNUM *r, const BIGNUM *a,
  return BN_from_montgomery(r, a, group->mont, ctx);
 }

-int ec_GFp_mont_field_set_to_one(const EC_GROUP *group, BIGNUM *r,
-                                 BN_CTX *ctx) {
-  if (group->one == NULL) {
-    OPENSSL_PUT_ERROR(EC, EC_R_NOT_INITIALIZED);
-    return 0;
-  }
-
-  if (!BN_copy(r, group->one)) {
-    return 0;
-  }
-  return 1;
-}
-
 static int ec_GFp_mont_check_pub_key_order(const EC_GROUP *group,
                                           const EC_POINT* pub_key,
                                           BN_CTX *ctx) {
@@ -251,20 +215,98 @@ err:
  return ret;
 }

+static int ec_GFp_mont_point_get_affine_coordinates(const EC_GROUP *group,
+                                                    const EC_POINT *point,
+                                                    BIGNUM *x, BIGNUM *y,
+                                                    BN_CTX *ctx) {
+  BN_CTX *new_ctx = NULL;
+  BIGNUM *Z, *Z_1, *Z_2, *Z_3;
+  int ret = 0;
+
+  if (EC_POINT_is_at_infinity(group, point)) {
+    OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
+    return 0;
+  }
+
+  if (ctx == NULL) {
+    ctx = new_ctx = BN_CTX_new();
+    if (ctx == NULL) {
+      return 0;
+    }
+  }
+
+  BN_CTX_start(ctx);
+  Z = BN_CTX_get(ctx);
+  Z_1 = BN_CTX_get(ctx);
+  Z_2 = BN_CTX_get(ctx);
+  Z_3 = BN_CTX_get(ctx);
+  if (Z == NULL || Z_1 == NULL || Z_2 == NULL || Z_3 == NULL) {
+    goto err;
+  }
+
+  /* transform  (X, Y, Z)  into  (x, y) := (X/Z^2, Y/Z^3) */
+
+  if (!group->meth->field_decode(group, Z, &point->Z, ctx)) {
+    goto err;
+  }
+
+  if (BN_is_one(Z)) {
+    if (x != NULL && !group->meth->field_decode(group, x, &point->X, ctx)) {
+      goto err;
+    }
+    if (y != NULL && !group->meth->field_decode(group, y, &point->Y, ctx)) {
+      goto err;
+    }
+  } else {
+    if (!BN_mod_inverse(Z_1, Z, &group->field, ctx)) {
+      OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
+      goto err;
+    }
+
+    if (!BN_mod_sqr(Z_2, Z_1, &group->field, ctx)) {
+      goto err;
+    }
+
+    /* in the Montgomery case, field_mul will cancel out Montgomery factor in
+     * X: */
+    if (x != NULL && !group->meth->field_mul(group, x, &point->X, Z_2, ctx)) {
+      goto err;
+    }
+
+    if (y != NULL) {
+      if (!BN_mod_mul(Z_3, Z_2, Z_1, &group->field, ctx)) {
+        goto err;
+      }
+
+      /* in the Montgomery case, field_mul will cancel out Montgomery factor in
+       * Y: */
+      if (!group->meth->field_mul(group, y, &point->Y, Z_3, ctx)) {
+        goto err;
+      }
+    }
+  }
+
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  BN_CTX_free(new_ctx);
+  return ret;
+}
+
 const EC_METHOD *EC_GFp_mont_method(void) {
  static const EC_METHOD ret = {
    ec_GFp_mont_group_init,
    ec_GFp_mont_group_finish,
    ec_GFp_mont_group_copy,
    ec_GFp_mont_group_set_curve,
-    ec_GFp_simple_point_get_affine_coordinates,
+    ec_GFp_mont_point_get_affine_coordinates,
    ec_wNAF_mul /* XXX: Not constant time. */,
    ec_GFp_mont_check_pub_key_order,
    ec_GFp_mont_field_mul,
    ec_GFp_mont_field_sqr,
    ec_GFp_mont_field_encode,
    ec_GFp_mont_field_decode,
-    ec_GFp_mont_field_set_to_one,
  };

  return &ret;
@@ -122,7 +122,7 @@ static bool EncodeECPrivateKey(std::vector<uint8_t> *out, const EC_KEY *key) {
  return true;
 }

-bool Testd2i_ECPrivateKey() {
+static bool Testd2i_ECPrivateKey() {
  ScopedEC_KEY key = DecodeECPrivateKey(kECKeyWithoutPublic,
                                        sizeof(kECKeyWithoutPublic));
  if (!key) {
@@ -349,23 +349,32 @@ static bool TestArbitraryCurve() {
      0xff, 0xff, 0xff, 0xff, 0xff, 0xbc, 0xe6, 0xfa, 0xad, 0xa7, 0x17,
      0x9e, 0x84, 0xf3, 0xb9, 0xca, 0xc2, 0xfc, 0x63, 0x25, 0x51,
  };
+  ScopedBN_CTX ctx(BN_CTX_new());
  ScopedBIGNUM p(BN_bin2bn(kP, sizeof(kP), nullptr));
  ScopedBIGNUM a(BN_bin2bn(kA, sizeof(kA), nullptr));
  ScopedBIGNUM b(BN_bin2bn(kB, sizeof(kB), nullptr));
-  ScopedBIGNUM x(BN_bin2bn(kX, sizeof(kX), nullptr));
-  ScopedBIGNUM y(BN_bin2bn(kY, sizeof(kY), nullptr));
+  ScopedBIGNUM gx(BN_bin2bn(kX, sizeof(kX), nullptr));
+  ScopedBIGNUM gy(BN_bin2bn(kY, sizeof(kY), nullptr));
  ScopedBIGNUM order(BN_bin2bn(kOrder, sizeof(kOrder), nullptr));
  ScopedBIGNUM cofactor(BN_new());
-  if (!p || !a || !b || !x || !y || !order || !cofactor ||
+  if (!ctx || !p || !a || !b || !gx || !gy || !order || !cofactor ||
      !BN_set_word(cofactor.get(), 1)) {
    return false;
  }
-  ScopedEC_GROUP group(EC_GROUP_new_arbitrary(p.get(), a.get(), b.get(),
-                                              x.get(), y.get(), order.get(),
-                                              cofactor.get()));
+
+  ScopedEC_GROUP group(
+      EC_GROUP_new_curve_GFp(p.get(), a.get(), b.get(), ctx.get()));
  if (!group) {
    return false;
  }
+  ScopedEC_POINT generator(EC_POINT_new(group.get()));
+  if (!generator ||
+      !EC_POINT_set_affine_coordinates_GFp(group.get(), generator.get(),
+                                           gx.get(), gy.get(), ctx.get()) ||
+      !EC_GROUP_set_generator(group.get(), generator.get(), order.get(),
+                              cofactor.get())) {
+    return false;
+  }

  // |group| should not have a curve name.
  if (EC_GROUP_get_curve_name(group.get()) != NID_undef) {
@@ -375,7 +384,8 @@ static bool TestArbitraryCurve() {
  // Copy |key| to |key2| using |group|.
  ScopedEC_KEY key2(EC_KEY_new());
  ScopedEC_POINT point(EC_POINT_new(group.get()));
-  if (!key2 || !point ||
+  ScopedBIGNUM x(BN_new()), y(BN_new());
+  if (!key2 || !point || !x || !y ||
      !EC_KEY_set_group(key2.get(), group.get()) ||
      !EC_KEY_set_private_key(key2.get(), EC_KEY_get0_private_key(key.get())) ||
      !EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
@@ -394,6 +404,101 @@ static bool TestArbitraryCurve() {
    return false;
  }

+  // Repeat the process for |EC_GROUP_new_arbitrary|.
+  group.reset(EC_GROUP_new_arbitrary(p.get(), a.get(), b.get(), gx.get(),
+                                     gy.get(), order.get(), cofactor.get()));
+  if (!group) {
+    return false;
+  }
+
+  // |group| should not have a curve name.
+  if (EC_GROUP_get_curve_name(group.get()) != NID_undef) {
+    return false;
+  }
+
+  // Copy |key| to |key2| using |group|.
+  key2.reset(EC_KEY_new());
+  point.reset(EC_POINT_new(group.get()));
+  if (!key2 || !point ||
+      !EC_KEY_set_group(key2.get(), group.get()) ||
+      !EC_KEY_set_private_key(key2.get(), EC_KEY_get0_private_key(key.get())) ||
+      !EC_POINT_set_affine_coordinates_GFp(group.get(), point.get(), x.get(),
+                                           y.get(), nullptr) ||
+      !EC_KEY_set_public_key(key2.get(), point.get())) {
+    fprintf(stderr, "Could not copy key.\n");
+    return false;
+  }
+
+  // The key must be valid according to the new group too.
+  if (!EC_KEY_check_key(key2.get())) {
+    fprintf(stderr, "Copied key is not valid.\n");
+    return false;
+  }
+
+  return true;
+}
+
+static bool TestAddingEqualPoints(int nid) {  // P + P must equal 2P.
+  ScopedEC_KEY key(EC_KEY_new_by_curve_name(nid));
+  if (!key) {
+    return false;
+  }
+
+  const EC_GROUP *const group = EC_KEY_get0_group(key.get());
+
+  if (!EC_KEY_generate_key(key.get())) {
+    fprintf(stderr, "EC_KEY_generate_key failed with nid %d\n", nid);
+    ERR_print_errors_fp(stderr);
+    return false;
+  }
+
+  ScopedEC_POINT p1(EC_POINT_new(group));
+  ScopedEC_POINT p2(EC_POINT_new(group));
+  ScopedEC_POINT double_p1(EC_POINT_new(group));
+  ScopedEC_POINT p1_plus_p2(EC_POINT_new(group));
+  if (!p1 || !p2 || !double_p1 || !p1_plus_p2) {
+    return false;
+  }
+  // Both |p1| and |p2| are copies of the same freshly generated public key.
+  if (!EC_POINT_copy(p1.get(), EC_KEY_get0_public_key(key.get())) ||
+      !EC_POINT_copy(p2.get(), EC_KEY_get0_public_key(key.get()))) {
+    fprintf(stderr, "EC_POINT_COPY failed with nid %d\n", nid);
+    ERR_print_errors_fp(stderr);
+    return false;
+  }
+
+  ScopedBN_CTX ctx(BN_CTX_new());
+  if (!ctx) {
+    return false;
+  }
+  // Doubling |p1| and adding |p1| to the equal point |p2| must agree.
+  if (!EC_POINT_dbl(group, double_p1.get(), p1.get(), ctx.get()) ||
+      !EC_POINT_add(group, p1_plus_p2.get(), p1.get(), p2.get(), ctx.get())) {
+    fprintf(stderr, "Point operation failed with nid %d\n", nid);
+    ERR_print_errors_fp(stderr);
+    return false;
+  }
+
+  if (EC_POINT_cmp(group, double_p1.get(), p1_plus_p2.get(), ctx.get()) != 0) {
+    fprintf(stderr, "A+A != 2A for nid %d\n", nid);
+    return false;
+  }
+
+  return true;
+}
+
+static bool ForEachCurve(bool (*test_func)(int nid)) {
+  const size_t num_curves = EC_get_builtin_curves(nullptr, 0);
+  std::vector<EC_builtin_curve> curves(num_curves);
+  EC_get_builtin_curves(curves.data(), num_curves);
+
+  for (const auto& curve : curves) {
+    if (!test_func(curve.nid)) {
+      fprintf(stderr, "Test failed for %s\n", curve.comment);
+      return false;
+    }
+  }
+
  return true;
 }

@@ -403,10 +508,8 @@ int main(void) {
  if (!Testd2i_ECPrivateKey() ||
      !TestZeroPadding() ||
      !TestSpecifiedCurve() ||
-      !TestSetAffine(NID_secp224r1) ||
-      !TestSetAffine(NID_X9_62_prime256v1) ||
-      !TestSetAffine(NID_secp384r1) ||
-      !TestSetAffine(NID_secp521r1) ||
+      !ForEachCurve(TestSetAffine) ||
+      !ForEachCurve(TestAddingEqualPoints) ||
      !TestArbitraryCurve()) {
    fprintf(stderr, "failed\n");
    return 1;
@@ -70,10 +70,10 @@
 #include <openssl/bn.h>
 #include <openssl/crypto.h>
 #include <openssl/ec.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>


-int example_EC_POINT_mul(void) {
+static int example_EC_POINT_mul(void) {
  /* This example ensures that 10×∞ + G = G, in P-256. */
  EC_GROUP *group = NULL;
  EC_POINT *p = NULL, *result = NULL;
@@ -116,7 +116,6 @@ struct ec_method_st {
                      BN_CTX *); /* e.g. to Montgomery */
  int (*field_decode)(const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
                      BN_CTX *); /* e.g. from Montgomery */
-  int (*field_set_to_one)(const EC_GROUP *, BIGNUM *r, BN_CTX *);
 } /* EC_METHOD */;

 const EC_METHOD* EC_GFp_mont_method(void);
@@ -141,7 +140,8 @@ struct ec_group_st {
  int a_is_minus3; /* enable optimized point arithmetics for special case */

  BN_MONT_CTX *mont; /* Montgomery structure. */
-  BIGNUM *one; /* The value one */
+
+  BIGNUM one; /* The value one. */
 } /* EC_GROUP */;

 struct ec_point_st {
@@ -151,7 +151,6 @@ struct ec_point_st {
  BIGNUM Y;
  BIGNUM Z; /* Jacobian projective coordinates:
             * (X, Y, Z)  represents  (X/Z^2, Y/Z^3)  if  Z != 0 */
-  int Z_is_one; /* enable optimized point arithmetics for special case */
 } /* EC_POINT */;

 EC_GROUP *ec_group_new(const EC_METHOD *meth);
@@ -190,9 +189,6 @@ int ec_GFp_simple_get_Jprojective_coordinates_GFp(const EC_GROUP *,
 int ec_GFp_simple_point_set_affine_coordinates(const EC_GROUP *, EC_POINT *,
                                               const BIGNUM *x, const BIGNUM *y,
                                               BN_CTX *);
-int ec_GFp_simple_point_get_affine_coordinates(const EC_GROUP *,
-                                               const EC_POINT *, BIGNUM *x,
-                                               BIGNUM *y, BN_CTX *);
 int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *, EC_POINT *,
                                             const BIGNUM *x, int y_bit,
                                             BN_CTX *);
@@ -227,22 +223,12 @@ int ec_GFp_mont_field_encode(const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
                             BN_CTX *);
 int ec_GFp_mont_field_decode(const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
                             BN_CTX *);
-int ec_GFp_mont_field_set_to_one(const EC_GROUP *, BIGNUM *r, BN_CTX *);

 int ec_point_set_Jprojective_coordinates_GFp(const EC_GROUP *group,
                                             EC_POINT *point, const BIGNUM *x,
                                             const BIGNUM *y, const BIGNUM *z,
                                             BN_CTX *ctx);

-void ec_GFp_nistp_points_make_affine_internal(
-    size_t num, void *point_array, size_t felem_size, void *tmp_felems,
-    void (*felem_one)(void *out), int (*felem_is_zero)(const void *in),
-    void (*felem_assign)(void *out, const void *in),
-    void (*felem_square)(void *out, const void *in),
-    void (*felem_mul)(void *out, const void *in1, const void *in2),
-    void (*felem_inv)(void *out, const void *in),
-    void (*felem_contract)(void *out, const void *in));
-
 void ec_GFp_nistp_recode_scalar_bits(uint8_t *sign, uint8_t *digit, uint8_t in);

 const EC_METHOD *EC_GFp_nistp224_method(void);
@@ -285,6 +271,8 @@ struct curve_data {

 struct built_in_curve {
  int nid;
+  uint8_t oid[8];
+  uint8_t oid_len;
  const struct curve_data *data;
  const EC_METHOD *(*method)(void);
 };
@@ -281,10 +281,15 @@ int EC_POINT_point2cbb(CBB *out, const EC_GROUP *group, const EC_POINT *point,
 }

 int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group,
-                                             EC_POINT *point, const BIGNUM *x_,
+                                             EC_POINT *point, const BIGNUM *x,
                                             int y_bit, BN_CTX *ctx) {
+  if (BN_is_negative(x) || BN_cmp(x, &group->field) >= 0) {
+    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COMPRESSION_BIT);
+    return 0;
+  }
+
  BN_CTX *new_ctx = NULL;
-  BIGNUM *tmp1, *tmp2, *x, *y;
+  BIGNUM *tmp1, *tmp2, *y;
  int ret = 0;

  ERR_clear_error();
@@ -301,7 +306,6 @@ int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group,
  BN_CTX_start(ctx);
  tmp1 = BN_CTX_get(ctx);
  tmp2 = BN_CTX_get(ctx);
-  x = BN_CTX_get(ctx);
  y = BN_CTX_get(ctx);
  if (y == NULL) {
    goto err;
@@ -312,19 +316,15 @@ int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group,
   * so  y  is one of the square roots of  x^3 + a*x + b. */

  /* tmp1 := x^3 */
-  if (!BN_nnmod(x, x_, &group->field, ctx)) {
-    goto err;
-  }
-
  if (group->meth->field_decode == 0) {
    /* field_{sqr,mul} work on standard representation */
-    if (!group->meth->field_sqr(group, tmp2, x_, ctx) ||
-        !group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) {
+    if (!group->meth->field_sqr(group, tmp2, x, ctx) ||
+        !group->meth->field_mul(group, tmp1, tmp2, x, ctx)) {
      goto err;
    }
  } else {
-    if (!BN_mod_sqr(tmp2, x_, &group->field, ctx) ||
-        !BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) {
+    if (!BN_mod_sqr(tmp2, x, &group->field, ctx) ||
+        !BN_mod_mul(tmp1, tmp2, x, &group->field, ctx)) {
      goto err;
    }
  }
@@ -26,7 +26,6 @@
 #include <openssl/ec.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>

 #include <string.h>

@@ -193,7 +192,7 @@ static void bin28_to_felem(felem out, const u8 in[28]) {
 }

 static void felem_to_bin28(u8 out[28], const felem in) {
-  unsigned i;
+  size_t i;
  for (i = 0; i < 7; ++i) {
    out[i] = in[0] >> (8 * i);
    out[i + 7] = in[1] >> (8 * i);
@@ -203,8 +202,8 @@ static void felem_to_bin28(u8 out[28], const felem in) {
 }

 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
-static void flip_endian(u8 *out, const u8 *in, unsigned len) {
-  unsigned i;
+static void flip_endian(u8 *out, const u8 *in, size_t len) {
+  size_t i;
  for (i = 0; i < len; ++i) {
    out[i] = in[len - 1 - i];
  }
@@ -215,7 +214,7 @@ static int BN_to_felem(felem out, const BIGNUM *bn) {
  /* BN_bn2bin eats leading zeroes */
  felem_bytearray b_out;
  memset(b_out, 0, sizeof(b_out));
-  unsigned num_bytes = BN_num_bytes(bn);
+  size_t num_bytes = BN_num_bytes(bn);
  if (num_bytes > sizeof(b_out) ||
      BN_is_negative(bn)) {
    OPENSSL_PUT_ERROR(EC, EC_R_BIGNUM_OUT_OF_RANGE);
@@ -242,13 +241,6 @@ static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) {
 * expected to be correct in general - e.g., multiplication with a large scalar
 * will cause an overflow. */

-static void felem_one(felem out) {
-  out[0] = 1;
-  out[1] = 0;
-  out[2] = 0;
-  out[3] = 0;
-}
-
 static void felem_assign(felem out, const felem in) {
  out[0] = in[0];
  out[1] = in[1];
@@ -460,18 +452,6 @@ static void felem_reduce(felem out, const widefelem in) {
  out[3] = output[3];
 }

-static void felem_square_reduce(felem out, const felem in) {
-  widefelem tmp;
-  felem_square(tmp, in);
-  felem_reduce(out, tmp);
-}
-
-static void felem_mul_reduce(felem out, const felem in1, const felem in2) {
-  widefelem tmp;
-  felem_mul(tmp, in1, in2);
-  felem_reduce(out, tmp);
-}
-
 /* Reduce to unique minimal representation.
 * Requires 0 <= in < 2*p (always call felem_reduce first) */
 static void felem_contract(felem out, const felem in) {
@@ -539,16 +519,12 @@ static limb felem_is_zero(const felem in) {
  return (zero | two224m96p1 | two225m97p2);
 }

-static limb felem_is_zero_int(const felem in) {
-  return (int)(felem_is_zero(in) & ((limb)1));
-}
-
 /* Invert a field element */
 /* Computation chain copied from djb's code */
 static void felem_inv(felem out, const felem in) {
  felem ftmp, ftmp2, ftmp3, ftmp4;
  widefelem tmp;
-  unsigned i;
+  size_t i;

  felem_square(tmp, in);
  felem_reduce(ftmp, tmp); /* 2 */
@@ -628,7 +604,7 @@ static void felem_inv(felem out, const felem in) {
 * if icopy == 1, copy in to out,
 * if icopy == 0, copy out to itself. */
 static void copy_conditional(felem out, const felem in, limb icopy) {
-  unsigned i;
+  size_t i;
  /* icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one */
  const limb copy = -icopy;
  for (i = 0; i < 4; ++i) {
@@ -885,12 +861,12 @@ static void point_add(felem x3, felem y3, felem z3, const felem x1,

 /* select_point selects the |idx|th point from a precomputation table and
 * copies it to out. */
-static void select_point(const u64 idx, unsigned int size,
+static void select_point(const u64 idx, size_t size,
                         const felem pre_comp[/*size*/][3], felem out[3]) {
-  unsigned i, j;
  limb *outlimbs = &out[0][0];
  memset(outlimbs, 0, 3 * sizeof(felem));

+  size_t i;
  for (i = 0; i < size; i++) {
    const limb *inlimbs = &pre_comp[i][0][0];
    u64 mask = i ^ idx;
@@ -899,6 +875,7 @@ static void select_point(const u64 idx, unsigned int size,
    mask |= mask >> 1;
    mask &= 1;
    mask--;
+    size_t j;
    for (j = 0; j < 4 * 3; j++) {
      outlimbs[j] |= inlimbs[j] & mask;
    }
@@ -906,7 +883,7 @@ static void select_point(const u64 idx, unsigned int size,
 }

 /* get_bit returns the |i|th bit in |in| */
-static char get_bit(const felem_bytearray in, unsigned i) {
+static char get_bit(const felem_bytearray in, size_t i) {
  if (i >= 224) {
    return 0;
  }
@@ -920,11 +897,8 @@ static char get_bit(const felem_bytearray in, unsigned i) {
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
 static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
-                      const unsigned num_points, const u8 *g_scalar,
-                      const int mixed, const felem pre_comp[][17][3]) {
-  int i, skip;
-  unsigned num;
-  unsigned gen_mul = (g_scalar != NULL);
+                      const size_t num_points, const u8 *g_scalar,
+                      const felem pre_comp[][17][3]) {
  felem nq[3], tmp[4];
  u64 bits;
  u8 sign, digit;
@@ -935,15 +909,16 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,
  /* Loop over all scalars msb-to-lsb, interleaving additions
   * of multiples of the generator (two in each of the last 28 rounds)
   * and additions of other points multiples (every 5th round). */
-  skip = 1; /* save two point operations in the first round */
-  for (i = (num_points ? 220 : 27); i >= 0; --i) {
+  int skip = 1; /* save two point operations in the first round */
+  size_t i = num_points != 0 ? 220 : 27;
+  for (;;) {
    /* double */
    if (!skip) {
      point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
    }

    /* add multiples of the generator */
-    if (gen_mul && (i <= 27)) {
+    if (g_scalar != NULL && i <= 27) {
      /* first, look 28 bits upwards */
      bits = get_bit(g_scalar, i + 196) << 3;
      bits |= get_bit(g_scalar, i + 140) << 2;
@@ -972,8 +947,9 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,
    }

    /* do other additions every 5 doublings */
-    if (num_points && (i % 5 == 0)) {
+    if (num_points != 0 && i % 5 == 0) {
      /* loop over all scalars */
+      size_t num;
      for (num = 0; num < num_points; ++num) {
        bits = get_bit(scalars[num], i + 4) << 5;
        bits |= get_bit(scalars[num], i + 3) << 4;
@@ -989,14 +965,19 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,
        copy_conditional(tmp[1], tmp[3], sign);

        if (!skip) {
-          point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], mixed, tmp[0],
-                    tmp[1], tmp[2]);
+          point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */,
+                    tmp[0], tmp[1], tmp[2]);
        } else {
          memcpy(nq, tmp, 3 * sizeof(felem));
          skip = 0;
        }
      }
    }
+
+    if (i == 0) {
+      break;
+    }
+    --i;
  }
  felem_assign(x_out, nq[0]);
  felem_assign(y_out, nq[1]);
@@ -1005,10 +986,10 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,

 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3) */
-int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
-                                                 const EC_POINT *point,
-                                                 BIGNUM *x, BIGNUM *y,
-                                                 BN_CTX *ctx) {
+static int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
+                                                        const EC_POINT *point,
+                                                        BIGNUM *x, BIGNUM *y,
+                                                        BN_CTX *ctx) {
  felem z1, z2, x_in, y_in, x_out, y_out;
  widefelem tmp;

@@ -1047,23 +1028,12 @@ int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
  return 1;
 }

-static void make_points_affine(size_t num, felem points[/*num*/][3],
-                               felem tmp_felems[/*num+1*/]) {
-  /* Runs in constant time, unless an input is the point at infinity
-   * (which normally shouldn't happen). */
-  ec_GFp_nistp_points_make_affine_internal(
-      num, points, sizeof(felem), tmp_felems, (void (*)(void *))felem_one,
-      (int (*)(const void *))felem_is_zero_int,
-      (void (*)(void *, const void *))felem_assign,
-      (void (*)(void *, const void *))felem_square_reduce,
-      (void (*)(void *, const void *, const void *))felem_mul_reduce,
-      (void (*)(void *, const void *))felem_inv,
-      (void (*)(void *, const void *))felem_contract);
-}
-
-int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
-                               const BIGNUM *g_scalar, const EC_POINT *p_,
-                               const BIGNUM *p_scalar_, BN_CTX *ctx) {
+static int ec_GFp_nistp224_points_mul(const EC_GROUP *group,
+                                      EC_POINT *r,
+                                      const BIGNUM *g_scalar,
+                                      const EC_POINT *p_,
+                                      const BIGNUM *p_scalar_,
+                                      BN_CTX *ctx) {
  /* TODO: This function used to take |points| and |scalars| as arrays of
   * |num| elements. The code below should be simplified to work in terms of
   * |p_| and |p_scalar_|. */
@@ -1072,17 +1042,12 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
  BIGNUM const *const *scalars = p_ != NULL ? &p_scalar_ : NULL;

  int ret = 0;
-  int j;
-  unsigned i;
-  int mixed = 0;
  BN_CTX *new_ctx = NULL;
  BIGNUM *x, *y, *z, *tmp_scalar;
  felem_bytearray g_secret;
  felem_bytearray *secrets = NULL;
  felem(*pre_comp)[17][3] = NULL;
-  felem *tmp_felems = NULL;
  felem_bytearray tmp;
-  unsigned num_bytes;
  size_t num_points = num;
  felem x_in, y_in, z_in, x_out, y_out, z_out;
  const EC_POINT *p = NULL;
@@ -1105,19 +1070,10 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
  }

  if (num_points > 0) {
-    if (num_points >= 3) {
-      /* unless we precompute multiples for just one or two points,
-       * converting those into affine form is time well spent  */
-      mixed = 1;
-    }
    secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
    pre_comp = OPENSSL_malloc(num_points * sizeof(felem[17][3]));
-    if (mixed) {
-      tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
-    }
    if (secrets == NULL ||
-        pre_comp == NULL ||
-        (mixed && tmp_felems == NULL)) {
+        pre_comp == NULL) {
      OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
      goto err;
    }
@@ -1126,6 +1082,7 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
     * i.e., they contribute nothing to the linear combination */
    memset(secrets, 0, num_points * sizeof(felem_bytearray));
    memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
+    size_t i;
    for (i = 0; i < num_points; ++i) {
      if (i == num) {
        /* the generator */
@@ -1138,6 +1095,7 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
      }

      if (p_scalar != NULL && p != NULL) {
+        size_t num_bytes;
        /* reduce g_scalar to 0 <= g_scalar < 2^224 */
        if (BN_num_bits(p_scalar) > 224 || BN_is_negative(p_scalar)) {
          /* this is an unusual input, and we don't guarantee
@@ -1163,6 +1121,7 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
        felem_assign(pre_comp[i][1][1], y_out);
        felem_assign(pre_comp[i][1][2], z_out);

+        size_t j;
        for (j = 2; j <= 16; ++j) {
          if (j & 1) {
            point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
@@ -1177,14 +1136,11 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
        }
      }
    }
-
-    if (mixed) {
-      make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
-    }
  }

  if (g_scalar != NULL) {
    memset(g_secret, 0, sizeof(g_secret));
+    size_t num_bytes;
    /* reduce g_scalar to 0 <= g_scalar < 2^224 */
    if (BN_num_bits(g_scalar) > 224 || BN_is_negative(g_scalar)) {
      /* this is an unusual input, and we don't guarantee constant-timeness */
@@ -1200,7 +1156,7 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
    flip_endian(g_secret, tmp, num_bytes);
  }
  batch_mul(x_out, y_out, z_out, (const felem_bytearray(*))secrets,
-            num_points, g_scalar != NULL ? g_secret : NULL, mixed,
+            num_points, g_scalar != NULL ? g_secret : NULL,
            (const felem(*)[17][3])pre_comp);

  /* reduce the output to its unique minimal representation */
@@ -1220,7 +1176,6 @@ err:
  BN_CTX_free(new_ctx);
  OPENSSL_free(secrets);
  OPENSSL_free(pre_comp);
-  OPENSSL_free(tmp_felems);
  return ret;
 }

@@ -1235,8 +1190,7 @@ const EC_METHOD *EC_GFp_nistp224_method(void) {
                                ec_GFp_simple_field_mul,
                                ec_GFp_simple_field_sqr,
                                0 /* field_encode */,
-                                0 /* field_decode */,
-                                0 /* field_set_to_one */};
+                                0 /* field_decode */};

  return &ret;
 }
@@ -27,7 +27,6 @@
 #include <openssl/ec.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>

 #include <string.h>

@@ -94,8 +93,8 @@ static void smallfelem_to_bin32(u8 out[32], const smallfelem in) {
 }

 /* To preserve endianness when using BN_bn2bin and BN_bin2bn. */
-static void flip_endian(u8 *out, const u8 *in, unsigned len) {
-  unsigned i;
+static void flip_endian(u8 *out, const u8 *in, size_t len) {
+  size_t i;
  for (i = 0; i < len; ++i) {
    out[i] = in[len - 1 - i];
  }
@@ -111,7 +110,7 @@ static int BN_to_felem(felem out, const BIGNUM *bn) {
  felem_bytearray b_out;
  /* BN_bn2bin eats leading zeroes */
  memset(b_out, 0, sizeof(b_out));
-  unsigned num_bytes = BN_num_bytes(bn);
+  size_t num_bytes = BN_num_bytes(bn);
  if (num_bytes > sizeof(b_out)) {
    OPENSSL_PUT_ERROR(EC, EC_R_BIGNUM_OUT_OF_RANGE);
    return 0;
@@ -134,20 +133,6 @@ static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in) {

 /* Field operations. */

-static void smallfelem_one(smallfelem out) {
-  out[0] = 1;
-  out[1] = 0;
-  out[2] = 0;
-  out[3] = 0;
-}
-
-static void smallfelem_assign(smallfelem out, const smallfelem in) {
-  out[0] = in[0];
-  out[1] = in[1];
-  out[2] = in[2];
-  out[3] = in[3];
-}
-
 static void felem_assign(felem out, const felem in) {
  out[0] = in[0];
  out[1] = in[1];
@@ -735,7 +720,7 @@ static void felem_contract(smallfelem out, const felem in) {
   * each u64, from most-significant to least significant. For each one, if
   * all words so far have been equal (m is all ones) then a non-equal
   * result is the answer. Otherwise we continue. */
-  unsigned i;
+  size_t i;
  for (i = 3; i < 4; i--) {
    u64 equal;
    uint128_t a = ((uint128_t)kPrime[i]) - out[i];
@@ -779,25 +764,6 @@ static void felem_contract(smallfelem out, const felem in) {
  subtract_u64(&out[3], &carry, result & kPrime[3]);
 }

-static void smallfelem_square_contract(smallfelem out, const smallfelem in) {
-  longfelem longtmp;
-  felem tmp;
-
-  smallfelem_square(longtmp, in);
-  felem_reduce(tmp, longtmp);
-  felem_contract(out, tmp);
-}
-
-static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
-                                    const smallfelem in2) {
-  longfelem longtmp;
-  felem tmp;
-
-  smallfelem_mul(longtmp, in1, in2);
-  felem_reduce(tmp, longtmp);
-  felem_contract(out, tmp);
-}
-
 /* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise.
 * On entry:
@@ -834,10 +800,6 @@ static limb smallfelem_is_zero(const smallfelem small) {
  return result;
 }

-static int smallfelem_is_zero_int(const smallfelem small) {
-  return (int)(smallfelem_is_zero(small) & ((limb)1));
-}
-
 /* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
@@ -849,7 +811,7 @@ static void felem_inv(felem out, const felem in) {
  /* each e_I will hold |in|^{2^I - 1} */
  felem e2, e4, e8, e16, e32, e64;
  longfelem tmp;
-  unsigned i;
+  size_t i;

  felem_square(tmp, in);
  felem_reduce(ftmp, tmp); /* 2^1 */
@@ -937,14 +899,6 @@ static void felem_inv(felem out, const felem in) {
  felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
 }

-static void smallfelem_inv_contract(smallfelem out, const smallfelem in) {
-  felem tmp;
-
-  smallfelem_expand(tmp, in);
-  felem_inv(tmp, tmp);
-  felem_contract(out, tmp);
-}
-
 /* Group operations
 * ----------------
 *
@@ -1055,7 +1009,7 @@ static void point_double_small(smallfelem x_out, smallfelem y_out,

 /* copy_conditional copies in to out iff mask is all ones. */
 static void copy_conditional(felem out, const felem in, limb mask) {
-  unsigned i;
+  size_t i;
  for (i = 0; i < NLIMBS; ++i) {
    const limb tmp = mask & (in[i] ^ out[i]);
    out[i] ^= tmp;
@@ -1064,7 +1018,7 @@ static void copy_conditional(felem out, const felem in, limb mask) {

 /* copy_small_conditional copies in to out iff mask is all ones. */
 static void copy_small_conditional(felem out, const smallfelem in, limb mask) {
-  unsigned i;
+  size_t i;
  const u64 mask64 = mask;
  for (i = 0; i < NLIMBS; ++i) {
    out[i] = ((limb)(in[i] & mask64)) | (out[i] & ~mask);
@@ -1448,12 +1402,13 @@ static const smallfelem g_pre_comp[2][16][3] = {

 /* select_point selects the |idx|th point from a precomputation table and
 * copies it to out. */
-static void select_point(const u64 idx, unsigned int size,
-                         const smallfelem pre_comp[16][3], smallfelem out[3]) {
-  unsigned i, j;
+static void select_point(const u64 idx, size_t size,
+                         const smallfelem pre_comp[/*size*/][3],
+                         smallfelem out[3]) {
  u64 *outlimbs = &out[0][0];
  memset(outlimbs, 0, 3 * sizeof(smallfelem));

+  size_t i;
  for (i = 0; i < size; i++) {
    const u64 *inlimbs = (const u64 *)&pre_comp[i][0][0];
    u64 mask = i ^ idx;
@@ -1462,6 +1417,7 @@ static void select_point(const u64 idx, unsigned int size,
    mask |= mask >> 1;
    mask &= 1;
    mask--;
+    size_t j;
    for (j = 0; j < NLIMBS * 3; j++) {
      outlimbs[j] |= inlimbs[j] & mask;
    }
@@ -1483,10 +1439,8 @@ static char get_bit(const felem_bytearray in, int i) {
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out. */
 static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
-                      const unsigned num_points, const u8 *g_scalar,
-                      const int mixed, const smallfelem pre_comp[][17][3]) {
-  int i, skip;
-  unsigned num, gen_mul = (g_scalar != NULL);
+                      const size_t num_points, const u8 *g_scalar,
+                      const smallfelem pre_comp[][17][3]) {
  felem nq[3], ftmp;
  smallfelem tmp[3];
  u64 bits;
@@ -1499,16 +1453,16 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,
   * of the generator (two in each of the last 32 rounds) and additions of
   * other points multiples (every 5th round). */

-  skip = 1; /* save two point operations in the first
-             * round */
-  for (i = (num_points ? 255 : 31); i >= 0; --i) {
+  int skip = 1; /* save two point operations in the first round */
+  size_t i = num_points != 0 ? 255 : 31;
+  for (;;) {
    /* double */
    if (!skip) {
      point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
    }

    /* add multiples of the generator */
-    if (gen_mul && i <= 31) {
+    if (g_scalar != NULL && i <= 31) {
      /* first, look 32 bits upwards */
      bits = get_bit(g_scalar, i + 224) << 3;
      bits |= get_bit(g_scalar, i + 160) << 2;
@@ -1518,9 +1472,8 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,
      select_point(bits, 16, g_pre_comp[1], tmp);

      if (!skip) {
-        /* Arg 1 below is for "mixed" */
-        point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1, tmp[0], tmp[1],
-                  tmp[2]);
+        point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
+                  tmp[0], tmp[1], tmp[2]);
      } else {
        smallfelem_expand(nq[0], tmp[0]);
        smallfelem_expand(nq[1], tmp[1]);
@@ -1535,14 +1488,14 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,
      bits |= get_bit(g_scalar, i);
      /* select the point to add, in constant time */
      select_point(bits, 16, g_pre_comp[0], tmp);
-      /* Arg 1 below is for "mixed" */
-      point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1, tmp[0], tmp[1],
-                tmp[2]);
+      point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, tmp[0],
+                tmp[1], tmp[2]);
    }

    /* do other additions every 5 doublings */
-    if (num_points && (i % 5 == 0)) {
+    if (num_points != 0 && i % 5 == 0) {
      /* loop over all scalars */
+      size_t num;
      for (num = 0; num < num_points; ++num) {
        bits = get_bit(scalars[num], i + 4) << 5;
        bits |= get_bit(scalars[num], i + 3) << 4;
@@ -1560,8 +1513,8 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,
        felem_contract(tmp[1], ftmp);

        if (!skip) {
-          point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], mixed, tmp[0],
-                    tmp[1], tmp[2]);
+          point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */,
+                    tmp[0], tmp[1], tmp[2]);
        } else {
          smallfelem_expand(nq[0], tmp[0]);
          smallfelem_expand(nq[1], tmp[1]);
@@ -1570,6 +1523,11 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,
        }
      }
    }
+
+    if (i == 0) {
+      break;
+    }
+    --i;
  }
  felem_assign(x_out, nq[0]);
  felem_assign(y_out, nq[1]);
@@ -1583,10 +1541,10 @@ static void batch_mul(felem x_out, felem y_out, felem z_out,

 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
 * (X/Z^2, Y/Z^3). */
-int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
-                                                 const EC_POINT *point,
-                                                 BIGNUM *x, BIGNUM *y,
-                                                 BN_CTX *ctx) {
+static int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
+                                                        const EC_POINT *point,
+                                                        BIGNUM *x, BIGNUM *y,
+                                                        BN_CTX *ctx) {
  felem z1, z2, x_in, y_in;
  smallfelem x_out, y_out;
  longfelem tmp;
@@ -1622,26 +1580,12 @@ int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
  return 1;
 }

-/* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
-static void make_points_affine(size_t num, smallfelem points[][3],
-                               smallfelem tmp_smallfelems[]) {
-  /* Runs in constant time, unless an input is the point at infinity (which
-   * normally shouldn't happen). */
-  ec_GFp_nistp_points_make_affine_internal(
-      num, points, sizeof(smallfelem), tmp_smallfelems,
-      (void (*)(void *))smallfelem_one,
-      (int (*)(const void *))smallfelem_is_zero_int,
-      (void (*)(void *, const void *))smallfelem_assign,
-      (void (*)(void *, const void *))smallfelem_square_contract,
-      (void (*)(void *, const void *, const void *))smallfelem_mul_contract,
-      (void (*)(void *, const void *))smallfelem_inv_contract,
-      /* nothing to contract */
-      (void (*)(void *, const void *))smallfelem_assign);
-}
-
-int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
-                               const BIGNUM *g_scalar, const EC_POINT *p_,
-                               const BIGNUM *p_scalar_, BN_CTX *ctx) {
+static int ec_GFp_nistp256_points_mul(const EC_GROUP *group,
+                                      EC_POINT *r,
+                                      const BIGNUM *g_scalar,
+                                      const EC_POINT *p_,
+                                      const BIGNUM *p_scalar_,
+                                      BN_CTX *ctx) {
  /* TODO: This function used to take |points| and |scalars| as arrays of
   * |num| elements. The code below should be simplified to work in terms of |p|
   * and |p_scalar|. */
@@ -1650,16 +1594,12 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
  BIGNUM const *const *scalars = p_ != NULL ? &p_scalar_ : NULL;

  int ret = 0;
-  int j;
-  int mixed = 0;
  BN_CTX *new_ctx = NULL;
  BIGNUM *x, *y, *z, *tmp_scalar;
  felem_bytearray g_secret;
  felem_bytearray *secrets = NULL;
  smallfelem(*pre_comp)[17][3] = NULL;
-  smallfelem *tmp_smallfelems = NULL;
  felem_bytearray tmp;
-  unsigned i, num_bytes;
  size_t num_points = num;
  smallfelem x_in, y_in, z_in;
  felem x_out, y_out, z_out;
@@ -1682,19 +1622,9 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
  }

  if (num_points > 0) {
-    if (num_points >= 3) {
-      /* unless we precompute multiples for just one or two points,
-       * converting those into affine form is time well spent */
-      mixed = 1;
-    }
    secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
    pre_comp = OPENSSL_malloc(num_points * sizeof(smallfelem[17][3]));
-    if (mixed) {
-      tmp_smallfelems =
-          OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
-    }
-    if (secrets == NULL || pre_comp == NULL ||
-        (mixed && tmp_smallfelems == NULL)) {
+    if (secrets == NULL || pre_comp == NULL) {
      OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
      goto err;
    }
@@ -1703,6 +1633,7 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
     * i.e., they contribute nothing to the linear combination. */
    memset(secrets, 0, num_points * sizeof(felem_bytearray));
    memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
+    size_t i;
    for (i = 0; i < num_points; ++i) {
      if (i == num) {
        /* we didn't have a valid precomputation, so we pick the generator. */
@@ -1714,6 +1645,7 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
        p_scalar = scalars[i];
      }
      if (p_scalar != NULL && p != NULL) {
+        size_t num_bytes;
        /* reduce g_scalar to 0 <= g_scalar < 2^256 */
        if (BN_num_bits(p_scalar) > 256 || BN_is_negative(p_scalar)) {
          /* this is an unusual input, and we don't guarantee
@@ -1736,6 +1668,7 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
        felem_shrink(pre_comp[i][1][0], x_out);
        felem_shrink(pre_comp[i][1][1], y_out);
        felem_shrink(pre_comp[i][1][2], z_out);
+        size_t j;
        for (j = 2; j <= 16; ++j) {
          if (j & 1) {
            point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
@@ -1751,12 +1684,11 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
        }
      }
    }
-    if (mixed) {
-      make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
-    }
  }

  if (g_scalar != NULL) {
+    size_t num_bytes;
+
    memset(g_secret, 0, sizeof(g_secret));
    /* reduce g_scalar to 0 <= g_scalar < 2^256 */
    if (BN_num_bits(g_scalar) > 256 || BN_is_negative(g_scalar)) {
@@ -1773,7 +1705,7 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
    flip_endian(g_secret, tmp, num_bytes);
  }
  batch_mul(x_out, y_out, z_out, (const felem_bytearray(*))secrets,
-            num_points, g_scalar != NULL ? g_secret : NULL, mixed,
+            num_points, g_scalar != NULL ? g_secret : NULL,
            (const smallfelem(*)[17][3])pre_comp);

  /* reduce the output to its unique minimal representation */
@@ -1793,7 +1725,6 @@ err:
  BN_CTX_free(new_ctx);
  OPENSSL_free(secrets);
  OPENSSL_free(pre_comp);
-  OPENSSL_free(tmp_smallfelems);
  return ret;
 }

@@ -1807,7 +1738,7 @@ const EC_METHOD *EC_GFp_nistp256_method(void) {
      ec_GFp_nistp256_points_mul,
      0 /* check_pub_key_order */,
      ec_GFp_simple_field_mul, ec_GFp_simple_field_sqr,
-      0 /* field_encode */, 0 /* field_decode */, 0 /* field_set_to_one */
+      0 /* field_encode */, 0 /* field_decode */,
  };

  return &ret;
@@ -205,9 +205,7 @@ static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS],

  ecp_nistz256_sqr_mont(res, res);
  ecp_nistz256_sqr_mont(res, res);
-  ecp_nistz256_mul_mont(res, res, in);
-
-  memcpy(r, res, sizeof(res));
+  ecp_nistz256_mul_mont(r, res, in);
 }

 /* ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and
@@ -390,17 +388,6 @@ static int ecp_nistz256_points_mul(
  BN_CTX *new_ctx = NULL;
  int ctx_started = 0;

-  /* Need 256 bits for space for all coordinates. */
-  if (bn_wexpand(&r->X, P256_LIMBS) == NULL ||
-      bn_wexpand(&r->Y, P256_LIMBS) == NULL ||
-      bn_wexpand(&r->Z, P256_LIMBS) == NULL) {
-    OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
-    goto err;
-  }
-  r->X.top = P256_LIMBS;
-  r->Y.top = P256_LIMBS;
-  r->Z.top = P256_LIMBS;
-
  if (g_scalar != NULL) {
    if (BN_num_bits(g_scalar) > 256 || BN_is_negative(g_scalar)) {
      if (ctx == NULL) {
@@ -494,15 +481,12 @@ static int ecp_nistz256_points_mul(
    }
  }

-  memcpy(r->X.d, p.p.X, sizeof(p.p.X));
-  memcpy(r->Y.d, p.p.Y, sizeof(p.p.Y));
-  memcpy(r->Z.d, p.p.Z, sizeof(p.p.Z));
-
  /* Not constant-time, but we're only operating on the public output. */
-  bn_correct_top(&r->X);
-  bn_correct_top(&r->Y);
-  bn_correct_top(&r->Z);
-  r->Z_is_one = BN_is_one(&r->Z);
+  if (!bn_set_words(&r->X, p.p.X, P256_LIMBS) ||
+      !bn_set_words(&r->Y, p.p.Y, P256_LIMBS) ||
+      !bn_set_words(&r->Z, p.p.Z, P256_LIMBS)) {
+    return 0;
+  }

  ret = 1;

@@ -576,7 +560,6 @@ const EC_METHOD *EC_GFp_nistz256_method(void) {
      ec_GFp_mont_field_sqr,
      ec_GFp_mont_field_encode,
      ec_GFp_mont_field_decode,
-      ec_GFp_mont_field_set_to_one,
  };

  return &ret;
@@ -82,16 +82,16 @@
 * field_sqr methods will be used for multiplication, and field_encode and
 * field_decode (if defined) will be used for converting between
 * representations.
-
- * Functions ec_GFp_simple_points_make_affine() and
- * ec_GFp_simple_point_get_affine_coordinates() specifically assume that if a
- * non-trivial representation is used, it is a Montgomery representation (i.e.
- * 'encoding' means multiplying by some factor R). */
+ *
+ * Functions here specifically assume that if a non-trivial representation is
+ * used, it is a Montgomery representation (i.e. 'encoding' means multiplying
+ * by some factor R). */

 int ec_GFp_simple_group_init(EC_GROUP *group) {
  BN_init(&group->field);
  BN_init(&group->a);
  BN_init(&group->b);
+  BN_init(&group->one);
  group->a_is_minus3 = 0;
  return 1;
 }
@@ -100,12 +100,14 @@ void ec_GFp_simple_group_finish(EC_GROUP *group) {
  BN_free(&group->field);
  BN_free(&group->a);
  BN_free(&group->b);
+  BN_free(&group->one);
 }

 int ec_GFp_simple_group_copy(EC_GROUP *dest, const EC_GROUP *src) {
  if (!BN_copy(&dest->field, &src->field) ||
      !BN_copy(&dest->a, &src->a) ||
-      !BN_copy(&dest->b, &src->b)) {
+      !BN_copy(&dest->b, &src->b) ||
+      !BN_copy(&dest->one, &src->one)) {
    return 0;
  }

@@ -172,6 +174,14 @@ int ec_GFp_simple_group_set_curve(EC_GROUP *group, const BIGNUM *p,
  }
  group->a_is_minus3 = (0 == BN_cmp(tmp_a, &group->field));

+  if (group->meth->field_encode != NULL) {
+    if (!group->meth->field_encode(group, &group->one, BN_value_one(), ctx)) {
+      goto err;
+    }
+  } else if (!BN_copy(&group->one, BN_value_one())) {
+    goto err;
+  }
+
  ret = 1;

 err:
@@ -228,7 +238,6 @@ int ec_GFp_simple_point_init(EC_POINT *point) {
  BN_init(&point->X);
  BN_init(&point->Y);
  BN_init(&point->Z);
-  point->Z_is_one = 0;

  return 1;
 }
@@ -243,7 +252,6 @@ void ec_GFp_simple_point_clear_finish(EC_POINT *point) {
  BN_clear_free(&point->X);
  BN_clear_free(&point->Y);
  BN_clear_free(&point->Z);
-  point->Z_is_one = 0;
 }

 int ec_GFp_simple_point_copy(EC_POINT *dest, const EC_POINT *src) {
@@ -252,18 +260,32 @@ int ec_GFp_simple_point_copy(EC_POINT *dest, const EC_POINT *src) {
      !BN_copy(&dest->Z, &src->Z)) {
    return 0;
  }
-  dest->Z_is_one = src->Z_is_one;

  return 1;
 }

 int ec_GFp_simple_point_set_to_infinity(const EC_GROUP *group,
                                        EC_POINT *point) {
-  point->Z_is_one = 0;
  BN_zero(&point->Z);
  return 1;
 }

+static int set_Jprojective_coordinate_GFp(const EC_GROUP *group, BIGNUM *out,
+                                          const BIGNUM *in, BN_CTX *ctx) {
+  if (in == NULL) {
+    return 1;
+  }
+  if (BN_is_negative(in) ||
+      BN_cmp(in, &group->field) >= 0) {
+    OPENSSL_PUT_ERROR(EC, EC_R_COORDINATES_OUT_OF_RANGE);
+    return 0;
+  }
+  if (group->meth->field_encode) {
+    return group->meth->field_encode(group, out, in, ctx);
+  }
+  return BN_copy(out, in) != NULL;
+}
+
 int ec_GFp_simple_set_Jprojective_coordinates_GFp(
    const EC_GROUP *group, EC_POINT *point, const BIGNUM *x, const BIGNUM *y,
    const BIGNUM *z, BN_CTX *ctx) {
@@ -277,43 +299,10 @@ int ec_GFp_simple_set_Jprojective_coordinates_GFp(
    }
  }

-  if (x != NULL) {
-    if (!BN_nnmod(&point->X, x, &group->field, ctx)) {
-      goto err;
-    }
-    if (group->meth->field_encode &&
-        !group->meth->field_encode(group, &point->X, &point->X, ctx)) {
-      goto err;
-    }
-  }
-
-  if (y != NULL) {
-    if (!BN_nnmod(&point->Y, y, &group->field, ctx)) {
-      goto err;
-    }
-    if (group->meth->field_encode &&
-        !group->meth->field_encode(group, &point->Y, &point->Y, ctx)) {
-      goto err;
-    }
-  }
-
-  if (z != NULL) {
-    int Z_is_one;
-
-    if (!BN_nnmod(&point->Z, z, &group->field, ctx)) {
-      goto err;
-    }
-    Z_is_one = BN_is_one(&point->Z);
-    if (group->meth->field_encode) {
-      if (Z_is_one && (group->meth->field_set_to_one != 0)) {
-        if (!group->meth->field_set_to_one(group, &point->Z, ctx)) {
-          goto err;
-        }
-      } else if (!group->meth->field_encode(group, &point->Z, &point->Z, ctx)) {
-        goto err;
-      }
-    }
-    point->Z_is_one = Z_is_one;
+  if (!set_Jprojective_coordinate_GFp(group, &point->X, x, ctx) ||
+      !set_Jprojective_coordinate_GFp(group, &point->Y, y, ctx) ||
+      !set_Jprojective_coordinate_GFp(group, &point->Z, z, ctx)) {
+    goto err;
  }

  ret = 1;
@@ -379,109 +368,6 @@ int ec_GFp_simple_point_set_affine_coordinates(const EC_GROUP *group,
                                                  BN_value_one(), ctx);
 }

-int ec_GFp_simple_point_get_affine_coordinates(const EC_GROUP *group,
-                                               const EC_POINT *point, BIGNUM *x,
-                                               BIGNUM *y, BN_CTX *ctx) {
-  BN_CTX *new_ctx = NULL;
-  BIGNUM *Z, *Z_1, *Z_2, *Z_3;
-  const BIGNUM *Z_;
-  int ret = 0;
-
-  if (EC_POINT_is_at_infinity(group, point)) {
-    OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
-    return 0;
-  }
-
-  if (ctx == NULL) {
-    ctx = new_ctx = BN_CTX_new();
-    if (ctx == NULL) {
-      return 0;
-    }
-  }
-
-  BN_CTX_start(ctx);
-  Z = BN_CTX_get(ctx);
-  Z_1 = BN_CTX_get(ctx);
-  Z_2 = BN_CTX_get(ctx);
-  Z_3 = BN_CTX_get(ctx);
-  if (Z == NULL || Z_1 == NULL || Z_2 == NULL || Z_3 == NULL) {
-    goto err;
-  }
-
-  /* transform  (X, Y, Z)  into  (x, y) := (X/Z^2, Y/Z^3) */
-
-  if (group->meth->field_decode) {
-    if (!group->meth->field_decode(group, Z, &point->Z, ctx)) {
-      goto err;
-    }
-    Z_ = Z;
-  } else {
-    Z_ = &point->Z;
-  }
-
-  if (BN_is_one(Z_)) {
-    if (group->meth->field_decode) {
-      if (x != NULL && !group->meth->field_decode(group, x, &point->X, ctx)) {
-        goto err;
-      }
-      if (y != NULL && !group->meth->field_decode(group, y, &point->Y, ctx)) {
-        goto err;
-      }
-    } else {
-      if (x != NULL && !BN_copy(x, &point->X)) {
-        goto err;
-      }
-      if (y != NULL && !BN_copy(y, &point->Y)) {
-        goto err;
-      }
-    }
-  } else {
-    if (!BN_mod_inverse(Z_1, Z_, &group->field, ctx)) {
-      OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
-      goto err;
-    }
-
-    if (group->meth->field_encode == 0) {
-      /* field_sqr works on standard representation */
-      if (!group->meth->field_sqr(group, Z_2, Z_1, ctx)) {
-        goto err;
-      }
-    } else if (!BN_mod_sqr(Z_2, Z_1, &group->field, ctx)) {
-      goto err;
-    }
-
-    /* in the Montgomery case, field_mul will cancel out Montgomery factor in
-     * X: */
-    if (x != NULL && !group->meth->field_mul(group, x, &point->X, Z_2, ctx)) {
-      goto err;
-    }
-
-    if (y != NULL) {
-      if (group->meth->field_encode == 0) {
-        /* field_mul works on standard representation */
-        if (!group->meth->field_mul(group, Z_3, Z_2, Z_1, ctx)) {
-          goto err;
-        }
-      } else if (!BN_mod_mul(Z_3, Z_2, Z_1, &group->field, ctx)) {
-        goto err;
-      }
-
-      /* in the Montgomery case, field_mul will cancel out Montgomery factor in
-       * Y: */
-      if (!group->meth->field_mul(group, y, &point->Y, Z_3, ctx)) {
-        goto err;
-      }
-    }
-  }
-
-  ret = 1;
-
-err:
-  BN_CTX_end(ctx);
-  BN_CTX_free(new_ctx);
-  return ret;
-}
-
 int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
                      const EC_POINT *b, BN_CTX *ctx) {
  int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *,
@@ -531,7 +417,9 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
   */

  /* n1, n2 */
-  if (b->Z_is_one) {
+  int b_Z_is_one = BN_cmp(&b->Z, &group->one) == 0;
+
+  if (b_Z_is_one) {
    if (!BN_copy(n1, &a->X) || !BN_copy(n2, &a->Y)) {
      goto end;
    }
@@ -552,7 +440,8 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  }

  /* n3, n4 */
-  if (a->Z_is_one) {
+  int a_Z_is_one = BN_cmp(&a->Z, &group->one) == 0;
+  if (a_Z_is_one) {
    if (!BN_copy(n3, &b->X) || !BN_copy(n4, &b->Y)) {
      goto end;
    }
@@ -590,7 +479,6 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
    } else {
      /* a is the inverse of b */
      BN_zero(&r->Z);
-      r->Z_is_one = 0;
      ret = 1;
      goto end;
    }
@@ -605,16 +493,16 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  /* 'n8' = n2 + n4 */

  /* Z_r */
-  if (a->Z_is_one && b->Z_is_one) {
+  if (a_Z_is_one && b_Z_is_one) {
    if (!BN_copy(&r->Z, n5)) {
      goto end;
    }
  } else {
-    if (a->Z_is_one) {
+    if (a_Z_is_one) {
      if (!BN_copy(n0, &b->Z)) {
        goto end;
      }
-    } else if (b->Z_is_one) {
+    } else if (b_Z_is_one) {
      if (!BN_copy(n0, &a->Z)) {
        goto end;
      }
@@ -625,7 +513,7 @@ int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
      goto end;
    }
  }
-  r->Z_is_one = 0;
+
  /* Z_r = Z_a * Z_b * n5 */

  /* X_r */
@@ -685,7 +573,6 @@ int ec_GFp_simple_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,

  if (EC_POINT_is_at_infinity(group, a)) {
    BN_zero(&r->Z);
-    r->Z_is_one = 0;
    return 1;
  }

@@ -715,7 +602,7 @@ int ec_GFp_simple_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
   */

  /* n1 */
-  if (a->Z_is_one) {
+  if (BN_cmp(&a->Z, &group->one) == 0) {
    if (!field_sqr(group, n0, &a->X, ctx) ||
        !BN_mod_lshift1_quick(n1, n0, p) ||
        !BN_mod_add_quick(n0, n0, n1, p) ||
@@ -748,7 +635,7 @@ int ec_GFp_simple_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  }

  /* Z_r */
-  if (a->Z_is_one) {
+  if (BN_cmp(&a->Z, &group->one) == 0) {
    if (!BN_copy(n0, &a->Y)) {
      goto err;
    }
@@ -758,7 +645,6 @@ int ec_GFp_simple_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
  if (!BN_mod_lshift1_quick(&r->Z, n0, p)) {
    goto err;
  }
-  r->Z_is_one = 0;
  /* Z_r = 2 * Y_a * Z_a */

  /* n2 */
@@ -810,7 +696,7 @@ int ec_GFp_simple_invert(const EC_GROUP *group, EC_POINT *point, BN_CTX *ctx) {
 }

 int ec_GFp_simple_is_at_infinity(const EC_GROUP *group, const EC_POINT *point) {
-  return !point->Z_is_one && BN_is_zero(&point->Z);
+  return BN_is_zero(&point->Z);
 }

 int ec_GFp_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point,
@@ -821,7 +707,7 @@ int ec_GFp_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point,
  const BIGNUM *p;
  BN_CTX *new_ctx = NULL;
  BIGNUM *rh, *tmp, *Z4, *Z6;
-  int ret = -1;
+  int ret = 0;

  if (EC_POINT_is_at_infinity(group, point)) {
    return 1;
@@ -834,7 +720,7 @@ int ec_GFp_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point,
  if (ctx == NULL) {
    ctx = new_ctx = BN_CTX_new();
    if (ctx == NULL) {
-      return -1;
+      return 0;
    }
  }

@@ -862,7 +748,7 @@ int ec_GFp_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point,
    goto err;
  }

-  if (!point->Z_is_one) {
+  if (BN_cmp(&point->Z, &group->one) != 0) {
    if (!field_sqr(group, tmp, &point->Z, ctx) ||
        !field_sqr(group, Z4, tmp, ctx) ||
        !field_mul(group, Z6, Z4, tmp, ctx)) {
@@ -891,8 +777,6 @@ int ec_GFp_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point,
      goto err;
    }
  } else {
-    /* point->Z_is_one */
-
    /* rh := (rh + a)*X */
    if (!BN_mod_add_quick(rh, rh, &group->a, p) ||
        !field_mul(group, rh, rh, &point->X, ctx)) {
@@ -941,7 +825,10 @@ int ec_GFp_simple_cmp(const EC_GROUP *group, const EC_POINT *a,
    return 1;
  }

-  if (a->Z_is_one && b->Z_is_one) {
+  int a_Z_is_one = BN_cmp(&a->Z, &group->one) == 0;
+  int b_Z_is_one = BN_cmp(&b->Z, &group->one) == 0;
+
+  if (a_Z_is_one && b_Z_is_one) {
    return ((BN_cmp(&a->X, &b->X) == 0) && BN_cmp(&a->Y, &b->Y) == 0) ? 0 : 1;
  }

@@ -970,7 +857,7 @@ int ec_GFp_simple_cmp(const EC_GROUP *group, const EC_POINT *a,
   *     (X_a*Z_b^2, Y_a*Z_b^3) = (X_b*Z_a^2, Y_b*Z_a^3).
   */

-  if (!b->Z_is_one) {
+  if (!b_Z_is_one) {
    if (!field_sqr(group, Zb23, &b->Z, ctx) ||
        !field_mul(group, tmp1, &a->X, Zb23, ctx)) {
      goto end;
@@ -979,7 +866,7 @@ int ec_GFp_simple_cmp(const EC_GROUP *group, const EC_POINT *a,
  } else {
    tmp1_ = &a->X;
  }
-  if (!a->Z_is_one) {
+  if (!a_Z_is_one) {
    if (!field_sqr(group, Za23, &a->Z, ctx) ||
        !field_mul(group, tmp2, &b->X, Za23, ctx)) {
      goto end;
@@ -996,7 +883,7 @@ int ec_GFp_simple_cmp(const EC_GROUP *group, const EC_POINT *a,
  }


-  if (!b->Z_is_one) {
+  if (!b_Z_is_one) {
    if (!field_mul(group, Zb23, Zb23, &b->Z, ctx) ||
        !field_mul(group, tmp1, &a->Y, Zb23, ctx)) {
      goto end;
@@ -1005,7 +892,7 @@ int ec_GFp_simple_cmp(const EC_GROUP *group, const EC_POINT *a,
  } else {
    tmp1_ = &a->Y;
  }
-  if (!a->Z_is_one) {
+  if (!a_Z_is_one) {
    if (!field_mul(group, Za23, Za23, &a->Z, ctx) ||
        !field_mul(group, tmp2, &b->Y, Za23, ctx)) {
      goto end;
@@ -1036,7 +923,8 @@ int ec_GFp_simple_make_affine(const EC_GROUP *group, EC_POINT *point,
  BIGNUM *x, *y;
  int ret = 0;

-  if (point->Z_is_one || EC_POINT_is_at_infinity(group, point)) {
+  if (BN_cmp(&point->Z, &group->one) == 0 ||
+      EC_POINT_is_at_infinity(group, point)) {
    return 1;
  }

@@ -1058,7 +946,7 @@ int ec_GFp_simple_make_affine(const EC_GROUP *group, EC_POINT *point,
      !EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) {
    goto err;
  }
-  if (!point->Z_is_one) {
+  if (BN_cmp(&point->Z, &group->one) != 0) {
    OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
    goto err;
  }
@@ -1117,14 +1005,8 @@ int ec_GFp_simple_points_make_affine(const EC_GROUP *group, size_t num,
      goto err;
    }
  } else {
-    if (group->meth->field_set_to_one != 0) {
-      if (!group->meth->field_set_to_one(group, prod_Z[0], ctx)) {
-        goto err;
-      }
-    } else {
-      if (!BN_one(prod_Z[0])) {
-        goto err;
-      }
+    if (BN_copy(prod_Z[0], &group->one) == NULL) {
+      goto err;
    }
  }

@@ -1195,16 +1077,9 @@ int ec_GFp_simple_points_make_affine(const EC_GROUP *group, size_t num,
        goto err;
      }

-      if (group->meth->field_set_to_one != NULL) {
-        if (!group->meth->field_set_to_one(group, &p->Z, ctx)) {
-          goto err;
-        }
-      } else {
-        if (!BN_one(&p->Z)) {
-          goto err;
-        }
+      if (BN_copy(&p->Z, &group->one) == NULL) {
+        goto err;
      }
-      p->Z_is_one = 1;
    }
  }

@@ -21,80 +21,6 @@

 #include "internal.h"

-/* Convert an array of points into affine coordinates. (If the point at
- * infinity is found (Z = 0), it remains unchanged.) This function is
- * essentially an equivalent to EC_POINTs_make_affine(), but works with the
- * internal representation of points as used by ecp_nistp###.c rather than
- * with (BIGNUM-based) EC_POINT data structures. point_array is the
- * input/output buffer ('num' points in projective form, i.e. three
- * coordinates each), based on an internal representation of field elements
- * of size 'felem_size'. tmp_felems needs to point to a temporary array of
- * 'num'+1 field elements for storage of intermediate values. */
-void ec_GFp_nistp_points_make_affine_internal(
-    size_t num, void *point_array, size_t felem_size, void *tmp_felems,
-    void (*felem_one)(void *out), int (*felem_is_zero)(const void *in),
-    void (*felem_assign)(void *out, const void *in),
-    void (*felem_square)(void *out, const void *in),
-    void (*felem_mul)(void *out, const void *in1, const void *in2),
-    void (*felem_inv)(void *out, const void *in),
-    void (*felem_contract)(void *out, const void *in)) {
-  int i = 0;
-
-#define tmp_felem(I) (&((char *)tmp_felems)[(I)*felem_size])
-#define X(I) (&((char *)point_array)[3 * (I)*felem_size])
-#define Y(I) (&((char *)point_array)[(3 * (I) + 1) * felem_size])
-#define Z(I) (&((char *)point_array)[(3 * (I) + 2) * felem_size])
-
-  if (!felem_is_zero(Z(0))) {
-    felem_assign(tmp_felem(0), Z(0));
-  } else {
-    felem_one(tmp_felem(0));
-  }
-
-  for (i = 1; i < (int)num; i++) {
-    if (!felem_is_zero(Z(i))) {
-      felem_mul(tmp_felem(i), tmp_felem(i - 1), Z(i));
-    } else {
-      felem_assign(tmp_felem(i), tmp_felem(i - 1));
-    }
-  }
-  /* Now each tmp_felem(i) is the product of Z(0) .. Z(i), skipping any
-   * zero-valued factors: if Z(i) = 0, we essentially pretend that Z(i) = 1. */
-
-  felem_inv(tmp_felem(num - 1), tmp_felem(num - 1));
-  for (i = num - 1; i >= 0; i--) {
-    if (i > 0) {
-      /* tmp_felem(i-1) is the product of Z(0) .. Z(i-1), tmp_felem(i)
-       * is the inverse of the product of Z(0) .. Z(i). */
-      /* 1/Z(i) */
-      felem_mul(tmp_felem(num), tmp_felem(i - 1), tmp_felem(i));
-    } else {
-      felem_assign(tmp_felem(num), tmp_felem(0)); /* 1/Z(0) */
-    }
-
-    if (!felem_is_zero(Z(i))) {
-      if (i > 0) {
-        /* For next iteration, replace tmp_felem(i-1) by its inverse. */
-        felem_mul(tmp_felem(i - 1), tmp_felem(i), Z(i));
-      }
-
-      /* Convert point (X, Y, Z) into affine form (X/(Z^2), Y/(Z^3), 1). */
-      felem_square(Z(i), tmp_felem(num));    /* 1/(Z^2) */
-      felem_mul(X(i), X(i), Z(i));           /* X/(Z^2) */
-      felem_mul(Z(i), Z(i), tmp_felem(num)); /* 1/(Z^3) */
-      felem_mul(Y(i), Y(i), Z(i));           /* Y/(Z^3) */
-      felem_contract(X(i), X(i));
-      felem_contract(Y(i), Y(i));
-      felem_one(Z(i));
-    } else {
-      if (i > 0) {
-        /* For next iteration, replace tmp_felem(i-1) by its inverse. */
-        felem_assign(tmp_felem(i - 1), tmp_felem(i));
-      }
-    }
-  }
-}
-
 /* This function looks at 5+1 scalar bits (5 current, 1 adjacent less
 * significant bit), and recodes them into a signed digit for use in fast point
 * multiplication: the use of signed rather than unsigned digits means that
@@ -220,6 +220,7 @@ int i2d_ECDSA_SIG(const ECDSA_SIG *sig, uint8_t **outp) {
  CBB cbb;
  if (!CBB_init(&cbb, 0) ||
      !ECDSA_SIG_marshal(&cbb, sig)) {
+    CBB_cleanup(&cbb);
    return -1;
  }
  return CBB_finish_i2d(&cbb, outp);
@@ -59,7 +59,7 @@
 #include <openssl/ec.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>
 #include <openssl/rand.h>

 #include "../test/scoped_types.h"
@@ -6,82 +6,87 @@ ASN1,104,BMPSTRING_IS_WRONG_LENGTH
 ASN1,105,BN_LIB
 ASN1,106,BOOLEAN_IS_WRONG_LENGTH
 ASN1,107,BUFFER_TOO_SMALL
-ASN1,108,DECODE_ERROR
-ASN1,109,DEPTH_EXCEEDED
-ASN1,110,ENCODE_ERROR
-ASN1,111,ERROR_GETTING_TIME
-ASN1,112,EXPECTING_AN_ASN1_SEQUENCE
-ASN1,113,EXPECTING_AN_INTEGER
-ASN1,114,EXPECTING_AN_OBJECT
-ASN1,115,EXPECTING_A_BOOLEAN
-ASN1,116,EXPECTING_A_TIME
-ASN1,117,EXPLICIT_LENGTH_MISMATCH
-ASN1,118,EXPLICIT_TAG_NOT_CONSTRUCTED
-ASN1,119,FIELD_MISSING
-ASN1,120,FIRST_NUM_TOO_LARGE
-ASN1,121,HEADER_TOO_LONG
-ASN1,122,ILLEGAL_BITSTRING_FORMAT
-ASN1,123,ILLEGAL_BOOLEAN
-ASN1,124,ILLEGAL_CHARACTERS
-ASN1,125,ILLEGAL_FORMAT
-ASN1,126,ILLEGAL_HEX
-ASN1,127,ILLEGAL_IMPLICIT_TAG
-ASN1,128,ILLEGAL_INTEGER
-ASN1,129,ILLEGAL_NESTED_TAGGING
-ASN1,130,ILLEGAL_NULL
-ASN1,131,ILLEGAL_NULL_VALUE
-ASN1,132,ILLEGAL_OBJECT
-ASN1,133,ILLEGAL_OPTIONAL_ANY
-ASN1,134,ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE
-ASN1,135,ILLEGAL_TAGGED_ANY
-ASN1,136,ILLEGAL_TIME_VALUE
-ASN1,137,INTEGER_NOT_ASCII_FORMAT
-ASN1,138,INTEGER_TOO_LARGE_FOR_LONG
-ASN1,139,INVALID_BIT_STRING_BITS_LEFT
-ASN1,140,INVALID_BMPSTRING_LENGTH
-ASN1,141,INVALID_DIGIT
-ASN1,142,INVALID_MODIFIER
-ASN1,143,INVALID_NUMBER
-ASN1,144,INVALID_OBJECT_ENCODING
-ASN1,145,INVALID_SEPARATOR
-ASN1,146,INVALID_TIME_FORMAT
-ASN1,147,INVALID_UNIVERSALSTRING_LENGTH
-ASN1,148,INVALID_UTF8STRING
-ASN1,149,LIST_ERROR
-ASN1,150,MISSING_ASN1_EOS
-ASN1,151,MISSING_EOC
-ASN1,152,MISSING_SECOND_NUMBER
-ASN1,153,MISSING_VALUE
-ASN1,154,MSTRING_NOT_UNIVERSAL
-ASN1,155,MSTRING_WRONG_TAG
-ASN1,156,NESTED_ASN1_ERROR
-ASN1,157,NESTED_ASN1_STRING
-ASN1,158,NON_HEX_CHARACTERS
-ASN1,159,NOT_ASCII_FORMAT
-ASN1,160,NOT_ENOUGH_DATA
-ASN1,161,NO_MATCHING_CHOICE_TYPE
-ASN1,162,NULL_IS_WRONG_LENGTH
-ASN1,163,OBJECT_NOT_ASCII_FORMAT
-ASN1,164,ODD_NUMBER_OF_CHARS
-ASN1,165,SECOND_NUMBER_TOO_LARGE
-ASN1,166,SEQUENCE_LENGTH_MISMATCH
-ASN1,167,SEQUENCE_NOT_CONSTRUCTED
-ASN1,168,SEQUENCE_OR_SET_NEEDS_CONFIG
-ASN1,169,SHORT_LINE
-ASN1,170,STREAMING_NOT_SUPPORTED
-ASN1,171,STRING_TOO_LONG
-ASN1,172,STRING_TOO_SHORT
-ASN1,173,TAG_VALUE_TOO_HIGH
-ASN1,174,TIME_NOT_ASCII_FORMAT
-ASN1,175,TOO_LONG
-ASN1,176,TYPE_NOT_CONSTRUCTED
-ASN1,177,TYPE_NOT_PRIMITIVE
-ASN1,178,UNEXPECTED_EOC
-ASN1,179,UNIVERSALSTRING_IS_WRONG_LENGTH
-ASN1,180,UNKNOWN_FORMAT
-ASN1,181,UNKNOWN_TAG
-ASN1,182,UNSUPPORTED_ANY_DEFINED_BY_TYPE
-ASN1,183,UNSUPPORTED_PUBLIC_KEY_TYPE
-ASN1,184,UNSUPPORTED_TYPE
-ASN1,185,WRONG_TAG
-ASN1,186,WRONG_TYPE
+ASN1,108,CONTEXT_NOT_INITIALISED
+ASN1,109,DECODE_ERROR
+ASN1,110,DEPTH_EXCEEDED
+ASN1,111,DIGEST_AND_KEY_TYPE_NOT_SUPPORTED
+ASN1,112,ENCODE_ERROR
+ASN1,113,ERROR_GETTING_TIME
+ASN1,114,EXPECTING_AN_ASN1_SEQUENCE
+ASN1,115,EXPECTING_AN_INTEGER
+ASN1,116,EXPECTING_AN_OBJECT
+ASN1,117,EXPECTING_A_BOOLEAN
+ASN1,118,EXPECTING_A_TIME
+ASN1,119,EXPLICIT_LENGTH_MISMATCH
+ASN1,120,EXPLICIT_TAG_NOT_CONSTRUCTED
+ASN1,121,FIELD_MISSING
+ASN1,122,FIRST_NUM_TOO_LARGE
+ASN1,123,HEADER_TOO_LONG
+ASN1,124,ILLEGAL_BITSTRING_FORMAT
+ASN1,125,ILLEGAL_BOOLEAN
+ASN1,126,ILLEGAL_CHARACTERS
+ASN1,127,ILLEGAL_FORMAT
+ASN1,128,ILLEGAL_HEX
+ASN1,129,ILLEGAL_IMPLICIT_TAG
+ASN1,130,ILLEGAL_INTEGER
+ASN1,131,ILLEGAL_NESTED_TAGGING
+ASN1,132,ILLEGAL_NULL
+ASN1,133,ILLEGAL_NULL_VALUE
+ASN1,134,ILLEGAL_OBJECT
+ASN1,135,ILLEGAL_OPTIONAL_ANY
+ASN1,136,ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE
+ASN1,137,ILLEGAL_TAGGED_ANY
+ASN1,138,ILLEGAL_TIME_VALUE
+ASN1,139,INTEGER_NOT_ASCII_FORMAT
+ASN1,140,INTEGER_TOO_LARGE_FOR_LONG
+ASN1,141,INVALID_BIT_STRING_BITS_LEFT
+ASN1,142,INVALID_BMPSTRING_LENGTH
+ASN1,143,INVALID_DIGIT
+ASN1,144,INVALID_MODIFIER
+ASN1,145,INVALID_NUMBER
+ASN1,146,INVALID_OBJECT_ENCODING
+ASN1,147,INVALID_SEPARATOR
+ASN1,148,INVALID_TIME_FORMAT
+ASN1,149,INVALID_UNIVERSALSTRING_LENGTH
+ASN1,150,INVALID_UTF8STRING
+ASN1,151,LIST_ERROR
+ASN1,152,MISSING_ASN1_EOS
+ASN1,153,MISSING_EOC
+ASN1,154,MISSING_SECOND_NUMBER
+ASN1,155,MISSING_VALUE
+ASN1,156,MSTRING_NOT_UNIVERSAL
+ASN1,157,MSTRING_WRONG_TAG
+ASN1,158,NESTED_ASN1_ERROR
+ASN1,159,NESTED_ASN1_STRING
+ASN1,160,NON_HEX_CHARACTERS
+ASN1,161,NOT_ASCII_FORMAT
+ASN1,162,NOT_ENOUGH_DATA
+ASN1,163,NO_MATCHING_CHOICE_TYPE
+ASN1,164,NULL_IS_WRONG_LENGTH
+ASN1,165,OBJECT_NOT_ASCII_FORMAT
+ASN1,166,ODD_NUMBER_OF_CHARS
+ASN1,167,SECOND_NUMBER_TOO_LARGE
+ASN1,168,SEQUENCE_LENGTH_MISMATCH
+ASN1,169,SEQUENCE_NOT_CONSTRUCTED
+ASN1,170,SEQUENCE_OR_SET_NEEDS_CONFIG
+ASN1,171,SHORT_LINE
+ASN1,172,STREAMING_NOT_SUPPORTED
+ASN1,173,STRING_TOO_LONG
+ASN1,174,STRING_TOO_SHORT
+ASN1,175,TAG_VALUE_TOO_HIGH
+ASN1,176,TIME_NOT_ASCII_FORMAT
+ASN1,177,TOO_LONG
+ASN1,178,TYPE_NOT_CONSTRUCTED
+ASN1,179,TYPE_NOT_PRIMITIVE
+ASN1,180,UNEXPECTED_EOC
+ASN1,181,UNIVERSALSTRING_IS_WRONG_LENGTH
+ASN1,182,UNKNOWN_FORMAT
+ASN1,183,UNKNOWN_MESSAGE_DIGEST_ALGORITHM
+ASN1,184,UNKNOWN_SIGNATURE_ALGORITHM
+ASN1,185,UNKNOWN_TAG
+ASN1,186,UNSUPPORTED_ANY_DEFINED_BY_TYPE
+ASN1,187,UNSUPPORTED_PUBLIC_KEY_TYPE
+ASN1,188,UNSUPPORTED_TYPE
+ASN1,189,WRONG_PUBLIC_KEY_TYPE
+ASN1,190,WRONG_TAG
+ASN1,191,WRONG_TYPE
@@ -1,47 +1,30 @@
-EVP,151,BN_DECODE_ERROR
 EVP,100,BUFFER_TOO_SMALL
 EVP,101,COMMAND_NOT_SUPPORTED
-EVP,146,CONTEXT_NOT_INITIALISED
-EVP,143,DECODE_ERROR
-EVP,104,DIFFERENT_KEY_TYPES
-EVP,105,DIFFERENT_PARAMETERS
-EVP,147,DIGEST_AND_KEY_TYPE_NOT_SUPPORTED
-EVP,155,ENCODE_ERROR
-EVP,107,EXPECTING_AN_EC_KEY_KEY
-EVP,141,EXPECTING_AN_RSA_KEY
-EVP,109,EXPECTING_A_DH_KEY
-EVP,110,EXPECTING_A_DSA_KEY
-EVP,111,ILLEGAL_OR_UNSUPPORTED_PADDING_MODE
-EVP,112,INVALID_CURVE
-EVP,113,INVALID_DIGEST_LENGTH
-EVP,114,INVALID_DIGEST_TYPE
-EVP,115,INVALID_KEYBITS
-EVP,116,INVALID_MGF1_MD
-EVP,142,INVALID_OPERATION
-EVP,118,INVALID_PADDING_MODE
-EVP,119,INVALID_PSS_PARAMETERS
-EVP,144,INVALID_PSS_SALTLEN
-EVP,121,INVALID_SALT_LENGTH
-EVP,122,INVALID_TRAILER
-EVP,123,KEYS_NOT_SET
-EVP,124,MISSING_PARAMETERS
-EVP,125,NO_DEFAULT_DIGEST
-EVP,126,NO_KEY_SET
-EVP,127,NO_MDC2_SUPPORT
-EVP,128,NO_NID_FOR_CURVE
-EVP,129,NO_OPERATION_SET
-EVP,130,NO_PARAMETERS_SET
-EVP,131,OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE
-EVP,132,OPERATON_NOT_INITIALIZED
-EVP,152,PARAMETER_ENCODING_ERROR
-EVP,133,UNKNOWN_DIGEST
-EVP,134,UNKNOWN_MASK_DIGEST
-EVP,150,UNKNOWN_MESSAGE_DIGEST_ALGORITHM
-EVP,145,UNKNOWN_PUBLIC_KEY_TYPE
-EVP,149,UNKNOWN_SIGNATURE_ALGORITHM
-EVP,138,UNSUPPORTED_ALGORITHM
-EVP,139,UNSUPPORTED_MASK_ALGORITHM
-EVP,140,UNSUPPORTED_MASK_PARAMETER
-EVP,153,UNSUPPORTED_PUBLIC_KEY_TYPE
-EVP,154,UNSUPPORTED_SIGNATURE_TYPE
-EVP,148,WRONG_PUBLIC_KEY_TYPE
+EVP,102,DECODE_ERROR
+EVP,103,DIFFERENT_KEY_TYPES
+EVP,104,DIFFERENT_PARAMETERS
+EVP,105,ENCODE_ERROR
+EVP,106,EXPECTING_AN_EC_KEY_KEY
+EVP,107,EXPECTING_AN_RSA_KEY
+EVP,108,EXPECTING_A_DSA_KEY
+EVP,109,ILLEGAL_OR_UNSUPPORTED_PADDING_MODE
+EVP,110,INVALID_DIGEST_LENGTH
+EVP,111,INVALID_DIGEST_TYPE
+EVP,112,INVALID_KEYBITS
+EVP,113,INVALID_MGF1_MD
+EVP,114,INVALID_OPERATION
+EVP,115,INVALID_PADDING_MODE
+EVP,116,INVALID_PSS_SALTLEN
+EVP,117,KEYS_NOT_SET
+EVP,118,MISSING_PARAMETERS
+EVP,119,NO_DEFAULT_DIGEST
+EVP,120,NO_KEY_SET
+EVP,121,NO_MDC2_SUPPORT
+EVP,122,NO_NID_FOR_CURVE
+EVP,123,NO_OPERATION_SET
+EVP,124,NO_PARAMETERS_SET
+EVP,125,OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE
+EVP,126,OPERATON_NOT_INITIALIZED
+EVP,127,UNKNOWN_PUBLIC_KEY_TYPE
+EVP,128,UNSUPPORTED_ALGORITHM
+EVP,129,UNSUPPORTED_PUBLIC_KEY_TYPE
@@ -1,46 +1,46 @@
-RSA,143,BAD_ENCODING
-RSA,100,BAD_E_VALUE
-RSA,101,BAD_FIXED_HEADER_DECRYPT
-RSA,102,BAD_PAD_BYTE_COUNT
-RSA,103,BAD_RSA_PARAMETERS
-RSA,104,BAD_SIGNATURE
-RSA,145,BAD_VERSION
-RSA,105,BLOCK_TYPE_IS_NOT_01
-RSA,106,BN_NOT_INITIALIZED
-RSA,142,CANNOT_RECOVER_MULTI_PRIME_KEY
-RSA,107,CRT_PARAMS_ALREADY_GIVEN
-RSA,108,CRT_VALUES_INCORRECT
-RSA,109,DATA_LEN_NOT_EQUAL_TO_MOD_LEN
-RSA,110,DATA_TOO_LARGE
-RSA,111,DATA_TOO_LARGE_FOR_KEY_SIZE
-RSA,112,DATA_TOO_LARGE_FOR_MODULUS
-RSA,113,DATA_TOO_SMALL
-RSA,114,DATA_TOO_SMALL_FOR_KEY_SIZE
-RSA,115,DIGEST_TOO_BIG_FOR_RSA_KEY
-RSA,116,D_E_NOT_CONGRUENT_TO_1
-RSA,117,EMPTY_PUBLIC_KEY
-RSA,144,ENCODE_ERROR
-RSA,118,FIRST_OCTET_INVALID
-RSA,119,INCONSISTENT_SET_OF_CRT_VALUES
-RSA,120,INTERNAL_ERROR
-RSA,121,INVALID_MESSAGE_LENGTH
-RSA,122,KEY_SIZE_TOO_SMALL
-RSA,123,LAST_OCTET_INVALID
-RSA,124,MODULUS_TOO_LARGE
-RSA,141,MUST_HAVE_AT_LEAST_TWO_PRIMES
-RSA,125,NO_PUBLIC_EXPONENT
-RSA,126,NULL_BEFORE_BLOCK_MISSING
-RSA,127,N_NOT_EQUAL_P_Q
-RSA,128,OAEP_DECODING_ERROR
-RSA,129,ONLY_ONE_OF_P_Q_GIVEN
-RSA,130,OUTPUT_BUFFER_TOO_SMALL
-RSA,131,PADDING_CHECK_FAILED
-RSA,132,PKCS_DECODING_ERROR
-RSA,133,SLEN_CHECK_FAILED
-RSA,134,SLEN_RECOVERY_FAILED
-RSA,135,TOO_LONG
-RSA,136,TOO_MANY_ITERATIONS
-RSA,137,UNKNOWN_ALGORITHM_TYPE
-RSA,138,UNKNOWN_PADDING_TYPE
-RSA,139,VALUE_MISSING
-RSA,140,WRONG_SIGNATURE_LENGTH
+RSA,100,BAD_ENCODING
+RSA,101,BAD_E_VALUE
+RSA,102,BAD_FIXED_HEADER_DECRYPT
+RSA,103,BAD_PAD_BYTE_COUNT
+RSA,104,BAD_RSA_PARAMETERS
+RSA,105,BAD_SIGNATURE
+RSA,106,BAD_VERSION
+RSA,107,BLOCK_TYPE_IS_NOT_01
+RSA,108,BN_NOT_INITIALIZED
+RSA,109,CANNOT_RECOVER_MULTI_PRIME_KEY
+RSA,110,CRT_PARAMS_ALREADY_GIVEN
+RSA,111,CRT_VALUES_INCORRECT
+RSA,112,DATA_LEN_NOT_EQUAL_TO_MOD_LEN
+RSA,113,DATA_TOO_LARGE
+RSA,114,DATA_TOO_LARGE_FOR_KEY_SIZE
+RSA,115,DATA_TOO_LARGE_FOR_MODULUS
+RSA,116,DATA_TOO_SMALL
+RSA,117,DATA_TOO_SMALL_FOR_KEY_SIZE
+RSA,118,DIGEST_TOO_BIG_FOR_RSA_KEY
+RSA,119,D_E_NOT_CONGRUENT_TO_1
+RSA,120,EMPTY_PUBLIC_KEY
+RSA,121,ENCODE_ERROR
+RSA,122,FIRST_OCTET_INVALID
+RSA,123,INCONSISTENT_SET_OF_CRT_VALUES
+RSA,124,INTERNAL_ERROR
+RSA,125,INVALID_MESSAGE_LENGTH
+RSA,126,KEY_SIZE_TOO_SMALL
+RSA,127,LAST_OCTET_INVALID
+RSA,128,MODULUS_TOO_LARGE
+RSA,129,MUST_HAVE_AT_LEAST_TWO_PRIMES
+RSA,130,NO_PUBLIC_EXPONENT
+RSA,131,NULL_BEFORE_BLOCK_MISSING
+RSA,132,N_NOT_EQUAL_P_Q
+RSA,133,OAEP_DECODING_ERROR
+RSA,134,ONLY_ONE_OF_P_Q_GIVEN
+RSA,135,OUTPUT_BUFFER_TOO_SMALL
+RSA,136,PADDING_CHECK_FAILED
+RSA,137,PKCS_DECODING_ERROR
+RSA,138,SLEN_CHECK_FAILED
+RSA,139,SLEN_RECOVERY_FAILED
+RSA,140,TOO_LONG
+RSA,141,TOO_MANY_ITERATIONS
+RSA,142,UNKNOWN_ALGORITHM_TYPE
+RSA,143,UNKNOWN_PADDING_TYPE
+RSA,144,VALUE_MISSING
+RSA,145,WRONG_SIGNATURE_LENGTH
@@ -108,6 +108,7 @@ SSL,206,SCSV_RECEIVED_WHEN_RENEGOTIATING
 SSL,207,SERVERHELLO_TLSEXT
 SSL,208,SESSION_ID_CONTEXT_UNINITIALIZED
 SSL,209,SESSION_MAY_NOT_BE_CREATED
+SSL,250,SHUTDOWN_WHILE_IN_INIT
 SSL,210,SIGNATURE_ALGORITHMS_EXTENSION_SENT_BY_SERVER
 SSL,211,SRTP_COULD_NOT_ALLOCATE_PROFILES
 SSL,212,SRTP_UNKNOWN_PROTECTION_PROFILE
@@ -10,28 +10,26 @@ X509,108,IDP_MISMATCH
 X509,109,INVALID_BIT_STRING_BITS_LEFT
 X509,110,INVALID_DIRECTORY
 X509,111,INVALID_FIELD_NAME
-X509,112,INVALID_TRUST
-X509,113,ISSUER_MISMATCH
-X509,114,KEY_TYPE_MISMATCH
-X509,115,KEY_VALUES_MISMATCH
-X509,116,LOADING_CERT_DIR
-X509,117,LOADING_DEFAULTS
-X509,118,METHOD_NOT_SUPPORTED
+X509,112,INVALID_PSS_PARAMETERS
+X509,113,INVALID_TRUST
+X509,114,ISSUER_MISMATCH
+X509,115,KEY_TYPE_MISMATCH
+X509,116,KEY_VALUES_MISMATCH
+X509,117,LOADING_CERT_DIR
+X509,118,LOADING_DEFAULTS
 X509,119,NEWER_CRL_NOT_NEWER
 X509,120,NOT_PKCS7_SIGNED_DATA
 X509,121,NO_CERTIFICATES_INCLUDED
 X509,122,NO_CERT_SET_FOR_US_TO_VERIFY
-X509,136,NO_CRLS_INCLUDED
-X509,123,NO_CRL_NUMBER
-X509,124,PUBLIC_KEY_DECODE_ERROR
-X509,125,PUBLIC_KEY_ENCODE_ERROR
-X509,126,SHOULD_RETRY
-X509,127,UNABLE_TO_FIND_PARAMETERS_IN_CHAIN
-X509,128,UNABLE_TO_GET_CERTS_PUBLIC_KEY
-X509,129,UNKNOWN_KEY_TYPE
-X509,130,UNKNOWN_NID
-X509,131,UNKNOWN_PURPOSE_ID
-X509,132,UNKNOWN_TRUST_ID
-X509,133,UNSUPPORTED_ALGORITHM
-X509,134,WRONG_LOOKUP_TYPE
-X509,135,WRONG_TYPE
+X509,123,NO_CRLS_INCLUDED
+X509,124,NO_CRL_NUMBER
+X509,125,PUBLIC_KEY_DECODE_ERROR
+X509,126,PUBLIC_KEY_ENCODE_ERROR
+X509,127,SHOULD_RETRY
+X509,128,UNKNOWN_KEY_TYPE
+X509,129,UNKNOWN_NID
+X509,130,UNKNOWN_PURPOSE_ID
+X509,131,UNKNOWN_TRUST_ID
+X509,132,UNSUPPORTED_ALGORITHM
+X509,133,WRONG_LOOKUP_TYPE
+X509,134,WRONG_TYPE
@@ -5,7 +5,6 @@ add_library(

  OBJECT

-  algorithm.c
  digestsign.c
  evp.c
  evp_asn1.c
@@ -16,6 +15,7 @@ add_library(
  p_rsa.c
  p_rsa_asn1.c
  pbkdf.c
+  print.c
  sign.c
 )

@@ -59,12 +59,11 @@
 #include <assert.h>
 #include <string.h>

-#include <openssl/bio.h>
 #include <openssl/dsa.h>
 #include <openssl/ec.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>
 #include <openssl/rsa.h>
 #include <openssl/thread.h>

@@ -195,8 +194,10 @@ int EVP_PKEY_id(const EVP_PKEY *pkey) {
  return pkey->type;
 }

-/* TODO(fork): remove the first argument. */
-const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find(ENGINE **pengine, int nid) {
+/* evp_pkey_asn1_find returns the ASN.1 method table for the given |nid|, which
+ * should be one of the |EVP_PKEY_*| values. It returns NULL if |nid| is
+ * unknown. */
+static const EVP_PKEY_ASN1_METHOD *evp_pkey_asn1_find(int nid) {
  switch (nid) {
    case EVP_PKEY_RSA:
      return &rsa_asn1_meth;
@@ -210,7 +211,7 @@ const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find(ENGINE **pengine, int nid) {
 }

 int EVP_PKEY_type(int nid) {
-  const EVP_PKEY_ASN1_METHOD *meth = EVP_PKEY_asn1_find(NULL, nid);
+  const EVP_PKEY_ASN1_METHOD *meth = evp_pkey_asn1_find(nid);
  if (meth == NULL) {
    return NID_undef;
  }
@@ -309,21 +310,6 @@ int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key) {
  return key != NULL;
 }

-const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find_str(ENGINE **pengine,
-                                                   const char *name,
-                                                   size_t len) {
-  if (len == 3 && memcmp(name, "RSA", 3) == 0) {
-    return &rsa_asn1_meth;
-  }
-  if (len == 2 && memcmp(name, "EC", 2) == 0) {
-    return &ec_asn1_meth;
-  }
-  if (len == 3 && memcmp(name, "DSA", 3) == 0) {
-    return &dsa_asn1_meth;
-  }
-  return NULL;
-}
-
 int EVP_PKEY_set_type(EVP_PKEY *pkey, int type) {
  const EVP_PKEY_ASN1_METHOD *ameth;

@@ -331,10 +317,10 @@ int EVP_PKEY_set_type(EVP_PKEY *pkey, int type) {
    free_it(pkey);
  }

-  ameth = EVP_PKEY_asn1_find(NULL, type);
+  ameth = evp_pkey_asn1_find(type);
  if (ameth == NULL) {
    OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM);
-    ERR_add_error_dataf("algorithm %d (%s)", type, OBJ_nid2sn(type));
+    ERR_add_error_dataf("algorithm %d", type);
    return 0;
  }

@@ -358,41 +344,6 @@ int EVP_PKEY_cmp_parameters(const EVP_PKEY *a, const EVP_PKEY *b) {
  return -2;
 }

-static int print_unsupported(BIO *out, const EVP_PKEY *pkey, int indent,
-                             const char *kstr) {
-  BIO_indent(out, indent, 128);
-  BIO_printf(out, "%s algorithm \"%s\" unsupported\n", kstr,
-             OBJ_nid2ln(pkey->type));
-  return 1;
-}
-
-int EVP_PKEY_print_public(BIO *out, const EVP_PKEY *pkey, int indent,
-                          ASN1_PCTX *pctx) {
-  if (pkey->ameth && pkey->ameth->pub_print) {
-    return pkey->ameth->pub_print(out, pkey, indent, pctx);
-  }
-
-  return print_unsupported(out, pkey, indent, "Public Key");
-}
-
-int EVP_PKEY_print_private(BIO *out, const EVP_PKEY *pkey, int indent,
-                           ASN1_PCTX *pctx) {
-  if (pkey->ameth && pkey->ameth->priv_print) {
-    return pkey->ameth->priv_print(out, pkey, indent, pctx);
-  }
-
-  return print_unsupported(out, pkey, indent, "Private Key");
-}
-
-int EVP_PKEY_print_params(BIO *out, const EVP_PKEY *pkey, int indent,
-                          ASN1_PCTX *pctx) {
-  if (pkey->ameth && pkey->ameth->param_print) {
-    return pkey->ameth->param_print(out, pkey, indent, pctx);
-  }
-
-  return print_unsupported(out, pkey, indent, "Parameters");
-}
-
 int EVP_PKEY_CTX_set_signature_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) {
  return EVP_PKEY_CTX_ctrl(ctx, -1, EVP_PKEY_OP_TYPE_SIG, EVP_PKEY_CTRL_MD, 0,
                           (void *)md);
@@ -56,22 +56,50 @@

 #include <openssl/evp.h>

-#include <openssl/asn1.h>
+#include <string.h>
+
 #include <openssl/bytestring.h>
+#include <openssl/dsa.h>
+#include <openssl/ec_key.h>
 #include <openssl/err.h>
-#include <openssl/obj.h>
-#include <openssl/x509.h>
+#include <openssl/rsa.h>

 #include "internal.h"


+static const EVP_PKEY_ASN1_METHOD *const kASN1Methods[] = {
+    &rsa_asn1_meth,
+    &ec_asn1_meth,
+    &dsa_asn1_meth,
+};
+
+static int parse_key_type(CBS *cbs, int *out_type) {
+  CBS oid;
+  if (!CBS_get_asn1(cbs, &oid, CBS_ASN1_OBJECT)) {
+    return 0;
+  }
+
+  unsigned i;
+  for (i = 0; i < sizeof(kASN1Methods)/sizeof(kASN1Methods[0]); i++) {
+    const EVP_PKEY_ASN1_METHOD *method = kASN1Methods[i];
+    if (CBS_len(&oid) == method->oid_len &&
+        memcmp(CBS_data(&oid), method->oid, method->oid_len) == 0) {
+      *out_type = method->pkey_id;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
 EVP_PKEY *EVP_parse_public_key(CBS *cbs) {
  /* Parse the SubjectPublicKeyInfo. */
-  CBS spki, algorithm, oid, key;
+  CBS spki, algorithm, key;
+  int type;
  uint8_t padding;
  if (!CBS_get_asn1(cbs, &spki, CBS_ASN1_SEQUENCE) ||
      !CBS_get_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) ||
-      !CBS_get_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) ||
+      !parse_key_type(&algorithm, &type) ||
      !CBS_get_asn1(&spki, &key, CBS_ASN1_BITSTRING) ||
      CBS_len(&spki) != 0 ||
      /* Every key type defined encodes the key as a byte string with the same
@@ -85,7 +113,7 @@ EVP_PKEY *EVP_parse_public_key(CBS *cbs) {
  /* Set up an |EVP_PKEY| of the appropriate type. */
  EVP_PKEY *ret = EVP_PKEY_new();
  if (ret == NULL ||
-      !EVP_PKEY_set_type(ret, OBJ_cbs2nid(&oid))) {
+      !EVP_PKEY_set_type(ret, type)) {
    goto err;
  }

@@ -106,7 +134,7 @@ err:
 }

 int EVP_marshal_public_key(CBB *cbb, const EVP_PKEY *key) {
-  if (key->ameth->pub_encode == NULL) {
+  if (key->ameth == NULL || key->ameth->pub_encode == NULL) {
    OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM);
    return 0;
  }
@@ -116,13 +144,14 @@ int EVP_marshal_public_key(CBB *cbb, const EVP_PKEY *key) {

 EVP_PKEY *EVP_parse_private_key(CBS *cbs) {
  /* Parse the PrivateKeyInfo. */
-  CBS pkcs8, algorithm, oid, key;
+  CBS pkcs8, algorithm, key;
  uint64_t version;
+  int type;
  if (!CBS_get_asn1(cbs, &pkcs8, CBS_ASN1_SEQUENCE) ||
      !CBS_get_asn1_uint64(&pkcs8, &version) ||
      version != 0 ||
      !CBS_get_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) ||
-      !CBS_get_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) ||
+      !parse_key_type(&algorithm, &type) ||
      !CBS_get_asn1(&pkcs8, &key, CBS_ASN1_OCTETSTRING)) {
    OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR);
    return NULL;
@@ -133,7 +162,7 @@ EVP_PKEY *EVP_parse_private_key(CBS *cbs) {
  /* Set up an |EVP_PKEY| of the appropriate type. */
  EVP_PKEY *ret = EVP_PKEY_new();
  if (ret == NULL ||
-      !EVP_PKEY_set_type(ret, OBJ_cbs2nid(&oid))) {
+      !EVP_PKEY_set_type(ret, type)) {
    goto err;
  }

@@ -154,7 +183,7 @@ err:
 }

 int EVP_marshal_private_key(CBB *cbb, const EVP_PKEY *key) {
-  if (key->ameth->priv_encode == NULL) {
+  if (key->ameth == NULL || key->ameth->priv_encode == NULL) {
    OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM);
    return 0;
  }
@@ -162,107 +191,134 @@ int EVP_marshal_private_key(CBB *cbb, const EVP_PKEY *key) {
  return key->ameth->priv_encode(cbb, key);
 }

+static EVP_PKEY *old_priv_decode(CBS *cbs, int type) {
+  EVP_PKEY *ret = EVP_PKEY_new();
+  if (ret == NULL) {
+    return NULL;
+  }
+
+  switch (type) {
+    case EVP_PKEY_EC: {
+      EC_KEY *ec_key = EC_KEY_parse_private_key(cbs, NULL);
+      if (ec_key == NULL || !EVP_PKEY_assign_EC_KEY(ret, ec_key)) {
+        EC_KEY_free(ec_key);
+        goto err;
+      }
+      return ret;
+    }
+    case EVP_PKEY_DSA: {
+      DSA *dsa = DSA_parse_private_key(cbs);
+      if (dsa == NULL || !EVP_PKEY_assign_DSA(ret, dsa)) {
+        DSA_free(dsa);
+        goto err;
+      }
+      return ret;
+    }
+    case EVP_PKEY_RSA: {
+      RSA *rsa = RSA_parse_private_key(cbs);
+      if (rsa == NULL || !EVP_PKEY_assign_RSA(ret, rsa)) {
+        RSA_free(rsa);
+        goto err;
+      }
+      return ret;
+    }
+    default:
+      OPENSSL_PUT_ERROR(EVP, EVP_R_UNKNOWN_PUBLIC_KEY_TYPE);
+      goto err;
+  }
+
+err:
+  EVP_PKEY_free(ret);
+  return NULL;
+}
+
 EVP_PKEY *d2i_PrivateKey(int type, EVP_PKEY **out, const uint8_t **inp,
                         long len) {
-  EVP_PKEY *ret;
+  if (len < 0) {
+    OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR);
+    return NULL;
+  }

-  if (out == NULL || *out == NULL) {
-    ret = EVP_PKEY_new();
+  /* Parse with the legacy format. */
+  CBS cbs;
+  CBS_init(&cbs, *inp, (size_t)len);
+  EVP_PKEY *ret = old_priv_decode(&cbs, type);
+  if (ret == NULL) {
+    /* Try again with PKCS#8. */
+    ERR_clear_error();
+    CBS_init(&cbs, *inp, (size_t)len);
+    ret = EVP_parse_private_key(&cbs);
    if (ret == NULL) {
-      OPENSSL_PUT_ERROR(EVP, ERR_R_EVP_LIB);
      return NULL;
    }
-  } else {
-    ret = *out;
-  }
-
-  if (!EVP_PKEY_set_type(ret, type)) {
-    OPENSSL_PUT_ERROR(EVP, EVP_R_UNKNOWN_PUBLIC_KEY_TYPE);
-    goto err;
-  }
-
-  const uint8_t *in = *inp;
-  /* If trying to remove |old_priv_decode|, note that some code depends on this
-   * function writing into |*out| and the |priv_decode| path doesn't support
-   * that. */
-  if (!ret->ameth->old_priv_decode ||
-      !ret->ameth->old_priv_decode(ret, &in, len)) {
-    if (ret->ameth->priv_decode) {
-      /* Reset |in| in case |old_priv_decode| advanced it on error. */
-      in = *inp;
-      PKCS8_PRIV_KEY_INFO *p8 = d2i_PKCS8_PRIV_KEY_INFO(NULL, &in, len);
-      if (!p8) {
-        goto err;
-      }
+    if (ret->type != type) {
+      OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES);
      EVP_PKEY_free(ret);
-      ret = EVP_PKCS82PKEY(p8);
-      PKCS8_PRIV_KEY_INFO_free(p8);
-      if (ret == NULL) {
-        goto err;
-      }
-    } else {
-      OPENSSL_PUT_ERROR(EVP, ERR_R_ASN1_LIB);
-      goto err;
+      return NULL;
    }
  }

  if (out != NULL) {
+    EVP_PKEY_free(*out);
    *out = ret;
  }
-  *inp = in;
+  *inp = CBS_data(&cbs);
  return ret;
+}

-err:
-  if (out == NULL || *out != ret) {
-    EVP_PKEY_free(ret);
+/* num_elements parses one SEQUENCE from |in| and returns the number of elements
+ * in it. On parse error, it returns zero. */
+static size_t num_elements(const uint8_t *in, size_t in_len) {
+  CBS cbs, sequence;
+  CBS_init(&cbs, in, (size_t)in_len);
+
+  if (!CBS_get_asn1(&cbs, &sequence, CBS_ASN1_SEQUENCE)) {
+    return 0;
  }
-  return NULL;
+
+  size_t count = 0;
+  while (CBS_len(&sequence) > 0) {
+    if (!CBS_get_any_asn1_element(&sequence, NULL, NULL, NULL)) {
+      return 0;
+    }
+
+    count++;
+  }
+
+  return count;
 }

 EVP_PKEY *d2i_AutoPrivateKey(EVP_PKEY **out, const uint8_t **inp, long len) {
-  STACK_OF(ASN1_TYPE) *inkey;
-  const uint8_t *p;
-  int keytype;
-  p = *inp;
-
-  /* Dirty trick: read in the ASN1 data into out STACK_OF(ASN1_TYPE):
-   * by analyzing it we can determine the passed structure: this
-   * assumes the input is surrounded by an ASN1 SEQUENCE. */
-  inkey = d2i_ASN1_SEQUENCE_ANY(NULL, &p, len);
-  /* Since we only need to discern "traditional format" RSA and DSA
-   * keys we can just count the elements. */
-  if (sk_ASN1_TYPE_num(inkey) == 6) {
-    keytype = EVP_PKEY_DSA;
-  } else if (sk_ASN1_TYPE_num(inkey) == 4) {
-    keytype = EVP_PKEY_EC;
-  } else if (sk_ASN1_TYPE_num(inkey) == 3) {
-    /* This seems to be PKCS8, not traditional format */
-    p = *inp;
-    PKCS8_PRIV_KEY_INFO *p8 = d2i_PKCS8_PRIV_KEY_INFO(NULL, &p, len);
-    EVP_PKEY *ret;
-
-    sk_ASN1_TYPE_pop_free(inkey, ASN1_TYPE_free);
-    if (!p8) {
-      OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_PUBLIC_KEY_TYPE);
-      return NULL;
-    }
-    ret = EVP_PKCS82PKEY(p8);
-    PKCS8_PRIV_KEY_INFO_free(p8);
-    if (ret == NULL) {
-      return NULL;
-    }
-
-    *inp = p;
-    if (out) {
-      *out = ret;
-    }
-    return ret;
-  } else {
-    keytype = EVP_PKEY_RSA;
+  if (len < 0) {
+    OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR);
+    return NULL;
  }

-  sk_ASN1_TYPE_pop_free(inkey, ASN1_TYPE_free);
-  return d2i_PrivateKey(keytype, out, inp, len);
+  /* Parse the input as a PKCS#8 PrivateKeyInfo. */
+  CBS cbs;
+  CBS_init(&cbs, *inp, (size_t)len);
+  EVP_PKEY *ret = EVP_parse_private_key(&cbs);
+  if (ret != NULL) {
+    if (out != NULL) {
+      EVP_PKEY_free(*out);
+      *out = ret;
+    }
+    *inp = CBS_data(&cbs);
+    return ret;
+  }
+  ERR_clear_error();
+
+  /* Count the elements to determine the legacy key format. */
+  switch (num_elements(*inp, (size_t)len)) {
+    case 4:
+      return d2i_PrivateKey(EVP_PKEY_EC, out, inp, len);
+
+    case 6:
+      return d2i_PrivateKey(EVP_PKEY_DSA, out, inp, len);
+
+    default:
+      return d2i_PrivateKey(EVP_PKEY_RSA, out, inp, len);
+  }
 }

 int i2d_PublicKey(EVP_PKEY *key, uint8_t **outp) {
@@ -56,12 +56,10 @@

 #include <openssl/evp.h>

-#include <stdio.h>
 #include <string.h>

 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>

 #include "internal.h"

@@ -98,8 +96,7 @@ static EVP_PKEY_CTX *evp_pkey_ctx_new(EVP_PKEY *pkey, ENGINE *e, int id) {

  if (pmeth == NULL) {
    OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM);
-    const char *name = OBJ_nid2sn(id);
-    ERR_add_error_dataf("algorithm %d (%s)", id, name);
+    ERR_add_error_dataf("algorithm %d", id);
    return NULL;
  }

@@ -26,7 +26,6 @@
 #include <openssl/err.h>
 #include <openssl/evp.h>
 #include <openssl/rsa.h>
-#include <openssl/x509.h>

 #include "../test/scoped_types.h"

@@ -178,142 +177,6 @@ static const uint8_t kSignature[] = {
    0x55, 0xa7, 0xab, 0x45, 0x02, 0x97, 0x60, 0x42,
 };

-// kExamplePSSCert is an example self-signed certificate, signed with
-// kExampleRSAKeyDER using RSA-PSS with default hash functions.
-static const uint8_t kExamplePSSCert[] = {
-    0x30, 0x82, 0x02, 0x62, 0x30, 0x82, 0x01, 0xc6, 0xa0, 0x03, 0x02, 0x01,
-    0x02, 0x02, 0x09, 0x00, 0x8d, 0xea, 0x53, 0x24, 0xfa, 0x48, 0x87, 0xf3,
-    0x30, 0x12, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01,
-    0x0a, 0x30, 0x05, 0xa2, 0x03, 0x02, 0x01, 0x6a, 0x30, 0x45, 0x31, 0x0b,
-    0x30, 0x09, 0x06, 0x03, 0x55, 0x04, 0x06, 0x13, 0x02, 0x41, 0x55, 0x31,
-    0x13, 0x30, 0x11, 0x06, 0x03, 0x55, 0x04, 0x08, 0x0c, 0x0a, 0x53, 0x6f,
-    0x6d, 0x65, 0x2d, 0x53, 0x74, 0x61, 0x74, 0x65, 0x31, 0x21, 0x30, 0x1f,
-    0x06, 0x03, 0x55, 0x04, 0x0a, 0x0c, 0x18, 0x49, 0x6e, 0x74, 0x65, 0x72,
-    0x6e, 0x65, 0x74, 0x20, 0x57, 0x69, 0x64, 0x67, 0x69, 0x74, 0x73, 0x20,
-    0x50, 0x74, 0x79, 0x20, 0x4c, 0x74, 0x64, 0x30, 0x1e, 0x17, 0x0d, 0x31,
-    0x34, 0x31, 0x30, 0x30, 0x39, 0x31, 0x39, 0x30, 0x39, 0x35, 0x35, 0x5a,
-    0x17, 0x0d, 0x31, 0x35, 0x31, 0x30, 0x30, 0x39, 0x31, 0x39, 0x30, 0x39,
-    0x35, 0x35, 0x5a, 0x30, 0x45, 0x31, 0x0b, 0x30, 0x09, 0x06, 0x03, 0x55,
-    0x04, 0x06, 0x13, 0x02, 0x41, 0x55, 0x31, 0x13, 0x30, 0x11, 0x06, 0x03,
-    0x55, 0x04, 0x08, 0x0c, 0x0a, 0x53, 0x6f, 0x6d, 0x65, 0x2d, 0x53, 0x74,
-    0x61, 0x74, 0x65, 0x31, 0x21, 0x30, 0x1f, 0x06, 0x03, 0x55, 0x04, 0x0a,
-    0x0c, 0x18, 0x49, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x20, 0x57,
-    0x69, 0x64, 0x67, 0x69, 0x74, 0x73, 0x20, 0x50, 0x74, 0x79, 0x20, 0x4c,
-    0x74, 0x64, 0x30, 0x81, 0x9f, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48,
-    0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01, 0x05, 0x00, 0x03, 0x81, 0x8d, 0x00,
-    0x30, 0x81, 0x89, 0x02, 0x81, 0x81, 0x00, 0xf8, 0xb8, 0x6c, 0x83, 0xb4,
-    0xbc, 0xd9, 0xa8, 0x57, 0xc0, 0xa5, 0xb4, 0x59, 0x76, 0x8c, 0x54, 0x1d,
-    0x79, 0xeb, 0x22, 0x52, 0x04, 0x7e, 0xd3, 0x37, 0xeb, 0x41, 0xfd, 0x83,
-    0xf9, 0xf0, 0xa6, 0x85, 0x15, 0x34, 0x75, 0x71, 0x5a, 0x84, 0xa8, 0x3c,
-    0xd2, 0xef, 0x5a, 0x4e, 0xd3, 0xde, 0x97, 0x8a, 0xdd, 0xff, 0xbb, 0xcf,
-    0x0a, 0xaa, 0x86, 0x92, 0xbe, 0xb8, 0x50, 0xe4, 0xcd, 0x6f, 0x80, 0x33,
-    0x30, 0x76, 0x13, 0x8f, 0xca, 0x7b, 0xdc, 0xec, 0x5a, 0xca, 0x63, 0xc7,
-    0x03, 0x25, 0xef, 0xa8, 0x8a, 0x83, 0x58, 0x76, 0x20, 0xfa, 0x16, 0x77,
-    0xd7, 0x79, 0x92, 0x63, 0x01, 0x48, 0x1a, 0xd8, 0x7b, 0x67, 0xf1, 0x52,
-    0x55, 0x49, 0x4e, 0xd6, 0x6e, 0x4a, 0x5c, 0xd7, 0x7a, 0x37, 0x36, 0x0c,
-    0xde, 0xdd, 0x8f, 0x44, 0xe8, 0xc2, 0xa7, 0x2c, 0x2b, 0xb5, 0xaf, 0x64,
-    0x4b, 0x61, 0x07, 0x02, 0x03, 0x01, 0x00, 0x01, 0xa3, 0x50, 0x30, 0x4e,
-    0x30, 0x1d, 0x06, 0x03, 0x55, 0x1d, 0x0e, 0x04, 0x16, 0x04, 0x14, 0xd0,
-    0x41, 0xfb, 0x89, 0x41, 0x1e, 0xa7, 0xad, 0x5a, 0xec, 0x34, 0x5d, 0x49,
-    0x11, 0xf9, 0x55, 0x81, 0x78, 0x1f, 0x13, 0x30, 0x1f, 0x06, 0x03, 0x55,
-    0x1d, 0x23, 0x04, 0x18, 0x30, 0x16, 0x80, 0x14, 0xd0, 0x41, 0xfb, 0x89,
-    0x41, 0x1e, 0xa7, 0xad, 0x5a, 0xec, 0x34, 0x5d, 0x49, 0x11, 0xf9, 0x55,
-    0x81, 0x78, 0x1f, 0x13, 0x30, 0x0c, 0x06, 0x03, 0x55, 0x1d, 0x13, 0x04,
-    0x05, 0x30, 0x03, 0x01, 0x01, 0xff, 0x30, 0x12, 0x06, 0x09, 0x2a, 0x86,
-    0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0a, 0x30, 0x05, 0xa2, 0x03, 0x02,
-    0x01, 0x6a, 0x03, 0x81, 0x81, 0x00, 0x49, 0x4c, 0xb6, 0x45, 0x97, 0x20,
-    0x35, 0xb3, 0x50, 0x64, 0x0d, 0x3f, 0xec, 0x5f, 0x95, 0xd5, 0x84, 0xcb,
-    0x11, 0x7c, 0x03, 0xd7, 0xa6, 0xe6, 0xfa, 0x24, 0x95, 0x9f, 0x31, 0xb0,
-    0xb5, 0xec, 0x66, 0x41, 0x51, 0x18, 0x21, 0x91, 0xbb, 0xe0, 0xaf, 0xf0,
-    0xc5, 0xb7, 0x59, 0x41, 0xd4, 0xdb, 0xa4, 0xd2, 0x64, 0xa7, 0x54, 0x0f,
-    0x8c, 0xf7, 0xe1, 0xd3, 0x3b, 0x1a, 0xb7, 0x0e, 0x9d, 0x9a, 0xde, 0x50,
-    0xa1, 0x9f, 0x0a, 0xf0, 0xda, 0x34, 0x0e, 0x34, 0x7d, 0x76, 0x07, 0xfe,
-    0x5a, 0xfb, 0xf9, 0x58, 0x9b, 0xc9, 0x50, 0x84, 0x01, 0xa0, 0x05, 0x4d,
-    0x67, 0x42, 0x0b, 0xf8, 0xe4, 0x05, 0xcf, 0xaf, 0x8b, 0x71, 0x31, 0xf1,
-    0x0f, 0x6e, 0xc9, 0x24, 0x27, 0x9b, 0xac, 0x04, 0xd7, 0x64, 0x0d, 0x30,
-    0x4e, 0x11, 0x93, 0x40, 0x39, 0xbb, 0x72, 0xb2, 0xfe, 0x6b, 0xe4, 0xae,
-    0x8c, 0x16,
-};
-
-// kBadPSSCert is an example RSA-PSS certificate with bad parameters.
-static const uint8_t kBadPSSCert[] = {
-    0x30, 0x82, 0x03, 0x76, 0x30, 0x82, 0x02, 0x3a, 0xa0, 0x03, 0x02, 0x01,
-    0x02, 0x02, 0x09, 0x00, 0xd7, 0x30, 0x64, 0xbc, 0x9f, 0x12, 0xfe, 0xc3,
-    0x30, 0x3e, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01,
-    0x0a, 0x30, 0x31, 0xa0, 0x0d, 0x30, 0x0b, 0x06, 0x09, 0x60, 0x86, 0x48,
-    0x01, 0x65, 0x03, 0x04, 0x02, 0x01, 0xa1, 0x1a, 0x30, 0x18, 0x06, 0x09,
-    0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x08, 0x30, 0x0b, 0x06,
-    0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01, 0xa2, 0x04,
-    0x02, 0x02, 0x00, 0xde, 0x30, 0x27, 0x31, 0x25, 0x30, 0x23, 0x06, 0x03,
-    0x55, 0x04, 0x03, 0x0c, 0x1c, 0x54, 0x65, 0x73, 0x74, 0x20, 0x49, 0x6e,
-    0x76, 0x61, 0x6c, 0x69, 0x64, 0x20, 0x50, 0x53, 0x53, 0x20, 0x63, 0x65,
-    0x72, 0x74, 0x69, 0x66, 0x69, 0x63, 0x61, 0x74, 0x65, 0x30, 0x1e, 0x17,
-    0x0d, 0x31, 0x35, 0x31, 0x31, 0x30, 0x34, 0x31, 0x36, 0x30, 0x32, 0x33,
-    0x35, 0x5a, 0x17, 0x0d, 0x31, 0x35, 0x31, 0x32, 0x30, 0x34, 0x31, 0x36,
-    0x30, 0x32, 0x33, 0x35, 0x5a, 0x30, 0x27, 0x31, 0x25, 0x30, 0x23, 0x06,
-    0x03, 0x55, 0x04, 0x03, 0x0c, 0x1c, 0x54, 0x65, 0x73, 0x74, 0x20, 0x49,
-    0x6e, 0x76, 0x61, 0x6c, 0x69, 0x64, 0x20, 0x50, 0x53, 0x53, 0x20, 0x63,
-    0x65, 0x72, 0x74, 0x69, 0x66, 0x69, 0x63, 0x61, 0x74, 0x65, 0x30, 0x82,
-    0x01, 0x22, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d,
-    0x01, 0x01, 0x01, 0x05, 0x00, 0x03, 0x82, 0x01, 0x0f, 0x00, 0x30, 0x82,
-    0x01, 0x0a, 0x02, 0x82, 0x01, 0x01, 0x00, 0xc4, 0xda, 0x33, 0xb5, 0x87,
-    0xa9, 0x50, 0x80, 0x18, 0x02, 0x00, 0xfb, 0x32, 0xf5, 0x29, 0x6b, 0xef,
-    0x01, 0x24, 0xeb, 0x86, 0x5a, 0xbe, 0xd5, 0xe3, 0xdd, 0x3b, 0xbc, 0x2c,
-    0xad, 0x65, 0xf6, 0x2a, 0x26, 0x28, 0x4d, 0x8a, 0xc9, 0x61, 0x39, 0xf1,
-    0x84, 0xb9, 0xe7, 0xd3, 0x0a, 0xc7, 0xa8, 0x0a, 0x6d, 0xef, 0xd9, 0xcb,
-    0x20, 0x11, 0xbb, 0x71, 0xf4, 0xa1, 0xc9, 0x9a, 0x85, 0x1c, 0xe6, 0x3f,
-    0x23, 0x39, 0x58, 0x3c, 0xc5, 0x6d, 0xfa, 0x03, 0xe8, 0xdb, 0xdd, 0xe0,
-    0xc3, 0xde, 0x85, 0x76, 0xce, 0x49, 0x06, 0xc8, 0xe1, 0x8e, 0x4c, 0x86,
-    0x9c, 0xec, 0xab, 0xf4, 0xe5, 0x27, 0xb4, 0x5a, 0xaf, 0xc4, 0x36, 0xd3,
-    0x20, 0x81, 0x54, 0xee, 0x8f, 0x48, 0x77, 0x10, 0xf8, 0x79, 0xd6, 0xaa,
-    0x8d, 0x1b, 0xfe, 0x7d, 0xe8, 0x15, 0x13, 0xe0, 0x7b, 0xf6, 0x90, 0xe4,
-    0xe2, 0xcd, 0x2e, 0x8e, 0xc9, 0x3a, 0x75, 0x42, 0xed, 0x0a, 0x0f, 0x51,
-    0xb2, 0xdd, 0x2e, 0x70, 0x61, 0x68, 0xd7, 0xd9, 0xab, 0xf9, 0xbe, 0xe4,
-    0x75, 0xb7, 0xe7, 0xf2, 0x96, 0x7b, 0xd9, 0x93, 0x43, 0x24, 0xfb, 0x9e,
-    0x55, 0xda, 0xd4, 0x01, 0x6c, 0x3d, 0xa2, 0x59, 0x7a, 0xd5, 0x47, 0x18,
-    0x7e, 0x4e, 0xf9, 0x5d, 0xda, 0xcb, 0x93, 0xa2, 0x65, 0x2f, 0x8d, 0x46,
-    0xad, 0x81, 0xdc, 0xf0, 0xa9, 0x5f, 0x5d, 0xfe, 0x37, 0x80, 0x64, 0x2a,
-    0x41, 0xfa, 0xe9, 0x1e, 0x48, 0x38, 0x22, 0x1d, 0x9c, 0x23, 0xa5, 0xad,
-    0xda, 0x78, 0x45, 0x18, 0x0c, 0xeb, 0x95, 0xca, 0x2b, 0xcc, 0xb9, 0x62,
-    0x40, 0x85, 0x09, 0x44, 0x88, 0x4c, 0xf2, 0x1e, 0x08, 0x80, 0x37, 0xe9,
-    0x06, 0x96, 0x8f, 0x75, 0x54, 0x0b, 0xa9, 0x2d, 0xa9, 0x15, 0xb5, 0xda,
-    0xe5, 0xe4, 0x23, 0xaa, 0x2c, 0x89, 0xc1, 0xa9, 0x36, 0xbc, 0x9f, 0x02,
-    0x03, 0x01, 0x00, 0x01, 0xa3, 0x50, 0x30, 0x4e, 0x30, 0x1d, 0x06, 0x03,
-    0x55, 0x1d, 0x0e, 0x04, 0x16, 0x04, 0x14, 0x2b, 0x75, 0xf3, 0x43, 0x78,
-    0xa0, 0x65, 0x2d, 0xe4, 0xb6, 0xf3, 0x07, 0x04, 0x38, 0x21, 0xaf, 0xb6,
-    0xe1, 0x5f, 0x7b, 0x30, 0x1f, 0x06, 0x03, 0x55, 0x1d, 0x23, 0x04, 0x18,
-    0x30, 0x16, 0x80, 0x14, 0x2b, 0x75, 0xf3, 0x43, 0x78, 0xa0, 0x65, 0x2d,
-    0xe4, 0xb6, 0xf3, 0x07, 0x04, 0x38, 0x21, 0xaf, 0xb6, 0xe1, 0x5f, 0x7b,
-    0x30, 0x0c, 0x06, 0x03, 0x55, 0x1d, 0x13, 0x04, 0x05, 0x30, 0x03, 0x01,
-    0x01, 0xff, 0x30, 0x31, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d,
-    0x01, 0x01, 0x0a, 0x30, 0x24, 0xa0, 0x0d, 0x30, 0x0b, 0x06, 0x09, 0x60,
-    0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01, 0xa1, 0x0d, 0x30, 0x0b,
-    0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x08, 0xa2,
-    0x04, 0x02, 0x02, 0x00, 0xde, 0x03, 0x82, 0x01, 0x01, 0x00, 0x08, 0xc1,
-    0xb6, 0x6f, 0x74, 0x94, 0x6c, 0x60, 0x75, 0xd8, 0xdc, 0xe1, 0x7b, 0xbf,
-    0x9d, 0xb5, 0xd7, 0x14, 0x75, 0x6c, 0xdb, 0x35, 0x5c, 0x1e, 0xff, 0xe6,
-    0xa8, 0xe6, 0x68, 0x42, 0x41, 0x81, 0xf6, 0xbf, 0xc1, 0x56, 0x02, 0xdb,
-    0xc6, 0x11, 0xeb, 0x15, 0x9d, 0xa9, 0x1c, 0x61, 0x25, 0x6d, 0x46, 0x0f,
-    0x7e, 0x27, 0xdd, 0x4b, 0xdc, 0xed, 0x07, 0xbd, 0xde, 0xd5, 0xde, 0x09,
-    0xf8, 0xfd, 0xbd, 0xa3, 0x4c, 0x81, 0xa9, 0xf7, 0x78, 0xff, 0x01, 0x80,
-    0x73, 0xf2, 0x40, 0xf2, 0xa8, 0x27, 0xe8, 0x00, 0x04, 0x3b, 0xf5, 0xe7,
-    0xa6, 0x58, 0x45, 0x79, 0x34, 0x49, 0x42, 0xd2, 0xd9, 0x56, 0x5e, 0xf9,
-    0x0a, 0x41, 0xd7, 0x81, 0x41, 0x94, 0x77, 0x78, 0x7e, 0x00, 0x3b, 0xca,
-    0xb5, 0xc0, 0x6e, 0x5b, 0xd7, 0x52, 0x52, 0x77, 0x1a, 0x52, 0xb8, 0x0d,
-    0x29, 0x1f, 0x2e, 0xfe, 0x1f, 0xf6, 0xb0, 0xc1, 0xb7, 0xf1, 0x15, 0x98,
-    0x0f, 0x30, 0x5d, 0x74, 0x2f, 0xfa, 0xe9, 0x84, 0xda, 0xde, 0xbe, 0xca,
-    0x91, 0x55, 0x1f, 0x5b, 0xbc, 0xaa, 0x45, 0x07, 0xc4, 0x2e, 0x21, 0x8a,
-    0x75, 0xc9, 0xbe, 0x6e, 0x39, 0x53, 0x10, 0xcb, 0x2f, 0x4b, 0xe1, 0x21,
-    0x1e, 0xea, 0x7d, 0x0b, 0x36, 0xe9, 0xa0, 0x2c, 0x76, 0x17, 0x1f, 0x69,
-    0x34, 0xfb, 0x45, 0x63, 0x7c, 0x84, 0x39, 0xb4, 0x21, 0x98, 0xbd, 0x49,
-    0xca, 0x80, 0x91, 0x5a, 0xa0, 0x44, 0xef, 0x91, 0xb3, 0x14, 0xf6, 0xd1,
-    0x6a, 0x2b, 0xb1, 0xe5, 0x4a, 0x44, 0x92, 0x7b, 0x3e, 0x8b, 0x7b, 0x6b,
-    0x90, 0x6b, 0x2c, 0x67, 0x3b, 0x0e, 0xb9, 0x5a, 0x87, 0x35, 0x33, 0x59,
-    0x94, 0x2f, 0x7e, 0xf6, 0x13, 0xc7, 0x22, 0x87, 0x3d, 0x50, 0xc9, 0x80,
-    0x40, 0xda, 0x35, 0xbc, 0x62, 0x16, 0xdc, 0xd5, 0x95, 0xa1, 0xe1, 0x9b,
-    0x68, 0x9f,
-};
-
 // kExampleRSAKeyPKCS8 is kExampleRSAKeyDER encoded in a PKCS #8
 // PrivateKeyInfo.
 static const uint8_t kExampleRSAKeyPKCS8[] = {
@@ -558,162 +421,6 @@ static bool TestEVP_DigestVerifyInit(void) {
  return true;
 }

-// TestAlgorithmRoundtrip signs a message using an already-initialized
-// |md_ctx|, sampling the AlgorithmIdentifier. It then uses |pkey| and the
-// AlgorithmIdentifier to verify the signature.
-static bool TestAlgorithmRoundtrip(EVP_MD_CTX *md_ctx, EVP_PKEY *pkey) {
-  if (!EVP_DigestSignUpdate(md_ctx, kMsg, sizeof(kMsg))) {
-    return false;
-  }
-
-  // Save the algorithm.
-  ScopedX509_ALGOR algor(X509_ALGOR_new());
-  if (!algor || !EVP_DigestSignAlgorithm(md_ctx, algor.get())) {
-    return false;
-  }
-
-  // Determine the size of the signature.
-  size_t sig_len = 0;
-  if (!EVP_DigestSignFinal(md_ctx, NULL, &sig_len)) {
-    return false;
-  }
-  // Sanity check for testing.
-  if (sig_len != (size_t)EVP_PKEY_size(pkey)) {
-    fprintf(stderr, "sig_len mismatch\n");
-    return false;
-  }
-
-  std::vector<uint8_t> sig;
-  sig.resize(sig_len);
-  if (!EVP_DigestSignFinal(md_ctx, sig.data(), &sig_len)) {
-    return false;
-  }
-  sig.resize(sig_len);
-
-  // Ensure that the signature round-trips.
-  ScopedEVP_MD_CTX md_ctx_verify;
-  if (!EVP_DigestVerifyInitFromAlgorithm(md_ctx_verify.get(), algor.get(),
-                                         pkey) ||
-      !EVP_DigestVerifyUpdate(md_ctx_verify.get(), kMsg, sizeof(kMsg)) ||
-      !EVP_DigestVerifyFinal(md_ctx_verify.get(), sig.data(), sig_len)) {
-    return false;
-  }
-
-  return true;
-}
-
-static bool TestEVP_DigestSignAlgorithm(void) {
-  ScopedEVP_PKEY pkey = LoadExampleRSAKey();
-
-  // Test a simple AlgorithmIdentifier.
-  ScopedEVP_MD_CTX md_ctx;
-  if (!pkey ||
-      !EVP_DigestSignInit(md_ctx.get(), NULL, EVP_sha256(), NULL, pkey.get()) ||
-      !TestAlgorithmRoundtrip(md_ctx.get(), pkey.get())) {
-    fprintf(stderr, "RSA with SHA-256 failed\n");
-    return false;
-  }
-
-  // Test RSA-PSS with custom parameters.
-  md_ctx.Reset();
-  EVP_PKEY_CTX *pkey_ctx;
-  if (!EVP_DigestSignInit(md_ctx.get(), &pkey_ctx, EVP_sha256(), NULL,
-                          pkey.get()) ||
-      !EVP_PKEY_CTX_set_rsa_padding(pkey_ctx, RSA_PKCS1_PSS_PADDING) ||
-      !EVP_PKEY_CTX_set_rsa_mgf1_md(pkey_ctx, EVP_sha512()) ||
-      !TestAlgorithmRoundtrip(md_ctx.get(), pkey.get())) {
-    fprintf(stderr, "RSA-PSS failed\n");
-    return false;
-  }
-
-  return true;
-}
-
-static bool ParseCertificate(CBS *out_tbs_cert,
-                             ScopedEVP_PKEY *out_pubkey,
-                             ScopedX509_ALGOR *out_algor,
-                             CBS *out_signature,
-                             const CBS *in_) {
-  CBS in = *in_;
-  CBS cert_body, tbs_cert, algorithm, signature;
-  if (!CBS_get_asn1(&in, &cert_body, CBS_ASN1_SEQUENCE) ||
-      CBS_len(&in) != 0 ||
-      !CBS_get_any_asn1_element(&cert_body, &tbs_cert, NULL, NULL) ||
-      !CBS_get_asn1_element(&cert_body, &algorithm, CBS_ASN1_SEQUENCE) ||
-      !CBS_get_asn1(&cert_body, &signature, CBS_ASN1_BITSTRING) ||
-      CBS_len(&cert_body) != 0) {
-    return false;
-  }
-
-  CBS tbs_cert_copy = tbs_cert;
-  CBS tbs_cert_body, discard, spki;
-  if (!CBS_get_asn1(&tbs_cert_copy, &tbs_cert_body, CBS_ASN1_SEQUENCE) ||
-      CBS_len(&tbs_cert_copy) != 0 ||
-      !CBS_get_optional_asn1(
-          &tbs_cert_body, &discard, NULL,
-          CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) ||
-      !CBS_get_asn1(&tbs_cert_body, &discard /* serialNumber */,
-                    CBS_ASN1_INTEGER) ||
-      !CBS_get_asn1(&tbs_cert_body, &discard /* signature */,
-                    CBS_ASN1_SEQUENCE) ||
-      !CBS_get_any_asn1_element(&tbs_cert_body, &discard /* issuer */,
-                                NULL, NULL) ||
-      !CBS_get_asn1(&tbs_cert_body, &discard /* validity */,
-                    CBS_ASN1_SEQUENCE) ||
-      !CBS_get_any_asn1_element(&tbs_cert_body, &discard /* subject */,
-                                NULL, NULL) ||
-      !CBS_get_asn1_element(&tbs_cert_body, &spki, CBS_ASN1_SEQUENCE)) {
-    return false;
-  }
-
-  const uint8_t *derp = CBS_data(&spki);
-  ScopedEVP_PKEY pubkey(d2i_PUBKEY(NULL, &derp, CBS_len(&spki)));
-  if (!pubkey || derp != CBS_data(&spki) + CBS_len(&spki)) {
-    return false;
-  }
-
-  derp = CBS_data(&algorithm);
-  ScopedX509_ALGOR algor(d2i_X509_ALGOR(NULL, &derp, CBS_len(&algorithm)));
-  if (!algor || derp != CBS_data(&algorithm) + CBS_len(&algorithm)) {
-    return false;
-  }
-
-  // Signatures are BIT STRINGs, but they have are multiple of 8 bytes, so the
-  // leading phase byte is just a zero.
-  uint8_t padding;
-  if (!CBS_get_u8(&signature, &padding) || padding != 0) {
-    return false;
-  }
-
-  *out_tbs_cert = tbs_cert;
-  *out_pubkey = std::move(pubkey);
-  *out_algor = std::move(algor);
-  *out_signature = signature;
-  return true;
-}
-
-static bool TestEVP_DigestVerifyInitFromAlgorithm(void) {
-  CBS in, tbs_cert, signature;
-  ScopedEVP_PKEY pkey;
-  ScopedX509_ALGOR algor;
-  CBS_init(&in, kExamplePSSCert, sizeof(kExamplePSSCert));
-  if (!ParseCertificate(&tbs_cert, &pkey, &algor, &signature, &in)) {
-    fprintf(stderr, "Failed to parse certificate\n");
-    return false;
-  }
-
-  ScopedEVP_MD_CTX md_ctx;
-  if (!EVP_DigestVerifyInitFromAlgorithm(md_ctx.get(), algor.get(),
-                                         pkey.get()) ||
-      !EVP_DigestVerifyUpdate(md_ctx.get(), CBS_data(&tbs_cert),
-                              CBS_len(&tbs_cert)) ||
-      !EVP_DigestVerifyFinal(md_ctx.get(), CBS_data(&signature),
-                             CBS_len(&signature))) {
-    return false;
-  }
-  return true;
-}
-
 static bool TestVerifyRecover() {
  ScopedEVP_PKEY pkey = LoadExampleRSAKey();
  if (!pkey) {
@@ -792,26 +499,6 @@ static bool TestVerifyRecover() {
  return true;
 }

-static bool TestBadPSSParameters(void) {
-  CBS in, tbs_cert, signature;
-  ScopedEVP_PKEY pkey;
-  ScopedX509_ALGOR algor;
-  CBS_init(&in, kBadPSSCert, sizeof(kBadPSSCert));
-  if (!ParseCertificate(&tbs_cert, &pkey, &algor, &signature, &in)) {
-    fprintf(stderr, "Failed to parse certificate\n");
-    return false;
-  }
-
-  ScopedEVP_MD_CTX md_ctx;
-  if (EVP_DigestVerifyInitFromAlgorithm(md_ctx.get(), algor.get(),
-                                        pkey.get())) {
-    fprintf(stderr, "Unexpectedly processed bad signature parameters\n");
-    return false;
-  }
-  ERR_clear_error();
-  return true;
-}
-
 static bool TestValidPrivateKey(const uint8_t *input, size_t input_len,
                                int expected_id) {
  const uint8_t *p = input;
@@ -899,6 +586,25 @@ static bool TestEVP_PKCS82PKEY(void) {
  return true;
 }

+// TestEVPMarshalEmptyPublicKey tests |EVP_marshal_public_key| on an empty key.
+static bool TestEVPMarshalEmptyPublicKey(void) {
+  ScopedEVP_PKEY empty(EVP_PKEY_new());
+  if (!empty) {
+    return false;
+  }
+  ScopedCBB cbb;
+  if (EVP_marshal_public_key(cbb.get(), empty.get())) {
+    fprintf(stderr, "Marshalled empty public key.\n");
+    return false;
+  }
+  if (ERR_GET_REASON(ERR_peek_last_error()) != EVP_R_UNSUPPORTED_ALGORITHM) {
+    fprintf(stderr, "Marshalling an empty public key gave wrong error.\n");
+    return false;
+  }
+  ERR_clear_error();
+  return true;
+}
+
 // Testd2i_PrivateKey tests |d2i_PrivateKey|.
 static bool Testd2i_PrivateKey(void) {
  const uint8_t *derp = kExampleRSAKeyDER;
@@ -953,6 +659,15 @@ static bool Testd2i_PrivateKey(void) {
  }
  ERR_clear_error();

+  derp = kExampleRSAKeyPKCS8;
+  pkey.reset(d2i_PrivateKey(EVP_PKEY_EC, nullptr, &derp,
+             sizeof(kExampleRSAKeyPKCS8)));
+  if (pkey) {
+    fprintf(stderr, "Imported RSA key as EC key.\n");
+    return false;
+  }
+  ERR_clear_error();
+
  return true;
 }

@@ -971,30 +686,12 @@ int main(void) {
    return 1;
  }

-  if (!TestEVP_DigestSignAlgorithm()) {
-    fprintf(stderr, "EVP_DigestSignInit failed\n");
-    ERR_print_errors_fp(stderr);
-    return 1;
-  }
-
-  if (!TestEVP_DigestVerifyInitFromAlgorithm()) {
-    fprintf(stderr, "EVP_DigestVerifyInitFromAlgorithm failed\n");
-    ERR_print_errors_fp(stderr);
-    return 1;
-  }
-
  if (!TestVerifyRecover()) {
    fprintf(stderr, "EVP_PKEY_verify_recover failed\n");
    ERR_print_errors_fp(stderr);
    return 1;
  }

-  if (!TestBadPSSParameters()) {
-    fprintf(stderr, "TestBadPSSParameters failed\n");
-    ERR_print_errors_fp(stderr);
-    return 1;
-  }
-
  if (!Testd2i_AutoPrivateKey()) {
    fprintf(stderr, "Testd2i_AutoPrivateKey failed\n");
    ERR_print_errors_fp(stderr);
@@ -1007,6 +704,12 @@ int main(void) {
    return 1;
  }

+  if (!TestEVPMarshalEmptyPublicKey()) {
+    fprintf(stderr, "TestEVPMarshalEmptyPublicKey failed\n");
+    ERR_print_errors_fp(stderr);
+    return 1;
+  }
+
  if (!Testd2i_PrivateKey()) {
    fprintf(stderr, "Testd2i_PrivateKey failed\n");
    ERR_print_errors_fp(stderr);
@@ -59,36 +59,17 @@

 #include <openssl/base.h>

+#include <openssl/rsa.h>
+
 #if defined(__cplusplus)
 extern "C" {
 #endif


-/* These values are flags for EVP_PKEY_ASN1_METHOD.flags. */
-
-/* ASN1_PKEY_SIGPARAM_NULL controls whether the default behavior of
- * EVP_DigestSignAlgorithm writes an explicit NULL parameter in the
- * AlgorithmIdentifier. */
-#define ASN1_PKEY_SIGPARAM_NULL 0x1
-
-/* evp_digest_sign_algorithm_result_t is the return value of the
- * digest_sign_algorithm function in EVP_PKEY_ASN1_METHOD. */
-typedef enum {
-  /* EVP_DIGEST_SIGN_ALGORITHM_ERROR signals an error. */
-  EVP_DIGEST_SIGN_ALGORITHM_ERROR = 0,
-  /* EVP_DIGEST_SIGN_ALGORITHM_SUCCESS signals that the parameters were
-   * serialized in the AlgorithmIdentifier. */
-  EVP_DIGEST_SIGN_ALGORITHM_SUCCESS = 1,
-  /* EVP_DIGEST_SIGN_ALGORITHM_DEFAULT signals that the parameters are
-   * serialized using the default behavior. */
-  EVP_DIGEST_SIGN_ALGORITHM_DEFAULT = 2,
-} evp_digest_sign_algorithm_result_t;
-
 struct evp_pkey_asn1_method_st {
  int pkey_id;
-  unsigned long pkey_flags;
-
-  const char *pem_str;
+  uint8_t oid[9];
+  uint8_t oid_len;

  /* pub_decode decodes |params| and |key| as a SubjectPublicKeyInfo
   * and writes the result into |out|. It returns one on success and zero on
@@ -104,7 +85,6 @@ struct evp_pkey_asn1_method_st {
  int (*pub_encode)(CBB *out, const EVP_PKEY *key);

  int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
-  int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent, ASN1_PCTX *pctx);

  /* priv_decode decodes |params| and |key| as a PrivateKeyInfo and writes the
   * result into |out|. It returns one on success and zero on error. |params| is
@@ -116,9 +96,6 @@ struct evp_pkey_asn1_method_st {
   * |out|. It returns one on success and zero on error. */
  int (*priv_encode)(CBB *out, const EVP_PKEY *key);

-  int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent,
-                    ASN1_PCTX *pctx);
-
  /* pkey_opaque returns 1 if the |pk| is opaque. Opaque keys are backed by
   * custom implementations which do not expose key material and parameters.*/
  int (*pkey_opaque)(const EVP_PKEY *pk);
@@ -135,27 +112,8 @@ struct evp_pkey_asn1_method_st {
  int (*param_missing)(const EVP_PKEY *pk);
  int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from);
  int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
-  int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
-                     ASN1_PCTX *pctx);
-  int (*sig_print)(BIO *out, const X509_ALGOR *sigalg, const ASN1_STRING *sig,
-                   int indent, ASN1_PCTX *pctx);
-

  void (*pkey_free)(EVP_PKEY *pkey);
-
-  /* Legacy functions for old PEM */
-
-  int (*old_priv_decode)(EVP_PKEY *pkey, const uint8_t **pder,
-                         int derlen);
-
-  /* Converting parameters to/from AlgorithmIdentifier (X509_ALGOR). */
-  int (*digest_verify_init_from_algorithm)(EVP_MD_CTX *ctx,
-                                           X509_ALGOR *algor,
-                                           EVP_PKEY *pkey);
-  evp_digest_sign_algorithm_result_t (*digest_sign_algorithm)(
-      EVP_MD_CTX *ctx,
-      X509_ALGOR *algor);
-
 } /* EVP_PKEY_ASN1_METHOD */;


@@ -55,16 +55,11 @@

 #include <openssl/evp.h>

-#include <openssl/asn1.h>
-#include <openssl/asn1t.h>
 #include <openssl/digest.h>
 #include <openssl/bn.h>
 #include <openssl/bytestring.h>
 #include <openssl/dsa.h>
 #include <openssl/err.h>
-#include <openssl/mem.h>
-#include <openssl/obj.h>
-#include <openssl/x509.h>

 #include "internal.h"

@@ -111,10 +106,11 @@ static int dsa_pub_encode(CBB *out, const EVP_PKEY *key) {
  const int has_params = dsa->p != NULL && dsa->q != NULL && dsa->g != NULL;

  /* See RFC 5480, section 2. */
-  CBB spki, algorithm, key_bitstring;
+  CBB spki, algorithm, oid, key_bitstring;
  if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) ||
      !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) ||
-      !OBJ_nid2cbb(&algorithm, NID_dsa) ||
+      !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) ||
+      !CBB_add_bytes(&oid, dsa_asn1_meth.oid, dsa_asn1_meth.oid_len) ||
      (has_params &&
       !DSA_marshal_parameters(&algorithm, dsa)) ||
      !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) ||
@@ -177,11 +173,12 @@ static int dsa_priv_encode(CBB *out, const EVP_PKEY *key) {
  }

  /* See PKCS#11, v2.40, section 2.5. */
-  CBB pkcs8, algorithm, private_key;
+  CBB pkcs8, algorithm, oid, private_key;
  if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) ||
      !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) ||
      !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) ||
-      !OBJ_nid2cbb(&algorithm, NID_dsa) ||
+      !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) ||
+      !CBB_add_bytes(&oid, dsa_asn1_meth.oid, dsa_asn1_meth.oid_len) ||
      !DSA_marshal_parameters(&algorithm, dsa) ||
      !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) ||
      !BN_marshal_asn1(&private_key, dsa->priv_key) ||
@@ -245,157 +242,17 @@ static int dsa_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b) {

 static void int_dsa_free(EVP_PKEY *pkey) { DSA_free(pkey->pkey.dsa); }

-static void update_buflen(const BIGNUM *b, size_t *pbuflen) {
-  size_t i;
-
-  if (!b) {
-    return;
-  }
-  i = BN_num_bytes(b);
-  if (*pbuflen < i) {
-    *pbuflen = i;
-  }
-}
-
-static int do_dsa_print(BIO *bp, const DSA *x, int off, int ptype) {
-  uint8_t *m = NULL;
-  int ret = 0;
-  size_t buf_len = 0;
-  const char *ktype = NULL;
-
-  const BIGNUM *priv_key, *pub_key;
-
-  priv_key = NULL;
-  if (ptype == 2) {
-    priv_key = x->priv_key;
-  }
-
-  pub_key = NULL;
-  if (ptype > 0) {
-    pub_key = x->pub_key;
-  }
-
-  ktype = "DSA-Parameters";
-  if (ptype == 2) {
-    ktype = "Private-Key";
-  } else if (ptype == 1) {
-    ktype = "Public-Key";
-  }
-
-  update_buflen(x->p, &buf_len);
-  update_buflen(x->q, &buf_len);
-  update_buflen(x->g, &buf_len);
-  update_buflen(priv_key, &buf_len);
-  update_buflen(pub_key, &buf_len);
-
-  m = OPENSSL_malloc(buf_len + 10);
-  if (m == NULL) {
-    OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE);
-    goto err;
-  }
-
-  if (priv_key) {
-    if (!BIO_indent(bp, off, 128) ||
-        BIO_printf(bp, "%s: (%d bit)\n", ktype, BN_num_bits(x->p)) <= 0) {
-      goto err;
-    }
-  }
-
-  if (!ASN1_bn_print(bp, "priv:", priv_key, m, off) ||
-      !ASN1_bn_print(bp, "pub: ", pub_key, m, off) ||
-      !ASN1_bn_print(bp, "P:   ", x->p, m, off) ||
-      !ASN1_bn_print(bp, "Q:   ", x->q, m, off) ||
-      !ASN1_bn_print(bp, "G:   ", x->g, m, off)) {
-    goto err;
-  }
-  ret = 1;
-
-err:
-  OPENSSL_free(m);
-  return ret;
-}
-
-static int dsa_param_print(BIO *bp, const EVP_PKEY *pkey, int indent,
-                           ASN1_PCTX *ctx) {
-  return do_dsa_print(bp, pkey->pkey.dsa, indent, 0);
-}
-
-static int dsa_pub_print(BIO *bp, const EVP_PKEY *pkey, int indent,
-                         ASN1_PCTX *ctx) {
-  return do_dsa_print(bp, pkey->pkey.dsa, indent, 1);
-}
-
-static int dsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent,
-                          ASN1_PCTX *ctx) {
-  return do_dsa_print(bp, pkey->pkey.dsa, indent, 2);
-}
-
-static int old_dsa_priv_decode(EVP_PKEY *pkey, const uint8_t **pder,
-                               int derlen) {
-  DSA *dsa;
-  dsa = d2i_DSAPrivateKey(NULL, pder, derlen);
-  if (dsa == NULL) {
-    OPENSSL_PUT_ERROR(EVP, ERR_R_DSA_LIB);
-    return 0;
-  }
-  EVP_PKEY_assign_DSA(pkey, dsa);
-  return 1;
-}
-
-static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
-                         const ASN1_STRING *sig, int indent, ASN1_PCTX *pctx) {
-  DSA_SIG *dsa_sig;
-  const uint8_t *p;
-
-  if (!sig) {
-    return BIO_puts(bp, "\n") > 0;
-  }
-
-  p = sig->data;
-  dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length);
-  if (dsa_sig == NULL) {
-    return X509_signature_dump(bp, sig, indent);
-  }
-
-  int rv = 0;
-  size_t buf_len = 0;
-  uint8_t *m = NULL;
-
-  update_buflen(dsa_sig->r, &buf_len);
-  update_buflen(dsa_sig->s, &buf_len);
-  m = OPENSSL_malloc(buf_len + 10);
-  if (m == NULL) {
-    OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE);
-    goto err;
-  }
-
-  if (BIO_write(bp, "\n", 1) != 1 ||
-      !ASN1_bn_print(bp, "r:   ", dsa_sig->r, m, indent) ||
-      !ASN1_bn_print(bp, "s:   ", dsa_sig->s, m, indent)) {
-    goto err;
-  }
-  rv = 1;
-
-err:
-  OPENSSL_free(m);
-  DSA_SIG_free(dsa_sig);
-  return rv;
-}
-
 const EVP_PKEY_ASN1_METHOD dsa_asn1_meth = {
  EVP_PKEY_DSA,
-  0,
-
-  "DSA",
+  /* 1.2.840.10040.4.1 */
+  {0x2a, 0x86, 0x48, 0xce, 0x38, 0x04, 0x01}, 7,

  dsa_pub_decode,
  dsa_pub_encode,
  dsa_pub_cmp,
-  dsa_pub_print,

  dsa_priv_decode,
  dsa_priv_encode,
-  dsa_priv_print,

  NULL /* pkey_opaque */,
  NULL /* pkey_supports_digest */,
@@ -406,12 +263,6 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meth = {
  dsa_missing_parameters,
  dsa_copy_parameters,
  dsa_cmp_parameters,
-  dsa_param_print,
-  dsa_sig_print,

  int_dsa_free,
-  old_dsa_priv_decode,
-
-  NULL  /* digest_verify_init_from_algorithm */,
-  NULL  /* digest_sign_algorithm */,
 };
@@ -66,7 +66,7 @@
 #include <openssl/ecdsa.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>

 #include "internal.h"
 #include "../ec/internal.h"
@@ -55,16 +55,12 @@

 #include <openssl/evp.h>

-#include <openssl/asn1t.h>
 #include <openssl/bn.h>
 #include <openssl/bytestring.h>
 #include <openssl/ec.h>
 #include <openssl/ec_key.h>
 #include <openssl/ecdsa.h>
 #include <openssl/err.h>
-#include <openssl/mem.h>
-#include <openssl/obj.h>
-#include <openssl/x509.h>

 #include "internal.h"

@@ -72,19 +68,15 @@
 static int eckey_pub_encode(CBB *out, const EVP_PKEY *key) {
  const EC_KEY *ec_key = key->pkey.ec;
  const EC_GROUP *group = EC_KEY_get0_group(ec_key);
-  int curve_nid = EC_GROUP_get_curve_name(group);
-  if (curve_nid == NID_undef) {
-    OPENSSL_PUT_ERROR(EVP, EVP_R_NO_NID_FOR_CURVE);
-    return 0;
-  }
  const EC_POINT *public_key = EC_KEY_get0_public_key(ec_key);

  /* See RFC 5480, section 2. */
-  CBB spki, algorithm, key_bitstring;
+  CBB spki, algorithm, oid, key_bitstring;
  if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) ||
      !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) ||
-      !OBJ_nid2cbb(&algorithm, NID_X9_62_id_ecPublicKey) ||
-      !OBJ_nid2cbb(&algorithm, curve_nid) ||
+      !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) ||
+      !CBB_add_bytes(&oid, ec_asn1_meth.oid, ec_asn1_meth.oid_len) ||
+      !EC_KEY_marshal_curve_name(&algorithm, group) ||
      !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) ||
      !CBB_add_u8(&key_bitstring, 0 /* padding */) ||
      !EC_POINT_point2cbb(&key_bitstring, group, public_key,
@@ -101,31 +93,32 @@ static int eckey_pub_decode(EVP_PKEY *out, CBS *params, CBS *key) {
  /* See RFC 5480, section 2. */

  /* The parameters are a named curve. */
-  CBS named_curve;
-  if (!CBS_get_asn1(params, &named_curve, CBS_ASN1_OBJECT) ||
-      CBS_len(params) != 0) {
+  EC_GROUP *group = EC_KEY_parse_curve_name(params);
+  if (group == NULL || CBS_len(params) != 0) {
    OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR);
    return 0;
  }

-  EC_KEY *eckey = EC_KEY_new_by_curve_name(OBJ_cbs2nid(&named_curve));
-  if (eckey == NULL) {
-    return 0;
+  EC_POINT *point = NULL;
+  EC_KEY *eckey = EC_KEY_new();
+  if (eckey == NULL || !EC_KEY_set_group(eckey, group)) {
+    goto err;
  }

-  EC_POINT *point = EC_POINT_new(EC_KEY_get0_group(eckey));
+  point = EC_POINT_new(group);
  if (point == NULL ||
-      !EC_POINT_oct2point(EC_KEY_get0_group(eckey), point, CBS_data(key),
-                          CBS_len(key), NULL) ||
+      !EC_POINT_oct2point(group, point, CBS_data(key), CBS_len(key), NULL) ||
      !EC_KEY_set_public_key(eckey, point)) {
    goto err;
  }

+  EC_GROUP_free(group);
  EC_POINT_free(point);
  EVP_PKEY_assign_EC_KEY(out, eckey);
  return 1;

 err:
+  EC_GROUP_free(group);
  EC_POINT_free(point);
  EC_KEY_free(eckey);
  return 0;
@@ -169,11 +162,6 @@ static int eckey_priv_decode(EVP_PKEY *out, CBS *params, CBS *key) {

 static int eckey_priv_encode(CBB *out, const EVP_PKEY *key) {
  const EC_KEY *ec_key = key->pkey.ec;
-  int curve_nid = EC_GROUP_get_curve_name(EC_KEY_get0_group(ec_key));
-  if (curve_nid == NID_undef) {
-    OPENSSL_PUT_ERROR(EVP, EVP_R_NO_NID_FOR_CURVE);
-    return 0;
-  }

  /* Omit the redundant copy of the curve name. This contradicts RFC 5915 but
   * aligns with PKCS #11. SEC 1 only says they may be omitted if known by other
@@ -182,12 +170,13 @@ static int eckey_priv_encode(CBB *out, const EVP_PKEY *key) {
  unsigned enc_flags = EC_KEY_get_enc_flags(ec_key) | EC_PKEY_NO_PARAMETERS;

  /* See RFC 5915. */
-  CBB pkcs8, algorithm, private_key;
+  CBB pkcs8, algorithm, oid, private_key;
  if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) ||
      !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) ||
      !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) ||
-      !OBJ_nid2cbb(&algorithm, NID_X9_62_id_ecPublicKey) ||
-      !OBJ_nid2cbb(&algorithm, curve_nid) ||
+      !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) ||
+      !CBB_add_bytes(&oid, ec_asn1_meth.oid, ec_asn1_meth.oid_len) ||
+      !EC_KEY_marshal_curve_name(&algorithm, EC_KEY_get0_group(ec_key)) ||
      !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) ||
      !EC_KEY_marshal_private_key(&private_key, ec_key, enc_flags) ||
      !CBB_flush(out)) {
@@ -237,153 +226,21 @@ static int ec_cmp_parameters(const EVP_PKEY *a, const EVP_PKEY *b) {

 static void int_ec_free(EVP_PKEY *pkey) { EC_KEY_free(pkey->pkey.ec); }

-static int do_EC_KEY_print(BIO *bp, const EC_KEY *x, int off, int ktype) {
-  uint8_t *buffer = NULL;
-  const char *ecstr;
-  size_t buf_len = 0, i;
-  int ret = 0, reason = ERR_R_BIO_LIB;
-  BN_CTX *ctx = NULL;
-  const EC_GROUP *group;
-  const EC_POINT *public_key;
-  const BIGNUM *priv_key;
-  uint8_t *pub_key_bytes = NULL;
-  size_t pub_key_bytes_len = 0;
-
-  if (x == NULL || (group = EC_KEY_get0_group(x)) == NULL) {
-    reason = ERR_R_PASSED_NULL_PARAMETER;
-    goto err;
-  }
-
-  ctx = BN_CTX_new();
-  if (ctx == NULL) {
-    reason = ERR_R_MALLOC_FAILURE;
-    goto err;
-  }
-
-  if (ktype > 0) {
-    public_key = EC_KEY_get0_public_key(x);
-    if (public_key != NULL) {
-      pub_key_bytes_len = EC_POINT_point2oct(
-          group, public_key, EC_KEY_get_conv_form(x), NULL, 0, ctx);
-      if (pub_key_bytes_len == 0) {
-        reason = ERR_R_MALLOC_FAILURE;
-        goto err;
-      }
-      pub_key_bytes = OPENSSL_malloc(pub_key_bytes_len);
-      if (pub_key_bytes == NULL) {
-        reason = ERR_R_MALLOC_FAILURE;
-        goto err;
-      }
-      pub_key_bytes_len =
-          EC_POINT_point2oct(group, public_key, EC_KEY_get_conv_form(x),
-                             pub_key_bytes, pub_key_bytes_len, ctx);
-      if (pub_key_bytes_len == 0) {
-        reason = ERR_R_MALLOC_FAILURE;
-        goto err;
-      }
-      buf_len = pub_key_bytes_len;
-    }
-  }
-
-  if (ktype == 2) {
-    priv_key = EC_KEY_get0_private_key(x);
-    if (priv_key && (i = (size_t)BN_num_bytes(priv_key)) > buf_len) {
-      buf_len = i;
-    }
-  } else {
-    priv_key = NULL;
-  }
-
-  if (ktype > 0) {
-    buf_len += 10;
-    if ((buffer = OPENSSL_malloc(buf_len)) == NULL) {
-      reason = ERR_R_MALLOC_FAILURE;
-      goto err;
-    }
-  }
-  if (ktype == 2) {
-    ecstr = "Private-Key";
-  } else if (ktype == 1) {
-    ecstr = "Public-Key";
-  } else {
-    ecstr = "ECDSA-Parameters";
-  }
-
-  if (!BIO_indent(bp, off, 128)) {
-    goto err;
-  }
-  const BIGNUM *order = EC_GROUP_get0_order(group);
-  if (BIO_printf(bp, "%s: (%d bit)\n", ecstr, BN_num_bits(order)) <= 0) {
-    goto err;
-  }
-
-  if ((priv_key != NULL) &&
-      !ASN1_bn_print(bp, "priv:", priv_key, buffer, off)) {
-    goto err;
-  }
-  if (pub_key_bytes != NULL) {
-    BIO_hexdump(bp, pub_key_bytes, pub_key_bytes_len, off);
-  }
-  /* TODO(fork): implement */
-  /*
-  if (!ECPKParameters_print(bp, group, off))
-    goto err; */
-  ret = 1;
-
-err:
-  if (!ret) {
-    OPENSSL_PUT_ERROR(EVP, reason);
-  }
-  OPENSSL_free(pub_key_bytes);
-  BN_CTX_free(ctx);
-  OPENSSL_free(buffer);
-  return ret;
-}
-
-static int eckey_param_print(BIO *bp, const EVP_PKEY *pkey, int indent,
-                             ASN1_PCTX *ctx) {
-  return do_EC_KEY_print(bp, pkey->pkey.ec, indent, 0);
-}
-
-static int eckey_pub_print(BIO *bp, const EVP_PKEY *pkey, int indent,
-                           ASN1_PCTX *ctx) {
-  return do_EC_KEY_print(bp, pkey->pkey.ec, indent, 1);
-}
-
-
-static int eckey_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent,
-                            ASN1_PCTX *ctx) {
-  return do_EC_KEY_print(bp, pkey->pkey.ec, indent, 2);
-}
-
 static int eckey_opaque(const EVP_PKEY *pkey) {
  return EC_KEY_is_opaque(pkey->pkey.ec);
 }

-static int old_ec_priv_decode(EVP_PKEY *pkey, const uint8_t **pder,
-                              int derlen) {
-  EC_KEY *ec;
-  if (!(ec = d2i_ECPrivateKey(NULL, pder, derlen))) {
-    OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR);
-    return 0;
-  }
-  EVP_PKEY_assign_EC_KEY(pkey, ec);
-  return 1;
-}
-
 const EVP_PKEY_ASN1_METHOD ec_asn1_meth = {
  EVP_PKEY_EC,
-  0,
-  "EC",
+  /* 1.2.840.10045.2.1 */
+  {0x2a, 0x86, 0x48, 0xce, 0x3d, 0x02, 0x01}, 7,

  eckey_pub_decode,
  eckey_pub_encode,
  eckey_pub_cmp,
-  eckey_pub_print,

  eckey_priv_decode,
  eckey_priv_encode,
-  eckey_priv_print,

  eckey_opaque,
  0 /* pkey_supports_digest */,
@@ -394,12 +251,6 @@ const EVP_PKEY_ASN1_METHOD ec_asn1_meth = {
  ec_missing_parameters,
  ec_copy_parameters,
  ec_cmp_parameters,
-  eckey_param_print,
-  0,

  int_ec_free,
-  old_ec_priv_decode,
-
-  NULL /* digest_verify_init_from_algorithm */,
-  NULL /* digest_sign_algorithm */,
 };
@@ -64,7 +64,7 @@
 #include <openssl/digest.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>
 #include <openssl/rsa.h>

 #include "../rsa/internal.h"
--- a/Show More
+++ b/Show More