Squashed 'src/secp256k1/' changes from 50cc6ab..1897b8e

1897b8e Merge pull request #229 efc571c Add simple testcases for signing with rfc6979 extra entropy. 1573a10 Add ability to pass extra entropy to rfc6979 3087bc4 Merge pull request #228 d9b9f11 Merge pull request #218 0065a8f Eliminate multiple-returns from secp256k1.c. 354ffa3 Make secp256k1_ec_pubkey_create reject oversized secrets. 27bc131 Silence some warnings from pedantic static analysis tools, improve compatibility with C++. 3b7ea63 Merge pull request #221 f789c5b Merge pull request #215 4bc273b Merge pull request #222 137a8ec Merge pull request #216 7c3771d Disable overlength-strings warnings. 8956111 use 128-bit hex seed 02efd06 Use RFC6979 for test PRNGs ae55e85 Use faster byteswapping and avoid alignment-increasing casts. 443cd4b Get rid of hex format and some binary conversions 0bada0e Merge #214: Improve signing API documentation & specification 8030d7c Improve signing API documentation & specification 7b2fc1c Merge #213: Removed gotos, which are hard to trace and maintain. 11690d3 Removed gotos, which are hard to trace and maintain. 122a1ec Merge pull request #205 035406d Merge pull request #206 2d4cd53 Merge pull request #161 34b898d Additional comments for the testing PRNG and a seeding fix. 6efd6e7 Some comments explaining some of the constants in the code. ffccfd2 x86_64 assembly optimization for scalar_4x64 67cbdf0 Merge pull request #207 039723d Benchmarks for all internal operations 6cc8425 Include a comment on secp256k1_ecdsa_sign explaining low-s. f88343f Merge pull request #203 d61e899 Add group operation counts 2473f17 Merge pull request #202 b5bbce6 Some readme updates, e.g. removal of the GMP field. f0d851e Merge pull request #201 a0ea884 Merge pull request #200 f735446 Convert the rest of the codebase to C89. bf2e1ac Convert tests to C89. (also fixes a use of bare "inline" in field) fc8285f Merge pull request #199 fff412e Merge pull request #197 4be8d6f Centralize the definition of uint128_t and use it uniformly. 
d9543c9 Switch scalar code to C89. fcc48c4 Remove the non-storage cmov 55422b6 Switch ecmult_gen to use storage types 41f8455 Use group element storage type in EC multiplications e68d720 Add group element storage type ff889f7 Field storage type 7137be8 Merge pull request #196 0768bd5 Get rid of variable-length hex string conversions e84e761 Merge pull request #195 792bcdb Covert several more files to C89. 45cdf44 Merge pull request #193 17db09e Merge pull request #194 402878a fix ifdef/ifndef 25b35c7 Convert field code to strict C89 (+ long long, +__int128) 3627437 C89 nits and dead code removal. a9f350d Merge pull request #191 4732d26 Convert the field/group/ecdsa constant initialization to static consts 19f3e76 Remove unused secp256k1_fe_inner_{start, stop} functions f1ebfe3 Convert the scalar constant initialization to static consts git-subtree-dir: src/secp256k1 git-subtree-split: 1897b8e90bbbdcd919427c9a8ae35b420e919d8f
2015-03-27 14:03:36 -07:00
parent 7873633b57
commit 9d09322b41
38 changed files with 2529 additions and 1469 deletions
--- a/src/bench.h
+++ b/src/bench.h
@@ -17,21 +17,40 @@ static double gettimedouble(void) {
    return tv.tv_usec * 0.000001 + tv.tv_sec;
 }

-void run_benchmark(void (*benchmark)(void*), void (*setup)(void*), void (*teardown)(void*), void* data, int count, int iter) {
+/* Print x with just enough decimals to show at least three significant digits. */
+void print_number(double x) {
+    double y = (x < 0.0) ? -x : x;
+    int c = 0;
+    while (y > 0.0 && y < 100.0) { /* y > 0.0: zero can never reach 100, so don't scale it */
+        y *= 10.0;
+        c++;
+    }
+    printf("%.*f", c, x);
+}
+
+void run_benchmark(char *name, void (*benchmark)(void*), void (*setup)(void*), void (*teardown)(void*), void* data, int count, int iter) { /* Time `count` runs of benchmark(data); print min/avg/max microseconds per iteration, prefixed by name. */
+    int i;
    double min = HUGE_VAL;
    double sum = 0.0;
    double max = 0.0;
-    for (int i = 0; i < count; i++) {
+    for (i = 0; i < count; i++) {
+        double begin, total; /* timestamps from gettimedouble(); setup/teardown stay outside the timed span */
        if (setup) setup(data);
-        double begin = gettimedouble();
+        begin = gettimedouble();
        benchmark(data);
-        double total = gettimedouble() - begin;
+        total = gettimedouble() - begin;
        if (teardown) teardown(data);
        if (total < min) min = total;
        if (total > max) max = total;
        sum += total;
    }
-    printf("min %.3fus / avg %.3fus / max %.3fus\n", min * 1000000.0 / iter, (sum / count) * 1000000.0 / iter, max * 1000000.0 / iter);
+    printf("%s: min ", name);
+    print_number(min * 1000000.0 / iter);
+    printf("us / avg ");
+    print_number((sum / count) * 1000000.0 / iter);
+    printf("us / max "); /* label for the maximum, printed next */
+    print_number(max * 1000000.0 / iter);
+    printf("us\n");
 }

 #endif
--- a/src/bench_internal.c
+++ b/src/bench_internal.c
@@ -0,0 +1,318 @@
+/**********************************************************************
+ * Copyright (c) 2014-2015 Pieter Wuille                              *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+#include <stdio.h>
+
+#include "include/secp256k1.h"
+
+#include "util.h"
+#include "hash_impl.h"
+#include "num_impl.h"
+#include "field_impl.h"
+#include "group_impl.h"
+#include "scalar_impl.h"
+#include "ecmult_impl.h"
+#include "bench.h"
+
+typedef struct {
+    secp256k1_scalar_t scalar_x, scalar_y;
+    secp256k1_fe_t fe_x, fe_y;
+    secp256k1_ge_t ge_x, ge_y;
+    secp256k1_gej_t gej_x, gej_y;
+    unsigned char data[32];
+    int wnaf[256];
+} bench_inv_t;
+
+void bench_setup(void* arg) {
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    static const unsigned char init_x[32] = {
+        0x02, 0x03, 0x05, 0x07, 0x0b, 0x0d, 0x11, 0x13,
+        0x17, 0x1d, 0x1f, 0x25, 0x29, 0x2b, 0x2f, 0x35,
+        0x3b, 0x3d, 0x43, 0x47, 0x49, 0x4f, 0x53, 0x59,
+        0x61, 0x65, 0x67, 0x6b, 0x6d, 0x71, 0x7f, 0x83
+    };
+
+    static const unsigned char init_y[32] = {
+        0x82, 0x83, 0x85, 0x87, 0x8b, 0x8d, 0x81, 0x83,
+        0x97, 0xad, 0xaf, 0xb5, 0xb9, 0xbb, 0xbf, 0xc5,
+        0xdb, 0xdd, 0xe3, 0xe7, 0xe9, 0xef, 0xf3, 0xf9,
+        0x11, 0x15, 0x17, 0x1b, 0x1d, 0xb1, 0xbf, 0xd3
+    };
+
+    secp256k1_scalar_set_b32(&data->scalar_x, init_x, NULL);
+    secp256k1_scalar_set_b32(&data->scalar_y, init_y, NULL);
+    secp256k1_fe_set_b32(&data->fe_x, init_x);
+    secp256k1_fe_set_b32(&data->fe_y, init_y);
+    CHECK(secp256k1_ge_set_xo_var(&data->ge_x, &data->fe_x, 0));
+    CHECK(secp256k1_ge_set_xo_var(&data->ge_y, &data->fe_y, 1));
+    secp256k1_gej_set_ge(&data->gej_x, &data->ge_x);
+    secp256k1_gej_set_ge(&data->gej_y, &data->ge_y);
+    memcpy(data->data, init_x, 32);
+}
+
+void bench_scalar_add(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000000; i++) {
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+void bench_scalar_negate(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000000; i++) {
+        secp256k1_scalar_negate(&data->scalar_x, &data->scalar_x);
+    }
+}
+
+void bench_scalar_sqr(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_scalar_sqr(&data->scalar_x, &data->scalar_x);
+    }
+}
+
+void bench_scalar_mul(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_scalar_mul(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+#ifdef USE_ENDOMORPHISM
+void bench_scalar_split(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_scalar_t l, r;
+        secp256k1_scalar_split_lambda_var(&l, &r, &data->scalar_x);
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+#endif
+
+void bench_scalar_inverse(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000; i++) {
+        secp256k1_scalar_inverse(&data->scalar_x, &data->scalar_x);
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+void bench_scalar_inverse_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000; i++) {
+        secp256k1_scalar_inverse_var(&data->scalar_x, &data->scalar_x);
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+void bench_field_normalize(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000000; i++) {
+        secp256k1_fe_normalize(&data->fe_x);
+    }
+}
+
+void bench_field_normalize_weak(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000000; i++) {
+        secp256k1_fe_normalize_weak(&data->fe_x);
+    }
+}
+
+void bench_field_mul(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_fe_mul(&data->fe_x, &data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_field_sqr(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_fe_sqr(&data->fe_x, &data->fe_x);
+    }
+}
+
+void bench_field_inverse(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_fe_inv(&data->fe_x, &data->fe_x);
+        secp256k1_fe_add(&data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_field_inverse_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_fe_inv_var(&data->fe_x, &data->fe_x);
+        secp256k1_fe_add(&data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_field_sqrt_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_fe_sqrt_var(&data->fe_x, &data->fe_x);
+        secp256k1_fe_add(&data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_group_double_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_gej_double_var(&data->gej_x, &data->gej_x);
+    }
+}
+
+void bench_group_add_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_gej_add_var(&data->gej_x, &data->gej_x, &data->gej_y);
+    }
+}
+
+void bench_group_add_affine(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_gej_add_ge(&data->gej_x, &data->gej_x, &data->ge_y);
+    }
+}
+
+void bench_group_add_affine_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_gej_add_ge_var(&data->gej_x, &data->gej_x, &data->ge_y);
+    }
+}
+
+void bench_ecmult_wnaf(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_ecmult_wnaf(data->wnaf, &data->scalar_x, WINDOW_A);
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+
+void bench_sha256(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+    secp256k1_sha256_t sha;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_sha256_initialize(&sha);
+        secp256k1_sha256_write(&sha, data->data, 32);
+        secp256k1_sha256_finalize(&sha, data->data);
+    }
+}
+
+void bench_hmac_sha256(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+    secp256k1_hmac_sha256_t hmac;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_hmac_sha256_initialize(&hmac, data->data, 32);
+        secp256k1_hmac_sha256_write(&hmac, data->data, 32);
+        secp256k1_hmac_sha256_finalize(&hmac, data->data);
+    }
+}
+
+void bench_rfc6979_hmac_sha256(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+    secp256k1_rfc6979_hmac_sha256_t rng;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_rfc6979_hmac_sha256_initialize(&rng, data->data, 32, data->data, 32, NULL, 0);
+        secp256k1_rfc6979_hmac_sha256_generate(&rng, data->data, 32);
+    }
+}
+
+
+int have_flag(int argc, char** argv, char *flag) {
+    char** argm = argv + argc;
+    argv++;
+    if (argv == argm) {
+        return 1;
+    }
+    while (argv != NULL && argv != argm) {
+        if (strcmp(*argv, flag) == 0) return 1;
+        argv++;
+    }
+    return 0;
+}
+
+int main(int argc, char **argv) {
+    bench_inv_t data;
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "add")) run_benchmark("scalar_add", bench_scalar_add, bench_setup, NULL, &data, 10, 2000000);
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, 2000000);
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "sqr")) run_benchmark("scalar_sqr", bench_scalar_sqr, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, 200000);
+#ifdef USE_ENDOMORPHISM
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "split")) run_benchmark("scalar_split", bench_scalar_split, bench_setup, NULL, &data, 10, 20000);
+#endif
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, 2000);
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, 2000);
+
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize", bench_field_normalize, bench_setup, NULL, &data, 10, 2000000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize_weak", bench_field_normalize_weak, bench_setup, NULL, &data, 10, 2000000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqr")) run_benchmark("field_sqr", bench_field_sqr, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "mul")) run_benchmark("field_mul", bench_field_mul, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "inverse")) run_benchmark("field_inverse", bench_field_inverse, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "inverse")) run_benchmark("field_inverse_var", bench_field_inverse_var, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqrt")) run_benchmark("field_sqrt_var", bench_field_sqrt_var, bench_setup, NULL, &data, 10, 20000);
+
+    if (have_flag(argc, argv, "group") || have_flag(argc, argv, "double")) run_benchmark("group_double_var", bench_group_double_var, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine", bench_group_add_affine, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine_var", bench_group_add_affine_var, bench_setup, NULL, &data, 10, 200000);
+
+    if (have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("ecmult_wnaf", bench_ecmult_wnaf, bench_setup, NULL, &data, 10, 20000);
+
+    if (have_flag(argc, argv, "hash") || have_flag(argc, argv, "sha256")) run_benchmark("hash_sha256", bench_sha256, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "hash") || have_flag(argc, argv, "hmac")) run_benchmark("hash_hmac_sha256", bench_hmac_sha256, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "hash") || have_flag(argc, argv, "rng6979")) run_benchmark("hash_rfc6979_hmac_sha256", bench_rfc6979_hmac_sha256, bench_setup, NULL, &data, 10, 20000);
+    return 0;
+}
--- a/src/bench_inv.c
+++ b/src/bench_inv.c
@@ -1,52 +0,0 @@
-/**********************************************************************
- * Copyright (c) 2014 Pieter Wuille                                   *
- * Distributed under the MIT software license, see the accompanying   *
- * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
- **********************************************************************/
-#include <stdio.h>
-
-#include "include/secp256k1.h"
-
-#include "util.h"
-#include "num_impl.h"
-#include "field_impl.h"
-#include "group_impl.h"
-#include "scalar_impl.h"
-#include "bench.h"
-
-typedef struct {
-    secp256k1_scalar_t base, x;
-} bench_inv_t;
-
-void bench_inv_setup(void* arg) {
-    bench_inv_t *data = (bench_inv_t*)arg;
-
-    static const unsigned char init[32] = {
-        0x02, 0x03, 0x05, 0x07, 0x0b, 0x0d, 0x11, 0x13,
-        0x17, 0x1d, 0x1f, 0x25, 0x29, 0x2b, 0x2f, 0x35,
-        0x3b, 0x3d, 0x43, 0x47, 0x49, 0x4f, 0x53, 0x59,
-        0x61, 0x65, 0x67, 0x6b, 0x6d, 0x71, 0x7f, 0x83
-    };
-
-    secp256k1_scalar_set_b32(&data->base, init, NULL);
-    secp256k1_scalar_set_b32(&data->x, init, NULL);
-}
-
-void bench_inv(void* arg) {
-    bench_inv_t *data = (bench_inv_t*)arg;
-
-    for (int i=0; i<20000; i++) {
-        secp256k1_scalar_inverse(&data->x, &data->x);
-        secp256k1_scalar_add(&data->x, &data->x, &data->base);
-    }
-}
-
-int main(void) {
-    secp256k1_ge_start();
-
-    bench_inv_t data;
-    run_benchmark(bench_inv, bench_inv_setup, NULL, &data, 10, 20000);
-
-    secp256k1_ge_stop();
-    return 0;
-}
--- a/src/bench_recover.c
+++ b/src/bench_recover.c
@@ -14,13 +14,15 @@ typedef struct {
 } bench_recover_t;

 void bench_recover(void* arg) {
+    int i;
    bench_recover_t *data = (bench_recover_t*)arg;
-
    unsigned char pubkey[33];
-    for (int i=0; i<20000; i++) {
+
+    for (i = 0; i < 20000; i++) {
+        int j;
        int pubkeylen = 33;
        CHECK(secp256k1_ecdsa_recover_compact(data->msg, data->sig, pubkey, &pubkeylen, 1, i % 2));
-        for (int j = 0; j < 32; j++) {
+        for (j = 0; j < 32; j++) {
            data->sig[j + 32] = data->msg[j];    /* Move former message to S. */
            data->msg[j] = data->sig[j];         /* Move former R to message. */
            data->sig[j] = pubkey[j + 1];        /* Move recovered pubkey X coordinate to R (which must be a valid X coordinate). */
@@ -29,17 +31,18 @@ void bench_recover(void* arg) {
 }

 void bench_recover_setup(void* arg) {
+    int i;
    bench_recover_t *data = (bench_recover_t*)arg;

-    for (int i = 0; i < 32; i++) data->msg[i] = 1 + i;
-    for (int i = 0; i < 64; i++) data->sig[i] = 65 + i;
+    for (i = 0; i < 32; i++) data->msg[i] = 1 + i;
+    for (i = 0; i < 64; i++) data->sig[i] = 65 + i;
 }

 int main(void) {
+    bench_recover_t data;
    secp256k1_start(SECP256K1_START_VERIFY);

-    bench_recover_t data;
-    run_benchmark(bench_recover, bench_recover_setup, NULL, &data, 10, 20000);
+    run_benchmark("ecdsa_recover", bench_recover, bench_recover_setup, NULL, &data, 10, 20000);

    secp256k1_stop();
    return 0;
--- a/src/bench_sign.c
+++ b/src/bench_sign.c
@@ -14,20 +14,23 @@ typedef struct {
 } bench_sign_t;

 static void bench_sign_setup(void* arg) {
+    int i;
    bench_sign_t *data = (bench_sign_t*)arg;

-    for (int i = 0; i < 32; i++) data->msg[i] = i + 1;
-    for (int i = 0; i < 32; i++) data->key[i] = i + 65;
+    for (i = 0; i < 32; i++) data->msg[i] = i + 1;
+    for (i = 0; i < 32; i++) data->key[i] = i + 65;
 }

 static void bench_sign(void* arg) {
+    int i;
    bench_sign_t *data = (bench_sign_t*)arg;

    unsigned char sig[64];
-    for (int i=0; i<20000; i++) {
+    for (i = 0; i < 20000; i++) {
+        int j;
        int recid = 0;
        CHECK(secp256k1_ecdsa_sign_compact(data->msg, sig, data->key, NULL, NULL, &recid));
-        for (int j = 0; j < 32; j++) {
+        for (j = 0; j < 32; j++) {
            data->msg[j] = sig[j];             /* Move former R to message. */
            data->key[j] = sig[j + 32];        /* Move former S to key.     */
        }
@@ -35,10 +38,10 @@ static void bench_sign(void* arg) {
 }

 int main(void) {
+    bench_sign_t data;
    secp256k1_start(SECP256K1_START_SIGN);

-    bench_sign_t data;
-    run_benchmark(bench_sign, bench_sign_setup, NULL, &data, 10, 20000);
+    run_benchmark("ecdsa_sign", bench_sign, bench_sign_setup, NULL, &data, 10, 20000);

    secp256k1_stop();
    return 0;
--- a/src/bench_verify.c
+++ b/src/bench_verify.c
@@ -21,9 +21,10 @@ typedef struct {
 } benchmark_verify_t;

 static void benchmark_verify(void* arg) {
+    int i;
    benchmark_verify_t* data = (benchmark_verify_t*)arg;

-    for (int i=0; i<20000; i++) {
+    for (i = 0; i < 20000; i++) {
        data->sig[data->siglen - 1] ^= (i & 0xFF);
        data->sig[data->siglen - 2] ^= ((i >> 8) & 0xFF);
        data->sig[data->siglen - 3] ^= ((i >> 16) & 0xFF);
@@ -35,18 +36,19 @@ static void benchmark_verify(void* arg) {
 }

 int main(void) {
-    secp256k1_start(SECP256K1_START_VERIFY | SECP256K1_START_SIGN);
-
+    int i;
    benchmark_verify_t data;

-    for (int i = 0; i < 32; i++) data.msg[i] = 1 + i;
-    for (int i = 0; i < 32; i++) data.key[i] = 33 + i;
+    secp256k1_start(SECP256K1_START_VERIFY | SECP256K1_START_SIGN);
+
+    for (i = 0; i < 32; i++) data.msg[i] = 1 + i;
+    for (i = 0; i < 32; i++) data.key[i] = 33 + i;
    data.siglen = 72;
    secp256k1_ecdsa_sign(data.msg, data.sig, &data.siglen, data.key, NULL, NULL);
    data.pubkeylen = 33;
    CHECK(secp256k1_ec_pubkey_create(data.pubkey, &data.pubkeylen, data.key, 1));

-    run_benchmark(benchmark_verify, NULL, NULL, &data, 10, 20000);
+    run_benchmark("ecdsa_verify", benchmark_verify, NULL, NULL, &data, 10, 20000);

    secp256k1_stop();
    return 0;
--- a/src/ecdsa.h
+++ b/src/ecdsa.h
@@ -10,9 +10,6 @@
 #include "scalar.h"
 #include "group.h"

-static void secp256k1_ecsda_start(void);
-static void secp256k1_ecdsa_stop(void);
-
 typedef struct {
    secp256k1_scalar_t r, s;
 } secp256k1_ecdsa_sig_t;
@@ -22,6 +19,5 @@ static int secp256k1_ecdsa_sig_serialize(unsigned char *sig, int *size, const se
 static int secp256k1_ecdsa_sig_verify(const secp256k1_ecdsa_sig_t *sig, const secp256k1_ge_t *pubkey, const secp256k1_scalar_t *message);
 static int secp256k1_ecdsa_sig_sign(secp256k1_ecdsa_sig_t *sig, const secp256k1_scalar_t *seckey, const secp256k1_scalar_t *message, const secp256k1_scalar_t *nonce, int *recid);
 static int secp256k1_ecdsa_sig_recover(const secp256k1_ecdsa_sig_t *sig, secp256k1_ge_t *pubkey, const secp256k1_scalar_t *message, int recid);
-static void secp256k1_ecdsa_sig_set_rs(secp256k1_ecdsa_sig_t *sig, const secp256k1_scalar_t *r, const secp256k1_scalar_t *s);

 #endif
--- a/src/ecdsa_impl.h
+++ b/src/ecdsa_impl.h
@@ -15,71 +15,69 @@
 #include "ecmult_gen.h"
 #include "ecdsa.h"

-typedef struct {
-    secp256k1_fe_t order_as_fe;
-    secp256k1_fe_t p_minus_order;
-} secp256k1_ecdsa_consts_t;
+/** Group order for secp256k1 defined as 'n' in "Standards for Efficient Cryptography" (SEC2) 2.7.1
+ *  sage: for t in xrange(1023, -1, -1):
+ *     ..   p = 2**256 - 2**32 - t
+ *     ..   if p.is_prime():
+ *     ..     print '%x'%p
+ *     ..     break
+ *   'fffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f'
+ *  sage: a = 0
+ *  sage: b = 7
+ *  sage: F = FiniteField (p)
+ *  sage: '%x' % (EllipticCurve ([F (a), F (b)]).order())
+ *   'fffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141'
+ */
+static const secp256k1_fe_t secp256k1_ecdsa_const_order_as_fe = SECP256K1_FE_CONST(
+    0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL,
+    0xBAAEDCE6UL, 0xAF48A03BUL, 0xBFD25E8CUL, 0xD0364141UL
+);

-static const secp256k1_ecdsa_consts_t *secp256k1_ecdsa_consts = NULL;
-
-static void secp256k1_ecdsa_start(void) {
-    if (secp256k1_ecdsa_consts != NULL)
-        return;
-
-    /* Allocate. */
-    secp256k1_ecdsa_consts_t *ret = (secp256k1_ecdsa_consts_t*)checked_malloc(sizeof(secp256k1_ecdsa_consts_t));
-
-    static const unsigned char order[] = {
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,
-        0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B,
-        0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41
-    };
-
-    secp256k1_fe_set_b32(&ret->order_as_fe, order);
-    secp256k1_fe_negate(&ret->p_minus_order, &ret->order_as_fe, 1);
-    secp256k1_fe_normalize_var(&ret->p_minus_order);
-
-    /* Set the global pointer. */
-    secp256k1_ecdsa_consts = ret;
-}
-
-static void secp256k1_ecdsa_stop(void) {
-    if (secp256k1_ecdsa_consts == NULL)
-        return;
-
-    secp256k1_ecdsa_consts_t *c = (secp256k1_ecdsa_consts_t*)secp256k1_ecdsa_consts;
-    secp256k1_ecdsa_consts = NULL;
-    free(c);
-}
+/** Difference between the field size 'p' and the group order 'n', as defined in
+ *  "Standards for Efficient Cryptography" (SEC2) 2.7.1.
+ *  sage: p = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F
+ *  sage: a = 0
+ *  sage: b = 7
+ *  sage: F = FiniteField (p)
+ *  sage: '%x' % (p - EllipticCurve ([F (a), F (b)]).order())
+ *   '14551231950b75fc4402da1722fc9baee'
+ */
+static const secp256k1_fe_t secp256k1_ecdsa_const_p_minus_order = SECP256K1_FE_CONST(
+    0, 0, 0, 1, 0x45512319UL, 0x50B75FC4UL, 0x402DA172UL, 0x2FC9BAEEUL
+);

 static int secp256k1_ecdsa_sig_parse(secp256k1_ecdsa_sig_t *r, const unsigned char *sig, int size) {
+    unsigned char ra[32] = {0}, sa[32] = {0};
+    const unsigned char *rp;
+    const unsigned char *sp;
+    int lenr;
+    int lens;
+    int overflow;
    if (sig[0] != 0x30) return 0;
-    int lenr = sig[3];
+    lenr = sig[3];
    if (5+lenr >= size) return 0;
-    int lens = sig[lenr+5];
+    lens = sig[lenr+5];
    if (sig[1] != lenr+lens+4) return 0;
    if (lenr+lens+6 > size) return 0;
    if (sig[2] != 0x02) return 0;
    if (lenr == 0) return 0;
    if (sig[lenr+4] != 0x02) return 0;
    if (lens == 0) return 0;
-    const unsigned char *sp = sig + 6 + lenr;
+    sp = sig + 6 + lenr;
    while (lens > 0 && sp[0] == 0) {
        lens--;
        sp++;
    }
    if (lens > 32) return 0;
-    const unsigned char *rp = sig + 4;
+    rp = sig + 4;
    while (lenr > 0 && rp[0] == 0) {
        lenr--;
        rp++;
    }
    if (lenr > 32) return 0;
-    unsigned char ra[32] = {0}, sa[32] = {0};
    memcpy(ra + 32 - lenr, rp, lenr);
    memcpy(sa + 32 - lens, sp, lens);
-    int overflow = 0;
+    overflow = 0;
    secp256k1_scalar_set_b32(&r->r, ra, &overflow);
    if (overflow) return 0;
    secp256k1_scalar_set_b32(&r->s, sa, &overflow);
@@ -89,10 +87,10 @@ static int secp256k1_ecdsa_sig_parse(secp256k1_ecdsa_sig_t *r, const unsigned ch

 static int secp256k1_ecdsa_sig_serialize(unsigned char *sig, int *size, const secp256k1_ecdsa_sig_t *a) {
    unsigned char r[33] = {0}, s[33] = {0};
-    secp256k1_scalar_get_b32(&r[1], &a->r);
-    secp256k1_scalar_get_b32(&s[1], &a->s);
    unsigned char *rp = r, *sp = s;
    int lenR = 33, lenS = 33;
+    secp256k1_scalar_get_b32(&r[1], &a->r);
+    secp256k1_scalar_get_b32(&s[1], &a->s);
    while (lenR > 1 && rp[0] == 0 && rp[1] < 0x80) { lenR--; rp++; }
    while (lenS > 1 && sp[0] == 0 && sp[1] < 0x80) { lenS--; sp++; }
    if (*size < 6+lenS+lenR)
@@ -110,93 +108,100 @@ static int secp256k1_ecdsa_sig_serialize(unsigned char *sig, int *size, const se
 }

 static int secp256k1_ecdsa_sig_verify(const secp256k1_ecdsa_sig_t *sig, const secp256k1_ge_t *pubkey, const secp256k1_scalar_t *message) {
+    unsigned char c[32];
+    secp256k1_scalar_t sn, u1, u2;
+    secp256k1_fe_t xr;
+    secp256k1_gej_t pubkeyj;
+    secp256k1_gej_t pr;
+
    if (secp256k1_scalar_is_zero(&sig->r) || secp256k1_scalar_is_zero(&sig->s))
        return 0;

-    secp256k1_scalar_t sn, u1, u2;
    secp256k1_scalar_inverse_var(&sn, &sig->s);
    secp256k1_scalar_mul(&u1, &sn, message);
    secp256k1_scalar_mul(&u2, &sn, &sig->r);
-    secp256k1_gej_t pubkeyj; secp256k1_gej_set_ge(&pubkeyj, pubkey);
-    secp256k1_gej_t pr; secp256k1_ecmult(&pr, &pubkeyj, &u2, &u1);
+    secp256k1_gej_set_ge(&pubkeyj, pubkey);
+    secp256k1_ecmult(&pr, &pubkeyj, &u2, &u1);
    if (secp256k1_gej_is_infinity(&pr)) {
        return 0;
    }
-    unsigned char c[32];
    secp256k1_scalar_get_b32(c, &sig->r);
-    secp256k1_fe_t xr;
    secp256k1_fe_set_b32(&xr, c);

-    // We now have the recomputed R point in pr, and its claimed x coordinate (modulo n)
-    // in xr. Naively, we would extract the x coordinate from pr (requiring a inversion modulo p),
-    // compute the remainder modulo n, and compare it to xr. However:
-    //
-    //       xr == X(pr) mod n
-    //   <=> exists h. (xr + h * n < p && xr + h * n == X(pr))
-    //   [Since 2 * n > p, h can only be 0 or 1]
-    //   <=> (xr == X(pr)) || (xr + n < p && xr + n == X(pr))
-    //   [In Jacobian coordinates, X(pr) is pr.x / pr.z^2 mod p]
-    //   <=> (xr == pr.x / pr.z^2 mod p) || (xr + n < p && xr + n == pr.x / pr.z^2 mod p)
-    //   [Multiplying both sides of the equations by pr.z^2 mod p]
-    //   <=> (xr * pr.z^2 mod p == pr.x) || (xr + n < p && (xr + n) * pr.z^2 mod p == pr.x)
-    //
-    // Thus, we can avoid the inversion, but we have to check both cases separately.
-    // secp256k1_gej_eq_x implements the (xr * pr.z^2 mod p == pr.x) test.
+    /** We now have the recomputed R point in pr, and its claimed x coordinate (modulo n)
+     *  in xr. Naively, we would extract the x coordinate from pr (requiring an inversion modulo p),
+     *  compute the remainder modulo n, and compare it to xr. However:
+     *
+     *        xr == X(pr) mod n
+     *    <=> exists h. (xr + h * n < p && xr + h * n == X(pr))
+     *    [Since 2 * n > p, h can only be 0 or 1]
+     *    <=> (xr == X(pr)) || (xr + n < p && xr + n == X(pr))
+     *    [In Jacobian coordinates, X(pr) is pr.x / pr.z^2 mod p]
+     *    <=> (xr == pr.x / pr.z^2 mod p) || (xr + n < p && xr + n == pr.x / pr.z^2 mod p)
+     *    [Multiplying both sides of the equations by pr.z^2 mod p]
+     *    <=> (xr * pr.z^2 mod p == pr.x) || (xr + n < p && (xr + n) * pr.z^2 mod p == pr.x)
+     *
+     *  Thus, we can avoid the inversion, but we have to check both cases separately.
+     *  secp256k1_gej_eq_x_var implements the (xr * pr.z^2 mod p == pr.x) test.
+     */
    if (secp256k1_gej_eq_x_var(&xr, &pr)) {
-        // xr.x == xr * xr.z^2 mod p, so the signature is valid.
+        /* pr.x == xr * pr.z^2 mod p, so the signature is valid. */
        return 1;
    }
-    if (secp256k1_fe_cmp_var(&xr, &secp256k1_ecdsa_consts->p_minus_order) >= 0) {
-        // xr + p >= n, so we can skip testing the second case.
+    if (secp256k1_fe_cmp_var(&xr, &secp256k1_ecdsa_const_p_minus_order) >= 0) {
+        /* xr + n >= p, so we can skip testing the second case. */
        return 0;
    }
-    secp256k1_fe_add(&xr, &secp256k1_ecdsa_consts->order_as_fe);
+    secp256k1_fe_add(&xr, &secp256k1_ecdsa_const_order_as_fe);
    if (secp256k1_gej_eq_x_var(&xr, &pr)) {
-        // (xr + n) * pr.z^2 mod p == pr.x, so the signature is valid.
+        /* (xr + n) * pr.z^2 mod p == pr.x, so the signature is valid. */
        return 1;
    }
    return 0;
 }

 static int secp256k1_ecdsa_sig_recover(const secp256k1_ecdsa_sig_t *sig, secp256k1_ge_t *pubkey, const secp256k1_scalar_t *message, int recid) {
+    unsigned char brx[32];
+    secp256k1_fe_t fx;
+    secp256k1_ge_t x;
+    secp256k1_gej_t xj;
+    secp256k1_scalar_t rn, u1, u2;
+    secp256k1_gej_t qj;
+
    if (secp256k1_scalar_is_zero(&sig->r) || secp256k1_scalar_is_zero(&sig->s))
        return 0;

-    unsigned char brx[32];
    secp256k1_scalar_get_b32(brx, &sig->r);
-    secp256k1_fe_t fx;
    VERIFY_CHECK(secp256k1_fe_set_b32(&fx, brx)); /* brx comes from a scalar, so is less than the order; certainly less than p */
    if (recid & 2) {
-        if (secp256k1_fe_cmp_var(&fx, &secp256k1_ecdsa_consts->p_minus_order) >= 0)
+        if (secp256k1_fe_cmp_var(&fx, &secp256k1_ecdsa_const_p_minus_order) >= 0)
            return 0;
-        secp256k1_fe_add(&fx, &secp256k1_ecdsa_consts->order_as_fe);
+        secp256k1_fe_add(&fx, &secp256k1_ecdsa_const_order_as_fe);
    }
-    secp256k1_ge_t x;
    if (!secp256k1_ge_set_xo_var(&x, &fx, recid & 1))
        return 0;
-    secp256k1_gej_t xj;
    secp256k1_gej_set_ge(&xj, &x);
-    secp256k1_scalar_t rn, u1, u2;
    secp256k1_scalar_inverse_var(&rn, &sig->r);
    secp256k1_scalar_mul(&u1, &rn, message);
    secp256k1_scalar_negate(&u1, &u1);
    secp256k1_scalar_mul(&u2, &rn, &sig->s);
-    secp256k1_gej_t qj;
    secp256k1_ecmult(&qj, &xj, &u2, &u1);
    secp256k1_ge_set_gej_var(pubkey, &qj);
    return !secp256k1_gej_is_infinity(&qj);
 }

 static int secp256k1_ecdsa_sig_sign(secp256k1_ecdsa_sig_t *sig, const secp256k1_scalar_t *seckey, const secp256k1_scalar_t *message, const secp256k1_scalar_t *nonce, int *recid) {
-    secp256k1_gej_t rp;
-    secp256k1_ecmult_gen(&rp, nonce);
-    secp256k1_ge_t r;
-    secp256k1_ge_set_gej(&r, &rp);
    unsigned char b[32];
+    secp256k1_gej_t rp;
+    secp256k1_ge_t r;
+    secp256k1_scalar_t n;
+    int overflow = 0;
+
+    secp256k1_ecmult_gen(&rp, nonce);
+    secp256k1_ge_set_gej(&r, &rp);
    secp256k1_fe_normalize(&r.x);
    secp256k1_fe_normalize(&r.y);
    secp256k1_fe_get_b32(b, &r.x);
-    int overflow = 0;
    secp256k1_scalar_set_b32(&sig->r, b, &overflow);
    if (secp256k1_scalar_is_zero(&sig->r)) {
        /* P.x = order is on the curve, so technically sig->r could end up zero, which would be an invalid signature. */
@@ -206,7 +211,6 @@ static int secp256k1_ecdsa_sig_sign(secp256k1_ecdsa_sig_t *sig, const secp256k1_
    }
    if (recid)
        *recid = (overflow ? 2 : 0) | (secp256k1_fe_is_odd(&r.y) ? 1 : 0);
-    secp256k1_scalar_t n;
    secp256k1_scalar_mul(&n, &sig->r, seckey);
    secp256k1_scalar_add(&n, &n, message);
    secp256k1_scalar_inverse(&sig->s, nonce);
@@ -224,9 +228,4 @@ static int secp256k1_ecdsa_sig_sign(secp256k1_ecdsa_sig_t *sig, const secp256k1_
    return 1;
 }

-static void secp256k1_ecdsa_sig_set_rs(secp256k1_ecdsa_sig_t *sig, const secp256k1_scalar_t *r, const secp256k1_scalar_t *s) {
-    sig->r = *r;
-    sig->s = *s;
-}
-
 #endif
--- a/src/eckey_impl.h
+++ b/src/eckey_impl.h
@@ -51,13 +51,16 @@ static int secp256k1_eckey_pubkey_serialize(secp256k1_ge_t *elem, unsigned char
 }

 static int secp256k1_eckey_privkey_parse(secp256k1_scalar_t *key, const unsigned char *privkey, int privkeylen) {
+    unsigned char c[32] = {0};
    const unsigned char *end = privkey + privkeylen;
+    int lenb = 0;
+    int len = 0;
+    int overflow = 0;
    /* sequence header */
    if (end < privkey+1 || *privkey != 0x30)
        return 0;
    privkey++;
    /* sequence length constructor */
-    int lenb = 0;
    if (end < privkey+1 || !(*privkey & 0x80))
        return 0;
    lenb = *privkey & ~0x80; privkey++;
@@ -66,7 +69,6 @@ static int secp256k1_eckey_privkey_parse(secp256k1_scalar_t *key, const unsigned
    if (end < privkey+lenb)
        return 0;
    /* sequence length */
-    int len = 0;
    len = privkey[lenb-1] | (lenb > 1 ? privkey[lenb-2] << 8 : 0);
    privkey += lenb;
    if (end < privkey+len)
@@ -78,8 +80,6 @@ static int secp256k1_eckey_privkey_parse(secp256k1_scalar_t *key, const unsigned
    /* sequence element 1: octet string, up to 32 bytes */
    if (end < privkey+2 || privkey[0] != 0x04 || privkey[1] > 0x20 || end < privkey+2+privkey[1])
        return 0;
-    int overflow = 0;
-    unsigned char c[32] = {0};
    memcpy(c + 32 - privkey[1], privkey + 2, privkey[1]);
    secp256k1_scalar_set_b32(key, c, &overflow);
    memset(c, 0, 32);
@@ -88,8 +88,9 @@ static int secp256k1_eckey_privkey_parse(secp256k1_scalar_t *key, const unsigned

 static int secp256k1_eckey_privkey_serialize(unsigned char *privkey, int *privkeylen, const secp256k1_scalar_t *key, int compressed) {
    secp256k1_gej_t rp;
-    secp256k1_ecmult_gen(&rp, key);
    secp256k1_ge_t r;
+    int pubkeylen = 0;
+    secp256k1_ecmult_gen(&rp, key);
    secp256k1_ge_set_gej(&r, &rp);
    if (compressed) {
        static const unsigned char begin[] = {
@@ -110,7 +111,6 @@ static int secp256k1_eckey_privkey_serialize(unsigned char *privkey, int *privke
        memcpy(ptr, begin, sizeof(begin)); ptr += sizeof(begin);
        secp256k1_scalar_get_b32(ptr, key); ptr += 32;
        memcpy(ptr, middle, sizeof(middle)); ptr += sizeof(middle);
-        int pubkeylen = 0;
        if (!secp256k1_eckey_pubkey_serialize(&r, ptr, &pubkeylen, 1)) {
            return 0;
        }
@@ -137,7 +137,6 @@ static int secp256k1_eckey_privkey_serialize(unsigned char *privkey, int *privke
        memcpy(ptr, begin, sizeof(begin)); ptr += sizeof(begin);
        secp256k1_scalar_get_b32(ptr, key); ptr += 32;
        memcpy(ptr, middle, sizeof(middle)); ptr += sizeof(middle);
-        int pubkeylen = 0;
        if (!secp256k1_eckey_pubkey_serialize(&r, ptr, &pubkeylen, 0)) {
            return 0;
        }
@@ -156,8 +155,8 @@ static int secp256k1_eckey_privkey_tweak_add(secp256k1_scalar_t *key, const secp

 static int secp256k1_eckey_pubkey_tweak_add(secp256k1_ge_t *key, const secp256k1_scalar_t *tweak) {
    secp256k1_gej_t pt;
-    secp256k1_gej_set_ge(&pt, key);
    secp256k1_scalar_t one;
+    secp256k1_gej_set_ge(&pt, key);
    secp256k1_scalar_set_int(&one, 1);
    secp256k1_ecmult(&pt, &pt, &one, tweak);

@@ -176,12 +175,12 @@ static int secp256k1_eckey_privkey_tweak_mul(secp256k1_scalar_t *key, const secp
 }

 static int secp256k1_eckey_pubkey_tweak_mul(secp256k1_ge_t *key, const secp256k1_scalar_t *tweak) {
+    secp256k1_scalar_t zero;
+    secp256k1_gej_t pt;
    if (secp256k1_scalar_is_zero(tweak))
        return 0;

-    secp256k1_scalar_t zero;
    secp256k1_scalar_set_int(&zero, 0);
-    secp256k1_gej_t pt;
    secp256k1_gej_set_ge(&pt, key);
    secp256k1_ecmult(&pt, &pt, tweak, &zero);
    secp256k1_ge_set_gej(key, &pt);
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@@ -24,49 +24,53 @@ typedef struct {
     * None of the resulting prec group elements have a known scalar, and neither do any of
     * the intermediate sums while computing a*G.
     */
-    secp256k1_fe_t prec[64][16][2]; /* prec[j][i] = (16^j * i * G + U_i).{x,y} */
+    secp256k1_ge_storage_t prec[64][16]; /* prec[j][i] = 16^j * i * G + U_i */
 } secp256k1_ecmult_gen_consts_t;

 static const secp256k1_ecmult_gen_consts_t *secp256k1_ecmult_gen_consts = NULL;

 static void secp256k1_ecmult_gen_start(void) {
+    secp256k1_ge_t prec[1024];
+    secp256k1_gej_t gj;
+    secp256k1_gej_t nums_gej;
+    secp256k1_ecmult_gen_consts_t *ret;
+    int i, j;
    if (secp256k1_ecmult_gen_consts != NULL)
        return;

    /* Allocate the precomputation table. */
-    secp256k1_ecmult_gen_consts_t *ret = (secp256k1_ecmult_gen_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_gen_consts_t));
+    ret = (secp256k1_ecmult_gen_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_gen_consts_t));

    /* get the generator */
-    const secp256k1_ge_t *g = &secp256k1_ge_consts->g;
-    secp256k1_gej_t gj; secp256k1_gej_set_ge(&gj, g);
+    secp256k1_gej_set_ge(&gj, &secp256k1_ge_const_g);

    /* Construct a group element with no known corresponding scalar (nothing up my sleeve). */
-    secp256k1_gej_t nums_gej;
    {
-        static const unsigned char nums_b32[32] = "The scalar for this x is unknown";
+        static const unsigned char nums_b32[33] = "The scalar for this x is unknown";
        secp256k1_fe_t nums_x;
-        VERIFY_CHECK(secp256k1_fe_set_b32(&nums_x, nums_b32));
        secp256k1_ge_t nums_ge;
+        VERIFY_CHECK(secp256k1_fe_set_b32(&nums_x, nums_b32));
        VERIFY_CHECK(secp256k1_ge_set_xo_var(&nums_ge, &nums_x, 0));
        secp256k1_gej_set_ge(&nums_gej, &nums_ge);
        /* Add G to make the bits in x uniformly distributed. */
-        secp256k1_gej_add_ge_var(&nums_gej, &nums_gej, g);
+        secp256k1_gej_add_ge_var(&nums_gej, &nums_gej, &secp256k1_ge_const_g);
    }

    /* compute prec. */
-    secp256k1_ge_t prec[1024];
    {
        secp256k1_gej_t precj[1024]; /* Jacobian versions of prec. */
-        secp256k1_gej_t gbase; gbase = gj; /* 16^j * G */
-        secp256k1_gej_t numsbase; numsbase = nums_gej; /* 2^j * nums. */
-        for (int j=0; j<64; j++) {
+        secp256k1_gej_t gbase;
+        secp256k1_gej_t numsbase;
+        gbase = gj; /* 16^j * G */
+        numsbase = nums_gej; /* 2^j * nums. */
+        for (j = 0; j < 64; j++) {
            /* Set precj[j*16 .. j*16+15] to (numsbase, numsbase + gbase, ..., numsbase + 15*gbase). */
            precj[j*16] = numsbase;
-            for (int i=1; i<16; i++) {
+            for (i = 1; i < 16; i++) {
                secp256k1_gej_add_var(&precj[j*16 + i], &precj[j*16 + i - 1], &gbase);
            }
            /* Multiply gbase by 16. */
-            for (int i=0; i<4; i++) {
+            for (i = 0; i < 4; i++) {
                secp256k1_gej_double_var(&gbase, &gbase);
            }
            /* Multiply numbase by 2. */
@@ -79,11 +83,9 @@ static void secp256k1_ecmult_gen_start(void) {
        }
        secp256k1_ge_set_all_gej_var(1024, prec, precj);
    }
-    for (int j=0; j<64; j++) {
-        for (int i=0; i<16; i++) {
-            VERIFY_CHECK(!secp256k1_ge_is_infinity(&prec[j*16 + i]));
-            ret->prec[j][i][0] = prec[j*16 + i].x;
-            ret->prec[j][i][1] = prec[j*16 + i].y;
+    for (j = 0; j < 64; j++) {
+        for (i = 0; i < 16; i++) {
+            secp256k1_ge_to_storage(&ret->prec[j][i], &prec[j*16 + i]);
        }
    }

@@ -92,26 +94,29 @@ static void secp256k1_ecmult_gen_start(void) {
 }

 static void secp256k1_ecmult_gen_stop(void) {
+    secp256k1_ecmult_gen_consts_t *c;
    if (secp256k1_ecmult_gen_consts == NULL)
        return;

-    secp256k1_ecmult_gen_consts_t *c = (secp256k1_ecmult_gen_consts_t*)secp256k1_ecmult_gen_consts;
+    c = (secp256k1_ecmult_gen_consts_t*)secp256k1_ecmult_gen_consts;
    secp256k1_ecmult_gen_consts = NULL;
    free(c);
 }

 static void secp256k1_ecmult_gen(secp256k1_gej_t *r, const secp256k1_scalar_t *gn) {
    const secp256k1_ecmult_gen_consts_t *c = secp256k1_ecmult_gen_consts;
-    secp256k1_gej_set_infinity(r);
    secp256k1_ge_t add;
-    add.infinity = 0;
+    secp256k1_ge_storage_t adds;
    int bits;
-    for (int j=0; j<64; j++) {
+    int i, j;
+    secp256k1_gej_set_infinity(r);
+    add.infinity = 0;
+    for (j = 0; j < 64; j++) {
        bits = secp256k1_scalar_get_bits(gn, j * 4, 4);
-        for (int i=0; i<16; i++) {
-            secp256k1_fe_cmov(&add.x, &c->prec[j][i][0], i == bits);
-            secp256k1_fe_cmov(&add.y, &c->prec[j][i][1], i == bits);
+        for (i = 0; i < 16; i++) {
+            secp256k1_ge_storage_cmov(&adds, &c->prec[j][i], i == bits);
        }
+        secp256k1_ge_from_storage(&add, &adds);
        secp256k1_gej_add_ge(r, r, &add);
    }
    bits = 0;
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@@ -37,22 +37,31 @@
 *  G is constant, so it only needs to be done once in advance.
 */
 static void secp256k1_ecmult_table_precomp_gej_var(secp256k1_gej_t *pre, const secp256k1_gej_t *a, int w) {
+    secp256k1_gej_t d;
+    int i;
    pre[0] = *a;
-    secp256k1_gej_t d; secp256k1_gej_double_var(&d, &pre[0]);
-    for (int i=1; i<(1 << (w-2)); i++)
+    secp256k1_gej_double_var(&d, &pre[0]);
+    for (i = 1; i < (1 << (w-2)); i++)
        secp256k1_gej_add_var(&pre[i], &d, &pre[i-1]);
 }

-static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const secp256k1_gej_t *a, int w) {
+static void secp256k1_ecmult_table_precomp_ge_storage_var(secp256k1_ge_storage_t *pre, const secp256k1_gej_t *a, int w) {
+    secp256k1_gej_t d;
+    int i;
    const int table_size = 1 << (w-2);
    secp256k1_gej_t *prej = checked_malloc(sizeof(secp256k1_gej_t) * table_size);
+    secp256k1_ge_t *prea = checked_malloc(sizeof(secp256k1_ge_t) * table_size);
    prej[0] = *a;
-    secp256k1_gej_t d; secp256k1_gej_double_var(&d, a);
-    for (int i=1; i<table_size; i++) {
+    secp256k1_gej_double_var(&d, a);
+    for (i = 1; i < table_size; i++) {
        secp256k1_gej_add_var(&prej[i], &d, &prej[i-1]);
    }
-    secp256k1_ge_set_all_gej_var(table_size, pre, prej);
+    secp256k1_ge_set_all_gej_var(table_size, prea, prej);
+    for (i = 0; i < table_size; i++) {
+        secp256k1_ge_to_storage(&pre[i], &prea[i]);
+    }
    free(prej);
+    free(prea);
 }

 /** The number of entries a table with precomputed multiples needs to have. */
@@ -60,51 +69,63 @@ static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const sec

 /** The following two macro retrieves a particular odd multiple from a table
 *  of precomputed multiples. */
-#define ECMULT_TABLE_GET(r,pre,n,w,neg) do { \
+#define ECMULT_TABLE_GET_GEJ(r,pre,n,w) do { \
    VERIFY_CHECK(((n) & 1) == 1); \
    VERIFY_CHECK((n) >= -((1 << ((w)-1)) - 1)); \
    VERIFY_CHECK((n) <=  ((1 << ((w)-1)) - 1)); \
    if ((n) > 0) \
        *(r) = (pre)[((n)-1)/2]; \
    else \
-        (neg)((r), &(pre)[(-(n)-1)/2]); \
+        secp256k1_gej_neg((r), &(pre)[(-(n)-1)/2]); \
+} while(0)
+#define ECMULT_TABLE_GET_GE_STORAGE(r,pre,n,w) do { \
+    VERIFY_CHECK(((n) & 1) == 1); \
+    VERIFY_CHECK((n) >= -((1 << ((w)-1)) - 1)); \
+    VERIFY_CHECK((n) <=  ((1 << ((w)-1)) - 1)); \
+    if ((n) > 0) \
+        secp256k1_ge_from_storage((r), &(pre)[((n)-1)/2]); \
+    else {\
+        secp256k1_ge_from_storage((r), &(pre)[(-(n)-1)/2]); \
+        secp256k1_ge_neg((r), (r)); \
+    } \
 } while(0)
-
-#define ECMULT_TABLE_GET_GEJ(r,pre,n,w) ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_gej_neg)
-#define ECMULT_TABLE_GET_GE(r,pre,n,w)  ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_ge_neg)

 typedef struct {
    /* For accelerating the computation of a*P + b*G: */
-    secp256k1_ge_t pre_g[ECMULT_TABLE_SIZE(WINDOW_G)];    /* odd multiples of the generator */
+    secp256k1_ge_storage_t pre_g[ECMULT_TABLE_SIZE(WINDOW_G)];    /* odd multiples of the generator */
 #ifdef USE_ENDOMORPHISM
-    secp256k1_ge_t pre_g_128[ECMULT_TABLE_SIZE(WINDOW_G)]; /* odd multiples of 2^128*generator */
+    secp256k1_ge_storage_t pre_g_128[ECMULT_TABLE_SIZE(WINDOW_G)]; /* odd multiples of 2^128*generator */
 #endif
 } secp256k1_ecmult_consts_t;

 static const secp256k1_ecmult_consts_t *secp256k1_ecmult_consts = NULL;

 static void secp256k1_ecmult_start(void) {
+    secp256k1_gej_t gj;
+    secp256k1_ecmult_consts_t *ret;
    if (secp256k1_ecmult_consts != NULL)
        return;

    /* Allocate the precomputation table. */
-    secp256k1_ecmult_consts_t *ret = (secp256k1_ecmult_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_consts_t));
+    ret = (secp256k1_ecmult_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_consts_t));

    /* get the generator */
-    const secp256k1_ge_t *g = &secp256k1_ge_consts->g;
-    secp256k1_gej_t gj; secp256k1_gej_set_ge(&gj, g);
+    secp256k1_gej_set_ge(&gj, &secp256k1_ge_const_g);

-#ifdef USE_ENDOMORPHISM
-    /* calculate 2^128*generator */
-    secp256k1_gej_t g_128j = gj;
-    for (int i=0; i<128; i++)
-        secp256k1_gej_double_var(&g_128j, &g_128j);
-#endif

    /* precompute the tables with odd multiples */
-    secp256k1_ecmult_table_precomp_ge_var(ret->pre_g, &gj, WINDOW_G);
+    secp256k1_ecmult_table_precomp_ge_storage_var(ret->pre_g, &gj, WINDOW_G);
+
 #ifdef USE_ENDOMORPHISM
-    secp256k1_ecmult_table_precomp_ge_var(ret->pre_g_128, &g_128j, WINDOW_G);
+    {
+        secp256k1_gej_t g_128j;
+        int i;
+        /* calculate 2^128*generator */
+        g_128j = gj;
+        for (i = 0; i < 128; i++)
+            secp256k1_gej_double_var(&g_128j, &g_128j);
+        secp256k1_ecmult_table_precomp_ge_storage_var(ret->pre_g_128, &g_128j, WINDOW_G);
+    }
 #endif

    /* Set the global pointer to the precomputation table. */
@@ -112,10 +133,11 @@ static void secp256k1_ecmult_start(void) {
 }

 static void secp256k1_ecmult_stop(void) {
+    secp256k1_ecmult_consts_t *c;
    if (secp256k1_ecmult_consts == NULL)
        return;

-    secp256k1_ecmult_consts_t *c = (secp256k1_ecmult_consts_t*)secp256k1_ecmult_consts;
+    c = (secp256k1_ecmult_consts_t*)secp256k1_ecmult_consts;
    secp256k1_ecmult_consts = NULL;
    free(c);
 }
@@ -129,16 +151,18 @@ static void secp256k1_ecmult_stop(void) {
 */
 static int secp256k1_ecmult_wnaf(int *wnaf, const secp256k1_scalar_t *a, int w) {
    secp256k1_scalar_t s = *a;
-
+    int set_bits = 0;
+    int bit = 0;
    int sign = 1;
+
    if (secp256k1_scalar_get_bits(&s, 255, 1)) {
        secp256k1_scalar_negate(&s, &s);
        sign = -1;
    }

-    int set_bits = 0;
-    int bit = 0;
    while (bit < 256) {
+        int now;
+        int word;
        if (secp256k1_scalar_get_bits(&s, bit, 1) == 0) {
            bit++;
            continue;
@@ -146,11 +170,11 @@ static int secp256k1_ecmult_wnaf(int *wnaf, const secp256k1_scalar_t *a, int w)
        while (set_bits < bit) {
            wnaf[set_bits++] = 0;
        }
-        int now = w;
+        now = w;
        if (bit + now > 256) {
            now = 256 - bit;
        }
-        int word = secp256k1_scalar_get_bits_var(&s, bit, now);
+        word = secp256k1_scalar_get_bits_var(&s, bit, now);
        if (word & (1 << (w-1))) {
            secp256k1_scalar_add_bit(&s, bit + w);
            wnaf[set_bits++] = sign * (word - (1 << w));
@@ -163,58 +187,74 @@ static int secp256k1_ecmult_wnaf(int *wnaf, const secp256k1_scalar_t *a, int w)
 }

 static void secp256k1_ecmult(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_scalar_t *na, const secp256k1_scalar_t *ng) {
+    secp256k1_gej_t tmpj;
+    secp256k1_gej_t pre_a[ECMULT_TABLE_SIZE(WINDOW_A)];
+    secp256k1_ge_t tmpa;
    const secp256k1_ecmult_consts_t *c = secp256k1_ecmult_consts;
+#ifdef USE_ENDOMORPHISM
+    secp256k1_gej_t pre_a_lam[ECMULT_TABLE_SIZE(WINDOW_A)];
+    secp256k1_scalar_t na_1, na_lam;
+    /* Splitted G factors. */
+    secp256k1_scalar_t ng_1, ng_128;
+    int wnaf_na_1[130];
+    int wnaf_na_lam[130];
+    int bits_na_1;
+    int bits_na_lam;
+    int wnaf_ng_1[129];
+    int bits_ng_1;
+    int wnaf_ng_128[129];
+    int bits_ng_128;
+#else
+    int wnaf_na[256];
+    int bits_na;
+    int wnaf_ng[257];
+    int bits_ng;
+#endif
+    int i;
+    int bits;

 #ifdef USE_ENDOMORPHISM
-    secp256k1_scalar_t na_1, na_lam;
    /* split na into na_1 and na_lam (where na = na_1 + na_lam*lambda, and na_1 and na_lam are ~128 bit) */
    secp256k1_scalar_split_lambda_var(&na_1, &na_lam, na);

    /* build wnaf representation for na_1 and na_lam. */
-    int wnaf_na_1[130];   int bits_na_1   = secp256k1_ecmult_wnaf(wnaf_na_1,   &na_1,   WINDOW_A);
-    int wnaf_na_lam[130]; int bits_na_lam = secp256k1_ecmult_wnaf(wnaf_na_lam, &na_lam, WINDOW_A);
+    bits_na_1   = secp256k1_ecmult_wnaf(wnaf_na_1,   &na_1,   WINDOW_A);
+    bits_na_lam = secp256k1_ecmult_wnaf(wnaf_na_lam, &na_lam, WINDOW_A);
    VERIFY_CHECK(bits_na_1 <= 130);
    VERIFY_CHECK(bits_na_lam <= 130);
-    int bits = bits_na_1;
+    bits = bits_na_1;
    if (bits_na_lam > bits) bits = bits_na_lam;
 #else
    /* build wnaf representation for na. */
-    int wnaf_na[256];     int bits_na     = secp256k1_ecmult_wnaf(wnaf_na,     na,      WINDOW_A);
-    int bits = bits_na;
+    bits_na     = secp256k1_ecmult_wnaf(wnaf_na,     na,      WINDOW_A);
+    bits = bits_na;
 #endif

    /* calculate odd multiples of a */
-    secp256k1_gej_t pre_a[ECMULT_TABLE_SIZE(WINDOW_A)];
    secp256k1_ecmult_table_precomp_gej_var(pre_a, a, WINDOW_A);

 #ifdef USE_ENDOMORPHISM
-    secp256k1_gej_t pre_a_lam[ECMULT_TABLE_SIZE(WINDOW_A)];
-    for (int i=0; i<ECMULT_TABLE_SIZE(WINDOW_A); i++)
+    for (i = 0; i < ECMULT_TABLE_SIZE(WINDOW_A); i++)
        secp256k1_gej_mul_lambda(&pre_a_lam[i], &pre_a[i]);

-    /* Splitted G factors. */
-    secp256k1_scalar_t ng_1, ng_128;
-
    /* split ng into ng_1 and ng_128 (where gn = gn_1 + gn_128*2^128, and gn_1 and gn_128 are ~128 bit) */
    secp256k1_scalar_split_128(&ng_1, &ng_128, ng);

    /* Build wnaf representation for ng_1 and ng_128 */
-    int wnaf_ng_1[129];   int bits_ng_1   = secp256k1_ecmult_wnaf(wnaf_ng_1,   &ng_1,   WINDOW_G);
-    int wnaf_ng_128[129]; int bits_ng_128 = secp256k1_ecmult_wnaf(wnaf_ng_128, &ng_128, WINDOW_G);
+    bits_ng_1   = secp256k1_ecmult_wnaf(wnaf_ng_1,   &ng_1,   WINDOW_G);
+    bits_ng_128 = secp256k1_ecmult_wnaf(wnaf_ng_128, &ng_128, WINDOW_G);
    if (bits_ng_1 > bits) bits = bits_ng_1;
    if (bits_ng_128 > bits) bits = bits_ng_128;
 #else
-    int wnaf_ng[257];     int bits_ng     = secp256k1_ecmult_wnaf(wnaf_ng,     ng,      WINDOW_G);
+    bits_ng     = secp256k1_ecmult_wnaf(wnaf_ng,     ng,      WINDOW_G);
    if (bits_ng > bits) bits = bits_ng;
 #endif

    secp256k1_gej_set_infinity(r);
-    secp256k1_gej_t tmpj;
-    secp256k1_ge_t tmpa;

-    for (int i=bits-1; i>=0; i--) {
-        secp256k1_gej_double_var(r, r);
+    for (i = bits-1; i >= 0; i--) {
        int n;
+        secp256k1_gej_double_var(r, r);
 #ifdef USE_ENDOMORPHISM
        if (i < bits_na_1 && (n = wnaf_na_1[i])) {
            ECMULT_TABLE_GET_GEJ(&tmpj, pre_a, n, WINDOW_A);
@@ -225,11 +265,11 @@ static void secp256k1_ecmult(secp256k1_gej_t *r, const secp256k1_gej_t *a, const
            secp256k1_gej_add_var(r, r, &tmpj);
        }
        if (i < bits_ng_1 && (n = wnaf_ng_1[i])) {
-            ECMULT_TABLE_GET_GE(&tmpa, c->pre_g, n, WINDOW_G);
+            ECMULT_TABLE_GET_GE_STORAGE(&tmpa, c->pre_g, n, WINDOW_G);
            secp256k1_gej_add_ge_var(r, r, &tmpa);
        }
        if (i < bits_ng_128 && (n = wnaf_ng_128[i])) {
-            ECMULT_TABLE_GET_GE(&tmpa, c->pre_g_128, n, WINDOW_G);
+            ECMULT_TABLE_GET_GE_STORAGE(&tmpa, c->pre_g_128, n, WINDOW_G);
            secp256k1_gej_add_ge_var(r, r, &tmpa);
        }
 #else
@@ -238,7 +278,7 @@ static void secp256k1_ecmult(secp256k1_gej_t *r, const secp256k1_gej_t *a, const
            secp256k1_gej_add_var(r, r, &tmpj);
        }
        if (i < bits_ng && (n = wnaf_ng[i])) {
-            ECMULT_TABLE_GET_GE(&tmpa, c->pre_g, n, WINDOW_G);
+            ECMULT_TABLE_GET_GE_STORAGE(&tmpa, c->pre_g, n, WINDOW_G);
            secp256k1_gej_add_ge_var(r, r, &tmpa);
        }
 #endif
--- a/src/field.h
+++ b/src/field.h
@@ -30,21 +30,6 @@
 #error "Please select field implementation"
 #endif

-typedef struct {
-#ifndef USE_NUM_NONE
-    secp256k1_num_t p;
-#endif
-    secp256k1_fe_t order;
-} secp256k1_fe_consts_t;
-
-static const secp256k1_fe_consts_t *secp256k1_fe_consts = NULL;
-
-/** Initialize field element precomputation data. */
-static void secp256k1_fe_start(void);
-
-/** Unload field element precomputation data. */
-static void secp256k1_fe_stop(void);
-
 /** Normalize a field element. */
 static void secp256k1_fe_normalize(secp256k1_fe_t *r);

@@ -117,15 +102,15 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a);
 /** Calculate the (modular) inverses of a batch of field elements. Requires the inputs' magnitudes to be
 *  at most 8. The output magnitudes are 1 (but not guaranteed to be normalized). The inputs and
 *  outputs must not overlap in memory. */
-static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]);
+static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t *r, const secp256k1_fe_t *a);

-/** Convert a field element to a hexadecimal string. */
-static void secp256k1_fe_get_hex(char *r, int *rlen, const secp256k1_fe_t *a);
+/** Convert a field element to the storage type. */
+static void secp256k1_fe_to_storage(secp256k1_fe_storage_t *r, const secp256k1_fe_t*);

-/** Convert a hexadecimal string to a field element. */
-static int secp256k1_fe_set_hex(secp256k1_fe_t *r, const char *a, int alen);
+/** Convert a field element back from the storage type. */
+static void secp256k1_fe_from_storage(secp256k1_fe_t *r, const secp256k1_fe_storage_t*);

 /** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. */
-static void secp256k1_fe_cmov(secp256k1_fe_t *r, const secp256k1_fe_t *a, int flag);
+static void secp256k1_fe_storage_cmov(secp256k1_fe_storage_t *r, const secp256k1_fe_storage_t *a, int flag);

 #endif
--- a/src/field_10x26.h
+++ b/src/field_10x26.h
@@ -18,4 +18,30 @@ typedef struct {
 #endif
 } secp256k1_fe_t;

+/* Unpacks a constant into a overlapping multi-limbed FE element. */
+#define SECP256K1_FE_CONST_INNER(d7, d6, d5, d4, d3, d2, d1, d0) { \
+    (d0) & 0x3FFFFFFUL, \
+    ((d0) >> 26) | ((d1) & 0xFFFFFUL) << 6, \
+    ((d1) >> 20) | ((d2) & 0x3FFFUL) << 12, \
+    ((d2) >> 14) | ((d3) & 0xFFUL) << 18, \
+    ((d3) >> 8) | ((d4) & 0x3) << 24, \
+    ((d4) >> 2) & 0x3FFFFFFUL, \
+    ((d4) >> 28) | ((d5) & 0x3FFFFFUL) << 4, \
+    ((d5) >> 22) | ((d6) & 0xFFFF) << 10, \
+    ((d6) >> 16) | ((d7) & 0x3FF) << 16, \
+    ((d7) >> 10) \
+}
+
+#ifdef VERIFY
+#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0)), 1, 1}
+#else
+#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0))}
+#endif
+
+typedef struct {
+    uint32_t n[8];
+} secp256k1_fe_storage_t;
+
+#define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}
+
 #endif
--- a/src/field_10x26_impl.h
+++ b/src/field_10x26_impl.h
@@ -13,9 +13,6 @@
 #include "num.h"
 #include "field.h"

-static void secp256k1_fe_inner_start(void) {}
-static void secp256k1_fe_inner_stop(void) {}
-
 #ifdef VERIFY
 static void secp256k1_fe_verify(const secp256k1_fe_t *a) {
    const uint32_t *d = a->n;
@@ -54,8 +51,8 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];

    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
-    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
    uint32_t m;
+    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x3D1UL; t1 += (x << 6);
@@ -140,8 +137,8 @@ static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];

    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
-    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
    uint32_t m;
+    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x3D1UL; t1 += (x << 6);
@@ -195,12 +192,12 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe_t *r) {
    uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];

-    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
-    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
-
    /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
    uint32_t z0, z1;

+    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
+    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
+
    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x3D1UL; t1 += (x << 6);
    t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; z0  = t0; z1  = t0 ^ 0x3D0UL;
@@ -221,23 +218,36 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe_t *r) {
 }

 static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe_t *r) {
-    uint32_t t0 = r->n[0], t9 = r->n[9];
+    uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+    uint32_t z0, z1;
+    uint32_t x;
+
+    t0 = r->n[0];
+    t9 = r->n[9];

    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
-    uint32_t x = t9 >> 22;
+    x = t9 >> 22;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x3D1UL;

    /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
-    uint32_t z0 = t0 & 0x3FFFFFFUL, z1 = z0 ^ 0x3D0UL;
+    z0 = t0 & 0x3FFFFFFUL;
+    z1 = z0 ^ 0x3D0UL;

    /* Fast return path should catch the majority of cases */
    if ((z0 != 0UL) & (z1 != 0x3FFFFFFUL))
        return 0;

-    uint32_t t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
-             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8];
+    t1 = r->n[1];
+    t2 = r->n[2];
+    t3 = r->n[3];
+    t4 = r->n[4];
+    t5 = r->n[5];
+    t6 = r->n[6];
+    t7 = r->n[7];
+    t8 = r->n[8];
+
    t9 &= 0x03FFFFFUL;
    t1 += (x << 6);

@@ -269,11 +279,11 @@ SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
 }

 SECP256K1_INLINE static int secp256k1_fe_is_zero(const secp256k1_fe_t *a) {
+    const uint32_t *t = a->n;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    secp256k1_fe_verify(a);
 #endif
-    const uint32_t *t = a->n;
    return (t[0] | t[1] | t[2] | t[3] | t[4] | t[5] | t[6] | t[7] | t[8] | t[9]) == 0;
 }

@@ -286,23 +296,25 @@ SECP256K1_INLINE static int secp256k1_fe_is_odd(const secp256k1_fe_t *a) {
 }

 SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe_t *a) {
+    int i;
 #ifdef VERIFY
    a->magnitude = 0;
    a->normalized = 1;
 #endif
-    for (int i=0; i<10; i++) {
+    for (i=0; i<10; i++) {
        a->n[i] = 0;
    }
 }

 static int secp256k1_fe_cmp_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b) {
+    int i;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    VERIFY_CHECK(b->normalized);
    secp256k1_fe_verify(a);
    secp256k1_fe_verify(b);
 #endif
-    for (int i = 9; i >= 0; i--) {
+    for (i = 9; i >= 0; i--) {
        if (a->n[i] > b->n[i]) return 1;
        if (a->n[i] < b->n[i]) return -1;
    }
@@ -310,10 +322,12 @@ static int secp256k1_fe_cmp_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b
 }

 static int secp256k1_fe_set_b32(secp256k1_fe_t *r, const unsigned char *a) {
+    int i;
    r->n[0] = r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0;
    r->n[5] = r->n[6] = r->n[7] = r->n[8] = r->n[9] = 0;
-    for (int i=0; i<32; i++) {
-        for (int j=0; j<4; j++) {
+    for (i=0; i<32; i++) {
+        int j;
+        for (j=0; j<4; j++) {
            int limb = (8*i+2*j)/26;
            int shift = (8*i+2*j)%26;
            r->n[limb] |= (uint32_t)((a[31-i] >> (2*j)) & 0x3) << shift;
@@ -332,13 +346,15 @@ static int secp256k1_fe_set_b32(secp256k1_fe_t *r, const unsigned char *a) {

 /** Convert a field element to a 32-byte big endian value. Requires the input to be normalized */
 static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe_t *a) {
+    int i;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    secp256k1_fe_verify(a);
 #endif
-    for (int i=0; i<32; i++) {
+    for (i=0; i<32; i++) {
+        int j;
        int c = 0;
-        for (int j=0; j<4; j++) {
+        for (j=0; j<4; j++) {
            int limb = (8*i+2*j)/26;
            int shift = (8*i+2*j)%26;
            c |= ((a->n[limb] >> shift) & 0x3) << (2 * j);
@@ -415,6 +431,11 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1
 #endif

 SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b) {
+    uint64_t c, d;
+    uint64_t u0, u1, u2, u3, u4, u5, u6, u7, u8;
+    uint32_t t9, t1, t0, t2, t3, t4, t5, t6, t7;
+    const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
+
    VERIFY_BITS(a[0], 30);
    VERIFY_BITS(a[1], 30);
    VERIFY_BITS(a[2], 30);
@@ -436,14 +457,11 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
    VERIFY_BITS(b[8], 30);
    VERIFY_BITS(b[9], 26);

-    const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
    /** [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n.
     *  px is a shorthand for sum(a[i]*b[x-i], i=0..x).
     *  Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0].
     */

-    uint64_t c, d;
-
    d  = (uint64_t)a[0] * b[9]
       + (uint64_t)a[1] * b[8]
       + (uint64_t)a[2] * b[7]
@@ -456,7 +474,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[0];
    /* VERIFY_BITS(d, 64); */
    /* [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */
-    uint32_t t9 = d & M; d >>= 26;
+    t9 = d & M; d >>= 26;
    VERIFY_BITS(t9, 26);
    VERIFY_BITS(d, 38);
    /* [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */
@@ -475,12 +493,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[1];
    VERIFY_BITS(d, 63);
    /* [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
-    uint64_t u0 = d & M; d >>= 26; c += u0 * R0;
+    u0 = d & M; d >>= 26; c += u0 * R0;
    VERIFY_BITS(u0, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 61);
    /* [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
-    uint32_t t0 = c & M; c >>= 26; c += u0 * R1;
+    t0 = c & M; c >>= 26; c += u0 * R1;
    VERIFY_BITS(t0, 26);
    VERIFY_BITS(c, 37);
    /* [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
@@ -500,12 +518,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[2];
    VERIFY_BITS(d, 63);
    /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
-    uint64_t u1 = d & M; d >>= 26; c += u1 * R0;
+    u1 = d & M; d >>= 26; c += u1 * R0;
    VERIFY_BITS(u1, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 63);
    /* [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
-    uint32_t t1 = c & M; c >>= 26; c += u1 * R1;
+    t1 = c & M; c >>= 26; c += u1 * R1;
    VERIFY_BITS(t1, 26);
    VERIFY_BITS(c, 38);
    /* [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
@@ -525,12 +543,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[3];
    VERIFY_BITS(d, 63);
    /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
-    uint64_t u2 = d & M; d >>= 26; c += u2 * R0;
+    u2 = d & M; d >>= 26; c += u2 * R0;
    VERIFY_BITS(u2, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 63);
    /* [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
-    uint32_t t2 = c & M; c >>= 26; c += u2 * R1;
+    t2 = c & M; c >>= 26; c += u2 * R1;
    VERIFY_BITS(t2, 26);
    VERIFY_BITS(c, 38);
    /* [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
@@ -550,12 +568,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[4];
    VERIFY_BITS(d, 63);
    /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
-    uint64_t u3 = d & M; d >>= 26; c += u3 * R0;
+    u3 = d & M; d >>= 26; c += u3 * R0;
    VERIFY_BITS(u3, 26);
    VERIFY_BITS(d, 37);
    /* VERIFY_BITS(c, 64); */
    /* [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
-    uint32_t t3 = c & M; c >>= 26; c += u3 * R1;
+    t3 = c & M; c >>= 26; c += u3 * R1;
    VERIFY_BITS(t3, 26);
    VERIFY_BITS(c, 39);
    /* [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
@@ -575,12 +593,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[5];
    VERIFY_BITS(d, 62);
    /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
-    uint64_t u4 = d & M; d >>= 26; c += u4 * R0;
+    u4 = d & M; d >>= 26; c += u4 * R0;
    VERIFY_BITS(u4, 26);
    VERIFY_BITS(d, 36);
    /* VERIFY_BITS(c, 64); */
    /* [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
-    uint32_t t4 = c & M; c >>= 26; c += u4 * R1;
+    t4 = c & M; c >>= 26; c += u4 * R1;
    VERIFY_BITS(t4, 26);
    VERIFY_BITS(c, 39);
    /* [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
@@ -600,12 +618,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[6];
    VERIFY_BITS(d, 62);
    /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
-    uint64_t u5 = d & M; d >>= 26; c += u5 * R0;
+    u5 = d & M; d >>= 26; c += u5 * R0;
    VERIFY_BITS(u5, 26);
    VERIFY_BITS(d, 36);
    /* VERIFY_BITS(c, 64); */
    /* [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
-    uint32_t t5 = c & M; c >>= 26; c += u5 * R1;
+    t5 = c & M; c >>= 26; c += u5 * R1;
    VERIFY_BITS(t5, 26);
    VERIFY_BITS(c, 39);
    /* [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
@@ -625,12 +643,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[7];
    VERIFY_BITS(d, 61);
    /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u6 = d & M; d >>= 26; c += u6 * R0;
+    u6 = d & M; d >>= 26; c += u6 * R0;
    VERIFY_BITS(u6, 26);
    VERIFY_BITS(d, 35);
    /* VERIFY_BITS(c, 64); */
    /* [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
-    uint32_t t6 = c & M; c >>= 26; c += u6 * R1;
+    t6 = c & M; c >>= 26; c += u6 * R1;
    VERIFY_BITS(t6, 26);
    VERIFY_BITS(c, 39);
    /* [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
@@ -651,13 +669,13 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[8];
    VERIFY_BITS(d, 58);
    /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u7 = d & M; d >>= 26; c += u7 * R0;
+    u7 = d & M; d >>= 26; c += u7 * R0;
    VERIFY_BITS(u7, 26);
    VERIFY_BITS(d, 32);
    /* VERIFY_BITS(c, 64); */
    VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL);
    /* [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint32_t t7 = c & M; c >>= 26; c += u7 * R1;
+    t7 = c & M; c >>= 26; c += u7 * R1;
    VERIFY_BITS(t7, 26);
    VERIFY_BITS(c, 38);
    /* [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
@@ -678,7 +696,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
    d += (uint64_t)a[9] * b[9];
    VERIFY_BITS(d, 57);
    /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u8 = d & M; d >>= 26; c += u8 * R0;
+    u8 = d & M; d >>= 26; c += u8 * R0;
    VERIFY_BITS(u8, 26);
    VERIFY_BITS(d, 31);
    /* VERIFY_BITS(c, 64); */
@@ -742,6 +760,11 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
 }

 SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t *a) {
+    uint64_t c, d;
+    uint64_t u0, u1, u2, u3, u4, u5, u6, u7, u8;
+    uint32_t t9, t0, t1, t2, t3, t4, t5, t6, t7;
+    const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
+
    VERIFY_BITS(a[0], 30);
    VERIFY_BITS(a[1], 30);
    VERIFY_BITS(a[2], 30);
@@ -753,14 +776,11 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
    VERIFY_BITS(a[8], 30);
    VERIFY_BITS(a[9], 26);

-    const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
    /** [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n.
     *  px is a shorthand for sum(a[i]*a[x-i], i=0..x).
     *  Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0].
     */

-    uint64_t c, d;
-
    d  = (uint64_t)(a[0]*2) * a[9]
       + (uint64_t)(a[1]*2) * a[8]
       + (uint64_t)(a[2]*2) * a[7]
@@ -768,7 +788,7 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)(a[4]*2) * a[5];
    /* VERIFY_BITS(d, 64); */
    /* [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */
-    uint32_t t9 = d & M; d >>= 26;
+    t9 = d & M; d >>= 26;
    VERIFY_BITS(t9, 26);
    VERIFY_BITS(d, 38);
    /* [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */
@@ -783,12 +803,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[5] * a[5];
    VERIFY_BITS(d, 63);
    /* [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
-    uint64_t u0 = d & M; d >>= 26; c += u0 * R0;
+    u0 = d & M; d >>= 26; c += u0 * R0;
    VERIFY_BITS(u0, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 61);
    /* [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
-    uint32_t t0 = c & M; c >>= 26; c += u0 * R1;
+    t0 = c & M; c >>= 26; c += u0 * R1;
    VERIFY_BITS(t0, 26);
    VERIFY_BITS(c, 37);
    /* [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
@@ -803,12 +823,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)(a[5]*2) * a[6];
    VERIFY_BITS(d, 63);
    /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
-    uint64_t u1 = d & M; d >>= 26; c += u1 * R0;
+    u1 = d & M; d >>= 26; c += u1 * R0;
    VERIFY_BITS(u1, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 63);
    /* [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
-    uint32_t t1 = c & M; c >>= 26; c += u1 * R1;
+    t1 = c & M; c >>= 26; c += u1 * R1;
    VERIFY_BITS(t1, 26);
    VERIFY_BITS(c, 38);
    /* [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
@@ -824,12 +844,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[6] * a[6];
    VERIFY_BITS(d, 63);
    /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
-    uint64_t u2 = d & M; d >>= 26; c += u2 * R0;
+    u2 = d & M; d >>= 26; c += u2 * R0;
    VERIFY_BITS(u2, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 63);
    /* [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
-    uint32_t t2 = c & M; c >>= 26; c += u2 * R1;
+    t2 = c & M; c >>= 26; c += u2 * R1;
    VERIFY_BITS(t2, 26);
    VERIFY_BITS(c, 38);
    /* [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
@@ -844,12 +864,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)(a[6]*2) * a[7];
    VERIFY_BITS(d, 63);
    /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
-    uint64_t u3 = d & M; d >>= 26; c += u3 * R0;
+    u3 = d & M; d >>= 26; c += u3 * R0;
    VERIFY_BITS(u3, 26);
    VERIFY_BITS(d, 37);
    /* VERIFY_BITS(c, 64); */
    /* [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
-    uint32_t t3 = c & M; c >>= 26; c += u3 * R1;
+    t3 = c & M; c >>= 26; c += u3 * R1;
    VERIFY_BITS(t3, 26);
    VERIFY_BITS(c, 39);
    /* [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
@@ -865,12 +885,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[7] * a[7];
    VERIFY_BITS(d, 62);
    /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
-    uint64_t u4 = d & M; d >>= 26; c += u4 * R0;
+    u4 = d & M; d >>= 26; c += u4 * R0;
    VERIFY_BITS(u4, 26);
    VERIFY_BITS(d, 36);
    /* VERIFY_BITS(c, 64); */
    /* [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
-    uint32_t t4 = c & M; c >>= 26; c += u4 * R1;
+    t4 = c & M; c >>= 26; c += u4 * R1;
    VERIFY_BITS(t4, 26);
    VERIFY_BITS(c, 39);
    /* [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
@@ -885,12 +905,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)(a[7]*2) * a[8];
    VERIFY_BITS(d, 62);
    /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
-    uint64_t u5 = d & M; d >>= 26; c += u5 * R0;
+    u5 = d & M; d >>= 26; c += u5 * R0;
    VERIFY_BITS(u5, 26);
    VERIFY_BITS(d, 36);
    /* VERIFY_BITS(c, 64); */
    /* [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
-    uint32_t t5 = c & M; c >>= 26; c += u5 * R1;
+    t5 = c & M; c >>= 26; c += u5 * R1;
    VERIFY_BITS(t5, 26);
    VERIFY_BITS(c, 39);
    /* [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
@@ -906,12 +926,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[8] * a[8];
    VERIFY_BITS(d, 61);
    /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u6 = d & M; d >>= 26; c += u6 * R0;
+    u6 = d & M; d >>= 26; c += u6 * R0;
    VERIFY_BITS(u6, 26);
    VERIFY_BITS(d, 35);
    /* VERIFY_BITS(c, 64); */
    /* [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
-    uint32_t t6 = c & M; c >>= 26; c += u6 * R1;
+    t6 = c & M; c >>= 26; c += u6 * R1;
    VERIFY_BITS(t6, 26);
    VERIFY_BITS(c, 39);
    /* [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
@@ -927,13 +947,13 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
    d += (uint64_t)(a[8]*2) * a[9];
    VERIFY_BITS(d, 58);
    /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u7 = d & M; d >>= 26; c += u7 * R0;
+    u7 = d & M; d >>= 26; c += u7 * R0;
    VERIFY_BITS(u7, 26);
    VERIFY_BITS(d, 32);
    /* VERIFY_BITS(c, 64); */
    VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL);
    /* [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint32_t t7 = c & M; c >>= 26; c += u7 * R1;
+    t7 = c & M; c >>= 26; c += u7 * R1;
    VERIFY_BITS(t7, 26);
    VERIFY_BITS(c, 38);
    /* [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
@@ -950,7 +970,7 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
    d += (uint64_t)a[9] * a[9];
    VERIFY_BITS(d, 57);
    /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u8 = d & M; d >>= 26; c += u8 * R0;
+    u8 = d & M; d >>= 26; c += u8 * R0;
    VERIFY_BITS(u8, 26);
    VERIFY_BITS(d, 31);
    /* VERIFY_BITS(c, 64); */
@@ -1043,8 +1063,10 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #endif
 }

-static void secp256k1_fe_cmov(secp256k1_fe_t *r, const secp256k1_fe_t *a, int flag) {
-    uint32_t mask0 = flag + ~((uint32_t)0), mask1 = ~mask0;
+static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage_t *r, const secp256k1_fe_storage_t *a, int flag) {
+    uint32_t mask0, mask1;
+    mask0 = flag + ~((uint32_t)0);
+    mask1 = ~mask0;
    r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1);
    r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1);
    r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
@@ -1053,13 +1075,36 @@ static void secp256k1_fe_cmov(secp256k1_fe_t *r, const secp256k1_fe_t *a, int fl
    r->n[5] = (r->n[5] & mask0) | (a->n[5] & mask1);
    r->n[6] = (r->n[6] & mask0) | (a->n[6] & mask1);
    r->n[7] = (r->n[7] & mask0) | (a->n[7] & mask1);
-    r->n[8] = (r->n[8] & mask0) | (a->n[8] & mask1);
-    r->n[9] = (r->n[9] & mask0) | (a->n[9] & mask1);
+}
+
+static void secp256k1_fe_to_storage(secp256k1_fe_storage_t *r, const secp256k1_fe_t *a) { /* Pack the ten limbs a->n[0..9] into the eight storage words r->n[0..7]. */
 #ifdef VERIFY
-    if (flag) {
-        r->magnitude = a->magnitude;
-        r->normalized = a->normalized;
-    }
+    VERIFY_CHECK(a->normalized); /* VERIFY builds reject inputs that are not normalized */
+#endif
+    r->n[0] = a->n[0] | a->n[1] << 26; /* assuming 32-bit words, limb i lands at bit 26*i of the packed value */
+    r->n[1] = a->n[1] >> 6 | a->n[2] << 20; /* the part of limb 1 that did not fit word 0 comes first, then the next limb */
+    r->n[2] = a->n[2] >> 12 | a->n[3] << 14;
+    r->n[3] = a->n[3] >> 18 | a->n[4] << 8;
+    r->n[4] = a->n[4] >> 24 | a->n[5] << 2 | a->n[6] << 28; /* the only word fed by three limbs */
+    r->n[5] = a->n[6] >> 4 | a->n[7] << 22;
+    r->n[6] = a->n[7] >> 10 | a->n[8] << 16;
+    r->n[7] = a->n[8] >> 16 | a->n[9] << 10;
+}
+
+static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe_t *r, const secp256k1_fe_storage_t *a) { /* Unpack the storage words a->n[0..7] into the ten limbs r->n[0..9]. */
+    r->n[0] = a->n[0] & 0x3FFFFFFUL; /* the mask 0x3FFFFFF keeps 26 bits per limb */
+    r->n[1] = a->n[0] >> 26 | ((a->n[1] << 6) & 0x3FFFFFFUL); /* leftover high bits of one word, then low bits of the next */
+    r->n[2] = a->n[1] >> 20 | ((a->n[2] << 12) & 0x3FFFFFFUL);
+    r->n[3] = a->n[2] >> 14 | ((a->n[3] << 18) & 0x3FFFFFFUL);
+    r->n[4] = a->n[3] >> 8 | ((a->n[4] << 24) & 0x3FFFFFFUL);
+    r->n[5] = (a->n[4] >> 2) & 0x3FFFFFFUL; /* limb 5 is taken from a single word */
+    r->n[6] = a->n[4] >> 28 | ((a->n[5] << 4) & 0x3FFFFFFUL);
+    r->n[7] = a->n[5] >> 22 | ((a->n[6] << 10) & 0x3FFFFFFUL);
+    r->n[8] = a->n[6] >> 16 | ((a->n[7] << 16) & 0x3FFFFFFUL);
+    r->n[9] = a->n[7] >> 10; /* top limb: unmasked, it takes whatever remains of the last word */
+#ifdef VERIFY
+    r->magnitude = 1; /* VERIFY bookkeeping: the result is flagged as magnitude 1 ... */
+    r->normalized = 1; /* ... and normalized; the stored value itself is not re-checked here */
 #endif
 }

--- a/src/field_5x52.h
+++ b/src/field_5x52.h
@@ -18,4 +18,30 @@ typedef struct {
 #endif
 } secp256k1_fe_t;

+/* Unpacks a constant, given as eight 32-bit words (d7 most significant, d0 least significant), into the five limbs of an FE initializer; limbs take 52 bits each (48 for the last), so they overlap the word boundaries. */
+#define SECP256K1_FE_CONST_INNER(d7, d6, d5, d4, d3, d2, d1, d0) { \
+    (d0) | ((uint64_t)(d1) & 0xFFFFFUL) << 32, \
+    ((d1) >> 20) | ((uint64_t)(d2)) << 12 | ((uint64_t)(d3) & 0xFFUL) << 44, \
+    ((d3) >> 8) | ((uint64_t)(d4) & 0xFFFFFFFUL) << 24, \
+    ((d4) >> 28) | ((uint64_t)(d5)) << 4 | ((uint64_t)(d6) & 0xFFFFUL) << 36, \
+    ((d6) >> 16) | ((uint64_t)(d7)) << 16 \
+}
+
+#ifdef VERIFY /* VERIFY builds: the initializer also supplies two trailing members, both 1 (presumably magnitude and normalized; confirm against the struct definition) */
+#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0)), 1, 1}
+#else /* other builds: only the limb array is initialized */
+#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0))}
+#endif
+
+typedef struct { /* Packed form of a field element, as written by secp256k1_fe_to_storage and read by secp256k1_fe_from_storage. */
+    uint64_t n[4]; /* 4 x 64 bits with no per-limb headroom; n[0] receives limb 0 of the unpacked element */
+} secp256k1_fe_storage_t;
+
+#define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ /* Storage initializer from eight 32-bit words: each consecutive pair (d1:d0, d3:d2, ...) is joined into one 64-bit word, d0 in the low half of n[0]. */ \
+    (d0) | ((uint64_t)(d1)) << 32, \
+    (d2) | ((uint64_t)(d3)) << 32, \
+    (d4) | ((uint64_t)(d5)) << 32, \
+    (d6) | ((uint64_t)(d7)) << 32 \
+}}
+
 #endif
--- a/src/field_5x52_impl.h
+++ b/src/field_5x52_impl.h
@@ -30,13 +30,11 @@
 *  output.
 */

-static void secp256k1_fe_inner_start(void) {}
-static void secp256k1_fe_inner_stop(void) {}
-
 #ifdef VERIFY
 static void secp256k1_fe_verify(const secp256k1_fe_t *a) {
    const uint64_t *d = a->n;
    int m = a->normalized ? 1 : 2 * a->magnitude, r = 1;
+   /* secp256k1 'p' value defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */
    r &= (d[0] <= 0xFFFFFFFFFFFFFULL * m);
    r &= (d[1] <= 0xFFFFFFFFFFFFFULL * m);
    r &= (d[2] <= 0xFFFFFFFFFFFFFULL * m);
@@ -62,8 +60,8 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
    uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];

    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
-    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
    uint64_t m;
+    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x1000003D1ULL;
@@ -129,8 +127,8 @@ static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
    uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];

    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
-    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
    uint64_t m;
+    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x1000003D1ULL;
@@ -172,12 +170,12 @@ static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
 static int secp256k1_fe_normalizes_to_zero(secp256k1_fe_t *r) {
    uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];

-    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
-    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
-
    /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
    uint64_t z0, z1;

+    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
+    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
+
    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x1000003D1ULL;
    t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; z0  = t0; z1  = t0 ^ 0x1000003D0ULL;
@@ -193,22 +191,31 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe_t *r) {
 }

 static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe_t *r) {
-    uint64_t t0 = r->n[0], t4 = r->n[4];
+    uint64_t t0, t1, t2, t3, t4;
+    uint64_t z0, z1;
+    uint64_t x;
+
+    t0 = r->n[0];
+    t4 = r->n[4];

    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
-    uint64_t x = t4 >> 48;
+    x = t4 >> 48;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x1000003D1ULL;

    /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
-    uint64_t z0 = t0 & 0xFFFFFFFFFFFFFULL, z1 = z0 ^ 0x1000003D0ULL;
+    z0 = t0 & 0xFFFFFFFFFFFFFULL;
+    z1 = z0 ^ 0x1000003D0ULL;

    /* Fast return path should catch the majority of cases */
    if ((z0 != 0ULL) & (z1 != 0xFFFFFFFFFFFFFULL))
        return 0;

-    uint64_t t1 = r->n[1], t2 = r->n[2], t3 = r->n[3];
+    t1 = r->n[1];
+    t2 = r->n[2];
+    t3 = r->n[3];
+
    t4 &= 0x0FFFFFFFFFFFFULL;

    t1 += (t0 >> 52); t0  = z0;
@@ -234,11 +241,11 @@ SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
 }

 SECP256K1_INLINE static int secp256k1_fe_is_zero(const secp256k1_fe_t *a) {
+    const uint64_t *t = a->n;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    secp256k1_fe_verify(a);
 #endif
-    const uint64_t *t = a->n;
    return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0;
 }

@@ -251,23 +258,25 @@ SECP256K1_INLINE static int secp256k1_fe_is_odd(const secp256k1_fe_t *a) {
 }

 SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe_t *a) {
+    int i;
 #ifdef VERIFY
    a->magnitude = 0;
    a->normalized = 1;
 #endif
-    for (int i=0; i<5; i++) {
+    for (i=0; i<5; i++) {
        a->n[i] = 0;
    }
 }

 static int secp256k1_fe_cmp_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b) {
+    int i;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    VERIFY_CHECK(b->normalized);
    secp256k1_fe_verify(a);
    secp256k1_fe_verify(b);
 #endif
-    for (int i = 4; i >= 0; i--) {
+    for (i = 4; i >= 0; i--) {
        if (a->n[i] > b->n[i]) return 1;
        if (a->n[i] < b->n[i]) return -1;
    }
@@ -275,9 +284,11 @@ static int secp256k1_fe_cmp_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b
 }

 static int secp256k1_fe_set_b32(secp256k1_fe_t *r, const unsigned char *a) {
+    int i;
    r->n[0] = r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0;
-    for (int i=0; i<32; i++) {
-        for (int j=0; j<2; j++) {
+    for (i=0; i<32; i++) {
+        int j;
+        for (j=0; j<2; j++) {
            int limb = (8*i+4*j)/52;
            int shift = (8*i+4*j)%52;
            r->n[limb] |= (uint64_t)((a[31-i] >> (4*j)) & 0xF) << shift;
@@ -296,13 +307,15 @@ static int secp256k1_fe_set_b32(secp256k1_fe_t *r, const unsigned char *a) {

 /** Convert a field element to a 32-byte big endian value. Requires the input to be normalized */
 static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe_t *a) {
+    int i;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    secp256k1_fe_verify(a);
 #endif
-    for (int i=0; i<32; i++) {
+    for (i=0; i<32; i++) {
+        int j;
        int c = 0;
-        for (int j=0; j<2; j++) {
+        for (j=0; j<2; j++) {
            int limb = (8*i+4*j)/52;
            int shift = (8*i+4*j)%52;
            c |= ((a->n[limb] >> shift) & 0xF) << (4 * j);
@@ -386,18 +399,35 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #endif
 }

-static void secp256k1_fe_cmov(secp256k1_fe_t *r, const secp256k1_fe_t *a, int flag) {
-    uint64_t mask0 = flag + ~((uint64_t)0), mask1 = ~mask0;
+static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage_t *r, const secp256k1_fe_storage_t *a, int flag) { /* Branch-free select: r keeps its value when flag is 0 and receives a when flag is 1. */
+    uint64_t mask0, mask1;
+    mask0 = flag + ~((uint64_t)0); /* all ones for flag == 0; wraps around to 0 for flag == 1 (other flag values give no clean mask) */
+    mask1 = ~mask0; /* complement, so exactly one of the two masks passes bits through */
     r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1);
     r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1);
     r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
     r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1);
-    r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1);
+}
+
+static void secp256k1_fe_to_storage(secp256k1_fe_storage_t *r, const secp256k1_fe_t *a) { /* Pack the five limbs a->n[0..4] into the four 64-bit storage words r->n[0..3]. */
 #ifdef VERIFY
-    if (flag) {
-        r->magnitude = a->magnitude;
-        r->normalized = a->normalized;
-    }
+    VERIFY_CHECK(a->normalized); /* VERIFY builds reject inputs that are not normalized */
+#endif
+    r->n[0] = a->n[0] | a->n[1] << 52; /* limb i lands at bit 52*i of the packed value; the low 12 bits of limb 1 fill word 0 */
+    r->n[1] = a->n[1] >> 12 | a->n[2] << 40; /* rest of limb 1, then the low 24 bits of limb 2 */
+    r->n[2] = a->n[2] >> 24 | a->n[3] << 28;
+    r->n[3] = a->n[3] >> 36 | a->n[4] << 16; /* limb 4 occupies the top 48 bits */
+}
+
+static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe_t *r, const secp256k1_fe_storage_t *a) { /* Unpack the four 64-bit storage words a->n[0..3] into the five limbs r->n[0..4]. */
+    r->n[0] = a->n[0] & 0xFFFFFFFFFFFFFULL; /* the mask 0xFFFFFFFFFFFFF keeps 52 bits per limb */
+    r->n[1] = a->n[0] >> 52 | ((a->n[1] << 12) & 0xFFFFFFFFFFFFFULL); /* leftover high bits of one word, then low bits of the next */
+    r->n[2] = a->n[1] >> 40 | ((a->n[2] << 24) & 0xFFFFFFFFFFFFFULL);
+    r->n[3] = a->n[2] >> 28 | ((a->n[3] << 36) & 0xFFFFFFFFFFFFFULL);
+    r->n[4] = a->n[3] >> 16; /* top limb: the remaining 48 bits, no mask needed */
+#ifdef VERIFY
+    r->magnitude = 1; /* VERIFY bookkeeping: the result is flagged as magnitude 1 ... */
+    r->normalized = 1; /* ... and normalized; the stored value itself is not re-checked here */
 #endif
 }

--- a/src/field_5x52_int128_impl.h
+++ b/src/field_5x52_int128_impl.h
@@ -16,6 +16,11 @@
 #endif

 SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
+    uint128_t c, d;
+    uint64_t t3, t4, tx, u0;
+    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
+    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
+
    VERIFY_BITS(a[0], 56);
    VERIFY_BITS(a[1], 56);
    VERIFY_BITS(a[2], 56);
@@ -28,63 +33,58 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
    VERIFY_BITS(b[4], 52);
    VERIFY_CHECK(r != b);

-    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
    /*  [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
     *  px is a shorthand for sum(a[i]*b[x-i], i=0..x).
     *  Note that [x 0 0 0 0 0] = [x*R].
     */

-    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
-
-    __int128 c, d;
-
-    d  = (__int128)a0 * b[3]
-       + (__int128)a1 * b[2]
-       + (__int128)a2 * b[1]
-       + (__int128)a3 * b[0];
+    d  = (uint128_t)a0 * b[3]
+       + (uint128_t)a1 * b[2]
+       + (uint128_t)a2 * b[1]
+       + (uint128_t)a3 * b[0];
    VERIFY_BITS(d, 114);
    /* [d 0 0 0] = [p3 0 0 0] */
-    c  = (__int128)a4 * b[4];
+    c  = (uint128_t)a4 * b[4];
    VERIFY_BITS(c, 112);
    /* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
    d += (c & M) * R; c >>= 52;
    VERIFY_BITS(d, 115);
    VERIFY_BITS(c, 60);
    /* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-    uint64_t t3 = d & M; d >>= 52;
+    t3 = d & M; d >>= 52;
    VERIFY_BITS(t3, 52);
    VERIFY_BITS(d, 63);
    /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */

-    d += (__int128)a0 * b[4]
-       + (__int128)a1 * b[3]
-       + (__int128)a2 * b[2]
-       + (__int128)a3 * b[1]
-       + (__int128)a4 * b[0];
+    d += (uint128_t)a0 * b[4]
+       + (uint128_t)a1 * b[3]
+       + (uint128_t)a2 * b[2]
+       + (uint128_t)a3 * b[1]
+       + (uint128_t)a4 * b[0];
    VERIFY_BITS(d, 115);
    /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
    d += c * R;
    VERIFY_BITS(d, 116);
    /* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    uint64_t t4 = d & M; d >>= 52;
+    t4 = d & M; d >>= 52;
    VERIFY_BITS(t4, 52);
    VERIFY_BITS(d, 64);
    /* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    uint64_t tx = (t4 >> 48); t4 &= (M >> 4);
+    tx = (t4 >> 48); t4 &= (M >> 4);
    VERIFY_BITS(tx, 4);
    VERIFY_BITS(t4, 48);
    /* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */

-    c  = (__int128)a0 * b[0];
+    c  = (uint128_t)a0 * b[0];
    VERIFY_BITS(c, 112);
    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
-    d += (__int128)a1 * b[4]
-       + (__int128)a2 * b[3]
-       + (__int128)a3 * b[2]
-       + (__int128)a4 * b[1];
+    d += (uint128_t)a1 * b[4]
+       + (uint128_t)a2 * b[3]
+       + (uint128_t)a3 * b[2]
+       + (uint128_t)a4 * b[1];
    VERIFY_BITS(d, 115);
    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    uint64_t u0 = d & M; d >>= 52;
+    u0 = d & M; d >>= 52;
    VERIFY_BITS(u0, 52);
    VERIFY_BITS(d, 63);
    /* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
@@ -92,7 +92,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
    u0 = (u0 << 4) | tx;
    VERIFY_BITS(u0, 56);
    /* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    c += (__int128)u0 * (R >> 4);
+    c += (uint128_t)u0 * (R >> 4);
    VERIFY_BITS(c, 115);
    /* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
    r[0] = c & M; c >>= 52;
@@ -100,13 +100,13 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
    VERIFY_BITS(c, 61);
    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */

-    c += (__int128)a0 * b[1]
-       + (__int128)a1 * b[0];
+    c += (uint128_t)a0 * b[1]
+       + (uint128_t)a1 * b[0];
    VERIFY_BITS(c, 114);
    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
-    d += (__int128)a2 * b[4]
-       + (__int128)a3 * b[3]
-       + (__int128)a4 * b[2];
+    d += (uint128_t)a2 * b[4]
+       + (uint128_t)a3 * b[3]
+       + (uint128_t)a4 * b[2];
    VERIFY_BITS(d, 114);
    /* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
    c += (d & M) * R; d >>= 52;
@@ -118,13 +118,13 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
    VERIFY_BITS(c, 63);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */

-    c += (__int128)a0 * b[2]
-       + (__int128)a1 * b[1]
-       + (__int128)a2 * b[0];
+    c += (uint128_t)a0 * b[2]
+       + (uint128_t)a1 * b[1]
+       + (uint128_t)a2 * b[0];
    VERIFY_BITS(c, 114);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
-    d += (__int128)a3 * b[4]
-       + (__int128)a4 * b[3];
+    d += (uint128_t)a3 * b[4]
+       + (uint128_t)a4 * b[3];
    VERIFY_BITS(d, 114);
    /* [d 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
    c += (d & M) * R; d >>= 52;
@@ -153,64 +153,64 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
 }

 SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
+    uint128_t c, d; /* wide accumulators for the limb products */
+    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; /* local copies; a0 and a4 are doubled in place further down */
+    uint64_t t3, t4, tx, u0; /* unsigned, as in secp256k1_fe_mul_inner: these only hold masked (& M) or shifted limb values */
+    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; /* M: 52-bit limb mask; R: reduction multiplier, see "[x 0 0 0 0 0] = [x*R]" below */
+
    VERIFY_BITS(a[0], 56);
    VERIFY_BITS(a[1], 56);
    VERIFY_BITS(a[2], 56);
    VERIFY_BITS(a[3], 56);
    VERIFY_BITS(a[4], 52);

-    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
    /**  [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
     *  px is a shorthand for sum(a[i]*a[x-i], i=0..x).
     *  Note that [x 0 0 0 0 0] = [x*R].
     */

-    __int128 c, d;
-
-    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
-
-    d  = (__int128)(a0*2) * a3
-       + (__int128)(a1*2) * a2;
+    d  = (uint128_t)(a0*2) * a3
+       + (uint128_t)(a1*2) * a2;
    VERIFY_BITS(d, 114);
    /* [d 0 0 0] = [p3 0 0 0] */
-    c  = (__int128)a4 * a4;
+    c  = (uint128_t)a4 * a4;
    VERIFY_BITS(c, 112);
    /* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
    d += (c & M) * R; c >>= 52;
    VERIFY_BITS(d, 115);
    VERIFY_BITS(c, 60);
    /* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-    uint64_t t3 = d & M; d >>= 52;
+    t3 = d & M; d >>= 52;
    VERIFY_BITS(t3, 52);
    VERIFY_BITS(d, 63);
    /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */

    a4 *= 2;
-    d += (__int128)a0 * a4
-       + (__int128)(a1*2) * a3
-       + (__int128)a2 * a2;
+    d += (uint128_t)a0 * a4
+       + (uint128_t)(a1*2) * a3
+       + (uint128_t)a2 * a2;
    VERIFY_BITS(d, 115);
    /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
    d += c * R;
    VERIFY_BITS(d, 116);
    /* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    uint64_t t4 = d & M; d >>= 52;
+    t4 = d & M; d >>= 52;
    VERIFY_BITS(t4, 52);
    VERIFY_BITS(d, 64);
    /* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    uint64_t tx = (t4 >> 48); t4 &= (M >> 4);
+    tx = (t4 >> 48); t4 &= (M >> 4);
    VERIFY_BITS(tx, 4);
    VERIFY_BITS(t4, 48);
    /* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */

-    c  = (__int128)a0 * a0;
+    c  = (uint128_t)a0 * a0;
    VERIFY_BITS(c, 112);
    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
-    d += (__int128)a1 * a4
-       + (__int128)(a2*2) * a3;
+    d += (uint128_t)a1 * a4
+       + (uint128_t)(a2*2) * a3;
    VERIFY_BITS(d, 114);
    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    uint64_t u0 = d & M; d >>= 52;
+    u0 = d & M; d >>= 52;
    VERIFY_BITS(u0, 52);
    VERIFY_BITS(d, 62);
    /* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
@@ -218,7 +218,7 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
    u0 = (u0 << 4) | tx;
    VERIFY_BITS(u0, 56);
    /* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    c += (__int128)u0 * (R >> 4);
+    c += (uint128_t)u0 * (R >> 4);
    VERIFY_BITS(c, 113);
    /* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
    r[0] = c & M; c >>= 52;
@@ -227,11 +227,11 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */

    a0 *= 2;
-    c += (__int128)a0 * a1;
+    c += (uint128_t)a0 * a1;
    VERIFY_BITS(c, 114);
    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
-    d += (__int128)a2 * a4
-       + (__int128)a3 * a3;
+    d += (uint128_t)a2 * a4
+       + (uint128_t)a3 * a3;
    VERIFY_BITS(d, 114);
    /* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
    c += (d & M) * R; d >>= 52;
@@ -243,11 +243,11 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
    VERIFY_BITS(c, 63);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */

-    c += (__int128)a0 * a2
-       + (__int128)a1 * a1;
+    c += (uint128_t)a0 * a2
+       + (uint128_t)a1 * a1;
    VERIFY_BITS(c, 114);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
-    d += (__int128)a3 * a4;
+    d += (uint128_t)a3 * a4;
    VERIFY_BITS(d, 114);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
    c += (d & M) * R; d >>= 52;
--- a/src/field_impl.h
+++ b/src/field_impl.h
@@ -21,49 +21,6 @@
 #error "Please select field implementation"
 #endif

-static void secp256k1_fe_get_hex(char *r, int *rlen, const secp256k1_fe_t *a) {
-    if (*rlen < 65) {
-        *rlen = 65;
-        return;
-    }
-    *rlen = 65;
-    unsigned char tmp[32];
-    secp256k1_fe_t b = *a;
-    secp256k1_fe_normalize(&b);
-    secp256k1_fe_get_b32(tmp, &b);
-    for (int i=0; i<32; i++) {
-        static const char *c = "0123456789ABCDEF";
-        r[2*i]   = c[(tmp[i] >> 4) & 0xF];
-        r[2*i+1] = c[(tmp[i]) & 0xF];
-    }
-    r[64] = 0x00;
-}
-
-static int secp256k1_fe_set_hex(secp256k1_fe_t *r, const char *a, int alen) {
-    unsigned char tmp[32] = {};
-    static const int cvt[256] = {0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 1, 2, 3, 4, 5, 6,7,8,9,0,0,0,0,0,0,
-                                 0,10,11,12,13,14,15,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0,10,11,12,13,14,15,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0};
-    for (int i=0; i<32; i++) {
-        if (alen > i*2)
-            tmp[32 - alen/2 + i] = (cvt[(unsigned char)a[2*i]] << 4) + cvt[(unsigned char)a[2*i+1]];
-    }
-    return secp256k1_fe_set_b32(r, tmp);
-}
-
 SECP256K1_INLINE static int secp256k1_fe_equal_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b) {
    secp256k1_fe_t na;
    secp256k1_fe_negate(&na, a, 1);
@@ -72,62 +29,62 @@ SECP256K1_INLINE static int secp256k1_fe_equal_var(const secp256k1_fe_t *a, cons
 }

 static int secp256k1_fe_sqrt_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
+    secp256k1_fe_t x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
+    int j;

    /** The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in
     *  { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
     *  1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
     */

-    secp256k1_fe_t x2;
    secp256k1_fe_sqr(&x2, a);
    secp256k1_fe_mul(&x2, &x2, a);

-    secp256k1_fe_t x3;
    secp256k1_fe_sqr(&x3, &x2);
    secp256k1_fe_mul(&x3, &x3, a);

-    secp256k1_fe_t x6 = x3;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x6, &x6);
+    x6 = x3;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x6, &x6);
    secp256k1_fe_mul(&x6, &x6, &x3);

-    secp256k1_fe_t x9 = x6;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x9, &x9);
+    x9 = x6;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x9, &x9);
    secp256k1_fe_mul(&x9, &x9, &x3);

-    secp256k1_fe_t x11 = x9;
-    for (int j=0; j<2; j++) secp256k1_fe_sqr(&x11, &x11);
+    x11 = x9;
+    for (j=0; j<2; j++) secp256k1_fe_sqr(&x11, &x11);
    secp256k1_fe_mul(&x11, &x11, &x2);

-    secp256k1_fe_t x22 = x11;
-    for (int j=0; j<11; j++) secp256k1_fe_sqr(&x22, &x22);
+    x22 = x11;
+    for (j=0; j<11; j++) secp256k1_fe_sqr(&x22, &x22);
    secp256k1_fe_mul(&x22, &x22, &x11);

-    secp256k1_fe_t x44 = x22;
-    for (int j=0; j<22; j++) secp256k1_fe_sqr(&x44, &x44);
+    x44 = x22;
+    for (j=0; j<22; j++) secp256k1_fe_sqr(&x44, &x44);
    secp256k1_fe_mul(&x44, &x44, &x22);

-    secp256k1_fe_t x88 = x44;
-    for (int j=0; j<44; j++) secp256k1_fe_sqr(&x88, &x88);
+    x88 = x44;
+    for (j=0; j<44; j++) secp256k1_fe_sqr(&x88, &x88);
    secp256k1_fe_mul(&x88, &x88, &x44);

-    secp256k1_fe_t x176 = x88;
-    for (int j=0; j<88; j++) secp256k1_fe_sqr(&x176, &x176);
+    x176 = x88;
+    for (j=0; j<88; j++) secp256k1_fe_sqr(&x176, &x176);
    secp256k1_fe_mul(&x176, &x176, &x88);

-    secp256k1_fe_t x220 = x176;
-    for (int j=0; j<44; j++) secp256k1_fe_sqr(&x220, &x220);
+    x220 = x176;
+    for (j=0; j<44; j++) secp256k1_fe_sqr(&x220, &x220);
    secp256k1_fe_mul(&x220, &x220, &x44);

-    secp256k1_fe_t x223 = x220;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x223, &x223);
+    x223 = x220;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x223, &x223);
    secp256k1_fe_mul(&x223, &x223, &x3);

    /* The final result is then assembled using a sliding window over the blocks. */

-    secp256k1_fe_t t1 = x223;
-    for (int j=0; j<23; j++) secp256k1_fe_sqr(&t1, &t1);
+    t1 = x223;
+    for (j=0; j<23; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, &x22);
-    for (int j=0; j<6; j++) secp256k1_fe_sqr(&t1, &t1);
+    for (j=0; j<6; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, &x2);
    secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_sqr(r, &t1);
@@ -139,66 +96,66 @@ static int secp256k1_fe_sqrt_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 }

 static void secp256k1_fe_inv(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
+    secp256k1_fe_t x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
+    int j;

    /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
     *  { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
     *  [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
     */

-    secp256k1_fe_t x2;
    secp256k1_fe_sqr(&x2, a);
    secp256k1_fe_mul(&x2, &x2, a);

-    secp256k1_fe_t x3;
    secp256k1_fe_sqr(&x3, &x2);
    secp256k1_fe_mul(&x3, &x3, a);

-    secp256k1_fe_t x6 = x3;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x6, &x6);
+    x6 = x3;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x6, &x6);
    secp256k1_fe_mul(&x6, &x6, &x3);

-    secp256k1_fe_t x9 = x6;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x9, &x9);
+    x9 = x6;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x9, &x9);
    secp256k1_fe_mul(&x9, &x9, &x3);

-    secp256k1_fe_t x11 = x9;
-    for (int j=0; j<2; j++) secp256k1_fe_sqr(&x11, &x11);
+    x11 = x9;
+    for (j=0; j<2; j++) secp256k1_fe_sqr(&x11, &x11);
    secp256k1_fe_mul(&x11, &x11, &x2);

-    secp256k1_fe_t x22 = x11;
-    for (int j=0; j<11; j++) secp256k1_fe_sqr(&x22, &x22);
+    x22 = x11;
+    for (j=0; j<11; j++) secp256k1_fe_sqr(&x22, &x22);
    secp256k1_fe_mul(&x22, &x22, &x11);

-    secp256k1_fe_t x44 = x22;
-    for (int j=0; j<22; j++) secp256k1_fe_sqr(&x44, &x44);
+    x44 = x22;
+    for (j=0; j<22; j++) secp256k1_fe_sqr(&x44, &x44);
    secp256k1_fe_mul(&x44, &x44, &x22);

-    secp256k1_fe_t x88 = x44;
-    for (int j=0; j<44; j++) secp256k1_fe_sqr(&x88, &x88);
+    x88 = x44;
+    for (j=0; j<44; j++) secp256k1_fe_sqr(&x88, &x88);
    secp256k1_fe_mul(&x88, &x88, &x44);

-    secp256k1_fe_t x176 = x88;
-    for (int j=0; j<88; j++) secp256k1_fe_sqr(&x176, &x176);
+    x176 = x88;
+    for (j=0; j<88; j++) secp256k1_fe_sqr(&x176, &x176);
    secp256k1_fe_mul(&x176, &x176, &x88);

-    secp256k1_fe_t x220 = x176;
-    for (int j=0; j<44; j++) secp256k1_fe_sqr(&x220, &x220);
+    x220 = x176;
+    for (j=0; j<44; j++) secp256k1_fe_sqr(&x220, &x220);
    secp256k1_fe_mul(&x220, &x220, &x44);

-    secp256k1_fe_t x223 = x220;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x223, &x223);
+    x223 = x220;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x223, &x223);
    secp256k1_fe_mul(&x223, &x223, &x3);

    /* The final result is then assembled using a sliding window over the blocks. */

-    secp256k1_fe_t t1 = x223;
-    for (int j=0; j<23; j++) secp256k1_fe_sqr(&t1, &t1);
+    t1 = x223;
+    for (j=0; j<23; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, &x22);
-    for (int j=0; j<5; j++) secp256k1_fe_sqr(&t1, &t1);
+    for (j=0; j<5; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, a);
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&t1, &t1);
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, &x2);
-    for (int j=0; j<2; j++) secp256k1_fe_sqr(&t1, &t1);
+    for (j=0; j<2; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(r, a, &t1);
 }

@@ -206,13 +163,21 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #if defined(USE_FIELD_INV_BUILTIN)
    secp256k1_fe_inv(r, a);
 #elif defined(USE_FIELD_INV_NUM)
+    secp256k1_num_t n, m;
+    /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */
+    static const unsigned char prime[32] = {
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+        0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
+    };
    unsigned char b[32];
    secp256k1_fe_t c = *a;
    secp256k1_fe_normalize_var(&c);
    secp256k1_fe_get_b32(b, &c);
-    secp256k1_num_t n;
    secp256k1_num_set_bin(&n, b, 32);
-    secp256k1_num_mod_inverse(&n, &n, &secp256k1_fe_consts->p);
+    secp256k1_num_set_bin(&m, prime, 32);
+    secp256k1_num_mod_inverse(&n, &n, &m);
    secp256k1_num_get_bin(b, 32, &n);
    VERIFY_CHECK(secp256k1_fe_set_b32(r, b));
 #else
@@ -220,7 +185,9 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #endif
 }

-static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]) {
+static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t *r, const secp256k1_fe_t *a) {
+    secp256k1_fe_t u;
+    size_t i;
    if (len < 1)
        return;

@@ -228,12 +195,12 @@ static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const se

    r[0] = a[0];

-    size_t i = 0;
+    i = 0;
    while (++i < len) {
        secp256k1_fe_mul(&r[i], &r[i - 1], &a[i]);
    }

-    secp256k1_fe_t u; secp256k1_fe_inv_var(&u, &r[--i]);
+    secp256k1_fe_inv_var(&u, &r[--i]);

    while (i > 0) {
        int j = i--;
@@ -244,32 +211,4 @@ static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const se
    r[0] = u;
 }

-static void secp256k1_fe_start(void) {
-#ifndef USE_NUM_NONE
-    static const unsigned char secp256k1_fe_consts_p[] = {
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
-    };
-#endif
-    if (secp256k1_fe_consts == NULL) {
-        secp256k1_fe_inner_start();
-        secp256k1_fe_consts_t *ret = (secp256k1_fe_consts_t*)checked_malloc(sizeof(secp256k1_fe_consts_t));
-#ifndef USE_NUM_NONE
-        secp256k1_num_set_bin(&ret->p, secp256k1_fe_consts_p, sizeof(secp256k1_fe_consts_p));
-#endif
-        secp256k1_fe_consts = ret;
-    }
-}
-
-static void secp256k1_fe_stop(void) {
-    if (secp256k1_fe_consts != NULL) {
-        secp256k1_fe_consts_t *c = (secp256k1_fe_consts_t*)secp256k1_fe_consts;
-        free((void*)c);
-        secp256k1_fe_consts = NULL;
-        secp256k1_fe_inner_stop();
-    }
-}
-
 #endif
--- a/src/group.h
+++ b/src/group.h
@@ -17,6 +17,9 @@ typedef struct {
    int infinity; /* whether this represents the point at infinity */
 } secp256k1_ge_t;

+#define SECP256K1_GE_CONST(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) {SECP256K1_FE_CONST((a),(b),(c),(d),(e),(f),(g),(h)), SECP256K1_FE_CONST((i),(j),(k),(l),(m),(n),(o),(p)), 0}
+#define SECP256K1_GE_CONST_INFINITY {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), 1}
+
 /** A group element of the secp256k1 curve, in jacobian coordinates. */
 typedef struct {
    secp256k1_fe_t x; /* actual X: x/z^2 */
@@ -25,23 +28,15 @@ typedef struct {
    int infinity; /* whether this represents the point at infinity */
 } secp256k1_gej_t;

-/** Global constants related to the group */
+#define SECP256K1_GEJ_CONST(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) {SECP256K1_FE_CONST((a),(b),(c),(d),(e),(f),(g),(h)), SECP256K1_FE_CONST((i),(j),(k),(l),(m),(n),(o),(p)), SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1), 0}
+#define SECP256K1_GEJ_CONST_INFINITY {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), 1}
+
 typedef struct {
-    secp256k1_ge_t g; /* the generator point */
+    secp256k1_fe_storage_t x;
+    secp256k1_fe_storage_t y;
+} secp256k1_ge_storage_t;

-#ifdef USE_ENDOMORPHISM
-    /* constants related to secp256k1's efficiently computable endomorphism */
-    secp256k1_fe_t beta;
-#endif
-} secp256k1_ge_consts_t;
-
-static const secp256k1_ge_consts_t *secp256k1_ge_consts = NULL;
-
-/** Initialize the group module. */
-static void secp256k1_ge_start(void);
-
-/** De-initialize the group module. */
-static void secp256k1_ge_stop(void);
+#define SECP256K1_GE_STORAGE_CONST(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) {SECP256K1_FE_STORAGE_CONST((a),(b),(c),(d),(e),(f),(g),(h)), SECP256K1_FE_STORAGE_CONST((i),(j),(k),(l),(m),(n),(o),(p))}

 /** Set a group element equal to the point at infinity */
 static void secp256k1_ge_set_infinity(secp256k1_ge_t *r);
@@ -61,14 +56,11 @@ static int secp256k1_ge_is_valid_var(const secp256k1_ge_t *a);

 static void secp256k1_ge_neg(secp256k1_ge_t *r, const secp256k1_ge_t *a);

-/** Get a hex representation of a point. *rlen will be overwritten with the real length. */
-static void secp256k1_ge_get_hex(char *r, int *rlen, const secp256k1_ge_t *a);
-
 /** Set a group element equal to another which is given in jacobian coordinates */
 static void secp256k1_ge_set_gej(secp256k1_ge_t *r, secp256k1_gej_t *a);

 /** Set a batch of group elements equal to the inputs given in jacobian coordinates */
-static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len]);
+static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t *r, const secp256k1_gej_t *a);


 /** Set a group element (jacobian) equal to the point at infinity. */
@@ -103,9 +95,6 @@ static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, c
    guarantee, and b is allowed to be infinity. */
 static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_ge_t *b);

-/** Get a hex representation of a point. *rlen will be overwritten with the real length. */
-static void secp256k1_gej_get_hex(char *r, int *rlen, const secp256k1_gej_t *a);
-
 #ifdef USE_ENDOMORPHISM
 /** Set r to be equal to lambda times a, where lambda is chosen in a way such that this is very fast. */
 static void secp256k1_gej_mul_lambda(secp256k1_gej_t *r, const secp256k1_gej_t *a);
@@ -117,4 +106,13 @@ static void secp256k1_gej_clear(secp256k1_gej_t *r);
 /** Clear a secp256k1_ge_t to prevent leaking sensitive information. */
 static void secp256k1_ge_clear(secp256k1_ge_t *r);

+/** Convert a group element to the storage type. */
+static void secp256k1_ge_to_storage(secp256k1_ge_storage_t *r, const secp256k1_ge_t*);
+
+/** Convert a group element back from the storage type. */
+static void secp256k1_ge_from_storage(secp256k1_ge_t *r, const secp256k1_ge_storage_t*);
+
+/** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. */
+static void secp256k1_ge_storage_cmov(secp256k1_ge_storage_t *r, const secp256k1_ge_storage_t *a, int flag);
+
 #endif
--- a/src/group_impl.h
+++ b/src/group_impl.h
@@ -13,6 +13,16 @@
 #include "field.h"
 #include "group.h"

+/** Generator for secp256k1, value 'g' defined in
+ *  "Standards for Efficient Cryptography" (SEC2) 2.7.1.
+ */
+static const secp256k1_ge_t secp256k1_ge_const_g = SECP256K1_GE_CONST(
+    0x79BE667EUL, 0xF9DCBBACUL, 0x55A06295UL, 0xCE870B07UL,
+    0x029BFCDBUL, 0x2DCE28D9UL, 0x59F2815BUL, 0x16F81798UL,
+    0x483ADA77UL, 0x26A3C465UL, 0x5DA4FBFCUL, 0x0E1108A8UL,
+    0xFD17B448UL, 0xA6855419UL, 0x9C47D08FUL, 0xFB10D4B8UL
+);
+
 static void secp256k1_ge_set_infinity(secp256k1_ge_t *r) {
    r->infinity = 1;
 }
@@ -33,32 +43,12 @@ static void secp256k1_ge_neg(secp256k1_ge_t *r, const secp256k1_ge_t *a) {
    secp256k1_fe_negate(&r->y, &r->y, 1);
 }

-static void secp256k1_ge_get_hex(char *r, int *rlen, const secp256k1_ge_t *a) {
-    char cx[65]; int lx=65;
-    char cy[65]; int ly=65;
-    secp256k1_fe_get_hex(cx, &lx, &a->x);
-    secp256k1_fe_get_hex(cy, &ly, &a->y);
-    lx = strlen(cx);
-    ly = strlen(cy);
-    int len = lx + ly + 3 + 1;
-    if (*rlen < len) {
-        *rlen = len;
-        return;
-    }
-    *rlen = len;
-    r[0] = '(';
-    memcpy(r+1, cx, lx);
-    r[1+lx] = ',';
-    memcpy(r+2+lx, cy, ly);
-    r[2+lx+ly] = ')';
-    r[3+lx+ly] = 0;
-}
-
 static void secp256k1_ge_set_gej(secp256k1_ge_t *r, secp256k1_gej_t *a) {
+    secp256k1_fe_t z2, z3;
    r->infinity = a->infinity;
    secp256k1_fe_inv(&a->z, &a->z);
-    secp256k1_fe_t z2; secp256k1_fe_sqr(&z2, &a->z);
-    secp256k1_fe_t z3; secp256k1_fe_mul(&z3, &a->z, &z2);
+    secp256k1_fe_sqr(&z2, &a->z);
+    secp256k1_fe_mul(&z3, &a->z, &z2);
    secp256k1_fe_mul(&a->x, &a->x, &z2);
    secp256k1_fe_mul(&a->y, &a->y, &z3);
    secp256k1_fe_set_int(&a->z, 1);
@@ -67,13 +57,14 @@ static void secp256k1_ge_set_gej(secp256k1_ge_t *r, secp256k1_gej_t *a) {
 }

 static void secp256k1_ge_set_gej_var(secp256k1_ge_t *r, secp256k1_gej_t *a) {
+    secp256k1_fe_t z2, z3;
    r->infinity = a->infinity;
    if (a->infinity) {
        return;
    }
    secp256k1_fe_inv_var(&a->z, &a->z);
-    secp256k1_fe_t z2; secp256k1_fe_sqr(&z2, &a->z);
-    secp256k1_fe_t z3; secp256k1_fe_mul(&z3, &a->z, &z2);
+    secp256k1_fe_sqr(&z2, &a->z);
+    secp256k1_fe_mul(&z3, &a->z, &z2);
    secp256k1_fe_mul(&a->x, &a->x, &z2);
    secp256k1_fe_mul(&a->y, &a->y, &z3);
    secp256k1_fe_set_int(&a->z, 1);
@@ -81,26 +72,30 @@ static void secp256k1_ge_set_gej_var(secp256k1_ge_t *r, secp256k1_gej_t *a) {
    r->y = a->y;
 }

-static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len]) {
+static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t *r, const secp256k1_gej_t *a) {
+    secp256k1_fe_t *az;
+    secp256k1_fe_t *azi;
+    size_t i;
    size_t count = 0;
-    secp256k1_fe_t *az = checked_malloc(sizeof(secp256k1_fe_t) * len);
-    for (size_t i=0; i<len; i++) {
+    az = checked_malloc(sizeof(secp256k1_fe_t) * len);
+    for (i = 0; i < len; i++) {
        if (!a[i].infinity) {
            az[count++] = a[i].z;
        }
    }

-    secp256k1_fe_t *azi = checked_malloc(sizeof(secp256k1_fe_t) * count);
+    azi = checked_malloc(sizeof(secp256k1_fe_t) * count);
    secp256k1_fe_inv_all_var(count, azi, az);
    free(az);

    count = 0;
-    for (size_t i=0; i<len; i++) {
+    for (i = 0; i < len; i++) {
        r[i].infinity = a[i].infinity;
        if (!a[i].infinity) {
+            secp256k1_fe_t zi2, zi3;
            secp256k1_fe_t *zi = &azi[count++];
-            secp256k1_fe_t zi2; secp256k1_fe_sqr(&zi2, zi);
-            secp256k1_fe_t zi3; secp256k1_fe_mul(&zi3, &zi2, zi);
+            secp256k1_fe_sqr(&zi2, zi);
+            secp256k1_fe_mul(&zi3, &zi2, zi);
            secp256k1_fe_mul(&r[i].x, &a[i].x, &zi2);
            secp256k1_fe_mul(&r[i].y, &a[i].y, &zi3);
        }
@@ -136,11 +131,12 @@ static void secp256k1_ge_clear(secp256k1_ge_t *r) {
 }

 static int secp256k1_ge_set_xo_var(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd) {
+    secp256k1_fe_t x2, x3, c;
    r->x = *x;
-    secp256k1_fe_t x2; secp256k1_fe_sqr(&x2, x);
-    secp256k1_fe_t x3; secp256k1_fe_mul(&x3, x, &x2);
+    secp256k1_fe_sqr(&x2, x);
+    secp256k1_fe_mul(&x3, x, &x2);
    r->infinity = 0;
-    secp256k1_fe_t c; secp256k1_fe_set_int(&c, 7);
+    secp256k1_fe_set_int(&c, 7);
    secp256k1_fe_add(&c, &x3);
    if (!secp256k1_fe_sqrt_var(&r->y, &c))
        return 0;
@@ -158,9 +154,10 @@ static void secp256k1_gej_set_ge(secp256k1_gej_t *r, const secp256k1_ge_t *a) {
 }

 static int secp256k1_gej_eq_x_var(const secp256k1_fe_t *x, const secp256k1_gej_t *a) {
+    secp256k1_fe_t r, r2;
    VERIFY_CHECK(!a->infinity);
-    secp256k1_fe_t r; secp256k1_fe_sqr(&r, &a->z); secp256k1_fe_mul(&r, &r, x);
-    secp256k1_fe_t r2 = a->x; secp256k1_fe_normalize_weak(&r2);
+    secp256k1_fe_sqr(&r, &a->z); secp256k1_fe_mul(&r, &r, x);
+    r2 = a->x; secp256k1_fe_normalize_weak(&r2);
    return secp256k1_fe_equal_var(&r, &r2);
 }

@@ -178,6 +175,7 @@ static int secp256k1_gej_is_infinity(const secp256k1_gej_t *a) {
 }

 static int secp256k1_gej_is_valid_var(const secp256k1_gej_t *a) {
+    secp256k1_fe_t y2, x3, z2, z6;
    if (a->infinity)
        return 0;
    /** y^2 = x^3 + 7
@@ -185,10 +183,10 @@ static int secp256k1_gej_is_valid_var(const secp256k1_gej_t *a) {
     *  Y^2 / Z^6 = X^3 / Z^6 + 7
     *  Y^2 = X^3 + 7*Z^6
     */
-    secp256k1_fe_t y2; secp256k1_fe_sqr(&y2, &a->y);
-    secp256k1_fe_t x3; secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
-    secp256k1_fe_t z2; secp256k1_fe_sqr(&z2, &a->z);
-    secp256k1_fe_t z6; secp256k1_fe_sqr(&z6, &z2); secp256k1_fe_mul(&z6, &z6, &z2);
+    secp256k1_fe_sqr(&y2, &a->y);
+    secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
+    secp256k1_fe_sqr(&z2, &a->z);
+    secp256k1_fe_sqr(&z6, &z2); secp256k1_fe_mul(&z6, &z6, &z2);
    secp256k1_fe_mul_int(&z6, 7);
    secp256k1_fe_add(&x3, &z6);
    secp256k1_fe_normalize_weak(&x3);
@@ -196,27 +194,30 @@ static int secp256k1_gej_is_valid_var(const secp256k1_gej_t *a) {
 }

 static int secp256k1_ge_is_valid_var(const secp256k1_ge_t *a) {
+    secp256k1_fe_t y2, x3, c;
    if (a->infinity)
        return 0;
    /* y^2 = x^3 + 7 */
-    secp256k1_fe_t y2; secp256k1_fe_sqr(&y2, &a->y);
-    secp256k1_fe_t x3; secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
-    secp256k1_fe_t c; secp256k1_fe_set_int(&c, 7);
+    secp256k1_fe_sqr(&y2, &a->y);
+    secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
+    secp256k1_fe_set_int(&c, 7);
    secp256k1_fe_add(&x3, &c);
    secp256k1_fe_normalize_weak(&x3);
    return secp256k1_fe_equal_var(&y2, &x3);
 }

 static void secp256k1_gej_double_var(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
-    // For secp256k1, 2Q is infinity if and only if Q is infinity. This is because if 2Q = infinity,
-    // Q must equal -Q, or that Q.y == -(Q.y), or Q.y is 0. For a point on y^2 = x^3 + 7 to have
-    // y=0, x^3 must be -7 mod p. However, -7 has no cube root mod p.
+    /* Operations: 3 mul, 4 sqr, 0 normalize, 12 mul_int/add/negate */
+    secp256k1_fe_t t1,t2,t3,t4;
+    /** For secp256k1, 2Q is infinity if and only if Q is infinity. This is because if 2Q = infinity,
+     *  Q must equal -Q, or that Q.y == -(Q.y), or Q.y is 0. For a point on y^2 = x^3 + 7 to have
+     *  y=0, x^3 must be -7 mod p. However, -7 has no cube root mod p.
+     */
    r->infinity = a->infinity;
    if (r->infinity) {
        return;
    }

-    secp256k1_fe_t t1,t2,t3,t4;
    secp256k1_fe_mul(&r->z, &a->z, &a->y);
    secp256k1_fe_mul_int(&r->z, 2);       /* Z' = 2*Y*Z (2) */
    secp256k1_fe_sqr(&t1, &a->x);
@@ -240,6 +241,8 @@ static void secp256k1_gej_double_var(secp256k1_gej_t *r, const secp256k1_gej_t *
 }

 static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_gej_t *b) {
+    /* Operations: 12 mul, 4 sqr, 2 normalize, 12 mul_int/add/negate */
+    secp256k1_fe_t z22, z12, u1, u2, s1, s2, h, i, i2, h2, h3, t;
    if (a->infinity) {
        *r = *b;
        return;
@@ -249,14 +252,14 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
        return;
    }
    r->infinity = 0;
-    secp256k1_fe_t z22; secp256k1_fe_sqr(&z22, &b->z);
-    secp256k1_fe_t z12; secp256k1_fe_sqr(&z12, &a->z);
-    secp256k1_fe_t u1; secp256k1_fe_mul(&u1, &a->x, &z22);
-    secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
-    secp256k1_fe_t s1; secp256k1_fe_mul(&s1, &a->y, &z22); secp256k1_fe_mul(&s1, &s1, &b->z);
-    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
-    secp256k1_fe_t h; secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2);
-    secp256k1_fe_t i; secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2);
+    secp256k1_fe_sqr(&z22, &b->z);
+    secp256k1_fe_sqr(&z12, &a->z);
+    secp256k1_fe_mul(&u1, &a->x, &z22);
+    secp256k1_fe_mul(&u2, &b->x, &z12);
+    secp256k1_fe_mul(&s1, &a->y, &z22); secp256k1_fe_mul(&s1, &s1, &b->z);
+    secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
+    secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2);
+    secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2);
    if (secp256k1_fe_normalizes_to_zero_var(&h)) {
        if (secp256k1_fe_normalizes_to_zero_var(&i)) {
            secp256k1_gej_double_var(r, a);
@@ -265,11 +268,11 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
        }
        return;
    }
-    secp256k1_fe_t i2; secp256k1_fe_sqr(&i2, &i);
-    secp256k1_fe_t h2; secp256k1_fe_sqr(&h2, &h);
-    secp256k1_fe_t h3; secp256k1_fe_mul(&h3, &h, &h2);
+    secp256k1_fe_sqr(&i2, &i);
+    secp256k1_fe_sqr(&h2, &h);
+    secp256k1_fe_mul(&h3, &h, &h2);
    secp256k1_fe_mul(&r->z, &a->z, &b->z); secp256k1_fe_mul(&r->z, &r->z, &h);
-    secp256k1_fe_t t; secp256k1_fe_mul(&t, &u1, &h2);
+    secp256k1_fe_mul(&t, &u1, &h2);
    r->x = t; secp256k1_fe_mul_int(&r->x, 2); secp256k1_fe_add(&r->x, &h3); secp256k1_fe_negate(&r->x, &r->x, 3); secp256k1_fe_add(&r->x, &i2);
    secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul(&r->y, &r->y, &i);
    secp256k1_fe_mul(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1);
@@ -277,6 +280,8 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
 }

 static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_ge_t *b) {
+    /* Operations: 8 mul, 3 sqr, 4 normalize, 12 mul_int/add/negate */
+    secp256k1_fe_t z12, u1, u2, s1, s2, h, i, i2, h2, h3, t;
    if (a->infinity) {
        r->infinity = b->infinity;
        r->x = b->x;
@@ -289,13 +294,13 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
        return;
    }
    r->infinity = 0;
-    secp256k1_fe_t z12; secp256k1_fe_sqr(&z12, &a->z);
-    secp256k1_fe_t u1 = a->x; secp256k1_fe_normalize_weak(&u1);
-    secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
-    secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize_weak(&s1);
-    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
-    secp256k1_fe_t h; secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2);
-    secp256k1_fe_t i; secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2);
+    secp256k1_fe_sqr(&z12, &a->z);
+    u1 = a->x; secp256k1_fe_normalize_weak(&u1);
+    secp256k1_fe_mul(&u2, &b->x, &z12);
+    s1 = a->y; secp256k1_fe_normalize_weak(&s1);
+    secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
+    secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2);
+    secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2);
    if (secp256k1_fe_normalizes_to_zero_var(&h)) {
        if (secp256k1_fe_normalizes_to_zero_var(&i)) {
            secp256k1_gej_double_var(r, a);
@@ -304,11 +309,11 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
        }
        return;
    }
-    secp256k1_fe_t i2; secp256k1_fe_sqr(&i2, &i);
-    secp256k1_fe_t h2; secp256k1_fe_sqr(&h2, &h);
-    secp256k1_fe_t h3; secp256k1_fe_mul(&h3, &h, &h2);
+    secp256k1_fe_sqr(&i2, &i);
+    secp256k1_fe_sqr(&h2, &h);
+    secp256k1_fe_mul(&h3, &h, &h2);
    r->z = a->z; secp256k1_fe_mul(&r->z, &r->z, &h);
-    secp256k1_fe_t t; secp256k1_fe_mul(&t, &u1, &h2);
+    secp256k1_fe_mul(&t, &u1, &h2);
    r->x = t; secp256k1_fe_mul_int(&r->x, 2); secp256k1_fe_add(&r->x, &h3); secp256k1_fe_negate(&r->x, &r->x, 3); secp256k1_fe_add(&r->x, &i2);
    secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul(&r->y, &r->y, &i);
    secp256k1_fe_mul(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1);
@@ -316,6 +321,9 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
 }

 static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_ge_t *b) {
+    /* Operations: 7 mul, 5 sqr, 5 normalize, 19 mul_int/add/negate */
+    secp256k1_fe_t zz, u1, u2, s1, s2, z, t, m, n, q, rr;
+    int infinity;
    VERIFY_CHECK(!b->infinity);
    VERIFY_CHECK(a->infinity == 0 || a->infinity == 1);

@@ -341,24 +349,24 @@ static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, c
     *  (Note that the paper uses xi = Xi / Zi and yi = Yi / Zi instead.)
     */

-    secp256k1_fe_t zz; secp256k1_fe_sqr(&zz, &a->z);                /* z = Z1^2 */
-    secp256k1_fe_t u1 = a->x; secp256k1_fe_normalize_weak(&u1);     /* u1 = U1 = X1*Z2^2 (1) */
-    secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &zz);           /* u2 = U2 = X2*Z1^2 (1) */
-    secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize_weak(&s1);     /* s1 = S1 = Y1*Z2^3 (1) */
-    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &zz);           /* s2 = Y2*Z2^2 (1) */
-    secp256k1_fe_mul(&s2, &s2, &a->z);                              /* s2 = S2 = Y2*Z1^3 (1) */
-    secp256k1_fe_t z = a->z;                                        /* z = Z = Z1*Z2 (8) */
-    secp256k1_fe_t t = u1; secp256k1_fe_add(&t, &u2);               /* t = T = U1+U2 (2) */
-    secp256k1_fe_t m = s1; secp256k1_fe_add(&m, &s2);               /* m = M = S1+S2 (2) */
-    secp256k1_fe_t n; secp256k1_fe_sqr(&n, &m);                     /* n = M^2 (1) */
-    secp256k1_fe_t q; secp256k1_fe_mul(&q, &n, &t);                 /* q = Q = T*M^2 (1) */
-    secp256k1_fe_sqr(&n, &n);                                       /* n = M^4 (1) */
-    secp256k1_fe_t rr; secp256k1_fe_sqr(&rr, &t);                   /* rr = T^2 (1) */
+    secp256k1_fe_sqr(&zz, &a->z);                       /* zz = Z1^2 */
+    u1 = a->x; secp256k1_fe_normalize_weak(&u1);        /* u1 = U1 = X1*Z2^2 (1) */
+    secp256k1_fe_mul(&u2, &b->x, &zz);                  /* u2 = U2 = X2*Z1^2 (1) */
+    s1 = a->y; secp256k1_fe_normalize_weak(&s1);        /* s1 = S1 = Y1*Z2^3 (1) */
+    secp256k1_fe_mul(&s2, &b->y, &zz);                  /* s2 = Y2*Z1^2 (1) */
+    secp256k1_fe_mul(&s2, &s2, &a->z);                  /* s2 = S2 = Y2*Z1^3 (1) */
+    z = a->z;                                           /* z = Z = Z1*Z2 (8) */
+    t = u1; secp256k1_fe_add(&t, &u2);                  /* t = T = U1+U2 (2) */
+    m = s1; secp256k1_fe_add(&m, &s2);                  /* m = M = S1+S2 (2) */
+    secp256k1_fe_sqr(&n, &m);                           /* n = M^2 (1) */
+    secp256k1_fe_mul(&q, &n, &t);                       /* q = Q = T*M^2 (1) */
+    secp256k1_fe_sqr(&n, &n);                           /* n = M^4 (1) */
+    secp256k1_fe_sqr(&rr, &t);                          /* rr = T^2 (1) */
    secp256k1_fe_mul(&t, &u1, &u2); secp256k1_fe_negate(&t, &t, 1); /* t = -U1*U2 (2) */
    secp256k1_fe_add(&rr, &t);                                      /* rr = R = T^2-U1*U2 (3) */
    secp256k1_fe_sqr(&t, &rr);                                      /* t = R^2 (1) */
    secp256k1_fe_mul(&r->z, &m, &z);                                /* r->z = M*Z (1) */
-    int infinity = secp256k1_fe_normalizes_to_zero(&r->z) * (1 - a->infinity);
+    infinity = secp256k1_fe_normalizes_to_zero(&r->z) * (1 - a->infinity);
    secp256k1_fe_mul_int(&r->z, 2 * (1 - a->infinity)); /* r->z = Z3 = 2*M*Z (2) */
    r->x = t;                                           /* r->x = R^2 (1) */
    secp256k1_fe_negate(&q, &q, 1);                     /* q = -Q (2) */
@@ -386,63 +394,37 @@ static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, c
    r->infinity = infinity;
 }

+static void secp256k1_ge_to_storage(secp256k1_ge_storage_t *r, const secp256k1_ge_t *a) {
+    secp256k1_fe_t x, y;
+    VERIFY_CHECK(!a->infinity);
+    x = a->x;
+    secp256k1_fe_normalize(&x);
+    y = a->y;
+    secp256k1_fe_normalize(&y);
+    secp256k1_fe_to_storage(&r->x, &x);
+    secp256k1_fe_to_storage(&r->y, &y);
+}

+static void secp256k1_ge_from_storage(secp256k1_ge_t *r, const secp256k1_ge_storage_t *a) {
+    secp256k1_fe_from_storage(&r->x, &a->x);
+    secp256k1_fe_from_storage(&r->y, &a->y);
+    r->infinity = 0;
+}

-static void secp256k1_gej_get_hex(char *r, int *rlen, const secp256k1_gej_t *a) {
-    secp256k1_gej_t c = *a;
-    secp256k1_ge_t t; secp256k1_ge_set_gej(&t, &c);
-    secp256k1_ge_get_hex(r, rlen, &t);
+static SECP256K1_INLINE void secp256k1_ge_storage_cmov(secp256k1_ge_storage_t *r, const secp256k1_ge_storage_t *a, int flag) {
+    secp256k1_fe_storage_cmov(&r->x, &a->x, flag);
+    secp256k1_fe_storage_cmov(&r->y, &a->y, flag);
 }

 #ifdef USE_ENDOMORPHISM
 static void secp256k1_gej_mul_lambda(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
-    const secp256k1_fe_t *beta = &secp256k1_ge_consts->beta;
+    static const secp256k1_fe_t beta = SECP256K1_FE_CONST(
+        0x7ae96a2bul, 0x657c0710ul, 0x6e64479eul, 0xac3434e9ul,
+        0x9cf04975ul, 0x12f58995ul, 0xc1396c28ul, 0x719501eeul
+    );
    *r = *a;
-    secp256k1_fe_mul(&r->x, &r->x, beta);
+    secp256k1_fe_mul(&r->x, &r->x, &beta);
 }
 #endif

-static void secp256k1_ge_start(void) {
-    static const unsigned char secp256k1_ge_consts_g_x[] = {
-        0x79,0xBE,0x66,0x7E,0xF9,0xDC,0xBB,0xAC,
-        0x55,0xA0,0x62,0x95,0xCE,0x87,0x0B,0x07,
-        0x02,0x9B,0xFC,0xDB,0x2D,0xCE,0x28,0xD9,
-        0x59,0xF2,0x81,0x5B,0x16,0xF8,0x17,0x98
-    };
-    static const unsigned char secp256k1_ge_consts_g_y[] = {
-        0x48,0x3A,0xDA,0x77,0x26,0xA3,0xC4,0x65,
-        0x5D,0xA4,0xFB,0xFC,0x0E,0x11,0x08,0xA8,
-        0xFD,0x17,0xB4,0x48,0xA6,0x85,0x54,0x19,
-        0x9C,0x47,0xD0,0x8F,0xFB,0x10,0xD4,0xB8
-    };
-#ifdef USE_ENDOMORPHISM
-    /* properties of secp256k1's efficiently computable endomorphism */
-    static const unsigned char secp256k1_ge_consts_beta[] = {
-        0x7a,0xe9,0x6a,0x2b,0x65,0x7c,0x07,0x10,
-        0x6e,0x64,0x47,0x9e,0xac,0x34,0x34,0xe9,
-        0x9c,0xf0,0x49,0x75,0x12,0xf5,0x89,0x95,
-        0xc1,0x39,0x6c,0x28,0x71,0x95,0x01,0xee
-    };
-#endif
-    if (secp256k1_ge_consts == NULL) {
-        secp256k1_ge_consts_t *ret = (secp256k1_ge_consts_t*)checked_malloc(sizeof(secp256k1_ge_consts_t));
-#ifdef USE_ENDOMORPHISM
-        VERIFY_CHECK(secp256k1_fe_set_b32(&ret->beta, secp256k1_ge_consts_beta));
-#endif
-        secp256k1_fe_t g_x, g_y;
-        VERIFY_CHECK(secp256k1_fe_set_b32(&g_x, secp256k1_ge_consts_g_x));
-        VERIFY_CHECK(secp256k1_fe_set_b32(&g_y, secp256k1_ge_consts_g_y));
-        secp256k1_ge_set_xy(&ret->g, &g_x, &g_y);
-        secp256k1_ge_consts = ret;
-    }
-}
-
-static void secp256k1_ge_stop(void) {
-    if (secp256k1_ge_consts != NULL) {
-        secp256k1_ge_consts_t *c = (secp256k1_ge_consts_t*)secp256k1_ge_consts;
-        free((void*)c);
-        secp256k1_ge_consts = NULL;
-    }
-}
-
 #endif
--- a/src/hash.h
+++ b/src/hash.h
@@ -12,7 +12,7 @@

 typedef struct {
    uint32_t s[32];
-    unsigned char buf[64];
+    uint32_t buf[16]; /* In big endian */
    size_t bytes;
 } secp256k1_sha256_t;

@@ -34,7 +34,7 @@ typedef struct {
    int retry;
 } secp256k1_rfc6979_hmac_sha256_t;

-static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha256_t *rng, const unsigned char *key, size_t keylen, const unsigned char *msg, size_t msglen);
+static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha256_t *rng, const unsigned char *key, size_t keylen, const unsigned char *msg, size_t msglen, const unsigned char *rnd, size_t rndlen);
 static void secp256k1_rfc6979_hmac_sha256_generate(secp256k1_rfc6979_hmac_sha256_t *rng, unsigned char *out, size_t outlen);
 static void secp256k1_rfc6979_hmac_sha256_finalize(secp256k1_rfc6979_hmac_sha256_t *rng);

--- a/src/hash_impl.h
+++ b/src/hash_impl.h
@@ -11,6 +11,7 @@

 #include <stdlib.h>
 #include <stdint.h>
+#include <string.h>

 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 #define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
@@ -26,8 +27,11 @@
    (h) = t1 + t2; \
 } while(0)

-#define ReadBE32(p) (((uint32_t)((p)[0])) << 24 | ((uint32_t)((p)[1])) << 16 | ((uint32_t)((p)[2])) << 8 | ((uint32_t)((p)[3])))
-#define WriteBE32(p, v) do { (p)[0] = (v) >> 24; (p)[1] = (v) >> 16; (p)[2] = (v) >> 8; (p)[3] = (v); } while(0)
+#ifdef WORDS_BIGENDIAN
+#define BE32(x) (x)
+#else
+#define BE32(p) ((((p) & 0xFF) << 24) | (((p) & 0xFF00) << 8) | (((p) & 0xFF0000) >> 8) | (((p) & 0xFF000000) >> 24))
+#endif

 static void secp256k1_sha256_initialize(secp256k1_sha256_t *hash) {
    hash->s[0] = 0x6a09e667ul;
@@ -41,27 +45,27 @@ static void secp256k1_sha256_initialize(secp256k1_sha256_t *hash) {
    hash->bytes = 0;
 }

-/** Perform one SHA-256 transformation, processing a 64-byte chunk. */
-static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* chunk) {
+/** Perform one SHA-256 transformation, processing 16 big endian 32-bit words. */
+static void secp256k1_sha256_transform(uint32_t* s, const uint32_t* chunk) {
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
    uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;

-    Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0 = ReadBE32(chunk + 0));
-    Round(h, a, b, c, d, e, f, g, 0x71374491, w1 = ReadBE32(chunk + 4));
-    Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2 = ReadBE32(chunk + 8));
-    Round(f, g, h, a, b, c, d, e, 0xe9b5dba5, w3 = ReadBE32(chunk + 12));
-    Round(e, f, g, h, a, b, c, d, 0x3956c25b, w4 = ReadBE32(chunk + 16));
-    Round(d, e, f, g, h, a, b, c, 0x59f111f1, w5 = ReadBE32(chunk + 20));
-    Round(c, d, e, f, g, h, a, b, 0x923f82a4, w6 = ReadBE32(chunk + 24));
-    Round(b, c, d, e, f, g, h, a, 0xab1c5ed5, w7 = ReadBE32(chunk + 28));
-    Round(a, b, c, d, e, f, g, h, 0xd807aa98, w8 = ReadBE32(chunk + 32));
-    Round(h, a, b, c, d, e, f, g, 0x12835b01, w9 = ReadBE32(chunk + 36));
-    Round(g, h, a, b, c, d, e, f, 0x243185be, w10 = ReadBE32(chunk + 40));
-    Round(f, g, h, a, b, c, d, e, 0x550c7dc3, w11 = ReadBE32(chunk + 44));
-    Round(e, f, g, h, a, b, c, d, 0x72be5d74, w12 = ReadBE32(chunk + 48));
-    Round(d, e, f, g, h, a, b, c, 0x80deb1fe, w13 = ReadBE32(chunk + 52));
-    Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = ReadBE32(chunk + 56));
-    Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = ReadBE32(chunk + 60));
+    Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0 = BE32(chunk[0]));
+    Round(h, a, b, c, d, e, f, g, 0x71374491, w1 = BE32(chunk[1]));
+    Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2 = BE32(chunk[2]));
+    Round(f, g, h, a, b, c, d, e, 0xe9b5dba5, w3 = BE32(chunk[3]));
+    Round(e, f, g, h, a, b, c, d, 0x3956c25b, w4 = BE32(chunk[4]));
+    Round(d, e, f, g, h, a, b, c, 0x59f111f1, w5 = BE32(chunk[5]));
+    Round(c, d, e, f, g, h, a, b, 0x923f82a4, w6 = BE32(chunk[6]));
+    Round(b, c, d, e, f, g, h, a, 0xab1c5ed5, w7 = BE32(chunk[7]));
+    Round(a, b, c, d, e, f, g, h, 0xd807aa98, w8 = BE32(chunk[8]));
+    Round(h, a, b, c, d, e, f, g, 0x12835b01, w9 = BE32(chunk[9]));
+    Round(g, h, a, b, c, d, e, f, 0x243185be, w10 = BE32(chunk[10]));
+    Round(f, g, h, a, b, c, d, e, 0x550c7dc3, w11 = BE32(chunk[11]));
+    Round(e, f, g, h, a, b, c, d, 0x72be5d74, w12 = BE32(chunk[12]));
+    Round(d, e, f, g, h, a, b, c, 0x80deb1fe, w13 = BE32(chunk[13]));
+    Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = BE32(chunk[14]));
+    Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = BE32(chunk[15]));

    Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1));
    Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2));
@@ -125,55 +129,40 @@ static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* chunk)
 }

 static void secp256k1_sha256_write(secp256k1_sha256_t *hash, const unsigned char *data, size_t len) {
-    const unsigned char* end = data + len;
-    size_t bufsize = hash->bytes % 64;
-    if (bufsize && bufsize + len >= 64) {
-        // Fill the buffer, and process it.
-        memcpy(hash->buf + bufsize, data, 64 - bufsize);
-        hash->bytes += 64 - bufsize;
+    size_t bufsize = hash->bytes & 0x3F;
+    hash->bytes += len;
+    while (bufsize + len >= 64) {
+        /* Fill the buffer, and process it. */
+        memcpy(((unsigned char*)hash->buf) + bufsize, data, 64 - bufsize);
        data += 64 - bufsize;
+        len -= 64 - bufsize;
        secp256k1_sha256_transform(hash->s, hash->buf);
        bufsize = 0;
    }
-    while (end >= data + 64) {
-        // Process full chunks directly from the source.
-        secp256k1_sha256_transform(hash->s, data);
-        hash->bytes += 64;
-        data += 64;
-    }
-    if (end > data) {
-        // Fill the buffer with what remains.
-        memcpy(hash->buf + bufsize, data, end - data);
-        hash->bytes += end - data;
+    if (len) {
+        /* Fill the buffer with what remains. */
+        memcpy(((unsigned char*)hash->buf) + bufsize, data, len);
    }
 }

 static void secp256k1_sha256_finalize(secp256k1_sha256_t *hash, unsigned char *out32) {
    static const unsigned char pad[64] = {0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    unsigned char sizedesc[8];
-    WriteBE32(sizedesc, hash->bytes >> 29);
-    WriteBE32(sizedesc + 4, hash->bytes << 3);
+    uint32_t sizedesc[2];
+    uint32_t out[8];
+    int i = 0;
+    sizedesc[0] = BE32(hash->bytes >> 29);
+    sizedesc[1] = BE32(hash->bytes << 3);
    secp256k1_sha256_write(hash, pad, 1 + ((119 - (hash->bytes % 64)) % 64));
-    secp256k1_sha256_write(hash, sizedesc, 8);
-    WriteBE32(out32, hash->s[0]);
-    hash->s[0] = 0;
-    WriteBE32(out32 + 4, hash->s[1]);
-    hash->s[1] = 0;
-    WriteBE32(out32 + 8, hash->s[2]);
-    hash->s[2] = 0;
-    WriteBE32(out32 + 12, hash->s[3]);
-    hash->s[3] = 0;
-    WriteBE32(out32 + 16, hash->s[4]);
-    hash->s[4] = 0;
-    WriteBE32(out32 + 20, hash->s[5]);
-    hash->s[5] = 0;
-    WriteBE32(out32 + 24, hash->s[6]);
-    hash->s[6] = 0;
-    WriteBE32(out32 + 28, hash->s[7]);
-    hash->s[7] = 0;
+    secp256k1_sha256_write(hash, (const unsigned char*)sizedesc, 8);
+    for (i = 0; i < 8; i++) {
+        out[i] = BE32(hash->s[i]);
+        hash->s[i] = 0;
+    }
+    memcpy(out32, (const unsigned char*)out, 32);
 }

 static void secp256k1_hmac_sha256_initialize(secp256k1_hmac_sha256_t *hash, const unsigned char *key, size_t keylen) {
+    int n;
    unsigned char rkey[64];
    if (keylen <= 64) {
        memcpy(rkey, key, keylen);
@@ -187,12 +176,12 @@ static void secp256k1_hmac_sha256_initialize(secp256k1_hmac_sha256_t *hash, cons
    }

    secp256k1_sha256_initialize(&hash->outer);
-    for (int n = 0; n < 64; n++)
+    for (n = 0; n < 64; n++)
        rkey[n] ^= 0x5c;
    secp256k1_sha256_write(&hash->outer, rkey, 64);

    secp256k1_sha256_initialize(&hash->inner);
-    for (int n = 0; n < 64; n++)
+    for (n = 0; n < 64; n++)
        rkey[n] ^= 0x5c ^ 0x36;
    secp256k1_sha256_write(&hash->inner, rkey, 64);
    memset(rkey, 0, 64);
@@ -211,19 +200,22 @@ static void secp256k1_hmac_sha256_finalize(secp256k1_hmac_sha256_t *hash, unsign
 }


-static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha256_t *rng, const unsigned char *key, size_t keylen, const unsigned char *msg, size_t msglen) {
+static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha256_t *rng, const unsigned char *key, size_t keylen, const unsigned char *msg, size_t msglen, const unsigned char *rnd, size_t rndlen) {
+    secp256k1_hmac_sha256_t hmac;
    static const unsigned char zero[1] = {0x00};
    static const unsigned char one[1] = {0x01};

    memset(rng->v, 0x01, 32);
    memset(rng->k, 0x00, 32);

-    secp256k1_hmac_sha256_t hmac;
    secp256k1_hmac_sha256_initialize(&hmac, rng->k, 32);
    secp256k1_hmac_sha256_write(&hmac, rng->v, 32);
    secp256k1_hmac_sha256_write(&hmac, zero, 1);
    secp256k1_hmac_sha256_write(&hmac, key, keylen);
    secp256k1_hmac_sha256_write(&hmac, msg, msglen);
+    if (rnd && rndlen) {
+        secp256k1_hmac_sha256_write(&hmac, rnd, rndlen);
+    }
    secp256k1_hmac_sha256_finalize(&hmac, rng->k);
    secp256k1_hmac_sha256_initialize(&hmac, rng->k, 32);
    secp256k1_hmac_sha256_write(&hmac, rng->v, 32);
@@ -234,6 +226,9 @@ static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha2
    secp256k1_hmac_sha256_write(&hmac, one, 1);
    secp256k1_hmac_sha256_write(&hmac, key, keylen);
    secp256k1_hmac_sha256_write(&hmac, msg, msglen);
+    if (rnd && rndlen) {
+        secp256k1_hmac_sha256_write(&hmac, rnd, rndlen);
+    }
    secp256k1_hmac_sha256_finalize(&hmac, rng->k);
    secp256k1_hmac_sha256_initialize(&hmac, rng->k, 32);
    secp256k1_hmac_sha256_write(&hmac, rng->v, 32);
@@ -256,10 +251,10 @@ static void secp256k1_rfc6979_hmac_sha256_generate(secp256k1_rfc6979_hmac_sha256

    while (outlen > 0) {
        secp256k1_hmac_sha256_t hmac;
+        int now = outlen;
        secp256k1_hmac_sha256_initialize(&hmac, rng->k, 32);
        secp256k1_hmac_sha256_write(&hmac, rng->v, 32);
        secp256k1_hmac_sha256_finalize(&hmac, rng->v);
-        int now = outlen;
        if (now > 32) {
            now = 32;
        }
--- a/src/num_gmp_impl.h
+++ b/src/num_gmp_impl.h
@@ -29,10 +29,10 @@ static void secp256k1_num_copy(secp256k1_num_t *r, const secp256k1_num_t *a) {
 static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num_t *a) {
    unsigned char tmp[65];
    int len = 0;
+    int shift = 0;
    if (a->limbs>1 || a->data[0] != 0) {
        len = mpn_get_str(tmp, 256, (mp_limb_t*)a->data, a->limbs);
    }
-    int shift = 0;
    while (shift < len && tmp[shift] == 0) shift++;
    VERIFY_CHECK(len-shift <= (int)rlen);
    memset(r, 0, rlen - len + shift);
@@ -43,9 +43,10 @@ static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const sec
 }

 static void secp256k1_num_set_bin(secp256k1_num_t *r, const unsigned char *a, unsigned int alen) {
+    int len;
    VERIFY_CHECK(alen > 0);
    VERIFY_CHECK(alen <= 64);
-    int len = mpn_set_str(r->data, a, alen, 256);
+    len = mpn_set_str(r->data, a, alen, 256);
    if (len == 0) {
        r->data[0] = 0;
        len = 1;
@@ -91,6 +92,12 @@ static void secp256k1_num_mod(secp256k1_num_t *r, const secp256k1_num_t *m) {
 }

 static void secp256k1_num_mod_inverse(secp256k1_num_t *r, const secp256k1_num_t *a, const secp256k1_num_t *m) {
+    int i;
+    mp_limb_t g[NUM_LIMBS+1];
+    mp_limb_t u[NUM_LIMBS+1];
+    mp_limb_t v[NUM_LIMBS+1];
+    mp_size_t sn;
+    mp_size_t gn;
    secp256k1_num_sanity(a);
    secp256k1_num_sanity(m);

@@ -106,15 +113,12 @@ static void secp256k1_num_mod_inverse(secp256k1_num_t *r, const secp256k1_num_t
     */
    VERIFY_CHECK(m->limbs <= NUM_LIMBS);
    VERIFY_CHECK(m->data[m->limbs-1] != 0);
-    mp_limb_t g[NUM_LIMBS+1];
-    mp_limb_t u[NUM_LIMBS+1];
-    mp_limb_t v[NUM_LIMBS+1];
-    for (int i=0; i < m->limbs; i++) {
+    for (i = 0; i < m->limbs; i++) {
        u[i] = (i < a->limbs) ? a->data[i] : 0;
        v[i] = m->data[i];
    }
-    mp_size_t sn = NUM_LIMBS+1;
-    mp_size_t gn = mpn_gcdext(g, r->data, &sn, u, m->limbs, v, m->limbs);
+    sn = NUM_LIMBS+1;
+    gn = mpn_gcdext(g, r->data, &sn, u, m->limbs, v, m->limbs);
    VERIFY_CHECK(gn == 1);
    VERIFY_CHECK(g[0] == 1);
    r->neg = a->neg ^ m->neg;
@@ -183,10 +187,10 @@ static void secp256k1_num_sub(secp256k1_num_t *r, const secp256k1_num_t *a, cons
 }

 static void secp256k1_num_mul(secp256k1_num_t *r, const secp256k1_num_t *a, const secp256k1_num_t *b) {
+    mp_limb_t tmp[2*NUM_LIMBS+1];
    secp256k1_num_sanity(a);
    secp256k1_num_sanity(b);

-    mp_limb_t tmp[2*NUM_LIMBS+1];
    VERIFY_CHECK(a->limbs + b->limbs <= 2*NUM_LIMBS+1);
    if ((a->limbs==1 && a->data[0]==0) || (b->limbs==1 && b->data[0]==0)) {
        r->limbs = 1;
@@ -207,13 +211,14 @@ static void secp256k1_num_mul(secp256k1_num_t *r, const secp256k1_num_t *a, cons
 }

 static void secp256k1_num_shift(secp256k1_num_t *r, int bits) {
+    int i;
    if (bits % GMP_NUMB_BITS) {
-        // Shift within limbs.
+        /* Shift within limbs. */
        mpn_rshift(r->data, r->data, r->limbs, bits % GMP_NUMB_BITS);
    }
    if (bits >= GMP_NUMB_BITS) {
-        // Shift full limbs.
-        for (int i = 0; i < r->limbs; i++) {
+        /* Shift full limbs. */
+        for (i = 0; i < r->limbs; i++) {
            int index = i + (bits / GMP_NUMB_BITS);
            if (index < r->limbs && index < 2*NUM_LIMBS) {
                r->data[i] = r->data[index];
--- a/src/scalar.h
+++ b/src/scalar.h
@@ -21,9 +21,6 @@
 #error "Please select scalar implementation"
 #endif

-static void secp256k1_scalar_start(void);
-static void secp256k1_scalar_stop(void);
-
 /** Clear a scalar to prevent the leak of sensitive data. */
 static void secp256k1_scalar_clear(secp256k1_scalar_t *r);

@@ -83,9 +80,9 @@ static void secp256k1_scalar_order_get_num(secp256k1_num_t *r);
 /** Compare two scalars. */
 static int secp256k1_scalar_eq(const secp256k1_scalar_t *a, const secp256k1_scalar_t *b);

-static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a);
-
 #ifdef USE_ENDOMORPHISM
+/** Find r1 and r2 such that r1+r2*2^128 = a. */
+static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a);
 /** Find r1 and r2 such that r1+r2*lambda = a, and r1 and r2 are maximum 128 bits long (see secp256k1_gej_mul_lambda). */
 static void secp256k1_scalar_split_lambda_var(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a);
 #endif
--- a/src/scalar_4x64.h
+++ b/src/scalar_4x64.h
@@ -14,4 +14,6 @@ typedef struct {
    uint64_t d[4];
 } secp256k1_scalar_t;

+#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{((uint64_t)(d1)) << 32 | (d0), ((uint64_t)(d3)) << 32 | (d2), ((uint64_t)(d5)) << 32 | (d4), ((uint64_t)(d7)) << 32 | (d6)}}
+
 #endif
--- a/src/scalar_4x64_impl.h
+++ b/src/scalar_4x64_impl.h
@@ -7,8 +7,6 @@
 #ifndef _SECP256K1_SCALAR_REPR_IMPL_H_
 #define _SECP256K1_SCALAR_REPR_IMPL_H_

-typedef unsigned __int128 uint128_t;
-
 /* Limbs of the secp256k1 order. */
 #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL)
 #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL)
@@ -69,8 +67,9 @@ SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scal
 }

 SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, unsigned int overflow) {
+    uint128_t t;
    VERIFY_CHECK(overflow <= 1);
-    uint128_t t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0;
+    t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0;
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[1] + overflow * SECP256K1_N_C_1;
    r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
@@ -82,6 +81,7 @@ SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, unsig
 }

 static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
+    int overflow;
    uint128_t t = (uint128_t)a->d[0] + b->d[0];
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)a->d[1] + b->d[1];
@@ -90,15 +90,16 @@ static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t
    r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)a->d[3] + b->d[3];
    r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
-    int overflow = t + secp256k1_scalar_check_overflow(r);
+    overflow = t + secp256k1_scalar_check_overflow(r);
    VERIFY_CHECK(overflow == 0 || overflow == 1);
    secp256k1_scalar_reduce(r, overflow);
    return overflow;
 }

 static void secp256k1_scalar_add_bit(secp256k1_scalar_t *r, unsigned int bit) {
+    uint128_t t;
    VERIFY_CHECK(bit < 256);
-    uint128_t t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
+    t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[1] + (((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F));
    r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
@@ -113,11 +114,12 @@ static void secp256k1_scalar_add_bit(secp256k1_scalar_t *r, unsigned int bit) {
 }

 static void secp256k1_scalar_set_b32(secp256k1_scalar_t *r, const unsigned char *b32, int *overflow) {
+    int over;
    r->d[0] = (uint64_t)b32[31] | (uint64_t)b32[30] << 8 | (uint64_t)b32[29] << 16 | (uint64_t)b32[28] << 24 | (uint64_t)b32[27] << 32 | (uint64_t)b32[26] << 40 | (uint64_t)b32[25] << 48 | (uint64_t)b32[24] << 56;
    r->d[1] = (uint64_t)b32[23] | (uint64_t)b32[22] << 8 | (uint64_t)b32[21] << 16 | (uint64_t)b32[20] << 24 | (uint64_t)b32[19] << 32 | (uint64_t)b32[18] << 40 | (uint64_t)b32[17] << 48 | (uint64_t)b32[16] << 56;
    r->d[2] = (uint64_t)b32[15] | (uint64_t)b32[14] << 8 | (uint64_t)b32[13] << 16 | (uint64_t)b32[12] << 24 | (uint64_t)b32[11] << 32 | (uint64_t)b32[10] << 40 | (uint64_t)b32[9] << 48 | (uint64_t)b32[8] << 56;
    r->d[3] = (uint64_t)b32[7] | (uint64_t)b32[6] << 8 | (uint64_t)b32[5] << 16 | (uint64_t)b32[4] << 24 | (uint64_t)b32[3] << 32 | (uint64_t)b32[2] << 40 | (uint64_t)b32[1] << 48 | (uint64_t)b32[0] << 56;
-    int over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
+    over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
    if (overflow) {
        *overflow = over;
    }
@@ -195,16 +197,16 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {

 /** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
 #define muladd2(a,b) { \
-    uint64_t tl, th; \
+    uint64_t tl, th, th2, tl2; \
    { \
        uint128_t t = (uint128_t)a * b; \
        th = t >> 64;               /* at most 0xFFFFFFFFFFFFFFFE */ \
        tl = t; \
    } \
-    uint64_t th2 = th + th;         /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \
+    th2 = th + th;                  /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \
    c2 += (th2 < th) ? 1 : 0;       /* never overflows by contract (verified the next line) */ \
    VERIFY_CHECK((th2 >= th) || (c2 != 0)); \
-    uint64_t tl2 = tl + tl;         /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \
+    tl2 = tl + tl;                  /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \
    th2 += (tl2 < tl) ? 1 : 0;      /* at most 0xFFFFFFFFFFFFFFFF */ \
    c0 += tl2;                      /* overflow is handled on the next line */ \
    th2 += (c0 < tl2) ? 1 : 0;      /* second overflow is handled on the next line */ \
@@ -217,8 +219,9 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {

 /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
 #define sumadd(a) { \
+    unsigned int over; \
    c0 += (a);                  /* overflow is handled on the next line */ \
-    unsigned int over = (c0 < (a)) ? 1 : 0; \
+    over = (c0 < (a)) ? 1 : 0; \
    c1 += over;                 /* overflow is handled on the next line */ \
    c2 += (c1 < over) ? 1 : 0;  /* never overflows by contract */ \
 }
@@ -248,63 +251,301 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {
 }

 static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint64_t *l) {
-    uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7];
+#ifdef USE_ASM_X86_64
+    /* Reduce 512 bits into 385. */
+    uint64_t m0, m1, m2, m3, m4, m5, m6;
+    uint64_t p0, p1, p2, p3, p4;
+    uint64_t c;

-    /* 160 bit accumulator. */
-    uint64_t c0, c1;
-    uint32_t c2;
+    __asm__ __volatile__(
+    /* Preload. */
+    "movq 32(%%rsi), %%r11\n"
+    "movq 40(%%rsi), %%r12\n"
+    "movq 48(%%rsi), %%r13\n"
+    "movq 56(%%rsi), %%r14\n"
+    /* Initialize r8,r9,r10 */
+    "movq 0(%%rsi), %%r8\n"
+    "movq $0, %%r9\n"
+    "movq $0, %%r10\n"
+    /* (r8,r9) += n0 * c0 */
+    "movq %8, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    /* extract m0 */
+    "movq %%r8, %q0\n"
+    "movq $0, %%r8\n"
+    /* (r9,r10) += l1 */
+    "addq 8(%%rsi), %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r9,r10,r8) += n1 * c0 */
+    "movq %8, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += n0 * c1 */
+    "movq %9, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* extract m1 */
+    "movq %%r9, %q1\n"
+    "movq $0, %%r9\n"
+    /* (r10,r8,r9) += l2 */
+    "addq 16(%%rsi), %%r10\n"
+    "adcq $0, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += n2 * c0 */
+    "movq %8, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += n1 * c1 */
+    "movq %9, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += n0 */
+    "addq %%r11, %%r10\n"
+    "adcq $0, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* extract m2 */
+    "movq %%r10, %q2\n"
+    "movq $0, %%r10\n"
+    /* (r8,r9,r10) += l3 */
+    "addq 24(%%rsi), %%r8\n"
+    "adcq $0, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += n3 * c0 */
+    "movq %8, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += n2 * c1 */
+    "movq %9, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += n1 */
+    "addq %%r12, %%r8\n"
+    "adcq $0, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* extract m3 */
+    "movq %%r8, %q3\n"
+    "movq $0, %%r8\n"
+    /* (r9,r10,r8) += n3 * c1 */
+    "movq %9, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += n2 */
+    "addq %%r13, %%r9\n"
+    "adcq $0, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* extract m4 */
+    "movq %%r9, %q4\n"
+    /* (r10,r8) += n3 */
+    "addq %%r14, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* extract m5 */
+    "movq %%r10, %q5\n"
+    /* extract m6 */
+    "movq %%r8, %q6\n"
+    : "=g"(m0), "=g"(m1), "=g"(m2), "=g"(m3), "=g"(m4), "=g"(m5), "=g"(m6)
+    : "S"(l), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1)
+    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc");
+
+    /* Reduce 385 bits into 258. */
+    __asm__ __volatile__(
+    /* Preload */
+    "movq %q9, %%r11\n"
+    "movq %q10, %%r12\n"
+    "movq %q11, %%r13\n"
+    /* Initialize (r8,r9,r10) */
+    "movq %q5, %%r8\n"
+    "movq $0, %%r9\n"
+    "movq $0, %%r10\n"
+    /* (r8,r9) += m4 * c0 */
+    "movq %12, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    /* extract p0 */
+    "movq %%r8, %q0\n"
+    "movq $0, %%r8\n"
+    /* (r9,r10) += m1 */
+    "addq %q6, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r9,r10,r8) += m5 * c0 */
+    "movq %12, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += m4 * c1 */
+    "movq %13, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* extract p1 */
+    "movq %%r9, %q1\n"
+    "movq $0, %%r9\n"
+    /* (r10,r8,r9) += m2 */
+    "addq %q7, %%r10\n"
+    "adcq $0, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += m6 * c0 */
+    "movq %12, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += m5 * c1 */
+    "movq %13, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += m4 */
+    "addq %%r11, %%r10\n"
+    "adcq $0, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* extract p2 */
+    "movq %%r10, %q2\n"
+    /* (r8,r9) += m3 */
+    "addq %q8, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r8,r9) += m6 * c1 */
+    "movq %13, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    /* (r8,r9) += m5 */
+    "addq %%r12, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* extract p3 */
+    "movq %%r8, %q3\n"
+    /* (r9) += m6 */
+    "addq %%r13, %%r9\n"
+    /* extract p4 */
+    "movq %%r9, %q4\n"
+    : "=&g"(p0), "=&g"(p1), "=&g"(p2), "=g"(p3), "=g"(p4)
+    : "g"(m0), "g"(m1), "g"(m2), "g"(m3), "g"(m4), "g"(m5), "g"(m6), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1)
+    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "cc");
+
+    /* Reduce 258 bits into 256. */
+    __asm__ __volatile__(
+    /* Preload */
+    "movq %q5, %%r10\n"
+    /* (rax,rdx) = p4 * c0 */
+    "movq %7, %%rax\n"
+    "mulq %%r10\n"
+    /* (rax,rdx) += p0 */
+    "addq %q1, %%rax\n"
+    "adcq $0, %%rdx\n"
+    /* extract r0 */
+    "movq %%rax, 0(%q6)\n"
+    /* Move to (r8,r9) */
+    "movq %%rdx, %%r8\n"
+    "movq $0, %%r9\n"
+    /* (r8,r9) += p1 */
+    "addq %q2, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r8,r9) += p4 * c1 */
+    "movq %8, %%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    /* Extract r1 */
+    "movq %%r8, 8(%q6)\n"
+    "movq $0, %%r8\n"
+    /* (r9,r8) += p4 */
+    "addq %%r10, %%r9\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r8) += p2 */
+    "addq %q3, %%r9\n"
+    "adcq $0, %%r8\n"
+    /* Extract r2 */
+    "movq %%r9, 16(%q6)\n"
+    "movq $0, %%r9\n"
+    /* (r8,r9) += p3 */
+    "addq %q4, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* Extract r3 */
+    "movq %%r8, 24(%q6)\n"
+    /* Extract c */
+    "movq %%r9, %q0\n"
+    : "=g"(c)
+    : "g"(p0), "g"(p1), "g"(p2), "g"(p3), "g"(p4), "D"(r), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1)
+    : "rax", "rdx", "r8", "r9", "r10", "cc", "memory");
+#else
+    uint128_t c;
+    uint64_t c0, c1, c2;
+    uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7];
+    uint64_t m0, m1, m2, m3, m4, m5;
+    uint32_t m6;
+    uint64_t p0, p1, p2, p3;
+    uint32_t p4;

    /* Reduce 512 bits into 385. */
    /* m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C. */
    c0 = l[0]; c1 = 0; c2 = 0;
    muladd_fast(n0, SECP256K1_N_C_0);
-    uint64_t m0; extract_fast(m0);
+    extract_fast(m0);
    sumadd_fast(l[1]);
    muladd(n1, SECP256K1_N_C_0);
    muladd(n0, SECP256K1_N_C_1);
-    uint64_t m1; extract(m1);
+    extract(m1);
    sumadd(l[2]);
    muladd(n2, SECP256K1_N_C_0);
    muladd(n1, SECP256K1_N_C_1);
    sumadd(n0);
-    uint64_t m2; extract(m2);
+    extract(m2);
    sumadd(l[3]);
    muladd(n3, SECP256K1_N_C_0);
    muladd(n2, SECP256K1_N_C_1);
    sumadd(n1);
-    uint64_t m3; extract(m3);
+    extract(m3);
    muladd(n3, SECP256K1_N_C_1);
    sumadd(n2);
-    uint64_t m4; extract(m4);
+    extract(m4);
    sumadd_fast(n3);
-    uint64_t m5; extract_fast(m5);
+    extract_fast(m5);
    VERIFY_CHECK(c0 <= 1);
-    uint32_t m6 = c0;
+    m6 = c0;

    /* Reduce 385 bits into 258. */
    /* p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C. */
    c0 = m0; c1 = 0; c2 = 0;
    muladd_fast(m4, SECP256K1_N_C_0);
-    uint64_t p0; extract_fast(p0);
+    extract_fast(p0);
    sumadd_fast(m1);
    muladd(m5, SECP256K1_N_C_0);
    muladd(m4, SECP256K1_N_C_1);
-    uint64_t p1; extract(p1);
+    extract(p1);
    sumadd(m2);
    muladd(m6, SECP256K1_N_C_0);
    muladd(m5, SECP256K1_N_C_1);
    sumadd(m4);
-    uint64_t p2; extract(p2);
+    extract(p2);
    sumadd_fast(m3);
    muladd_fast(m6, SECP256K1_N_C_1);
    sumadd_fast(m5);
-    uint64_t p3; extract_fast(p3);
-    uint32_t p4 = c0 + m6;
+    extract_fast(p3);
+    p4 = c0 + m6;
    VERIFY_CHECK(p4 <= 2);

    /* Reduce 258 bits into 256. */
    /* r[0..3] = p[0..3] + p[4] * SECP256K1_N_C. */
-    uint128_t c = p0 + (uint128_t)SECP256K1_N_C_0 * p4;
+    c = p0 + (uint128_t)SECP256K1_N_C_0 * p4;
    r->d[0] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
    c += p1 + (uint128_t)SECP256K1_N_C_1 * p4;
    r->d[1] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
@@ -312,12 +553,146 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint64_t *l
    r->d[2] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
    c += p3;
    r->d[3] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
+#endif

    /* Final reduction of r. */
    secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r));
 }

 static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
+#ifdef USE_ASM_X86_64
+    const uint64_t *pb = b->d;
+    __asm__ __volatile__(
+    /* Preload */
+    "movq 0(%%rdi), %%r15\n"
+    "movq 8(%%rdi), %%rbx\n"
+    "movq 16(%%rdi), %%rcx\n"
+    "movq 0(%%rdx), %%r11\n"
+    "movq 8(%%rdx), %%r12\n"
+    "movq 16(%%rdx), %%r13\n"
+    "movq 24(%%rdx), %%r14\n"
+    /* (rax,rdx) = a0 * b0 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r11\n"
+    /* Extract l0 */
+    "movq %%rax, 0(%%rsi)\n"
+    /* (r8,r9,r10) = (rdx,0,0) */
+    "movq %%rdx, %%r8\n"
+    "xorq %%r9, %%r9\n"
+    "xorq %%r10, %%r10\n"
+    /* (r8,r9,r10) += a0 * b1 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += a1 * b0 */
+    "movq %%rbx, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* Extract l1 */
+    "movq %%r8, 8(%%rsi)\n"
+    "xorq %%r8, %%r8\n"
+    /* (r9,r10,r8) += a0 * b2 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += a1 * b1 */
+    "movq %%rbx, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += a2 * b0 */
+    "movq %%rcx, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* Extract l2 */
+    "movq %%r9, 16(%%rsi)\n"
+    "xorq %%r9, %%r9\n"
+    /* (r10,r8,r9) += a0 * b3 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* Preload a3 */
+    "movq 24(%%rdi), %%r15\n"
+    /* (r10,r8,r9) += a1 * b2 */
+    "movq %%rbx, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += a2 * b1 */
+    "movq %%rcx, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += a3 * b0 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* Extract l3 */
+    "movq %%r10, 24(%%rsi)\n"
+    "xorq %%r10, %%r10\n"
+    /* (r8,r9,r10) += a1 * b3 */
+    "movq %%rbx, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += a2 * b2 */
+    "movq %%rcx, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += a3 * b1 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* Extract l4 */
+    "movq %%r8, 32(%%rsi)\n"
+    "xorq %%r8, %%r8\n"
+    /* (r9,r10,r8) += a2 * b3 */
+    "movq %%rcx, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += a3 * b2 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* Extract l5 */
+    "movq %%r9, 40(%%rsi)\n"
+    /* (r10,r8) += a3 * b3 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    /* Extract l6 */
+    "movq %%r10, 48(%%rsi)\n"
+    /* Extract l7 */
+    "movq %%r8, 56(%%rsi)\n"
+    : "+d"(pb)
+    : "S"(l), "D"(a->d)
+    : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
+#else
    /* 160 bit accumulator. */
    uint64_t c0 = 0, c1 = 0;
    uint32_t c2 = 0;
@@ -348,9 +723,119 @@ static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar_t *a,
    extract_fast(l[6]);
    VERIFY_CHECK(c1 <= 0);
    l[7] = c0;
+#endif
 }

 static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar_t *a) {
+#ifdef USE_ASM_X86_64
+    __asm__ __volatile__(
+    /* Preload */
+    "movq 0(%%rdi), %%r11\n"
+    "movq 8(%%rdi), %%r12\n"
+    "movq 16(%%rdi), %%r13\n"
+    "movq 24(%%rdi), %%r14\n"
+    /* (rax,rdx) = a0 * a0 */
+    "movq %%r11, %%rax\n"
+    "mulq %%r11\n"
+    /* Extract l0 */
+    "movq %%rax, 0(%%rsi)\n"
+    /* (r8,r9,r10) = (rdx,0,0) */
+    "movq %%rdx, %%r8\n"
+    "xorq %%r9, %%r9\n"
+    "xorq %%r10, %%r10\n"
+    /* (r8,r9,r10) += 2 * a0 * a1 */
+    "movq %%r11, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* Extract l1 */
+    "movq %%r8, 8(%%rsi)\n"
+    "xorq %%r8, %%r8\n"
+    /* (r9,r10,r8) += 2 * a0 * a2 */
+    "movq %%r11, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += a1 * a1 */
+    "movq %%r12, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* Extract l2 */
+    "movq %%r9, 16(%%rsi)\n"
+    "xorq %%r9, %%r9\n"
+    /* (r10,r8,r9) += 2 * a0 * a3 */
+    "movq %%r11, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += 2 * a1 * a2 */
+    "movq %%r12, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* Extract l3 */
+    "movq %%r10, 24(%%rsi)\n"
+    "xorq %%r10, %%r10\n"
+    /* (r8,r9,r10) += 2 * a1 * a3 */
+    "movq %%r12, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += a2 * a2 */
+    "movq %%r13, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* Extract l4 */
+    "movq %%r8, 32(%%rsi)\n"
+    "xorq %%r8, %%r8\n"
+    /* (r9,r10,r8) += 2 * a2 * a3 */
+    "movq %%r13, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* Extract l5 */
+    "movq %%r9, 40(%%rsi)\n"
+    /* (r10,r8) += a3 * a3 */
+    "movq %%r14, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    /* Extract l6 */
+    "movq %%r10, 48(%%rsi)\n"
+    /* Extract l7 */
+    "movq %%r8, 56(%%rsi)\n"
+    :
+    : "S"(l), "D"(a->d)
+    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory");
+#else
    /* 160 bit accumulator. */
    uint64_t c0 = 0, c1 = 0;
    uint32_t c2 = 0;
@@ -375,6 +860,7 @@ static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar_t *a)
    extract_fast(l[6]);
    VERIFY_CHECK(c1 == 0);
    l[7] = c0;
+#endif
 }

 #undef sumadd
@@ -413,12 +899,15 @@ SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar_t *a, con
 }

 SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b, unsigned int shift) {
-    VERIFY_CHECK(shift >= 256);
    uint64_t l[8];
+    unsigned int shiftlimbs;
+    unsigned int shiftlow;
+    unsigned int shifthigh;
+    VERIFY_CHECK(shift >= 256);
    secp256k1_scalar_mul_512(l, a, b);
-    unsigned int shiftlimbs = shift >> 6;
-    unsigned int shiftlow = shift & 0x3F;
-    unsigned int shifthigh = 64 - shiftlow;
+    shiftlimbs = shift >> 6;
+    shiftlow = shift & 0x3F;
+    shifthigh = 64 - shiftlow;
    r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0;
--- a/src/scalar_8x32.h
+++ b/src/scalar_8x32.h
@@ -14,4 +14,6 @@ typedef struct {
    uint32_t d[8];
 } secp256k1_scalar_t;

+#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{(d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7)}}
+
 #endif
--- a/src/scalar_8x32_impl.h
+++ b/src/scalar_8x32_impl.h
@@ -91,8 +91,9 @@ SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scal
 }

 SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, uint32_t overflow) {
+    uint64_t t;
    VERIFY_CHECK(overflow <= 1);
-    uint64_t t = (uint64_t)r->d[0] + overflow * SECP256K1_N_C_0;
+    t = (uint64_t)r->d[0] + overflow * SECP256K1_N_C_0;
    r->d[0] = t & 0xFFFFFFFFUL; t >>= 32;
    t += (uint64_t)r->d[1] + overflow * SECP256K1_N_C_1;
    r->d[1] = t & 0xFFFFFFFFUL; t >>= 32;
@@ -112,6 +113,7 @@ SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, uint3
 }

 static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
+    int overflow;
    uint64_t t = (uint64_t)a->d[0] + b->d[0];
    r->d[0] = t & 0xFFFFFFFFULL; t >>= 32;
    t += (uint64_t)a->d[1] + b->d[1];
@@ -128,15 +130,16 @@ static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t
    r->d[6] = t & 0xFFFFFFFFULL; t >>= 32;
    t += (uint64_t)a->d[7] + b->d[7];
    r->d[7] = t & 0xFFFFFFFFULL; t >>= 32;
-    int overflow = t + secp256k1_scalar_check_overflow(r);
+    overflow = t + secp256k1_scalar_check_overflow(r);
    VERIFY_CHECK(overflow == 0 || overflow == 1);
    secp256k1_scalar_reduce(r, overflow);
    return overflow;
 }

 static void secp256k1_scalar_add_bit(secp256k1_scalar_t *r, unsigned int bit) {
+    uint64_t t;
    VERIFY_CHECK(bit < 256);
-    uint64_t t = (uint64_t)r->d[0] + (((uint32_t)((bit >> 5) == 0)) << (bit & 0x1F));
+    t = (uint64_t)r->d[0] + (((uint32_t)((bit >> 5) == 0)) << (bit & 0x1F));
    r->d[0] = t & 0xFFFFFFFFULL; t >>= 32;
    t += (uint64_t)r->d[1] + (((uint32_t)((bit >> 5) == 1)) << (bit & 0x1F));
    r->d[1] = t & 0xFFFFFFFFULL; t >>= 32;
@@ -159,6 +162,7 @@ static void secp256k1_scalar_add_bit(secp256k1_scalar_t *r, unsigned int bit) {
 }

 static void secp256k1_scalar_set_b32(secp256k1_scalar_t *r, const unsigned char *b32, int *overflow) {
+    int over;
    r->d[0] = (uint32_t)b32[31] | (uint32_t)b32[30] << 8 | (uint32_t)b32[29] << 16 | (uint32_t)b32[28] << 24;
    r->d[1] = (uint32_t)b32[27] | (uint32_t)b32[26] << 8 | (uint32_t)b32[25] << 16 | (uint32_t)b32[24] << 24;
    r->d[2] = (uint32_t)b32[23] | (uint32_t)b32[22] << 8 | (uint32_t)b32[21] << 16 | (uint32_t)b32[20] << 24;
@@ -167,7 +171,7 @@ static void secp256k1_scalar_set_b32(secp256k1_scalar_t *r, const unsigned char
    r->d[5] = (uint32_t)b32[11] | (uint32_t)b32[10] << 8 | (uint32_t)b32[9] << 16 | (uint32_t)b32[8] << 24;
    r->d[6] = (uint32_t)b32[7] | (uint32_t)b32[6] << 8 | (uint32_t)b32[5] << 16 | (uint32_t)b32[4] << 24;
    r->d[7] = (uint32_t)b32[3] | (uint32_t)b32[2] << 8 | (uint32_t)b32[1] << 16 | (uint32_t)b32[0] << 24;
-    int over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
+    over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
    if (overflow) {
        *overflow = over;
    }
@@ -263,16 +267,16 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {

 /** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
 #define muladd2(a,b) { \
-    uint32_t tl, th; \
+    uint32_t tl, th, th2, tl2; \
    { \
        uint64_t t = (uint64_t)a * b; \
        th = t >> 32;               /* at most 0xFFFFFFFE */ \
        tl = t; \
    } \
-    uint32_t th2 = th + th;         /* at most 0xFFFFFFFE (in case th was 0x7FFFFFFF) */ \
+    th2 = th + th;                  /* at most 0xFFFFFFFE (in case th was 0x7FFFFFFF) */ \
    c2 += (th2 < th) ? 1 : 0;       /* never overflows by contract (verified the next line) */ \
    VERIFY_CHECK((th2 >= th) || (c2 != 0)); \
-    uint32_t tl2 = tl + tl;         /* at most 0xFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFF) */ \
+    tl2 = tl + tl;                  /* at most 0xFFFFFFFE (in case the lowest 31 bits of tl were 0x7FFFFFFF) */ \
    th2 += (tl2 < tl) ? 1 : 0;      /* at most 0xFFFFFFFF */ \
    c0 += tl2;                      /* overflow is handled on the next line */ \
    th2 += (c0 < tl2) ? 1 : 0;      /* second overflow is handled on the next line */ \
@@ -285,8 +289,9 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {

 /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
 #define sumadd(a) { \
+    unsigned int over; \
    c0 += (a);                  /* overflow is handled on the next line */ \
-    unsigned int over = (c0 < (a)) ? 1 : 0; \
+    over = (c0 < (a)) ? 1 : 0; \
    c1 += over;                 /* overflow is handled on the next line */ \
    c2 += (c1 < over) ? 1 : 0;  /* never overflows by contract */ \
 }
@@ -316,7 +321,10 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {
 }

 static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint32_t *l) {
+    uint64_t c;
    uint32_t n0 = l[8], n1 = l[9], n2 = l[10], n3 = l[11], n4 = l[12], n5 = l[13], n6 = l[14], n7 = l[15];
+    uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12;
+    uint32_t p0, p1, p2, p3, p4, p5, p6, p7, p8;

    /* 96 bit accumulator. */
    uint32_t c0, c1, c2;
@@ -325,115 +333,115 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint32_t *l
    /* m[0..12] = l[0..7] + n[0..7] * SECP256K1_N_C. */
    c0 = l[0]; c1 = 0; c2 = 0;
    muladd_fast(n0, SECP256K1_N_C_0);
-    uint32_t m0; extract_fast(m0);
+    extract_fast(m0);
    sumadd_fast(l[1]);
    muladd(n1, SECP256K1_N_C_0);
    muladd(n0, SECP256K1_N_C_1);
-    uint32_t m1; extract(m1);
+    extract(m1);
    sumadd(l[2]);
    muladd(n2, SECP256K1_N_C_0);
    muladd(n1, SECP256K1_N_C_1);
    muladd(n0, SECP256K1_N_C_2);
-    uint32_t m2; extract(m2);
+    extract(m2);
    sumadd(l[3]);
    muladd(n3, SECP256K1_N_C_0);
    muladd(n2, SECP256K1_N_C_1);
    muladd(n1, SECP256K1_N_C_2);
    muladd(n0, SECP256K1_N_C_3);
-    uint32_t m3; extract(m3);
+    extract(m3);
    sumadd(l[4]);
    muladd(n4, SECP256K1_N_C_0);
    muladd(n3, SECP256K1_N_C_1);
    muladd(n2, SECP256K1_N_C_2);
    muladd(n1, SECP256K1_N_C_3);
    sumadd(n0);
-    uint32_t m4; extract(m4);
+    extract(m4);
    sumadd(l[5]);
    muladd(n5, SECP256K1_N_C_0);
    muladd(n4, SECP256K1_N_C_1);
    muladd(n3, SECP256K1_N_C_2);
    muladd(n2, SECP256K1_N_C_3);
    sumadd(n1);
-    uint32_t m5; extract(m5);
+    extract(m5);
    sumadd(l[6]);
    muladd(n6, SECP256K1_N_C_0);
    muladd(n5, SECP256K1_N_C_1);
    muladd(n4, SECP256K1_N_C_2);
    muladd(n3, SECP256K1_N_C_3);
    sumadd(n2);
-    uint32_t m6; extract(m6);
+    extract(m6);
    sumadd(l[7]);
    muladd(n7, SECP256K1_N_C_0);
    muladd(n6, SECP256K1_N_C_1);
    muladd(n5, SECP256K1_N_C_2);
    muladd(n4, SECP256K1_N_C_3);
    sumadd(n3);
-    uint32_t m7; extract(m7);
+    extract(m7);
    muladd(n7, SECP256K1_N_C_1);
    muladd(n6, SECP256K1_N_C_2);
    muladd(n5, SECP256K1_N_C_3);
    sumadd(n4);
-    uint32_t m8; extract(m8);
+    extract(m8);
    muladd(n7, SECP256K1_N_C_2);
    muladd(n6, SECP256K1_N_C_3);
    sumadd(n5);
-    uint32_t m9; extract(m9);
+    extract(m9);
    muladd(n7, SECP256K1_N_C_3);
    sumadd(n6);
-    uint32_t m10; extract(m10);
+    extract(m10);
    sumadd_fast(n7);
-    uint32_t m11; extract_fast(m11);
+    extract_fast(m11);
    VERIFY_CHECK(c0 <= 1);
-    uint32_t m12 = c0;
+    m12 = c0;

    /* Reduce 385 bits into 258. */
    /* p[0..8] = m[0..7] + m[8..12] * SECP256K1_N_C. */
    c0 = m0; c1 = 0; c2 = 0;
    muladd_fast(m8, SECP256K1_N_C_0);
-    uint32_t p0; extract_fast(p0);
+    extract_fast(p0);
    sumadd_fast(m1);
    muladd(m9, SECP256K1_N_C_0);
    muladd(m8, SECP256K1_N_C_1);
-    uint32_t p1; extract(p1);
+    extract(p1);
    sumadd(m2);
    muladd(m10, SECP256K1_N_C_0);
    muladd(m9, SECP256K1_N_C_1);
    muladd(m8, SECP256K1_N_C_2);
-    uint32_t p2; extract(p2);
+    extract(p2);
    sumadd(m3);
    muladd(m11, SECP256K1_N_C_0);
    muladd(m10, SECP256K1_N_C_1);
    muladd(m9, SECP256K1_N_C_2);
    muladd(m8, SECP256K1_N_C_3);
-    uint32_t p3; extract(p3);
+    extract(p3);
    sumadd(m4);
    muladd(m12, SECP256K1_N_C_0);
    muladd(m11, SECP256K1_N_C_1);
    muladd(m10, SECP256K1_N_C_2);
    muladd(m9, SECP256K1_N_C_3);
    sumadd(m8);
-    uint32_t p4; extract(p4);
+    extract(p4);
    sumadd(m5);
    muladd(m12, SECP256K1_N_C_1);
    muladd(m11, SECP256K1_N_C_2);
    muladd(m10, SECP256K1_N_C_3);
    sumadd(m9);
-    uint32_t p5; extract(p5);
+    extract(p5);
    sumadd(m6);
    muladd(m12, SECP256K1_N_C_2);
    muladd(m11, SECP256K1_N_C_3);
    sumadd(m10);
-    uint32_t p6; extract(p6);
+    extract(p6);
    sumadd_fast(m7);
    muladd_fast(m12, SECP256K1_N_C_3);
    sumadd_fast(m11);
-    uint32_t p7; extract_fast(p7);
-    uint32_t p8 = c0 + m12;
+    extract_fast(p7);
+    p8 = c0 + m12;
    VERIFY_CHECK(p8 <= 2);

    /* Reduce 258 bits into 256. */
    /* r[0..7] = p[0..7] + p[8] * SECP256K1_N_C. */
-    uint64_t c = p0 + (uint64_t)SECP256K1_N_C_0 * p8;
+    c = p0 + (uint64_t)SECP256K1_N_C_0 * p8;
    r->d[0] = c & 0xFFFFFFFFUL; c >>= 32;
    c += p1 + (uint64_t)SECP256K1_N_C_1 * p8;
    r->d[1] = c & 0xFFFFFFFFUL; c >>= 32;
@@ -454,7 +462,7 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint32_t *l
    secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r));
 }

-static void secp256k1_scalar_mul_512(uint32_t l[16], const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
+static void secp256k1_scalar_mul_512(uint32_t *l, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
    /* 96 bit accumulator. */
    uint32_t c0 = 0, c1 = 0, c2 = 0;

@@ -542,7 +550,7 @@ static void secp256k1_scalar_mul_512(uint32_t l[16], const secp256k1_scalar_t *a
    l[15] = c0;
 }

-static void secp256k1_scalar_sqr_512(uint32_t l[16], const secp256k1_scalar_t *a) {
+static void secp256k1_scalar_sqr_512(uint32_t *l, const secp256k1_scalar_t *a) {
    /* 96 bit accumulator. */
    uint32_t c0 = 0, c1 = 0, c2 = 0;

@@ -622,6 +630,7 @@ static void secp256k1_scalar_sqr(secp256k1_scalar_t *r, const secp256k1_scalar_t
    secp256k1_scalar_reduce_512(r, l);
 }

+#ifdef USE_ENDOMORPHISM
 static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a) {
    r1->d[0] = a->d[0];
    r1->d[1] = a->d[1];
@@ -640,18 +649,22 @@ static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_
    r2->d[6] = 0;
    r2->d[7] = 0;
 }
+#endif

 SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
    return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3]) | (a->d[4] ^ b->d[4]) | (a->d[5] ^ b->d[5]) | (a->d[6] ^ b->d[6]) | (a->d[7] ^ b->d[7])) == 0;
 }

 SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b, unsigned int shift) {
-    VERIFY_CHECK(shift >= 256);
    uint32_t l[16];
+    unsigned int shiftlimbs;
+    unsigned int shiftlow;
+    unsigned int shifthigh;
+    VERIFY_CHECK(shift >= 256);
    secp256k1_scalar_mul_512(l, a, b);
-    unsigned int shiftlimbs = shift >> 5;
-    unsigned int shiftlow = shift & 0x1F;
-    unsigned int shifthigh = 32 - shiftlow;
+    shiftlimbs = shift >> 5;
+    shiftlow = shift & 0x1F;
+    shifthigh = 32 - shiftlow;
    r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 480 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[1] = shift < 480 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[2] = shift < 448 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 416 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0;
--- a/src/scalar_impl.h
+++ b/src/scalar_impl.h
@@ -24,121 +24,6 @@
 #error "Please select scalar implementation"
 #endif

-typedef struct {
-#ifndef USE_NUM_NONE
-    secp256k1_num_t order;
-#endif
-#ifdef USE_ENDOMORPHISM
-    secp256k1_scalar_t minus_lambda, minus_b1, minus_b2, g1, g2;
-#endif
-} secp256k1_scalar_consts_t;
-
-static const secp256k1_scalar_consts_t *secp256k1_scalar_consts = NULL;
-
-static void secp256k1_scalar_start(void) {
-    if (secp256k1_scalar_consts != NULL)
-        return;
-
-    /* Allocate. */
-    secp256k1_scalar_consts_t *ret = (secp256k1_scalar_consts_t*)checked_malloc(sizeof(secp256k1_scalar_consts_t));
-
-#ifndef USE_NUM_NONE
-    static const unsigned char secp256k1_scalar_consts_order[] = {
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,
-        0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B,
-        0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41
-    };
-    secp256k1_num_set_bin(&ret->order, secp256k1_scalar_consts_order, sizeof(secp256k1_scalar_consts_order));
-#endif
-#ifdef USE_ENDOMORPHISM
-    /**
-     * Lambda is a scalar which has the property for secp256k1 that point multiplication by
-     * it is efficiently computable (see secp256k1_gej_mul_lambda). */
-    static const unsigned char secp256k1_scalar_consts_lambda[32] = {
-         0x53,0x63,0xad,0x4c,0xc0,0x5c,0x30,0xe0,
-         0xa5,0x26,0x1c,0x02,0x88,0x12,0x64,0x5a,
-         0x12,0x2e,0x22,0xea,0x20,0x81,0x66,0x78,
-         0xdf,0x02,0x96,0x7c,0x1b,0x23,0xbd,0x72
-    };
-    /**
-     * "Guide to Elliptic Curve Cryptography" (Hankerson, Menezes, Vanstone) gives an algorithm
-     * (algorithm 3.74) to find k1 and k2 given k, such that k1 + k2 * lambda == k mod n, and k1
-     * and k2 have a small size.
-     * It relies on constants a1, b1, a2, b2. These constants for the value of lambda above are:
-     *
-     * - a1 =      {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15}
-     * - b1 =     -{0xe4,0x43,0x7e,0xd6,0x01,0x0e,0x88,0x28,0x6f,0x54,0x7f,0xa9,0x0a,0xbf,0xe4,0xc3}
-     * - a2 = {0x01,0x14,0xca,0x50,0xf7,0xa8,0xe2,0xf3,0xf6,0x57,0xc1,0x10,0x8d,0x9d,0x44,0xcf,0xd8}
-     * - b2 =      {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15}
-     *
-     * The algorithm then computes c1 = round(b1 * k / n) and c2 = round(b2 * k / n), and gives
-     * k1 = k - (c1*a1 + c2*a2) and k2 = -(c1*b1 + c2*b2). Instead, we use modular arithmetic, and
-     * compute k1 as k - k2 * lambda, avoiding the need for constants a1 and a2.
-     *
-     * g1, g2 are precomputed constants used to replace division with a rounded multiplication
-     * when decomposing the scalar for an endomorphism-based point multiplication.
-     *
-     * The possibility of using precomputed estimates is mentioned in "Guide to Elliptic Curve
-     * Cryptography" (Hankerson, Menezes, Vanstone) in section 3.5.
-     *
-     * The derivation is described in the paper "Efficient Software Implementation of Public-Key
-     * Cryptography on Sensor Networks Using the MSP430X Microcontroller" (Gouvea, Oliveira, Lopez),
-     * Section 4.3 (here we use a somewhat higher-precision estimate):
-     * d = a1*b2 - b1*a2
-     * g1 = round((2^272)*b2/d)
-     * g2 = round((2^272)*b1/d)
-     *
-     * (Note that 'd' is also equal to the curve order here because [a1,b1] and [a2,b2] are found
-     * as outputs of the Extended Euclidean Algorithm on inputs 'order' and 'lambda').
-     */
-    static const unsigned char secp256k1_scalar_consts_minus_b1[32] = {
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0xe4,0x43,0x7e,0xd6,0x01,0x0e,0x88,0x28,
-        0x6f,0x54,0x7f,0xa9,0x0a,0xbf,0xe4,0xc3
-    };
-    static const unsigned char secp256k1_scalar_consts_b2[32] = {
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,
-        0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15
-    };
-    static const unsigned char secp256k1_scalar_consts_g1[32] = {
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0x30,0x86,
-        0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,
-        0x90,0xe4,0x92,0x84,0xeb,0x15,0x3d,0xab
-    };
-    static const unsigned char secp256k1_scalar_consts_g2[32] = {
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0xe4,0x43,
-        0x7e,0xd6,0x01,0x0e,0x88,0x28,0x6f,0x54,
-        0x7f,0xa9,0x0a,0xbf,0xe4,0xc4,0x22,0x12
-    };
-
-    secp256k1_scalar_set_b32(&ret->minus_lambda, secp256k1_scalar_consts_lambda, NULL);
-    secp256k1_scalar_negate(&ret->minus_lambda, &ret->minus_lambda);
-    secp256k1_scalar_set_b32(&ret->minus_b1, secp256k1_scalar_consts_minus_b1, NULL);
-    secp256k1_scalar_set_b32(&ret->minus_b2, secp256k1_scalar_consts_b2, NULL);
-    secp256k1_scalar_negate(&ret->minus_b2, &ret->minus_b2);
-    secp256k1_scalar_set_b32(&ret->g1, secp256k1_scalar_consts_g1, NULL);
-    secp256k1_scalar_set_b32(&ret->g2, secp256k1_scalar_consts_g2, NULL);
-#endif
-
-    /* Set the global pointer. */
-    secp256k1_scalar_consts = ret;
-}
-
-static void secp256k1_scalar_stop(void) {
-    if (secp256k1_scalar_consts == NULL)
-        return;
-
-    secp256k1_scalar_consts_t *c = (secp256k1_scalar_consts_t*)secp256k1_scalar_consts;
-    secp256k1_scalar_consts = NULL;
-    free(c);
-}
-
 #ifndef USE_NUM_NONE
 static void secp256k1_scalar_get_num(secp256k1_num_t *r, const secp256k1_scalar_t *a) {
    unsigned char c[32];
@@ -146,12 +31,21 @@ static void secp256k1_scalar_get_num(secp256k1_num_t *r, const secp256k1_scalar_
    secp256k1_num_set_bin(r, c, 32);
 }

+/** Set r to the secp256k1 curve order; see secp256k1_ecdsa_const_order_as_fe in ecdsa_impl.h. */
 static void secp256k1_scalar_order_get_num(secp256k1_num_t *r) {
-    *r = secp256k1_scalar_consts->order;
+    static const unsigned char order[32] = {
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,
+        0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B,
+        0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41
+    };
+    secp256k1_num_set_bin(r, order, 32);
 }
 #endif

 static void secp256k1_scalar_inverse(secp256k1_scalar_t *r, const secp256k1_scalar_t *x) {
+    secp256k1_scalar_t *t;
+    int i;
    /* First compute x ^ (2^N - 1) for some values of N. */
    secp256k1_scalar_t x2, x3, x4, x6, x7, x8, x15, x30, x60, x120, x127;

@@ -175,129 +69,129 @@ static void secp256k1_scalar_inverse(secp256k1_scalar_t *r, const secp256k1_scal
    secp256k1_scalar_mul(&x8, &x8,  x);

    secp256k1_scalar_sqr(&x15, &x8);
-    for (int i=0; i<6; i++)
+    for (i = 0; i < 6; i++)
        secp256k1_scalar_sqr(&x15, &x15);
    secp256k1_scalar_mul(&x15, &x15, &x7);

    secp256k1_scalar_sqr(&x30, &x15);
-    for (int i=0; i<14; i++)
+    for (i = 0; i < 14; i++)
        secp256k1_scalar_sqr(&x30, &x30);
    secp256k1_scalar_mul(&x30, &x30, &x15);

    secp256k1_scalar_sqr(&x60, &x30);
-    for (int i=0; i<29; i++)
+    for (i = 0; i < 29; i++)
        secp256k1_scalar_sqr(&x60, &x60);
    secp256k1_scalar_mul(&x60, &x60, &x30);

    secp256k1_scalar_sqr(&x120, &x60);
-    for (int i=0; i<59; i++)
+    for (i = 0; i < 59; i++)
        secp256k1_scalar_sqr(&x120, &x120);
    secp256k1_scalar_mul(&x120, &x120, &x60);

    secp256k1_scalar_sqr(&x127, &x120);
-    for (int i=0; i<6; i++)
+    for (i = 0; i < 6; i++)
        secp256k1_scalar_sqr(&x127, &x127);
    secp256k1_scalar_mul(&x127, &x127, &x7);

    /* Then accumulate the final result (t starts at x127). */
-    secp256k1_scalar_t *t = &x127;
-    for (int i=0; i<2; i++) /* 0 */
+    t = &x127;
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<4; i++) /* 0 */
+    for (i = 0; i < 4; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<4; i++) /* 0 */
+    for (i = 0; i < 4; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<3; i++) /* 0 */
+    for (i = 0; i < 3; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<4; i++) /* 0 */
+    for (i = 0; i < 4; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<5; i++) /* 00 */
+    for (i = 0; i < 5; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<4; i++) /* 00 */
+    for (i = 0; i < 4; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<5; i++) /* 0 */
+    for (i = 0; i < 5; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x4); /* 1111 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<3; i++) /* 00 */
+    for (i = 0; i < 3; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<4; i++) /* 000 */
+    for (i = 0; i < 4; i++) /* 000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<10; i++) /* 0000000 */
+    for (i = 0; i < 10; i++) /* 0000000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<4; i++) /* 0 */
+    for (i = 0; i < 4; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<9; i++) /* 0 */
+    for (i = 0; i < 9; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x8); /* 11111111 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<3; i++) /* 00 */
+    for (i = 0; i < 3; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<3; i++) /* 00 */
+    for (i = 0; i < 3; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<5; i++) /* 0 */
+    for (i = 0; i < 5; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x4); /* 1111 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<5; i++) /* 000 */
+    for (i = 0; i < 5; i++) /* 000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<4; i++) /* 00 */
+    for (i = 0; i < 4; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<8; i++) /* 000000 */
+    for (i = 0; i < 8; i++) /* 000000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<3; i++) /* 0 */
+    for (i = 0; i < 3; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<3; i++) /* 00 */
+    for (i = 0; i < 3; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<6; i++) /* 00000 */
+    for (i = 0; i < 6; i++) /* 00000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<8; i++) /* 00 */
+    for (i = 0; i < 8; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(r, t, &x6); /* 111111 */
 }
@@ -307,10 +201,11 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar_t *r, const secp256k1_
    secp256k1_scalar_inverse(r, x);
 #elif defined(USE_SCALAR_INV_NUM)
    unsigned char b[32];
+    secp256k1_num_t n, m;
    secp256k1_scalar_get_b32(b, x);
-    secp256k1_num_t n;
    secp256k1_num_set_bin(&n, b, 32);
-    secp256k1_num_mod_inverse(&n, &n, &secp256k1_scalar_consts->order);
+    secp256k1_scalar_order_get_num(&m);
+    secp256k1_num_mod_inverse(&n, &n, &m);
    secp256k1_num_get_bin(b, 32, &n);
    secp256k1_scalar_set_b32(r, b, NULL);
 #else
@@ -319,16 +214,74 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar_t *r, const secp256k1_
 }

 #ifdef USE_ENDOMORPHISM
+/**
+ * The Secp256k1 curve has an endomorphism, where lambda * (x, y) = (beta * x, y), where
+ * lambda is {0x53,0x63,0xad,0x4c,0xc0,0x5c,0x30,0xe0,0xa5,0x26,0x1c,0x02,0x88,0x12,0x64,0x5a,
+ *            0x12,0x2e,0x22,0xea,0x20,0x81,0x66,0x78,0xdf,0x02,0x96,0x7c,0x1b,0x23,0xbd,0x72}
+ *
+ * "Guide to Elliptic Curve Cryptography" (Hankerson, Menezes, Vanstone) gives an algorithm
+ * (algorithm 3.74) to find k1 and k2 given k, such that k1 + k2 * lambda == k mod n, and k1
+ * and k2 have a small size.
+ * It relies on constants a1, b1, a2, b2. These constants for the value of lambda above are:
+ *
+ * - a1 =      {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15}
+ * - b1 =     -{0xe4,0x43,0x7e,0xd6,0x01,0x0e,0x88,0x28,0x6f,0x54,0x7f,0xa9,0x0a,0xbf,0xe4,0xc3}
+ * - a2 = {0x01,0x14,0xca,0x50,0xf7,0xa8,0xe2,0xf3,0xf6,0x57,0xc1,0x10,0x8d,0x9d,0x44,0xcf,0xd8}
+ * - b2 =      {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15}
+ *
+ * The algorithm then computes c1 = round(b2 * k / n) and c2 = round((-b1) * k / n), and gives
+ * k1 = k - (c1*a1 + c2*a2) and k2 = -(c1*b1 + c2*b2). Instead, we use modular arithmetic, and
+ * compute k1 as k - k2 * lambda, avoiding the need for constants a1 and a2.
+ *
+ * g1, g2 are precomputed constants used to replace division with a rounded multiplication
+ * when decomposing the scalar for an endomorphism-based point multiplication.
+ *
+ * The possibility of using precomputed estimates is mentioned in "Guide to Elliptic Curve
+ * Cryptography" (Hankerson, Menezes, Vanstone) in section 3.5.
+ *
+ * The derivation is described in the paper "Efficient Software Implementation of Public-Key
+ * Cryptography on Sensor Networks Using the MSP430X Microcontroller" (Gouvea, Oliveira, Lopez),
+ * Section 4.3 (here we use a somewhat higher-precision estimate):
+ * d = a1*b2 - b1*a2
+ * g1 = round((2^272)*b2/d)
+ * g2 = round((2^272)*(-b1)/d)
+ *
+ * (Note that 'd' is also equal to the curve order here because [a1,b1] and [a2,b2] are found
+ * as outputs of the Extended Euclidean Algorithm on inputs 'order' and 'lambda').
+ *
+ * The function below splits a into r1 and r2, such that r1 + lambda * r2 == a (mod order).
+ */
+
 static void secp256k1_scalar_split_lambda_var(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a) {
+    secp256k1_scalar_t c1, c2;
+    static const secp256k1_scalar_t minus_lambda = SECP256K1_SCALAR_CONST(
+        0xAC9C52B3UL, 0x3FA3CF1FUL, 0x5AD9E3FDUL, 0x77ED9BA4UL,
+        0xA880B9FCUL, 0x8EC739C2UL, 0xE0CFC810UL, 0xB51283CFUL
+    );
+    static const secp256k1_scalar_t minus_b1 = SECP256K1_SCALAR_CONST(
+        0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
+        0xE4437ED6UL, 0x010E8828UL, 0x6F547FA9UL, 0x0ABFE4C3UL
+    );
+    static const secp256k1_scalar_t minus_b2 = SECP256K1_SCALAR_CONST(
+        0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL,
+        0x8A280AC5UL, 0x0774346DUL, 0xD765CDA8UL, 0x3DB1562CUL
+    );
+    static const secp256k1_scalar_t g1 = SECP256K1_SCALAR_CONST(
+        0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00003086UL,
+        0xD221A7D4UL, 0x6BCDE86CUL, 0x90E49284UL, 0xEB153DABUL
+    );
+    static const secp256k1_scalar_t g2 = SECP256K1_SCALAR_CONST(
+        0x00000000UL, 0x00000000UL, 0x00000000UL, 0x0000E443UL,
+        0x7ED6010EUL, 0x88286F54UL, 0x7FA90ABFUL, 0xE4C42212UL
+    );
    VERIFY_CHECK(r1 != a);
    VERIFY_CHECK(r2 != a);
-    secp256k1_scalar_t c1, c2;
-    secp256k1_scalar_mul_shift_var(&c1, a, &secp256k1_scalar_consts->g1, 272);
-    secp256k1_scalar_mul_shift_var(&c2, a, &secp256k1_scalar_consts->g2, 272);
-    secp256k1_scalar_mul(&c1, &c1, &secp256k1_scalar_consts->minus_b1);
-    secp256k1_scalar_mul(&c2, &c2, &secp256k1_scalar_consts->minus_b2);
+    secp256k1_scalar_mul_shift_var(&c1, a, &g1, 272);
+    secp256k1_scalar_mul_shift_var(&c2, a, &g2, 272);
+    secp256k1_scalar_mul(&c1, &c1, &minus_b1);
+    secp256k1_scalar_mul(&c2, &c2, &minus_b2);
    secp256k1_scalar_add(r2, &c1, &c2);
-    secp256k1_scalar_mul(r1, r2, &secp256k1_scalar_consts->minus_lambda);
+    secp256k1_scalar_mul(r1, r2, &minus_lambda);
    secp256k1_scalar_add(r1, r1, a);
 }
 #endif
--- a/src/secp256k1.c
+++ b/src/secp256k1.c
@@ -20,10 +20,6 @@
 #include "hash_impl.h"

 void secp256k1_start(unsigned int flags) {
-    secp256k1_fe_start();
-    secp256k1_ge_start();
-    secp256k1_scalar_start();
-    secp256k1_ecdsa_start();
    if (flags & SECP256K1_START_SIGN) {
        secp256k1_ecmult_gen_start();
    }
@@ -35,46 +31,43 @@ void secp256k1_start(unsigned int flags) {
 void secp256k1_stop(void) {
    secp256k1_ecmult_stop();
    secp256k1_ecmult_gen_stop();
-    secp256k1_ecdsa_stop();
-    secp256k1_scalar_stop();
-    secp256k1_ge_stop();
-    secp256k1_fe_stop();
 }

 int secp256k1_ecdsa_verify(const unsigned char *msg32, const unsigned char *sig, int siglen, const unsigned char *pubkey, int pubkeylen) {
+    secp256k1_ge_t q;
+    secp256k1_ecdsa_sig_t s;
+    secp256k1_scalar_t m;
+    int ret = -3;
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig != NULL);
    DEBUG_CHECK(pubkey != NULL);

-    int ret = -3;
-    secp256k1_scalar_t m;
-    secp256k1_ecdsa_sig_t s;
-    secp256k1_ge_t q;
    secp256k1_scalar_set_b32(&m, msg32, NULL);

-    if (!secp256k1_eckey_pubkey_parse(&q, pubkey, pubkeylen)) {
+    if (secp256k1_eckey_pubkey_parse(&q, pubkey, pubkeylen)) {
+        if (secp256k1_ecdsa_sig_parse(&s, sig, siglen)) {
+            if (secp256k1_ecdsa_sig_verify(&s, &q, &m)) {
+                /* success is 1, all other values are fail */
+                ret = 1;
+            } else {
+                ret = 0;
+            }
+        } else {
+            ret = -2;
+        }
+    } else {
        ret = -1;
-        goto end;
    }
-    if (!secp256k1_ecdsa_sig_parse(&s, sig, siglen)) {
-        ret = -2;
-        goto end;
-    }
-    if (!secp256k1_ecdsa_sig_verify(&s, &q, &m)) {
-        ret = 0;
-        goto end;
-    }
-    ret = 1;
-end:
+
    return ret;
 }

 static int nonce_function_rfc6979(unsigned char *nonce32, const unsigned char *msg32, const unsigned char *key32, unsigned int counter, const void *data) {
-   (void)data;
   secp256k1_rfc6979_hmac_sha256_t rng;
-   secp256k1_rfc6979_hmac_sha256_initialize(&rng, key32, 32, msg32, 32);
-   for (unsigned int i = 0; i <= counter; i++) {
+   unsigned int i;
+   secp256k1_rfc6979_hmac_sha256_initialize(&rng, key32, 32, msg32, 32, data, data != NULL ? 32 : 0);
+   for (i = 0; i <= counter; i++) {
       secp256k1_rfc6979_hmac_sha256_generate(&rng, nonce32, 32);
   }
   secp256k1_rfc6979_hmac_sha256_finalize(&rng);
@@ -85,6 +78,11 @@ const secp256k1_nonce_function_t secp256k1_nonce_function_rfc6979 = nonce_functi
 const secp256k1_nonce_function_t secp256k1_nonce_function_default = nonce_function_rfc6979;

 int secp256k1_ecdsa_sign(const unsigned char *msg32, unsigned char *signature, int *signaturelen, const unsigned char *seckey, secp256k1_nonce_function_t noncefp, const void* noncedata) {
+    secp256k1_ecdsa_sig_t sig;
+    secp256k1_scalar_t sec, non, msg;
+    int ret = 0;
+    int overflow = 0;
+    unsigned int count = 0;
    DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(signature != NULL);
@@ -94,38 +92,44 @@ int secp256k1_ecdsa_sign(const unsigned char *msg32, unsigned char *signature, i
        noncefp = secp256k1_nonce_function_default;
    }

-    secp256k1_scalar_t sec, non, msg;
-    secp256k1_scalar_set_b32(&sec, seckey, NULL);
-    secp256k1_scalar_set_b32(&msg, msg32, NULL);
-    int overflow = 0;
-    int ret = 0;
-    unsigned int count = 0;
-    secp256k1_ecdsa_sig_t sig;
-    while (1) {
-        unsigned char nonce32[32];
-        ret = noncefp(nonce32, msg32, seckey, count, noncedata);
-        if (!ret) {
-            break;
-        }
-        secp256k1_scalar_set_b32(&non, nonce32, &overflow);
-        memset(nonce32, 0, 32);
-        if (!secp256k1_scalar_is_zero(&non) && !overflow) {
-            if (secp256k1_ecdsa_sig_sign(&sig, &sec, &msg, &non, NULL)) {
+    secp256k1_scalar_set_b32(&sec, seckey, &overflow);
+    /* Fail if the secret key is invalid. */
+    if (!overflow && !secp256k1_scalar_is_zero(&sec)) {
+        secp256k1_scalar_set_b32(&msg, msg32, NULL);
+        while (1) {
+            unsigned char nonce32[32];
+            ret = noncefp(nonce32, msg32, seckey, count, noncedata);
+            if (!ret) {
                break;
            }
+            secp256k1_scalar_set_b32(&non, nonce32, &overflow);
+            memset(nonce32, 0, 32);
+            if (!secp256k1_scalar_is_zero(&non) && !overflow) {
+                if (secp256k1_ecdsa_sig_sign(&sig, &sec, &msg, &non, NULL)) {
+                    break;
+                }
+            }
+            count++;
        }
-        count++;
+        if (ret) {
+            ret = secp256k1_ecdsa_sig_serialize(signature, signaturelen, &sig);
+        }
+        secp256k1_scalar_clear(&msg);
+        secp256k1_scalar_clear(&non);
+        secp256k1_scalar_clear(&sec);
    }
-    if (ret) {
-        ret = secp256k1_ecdsa_sig_serialize(signature, signaturelen, &sig);
+    if (!ret) {
+        *signaturelen = 0;
    }
-    secp256k1_scalar_clear(&msg);
-    secp256k1_scalar_clear(&non);
-    secp256k1_scalar_clear(&sec);
    return ret;
 }

 int secp256k1_ecdsa_sign_compact(const unsigned char *msg32, unsigned char *sig64, const unsigned char *seckey, secp256k1_nonce_function_t noncefp, const void* noncedata, int *recid) {
+    secp256k1_ecdsa_sig_t sig;
+    secp256k1_scalar_t sec, non, msg;
+    int ret = 0;
+    int overflow = 0;
+    unsigned int count = 0;
    DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig64 != NULL);
@@ -134,39 +138,45 @@ int secp256k1_ecdsa_sign_compact(const unsigned char *msg32, unsigned char *sig6
        noncefp = secp256k1_nonce_function_default;
    }

-    secp256k1_scalar_t sec, non, msg;
-    secp256k1_scalar_set_b32(&sec, seckey, NULL);
-    secp256k1_scalar_set_b32(&msg, msg32, NULL);
-    int overflow = 0;
-    int ret = 0;
-    unsigned int count = 0;
-    secp256k1_ecdsa_sig_t sig;
-    while (1) {
-        unsigned char nonce32[32];
-        ret = noncefp(nonce32, msg32, seckey, count, noncedata);
-        if (!ret) {
-            break;
-        }
-        secp256k1_scalar_set_b32(&non, nonce32, &overflow);
-        memset(nonce32, 0, 32);
-        if (!secp256k1_scalar_is_zero(&non) && !overflow) {
-            if (secp256k1_ecdsa_sig_sign(&sig, &sec, &msg, &non, recid)) {
+    secp256k1_scalar_set_b32(&sec, seckey, &overflow);
+    /* Fail if the secret key is invalid. */
+    if (!overflow && !secp256k1_scalar_is_zero(&sec)) {
+        secp256k1_scalar_set_b32(&msg, msg32, NULL);
+        while (1) {
+            unsigned char nonce32[32];
+            ret = noncefp(nonce32, msg32, seckey, count, noncedata);
+            if (!ret) {
                break;
            }
+            secp256k1_scalar_set_b32(&non, nonce32, &overflow);
+            memset(nonce32, 0, 32);
+            if (!secp256k1_scalar_is_zero(&non) && !overflow) {
+                if (secp256k1_ecdsa_sig_sign(&sig, &sec, &msg, &non, recid)) {
+                    break;
+                }
+            }
+            count++;
        }
-        count++;
+        if (ret) {
+            secp256k1_scalar_get_b32(sig64, &sig.r);
+            secp256k1_scalar_get_b32(sig64 + 32, &sig.s);
+        }
+        secp256k1_scalar_clear(&msg);
+        secp256k1_scalar_clear(&non);
+        secp256k1_scalar_clear(&sec);
    }
-    if (ret) {
-        secp256k1_scalar_get_b32(sig64, &sig.r);
-        secp256k1_scalar_get_b32(sig64 + 32, &sig.s);
+    if (!ret) {
+        memset(sig64, 0, 64);
    }
-    secp256k1_scalar_clear(&msg);
-    secp256k1_scalar_clear(&non);
-    secp256k1_scalar_clear(&sec);
    return ret;
 }

 int secp256k1_ecdsa_recover_compact(const unsigned char *msg32, const unsigned char *sig64, unsigned char *pubkey, int *pubkeylen, int compressed, int recid) {
+    secp256k1_ge_t q;
+    secp256k1_ecdsa_sig_t sig;
+    secp256k1_scalar_t m;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig64 != NULL);
@@ -174,82 +184,87 @@ int secp256k1_ecdsa_recover_compact(const unsigned char *msg32, const unsigned c
    DEBUG_CHECK(pubkeylen != NULL);
    DEBUG_CHECK(recid >= 0 && recid <= 3);

-    int ret = 0;
-    secp256k1_scalar_t m;
-    secp256k1_ecdsa_sig_t sig;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&sig.r, sig64, &overflow);
-    if (overflow) {
-        return 0;
-    }
-    secp256k1_scalar_set_b32(&sig.s, sig64 + 32, &overflow);
-    if (overflow) {
-        return 0;
-    }
-    secp256k1_scalar_set_b32(&m, msg32, NULL);
+    if (!overflow) {
+        secp256k1_scalar_set_b32(&sig.s, sig64 + 32, &overflow);
+        if (!overflow) {
+            secp256k1_scalar_set_b32(&m, msg32, NULL);

-    secp256k1_ge_t q;
-    if (secp256k1_ecdsa_sig_recover(&sig, &q, &m, recid)) {
-        ret = secp256k1_eckey_pubkey_serialize(&q, pubkey, pubkeylen, compressed);
+            if (secp256k1_ecdsa_sig_recover(&sig, &q, &m, recid)) {
+                ret = secp256k1_eckey_pubkey_serialize(&q, pubkey, pubkeylen, compressed);
+            }
+        }
    }
    return ret;
 }

 int secp256k1_ec_seckey_verify(const unsigned char *seckey) {
+    secp256k1_scalar_t sec;
+    int ret;
+    int overflow;
    DEBUG_CHECK(seckey != NULL);

-    secp256k1_scalar_t sec;
-    int overflow;
    secp256k1_scalar_set_b32(&sec, seckey, &overflow);
-    int ret = !secp256k1_scalar_is_zero(&sec) && !overflow;
+    ret = !secp256k1_scalar_is_zero(&sec) && !overflow;
    secp256k1_scalar_clear(&sec);
    return ret;
 }

 int secp256k1_ec_pubkey_verify(const unsigned char *pubkey, int pubkeylen) {
+    secp256k1_ge_t q;
    DEBUG_CHECK(pubkey != NULL);

-    secp256k1_ge_t q;
    return secp256k1_eckey_pubkey_parse(&q, pubkey, pubkeylen);
 }

 int secp256k1_ec_pubkey_create(unsigned char *pubkey, int *pubkeylen, const unsigned char *seckey, int compressed) {
+    secp256k1_gej_t pj;
+    secp256k1_ge_t p;
+    secp256k1_scalar_t sec;
+    int overflow;
+    int ret = 0;
    DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(pubkeylen != NULL);
    DEBUG_CHECK(seckey != NULL);

-    secp256k1_scalar_t sec;
-    secp256k1_scalar_set_b32(&sec, seckey, NULL);
-    secp256k1_gej_t pj;
-    secp256k1_ecmult_gen(&pj, &sec);
-    secp256k1_scalar_clear(&sec);
-    secp256k1_ge_t p;
-    secp256k1_ge_set_gej(&p, &pj);
-    return secp256k1_eckey_pubkey_serialize(&p, pubkey, pubkeylen, compressed);
+    secp256k1_scalar_set_b32(&sec, seckey, &overflow);
+    if (!overflow) {
+        secp256k1_ecmult_gen(&pj, &sec);
+        secp256k1_scalar_clear(&sec);
+        secp256k1_ge_set_gej(&p, &pj);
+        ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, pubkeylen, compressed);
+    }
+    if (!ret) {
+        *pubkeylen = 0;
+    }
+    return ret;
 }

 int secp256k1_ec_pubkey_decompress(unsigned char *pubkey, int *pubkeylen) {
+    secp256k1_ge_t p;
+    int ret = 0;
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(pubkeylen != NULL);

-    secp256k1_ge_t p;
-    if (!secp256k1_eckey_pubkey_parse(&p, pubkey, *pubkeylen))
-        return 0;
-    return secp256k1_eckey_pubkey_serialize(&p, pubkey, pubkeylen, 0);
+    if (secp256k1_eckey_pubkey_parse(&p, pubkey, *pubkeylen)) {
+        ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, pubkeylen, 0);
+    }
+    return ret;
 }

 int secp256k1_ec_privkey_tweak_add(unsigned char *seckey, const unsigned char *tweak) {
+    secp256k1_scalar_t term;
+    secp256k1_scalar_t sec;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(tweak != NULL);

-    secp256k1_scalar_t term;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&term, tweak, &overflow);
-    secp256k1_scalar_t sec;
    secp256k1_scalar_set_b32(&sec, seckey, NULL);

-    int ret = secp256k1_eckey_privkey_tweak_add(&sec, &term) && !overflow;
+    ret = secp256k1_eckey_privkey_tweak_add(&sec, &term) && !overflow;
    if (ret) {
        secp256k1_scalar_get_b32(seckey, &sec);
    }
@@ -260,40 +275,41 @@ int secp256k1_ec_privkey_tweak_add(unsigned char *seckey, const unsigned char *t
 }

 int secp256k1_ec_pubkey_tweak_add(unsigned char *pubkey, int pubkeylen, const unsigned char *tweak) {
+    secp256k1_ge_t p;
+    secp256k1_scalar_t term;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(tweak != NULL);

-    secp256k1_scalar_t term;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&term, tweak, &overflow);
-    if (overflow) {
-        return 0;
-    }
-    secp256k1_ge_t p;
-    int ret = secp256k1_eckey_pubkey_parse(&p, pubkey, pubkeylen);
-    if (ret) {
-        ret = secp256k1_eckey_pubkey_tweak_add(&p, &term);
-    }
-    if (ret) {
-        int oldlen = pubkeylen;
-        ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, &pubkeylen, oldlen <= 33);
-        VERIFY_CHECK(pubkeylen == oldlen);
+    if (!overflow) {
+        ret = secp256k1_eckey_pubkey_parse(&p, pubkey, pubkeylen);
+        if (ret) {
+            ret = secp256k1_eckey_pubkey_tweak_add(&p, &term);
+        }
+        if (ret) {
+            int oldlen = pubkeylen;
+            ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, &pubkeylen, oldlen <= 33);
+            VERIFY_CHECK(pubkeylen == oldlen);
+        }
    }

    return ret;
 }

 int secp256k1_ec_privkey_tweak_mul(unsigned char *seckey, const unsigned char *tweak) {
+    secp256k1_scalar_t factor;
+    secp256k1_scalar_t sec;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(tweak != NULL);

-    secp256k1_scalar_t factor;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&factor, tweak, &overflow);
-    secp256k1_scalar_t sec;
    secp256k1_scalar_set_b32(&sec, seckey, NULL);
-    int ret = secp256k1_eckey_privkey_tweak_mul(&sec, &factor) && !overflow;
+    ret = secp256k1_eckey_privkey_tweak_mul(&sec, &factor) && !overflow;
    if (ret) {
        secp256k1_scalar_get_b32(seckey, &sec);
    }
@@ -304,50 +320,53 @@ int secp256k1_ec_privkey_tweak_mul(unsigned char *seckey, const unsigned char *t
 }

 int secp256k1_ec_pubkey_tweak_mul(unsigned char *pubkey, int pubkeylen, const unsigned char *tweak) {
+    secp256k1_ge_t p;
+    secp256k1_scalar_t factor;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(tweak != NULL);

-    secp256k1_scalar_t factor;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&factor, tweak, &overflow);
-    if (overflow) {
-        return 0;
-    }
-    secp256k1_ge_t p;
-    int ret = secp256k1_eckey_pubkey_parse(&p, pubkey, pubkeylen);
-    if (ret) {
-        ret = secp256k1_eckey_pubkey_tweak_mul(&p, &factor);
-    }
-    if (ret) {
-        int oldlen = pubkeylen;
-        ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, &pubkeylen, oldlen <= 33);
-        VERIFY_CHECK(pubkeylen == oldlen);
+    if (!overflow) {
+        ret = secp256k1_eckey_pubkey_parse(&p, pubkey, pubkeylen);
+        if (ret) {
+            ret = secp256k1_eckey_pubkey_tweak_mul(&p, &factor);
+        }
+        if (ret) {
+            int oldlen = pubkeylen;
+            ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, &pubkeylen, oldlen <= 33);
+            VERIFY_CHECK(pubkeylen == oldlen);
+        }
    }

    return ret;
 }

 int secp256k1_ec_privkey_export(const unsigned char *seckey, unsigned char *privkey, int *privkeylen, int compressed) {
+    secp256k1_scalar_t key;
+    int ret = 0;
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(privkey != NULL);
    DEBUG_CHECK(privkeylen != NULL);

-    secp256k1_scalar_t key;
    secp256k1_scalar_set_b32(&key, seckey, NULL);
-    int ret = secp256k1_eckey_privkey_serialize(privkey, privkeylen, &key, compressed);
+    ret = secp256k1_eckey_privkey_serialize(privkey, privkeylen, &key, compressed);
    secp256k1_scalar_clear(&key);
    return ret;
 }

 int secp256k1_ec_privkey_import(unsigned char *seckey, const unsigned char *privkey, int privkeylen) {
+    secp256k1_scalar_t key;
+    int ret = 0;
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(privkey != NULL);

-    secp256k1_scalar_t key;
-    int ret = secp256k1_eckey_privkey_parse(&key, privkey, privkeylen);
-    if (ret)
+    ret = secp256k1_eckey_privkey_parse(&key, privkey, privkeylen);
+    if (ret) {
        secp256k1_scalar_get_b32(seckey, &key);
+    }
    secp256k1_scalar_clear(&key);
    return ret;
 }
--- a/src/testrand.h
+++ b/src/testrand.h
@@ -11,8 +11,10 @@
 #include "libsecp256k1-config.h"
 #endif

-/** Seed the pseudorandom number generator. */
-SECP256K1_INLINE static void secp256k1_rand_seed(uint64_t v);
+/* A non-cryptographic RNG used only for test infrastructure. */
+
+/** Seed the pseudorandom number generator for testing. */
+SECP256K1_INLINE static void secp256k1_rand_seed(const unsigned char *seed16);

 /** Generate a pseudorandom 32-bit number. */
 static uint32_t secp256k1_rand32(void);
--- a/src/testrand_impl.h
+++ b/src/testrand_impl.h
@@ -11,44 +11,44 @@
 #include <string.h>

 #include "testrand.h"
+#include "hash.h"

-static uint32_t secp256k1_Rz = 11, secp256k1_Rw = 11;
+static secp256k1_rfc6979_hmac_sha256_t secp256k1_test_rng;
+static uint32_t secp256k1_test_rng_precomputed[8];
+static int secp256k1_test_rng_precomputed_used = 8;

-SECP256K1_INLINE static void secp256k1_rand_seed(uint64_t v) {
-    secp256k1_Rz = v >> 32;
-    secp256k1_Rw = v;
-
-    if (secp256k1_Rz == 0 || secp256k1_Rz == 0x9068ffffU) {
-        secp256k1_Rz = 111;
-    }
-    if (secp256k1_Rw == 0 || secp256k1_Rw == 0x464fffffU) {
-        secp256k1_Rw = 111;
-    }
+SECP256K1_INLINE static void secp256k1_rand_seed(const unsigned char *seed16) {
+    secp256k1_rfc6979_hmac_sha256_initialize(&secp256k1_test_rng, (const unsigned char*)"TestRNG", 7, seed16, 16, NULL, 0);
 }

 SECP256K1_INLINE static uint32_t secp256k1_rand32(void) {
-    secp256k1_Rz = 36969 * (secp256k1_Rz & 0xFFFF) + (secp256k1_Rz >> 16);
-    secp256k1_Rw = 18000 * (secp256k1_Rw & 0xFFFF) + (secp256k1_Rw >> 16);
-    return (secp256k1_Rw << 16) + (secp256k1_Rw >> 16) + secp256k1_Rz;
+    if (secp256k1_test_rng_precomputed_used == 8) {
+        secp256k1_rfc6979_hmac_sha256_generate(&secp256k1_test_rng, (unsigned char*)(&secp256k1_test_rng_precomputed[0]), sizeof(secp256k1_test_rng_precomputed));
+        secp256k1_test_rng_precomputed_used = 0;
+    }
+    return secp256k1_test_rng_precomputed[secp256k1_test_rng_precomputed_used++];
 }

 static void secp256k1_rand256(unsigned char *b32) {
-    for (int i=0; i<8; i++) {
-        uint32_t r = secp256k1_rand32();
-        b32[i*4 + 0] = (r >>  0) & 0xFF;
-        b32[i*4 + 1] = (r >>  8) & 0xFF;
-        b32[i*4 + 2] = (r >> 16) & 0xFF;
-        b32[i*4 + 3] = (r >> 24) & 0xFF;
-    }
+    secp256k1_rfc6979_hmac_sha256_generate(&secp256k1_test_rng, b32, 32);
 }

 static void secp256k1_rand256_test(unsigned char *b32) {
    int bits=0;
+    uint64_t ent = 0;
+    int entleft = 0;
    memset(b32, 0, 32);
    while (bits < 256) {
-        uint32_t ent = secp256k1_rand32();
-        int now = 1 + ((ent % 64)*((ent >> 6) % 32)+16)/31;
-        uint32_t val = 1 & (ent >> 11);
+        int now;
+        uint32_t val;
+        if (entleft < 12) {
+            ent |= ((uint64_t)secp256k1_rand32()) << entleft;
+            entleft += 32;
+        }
+        now = 1 + ((ent % 64)*((ent >> 6) % 32)+16)/31;
+        val = 1 & (ent >> 11);
+        ent >>= 12;
+        entleft -= 12;
        while (now > 0 && bits < 256) {
            b32[bits / 8] |= val << (bits % 8);
            now--;
--- a/src/tests.c
+++ b/src/tests.c
--- a/src/util.h
+++ b/src/util.h
@@ -27,7 +27,7 @@
 } while(0)
 #endif

-#ifndef HAVE_BUILTIN_EXPECT
+#ifdef HAVE_BUILTIN_EXPECT
 #define EXPECT(x,c) __builtin_expect((x),(c))
 #else
 #define EXPECT(x,c) (x)
@@ -61,7 +61,7 @@
 #define VERIFY_CHECK(cond) do { (void)(cond); } while(0)
 #endif

-static inline void *checked_malloc(size_t size) {
+static SECP256K1_INLINE void *checked_malloc(size_t size) {
    void *ret = malloc(size);
    CHECK(ret != NULL);
    return ret;
@@ -84,4 +84,21 @@ static inline void *checked_malloc(size_t size) {
 # endif
 #endif

+#if defined(_WIN32)
+# define I64FORMAT "I64d"
+# define I64uFORMAT "I64u"
+#else
+# define I64FORMAT "lld"
+# define I64uFORMAT "llu"
+#endif
+
+#if defined(HAVE___INT128)
+# if defined(__GNUC__)
+#  define SECP256K1_GNUC_EXT __extension__
+# else
+#  define SECP256K1_GNUC_EXT
+# endif
+SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t;
+#endif
+
 #endif