Try redundant representation for X25519.

Signed-off-by: NIIBE Yutaka <gniibe@fsij.org>
NIIBE Yutaka 2023-12-14 15:29:01 +09:00
parent f06d3c1978
commit 3a70cee5bd
2 changed files with 736 additions and 67 deletions

ChangeLog

@@ -1,3 +1,12 @@
2023-12-14 NIIBE Yutaka <gniibe@fsij.org>
* src/ecc-x25519.c (rr25519_0, rr25519_1, rr25519_copy)
(rr25519_swap_cond, rr25519_add, rr25519_sub, m32x32)
(rr25519_mul, rr25519_sqr, rr25638_mul_121665)
(get_uint32_le, get_uint24_le, rr25519_expand, rr25519_reduce)
(rr25519_contract): New.
(compute_nQ): Use the redundant representation for p25519 field.
2023-12-13 NIIBE Yutaka <gniibe@fsij.org>
* src/ecc-x25519.c (mod25638_0, mod25638_1, mod25638_copy)

src/ecc-x25519.c

@@ -50,88 +50,744 @@
* We don't avoid a conditional jump if both cases have the same
* timing, either.
*
* (1) We use Radix-32 field arithmetic, keeping values modulo
* 2^256-38, which is more redundant. For example, "1" can be
* represented in three ways in 256-bit: 1, 2^255-18, and
* 2^256-37.
* (1) We use the base 2^25.5 (limbs alternate between 26 bits and
* 25 bits).
*
* (2) We use Montgomery double-and-add.
*
*/
#ifndef BN256_C_IMPLEMENTATION
#define ASM_IMPLEMENTATION 1
#endif
/*
*
* 121665 = 0x1db41
* 1 1101 1011 0100 0001
/* Redundant representation of a bignum integer with signed limbs,
* using 2^25.5 as the base.
*/
static void
mod25638_mul_121665 (bn256 *x, const bn256 *a)
{
#if ASM_IMPLEMENTATION
#include "muladd_256.h"
const uint32_t *s;
uint32_t *d;
uint32_t w;
uint32_t c;
s = a->word;
d = x->word;
memset (d, 0, sizeof (bn256));
w = 121665;
MULADD_256_ASM (s, d, w, c);
#else
uint32_t c, c1;
bn256 m[1];
c = c1 = bn256_shift (m, a, 6); c += bn256_add (x, a, m);
c1 <<= 2; c1 |= bn256_shift (m, m, 2); c = c + c1 + bn256_add (x, x, m);
c1 <<= 1; c1 |= bn256_shift (m, m, 1); c = c + c1 + bn256_add (x, x, m);
c1 <<= 2; c1 |= bn256_shift (m, m, 2); c = c + c1 + bn256_add (x, x, m);
c1 <<= 1; c1 |= bn256_shift (m, m, 1); c = c + c1 + bn256_add (x, x, m);
c1 <<= 2; c1 |= bn256_shift (m, m, 2); c = c + c1 + bn256_add (x, x, m);
c1 <<= 1; c1 |= bn256_shift (m, m, 1); c = c + c1 + bn256_add (x, x, m);
c1 <<= 1; c1 |= bn256_shift (m, m, 1); c = c + c1 + bn256_add (x, x, m);
#endif
c = bn256_add_uint (x, x, c*38);
x->word[0] += c * 38;
}
#define RR25519_WORDS 10
typedef struct rr25519 {
int32_t w[RR25519_WORDS];
} rr25519;
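/*
* A field element X is recovered from its limbs as
*   X = w[0] + w[1]*2^26 + w[2]*2^51 + w[3]*2^77 + w[4]*2^102
*     + w[5]*2^128 + w[6]*2^153 + w[7]*2^179 + w[8]*2^204 + w[9]*2^230
* i.e., limb i has weight 2^ceil(25.5*i): even limbs are nominally
* 26-bit and odd limbs 25-bit, with headroom in the signed 32-bit type.
*/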
/* X = 0 */
static void
mod25638_0 (bn256 *x)
static inline void
rr25519_0 (rr25519 *x)
{
memset(x, 0, sizeof (bn256));
memset(x, 0, sizeof (rr25519));
}
/* X = 1 */
static void
mod25638_1 (bn256 *x)
static inline void
rr25519_1 (rr25519 *x)
{
x->word[0] = 1;
memset(&x->word[1], 0, sizeof (uint32_t) * (BN256_WORDS - 1));
x->w[0] = 1;
memset(&x->w[1], 0, sizeof (int32_t) * (RR25519_WORDS - 1));
}
/* DST = SRC */
static void
mod25638_copy (bn256 *dst, const bn256 *src)
static inline void
rr25519_copy (rr25519 *dst, const rr25519 *src)
{
memcpy (dst, src, sizeof (bn256));
memcpy (dst, src, sizeof (rr25519));
}
/* A <=> B conditionally */
static void
rr25519_swap_cond (rr25519 *a, rr25519 *b, uint32_t c)
{
int i;
uint32_t mask = 0UL - c;
uint32_t *p = (uint32_t *)a->w;
uint32_t *q = (uint32_t *)b->w;
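/* Empty asm with a "memory" clobber: a compiler barrier that keeps MASK
opaque, so the masked swap cannot be optimized into a branch. */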
asm volatile ("" : "+r" (mask) : : "memory");
for (i = 0; i < RR25519_WORDS; i++)
{
uint32_t t = mask & (*p^*q);
*p++ ^= t;
*q++ ^= t;
}
}
/* X = (A + B) mod 2^255-19 */
static void
rr25519_add (rr25519 *x, const rr25519 *a, const rr25519 *b)
{
x->w[0] = a->w[0] + b->w[0];
x->w[1] = a->w[1] + b->w[1];
x->w[2] = a->w[2] + b->w[2];
x->w[3] = a->w[3] + b->w[3];
x->w[4] = a->w[4] + b->w[4];
x->w[5] = a->w[5] + b->w[5];
x->w[6] = a->w[6] + b->w[6];
x->w[7] = a->w[7] + b->w[7];
x->w[8] = a->w[8] + b->w[8];
x->w[9] = a->w[9] + b->w[9];
}
/* X = (A - B) mod 2^255-19 */
static void
rr25519_sub (rr25519 *x, const rr25519 *a, const rr25519 *b)
{
x->w[0] = a->w[0] - b->w[0];
x->w[1] = a->w[1] - b->w[1];
x->w[2] = a->w[2] - b->w[2];
x->w[3] = a->w[3] - b->w[3];
x->w[4] = a->w[4] - b->w[4];
x->w[5] = a->w[5] - b->w[5];
x->w[6] = a->w[6] - b->w[6];
x->w[7] = a->w[7] - b->w[7];
x->w[8] = a->w[8] - b->w[8];
x->w[9] = a->w[9] - b->w[9];
}
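/* Addition and subtraction are limb-wise with no carry propagation;
the headroom of the signed 32-bit limbs absorbs the growth until the
next multiplication or squaring normalizes the limbs. */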
/* Multiply two signed 32-bit integers, producing a 64-bit result. */
static inline int64_t
m32x32 (int32_t a, int32_t b)
{
return a * (int64_t)b;
}
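/*
* In the partial products below, suffix _19 or _38 marks a factor
* folded in because the product lands at weight 2^255 or above
* (2^255 = 19 mod p), and suffix _2 marks two odd limbs meeting,
* e.g. a1*b1 has weight 2^26 * 2^26 = 2 * 2^51.
*/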
/* X = (A * B) mod 2^255-19 */
static void
rr25519_mul (rr25519 *x, const rr25519 *a, const rr25519 *b)
{
int64_t carry0, carry1, carry2, carry3, carry4;
int64_t carry5, carry6, carry7, carry8, carry9;
int32_t a0 = a->w[0];
int32_t a1 = a->w[1];
int32_t a2 = a->w[2];
int32_t a3 = a->w[3];
int32_t a4 = a->w[4];
int32_t a5 = a->w[5];
int32_t a6 = a->w[6];
int32_t a7 = a->w[7];
int32_t a8 = a->w[8];
int32_t a9 = a->w[9];
int32_t b0 = b->w[0];
int32_t b1 = b->w[1];
int32_t b2 = b->w[2];
int32_t b3 = b->w[3];
int32_t b4 = b->w[4];
int32_t b5 = b->w[5];
int32_t b6 = b->w[6];
int32_t b7 = b->w[7];
int32_t b8 = b->w[8];
int32_t b9 = b->w[9];
int32_t b1_19 = 19 * b1;
int32_t b2_19 = 19 * b2;
int32_t b3_19 = 19 * b3;
int32_t b4_19 = 19 * b4;
int32_t b5_19 = 19 * b5;
int32_t b6_19 = 19 * b6;
int32_t b7_19 = 19 * b7;
int32_t b8_19 = 19 * b8;
int32_t b9_19 = 19 * b9;
int32_t a1_2 = 2 * a1;
int32_t a3_2 = 2 * a3;
int32_t a5_2 = 2 * a5;
int32_t a7_2 = 2 * a7;
int32_t a9_2 = 2 * a9;
int64_t a0b0 = m32x32 (a0, b0);
int64_t a0b1 = m32x32 (a0, b1);
int64_t a0b2 = m32x32 (a0, b2);
int64_t a0b3 = m32x32 (a0, b3);
int64_t a0b4 = m32x32 (a0, b4);
int64_t a0b5 = m32x32 (a0, b5);
int64_t a0b6 = m32x32 (a0, b6);
int64_t a0b7 = m32x32 (a0, b7);
int64_t a0b8 = m32x32 (a0, b8);
int64_t a0b9 = m32x32 (a0, b9);
int64_t a1b0 = m32x32 (a1, b0);
int64_t a1b1_2 = m32x32 (a1_2, b1);
int64_t a1b2 = m32x32 (a1, b2);
int64_t a1b3_2 = m32x32 (a1_2, b3);
int64_t a1b4 = m32x32 (a1, b4);
int64_t a1b5_2 = m32x32 (a1_2, b5);
int64_t a1b6 = m32x32 (a1, b6);
int64_t a1b7_2 = m32x32 (a1_2, b7);
int64_t a1b8 = m32x32 (a1, b8);
int64_t a1b9_38 = m32x32 (a1_2, b9_19);
int64_t a2b0 = m32x32 (a2, b0);
int64_t a2b1 = m32x32 (a2, b1);
int64_t a2b2 = m32x32 (a2, b2);
int64_t a2b3 = m32x32 (a2, b3);
int64_t a2b4 = m32x32 (a2, b4);
int64_t a2b5 = m32x32 (a2, b5);
int64_t a2b6 = m32x32 (a2, b6);
int64_t a2b7 = m32x32 (a2, b7);
int64_t a2b8_19 = m32x32 (a2, b8_19);
int64_t a2b9_19 = m32x32 (a2, b9_19);
int64_t a3b0 = m32x32 (a3, b0);
int64_t a3b1_2 = m32x32 (a3_2, b1);
int64_t a3b2 = m32x32 (a3, b2);
int64_t a3b3_2 = m32x32 (a3_2, b3);
int64_t a3b4 = m32x32 (a3, b4);
int64_t a3b5_2 = m32x32 (a3_2, b5);
int64_t a3b6 = m32x32 (a3, b6);
int64_t a3b7_38 = m32x32 (a3_2, b7_19);
int64_t a3b8_19 = m32x32 (a3, b8_19);
int64_t a3b9_38 = m32x32 (a3_2, b9_19);
int64_t a4b0 = m32x32 (a4, b0);
int64_t a4b1 = m32x32 (a4, b1);
int64_t a4b2 = m32x32 (a4, b2);
int64_t a4b3 = m32x32 (a4, b3);
int64_t a4b4 = m32x32 (a4, b4);
int64_t a4b5 = m32x32 (a4, b5);
int64_t a4b6_19 = m32x32 (a4, b6_19);
int64_t a4b7_19 = m32x32 (a4, b7_19);
int64_t a4b8_19 = m32x32 (a4, b8_19);
int64_t a4b9_19 = m32x32 (a4, b9_19);
int64_t a5b0 = m32x32 (a5, b0);
int64_t a5b1_2 = m32x32 (a5_2, b1);
int64_t a5b2 = m32x32 (a5, b2);
int64_t a5b3_2 = m32x32 (a5_2, b3);
int64_t a5b4 = m32x32 (a5, b4);
int64_t a5b5_38 = m32x32 (a5_2, b5_19);
int64_t a5b6_19 = m32x32 (a5, b6_19);
int64_t a5b7_38 = m32x32 (a5_2, b7_19);
int64_t a5b8_19 = m32x32 (a5, b8_19);
int64_t a5b9_38 = m32x32 (a5_2, b9_19);
int64_t a6b0 = m32x32 (a6, b0);
int64_t a6b1 = m32x32 (a6, b1);
int64_t a6b2 = m32x32 (a6, b2);
int64_t a6b3 = m32x32 (a6, b3);
int64_t a6b4_19 = m32x32 (a6, b4_19);
int64_t a6b5_19 = m32x32 (a6, b5_19);
int64_t a6b6_19 = m32x32 (a6, b6_19);
int64_t a6b7_19 = m32x32 (a6, b7_19);
int64_t a6b8_19 = m32x32 (a6, b8_19);
int64_t a6b9_19 = m32x32 (a6, b9_19);
int64_t a7b0 = m32x32 (a7, b0);
int64_t a7b1_2 = m32x32 (a7_2, b1);
int64_t a7b2 = m32x32 (a7, b2);
int64_t a7b3_38 = m32x32 (a7_2, b3_19);
int64_t a7b4_19 = m32x32 (a7, b4_19);
int64_t a7b5_38 = m32x32 (a7_2, b5_19);
int64_t a7b6_19 = m32x32 (a7, b6_19);
int64_t a7b7_38 = m32x32 (a7_2, b7_19);
int64_t a7b8_19 = m32x32 (a7, b8_19);
int64_t a7b9_38 = m32x32 (a7_2, b9_19);
int64_t a8b0 = m32x32 (a8, b0);
int64_t a8b1 = m32x32 (a8, b1);
int64_t a8b2_19 = m32x32 (a8, b2_19);
int64_t a8b3_19 = m32x32 (a8, b3_19);
int64_t a8b4_19 = m32x32 (a8, b4_19);
int64_t a8b5_19 = m32x32 (a8, b5_19);
int64_t a8b6_19 = m32x32 (a8, b6_19);
int64_t a8b7_19 = m32x32 (a8, b7_19);
int64_t a8b8_19 = m32x32 (a8, b8_19);
int64_t a8b9_19 = m32x32 (a8, b9_19);
int64_t a9b0 = m32x32 (a9, b0);
int64_t a9b1_38 = m32x32 (a9_2, b1_19);
int64_t a9b2_19 = m32x32 (a9, b2_19);
int64_t a9b3_38 = m32x32 (a9_2, b3_19);
int64_t a9b4_19 = m32x32 (a9, b4_19);
int64_t a9b5_38 = m32x32 (a9_2, b5_19);
int64_t a9b6_19 = m32x32 (a9, b6_19);
int64_t a9b7_38 = m32x32 (a9_2, b7_19);
int64_t a9b8_19 = m32x32 (a9, b8_19);
int64_t a9b9_38 = m32x32 (a9_2, b9_19);
int64_t x0 = (a0b0 + a1b9_38 + a2b8_19 + a3b7_38 + a4b6_19 + a5b5_38
+ a6b4_19 + a7b3_38 + a8b2_19 + a9b1_38);
int64_t x1 = (a0b1 + a1b0 + a2b9_19 + a3b8_19 + a4b7_19 + a5b6_19 + a6b5_19
+ a7b4_19 + a8b3_19 + a9b2_19);
int64_t x2 = (a0b2 + a1b1_2 + a2b0 + a3b9_38 + a4b8_19 + a5b7_38 + a6b6_19
+ a7b5_38 + a8b4_19 + a9b3_38);
int64_t x3 = (a0b3 + a1b2 + a2b1 + a3b0 + a4b9_19 + a5b8_19 + a6b7_19
+ a7b6_19 + a8b5_19 + a9b4_19);
int64_t x4 = (a0b4 + a1b3_2 + a2b2 + a3b1_2 + a4b0 + a5b9_38 + a6b8_19
+ a7b7_38 + a8b6_19 + a9b5_38);
int64_t x5 = (a0b5 + a1b4 + a2b3 + a3b2 + a4b1 + a5b0 + a6b9_19 + a7b8_19
+ a8b7_19 + a9b6_19);
int64_t x6 = (a0b6 + a1b5_2 + a2b4 + a3b3_2 + a4b2 + a5b1_2 + a6b0
+ a7b9_38 + a8b8_19 + a9b7_38);
int64_t x7 = (a0b7 + a1b6 + a2b5 + a3b4 + a4b3 + a5b2 + a6b1 + a7b0
+ a8b9_19 + a9b8_19);
int64_t x8 = (a0b8 + a1b7_2 + a2b6 + a3b5_2 + a4b4 + a5b3_2 + a6b2 + a7b1_2
+ a8b0 + a9b9_38);
int64_t x9 = (a0b9 + a1b8 + a2b7 + a3b6 + a4b5 + a5b4 + a6b3 + a7b2
+ a8b1 + a9b0);
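/*
* Carry chain: each step rounds x_i to a centered remainder
* (|x_i| <= 2^25 for even i, |x_i| <= 2^24 for odd i) and adds the
* quotient to the next limb; the carry out of x9 wraps around to x0
* multiplied by 19, since 2^255 = 19 (mod p).
*/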
carry0 = (x0 + (1 << 25)) >> 26;
x1 += carry0;
x0 -= (carry0 << 26);
carry4 = (x4 + (1 << 25)) >> 26;
x5 += carry4;
x4 -= (carry4 << 26);
carry1 = (x1 + (1 << 24)) >> 25;
x2 += carry1;
x1 -= (carry1 << 25);
carry5 = (x5 + (1 << 24)) >> 25;
x6 += carry5;
x5 -= (carry5 << 25);
carry2 = (x2 + (1 << 25)) >> 26;
x3 += carry2;
x2 -= (carry2 << 26);
carry6 = (x6 + (1 << 25)) >> 26;
x7 += carry6;
x6 -= (carry6 << 26);
carry3 = (x3 + (1 << 24)) >> 25;
x4 += carry3;
x3 -= (carry3 << 25);
carry7 = (x7 + (1 << 24)) >> 25;
x8 += carry7;
x7 -= (carry7 << 25);
carry4 = (x4 + (1 << 25)) >> 26;
x5 += carry4;
x4 -= (carry4 << 26);
carry8 = (x8 + (1 << 25)) >> 26;
x9 += carry8;
x8 -= (carry8 << 26);
carry9 = (x9 + (1 << 24)) >> 25;
x0 += carry9 * 19;
x9 -= (carry9 << 25);
carry0 = (x0 + (1 << 25)) >> 26;
x1 += carry0;
x0 -= (carry0 << 26);
x->w[0] = (int32_t)x0;
x->w[1] = (int32_t)x1;
x->w[2] = (int32_t)x2;
x->w[3] = (int32_t)x3;
x->w[4] = (int32_t)x4;
x->w[5] = (int32_t)x5;
x->w[6] = (int32_t)x6;
x->w[7] = (int32_t)x7;
x->w[8] = (int32_t)x8;
x->w[9] = (int32_t)x9;
}
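/* Squaring follows the same scheme as rr25519_mul, exploiting the
symmetry a_i*a_j = a_j*a_i to nearly halve the number of 32x32
multiplications. */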
/* X = (A ^ 2) mod 2^255-19 */
static void
rr25519_sqr (rr25519 *x, const rr25519 *a)
{
int64_t carry0, carry1, carry2, carry3, carry4;
int64_t carry5, carry6, carry7, carry8, carry9;
int32_t a0 = a->w[0];
int32_t a1 = a->w[1];
int32_t a2 = a->w[2];
int32_t a3 = a->w[3];
int32_t a4 = a->w[4];
int32_t a5 = a->w[5];
int32_t a6 = a->w[6];
int32_t a7 = a->w[7];
int32_t a8 = a->w[8];
int32_t a9 = a->w[9];
int32_t a0_2 = 2 * a0;
int32_t a1_2 = 2 * a1;
int32_t a2_2 = 2 * a2;
int32_t a3_2 = 2 * a3;
int32_t a4_2 = 2 * a4;
int32_t a5_2 = 2 * a5;
int32_t a6_2 = 2 * a6;
int32_t a7_2 = 2 * a7;
int32_t a5_38 = 38 * a5;
int32_t a6_19 = 19 * a6;
int32_t a7_38 = 38 * a7;
int32_t a8_19 = 19 * a8;
int32_t a9_38 = 38 * a9;
int64_t a0a0 = m32x32 (a0, a0);
int64_t a0a1_2 = m32x32 (a0_2, a1);
int64_t a0a2_2 = m32x32 (a0_2, a2);
int64_t a0a3_2 = m32x32 (a0_2, a3);
int64_t a0a4_2 = m32x32 (a0_2, a4);
int64_t a0a5_2 = m32x32 (a0_2, a5);
int64_t a0a6_2 = m32x32 (a0_2, a6);
int64_t a0a7_2 = m32x32 (a0_2, a7);
int64_t a0a8_2 = m32x32 (a0_2, a8);
int64_t a0a9_2 = m32x32 (a0_2, a9);
int64_t a1a1_2 = m32x32 (a1_2, a1);
int64_t a1a2_2 = m32x32 (a1_2, a2);
int64_t a1a3_4 = m32x32 (a1_2, a3_2);
int64_t a1a4_2 = m32x32 (a1_2, a4);
int64_t a1a5_4 = m32x32 (a1_2, a5_2);
int64_t a1a6_2 = m32x32 (a1_2, a6);
int64_t a1a7_4 = m32x32 (a1_2, a7_2);
int64_t a1a8_2 = m32x32 (a1_2, a8);
int64_t a1a9_76 = m32x32 (a1_2, a9_38);
int64_t a2a2 = m32x32 (a2, a2);
int64_t a2a3_2 = m32x32 (a2_2, a3);
int64_t a2a4_2 = m32x32 (a2_2, a4);
int64_t a2a5_2 = m32x32 (a2_2, a5);
int64_t a2a6_2 = m32x32 (a2_2, a6);
int64_t a2a7_2 = m32x32 (a2_2, a7);
int64_t a2a8_38 = m32x32 (a2_2, a8_19);
int64_t a2a9_38 = m32x32 (a2, a9_38);
int64_t a3a3_2 = m32x32 (a3_2, a3);
int64_t a3a4_2 = m32x32 (a3_2, a4);
int64_t a3a5_4 = m32x32 (a3_2, a5_2);
int64_t a3a6_2 = m32x32 (a3_2, a6);
int64_t a3a7_76 = m32x32 (a3_2, a7_38);
int64_t a3a8_38 = m32x32 (a3_2, a8_19);
int64_t a3a9_76 = m32x32 (a3_2, a9_38);
int64_t a4a4 = m32x32 (a4, a4);
int64_t a4a5_2 = m32x32 (a4_2, a5);
int64_t a4a6_38 = m32x32 (a4_2, a6_19);
int64_t a4a7_38 = m32x32 (a4, a7_38);
int64_t a4a8_38 = m32x32 (a4_2, a8_19);
int64_t a4a9_38 = m32x32 (a4, a9_38);
int64_t a5a5_38 = m32x32 (a5, a5_38);
int64_t a5a6_38 = m32x32 (a5_2, a6_19);
int64_t a5a7_76 = m32x32 (a5_2, a7_38);
int64_t a5a8_38 = m32x32 (a5_2, a8_19);
int64_t a5a9_76 = m32x32 (a5_2, a9_38);
int64_t a6a6_19 = m32x32 (a6, a6_19);
int64_t a6a7_38 = m32x32 (a6, a7_38);
int64_t a6a8_38 = m32x32 (a6_2, a8_19);
int64_t a6a9_38 = m32x32 (a6, a9_38);
int64_t a7a7_38 = m32x32 (a7, a7_38);
int64_t a7a8_38 = m32x32 (a7_2, a8_19);
int64_t a7a9_76 = m32x32 (a7_2, a9_38);
int64_t a8a8_19 = m32x32 (a8, a8_19);
int64_t a8a9_38 = m32x32 (a8, a9_38);
int64_t a9a9_38 = m32x32 (a9, a9_38);
int64_t x0 = a0a0 + a1a9_76 + a2a8_38 + a3a7_76 + a4a6_38 + a5a5_38;
int64_t x1 = a0a1_2 + a2a9_38 + a3a8_38 + a4a7_38 + a5a6_38;
int64_t x2 = a0a2_2 + a1a1_2 + a3a9_76 + a4a8_38 + a5a7_76 + a6a6_19;
int64_t x3 = a0a3_2 + a1a2_2 + a4a9_38 + a5a8_38 + a6a7_38;
int64_t x4 = a0a4_2 + a1a3_4 + a2a2 + a5a9_76 + a6a8_38 + a7a7_38;
int64_t x5 = a0a5_2 + a1a4_2 + a2a3_2 + a6a9_38 + a7a8_38;
int64_t x6 = a0a6_2 + a1a5_4 + a2a4_2 + a3a3_2 + a7a9_76 + a8a8_19;
int64_t x7 = a0a7_2 + a1a6_2 + a2a5_2 + a3a4_2 + a8a9_38;
int64_t x8 = a0a8_2 + a1a7_4 + a2a6_2 + a3a5_4 + a4a4 + a9a9_38;
int64_t x9 = a0a9_2 + a1a8_2 + a2a7_2 + a3a6_2 + a4a5_2;
carry0 = (x0 + (1 << 25)) >> 26;
x1 += carry0;
x0 -= (carry0 << 26);
carry4 = (x4 + (1 << 25)) >> 26;
x5 += carry4;
x4 -= (carry4 << 26);
carry1 = (x1 + (1 << 24)) >> 25;
x2 += carry1;
x1 -= (carry1 << 25);
carry5 = (x5 + (1 << 24)) >> 25;
x6 += carry5;
x5 -= (carry5 << 25);
carry2 = (x2 + (1 << 25)) >> 26;
x3 += carry2;
x2 -= (carry2 << 26);
carry6 = (x6 + (1 << 25)) >> 26;
x7 += carry6;
x6 -= (carry6 << 26);
carry3 = (x3 + (1 << 24)) >> 25;
x4 += carry3;
x3 -= (carry3 << 25);
carry7 = (x7 + (1 << 24)) >> 25;
x8 += carry7;
x7 -= (carry7 << 25);
carry4 = (x4 + (1 << 25)) >> 26;
x5 += carry4;
x4 -= (carry4 << 26);
carry8 = (x8 + (1 << 25)) >> 26;
x9 += carry8;
x8 -= (carry8 << 26);
carry9 = (x9 + (1 << 24)) >> 25;
x0 += carry9 * 19;
x9 -= (carry9 << 25);
carry0 = (x0 + (1 << 25)) >> 26;
x1 += carry0;
x0 -= (carry0 << 26);
x->w[0] = (int32_t)x0;
x->w[1] = (int32_t)x1;
x->w[2] = (int32_t)x2;
x->w[3] = (int32_t)x3;
x->w[4] = (int32_t)x4;
x->w[5] = (int32_t)x5;
x->w[6] = (int32_t)x6;
x->w[7] = (int32_t)x7;
x->w[8] = (int32_t)x8;
x->w[9] = (int32_t)x9;
}
/*
* A = 486662, the Montgomery curve coefficient of Curve25519.
* a24 stands for (A - 2)/4 = 121665.
*/
static void
rr25519_mul_121665 (rr25519 *x, const rr25519 *a)
{
int64_t carry0, carry1, carry2, carry3, carry4;
int64_t carry5, carry6, carry7, carry8, carry9;
int64_t v0 = m32x32 (a->w[0], 121665);
int64_t v1 = m32x32 (a->w[1], 121665);
int64_t v2 = m32x32 (a->w[2], 121665);
int64_t v3 = m32x32 (a->w[3], 121665);
int64_t v4 = m32x32 (a->w[4], 121665);
int64_t v5 = m32x32 (a->w[5], 121665);
int64_t v6 = m32x32 (a->w[6], 121665);
int64_t v7 = m32x32 (a->w[7], 121665);
int64_t v8 = m32x32 (a->w[8], 121665);
int64_t v9 = m32x32 (a->w[9], 121665);
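/* Since 121665 < 2^17, each product stays below about 2^43, so one
interleaved pass of carries (odd limbs, wrap-around at limb 9, then
even limbs) fully normalizes the result. */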
carry1 = (v1 + (1 << 24)) >> 25;
v2 += carry1;
v1 -= (carry1 << 25);
carry3 = (v3 + (1 << 24)) >> 25;
v4 += carry3;
v3 -= (carry3 << 25);
carry5 = (v5 + (1 << 24)) >> 25;
v6 += carry5;
v5 -= (carry5 << 25);
carry7 = (v7 + (1 << 24)) >> 25;
v8 += carry7;
v7 -= (carry7 << 25);
carry9 = (v9 + (1 << 24)) >> 25;
v0 += carry9 * 19;
v9 -= (carry9 << 25);
carry0 = (v0 + (1 << 25)) >> 26;
v1 += carry0;
v0 -= (carry0 << 26);
carry2 = (v2 + (1 << 25)) >> 26;
v3 += carry2;
v2 -= (carry2 << 26);
carry4 = (v4 + (1 << 25)) >> 26;
v5 += carry4;
v4 -= (carry4 << 26);
carry6 = (v6 + (1 << 25)) >> 26;
v7 += carry6;
v6 -= (carry6 << 26);
carry8 = (v8 + (1 << 25)) >> 26;
v9 += carry8;
v8 -= (carry8 << 26);
x->w[0] = (int32_t)v0;
x->w[1] = (int32_t)v1;
x->w[2] = (int32_t)v2;
x->w[3] = (int32_t)v3;
x->w[4] = (int32_t)v4;
x->w[5] = (int32_t)v5;
x->w[6] = (int32_t)v6;
x->w[7] = (int32_t)v7;
x->w[8] = (int32_t)v8;
x->w[9] = (int32_t)v9;
}
/* Copied from aes.c, changing the return type to 64-bit. */
static uint64_t
get_uint32_le (const unsigned char *b, unsigned int i)
{
return ( ((uint64_t)b[i ] )
| ((uint64_t)b[i + 1] << 8)
| ((uint64_t)b[i + 2] << 16)
| ((uint64_t)b[i + 3] << 24));
}
static uint64_t
get_uint24_le (const unsigned char *b, unsigned int i)
{
return ( ((uint64_t)b[i ] )
| ((uint64_t)b[i + 1] << 8)
| ((uint64_t)b[i + 2] << 16));
}
/* Expand byte representation into the redundant representation. */
static void
rr25519_expand (rr25519 *x, const unsigned char *src)
{
int64_t carry0, carry1, carry2, carry3, carry4;
int64_t carry5, carry6, carry7, carry8, carry9;
int64_t v0 = get_uint32_le (src, 0);
int64_t v1 = get_uint24_le (src, 4) << 6;
int64_t v2 = get_uint24_le (src, 7) << 5;
int64_t v3 = get_uint24_le (src, 10) << 3;
int64_t v4 = get_uint24_le (src, 13) << 2;
int64_t v5 = get_uint32_le (src, 16);
int64_t v6 = get_uint24_le (src, 20) << 7;
int64_t v7 = get_uint24_le (src, 23) << 5;
int64_t v8 = get_uint24_le (src, 26) << 4;
int64_t v9 = (get_uint24_le (src, 29) & 0x7fffff) << 2;
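/* The byte offsets and shifts place bit 8*i of SRC at the limb
weights 2^ceil(25.5*i); the mask on V9 drops bit 255 of SRC, which
X25519 ignores (RFC 7748). */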
carry1 = (v1 + (1 << 24)) >> 25;
v2 += carry1;
v1 -= (carry1 << 25);
carry3 = (v3 + (1 << 24)) >> 25;
v4 += carry3;
v3 -= (carry3 << 25);
carry5 = (v5 + (1 << 24)) >> 25;
v6 += carry5;
v5 -= (carry5 << 25);
carry7 = (v7 + (1 << 24)) >> 25;
v8 += carry7;
v7 -= (carry7 << 25);
carry9 = (v9 + (1 << 24)) >> 25;
v0 += carry9 * 19;
v9 -= (carry9 << 25);
carry0 = (v0 + (1 << 25)) >> 26;
v1 += carry0;
v0 -= (carry0 << 26);
carry2 = (v2 + (1 << 25)) >> 26;
v3 += carry2;
v2 -= (carry2 << 26);
carry4 = (v4 + (1 << 25)) >> 26;
v5 += carry4;
v4 -= (carry4 << 26);
carry6 = (v6 + (1 << 25)) >> 26;
v7 += carry6;
v6 -= (carry6 << 26);
carry8 = (v8 + (1 << 25)) >> 26;
v9 += carry8;
v8 -= (carry8 << 26);
x->w[0] = (int32_t)v0;
x->w[1] = (int32_t)v1;
x->w[2] = (int32_t)v2;
x->w[3] = (int32_t)v3;
x->w[4] = (int32_t)v4;
x->w[5] = (int32_t)v5;
x->w[6] = (int32_t)v6;
x->w[7] = (int32_t)v7;
x->w[8] = (int32_t)v8;
x->w[9] = (int32_t)v9;
}
/* Strong reduction: produce the canonical value in [0, 2^255-19). */
static void
rr25519_reduce (rr25519 *x, const rr25519 *a)
{
int32_t q;
int32_t carry0, carry1, carry2, carry3, carry4;
int32_t carry5, carry6, carry7, carry8, carry9;
int32_t x0 = a->w[0];
int32_t x1 = a->w[1];
int32_t x2 = a->w[2];
int32_t x3 = a->w[3];
int32_t x4 = a->w[4];
int32_t x5 = a->w[5];
int32_t x6 = a->w[6];
int32_t x7 = a->w[7];
int32_t x8 = a->w[8];
int32_t x9 = a->w[9];
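/* Informally, the cascade below computes q = floor(value / p), which
is 0 or 1 for the nearly-reduced inputs produced above; adding 19*q
and discarding the carry out of limb 9 (the final carry9 step is not
folded back) leaves the unique representative in [0, p). */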
q = (19 * x9 + (1 << 24)) >> 25;
q = (x0 + q) >> 26;
q = (x1 + q) >> 25;
q = (x2 + q) >> 26;
q = (x3 + q) >> 25;
q = (x4 + q) >> 26;
q = (x5 + q) >> 25;
q = (x6 + q) >> 26;
q = (x7 + q) >> 25;
q = (x8 + q) >> 26;
q = (x9 + q) >> 25;
x0 += 19 * q;
carry0 = x0 >> 26;
x1 += carry0;
x0 -= (carry0 << 26);
carry1 = x1 >> 25;
x2 += carry1;
x1 -= (carry1 << 25);
carry2 = x2 >> 26;
x3 += carry2;
x2 -= (carry2 << 26);
carry3 = x3 >> 25;
x4 += carry3;
x3 -= (carry3 << 25);
carry4 = x4 >> 26;
x5 += carry4;
x4 -= (carry4 << 26);
carry5 = x5 >> 25;
x6 += carry5;
x5 -= (carry5 << 25);
carry6 = x6 >> 26;
x7 += carry6;
x6 -= (carry6 << 26);
carry7 = x7 >> 25;
x8 += carry7;
x7 -= (carry7 << 25);
carry8 = x8 >> 26;
x9 += carry8;
x8 -= (carry8 << 26);
carry9 = x9 >> 25;
x9 -= (carry9 << 25);
x->w[0] = x0;
x->w[1] = x1;
x->w[2] = x2;
x->w[3] = x3;
x->w[4] = x4;
x->w[5] = x5;
x->w[6] = x6;
x->w[7] = x7;
x->w[8] = x8;
x->w[9] = x9;
}
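/* Contract the redundant representation into the canonical 32-byte
little-endian form. */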
static void
rr25519_contract (unsigned char *dst, const rr25519 *x)
{
rr25519 t[1];
rr25519_reduce (t, x);
dst[0] = t->w[0] >> 0;
dst[1] = t->w[0] >> 8;
dst[2] = t->w[0] >> 16;
dst[3] = (t->w[0] >> 24) | (t->w[1] << 2);
dst[4] = t->w[1] >> 6;
dst[5] = t->w[1] >> 14;
dst[6] = (t->w[1] >> 22) | (t->w[2] << 3);
dst[7] = t->w[2] >> 5;
dst[8] = t->w[2] >> 13;
dst[9] = (t->w[2] >> 21) | (t->w[3] << 5);
dst[10] = t->w[3] >> 3;
dst[11] = t->w[3] >> 11;
dst[12] = (t->w[3] >> 19) | (t->w[4] << 6);
dst[13] = t->w[4] >> 2;
dst[14] = t->w[4] >> 10;
dst[15] = t->w[4] >> 18;
dst[16] = t->w[5] >> 0;
dst[17] = t->w[5] >> 8;
dst[18] = t->w[5] >> 16;
dst[19] = (t->w[5] >> 24) | (t->w[6] << 1);
dst[20] = t->w[6] >> 7;
dst[21] = t->w[6] >> 15;
dst[22] = (t->w[6] >> 23) | (t->w[7] << 3);
dst[23] = t->w[7] >> 5;
dst[24] = t->w[7] >> 13;
dst[25] = (t->w[7] >> 21) | (t->w[8] << 4);
dst[26] = t->w[8] >> 4;
dst[27] = t->w[8] >> 12;
dst[28] = (t->w[8] >> 20) | (t->w[9] << 6);
dst[29] = t->w[9] >> 2;
dst[30] = t->w[9] >> 10;
dst[31] = t->w[9] >> 18;
}
/* fe: Field Element */
typedef bn256 fe;
#define fe_add mod25638_add
#define fe_sub mod25638_sub
#define fe_mul mod25638_mul
#define fe_sqr mod25638_sqr
#define fe_a24 mod25638_mul_121665
#define fe_swap_cond bn256_swap_cond
#define fe_0 mod25638_0
#define fe_1 mod25638_1
#define fe_copy mod25638_copy
typedef rr25519 fe;
#define fe_add rr25519_add
#define fe_sub rr25519_sub
#define fe_mul rr25519_mul
#define fe_sqr rr25519_sqr
#define fe_a24 rr25519_mul_121665
#define fe_swap_cond rr25519_swap_cond
#define fe_0 rr25519_0
#define fe_1 rr25519_1
#define fe_copy rr25519_copy
#define fe_expand rr25519_expand
#define fe_contract rr25519_contract
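/* With the fe_* aliases now bound to the redundant representation,
compute_nQ below only needs fe_expand/fe_contract at its boundaries
to interoperate with the remaining bn256-based inversion. */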
/**
* @brief Process Montgomery double-and-add
@@ -197,7 +853,8 @@ compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
{
int i;
fe x0[1], z0[1], x1[1], z1[1];
fe t0[1], t1[1];
fe t0[1], t1[1], q_x_rr[1];
bn256 x0bn[1], z0bn[1];
uint32_t swap = 0;
const unsigned char *np = (const unsigned char *)n->word;
@@ -206,7 +863,8 @@ compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
fe_0 (z0);
/* P1 = (X:1) */
fe_copy (x1, q_x);
fe_expand (x1, (const unsigned char *)q_x);
fe_copy (q_x_rr, x1);
fe_copy (z1, x0);
for (i = 254; i >= 0; i--)
@@ -217,7 +875,7 @@ compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
fe_swap_cond (x0, x1, swap);
fe_swap_cond (z0, z1, swap);
swap = b;
mont_d_and_a (x0, z0, x1, z1, q_x, t0, t1);
mont_d_and_a (x0, z0, x1, z1, q_x_rr, t0, t1);
}
/* We know the LSB of N is always 0. Thus, the result is always in P0. */
@@ -226,8 +884,10 @@ compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
* but returns 0 (like the implementation of z^(p-2)), thus, RES will
* be 0 in that case, which is the correct value.
*/
mod256_inv (res, z0, 1);
mod25638_mul (res, res, x0);
fe_contract ((unsigned char *)x0bn, x0);
fe_contract ((unsigned char *)z0bn, z0);
mod256_inv (res, z0bn, 1);
mod25638_mul (res, res, x0bn);
mod25519_reduce (res);
}