Refactor X25519 implementation.

Signed-off-by: NIIBE Yutaka <gniibe@fsij.org>
2024-09-20 02:40:08 +00:00 · 2023-12-01 15:19:10 +09:00 · 2023-12-01 15:19:10 +09:00 · 5a05a619c0
commit 5a05a619c0
parent f3cb2694ce
7 changed files with 163 additions and 115 deletions
--- a/12
+++ b/12
@ -1,3 +1,15 @@
 2023-12-01  NIIBE Yutaka  <gniibe@fsij.org>
 	* src/bn.c (bn256_swap_cond): New.
 	(bn256_set_cond): New.
 	* src/mod25638.c (mod25519_reduce): Use bn256_set_cond.
 	* src/ecc-x25519.c: Rename from ecc-mont.c, as computation is
 	actually X25519, while it's host side which uses big-endian
 	private key.
 	(mont_d_and_a): Refactor not using struct pt.
 	(compute_nQ): Use bn256_swap_cond.
 	* src/Makefile (CSRC): Follow the rename to ecc-x25519.c.
 2023-09-05  NIIBE Yutaka  <gniibe@fsij.org>
 	* VERSION: 2.1.
--- a/src/Makefile
+++ b/src/Makefile
@ -11,7 +11,7 @@ CSRC = main.c \
        aes.c gcm-siv.c \
 	bn.c mod.c \
 	modp256k1.c jpc_p256k1.c ec_p256k1.c call-ec_p256k1.c \
-	mod25638.c ecc-ed25519.c ecc-mont.c sha512.c \
+	mod25638.c ecc-ed25519.c ecc-x25519.c sha512.c \
 	p448.c ecc-x448.c \
 	ecc-ed448.c shake256.c \
 	random.c neug.c sha256.c
--- a/src/bn.c
+++ b/src/bn.c
@ -1,7 +1,7 @@
 /*
 * bn.c -- 256-bit (and 512-bit) bignum calculation
 *
- * Copyright (C) 2011, 2013, 2014, 2019
+ * Copyright (C) 2011, 2013, 2014, 2019, 2023
 *               Free Software Initiative of Japan
 * Author: NIIBE Yutaka <gniibe@fsij.org>
 *
@ -18,7 +18,7 @@
 * License for more details.
 *
 * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 */
@ -425,3 +425,35 @@ bn256_random (bn256 *X)
    }
 }
 #endif
 /**
  * @brief  Conditionally exchange the contents of A and B
  *
  * When B_FLAG (parameter b) is 1, every word of A and B is swapped;
  * when it is 0, both are left as they are.  The exchange is done by
  * masking an XOR difference, so no branch depends on b.
  * The mask is all-ones only for b == 1; callers are expected to
  * pass 0 or 1.
  */
 void
 bn256_swap_cond (bn256 *A, bn256 *B, uint32_t b)
 {
  /* 0xffffffff when b == 1, 0 when b == 0.  */
  const uint32_t select = 0UL - b;
  int k;
  for (k = 0; k < BN256_WORDS; k++)
    {
      /* Bits that differ between the two words, kept only if selected.  */
      uint32_t diff = (A->word[k] ^ B->word[k]) & select;
      A->word[k] ^= diff;
      B->word[k] ^= diff;
    }
 }
 /**
  * @brief  Conditionally copy B into A
  *
  * When b is 1, A receives the value of B; when b is 0, A keeps its
  * value.  Each word is formed by combining both operands through
  * complementary masks, so no branch depends on b.
  * The masks are complementary only for b == 0 or b == 1.
  */
 void
 bn256_set_cond (bn256 *A, const bn256 *B, uint32_t b)
 {
  /* take_b: all-ones when b == 1; keep_a: all-ones when b == 0.  */
  const uint32_t take_b = 0UL - b;
  const uint32_t keep_a = b - 1UL;
  int k;
  for (k = 0; k < BN256_WORDS; k++)
    A->word[k] = (A->word[k] & keep_a) | (B->word[k] & take_b);
 }
--- a/src/bn.h
+++ b/src/bn.h
@ -21,3 +21,5 @@ int bn256_is_even (const bn256 *X);
 int bn256_is_ge (const bn256 *A, const bn256 *B);
 int bn256_cmp (const bn256 *A, const bn256 *B);
 void bn256_random (bn256 *X);
 void bn256_swap_cond (bn256 *A, bn256 *B, uint32_t b);
 void bn256_set_cond (bn256 *A, const bn256 *B, uint32_t b);
--- a/src/ecc-x25519.c
+++ b/src/ecc-x25519.c
@ -1,8 +1,9 @@
 /*                                                    -*- coding: utf-8 -*-
- * ecc-mont.c - Elliptic curve computation for
+ * ecc-x25519.c - Elliptic curve computation for
- *              the Montgomery curve: y^2 = x^3 + 486662*x^2 + x.
+ *                the Montgomery curve: y^2 = x^3 + 486662*x^2 + x.
 *
- * Copyright (C) 2014, 2015, 2017, 2021  Free Software Initiative of Japan
+ * Copyright (C) 2014, 2015, 2017, 2021, 2023
 *               Free Software Initiative of Japan
 * Author: NIIBE Yutaka <gniibe@fsij.org>
 *
 * This file is a part of Gnuk, a GnuPG USB Token implementation.
@ -18,7 +19,7 @@
 * License for more details.
 *
 * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 */
@ -99,41 +100,94 @@ mod25638_mul_121665 (bn256 *x, const bn256 *a)
 }
-typedef struct
+/* fe: Field Element */
-{
+typedef bn256 fe;
-  bn256 x[1];
+#define fe_add mod25638_add
-  bn256 z[1];
+#define fe_sub mod25638_sub
-} pt;
+#define fe_mul mod25638_mul
-
+#define fe_sqr mod25638_sqr
 #define fe_m_d mod25638_mul_121665
 /**
 * @brief  Process Montgomery double-and-add
 *
 * With Q0, Q1, DIF (= Q0 - Q1), compute PRD = 2Q0, SUM = Q0 + Q1
- * Q0 and Q1 are clobbered.
+ * On return, PRD is in Q0, SUM is in Q1
 * Caller provides temporary T0 and T1
 *
 * Note: indentation graphically expresses the ladder.
 */
 static void
-mont_d_and_a (pt *prd, pt *sum, pt *q0, pt *q1, const bn256 *dif_x)
+mont_d_and_a (fe *x0, fe *z0, fe *x1, fe *z1, const fe *dif_x, fe *t0, fe *t1)
 {
-                                        mod25638_add (sum->x, q1->x, q1->z);
+#define xp   x0
-                                        mod25638_sub (q1->z, q1->x, q1->z);
+#define zp   z0
-  mod25638_add (prd->x, q0->x, q0->z);
+#define xs   x1
-  mod25638_sub (q0->z, q0->x, q0->z);
+#define zs   z1
-                                        mod25638_mul (q1->x, q0->z, sum->x);
+
-                                        mod25638_mul (q1->z, prd->x, q1->z);
+#define tmp0 t0
-  mod25638_sqr (q0->x, prd->x);
+#define tmp1 t1
-  mod25638_sqr (q0->z, q0->z);
+#define tmp2 x1
-                                        mod25638_add (sum->x, q1->x, q1->z);
+#define tmp3 x0
-                                        mod25638_sub (q1->z, q1->x, q1->z);
+#define tmp4 t0
-  mod25638_mul (prd->x, q0->x, q0->z);
+#define tmp5 t1
-  mod25638_sub (q0->z, q0->x, q0->z);
+#define tmp6 z0
-                                        mod25638_sqr (sum->x, sum->x);
+#define tmp7 x1
-                                        mod25638_sqr (sum->z, q1->z);
+#define tmp8 z1
-  mod25638_mul_121665 (prd->z, q0->z);
+#define tmp9 t0
-                                        mod25638_mul (sum->z, sum->z, dif_x);
+#define tmpA t1
-  mod25638_add (prd->z, q0->x, prd->z);
+#define tmpB t0
-  mod25638_mul (prd->z, prd->z, q0->z);
+#define tmpC t0
 #define tmpD z0
                                    fe_add (tmp0,
                                              x1,
                                              z1);
                                            fe_sub (tmp1,
                                                      x1,
                                                      z1);
  fe_add (tmp2,
            x0,
            z0);
          fe_sub (tmp3,
                    x0,
                    z0);
                                    fe_mul (tmp4,
                                            tmp3,
                                            tmp0);
                                            fe_mul (tmp5,
                                                    tmp2,
                                                    tmp1);
  fe_sqr (tmp6,
          tmp2);
          fe_sqr (tmp7,
                  tmp3);
                                    fe_add (tmp8,
                                            tmp4,
                                            tmp5);
                                            fe_sub (tmp9,
                                                    tmp4,
                                                    tmp5);
  fe_mul (xp,
          tmp6,
          tmp7);
          fe_sub (tmpA,
                  tmp6,
                  tmp7);
                                    fe_sqr (xs,
                                            tmp8);
                                            fe_sqr (tmpB,
                                                    tmp9);
                                            fe_mul (zs,
                                                    tmpB, dif_x);
          fe_m_d (tmpC,
                  tmpA);
          fe_add (tmpD,
                  tmp6,
                  tmpC);
          fe_mul (zp,
                  tmpD,
                  tmpA);
 }
@ -147,42 +201,30 @@ mont_d_and_a (pt *prd, pt *sum, pt *q0, pt *q1, const bn256 *dif_x)
 static void
 compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
 {
-  int i, j;
+  int i;
-  pt p0[1], p1[1], p0_[1], p1_[1];
+  bn256 x0[1], z0[1], x1[1], z1[1];
  bn256 t0[1], t1[1];
  uint32_t swap = 0;
  const unsigned char *np = (const unsigned char *)n->word;
  /* P0 = O = (1:0)  */
-  memset (p0->x, 0, sizeof (bn256));
+  memset (x0, 0, sizeof (bn256));
-  p0->x->word[0] = 1;
+  x0->word[0] = 1;
-  memset (p0->z, 0, sizeof (bn256));
+  memset (z0, 0, sizeof (bn256));
  /* P1 = (X:1) */
-  memcpy (p1->x, q_x, sizeof (bn256));
+  memcpy (x1, q_x, sizeof (bn256));
-  memset (p1->z, 0, sizeof (bn256));
+  memcpy (z1, x0, sizeof (bn256));
  p1->z->word[0] = 1;
-  for (i = 0; i < 8; i++)
+  for (i = 254; i >= 0; i--)
    {
-      uint32_t u = n->word[7-i];
+      uint32_t b = (np[i>>3]>>(i&7))&1;
-      for (j = 0; j < 16; j++)
+      swap ^= b;
-	{
+      bn256_swap_cond (x0, x1, swap);
-	  pt *q0, *q1;
+      bn256_swap_cond (z0, z1, swap);
-	  pt *sum_n, *prd_n;
+      swap = b;
-
+      mont_d_and_a (x0, z0, x1, z1, q_x, t0, t1);
 	  if ((u & 0x80000000))
 	    q0 = p1,  q1 = p0,  sum_n = p0_, prd_n = p1_;
 	  else
 	    q0 = p0,  q1 = p1,  sum_n = p1_, prd_n = p0_;
 	  mont_d_and_a (prd_n, sum_n, q0, q1, q_x);
 	  if ((u & 0x40000000))
 	    q0 = p1_, q1 = p0_, sum_n = p0,  prd_n = p1;
 	  else
 	    q0 = p0_, q1 = p1_, sum_n = p1,  prd_n = p0;
 	  mont_d_and_a (prd_n, sum_n, q0, q1, q_x);
 	  u <<= 2;
 	}
    }
  /* We know the LSB of N is always 0.  Thus, result is always in P0.  */
@ -191,8 +233,8 @@ compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
   * but returns 0 (like the implementation of z^(p-2)), thus, RES will
   * be 0 in that case, which is correct value.
   */
-  mod_inv (res, p0->z, p25519);
+  mod_inv (res, z0, p25519);
-  mod25638_mul (res, res, p0->x);
+  mod25638_mul (res, res, x0);
  mod25519_reduce (res);
 }
--- a/src/mod25638.c
+++ b/src/mod25638.c
@ -231,57 +231,16 @@ void
 mod25519_reduce (bn256 *X)
 {
  uint32_t q;
-  bn256 r0[1], r1[1];
+  bn256 R[1];
  int flag;
-  memcpy (r0, X, sizeof (bn256));
+  q = (X->word[7] >> 31);
-  q = (r0->word[7] >> 31);
+  X->word[7] &= 0x7fffffff;
  r0->word[7] &= 0x7fffffff;
  if (q)
    {
      bn256_add_uint (r0, r0, 19);
      q = (r0->word[7] >> 31);
      r0->word[7] &= 0x7fffffff;
      if (q)
 	{
 	  bn256_add_uint (r1, r0, 19);
 	  q = (r1->word[7] >> 31);
 	  r1->word[7] &= 0x7fffffff;
 	  flag = 0;
 	}
      else
 	flag = 1;
    }
  else
    {
      bn256_add_uint (r1, r0, 19);
      q = (r1->word[7] >> 31);	 /* dummy */
      r1->word[7] &= 0x7fffffff; /* dummy */
      if (q)
 	flag = 2;
      else
 	flag = 3;
    }
-  if (flag)
+  bn256_add_uint (X, X, q * 19);
-    {
+
-      bn256_add_uint (r1, r0, 19);
+  bn256_add_uint (R, X, 19);
-      q = (r1->word[7] >> 31);
+  q = (R->word[7] >> 31);
-      r1->word[7] &= 0x7fffffff;
+  R->word[7] &= 0x7fffffff;
-      if (q)
+
-	memcpy (X, r1, sizeof (bn256));
+  bn256_set_cond (X, R, q);
      else
 	memcpy (X, r0, sizeof (bn256));
    }
  else
    {
      if (q)
 	{
 	  asm volatile ("" : : "r" (q) : "memory");
 	  memcpy (X, r1, sizeof (bn256));
 	  asm volatile ("" : : "r" (q) : "memory");
 	}
      else
 	memcpy (X, r1, sizeof (bn256));
    }
 }
--- a/src/openpgp-do.c
+++ b/src/openpgp-do.c
@ -1545,6 +1545,7 @@ proc_key_import (const uint8_t *data, int len)
      if (len - 12 != 32)
 	return 0;		/* Error.  */
      /* Revert the order, because it's big-endian MPI from server.  */
      for (i = 0; i < 32; i++)
 	priv[31-i] = data[12+i];
      ecdh_compute_public_25519 (priv, pubkey);