Refactor X25519 implementation.

Signed-off-by: NIIBE Yutaka <gniibe@fsij.org>
2024-09-20 02:40:08 +00:00 · 2023-12-01 15:19:10 +09:00 · 2023-12-01 15:19:10 +09:00 · 5a05a619c0
commit 5a05a619c0
parent f3cb2694ce
7 changed files with 163 additions and 115 deletions
--- a/12
+++ b/12
@ -1,3 +1,15 @@
+2023-12-01  NIIBE Yutaka  <gniibe@fsij.org>
+
+	* src/bn.c (bn256_swap_cond): New.
+	(bn256_set_cond): New.
+	* src/mod25638.c (mod25519_reduce): Use bn256_set_cond.
+	* src/ecc-x25519.c: Rename from ecc-mont.c, as the computation is
+	actually X25519; it is the host side that uses a big-endian
+	private key.
+	(mont_d_and_a): Refactor not using struct pt.
+	(compute_nQ): Use bn256_swap_cond.
+	* src/Makefile (CSRC): Follow the rename to ecc-x25519.c.
+
 2023-09-05  NIIBE Yutaka  <gniibe@fsij.org>

 	* VERSION: 2.1.
--- a/src/Makefile
+++ b/src/Makefile
@ -11,7 +11,7 @@ CSRC = main.c \
        aes.c gcm-siv.c \
 	bn.c mod.c \
 	modp256k1.c jpc_p256k1.c ec_p256k1.c call-ec_p256k1.c \
-	mod25638.c ecc-ed25519.c ecc-mont.c sha512.c \
+	mod25638.c ecc-ed25519.c ecc-x25519.c sha512.c \
 	p448.c ecc-x448.c \
 	ecc-ed448.c shake256.c \
 	random.c neug.c sha256.c
--- a/src/bn.c
+++ b/src/bn.c
@ -1,7 +1,7 @@
 /*
 * bn.c -- 256-bit (and 512-bit) bignum calculation
 *
- * Copyright (C) 2011, 2013, 2014, 2019
+ * Copyright (C) 2011, 2013, 2014, 2019, 2023
 *               Free Software Initiative of Japan
 * Author: NIIBE Yutaka <gniibe@fsij.org>
 *
@ -18,7 +18,7 @@
 * License for more details.
 *
 * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 */

@ -425,3 +425,35 @@ bn256_random (bn256 *X)
    }
 }
 #endif
+
+void
+bn256_swap_cond (bn256 *A, bn256 *B, uint32_t b)
+{ /* Exchange the words of A and B when b is 1; leave both unchanged
+   * when b is 0.  The code has no branch on b: every word is loaded,
+   * XORed and stored in either case.  Any other value of b yields a
+   * partial bit mask and is not a meaningful argument.  */
+  uint32_t mask = 0UL - b; /* 0xffffffff for b == 1, 0 for b == 0 */
+  int i;
+  uint32_t *p = A->word;
+  uint32_t *q = B->word;
+
+  for (i = 0; i < BN256_WORDS; i++)
+    {
+      uint32_t t = mask & (*p^*q); /* differing bits, or 0 when not swapping */
+      *p++ ^= t; /* word of A turns into word of B when mask is all ones */
+      *q++ ^= t; /* word of B turns into word of A when mask is all ones */
+    }
+}
+
+void
+bn256_set_cond (bn256 *A, const bn256 *B, uint32_t b)
+{ /* Copy B into A when b is 1; keep A as it is when b is 0.  The code
+   * has no branch on b: each word of A is rewritten either way, using
+   * two complementary masks.  b is expected to be 0 or 1 only.  */
+  uint32_t mask1 = 0UL - b; /* selects B: all ones for b == 1, 0 for b == 0 */
+  uint32_t mask2 = b - 1UL; /* selects A: all ones for b == 0, 0 for b == 1 */
+  int i;
+  uint32_t *p = A->word;
+  const uint32_t *q = B->word;
+
+  for (i = 0; i < BN256_WORDS; i++)
+    {
+      *p = (*p & mask2) | (*q++ & mask1); /* exactly one operand is non-masked */
+      p++;
+    }
+}
--- a/src/bn.h
+++ b/src/bn.h
@ -21,3 +21,5 @@ int bn256_is_even (const bn256 *X);
 int bn256_is_ge (const bn256 *A, const bn256 *B);
 int bn256_cmp (const bn256 *A, const bn256 *B);
 void bn256_random (bn256 *X);
+void bn256_swap_cond (bn256 *A, bn256 *B, uint32_t b);
+void bn256_set_cond (bn256 *A, const bn256 *B, uint32_t b);
--- a/src/ecc-x25519.c
+++ b/src/ecc-x25519.c
@ -1,8 +1,9 @@
 /*                                                    -*- coding: utf-8 -*-
- * ecc-mont.c - Elliptic curve computation for
+ * ecc-x25519.c - Elliptic curve computation for
 *                the Montgomery curve: y^2 = x^3 + 486662*x^2 + x.
 *
- * Copyright (C) 2014, 2015, 2017, 2021  Free Software Initiative of Japan
+ * Copyright (C) 2014, 2015, 2017, 2021, 2023
+ *               Free Software Initiative of Japan
 * Author: NIIBE Yutaka <gniibe@fsij.org>
 *
 * This file is a part of Gnuk, a GnuPG USB Token implementation.
@ -18,7 +19,7 @@
 * License for more details.
 *
 * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 */

@ -99,41 +100,94 @@ mod25638_mul_121665 (bn256 *x, const bn256 *a)
 }


-typedef struct
-{
-  bn256 x[1];
-  bn256 z[1];
-} pt;
-
+/* fe: Field Element */
+typedef bn256 fe;
+#define fe_add mod25638_add
+#define fe_sub mod25638_sub
+#define fe_mul mod25638_mul
+#define fe_sqr mod25638_sqr
+#define fe_m_d mod25638_mul_121665

 /**
 * @brief  Process Montgomery double-and-add
 *
 * With Q0, Q1, DIF (= Q0 - Q1), compute PRD = 2Q0, SUM = Q0 + Q1
- * Q0 and Q1 are clobbered.
+ * On return, PRD is in Q0, SUM is in Q1
+ * Caller provides temporary T0 and T1
 *
+ * Note: indentation graphically expresses the ladder.
 */
 static void
-mont_d_and_a (pt *prd, pt *sum, pt *q0, pt *q1, const bn256 *dif_x)
+mont_d_and_a (fe *x0, fe *z0, fe *x1, fe *z1, const fe *dif_x, fe *t0, fe *t1)
 {
-                                        mod25638_add (sum->x, q1->x, q1->z);
-                                        mod25638_sub (q1->z, q1->x, q1->z);
-  mod25638_add (prd->x, q0->x, q0->z);
-  mod25638_sub (q0->z, q0->x, q0->z);
-                                        mod25638_mul (q1->x, q0->z, sum->x);
-                                        mod25638_mul (q1->z, prd->x, q1->z);
-  mod25638_sqr (q0->x, prd->x);
-  mod25638_sqr (q0->z, q0->z);
-                                        mod25638_add (sum->x, q1->x, q1->z);
-                                        mod25638_sub (q1->z, q1->x, q1->z);
-  mod25638_mul (prd->x, q0->x, q0->z);
-  mod25638_sub (q0->z, q0->x, q0->z);
-                                        mod25638_sqr (sum->x, sum->x);
-                                        mod25638_sqr (sum->z, q1->z);
-  mod25638_mul_121665 (prd->z, q0->z);
-                                        mod25638_mul (sum->z, sum->z, dif_x);
-  mod25638_add (prd->z, q0->x, prd->z);
-  mod25638_mul (prd->z, prd->z, q0->z);
+#define xp   x0
+#define zp   z0
+#define xs   x1
+#define zs   z1
+/* xp/zp and xs/zs above name where the doubled point and the sum end up.
+ * Each tmpN below names the caller-supplied buffer that holds that
+ * intermediate value; a buffer is reused once its previous content is
+ * no longer read.  */
+#define tmp0 t0
+#define tmp1 t1
+#define tmp2 x1
+#define tmp3 x0
+#define tmp4 t0
+#define tmp5 t1
+#define tmp6 z0
+#define tmp7 x1
+#define tmp8 z1
+#define tmp9 t0
+#define tmpA t1
+#define tmpB t0
+#define tmpC t0
+#define tmpD z0
+/* The less indented calls read only values derived from (x0, z0); the
+ * more indented ones also involve (x1, z1).  The step comments assume
+ * that fe_add, fe_sub and fe_mul (r, a, b) store the sum, difference
+ * and product of a and b into r, and that fe_sqr and fe_m_d (r, a)
+ * store the square of a and 121665 times a into r -- TODO confirm
+ * against mod25638.c.  */
+                                    fe_add (tmp0,
+                                              x1,
+                                              z1); /* tmp0 = x1 + z1 */
+                                            fe_sub (tmp1,
+                                                      x1,
+                                                      z1); /* tmp1 = x1 - z1 */
+  fe_add (tmp2,
+            x0,
+            z0); /* tmp2 = x0 + z0, kept in x1 (old x1 already consumed) */
+          fe_sub (tmp3,
+                    x0,
+                    z0); /* tmp3 = x0 - z0, kept in x0 */
+                                    fe_mul (tmp4,
+                                            tmp3,
+                                            tmp0); /* tmp4 = tmp3 * tmp0 */
+                                            fe_mul (tmp5,
+                                                    tmp2,
+                                                    tmp1); /* tmp5 = tmp2 * tmp1 */
+  fe_sqr (tmp6,
+          tmp2); /* tmp6 = tmp2 squared, kept in z0 */
+          fe_sqr (tmp7,
+                  tmp3); /* tmp7 = tmp3 squared, kept in x1 */
+                                    fe_add (tmp8,
+                                            tmp4,
+                                            tmp5); /* tmp8 = tmp4 + tmp5 */
+                                            fe_sub (tmp9,
+                                                    tmp4,
+                                                    tmp5); /* tmp9 = tmp4 - tmp5 */
+  fe_mul (xp,
+          tmp6,
+          tmp7); /* X of the doubled point, written to x0 */
+          fe_sub (tmpA,
+                  tmp6,
+                  tmp7); /* tmpA = tmp6 - tmp7 */
+                                    fe_sqr (xs,
+                                            tmp8); /* X of the sum, written to x1 */
+                                            fe_sqr (tmpB,
+                                                    tmp9); /* tmpB = tmp9 squared */
+                                            fe_mul (zs,
+                                                    tmpB, dif_x); /* Z of the sum, written to z1 */
+          fe_m_d (tmpC,
+                  tmpA); /* tmpC = 121665 * tmpA */
+          fe_add (tmpD,
+                  tmp6,
+                  tmpC); /* tmpD = tmp6 + tmpC */
+          fe_mul (zp,
+                  tmpD,
+                  tmpA); /* Z of the doubled point, written to z0 */
 }


@ -147,42 +201,30 @@ mont_d_and_a (pt *prd, pt *sum, pt *q0, pt *q1, const bn256 *dif_x)
 static void
 compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
 {
-  int i, j;
-  pt p0[1], p1[1], p0_[1], p1_[1];
+  int i;
+  bn256 x0[1], z0[1], x1[1], z1[1];
+  bn256 t0[1], t1[1];
+  uint32_t swap = 0;
+  const unsigned char *np = (const unsigned char *)n->word;

  /* P0 = O = (1:0)  */
-  memset (p0->x, 0, sizeof (bn256));
-  p0->x->word[0] = 1;
-  memset (p0->z, 0, sizeof (bn256));
+  memset (x0, 0, sizeof (bn256));
+  x0->word[0] = 1;
+  memset (z0, 0, sizeof (bn256));

  /* P1 = (X:1) */
-  memcpy (p1->x, q_x, sizeof (bn256));
-  memset (p1->z, 0, sizeof (bn256));
-  p1->z->word[0] = 1;
+  memcpy (x1, q_x, sizeof (bn256));
+  memcpy (z1, x0, sizeof (bn256));

-  for (i = 0; i < 8; i++)
+  for (i = 254; i >= 0; i--)
    {
-      uint32_t u = n->word[7-i];
+      uint32_t b = (np[i>>3]>>(i&7))&1;

-      for (j = 0; j < 16; j++)
-	{
-	  pt *q0, *q1;
-	  pt *sum_n, *prd_n;
-
-	  if ((u & 0x80000000))
-	    q0 = p1,  q1 = p0,  sum_n = p0_, prd_n = p1_;
-	  else
-	    q0 = p0,  q1 = p1,  sum_n = p1_, prd_n = p0_;
-	  mont_d_and_a (prd_n, sum_n, q0, q1, q_x);
-
-	  if ((u & 0x40000000))
-	    q0 = p1_, q1 = p0_, sum_n = p0,  prd_n = p1;
-	  else
-	    q0 = p0_, q1 = p1_, sum_n = p1,  prd_n = p0;
-	  mont_d_and_a (prd_n, sum_n, q0, q1, q_x);
-
-	  u <<= 2;
-	}
+      swap ^= b;
+      bn256_swap_cond (x0, x1, swap);
+      bn256_swap_cond (z0, z1, swap);
+      swap = b;
+      mont_d_and_a (x0, z0, x1, z1, q_x, t0, t1);
    }

  /* We know the LSB of N is always 0.  Thus, result is always in P0.  */
@ -191,8 +233,8 @@ compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
   * but returns 0 (like the implementation of z^(p-2)), thus, RES will
   * be 0 in that case, which is correct value.
   */
-  mod_inv (res, p0->z, p25519);
-  mod25638_mul (res, res, p0->x);
+  mod_inv (res, z0, p25519);
+  mod25638_mul (res, res, x0);
  mod25519_reduce (res);
 }

--- a/src/mod25638.c
+++ b/src/mod25638.c
@ -231,57 +231,16 @@ void
 mod25519_reduce (bn256 *X)
 {
  uint32_t q;
-  bn256 r0[1], r1[1];
-  int flag;
+  bn256 R[1]; /* scratch that receives X + 19 in the second step */

-  memcpy (r0, X, sizeof (bn256));
-  q = (r0->word[7] >> 31);
-  r0->word[7] &= 0x7fffffff;
-  if (q)
-    {
-      bn256_add_uint (r0, r0, 19);
-      q = (r0->word[7] >> 31);
-      r0->word[7] &= 0x7fffffff;
-      if (q)
-	{
-	  bn256_add_uint (r1, r0, 19);
-	  q = (r1->word[7] >> 31);
-	  r1->word[7] &= 0x7fffffff;
-	  flag = 0;
-	}
-      else
-	flag = 1;
-    }
-  else
-    {
-      bn256_add_uint (r1, r0, 19);
-      q = (r1->word[7] >> 31);	 /* dummy */
-      r1->word[7] &= 0x7fffffff; /* dummy */
-      if (q)
-	flag = 2;
-      else
-	flag = 3;
-    }
+  q = (X->word[7] >> 31); /* top bit of word[7]; presumably bit 255 of X -- TODO confirm word order in bn.h */
+  X->word[7] &= 0x7fffffff; /* clear that top bit */

-  if (flag)
-    {
-      bn256_add_uint (r1, r0, 19);
-      q = (r1->word[7] >> 31);
-      r1->word[7] &= 0x7fffffff;
-      if (q)
-	memcpy (X, r1, sizeof (bn256));
-      else
-	memcpy (X, r0, sizeof (bn256));
-    }
-  else
-    {
-      if (q)
-	{
-	  asm volatile ("" : : "r" (q) : "memory");
-	  memcpy (X, r1, sizeof (bn256));
-	  asm volatile ("" : : "r" (q) : "memory");
-	}
-      else
-	memcpy (X, r1, sizeof (bn256));
-    }
+  bn256_add_uint (X, X, q * 19); /* adds 19 when the top bit was set, 0 otherwise; presumably folds 2^255 back in as 19 (mod 2^255 - 19) -- TODO confirm */
+  /* Second step, with no branch on the data: if X + 19 sets the top bit
+   * of word[7], the masked sum is taken as the result, otherwise X is
+   * kept.  Assuming word[7] is the most significant 32-bit word, this
+   * amounts to subtracting 2^255 - 19 once when X is at least that
+   * large -- TODO confirm.  */
+  bn256_add_uint (R, X, 19); /* presumably R = X + 19 -- TODO confirm bn256_add_uint semantics */
+  q = (R->word[7] >> 31); /* 1 when the sum reached the top bit */
+  R->word[7] &= 0x7fffffff; /* drop the top bit from the candidate */
+
+  bn256_set_cond (X, R, q); /* X = R when q is 1, X unchanged when q is 0 */
 }
--- a/src/openpgp-do.c
+++ b/src/openpgp-do.c
@ -1545,6 +1545,7 @@ proc_key_import (const uint8_t *data, int len)
      if (len - 12 != 32)
 	return 0;		/* Error.  */

+      /* Reverse the byte order, because it's a big-endian MPI from server.  */
      for (i = 0; i < 32; i++)
 	priv[31-i] = data[12+i];
      ecdh_compute_public_25519 (priv, pubkey);