Adding the rest of files:

- ASM is disabled - Neug needs full rewrite - Flash is based on PiMoroni 4MB flash (needs adjust) Signed-off-by: Pol Henarejos <pol.henarejos@cttc.es>
2024-09-20 03:10:09 +00:00 · 2022-01-03 02:02:39 +01:00 · 2022-01-03 02:02:39 +01:00 · 0af5685495
commit 0af5685495
parent 0445f587f7
52 changed files with 21467 additions and 0 deletions
--- a/ac.c
+++ b/ac.c
@ -0,0 +1,301 @@
+/*
+ * ac.c -- Check access condition
+ *
+ * Copyright (C) 2010, 2012, 2013, 2017 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "config.h"
+
+#include "gnuk.h"
+#include "sha256.h"
+#include "random.h"
+
+uint8_t volatile auth_status;	/* Initialized to AC_NONE_AUTHORIZED */
+
+/*
+ * Tell whether the access condition AC_FLAG is currently satisfied.
+ *
+ * AC_ALWAYS yields 1 and AC_NEVER yields 0 unconditionally.  Any other
+ * value is used as a bit mask and tested against AUTH_STATUS.
+ *
+ * Returns 1 when access is granted, 0 otherwise.
+ */
+int
+ac_check_status (uint8_t ac_flag)
+{
+  if (ac_flag == AC_ALWAYS)
+    return 1;
+
+  if (ac_flag == AC_NEVER)
+    return 0;
+
+  return (ac_flag & auth_status) != 0;
+}
+
+/*
+ * Revoke the PSO:CDS authorization: clear the signing private key with
+ * gpg_do_clear_prvkey and drop AC_PSO_CDS_AUTHORIZED from AUTH_STATUS.
+ */
+void
+ac_reset_pso_cds (void)
+{
+  gpg_do_clear_prvkey (GPG_KEY_FOR_SIGNING);
+  auth_status &= ~AC_PSO_CDS_AUTHORIZED;
+}
+
+/*
+ * Revoke the "other" authorization: clear the decryption and the
+ * authentication private keys with gpg_do_clear_prvkey and drop
+ * AC_OTHER_AUTHORIZED from AUTH_STATUS.
+ */
+void
+ac_reset_other (void)
+{
+  gpg_do_clear_prvkey (GPG_KEY_FOR_DECRYPTION);
+  gpg_do_clear_prvkey (GPG_KEY_FOR_AUTHENTICATION);
+  auth_status &= ~AC_OTHER_AUTHORIZED;
+}
+
+/*
+ * Verify the user passphrase (PW1) and load the private key(s) with the
+ * keystring derived from it.
+ *
+ * ACCESS        AC_PSO_CDS_AUTHORIZED loads only the signing key; any other
+ *               value loads the decryption and authentication keys, and
+ *               checks the signing key when neither of those is present.
+ * PW, BUF_LEN   candidate passphrase and the number of bytes available in PW.
+ * PW_LEN_KNOWN  passphrase length claimed by the caller; a negative value
+ *               skips the length comparison.
+ * KS_PW1        stored PW1 keystring, or NULL to compare PW against the
+ *               initial passphrase from gpg_do_get_initial_pw_setting.
+ * SAVE_KS       when non-zero, the derived keystring is copied into
+ *               keystring_md_pw3.
+ *
+ * Returns 0 when PW1 is locked, -1 on failure (the PW1 error counter is
+ * incremented), otherwise the passphrase length (the counter is reset).
+ */
+int
+verify_user_0 (uint8_t access, const uint8_t *pw, int buf_len, int pw_len_known,
+	       const uint8_t *ks_pw1, int save_ks)
+{
+  int pw_len;
+  int r;
+  uint8_t keystring[KEYSTRING_MD_SIZE];
+  const uint8_t *salt;
+  int salt_len;
+
+  if (gpg_pw_locked (PW_ERR_PW1))
+    return 0;
+
+  if (ks_pw1 == NULL)
+    {
+      /* No stored keystring: compare against the initial passphrase,
+	 and derive the keystring without salt.  */
+      const uint8_t *initial_pw;
+
+      salt = NULL;
+      salt_len = 0;
+      gpg_do_get_initial_pw_setting (0, &pw_len, &initial_pw);
+      if ((pw_len_known >= 0 && pw_len_known != pw_len)
+	  || buf_len < pw_len
+	  || memcmp (pw, initial_pw, pw_len))
+	goto failure;
+    }
+  else
+    {
+      /* The first byte of the stored keystring carries the passphrase
+	 length (masked by PW_LEN_MASK); the salt is taken from it too.  */
+      pw_len = ks_pw1[0] & PW_LEN_MASK;
+      salt = KS_GET_SALT (ks_pw1);
+      salt_len = SALT_SIZE;
+
+      if ((pw_len_known >= 0 && pw_len_known != pw_len)
+	  || buf_len < pw_len)
+	goto failure;
+    }
+
+  s2k (salt, salt_len, pw, pw_len, keystring);
+  if (save_ks)
+    memcpy (keystring_md_pw3, keystring, KEYSTRING_MD_SIZE);
+
+  if (access == AC_PSO_CDS_AUTHORIZED)
+    r = gpg_do_load_prvkey (GPG_KEY_FOR_SIGNING, BY_USER, keystring);
+  else
+    {
+      int r1, r2;
+
+      r1 = gpg_do_load_prvkey (GPG_KEY_FOR_DECRYPTION, BY_USER, keystring);
+      r2 = gpg_do_load_prvkey (GPG_KEY_FOR_AUTHENTICATION, BY_USER, keystring);
+
+      /* Combine the two results: any negative result is a failure.  */
+      if (r1 < 0 || r2 < 0)
+	r = -1;
+      else if (r1 == 0)
+	{
+	  if (r2 == 0)
+	    /* No encryption/authentication keys, then, check signing key.  */
+	    r = gpg_do_load_prvkey (GPG_KEY_FOR_SIGNING, BY_USER, keystring);
+	  else
+	    r = r2;
+	}
+      else if (r2 == 0)
+	r = r1;
+      else
+	r = 1;
+    }
+
+  if (r < 0)
+    {
+      /* Also the target of the early length/passphrase mismatch jumps.  */
+    failure:
+      gpg_pw_increment_err_counter (PW_ERR_PW1);
+      return -1;
+    }
+
+  gpg_pw_reset_err_counter (PW_ERR_PW1);
+  return pw_len;
+}
+
+/*
+ * Verify for "Perform Security Operation : Compute Digital Signature"
+ *
+ * Runs verify_user_0 against the stored PW1 keystring and, when it
+ * returns a positive value, sets AC_PSO_CDS_AUTHORIZED in AUTH_STATUS.
+ * The value of verify_user_0 is returned unchanged.
+ */
+int
+verify_pso_cds (const uint8_t *pw, int pw_len)
+{
+  const uint8_t *pw1_keystring = gpg_do_read_simple (NR_DO_KEYSTRING_PW1);
+  int result;
+
+  DEBUG_INFO ("verify_pso_cds\r\n");
+  DEBUG_BYTE (pw_len);
+
+  result = verify_user_0 (AC_PSO_CDS_AUTHORIZED, pw, pw_len, pw_len,
+			  pw1_keystring, 0);
+  if (result <= 0)
+    return result;
+
+  auth_status |= AC_PSO_CDS_AUTHORIZED;
+  return result;
+}
+
+/*
+ * Verify the user passphrase for operations other than PSO:CDS.
+ *
+ * Runs verify_user_0 against the stored PW1 keystring and, when it
+ * returns a positive value, sets AC_OTHER_AUTHORIZED in AUTH_STATUS.
+ * The value of verify_user_0 is returned unchanged.
+ */
+int
+verify_other (const uint8_t *pw, int pw_len)
+{
+  const uint8_t *pw1_keystring = gpg_do_read_simple (NR_DO_KEYSTRING_PW1);
+  int result;
+
+  DEBUG_INFO ("verify_other\r\n");
+  DEBUG_BYTE (pw_len);
+
+  result = verify_user_0 (AC_OTHER_AUTHORIZED, pw, pw_len, pw_len,
+			  pw1_keystring, 0);
+  if (result <= 0)
+    return result;
+
+  auth_status |= AC_OTHER_AUTHORIZED;
+  return result;
+}
+
+
+/*
+ * Check the admin passphrase PW against the stored PW3 keystring KS.
+ *
+ * The passphrase length is taken from KS[0] (masked by PW_LEN_MASK) and
+ * must match PW_LEN_KNOWN (unless that is negative) and fit in BUF_LEN.
+ * The keystring derived from PW is copied to keystring_md_pw3 when SAVE_KS
+ * is non-zero, then used to load the signing key as BY_ADMIN.  When that
+ * load returns 0, the derived keystring is compared with the one stored
+ * in KS instead.
+ *
+ * Returns the passphrase length on success, -1 on failure.
+ */
+static int
+verify_admin_00 (const uint8_t *pw, int buf_len, int pw_len_known,
+		 const uint8_t *ks, int save_ks)
+{
+  uint8_t derived[KEYSTRING_MD_SIZE];
+  const int pw_len = ks[0] & PW_LEN_MASK;
+  const uint8_t *salt = KS_GET_SALT (ks);
+  int load_result;
+
+  if (pw_len_known >= 0 && pw_len_known != pw_len)
+    return -1;
+
+  if (buf_len < pw_len)
+    return -1;
+
+  s2k (salt, SALT_SIZE, pw, pw_len, derived);
+  if (save_ks)
+    memcpy (keystring_md_pw3, derived, KEYSTRING_MD_SIZE);
+
+  load_result = gpg_do_load_prvkey (GPG_KEY_FOR_SIGNING, BY_ADMIN, derived);
+  if (load_result < 0)
+    return -1;
+
+  if (load_result == 0)
+    {
+      /* Fall back to comparing against the keystring kept in KS.  */
+      if ((ks[0] & PW_LEN_KEYSTRING_BIT) == 0)
+	return -1;
+
+      if (memcmp (KS_GET_KEYSTRING (ks), derived, KEYSTRING_MD_SIZE) != 0)
+	return -1;
+    }
+
+  return pw_len;
+}
+
+/* Keystring derived from the passphrase by the verify_* functions when
+   SAVE_KS is set; zeroed by ac_reset_admin and ac_fini.  */
+uint8_t keystring_md_pw3[KEYSTRING_MD_SIZE];
+/* Set to BY_ADMIN or BY_USER by verify_admin_0, depending on which
+   passphrase authenticated the admin; 0 after a reset.  */
+uint8_t admin_authorized;
+
+/*
+ * Verify the admin passphrase.
+ *
+ * PW, BUF_LEN     candidate passphrase and the bytes available in PW.
+ * PW_LEN_KNOWN    passphrase length claimed by the caller; a negative value
+ *                 skips the length comparison.
+ * PW3_KEYSTRING   stored PW3 keystring, or NULL when PW3 is empty.
+ * SAVE_KS         when non-zero, the derived keystring is stored in
+ *                 keystring_md_pw3.
+ *
+ * Three cases are handled:
+ *  - PW3 keystring present: checked by verify_admin_00 (BY_ADMIN).
+ *  - PW3 empty but a PW1 keystring exists: checked by verify_user_0, so
+ *    the result and error counting are those of PW1 (BY_USER).
+ *  - both empty: PW is compared with the initial admin passphrase from
+ *    gpg_do_get_initial_pw_setting (BY_ADMIN).
+ *
+ * Returns 0 when the relevant passphrase is locked, -1 on failure,
+ * otherwise the passphrase length.
+ */
+int
+verify_admin_0 (const uint8_t *pw, int buf_len, int pw_len_known,
+		const uint8_t *pw3_keystring, int save_ks)
+{
+  int pw_len;
+
+  if (pw3_keystring != NULL)
+    {
+      if (gpg_pw_locked (PW_ERR_PW3))
+	return 0;
+
+      pw_len = verify_admin_00 (pw, buf_len, pw_len_known, pw3_keystring,
+				save_ks);
+      if (pw_len < 0)
+	{
+	  /* Shared failure exit; the initial-PW3 path below jumps here.  */
+	failure:
+	  gpg_pw_increment_err_counter (PW_ERR_PW3);
+	  return -1;
+	}
+
+      admin_authorized = BY_ADMIN;
+    success:		       /* OK, the admin is now authenticated.  */
+      gpg_pw_reset_err_counter (PW_ERR_PW3);
+      return pw_len;
+    }
+  else
+    {
+      const uint8_t *initial_pw;
+      const uint8_t *ks_pw1 = gpg_do_read_simple (NR_DO_KEYSTRING_PW1);
+
+      if (ks_pw1 != NULL)
+	{	  /* empty PW3, but PW1 exists */
+	  int r = verify_user_0 (AC_PSO_CDS_AUTHORIZED,
+				 pw, buf_len, pw_len_known, ks_pw1, save_ks);
+
+	  if (r > 0)
+	    admin_authorized = BY_USER;
+
+	  return r;
+	}
+
+      if (gpg_pw_locked (PW_ERR_PW3))
+	return 0;
+
+      /*
+       * For the case of empty PW3 (with empty PW1), passphrase is
+       * OPENPGP_CARD_INITIAL_PW3, or defined by KDF DO.
+       */
+      gpg_do_get_initial_pw_setting (1, &pw_len, &initial_pw);
+      if ((pw_len_known >=0 && pw_len_known != pw_len)
+	  || buf_len < pw_len
+	  || memcmp (pw, initial_pw, pw_len))
+	goto failure;
+
+      admin_authorized = BY_ADMIN;
+      if (save_ks)
+	s2k (NULL, 0, pw, pw_len, keystring_md_pw3);
+      goto success;
+    }
+}
+
+
+/*
+ * Verify the admin passphrase PW of PW_LEN bytes against the stored PW3
+ * keystring (saving the derived keystring), and set AC_ADMIN_AUTHORIZED
+ * in AUTH_STATUS on success.
+ *
+ * Returns 1 on success; otherwise the non-positive value returned by
+ * verify_admin_0.
+ */
+int
+verify_admin (const uint8_t *pw, int pw_len)
+{
+  const uint8_t *pw3_keystring = gpg_do_read_simple (NR_DO_KEYSTRING_PW3);
+  const int r = verify_admin_0 (pw, pw_len, pw_len, pw3_keystring, 1);
+
+  if (r > 0)
+    {
+      auth_status |= AC_ADMIN_AUTHORIZED;
+      return 1;
+    }
+
+  return r;
+}
+
+/*
+ * Revoke the admin authorization: zero keystring_md_pw3, drop
+ * AC_ADMIN_AUTHORIZED from AUTH_STATUS and reset admin_authorized to 0.
+ */
+void
+ac_reset_admin (void)
+{
+  memset (keystring_md_pw3, 0, KEYSTRING_MD_SIZE);
+  auth_status &= ~AC_ADMIN_AUTHORIZED;
+  admin_authorized = 0;
+}
+
+/*
+ * Drop every authorization: zero keystring_md_pw3, clear the signing,
+ * decryption and authentication private keys, set AUTH_STATUS to
+ * AC_NONE_AUTHORIZED and reset admin_authorized to 0.
+ */
+void
+ac_fini (void)
+{
+  memset (keystring_md_pw3, 0, KEYSTRING_MD_SIZE);
+  gpg_do_clear_prvkey (GPG_KEY_FOR_SIGNING);
+  gpg_do_clear_prvkey (GPG_KEY_FOR_DECRYPTION);
+  gpg_do_clear_prvkey (GPG_KEY_FOR_AUTHENTICATION);
+  auth_status = AC_NONE_AUTHORIZED;
+  admin_authorized = 0;
+}
--- a/aes.c
+++ b/aes.c
--- a/affine.h
+++ b/affine.h
@ -0,0 +1,8 @@
+/**
+ * @brief	Affine coordinates
+ *
+ * A point given by two 256-bit coordinates.  Each member is declared as a
+ * one-element array, so `p->x` and `p->y` can be passed directly where a
+ * `bn256 *` is expected.  The bn256 type is not declared here; it has to be
+ * visible before this header is included.
+ */
+typedef struct
+{
+  bn256 x[1];	/* x coordinate */
+  bn256 y[1];	/* y coordinate */
+} ac;
--- a/bignum.c
+++ b/bignum.c
--- a/bn.c
+++ b/bn.c
@ -0,0 +1,427 @@
+/*
+ * bn.c -- 256-bit (and 512-bit) bignum calculation
+ *
+ * Copyright (C) 2011, 2013, 2014, 2019
+ *               Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#ifndef BN256_NO_RANDOM
+#include "random.h"
+#endif
+#include "bn.h"
+
+/**
+ * @brief  X = A + B
+ *
+ * Words are added from index 0 upward, propagating the carry.
+ *
+ * @return the carry out of the last word
+ */
+uint32_t
+bn256_add (bn256 *X, const bn256 *A, const bn256 *B)
+{
+  uint32_t carry = 0;
+  int idx;
+
+  for (idx = 0; idx < BN256_WORDS; idx++)
+    {
+      /* Both operands are fetched before X is written.  */
+      const uint32_t a = A->word[idx];
+      const uint32_t b = B->word[idx];
+      uint32_t sum = a + carry;
+
+      carry = (sum < carry);
+      sum += b;
+      carry += (sum < b);
+      X->word[idx] = sum;
+    }
+
+  return carry;
+}
+
+/**
+ * @brief  X = A - B
+ *
+ * Words are subtracted from index 0 upward, propagating the borrow.
+ *
+ * @return the borrow out of the last word (non-zero when A < B)
+ */
+uint32_t
+bn256_sub (bn256 *X, const bn256 *A, const bn256 *B)
+{
+  uint32_t borrow = 0;
+  int idx;
+
+  for (idx = 0; idx < BN256_WORDS; idx++)
+    {
+      /* Both operands are fetched before X is written.  */
+      const uint32_t a = A->word[idx];
+      const uint32_t b = B->word[idx];
+      const uint32_t underflow = (a < borrow);
+      const uint32_t diff = a - borrow;
+
+      borrow = (diff < b) + underflow;
+      X->word[idx] = diff - b;
+    }
+
+  return borrow;
+}
+
+/**
+ * @brief  X = A + W, where W is a single 32-bit word
+ *
+ * W is injected as the initial carry and rippled through all words.
+ *
+ * @return the carry out of the last word
+ */
+uint32_t
+bn256_add_uint (bn256 *X, const bn256 *A, uint32_t w)
+{
+  uint32_t carry = w;
+  int idx;
+
+  for (idx = 0; idx < BN256_WORDS; idx++)
+    {
+      const uint32_t sum = A->word[idx] + carry;
+
+      carry = (sum < carry);
+      X->word[idx] = sum;
+    }
+
+  return carry;
+}
+
+/**
+ * @brief  X = A - W, where W is a single 32-bit word
+ *
+ * W is injected as the initial borrow and rippled through all words.
+ *
+ * @return the borrow out of the last word
+ */
+uint32_t
+bn256_sub_uint (bn256 *X, const bn256 *A, uint32_t w)
+{
+  uint32_t borrow = w;
+  int idx;
+
+  for (idx = 0; idx < BN256_WORDS; idx++)
+    {
+      const uint32_t a = A->word[idx];
+      const uint32_t underflow = (a < borrow);
+
+      X->word[idx] = a - borrow;
+      borrow = underflow;
+    }
+
+  return borrow;
+}
+
+/*
+ * ASM_IMPLEMENTATION is forced to 0 here unless BN256_C_IMPLEMENTATION is
+ * defined (in which case it stays whatever it already is, and an undefined
+ * macro evaluates to 0 in #if), so the C paths below are the ones compiled
+ * unless ASM_IMPLEMENTATION is set non-zero elsewhere.
+ */
+#ifndef BN256_C_IMPLEMENTATION
+#define ASM_IMPLEMENTATION 0
+#endif
+/**
+ * @brief  X = A * B  (256-bit x 256-bit -> 512-bit)
+ *
+ * The C path computes the product one output word at a time: for output
+ * word k it sums all partial products A[i]*B[j] with i + j == k into the
+ * three-word accumulator (r2, r1, r0), stores r0 as X->word[k] and then
+ * moves the accumulator down by one word.
+ */
+void
+bn256_mul (bn512 *X, const bn256 *A, const bn256 *B)
+{
+#if ASM_IMPLEMENTATION
+#include "muladd_256.h"
+  const uint32_t *s;
+  uint32_t *d;
+  uint32_t w;
+  uint32_t c;
+
+  memset (X->word, 0, sizeof (uint32_t)*BN256_WORDS*2);
+
+  /* One MULADD_256 (from muladd_256.h) per word of B, with the
+     destination advanced by one word each time.  */
+  s = A->word;  d = &X->word[0];  w = B->word[0];  MULADD_256 (s, d, w, c);
+  s = A->word;  d = &X->word[1];  w = B->word[1];  MULADD_256 (s, d, w, c);
+  s = A->word;  d = &X->word[2];  w = B->word[2];  MULADD_256 (s, d, w, c);
+  s = A->word;  d = &X->word[3];  w = B->word[3];  MULADD_256 (s, d, w, c);
+  s = A->word;  d = &X->word[4];  w = B->word[4];  MULADD_256 (s, d, w, c);
+  s = A->word;  d = &X->word[5];  w = B->word[5];  MULADD_256 (s, d, w, c);
+  s = A->word;  d = &X->word[6];  w = B->word[6];  MULADD_256 (s, d, w, c);
+  s = A->word;  d = &X->word[7];  w = B->word[7];  MULADD_256 (s, d, w, c);
+#else
+  int i, j, k;
+  int i_beg, i_end;
+  uint32_t r0, r1, r2;
+
+  r0 = r1 = r2 = 0;
+  for (k = 0; k <= (BN256_WORDS - 1)*2; k++)
+    {
+      /* Range of i for which both i and j = k - i are valid indices.  */
+      if (k < BN256_WORDS)
+	{
+	  i_beg = 0;
+	  i_end = k;
+	}
+      else
+	{
+	  i_beg = k - BN256_WORDS + 1;
+	  i_end = BN256_WORDS - 1;
+	}
+
+      for (i = i_beg; i <= i_end; i++)
+	{
+	  uint64_t uv;
+	  uint32_t u, v;
+	  uint32_t carry;
+
+	  j = k - i;
+
+	  /* (r2, r1, r0) += A[i] * B[j], with v the low and u the high
+	     half of the 64-bit product.  */
+	  uv = ((uint64_t )A->word[i])*((uint64_t )B->word[j]);
+	  v = uv;
+	  u = (uv >> 32);
+	  r0 += v;
+	  carry = (r0 < v);
+	  r1 += carry;
+	  carry = (r1 < carry);
+	  r1 += u;
+	  carry += (r1 < u);
+	  r2 += carry;
+	}
+
+      X->word[k] = r0;
+      r0 = r1;
+      r1 = r2;
+      r2 = 0;
+    }
+
+  /* k is now 2*BN256_WORDS - 1: store the final, most significant word.  */
+  X->word[k] = r0;
+#endif
+}
+
+/**
+ * @brief  X = A * A  (256-bit -> 512-bit)
+ *
+ * Which path is compiled depends on ASM_IMPLEMENTATION.  The assembly
+ * path uses ARM mnemonics (umlal, umull, ldmia, stmia) and clobbers
+ * r4-r9 and r12.  The C path is structured like the C path of bn256_mul,
+ * but visits each pair (i, j) with i <= j only once and doubles the
+ * off-diagonal products.
+ */
+void
+bn256_sqr (bn512 *X, const bn256 *A)
+{
+#if ASM_IMPLEMENTATION
+  int i;
+
+  memset (X->word, 0, sizeof (bn512));
+  for (i = 0; i < BN256_WORDS; i++)
+    {
+      uint32_t *wij = &X->word[i*2];
+      const uint32_t *xj = &A->word[i];
+      uint32_t x_i = *xj++;
+      uint32_t c;
+
+      asm (/* (C,R4,R5) := w_i_i + x_i*x_i; w_i_i := R5; */
+           "mov    %[c], #0\n\t"
+           "ldr    r5, [%[wij]]\n\t"          /* R5 := w_i_i; */
+           "mov    r4, %[c]\n\t"
+           "umlal  r5, r4, %[x_i], %[x_i]\n\t"
+           "str    r5, [%[wij]], #4\n\t"
+           "cmp    %[xj], %[x_max1]\n\t"
+           "bhi    0f\n\t"
+           "mov    r9, %[c]\n\t"  /* R9 := 0, the constant ZERO from here.  */
+           "beq    1f\n"
+   "2:\n\t"
+           "ldmia  %[xj]!, { r7, r8 }\n\t"
+           "ldmia  %[wij], { r5, r6 }\n\t"
+           /* (C,R4,R5) := (C,R4) + w_i_j + 2*x_i*x_j; */
+           "umull  r7, r12, %[x_i], r7\n\t"
+           "adds   r5, r5, r4\n\t"
+           "adc    r4, %[c], r9\n\t"
+           "adds   r5, r5, r7\n\t"
+           "adcs   r4, r4, r12\n\t"
+           "adc    %[c], r9, r9\n\t"
+           "adds   r5, r5, r7\n\t"
+           "adcs   r4, r4, r12\n\t"
+           "adc    %[c], %[c], r9\n\t"
+           /* (C,R4,R6) := (C,R4) + w_i_j + 2*x_i*x_j; */
+           "adds   r6, r6, r4\n\t"
+           "adc    r4, %[c], r9\n\t"
+           "umull  r7, r12, %[x_i], r8\n\t"
+           "adds   r6, r6, r7\n\t"
+           "adcs   r4, r4, r12\n\t"
+           "adc    %[c], r9, r9\n\t"
+           "adds   r6, r6, r7\n\t"
+           "adcs   r4, r4, r12\n\t"
+           "adc    %[c], %[c], r9\n\t"
+           /**/
+           "stmia  %[wij]!, { r5, r6 }\n\t"
+           "cmp    %[xj], %[x_max1]\n\t"
+           "bcc    2b\n\t"
+           "bne    0f\n"
+   "1:\n\t"
+           /* (C,R4,R5) := (C,R4) + w_i_j + 2*x_i*x_j; */
+           "ldr    r5, [%[wij]]\n\t"
+           "ldr    r6, [%[xj]], #4\n\t"
+           "adds   r5, r5, r4\n\t"
+           "adc    r4, %[c], r9\n\t"
+           "umull  r7, r12, %[x_i], r6\n\t"
+           "adds   r5, r5, r7\n\t"
+           "adcs   r4, r4, r12\n\t"
+           "adc    %[c], r9, r9\n\t"
+           "adds   r5, r5, r7\n\t"
+           "adcs   r4, r4, r12\n\t"
+           "adc    %[c], %[c], r9\n\t"
+           "str    r5, [%[wij]], #4\n"
+   "0:\n\t"
+           "ldr    r5, [%[wij]]\n\t"
+           "adds   r4, r4, r5\n\t"
+           "adc    %[c], %[c], #0\n\t"
+           "str    r4, [%[wij]], #4"
+           : [c] "=&r" (c), [wij] "=r" (wij), [xj] "=r" (xj)
+           : [x_i] "r" (x_i), [x_max1] "r" (&A->word[BN256_WORDS-1]),
+             "[wij]" (wij), "[xj]" (xj)
+           : "r4", "r5", "r6", "r7", "r8", "r9", "r12", "memory", "cc");
+
+      /* Store the remaining carry word, except after the last row where
+	 wij would point past the end of X.  */
+      if (i < BN256_WORDS - 1)
+	*wij = c;
+    }
+#else
+  int i, j, k;
+  int i_beg, i_end;
+  uint32_t r0, r1, r2;
+
+  r0 = r1 = r2 = 0;
+  for (k = 0; k <= (BN256_WORDS - 1)*2; k++)
+    {
+      /* Only pairs with i <= j = k - i are visited, hence i_end = k/2.  */
+      if (k < BN256_WORDS)
+	{
+	  i_beg = 0;
+	  i_end = k/2;
+	}
+      else
+	{
+	  i_beg = k - BN256_WORDS + 1;
+	  i_end = k/2;
+	}
+
+      for (i = i_beg; i <= i_end; i++)
+	{
+	  uint64_t uv;
+	  uint32_t u, v;
+	  uint32_t carry;
+
+	  j = k - i;
+
+	  uv = ((uint64_t )A->word[i])*((uint64_t )A->word[j]);
+	  if (i < j)
+	    {
+	      /* Off-diagonal product counts twice: double it, moving the
+		 bit shifted out of the 64-bit value into r2.  */
+	      r2 += ((uv >> 63) != 0);
+	      uv <<= 1;
+	    }
+	  v = uv;
+	  u = (uv >> 32);
+	  r0 += v;
+	  carry = (r0 < v);
+	  r1 += carry;
+	  carry = (r1 < carry);
+	  r1 += u;
+	  carry += (r1 < u);
+	  r2 += carry;
+	}
+
+      X->word[k] = r0;
+      r0 = r1;
+      r1 = r2;
+      r2 = 0;
+    }
+
+  /* k is now 2*BN256_WORDS - 1: store the final, most significant word.  */
+  X->word[k] = r0;
+#endif
+}
+
+/**
+ * @brief  X = A << shift  (shift > 0), or X = A >> -shift  (shift < 0)
+ *
+ * @param X      destination
+ * @param A      source
+ * @param shift  bit count; its magnitude must be less than 32.
+ *               A shift of 0 copies A to X.
+ *
+ * @return the bits shifted out: for a left shift the top bits of the most
+ *         significant word, for a right shift the low bits of the least
+ *         significant word; 0 when shift is 0.
+ */
+uint32_t
+bn256_shift (bn256 *X, const bn256 *A, int shift)
+{
+  int i;
+  uint32_t carry = 0, next_carry;
+
+  if (shift == 0)
+    {
+      /* Handled apart: the general code would shift a 32-bit value by 32
+	 bits, which is undefined behavior.  */
+      for (i = 0; i < BN256_WORDS; i++)
+	X->word[i] = A->word[i];
+    }
+  else if (shift > 0)
+    {
+      for (i = 0; i < BN256_WORDS; i++)
+	{
+	  next_carry = A->word[i] >> (32 - shift);
+	  X->word[i] = (A->word[i] << shift) | carry;
+	  carry = next_carry;
+	}
+    }
+  else
+    {
+      shift = -shift;
+
+      for (i = BN256_WORDS - 1; i >= 0; i--)
+	{
+	  /* Unsigned mask: a signed (1 << 31) would overflow.  */
+	  next_carry = A->word[i] & (((uint32_t)1 << shift) - 1);
+	  X->word[i] = (A->word[i] >> shift) | (carry << (32 - shift));
+	  carry = next_carry;
+	}
+    }
+
+  return carry;
+}
+
+/**
+ * @brief  Test whether X is zero
+ *
+ * Every word is examined; there is no early exit.
+ *
+ * @return 1 when all words of X are zero, 0 otherwise
+ */
+int
+bn256_is_zero (const bn256 *X)
+{
+  uint32_t any_bits = 0;
+  int idx;
+
+  for (idx = 0; idx < BN256_WORDS; idx++)
+    any_bits |= X->word[idx];
+
+  return any_bits == 0;
+}
+
+/**
+ * @brief  Test whether X is even
+ *
+ * @return 1 when the lowest bit of X->word[0] is clear, 0 otherwise
+ */
+int
+bn256_is_even (const bn256 *X)
+{
+  const uint32_t lowest_bit = X->word[0] & 1;
+
+  return lowest_bit == 0;
+}
+
+/**
+ * @brief  Test whether A >= B
+ *
+ * Computes A - B into a scratch value and inspects the borrow.
+ *
+ * @return 1 when the subtraction does not borrow, 0 otherwise
+ */
+int
+bn256_is_ge (const bn256 *A, const bn256 *B)
+{
+  bn256 scratch[1];
+
+  return bn256_sub (scratch, A, B) == 0;
+}
+
+
+/**
+ * @brief  Three-way comparison of A and B
+ *
+ * Computes A - B into a scratch value; the difference being zero means
+ * equality, otherwise the borrow tells which operand is larger.
+ *
+ * @return 0 when A == B, -1 when A < B, 1 when A > B
+ */
+int
+bn256_cmp (const bn256 *A, const bn256 *B)
+{
+  bn256 diff[1];
+  const uint32_t borrow = bn256_sub (diff, A, B);
+
+  if (bn256_is_zero (diff))
+    return 0;
+
+  return borrow ? -1 : 1;
+}
+
+
+#ifndef BN256_NO_RANDOM
+/**
+ * @brief  Fill X with random data
+ *
+ * Takes one buffer from random_bytes_get (), copies sizeof X->word bytes
+ * of it into X and hands the buffer back with random_bytes_free ().
+ * The buffer is assumed to hold at least that many bytes, as in the
+ * original word-by-word copy -- TODO confirm against random.h.
+ */
+void
+bn256_random (bn256 *X)
+{
+  const uint8_t *rnd_bytes;
+
+  rnd_bytes = random_bytes_get ();
+  /* Copy bytewise: no cast that drops const, and no assumption on the
+     alignment of the byte buffer.  */
+  memcpy (X->word, rnd_bytes, sizeof X->word);
+  random_bytes_free (rnd_bytes);
+}
+#endif
--- a/bn.h
+++ b/bn.h
@ -0,0 +1,23 @@
+/* Number of 32-bit words in a 256-bit number.  */
+#define BN256_WORDS 8
+/* 256-bit unsigned number; word[0] is the least significant word.  */
+typedef struct bn256 {
+  uint32_t word[ BN256_WORDS ]; /* Little endian */
+} bn256;
+
+/* Number of 32-bit words in a 512-bit number.  */
+#define BN512_WORDS 16
+/* 512-bit unsigned number, e.g. the result of bn256_mul / bn256_sqr.  */
+typedef struct bn512 {
+  uint32_t word[ BN512_WORDS ]; /* Little endian */
+} bn512;
+
+/* X = A + B / X = A - B; the return value is the carry / borrow.  */
+uint32_t bn256_add (bn256 *X, const bn256 *A, const bn256 *B);
+uint32_t bn256_sub (bn256 *X, const bn256 *A, const bn256 *B);
+/* Same, with a single 32-bit word W as the second operand.  */
+uint32_t bn256_add_uint (bn256 *X, const bn256 *A, uint32_t w);
+uint32_t bn256_sub_uint (bn256 *X, const bn256 *A, uint32_t w);
+
+/* X = A * B and X = A * A, with a 512-bit result.  */
+void bn256_mul (bn512 *X, const bn256 *A, const bn256 *B);
+void bn256_sqr (bn512 *X, const bn256 *A);
+/* Shift left for SHIFT > 0, right for SHIFT < 0; returns the bits
+   shifted out.  */
+uint32_t bn256_shift (bn256 *X, const bn256 *A, int shift);
+/* Predicates returning 1 (true) or 0 (false).  */
+int bn256_is_zero (const bn256 *X);
+int bn256_is_even (const bn256 *X);
+int bn256_is_ge (const bn256 *A, const bn256 *B);
+/* Returns 0 when A == B, -1 when A < B, 1 when A > B.  */
+int bn256_cmp (const bn256 *A, const bn256 *B);
+/* Fill X with random data; bn.c defines it only when BN256_NO_RANDOM
+   is not defined.  */
+void bn256_random (bn256 *X);
--- a/call-ec.c
+++ b/call-ec.c
@ -0,0 +1,136 @@
+/*
+ * call-ec.c - interface between Gnuk and Elliptic curve over GF(prime)
+ *
+ * Copyright (C) 2013, 2014, 2017  Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "field-group-select.h"
+
+/* We are little-endian in the computation, but the protocol is big-endian.  */
+
+#define ECDSA_BYTE_SIZE 32
+#define ECDH_BYTE_SIZE 32
+
+/**
+ * @brief  Sign HASH with the private key KEY_DATA
+ *
+ * HASH and KEY_DATA are ECDSA_BYTE_SIZE big-endian bytes each; they are
+ * byte-reversed into little-endian bn256 values for the computation.
+ * OUTPUT receives r followed by s, each as ECDSA_BYTE_SIZE big-endian
+ * bytes.
+ *
+ * @return 0 always
+ */
+int
+FUNC(ecdsa_sign) (const uint8_t *hash, uint8_t *output,
+		  const uint8_t *key_data)
+{
+  bn256 r[1], s[1], z[1], d[1];
+  uint8_t *le_d = (uint8_t *)d;
+  uint8_t *le_z = (uint8_t *)z;
+  const uint8_t *le_r = (const uint8_t *)r;
+  const uint8_t *le_s = (const uint8_t *)s;
+  int n;
+
+  /* Big-endian protocol bytes -> little-endian bignums.  */
+  for (n = 0; n < ECDSA_BYTE_SIZE; n++)
+    le_d[ECDSA_BYTE_SIZE - 1 - n] = key_data[n];
+  for (n = 0; n < ECDSA_BYTE_SIZE; n++)
+    le_z[ECDSA_BYTE_SIZE - 1 - n] = hash[n];
+
+  FUNC(ecdsa) (r, s, z, d);
+
+  /* Little-endian results -> big-endian output, r first, then s.  */
+  for (n = 0; n < ECDSA_BYTE_SIZE; n++)
+    output[n] = le_r[ECDSA_BYTE_SIZE - 1 - n];
+  for (n = 0; n < ECDSA_BYTE_SIZE; n++)
+    output[ECDSA_BYTE_SIZE + n] = le_s[ECDSA_BYTE_SIZE - 1 - n];
+
+  return 0;
+}
+
+/**
+ * @brief  Compute the public key for the private key KEY_DATA
+ *
+ * KEY_DATA is ECDSA_BYTE_SIZE big-endian bytes.  On success PUBKEY
+ * receives the x coordinate followed by the y coordinate, each as
+ * ECDSA_BYTE_SIZE big-endian bytes.
+ *
+ * @return 0 on success, -1 when compute_kG reports an error
+ */
+int
+FUNC(ecc_compute_public) (const uint8_t *key_data, uint8_t *pubkey)
+{
+  bn256 k[1];
+  ac q[1];
+  uint8_t *le_k = (uint8_t *)k;
+  const uint8_t *le_coord;
+  int n;
+
+  /* Big-endian protocol bytes -> little-endian scalar.  */
+  for (n = 0; n < ECDSA_BYTE_SIZE; n++)
+    le_k[ECDSA_BYTE_SIZE - 1 - n] = key_data[n];
+
+  if (FUNC(compute_kG) (q, k) < 0)
+    return -1;
+
+  le_coord = (const uint8_t *)q->x;
+  for (n = 0; n < ECDSA_BYTE_SIZE; n++)
+    pubkey[n] = le_coord[ECDSA_BYTE_SIZE - 1 - n];
+
+  le_coord = (const uint8_t *)q->y;
+  for (n = 0; n < ECDSA_BYTE_SIZE; n++)
+    pubkey[ECDSA_BYTE_SIZE + n] = le_coord[ECDSA_BYTE_SIZE - 1 - n];
+
+  return 0;
+}
+
+/**
+ * @brief  Compute the point k*P for ECDH
+ *
+ * KEY_DATA is the scalar k as ECDH_BYTE_SIZE big-endian bytes.  INPUT is
+ * one leading byte (skipped) followed by the x and y coordinates of P,
+ * ECDH_BYTE_SIZE big-endian bytes each.  When compute_kP returns 0,
+ * OUTPUT receives the byte 4 followed by the x and y coordinates of the
+ * result in the same layout; otherwise OUTPUT is left untouched.
+ *
+ * @return the value returned by compute_kP
+ */
+int
+FUNC(ecdh_decrypt) (const uint8_t *input, uint8_t *output,
+		    const uint8_t *key_data)
+{
+  bn256 k[1];
+  ac X[1], P[1];
+  uint8_t *le_dst;
+  const uint8_t *le_src;
+  int n;
+  int r;
+
+  le_dst = (uint8_t *)k;
+  for (n = 0; n < ECDH_BYTE_SIZE; n++)
+    le_dst[ECDH_BYTE_SIZE - 1 - n] = key_data[n];
+
+  /* input[0] is the leading '04' byte; x starts at input[1].  */
+  le_dst = (uint8_t *)P->x;
+  for (n = 0; n < ECDH_BYTE_SIZE; n++)
+    le_dst[ECDH_BYTE_SIZE - 1 - n] = input[1 + n];
+
+  le_dst = (uint8_t *)P->y;
+  for (n = 0; n < ECDH_BYTE_SIZE; n++)
+    le_dst[ECDH_BYTE_SIZE - 1 - n] = input[1 + ECDH_BYTE_SIZE + n];
+
+  r = FUNC(compute_kP) (X, k, P);
+  if (r != 0)
+    return r;
+
+  output[0] = 4;
+
+  le_src = (const uint8_t *)X->x;
+  for (n = 0; n < ECDH_BYTE_SIZE; n++)
+    output[1 + n] = le_src[ECDH_BYTE_SIZE - 1 - n];
+
+  le_src = (const uint8_t *)X->y;
+  for (n = 0; n < ECDH_BYTE_SIZE; n++)
+    output[1 + ECDH_BYTE_SIZE + n] = le_src[ECDH_BYTE_SIZE - 1 - n];
+
+  return r;
+}
+
+
+/**
+ * @brief Check if a secret d0 is valid or not
+ *
+ * Thin wrapper: both byte pointers are cast to bn256 pointers and handed
+ * to check_secret unchanged (no byte-order conversion is done here).
+ *
+ * @param d0	scalar d0: secret
+ * @param d1	scalar d1: secret candidate N-d0
+ *
+ * Return 0 on error.
+ * Return -1 when d1 should be used as the secret
+ * Return 1 when d0 should be used as the secret
+ */
+int
+FUNC(ecc_check_secret) (const uint8_t *d0, uint8_t *d1)
+{
+  return FUNC(check_secret) ((const bn256 *)d0, (bn256 *)d1);
+}
--- a/call-ec_p256k1.c
+++ b/call-ec_p256k1.c
@ -0,0 +1,34 @@
+/*
+ * call-ec_p256k1.c - interface between Gnuk and Elliptic curve over
+ *                    GF(p256k1)
+ *
+ * Copyright (C) 2014, 2017  Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "bn.h"
+#include "affine.h"
+#include "jpc-ac_p256k1.h"
+#include "ec_p256k1.h"
+
+#define FIELD p256k1
+
+#include "call-ec.c"
--- a/call-rsa.c
+++ b/call-rsa.c
@ -0,0 +1,267 @@
+/*
+ * call-rsa.c -- Glue code between RSA computation and OpenPGP card protocol
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2017
+ *               Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+//#include <chopstx.h>
+
+#include "config.h"
+
+#include "gnuk.h"
+#include "status-code.h"
+#include "random.h"
+#include "polarssl/config.h"
+#include "polarssl/rsa.h"
+
+/* Single RSA context shared by every function in this file.  */
+static rsa_context rsa_ctx;
+//static struct chx_cleanup clp;
+
+/*
+ * Release the shared RSA context with rsa_free.  ARG is ignored; the
+ * signature matches the cleanup-routine hook referenced by the
+ * commented-out chopstx code in this file.
+ */
+static void
+rsa_cleanup (void *arg)
+{
+  (void)arg;
+  rsa_free (&rsa_ctx);
+}
+
+
+/*
+ * Sign RAW_MESSAGE (MSG_LEN bytes) with the RSA private key in KD.
+ *
+ * KD->data holds the primes P and Q back to back, PUBKEY_LEN / 2 bytes
+ * each; the other private-key parameters are derived here with the
+ * public exponent 0x10001.  The signature is produced in a temporary
+ * buffer and PUBKEY_LEN bytes are copied to OUTPUT only when signing
+ * succeeded.
+ *
+ * Returns 0 on success (after GPG_SUCCESS ()), -1 on failure.
+ */
+int
+rsa_sign (const uint8_t *raw_message, uint8_t *output, int msg_len,
+	  struct key_data *kd, int pubkey_len)
+{
+  mpi P1, Q1, H;
+  int ret = 0;
+  unsigned char temp[pubkey_len];
+
+  rsa_init (&rsa_ctx, RSA_PKCS_V15, 0);
+
+  mpi_init (&P1);  mpi_init (&Q1);  mpi_init (&H);
+
+  rsa_ctx.len = pubkey_len;
+  MPI_CHK( mpi_lset (&rsa_ctx.E, 0x10001) );
+  MPI_CHK( mpi_read_binary (&rsa_ctx.P, &kd->data[0], pubkey_len / 2) );
+  MPI_CHK( mpi_read_binary (&rsa_ctx.Q, &kd->data[pubkey_len / 2],
+			    pubkey_len / 2) );
+#if 0
+  MPI_CHK( mpi_mul_mpi (&rsa_ctx.N, &rsa_ctx.P, &rsa_ctx.Q) );
+#endif
+  /* D = E^-1 mod (P-1)(Q-1), DP = D mod (P-1), DQ = D mod (Q-1),
+     QP = Q^-1 mod P.  */
+  MPI_CHK( mpi_sub_int (&P1, &rsa_ctx.P, 1) );
+  MPI_CHK( mpi_sub_int (&Q1, &rsa_ctx.Q, 1) );
+  MPI_CHK( mpi_mul_mpi (&H, &P1, &Q1) );
+  MPI_CHK( mpi_inv_mod (&rsa_ctx.D , &rsa_ctx.E, &H) );
+  MPI_CHK( mpi_mod_mpi (&rsa_ctx.DP, &rsa_ctx.D, &P1) );
+  MPI_CHK( mpi_mod_mpi (&rsa_ctx.DQ, &rsa_ctx.D, &Q1) );
+  MPI_CHK( mpi_inv_mod (&rsa_ctx.QP, &rsa_ctx.Q, &rsa_ctx.P) );
+ cleanup:
+  mpi_free (&P1);  mpi_free (&Q1);  mpi_free (&H);
+  if (ret == 0)
+    {
+      DEBUG_INFO ("RSA sign...");
+      /* NOTE: the chopstx cancellation/cleanup hooks of the original
+	 code are not used in this port.  */
+      ret = rsa_rsassa_pkcs1_v15_sign (&rsa_ctx, NULL, NULL,
+				       RSA_PRIVATE, SIG_RSA_RAW,
+				       msg_len, raw_message, temp);
+      /* Never hand an uninitialized buffer to the caller on failure.  */
+      if (ret == 0)
+	memcpy (output, temp, pubkey_len);
+    }
+
+  /* Single release point for the shared context, on every path.  */
+  rsa_free (&rsa_ctx);
+  if (ret != 0)
+    {
+      DEBUG_INFO ("fail:");
+      DEBUG_SHORT (ret);
+      return -1;
+    }
+  else
+    {
+      DEBUG_INFO ("done.\r\n");
+      GPG_SUCCESS ();
+      return 0;
+    }
+}
+
+/*
+ * Compute the RSA modulus N = P * Q and write it to PUBKEY.
+ *
+ * P points to the two primes stored back to back, LEN / 2 bytes each.
+ * LEN: length in byte (of the modulus written to PUBKEY)
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+modulus_calc (const uint8_t *p, int len, uint8_t *pubkey)
+{
+  const int half_len = len / 2;
+  mpi P, Q, N;
+  int ret;
+
+  mpi_init (&P);
+  mpi_init (&Q);
+  mpi_init (&N);
+
+  MPI_CHK( mpi_read_binary (&P, p, half_len) );
+  MPI_CHK( mpi_read_binary (&Q, p + half_len, half_len) );
+  MPI_CHK( mpi_mul_mpi (&N, &P, &Q) );
+  MPI_CHK( mpi_write_binary (&N, pubkey, len) );
+
+ cleanup:
+  mpi_free (&P);
+  mpi_free (&Q);
+  mpi_free (&N);
+
+  return (ret != 0) ? -1 : 0;
+}
+
+
+/*
+ * Decrypt INPUT (MSG_LEN bytes, also used as the key length) with the RSA
+ * private key in KD.
+ *
+ * KD->data holds the primes P and Q back to back, MSG_LEN / 2 bytes each;
+ * the other private-key parameters are derived here with the public
+ * exponent 0x10001.  The plaintext goes to OUTPUT (at most
+ * MAX_RES_APDU_DATA_SIZE bytes) and its length to *OUTPUT_LEN_P.
+ *
+ * Returns 0 on success (after GPG_SUCCESS ()), -1 on failure.
+ */
+int
+rsa_decrypt (const uint8_t *input, uint8_t *output, int msg_len,
+	     struct key_data *kd, unsigned int *output_len_p)
+{
+  mpi P1, Q1, H;
+  int ret;
+
+  DEBUG_INFO ("RSA decrypt:");
+  DEBUG_WORD ((uint32_t)&ret);
+
+  rsa_init (&rsa_ctx, RSA_PKCS_V15, 0);
+  mpi_init (&P1);  mpi_init (&Q1);  mpi_init (&H);
+
+  rsa_ctx.len = msg_len;
+  DEBUG_WORD (msg_len);
+
+  MPI_CHK( mpi_lset (&rsa_ctx.E, 0x10001) );
+  MPI_CHK( mpi_read_binary (&rsa_ctx.P, &kd->data[0], msg_len / 2) );
+  MPI_CHK( mpi_read_binary (&rsa_ctx.Q, &kd->data[msg_len / 2], msg_len / 2) );
+#if 0
+  MPI_CHK( mpi_mul_mpi (&rsa_ctx.N, &rsa_ctx.P, &rsa_ctx.Q) );
+#endif
+  /* D = E^-1 mod (P-1)(Q-1), DP = D mod (P-1), DQ = D mod (Q-1),
+     QP = Q^-1 mod P.  */
+  MPI_CHK( mpi_sub_int (&P1, &rsa_ctx.P, 1) );
+  MPI_CHK( mpi_sub_int (&Q1, &rsa_ctx.Q, 1) );
+  MPI_CHK( mpi_mul_mpi (&H, &P1, &Q1) );
+  MPI_CHK( mpi_inv_mod (&rsa_ctx.D , &rsa_ctx.E, &H) );
+  MPI_CHK( mpi_mod_mpi (&rsa_ctx.DP, &rsa_ctx.D, &P1) );
+  MPI_CHK( mpi_mod_mpi (&rsa_ctx.DQ, &rsa_ctx.D, &Q1) );
+  MPI_CHK( mpi_inv_mod (&rsa_ctx.QP, &rsa_ctx.Q, &rsa_ctx.P) );
+ cleanup:
+  mpi_free (&P1);  mpi_free (&Q1);  mpi_free (&H);
+  if (ret == 0)
+    {
+      DEBUG_INFO ("RSA decrypt ...");
+      /* NOTE: the chopstx cancellation/cleanup hooks of the original
+	 code are not used in this port.  */
+      ret = rsa_rsaes_pkcs1_v15_decrypt (&rsa_ctx, NULL, NULL,
+					 RSA_PRIVATE, output_len_p, input,
+					 output, MAX_RES_APDU_DATA_SIZE);
+    }
+
+  /* Single release point for the shared context, on every path.  */
+  rsa_free (&rsa_ctx);
+  if (ret != 0)
+    {
+      DEBUG_INFO ("fail:");
+      DEBUG_SHORT (ret);
+      return -1;
+    }
+  else
+    {
+      DEBUG_INFO ("done.\r\n");
+      GPG_SUCCESS ();
+      return 0;
+    }
+}
+
+/*
+ * Verify the signature SIG over HASH (32 bytes, SIG_RSA_SHA256) with the
+ * RSA public key whose modulus is PUBKEY (PUBKEY_LEN bytes) and whose
+ * exponent is 0x10001.
+ *
+ * Returns 0 when the signature verifies, -1 otherwise.
+ */
+int
+rsa_verify (const uint8_t *pubkey, int pubkey_len,
+	    const uint8_t *hash, const uint8_t *sig)
+{
+  int ret;
+
+  rsa_init (&rsa_ctx, RSA_PKCS_V15, 0);
+  rsa_ctx.len = pubkey_len;
+
+  MPI_CHK( mpi_lset (&rsa_ctx.E, 0x10001) );
+  MPI_CHK( mpi_read_binary (&rsa_ctx.N, pubkey, pubkey_len) );
+
+  DEBUG_INFO ("RSA verify...");
+
+  MPI_CHK( rsa_rsassa_pkcs1_v15_verify (&rsa_ctx, NULL, NULL,
+					RSA_PUBLIC, SIG_RSA_SHA256, 32,
+					hash, sig) );
+
+ cleanup:
+  rsa_free (&rsa_ctx);
+
+  if (ret == 0)
+    {
+      DEBUG_INFO ("verified.\r\n");
+      return 0;
+    }
+
+  DEBUG_INFO ("fail:");
+  DEBUG_SHORT (ret);
+  return -1;
+}
+
+/* Public exponent used for generated keys.  */
+#define RSA_EXPONENT 0x10001
+
+/*
+ * Generate an RSA key of PUBKEY_LEN bytes (PUBKEY_LEN * 8 bits).
+ *
+ * The modulus is written to PUBKEY (PUBKEY_LEN bytes) and the primes P
+ * and Q to P_Q, back to back, PUBKEY_LEN / 2 bytes each.  The shared RSA
+ * context is released before returning.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+rsa_genkey (int pubkey_len, uint8_t *pubkey, uint8_t *p_q)
+{
+  int ret;
+  uint8_t index = 0;
+  uint8_t *p = p_q;
+  uint8_t *q = p_q + pubkey_len / 2;
+
+  extern int prng_seed (int (*f_rng)(void *, unsigned char *, size_t),
+			void *p_rng);
+  extern void neug_flush (void);
+
+  neug_flush ();
+  prng_seed (random_gen, &index);
+  rsa_init (&rsa_ctx, RSA_PKCS_V15, 0);
+
+  /* NOTE: the chopstx cancellation/cleanup hooks of the original code
+     are not used in this port.  */
+  MPI_CHK( rsa_gen_key (&rsa_ctx, random_gen, &index, pubkey_len * 8,
+			RSA_EXPONENT) );
+  MPI_CHK( mpi_write_binary (&rsa_ctx.P, p, pubkey_len / 2) );
+  MPI_CHK( mpi_write_binary (&rsa_ctx.Q, q, pubkey_len / 2) );
+  MPI_CHK( mpi_write_binary (&rsa_ctx.N, pubkey, pubkey_len) );
+
+ cleanup:
+  rsa_cleanup(NULL);
+  if (ret != 0)
+    return -1;
+  else
+    return 0;
+}
--- a/config.h
+++ b/config.h
@ -0,0 +1,17 @@
+#define DEBUG
+#ifdef DEBUG
+#define ENABLE_VIRTUAL_COM_PORT 1
+#endif
+#undef DFU_SUPPORT
+#define ORIGIN 0x08000000
+#define ORIGIN_REAL 0x08000000
+#undef PINPAD_SUPPORT
+
+#define CERTDO_SUPPORT 1
+#undef HID_CARD_CHANGE_SUPPORT
+#define LIFE_CYCLE_MANAGEMENT_SUPPORT 1
+#undef ACKBTN_SUPPORT 
+#define SERIALNO_STR_LEN 12
+#undef KDF_DO_REQUIRED
+
+#define MHZ 133
--- a/debug.c
+++ b/debug.c
@ -0,0 +1,134 @@
+/*
+ * debug.c -- Debugging with virtual COM port
+ *
+ * Copyright (C) 2010 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "tusb.h"
+
+/**
+ * @brief	Emit LEN bytes of S on the TinyUSB debug log
+ *
+ * @param s	Bytes to print; need not be NUL-terminated
+ * @param len	Number of bytes of S to print; nothing is printed
+ *		when it is zero or negative
+ *
+ * The bytes are handed to a fixed "%.*s" format rather than being used
+ * as the format string themselves.  That way a '%' in S is printed
+ * literally and at most LEN bytes are read, which matters for callers
+ * that pass the address of a single character.
+ */
+void my_write (const char *s, int len)
+{
+  if (len <= 0)
+    return;
+
+  TU_LOG2("%.*s", len, s);
+}
+
+
+/*
+ * Write one lower-case hexadecimal digit for NIBBLE through my_write.
+ * Values below 10 map to '0'..'9', the others continue from 'a'.
+ */
+static void
+put_hex (uint8_t nibble)
+{
+  const uint8_t digit = (nibble >= 0x0a) ? ('a' + nibble - 0x0a)
+					  : ('0' + nibble);
+
+  my_write ((const char *)&digit, 1);
+}
+
+/*
+ * Print B as two hexadecimal digits (high nibble first) followed by
+ * CR LF.
+ */
+void
+put_byte (uint8_t b)
+{
+  const uint8_t high = b >> 4;
+  const uint8_t low = b & 0x0f;
+
+  put_hex (high);
+  put_hex (low);
+  my_write ("\r\n", 2);
+}
+
+/*
+ * Print a space and then B as two hexadecimal digits (high nibble
+ * first); no line ending is written.
+ */
+void
+put_byte_with_no_nl (uint8_t b)
+{
+  const uint8_t high = b >> 4;
+  const uint8_t low = b & 0x0f;
+
+  my_write (" ", 1);
+  put_hex (high);
+  put_hex (low);
+}
+
+/*
+ * Print X as four hexadecimal digits, most significant nibble first,
+ * followed by CR LF.
+ */
+void
+put_short (uint16_t x)
+{
+  int shift;
+
+  for (shift = 12; shift >= 0; shift -= 4)
+    put_hex ((x >> shift) & 0x0f);
+
+  my_write ("\r\n", 2);
+}
+
+/*
+ * Print X as eight hexadecimal digits, most significant nibble first,
+ * followed by CR LF.
+ */
+void
+put_word (uint32_t x)
+{
+  int shift;
+
+  for (shift = 28; shift >= 0; shift -= 4)
+    put_hex ((x >> shift) & 0x0f);
+
+  my_write ("\r\n", 2);
+}
+
+/*
+ * Print X in decimal, one character per my_write call, followed by
+ * CR LF.  A 32-bit value has at most ten decimal digits, which is the
+ * size of the scratch buffer.
+ */
+void
+put_int (uint32_t x)
+{
+  char digits[10];
+  int count = 0;
+
+  /* Collect the digits least significant first; zero yields "0".  */
+  do
+    {
+      digits[count++] = '0' + (x % 10);
+      x /= 10;
+    }
+  while (x != 0 && count < 10);
+
+  /* Emit them in the reverse order, most significant first.  */
+  while (count > 0)
+    {
+      count--;
+      my_write (digits + count, 1);
+    }
+
+  my_write ("\r\n", 2);
+}
+
+/*
+ * Hex-dump LEN bytes of S: each byte is printed as " xx", CR LF is
+ * written after every sixteenth byte, and a final CR LF ends the dump.
+ */
+void
+put_binary (const char *s, int len)
+{
+  int pos = 0;
+
+  while (pos < len)
+    {
+      const int column = pos % 16;
+
+      put_byte_with_no_nl (s[pos]);
+      if (column == 15)
+	my_write ("\r\n", 2);
+      pos++;
+    }
+
+  my_write ("\r\n", 2);
+}
+
+/*
+ * Print the NUL-terminated string S (without the terminator).
+ */
+void
+put_string (const char *s)
+{
+  const int length = strlen (s);
+
+  my_write (s, length);
+}
+
+
--- a/ec_p256k1.c
+++ b/ec_p256k1.c
@ -0,0 +1,233 @@
+/*                                                    -*- coding: utf-8 -*-
+ * ec_p256k1.c - Elliptic curve over GF(p256k1)
+ *
+ * Copyright (C) 2014 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * Note: we don't take advantage of the specific features of this curve,
+ * but use the same method of computation as for the NIST P-256 curve.
+ * That's due to some software patent(s).
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "bn.h"
+#include "modp256k1.h"
+#include "affine.h"
+#include "jpc-ac_p256k1.h"
+#include "mod.h"
+#include "ec_p256k1.h"
+
+#define FIELD p256k1
+#define COEFFICIENT_A_IS_ZERO    1
+
+/*
+ * a = 0, b = 7
+ */
+#if 0
+static const bn256 coefficient_a[1] = {
+  {{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }}
+};
+#endif
+
+/* Curve coefficient b = 7; only the first 32-bit word is non-zero.  */
+static const bn256 coefficient_b[1] = {
+  {{ 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }}
+};
+
+
+/*
+ * Table of 15 affine points; each coordinate is initialized from eight
+ * 32-bit words.
+ * NOTE(review): presumably precomputed multiples of the base point G
+ * used by the code pulled in from ecc.c -- confirm there.
+ */
+static const ac precomputed_KG[15] = {
+  {
+    {{{ 0x16f81798, 0x59f2815b, 0x2dce28d9, 0x029bfcdb,
+	0xce870b07, 0x55a06295, 0xf9dcbbac, 0x79be667e }}},
+    {{{ 0xfb10d4b8, 0x9c47d08f, 0xa6855419, 0xfd17b448,
+	0x0e1108a8, 0x5da4fbfc, 0x26a3c465, 0x483ada77 }}}
+  }, {
+    {{{ 0x42d0e6bd, 0x13b7e0e7, 0xdb0f5e53, 0xf774d163,
+	0x104d6ecb, 0x82a2147c, 0x243c4e25, 0x3322d401 }}},
+    {{{ 0x6c28b2a0, 0x24f3a2e9, 0xa2873af6, 0x2805f63e,
+	0x4ddaf9b7, 0xbfb019bc, 0xe9664ef5, 0x56e70797 }}}
+  }, {
+    {{{ 0x829d122a, 0xdca81127, 0x67e99549, 0x8f17f314,
+	0x6a8a9e73, 0x9b889085, 0x846dd99d, 0x583fdfd9 }}},
+    {{{ 0x63c4eac4, 0xf3c7719e, 0xb734b37a, 0xb44685a3,
+	0x572a47a6, 0x9f92d2d6, 0x2ff57d81, 0xabc6232f }}}
+  }, {
+    {{{ 0x9ec4c0da, 0x1b7b444c, 0x723ea335, 0xe88c5678,
+	0x981f162e, 0x9239c1ad, 0xf63b5f33, 0x8f68b9d2 }}},
+    {{{ 0x501fff82, 0xf23cbf79, 0x95510bfd, 0xbbea2cfe,
+	0xb6be215d, 0xde1d90c2, 0xba063986, 0x662a9f2d }}}
+  }, {
+    {{{ 0x114cbf09, 0x63c5e885, 0x7be77e3e, 0x2f27ce93,
+	0xf54a3e33, 0xdaa6d12d, 0x3eff872c, 0x8b300e51 }}},
+    {{{ 0xb3b10a39, 0x26c6ff28, 0x9aaf7169, 0x08f6a7aa,
+	0x6b8238ea, 0x446f0d46, 0x7f43c0cc, 0x1cec3067 }}}
+  }, {
+    {{{ 0x075e9070, 0xba16ce6a, 0x9b5cfe37, 0xbc26893d,
+	0x9c510774, 0xe1ddadfe, 0xfe3ae2f4, 0x90922d88 }}},
+    {{{ 0x5c08824a, 0x653943cc, 0xfce8f4bc, 0x06d74475,
+	0x533c615d, 0x8d101fa7, 0x742108a9, 0x7b1903f6 }}}
+  }, {
+    {{{ 0x6ebdc96c, 0x1bcfa45c, 0x1c7584ba, 0xe400bc04,
+	0x74cf531f, 0x6395e20e, 0xc5131b30, 0x1edd0bb1 }}},
+    {{{ 0xe358cf9e, 0xa117161b, 0x2724d11c, 0xe490d6f0,
+	0xee6dd8c9, 0xf75062f6, 0xfba373e4, 0x31e03b2b }}}
+  }, {
+    {{{ 0x2120e2b3, 0x7f3b58fa, 0x7f47f9aa, 0x7a58fdce,
+	0x4ce6e521, 0xe7be4ae3, 0x1f51bdba, 0xeaa649f2 }}},
+    {{{ 0xba5ad93d, 0xd47a5305, 0xf13f7e59, 0x01a6b965,
+	0x9879aa5a, 0xc69a80f8, 0x5bbbb03a, 0xbe3279ed }}}
+  }, {
+    {{{ 0x27bb4d71, 0xcf291a33, 0x33524832, 0x6caf7d6b,
+	0x766584ee, 0x6e0ee131, 0xd064c589, 0x160cb0f6 }}},
+    {{{ 0x17136e8d, 0x9d5de554, 0x1aab720e, 0xe3f2d468,
+	0xccf75cc2, 0xd1378b49, 0xc4ff16e1, 0x6920c375 }}}
+  }, {
+    {{{ 0x1a9ee611, 0x3eef9e96, 0x9cc37faf, 0xfe4d7bf3,
+	0xb321d965, 0x462aa9b3, 0x208736c5, 0x1702da3e }}},
+    {{{ 0x3a545ceb, 0xfba57bbf, 0x7ea858f5, 0x6dbcd766,
+	0x680d92f1, 0x088e897c, 0xbc626c80, 0x468c1fd8 }}}
+  }, {
+    {{{ 0xb188660a, 0xb40f85c7, 0x99bc3c36, 0xc5873c19,
+	0x7f33b54c, 0x3c7b4541, 0x1f8c9bf8, 0x4cd3a93c }}},
+    {{{ 0x33099cb0, 0xf8dce380, 0x2edd2f33, 0x7a167dd6,
+	0x0ffe35b7, 0x576d8987, 0xc68ace5c, 0xd2de0386 }}}
+  }, {
+    {{{ 0x6658bb08, 0x9a9e0a72, 0xc589607b, 0xe23c5f2a,
+	0xf2bfb4c8, 0xa048ca14, 0xc62c2291, 0x4d9a0f89 }}},
+    {{{ 0x0f827294, 0x427b5f31, 0x9f2c35cd, 0x1ea7a8b5,
+	0x85a3c00f, 0x95442e56, 0x9b57975a, 0x8cb83121 }}}
+  }, {
+    {{{ 0x51f5cf67, 0x4333f0da, 0xf4f0d3cb, 0x6d3ea47c,
+	0xa05a831f, 0x442fda14, 0x016d3e81, 0x6a496013 }}},
+    {{{ 0xe52e0f48, 0xf647318c, 0x4a0d5ff1, 0x5ff3a66e,
+	0x61199ba8, 0x046ed81a, 0x3e79c23a, 0x578edf08 }}}
+  }, {
+    {{{ 0x3ea01ea7, 0xb8f996f8, 0x7497bb15, 0xc0045d33,
+	0x6205647c, 0xc4749dc9, 0x0efd22c9, 0xd8946054 }}},
+    {{{ 0x12774ad5, 0x062dcb09, 0x8be06e3a, 0xcb13f310,
+	0x235de1a9, 0xca281d35, 0x69c3645c, 0xaf8a7412 }}}
+  }, {
+    {{{ 0xbeb8b1e2, 0x8808ca5f, 0xea0dda76, 0x0262b204,
+	0xddeb356b, 0xb6fffffc, 0xfbb83870, 0x52de253a }}},
+    {{{ 0x8f8d21ea, 0x961f40c0, 0x002f03ed, 0x89686278,
+	0x38e421ea, 0x0ff834d7, 0xd36fb8db, 0x3a270d6f }}}
+  }
+};
+
+/*
+ * Second table of 15 affine points, laid out like precomputed_KG.
+ * NOTE(review): presumably the same multiples of G shifted by a fixed
+ * power of two, as the name suggests -- confirm against ecc.c.
+ */
+static const ac precomputed_2E_KG[15] = {
+  {
+    {{{ 0x39a48db0, 0xefd7835b, 0x9b3c03bf, 0x9f1215a2,
+	0x9b7bde45, 0x2791d0a0, 0x696e7167, 0x100f44da }}},
+    {{{ 0x2bc65a09, 0x0fbd5cd6, 0xff5195ac, 0xb7ff4a18,
+	0x0c090666, 0x2ec8f330, 0x92a00b77, 0xcdd9e131 }}}
+  }, {
+    {{{ 0x40fb27b6, 0x32427e28, 0xbe430576, 0xc76e3db2,
+	0x61686aa5, 0x10f238ad, 0xbe778b1b, 0xfea74e3d }}},
+    {{{ 0xf23cb96f, 0x701d3db7, 0x973f7b77, 0x126b596b,
+	0xccb6af93, 0x7cf674de, 0x9b0b1329, 0x6e0568db }}}
+  }, {
+    {{{ 0x2c8118bc, 0x6cac5154, 0x399ddd98, 0x19bd4b34,
+	0x2e9c8949, 0x47248a8d, 0x2cefa3b1, 0x734cb6a8 }}},
+    {{{ 0x1e410fd5, 0xf1b340ad, 0xc4873539, 0xa2982bee,
+	0xd4de4530, 0x7b5a3ea4, 0x42202574, 0xae46e10e }}}
+  }, {
+    {{{ 0xac1f98cd, 0xcbfc99c8, 0x4d7f0308, 0x52348905,
+	0x1cc66021, 0xfaed8a9c, 0x4a474870, 0x9c3919a8 }}},
+    {{{ 0xd4fc599d, 0xbe7e5e03, 0x6c64c8e6, 0x905326f7,
+	0xf260e641, 0x584f044b, 0x4a4ddd57, 0xddb84f0f }}}
+  }, {
+    {{{ 0xed7cebed, 0xc4aacaa8, 0x4fae424e, 0xb75d2dce,
+	0xba20735e, 0xa01585a2, 0xba122399, 0x3d75f24b }}},
+    {{{ 0xd5570dce, 0xcbe4606f, 0x2da192c2, 0x9d00bfd7,
+	0xa57b7265, 0x9c3ce86b, 0xec4edf5e, 0x987a22f1 }}}
+  }, {
+    {{{ 0x73ea0665, 0x211b9715, 0xf3a1abbb, 0x86f485d4,
+	0xcd076f0e, 0xabd242d8, 0x0ba5dc88, 0x862332ab }}},
+    {{{ 0x7b784911, 0x09af505c, 0xcaf4fae7, 0xc89544e8,
+	0xae9a32eb, 0x256625f6, 0x606d1a3f, 0xe2532b72 }}}
+  }, {
+    {{{ 0x0deaf885, 0x79e9f313, 0x46df21c9, 0x938ff76e,
+	0xa953bb2c, 0x1968f5fb, 0x29155f27, 0xdff538bf }}},
+    {{{ 0x31d5d020, 0xf7bae0b1, 0x1a676a8d, 0x5afdc787,
+	0xfa9d53ff, 0x11b4f032, 0xc5959167, 0x86ba433e }}}
+  }, {
+    {{{ 0x9475b7ba, 0x884fdff0, 0xe4918b3d, 0xe039e730,
+	0xf5018cdb, 0x3d3e57ed, 0x1943785c, 0x95939698 }}},
+    {{{ 0x7524f2fd, 0xe9b8abf8, 0xc8709385, 0x9c653f64,
+	0x4b9cd684, 0x8ba0386a, 0x88c331dd, 0x2e7e5528 }}}
+  }, {
+    {{{ 0xeefe79e5, 0x940bef53, 0xbe9b87f3, 0xc518d286,
+	0x7833042c, 0x9e0c7c76, 0x11fbe152, 0x104e2cb5 }}},
+    {{{ 0x50bbec83, 0xc0d35e0f, 0x4acd0fcc, 0xee4879be,
+	0x006085ee, 0xc8d80f5d, 0x72fe1ac1, 0x3c51bc1c }}}
+  }, {
+    {{{ 0xb2de976e, 0x06187f61, 0xf5e4b4b6, 0x52869e18,
+	0x38d332ca, 0x74d4facd, 0xb3a2f8d9, 0x5c1c90b4 }}},
+    {{{ 0xdaa37893, 0x98644d09, 0xabe39818, 0x682435a8,
+	0x469c53a0, 0x17e46617, 0x77dc2e64, 0x642f9632 }}}
+  }, {
+    {{{ 0x222f6c54, 0xad2101c5, 0xfa74785e, 0xb05c7a58,
+	0x489bcdaf, 0xce55fa79, 0xffe88d54, 0xc1f920fd }}},
+    {{{ 0x9065e490, 0x32553ab0, 0x35329f74, 0x7611b9af,
+	0xab7b24c0, 0x57df19ef, 0x6181c447, 0xb9a78749 }}}
+  }, {
+    {{{ 0xa80b7ea8, 0x392f156f, 0x8ae4a8bf, 0x57ab7ca0,
+	0x50c4b178, 0xac320747, 0x0e781feb, 0x146041b9 }}},
+    {{{ 0x845279b2, 0xd343f075, 0x7387afa5, 0x2d4fe757,
+	0xa72f3c39, 0x151e0948, 0x550da168, 0x41a6d54e }}}
+  }, {
+    {{{ 0x075a0010, 0xb3134ed3, 0x7ae93e23, 0x9fa76f4b,
+	0x7bb4daaa, 0xc0db256f, 0x464dd8a3, 0x7668dc27 }}},
+    {{{ 0x9f5da977, 0x150063f5, 0x05efce00, 0x3acac5c8,
+	0x884493fe, 0xc8e12ffc, 0x88f06bd2, 0x4ab936d8 }}}
+  }, {
+    {{{ 0x5d09ea98, 0x996fde77, 0x4145da58, 0x16ddf512,
+	0xdc2fb225, 0xa97a6ca8, 0xfbdcdf5a, 0xc7331f30 }}},
+    {{{ 0x86a86e52, 0x838f99e0, 0x77795edd, 0x68d39b29,
+	0x9f412aaa, 0xe4e4f97e, 0x30d25352, 0xe5cc2c0a }}}
+  }, {
+    {{{ 0x9c21ff71, 0xb3d68650, 0xddbe3884, 0x11e7589d,
+	0x423bac67, 0x7efd4055, 0x46957425, 0x587a7293 }}},
+    {{{ 0x8f5a8fc6, 0x360adc2e, 0xbd69f12e, 0x6f8bbafb,
+	0x0a3f3b4d, 0xf671f423, 0x59942dc3, 0xb49acb47 }}}
+  }
+};
+
+/*
+ * N: order of G
+ *    0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141
+ *
+ * Stored as eight 32-bit words, least significant word first.
+ */
+static const bn256 N[1] = {
+  {{ 0xd0364141, 0xbfd25e8c, 0xaf48a03b, 0xbaaedce6,
+     0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff }}
+};
+
+/*
+ * MU = 2^512 / N
+ * MU = ( (1 << 256) | MU_lower )
+ *
+ * Only the low 256 bits of MU are stored here; bit 256 is implied.
+ */
+static const bn256 MU_lower[1] = {
+  {{ 0x2fc9bec0, 0x402da173, 0x50b75fc4, 0x45512319,
+     0x1, 0x0, 0x0, 0x0 }}
+};
+
+
+#include "ecc.c"
--- a/ec_p256k1.h
+++ b/ec_p256k1.h
@ -0,0 +1,4 @@
+/*
+ * Prototypes of the p256k1 curve routines.  The types ac and bn256
+ * must already be declared by the including file.
+ * NOTE(review): ec_p256k1.c defines FIELD as p256k1 and includes
+ * ecc.c; presumably that is where these functions are instantiated --
+ * confirm against ecc.c.
+ */
+int compute_kP_p256k1 (ac *X, const bn256 *K, const ac *P);
+int compute_kG_p256k1 (ac *X, const bn256 *K);
+void ecdsa_p256k1 (bn256 *r, bn256 *s, const bn256 *z, const bn256 *d);
+int check_secret_p256k1 (const bn256 *q, bn256 *d1);
--- a/ecc-ed25519.c
+++ b/ecc-ed25519.c
@ -0,0 +1,952 @@
+/*                                                    -*- coding: utf-8 -*-
+ * ecc-ed25519.c - Elliptic curve computation for
+ *                 the twisted Edwards curve: -x^2 + y^2 = 1 + d*x^2*y^2
+ *                 d = -121665/121666
+ *
+ * Copyright (C) 2014, 2017  Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "bn.h"
+#include "mod.h"
+#include "mod25638.h"
+#include "sha512.h"
+
+/*
+ * References:
+ *
+ * [1] Daniel J. Bernstein, Niels Duif, Tanja Lange, Peter Schwabe, Bo-Yin Yang.
+ *     High-speed high-security signatures.
+ *     Journal of Cryptographic Engineering 2 (2012), 77--89.
+ *     http://cr.yp.to/papers.html#ed25519
+ *
+ * [2] Daniel J. Bernstein, Peter Birkner, Marc Joye, Tanja Lange,
+ *     Christiane Peters.
+ *     Twisted Edwards curves.
+ *     Pages 389--405 in Progress in cryptology---AFRICACRYPT 2008.
+ *     http://cr.yp.to/papers.html#twisted
+ */
+
+/*
+ * IMPLEMENTATION NOTE
+ *
+ * (0) We assume that the processor has no cache, nor branch target
+ *     prediction.  Thus, we don't avoid indexing by secret value.
+ *     We don't avoid conditional jump if both cases have same timing,
+ *     either.
+ *
+ * (1) We use Radix-32 field arithmetic.  It's a representation like
+ *     2^256-38, but it's more redundant.  For example, "1" can be
+ *     represented in three ways in 256-bit: 1, 2^255-18, and
+ *     2^256-37.
+ *
+ * (2) We use fixed base comb multiplication.  Scalar is 252-bit.
+ *     There are various possible choices for 252 = 2 * 2 * 3 * 3 * 7.
+ *     Current choice of total size is 3KB.  We use three tables, and
+ *     a table has 16 points (3 * 1KB).
+ *
+ *     Window size W = 4-bit, E = 21.
+ *                                                       <--21-bit-
+ *                                             <---42-bit----------
+ *     [        ][########][////////][        ][########][////////]
+ *                                   <-------63-bit----------------
+ *                         <-----------84-bit----------------------
+ *               <--------------105-bit----------------------------
+ *
+ *     [        ][########][////////][        ][########][////////]
+ *                                                                 <-126-bit-
+ *                                                       <-147-bit-
+ *                                             <----168-bit--------
+ *
+ *                                   <-------189-bit---------------
+ *                         <----------210-bit----------------------
+ *               <-------------231-bit-----------------------------
+ */
+
+/*
+ * Identity element: (0,1)
+ * Negation: -(x,y) = (-x,y)
+ *
+ * d: -0x2DFC9311D490018C7338BF8688861767FF8FF5B2BEBE27548A14B235ECA6874A
+ * order:
+ *     0x1000000000000000000000000000000014DEF9DEA2F79CD65812631A5CF5D3ED
+ * Gx: 0x216936D3CD6E53FEC0A4E231FDD6DC5C692CC7609525A7B2C9562D608F25D51A
+ * Gy: 0x6666666666666666666666666666666666666666666666666666666666666658
+ */
+
+/*
+ * d + 2^255 - 19, i.e. the (negative) curve coefficient d brought into
+ * the non-negative range; used by point_add.
+ */
+static const bn256 coefficient_d[1] = {
+  {{ 0x135978a3, 0x75eb4dca, 0x4141d8ab, 0x00700a4d,
+     0x7779e898, 0x8cc74079, 0x2b6ffe73, 0x52036cee }} };
+
+
+/**
+ * @brief	Projective Twisted Coordinates
+ *
+ * (x : y : z) represents the affine point (x/z, y/z).  Each member is
+ * a one-element array so that it can be passed directly where a
+ * bn256 pointer is expected.
+ */
+typedef struct
+{
+  bn256 x[1];
+  bn256 y[1];
+  bn256 z[1];
+} ptc;
+
+#include "affine.h"
+
+
+/*
+ * Return 1 when the lowest bit of A is set, 0 otherwise.
+ */
+static int
+mod25519_is_neg (const bn256 *a)
+{
+  return (a->word[0] & 1) != 0;
+}
+
+
+/**
+ * @brief  X = 2 * A
+ *
+ * @param X	Destination PTC
+ * @param A	PTC to be doubled
+ *
+ * Compute (X3 : Y3 : Z3) = 2 * (X1 : Y1 : Z1)
+ *
+ * With a = -1 the formulas are:
+ *   B = (X1 + Y1)^2,  C = X1^2,  D = Y1^2,  E = a*C = -C,
+ *   F = E + D,  H = Z1^2,  J = F - 2*H,
+ *   X3 = (B - C - D) * J,  Y3 = F * (E - D),  Z3 = F * J
+ * The code keeps -F and -J instead of F and J; the sign changes cancel
+ * in each of the three products.
+ *
+ * X may be the same object as A (compute_kG_25519 calls it that way):
+ * every coordinate of A is read before the same coordinate of X is
+ * written.
+ */
+static void
+point_double (ptc *X, const ptc *A)
+{
+  bn256 b[1], d[1], e[1];
+
+  /* Compute: B = (X1 + Y1)^2 */
+  mod25638_add (b, A->x, A->y);
+  mod25638_sqr (b, b);
+
+  /* Compute: C = X1^2        : E      */
+  mod25638_sqr (e, A->x);
+
+  /* Compute: D = Y1^2             */
+  mod25638_sqr (d, A->y);
+
+  /* E = aC; where a = -1 */
+  /* Compute: D - E = D + C : Y3_tmp */
+  mod25638_add (X->y, e, d);
+
+  /* Compute: -F = -(E + D) = C - D; where a = -1 : E */
+  mod25638_sub (e, e, d);
+
+  /* Compute: H = Z1^2        : D     */
+  mod25638_sqr (d, A->z);
+
+  /* Compute: -J = 2*H - F    : D     */
+  mod25638_add (d, d, d);
+  mod25638_add (d, d, e);
+
+  /* Compute: X3 = (B-C-D)*J = -J*(C+D-B) = -J*(Y3_tmp-B)  */
+  mod25638_sub (X->x, X->y, b);
+  mod25638_mul (X->x, X->x, d);
+
+  /* Compute: Y3 = -F*(D-E) = -F*Y3_tmp            */
+  mod25638_mul (X->y, X->y, e);
+
+  /* Z3 = -F*-J             */
+  mod25638_mul (X->z, e, d);
+}
+
+
+/**
+ * @brief	X = A + B
+ *
+ * @param X	Destination PTC
+ * @param A	PTC
+ * @param B	AC
+ *
+ * Compute: (X3 : Y3 : Z3) = (X1 : Y1 : Z1) + (X2 : Y2 : 1)
+ *
+ * The formulas being evaluated are listed at the end of the function
+ * body.  X may be the same object as A (compute_kG_25519 calls it that
+ * way): A->x and A->y are last read by the statement that first writes
+ * X->x, and X->z is written last, after the final use of A->z.
+ */
+static void
+point_add (ptc *X, const ptc *A, const ac *B)
+{
+  bn256 c[1], d[1], e[1], tmp[1];
+
+  /* Compute: C = X1 * X2 */
+  mod25638_mul (c, A->x, B->x);
+
+  /* Compute: D = Y1 * Y2 */
+  mod25638_mul (d, A->y, B->y);
+
+  /* Compute: E = d * C * D */
+  mod25638_mul (e, c, d);
+  mod25638_mul (e, coefficient_d, e);
+
+  /* Compute: C_1 = C + D  (this is D - aC with a = -1) */
+  mod25638_add (c, c, d);
+
+  /* Compute: D_1 = Z1^2 : B */
+  mod25638_sqr (d, A->z);
+
+  /* tmp = D_1 - E : F */
+  mod25638_sub (tmp, d, e);
+
+  /* D_2 = D_1 + E : G */
+  mod25638_add (d, d, e);
+
+  /* X3_final = Z1 * tmp * ((X1 + Y1) * (X2 + Y2) - C_1) */
+  mod25638_add (X->x, A->x, A->y);
+  mod25638_add (e, B->x, B->y);
+  mod25638_mul (e, X->x, e);
+  mod25638_sub (e, e, c);
+  mod25638_mul (e, tmp, e);
+  mod25638_mul (X->x, A->z, e);
+
+  /* Y3_final = Z1 * D_2 * C_1 */
+  mod25638_mul (c, d, c);
+  mod25638_mul (X->y, A->z, c);
+
+  /* Z3_final = tmp * D_2 */
+  mod25638_mul (X->z, tmp, d);
+
+  /* Summary of the formulas evaluated above: */
+  /* A = Z1 */
+  /* B = A^2 */
+  /* C = X1 * X2 */
+  /* D = Y1 * Y2 */
+  /* E = d * C * D */
+  /* F = B - E */
+  /* G = B + E */
+  /* X3 = A * F * ((X1 + Y1) * (X2 + Y2) - C - D) */
+  /* Y3 = A * G * (D - aC); where a = -1 */
+  /* Z3 = F * G */
+}
+
+
+/**
+ * @brief	Convert a point from projective to affine coordinates
+ *
+ * @param X	Destination AC
+ * @param A	Source PTC
+ *
+ * The affine point is (x, y) = (X1/Z1, Y1/Z1): both coordinates are
+ * multiplied by the inverse of Z1 and then reduced.
+ */
+static void
+point_ptc_to_ac (ac *X, const ptc *A)
+{
+  bn256 inverse_z[1];
+
+  /*
+   * No reduction of A->z is needed beforehand: it may exceed p25519,
+   * even by a factor of two, and mod_inv still copes with it.
+   */
+  mod_inv (inverse_z, A->z, p25519);
+
+  /* x = X1 / Z1 */
+  mod25638_mul (X->x, A->x, inverse_z);
+  mod25519_reduce (X->x);
+
+  /* y = Y1 / Z1 */
+  mod25638_mul (X->y, A->y, inverse_z);
+  mod25519_reduce (X->y);
+}
+
+
+/*
+ * First table of 16 affine points, indexed by the 4-bit value k0 in
+ * compute_kG_25519.  Entry 0 is the identity element (0, 1); entry 1
+ * holds the base point coordinates Gx, Gy, least significant 32-bit
+ * word first.
+ */
+static const ac precomputed_KG[16] = {
+  { {{{ 0, 0, 0, 0, 0, 0, 0, 0 }}},
+    {{{ 1, 0, 0, 0, 0, 0, 0, 0 }}}                         },
+  { {{{ 0x8f25d51a, 0xc9562d60, 0x9525a7b2, 0x692cc760,
+        0xfdd6dc5c, 0xc0a4e231, 0xcd6e53fe, 0x216936d3 }}},
+    {{{ 0x66666658, 0x66666666, 0x66666666, 0x66666666,
+        0x66666666, 0x66666666, 0x66666666, 0x66666666 }}} },
+  { {{{ 0x3713af22, 0xac7137bd, 0xac634604, 0x25ed77a4,
+        0xa815e038, 0xce0d0064, 0xbca90151, 0x041c030f }}},
+    {{{ 0x0780f989, 0xe9b33fcf, 0x3d4445e7, 0xe4e97c2a,
+        0x655e5c16, 0xc67dc71c, 0xee43fb7a, 0x72467625 }}} },
+  { {{{ 0x3ee99893, 0x76a19171, 0x7ba9b065, 0xe647edd9,
+        0x6aeae260, 0x31f39299, 0x5f4a9bb2, 0x6d9e4545 }}},
+    {{{ 0x94cae280, 0xc41433da, 0x79061211, 0x8e842de8,
+        0xa259dc8a, 0xaab95e0b, 0x99013cd0, 0x28bd5fc3 }}} },
+  { {{{ 0x7d23ea24, 0x59e22c56, 0x0460850e, 0x1e745a88,
+        0xda13ef4b, 0x4583ff4c, 0x95083f85, 0x1f13202c }}},
+    {{{ 0x90275f48, 0xad42025c, 0xb55c4778, 0x0085087e,
+        0xfdfd7ffa, 0xf21109e7, 0x6c381b7e, 0x66336d35 }}} },
+  { {{{ 0xd00851f2, 0xaa9476ab, 0x4a61600b, 0xe7838534,
+        0x1a52df87, 0x0de65625, 0xbd675870, 0x5f0dd494 }}},
+    {{{ 0xe23493ba, 0xf20aec1b, 0x3414b0a8, 0x8f7f2741,
+        0xa80e1eb6, 0x497e74bd, 0xe9365b15, 0x1648eaac }}} },
+  { {{{ 0x04ac2b69, 0x5b78dcec, 0x32001a73, 0xecdb66ce,
+        0xb34cf697, 0xb75832f4, 0x3a2bce94, 0x7aaf57c5 }}},
+    {{{ 0x60fdfc6f, 0xb32ed2ce, 0x757924c6, 0x77bf20be,
+        0x48742dd1, 0xaebd15dd, 0x55d38439, 0x6311bb16 }}} },
+  { {{{ 0x42ff5c97, 0x139cdd73, 0xdbd82964, 0xee4c359e,
+        0x70611a3f, 0x91c1cd94, 0x8075dbcb, 0x1d0c34f6 }}},
+    {{{ 0x5f931219, 0x43eaa549, 0xa23d35a6, 0x3737aba7,
+        0x46f167bb, 0x54b1992f, 0xb74a9944, 0x01a11f3c }}} },
+  { {{{ 0xba46b161, 0x67a5310e, 0xd9d67f6c, 0x790f8527,
+        0x2f6cc814, 0x359c5b5f, 0x7786383d, 0x7b6a5565 }}},
+    {{{ 0x663ab0d3, 0xf1431b60, 0x09995826, 0x14a32d8f,
+        0xeddb8571, 0x61d526f6, 0x0eac739a, 0x0cb7acea }}} },
+  { {{{ 0x4a2d009f, 0x5eb1a697, 0xd8df987a, 0xdacb43b4,
+        0x8397f958, 0x4870f214, 0x8a175fbb, 0x5aa0c67c }}},
+    {{{ 0x78887db3, 0x27dbbd4c, 0x64e322ab, 0xe327b707,
+        0x7cbe4e3b, 0x87e293fa, 0xbda72395, 0x17040799 }}} },
+  { {{{ 0x99d1e696, 0xc833a5a2, 0x2d9d5877, 0x969bff8e,
+        0x2216fa67, 0x383a533a, 0x684d3925, 0x338bbe0a }}},
+    {{{ 0xd6cfb491, 0x35b5aae8, 0xaa12f3f8, 0x4a588279,
+        0x2e30380e, 0xa7c2e708, 0x9e4b3d62, 0x69f13e09 }}} },
+  { {{{ 0x27f1cd56, 0xec0dc2ef, 0xdb11cc97, 0x1af11548,
+        0x9ebc7613, 0xb642f86a, 0xcb77c3b9, 0x5ce45e73 }}},
+    {{{ 0x3eddd6de, 0x5d128786, 0x4859eab7, 0x16f9a6b4,
+        0xd8782345, 0x55c53916, 0xdb7b202a, 0x6b1dfa87 }}} },
+  { {{{ 0x19e30528, 0x2461a8ed, 0x665cfb1c, 0xaf756bf9,
+        0x3a6e8673, 0x0fcafd1d, 0x45d10f48, 0x0d264435 }}},
+    {{{ 0x5431db67, 0x543fd4c6, 0x60932432, 0xc153a5b3,
+        0xd2119aa4, 0x41d5b8eb, 0x8b09b6a5, 0x36bd9ab4 }}} },
+  { {{{ 0x21e06738, 0x6d39f935, 0x3765dd86, 0x4e6a7c59,
+        0xa4730880, 0xefc0dd80, 0x4079fe2f, 0x40617e56 }}},
+    {{{ 0x921439b9, 0xbc83cdff, 0x98833c09, 0xd5cccc06,
+        0xda13cdcb, 0xe315c425, 0x67ff5370, 0x37bc6e84 }}} },
+  { {{{ 0xf643b5f5, 0x65e7f028, 0x0ffbf5a8, 0x5b0d4831,
+        0xf4085f62, 0x0f540498, 0x0db7bd1b, 0x6f0bb035 }}},
+    {{{ 0x9733742c, 0x51f65571, 0xf513409f, 0x2fc047a0,
+        0x355facf6, 0x07f45010, 0x3a989a9c, 0x5cd416a9 }}} },
+  { {{{ 0x748f2a67, 0x0bdd7208, 0x415b7f7f, 0x0cf0b80b,
+        0x57aa0119, 0x44afdd5f, 0x430dc946, 0x05d68802 }}},
+    {{{ 0x1a60eeb2, 0x420c46e5, 0x665024f5, 0xc60a9b33,
+        0x48c51347, 0x37520265, 0x00a21bfb, 0x6f4be0af }}} }
+};
+
+/*
+ * Second table of 16 affine points, indexed by the 4-bit value k1 in
+ * compute_kG_25519.  Entry 0 is the identity element (0, 1).
+ * NOTE(review): presumably the points of precomputed_KG multiplied by
+ * 2^21, since k1 gathers scalar bits 21 positions above those of k0 --
+ * confirm against the table generator.
+ */
+static const ac precomputed_2E_KG[16] = {
+  { {{{ 0, 0, 0, 0, 0, 0, 0, 0 }}},
+    {{{ 1, 0, 0, 0, 0, 0, 0, 0 }}}                         },
+  { {{{ 0x199c4f7d, 0xec314ac0, 0xb2ebaaf9, 0x66a39c16,
+        0xedd4d15f, 0xab1c92b8, 0x57d9eada, 0x482a4cdf }}},
+    {{{ 0x6e4eb04b, 0xbd513b11, 0x25e4fd6a, 0x3f115fa5,
+        0x14519298, 0x0b3c5fc6, 0x81c2f7a8, 0x7391de43 }}} },
+  { {{{ 0x1254fe02, 0xa57dca18, 0x6da34368, 0xa56a2a14,
+        0x63e7328e, 0x44c6e34f, 0xca63ab3e, 0x3f748617 }}},
+    {{{ 0x7dc1641e, 0x5a13dc52, 0xee4e9ca1, 0x4cbb2899,
+        0x1ba9acee, 0x3938a289, 0x420fc47b, 0x0fed89e6 }}} },
+  { {{{ 0x49cbad08, 0x3c193f32, 0x15e80ef5, 0xdda71ef1,
+        0x9d128c33, 0xda44186c, 0xbf98c24f, 0x54183ede }}},
+    {{{ 0x93d165c1, 0x2cb483f7, 0x177f44aa, 0x51762ace,
+        0xb4ab035d, 0xb3fe651b, 0xa0b0d4e5, 0x426c99c3 }}} },
+  { {{{ 0xef3f3fb1, 0xb3fcf4d8, 0x065060a0, 0x7052292b,
+        0x24240b15, 0x18795ff8, 0x9989ffcc, 0x13aea184 }}},
+    {{{ 0xc2b81f44, 0x1930c101, 0x10600555, 0x672d6ca4,
+        0x1b25e570, 0xfbddbff2, 0x8ca12b70, 0x0884949c }}} },
+  { {{{ 0x00564bbf, 0x9983a033, 0xde61b72d, 0x95587d25,
+        0xeb17ad71, 0xb6719dfb, 0xc0bc3517, 0x46871ad0 }}},
+    {{{ 0xe95a6693, 0xb034fb61, 0x76eabad9, 0x5b0d8d18,
+        0x884785dc, 0xad295dd0, 0x74a1276a, 0x359debad }}} },
+  { {{{ 0xe89fb5ca, 0x2e5a2686, 0x5656c6c5, 0xd3d200ba,
+        0x9c969001, 0xef4c051e, 0x02cb45f4, 0x0d4ea946 }}},
+    {{{ 0x76d6e506, 0xa6f8a422, 0x63209e23, 0x454c768f,
+        0x2b372386, 0x5c12fd04, 0xdbfee11f, 0x1aedbd3e }}} },
+  { {{{ 0x00dbf569, 0x700ab50f, 0xd335b313, 0x9553643c,
+        0xa17dc97e, 0xeea9bddf, 0x3350a2bd, 0x0d12fe3d }}},
+    {{{ 0xa16a3dee, 0xe5ac35fe, 0xf81950c3, 0x4ae4664a,
+        0x3dbbf921, 0x75c63df4, 0x2958a5a6, 0x545b109c }}} },
+  { {{{ 0x0a61b29c, 0xd7a52a98, 0x65aca9ee, 0xe21e0acb,
+        0x5985dcbe, 0x57a69c0f, 0xeb87a534, 0x3c0c1e7b }}},
+    {{{ 0x6384bd2f, 0xf0a0b50d, 0xc6939e4b, 0xff349a34,
+        0x6e2f1973, 0x922c4554, 0xf1347631, 0x74e826b2 }}} },
+  { {{{ 0xa655803c, 0xd7eaa066, 0x38292c5c, 0x09504e76,
+        0x2c874953, 0xe298a02e, 0x8932b73f, 0x225093ed }}},
+    {{{ 0xe69c3efd, 0xf93e2b4d, 0x8a87c799, 0xa2cbd5fc,
+        0x85dba986, 0xdf41da94, 0xccee8edc, 0x36fe85e7 }}} },
+  { {{{ 0x7d742813, 0x78df7dc5, 0x4a193e64, 0x333bcc6d,
+        0x6a966d2d, 0x8242aa25, 0x4cd36d32, 0x03500a94 }}},
+    {{{ 0x580505d7, 0xd5d110fc, 0xfa11e1e9, 0xb2f47e16,
+        0x06eab6b4, 0xd0030f92, 0x62c91d46, 0x2dc80d5f }}} },
+  { {{{ 0x2a75e492, 0x5788b01a, 0xbae31352, 0x992acf54,
+        0x8159db27, 0x4591b980, 0xd3d84740, 0x36c6533c }}},
+    {{{ 0x103883b5, 0xc44c7c00, 0x515d0820, 0x10329423,
+        0x71b9dc16, 0xbd306903, 0xf88f8d32, 0x7edd5a95 }}} },
+  { {{{ 0x005523d7, 0xfd63b1ac, 0xad70dd21, 0x74482e0d,
+        0x02b56105, 0x67c9d9d0, 0x5971b456, 0x4d318012 }}},
+    {{{ 0x841106df, 0xdc9a6f6d, 0xa326987f, 0x7c52ed9d,
+        0x00607ea0, 0x4dbeaa6f, 0x6959e688, 0x115c221d }}} },
+  { {{{ 0xc80f7c16, 0xf8718464, 0xe9930634, 0x05dc8f40,
+        0xc2e9d5f4, 0xefa699bb, 0x021da209, 0x2469e813 }}},
+    {{{ 0xc602a3c4, 0x75c02845, 0x0a200f9d, 0x49d1b2ce,
+        0x2fb3ec8f, 0xd21b75e4, 0xd72a7545, 0x10dd726a }}} },
+  { {{{ 0x63ef1a6c, 0xeda58527, 0x051705e0, 0xb3fc0e72,
+        0x44f1161f, 0xbda6f3ee, 0xf339efe5, 0x7680aebf }}},
+    {{{ 0xb1b070a7, 0xe8d3fd01, 0xdbfbaaa0, 0xc3ff7dbf,
+        0xa320c916, 0xd81ef6f2, 0x62a3b54d, 0x3e22a1fb }}} },
+  { {{{ 0xb1fa18c8, 0xcdbb9187, 0xcb483a17, 0x8ddb5f6b,
+        0xea49af98, 0xc0a880b9, 0xf2dfddd0, 0x53bf600b }}},
+    {{{ 0x9e25b164, 0x4217404c, 0xafb74aa7, 0xfabf06ee,
+        0x2b9f233c, 0xb17712ae, 0xd0eb909e, 0x71f0b344 }}} }
+};
+
+/*
+ * Third table of 16 affine points, indexed by the 4-bit value k2 in
+ * compute_kG_25519.  Entry 0 is the identity element (0, 1).
+ * NOTE(review): presumably the points of precomputed_KG multiplied by
+ * 2^42, since k2 gathers scalar bits 42 positions above those of k0 --
+ * confirm against the table generator.
+ */
+static const ac precomputed_4E_KG[16] = {
+  { {{{ 0, 0, 0, 0, 0, 0, 0, 0 }}},
+    {{{ 1, 0, 0, 0, 0, 0, 0, 0 }}}                         },
+  { {{{ 0xe388a820, 0xbb6ec091, 0x5182278a, 0xa928b283,
+        0xa9a6eb83, 0x2259174d, 0x45500054, 0x184b48cb }}},
+    {{{ 0x26e77c33, 0xfe324dba, 0x83faf453, 0x6679a5e3,
+        0x2380ef73, 0xdd60c268, 0x03dc33a9, 0x3ee0e07a }}} },
+  { {{{ 0xce974493, 0x403aff28, 0x9bf6f5c4, 0x84076bf4,
+        0xecd898fb, 0xec57038c, 0xb663ed49, 0x2898ffaa }}},
+    {{{ 0xf335163d, 0xf4b3bc46, 0xfa4fb6c6, 0xe613a0f4,
+        0xb9934557, 0xe759d6bc, 0xab6c9477, 0x094f3b96 }}} },
+  { {{{ 0x6afffe9e, 0x168bb5a0, 0xee748c29, 0x950f7ad7,
+        0xda17203d, 0xa4850a2b, 0x77289e0f, 0x0062f7a7 }}},
+    {{{ 0x4b3829fa, 0x6265d4e9, 0xbdfcd386, 0x4f155ada,
+        0x475795f6, 0x9f38bda4, 0xdece4a4c, 0x560ed4b3 }}} },
+  { {{{ 0x141e648a, 0xdad4570a, 0x019b965c, 0x8bbf674c,
+        0xdb08fe30, 0xd7a8d50d, 0xa2851109, 0x7efb45d3 }}},
+    {{{ 0xd0c28cda, 0x52e818ac, 0xa321d436, 0x792257dd,
+        0x9d71f8b7, 0x867091c6, 0x11a1bf56, 0x0fe1198b }}} },
+  { {{{ 0x06137ab1, 0x4e848339, 0x3e6674cc, 0x5673e864,
+        0x0140502b, 0xad882043, 0x6ea1e46a, 0x34b5c0cb }}},
+    {{{ 0x1d70aa7c, 0x29786814, 0x8cdbb8aa, 0x840ae3f9,
+        0xbd4801fb, 0x78b4d622, 0xcf18ae9a, 0x6cf4e146 }}} },
+  { {{{ 0x36297168, 0x95c270ad, 0x942e7812, 0x2303ce80,
+        0x0205cf0e, 0x71908cc2, 0x32bcd754, 0x0cc15edd }}},
+    {{{ 0x2c7ded86, 0x1db94364, 0xf141b22c, 0xc694e39b,
+        0x5e5a9312, 0xf22f64ef, 0x3c5e6155, 0x649b8859 }}} },
+  { {{{ 0xb6417945, 0x0d5611c6, 0xac306c97, 0x9643fdbf,
+        0x0df500ff, 0xe81faaa4, 0x6f50e615, 0x0792c79b }}},
+    {{{ 0xd2af8c8d, 0xb45bbc49, 0x84f51bfe, 0x16c615ab,
+        0xc1d02d32, 0xdc57c526, 0x3c8aaa55, 0x5fb9a9a6 }}} },
+  { {{{ 0xdee40b98, 0x82faa8db, 0x6d520674, 0xff8a5208,
+        0x446ac562, 0x1f8c510f, 0x2cc6b66e, 0x4676d381 }}},
+    {{{ 0x2e7429f4, 0x8f1aa780, 0x8ed6bdf6, 0x2a95c1bf,
+        0x457fa0eb, 0x051450a0, 0x744c57b1, 0x7d89e2b7 }}} },
+  { {{{ 0x3f95ea15, 0xb6bdacd2, 0x2f1a5d69, 0xc9a9d1b1,
+        0xf4d22d72, 0xd4c2f1a9, 0x4dc516b5, 0x73ecfdf1 }}},
+    {{{ 0x05391e08, 0xa1ce93cd, 0x7b8aac17, 0x98f1e99e,
+        0xa098cbb3, 0x9ba84f2e, 0xf9bdd37a, 0x1425aa8b }}} },
+  { {{{ 0x966abfc0, 0x8a385bf4, 0xf081a640, 0x55e5e8bc,
+        0xee26f5ff, 0x835dff85, 0xe509e1ea, 0x4927e622 }}},
+    {{{ 0x352334b0, 0x164c8dbc, 0xa3fea31f, 0xcac1ad63,
+        0x682fd457, 0x9b87a676, 0x1a53145f, 0x75f382ff }}} },
+  { {{{ 0xc3efcb46, 0x16b944f5, 0x68cb184c, 0x1fb55714,
+        0x9ccf2dc8, 0xf1c2b116, 0x808283d8, 0x7417e00f }}},
+    {{{ 0x930199ba, 0x1ea67a22, 0x718990d8, 0x9fbaf765,
+        0x8f3d5d57, 0x231fc664, 0xe5853194, 0x38141a19 }}} },
+  { {{{ 0x2f81290d, 0xb9f00390, 0x04a9ca6c, 0x44877827,
+        0xe1dbdd65, 0x65d7f9b9, 0xf7c6698a, 0x7133424c }}},
+    {{{ 0xa7cd250f, 0x604cfb3c, 0x5acc18f3, 0x460c3c4b,
+        0xb518e3eb, 0xa53e50e0, 0x98a40196, 0x2b4b9267 }}} },
+  { {{{ 0xc5dbd06c, 0x591b0672, 0xaa1eeb65, 0x10d43dca,
+        0xcd2517af, 0x420cdef8, 0x0b695a8a, 0x513a307e }}},
+    {{{ 0x66503215, 0xee9d6a7b, 0x088fd9a4, 0xdea58720,
+        0x973afe12, 0x8f3cbbea, 0x872f2538, 0x005c2350 }}} },
+  { {{{ 0x35af3291, 0xe5024b70, 0x4f5e669a, 0x1d3eec2d,
+        0x6e79d539, 0xc1f6d766, 0x795b5248, 0x34ec043f }}},
+    {{{ 0x400960b6, 0xb2763511, 0x29e57df0, 0xff7a3d84,
+        0x1666c1f1, 0xaeac7792, 0x66084bc0, 0x72426e97 }}} },
+  { {{{ 0x44f826ca, 0x5b1c3199, 0x790aa408, 0x68b00b73,
+        0x69e9b92b, 0xaf0984b4, 0x3ffe9093, 0x5fe6736f }}},
+    {{{ 0xffd49312, 0xd67f2889, 0x5cb9ed21, 0x3520d747,
+        0x3c65a606, 0x94f893b1, 0x2d65496f, 0x2fee5e8c }}} }
+};
+
+/**
+ * @brief	X  = k * G
+ *
+ * @param X	Destination AC
+ * @param K	scalar k
+ *
+ * The function returns nothing.
+ *
+ * The loop runs 21 times (i = 20 down to 0).  Each round doubles the
+ * accumulator once and adds one entry from each of the three
+ * precomputed tables; the entries are selected by 4-bit indices
+ * gathered from bits of K that lie 63 positions apart.  The table
+ * lookups are indexed by bits of K.
+ */
+static void
+compute_kG_25519 (ac *X, const bn256 *K)
+{
+  ptc Q[1];
+  int i;
+
+  /* identity element */
+  memset (Q, 0, sizeof (ptc));
+  Q->y->word[0] = 1;
+  Q->z->word[0] = 1;
+
+  for (i = 20; i >= 0; i--)
+    {
+      int k0, k1, k2;
+
+      /* k0: bits i, i+63, i+126 and i+189 of K (lowest bit first).  */
+      k0 = ((K->word[0] >> i) & 1)
+	| (i < 1 ? ((K->word[1] >> 30) & 2)
+	   : (((K->word[2] >> (i-1)) & 1) << 1))
+	| (i < 2 ? ((K->word[3] >> (i+28)) & 4)
+	   : (((K->word[4] >> (i-2)) & 1) << 2))
+	| (i < 3 ? ((K->word[5] >> (i+26)) & 8)
+	   : (((K->word[6] >> (i-3)) & 1) << 3));
+
+      /* k1: bits i+21, i+84, i+147 and i+210 of K.  */
+      k1 = (i < 11 ? ((K->word[0] >> (i+21)) & 1)
+	    : ((K->word[1] >> (i-11)) & 1))
+	| (i < 12 ? ((K->word[2] >> (i+19)) & 2)
+	   : (((K->word[3] >> (i-12)) & 1) << 1))
+	| (i < 13 ? ((K->word[4] >> (i+17)) & 4)
+	   : (((K->word[5] >> (i-13)) & 1) << 2))
+	| (i < 14 ? ((K->word[6] >> (i+15)) & 8)
+	   : (((K->word[7] >> (i-14)) & 1) << 3));
+
+      /* k2: bits i+42, i+105, i+168 and i+231 of K.  */
+      k2 = ((K->word[1] >> (i+10)) & 1)
+	| ((K->word[3] >> (i+8)) & 2)
+	| ((K->word[5] >> (i+6)) & 4)
+	| ((K->word[7] >> (i+4)) & 8);
+
+      point_double (Q, Q);
+      point_add (Q, Q, &precomputed_KG[k0]);
+      point_add (Q, Q, &precomputed_2E_KG[k1]);
+      point_add (Q, Q, &precomputed_4E_KG[k2]);
+    }
+
+  point_ptc_to_ac (X, Q);
+}
+
+
+/* Number of 32-bit words in a 416-bit and in a 128-bit value.  */
+#define BN416_WORDS 13
+#define BN128_WORDS 4
+
+/* M: The order of the generator G.  */
+/* Stored least significant 32-bit word first.  */
+static const bn256 M[1] = {
+  {{  0x5CF5D3ED, 0x5812631A, 0xA2F79CD6, 0x14DEF9DE,
+      0x00000000, 0x00000000, 0x00000000, 0x10000000  }}
+};
+
+/*
+ * C views M as an array of 32-bit words.  bnX_mul_C only reads
+ * C[0] .. C[BN128_WORDS-1], i.e. the low 128 bits of M.
+ */
+#define C ((const uint32_t *)M)
+
+/**
+ * @brief	R = Q * C, with C the low BN128_WORDS words of M
+ *
+ * @param r		Destination; q_size + BN128_WORDS words are written
+ * @param q		Multiplicand of q_size 32-bit words, least
+ *			significant word first
+ * @param q_size	Number of words in Q
+ *
+ * For every output word k, all partial products whose word indices
+ * add up to k are summed into the three-word accumulator
+ * (r0, r1, r2).  The lowest word is then stored in r[k] and the
+ * accumulator is shifted down by one word.
+ */
+static void
+bnX_mul_C (uint32_t *r, const uint32_t *q, int q_size)
+{
+  int i, j, k;
+  int i_beg, i_end;
+  uint32_t r0, r1, r2;
+
+  r0 = r1 = r2 = 0;
+  for (k = 0; k <= q_size + BN128_WORDS - 2; k++)
+    {
+      /*
+       * Select the range of i for column k.  i indexes C when Q is the
+       * shorter operand and indexes Q otherwise; j = k - i indexes the
+       * other operand.  The bounds keep both indices inside their
+       * operands.
+       */
+      if (q_size < BN128_WORDS)
+	if (k < q_size)
+	  {
+	    i_beg = 0;
+	    i_end = k;
+	  }
+	else
+	  {
+	    i_beg = k - q_size + 1;
+	    i_end = k;
+	    if (i_end > BN128_WORDS - 1)
+	      i_end = BN128_WORDS - 1;
+	  }
+      else
+	if (k < BN128_WORDS)
+	  {
+	    i_beg = 0;
+	    i_end = k;
+	  }
+	else
+	  {
+	    i_beg = k - BN128_WORDS + 1;
+	    i_end = k;
+	    if (i_end > q_size - 1)
+	      i_end = q_size - 1;
+	  }
+
+      for (i = i_beg; i <= i_end; i++)
+	{
+	  uint64_t uv;
+	  uint32_t u, v;
+	  uint32_t carry;
+
+	  j = k - i;
+	  if (q_size < BN128_WORDS)
+	    uv = ((uint64_t )q[j])*((uint64_t )C[i]);
+	  else
+	    uv = ((uint64_t )q[i])*((uint64_t )C[j]);
+	  /* Add the 64-bit product (u:v) to (r2:r1:r0) with carries.  */
+	  v = uv;
+	  u = (uv >> 32);
+	  r0 += v;
+	  carry = (r0 < v);
+	  r1 += carry;
+	  carry = (r1 < carry);
+	  r1 += u;
+	  carry += (r1 < u);
+	  r2 += carry;
+	}
+
+      /* Column k is complete: store it and shift the accumulator.  */
+      r[k] = r0;
+      r0 = r1;
+      r1 = r2;
+      r2 = 0;
+    }
+
+  /* Most significant word, at index q_size + BN128_WORDS - 1.  */
+  r[k] = r0;
+}
+
+/**
+ * @brief R = A mod M (using M=2^252+C) (Barrett reduction)
+ *
+ * See HAC 14.47 and 14.52.
+ *
+ * A is split at bit 252 into a quotient part Q and a remainder part R.
+ * Q*C is then folded back in three rounds with alternating sign
+ * (subtract, add, subtract); Q shrinks from 9 words to 5 words to 1 word.
+ *
+ * @param R	Result (bn256)
+ * @param A	512-bit input
+ */
+static void
+mod_reduce_M (bn256 *R, const bn512 *A)
+{
+  uint32_t q[BN256_WORDS+1];
+  uint32_t tmp[BN416_WORDS];
+  bn256 r[1];
+  uint32_t carry, next_carry;
+  int i;
+  /* The same variable holds a carry while shifting and a borrow after
+     a subtraction.  */
+#define borrow carry
+
+  /* Q = A / 2^252 (9 words), R = A % 2^252 */
+  q[8] = A->word[15]>>28;
+  carry = A->word[15] & 0x0fffffff;
+  for (i = BN256_WORDS - 1; i >= 0; i--)
+    {
+      next_carry = A->word[i+7] & 0x0fffffff;
+      q[i] = (A->word[i+7] >> 28) | (carry << 4);
+      carry = next_carry;
+    }
+  memcpy (R, A, sizeof (bn256));
+  R->word[7] &= 0x0fffffff;
+
+  /* Q_size: 9 */
+  bnX_mul_C (tmp, q, 9); /* TMP = Q*C */
+  /* Q = tmp / 2^252 */
+  carry = tmp[12] & 0x0fffffff;
+  for (i = 4; i >= 0; i--)
+    {
+      next_carry = tmp[i+7] & 0x0fffffff;
+      q[i] = (tmp[i+7] >> 28) | (carry << 4);
+      carry = next_carry;
+    }
+  /* R' = tmp % 2^252 */
+  memcpy (r, tmp, sizeof (bn256));
+  r->word[7] &= 0x0fffffff;
+  /* R -= R' */
+  borrow = bn256_sub (R, R, r);
+  if (borrow)
+    bn256_add (R, R, M);
+  else
+    /* Result goes to the scratch buffer and is never read; presumably
+       this keeps both branches doing the same amount of work.  */
+    bn256_add ((bn256 *)tmp, R, M);
+
+  /* Q_size: 5 */
+  bnX_mul_C (tmp, q, 5); /* TMP = Q*C */
+  /* Q = tmp / 2^252 (a single word) */
+  carry = tmp[8] & 0x0fffffff;
+  q[0] = (tmp[7] >> 28) | (carry << 4);
+  /* R' = tmp % 2^252 */
+  memcpy (r, tmp, sizeof (bn256));
+  r->word[7] &= 0x0fffffff;
+  /* R += R' */
+  bn256_add (R, R, r);
+  borrow = bn256_sub (R, R, M);
+  if (borrow)
+    bn256_add (R, R, M);
+  else
+    /* Discarded add into scratch, as above.  */
+    bn256_add ((bn256 *)tmp, R, M);
+
+  /* Q_size: 1 */
+  bnX_mul_C (tmp, q, 1); /* TMP = Q*C */
+  /* R' = tmp % 2^252 */
+  /* Only five words of tmp are copied; the upper three words of r
+     are cleared.  */
+  memset (((uint8_t *)r)+(sizeof (uint32_t)*5), 0, sizeof (uint32_t)*3);
+  memcpy (r, tmp, sizeof (uint32_t)*5);
+  /* R -= R' */
+  borrow = bn256_sub (R, R, r);
+  if (borrow)
+    bn256_add (R, R, M);
+  else
+    /* Discarded add into scratch, as above.  */
+    bn256_add ((bn256 *)tmp, R, M);
+#undef borrow
+}
+
+
+/**
+ * @brief Compute an EdDSA (Ed25519) signature.
+ *
+ * @param input	Message bytes
+ * @param ilen	Length of INPUT in bytes
+ * @param out	Receives 16 words: the encoded point R in the first eight,
+ *		the scalar S in the last eight
+ * @param a	Secret scalar
+ * @param seed	32 bytes hashed in front of the message to derive the nonce
+ * @param pk	Encoded public key (32 bytes are hashed)
+ *
+ * @return Always 0.
+ *
+ * The SHA-512 output buffer is declared as an array of uint32_t so that
+ * viewing it as a bn512 is a properly aligned word access; it is passed
+ * to the hash functions as bytes.
+ */
+int
+eddsa_sign_25519 (const uint8_t *input, size_t ilen, uint32_t *out,
+                  const bn256 *a, const uint8_t *seed, const bn256 *pk)
+{
+  bn256 *r, *s;
+  sha512_context ctx;
+  uint32_t hash[64/4];
+  bn256 tmp[1];
+  ac R[1];
+  uint32_t carry, borrow;
+
+  r = (bn256 *)out;
+  s = (bn256 *)(out+(32/4));
+
+  /* Nonce: r = H(seed || message) mod M.  */
+  sha512_start (&ctx);
+  sha512_update (&ctx, seed, sizeof (bn256)); /* It's upper half of the hash */
+  sha512_update (&ctx, input, ilen);
+  sha512_finish (&ctx, (uint8_t *)hash);
+
+  mod_reduce_M (r, (bn512 *)hash);
+  compute_kG_25519 (R, r);
+
+  /* EdDSA encoding.  */
+  memcpy (tmp, R->y, sizeof (bn256));
+  tmp->word[7] ^= mod25519_is_neg (R->x) * 0x80000000;
+
+  /* s = H(enc(R) || pk || message) mod M.  */
+  sha512_start (&ctx);
+  sha512_update (&ctx, (uint8_t *)tmp, sizeof (bn256));
+  sha512_update (&ctx, (uint8_t *)pk, sizeof (bn256));
+  sha512_update (&ctx, input, ilen);
+  sha512_finish (&ctx, (uint8_t *)hash);
+
+  /* s = (s * a + r) mod M.  */
+  mod_reduce_M (s, (bn512 *)hash);
+  bn256_mul ((bn512 *)hash, s, a);
+  mod_reduce_M (s, (bn512 *)hash);
+  carry = bn256_add (s, s, r);
+  borrow = bn256_sub (s, s, M);
+
+  /* r is no longer needed as a scalar: overwrite it with the encoded R.  */
+  memcpy (r, tmp, sizeof (bn256));
+
+  if ((borrow && !carry))
+    bn256_add (s, s, M);
+  else
+    /* Result is discarded into tmp; the add is performed on both paths.  */
+    bn256_add (tmp, s, M);
+
+  return 0;
+}
+
+/*
+ * Derive the encoded public key PK from the scalar A.
+ *
+ * The scalar is passed through bn256_shift with a count of -3
+ * (presumably a right shift by three bits -- confirm against bn256_shift),
+ * multiplied by G, and the resulting point is doubled three times
+ * before being encoded.
+ */
+static void
+eddsa_public_key_25519 (bn256 *pk, const bn256 *a)
+{
+  bn256 shifted[1];
+  ac affine[1];
+  ptc proj[1];
+  int n;
+
+  bn256_shift (shifted, a, -3);
+  compute_kG_25519 (affine, shifted);
+
+  /* Copy x and y, then set Z = 1 to obtain a ptc value.  */
+  memcpy (proj, affine, sizeof (ac));
+  memset (proj->z, 0, sizeof (bn256));
+  proj->z->word[0] = 1;
+
+  for (n = 0; n < 3; n++)
+    point_double (proj, proj);
+  point_ptc_to_ac (affine, proj);
+
+  /* EdDSA encoding: y, with the top bit toggled when x is "negative".  */
+  memcpy (pk, affine->y, sizeof (bn256));
+  pk->word[7] ^= mod25519_is_neg (affine->x) * 0x80000000;
+}
+
+
+/*
+ * Compute the encoded Ed25519 public key for the scalar stored at KD.
+ *
+ * KD:     32 bytes holding the secret scalar.
+ * PUBKEY: receives the 32-byte encoded public key.
+ *
+ * Both buffers are plain byte pointers and need not be word aligned:
+ * the data is moved through local bn256 objects instead of casting the
+ * byte pointers to bn256 pointers.
+ */
+void
+eddsa_compute_public_25519 (const uint8_t *kd, uint8_t *pubkey)
+{
+  bn256 scalar[1];
+  bn256 encoded[1];
+  volatile uint8_t *wipe = (volatile uint8_t *)scalar;
+  size_t n;
+
+  memcpy (scalar, kd, sizeof (bn256));
+  eddsa_public_key_25519 (encoded, scalar);
+  memcpy (pubkey, encoded, sizeof (bn256));
+
+  /* Clear the local copy of the secret; volatile keeps the stores.  */
+  for (n = 0; n < sizeof (bn256); n++)
+    wipe[n] = 0;
+}
+
+
+/* Everything down to the matching #endif is excluded from compilation.  */
+#if 0
+/**
+ * check if P is on the curve.
+ *
+ * Return -1 on error.
+ * Return 0 on success.
+ *
+ * NOTE: unimplemented stub -- the body declares two temporaries and
+ * contains no computation and no return statement.
+ */
+static int
+point_is_on_the_curve (const ac *P)
+{
+  bn256 s[1], t[1];
+
+  /* Twisted Edwards curve: a*x^2 + y^2 = 1 + d*x^2*y^2 */
+}
+
+/* Prototype only; no definition is given in this disabled region.  */
+int
+compute_kP_25519 (ac *X, const bn256 *K, const ac *P);
+#endif
+
+#ifdef PRINT_OUT_TABLE
+/*
+ * G: starting point for the table generator in main().  Three
+ * coordinates (x, y, z) as little-endian 32-bit words; z is 1.
+ * The x and y values match the Ed25519 base point given in RFC 8032.
+ */
+static const ptc G[1] = {{
+  {{{ 0x8f25d51a, 0xc9562d60, 0x9525a7b2, 0x692cc760,
+      0xfdd6dc5c, 0xc0a4e231, 0xcd6e53fe, 0x216936d3 }}},
+  {{{ 0x66666658, 0x66666666, 0x66666666, 0x66666666,
+      0x66666666, 0x66666666, 0x66666666, 0x66666666 }}},
+  {{{ 1, 0, 0, 0, 0, 0, 0, 0 }}},
+}};
+
+#include <stdio.h>
+
+#ifdef TESTING_EDDSA
+/* Print X as 64 hexadecimal digits, most significant word first,
+   followed by a newline.  */
+static void
+print_bn256 (const bn256 *X)
+{
+  int idx = 8;
+
+  while (idx-- > 0)
+    printf ("%08x", X->word[idx]);
+  puts ("");
+}
+#endif
+
+#if 0
+/*
+ * Print the x and y coordinates of X.
+ *
+ * With PRINT_OUT_TABLE_AS_C the output is one C initializer entry
+ * (words in ascending order, four per line); otherwise each coordinate
+ * is printed as a single hexadecimal number between "--" markers.
+ */
+static void
+print_point (const ac *X)
+{
+  int w;
+
+#ifdef PRINT_OUT_TABLE_AS_C
+  fputs ("  { {{{ ", stdout);
+  for (w = 0; w < 7; w++)
+    {
+      printf ("0x%08x, ", X->x->word[w]);
+      if (w == 3)
+        fputs ("\n        ", stdout);
+    }
+  printf ("0x%08x }}},\n", X->x->word[7]);
+
+  fputs ("    {{{ ", stdout);
+  for (w = 0; w < 7; w++)
+    {
+      printf ("0x%08x, ", X->y->word[w]);
+      if (w == 3)
+        fputs ("\n        ", stdout);
+    }
+  printf ("0x%08x }}} },\n", X->y->word[7]);
+#else
+  puts ("--");
+  w = 8;
+  while (w-- > 0)
+    printf ("%08x", X->x->word[w]);
+  puts ("");
+  w = 8;
+  while (w-- > 0)
+    printf ("%08x", X->y->word[w]);
+  puts ("");
+  puts ("--");
+#endif
+}
+
+/* Print the x, y and z coordinates of X, one hexadecimal number per
+   line (most significant word first), between "---" markers.  */
+static void
+print_point_ptc (const ptc *X)
+{
+  const uint32_t *coord[3];
+  int c, w;
+
+  coord[0] = X->x->word;
+  coord[1] = X->y->word;
+  coord[2] = X->z->word;
+
+  puts ("---");
+  for (c = 0; c < 3; c++)
+    {
+      for (w = 7; w >= 0; w--)
+        printf ("%08x", coord[c][w]);
+      puts ("");
+    }
+  puts ("---");
+}
+#endif
+
+
+#ifndef TESTING_EDDSA
+/*
+ * Double the point held in `a` N times in place, then store the
+ * converted result in A.
+ *
+ * NOTE(review): this calls ed_double_25638 / ptc_to_ac_25519 while the
+ * signing code in this file calls point_double / point_ptc_to_ac --
+ * confirm the names used here are still defined.
+ */
+static void power_2 (ac *A, ptc *a, int N)
+{
+  while (N-- > 0)
+    ed_double_25638 (a, a);
+
+  ptc_to_ac_25519 (A, a);
+}
+
+/*
+ * Print the 15 non-empty subset sums of the four given points:
+ * entry i (1..15) is the sum of those points whose bit is set in i,
+ * where a0001 is bit 0 and a1000 is bit 3.  A blank line follows the
+ * table.
+ */
+static void print_table (ac *a0001, ac *a0010, ac *a0100, ac *a1000)
+{
+  ac *term[4];
+  ptc sum[1];
+  ac out[1];
+  int entry, bit;
+
+  term[0] = a0001;
+  term[1] = a0010;
+  term[2] = a0100;
+  term[3] = a1000;
+
+  for (entry = 1; entry < 16; entry++)
+    {
+      /* Start from the identity element.  */
+      memset (sum, 0, sizeof (ptc));
+      sum->y->word[0] = 1;
+      sum->z->word[0] = 1;
+
+      for (bit = 0; bit < 4; bit++)
+        if ((entry >> bit) & 1)
+          ed_add_25638 (sum, sum, term[bit]);
+
+      ptc_to_ac_25519 (out, sum);
+      print_point (out);
+    }
+
+  fputs ("\n", stdout);
+}
+
+/*
+ * Starting from a0001, fill a0010, a0100 and a1000 by repeated
+ * doubling (63 doublings between consecutive points), then print the
+ * table built from the four points.
+ */
+static void compute_and_print_table (ac *a0001, ac *a0010, ac *a0100, ac *a1000)
+{
+  ac *dst[3];
+  ptc cur[1];
+  int n;
+
+  dst[0] = a0010;
+  dst[1] = a0100;
+  dst[2] = a1000;
+
+  /* Copy x and y of a0001 and set Z = 1.  */
+  memcpy (cur, a0001, sizeof (ac));
+  memset (cur->z, 0, sizeof (bn256));
+  cur->z->word[0] = 1;
+
+  /* cur keeps accumulating doublings across the three calls.  */
+  for (n = 0; n < 3; n++)
+    power_2 (dst[n], cur, 63);
+
+  print_table (a0001, a0010, a0100, a1000);
+}
+#endif
+
+/*
+ * Stand-alone driver, compiled only when PRINT_OUT_TABLE is defined.
+ *
+ * With TESTING_EDDSA: derives a scalar and seed from a fixed secret key,
+ * signs the empty message and compares the signature with the expected
+ * R and S.  Returns 1 (after printing the computed values) on mismatch.
+ *
+ * Without TESTING_EDDSA: prints three precomputed-point tables, advancing
+ * the starting point by 21 doublings between tables.
+ *
+ * Returns 0 on success.
+ */
+int
+main (int argc, char *argv[])
+{
+#ifdef TESTING_EDDSA
+  uint8_t hash[64];
+  bn256 a[1];
+  /* Signature buffer as words: eddsa_sign_25519 takes a uint32_t *.  */
+  uint32_t r_s[64/4];
+  bn256 pk[1];
+  bn256 *r, *s;
+
+  const bn256 sk[1] = {
+    {{ 0x9db1619d, 0x605afdef, 0xf44a84ba, 0xc42cec92,
+       0x69c54944, 0x1969327b, 0x03ac3b70, 0x607fae1c }} };
+
+  const bn256 r_expected[1] = {
+    {{ 0x004356e5, 0x72ac60c3, 0xcce28690, 0x8a826e80,
+       0x1e7f8784, 0x74d9e5b8, 0x65e073d8, 0x55014922 }} };
+
+  const bn256 s_expected[1] = {
+    {{ 0x1582b85f, 0xac3ba390, 0x70391ec6, 0x6bb4f91c,
+       0xf0f55bd2, 0x24be5b59, 0x43415165, 0x0b107a8e }} };
+
+  r = (bn256 *)r_s;
+  s = (bn256 *)(r_s+(32/4));
+
+  /* Clamp the low half of the hash to obtain the secret scalar;
+     the high half (hash+32) is the seed.  */
+  sha512 ((uint8_t *)sk, sizeof (bn256), hash);
+  hash[0] &= 248;
+  hash[31] &= 127;
+  hash[31] |= 64;
+  memcpy (a, hash, sizeof (bn256));
+
+  eddsa_public_key_25519 (pk, a);
+  eddsa_sign_25519 ((const uint8_t *)"", 0, r_s, a, hash+32, pk);
+
+  if (memcmp (r, r_expected, sizeof (bn256)) != 0
+      || memcmp (s, s_expected, sizeof (bn256)) != 0)
+    {
+      print_bn256 (r);
+      print_bn256 (s);
+      return 1;
+    }
+#else
+  ac a0001[1], a0010[1], a0100[1], a1000[1];
+  ptc a[1];
+
+  memcpy (a, G, sizeof (ptc));
+  ptc_to_ac_25519 (a0001, a);
+  compute_and_print_table (a0001, a0010, a0100, a1000);
+
+  memcpy (a, a0001, sizeof (ac));
+  memset (a->z, 0, sizeof (bn256));
+  a->z->word[0] = 1;
+  power_2 (a0001, a, 21);
+  compute_and_print_table (a0001, a0010, a0100, a1000);
+
+  memcpy (a, a0001, sizeof (ac));
+  memset (a->z, 0, sizeof (bn256));
+  a->z->word[0] = 1;
+  power_2 (a0001, a, 21);
+  compute_and_print_table (a0001, a0010, a0100, a1000);
+#endif
+
+  /* Command-line arguments are not used.  */
+  (void)argc;
+  (void)argv;
+
+  return 0;
+}
+#endif
--- a/ecc-ed448.c
+++ b/ecc-ed448.c
@ -0,0 +1,824 @@
+/*                                                    -*- coding: utf-8 -*-
+ * ecc-ed448.c - Elliptic curve computation for
+ *               the twisted Edwards curve: -x^2 + y^2 = 1 + d*x^2*y^2
+ *               d = -39081
+ *
+ * Copyright (C) 2021  Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * IMPLEMENTATION NOTE
+ *
+ * (0) We assume that the processor has no cache, nor branch target
+ *     prediction.  Thus, we don't avoid indexing by secret value.
+ *     We don't avoid conditional jump if both cases have same timing,
+ *     either.
+ *
+ * (1) We use fixed base comb multiplication.  Scalar is 448-bit.
+ *     We use two tables, and a table has 16 points.
+ *     Window size W = 4-bit, E = 56.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "p448.h"
+#include "shake256.h"
+
+
+/* Sizes, in 32-bit words, of the multi-precision values used below.  */
+#define C_WORDS      7	/* the constant C */
+#define BN448_WORDS 14	/* a 448-bit value */
+#define BN690_WORDS 22	/* scratch product buffer in mod_reduce_M */
+#define BN896_WORDS 28	/* product of two 448-bit values */
+#define BN912_WORDS 29 /* 28.5 */
+
+/* 448-bit unsigned integer.  */
+typedef struct bn448 {
+  uint32_t word[ BN448_WORDS ]; /* Little endian */
+} bn448;
+
+/* 896-bit unsigned integer (output of bn448_mul).  */
+typedef struct bn896 {
+  uint32_t word[ BN896_WORDS ]; /* Little endian */
+} bn896;
+
+/* Up to 912-bit unsigned integer (input of mod_reduce_M).  */
+typedef struct bn912 {
+  uint32_t word[ BN912_WORDS ]; /* Little endian */
+} bn912;
+
+/* M: the modulus used by bn448_addm and mod_reduce_M, stored as
+   little-endian 32-bit words.  mod_reduce_M treats it as M = 2^446 - C.  */
+static const bn448 M[1] = {{{
+  0xab5844f3, 0x2378c292, 0x8dc58f55, 0x216cc272,
+  0xaed63690, 0xc44edb49, 0x7cca23e9, 0xffffffff,
+  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+  0xffffffff, 0x3fffffff
+}}};
+
+/* C: the constant multiplied in by bnX_mul_C (C_WORDS little-endian
+   words).  */
+static const uint32_t C[C_WORDS] = {
+  0x54a7bb0d, 0xdc873d6d, 0x723a70aa, 0xde933d8d,
+  0x5129c96f, 0x3bb124b6, 0x8335dc16
+};
+
+
+/* X <= A + B.  Returns the carry out of the most significant word.  */
+static uint32_t
+bn448_add (bn448 *X, const bn448 *A, const bn448 *B)
+{
+  uint32_t carry = 0;
+  int n;
+
+  for (n = 0; n < BN448_WORDS; n++)
+    {
+      const uint32_t addend = B->word[n];
+      uint32_t sum = A->word[n] + carry;
+
+      /* A wrap-around shows up as the sum being smaller than what
+         was just added.  */
+      carry = (sum < carry);
+      sum += addend;
+      carry += (sum < addend);
+      X->word[n] = sum;
+    }
+
+  return carry;
+}
+
+/* X <= A - B.  Returns the borrow out of the most significant word.  */
+static uint32_t
+bn448_sub (bn448 *X, const bn448 *A, const bn448 *B)
+{
+  uint32_t borrow = 0;
+  int n;
+
+  for (n = 0; n < BN448_WORDS; n++)
+    {
+      const uint32_t minuend = A->word[n];
+      const uint32_t subtrahend = B->word[n];
+      const uint32_t under = (minuend < borrow);
+      const uint32_t diff = minuend - borrow;
+
+      borrow = (diff < subtrahend) + under;
+      X->word[n] = diff - subtrahend;
+    }
+
+  return borrow;
+}
+
+
+/*
+ * R = Q * C, computed column by column (product scanning).
+ *
+ * Q has Q_SIZE little-endian 32-bit words; C is the file-level constant
+ * of C_WORDS words.  R receives Q_SIZE + C_WORDS words.
+ */
+static void
+bnX_mul_C (uint32_t *r, const uint32_t *q, int q_size)
+{
+  /* The inner index walks the longer operand, the derived index the
+     shorter one.  */
+  const int q_is_short = (q_size < C_WORDS);
+  const int long_len = q_is_short ? C_WORDS : q_size;
+  const int short_len = q_is_short ? q_size : C_WORDS;
+  const int last_col = q_size + C_WORDS - 2;
+  uint32_t acc_lo = 0, acc_mid = 0, acc_hi = 0;
+  int col;
+
+  for (col = 0; col <= last_col; col++)
+    {
+      int first = col - short_len + 1;
+      int stop = col;
+      int n;
+
+      if (first < 0)
+        first = 0;
+      if (stop > long_len - 1)
+        stop = long_len - 1;
+
+      for (n = first; n <= stop; n++)
+        {
+          const int m = col - n;
+          uint64_t prod, t;
+
+          if (q_is_short)
+            prod = ((uint64_t)q[m]) * ((uint64_t)C[n]);
+          else
+            prod = ((uint64_t)q[n]) * ((uint64_t)C[m]);
+
+          /* Add the 64-bit product into the 96-bit column accumulator.  */
+          t = (uint64_t)acc_lo + (uint32_t)prod;
+          acc_lo = (uint32_t)t;
+          t = (uint64_t)acc_mid + (uint32_t)(prod >> 32) + (t >> 32);
+          acc_mid = (uint32_t)t;
+          acc_hi += (uint32_t)(t >> 32);
+        }
+
+      /* Emit the finished column and shift the accumulator down.  */
+      r[col] = acc_lo;
+      acc_lo = acc_mid;
+      acc_mid = acc_hi;
+      acc_hi = 0;
+    }
+
+  /* Most significant word (col == q_size + C_WORDS - 1 here).  */
+  r[col] = acc_lo;
+}
+
+/*
+ * X <= X + A when COND != 0
+ * X <= X     when COND == 0
+ *
+ * The addition is always carried out; A is masked to zero when COND is 0.
+ * The final carry is not reported.
+ */
+static void
+bn448_add_cond (bn448 *X, const bn448 *A, int cond)
+{
+  /* All ones when COND is non-zero, all zeros otherwise.  */
+  const uint32_t mask = (uint32_t)0 - (uint32_t)(cond != 0);
+  uint32_t carry = 0;
+  int n;
+
+  for (n = 0; n < BN448_WORDS; n++)
+    {
+      const uint32_t current = X->word[n];
+      uint32_t sum = (A->word[n] & mask) + carry;
+
+      carry = (sum < carry);
+      sum += current;
+      carry += (sum < current);
+      X->word[n] = sum;
+    }
+}
+
+
+/*
+ * X <= X + A mod M
+ *
+ * Adds, subtracts M unconditionally, then adds M back when that
+ * subtraction borrowed.
+ */
+static void
+bn448_addm (bn448 *X, const bn448 *A)
+{
+  uint32_t went_below_zero;
+
+  bn448_add (X, X, A);
+  went_below_zero = bn448_sub (X, X, M);
+  bn448_add_cond (X, M, went_below_zero);
+}
+
+/**
+ * @brief R = A mod M (using M=2^446-C) (Barrett reduction)
+ *
+ * See HAC 14.47.
+ *
+ * A is split at bit 446 into a quotient part Q and a remainder part R.
+ * Q*C is then folded back into R in three rounds, each using
+ * bn448_addm; Q shrinks from 15 words to 8 words to 1 word.
+ *
+ * @param R	Result (bn448)
+ * @param A	Input of up to 912 bits
+ */
+void
+mod_reduce_M (bn448 *R, const bn912 *A)
+{
+  uint32_t q[BN448_WORDS+1];
+  uint32_t tmp[BN690_WORDS];
+  bn448 r[1];
+  uint32_t carry, next_carry;
+  int i;
+
+  /* Q = A / 2^446 *//* 466-bit */
+  /* Upper half of A->word[28] must be zero.  */
+  q[14] = (A->word[28] << 2) | (A->word[27] >> 30);
+  carry = A->word[27] & 0x3fffffff;
+  for (i = BN448_WORDS - 1; i >= 0; i--)
+    {
+      next_carry = A->word[i+13] & 0x3fffffff;
+      q[i] = (A->word[i+13] >> 30) | (carry << 2);
+      carry = next_carry;
+    }
+  /* R = A % 2^446 */
+  memcpy (R, A, sizeof (bn448));
+  R->word[13] &= 0x3fffffff;
+
+  /* Q_size: 15 *//* 466-bit */
+  bnX_mul_C (tmp, q, 15); /* TMP = Q*C *//* 690-bit */
+  /* Q = tmp / 2^446 *//* 244-bit */
+  carry = tmp[21];
+  for (i = 7; i >= 0; i--)
+    {
+      next_carry = tmp[i+13] & 0x3fffffff;
+      q[i] = (tmp[i+13] >> 30) | (carry << 2);
+      carry = next_carry;
+    }
+  /* R' = tmp % 2^446 */
+  memcpy (r, tmp, sizeof (bn448));
+  r->word[13] &= 0x3fffffff;
+  /* R += R' */
+  bn448_addm (R, r);
+
+  /* Q_size: 8 *//* 244-bit */
+  bnX_mul_C (tmp, q, 8); /* TMP = Q*C *//* 468-bit */
+  /* Q = tmp / 2^446 *//* 22-bit */
+  carry = tmp[14];
+  q[0] = (tmp[13] >> 30) | (carry << 2);
+  /* R' = tmp % 2^446 */
+  memcpy (r, tmp, sizeof (bn448));
+  r->word[13] &= 0x3fffffff;
+  /* R += R' */
+  bn448_addm (R, r);
+
+  /* Q_size: 1 */
+  bnX_mul_C (tmp, q, 1); /* TMP = Q*C *//* 246-bit */
+  /* R' = tmp % 2^446 */
+  /* Only eight words of tmp are copied; the upper six words of r
+     are cleared.  */
+  memset (((uint8_t *)r)+(sizeof (uint32_t)*8), 0, sizeof (uint32_t)*6);
+  memcpy (r, tmp, sizeof (uint32_t)*8);
+  /* R += R' */
+  bn448_addm (R, r);
+}
+
+
+/*
+ * X <= A * B, computed column by column (product scanning).
+ * A and B are 448-bit values; X receives the full 896-bit product.
+ */
+static void
+bn448_mul (bn896 *X, const bn448 *A, const bn448 *B)
+{
+  const int last_col = (BN448_WORDS - 1)*2;
+  uint32_t acc_lo = 0, acc_mid = 0, acc_hi = 0;
+  int col;
+
+  for (col = 0; col <= last_col; col++)
+    {
+      int first = col - BN448_WORDS + 1;
+      int stop = col;
+      int n;
+
+      if (first < 0)
+        first = 0;
+      if (stop > BN448_WORDS - 1)
+        stop = BN448_WORDS - 1;
+
+      for (n = first; n <= stop; n++)
+        {
+          const uint64_t prod
+            = ((uint64_t)A->word[n]) * ((uint64_t)B->word[col - n]);
+          uint64_t t;
+
+          /* Add the 64-bit product into the 96-bit column accumulator.  */
+          t = (uint64_t)acc_lo + (uint32_t)prod;
+          acc_lo = (uint32_t)t;
+          t = (uint64_t)acc_mid + (uint32_t)(prod >> 32) + (t >> 32);
+          acc_mid = (uint32_t)t;
+          acc_hi += (uint32_t)(t >> 32);
+        }
+
+      /* Emit the finished column and shift the accumulator down.  */
+      X->word[col] = acc_lo;
+      acc_lo = acc_mid;
+      acc_mid = acc_hi;
+      acc_hi = 0;
+    }
+
+  /* Most significant word (col == 2*BN448_WORDS - 1 here).  */
+  X->word[col] = acc_lo;
+}
+
+static const p448_t nGx0[16] = {
+  { { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x070cc05e, 0x026a82bc, 0x00938e26, 0x080e18b0, 
+      0x0511433b, 0x0f72ab66, 0x0412ae1a, 0x0a3d3a46, 
+      0x0a6de324, 0x00f1767e, 0x04657047, 0x036da9e1, 
+      0x05a622bf, 0x0ed221d1, 0x066bed0d, 0x04f1970c } },
+  { { 0x0464238e, 0x00079817, 0x00d381ca, 0x02110302, 
+      0x0d9f01b5, 0x01cc4c6e, 0x05a131b1, 0x05e35dc5, 
+      0x006944eb, 0x0b61848d, 0x029631a3, 0x083792a0, 
+      0x0afca0dd, 0x0be1017f, 0x0782fcbb, 0x070aaa01 } },
+  { { 0x0e7661f9, 0x0b2f9f62, 0x009fae89, 0x03b99803, 
+      0x066014d2, 0x067900ef, 0x06556c10, 0x0c8eacf3, 
+      0x0ad4a82e, 0x020a44d0, 0x00572f1c, 0x0e7819e7, 
+      0x0fd08cdf, 0x0c0ed140, 0x09aee1da, 0x0a16934a } },
+  { { 0x091780c7, 0x0a7ea989, 0x0d2476b6, 0x004e4ecc, 
+      0x0c494b68, 0x00af9f58, 0x0dee64fd, 0x0e0f269f, 
+      0x0021bd26, 0x085a61f6, 0x0b5d284b, 0x0c265c35, 
+      0x03775afd, 0x058755ea, 0x02ecf2c6, 0x0617f174 } },
+  { { 0x067f4947, 0x0dbf4eb6, 0x0b8716d9, 0x02206a2a, 
+      0x0e7cad5a, 0x04a148b0, 0x0e483133, 0x0fbf12cd, 
+      0x0c6458f7, 0x0e022d5a, 0x01b7e39d, 0x0a60afe6, 
+      0x05a5208c, 0x0c62f458, 0x03311553, 0x0a08a4c3 } },
+  { { 0x0054a90d, 0x0ad5dc54, 0x00ac9fd6, 0x097f2af4, 
+      0x0f4ddbc7, 0x01b0f7b3, 0x0324ce0b, 0x01d5d092, 
+      0x0cd2798f, 0x08cb96e2, 0x0957bc39, 0x0bd045b5, 
+      0x0f76fbfb, 0x046308a9, 0x0ef679ce, 0x0c86d628 } },
+  { { 0x0d5d9262, 0x0f251539, 0x0711a956, 0x0240708f, 
+      0x04a0b0bc, 0x07f7e4dd, 0x055b70a8, 0x065dd24f, 
+      0x07ef8979, 0x0e83cec7, 0x09589db8, 0x0f1db2d1, 
+      0x09d93037, 0x0fcc7e8a, 0x04e0b8f4, 0x0cb99f0b } },
+  { { 0x04acea57, 0x06f24100, 0x0da68597, 0x0dace1c6, 
+      0x050ce77f, 0x0ea7dd41, 0x01585884, 0x01aecb84, 
+      0x0ea4a85c, 0x092ff208, 0x088eebd2, 0x0de9433c, 
+      0x03f4d289, 0x053cd318, 0x026539af, 0x03970858 } },
+  { { 0x0d229665, 0x06e9fd2b, 0x0878dd51, 0x049345aa, 
+      0x0f45bacf, 0x0ccde72a, 0x0be16b6f, 0x0bc249d1, 
+      0x0448a61d, 0x0a25bae9, 0x0d773878, 0x0c93b6ea, 
+      0x02cda508, 0x055f708a, 0x08cf49e6, 0x0fa56852 } },
+  { { 0x093bfef9, 0x07bec8db, 0x0fafda3d, 0x0ce4dcdc, 
+      0x06f62ed7, 0x0a75c872, 0x07b3dadd, 0x0c39ac92, 
+      0x0f926d90, 0x0ae1b8d1, 0x048da0a9, 0x0d7dbeca, 
+      0x02a52b3b, 0x0ec13f74, 0x0d4c5ce2, 0x02071cee } },
+  { { 0x05a644a6, 0x0e56b0a9, 0x0be6360b, 0x01ecf90e, 
+      0x023b73a8, 0x0c3bbcf7, 0x0292054b, 0x05417d25, 
+      0x07b91b46, 0x0ca1ea05, 0x07ea6c44, 0x01560b21, 
+      0x04f12989, 0x0463cd2a, 0x03d7e086, 0x0092781c } },
+  { { 0x0d59796d, 0x0ce08d7e, 0x055bc822, 0x0e464443, 
+      0x0d243cc4, 0x0542002f, 0x098259b3, 0x044fc576, 
+      0x012781de, 0x08650550, 0x0055e6b4, 0x0137f762, 
+      0x0fbf007e, 0x0a391ccc, 0x039fe6f6, 0x0a9c9ad3 } },
+  { { 0x01ca2765, 0x0ccddbb0, 0x0563b46c, 0x05d18f4c, 
+      0x0462647e, 0x02ff700d, 0x0822dc83, 0x0670b143, 
+      0x00013963, 0x01627d78, 0x055dbfb9, 0x0435f413, 
+      0x063d41e8, 0x066c95cd, 0x0c797bba, 0x08e27dfb } },
+  { { 0x03da4531, 0x01ff4dd6, 0x0cd39a3c, 0x02d0de4c, 
+      0x0bc9da8d, 0x0003561e, 0x033e1e9a, 0x001eea00, 
+      0x078bf710, 0x05458c53, 0x0f56338e, 0x069043ab, 
+      0x061ffba0, 0x0637cf41, 0x039fb551, 0x0fc09757 } },
+  { { 0x0256141f, 0x0f1e0e38, 0x00ab2673, 0x0efd5f47, 
+      0x0af4a4af, 0x0b749116, 0x0ac6540b, 0x04242f82, 
+      0x0abaf195, 0x0b26730c, 0x0d06842d, 0x076fbe60, 
+      0x0580cad8, 0x02613d91, 0x0b568ae0, 0x0c2e5b1d } }
+};
+
+static const p448_t nGy0[16] = {
+  { { 0x00000001, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x0230fa14, 0x008795bf, 0x07c8ad98, 0x0132c4ed, 
+      0x09c4fdbd, 0x01ce67c3, 0x073ad3ff, 0x005a0c2d, 
+      0x07789c1e, 0x0a398408, 0x0a73736c, 0x0c7624be, 
+      0x003756c9, 0x02488762, 0x016eb6bc, 0x0693f467 } },
+  { { 0x099945e7, 0x0c63b7a0, 0x0c4486c1, 0x0e9164ec, 
+      0x0885f2c1, 0x0b133e35, 0x0c99ae02, 0x0186f0d3, 
+      0x02bf53e6, 0x02fca492, 0x048a02bc, 0x0f922aa2, 
+      0x00dd3dca, 0x04fe6490, 0x0f6a8207, 0x0e8c313f } },
+  { { 0x0579a4e2, 0x0a1ffe8b, 0x0ce472b4, 0x01d006b3, 
+      0x089def96, 0x07c8f689, 0x0a32ae93, 0x079d7bd1, 
+      0x03a02760, 0x0ebb4776, 0x05b4c55e, 0x019b3c6c, 
+      0x07da436f, 0x066ff782, 0x0659536d, 0x0ee40076 } },
+  { { 0x05ec556a, 0x050109e2, 0x0fd57e39, 0x0235366b, 
+      0x044b6b2e, 0x07b3c976, 0x0b2b7b9c, 0x0f7f9e82, 
+      0x00ec6409, 0x0b6196ab, 0x00a20d9e, 0x088f1d16, 
+      0x0586f761, 0x0e3be3b4, 0x0e26395d, 0x09983c26 } },
+  { { 0x0fab8e56, 0x0ded288e, 0x057277e6, 0x0a4e6f4e, 
+      0x0e949681, 0x0a2a4c4f, 0x0721fdb3, 0x0508a46c, 
+      0x0fb44de2, 0x0f98049e, 0x02fb0f31, 0x071f3724, 
+      0x09067763, 0x0d3fbbb3, 0x0a83faaa, 0x0696ec4a } },
+  { { 0x07a04bb0, 0x0f52ae70, 0x0ae14cdb, 0x0784d14b, 
+      0x034acc37, 0x09aa3869, 0x09703f7b, 0x08f79c87, 
+      0x0264026c, 0x0859cde5, 0x0486b035, 0x0b2a45f7, 
+      0x03d5144b, 0x0809740f, 0x0416dc87, 0x0dcf324d } },
+  { { 0x0a0c8bc7, 0x04125cec, 0x0eac3f20, 0x0d30ff7e, 
+      0x029ad678, 0x06901f05, 0x04805ff1, 0x033c307d, 
+      0x049d6a79, 0x080f0710, 0x02dece6c, 0x0d1ba22b, 
+      0x0778cccb, 0x01692a0b, 0x02df78fb, 0x0f8c02d3 } },
+  { { 0x0b827d87, 0x04b57599, 0x03d77638, 0x0dc82ac0, 
+      0x052f6e61, 0x06943366, 0x0ad5e8a6, 0x0b8fc4b0, 
+      0x0f388642, 0x01b6f7dc, 0x0a74dd57, 0x06f24533, 
+      0x041750cf, 0x0c669378, 0x028a37af, 0x006757eb } },
+  { { 0x080128d5, 0x0ef186a8, 0x04a54843, 0x01ceb43b, 
+      0x045be148, 0x0c112a42, 0x01ac9412, 0x0621b93a, 
+      0x05e16552, 0x0a2ca24f, 0x086301c0, 0x0cf3fecf, 
+      0x05c2e2e0, 0x05108805, 0x09e9d8ab, 0x0d2ba341 } },
+  { { 0x02138911, 0x0f0d3e4c, 0x0c1a371b, 0x062382ce, 
+      0x05b3a392, 0x09d954e7, 0x0517d2a1, 0x0047d71a, 
+      0x07f70073, 0x09cd1733, 0x0efc3aea, 0x0549d0d1, 
+      0x0df78457, 0x0666e074, 0x0a48e084, 0x0f67e924 } },
+  { { 0x0b3114fe, 0x073bec50, 0x0e8b6172, 0x01c5e7b6, 
+      0x0e896bcc, 0x0a1c3ae1, 0x0bcd8cab, 0x0bb3f870, 
+      0x07e9fa9d, 0x0eea8546, 0x0042e2cf, 0x056431f0, 
+      0x0469e8d2, 0x08eb9b9c, 0x0a9adf2c, 0x06856458 } },
+  { { 0x07b2cfdd, 0x01855530, 0x073bd43a, 0x01816246, 
+      0x08897062, 0x02f82d12, 0x03563816, 0x06517857, 
+      0x0394a8c7, 0x0529bf2e, 0x075a3141, 0x0660c4f2, 
+      0x018e5a16, 0x0787c8ad, 0x045b679e, 0x0abaec01 } },
+  { { 0x06d87d9e, 0x07c9fabb, 0x03b2a99d, 0x0673b28a, 
+      0x068816ee, 0x0efb205e, 0x0dd5e3d5, 0x03d21920, 
+      0x07544f4d, 0x085f40c2, 0x06fb538d, 0x057d045b, 
+      0x05470e4e, 0x028a93c3, 0x063adfd4, 0x0d1cf7a5 } },
+  { { 0x06699694, 0x0c83c837, 0x0386dade, 0x0621103f, 
+      0x0f247dc3, 0x06058f43, 0x0aec07c3, 0x0b1ac29a, 
+      0x0bde5d50, 0x06e35e33, 0x078fd31c, 0x0516263c, 
+      0x00a9d127, 0x04a13379, 0x078bec6e, 0x0f39316a } },
+  { { 0x0e26ea19, 0x05ecf40e, 0x03bdf1b5, 0x07c284a0, 
+      0x06f461fa, 0x08393462, 0x064a69aa, 0x07d4f6a5, 
+      0x06e88ea4, 0x023059e9, 0x0f92bd0b, 0x0c4a8035, 
+      0x0c5c44a2, 0x0fccec22, 0x07f57ea1, 0x0598207c } }
+};
+
+static const p448_t nGx1[16] = {
+  { { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x0528af6f, 0x078c6f13, 0x094b74d9, 0x00001fe2, 
+      0x001aab44, 0x0ae77425, 0x0ef0039c, 0x07cbe937, 
+      0x00fa2a67, 0x0af3e4f0, 0x0da1378e, 0x0e28175f, 
+      0x08ccd90e, 0x072adeed, 0x000af22f, 0x016a8ce1 } },
+  { { 0x0fa0459e, 0x0f31f53f, 0x0315cd6b, 0x0f8742a1, 
+      0x0ae64e97, 0x0abe2f50, 0x09b9da48, 0x0bd78741, 
+      0x051e526e, 0x04521a33, 0x0e10ba45, 0x0fa05935, 
+      0x0e8f903c, 0x05c947e1, 0x05a754ee, 0x00aa47d1 } },
+  { { 0x00d9a33b, 0x0284f76f, 0x0e4d41e7, 0x09461141, 
+      0x0cc79344, 0x015371b9, 0x03dd8bdd, 0x0173f667, 
+      0x053f866b, 0x0c0d0f83, 0x030b45ea, 0x08b7d59b, 
+      0x0044dc82, 0x02b4cdec, 0x094fa772, 0x0e245b21 } },
+  { { 0x04ddc8a8, 0x02fe182d, 0x0ac056bf, 0x088d6e79, 
+      0x00e41e4e, 0x0c3ff2d1, 0x02c3679f, 0x032ec7f9, 
+      0x04e61051, 0x03561f09, 0x06c6250a, 0x04553f5a, 
+      0x0dd25c5b, 0x02b765ef, 0x06a1cd7f, 0x0e3a40a2 } },
+  { { 0x05e1f4b2, 0x0e9485c4, 0x070a1e6b, 0x01d85e53, 
+      0x077730a7, 0x0db61fa9, 0x050d418e, 0x0201a6bd, 
+      0x02774433, 0x0e78a475, 0x0622ea3a, 0x016424e5, 
+      0x0d5b9631, 0x01c7734d, 0x0f5064f2, 0x0c7586d3 } },
+  { { 0x0af6151d, 0x0c3ed603, 0x0aa19b93, 0x05a5e4a6, 
+      0x0536ff03, 0x07e465ce, 0x0b0be710, 0x0bbb36bf, 
+      0x09249bff, 0x0d15454d, 0x03736654, 0x0ba934d9, 
+      0x0370dc86, 0x0675c04e, 0x0d86eb3b, 0x06cd21cb } },
+  { { 0x030c7ce7, 0x04217221, 0x0e9dba4d, 0x0ec314cd, 
+      0x05439062, 0x0d7196cd, 0x0dd96166, 0x0b8295cd, 
+      0x0c15796f, 0x0c767da7, 0x00ab2036, 0x059120e7, 
+      0x0b7d07ec, 0x0e1562a9, 0x0231cdd9, 0x07d5c89f } },
+  { { 0x01a82a12, 0x091a5884, 0x080f3a62, 0x0a754175, 
+      0x0f73417a, 0x0399009f, 0x00a8c5cd, 0x02db1fb9, 
+      0x0c046d51, 0x082c8912, 0x08f18274, 0x00a3f577, 
+      0x026ccae2, 0x02ad0ede, 0x08a4e9c2, 0x07d6bd8b } },
+  { { 0x0afd28b4, 0x02b7b7be, 0x0298d67e, 0x0e834401, 
+      0x04b11493, 0x0e070d60, 0x063ce6fb, 0x04b67725, 
+      0x0a0cfb04, 0x0d3a0f67, 0x0f08f1b2, 0x0debe82e, 
+      0x0b402b9e, 0x07114482, 0x0b307043, 0x0af532e6 } },
+  { { 0x049ab457, 0x0f6483c2, 0x0818ac81, 0x05aced0a, 
+      0x0a900e3a, 0x080916bc, 0x02948675, 0x0145adb9, 
+      0x0d8b7821, 0x04fe2b0e, 0x0b1a62cc, 0x0a9e1bce, 
+      0x096c2408, 0x048f1f80, 0x0ac552fe, 0x0d17e7a0 } },
+  { { 0x08ce3344, 0x0ea48915, 0x0434ae70, 0x0c6cf019, 
+      0x0c48f5d2, 0x089d3c0f, 0x0ca7aa7e, 0x0c550a00, 
+      0x017fb3ab, 0x09f8b49f, 0x024844a0, 0x0366a6d5, 
+      0x0ceb4a83, 0x0f1f5bf4, 0x03b782f0, 0x099fd2f7 } },
+  { { 0x052daf76, 0x038fbbd7, 0x0bced01d, 0x0ffb0a8b, 
+      0x07c6bd6c, 0x0dc3b0ff, 0x041d595c, 0x03814ee7, 
+      0x01941d44, 0x0e1f8343, 0x0f89b18d, 0x0c083601, 
+      0x0e52ec62, 0x0fc338ff, 0x0e971788, 0x04601008 } },
+  { { 0x0add862e, 0x0e8c3a8e, 0x033cea23, 0x06d00cf1, 
+      0x0cdc039a, 0x0d7bda40, 0x0e0a2ac3, 0x04750dcb, 
+      0x0bec4388, 0x0a1bb0bc, 0x0d20c0f9, 0x077a4a7b, 
+      0x0b9e1f0b, 0x02ff072d, 0x07bd3e06, 0x0bd796d7 } },
+  { { 0x08e321b4, 0x08757de1, 0x0151699c, 0x06ba6bd4, 
+      0x0a156df0, 0x02ec93a1, 0x0dad4f9e, 0x04e547c5, 
+      0x0ee9310d, 0x01dcc8bf, 0x0f7b5016, 0x0355f710, 
+      0x0ce8f36d, 0x0389d7a9, 0x02b8056d, 0x0ff83804 } },
+  { { 0x060f6dcf, 0x0dcaa234, 0x0285b23d, 0x0ec8d56f, 
+      0x083dac2b, 0x01042255, 0x08e1bed7, 0x0c3fe788, 
+      0x0832c0af, 0x07258b0e, 0x02b2affc, 0x0a901bdb, 
+      0x0038f36e, 0x01a28d5f, 0x0dbb618d, 0x080838af } }
+};
+
+static const p448_t nGy1[16] = {
+  { { 0x00000001, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+      0x00000000, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x0cbf63dd, 0x069fae17, 0x09e39e26, 0x06786172, 
+      0x0f827a18, 0x0e92b3d5, 0x08403682, 0x04d75e41, 
+      0x09056a79, 0x001a4fd9, 0x020008f5, 0x089efb2d, 
+      0x0b78ff15, 0x0a2f6918, 0x0a3437f5, 0x0f41c870 } },
+  { { 0x0d814825, 0x0b2849ef, 0x05c9968d, 0x09c2a5d2, 
+      0x004e634c, 0x024dbb26, 0x0db38194, 0x033f3a4c, 
+      0x0c8a2b6b, 0x0e04f609, 0x0abbbfdb, 0x0caefd8e, 
+      0x0404498b, 0x0683119a, 0x08b21cbd, 0x024ab7a9 } },
+  { { 0x0ede77b3, 0x0043b728, 0x0a043f1d, 0x003cf736, 
+      0x0ab4e700, 0x0d95a612, 0x0c8fe17c, 0x05ccaac2, 
+      0x0177bd28, 0x0dc3bd14, 0x05360c86, 0x0b3d5c96, 
+      0x04ec7e48, 0x01880c26, 0x04bb47c6, 0x0fd5dba8 } },
+  { { 0x05d821dd, 0x0b27309b, 0x0c2c17ca, 0x0950fb8d, 
+      0x08fb0d4c, 0x0feed015, 0x0f550179, 0x0762c479, 
+      0x0e095840, 0x0306cf44, 0x0d379e66, 0x084b413a, 
+      0x0bb2e4f1, 0x0d6e5d5a, 0x094b085d, 0x08bc12b7 } },
+  { { 0x0b8a16f6, 0x0b4dacd9, 0x003afc96, 0x0000b9b9, 
+      0x03f19cbf, 0x0ab930b8, 0x0b077171, 0x0541f92e, 
+      0x019baa42, 0x08758d9c, 0x0fea31a2, 0x0299b935, 
+      0x081d9e24, 0x03bc7232, 0x09d91676, 0x0fc081c2 } },
+  { { 0x02f05282, 0x04ca6fb6, 0x02e9801e, 0x051928b6, 
+      0x0b609dcb, 0x0c6f37b6, 0x06e32803, 0x06617fd7, 
+      0x0166f0bb, 0x07d1bffb, 0x0ac137d4, 0x0bfdebdd, 
+      0x0df8f3cb, 0x0d558ac9, 0x08fabbb4, 0x00217c7c } },
+  { { 0x0f5d72ad, 0x04c71050, 0x008880dd, 0x093209a0, 
+      0x07c3fef0, 0x0e1857c5, 0x022b21d2, 0x07584709, 
+      0x0e52fe8a, 0x039aeffa, 0x0a384e66, 0x0bd7c58b, 
+      0x0bfbbfe2, 0x022fc035, 0x0506e447, 0x0bc96411 } },
+  { { 0x04b3de44, 0x0aa0d797, 0x096ac9bb, 0x0f8658b9, 
+      0x05f6c334, 0x031e7be2, 0x04df12c9, 0x023836ce, 
+      0x059eb5c9, 0x0029027b, 0x05b8649d, 0x02f22531, 
+      0x0d907162, 0x0a0fdf03, 0x09e80226, 0x0101d9df } },
+  { { 0x05237b19, 0x00d0c997, 0x04a2bcdb, 0x0692bae3, 
+      0x0805b9e0, 0x0a0d3a98, 0x08c7dd07, 0x0a253f11, 
+      0x0e19738e, 0x0c0794d0, 0x019812a1, 0x041a8569, 
+      0x025d360c, 0x078e4ebd, 0x07ee8567, 0x0f02e9d6 } },
+  { { 0x00548584, 0x0bb1ee61, 0x0549030f, 0x0026e17a, 
+      0x0b4c52fb, 0x0a4e4e61, 0x0a1ca8f9, 0x0339754c, 
+      0x0ee8806f, 0x03d2a45e, 0x0e2028fa, 0x03c44782, 
+      0x0072e42b, 0x03328ae4, 0x0d21c91f, 0x07e98738 } },
+  { { 0x0b9618ad, 0x07f781fa, 0x09cf7662, 0x0855bfab, 
+      0x0c316a14, 0x0d98f9ff, 0x07b3046a, 0x0109f273, 
+      0x042cecfe, 0x0cc21cdc, 0x05be5a36, 0x05236b10, 
+      0x058a0700, 0x0ff2cf95, 0x005ad57d, 0x09cbf152 } },
+  { { 0x0ebe90d2, 0x049f0de4, 0x02243779, 0x0221424d, 
+      0x09051808, 0x0b52f44b, 0x0bb9c3fb, 0x0a5d64e3, 
+      0x07690354, 0x0d8bf65d, 0x0bc06e3f, 0x05d039f6, 
+      0x033a3443, 0x04e11c79, 0x04147a83, 0x06a7e42c } },
+  { { 0x082e4773, 0x00d276be, 0x0e1b9057, 0x0e9dd324, 
+      0x0369bc97, 0x0b3181ef, 0x002f04fa, 0x01d08726, 
+      0x07c2c5d3, 0x0bf49cbf, 0x09ecb59b, 0x098eae7e, 
+      0x02e09293, 0x052e08b6, 0x0c40f3e6, 0x04096c37 } },
+  { { 0x06074e1f, 0x07bc94ed, 0x0790175a, 0x040b2a81, 
+      0x0e307782, 0x0b7958e8, 0x089ff273, 0x07ed27c6, 
+      0x026db869, 0x0b6a32f8, 0x03d2e15c, 0x00446ef9, 
+      0x0777e1ac, 0x0492d2de, 0x01b69b63, 0x06b8dbab } },
+  { { 0x07e98bea, 0x0e7c9e7a, 0x02e17335, 0x09302c64, 
+      0x0acc1e93, 0x05dcdcd8, 0x04d90baa, 0x05982bae, 
+      0x0c686ed6, 0x07c08c6c, 0x0fce2c72, 0x04dd3cce, 
+      0x01dc8f12, 0x029ca465, 0x0161cbd7, 0x09324c0a } }
+};
+
+/*
+ * Compute [k]G and store its encoding into OUT.
+ *
+ * OUT must have room for at least 57 bytes: the y-coordinate is written
+ * by p448_serialize starting at out[0], and out[56] receives the low
+ * bit of the x-coordinate in its most significant bit.
+ *
+ * K is read as 16 limbs; only bits 0..27 of each limb are examined.
+ * Each of the 56 iterations performs one point doubling followed by two
+ * additions of precomputed table entries (nGx0/nGy0 and nGx1/nGy1),
+ * selected by 4-bit indices gathered from the limbs of K.
+ *
+ * The table is indexed by bits of the (secret) scalar; see the
+ * implementation note of this project about caches and branch
+ * prediction.
+ */
+static void
+compute_kG_448 (uint8_t *out, const uint32_t k[16])
+{
+  int i;
+  p448_t x0[1], y0[1], z0[1]; /* P0 */
+  p448_t tmp0[1], tmp1[1];
+
+  /* P0 <= O, represented as (X:Y:Z) = (0:1:1) */
+  memset (x0, 0, sizeof (p448_t));
+  memset (y0, 0, sizeof (p448_t));
+  memset (z0, 0, sizeof (p448_t));
+  y0->limb[0] = 1;
+  z0->limb[0] = 1;
+
+  for (i = 0; i < 56; i++)
+    {
+      p448_t b[1], c[1], d[1];
+      p448_t e[1], f[1], g[1], h[1];
+      int index0, index1;
+
+      /*
+       * Gather the table indices for this iteration.  The first 28
+       * iterations take bit I0 (27 down to 0) of the odd-numbered
+       * limbs, the last 28 iterations take bit I0 (27 down to 0) of the
+       * even-numbered limbs.  INDEX0 combines limbs n, n+4, n+8, n+12
+       * for n = 1 (or 0); INDEX1 does the same for n = 3 (or 2).
+       */
+      if (i < 28)
+	{
+	  int i0 = 28 - i - 1;
+
+	  index0 = ((k[1] >> i0) & 1) | (((k[5] >> i0) & 1)<<1)
+	    | (((k[ 9] >> i0) & 1)<<2) | (((k[13] >> i0) & 1)<<3);
+	  index1 = ((k[3] >> i0) & 1) | (((k[7] >> i0) & 1)<<1)
+	    | (((k[11] >> i0) & 1)<<2) | (((k[15] >> i0) & 1)<<3);
+	}
+      else
+	{
+	  int i0 = 56 - i - 1;
+
+	  index0 = ((k[0] >> i0) & 1) | (((k[4] >> i0) & 1)<<1)
+	    | (((k[ 8] >> i0) & 1)<<2) | (((k[12] >> i0) & 1)<<3);
+	  index1 = ((k[2] >> i0) & 1) | (((k[6] >> i0) & 1)<<1)
+	    | (((k[10] >> i0) & 1)<<2) | (((k[14] >> i0) & 1)<<3);
+	}
+
+      /* Point double P0' <= P0 + P0 */
+      p448_add (tmp0, x0, y0);
+      p448_sqr (b, tmp0);
+      p448_sqr (c, x0);
+      p448_sqr (d, y0);
+      p448_add (e, c, d);
+      p448_sqr (h, z0);
+      p448_add (tmp0, h, h);
+      p448_sub (tmp1, e, tmp0);
+      p448_sub (tmp0, b, e);
+      p448_mul (x0, tmp0, tmp1);
+      p448_sub (tmp0, c, d);
+      p448_mul (y0, e, tmp0);
+      p448_mul (z0, e, tmp1);
+      /*
+	Formulas implemented by the doubling above (tmp1 holds J):
+
+	B = (X1+Y1)^2
+	C = X1^2
+	D = Y1^2
+	E = C+D
+	H = Z1^2
+	J = E-2*H
+	X3 = (B-E)*J
+	Y3 = E*(C-D)
+	Z3 = E*J
+      */
+
+      /* Point addition P0' <= P0 + [v0(index0)]G */
+      p448_sqr (b, z0);
+      p448_mul (c, x0, &nGx0[index0]);
+      p448_mul (d, y0, &nGy0[index0]);
+      p448_mul (tmp0, c, d);
+      p448_mul_39081 (e, tmp0);
+      p448_add (f, b, e);
+      p448_sub (g, b, e);
+      p448_add (tmp0, x0, y0);
+      p448_add (tmp1, &nGx0[index0], &nGy0[index0]);
+      p448_mul (h, tmp0, tmp1);
+      p448_sub (tmp0, h, c);
+      p448_sub (tmp1, tmp0, d);
+      p448_mul (tmp0, f, tmp1);
+      p448_mul (x0, z0, tmp0);
+      p448_sub (tmp0, d, c);
+      p448_mul (tmp1, g, tmp0);
+      p448_mul (y0, z0, tmp1);
+      p448_mul (z0, f, g);
+      /*
+	Formulas implemented by the addition above.  The table entry is
+	affine (no Z2 is stored), so A is simply Z1:
+
+	A = Z1*Z2
+	B = A^2
+	C = X1*X2
+	D = Y1*Y2
+	E = d*C*D
+	F = B-E
+	G = B+E
+	H = (X1+Y1)*(X2+Y2)
+	X3 = A*F*(H-C-D)
+	Y3 = A*G*(D-C)
+	Z3 = F*G
+
+	Note on signs: the code computes e with p448_mul_39081 (which,
+	as its name suggests, multiplies by +39081), then f = b + e and
+	g = b - e.  That agrees with F = B-E and G = B+E above when the
+	curve constant d is -39081, i.e. e in the code is -E.
+      */
+      /* Point addition P0' <= P0 + [v1(index1)]G (same formulas) */
+      p448_sqr (b, z0);
+      p448_mul (c, x0, &nGx1[index1]);
+      p448_mul (d, y0, &nGy1[index1]);
+      p448_mul (tmp0, c, d);
+      p448_mul_39081 (e, tmp0);
+      p448_add (f, b, e);
+      p448_sub (g, b, e);
+      p448_add (tmp0, x0, y0);
+      p448_add (tmp1, &nGx1[index1], &nGy1[index1]);
+      p448_mul (h, tmp0, tmp1);
+      p448_sub (tmp0, h, c);
+      p448_sub (tmp1, tmp0, d);
+      p448_mul (tmp0, f, tmp1);
+      p448_mul (x0, z0, tmp0);
+      p448_sub (tmp0, d, c);
+      p448_mul (tmp1, g, tmp0);
+      p448_mul (y0, z0, tmp1);
+      p448_mul (z0, f, g);
+    }
+
+  /* Convert to affine coordinate.  */
+  p448_inv (tmp0, z0);
+  p448_mul (tmp1, x0, tmp0);
+  /* Serialize x first, only to extract its least significant bit.  */
+  p448_serialize (out, tmp1);
+  /* EdDSA encoding.  */
+  out[56] = (out[0] & 1) << 7;
+  /* Then overwrite the start of OUT with the serialized y.  */
+  p448_mul (tmp1, y0, tmp0);
+  p448_serialize (out, tmp1);
+}
+
+
+#define SEED_SIZE 57
+
+#define DOM448       (const uint8_t *)"SigEd448"
+#define DOM448_LEN   8
+
+/*
+ * Start a SHAKE256 computation on CTX and absorb the prefix shared by
+ * both hashes of ed448_sign: the DOM448 string followed by two zero
+ * octets.
+ */
+static void
+ed448_shake_begin (shake_context *ctx)
+{
+  const unsigned char x_olen[2] = { 0, 0 };
+
+  shake256_start (ctx);
+  shake256_update (ctx, DOM448, DOM448_LEN);
+  shake256_update (ctx, x_olen, 2);
+}
+
+/*
+ * Compute the Ed448 signature of INPUT (ILEN bytes) into OUT.
+ *
+ * OUT receives 114 bytes: the 57-byte encoding of R, the 56 bytes of
+ * the scalar S, and a final zero byte.
+ *
+ * A_IN is the secret scalar (sizeof (bn448) bytes are read, then the
+ * two lowest bits are cleared and the top bit of word 13 is set),
+ * SEED is the 57-byte value hashed together with the message to derive
+ * the nonce, and PK is the 57-byte public key.
+ *
+ * Always returns 0.
+ */
+int
+ed448_sign (uint8_t *out, const uint8_t *input, unsigned int ilen,
+	    const uint8_t *a_in, const uint8_t *seed, const uint8_t *pk)
+{
+  shake_context ctx;
+  uint32_t hash[BN912_WORDS];
+  bn448 secret[1], nonce[1], s[1];
+  p448_t nonce_limbs[1];
+  uint8_t r_enc[57];
+  uint32_t carry, borrow;
+  int add_back;
+
+  /* Secret scalar, with the low two bits cleared and the top bit set.  */
+  memcpy (secret, a_in, sizeof (bn448));
+  secret->word[0] &= ~3;
+  secret->word[13] |= 0x80000000;
+
+  /* nonce = H (prefix || seed || message) mod M  */
+  memset (hash, 0, sizeof (hash));
+  ed448_shake_begin (&ctx);
+  shake256_update (&ctx, seed, 57);
+  shake256_update (&ctx, input, ilen);
+  shake256_finish (&ctx, (uint8_t *)hash, 2*57);
+  mod_reduce_M (nonce, (const bn912 *)hash);
+
+  /* R = [nonce]G, encoded.  */
+  p448_deserialize (nonce_limbs, (uint8_t *)nonce);
+  compute_kG_448 (r_enc, (uint32_t *)nonce_limbs);
+
+  /* s = H (prefix || R || pk || message) mod M  */
+  ed448_shake_begin (&ctx);
+  shake256_update (&ctx, r_enc, 57);
+  shake256_update (&ctx, pk, 57);
+  shake256_update (&ctx, input, ilen);
+  shake256_finish (&ctx, (uint8_t *)hash, 2*57);
+  mod_reduce_M (s, (const bn912 *)hash);
+
+  /* s = s * secret mod M  */
+  memset (hash, 0, sizeof (hash));
+  bn448_mul ((bn896 *)hash, s, secret);
+  mod_reduce_M (s, (const bn912 *)hash);
+
+  /* s = s + nonce, brought back below M.  */
+  carry = bn448_add (s, s, nonce);
+  borrow = bn448_sub (s, s, M);
+  add_back = (borrow && !carry);
+  bn448_add_cond (s, M, add_back);
+
+  memcpy (out, r_enc, 57);
+  memcpy (out+57, s, 56);
+  out[114-1] = 0;
+
+  return 0;
+}
+
+
+/*
+ * Compute the Ed448 public key PK from the secret scalar A_IN.
+ *
+ * The scalar is deserialized into limbs, its two lowest bits are
+ * cleared and bit 27 of limb 15 is set before the multiplication by G.
+ */
+void
+ed448_compute_public (uint8_t *pk, const uint8_t *a_in)
+{
+  p448_t scalar[1];
+
+  p448_deserialize (scalar, a_in);
+  scalar->limb[0] &= ~3;
+  scalar->limb[15] |= 0x08000000;
+
+  compute_kG_448 (pk, (uint32_t *)scalar);
+}
--- a/ecc-mont.c
+++ b/ecc-mont.c
@ -0,0 +1,226 @@
+/*                                                    -*- coding: utf-8 -*-
+ * ecc-mont.c - Elliptic curve computation for
+ *              the Montgomery curve: y^2 = x^3 + 486662*x^2 + x.
+ *
+ * Copyright (C) 2014, 2015, 2017  Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "bn.h"
+#include "mod25638.h"
+#include "mod.h"
+
+/*
+ * References:
+ *
+ * [1] D. J. Bernstein. Curve25519: new Diffie-Hellman speed records.
+ *     Proceedings of PKC 2006, to appear.
+ *     http://cr.yp.to/papers.html#curve25519. Date: 2006.02.09.
+ *
+ * [2] D. J. Bernstein. Can we avoid tests for zero in fast
+ *     elliptic-curve arithmetic?
+ *     http://cr.yp.to/papers.html#curvezero. Date: 2006.07.26.
+ *
+ */
+
+/*
+ * IMPLEMENTATION NOTE
+ *
+ * (0) We assume that the processor has no cache, nor branch target
+ *     prediction.  Thus, we don't avoid indexing by secret value.
+ *     We don't avoid conditional jump if both cases have same timing,
+ *     either.
+ *
+ * (1) We use Radix-32 field arithmetic.  It's a representation like
+ *     2^256-38, but it's more redundant.  For example, "1" can be
+ *     represented in three ways in 256-bit: 1, 2^255-18, and
+ *     2^256-37.
+ *
+ * (2) We use Montgomery double-and-add.
+ *
+ */
+
+#ifndef BN256_C_IMPLEMENTATION
+#define ASM_IMPLEMENTATION 0
+#endif
+/*
+ * X = A * 121665, where the part of the product that overflows 256 bits
+ * is folded back into X multiplied by 38.
+ *
+ * 121665 = 0x1db41
+ *            1 1101 1011 0100 0001
+ *        = 2^16 + 2^15 + 2^14 + 2^12 + 2^11 + 2^9 + 2^8 + 2^6 + 2^0
+ */
+static void
+mod25638_mul_121665 (bn256 *x, const bn256 *a)
+{
+#if ASM_IMPLEMENTATION
+#include "muladd_256.h"
+  const uint32_t *s;
+  uint32_t *d;
+  uint32_t w;
+  uint32_t c;
+
+  s = a->word;
+  d = x->word;
+  memset (d, 0, sizeof (bn256));
+  w = 121665;
+  MULADD_256_ASM (s, d, w, c);
+#else
+  uint32_t c, c1;
+  bn256 m[1];
+
+  /*
+   * Shift-and-add over the set bits of 121665.  M holds A shifted left
+   * by 6, 8, 9, 11, 12, 14, 15 and 16 bits in turn, and is added into X
+   * at every step (X starts as A + (A << 6)).  C1 collects what
+   * bn256_shift returns (presumably the bits shifted out of M), and C
+   * accumulates C1 plus the carry of each addition.
+   */
+  c = c1 = bn256_shift (m, a, 6); c += bn256_add (x, a, m);
+  c1 <<= 2; c1 |= bn256_shift (m, m, 2); c = c + c1 + bn256_add (x, x, m);
+  c1 <<= 1; c1 |= bn256_shift (m, m, 1); c = c + c1 + bn256_add (x, x, m);
+  c1 <<= 2; c1 |= bn256_shift (m, m, 2); c = c + c1 + bn256_add (x, x, m);
+  c1 <<= 1; c1 |= bn256_shift (m, m, 1); c = c + c1 + bn256_add (x, x, m);
+  c1 <<= 2; c1 |= bn256_shift (m, m, 2); c = c + c1 + bn256_add (x, x, m);
+  c1 <<= 1; c1 |= bn256_shift (m, m, 1); c = c + c1 + bn256_add (x, x, m);
+  c1 <<= 1; c1 |= bn256_shift (m, m, 1); c = c + c1 + bn256_add (x, x, m);
+#endif
+  /* Fold the overflow C back in as C*38; a carry from that is folded again.  */
+  c = bn256_add_uint (x, x, c*38);
+  x->word[0] += c * 38;
+}
+
+
+/* A point kept as an (X:Z) pair; no y-coordinate is stored.  */
+typedef struct
+{
+  bn256 x[1];
+  bn256 z[1];
+} pt;
+
+
+/**
+ * @brief  Process Montgomery double-and-add
+ *
+ * With Q0, Q1, DIF (= Q0 - Q1), compute PRD = 2Q0, SUM = Q0 + Q1
+ * Q0 and Q1 are clobbered.
+ *
+ * @param PRD	  Output: 2*Q0
+ * @param SUM	  Output: Q0 + Q1
+ * @param Q0	  Input point to double (used as scratch space)
+ * @param Q1	  Input point to add (used as scratch space)
+ * @param DIF_X	  x-coordinate of Q0 - Q1
+ *
+ * The statements are laid out in two columns: the indented ones belong
+ * to the addition (SUM), the others to the doubling (PRD).  The order
+ * matters, because the fields of Q0, Q1, PRD and SUM are reused as
+ * temporaries.
+ */
+static void
+mont_d_and_a (pt *prd, pt *sum, pt *q0, pt *q1, const bn256 *dif_x)
+{
+                                        mod25638_add (sum->x, q1->x, q1->z);
+                                        mod25638_sub (q1->z, q1->x, q1->z);
+  mod25638_add (prd->x, q0->x, q0->z);
+  mod25638_sub (q0->z, q0->x, q0->z);
+                                        mod25638_mul (q1->x, q0->z, sum->x);
+                                        mod25638_mul (q1->z, prd->x, q1->z);
+  mod25638_sqr (q0->x, prd->x);
+  mod25638_sqr (q0->z, q0->z);
+                                        mod25638_add (sum->x, q1->x, q1->z);
+                                        mod25638_sub (q1->z, q1->x, q1->z);
+  mod25638_mul (prd->x, q0->x, q0->z);
+  /* From here q0->z holds the difference of the two squares.  */
+  mod25638_sub (q0->z, q0->x, q0->z);
+                                        mod25638_sqr (sum->x, sum->x);
+                                        mod25638_sqr (sum->z, q1->z);
+  mod25638_mul_121665 (prd->z, q0->z);
+                                        mod25638_mul (sum->z, sum->z, dif_x);
+  mod25638_add (prd->z, q0->x, prd->z);
+  mod25638_mul (prd->z, prd->z, q0->z);
+}
+
+
+/**
+ * @brief	RES  = x-coordinate of [n]Q
+ *
+ * @param RES	Output: x-coordinate of the result
+ * @param N	Scalar N (three least significant bits are 000)
+ * @param Q_X	x-coordinate of Q
+ *
+ * The scalar is scanned from the most significant bit of word 7 down
+ * to the least significant bit of word 0, two bits per inner loop
+ * iteration.  Two pairs of points are used alternately: the first
+ * step reads (P0, P1) and writes (P0_, P1_), the second step reads
+ * (P0_, P1_) and writes back to (P0, P1).
+ */
+static void
+compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
+{
+  int i, j;
+  pt p0[1], p1[1], p0_[1], p1_[1];
+
+  /* P0 = O = (1:0)  */
+  memset (p0->x, 0, sizeof (bn256));
+  p0->x->word[0] = 1;
+  memset (p0->z, 0, sizeof (bn256));
+
+  /* P1 = (X:1) */
+  memcpy (p1->x, q_x, sizeof (bn256));
+  memset (p1->z, 0, sizeof (bn256));
+  p1->z->word[0] = 1;
+
+  for (i = 0; i < 8; i++)
+    {
+      uint32_t u = n->word[7-i];
+
+      for (j = 0; j < 16; j++)
+	{
+	  pt *q0, *q1;
+	  pt *sum_n, *prd_n;
+
+	  /* First bit: the point selected by the bit is the one doubled.  */
+	  if ((u & 0x80000000))
+	    q0 = p1,  q1 = p0,  sum_n = p0_, prd_n = p1_;
+	  else
+	    q0 = p0,  q1 = p1,  sum_n = p1_, prd_n = p0_;
+	  mont_d_and_a (prd_n, sum_n, q0, q1, q_x);
+
+	  /* Second bit: same, reading P0_/P1_ and writing P0/P1.  */
+	  if ((u & 0x40000000))
+	    q0 = p1_, q1 = p0_, sum_n = p0,  prd_n = p1;
+	  else
+	    q0 = p0_, q1 = p1_, sum_n = p1,  prd_n = p0;
+	  mont_d_and_a (prd_n, sum_n, q0, q1, q_x);
+
+	  u <<= 2;
+	}
+    }
+
+  /* We know the LSB of N is always 0.  Thus, result is always in P0.  */
+  /*
+   * p0->z may be zero here, but our mod_inv doesn't raise error for 0,
+   * but returns 0 (like the implementation of z^(p-2)), thus, RES will
+   * be 0 in that case, which is correct value.
+   */
+  mod_inv (res, p0->z, p25519);
+  mod25638_mul (res, res, p0->x);
+  mod25519_reduce (res);
+}
+
+
+/**
+ * @brief	Compute the public key for a Curve25519 secret scalar
+ *
+ * @param KEY_DATA	Secret scalar, sizeof (bn256) bytes
+ * @param PUBKEY	Output buffer, sizeof (bn256) bytes; it may have
+ *			any alignment
+ *
+ * The scalar is used as given; no bits are cleared or set here.
+ */
+void
+ecdh_compute_public_25519 (const uint8_t *key_data, uint8_t *pubkey)
+{
+  bn256 gx[1];
+  bn256 k[1];
+  bn256 pk[1];
+
+  memset (gx, 0, sizeof (bn256));
+  gx[0].word[0] = 9;			/* Gx = 9 */
+  memcpy (k, key_data, sizeof (bn256));
+
+  /*
+   * Compute into a properly aligned local and copy the result out,
+   * like ecdh_decrypt_curve25519 does.  Writing 32-bit words through
+   * a cast of the byte pointer would be a misaligned access when
+   * PUBKEY is not word aligned.
+   */
+  compute_nQ (pk, k, gx);
+  memcpy (pubkey, pk, sizeof (bn256));
+}
+
+/*
+ * Compute the shared x-coordinate for Curve25519: OUTPUT receives the
+ * x-coordinate of [KEY_DATA]Q, where INPUT is the x-coordinate of Q.
+ * All three buffers are sizeof (bn256) bytes.  Always returns 0.
+ */
+int
+ecdh_decrypt_curve25519 (const uint8_t *input, uint8_t *output,
+			 const uint8_t *key_data)
+{
+  bn256 scalar[1], peer_x[1], result[1];
+
+  memcpy (peer_x, input, sizeof peer_x);
+  memcpy (scalar, key_data, sizeof scalar);
+
+  compute_nQ (result, scalar, peer_x);
+
+  memcpy (output, result, sizeof result);
+  return 0;
+}
--- a/ecc-x448.c
+++ b/ecc-x448.c
@ -0,0 +1,177 @@
+/*                                                    -*- coding: utf-8 -*-
+ * ecc-x448.c - Elliptic curve computation for
+ *              the Montgomery curve: y^2 = x^3 + 156326*x^2 + x
+ *
+ * Copyright (C) 2021  Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * IMPLEMENTATION NOTE
+ *
+ * (0) We assume that the processor has no cache, nor branch target
+ *     prediction.
+ *     We don't avoid conditional jump if both cases have same timing,
+ *     either.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "p448.h"
+
+#define N_LIMBS 14
+
+/**
+ * @brief  Process Montgomery double-and-add
+ *
+ * With Q0, Q1, DIF (= Q0 - Q1), compute PRD = 2Q0 into Q0,
+ * and compute SUM = Q0 + Q1 into Q1
+ *
+ * The intermediate values are given names by local macros which map
+ * them onto the two scratch registers and onto the in/out arguments.
+ * The order of the statements matters, because several names share the
+ * same storage.  The indented statements belong to the addition, the
+ * others to the doubling.
+ */
+static void
+mont_d_and_a (p448_t q0_x[1], p448_t q0_z[1], p448_t q1_x[1], p448_t q1_z[1],
+	      const p448_t dif_x[1])
+{
+  p448_t reg0[1], reg1[1];
+#define c  reg0
+#define d  reg1
+#define a  q1_x
+#define b  q1_z
+#define cb q0_x
+#define da reg0
+#define aa reg1
+#define bb q0_z
+#define da_plus_cb  q1_z
+#define da_minus_cb q1_x
+#define e      reg0
+#define dacb_2 q0_z
+#define a24_e  q1_x
+#define aa_    aa /* override is allowed by p448_add */
+
+					p448_add (c, q1_x, q1_z);
+					p448_sub (d, q1_x, q1_z);
+  p448_add (a, q0_x, q0_z);
+  p448_sub (b, q0_x, q0_z);
+					p448_mul (cb, c, b);
+					p448_mul (da, d, a);
+  p448_sqr (aa, a);
+  p448_sqr (bb, b);
+					p448_add (da_plus_cb, da, cb);
+					p448_sub (da_minus_cb, da, cb);
+  p448_mul (q0_x, aa, bb);
+  p448_sub (e, aa, bb);
+					p448_sqr (dacb_2, da_minus_cb);
+  p448_mul_39081 (a24_e, e);
+  p448_add (aa_, aa, a24_e);
+					p448_sqr (q1_x, da_plus_cb);
+					p448_mul (q1_z, dacb_2, dif_x);
+  p448_mul (q0_z, e, aa_);
+
+  /*
+   * Don't let the short names above (a, b, c, d, e, ...) leak into the
+   * rest of the file, where they would silently rewrite identifiers.
+   */
+#undef aa_
+#undef a24_e
+#undef dacb_2
+#undef e
+#undef da_minus_cb
+#undef da_plus_cb
+#undef bb
+#undef aa
+#undef da
+#undef cb
+#undef b
+#undef a
+#undef d
+#undef c
+}
+
+
+/* A point kept as an (X:Z) pair; no y-coordinate is stored.  */
+typedef struct
+{
+  p448_t x[1];
+  p448_t z[1];
+} pt;
+
+
+/**
+ * @brief	RES  = x-coordinate of [n]Q
+ *
+ * @param RES	Output: serialized x-coordinate of the result
+ * @param N	Scalar N (two least significant bits are 00)
+ * @param Q_X	x-coordinate of Q
+ *
+ * The scalar is scanned one bit at a time, from the most significant
+ * bit of limb N_LIMBS-1 down to the least significant bit of limb 0.
+ */
+static void
+compute_nQ (uint8_t *res, const uint32_t n[N_LIMBS], const p448_t q_x[1])
+{
+  int i, j;
+  pt p0[1], p1[1];
+  /* After the ladder, the Z fields are reused as temporaries.  */
+#define tmp0 p0->z
+#define tmp1 p1->z
+
+  /* P0 = O = (1:0)  */
+  memset (p0->x, 0, sizeof (p0->x));
+  p0->x->limb[0] = 1;
+  memset (p0->z, 0, sizeof (p0->z));
+
+  /* P1 = (X:1) */
+  memcpy (p1->x, q_x, N_REDUNDANT_LIMBS*4);
+  memset (p1->z, 0, sizeof (p1->z));
+  p1->z->limb[0] = 1;
+
+  for (i = 0; i < N_LIMBS; i++)
+    {
+      uint32_t u = n[N_LIMBS-i-1];
+
+      for (j = 0; j < 32; j++)
+	{
+	  p448_t *q0_x, *q0_z, *q1_x, *q1_z;
+
+	  /* The point selected by the bit is doubled, the other gets the sum.  */
+	  if ((u & 0x80000000))
+	    q0_x = p1->x, q0_z = p1->z,   q1_x = p0->x, q1_z = p0->z;
+	  else
+	    q0_x = p0->x, q0_z = p0->z,   q1_x = p1->x, q1_z = p1->z;
+	  mont_d_and_a (q0_x, q0_z, q1_x, q1_z, q_x);
+
+	  u <<= 1;
+	}
+    }
+
+  /* We know the LSB of N is always 0.  Thus, result is always in P0.  */
+  /*
+   * p0->z may be zero here, but our inverse function doesn't raise
+   * error for 0, but returns 0, thus, RES will be 0 in that case,
+   * which is correct value.
+   */
+  p448_inv (tmp1, p0->z);
+  p448_mul (tmp0, tmp1, p0->x);
+  p448_serialize (res, tmp0);
+
+  /* Keep the scratch names local to this function.  */
+#undef tmp1
+#undef tmp0
+}
+
+
+/*
+ * Compute the X448 public key PUBKEY for the secret KEY_DATA
+ * (N_LIMBS*4 bytes).  The two lowest bits of the scalar are cleared
+ * and its highest bit is set before multiplying the point with
+ * x-coordinate 5.
+ */
+void
+ecdh_compute_public_x448 (uint8_t *pubkey, const uint8_t *key_data)
+{
+  const p448_t base_x[1] = { { { 5, 0, }, } };
+  uint32_t scalar[N_LIMBS];
+
+  memcpy (scalar, key_data, sizeof scalar);
+  scalar[N_LIMBS-1] |= 0x80000000;
+  scalar[0] &= ~3;
+
+  compute_nQ (pubkey, scalar, base_x);
+}
+
+/*
+ * Compute the X448 shared value: OUTPUT receives the serialized
+ * x-coordinate of [k]Q, where INPUT is the serialized x-coordinate of
+ * Q and k is KEY_DATA (N_LIMBS*4 bytes) with its two lowest bits
+ * cleared and its highest bit set.  Always returns 0.
+ */
+int
+ecdh_decrypt_x448 (uint8_t *output, const uint8_t *input,
+		   const uint8_t *key_data)
+{
+  uint32_t scalar[N_LIMBS];
+  p448_t peer_x[1];
+
+  memcpy (scalar, key_data, sizeof scalar);
+  scalar[N_LIMBS-1] |= 0x80000000;
+  scalar[0] &= ~3;
+
+  p448_deserialize (peer_x, input);
+  compute_nQ (output, scalar, peer_x);
+  return 0;
+}
--- a/ecc.c
+++ b/ecc.c
@ -0,0 +1,398 @@
+/*                                                    -*- coding: utf-8 -*-
+ * ecc.c - Elliptic curve over GF(prime)
+ *
+ * Copyright (C) 2011, 2013, 2014, 2015
+ *               Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * References:
+ *
+ * [1] Suite B Implementer's Guide to FIPS 186-3 (ECDSA), February 3, 2010.
+ *
+ * [2] Michael Brown, Darrel Hankerson, Julio López, and Alfred Menezes,
+ *     Software Implementation of the NIST Elliptic Curves Over Prime Fields,
+ *     Proceedings of the 2001 Conference on Topics in Cryptology: The
+ *     Cryptographer's Track at RSA
+ *     Pages 250-265, Springer-Verlag London, UK, 2001
+ *     ISBN:3-540-41898-9
+ *
+ * [3] Mustapha Hedabou, Pierre Pinel, Lucien Bénéteau,
+ *     A comb method to render ECC resistant against Side Channel Attacks,
+ *     2004
+ */
+
+#include "field-group-select.h"
+
+/*
+ * Coefficients
+ */
+/*
+ * static const bn256 *coefficient_a;
+ * static const bn256 *coefficient_b;
+ */
+/*
+ * N: order of G
+ */
+/*
+ * static const bn256 N[1];
+ */
+/*
+ * MU = 2^512 / N
+ * MU = ( (1 << 256) | MU_lower )
+ */
+/*
+ * static const bn256 MU_lower[1];
+ */
+
+/*
+ * w = 4
+ * m = 256
+ * d = 64
+ * e = 32
+ */
+
+/*
+ * static const ac precomputed_KG[15];
+ * static const ac precomputed_2E_KG[15];
+ */
+
+#if TEST
+/*
+ * Generator of Elliptic curve over GF(p256)
+ */
+const ac *G = &precomputed_KG[0];
+#endif
+
+
+/*
+ * Return the 4-bit value whose bits 0..3 are bit I of four words of K.
+ * For I in 0..31 the words are 0, 2, 4 and 6; for I in 32..63 the
+ * words are 1, 3, 5 and 7 and bit I-32 is taken.
+ */
+static int
+get_vk (const bn256 *K, int i)
+{
+  const int odd = (i < 32) ? 0 : 1;
+  const int bit = odd ? i - 32 : i;
+  uint32_t vk = 0;
+  int n;
+
+  /* Collect from word 6 (or 7) down to word 0 (or 1), msb first.  */
+  for (n = 3; n >= 0; n--)
+    vk = (vk << 1) | ((K->word[2*n + odd] >> bit) & 1);
+
+  return vk;
+}
+
+
+/**
+ * @brief	X  = k * G
+ *
+ * @param X	Output: result in affine coordinate
+ * @param K	scalar k
+ *
+ * Return -1 on error.
+ * Return 0 on success.
+ * (The value returned is the one of the final conversion to affine
+ * coordinate.)
+ *
+ * The computation always runs on an odd scalar K' (K itself when K is
+ * odd, K - 1 when K is even) and G is added once more at the end; the
+ * result of that last addition is only kept when K is even.
+ */
+int
+FUNC(compute_kG) (ac *X, const bn256 *K)
+{
+  uint8_t index[64]; /* Lower 4-bit for index absolute value, msb is
+			for sign (encoded as: 0 means 1, 1 means -1).  */
+  bn256 K_dash[1];
+  jpc Q[1], tmp[1], *dst;
+  int i;
+  int vk;
+  uint32_t k_is_even = bn256_is_even (K);
+
+  bn256_sub_uint (K_dash, K, k_is_even);
+  /* It keeps the condition: 1 <= K' <= N - 2, and K' is odd.  */
+
+  /* Fill index.  */
+  /*
+   * When the next column is zero, the current entry is flagged with
+   * the sign bit and the current value is carried over to the next
+   * position instead of the zero.
+   */
+  vk = get_vk (K_dash, 0);
+  for (i = 1; i < 64; i++)
+    {
+      int vk_next, is_zero;
+
+      vk_next = get_vk (K_dash, i);
+      is_zero = (vk_next == 0);
+      index[i-1] = (vk - 1) | (is_zero << 7);
+      vk = (is_zero ? vk : vk_next);
+    }
+  index[63] = vk - 1;
+
+  memset (Q->z, 0, sizeof (bn256)); /* infinity */
+  /* One doubling and two signed table additions per step.  */
+  for (i = 31; i >= 0; i--)
+    {
+      FUNC(jpc_double) (Q, Q);
+      FUNC(jpc_add_ac_signed) (Q, Q, &precomputed_2E_KG[index[i+32]&0x0f],
+			       index[i+32] >> 7);
+      FUNC(jpc_add_ac_signed) (Q, Q, &precomputed_KG[index[i]&0x0f],
+			       index[i] >> 7);
+    }
+
+  /*
+   * The addition of G is always executed; its result goes to Q when K
+   * is even (K' = K - 1 was used), and to the scratch TMP otherwise.
+   */
+  dst = k_is_even ? Q : tmp;
+  FUNC(jpc_add_ac) (dst, Q, &precomputed_KG[0]);
+
+  return FUNC(jpc_to_ac) (X, Q);
+}
+
+
+
+/**
+ * Check that the affine point P satisfies the curve equation
+ * y^2 = x^3 + a*x + b  (the a*x term is left out when
+ * COEFFICIENT_A_IS_ZERO is defined).
+ *
+ * Return -1 when P is not on the curve.
+ * Return 0 when P is on the curve.
+ */
+static int
+point_is_on_the_curve (const ac *P)
+{
+  bn256 rhs[1], tmp[1];
+
+  /* rhs = x^3 */
+  MFNC(sqr) (rhs, P->x);
+  MFNC(mul) (rhs, rhs, P->x);
+
+#ifndef COEFFICIENT_A_IS_ZERO
+  /* rhs += a*x */
+  MFNC(mul) (tmp, coefficient_a, P->x);
+  MFNC(add) (rhs, rhs, tmp);
+#endif
+  /* rhs += b */
+  MFNC(add) (rhs, rhs, coefficient_b);
+
+  /* tmp = y^2 */
+  MFNC(sqr) (tmp, P->y);
+
+  return (bn256_cmp (rhs, tmp) == 0) ? 0 : -1;
+}
+
+
+/*
+ * Return the I-th 3-bit digit of K.
+ *
+ * Every group of 32 digits covers three words of K (96 bits).  Digits
+ * 10 and 21 of a group straddle a word boundary, so their upper bits
+ * are taken from the following word, unless the digit already lives in
+ * the last word (index 7).
+ */
+static int
+get_vk_kP (const bn256 *K, int i)
+{
+  const uint8_t group = i/32;
+  const uint8_t slot = i%32;
+  const uint8_t bit = 3*(slot % 11) + (slot >= 11) + (slot >= 22);
+  const uint8_t wi = (group * 3) + (slot / 11);
+  uint32_t digit = ((K->word[wi] >> bit) & 7);
+
+  if (wi >= 7)
+    return digit;
+
+  if (slot == 10)
+    /* Two bits came from this word; the third is bit 0 of the next.  */
+    digit |= ((K->word[wi + 1] << 2) & 4);
+  else if (slot == 21)
+    /* One bit came from this word; two more from the next.  */
+    digit |= ((K->word[wi + 1] << 1) & 6);
+
+  return digit;
+}
+
+/**
+ * @brief	X  = k * P
+ *
+ * @param X	Output: result in affine coordinate
+ * @param K	scalar k
+ * @param P	P in affine coordinate
+ *
+ * Return -1 on error.
+ * Return 0 on success.
+ *
+ * For the curve (cofactor is 1 and n is prime), possible error cases are:
+ *
+ *     P is not on the curve.
+ *     P = G, k = n
+ *     Something wrong in the code.
+ *
+ * Mathematically, k=1 and P=O is another possible case, but O cannot be
+ * represented by affine coordinate.
+ *
+ * The scalar is processed as 86 digits of 3 bits each, using the
+ * points P, 3P, 5P and 7P computed on entry.  Like in compute_kG, the
+ * computation runs on an odd K' and P is added once more at the end;
+ * that last result is only kept when K is even.
+ */
+int
+FUNC(compute_kP) (ac *X, const bn256 *K, const ac *P)
+{
+  uint8_t index[86]; /* Lower 2-bit for index absolute value, msb is
+			for sign (encoded as: 0 means 1, 1 means -1).  */
+  bn256 K_dash[1];
+  uint32_t k_is_even = bn256_is_even (K);
+  jpc Q[1], tmp[1], *dst;
+  int i;
+  int vk;
+  ac P3[1], P5[1], P7[1];
+  const ac *p_Pi[4];
+
+  if (point_is_on_the_curve (P) < 0)
+    return -1;
+
+  if (bn256_sub (K_dash, K, N) == 0)	/* >= N, it's too big.  */
+    return -1;
+
+  bn256_sub_uint (K_dash, K, k_is_even);
+  /* It keeps the condition: 1 <= K' <= N - 2, and K' is odd.  */
+
+  p_Pi[0] = P;
+  p_Pi[1] = P3;
+  p_Pi[2] = P5;
+  p_Pi[3] = P7;
+
+  {
+    jpc Q1[1];
+
+    /* Q = P, with Z = 1 */
+    memcpy (Q->x, P->x, sizeof (bn256));
+    memcpy (Q->y, P->y, sizeof (bn256));
+    memset (Q->z, 0, sizeof (bn256));
+    Q->z->word[0] = 1;
+
+    /* P3 = 2P + P */
+    FUNC(jpc_double) (Q, Q);
+    FUNC(jpc_add_ac) (Q1, Q, P);
+    if (FUNC(jpc_to_ac) (P3, Q1) < 0) /* Never occurs, except coding errors.  */
+      return -1;
+    /* P5 = 4P + P */
+    FUNC(jpc_double) (Q, Q);
+    FUNC(jpc_add_ac) (Q1, Q, P);
+    if (FUNC(jpc_to_ac) (P5, Q1) < 0) /* Never occurs, except coding errors.  */
+      return -1;
+
+    /* P7 = 2*P3 + P */
+    memcpy (Q->x, P3->x, sizeof (bn256));
+    memcpy (Q->y, P3->y, sizeof (bn256));
+    memset (Q->z, 0, sizeof (bn256));
+    Q->z->word[0] = 1;
+    FUNC(jpc_double) (Q, Q);
+    FUNC(jpc_add_ac) (Q1, Q, P);
+    if (FUNC(jpc_to_ac) (P7, Q1) < 0) /* Never occurs, except coding errors.  */
+      return -1;
+  }
+
+  /* Fill index.  */
+  /*
+   * When the next digit is even, the current entry is stored with the
+   * sign bit set (using 7 - vk), and the next digit is incremented
+   * by one, which makes it odd.
+   */
+  vk = get_vk_kP (K_dash, 0);
+  for (i = 1; i < 86; i++)
+    {
+      int vk_next, is_even;
+
+      vk_next = get_vk_kP (K_dash, i);
+      is_even = ((vk_next & 1) == 0);
+      index[i-1] = (is_even << 7) | ((is_even?7-vk:vk-1) >> 1);
+      vk = vk_next + is_even;
+    }
+  index[85] = ((vk - 1) >> 1);
+
+  memset (Q->z, 0, sizeof (bn256)); /* infinity */
+  /* Three doublings and one signed addition per 3-bit digit.  */
+  for (i = 85; i >= 0; i--)
+    {
+      FUNC(jpc_double) (Q, Q);
+      FUNC(jpc_double) (Q, Q);
+      FUNC(jpc_double) (Q, Q);
+      FUNC(jpc_add_ac_signed) (Q, Q, p_Pi[index[i]&0x03], index[i] >> 7);
+    }
+
+  /*
+   * The addition of P is always executed; its result goes to Q when K
+   * is even (K' = K - 1 was used), and to the scratch TMP otherwise.
+   */
+  dst = k_is_even ? Q : tmp;
+  FUNC(jpc_add_ac) (dst, Q, P);
+
+  return FUNC(jpc_to_ac) (X, Q);
+}
+
+
+/**
+ * @brief Compute signature (r,s) of hash string z with secret key d
+ *
+ * @param R	Output: r of the signature
+ * @param S	Output: s of the signature
+ * @param Z	Hash value to sign
+ * @param D	Secret key
+ *
+ * A random k is drawn until 1 <= k <= N - 1 and r = (x-coordinate of
+ * kG) mod N is not zero; the whole process is repeated when s turns
+ * out to be zero.
+ */
+void
+FUNC(ecdsa) (bn256 *r, bn256 *s, const bn256 *z, const bn256 *d)
+{
+  bn256 k[1];
+  ac KG[1];
+  bn512 tmp[1];
+  bn256 k_inv[1];
+  uint32_t carry;
+#define borrow carry
+#define tmp_k k_inv
+
+  do
+    {
+      /*
+       * This loop is written with for (;;) and an explicit break on
+       * purpose.  In a do-while loop, "continue" jumps to the loop
+       * condition; a rejected k would then leave the loop whenever
+       * the (stale or uninitialized) content of R is not zero.
+       */
+      for (;;)
+	{
+	  bn256_random (k);
+	  if (bn256_add_uint (k, k, 1))
+	    continue;
+	  if (bn256_sub (tmp_k, k, N) == 0)	/* >= N, it's too big.  */
+	    continue;
+	  /* 1 <= k <= N - 1 */
+	  FUNC(compute_kG) (KG, k);
+	  /* r = KG->x mod N; both branches do a copy of the same size.  */
+	  borrow = bn256_sub (r, KG->x, N);
+	  if (borrow)
+	    memcpy (r, KG->x, sizeof (bn256));
+	  else
+	    memcpy (KG->x, r, sizeof (bn256));
+
+	  if (!bn256_is_zero (r))
+	    break;
+	}
+
+      /* s = k^-1 * (z + r*d) mod N  */
+      mod_inv (k_inv, k, N);
+      bn256_mul (tmp, r, d);
+      mod_reduce (s, tmp, N, MU_lower);
+      carry = bn256_add (s, s, z);
+      if (carry)
+	bn256_sub (s, s, N);
+      else
+	bn256_sub ((bn256 *)tmp, s, N);	/* Dummy subtraction, result unused.  */
+      bn256_mul (tmp, s, k_inv);
+      mod_reduce (s, tmp, N, MU_lower);
+    }
+  while (bn256_is_zero (s));
+
+#undef tmp_k
+#undef borrow
+}
+
+
+/**
+ * @brief Check if a secret d0 is valid or not
+ *
+ * @param D0	scalar D0: secret
+ * @param D1	scalar D1: secret candidate N-D0 (output)
+ *
+ * Return 0 on error.
+ * Return -1 when D1 should be used as the secret
+ * Return 1 when D0 should be used as the secret
+ *
+ * The value returned for a valid secret is the result of comparing the
+ * y-coordinates of D1*G and D0*G.
+ */
+int
+FUNC(check_secret) (const bn256 *d0, bn256 *d1)
+{
+  ac Q0[1], Q1[1];
+
+  /*
+   * N - D0 borrows only when D0 > N.  D0 == N gives D1 == 0 without a
+   * borrow, so that case is rejected by testing D1 explicitly.
+   */
+  if (bn256_is_zero (d0) || bn256_sub (d1, N, d0) != 0
+      || bn256_is_zero (d1))
+    /* == 0 or >= N, it's not valid.  */
+    return 0;
+
+  FUNC(compute_kG) (Q0, d0);
+  FUNC(compute_kG) (Q1, d1);
+
+  /*
+   * Jivsov compliant key check
+   */
+  return bn256_cmp (Q1[0].y, Q0[0].y);
+}
--- a/field-group-select.h
+++ b/field-group-select.h
@ -0,0 +1,7 @@
+/*
+ * Token pasting helpers used to build per-field identifiers from the
+ * macro FIELD, which has to be defined before these macros are used.
+ *
+ * CONCAT1 and CONCAT3 add a level of expansion in front of CONCAT0 and
+ * CONCAT2, so that an argument such as FIELD is replaced by its value
+ * before the tokens are pasted.
+ */
+#define CONCAT0(a,b) a##b
+#define CONCAT1(a,b) CONCAT0(a,b)
+#define CONCAT2(a,b,c) CONCAT1(a,b##c)
+#define CONCAT3(a,b,c) CONCAT2(a,b,c)
+
+/* FUNC(name) gives name_<FIELD>, for example FUNC(compute_kG).  */
+#define FUNC(func) CONCAT1(func##_,FIELD)
+/* MFNC(name) gives mod<FIELD>_name, for example MFNC(sqr).  */
+#define MFNC(func) CONCAT3(mod,FIELD,_##func)
--- a/flash.c
+++ b/flash.c
@ -0,0 +1,738 @@
+/*
+ * flash.c -- Data Objects (DO) and GPG Key handling on Flash ROM
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018
+ *               Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * We assume single DO size is less than 256.
+ *
+ * NOTE: "Card holder certificate" (whose size is larger than 256) is
+ *       not put into the data pool, but is implemented by its own flash
+ *       page(s).
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "config.h"
+
+#include "sys.h"
+#include "gnuk.h"
+
+#include "pico/stdlib.h"
+#include "hardware/flash.h"
+#include "tusb.h"
+
+/*
+ * Flash memory map
+ *
+ * _text
+ *         .text
+ *         .ctors
+ *         .dtors
+ * _etext
+ *         .data
+ * _bss_start
+ *         .bss
+ * _end
+ *         <alignment to page>
+ * ch_certificate_startp
+ *         <2048 bytes>
+ * _keystore_pool
+ *         Three flash pages for keystore
+ *         a page contains a key data of:
+ *              For RSA-2048: 512-byte (p, q and N)
+ *              For RSA-4096: 1024-byte (p, q and N)
+ *              For ECDSA/ECDH and EdDSA, there are padding after public key
+ * _data_pool
+ *	   <two pages>
+ */
+
+/* Bytes taken by the generation halfword at the start of a data pool page.  */
+#define FLASH_DATA_POOL_HEADER_SIZE	2
+/* Upper bound (in bytes from the data storage start) used by
+   flash_do_release to decide whether an address lies in flash data.  */
+#define FLASH_DATA_POOL_SIZE		(2048*1024)
+
+/* Set to FLASH_SECTOR_SIZE by flash_do_storage_init.  */
+static uint16_t flash_page_size;
+/* Start of the data pool page currently in use.  */
+static const uint8_t *data_pool;
+/* Next free position in the data pool; see flash_set_data_pool_last.  */
+static uint8_t *last_p;
+
+/* The first halfword is generation for the data page (little endian) */
+const uint8_t flash_data[4] __attribute__ ((section (".gnuk_data"))) = {
+  0x00, 0x00, 0xff, 0xff
+};
+
+/*
+ * NOTE(review): 4096 * 1024 is 4 MiB, which is the middle of an 8 MiB
+ * part but the very end of a 4 MiB part -- confirm against the flash
+ * size of the target board.
+ */
+#define FLASH_TARGET_OFFSET (4096 * 1024) // DATA starts at the mid of flash
+
+
+/* Key pages start at FLASH_TARGET_OFFSET inside the XIP window.  */
+const uint8_t *flash_addr_key_storage_start = (const uint8_t *) (XIP_BASE + FLASH_TARGET_OFFSET);
+/* Data pool pages start 2048 * 1024 bytes after the key storage start.  */
+const uint8_t *flash_addr_data_storage_start = (const uint8_t *) (XIP_BASE + FLASH_TARGET_OFFSET + 2048 * 1024); // 2 MB 
+/* The certificate area is the single sector just below the key storage.  */
+const uint8_t *ch_certificate_start = (const uint8_t *) (XIP_BASE + FLASH_TARGET_OFFSET - FLASH_SECTOR_SIZE);
+#define FLASH_ADDR_KEY_STORAGE_START  flash_addr_key_storage_start
+#define FLASH_ADDR_DATA_STORAGE_START flash_addr_data_storage_start
+
+/* Low-level flash primitives, defined outside this file.  The callers
+   here treat a non-zero result from flash_program_halfword as failure.  */
+extern int flash_erase_page (uintptr_t addr);
+extern int flash_program_halfword (uintptr_t addr, uint16_t data);
+extern int flash_check_blank (const uint8_t *p_start, size_t size);
+extern int flash_write (uintptr_t dst_addr, const uint8_t *src, size_t len);
+
+/*
+ * Tell whether the KEY_SIZE bytes at K hold key material.
+ *
+ * Returns 0 when every byte is 0x00 (slot was released) or every byte
+ * is 0xff (slot is unused); returns 1 otherwise.
+ */
+static int key_available_at (const uint8_t *k, int key_size)
+{
+  int pos;
+
+  /* Released slot: nothing but zero bytes.  */
+  pos = 0;
+  while (pos < key_size && k[pos] == 0x00)
+    pos++;
+  if (pos == key_size)
+    return 0;
+
+  /* Unused slot: nothing but 0xff bytes.  */
+  pos = 0;
+  while (pos < key_size && k[pos] == 0xff)
+    pos++;
+  if (pos == key_size)
+    return 0;
+
+  return 1;
+}
+
+/*
+ * Select the active data pool page and report the DO area inside it.
+ *
+ * Sets flash_page_size to FLASH_SECTOR_SIZE, reads the 16-bit generation
+ * at the head of each of the two data pool pages and points data_pool at
+ * the page to use.  On return *P_DO_START is just past the header of
+ * that page and *P_DO_END is the end of that page.  When both headers
+ * read 0xffff, both outputs are set to NULL instead.
+ */
+void
+flash_do_storage_init (const uint8_t **p_do_start, const uint8_t **p_do_end)
+{
+  const uint8_t *page0, *page1;
+  uint16_t gen0, gen1;
+  int use_page1;
+
+  flash_page_size = FLASH_SECTOR_SIZE;
+
+  page0 = FLASH_ADDR_DATA_STORAGE_START;
+  page1 = FLASH_ADDR_DATA_STORAGE_START + flash_page_size;
+  data_pool = page0;
+
+  gen0 = *(uint16_t *)page0;
+  gen1 = *(uint16_t *)page1;
+
+  if (gen0 == 0xffff && gen1 == 0xffff)
+    {
+      /* It's terminated.  */
+      *p_do_start = NULL;
+      *p_do_end = NULL;
+      return;
+    }
+
+  if (gen0 == 0xffff)
+    /* First page is erased: the second one holds the data.  */
+    use_page1 = 1;
+  else if (gen1 == 0xffff)
+    /* Second page is erased: the first one holds the data.  */
+    use_page1 = 0;
+  else
+    /* Both headers are valid: take the newer generation, where
+       0 following 0xfffe counts as newer.  */
+    use_page1 = (gen0 == 0xfffe && gen1 == 0) || gen1 > gen0;
+
+  data_pool = use_page1 ? page1 : page0;
+
+  *p_do_start = data_pool + FLASH_DATA_POOL_HEADER_SIZE;
+  *p_do_end = data_pool + flash_page_size;
+}
+
+static uint8_t *flash_key_getpage (enum kind_of_key kk);
+
+/*
+ * Erase the three key pages and both data pool pages, then reset
+ * data_pool and last_p to the first data pool page.  With
+ * CERTDO_SUPPORT, the certificate area is erased as well.
+ */
+void
+flash_terminate (void)
+{
+  const uintptr_t pool = (uintptr_t)FLASH_ADDR_DATA_STORAGE_START;
+  int kk;
+
+  for (kk = 0; kk < 3; kk++)
+    flash_erase_page ((uintptr_t)flash_key_getpage (kk));
+
+  flash_erase_page (pool);
+  flash_erase_page (pool + flash_page_size);
+
+  data_pool = FLASH_ADDR_DATA_STORAGE_START;
+  last_p = (uint8_t *)FLASH_ADDR_DATA_STORAGE_START + FLASH_DATA_POOL_HEADER_SIZE;
+
+#if defined(CERTDO_SUPPORT)
+  flash_erase_page ((uintptr_t)ch_certificate_start);
+  /* A second page is only erased when the certificate area exceeds one.  */
+  if (FLASH_CH_CERTIFICATE_SIZE > flash_page_size)
+    flash_erase_page ((uintptr_t)(ch_certificate_start + flash_page_size));
+#endif
+}
+
+/* Program generation 0 into the header halfword of the first data pool page.  */
+void
+flash_activate (void)
+{
+  const uintptr_t header = (uintptr_t)FLASH_ADDR_DATA_STORAGE_START;
+
+  flash_program_halfword (header, 0);
+}
+
+
+/*
+ * Locate the stored key of each of the three key kinds.
+ *
+ * Each kind has its own page in key storage.  The page is walked in
+ * steps of the kind's storage size; for the first slot that holds key
+ * material, kd[kind].pubkey is set to the slot address plus the private
+ * key length.  When no such slot exists, kd[kind].pubkey stays NULL.
+ */
+void
+flash_key_storage_init (void)
+{
+  int kk;
+
+  for (kk = 0; kk < 3; kk++)
+    {
+      const uint8_t *page = FLASH_ADDR_KEY_STORAGE_START + kk * flash_page_size;
+      const uint8_t *page_end = page + flash_page_size;
+      const uint8_t *slot = page;
+      int slot_size = gpg_get_algo_attr_key_size (kk, GPG_KEY_STORAGE);
+
+      kd[kk].pubkey = NULL;
+      while (slot < page_end)
+        {
+          if (key_available_at (slot, slot_size))
+            {
+              int prv_len = gpg_get_algo_attr_key_size (kk, GPG_KEY_PRIVATE);
+
+              kd[kk].pubkey = slot + prv_len;
+              break;
+            }
+          slot += slot_size;
+        }
+    }
+}
+
+/*
+ * Flash data pool management
+ *
+ * Flash data pool consists of two parts:
+ *   2-byte header
+ *   contents
+ *
+ * Flash data pool objects:
+ *   Data Object (DO) (of smart card)
+ *   Internal objects:
+ *     NONE (0x0000)
+ *     123-counter
+ *     14-bit counter
+ *     bool object
+ *     small enum
+ *
+ * Format of a Data Object:
+ *    NR:   8-bit tag_number
+ *    LEN:  8-bit length
+ *    DATA: data * LEN
+ *    PAD:  optional byte for 16-bit alignment
+ */
+
+/* Record P as the next free position of the data pool (kept in last_p).  */
+void
+flash_set_data_pool_last (const uint8_t *p)
+{
+  uint8_t *next_free = (uint8_t *)p;	/* last_p is a non-const pointer */
+
+  last_p = next_free;
+}
+
+/*
+ * We use two pages
+ *
+ * Copying garbage collection: switch data_pool to the page that is not
+ * currently in use, have gpg_data_copy write the data there, stamp the
+ * new page with the next generation number and erase the old page.
+ *
+ * Always returns 0.  NOTE(review): the caller also tests for a negative
+ * result, which cannot happen with this implementation.
+ */
+static int
+flash_copying_gc (void)
+{
+  uint8_t *src, *dst;
+  uint16_t generation;
+
+  /* src is the page in use, dst is the other one of the two.  */
+  if (data_pool == FLASH_ADDR_DATA_STORAGE_START)
+    {
+      src = (uint8_t *)FLASH_ADDR_DATA_STORAGE_START;
+      dst = (uint8_t *)FLASH_ADDR_DATA_STORAGE_START + flash_page_size;
+    }
+  else
+    {
+      src = (uint8_t *)FLASH_ADDR_DATA_STORAGE_START + flash_page_size;
+      dst = (uint8_t *)FLASH_ADDR_DATA_STORAGE_START;
+    }
+
+  generation = *(uint16_t *)src;
+  data_pool = dst;
+  /* NOTE(review): presumably gpg_data_copy also updates last_p through
+     flash_set_data_pool_last -- confirm in its definition.  */
+  gpg_data_copy (data_pool + FLASH_DATA_POOL_HEADER_SIZE);
+  /* 0xffff is what flash_do_storage_init treats as "no header", so the
+     counter wraps from 0xfffe to 0.  */
+  if (generation == 0xfffe)
+    generation = 0;
+  else
+    generation++;
+  /* The header is written after the copy; the old page is erased last.  */
+  flash_program_halfword ((uintptr_t)dst, generation);
+  flash_erase_page ((uintptr_t)src);
+  return 0;
+}
+
+/* Non-zero when SIZE more bytes from last_p would run past the active page.  */
+static int
+is_data_pool_full (size_t size)
+{
+  const uint8_t *pool_end = data_pool + flash_page_size;
+  const uint8_t *wanted_end = last_p + size;
+
+  return wanted_end > pool_end;
+}
+
+/*
+ * Reserve SIZE bytes (rounded up to an even number) at last_p.
+ *
+ * When the active page has no room, a copying GC is attempted first.
+ * Returns the start of the reserved area, or NULL when there is still
+ * no room after the GC.  Returning NULL matters: the failure used to be
+ * logged only, and the function then handed out memory beyond the end
+ * of the page.
+ */
+static uint8_t *
+flash_data_pool_allocate (size_t size)
+{
+  uint8_t *p;
+
+  size = (size + 1) & ~1;	/* allocation unit is 1-halfword (2-byte) */
+
+  if (is_data_pool_full (size))
+    if (flash_copying_gc () < 0 || /*still*/ is_data_pool_full (size))
+      {
+	TU_LOG1 ("!!!! FATAL: %d\r\n",FATAL_FLASH);
+	return NULL;
+      }
+
+  p = last_p;
+  last_p += size;
+  return p;
+}
+
+/*
+ * Program a data object at P: one header halfword with NR in the low
+ * byte and LEN in the high byte, followed by LEN bytes of DATA packed
+ * two per halfword (first byte in the low half).  For an odd LEN the
+ * last byte is written with 0xff as the high byte.  Programming
+ * failures are reported through flash_warning and do not stop the write.
+ */
+void
+flash_do_write_internal (const uint8_t *p, int nr, const uint8_t *data, int len)
+{
+  uintptr_t dst = (uintptr_t)p;
+  uint16_t word;
+  int n;
+
+  word = nr | (len << 8);
+  if (flash_program_halfword (dst, word) != 0)
+    flash_warning ("DO WRITE ERROR");
+  dst += 2;
+
+  n = 0;
+  while (n < len/2)
+    {
+      word = data[n*2] | (data[n*2+1]<<8);
+      if (flash_program_halfword (dst, word) != 0)
+        flash_warning ("DO WRITE ERROR");
+      dst += 2;
+      n++;
+    }
+
+  if ((len & 1) != 0)
+    {
+      word = data[n*2] | 0xff00;
+      if (flash_program_halfword (dst, word) != 0)
+        flash_warning ("DO WRITE ERROR");
+    }
+}
+
+/*
+ * Allocate room in the data pool for a data object (two header bytes
+ * plus LEN bytes of content) and program it.
+ *
+ * Returns the address of the object's length byte (one past the tag
+ * byte), or NULL when the allocation returned NULL.
+ */
+const uint8_t *
+flash_do_write (uint8_t nr, const uint8_t *data, int len)
+{
+  const uint8_t *slot;
+
+  DEBUG_INFO ("flash DO\r\n");
+
+  slot = flash_data_pool_allocate (2 + len);
+  if (slot != NULL)
+    {
+      flash_do_write_internal (slot, nr, data, len);
+      DEBUG_INFO ("flash DO...done\r\n");
+      return slot + 1;
+    }
+
+  DEBUG_INFO ("flash data pool allocation failure.\r\n");
+  return NULL;
+}
+
+/*
+ * Report a flash problem through DEBUG_INFO as "FLASH: <msg>\r\n".
+ * DEBUG_INFO expands to nothing unless DEBUG is defined, in which case
+ * this function does nothing.
+ */
+void
+flash_warning (const char *msg)
+{
+  (void)msg;	/* keeps MSG referenced when DEBUG_INFO expands to nothing */
+  DEBUG_INFO ("FLASH: ");
+  DEBUG_INFO (msg);
+  DEBUG_INFO ("\r\n");
+}
+
+/*
+ * Invalidate a data object by programming it to zero.
+ *
+ * DO_DATA points at the object's length byte; the tag byte sits one
+ * byte before it.  The content halfwords are zeroed first, then the pad
+ * halfword of an odd-length object, and the tag/length halfword last.
+ * Failures are reported through flash_warning.
+ */
+void
+flash_do_release (const uint8_t *do_data)
+{
+  const uintptr_t tag_addr = (uintptr_t)do_data - 1;
+  uintptr_t cursor = tag_addr + 2;
+  const int len = do_data[0];
+  int remaining;
+
+  /* Do not zero-fill data that lives in code (such as
+     ds_count_initial_value): only addresses inside the flash data
+     range are touched.  */
+  if (do_data < FLASH_ADDR_DATA_STORAGE_START
+      || do_data > FLASH_ADDR_DATA_STORAGE_START + FLASH_DATA_POOL_SIZE)
+    return;
+
+  /* Content, one halfword at a time.  */
+  for (remaining = len/2; remaining > 0; remaining--)
+    {
+      if (flash_program_halfword (cursor, 0) != 0)
+        flash_warning ("fill-zero failure");
+      cursor += 2;
+    }
+
+  /* Last byte and pad of an odd-length object.  */
+  if ((len & 1) != 0 && flash_program_halfword (cursor, 0) != 0)
+    flash_warning ("fill-zero pad failure");
+
+  /* Finally the "tag_number and length" halfword becomes 0x0000.  */
+  if (flash_program_halfword (tag_addr, 0) != 0)
+    flash_warning ("fill-zero tag_nr failure");
+}
+
+
+/* Address of the key storage page of KK: one page per kind, in enum order.  */
+static uint8_t *
+flash_key_getpage (enum kind_of_key kk)
+{
+  uint8_t *base = (uint8_t *)FLASH_ADDR_KEY_STORAGE_START;
+
+  return base + (flash_page_size * kk);
+}
+
+/*
+ * Find an unused slot for a key of kind KK.
+ *
+ * The page of KK is walked in steps of the kind's storage size; the
+ * first slot whose 32-bit words all read 0xffffffff is returned.
+ * Returns NULL when the page has no such slot.
+ */
+uint8_t *
+flash_key_alloc (enum kind_of_key kk)
+{
+  uint8_t *page = flash_key_getpage (kk);
+  uint8_t *slot;
+  int slot_size = gpg_get_algo_attr_key_size (kk, GPG_KEY_STORAGE);
+  int words = slot_size/4;
+
+  for (slot = page; slot < page + flash_page_size; slot += slot_size)
+    {
+      const uint32_t *w = (const uint32_t *)slot;
+      int n = 0;
+
+      while (n < words && w[n] == 0xffffffff)
+        n++;
+
+      if (n == words)	/* Every word is blank: the slot is free.  */
+        return slot;
+    }
+
+  /* Should not happen as we have enough free space all time, but just
+     in case.  */
+  return NULL;
+}
+
+/*
+ * Program KEY_DATA followed directly by PUBKEY at KEY_ADDR, two bytes
+ * per halfword with the first byte in the low half.  A trailing odd
+ * byte of either buffer is not written.
+ *
+ * Returns 0 on success, -1 as soon as one halfword fails to program.
+ */
+int
+flash_key_write (uint8_t *key_addr,
+		 const uint8_t *key_data, int key_data_len,
+		 const uint8_t *pubkey, int pubkey_len)
+{
+  uintptr_t dst = (uintptr_t)key_addr;
+  const uint8_t *src;
+  int left;
+
+  src = key_data;
+  for (left = key_data_len/2; left > 0; left--)
+    {
+      if (flash_program_halfword (dst, src[0] | (src[1]<<8)) != 0)
+        return -1;
+      src += 2;
+      dst += 2;
+    }
+
+  src = pubkey;
+  for (left = pubkey_len/2; left > 0; left--)
+    {
+      if (flash_program_halfword (dst, src[0] | (src[1]<<8)) != 0)
+        return -1;
+      src += 2;
+      dst += 2;
+    }
+
+  return 0;
+}
+
+/*
+ * Scan the page that contains KEY_ADDR word by word, skipping the
+ * KEY_SIZE bytes of the key itself.  Returns 1 when every other word of
+ * the page is zero, 0 as soon as a non-zero word is found.
+ */
+static int
+flash_check_all_other_keys_released (const uint8_t *key_addr, int key_size)
+{
+  const uintptr_t page = (uintptr_t)key_addr & ~(flash_page_size - 1);
+  const uint32_t *word = (const uint32_t *)page;
+  const uint32_t *const page_end = (const uint32_t *)(page + flash_page_size);
+  const uint32_t *const own = (const uint32_t *)key_addr;
+
+  while (word < page_end)
+    {
+      if (word == own)
+        {
+          /* Jump over the key that is being released.  */
+          word += key_size/4;
+          continue;
+        }
+
+      if (*word != 0)
+        return 0;
+
+      word++;
+    }
+
+  return 1;
+}
+
+/* Program KEY_SIZE/2 halfwords of zero starting at KEY_ADDR.  The
+   result of each programming call is not checked.  */
+static void
+flash_key_fill_zero_as_released (uint8_t *key_addr, int key_size)
+{
+  uintptr_t cursor = (uintptr_t)key_addr;
+  int left;
+
+  for (left = key_size/2; left > 0; left--)
+    {
+      flash_program_halfword (cursor, 0);
+      cursor += 2;
+    }
+}
+
+/*
+ * Release the key stored at KEY_ADDR.  When the rest of its page holds
+ * only zero words, the whole page is erased; otherwise only the key
+ * itself is programmed to zero.
+ */
+void
+flash_key_release (uint8_t *key_addr, int key_size)
+{
+  if (!flash_check_all_other_keys_released (key_addr, key_size))
+    {
+      flash_key_fill_zero_as_released (key_addr, key_size);
+      return;
+    }
+
+  flash_erase_page (((uintptr_t)key_addr & ~(flash_page_size - 1)));
+}
+
+/* Erase the whole key storage page of kind KK.  */
+void
+flash_key_release_page (enum kind_of_key kk)
+{
+  uint8_t *page = flash_key_getpage (kk);
+
+  flash_erase_page ((uintptr_t)page);
+}
+
+
+/* Program the halfword at ADDR to 0x0000; the result is not checked.  */
+void
+flash_clear_halfword (uintptr_t addr)
+{
+  flash_program_halfword (addr, 0);
+}
+
+
+/* Program the halfword HW at address P; the result is not checked.  */
+void
+flash_put_data_internal (const uint8_t *p, uint16_t hw)
+{
+  const uintptr_t dst = (uintptr_t)p;
+
+  flash_program_halfword (dst, hw);
+}
+
+/*
+ * Append the halfword HW to the data pool.
+ *
+ * When the allocation returns NULL nothing is written: the failure is
+ * only reported through DEBUG_INFO.  (Previously the NULL case fell
+ * through and programmed flash at address 0.)
+ */
+void
+flash_put_data (uint16_t hw)
+{
+  uint8_t *p;
+
+  p = flash_data_pool_allocate (2);
+  if (p == NULL)
+    {
+      DEBUG_INFO ("data allocation failure.\r\n");
+      return;
+    }
+
+  flash_program_halfword ((uintptr_t)p, hw);
+}
+
+
+/*
+ * Clear a boolean object: program its halfword to zero and set
+ * *ADDR_P to NULL.  Does nothing when *ADDR_P is already NULL.
+ */
+void
+flash_bool_clear (const uint8_t **addr_p)
+{
+  const uint8_t *p = *addr_p;
+
+  if (p != NULL)
+    {
+      flash_program_halfword ((uintptr_t)p, 0);
+      *addr_p = NULL;
+    }
+}
+
+/* Program a boolean object at P: a halfword whose value is NR.  */
+void
+flash_bool_write_internal (const uint8_t *p, int nr)
+{
+  const uint16_t hw = nr;
+
+  flash_program_halfword ((uintptr_t)p, hw);
+}
+
+/*
+ * Append a boolean object with tag NR to the data pool.
+ * Returns the address of the new object, or NULL when the allocation
+ * returned NULL.
+ */
+const uint8_t *
+flash_bool_write (uint8_t nr)
+{
+  uint8_t *slot = flash_data_pool_allocate (2);
+
+  if (slot == NULL)
+    {
+      DEBUG_INFO ("bool allocation failure.\r\n");
+      return NULL;
+    }
+
+  /* The tag goes to the low byte; the high byte is zero.  */
+  flash_program_halfword ((uintptr_t)slot, (uint16_t)nr);
+  return slot;
+}
+
+
+/* Clear a small-enum object; handled exactly like a boolean object
+   (halfword programmed to zero, *ADDR_P set to NULL).  */
+void
+flash_enum_clear (const uint8_t **addr_p)
+{
+  flash_bool_clear (addr_p);
+}
+
+/* Program a small-enum object at P: tag NR in the low byte, value V in
+   the high byte.  */
+void
+flash_enum_write_internal (const uint8_t *p, int nr, uint8_t v)
+{
+  const uintptr_t dst = (uintptr_t)p;
+
+  flash_program_halfword (dst, (uint16_t)(nr | (v << 8)));
+}
+
+/*
+ * Append a small-enum object (tag NR in the low byte, value V in the
+ * high byte) to the data pool.  Returns the address of the new object,
+ * or NULL when the allocation returned NULL.
+ */
+const uint8_t *
+flash_enum_write (uint8_t nr, uint8_t v)
+{
+  const uint16_t encoded = nr | (v << 8);
+  uint8_t *slot = flash_data_pool_allocate (2);
+
+  if (slot == NULL)
+    {
+      DEBUG_INFO ("enum allocation failure.\r\n");
+      return NULL;
+    }
+
+  flash_program_halfword ((uintptr_t)slot, encoded);
+  return slot;
+}
+
+
+/*
+ * Decode a 123-counter.  P points at the counter's value halfword, of
+ * which only the first byte is inspected.
+ *
+ * After erase, a halfword in flash memory becomes 0xffff.  It can then
+ * be programmed to any value once, and later be programmed to zero.
+ * That gives three distinguishable states:
+ *
+ *   NULL pointer        -> 0 (no record)
+ *   first byte is 0xff  -> 1
+ *   any other value     -> 2
+ *   first byte is 0x00  -> 3
+ */
+int
+flash_cnt123_get_value (const uint8_t *p)
+{
+  if (p == NULL)
+    return 0;
+
+  switch (*p)
+    {
+    case 0xff:
+      return 1;
+    case 0x00:
+      return 3;
+    default:
+      return 2;
+    }
+}
+
+/*
+ * Program a 123-counter record at P: a header halfword with
+ * NR_COUNTER_123 in the low byte and WHICH in the high byte, followed
+ * by the value halfword.  For V == 1 the value halfword is not
+ * programmed, for V == 2 it becomes 0xc3c3, and for any other V it
+ * becomes 0.
+ */
+void
+flash_cnt123_write_internal (const uint8_t *p, int which, int v)
+{
+  const uintptr_t head = (uintptr_t)p;
+
+  flash_program_halfword (head, (uint16_t)(NR_COUNTER_123 | (which << 8)));
+
+  switch (v)
+    {
+    case 1:
+      break;
+    case 2:
+      flash_program_halfword (head + 2, 0xc3c3);
+      break;
+    default:			/* v == 3 */
+      flash_program_halfword (head + 2, 0);
+      break;
+    }
+}
+
+/*
+ * Advance the 123-counter WHICH by one step.
+ *
+ * When *ADDR_P is NULL, a 4-byte record is allocated, its header
+ * halfword is programmed, and *ADDR_P is set to the value halfword,
+ * which is not programmed (counter value 1).  Otherwise the first byte
+ * of the value halfword decides: 0xff -> program 0xc3c3 (value 2),
+ * 0x00 -> nothing to do (already 3), anything else -> program 0
+ * (value 3).
+ */
+void
+flash_cnt123_increment (uint8_t which, const uint8_t **addr_p)
+{
+  const uint8_t *p = *addr_p;
+
+  if (p == NULL)
+    {
+      p = flash_data_pool_allocate (4);
+      if (p == NULL)
+        {
+          DEBUG_INFO ("cnt123 allocation failure.\r\n");
+          return;
+        }
+
+      flash_program_halfword ((uintptr_t)p,
+                              (uint16_t)(NR_COUNTER_123 | (which << 8)));
+      *addr_p = p + 2;
+      return;
+    }
+
+  switch (*p)
+    {
+    case 0x00:
+      return;
+    case 0xff:
+      flash_program_halfword ((uintptr_t)p, 0xc3c3);
+      break;
+    default:
+      flash_program_halfword ((uintptr_t)p, 0);
+      break;
+    }
+}
+
+/*
+ * Remove a 123-counter record: program its value halfword (at *ADDR_P)
+ * and then the header halfword two bytes before it to zero, and set
+ * *ADDR_P to NULL.  Does nothing when *ADDR_P is already NULL.
+ */
+void
+flash_cnt123_clear (const uint8_t **addr_p)
+{
+  const uint8_t *value = *addr_p;
+
+  if (value == NULL)
+    return;
+
+  flash_program_halfword ((uintptr_t)value, 0);
+  flash_program_halfword ((uintptr_t)(value - 2), 0);
+  *addr_p = NULL;
+}
+
+
+#if defined(CERTDO_SUPPORT)
+/*
+ * Erase the binary file FILE_ID.  Only FILEID_CH_CERTIFICATE is
+ * supported: its page(s) are erased when flash_check_blank returns 0
+ * for the certificate area.
+ *
+ * Returns 0 for the certificate file, -1 for any other FILE_ID.
+ */
+int
+flash_erase_binary (uint8_t file_id)
+{
+  const uint8_t *cert;
+
+  if (file_id != FILEID_CH_CERTIFICATE)
+    return -1;
+
+  cert = ch_certificate_start;
+  if (flash_check_blank (cert, FLASH_CH_CERTIFICATE_SIZE) == 0)
+    {
+      flash_erase_page ((uintptr_t)cert);
+      /* A second page is only erased when the area exceeds one page.  */
+      if (FLASH_CH_CERTIFICATE_SIZE > flash_page_size)
+        flash_erase_page ((uintptr_t)cert + flash_page_size);
+    }
+
+  return 0;
+}
+#endif
+
+
+/*
+ * Program LEN bytes of DATA at OFFSET inside the binary file FILE_ID.
+ *
+ * Supported files are the serial number (6 bytes, located at
+ * openpgpcard_aid[8]) and, with CERTDO_SUPPORT, the card holder
+ * certificate.  OFFSET and LEN must both be even and the write must
+ * fit in the file.
+ *
+ * Returns 0 when the write was carried out, and -1 for an unknown
+ * file, bad OFFSET/LEN, or when flash_check_blank returns 0 for the
+ * target range.  A halfword that fails to program is reported through
+ * flash_warning but does not change the return value.
+ */
+int
+flash_write_binary (uint8_t file_id, const uint8_t *data,
+		    uint16_t len, uint16_t offset)
+{
+  const uint8_t *base;
+  uint16_t limit;
+  uintptr_t dst;
+  int n;
+
+  switch (file_id)
+    {
+    case FILEID_SERIAL_NO:
+      limit = 6;
+      base = &openpgpcard_aid[8];
+      break;
+#if defined(CERTDO_SUPPORT)
+    case FILEID_CH_CERTIFICATE:
+      limit = FLASH_CH_CERTIFICATE_SIZE;
+      base = ch_certificate_start;
+      break;
+#endif
+    default:
+      return -1;
+    }
+
+  if (offset + len > limit || (offset&1) || (len&1))
+    return -1;
+
+  if (flash_check_blank (base + offset, len)  == 0)
+    return -1;
+
+  /* LEN is even here, so stepping by two covers exactly len/2 halfwords.  */
+  dst = (uintptr_t)base + offset;
+  for (n = 0; n < len; n += 2)
+    {
+      if (flash_program_halfword (dst, data[n] | (data[n+1]<<8)) != 0)
+        flash_warning ("DO WRITE ERROR");
+      dst += 2;
+    }
+
+  return 0;
+}
--- a/gnuk.h
+++ b/gnuk.h
@ -0,0 +1,501 @@
+/*
+ * Application layer <-> CCID layer data structure
+ *
+ * A single instance, "apdu", is declared below; the res_APDU and
+ * res_APDU_size macros refer to its response fields.
+ */
+struct apdu {
+  uint8_t seq;			/* NOTE(review): presumably the CCID sequence number -- confirm */
+
+  /* command APDU */
+  uint8_t *cmd_apdu_head;	/* CLS INS P1 P2 [ internal Lc ] */
+  uint8_t *cmd_apdu_data;
+  uint16_t cmd_apdu_data_len;	/* Nc, calculated by Lc field */
+  uint16_t expected_res_size;	/* Ne, calculated by Le field */
+
+  /* response APDU */
+  uint16_t sw;			/* NOTE(review): presumably the status word SW1/SW2 -- confirm */
+  uint16_t res_apdu_data_len;	/* accessed as res_APDU_size */
+  uint8_t *res_apdu_data;	/* accessed as res_APDU */
+};
+
+extern struct apdu apdu;
+
+#define CARD_CHANGE_INSERT 0
+#define CARD_CHANGE_REMOVE 1
+#define CARD_CHANGE_TOGGLE 2
+void ccid_card_change_signal (int how);
+
+/* CCID thread */
+#define EV_CARD_CHANGE        1
+#define EV_TX_FINISHED        2 /* CCID Tx finished  */
+#define EV_EXEC_ACK_REQUIRED  4 /* OpenPGPcard Execution ACK required */
+#define EV_EXEC_FINISHED      8 /* OpenPGPcard Execution finished */
+#define EV_RX_DATA_READY     16 /* USB Rx data available  */
+
+/* OpenPGPcard thread */
+#define EV_MODIFY_CMD_AVAILABLE   1
+#define EV_VERIFY_CMD_AVAILABLE   2
+#define EV_CMD_AVAILABLE          4
+#define EV_EXIT                   8
+#define EV_PINPAD_INPUT_DONE     16
+
+/* Maximum cmd apdu data is key import 24+4+256+256 (proc_key_import) */
+#define MAX_CMD_APDU_DATA_SIZE (24+4+256+256) /* without header */
+/* Maximum res apdu data is public key 5+9+512 (gpg_do_public_key) */
+#define MAX_RES_APDU_DATA_SIZE (5+9+512) /* without trailer */
+
+#define CCID_MSG_HEADER_SIZE	10
+
+#define res_APDU apdu.res_apdu_data
+#define res_APDU_size apdu.res_apdu_data_len
+
+/* USB buffer size of LL (Low-level): size of single Bulk transaction */
+#define USB_LL_BUF_SIZE 64
+
+/* State of the CCID layer, as returned by ccid_get_ccid_state.  */
+enum ccid_state {
+  CCID_STATE_NOCARD,		/* No card available */
+  CCID_STATE_START,		/* Initial */
+  CCID_STATE_WAIT,		/* Waiting APDU */
+
+  CCID_STATE_EXECUTE,		/* Executing command */
+  CCID_STATE_ACK_REQUIRED_0,	/* Ack required (executing)*/
+  CCID_STATE_ACK_REQUIRED_1,	/* Waiting user's ACK (execution finished) */
+
+  CCID_STATE_EXITED,		/* CCID Thread Terminated */
+  CCID_STATE_EXEC_REQUESTED,	/* Exec requested */
+};
+
+
+enum ccid_state ccid_get_ccid_state (void);
+
+extern volatile uint8_t auth_status;
+#define AC_NONE_AUTHORIZED	0x00
+#define AC_PSO_CDS_AUTHORIZED	0x01  /* PW1 with 0x81 verified */
+#define AC_OTHER_AUTHORIZED	0x02  /* PW1 with 0x82 verified */
+#define AC_ADMIN_AUTHORIZED	0x04  /* PW3 verified */
+#define AC_NEVER		0x80
+#define AC_ALWAYS		0xFF
+
+#define PW_ERR_PW1 0
+#define PW_ERR_RC  1
+#define PW_ERR_PW3 2
+int gpg_pw_get_retry_counter (int who);
+int gpg_pw_locked (uint8_t which);
+void gpg_pw_reset_err_counter (uint8_t which);
+void gpg_pw_increment_err_counter (uint8_t which);
+
+int ac_check_status (uint8_t ac_flag);
+int verify_pso_cds (const uint8_t *pw, int pw_len);
+int verify_other (const uint8_t *pw, int pw_len);
+int verify_user_0 (uint8_t access, const uint8_t *pw, int buf_len,
+		   int pw_len_known, const uint8_t *ks_pw1, int saveks);
+int verify_admin (const uint8_t *pw, int pw_len);
+int verify_admin_0 (const uint8_t *pw, int buf_len, int pw_len_known,
+		    const uint8_t *ks_pw3, int saveks);
+
+void ac_reset_pso_cds (void);
+void ac_reset_other (void);
+void ac_reset_admin (void);
+void ac_fini (void);
+
+
+void set_res_sw (uint8_t sw1, uint8_t sw2);
+extern uint8_t file_selection;
+extern const uint8_t historical_bytes[];
+extern uint16_t data_objects_number_of_bytes;
+
+#define CHALLENGE_LEN	32
+
+void gpg_data_scan (const uint8_t *start, const uint8_t *end);
+void gpg_data_copy (const uint8_t *p);
+void gpg_do_terminate (void);
+void gpg_do_get_data (uint16_t tag, int with_tag);
+void gpg_do_put_data (uint16_t tag, const uint8_t *data, int len);
+void gpg_do_public_key (uint8_t kk_byte);
+void gpg_do_keygen (uint8_t *buf);
+
+const uint8_t *gpg_get_firmware_update_key (uint8_t keyno);
+
+/* Constants: algo+size */
+#define ALGO_RSA4K      0
+/* #define ALGO_NISTP256R1 1 */
+#define ALGO_SECP256K1  2
+#define ALGO_ED25519    3
+#define ALGO_CURVE25519 4
+#define ALGO_X448       5
+#define ALGO_ED448      6
+#define ALGO_RSA2K      255
+
+/*
+ * The three private key kinds.  The numeric values matter: flash.c
+ * uses them as index into kd[] and as page index in key storage.
+ */
+enum kind_of_key {
+  GPG_KEY_FOR_SIGNING = 0,
+  GPG_KEY_FOR_DECRYPTION = 1,
+  GPG_KEY_FOR_AUTHENTICATION = 2,
+};
+
+/* Selects which size gpg_get_algo_attr_key_size reports for a key kind.  */
+enum size_of_key {
+  GPG_KEY_STORAGE = 0,		/* PUBKEY + PRVKEY rounded to 2^N */
+  GPG_KEY_PUBLIC,		/* size of the public key */
+  GPG_KEY_PRIVATE,		/* size of the private key */
+};
+
+int gpg_get_algo_attr (enum kind_of_key kk);
+int gpg_get_algo_attr_key_size (enum kind_of_key kk, enum size_of_key s);
+
+void flash_do_storage_init (const uint8_t **, const uint8_t **);
+void flash_terminate (void);
+void flash_activate (void);
+void flash_key_storage_init (void);
+void flash_do_release (const uint8_t *);
+const uint8_t *flash_do_write (uint8_t nr, const uint8_t *data, int len);
+uint8_t *flash_key_alloc (enum kind_of_key);
+void flash_key_release (uint8_t *, int);
+void flash_key_release_page (enum kind_of_key);
+int flash_key_write (uint8_t *key_addr,
+		     const uint8_t *key_data, int key_data_len,
+		     const uint8_t *pubkey, int pubkey_len);
+void flash_set_data_pool_last (const uint8_t *p);
+void flash_clear_halfword (uintptr_t addr);
+void flash_increment_counter (uint8_t counter_tag_nr);
+void flash_reset_counter (uint8_t counter_tag_nr);
+
+#define FILEID_SERIAL_NO	0
+#define FILEID_UPDATE_KEY_0	1
+#define FILEID_UPDATE_KEY_1	2
+#define FILEID_UPDATE_KEY_2	3
+#define FILEID_UPDATE_KEY_3	4
+#define FILEID_CH_CERTIFICATE	5
+int flash_erase_binary (uint8_t file_id);
+int flash_write_binary (uint8_t file_id, const uint8_t *data,
+			uint16_t len, uint16_t offset);
+
+#define FLASH_CH_CERTIFICATE_SIZE 2048
+
+extern const uint8_t *ch_certificate_start;
+
+#define FIRMWARE_UPDATE_KEY_CONTENT_LEN 256	/* RSA-2048 (p and q) */
+
+#define INITIAL_VECTOR_SIZE 16
+#define DATA_ENCRYPTION_KEY_SIZE 16
+
+#define MAX_PRVKEY_LEN 512	/* Maximum is the case for RSA 4096-bit.  */
+
+/* Per-kind key state; one entry per kind in kd[3].  */
+struct key_data {
+  const uint8_t *pubkey;	/* Pointer to public key */
+  uint8_t data[MAX_PRVKEY_LEN]; /* decrypted private key data content */
+};
+
+/*
+ * Metadata kept for a private key.  The DEK is held in three encrypted
+ * copies, one per role (compare BY_USER, BY_RESETCODE and BY_ADMIN).
+ */
+struct prvkey_data {
+  /*
+   * IV: Initial Vector
+   */
+  uint8_t iv[INITIAL_VECTOR_SIZE];
+  /*
+   * Checksum
+   */
+  uint8_t checksum_encrypted[DATA_ENCRYPTION_KEY_SIZE];
+  /*
+   * DEK (Data Encryption Key) encrypted
+   */
+  uint8_t dek_encrypted_1[DATA_ENCRYPTION_KEY_SIZE]; /* For user */
+  uint8_t dek_encrypted_2[DATA_ENCRYPTION_KEY_SIZE]; /* For resetcode */
+  uint8_t dek_encrypted_3[DATA_ENCRYPTION_KEY_SIZE]; /* For admin */
+};
+
+#define BY_USER		1
+#define BY_RESETCODE	2
+#define BY_ADMIN	3
+
+/*
+ * Maximum length of pass phrase is 127.
+ * We use the top bit (0x80) to encode if keystring is available within DO.
+ */
+#define PW_LEN_MAX            127
+#define PW_LEN_MASK          0x7f
+#define PW_LEN_KEYSTRING_BIT 0x80
+
+#define SALT_SIZE 8
+
+void s2k (const unsigned char *salt, size_t slen,
+	  const unsigned char *input, size_t ilen, unsigned char output[32]);
+
+#define KEYSTRING_PASSLEN_SIZE  1
+#define KEYSTRING_SALT_SIZE     SALT_SIZE
+#define KEYSTRING_MD_SIZE       32
+#define KEYSTRING_SIZE        (KEYSTRING_PASSLEN_SIZE + KEYSTRING_SALT_SIZE \
+			       + KEYSTRING_MD_SIZE)
+#define KS_META_SIZE          (KEYSTRING_PASSLEN_SIZE + KEYSTRING_SALT_SIZE)
+#define KS_GET_SALT(ks)       (ks + KEYSTRING_PASSLEN_SIZE)
+#define KS_GET_KEYSTRING(ks)  (ks + KS_META_SIZE)
+
+void gpg_do_clear_prvkey (enum kind_of_key kk);
+int gpg_do_load_prvkey (enum kind_of_key kk, int who, const uint8_t *keystring);
+int gpg_do_chks_prvkey (enum kind_of_key kk,
+			int who_old, const uint8_t *old_ks,
+			int who_new, const uint8_t *new_ks);
+
+int gpg_change_keystring (int who_old, const uint8_t *old_ks,
+			  int who_new, const uint8_t *new_ks);
+
+extern struct key_data kd[3];
+
+#ifdef DEBUG
+void stdout_init (void);
+#define DEBUG_MORE 1
+/*
+ * Debug functions in debug.c
+ */
+void put_byte (uint8_t b);
+void put_byte_with_no_nl (uint8_t b);
+void put_short (uint16_t x);
+void put_word (uint32_t x);
+void put_int (uint32_t x);
+void put_string (const char *s);
+void put_binary (const char *s, int len);
+
+#define DEBUG_INFO(msg)	    put_string (msg)
+#define DEBUG_WORD(w)	    put_word (w)
+#define DEBUG_SHORT(h)	    put_short (h)
+#define DEBUG_BYTE(b)       put_byte (b)
+#define DEBUG_BINARY(s,len) put_binary ((const char *)s,len)
+#else
+#define DEBUG_INFO(msg)
+#define DEBUG_WORD(w)
+#define DEBUG_SHORT(h)
+#define DEBUG_BYTE(b)
+#define DEBUG_BINARY(s,len)
+#endif
+
+int rsa_sign (const uint8_t *, uint8_t *, int, struct key_data *, int);
+int modulus_calc (const uint8_t *, int, uint8_t *);
+int rsa_decrypt (const uint8_t *, uint8_t *, int, struct key_data *,
+		 unsigned int *);
+int rsa_verify (const uint8_t *, int, const uint8_t *, const uint8_t *);
+int rsa_genkey (int, uint8_t *, uint8_t *);
+
+int ecdsa_sign_p256k1 (const uint8_t *hash, uint8_t *output,
+		       const uint8_t *key_data);
+int ecc_compute_public_p256k1 (const uint8_t *key_data, uint8_t *);
+int ecc_check_secret_p256k1 (const uint8_t *d0, uint8_t  *d1);
+int ecdh_decrypt_p256k1 (const uint8_t *input, uint8_t *output,
+			 const uint8_t *key_data);
+
+int eddsa_sign_25519 (const uint8_t *input, size_t ilen, uint32_t *output,
+		      const uint8_t *sk_a, const uint8_t *seed,
+		      const uint8_t *pk);
+void eddsa_compute_public_25519 (const uint8_t *a, uint8_t *);
+void ecdh_compute_public_25519 (const uint8_t *a, uint8_t *);
+int ecdh_decrypt_curve25519 (const uint8_t *input, uint8_t *output,
+			     const uint8_t *key_data);
+
+void ecdh_compute_public_x448 (uint8_t *pubkey, const uint8_t *key_data);
+int ecdh_decrypt_x448 (uint8_t *output, const uint8_t *input,
+		       const uint8_t *key_data);
+
+int ed448_sign (uint8_t *out, const uint8_t *input, unsigned int ilen,
+		const uint8_t *a_in, const uint8_t *seed, const uint8_t *pk);
+void ed448_compute_public (uint8_t *pk, const uint8_t *a_in);
+
+
+const uint8_t *gpg_do_read_simple (uint8_t);
+void gpg_do_write_simple (uint8_t, const uint8_t *, int);
+void gpg_increment_digital_signature_counter (void);
+void gpg_do_get_initial_pw_setting (int is_pw3, int *r_len,
+				    const uint8_t **r_p);
+int gpg_do_kdf_check (int len, int how_many);
+int gpg_do_get_uif (enum kind_of_key kk);
+
+
+void fatal (uint8_t code) __attribute__ ((noreturn));
+#define FATAL_FLASH  1
+#define FATAL_RANDOM 2
+#define FATAL_HEAP   3
+
+extern uint8_t keystring_md_pw3[KEYSTRING_MD_SIZE];
+extern uint8_t admin_authorized;
+
+/*** Flash memory tag values ***/
+/* Data objects */
+/*
+ * Representation of data object:
+ *
+ *   <-1 halfword-> <--len/2 halfwords->
+ *   <-tag-><-len-> <---data content--->
+ */
+#define NR_DO_SEX		0x00
+#define NR_DO_FP_SIG		0x01
+#define NR_DO_FP_DEC		0x02
+#define NR_DO_FP_AUT		0x03
+#define NR_DO_CAFP_1		0x04
+#define NR_DO_CAFP_2		0x05
+#define NR_DO_CAFP_3		0x06
+#define NR_DO_KGTIME_SIG	0x07
+#define NR_DO_KGTIME_DEC	0x08
+#define NR_DO_KGTIME_AUT	0x09
+#define NR_DO_LOGIN_DATA	0x0a
+#define NR_DO_URL		0x0b
+#define NR_DO_NAME		0x0c
+#define NR_DO_LANGUAGE		0x0d
+#define NR_DO_PRVKEY_SIG	0x0e
+#define NR_DO_PRVKEY_DEC	0x0f
+#define NR_DO_PRVKEY_AUT	0x10
+#define NR_DO_KEYSTRING_PW1	0x11
+#define NR_DO_KEYSTRING_RC	0x12
+#define NR_DO_KEYSTRING_PW3	0x13
+#define NR_DO_KDF		0x14
+#define NR_DO__LAST__		21   /* == 0x15 */
+/* 14-bit counter for DS: Recorded in flash memory by 1-halfword (2-byte).  */
+/*
+ * Representation of 14-bit counter:
+ *      0: 0x8000
+ *      1: 0x8001
+ *     ...
+ *  16383: 0xbfff
+ */
+#define NR_COUNTER_DS		0x80 /* ..0xbf */
+/* 10-bit counter for DS: Recorded in flash memory by 1-halfword (2-byte).  */
+/*
+ * Representation of 10-bit counter:
+ *      0: 0xc000
+ *      1: 0xc001
+ *     ...
+ *   1023: 0xc3ff
+ */
+#define NR_COUNTER_DS_LSB	0xc0 /* ..0xc3 */
+/*
+ * Boolean object, small enum, or 8-bit integer:
+ * Recorded in flash memory by 1-halfword (2-byte)
+ */
+/*
+ * Representation of Boolean object:
+ *   0: No record in flash memory
+ *   1: 0xf000
+ */
+#define NR_BOOL_PW1_LIFETIME	0xf0
+/*
+ * Representation of algorithm attribute object:
+ *   RSA-2048:       No record in flash memory
+ *   RSA-4096:       0xf?00
+ *   ECC p256r1:     0xf?01
+ *   ECC p256k1:     0xf?02
+ *   ECC Ed25519:    0xf?03
+ *   ECC Curve25519: 0xf?04
+ * where <?> == 1 (signature), 2 (decryption) or 3 (authentication)
+ */
+#define NR_KEY_ALGO_ATTR_SIG	0xf1
+#define NR_KEY_ALGO_ATTR_DEC	0xf2
+#define NR_KEY_ALGO_ATTR_AUT	0xf3
+/*
+ * Representation of User Interaction Flag:
+ *  0 (UIF disabled):            0xf?00 or No record in flash memory
+ *  1 (UIF enabled):             0xf?01
+ *  2 (UIF permanently enabled): 0xf?02
+ *
+ */
+#define NR_DO_UIF_SIG		0xf6
+#define NR_DO_UIF_DEC		0xf7
+#define NR_DO_UIF_AUT		0xf8
+/*
+ * NR_UINT_SOMETHING could be here...  Use 0xf[459abcd]
+ */
+/* 123-counters: Recorded in flash memory by 2-halfword (4-byte).  */
+/*
+ * Representation of 123-counters:
+ *   0: No record in flash memory
+ *   1: 0xfe?? 0xffff
+ *   2: 0xfe?? 0xc3c3
+ *   3: 0xfe?? 0x0000
+ *                    where <counter_id> is placed at second byte <??>
+ */
+#define NR_COUNTER_123		0xfe
+#define NR_EMPTY		0xff
+
+#define SIZE_PW_STATUS_BYTES 7
+
+
+#define NUM_ALL_PRV_KEYS 3	/* SIG, DEC and AUT */
+
+#if !defined(OPENPGP_CARD_INITIAL_PW1)
+#define OPENPGP_CARD_INITIAL_PW1 "123456"
+#endif
+
+#if !defined(OPENPGP_CARD_INITIAL_PW3)
+#define OPENPGP_CARD_INITIAL_PW3 "12345678"
+#endif
+
+extern const uint8_t openpgpcard_aid[14];
+
+void flash_bool_clear (const uint8_t **addr_p);
+const uint8_t *flash_bool_write (uint8_t nr);
+void flash_enum_clear (const uint8_t **addr_p);
+const uint8_t *flash_enum_write (uint8_t nr, uint8_t v);
+int flash_cnt123_get_value (const uint8_t *p);
+void flash_cnt123_increment (uint8_t which, const uint8_t **addr_p);
+void flash_cnt123_clear (const uint8_t **addr_p);
+void flash_put_data (uint16_t hw);
+void flash_warning (const char *msg);
+
+void flash_put_data_internal (const uint8_t *p, uint16_t hw);
+void flash_bool_write_internal (const uint8_t *p, int nr);
+void flash_enum_write_internal (const uint8_t *p, int nr, uint8_t v);
+void flash_cnt123_write_internal (const uint8_t *p, int which, int v);
+void flash_do_write_internal (const uint8_t *p, int nr,
+			      const uint8_t *data, int len);
+
+extern const uint8_t gnuk_string_serial[];
+
+#define LED_ONESHOT		  1
+#define LED_TWOSHOTS		  2
+#define LED_SHOW_STATUS		  4
+#define LED_FATAL		  8
+#define LED_SYNC	         16
+#define LED_GNUK_EXEC		 32
+#define LED_START_COMMAND	 64
+#define LED_FINISH_COMMAND	128
+#define LED_WAIT_FOR_BUTTON	256
+#define LED_OFF	 LED_FINISH_COMMAND
+void led_blink (int spec);
+
+#if defined(PINPAD_SUPPORT)
+# if defined(PINPAD_CIR_SUPPORT)
+void cir_init (void);
+# elif defined(PINPAD_DIAL_SUPPORT)
+void dial_sw_disable (void);
+void dial_sw_enable (void);
+# elif defined(PINPAD_DND_SUPPORT)
+void msc_init (void);
+void msc_media_insert_change (int available);
+int msc_scsi_write (uint32_t lba, const uint8_t *buf, size_t size);
+int msc_scsi_read (uint32_t lba, const uint8_t **sector_p);
+void msc_scsi_stop (uint8_t code);
+# endif
+#define PIN_INPUT_CURRENT 1
+#define PIN_INPUT_NEW     2
+#define PIN_INPUT_CONFIRM 3
+#define MAX_PIN_CHARS 32
+extern uint8_t pin_input_buffer[MAX_PIN_CHARS];
+extern uint8_t pin_input_len;
+
+int pinpad_getline (int msg_code, uint32_t timeout_usec);
+
+#endif
+
+
+extern uint8_t _regnual_start, __heap_end__[];
+
+uint8_t * sram_address (uint32_t offset);
+
+/*
+ * Return a fixed device identifier.
+ *
+ * STM32F103 has a 96-bit unique device identifier and this routine
+ * mimics that interface, but the value returned here is a constant
+ * 20-byte table (an RSA fingerprint), identical on every device.
+ */
+static inline const uint8_t *
+unique_device_id (void)
+{
+  static const uint8_t fixed_id[] = {
+    0x12, 0x41, 0x24, 0xBD, 0x3B, 0x48, 0x62, 0xAF,
+    0x7A, 0x0A, 0x42, 0xF1, 0x00, 0xB4, 0x5E, 0xBD,
+    0x4C, 0xA7, 0xBA, 0xBE
+  };
+
+  return &fixed_id[0];
+}
+
--- a/jpc-ac_p256k1.h
+++ b/jpc-ac_p256k1.h
@ -0,0 +1,14 @@
+/**
+ * @brief	Jacobian projective coordinates
+ *
+ * A point (x, y, z) stands for the affine point (x / z^2, y / z^3);
+ * z == 0 is treated as the point at infinity by the jpc_* routines.
+ * Each coordinate is a one-element array so that the member name can be
+ * passed directly where a bn256 pointer is expected.
+ */
+typedef struct
+{
+  bn256 x[1];
+  bn256 y[1];
+  bn256 z[1];
+} jpc;
+
+/* Point arithmetic for p256k1; the bodies are generated from jpc.c.  */
+/* X = 2 * A */
+void jpc_double_p256k1 (jpc *X, const jpc *A);
+/* X = A + B, with B given in affine coordinates */
+void jpc_add_ac_p256k1 (jpc *X, const jpc *A, const ac *B);
+/* X = A - B when MINUS is non-zero, X = A + B otherwise */
+void jpc_add_ac_signed_p256k1 (jpc *X, const jpc *A, const ac *B, int minus);
+/* Convert A to affine X; returns -1 if A is the point at infinity, else 0 */
+int jpc_to_ac_p256k1 (ac *X, const jpc *A);
--- a/jpc.c
+++ b/jpc.c
@ -0,0 +1,199 @@
+/*
+ * jpc.c -- arithmetic on Jacobian projective coordinates.
+ *
+ * Copyright (C) 2011, 2013 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "field-group-select.h"
+
+/**
+ * @brief	X = 2 * A
+ *
+ * @param X	Destination JPC
+ * @param A	JPC
+ *
+ * When A is the point at infinity (z == 0) the function returns without
+ * writing X at all, so X only holds the right answer if it already did
+ * (for instance when X and A are the same object).
+ *
+ * Every read of an A coordinate happens before the matching X
+ * coordinate is written (x is last read before "d" is stored, y and z
+ * are last read when X->z is formed), so the statement order matters
+ * when X aliases A.
+ */
+void
+FUNC(jpc_double) (jpc *X, const jpc *A)
+{
+  bn256 a[1], b[1], c[1], tmp0[1];
+  bn256 *d;
+
+  if (bn256_is_zero (A->z))		/* A is infinite */
+    return;
+
+  d = X->x;
+  /* a = 4 * x * y^2 ; b keeps a copy of y^2 for the next step.  */
+  MFNC(sqr) (a, A->y);
+  memcpy (b, a, sizeof (bn256));
+  MFNC(mul) (a, a, A->x);
+  MFNC(shift) (a, a, 2);
+
+  /* b = 8 * y^4 */
+  MFNC(sqr) (b, b);
+  MFNC(shift) (b, b, 3);
+
+  /* c = 3 * (x - z^2) * (x + z^2) when a = -3, c = 3 * x^2 when a = 0.  */
+#if defined(COEFFICIENT_A_IS_MINUS_3)
+  MFNC(sqr) (tmp0, A->z);
+  MFNC(sub) (c, A->x, tmp0);
+  MFNC(add) (tmp0, tmp0, A->x);
+  MFNC(mul) (tmp0, tmp0, c);
+  MFNC(shift) (c, tmp0, 1);
+  MFNC(add) (c, c, tmp0);
+#elif defined (COEFFICIENT_A_IS_ZERO)
+  MFNC(sqr) (tmp0, A->x);
+  MFNC(shift) (c, tmp0, 1);
+  MFNC(add) (c, c, tmp0);
+#else
+#error "not supported."
+#endif
+
+  /* X->x = c^2 - 2 * a */
+  MFNC(sqr) (d, c);
+  MFNC(shift) (tmp0, a, 1);
+  MFNC(sub) (d, d, tmp0);
+
+  /* X->z = 2 * y * z */
+  MFNC(mul) (X->z, A->y, A->z);
+  MFNC(shift) (X->z, X->z, 1);
+
+  /* X->y = c * (a - X->x) - b */
+  MFNC(sub) (tmp0, a, d);
+  MFNC(mul) (tmp0, c, tmp0);
+  MFNC(sub) (X->y, tmp0, b);
+}
+
+/**
+ * @brief	X = A + B, or X = A - B
+ *
+ * @param X	Destination JPC
+ * @param A	JPC
+ * @param B	AC
+ * @param minus if non-zero subtraction, addition otherwise.
+ *
+ * Subtraction is done by negating B's y coordinate (CONST_P256 - y).
+ * In the branch that does not need the negation, the same subtraction
+ * is still executed into the scratch variable "tmp", whose value is
+ * never used.
+ * NOTE(review): that looks like an attempt to do the same work for both
+ * signs -- confirm the intent.
+ *
+ * The macros below give descriptive names to the scratch variables
+ * a, b, c as they are reused; they are not #undef'ed afterwards.
+ *
+ * If B (after the sign is applied) is the same point as A, the work is
+ * handed to jpc_double.  If A and the signed B differ only in y, c
+ * becomes zero and the result gets z == 0.
+ */
+void
+FUNC(jpc_add_ac_signed) (jpc *X, const jpc *A, const ac *B, int minus)
+{
+  bn256 a[1], b[1], c[1], d[1], tmp[1];
+#define minus_B_y c
+#define c_sqr a
+#define c_cube b
+#define x1_c_sqr c
+#define x1_c_sqr_2 c
+#define c_cube_plus_x1_c_sqr_2 c
+#define x1_c_sqr_copy a
+#define y3_tmp c
+#define y1_c_cube a
+
+  /* infinity +/- B: the result is (B.x, +/-B.y, 1).  */
+  if (bn256_is_zero (A->z))		/* A is infinite */
+    {
+      memcpy (X->x, B->x, sizeof (bn256));
+      if (minus)
+	{
+	  memcpy (tmp, B->y, sizeof (bn256));
+	  bn256_sub (X->y, CONST_P256, B->y);
+	}
+      else
+	{
+	  memcpy (X->y, B->y, sizeof (bn256));
+	  bn256_sub (tmp, CONST_P256, B->y);
+	}
+      memset (X->z, 0, sizeof (bn256));
+      X->z->word[0] = 1;
+      return;
+    }
+
+  /* a = B.x * z^2 ; b = z^3 * (+/-B.y): B brought to A's scale.  */
+  MFNC(sqr) (a, A->z);
+  memcpy (b, a, sizeof (bn256));
+  MFNC(mul) (a, a, B->x);
+
+  MFNC(mul) (b, b, A->z);
+  if (minus)
+    {
+      bn256_sub (minus_B_y, CONST_P256, B->y);
+      MFNC(mul) (b, b, minus_B_y);
+    }
+  else
+    {
+      bn256_sub (tmp, CONST_P256, B->y);
+      MFNC(mul) (b, b, B->y);
+    }
+
+  /* Same point on both sides: the addition formula does not apply.  */
+  if (bn256_cmp (A->x, a) == 0 && bn256_cmp (A->y, b) == 0)
+    {
+      FUNC(jpc_double) (X, A);
+      return;
+    }
+
+  /* c = a - x1 ; d = b - y1 */
+  MFNC(sub) (c, a, A->x);
+  MFNC(sub) (d, b, A->y);
+
+  /* z3 = z1 * c */
+  MFNC(mul) (X->z, A->z, c);
+
+  MFNC(sqr) (c_sqr, c);
+  MFNC(mul) (c_cube, c_sqr, c);
+
+  MFNC(mul) (x1_c_sqr, A->x, c_sqr);
+
+  /* x3 = d^2 - (c^3 + 2 * x1 * c^2) */
+  MFNC(sqr) (X->x, d);
+  memcpy (x1_c_sqr_copy, x1_c_sqr, sizeof (bn256));
+  MFNC(shift) (x1_c_sqr_2, x1_c_sqr, 1);
+  MFNC(add) (c_cube_plus_x1_c_sqr_2, x1_c_sqr_2, c_cube);
+  MFNC(sub) (X->x, X->x, c_cube_plus_x1_c_sqr_2);
+
+  /* y3 = d * (x1 * c^2 - x3) - y1 * c^3 */
+  MFNC(sub) (y3_tmp, x1_c_sqr_copy, X->x);
+  MFNC(mul) (y3_tmp, y3_tmp, d);
+  MFNC(mul) (y1_c_cube, A->y, c_cube);
+  MFNC(sub) (X->y, y3_tmp, y1_c_cube);
+}
+
+/**
+ * @brief	X = A + B  (unsigned convenience form)
+ *
+ * Forwards to the signed variant with the sign argument fixed to
+ * "addition".
+ *
+ * @param X	Destination JPC
+ * @param A	JPC operand
+ * @param B	AC (affine) operand
+ */
+void
+FUNC(jpc_add_ac) (jpc *X, const jpc *A, const ac *B)
+{
+  const int subtract = 0;
+
+  FUNC(jpc_add_ac_signed) (X, A, B, subtract);
+}
+
+/**
+ * @brief	Convert a Jacobian point to affine coordinates
+ *
+ * Computes X = (A.x / A.z^2, A.y / A.z^3) using one modular inversion.
+ *
+ * @param X	Destination AC
+ * @param A	JPC to convert
+ *
+ * @return	0 on success, -1 when A is the point at infinity
+ *		(A.z == 0); X is left untouched in that case.
+ */
+int
+FUNC(jpc_to_ac) (ac *X, const jpc *A)
+{
+  bn256 zi[1];			/* z^-1, later overwritten by z^-3 */
+  bn256 zi_sq[1];		/* z^-2 */
+
+  if (!bn256_is_zero (A->z))
+    {
+      mod_inv (zi, A->z, CONST_P256);
+
+      MFNC(sqr) (zi_sq, zi);
+      MFNC(mul) (zi, zi, zi_sq);
+
+      MFNC(mul) (X->x, A->x, zi_sq);
+      MFNC(mul) (X->y, A->y, zi);
+      return 0;
+    }
+
+  return -1;
+}
--- a/jpc_p256k1.c
+++ b/jpc_p256k1.c
@ -0,0 +1,36 @@
+/*
+ * jpc_p256k1.c -- arithmetic on Jacobian projective coordinates for p256k1.
+ *
+ * Copyright (C) 2014 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "bn.h"
+#include "mod.h"
+#include "modp256k1.h"
+#include "affine.h"
+#include "jpc-ac_p256k1.h"
+
+/*
+ * Configuration consumed by the generic code in jpc.c, which is pulled
+ * in textually below:
+ *  - CONST_P256 is the field prime handed to bn256_sub and mod_inv;
+ *  - COEFFICIENT_A_IS_ZERO selects the "a = 0" branch of the doubling
+ *    formula.
+ * NOTE(review): FIELD is presumably what FUNC()/MFNC() paste into the
+ * generated names (via field-group-select.h) -- confirm there.
+ */
+#define FIELD p256k1
+#define CONST_P256 P256K1
+#define COEFFICIENT_A_IS_ZERO    1
+
+#include "jpc.c"
--- a/low_flash.c
+++ b/low_flash.c
@ -0,0 +1,54 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "pico/stdlib.h"
+#include "hardware/flash.h"
+#include "hardware/sync.h"
+#include <string.h>
+
+/**
+ * @brief  Program one 16-bit halfword into flash
+ *
+ * @param addr	Memory-mapped (XIP) address of the halfword; must be
+ *		2-byte aligned.
+ * @param data	Value to store, written low byte first.
+ *
+ * @return 0 on success, non-zero when ADDR is not halfword aligned.
+ *
+ * flash_range_program() works on whole FLASH_PAGE_SIZE pages at a
+ * page-aligned offset, so the enclosing page is programmed.  The page
+ * image is filled with 0xff everywhere except at the target halfword:
+ * programming can only clear bits, so 0xff leaves the neighbouring
+ * bytes of the page as they are.
+ */
+int
+flash_program_halfword (uintptr_t addr, uint16_t data)
+{
+  const uintptr_t flash_offs = addr - XIP_BASE;
+  const uintptr_t page_offs = flash_offs & ~(uintptr_t)(FLASH_PAGE_SIZE - 1);
+  const size_t pos = (size_t)(flash_offs - page_offs);
+  uint8_t buf[FLASH_PAGE_SIZE];
+  uint32_t ints;
+
+  /* An odd address is not a halfword slot and could straddle a page.  */
+  if (addr & 1)
+    return 1;
+
+  memset (buf, 0xff, sizeof buf);
+  buf[pos] = (data & 0xff);
+  buf[pos + 1] = (data >> 8);
+
+  ints = save_and_disable_interrupts ();
+  flash_range_program ((uint32_t)page_offs, buf, FLASH_PAGE_SIZE);
+  restore_interrupts (ints);
+  return 0;
+}
+
+/*
+ * 1024 bytes of 0xff (written with the GNU range-designator extension).
+ * No function in this file refers to it.
+ */
+static const uint8_t erased[] = { [0 ... 1023 ] = 0xff };
+
+/**
+ * @brief  Erase the flash sector at ADDR
+ *
+ * @param addr	Memory-mapped (XIP) address of the region to erase.
+ *		It is converted to a flash offset by subtracting
+ *		XIP_BASE; FLASH_SECTOR_SIZE bytes are erased from there.
+ *
+ * @return always 0.
+ *
+ * Interrupts are disabled for the duration of the erase.
+ */
+int
+flash_erase_page (uintptr_t addr)
+{
+  const uint32_t irq_state = save_and_disable_interrupts ();
+
+  flash_range_erase (addr - XIP_BASE, FLASH_SECTOR_SIZE);
+  restore_interrupts (irq_state);
+
+  return 0;
+}
+
+/**
+ * @brief  Check whether a memory range is in the erased state
+ *
+ * @param p_start	First byte of the range.
+ * @param size		Number of bytes to examine.
+ *
+ * @return 1 when every byte equals 0xff (also for an empty range),
+ *	   0 as soon as a different byte is found.
+ */
+int
+flash_check_blank (const uint8_t *p_start, size_t size)
+{
+  size_t i;
+
+  for (i = 0; i < size; i++)
+    {
+      if (p_start[i] != 0xff)
+	return 0;
+    }
+
+  return 1;
+}
+
+/**
+ * @brief  Program LEN bytes from SRC into flash at DST_ADDR
+ *
+ * @param dst_addr	Memory-mapped (XIP) destination address.  It is
+ *			passed on to flash_range_program() as a flash
+ *			offset, which expects page alignment; that is
+ *			left to the caller.
+ * @param src		Data to write.
+ * @param len		Number of bytes available at SRC.
+ *
+ * @return always 0.
+ *
+ * flash_range_program() only accepts whole FLASH_PAGE_SIZE pages.  The
+ * complete pages are programmed straight from SRC.  A trailing partial
+ * page is first copied into a page-sized buffer padded with 0xff, so
+ * that nothing beyond SRC + LEN is read and the padding leaves the rest
+ * of that flash page as it is.
+ */
+int
+flash_write (uintptr_t dst_addr, const uint8_t *src, size_t len)
+{
+  const uint32_t flash_offs = (uint32_t)(dst_addr - XIP_BASE);
+  const size_t tail_len = len % FLASH_PAGE_SIZE;
+  const size_t body_len = len - tail_len;
+  uint8_t tail_page[FLASH_PAGE_SIZE];
+  uint32_t ints;
+
+  if (tail_len != 0)
+    {
+      memset (tail_page, 0xff, sizeof tail_page);
+      memcpy (tail_page, src + body_len, tail_len);
+    }
+
+  ints = save_and_disable_interrupts ();
+  if (body_len != 0)
+    flash_range_program (flash_offs, src, body_len);
+  if (tail_len != 0)
+    flash_range_program (flash_offs + (uint32_t)body_len, tail_page,
+			 FLASH_PAGE_SIZE);
+  restore_interrupts (ints);
+  return 0;
+}
--- a/mod.c
+++ b/mod.c
@ -0,0 +1,352 @@
+/*
+ * mod.c -- modulo arithmetic
+ *
+ * Copyright (C) 2011, 2014 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "bn.h"
+
+/**
+ * @brief X = A mod B (using MU=(1<<(256)+MU_lower)) (Barrett reduction)
+ *
+ * @param X		Destination (256-bit)
+ * @param A		512-bit value to reduce
+ * @param B		Modulus
+ * @param MU_lower	Low 256 bits of MU; the leading 2^256 term of MU
+ *			is implicit and is added by hand below.
+ *
+ * Outline, as far as it can be read from the code: a quotient estimate
+ * q is built from the upper words of A and MU, q * B is subtracted from
+ * the low nine words of A, and then B is subtracted a few more times as
+ * needed.  In those final steps the branch that does not modify X still
+ * performs a subtraction or copy into the scratch variable q.
+ * NOTE(review): presumably so that both branches do similar work --
+ * confirm the intent.
+ */
+void
+mod_reduce (bn256 *X, const bn512 *A, const bn256 *B, const bn256 *MU_lower)
+{
+  bn256 q[1];
+  bn512 q_big[1], tmp[1];
+  uint32_t carry;
+#define borrow carry
+
+  /* tmp = A[15] * MU: product with MU_lower plus the implicit 2^256 term.  */
+  memset (q, 0, sizeof (bn256));
+  q->word[0] = A->word[15];
+  bn256_mul (tmp, q, MU_lower);
+  tmp->word[8] += A->word[15];
+  carry = (tmp->word[8] < A->word[15]);
+  tmp->word[9] += carry;
+
+  /* q_big = A[7..14] * MU, again adding the implicit 2^256 term.  */
+  q->word[7] = A->word[14];
+  q->word[6] = A->word[13];
+  q->word[5] = A->word[12];
+  q->word[4] = A->word[11];
+  q->word[3] = A->word[10];
+  q->word[2] = A->word[9];
+  q->word[1] = A->word[8];
+  q->word[0] = A->word[7];
+  bn256_mul (q_big, q, MU_lower);
+  bn256_add ((bn256 *)&q_big->word[8], (bn256 *)&q_big->word[8], q);
+
+  /*
+   * q = q_big[9..15] + tmp[1..8], word by word with manual carry
+   * propagation; the carry out of the top word is kept in "carry".
+   */
+  q->word[0] = q_big->word[9] + tmp->word[1];
+  carry = (q->word[0] < tmp->word[1]);
+
+  q->word[1] = q_big->word[10] + carry;
+  carry = (q->word[1] < carry);
+  q->word[1] += tmp->word[2];
+  carry += (q->word[1] < tmp->word[2]);
+
+  q->word[2] = q_big->word[11] + carry;
+  carry = (q->word[2] < carry);
+  q->word[2] += tmp->word[3];
+  carry += (q->word[2] < tmp->word[3]);
+
+  q->word[3] = q_big->word[12] + carry;
+  carry = (q->word[3] < carry);
+  q->word[3] += tmp->word[4];
+  carry += (q->word[3] < tmp->word[4]);
+
+  q->word[4] = q_big->word[13] + carry;
+  carry = (q->word[4] < carry);
+  q->word[4] += tmp->word[5];
+  carry += (q->word[4] < tmp->word[5]);
+
+  q->word[5] = q_big->word[14] + carry;
+  carry = (q->word[5] < carry);
+  q->word[5] += tmp->word[6];
+  carry += (q->word[5] < tmp->word[6]);
+
+  q->word[6] = q_big->word[15] + carry;
+  carry = (q->word[6] < carry);
+  q->word[6] += tmp->word[7];
+  carry += (q->word[6] < tmp->word[7]);
+
+  q->word[7] = carry;
+  q->word[7] += tmp->word[8];
+  carry = (q->word[7] < tmp->word[8]);
+
+  /* q_big = low nine words of A.  */
+  memset (q_big, 0, sizeof (bn512));
+  q_big->word[8] = A->word[8];
+  q_big->word[7] = A->word[7];
+  q_big->word[6] = A->word[6];
+  q_big->word[5] = A->word[5];
+  q_big->word[4] = A->word[4];
+  q_big->word[3] = A->word[3];
+  q_big->word[2] = A->word[2];
+  q_big->word[1] = A->word[1];
+  q_big->word[0] = A->word[0];
+
+  /* tmp = q * B, with only its low nine words kept.  */
+  bn256_mul (tmp, q, B);
+  tmp->word[8] += carry * B->word[0];
+  tmp->word[15] = tmp->word[14] = tmp->word[13] = tmp->word[12]
+    = tmp->word[11] = tmp->word[10] = tmp->word[9] = 0;
+
+  /* X plus the ninth word in q_big->word[8] = (A - q * B), nine words.  */
+  borrow = bn256_sub (X, (bn256 *)&q_big->word[0], (bn256 *)&tmp->word[0]);
+  q_big->word[8] -= borrow;
+  q_big->word[8] -= tmp->word[8];
+
+  /* While the ninth word is non-zero, subtract B (else a dummy into q).  */
+  carry = q_big->word[8];
+  if (carry)
+    carry -= bn256_sub (X, X, B);
+  else
+    bn256_sub (q, X, B);
+
+  if (carry)
+    bn256_sub (X, X, B);
+  else
+    bn256_sub (q, X, B);
+
+  /* Final conditional subtraction: keep X if X < B, else take X - B.  */
+  borrow = bn256_sub (q, X, B);
+  if (borrow)
+    memcpy (q, X, sizeof (bn256));
+  else
+    memcpy (X, q, sizeof (bn256));
+#undef borrow
+}
+
+/*
+ * Reference:
+ * Donald E. Knuth, The Art of Computer Programming, Vol. 2:
+ * Seminumerical Algorithms, 3rd ed. Reading, MA: Addison-Wesley, 1998
+ *
+ * Max loop: X=0x8000...0000 and N=0xffff...ffff
+ */
+#define MAX_GCD_STEPS_BN256 (3*256-2)
+
+/**
+ * @brief C = X^(-1) mod N
+ *
+ * Assume X and N are co-prime (or N is prime).
+ * NOTE: If X==0, it return 0.
+ *
+ * Binary GCD on the pair (u, v) = (X, N), with A and C tracking the
+ * matching multipliers modulo N.  The loop always runs exactly
+ * MAX_GCD_STEPS_BN256 times; the "break" statements only leave the
+ * switch, never the loop.
+ *
+ * Every case of the switch performs the same sequence of calls: two
+ * "halve" steps followed by one "subtract" step.  The steps that do
+ * not apply to the current parities operate on the scratch variable
+ * tmp, which is only ever combined with itself or overwritten and never
+ * reaches u, v, A or C.
+ * NOTE(review): presumably this keeps the sequence of operations
+ * independent of the operand values -- confirm the intent before
+ * "simplifying" any of the dummy operations away.
+ *
+ * Halving a multiplier: if it is odd, N is added first so that the
+ * value becomes even; the carry out of that addition is put back as
+ * the top bit after the one-bit right shift.
+ */
+void
+mod_inv (bn256 *C, const bn256 *X, const bn256 *N)
+{
+  bn256 u[1], v[1], tmp[1];
+  bn256 A[1] = { { { 1, 0, 0, 0, 0, 0, 0, 0 } } };
+  uint32_t carry;
+#define borrow carry
+  int n = MAX_GCD_STEPS_BN256;
+
+  memset (tmp, 0, sizeof (bn256));
+  memset (C, 0, sizeof (bn256));
+  memcpy (u, X, sizeof (bn256));
+  memcpy (v, N, sizeof (bn256));
+
+  while (n--)
+    {
+      /* Upper bit: parity test of u; lower bit: parity test of v.  */
+      int c = (bn256_is_even (u) << 1) + bn256_is_even (v);
+
+      switch (c)
+	{
+	/* u even, v even: halve (u, A) and (v, C); dummy subtract.  */
+	case 3:
+	  bn256_shift (u, u, -1);
+	  if (bn256_is_even (A))
+	    {
+	      bn256_add (tmp, A, N);
+	      carry = 0;
+	    }
+	  else
+	    carry = bn256_add (A, A, N);
+
+	  bn256_shift (A, A, -1);
+	  A->word[7] |= carry * 0x80000000;
+
+	  bn256_shift (v, v, -1);
+	  if (bn256_is_even (C))
+	    {
+	      bn256_add (tmp, C, N);
+	      carry = 0;
+	    }
+	  else
+	    carry = bn256_add (C, C, N);
+
+	  bn256_shift (C, C, -1);
+	  C->word[7] |= carry * 0x80000000;
+
+	  if (bn256_is_ge (tmp, tmp))
+	    {
+	      bn256_sub (tmp, tmp, tmp);
+	      borrow = bn256_sub (tmp, tmp, tmp);
+	      if (borrow)
+		bn256_add (tmp, tmp, tmp);
+	      else
+		bn256_add (tmp, A, N);
+	    }
+	  else
+	    {
+	      bn256_sub (tmp, tmp, tmp);
+	      borrow = bn256_sub (tmp, tmp, tmp);
+	      if (borrow)
+		bn256_add (tmp, tmp, tmp);
+	      else
+		bn256_add (tmp, tmp, N);
+	    }
+	  break;
+
+	/* u odd, v even: dummy halve; halve (v, C); dummy subtract.  */
+	case 1:
+	  bn256_shift (tmp, tmp, -1);
+	  if (bn256_is_even (tmp))
+	    {
+	      bn256_add (tmp, tmp, N);
+	      carry = 0;
+	    }
+	  else
+	    carry = bn256_add (tmp, tmp, N);
+
+	  bn256_shift (tmp, tmp, -1);
+	  tmp->word[7] |= carry * 0x80000000;
+
+	  bn256_shift (v, v, -1);
+	  if (bn256_is_even (C))
+	    {
+	      bn256_add (tmp, C, N);
+	      carry = 0;
+	    }
+	  else
+	    carry = bn256_add (C, C, N);
+
+	  bn256_shift (C, C, -1);
+	  C->word[7] |= carry * 0x80000000;
+
+	  if (bn256_is_ge (tmp, tmp))
+	    {
+	      bn256_sub (tmp, tmp, tmp);
+	      borrow = bn256_sub (tmp, tmp, tmp);
+	      if (borrow)
+		bn256_add (tmp, tmp, tmp);
+	      else
+		bn256_add (tmp, A, N);
+	    }
+	  else
+	    {
+	      bn256_sub (tmp, tmp, tmp);
+	      borrow = bn256_sub (tmp, tmp, tmp);
+	      if (borrow)
+		bn256_add (tmp, tmp, tmp);
+	      else
+		bn256_add (tmp, tmp, N);
+	    }
+	  break;
+
+	/* u even, v odd: halve (u, A); dummy halve; dummy subtract.  */
+	case 2:
+	  bn256_shift (u, u, -1);
+	  if (bn256_is_even (A))
+	    {
+	      bn256_add (tmp, A, N);
+	      carry = 0;
+	    }
+	  else
+	    carry = bn256_add (A, A, N);
+
+	  bn256_shift (A, A, -1);
+	  A->word[7] |= carry * 0x80000000;
+
+	  bn256_shift (tmp, tmp, -1);
+	  if (bn256_is_even (tmp))
+	    {
+	      bn256_add (tmp, tmp, N);
+	      carry = 0;
+	    }
+	  else
+	    carry = bn256_add (tmp, tmp, N);
+
+	  bn256_shift (tmp, tmp, -1);
+	  tmp->word[7] |= carry * 0x80000000;
+
+	  if (bn256_is_ge (tmp, tmp))
+	    {
+	      bn256_sub (tmp, tmp, tmp);
+	      borrow = bn256_sub (tmp, tmp, tmp);
+	      if (borrow)
+		bn256_add (tmp, tmp, tmp);
+	      else
+		bn256_add (tmp, A, N);
+	    }
+	  else
+	    {
+	      bn256_sub (tmp, tmp, tmp);
+	      borrow = bn256_sub (tmp, tmp, tmp);
+	      if (borrow)
+		bn256_add (tmp, tmp, tmp);
+	      else
+		bn256_add (tmp, tmp, N);
+	    }
+	  break;
+
+	/* u odd, v odd: two dummy halves; then the real subtract step.  */
+	case 0:
+	  bn256_shift (tmp, tmp, -1);
+	  if (bn256_is_even (tmp))
+	    {
+	      bn256_add (tmp, tmp, N);
+	      carry = 0;
+	    }
+	  else
+	    carry = bn256_add (tmp, tmp, N);
+
+	  bn256_shift (tmp, tmp, -1);
+	  tmp->word[7] |= carry * 0x80000000;
+
+	  bn256_shift (tmp, tmp, -1);
+	  if (bn256_is_even (tmp))
+	    {
+	      bn256_add (tmp, tmp, N);
+	      carry = 0;
+	    }
+	  else
+	    carry = bn256_add (tmp, tmp, N);
+
+	  bn256_shift (tmp, tmp, -1);
+	  tmp->word[7] |= carry * 0x80000000;
+
+	  /*
+	   * Subtract the smaller of (u, v) from the larger, and the
+	   * matching multiplier likewise; a borrow on the multiplier is
+	   * repaired by adding N (otherwise a dummy add into tmp).
+	   */
+	  if (bn256_is_ge (u, v))
+	    {
+	      bn256_sub (u, u, v);
+	      borrow = bn256_sub (A, A, C);
+	      if (borrow)
+		bn256_add (A, A, N);
+	      else
+		bn256_add (tmp, A, N);
+	    }
+	  else
+	    {
+	      bn256_sub (v, v, u);
+	      borrow = bn256_sub (C, C, A);
+	      if (borrow)
+		bn256_add (C, C, N);
+	      else
+		bn256_add (tmp, C, N);
+	    }
+	  break;
+	}
+    }
+#undef borrow
+}
--- a/mod.h
+++ b/mod.h
@ -0,0 +1,3 @@
+/* X = A mod B, where MU_lower is the low 256 bits of MU = 2^256 + MU_lower.  */
+void mod_reduce (bn256 *X, const bn512 *A, const bn256 *B,
+		 const bn256 *MU_lower);
+/*
+ * X = A^(-1) mod N.  (The definition in mod.c names the same three
+ * parameters C, X and N.)
+ */
+void mod_inv (bn256 *X, const bn256 *A, const bn256 *N);
--- a/mod25638.c
+++ b/mod25638.c
@ -0,0 +1,287 @@
+/*
+ * mod25638.c -- modulo arithmetic of 2^256-38 for 2^255-19 field
+ *
+ * Copyright (C) 2014 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * The field is \Z/(2^255-19)
+ *
+ * We use radix-32.  During computation, it's not reduced to 2^255-19,
+ * but it is represented in 256-bit (it is redundant representation),
+ * that is, something like 2^256-38.
+ *
+ * The idea is, keeping within 256-bit until it will be converted to
+ * affine coordinates.
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "bn.h"
+#include "mod25638.h"
+
+/*
+ * Selection of the assembler code path.  ASM_IMPLEMENTATION is defined
+ * as 0 here; when BN256_C_IMPLEMENTATION is defined it stays undefined,
+ * which "#if" also evaluates as 0.  So nothing in this file switches
+ * the assembler path on.
+ */
+#ifndef BN256_C_IMPLEMENTATION
+#define ASM_IMPLEMENTATION 0
+#endif
+
+#if ASM_IMPLEMENTATION
+#include "muladd_256.h"
+/*
+ * ADDWORD_256 (d_, s_, w_, c_): add the single word w_ to the eight
+ * words read through s_, store the eight result words through d_, and
+ * set c_ to the final carry.  Both pointer arguments are outputs of the
+ * asm statement and come back advanced past the eight words.  Uses
+ * r4-r7 as scratch registers.
+ */
+#define ADDWORD_256(d_,s_,w_,c_)		        \
+ asm ( "ldmia  %[s]!, { r4, r5, r6, r7 } \n\t"          \
+       "adds   r4, r4, %[w]             \n\t"           \
+       "adcs   r5, r5, #0               \n\t"           \
+       "adcs   r6, r6, #0               \n\t"           \
+       "adcs   r7, r7, #0               \n\t"           \
+       "stmia  %[d]!, { r4, r5, r6, r7 }\n\t"           \
+       "ldmia  %[s]!, { r4, r5, r6, r7 } \n\t"          \
+       "adcs   r4, r4, #0               \n\t"           \
+       "adcs   r5, r5, #0               \n\t"           \
+       "adcs   r6, r6, #0               \n\t"           \
+       "adcs   r7, r7, #0               \n\t"           \
+       "stmia  %[d]!, { r4, r5, r6, r7 }\n\t"           \
+       "mov    %[c], #0                 \n\t"           \
+       "adc    %[c], %[c], #0"                          \
+       : [s] "=&r" (s_), [d] "=&r" (d_), [c] "=&r" (c_)	\
+       : "[s]" (s_), "[d]" (d_), [w] "r" (w_)		\
+       : "r4", "r5", "r6", "r7", "memory", "cc" )
+#endif
+
+/*
+256      224      192      160      128       96       64       32        0
+2^256
+  1 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
+2^256 - 16
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffff0
+2^256 - 16 - 2
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffee
+2^256 - 16 - 2 - 1
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed
+*/
+/*
+ * The field prime 2^255 - 19, stored as eight 32-bit words with the
+ * least significant word first (bottom word 0xffffffed, top word
+ * 0x7fffffff).
+ */
+const bn256 p25519[1] = {
+  {{ 0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff,
+     0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }} };
+
+
+/*
+ * Implementation Note.
+ *
+ * It's not always modulo n25638.  The representation is redundant
+ * during computation.  For example, when we add the number - 1 and 1,
+ * it won't overflow to 2^256, and the result is represented within
+ * 256-bit.
+ */
+
+
+/**
+ * @brief  X = (A + B) mod 2^256-38
+ *
+ * The result stays in the redundant 256-bit representation.  Since
+ * 2^256 is congruent to 38 modulo 2^256-38, a carry out of the top word
+ * is folded back in by adding 38.
+ */
+void
+mod25638_add (bn256 *X, const bn256 *A, const bn256 *B)
+{
+  uint32_t overflow = bn256_add (X, A, B);
+
+  /* Fold the carry of the plain addition ...  */
+  overflow = bn256_add_uint (X, X, overflow * 38);
+  /* ... and the carry that the fold itself may have produced.  */
+  X->word[0] += overflow * 38;
+}
+
+/**
+ * @brief  X = (A - B) mod 2^256-38
+ *
+ * The result stays in the redundant 256-bit representation.  A borrow
+ * out of the top word means the value wrapped by 2^256, which is
+ * congruent to 38, so 38 is subtracted to compensate.
+ */
+void
+mod25638_sub (bn256 *X, const bn256 *A, const bn256 *B)
+{
+  uint32_t underflow = bn256_sub (X, A, B);
+
+  /* Compensate the borrow of the plain subtraction ...  */
+  underflow = bn256_sub_uint (X, X, underflow * 38);
+  /* ... and the borrow that the compensation itself may have produced.  */
+  X->word[0] -= underflow * 38;
+}
+
+
+/**
+ * @brief  X = A mod 2^256-38
+ *
+ * Note that the second argument is not "const bn512 *".
+ * A is modified during the computation of modulo.
+ *
+ * It's not precisely modulo 2^256-38 for all cases,
+ * but result may be redundant.
+ *
+ * Method: with A = A_high * 2^256 + A_low and 2^256 congruent to 38,
+ * the low half of A is replaced in place by A_low + 38 * A_high; what
+ * overflows the low half is multiplied by 38 again and added while
+ * copying to X, and one last carry is folded into word 0.
+ */
+static void
+mod25638_reduce (bn256 *X, bn512 *A)
+{
+  const uint32_t *s;
+  uint32_t *d;
+  uint32_t w;
+
+#if ASM_IMPLEMENTATION
+  uint32_t c, c0;
+
+  s = &A->word[8]; d = &A->word[0]; w = 38; MULADD_256 (s, d, w, c);
+  c0 = A->word[8] * 38;
+  d = &X->word[0];
+  s = &A->word[0];
+  ADDWORD_256 (d, s, c0, c);
+  X->word[0] += c * 38;
+#else
+  /* s: high half of A (source), d: low half of A (updated in place).  */
+  s = &A->word[8]; d = &A->word[0]; w = 38;
+  {
+    int i;
+    uint64_t r;
+    uint32_t carry;
+
+    r = 0;
+    for (i = 0; i < BN256_WORDS; i++)
+      {
+	uint64_t uv;
+
+	/* r = carry-in + d[i] + s[i] * 38, with overflow of r tracked.  */
+	r += d[i];
+	carry = (r < d[i]);
+
+	uv = ((uint64_t)s[i])*w;
+	r += uv;
+	carry += (r < uv);
+
+	/* Low word is the result; the rest is the next carry-in.  */
+	d[i] = (uint32_t)r;
+	r = ((r >> 32) | ((uint64_t)carry << 32));
+      }
+
+    /* r is what overflowed the low half: fold it back in as r * 38.  */
+    carry = bn256_add_uint (X, (bn256 *)A, r * 38);
+    X->word[0] += carry * 38;
+  }
+#endif
+}
+
+/**
+ * @brief  X = (A * B) mod 2^256-38
+ *
+ * Forms the full 512-bit product in a local buffer and reduces it; the
+ * result may be in the redundant representation (see mod25638_reduce).
+ */
+void
+mod25638_mul (bn256 *X, const bn256 *A, const bn256 *B)
+{
+  bn512 product[1];
+
+  bn256_mul (product, A, B);
+  mod25638_reduce (X, product);
+}
+
+/**
+ * @brief  X = A * A mod 2^256-38
+ *
+ * Forms the full 512-bit square in a local buffer and reduces it; the
+ * result may be in the redundant representation (see mod25638_reduce).
+ */
+void
+mod25638_sqr (bn256 *X, const bn256 *A)
+{
+  bn512 square[1];
+
+  bn256_sqr (square, A);
+  mod25638_reduce (X, square);
+}
+
+
+/**
+ * @brief  X = (A << shift) mod 2^256-38
+ * @note   shift < 32
+ *
+ * A negative SHIFT is passed to bn256_shift as is and nothing further
+ * is done.  Otherwise the value returned by bn256_shift is treated as
+ * the bits that left the top of the number; they are worth
+ * carry * 2^256, i.e. carry * 38, which is built below as
+ * carry*2 + carry*4 + carry*32 in a two-word value and added to X.
+ */
+void
+mod25638_shift (bn256 *X, const bn256 *A, int shift)
+{
+  uint32_t carry;
+  bn256 tmp[1];
+
+  carry = bn256_shift (X, A, shift);
+  if (shift < 0)
+    return;
+
+  memset (tmp, 0, sizeof (bn256));
+  /* tmp = carry * 2 */
+  tmp->word[0] = (carry << 1);
+  /* tmp->word[1] = (carry >> 31);  always zero.  */
+  /* tmp += carry * 4 */
+  tmp->word[0] = tmp->word[0] + (carry << 2);
+  tmp->word[1] = (tmp->word[0] < (carry << 2)) + (carry >> 30);
+  /* tmp += carry * 32 */
+  tmp->word[0] = tmp->word[0] + (carry << 5);
+  tmp->word[1] = tmp->word[1] + (tmp->word[0] < (carry << 5)) + (carry >> 27);
+
+  mod25638_add (X, X, tmp);
+}
+
+
+/**
+ * @brief  X = A mod 2^255-19
+ *
+ * It's precisely modulo 2^255-19 (unlike mod25638_reduce).
+ *
+ * X is reduced in place.  Bit 255 is worth 19 (2^255 is congruent to
+ * 19), so it is cleared and 19 is added instead, at most twice.  The
+ * remaining 255-bit value r0 may still be >= 2^255-19; that is detected
+ * by computing r0 + 19 and checking whether bit 255 becomes set, in
+ * which case r0 + 19 - 2^255 is the answer.
+ *
+ * Each path performs a similar number of add/mask steps (some are
+ * marked "dummy"), and the empty asm statements sit around one of two
+ * otherwise identical memcpy calls.
+ * NOTE(review): presumably this balances the branches and stops the
+ * compiler from merging them -- confirm before restructuring.
+ */
+void
+mod25519_reduce (bn256 *X)
+{
+  uint32_t q;
+  bn256 r0[1], r1[1];
+  int flag;
+
+  memcpy (r0, X, sizeof (bn256));
+  q = (r0->word[7] >> 31);
+  r0->word[7] &= 0x7fffffff;
+  if (q)
+    {
+      /* Bit 255 was set: replace it by +19.  */
+      bn256_add_uint (r0, r0, 19);
+      q = (r0->word[7] >> 31);
+      r0->word[7] &= 0x7fffffff;
+      if (q)
+	{
+	  /* The addition set bit 255 again: fold once more, into r1.  */
+	  bn256_add_uint (r1, r0, 19);
+	  q = (r1->word[7] >> 31);
+	  r1->word[7] &= 0x7fffffff;
+	  flag = 0;
+	}
+      else
+	flag = 1;
+    }
+  else
+    {
+      bn256_add_uint (r1, r0, 19);
+      q = (r1->word[7] >> 31);	 /* dummy */
+      r1->word[7] &= 0x7fffffff; /* dummy */
+      if (q)
+	flag = 2;
+      else
+	flag = 3;
+    }
+
+  if (flag)
+    {
+      /* r0 is below 2^255: choose between r0 and r0 - (2^255-19).  */
+      bn256_add_uint (r1, r0, 19);
+      q = (r1->word[7] >> 31);
+      r1->word[7] &= 0x7fffffff;
+      if (q)
+	memcpy (X, r1, sizeof (bn256));
+      else
+	memcpy (X, r0, sizeof (bn256));
+    }
+  else
+    {
+      /* Folded twice: r1 is the result on both branches.  */
+      if (q)
+	{
+	  asm volatile ("" : : "r" (q) : "memory");
+	  memcpy (X, r1, sizeof (bn256));
+	  asm volatile ("" : : "r" (q) : "memory");
+	}
+      else
+	memcpy (X, r1, sizeof (bn256));
+    }
+}
--- a/mod25638.h
+++ b/mod25638.h
@ -0,0 +1,7 @@
+/* The prime 2^255 - 19.  */
+extern const bn256 p25519[1];
+
+/*
+ * Arithmetic modulo 2^256-38; results may be in a redundant 256-bit
+ * representation.  mod25519_reduce() brings X down to the precise
+ * residue modulo 2^255-19.
+ * Note: mod25638.c also defines a non-static mod25638_shift(), which
+ * is not declared in this header.
+ */
+void mod25638_add (bn256 *X, const bn256 *A, const bn256 *B);
+void mod25638_sub (bn256 *X, const bn256 *A, const bn256 *B);
+void mod25638_mul (bn256 *X, const bn256 *A, const bn256 *B);
+void mod25638_sqr (bn256 *X, const bn256 *A);
+void mod25519_reduce (bn256 *X);
--- a/modp256k1.c
+++ b/modp256k1.c
@ -0,0 +1,315 @@
+/*
+ * modp256k1.c -- modulo arithmetic for p256k1
+ *
+ * Copyright (C) 2014, 2016, 2020 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * p256k1 =  2^256 - 2^32 - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 1
+ */
+#include <stdint.h>
+#include <string.h>
+
+#include "bn.h"
+#include "modp256k1.h"
+
+/*
+256      224      192      160      128       96       64       32        0
+2^256
+  1 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
+2^256 - 2^32
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff 00000000
+2^256 - 2^32 - 2^9
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe fffffe00
+2^256 - 2^32 - 2^9 - 2^8
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe fffffd00
+2^256 - 2^32 - 2^9 - 2^8 - 2^7
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe fffffc80
+2^256 - 2^32 - 2^9 - 2^8 - 2^7 - 2^6
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe fffffc40
+2^256 - 2^32 - 2^9 - 2^8 - 2^7 - 2^6 - 2^4
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe fffffc30
+2^256 - 2^32 - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 1
+  0 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe fffffc2f
+*/
+/*
+ * The prime from the last row of the table above, stored as eight
+ * 32-bit words with the least significant word first.
+ */
+const bn256 p256k1 = { {0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff,
+			0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff } };
+
+/*
+ * Implementation Note.
+ *
+ * It's always modulo p256k1.
+ *
+ * Once, I tried redundant representation which caused wrong
+ * calculation.  Implementation could be correct with redundant
+ * representation, but it found that it's more expensive.
+ *
+ */
+
+/**
+ * @brief  X = (A + B) mod p256k1
+ *
+ * The plain sum is written to X and (sum - P256K1) to tmp.  COND is
+ * true only when the addition produced no carry and the subtraction
+ * borrowed; in that case X is already the result and tmp is copied to
+ * the throw-away buffer "dummy".  Otherwise tmp is copied over X.
+ * Exactly one memcpy runs either way; only its destination differs.
+ * NOTE(review): the empty asm naming "dummy" as a memory operand is
+ * presumably there so the compiler keeps the copy into dummy --
+ * confirm the intent.
+ */
+void
+modp256k1_add (bn256 *X, const bn256 *A, const bn256 *B)
+{
+  uint32_t cond;
+  bn256 tmp[1];
+  bn256 dummy[1];
+
+  cond = (bn256_add (X, A, B) == 0);
+  cond &= bn256_sub (tmp, X, P256K1);
+  memcpy (cond?dummy:X, tmp, sizeof (bn256));
+  asm ("" : "=m" (dummy) : "m" (dummy) : "memory");
+}
+
+/**
+ * @brief  X = (A - B) mod p256k1
+ *
+ * The plain difference is written to X and (difference + P256K1) to
+ * tmp.  If the subtraction borrowed, tmp is copied over X; otherwise
+ * it is copied to the throw-away buffer "dummy".  Exactly one memcpy
+ * runs either way; only its destination differs.
+ * NOTE(review): the empty asm naming "dummy" as a memory operand is
+ * presumably there so the compiler keeps the copy into dummy --
+ * confirm the intent.
+ */
+void
+modp256k1_sub (bn256 *X, const bn256 *A, const bn256 *B)
+{
+  uint32_t borrow;
+  bn256 tmp[1];
+  bn256 dummy[1];
+
+  borrow = bn256_sub (X, A, B);
+  bn256_add (tmp, X, P256K1);
+  memcpy (borrow?X:dummy, tmp, sizeof (bn256));
+  asm ("" : "=m" (dummy) : "m" (dummy) : "memory");
+}
+
+/**
+ * @brief  X = A mod p256k1
+ *
+ * @param X	Destination (256-bit)
+ * @param A	512-bit value to reduce
+ *
+ * The names W1..W7 and S below are all aliases of the single scratch
+ * variable tmp (only W0 is X): each shifted copy of A_high is added
+ * into W0 before the next one overwrites tmp, so the order of the
+ * statements must not be changed.  s00/s01/s02 are the three low words
+ * of tmp.
+ */
+void
+modp256k1_reduce (bn256 *X, const bn512 *A)
+{
+  bn256 tmp[1];
+  uint32_t carry;
+#define borrow carry
+  uint32_t s0, s1;
+#define s00 tmp->word[0]
+#define s01 tmp->word[1]
+#define s02 tmp->word[2]
+
+#define W0 X
+#define W1 tmp
+#define W2 tmp
+#define W3 tmp
+#define W4 tmp
+#define W5 tmp
+#define W6 tmp
+#define W7 tmp
+#define S  tmp
+
+  /*
+   * Suppose: P256K1 = 2^256 - CONST
+   * Then, compute: W = A_low + A_high * CONST
+   *                256-bit W0 = W mod 2^256
+   *                64-bit (S1, S0) = W / 2^256
+   * where: CONST = 2^32 + 2^9 + 2^8 + 2^7 + 2^6 + 2^4 + 1
+   */
+
+  /* W0 = A_low   */
+  /* W7 = A_high  */
+  /* W0 += W7 */
+  carry = bn256_add (W0, (const bn256 *)&A->word[8], (const bn256 *)A);
+
+  /* W6 = W7 << 4 */
+  /* W0 += W6 */
+  bn256_shift (W6, (const bn256 *)&A->word[8], 4);
+  carry += bn256_add (W0, W0, W6);
+
+  /* W5 = W6 << 2 */
+  /* W0 += W5 */
+  bn256_shift (W5, W6, 2);
+  carry += bn256_add (W0, W0, W5);
+
+  /* W4 = W5 << 1 */
+  /* W0 += W4 */
+  bn256_shift (W4, W5, 1);
+  carry += bn256_add (W0, W0, W4);
+
+  /* W3 = W4 << 1 */
+  /* W0 += W3 */
+  bn256_shift (W3, W4, 1);
+  carry += bn256_add (W0, W0, W3);
+
+  /* W2 = W3 << 1 */
+  /* W0 += W2 */
+  bn256_shift (W2, W3, 1);
+  carry += bn256_add (W0, W0, W2);
+
+  /* W1 = A_high << 32 */
+  /* W0 += W1 */
+  W1->word[7] = A->word[14];
+  W1->word[6] = A->word[13];
+  W1->word[5] = A->word[12];
+  W1->word[4] = A->word[11];
+  W1->word[3] = A->word[10];
+  W1->word[2] = A->word[9];
+  W1->word[1] = A->word[8];
+  W1->word[0] = 0;
+  carry += bn256_add (W0, W0, W1);
+
+  /* (S1, S0) = W / 2^256 */
+  /*
+   * On top of the carries of the additions, this collects the bits of
+   * the top word of A_high that the shifts by 4, 6, 7, 8 and 9 pushed
+   * out, and the whole top word for the shift by 32.
+   */
+  s0 = A->word[15];
+  carry += (s0 >> 28) + (s0 >> 26) + (s0 >> 25) + (s0 >> 24) + (s0 >> 23);
+  carry += s0;
+  s1 = (carry < s0) ? 1 : 0;
+  s0 = carry;
+
+  /*
+   * Compute: S:=(S02, S01, S00), S = (S1,S0)*CONST
+   */
+  S->word[7] = S->word[6] = S->word[5] = S->word[4] = S->word[3] = 0;
+
+  /* (S02, S01, S00) = (S1, S0) + (S1, S0)*2^32 */
+  s00 = s0;
+  s01 = s0 + s1;
+  s02 = s1 + ((s01 < s0)? 1 : 0);
+
+  /* (S02, S01, S00) += (S1, S0)*2^9 */
+  carry = (s0 >> 23) + s01;
+  s02 += (s1 >> 23) + ((carry < s01)? 1 : 0);
+  s01 = (s1 << 9) + carry;
+  s02 += ((s01 < carry)? 1 : 0);
+  s00 += (s0 << 9);
+  carry = ((s00 < (s0 << 9))? 1 : 0);
+  s01 += carry;
+  s02 += ((s01 < carry)? 1 : 0);
+
+  /* (S02, S01, S00) += (S1, S0)*2^8 */
+  carry = (s0 >> 24) + s01;
+  s02 += (s1 >> 24) + ((carry < s01)? 1 : 0);
+  s01 = (s1 << 8) + carry;
+  s02 += ((s01 < carry)? 1 : 0);
+  s00 += (s0 << 8);
+  carry = ((s00 < (s0 << 8))? 1 : 0);
+  s01 += carry;
+  s02 += ((s01 < carry)? 1 : 0);
+
+  /* (S02, S01, S00) += (S1, S0)*2^7 */
+  carry = (s0 >> 25) + s01;
+  s02 += (s1 >> 25) + ((carry < s01)? 1 : 0);
+  s01 = (s1 << 7) + carry;
+  s02 += ((s01 < carry)? 1 : 0);
+  s00 += (s0 << 7);
+  carry = ((s00 < (s0 << 7))? 1 : 0);
+  s01 += carry;
+  s02 += ((s01 < carry)? 1 : 0);
+
+  /* (S02, S01, S00) += (S1, S0)*2^6 */
+  carry = (s0 >> 26) + s01;
+  s02 += (s1 >> 26) + ((carry < s01)? 1 : 0);
+  s01 = (s1 << 6) + carry;
+  s02 += ((s01 < carry)? 1 : 0);
+  s00 += (s0 << 6);
+  carry = ((s00 < (s0 << 6))? 1 : 0);
+  s01 += carry;
+  s02 += ((s01 < carry)? 1 : 0);
+
+  /* (S02, S01, S00) += (S1, S0)*2^4 */
+  carry = (s0 >> 28) + s01;
+  s02 += (s1 >> 28) + ((carry < s01)? 1 : 0);
+  s01 = (s1 << 4) + carry;
+  s02 += ((s01 < carry)? 1 : 0);
+  s00 += (s0 << 4);
+  carry = ((s00 < (s0 << 4))? 1 : 0);
+  s01 += carry;
+  s02 += ((s01 < carry)? 1 : 0);
+
+  /* W0 += S */
+  modp256k1_add (W0, W0, S);
+
+  /*
+   * Final conditional subtraction of the prime.  Both branches do one
+   * memcpy; when the subtraction borrowed, W0 is kept and the copy
+   * into tmp is a throw-away.
+   */
+  borrow = bn256_sub (tmp, W0, P256K1);
+  if (borrow)
+    memcpy (tmp, W0, sizeof (bn256));
+  else
+    memcpy (W0, tmp, sizeof (bn256));
+
+#undef W0
+#undef W1
+#undef W2
+#undef W3
+#undef W4
+#undef W5
+#undef W6
+#undef W7
+#undef S
+#undef s00
+#undef s01
+#undef s02
+#undef borrow
+}
+
+/**
+ * @brief  X = (A * B) mod p256k1
+ *
+ * Forms the full 512-bit product in a local buffer, then reduces it
+ * with modp256k1_reduce.
+ */
+void
+modp256k1_mul (bn256 *X, const bn256 *A, const bn256 *B)
+{
+  bn512 product[1];
+
+  bn256_mul (product, A, B);
+  modp256k1_reduce (X, product);
+}
+
+/**
+ * @brief  X = A * A mod p256k1
+ *
+ * Forms the full 512-bit square in a local buffer, then reduces it
+ * with modp256k1_reduce.
+ */
+void
+modp256k1_sqr (bn256 *X, const bn256 *A)
+{
+  bn512 square[1];
+
+  bn256_sqr (square, A);
+  modp256k1_reduce (X, square);
+}
+
+
+/**
+ * @brief  X = (A << shift) mod p256k1
+ * @note   shift < 32
+ *
+ * A negative SHIFT is passed to bn256_shift as is and nothing further
+ * is done.  Otherwise the value returned by bn256_shift is treated as
+ * the bits that left the top of the number; they are worth
+ * carry * 2^256, i.e. carry * (2^32 + 2^9 + 2^8 + 2^7 + 2^6 + 2^4 + 1)
+ * modulo p256k1.  That product is built term by term in the two low
+ * words of tmp and added to X.
+ */
+void
+modp256k1_shift (bn256 *X, const bn256 *A, int shift)
+{
+  uint32_t carry;
+  bn256 tmp[1];
+
+  carry = bn256_shift (X, A, shift);
+  if (shift < 0)
+    return;
+
+  memset (tmp, 0, sizeof (bn256));
+  /* Terms 1 and 2^9 in word 0; the 2^32 term is the "carry" in word 1.  */
+  tmp->word[0] = carry + (carry << 9);
+  tmp->word[1] = carry + (tmp->word[0] < (carry << 9)) + (carry >> 23);
+  /* + carry * 2^8 */
+  tmp->word[0] = tmp->word[0] + (carry << 8);
+  tmp->word[1] = tmp->word[1] + (tmp->word[0] < (carry << 8)) + (carry >> 24);
+  /* + carry * 2^7 */
+  tmp->word[0] = tmp->word[0] + (carry << 7);
+  tmp->word[1] = tmp->word[1] + (tmp->word[0] < (carry << 7)) + (carry >> 25);
+  /* + carry * 2^6 */
+  tmp->word[0] = tmp->word[0] + (carry << 6);
+  tmp->word[1] = tmp->word[1] + (tmp->word[0] < (carry << 6)) + (carry >> 26);
+  /* + carry * 2^4 */
+  tmp->word[0] = tmp->word[0] + (carry << 4);
+  tmp->word[1] = tmp->word[1] + (tmp->word[0] < (carry << 4)) + (carry >> 28);
+
+  modp256k1_add (X, X, tmp);
+}
--- a/modp256k1.h
+++ b/modp256k1.h
@ -0,0 +1,9 @@
+/* The modulus used by the modp256k1_* functions; defined elsewhere.  */
+extern const bn256 p256k1;
+/* Pointer form of the modulus, convenient for the bn256_* calls.  */
+#define P256K1 (&p256k1)
+
+/*
+ * Arithmetic modulo p256k1.  X is the output operand in every function.
+ * The definitions of add and sub are not part of this header; from the
+ * names they are presumably X = A + B and X = A - B mod p256k1 (confirm
+ * in modp256k1.c).
+ */
+void modp256k1_add (bn256 *X, const bn256 *A, const bn256 *B);
+void modp256k1_sub (bn256 *X, const bn256 *A, const bn256 *B);
+/* Reduce the double-width value A into X.  */
+void modp256k1_reduce (bn256 *X, const bn512 *A);
+/* X = (A * B) mod p256k1 */
+void modp256k1_mul (bn256 *X, const bn256 *A, const bn256 *B);
+/* X = (A * A) mod p256k1 */
+void modp256k1_sqr (bn256 *X, const bn256 *A);
+/* X = (A << shift) mod p256k1, shift < 32 */
+void modp256k1_shift (bn256 *X, const bn256 *A, int shift);
--- a/muladd_256.h
+++ b/muladd_256.h
@ -0,0 +1,50 @@
+/*
+ * MULADD_256_ASM(s_, d_, w_, c_) -- ARM inline assembly.
+ *
+ * Multiplies the eight 32-bit words at S_ by the 32-bit word W_ and
+ * adds the products into the eight words at D_, carrying from word to
+ * word.  The words are processed in groups of three, three and two
+ * (ldmia/stmia with write-back), so on exit both S_ and D_ have been
+ * advanced by eight words.  C_ receives the final carry word, i.e.
+ * the part of the result that did not fit into the eighth word of D_.
+ *
+ * S_, D_ and C_ must be lvalues (they are asm outputs); r4-r10 are
+ * used as scratch and declared clobbered, together with memory and
+ * the condition codes.
+ */
+#define MULADD_256_ASM(s_,d_,w_,c_)                      \
+ asm ( "ldmia  %[s]!, { r8, r9, r10 } \n\t"              \
+       "ldmia  %[d], { r5, r6, r7 }   \n\t"              \
+       "umull  r4, r8, %[w], r8       \n\t"              \
+       "adds   r5, r5, r4             \n\t"              \
+       "adcs   r6, r6, r8             \n\t"              \
+       "umull  r4, r8, %[w], r9       \n\t"              \
+       "adc    %[c], r8, #0           \n\t"              \
+       "adds   r6, r6, r4             \n\t"              \
+       "adcs   r7, r7, %[c]           \n\t"              \
+       "umull  r4, r8, %[w], r10      \n\t"              \
+       "adc    %[c], r8, #0           \n\t"              \
+       "adds   r7, r7, r4             \n\t"              \
+       "stmia  %[d]!, { r5, r6, r7 }  \n\t"              \
+       "ldmia  %[s]!, { r8, r9, r10 } \n\t"              \
+       "ldmia  %[d], { r5, r6, r7 }   \n\t"              \
+       "adcs   r5, r5, %[c]           \n\t"              \
+       "umull  r4, r8, %[w], r8       \n\t"              \
+       "adc    %[c], r8, #0           \n\t"              \
+       "adds   r5, r5, r4             \n\t"              \
+       "adcs   r6, r6, %[c]           \n\t"              \
+       "umull  r4, r8, %[w], r9       \n\t"              \
+       "adc    %[c], r8, #0           \n\t"              \
+       "adds   r6, r6, r4             \n\t"              \
+       "adcs   r7, r7, %[c]           \n\t"              \
+       "umull  r4, r8, %[w], r10      \n\t"              \
+       "adc    %[c], r8, #0           \n\t"              \
+       "adds   r7, r7, r4             \n\t"              \
+       "stmia  %[d]!, { r5, r6, r7 }  \n\t"              \
+       "ldmia  %[s]!, { r8, r9 }      \n\t"              \
+       "ldmia  %[d], { r5, r6 }       \n\t"              \
+       "adcs   r5, r5, %[c]           \n\t"              \
+       "umull  r4, r8, %[w], r8       \n\t"              \
+       "adc    %[c], r8, #0           \n\t"              \
+       "adds   r5, r5, r4             \n\t"              \
+       "adcs   r6, r6, %[c]           \n\t"              \
+       "umull  r4, r8, %[w], r9       \n\t"              \
+       "adc    %[c], r8, #0           \n\t"              \
+       "adds   r6, r6, r4             \n\t"              \
+       "adc    %[c], %[c], #0         \n\t"              \
+       "stmia  %[d]!, { r5, r6 }"                        \
+       : [s] "=&r" (s_), [d] "=&r" (d_), [c] "=&r" (c_)  \
+       : "[s]" (s_), "[d]" (d_), [w] "r" (w_)            \
+       : "r4", "r5", "r6", "r7", "r8", "r9", "r10",      \
+         "memory", "cc" )
+
+/*
+ * MULADD_256(s__, d__, w__, c__): run MULADD_256_ASM, then store the
+ * carry word C__ at *D__.  Because the asm has advanced D__ by eight
+ * words, this writes the word just past the eight updated ones.
+ */
+#define MULADD_256(s__,d__,w__,c__) do { \
+  MULADD_256_ASM(s__,d__,w__,c__);	 \
+  *d__ = c__;                            \
+} while (0)
--- a/openpgp-do.c
+++ b/openpgp-do.c
--- a/openpgp.c
+++ b/openpgp.c
--- a/p448.c
+++ b/p448.c
@ -0,0 +1,666 @@
+/*                                                    -*- coding: utf-8 -*-
+ * p448.c - Modular calculation with p448: 2^448 - 2^224 - 1
+ *
+ * Copyright (C) 2021  Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include "p448.h"
+
+#define MASK_28BITS 0x0fffffff
+
+/* Limb-wise X = A + B: no carry propagation and no modular reduction.  */
+static void
+p448_add_raw (p448_t *x, const p448_t *a, const p448_t *b)
+{
+  uint32_t *dst = x->limb;
+  const uint32_t *lhs = a->limb;
+  const uint32_t *rhs = b->limb;
+  int remaining = N_REDUNDANT_LIMBS;
+
+  while (remaining-- > 0)
+    *dst++ = *lhs++ + *rhs++;
+}
+
+/*
+ * Limb-wise X = A - B in 32-bit unsigned arithmetic: a limb of A
+ * smaller than the matching limb of B simply wraps around, no borrow
+ * is propagated.
+ */
+static void
+p448_sub_raw (p448_t *x, const p448_t *a, const p448_t *b)
+{
+  uint32_t *dst = x->limb;
+  const uint32_t *lhs = a->limb;
+  const uint32_t *rhs = b->limb;
+  int remaining = N_REDUNDANT_LIMBS;
+
+  while (remaining-- > 0)
+    *dst++ = *lhs++ - *rhs++;
+}
+
+/* Widening multiply: the full 64-bit product of two 32-bit operands.  */
+static uint64_t
+mul64_32x32 (const uint32_t a, const uint32_t b)
+{
+  const uint64_t wide_a = a;
+
+  return wide_a * b;
+}
+
+/**
+ * Compute X = A * B mod p448
+ */
+/*
+ * When we set phi = 2^224, p448 can be expressed as:
+ *
+ *     p448 = phi^2 - phi - 1
+ *
+ * Taking the right-hand side and forming the equation
+ *
+ *     phi^2 - phi - 1 = 0
+ *
+ * gives the equation whose solution is the golden ratio.
+ *
+ * By analogy, p448 is called a "golden-ratio prime".
+ *
+ * When we set phi = 2^224, Karatsuba multiplication goes like:
+ *
+ * (p + q * phi) * (r + s * phi)
+ * =  pr + (ps + qr)*phi + qs*phi^2
+ * == (pr + qs) + (ps + qr + qs) * phi (mod p448)
+ * = (pr + qs) + ((p + q)*(r + s) - pr) * phi
+ *
+ * That is, it can be done by three times of 224-bit multiplications
+ * (instead of four).
+ *
+ * Let us see more detail.
+ *
+ * The formula above is calculated to:
+ * = lower224(pr + qs) + upper224(pr + qs)*phi
+ *                     + lower224((p + q)*(r + s) - pr)*phi
+ *                     + upper224((p + q)*(r + s) - pr)*phi^2 (mod p448)
+ * == lower224(pr + qs)
+ *    + upper224((p + q)*(r + s) - pr)
+ *                      + (upper224(pr + qs)
+ *                         + lower224((p + q)*(r + s) - pr)
+ *                         + upper224((p + q)*(r + s) - pr))*phi (mod p448)
+ * = lower224(pr + qs)
+ *   + upper224((p + q)*(r + s) - pr)
+ *                      + (lower224((p + q)*(r + s) - pr)
+ *                         + upper224((p + q)*(r + s) + qs)) * phi
+ *
+ */
+/*
+
+Here is a figure of: multiplication by 8-limb * 8-limb
+
+                      a  b  c  d  e  f  g  h
+ *                    i  j  k  l  m  n  o  p
+---------------------------------------------
+                     ap bp cp dp ep fp gp hp
+                  ao bo co do eo fo go ho
+               an bn cn dn en fn gn hn
+            am bm cm dm em fm gm hm
+         al bl cl dl el fl gl hl
+      ak bk ck dk ek fk gk hk
+   aj bj cj dj ej fj gj hj
+ai bi ci di ei fi gi hi
+
+Considering lower224, it's:
+                     ap bp cp dp ep fp gp hp
+                     bo co do eo fo go ho
+                     cn dn en fn gn hn
+                     dm em fm gm hm
+                     el fl gl hl
+                     fk gk hk
+                     gj hj
+                     hi
+
+Considering upper224, it's:
+                                          ao
+                                       an bn
+                                    am bm cm
+                                 al bl cl dl
+                              ak bk ck dk ek
+                           aj bj cj dj ej fj
+                        ai bi ci di ei fi gi
+*/
+void
+p448_mul (p448_t *__restrict__ x, const p448_t *a, const p448_t *b)
+{
+  int i, j;
+  uint64_t v64_0, v64_1, v64_2;
+  uint32_t p_q[8], r_s[8];
+  uint32_t *px;
+  const uint32_t *pa, *pb;
+
+  /*
+   * X is __restrict__-qualified and must not overlap A or B: output
+   * limbs are stored inside the column loop while limbs of A and B
+   * are still being read by later iterations.
+   */
+  px = x->limb;
+  pa = a->limb;
+  pb = b->limb;
+
+  /*
+   * Firstly, we do Karatsuba preparation: p_q = p + q and r_s = r + s,
+   * limb by limb, where p/r are limbs 0..7 and q/s limbs 8..15 of A/B.
+   */
+  for (i = 0; i < 8; i++)
+    {
+      p_q[i] = pa[i] + pa[i+8];
+      r_s[i] = pb[i] + pb[i+8];
+    }
+
+  /*
+   * v64_0 and v64_1 are the running column sums for output limbs j and
+   * j+8.  After 28 bits are stored, the rest carries into column j+1.
+   */
+  v64_0 = v64_1 = 0;
+
+  for (j = 0; j < 8; j++)
+    {
+      /* v64_2 is a per-column scratch sum, reset for each half.  */
+      v64_2 = 0;
+
+      /* Compute lower half of limbs (lower224) */
+      /*  __  <-- j
+       * | /        |
+       * |/         v i
+       *
+       */
+      for (i = 0; i <= j; i++)
+	{
+	  v64_0 += mul64_32x32 (pa[8+j-i], pb[8+i]);/* accumulating q*s     */
+	  v64_1 += mul64_32x32 (p_q[j-i], r_s[i]);  /* accumulating p_q*r_s */
+	  v64_2 += mul64_32x32 (pa[j-i], pb[i]);    /* accumulating p*r     */
+	}
+
+      v64_0 += v64_2; /* Compute pr+qs.         */
+      v64_1 -= v64_2; /* Compute p_q*r_s - pr.  */
+
+      v64_2 = 0;
+
+      /* Compute upper half of limbs (upper224) */
+      /*     <-- j
+       *  /|        |
+       * /_|        v i
+       *
+       */
+      /* I continues from j+1, where the previous loop stopped.  */
+      for (; i < 8; i++)
+	{
+	  v64_0 -= mul64_32x32 (pa[8+j-i], pb[i]);   /* accumulating -p*r    */
+	  v64_1 += mul64_32x32 (pa[16+j-i], pb[8+i]);/* accumulating q*s     */
+	  v64_2 += mul64_32x32 (p_q[8+j-i], r_s[i]); /* accumulating p_q*r_s */
+	}
+
+      v64_0 += v64_2; /* Compute p_q*r_s - pr.  */
+      v64_1 += v64_2; /* Compute p_q*r_s + qs.  */
+
+      /* Store the low 28 bits of each column; keep the rest as carry.  */
+      px[j] = v64_0 & MASK_28BITS;
+      px[j+8] = v64_1 & MASK_28BITS;
+
+      v64_0 >>= 28;
+      v64_1 >>= 28;
+    }
+
+  /* "Carry" remains as: 2^448 * v64_1 + 2^224 * v64_0 */
+  /*
+   * Subtract p448 times v64_1 to clear msbs, meaning, clear those
+   * bits and adding v64_1 to px[0] and px[8] (in mod p448
+   * calculation).
+   */
+  v64_0 += v64_1;
+  v64_0 += px[8];
+  v64_1 += px[0];
+  px[8] = v64_0 & MASK_28BITS;
+  px[0] = v64_1 & MASK_28BITS;
+
+  /*
+   * Still, it carries to the next limbs.  px[9] and px[1] are not
+   * masked again here, so they may end up slightly above 28 bits.
+   */
+  v64_0 >>= 28;
+  v64_1 >>= 28;
+  px[9] += v64_0;
+  px[1] += v64_1;
+  /* DONE.  */
+}
+
+
+/**
+ * Compute X = A * 39081 mod p448
+ *
+ * Each limb is multiplied by the constant and the running 64-bit sum
+ * is split into a 28-bit limb and a carry.  The carry left after the
+ * top limb is added back at limb 0 and at limb 8, with one further
+ * carry step into limbs 1 and 9 (which are not masked again).
+ */
+void
+p448_mul_39081 (p448_t *x, const p448_t *a)
+{
+  static const int fold_at[2] = { 0, 8 };
+  uint64_t acc = 0;
+  uint32_t top;
+  int k;
+
+  for (k = 0; k < N_REDUNDANT_LIMBS; k++)
+    {
+      acc += mul64_32x32 (39081, a->limb[k]);
+      x->limb[k] = acc & MASK_28BITS;
+      acc >>= 28;
+    }
+
+  /* What is left is the carry out of limb 15, truncated to 32 bits.  */
+  top = acc;
+  for (k = 0; k < 2; k++)
+    {
+      const int at = fold_at[k];
+      const uint32_t sum = top + x->limb[at];
+
+      x->limb[at] = sum & MASK_28BITS;
+      x->limb[at + 1] += sum >> 28;
+    }
+}
+
+/*
+                   ah bh ch dh eh fh gh HH
+                   bg cg dg eg fg GG
+                   cf df ef FF
+                   de EE
+                                        DD
+                                  CC dc ec
+                            BB cb db eb fb
+                      AA ba ca da ea fa ga
+
+ */
+/**
+ * Compute X = A^2 mod p448
+ *
+ * Same column scheme as p448_mul, specialised for B = A: each product
+ * of two different limbs occurs twice, so it is computed once and
+ * doubled, while a limb multiplied by itself is added once.
+ * X is __restrict__-qualified and must not overlap A.
+ */
+void
+p448_sqr (p448_t *__restrict__ x, const p448_t *a)
+{
+  int i, j;
+  uint64_t v64_0, v64_1, v64_2, v64_3;
+  uint32_t p_q[8];
+  uint32_t *px;
+  const uint32_t *pa;
+
+  px = x->limb;
+  pa = a->limb;
+
+  /* Firstly, we do Karatsuba preparation: p_q = p + q, limb by limb.  */
+  for (i = 0; i < 8; i++)
+    p_q[i] = pa[i] + pa[i+8];
+
+  /* Running column sums for output limbs j (v64_0) and j+8 (v64_1).  */
+  v64_0 = v64_1 = 0;
+
+  for (j = 0; j < 8; j++)
+    {
+      v64_2 = 0;
+
+      /* Compute lower half of limbs (lower224) */
+      /*  __  <-- j
+       * | /        |
+       * |/         v i
+       *
+       */
+      for (i = 0; i <= j/2; i++)
+	{
+	  /*
+	   * COND is 1 (double the product) for a pair of different
+	   * limbs, and 0 for the single term with i == j-i, which
+	   * exists only when j is even.
+	   */
+	  int cond = ((j & 1) || i != j/2);
+
+	  v64_3 = mul64_32x32 (pa[8+j-i], pa[8+i]);/* accumulating q*q     */
+	  v64_0 += (v64_3 << cond);
+	  v64_3 = mul64_32x32 (p_q[j-i], p_q[i]);  /* accumulating p_q^2   */
+	  v64_1 += (v64_3 << cond);
+	  v64_3 = mul64_32x32 (pa[j-i], pa[i]);    /* accumulating p*p     */
+	  v64_2 += (v64_3 << cond);
+	}
+
+      v64_0 += v64_2; /* Compute pp+qq.         */
+      v64_1 -= v64_2; /* Compute p_q^2 - pp.  */
+
+      v64_2 = 0;
+      /* Compute upper half of limbs (upper224) */
+      /*     <-- j
+       *  /|        |
+       * /_|        v i
+       *
+       */
+      /*
+       * Here i == j/2 + 1.  For even j the upper half has one term
+       * where a limb meets itself (index 4 + j/2 == 4+i-1); it is
+       * added once, not doubled.
+       */
+      if (!(j & 1))
+	{
+	  v64_0 -= mul64_32x32 (pa[4+i-1], pa[4+i-1]);	/* accumulating -p*p  */
+	  v64_1 += mul64_32x32 (pa[12+i-1], pa[12+i-1]);/* accumulating q*q   */
+	  v64_2 += mul64_32x32 (p_q[4+i-1], p_q[4+i-1]);/* accumulating p_q^2 */
+	}
+
+      /* Remaining upper-half pairs, each counted twice.  */
+      for (; i < 4; i++)
+	{
+	  v64_3 = mul64_32x32 (pa[4+j-i], pa[4+i]);
+	  v64_0 -= (v64_3 << 1);   /* accumulating -p*p	   */
+	  v64_3 = mul64_32x32 (pa[12+j-i], pa[12+i]);
+	  v64_1 += (v64_3 << 1);   /* accumulating q*q	   */
+	  v64_3 = mul64_32x32 (p_q[4+j-i], p_q[4+i]);
+	  v64_2 += (v64_3 << 1);   /* accumulating p_q^2   */
+	}
+
+      v64_0 += v64_2; /* Compute p_q^2 - p^2.  */
+      v64_1 += v64_2; /* Compute p_q^2 + q^2.  */
+
+      /* Store the low 28 bits of each column; keep the rest as carry.  */
+      px[j] = v64_0 & MASK_28BITS;
+      px[j+8] = v64_1 & MASK_28BITS;
+
+      v64_0 >>= 28;
+      v64_1 >>= 28;
+    }
+
+  /* "Carry" remains as: 2^448 * v64_1 + 2^224 * v64_0 */
+  /*
+   * Subtract p448 times v64_1 to clear msbs, meaning, clear those
+   * bits and adding v64_1 to px[0] and px[8] (in mod p448
+   * calculation).
+   */
+  v64_0 += v64_1;
+  v64_0 += px[8];
+  v64_1 += px[0];
+  px[8] = v64_0 & MASK_28BITS;
+  px[0] = v64_1 & MASK_28BITS;
+
+  /* Still, it carries to px[9] and px[1], which are not masked again.  */
+  v64_0 >>= 28;
+  v64_1 >>= 28;
+  px[9] += v64_0;
+  px[1] += v64_1;
+  /* DONE.  */
+}
+
+/**
+ * Weak reduce - make each limb of the redundant representation smaller.
+ * A best-effort pass: the bits above bit 27 of every limb are moved
+ * into the next higher limb, without propagating any further.
+ *
+ * Note that: p448 = 2^448 - 2^224 - 1
+ *
+ * The excess of the top limb has weight 2^448; subtracting p448 turns
+ * it into 2^224 + 1, so it is added to limb 8 (28 * 8 = 224) and to
+ * limb 0.
+ */
+void
+p448_weak_reduce (p448_t *a)
+{
+  uint32_t *limb = a->limb;
+  const uint32_t excess = limb[N_REDUNDANT_LIMBS - 1] >> 28;
+  int k;
+
+  limb[8] += excess;
+
+  /*
+   * Walk from the top limb down, so that limb[k-1] still holds its
+   * unmasked value when its upper bits are taken.
+   */
+  for (k = N_REDUNDANT_LIMBS - 1; k > 0; k--)
+    limb[k] = (limb[k] & MASK_28BITS) + (limb[k - 1] >> 28);
+
+  limb[0] = (limb[0] & MASK_28BITS) + excess;
+}
+
+/*
+ * The prime p448 = 2^448 - 2^224 - 1 in limb form: every limb is the
+ * 28-bit maximum 0x0fffffff except limb 8 (weight 2^224), which is one
+ * less.  Declared as a one-element array so the name can be passed
+ * where a pointer is expected.
+ */
+static const p448_t p448[1] = {
+ {
+   {
+    0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff,
+    0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff,
+    0x0ffffffe, 0x0fffffff, 0x0fffffff, 0x0fffffff,
+    0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
+   }
+ }
+};
+
+
+/*
+ * X = A + (COND ? B : 0), limb by limb with carry propagation.
+ *
+ * COND must be 0 or 1.  It is widened to an all-zero or all-one 28-bit
+ * mask that is ANDed with every limb of B, so there is no branch on
+ * COND.  Each limb stored in X is confined to 28 bits.  X may be the
+ * same object as A: every limb is read before it is written.
+ *
+ * Precondition: the limbs of A are below 2^31 (in p448_strong_reduce
+ * they are below 2^28, as left by p448_sub_borrow), so the 32-bit sum
+ * of a limb, the masked limb of B and the carry cannot overflow.
+ *
+ * The carry is taken from bit 28 and up of the full sum.  Deriving it
+ * from "masked result < addend" misses the carry when a limb of A plus
+ * the incoming carry is exactly 2^28 (limb 0x0fffffff with carry 1):
+ * the masked result then equals the addend and the carry is dropped.
+ *
+ * Returns the carry out of the top limb.
+ */
+static uint32_t
+p448_add_carry_cond (p448_t *x, const p448_t *a, const p448_t *b,
+                     uint32_t cond)
+{
+  int i;
+  uint32_t carry = 0;
+  const uint32_t select = cond * MASK_28BITS;
+
+  for (i = 0; i < N_REDUNDANT_LIMBS; i++)
+    {
+      const uint32_t sum = a->limb[i] + carry + (b->limb[i] & select);
+
+      x->limb[i] = sum & MASK_28BITS;
+      carry = sum >> 28;
+    }
+
+  return carry;
+}
+
+
+/*
+ * X = A - B, limb by limb with borrow propagation.  Each limb stored
+ * in X is confined to 28 bits.  Returns the borrow out of the top limb.
+ * Every limb is read before the matching limb of X is written.
+ */
+static uint32_t
+p448_sub_borrow (p448_t *x, const p448_t *a, const p448_t *b)
+{
+  uint32_t borrow = 0;
+  int k;
+
+  for (k = 0; k < N_REDUNDANT_LIMBS; k++)
+    {
+      const uint32_t minuend = a->limb[k];
+      const uint32_t subtrahend = b->limb[k];
+      /* Taking the incoming borrow may itself wrap below zero.  */
+      const uint32_t wrapped = (minuend < borrow);
+      const uint32_t partial = minuend - borrow;
+
+      borrow = (partial < subtrahend) + wrapped;
+      x->limb[k] = (partial - subtrahend) & MASK_28BITS;
+    }
+
+  return borrow;
+}
+
+/**
+ * Strong reduce - bring every limb of the redundant representation
+ * down to 28 bits (top 4 bits zero).
+ *
+ * First the bits above bit 27 of the top limb are removed.  As with
+ * weak reduce, that removes a multiple of 2^448, which is put back as
+ * the same multiple of 2^224 + 1 (limb 8 and limb 0).
+ *
+ * Then p448 is subtracted once.  If that borrows, the value was below
+ * p448 and p448 is added back; the add always runs, with a zero
+ * addend when there was no borrow.
+ *
+ * NOTE(review): this assumes the value is below 2*p448 at that point,
+ * as the original author states -- confirm callers pass weakly
+ * reduced input.
+ */
+void
+p448_strong_reduce (p448_t *a)
+{
+  const uint32_t excess = a->limb[15] >> 28;
+  uint32_t went_negative;
+
+  a->limb[15] &= MASK_28BITS;
+  a->limb[0] += excess;
+  a->limb[8] += excess;
+
+  /* Subtract p448 ...  */
+  went_negative = p448_sub_borrow (a, a, p448);
+
+  /* ... and undo it when the result went negative.  */
+  (void) p448_add_carry_cond (a, a, p448, went_negative);
+}
+
+/**
+ * Convert to wire-format from internal redundant representation.
+ *
+ * A copy of X is strongly reduced, then every pair of limbs is written
+ * as 7 bytes, least significant byte first, starting with limbs 0 and 1.
+ * X itself is left untouched.
+ */
+void
+p448_serialize (uint8_t serial[56], const struct p448_t *x)
+{
+  p448_t reduced[1];
+  uint8_t *out = serial;
+  int pair, k;
+
+  reduced[0] = *x;
+  p448_strong_reduce (reduced);
+
+  for (pair = 0; pair < 8; pair++)
+    {
+      /* Low limb in bits 0..27, high limb from bit 28 upwards.  */
+      uint64_t group = reduced->limb[2*pair] & MASK_28BITS;
+
+      group |= (uint64_t)reduced->limb[2*pair+1] << 28;
+
+      for (k = 0; k < 7; k++)
+        {
+          *out++ = (uint8_t)group;
+          group >>= 8;
+        }
+    }
+}
+
+/**
+ * Convert from wire-format to internal redundant representation.
+ *
+ * Every group of 7 bytes (least significant byte first) is split into
+ * two 28-bit limbs.  Groups are handled from the last one to the
+ * first, filling limbs 15 down to 0.
+ */
+void
+p448_deserialize (p448_t *x, const uint8_t serial[56])
+{
+  int pair, k;
+
+  for (pair = 7; pair >= 0; pair--)
+    {
+      const uint8_t *in = serial + 7 * pair;
+      uint64_t group = 0;
+
+      /* Assemble the 56-bit group, most significant byte first.  */
+      for (k = 6; k >= 0; k--)
+        group = (group << 8) | in[k];
+
+      x->limb[2*pair+1] = (uint32_t)(group >> 28);
+      x->limb[2*pair] = (uint32_t)group & MASK_28BITS;
+    }
+}
+
+
+/*
+ * X = A^(2^N) mod p448, that is, A squared N times in a row.
+ *
+ * p448_sqr cannot work in place, so squarings are done in pairs that
+ * ping-pong between X and a temporary.  An odd N is made even by one
+ * initial squaring straight into X.
+ *
+ * For N <= 0 no squaring is done and X is set to A; without that guard
+ * the pair loop would start from a negative count and never reach zero.
+ * X must not overlap A.
+ */
+static void
+p448_sqrn (p448_t *__restrict__ x, const p448_t *a, int n)
+{
+  p448_t tmp[1];
+
+  if (n <= 0)
+    {
+      /* A^(2^0) = A */
+      *x = *a;
+      return;
+    }
+
+  if ((n&1))
+    {
+      p448_sqr (x, a);
+      n--;
+    }
+  else
+    {
+      p448_sqr (tmp, a);
+      p448_sqr (x, tmp);
+      n -= 2;
+    }
+
+  /* N is even here; consume it two squarings at a time.  */
+  for (; n > 0; n -= 2)
+    {
+      p448_sqr (tmp, x);
+      p448_sqr (x, tmp);
+    }
+}
+
+/**
+ * Compute X = A^(-1) mod p448 (if A=0, return X = 0)
+ *
+ * Internally, do A^(p448 - 2) to get A^(-1).
+ *
+ * The exponentiation is a fixed sequence of squarings and
+ * multiplications that does not depend on the value of A.  X is used
+ * as scratch from the first step on while A is still read later, so X
+ * must not overlap A (it is declared __restrict__).
+ */
+void
+p448_inv (p448_t *__restrict__ x, const p448_t *a)
+{
+  p448_t  t[1],  u[1];
+
+  /*
+   * Bit pattern of p448-2: 1{223} 0 1{222}01
+   *
+   * 222-bit can be composed by 3-bit three times to get 9-bit, 9-bit
+   * two times to get 18-bit, 18-bit two times plus 1-bit to get 37-bit.
+   * 37-bit three times to get 111-bit, and lastly 111-bit two times.
+   *   222 = 111*2 = 37*3*2 = (18*2+1)*3*2 = (9*2*2+1)*3*2 = (3*3*2*2+1)*3*2
+   *
+   * The comment on each line below is the exponent of A held by the
+   * output of that step, in binary; 1{k} means k one bits in a row.
+   */
+  p448_sqr  ( x, a      );  /*        10 */
+  p448_mul  ( t, a, x   );  /*        11 */
+  p448_sqr  ( x, t      );  /*       110 */
+  p448_mul  ( t, a, x   );  /*       111 */
+  p448_sqrn ( x, t, 3   );  /*    111000 */
+  p448_mul  ( u, t, x   );  /*    111111 */
+  p448_sqrn ( x, u, 3   );  /* 111111000 */
+  p448_mul  ( u, t, x   );  /* 111111111 */
+  p448_sqrn ( t, u, 9   );  /* 1{9} 0{9}         */
+  p448_mul  ( x, u, t   );  /* 1{18}             */
+  p448_sqr  ( t, x      );  /* 1{18} 0           */
+  p448_mul  ( u, a, t   );  /* 1{19}             */
+  p448_sqrn ( t, u, 18  );  /* 1{19} 0{18}       */
+  p448_mul  ( u, x, t   );  /* 1{37}             */
+  p448_sqrn ( t, u, 37  );  /* 1{37} 0{37}       */
+  p448_mul  ( x, u, t   );  /* 1{74}             */
+  p448_sqrn ( t, x, 37  );  /* 1{74} 0{37}       */
+  p448_mul  ( x, u, t   );  /* 1{111}            */
+  p448_sqrn ( t, x, 111 );  /* 1{111} 0{111}     */
+  p448_mul  ( u, x, t   );  /* 1{222}            */
+  p448_sqr  ( t, u      );  /* 1{222} 0          */
+  p448_mul  ( x, a, t   );  /* 1{223}            */
+  p448_sqrn ( u, x, 224 );  /* 1{223} 0{224}     */
+  p448_mul  ( x, u, t   );  /* 1{223} 0 1{222}0  */
+  p448_sqr  ( t, x      );  /* 1{223} 0 1{222}00 */
+  p448_mul  ( x, a,  t  );  /* 1{223} 0 1{222}01 */
+}
+
+/*
+ * Twice p448, limb by limb: each limb is double the matching limb of
+ * the p448 table (0x1ffffffe, and 0x1ffffffc for limb 8).  p448_sub
+ * adds it to a raw limb-wise difference before weak reduction.
+ */
+static const p448_t p448_times_2[1] = {
+  {
+    {
+      0x1ffffffe, 0x1ffffffe, 0x1ffffffe, 0x1ffffffe,
+      0x1ffffffe, 0x1ffffffe, 0x1ffffffe, 0x1ffffffe,
+      0x1ffffffc, 0x1ffffffe, 0x1ffffffe, 0x1ffffffe,
+      0x1ffffffe, 0x1ffffffe, 0x1ffffffe, 0x1ffffffe
+    }
+  }
+};
+
+/**
+ * Compute X = A + B mod p448, result is weakly reduced.
+ *
+ * The limbs are added pairwise without carries, then one weak
+ * reduction pass is applied to X.
+ */
+void
+p448_add (p448_t *x, const p448_t *a, const p448_t *b)
+{
+  int k;
+
+  for (k = 0; k < N_REDUNDANT_LIMBS; k++)
+    x->limb[k] = a->limb[k] + b->limb[k];
+
+  p448_weak_reduce (x);
+}
+
+/**
+ * Compute X = A - B mod p448, result is weakly reduced.
+ *
+ * Each limb of the raw difference is offset by the matching limb of
+ * 2 * p448 before one weak reduction pass is applied to X.
+ */
+void
+p448_sub (p448_t *x, const p448_t *a, const p448_t *b)
+{
+  int k;
+
+  for (k = 0; k < N_REDUNDANT_LIMBS; k++)
+    x->limb[k] = p448_times_2->limb[k] + (a->limb[k] - b->limb[k]);
+
+  p448_weak_reduce (x);
+}
--- a/p448.h
+++ b/p448.h
@ -0,0 +1,15 @@
+#ifndef P448_H
+#define P448_H
+
+/* uint8_t and uint32_t are used below; do not rely on the includer.  */
+#include <stdint.h>
+
+/* Number of 32-bit limbs in the redundant representation.  */
+#define N_REDUNDANT_LIMBS 16
+
+/*
+ * A value modulo p448 = 2^448 - 2^224 - 1.  Each limb nominally holds
+ * 28 bits (16 * 28 = 448), limb 0 being the least significant; the
+ * spare upper bits of a limb may be in use until the value is reduced.
+ */
+typedef struct p448_t
+{
+  uint32_t limb[N_REDUNDANT_LIMBS];
+} p448_t;
+
+/* X = A + B mod p448, weakly reduced.  */
+void p448_add (p448_t *x, const p448_t *a, const p448_t *b);
+/* X = A - B mod p448, weakly reduced.  */
+void p448_sub (p448_t *x, const p448_t *a, const p448_t *b);
+/* X = A * B mod p448; X must not overlap A or B.  */
+void p448_mul (p448_t *__restrict__ x, const p448_t *a, const p448_t *b);
+/* X = A * 39081 mod p448.  */
+void p448_mul_39081 (p448_t *x, const p448_t *a);
+/* X = A^2 mod p448; X must not overlap A.  */
+void p448_sqr (p448_t *__restrict__ x, const p448_t *a);
+/* X = A^(-1) mod p448 (X = 0 when A = 0); X must not overlap A.  */
+void p448_inv (p448_t *__restrict__ x, const p448_t *a);
+/* Write X as 56 bytes, least significant byte first, strongly reduced.  */
+void p448_serialize (uint8_t serial[56], const p448_t *x);
+/* Read X from 56 bytes, least significant byte first.  */
+void p448_deserialize (p448_t *x, const uint8_t serial[56]);
+/* Reduce A in place so that every limb fits in 28 bits.  */
+void p448_strong_reduce (p448_t *a);
+
+#endif /* P448_H */
--- a/polarssl/aes.h
+++ b/polarssl/aes.h
@ -0,0 +1,204 @@
+/**
+ * \file aes.h
+ *
+ * \brief AES block cipher
+ *
+ *  Copyright (C) 2006-2013, Brainspark B.V.
+ *
+ *  This file is part of PolarSSL (http://www.polarssl.org)
+ *  Lead Maintainer: Paul Bakker <polarssl_maintainer at polarssl.org>
+ *
+ *  All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef POLARSSL_AES_H
+#define POLARSSL_AES_H
+
+#include "config.h"
+
+#include <string.h>
+
+#ifdef _MSC_VER
+#include <basetsd.h>
+typedef UINT32 uint32_t;
+#else
+#include <inttypes.h>
+#endif
+
+#define AES_ENCRYPT     1
+#define AES_DECRYPT     0
+
+#define POLARSSL_ERR_AES_INVALID_KEY_LENGTH                -0x0020  /**< Invalid key length. */
+#define POLARSSL_ERR_AES_INVALID_INPUT_LENGTH              -0x0022  /**< Invalid data input length. */
+
+#if !defined(POLARSSL_AES_ALT)
+// Regular implementation
+//
+
+/**
+ * \brief          AES context structure
+ */
+typedef struct
+{
+    int nr;                     /*!<  number of rounds  */
+    uint32_t *rk;               /*!<  AES round keys    */
+    uint32_t buf[68];           /*!<  unaligned data    */
+}
+aes_context;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief          AES key schedule (encryption)
+ *
+ * \param ctx      AES context to be initialized
+ * \param key      encryption key
+ * \param keysize  must be 128, 192 or 256
+ *
+ * \return         0 if successful, or POLARSSL_ERR_AES_INVALID_KEY_LENGTH
+ */
+int aes_setkey_enc( aes_context *ctx, const unsigned char *key, unsigned int keysize );
+
+/**
+ * \brief          AES key schedule (decryption)
+ *
+ * \param ctx      AES context to be initialized
+ * \param key      decryption key
+ * \param keysize  must be 128, 192 or 256
+ *
+ * \return         0 if successful, or POLARSSL_ERR_AES_INVALID_KEY_LENGTH
+ */
+int aes_setkey_dec( aes_context *ctx, const unsigned char *key, unsigned int keysize );
+
+/**
+ * \brief          AES-ECB block encryption/decryption
+ *
+ * \param ctx      AES context
+ * \param mode     AES_ENCRYPT or AES_DECRYPT
+ * \param input    16-byte input block
+ * \param output   16-byte output block
+ *
+ * \return         0 if successful
+ */
+int aes_crypt_ecb( aes_context *ctx,
+                    int mode,
+                    const unsigned char input[16],
+                    unsigned char output[16] );
+
+#if 0
+/**
+ * \brief          AES-CBC buffer encryption/decryption
+ *                 Length should be a multiple of the block
+ *                 size (16 bytes)
+ *
+ * \param ctx      AES context
+ * \param mode     AES_ENCRYPT or AES_DECRYPT
+ * \param length   length of the input data
+ * \param iv       initialization vector (updated after use)
+ * \param input    buffer holding the input data
+ * \param output   buffer holding the output data
+ *
+ * \return         0 if successful, or POLARSSL_ERR_AES_INVALID_INPUT_LENGTH
+ */
+int aes_crypt_cbc( aes_context *ctx,
+                    int mode,
+                    size_t length,
+                    unsigned char iv[16],
+                    const unsigned char *input,
+                    unsigned char *output );
+#endif
+
+/**
+ * \brief          AES-CFB128 buffer encryption/decryption.
+ *
+ * Note: Due to the nature of CFB you should use the same key schedule for
+ * both encryption and decryption. So use a context initialized with
+ * aes_setkey_enc() for both AES_ENCRYPT and AES_DECRYPT.
+ *
+ * \param ctx      AES context
+ * \param mode     AES_ENCRYPT or AES_DECRYPT
+ * \param length   length of the input data
+ * \param iv_off   offset in IV (updated after use)
+ * \param iv       initialization vector (updated after use)
+ * \param input    buffer holding the input data
+ * \param output   buffer holding the output data
+ *
+ * \return         0 if successful
+ */
+int aes_crypt_cfb128( aes_context *ctx,
+                       int mode,
+                       size_t length,
+                       size_t *iv_off,
+                       unsigned char iv[16],
+                       const unsigned char *input,
+                       unsigned char *output );
+
+/**
+ * \brief               AES-CTR buffer encryption/decryption
+ *
+ * Warning: You have to keep the maximum use of your counter in mind!
+ *
+ * Note: Due to the nature of CTR you should use the same key schedule for
+ * both encryption and decryption. So use a context initialized with
+ * aes_setkey_enc() for both AES_ENCRYPT and AES_DECRYPT.
+ *
+ * \param ctx           AES context
+ * \param length        The length of the data
+ * \param nc_off        The offset in the current stream_block (for resuming
+ *                      within current cipher stream). The offset pointed to
+ *                      should be 0 at the start of a stream.
+ * \param nonce_counter The 128-bit nonce and counter.
+ * \param stream_block  The saved stream-block for resuming. Is overwritten
+ *                      by the function.
+ * \param input         The input data stream
+ * \param output        The output data stream
+ *
+ * \return         0 if successful
+ */
+int aes_crypt_ctr( aes_context *ctx,
+                       size_t length,
+                       size_t *nc_off,
+                       unsigned char nonce_counter[16],
+                       unsigned char stream_block[16],
+                       const unsigned char *input,
+                       unsigned char *output );
+
+#ifdef __cplusplus
+}
+#endif
+
+#else  /* POLARSSL_AES_ALT */
+#include "aes_alt.h"
+#endif /* POLARSSL_AES_ALT */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief          Checkup routine
+ *
+ * \return         0 if successful, or 1 if the test failed
+ */
+int aes_self_test( int verbose );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* aes.h */
--- a/polarssl/bignum.h
+++ b/polarssl/bignum.h
@ -0,0 +1,687 @@
+/**
+ * \file bignum.h
+ *
+ * \brief  Multi-precision integer library
+ *
+ *  Copyright (C) 2006-2013, Brainspark B.V.
+ *
+ *  This file is part of PolarSSL (http://www.polarssl.org)
+ *  Lead Maintainer: Paul Bakker <polarssl_maintainer at polarssl.org>
+ *
+ *  All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef POLARSSL_BIGNUM_H
+#define POLARSSL_BIGNUM_H
+
+#include <stdio.h>
+#include <string.h>
+
+#include "config.h"
+
+/*
+ * Fixed-width integer types: MSVC builds typedef them by hand (from the
+ * <basetsd.h> INTnn/UINTnn types, or from short for the 16-bit types when
+ * _MSC_VER <= 1200); every other compiler gets them from <inttypes.h>.
+ */
+#ifdef _MSC_VER
+#include <basetsd.h>
+#if (_MSC_VER <= 1200)
+typedef   signed short  int16_t;
+typedef unsigned short uint16_t;
+#else
+typedef  INT16  int16_t;
+typedef UINT16 uint16_t;
+#endif
+typedef  INT32  int32_t;
+typedef  INT64  int64_t;
+typedef UINT32 uint32_t;
+typedef UINT64 uint64_t;
+#else
+#include <inttypes.h>
+#endif
+
+#define POLARSSL_ERR_MPI_FILE_IO_ERROR                     -0x0002  /**< An error occurred while reading from or writing to a file. */
+#define POLARSSL_ERR_MPI_BAD_INPUT_DATA                    -0x0004  /**< Bad input parameters to function. */
+#define POLARSSL_ERR_MPI_INVALID_CHARACTER                 -0x0006  /**< There is an invalid character in the digit string. */
+#define POLARSSL_ERR_MPI_BUFFER_TOO_SMALL                  -0x0008  /**< The buffer is too small to write to. */
+#define POLARSSL_ERR_MPI_NEGATIVE_VALUE                    -0x000A  /**< The input arguments are negative or result in illegal output. */
+#define POLARSSL_ERR_MPI_DIVISION_BY_ZERO                  -0x000C  /**< The input argument for division is zero, which is not allowed. */
+#define POLARSSL_ERR_MPI_NOT_ACCEPTABLE                    -0x000E  /**< The input arguments are not acceptable. */
+#define POLARSSL_ERR_MPI_MALLOC_FAILED                     -0x0010  /**< Memory allocation failed. */
+
+#define MPI_CHK(f) if( ( ret = f ) != 0 ) goto cleanup
+
+/*
+ * Maximum size MPIs are allowed to grow to in number of limbs.
+ */
+#define POLARSSL_MPI_MAX_LIMBS                             10000
+
+#if !defined(POLARSSL_CONFIG_OPTIONS)
+/*
+ * Maximum window size used for modular exponentiation. Default: 6
+ * Minimum value: 1. Maximum value: 6.
+ *
+ * Result is an array of ( 2 << POLARSSL_MPI_WINDOW_SIZE ) MPIs used
+ * for the sliding window calculation. (So 64 by default)
+ *
+ * Reduction in size, reduces speed.
+ */
+#define POLARSSL_MPI_WINDOW_SIZE                           6        /**< Maximum windows size used. */
+
+/*
+ * Maximum size of MPIs allowed in bits and bytes for user-MPIs.
+ * ( Value set below: 256 bytes => 2048 bits, Maximum tested: 2048 bytes => 16384 bits )
+ *
+ * Note: Calculations can temporarily result in larger MPIs. So the number
+ * of limbs required (POLARSSL_MPI_MAX_LIMBS) is higher.
+ */
+#define POLARSSL_MPI_MAX_SIZE                              256      /**< Maximum number of bytes for usable MPIs. */
+
+#endif /* !POLARSSL_CONFIG_OPTIONS */
+
+#define POLARSSL_MPI_MAX_BITS                              ( 8 * POLARSSL_MPI_MAX_SIZE )    /**< Maximum number of bits for usable MPIs. */
+
+/*
+ * When reading from files with mpi_read_file() and writing to files with
+ * mpi_write_file() the buffer should have space
+ * for a (short) label, the MPI (in the provided radix), the newline
+ * characters and the '\0'.
+ *
+ * By default we assume at least a 10 char label, a minimum radix of 10
+ * (decimal) and a maximum of 4096 bit numbers (1234 decimal chars).
+ * Autosized at compile time for at least a 10 char label, a minimum radix
+ * of 10 (decimal) for a number of POLARSSL_MPI_MAX_BITS size.
+ *
+ * This used to be statically sized to 1250 for a maximum of 4096 bit
+ * numbers (1234 decimal chars).
+ *
+ * Calculate using the formula:
+ *  POLARSSL_MPI_RW_BUFFER_SIZE = ceil(POLARSSL_MPI_MAX_BITS / ln(10) * ln(2)) +
+ *                                LabelSize + 6
+ */
+/*
+ * Integer-only evaluation of the formula above, scaled by 100.
+ * Despite its name, LN_2_DIV_LN_10_SCALE100 (332) is about
+ * 100 * ln(10) / ln(2); dividing the scaled bit count by it (rounding up)
+ * therefore yields MAX_BITS * ln(2) / ln(10) decimal digits.  The trailing
+ * "+ 10 + 6" is the label size plus the fixed 6 from the formula.
+ */
+#define POLARSSL_MPI_MAX_BITS_SCALE100          ( 100 * POLARSSL_MPI_MAX_BITS )
+#define LN_2_DIV_LN_10_SCALE100                 332
+#define POLARSSL_MPI_RW_BUFFER_SIZE             ( ((POLARSSL_MPI_MAX_BITS_SCALE100 + LN_2_DIV_LN_10_SCALE100 - 1) / LN_2_DIV_LN_10_SCALE100) + 10 + 6 )
+
+/*
+ * Define the base integer type, architecture-wise
+ *
+ * t_sint / t_uint are the signed and unsigned limb types (the mpi structure
+ * stores its value as an array of t_uint).  t_udbl, where it is provided,
+ * is the double-width unsigned type; POLARSSL_HAVE_UDBL is defined in
+ * exactly those branches that define t_udbl.
+ */
+#if defined(POLARSSL_HAVE_INT8)
+/* 8-bit limbs, 16-bit double-width type */
+typedef   signed char  t_sint;
+typedef unsigned char  t_uint;
+typedef uint16_t       t_udbl;
+#define POLARSSL_HAVE_UDBL
+#else
+#if defined(POLARSSL_HAVE_INT16)
+/* 16-bit limbs, 32-bit double-width type */
+typedef  int16_t t_sint;
+typedef uint16_t t_uint;
+typedef uint32_t t_udbl;
+#define POLARSSL_HAVE_UDBL
+#else
+  #if ( defined(_MSC_VER) && defined(_M_AMD64) )
+    /* MSVC on AMD64: 64-bit limbs; no t_udbl is defined in this branch */
+    typedef  int64_t t_sint;
+    typedef uint64_t t_uint;
+  #else
+    #if ( defined(__GNUC__) && (                          \
+          defined(__amd64__) || defined(__x86_64__)    || \
+          defined(__ppc64__) || defined(__powerpc64__) || \
+          defined(__ia64__)  || defined(__alpha__)     || \
+          (defined(__sparc__) && defined(__arch64__))  || \
+          defined(__s390x__) ) )
+       /* GCC on the listed 64-bit targets: 64-bit limbs, mode(TI) t_udbl */
+       typedef  int64_t t_sint;
+       typedef uint64_t t_uint;
+       typedef unsigned int t_udbl __attribute__((mode(TI)));
+       #define POLARSSL_HAVE_UDBL
+    #else
+       /* Everything else: 32-bit limbs */
+       typedef  int32_t t_sint;
+       typedef uint32_t t_uint;
+       #if ( defined(_MSC_VER) && defined(_M_IX86) )
+         typedef uint64_t t_udbl;
+         #define POLARSSL_HAVE_UDBL
+       #else
+         /* t_udbl only exists here when the build enables POLARSSL_HAVE_LONGLONG */
+         #if defined( POLARSSL_HAVE_LONGLONG )
+           typedef unsigned long long t_udbl;
+           #define POLARSSL_HAVE_UDBL
+         #endif
+       #endif
+    #endif
+  #endif
+#endif /* POLARSSL_HAVE_INT16 */
+#endif /* POLARSSL_HAVE_INT8  */
+
+/**
+ * \brief          MPI structure
+ */
+typedef struct
+{
+    int s;              /*!<  integer sign      */
+    size_t n;           /*!<  total # of limbs  */
+    t_uint *p;          /*!<  pointer to limbs  */
+}
+mpi;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief           Initialize one MPI
+ *
+ * \param X         One MPI to initialize.
+ */
+void mpi_init( mpi *X );
+
+/**
+ * \brief          Unallocate one MPI
+ *
+ * \param X        One MPI to unallocate.
+ */
+void mpi_free( mpi *X );
+
+/**
+ * \brief          Enlarge to the specified number of limbs
+ *
+ * \param X        MPI to grow
+ * \param nblimbs  The target number of limbs
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_grow( mpi *X, size_t nblimbs );
+
+/**
+ * \brief          Copy the contents of Y into X
+ *
+ * \param X        Destination MPI
+ * \param Y        Source MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_copy( mpi *X, const mpi *Y );
+
+/**
+ * \brief          Swap the contents of X and Y
+ *
+ * \param X        First MPI value
+ * \param Y        Second MPI value
+ */
+void mpi_swap( mpi *X, mpi *Y );
+
+/**
+ * \brief          Set value from integer
+ *
+ * \param X        MPI to set
+ * \param z        Value to use
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_lset( mpi *X, t_sint z );
+
+/**
+ * \brief          Get a specific bit from X
+ *
+ * \param X        MPI to use
+ * \param pos      Zero-based index of the bit in X
+ *
+ * \return         Either a 0 or a 1
+ */
+int mpi_get_bit( const mpi *X, size_t pos );
+
+/**
+ * \brief          Set a bit of X to a specific value of 0 or 1
+ *
+ * \note           Will grow X if necessary to set a bit to 1 in a not yet
+ *                 existing limb. Will not grow if bit should be set to 0
+ *
+ * \param X        MPI to use
+ * \param pos      Zero-based index of the bit in X
+ * \param val      The value to set the bit to (0 or 1)
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_BAD_INPUT_DATA if val is not 0 or 1
+ */
+int mpi_set_bit( mpi *X, size_t pos, unsigned char val );
+
+/**
+ * \brief          Return the number of zero-bits before the least significant
+ *                 '1' bit
+ *
+ * Note: Thus also the zero-based index of the least significant '1' bit
+ *
+ * \param X        MPI to use
+ */
+size_t mpi_lsb( const mpi *X );
+
+/**
+ * \brief          Return the number of bits up to and including the most
+ *                 significant '1' bit
+ *
+ * Note: Thus also the one-based index of the most significant '1' bit
+ *
+ * \param X        MPI to use
+ */
+size_t mpi_msb( const mpi *X );
+
+/**
+ * \brief          Return the total size in bytes
+ *
+ * \param X        MPI to use
+ */
+size_t mpi_size( const mpi *X );
+
+/**
+ * \brief          Import from an ASCII string
+ *
+ * \param X        Destination MPI
+ * \param radix    Input numeric base
+ * \param s        Null-terminated string buffer
+ *
+ * \return         0 if successful, or a POLARSSL_ERR_MPI_XXX error code
+ */
+int mpi_read_string( mpi *X, int radix, const char *s );
+
+/**
+ * \brief          Export into an ASCII string
+ *
+ * \param X        Source MPI
+ * \param radix    Output numeric base
+ * \param s        String buffer
+ * \param slen     String buffer size
+ *
+ * \return         0 if successful, or a POLARSSL_ERR_MPI_XXX error code.
+ *                 *slen is always updated to reflect the amount
+ *                 of data that has (or would have) been written.
+ *
+ * \note           Call this function with *slen = 0 to obtain the
+ *                 minimum required buffer size in *slen.
+ */
+int mpi_write_string( const mpi *X, int radix, char *s, size_t *slen );
+
+#if defined(POLARSSL_FS_IO)
+/**
+ * \brief          Read X from an opened file
+ *
+ * \param X        Destination MPI
+ * \param radix    Input numeric base
+ * \param fin      Input file handle
+ *
+ * \return         0 if successful, POLARSSL_ERR_MPI_BUFFER_TOO_SMALL if
+ *                 the file read buffer is too small or a
+ *                 POLARSSL_ERR_MPI_XXX error code
+ */
+int mpi_read_file( mpi *X, int radix, FILE *fin );
+
+/**
+ * \brief          Write X into an opened file, or stdout if fout is NULL
+ *
+ * \param p        Prefix, can be NULL
+ * \param X        Source MPI
+ * \param radix    Output numeric base
+ * \param fout     Output file handle (can be NULL)
+ *
+ * \return         0 if successful, or a POLARSSL_ERR_MPI_XXX error code
+ *
+ * \note           Set fout == NULL to print X on the console.
+ */
+int mpi_write_file( const char *p, const mpi *X, int radix, FILE *fout );
+#endif /* POLARSSL_FS_IO */
+
+/**
+ * \brief          Import X from unsigned binary data, big endian
+ *
+ * \param X        Destination MPI
+ * \param buf      Input buffer
+ * \param buflen   Input buffer size
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_read_binary( mpi *X, const unsigned char *buf, size_t buflen );
+
+/**
+ * \brief          Export X into unsigned binary data, big endian
+ *
+ * \param X        Source MPI
+ * \param buf      Output buffer
+ * \param buflen   Output buffer size
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_BUFFER_TOO_SMALL if buf isn't large enough
+ */
+int mpi_write_binary( const mpi *X, unsigned char *buf, size_t buflen );
+
+/**
+ * \brief          Left-shift: X <<= count
+ *
+ * \param X        MPI to shift
+ * \param count    Amount to shift
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_shift_l( mpi *X, size_t count );
+
+/**
+ * \brief          Right-shift: X >>= count
+ *
+ * \param X        MPI to shift
+ * \param count    Amount to shift
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_shift_r( mpi *X, size_t count );
+
+/**
+ * \brief          Compare unsigned values
+ *
+ * \param X        Left-hand MPI
+ * \param Y        Right-hand MPI
+ *
+ * \return         1 if |X| is greater than |Y|,
+ *                -1 if |X| is less than |Y| or
+ *                 0 if |X| is equal to |Y|
+ */
+int mpi_cmp_abs( const mpi *X, const mpi *Y );
+
+/**
+ * \brief          Compare signed values
+ *
+ * \param X        Left-hand MPI
+ * \param Y        Right-hand MPI
+ *
+ * \return         1 if X is greater than Y,
+ *                -1 if X is less than Y or
+ *                 0 if X is equal to Y
+ */
+int mpi_cmp_mpi( const mpi *X, const mpi *Y );
+
+/**
+ * \brief          Compare signed values
+ *
+ * \param X        Left-hand MPI
+ * \param z        The integer value to compare to
+ *
+ * \return         1 if X is greater than z,
+ *                -1 if X is less than z or
+ *                 0 if X is equal to z
+ */
+int mpi_cmp_int( const mpi *X, t_sint z );
+
+/**
+ * \brief          Unsigned addition: X = |A| + |B|
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_add_abs( mpi *X, const mpi *A, const mpi *B );
+
+/**
+ * \brief          Unsigned subtraction: X = |A| - |B|
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_NEGATIVE_VALUE if B is greater than A
+ */
+int mpi_sub_abs( mpi *X, const mpi *A, const mpi *B );
+
+/**
+ * \brief          Signed addition: X = A + B
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_add_mpi( mpi *X, const mpi *A, const mpi *B );
+
+/**
+ * \brief          Signed subtraction: X = A - B
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_sub_mpi( mpi *X, const mpi *A, const mpi *B );
+
+/**
+ * \brief          Signed addition: X = A + b
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param b        The integer value to add
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_add_int( mpi *X, const mpi *A, t_sint b );
+
+/**
+ * \brief          Signed subtraction: X = A - b
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param b        The integer value to subtract
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_sub_int( mpi *X, const mpi *A, t_sint b );
+
+/**
+ * \brief          Baseline multiplication: X = A * B
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_mul_mpi( mpi *X, const mpi *A, const mpi *B );
+
+/**
+ * \brief          Baseline multiplication: X = A * b
+ *                 Note: b is handled as an unsigned value although the
+ *                 parameter type is t_sint, so negative values of b are
+ *                 not supported.
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param b        The integer value to multiply with
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_mul_int( mpi *X, const mpi *A, t_sint b );
+
+/**
+ * \brief          Division by mpi: A = Q * B + R
+ *
+ * \param Q        Destination MPI for the quotient
+ * \param R        Destination MPI for the rest value
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_DIVISION_BY_ZERO if B == 0
+ *
+ * \note           Either Q or R can be NULL.
+ */
+int mpi_div_mpi( mpi *Q, mpi *R, const mpi *A, const mpi *B );
+
+/**
+ * \brief          Division by int: A = Q * b + R
+ *
+ * \param Q        Destination MPI for the quotient
+ * \param R        Destination MPI for the rest value
+ * \param A        Left-hand MPI
+ * \param b        Integer to divide by
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_DIVISION_BY_ZERO if b == 0
+ *
+ * \note           Either Q or R can be NULL.
+ */
+int mpi_div_int( mpi *Q, mpi *R, const mpi *A, t_sint b );
+
+/**
+ * \brief          Modulo: R = A mod B
+ *
+ * \param R        Destination MPI for the rest value
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_DIVISION_BY_ZERO if B == 0,
+ *                 POLARSSL_ERR_MPI_NEGATIVE_VALUE if B < 0
+ */
+int mpi_mod_mpi( mpi *R, const mpi *A, const mpi *B );
+
+/**
+ * \brief          Modulo: r = A mod b
+ *
+ * \param r        Destination t_uint
+ * \param A        Left-hand MPI
+ * \param b        Integer to divide by
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_DIVISION_BY_ZERO if b == 0,
+ *                 POLARSSL_ERR_MPI_NEGATIVE_VALUE if b < 0
+ */
+int mpi_mod_int( t_uint *r, const mpi *A, t_sint b );
+
+/**
+ * \brief          Sliding-window exponentiation: X = A^E mod N
+ *
+ * \param X        Destination MPI 
+ * \param A        Left-hand MPI
+ * \param E        Exponent MPI
+ * \param N        Modular MPI
+ * \param _RR      Speed-up MPI used for recalculations
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_BAD_INPUT_DATA if N is negative or even or if
+ *                 E is negative
+ *
+ * \note           _RR is used to avoid re-computing R*R mod N across
+ *                 multiple calls, which speeds up things a bit. It can
+ *                 be set to NULL if the extra performance is unneeded.
+ */
+int mpi_exp_mod( mpi *X, const mpi *A, const mpi *E, const mpi *N, mpi *_RR );
+
+/**
+ * \brief          Fill an MPI X with size bytes of random
+ *
+ * \param X        Destination MPI
+ * \param size     Size in bytes
+ * \param f_rng    RNG function
+ * \param p_rng    RNG parameter
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_fill_random( mpi *X, size_t size,
+                     int (*f_rng)(void *, unsigned char *, size_t),
+                     void *p_rng );
+
+/**
+ * \brief          Greatest common divisor: G = gcd(A, B)
+ *
+ * \param G        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int mpi_gcd( mpi *G, const mpi *A, const mpi *B );
+
+/**
+ * \brief          Modular inverse: X = A^-1 mod N
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param N        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_BAD_INPUT_DATA if N is negative or zero,
+ *                 POLARSSL_ERR_MPI_NOT_ACCEPTABLE if A has no inverse mod N
+ */
+int mpi_inv_mod( mpi *X, const mpi *A, const mpi *N );
+
+#if 0
+/**
+ * \brief          Miller-Rabin primality test
+ *
+ * \param X        MPI to check
+ * \param f_rng    RNG function
+ * \param p_rng    RNG parameter
+ *
+ * \return         0 if successful (probably prime),
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_NOT_ACCEPTABLE if X is not prime
+ */
+int mpi_is_prime( mpi *X,
+                  int (*f_rng)(void *, unsigned char *, size_t),
+                  void *p_rng );
+#endif
+
+/**
+ * \brief          Prime number generation
+ *
+ * \param X        Destination MPI
+ * \param nbits    Required size of X in bits ( 3 <= nbits <= POLARSSL_MPI_MAX_BITS )
+ * \param dh_flag  If 1, then (X-1)/2 will be prime too
+ * \param f_rng    RNG function
+ * \param p_rng    RNG parameter
+ *
+ * \return         0 if successful (probably prime),
+ *                 POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed,
+ *                 POLARSSL_ERR_MPI_BAD_INPUT_DATA if nbits is < 3
+ */
+int mpi_gen_prime( mpi *X, size_t nbits, int dh_flag,
+                   int (*f_rng)(void *, unsigned char *, size_t),
+                   void *p_rng );
+
+/**
+ * \brief          Checkup routine
+ *
+ * \return         0 if successful, or 1 if the test failed
+ */
+int mpi_self_test( int verbose );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* bignum.h */
--- a/polarssl/bn_mul.h
+++ b/polarssl/bn_mul.h
@ -0,0 +1,901 @@
+/**
+ * \file bn_mul.h
+ *
+ *  Copyright (C) 2006-2010, Brainspark B.V.
+ *
+ *  This file is part of PolarSSL (http://www.polarssl.org)
+ *  Lead Maintainer: Paul Bakker <polarssl_maintainer at polarssl.org>
+ *
+ *  All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+/*
+ *      Multiply source vector [s] with b, add result
+ *       to destination vector [d] and set carry c.
+ *
+ *      Currently supports:
+ *
+ *         . IA-32 (386+)         . AMD64 / EM64T
+ *         . IA-32 (SSE2)         . Motorola 68000
+ *         . PowerPC, 32-bit      . MicroBlaze
+ *         . PowerPC, 64-bit      . TriCore
+ *         . SPARC v8             . ARM v3+
+ *         . Alpha                . MIPS32
+ *         . C, longlong          . C, generic
+ */
+#ifndef POLARSSL_BN_MUL_H
+#define POLARSSL_BN_MUL_H
+
+#include "polarssl/config.h"
+
+#if defined(POLARSSL_HAVE_ASM)
+
+#if defined(__GNUC__)
+#if defined(__i386__)
+
+/*
+ * IA-32: opens the asm statement that MULADDC_CORE / MULADDC_HUIT extend
+ * and MULADDC_STOP closes.  Saves %ebx into operand %0 (t) and, following
+ * the operand list given in MULADDC_STOP, loads s into %esi, d into %edi,
+ * c into %ecx and b into %ebx.
+ */
+#define MULADDC_INIT                \
+    asm( "                          \
+        movl   %%ebx, %0;           \
+        movl   %5, %%esi;           \
+        movl   %6, %%edi;           \
+        movl   %7, %%ecx;           \
+        movl   %8, %%ebx;           \
+        "
+
+/*
+ * IA-32, one limb: load the next source limb (lodsl, advances %esi),
+ * multiply by b in %ebx giving %edx:%eax, add the incoming carry (%ecx)
+ * and the destination limb at (%edi) to the low word while folding the
+ * carries into %edx, keep %edx as the new carry in %ecx, then store the
+ * low word to the destination (stosl, advances %edi).
+ */
+#define MULADDC_CORE                \
+        "                           \
+        lodsl;                      \
+        mull   %%ebx;               \
+        addl   %%ecx,   %%eax;      \
+        adcl   $0,      %%edx;      \
+        addl   (%%edi), %%eax;      \
+        adcl   $0,      %%edx;      \
+        movl   %%edx,   %%ecx;      \
+        stosl;                      \
+        "
+
+#if defined(POLARSSL_HAVE_SSE2)
+
+/*
+ * IA-32 with SSE2, eight limbs per expansion: the limbs at 0..28(%esi) are
+ * multiplied by b (%mm0) with pmuludq and accumulated into 0..28(%edi),
+ * with the running carry kept in %mm1 (seeded from %ecx).  Afterwards
+ * %esi and %edi are advanced by 32 bytes and the final carry is moved
+ * back into %ecx.
+ */
+#define MULADDC_HUIT                    \
+        "                               \
+        movd     %%ecx,     %%mm1;      \
+        movd     %%ebx,     %%mm0;      \
+        movd     (%%edi),   %%mm3;      \
+        paddq    %%mm3,     %%mm1;      \
+        movd     (%%esi),   %%mm2;      \
+        pmuludq  %%mm0,     %%mm2;      \
+        movd     4(%%esi),  %%mm4;      \
+        pmuludq  %%mm0,     %%mm4;      \
+        movd     8(%%esi),  %%mm6;      \
+        pmuludq  %%mm0,     %%mm6;      \
+        movd     12(%%esi), %%mm7;      \
+        pmuludq  %%mm0,     %%mm7;      \
+        paddq    %%mm2,     %%mm1;      \
+        movd     4(%%edi),  %%mm3;      \
+        paddq    %%mm4,     %%mm3;      \
+        movd     8(%%edi),  %%mm5;      \
+        paddq    %%mm6,     %%mm5;      \
+        movd     12(%%edi), %%mm4;      \
+        paddq    %%mm4,     %%mm7;      \
+        movd     %%mm1,     (%%edi);    \
+        movd     16(%%esi), %%mm2;      \
+        pmuludq  %%mm0,     %%mm2;      \
+        psrlq    $32,       %%mm1;      \
+        movd     20(%%esi), %%mm4;      \
+        pmuludq  %%mm0,     %%mm4;      \
+        paddq    %%mm3,     %%mm1;      \
+        movd     24(%%esi), %%mm6;      \
+        pmuludq  %%mm0,     %%mm6;      \
+        movd     %%mm1,     4(%%edi);   \
+        psrlq    $32,       %%mm1;      \
+        movd     28(%%esi), %%mm3;      \
+        pmuludq  %%mm0,     %%mm3;      \
+        paddq    %%mm5,     %%mm1;      \
+        movd     16(%%edi), %%mm5;      \
+        paddq    %%mm5,     %%mm2;      \
+        movd     %%mm1,     8(%%edi);   \
+        psrlq    $32,       %%mm1;      \
+        paddq    %%mm7,     %%mm1;      \
+        movd     20(%%edi), %%mm5;      \
+        paddq    %%mm5,     %%mm4;      \
+        movd     %%mm1,     12(%%edi);  \
+        psrlq    $32,       %%mm1;      \
+        paddq    %%mm2,     %%mm1;      \
+        movd     24(%%edi), %%mm5;      \
+        paddq    %%mm5,     %%mm6;      \
+        movd     %%mm1,     16(%%edi);  \
+        psrlq    $32,       %%mm1;      \
+        paddq    %%mm4,     %%mm1;      \
+        movd     28(%%edi), %%mm5;      \
+        paddq    %%mm5,     %%mm3;      \
+        movd     %%mm1,     20(%%edi);  \
+        psrlq    $32,       %%mm1;      \
+        paddq    %%mm6,     %%mm1;      \
+        movd     %%mm1,     24(%%edi);  \
+        psrlq    $32,       %%mm1;      \
+        paddq    %%mm3,     %%mm1;      \
+        movd     %%mm1,     28(%%edi);  \
+        addl     $32,       %%edi;      \
+        addl     $32,       %%esi;      \
+        psrlq    $32,       %%mm1;      \
+        movd     %%mm1,     %%ecx;      \
+        "
+
+/*
+ * IA-32 with SSE2: closes the asm statement opened by MULADDC_INIT.
+ * Executes emms (MMX registers were used by MULADDC_HUIT), restores %ebx
+ * from t, writes the carry, destination pointer and source pointer back
+ * to c, d and s, and supplies the operand and clobber lists.
+ * NOTE(review): the asm stores through %edi, but neither "memory" nor the
+ * MMX registers appear in the clobber list -- confirm this is acceptable
+ * for the compilers in use.
+ */
+#define MULADDC_STOP            \
+        "                       \
+        emms;                   \
+        movl   %4, %%ebx;       \
+        movl   %%ecx, %1;       \
+        movl   %%edi, %2;       \
+        movl   %%esi, %3;       \
+        "                       \
+        : "=m" (t), "=m" (c), "=m" (d), "=m" (s)        \
+        : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b)   \
+        : "eax", "ecx", "edx", "esi", "edi"             \
+    );
+
+#else
+
+/*
+ * IA-32 without SSE2: closes the asm statement opened by MULADDC_INIT.
+ * Restores %ebx from t, writes the carry, destination pointer and source
+ * pointer back to c, d and s, and supplies the operand and clobber lists.
+ * NOTE(review): the asm stores through %edi, but "memory" is not in the
+ * clobber list -- confirm this is acceptable for the compilers in use.
+ */
+#define MULADDC_STOP            \
+        "                       \
+        movl   %4, %%ebx;       \
+        movl   %%ecx, %1;       \
+        movl   %%edi, %2;       \
+        movl   %%esi, %3;       \
+        "                       \
+        : "=m" (t), "=m" (c), "=m" (d), "=m" (s)        \
+        : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b)   \
+        : "eax", "ecx", "edx", "esi", "edi"             \
+    );
+#endif /* SSE2 */
+#endif /* i386 */
+
+#if defined(__amd64__) || defined (__x86_64__)
+
+/*
+ * AMD64: loads s, d, c and b into %rsi, %rdi, %rcx and %rbx and zeroes
+ * %r8, one asm statement per instruction.
+ * NOTE(review): state is carried in registers across separate asm
+ * statements, and the only clobber list is on the last statement of
+ * MULADDC_STOP; the compiler is not told about these dependencies --
+ * confirm against the GCC extended-asm rules for the toolchain in use.
+ */
+#define MULADDC_INIT                            \
+    asm( "movq   %0, %%rsi      " :: "m" (s));  \
+    asm( "movq   %0, %%rdi      " :: "m" (d));  \
+    asm( "movq   %0, %%rcx      " :: "m" (c));  \
+    asm( "movq   %0, %%rbx      " :: "m" (b));  \
+    asm( "xorq   %r8, %r8       " );
+
+/*
+ * AMD64, one limb: %rdx:%rax = (*%rsi) * %rbx, advance %rsi by 8, add the
+ * incoming carry (%rcx) to the low word, reset %rcx from the zero in %r8,
+ * fold the carry into %rdx, add the low word into the destination limb at
+ * (%rdi), form the new carry in %rcx from %rdx plus the carry flag, and
+ * advance %rdi by 8.
+ */
+#define MULADDC_CORE                            \
+    asm( "movq  (%rsi),%rax     " );            \
+    asm( "mulq   %rbx           " );            \
+    asm( "addq   $8,   %rsi     " );            \
+    asm( "addq   %rcx, %rax     " );            \
+    asm( "movq   %r8,  %rcx     " );            \
+    asm( "adcq   $0,   %rdx     " );            \
+    asm( "nop                   " );            \
+    asm( "addq   %rax, (%rdi)   " );            \
+    asm( "adcq   %rdx, %rcx     " );            \
+    asm( "addq   $8,   %rdi     " );
+
+/*
+ * AMD64: writes the carry (%rcx), destination pointer (%rdi) and source
+ * pointer (%rsi) back to c, d and s.  The clobber list for the whole
+ * INIT/CORE/STOP sequence is attached to the final asm statement only.
+ */
+#define MULADDC_STOP                            \
+    asm( "movq   %%rcx, %0      " : "=m" (c));  \
+    asm( "movq   %%rdi, %0      " : "=m" (d));  \
+    asm( "movq   %%rsi, %0      " : "=m" (s) :: \
+    "rax", "rcx", "rdx", "rbx", "rsi", "rdi", "r8" );
+
+#endif /* AMD64 */
+
+#if defined(__mc68020__) || defined(__mcpu32__)
+
+/*
+ * m68k (mc68020 / mcpu32): loads s, d, c and b into %a2, %a3, %d3 and %d2
+ * and clears %d0, which the other macros add with addxl as a zero operand.
+ * Each instruction is its own asm statement; register state is assumed to
+ * persist between them.
+ */
+#define MULADDC_INIT                            \
+    asm( "movl   %0, %%a2       " :: "m" (s));  \
+    asm( "movl   %0, %%a3       " :: "m" (d));  \
+    asm( "movl   %0, %%d3       " :: "m" (c));  \
+    asm( "movl   %0, %%d2       " :: "m" (b));  \
+    asm( "moveq  #0, %d0        " );
+
+/*
+ * m68k, one limb: fetch the next source limb through %a2 (post-increment),
+ * multiply by b (%d2) into the pair %d4:%d1, add the incoming carry (%d3)
+ * to the low word and fold the carry into %d4, clear %d3, add the low word
+ * into the destination limb through %a3 (post-increment), and form the new
+ * carry in %d3 from %d4 plus the extend flag.
+ */
+#define MULADDC_CORE                            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d4:%d1   " );            \
+    asm( "addl   %d3, %d1       " );            \
+    asm( "addxl  %d0, %d4       " );            \
+    asm( "moveq  #0,  %d3       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "addxl  %d4, %d3       " );
+
+/*
+ * m68k: writes the carry (%d3), destination pointer (%a3) and source
+ * pointer (%a2) back to c, d and s.  The clobber list for the whole
+ * sequence is attached to the final asm statement only.
+ */
+#define MULADDC_STOP                            \
+    asm( "movl   %%d3, %0       " : "=m" (c));  \
+    asm( "movl   %%a3, %0       " : "=m" (d));  \
+    asm( "movl   %%a2, %0       " : "=m" (s) :: \
+    "d0", "d1", "d2", "d3", "d4", "a2", "a3" );
+
+/*
+ * m68k, eight limbs per expansion: eight movel / mulul / addxl / addxl /
+ * addl groups, alternating %d4 and %d3 as the register that receives the
+ * high product word (and so carries into the next group), followed by a
+ * final addxl of the zero in %d0 into %d3 to absorb the last extend flag.
+ */
+#define MULADDC_HUIT                            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d4:%d1   " );            \
+    asm( "addxl  %d3, %d1       " );            \
+    asm( "addxl  %d0, %d4       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d3:%d1   " );            \
+    asm( "addxl  %d4, %d1       " );            \
+    asm( "addxl  %d0, %d3       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d4:%d1   " );            \
+    asm( "addxl  %d3, %d1       " );            \
+    asm( "addxl  %d0, %d4       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d3:%d1   " );            \
+    asm( "addxl  %d4, %d1       " );            \
+    asm( "addxl  %d0, %d3       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d4:%d1   " );            \
+    asm( "addxl  %d3, %d1       " );            \
+    asm( "addxl  %d0, %d4       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d3:%d1   " );            \
+    asm( "addxl  %d4, %d1       " );            \
+    asm( "addxl  %d0, %d3       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d4:%d1   " );            \
+    asm( "addxl  %d3, %d1       " );            \
+    asm( "addxl  %d0, %d4       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "movel  %a2@+, %d1     " );            \
+    asm( "mulul  %d2, %d3:%d1   " );            \
+    asm( "addxl  %d4, %d1       " );            \
+    asm( "addxl  %d0, %d3       " );            \
+    asm( "addl   %d1, %a3@+     " );            \
+    asm( "addxl  %d0, %d3       " );
+
+#endif /* MC68000 */
+
+#if defined(__powerpc__)   || defined(__ppc__)
+#if defined(__powerpc64__) || defined(__ppc64__)
+
+#if defined(__MACH__) && defined(__APPLE__)
+
+/*
+ * PowerPC 64-bit, Apple/Mach-O assembler syntax (bare register names).
+ *
+ * Register use, as set up below: r3 = s, r4 = d, r5 = c, r6 = b;
+ * r7-r9 are scratch.  MULADDC_INIT backs both pointers up by 8 because
+ * MULADDC_CORE advances them with the update forms ldu/stdu at offset
+ * 8; MULADDC_STOP adds the 8 back before storing them.
+ * "addic r5, r5, 0" leaves r5 unchanged; presumably it is there to
+ * clear the carry bit before the first adde -- confirm against the
+ * PowerPC ISA.
+ *
+ * In MULADDC_CORE the carry left by the final addc is consumed by the
+ * next step's adde, or by the addze in MULADDC_STOP.
+ *
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one in MULADDC_STOP, so
+ * nothing here tells the compiler that r3-r9 must be preserved
+ * between the statements -- confirm this is safe before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "ld     r3, %0         " :: "m" (s));  \
+    asm( "ld     r4, %0         " :: "m" (d));  \
+    asm( "ld     r5, %0         " :: "m" (c));  \
+    asm( "ld     r6, %0         " :: "m" (b));  \
+    asm( "addi   r3, r3, -8     " );            \
+    asm( "addi   r4, r4, -8     " );            \
+    asm( "addic  r5, r5,  0     " );
+
+#define MULADDC_CORE                            \
+    asm( "ldu    r7, 8(r3)      " );            \
+    asm( "mulld  r8, r7, r6     " );            \
+    asm( "mulhdu r9, r7, r6     " );            \
+    asm( "adde   r8, r8, r5     " );            \
+    asm( "ld     r7, 8(r4)      " );            \
+    asm( "addze  r5, r9         " );            \
+    asm( "addc   r8, r8, r7     " );            \
+    asm( "stdu   r8, 8(r4)      " );
+
+#define MULADDC_STOP                            \
+    asm( "addze  r5, r5         " );            \
+    asm( "addi   r4, r4, 8      " );            \
+    asm( "addi   r3, r3, 8      " );            \
+    asm( "std    r5, %0         " : "=m" (c));  \
+    asm( "std    r4, %0         " : "=m" (d));  \
+    asm( "std    r3, %0         " : "=m" (s) :: \
+    "r3", "r4", "r5", "r6", "r7", "r8", "r9" );
+
+#else
+
+/*
+ * PowerPC 64-bit, non-Apple assembler syntax (%r-prefixed registers).
+ * Statements that take operands spell the prefix as %% so that a
+ * literal % reaches the assembler; operand-less statements use a
+ * single %.
+ *
+ * Register use, as set up below: r3 = s, r4 = d, r5 = c, r6 = b;
+ * r7-r9 are scratch.  MULADDC_INIT backs both pointers up by 8 because
+ * MULADDC_CORE advances them with the update forms ldu/stdu at offset
+ * 8; MULADDC_STOP adds the 8 back before storing them.
+ * "addic %r5, %r5, 0" leaves r5 unchanged; presumably it is there to
+ * clear the carry bit before the first adde -- confirm against the
+ * PowerPC ISA.
+ *
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one in MULADDC_STOP, so
+ * nothing here tells the compiler that r3-r9 must be preserved
+ * between the statements -- confirm this is safe before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "ld     %%r3, %0       " :: "m" (s));  \
+    asm( "ld     %%r4, %0       " :: "m" (d));  \
+    asm( "ld     %%r5, %0       " :: "m" (c));  \
+    asm( "ld     %%r6, %0       " :: "m" (b));  \
+    asm( "addi   %r3, %r3, -8   " );            \
+    asm( "addi   %r4, %r4, -8   " );            \
+    asm( "addic  %r5, %r5,  0   " );
+
+#define MULADDC_CORE                            \
+    asm( "ldu    %r7, 8(%r3)    " );            \
+    asm( "mulld  %r8, %r7, %r6  " );            \
+    asm( "mulhdu %r9, %r7, %r6  " );            \
+    asm( "adde   %r8, %r8, %r5  " );            \
+    asm( "ld     %r7, 8(%r4)    " );            \
+    asm( "addze  %r5, %r9       " );            \
+    asm( "addc   %r8, %r8, %r7  " );            \
+    asm( "stdu   %r8, 8(%r4)    " );
+
+#define MULADDC_STOP                            \
+    asm( "addze  %r5, %r5       " );            \
+    asm( "addi   %r4, %r4, 8    " );            \
+    asm( "addi   %r3, %r3, 8    " );            \
+    asm( "std    %%r5, %0       " : "=m" (c));  \
+    asm( "std    %%r4, %0       " : "=m" (d));  \
+    asm( "std    %%r3, %0       " : "=m" (s) :: \
+    "r3", "r4", "r5", "r6", "r7", "r8", "r9" );
+
+#endif
+
+#else /* PPC32 */
+
+#if defined(__MACH__) && defined(__APPLE__)
+
+/*
+ * PowerPC 32-bit, Apple/Mach-O assembler syntax (bare register names).
+ *
+ * Register use, as set up below: r3 = s, r4 = d, r5 = c, r6 = b;
+ * r7-r9 are scratch.  MULADDC_INIT backs both pointers up by 4 because
+ * MULADDC_CORE advances them with the update forms lwzu/stwu at
+ * offset 4; MULADDC_STOP adds the 4 back before storing them.
+ * "addic r5, r5, 0" leaves r5 unchanged; presumably it is there to
+ * clear the carry bit before the first adde -- confirm against the
+ * PowerPC ISA.
+ *
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one in MULADDC_STOP, so
+ * nothing here tells the compiler that r3-r9 must be preserved
+ * between the statements -- confirm this is safe before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "lwz    r3, %0         " :: "m" (s));  \
+    asm( "lwz    r4, %0         " :: "m" (d));  \
+    asm( "lwz    r5, %0         " :: "m" (c));  \
+    asm( "lwz    r6, %0         " :: "m" (b));  \
+    asm( "addi   r3, r3, -4     " );            \
+    asm( "addi   r4, r4, -4     " );            \
+    asm( "addic  r5, r5,  0     " );
+
+#define MULADDC_CORE                            \
+    asm( "lwzu   r7, 4(r3)      " );            \
+    asm( "mullw  r8, r7, r6     " );            \
+    asm( "mulhwu r9, r7, r6     " );            \
+    asm( "adde   r8, r8, r5     " );            \
+    asm( "lwz    r7, 4(r4)      " );            \
+    asm( "addze  r5, r9         " );            \
+    asm( "addc   r8, r8, r7     " );            \
+    asm( "stwu   r8, 4(r4)      " );
+
+#define MULADDC_STOP                            \
+    asm( "addze  r5, r5         " );            \
+    asm( "addi   r4, r4, 4      " );            \
+    asm( "addi   r3, r3, 4      " );            \
+    asm( "stw    r5, %0         " : "=m" (c));  \
+    asm( "stw    r4, %0         " : "=m" (d));  \
+    asm( "stw    r3, %0         " : "=m" (s) :: \
+    "r3", "r4", "r5", "r6", "r7", "r8", "r9" );
+
+#else
+
+/*
+ * PowerPC 32-bit, non-Apple assembler syntax (%r-prefixed registers).
+ * Statements that take operands spell the prefix as %% so that a
+ * literal % reaches the assembler; operand-less statements use a
+ * single %.
+ *
+ * Register use, as set up below: r3 = s, r4 = d, r5 = c, r6 = b;
+ * r7-r9 are scratch.  MULADDC_INIT backs both pointers up by 4 because
+ * MULADDC_CORE advances them with the update forms lwzu/stwu at
+ * offset 4; MULADDC_STOP adds the 4 back before storing them.
+ * "addic %r5, %r5, 0" leaves r5 unchanged; presumably it is there to
+ * clear the carry bit before the first adde -- confirm against the
+ * PowerPC ISA.
+ *
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one in MULADDC_STOP, so
+ * nothing here tells the compiler that r3-r9 must be preserved
+ * between the statements -- confirm this is safe before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "lwz    %%r3, %0       " :: "m" (s));  \
+    asm( "lwz    %%r4, %0       " :: "m" (d));  \
+    asm( "lwz    %%r5, %0       " :: "m" (c));  \
+    asm( "lwz    %%r6, %0       " :: "m" (b));  \
+    asm( "addi   %r3, %r3, -4   " );            \
+    asm( "addi   %r4, %r4, -4   " );            \
+    asm( "addic  %r5, %r5,  0   " );
+
+#define MULADDC_CORE                            \
+    asm( "lwzu   %r7, 4(%r3)    " );            \
+    asm( "mullw  %r8, %r7, %r6  " );            \
+    asm( "mulhwu %r9, %r7, %r6  " );            \
+    asm( "adde   %r8, %r8, %r5  " );            \
+    asm( "lwz    %r7, 4(%r4)    " );            \
+    asm( "addze  %r5, %r9       " );            \
+    asm( "addc   %r8, %r8, %r7  " );            \
+    asm( "stwu   %r8, 4(%r4)    " );
+
+#define MULADDC_STOP                            \
+    asm( "addze  %r5, %r5       " );            \
+    asm( "addi   %r4, %r4, 4    " );            \
+    asm( "addi   %r3, %r3, 4    " );            \
+    asm( "stw    %%r5, %0       " : "=m" (c));  \
+    asm( "stw    %%r4, %0       " : "=m" (d));  \
+    asm( "stw    %%r3, %0       " : "=m" (s) :: \
+    "r3", "r4", "r5", "r6", "r7", "r8", "r9" );
+
+#endif
+
+#endif /* PPC32 */
+#endif /* PowerPC */
+
+#if defined(__sparc__)
+
+/*
+ * SPARC multiply-accumulate step.
+ *
+ * Register use, as set up below: o0 = s, o1 = d, o2 = c, o3 = b;
+ * o4, o5 and g1 are scratch.  MULADDC_CORE loads one word from each
+ * pointer, multiplies with umul (low word into o4, high word read
+ * back from the Y register with "rd %y"), adds the incoming carry o2
+ * and the destination word with addcc, folds each carry into the high
+ * word with addx, stores the low word through o1 and leaves the new
+ * carry in o2.  Both pointers advance by 4 per step.
+ *
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one in MULADDC_STOP, so
+ * nothing here tells the compiler that these registers must be
+ * preserved between the statements -- confirm before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "ld     %0, %%o0       " :: "m" (s));  \
+    asm( "ld     %0, %%o1       " :: "m" (d));  \
+    asm( "ld     %0, %%o2       " :: "m" (c));  \
+    asm( "ld     %0, %%o3       " :: "m" (b));
+
+#define MULADDC_CORE                            \
+    asm( "ld    [%o0], %o4      " );            \
+    asm( "inc      4,  %o0      " );            \
+    asm( "ld    [%o1], %o5      " );            \
+    asm( "umul   %o3,  %o4, %o4 " );            \
+    asm( "addcc  %o4,  %o2, %o4 " );            \
+    asm( "rd      %y,  %g1      " );            \
+    asm( "addx   %g1,    0, %g1 " );            \
+    asm( "addcc  %o4,  %o5, %o4 " );            \
+    asm( "st     %o4, [%o1]     " );            \
+    asm( "addx   %g1,    0, %o2 " );            \
+    asm( "inc      4,  %o1      " );
+
+#define MULADDC_STOP                            \
+    asm( "st     %%o2, %0       " : "=m" (c));  \
+    asm( "st     %%o1, %0       " : "=m" (d));  \
+    asm( "st     %%o0, %0       " : "=m" (s) :: \
+    "g1", "o0", "o1", "o2", "o3", "o4", "o5" );
+
+#endif /* SPARCv8 */
+
+#if defined(__microblaze__) || defined(microblaze)
+
+/*
+ * MicroBlaze multiply-accumulate step built from 16x16-bit products.
+ *
+ * Register use, as set up below: r3 = s, r4 = d, r5 = c.  b is loaded
+ * into r6 and then split: r7 = b & 0xffff, r6 = b >> 16.
+ *
+ * MULADDC_CORE reads the source word as two halfwords (r8, then r9,
+ * advancing r3 by 2 each time), forms the four partial products with
+ * mul, and combines them with shifts and add/addc into r13:r12.  It
+ * then adds the destination word and the incoming carry r5, stores
+ * r12 through r4, leaves the new carry in r5 and advances r4 by 4.
+ * r0 is used as the zero addend of addc.
+ *
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one in MULADDC_STOP, so
+ * nothing here tells the compiler that these registers must be
+ * preserved between the statements -- confirm before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "lwi   r3,   %0        " :: "m" (s));  \
+    asm( "lwi   r4,   %0        " :: "m" (d));  \
+    asm( "lwi   r5,   %0        " :: "m" (c));  \
+    asm( "lwi   r6,   %0        " :: "m" (b));  \
+    asm( "andi  r7,   r6, 0xffff" );            \
+    asm( "bsrli r6,   r6, 16    " );
+
+#define MULADDC_CORE                            \
+    asm( "lhui  r8,   r3,   0   " );            \
+    asm( "addi  r3,   r3,   2   " );            \
+    asm( "lhui  r9,   r3,   0   " );            \
+    asm( "addi  r3,   r3,   2   " );            \
+    asm( "mul   r10,  r9,  r6   " );            \
+    asm( "mul   r11,  r8,  r7   " );            \
+    asm( "mul   r12,  r9,  r7   " );            \
+    asm( "mul   r13,  r8,  r6   " );            \
+    asm( "bsrli  r8, r10,  16   " );            \
+    asm( "bsrli  r9, r11,  16   " );            \
+    asm( "add   r13, r13,  r8   " );            \
+    asm( "add   r13, r13,  r9   " );            \
+    asm( "bslli r10, r10,  16   " );            \
+    asm( "bslli r11, r11,  16   " );            \
+    asm( "add   r12, r12, r10   " );            \
+    asm( "addc  r13, r13,  r0   " );            \
+    asm( "add   r12, r12, r11   " );            \
+    asm( "addc  r13, r13,  r0   " );            \
+    asm( "lwi   r10,  r4,   0   " );            \
+    asm( "add   r12, r12, r10   " );            \
+    asm( "addc  r13, r13,  r0   " );            \
+    asm( "add   r12, r12,  r5   " );            \
+    asm( "addc   r5, r13,  r0   " );            \
+    asm( "swi   r12,  r4,   0   " );            \
+    asm( "addi   r4,  r4,   4   " );
+
+#define MULADDC_STOP                            \
+    asm( "swi   r5,   %0        " : "=m" (c));  \
+    asm( "swi   r4,   %0        " : "=m" (d));  \
+    asm( "swi   r3,   %0        " : "=m" (s) :: \
+     "r3", "r4" , "r5" , "r6" , "r7" , "r8" ,   \
+     "r9", "r10", "r11", "r12", "r13" );
+
+#endif /* MicroBlaze */
+
+#if defined(__tricore__)
+
+/*
+ * TriCore multiply-accumulate step.
+ *
+ * Register use, as set up below: a2 = s, a3 = d, d4 = c, d1 = b, and
+ * d5 is zeroed with xor.  MULADDC_CORE loads a source word through a2
+ * (post-increment), issues madd.u on e2/e4 with d0 and d1, adds the
+ * destination word with addx/addc, copies d3 into d4 as the next
+ * carry and stores d2 through a3 (post-increment).
+ *
+ * NOTE(review): d5 is written in MULADDC_INIT but is not named in the
+ * clobber list of MULADDC_STOP -- confirm whether that is intended.
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one, so nothing here tells
+ * the compiler that these registers must be preserved between the
+ * statements -- confirm before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "ld.a   %%a2, %0       " :: "m" (s));  \
+    asm( "ld.a   %%a3, %0       " :: "m" (d));  \
+    asm( "ld.w   %%d4, %0       " :: "m" (c));  \
+    asm( "ld.w   %%d1, %0       " :: "m" (b));  \
+    asm( "xor    %d5, %d5       " );
+
+#define MULADDC_CORE                            \
+    asm( "ld.w   %d0,   [%a2+]      " );        \
+    asm( "madd.u %e2, %e4, %d0, %d1 " );        \
+    asm( "ld.w   %d0,   [%a3]       " );        \
+    asm( "addx   %d2,    %d2,  %d0  " );        \
+    asm( "addc   %d3,    %d3,    0  " );        \
+    asm( "mov    %d4,    %d3        " );        \
+    asm( "st.w  [%a3+],  %d2        " );
+
+#define MULADDC_STOP                            \
+    asm( "st.w   %0, %%d4       " : "=m" (c));  \
+    asm( "st.a   %0, %%a3       " : "=m" (d));  \
+    asm( "st.a   %0, %%a2       " : "=m" (s) :: \
+    "d0", "d1", "e2", "d4", "a2", "a3" );
+
+#endif /* TriCore */
+
+#if defined(__arm__)
+#if defined(__ARM_FEATURE_DSP)
+/* The ARM DSP instructions are available on Cortex M4, M7 and
+   Cortex A CPUs */
+
+/*
+ * ARM (DSP extension): string fragment processing four words.
+ *
+ * Loads four source words through %[s] (with write-back) into r7-r10
+ * and four destination words from %[d] into r3-r6, applies one umaal
+ * per word with %[b] and the running carry, then stores r3-r6 back
+ * through %[d] with write-back.  The carry is addressed positionally
+ * as %2, so the enclosing asm statement must list c as its third
+ * operand.
+ */
+#define MULADDC_1024_CORE \
+        "ldmia  %[s]!, { r7, r8, r9, r10 } \n\t" \
+        "ldmia  %[d], { r3, r4, r5, r6 }   \n\t" \
+        "umaal  r3, %2, %[b], r7           \n\t" \
+        "umaal  r4, %2, %[b], r8           \n\t" \
+        "umaal  r5, %2, %[b], r9           \n\t" \
+        "umaal  r6, %2, %[b], r10          \n\t" \
+        "stmia  %[d]!, {r3, r4, r5, r6}    \n\t"
+
+/*
+ * ARM (DSP extension): complete asm statement that processes the
+ * input in chunks of 32 words.
+ *
+ * The loop is skipped when (i & 0xfe0) is zero.  Each pass subtracts
+ * 32 from i and expands MULADDC_1024_CORE eight times (four words
+ * each), repeating while (i & 0xfe0) is non-zero.  s, d, c and i are
+ * both inputs and outputs (tied by the named matching constraints);
+ * b is input only.
+ */
+#define MULADDC_1024_LOOP                        \
+   asm( "tst    %[i], #0xfe0           \n\t"     \
+        "beq    0f                     \n"       \
+"1:	 sub    %[i], %[i], #32        \n\t"     \
+        MULADDC_1024_CORE MULADDC_1024_CORE      \
+        MULADDC_1024_CORE MULADDC_1024_CORE      \
+        MULADDC_1024_CORE MULADDC_1024_CORE      \
+        MULADDC_1024_CORE MULADDC_1024_CORE      \
+        "tst    %[i], #0xfe0           \n\t"     \
+        "bne    1b                     \n"       \
+"0:"                                             \
+        : [s] "=r" (s), [d] "=r" (d), [c] "=r" (c), [i] "=r" (i)    \
+        : [b] "r" (b), "[s]" (s), "[d]" (d), "[c]" (c), "[i]" (i)   \
+        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "memory", "cc" );
+
+/*
+ * ARM (DSP extension): INIT / CORE / HUIT / STOP together form ONE
+ * asm statement.  MULADDC_INIT opens "asm(", MULADDC_CORE and
+ * MULADDC_HUIT contribute instruction strings, and MULADDC_STOP
+ * supplies the operand and clobber lists and closes the statement.
+ *
+ * Operands (see MULADDC_STOP): %0 = s, %1 = d, %2 = c, %3 = b.
+ * MULADDC_CORE handles one word: load *s (post-increment), load *d,
+ * umaal with b and the carry, store back to *d (post-increment).
+ * MULADDC_HUIT handles eight words as two groups of four using
+ * ldmia/stmia.
+ */
+#define MULADDC_INIT                                      \
+    asm(
+
+#define MULADDC_CORE                                      \
+                  "ldr    r0, [%0], #4            \n\t"   \
+                  "ldr    r1, [%1]                \n\t"   \
+                  "umaal  r1, %2, %3, r0          \n\t"   \
+                  "str    r1, [%1], #4            \n\t"
+
+#define MULADDC_HUIT                                      \
+                  "ldmia  %0!, {r0, r1, r2, r3}   \n\t"   \
+                  "ldmia  %1, {r4, r5, r6, r7}    \n\t"   \
+                  "umaal  r4, %2, %3, r0          \n\t"   \
+                  "umaal  r5, %2, %3, r1          \n\t"   \
+                  "umaal  r6, %2, %3, r2          \n\t"   \
+                  "umaal  r7, %2, %3, r3          \n\t"   \
+                  "stmia  %1!, {r4, r5, r6, r7}   \n\t"   \
+                  "ldmia  %0!, {r0, r1, r2, r3}   \n\t"   \
+                  "ldmia  %1, {r4, r5, r6, r7}    \n\t"   \
+                  "umaal  r4, %2, %3, r0          \n\t"   \
+                  "umaal  r5, %2, %3, r1          \n\t"   \
+                  "umaal  r6, %2, %3, r2          \n\t"   \
+                  "umaal  r7, %2, %3, r3          \n\t"   \
+                  "stmia  %1!, {r4, r5, r6, r7}   \n\t"
+
+#define MULADDC_STOP                                      \
+                  : "=r" (s), "=r" (d), "=r" (c)          \
+                  : "r" (b), "0" (s), "1" (d), "2" (c)    \
+                  : "r0", "r1", "r2", "r3", "r4", "r5",   \
+                    "r6", "r7", "memory");
+
+#else /* __ARM_FEATURE_DSP */
+
+/*
+ * ARM (no DSP extension): string fragment processing three words.
+ *
+ * Loads three source words into r8-r10 and three destination words
+ * into r5-r7.  For each word: adcs adds the running carry %[c] (plus
+ * the carry flag left by the previous adds) into the destination
+ * word, umull forms the 64-bit product with %[b] (low in r4, high in
+ * r8), adc moves the high word plus carry flag into %[c], and adds
+ * accumulates the low word.  The fragment begins with adcs, so it
+ * relies on the carry flag left by whatever precedes it, and it
+ * leaves a carry flag for whatever follows.
+ */
+#define MULADDC_1024_CORE                        \
+       "ldmia  %[s]!, { r8, r9, r10 } \n\t"      \
+       "ldmia  %[d], { r5, r6, r7 }   \n\t"      \
+       "adcs   r5, r5, %[c]           \n\t"      \
+       "umull  r4, r8, r8, %[b]       \n\t"      \
+       "adc    %[c], r8, #0           \n\t"      \
+       "adds   r5, r5, r4             \n\t"      \
+       "adcs   r6, r6, %[c]           \n\t"      \
+       "umull  r4, r8, r9, %[b]       \n\t"      \
+       "adc    %[c], r8, #0           \n\t"      \
+       "adds   r6, r6, r4             \n\t"      \
+       "adcs   r7, r7, %[c]           \n\t"      \
+       "umull  r4, r8, r10, %[b]      \n\t"      \
+       "adc    %[c], r8, #0           \n\t"      \
+       "adds   r7, r7, r4             \n\t"      \
+       "stmia  %[d]!, { r5, r6, r7 }  \n\t"
+
+/*
+ * ARM (no DSP extension): complete asm statement that processes the
+ * input in chunks of 32 words.
+ *
+ * The loop is skipped when (i & 0xfe0) is zero.  Each pass handles
+ *   - three words inline (the first addition is "adds", not "adcs",
+ *     so no carry flag is consumed on entry to a pass),
+ *   - nine expansions of MULADDC_1024_CORE (three words each),
+ *   - two final words inline,
+ * i.e. 3 + 27 + 2 = 32 words, matching the "sub %[i], %[i], #32".
+ * The closing "adc %[c], %[c], #0" folds the last carry flag into c
+ * before the flags are overwritten by tst.
+ */
+#define MULADDC_1024_LOOP                        \
+  asm( "tst    %[i], #0xfe0           \n\t"      \
+       "beq    0f                     \n"        \
+"1:	ldmia  %[s]!, { r8, r9, r10 } \n\t"      \
+       "ldmia  %[d], { r5, r6, r7 }   \n\t"      \
+       "sub    %[i], %[i], #32        \n\t"      \
+       "adds   r5, r5, %[c]           \n\t"      \
+       "umull  r4, r8, %[b], r8       \n\t"      \
+       "adc    %[c], r8, #0           \n\t"      \
+       "adds   r5, r5, r4             \n\t"      \
+       "adcs   r6, r6, %[c]           \n\t"      \
+       "umull  r4, r8, %[b], r9       \n\t"      \
+       "adc    %[c], r8, #0           \n\t"      \
+       "adds   r6, r6, r4             \n\t"      \
+       "adcs   r7, r7, %[c]           \n\t"      \
+       "umull  r4, r8, %[b], r10      \n\t"      \
+       "adc    %[c], r8, #0           \n\t"      \
+       "adds   r7, r7, r4             \n\t"      \
+       "stmia  %[d]!, { r5, r6, r7 }  \n\t"      \
+       MULADDC_1024_CORE MULADDC_1024_CORE       \
+       MULADDC_1024_CORE MULADDC_1024_CORE       \
+       MULADDC_1024_CORE MULADDC_1024_CORE       \
+       MULADDC_1024_CORE MULADDC_1024_CORE       \
+       MULADDC_1024_CORE                         \
+       "ldmia  %[s]!, { r8, r9 }      \n\t"      \
+       "ldmia  %[d], { r5, r6 }       \n\t"      \
+       "adcs   r5, r5, %[c]           \n\t"      \
+       "umull  r4, r8, %[b], r8       \n\t"      \
+       "adc    %[c], r8, #0           \n\t"      \
+       "adds   r5, r5, r4             \n\t"      \
+       "adcs   r6, r6, %[c]           \n\t"      \
+       "umull  r4, r8, %[b], r9       \n\t"      \
+       "adc    %[c], r8, #0           \n\t"      \
+       "adds   r6, r6, r4             \n\t"      \
+       "adc    %[c], %[c], #0         \n\t"      \
+       "stmia  %[d]!, { r5, r6 }      \n\t"      \
+       "tst    %[i], #0xfe0           \n\t"      \
+       "bne    1b                     \n"        \
+"0:"                                             \
+       : [s] "=r" (s), [d] "=r" (d), [c] "=r" (c), [i] "=r" (i)    \
+       : [b] "r" (b), "[s]" (s), "[d]" (d), "[c]" (c), "[i]" (i)   \
+       : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "memory", "cc" );
+
+/*
+ * Kept for reference only (dead code): the name differs from
+ * MULADDC_HUIT, so code that tests for or expands MULADDC_HUIT does
+ * not pick this up.
+ *
+ * Eight-word variant written as four groups of two words, using
+ * positional operands (%0 = source pointer, %1 = destination pointer,
+ * %2 = carry, %3 = multiplier -- presumably the same order as
+ * MULADDC_STOP for this architecture; confirm before reviving).
+ */
+#define MULADDC_HUIT_DEAD                              \
+                  "ldmia  %0!, { r4, r5 } \n\t"        \
+                  "ldmia  %1, { r8, r9 }  \n\t"        \
+                  "umull  r6, r7, %3, r4  \n\t"        \
+                  "adcs   r6, r6, %2      \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r8, r8, r6      \n\t"        \
+                  "umull  r6, r7, %3, r5  \n\t"        \
+                  "adcs   r6, r6, %2      \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r9, r9, r6      \n\t"        \
+                  "stmia  %1!, { r8, r9 } \n\t"        \
+                  "ldmia  %0!, { r4, r5 } \n\t"        \
+                  "ldmia  %1, { r8, r9 }  \n\t"        \
+                  "umull  r6, r7, %3, r4  \n\t"        \
+                  "adcs   r6, r6, %2      \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r8, r8, r6      \n\t"        \
+                  "umull  r6, r7, %3, r5  \n\t"        \
+                  "adcs   r6, r6, %2      \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r9, r9, r6      \n\t"        \
+                  "stmia  %1!, { r8, r9 } \n\t"        \
+                  "ldmia  %0!, { r4, r5 } \n\t"        \
+                  "ldmia  %1, { r8, r9 }  \n\t"        \
+                  "umull  r6, r7, %3, r4  \n\t"        \
+                  "adcs   r6, r6, %2      \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r8, r8, r6      \n\t"        \
+                  "umull  r6, r7, %3, r5  \n\t"        \
+                  "adcs   r6, r6, %2      \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r9, r9, r6      \n\t"        \
+                  "stmia  %1!, { r8, r9 } \n\t"        \
+                  "ldmia  %0!, { r4, r5 } \n\t"        \
+                  "ldmia  %1, { r8, r9 }  \n\t"        \
+                  "umull  r6, r7, %3, r4  \n\t"        \
+                  "adcs   r6, r6, %2      \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r8, r8, r6      \n\t"        \
+                  "umull  r6, r7, %3, r5  \n\t"        \
+                  "adcs   r6, r6, %2      \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r9, r9, r6      \n\t"        \
+                  "stmia  %1!, { r8, r9 } \n\t" 
+
+/*
+ * ARM (no DSP extension): INIT / CORE / STOP together form ONE asm
+ * statement.  MULADDC_INIT opens "asm(" with its first instruction,
+ * MULADDC_CORE contributes instruction strings, and MULADDC_STOP adds
+ * the last instruction, the operand and clobber lists, and closes the
+ * statement.
+ *
+ * Operands (see MULADDC_STOP): %0 = s, %1 = d, %2 = c, %3 = b.
+ * "adds %0, #0" adds zero to s; presumably its purpose is to start
+ * with a cleared carry flag -- confirm against the ARM ARM.
+ * MULADDC_CORE handles one word: adcs adds the carry c (plus the
+ * pending carry flag) to *d, umull forms the product of b and *s,
+ * adc moves the high word plus carry flag into c, adds accumulates
+ * the low word, and the result is stored to *d (post-increment).
+ * The final "adc %2, %2, #0" folds the last carry flag into c.
+ */
+#define MULADDC_INIT                                   \
+             asm( "adds   %0, #0         \n\t"
+
+#define MULADDC_CORE                                   \
+                  "ldr    r5, [%1]        \n\t"        \
+                  "ldr    r4, [%0], #4    \n\t"        \
+                  "adcs   r5, r5, %2      \n\t"        \
+                  "umull  r6, r7, %3, r4  \n\t"        \
+                  "adc    %2, r7, #0      \n\t"        \
+                  "adds   r5, r5, r6      \n\t"        \
+                  "str    r5, [%1], #4    \n\t"
+
+#define MULADDC_STOP                                            \
+                  "adc    %2, %2, #0     "                      \
+                  : "=r" (s), "=r" (d), "=r" (c)                \
+                  : "r" (b), "0" (s), "1" (d), "2" (c)          \
+                  : "r4", "r5", "r6", "r7", "memory", "cc" );
+
+#endif /* __ARM_FEATURE_DSP */
+#endif /* ARMv3 */
+
+#if defined(__alpha__)
+
+/*
+ * Alpha multiply-accumulate step (64-bit words).
+ *
+ * Register use, as set up below: $1 = s, $2 = d, $3 = c, $4 = b;
+ * $5-$7 are scratch.  MULADDC_CORE loads a source word, forms the low
+ * product with mulq ($7) and the high product with umulh ($6), adds
+ * the incoming carry and then the destination word to the low part,
+ * deriving each carry-out with cmpult, stores the low part through $2
+ * and sums the high product and both carry bits into $3.  Both
+ * pointers advance by 8 per step.
+ *
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one in MULADDC_STOP, so
+ * nothing here tells the compiler that these registers must be
+ * preserved between the statements -- confirm before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "ldq    $1, %0         " :: "m" (s));  \
+    asm( "ldq    $2, %0         " :: "m" (d));  \
+    asm( "ldq    $3, %0         " :: "m" (c));  \
+    asm( "ldq    $4, %0         " :: "m" (b));
+
+#define MULADDC_CORE                            \
+    asm( "ldq    $6,  0($1)     " );            \
+    asm( "addq   $1,  8, $1     " );            \
+    asm( "mulq   $6, $4, $7     " );            \
+    asm( "umulh  $6, $4, $6     " );            \
+    asm( "addq   $7, $3, $7     " );            \
+    asm( "cmpult $7, $3, $3     " );            \
+    asm( "ldq    $5,  0($2)     " );            \
+    asm( "addq   $7, $5, $7     " );            \
+    asm( "cmpult $7, $5, $5     " );            \
+    asm( "stq    $7,  0($2)     " );            \
+    asm( "addq   $2,  8, $2     " );            \
+    asm( "addq   $6, $3, $3     " );            \
+    asm( "addq   $5, $3, $3     " );
+
+#define MULADDC_STOP                            \
+    asm( "stq    $3, %0         " : "=m" (c));  \
+    asm( "stq    $2, %0         " : "=m" (d));  \
+    asm( "stq    $1, %0         " : "=m" (s) :: \
+    "$1", "$2", "$3", "$4", "$5", "$6", "$7" );
+
+#endif /* Alpha */
+
+#if defined(__mips__)
+
+/*
+ * MIPS multiply-accumulate step (32-bit words).
+ *
+ * Register use, as set up below: $10 = s, $11 = d, $12 = c, $13 = b;
+ * $9, $14 and $15 are scratch.  MULADDC_CORE multiplies the source
+ * word by b with multu and reads the result back with mflo/mfhi, adds
+ * the incoming carry and then the destination word to the low part,
+ * deriving each carry-out with sltu, stores the low part through $11
+ * and sums the high part and both carry bits into $12.  Both pointers
+ * advance by 4 per step.
+ *
+ * NOTE(review): the pointer increments use addi rather than addiu;
+ * confirm that the trapping form is acceptable for address
+ * arithmetic on the targets of interest.
+ * NOTE(review): multu writes HI/LO, which are not named in the
+ * clobber list -- confirm whether the toolchain requires them.
+ * NOTE(review): every instruction is its own asm statement and the
+ * clobber list appears only on the last one in MULADDC_STOP, so
+ * nothing here tells the compiler that these registers must be
+ * preserved between the statements -- confirm before enabling.
+ */
+#define MULADDC_INIT                            \
+    asm( "lw     $10, %0        " :: "m" (s));  \
+    asm( "lw     $11, %0        " :: "m" (d));  \
+    asm( "lw     $12, %0        " :: "m" (c));  \
+    asm( "lw     $13, %0        " :: "m" (b));
+
+#define MULADDC_CORE                            \
+    asm( "lw     $14, 0($10)    " );            \
+    asm( "multu  $13, $14       " );            \
+    asm( "addi   $10, $10, 4    " );            \
+    asm( "mflo   $14            " );            \
+    asm( "mfhi   $9             " );            \
+    asm( "addu   $14, $12, $14  " );            \
+    asm( "lw     $15, 0($11)    " );            \
+    asm( "sltu   $12, $14, $12  " );            \
+    asm( "addu   $15, $14, $15  " );            \
+    asm( "sltu   $14, $15, $14  " );            \
+    asm( "addu   $12, $12, $9   " );            \
+    asm( "sw     $15, 0($11)    " );            \
+    asm( "addu   $12, $12, $14  " );            \
+    asm( "addi   $11, $11, 4    " );
+
+#define MULADDC_STOP                            \
+    asm( "sw     $12, %0        " : "=m" (c));  \
+    asm( "sw     $11, %0        " : "=m" (d));  \
+    asm( "sw     $10, %0        " : "=m" (s) :: \
+    "$9", "$10", "$11", "$12", "$13", "$14", "$15" );
+
+#endif /* MIPS */
+#endif /* GNUC */
+
+#if (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+
+/*
+ * 32-bit x86, MSVC/Watcom "__asm" syntax.
+ *
+ * Register use, as set up below: esi = s, edi = d, ecx = c, ebx = b.
+ * MULADDC_CORE loads the next source word into eax with lodsd,
+ * multiplies it by ebx (result in edx:eax), adds the incoming carry
+ * and the destination word [edi] to eax while propagating each
+ * carry-out into edx with "adc edx, 0", keeps edx as the next carry
+ * in ecx and writes eax back with stosd.
+ */
+#define MULADDC_INIT                            \
+    __asm   mov     esi, s                      \
+    __asm   mov     edi, d                      \
+    __asm   mov     ecx, c                      \
+    __asm   mov     ebx, b
+
+#define MULADDC_CORE                            \
+    __asm   lodsd                               \
+    __asm   mul     ebx                         \
+    __asm   add     eax, ecx                    \
+    __asm   adc     edx, 0                      \
+    __asm   add     eax, [edi]                  \
+    __asm   adc     edx, 0                      \
+    __asm   mov     ecx, edx                    \
+    __asm   stosd
+
+#if defined(POLARSSL_HAVE_SSE2)
+
+/*
+ * EMIT places one raw opcode byte in the instruction stream via the
+ * MSVC "_emit" pseudo-instruction.
+ */
+#define EMIT __asm _emit
+
+/*
+ * Eight-word multiply-accumulate step, hand-assembled as raw opcode
+ * bytes.
+ *
+ * NOTE(review): decoded by hand, to be verified against the Intel
+ * SDM: 0F 6E = movd mm, r/m32; 0F 7E = movd r/m32, mm;
+ * 0F F4 = pmuludq; 0F D4 = paddq; 0F 73 D1 20 = psrlq mm1, 32;
+ * 83 C7 20 / 83 C6 20 = add edi, 32 / add esi, 32.  On that reading
+ * the sequence starts by copying ecx and ebx into MMX registers and
+ * ends by copying the carry back into ecx, i.e. it relies on the
+ * register assignment made by MULADDC_INIT in this section.
+ */
+#define MULADDC_HUIT                            \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0xC9             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0xC3             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x1F             \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCB             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x16             \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xD0             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x66  EMIT 0x04  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xE0             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x76  EMIT 0x08  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xF0             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x7E  EMIT 0x0C  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xF8             \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCA             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x5F  EMIT 0x04  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xDC             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x08  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xEE             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x67  EMIT 0x0C  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xFC             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x0F             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x56  EMIT 0x10  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xD0             \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x66  EMIT 0x14  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xE0             \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCB             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x76  EMIT 0x18  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xF0             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x04  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x5E  EMIT 0x1C  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xD8             \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCD             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x10  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xD5             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x08  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCF             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x14  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xE5             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x0C  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCA             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x18  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xF5             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x10  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCC             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x1C  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xDD             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x14  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCE             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x18  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCB             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x1C  \
+    EMIT 0x83  EMIT 0xC7  EMIT 0x20             \
+    EMIT 0x83  EMIT 0xC6  EMIT 0x20             \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0xC9
+
+/*
+ * SSE2 build epilogue: emits the two bytes 0F 77 (presumably emms, to
+ * leave MMX state -- verify against the Intel SDM) and then writes
+ * ecx, edi and esi back to c, d and s.
+ *
+ * The last line ends with a line continuation, so the blank line
+ * that follows is part of the macro; keep that blank line in place.
+ */
+#define MULADDC_STOP                            \
+    EMIT 0x0F  EMIT 0x77                        \
+    __asm   mov     c, ecx                      \
+    __asm   mov     d, edi                      \
+    __asm   mov     s, esi                      \
+
+#else
+
+/*
+ * Non-SSE2 epilogue: writes ecx, edi and esi back to c, d and s.
+ *
+ * The last line ends with a line continuation, so the blank line
+ * that follows is part of the macro; keep that blank line in place.
+ */
+#define MULADDC_STOP                            \
+    __asm   mov     c, ecx                      \
+    __asm   mov     d, edi                      \
+    __asm   mov     s, esi                      \
+
+#endif /* SSE2 */
+#endif /* MSVC */
+
+#endif /* POLARSSL_HAVE_ASM */
+
+#if !defined(MULADDC_CORE)
+#if defined(POLARSSL_HAVE_LONGLONG)
+
+/*
+ * Portable C fallback using a double-width intermediate.
+ *
+ * The three macros expand inside the caller: MULADDC_INIT opens a
+ * scope and declares the temporaries, each MULADDC_CORE multiplies
+ * the word at s by b, adds the carry c and the word at d, stores the
+ * low half back through d and keeps the high half as the new c, then
+ * advances s and d.  MULADDC_STOP closes the scope.  s, d, c and b
+ * are the caller's variables.
+ *
+ * NOTE(review): this variant is written in terms of t_dbl / t_int
+ * while the half-word fallback in this file uses t_uint -- confirm
+ * that both spellings are declared by the bignum header in use.
+ */
+#define MULADDC_INIT                    \
+{                                       \
+    t_dbl r;                            \
+    t_int r0;                           \
+    t_int r1;
+
+#define MULADDC_CORE                    \
+    r  = (t_dbl) b * *s;                \
+    s++;                                \
+    r0 = (t_int) r;                     \
+    r1 = (t_int)( r >> biL );           \
+    r0 += c;                            \
+    if( r0 < c )                        \
+        r1++;                           \
+    r0 += *d;                           \
+    if( r0 < *d )                       \
+        r1++;                           \
+    c = r1;                             \
+    *d = r0;                            \
+    d++;
+
+#define MULADDC_STOP                    \
+}
+
+#else
+/*
+ * Portable C fallback without a double-width type: the product is
+ * assembled from four half-word partial products.
+ *
+ * MULADDC_INIT opens a scope, declares the temporaries and splits b
+ * into its low (b0) and high (b1) halves.  Each MULADDC_CORE splits
+ * the word at s the same way, forms the partial products, sums them
+ * into r1:r0 with explicit carry detection, adds the carry c and the
+ * word at d, stores r0 through d and keeps r1 as the new c, then
+ * advances s and d.  MULADDC_STOP closes the scope.  s, d, c and b
+ * are the caller's variables.
+ */
+#define MULADDC_INIT                    \
+{                                       \
+    t_uint b0, b1;                      \
+    t_uint s0, s1;                      \
+    t_uint rx, ry;                      \
+    t_uint r0, r1;                      \
+    b1 = ( b >> biH );                  \
+    b0 = ( b << biH ) >> biH;
+
+#define MULADDC_CORE                    \
+    s1 = ( *s >> biH );                 \
+    s0 = ( *s << biH ) >> biH;          \
+    s++;                                \
+    r0 = s0 * b0;                       \
+    r1 = s1 * b1;                       \
+    rx = s0 * b1;                       \
+    ry = s1 * b0;                       \
+    r1 += ( rx >> biH );                \
+    r1 += ( ry >> biH );                \
+    rx <<= biH;                         \
+    ry <<= biH;                         \
+    r0 += rx; if( r0 < rx ) r1++;       \
+    r0 += ry; if( r0 < ry ) r1++;       \
+    r0 += c;  if( r0 < c  ) r1++;       \
+    r0 += *d; if( r0 < *d ) r1++;       \
+    c = r1;                             \
+    *d = r0;                            \
+    d++;
+
+#define MULADDC_STOP                    \
+}
+
+#endif /* C (generic)  */
+#endif /* C (longlong) */
+
+#endif /* bn_mul.h */
--- a/polarssl/config.h
+++ b/polarssl/config.h
--- a/polarssl/rsa.h
+++ b/polarssl/rsa.h
@ -0,0 +1,633 @@
+/**
+ * \file rsa.h
+ *
+ * \brief The RSA public-key cryptosystem
+ *
+ *  Copyright (C) 2006-2010, Brainspark B.V.
+ *
+ *  This file is part of PolarSSL (http://www.polarssl.org)
+ *  Lead Maintainer: Paul Bakker <polarssl_maintainer at polarssl.org>
+ *
+ *  All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef POLARSSL_RSA_H
+#define POLARSSL_RSA_H
+
+#include "bignum.h"
+
+/*
+ * RSA Error codes
+ */
+#define POLARSSL_ERR_RSA_BAD_INPUT_DATA                    -0x4080  /**< Bad input parameters to function. */
+#define POLARSSL_ERR_RSA_INVALID_PADDING                   -0x4100  /**< Input data contains invalid padding and is rejected. */
+#define POLARSSL_ERR_RSA_KEY_GEN_FAILED                    -0x4180  /**< Something failed during generation of a key. */
+#define POLARSSL_ERR_RSA_KEY_CHECK_FAILED                  -0x4200  /**< Key failed to pass the library's validity check. */
+#define POLARSSL_ERR_RSA_PUBLIC_FAILED                     -0x4280  /**< The public key operation failed. */
+#define POLARSSL_ERR_RSA_PRIVATE_FAILED                    -0x4300  /**< The private key operation failed. */
+#define POLARSSL_ERR_RSA_VERIFY_FAILED                     -0x4380  /**< The PKCS#1 verification failed. */
+#define POLARSSL_ERR_RSA_OUTPUT_TOO_LARGE                  -0x4400  /**< The output buffer for decryption is not large enough. */
+#define POLARSSL_ERR_RSA_RNG_FAILED                        -0x4480  /**< The random generator failed to generate non-zeros. */
+
+/*
+ * PKCS#1 constants
+ */
+#define SIG_RSA_RAW     0
+#define SIG_RSA_MD2     2
+#define SIG_RSA_MD4     3
+#define SIG_RSA_MD5     4
+#define SIG_RSA_SHA1    5
+#define SIG_RSA_SHA224 14
+#define SIG_RSA_SHA256 11
+#define SIG_RSA_SHA384 12
+#define SIG_RSA_SHA512 13
+
+#define RSA_PUBLIC      0
+#define RSA_PRIVATE     1
+
+#define RSA_PKCS_V15    0
+#define RSA_PKCS_V21    1
+
+#define RSA_SIGN        1
+#define RSA_CRYPT       2
+
+#define ASN1_STR_CONSTRUCTED_SEQUENCE   "\x30"
+#define ASN1_STR_NULL                   "\x05"
+#define ASN1_STR_OID                    "\x06"
+#define ASN1_STR_OCTET_STRING           "\x04"
+
+#define OID_DIGEST_ALG_MDX              "\x2A\x86\x48\x86\xF7\x0D\x02\x00"
+#define OID_HASH_ALG_SHA1               "\x2b\x0e\x03\x02\x1a"
+#define OID_HASH_ALG_SHA2X              "\x60\x86\x48\x01\x65\x03\x04\x02\x00"
+
+#define OID_ISO_MEMBER_BODIES           "\x2a"
+#define OID_ISO_IDENTIFIED_ORG          "\x2b"
+
+/*
+ * ISO Member bodies OID parts
+ */
+#define OID_COUNTRY_US                  "\x86\x48"
+#define OID_RSA_DATA_SECURITY           "\x86\xf7\x0d"
+
+/*
+ * ISO Identified organization OID parts
+ */
+#define OID_OIW_SECSIG_SHA1             "\x0e\x03\x02\x1a"
+
+/*
+ * DigestInfo ::= SEQUENCE {
+ *   digestAlgorithm DigestAlgorithmIdentifier,
+ *   digest Digest }
+ *
+ * DigestAlgorithmIdentifier ::= AlgorithmIdentifier
+ *
+ * Digest ::= OCTET STRING
+ *
+ * Each ASN1_HASH_* macro below is a string literal holding the leading
+ * bytes of a DER-encoded DigestInfo: everything up to and including the
+ * tag and length of the digest OCTET STRING, i.e. the bytes that come
+ * before the raw digest itself.
+ *
+ * ASN1_HASH_MDX: 18-byte prefix; the encoded lengths (0x20 outer, 0x0C
+ * algorithm, 0x10 digest) describe a 16-byte digest with NULL parameters.
+ * The last byte of OID_DIGEST_ALG_MDX is 0x00, presumably a placeholder
+ * that the caller overwrites per algorithm -- confirm in rsa.c.
+ * NOTE(review): unlike its siblings this macro is wrapped in parentheses,
+ * so it cannot be concatenated with an adjacent string literal.
+ */
+#define ASN1_HASH_MDX                           \
+(                                               \
+    ASN1_STR_CONSTRUCTED_SEQUENCE "\x20"        \
+      ASN1_STR_CONSTRUCTED_SEQUENCE "\x0C"      \
+        ASN1_STR_OID "\x08"                     \
+      OID_DIGEST_ALG_MDX                        \
+    ASN1_STR_NULL "\x00"                        \
+      ASN1_STR_OCTET_STRING "\x10"              \
+)
+/*
+ * ASN1_HASH_SHA1: 15-byte prefix; lengths 0x21 / 0x09 / 0x14 describe a
+ * 20-byte digest whose AlgorithmIdentifier carries NULL parameters.
+ */
+#define ASN1_HASH_SHA1                          \
+    ASN1_STR_CONSTRUCTED_SEQUENCE "\x21"        \
+      ASN1_STR_CONSTRUCTED_SEQUENCE "\x09"      \
+        ASN1_STR_OID "\x05"                     \
+      OID_HASH_ALG_SHA1                         \
+        ASN1_STR_NULL "\x00"                    \
+      ASN1_STR_OCTET_STRING "\x14"
+/*
+ * ASN1_HASH_SHA1_ALT: 13-byte prefix; same as ASN1_HASH_SHA1 but with the
+ * NULL parameters left out, hence the shorter lengths 0x1F / 0x07.
+ */
+#define ASN1_HASH_SHA1_ALT                      \
+    ASN1_STR_CONSTRUCTED_SEQUENCE "\x1F"        \
+      ASN1_STR_CONSTRUCTED_SEQUENCE "\x07"      \
+        ASN1_STR_OID "\x05"                     \
+      OID_HASH_ALG_SHA1                         \
+      ASN1_STR_OCTET_STRING "\x14"
+/*
+ * ASN1_HASH_SHA2X: 19-byte template.  The outer length 0x11 and the digest
+ * length 0x00 are the values for an empty digest, and the last byte of
+ * OID_HASH_ALG_SHA2X is 0x00; presumably the caller patches these three
+ * bytes for the selected SHA-2 variant -- confirm in rsa.c.
+ */
+#define ASN1_HASH_SHA2X                         \
+    ASN1_STR_CONSTRUCTED_SEQUENCE "\x11"        \
+      ASN1_STR_CONSTRUCTED_SEQUENCE "\x0d"      \
+        ASN1_STR_OID "\x09"                     \
+      OID_HASH_ALG_SHA2X                        \
+        ASN1_STR_NULL "\x00"                    \
+      ASN1_STR_OCTET_STRING "\x00"
+
+/**
+ * \brief          RSA context structure
+ */
+typedef struct
+{
+    int ver;                    /*!<  always 0          */
+    size_t len;                 /*!<  size(N) in bytes  */
+
+    mpi N;                      /*!<  public modulus    */
+    mpi E;                      /*!<  public exponent   */
+
+    mpi D;                      /*!<  private exponent  */
+    mpi P;                      /*!<  1st prime factor  */
+    mpi Q;                      /*!<  2nd prime factor  */
+    mpi DP;                     /*!<  D % (P - 1)       */
+    mpi DQ;                     /*!<  D % (Q - 1)       */
+    mpi QP;                     /*!<  1 / (Q % P)       */
+
+    mpi RN;                     /*!<  cached R^2 mod N  */
+    mpi RP;                     /*!<  cached R^2 mod P  */
+    mpi RQ;                     /*!<  cached R^2 mod Q  */
+
+    int padding;                /*!<  RSA_PKCS_V15 for 1.5 padding and
+                                      RSA_PKCS_V21 for OAEP/PSS         */
+    int hash_id;                /*!<  Hash identifier of md_type_t as
+                                      specified in the md.h header file
+                                      for the EME-OAEP and EMSA-PSS
+                                      encoding                          */
+}
+rsa_context;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief          Initialize an RSA context
+ *
+ *                 Note: Set padding to RSA_PKCS_V21 for the RSAES-OAEP
+ *                 encryption scheme and the RSASSA-PSS signature scheme.
+ *
+ * \param ctx      RSA context to be initialized
+ * \param padding  RSA_PKCS_V15 or RSA_PKCS_V21
+ * \param hash_id  RSA_PKCS_V21 hash identifier
+ *
+ * \note           The hash_id parameter is actually ignored
+ *                 when using RSA_PKCS_V15 padding.
+ */
+void rsa_init( rsa_context *ctx,
+               int padding,
+               int hash_id);
+
+/**
+ * \brief          Generate an RSA keypair
+ *
+ * \param ctx      RSA context that will hold the key
+ * \param f_rng    RNG function
+ * \param p_rng    RNG parameter
+ * \param nbits    size of the public key in bits
+ * \param exponent public exponent (e.g., 65537)
+ *
+ * \note           rsa_init() must be called beforehand to setup
+ *                 the RSA context.
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ */
+int rsa_gen_key( rsa_context *ctx,
+                 int (*f_rng)(void *, unsigned char *, size_t),
+                 void *p_rng,
+                 unsigned int nbits, int exponent );
+
+/**
+ * \brief          Check a public RSA key
+ *
+ * \param ctx      RSA context to be checked
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ */
+int rsa_check_pubkey( const rsa_context *ctx );
+
+/**
+ * \brief          Check a private RSA key
+ *
+ * \param ctx      RSA context to be checked
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ */
+int rsa_check_privkey( const rsa_context *ctx );
+
+/**
+ * \brief          Do an RSA public key operation
+ *
+ * \param ctx      RSA context
+ * \param input    input buffer
+ * \param output   output buffer
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           This function does NOT take care of message
+ *                 padding. Also, be sure to set input[0] = 0 or ensure that
+ *                 input is smaller than N.
+ *
+ * \note           The input and output buffers must be large
+ *                 enough (eg. 128 bytes if RSA-1024 is used).
+ */
+int rsa_public( rsa_context *ctx,
+                const unsigned char *input,
+                unsigned char *output );
+
+/**
+ * \brief          Do an RSA private key operation
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Needed for blinding)
+ * \param p_rng    RNG parameter
+ * \param input    input buffer
+ * \param output   output buffer
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The input and output buffers must be large
+ *                 enough (eg. 128 bytes if RSA-1024 is used).
+ */
+int rsa_private( rsa_context *ctx,
+                 int (*f_rng)(void *, unsigned char *, size_t),
+                 void *p_rng,
+                 const unsigned char *input,
+                 unsigned char *output );
+
+/**
+ * \brief          Generic wrapper to perform a PKCS#1 encryption using the
+ *                 mode from the context. Add the message padding, then do an
+ *                 RSA operation.
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Needed for padding and PKCS#1 v2.1 encoding
+ *                               and RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param ilen     contains the plaintext length
+ * \param input    buffer holding the data to be encrypted
+ * \param output   buffer that will hold the ciphertext
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The output buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ */
+int rsa_pkcs1_encrypt( rsa_context *ctx,
+                       int (*f_rng)(void *, unsigned char *, size_t),
+                       void *p_rng,
+                       int mode, size_t ilen,
+                       const unsigned char *input,
+                       unsigned char *output );
+
+/**
+ * \brief          Perform a PKCS#1 v1.5 encryption (RSAES-PKCS1-v1_5-ENCRYPT)
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Needed for padding and RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param ilen     contains the plaintext length
+ * \param input    buffer holding the data to be encrypted
+ * \param output   buffer that will hold the ciphertext
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The output buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ */
+int rsa_rsaes_pkcs1_v15_encrypt( rsa_context *ctx,
+                                 int (*f_rng)(void *, unsigned char *, size_t),
+                                 void *p_rng,
+                                 int mode, size_t ilen,
+                                 const unsigned char *input,
+                                 unsigned char *output );
+
+/**
+ * \brief          Perform a PKCS#1 v2.1 OAEP encryption (RSAES-OAEP-ENCRYPT)
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Needed for padding and PKCS#1 v2.1 encoding
+ *                               and RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param label    buffer holding the custom label to use
+ * \param label_len contains the label length
+ * \param ilen     contains the plaintext length
+ * \param input    buffer holding the data to be encrypted
+ * \param output   buffer that will hold the ciphertext
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The output buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ */
+int rsa_rsaes_oaep_encrypt( rsa_context *ctx,
+                            int (*f_rng)(void *, unsigned char *, size_t),
+                            void *p_rng,
+                            int mode,
+                            const unsigned char *label, size_t label_len,
+                            size_t ilen,
+                            const unsigned char *input,
+                            unsigned char *output );
+
+/**
+ * \brief          Generic wrapper to perform a PKCS#1 decryption using the
+ *                 mode from the context. Do an RSA operation, then remove
+ *                 the message padding
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Only needed for RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param olen     will contain the plaintext length
+ * \param input    buffer holding the encrypted data
+ * \param output   buffer that will hold the plaintext
+ * \param output_max_len    maximum length of the output buffer
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The output buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used) otherwise
+ *                 an error is thrown.
+ */
+int rsa_pkcs1_decrypt( rsa_context *ctx,
+                       int (*f_rng)(void *, unsigned char *, size_t),
+                       void *p_rng,
+                       int mode, size_t *olen,
+                       const unsigned char *input,
+                       unsigned char *output,
+                       size_t output_max_len );
+
+/**
+ * \brief          Perform a PKCS#1 v1.5 decryption (RSAES-PKCS1-v1_5-DECRYPT)
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Only needed for RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param olen     will contain the plaintext length
+ * \param input    buffer holding the encrypted data
+ * \param output   buffer that will hold the plaintext
+ * \param output_max_len    maximum length of the output buffer
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The output buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used) otherwise
+ *                 an error is thrown.
+ */
+int rsa_rsaes_pkcs1_v15_decrypt( rsa_context *ctx,
+                                 int (*f_rng)(void *, unsigned char *, size_t),
+                                 void *p_rng,
+                                 int mode, size_t *olen,
+                                 const unsigned char *input,
+                                 unsigned char *output,
+                                 size_t output_max_len );
+
+/**
+ * \brief          Perform a PKCS#1 v2.1 OAEP decryption (RSAES-OAEP-DECRYPT)
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Only needed for RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param label    buffer holding the custom label to use
+ * \param label_len contains the label length
+ * \param olen     will contain the plaintext length
+ * \param input    buffer holding the encrypted data
+ * \param output   buffer that will hold the plaintext
+ * \param output_max_len    maximum length of the output buffer
+ *
+ * \return         0 if successful, or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The output buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used) otherwise
+ *                 an error is thrown.
+ */
+int rsa_rsaes_oaep_decrypt( rsa_context *ctx,
+                            int (*f_rng)(void *, unsigned char *, size_t),
+                            void *p_rng,
+                            int mode,
+                            const unsigned char *label, size_t label_len,
+                            size_t *olen,
+                            const unsigned char *input,
+                            unsigned char *output,
+                            size_t output_max_len );
+
+/**
+ * \brief          Generic wrapper to perform a PKCS#1 signature using the
+ *                 mode from the context. Do a private RSA operation to sign
+ *                 a message digest
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Needed for PKCS#1 v2.1 encoding and for
+ *                 RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param hash_id  SIG_RSA_RAW, SIG_RSA_MD{2,4,5} or SIG_RSA_SHA{1,224,256,384,512}
+ * \param hashlen  message digest length (for SIG_RSA_RAW only)
+ * \param hash     buffer holding the message digest
+ * \param sig      buffer that will hold the ciphertext
+ *
+ * \return         0 if the signing operation was successful,
+ *                 or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The "sig" buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ *
+ * \note           In case of PKCS#1 v2.1 encoding keep in mind that
+ *                 the hash_id in the RSA context is the one used for the
+ *                 encoding. hash_id in the function call is the type of hash
+ *                 that is encoded. According to RFC 3447 it is advised to
+ *                 keep both hashes the same.
+ */
+int rsa_pkcs1_sign( rsa_context *ctx,
+                    int (*f_rng)(void *, unsigned char *, size_t),
+                    void *p_rng,
+                    int mode,
+                    int hash_id,
+                    unsigned int hashlen,
+                    const unsigned char *hash,
+                    unsigned char *sig );
+
+/**
+ * \brief          Perform a PKCS#1 v1.5 signature (RSASSA-PKCS1-v1_5-SIGN)
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Only needed for RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param hash_id  SIG_RSA_RAW, SIG_RSA_MD{2,4,5} or SIG_RSA_SHA{1,224,256,384,512}
+ * \param hashlen  message digest length (for SIG_RSA_RAW only)
+ * \param hash     buffer holding the message digest
+ * \param sig      buffer that will hold the ciphertext
+ *
+ * \return         0 if the signing operation was successful,
+ *                 or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The "sig" buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ */
+int rsa_rsassa_pkcs1_v15_sign( rsa_context *ctx,
+                               int (*f_rng)(void *, unsigned char *, size_t),
+                               void *p_rng,
+                               int mode,
+                               int hash_id,
+                               unsigned int hashlen,
+                               const unsigned char *hash,
+                               unsigned char *sig );
+
+/**
+ * \brief          Perform a PKCS#1 v2.1 PSS signature (RSASSA-PSS-SIGN)
+ *
+ * \param ctx      RSA context
+ * \param f_rng    RNG function (Needed for PKCS#1 v2.1 encoding and for
+ *                               RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param hash_id  SIG_RSA_RAW, SIG_RSA_MD{2,4,5} or SIG_RSA_SHA{1,224,256,384,512}
+ * \param hashlen  message digest length (for SIG_RSA_RAW only)
+ * \param hash     buffer holding the message digest
+ * \param sig      buffer that will hold the ciphertext
+ *
+ * \return         0 if the signing operation was successful,
+ *                 or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The "sig" buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ *
+ * \note           In case of PKCS#1 v2.1 encoding keep in mind that
+ *                 the hash_id in the RSA context is the one used for the
+ *                 encoding. hash_id in the function call is the type of hash
+ *                 that is encoded. According to RFC 3447 it is advised to
+ *                 keep both hashes the same.
+ */
+int rsa_rsassa_pss_sign( rsa_context *ctx,
+                         int (*f_rng)(void *, unsigned char *, size_t),
+                         void *p_rng,
+                         int mode,
+                         int hash_id,
+                         unsigned int hashlen,
+                         const unsigned char *hash,
+                         unsigned char *sig );
+
+/**
+ * \brief          Generic wrapper to perform a PKCS#1 verification using the
+ *                 mode from the context. Do a public RSA operation and check
+ *                 the message digest
+ *
+ * \param ctx      points to an RSA public key
+ * \param f_rng    RNG function (Only needed for RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param hash_id  SIG_RSA_RAW, SIG_RSA_MD{2,4,5} or SIG_RSA_SHA{1,224,256,384,512}
+ * \param hashlen  message digest length (for SIG_RSA_RAW only)
+ * \param hash     buffer holding the message digest
+ * \param sig      buffer holding the ciphertext
+ *
+ * \return         0 if the verify operation was successful,
+ *                 or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The "sig" buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ *
+ * \note           In case of PKCS#1 v2.1 encoding keep in mind that
+ *                 the hash_id in the RSA context is the one used for the
+ *                 verification. hash_id in the function call is the type of hash
+ *                 that is verified. According to RFC 3447 it is advised to
+ *                 keep both hashes the same.
+ */
+int rsa_pkcs1_verify( rsa_context *ctx,
+                      int (*f_rng)(void *, unsigned char *, size_t),
+                      void *p_rng,
+                      int mode,
+                      int hash_id,
+                      unsigned int hashlen,
+                      const unsigned char *hash,
+                      const unsigned char *sig );
+
+/**
+ * \brief          Perform a PKCS#1 v1.5 verification (RSASSA-PKCS1-v1_5-VERIFY)
+ *
+ * \param ctx      points to an RSA public key
+ * \param f_rng    RNG function (Only needed for RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param hash_id  SIG_RSA_RAW, SIG_RSA_MD{2,4,5} or SIG_RSA_SHA{1,224,256,384,512}
+ * \param hashlen  message digest length (for SIG_RSA_RAW only)
+ * \param hash     buffer holding the message digest
+ * \param sig      buffer holding the ciphertext
+ *
+ * \return         0 if the verify operation was successful,
+ *                 or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The "sig" buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ */
+int rsa_rsassa_pkcs1_v15_verify( rsa_context *ctx,
+                                 int (*f_rng)(void *, unsigned char *, size_t),
+                                 void *p_rng,
+                                 int mode,
+                                 int hash_id,
+                                 unsigned int hashlen,
+                                 const unsigned char *hash,
+                                 const unsigned char *sig );
+
+/**
+ * \brief          Perform a PKCS#1 v2.1 PSS verification (RSASSA-PSS-VERIFY).
+ *                 Do a public RSA operation and check the message digest.
+ *
+ * \param ctx      points to an RSA public key
+ * \param f_rng    RNG function (Only needed for RSA_PRIVATE)
+ * \param p_rng    RNG parameter
+ * \param mode     RSA_PUBLIC or RSA_PRIVATE
+ * \param hash_id  SIG_RSA_RAW, SIG_RSA_MD{2,4,5} or SIG_RSA_SHA{1,224,256,384,512}
+ * \param hashlen  message digest length (for SIG_RSA_RAW only)
+ * \param hash     buffer holding the message digest
+ * \param sig      buffer holding the ciphertext
+ *
+ * \return         0 if the verify operation was successful,
+ *                 or an POLARSSL_ERR_RSA_XXX error code
+ *
+ * \note           The "sig" buffer must be as large as the size
+ *                 of ctx->N (eg. 128 bytes if RSA-1024 is used).
+ *
+ * \note           In case of PKCS#1 v2.1 encoding keep in mind that
+ *                 the hash_id in the RSA context is the one used for the
+ *                 verification. hash_id in the function call is the type of hash
+ *                 that is verified. According to RFC 3447 it is advised to
+ *                 keep both hashes the same.
+ */
+int rsa_rsassa_pss_verify( rsa_context *ctx,
+                           int (*f_rng)(void *, unsigned char *, size_t),
+                           void *p_rng,
+                           int mode,
+                           int hash_id,
+                           unsigned int hashlen,
+                           const unsigned char *hash,
+                           unsigned char *sig );
+
+/**
+ * \brief          Free the components of an RSA key
+ *
+ * \param ctx      RSA Context to free
+ */
+void rsa_free( rsa_context *ctx );
+
+/**
+ * \brief          Checkup routine
+ *
+ * \return         0 if successful, or 1 if the test failed
+ */
+int rsa_self_test( int verbose );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* rsa.h */
--- a/random.c
+++ b/random.c
@ -0,0 +1,120 @@
+/*
+ * random.c -- get random bytes
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015
+ *               Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "gnuk.h"
+#include "neug.h"
+
+#define RANDOM_BYTES_LENGTH 32
+static uint32_t random_word[RANDOM_BYTES_LENGTH/sizeof (uint32_t)];
+
+/*
+ * Hand the static random_word buffer (RANDOM_BYTES_LENGTH bytes, passed
+ * as a count of 32-bit words) to neug_init (), then call neug_get ()
+ * NEUG_PRE_LOOP times and throw the results away.
+ */
+void
+random_init (void)
+{
+  int discard;
+
+  neug_init (random_word, sizeof random_word / sizeof random_word[0]);
+
+  /* Presumably a warm-up of the generator -- confirm against neug.c.  */
+  for (discard = NEUG_PRE_LOOP; discard > 0; discard--)
+    (void)neug_get (NEUG_KICK_FILLING);
+}
+/*
+ * Tear-down hook: simply calls neug_fini () (presumably the counterpart
+ * of the neug_init () call made by random_init -- confirm in neug.c).
+ */
+void
+random_fini (void)
+{
+  neug_fini ();
+}
+
+/*
+ * Return a pointer to RANDOM_BYTES_LENGTH (32) random bytes.
+ *
+ * Calls neug_wait_full () and then returns the address of the static
+ * random_word buffer itself; no copy is made.  random_bytes_free ()
+ * zeroes that same buffer, so the bytes must be consumed before it is
+ * called.
+ */
+const uint8_t *
+random_bytes_get (void)
+{
+  neug_wait_full ();
+  return (const uint8_t *)random_word;
+}
+
+/*
+ * Release the 32 random bytes obtained from random_bytes_get ().
+ *
+ * P is accepted for symmetry with random_bytes_get () but is not used:
+ * the whole static random_word buffer is overwritten with zeros and
+ * neug_flush () is called afterwards.
+ */
+void
+random_bytes_free (const uint8_t *p)
+{
+  memset (random_word, 0, sizeof random_word);
+  neug_flush ();
+
+  (void)p;			/* Intentionally unused.  */
+}
+
+/*
+ * Write an 8-byte salt to P.
+ *
+ * The salt is made of two consecutive 32-bit words from
+ * neug_get (NEUG_KICK_FILLING), copied in the order they are obtained.
+ * P must have room for 2 * sizeof (uint32_t) bytes.
+ */
+void
+random_get_salt (uint8_t *p)
+{
+  unsigned int k;
+
+  for (k = 0; k < 2; k++)
+    {
+      uint32_t word = neug_get (NEUG_KICK_FILLING);
+
+      memcpy (p + k * sizeof word, &word, sizeof word);
+    }
+}
+
+
+/*
+ * Random byte iterator.
+ *
+ * Has the same signature as the f_rng callbacks declared in polarssl/rsa.h.
+ *
+ * ARG points to a uint8_t cursor: the offset into random_word of the next
+ * unused byte.  It is read on entry and the updated offset is written
+ * back on exit.  The cursor is not range-checked; it must be smaller
+ * than RANDOM_BYTES_LENGTH.
+ *
+ * Exactly OUT_LEN bytes are copied to OUT.  neug_wait_full () is called
+ * before every chunk is copied; each time the 32-byte buffer has been
+ * used up the cursor wraps to 0 and neug_flush () is called.
+ *
+ * Always returns 0.
+ */
+int
+random_gen (void *arg, unsigned char *out, size_t out_len)
+{
+  uint8_t *cursor = (uint8_t *)arg;
+  uint8_t pos = *cursor;
+  const unsigned char *pool = (const unsigned char *)random_word;
+
+  while (out_len != 0)
+    {
+      size_t chunk;
+
+      neug_wait_full ();
+
+      /* Take what is left in the buffer, but no more than requested.  */
+      chunk = RANDOM_BYTES_LENGTH - pos;
+      if (out_len < chunk)
+        chunk = out_len;
+
+      memcpy (out, pool + pos, chunk);
+      out += chunk;
+      out_len -= chunk;
+      pos += chunk;
+
+      if (pos >= RANDOM_BYTES_LENGTH)
+        {
+          /* Buffer exhausted: restart at offset 0 and flush.  */
+          pos = 0;
+          neug_flush ();
+        }
+    }
+
+  *cursor = pos;
+
+  return 0;
+}
--- a/random.h
+++ b/random.h
@ -0,0 +1,12 @@
+void random_init (void);
+void random_fini (void);
+
+/* 32-byte random bytes */
+const uint8_t *random_bytes_get (void);
+void random_bytes_free (const uint8_t *p);
+
+/* 8-byte salt */
+void random_get_salt (uint8_t *p);
+
+/* iterator: fills output with output_len random bytes */
+int random_gen (void *arg, unsigned char *output, size_t output_len);
--- a/rsa.c
+++ b/rsa.c
--- a/sha256.c
+++ b/sha256.c
@ -0,0 +1,225 @@
+/*
+ * sha256.c -- Compute SHA-256 hash
+ *
+ * Just for little endian architecture.
+ *
+ * Code taken from:
+ *  http://gladman.plushost.co.uk/oldsite/cryptography_technology/sha/index.php
+ *
+ *  File names are sha2.c, sha2.h, brg_types.h, brg_endian.h
+ *  in the archive sha2-07-01-07.zip.
+ *
+ * Code is modified in the style of PolarSSL API.
+ *
+ * See original copyright notice below.
+ */
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+*/
+
+#include <string.h>
+#include <stdint.h>
+#include "sha256.h"
+
+#define SHA256_MASK (SHA256_BLOCK_SIZE - 1)
+
+/*
+ * Serialize the eight 32-bit words at P into the 32 bytes at DST,
+ * most significant byte of each word first.
+ */
+static void memcpy_output_bswap32 (unsigned char *dst, const uint32_t *p)
+{
+  int word;
+
+  for (word = 0; word < 8; word++)
+    {
+      uint32_t w = p[word];
+
+      *dst++ = (unsigned char)(w >> 24);
+      *dst++ = (unsigned char)(w >> 16);
+      *dst++ = (unsigned char)(w >> 8);
+      *dst++ = (unsigned char)w;
+    }
+}
+
+/*
+ * Rotate the 32-bit value X right by N bits.  N is pasted without
+ * parentheses, so pass a plain constant strictly between 0 and 32.
+ */
+#define rotr32(x,n)   (((x) >> n) | ((x) << (32 - n)))
+
+/* Bitwise choose (X selects between Y and Z) and bitwise majority.  */
+#define ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
+#define maj(x,y,z)      (((x) & (y)) | ((z) & ((x) ^ (y))))
+
+/*
+ * Round transforms for the SHA-256 compression function.
+ *
+ * All of the macros below rely on locals named `v' (eight working
+ * words) and `p' (16-word block / schedule buffer) in the function
+ * that expands them.  vf(n,i) indexes `v' with a per-round offset, so
+ * the roles of the working words shift each round without copying.
+ */
+#define vf(n,i) v[(n - i) & 7]
+
+/* Update schedule word I in place inside the 16-word window `p'.  */
+#define hf(i) (p[i & 15] += \
+    g_1(p[(i + 14) & 15]) + p[(i + 9) & 15] + g_0(p[(i + 1) & 15]))
+
+/*
+ * One of the first 16 rounds: byte-swap input word I, then mix it in.
+ * Expands to several statements; use it only as a full statement.
+ */
+#define v_cycle0(i)                                 \
+    p[i] = __builtin_bswap32 (p[i]);                \
+    vf(7,i) += p[i] + k_0[i]                        \
+    + s_1(vf(4,i)) + ch(vf(4,i),vf(5,i),vf(6,i));   \
+    vf(3,i) += vf(7,i);                             \
+    vf(7,i) += s_0(vf(0,i))+ maj(vf(0,i),vf(1,i),vf(2,i))
+
+/*
+ * One of the later rounds: J is the base round number (16, 32, 48),
+ * I the position within that group.  Also a multi-statement macro.
+ */
+#define v_cycle(i, j)                               \
+    vf(7,i) += hf(i) + k_0[i+j]                     \
+    + s_1(vf(4,i)) + ch(vf(4,i),vf(5,i),vf(6,i));   \
+    vf(3,i) += vf(7,i);                             \
+    vf(7,i) += s_0(vf(0,i))+ maj(vf(0,i),vf(1,i),vf(2,i))
+
+/* s_0/s_1 are used on the working words, g_0/g_1 in the schedule.  */
+#define s_0(x)  (rotr32((x),  2) ^ rotr32((x), 13) ^ rotr32((x), 22))
+#define s_1(x)  (rotr32((x),  6) ^ rotr32((x), 11) ^ rotr32((x), 25))
+#define g_0(x)  (rotr32((x),  7) ^ rotr32((x), 18) ^ ((x) >>  3))
+#define g_1(x)  (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))
+#define k_0     k256
+
+static const uint32_t k256[64] = {
+  0X428A2F98, 0X71374491, 0XB5C0FBCF, 0XE9B5DBA5,
+  0X3956C25B, 0X59F111F1, 0X923F82A4, 0XAB1C5ED5,
+  0XD807AA98, 0X12835B01, 0X243185BE, 0X550C7DC3,
+  0X72BE5D74, 0X80DEB1FE, 0X9BDC06A7, 0XC19BF174,
+  0XE49B69C1, 0XEFBE4786, 0X0FC19DC6, 0X240CA1CC,
+  0X2DE92C6F, 0X4A7484AA, 0X5CB0A9DC, 0X76F988DA,
+  0X983E5152, 0XA831C66D, 0XB00327C8, 0XBF597FC7,
+  0XC6E00BF3, 0XD5A79147, 0X06CA6351, 0X14292967,
+  0X27B70A85, 0X2E1B2138, 0X4D2C6DFC, 0X53380D13,
+  0X650A7354, 0X766A0ABB, 0X81C2C92E, 0X92722C85,
+  0XA2BFE8A1, 0XA81A664B, 0XC24B8B70, 0XC76C51A3,
+  0XD192E819, 0XD6990624, 0XF40E3585, 0X106AA070,
+  0X19A4C116, 0X1E376C08, 0X2748774C, 0X34B0BCB5,
+  0X391C0CB3, 0X4ED8AA4A, 0X5B9CCA4F, 0X682E6FF3,
+  0X748F82EE, 0X78A5636F, 0X84C87814, 0X8CC70208,
+  0X90BEFFFA, 0XA4506CEB, 0XBEF9A3F7, 0XC67178F2,
+};
+
+/*
+ * Compress the 64-byte block held in ctx->wbuf into ctx->state.
+ *
+ * The v_cycle0/v_cycle macros expand in place and use the locals `p'
+ * and `v' by name.  Rounds 0..15 byte-swap each word of wbuf as they
+ * consume it, and rounds 16..63 overwrite wbuf with schedule words,
+ * so the buffer content does not survive this call.
+ */
+void
+sha256_process (sha256_context *ctx)
+{
+  uint32_t i;
+  uint32_t *p = ctx->wbuf;
+  uint32_t v[8];
+
+  /* Work on a copy of the chaining value.  */
+  memcpy (v, ctx->state, 8 * sizeof (uint32_t));
+
+  /* Rounds 0..15, unrolled.  */
+  v_cycle0 ( 0); v_cycle0 ( 1); v_cycle0 ( 2); v_cycle0 ( 3);
+  v_cycle0 ( 4); v_cycle0 ( 5); v_cycle0 ( 6); v_cycle0 ( 7);
+  v_cycle0 ( 8); v_cycle0 ( 9); v_cycle0 (10); v_cycle0 (11);
+  v_cycle0 (12); v_cycle0 (13); v_cycle0 (14); v_cycle0 (15);
+
+  /* Rounds 16..63, sixteen per pass.  */
+  for (i = 16; i < 64; i += 16)
+    {
+      v_cycle ( 0, i); v_cycle ( 1, i); v_cycle ( 2, i); v_cycle ( 3, i);
+      v_cycle ( 4, i); v_cycle ( 5, i); v_cycle ( 6, i); v_cycle ( 7, i);
+      v_cycle ( 8, i); v_cycle ( 9, i); v_cycle (10, i); v_cycle (11, i);
+      v_cycle (12, i); v_cycle (13, i); v_cycle (14, i); v_cycle (15, i);
+    }
+
+  /* Add the result of this block to the chaining value.  */
+  ctx->state[0] += v[0];
+  ctx->state[1] += v[1];
+  ctx->state[2] += v[2];
+  ctx->state[3] += v[3];
+  ctx->state[4] += v[4];
+  ctx->state[5] += v[5];
+  ctx->state[6] += v[6];
+  ctx->state[7] += v[7];
+}
+
+/*
+ * Feed ILEN bytes from INPUT into the running hash CTX.
+ *
+ * Bytes are collected in ctx->wbuf; every time 64 bytes are available
+ * the block is compressed with sha256_process.  Any remainder stays
+ * in wbuf for the next call or for sha256_finish.  ctx->total keeps
+ * the byte count as a 64-bit value split over two 32-bit words.
+ *
+ * A call with ILEN == 0 is a no-op and INPUT is not touched, so
+ * (NULL, 0) is accepted.
+ */
+void
+sha256_update (sha256_context *ctx, const unsigned char *input,
+               unsigned int ilen)
+{
+  uint32_t left;
+  uint32_t fill;
+
+  /* memcpy requires valid pointers even for a zero length, so never
+     hand it a possibly-null INPUT when there is nothing to copy.  */
+  if (ilen == 0)
+    return;
+
+  left = (ctx->total[0] & SHA256_MASK);	/* bytes already buffered */
+  fill = SHA256_BLOCK_SIZE - left;	/* bytes needed to complete a block */
+
+  ctx->total[0] += ilen;
+  if (ctx->total[0] < ilen)		/* low word wrapped: carry */
+    ctx->total[1]++;
+
+  while (ilen >= fill)
+    {
+      memcpy (((unsigned char*)ctx->wbuf) + left, input, fill);
+      sha256_process (ctx);
+      input += fill;
+      ilen -= fill;
+      left = 0;
+      fill = SHA256_BLOCK_SIZE;
+    }
+
+  /* Keep the tail (possibly empty) for later.  */
+  if (ilen != 0)
+    memcpy (((unsigned char*)ctx->wbuf) + left, input, ilen);
+}
+
+/*
+ * Finish the hash in CTX: append the padding and the 64-bit message
+ * length in bits, run the final compression(s), write the 32-byte
+ * digest to OUTPUT, and clear CTX.
+ */
+void
+sha256_finish (sha256_context *ctx, unsigned char output[32])
+{
+  uint32_t last = (ctx->total[0] & SHA256_MASK);
+
+  /*
+   * Place the 0x80 padding byte right after the last message byte and
+   * clear the remainder of that word.  The word is byte-swapped around
+   * the operation so the shift count follows from the byte offset.
+   * Both constants are unsigned on purpose: a plain int 0x80 shifted
+   * left by 24 does not fit a 32-bit int, which is undefined behavior.
+   */
+  ctx->wbuf[last >> 2] = __builtin_bswap32 (ctx->wbuf[last >> 2]);
+  ctx->wbuf[last >> 2] &= UINT32_C (0xffffff80) << (8 * (~last & 3));
+  ctx->wbuf[last >> 2] |= UINT32_C (0x00000080) << (8 * (~last & 3));
+  ctx->wbuf[last >> 2] = __builtin_bswap32 (ctx->wbuf[last >> 2]);
+
+  if (last > SHA256_BLOCK_SIZE - 9)
+    {
+      /* No room for the 8-byte length: flush this block first.  */
+      if (last < 60)
+        ctx->wbuf[15] = 0;
+      sha256_process (ctx);
+      last = 0;
+    }
+  else
+    last = (last >> 2) + 1;
+
+  /* Zero the words between the padding and the length field.  */
+  while (last < 14)
+    ctx->wbuf[last++] = 0;
+
+  /* Length in bits (byte count times 8); sha256_process swaps it back.  */
+  ctx->wbuf[14] = __builtin_bswap32 ((ctx->total[0] >> 29) | (ctx->total[1] << 3));
+  ctx->wbuf[15] = __builtin_bswap32 (ctx->total[0] << 3);
+  sha256_process (ctx);
+
+  memcpy_output_bswap32 (output, ctx->state);
+  /* NOTE(review): a plain memset of an object that is not used again
+     may be removed by the optimizer -- confirm if wiping is required.  */
+  memset (ctx, 0, sizeof (sha256_context));
+}
+
+static const uint32_t initial_state[8] =
+{
+  0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+  0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+};
+
+/*
+ * Prepare CTX for a new message: reset the byte counter and load the
+ * initial hash value.  ctx->wbuf is left as it is.
+ */
+void
+sha256_start (sha256_context *ctx)
+{
+  ctx->total[0] = 0;
+  ctx->total[1] = 0;
+  memcpy (ctx->state, initial_state, sizeof initial_state);
+}
+
+/*
+ * One-shot helper: hash the ILEN bytes at INPUT and store the 32-byte
+ * digest in OUTPUT, using a context on the stack.
+ */
+void
+sha256 (const unsigned char *input, unsigned int ilen,
+        unsigned char output[32])
+{
+  sha256_context ctx;
+
+  sha256_start (&ctx);
+  sha256_update (&ctx, input, ilen);
+  sha256_finish (&ctx, output);
+}
--- a/sha256.h
+++ b/sha256.h
@ -0,0 +1,17 @@
+#define SHA256_DIGEST_SIZE  32
+#define SHA256_BLOCK_SIZE   64
+
+/* Running state of one SHA-256 computation.  */
+typedef struct
+{
+  uint32_t total[2];	/* bytes hashed so far: [0] low word, [1] high word */
+  uint32_t state[8];	/* chaining value; becomes the digest at finish */
+  uint32_t wbuf[16];	/* 64-byte block buffer, reused as schedule */
+} sha256_context;
+
+void sha256 (const unsigned char *input, unsigned int ilen,
+	     unsigned char output[32]);
+void sha256_start (sha256_context *ctx);
+void sha256_finish (sha256_context *ctx, unsigned char output[32]);
+void sha256_update (sha256_context *ctx, const unsigned char *input,
+		    unsigned int ilen);
+void sha256_process (sha256_context *ctx);
--- a/sha512.c
+++ b/sha512.c
@ -0,0 +1,215 @@
+/*
+ * sha512.c -- Compute SHA-512 hash (for little endian architecture).
+ *
+ * This module is written by gniibe, following the API of sha256.c.
+ *
+ * Copyright (C) 2014 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * Reference:
+ *
+ * [1] FIPS PUB 180-4: Secure hash Standard (SHS), March, 2012.
+ *
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include "sha512.h"
+
+#define SHA512_MASK (SHA512_BLOCK_SIZE - 1)
+
+/*
+ * Serialize the eight 64-bit words at P into the 64 bytes at DST,
+ * most significant byte of each word first.
+ */
+static void memcpy_output_bswap64 (unsigned char dst[64], const uint64_t *p)
+{
+  int word;
+  int shift;
+
+  for (word = 0; word < 8; word++)
+    for (shift = 56; shift >= 0; shift -= 8)
+      *dst++ = (unsigned char)(p[word] >> shift);
+}
+
+/*
+ * Rotate the 64-bit value X right by N bits.  N is pasted without
+ * parentheses, so pass a plain constant strictly between 0 and 64.
+ */
+#define rotr64(x,n)   (((x) >> n) | ((x) << (64 - n)))
+
+/* Bitwise choose (X selects between Y and Z) and bitwise majority.  */
+#define ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
+#define maj(x,y,z)      (((x) & (y)) | ((z) & ((x) ^ (y))))
+
+/*
+ * Round transforms for the SHA-512 compression function.
+ *
+ * All of the macros below rely on locals named `v' (eight working
+ * words) and `p' (16-word block / schedule buffer) in the function
+ * that expands them.  vf(n,i) indexes `v' with a per-round offset, so
+ * the roles of the working words shift each round without copying.
+ */
+#define vf(n,i) v[(n - i) & 7]
+
+/* Update schedule word I in place inside the 16-word window `p'.  */
+#define hf(i) (p[i & 15] += \
+    g_1(p[(i + 14) & 15]) + p[(i + 9) & 15] + g_0(p[(i + 1) & 15]))
+
+/*
+ * One of the first 16 rounds: byte-swap input word I, then mix it in.
+ * Expands to several statements; use it only as a full statement.
+ */
+#define v_cycle0(i)                                 \
+    p[i] = __builtin_bswap64 (p[i]);                \
+    vf(7,i) += p[i] + k_0[i]                        \
+    + s_1(vf(4,i)) + ch(vf(4,i),vf(5,i),vf(6,i));   \
+    vf(3,i) += vf(7,i);                             \
+    vf(7,i) += s_0(vf(0,i))+ maj(vf(0,i),vf(1,i),vf(2,i))
+
+/*
+ * One of the later rounds: J is the base round number (16, 32, 48,
+ * 64), I the position within that group.  Also multi-statement.
+ */
+#define v_cycle(i, j)                               \
+    vf(7,i) += hf(i) + k_0[i+j]                     \
+    + s_1(vf(4,i)) + ch(vf(4,i),vf(5,i),vf(6,i));   \
+    vf(3,i) += vf(7,i);                             \
+    vf(7,i) += s_0(vf(0,i))+ maj(vf(0,i),vf(1,i),vf(2,i))
+
+/* s_0/s_1 are used on the working words, g_0/g_1 in the schedule.  */
+#define s_0(x)  (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39))
+#define s_1(x)  (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41))
+#define g_0(x)  (rotr64((x),  1) ^ rotr64((x),  8) ^ ((x) >>  7))
+#define g_1(x)  (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >>  6))
+#define k_0     k512
+
+/* Taken from section 4.2.3 of [1].  */
+static const uint64_t k512[80] = {
+0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
+0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
+0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
+0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694,
+0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
+0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
+0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
+0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70,
+0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
+0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b,
+0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30,
+0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
+0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
+0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
+0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
+0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
+0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
+0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
+0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
+0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+};
+
+/*
+ * Compress the 128-byte block held in ctx->wbuf into ctx->state.
+ *
+ * The v_cycle0/v_cycle macros expand in place and use the locals `p'
+ * and `v' by name.  Rounds 0..15 byte-swap each word of wbuf as they
+ * consume it, and rounds 16..79 overwrite wbuf with schedule words,
+ * so the buffer content does not survive this call.
+ */
+void
+sha512_process (sha512_context *ctx)
+{
+  uint32_t i;
+  uint64_t *p = ctx->wbuf;
+  uint64_t v[8];
+
+  /* Work on a copy of the chaining value.  */
+  memcpy (v, ctx->state, 8 * sizeof (uint64_t));
+
+  /* Rounds 0..15, unrolled.  */
+  v_cycle0 ( 0); v_cycle0 ( 1); v_cycle0 ( 2); v_cycle0 ( 3);
+  v_cycle0 ( 4); v_cycle0 ( 5); v_cycle0 ( 6); v_cycle0 ( 7);
+  v_cycle0 ( 8); v_cycle0 ( 9); v_cycle0 (10); v_cycle0 (11);
+  v_cycle0 (12); v_cycle0 (13); v_cycle0 (14); v_cycle0 (15);
+
+  /* Rounds 16..79, sixteen per pass.  */
+  for (i = 16; i < 80; i += 16)
+    {
+      v_cycle ( 0, i); v_cycle ( 1, i); v_cycle ( 2, i); v_cycle ( 3, i);
+      v_cycle ( 4, i); v_cycle ( 5, i); v_cycle ( 6, i); v_cycle ( 7, i);
+      v_cycle ( 8, i); v_cycle ( 9, i); v_cycle (10, i); v_cycle (11, i);
+      v_cycle (12, i); v_cycle (13, i); v_cycle (14, i); v_cycle (15, i);
+    }
+
+  /* Add the result of this block to the chaining value.  */
+  ctx->state[0] += v[0];
+  ctx->state[1] += v[1];
+  ctx->state[2] += v[2];
+  ctx->state[3] += v[3];
+  ctx->state[4] += v[4];
+  ctx->state[5] += v[5];
+  ctx->state[6] += v[6];
+  ctx->state[7] += v[7];
+}
+
+/*
+ * Feed ILEN bytes from INPUT into the running hash CTX.
+ *
+ * Bytes are collected in ctx->wbuf; every time 128 bytes are
+ * available the block is compressed with sha512_process.  Any
+ * remainder stays in wbuf for the next call or for sha512_finish.
+ * ctx->total keeps the byte count split over two 64-bit words.
+ *
+ * A call with ILEN == 0 is a no-op and INPUT is not touched, so
+ * (NULL, 0) is accepted.
+ */
+void
+sha512_update (sha512_context *ctx, const unsigned char *input,
+               unsigned int ilen)
+{
+  uint32_t left;
+  uint32_t fill;
+
+  /* memcpy requires valid pointers even for a zero length, so never
+     hand it a possibly-null INPUT when there is nothing to copy.  */
+  if (ilen == 0)
+    return;
+
+  left = (ctx->total[0] & SHA512_MASK);	/* bytes already buffered */
+  fill = SHA512_BLOCK_SIZE - left;	/* bytes needed to complete a block */
+
+  ctx->total[0] += ilen;
+  if (ctx->total[0] < ilen)		/* low word wrapped: carry */
+    ctx->total[1]++;
+
+  while (ilen >= fill)
+    {
+      memcpy (((unsigned char*)ctx->wbuf) + left, input, fill);
+      sha512_process (ctx);
+      input += fill;
+      ilen -= fill;
+      left = 0;
+      fill = SHA512_BLOCK_SIZE;
+    }
+
+  /* Keep the tail (possibly empty) for later.  */
+  if (ilen != 0)
+    memcpy (((unsigned char*)ctx->wbuf) + left, input, ilen);
+}
+
+/*
+ * Finish the hash in CTX: append the padding and the 128-bit message
+ * length in bits, run the final compression(s), write the 64-byte
+ * digest to OUTPUT, and clear CTX.
+ */
+void
+sha512_finish (sha512_context *ctx, unsigned char output[64])
+{
+  uint32_t last = (ctx->total[0] & SHA512_MASK);
+
+  /*
+   * Place the 0x80 padding byte right after the last message byte and
+   * clear the remainder of that word.  The word is byte-swapped around
+   * the operation so the shift count follows from the byte offset.
+   * Both constants are unsigned on purpose: a signed 0x80LL shifted
+   * left by 56 does not fit long long, which is undefined behavior.
+   */
+  ctx->wbuf[last >> 3] = __builtin_bswap64 (ctx->wbuf[last >> 3]);
+  ctx->wbuf[last >> 3] &= UINT64_C (0xffffffffffffff80) << (8 * (~last & 7));
+  ctx->wbuf[last >> 3] |= UINT64_C (0x0000000000000080) << (8 * (~last & 7));
+  ctx->wbuf[last >> 3] = __builtin_bswap64 (ctx->wbuf[last >> 3]);
+
+  if (last > SHA512_BLOCK_SIZE - 17)
+    {
+      /* No room for the 16-byte length: flush this block first.  */
+      if (last < 120)
+        ctx->wbuf[15] = 0;
+      sha512_process (ctx);
+      last = 0;
+    }
+  else
+    last = (last >> 3) + 1;
+
+  /* Zero the words between the padding and the length field.  */
+  while (last < 14)
+    ctx->wbuf[last++] = 0;
+
+  /* Length in bits (byte count times 8); sha512_process swaps it back.  */
+  ctx->wbuf[14] = __builtin_bswap64 ((ctx->total[0] >> 61) | (ctx->total[1] << 3));
+  ctx->wbuf[15] = __builtin_bswap64 (ctx->total[0] << 3);
+  sha512_process (ctx);
+
+  memcpy_output_bswap64 (output, ctx->state);
+  /* NOTE(review): a plain memset of an object that is not used again
+     may be removed by the optimizer -- confirm if wiping is required.  */
+  memset (ctx, 0, sizeof (sha512_context));
+}
+
+/* Taken from section 5.3.5 of [1].  */
+static const uint64_t initial_state[8] = {
+0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+};
+
+/*
+ * Prepare CTX for a new message: reset the byte counter and load the
+ * initial hash value.  ctx->wbuf is left as it is.
+ */
+void
+sha512_start (sha512_context *ctx)
+{
+  ctx->total[0] = 0;
+  ctx->total[1] = 0;
+  memcpy (ctx->state, initial_state, sizeof initial_state);
+}
+
+/*
+ * One-shot helper: hash the ILEN bytes at INPUT and store the 64-byte
+ * digest in OUTPUT, using a context on the stack.
+ */
+void
+sha512 (const unsigned char *input, unsigned int ilen,
+        unsigned char output[64])
+{
+  sha512_context ctx;
+
+  sha512_start (&ctx);
+  sha512_update (&ctx, input, ilen);
+  sha512_finish (&ctx, output);
+}
--- a/sha512.h
+++ b/sha512.h
@ -0,0 +1,17 @@
+#define SHA512_DIGEST_SIZE  64
+#define SHA512_BLOCK_SIZE   128
+
+/* Running state of one SHA-512 computation.  */
+typedef struct
+{
+  uint64_t total[2];	/* bytes hashed so far: [0] low word, [1] high word */
+  uint64_t state[8];	/* chaining value; becomes the digest at finish */
+  uint64_t wbuf[16];	/* 128-byte block buffer, reused as schedule */
+} sha512_context;
+
+void sha512 (const unsigned char *input, unsigned int ilen,
+	     unsigned char output[64]);
+void sha512_start (sha512_context *ctx);
+void sha512_finish (sha512_context *ctx, unsigned char output[64]);
+void sha512_update (sha512_context *ctx, const unsigned char *input,
+		    unsigned int ilen);
+void sha512_process (sha512_context *ctx);
--- a/shake256.c
+++ b/shake256.c
@ -0,0 +1,202 @@
+/*
+ * shake256.c -- Compute SHAKE hash.
+ *
+ * Copyright (C) 2021 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * Reference:
+ *
+ * [1] FIPS PUB 202: SHA-3 Standard:
+ *                   Permutation-Based Hash and Extendable-Output Functions,
+ *                   August 2015.
+ */
+
+#define SHAKE_BITS 256
+#define SHAKE_INDEX_MAX (200 - (SHAKE_BITS >> 2))
+
+/*
+ * b=1600
+ * nr = 24 iterations
+ * l = 6
+ *
+ * state: 25x64-bit  ==  5 x      5 x  64
+ *                       row   column  bit
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "shake256.h"
+
+/* Round constants in iota step.  */
+static const uint64_t rc[24] = {
+  UINT64_C (0x0000000000000001), UINT64_C (0x0000000000008082),
+  UINT64_C (0x800000000000808a), UINT64_C (0x8000000080008000),
+  UINT64_C (0x000000000000808b), UINT64_C (0x0000000080000001),
+  UINT64_C (0x8000000080008081), UINT64_C (0x8000000000008009),
+  UINT64_C (0x000000000000008a), UINT64_C (0x0000000000000088),
+  UINT64_C (0x0000000080008009), UINT64_C (0x000000008000000a),
+  UINT64_C (0x000000008000808b), UINT64_C (0x800000000000008b),
+  UINT64_C (0x8000000000008089), UINT64_C (0x8000000000008003),
+  UINT64_C (0x8000000000008002), UINT64_C (0x8000000000000080),
+  UINT64_C (0x000000000000800a), UINT64_C (0x800000008000000a),
+  UINT64_C (0x8000000080008081), UINT64_C (0x8000000000008080),
+  UINT64_C (0x0000000080000001), UINT64_C (0x8000000080008008),
+};
+
+/*
+ * Rotation amounts for the rho step: rho[i-1] is the left-rotate
+ * count applied to lane s[i], for i = 1..24.  Lane 0 is not rotated,
+ * hence 24 entries (the layout below leaves its slot blank).
+ */
+static const uint8_t rho[25-1] = {
+      1, 62, 28, 27,
+ 36, 44,  6, 55, 20,
+  3, 10, 43, 25, 39,
+ 41, 45, 15, 21,  8,
+ 18,  2, 61, 56, 14
+};
+
+/*
+ * Lane order for the pi step.  keccak_f1600 starts with the value of
+ * s[1] and walks this list: each entry names the lane that receives
+ * the value carried so far, and the value it held is carried on.
+ * Lane 0 does not appear, so it stays in place.
+ */
+static const uint8_t pi[24] = {
+  10,  7, 11, 17, 18, 3,  5, 16,  8, 21, 24, 4,
+  15, 23, 19, 13, 12, 2, 20, 14, 22,  9,  6, 1,
+};
+
+/*
+ * Rotate X left by Y bits.  Y is reduced modulo 64, so a count of 0
+ * (or 64) returns X unchanged instead of shifting by the full width
+ * of the type, which would be undefined behavior.
+ */
+static uint64_t
+rotl64 (uint64_t x, uint64_t y)
+{
+  y &= 63U;
+  return (x << y) | (x >> ((64U - y) & 63U));
+}
+
+/*
+ * XOR the byte V into byte position INDEX of the state DST, where
+ * byte 0 is the least significant byte of dst[0].
+ */
+static void
+absorb (uint64_t *dst, uint8_t index, uint8_t v)
+{
+  unsigned int lane = index / 8;
+  unsigned int shift = (index % 8) * 8;
+
+  dst[lane] ^= (uint64_t)v << shift;
+}
+
+/*
+ * Return the byte at position INDEX of the state SRC, where byte 0
+ * is the least significant byte of src[0].
+ */
+static uint8_t
+squeeze (const uint64_t *src, uint8_t index)
+{
+  uint64_t lane = src[index / 8];
+
+  return (uint8_t)(lane >> ((index % 8) * 8));
+}
+
+/*
+ * The permutation function: apply 24 rounds to the 25-lane state S,
+ * in place.  The theta step below treats s[i], s[i+5], ... s[i+20] as
+ * one column, i.e. lanes are laid out row by row, five per row.
+ */
+static void
+keccak_f1600 (uint64_t s[25])
+{
+  uint64_t lane[5];
+  int i, j, round;
+
+  for (round = 0; round < 24; round++)
+    {
+      uint64_t t;
+
+      /* STEP: theta -- lane[i] is the XOR of the five lanes in column i.  */
+      for (i = 0; i < 5; i++)
+	lane[i] = s[i] ^ s[i + 5] ^ s[i + 10] ^ s[i + 15] ^ s[i + 20];
+
+      /* Fold the two neighbouring column sums into every lane of column i.  */
+      for (i = 0; i < 5; i++)
+	{
+	  t = lane[(i + 4) % 5] ^ rotl64 (lane[(i + 1) % 5], 1);
+	  for (j = 0; j < 25; j += 5)
+	    s[j + i] ^= t;
+	}
+
+      /* STEP: rho -- rotate every lane except s[0] by its fixed amount.  */
+      for (i = 1; i < 25; i++)
+	s[i] = rotl64(s[i], rho[i-1]);
+
+      /* STEP: pi -- move the lanes along the chain listed in pi[],
+	 carrying the displaced value in T; s[0] is not touched.  */
+      t = s[1];
+      for (i = 0; i < 25-1; i++)
+	{
+	  uint64_t tmp;
+
+	  j = pi[i];
+	  tmp = s[j];
+	  s[j] = t;
+	  t = tmp;
+	}
+
+      /* STEP: chi -- per row of five lanes; the row is copied first so
+	 every output lane is computed from the pre-step values.  */
+      for (i = 0; i < 25; i += 5)
+	{
+	  for (j = 0; j < 5; j++)
+	    lane[j] = s[i + j];
+	  for (j = 0; j < 5; j++)
+	    s[i + j] ^= (~lane[(j + 1) % 5]) & lane[(j + 2) % 5];
+	}
+
+      /* STEP: iota -- XOR this round's constant into lane 0.  */
+      s[0] ^= rc[round];
+    }
+}
+
+/*
+ * Reset SHAKE for a new message: the whole context (state lanes and
+ * byte index) is set to zero.
+ */
+void
+shake256_start (struct shake_context *shake)
+{
+  memset (shake, 0, sizeof *shake);
+}
+
+/*
+ * Absorb SIZE bytes from SRC into the state, one byte at a time.
+ * Each time SHAKE_INDEX_MAX bytes have been XORed in, the state is
+ * permuted and the byte index restarts at zero.  SIZE == 0 does
+ * nothing.
+ */
+void
+shake256_update (struct shake_context *shake,
+		 const unsigned char *src, unsigned int size)
+{
+  for (; size != 0; size--)
+    {
+      absorb (shake->state, shake->index, *src++);
+      shake->index++;
+      if (shake->index == SHAKE_INDEX_MAX)
+	{
+	  keccak_f1600 (shake->state);
+	  shake->index = 0;
+	}
+    }
+}
+
+/*
+ * Pad the absorbed message, permute, and write SIZE output bytes to
+ * DST.  With SIZE == 0 nothing happens at all: no padding is applied
+ * and the context is left untouched.
+ */
+void
+shake256_finish (struct shake_context *shake,
+		 unsigned char *dst, unsigned int size)
+{
+  if (size == 0)
+    return;
+
+  /*
+   * SHAKE appends the bits 11 to RawSHAKE, RawSHAKE appends 11 to
+   * KECCAK, and KECCAK pads with pad10*1; together that is 111110*1.
+   * 0x1F supplies the leading bits at the current position, 0x80 the
+   * final 1 in the last byte of the block.
+   */
+  absorb (shake->state, shake->index, 0x1F);
+  absorb (shake->state, SHAKE_INDEX_MAX - 1, 0x80);
+  keccak_f1600 (shake->state);
+  shake->index = 0;
+
+  for (;;)
+    {
+      *dst++ = squeeze (shake->state, shake->index);
+
+      size--;
+      if (size == 0)
+	break;
+
+      /* More output wanted: advance, permuting when the block is used up.  */
+      shake->index++;
+      if (shake->index == SHAKE_INDEX_MAX)
+	{
+	  keccak_f1600 (shake->state);
+	  shake->index = 0;
+	}
+    }
+}
--- a/shake256.h
+++ b/shake256.h
@ -0,0 +1,13 @@
+#include <stdint.h>
+
+/* Running state of one SHAKE256 computation.  */
+struct shake_context {
+  uint64_t state[25];	/* the 25 64-bit lanes of the permutation state */
+  uint32_t index;	/* byte position within the current block */
+};
+typedef struct shake_context shake_context;
+
+void shake256_start (struct shake_context *shake);
+void shake256_update (struct shake_context *shake,
+		      const unsigned char *src, unsigned int size);
+void shake256_finish (struct shake_context *shake,
+		      unsigned char *dst, unsigned int size);
--- a/status-code.h
+++ b/status-code.h
@ -0,0 +1,14 @@
+/*
+ * Response status helpers.  Each macro expands to a set_res_sw call
+ * with a fixed pair of byte values.
+ * NOTE(review): the pairs look like ISO/IEC 7816-4 SW1/SW2 status
+ * bytes (0x90 0x00 for success) -- confirm against set_res_sw.
+ */
+#define GPG_APPLICATION_TERMINATED()	set_res_sw (0x62, 0x85)
+#define GPG_MEMORY_FAILURE()		set_res_sw (0x65, 0x81)
+#define GPG_WRONG_LENGTH()		set_res_sw (0x67, 0x00)
+#define GPG_SECURITY_FAILURE()		set_res_sw (0x69, 0x82)
+#define GPG_SECURITY_AUTH_BLOCKED()	set_res_sw (0x69, 0x83)
+#define GPG_CONDITION_NOT_SATISFIED()	set_res_sw (0x69, 0x85)
+#define GPG_COMMAND_NOT_ALLOWED()	set_res_sw (0x69, 0x86)
+#define GPG_FUNCTION_NOT_SUPPORTED()	set_res_sw (0x6a, 0x81)
+#define GPG_NO_FILE()			set_res_sw (0x6a, 0x82)
+#define GPG_NO_RECORD()			set_res_sw (0x6a, 0x88)
+#define GPG_BAD_P1_P2()			set_res_sw (0x6b, 0x00)
+#define GPG_NO_INS() 			set_res_sw (0x6d, 0x00)
+#define GPG_ERROR()			set_res_sw (0x6f, 0x00)
+#define GPG_SUCCESS()			set_res_sw (0x90, 0x00)
--- a/sys.h
+++ b/sys.h