/* > ecdl2-89.c
 * Purpose: Some tools for arithmetic on elliptic curves.
 * Copyright: Robert J. Harley, 03-Feb-1997
 * Contact: Robert.Harley@inria.fr
 * Legalese: You can do what you want with this code as long as this
 *           comment stays attached in a prominent position in
 *           human-readable form, any distribution of this or
 *           derived products is made without profit and includes
 *           full source and is subject to the same conditions.
 */

/* This is RobsECDL, 64-bit version.
 *
 * Compile with something like:
 *   gcc -O4 ecdl2-89.c f2Mul.s -o ecdl -DFROM=me@here
 * or:
 *   cc -O5 -tune host -std1 ecdl2-89.c f2Mul.s -o ecdl -DFROM=me@here
 *
 * replacing "me@here" with your email address (unless you prefer to
 * remain anonymous) and appending one of the following flags:
 *
 *   If your machine is permanently connected to the Net:
 *     -DMAIL_TO=ecdl@pauillac.inria.fr
 *
 *   If it is connected but email to INRIA bounces, try this instead:
 *     -DMAIL_TO=robert@vlsi.cs.caltech.edu
 *
 *   If it is not connected, or email won't get through, use batch mode:
 *     -DBATCH
 *
 *   In batch mode, you should send the output "by hand" via email to one
 *   of the addresses, ideally from a machine with ptr and mx DNS records.
 *
 * Then run it!
 *
 * NB: if you compile on Digital Unix with -non_shared and run on Linux
 *     then use batch mode, since otherwise the compiled program will
 *     look for /sbin/sh whereas Linux has /bin/sh instead.
 *
 * NB: You can do without the "f2Mul.s" file if you use "-DNO_ASM" but
 *     it's usually a bit slower.  You can check the speed by
 *     compiling with "-DTEST" for TEST mode instead of MAIL_TO or
 *     BATCH modes.  Then it quickly produces some (useless) output
 *     along with info on iterations per second.  This also allows you
 *     to compare compilers, optimisation flags etc.
 */

#ifndef __alpha
#error This is the 64-bit version for Alpha!
#endif


/*== #includes =============================================================*/

/** Ansi includes. **/
#include <stdio.h>
#include <stdlib.h>

/** Unix includes. **/
/* For getpid(). */
#include <unistd.h>
/* For pclose() status. */
#include <sysexits.h>
/* Timing. */
#include <sys/time.h>
#include <sys/resource.h>


/*== Types =================================================================*/

typedef unsigned u32;
typedef long s64;
typedef unsigned long u64;
typedef struct { u64 hi, lo; } u128;


/*== #defines ==============================================================*/

#ifndef FROM
#error Please compile with -DFROM=me@here or something similar.
#endif

#if !defined(BATCH) && !defined(TEST) && !defined(MAIL_TO)
#error Please compile with -DMAIL_TO=ecdl@pauillac.inria.fr or -DMAIL_TO=robert@vlsi.cs.caltech.edu or -DBATCH or -DTEST
#endif


/* The place to find sendmail.  Often it's in /usr/sbin/sendmail but
 * then /usr/lib/sendmail is a link to it, so this works fairly universally.
 */
#define SENDMAIL "/usr/lib/sendmail"

/* Version 1.02: January 22nd 1998
 * - Changed SENDMAIL to /usr/lib/sendmail.
 * Version 1.01: January 16th 1998
 * - Various cosmetic mungings.
 * Version 1.00: January 8th 1998
 * - Does 171K/sec with "gcc -O4" on Alpha Linux on 500Mhz PWS
 */
#define CLIENT "RobsECDL2-89"
#define VERSION "102"

#ifdef TEST
#define IDENTITY "TEST"
#else
#define IDENTITY "ECC2-89"
#endif


#ifdef __GNUC__
#define INLINE inline
#else
#define INLINE
#endif

#define STR(x) #x
#define STRINGIFY(x) STR(x)


/** Curve from Certicom ECC2-89. **/

/* mod = 1+u^38+u^89
 * a   = 0x095AA3E660B75E77315E94E = 180933900475900610250926414
 * b   = 0x1AC2701C6C54021D1BF0A72 = 517604455857561980565916274
 * x1  = 0x1675C1BC9BD234852E8123C = 434439339556979836994458172
 * y1  = 0x0426B727619B0E93561C6A3 =  80296508747084467020088995 
 * x2  = 0x1BE721F8E3B6DBEE40DCE98 = 539719847425144080118435480
 * y2  = 0x04DD88A4BDAD73E0B32EBB3 =  94109870387226220980005811 
 * ord = 0x100000000000B41C8C9D8FD = 309485009821357445894232317 (prime)
 */

#define A   { 0x095AA3EUL, 0x660B75E77315E94EUL }
#define B   { 0x1AC2701UL, 0xC6C54021D1BF0A72UL }
#define X1  { 0x1675C1BUL, 0xC9BD234852E8123CUL }
#define Y1  { 0x0426B72UL, 0x7619B0E93561C6A3UL }
#define X2  { 0x1BE721FUL, 0x8E3B6DBEE40DCE98UL }
#define Y2  { 0x04DD88AUL, 0x4BDAD73E0B32EBB3UL }
#define ORD { 0x1000000UL, 0x00000B41C8C9D8FDUL }


/*== Function declarations =================================================*/

#ifdef NO_ASM
static u64 f2Mul(u64 *ph, u64 a, u64 b);
#else
/* From f2Mul.s */
/* About 200 cycles. */
extern u64 f2Mul(u64 *ph, u64 a, u64 b);
#endif

/* From this file. */
static INLINE u128 sumMod(u128 x, u128 y, u128 mod);
static INLINE u128 doubleMod(u128 x, u128 mod);
int main(void);

static INLINE u64 equal(u128 x, u128 y);
static INLINE u128 sum(u128 x, u128 y);
static u128 square(u128 x);
static u64 f2MulLo(u64 a, u64 b);
static u128 product(u128 x, u128 y);
static u128 inverse(u128 y);
static INLINE u128 quotient(u128 x, u128 y);

static int ellipticSum
  ( u128 x1, u128 y1, int z1, u128 x2, u128 y2, int z2, u128 a
  , u128 *px3, u128 *py3
  );
static int ellipticDouble
  ( u128 x, u128 y, int z, u128 a
  , u128 *px2, u128 *py2
  );
static int ellipticProduct
  ( u128 x, u128 y, int z, u128 fac, u128 a
  , u128 *px2, u128 *py2
  );


/*== Function definitions ==================================================*/

/*== Driver ================================================================*/

/*-- sumMod ----------------------------------------------------------------*/

/* Sum: x + y modulo mod, 0 <= x, y < mod < 2^96. */
static INLINE u128 sumMod(u128 x, u128 y, u128 mod) {

  x.hi += y.hi; x.lo += y.lo; x.hi += x.lo < y.lo;
  if (x.hi > mod.hi || (x.hi == mod.hi && x.lo >= mod.lo)) {
    u64 c;

    c = x.lo < mod.lo; x.hi -= mod.hi; x.lo -= mod.lo; x.hi -= c;
  } /* end if */

  return x;
} /* end sumMod */


/*-- doubleMod -------------------------------------------------------------*/

/* Double: x<<1 modulo mod, where 0 <= x < mod < 2^96. */
static INLINE u128 doubleMod(u128 x, u128 mod) {

  x.hi <<= 1; x.hi |= x.lo>>63; x.lo <<= 1;
  if (x.hi > mod.hi || (x.hi == mod.hi && x.lo >= mod.lo)) {
    u64 c;

    c = x.lo < mod.lo; x.hi -= mod.hi; x.lo -= mod.lo; x.hi -= c;
  } /* end if */

  return x;
} /* end doubleMod */


/*-- main ------------------------------------------------------------------*/

/* Do not change these. */
#define ADDER_SHIFT (4UL)
#define ADDERS ((1UL<<ADDER_SHIFT)-1UL)
#ifdef TEST
#define POW_WRITE (20UL)
#else
#define POW_WRITE (30UL)
#endif

int main(void) {
  const int z1 = 1, z2 = 1;
  const u128 a = A, b = B, x1 = X1, y1 = Y1, x2 = X2, y2 = Y2, ord = ORD;

  int z3;
  u64 iters, sent;
  u128 startv, u3,v3,x3,y3;

  int zA[ADDERS];
  u128 uA[ADDERS], xA[ADDERS], yA[ADDERS];

  /* Timing. */
  struct rusage ru;
  struct timeval tvUT;


  /** Banner. **/

  puts( "RobsECDL, 64-bit version, "
#ifdef BATCH
          "BATCH"
#elif defined(TEST)
          "TEST"
#else
          "MAIL_TO"
#endif
          " mode.\n"
        "From: " STRINGIFY(FROM) "."
#ifdef MAIL_TO
          "\n"
        "To: " STRINGIFY(MAIL_TO) "."
#endif
      );
  fflush(stdout);


  /** Simple sanity checks. **/

  if (sizeof(u32) != 4UL || sizeof(s64) != 8UL || sizeof(u64) != 8UL) {
    puts("The size of u32, s64 or u64 is wrong.  Stopping.");
    fflush(stdout);
    return 0;
  } /* end if */


  /** Initialisation. **/

  /* Pseudo-random values to add.  Do not change. */
  { u64 i;
    u32 startu;

    startu = 1U;
    for (i = 0UL; i < ADDERS; ++i) {
      u128 fac;
      const u32 pi = 3141592653U; /* pi times 1e9 */

      startu *= pi;
      fac.hi = 0UL; fac.lo = (u64)startu;
      uA[i] = fac;
      zA[i] = ellipticProduct(x1, y1, z1, fac, a, xA+i, yA+i);
    } /* end for */
  } /* end block */

  /* Get initial v. */
#ifdef TEST
  /* Fixed value for testing. */
  startv.hi = 0UL; startv.lo = 1234567890UL;
#else
  /* Random value. */
  { long seed;
    FILE *handle;

    handle = fopen("/dev/urandom", "r");
    if (handle != NULL) {
      fread((void *)&seed, sizeof(long), 1UL, handle);
      fclose(handle);
    } /* end if */
    seed ^= (long)getpid()<<32;
    seed ^= (long)time(NULL);
    startv.hi = 0UL; startv.lo = seed;
  } /* end block */
#endif


  /** Main loop.  Does about:
   **   168 K (egcs)
   **   171 K (gcc)
   **   177 K (cc)
   **   178 K (cc.alt)
   **   185 K (cc.alt2)
   ** steps/sec on 500Mhz Alpha 21164a.
   **/

  puts("Computing iterations...");
  fflush(stdout);
  getrusage(RUSAGE_SELF, &ru); tvUT = ru.ru_utime; /* Timing. */

  z3 = ellipticProduct(x2, y2, z2, startv, a, &x3, &y3);
  u3.hi = u3.lo = 0UL;
  v3 = startv;
  iters = 0UL;
  sent = 0UL;

  for (; ; ) {

    if (x3.lo>>(64UL-POW_WRITE) == 0UL) {
      /* Output distinguished point. */
      u64 total;
      double dUT, dRate; /* Timing. */

      getrusage(RUSAGE_SELF, &ru);
      dUT = (double)(ru.ru_utime.tv_sec-tvUT.tv_sec)
            + (double)(ru.ru_utime.tv_usec-tvUT.tv_usec)/1000000.0;
      if (dUT) {
        total = sent+iters;
        dRate = (double)total/dUT;
      } /* end if (division OK) */

#if !defined(BATCH) && !defined(TEST)
      { FILE *handle;

        handle = popen(SENDMAIL " -t", "w");
        if (handle == NULL) {
          puts("Warning: couldn't pipe to sendmail, send by hand!");
          fflush(stdout);
        } else {
          int status;

          fprintf( handle
                 , "To: " STRINGIFY(MAIL_TO) "\n"
                   IDENTITY " s %07lx%016lx i %012lx "
                   "x %07lx%016lx y %07lx%016lx z %x "
                   "u %07lx%016lx v %07lx%016lx "
                   CLIENT " " VERSION " " STRINGIFY(FROM) " ;\n"
                 , startv.hi, startv.lo, iters, x3.hi, x3.lo, y3.hi, y3.lo
                 , z3, u3.hi, u3.lo, v3.hi, v3.lo
                 );
          /* Timing. */
          if (dUT) {
            fprintf( handle
                   , "Total iterations = %lu at %g/sec\n"
                   , total, dRate
                   );
          } /* end if */
          fflush(handle);
          status = pclose(handle);
          if (status != EX_OK) {
            printf("Warning: sendmail returned status %d\n", status);
            fflush(stdout);
          } /* end if */
        } /* end if/else */
      } /* end block */
#endif
      printf( IDENTITY " s %07lx%016lx i %012lx "
              "x %07lx%016lx y %07lx%016lx z %x "
              "u %07lx%016lx v %07lx%016lx "
              CLIENT " " VERSION " " STRINGIFY(FROM) " ;\n"
            , startv.hi, startv.lo, iters, x3.hi, x3.lo, y3.hi, y3.lo, z3
            , u3.hi, u3.lo, v3.hi, v3.lo
            );
      /* Timing. */
      if (dUT) printf("Total iterations = %lu at %g/sec\n", total, dRate);

      fflush(stdout);

      sent += iters;
        
      /* Restart this one by setting u to 0... */
      u3.hi = u3.lo = 0UL;
      startv = v3;
      iters = 0UL;
      z3 = ellipticProduct(x2, y2, z2, startv, a, &x3, &y3);
    } /* end if (distinguished point) */

    /* Pseudo-random function.  Same on all machines. **/
    { u64 m;

      m = x3.lo>>(64UL-ADDER_SHIFT);
      if (m < ADDERS) {
        u3 = sumMod(u3, uA[m], ord);
        z3 = ellipticSum(xA[m], yA[m], zA[m], x3, y3, z3, a, &x3, &y3);
      } else {
        u3 = doubleMod(u3, ord);
        v3 = doubleMod(v3, ord);
        z3 = ellipticDouble(x3, y3, z3, a, &x3, &y3);
      } /* end if/else */
      ++iters;

    } /* end block */

  } /* end main loop */

  return 0;
} /* end main */


/*== Arithmetic in the field (Z/2Z)[u] / (Z/2Z)[1+u^38+u^89] ===============*/

/*-- equal -----------------------------------------------------------------*/

/* Check equality: x == y, where degree x, y < 128. */
static INLINE u64 equal(u128 x, u128 y) {

  return ((x.hi ^ y.hi) | (x.lo ^ y.lo)) == 0UL;
} /* end equal */


/*-- sum -------------------------------------------------------------------*/

/* Add: x + y, where degree x, y < 128. */
static INLINE u128 sum(u128 x, u128 y) {
  u128 z;

  z.hi = x.hi ^ y.hi; z.lo = x.lo ^ y.lo;

  return z;
} /* end sum */


/*-- square ----------------------------------------------------------------*/

/* Square of x modulo 1+u^38+u^89, degree x < 89.
 * About 65 (or 55 inlined) cycles (gcc), 46 cycles (cc.alt2).
 */
static u128 square(u128 x) {
  /* TO DO: smaller tab??? */
  static const int tab[256] =
  {     0,     1,     4,     5,    16,    17,    20,    21
  ,    64,    65,    68,    69,    80,    81,    84,    85
  ,   256,   257,   260,   261,   272,   273,   276,   277
  ,   320,   321,   324,   325,   336,   337,   340,   341
  ,  1024,  1025,  1028,  1029,  1040,  1041,  1044,  1045
  ,  1088,  1089,  1092,  1093,  1104,  1105,  1108,  1109
  ,  1280,  1281,  1284,  1285,  1296,  1297,  1300,  1301
  ,  1344,  1345,  1348,  1349,  1360,  1361,  1364,  1365
  ,  4096,  4097,  4100,  4101,  4112,  4113,  4116,  4117
  ,  4160,  4161,  4164,  4165,  4176,  4177,  4180,  4181
  ,  4352,  4353,  4356,  4357,  4368,  4369,  4372,  4373
  ,  4416,  4417,  4420,  4421,  4432,  4433,  4436,  4437
  ,  5120,  5121,  5124,  5125,  5136,  5137,  5140,  5141
  ,  5184,  5185,  5188,  5189,  5200,  5201,  5204,  5205
  ,  5376,  5377,  5380,  5381,  5392,  5393,  5396,  5397
  ,  5440,  5441,  5444,  5445,  5456,  5457,  5460,  5461
  , 16384, 16385, 16388, 16389, 16400, 16401, 16404, 16405
  , 16448, 16449, 16452, 16453, 16464, 16465, 16468, 16469
  , 16640, 16641, 16644, 16645, 16656, 16657, 16660, 16661
  , 16704, 16705, 16708, 16709, 16720, 16721, 16724, 16725
  , 17408, 17409, 17412, 17413, 17424, 17425, 17428, 17429
  , 17472, 17473, 17476, 17477, 17488, 17489, 17492, 17493
  , 17664, 17665, 17668, 17669, 17680, 17681, 17684, 17685
  , 17728, 17729, 17732, 17733, 17744, 17745, 17748, 17749
  , 20480, 20481, 20484, 20485, 20496, 20497, 20500, 20501
  , 20544, 20545, 20548, 20549, 20560, 20561, 20564, 20565
  , 20736, 20737, 20740, 20741, 20752, 20753, 20756, 20757
  , 20800, 20801, 20804, 20805, 20816, 20817, 20820, 20821
  , 21504, 21505, 21508, 21509, 21520, 21521, 21524, 21525
  , 21568, 21569, 21572, 21573, 21584, 21585, 21588, 21589
  , 21760, 21761, 21764, 21765, 21776, 21777, 21780, 21781
  , 21824, 21825, 21828, 21829, 21840, 21841, 21844, 21845
  };

  u64 t0,t1,t2, tmp, xh,xl;
  u128 z;

  xh = x.hi; xl = x.lo;

  t0 = (u64)(long)tab[xl & 255] | (u64)(long)tab[xl>>8 & 255]<<16
       | ((u64)(long)tab[xl>>16 & 255]<<32 | (u64)(long)tab[xl>>24 & 255]<<48)
       ;
  t1 = (u64)(long)tab[xl>>32 & 255] | (u64)(long)tab[xl>>40 & 255]<<16
       | ((u64)(long)tab[xl>>48 & 255]<<32 | (u64)(long)tab[xl>>56 & 255]<<48)
       ;
  t2 = (u64)(long)tab[xh & 255] | (u64)(long)tab[xh>>8 & 255]<<16
       | ((u64)(long)tab[xh>>16 & 255]<<32 | (u64)(long)tab[xh>>24 & 255]<<48)
       ;

  t0 ^= t2<<39; t1 ^= t2>>25;
  t1 ^= t2<<13;

  tmp = t1>>25; t1 ^= tmp<<25;
  t0 ^= tmp; t0 ^= tmp<<38;
  t1 ^= tmp>>26;

  z.hi = t1; z.lo = t0;
  return z;
} /* end square */


/*-- f2MulLo ---------------------------------------------------------------*/

/* About 101 cycles (gcc), 101 (cc.alt2). */
static u64 f2MulLo(u64 a, u64 b) {
  u64 a0,a1,a2,a3,a01,a23,a02,a13,a0123
    , b0,b1,b2,b3,b01,b23,b02,b13,b0123
    , c0l,c1l,c2l,c3l,c01l,c23l,c02l,c13l,c0123l
    ;
  const u64 mask = 0x1111111111111111UL;

  a0 = a&mask; a1 = (a>>1)&mask; a2 = (a>>2)&mask; a3 = (a>>3)&mask;
  a01 = a0^a1; a23 = a2^a3; a02 = a0^a2; a13 = a1^a3; a0123 = a02^a13;

  b0 = b&mask; b1 = (b>>1)&mask; b2 = (b>>2)&mask; b3 = (b>>3)&mask;
  b01 = b0^b1; b23 = b2^b3; b02 = b0^b2; b13 = b1^b3; b0123 = b02^b13;

  c01l = a01*b01 & mask;
  c0l = a0*b0 & mask; c01l ^= c0l;
  c1l = a1*b1 & mask; c01l ^= c1l;

  c23l = a23*b23 & mask;
  c2l = a2*b2 & mask; c23l ^= c2l;
  c3l = a3*b3 & mask; c23l ^= c3l;

  c0123l = a0123*b0123 & mask;
  c02l = a02*b02 & mask; c0123l ^= c02l;
  c13l = a13*b13 & mask; c0123l ^= c13l;

  c01l = c0l^c01l<<1^c1l<<2;
  c23l = c2l^c23l<<1^c3l<<2;
  c0123l = c02l^c0123l<<1^c13l<<2;

  c0123l ^= c01l; c0123l ^= c23l;

  return (c0123l^c23l<<2)<<2^c01l;
} /* end f2MulLo */


/*-- f2Mul -----------------------------------------------------------------*/

#ifdef NO_ASM
static u64 f2Mul(u64 *ph, u64 a, u64 b) {
  u64 l0, l1, h0, h1, i;
  const u64 bit31 = 1UL<<31, bit32 = 1UL<<32;

  l0 = l1 = h0 = h1 = 0UL;
  for (i = 16UL; i; --i) {
    if (a & 1UL) l0 ^= b;
    if (a & bit32) l1 ^= b;
    a >>= 1;
    if ((s64)b < 0L) h0 ^= a;
    if (b & bit31) h1 ^= a;
    b += b;
    if (a & 1UL) l0 ^= b;
    if (a & bit32) l1 ^= b;
    a >>= 1;
    if ((s64)b < 0L) h0 ^= a;
    if (b & bit31) h1 ^= a;
    b += b;
  } /* end for (i) */

  *ph = (h1>>32) ^ h0;
  return l0 ^ (l1<<32);
} /* end f2Mul */
#endif


/*-- product ---------------------------------------------------------------*/

/* Product of x and y modulo 1+u^38+u^89, degree x, y < 89.
 * About 548 cycles (gcc), 540 cycles (cc.alt2).
 */
static u128 product(u128 x, u128 y) {
  u64 ah,al, bh,bl, c, tmp0, tmp1, t0,t1,t2;
  u128 z;

  al = f2Mul(&ah, x.lo, y.lo);
  bl = f2Mul(&bh, x.lo^x.hi, y.lo^y.hi);
  c = f2MulLo(x.hi, y.hi);

  t0 = al; tmp0 = ah^c; t1 = al^bl^tmp0; t2 = bh^tmp0;

  t0 ^= t2<<39; t1 ^= t2>>25;
  t1 ^= t2<<13;

  tmp1 = t1>>25; t1 ^= tmp1<<25;
  t0 ^= tmp1; t0 ^= tmp1<<38;
  t1 ^= tmp1>>26;

  z.hi = t1; z.lo = t0;
  return z;
} /* end product */


/*-- inverse --------------------------------------------------------------*/

/* Invert y modulo 1+u^38+u^89, degree y < 89, y != 0.
 * About 1726 cycles (gcc), 1475 (cc.alt2) for 89-bit inputs.
 */

#define TAB_SIZE (64UL)
#define TAB_MASK (TAB_SIZE-1UL)

static u128 inverse(u128 y) {
  u64 ah,al, bh,bl, sh, th, t, uh,ul, vh,vl;
  static const int tab[TAB_SIZE] =
  { 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
  , 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
  };
  const u64 mh = 1UL<<25, ml = 1UL | 1UL<<38;

  /* Maintain: u.y = a<<t and v.y = b<<t. */

  ah = y.hi; al = y.lo;
  t = 0UL;
  while (!(al & 1UL)) {
    al >>= 1; al |= ah<<63; ah >>= 1;
    ++t;
  } /* end while */
  bh = mh; bl = ml;
  uh = 0UL; ul = 1UL; vh = 0UL; vl = 0UL;

  do {
    do {

      do {
        bh ^= ah; bl ^= al;
        vh ^= uh; vl ^= ul;

        sh = (u64)(long)tab[bl & TAB_MASK]; th = 64UL-sh;
        bl >>= sh; bl |= bh<<th; bh >>= sh;
        t += sh;
        uh <<= sh; uh |= ul>>th; ul <<= sh;
        while (!(bl & 1UL)) {
          bl >>= 1; bl |= bh<<63; bh >>= 1;
          ++t;
          uh <<= 1; uh |= ul>>63; ul <<= 1;
        } /* end while */
      } while (ah < bh || (ah == bh && al < bl));

      if ((al ^ bl) == 0UL && (ah ^ bh) == 0UL) break;

      do {
        ah ^= bh; al ^= bl;
        uh ^= vh; ul ^= vl;

        sh = (u64)(long)tab[al & TAB_MASK]; th = 64UL-sh;
        al >>= sh; al |= ah<<th; ah >>= sh;
        t += sh;
        vh <<= sh; vh |= vl>>th; vl <<= sh;
        while (!(al & 1UL)) {
          al >>= 1; al |= ah<<63; ah >>= 1;
          ++t;
          vh <<= 1; vh |= vl>>63; vl <<= 1;
        } /* end while */
      } while (ah > bh || (ah == bh && al > bl));

    } while (al != bl);
  } while (ah != bh);

  /* Divide u by 2^t modulo the modulus. */
  while (t >= 38UL) {
    u64 w;

    w = ul<<26>>26;
    ul >>= 38; ul ^= uh<<26;
    ul ^= w;
    ul ^= w<<51; uh = w>>13;

    t -= 38UL;
  } /* end while */
  if (t) {
    u64 s, w;

    s = 64UL-t;
    w = ul<<s>>s;
    ul ^= w<<38; uh ^= w>>26;
    uh ^= w<<25;
    ul >>= t; ul |= uh<<s; uh >>= t;
  } /* end if */

  { u128 u;

    u.hi = uh; u.lo = ul;

    return u;
  } /* end block */
} /* end inverse */


/*-- quotient --------------------------------------------------------------*/

/* Divide x by y modulo 1+u^38+u^89, degree x, y < 89, y != 0.
 * About 2180 cycles (gcc), 2023 (cc.alt2) for 89-bit inputs.
 */
static INLINE u128 quotient(u128 x, u128 y) {
  u128 z;

  z = inverse(y);

  return product(x, z);
} /* end quotient */


/*== Arithmetic on elliptic curves over the field ==========================*/

/*-- ellipticSum -----------------------------------------------------------*/

/* Given pts (x1:y1:z1) and (x2:y2:z2) on y^2 + x*y = x^3 + a*x^2 + b,
 * computes their sum (x3:y3:z3) by the group law.  Doesn't actually need b.
 * Puts x3 at *px3 and y3 at *py3, and returns z3.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticSum
  ( u128 x1, u128 y1, int z1, u128 x2, u128 y2, int z2, u128 a
  , u128 *px3, u128 *py3
  ) {
  u128 lam, x3, y3;

#if 0
  /* Short and sweet. */
  if (z1 == 0UL) { *px3 = x2; *py3 = y2; return z2; }
  if (z2 == 0UL) { *px3 = x1; *py3 = y1; return z1; }

  if (equal(x1, x2)) {
    const u128 zero = { 0UL, 0UL }, one = { 0UL, 1UL };

    if (equal(y1, y2)) return ellipticDouble(x1, y1, z1, a, px3, py3);
    *px3 = zero; *py3 = one; return 0;
  } /* end if */

  lam = quotient(sum(y1, y2), sum(x1, x2));
  x3 = sum(sum(sum(sum(square(lam), lam), x1), x2), a);
  y3 = sum(sum(product(lam, sum(x1, x3)), x3), y1);
#else
  /* Same thing but a bit faster with explicit xors. */
  u128 t, x, y;

  if (z1 == 0UL) { *px3 = x2; *py3 = y2; return z2; }
  if (z2 == 0UL) { *px3 = x1; *py3 = y1; return z1; }

  if (((x1.hi ^ x2.hi) | (x1.lo ^ x2.lo)) == 0UL) {
    const u128 zero = { 0UL, 0UL }, one = { 0UL, 1UL };

    if (((y1.hi ^ y2.hi) | (y1.lo ^ y2.lo)) == 0UL) {
      return ellipticDouble(x1, y1, z1, a, px3, py3);
    } /* end if */
    *px3 = zero; *py3 = one; return 0;
  } /* end if */

  x.hi = x1.hi^x2.hi; x.lo = x1.lo^x2.lo;
  y.hi = y1.hi^y2.hi; y.lo = y1.lo^y2.lo;
  lam = quotient(y, x);

  x3 = square(lam);
  x3.hi ^= lam.hi; x3.lo ^= lam.lo;
  x3.hi ^= x1.hi; x3.lo ^= x1.lo;
  x3.hi ^= x2.hi; x3.lo ^= x2.lo;
  x3.hi ^= a.hi; x3.lo ^= a.lo;

  t.hi = x1.hi^x3.hi; t.lo = x1.lo^x3.lo;
  y3 = product(lam, t);
  y3.hi ^= y1.hi; y3.lo ^= y1.lo;
  y3.hi ^= x3.hi; y3.lo ^= x3.lo;
#endif

  *px3 = x3; *py3 = y3; return 1;
} /* end ellipticSum */


/*-- ellipticDouble --------------------------------------------------------*/

/* Given point (x:y:z) on y^2 + x*y = x^3 + a*x^2 + b, this computes
 * its double (x2:y2:z2) by the group law.  Doesn't actually need b.
 * Puts x2 at *px2 and y2 at *py2, and returns z2.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticDouble
  ( u128 x, u128 y, int z, u128 a
  , u128 *px2, u128 *py2
  ) {
  u128 lam, x2, y2;
  const u128 zero = { 0UL, 0UL }, one = { 0UL, 1UL };

#if 0
  /* Short and sweet. */
  if (z == 0 || equal(x, zero)) { *px2 = zero; *py2 = one; return 0; }

  lam = sum(x, quotient(y, x));
  x2 = sum(sum(square(lam), lam), a);
  y2 = sum(square(x), sum(product(lam, x2), x2));
#else
  /* Same thing but a bit faster with explicit xors. */
  u128 t;

  if (z == 0 || (x.hi | x.lo) == 0UL) {
    *px2 = zero; *py2 = one; return 0;
  } /* end if */

  lam = quotient(y, x);
  lam.hi ^= x.hi; lam.lo ^= x.lo;

  x2 = square(lam);
  x2.hi ^= lam.hi; x2.lo ^= lam.lo;
  x2.hi ^= a.hi; x2.lo ^= a.lo;

  y2 = product(lam, x2);
  t = square(x);
  y2.hi ^= x2.hi; y2.lo ^= x2.lo;
  y2.hi ^= t.hi; y2.lo ^= t.lo;
#endif

  *px2 = x2; *py2 = y2; return 1;
} /* end ellipticDouble */


/*-- ellipticProduct -------------------------------------------------------*/

/* Given point (x:y:z) on y^2 + x*y = x^3 + a*x^2 + b, this computes its
 * fac-th multiple (x2:y2:z2) by the group law.  Doesn't actually need b.
 * Puts x2 at *px2 and y2 at *py2, and returns z2.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticProduct
  ( u128 x, u128 y, int z, u128 fac, u128 a
  , u128 *px2, u128 *py2
  ) {
  int z2;
  u64 fh, fl, m;
  u128 x2, y2;
  const u128 zero = { 0UL, 0UL }, one = { 0UL, 1UL };

  if (equal(fac, zero)) { *px2 = zero; *py2 = one; return 0; }

  fh = fac.hi;
  if (fh) {
    x2 = x; y2 = y; z2 = z;
    m = fh;
    m |= m>>1; m |= m>>2; m |= m>>4; m |= m>>8; m |= m>>16; m |= m>>32;
    m ^= m>>1;
    while (m >>= 1) {
      z2 = ellipticDouble(x2, y2, z2, a, &x2, &y2);
      if (fh & m) z2 = ellipticSum(x, y, z, x2, y2, z2, a, &x2, &y2);
    } /* end while */
  } else { x2 = zero; y2 = one; z2 = 0; }

  m = 1UL<<63;
  fl = fac.lo;
  do {
    z2 = ellipticDouble(x2, y2, z2, a, &x2, &y2);
    if (fl & m) z2 = ellipticSum(x, y, z, x2, y2, z2, a, &x2, &y2);
    m >>= 1;
  } while (m);

  *px2 = x2; *py2 = y2; return z2;
} /* end ellipticProduct */


/*== end of file ecdl2-89.c ================================================*/

