/* > ecdl32bit.c
 * Purpose: Some tools for arithmetic on elliptic curves.
 * Copyright: Robert J. Harley, 03-Feb-1997
 * Contact: Robert.Harley@inria.fr
 * Legalese: You can do what you want with this code as long as this
 *           comment stays attached in a prominent position in
 *           human-readable form, any distribution of this or
 *           derived products is made without profit and includes
 *           full source and is subject to the same conditions.
 */

/* This is RobsECDL, 32-bit version.
 *
 * Compile with something like:
 *   gcc -O4 ecdl32bit.c -o ecdl -DFROM=me@here
 *
 * replacing "me@here" with your email address (unless you prefer to
 * remain anonymous) and appending one of the following flags:
 *
 *   If your machine is permanently connected to the Net:
 *     -DMAIL_TO=ecdl@pauillac.inria.fr
 *
 *   If it is connected but email to INRIA bounces, try this instead:
 *     -DMAIL_TO=robert@vlsi.cs.caltech.edu
 *
 *   If it is not connected, or email won't get through, use batch mode:
 *     -DBATCH
 *
 *   In batch mode, you should send the output "by hand" via email to one
 *   of the addresses, ideally from a machine with ptr and mx DNS records.
 *
 * Then run it!
 *
 * NB: You can also use "-DNO_ASM".  On Sparc v8 and on i686 it is
 *     slower.  On Sparc v7 it is faster.  On others I don't know!
 *     You can check the speed by compiling with "-DTEST" for TEST
 *     mode instead of MAIL_TO or BATCH modes.  Then it quickly
 *     produces some (useless) output along with info on iterations
 *     per second.  This also allows you to compare compilers,
 *     optimisation flags etc.
 */

#ifdef __alpha
#error Use the 64-bit version on an Alpha, not the 32-bit one!
#endif


/*== #includes =============================================================*/

/** Ansi includes. **/
#include <stdio.h>
#include <stdlib.h>

/** Unix includes. **/
/* For getpid(). */
#include <unistd.h>
/* For pclose() status. */
#include <sysexits.h>
/* Timing. */
#include <sys/time.h>
#include <sys/resource.h>


/*== Types =================================================================*/

typedef long s32;
typedef unsigned long u32;
typedef struct { u32 hi, lo; } u64;
typedef struct { u32 h, m, l; } u96;


/*== #defines ==============================================================*/

#ifndef FROM
#error Please compile with -DFROM=me@here or something similar.
#endif

#if !defined(BATCH) && !defined(TEST) && !defined(MAIL_TO)
#error Please compile with -DMAIL_TO=ecdl@pauillac.inria.fr or -DMAIL_TO=robert@vlsi.cs.caltech.edu or -DBATCH or -DTEST
#endif


/* The place to find sendmail.  Often it's in /usr/sbin/sendmail but
 * then /usr/lib/sendmail is a link to it, so this works fairly universally.
 */
#define SENDMAIL "/usr/lib/sendmail"

/* Version 1.04: February 2nd 1998
 * - Added asm() for HP PA, tested OK on NextStep.
 * Version 1.03: January 27th 1998
 * - Changed SENDMAIL to /usr/lib/sendmail.
 * - Added asm() for SGI MIPS, tested OK on Irix.
 * Version 1.02: January 20th 1998
 * - Fixed reporting of rate for > 2^32 iterations.
 * - Added Rick Gorton's asm() for ARM >= 6, tested OK on NetBSD.
 * Version 1.01: January 16th 1998
 * - Various cosmetic mungings.
 * Version 1.00: January 14th 1998
 * - Does 25K/sec with "gcc -O4" on Linux on 200Mhz PPro
 */
#define CLIENT "RobsECDL.32bit"
#define VERSION "104"

#ifdef TEST
#define IDENTITY "TEST"
#else
#define IDENTITY "ECC2-89"
#endif


#ifdef __GNUC__
#define INLINE inline
#else
#define INLINE
#endif

#define STR(x) #x
#define STRINGIFY(x) STR(x)


/** Curve from Certicom ECC2-89. **/

/* mod = 1+u^38+u^89
 * a   = 0x095AA3E660B75E77315E94E = 180933900475900610250926414
 * b   = 0x1AC2701C6C54021D1BF0A72 = 517604455857561980565916274
 * x1  = 0x1675C1BC9BD234852E8123C = 434439339556979836994458172
 * y1  = 0x0426B727619B0E93561C6A3 =  80296508747084467020088995 
 * x2  = 0x1BE721F8E3B6DBEE40DCE98 = 539719847425144080118435480
 * y2  = 0x04DD88A4BDAD73E0B32EBB3 =  94109870387226220980005811 
 * ord = 0x100000000000B41C8C9D8FD = 309485009821357445894232317 (prime)
 */

#define A   { 0x095AA3EUL, 0x660B75E7UL, 0x7315E94EUL }
#define B   { 0x1AC2701UL, 0xC6C54021UL, 0xD1BF0A72UL }
#define X1  { 0x1675C1BUL, 0xC9BD2348UL, 0x52E8123CUL }
#define Y1  { 0x0426B72UL, 0x7619B0E9UL, 0x3561C6A3UL }
#define X2  { 0x1BE721FUL, 0x8E3B6DBEUL, 0xE40DCE98UL }
#define Y2  { 0x04DD88AUL, 0x4BDAD73EUL, 0x0B32EBB3UL }
#define ORD { 0x1000000UL, 0x00000B41UL, 0xC8C9D8FDUL }


/*== Function declarations =================================================*/

/* From this file. */
static INLINE u96 sumMod(u96 x, u96 y, u96 mod);
static INLINE u96 doubleMod(u96 x, u96 mod);
int main(void);

static INLINE int equal(u96 x, u96 y);
static INLINE u96 sum(u96 x, u96 y);
static u96 square(u96 x);
static u32 f2Mul(u32 *ph, u32 a, u32 b);
static u96 product(u96 x, u96 y);
static u96 inverse(u96 y);
static INLINE u96 quotient(u96 x, u96 y);

static int ellipticSum
  ( u96 x1, u96 y1, int z1, u96 x2, u96 y2, int z2, u96 a
  , u96 *px3, u96 *py3
  );
static int ellipticDouble
  ( u96 x, u96 y, int z, u96 a
  , u96 *px2, u96 *py2
  );
static int ellipticProduct
  ( u96 x, u96 y, int z, u96 fac, u96 a
  , u96 *px2, u96 *py2
  );


/*== Function definitions ==================================================*/

/*== Driver ================================================================*/

/*-- sumMod ----------------------------------------------------------------*/

/* Sum: x + y modulo mod, 0 <= x, y < mod < 2^95. */
static INLINE u96 sumMod(u96 x, u96 y, u96 mod) {
  u32 c;

  x.l += y.l; c = x.l < y.l;
  x.m += c; x.h += x.m < c; x.m += y.m; x.h += x.m < y.m;
  x.h += y.h;
  
  if ( x.h > mod.h
     || (x.h == mod.h && (x.m > mod.m || (x.m == mod.m && x.l >= mod.l)))
     ) {
    u32 d;

    c = x.l < mod.l; x.l -= mod.l;
    d = x.m < c; x.m -= c; d |= x.m < mod.m; x.m -= mod.m;
    x.h -= mod.h; x.h -= d;
  } /* end if */

  return x;
} /* end sumMod */


/*-- doubleMod -------------------------------------------------------------*/

/* Double: x<<1 modulo mod, where 0 <= x < mod < 2^95. */
static INLINE u96 doubleMod(u96 x, u96 mod) {

  x.h <<= 1; x.h |= x.m>>31;
  x.m <<= 1; x.m |= x.l>>31;
  x.l <<= 1;

  if ( x.h > mod.h
     || (x.h == mod.h && (x.m > mod.m || (x.m == mod.m && x.l >= mod.l)))
     ) {
    u32 c, d;

    c = x.l < mod.l; x.l -= mod.l;
    d = x.m < c; x.m -= c; d |= x.m < mod.m; x.m -= mod.m;
    x.h -= mod.h; x.h -= d;
  } /* end if */

  return x;
} /* end doubleMod */


/*-- main ------------------------------------------------------------------*/

/* Do not change these. */
#define ADDER_SHIFT (4UL)
#define ADDERS ((1UL<<ADDER_SHIFT)-1UL)
#ifdef TEST
#define POW_WRITE (20UL)
#else
#define POW_WRITE (30UL)
#endif

int main(void) {
  const int z1 = 1, z2 = 1;
  const u96 a = A, b = B, x1 = X1, y1 = Y1, x2 = X2, y2 = Y2, ord = ORD;

  int z3;
  u64 iters, sent;
  u96 startv, u3,v3,x3,y3;

  int zA[ADDERS];
  u96 uA[ADDERS], xA[ADDERS], yA[ADDERS];

  /* Timing. */
  struct rusage ru;
  struct timeval tvUT;


  /** Banner. **/

  puts( "RobsECDL, 32-bit version, "
#ifdef BATCH
          "BATCH"
#elif defined(TEST)
          "TEST"
#else
          "MAIL_TO"
#endif
          " mode.\n"
        "From: " STRINGIFY(FROM) "."
#ifdef MAIL_TO
          "\n"
        "To: " STRINGIFY(MAIL_TO) "."
#endif
      );
  fflush(stdout);


  /** Simple sanity checks. **/

  if (sizeof(u32) != 4UL || sizeof(s32) != 4UL) {
    puts("The size of u32 or s32 is wrong.  Stopping.");
    fflush(stdout);
    return 0;
  } /* end if */


  /** Initialisation. **/

  /* Pseudo-random values to add.  Do not change. */
  { u32 i, startu;

    startu = 1UL;
    for (i = 0UL; i < ADDERS; ++i) {
      u96 fac;
      const u32 pi = 3141592653UL; /* pi times 1e9 */

      startu *= pi;
      fac.h = 0UL; fac.m = 0UL; fac.l = startu;
      uA[i] = fac;
      zA[i] = ellipticProduct(x1, y1, z1, fac, a, xA+i, yA+i);
    } /* end for */
  } /* end block */

  /* Get initial v. */
#ifdef TEST
  /* Fixed value for testing. */
  startv.h = 0UL; startv.m = 0UL; startv.l = 1234567890UL;
#else
  /* Random value. */
  { u64 seed;
    FILE *handle;

    handle = fopen("/dev/urandom", "r");
    if (handle != NULL) {
      fread((void *)&seed, sizeof(u64), 1UL, handle);
      fclose(handle);
    } /* end if */
    seed.hi ^= (u32)getpid();
    seed.lo ^= (u32)time(NULL);
    startv.h = 0UL; startv.m = seed.hi; startv.l = seed.lo;
  } /* end block */
#endif


  /** Main loop. **/

  puts("Computing iterations...");
  fflush(stdout);
  getrusage(RUSAGE_SELF, &ru); tvUT = ru.ru_utime; /* Timing. */

  z3 = ellipticProduct(x2, y2, z2, startv, a, &x3, &y3);
  u3.h = u3.m = u3.l = 0UL;
  v3 = startv;
  iters.hi = iters.lo = 0UL;
  sent.hi = sent.lo = 0UL;

  for (; ; ) {

    if (x3.m>>(32UL-POW_WRITE) == 0UL) {
      /* Output distinguished point. */
      u64 total;
      double dUT, dRate; /* Timing. */

      getrusage(RUSAGE_SELF, &ru);
      dUT = (double)(ru.ru_utime.tv_sec-tvUT.tv_sec)
            + (double)(ru.ru_utime.tv_usec-tvUT.tv_usec)/1000000.0;
      if (dUT) {
        total.lo = sent.lo+iters.lo; total.hi = sent.hi+iters.hi;
        total.hi += total.lo < iters.lo;
        dRate = ((double)total.hi*4294967296.0+(double)total.lo)/dUT;
      } /* end if (division OK) */

#if !defined(BATCH) && !defined(TEST)
      { FILE *handle;

        handle = popen(SENDMAIL " -t", "w");
        if (handle == NULL) {
          puts("Warning: couldn't pipe to sendmail, send by hand!");
          fflush(stdout);
        } else {
          int status;

          fprintf( handle
                 , "To: " STRINGIFY(MAIL_TO) "\n"
                   IDENTITY " s %07lx%08lx%08lx i %04lx%08lx "
                   "x %07lx%08lx%08lx y %07lx%08lx%08lx z %x "
                   "u %07lx%08lx%08lx v %07lx%08lx%08lx "
                   CLIENT " " VERSION " " STRINGIFY(FROM) " ;\n"
                 , startv.h, startv.m, startv.l, iters.hi, iters.lo
                 , x3.h, x3.m, x3.l, y3.h, y3.m, y3.l
                 , z3, u3.h, u3.m, u3.l, v3.h, v3.m, v3.l
                 );
          /* Timing. */
          if (dUT) {
            fprintf( handle
                   , "Total iterations = %lu*2^32+%lu at %g/sec\n"
                   , total.hi, total.lo, dRate
                   );
          } /* end if */
          fflush(handle);
          status = pclose(handle);
          if (status != EX_OK) {
            printf("Warning: sendmail returned status %d\n", status);
            fflush(stdout);
          } /* end if */
        } /* end if/else */
      } /* end block */
#endif
      printf( IDENTITY " s %07lx%08lx%08lx i %04lx%08lx "
              "x %07lx%08lx%08lx y %07lx%08lx%08lx z %x "
              "u %07lx%08lx%08lx v %07lx%08lx%08lx "
              CLIENT " " VERSION " " STRINGIFY(FROM) " ;\n"
            , startv.h, startv.m, startv.l, iters.hi, iters.lo
            , x3.h, x3.m, x3.l, y3.h, y3.m, y3.l
            , z3, u3.h, u3.m, u3.l, v3.h, v3.m, v3.l
            );
      /* Timing. */
      if (dUT) {
        printf( "Total iterations = %lu*2^32+%lu at %g/sec\n"
              , total.hi, total.lo, dRate
              );
      } /* end if */

      fflush(stdout);

      sent.lo += iters.lo; sent.hi += iters.hi; sent.hi += sent.lo < iters.lo;
        
      /* Restart this one by setting u to 0... */
      u3.h = u3.m = u3.l = 0UL;
      startv = v3;
      iters.hi = iters.lo = 0UL;
      z3 = ellipticProduct(x2, y2, z2, startv, a, &x3, &y3);
    } /* end if (distinguished point) */

    /* Pseudo-random function.  Same on all machines. **/
    { u32 m;

      m = x3.m>>(32UL-ADDER_SHIFT);
      if (m < ADDERS) {
        u3 = sumMod(u3, uA[m], ord);
        z3 = ellipticSum(xA[m], yA[m], zA[m], x3, y3, z3, a, &x3, &y3);
      } else {
        u3 = doubleMod(u3, ord);
        v3 = doubleMod(v3, ord);
        z3 = ellipticDouble(x3, y3, z3, a, &x3, &y3);
      } /* end if/else */
      ++iters.lo; iters.hi += iters.lo == 0UL;

    } /* end block */

  } /* end main loop */

  return 0;
} /* end main */


/*== Arithmetic in the field (Z/2Z)[u] / (Z/2Z)[1+u^38+u^89] ===============*/

/*-- equal -----------------------------------------------------------------*/

/* Check equality: x == y, where degree x, y < 96. */
static INLINE int equal(u96 x, u96 y) {

  return ((x.h ^ y.h) | (x.m ^ y.m)| (x.l ^ y.l)) == 0UL;
} /* end equal */


/*-- sum -------------------------------------------------------------------*/

/* Add: x + y, where degree x, y < 96. */
static INLINE u96 sum(u96 x, u96 y) {
  u96 z;

  z.h = x.h ^ y.h;
  z.m = x.m ^ y.m;
  z.l = x.l ^ y.l;

  return z;
} /* end sum */


/*-- square ----------------------------------------------------------------*/

/* Square of x modulo 1+u^38+u^89, degree x < 89. */
static u96 square(u96 x) {
  /* TO DO: smaller tab??? */
  static const u32 tab[256] =
  {     0UL,     1UL,     4UL,     5UL,    16UL,    17UL,    20UL,    21UL
  ,    64UL,    65UL,    68UL,    69UL,    80UL,    81UL,    84UL,    85UL
  ,   256UL,   257UL,   260UL,   261UL,   272UL,   273UL,   276UL,   277UL
  ,   320UL,   321UL,   324UL,   325UL,   336UL,   337UL,   340UL,   341UL
  ,  1024UL,  1025UL,  1028UL,  1029UL,  1040UL,  1041UL,  1044UL,  1045UL
  ,  1088UL,  1089UL,  1092UL,  1093UL,  1104UL,  1105UL,  1108UL,  1109UL
  ,  1280UL,  1281UL,  1284UL,  1285UL,  1296UL,  1297UL,  1300UL,  1301UL
  ,  1344UL,  1345UL,  1348UL,  1349UL,  1360UL,  1361UL,  1364UL,  1365UL
  ,  4096UL,  4097UL,  4100UL,  4101UL,  4112UL,  4113UL,  4116UL,  4117UL
  ,  4160UL,  4161UL,  4164UL,  4165UL,  4176UL,  4177UL,  4180UL,  4181UL
  ,  4352UL,  4353UL,  4356UL,  4357UL,  4368UL,  4369UL,  4372UL,  4373UL
  ,  4416UL,  4417UL,  4420UL,  4421UL,  4432UL,  4433UL,  4436UL,  4437UL
  ,  5120UL,  5121UL,  5124UL,  5125UL,  5136UL,  5137UL,  5140UL,  5141UL
  ,  5184UL,  5185UL,  5188UL,  5189UL,  5200UL,  5201UL,  5204UL,  5205UL
  ,  5376UL,  5377UL,  5380UL,  5381UL,  5392UL,  5393UL,  5396UL,  5397UL
  ,  5440UL,  5441UL,  5444UL,  5445UL,  5456UL,  5457UL,  5460UL,  5461UL
  , 16384UL, 16385UL, 16388UL, 16389UL, 16400UL, 16401UL, 16404UL, 16405UL
  , 16448UL, 16449UL, 16452UL, 16453UL, 16464UL, 16465UL, 16468UL, 16469UL
  , 16640UL, 16641UL, 16644UL, 16645UL, 16656UL, 16657UL, 16660UL, 16661UL
  , 16704UL, 16705UL, 16708UL, 16709UL, 16720UL, 16721UL, 16724UL, 16725UL
  , 17408UL, 17409UL, 17412UL, 17413UL, 17424UL, 17425UL, 17428UL, 17429UL
  , 17472UL, 17473UL, 17476UL, 17477UL, 17488UL, 17489UL, 17492UL, 17493UL
  , 17664UL, 17665UL, 17668UL, 17669UL, 17680UL, 17681UL, 17684UL, 17685UL
  , 17728UL, 17729UL, 17732UL, 17733UL, 17744UL, 17745UL, 17748UL, 17749UL
  , 20480UL, 20481UL, 20484UL, 20485UL, 20496UL, 20497UL, 20500UL, 20501UL
  , 20544UL, 20545UL, 20548UL, 20549UL, 20560UL, 20561UL, 20564UL, 20565UL
  , 20736UL, 20737UL, 20740UL, 20741UL, 20752UL, 20753UL, 20756UL, 20757UL
  , 20800UL, 20801UL, 20804UL, 20805UL, 20816UL, 20817UL, 20820UL, 20821UL
  , 21504UL, 21505UL, 21508UL, 21509UL, 21520UL, 21521UL, 21524UL, 21525UL
  , 21568UL, 21569UL, 21572UL, 21573UL, 21584UL, 21585UL, 21588UL, 21589UL
  , 21760UL, 21761UL, 21764UL, 21765UL, 21776UL, 21777UL, 21780UL, 21781UL
  , 21824UL, 21825UL, 21828UL, 21829UL, 21840UL, 21841UL, 21844UL, 21845UL
  };

  u32 t0,t1,t2,t3,t4,t5, tmp, xh,xm,xl;
  u96 z;

  xh = x.h; xm = x.m; xl = x.l;

  t0 = (u32)tab[xl & 255] | (u32)tab[xl>>8 & 255]<<16;
  t1 = (u32)tab[xl>>16 & 255] | (u32)tab[xl>>24 & 255]<<16;
  t2 = (u32)tab[xm & 255] | (u32)tab[xm>>8 & 255]<<16;
  t3 = (u32)tab[xm>>16 & 255] | (u32)tab[xm>>24 & 255]<<16;
  t4 = (u32)tab[xh & 255] | (u32)tab[xh>>8 & 255]<<16;
  t5 = (u32)tab[xh>>16 & 255] | (u32)tab[xh>>24 & 255]<<16;

  t2 ^= t5<<7; t3 ^= t5>>25;
  t3 ^= t5<<13; t4 ^= t5>>19;
  t1 ^= t4<<7; t2 ^= t4>>25;
  t2 ^= t4<<13; t3 ^= t4>>19;
  t0 ^= t3<<7; t1 ^= t3>>25;
  t1 ^= t3<<13; t2 ^= t3>>19;

  tmp = t2>>25; t2 ^= tmp<<25;
  t0 ^= tmp; t1 ^= tmp<<6;

  z.h = t2; z.m = t1; z.l = t0;
  return z;
} /* end square */


/*-- f2Mul -----------------------------------------------------------------*/

#ifdef NO_ASM

static u32 f2Mul(u32 *ph, u32 a, u32 b) {
  u32 l0, l1, h0, h1, i;
  const u32 bit15 = 1UL<<15, bit16 = 1UL<<16;

  l0 = l1 = h0 = h1 = 0UL;
  for (i = 8UL; i; --i) {
    if (a & 1UL) l0 ^= b;
    if (a & bit16) l1 ^= b;
    a >>= 1;
    if ((s32)b < 0L) h0 ^= a;
    if (b & bit15) h1 ^= a;
    b += b;
    if (a & 1UL) l0 ^= b;
    if (a & bit16) l1 ^= b;
    a >>= 1;
    if ((s32)b < 0L) h0 ^= a;
    if (b & bit15) h1 ^= a;
    b += b;
  } /* end for (i) */

  *ph = (h1>>16) ^ h0;
  return l0 ^ (l1<<16);
} /* end f2Mul */

#else

#ifdef __GNUC__
#ifdef __arm6__
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ( "umull %1,%0,%2,%3"                                         \
          : "=&r" ((u32)(w1)), "=&r" ((u32)(w0))                        \
          : "r" ((u32)(u)), "r" ((u32)(v))                              \
          )
#elif defined(__i386__) || defined(__i486__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ( "mull %3"                                                   \
          : "=a" ((u32)(w0)), "=d" ((u32)(w1))                          \
          : "%0" ((u32)(u)), "rm" ((u32)(v))                            \
          )
#elif defined(__mips__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ( "multu %2,%3"                                               \
          : "=l" ((u32)(w0)), "=h" ((u32)(w1))                          \
          : "d" ((u32)(u)), "d" ((u32)(v))                              \
          )
#elif defined(_PA_RISC1_1)
#define umul_ppmm(w1, w0, u, v) \
  do {                                                                  \
    union {                                                             \
      unsigned long long __ll;                                          \
      struct { u32 __h, __l; } __hl;                                    \
    } __stuff;                                                          \
    __asm__ ( "xmpyu %1,%2,%0"                                          \
            : "=x" (__stuff.__ll)                                       \
            : "x" ((u32)(u)), "x" ((u32)(v))                            \
            );                                                          \
    w1 = __stuff.__hl.__h;                                              \
    w0 = __stuff.__hl.__l;                                              \
  } while (0)
#elif defined(_ARCH_PPC)
#define umul_ppmm(w1, w0, u, v) \
  do { u32 __u, __v;                                                    \
    __u = (u); __v = (v);                                               \
    __asm__ ( "mulhwu %0,%1,%2"                                         \
            : "=r" ((u32)w1)                                            \
            : "%r" (__u), "r" (__v)                                     \
            );                                                          \
    w0 = __u*__v;                                                       \
  } while (0)
#elif defined(__sparc__)
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ( "umul %2,%3,%1; rd %%y,%0"                                  \
          : "=r" ((u32)(w1)), "=r" ((u32)(w0))                          \
          : "r" ((u32)(u)), "r" ((u32)(v))                              \
          )
#else
#error Use -DNO_ASM or write a umul_ppmm() for your chip here!
#endif
#else
#error Use -DNO_ASM or compile with gcc.
#endif

static u32 f2Mul(u32 *ph, u32 a, u32 b) {
  u32 a0,a1,a2,a3,a01,a23,a02,a13,a0123
    , b0,b1,b2,b3,b01,b23,b02,b13,b0123
    , c0h,c1h,c2h,c3h,c01h,c23h,c02h,c13h,c0123h
    , c0l,c1l,c2l,c3l,c01l,c23l,c02l,c13l,c0123l
    , h, l;
  const u32 mask = 0x11111111UL;

  a0 = a & mask; b0 = b & mask;
  a1 = (a>>1) & mask; b1 = (b>>1) & mask;

  a2 = (a>>2) & mask; b2 = (b>>2) & mask;
  a3 = (a>>3) & mask; b3 = (b>>3) & mask;

  a02 = a0^a2; b02 = b0^b2;
  a13 = a1^a3; b13 = b1^b3;
  a0123 = a02^a13; b0123 = b02^b13;


  umul_ppmm(c02h, c02l, a02, b02); c02l &= mask; c02h &= mask;
  umul_ppmm(c0123h, c0123l, a0123, b0123); c0123l &= mask; c0123h &= mask;
  c0123l ^= c02l;
  c0123h ^= c02h;
  umul_ppmm(c13h, c13l, a13, b13); c13l &= mask; c13h &= mask;
  c0123l ^= c13l;
  c0123l = c02l^c0123l<<1^c13l<<2;
  c0123h ^= c13h;
  c0123h = c02h^c0123h<<1^c13h<<2;


  a01 = a0^a1; b01 = b0^b1;

  umul_ppmm(c0h, c0l, a0, b0); c0l &= mask; c0h &= mask;
  umul_ppmm(c01h, c01l, a01, b01); c01l &= mask; c01h &= mask;
  c01l ^= c0l;
  c01h ^= c0h;
  umul_ppmm(c1h, c1l, a1, b1); c1l &= mask; c1h &= mask;
  c01l ^= c1l;
  c01l = c0l^c1l<<2^c01l<<1;
  c0123l ^= c01l;
  c01h ^= c1h;
  c01h = c0h^c1h<<2^c01h<<1;
  c0123h ^= c01h;


  a23 = a2^a3; b23 = b2^b3;

  umul_ppmm(c2h, c2l, a2, b2); c2l &= mask; c2h &= mask;
  umul_ppmm(c23h, c23l, a23, b23); c23l &= mask; c23h &= mask;
  c23l ^= c2l;
  c23h ^= c2h;
  umul_ppmm(c3h, c3l, a3, b3); c3l &= mask; c3h &= mask;
  c23l ^= c3l;
  c23l = c2l^c3l<<2^c23l<<1;
  c0123l ^= c23l;
  c23h ^= c3h;
  c23h = c2h^c3h<<2^c23h<<1;
  c0123h ^= c23h;

  h = (c0123h^c23h<<2)<<2^c01h^c0123l>>30^c23l>>28;
  l = (c0123l^c23l<<2)<<2^c01l;

  *ph = h;
  return l;
} /* end f2Mul */

#endif


/*-- product ---------------------------------------------------------------*/

/* Product of x and y modulo 1+u^38+u^89, degree x, y < 89. */
static u96 product(u96 x, u96 y) {
  u32 r,rh,rl, sh,sl,s0,s1,s2,s3, t0,t1,t2,t3,t4,t5, xlh, ylh;
  u96 z;

  t0 = f2Mul(&t1, x.l, y.l);
  s2 = f2Mul(&t3, x.m, y.m);
  rl = f2Mul(&rh, x.l^x.m, y.l^y.m);
  r = t1^s2;
  t1 = r^rl^t0; t2 = r^rh^t3;

  t4 = f2Mul(&t5, x.h, y.h);

  xlh = x.l^x.h; ylh = y.l^y.h;
  s0 = f2Mul(&s1, xlh, ylh);
  rl = f2Mul(&rh, xlh^x.m, ylh^y.m);
  s3 = t3;
  r = s1^s2;
  s1 = r^rl^s0; s2 = r^rh^s3;

  sl = t2^t4;
  sh = t3^t5;

  t2 = sl^s0^t0;
  t3 = sh^s1^t1;
  t4 = sl^s2;
  t5 = sh^s3;


  t2 ^= t5<<7; t3 ^= t5>>25;
  t3 ^= t5<<13; t4 ^= t5>>19;
  t1 ^= t4<<7; t2 ^= t4>>25;
  t2 ^= t4<<13; t3 ^= t4>>19;
  t0 ^= t3<<7; t1 ^= t3>>25;
  t1 ^= t3<<13; t2 ^= t3>>19;

  { u32 tmp;

    tmp = t2>>25; t2 ^= tmp<<25;
    t0 ^= tmp; t1 ^= tmp<<6;
  } /* end block */

  z.h = t2; z.m = t1; z.l = t0;
  return z;
} /* end product */


/*-- inverse --------------------------------------------------------------*/

/* Invert y modulo 1+u^38+u^89, degree y < 89, y != 0. */
static u96 inverse(u96 y) {
  u32 ah,am,al, bh,bm,bl, t, uh,um,ul, vh,vm,vl;
  const u32 mh = 1UL<<25, mm = 1UL<<6, ml = 1UL;

  /* Maintain: u.y = a<<t and v.y = b<<t modulo 1+u^38+u^89. */

  ah = y.h; am = y.m; al = y.l;
  t = 0UL;
  while (!(al & 1UL)) {
    al >>= 1; al |= am<<31; am >>= 1; am |= ah<<31; ah >>= 1;
    ++t;
  } /* end while */
  bh = mh; bm = mm; bl = ml;
  uh = um = 0UL; ul = 1UL;
  vh = vm = vl = 0UL;

  do {
    do {
      do {

        do {
          bh ^= ah; bm ^= am; bl ^= al;
          vh ^= uh; vm ^= um; vl ^= ul;

          do {
            bl >>= 1; bl |= bm<<31; bm >>= 1; bm |= bh<<31; bh >>= 1;
            ++t;
            uh <<= 1; uh |= um>>31; um <<= 1; um |= ul>>31; ul <<= 1;
          } while (!(bl & 1UL));
        } while (ah < bh || (ah == bh && (am < bm || (am == bm && al < bl))));

        if ((al ^ bl) == 0UL && (am ^ bm) == 0UL && (ah ^ bh) == 0UL) break;

        do {
          ah ^= bh; am ^= bm; al ^= bl;
          uh ^= vh; um ^= vm; ul ^= vl;

          do {
            al >>= 1; al |= am<<31; am >>= 1; am |= ah<<31; ah >>= 1;
            ++t;
            vh <<= 1; vh |= vm>>31; vm <<= 1; vm |= vl>>31; vl <<= 1;
          } while (!(al & 1UL));
        } while (ah > bh || (ah == bh && (am > bm || (am == bm && al > bl))));

      } while (al != bl);
    } while (am != bm);
  } while (ah != bh);

  /* Divide u by 2^t modulo the modulus. */
  while (t >= 38UL) {
    u32 wh, wl;

    wh = um & 0x3fUL;
    wl = ul;

    ul = wl ^ um>>6 ^ uh<<26;
    um = wh ^ uh>>6 ^ wl<<19;
    uh = wl>>13 ^ wh<<19;
    
    t -= 38UL;
  } /* end while */
  if (t < 32UL) {
    if (t) {
      u32 s, w;

      s = 32UL-t;
      w = ul<<s>>s;

      ul = ul>>t | um<<s;
      um = um>>t | uh<<s;
      uh = uh>>t;

      if (t > 6UL) {
        ul ^= w<<(38UL-t);
        um ^= w>>(t-6UL);
        if (t > 25UL) {
          um ^= w<<(57UL-t);
          uh ^= w>>(t-25UL);
        } else {
          uh ^= w<<(25UL-t);
        } /* end if/else */
      } else {
        um ^= w<<(6UL-t);
        uh ^= w<<(25UL-t);
      } /* end if/else */

    } /* end if */
  } else if (t > 32) {
    u32 q, r, s, wh, wl;

    q = 64UL-t;
    wh = um<<q>>q;
    wl = ul;

    r = t-32UL;

    ul = um>>r | uh<<q;
    um = uh>>r;

    s = 38UL-t;
    ul ^= wl<<s;
    um = um ^ wl>>(t-6UL) ^ wh<<s;

    s = 57UL-t;
    um ^= wl<<s;
    uh = wl>>(t-25UL) ^ wh<<s;
  } else {
    u32 w;

    w = ul;

    ul = um ^ w<<6;
    um = uh ^ w>>26 ^ w<<25;
    uh = w>>7;
  }/* end if/else if/else */

  { u96 u;

    u.h = uh; u.m = um; u.l = ul;

    return u;
  } /* end block */
} /* end inverse */


/*-- quotient --------------------------------------------------------------*/

/* Divide x by y modulo 1+u^38+u^89, degree x, y < 89, y != 0. */
static INLINE u96 quotient(u96 x, u96 y) {
  u96 z;

  z = inverse(y);

  return product(x, z);
} /* end quotient */


/*== Arithmetic on elliptic curves over the field ==========================*/

/*-- ellipticSum -----------------------------------------------------------*/

/* Given pts (x1:y1:z1) and (x2:y2:z2) on y^2 + x*y = x^3 + a*x^2 + b,
 * computes their sum (x3:y3:z3) by the group law.  Doesn't actually need b.
 * Puts x3 at *px3 and y3 at *py3, and returns z3.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticSum
  ( u96 x1, u96 y1, int z1, u96 x2, u96 y2, int z2, u96 a
  , u96 *px3, u96 *py3
  ) {
  u96 lam, x3, y3;

#if 0
  /* Short and sweet. */
  if (z1 == 0UL) { *px3 = x2; *py3 = y2; return z2; }
  if (z2 == 0UL) { *px3 = x1; *py3 = y1; return z1; }

  if (equal(x1, x2)) {
    const u96 zero = { 0UL, 0UL, 0UL }, one = { 0UL, 0UL, 1UL };

    if (equal(y1, y2)) return ellipticDouble(x1, y1, z1, a, px3, py3);
    *px3 = zero; *py3 = one; return 0;
  } /* end if */

  lam = quotient(sum(y1, y2), sum(x1, x2));
  x3 = sum(sum(sum(sum(square(lam), lam), x1), x2), a);
  y3 = sum(sum(product(lam, sum(x1, x3)), x3), y1);
#else
  /* Same thing but a bit faster with explicit xors. */
  u96 t, x, y;

  if (z1 == 0UL) { *px3 = x2; *py3 = y2; return z2; }
  if (z2 == 0UL) { *px3 = x1; *py3 = y1; return z1; }

  if (((x1.h ^ x2.h) | (x1.m ^ x2.m) | (x1.l ^ x2.l)) == 0UL) {
    const u96 zero = { 0UL, 0UL, 0UL }, one = { 0UL, 0UL, 1UL };

    if (((y1.h ^ y2.h) | (y1.m ^ y2.m) | (y1.l ^ y2.l)) == 0UL) {
      return ellipticDouble(x1, y1, z1, a, px3, py3);
    } /* end if */
    *px3 = zero; *py3 = one; return 0;
  } /* end if */

  x.h = x1.h^x2.h; x.m = x1.m^x2.m; x.l = x1.l^x2.l;
  y.h = y1.h^y2.h; y.m = y1.m^y2.m; y.l = y1.l^y2.l;
  lam = quotient(y, x);

  x3 = square(lam);
  x3.h ^= lam.h; x3.m ^= lam.m; x3.l ^= lam.l;
  x3.h ^= x1.h; x3.m ^= x1.m; x3.l ^= x1.l;
  x3.h ^= x2.h; x3.m ^= x2.m; x3.l ^= x2.l;
  x3.h ^= a.h; x3.m ^= a.m; x3.l ^= a.l;

  t.h = x1.h^x3.h; t.m = x1.m^x3.m; t.l = x1.l^x3.l;
  y3 = product(lam, t);
  y3.h ^= y1.h; y3.m ^= y1.m; y3.l ^= y1.l;
  y3.h ^= x3.h; y3.m ^= x3.m; y3.l ^= x3.l;
#endif

  *px3 = x3; *py3 = y3; return 1;
} /* end ellipticSum */


/*-- ellipticDouble --------------------------------------------------------*/

/* Given point (x:y:z) on y^2 + x*y = x^3 + a*x^2 + b, this computes
 * its double (x2:y2:z2) by the group law.  Doesn't actually need b.
 * Puts x2 at *px2 and y2 at *py2, and returns z2.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticDouble
  ( u96 x, u96 y, int z, u96 a
  , u96 *px2, u96 *py2
  ) {
  u96 lam, x2, y2;
  const u96 zero = { 0UL, 0UL, 0UL }, one = { 0UL, 0UL, 1UL };

#if 0
  /* Short and sweet. */
  if (z == 0 || equal(x, zero)) { *px2 = zero; *py2 = one; return 0; }

  lam = sum(x, quotient(y, x));
  x2 = sum(sum(square(lam), lam), a);
  y2 = sum(square(x), sum(product(lam, x2), x2));
#else
  /* Same thing but a bit faster with explicit xors. */
  u96 t;

  if (z == 0 || (x.h | x.m | x.l) == 0UL) {
    *px2 = zero; *py2 = one; return 0;
  } /* end if */

  lam = quotient(y, x);
  lam.h ^= x.h; lam.m ^= x.m; lam.l ^= x.l;

  x2 = square(lam);
  x2.h ^= lam.h; x2.m ^= lam.m; x2.l ^= lam.l;
  x2.h ^= a.h; x2.m ^= a.m; x2.l ^= a.l;

  y2 = product(lam, x2);
  t = square(x);
  y2.h ^= x2.h; y2.m ^= x2.m; y2.l ^= x2.l;
  y2.h ^= t.h; y2.m ^= t.m; y2.l ^= t.l;
#endif

  *px2 = x2; *py2 = y2; return 1;
} /* end ellipticDouble */


/*-- ellipticProduct -------------------------------------------------------*/

/* Given point (x:y:z) on y^2 + x*y = x^3 + a*x^2 + b, this computes its
 * fac-th multiple (x2:y2:z2) by the group law.  Doesn't actually need b.
 * Puts x2 at *px2 and y2 at *py2, and returns z2.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticProduct
  ( u96 x, u96 y, int z, u96 fac, u96 a
  , u96 *px2, u96 *py2
  ) {
  int z2;
  u32 f, m;
  u96 x2, y2;
  const u96 zero = { 0UL, 0UL, 0UL }, one = { 0UL, 0UL, 1UL };

  if (equal(fac, zero)) { *px2 = zero; *py2 = one; return 0; }

  f = fac.h;
  if (f) {
    x2 = x; y2 = y; z2 = z;
    m = f;
    m |= m>>1; m |= m>>2; m |= m>>4; m |= m>>8; m |= m>>16;
    m ^= m>>1;
    while (m >>= 1) {
      z2 = ellipticDouble(x2, y2, z2, a, &x2, &y2);
      if (f & m) z2 = ellipticSum(x, y, z, x2, y2, z2, a, &x2, &y2);
    } /* end while */
  } else { x2 = zero; y2 = one; z2 = 0; }

  m = 1UL<<31;
  f = fac.m;
  do {
    z2 = ellipticDouble(x2, y2, z2, a, &x2, &y2);
    if (f & m) z2 = ellipticSum(x, y, z, x2, y2, z2, a, &x2, &y2);
    m >>= 1;
  } while (m);

  m = 1UL<<31;
  f = fac.l;
  do {
    z2 = ellipticDouble(x2, y2, z2, a, &x2, &y2);
    if (f & m) z2 = ellipticSum(x, y, z, x2, y2, z2, a, &x2, &y2);
    m >>= 1;
  } while (m);

  *px2 = x2; *py2 = y2; return z2;
} /* end ellipticProduct */


/*== end of file ecdl32bit.c ===============================================*/

