/* > ecdl2-97.c
 * Purpose: Fast arithmetic for computing discrete logs on elliptic curves.
 * Copyright: Robert J. Harley, 1997-1999.
 * Contact: Robert.Harley@inria.fr
 * Legalese: This source code is subject to the GNU Public License (v2).
 */

/* This is ECDL32, version 1.2.1.
 *
 * Consult the following URL for more information:
 *   http://pauillac.inria.fr/~harley/ecdl6/
 */


/*** Quickstart ***/

/* Do this:

cc -O3 ecdl2-97.c -o ecdl
mkdir `uname -n`
cd `uname -n`
nice ../ecdl mail `whoami` MyTeam `uname -n` `uname -m` `uname -s` > log &

 */


/*** Detailed instructions ***/

/** 1. Compiling **
 *
 * Compile with something like:
 *   gcc ecdl2-97.c -o ecdl -O3 -fomit-frame-pointer -freg-struct-return
 *
 * You can test the binary like this:
 *   ./ecdl test me Testers bla bla bla
 *
 * The test output should match the sample below except for the
 * number of iterations per second.
 *
 * If the program complains that it cannot find sendmail, you can
 * add -DSENDMAIL='"/usr/sbin/sendmail"' or something similar.
 *
 * Test output follows:

ECDL32, version: 121.
Mode ...... test
User ...... me
Team ...... Testers
Machine ... bla
Hardware .. bla
OS ........ bla
Comment ... none
Generating 16 new starting points.
Computing iterations...
TEST|i|00000000F5D9|u|0CE88C78C39716A6D889E4D6F|v|097270894213CD2CBEA285D55|x|0C3C2CDDFF8C4279EC5AAB9BF|y|1290F0B3A000002F531FBCF48|z|1|ECDL32|121|me|Testers|bla|bla|bla|none
Iterations during this run = 1.00699e+06 at 118330 per second.
TEST|i|000000029A81|u|0BA4E82EA6583E60E838CAEEB|v|05B6B313B019235A5B625DCDE|x|121EB4F0B63979CF23C09D141|y|143FABEBA00000DABD7D4D947|z|1|ECDL32|121|me|Testers|bla|bla|bla|none
Iterations during this run = 2.73e+06 at 118490 per second.
TEST|i|000000021E19|u|03CAE5F39F306A6AD65546C8E|v|0EA840FE305582F5BA4579262|x|1EDEBCF29E944196D035CA9C4|y|1173E36A000000325651075A9|z|1|ECDL32|121|me|Testers|bla|bla|bla|none
Iterations during this run = 3.22742e+06 at 118438 per second.
TEST|i|000000038C2D|u|0B82CED105A610870BFA92949|v|09857CC5392EEB909B42284FC|x|0C9B2A1B19035BFED248338DC|y|1B15423D300000637751200A2|z|1|ECDL32|121|me|Testers|bla|bla|bla|none
Iterations during this run = 3.71989e+06 at 118392 per second.
TEST|i|0000000394E7|u|0B12B3E964A4838F5171EFF34|v|0EC332CBBBD928885F68DB32B|x|1181B6278149DA62C19952F60|y|0E29784080000003F1357A460|z|1|ECDL32|121|me|Testers|bla|bla|bla|none
Iterations during this run = 3.75563e+06 at 118362 per second.
TEST|i|000000047FBC|u|0BA7954CF5ED75872199EED48|v|0E01F017DA0B255ECF5A8FFB6|x|029AF42867F438BA63D63CC70|y|1E8A120BE00000582C12DC2DD|z|1|ECDL32|121|me|Testers|bla|bla|bla|none
Iterations during this run = 4.7175e+06 at 118322 per second.
TEST|i|000000049F5A|u|02A08894A33477FAF87B576DD|v|06CAC5498B6A9B16867BBDC64|x|030AD30833783CB041004E04B|y|16054C86F00000AC181007E65|z|1|ECDL32|121|me|Testers|bla|bla|bla|none
Iterations during this run = 4.84701e+06 at 118306 per second.
[...and so on...]

 */

/** 2. Optimising **
 *
 * Good speeds are roughly 120 k iterations per second for a 500 MHz x86,
 * 145 k for a 450 MHz PowerPC G3 and 60 k for a 275 MHz StrongARM.
 * You can compile with various optimisation flags to maximise speed
 * before starting on the real calculations.
 *
 * You can try defining -DPROD=2 or -DPROD=3 to use different
 * implementations of two speed critical functions.  They might or
 * might not be faster depending on your chip and compiler.
 *
 * You can also try tweaking the value of PARAL.  Just add -DPARAL=12,
 * for instance.  The default is 16 and useful values are roughly in
 * the range 8 to 20.  Note that the test output will no longer
 * exactly match the sample given above.
 */

/** 3. Starting up **
 *
 * When you're ready to start up: on each machine you want to run on
 * make a directory for that machine, 'cd' into it and go.
 *
 *   mkdir `uname -n`
 *   cd `uname -n`
 *   unlimit cputime || (echo Using ulimit instead; ulimit -t unlimited)
 *   nohup nice +15 ../ecdl [...arguments, see below...] > log &
 *
 * NB: Some shells need 'nice -15' instead.
 *     If you have a machine with several CPUs, make a directory for
 *     each CPU and start up an ecdl process in each one.
 *
 * The arguments have the following syntax:
 *   ecdl <mode> <user> <team> <machine> <hardware> <OS> { <comment> }
 *
 * <mode>:      Either "test", "mail", "alt" or "batch".
 * <user>:      Something to identify you by    e.g., your email address.
 * <team>:      Name of the team you are on     e.g., one you just made up!
 * <machine>:   The machine name                e.g., as given by "uname -n".
 * <hardware>:  The machine hardware type       e.g., as given by "uname -m".
 * <OS>:        The OS name                     e.g., as given by "uname -s".
 * <comment>:   Optional extra text.
 *
 * None of these fields should contain the | character but they can
 * contain spaces and stuff as long as you use 'single quotes' around the
 * fields concerned on the command line.
 *
 * Examples:
 *
 *   ecdl mail 'Robert Harley' INRIA corton '500 MHz 21164a' Linux
 *   ecdl mail biff@skool 'Hackerz from Hell' `uname -n` `uname -m` `uname -s`
 *   ecdl batch anon 'Anonymous People' bla bla bla 'Yo dude, check this out!'
 *
 * Test mode quickly produces some (useless) output to stdout along
 * with info on iterations per second.  This allows you to check that
 * your binary is OK, to compare compiler optimisation flags etc.
 *
 * Mail mode emails a result back to ecdl2-97@pauillac.inria.fr every few
 * hours or so (as well as writing it to stdout).  INRIA's excessively
 * aggressive spam filters might refuse the email; in that case switch
 * to "alt" mode which sends to ecdl2-97@rupture.net instead.
 *
 * For machines not permanently connected to the Net, "batch" mode
 * only writes the results the stdout, so you should redirect that to
 * a log file and email the file by hand once in a while.
 */

/** 4. Shutting down (and saved states) **
 *
 * When the program is running it saves its internal state in a file
 * (called saved.state.new.format) from time to time.  The reason each
 * machine should have its own directory is to avoid the saved states
 * getting mixed up!  If you want to stop the program, the best way is
 * to find it's PID (first column of the 'ps' output) and send the TERM
 * signal which causes it to save its state and exit:
 *
 *   ps x | grep ecdl
 *   kill -TERM <the PID>
 *
 * When you restart in the same directory:
 *   nohup nice -15 ../ecdl [...arguments...] >> log &
 *
 * the program will look for the saved.state file.  If no saved.state
 * is found it will generate a new state and work from there.  If one
 * is found, the program reads it, and continues where it left off.
 * Note that it would be wasteful to use a given saved state on more
 * than one CPU because work would just be duplicated.  Note also that
 * test mode does not mess with the saved state.
 */


/*== #includes =============================================================*/

/** Ansi includes. **/

#include <stdio.h>

/* For strerror() and strcmp(). */
#include <string.h>

/* For signal() and SIGTERM. */
#include <signal.h>

/* For errno. */
#include <errno.h>


/** Unix includes. **/

/* For getpid(), access() and timing. */
#include <unistd.h>

/* For timing: getrusage() and struct rusage. */
#include <sys/time.h>
#include <sys/resource.h>


/*== Types =================================================================*/

/* Abbreviation used when bit-length does not matter. */
typedef unsigned long ulong;

/* Integers. */
typedef unsigned char u8;
typedef int s32;
typedef unsigned int u32;
typedef struct { u32 hi, lo; } u64;
typedef struct { u32 b, h, m, l; } u128;

/* Polynomials over the two-element field GF(2) = Z/2Z. */
typedef struct { u32 b, h, m, l; } poly128;

/* Operating modes. */
typedef enum { Bad, Test, Mail, Alt, Batch } modeType;


/*== #defines ==============================================================*/

/*-= Stuff that you can change =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/* The place to find sendmail.  Often it's in /usr/sbin/sendmail but
 * then /usr/lib/sendmail is a link to it, so this works fairly universally.
 */
#ifndef SENDMAIL
#define SENDMAIL "/usr/lib/sendmail"
#endif


/* Name of file to write after each distinguished point or when
 * stopping due to a SIGTERM.  It is read when restarting.
 */
#ifndef SAVED_STATE_FILENAME
#define SAVED_STATE_FILENAME "saved.state.new.format"
#endif


/* Number of iterations to run "in parallel".  This is done so that
 * PARAL inversions can be replaced by one inversion and 3*PARAL-3 mults.
 * Try something roughly in the range 8 to 20.
 */
#ifndef PARAL
#define PARAL (16)
#endif


/* Define PROD to choose different implementations of the critical
 * GF2Product...() and product() functions.  The default is 1.
 * Try 2 or 3, they might be faster.  Then again maybe not!
 */
#ifndef PROD
#define PROD 1
#endif

#if PROD < 1 || PROD > 3
#error PROD must be defined as 1, 2 or 3.
#endif


/* Keyword for giving "inline" hint.
 * Adjust for your compiler or leave it out.
 */
#if defined(__GNUC__) || defined(__DECC)
#define INLINE __inline
#else
#define INLINE
#endif


/*-= Stuff that must not be changed =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/* Version 0.x.x: March 1998
 * Version 1.0.0: September 1998
 * - Does 340 k iterations per second with "gcc -O2" + Linux + 500 MHz Alpha.
 * - Tried Ari's sqrt(2) trick, took it out again (real speedup was too
 *   small to be worth the extra complexity).
 * Version 1.1.0: July 1999
 * - Various minor clean-ups.
 * Version 1.2.0: August 1999
 * - Changed saved state file to much better format.
 * Version 1.2.1: August 1999
 * - Cosmetic tweaks.
 * - Changed alternative email address to ecdl2-97@rupture.net.
 */
#define CLIENT "ECDL32"
#define VERSION "121"


/* These are used a few places for types poly128 and u128. */
#define ZERO { 0, 0, 0, 0 }
#define ONE { 0, 0, 0, 1 }


/* The number of cases used in pseudo-random function is 2^CASES_SHIFT.
 * One case is a doubling, the rest are additions of constant points.
 */
#define CASES_SHIFT (4)
#define ADDERS ((1U<<CASES_SHIFT)-1)


/* Shifts used to decide if a point is distinguished. */
#define TEST_SHIFT (20)
#define DIST_SHIFT (30)


/*-= Data defining which ECDL problem to solve =-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/* Field is GF(2^97) = (Z/2Z)[t] / (t^97+t^6+1)
 *   (t^97+t^6+1 is irreducible in (Z/2Z)[t]).
 */

/* Curve from Certicom ECC2-97 problem is y^2 + x*y = x^3 + a*x^2 + b
 * over GF(2^97), where:
 * a = 151759946635783345666111607832
 * b =  46586505053717239594563606920
 *
 * Group order is G = 2 * 79228162514264464603828067969.
 * Use subgroup of order G/2 (prime).
 */
#define A       { 1, 0xEA5CE2B7, 0xF0A58E01, 0xB4389418 }
#define B       { 0, 0x9687742B, 0x6329E706, 0x80231988 }
#define ORDER   { 1, 0x00000000, 0x00007383, 0xE2DE1E81 }

/* Points from Certicom ECC2-97 problem:
 * XP = 46059744771359721362870909225
 * YP = 98068469436517043337972206208
 * XQ = 72978944960364674557961530030
 * YQ = 96375666913718821189526238111
 */
#define XP      { 0, 0x94D3BA57, 0x43058882, 0x62363929 }
#define YP      { 1, 0x3CE0562C, 0xC51EF416, 0xB2C0CA80 }
#define XQ      { 0, 0xEBCEC4B5, 0x961C75DF, 0x04EB6AAE }
#define YQ      { 1, 0x3768154C, 0x2131B67B, 0x39A0339F }


/*== Function declarations =================================================*/

static INLINE int ge(u128 x, u128 y);
static INLINE u128 diff(u128 x, u128 y);
static INLINE u128 sumMod(u128 x, u128 y, u128 m);
static u128 doubleMod(u128 x, u128 m);
static INLINE u128 incMod(u128 x, u128 m);
static INLINE int equal(poly128 x, poly128 y);
static INLINE poly128 xor(poly128 x, poly128 y);
static INLINE u32 topBit(u32 x);

static modeType parse(int argc, char *argv[]);
void catchSigTerm(int sigNum);
int main(int argc, char *argv[]);

static poly128 square(poly128 x);

#if PROD == 1 || PROD == 2
static u32 GF2Product48x48(u32 xh, u32 xl, u32 yh, u32 yl, u32 *ph, u32 *pm);
#else
static u32 GF2Product32x32(u32 a, u32 b, u32 *ph);
#endif

static poly128 product(poly128 x, poly128 y);
static poly128 inverse(poly128 y);
static INLINE poly128 quotient(poly128 x, poly128 y);

static int ellipticDouble
  ( poly128 x, poly128 y, int z
  , poly128 *px2, poly128 *py2
  );
static int ellipticSum
  ( poly128 x1, poly128 y1, int z1, poly128 x2, poly128 y2, int z2
  , poly128 *px3, poly128 *py3
  );
static int ellipticProduct
  ( poly128 x, poly128 y, int z, u128 fac
  , poly128 *px2, poly128 *py2
  );

static void reportDistinguished
  ( modeType mode, int argc, char *argv[]
  , u64 iters, u128 u, u128 v, poly128 x, poly128 y, int z
  , u64 total, double dStart
  );
static void u32ToBytes(u32 data, u8 *p);
static u32 bytesToU32(u8 *p);
static void writeState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT, int *zT
  );
static ulong readState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT, int *zT
  );


/*== Global variables ======================================================*/

int gotSigTerm = 0;


/*== Function definitions ==================================================*/

/*-= Short little functions =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- ge --------------------------------------------------------------------*/

/* Return whether x >= y, for 0 <= x, y < 2^128. */
static INLINE int ge(u128 x, u128 y) {

  return x.b > y.b
         || x.b == y.b
            && ( x.h > y.h
                 || x.h == y.h && (x.m > y.m || x.m == y.m && x.l >= y.l)
               )
         ;
} /* end ge */


/*-- diff ------------------------------------------------------------------*/

/* Return x-y for 0 <= y <= x < 2^128. */
static INLINE u128 diff(u128 x, u128 y) {
  u32 c, d;

  c = x.l < y.l; x.l -= y.l;
  d = x.m < c; x.m -= c; c = d | x.m < y.m; x.m -= y.m;
  d = x.h < c; x.h -= c; c = d | x.h < y.h; x.h -= y.h;
  x.b -= c; x.b -= y.b;

  return x;
} /* end diff */


/*-- sumMod ----------------------------------------------------------------*/

/* Return sum x+y modulo m, 0 <= x, y < m <= 2^127. */
static INLINE u128 sumMod(u128 x, u128 y, u128 m) {
  u32 c;

  x.l += y.l; c = x.l < y.l;
  x.m += c; c = x.m < c; x.m += y.m; c |= x.m < y.m;
  x.h += c; c = x.h < c; x.h += y.h; c |= x.h < y.h;
  x.b += c; x.b += y.b;

  if (ge(x, m)) x = diff(x, m);

  return x;
} /* end sumMod */


/*-- doubleMod -------------------------------------------------------------*/

/* Return double x<<1 modulo m, where 0 <= x < m <= 2^127. */
static u128 doubleMod(u128 x, u128 m) {

  x.b <<= 1; x.b |= x.h>>31;
  x.h <<= 1; x.h |= x.m>>31;
  x.m <<= 1; x.m |= x.l>>31;
  x.l <<= 1;

  if (ge(x, m)) x = diff(x, m);

  return x;
} /* end doubleMod */


/*-- incMod ----------------------------------------------------------------*/

/* Increment x modulo m, for 0 <= x < m < 2^128. */
static INLINE u128 incMod(u128 x, u128 m) {
  const u128 intZero = ZERO;

  if (!++x.l) if (!++x.m) if (!++x.h) ++x.b;

  if (x.l == m.l && x.m == m.m && x.h == m.h && x.b == m.b) x = intZero;

  return x;
} /* end incMod */


/*-- equal -----------------------------------------------------------------*/

/* Return whether x == y, for polys over Z/2Z. */
static INLINE int equal(poly128 x, poly128 y) {

  return x.l == y.l && x.m == y.m && x.h == y.h && x.b == y.b;
} /* end equal */


/*-- xor -------------------------------------------------------------------*/

/* Return x XOR y, in other words add them as polys over Z/2Z. */
static INLINE poly128 xor(poly128 x, poly128 y) {

  x.l ^= y.l; x.m ^= y.m; x.h ^= y.h; x.b ^= y.b;

  return x;
} /* end xor */


/*-- topBit ----------------------------------------------------------------*/

/* Returns top bit of x, or 0 if x = 0. */
static INLINE u32 topBit(u32 x) {

  x |= x>>1; x |= x>>2; x |= x>>4; x |= x>>8; x |= x>>16;

  return x ^ x>>1;
} /* end topBit */


/*-- parse -----------------------------------------------------------------*/

/* Parse command line and return mode Bad, Test, Mail, Alt or Batch. */
static modeType parse(int argc, char *argv[]) {
  int i;
  char ch, *str;
  modeType mode;

  if (argc != 7 && argc != 8) return Bad;

  str = argv[1];

  if (!strcmp(str, "test")) mode = Test;
  else if (!strcmp(str, "mail")) mode = Mail;
  else if (!strcmp(str, "alt")) mode = Alt;
  else if (!strcmp(str, "batch")) mode = Batch;
  else return Bad;

  for (i = 2; i < argc; ++i) {
    for (str = argv[i]; (ch = *str) != '\0'; ++str) {
      if (ch == '|') return Bad;
    } /* end for */
  } /* end for */

  return mode;
} /* end parse */


/*-- catchSigTerm ----------------------------------------------------------*/

/* Signal catcher for SIGTERM.
 * Allows program to stop cleanly and save state.
 * User can send the signal by "kill -TERM <pid>", for instance.
 */
void catchSigTerm(int sigNum) {

  gotSigTerm = 1;

} /* end catchSigTerm */


/*-- main ------------------------------------------------------------------*/

int main(int argc, char *argv[]) {
  /* Constant stuff. */
  const int zP = 1, zQ = 1;
  const u128 order = ORDER;
  const poly128 xP = XP, yP = YP, xQ = XQ, yQ = YQ;
  const poly128 zero = ZERO, one = ONE;

  /* These get initialised once and for all. */
  int distShift;
  modeType mode;

  /* These are variable. */
  u64 total;
  double dStart; /* For timing. */

  int zA[ADDERS];
  u128 uA[ADDERS];
  poly128 xA[ADDERS], yA[ADDERS];

  int zT[PARAL];
  u64 itersT[PARAL];
  u128 uT[PARAL], vT[PARAL];
  poly128 denT[PARAL], xT[PARAL], yT[PARAL];


  /** Simple sanity check. **/

  if (sizeof(u32) != 4) {
    puts("Fatal error: *** the size of u32 is wrong.  Stopping. ***");
    return 1;
  } /* end if */


  /** See if we can find sendmail with execute permission. **/

  if (access(SENDMAIL, X_OK)) {
    printf( "Warning: sendmail program " SENDMAIL " does not exist or is not "
              "executable\n"
            "         (errno = %d: %s).  Continuing anyway...\n"
            , errno, strerror(errno)
            );
    fflush(stdout);
  } /* end if */


  /** Parse command line. **/

  mode = parse(argc, argv);
  if (mode == Bad) {
    printf( "Syntax: %s <mode> <user> <team> <machine> <hardware> <OS> "
              "{ <comment> }\n"
            "See top of source code or readMe file for details.  Stopping.\n"
          , argv[0]
          );
    return 1;
  } /* end if */


  /** Put up banner. **/

  printf( CLIENT ", version: " VERSION ".\n"
          "Mode ...... %s\n"
          "User ...... %s\n"
          "Team ...... %s\n"
          "Machine ... %s\n"
          "Hardware .. %s\n"
          "OS ........ %s\n"
          "Comment ... %s\n"
        , argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]
        , argc == 8 ? argv[7] : "none"
        );

  fflush(stdout);


  /** Initialisation. **/

  /* Pseudo-random points to add.  Do not change. */
  { ulong i;
    u32 startu;

    /* Points have same large prime order as P since 0 < fac < order. */
    startu = 1;
    for (i = 0; i < ADDERS; ++i) {
      u128 fac;
      const u32 pi = 3141592653U, e = 2718281829U; /* Rounded to odd. */

      /* NB: not terribly random, but it will do! */
      startu = startu*pi+e; fac.l = startu;
      startu = startu*pi+e; fac.m = startu;
      startu = startu*pi+e; fac.h = startu;
      fac.b = 0;
      uA[i] = fac;
      zA[i] = ellipticProduct(xP, yP, zP, fac, xA+i, yA+i);
    } /* end for */

  } /* end block */

  /* Read starting points from the saved state or generate new random ones. */
  { ulong i;

    if (mode == Test) i = 0;
    else i = readState(itersT, uT, vT, xT, yT, zT);

    if (i < PARAL) {
      u128 seed;

      printf("Generating %lu new starting points.\n", PARAL-i);
      fflush(stdout);

      if (mode == Test) {
        /* Fixed seed for testing. */
        seed.l = 0x89ABCDEF;
        seed.m = 0x01234567;
        seed.h = 0;
        seed.b = 0;
      } else {
        FILE *handle;

        /* Get really random seed. */
        handle = fopen("/dev/urandom", "r");
        if (handle) { /* Otherwise seed is stack junk which is fine! */
          fread((void *)&seed, sizeof(seed), (size_t)1, handle);
          fclose(handle);
        } /* end if */
        seed.l ^= time(NULL);
        seed.h ^= getpid();
        seed.b = 0;
      } /* end if/else */

    /* Generate random points. */
      for (; i < PARAL; ++i) {
        u32 c;
        const u128 intZero = ZERO;
        /* Euler gamma times 2^96. */
        const u128 gamma = { 0, 0x93C467E3, 0x7DB0C7A4, 0xD1BE3F81 };

        itersT[i].hi = itersT[i].lo = 0;
        uT[i] = intZero; vT[i] = seed;
        zT[i] = ellipticProduct(xQ, yQ, zQ, seed, xT+i, yT+i);

        seed.l += gamma.l; c = seed.l < gamma.l;
        seed.m += c; c = seed.m < c; seed.m += gamma.m; c |= seed.m < gamma.m;
        seed.h += c; c = seed.h < c; seed.h += gamma.h;
        /* NB: seed.b stays at 0. */
      } /* end for (i) */
    } /* end if */
  } /* end block */


  /* Timing (measured in seconds of user time for this process). */
  { struct rusage ru;

    getrusage(RUSAGE_SELF, &ru);
    dStart = (double)ru.ru_utime.tv_sec+(double)ru.ru_utime.tv_usec*0.000001;
  } /* end block */


  /** Main loop. **/

  /* Sketch of main loop structure:
   * for ever {
   *   for (all PARAL points) {
   *     while (this one is distinguished) { output it; replace by new one; }
   *   }
   *   for (all PARAL points) {
   *     get denominator for this one;
   *   }
   *   get inverses of all PARAL denominators at once;
   *   for (all PARAL points) {
   *     apply pseudo-random function to this one using inverse(denominator);
   *   }
   * }
   */


  puts("Computing iterations...");
  fflush(stdout);

  distShift = 32-(mode == Test ? TEST_SHIFT : DIST_SHIFT);

  signal(SIGTERM, catchSigTerm);

  total.hi = total.lo = 0;
  while (!gotSigTerm) {
    ulong i;

    /** Check for distinguished points. **/
    for (i = 0; i < PARAL; ++i) {
      while (yT[i].m>>distShift == 0) { /* Point is distinguished. */

        /* Check that everything is OK i.e., point is equal to [u]P+[v]Q. */
        int z, zu, zv;
        poly128 x, xu, xv, y, yu, yv;

        zu = ellipticProduct(xP, yP, zP, uT[i], &xu, &yu);
        zv = ellipticProduct(xQ, yQ, zQ, vT[i], &xv, &yv);
        z = ellipticSum(xu, yu, zu, xv, yv, zv, &x, &y);

        if (!equal(x, xT[i]) || !equal(y, yT[i]) || z != zT[i]) {
          puts("Fatal error: *** bad point detected! ***");
          fflush(stdout);
          return 2;
        } /* end if */

        reportDistinguished( mode, argc, argv
                           , itersT[i], uT[i], vT[i], xT[i], yT[i], zT[i]
                           , total, dStart
                           );

        /* Restart this point by incrementing v. */
        itersT[i].hi = itersT[i].lo = 0;
        vT[i] = incMod(vT[i], order);
        zT[i] = ellipticSum(xT[i], yT[i], zT[i], xQ, yQ, zQ, xT+i, yT+i);

        /** Save state (except when testing). **/
        if (mode != Test) writeState(itersT, uT, vT, xT, yT, zT);
      } /* end while (distinguished point) */
    } /* end for (i) */

    /** Get denominators for additions and doublings of points. **/
    for (i = 0; i < PARAL; ++i) {
      ulong m;
      poly128 den, x1;

      x1 = xT[i];
      m = x1.m>>(32-CASES_SHIFT);
      if (m < ADDERS) { /* Addition of constant point. */
        den = xA[m];
        if (!equal(den, x1)) den = xor(den, x1);
        uT[i] = sumMod(uT[i], uA[m], order);
      } else { /* Doubling. */
        den = x1;
        uT[i] = doubleMod(uT[i], order);
        vT[i] = doubleMod(vT[i], order);
      } /* end if/else */

      if (equal(den, zero)) den = one; /* Dummy to avoid division by 0. */
      denT[i] = den;
    } /* end for (i) */


    /** Invert PARAL denominators with 1 inversion and 3*PARAL-3 mults. **/
    { poly128 ix, prod, q, prodT[PARAL];

      prod = denT[0];
      for (i = 1; i < PARAL; ++i) {
        prodT[i] = prod;
        prod = product(prod, denT[i]);
      } /* end for */

      q = inverse(prod);

      for (i = PARAL; --i; ) {
        ix = product(q, prodT[i]);
        q = product(q, denT[i]);
        denT[i] = ix;
      } /* end for */
      denT[0] = q;

    } /* end block */


    /** Get new points. **/
    for (i = 0; i < PARAL; ++i) {
      int z1, z2, nz;
      ulong m;
      poly128 nx,ny, x1,y1, x2,y2;
      const poly128 a = A;

      /* Get points to add, (x1:y1:z1) and (x2:y2:z2). */
      x1 = xT[i]; y1 = yT[i]; z1 = zT[i];

      m = x1.m>>(32-CASES_SHIFT);
      if (m < ADDERS) { x2 = xA[m]; y2 = yA[m]; z2 = zA[m]; }
      else { x2 = x1; y2 = y1; z2 = z1; }


      /* Get their sum as a new point P = (nx:ny:nz) using denT[] table. */
      if (!z1) { nx = x2; ny = y2; nz = z2; }
      else if (!z2) { nx = x1; ny = y1; nz = z1; }
      else if (equal(x1, x2)) {
        if (equal(y1, y2)) { /* Doubling. */
          poly128 lam = xor(product(y1, denT[i]), x1);

          nx = xor(xor(square(lam), lam), a);
          ny = xor(xor(product(lam, xor(nx, x1)), nx), y1);
          nz = 1;
        } else { nx = zero; ny = one; nz = 0; }
      } else { /* General case. */
        poly128 lam = product(xor(y1, y2), denT[i]);
        poly128 t = xor(xor(xor(square(lam), lam), x2), a);

        nx = xor(t, x1);
        ny = xor(xor(product(lam, t), nx), y1);
        nz = 1;
      } /* end if/else if/else if/else */

      xT[i] = nx; yT[i] = ny; zT[i] = nz;
      ++itersT[i].lo; itersT[i].hi += !itersT[i].lo;

    } /* end for (i) */

    total.lo += PARAL;
    total.hi += total.lo < PARAL;
  } /* end main loop */

  puts("Received SIGTERM.");


  /** Save state after receiving SIGTERM (except when testing). **/
  if (mode != Test) writeState(itersT, uT, vT, xT, yT, zT);


  puts("Bye!");
  return 0;
} /* end main */


/*-= Arithmetic in the field GF(2^97) =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- square ----------------------------------------------------------------*/

/* Square x in the field GF(2^97) i.e., as a poly in (Z/2Z)[t]
 * reduced modulo t^97+t^6+1.  Degree x < 97.
 */
static poly128 square(poly128 x) {
  static const u32 tab[256] =
  {     0,     1,     4,     5,    16,    17,    20,    21
  ,    64,    65,    68,    69,    80,    81,    84,    85
  ,   256,   257,   260,   261,   272,   273,   276,   277
  ,   320,   321,   324,   325,   336,   337,   340,   341
  ,  1024,  1025,  1028,  1029,  1040,  1041,  1044,  1045
  ,  1088,  1089,  1092,  1093,  1104,  1105,  1108,  1109
  ,  1280,  1281,  1284,  1285,  1296,  1297,  1300,  1301
  ,  1344,  1345,  1348,  1349,  1360,  1361,  1364,  1365
  ,  4096,  4097,  4100,  4101,  4112,  4113,  4116,  4117
  ,  4160,  4161,  4164,  4165,  4176,  4177,  4180,  4181
  ,  4352,  4353,  4356,  4357,  4368,  4369,  4372,  4373
  ,  4416,  4417,  4420,  4421,  4432,  4433,  4436,  4437
  ,  5120,  5121,  5124,  5125,  5136,  5137,  5140,  5141
  ,  5184,  5185,  5188,  5189,  5200,  5201,  5204,  5205
  ,  5376,  5377,  5380,  5381,  5392,  5393,  5396,  5397
  ,  5440,  5441,  5444,  5445,  5456,  5457,  5460,  5461
  , 16384, 16385, 16388, 16389, 16400, 16401, 16404, 16405
  , 16448, 16449, 16452, 16453, 16464, 16465, 16468, 16469
  , 16640, 16641, 16644, 16645, 16656, 16657, 16660, 16661
  , 16704, 16705, 16708, 16709, 16720, 16721, 16724, 16725
  , 17408, 17409, 17412, 17413, 17424, 17425, 17428, 17429
  , 17472, 17473, 17476, 17477, 17488, 17489, 17492, 17493
  , 17664, 17665, 17668, 17669, 17680, 17681, 17684, 17685
  , 17728, 17729, 17732, 17733, 17744, 17745, 17748, 17749
  , 20480, 20481, 20484, 20485, 20496, 20497, 20500, 20501
  , 20544, 20545, 20548, 20549, 20560, 20561, 20564, 20565
  , 20736, 20737, 20740, 20741, 20752, 20753, 20756, 20757
  , 20800, 20801, 20804, 20805, 20816, 20817, 20820, 20821
  , 21504, 21505, 21508, 21509, 21520, 21521, 21524, 21525
  , 21568, 21569, 21572, 21573, 21584, 21585, 21588, 21589
  , 21760, 21761, 21764, 21765, 21776, 21777, 21780, 21781
  , 21824, 21825, 21828, 21829, 21840, 21841, 21844, 21845
  };

  u32 t0,t1,t2,t3,t4,t5, w;

  w = x.l;
  t0 = tab[w & 255] | tab[w>>8 & 255]<<16;
  t1 = tab[w>>16 & 255] | tab[w>>24]<<16;

  w = x.m;
  t2 = tab[w & 255] | tab[w>>8 & 255]<<16;
  t3 = tab[w>>16 & 255] | tab[w>>24]<<16;

  w = x.h;
  t4 = tab[w & 255] | tab[w>>8 & 255]<<16;
  t5 = tab[w>>16 & 255] | tab[w>>24]<<16;

  /* Handle 97th bit. */
  w = x.b; t3 ^= w<<5; t2 ^= w<<31;

  /* Reduce modulo t^97+t^6+1. */
  t1 ^= t5<<31; t2 ^= t5>>1;
  t2 ^= t5<<5; t3 ^= t5>>27;

  t0 ^= t4<<31; t1 ^= t4>>1;
  t1 ^= t4<<5; t2 ^= t4>>27;

  w = t3>>1; t3 &= 1;
  t0 ^= w; t0 ^= w<<6; t1 ^= w>>26;

  { poly128 r;
    r.l = t0; r.m = t1; r.h = t2; r.b = t3;
    return r;
  } /* end block */
} /* end square */


#if PROD == 1
#undef MUNGE

/*-- GF2Product48x48 -------------------------------------------------------*/

/* Multiply y by low 48 bits of x, as polys over Z/2Z, degree y < 48.
 * Speed-critical auxiliary function used for product().
 * Returns low 32 bits of result, puts middle 32 and high 31 bits in *pm, *ph.
 * Hi NSA dudes! -- Rob.
 */
static u32 GF2Product48x48(u32 xh, u32 xl, u32 yh, u32 yl, u32 *ph, u32 *pm) {
  u32 tab[32], w0, w1;

  { u32 yh2, yh4, yh8;

    yh2 = yh<<1 ^ yl>>31;
    yh4 = yh<<2 ^ yl>>30;
    yh8 = yh<<3 ^ yl>>29;

#define STASH(t) do { tab[2*t] = w0; tab[2*t+1] = w1; } while (0)

    /* Gray code walk through table. */
    /* 0000 */ w0 = 0;          w1 = 0;         STASH(0);
    /* 0001 */ w0 = yl;         w1 = yh;        STASH(1);
    /* 0011 */ w0 = yl ^ yl<<1; w1 = yh ^ yh2;  STASH(3);
    /* 0010 */ w0 = yl<<1;      w1 = yh2;       STASH(2);
    /* 0110 */ w0 ^= yl<<2;     w1 = yh2 ^ yh4; STASH(6);
    /* 0111 */ w0 ^= yl;        w1 ^= yh;       STASH(7);
    /* 0101 */ w0 = yl ^ yl<<2; w1 = yh ^ yh4;  STASH(5);
    /* 0100 */ w0 = yl<<2;      w1 = yh4;       STASH(4);
    /* 1100 */ w0 ^= yl<<3;     w1 = yh4 ^ yh8; STASH(12);
    /* 1101 */ w0 ^= yl;        w1 ^= yh;       STASH(13);
    /* 1111 */ w0 ^= yl<<1;     w1 ^= yh2;      STASH(15);
    /* 1110 */ w0 ^= yl;        w1 ^= yh;       STASH(14);
    /* 1010 */ w0 ^= yl<<2;     w1 = yh2 ^ yh8; STASH(10);
    /* 1011 */ w0 ^= yl;        w1 ^= yh;       STASH(11);
    /* 1001 */ w0 = yl ^ yl<<3; w1 = yh ^ yh8;  STASH(9);
    /* 1000 */ w0 = yl<<3;      w1 = yh8;       STASH(8);

#undef STASH
  } /* end block */

  { u32 ah,al, bh,bl, ch,cl, t;

#define GRAB do { w0 = tab[2*t]; w1 = tab[2*t+1]; } while (0)

    t = xl & 15;     GRAB; al = w0;      ah = w1;
    t = xl>>4 & 15;  GRAB; al ^= w0<<4;  ah ^= w0>>28; ah ^= w1<<4;
    t = xl>>8 & 15;  GRAB; al ^= w0<<8;  ah ^= w0>>24; ah ^= w1<<8;
    t = xl>>12 & 15; GRAB; al ^= w0<<12; ah ^= w0>>20; ah ^= w1<<12;

    t = xl>>16 & 15; GRAB; bl = w0;      bh = w1;
    t = xl>>20 & 15; GRAB; bl ^= w0<<4;  bh ^= w0>>28; bh ^= w1<<4;
    t = xl>>24 & 15; GRAB; bl ^= w0<<8;  bh ^= w0>>24; bh ^= w1<<8;
    t = xl>>28 & 15; GRAB; bl ^= w0<<12; bh ^= w0>>20; bh ^= w1<<12;

    t = xh & 15;     GRAB; cl = w0;      ch = w1;
    t = xh>>4 & 15;  GRAB; cl ^= w0<<4;  ch ^= w0>>28; ch ^= w1<<4;
    t = xh>>8 & 15;  GRAB; cl ^= w0<<8;  ch ^= w0>>24; ch ^= w1<<8;
    t = xh>>12 & 15; GRAB; cl ^= w0<<12; ch ^= w0>>20; ch ^= w1<<12;

#undef GRAB

    *ph = bh>>16 ^ ch;
    *pm = ah ^ bl>>16 ^ bh<<16 ^ cl;
    return al ^ bl<<16;
  } /* end block */
} /* end GF2Product48x48 */

#elif PROD == 2
#define MUNGE

/*-- GF2Product48x48 -------------------------------------------------------*/

/* Multiply y by low 48 bits of x, as polys over Z/2Z, degree y < 48.
 * Input y must be pre-munged by yh = yh<<16 ^ yl>>16;
 * Speed-critical auxiliairy function used for product().
 * Returns low 32 bits of result, puts middle 32 and high 31 bits in *pm, *ph.
 */
static u32 GF2Product48x48(u32 xh, u32 xl, u32 yh, u32 yl, u32 *ph, u32 *pm) {
  u32 rh, rm, rl;

  rh = 0; rm = 0;
  if (xl & 1) { rh = yh>>16; rm = yl; }

#define STEP(n) if (xl & (1U<<(n))) { rh ^= yh>>(16-(n)); rm ^= yl<<(n); }
  STEP(1); STEP(2); STEP(3);
  STEP(4); STEP(5); STEP(6); STEP(7);
  STEP(8); STEP(9); STEP(10); STEP(11);
  STEP(12); STEP(13); STEP(14); STEP(15);
#undef STEP

  rl = rm<<16; rm >>= 16; rm ^= rh<<16; rh >>= 16;

#define STEP(n) if (xl & (65536U<<(n))) { rh ^= yh>>(16-(n)); rm ^= yl<<(n); }
  STEP(0); STEP(1); STEP(2); STEP(3);
  STEP(4); STEP(5); STEP(6); STEP(7);
  STEP(8); STEP(9); STEP(10); STEP(11);
  STEP(12); STEP(13); STEP(14); STEP(15);
#undef STEP

  rl >>= 16; rl ^= rm<<16; rm >>= 16; rm ^= rh<<16; rh >>= 16;

#define STEP(n) if (xh & (1U<<(n))) { rh ^= yh>>(16-(n)); rm ^= yl<<(n); }
  STEP(0); STEP(1); STEP(2); STEP(3);
  STEP(4); STEP(5); STEP(6); STEP(7);
  STEP(8); STEP(9); STEP(10); STEP(11);
  STEP(12); STEP(13); STEP(14); STEP(15);
#undef STEP

  *ph = rh;
  *pm = rm;
  return rl;
} /* end GF2Product48x48 */

#else
/* PROD == 3 */

/*-- GF2Product32x32 -------------------------------------------------------*/

/* Return low 32 bits of 63-bit product of polynomials over Z/2Z
 * with degree < 32, and put top 31 bits in word at ph.
 */
static u32 GF2Product32x32(u32 a, u32 b, u32 *ph) {
  u32 h, l, y0,y1,y2;

  y2 = a<<2; y1 = y2>>1; y0 = y2>>2;

  { u32 tab[8] = { 0, y0, y1, y0 ^ y1, y2, y0 ^ y2, y1 ^ y2, y0 ^ y1 ^ y2 };

    l = 0; if ((s32)a < 0) l = b;
    h = l>>31; l <<= 1; if (a & 1U<<30) l ^= b;

    l ^= tab[b>>30];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>27 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>24 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>21 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>18 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>15 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>12 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>9 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>6 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>3 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b & 7];

    *ph = h;
    return l;
  } /* end block */

} /* end GF2Product32x32 */

#endif


#if PROD == 1 || PROD == 2

/*-- product ---------------------------------------------------------------*/

/* Product of x and y in the field GF(2^97) i.e., as polys in (Z/2Z)[t]
 * reduced modulo t^97+t^6+1.  Degree x, y < 97.
 * This version uses three 48x48-bit products.
 */
static poly128 product(poly128 x, poly128 y) {
  u32 t0,t1,t2,t3, v0,v1,v2, w0,w1,w2,w3,w4,w5, xb,xh,xm,xl, yb,yh,ym,yl;

  xh = x.h; xm = x.m; xl = x.l;
  yh = y.h; ym = y.m; yl = y.l;

#ifdef MUNGE
  /* With munging. */
  t0 = xh>>16; t1 = xm>>16 ^ xh<<16;
  t2 = yl>>16 ^ ym<<16; t3 = ym>>16 ^ yh<<16;

  v0 = GF2Product48x48(xm ^ t0, xl ^ t1, yh ^ t2, yl ^ t3, &v2, &v1);
  w0 = GF2Product48x48(xm, xl, t2, yl, &w2, &w1);
  w3 = GF2Product48x48(t0, t1, yh, t3, &w5, &w4);
#else
  /* No munging. */
  t0 = xh>>16; t1 = xh<<16 ^ xm>>16;
  t2 = yh>>16; t3 = yh<<16 ^ ym>>16;

  v0 = GF2Product48x48( xm ^ t0, xl ^ t1, (ym ^ t2) & 0x0000FFFF, yl ^ t3
                      , &v2, &v1
                      );
  w0 = GF2Product48x48(xm, xl, ym & 0x0000FFFF, yl, &w2, &w1);
  w3 = GF2Product48x48(t0, t1, t2, t3, &w5, &w4);
#endif

  v0 ^= w0; v1 ^= w1; v2 ^= w2;
  v0 ^= w3; v1 ^= w4; v2 ^= w5;

  w1 ^= v0<<16; w2 ^= v0>>16;
  w2 ^= v1<<16; w3 ^= v1>>16;
  w3 ^= v2<<16; w4 ^= v2>>16;

  /* Handle 97th bit. */
  xb = x.b;
  yb = y.b;
  if (xb) { w3 ^= yl; w4 ^= ym; w5 ^= yh; }
  if (yb) { w3 ^= xl; w4 ^= xm; w5 ^= xh; }
  t0 = xb & yb; w3 ^= t0<<5; w2 ^= t0<<31;

  /* Reduce modulo t^97+t^6+1. */
  w1 ^= w5<<31; w2 ^= w5>>1;
  w2 ^= w5<<5; w3 ^= w5>>27;

  w0 ^= w4<<31; w1 ^= w4>>1;
  w1 ^= w4<<5; w2 ^= w4>>27;

  t0 = w3>>1; w3 &= 1;
  w0 ^= t0; w0 ^= t0<<6; w1 ^= t0>>26;

  { poly128 r;
    r.b = w3; r.h = w2; r.m = w1; r.l = w0;
    return r;
  } /* end block */
} /* end product */

#else
/* PROD == 3 */

/*-- product ---------------------------------------------------------------*/

/* Product of x and y in the field GF(2^97) i.e., as polys in (Z/2Z)[t]
 * reduced modulo t^97+t^6+1.  Degree x, y < 97.
 * This version uses six 32x32-bit products.
 */
static poly128 product(poly128 x, poly128 y) {
  u32 tmp, u,uh,ul, vh,vl,v0,v1,v2,v3, w0,w1,w2,w3,w4,w5
    , xb, xh,xm,xl, xlh, yb, yh,ym,yl, ylh
    ;

  xh = x.h; xm = x.m; xl = x.l;
  yh = y.h; ym = y.m; yl = y.l;

  w0 = GF2Product32x32(xl, yl, &w1);
  v2 = GF2Product32x32(xm, ym, &w3);
  ul = GF2Product32x32(xl ^ xm, yl ^ ym, &uh);
  u = w1 ^ v2;
  w1 = u ^ ul ^ w0; w2 = u ^ uh ^ w3;

  w4 = GF2Product32x32(xh, yh, &w5);

  xlh = xl ^ xh; ylh = yl ^ yh;
  v0 = GF2Product32x32(xlh, ylh, &v1);
  ul = GF2Product32x32(xlh ^ xm, ylh ^ ym, &uh);
  v3 = w3;
  u = v1 ^ v2;
  v1 = u ^ ul ^ v0; v2 = u ^ uh ^ v3;

  vl = w2 ^ w4;
  vh = w3 ^ w5;

  w2 = vl ^ v0 ^ w0;
  w3 = vh ^ v1 ^ w1;
  w4 = vl ^ v2;
  w5 = vh ^ v3;

  /* Handle 97th bit. */
  xb = x.b;
  yb = y.b;
  if (xb) { w3 ^= yl; w4 ^= ym; w5 ^= yh; }
  if (yb) { w3 ^= xl; w4 ^= xm; w5 ^= xh; }
  tmp = xb & yb; w3 ^= tmp<<5; w2 ^= tmp<<31;

  /* Reduce modulo t^97+t^6+1. */
  w1 ^= w5<<31; w2 ^= w5>>1;
  w2 ^= w5<<5; w3 ^= w5>>27;

  w0 ^= w4<<31; w1 ^= w4>>1;
  w1 ^= w4<<5; w2 ^= w4>>27;

  tmp = w3>>1; w3 &= 1;
  w0 ^= tmp; w0 ^= tmp<<6; w1 ^= tmp>>26;

  { poly128 r;
    r.b = w3; r.h = w2; r.m = w1; r.l = w0;
    return r;
  } /* end block */
} /* end product */

#endif


/*-- inverse ---------------------------------------------------------------*/

/* Two useful macros:
 *   SHR(al, am, ah, ab) shifts ab:ah:am:al right by one bit.
 *   SHR_BIS(al, am, ah, ab) is similar but discards ab at the end.
 */

#define SHR(al, am, ah, ab) \
  do {                          \
    al >>= 1; al |= am<<31;     \
    am >>= 1; am |= ah<<31;     \
    ah >>= 1; ah |= ab<<31;     \
    ab >>= 1;                   \
  } while (0)

#define SHR_BIS(al, am, ah, ab) \
  do {                          \
    al >>= 1; al |= am<<31;     \
    am >>= 1; am |= ah<<31;     \
    ah >>= 1; ah |= ab<<31;     \
  } while (0)

/* Invert y in the field GF(2^97) i.e., as a poly in (Z/2Z)[t]
 * reduced modulo t^97+t^6+1.  Degree y < 97, y != 0.
 */
static poly128 inverse(poly128 y) {
  u32 al,am,ah,ab, bl,bm,bh,bb, t, ul,um,uh,ub, vl,vm,vh,vb;
  const u32 ml = 65, mb = 2; /* NB: mm and mh are 0 */


  /* Maintain: u.y = a and v.y = b modulo t^97+t^6+1. */

  al = y.l; am = y.m; ah = y.h; ab = y.b;
  ul = 1; um = uh = ub = 0;

  while (!(al & 1)) {
    t = ul & 1; SHR_BIS(ul, um, uh, ub); ul ^= t<<5; ub = t;
    SHR(al, am, ah, ab);
  } /* end while */

  bl = ml; bm = bh = 0; bb = mb;
  vl = vm = vh = vb = 0;

  do {
    do {
      do {
        do {

          do {
            bl ^= al; bm ^= am; bh ^= ah; bb ^= ab;
            vl ^= ul; vm ^= um; vh ^= uh; vb ^= ub;

            do {
              t = vl & 1; SHR_BIS(vl, vm, vh, vb); vl ^= t<<5; vb = t;
              SHR(bl, bm, bh, bb);
            } while (!(bl & 1));
          } while ( ab < bb
                    || ab == bb
                         && ( ah < bh
                              || ah == bh && (am < bm || am == bm && al < bl)
                            )
                  );

          if (al == bl && am == bm && ah == bh && ab == bb) break;

          do {
            al ^= bl; am ^= bm; ah ^= bh; ab ^= bb;
            ul ^= vl; um ^= vm; uh ^= vh; ub ^= vb;

            do {
              t = ul & 1; SHR_BIS(ul, um, uh, ub); ul ^= t<<5; ub = t;
              SHR_BIS(al, am, ah, ab); ab = 0;
            } while (!(al & 1));
          } while ( !bb
                    && ( bh < ah
                         || bh == ah && (bm < am || bm == am && bl < al)
                       )
                  );

        } while (al != bl);
      } while (am != bm);
    } while (ah != bh);
  } while (ab != bb);

  /* Now a (and b) must equal 1. */

  /* Reduce u modulo t^97+t^6+1. */
  t = ub>>1; ul ^= t; ul ^= t<<6; ub ^= t<<1;

  { poly128 r;
    r.l = ul; r.m = um; r.h = uh; r.b = ub;
    return r;
  } /* end block */
} /* end inverse */

#undef SHR
#undef SHR_BIS


/*-- quotient --------------------------------------------------------------*/

/* Divide x by y in the field GF(2^97) i.e., as polys in (Z/2Z)[t]
 * reduced modulo t^97+t^6+1.  Degree x, y < 97, y != 0.
 */
static INLINE poly128 quotient(poly128 x, poly128 y) {

  return product(x, inverse(y));
} /* end quotient */


/*-= Arithmetic on elliptic curve over the field =-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/*-- ellipticDouble --------------------------------------------------------*/

/* Given a point (x:y:z) on y^2 + x*y = x^3 + a*x^2 + b, this computes
 * its double (x2:y2:z2) by the group law.
 * It puts x2 in *px2, y2 in *py2 and returns z2.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticDouble
  ( poly128 x, poly128 y, int z
  , poly128 *px2, poly128 *py2
  ) {
  poly128 lam, x2,y2;
  const poly128 a = A, one = ONE, zero = ZERO;

  if (!z || equal(x, zero)) { *px2 = zero; *py2 = one; return 0; }

  lam = xor(x, quotient(y, x));
  x2 = xor(xor(square(lam), lam), a);
  y2 = xor(xor(product(lam, xor(x, x2)), x2), y);

  *px2 = x2; *py2 = y2; return 1;
} /* end ellipticDouble */


/*-- ellipticSum -----------------------------------------------------------*/

/* Given points (x1:y1:z1) and (x2:y2:z2) on y^2 + x*y = x^3 + a*x^2 + b,
 * this computes their sum (x3:y3:z3) by the group law.
 * It puts x3 in *px3, y3 in *py3 and returns z3.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticSum
  ( poly128 x1, poly128 y1, int z1, poly128 x2, poly128 y2, int z2
  , poly128 *px3, poly128 *py3
  ) {
  poly128 lam, t, x3,y3;
  const poly128 a = A;

  if (!z1) { *px3 = x2; *py3 = y2; return z2; }
  if (!z2) { *px3 = x1; *py3 = y1; return z1; }

  if (equal(x1, x2)) {
    const poly128 zero = ZERO, one = ONE;

    if (equal(y1, y2)) return ellipticDouble(x1, y1, z1, px3, py3);
    *px3 = zero; *py3 = one; return 0;
  } /* end if */

  lam = quotient(xor(y1, y2), xor(x1, x2));
  t = xor(xor(xor(square(lam), lam), x2), a);
  x3 = xor(t, x1);
  y3 = xor(xor(product(lam, t), x3), y1);

  *px3 = x3; *py3 = y3; return 1;
} /* end ellipticSum */


/*-- ellipticProduct -------------------------------------------------------*/

/* Given a point (x:y:z) on y^2 + x*y = x^3 + a*x^2 + b, this computes
 * its fac-th multiple (x2:y2:z2) by the group law.
 * It puts x2 in *px2, y2 in *py2 and returns z2.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticProduct
  ( poly128 x, poly128 y, int z, u128 fac
  , poly128 *px2, poly128 *py2
  ) {
  int z2;
  u32 f, i, m, tab[4];
  poly128 x2,y2;

  /* Get most significant 32-bit word of fac. */
  tab[0] = fac.l; tab[1] = fac.m; tab[2] = fac.h; tab[3] = fac.b;
  for (i = 3; !(f = tab[i]) && i; --i) ;

  /* Quick exit if fac was 0. */
  if (!f) {
    const poly128 zero = ZERO, one = ONE;

    *px2 = zero; *py2 = one; return 0;
  } /* end if */

  /* Run through bits of most significant word. */
  m = topBit(f);
  x2 = x; y2 = y; z2 = z;
  while (m >>= 1) {
    z2 = ellipticDouble(x2, y2, z2, &x2, &y2);
    if (f & m) z2 = ellipticSum(x, y, z, x2, y2, z2, &x2, &y2);
  } /* end while */

  /* Run through bits of less significant words. */
  while (i--) {
    f = tab[i];
    m = 1U<<31;
    do {
      z2 = ellipticDouble(x2, y2, z2, &x2, &y2);
      if (f & m) z2 = ellipticSum(x, y, z, x2, y2, z2, &x2, &y2);
      m >>= 1;
    } while (m);
  } /* end while */

  *px2 = x2; *py2 = y2; return z2;
} /* end ellipticProduct */


/*-= Various file manipulation thingies =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- reportDistinguished ---------------------------------------------------*/

/* Reporting of distinguished points back to headquarters. */
static void reportDistinguished
  ( modeType mode, int argc, char *argv[]
  , u64 iters, u128 u, u128 v
  , poly128 x, poly128 y, int z
  , u64 total, double dStart
  ) {
  char *identity;
  double dRate, dTotal, dUserTime; /* For timing. */
  const char format[] =
      "%s|i|%04X%08X|u|%X%08X%08X%08X|v|%X%08X%08X%08X|"
      "x|%X%08X%08X%08X|y|%X%08X%08X%08X|z|%X|"
      CLIENT "|" VERSION "|%s|%s|%s|%s|%s|%s\n"
  ;

  identity = mode == Test ? "TEST" : "ECC2-97";
  printf( format
        , identity, iters.hi, iters.lo
        , u.b, u.h, u.m, u.l, v.b, v.h, v.m, v.l
        , x.b, x.h, x.m, x.l, y.b, y.h, y.m, y.l, z
        , argv[2], argv[3], argv[4], argv[5], argv[6]
        , argc == 8 ? argv[7] : "none"
        );

  /* Timing. */
  { double dNow;
    struct rusage ru;

    getrusage(RUSAGE_SELF, &ru);
    dNow = (double)ru.ru_utime.tv_sec+(double)ru.ru_utime.tv_usec*0.000001;
    dUserTime = dNow-dStart;
  } /* end block */

  if (dUserTime) {
    dTotal = (double)total.hi*4294967296.0 + (double)total.lo;
    dRate = dTotal/dUserTime;
    printf( "Iterations during this run = %g at %g per second.\n"
          , dTotal, dRate
          );
  } /* end if */

  if (mode == Mail || mode == Alt) {
    FILE *handle;

    handle = popen(SENDMAIL " -t", "w");
    if (handle == NULL) {
      puts("Warning: couldn't pipe to " SENDMAIL ", send by hand!");
    } else {
      int status;

      fprintf( handle
             , "To: %s\n"
             , mode == Mail
               ? "ecdl2-97@pauillac.inria.fr"
               : "ecdl2-97@rupture.net"
             );
      fprintf( handle
             , format
             , identity, iters.hi, iters.lo
             , u.b, u.h, u.m, u.l, v.b, v.h, v.m, v.l
             , x.b, x.h, x.m, x.l, y.b, y.h, y.m, y.l, z
             , argv[2], argv[3], argv[4], argv[5], argv[6]
             , argc == 8 ? argv[7] : "none"
             );

      /* Timing. */
      if (dUserTime) {
        fprintf( handle
               , "Iterations during this run = %g at %g per second.\n"
               , dTotal, dRate
               );
      } /* end if */

      status = fflush(handle);
      if (status == EOF) {
        printf( "Warning: fflush() of pipe failed\n"
                "         (errno = %d: %s).\n"
              , errno, strerror(errno)
              );
      } /* end if */

      status = pclose(handle);
      if (status) {
        if (status == -1) {
          printf( "Warning: pclose() failed\n"
                  "         (errno = %d: %s).\n"
                , errno, strerror(errno)
                );
        } else printf("Warning: " SENDMAIL " returned status %d.\n", status);
      } /* end if */

    } /* end if/else */
  } /* end if */

  fflush(stdout);

} /* end reportDistinguished */


/*-- u32ToBytes ------------------------------------------------------------*/

static void u32ToBytes(u32 data, u8 *p) {

  p[0] = data;
  p[1] = data>>8;
  p[2] = data>>16;
  p[3] = data>>24;

} /* end u32ToBytes */


/*-- bytesToU32 ------------------------------------------------------------*/

static u32 bytesToU32(u8 *p) {

  return p[0] | (u32)p[1]<<8 | (u32)p[2]<<16 | (u32)p[3]<<24;
} /* end bytesToU32 */


/*-- writeState ------------------------------------------------------------*/

/* Saves state to file in a portable binary format. */
static void writeState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT, int *zT
  ) {
  FILE *handle;

  puts("Saving state to " SAVED_STATE_FILENAME "...");
  handle = fopen(SAVED_STATE_FILENAME, "w");
  if (handle) {
    ulong i;
    int status;
    unsigned char buf[73];

    for (i = 0; i < PARAL; ++i) {

#define OUT128(O, X) \
  u32ToBytes(X.l, buf+O); \
  u32ToBytes(X.m, buf+O+4); \
  u32ToBytes(X.h, buf+O+8); \
  u32ToBytes(X.b, buf+O+12) /* ; */

      u32ToBytes(itersT[i].lo, buf);
      u32ToBytes(itersT[i].hi, buf+4);
      OUT128(8, uT[i]);
      OUT128(24, vT[i]);
      OUT128(40, xT[i]);
      OUT128(56, yT[i]);
      buf[72] = (u8)zT[i];

#undef OUT128

      if (fwrite((void *)buf, sizeof(buf), 1, handle) != 1) {
        printf( "Warning: couldn't fwrite() to saved state file\n"
                "         (errno = %d: %s).\n"
              , errno, strerror(errno)
              );
        break;
      } /* end if */
    } /* end for */

    puts("done.");

    status = fclose(handle);
    if (status == EOF) {
      printf( "Warning: fclose() of saved state file failed\n"
              "         (errno = %d: %s).\n"
            , errno, strerror(errno)
            );
    } /* end if */

  } else printf( "Warning: couldn't fopen() saved state file for writing\n"
                 "         (errno = %d: %s).\n"
               , errno, strerror(errno)
               );

  fflush(stdout);

} /* end writeState */


/*-- readState -------------------------------------------------------------*/

/* Reads state back from file (if it exists) in a portable binary format
 * and returns number of valid items (i.e., u,v,x,y,z data) found.
 * NB: There is nothing to check for the itersT[] array.
 */
static ulong readState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT, int *zT
  ) {
  ulong i;
  FILE *handle;

  i = 0;
  handle = fopen(SAVED_STATE_FILENAME, "r");
  if (handle) {
    int status;
    unsigned char buf[73];

    for (i = 0; i < PARAL; ++i) {
      int OK;

      if (fread((void *)buf, sizeof(buf), 1, handle) != 1) {
        if (feof(handle)) puts("Warning: short saved state file.");
        else {
          printf( "Warning: couldn't fread() from saved state file\n"
                  "         (errno = %d: %s).\n"
                , errno, strerror(errno)
                );
        } /* end if/else */
        break;
      } /* end if */

#define IN128(O, X) \
  X.l = bytesToU32(buf+O); \
  X.m = bytesToU32(buf+O+4); \
  X.h = bytesToU32(buf+O+8); \
  X.b = bytesToU32(buf+O+12) /* ; */

      itersT[i].lo = bytesToU32(buf);
      itersT[i].hi = bytesToU32(buf+4);
      IN128(8, uT[i]);
      IN128(24, vT[i]);
      IN128(40, xT[i]);
      IN128(56, yT[i]);
      zT[i] = (int)buf[72];

#undef IN128

      for (OK = 0; OK < 1; ++OK) {
        int z, zu, zv;
        poly128 x,y, xu,yu, xv,yv;

        const int zP = 1, zQ = 1;
        const poly128 xP = XP, xQ = XQ, yP = YP, yQ = YQ;
        const u128 order = ORDER;

        if (ge(uT[i], order)) break;
        if (ge(vT[i], order)) break;

        if (xT[i].b > 1) break;
        if (yT[i].b > 1) break;
        if (zT[i] != 0 && zT[i] != 1) break;

        zu = ellipticProduct(xP, yP, zP, uT[i], &xu, &yu);
        zv = ellipticProduct(xQ, yQ, zQ, vT[i], &xv, &yv);
        z = ellipticSum(xu, yu, zu, xv, yv, zv, &x, &y);

        if (!equal(x, xT[i]) || !equal(y, yT[i]) || z != zT[i]) break;
      } /* end for */

      if (!OK) {
        puts("Warning: invalid starting point found in saved state!");
        break;
      } /* end if */

    } /* end for */

    printf( "Read %lu of %lu starting points from saved state.\n"
          , i, (ulong)PARAL
          );

    status = fclose(handle);
    if (status == EOF) {
      printf( "Warning: fclose() of saved state file failed\n"
              "         (errno = %d: %s).\n"
            , errno, strerror(errno)
            );
    } /* end if */

  } else printf( "Warning: couldn't fopen() saved state file for reading\n"
                 "         (errno = %d: %s).\n"
                 "NB: 'No such file' is normal for the first run.\n"
               , errno, strerror(errno)
               );

  fflush(stdout);

  return i;
} /* end readState */


/*== end of file ecdl2-97.c ================================================*/
