MMX optimisations

From: Robert Harley (harley@landfield.com)
Date: Tue Nov 09 1999 - 18:02:51 MET


Hi all,

Yes we are still alive here at INRIA.

The 32-bit implementation of the ECDL program is now essentially
"done" (the 64-bit one has been done for ages). All that remains is
to write some README-type documentation and then a few people can
start running it. We'll smooth out any rough edges and then look for
AMPAP: As Many People As Possible!

Some quick experiments suggest that an MMX version would be a
significant win on chips that support it. Here is a quick hack of one
critical function that gains about 30% overall. The code below is
likely to be far from optimal... Do any x86 hackers want to work on
this?

Rob.

/* The MMX wrappers below are written with GCC-style extended inline
 * assembly, so refuse to build with a compiler that does not define
 * __GNUC__.
 */
#if !defined(__GNUC__)
#error Doh!
#endif

/* One-instruction MMX wrappers (GCC extended asm, AT&T operand order:
 * source first, destination last).
 *
 * LVAL / RVAL are MMX register names given as string literals ("mm0" ...
 * "mm7"); they are pasted into the instruction text by string
 * concatenation.  MEM is a C lvalue that the compiler addresses as a memory
 * operand.  SH is an immediate shift count; it is stringified, so it must
 * be a literal token, not an expression.
 *
 * The wrappers are spelled __asm__ __volatile__ rather than asm volatile:
 * the plain `asm` keyword is not recognised by GCC under -ansi or a strict
 * -std=c99 / -std=c11, whereas the double-underscore forms always are.
 *
 * NOTE(review): none of the wrappers lists the MMX register it writes as an
 * output or clobber.  Code built from them relies on register contents
 * surviving from one asm statement to the next, and on the compiler never
 * using mm0-mm7 itself in between.
 */

/* MEM = RVAL (store 64 bits from an MMX register). */
#define STR(MEM, RVAL) __asm__ __volatile__ ("movq %%" RVAL ", %0" : "=m" (MEM) : )
/* LVAL = MEM (load 64 bits into an MMX register). */
#define LOD(LVAL, MEM) __asm__ __volatile__ ("movq %0, %%" LVAL : : "m" (MEM))
/* LVAL = RVAL (register-to-register copy). */
#define CPY(LVAL, RVAL) __asm__ __volatile__ ("movq %%" RVAL ", %%" LVAL : : )
/* LVAL <<= SH, logical shift of the whole 64-bit quantity. */
#define SLL(LVAL, SH) __asm__ __volatile__ ("psllq $" #SH ", %%" LVAL : : )
/* LVAL >>= SH, logical shift of the whole 64-bit quantity. */
#define SRL(LVAL, SH) __asm__ __volatile__ ("psrlq $" #SH ", %%" LVAL : : )
/* LVAL ^= RVAL (register operand). */
#define XOR(LVAL, RVAL) __asm__ __volatile__ ("pxor %%" RVAL ", %%" LVAL : : )
/* LVAL ^= MEM (memory operand). */
#define XOR_MEM(LVAL, MEM) __asm__ __volatile__ ("pxor %0, %%" LVAL : : "m" (MEM))
/* Execute emms; MMX code is expected to do this once it is finished, before
 * any x87 floating-point instruction runs.
 */
#define END_MMX() __asm__ __volatile__ ("emms" : : )

/* Integer aliases used by the MMX routine: u64 carries a whole polynomial
 * operand or result word, u32 carries one half of it at a time.
 * NOTE(review): the names assume unsigned int is 32 bits and unsigned long
 * long is 64 bits on the target -- confirm for any new platform.
 */
typedef unsigned long long int u64;
typedef unsigned u32;

/*-- MMX_GF2Product56x56 ---------------------------------------------------*/

/* Multiply *py by low 56 bits of *px, as polys over Z/2Z, degree *py < 56.
 * Speed-critical auxiliary function used for product().
 * Returns with low 64 bits of result in *pl and high 47 bits in *ph.
 *
 * ph, pl: outputs; stored by the last two statements before emms.
 * px:     multiplier; only bits 0..55 are consumed, bits 56..63 are never
 *         looked at.
 * py:     multiplicand; must have degree < 56 so that the shifted copies
 *         and the per-byte partial products built below still fit in 64
 *         bits.
 *
 * Method: build a 16-entry table of *py times every 4-bit polynomial, form
 * the product of *py with each of the 7 low bytes of *px from two table
 * look-ups, then XOR the 7 byte products together at byte offsets 0..6.
 *
 * NOTE(review): every step is a separate asm statement and values are
 * carried from one statement to the next in MMX registers, so the exact
 * statement order matters and nothing else may touch mm0-mm7 in between.
 */
static void MMX_GF2Product56x56
  ( u64 *ph, u64 *pl, const u64 *px, const u64 *py
  ) {
  /* x holds one 32-bit half of *px at a time. */
  u32 x;
  /* tab[i] = *py times the 4-bit polynomial whose coefficients are the
   * bits of i.
   */
  u64 tab[16];

  /* Register names for the table-building phase. */
#define mm_E "mm0"
#define mm_Y1 "mm1"
#define mm_Y2 "mm2"
#define mm_Y4 "mm3"
#define mm_Y8 "mm4"

  /* Y1 = *py; Y2, Y4, Y8 = *py shifted left by 1, 2 and 3 bits. */
  LOD(mm_Y1, *py);

  CPY(mm_Y2, mm_Y1);
  SLL(mm_Y2, 1);
  CPY(mm_Y4, mm_Y1);
  SLL(mm_Y4, 2);
  CPY(mm_Y8, mm_Y1);
  SLL(mm_Y8, 3);

  /* Gray code walk through table.
   * E starts at zero and every step XORs exactly one of Y1/Y2/Y4/Y8 into
   * it, so E passes through all 16 combinations at a cost of one pxor per
   * entry.  Where E is equal to a single shifted copy (entries 1, 2, 4, 8)
   * that copy's register is stored instead; the value is the same.
   */
  XOR(mm_E, mm_E); STR(tab[ 0], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[ 1], mm_Y1);
  XOR(mm_E, mm_Y2); STR(tab[ 3], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[ 2], mm_Y2);
  XOR(mm_E, mm_Y4); STR(tab[ 6], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[ 7], mm_E);
  XOR(mm_E, mm_Y2); STR(tab[ 5], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[ 4], mm_Y4);
  XOR(mm_E, mm_Y8); STR(tab[12], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[13], mm_E);
  XOR(mm_E, mm_Y2); STR(tab[15], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[14], mm_E);
  XOR(mm_E, mm_Y4); STR(tab[10], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[11], mm_E);
  XOR(mm_E, mm_Y2); STR(tab[ 9], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[ 8], mm_Y8);
  
#undef mm_E
#undef mm_Y1
#undef mm_Y2
#undef mm_Y4
#undef mm_Y8

  /* The table now lives in memory, so the registers used to build it are
   * free: rename all eight MMX registers for the multiply phase.
   */
#define mm_A "mm0"
#define mm_B "mm1"
#define mm_C "mm2"
#define mm_D "mm3"
#define mm_E "mm4"
#define mm_F "mm5"
#define mm_G "mm6"
#define mm_T "mm7"

  /* Low half of *px: A, B, C, D = *py times bytes 0, 1, 2, 3.  Each is
   * (table entry for the high nibble << 4) ^ (table entry for the low
   * nibble).
   */
  x = (u32)*px;
  LOD(mm_A, tab[x>> 4 & 15]); SLL(mm_A, 4); XOR_MEM(mm_A, tab[x & 15]);
  LOD(mm_B, tab[x>>12 & 15]); SLL(mm_B, 4); XOR_MEM(mm_B, tab[x>> 8 & 15]);
  LOD(mm_C, tab[x>>20 & 15]); SLL(mm_C, 4); XOR_MEM(mm_C, tab[x>>16 & 15]);
  LOD(mm_D, tab[x>>28 & 15]); SLL(mm_D, 4); XOR_MEM(mm_D, tab[x>>24 & 15]);

  /* High half of *px: E, F, G = *py times bytes 4, 5, 6.  The top byte
   * (x>>24 here) is never used, which is what limits the multiplier to its
   * low 56 bits.
   */
  x = (u32)(*px>>32);
  LOD(mm_E, tab[x>> 4 & 15]); SLL(mm_E, 4); XOR_MEM(mm_E, tab[x & 15]);
  LOD(mm_F, tab[x>>12 & 15]); SLL(mm_F, 4); XOR_MEM(mm_F, tab[x>> 8 & 15]);
  LOD(mm_G, tab[x>>20 & 15]); SLL(mm_G, 4); XOR_MEM(mm_G, tab[x>>16 & 15]);

  /* Combine: the product for byte k belongs at bit offset 8k.
   * A collects the low 64 bits: each product is shifted left by 8k and
   * XORed in; bits pushed past bit 63 are dropped by the shift.
   * T collects the bits at position 64 and above.  Instead of shifting each
   * product right by 64-8k on its own, T is shifted right after each
   * product is folded in, so the shifts accumulate: by the end B has moved
   * right 56 bits, C 48, D 40, E 32, F 24 and, because the last shift is 16
   * rather than 8, G 16.
   */
  CPY(mm_T, mm_B); SLL(mm_B, 8); SRL(mm_T, 8); XOR(mm_A, mm_B);
  XOR(mm_T, mm_C); SLL(mm_C, 16); SRL(mm_T, 8); XOR(mm_A, mm_C);
  XOR(mm_T, mm_D); SLL(mm_D, 24); SRL(mm_T, 8); XOR(mm_A, mm_D);
  XOR(mm_T, mm_E); SLL(mm_E, 32); SRL(mm_T, 8); XOR(mm_A, mm_E);
  XOR(mm_T, mm_F); SLL(mm_F, 40); SRL(mm_T, 8); XOR(mm_A, mm_F);
  XOR(mm_T, mm_G); SLL(mm_G, 48); SRL(mm_T, 16); XOR(mm_A, mm_G);
  /* High word first, then low word. */
  STR(*ph, mm_T); STR(*pl, mm_A);

#undef mm_A
#undef mm_B
#undef mm_C
#undef mm_D
#undef mm_E
#undef mm_F
#undef mm_G
#undef mm_T

  /* Execute emms now that the MMX work is finished; MMX code is expected
   * to do this before any x87 floating-point instruction runs.
   */
  END_MMX();
} /* end function MMX_GF2Product56x56 */



This archive was generated by hypermail 2b29 : Sat Jan 01 2000 - 15:26:57 MET