Hi all,
Yes, we are still alive here at INRIA.
The 32-bit implementation of the ECDL program is now essentially
"done" (the 64-bit one has been done for ages). All that remains is
to write some README-type documentation and then a few people can
start running it. We'll smooth out any rough edges and then look for
AMPAP: As Many People As Possible!
Some quick experiments suggest that an MMX version would be a
significant win on chips that support it. Here is a quick hack of one
critical function that gains about 30% overall. The code below is
likely to be far from optimal... Do any x86 hackers want to work on
this?
Rob.
/*
 * Implementation selection.
 *
 * The hand-written MMX code needs GNU-style extended asm and an x86 target.
 * With any other compiler or architecture a portable C version of the same
 * algorithm is compiled instead, so the build no longer stops with an #error
 * (or, for GCC on a non-x86 target, with an assembler error).
 */
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#define MMX_GF2_USE_ASM 1
#else
#define MMX_GF2_USE_ASM 0
#endif

#if MMX_GF2_USE_ASM
/*
 * Clobber list attached to every MMX statement below.
 *
 * The MMX registers are aliased onto the x87 register stack, and the values
 * this code keeps in mm0-mm7 between its separate asm statements are
 * invisible to the compiler.  Declaring the whole x87 stack (and the MMX
 * registers, when the compiler models them) as clobbered on every statement
 * stops the compiler from keeping a value of its own in any of those
 * registers across the sequence -- for instance a floating-point temporary
 * of a caller this static function has been inlined into.
 *
 * GCC only accepts the "mmN" clobber names when MMX code generation is
 * enabled (-mmmx or a -march that implies it), hence the __MMX__ test.
 */
#define MMX_X87_CLOBBERS \
    "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
#ifdef __MMX__
#define MMX_CLOBBERS \
    MMX_X87_CLOBBERS, "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
#else
#define MMX_CLOBBERS MMX_X87_CLOBBERS
#endif

/*
 * One MMX instruction per macro.  LVAL/RVAL are register names given as
 * string literals ("mm0" .. "mm7"), MEM is a 64-bit lvalue and SH a literal
 * shift count.  Operand order follows the macro name: the first register
 * argument of LOD/CPY/SLL/SRL/XOR/XOR_MEM is the destination.
 *
 * __asm__/__volatile__ are spelled with underscores because the plain `asm'
 * keyword is not recognised in strict ISO modes (-ansi, -std=c99, -std=c11).
 *
 * Values are carried from one statement to the next in MMX registers; this
 * relies on the compiler emitting the volatile statements in program order
 * with nothing else touching those registers in between.
 */
#define STR(MEM, RVAL) __asm__ __volatile__ ("movq %%" RVAL ", %0" : "=m" (MEM) : : MMX_CLOBBERS)
#define LOD(LVAL, MEM) __asm__ __volatile__ ("movq %0, %%" LVAL : : "m" (MEM) : MMX_CLOBBERS)
#define CPY(LVAL, RVAL) __asm__ __volatile__ ("movq %%" RVAL ", %%" LVAL : : : MMX_CLOBBERS)
#define SLL(LVAL, SH) __asm__ __volatile__ ("psllq $" #SH ", %%" LVAL : : : MMX_CLOBBERS)
#define SRL(LVAL, SH) __asm__ __volatile__ ("psrlq $" #SH ", %%" LVAL : : : MMX_CLOBBERS)
#define XOR(LVAL, RVAL) __asm__ __volatile__ ("pxor %%" RVAL ", %%" LVAL : : : MMX_CLOBBERS)
#define XOR_MEM(LVAL, MEM) __asm__ __volatile__ ("pxor %0, %%" LVAL : : "m" (MEM) : MMX_CLOBBERS)
/* Leave MMX state so that x87 floating-point code works again. */
#define END_MMX() __asm__ __volatile__ ("emms" : : : MMX_CLOBBERS)
#endif /* MMX_GF2_USE_ASM */

typedef unsigned int u32;
typedef unsigned long long u64;

/*-- MMX_GF2Product56x56 ---------------------------------------------------*/
/* Multiply *py by low 56 bits of *px, as polys over Z/2Z, degree *py < 56.
 * Speed-critical auxiliary function used for product().
 * Returns with low 64 bits of result in *pl and high 47 bits in *ph.
 *
 * ph, pl  receive the high and low halves of the product.
 * px      first factor; bits 56..63 are ignored.
 * py      second factor; bits 56..63 must be clear.  This is NOT checked:
 *         with higher bits set, the table shifts below drop bits and the
 *         result is no longer the true product.
 *
 * Both inputs are fully read before either output is written, so ph/pl may
 * point at the same objects as px/py.
 *
 * Method: tab[i] holds *py times the 4-bit polynomial i.  Two table entries
 * give the product of *py with one byte of *px (at most 63 bits); the seven
 * byte products are then XORed together at byte offsets 0..6 of a 128-bit
 * result.
 */
#if MMX_GF2_USE_ASM
static void MMX_GF2Product56x56
( u64 *ph, u64 *pl, const u64 *px, const u64 *py
) {
    u32 x;
    u64 tab[16];

    /* Phase 1: build tab[] from Y, Y<<1, Y<<2 and Y<<3. */
#define mm_E "mm0"
#define mm_Y1 "mm1"
#define mm_Y2 "mm2"
#define mm_Y4 "mm3"
#define mm_Y8 "mm4"
    LOD(mm_Y1, *py);
    CPY(mm_Y2, mm_Y1); SLL(mm_Y2, 1);
    CPY(mm_Y4, mm_Y1); SLL(mm_Y4, 2);
    CPY(mm_Y8, mm_Y1); SLL(mm_Y8, 3);
    /* Gray code walk through table: each step flips one bit of the index,
     * so every entry costs a single XOR.  Entries 1, 2, 4 and 8 are stored
     * straight from the shifted copies of Y. */
    XOR(mm_E, mm_E);  STR(tab[ 0], mm_E);
    XOR(mm_E, mm_Y1); STR(tab[ 1], mm_Y1);
    XOR(mm_E, mm_Y2); STR(tab[ 3], mm_E);
    XOR(mm_E, mm_Y1); STR(tab[ 2], mm_Y2);
    XOR(mm_E, mm_Y4); STR(tab[ 6], mm_E);
    XOR(mm_E, mm_Y1); STR(tab[ 7], mm_E);
    XOR(mm_E, mm_Y2); STR(tab[ 5], mm_E);
    XOR(mm_E, mm_Y1); STR(tab[ 4], mm_Y4);
    XOR(mm_E, mm_Y8); STR(tab[12], mm_E);
    XOR(mm_E, mm_Y1); STR(tab[13], mm_E);
    XOR(mm_E, mm_Y2); STR(tab[15], mm_E);
    XOR(mm_E, mm_Y1); STR(tab[14], mm_E);
    XOR(mm_E, mm_Y4); STR(tab[10], mm_E);
    XOR(mm_E, mm_Y1); STR(tab[11], mm_E);
    XOR(mm_E, mm_Y2); STR(tab[ 9], mm_E);
    XOR(mm_E, mm_Y1); STR(tab[ 8], mm_Y8);
#undef mm_E
#undef mm_Y1
#undef mm_Y2
#undef mm_Y4
#undef mm_Y8

    /* Phase 2: one register per byte of *px, A = byte 0 .. G = byte 6.
     * Each becomes (tab[high nibble] << 4) ^ tab[low nibble]. */
#define mm_A "mm0"
#define mm_B "mm1"
#define mm_C "mm2"
#define mm_D "mm3"
#define mm_E "mm4"
#define mm_F "mm5"
#define mm_G "mm6"
#define mm_T "mm7"
    x = (u32)*px;
    LOD(mm_A, tab[(x >>  4) & 15]); SLL(mm_A, 4); XOR_MEM(mm_A, tab[ x        & 15]);
    LOD(mm_B, tab[(x >> 12) & 15]); SLL(mm_B, 4); XOR_MEM(mm_B, tab[(x >>  8) & 15]);
    LOD(mm_C, tab[(x >> 20) & 15]); SLL(mm_C, 4); XOR_MEM(mm_C, tab[(x >> 16) & 15]);
    LOD(mm_D, tab[(x >> 28) & 15]); SLL(mm_D, 4); XOR_MEM(mm_D, tab[(x >> 24) & 15]);
    x = (u32)(*px >> 32);
    LOD(mm_E, tab[(x >>  4) & 15]); SLL(mm_E, 4); XOR_MEM(mm_E, tab[ x        & 15]);
    LOD(mm_F, tab[(x >> 12) & 15]); SLL(mm_F, 4); XOR_MEM(mm_F, tab[(x >>  8) & 15]);
    LOD(mm_G, tab[(x >> 20) & 15]); SLL(mm_G, 4); XOR_MEM(mm_G, tab[(x >> 16) & 15]);

    /* Phase 3: result = A ^ B<<8 ^ C<<16 ^ ... ^ G<<48 over 128 bits.
     * mm_A collects the low 64 bits.  mm_T collects the bits that fall off
     * the top: it takes in each unshifted byte product and is shifted right
     * as it goes, ending as
     *   B>>56 ^ C>>48 ^ D>>40 ^ E>>32 ^ F>>24 ^ G>>16. */
    CPY(mm_T, mm_B); SLL(mm_B,  8); SRL(mm_T,  8); XOR(mm_A, mm_B);
    XOR(mm_T, mm_C); SLL(mm_C, 16); SRL(mm_T,  8); XOR(mm_A, mm_C);
    XOR(mm_T, mm_D); SLL(mm_D, 24); SRL(mm_T,  8); XOR(mm_A, mm_D);
    XOR(mm_T, mm_E); SLL(mm_E, 32); SRL(mm_T,  8); XOR(mm_A, mm_E);
    XOR(mm_T, mm_F); SLL(mm_F, 40); SRL(mm_T,  8); XOR(mm_A, mm_F);
    XOR(mm_T, mm_G); SLL(mm_G, 48); SRL(mm_T, 16); XOR(mm_A, mm_G);
    STR(*ph, mm_T); STR(*pl, mm_A);
#undef mm_A
#undef mm_B
#undef mm_C
#undef mm_D
#undef mm_E
#undef mm_F
#undef mm_G
#undef mm_T
    END_MMX();
} /* end function MMX_GF2Product56x56 */
#else /* !MMX_GF2_USE_ASM */
/* Portable version: same table, same byte products, same recombination as
 * the MMX code, using ordinary 64-bit integer arithmetic. */
static void MMX_GF2Product56x56
( u64 *ph, u64 *pl, const u64 *px, const u64 *py
) {
    const u64 xv = *px;
    const u64 yv = *py;
    u64 tab[16];
    u64 hi = 0;
    u64 lo = 0;
    unsigned i, k;

    /* tab[i] = yv times the 4-bit polynomial i. */
    tab[0] = 0;
    tab[1] = yv;
    for (i = 2; i < 16; i++) {
        tab[i] = tab[i >> 1] << 1;
        if (i & 1) {
            tab[i] ^= yv;
        }
    }

    /* Bytes 0..6 of xv; byte 7 is ignored. */
    for (k = 0; k < 7; k++) {
        const u64 lo_nib = tab[(xv >> (8 * k)) & 15];
        const u64 hi_nib = tab[(xv >> (8 * k + 4)) & 15];
        const u64 part = (hi_nib << 4) ^ lo_nib;

        lo ^= part << (8 * k);
        if (k != 0) {
            /* Bits of this byte product that land above bit 63. */
            hi ^= part >> (64 - 8 * k);
        }
    }

    *ph = hi;
    *pl = lo;
} /* end function MMX_GF2Product56x56 */
#endif /* MMX_GF2_USE_ASM */
This archive was generated by hypermail 2b29 : Sat Jan 01 2000 - 15:26:57 MET