If you are running ECDL2K with MMX code on FreeBSD,
you may have noticed that the speed depends on the length of the
command line...
This seems to be due to a lack of double-word alignment on the
stack. Here is a patch for ecdl2k version 1.0.1, which declares all mmx64
variables to be static, so that they are placed in the data segment
and are correctly aligned. It improves speed by about 5% (lucky case:
most data was already aligned) to 50% (bad case: essential data was
not aligned).
By the way, this does not hurt on other architectures, so even if you
are not using FreeBSD you may want to give it a try.
Regards,
Jacques
------------------------------------------------------
Jacques Garrigue, visiting INRIA from Kyoto University
Jacques.Garrigue at inria.fr
------------------------------------------------------
*** ecdl2K-108.32bit.c.orig Tue Dec 7 15:55:44 1999
--- ecdl2K-108.32bit.c Thu Dec 9 15:02:44 1999
***************
*** 1434,1440 ****
uint i;
u32 s, t;
! mmx64 yhi, ylo;
const mmx128 *p;
p = &matrix[0];
--- 1434,1440 ----
uint i;
u32 s, t;
! static mmx64 yhi, ylo;
const mmx128 *p;
p = &matrix[0];
***************
*** 1897,1903 ****
/* Two at a time. */
for ( ; n >= 2; n -= 2) {
! mmx64 yh, yl;
LOD(mm_T, tab4[xt>> 8 /* & 31 */ ]);
LOD(mm_YH, tab4[xh>>24]);
--- 1897,1903 ----
/* Two at a time. */
for ( ; n >= 2; n -= 2) {
! static mmx64 yh, yl;
LOD(mm_T, tab4[xt>> 8 /* & 31 */ ]);
LOD(mm_YH, tab4[xh>>24]);
***************
*** 1961,1967 ****
/* Last one (if n was odd). */
if (n) {
! mmx64 yh,yl;
LOD(mm_U, tab2[xt>> 8 /* & 31 */ ]);
LOD(mm_T, tab2[xh>>24]);
--- 1961,1967 ----
/* Last one (if n was odd). */
if (n) {
! static mmx64 yh,yl;
LOD(mm_U, tab2[xt>> 8 /* & 31 */ ]);
LOD(mm_T, tab2[xh>>24]);
***************
*** 2276,2282 ****
( u32 xh, u32 xl, const mmx64 *py, mmx64 *ph, mmx64 *pl
) {
u32 w;
! mmx64 tab[16];
/* Note: No START_MMX() here. It is done at start of product(). */
--- 2276,2282 ----
( u32 xh, u32 xl, const mmx64 *py, mmx64 *ph, mmx64 *pl
) {
u32 w;
! static mmx64 tab[16];
/* Note: No START_MMX() here. It is done at start of product(). */
***************
*** 2620,2632 ****
/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#ifdef MMX
! mmx64 hh,hl, lh,ll, mh,ml;
START_MMX();
{ const u32 mask = (1UL<<24)-1;
u32 xt,xh, yt,yh;
! mmx64 t;
xt = x.t<<8 | x.h>>24; xh = x.h<<8 | x.m>>24;
yt = y.t<<8 | y.h>>24; yh = y.h<<8 | y.m>>24;
--- 2620,2632 ----
/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#ifdef MMX
! static mmx64 hh,hl, lh,ll, mh,ml;
START_MMX();
{ const u32 mask = (1UL<<24)-1;
u32 xt,xh, yt,yh;
! static mmx64 t;
xt = x.t<<8 | x.h>>24; xh = x.h<<8 | x.m>>24;
yt = y.t<<8 | y.h>>24; yh = y.h<<8 | y.m>>24;
***************
*** 2691,2697 ****
SLL(mm_TMP, 1); XOR(mm_T0, mm_TMP);
SLL(mm_TMP, 7); XOR(mm_T0, mm_TMP);
! { mmx64 rh, rl;
poly128 r;
STR(rh, mm_T1); STR(rl, mm_T0);
--- 2691,2697 ----
SLL(mm_TMP, 1); XOR(mm_T0, mm_TMP);
SLL(mm_TMP, 7); XOR(mm_T0, mm_TMP);
! { static mmx64 rh, rl;
poly128 r;
STR(rh, mm_T1); STR(rl, mm_T0);
This archive was generated by hypermail 2b29 : Sat Jan 01 2000 - 15:26:57 MET