If you are running ECDL2K with MMX code on FreeBSD,
you may have noticed that the speed depends on the length of the
command line...
This seems to be due to a lack of double-word alignment on the
stack. Here is a patch for ecdl2k version 1.0.1, which declares all mmx64
variables to be static, so that they will appear in the data segment
and be correctly aligned. It improves speed by about 5% (lucky case,
most data was already aligned) to 50% (bad case, essential data was
not aligned).
By the way, this does not hurt on other platforms, so even if you
are not using FreeBSD you may want to give it a try.
Regards,
Jacques
------------------------------------------------------
Jacques Garrigue, visiting INRIA from Kyoto University
                          Jacques.Garrigue at inria.fr
------------------------------------------------------
*** ecdl2K-108.32bit.c.orig	Tue Dec  7 15:55:44 1999
--- ecdl2K-108.32bit.c	Thu Dec  9 15:02:44 1999
***************
*** 1434,1440 ****
  
    uint i;
    u32 s, t;
!   mmx64 yhi, ylo;
    const mmx128 *p;
  
    p = &matrix[0];
--- 1434,1440 ----
  
    uint i;
    u32 s, t;
!   static mmx64 yhi, ylo;
    const mmx128 *p;
  
    p = &matrix[0];
***************
*** 1897,1903 ****
  
    /* Two at a time. */
    for ( ; n >= 2; n -= 2) {
!     mmx64 yh, yl;
  
      LOD(mm_T, tab4[xt>> 8 /* & 31 */ ]);
      LOD(mm_YH, tab4[xh>>24]);
--- 1897,1903 ----
  
    /* Two at a time. */
    for ( ; n >= 2; n -= 2) {
!     static mmx64 yh, yl;
  
      LOD(mm_T, tab4[xt>> 8 /* & 31 */ ]);
      LOD(mm_YH, tab4[xh>>24]);
***************
*** 1961,1967 ****
  
    /* Last one (if n was odd). */
    if (n) {
!     mmx64 yh,yl;
  
      LOD(mm_U, tab2[xt>> 8 /* & 31 */ ]);
      LOD(mm_T, tab2[xh>>24]);
--- 1961,1967 ----
  
    /* Last one (if n was odd). */
    if (n) {
!     static mmx64 yh,yl;
  
      LOD(mm_U, tab2[xt>> 8 /* & 31 */ ]);
      LOD(mm_T, tab2[xh>>24]);
***************
*** 2276,2282 ****
    ( u32 xh, u32 xl, const mmx64 *py, mmx64 *ph, mmx64 *pl
    ) {
    u32 w;
!   mmx64 tab[16];
  
    /* Note: No START_MMX() here.  It is done at start of product(). */
  
--- 2276,2282 ----
    ( u32 xh, u32 xl, const mmx64 *py, mmx64 *ph, mmx64 *pl
    ) {
    u32 w;
!   static mmx64 tab[16];
  
    /* Note: No START_MMX() here.  It is done at start of product(). */
  
***************
*** 2620,2632 ****
  /*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
  #ifdef MMX
  
!   mmx64 hh,hl, lh,ll, mh,ml;
  
    START_MMX();
  
    { const u32 mask = (1UL<<24)-1;
      u32 xt,xh, yt,yh;
!     mmx64 t;
  
      xt = x.t<<8 | x.h>>24; xh = x.h<<8 | x.m>>24;
      yt = y.t<<8 | y.h>>24; yh = y.h<<8 | y.m>>24;
--- 2620,2632 ----
  /*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
  #ifdef MMX
  
!   static mmx64 hh,hl, lh,ll, mh,ml;
  
    START_MMX();
  
    { const u32 mask = (1UL<<24)-1;
      u32 xt,xh, yt,yh;
!     static mmx64 t;
  
      xt = x.t<<8 | x.h>>24; xh = x.h<<8 | x.m>>24;
      yt = y.t<<8 | y.h>>24; yh = y.h<<8 | y.m>>24;
***************
*** 2691,2697 ****
    SLL(mm_TMP, 1); XOR(mm_T0, mm_TMP);
    SLL(mm_TMP, 7); XOR(mm_T0, mm_TMP);
  
!   { mmx64 rh, rl;
      poly128 r;
  
      STR(rh, mm_T1); STR(rl, mm_T0);
--- 2691,2697 ----
    SLL(mm_TMP, 1); XOR(mm_T0, mm_TMP);
    SLL(mm_TMP, 7); XOR(mm_T0, mm_TMP);
  
!   { static mmx64 rh, rl;
      poly128 r;
  
      STR(rh, mm_T1); STR(rl, mm_T0);
This archive was generated by hypermail 2b29 : Sat Jan 01 2000 - 15:26:57 MET