added tomsfastmath-0.03

2005-03-01 23:00:09 +00:00 · 2005-03-01 23:00:09 +00:00 · ca551d4c5e
commit ca551d4c5e
parent 6bb413fd72
15 changed files with 2587 additions and 859 deletions
--- a/14
+++ b/14
@ -1,6 +1,20 @@
+---
+0. IMPORTANT... why are you doubling the "even" terms individually?  STUPID!
+   - make it so you have four new macros that use an additional 3 carry variables
+        - SQRADDSC - store first mult      [ simple store, no carry ]
+        - SQRADDAC - add subsequent mults  [ 3n word add ]
+        - SQRADDDB - double the carry      [ 3n word add ]
+        - SQRADDFC - forward the doubles into the main [ 3n word add, note, x86_32 may need "g" instead of "r" ]
+   - only use the four macro pattern for rows with >= 3 "doubles"
+        - otherwise use the existing SQRADD
+
+
 1. Write more documentation ;-)
 2. Ports to PPC and MIPS
 3. Fix any lingering bugs, add additional requested functionality.
+4. Unrolled copies of montgomery will speed it up a bit
+5. 
+

 NOTE:  The library is still fairly new.  I've tested it quite a bit but that doesn't mean surprises
 can't happen.  Please test the results you get for correctness.
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,8 @@
+March 1st, 2005
+0.03 -- Optimized squaring
+     -- 
+
+
 September 18th, 2004
 0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
        This also helps prevent killing the cache on smaller cpus.
--- a/comba_sqr_gen.c
+++ b/comba_sqr_gen.c
@ -3,13 +3,16 @@

 int main(int argc, char **argv)
 {
-   int x, y, z, N;
+   int x, y, z, N, f;
   N = atoi(argv[1]);

+if (N >= 16 && N < 32) printf("#ifdef TFM_LARGE\n");
+if (N >= 32) printf("#ifdef TFM_HUGE\n");
+
 printf(
 "void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
 "{\n"
-"   fp_digit *a, b[%d], c0, c1, c2;\n"
+"   fp_digit *a, b[%d], c0, c1, c2, sc0, sc1, sc2;\n"
 "\n"
 "   a = A->dp;\n"
 "   COMBA_START; \n"
@ -25,6 +28,16 @@ printf(
 printf(
 "\n   /* output %d */\n"
 "   CARRY_FORWARD;\n   ", x);
+
+       for (f = y = 0; y < N; y++) {
+           for (z = 0; z < N; z++) {
+               if (z != y && z + y == x && y <= z) {
+                  ++f;
+               }
+           }
+       }
+
+   if (f <= 2) {
       for (y = 0; y < N; y++) {
           for (z = 0; z < N; z++) {
               if (y<=z && (y+z)==x) {
@ -36,6 +49,30 @@ printf(
               }
           }
       }
+   } else {
+      // new method 
+      /* do evens first */
+       f = 0;
+       for (y = 0; y < N; y++) {
+           for (z = 0; z < N; z++) {
+               if (z != y && z + y == x && y <= z) {
+                  if (f == 0) {
+                     // first double 
+                     printf("SQRADDSC(a[%d], a[%d]); ", y, z);
+                     f = 1;
+                  } else { 
+                     printf("SQRADDAC(a[%d], a[%d]); ", y, z);
+                  }
+               }
+           }
+       }
+       // forward the carry
+       printf("SQRADDDB; ");
+       if ((x&1) == 0) {
+          // add the square 
+          printf("SQRADD(a[%d], a[%d]); ", x/2, x/2);
+       }
+    }
 printf("\n   COMBA_STORE(b[%d]);\n", x);
   }
 printf("   COMBA_STORE2(b[%d]);\n", N+N-1);
@ -49,5 +86,7 @@ printf(
 "   fp_clamp(B);\n"
 "}\n\n\n", N+N, N+N);

+if (N >= 16) printf("#endif\n");
+
  return 0;
 }
--- a/demo/test.c
+++ b/demo/test.c
@ -23,7 +23,7 @@ static ulong64 TIMFUNC (void)
   {
   #if defined __GNUC__
      #if defined(__i386__) || defined(__x86_64__)
-         unsigned long long a;
+         ulong64 a;
         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
         return a;
      #else /* gcc-IA64 version */
@ -60,7 +60,7 @@ int main(void)
                 div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
   ulong64 t1, t2;

-
+  srand(time(NULL));
  printf("TFM Ident string:\n%s\n\n", fp_ident());
  fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); 
  fp_zero(&a); draw(&a);
@ -135,6 +135,8 @@ int main(void)
  printf("Testing read_radix\n");
  fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);

+goto testing;
+
 #if 1
  /* test mont */
  printf("Montgomery test #1\n");
@ -143,7 +145,7 @@ int main(void)
  fp_montgomery_calc_normalization(&b, &a);

  fp_read_radix(&d, "123456789123", 16);
-  for (n = 0; n < 100000; n++) {
+  for (n = 0; n < 1000000; n++) {
      fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d); 
      fp_mul(&d, &b, &c);
      fp_montgomery_reduce(&c, &a, fp);
@ -165,7 +167,7 @@ int main(void)
  fp_montgomery_calc_normalization(&b, &a);

  fp_read_radix(&d, "123456789123", 16);
-  for (n = 0; n < 100000; n++) {
+  for (n = 0; n < 1000000; n++) {
      fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d); 
      fp_mul(&d, &b, &c);
      fp_montgomery_reduce(&c, &a, fp);
@ -195,7 +197,7 @@ int main(void)
   printf("\n\n");
 #endif
  
-#if 0
+#if 1
 /* do some timings... */
  printf("Addition:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
@ -242,6 +244,7 @@ int main(void)
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 //#else
+sqrtime:
  printf("Squaring:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
@ -260,6 +263,7 @@ int main(void)
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
+return;
 //#else
  printf("Montgomery:\n");
  for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
@ -288,7 +292,9 @@ int main(void)
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 //#else
+expttime:
  printf("Exptmod:\n");
+ 
  for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
      fp_zero(&a);
      fp_zero(&b);
@ -303,7 +309,7 @@ int main(void)
      c.used = t;

     t2 = -1;
-     for (ix = 0; ix < 1024; ++ix) {
+     for (ix = 0; ix < 256; ++ix) {
          t1 = TIMFUNC();
          fp_exptmod(&c, &b, &a, &d);
          fp_exptmod(&c, &b, &a, &d);
@ -311,11 +317,15 @@ int main(void)
          fp_copy(&b, &c);      
          fp_copy(&b, &d);      
          if (t1<t2) { t2 = t1; --ix; }
-      }
-      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
+     }
+     printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
+return;
+
 #endif

+testing:
+
   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;

--- a/doc/tfm.pdf
+++ b/doc/tfm.pdf
--- a/fp_exptmod.c
+++ b/fp_exptmod.c
@ -12,7 +12,6 @@
 /* y = g**x (mod b) 
 * Some restrictions... x must be positive and < b
 */
-
 static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
  fp_int   M[64], res;
@ -169,6 +168,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  return FP_OKAY;
 }

+
 int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
   fp_int tmp;
@ -181,11 +181,12 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
      if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
         return err;
      }
-      /* _fp_exptmod doesn't care about the sign of X */
-      return _fp_exptmod(&tmp, X, P, Y);
+      X->sign = FP_ZPOS;
+      err =  _fp_exptmod(&tmp, X, P, Y);
+      X->sign = FP_NEG;
+      return err;
   } else {
      /* Positive exponent so just exptmod */
      return _fp_exptmod(G, X, P, Y);
   }
 }
-
--- a/fp_mul.c
+++ b/fp_mul.c
@ -29,11 +29,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
        } else if (y <= 8) {
           fp_mul_comba8(A,B,C);
 #if defined(TFM_LARGE)
-        } else if (y <= 16 && y >= 12) {
+        } else if (y <= 16 && y >= 10) {
           fp_mul_comba16(A,B,C);
 #endif
 #if defined(TFM_HUGE)
-        } else if (y <= 32 && y >= 28) {
+        } else if (y <= 32 && y >= 24) {
           fp_mul_comba32(A,B,C);
 #endif
        } else {
--- a/fp_sqr.c
+++ b/fp_sqr.c
@ -16,7 +16,7 @@ void fp_sqr(fp_int *A, fp_int *B)
    fp_int aa, bb, comp, amb, t1;

    y = A->used;
-    if (y <= 48) { 
+    if (y <= 64) { 
        if (y <= 4) {
           fp_sqr_comba4(A,B);
        } else if (y <= 8) {
@ -26,8 +26,10 @@ void fp_sqr(fp_int *A, fp_int *B)
           fp_sqr_comba16(A,B);
 #endif
 #if defined(TFM_HUGE)
-        } else if (y <= 32 && y >= 28) {
+        } else if (y <= 32 && y >= 20) {
           fp_sqr_comba32(A,B);
+        } else if (y <= 64 && y >= 48) {
+           fp_sqr_comba64(A,B);
 #endif
        } else {
           fp_sqr_comba(A, B);
--- a/fp_sqr_comba.c
+++ b/fp_sqr_comba.c
--- a/fp_sqr_comba_generic.c
+++ b/fp_sqr_comba_generic.c
@ -0,0 +1,75 @@
+/* generic comba squarer: B = A * A, built one output column at a time; B may be the same object as A */
+void fp_sqr_comba(fp_int *A, fp_int *B)
+{
+  int       pa, ix, iz;
+  fp_digit  c0, c1, c2;
+  fp_int    tmp, *dst;
+
+  /* the square needs at most 2*used digits; clamp the column count to FP_SIZE-1 */
+  pa = A->used + A->used;
+  if (pa >= FP_SIZE) {
+     pa = FP_SIZE-1;
+  }
+
+  /* comba setup (COMBA_START and CLEAR_CARRY are macros defined outside this file) */
+  COMBA_START;
+  CLEAR_CARRY;
+
+  if (A == B) {              /* squaring in place: build the result in tmp */
+     fp_zero(&tmp);
+     dst = &tmp;
+  } else {                   /* distinct output: write straight into B */
+     fp_zero(B);
+     dst = B;
+  }
+
+  for (ix = 0; ix < pa; ix++) { 
+      int      tx, ty, iy;
+      fp_digit *tmpy, *tmpx;
+
+      /* pick the starting digit indices for column ix so that tx + ty == ix */
+      ty = MIN(A->used-1, ix);
+      tx = ix - ty;
+
+      /* both aliases point into A->dp: tmpx walks upward, tmpy walks downward */
+      tmpx = A->dp + tx;
+      tmpy = A->dp + ty;
+
+      /* this is the number of times the loop would iterate for a full product; essentially it is
+         while (tx++ < A->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(A->used-tx, ty+1);
+
+      /* for squaring, the tx == ty term is handled separately below, so
+       * we halve the distance since the indices approach at a rate of 2x
+       * ((ty-tx+1)>>1 counts only the pairs with tx < ty)
+       */
+      iy = MIN(iy, (ty-tx+1)>>1);
+
+      /* forward carries */
+      CARRY_FORWARD;
+
+      /* accumulate each off-diagonal pair; SQRADD2 adds the product twice */
+      for (iz = 0; iz < iy; iz++) {
+          SQRADD2(*tmpx++, *tmpy--);
+      }
+
+      /* even columns also contain the square term of digit ix/2 */
+      if ((ix&1) == 0) {
+          SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
+      }
+
+      /* store this column's digit */
+      COMBA_STORE(dst->dp[ix]);
+  }
+  COMBA_STORE2(dst->dp[ix]);   /* ix == pa after the loop */
+
+  COMBA_FINI;
+
+  /* set the digit count, clamp, and copy back to B if tmp was used */
+  dst->used = pa;
+  fp_clamp (dst);
+  if (dst != B) {
+     fp_copy(dst, B);
+  }
+}
--- a/2
+++ b/2
@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
 #speed
 CFLAGS += -fomit-frame-pointer

-VERSION=0.02
+VERSION=0.03

 default: libtfm.a

--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
--- a/random_txt_files/newsqr.txt
+++ b/random_txt_files/newsqr.txt
@ -0,0 +1,36 @@
+New code added in TFM v0.03
+
+OLD 64-bit...[athlon64]
+
+Squaring:
+  256-bit:        89
+  512-bit:       234
+ 1024-bit:       815
+ 2048-bit:      2851
+
+NEW 64-bit ...
+
+Squaring:
+  256-bit:        89
+  512-bit:       228
+ 1024-bit:       691
+ 2048-bit:      2228
+
+
+OLD 32-bit [athlonxp]
+
+Squaring:
+
+  256-bit:       327
+  512-bit:      1044
+ 1024-bit:      3646
+ 2048-bit:     17055
+
+NEW 32-bit
+
+Squaring:
+
+  256-bit:       332
+  512-bit:       894
+ 1024-bit:      2983
+ 2048-bit:     10385
--- a/tfm.h
+++ b/tfm.h
@ -107,11 +107,11 @@

 /* we want no asm? */
 #ifdef TFM_NO_ASM
-	#undef TFM_X86
-	#undef TFM_X86_64
-	#undef TFM_SSE2
-	#undef TFM_ARM
-	#undef TFM_ASM   
+   #undef TFM_X86
+   #undef TFM_X86_64
+   #undef TFM_SSE2
+   #undef TFM_ARM
+   #undef TFM_ASM   
 #endif

 /* some default configurations.
@ -350,6 +350,7 @@ void fp_sqr_comba16(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_HUGE
 void fp_sqr_comba32(fp_int *A, fp_int *B);
+void fp_sqr_comba64(fp_int *A, fp_int *B);
 #endif
 extern const char *fp_s_rmap;

--- a/tfm.tex
+++ b/tfm.tex
@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{TomsFastMath User Manual \\ v0.02}
+\title{TomsFastMath User Manual \\ v0.03}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text and library are all hereby placed in the public domain.  This book has been formatted for B5 
@ -525,6 +525,37 @@ This is essentially the MULADD macro from the multiplication code.
 This is like SQRADD except it adds the product twice.  It's similar to 
 computing SQRADD(i, j*2).

+To further make things interesting the squaring code also has ``doubles'' (see my LTM book chapter five...) which are
+handled with these macros.
+
+\begin{verbatim}
+#define SQRADDSC(i, j)                                                         \
+   do { fp_word t;                                                             \
+      t =  ((fp_word)i) * ((fp_word)j);                                        \
+      sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;                      \
+   } while (0);
+\end{verbatim}
+This computes a product and stores it in the ``secondary'' carry registers $\left < sc0, sc1, sc2 \right >$.
+
+\begin{verbatim}
+#define SQRADDAC(i, j)                                                         \
+   do { fp_word t;                                                             \
+   t = sc0 + ((fp_word)i) * ((fp_word)j);  sc0 = t;                            \
+   t = sc1 + (t >> DIGIT_BIT);             sc1 = t; sc2 += t >> DIGIT_BIT;     \
+   } while (0);
+\end{verbatim}
+This computes a product and adds it to the ``secondary'' carry registers.
+
+\begin{verbatim}
+#define SQRADDDB                                                               \
+   do { fp_word t;                                                             \
+   t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t;                                                 \
+   t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); c1 = t;                              \
+   c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT);                                     \
+   } while (0);
+\end{verbatim}
+This doubles the ``secondary'' carry registers and adds the sum to the main carry registers.  Really complicated.
+
 \section{Montgomery with Comba}
 Montgomery reduction is used in modular exponentiation and is the most called function during
 that operation.  It's important to make sure this routine is very fast or all is lost.