added tomsfastmath-0.01

2004-08-25 02:43:43 +00:00 · 2004-08-25 02:43:43 +00:00 · 5e92ed2a59
commit 5e92ed2a59
75 changed files with 11069 additions and 0 deletions
--- a/7
+++ b/7
@ -0,0 +1,7 @@
 TomsFastMath is public domain.
 Note some ideas were borrowed from LibTomMath and OpenSSL.  All of the code is original or ported
 from LibTomMath [no code was ported from OpenSSL].  As such the origins and status of this code
 are both public domain.
 -- Tom St Denis
--- a/5
+++ b/5
@ -0,0 +1,5 @@
 Development of TomsFastMath was sponsored by three groups.  Two companies that use LTC and LTM commercially
 and one individual who decided he wanted to help out by being generous.
 Thanks go to them [though they wished to remain anonymous] and to people like them.
--- a/6
+++ b/6
@ -0,0 +1,6 @@
 1. Write more documentation ;-)
 2. Ports to PPC and MIPS
 3. Fix any lingering bugs, add additional requested functionality.
 NOTE:  The library is still fairly new.  I've tested it quite a bit but that doesn't mean surprises
 can't happen.  Please test the results you get for correctness.
--- a/changes.txt
+++ b/changes.txt
@ -0,0 +1,2 @@
 August 25th, 2004
 TFM 0.01  -- Initial Release
--- a/comba_mult_gen.c
+++ b/comba_mult_gen.c
@ -0,0 +1,50 @@
 /* program emits a NxN comba multiplier */
 #include <stdio.h>
 /* Writes the C source of an unrolled NxN comba multiplier to stdout.
  *
  * Usage: comba_mult_gen N      (N = number of digits per operand, N >= 1)
  *
  * The emitted function is named fp_mul_comba<N>.  Its text references the
  * COMBA_START, COMBA_CLEAR, COMBA_FORWARD, COMBA_STORE, COMBA_STORE2,
  * COMBA_FINI and MULADD macros, so those must be defined wherever the
  * output is compiled.
  *
  * Returns 0 on success, 1 when N is missing or is not a positive integer.
  */
 int main(int argc, char **argv)
 {
    int N, x, y, z;
    /* argv[1] is NULL when no argument is supplied, so validate before parsing */
    if (argc < 2 || sscanf(argv[1], "%d", &N) != 1 || N < 1) {
       fprintf(stderr, "usage: comba_mult_gen N   (N >= 1 digits)\n");
       return 1;
    }
    /* print out preamble */
    printf(
 "void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n"
 "{\n"
 "   fp_word t;\n"
 "   fp_digit c0, c1, c2, at[%d];\n"
 "\n"
 "   memcpy(at, A->dp, %d * sizeof(fp_digit));\n"
 "   memcpy(at+%d, B->dp, %d * sizeof(fp_digit));\n"
 "   COMBA_START;\n"
 "\n"
 "   COMBA_CLEAR;\n", N, N+N, N, N, N);
    /* one output column per result digit 0 .. 2N-2 */
    for (x = 0; x < (N+N-1); x++) {
       printf(
 "   /* %d */\n", x);
       if (x > 0) {
          printf(
 "   COMBA_FORWARD;\n");
       }
       /* column x sums every product A[y]*B[z] with y + z == x; B's digits
        * live in the upper half of at[], hence the z+N index */
       for (y = 0; y < N; y++) {
          z = x - y;
          if (z >= 0 && z < N) {
             printf("   MULADD(at[%d], at[%d]); ", y, z+N);
          }
       }
       printf(
 "\n"
 "   COMBA_STORE(C->dp[%d]);\n", x);
    }
    /* the final carry becomes the top digit */
    printf(
 "   COMBA_STORE2(C->dp[%d]);\n"
 "   C->used = %d;\n"
 "   C->sign = A->sign ^ B->sign;\n"
 "   fp_clamp(C);\n"
 "   COMBA_FINI;\n"
 "}\n\n\n", N+N-1, N+N);
    return 0;
 }
--- a/comba_sqr_gen.c
+++ b/comba_sqr_gen.c
@ -0,0 +1,54 @@
 /* Generates squaring comba code... it learns it knows our secrets! */
 #include <stdio.h>
 /* Writes the C source of an unrolled N-digit comba squaring routine to stdout.
  *
  * Usage: comba_sqr_gen N      (N = number of digits in the operand, N >= 1)
  *
  * The emitted function is named fp_sqr_comba<N>.  Its text references the
  * COMBA_START, CLEAR_CARRY, CARRY_FORWARD, COMBA_STORE, COMBA_STORE2,
  * COMBA_FINI, SQRADD and SQRADD2 macros, so those must be defined wherever
  * the output is compiled.
  *
  * Returns 0 on success, 1 when N is missing or is not a positive integer.
  */
 int main(int argc, char **argv)
 {
    int x, y, z, N;
    /* argv[1] is NULL when no argument is supplied, so validate before parsing */
    if (argc < 2 || sscanf(argv[1], "%d", &N) != 1 || N < 1) {
       fprintf(stderr, "usage: comba_sqr_gen N   (N >= 1 digits)\n");
       return 1;
    }
    /* preamble plus output column 0, which is always a[0]*a[0] */
    printf(
 "void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
 "{\n"
 "   fp_word t;\n"
 "   fp_digit *a, b[%d], c0, c1, c2;\n"
 "\n"
 "   a = A->dp;\n"
 "   COMBA_START; \n"
 "\n"
 "   /* clear carries */\n"
 "   CLEAR_CARRY;\n"
 "\n"
 "   /* output 0 */\n"
 "   SQRADD(a[0],a[0]);\n"
 "   COMBA_STORE(b[0]);\n", N, N+N);
    for (x = 1; x < N+N-1; x++) {
       printf(
 "\n   /* output %d */\n"
 "   CARRY_FORWARD;\n   ", x);
       /* column x needs each unordered pair (y,z), y <= z, with y + z == x:
        * the diagonal term goes out as SQRADD, off-diagonal terms as SQRADD2 */
       for (y = 0; y < N; y++) {
          z = x - y;
          if (z < y) {
             /* z only shrinks as y grows, so no further pair has y <= z */
             break;
          }
          if (z >= N) {
             continue;
          }
          if (y == z) {
             printf("SQRADD(a[%d], a[%d]); ", y, y);
          } else {
             printf("SQRADD2(a[%d], a[%d]); ", y, z);
          }
       }
       printf("\n   COMBA_STORE(b[%d]);\n", x);
    }
    /* the final carry becomes the top digit */
    printf("   COMBA_STORE2(b[%d]);\n", N+N-1);
    printf(
 "   COMBA_FINI;\n"
 "\n"
 "   B->used = %d;\n"
 "   B->sign = FP_ZPOS;\n"
 "   memcpy(B->dp, b, %d * sizeof(fp_digit));\n"
 "   fp_clamp(B);\n"
 "}\n\n\n", N+N, N+N);
    return 0;
 }
--- a/demo/stest.c
+++ b/demo/stest.c
@ -0,0 +1,144 @@
 /* A simple static test program. */
 #include <tfm.h>
 #ifdef GBA_MODE
 #include <gba.h>
   #define DISPLAY(x) modetxt_puts(vfb, x, 1)
 #endif
 #ifndef DISPLAY
   #define DISPLAY(x) printf(x)
 #endif
 /* Static self-test: runs fixed known-answer checks for fp_mul, fp_sqr and
  * fp_montgomery_reduce and reports each result through DISPLAY().
  *
  * The entry point is c_main() when GBA_MODE is defined and main() otherwise.
  *
  * Returns 0 when every check passes and 1 as soon as one check fails, so the
  * outcome is visible in the exit status as well as in the printed text.
  */
 #ifdef GBA_MODE
 int c_main(void)
 #else
 int main(void)
 #endif
 {
   fp_int a,b,c,d;
   fp_digit dp;
   fp_init(&a);
   fp_init(&b);
   fp_init(&c);
   fp_init(&d);
 #ifdef GBA_MODE
   install_common();
   modetxt_init();
   modetxt_gotoxy(0,0);
 #endif
   /* test multiplication: d = a * b must equal the expected product c */
   fp_read_radix(&a, "3453534534535345345341230891273", 10);
   fp_read_radix(&b, "2394873294871238934718923" , 10);
   fp_read_radix(&c, "8270777629674273015508507050766235312931312159028658979", 10);
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
      return 1;
   } else {
      DISPLAY("mul passed\n");
   }
   /* test multiplication */
   fp_read_radix(&a, "30481290320498235987349712308523652378643912563478232907782361237864278207235782364578264891274789264278634289739", 10);
   fp_read_radix(&b, "48761478126387263782638276327836287632836278362837627838736278362923698724823749238732" , 10);
   fp_read_radix(&c, "1486312771227034563307950634490737985563993459700941115664257275795366623795590136120579100118233580357115074068815507257715906295105536107921754177810976863679300283932188006885811950341132768970948", 10);
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
      return 1;
   } else {
      DISPLAY("mul passed\n");
   }
   /* test multiplication */
   fp_read_radix(&a, "115792089237316195423570985008687907853269984665640564039457584007913129639935", 10);
   fp_read_radix(&b, "174224571863520493293247799005065324265471" , 10);
   fp_read_radix(&c, "20173827172553973356686868531273530268200710714389071377794102651988800859098544338487575161443744102709980552583184385", 10);
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
      return 1;
   } else {
      DISPLAY("mul passed\n");
   }
   /* test squaring: c = a * a must equal the expected square b */
   fp_read_radix(&a, "298723982748923478923473927489237289347238947238947238947238972893", 10);
   fp_read_radix(&b, "89236017869379132235512787068367546521309689412262624434964313994127411682542855190667724226920696163962644836740110835385588789449" , 10);
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
      return 1;
   } else {
      DISPLAY("sqr passed\n");
   }
   fp_read_radix(&a, "397823894238973128942895123894327123941724927848927349274897238978927593487012378490184789429812734982738972389", 10);
   fp_read_radix(&b, "158263850827461677491961439999264901067636282938352531932899298293270945997930087353471903166601507321298827087008336951419604640736464667188494668962822678461626245753696845719301945679092882499787869509090904187704367321" , 10);
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
      return 1;
   } else {
      DISPLAY("sqr passed\n");
   }
   fp_read_radix(&a, "13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006084095", 10);
   fp_read_radix(&b, "179769313486231590772930519078902473361797697894230657273430081157732675805500963132708477322407536021120113879871393357658789768814416622492847430639474097562152033539671286128252223189553839160721441767298250321715263238814402734379959506792230903356495130620869925267845538430714092411695463462326211969025" , 10);
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
      return 1;
   } else {
      DISPLAY("sqr passed\n");
   }
   /* montgomery reductions: a is reduced in place modulo b and compared with c */
   fp_read_radix(&a, "234892374892374893489123428937892781237863278637826327367637836278362783627836783678363", 10);
   fp_read_radix(&b, "4447823492749823749234123489273987393983289319382762756425425425642727352327452374521", 10);
   fp_read_radix(&c, "2396271882990732698083317035605836523697277786556053771759862552557086442129695099100", 10);
   fp_montgomery_setup(&b, &dp);
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
      return 1;
   } else {
      DISPLAY("mont passed\n");
   }
   fp_read_radix(&a, "2348923748923748934891234456645654645645684576353428937892781237863278637826327367637836278362783627836783678363", 10);
   fp_read_radix(&b, "444782349274982374923412348927398739398328931938276275642542542564272735232745237452123424324324444121111119", 10);
   fp_read_radix(&c, "45642613844554582908652603086180267403823312390990082328515008314514368668691233331246183943400359349283420", 10);
   fp_montgomery_setup(&b, &dp);
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
      return 1;
   } else {
      DISPLAY("mont passed\n");
   }
   fp_read_radix(&a, "234823424242342923748923748934891234456645654645645684576353424972378234762378623891236834132352375235378462378489378927812378632786378263273676378362783627555555555539568389052478124618461834763837685723645827529034853490580134568947341278498542893481762349723907847892983627836783678363", 10);
   fp_read_radix(&b, "44478234927456563455982374923412348927398739398328931938276275642485623481638279025465891276312903262837562349056234783648712314678120389173890128905425242424239784256427", 10);
   fp_read_radix(&c, "33160865265453361650564031464519042126185632333462754084489985719613480783282357410514898819797738034600484519472656152351777186694609218202276509271061460265488348645081", 10);
   fp_montgomery_setup(&b, &dp);
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
      return 1;
   } else {
      DISPLAY("mont passed\n");
   }
   return 0;
 }
--- a/demo/test.c
+++ b/demo/test.c
@ -0,0 +1,537 @@
 /* TFM demo program */
 #include <tfm.h>
 /* Debug dump of an fp_int: prints a->used and a->sign, then every used
  * digit in hexadecimal from most to least significant, then a newline. */
 void draw(fp_int *a)
 {
    int idx = a->used;
    printf("%d, %d, ", a->used, a->sign);
    /* walk downwards so the most significant digit is printed first */
    while (idx-- > 0) {
       printf("%08lx ", a->dp[idx]);
    }
    printf("\n");
 }
 /* Random-byte callback: fills dst[0..len-1] with the low byte of successive
  * rand() results.  dat is part of the callback signature and is not used.
  * Returns len. */
 int myrng(unsigned char *dst, int len, void *dat)
 {
    unsigned char *out = dst;
    int remaining;
    for (remaining = len; remaining > 0; remaining--) {
       *out++ = rand() & 0xFF;
    }
    return len;
 }
 /* RDTSC from Scott Duplichan
  *
  * Returns the current value of a hardware counter as a ulong64; which
  * counter is read depends on the compiler/target selected below.  A build
  * that matches none of the branches is rejected with #error.
  */
 static ulong64 TIMFUNC (void)
   {
   #if defined __GNUC__
      #if defined(__i386__) || defined(__x86_64__)
         unsigned long long a;
         /* rdtsc, then EAX is stored at a and EDX at a+4.
          * NOTE(review): a is listed as an input operand ("m"(a)) although the
          * asm writes to it -- confirm the compiler cannot assume a is unchanged. */
         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
         return a;
      #else /* gcc-IA64 version */
         unsigned long result;
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
         /* re-read ar.itc for as long as the low 32 bits of the value read as -1 */
         while (__builtin_expect ((int) result == -1, 0))
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
         return result;
      #endif
   // Microsoft and Intel Windows compilers
   #elif defined _M_IX86
     /* NOTE(review): no return statement in this branch; presumably relies on
      * rdtsc leaving its result in the registers used for a 64-bit return
      * value -- confirm against the compiler's calling convention. */
     __asm rdtsc
   #elif defined _M_AMD64
     return __rdtsc ();
   #elif defined _M_IA64
     /* NOTE(review): this #include sits inside the function body -- confirm the
      * header tolerates being included at block scope. */
     #if defined __INTEL_COMPILER
       #include <ia64intrin.h>
     #endif
      return __getReg (3116);
   #else
     #error need rdtsc function for this build
   #endif
   }
   char cmd[4096], buf[4096];
 int main(void)
 {
  fp_int a,b,c,d,e,f;
  fp_digit fp;
  int n, err;
   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
                 div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
   ulong64 t1, t2;
  fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); 
  fp_zero(&a); draw(&a);
  /* test set and simple shifts */
  printf("Testing mul/div 2\n");
  fp_set(&a, 1); draw(&a);
  for (n = 0; n <= DIGIT_BIT; n++) {
      fp_mul_2(&a, &a); printf("(%d) ", fp_count_bits(&a));
      draw(&a);
  }
  for (n = 0; n <= (DIGIT_BIT + 1); n++) {
      fp_div_2(&a, &a);
      draw(&a);
  }
  fp_set(&a, 1);
  /* test lshd/rshd */
  printf("testing lshd/rshd\n");
  fp_lshd(&a, 3); draw(&a);
  fp_rshd(&a, 3); draw(&a);
  /* test more complicated shifts */
  printf("Testing mul/div 2d\n");
  fp_mul_2d(&a, DIGIT_BIT/2, &a); draw(&a);
  fp_div_2d(&a, DIGIT_BIT/2, &a, NULL); draw(&a);
  fp_mul_2d(&a, DIGIT_BIT + DIGIT_BIT/2, &a); draw(&a);
  fp_div_2d(&a, DIGIT_BIT + DIGIT_BIT/2, &a, NULL); draw(&a);
  /* test neg/abs  */
  printf("testing neg/abs\n");
  fp_neg(&a, &a); draw(&a);
  fp_neg(&a, &a); draw(&a);
  fp_neg(&a, &a); draw(&a);
  fp_abs(&a, &a); draw(&a);
  /* test comparisons */
  fp_set(&b, 3);
  fp_set(&c, 4); fp_neg(&c, &c);
  fp_set(&d, 1);
  printf("Testing compares\n%d, %d, %d, %d\n", fp_cmp(&a, &b), fp_cmp(&a, &c), fp_cmp(&a, &d), fp_cmp(&b, &c));
  /* test add/sub */
  printf("Testing add/sub \n");
  fp_set(&a, ((fp_digit)1)<<(DIGIT_BIT-1)); draw(&a);
  fp_set(&b, ((fp_digit)1)<<(DIGIT_BIT-2));
  fp_add(&a, &b, &a); draw(&a);
  fp_add(&a, &b, &a); draw(&a);
  fp_add(&a, &b, &a); draw(&a);
  printf("sub...\n");
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  /* test mul_d */
  printf("Testing mul_d and div_d\n");
  fp_set(&a, 1);
  fp_mul_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a); draw(&a);
  fp_mul_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a); draw(&a);
  fp_mul_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a); draw(&a);
  printf("div_d\n");
  fp_div_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a, NULL); draw(&a);
  fp_div_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a, NULL); draw(&a);
  fp_div_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a, NULL); draw(&a);
  /* testing read radix */
  printf("Testing read_radix\n");
  fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
  /* test mont */
  printf("Montgomery test\n");
  fp_set(&a, 1);
  fp_lshd(&a, 4);
  fp_add_d(&a, 1, &a);
  fp_montgomery_setup(&a, &fp);
  fp_montgomery_calc_normalization(&b, &a);
  fp_read_radix(&d, "123456789123", 16);
  for (n = 0; n < 100000; n++) {
      fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d); 
      fp_mul(&d, &b, &c);
      fp_montgomery_reduce(&c, &a, fp);
      if (fp_cmp(&c, &d) != FP_EQ) {
         printf("Failed mont %d\n", n);
         draw(&a);
         draw(&d);
         draw(&c);
         return EXIT_FAILURE;
      }
  }
  printf("Passed.\n");
   /* test for size */
   for (ix = 8*DIGIT_BIT; ix < 10*DIGIT_BIT; ix++) {
       printf("Testing (not safe-prime): %9d bits    \r", ix); fflush(stdout);
       err = fp_prime_random_ex(&a, 8, ix, (rand()&1)?TFM_PRIME_2MSB_OFF:TFM_PRIME_2MSB_ON, myrng, NULL);
       if (err != FP_OKAY) {
          printf("failed with err code %d\n", err);
          return EXIT_FAILURE;
       }
       if (fp_count_bits(&a) != ix) {
          printf("Prime is %d not %d bits!!!\n", fp_count_bits(&a), ix);
          return EXIT_FAILURE;
       }
   }
   printf("\n\n");
 #if 0
 /* do some timings... */
  printf("Addition:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix;
          b.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      t2 = -1;
      for (ix = 0; ix < 2500; ++ix) {
          t1 = TIMFUNC();
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          t2 = (TIMFUNC() - t1)>>3;
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
  printf("Multiplication:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix;
          b.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      t2 = -1;
      for (ix = 0; ix < 10000; ++ix) {
          t1 = TIMFUNC();
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          t2 = (TIMFUNC() - t1)>>2;
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 //#else
  printf("Squaring:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix;
      }
      a.used = t;
      t2 = -1;
      for (ix = 0; ix < 10000; ++ix) {
          t1 = TIMFUNC();
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          t2 = (TIMFUNC() - t1)>>2;
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 //#else
  printf("Montgomery:\n");
  for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
      fp_zero(&a);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix | 1;
      }
      a.used = t;
     fp_montgomery_setup(&a, &fp);
     fp_sub_d(&a, 3, &b);
     fp_sqr(&b, &b);      
     fp_copy(&b, &c);      
     fp_copy(&b, &d);      
     t2 = -1;
     for (ix = 0; ix < 10000; ++ix) {
          t1 = TIMFUNC();
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          t2 = (TIMFUNC() - t1)>>1;
          fp_copy(&b, &c);      
          fp_copy(&b, &d);      
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 //#else
  printf("Exptmod:\n");
  for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix+1;
          b.dp[ix] = (fp_digit)rand() * (fp_digit)rand();
          c.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      c.used = t;
     t2 = -1;
     for (ix = 0; ix < 50; ++ix) {
          t1 = TIMFUNC();
          fp_exptmod(&c, &b, &a, &d);
          fp_exptmod(&c, &b, &a, &d);
          t2 = (TIMFUNC() - t1)>>1;
          fp_copy(&b, &c);      
          fp_copy(&b, &d);      
          if (t1<t2) { t2 = t1; --ix; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 #endif
   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;
   for (;;) {
       printf("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n);
       fgets(cmd, 4095, stdin);
       cmd[strlen(cmd)-1] = 0;
       printf("%s  ]\r",cmd); fflush(stdout);
       if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
          fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
          fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
          fp_mul_2d(&a, rr, &a);
          a.sign = b.sign;
          if (fp_cmp(&a, &b) != FP_EQ) {
             printf("mul2d failed, rr == %d\n",rr);
             draw(&a);
             draw(&b);
             return 0;
          }
       } else if (!strcmp(cmd, "div2d")) { ++div2d_n;
          fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
          fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
          fp_div_2d(&a, rr, &a, &e);
          a.sign = b.sign;
          if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; }
          if (fp_cmp(&a, &b) != FP_EQ) {
             printf("div2d failed, rr == %d\n",rr);
             draw(&a);
             draw(&b);
             return 0;
          }
       } else if (!strcmp(cmd, "add")) { ++add_n;
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
          fp_copy(&a, &d);
          fp_add(&d, &b, &d);
          if (fp_cmp(&c, &d) != FP_EQ) {
             printf("add %lu failure!\n", add_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
          /* test the sign/unsigned storage functions */
          rr = fp_signed_bin_size(&c);
          fp_to_signed_bin(&c, (unsigned char *)cmd);
          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
          fp_read_signed_bin(&d, (unsigned char *)cmd, rr);
          if (fp_cmp(&c, &d) != FP_EQ) {
             printf("fp_signed_bin failure!\n");
             draw(&c);
             draw(&d);
             return 0;
          }
          rr = fp_unsigned_bin_size(&c);
          fp_to_unsigned_bin(&c, (unsigned char *)cmd);
          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
          fp_read_unsigned_bin(&d, (unsigned char *)cmd, rr);
          if (fp_cmp_mag(&c, &d) != FP_EQ) {
             printf("fp_unsigned_bin failure!\n");
             draw(&c);
             draw(&d);
             return 0;
          }
       } else if (!strcmp(cmd, "sub")) { ++sub_n;
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
          fp_copy(&a, &d);
          fp_sub(&d, &b, &d);
          if (fp_cmp(&c, &d) != FP_EQ) {
             printf("sub %lu failure!\n", sub_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
       } else if (!strcmp(cmd, "mul")) { 
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
          fp_copy(&a, &d);
          fp_mul(&d, &b, &d); ++mul_n;
          if (fp_cmp(&c, &d) != FP_EQ) {
             printf("mul %lu failure!\n", mul_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
       } else if (!strcmp(cmd, "div")) { 
          fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&c, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&d, buf, 64);
 // continue;
          fp_div(&a, &b, &e, &f); ++div_n;
          if (fp_cmp(&c, &e) != FP_EQ || fp_cmp(&d, &f) != FP_EQ) {
             printf("div %lu failure!\n", div_n);
 draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
             return 0;
          }
       } else if (!strcmp(cmd, "sqr")) { 
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
 // continue;
          fp_copy(&a, &c);
          fp_sqr(&c, &c); ++sqr_n;
          if (fp_cmp(&b, &c) != FP_EQ) {
             printf("sqr %lu failure!\n", sqr_n);
 draw(&a);draw(&b);draw(&c);
             return 0;
          }
       } else if (!strcmp(cmd, "gcd")) { 
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 // continue;
          fp_copy(&a, &d);
          fp_gcd(&d, &b, &d); ++gcd_n;
          d.sign = c.sign;
          if (fp_cmp(&c, &d) != FP_EQ) {
             printf("gcd %lu failure!\n", gcd_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
       } else if (!strcmp(cmd, "lcm")) { 
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
             fp_copy(&a, &d);
             fp_lcm(&d, &b, &d); ++lcm_n;
             d.sign = c.sign;
             if (fp_cmp(&c, &d) != FP_EQ) {
                printf("lcm %lu failure!\n", lcm_n);
   draw(&a);draw(&b);draw(&c);draw(&d);
                return 0;
             }
       } else if (!strcmp(cmd, "expt")) {  
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&d, buf, 64);
 // continue;
             fp_copy(&a, &e);
             fp_exptmod(&e, &b, &c, &e); ++expt_n;
             if (fp_cmp(&d, &e) != FP_EQ) {
                printf("expt %lu failure!\n", expt_n);
   draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);
                return 0;
             }
       } else if (!strcmp(cmd, "invmod")) {  
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
             fp_invmod(&a, &b, &d);
 #if 1
             fp_mulmod(&d,&a,&b,&e); ++inv_n;
             if (fp_cmp_d(&e, 1) != FP_EQ) {
 #else
             if (fp_cmp(&d, &c) != FP_EQ) {
 #endif
                printf("inv [wrong value from MPI?!] failure\n");
                draw(&a);draw(&b);draw(&c);draw(&d);
                return 0;
             }
       } else if (!strcmp(cmd, "div2")) { ++div2_n;
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fp_div_2(&a, &c);
             if (fp_cmp(&c, &b) != FP_EQ) {
                 printf("div_2 %lu failure\n", div2_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
                 return 0;
             }
       } else if (!strcmp(cmd, "mul2")) { ++mul2_n;
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fp_mul_2(&a, &c);
             if (fp_cmp(&c, &b) != FP_EQ) {
                 printf("mul_2 %lu failure\n", mul2_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
                 return 0;
             }
       } else if (!strcmp(cmd, "add_d")) { ++add_d_n;
              fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_add_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
                 printf("add_d %lu failure\n", add_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
                 printf("d == %d\n", ix);
                 return 0;
              }
       } else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
              fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_sub_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
                 printf("sub_d %lu failure\n", sub_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
                 printf("d == %d\n", ix);
                 return 0;
              }
       } else if (!strcmp(cmd, "mul_d")) { ++mul_d_n;
              fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_mul_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
                 printf("mul_d %lu failure\n", sub_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
                 printf("d == %d\n", ix);
                 return 0;
              }
       }
   }
 }
--- a/doc/tfm.pdf
+++ b/doc/tfm.pdf
--- a/fp_2expt.c
+++ b/fp_2expt.c
@ -0,0 +1,35 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Sets a = 2**b.
  * a is left at zero when b is negative or when b / DIGIT_BIT is not below
  * FP_SIZE. */
 void fp_2expt(fp_int *a, int b)
 {
    int digit_idx;
    /* start from zero so every exit path leaves a well-defined value */
    fp_zero (a);
    if (b >= 0) {
       digit_idx = b / DIGIT_BIT;
       if (digit_idx < FP_SIZE) {
          /* the single set bit lives in digit digit_idx, which is also the top digit */
          a->used = digit_idx + 1;
          a->dp[digit_idx] = ((fp_digit)1) << (b % DIGIT_BIT);
       }
    }
 }
--- a/fp_add.c
+++ b/fp_add.c
@ -0,0 +1,39 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a + b, signed.
  * Equal signs: the magnitudes are added and the common sign is kept.
  * Differing signs: the smaller magnitude is subtracted from the larger one
  * and the result takes the sign of the larger-magnitude operand (a's sign
  * when the magnitudes are equal). */
 void fp_add(fp_int *a, fp_int *b, fp_int *c)
 {
    int sign_a, sign_b;
    /* read both signs first: c may be the same object as a or b */
    sign_a = a->sign;
    sign_b = b->sign;
    if (sign_a == sign_b) {
       c->sign = sign_a;
       s_fp_add (a, b, c);
       return;
    }
    if (fp_cmp_mag (a, b) == FP_LT) {
       /* |a| < |b| */
       c->sign = sign_b;
       s_fp_sub (b, a, c);
       return;
    }
    /* |a| >= |b| */
    c->sign = sign_a;
    s_fp_sub (a, b, c);
 }
--- a/fp_add_d.c
+++ b/fp_add_d.c
@ -0,0 +1,18 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a + b, where b is a single digit */
 void fp_add_d(fp_int *a, fp_digit b, fp_int *c)
 {
    fp_int addend;
    /* wrap the digit in an fp_int so the general adder can do the work */
    fp_set(&addend, b);
    fp_add(a, &addend, c);
 }
--- a/fp_addmod.c
+++ b/fp_addmod.c
@ -0,0 +1,19 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* d = (a + b) mod c.  Returns whatever fp_mod returns. */
 int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 {
    fp_int sum;
    int    status;
    fp_zero(&sum);
    /* form the full sum in a temporary, then reduce it into d */
    fp_add(a, b, &sum);
    status = fp_mod(&sum, c, d);
    return status;
 }
--- a/fp_cmp.c
+++ b/fp_cmp.c
@ -0,0 +1,27 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Signed comparison of a and b.  Returns FP_LT, FP_EQ or FP_GT. */
 int fp_cmp(fp_int *a, fp_int *b)
 {
    /* differing signs decide the result without looking at the digits */
    if (a->sign == FP_NEG && b->sign == FP_ZPOS) {
       return FP_LT;
    }
    if (a->sign == FP_ZPOS && b->sign == FP_NEG) {
       return FP_GT;
    }
    /* same sign: compare magnitudes, reversed when both are negative */
    return (a->sign == FP_NEG) ? fp_cmp_mag(b, a) : fp_cmp_mag(a, b);
 }
--- a/fp_cmp_d.c
+++ b/fp_cmp_d.c
@ -0,0 +1,34 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Compares a against the single digit b.  Returns FP_LT, FP_EQ or FP_GT. */
 int fp_cmp_d(fp_int *a, fp_digit b)
 {
    fp_digit low;
    /* a negative a, or an empty a against a non-zero b, is smaller */
    if (a->sign == FP_NEG || (a->used == 0 && b)) {
       return FP_LT;
    }
    /* more than one digit cannot fit in b */
    if (a->used > 1) {
       return FP_GT;
    }
    /* at most one digit left: compare it directly */
    low = a->dp[0];
    if (low == b) {
       return FP_EQ;
    }
    return (low > b) ? FP_GT : FP_LT;
 }
--- a/fp_cmp_mag.c
+++ b/fp_cmp_mag.c
@ -0,0 +1,31 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Compares the magnitudes of a and b, ignoring their signs.
  * Returns FP_LT, FP_EQ or FP_GT. */
 int fp_cmp_mag(fp_int *a, fp_int *b)
 {
    int i;
    /* a different digit count settles the comparison */
    if (a->used != b->used) {
       return (a->used > b->used) ? FP_GT : FP_LT;
    }
    /* same length: the first differing digit from the top decides */
    i = a->used;
    while (i-- > 0) {
       if (a->dp[i] != b->dp[i]) {
          return (a->dp[i] > b->dp[i]) ? FP_GT : FP_LT;
       }
    }
    return FP_EQ;
 }
--- a/fp_cnt_lsb.c
+++ b/fp_cnt_lsb.c
@ -0,0 +1,42 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* lnz[v] = number of trailing zero bits of the 4-bit value v; the entry for
  * v == 0 is 4, meaning "the whole nibble is zero, keep scanning"
  */
 static const int lnz[16] = {
   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
 };
 /* Counts the consecutive zero bits below the lowest set bit of a
  * (returns 0 when a is zero)
  */
 int fp_cnt_lsb(fp_int *a)
 {
   int      digit, count;
   fp_digit word, nibble;
   /* easy out */
   if (fp_iszero(a) == 1) {
      return 0;
   }
   /* skip whole digits that are zero */
   digit = 0;
   while (digit < a->used && a->dp[digit] == 0) {
      ++digit;
   }
   word  = a->dp[digit];
   count = digit * DIGIT_BIT;
   /* an odd digit contributes no further zero bits */
   if ((word & 1) != 0) {
      return count;
   }
   /* otherwise walk the digit a nibble at a time until a set bit shows up */
   do {
      nibble  = word & 15;
      count  += lnz[nibble];
      word  >>= 4;
   } while (nibble == 0);
   return count;
 }
--- a/fp_count_bits.c
+++ b/fp_count_bits.c
@ -0,0 +1,32 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* number of bits needed to hold the magnitude of a (0 when no digit is in use) */
 int fp_count_bits (fp_int * a)
 {
  int      bits;
  fp_digit top;
  /* nothing in use, nothing to count */
  if (a->used == 0) {
    return 0;
  }
  /* every digit below the top one counts in full */
  bits = (a->used - 1) * DIGIT_BIT;
  /* add the significant bits of the top digit */
  for (top = a->dp[a->used - 1]; top > ((fp_digit) 0); top >>= 1) {
    ++bits;
  }
  return bits;
 }
--- a/fp_div.c
+++ b/fp_div.c
@ -0,0 +1,153 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* a/b => cb + d == a
  *
  * Signed long division producing the quotient in c and the remainder in d.
  * Either output pointer may be NULL when that result is not wanted.
  *
  * Returns FP_VAL when b is zero and FP_OKAY otherwise.  The quotient is
  * marked FP_ZPOS when a and b carry the same sign and FP_NEG otherwise; a
  * non-zero remainder takes the sign of a.
  *
  * The "step" numbers in the comments below refer to the division algorithm
  * in "hac" that the original comments cite.
  */
 int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 {
  fp_int  q, x, y, t1, t2;
  int     n, t, i, norm, neg;
  /* is divisor zero ? */
  if (fp_iszero (b) == 1) {
    return FP_VAL;
  }
  /* if |a| < |b| then q=0, r = a (a keeps its own sign in d) */
  if (fp_cmp_mag (a, b) == FP_LT) {
    if (d != NULL) {
      fp_copy (a, d);
    } 
    if (c != NULL) {
      fp_zero (c);
    }
    return FP_OKAY;
  }
  /* q is sized generously here; fp_clamp() trims it before it is copied out */
  fp_init(&q);
  q.used = a->used + 2;
  fp_init(&t1);
  fp_init(&t2);
  /* x and y are working copies, so a and b themselves are never modified */
  fp_init_copy(&x, a);
  fp_init_copy(&y, b);
  /* fix the sign: remember the quotient sign, then divide magnitudes only */
  neg = (a->sign == b->sign) ? FP_ZPOS : FP_NEG;
  x.sign = y.sign = FP_ZPOS;
  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT]
   * norm ends up as the left shift applied to both (0 when none was needed)
   * and is undone on the remainder at the end
   */
  norm = fp_count_bits(&y) % DIGIT_BIT;
  if (norm < (int)(DIGIT_BIT-1)) {
     norm = (DIGIT_BIT-1) - norm;
     fp_mul_2d (&x, norm, &x);
     fp_mul_2d (&y, norm, &y);
  } else {
     norm = 0;
  }
  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
  n = x.used - 1;
  t = y.used - 1;
  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
  fp_lshd (&y, n - t);                                             /* y = y*b**{n-t} */
  while (fp_cmp (&x, &y) != FP_LT) {
    ++(q.dp[n - t]);
    fp_sub (&x, &y, &x);
  }
  /* reset y by shifting it back down */
  fp_rshd (&y, n - t);
  /* step 3. for i from n down to (t + 1) */
  for (i = n; i >= (t + 1); i--) {
    /* x may have shrunk; digit positions above it need no quotient digit */
    if (i > x.used) {
      continue;
    }
    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
    if (x.dp[i] == y.dp[t]) {
      q.dp[i - t - 1] = ((((fp_word)1) << DIGIT_BIT) - 1);
    } else {
      /* estimate the quotient digit from the top two digits of x */
      fp_word tmp;
      tmp = ((fp_word) x.dp[i]) << ((fp_word) DIGIT_BIT);
      tmp |= ((fp_word) x.dp[i - 1]);
      tmp /= ((fp_word) y.dp[t]);
      q.dp[i - t - 1] = (fp_digit) (tmp);
    }
    /* while (q{i-t-1} * (yt * b + y{t-1})) > 
             xi * b**2 + xi-1 * b + xi-2 
       do q{i-t-1} -= 1; 
    */
    /* the estimate is bumped by one first because the do/while below
     * decrements before it tests
     */
    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1);
    do {
      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1);
      /* find left hand */
      fp_zero (&t1);
      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
      t1.dp[1] = y.dp[t];
      t1.used = 2;
      fp_mul_d (&t1, q.dp[i - t - 1], &t1);
      /* find right hand
       * (t2 is not re-zeroed between rounds: only its three low digits are
       * rewritten and its used count is forced to 3)
       */
      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
      t2.dp[2] = x.dp[i];
      t2.used = 3;
    } while (fp_cmp_mag(&t1, &t2) == FP_GT);
    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
    fp_mul_d (&y, q.dp[i - t - 1], &t1);
    fp_lshd  (&t1, i - t - 1);
    fp_sub   (&x, &t1, &x);
    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
    if (x.sign == FP_NEG) {
      fp_copy (&y, &t1);
      fp_lshd (&t1, i - t - 1);
      fp_add (&x, &t1, &x);
      q.dp[i - t - 1] = q.dp[i - t - 1] - 1;
    }
  }
  /* now q is the quotient and x is the remainder 
   * [which we have to normalize] 
   */
  /* get sign before writing to c (c may be the same object as a) */
  x.sign = x.used == 0 ? FP_ZPOS : a->sign;
  if (c != NULL) {
    fp_clamp (&q);
    fp_copy (&q, c);
    c->sign = neg;
  }
  if (d != NULL) {
    /* undo the normalization shift on the remainder */
    fp_div_2d (&x, norm, &x, NULL);
 /* the following is a kludge, essentially we were seeing the right remainder but 
   with excess digits that should have been zero
 */
    for (i = b->used; i < x.used; i++) {
        x.dp[i] = 0;
    }
    fp_clamp(&x);
    fp_copy (&x, d);
  }
  return FP_OKAY;
 }
--- a/fp_div_2.c
+++ b/fp_div_2.c
@ -0,0 +1,49 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* b = a/2
  *
  * Shifts the magnitude of a right by one bit into b (the dropped bit is
  * discarded) and copies the sign.  a and b may be the same object.
  */
 void fp_div_2(fp_int * a, fp_int * b)
 {
  int      i, n, prev_used;
  fp_digit carry, cur;
  prev_used = b->used;
  n         = a->used;
  b->used   = n;
  /* walk from the top digit down, handing each digit's low bit to the
   * digit below it as that digit's new top bit
   */
  carry = 0;
  for (i = n - 1; i >= 0; i--) {
    cur      = a->dp[i];
    b->dp[i] = (cur >> 1) | (carry << (DIGIT_BIT - 1));
    carry    = cur & 1;
  }
  /* wipe whatever b held above the new length */
  for (i = n; i < prev_used; i++) {
    b->dp[i] = 0;
  }
  b->sign = a->sign;
  fp_clamp (b);
 }
--- a/fp_div_2d.c
+++ b/fp_div_2d.c
@ -0,0 +1,75 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a / 2**b, and optionally d = a mod 2**b
  *
  * d may be NULL when the remainder is not wanted.  A shift count of zero or
  * less copies a to c unchanged and zeroes d.
  */
 void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d)
 {
  fp_int   rem;
  fp_digit bits, lowmask, upshift, carry, cur;
  int      i;
  /* nothing to shift */
  if (b <= 0) {
    fp_copy (a, c);
    if (d != NULL) {
      fp_zero (d);
    }
    return;
  }
  /* take the remainder first, while a is still untouched */
  fp_init(&rem);
  if (d != NULL) {
    fp_mod_2d (a, b, &rem);
  }
  fp_copy(a, c);
  /* whole digits are dropped in one go */
  if (b >= (int)DIGIT_BIT) {
    fp_rshd (c, b / DIGIT_BIT);
  }
  /* the remaining sub-digit shift ripples from the top digit down */
  bits = (fp_digit) (b % DIGIT_BIT);
  if (bits != 0) {
    lowmask = (((fp_digit)1) << bits) - 1;   /* bits that fall out of a digit */
    upshift = DIGIT_BIT - bits;              /* where they land in the digit below */
    carry   = 0;
    for (i = c->used - 1; i >= 0; i--) {
      cur      = c->dp[i];
      c->dp[i] = (cur >> bits) | (carry << upshift);
      carry    = cur & lowmask;
    }
  }
  fp_clamp (c);
  if (d != NULL) {
    fp_copy (&rem, d);
  }
 }
--- a/fp_div_d.c
+++ b/fp_div_d.c
@ -0,0 +1,89 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* returns 1 and stores the exponent in *p when b == 2**k for some k in
  * 1 .. DIGIT_BIT-1; returns 0 (leaving *p untouched) otherwise.  Note that
  * b == 1 (k == 0) is deliberately not reported.
  */
 static int s_is_power_of_two(fp_digit b, int *p)
 {
   int bit = 1;
   while (bit < DIGIT_BIT) {
      if ((((fp_digit)1) << bit) == b) {
         *p = bit;
         return 1;
      }
      ++bit;
   }
   return 0;
 }
 /* a/b => cb + d == a, with a single-digit divisor b
  *
  * c (quotient) and d (remainder digit) may each be NULL when not wanted.
  * Returns FP_VAL for b == 0 and FP_OKAY otherwise.
  */
 int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
 {
  fp_int   quot;
  fp_word  rem;
  fp_digit qdigit;
  int      pos;
  /* cannot divide by zero */
  if (b == 0) {
     return FP_VAL;
  }
  /* dividing by one, or dividing zero: quotient is a, remainder is 0 */
  if (b == 1 || fp_iszero(a) == 1) {
     if (d != NULL) {
        *d = 0;
     }
     if (c != NULL) {
        fp_copy(a, c);
     }
     return FP_OKAY;
  }
  /* a power of two turns into a mask (remainder) and a shift (quotient);
   * the remainder is read before c is written
   */
  if (s_is_power_of_two(b, &pos) == 1) {
     if (d != NULL) {
        *d = a->dp[0] & ((((fp_digit)1)<<pos) - 1);
     }
     if (c != NULL) {
        fp_div_2d(a, pos, c, NULL);
     }
     return FP_OKAY;
  }
  /* general case: schoolbook division, one digit at a time from the top,
   * carrying the running remainder in a double-width word
   */
  fp_init(&quot);
  quot.used = a->used;
  quot.sign = a->sign;
  rem = 0;
  pos = a->used;
  while (pos-- > 0) {
     rem    = (rem << ((fp_word)DIGIT_BIT)) | ((fp_word)a->dp[pos]);
     qdigit = 0;
     if (rem >= b) {
        qdigit = (fp_digit)(rem / b);
        rem   -= ((fp_word)qdigit) * ((fp_word)b);
     }
     quot.dp[pos] = qdigit;
  }
  if (d != NULL) {
     *d = (fp_digit)rem;
  }
  if (c != NULL) {
     fp_clamp(&quot);
     fp_copy(&quot, c);
  }
  return FP_OKAY;
 }
--- a/fp_exptmod.c
+++ b/fp_exptmod.c
@ -0,0 +1,170 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* y = g**x (mod b) 
 * Some restrictions... x must be positive and < b
 *
 * Sliding-window exponentiation carried out in Montgomery form: Y receives
 * G**X mod P.  Returns the error from fp_montgomery_setup() when that fails
 * (it rejects an even low digit of P), FP_OKAY otherwise.
 *
 * M[] holds the window table.  The largest window size chosen below is 6, so
 * at most 1<<6 == 64 entries are ever used.
 */
 int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
  fp_int   M[64], res;
  fp_digit buf, mp;
  int      err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
  /* find window size: larger exponents get larger windows */
  x = fp_count_bits (X);
  if (x <= 7) {
    winsize = 2;
  } else if (x <= 36) {
    winsize = 3;
  } else if (x <= 140) {
    winsize = 4;
  } else if (x <= 450) {
    winsize = 5;
  } else {
    winsize = 6;
  } 
  /* init M array (only the 1<<winsize entries that can be used) */
  memset(M, 0, sizeof(fp_int)*(1<<winsize));
  /* now setup montgomery  */
  if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
     return err;
  }
  /* setup result */
  fp_init(&res);
  /* create M table
   *
   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
   * (each entry is kept multiplied by R mod P, i.e. in Montgomery form)
   *
   * The first half of the table is not computed though except for M[0] and M[1]
   */
   /* now we need R mod m; res therefore starts out as "1" in Montgomery form */
   fp_montgomery_calc_normalization (&res, P);
   /* now set M[1] to G * R mod m */
   if (fp_cmp_mag(P, G) != FP_GT) {
      /* G >= P so we reduce it first */
      fp_mod(G, P, &M[1]);
   } else {
      fp_copy(G, &M[1]);
   }
   fp_mulmod (&M[1], &res, P, &M[1]);
  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
  fp_copy (&M[1], &M[1 << (winsize - 1)]);
  for (x = 0; x < (winsize - 1); x++) {
    fp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)]);
    fp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp);
  }
  /* create upper table: each entry is the previous one times M[1] */
  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
    fp_mul(&M[x - 1], &M[1], &M[x]);
    fp_montgomery_reduce(&M[x], P, mp);
  }
  /* set initial mode and bit cnt
   *
   * mode 0: still skipping the leading zero bits of the exponent
   * mode 1: between windows (a zero bit costs one squaring)
   * mode 2: collecting bits into the current window
   *
   * bitcnt starts at 1 so that the first pass through the loop fetches a digit
   */
  mode   = 0;
  bitcnt = 1;
  buf    = 0;
  digidx = X->used - 1;
  bitcpy = 0;
  bitbuf = 0;
  for (;;) {
    /* grab next digit as required */
    if (--bitcnt == 0) {
      /* if digidx == -1 we are out of digits so break */
      if (digidx == -1) {
        break;
      }
      /* read next digit and reset bitcnt */
      buf    = X->dp[digidx--];
      bitcnt = (int)DIGIT_BIT;
    }
    /* grab the next msb from the exponent */
    y     = (fp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
    buf <<= (fp_digit)1;
    /* if the bit is zero and mode == 0 then we ignore it
     * These represent the leading zero bits before the first 1 bit
     * in the exponent.  Technically this opt is not required but it
     * does lower the # of trivial squaring/reductions used
     */
    if (mode == 0 && y == 0) {
      continue;
    }
    /* if the bit is zero and mode == 1 then we square */
    if (mode == 1 && y == 0) {
      fp_sqr(&res, &res);
      fp_montgomery_reduce(&res, P, mp);
      continue;
    }
    /* else we add it to the window (bits are filled in from the top down) */
    bitbuf |= (y << (winsize - ++bitcpy));
    mode    = 2;
    if (bitcpy == winsize) {
      /* ok window is filled so square as required and multiply  */
      /* square first */
      for (x = 0; x < winsize; x++) {
        fp_sqr(&res, &res);
        fp_montgomery_reduce(&res, P, mp);
      }
      /* then multiply */
      fp_mul(&res, &M[bitbuf], &res);
      fp_montgomery_reduce(&res, P, mp);
      /* empty window and reset */
      bitcpy = 0;
      bitbuf = 0;
      mode   = 1;
    }
  }
  /* if bits remain then square/multiply (a partly filled window is
   * finished off one bit at a time using M[1] only)
   */
  if (mode == 2 && bitcpy > 0) {
    /* square then multiply if the bit is set */
    for (x = 0; x < bitcpy; x++) {
      fp_sqr(&res, &res);
      fp_montgomery_reduce(&res, P, mp);
      /* get next bit of the window */
      bitbuf <<= 1;
      if ((bitbuf & (1 << winsize)) != 0) {
        /* then multiply */
        fp_mul(&res, &M[1], &res);
        fp_montgomery_reduce(&res, P, mp);
      }
    }
  }
  /* fixup result if Montgomery reduction is used
   * recall that any value in a Montgomery system is
   * actually multiplied by R mod n.  So we have
   * to reduce one more time to cancel out the factor
   * of R.
   */
  fp_montgomery_reduce(&res, P, mp);
  /* copy res to Y */
  fp_copy (&res, Y);
  return FP_OKAY;
 }
--- a/fp_gcd.c
+++ b/fp_gcd.c
@ -0,0 +1,51 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = (a, b)
  *
  * Greatest common divisor by the Euclidean remainder loop.  The result is
  * always non-negative: the early-outs below return |a| or |b|, and the loop
  * runs on the magnitudes of the inputs so that it agrees with them.
  */
 void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
 {
   fp_int u, v, r;
   /* either zero than gcd is the largest */
   if (fp_iszero (a) == 1 && fp_iszero (b) == 0) {
     fp_abs (b, c);
     return;
   }
   if (fp_iszero (a) == 0 && fp_iszero (b) == 1) {
     fp_abs (a, c);
     return;
   }
   /* optimized.  At this point if a == 0 then
    * b must equal zero too
    */
   if (fp_iszero (a) == 1) {
     fp_zero(c);
     return;
   }
   /* sort inputs so that |u| >= |v| */
   if (fp_cmp_mag(a, b) != FP_LT) {
      fp_init_copy(&u, a);
      fp_init_copy(&v, b);
   } else {
      fp_init_copy(&u, b);
      fp_init_copy(&v, a);
   }
   /* drop the signs of the working copies: fp_div() hands a non-zero
    * remainder the sign of its dividend, so signed operands could leave a
    * negative value in u at the end (e.g. for a = -6, b = 4)
    */
   u.sign = v.sign = FP_ZPOS;
   fp_zero(&r);
   /* (u, v) <- (v, u mod v) until v reaches zero */
   while (fp_iszero(&v) == FP_NO) {
      fp_mod(&u, &v, &r);
      fp_copy(&v, &u);
      fp_copy(&r, &v);
   }
   fp_copy(&u, c);
 }
--- a/fp_invmod.c
+++ b/fp_invmod.c
@ -0,0 +1,98 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = 1/a (mod b) for odd b only
  *
  * Binary extended Euclidean inversion.  Returns FP_VAL when b is even, when
  * a is zero, or when a has no inverse modulo b; FP_OKAY otherwise.
  *
  * On success the magnitude of c lies in [0, |b|) and c carries the sign of
  * a.  b is expected to be positive: the final fix-up loops add/subtract b
  * and would not terminate for a negative modulus.
  */
 int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
 {
  fp_int  x, y, u, v, B, D;
  int     neg;
  /* 2. [modified] b must be odd   */
  if (fp_iseven (b) == FP_YES) {
    return FP_VAL;
  }
  /* zero has no inverse; it is rejected up front because the "while v is
   * even" loop below could never finish if zero counts as even
   */
  if (fp_iszero (a) == 1) {
    return FP_VAL;
  }
  /* init all our temps */
  fp_init(&x);  fp_init(&y);
  fp_init(&u);  fp_init(&v);
  fp_init(&B);  fp_init(&D);
  /* x == modulus, y == value to invert */
  fp_copy(b, &x);
  /* we need y = |a| */
  fp_abs(a, &y);
  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
  fp_copy(&x, &u);
  fp_copy(&y, &v);
  fp_set (&D, 1);
  do {
    /* 4.  while u is even do */
    while (fp_iseven (&u) == FP_YES) {
      /* 4.1 u = u/2 */
      fp_div_2 (&u, &u);
      /* 4.2 if B is odd then B = B - x so that it can be halved exactly */
      if (fp_isodd (&B) == FP_YES) {
        fp_sub (&B, &x, &B);
      }
      /* B = B/2 */
      fp_div_2 (&B, &B);
    }
    /* 5.  while v is even do */
    while (fp_iseven (&v) == FP_YES) {
      /* 5.1 v = v/2 */
      fp_div_2 (&v, &v);
      /* 5.2 if D is odd then D = D - x so that it can be halved exactly */
      if (fp_isodd (&D) == FP_YES) {
        fp_sub (&D, &x, &D);
      }
      /* D = D/2 */
      fp_div_2 (&D, &D);
    }
    /* 6.  if u >= v then */
    if (fp_cmp (&u, &v) != FP_LT) {
      /* u = u - v, B = B - D */
      fp_sub (&u, &v, &u);
      fp_sub (&B, &D, &B);
    } else {
      /* v = v - u, D = D - B */
      fp_sub (&v, &u, &v);
      fp_sub (&D, &B, &D);
    }
    /* if not zero go back to step 4 */
  } while (fp_iszero (&u) == FP_NO);
  /* now a = C, b = D, gcd == g*v */
  /* if v != 1 then there is no inverse */
  if (fp_cmp_d (&v, 1) != FP_EQ) {
    return FP_VAL;
  }
  /* D is now the inverse, possibly outside [0, b) */
  neg = a->sign;
  /* too small: lift it to a non-negative value */
  while (D.sign == FP_NEG) {
    fp_add (&D, b, &D);
  }
  /* too big: the D = D - B steps can leave D at or above the modulus */
  while (fp_cmp_mag (&D, b) != FP_LT) {
    fp_sub (&D, b, &D);
  }
  fp_copy (&D, c);
  c->sign = neg;
  return FP_OKAY;
 }
--- a/fp_isprime.c
+++ b/fp_isprime.c
@ -0,0 +1,74 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* a few primes
  *
  * 256 small primes in ascending order, from 0x0002 up to 0x0653 (1619).
  * fp_isprime() uses the whole table for trial division and the first
  * eight entries as Miller-Rabin bases.
  */
 static const fp_digit primes[256] = {
  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
 };
 /* probabilistic primality test: returns FP_YES when a is probably prime and
  * FP_NO when it is certainly not.
  *
  * Values below 2 (zero, one and anything negative) are rejected, members of
  * the small-prime table are accepted outright, everything else goes through
  * trial division by the table followed by 8 rounds of Miller-Rabin.
  */
 int fp_isprime(fp_int *a)
 {
   fp_int   b;
   fp_digit d;
   int      r, res;
   /* nothing below 2 is prime */
   if (fp_cmp_d(a, 2) == FP_LT) {
      return FP_NO;
   }
   /* do trial division */
   for (r = 0; r < 256; r++) {
       /* a table prime divides itself, so it has to be recognised before
        * the remainder test or it would be reported as composite
        */
       if (fp_cmp_d(a, primes[r]) == FP_EQ) {
          return FP_YES;
       }
       fp_mod_d(a, primes[r], &d);
       if (d == 0) {
          return FP_NO;
       }
   }
   /* now do 8 miller rabins, using the first 8 table primes as bases */
   for (r = 0; r < 8; r++) {
       fp_set(&b, primes[r]);
       fp_prime_miller_rabin(a, &b, &res);
       if (res == FP_NO) {
          return FP_NO;
       }
   }
   return FP_YES;
 }
--- a/fp_lcm.c
+++ b/fp_lcm.c
@ -0,0 +1,27 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = [a, b]
  *
  * Least common multiple, computed as (larger / gcd) * smaller so that the
  * intermediate product stays as small as possible.  The result is always
  * non-negative, whatever the signs of a and b.
  */
 void fp_lcm(fp_int *a, fp_int *b, fp_int *c)
 {
   fp_int  t1, t2;
   fp_init(&t1);
   fp_init(&t2);
   /* t1 = (a, b) */
   fp_gcd(a, b, &t1);
   /* divide the operand of larger magnitude by the gcd, then multiply by the other */
   if (fp_cmp_mag(a, b) == FP_GT) {
      fp_div(a, &t1, &t2, NULL);
      fp_mul(b, &t2, c);
   } else {
      fp_div(b, &t1, &t2, NULL);
      fp_mul(a, &t2, c);
   }
   /* the sign of the product depends on the operand signs (and on the sign
    * of the gcd); an lcm is a magnitude, so pin it to non-negative
    */
   c->sign = FP_ZPOS;
 }
--- a/fp_lshd.c
+++ b/fp_lshd.c
@ -0,0 +1,34 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* shift a left by x whole digits in place; digits pushed beyond the fixed
  * FP_SIZE capacity are dropped
  */
 void fp_lshd(fp_int *a, int x)
 {
   int top, i;
   /* index of the new top digit, capped at the last available slot */
   top = MIN(a->used + x - 1, (int)(FP_SIZE-1));
   a->used = top + 1;
   /* copy from the top down so that no source digit is overwritten early */
   for (i = top; i >= x; i--) {
       a->dp[i] = a->dp[i - x];
   }
   /* the vacated low digits become zero */
   while (i >= 0) {
       a->dp[i--] = 0;
   }
   /* truncation may have left leading zero digits */
   fp_clamp(a);
 }
--- a/fp_mod.c
+++ b/fp_mod.c
@ -0,0 +1,18 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a mod b, 0 <= c < b
  *
  * (for a negative b the result lies in b < c <= 0 instead, i.e. a non-zero
  * result always carries the sign of b)
  *
  * Returns whatever fp_div() returns: FP_VAL for b == 0, in which case c is
  * left untouched, and FP_OKAY otherwise.
  */
 int fp_mod(fp_int *a, fp_int *b, fp_int *c)
 {
   fp_int t;
   int    err;
   /* the remainder goes to a temporary so that b is still intact for the
    * fix-up below even when c and b are the same object
    */
   fp_init(&t);
   if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
      return err;
   }
   /* fp_div() hands a non-zero remainder the sign of a; when that differs
    * from the sign of b, adding b once moves it into the documented range
    */
   if (fp_iszero(&t) == FP_NO && t.sign != b->sign) {
      fp_add(&t, b, c);
   } else {
      fp_copy(&t, c);
   }
   return FP_OKAY;
 }
--- a/fp_mod_2d.c
+++ b/fp_mod_2d.c
@ -0,0 +1,38 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a mod 2**b
  *
  * Keeps the low b bits of the magnitude of a.  A bit count of zero or less
  * gives zero; a bit count covering every digit in use gives a plain copy.
  */
 void fp_mod_2d(fp_int *a, int b, fp_int *c)
 {
   int x, whole, partial;
   /* zero if count less than or equal to zero */
   if (b <= 0) {
      fp_zero(c);
      return;
   }
   /* get copy of input */
   fp_copy(a, c);
   /* if 2**b is larger than a we just return the copy */
   if (b >= (DIGIT_BIT * a->used)) {
      return;
   }
   whole   = b / (int)DIGIT_BIT;   /* digits that are kept completely  */
   partial = b % (int)DIGIT_BIT;   /* bits kept in the digit above them */
   /* zero digits above the last digit of the modulus */
   for (x = whole + (partial == 0 ? 0 : 1); x < c->used; x++) {
     c->dp[x] = 0;
   }
   /* clear the upper part of the digit that straddles the boundary.  The
    * mask is built from b % DIGIT_BIT: shifting by DIGIT_BIT - b, as was
    * done before, is a negative shift as soon as b exceeds DIGIT_BIT.
    * When b is a whole number of digits there is nothing left to mask.
    */
   if (partial != 0) {
     c->dp[whole] &= (((fp_digit)1) << partial) - 1;
   }
   fp_clamp (c);
 }
--- a/fp_mod_d.c
+++ b/fp_mod_d.c
@ -0,0 +1,16 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a mod b, 0 <= c < b  (single-digit modulus)
  *
  * Thin front end for fp_div_d() that discards the quotient; the status of
  * fp_div_d() is passed straight back.
  */
 int fp_mod_d(fp_int *a, fp_digit b, fp_digit *c)
 {
   int status;
   /* NULL quotient: only the remainder digit is wanted */
   status = fp_div_d(a, b, NULL, c);
   return status;
 }
--- a/fp_montgomery_calc_normalization.c
+++ b/fp_montgomery_calc_normalization.c
@ -0,0 +1,38 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* computes a = B**n mod b without division or multiplication useful for
 * normalizing numbers in a Montgomery system.
 *
 * Here B == 2**DIGIT_BIT and n == b->used, i.e. a receives R mod b for
 * R == 2**(DIGIT_BIT * b->used).  a is seeded with a power of two that is
 * below b and then doubled, subtracting b whenever it reaches b, until the
 * exponent has climbed to DIGIT_BIT * b->used.
 */
 void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
 {
  int     x, bits;
  /* how many bits of last digit does b use (0 when the top digit is full) */
  bits = fp_count_bits (b) % DIGIT_BIT;
  /* compute A = B^(n-1) * 2^(bits-1) */
  if (b->used > 1) {
     fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
  } else {
     /* single digit modulus: start from 2**0, so all DIGIT_BIT doublings
      * are still owed.  bits must therefore be 1 here; merely incrementing
      * it made the loop below stop at 2**(DIGIT_BIT - bits) instead of
      * 2**DIGIT_BIT.
      */
     fp_set(a, 1);
     bits = 1;
  }
  /* now compute C = A * B mod b: one doubling per missing bit (when bits is
   * 0 the loop starts at -1 and makes up for the lower seed exponent)
   */
  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
    fp_mul_2 (a, a);
    if (fp_cmp_mag (a, b) != FP_LT) {
      s_fp_sub (a, b, a);
    }
  }
 }
--- a/fp_montgomery_reduce.c
+++ b/fp_montgomery_reduce.c
@ -0,0 +1,249 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Per-architecture building blocks for fp_montgomery_reduce().
  *
  * Every variant supplies the same five macros, which expand inside that
  * function and refer to its locals (c, x, mp, mu, _c, tmpm) by name:
  *
  *   MONT_START  one-time setup before the outer loop
  *   MONT_FINI   one-time teardown after the result has been copied out
  *   LOOP_START  derive the per-row multiplier from c[x] and mp
  *   INNERMUL    multiply that multiplier by the next modulus digit
  *               (*tmpm++) and add the double-width product into _c[OFF0]
  *               (low half) and _c[OFF1] (high half), the final carry going
  *               to _c[OFF2]
  *   PROPCARRY   add _c[OFF1] into _c[OFF0+1] and _c[OFF2] into _c[OFF1+1],
  *               the final carry going to _c[OFF2+1]
  */
 #if defined(TFM_X86) 
 /* x86-32 code */
 #define MONT_START 
 #define MONT_FINI
 #define LOOP_START \
   mu = c[x] * mp;
 /* mull leaves the 64-bit product in edx:eax; add/adc/adc folds it in */
 #define INNERMUL \
 asm(                                                                                          \
 "movl %7,%%eax                \n\t"                                                           \
 "mull %6                      \n\t"                                                           \
 "addl %%eax,%0                \n\t"                                                           \
 "adcl %%edx,%1                \n\t"                                                           \
 "adcl $0,%2                   \n\t"                                                           \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),  \
                                                "g"(mu), "g"(*tmpm++)                          \
                                               : "%eax", "%edx", "%cc");
 /* operands %6..%8 are _c[OFF0+1], _c[OFF1+1] and _c[OFF2+1] */
 #define PROPCARRY \
 asm(                                                                                               \
 "movl %1,%%eax                \n\t"                                                                \
 "addl  %%eax,%6               \n\t"                                                                \
 "movl %2,%%eax                \n\t"                                                                \
 "adcl  %%eax,%7               \n\t"                                                                \
 "adcl $0,%8                   \n\t"                                                                \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
 : "%eax", "%cc");
 #elif defined(TFM_X86_64)
 /* x86-64 code: same instruction sequences as x86-32 with 64-bit registers */
 #define MONT_START 
 #define MONT_FINI
 #define LOOP_START \
   mu = c[x] * mp;
 #define INNERMUL \
 asm(                                                                                          \
 "movq %7,%%rax                \n\t"                                                           \
 "mulq %6                      \n\t"                                                           \
 "addq %%rax,%0                \n\t"                                                           \
 "adcq %%rdx,%1                \n\t"                                                           \
 "adcq $0,%2                   \n\t"                                                           \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),  \
                                                "g"(mu), "g"(*tmpm++)                          \
                                               : "%rax", "%rdx", "%cc");
 #define PROPCARRY \
 asm(                                                                                               \
 "movq %1,%%rax                \n\t"                                                                \
 "addq  %%rax,%6               \n\t"                                                                \
 "movq %2,%%rax                \n\t"                                                                \
 "adcq  %%rax,%7               \n\t"                                                                \
 "adcq $0,%8                   \n\t"                                                                \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
 : "%rax", "%cc");
 #elif defined(TFM_SSE2)
 /* SSE2 code
  *
  * mp is parked in mm2 once (MONT_START) and the per-row multiplier is kept
  * in mm1 (LOOP_START) rather than in the C variable mu; MONT_FINI issues
  * emms when the MMX registers are no longer needed.
  */
 #define MONT_START \
 asm("movd %0,%%mm2"::"g"(mp));
 #define MONT_FINI \
 asm("emms");
 #define LOOP_START \
 asm(\
 "movd %0,%%mm1                \n\t" \
 "pmuludq %%mm2,%%mm1          \n\t" \
 :: "g"(c[x]), "g"(mp));
 /* pmuludq forms the 64-bit product in mm0; its two halves are moved out
  * through eax one after the other
  */
 #define INNERMUL \
 asm(                                                                                          \
 "movd %6,%%mm0                \n\t"                                                           \
 "pmuludq %%mm1,%%mm0          \n\t"                                                           \
 "movd %%mm0,%%eax             \n\t"                                                           \
 "psrlq $32, %%mm0             \n\t"                                                           \
 "addl %%eax,%0                \n\t"                                                           \
 "movd %%mm0,%%eax             \n\t"                                                           \
 "adcl %%eax,%1                \n\t"                                                           \
 "adcl $0,%2                   \n\t"                                                           \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),  \
                                                "g"(*tmpm++)                                  \
                                               : "%eax", "%cc");
 #define PROPCARRY \
 asm(                                                                                               \
 "movl %1,%%eax                \n\t"                                                                \
 "addl  %%eax,%6               \n\t"                                                                \
 "movl %2,%%eax                \n\t"                                                                \
 "adcl  %%eax,%7               \n\t"                                                                \
 "adcl $0,%8                   \n\t"                                                                \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
 : "%eax", "%cc");
 #elif defined(TFM_ARM)
 /* ARM code
  *
  * The three accumulator words are passed as addresses (_c + OFF0/1/2) and
  * loaded, updated and stored one at a time through r2 / r0 / r1.
  */
 #define MONT_START 
 #define MONT_FINI
 #define LOOP_START \
   mu = c[x] * mp;
 /* NOTE: later write it using two regs instead of three for _c + ... */
 #define INNERMUL \
 asm(                                             \
 "UMULL r0,r1,%0,%1                \n\t"          \
 "LDR   r2,[%2]                    \n\t"          \
 "ADDS  r2,r2,r0                   \n\t"          \
 "STR   r2,[%2]                    \n\t"          \
 "LDR   r2,[%3]                    \n\t"          \
 "ADCS  r2,r2,r1                   \n\t"          \
 "STR   r2,[%3]                    \n\t"          \
 "LDR   r2,[%4]                    \n\t"          \
 "ADC   r2,r2,#0                   \n\t"          \
 "STR   r2,[%4]                    \n\t"          \
 ::"r"(mu),"r"(*tmpm++),"r"(_c + OFF0),"r"(_c + OFF1),"r"(_c + OFF2):"r0", "r1", "r2", "%cc");
 /* the #4 offsets address the next word of each row, i.e. _c[OFFn+1] */
 #define PROPCARRY \
 asm(                                             \
 "LDR   r0,[%1]                    \n\t"          \
 "LDR   r1,[%0,#4]                 \n\t"          \
 "ADDS  r0,r0,r1                   \n\t"          \
 "STR   r0,[%0,#4]                 \n\t"          \
 "LDR   r0,[%2]                    \n\t"          \
 "LDR   r1,[%1,#4]                 \n\t"          \
 "ADCS  r0,r0,r1                   \n\t"          \
 "STR   r0,[%1,#4]                 \n\t"          \
 "LDR   r0,[%2,#4]                 \n\t"          \
 "ADC   r0,r0,#0                   \n\t"          \
 "STR   r0,[%2,#4]                 \n\t"          \
 ::"r"(_c + OFF0),"r"(_c + OFF1),"r"(_c + OFF2):"r0", "r1", "%cc");
 #else
 /* ISO C code */
 #define MONT_START 
 #define MONT_FINI
 #define LOOP_START \
   mu = c[x] * mp;
 /* INNERMUL (ISO C): t = mu * (*tmpm++); the low half of t is added to
  * _c[OFF0] and the high half to _c[OFF1], with carries rippling upwards
  * into _c[OFF2].  This mirrors the add/adc/adc chains of the assembler
  * variants: when the carry out of the low word wraps _c[OFF1] to zero, that
  * carry is passed on to _c[OFF2] instead of being lost.
  *
  * The last line of the macro deliberately has no trailing backslash, so
  * that the definition ends here and does not swallow the one that follows.
  */
 #define INNERMUL \
   t = ((fp_word)mu) * ((fp_word)*tmpm++);                                      \
   _c[OFF0] += (fp_digit)t;                                                     \
   if (_c[OFF0] < (fp_digit)t) { if (++_c[OFF1] == 0) ++_c[OFF2]; }             \
   _c[OFF1] += (fp_digit)(t>>DIGIT_BIT);                                        \
   if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2];
 /* PROPCARRY (ISO C): _c[OFF0+1] += _c[OFF1] and _c[OFF1+1] += _c[OFF2],
  * each carry moving one row up; a carry that wraps _c[OFF1+1] to zero is
  * forwarded to _c[OFF2+1].
  */
 #define PROPCARRY \
   _c[OFF0+1] += _c[OFF1];                                                      \
   if (_c[OFF0+1] < _c[OFF1]) { if (++_c[OFF1+1] == 0) ++_c[OFF2+1]; }          \
   _c[OFF1+1] += _c[OFF2];                                                      \
   if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1];
 #endif
 /* The scratch buffer c[] in fp_montgomery_reduce() is three rows of FP_SIZE
  * digits laid end to end; OFF0/OFF1/OFF2 are the row starts.  Row 0 holds
  * the working value, rows 1 and 2 collect the upper words/carries that the
  * INNERMUL and PROPCARRY macros push upwards.
  */
 #define OFF0  (0)
 #define OFF1  (FP_SIZE)
 #define OFF2  (FP_SIZE+FP_SIZE)
 /* computes x/R == x (mod N) via Montgomery Reduction
  *
  * a is reduced in place modulo m; mp is the per-modulus constant prepared
  * by fp_montgomery_setup().
  *
  * The locals c, x, mp, mu, _c, tmpm and t are referenced by name from the
  * LOOP_START / INNERMUL / PROPCARRY macros and must keep these names (t is
  * only touched by the ISO C INNERMUL).
  */
 void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
 {
    fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
    int      oldused, x, y, pa;
    fp_word  t;
    /* now zero the buff */
    pa = m->used;
    memset(c, 0, sizeof(c));
    /* copy the input into row 0 */
    oldused = a->used;
    for (x = 0; x < oldused; x++) {
        c[x] = a->dp[x];
    }
    MONT_START;
    /* now let's get bizz-sy!  one pass per modulus digit */
    for (x = 0; x < pa; x++) {
        /* get Mu for this round */
        LOOP_START;
        /* our friendly neighbourhood alias */
        _c   = c + x;
        tmpm = m->dp;
        /* add mu * m, shifted up by x digits, into the accumulator rows */
        for (y = 0; y < pa; y++) {
           INNERMUL;
           ++_c;
        }
        /* send carry up man... */
        _c = c + x;
        PROPCARRY;
  }         
  /* fix the rest of the carries
   * NOTE(review): the last pass touches _c[OFF2+1] with _c == c + 2*pa + 1,
   * so this presumably relies on 2*pa + 2 < FP_SIZE -- confirm against the
   * FP_SIZE definition
   */
  _c = c + pa;
  for (; x < pa * 2 + 2; x++) {
     PROPCARRY;
     ++_c;
  }
  /* now copy out: the result is row 0 from digit pa upwards (pa+1 digits) */
  _c = c + pa;
  tmpm = a->dp;
  for (x = 0; x < pa+1; x++) {
     *tmpm++ = *_c++;
  }
  /* clear whatever a held above the copied digits */
  for (; x < oldused; x++)   {
     *tmpm++ = 0;
  }
  MONT_FINI;
  a->used = pa+1;
  fp_clamp(a);
  /* if A >= m then A = A - m */
  if (fp_cmp_mag (a, m) != FP_LT) {
    s_fp_sub (a, m, a);
  }
 }
--- a/fp_montgomery_setup.c
+++ b/fp_montgomery_setup.c
@ -0,0 +1,44 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Sets up Montgomery reduction for the modulus a.
  *
  * Stores in *rho the value 2**DIGIT_BIT - x, truncated to one digit, where
  * x is the inverse of the lowest digit of a modulo 2**DIGIT_BIT.
  * Returns FP_VAL when the lowest digit of a is even (no such inverse
  * exists), FP_OKAY otherwise.
  */
 int fp_montgomery_setup(fp_int *a, fp_digit *rho)
 {
    fp_digit m0, inv;
    fp_word  radix;

    /* only the least significant digit of the modulus matters */
    m0 = a->dp[0];
    if (!(m0 & 1)) {
       return FP_VAL;
    }

    /* fast inversion mod 2**k
     *
     * Based on the fact that
     *
     * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
     *                    =>  2*X*A - X*X*A*A = 1
     *                    =>  2*(1) - (1)     = 1
     *
     * so every step below doubles the number of correct low bits.
     */
    inv = m0 + (((m0 + 2) & 4) << 1);   /* inv*a == 1 mod 2**4  */
    inv = inv * (2 - m0 * inv);         /* inv*a == 1 mod 2**8  */
    inv = inv * (2 - m0 * inv);         /* inv*a == 1 mod 2**16 */
    inv = inv * (2 - m0 * inv);         /* inv*a == 1 mod 2**32 */
 #ifdef FP_64BIT
    inv = inv * (2 - m0 * inv);         /* inv*a == 1 mod 2**64 */
 #endif

    /* rho = -1/m mod b */
    radix = ((fp_word) 1) << ((fp_word) DIGIT_BIT);
    *rho  = (fp_digit)(radix - ((fp_word)inv));
    return FP_OKAY;
 }
--- a/fp_mul.c
+++ b/fp_mul.c
@ -0,0 +1,134 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a * b
  *
  * Small or moderately sized operands (smaller operand <= 8 digits, or larger
  * operand <= 64 digits) go to a comba multiplier; anything bigger is split
  * once with Karatsuba and the three half-size products recurse into fp_mul.
  *
  * C may alias A or B.  After the comba path any digits of C above the new
  * C->used that belonged to C's previous value are cleared, because the
  * unrolled combas only store their fixed 2N result digits.
  */
 void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 {
    int    r, y, yy, s, oldused;
    fp_int ac, bd, comp, amb, cmd, t1, t2;

    y  = MAX(A->used, B->used);
    yy = MIN(A->used, B->used);
    if (yy <= 8 || y <= 64) {
       /* remember how long the destination was (bounded by the digit array)
          so stale high digits can be wiped once the product is in place */
       oldused = MIN(C->used, FP_SIZE);

       /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
          of the largest input.  We also want to avoid doing excess mults if the
          inputs are not close to the next power of two.  That is, for example,
          if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications
       */
       if (y <= 4) {
          fp_mul_comba4(A,B,C);
       } else if (y <= 8) {
          fp_mul_comba8(A,B,C);
       } else if (y <= 16 && y >= 12) {
          fp_mul_comba16(A,B,C);
 #ifdef TFM_HUGE
       } else if (y <= 32 && y >= 28) {
          fp_mul_comba32(A,B,C);
 #endif
       } else {
          fp_mul_comba(A,B,C);
       }

       /* zero any excess digits on the destination that we didn't write to */
       for (s = C->used; s < oldused; s++) {
          C->dp[s] = 0;
       }
    } else {
       /* do the karatsuba action

          if A = ab and B = cd for ||a|| = r we need to solve

          ac*r^2 + (-(a-b)(c-d) + ac + bd)*r + bd

          So we solve for the three products then we form the final result with careful shifting
          and addition.

          Obvious points of optimization

          - "ac" parts can be memcpy'ed with an offset [all you have to do is zero upto the next 8 digits]
          - Similarly the "bd" parts can be memcpy'ed and zeroed to 8
       */
       /* get our value of r */
       r = yy >> 1;

       /* now solve for ac */
       /* t1 = A >> (r digits); same as fp_copy(A, &t1); fp_rshd(&t1, r); */
       for (s = 0; s < A->used - r; s++) {
           t1.dp[s] = A->dp[s+r];
       }
       for (; s < FP_SIZE; s++) {
           t1.dp[s] = 0;
       }
       if (A->used >= r) {
          t1.used = A->used - r;
       } else {
          t1.used = 0;
       }
       t1.sign = A->sign;

       /* t2 = B >> (r digits); same as fp_copy(B, &t2); fp_rshd(&t2, r); */
       for (s = 0; s < B->used - r; s++) {
           t2.dp[s] = B->dp[s+r];
       }
       for (; s < FP_SIZE; s++) {
           t2.dp[s] = 0;
       }
       if (B->used >= r) {
          t2.used = B->used - r;
       } else {
          t2.used = 0;
       }
       t2.sign = B->sign;

       /* keep the high halves for the (a-b) and (c-d) terms */
       fp_copy(&t1, &amb); fp_copy(&t2, &cmd);
       fp_zero(&ac);
       fp_mul(&t1, &t2, &ac);

       /* now solve for bd */
       /* t1 = A mod (r digits), t2 = B mod (r digits); same as
          fp_mod_2d(A, r * DIGIT_BIT, &t1); fp_mod_2d(B, r * DIGIT_BIT, &t2); */
       for (s = 0; s < r; s++) {
           t1.dp[s] = A->dp[s];
           t2.dp[s] = B->dp[s];
       }
       for (; s < FP_SIZE; s++) {
           t1.dp[s] = 0;
           t2.dp[s] = 0;
       }
       t1.used = r;
       t2.used = r;
       fp_clamp(&t1);
       fp_clamp(&t2);

       fp_sub(&amb, &t1, &amb); fp_sub(&cmd, &t2, &cmd);
       fp_zero(&bd);
       fp_mul(&t1, &t2, &bd);

       /* now get the (a-b)(c-d) term */
       fp_zero(&comp);
       fp_mul(&amb, &cmd, &comp);

       /* now solve the system, do the middle term first */
       comp.sign ^= 1;
       fp_add(&comp, &ac, &comp);
       fp_add(&comp, &bd, &comp);
       fp_lshd(&comp, r);

       /* leading term */
       fp_lshd(&ac, r+r);

       /* now sum them together; the sign is taken from the inputs before C
          (which may alias A or B) is cleared */
       s = A->sign ^ B->sign;
       fp_zero(C);
       fp_add(&ac, &comp, C);
       fp_add(&bd, C, C);
       C->sign = C->used ? s : FP_ZPOS;
    }
 }
--- a/fp_mul_2.c
+++ b/fp_mul_2.c
@ -0,0 +1,63 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* b = a * 2
  *
  * Shifts every digit of a left by one bit into b, carrying the top bit of
  * each digit into the next one.  b may alias a.  A final carry grows the
  * result by one digit only while b->used is below FP_SIZE-1; otherwise the
  * carry is dropped (the result is truncated).
  */
 void fp_mul_2(fp_int * a, fp_int * b)
 {
   int     x, oldused;

   /* remember the destination length so stale digits can be cleared below */
   oldused = b->used;
   b->used = a->used;

   {
     register fp_digit r, rr, *tmpa, *tmpb;

     /* alias for source */
     tmpa = a->dp;

     /* alias for dest */
     tmpb = b->dp;

     /* carry */
     r = 0;
     for (x = 0; x < a->used; x++) {
       /* get what will be the *next* carry bit from the
        * MSB of the current digit
        */
       rr = *tmpa >> ((fp_digit)(DIGIT_BIT - 1));

       /* now shift up this digit, add in the carry [from the previous] */
       *tmpb++ = ((*tmpa++ << ((fp_digit)1)) | r);

       /* copy the carry that would be from the source
        * digit into the next iteration
        */
       r = rr;
     }

     /* new leading digit?  Use "<" rather than "!=": with "!=" a value whose
      * used count already equals FP_SIZE would pass the test and the store
      * below would land one element past the end of dp[].
      */
     if (r != 0 && b->used < (FP_SIZE-1)) {
       /* add a MSB which is always 1 at this point */
       *tmpb = 1;
       ++(b->used);
     }

     /* now zero any excess digits on the destination
      * that we didn't write to
      */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
     }
   }
   b->sign = a->sign;
 }
--- a/fp_mul_2d.c
+++ b/fp_mul_2d.c
@ -0,0 +1,43 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a * 2**b
  *
  * The shift is split into a whole-digit part, done by fp_lshd(), and a
  * remaining bit shift of less than DIGIT_BIT bits applied digit by digit.
  */
 void fp_mul_2d(fp_int *a, int b, fp_int *c)
 {
    fp_digit spill, spill_next, down;
    int      i;

    /* work on a copy of the input held in c */
    fp_copy(a, c);

    /* whole digits first */
    if (b >= DIGIT_BIT) {
       fp_lshd(c, b/DIGIT_BIT);
    }

    /* then the leftover bits, if any */
    b %= DIGIT_BIT;
    if (b != 0) {
       spill = 0;
       down  = DIGIT_BIT - b;
       i     = 0;
       while (i < c->used) {
          /* bits shifted out of the top of this digit feed the next one */
          spill_next = c->dp[i] >> down;
          c->dp[i]   = (c->dp[i] << b) + spill;
          spill      = spill_next;
          ++i;
       }

       /* append the last spilled bits as a new top digit if there is room */
       if (spill && i < FP_SIZE) {
          c->dp[c->used] = spill;
          c->used++;
       }
    }

    fp_clamp(c);
 }
--- a/fp_mul_comba.c
+++ b/fp_mul_comba.c
@ -0,0 +1,772 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 /* About this file...
 */
 #include <tfm.h>
 /* these are the combas.  Worship them. */
 #if defined(TFM_X86)
 /* Generic x86 optimized code
  *
  * All COMBA_* macros and MULADD operate on the locals c0, c1 and c2 of the
  * multiplier function they are expanded in; together these hold the running
  * column sum, least significant digit in c0.
  */
 /* anything you need at the start (nothing on this target) */
 #define COMBA_START
 /* clear the chaining variables */
 #define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
 /* forward the carry to the next digit: shift the accumulator down by one digit */
 #define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 /* store the first sum (the finished low digit c0) into x */
 #define COMBA_STORE(x) \
   x = c0;
 /* store the second sum [carry] (c1) into x */
 #define COMBA_STORE2(x) \
   x = c1;
 /* anything you need at the end (nothing on this target) */
 #define COMBA_FINI
 /* multiply i by j and add the product into the accumulator: i is loaded
  * into eax, "mull" multiplies it by j, the low half (eax) is added to c0, the
  * high half (edx) is added with carry to c1 and the final carry goes to c2.
  * i and j are passed as memory operands.
  */
 #define MULADD(i, j)                                      \
 asm volatile (                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
 #elif defined(TFM_X86_64)
 /* x86-64 optimized
  *
  * Same macro set as the other targets, using the 64-bit forms of the
  * instructions.  The macros operate on the locals c0, c1 and c2 of the
  * multiplier function they are expanded in.
  */
 /* anything you need at the start (nothing on this target) */
 #define COMBA_START
 /* clear the chaining variables */
 #define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
 /* forward the carry to the next digit: shift the accumulator down by one digit */
 #define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 /* store the first sum (the finished low digit c0) into x */
 #define COMBA_STORE(x) \
   x = c0;
 /* store the second sum [carry] (c1) into x */
 #define COMBA_STORE2(x) \
   x = c1;
 /* anything you need at the end (nothing on this target) */
 #define COMBA_FINI
 /* multiply i by j and add the product into the accumulator: i is loaded
  * into rax, "mulq" multiplies it by j, the low half (rax) is added to c0, the
  * high half (rdx) is added with carry to c1 and the final carry goes to c2.
  * i and j are passed as memory operands.
  */
 #define MULADD(i, j)                                      \
 asm volatile (                                            \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%rax","%rdx","%cc");
 #elif defined(TFM_SSE2)
 /* use SSE2 optimizations
  *
  * The product is formed with pmuludq in the MMX registers mm0/mm1 and then
  * moved through eax into the c0/c1/c2 accumulator held in the locals of the
  * multiplier function these macros are expanded in.
  */
 /* anything you need at the start (nothing on this target) */
 #define COMBA_START
 /* clear the chaining variables */
 #define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
 /* forward the carry to the next digit: shift the accumulator down by one digit */
 #define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 /* store the first sum (the finished low digit c0) into x */
 #define COMBA_STORE(x) \
   x = c0;
 /* store the second sum [carry] (c1) into x */
 #define COMBA_STORE2(x) \
   x = c1;
 /* anything you need at the end: issue "emms" once the MMX registers are no
  * longer in use
  */
 #define COMBA_FINI \
   asm("emms");
 /* multiply i by j and add the product into the accumulator: i and j are
  * loaded into mm0 and mm1, pmuludq leaves the 64-bit product in mm0, its low
  * 32 bits are added to c0, the high 32 bits (after psrlq $32) are added with
  * carry to c1 and the final carry goes to c2.
  * NOTE(review): mm0 and mm1 are written but do not appear in the clobber
  * list -- confirm this is safe with the compilers in use.
  */
   #define MULADD(i, j)                                      \
   asm volatile (                                            \
        "movd  %6,%%mm0     \n\t"                            \
        "movd  %7,%%mm1     \n\t"                            \
        "pmuludq %%mm1,%%mm0\n\t"                            \
        "movd  %%mm0,%%eax  \n\t"                            \
        "psrlq $32,%%mm0    \n\t"                            \
        "addl  %%eax,%0     \n\t"                            \
        "movd  %%mm0,%%eax  \n\t"                            \
        "adcl  %%eax,%1     \n\t"                            \
        "adcl  $0,%2        \n\t"                            \
        :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%cc");
 #elif defined(TFM_ARM)
 /* ARM code
  *
  * Same macro set as the other targets; the macros operate on the locals
  * c0, c1 and c2 of the multiplier function they are expanded in.
  */
 /* anything you need at the start (nothing on this target) */
 #define COMBA_START 
 /* clear the chaining variables */
 #define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
 /* forward the carry to the next digit: shift the accumulator down by one digit */
 #define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 /* store the first sum (the finished low digit c0) into x */
 #define COMBA_STORE(x) \
   x = c0;
 /* store the second sum [carry] (c1) into x */
 #define COMBA_STORE2(x) \
   x = c1;
 /* anything you need at the end (nothing on this target) */
 #define COMBA_FINI
 /* multiply i by j and add the product into the accumulator: UMULL places
  * the low/high halves of i*j in r0/r1, ADDS adds r0 to c0, ADCS adds r1 plus
  * carry to c1 and ADC adds the remaining carry to c2.  i and j are passed
  * in registers; r0 and r1 are listed as clobbered.
  */
 #define MULADD(i, j)                                          \
 asm(                                                          \
 "  UMULL  r0,r1,%6,%7           \n\t"                         \
 "  ADDS   %0,%0,r0              \n\t"                         \
 "  ADCS   %1,%1,r1              \n\t"                         \
 "  ADC    %2, %2, #0            \n\t"                         \
 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
 #else
 /* ISO C code
  *
  * Portable versions of the comba macros.  They operate on the locals c0, c1
  * and c2 (fp_digit) and the double-width temporary t (fp_word) of the
  * multiplier function they are expanded in; c0 is the least significant
  * digit of the running column sum.
  */
 /* anything you need at the start (nothing in portable C) */
 #define COMBA_START
 /* clear the chaining variables */
 #define COMBA_CLEAR \
    c0 = c1 = c2 = 0;
 /* forward the carry to the next digit: shift the accumulator down by one digit */
 #define COMBA_FORWARD \
    c0 = c1; c1 = c2; c2 = 0;
 /* store the first sum (the finished low digit c0) into x */
 #define COMBA_STORE(x) \
    x = c0;
 /* store the second sum [carry] (c1) into x */
 #define COMBA_STORE2(x) \
    x = c1;
 /* anything you need at the end (nothing in portable C) */
 #define COMBA_FINI
 /* multiply i by j and add the product into c0/c1/c2.
  *
  * Each digit is summed in the double-width t so that the carry out of c1 is
  * taken from the real sum.  c0 + i*j cannot overflow fp_word because
  * (B-1) + (B-1)^2 < B^2 for digit radix B.  A compare-after-increment scheme
  * would lose the carry into c2 whenever c1 is all ones and receives a carry
  * from c0.
  */
 #define MULADD(i, j)                                          \
    t  = (fp_word)c0 + ((fp_word)(i)) * ((fp_word)(j));        \
    c0 = (fp_digit)t;                                          \
    t  = (fp_word)c1 + (t >> DIGIT_BIT);                       \
    c1 = (fp_digit)t;                                          \
    c2 += (fp_digit)(t >> DIGIT_BIT);
 #endif
 /* generic PxQ multiplier: C = A * B for operands of any length
  *
  * The result is built column by column (comba style): for output digit ix
  * all products A[tx] * B[ty] with tx + ty == ix are accumulated in c0/c1/c2
  * and the low digit is stored.  The output is limited to FP_SIZE-1 digits;
  * a longer product is truncated.  C may alias A or B, in which case the
  * product is built in a temporary and copied to C at the end.
  */
 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
 {
    int       ix, iy, iz, tx, ty, pa;
    fp_digit  c0, c1, c2, *tmpx, *tmpy;
    fp_word   t;     /* used only by MULADD variants that need a temporary */
    fp_int    tmp, *dst;

    COMBA_START;
    COMBA_CLEAR;

    /* get size of output and trim */
    pa = A->used + B->used;
    if (pa >= FP_SIZE) {
       pa = FP_SIZE-1;
    }

    /* never write into an operand while it is still being read */
    if (A == C || B == C) {
       fp_zero(&tmp);
       dst = &tmp;
    } else {
       fp_zero(C);
       dst = C;
    }

    for (ix = 0; ix < pa; ix++) {
       /* get offsets into the two bignums */
       ty = MIN(ix, B->used-1);
       tx = ix - ty;

       /* setup temp aliases */
       tmpx = A->dp + tx;
       tmpy = B->dp + ty;

       /* this is the number of times the loop will iterate, essentially it's
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(A->used-tx, ty+1);

       /* execute loop */
       COMBA_FORWARD;
       for (iz = 0; iz < iy; ++iz) {
           MULADD(*tmpx++, *tmpy--);
       }

       /* store term */
       COMBA_STORE(dst->dp[ix]);
    }

    /* The carry left in c1 is deliberately NOT stored to dst->dp[pa]: that
     * digit lies above dst->used.  For a product that fits it is zero anyway
     * (dst was zeroed above); for a truncated product it would leave a
     * non-zero digit above `used`.
     */
    COMBA_FINI;

    dst->used = pa;
    fp_clamp(dst);
    dst->sign = dst->used ? A->sign ^ B->sign : FP_ZPOS;
    fp_copy(dst, C);
 }
 /* Fully unrolled 4x4-digit comba multiplier: C = A * B.
  *
  * Exactly 4 digits are copied from each operand into the local array at[]
  * (A in at[0..3], B in at[4..7]) before anything is written to C, and every
  * product is formed from at[].  Columns 0..6 are accumulated with MULADD and
  * stored to C->dp[0..6]; the remaining carry is stored to C->dp[7].
  * C->used is set to 8 and then trimmed by fp_clamp(); C->sign is the XOR of
  * the operand signs.  Only C->dp[0..7] are stored by this function.
  *
  * NOTE(review): the body has the shape emitted by the comba_mult_gen.c
  * generator -- presumably it is generated and should not be edited by hand.
  */
 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
 {
   /* t is referenced only by MULADD variants that need a temporary */
   fp_word t;
   fp_digit c0, c1, c2, at[8];
   /* snapshot both operands */
   memcpy(at, A->dp, 4 * sizeof(fp_digit));
   memcpy(at+4, B->dp, 4 * sizeof(fp_digit));
   COMBA_START;
   COMBA_CLEAR;
   /* 0 */
   MULADD(at[0], at[4]); 
   COMBA_STORE(C->dp[0]);
   /* 1 */
   COMBA_FORWARD;
   MULADD(at[0], at[5]);    MULADD(at[1], at[4]); 
   COMBA_STORE(C->dp[1]);
   /* 2 */
   COMBA_FORWARD;
   MULADD(at[0], at[6]);    MULADD(at[1], at[5]);    MULADD(at[2], at[4]); 
   COMBA_STORE(C->dp[2]);
   /* 3 */
   COMBA_FORWARD;
   MULADD(at[0], at[7]);    MULADD(at[1], at[6]);    MULADD(at[2], at[5]);    MULADD(at[3], at[4]); 
   COMBA_STORE(C->dp[3]);
   /* 4 */
   COMBA_FORWARD;
   MULADD(at[1], at[7]);    MULADD(at[2], at[6]);    MULADD(at[3], at[5]); 
   COMBA_STORE(C->dp[4]);
   /* 5 */
   COMBA_FORWARD;
   MULADD(at[2], at[7]);    MULADD(at[3], at[6]); 
   COMBA_STORE(C->dp[5]);
   /* 6 */
   COMBA_FORWARD;
   MULADD(at[3], at[7]); 
   COMBA_STORE(C->dp[6]);
   /* final carry becomes the top digit */
   COMBA_STORE2(C->dp[7]);
   C->used = 8;
   C->sign = A->sign ^ B->sign;
   fp_clamp(C);
   COMBA_FINI;
 }
 /* Fully unrolled 8x8-digit comba multiplier: C = A * B.
  *
  * Exactly 8 digits are copied from each operand into the local array at[]
  * (A in at[0..7], B in at[8..15]) before anything is written to C, and every
  * product is formed from at[].  Columns 0..14 are accumulated with MULADD
  * and stored to C->dp[0..14]; the remaining carry is stored to C->dp[15].
  * C->used is set to 16 and then trimmed by fp_clamp(); C->sign is the XOR of
  * the operand signs.  Only C->dp[0..15] are stored by this function.
  *
  * NOTE(review): the body has the shape emitted by the comba_mult_gen.c
  * generator -- presumably it is generated and should not be edited by hand.
  */
 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
 {
   /* t is referenced only by MULADD variants that need a temporary */
   fp_word t;
   fp_digit c0, c1, c2, at[16];
   /* snapshot both operands */
   memcpy(at, A->dp, 8 * sizeof(fp_digit));
   memcpy(at+8, B->dp, 8 * sizeof(fp_digit));
   COMBA_START;
   COMBA_CLEAR;
   /* 0 */
   MULADD(at[0], at[8]); 
   COMBA_STORE(C->dp[0]);
   /* 1 */
   COMBA_FORWARD;
   MULADD(at[0], at[9]);    MULADD(at[1], at[8]); 
   COMBA_STORE(C->dp[1]);
   /* 2 */
   COMBA_FORWARD;
   MULADD(at[0], at[10]);    MULADD(at[1], at[9]);    MULADD(at[2], at[8]); 
   COMBA_STORE(C->dp[2]);
   /* 3 */
   COMBA_FORWARD;
   MULADD(at[0], at[11]);    MULADD(at[1], at[10]);    MULADD(at[2], at[9]);    MULADD(at[3], at[8]); 
   COMBA_STORE(C->dp[3]);
   /* 4 */
   COMBA_FORWARD;
   MULADD(at[0], at[12]);    MULADD(at[1], at[11]);    MULADD(at[2], at[10]);    MULADD(at[3], at[9]);    MULADD(at[4], at[8]); 
   COMBA_STORE(C->dp[4]);
   /* 5 */
   COMBA_FORWARD;
   MULADD(at[0], at[13]);    MULADD(at[1], at[12]);    MULADD(at[2], at[11]);    MULADD(at[3], at[10]);    MULADD(at[4], at[9]);    MULADD(at[5], at[8]); 
   COMBA_STORE(C->dp[5]);
   /* 6 */
   COMBA_FORWARD;
   MULADD(at[0], at[14]);    MULADD(at[1], at[13]);    MULADD(at[2], at[12]);    MULADD(at[3], at[11]);    MULADD(at[4], at[10]);    MULADD(at[5], at[9]);    MULADD(at[6], at[8]); 
   COMBA_STORE(C->dp[6]);
   /* 7 */
   COMBA_FORWARD;
   MULADD(at[0], at[15]);    MULADD(at[1], at[14]);    MULADD(at[2], at[13]);    MULADD(at[3], at[12]);    MULADD(at[4], at[11]);    MULADD(at[5], at[10]);    MULADD(at[6], at[9]);    MULADD(at[7], at[8]); 
   COMBA_STORE(C->dp[7]);
   /* 8 */
   COMBA_FORWARD;
   MULADD(at[1], at[15]);    MULADD(at[2], at[14]);    MULADD(at[3], at[13]);    MULADD(at[4], at[12]);    MULADD(at[5], at[11]);    MULADD(at[6], at[10]);    MULADD(at[7], at[9]); 
   COMBA_STORE(C->dp[8]);
   /* 9 */
   COMBA_FORWARD;
   MULADD(at[2], at[15]);    MULADD(at[3], at[14]);    MULADD(at[4], at[13]);    MULADD(at[5], at[12]);    MULADD(at[6], at[11]);    MULADD(at[7], at[10]); 
   COMBA_STORE(C->dp[9]);
   /* 10 */
   COMBA_FORWARD;
   MULADD(at[3], at[15]);    MULADD(at[4], at[14]);    MULADD(at[5], at[13]);    MULADD(at[6], at[12]);    MULADD(at[7], at[11]); 
   COMBA_STORE(C->dp[10]);
   /* 11 */
   COMBA_FORWARD;
   MULADD(at[4], at[15]);    MULADD(at[5], at[14]);    MULADD(at[6], at[13]);    MULADD(at[7], at[12]); 
   COMBA_STORE(C->dp[11]);
   /* 12 */
   COMBA_FORWARD;
   MULADD(at[5], at[15]);    MULADD(at[6], at[14]);    MULADD(at[7], at[13]); 
   COMBA_STORE(C->dp[12]);
   /* 13 */
   COMBA_FORWARD;
   MULADD(at[6], at[15]);    MULADD(at[7], at[14]); 
   COMBA_STORE(C->dp[13]);
   /* 14 */
   COMBA_FORWARD;
   MULADD(at[7], at[15]); 
   COMBA_STORE(C->dp[14]);
   /* final carry becomes the top digit */
   COMBA_STORE2(C->dp[15]);
   C->used = 16;
   C->sign = A->sign ^ B->sign;
   fp_clamp(C);
   COMBA_FINI;
 }
 /* Fully unrolled 16x16-digit comba multiplier: C = A * B.
  *
  * Exactly 16 digits are copied from each operand into the local array at[]
  * (A in at[0..15], B in at[16..31]) before anything is written to C, and
  * every product is formed from at[].  Columns 0..30 are accumulated with
  * MULADD and stored to C->dp[0..30]; the remaining carry is stored to
  * C->dp[31].  C->used is set to 32 and then trimmed by fp_clamp(); C->sign
  * is the XOR of the operand signs.  Only C->dp[0..31] are stored by this
  * function.
  *
  * NOTE(review): the body has the shape emitted by the comba_mult_gen.c
  * generator -- presumably it is generated and should not be edited by hand.
  */
 void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
 {
   /* t is referenced only by MULADD variants that need a temporary */
   fp_word t;
   fp_digit c0, c1, c2, at[32];
   /* snapshot both operands */
   memcpy(at, A->dp, 16 * sizeof(fp_digit));
   memcpy(at+16, B->dp, 16 * sizeof(fp_digit));
   COMBA_START;
   COMBA_CLEAR;
   /* 0 */
   MULADD(at[0], at[16]); 
   COMBA_STORE(C->dp[0]);
   /* 1 */
   COMBA_FORWARD;
   MULADD(at[0], at[17]);    MULADD(at[1], at[16]); 
   COMBA_STORE(C->dp[1]);
   /* 2 */
   COMBA_FORWARD;
   MULADD(at[0], at[18]);    MULADD(at[1], at[17]);    MULADD(at[2], at[16]); 
   COMBA_STORE(C->dp[2]);
   /* 3 */
   COMBA_FORWARD;
   MULADD(at[0], at[19]);    MULADD(at[1], at[18]);    MULADD(at[2], at[17]);    MULADD(at[3], at[16]); 
   COMBA_STORE(C->dp[3]);
   /* 4 */
   COMBA_FORWARD;
   MULADD(at[0], at[20]);    MULADD(at[1], at[19]);    MULADD(at[2], at[18]);    MULADD(at[3], at[17]);    MULADD(at[4], at[16]); 
   COMBA_STORE(C->dp[4]);
   /* 5 */
   COMBA_FORWARD;
   MULADD(at[0], at[21]);    MULADD(at[1], at[20]);    MULADD(at[2], at[19]);    MULADD(at[3], at[18]);    MULADD(at[4], at[17]);    MULADD(at[5], at[16]); 
   COMBA_STORE(C->dp[5]);
   /* 6 */
   COMBA_FORWARD;
   MULADD(at[0], at[22]);    MULADD(at[1], at[21]);    MULADD(at[2], at[20]);    MULADD(at[3], at[19]);    MULADD(at[4], at[18]);    MULADD(at[5], at[17]);    MULADD(at[6], at[16]); 
   COMBA_STORE(C->dp[6]);
   /* 7 */
   COMBA_FORWARD;
   MULADD(at[0], at[23]);    MULADD(at[1], at[22]);    MULADD(at[2], at[21]);    MULADD(at[3], at[20]);    MULADD(at[4], at[19]);    MULADD(at[5], at[18]);    MULADD(at[6], at[17]);    MULADD(at[7], at[16]); 
   COMBA_STORE(C->dp[7]);
   /* 8 */
   COMBA_FORWARD;
   MULADD(at[0], at[24]);    MULADD(at[1], at[23]);    MULADD(at[2], at[22]);    MULADD(at[3], at[21]);    MULADD(at[4], at[20]);    MULADD(at[5], at[19]);    MULADD(at[6], at[18]);    MULADD(at[7], at[17]);    MULADD(at[8], at[16]); 
   COMBA_STORE(C->dp[8]);
   /* 9 */
   COMBA_FORWARD;
   MULADD(at[0], at[25]);    MULADD(at[1], at[24]);    MULADD(at[2], at[23]);    MULADD(at[3], at[22]);    MULADD(at[4], at[21]);    MULADD(at[5], at[20]);    MULADD(at[6], at[19]);    MULADD(at[7], at[18]);    MULADD(at[8], at[17]);    MULADD(at[9], at[16]); 
   COMBA_STORE(C->dp[9]);
   /* 10 */
   COMBA_FORWARD;
   MULADD(at[0], at[26]);    MULADD(at[1], at[25]);    MULADD(at[2], at[24]);    MULADD(at[3], at[23]);    MULADD(at[4], at[22]);    MULADD(at[5], at[21]);    MULADD(at[6], at[20]);    MULADD(at[7], at[19]);    MULADD(at[8], at[18]);    MULADD(at[9], at[17]);    MULADD(at[10], at[16]); 
   COMBA_STORE(C->dp[10]);
   /* 11 */
   COMBA_FORWARD;
   MULADD(at[0], at[27]);    MULADD(at[1], at[26]);    MULADD(at[2], at[25]);    MULADD(at[3], at[24]);    MULADD(at[4], at[23]);    MULADD(at[5], at[22]);    MULADD(at[6], at[21]);    MULADD(at[7], at[20]);    MULADD(at[8], at[19]);    MULADD(at[9], at[18]);    MULADD(at[10], at[17]);    MULADD(at[11], at[16]); 
   COMBA_STORE(C->dp[11]);
   /* 12 */
   COMBA_FORWARD;
   MULADD(at[0], at[28]);    MULADD(at[1], at[27]);    MULADD(at[2], at[26]);    MULADD(at[3], at[25]);    MULADD(at[4], at[24]);    MULADD(at[5], at[23]);    MULADD(at[6], at[22]);    MULADD(at[7], at[21]);    MULADD(at[8], at[20]);    MULADD(at[9], at[19]);    MULADD(at[10], at[18]);    MULADD(at[11], at[17]);    MULADD(at[12], at[16]); 
   COMBA_STORE(C->dp[12]);
   /* 13 */
   COMBA_FORWARD;
   MULADD(at[0], at[29]);    MULADD(at[1], at[28]);    MULADD(at[2], at[27]);    MULADD(at[3], at[26]);    MULADD(at[4], at[25]);    MULADD(at[5], at[24]);    MULADD(at[6], at[23]);    MULADD(at[7], at[22]);    MULADD(at[8], at[21]);    MULADD(at[9], at[20]);    MULADD(at[10], at[19]);    MULADD(at[11], at[18]);    MULADD(at[12], at[17]);    MULADD(at[13], at[16]); 
   COMBA_STORE(C->dp[13]);
   /* 14 */
   COMBA_FORWARD;
   MULADD(at[0], at[30]);    MULADD(at[1], at[29]);    MULADD(at[2], at[28]);    MULADD(at[3], at[27]);    MULADD(at[4], at[26]);    MULADD(at[5], at[25]);    MULADD(at[6], at[24]);    MULADD(at[7], at[23]);    MULADD(at[8], at[22]);    MULADD(at[9], at[21]);    MULADD(at[10], at[20]);    MULADD(at[11], at[19]);    MULADD(at[12], at[18]);    MULADD(at[13], at[17]);    MULADD(at[14], at[16]); 
   COMBA_STORE(C->dp[14]);
   /* 15 */
   COMBA_FORWARD;
   MULADD(at[0], at[31]);    MULADD(at[1], at[30]);    MULADD(at[2], at[29]);    MULADD(at[3], at[28]);    MULADD(at[4], at[27]);    MULADD(at[5], at[26]);    MULADD(at[6], at[25]);    MULADD(at[7], at[24]);    MULADD(at[8], at[23]);    MULADD(at[9], at[22]);    MULADD(at[10], at[21]);    MULADD(at[11], at[20]);    MULADD(at[12], at[19]);    MULADD(at[13], at[18]);    MULADD(at[14], at[17]);    MULADD(at[15], at[16]); 
   COMBA_STORE(C->dp[15]);
   /* 16 */
   COMBA_FORWARD;
   MULADD(at[1], at[31]);    MULADD(at[2], at[30]);    MULADD(at[3], at[29]);    MULADD(at[4], at[28]);    MULADD(at[5], at[27]);    MULADD(at[6], at[26]);    MULADD(at[7], at[25]);    MULADD(at[8], at[24]);    MULADD(at[9], at[23]);    MULADD(at[10], at[22]);    MULADD(at[11], at[21]);    MULADD(at[12], at[20]);    MULADD(at[13], at[19]);    MULADD(at[14], at[18]);    MULADD(at[15], at[17]); 
   COMBA_STORE(C->dp[16]);
   /* 17 */
   COMBA_FORWARD;
   MULADD(at[2], at[31]);    MULADD(at[3], at[30]);    MULADD(at[4], at[29]);    MULADD(at[5], at[28]);    MULADD(at[6], at[27]);    MULADD(at[7], at[26]);    MULADD(at[8], at[25]);    MULADD(at[9], at[24]);    MULADD(at[10], at[23]);    MULADD(at[11], at[22]);    MULADD(at[12], at[21]);    MULADD(at[13], at[20]);    MULADD(at[14], at[19]);    MULADD(at[15], at[18]); 
   COMBA_STORE(C->dp[17]);
   /* 18 */
   COMBA_FORWARD;
   MULADD(at[3], at[31]);    MULADD(at[4], at[30]);    MULADD(at[5], at[29]);    MULADD(at[6], at[28]);    MULADD(at[7], at[27]);    MULADD(at[8], at[26]);    MULADD(at[9], at[25]);    MULADD(at[10], at[24]);    MULADD(at[11], at[23]);    MULADD(at[12], at[22]);    MULADD(at[13], at[21]);    MULADD(at[14], at[20]);    MULADD(at[15], at[19]); 
   COMBA_STORE(C->dp[18]);
   /* 19 */
   COMBA_FORWARD;
   MULADD(at[4], at[31]);    MULADD(at[5], at[30]);    MULADD(at[6], at[29]);    MULADD(at[7], at[28]);    MULADD(at[8], at[27]);    MULADD(at[9], at[26]);    MULADD(at[10], at[25]);    MULADD(at[11], at[24]);    MULADD(at[12], at[23]);    MULADD(at[13], at[22]);    MULADD(at[14], at[21]);    MULADD(at[15], at[20]); 
   COMBA_STORE(C->dp[19]);
   /* 20 */
   COMBA_FORWARD;
   MULADD(at[5], at[31]);    MULADD(at[6], at[30]);    MULADD(at[7], at[29]);    MULADD(at[8], at[28]);    MULADD(at[9], at[27]);    MULADD(at[10], at[26]);    MULADD(at[11], at[25]);    MULADD(at[12], at[24]);    MULADD(at[13], at[23]);    MULADD(at[14], at[22]);    MULADD(at[15], at[21]); 
   COMBA_STORE(C->dp[20]);
   /* 21 */
   COMBA_FORWARD;
   MULADD(at[6], at[31]);    MULADD(at[7], at[30]);    MULADD(at[8], at[29]);    MULADD(at[9], at[28]);    MULADD(at[10], at[27]);    MULADD(at[11], at[26]);    MULADD(at[12], at[25]);    MULADD(at[13], at[24]);    MULADD(at[14], at[23]);    MULADD(at[15], at[22]); 
   COMBA_STORE(C->dp[21]);
   /* 22 */
   COMBA_FORWARD;
   MULADD(at[7], at[31]);    MULADD(at[8], at[30]);    MULADD(at[9], at[29]);    MULADD(at[10], at[28]);    MULADD(at[11], at[27]);    MULADD(at[12], at[26]);    MULADD(at[13], at[25]);    MULADD(at[14], at[24]);    MULADD(at[15], at[23]); 
   COMBA_STORE(C->dp[22]);
   /* 23 */
   COMBA_FORWARD;
   MULADD(at[8], at[31]);    MULADD(at[9], at[30]);    MULADD(at[10], at[29]);    MULADD(at[11], at[28]);    MULADD(at[12], at[27]);    MULADD(at[13], at[26]);    MULADD(at[14], at[25]);    MULADD(at[15], at[24]); 
   COMBA_STORE(C->dp[23]);
   /* 24 */
   COMBA_FORWARD;
   MULADD(at[9], at[31]);    MULADD(at[10], at[30]);    MULADD(at[11], at[29]);    MULADD(at[12], at[28]);    MULADD(at[13], at[27]);    MULADD(at[14], at[26]);    MULADD(at[15], at[25]); 
   COMBA_STORE(C->dp[24]);
   /* 25 */
   COMBA_FORWARD;
   MULADD(at[10], at[31]);    MULADD(at[11], at[30]);    MULADD(at[12], at[29]);    MULADD(at[13], at[28]);    MULADD(at[14], at[27]);    MULADD(at[15], at[26]); 
   COMBA_STORE(C->dp[25]);
   /* 26 */
   COMBA_FORWARD;
   MULADD(at[11], at[31]);    MULADD(at[12], at[30]);    MULADD(at[13], at[29]);    MULADD(at[14], at[28]);    MULADD(at[15], at[27]); 
   COMBA_STORE(C->dp[26]);
   /* 27 */
   COMBA_FORWARD;
   MULADD(at[12], at[31]);    MULADD(at[13], at[30]);    MULADD(at[14], at[29]);    MULADD(at[15], at[28]); 
   COMBA_STORE(C->dp[27]);
   /* 28 */
   COMBA_FORWARD;
   MULADD(at[13], at[31]);    MULADD(at[14], at[30]);    MULADD(at[15], at[29]); 
   COMBA_STORE(C->dp[28]);
   /* 29 */
   COMBA_FORWARD;
   MULADD(at[14], at[31]);    MULADD(at[15], at[30]); 
   COMBA_STORE(C->dp[29]);
   /* 30 */
   COMBA_FORWARD;
   MULADD(at[15], at[31]); 
   COMBA_STORE(C->dp[30]);
   /* final carry becomes the top digit */
   COMBA_STORE2(C->dp[31]);
   C->used = 32;
   C->sign = A->sign ^ B->sign;
   fp_clamp(C);
   COMBA_FINI;
 }
 #ifdef TFM_HUGE
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_word t;
   fp_digit c0, c1, c2, at[64];
   memcpy(at, A->dp, 32 * sizeof(fp_digit));
   memcpy(at+32, B->dp, 32 * sizeof(fp_digit));
   COMBA_START;
   COMBA_CLEAR;
   /* 0 */
   MULADD(at[0], at[32]); 
   COMBA_STORE(C->dp[0]);
   /* 1 */
   COMBA_FORWARD;
   MULADD(at[0], at[33]);    MULADD(at[1], at[32]); 
   COMBA_STORE(C->dp[1]);
   /* 2 */
   COMBA_FORWARD;
   MULADD(at[0], at[34]);    MULADD(at[1], at[33]);    MULADD(at[2], at[32]); 
   COMBA_STORE(C->dp[2]);
   /* 3 */
   COMBA_FORWARD;
   MULADD(at[0], at[35]);    MULADD(at[1], at[34]);    MULADD(at[2], at[33]);    MULADD(at[3], at[32]); 
   COMBA_STORE(C->dp[3]);
   /* 4 */
   COMBA_FORWARD;
   MULADD(at[0], at[36]);    MULADD(at[1], at[35]);    MULADD(at[2], at[34]);    MULADD(at[3], at[33]);    MULADD(at[4], at[32]); 
   COMBA_STORE(C->dp[4]);
   /* 5 */
   COMBA_FORWARD;
   MULADD(at[0], at[37]);    MULADD(at[1], at[36]);    MULADD(at[2], at[35]);    MULADD(at[3], at[34]);    MULADD(at[4], at[33]);    MULADD(at[5], at[32]); 
   COMBA_STORE(C->dp[5]);
   /* 6 */
   COMBA_FORWARD;
   MULADD(at[0], at[38]);    MULADD(at[1], at[37]);    MULADD(at[2], at[36]);    MULADD(at[3], at[35]);    MULADD(at[4], at[34]);    MULADD(at[5], at[33]);    MULADD(at[6], at[32]); 
   COMBA_STORE(C->dp[6]);
   /* 7 */
   COMBA_FORWARD;
   MULADD(at[0], at[39]);    MULADD(at[1], at[38]);    MULADD(at[2], at[37]);    MULADD(at[3], at[36]);    MULADD(at[4], at[35]);    MULADD(at[5], at[34]);    MULADD(at[6], at[33]);    MULADD(at[7], at[32]); 
   COMBA_STORE(C->dp[7]);
   /* 8 */
   COMBA_FORWARD;
   MULADD(at[0], at[40]);    MULADD(at[1], at[39]);    MULADD(at[2], at[38]);    MULADD(at[3], at[37]);    MULADD(at[4], at[36]);    MULADD(at[5], at[35]);    MULADD(at[6], at[34]);    MULADD(at[7], at[33]);    MULADD(at[8], at[32]); 
   COMBA_STORE(C->dp[8]);
   /* 9 */
   COMBA_FORWARD;
   MULADD(at[0], at[41]);    MULADD(at[1], at[40]);    MULADD(at[2], at[39]);    MULADD(at[3], at[38]);    MULADD(at[4], at[37]);    MULADD(at[5], at[36]);    MULADD(at[6], at[35]);    MULADD(at[7], at[34]);    MULADD(at[8], at[33]);    MULADD(at[9], at[32]); 
   COMBA_STORE(C->dp[9]);
   /* 10 */
   COMBA_FORWARD;
   MULADD(at[0], at[42]);    MULADD(at[1], at[41]);    MULADD(at[2], at[40]);    MULADD(at[3], at[39]);    MULADD(at[4], at[38]);    MULADD(at[5], at[37]);    MULADD(at[6], at[36]);    MULADD(at[7], at[35]);    MULADD(at[8], at[34]);    MULADD(at[9], at[33]);    MULADD(at[10], at[32]); 
   COMBA_STORE(C->dp[10]);
   /* 11 */
   COMBA_FORWARD;
   MULADD(at[0], at[43]);    MULADD(at[1], at[42]);    MULADD(at[2], at[41]);    MULADD(at[3], at[40]);    MULADD(at[4], at[39]);    MULADD(at[5], at[38]);    MULADD(at[6], at[37]);    MULADD(at[7], at[36]);    MULADD(at[8], at[35]);    MULADD(at[9], at[34]);    MULADD(at[10], at[33]);    MULADD(at[11], at[32]); 
   COMBA_STORE(C->dp[11]);
   /* 12 */
   COMBA_FORWARD;
   MULADD(at[0], at[44]);    MULADD(at[1], at[43]);    MULADD(at[2], at[42]);    MULADD(at[3], at[41]);    MULADD(at[4], at[40]);    MULADD(at[5], at[39]);    MULADD(at[6], at[38]);    MULADD(at[7], at[37]);    MULADD(at[8], at[36]);    MULADD(at[9], at[35]);    MULADD(at[10], at[34]);    MULADD(at[11], at[33]);    MULADD(at[12], at[32]); 
   COMBA_STORE(C->dp[12]);
   /* 13 */
   COMBA_FORWARD;
   MULADD(at[0], at[45]);    MULADD(at[1], at[44]);    MULADD(at[2], at[43]);    MULADD(at[3], at[42]);    MULADD(at[4], at[41]);    MULADD(at[5], at[40]);    MULADD(at[6], at[39]);    MULADD(at[7], at[38]);    MULADD(at[8], at[37]);    MULADD(at[9], at[36]);    MULADD(at[10], at[35]);    MULADD(at[11], at[34]);    MULADD(at[12], at[33]);    MULADD(at[13], at[32]); 
   COMBA_STORE(C->dp[13]);
   /* 14 */
   COMBA_FORWARD;
   MULADD(at[0], at[46]);    MULADD(at[1], at[45]);    MULADD(at[2], at[44]);    MULADD(at[3], at[43]);    MULADD(at[4], at[42]);    MULADD(at[5], at[41]);    MULADD(at[6], at[40]);    MULADD(at[7], at[39]);    MULADD(at[8], at[38]);    MULADD(at[9], at[37]);    MULADD(at[10], at[36]);    MULADD(at[11], at[35]);    MULADD(at[12], at[34]);    MULADD(at[13], at[33]);    MULADD(at[14], at[32]); 
   COMBA_STORE(C->dp[14]);
   /* 15 */
   COMBA_FORWARD;
   MULADD(at[0], at[47]);    MULADD(at[1], at[46]);    MULADD(at[2], at[45]);    MULADD(at[3], at[44]);    MULADD(at[4], at[43]);    MULADD(at[5], at[42]);    MULADD(at[6], at[41]);    MULADD(at[7], at[40]);    MULADD(at[8], at[39]);    MULADD(at[9], at[38]);    MULADD(at[10], at[37]);    MULADD(at[11], at[36]);    MULADD(at[12], at[35]);    MULADD(at[13], at[34]);    MULADD(at[14], at[33]);    MULADD(at[15], at[32]); 
   COMBA_STORE(C->dp[15]);
   /* 16 */
   COMBA_FORWARD;
   MULADD(at[0], at[48]);    MULADD(at[1], at[47]);    MULADD(at[2], at[46]);    MULADD(at[3], at[45]);    MULADD(at[4], at[44]);    MULADD(at[5], at[43]);    MULADD(at[6], at[42]);    MULADD(at[7], at[41]);    MULADD(at[8], at[40]);    MULADD(at[9], at[39]);    MULADD(at[10], at[38]);    MULADD(at[11], at[37]);    MULADD(at[12], at[36]);    MULADD(at[13], at[35]);    MULADD(at[14], at[34]);    MULADD(at[15], at[33]);    MULADD(at[16], at[32]); 
   COMBA_STORE(C->dp[16]);
   /* 17 */
   COMBA_FORWARD;
   MULADD(at[0], at[49]);    MULADD(at[1], at[48]);    MULADD(at[2], at[47]);    MULADD(at[3], at[46]);    MULADD(at[4], at[45]);    MULADD(at[5], at[44]);    MULADD(at[6], at[43]);    MULADD(at[7], at[42]);    MULADD(at[8], at[41]);    MULADD(at[9], at[40]);    MULADD(at[10], at[39]);    MULADD(at[11], at[38]);    MULADD(at[12], at[37]);    MULADD(at[13], at[36]);    MULADD(at[14], at[35]);    MULADD(at[15], at[34]);    MULADD(at[16], at[33]);    MULADD(at[17], at[32]); 
   COMBA_STORE(C->dp[17]);
   /* 18 */
   COMBA_FORWARD;
   MULADD(at[0], at[50]);    MULADD(at[1], at[49]);    MULADD(at[2], at[48]);    MULADD(at[3], at[47]);    MULADD(at[4], at[46]);    MULADD(at[5], at[45]);    MULADD(at[6], at[44]);    MULADD(at[7], at[43]);    MULADD(at[8], at[42]);    MULADD(at[9], at[41]);    MULADD(at[10], at[40]);    MULADD(at[11], at[39]);    MULADD(at[12], at[38]);    MULADD(at[13], at[37]);    MULADD(at[14], at[36]);    MULADD(at[15], at[35]);    MULADD(at[16], at[34]);    MULADD(at[17], at[33]);    MULADD(at[18], at[32]); 
   COMBA_STORE(C->dp[18]);
   /* 19 */
   COMBA_FORWARD;
   MULADD(at[0], at[51]);    MULADD(at[1], at[50]);    MULADD(at[2], at[49]);    MULADD(at[3], at[48]);    MULADD(at[4], at[47]);    MULADD(at[5], at[46]);    MULADD(at[6], at[45]);    MULADD(at[7], at[44]);    MULADD(at[8], at[43]);    MULADD(at[9], at[42]);    MULADD(at[10], at[41]);    MULADD(at[11], at[40]);    MULADD(at[12], at[39]);    MULADD(at[13], at[38]);    MULADD(at[14], at[37]);    MULADD(at[15], at[36]);    MULADD(at[16], at[35]);    MULADD(at[17], at[34]);    MULADD(at[18], at[33]);    MULADD(at[19], at[32]); 
   COMBA_STORE(C->dp[19]);
   /* 20 */
   COMBA_FORWARD;
   MULADD(at[0], at[52]);    MULADD(at[1], at[51]);    MULADD(at[2], at[50]);    MULADD(at[3], at[49]);    MULADD(at[4], at[48]);    MULADD(at[5], at[47]);    MULADD(at[6], at[46]);    MULADD(at[7], at[45]);    MULADD(at[8], at[44]);    MULADD(at[9], at[43]);    MULADD(at[10], at[42]);    MULADD(at[11], at[41]);    MULADD(at[12], at[40]);    MULADD(at[13], at[39]);    MULADD(at[14], at[38]);    MULADD(at[15], at[37]);    MULADD(at[16], at[36]);    MULADD(at[17], at[35]);    MULADD(at[18], at[34]);    MULADD(at[19], at[33]);    MULADD(at[20], at[32]); 
   COMBA_STORE(C->dp[20]);
   /* 21 */
   COMBA_FORWARD;
   MULADD(at[0], at[53]);    MULADD(at[1], at[52]);    MULADD(at[2], at[51]);    MULADD(at[3], at[50]);    MULADD(at[4], at[49]);    MULADD(at[5], at[48]);    MULADD(at[6], at[47]);    MULADD(at[7], at[46]);    MULADD(at[8], at[45]);    MULADD(at[9], at[44]);    MULADD(at[10], at[43]);    MULADD(at[11], at[42]);    MULADD(at[12], at[41]);    MULADD(at[13], at[40]);    MULADD(at[14], at[39]);    MULADD(at[15], at[38]);    MULADD(at[16], at[37]);    MULADD(at[17], at[36]);    MULADD(at[18], at[35]);    MULADD(at[19], at[34]);    MULADD(at[20], at[33]);    MULADD(at[21], at[32]); 
   COMBA_STORE(C->dp[21]);
   /* 22 */
   COMBA_FORWARD;
   MULADD(at[0], at[54]);    MULADD(at[1], at[53]);    MULADD(at[2], at[52]);    MULADD(at[3], at[51]);    MULADD(at[4], at[50]);    MULADD(at[5], at[49]);    MULADD(at[6], at[48]);    MULADD(at[7], at[47]);    MULADD(at[8], at[46]);    MULADD(at[9], at[45]);    MULADD(at[10], at[44]);    MULADD(at[11], at[43]);    MULADD(at[12], at[42]);    MULADD(at[13], at[41]);    MULADD(at[14], at[40]);    MULADD(at[15], at[39]);    MULADD(at[16], at[38]);    MULADD(at[17], at[37]);    MULADD(at[18], at[36]);    MULADD(at[19], at[35]);    MULADD(at[20], at[34]);    MULADD(at[21], at[33]);    MULADD(at[22], at[32]); 
   COMBA_STORE(C->dp[22]);
   /* 23 */
   COMBA_FORWARD;
   MULADD(at[0], at[55]);    MULADD(at[1], at[54]);    MULADD(at[2], at[53]);    MULADD(at[3], at[52]);    MULADD(at[4], at[51]);    MULADD(at[5], at[50]);    MULADD(at[6], at[49]);    MULADD(at[7], at[48]);    MULADD(at[8], at[47]);    MULADD(at[9], at[46]);    MULADD(at[10], at[45]);    MULADD(at[11], at[44]);    MULADD(at[12], at[43]);    MULADD(at[13], at[42]);    MULADD(at[14], at[41]);    MULADD(at[15], at[40]);    MULADD(at[16], at[39]);    MULADD(at[17], at[38]);    MULADD(at[18], at[37]);    MULADD(at[19], at[36]);    MULADD(at[20], at[35]);    MULADD(at[21], at[34]);    MULADD(at[22], at[33]);    MULADD(at[23], at[32]); 
   COMBA_STORE(C->dp[23]);
   /* 24 */
   COMBA_FORWARD;
   MULADD(at[0], at[56]);    MULADD(at[1], at[55]);    MULADD(at[2], at[54]);    MULADD(at[3], at[53]);    MULADD(at[4], at[52]);    MULADD(at[5], at[51]);    MULADD(at[6], at[50]);    MULADD(at[7], at[49]);    MULADD(at[8], at[48]);    MULADD(at[9], at[47]);    MULADD(at[10], at[46]);    MULADD(at[11], at[45]);    MULADD(at[12], at[44]);    MULADD(at[13], at[43]);    MULADD(at[14], at[42]);    MULADD(at[15], at[41]);    MULADD(at[16], at[40]);    MULADD(at[17], at[39]);    MULADD(at[18], at[38]);    MULADD(at[19], at[37]);    MULADD(at[20], at[36]);    MULADD(at[21], at[35]);    MULADD(at[22], at[34]);    MULADD(at[23], at[33]);    MULADD(at[24], at[32]); 
   COMBA_STORE(C->dp[24]);
   /* 25 */
   COMBA_FORWARD;
   MULADD(at[0], at[57]);    MULADD(at[1], at[56]);    MULADD(at[2], at[55]);    MULADD(at[3], at[54]);    MULADD(at[4], at[53]);    MULADD(at[5], at[52]);    MULADD(at[6], at[51]);    MULADD(at[7], at[50]);    MULADD(at[8], at[49]);    MULADD(at[9], at[48]);    MULADD(at[10], at[47]);    MULADD(at[11], at[46]);    MULADD(at[12], at[45]);    MULADD(at[13], at[44]);    MULADD(at[14], at[43]);    MULADD(at[15], at[42]);    MULADD(at[16], at[41]);    MULADD(at[17], at[40]);    MULADD(at[18], at[39]);    MULADD(at[19], at[38]);    MULADD(at[20], at[37]);    MULADD(at[21], at[36]);    MULADD(at[22], at[35]);    MULADD(at[23], at[34]);    MULADD(at[24], at[33]);    MULADD(at[25], at[32]); 
   COMBA_STORE(C->dp[25]);
   /* 26 */
   COMBA_FORWARD;
   MULADD(at[0], at[58]);    MULADD(at[1], at[57]);    MULADD(at[2], at[56]);    MULADD(at[3], at[55]);    MULADD(at[4], at[54]);    MULADD(at[5], at[53]);    MULADD(at[6], at[52]);    MULADD(at[7], at[51]);    MULADD(at[8], at[50]);    MULADD(at[9], at[49]);    MULADD(at[10], at[48]);    MULADD(at[11], at[47]);    MULADD(at[12], at[46]);    MULADD(at[13], at[45]);    MULADD(at[14], at[44]);    MULADD(at[15], at[43]);    MULADD(at[16], at[42]);    MULADD(at[17], at[41]);    MULADD(at[18], at[40]);    MULADD(at[19], at[39]);    MULADD(at[20], at[38]);    MULADD(at[21], at[37]);    MULADD(at[22], at[36]);    MULADD(at[23], at[35]);    MULADD(at[24], at[34]);    MULADD(at[25], at[33]);    MULADD(at[26], at[32]); 
   COMBA_STORE(C->dp[26]);
   /* 27 */
   COMBA_FORWARD;
   MULADD(at[0], at[59]);    MULADD(at[1], at[58]);    MULADD(at[2], at[57]);    MULADD(at[3], at[56]);    MULADD(at[4], at[55]);    MULADD(at[5], at[54]);    MULADD(at[6], at[53]);    MULADD(at[7], at[52]);    MULADD(at[8], at[51]);    MULADD(at[9], at[50]);    MULADD(at[10], at[49]);    MULADD(at[11], at[48]);    MULADD(at[12], at[47]);    MULADD(at[13], at[46]);    MULADD(at[14], at[45]);    MULADD(at[15], at[44]);    MULADD(at[16], at[43]);    MULADD(at[17], at[42]);    MULADD(at[18], at[41]);    MULADD(at[19], at[40]);    MULADD(at[20], at[39]);    MULADD(at[21], at[38]);    MULADD(at[22], at[37]);    MULADD(at[23], at[36]);    MULADD(at[24], at[35]);    MULADD(at[25], at[34]);    MULADD(at[26], at[33]);    MULADD(at[27], at[32]); 
   COMBA_STORE(C->dp[27]);
   /* 28 */
   COMBA_FORWARD;
   MULADD(at[0], at[60]);    MULADD(at[1], at[59]);    MULADD(at[2], at[58]);    MULADD(at[3], at[57]);    MULADD(at[4], at[56]);    MULADD(at[5], at[55]);    MULADD(at[6], at[54]);    MULADD(at[7], at[53]);    MULADD(at[8], at[52]);    MULADD(at[9], at[51]);    MULADD(at[10], at[50]);    MULADD(at[11], at[49]);    MULADD(at[12], at[48]);    MULADD(at[13], at[47]);    MULADD(at[14], at[46]);    MULADD(at[15], at[45]);    MULADD(at[16], at[44]);    MULADD(at[17], at[43]);    MULADD(at[18], at[42]);    MULADD(at[19], at[41]);    MULADD(at[20], at[40]);    MULADD(at[21], at[39]);    MULADD(at[22], at[38]);    MULADD(at[23], at[37]);    MULADD(at[24], at[36]);    MULADD(at[25], at[35]);    MULADD(at[26], at[34]);    MULADD(at[27], at[33]);    MULADD(at[28], at[32]); 
   COMBA_STORE(C->dp[28]);
   /* 29 */
   COMBA_FORWARD;
   MULADD(at[0], at[61]);    MULADD(at[1], at[60]);    MULADD(at[2], at[59]);    MULADD(at[3], at[58]);    MULADD(at[4], at[57]);    MULADD(at[5], at[56]);    MULADD(at[6], at[55]);    MULADD(at[7], at[54]);    MULADD(at[8], at[53]);    MULADD(at[9], at[52]);    MULADD(at[10], at[51]);    MULADD(at[11], at[50]);    MULADD(at[12], at[49]);    MULADD(at[13], at[48]);    MULADD(at[14], at[47]);    MULADD(at[15], at[46]);    MULADD(at[16], at[45]);    MULADD(at[17], at[44]);    MULADD(at[18], at[43]);    MULADD(at[19], at[42]);    MULADD(at[20], at[41]);    MULADD(at[21], at[40]);    MULADD(at[22], at[39]);    MULADD(at[23], at[38]);    MULADD(at[24], at[37]);    MULADD(at[25], at[36]);    MULADD(at[26], at[35]);    MULADD(at[27], at[34]);    MULADD(at[28], at[33]);    MULADD(at[29], at[32]); 
   COMBA_STORE(C->dp[29]);
   /* 30 */
   COMBA_FORWARD;
   MULADD(at[0], at[62]);    MULADD(at[1], at[61]);    MULADD(at[2], at[60]);    MULADD(at[3], at[59]);    MULADD(at[4], at[58]);    MULADD(at[5], at[57]);    MULADD(at[6], at[56]);    MULADD(at[7], at[55]);    MULADD(at[8], at[54]);    MULADD(at[9], at[53]);    MULADD(at[10], at[52]);    MULADD(at[11], at[51]);    MULADD(at[12], at[50]);    MULADD(at[13], at[49]);    MULADD(at[14], at[48]);    MULADD(at[15], at[47]);    MULADD(at[16], at[46]);    MULADD(at[17], at[45]);    MULADD(at[18], at[44]);    MULADD(at[19], at[43]);    MULADD(at[20], at[42]);    MULADD(at[21], at[41]);    MULADD(at[22], at[40]);    MULADD(at[23], at[39]);    MULADD(at[24], at[38]);    MULADD(at[25], at[37]);    MULADD(at[26], at[36]);    MULADD(at[27], at[35]);    MULADD(at[28], at[34]);    MULADD(at[29], at[33]);    MULADD(at[30], at[32]); 
   COMBA_STORE(C->dp[30]);
   /* 31 */
   COMBA_FORWARD;
   MULADD(at[0], at[63]);    MULADD(at[1], at[62]);    MULADD(at[2], at[61]);    MULADD(at[3], at[60]);    MULADD(at[4], at[59]);    MULADD(at[5], at[58]);    MULADD(at[6], at[57]);    MULADD(at[7], at[56]);    MULADD(at[8], at[55]);    MULADD(at[9], at[54]);    MULADD(at[10], at[53]);    MULADD(at[11], at[52]);    MULADD(at[12], at[51]);    MULADD(at[13], at[50]);    MULADD(at[14], at[49]);    MULADD(at[15], at[48]);    MULADD(at[16], at[47]);    MULADD(at[17], at[46]);    MULADD(at[18], at[45]);    MULADD(at[19], at[44]);    MULADD(at[20], at[43]);    MULADD(at[21], at[42]);    MULADD(at[22], at[41]);    MULADD(at[23], at[40]);    MULADD(at[24], at[39]);    MULADD(at[25], at[38]);    MULADD(at[26], at[37]);    MULADD(at[27], at[36]);    MULADD(at[28], at[35]);    MULADD(at[29], at[34]);    MULADD(at[30], at[33]);    MULADD(at[31], at[32]); 
   COMBA_STORE(C->dp[31]);
   /* 32 */
   COMBA_FORWARD;
   MULADD(at[1], at[63]);    MULADD(at[2], at[62]);    MULADD(at[3], at[61]);    MULADD(at[4], at[60]);    MULADD(at[5], at[59]);    MULADD(at[6], at[58]);    MULADD(at[7], at[57]);    MULADD(at[8], at[56]);    MULADD(at[9], at[55]);    MULADD(at[10], at[54]);    MULADD(at[11], at[53]);    MULADD(at[12], at[52]);    MULADD(at[13], at[51]);    MULADD(at[14], at[50]);    MULADD(at[15], at[49]);    MULADD(at[16], at[48]);    MULADD(at[17], at[47]);    MULADD(at[18], at[46]);    MULADD(at[19], at[45]);    MULADD(at[20], at[44]);    MULADD(at[21], at[43]);    MULADD(at[22], at[42]);    MULADD(at[23], at[41]);    MULADD(at[24], at[40]);    MULADD(at[25], at[39]);    MULADD(at[26], at[38]);    MULADD(at[27], at[37]);    MULADD(at[28], at[36]);    MULADD(at[29], at[35]);    MULADD(at[30], at[34]);    MULADD(at[31], at[33]); 
   COMBA_STORE(C->dp[32]);
   /* 33 */
   COMBA_FORWARD;
   MULADD(at[2], at[63]);    MULADD(at[3], at[62]);    MULADD(at[4], at[61]);    MULADD(at[5], at[60]);    MULADD(at[6], at[59]);    MULADD(at[7], at[58]);    MULADD(at[8], at[57]);    MULADD(at[9], at[56]);    MULADD(at[10], at[55]);    MULADD(at[11], at[54]);    MULADD(at[12], at[53]);    MULADD(at[13], at[52]);    MULADD(at[14], at[51]);    MULADD(at[15], at[50]);    MULADD(at[16], at[49]);    MULADD(at[17], at[48]);    MULADD(at[18], at[47]);    MULADD(at[19], at[46]);    MULADD(at[20], at[45]);    MULADD(at[21], at[44]);    MULADD(at[22], at[43]);    MULADD(at[23], at[42]);    MULADD(at[24], at[41]);    MULADD(at[25], at[40]);    MULADD(at[26], at[39]);    MULADD(at[27], at[38]);    MULADD(at[28], at[37]);    MULADD(at[29], at[36]);    MULADD(at[30], at[35]);    MULADD(at[31], at[34]); 
   COMBA_STORE(C->dp[33]);
   /* 34 */
   COMBA_FORWARD;
   MULADD(at[3], at[63]);    MULADD(at[4], at[62]);    MULADD(at[5], at[61]);    MULADD(at[6], at[60]);    MULADD(at[7], at[59]);    MULADD(at[8], at[58]);    MULADD(at[9], at[57]);    MULADD(at[10], at[56]);    MULADD(at[11], at[55]);    MULADD(at[12], at[54]);    MULADD(at[13], at[53]);    MULADD(at[14], at[52]);    MULADD(at[15], at[51]);    MULADD(at[16], at[50]);    MULADD(at[17], at[49]);    MULADD(at[18], at[48]);    MULADD(at[19], at[47]);    MULADD(at[20], at[46]);    MULADD(at[21], at[45]);    MULADD(at[22], at[44]);    MULADD(at[23], at[43]);    MULADD(at[24], at[42]);    MULADD(at[25], at[41]);    MULADD(at[26], at[40]);    MULADD(at[27], at[39]);    MULADD(at[28], at[38]);    MULADD(at[29], at[37]);    MULADD(at[30], at[36]);    MULADD(at[31], at[35]); 
   COMBA_STORE(C->dp[34]);
   /* 35 */
   COMBA_FORWARD;
   MULADD(at[4], at[63]);    MULADD(at[5], at[62]);    MULADD(at[6], at[61]);    MULADD(at[7], at[60]);    MULADD(at[8], at[59]);    MULADD(at[9], at[58]);    MULADD(at[10], at[57]);    MULADD(at[11], at[56]);    MULADD(at[12], at[55]);    MULADD(at[13], at[54]);    MULADD(at[14], at[53]);    MULADD(at[15], at[52]);    MULADD(at[16], at[51]);    MULADD(at[17], at[50]);    MULADD(at[18], at[49]);    MULADD(at[19], at[48]);    MULADD(at[20], at[47]);    MULADD(at[21], at[46]);    MULADD(at[22], at[45]);    MULADD(at[23], at[44]);    MULADD(at[24], at[43]);    MULADD(at[25], at[42]);    MULADD(at[26], at[41]);    MULADD(at[27], at[40]);    MULADD(at[28], at[39]);    MULADD(at[29], at[38]);    MULADD(at[30], at[37]);    MULADD(at[31], at[36]); 
   COMBA_STORE(C->dp[35]);
   /* 36 */
   COMBA_FORWARD;
   MULADD(at[5], at[63]);    MULADD(at[6], at[62]);    MULADD(at[7], at[61]);    MULADD(at[8], at[60]);    MULADD(at[9], at[59]);    MULADD(at[10], at[58]);    MULADD(at[11], at[57]);    MULADD(at[12], at[56]);    MULADD(at[13], at[55]);    MULADD(at[14], at[54]);    MULADD(at[15], at[53]);    MULADD(at[16], at[52]);    MULADD(at[17], at[51]);    MULADD(at[18], at[50]);    MULADD(at[19], at[49]);    MULADD(at[20], at[48]);    MULADD(at[21], at[47]);    MULADD(at[22], at[46]);    MULADD(at[23], at[45]);    MULADD(at[24], at[44]);    MULADD(at[25], at[43]);    MULADD(at[26], at[42]);    MULADD(at[27], at[41]);    MULADD(at[28], at[40]);    MULADD(at[29], at[39]);    MULADD(at[30], at[38]);    MULADD(at[31], at[37]); 
   COMBA_STORE(C->dp[36]);
   /* 37 */
   COMBA_FORWARD;
   MULADD(at[6], at[63]);    MULADD(at[7], at[62]);    MULADD(at[8], at[61]);    MULADD(at[9], at[60]);    MULADD(at[10], at[59]);    MULADD(at[11], at[58]);    MULADD(at[12], at[57]);    MULADD(at[13], at[56]);    MULADD(at[14], at[55]);    MULADD(at[15], at[54]);    MULADD(at[16], at[53]);    MULADD(at[17], at[52]);    MULADD(at[18], at[51]);    MULADD(at[19], at[50]);    MULADD(at[20], at[49]);    MULADD(at[21], at[48]);    MULADD(at[22], at[47]);    MULADD(at[23], at[46]);    MULADD(at[24], at[45]);    MULADD(at[25], at[44]);    MULADD(at[26], at[43]);    MULADD(at[27], at[42]);    MULADD(at[28], at[41]);    MULADD(at[29], at[40]);    MULADD(at[30], at[39]);    MULADD(at[31], at[38]); 
   COMBA_STORE(C->dp[37]);
   /* 38 */
   COMBA_FORWARD;
   MULADD(at[7], at[63]);    MULADD(at[8], at[62]);    MULADD(at[9], at[61]);    MULADD(at[10], at[60]);    MULADD(at[11], at[59]);    MULADD(at[12], at[58]);    MULADD(at[13], at[57]);    MULADD(at[14], at[56]);    MULADD(at[15], at[55]);    MULADD(at[16], at[54]);    MULADD(at[17], at[53]);    MULADD(at[18], at[52]);    MULADD(at[19], at[51]);    MULADD(at[20], at[50]);    MULADD(at[21], at[49]);    MULADD(at[22], at[48]);    MULADD(at[23], at[47]);    MULADD(at[24], at[46]);    MULADD(at[25], at[45]);    MULADD(at[26], at[44]);    MULADD(at[27], at[43]);    MULADD(at[28], at[42]);    MULADD(at[29], at[41]);    MULADD(at[30], at[40]);    MULADD(at[31], at[39]); 
   COMBA_STORE(C->dp[38]);
   /* 39 */
   COMBA_FORWARD;
   MULADD(at[8], at[63]);    MULADD(at[9], at[62]);    MULADD(at[10], at[61]);    MULADD(at[11], at[60]);    MULADD(at[12], at[59]);    MULADD(at[13], at[58]);    MULADD(at[14], at[57]);    MULADD(at[15], at[56]);    MULADD(at[16], at[55]);    MULADD(at[17], at[54]);    MULADD(at[18], at[53]);    MULADD(at[19], at[52]);    MULADD(at[20], at[51]);    MULADD(at[21], at[50]);    MULADD(at[22], at[49]);    MULADD(at[23], at[48]);    MULADD(at[24], at[47]);    MULADD(at[25], at[46]);    MULADD(at[26], at[45]);    MULADD(at[27], at[44]);    MULADD(at[28], at[43]);    MULADD(at[29], at[42]);    MULADD(at[30], at[41]);    MULADD(at[31], at[40]); 
   COMBA_STORE(C->dp[39]);
   /* 40 */
   COMBA_FORWARD;
   MULADD(at[9], at[63]);    MULADD(at[10], at[62]);    MULADD(at[11], at[61]);    MULADD(at[12], at[60]);    MULADD(at[13], at[59]);    MULADD(at[14], at[58]);    MULADD(at[15], at[57]);    MULADD(at[16], at[56]);    MULADD(at[17], at[55]);    MULADD(at[18], at[54]);    MULADD(at[19], at[53]);    MULADD(at[20], at[52]);    MULADD(at[21], at[51]);    MULADD(at[22], at[50]);    MULADD(at[23], at[49]);    MULADD(at[24], at[48]);    MULADD(at[25], at[47]);    MULADD(at[26], at[46]);    MULADD(at[27], at[45]);    MULADD(at[28], at[44]);    MULADD(at[29], at[43]);    MULADD(at[30], at[42]);    MULADD(at[31], at[41]); 
   COMBA_STORE(C->dp[40]);
   /* 41 */
   COMBA_FORWARD;
   MULADD(at[10], at[63]);    MULADD(at[11], at[62]);    MULADD(at[12], at[61]);    MULADD(at[13], at[60]);    MULADD(at[14], at[59]);    MULADD(at[15], at[58]);    MULADD(at[16], at[57]);    MULADD(at[17], at[56]);    MULADD(at[18], at[55]);    MULADD(at[19], at[54]);    MULADD(at[20], at[53]);    MULADD(at[21], at[52]);    MULADD(at[22], at[51]);    MULADD(at[23], at[50]);    MULADD(at[24], at[49]);    MULADD(at[25], at[48]);    MULADD(at[26], at[47]);    MULADD(at[27], at[46]);    MULADD(at[28], at[45]);    MULADD(at[29], at[44]);    MULADD(at[30], at[43]);    MULADD(at[31], at[42]); 
   COMBA_STORE(C->dp[41]);
   /* 42 */
   COMBA_FORWARD;
   MULADD(at[11], at[63]);    MULADD(at[12], at[62]);    MULADD(at[13], at[61]);    MULADD(at[14], at[60]);    MULADD(at[15], at[59]);    MULADD(at[16], at[58]);    MULADD(at[17], at[57]);    MULADD(at[18], at[56]);    MULADD(at[19], at[55]);    MULADD(at[20], at[54]);    MULADD(at[21], at[53]);    MULADD(at[22], at[52]);    MULADD(at[23], at[51]);    MULADD(at[24], at[50]);    MULADD(at[25], at[49]);    MULADD(at[26], at[48]);    MULADD(at[27], at[47]);    MULADD(at[28], at[46]);    MULADD(at[29], at[45]);    MULADD(at[30], at[44]);    MULADD(at[31], at[43]); 
   COMBA_STORE(C->dp[42]);
   /* 43 */
   COMBA_FORWARD;
   MULADD(at[12], at[63]);    MULADD(at[13], at[62]);    MULADD(at[14], at[61]);    MULADD(at[15], at[60]);    MULADD(at[16], at[59]);    MULADD(at[17], at[58]);    MULADD(at[18], at[57]);    MULADD(at[19], at[56]);    MULADD(at[20], at[55]);    MULADD(at[21], at[54]);    MULADD(at[22], at[53]);    MULADD(at[23], at[52]);    MULADD(at[24], at[51]);    MULADD(at[25], at[50]);    MULADD(at[26], at[49]);    MULADD(at[27], at[48]);    MULADD(at[28], at[47]);    MULADD(at[29], at[46]);    MULADD(at[30], at[45]);    MULADD(at[31], at[44]); 
   COMBA_STORE(C->dp[43]);
   /* 44 */
   COMBA_FORWARD;
   MULADD(at[13], at[63]);    MULADD(at[14], at[62]);    MULADD(at[15], at[61]);    MULADD(at[16], at[60]);    MULADD(at[17], at[59]);    MULADD(at[18], at[58]);    MULADD(at[19], at[57]);    MULADD(at[20], at[56]);    MULADD(at[21], at[55]);    MULADD(at[22], at[54]);    MULADD(at[23], at[53]);    MULADD(at[24], at[52]);    MULADD(at[25], at[51]);    MULADD(at[26], at[50]);    MULADD(at[27], at[49]);    MULADD(at[28], at[48]);    MULADD(at[29], at[47]);    MULADD(at[30], at[46]);    MULADD(at[31], at[45]); 
   COMBA_STORE(C->dp[44]);
   /* 45 */
   COMBA_FORWARD;
   MULADD(at[14], at[63]);    MULADD(at[15], at[62]);    MULADD(at[16], at[61]);    MULADD(at[17], at[60]);    MULADD(at[18], at[59]);    MULADD(at[19], at[58]);    MULADD(at[20], at[57]);    MULADD(at[21], at[56]);    MULADD(at[22], at[55]);    MULADD(at[23], at[54]);    MULADD(at[24], at[53]);    MULADD(at[25], at[52]);    MULADD(at[26], at[51]);    MULADD(at[27], at[50]);    MULADD(at[28], at[49]);    MULADD(at[29], at[48]);    MULADD(at[30], at[47]);    MULADD(at[31], at[46]); 
   COMBA_STORE(C->dp[45]);
   /* 46 */
   COMBA_FORWARD;
   MULADD(at[15], at[63]);    MULADD(at[16], at[62]);    MULADD(at[17], at[61]);    MULADD(at[18], at[60]);    MULADD(at[19], at[59]);    MULADD(at[20], at[58]);    MULADD(at[21], at[57]);    MULADD(at[22], at[56]);    MULADD(at[23], at[55]);    MULADD(at[24], at[54]);    MULADD(at[25], at[53]);    MULADD(at[26], at[52]);    MULADD(at[27], at[51]);    MULADD(at[28], at[50]);    MULADD(at[29], at[49]);    MULADD(at[30], at[48]);    MULADD(at[31], at[47]); 
   COMBA_STORE(C->dp[46]);
   /* 47 */
   COMBA_FORWARD;
   MULADD(at[16], at[63]);    MULADD(at[17], at[62]);    MULADD(at[18], at[61]);    MULADD(at[19], at[60]);    MULADD(at[20], at[59]);    MULADD(at[21], at[58]);    MULADD(at[22], at[57]);    MULADD(at[23], at[56]);    MULADD(at[24], at[55]);    MULADD(at[25], at[54]);    MULADD(at[26], at[53]);    MULADD(at[27], at[52]);    MULADD(at[28], at[51]);    MULADD(at[29], at[50]);    MULADD(at[30], at[49]);    MULADD(at[31], at[48]); 
   COMBA_STORE(C->dp[47]);
   /* 48 */
   COMBA_FORWARD;
   MULADD(at[17], at[63]);    MULADD(at[18], at[62]);    MULADD(at[19], at[61]);    MULADD(at[20], at[60]);    MULADD(at[21], at[59]);    MULADD(at[22], at[58]);    MULADD(at[23], at[57]);    MULADD(at[24], at[56]);    MULADD(at[25], at[55]);    MULADD(at[26], at[54]);    MULADD(at[27], at[53]);    MULADD(at[28], at[52]);    MULADD(at[29], at[51]);    MULADD(at[30], at[50]);    MULADD(at[31], at[49]); 
   COMBA_STORE(C->dp[48]);
   /* 49 */
   COMBA_FORWARD;
   MULADD(at[18], at[63]);    MULADD(at[19], at[62]);    MULADD(at[20], at[61]);    MULADD(at[21], at[60]);    MULADD(at[22], at[59]);    MULADD(at[23], at[58]);    MULADD(at[24], at[57]);    MULADD(at[25], at[56]);    MULADD(at[26], at[55]);    MULADD(at[27], at[54]);    MULADD(at[28], at[53]);    MULADD(at[29], at[52]);    MULADD(at[30], at[51]);    MULADD(at[31], at[50]); 
   COMBA_STORE(C->dp[49]);
   /* 50 */
   COMBA_FORWARD;
   MULADD(at[19], at[63]);    MULADD(at[20], at[62]);    MULADD(at[21], at[61]);    MULADD(at[22], at[60]);    MULADD(at[23], at[59]);    MULADD(at[24], at[58]);    MULADD(at[25], at[57]);    MULADD(at[26], at[56]);    MULADD(at[27], at[55]);    MULADD(at[28], at[54]);    MULADD(at[29], at[53]);    MULADD(at[30], at[52]);    MULADD(at[31], at[51]); 
   COMBA_STORE(C->dp[50]);
   /* 51 */
   COMBA_FORWARD;
   MULADD(at[20], at[63]);    MULADD(at[21], at[62]);    MULADD(at[22], at[61]);    MULADD(at[23], at[60]);    MULADD(at[24], at[59]);    MULADD(at[25], at[58]);    MULADD(at[26], at[57]);    MULADD(at[27], at[56]);    MULADD(at[28], at[55]);    MULADD(at[29], at[54]);    MULADD(at[30], at[53]);    MULADD(at[31], at[52]); 
   COMBA_STORE(C->dp[51]);
   /* 52 */
   COMBA_FORWARD;
   MULADD(at[21], at[63]);    MULADD(at[22], at[62]);    MULADD(at[23], at[61]);    MULADD(at[24], at[60]);    MULADD(at[25], at[59]);    MULADD(at[26], at[58]);    MULADD(at[27], at[57]);    MULADD(at[28], at[56]);    MULADD(at[29], at[55]);    MULADD(at[30], at[54]);    MULADD(at[31], at[53]); 
   COMBA_STORE(C->dp[52]);
   /* 53 */
   COMBA_FORWARD;
   MULADD(at[22], at[63]);    MULADD(at[23], at[62]);    MULADD(at[24], at[61]);    MULADD(at[25], at[60]);    MULADD(at[26], at[59]);    MULADD(at[27], at[58]);    MULADD(at[28], at[57]);    MULADD(at[29], at[56]);    MULADD(at[30], at[55]);    MULADD(at[31], at[54]); 
   COMBA_STORE(C->dp[53]);
   /* 54 */
   COMBA_FORWARD;
   MULADD(at[23], at[63]);    MULADD(at[24], at[62]);    MULADD(at[25], at[61]);    MULADD(at[26], at[60]);    MULADD(at[27], at[59]);    MULADD(at[28], at[58]);    MULADD(at[29], at[57]);    MULADD(at[30], at[56]);    MULADD(at[31], at[55]); 
   COMBA_STORE(C->dp[54]);
   /* 55 */
   COMBA_FORWARD;
   MULADD(at[24], at[63]);    MULADD(at[25], at[62]);    MULADD(at[26], at[61]);    MULADD(at[27], at[60]);    MULADD(at[28], at[59]);    MULADD(at[29], at[58]);    MULADD(at[30], at[57]);    MULADD(at[31], at[56]); 
   COMBA_STORE(C->dp[55]);
   /* 56 */
   COMBA_FORWARD;
   MULADD(at[25], at[63]);    MULADD(at[26], at[62]);    MULADD(at[27], at[61]);    MULADD(at[28], at[60]);    MULADD(at[29], at[59]);    MULADD(at[30], at[58]);    MULADD(at[31], at[57]); 
   COMBA_STORE(C->dp[56]);
   /* 57 */
   COMBA_FORWARD;
   MULADD(at[26], at[63]);    MULADD(at[27], at[62]);    MULADD(at[28], at[61]);    MULADD(at[29], at[60]);    MULADD(at[30], at[59]);    MULADD(at[31], at[58]); 
   COMBA_STORE(C->dp[57]);
   /* 58 */
   COMBA_FORWARD;
   MULADD(at[27], at[63]);    MULADD(at[28], at[62]);    MULADD(at[29], at[61]);    MULADD(at[30], at[60]);    MULADD(at[31], at[59]); 
   COMBA_STORE(C->dp[58]);
   /* 59 */
   COMBA_FORWARD;
   MULADD(at[28], at[63]);    MULADD(at[29], at[62]);    MULADD(at[30], at[61]);    MULADD(at[31], at[60]); 
   COMBA_STORE(C->dp[59]);
   /* 60 */
   COMBA_FORWARD;
   MULADD(at[29], at[63]);    MULADD(at[30], at[62]);    MULADD(at[31], at[61]); 
   COMBA_STORE(C->dp[60]);
   /* 61 */
   COMBA_FORWARD;
   MULADD(at[30], at[63]);    MULADD(at[31], at[62]); 
   COMBA_STORE(C->dp[61]);
   /* 62 */
   COMBA_FORWARD;
   MULADD(at[31], at[63]); 
   COMBA_STORE(C->dp[62]);
   COMBA_STORE2(C->dp[63]);
   C->used = 64;
   C->sign = A->sign ^ B->sign;
   fp_clamp(C);
   COMBA_FINI;
 }
 #endif
--- a/fp_mul_d.c
+++ b/fp_mul_d.c
@ -0,0 +1,36 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Multiply a multi-digit integer by a single digit: c = a * b.
  *
  * The sign of "a" is copied to "c".  Each source digit is read before the
  * matching destination digit is stored, so "c" may be the same object as "a".
  * When "a" already occupies FP_SIZE digits the final carry is dropped.
  */
 void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
 {
   fp_word  carry     = 0;
   int      prev_used = c->used;   /* digit count of c on entry, taken before it is overwritten */
   int      ix        = 0;

   c->sign = a->sign;
   c->used = a->used;

   /* schoolbook row: multiply every digit and propagate the carry upward */
   while (ix < a->used) {
       carry     += ((fp_word)a->dp[ix]) * ((fp_word)b);
       c->dp[ix]  = (fp_digit)carry;
       carry    >>= DIGIT_BIT;
       ++ix;
   }

   /* append the leftover carry as a new top digit if there is room for it */
   if ((a->used != FP_SIZE) && carry != 0) {
      c->dp[c->used] = carry;
      c->used       += 1;
      ix            += 1;
   }

   /* wipe digits that the previous, longer value of c left behind */
   while (ix < prev_used) {
      c->dp[ix] = 0;
      ++ix;
   }

   fp_clamp(c);
 }
--- a/fp_mulmod.c
+++ b/fp_mulmod.c
@ -0,0 +1,18 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Modular multiplication: d = (a * b) mod c.
  *
  * The full product is formed in a local temporary and then reduced;
  * whatever fp_mod() returns is handed back to the caller unchanged.
  */
 int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 {
  fp_int product;
  int    status;

  fp_zero(&product);
  fp_mul(a, b, &product);

  status = fp_mod(&product, c, d);
  return status;
 }
--- a/fp_prime_miller_rabin.c
+++ b/fp_prime_miller_rabin.c
@ -0,0 +1,73 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* One round of the Miller-Rabin test on "a" using the base "b"
  * (HAC pp. 139, Algorithm 4.24).
  *
  * *result is set to FP_NO when "a" is definitely composite (or when the
  * base is not greater than 1) and to FP_YES when "a" is probably prime.
  * For a random base the chance of error is no more than 1/4 and often
  * very much lower.
  */
 void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result)
 {
  fp_int  a_minus_1, odd_part, witness;
  int     twos, round;

  /* pessimistic default: every early exit below reports "composite" */
  *result = FP_NO;

  /* the base has to exceed 1 */
  if (fp_cmp_d(b, 1) != FP_GT) {
     return;
  }

  /* a_minus_1 = a - 1 */
  fp_init_copy(&a_minus_1, a);
  fp_sub_d(&a_minus_1, 1, &a_minus_1);

  /* split a - 1 into 2**twos * odd_part by stripping the trailing zero bits */
  fp_init_copy(&odd_part, &a_minus_1);
  twos = fp_cnt_lsb(&odd_part);
  fp_div_2d (&odd_part, twos, &odd_part, NULL);

  /* witness = b**odd_part mod a */
  fp_init(&witness);
  fp_exptmod(b, &odd_part, a, &witness);

  /* witness == 1 or witness == a - 1 already means "probably prime" */
  if (fp_cmp_d (&witness, 1) == FP_EQ || fp_cmp (&witness, &a_minus_1) == FP_EQ) {
     *result = FP_YES;
     return;
  }

  /* square up to twos-1 times, stopping early once a - 1 shows up */
  for (round = 1; (round < twos) && fp_cmp (&witness, &a_minus_1) != FP_EQ; ++round) {
    fp_sqrmod (&witness, a, &witness);

    /* hitting 1 without passing through a - 1 proves compositeness */
    if (fp_cmp_d (&witness, 1) == FP_EQ) {
       return;
    }
  }

  /* only a chain that ended on a - 1 counts as probably prime */
  if (fp_cmp (&witness, &a_minus_1) == FP_EQ) {
     *result = FP_YES;
  }
 }
--- a/fp_prime_random_ex.c
+++ b/fp_prime_random_ex.c
@ -0,0 +1,97 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Generate a random probable prime of exactly "size" bits.
  *
  *   a      receives the prime
  *   t      must be > 0; it is only validated here, fp_isprime() is called without it
  *   size   bit length of the result, must be > 1
  *   flags  bitwise OR of:
  *            TFM_PRIME_BBS       force the two low bits on (candidate = 3 mod 4)
  *            TFM_PRIME_SAFE      additionally require (a-1)/2 to be prime; implies BBS
  *            TFM_PRIME_2MSB_ON   force the second most significant bit on
  *            TFM_PRIME_2MSB_OFF  force the second most significant bit off
  *   cb     random source; must fill the buffer and return the number of bytes written
  *   dat    opaque pointer passed through to cb
  *
  * Returns FP_OKAY on success, FP_VAL on bad arguments or a short read from
  * the callback, FP_MEM if the scratch buffer cannot be allocated.
  *
  * The candidate is assembled big-endian: tmp[0] is the most significant byte.
  */
 int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback cb, void *dat)
 {
   unsigned char *tmp, maskAND, maskOR_msb, maskAND_2msb, maskOR_lsb, msb2_bit;
   int res, err, bsize, msb2_offset;

   /* sanity check the input */
   if (size <= 1 || t <= 0 || cb == NULL) {
      return FP_VAL;
   }

   /* TFM_PRIME_SAFE implies TFM_PRIME_BBS */
   if (flags & TFM_PRIME_SAFE) {
      flags |= TFM_PRIME_BBS;
   }

   /* calc the byte size (round the bit count up to whole bytes) */
   bsize = (size>>3) + ((size&7) ? 1 : 0);

   /* we need a buffer of bsize bytes */
   tmp = malloc(bsize);
   if (tmp == NULL) {
      return FP_MEM;
   }

   /* mask for the MSbyte: keep the low (size mod 8) bits.  When size is a
    * multiple of 8 the whole byte is in use; a plain 0xFF >> 8 would give 0
    * and throw away every random bit of the top byte.
    */
   maskAND = ((size & 7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));

   /* the second most significant bit sits in tmp[0] unless the top byte
    * holds only a single bit (size mod 8 == 1), in which case it is the
    * high bit of tmp[1].  size > 1 guarantees bsize >= 2 in that case.
    */
   msb2_offset  = ((size & 7) == 1) ? 1 : 0;
   msb2_bit     = (unsigned char)(1 << ((size - 2) & 7));
   maskOR_msb   = 0;
   maskAND_2msb = 0xFF;
   if (flags & TFM_PRIME_2MSB_ON) {
      maskOR_msb   = msb2_bit;
   } else if (flags & TFM_PRIME_2MSB_OFF) {
      maskAND_2msb = (unsigned char)~msb2_bit;
   }

   /* get the maskOR_lsb: always odd, and 3 mod 4 for BBS primes */
   maskOR_lsb         = 1;
   if (flags & TFM_PRIME_BBS) {
      maskOR_lsb     |= 3;
   }

   do {
      /* read the bytes */
      if (cb(tmp, bsize, dat) != bsize) {
         err = FP_VAL;
         goto error;
      }

      /* work over the MSbyte: trim to size and force the top bit on */
      tmp[0]    &= maskAND;
      tmp[0]    |= 1 << ((size - 1) & 7);

      /* apply the second-MSB request, then the low bits (applied last so
       * the candidate is always odd)
       */
      tmp[msb2_offset] &= maskAND_2msb;
      tmp[msb2_offset] |= maskOR_msb;
      tmp[bsize-1]     |= maskOR_lsb;

      /* read it in */
      fp_read_unsigned_bin(a, tmp, bsize);

      /* is it prime? (continue jumps to the loop condition, which retries) */
      res = fp_isprime(a);
      if (res == FP_NO) continue;

      if (flags & TFM_PRIME_SAFE) {
         /* see if (a-1)/2 is prime */
         fp_sub_d(a, 1, a);
         fp_div_2(a, a);

         /* is it prime? */
         res = fp_isprime(a);
      }
   } while (res == FP_NO);

   if (flags & TFM_PRIME_SAFE) {
      /* restore a to the original value: a = 2*((a-1)/2) + 1 */
      fp_mul_2(a, a);
      fp_add_d(a, 1, a);
   }

   err = FP_OKAY;
 error:
   free(tmp);
   return err;
 }
--- a/fp_radix_size.c
+++ b/fp_radix_size.c
@ -0,0 +1,14 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Compute how many bytes are needed to hold "a" as a string in the given
  * radix: one per digit, one for a leading '-' if negative, and one for the
  * terminating NUL.
  *
  * Returns FP_VAL (with *size = 0) if radix is outside [2, 64], the same
  * range fp_read_radix() accepts; otherwise FP_OKAY with *size filled in.
  *
  * NOTE(review): relies on fp_div_d(a, b, quotient, &remainder) being
  * provided by this library -- confirm the prototype in tfm.h.
  */
 int fp_radix_size(fp_int *a, int radix, int *size)
 {
  fp_int   t;
  fp_digit d;

  *size = 0;

  /* check range of the radix */
  if (radix < 2 || radix > 64) {
    return FP_VAL;
  }

  /* zero is written as the single digit "0" plus the NUL */
  if (fp_iszero(a) == FP_YES) {
    *size = 2;
    return FP_OKAY;
  }

  /* work on a non-negative copy so the caller's value is untouched */
  fp_init_copy(&t, a);
  if (t.sign == FP_NEG) {
    ++(*size);              /* room for the '-' */
    t.sign = FP_ZPOS;
  }

  /* strip one radix digit per iteration until nothing is left */
  while (fp_iszero(&t) != FP_YES) {
    fp_div_d(&t, (fp_digit) radix, &t, &d);
    ++(*size);
  }
  (void)d;                  /* the remainders themselves are not needed */

  /* terminating NUL */
  ++(*size);
  return FP_OKAY;
 }
--- a/fp_read_radix.c
+++ b/fp_read_radix.c
@ -0,0 +1,66 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Parse the string "str" as an integer in the given radix and store it in "a".
  *
  * An optional leading '-' marks a negative value.  Digits are looked up in
  * fp_s_rmap; parsing stops quietly at the first character that is not a
  * valid digit for the radix, and whatever was read so far is kept.
  *
  * Returns FP_VAL if radix is outside [2, 64], otherwise FP_OKAY.
  */
 int fp_read_radix(fp_int *a, char *str, int radix)
 {
  int     y, neg;
  char    ch;

  /* make sure the radix is ok */
  if (radix < 2 || radix > 64) {
    return FP_VAL;
  }

  /* if the leading digit is a
   * minus set the sign to negative.
   */
  if (*str == '-') {
    ++str;
    neg = FP_NEG;
  } else {
    neg = FP_ZPOS;
  }

  /* set the integer to the default of zero */
  fp_zero (a);

  /* process each digit of the string */
  while (*str) {
    /* if the radix <= 36 the conversion is case insensitive
     * this allows numbers like 1AB and 1ab to represent the same  value
     * [e.g. in hex].  Radix 36 is included: it still uses only one letter
     * case, so lowercase input must not be rejected.
     * The cast to unsigned char is required because passing a negative
     * plain char to toupper() is undefined behaviour.
     */
    ch = (char) ((radix <= 36) ? toupper ((unsigned char) *str) : *str);
    for (y = 0; y < 64; y++) {
      if (ch == fp_s_rmap[y]) {
         break;
      }
    }

    /* if the char was found in the map
     * and is less than the given radix add it
     * to the number, otherwise exit the loop.
     */
    if (y < radix) {
      fp_mul_d (a, (fp_digit) radix, a);
      fp_add_d (a, (fp_digit) y, a);
    } else {
      break;
    }
    ++str;
  }

  /* set the sign only if a != 0 (never produce a negative zero) */
  if (fp_iszero(a) != FP_YES) {
     a->sign = neg;
  }
  return FP_OKAY;
 }
--- a/fp_read_signed_bin.c
+++ b/fp_read_signed_bin.c
@ -0,0 +1,23 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Load a signed big-endian byte string into `a`.
  *
  * a  destination
  * b  input bytes: b[0] is the sign marker (0 = positive, anything else =
  *    negative), b[1..c-1] is the magnitude as read by fp_read_unsigned_bin
  * c  total number of bytes in b, sign byte included
  *
  * With c <= 0 there is no sign byte to look at, so `a` is simply set to zero.
  * A zero magnitude is always stored as positive.
  */
 void fp_read_signed_bin(fp_int *a, unsigned char *b, int c)
 {
  /* empty input: b[0] does not exist, do not read it */
  if (c <= 0) {
     fp_zero (a);
     return;
  }

  /* read magnitude */
  fp_read_unsigned_bin (a, b + 1, c - 1);

  /* first byte is 0 for positive, non-zero for negative; never hand back a
   * negative zero */
  if (b[0] != 0 && fp_iszero (a) != FP_YES) {
     a->sign = FP_NEG;
  } else {
     a->sign = FP_ZPOS;
  }
 }
--- a/fp_read_unsigned_bin.c
+++ b/fp_read_unsigned_bin.c
@ -0,0 +1,24 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Load `c` bytes from `b`, most significant byte first, into `a` as a
  * non-negative value.  Nothing is read when c <= 0 and `a` ends up zero.
  */
 void fp_read_unsigned_bin(fp_int *a, unsigned char *b, int c)
 {
  int remaining;

  /* start from zero */
  fp_zero (a);

  /* shift what we have up by one byte, then drop the next byte in below it */
  remaining = c;
  while (remaining > 0) {
    fp_mul_2d (a, 8, a);
    a->dp[0] |= *b;
    a->used  += 1;
    ++b;
    --remaining;
  }

  /* the digit count above is an over-estimate; trim it */
  fp_clamp (a);
 }
--- a/fp_reverse.c
+++ b/fp_reverse.c
@ -0,0 +1,27 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* flip the bytes s[0..len-1] end for end, in place (used for radix code);
  * a length of one or less leaves the buffer untouched */
 void bn_reverse (unsigned char *s, int len)
 {
  int lo, hi;

  /* walk inwards from both ends, swapping as we go */
  for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
    unsigned char held = s[hi];
    s[hi] = s[lo];
    s[lo] = held;
  }
 }
--- a/fp_rshd.c
+++ b/fp_rshd.c
@ -0,0 +1,36 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Shift `a` right by `x` whole digits, in place (the low x digits are
  * discarded).
  *
  * a  number to shift
  * x  digit count; a negative count is ignored and leaves `a` unchanged,
  *    a count of a->used or more yields zero
  */
 void fp_rshd(fp_int *a, int x)
 {
   int y;

   /* a negative count would make the copy loop below read a->dp[y+x] in front
    * of the digit array */
   if (x < 0) {
      return;
   }

   /* too many digits just zero and return */
   if (x >= a->used) {
      fp_zero(a);
      return;
   }

   /* shift */
   for (y = 0; y < a->used - x; y++) {
      a->dp[y] = a->dp[y+x];
   }

   /* zero rest */
   for (; y < a->used; y++) {
      a->dp[y] = 0;
   }

   /* decrement count */
   a->used -= x;
   fp_clamp(a);
 }
--- a/fp_s_rmap.c
+++ b/fp_s_rmap.c
@ -0,0 +1,13 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* digit alphabet for radix conversions: the character at index v is the
  * digit with value v (64 entries) */
 const char *fp_s_rmap =
    "0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "+/";
--- a/fp_set.c
+++ b/fp_set.c
@ -0,0 +1,17 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* make `a` hold the single digit value `b` */
 void fp_set(fp_int *a, fp_digit b)
 {
   fp_zero(a);
   a->dp[0] = b;

   /* zero is stored with no digits in use */
   if (b != 0) {
      a->used = 1;
   } else {
      a->used = 0;
   }
 }
--- a/fp_signed_bin_size.c
+++ b/fp_signed_bin_size.c
@ -0,0 +1,15 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* size of `a` in the signed binary format: the unsigned size plus one byte
  * (fp_read_signed_bin treats the first byte as the sign marker) */
 int fp_signed_bin_size(fp_int *a)
 {
  return fp_unsigned_bin_size (a) + 1;
 }
--- a/fp_sqr.c
+++ b/fp_sqr.c
@ -0,0 +1,107 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* B = A*A
  *
  * Inputs of up to 48 digits are handed to a comba squarer picked by digit
  * count; larger inputs are split into two halves and squared recursively
  * (Karatsuba), which needs three half-size squarings.
  *
  * A  value to square
  * B  receives the square; on the Karatsuba path it is written only after A
  *    has been fully read
  */
 void fp_sqr(fp_int *A, fp_int *B)
 {
    int    r, y, s;
    fp_int aa, bb, comp, amb, t1;
    y = A->used;
    if (y <= 48) { 
        /* The unrolled squarers always process their full fixed width (4, 8,
         * 16 or 32 digits), so the wider ones are only chosen when y is at or
         * just below that width.  Everything else uses the generic loop.
         */
        if (y <= 4) {
           fp_sqr_comba4(A,B);
        } else if (y <= 8) {
           fp_sqr_comba8(A,B);
        } else if (y <= 16 && y >= 12) {
           fp_sqr_comba16(A,B);
 #ifdef TFM_HUGE
        } else if (y <= 32 && y >= 28) {
           fp_sqr_comba32(A,B);
 #endif
        } else {
           fp_sqr_comba(A, B);
        }
    } else {
        /* Karatsuba squaring.
         *
         * Write A = a*R + b where b is the low r digits of A, a is the rest
         * and R is the weight of r digits.  Then
         *
         *    A^2 = a^2*R^2 + (a^2 + b^2 - (a-b)^2)*R + b^2
         *
         * so three squarings (a^2, b^2 and (a-b)^2) are enough; the result is
         * assembled with digit shifts and additions.
         *
         * Possible optimization: the two halves could be copied with memcpy
         * and only the remaining digits zeroed.
         */
        /* get our value of r */
        r = y >> 1;
        /* t1 = a, the high half: same effect as copying A and shifting it
         * right by r digits, with every digit above it cleared */
        for (s = 0; s < A->used - r; s++) {
            t1.dp[s] = A->dp[s+r];
        }
        for (; s < FP_SIZE; s++) {
            t1.dp[s] = 0; 
        }
        /* r is y/2 with y == A->used, so the first branch is always taken */
        if (A->used >= r) {
           t1.used = A->used - r;
        } else {
           t1.used = 0;
        }
        t1.sign = A->sign;
        /* keep a copy of a in amb for the (a-b) term, then aa = a^2; the
         * destination is zeroed before each recursive call */
        fp_copy(&t1, &amb); 
        fp_zero(&aa);
        fp_sqr(&t1, &aa);
        /* t1 = b, the low half: the low r digits of A with everything above
         * cleared (t1.sign is left as set above) */
        for (s = 0; s < r; s++) {
            t1.dp[s] = A->dp[s];
        }
        for (; s < FP_SIZE; s++) {
            t1.dp[s]   = 0; 
        }
        t1.used = r;
        fp_clamp(&t1);
        /* amb = a - b, bb = b^2 */
        fp_sub(&amb, &t1, &amb); 
        fp_zero(&bb);
        fp_sqr(&t1, &bb);
        /* comp = (a-b)^2 */
        fp_zero(&comp);
        fp_sqr(&amb, &comp);
        /* middle term: comp = (a^2 + b^2 - (a-b)^2) * R.  The xor flips the
         * sign of comp; NOTE(review): this relies on the two sign codes
         * differing only in bit 0 -- confirm against the header. */
        comp.sign ^= 1;
        fp_add(&comp, &aa, &comp);
        fp_add(&comp, &bb, &comp);
        fp_lshd(&comp, r);
        /* leading term: aa = a^2 * R^2 */
        fp_lshd(&aa, r+r);
        /* now sum them together; a square is never negative */
        fp_zero(B);
        fp_add(&aa, &comp, B);
        fp_add(&bb, B, B);    
        B->sign = FP_ZPOS;
    }
 }
--- a/fp_sqr_comba.c
+++ b/fp_sqr_comba.c
@ -0,0 +1,956 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* About this file...
 */
 #if defined(TFM_X86)
 /* x86-32 optimized
  *
  * Column helpers for the comba squarers below.  The running column sum is
  * kept in three digits c2:c1:c0 (c0 least significant) that the function
  * using these macros must declare.
  */
 /* no setup needed on this target */
 #define COMBA_START
 /* reset the three digit accumulator */
 #define CLEAR_CARRY \
   c0 = c1 = c2 = 0;
 /* write the finished column digit (c0) to x */
 #define COMBA_STORE(x) \
   x = c0;
 /* write the second accumulator digit (c1) to x; the squarers use it once,
  * for the digit above the last column */
 #define COMBA_STORE2(x) \
   x = c1;
 /* move to the next column: c0 is dropped, c1 and c2 slide down one digit */
 #define CARRY_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 /* no teardown needed on this target */
 #define COMBA_FINI
 /* c2:c1:c0 += i*i.  Only i reaches the asm (eax is multiplied by itself); j
  * is ignored, and every use in this file passes the same digit twice. */
 #define SQRADD(i, j)                                      \
 asm volatile (                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %%eax        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
 /* c2:c1:c0 += 2*i*j: one multiply, then the edx:eax product is added into
  * the accumulator twice */
 #define SQRADD2(i, j)                                     \
 asm volatile (                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
 #elif defined(TFM_X86_64)
 /* x86-64 optimized
  *
  * Same helper set as the other targets, using the 64-bit registers.  The
  * running column sum is kept in three digits c2:c1:c0 (c0 least significant)
  * that the function using these macros must declare.
  */
 /* no setup needed on this target */
 #define COMBA_START
 /* reset the three digit accumulator */
 #define CLEAR_CARRY \
   c0 = c1 = c2 = 0;
 /* write the finished column digit (c0) to x */
 #define COMBA_STORE(x) \
   x = c0;
 /* write the second accumulator digit (c1) to x; the squarers use it once,
  * for the digit above the last column */
 #define COMBA_STORE2(x) \
   x = c1;
 /* move to the next column: c0 is dropped, c1 and c2 slide down one digit */
 #define CARRY_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 /* no teardown needed on this target */
 #define COMBA_FINI
 /* c2:c1:c0 += i*i.  Only i reaches the asm (rax is multiplied by itself); j
  * is ignored, and every use in this file passes the same digit twice. */
 #define SQRADD(i, j)                                      \
 asm volatile (                                            \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %%rax        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc");
 /* c2:c1:c0 += 2*i*j: one multiply, then the rdx:rax product is added into
  * the accumulator twice */
 #define SQRADD2(i, j)                                     \
 asm volatile (                                            \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%rax","%rdx","%cc");
 #elif defined(TFM_SSE2)
 /* SSE2 Optimized
  *
  * Same helper set as the other targets; the multiply is done with pmuludq in
  * the mm registers and the 64-bit product is then moved to eax (low half)
  * and edx (high half) for the carry chain.  The running column sum is kept
  * in three digits c2:c1:c0 (c0 least significant) that the function using
  * these macros must declare.
  */
 /* no setup needed on this target */
 #define COMBA_START
 /* reset the three digit accumulator */
 #define CLEAR_CARRY \
   c0 = c1 = c2 = 0;
 /* write the finished column digit (c0) to x */
 #define COMBA_STORE(x) \
   x = c0;
 /* write the second accumulator digit (c1) to x; the squarers use it once,
  * for the digit above the last column */
 #define COMBA_STORE2(x) \
   x = c1;
 /* move to the next column: c0 is dropped, c1 and c2 slide down one digit */
 #define CARRY_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 /* the macros below use the mm registers, so finish with emms */
 #define COMBA_FINI \
   asm("emms");
 /* c2:c1:c0 += i*i.  Only i reaches the asm (mm0 is multiplied by itself); j
  * is ignored, and every use in this file passes the same digit twice.
  * NOTE(review): mm0 is modified but does not appear in the clobber list. */
 #define SQRADD(i, j)                                      \
 asm volatile (                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
 /* c2:c1:c0 += 2*i*j: one multiply, then the edx:eax product is added into
  * the accumulator twice.
  * NOTE(review): mm0 and mm1 are modified but do not appear in the clobber
  * list. */
 #define SQRADD2(i, j)                                     \
 asm volatile (                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
 #elif defined(TFM_ARM)
 /* ARM code */
 #define COMBA_START
 #define CLEAR_CARRY \
   c0 = c1 = c2 = 0;
 #define COMBA_STORE(x) \
   x = c0;
 #define COMBA_STORE2(x) \
   x = c1;
 #define CARRY_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 #define COMBA_FINI
 /* multiplies point i and j, updates carry "c1" and digit c2 */
 #define SQRADD(i, j)                                             \
 asm(                                                             \
 "  UMULL  r0,r1,%6,%6              \n\t"                         \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
 /* for squaring some of the terms are doubled... */
 #define SQRADD2(i, j)                                            \
 asm(                                                             \
 "  UMULL  r0,r1,%6,%7              \n\t"                         \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
 #else
 /* ISO C portable code
  *
  * Column helpers for the comba squarers below.  The running column sum is
  * kept in three digits c2:c1:c0 (c0 least significant); the function using
  * these macros must declare them as fp_digit, together with an fp_word
  * named t that receives each product.
  *
  * The arithmetic assumes an fp_digit holds exactly DIGIT_BIT bits and an
  * fp_word holds at least two digits, as the original comparisons did.
  */
 /* no setup needed in portable C */
 #define COMBA_START
 /* reset the three digit accumulator */
 #define CLEAR_CARRY \
   c0 = c1 = c2 = 0;
 /* write the finished column digit (c0) to x */
 #define COMBA_STORE(x) \
   x = c0;
 /* write the second accumulator digit (c1) to x; the squarers use it once,
  * for the digit above the last column */
 #define COMBA_STORE2(x) \
   x = c1;
 /* move to the next column: c0 is dropped, c1 and c2 slide down one digit */
 #define CARRY_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 /* no teardown needed in portable C */
 #define COMBA_FINI
 /* c2:c1:c0 += i*j (the squarers pass the same digit twice).
  *
  * Each step is summed in a double-width temporary so the carry out of c0 is
  * added to c1 together with the high half of the product.  Detecting the
  * carries by comparison, one at a time, dropped the carry into c2 whenever
  * the carry out of c0 wrapped a full c1 round to zero.
  * i and j are evaluated exactly once each.
  */
 #define SQRADD(i, j)                                                        \
   do { fp_word sqr_acc_;                                                    \
   t        = ((fp_word)(i)) * ((fp_word)(j));                               \
   sqr_acc_ = ((fp_word)c0) + ((fp_digit)t);                                 \
   c0       = (fp_digit)sqr_acc_;                                            \
   sqr_acc_ = ((fp_word)c1) + (t >> DIGIT_BIT) + (sqr_acc_ >> DIGIT_BIT);    \
   c1       = (fp_digit)sqr_acc_;                                            \
   c2      += (fp_digit)(sqr_acc_ >> DIGIT_BIT);                             \
   } while (0);
 /* c2:c1:c0 += 2*i*j: for squaring the cross terms are doubled, so the same
  * product is accumulated twice (doubling it first could overflow fp_word).
  * i and j are evaluated exactly once each.
  */
 #define SQRADD2(i, j)                                                       \
   do { fp_word sqr_acc_;                                                    \
   t        = ((fp_word)(i)) * ((fp_word)(j));                               \
   sqr_acc_ = ((fp_word)c0) + ((fp_digit)t);                                 \
   c0       = (fp_digit)sqr_acc_;                                            \
   sqr_acc_ = ((fp_word)c1) + (t >> DIGIT_BIT) + (sqr_acc_ >> DIGIT_BIT);    \
   c1       = (fp_digit)sqr_acc_;                                            \
   c2      += (fp_digit)(sqr_acc_ >> DIGIT_BIT);                             \
   sqr_acc_ = ((fp_word)c0) + ((fp_digit)t);                                 \
   c0       = (fp_digit)sqr_acc_;                                            \
   sqr_acc_ = ((fp_word)c1) + (t >> DIGIT_BIT) + (sqr_acc_ >> DIGIT_BIT);    \
   c1       = (fp_digit)sqr_acc_;                                            \
   c2      += (fp_digit)(sqr_acc_ >> DIGIT_BIT);                             \
   } while (0);
 #endif
 /* generic comba squarer: B = A*A for any digit count.
  *
  * The result is produced one output digit (column) at a time, low to high.
  * B may be the same object as A.  At most FP_SIZE-1 columns are produced.
  */
 void fp_sqr_comba(fp_int *A, fp_int *B)
 {
  fp_digit  c0, c1, c2;
  fp_word   t;
  fp_int    scratch, *out;
  int       ndigits, col;

  /* a square has twice the digits of its input; trim to what fits */
  ndigits = 2 * A->used;
  if (ndigits > FP_SIZE-1) {
     ndigits = FP_SIZE-1;
  }

  COMBA_START;
  CLEAR_CARRY;

  /* squaring in place would overwrite digits that are still to be read, so
   * in that case the result is built in a scratch integer */
  out = (A == B) ? &scratch : B;
  fp_zero(out);

  for (col = 0; col < ndigits; col++) {
      int      lo, hi, pairs, half;
      fp_digit *plo, *phi;

      /* outermost digit pair (lo, hi) with lo + hi == col */
      hi  = MIN(A->used-1, col);
      lo  = col - hi;
      plo = A->dp + lo;
      phi = A->dp + hi;

      /* number of pairs that stay inside the number ... */
      pairs = MIN(A->used-lo, hi+1);

      /* ... of which only those with lo < hi are taken here: the two indices
       * close in on each other two steps at a time, hence the halving, and
       * each such pair is counted double by SQRADD2 */
      half  = (hi-lo+1)>>1;
      pairs = MIN(pairs, half);

      /* forward carries */
      CARRY_FORWARD;

      while (pairs-- > 0) {
          SQRADD2(*plo++, *phi--);
      }

      /* even columns also contain the square of their middle digit */
      if ((col & 1) == 0) {
          SQRADD(A->dp[col>>1], A->dp[col>>1]);
      }

      /* store it */
      COMBA_STORE(out->dp[col]);
  }

  /* col == ndigits here: the digit above the last column */
  COMBA_STORE2(out->dp[col]);
  COMBA_FINI;

  /* setup dest */
  out->used = ndigits;
  fp_clamp (out);
  if (out != B) {
     fp_copy(out, B);
  }
 }
 /* Unrolled comba squarer for inputs of at most four digits: B = A*A.
  *
  * A  value to square; A->dp[0..3] are always read whatever A->used says, so
  *    digits above A->used must be zero
  * B  receives the eight digit result, marked non-negative and clamped.  The
  *    digits are collected in a local buffer first, so B may be the same
  *    object as A.
  *
  * Digits of B above the result are cleared, as fp_sqr_comba does for its
  * destination; previously they kept whatever B held before the call.
  */
 void fp_sqr_comba4(fp_int *A, fp_int *B)
 {
   fp_word t;
   fp_digit *a, b[8], c0, c1, c2;
   a = A->dp;
   COMBA_START; 
   /* clear carries */
   CLEAR_CARRY;
   /* output 0 */
   SQRADD(a[0],a[0]);
   COMBA_STORE(b[0]);
   /* output 1 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[1]); 
   COMBA_STORE(b[1]);
   /* output 2 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
   COMBA_STORE(b[2]);
   /* output 3 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
   COMBA_STORE(b[3]);
   /* output 4 */
   CARRY_FORWARD;
   SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
   COMBA_STORE(b[4]);
   /* output 5 */
   CARRY_FORWARD;
   SQRADD2(a[2], a[3]); 
   COMBA_STORE(b[5]);
   /* output 6 */
   CARRY_FORWARD;
   SQRADD(a[3], a[3]); 
   COMBA_STORE(b[6]);
   /* output 7 is whatever carried out of column 6 */
   COMBA_STORE2(b[7]);
   COMBA_FINI;
   B->used = 8;
   B->sign = FP_ZPOS;
   memcpy(B->dp, b, 8 * sizeof(fp_digit));
   /* wipe digits left over from B's previous value */
   memset(B->dp + 8, 0, (FP_SIZE - 8) * sizeof(fp_digit));
   fp_clamp(B);
 }
 /* Unrolled comba squarer for inputs of at most eight digits: B = A*A.
  *
  * A  value to square; A->dp[0..7] are always read whatever A->used says, so
  *    digits above A->used must be zero
  * B  receives the sixteen digit result, marked non-negative and clamped.
  *    The digits are collected in a local buffer first, so B may be the same
  *    object as A.
  *
  * Digits of B above the result are cleared, as fp_sqr_comba does for its
  * destination; previously they kept whatever B held before the call.
  */
 void fp_sqr_comba8(fp_int *A, fp_int *B)
 {
   fp_word t;
   fp_digit *a, b[16], c0, c1, c2;
   a = A->dp;
   COMBA_START; 
   /* clear carries */
   CLEAR_CARRY;
   /* output 0 */
   SQRADD(a[0],a[0]);
   COMBA_STORE(b[0]);
   /* output 1 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[1]); 
   COMBA_STORE(b[1]);
   /* output 2 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
   COMBA_STORE(b[2]);
   /* output 3 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
   COMBA_STORE(b[3]);
   /* output 4 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
   COMBA_STORE(b[4]);
   /* output 5 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[5]); SQRADD2(a[1], a[4]); SQRADD2(a[2], a[3]); 
   COMBA_STORE(b[5]);
   /* output 6 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[6]); SQRADD2(a[1], a[5]); SQRADD2(a[2], a[4]); SQRADD(a[3], a[3]); 
   COMBA_STORE(b[6]);
   /* output 7 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[7]); SQRADD2(a[1], a[6]); SQRADD2(a[2], a[5]); SQRADD2(a[3], a[4]); 
   COMBA_STORE(b[7]);
   /* output 8 */
   CARRY_FORWARD;
   SQRADD2(a[1], a[7]); SQRADD2(a[2], a[6]); SQRADD2(a[3], a[5]); SQRADD(a[4], a[4]); 
   COMBA_STORE(b[8]);
   /* output 9 */
   CARRY_FORWARD;
   SQRADD2(a[2], a[7]); SQRADD2(a[3], a[6]); SQRADD2(a[4], a[5]); 
   COMBA_STORE(b[9]);
   /* output 10 */
   CARRY_FORWARD;
   SQRADD2(a[3], a[7]); SQRADD2(a[4], a[6]); SQRADD(a[5], a[5]); 
   COMBA_STORE(b[10]);
   /* output 11 */
   CARRY_FORWARD;
   SQRADD2(a[4], a[7]); SQRADD2(a[5], a[6]); 
   COMBA_STORE(b[11]);
   /* output 12 */
   CARRY_FORWARD;
   SQRADD2(a[5], a[7]); SQRADD(a[6], a[6]); 
   COMBA_STORE(b[12]);
   /* output 13 */
   CARRY_FORWARD;
   SQRADD2(a[6], a[7]); 
   COMBA_STORE(b[13]);
   /* output 14 */
   CARRY_FORWARD;
   SQRADD(a[7], a[7]); 
   COMBA_STORE(b[14]);
   /* output 15 is whatever carried out of column 14 */
   COMBA_STORE2(b[15]);
   COMBA_FINI;
   B->used = 16;
   B->sign = FP_ZPOS;
   memcpy(B->dp, b, 16 * sizeof(fp_digit));
   /* wipe digits left over from B's previous value */
   memset(B->dp + 16, 0, (FP_SIZE - 16) * sizeof(fp_digit));
   fp_clamp(B);
 }
 /* Unrolled comba squarer for inputs of at most sixteen digits: B = A*A.
  *
  * A  value to square; A->dp[0..15] are always read whatever A->used says,
  *    so digits above A->used must be zero
  * B  receives the thirty-two digit result, marked non-negative and clamped.
  *    The digits are collected in a local buffer first, so B may be the same
  *    object as A.
  *
  * Digits of B above the result are cleared, as fp_sqr_comba does for its
  * destination; previously they kept whatever B held before the call.
  */
 void fp_sqr_comba16(fp_int *A, fp_int *B)
 {
   fp_word t;
   fp_digit *a, b[32], c0, c1, c2;
   a = A->dp;
   COMBA_START; 
   /* clear carries */
   CLEAR_CARRY;
   /* output 0 */
   SQRADD(a[0],a[0]);
   COMBA_STORE(b[0]);
   /* output 1 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[1]); 
   COMBA_STORE(b[1]);
   /* output 2 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
   COMBA_STORE(b[2]);
   /* output 3 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
   COMBA_STORE(b[3]);
   /* output 4 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
   COMBA_STORE(b[4]);
   /* output 5 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[5]); SQRADD2(a[1], a[4]); SQRADD2(a[2], a[3]); 
   COMBA_STORE(b[5]);
   /* output 6 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[6]); SQRADD2(a[1], a[5]); SQRADD2(a[2], a[4]); SQRADD(a[3], a[3]); 
   COMBA_STORE(b[6]);
   /* output 7 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[7]); SQRADD2(a[1], a[6]); SQRADD2(a[2], a[5]); SQRADD2(a[3], a[4]); 
   COMBA_STORE(b[7]);
   /* output 8 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[8]); SQRADD2(a[1], a[7]); SQRADD2(a[2], a[6]); SQRADD2(a[3], a[5]); SQRADD(a[4], a[4]); 
   COMBA_STORE(b[8]);
   /* output 9 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[9]); SQRADD2(a[1], a[8]); SQRADD2(a[2], a[7]); SQRADD2(a[3], a[6]); SQRADD2(a[4], a[5]); 
   COMBA_STORE(b[9]);
   /* output 10 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[10]); SQRADD2(a[1], a[9]); SQRADD2(a[2], a[8]); SQRADD2(a[3], a[7]); SQRADD2(a[4], a[6]); SQRADD(a[5], a[5]); 
   COMBA_STORE(b[10]);
   /* output 11 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[11]); SQRADD2(a[1], a[10]); SQRADD2(a[2], a[9]); SQRADD2(a[3], a[8]); SQRADD2(a[4], a[7]); SQRADD2(a[5], a[6]); 
   COMBA_STORE(b[11]);
   /* output 12 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[12]); SQRADD2(a[1], a[11]); SQRADD2(a[2], a[10]); SQRADD2(a[3], a[9]); SQRADD2(a[4], a[8]); SQRADD2(a[5], a[7]); SQRADD(a[6], a[6]); 
   COMBA_STORE(b[12]);
   /* output 13 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[13]); SQRADD2(a[1], a[12]); SQRADD2(a[2], a[11]); SQRADD2(a[3], a[10]); SQRADD2(a[4], a[9]); SQRADD2(a[5], a[8]); SQRADD2(a[6], a[7]); 
   COMBA_STORE(b[13]);
   /* output 14 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[14]); SQRADD2(a[1], a[13]); SQRADD2(a[2], a[12]); SQRADD2(a[3], a[11]); SQRADD2(a[4], a[10]); SQRADD2(a[5], a[9]); SQRADD2(a[6], a[8]); SQRADD(a[7], a[7]); 
   COMBA_STORE(b[14]);
   /* output 15 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[15]); SQRADD2(a[1], a[14]); SQRADD2(a[2], a[13]); SQRADD2(a[3], a[12]); SQRADD2(a[4], a[11]); SQRADD2(a[5], a[10]); SQRADD2(a[6], a[9]); SQRADD2(a[7], a[8]); 
   COMBA_STORE(b[15]);
   /* output 16 */
   CARRY_FORWARD;
   SQRADD2(a[1], a[15]); SQRADD2(a[2], a[14]); SQRADD2(a[3], a[13]); SQRADD2(a[4], a[12]); SQRADD2(a[5], a[11]); SQRADD2(a[6], a[10]); SQRADD2(a[7], a[9]); SQRADD(a[8], a[8]); 
   COMBA_STORE(b[16]);
   /* output 17 */
   CARRY_FORWARD;
   SQRADD2(a[2], a[15]); SQRADD2(a[3], a[14]); SQRADD2(a[4], a[13]); SQRADD2(a[5], a[12]); SQRADD2(a[6], a[11]); SQRADD2(a[7], a[10]); SQRADD2(a[8], a[9]); 
   COMBA_STORE(b[17]);
   /* output 18 */
   CARRY_FORWARD;
   SQRADD2(a[3], a[15]); SQRADD2(a[4], a[14]); SQRADD2(a[5], a[13]); SQRADD2(a[6], a[12]); SQRADD2(a[7], a[11]); SQRADD2(a[8], a[10]); SQRADD(a[9], a[9]); 
   COMBA_STORE(b[18]);
   /* output 19 */
   CARRY_FORWARD;
   SQRADD2(a[4], a[15]); SQRADD2(a[5], a[14]); SQRADD2(a[6], a[13]); SQRADD2(a[7], a[12]); SQRADD2(a[8], a[11]); SQRADD2(a[9], a[10]); 
   COMBA_STORE(b[19]);
   /* output 20 */
   CARRY_FORWARD;
   SQRADD2(a[5], a[15]); SQRADD2(a[6], a[14]); SQRADD2(a[7], a[13]); SQRADD2(a[8], a[12]); SQRADD2(a[9], a[11]); SQRADD(a[10], a[10]); 
   COMBA_STORE(b[20]);
   /* output 21 */
   CARRY_FORWARD;
   SQRADD2(a[6], a[15]); SQRADD2(a[7], a[14]); SQRADD2(a[8], a[13]); SQRADD2(a[9], a[12]); SQRADD2(a[10], a[11]); 
   COMBA_STORE(b[21]);
   /* output 22 */
   CARRY_FORWARD;
   SQRADD2(a[7], a[15]); SQRADD2(a[8], a[14]); SQRADD2(a[9], a[13]); SQRADD2(a[10], a[12]); SQRADD(a[11], a[11]); 
   COMBA_STORE(b[22]);
   /* output 23 */
   CARRY_FORWARD;
   SQRADD2(a[8], a[15]); SQRADD2(a[9], a[14]); SQRADD2(a[10], a[13]); SQRADD2(a[11], a[12]); 
   COMBA_STORE(b[23]);
   /* output 24 */
   CARRY_FORWARD;
   SQRADD2(a[9], a[15]); SQRADD2(a[10], a[14]); SQRADD2(a[11], a[13]); SQRADD(a[12], a[12]); 
   COMBA_STORE(b[24]);
   /* output 25 */
   CARRY_FORWARD;
   SQRADD2(a[10], a[15]); SQRADD2(a[11], a[14]); SQRADD2(a[12], a[13]); 
   COMBA_STORE(b[25]);
   /* output 26 */
   CARRY_FORWARD;
   SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]); 
   COMBA_STORE(b[26]);
   /* output 27 */
   CARRY_FORWARD;
   SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]); 
   COMBA_STORE(b[27]);
   /* output 28 */
   CARRY_FORWARD;
   SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]); 
   COMBA_STORE(b[28]);
   /* output 29 */
   CARRY_FORWARD;
   SQRADD2(a[14], a[15]); 
   COMBA_STORE(b[29]);
   /* output 30 */
   CARRY_FORWARD;
   SQRADD(a[15], a[15]); 
   COMBA_STORE(b[30]);
   /* output 31 is whatever carried out of column 30 */
   COMBA_STORE2(b[31]);
   COMBA_FINI;
   B->used = 32;
   B->sign = FP_ZPOS;
   memcpy(B->dp, b, 32 * sizeof(fp_digit));
   /* wipe digits left over from B's previous value */
   memset(B->dp + 32, 0, (FP_SIZE - 32) * sizeof(fp_digit));
   fp_clamp(B);
 }
 #ifdef TFM_HUGE
 void fp_sqr_comba32(fp_int *A, fp_int *B)
 {
   fp_word t;
   fp_digit *a, b[64], c0, c1, c2;
   a = A->dp;
   COMBA_START; 
   /* clear carries */
   CLEAR_CARRY;
   /* output 0 */
   SQRADD(a[0],a[0]);
   COMBA_STORE(b[0]);
   /* output 1 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[1]); 
   COMBA_STORE(b[1]);
   /* output 2 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
   COMBA_STORE(b[2]);
   /* output 3 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
   COMBA_STORE(b[3]);
   /* output 4 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
   COMBA_STORE(b[4]);
   /* output 5 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[5]); SQRADD2(a[1], a[4]); SQRADD2(a[2], a[3]); 
   COMBA_STORE(b[5]);
   /* output 6 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[6]); SQRADD2(a[1], a[5]); SQRADD2(a[2], a[4]); SQRADD(a[3], a[3]); 
   COMBA_STORE(b[6]);
   /* output 7 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[7]); SQRADD2(a[1], a[6]); SQRADD2(a[2], a[5]); SQRADD2(a[3], a[4]); 
   COMBA_STORE(b[7]);
   /* output 8 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[8]); SQRADD2(a[1], a[7]); SQRADD2(a[2], a[6]); SQRADD2(a[3], a[5]); SQRADD(a[4], a[4]); 
   COMBA_STORE(b[8]);
   /* output 9 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[9]); SQRADD2(a[1], a[8]); SQRADD2(a[2], a[7]); SQRADD2(a[3], a[6]); SQRADD2(a[4], a[5]); 
   COMBA_STORE(b[9]);
   /* output 10 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[10]); SQRADD2(a[1], a[9]); SQRADD2(a[2], a[8]); SQRADD2(a[3], a[7]); SQRADD2(a[4], a[6]); SQRADD(a[5], a[5]); 
   COMBA_STORE(b[10]);
   /* output 11 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[11]); SQRADD2(a[1], a[10]); SQRADD2(a[2], a[9]); SQRADD2(a[3], a[8]); SQRADD2(a[4], a[7]); SQRADD2(a[5], a[6]); 
   COMBA_STORE(b[11]);
   /* output 12 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[12]); SQRADD2(a[1], a[11]); SQRADD2(a[2], a[10]); SQRADD2(a[3], a[9]); SQRADD2(a[4], a[8]); SQRADD2(a[5], a[7]); SQRADD(a[6], a[6]); 
   COMBA_STORE(b[12]);
   /* output 13 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[13]); SQRADD2(a[1], a[12]); SQRADD2(a[2], a[11]); SQRADD2(a[3], a[10]); SQRADD2(a[4], a[9]); SQRADD2(a[5], a[8]); SQRADD2(a[6], a[7]); 
   COMBA_STORE(b[13]);
   /* output 14 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[14]); SQRADD2(a[1], a[13]); SQRADD2(a[2], a[12]); SQRADD2(a[3], a[11]); SQRADD2(a[4], a[10]); SQRADD2(a[5], a[9]); SQRADD2(a[6], a[8]); SQRADD(a[7], a[7]); 
   COMBA_STORE(b[14]);
   /* output 15 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[15]); SQRADD2(a[1], a[14]); SQRADD2(a[2], a[13]); SQRADD2(a[3], a[12]); SQRADD2(a[4], a[11]); SQRADD2(a[5], a[10]); SQRADD2(a[6], a[9]); SQRADD2(a[7], a[8]); 
   COMBA_STORE(b[15]);
   /* output 16 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[16]); SQRADD2(a[1], a[15]); SQRADD2(a[2], a[14]); SQRADD2(a[3], a[13]); SQRADD2(a[4], a[12]); SQRADD2(a[5], a[11]); SQRADD2(a[6], a[10]); SQRADD2(a[7], a[9]); SQRADD(a[8], a[8]); 
   COMBA_STORE(b[16]);
   /* output 17 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[17]); SQRADD2(a[1], a[16]); SQRADD2(a[2], a[15]); SQRADD2(a[3], a[14]); SQRADD2(a[4], a[13]); SQRADD2(a[5], a[12]); SQRADD2(a[6], a[11]); SQRADD2(a[7], a[10]); SQRADD2(a[8], a[9]); 
   COMBA_STORE(b[17]);
   /* output 18 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[18]); SQRADD2(a[1], a[17]); SQRADD2(a[2], a[16]); SQRADD2(a[3], a[15]); SQRADD2(a[4], a[14]); SQRADD2(a[5], a[13]); SQRADD2(a[6], a[12]); SQRADD2(a[7], a[11]); SQRADD2(a[8], a[10]); SQRADD(a[9], a[9]); 
   COMBA_STORE(b[18]);
   /* output 19 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[19]); SQRADD2(a[1], a[18]); SQRADD2(a[2], a[17]); SQRADD2(a[3], a[16]); SQRADD2(a[4], a[15]); SQRADD2(a[5], a[14]); SQRADD2(a[6], a[13]); SQRADD2(a[7], a[12]); SQRADD2(a[8], a[11]); SQRADD2(a[9], a[10]); 
   COMBA_STORE(b[19]);
   /* output 20 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[20]); SQRADD2(a[1], a[19]); SQRADD2(a[2], a[18]); SQRADD2(a[3], a[17]); SQRADD2(a[4], a[16]); SQRADD2(a[5], a[15]); SQRADD2(a[6], a[14]); SQRADD2(a[7], a[13]); SQRADD2(a[8], a[12]); SQRADD2(a[9], a[11]); SQRADD(a[10], a[10]); 
   COMBA_STORE(b[20]);
   /* output 21 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[21]); SQRADD2(a[1], a[20]); SQRADD2(a[2], a[19]); SQRADD2(a[3], a[18]); SQRADD2(a[4], a[17]); SQRADD2(a[5], a[16]); SQRADD2(a[6], a[15]); SQRADD2(a[7], a[14]); SQRADD2(a[8], a[13]); SQRADD2(a[9], a[12]); SQRADD2(a[10], a[11]); 
   COMBA_STORE(b[21]);
   /* output 22 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[22]); SQRADD2(a[1], a[21]); SQRADD2(a[2], a[20]); SQRADD2(a[3], a[19]); SQRADD2(a[4], a[18]); SQRADD2(a[5], a[17]); SQRADD2(a[6], a[16]); SQRADD2(a[7], a[15]); SQRADD2(a[8], a[14]); SQRADD2(a[9], a[13]); SQRADD2(a[10], a[12]); SQRADD(a[11], a[11]); 
   COMBA_STORE(b[22]);
   /* output 23 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[23]); SQRADD2(a[1], a[22]); SQRADD2(a[2], a[21]); SQRADD2(a[3], a[20]); SQRADD2(a[4], a[19]); SQRADD2(a[5], a[18]); SQRADD2(a[6], a[17]); SQRADD2(a[7], a[16]); SQRADD2(a[8], a[15]); SQRADD2(a[9], a[14]); SQRADD2(a[10], a[13]); SQRADD2(a[11], a[12]); 
   COMBA_STORE(b[23]);
   /* output 24 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[24]); SQRADD2(a[1], a[23]); SQRADD2(a[2], a[22]); SQRADD2(a[3], a[21]); SQRADD2(a[4], a[20]); SQRADD2(a[5], a[19]); SQRADD2(a[6], a[18]); SQRADD2(a[7], a[17]); SQRADD2(a[8], a[16]); SQRADD2(a[9], a[15]); SQRADD2(a[10], a[14]); SQRADD2(a[11], a[13]); SQRADD(a[12], a[12]); 
   COMBA_STORE(b[24]);
   /* output 25 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[25]); SQRADD2(a[1], a[24]); SQRADD2(a[2], a[23]); SQRADD2(a[3], a[22]); SQRADD2(a[4], a[21]); SQRADD2(a[5], a[20]); SQRADD2(a[6], a[19]); SQRADD2(a[7], a[18]); SQRADD2(a[8], a[17]); SQRADD2(a[9], a[16]); SQRADD2(a[10], a[15]); SQRADD2(a[11], a[14]); SQRADD2(a[12], a[13]); 
   COMBA_STORE(b[25]);
   /* output 26 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[26]); SQRADD2(a[1], a[25]); SQRADD2(a[2], a[24]); SQRADD2(a[3], a[23]); SQRADD2(a[4], a[22]); SQRADD2(a[5], a[21]); SQRADD2(a[6], a[20]); SQRADD2(a[7], a[19]); SQRADD2(a[8], a[18]); SQRADD2(a[9], a[17]); SQRADD2(a[10], a[16]); SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]); 
   COMBA_STORE(b[26]);
   /* output 27 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[27]); SQRADD2(a[1], a[26]); SQRADD2(a[2], a[25]); SQRADD2(a[3], a[24]); SQRADD2(a[4], a[23]); SQRADD2(a[5], a[22]); SQRADD2(a[6], a[21]); SQRADD2(a[7], a[20]); SQRADD2(a[8], a[19]); SQRADD2(a[9], a[18]); SQRADD2(a[10], a[17]); SQRADD2(a[11], a[16]); SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]); 
   COMBA_STORE(b[27]);
   /* output 28 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[28]); SQRADD2(a[1], a[27]); SQRADD2(a[2], a[26]); SQRADD2(a[3], a[25]); SQRADD2(a[4], a[24]); SQRADD2(a[5], a[23]); SQRADD2(a[6], a[22]); SQRADD2(a[7], a[21]); SQRADD2(a[8], a[20]); SQRADD2(a[9], a[19]); SQRADD2(a[10], a[18]); SQRADD2(a[11], a[17]); SQRADD2(a[12], a[16]); SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]); 
   COMBA_STORE(b[28]);
   /* output 29 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[29]); SQRADD2(a[1], a[28]); SQRADD2(a[2], a[27]); SQRADD2(a[3], a[26]); SQRADD2(a[4], a[25]); SQRADD2(a[5], a[24]); SQRADD2(a[6], a[23]); SQRADD2(a[7], a[22]); SQRADD2(a[8], a[21]); SQRADD2(a[9], a[20]); SQRADD2(a[10], a[19]); SQRADD2(a[11], a[18]); SQRADD2(a[12], a[17]); SQRADD2(a[13], a[16]); SQRADD2(a[14], a[15]); 
   COMBA_STORE(b[29]);
   /* output 30 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[30]); SQRADD2(a[1], a[29]); SQRADD2(a[2], a[28]); SQRADD2(a[3], a[27]); SQRADD2(a[4], a[26]); SQRADD2(a[5], a[25]); SQRADD2(a[6], a[24]); SQRADD2(a[7], a[23]); SQRADD2(a[8], a[22]); SQRADD2(a[9], a[21]); SQRADD2(a[10], a[20]); SQRADD2(a[11], a[19]); SQRADD2(a[12], a[18]); SQRADD2(a[13], a[17]); SQRADD2(a[14], a[16]); SQRADD(a[15], a[15]); 
   COMBA_STORE(b[30]);
   /* output 31 */
   CARRY_FORWARD;
   SQRADD2(a[0], a[31]); SQRADD2(a[1], a[30]); SQRADD2(a[2], a[29]); SQRADD2(a[3], a[28]); SQRADD2(a[4], a[27]); SQRADD2(a[5], a[26]); SQRADD2(a[6], a[25]); SQRADD2(a[7], a[24]); SQRADD2(a[8], a[23]); SQRADD2(a[9], a[22]); SQRADD2(a[10], a[21]); SQRADD2(a[11], a[20]); SQRADD2(a[12], a[19]); SQRADD2(a[13], a[18]); SQRADD2(a[14], a[17]); SQRADD2(a[15], a[16]); 
   COMBA_STORE(b[31]);
   /* output 32 */
   CARRY_FORWARD;
   SQRADD2(a[1], a[31]); SQRADD2(a[2], a[30]); SQRADD2(a[3], a[29]); SQRADD2(a[4], a[28]); SQRADD2(a[5], a[27]); SQRADD2(a[6], a[26]); SQRADD2(a[7], a[25]); SQRADD2(a[8], a[24]); SQRADD2(a[9], a[23]); SQRADD2(a[10], a[22]); SQRADD2(a[11], a[21]); SQRADD2(a[12], a[20]); SQRADD2(a[13], a[19]); SQRADD2(a[14], a[18]); SQRADD2(a[15], a[17]); SQRADD(a[16], a[16]); 
   COMBA_STORE(b[32]);
   /* output 33 */
   CARRY_FORWARD;
   SQRADD2(a[2], a[31]); SQRADD2(a[3], a[30]); SQRADD2(a[4], a[29]); SQRADD2(a[5], a[28]); SQRADD2(a[6], a[27]); SQRADD2(a[7], a[26]); SQRADD2(a[8], a[25]); SQRADD2(a[9], a[24]); SQRADD2(a[10], a[23]); SQRADD2(a[11], a[22]); SQRADD2(a[12], a[21]); SQRADD2(a[13], a[20]); SQRADD2(a[14], a[19]); SQRADD2(a[15], a[18]); SQRADD2(a[16], a[17]); 
   COMBA_STORE(b[33]);
   /* output 34 */
   CARRY_FORWARD;
   SQRADD2(a[3], a[31]); SQRADD2(a[4], a[30]); SQRADD2(a[5], a[29]); SQRADD2(a[6], a[28]); SQRADD2(a[7], a[27]); SQRADD2(a[8], a[26]); SQRADD2(a[9], a[25]); SQRADD2(a[10], a[24]); SQRADD2(a[11], a[23]); SQRADD2(a[12], a[22]); SQRADD2(a[13], a[21]); SQRADD2(a[14], a[20]); SQRADD2(a[15], a[19]); SQRADD2(a[16], a[18]); SQRADD(a[17], a[17]); 
   COMBA_STORE(b[34]);
   /* output 35 */
   CARRY_FORWARD;
   SQRADD2(a[4], a[31]); SQRADD2(a[5], a[30]); SQRADD2(a[6], a[29]); SQRADD2(a[7], a[28]); SQRADD2(a[8], a[27]); SQRADD2(a[9], a[26]); SQRADD2(a[10], a[25]); SQRADD2(a[11], a[24]); SQRADD2(a[12], a[23]); SQRADD2(a[13], a[22]); SQRADD2(a[14], a[21]); SQRADD2(a[15], a[20]); SQRADD2(a[16], a[19]); SQRADD2(a[17], a[18]); 
   COMBA_STORE(b[35]);
   /* output 36 */
   CARRY_FORWARD;
   SQRADD2(a[5], a[31]); SQRADD2(a[6], a[30]); SQRADD2(a[7], a[29]); SQRADD2(a[8], a[28]); SQRADD2(a[9], a[27]); SQRADD2(a[10], a[26]); SQRADD2(a[11], a[25]); SQRADD2(a[12], a[24]); SQRADD2(a[13], a[23]); SQRADD2(a[14], a[22]); SQRADD2(a[15], a[21]); SQRADD2(a[16], a[20]); SQRADD2(a[17], a[19]); SQRADD(a[18], a[18]); 
   COMBA_STORE(b[36]);
   /* output 37 */
   CARRY_FORWARD;
   SQRADD2(a[6], a[31]); SQRADD2(a[7], a[30]); SQRADD2(a[8], a[29]); SQRADD2(a[9], a[28]); SQRADD2(a[10], a[27]); SQRADD2(a[11], a[26]); SQRADD2(a[12], a[25]); SQRADD2(a[13], a[24]); SQRADD2(a[14], a[23]); SQRADD2(a[15], a[22]); SQRADD2(a[16], a[21]); SQRADD2(a[17], a[20]); SQRADD2(a[18], a[19]); 
   COMBA_STORE(b[37]);
   /* output 38 */
   CARRY_FORWARD;
   SQRADD2(a[7], a[31]); SQRADD2(a[8], a[30]); SQRADD2(a[9], a[29]); SQRADD2(a[10], a[28]); SQRADD2(a[11], a[27]); SQRADD2(a[12], a[26]); SQRADD2(a[13], a[25]); SQRADD2(a[14], a[24]); SQRADD2(a[15], a[23]); SQRADD2(a[16], a[22]); SQRADD2(a[17], a[21]); SQRADD2(a[18], a[20]); SQRADD(a[19], a[19]); 
   COMBA_STORE(b[38]);
   /* output 39 */
   CARRY_FORWARD;
   SQRADD2(a[8], a[31]); SQRADD2(a[9], a[30]); SQRADD2(a[10], a[29]); SQRADD2(a[11], a[28]); SQRADD2(a[12], a[27]); SQRADD2(a[13], a[26]); SQRADD2(a[14], a[25]); SQRADD2(a[15], a[24]); SQRADD2(a[16], a[23]); SQRADD2(a[17], a[22]); SQRADD2(a[18], a[21]); SQRADD2(a[19], a[20]); 
   COMBA_STORE(b[39]);
   /* output 40 */
   CARRY_FORWARD;
   SQRADD2(a[9], a[31]); SQRADD2(a[10], a[30]); SQRADD2(a[11], a[29]); SQRADD2(a[12], a[28]); SQRADD2(a[13], a[27]); SQRADD2(a[14], a[26]); SQRADD2(a[15], a[25]); SQRADD2(a[16], a[24]); SQRADD2(a[17], a[23]); SQRADD2(a[18], a[22]); SQRADD2(a[19], a[21]); SQRADD(a[20], a[20]); 
   COMBA_STORE(b[40]);
   /* output 41 */
   CARRY_FORWARD;
   SQRADD2(a[10], a[31]); SQRADD2(a[11], a[30]); SQRADD2(a[12], a[29]); SQRADD2(a[13], a[28]); SQRADD2(a[14], a[27]); SQRADD2(a[15], a[26]); SQRADD2(a[16], a[25]); SQRADD2(a[17], a[24]); SQRADD2(a[18], a[23]); SQRADD2(a[19], a[22]); SQRADD2(a[20], a[21]); 
   COMBA_STORE(b[41]);
   /* output 42 */
   CARRY_FORWARD;
   SQRADD2(a[11], a[31]); SQRADD2(a[12], a[30]); SQRADD2(a[13], a[29]); SQRADD2(a[14], a[28]); SQRADD2(a[15], a[27]); SQRADD2(a[16], a[26]); SQRADD2(a[17], a[25]); SQRADD2(a[18], a[24]); SQRADD2(a[19], a[23]); SQRADD2(a[20], a[22]); SQRADD(a[21], a[21]); 
   COMBA_STORE(b[42]);
   /* output 43 */
   CARRY_FORWARD;
   SQRADD2(a[12], a[31]); SQRADD2(a[13], a[30]); SQRADD2(a[14], a[29]); SQRADD2(a[15], a[28]); SQRADD2(a[16], a[27]); SQRADD2(a[17], a[26]); SQRADD2(a[18], a[25]); SQRADD2(a[19], a[24]); SQRADD2(a[20], a[23]); SQRADD2(a[21], a[22]); 
   COMBA_STORE(b[43]);
   /* output 44 */
   CARRY_FORWARD;
   SQRADD2(a[13], a[31]); SQRADD2(a[14], a[30]); SQRADD2(a[15], a[29]); SQRADD2(a[16], a[28]); SQRADD2(a[17], a[27]); SQRADD2(a[18], a[26]); SQRADD2(a[19], a[25]); SQRADD2(a[20], a[24]); SQRADD2(a[21], a[23]); SQRADD(a[22], a[22]); 
   COMBA_STORE(b[44]);
   /* output 45 */
   CARRY_FORWARD;
   SQRADD2(a[14], a[31]); SQRADD2(a[15], a[30]); SQRADD2(a[16], a[29]); SQRADD2(a[17], a[28]); SQRADD2(a[18], a[27]); SQRADD2(a[19], a[26]); SQRADD2(a[20], a[25]); SQRADD2(a[21], a[24]); SQRADD2(a[22], a[23]); 
   COMBA_STORE(b[45]);
   /* output 46 */
   CARRY_FORWARD;
   SQRADD2(a[15], a[31]); SQRADD2(a[16], a[30]); SQRADD2(a[17], a[29]); SQRADD2(a[18], a[28]); SQRADD2(a[19], a[27]); SQRADD2(a[20], a[26]); SQRADD2(a[21], a[25]); SQRADD2(a[22], a[24]); SQRADD(a[23], a[23]); 
   COMBA_STORE(b[46]);
   /* output 47 */
   CARRY_FORWARD;
   SQRADD2(a[16], a[31]); SQRADD2(a[17], a[30]); SQRADD2(a[18], a[29]); SQRADD2(a[19], a[28]); SQRADD2(a[20], a[27]); SQRADD2(a[21], a[26]); SQRADD2(a[22], a[25]); SQRADD2(a[23], a[24]); 
   COMBA_STORE(b[47]);
   /* output 48 */
   CARRY_FORWARD;
   SQRADD2(a[17], a[31]); SQRADD2(a[18], a[30]); SQRADD2(a[19], a[29]); SQRADD2(a[20], a[28]); SQRADD2(a[21], a[27]); SQRADD2(a[22], a[26]); SQRADD2(a[23], a[25]); SQRADD(a[24], a[24]); 
   COMBA_STORE(b[48]);
   /* output 49 */
   CARRY_FORWARD;
   SQRADD2(a[18], a[31]); SQRADD2(a[19], a[30]); SQRADD2(a[20], a[29]); SQRADD2(a[21], a[28]); SQRADD2(a[22], a[27]); SQRADD2(a[23], a[26]); SQRADD2(a[24], a[25]); 
   COMBA_STORE(b[49]);
   /* output 50 */
   CARRY_FORWARD;
   SQRADD2(a[19], a[31]); SQRADD2(a[20], a[30]); SQRADD2(a[21], a[29]); SQRADD2(a[22], a[28]); SQRADD2(a[23], a[27]); SQRADD2(a[24], a[26]); SQRADD(a[25], a[25]); 
   COMBA_STORE(b[50]);
   /* output 51 */
   CARRY_FORWARD;
   SQRADD2(a[20], a[31]); SQRADD2(a[21], a[30]); SQRADD2(a[22], a[29]); SQRADD2(a[23], a[28]); SQRADD2(a[24], a[27]); SQRADD2(a[25], a[26]); 
   COMBA_STORE(b[51]);
   /* output 52 */
   CARRY_FORWARD;
   SQRADD2(a[21], a[31]); SQRADD2(a[22], a[30]); SQRADD2(a[23], a[29]); SQRADD2(a[24], a[28]); SQRADD2(a[25], a[27]); SQRADD(a[26], a[26]); 
   COMBA_STORE(b[52]);
   /* output 53 */
   CARRY_FORWARD;
   SQRADD2(a[22], a[31]); SQRADD2(a[23], a[30]); SQRADD2(a[24], a[29]); SQRADD2(a[25], a[28]); SQRADD2(a[26], a[27]); 
   COMBA_STORE(b[53]);
   /* output 54 */
   CARRY_FORWARD;
   SQRADD2(a[23], a[31]); SQRADD2(a[24], a[30]); SQRADD2(a[25], a[29]); SQRADD2(a[26], a[28]); SQRADD(a[27], a[27]); 
   COMBA_STORE(b[54]);
   /* output 55 */
   CARRY_FORWARD;
   SQRADD2(a[24], a[31]); SQRADD2(a[25], a[30]); SQRADD2(a[26], a[29]); SQRADD2(a[27], a[28]); 
   COMBA_STORE(b[55]);
   /* output 56 */
   CARRY_FORWARD;
   SQRADD2(a[25], a[31]); SQRADD2(a[26], a[30]); SQRADD2(a[27], a[29]); SQRADD(a[28], a[28]); 
   COMBA_STORE(b[56]);
   /* output 57 */
   CARRY_FORWARD;
   SQRADD2(a[26], a[31]); SQRADD2(a[27], a[30]); SQRADD2(a[28], a[29]); 
   COMBA_STORE(b[57]);
   /* output 58 */
   CARRY_FORWARD;
   SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]); 
   COMBA_STORE(b[58]);
   /* output 59 */
   CARRY_FORWARD;
   SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]); 
   COMBA_STORE(b[59]);
   /* output 60 */
   CARRY_FORWARD;
   SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]); 
   COMBA_STORE(b[60]);
   /* output 61 */
   CARRY_FORWARD;
   SQRADD2(a[30], a[31]); 
   COMBA_STORE(b[61]);
   /* output 62 */
   CARRY_FORWARD;
   SQRADD(a[31], a[31]); 
   COMBA_STORE(b[62]);
   COMBA_STORE2(b[63]);
   COMBA_FINI;
   B->used = 64;
   B->sign = FP_ZPOS;
   memcpy(B->dp, b, 64 * sizeof(fp_digit));
   fp_clamp(B);
 }
 #endif
--- a/fp_sqrmod.c
+++ b/fp_sqrmod.c
@ -0,0 +1,19 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a * a (mod b)
  *
  * The square of a is formed in a local temporary, which is then reduced
  * modulo b into c.  The return value is whatever fp_mod() reports.
  */
 int fp_sqrmod(fp_int *a, fp_int *b, fp_int *c)
 {
   fp_int square;

   fp_zero(&square);
   fp_sqr(a, &square);
   return fp_mod(&square, b, c);
 }
--- a/fp_sub.c
+++ b/fp_sub.c
@ -0,0 +1,46 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a - b
  *
  * Signed subtraction expressed through the unsigned helpers:
  *   - operands of different sign: the magnitudes are added and the result
  *     takes the sign of a;
  *   - operands of equal sign: the smaller magnitude is subtracted from the
  *     larger one; the result keeps the sign of a when |a| >= |b| and gets
  *     the opposite sign otherwise.
  * Both input signs are captured before c is written.
  */
 void fp_sub(fp_int *a, fp_int *b, fp_int *c)
 {
   const int sign_a = a->sign;
   const int sign_b = b->sign;

   /* (+a) - (-b) or (-a) - (+b): magnitudes add up, sign follows a */
   if (sign_a != sign_b) {
     c->sign = sign_a;
     s_fp_add (a, b, c);
     return;
   }

   /* same sign from here on: the result is a difference of magnitudes */
   if (fp_cmp_mag (a, b) == FP_LT) {
     /* |b| is larger, so the result flips to the opposite sign of a */
     c->sign = (sign_a == FP_ZPOS) ? FP_NEG : FP_ZPOS;
     s_fp_sub (b, a, c);
   } else {
     /* |a| is larger or equal, so the sign of a is kept */
     c->sign = sign_a;
     s_fp_sub (a, b, c);
   }
 }
--- a/fp_sub_d.c
+++ b/fp_sub_d.c
@ -0,0 +1,18 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* c = a - b, where b is a single digit
  *
  * The digit is loaded into a local fp_int with fp_set() and the actual
  * subtraction is delegated to fp_sub().
  */
 void fp_sub_d(fp_int *a, fp_digit b, fp_int *c)
 {
    fp_int digit;

    fp_set(&digit, b);
    fp_sub(a, &digit, c);
 }
--- a/fp_submod.c
+++ b/fp_submod.c
@ -0,0 +1,20 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* d = a - b (mod c)
  *
  * The difference a - b is formed in a local temporary, which is then
  * reduced modulo c into d.  The return value is whatever fp_mod() reports.
  */
 int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 {
   fp_int diff;

   fp_zero(&diff);
   fp_sub(a, b, &diff);
   return fp_mod(&diff, c, d);
 }
--- a/fp_to_signed_bin.c
+++ b/fp_to_signed_bin.c
@ -0,0 +1,16 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Store a as one sign byte followed by its magnitude.
  *
  * b[0] receives 0 when a->sign is FP_ZPOS and 1 otherwise; the magnitude is
  * written by fp_to_unsigned_bin() starting at b[1].  No bounds checking is
  * done here.
  */
 void fp_to_signed_bin(fp_int *a, unsigned char *b)
 {
   unsigned char *magnitude = b + 1;

   fp_to_unsigned_bin (a, magnitude);
   b[0] = (unsigned char) (a->sign != FP_ZPOS);
 }
--- a/fp_to_unsigned_bin.c
+++ b/fp_to_unsigned_bin.c
@ -0,0 +1,25 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Write the magnitude of a into b as a sequence of bytes.
  *
  * A working copy of a is consumed eight bits at a time: the low byte of its
  * least significant digit is stored, then the copy is shifted right by eight
  * bits with fp_div_2d(), until the copy is zero.  The bytes are therefore
  * produced least-significant first and are handed to bn_reverse() afterwards
  * (presumably to flip them into most-significant-first order -- bn_reverse
  * is defined elsewhere).  Nothing is stored when a is zero.
  */
 void fp_to_unsigned_bin(fp_int *a, unsigned char *b)
 {
   fp_int  scratch;
   int     nbytes;

   fp_init_copy(&scratch, a);
   for (nbytes = 0; fp_iszero (&scratch) == FP_NO; ++nbytes) {
       b[nbytes] = (unsigned char) (scratch.dp[0] & 255);
       fp_div_2d (&scratch, 8, &scratch, NULL);
   }
   bn_reverse (b, nbytes);
 }
--- a/fp_toradix.c
+++ b/fp_toradix.c
@ -0,0 +1,55 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Convert a to a NUL-terminated string in the given radix.
  *
  * Returns FP_VAL when radix is outside 2..64, FP_OKAY otherwise.  A zero
  * value produces "0"; a negative value is prefixed with '-'.  Digits are
  * looked up in fp_s_rmap[].  No bounds checking is done on str.
  */
 int fp_toradix(fp_int *a, char *str, int radix)
 {
   fp_int   work;
   fp_digit rem;
   char    *first_digit;
   int      ndigits;

   /* only radixes 2..64 are accepted */
   if (radix < 2 || radix > 64) {
     return FP_VAL;
   }

   /* zero is emitted directly */
   if (fp_iszero(a) == 1) {
      str[0] = '0';
      str[1] = '\0';
      return FP_OKAY;
   }

   fp_init_copy(&work, a);

   /* a negative value gets a leading '-' and is then handled as positive */
   if (work.sign == FP_NEG) {
     *str++ = '-';
     work.sign = FP_ZPOS;
   }

   /* the digits start here, after the optional sign */
   first_digit = str;

   /* peel off one digit per division; they come out least significant first */
   for (ndigits = 0; fp_iszero (&work) == FP_NO; ++ndigits) {
     fp_div_d (&work, (fp_digit) radix, &work, &rem);
     *str++ = fp_s_rmap[rem];
   }

   /* reverse only the digits (the sign stays in front), then terminate */
   bn_reverse ((unsigned char *)first_digit, ndigits);
   *str = '\0';
   return FP_OKAY;
 }
--- a/fp_unsigned_bin_size.c
+++ b/fp_unsigned_bin_size.c
@ -0,0 +1,16 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* Number of bytes needed to hold fp_count_bits(a) bits, rounding any
  * partial byte up to a whole one.
  */
 int fp_unsigned_bin_size(fp_int *a)
 {
   int bits  = fp_count_bits (a);
   int bytes = bits / 8;

   if ((bits & 7) != 0) {
     ++bytes;
   }
   return bytes;
 }
--- a/gen.pl
+++ b/gen.pl
@ -0,0 +1,17 @@
 #!/usr/bin/perl -w
 #
 # Generates a "single file" (mpi.c) you can use to quickly
 # add the whole source without any makefile troubles.
 #
 # Every library source file is copied verbatim into mpi.c between
 # "Start"/"End" marker comments.  The s_fp_*.c files are picked up in
 # addition to fp_*.c because the makefile builds s_fp_add.o and
 # s_fp_sub.o into the library as well; without them the generated file
 # would be missing those two routines.
 #
 use strict;
 open( my $out, '>', 'mpi.c' ) or die "Couldn't open mpi.c for writing: $!";
 foreach my $filename (glob "fp_*.c s_fp_*.c") {
    open( my $src, '<', $filename ) or die "Couldn't open $filename for reading: $!";
    print {$out} "/* Start: $filename */\n";
    print {$out} $_ while <$src>;
    print {$out} "\n/* End: $filename */\n\n";
    close $src or die "Error closing $filename after reading: $!";
 }
 print {$out} "\n/* EOF */\n";
 close $out or die "Error closing mpi.c after writing: $!";
--- a/78
+++ b/78
@ -0,0 +1,78 @@
 #makefile for TomsFastMath
 #
 # Builds the static library libtfm.a plus the demo/test programs, the
 # documentation and the release archives.
 #
 CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
 #profiling
 #PROF=-pg -g
 #CFLAGS += $(PROF)
 #speed
 CFLAGS += -fomit-frame-pointer
 VERSION=0.01
 default: libtfm.a
 # targets that do not name a file they create
 .PHONY: default docdvi docs clean zipup
 # every object that goes into the library; the list ends on its last
 # entry so that no trailing continuation can swallow the next rule
 OBJECTS = \
 fp_set.o \
 \
 fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
 fp_mul_2.o fp_div_2.o  \
 \
 fp_cnt_lsb.o \
 \
 fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
 s_fp_add.o s_fp_sub.o \
 \
 fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
 fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
 fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
 fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
 \
 fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
 \
 fp_exptmod.o \
 \
 fp_cmp.o fp_cmp_mag.o \
 \
 fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
 fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
 fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o
 # the static library itself
 libtfm.a: $(OBJECTS)
 	$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
 	ranlib libtfm.a
 # test-vector generator; $(MAKE) keeps command-line flags in the sub-make
 mtest/mtest: mtest/mtest.c
 	cd mtest ; $(MAKE) mtest
 # demo program that consumes the vectors produced by mtest
 test: libtfm.a demo/test.o mtest/mtest
 	$(CC) demo/test.o libtfm.a $(PROF) -o test
 # stand-alone demo program
 stest: libtfm.a demo/stest.o 
 	$(CC) demo/stest.o libtfm.a -o stest
 # documentation: latex is run repeatedly around makeindex
 docdvi: tfm.tex
 	touch tfm.ind
 	latex tfm >/dev/null
 	latex tfm >/dev/null
 	makeindex tfm
 	latex tfm >/dev/null
 docs: docdvi
 	latex tfm >/dev/null
 	dvipdf tfm
 	mv -f tfm.pdf doc
 # remove build products here and in mtest/
 clean:
 	rm -f $(OBJECTS) *.a demo/*.o test tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc stest
 	cd mtest ; $(MAKE) clean
 # release packaging: regenerates pre_gen/mpi.c and builds .tar.bz2/.zip
 # archives in the parent directory
 zipup: docs clean
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf tfm* tomsfastmath-$(VERSION) ; mkdir tomsfastmath-$(VERSION) ; \
 	cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \
 	tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 ; \
 	zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/*
--- a/makefile.gba
+++ b/makefile.gba
@ -0,0 +1,55 @@
 #makefile for TomsFastMath
 #
 #For the GameboyAdvance... er.... ARMv4
 #
 # SFLAGS (thumb) is used for the demo program, CFLAGS (arm) for the library.
 SFLAGS = $(CFLAGS) -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -mthumb -mthumb-interwork -I../devkitadv/mylib/lib
 CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -marm -mthumb-interwork -I../devkitadv/mylib/lib
 #profiling
 #PROF=-pg -g
 #CFLAGS += $(PROF)
 #speed
 CFLAGS += -fomit-frame-pointer
 VERSION=0.01
 default: libtfm.a
 # "default" does not name a file it creates
 .PHONY: default
 # every object that goes into the library; the list ends on its last
 # entry so that no trailing continuation can swallow the next rule
 OBJECTS = \
 fp_set.o \
 \
 fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
 fp_mul_2.o fp_div_2.o  \
 \
 fp_cnt_lsb.o \
 \
 fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
 s_fp_add.o s_fp_sub.o \
 \
 fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
 fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
 fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
 fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
 \
 fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
 \
 fp_exptmod.o \
 \
 fp_cmp.o fp_cmp_mag.o \
 \
 fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
 fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
 fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o
 # the static library itself
 libtfm.a: $(OBJECTS)
 	$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
 	ranlib libtfm.a
 # the demo is compiled with SFLAGS and GBA_MODE defined
 demo/stest.o: demo/stest.c
 	$(CC) $(SFLAGS) -DGBA_MODE demo/stest.c -c -o demo/stest.o
 # links stest.elf and converts it to a raw binary image
 stest: libtfm.a demo/stest.o 
 	$(CC) -mthumb -mthumb-interwork demo/stest.o libtfm.a ../devkitadv/mylib/lib/gba.a -o stest.elf
 	objcopy -O binary stest.elf stest.bin
--- a/mtest/makefile
+++ b/mtest/makefile
@ -0,0 +1,9 @@
 # builds mtest, the test-vector generator (links against libtommath)
 CFLAGS += -Wall -W -O3
 default: mtest
 # neither target names a file it creates, so a stray file called
 # "default" or "clean" must not stop them from running
 .PHONY: default clean
 mtest: mtest.o
 	$(CC) mtest.o -ltommath -o mtest
 clean:
 	rm -f *.o mtest
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -0,0 +1,320 @@
 /* makes a bignum test harness with NUM tests per operation
 *
 * the output is made in the following format [one parameter per line]
 operation
 operand1
 operand2
 [... operandN]
 result1
 result2
 [... resultN]
 So for example "a * b mod n" would be
 mulmod
 a
 b
 n
 a*b mod n
 e.g. if a=3, b=4 n=11 then
 mulmod
 3
 4
 11
 1
 */
 #ifdef MP_8BIT
 #define THE_MASK 127
 #else
 #define THE_MASK 32767
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <tommath.h>
 FILE *rng;
 /* 1-2048 bit numbers
  *
  * Reads a random length of 1..256 bytes, a random sign byte (0 or 1) and
  * that many magnitude bytes from rng, forces the leading magnitude byte to
  * be non-zero and loads the result into a with mp_read_raw().
  *
  * The two length bytes are read in separate statements so their order is
  * defined, EOF is mapped to 0 before shifting (shifting the negative EOF
  * value is undefined), and the buffer starts zeroed so a short fread()
  * never hands uninitialised bytes to mp_read_raw().
  */
 void rand_num(mp_int *a)
 {
   int hi, lo, size;
   unsigned char buf[2048] = { 0 };

   hi = fgetc(rng);
   lo = fgetc(rng);
   if (hi == EOF) hi = 0;
   if (lo == EOF) lo = 0;
   size = 1 + ((hi << 8) + lo) % 256;

   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
   /* at EOF fgetc() yields EOF, which stores as a non-zero byte and ends the loop */
   while (buf[1] == 0) buf[1] = fgetc(rng);
   mp_read_raw(a, buf, 1+size);
 }
 /* 1-256 bit numbers (to test things like exptmod)
  *
  * Same scheme as rand_num() but with a length of 1..32 bytes: a random sign
  * byte (0 or 1) followed by the magnitude bytes, with the leading magnitude
  * byte forced non-zero, loaded into a with mp_read_raw().
  *
  * The two length bytes are read in separate statements so their order is
  * defined, EOF is mapped to 0 before shifting (shifting the negative EOF
  * value is undefined), and the buffer starts zeroed so a short fread()
  * never hands uninitialised bytes to mp_read_raw().
  */
 void rand_num2(mp_int *a)
 {
   int hi, lo, size;
   unsigned char buf[2048] = { 0 };

   hi = fgetc(rng);
   lo = fgetc(rng);
   if (hi == EOF) hi = 0;
   if (lo == EOF) lo = 0;
   size = 1 + ((hi << 8) + lo) % 32;

   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
   /* at EOF fgetc() yields EOF, which stores as a non-zero byte and ends the loop */
   while (buf[1] == 0) buf[1] = fgetc(rng);
   mp_read_raw(a, buf, 1+size);
 }
 #define mp_to64(a, b) mp_toradix(a, b, 64)
 /* print one bignum in radix 64 on its own line; buf is caller-provided scratch */
 static void put_num(mp_int *x, char *buf)
 {
   mp_to64(x, buf);
   printf("%s\n", buf);
 }

 /* Test-vector generator.
  *
  * Repeatedly picks one of sixteen operations at random, runs it with
  * LibTomMath on random operands and prints the operation name, the operands
  * and the results, one per line, in the format described at the top of this
  * file.  Random bytes come from /dev/urandom, then /dev/random, then stdin.
  *
  * The loop ends when the random source reports end of file on the selector
  * byte (only reachable with a finite source such as redirected stdin);
  * without that check the selector would be negative, no operation would
  * match and the loop would spin forever without producing output.
  */
 int main(void)
 {
   int n, tmp, ch;
   mp_int a, b, c, d, e;
   clock_t t1;
   char buf[4096];
   mp_init(&a);
   mp_init(&b);
   mp_init(&c);
   mp_init(&d);
   mp_init(&e);
   /* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */
 /*
   mp_set(&a, 1);
   for (n = 1; n < 8192; n++) {
       mp_mul(&a, &a, &c);
       printf("mul\n");
       mp_to64(&a, buf);
       printf("%s\n%s\n", buf, buf);
       mp_to64(&c, buf);
       printf("%s\n", buf);
       mp_add_d(&a, 1, &a);
       mp_mul_2(&a, &a);
       mp_sub_d(&a, 1, &a);
   }
 */
   rng = fopen("/dev/urandom", "rb");
   if (rng == NULL) {
      rng = fopen("/dev/random", "rb");
      if (rng == NULL) {
         fprintf(stderr, "\nWarning:  stdin used as random source\n\n");
         rng = stdin;
      }
   }
   t1 = clock();
   for (;;) {
 #if 0
      if (clock() - t1 > CLOCKS_PER_SEC) {
         sleep(2);
         t1 = clock();
      }
 #endif
      /* operation selector; stop cleanly once the source is exhausted */
      ch = fgetc(rng);
      if (ch == EOF) {
         break;
      }
      n = ch % 16;

      switch (n) {
      case 0:
         /* add tests */
         rand_num(&a);
         rand_num(&b);
         mp_add(&a, &b, &c);
         printf("add\n");
         put_num(&a, buf);
         put_num(&b, buf);
         put_num(&c, buf);
         break;
      case 1:
         /* sub tests */
         rand_num(&a);
         rand_num(&b);
         mp_sub(&a, &b, &c);
         printf("sub\n");
         put_num(&a, buf);
         put_num(&b, buf);
         put_num(&c, buf);
         break;
      case 2:
         /* mul tests */
         rand_num(&a);
         rand_num(&b);
         mp_mul(&a, &b, &c);
         printf("mul\n");
         put_num(&a, buf);
         put_num(&b, buf);
         put_num(&c, buf);
         break;
      case 3:
         /* div tests: quotient in c, remainder in d */
         rand_num(&a);
         rand_num(&b);
         mp_div(&a, &b, &c, &d);
         printf("div\n");
         put_num(&a, buf);
         put_num(&b, buf);
         put_num(&c, buf);
         put_num(&d, buf);
         break;
      case 4:
         /* sqr tests */
         rand_num(&a);
         mp_sqr(&a, &b);
         printf("sqr\n");
         put_num(&a, buf);
         put_num(&b, buf);
         break;
      case 5:
         /* mul_2d test; n is reused as the shift count (0..63) */
         rand_num(&a);
         mp_copy(&a, &b);
         n = fgetc(rng) & 63;
         mp_mul_2d(&b, n, &b);
         printf("mul2d\n");
         put_num(&a, buf);
         printf("%d\n", n);
         put_num(&b, buf);
         break;
      case 6:
         /* div_2d test; n is reused as the shift count (0..63) */
         rand_num(&a);
         mp_copy(&a, &b);
         n = fgetc(rng) & 63;
         mp_div_2d(&b, n, &b, NULL);
         printf("div2d\n");
         put_num(&a, buf);
         printf("%d\n", n);
         put_num(&b, buf);
         break;
      case 7:
         /* gcd test, on non-negative operands */
         rand_num(&a);
         rand_num(&b);
         a.sign = MP_ZPOS;
         b.sign = MP_ZPOS;
         mp_gcd(&a, &b, &c);
         printf("gcd\n");
         put_num(&a, buf);
         put_num(&b, buf);
         put_num(&c, buf);
         break;
      case 8:
         /* lcm test, on non-negative operands */
         rand_num(&a);
         rand_num(&b);
         a.sign = MP_ZPOS;
         b.sign = MP_ZPOS;
         mp_lcm(&a, &b, &c);
         printf("lcm\n");
         put_num(&a, buf);
         put_num(&b, buf);
         put_num(&c, buf);
         break;
      case 9:
         /* exptmod test: small non-negative operands, odd modulus of more
          * than two digits; base and exponent are not reduced below it */
         rand_num2(&a);
         rand_num2(&b);
         rand_num2(&c);
         a.sign = b.sign = c.sign = 0;
         c.dp[0] |= 1;
         if (c.used <= 2) continue;
         mp_exptmod(&a, &b, &c, &d);
         printf("expt\n");
         put_num(&a, buf);
         put_num(&b, buf);
         put_num(&c, buf);
         put_num(&d, buf);
         break;
      case 10:
         /* invmod test: odd modulus b != 1 with gcd(a, b) == 1 */
         rand_num2(&a);
         rand_num2(&b);
         b.dp[0] |= 1;
         b.sign = MP_ZPOS;
         a.sign = MP_ZPOS;
         mp_gcd(&a, &b, &c);
         if (mp_cmp_d(&c, 1) != 0) continue;
         if (mp_cmp_d(&b, 1) == 0) continue;
         mp_invmod(&a, &b, &c);
         printf("invmod\n");
         put_num(&a, buf);
         put_num(&b, buf);
         put_num(&c, buf);
         break;
      case 11:
         /* div2 test; a is doubled first so the halving is exact */
         rand_num(&a);
         mp_mul_2(&a, &a);
         mp_div_2(&a, &b);
         printf("div2\n");
         put_num(&a, buf);
         put_num(&b, buf);
         break;
      case 12:
         /* mul2 test */
         rand_num(&a);
         mp_mul_2(&a, &b);
         printf("mul2\n");
         put_num(&a, buf);
         put_num(&b, buf);
         break;
      case 13:
         /* add_d test with a single-digit operand */
         rand_num(&a);
         tmp = abs(rand()) & THE_MASK;
         mp_add_d(&a, tmp, &b);
         printf("add_d\n");
         put_num(&a, buf);
         printf("%d\n", tmp);
         put_num(&b, buf);
         break;
      case 14:
         /* sub_d test with a single-digit operand */
         rand_num(&a);
         tmp = abs(rand()) & THE_MASK;
         mp_sub_d(&a, tmp, &b);
         printf("sub_d\n");
         put_num(&a, buf);
         printf("%d\n", tmp);
         put_num(&b, buf);
         break;
      case 15:
         /* mul_d test with a single-digit operand */
         rand_num(&a);
         tmp = abs(rand()) & THE_MASK;
         mp_mul_d(&a, tmp, &b);
         printf("mul_d\n");
         put_num(&a, buf);
         printf("%d\n", tmp);
         put_num(&b, buf);
         break;
      default:
         /* not reachable: n is ch % 16 with ch >= 0 */
         break;
      }
   }
   fclose(rng);
   return 0;
 }
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
--- a/random_txt_files/amd64.txt
+++ b/random_txt_files/amd64.txt
@ -0,0 +1,43 @@
 AMD64 timings
 using ISO C
 mult
  512-bit:       496
 1024-bit:      1717
 2048-bit:      7200
 sqr
  512-bit:       448
 1024-bit:      1760
 2048-bit:      7099
 mont
  512-bit:      1416
 1024-bit:      5156
 2048-bit:     20820
 expt
  512-bit:   1520207
 1024-bit:  10603520
 2048-bit:  84893649
 using amd64
 mult
  512-bit:       292
 1024-bit:       945
 2048-bit:      3620
 sqr
  512-bit:       238
 1024-bit:       801
 2048-bit:      2853
 mont
  512-bit:       731
 1024-bit:      1730
 2048-bit:      5462
 Exptmod:
  512-bit:    641743
 1024-bit:   3167406
 2048-bit:  20158609
 LTM exptmods
 Exponentiating   513-bit =>       825/sec,   2183028 cycles
 Exponentiating  1025-bit =>       151/sec,  11900720 cycles
 Exponentiating  2049-bit =>        24/sec,  72376416 cycles
--- a/random_txt_files/exptmod_timings.txt
+++ b/random_txt_files/exptmod_timings.txt
@ -0,0 +1,45 @@
 LTM timings:
 Athlon Barton
 Exponentiating   513-bit =>       561/sec,   3909824 cycles
 Exponentiating  1025-bit =>       103/sec,  21175496 cycles
 Exponentiating  2049-bit =>        16/sec, 129845554 cycles
 P4 Northwood
 Exponentiating   513-bit =>       284/sec,   9884722 cycles
 Exponentiating  1025-bit =>        47/sec,  59090432 cycles
 Exponentiating  2049-bit =>         6/sec, 427456070 cycles
 TFM timings:
 Athlon Barton
  512-bit:   2289257
 1024-bit:  12871373
 2048-bit:  97211357
 P4 Northwood [x86-32]
  512-bit:   8015598
 1024-bit:  55559304
 2048-bit: 409861746
 P4 Northwood [SSE2]
  512-bit:   5895000
 1024-bit:  39648730
 2048-bit: 304110670
 <center>
 <table border=1 width=100%>
 <tr><td>Processor</td><td>Size in bits</td><td>x86-32</td> <td>x86-64</td><td>SSE2</td><td>LTM</td></tr>
 <tr><td>P4       </td><td>512         </td><td>8015598</td><td></td>      <td>5895000</td><td>9884722</td></tr>
 <tr><td>         </td><td>1024        </td><td>55559304</td><td></td>     <td>39648730</td><td>59090432</td></tr>
 <tr><td>         </td><td>2048        </td><td>409861746</td><td></td>    <td>304110670</td><td>427456070</td></tr>
 <tr><td>Athlon Barton</td><td>512     </td><td>2289257</td><td></td><td></td><td>3909824</td></tr>
 <tr><td>             </td><td>1024    </td><td>12871373</td><td></td><td></td><td>21175496</td></tr>
 <tr><td>             </td><td>2048    </td><td>97211357</td><td></td><td></td><td>129845554</td></tr>
 <tr><td>Athlon64     </td><td>512     </td><td></td><td>641743</td><td></td><td>2183028</td></tr>
 <tr><td>             </td><td>1024    </td><td></td><td>3167406</td><td></td><td>11900720</td></tr>
 <tr><td>             </td><td>2048    </td><td></td><td>20158609</td><td></td><td>72376416</td></tr>
 </table>
 <b>Cycles per operation</b>
 </center>
--- a/random_txt_files/ltm_times.txt
+++ b/random_txt_files/ltm_times.txt
@ -0,0 +1,37 @@
 LTM Timings...
 Multiplying      140-bit =>   2950763/sec,       952 cycles
 Multiplying      196-bit =>   2150939/sec,      1306 cycles
 Multiplying      252-bit =>   1357066/sec,      2070 cycles
 Multiplying      308-bit =>   1055269/sec,      2662 cycles
 Multiplying      364-bit =>    817557/sec,      3436 cycles
 Multiplying      420-bit =>    636413/sec,      4414 cycles
 Multiplying      475-bit =>    536912/sec,      5232 cycles
 Multiplying      531-bit =>    433641/sec,      6478 cycles
 Multiplying      588-bit =>    372069/sec,      7550 cycles
 Multiplying      644-bit =>    322813/sec,      8702 cycles
 Multiplying      698-bit =>    275566/sec,     10194 cycles
 Multiplying      753-bit =>    242082/sec,     11604 cycles
 Multiplying      809-bit =>    214797/sec,     13078 cycles
 Multiplying      867-bit =>    189626/sec,     14814 cycles
 Multiplying      921-bit =>    168858/sec,     16636 cycles
 Multiplying      978-bit =>    151598/sec,     18530 cycles
 Multiplying     1036-bit =>    137580/sec,     20418 cycles
 Multiplying     1091-bit =>    124661/sec,     22534 cycles
 Multiplying     1148-bit =>    111677/sec,     25154 cycles
 Multiplying     1199-bit =>    102762/sec,     27336 cycles
 Multiplying     1258-bit =>     94519/sec,     29720 cycles
 Multiplying     1316-bit =>     86975/sec,     32298 cycles
 Multiplying     1371-bit =>     79754/sec,     35222 cycles
 Multiplying     1427-bit =>     74473/sec,     37720 cycles
 Multiplying     1483-bit =>     68827/sec,     40814 cycles
 Multiplying     1537-bit =>     63644/sec,     44138 cycles
 Multiplying     1595-bit =>     59646/sec,     47096 cycles
 Multiplying     1651-bit =>     56469/sec,     49746 cycles
 Multiplying     1708-bit =>     52640/sec,     53364 cycles
 Multiplying     1764-bit =>     49823/sec,     56382 cycles
 Multiplying     1819-bit =>     46856/sec,     59952 cycles
 Multiplying     1875-bit =>     44264/sec,     63462 cycles
 Multiplying     1929-bit =>     41641/sec,     67460 cycles
 Multiplying     1985-bit =>     39539/sec,     71046 cycles
 Multiplying     2044-bit =>     37591/sec,     74728 cycles
--- a/random_txt_files/old_sqr_times.txt
+++ b/random_txt_files/old_sqr_times.txt
@ -0,0 +1,14 @@
 I started with:
  512-bit:     16338
 1024-bit:     51020
 2048-bit:    142718
 My x86-32
  512-bit:      2864
 1024-bit:     10615
 2048-bit:     41807
 My SSE2
  512-bit:      2168
 1024-bit:      7727
 2048-bit:     33163
--- a/s_fp_add.c
+++ b/s_fp_add.c
@ -0,0 +1,37 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* unsigned addition: c = |a| + |b|
  *
  * Adds the digit arrays of a and b over max(a->used, b->used) digits with a
  * running carry kept in an fp_word.  A final carry is stored as an extra
  * digit unless the digit index has already reached FP_SIZE, in which case
  * it is dropped.  Digits of c left over from its previous value are zeroed
  * and the result is passed through fp_clamp().  The sign fields are not
  * read or written here.
  */
 void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
 {
   int      i, limit, prev_used;
   fp_word  acc;

   limit     = MAX(a->used, b->used);
   prev_used = c->used;
   c->used   = limit;

   /* digit-wise sum; acc carries the overflow into the next digit */
   acc = 0;
   i   = 0;
   while (i < limit) {
       acc       += ((fp_word)a->dp[i]) + ((fp_word)b->dp[i]);
       c->dp[i]   = (fp_digit)acc;
       acc      >>= DIGIT_BIT;
       ++i;
   }

   /* append the final carry when there is still room for another digit */
   if (acc != 0 && i != FP_SIZE) {
      c->dp[c->used] = (fp_digit)acc;
      c->used       += 1;
      i             += 1;
   }

   /* wipe whatever the old value of c had above the new top digit */
   while (i < prev_used) {
      c->dp[i] = 0;
      ++i;
   }
   fp_clamp(c);
 }
--- a/s_fp_sub.c
+++ b/s_fp_sub.c
@ -0,0 +1,31 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #include <tfm.h>
 /* unsigned subtraction: c = |a| - |b|, ||a|| >= ||b|| ALWAYS!
  *
  * Subtracts the digit arrays over a->used digits.  Each step is computed
  * in an fp_word; the bit just above the digit width of the difference is
  * the borrow for the next digit.  Digits of c left over from its previous
  * value are zeroed and the result is passed through fp_clamp().  The sign
  * fields are not read or written here.
  */
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
 {
   int      i, ndigits, prev_used;
   fp_word  borrow, diff;

   prev_used = c->used;
   ndigits   = a->used;
   c->used   = ndigits;

   borrow = 0;
   for (i = 0; i < ndigits; i++) {
       diff      = ((fp_word)a->dp[i]) - (((fp_word)b->dp[i]) + borrow);
       c->dp[i]  = (fp_digit)diff;
       borrow    = (diff >> DIGIT_BIT) & 1;
   }

   /* wipe whatever the old value of c had above the new top digit */
   while (i < prev_used) {
      c->dp[i] = 0;
      ++i;
   }
   fp_clamp(c);
 }
--- a/tfm.h
+++ b/tfm.h
@ -0,0 +1,290 @@
 /* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@iahu.ca
 */
 #ifndef TFM_H_
 #define TFM_H_
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <limits.h>
 /* Two-argument minimum.  Function-like macro: an argument may be evaluated
  * twice, so do not pass expressions with side effects.  Any earlier
  * definition of MIN is discarded first.
  */
 #undef MIN
 #define MIN(x,y) ((x)<(y)?(x):(y))
 /* Two-argument maximum; same double-evaluation caveat as MIN. */
 #undef MAX
 #define MAX(x,y) ((x)>(y)?(x):(y))
 /* do we want huge code?  The answer is, yes.
  * When TFM_HUGE is defined this header also declares the 32-digit comba
  * routines fp_mul_comba32() and fp_sqr_comba32().
  */
 #define TFM_HUGE
 /* Max size of any number in bits.  Basically the largest size you will be multiplying
  * should be half [or smaller] of FP_MAX_SIZE minus four digits.
  *
  * You can externally define this; otherwise it defaults to 4096 bits plus
  * four digits (4*DIGIT_BIT).
  */
 #ifndef FP_MAX_SIZE
   #define FP_MAX_SIZE           (4096+(4*DIGIT_BIT))
 #endif
 /* will this lib work? */
 #if (CHAR_BIT & 7)
   #error CHAR_BIT must be a multiple of eight.
 #endif
 /* NOTE(review): with the default FP_MAX_SIZE above, this test expands to an
  * expression containing DIGIT_BIT, which is only defined further down in this
  * header (and then with a cast and sizeof).  Unless DIGIT_BIT was defined
  * externally, the preprocessor evaluates the unknown identifier as 0 here, so
  * the check effectively tests 4096 % CHAR_BIT -- confirm this is intended.
  */
 #if FP_MAX_SIZE % CHAR_BIT
   #error FP_MAX_SIZE must be a multiple of CHAR_BIT
 #endif
 /* make sure we are using 64-bit digits with x86-64 asm */
 #if defined(TFM_X86_64)
    #ifndef FP_64BIT
       #define FP_64BIT
    #endif
 #endif
 /* make sure we're 32-bit for x86-32/sse/arm: if one of those modes is
  * selected together with FP_64BIT, warn and drop FP_64BIT.
  */
 #if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT)
   #warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining)
   #undef FP_64BIT
 #endif
 /* some default configurations.
  *
  * fp_digit is the type of one digit of an fp_int; fp_word is the wider type
  * used for intermediate results (the manual requires it to be at least twice
  * the length of fp_digit).
  */
 #if defined(FP_64BIT)
   /* for GCC only on supported platforms */
 #ifndef CRYPT
   /* NOTE(review): assumes unsigned long is 64 bits wide on these targets -- confirm */
   typedef unsigned long ulong64;
 #endif
   typedef ulong64            fp_digit;
   /* the GCC mode(TI) attribute is used to obtain the double-width (128-bit per the manual) word */
   typedef unsigned long      fp_word __attribute__ ((mode(TI)));
 #else
   /* this is to make porting into LibTomCrypt easier :-) */
   /* ulong64/long64 are only declared here when CRYPT is not defined */
 #ifndef CRYPT
   #if defined(_MSC_VER) || defined(__BORLANDC__) 
      typedef unsigned __int64   ulong64;
      typedef signed __int64     long64;
   #else
      typedef unsigned long long ulong64;
      typedef signed long long   long64;
   #endif
 #endif
   /* NOTE(review): the manual describes this configuration as 32-bit digits
    * with a 64-bit word, which presumes unsigned long is 32 bits.  On a target
    * where unsigned long is as wide as ulong64, fp_word could not hold a
    * double-digit product -- confirm for the intended platforms.
    */
   typedef unsigned long      fp_digit;
   typedef ulong64            fp_word;
 #endif
 /* number of bits in a single digit (fp_digit) */
 #define DIGIT_BIT  (int)((CHAR_BIT) * sizeof(fp_digit))
 /* -1 converted to fp_digit, i.e. a digit with every bit set for the unsigned digit types above */
 #define FP_MASK    (fp_digit)(-1)
 /* number of digits in an fp_int: FP_MAX_SIZE bits split into DIGIT_BIT-bit digits */
 #define FP_SIZE    (FP_MAX_SIZE/DIGIT_BIT)
 /* signs (values stored in fp_int.sign) */
 #define FP_ZPOS     0
 #define FP_NEG      1
 /* return codes
  * NOTE(review): presumably success / invalid argument / out of memory,
  * going by the names -- confirm against the functions that return them.
  */
 #define FP_OKAY     0
 #define FP_VAL      1
 #define FP_MEM      2
 /* equalities (results of the comparison functions) */
 #define FP_LT        -1   /* less than */
 #define FP_EQ         0   /* equal to */
 #define FP_GT         1   /* greater than */
 /* replies */
 #define FP_YES        1   /* yes response */
 #define FP_NO         0   /* no response */
 /* a FP type: fixed precision integer.
  *
  * dp   - the digits, least significant first; per the manual the array must
  *        always be zero padded above the used digits.
  * used - count of digits currently in use (0 means the value is zero).
  * sign - FP_ZPOS if the value is zero or positive, FP_NEG otherwise.
  */
 typedef struct {
    fp_digit dp[FP_SIZE];
    int      used, 
             sign;
 } fp_int;
 /* functions */
 /* initialize [or zero] an fp int: clears the whole structure (digits, used
  * and sign) with memset, which leaves the value zero with sign FP_ZPOS.
  * Implemented as a macro.
  */
 #define fp_init(a)  memset((a), 0, sizeof(fp_int))
 /* fp_zero is an alias for fp_init */
 #define fp_zero(a)  fp_init(a)
 /* zero/even/odd ?
  * Each macro yields FP_YES or FP_NO.  Zero (used == 0) counts as even and is
  * never odd; otherwise the lowest bit of dp[0] decides.  The argument is
  * evaluated more than once, so do not pass an expression with side effects.
  */
 #define fp_iszero(a) (((a)->used == 0) ? FP_YES : FP_NO)
 #define fp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? FP_YES : FP_NO)
 #define fp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? FP_YES : FP_NO)
 /* set to a small digit */
 void fp_set(fp_int *a, fp_digit b);
 /* copy from a to b: the whole structure is copied with memcpy unless a and b
  * are the same object.  Macro: its value is the int result of the &&
  * expression rather than a status code, and the arguments are evaluated more
  * than once.
  */
 #define fp_copy(a, b)     (((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
 /* fp_init_copy(a, b) makes a a copy of b -- note that the argument order is
  * the reverse of fp_copy.
  */
 #define fp_init_copy(a, b) fp_copy(b, a)
 /* negate and absolute: fp_neg stores -a in b, fp_abs stores |a| in b.
  * Both copy a into b and then adjust the sign of b.  fp_neg re-clamps
  * afterwards so that negating zero leaves the sign at FP_ZPOS instead of
  * producing a "negative zero".  Both expand to a bare { } block and evaluate
  * their arguments more than once.
  */
 #define fp_neg(a, b)  { fp_copy(a, b); (b)->sign ^= 1; fp_clamp(b); }
 #define fp_abs(a, b)  { fp_copy(a, b); (b)->sign  = 0; }
 /* clamp digits: drop most significant zero digits by decrementing used, then
  * force the sign to FP_ZPOS when no digits remain (the value is zero).
  * NOTE(review): expands to a bare { } block, so "fp_clamp(x);" leaves an
  * empty statement behind it; take care inside an unbraced if/else.
  */
 #define fp_clamp(a)   { while ((a)->used && (a)->dp[(a)->used-1] == 0) --((a)->used); (a)->sign = (a)->used ? (a)->sign : FP_ZPOS; }
 /* right shift x digits */
 void fp_rshd(fp_int *a, int x);
 /* left shift x digits */
 void fp_lshd(fp_int *a, int x);
 /* signed comparison; per the manual returns FP_LT, FP_EQ or FP_GT */
 int fp_cmp(fp_int *a, fp_int *b);
 /* unsigned comparison; per the manual returns FP_LT, FP_EQ or FP_GT */
 int fp_cmp_mag(fp_int *a, fp_int *b);
 /* power of 2 operations */
 /* c = a / 2**b, d = a mod 2**b; per the manual either of c or d may be NULL */
 void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d);
 /* c = a mod 2**b */
 void fp_mod_2d(fp_int *a, int b, fp_int *c);
 /* c = a * 2**b */
 void fp_mul_2d(fp_int *a, int b, fp_int *c);
 /* a = 2**b */
 void fp_2expt (fp_int *a, int b);
 /* c = a * 2 */
 void fp_mul_2(fp_int *a, fp_int *c);
 /* c = a / 2 */
 void fp_div_2(fp_int *a, fp_int *c);
 /* Counts the number of lsbs which are zero before the first one bit */
 int fp_cnt_lsb(fp_int *a);
 /* c = a + b */
 void fp_add(fp_int *a, fp_int *b, fp_int *c);
 /* c = a - b */
 void fp_sub(fp_int *a, fp_int *b, fp_int *c);
 /* c = a * b */
 void fp_mul(fp_int *a, fp_int *b, fp_int *c);
 /* b = a*a  */
 void fp_sqr(fp_int *a, fp_int *b);
 /* a/b => cb + d == a; per the manual either of c or d may be NULL */
 int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 /* c = a mod b, 0 <= c < b  */
 int fp_mod(fp_int *a, fp_int *b, fp_int *c);
 /* compare against a single digit */
 int fp_cmp_d(fp_int *a, fp_digit b);
 /* c = a + b */
 void fp_add_d(fp_int *a, fp_digit b, fp_int *c);
 /* c = a - b */
 void fp_sub_d(fp_int *a, fp_digit b, fp_int *c);
 /* c = a * b */
 void fp_mul_d(fp_int *a, fp_digit b, fp_int *c);
 /* a/b => cb + d == a */
 int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d);
 /* c = a mod b, 0 <= c < b  */
 int fp_mod_d(fp_int *a, fp_digit b, fp_digit *c);
 /* ---> number theory <--- */
 /* d = a + b (mod c) */
 int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 /* d = a - b (mod c) */
 int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 /* d = a * b (mod c) */
 int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 /* c = a * a (mod b) */
 int fp_sqrmod(fp_int *a, fp_int *b, fp_int *c);
 /* c = 1/a (mod b); the manual documents this for an odd modulus b */
 int fp_invmod(fp_int *a, fp_int *b, fp_int *c);
 /* c = (a, b), the greatest common divisor */
 void fp_gcd(fp_int *a, fp_int *b, fp_int *c);
 /* c = [a, b], the least common multiple */
 void fp_lcm(fp_int *a, fp_int *b, fp_int *c);
 /* sets up the montgomery reduction */
 int fp_montgomery_setup(fp_int *a, fp_digit *mp);
 /* computes a = B**n mod b without division or multiplication useful for
  * normalizing numbers in a Montgomery system.
  */
 void fp_montgomery_calc_normalization(fp_int *a, fp_int *b);
 /* computes x/R == x (mod N) via Montgomery Reduction */
 void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp);
 /* d = a**b (mod c); the manual documents this for odd c and positive b */
 int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 /* primality stuff */
 /* perform a Miller-Rabin test of a to the base b and store result in "result" */
 void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result);
 /* 256 trial divisions + 8 Miller-Rabins, returns FP_YES if probable prime  */
 int fp_isprime(fp_int *a);
 /* Primality generation flags */
 #define TFM_PRIME_BBS      0x0001 /* BBS style prime */
 #define TFM_PRIME_SAFE     0x0002 /* Safe prime (p-1)/2 == prime */
 #define TFM_PRIME_2MSB_OFF 0x0004 /* force 2nd MSB to 0 */
 #define TFM_PRIME_2MSB_ON  0x0008 /* force 2nd MSB to 1 */
 /* callback for fp_prime_random, should fill dst with random bytes and return how many read [up to len] */
 typedef int tfm_prime_callback(unsigned char *dst, int len, void *dat);
 /* Convenience wrapper around fp_prime_random_ex(): passes ((size) * 8) + 1 as
  * the size argument and TFM_PRIME_BBS as the flags when bbs equals 1 (no
  * flags otherwise).  bbs is parenthesised so that an argument such as
  * "x | y" is compared as a whole instead of binding to the == first.
  */
 #define fp_prime_random(a, t, size, bbs, cb, dat) fp_prime_random_ex(a, t, ((size) * 8) + 1, ((bbs)==1)?TFM_PRIME_BBS:0, cb, dat)
 /* Extended random prime generator that the fp_prime_random() macro expands to.
  * flags takes TFM_PRIME_* values (the macro passes TFM_PRIME_BBS or 0) and cb
  * is a tfm_prime_callback supplying random bytes.
  * NOTE(review): size is presumably a bit count (the macro passes size*8 + 1),
  * t presumably the number of primality test rounds, and dat presumably the
  * opaque pointer handed to cb -- confirm against the implementation.
  */
 int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback cb, void *dat);
 /* radix conversions */
 /* NOTE(review): the semantics below are inferred from the names only
  * (bit length, byte length of the unsigned/signed binary encodings, and
  * reading/writing those encodings and radix strings) -- confirm against the
  * implementations before relying on buffer sizes or return codes.
  */
 int fp_count_bits(fp_int *a);
 int fp_unsigned_bin_size(fp_int *a);
 void fp_read_unsigned_bin(fp_int *a, unsigned char *b, int c);
 void fp_to_unsigned_bin(fp_int *a, unsigned char *b);
 int fp_signed_bin_size(fp_int *a);
 void fp_read_signed_bin(fp_int *a, unsigned char *b, int c);
 void fp_to_signed_bin(fp_int *a, unsigned char *b);
 int fp_read_radix(fp_int *a, char *str, int radix);
 int fp_toradix(fp_int *a, char *str, int radix);
 int fp_toradix_n(fp_int * a, char *str, int radix, int maxlen);
 /* VARIOUS LOW LEVEL STUFFS */
 /* unsigned addition: c = |a| + |b| */
 void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
 /* unsigned subtraction: c = |a| - |b|, requires |a| >= |b| */
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
 /* NOTE(review): presumably reverses the len bytes at s in place -- confirm */
 void bn_reverse(unsigned char *s, int len);
 /* comba multipliers: C = A * B.  The numbered variants are fixed-size NxN
  * digit multipliers; the 32-digit one is only declared with TFM_HUGE.
  */
 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
 #ifdef TFM_HUGE
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
 #endif
 void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C);
 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
 /* comba squaring: B = A * A, with the same fixed-size variants */
 void fp_sqr_comba(fp_int *A, fp_int *B);
 void fp_sqr_comba4(fp_int *A, fp_int *B);
 void fp_sqr_comba8(fp_int *A, fp_int *B);
 void fp_sqr_comba16(fp_int *A, fp_int *B);
 #ifdef TFM_HUGE
 void fp_sqr_comba32(fp_int *A, fp_int *B);
 #endif
 /* NOTE(review): presumably the digit character map used by the radix conversion routines -- confirm */
 extern const char *fp_s_rmap;
 #endif
--- a/tfm.tex
+++ b/tfm.tex
@ -0,0 +1,580 @@
 \documentclass[b5paper]{book}
 \usepackage{hyperref}
 \usepackage{makeidx}
 \usepackage{amssymb}
 \usepackage{color}
 \usepackage{alltt}
 \usepackage{graphicx}
 \usepackage{layout}
 \def\union{\cup}
 \def\intersect{\cap}
 \def\getsrandom{\stackrel{\rm R}{\gets}}
 \def\cross{\times}
 \def\cat{\hspace{0.5em} \| \hspace{0.5em}}
 \def\catn{$\|$}
 \def\divides{\hspace{0.3em} | \hspace{0.3em}}
 \def\nequiv{\not\equiv}
 \def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
 \def\lcm{{\rm lcm}}
 \def\gcd{{\rm gcd}}
 \def\log{{\rm log}}
 \def\ord{{\rm ord}}
 \def\abs{{\mathit abs}}
 \def\rep{{\mathit rep}}
 \def\mod{{\mathit\ mod\ }}
 \renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
 \newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
 \newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
 \def\Or{{\rm\ or\ }}
 \def\And{{\rm\ and\ }}
 \def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
 \def\implies{\Rightarrow}
 \def\undefined{{\rm ``undefined"}}
 \def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
 \let\oldphi\phi
 \def\phi{\varphi}
 \def\Pr{{\rm Pr}}
 \newcommand{\str}[1]{{\mathbf{#1}}}
 \def\F{{\mathbb F}}
 \def\N{{\mathbb N}}
 \def\Z{{\mathbb Z}}
 \def\R{{\mathbb R}}
 \def\C{{\mathbb C}}
 \def\Q{{\mathbb Q}}
 \definecolor{DGray}{gray}{0.5}
 \newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
 \def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
 \def\gap{\vspace{0.5ex}}
 \makeindex
 \begin{document}
 \frontmatter
 \pagestyle{empty}
 \title{TomsFastMath User Manual \\ v0.01}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text and library are all hereby placed in the public domain.  This book has been formatted for B5 
 [176x250] paper using the \LaTeX{} {\em book} macro package.
 \vspace{13cm}
 \begin{flushleft}This project was sponsored in part by  
 Secure Science Corporation \url{http://www.securescience.net}.
 \end{flushleft}
 \tableofcontents
 \listoffigures
 \mainmatter
 \pagestyle{headings}
 \chapter{Introduction}
 \section{What is TomsFastMath?}
 TomsFastMath is meant to be a very fast yet still fairly portable and easy to port large
 integer arithmetic library written in ISO C.  The goal specifically is to be able to perform
 very fast modular exponentiations and other related functions required for ECC, DH and RSA
 cryptosystems.
 Most of the library is pure ISO C portable source code while a small portion (three files) contain
 a mixture of ISO C and assembler inline fragments.  Compared to LibTomMath this new library is 
 meant to be much faster while sacrificing flexibility.  This is accomplished through several means.
 \begin{enumerate}
   \item The new code is slightly messier and contains asm blocks.
   \item This uses fixed not multiple precision integers.  
   \item It is designed only for fast modular exponentiations [e.g. less flexibility].
 \end{enumerate}
 To mitigate some of the problems that arise from using assembler it has been carefully and 
 appropriately used where it would make the most gain in performance.  Also we use macros
 for assembler code which allows new ports to be inserted easily.
 The new code uses fixed precision arithmetic which means at compile time you choose a maximum 
 precision and all numbers are limited to that.  This has the benefit of not requiring any
 memory heap operations (which are slow) in any of the functions.  It has the downside that 
 integers that are too large are truncated.
 The goal of this library is to be able to perform modular exponentiations (with an odd modulus) very
 fast.  This is what takes the most time in systems such as RSA and DH.  This also requires
 fast multiplication and squaring and has the side effect of speeding up ECC operations as well.
 \section{License}
 TomsFastMath is public domain.
 \section{Building}
 Currently only a GCC makefile has been provided.  To build the library simply type
 ``make''.  The library is a bit too new to put into production so no install
 scripts exist yet.  You can build the test program with ``make test''.
 To perform simple static testing (useful to test out new assembly ports) use the stest
 program.  Type ``make stest'' and run it on your target.  The program will perform three
 multiplications, squarings and montgomery reductions.  Likely if your assembly 
 code is invalid this code will exhibit the bug.
 \subsection{Build Limitations}
 TomsFastMath has the following build requirements which are non--portable but under most 
 circumstances not problematic.
 \begin{enumerate}
 \item ``CHAR\_BIT'' must be eight.  
 \item The ``fp\_digit'' type must be a multiple of eight bits long.
 \item The ``fp\_word'' must be at least twice the length of fp\_digit.
 \end{enumerate}
 \subsection{Optimization Configuration}
 By default TFM is configured for 32--bit digits using ISO C source code.  This mode while portable
 is not very efficient.  While building the library (from scratch) you can define one of 
 several ``CFLAGS'' defines.
 For example, to build with SSE2 optimizations type 
 \begin{verbatim}
 export CFLAGS=-DTFM_SSE2
 make clean libtfm.a
 \end{verbatim}
 \subsubsection{x86--32}  The ``x86--32'' mode is defined by ``TFM\_X86'' and covers all
 i386 and beyond processors.  It requires GCC to build and only works with 32--bit digits.  In this 
 mode fp\_digit is 32--bits and fp\_word is 64--bits.
 \subsubsection{SSE2} The ``SSE2'' mode is defined by ``TFM\_SSE2'' and requires a Pentium 4, Pentium
 M or Athlon64 processor.  It requires GCC to build.  Note that you shouldn't define both
 TFM\_X86 and TFM\_SSE2 at the same time.   This mode only works with 32--bit digits.  In this 
 mode fp\_digit is 32--bits and fp\_word is 64--bits.
 \subsubsection{x86--64}  The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires an 
 ``x86--64'' capable processor (Athlon64 and future Pentium processors).  It requires GCC to
 build and only works with 64--bit digits.  Note that by enabling this mode it will automatically
 enable 64--bit digits.  In this mode fp\_digit is 64--bits and fp\_word is 128--bits.
 \subsubsection{ARM}  The ``ARM'' mode is defined by ``TFM\_ARM'' and requires an ARMv4 or higher
 processor.  It requires GCC and works with 32--bit digits.  In this mode fp\_digit is 32--bits and 
 fp\_word is 64--bits.
 \subsubsection{Future Releases}  Future releases will support additional platform optimizations.
 Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline patches 
 (see chapter \ref{chap:asmops} for more information).
 \begin{figure}[here]
 \begin{small}
 \begin{center}
 \begin{tabular}{|l|l|}
 \hline \textbf{Processor} & \textbf{Recommended Mode} \\
 \hline All 32--bit x86 platforms  & TFM\_X86 \\
 \hline Pentium 4                  & TFM\_SSE2 \\
 \hline Athlon64                   & TFM\_X86\_64 \\
 \hline ARMv4 or higher            & TFM\_ARM \\
 \hline
 \end{tabular}
 \caption{Recommended Build Modes}
 \end{center}
 \end{small}
 \end{figure}
 \subsection{Precision Configuration}
 The precision of all integers in this library is fixed to a limited precision.  Essentially
 the rule of setting the precision is if you plan on doing modular exponentiation with $k$--bit
 numbers then the precision must be fixed to $2k$--bits plus four digits.  
 This is changed by altering the value of ``FP\_MAX\_SIZE'' in tfm.h to your desired size.  By default, 
 the library is configured to handle up to 2048--bit inputs to the modular exponentiator.  
 \chapter{Getting Started}
 \section{Data Types}
 TomsFastMath is a large fixed precision integer library.  It provides the functionality to 
 manipulate large signed integers through a relatively trivial api and a single data type.
 The ``fp\_int'' or fixed precision integer is the data type that the functions operate with.  
 \begin{verbatim}
 typedef struct {
    fp_digit dp[FP_SIZE];
    int      used, 
             sign;
 } fp_int;
 \end{verbatim}
 The \textbf{dp} member is the array of digits that forms the number.  It must always be zero 
 padded.  The \textbf{used} member is the count of digits used in the array.  Although the 
 precision is fixed the algorithms are still tuned to not process the entire array if it 
 does not have to.  The \textbf{sign} indicates the sign of the integer.  It is \textbf{FP\_ZPOS} (0)
 if the integer is zero or positive and \textbf{FP\_NEG} (1) otherwise.
 \section{Initialization}
 \subsection{Simple Initialization}
 To initialize an integer to the default state of zero use the fp\_init() function.
 \index{fp\_init}
 \begin{verbatim}
 void fp_init(fp_int *a);
 \end{verbatim}
 This will initialize the fp\_int $a$ to zero.  Note that the function fp\_zero() is an alias
 for fp\_init().
 \subsection{Initialize Small Constants}
 To initialize an integer with a small single digit value use the fp\_set() function.
 \index{fp\_set}
 \begin{verbatim}
 void fp_set(fp_int *a, fp_digit b);
 \end{verbatim}
 This will initialize $a$ and set it equal to the digit $b$.  
 \subsection{Initialize Copy}
 To initialize an integer with a copy of another integer use the fp\_init\_copy() function.
 \index{fp\_init\_copy}
 \begin{verbatim}
 void fp_init_copy(fp_int *a, fp_int *b);
 \end{verbatim}
 This will initialize $a$ as a copy of $b$.  Note that for compatibility with LibTomMath the function
 fp\_copy() is also provided; its arguments are in the opposite order, so fp\_copy(a, b) copies $a$ into $b$.
 \chapter{Arithmetic Operations}
 \section{Odds and Evens}
 To quickly and easily tell if an integer is zero, odd or even use the following functions.
 \index{fp\_iszero} \index{fp\_iseven} \index{fp\_isodd}
 \begin{verbatim}
 int fp_iszero(fp_int *a);
 int fp_iseven(fp_int *a);
 int fp_isodd(fp_int *a);
 \end{verbatim}
 These will return \textbf{FP\_YES} if the answer to their respective questions is yes.  Otherwise they
 return \textbf{FP\_NO}.  Note that these are implemented as macros and as such you should avoid using 
 ++ or --~-- operators on the input operand.
 \section{Sign Manipulation}
 To negate or compute the absolute of an integer use the following functions.
 \index{fp\_neg} \index{fp\_abs}
 \begin{verbatim}
 void fp_neg(fp_int *a, fp_int *b);
 void fp_abs(fp_int *a, fp_int *b);
 \end{verbatim}
 This will compute the negation (or absolute) of $a$ and store the result in $b$.  Note that these 
 are implemented as macros and as such you should avoid using ++ or --~-- operators on the input 
 operand.
 \section{Comparisons}
 To perform signed or unsigned comparisons use the following functions.
 \index{fp\_cmp} \index{fp\_cmp\_mag}
 \begin{verbatim}
 int fp_cmp(fp_int *a, fp_int *b);
 int fp_cmp_mag(fp_int *a, fp_int *b);
 \end{verbatim}
 These will compare $a$ to $b$.  They will return \textbf{FP\_GT} if $a$ is larger than $b$, 
 \textbf{FP\_EQ} if they are equal and \textbf{FP\_LT} if $a$ is less than $b$.
 The function fp\_cmp performs signed comparisons while the other performs unsigned comparisons.
 \section{Shifting}
 To shift the digits of an fp\_int left or right use the following functions.
 \index{fp\_lshd} \index{fp\_rshd}
 \begin{verbatim}
 void fp_lshd(fp_int *a, int x);
 void fp_rshd(fp_int *a, int x);
 \end{verbatim}
 These will shift the digits of $a$ left (or right respectively) $x$ digits.  
 To shift individual bits of an fp\_int use the following functions.
 \index{fp\_div\_2d} \index{fp\_mod\_2d} \index{fp\_mul\_2d} \index{fp\_div\_2} \index{fp\_mul\_2}
 \begin{verbatim}
 void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d);
 void fp_mod_2d(fp_int *a, int b, fp_int *c);
 void fp_mul_2d(fp_int *a, int b, fp_int *c);
 void fp_mul_2(fp_int *a, fp_int *c);
 void fp_div_2(fp_int *a, fp_int *c);
 void fp_2expt(fp_int *a, int b);
 \end{verbatim}
 fp\_div\_2d() will divide $a$ by $2^b$ and store the quotient in $c$ and remainder in $d$.  Either of 
 $c$ or $d$ can be \textbf{NULL} if their value is not required.  fp\_mod\_2d() is a shortcut to 
 compute the remainder directly.  fp\_mul\_2d() will multiply $a$ by $2^b$ and store the result in $c$.  
 The fp\_mul\_2() and fp\_div\_2() functions are optimized multiplication and divisions by two.  The 
 function fp\_2expt() will compute $a = 2^b$ quickly.
 To quickly count the number of least significant bits that are zero use the following function.
 \index{fp\_cnt\_lsb}
 \begin{verbatim}
 int fp_cnt_lsb(fp_int *a);
 \end{verbatim}
 This will return the number of adjacent least significant bits that are zero.  This is equivalent 
 to the number of times two evenly divides $a$.
 \section{Basic Algebra}
 The following functions round out the basic algebraic functionality of the library.
 \index{fp\_add} \index{fp\_sub} \index{fp\_mul} \index{fp\_sqr} \index{fp\_div} \index{fp\_mod}
 \begin{verbatim}
 void fp_add(fp_int *a, fp_int *b, fp_int *c);
 void fp_sub(fp_int *a, fp_int *b, fp_int *c);
 void fp_mul(fp_int *a, fp_int *b, fp_int *c);
 void fp_sqr(fp_int *a, fp_int *b);
 int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 int fp_mod(fp_int *a, fp_int *b, fp_int *c);
 \end{verbatim}
 The functions fp\_add(), fp\_sub() and fp\_mul() perform their respective operations on $a$ and
 $b$ and store the result in $c$.  The function fp\_sqr() computes $b = a^2$ and is faster than
 using fp\_mul() to perform the same operation.
 The function fp\_div() divides $a$ by $b$ and stores the quotient in $c$ and remainder in $d$.  Either 
 of $c$ and $d$ can be \textbf{NULL} if the result is not required.  The function fp\_mod() is a simple 
 shortcut to find the remainder.
 \section{Modular Exponentiation}
 To compute a modular exponentiation use the following function.
 \index{fp\_exptmod}
 \begin{verbatim}
 int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 \end{verbatim}
 This computes $d \equiv a^b \mbox{ (mod }c)$ for any odd $c$ and positive $b$.  The size of $c$
 must be at most half of the maximum precision used during the build of the library.  For example,
 by default $c$ must be less than $2^{2048}$.  
 \section{Number Theoretic}
 To perform modular inverses, greatest common divisor or least common multiples use the following
 functions.
 \index{fp\_invmod} \index{fp\_gcd} \index{fp\_lcm}
 \begin{verbatim}
 int fp_invmod(fp_int *a, fp_int *b, fp_int *c);
 void fp_gcd(fp_int *a, fp_int *b, fp_int *c);
 void fp_lcm(fp_int *a, fp_int *b, fp_int *c);
 \end{verbatim}
 The fp\_invmod() function will find the modular inverse of $a$ modulo an odd modulus $b$ and store
 it in $c$ (provided it exists).  The function fp\_gcd() will compute the greatest common
 divisor of $a$ and $b$ and store it in $c$.  Similarly the fp\_lcm() function will compute
 the least common multiple of $a$ and $b$ and store it in $c$.
 \section{Prime Numbers}
 To quickly test a number for primality call this function.
 \index{fp\_isprime}
 \begin{verbatim}
 int fp_isprime(fp_int *a);
 \end{verbatim}
 This will return \textbf{FP\_YES} if $a$ is probably prime.  It uses 256 trial divisions and
 eight rounds of Rabin-Miller testing.  Note that this routine performs modular exponentiations
 which means that $a$ must be in a valid range of precision.
 \chapter{Porting TomsFastMath}
 \label{chap:asmops}
 \section{Getting Started}
 Porting TomsFastMath to a given processor target is usually a simple procedure.  For the most part 
 assembly is used to get around the lack of a ``add with carry'' operation in the C language.  To
 make matters simpler the use of assembler is through macro blocks.
 Each ``port'' is defined by a block of code that re-defines the portable ISO C macros with assembler
 inline blocks.  To add a new port you must designate a TFM\_XXX define that will enable your 
 port when built.
 \section{Multiply with Comba}
 The file ``fp\_mul\_comba.c'' is responsible for providing the fast multiplication within the 
 library.  This comba multiplication is fairly simple.  It uses a sliding three digit carry 
 system with the variables $c0$, $c1$, $c2$.  For every digit of output $c0$ is what will
 be that digit, $c1$ will carry into the next digit and $c2$ will be the ``c1'' carry for
 the next digit.  For every ``next'' digit effectively $c0$ is stored as output, $c1$ moves into
 $c0$, $c2$ into $c1$ and zero into $c2$.
 The following macros define the assembler interface to the code.
 \begin{verbatim}
 #define COMBA_START 
 \end{verbatim}
 This is issued at the beginning of the multiplication function.  This is in place to allow you to
 initialize any registers or machine words required.  You can leave it blank if you do not need 
 it.
 \begin{verbatim}
 #define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
 \end{verbatim}
 This clears the three comba carries.  If you are going to place carries in registers then 
 zero the appropriate registers.  Note that the functions do not use $c0$, $c1$ or $c2$ directly
 so you are free to ignore these variables and use registers directly.
 \begin{verbatim}
 #define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 \end{verbatim}
 This propagates the carries after a digit has been produced.  
 \begin{verbatim}
 #define COMBA_STORE(x) \
   x = c0;
 \end{verbatim}
 This stores the $c0$ digit in the memory location specified by $x$.  Note that if you manually 
 aliased $c0$ with a register then just store that register in $x$.  
 \begin{verbatim}
 #define COMBA_STORE2(x) \
   x = c1;
 \end{verbatim}
 This stores the $c1$ digit in the memory location specified by $x$.  Note that if you manually 
 aliased $c1$ with a register then just store that register in $x$.  
 \begin{verbatim}
 #define COMBA_FINI
 \end{verbatim}
 If at the end of the function you need to perform some action fill this macro in. 
 \begin{verbatim}
 #define MULADD(i, j)                                          \
   t  = ((fp_word)i) * ((fp_word)j);                          \
   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
 \end{verbatim}
 This macro performs the ``multiply and add'' step that is central to the comba
 multiplier.  It multiplies the fp\_digits $i$ and $j$ to produce a fp\_word result.  Effectively
 the double--digit value is added to the three-digit carry formed by $c0$, $c1$, $c2$ where $c0$
 is the least significant digit.
 \section{Squaring with Comba}
 Squaring is similar to multiplication except that it uses a special ``multiply and add twice'' macro
 that replaces multiplications that are not required.
 \begin{verbatim}
 #define COMBA_START
 \end{verbatim}
 This allows for any initialization code you might have.  
 \begin{verbatim}
 #define CLEAR_CARRY \
   c0 = c1 = c2 = 0;
 \end{verbatim}
 This will clear the carries.  Like multiplication you can safely alias the three carry variables
 to registers if you can/want to.
 \begin{verbatim}
 #define COMBA_STORE(x) \
   x = c0;
 \end{verbatim}
 Store the $c0$ carry to a given memory location.
 \begin{verbatim}
 #define COMBA_STORE2(x) \
   x = c1;
 \end{verbatim}
 Store the $c1$ carry to a given memory location.
 \begin{verbatim}
 #define CARRY_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
 \end{verbatim}
 Forward propagate all three carry variables.
 \begin{verbatim}
 #define COMBA_FINI
 \end{verbatim}
 If you need to clean up at the end of the function.
 \begin{verbatim}
 /* multiplies point i and j, updates carry "c1" and digit c2 */
 #define SQRADD(i, j)                       \
   t  = ((fp_word)i) * ((fp_word)j);       \
   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; 
 \end{verbatim}
 This is essentially the MULADD macro from the multiplication code.
 \begin{verbatim}
 /* for squaring some of the terms are doubled... */
 #define SQRADD2(i, j)                       \
   t  = ((fp_word)i) * ((fp_word)j);       \
   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \
   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; 
 \end{verbatim}
 This is like SQRADD except it adds the product twice.  It's similar to 
 computing SQRADD(i, j*2).
 \section{Montgomery with Comba}
 Montgomery reduction is used in modular exponentiation and is the most called function during
 that operation.  It's important to make sure this routine is very fast or all is lost.
 Unlike the two other comba routines this one does not use a single three--digit carry 
 system.  It does have three--digit carries except that the routine steps through them
 in the inner loop.  This means you cannot alias them to registers (at all).
 To make matters simple though the three arrays of carries are stored in one array.  The 
 ``c0'' array resides in $c[0 \ldots OFF1-1]$, ``c1'' in $c[OFF1 \ldots OFF2-1]$ and ``c2'' in
 $c[OFF2 \ldots OFF2+FP\_SIZE-1]$.  
 \begin{verbatim}
 #define MONT_START 
 \end{verbatim}
 This allows you to insert anything at the start that you need.
 \begin{verbatim}
 #define MONT_FINI
 \end{verbatim}
 This allows you to insert anything at the end that you need.
 \begin{verbatim}
 #define LOOP_START \
   mu = c[x] * mp;
 \end{verbatim}
 This computes the $\mu$ value for the inner loop.  You can safely alias $mu$ and $mp$ to
 a register if you want.
 \begin{verbatim}
 #define INNERMUL \
   t = ((fp_word)mu) * ((fp_word)*tmpm++);                \
   _c[OFF0] += t;                                         \
   if (_c[OFF0] < (fp_digit)t)              ++_c[OFF1];   \
   _c[OFF1] += (t>>DIGIT_BIT);                            \
   if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2];   
 \end{verbatim}
 This computes the inner product and adds it to the correct set of carry variables.  The variable
 $\_c$ is a pointer alias to $c[x+y]$ and used to simplify the code.
 You can safely alias $\_c$ to a register for INNERMUL by setting it equal to ``c + x''
 \footnote{Where ``c'' is an array on the stack.} by modifying LOOP\_START.
 \begin{verbatim}
 #define PROPCARRY \
   _c[OFF0+1] += _c[OFF1];                                \
   if (_c[OFF0+1] < _c[OFF1])       ++_c[OFF1+1];         \
   _c[OFF1+1] += _c[OFF2];                                \
   if (_c[OFF1+1] < _c[OFF2])       ++_c[OFF2+1];         
 \end{verbatim}
 This propagates the carry upwards by one digit.  
 \input{tfm.ind}
 \end{document}
		`@ -0,0 +1,2 @@`
							`August 25th, 2004`
							`TFM 0.01 -- Initial Release`