added tomsfastmath-0.04

2005-07-23 10:43:03 +00:00 · 2005-07-23 10:43:03 +00:00 · f91cf2d1cf
commit f91cf2d1cf
parent ca551d4c5e
83 changed files with 10946 additions and 1899 deletions
--- a/20
+++ b/20
@ -1,20 +0,0 @@
---
-0. IMPORTANT... why are you doubling the "even" terms individually?  STUPID!
-   - make it so you have four new macros that use an additional 3 carry variables
-        - SQRADDSC - store first mult      [ simple store, no carry ]
-        - SQRADDAC - add subsequent mults  [ 3n word add ]
-        - SQRADDDB - double the carry      [ 3n word add ]
-        - SQRADDFC - forward the doubles into the main [ 3n word add, note, x86_32 may need "g" instead of "r" ]
-   - only use the four macro pattern for rows with >= 3 "doubles"
-        - otherwise use the existing SQRADD
-
-
-1. Write more documentation ;-)
-2. Ports to PPC and MIPS
-3. Fix any lingering bugs, add additional requested functionality.
-4. Unrolled copies of montgomery will speed it up a bit
-5. 
-
-
-NOTE:  The library is still fairly new.  I've tested it quite a bit but that doesn't mean surprises
-can't happen.  Please test the results you get for correctness.
--- a/changes.txt
+++ b/changes.txt
@ -1,7 +1,25 @@
+0.04 -- Fixed bugs in the SSE2 squaring code
+     -- Rewrote the multipliers to be optimized for small inputs 
+     -- Nelson Bolyard of the NSS crew submitted [among other things] new faster Montgomery reduction
+        code.  It brings the performance for small numbers on the AMD64 and all numbers on the P4
+        to a new level.  Thanks!
+     -- Added missing ARM support for fp_montgomery_reduce.c that the NSS folk left off.  Officially 
+        the ARM code is for v4 and above WITH the "M" multiplier support (e.g. umlal instruction)
+     -- Added PPC32 support, define TFM_PPC32 to enable it, I used the "PowerPC 6xx" instruction
+        databook for reference.  Does not require altivec.  Should be fairly portable to the other
+        32-bit PPCs provided they have mullw and mulhwu instructions.
+        [Note: porting the macros to PPC64 should be trivial, anyone with a shell to lend... email me!]
+     -- Rewrote the config a bit in tfm.h so you can better choose which set of "oh my god that's huge" code to 
+        enable for your task.  "generic" functions are ALWAYS included which are smaller but will cover the
+        gaps in the coverage for ya.
+     -- The PPC32 code has been verified to function on a Darwin box running GCC 2.95.2 
+        [Thanks to the folk at PeerSec for lending me a shell to use]
+     -- Fixed a bug in fp_exptmod() where, if the exponent was negative AND was also the destination,
+        the output would have the sign set to FP_NEG.
+
 March 1st, 2005
 0.03 -- Optimized squaring
-     -- 
-
+     -- Applied new license header to all files (still PD)

 September 18th, 2004
 0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
--- a/comba_mont_gen.c
+++ b/comba_mont_gen.c
@ -0,0 +1,81 @@
+/* generate montgomery reductions for m->used = 1...16 */
+
+#include <stdio.h>
+
+int main(void)
+{
+   int N;   /* modulus size in digits that the emitted routine is specialised for */
+   
+   for (N = 1; N <= 16; N++) {   /* emit one fp_montgomery_reduce_N per size, 1..16 */
+       
+printf("void fp_montgomery_reduce_%d(fp_int *a, fp_int *m, fp_digit mp)\n", N);
+printf(
+"{\n"
+"   fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;\n"
+"   int      oldused, x, y;\n"
+"\n"
+"   /* now zero the buff */\n"
+"   memset(c, 0, sizeof(c));\n"
+"\n"
+"   /* copy the input */\n"
+"   oldused = a->used;\n"
+"   for (x = 0; x < oldused; x++) {\n"
+"       c[x] = a->dp[x];\n"
+"   }\n"
+"\n"
+"   MONT_START;\n"
+"\n"
+"   /* now let's get bizz-sy! */\n"
+"   for (x = 0; x < %d; x++) {\n"
+"       /* get Mu for this round */\n"
+"       LOOP_START;\n"
+"\n"
+"       /* our friendly neighbourhood alias */\n"
+"       _c   = c + x;\n"
+"       tmpm = m->dp;\n"
+"\n"
+"       for (y = 0; y < %d; y++) {\n"
+"          INNERMUL;\n"
+"          ++_c;\n"
+"       }\n"
+"       /* send carry up man... */\n"
+"       _c = c + x;\n"
+"       PROPCARRY;\n"
+"   }         \n"
+"\n"
+"  /* fix the rest of the carries */\n"
+"  _c = c + %d;\n"
+"  for (x = %d; x < %d * 2 + 2; x++) {\n"
+"     PROPCARRY;\n"
+"     ++_c;\n"
+"  }\n"
+"\n"
+"  /* now copy out */\n"
+"  _c   = c + %d;\n"
+"  tmpm = a->dp;\n"
+"  for (x = 0; x < %d+1; x++) {\n"
+"     *tmpm++ = *_c++;\n"
+"  }\n"
+"\n"
+"  for (; x < oldused; x++)   {\n"
+"     *tmpm++ = 0;\n"
+"  }\n"
+"\n"
+"  MONT_FINI;\n"
+"\n"
+"  a->used = %d+1;\n"
+"  fp_clamp(a);\n"
+"\n"  
+"  /* if A >= m then A = A - m */\n"
+"  if (fp_cmp_mag (a, m) != FP_LT) {\n"
+"    s_fp_sub (a, m, a);\n"
+"  }\n"
+"}\n", N,N,N,N,N,N,N,N);   /* each of the eight %d slots in the template above receives N */
+}
+
+return 0;
+}
+
+
+
+
--- a/comba_mult_gen.c
+++ b/comba_mult_gen.c
@ -1,3 +1,13 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ * 
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ * 
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+
 /* program emits a NxN comba multiplier */
 #include <stdio.h>

@ -47,3 +57,7 @@ printf(

  return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/comba_mult_smallgen.c
+++ b/comba_mult_smallgen.c
@ -0,0 +1,61 @@
+/* program emits a NxN comba multiplier for 1x1 to 16x16 */
+#include <stdio.h>
+
+int main(int argc, char **argv)
+{
+   int N, x, y, z;   /* N: operand size in digits; x: output column; y,z: digit indices into A and B */
+
+   /* emit the function header and open a switch on the larger of the two operand lengths */
+printf(
+"void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)\n"
+"{\n"
+"   fp_digit c0, c1, c2, at[32];\n"
+"   switch (MAX(A->used, B->used)) { \n"
+);
+
+for (N = 1; N <= 16; N++) {   /* one switch case per operand size, 1x1 up to 16x16 */
+
+printf(
+"\n"
+"   case %d:\n"
+"      memcpy(at, A->dp, %d * sizeof(fp_digit));\n"
+"      memcpy(at+%d, B->dp, %d * sizeof(fp_digit));\n"
+"      COMBA_START;\n"
+"\n"
+"      COMBA_CLEAR;\n", N, N, N, N);   /* emitted code keeps A in at[0..N-1] and B in at[N..2N-1] */
+
+   /* emit the 2N-1 output columns: column x sums every product at[y]*at[z+N] with y+z == x */
+   for (x = 0; x < (N+N-1); x++) {
+printf(
+"      /* %d */\n", x);
+if (x > 0) {   /* every column after the first begins with COMBA_FORWARD */
+printf(
+"      COMBA_FORWARD;\n");
+}
+      for (y = 0; y < N; y++) {
+      for (z = 0; z < N; z++) {
+          if ((y+z)==x) {
+             printf("      MULADD(at[%d], at[%d]); ", y, z+N);
+          }
+      }
+      }
+printf(
+"\n"
+"      COMBA_STORE(C->dp[%d]);\n", x);
+   }
+printf(
+"      COMBA_STORE2(C->dp[%d]);\n"
+"      C->used = %d;\n"
+"      C->sign = A->sign ^ B->sign;\n"
+"      fp_clamp(C);\n"
+"      COMBA_FINI;\n"
+"      break;\n", N+N-1, N+N);   /* COMBA_STORE2 targets digit 2N-1; result length is set to 2N before fp_clamp */
+}
+printf("   }\n}\n\n");   /* close the switch and the emitted function */
+
+  return 0;
+}
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/comba_sqr_gen.c
+++ b/comba_sqr_gen.c
@ -1,3 +1,13 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ * 
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ * 
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+
 /* Generates squaring comba code... it learns it knows our secrets! */
 #include <stdio.h>

@ -90,3 +100,7 @@ if (N >= 16) printf("#endif\n");

  return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/comba_sqr_smallgen.c
+++ b/comba_sqr_smallgen.c
@ -0,0 +1,109 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ * 
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ * 
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+
+/* Generates squaring comba code... it learns it knows our secrets! */
+#include <stdio.h>
+
+int main(int argc, char **argv)
+{
+   int x, y, z, N, f;   /* N: operand size in digits; x: output column; y,z: digit indices; f: cross-term count, then first-term flag */
+
+printf(
+"void fp_sqr_comba_small(fp_int *A, fp_int *B)\n"
+"{\n"
+"   fp_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;\n"
+);
+
+printf("   switch (A->used) { \n");
+
+for (N = 1; N <= 16; N++) {   /* one switch case per operand size, 1..16 digits */
+printf(
+"   case %d:\n"
+"      a = A->dp;\n"
+"      COMBA_START; \n"
+"\n"
+"      /* clear carries */\n"
+"      CLEAR_CARRY;\n"
+"\n"
+"      /* output 0 */\n"
+"      SQRADD(a[0],a[0]);\n"
+"      COMBA_STORE(b[0]);\n", N);
+
+   for (x = 1; x < N+N-1; x++) {   /* remaining output columns 1 .. 2N-2 */
+printf(
+"\n      /* output %d */\n"
+"      CARRY_FORWARD;\n   ", x);
+
+       for (f = y = 0; y < N; y++) {   /* count the cross products a[y]*a[z], y < z, that land in column x */
+           for (z = 0; z < N; z++) {
+               if (z != y && z + y == x && y <= z) {
+                  ++f;
+               }
+           }
+       }
+
+   if (f <= 2) {   /* at most two cross terms: emit SQRADD for the square and SQRADD2 for each cross term */
+       for (y = 0; y < N; y++) {
+           for (z = 0; z < N; z++) {
+               if (y<=z && (y+z)==x) {
+                  if (y == z) { 
+                     printf("   SQRADD(a[%d], a[%d]); ", y, y);
+                  } else {
+                     printf("   SQRADD2(a[%d], a[%d]); ", y, z);
+                  }
+               }
+           }
+       }
+   } else {
+      /* three or more cross terms: emit the SQRADDSC / SQRADDAC / SQRADDDB sequence instead */
+      /* cross products (y < z) are emitted first; the square term, if any, comes last */
+       f = 0;
+       for (y = 0; y < N; y++) {
+           for (z = 0; z < N; z++) {
+               if (z != y && z + y == x && y <= z) {
+                  if (f == 0) {
+                     /* first cross product of the column is emitted as SQRADDSC */
+                     printf("SQRADDSC(a[%d], a[%d]); ", y, z);
+                     f = 1;
+                  } else { 
+                     printf("SQRADDAC(a[%d], a[%d]); ", y, z);
+                  }
+               }
+           }
+       }
+       /* NOTE(review): SQRADDDB presumably doubles the accumulated cross-product sum -- confirm against the macro definition */
+       printf("SQRADDDB; ");
+       if ((x&1) == 0) {
+          /* even column: add the square term a[x/2]*a[x/2] */
+          printf("SQRADD(a[%d], a[%d]); ", x/2, x/2);
+       }
+    }
+printf("\n      COMBA_STORE(b[%d]);\n", x);
+   }
+printf("      COMBA_STORE2(b[%d]);\n", N+N-1);
+
+printf(
+"      COMBA_FINI;\n"
+"\n"
+"      B->used = %d;\n"
+"      B->sign = FP_ZPOS;\n"
+"      memcpy(B->dp, b, %d * sizeof(fp_digit));\n"
+"      fp_clamp(B);\n"
+"      break;\n\n", N+N, N+N);   /* emitted code copies all 2N digits out of the local buffer b */
+}
+
+printf("}\n\n}\n");   /* close the switch, then the emitted function */
+
+  return 0;
+}
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/delme.c
+++ b/delme.c
@ -1,24 +0,0 @@
-#include "tfm.h"
-
-int main(void)
-{
-    fp_int a;
-    char buf[4096];
-
-    fp_init(&a);
-    fp_read_radix( &a,
-        "///////////93zgY8MZ2DCJ6Oek0t1pHAG9E28fdp7G22xwcEnER8b5A27cED0JT"
-        "xvKPiyqwGnimAmfjybyKDq/XDMrjKS95v8MrTc9UViRqJ4BffZVjQml/NBRq1hVj"
-        "xZXh+rg9dwMkdoGHV4iVvaaePb7iv5izmW1ykA5ZlmMOsaWs75NJccaMFwZz9CzV"
-        "WsLT8zoZhPOSOlDM88LIkvxLAGTmbfPjPmmrJagyc0JnT6m8oXWXV3AGNaOkDiux"
-        "uvvtB1WEXWER9uEYx0UYZxN5NV1lJ5B9tYlBzfLO5nWvbKbywfLgvHNI9XYO+WKG"
-        "5NAEMeggn2sjCnSD151wCwXL8QlV7BfaxFk515ZRxmgAwd5NNGOCVREN3uMcuUJ7"
-        "g/MkZDi9CzSUZ9JWIYLXdSxZqYOQqkvhyI/w1jcA26JOTW9pFiXgP58VAnWNUo0C"
-        "k+4NLtfXNMnt2OZ0kjb6uWZYJw1qvQinGzjR/E3z48vBWj4WgJhIol//////////",
-        64 );
-
-    if( fp_isprime( &a ) ) printf("It's prime.\n");
-    else printf( "Not prime.\n");
-
-    return 0;
-}
--- a/demo/rsa.c
+++ b/demo/rsa.c
@ -0,0 +1,83 @@
+#include "tfm.h"
+#include <time.h>
+
+int main(void)   /* RSA self-test and timing demo; returns 0 on success, 1 if a known-answer check fails */
+{
+   fp_int d, e, n, c, m, e_m;
+   clock_t t1;
+   int x;
+
+   /* public-key test vector: modulus n, exponent e, message m and expected ciphertext c */
+   fp_read_radix(&n, "ce032e860a9809a5ec31e4b0fd4b546f8c40043e3d2ec3d8f49d8f2f3dd19e887094ee1af75caa1c2e6cd9ec78bf1dfd6280002ac8c30ecd72da2e4c59a28a9248048aaae2a8fa627f71bece979cebf9f8eee2bd594d4a4f2e791647573c7ec1fcbd320d3825be3fa8a17c97086fdae56f7086ce512b81cc2fe44161270ec5e9", 16);
+   fp_read_radix(&e, "10001", 16);
+   fp_read_radix(&m, "39f5a911250f45b99390e2df322b33c729099ab52b5879d06b00818cce57c649a66ed7eb6d8ae214d11caf9c81e83a7368cf0edb2b71dad791f13fecf546123b40377851e67835ade1d6be57f4de18a62db4cdb1880f4ab2e6a29acfd85ca22a13dc1f6fee2621ef0fc8689cd738e6f065c033ec7c148d8d348688af83d6f6bd", 16);
+   fp_read_radix(&c, "9ff70ea6968a04530e6b06bf01aa937209cc8450e76ac19477743de996ba3fb445923c947f8d0add8c57efa51d15485309918459da6c1e5a97f215193b797dce98db51bdb4639c2ecfa90ebb051e3a2daeffd27a7d6e62043703a7b15e0ada5170427b63099cd01ef52cd92d8723e5774bea32716aaa7f5adbae817fb12a5b50", 16);
+
+   /* known-answer check: m^e mod n must equal c (a nonzero fp_cmp result is treated as a mismatch) */
+   fp_exptmod(&m, &e, &n, &e_m);
+   if (fp_cmp(&e_m, &c)) {
+      char buf[1024];
+      printf("Encrypted text not equal\n");
+      fp_toradix(&e_m, buf, 16);
+      printf("e_m == %s\n", buf);
+      return 1;   /* report the failed check through the exit status */
+   }
+
+   printf("CLOCKS_PER_SEC = %llu\n", (unsigned long long)CLOCKS_PER_SEC);
+   t1 = clock();
+   for (x = 0; x < 1000; x++) {
+      fp_exptmod(&m, &e, &n, &e_m);
+   }
+   t1 = clock() - t1;
+   printf("1000 RSA operations took     %10.5g seconds\n", (double)t1 / (double)CLOCKS_PER_SEC);
+   printf("RSA encrypt/sec              %10.5g\n", (double)CLOCKS_PER_SEC / ((double)t1 / 1000.0) );
+
+   /* private-key test vector: modulus n, exponent d, ciphertext c and expected plaintext m */
+   fp_read_radix(&n, "a7f30e2e04d31acc6936916af1e404a4007adfb9e97864de28d1c7ba3034633bee2cd9d5da3ea3cdcdc9a6f3daf5702ef750f4c3aadb0e27410ac04532176795995148cdb4691bd09a8a846e3e24e073ce2f89b34dfeb2ee89b646923ca60ee3f73c4d5397478380425e7260f75dfdc54826e160395b0889b1162cf115a9773f", 16);
+   fp_read_radix(&d, "16d166f3c9a404d810d3611e6e8ed43293fe1db75c8906eb4810785a4b82529929dade1db7f11ac0335d5a59773e3167b022479eedefa514a0399db5c900750a56323cf9f5b0f21e7d60a46d75f3fcaabf30a63cbe34048b741a57ac36a13914afda798709dea5771f8d456cf72ec5f3afc1d88d023de40311143a36e7028739", 16);
+   fp_read_radix(&c, "7d216641c32543f5b8428bdd0b11d819cfbdb16f1df285247f677aa4d44de62ab064f4a0d060ec99cb94aa398113a4317f2c550d0371140b0fd2c88886cac771812e72faad4b7adf495b9b850b142ccd7f45c0a27f164c8c7731731c0015f69d0241812e769d961054618aeb9e8e8989dba95714a2cf56c9e525c5e34b5812dd", 16);
+   fp_read_radix(&m, "5f323bf0b394b98ffd78727dc9883bb4f42287def6b60fa2a964b2510bc55d61357bf5a6883d2982b268810f8fef116d3ae68ebb41fd10d65a0af4bec0530eb369f37c14b55c3be60223b582372fb6589b648d5a0c7252d1ae2dae5809785d993e9e5d0c4d9b0bcba0cde0d6671734747fba5483c735e1dab7df7b10ec6f62d8", 16);
+
+   /* known-answer check: c^d mod n must equal m */
+   fp_exptmod(&c, &d, &n, &e_m);
+   if (fp_cmp(&e_m, &m)) {
+      char buf[1024];
+      printf("Decrypted text not equal\n");
+      fp_toradix(&e_m, buf, 16);
+      printf("e_m == %s\n", buf);
+      return 1;   /* report the failed check through the exit status */
+   }
+
+   t1 = clock();
+   for (x = 0; x < 100; x++) {
+      fp_exptmod(&c, &d, &n, &e_m);
+   }
+   t1 = clock() - t1;
+   printf("100 RSA operations took      %10.5g seconds\n", (double)t1 / (double)CLOCKS_PER_SEC);
+   printf("RSA decrypt/sec              %10.5g\n", (double)CLOCKS_PER_SEC / ((double)t1 / 100.0) );
+
+
+   /* time half-size operands (fp_rshd by used/2 presumably drops the low half of the digits -- confirm) */
+   fp_rshd(&n, n.used >> 1);
+   fp_rshd(&d, d.used >> 1);
+   fp_rshd(&c, c.used >> 1);
+   printf("n.used == %4d bits\n", n.used * DIGIT_BIT);
+
+   /* force the shortened modulus to be odd before timing (result is not checked here) */
+   n.dp[0] |= 1;
+   t1 = clock();
+   for (x = 0; x < 100; x++) {
+      fp_exptmod(&c, &d, &n, &e_m);
+   }
+   t1 = clock() - t1;   /* rate below divides by 50, not 100: presumably two half-size exptmods per CRT decrypt */
+   printf("100 RSA-half operations took %10.5g seconds\n", (double)t1 / (double)CLOCKS_PER_SEC);
+   printf("RSA decrypt/sec              %10.5g (estimate of RSA-1024-CRT) \n", (double)CLOCKS_PER_SEC / ((double)t1 / 50.0) );
+
+
+
+   return 0;
+}
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/demo/stest.c
+++ b/demo/stest.c
@ -142,3 +142,7 @@ int main(void)
   return 0;
 }   

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/demo/test.c
+++ b/demo/test.c
@ -22,10 +22,18 @@ int myrng(unsigned char *dst, int len, void *dat)
 static ulong64 TIMFUNC (void)
   {
   #if defined __GNUC__
-      #if defined(__i386__) || defined(__x86_64__)
+      #if defined(INTEL_CC)
+			ulong64 a;
+         asm ("rdtsc":"=A"(a));
+         return a;
+      #elif defined(__i386__) || defined(__x86_64__)
         ulong64 a;
         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
         return a;
+      #elif defined(TFM_PPC32) 
+         unsigned long a;
+         __asm__ __volatile__ ("mftb %0":"=r"(a));
+         return a;
      #else /* gcc-IA64 version */
         unsigned long result;
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
@ -135,9 +143,7 @@ int main(void)
  printf("Testing read_radix\n");
  fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);

-goto testing;
-
-#if 1
+#if 0
  /* test mont */
  printf("Montgomery test #1\n");
  fp_set(&a, 0x1234567ULL);
@ -196,8 +202,18 @@ goto testing;
   }
   printf("\n\n");
 #endif
-  
+
+#ifdef TESTING
+goto testing;
+#endif
+
 #if 1
+
+t1 = TIMFUNC();
+sleep(1);
+printf("Ticks per second: %llu\n", TIMFUNC() - t1);
+
+goto expttime;
 /* do some timings... */
  printf("Addition:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
@ -211,7 +227,7 @@ goto testing;
      a.used = t;
      b.used = t;
      t2 = -1;
-      for (ix = 0; ix < 2500; ++ix) {
+      for (ix = 0; ix < 25000; ++ix) {
          t1 = TIMFUNC();
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
@ -222,6 +238,7 @@ goto testing;
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
+multtime:
  printf("Multiplication:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
@ -263,8 +280,8 @@ sqrtime:
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
-return;
 //#else
+monttime:
  printf("Montgomery:\n");
  for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
      fp_zero(&a);
@ -295,7 +312,7 @@ return;
 expttime:
  printf("Exptmod:\n");
 
-  for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
+  for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += 256/DIGIT_BIT) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
@ -309,7 +326,7 @@ expttime:
      c.used = t;

     t2 = -1;
-     for (ix = 0; ix < 256; ++ix) {
+     for (ix = 0; ix < 500; ++ix) {
          t1 = TIMFUNC();
          fp_exptmod(&c, &b, &a, &d);
          fp_exptmod(&c, &b, &a, &d);
@ -320,10 +337,10 @@ expttime:
     }
     printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
-return;
-
+  return;
 #endif

+return;
 testing:

   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
@ -567,3 +584,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
 }
  
  
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/doc/tfm.pdf
+++ b/doc/tfm.pdf
--- a/fp_2expt.c
+++ b/fp_2expt.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -33,3 +33,7 @@ void fp_2expt(fp_int *a, int b)
  a->dp[z] = ((fp_digit)1) << (b % DIGIT_BIT);
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_add.c
+++ b/fp_add.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -37,3 +37,7 @@ void fp_add(fp_int *a, fp_int *b, fp_int *c)
    }
  }
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_add_d.c
+++ b/fp_add_d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -16,3 +16,7 @@ void fp_add_d(fp_int *a, fp_digit b, fp_int *c)
   fp_set(&tmp, b);
   fp_add(a,&tmp,c);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_addmod.c
+++ b/fp_addmod.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -17,3 +17,7 @@ int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
  fp_add(a, b, &tmp);
  return fp_mod(&tmp, c, d);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_cmp.c
+++ b/fp_cmp.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -25,3 +25,7 @@ int fp_cmp(fp_int *a, fp_int *b)
      }
   }
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_cmp_d.c
+++ b/fp_cmp_d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -32,3 +32,7 @@ int fp_cmp_d(fp_int *a, fp_digit b)
  }

 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_cmp_mag.c
+++ b/fp_cmp_mag.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -29,3 +29,7 @@ int fp_cmp_mag(fp_int *a, fp_int *b)
   return FP_EQ;
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_cnt_lsb.c
+++ b/fp_cnt_lsb.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -40,3 +40,7 @@ int fp_cnt_lsb(fp_int *a)
   return x;
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_count_bits.c
+++ b/fp_count_bits.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -30,3 +30,7 @@ int fp_count_bits (fp_int * a)
  }
  return r;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_div.c
+++ b/fp_div.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -151,3 +151,7 @@ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)

  return FP_OKAY;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_div_2.c
+++ b/fp_div_2.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -47,3 +47,7 @@ void fp_div_2(fp_int * a, fp_int * b)
  b->sign = a->sign;
  fp_clamp (b);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_div_2d.c
+++ b/fp_div_2d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -73,3 +73,7 @@ void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d)
    fp_copy (&t, d);
  }
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_div_d.c
+++ b/fp_div_d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -87,3 +87,7 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
  return FP_OKAY;
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_exptmod.c
+++ b/fp_exptmod.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -174,6 +174,13 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
   fp_int tmp;
   int    err;
   
+#ifdef TFM_CHECK
+   /* prevent overflows */
+   if (P->used > (FP_SIZE/2)) {
+      return FP_VAL;
+   }
+#endif
+
   /* is X negative?  */
   if (X->sign == FP_NEG) {
      /* yes, copy G and invmod it */
@ -183,10 +190,16 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
      }
      X->sign = FP_ZPOS;
      err =  _fp_exptmod(&tmp, X, P, Y);
-      X->sign = FP_NEG;
+      if (X != Y) {
+         X->sign = FP_NEG;
+      }
      return err;
   } else {
      /* Positive exponent so just exptmod */
      return _fp_exptmod(G, X, P, Y);
   }
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_gcd.c
+++ b/fp_gcd.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -49,3 +49,7 @@ void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
   }
   fp_copy(&u, c);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_ident.c
+++ b/fp_ident.c
@ -1,3 +1,12 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ * 
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ * 
+ * Tom St Denis, tomstdenis@gmail.com
+ */
 #include "tfm.h"

 const char *fp_ident(void)
@ -39,9 +48,6 @@ const char *fp_ident(void)
 #ifdef FP_64BIT
 " FP_64BIT "
 #endif
-#ifdef TFM_LARGE
-" TFM_LARGE "
-#endif
 #ifdef TFM_HUGE
 " TFM_HUGE "
 #endif
@ -64,3 +70,7 @@ int main(void)

 #endif

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_invmod.c
+++ b/fp_invmod.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -96,3 +96,7 @@ top:
  c->sign = neg;
  return FP_OKAY;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_isprime.c
+++ b/fp_isprime.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -72,3 +72,7 @@ int fp_isprime(fp_int *a)
   }
   return FP_YES;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_lcm.c
+++ b/fp_lcm.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -25,3 +25,7 @@ void fp_lcm(fp_int *a, fp_int *b, fp_int *c)
      fp_mul(a, &t2, c);
   }   
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_lshd.c
+++ b/fp_lshd.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -32,3 +32,7 @@ void fp_lshd(fp_int *a, int x)
   /* clamp digits */
   fp_clamp(a);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_mod.c
+++ b/fp_mod.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -28,3 +28,7 @@ int fp_mod(fp_int *a, fp_int *b, fp_int *c)
 }


+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_mod_2d.c
+++ b/fp_mod_2d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -36,3 +36,7 @@ void fp_mod_2d(fp_int *a, int b, fp_int *c)
  c->dp[b / DIGIT_BIT] &= ~((fp_digit)0) >> (DIGIT_BIT - b);
  fp_clamp (c);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_mod_d.c
+++ b/fp_mod_d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -14,3 +14,7 @@ int fp_mod_d(fp_int *a, fp_digit b, fp_digit *c)
 {
   return fp_div_d(a, b, NULL, c);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_montgomery_calc_normalization.c
+++ b/fp_montgomery_calc_normalization.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -36,3 +36,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
  }
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_montgomery_reduce.c
+++ b/fp_montgomery_reduce.c
@ -1,252 +1,380 @@
-/* TomsFastMath, a fast ISO C bignum library.
- * 
- * This project is meant to fill in where LibTomMath
- * falls short.  That is speed ;-)
- *
- * This project is public domain and free for all purposes.
- * 
- * Tom St Denis, tomstdenis@iahu.ca
- */
-#include <tfm.h>
-
-#if defined(TFM_X86) 
-
-/* x86-32 code */
-
-#define MONT_START 
-
-#define MONT_FINI
-
-#define LOOP_START \
-   mu = c[x] * mp;
-
-#define INNERMUL \
-asm(                                                                                          \
-"movl %7,%%eax                \n\t"                                                           \
-"mull %6                      \n\t"                                                           \
-"addl %%eax,%0                \n\t"                                                           \
-"adcl %%edx,%1                \n\t"                                                           \
-"adcl $0,%2                   \n\t"                                                           \
-:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),  \
-                                                "g"(mu), "g"(*tmpm++)                          \
-                                               : "%eax", "%edx", "%cc");
-
-#define PROPCARRY \
-asm(                                                                                               \
-"movl %1,%%eax                \n\t"                                                                \
-"addl  %%eax,%6               \n\t"                                                                \
-"movl %2,%%eax                \n\t"                                                                \
-"adcl  %%eax,%7               \n\t"                                                                \
-"adcl $0,%8                   \n\t"                                                                \
-:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
-                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
-: "%eax", "%cc");
-
-#elif defined(TFM_X86_64)
-/* x86-64 code */
-
-#define MONT_START 
-
-#define MONT_FINI
-
-#define LOOP_START \
-   mu = c[x] * mp;
-
-#define INNERMUL \
-asm(                                                                                          \
-"movq %7,%%rax                \n\t"                                                           \
-"mulq %6                      \n\t"                                                           \
-"addq %%rax,%0                \n\t"                                                           \
-"adcq %%rdx,%1                \n\t"                                                           \
-"adcq $0,%2                   \n\t"                                                           \
-:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),  \
-                                                "g"(mu), "g"(*tmpm++)                          \
-                                               : "%rax", "%rdx", "%cc");
-
-#define PROPCARRY \
-asm(                                                                                               \
-"movq %1,%%rax                \n\t"                                                                \
-"movq %2,%%rbx                \n\t"                                                                \
-"addq  %%rax,%6               \n\t"                                                                \
-"adcq  %%rbx,%7               \n\t"                                                                \
-"adcq $0,%8                   \n\t"                                                                \
-:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
-                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
-: "%rax", "%rbx", "%cc");
-
-#elif defined(TFM_SSE2)
-
-/* SSE2 code */
-
-#define MONT_START \
-asm("movd %0,%%mm2"::"g"(mp));
-
-#define MONT_FINI \
-asm("emms");
-
-#define LOOP_START \
-asm(\
-"movd %0,%%mm1                \n\t" \
-"pmuludq %%mm2,%%mm1          \n\t" \
-:: "g"(c[x]));
-
-#define INNERMUL \
-asm(                                                                                          \
-"movd %6,%%mm0                \n\t"                                                           \
-"pmuludq %%mm1,%%mm0          \n\t"                                                           \
-"movd %%mm0,%%eax             \n\t"                                                           \
-"psrlq $32, %%mm0             \n\t"                                                           \
-"addl %%eax,%0                \n\t"                                                           \
-"movd %%mm0,%%eax             \n\t"                                                           \
-"adcl %%eax,%1                \n\t"                                                           \
-"adcl $0,%2                   \n\t"                                                           \
-:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),  \
-                                                "g"(*tmpm++)                                  \
-                                               : "%eax", "%cc");
-
-#define PROPCARRY \
-asm(                                                                                               \
-"movl %1,%%eax                \n\t"                                                                \
-"addl  %%eax,%6               \n\t"                                                                \
-"movl %2,%%eax                \n\t"                                                                \
-"adcl  %%eax,%7               \n\t"                                                                \
-"adcl $0,%8                   \n\t"                                                                \
-:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
-                                                "g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1])  \
-: "%eax", "%cc");
-
-#elif defined(TFM_ARM)
-
-/* ISO C code */
-#define MONT_START 
-
-#define MONT_FINI
-
-#define LOOP_START \
-   mu = c[x] * mp;
-
-/* NOTE: later write it using two regs instead of three for _c + ... */
-#define INNERMUL \
-asm(                                             \
-"UMULL r0,r1,%0,%1                \n\t"          \
-"LDR   r2,[%2]                    \n\t"          \
-"ADDS  r2,r2,r0                   \n\t"          \
-"STR   r2,[%2]                    \n\t"          \
-"LDR   r2,[%3]                    \n\t"          \
-"ADCS  r2,r2,r1                   \n\t"          \
-"STR   r2,[%3]                    \n\t"          \
-"LDR   r2,[%4]                    \n\t"          \
-"ADC   r2,r2,#0                   \n\t"          \
-"STR   r2,[%4]                    \n\t"          \
-::"r"(mu),"r"(*tmpm++),"r"(_c + OFF0),"r"(_c + OFF1),"r"(_c + OFF2):"r0", "r1", "r2", "%cc");
-
-#define PROPCARRY \
-asm(                                             \
-"LDR   r0,[%1]                    \n\t"          \
-"LDR   r1,[%0,#4]                 \n\t"          \
-"ADDS  r0,r0,r1                   \n\t"          \
-"STR   r0,[%0,#4]                 \n\t"          \
-"LDR   r0,[%2]                    \n\t"          \
-"LDR   r1,[%1,#4]                 \n\t"          \
-"ADCS  r0,r0,r1                   \n\t"          \
-"STR   r0,[%1,#4]                 \n\t"          \
-"LDR   r0,[%2,#4]                 \n\t"          \
-"ADC   r0,r0,#0                   \n\t"          \
-"STR   r0,[%2,#4]                 \n\t"          \
-::"r"(_c + OFF0),"r"(_c + OFF1),"r"(_c + OFF2):"r0", "r1", "%cc");
-
-#else
-
-/* ISO C code */
-#define MONT_START 
-
-#define MONT_FINI
-
-#define LOOP_START \
-   mu = c[x] * mp;
-
-#define INNERMUL \
-   do { fp_word t;                                                           \
-   t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
-   t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT);                   _c[OFF1] = t; \
-   _c[OFF2] += (t >> DIGIT_BIT);                                             \
-   } while (0);
-
-#define PROPCARRY \
-   do { fp_word t;                                                           \
-   t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1];                    _c[OFF0+1] = t; \
-   t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
-   _c[OFF2+1] += (t >> DIGIT_BIT);                                           \
-   } while (0);
-
-#endif
-
-
-#define OFF0  (0)
-#define OFF1  (FP_SIZE)
-#define OFF2  (FP_SIZE+FP_SIZE)
-
-/* computes x/R == x (mod N) via Montgomery Reduction */
-void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
-{
-   fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
-   int      oldused, x, y, pa;
-
-   /* now zero the buff */
-   pa = m->used;
-   memset(c, 0, sizeof(c));
-
-   /* copy the input */
-   oldused = a->used;
-   for (x = 0; x < oldused; x++) {
-       c[x] = a->dp[x];
-   }
-
-   MONT_START;
-
-   /* now let's get bizz-sy! */
-   for (x = 0; x < pa; x++) {
-       /* get Mu for this round */
-       LOOP_START;
-
-       /* our friendly neighbourhood alias */
-       _c   = c + x;
-       tmpm = m->dp;
-
-       for (y = 0; y < pa; y++) {
-          INNERMUL;
-          ++_c;
-       }
-       /* send carry up man... */
-       _c = c + x;
-       PROPCARRY;
-  }         
-
-  /* fix the rest of the carries */
-  _c = c + pa;
-  for (x = pa; x < pa * 2 + 2; x++) {
-     PROPCARRY;
-     ++_c;
-  }
-
-  /* now copy out */
-  _c   = c + pa;
-  tmpm = a->dp;
-  for (x = 0; x < pa+1; x++) {
-     *tmpm++ = *_c++;
-  }
-
-  for (; x < oldused; x++)   {
-     *tmpm++ = 0;
-  }
-
-  MONT_FINI;
-
-  a->used = pa+1;
-  fp_clamp(a);
-  
-  /* if A >= m then A = A - m */
-  if (fp_cmp_mag (a, m) != FP_LT) {
-    s_fp_sub (a, m, a);
-  }
-}
+/* TomsFastMath, a fast ISO C bignum library.
+ * 
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ * 
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+#include <tfm.h>
+
+/******************************************************************/
+#if defined(TFM_X86) 
+/* x86-32 code
+ *
+ * Building blocks expanded inside fp_montgomery_reduce().  They refer to
+ * that function's locals by name: c, x, mp, mu, _c, tmpm and cy.
+ *
+ * MONT_START, MONT_FINI, LOOP_END: expand to nothing on this target.
+ * LOOP_START: mu = c[x] * mp.
+ * INNERMUL:   edx:eax = mu * (*tmpm); cy is added to eax and eax is then
+ *             added into _c[LO], each time folding the carry into edx;
+ *             edx is finally moved into cy.  Net effect: _c[LO] receives
+ *             the low word of _c[LO] + cy + mu * (*tmpm) and cy the high
+ *             word.  tmpm is post-incremented by the operand expression.
+ * PROPCARRY:  _c[LO] += cy, then cy = carry flag (0 or 1) via setb/movzbl.
+ *
+ * Operand numbering in INNERMUL: %0 = _c[LO], %1 = cy, %2/%3 are the
+ * matching-constraint inputs for %0/%1, %4 = mu, %5 = *tmpm.
+ */
+
+#define MONT_START 
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL                                          \
+asm(                                                      \
+   "movl %5,%%eax \n\t"                                   \
+   "mull %4       \n\t"                                   \
+   "addl %1,%%eax \n\t"                                   \
+   "adcl $0,%%edx \n\t"                                   \
+   "addl %%eax,%0 \n\t"                                   \
+   "adcl $0,%%edx \n\t"                                   \
+   "movl %%edx,%1 \n\t"                                   \
+:"=g"(_c[LO]), "=r"(cy)                                   \
+:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
+: "%eax", "%edx", "%cc")
+
+#define PROPCARRY                           \
+asm(                                        \
+   "addl   %1,%0    \n\t"                   \
+   "setb   %%al     \n\t"                   \
+   "movzbl %%al,%1 \n\t"                    \
+:"=g"(_c[LO]), "=r"(cy)                     \
+:"0"(_c[LO]), "1"(cy)                       \
+: "%eax", "%cc")
+
+/******************************************************************/
+#elif defined(TFM_X86_64)
+/* x86-64 code
+ *
+ * Building blocks expanded inside fp_montgomery_reduce().  They refer to
+ * that function's locals by name: c, x, mp, mu, _c, tmpm and cy.
+ *
+ * MONT_START, MONT_FINI, LOOP_END: expand to nothing on this target.
+ * LOOP_START: mu = c[x] * mp.
+ * INNERMUL:   one digit step.  rdx:rax = mu * (*tmpm); cy and then _c[LO]
+ *             are added in with carries folded into rdx; the low word
+ *             lands in _c[LO] and rdx becomes the new cy.  tmpm is
+ *             post-incremented by the operand expression.
+ * INNERMUL8:  eight digit steps unrolled in a single asm statement.
+ *             %0/%2 hold the pointer _c, %1/%3 hold cy, %4 is mu and %5 is
+ *             the pointer tmpm.  Step i computes
+ *             tmpm[i] * mu + _c[i] + cy, stores the low word at offset 8*i
+ *             from _c and keeps the high word as cy.  r11 and r10 are
+ *             used to load the next tmpm digit and the next _c digit ahead
+ *             of the current multiply.  Neither pointer is advanced by the
+ *             asm; the caller adds 8 to _c and tmpm afterwards.
+ * PROPCARRY:  _c[LO] += cy, then cy = carry flag (0 or 1) via setb/movzbq.
+ *
+ * NOTE(review): INNERMUL8 stores through the pointer in %0 but its clobber
+ * list has no "memory" entry -- confirm the compiler cannot keep digits of
+ * c[] cached in registers across it.
+ */
+
+#define MONT_START 
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL                                          \
+asm(                                                      \
+   "movq %5,%%rax \n\t"                                   \
+   "mulq %4       \n\t"                                   \
+   "addq %1,%%rax \n\t"                                   \
+   "adcq $0,%%rdx \n\t"                                   \
+   "addq %%rax,%0 \n\t"                                   \
+   "adcq $0,%%rdx \n\t"                                   \
+   "movq %%rdx,%1 \n\t"                                   \
+:"=g"(_c[LO]), "=r"(cy)                                   \
+:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
+: "%rax", "%rdx", "%cc")
+
+#define INNERMUL8 \
+ asm(                  \
+ "movq 0(%5),%%rax    \n\t"  \
+ "movq 0(%2),%%r10    \n\t"  \
+ "movq 0x8(%5),%%r11  \n\t"  \
+ "mulq %4             \n\t"  \
+ "addq %%r10,%%rax    \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq 0x8(%2),%%r10  \n\t"  \
+ "addq %3,%%rax       \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq %%rax,0(%0)    \n\t"  \
+ "movq %%rdx,%1       \n\t"  \
+ \
+ "movq %%r11,%%rax    \n\t"  \
+ "movq 0x10(%5),%%r11 \n\t"  \
+ "mulq %4             \n\t"  \
+ "addq %%r10,%%rax    \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq 0x10(%2),%%r10 \n\t"  \
+ "addq %3,%%rax       \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq %%rax,0x8(%0)  \n\t"  \
+ "movq %%rdx,%1       \n\t"  \
+ \
+ "movq %%r11,%%rax    \n\t"  \
+ "movq 0x18(%5),%%r11 \n\t"  \
+ "mulq %4             \n\t"  \
+ "addq %%r10,%%rax    \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq 0x18(%2),%%r10 \n\t"  \
+ "addq %3,%%rax       \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq %%rax,0x10(%0) \n\t"  \
+ "movq %%rdx,%1       \n\t"  \
+ \
+ "movq %%r11,%%rax    \n\t"  \
+ "movq 0x20(%5),%%r11 \n\t"  \
+ "mulq %4             \n\t"  \
+ "addq %%r10,%%rax    \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq 0x20(%2),%%r10 \n\t"  \
+ "addq %3,%%rax       \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq %%rax,0x18(%0) \n\t"  \
+ "movq %%rdx,%1       \n\t"  \
+ \
+ "movq %%r11,%%rax    \n\t"  \
+ "movq 0x28(%5),%%r11 \n\t"  \
+ "mulq %4             \n\t"  \
+ "addq %%r10,%%rax    \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq 0x28(%2),%%r10 \n\t"  \
+ "addq %3,%%rax       \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq %%rax,0x20(%0) \n\t"  \
+ "movq %%rdx,%1       \n\t"  \
+ \
+ "movq %%r11,%%rax    \n\t"  \
+ "movq 0x30(%5),%%r11 \n\t"  \
+ "mulq %4             \n\t"  \
+ "addq %%r10,%%rax    \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq 0x30(%2),%%r10 \n\t"  \
+ "addq %3,%%rax       \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq %%rax,0x28(%0) \n\t"  \
+ "movq %%rdx,%1       \n\t"  \
+ \
+ "movq %%r11,%%rax    \n\t"  \
+ "movq 0x38(%5),%%r11 \n\t"  \
+ "mulq %4             \n\t"  \
+ "addq %%r10,%%rax    \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq 0x38(%2),%%r10 \n\t"  \
+ "addq %3,%%rax       \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq %%rax,0x30(%0) \n\t"  \
+ "movq %%rdx,%1       \n\t"  \
+ \
+ "movq %%r11,%%rax    \n\t"  \
+ "mulq %4             \n\t"  \
+ "addq %%r10,%%rax    \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "addq %3,%%rax       \n\t"  \
+ "adcq $0,%%rdx       \n\t"  \
+ "movq %%rax,0x38(%0) \n\t"  \
+ "movq %%rdx,%1       \n\t"  \
+ \
+:"=r"(_c), "=r"(cy)                    \
+: "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
+: "%rax", "%rdx", "%r10", "%r11", "%cc")
+
+
+#define PROPCARRY                           \
+asm(                                        \
+   "addq   %1,%0    \n\t"                   \
+   "setb   %%al     \n\t"                   \
+   "movzbq %%al,%1 \n\t"                    \
+:"=g"(_c[LO]), "=r"(cy)                     \
+:"0"(_c[LO]), "1"(cy)                       \
+: "%rax", "%cc")
+
+/******************************************************************/
+#elif defined(TFM_SSE2)  
+/* SSE2 code (assumes 32-bit fp_digits)
+ *
+ * Building blocks expanded inside fp_montgomery_reduce().  They refer to
+ * that function's locals by name: c, x, mp, _c, tmpm and cy.  The C
+ * variable mu is not assigned on this target; Mu lives in mm1 instead.
+ *
+ * MONT_START: loads mp into mm2.
+ * LOOP_START: mm1 = c[x], mm3 = 0, then mm1 = mm1 * mm2 (pmuludq).
+ * INNERMUL:   mm3 += _c[LO]; mm3 += (*tmpm) * mm1; the low 32 bits of mm3
+ *             are stored to _c[LO] and mm3 is shifted right by 32 so it
+ *             carries into the next digit.  tmpm is post-incremented by
+ *             the operand expression.  Note the macro body already ends
+ *             with a semicolon.
+ * LOOP_END:   copies the low 32 bits of mm3 into cy.
+ * PROPCARRY:  _c[LO] += cy, then cy = carry flag (0 or 1) via setb/movzbl.
+ * MONT_FINI:  executes emms.
+ *
+ * NOTE(review): the asm statements that use mm0-mm4 list no clobbers while
+ * register contents are carried from one asm() statement to the next --
+ * confirm the compiler cannot touch the MMX/x87 registers in between.
+ */
+/* MMX register assignments (the code below uses mm0-mm4, not xmm registers):
+ * mm0  *tmpm++, then Mu * (*tmpm++)
+ * mm1  c[x], then Mu
+ * mm2  mp
+ * mm3  cy
+ * mm4  _c[LO]
+ */
+
+#define MONT_START \
+   asm("movd %0,%%mm2"::"g"(mp))
+
+#define MONT_FINI \
+   asm("emms")
+
+#define LOOP_START          \
+asm(                        \
+"movd %0,%%mm1        \n\t" \
+"pxor %%mm3,%%mm3     \n\t" \
+"pmuludq %%mm2,%%mm1  \n\t" \
+:: "g"(c[x]))
+
+/* pmuludq on mmx registers does a 32x32->64 multiply. */
+#define INNERMUL               \
+asm(                           \
+   "movd %1,%%mm4        \n\t" \
+   "movd %2,%%mm0        \n\t" \
+   "paddq %%mm4,%%mm3    \n\t" \
+   "pmuludq %%mm1,%%mm0  \n\t" \
+   "paddq %%mm0,%%mm3    \n\t" \
+   "movd %%mm3,%0        \n\t" \
+   "psrlq $32, %%mm3     \n\t" \
+:"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
+
+#define LOOP_END \
+asm( "movd %%mm3,%0  \n" :"=r"(cy))
+
+#define PROPCARRY                           \
+asm(                                        \
+   "addl   %1,%0    \n\t"                   \
+   "setb   %%al     \n\t"                   \
+   "movzbl %%al,%1 \n\t"                    \
+:"=g"(_c[LO]), "=r"(cy)                     \
+:"0"(_c[LO]), "1"(cy)                       \
+: "%eax", "%cc")
+
+/******************************************************************/
+#elif defined(TFM_ARM)
+   /* ARMv4 code
+    *
+    * Building blocks expanded inside fp_montgomery_reduce().  They refer
+    * to that function's locals by name: c, x, mp, mu, _c, tmpm and cy.
+    *
+    * MONT_START, MONT_FINI, LOOP_END: expand to nothing on this target.
+    * LOOP_START: mu = c[x] * mp.
+    * INNERMUL:   r0 = _c[0] + cy with cy replaced by the carry out (1 or
+    *             0), then UMLAL accumulates mu * (*tmpm) into the pair
+    *             cy:r0; r0 is stored back to _c[0].  tmpm is
+    *             post-incremented by the operand expression.
+    * PROPCARRY:  _c[0] += cy, then cy = carry out (1 or 0).
+    *
+    * r0 is used as scratch and listed as a clobber.  Both asm macros
+    * already end with a semicolon.
+    */
+
+#define MONT_START 
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL                    \
+asm(                                \
+    " LDR    r0,%1            \n\t" \
+    " ADDS   r0,r0,%0         \n\t" \
+    " MOVCS  %0,#1            \n\t" \
+    " MOVCC  %0,#0            \n\t" \
+    " UMLAL  r0,%0,%3,%4      \n\t" \
+    " STR    r0,%1            \n\t" \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");
+
+#define PROPCARRY                  \
+asm(                               \
+    " LDR   r0,%1            \n\t" \
+    " ADDS  r0,r0,%0         \n\t" \
+    " STR   r0,%1            \n\t" \
+    " MOVCS %0,#1            \n\t" \
+    " MOVCC %0,#0            \n\t" \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");
+
+#elif defined(TFM_PPC32)
+
+/* PPC32
+ *
+ * Building blocks expanded inside fp_montgomery_reduce().  They refer to
+ * that function's locals by name: c, x, mp, mu, _c, tmpm and cy.
+ *
+ * MONT_START, MONT_FINI, LOOP_END: expand to nothing on this target.
+ * LOOP_START: mu = c[x] * mp.
+ * INNERMUL:   r17:r16 = mu * (*tmpm) (mulhwu / mullw); cy is added to r16
+ *             and then _c[0] (loaded into r18) is added to r16, each
+ *             carry being folded into the high word with addze; r16 is
+ *             stored to _c[0] and the high word ends up in cy.  tmpm is
+ *             post-incremented by the operand expression.
+ * PROPCARRY:  _c[0] += cy (addc), then cy is cleared with xor and set to
+ *             the carry bit with addze.
+ *
+ * r16, r17 and r18 are fixed scratch registers and are listed as
+ * clobbers.  Both asm macros already end with a semicolon.
+ */
+#define MONT_START 
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL                     \
+asm(                                 \
+   " mullw    r16,%3,%4       \n\t"  \
+   " mulhwu   r17,%3,%4       \n\t"  \
+   " addc     r16,r16,%0      \n\t"  \
+   " addze    r17,r17         \n\t"  \
+   " lwz      r18,%1          \n\t"  \
+   " addc     r16,r16,r18     \n\t"  \
+   " addze    %0,r17          \n\t"  \
+   " stw      r16,%1          \n\t"  \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r16", "r17", "r18","%cc");
+
+#define PROPCARRY                    \
+asm(                                 \
+   " lwz      r16,%1          \n\t"  \
+   " addc     r16,r16,%0      \n\t"  \
+   " stw      r16,%1          \n\t"  \
+   " xor      %0,%0,%0        \n\t"  \
+   " addze    %0,%0           \n\t"  \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","%cc");
+
+/******************************************************************/
+#else
+
+/* ISO C code
+ *
+ * Portable fallback used when none of the TFM_* assembler targets above is
+ * defined.  The macros are expanded inside fp_montgomery_reduce() and
+ * refer to that function's locals by name: c, x, mp, mu, _c, tmpm and cy.
+ *
+ * MONT_START, MONT_FINI, LOOP_END: expand to nothing.
+ * LOOP_START: mu = c[x] * mp.
+ * INNERMUL:   forms t = _c[0] + cy + mu * (*tmpm) in fp_word arithmetic,
+ *             assigns t to _c[0] and sets cy = t >> DIGIT_BIT.  tmpm is
+ *             post-incremented exactly once per expansion.
+ * PROPCARRY:  _c[0] += cy; cy becomes 1 when the new digit is smaller
+ *             than the carry that was added (the addition wrapped),
+ *             otherwise 0.
+ */
+#define MONT_START 
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL                                      \
+   do { fp_word t;                                    \
+   _c[0] = t  = ((fp_word)_c[0] + (fp_word)cy) +      \
+                (((fp_word)mu) * ((fp_word)*tmpm++)); \
+   cy = (t >> DIGIT_BIT);                             \
+   } while (0)
+
+#define PROPCARRY \
+   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
+
+#endif
+/******************************************************************/
+
+
+#define LO  0   /* index used as _c[LO] by the x86-32, x86-64 and SSE2 macros */
+#define HI  1   /* not referenced anywhere else in this file */
+#define CY  2   /* not referenced anywhere else in this file */
+
+/* computes x/R == x (mod N) via Montgomery Reduction
+ *
+ * a   [in/out] value to reduce; overwritten with the reduced result
+ * m   modulus; m->used and m->dp are read here and m is handed to
+ *     fp_cmp_mag()/s_fp_sub() for the final correction
+ * mp  per-modulus constant multiplied with c[x] to form Mu each round
+ *     (presumably the rho produced by fp_montgomery_setup -- confirm)
+ *
+ * The work is done in a local digit buffer c[].  Round x adds Mu * m,
+ * shifted up by x digits, into c[] and then ripples the leftover carry
+ * word upwards until it dies out.  After m->used rounds the digits
+ * c[pa .. 2*pa] are copied back into a, a is clamped, and m is subtracted
+ * once if a is not smaller than m.
+ *
+ * If m->used exceeds FP_SIZE/2 the function returns without touching a,
+ * because the rounds would otherwise index c[] (and a->dp on copy-out)
+ * beyond the digits they were sized for.
+ */
+void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
+{
+   /* three digits of headroom: with pa == FP_SIZE/2 the zero fill below
+    * reaches index 2*pa+2 and the copy-out reads index 2*pa */
+   fp_digit c[FP_SIZE+3], *_c, *tmpm, mu;
+   int      oldused, x, y, pa;
+
+   /* bail if too large: nothing below bounds-checks its indices into c[] */
+   if (m->used > (FP_SIZE/2)) {
+      return;
+   }
+
+#if defined(USE_MEMSET)
+   /* now zero the buff */
+   memset(c, 0, sizeof c);
+#endif
+   pa = m->used;
+
+   /* copy the input */
+   oldused = a->used;
+   for (x = 0; x < oldused; x++) {
+       c[x] = a->dp[x];
+   }
+#if !defined(USE_MEMSET)
+   /* clear only the digits the rounds below can reach */
+   for (; x < 2*pa+3; x++) {
+       c[x] = 0;
+   }
+#endif
+   MONT_START;
+
+   for (x = 0; x < pa; x++) {
+       fp_digit cy = 0;
+       /* get Mu for this round */
+       LOOP_START;
+       _c   = c + x;
+       tmpm = m->dp;
+       y = 0;
+       #if defined(TFM_X86_64)
+           /* eight digits per step; INNERMUL8 does not advance the pointers */
+           for (; y < (pa & ~7); y += 8) {
+              INNERMUL8;
+              _c   += 8;
+              tmpm += 8;
+           }
+       #endif
+
+       /* remaining digits one at a time; INNERMUL post-increments tmpm */
+       for (; y < pa; y++) {
+          INNERMUL;
+          ++_c;
+       }
+       LOOP_END;
+       /* ripple the carry upwards; in plain C: cy = cy > (*_c += cy) */
+       while (cy) {
+           PROPCARRY;
+           ++_c;
+       }
+  }
+
+  /* now copy out */
+  _c   = c + pa;
+  tmpm = a->dp;
+  for (x = 0; x < pa+1; x++) {
+     *tmpm++ = *_c++;
+  }
+
+  /* clear whatever digits of the old, longer value remain */
+  for (; x < oldused; x++)   {
+     *tmpm++ = 0;
+  }
+
+  MONT_FINI;
+
+  a->used = pa+1;
+  fp_clamp(a);
+
+  /* if A >= m then A = A - m */
+  if (fp_cmp_mag (a, m) != FP_LT) {
+    s_fp_sub (a, m, a);
+  }
+}
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
--- a/fp_montgomery_setup.c
+++ b/fp_montgomery_setup.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -42,3 +42,7 @@ int fp_montgomery_setup(fp_int *a, fp_digit *rho)
  return FP_OKAY;
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_mul.c
+++ b/fp_mul.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -24,19 +24,26 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
       inputs are not close to the next power of two.  That is, for example,
       if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications 
    */
-        if (y <= 4) {
-           fp_mul_comba4(A,B,C);
-        } else if (y <= 8) {
-           fp_mul_comba8(A,B,C);
-#if defined(TFM_LARGE)
-        } else if (y <= 16 && y >= 10) {
-           fp_mul_comba16(A,B,C);
+
+#ifdef TFM_SMALL_SET
+        if (y <= 16) {
+           fp_mul_comba_small(A,B,C);
+#elif defined(TFM_HUGE)
+        if (0) { 1;
 #endif
 #if defined(TFM_HUGE)
-        } else if (y <= 32 && y >= 24) {
+        } else if (y <= 32) {
           fp_mul_comba32(A,B,C);
+        } else if (y <= 48) {
+           fp_mul_comba48(A,B,C);
+        } else if (y <= 64) {
+           fp_mul_comba64(A,B,C);
 #endif
+#if !defined(TFM_HUGE) && !defined(TFM_SMALL_SET)
+        {
+#else
        } else {
+#endif
           fp_mul_comba(A,B,C);
        }
    } else {
@ -44,7 +51,7 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)

           if A = ab and B = cd for ||a|| = r we need to solve 

-           ac*r^2 + (-(a-b)(c-d) + ac + bd)*r + bd
+           ac*r^2 + ((a+b)(c+d) - (ac + bd))*r + bd

           So we solve for the three products then we form the final result with careful shifting 
           and addition.
@ -72,7 +79,7 @@ Obvious points of optimization
        } else {
           t1.used = 0;
        }
-        t1.sign = A->sign;
+        t1.sign = 0;

 //        fp_copy(B, &t2); fp_rshd(&t2, r); 
        for (s = 0; s < B->used - r; s++) {
@ -86,7 +93,7 @@ Obvious points of optimization
        } else {
           t2.used = 0;
        }
-        t2.sign = B->sign;
+        t2.sign = 0;

        fp_copy(&t1, &amb); fp_copy(&t2, &cmd);
        fp_zero(&ac);
@ -100,7 +107,7 @@ Obvious points of optimization
            t2.dp[s] = B->dp[s];
        }
        for (; s < FP_SIZE; s++) {
-            t1.dp[s]   = 0; 
+            t1.dp[s] = 0; 
            t2.dp[s] = 0; 
        }
        t1.used = r;
@ -108,18 +115,17 @@ Obvious points of optimization
        fp_clamp(&t1);
        fp_clamp(&t2);
        
-        fp_sub(&amb, &t1, &amb); fp_sub(&cmd, &t2, &cmd);
+        s_fp_add(&amb, &t1, &amb); s_fp_add(&cmd, &t2, &cmd);
        fp_zero(&bd);
        fp_mul(&t1, &t2, &bd);

-        /* now get the (a-b)(c-d) term */
+        /* now get the (a+b)(c+d) term */
        fp_zero(&comp);
        fp_mul(&amb, &cmd, &comp);

        /* now solve the system, do the middle term first */
-        comp.sign ^= 1;
-        fp_add(&comp, &ac, &comp);
-        fp_add(&comp, &bd, &comp);
+        s_fp_sub(&comp, &ac, &comp);
+        s_fp_sub(&comp, &bd, &comp);
        fp_lshd(&comp, r);
  
        /* leading term */
@ -134,3 +140,7 @@ Obvious points of optimization
    }
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_mul_2.c
+++ b/fp_mul_2.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -61,3 +61,7 @@ void fp_mul_2(fp_int * a, fp_int * b)
  b->sign = a->sign;
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_mul_2d.c
+++ b/fp_mul_2d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -41,3 +41,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
   fp_clamp(c);
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_mul_comba.c
+++ b/fp_mul_comba.c
--- a/fp_mul_d.c
+++ b/fp_mul_d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -34,3 +34,7 @@ void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
   fp_clamp(c);
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_mulmod.c
+++ b/fp_mulmod.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>
 /* d = a * b (mod c) */
@ -16,3 +16,7 @@ int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
  fp_mul(a, b, &tmp);
  return fp_mod(&tmp, c, d);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_prime_miller_rabin.c
+++ b/fp_prime_miller_rabin.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -71,3 +71,7 @@ void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result)
  /* probably prime now */
  *result = FP_YES;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_prime_random_ex.c
+++ b/fp_prime_random_ex.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -95,3 +95,7 @@ error:
   free(tmp);
   return err;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_radix_size.c
+++ b/fp_radix_size.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -47,3 +47,7 @@ int fp_radix_size(fp_int *a, int radix, int *size)
  return FP_OKAY;

 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_read_radix.c
+++ b/fp_read_radix.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -64,3 +64,7 @@ int fp_read_radix(fp_int *a, char *str, int radix)
  }
  return FP_OKAY;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_read_signed_bin.c
+++ b/fp_read_signed_bin.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -21,3 +21,7 @@ void fp_read_signed_bin(fp_int *a, unsigned char *b, int c)
     a->sign = FP_NEG;
  }
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_read_unsigned_bin.c
+++ b/fp_read_unsigned_bin.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -22,3 +22,7 @@ void fp_read_unsigned_bin(fp_int *a, unsigned char *b, int c)
  }
  fp_clamp (a);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_reverse.c
+++ b/fp_reverse.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -25,3 +25,7 @@ void bn_reverse (unsigned char *s, int len)
    --iy;
  }
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_rshd.c
+++ b/fp_rshd.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -34,3 +34,7 @@ void fp_rshd(fp_int *a, int x)
   fp_clamp(a);
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_s_rmap.c
+++ b/fp_s_rmap.c
@ -5,9 +5,13 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

 /* chars used in radix conversions */
 const char *fp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_set.c
+++ b/fp_set.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -15,3 +15,7 @@ void fp_set(fp_int *a, fp_digit b)
   a->dp[0] = b;
   a->used  = b ? 1 : 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_signed_bin_size.c
+++ b/fp_signed_bin_size.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -13,3 +13,7 @@ int fp_signed_bin_size(fp_int *a)
 {
  return 1 + fp_unsigned_bin_size (a);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_sqr.c
+++ b/fp_sqr.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -17,21 +17,26 @@ void fp_sqr(fp_int *A, fp_int *B)

    y = A->used;
    if (y <= 64) { 
-        if (y <= 4) {
-           fp_sqr_comba4(A,B);
-        } else if (y <= 8) {
-           fp_sqr_comba8(A,B);
-#if defined(TFM_LARGE)
-        } else if (y <= 16 && y >= 12) {
-           fp_sqr_comba16(A,B);
+
+#if defined(TFM_SMALL_SET)
+        if (y <= 16) {
+           fp_sqr_comba_small(A,B);
+#elif defined(TFM_HUGE)
+        if (0) { 1; 
 #endif
 #if defined(TFM_HUGE)
-        } else if (y <= 32 && y >= 20) {
+        } else if (y <= 32) {
           fp_sqr_comba32(A,B);
-        } else if (y <= 64 && y >= 48) {
+        } else if (y <= 48) {
+           fp_sqr_comba48(A,B);
+        } else if (y <= 64) {
           fp_sqr_comba64(A,B);
 #endif
+#if !defined(TFM_SMALL_SET) && !defined(TFM_HUGE)
+        {
+#else
        } else {
+#endif
           fp_sqr_comba(A, B);
        }
       
@ -109,3 +114,7 @@ Obvious points of optimization
    }
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_sqr_comba.c
+++ b/fp_sqr_comba.c
--- a/fp_sqr_comba_generic.c
+++ b/fp_sqr_comba_generic.c
@ -1,3 +1,13 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ * 
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ * 
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+
 /* generic comba squarer */
 void fp_sqr_comba(fp_int *A, fp_int *B)
 {
@ -73,3 +83,7 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
     fp_copy(dst, B);
  }
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_sqrmod.c
+++ b/fp_sqrmod.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -17,3 +17,7 @@ int fp_sqrmod(fp_int *a, fp_int *b, fp_int *c)
  fp_sqr(a, &tmp);
  return fp_mod(&tmp, b, c);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_sub.c
+++ b/fp_sub.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -44,3 +44,7 @@ void fp_sub(fp_int *a, fp_int *b, fp_int *c)
  }
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_sub_d.c
+++ b/fp_sub_d.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -16,3 +16,7 @@ void fp_sub_d(fp_int *a, fp_digit b, fp_int *c)
   fp_set(&tmp, b);
   fp_sub(a, &tmp, c);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_submod.c
+++ b/fp_submod.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -18,3 +18,7 @@ int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
  return fp_mod(&tmp, c, d);
 }

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_to_signed_bin.c
+++ b/fp_to_signed_bin.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -14,3 +14,7 @@ void fp_to_signed_bin(fp_int *a, unsigned char *b)
  fp_to_unsigned_bin (a, b + 1);
  b[0] = (unsigned char) ((a->sign == FP_ZPOS) ? 0 : 1);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_to_unsigned_bin.c
+++ b/fp_to_unsigned_bin.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -23,3 +23,7 @@ void fp_to_unsigned_bin(fp_int *a, unsigned char *b)
  }
  bn_reverse (b, x);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_toradix.c
+++ b/fp_toradix.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -53,3 +53,7 @@ int fp_toradix(fp_int *a, char *str, int radix)
  *str = '\0';
  return FP_OKAY;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/fp_unsigned_bin_size.c
+++ b/fp_unsigned_bin_size.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -14,3 +14,7 @@ int fp_unsigned_bin_size(fp_int *a)
  int     size = fp_count_bits (a);
  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/53
+++ b/53
@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
 #speed
 CFLAGS += -fomit-frame-pointer

-VERSION=0.03
+VERSION=0.04

 default: libtfm.a

@ -42,9 +42,37 @@ fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_r
 \
 fp_ident.o 

-libtfm.a: $(OBJECTS)
-	$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
-	ranlib libtfm.a
+HEADERS=tfm.h
+
+ifndef LIBPATH
+   LIBPATH=/usr/lib
+endif
+
+ifndef INCPATH
+   INCPATH=/usr/include
+endif
+
+# group that owns the installed files: set TFM_GROUP to override (default wheel)
+TFM_GROUP ?= wheel
+GROUP=$(TFM_GROUP)
+
+# user that owns the installed files: set TFM_USER to override (default root)
+TFM_USER ?= root
+USER=$(TFM_USER)
+
+ifndef LIBNAME
+   LIBNAME=libtfm.a
+endif
+
+$(LIBNAME): $(OBJECTS)
+	$(AR) $(ARFLAGS) $(LIBNAME) $(OBJECTS)
+	ranlib $(LIBNAME)
+
+install: $(LIBNAME)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)

 mtest/mtest: mtest/mtest.c
 	cd mtest ; make mtest
@ -52,8 +80,14 @@ mtest/mtest: mtest/mtest.c
 test: libtfm.a demo/test.o mtest/mtest
 	$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test

+timing: libtfm.a demo/test.o
+	$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test
+	
 stest: libtfm.a demo/stest.o 
-	$(CC) demo/stest.o libtfm.a -o stest
+	$(CC) $(CFLAGS) demo/stest.o libtfm.a -o stest
+
+rsatest: libtfm.a demo/rsa.o
+	$(CC) $(CFLAGS) demo/rsa.o libtfm.a -o rsatest

 docdvi: tfm.tex
 	touch tfm.ind
@ -68,10 +102,15 @@ docs: docdvi
 	mv -f tfm.pdf doc

 clean:
-	rm -f $(OBJECTS) *.a demo/*.o test tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc stest *~
+	rm -f $(OBJECTS) *.a demo/*.o test tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc stest *~ rsatest *.gcda *.gcno demo/*.gcda demo/*.gcno mtest/*.gcno mtest/*.gcda
 	cd mtest ; make clean

-zipup: docs clean
+no_oops: clean
+	cd .. ; cvs commit
+	echo Scanning for scratch/dirty files
+	find . -type f | grep -v CVS | xargs -n 1 bash mess.sh
+
+zipup: no_oops docs clean
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf tfm* tomsfastmath-$(VERSION) ; mkdir tomsfastmath-$(VERSION) ; \
 	cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \
--- a/mess.sh
+++ b/mess.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+# Exit 0 if "cvs log" succeeds for the file given as $1; otherwise report it and exit 1.
+if cvs log "$1" >/dev/null 2>&1; then exit 0; else echo "$1 shouldn't be here" ; exit 1; fi
+
--- a/mtest/makefile
+++ b/mtest/makefile
@ -3,7 +3,7 @@ CFLAGS += -Wall -W -O3
 default: mtest

 mtest: mtest.o
-	$(CC) mtest.o -ltommath -o mtest
+	$(CC) $(CFLAGS) mtest.o -ltommath -o mtest

 clean:
 	rm -f *.o mtest *~
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -60,7 +60,7 @@ void rand_num2(mp_int *a)
   int n, size;
   unsigned char buf[2048];

-   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 32;
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 256;
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
   while (buf[1] == 0) buf[1] = fgetc(rng);
@ -317,3 +317,7 @@ int main(void)
   fclose(rng);
   return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
--- a/s_fp_add.c
+++ b/s_fp_add.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -35,3 +35,7 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
  }
  fp_clamp(c);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/s_fp_sub.c
+++ b/s_fp_sub.c
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm.h>

@ -29,3 +29,7 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
  }
  fp_clamp(c);
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/tfm.aux
+++ b/tfm.aux
@ -0,0 +1,56 @@
+\relax 
+\ifx\hyper@anchor\@undefined
+\global \let \oldcontentsline\contentsline
+\gdef \contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
+\global \let \oldnewlabel\newlabel
+\gdef \newlabel#1#2{\newlabelxx{#1}#2}
+\gdef \newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
+\AtEndDocument{\let \contentsline\oldcontentsline
+\let \newlabel\oldnewlabel}
+\else
+\global \let \hyper@last\relax 
+\fi
+
+\@writefile{toc}{\contentsline {chapter}{\numberline {1}Introduction}{1}{chapter.1}}
+\@writefile{lof}{\addvspace {10\p@ }}
+\@writefile{lot}{\addvspace {10\p@ }}
+\@writefile{toc}{\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.2}License}{2}{section.1.2}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2}}
+\@writefile{toc}{\contentsline {subsubsection}{x86--32}{3}{section*.3}}
+\@writefile{toc}{\contentsline {subsubsection}{SSE2}{3}{section*.4}}
+\@writefile{toc}{\contentsline {subsubsection}{x86--64}{3}{section*.5}}
+\@writefile{toc}{\contentsline {subsubsection}{ARM}{3}{section*.6}}
+\@writefile{toc}{\contentsline {subsubsection}{PPC32}{3}{section*.7}}
+\@writefile{toc}{\contentsline {subsubsection}{Future Releases}{4}{section*.8}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Recommended Build Modes}}{4}{figure.1.1}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2}}
+\@writefile{lof}{\addvspace {10\p@ }}
+\@writefile{lot}{\addvspace {10\p@ }}
+\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3}}
+\@writefile{lof}{\addvspace {10\p@ }}
+\@writefile{lot}{\addvspace {10\p@ }}
+\@writefile{toc}{\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4}}
+\@writefile{lof}{\addvspace {10\p@ }}
+\@writefile{lot}{\addvspace {10\p@ }}
+\newlabel{chap:asmops}{{4}{11}{Porting TomsFastMath\relax }{chapter.4}{}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4}}
--- a/tfm.dvi
+++ b/tfm.dvi
--- a/tfm.h
+++ b/tfm.h
@ -5,7 +5,7 @@
 *
 * This project is public domain and free for all purposes.
 * 
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
 */
 #ifndef TFM_H_
 #define TFM_H_
@ -16,28 +16,44 @@
 #include <ctype.h>
 #include <limits.h>

-#undef MIN
-#define MIN(x,y) ((x)<(y)?(x):(y))
-#undef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
+#ifndef MIN
+   #define MIN(x,y) ((x)<(y)?(x):(y))
+#endif

-/* do we want large code? */
-#define TFM_LARGE
+#ifndef MAX
+   #define MAX(x,y) ((x)>(y)?(x):(y))
+#endif

-/* do we want huge code (implies large)?  The answer is, yes. */
+/* externally define this symbol to ignore the default settings, useful for changing the build from the make process */
+#ifndef TFM_ALREADY_SET
+
+/* do we want the large set of small multiplications ? 
+   Enable these if you are going to be doing a lot of small (<= 16 digit) multiplications say in ECC
+   Or if you're on a 64-bit machine doing RSA as a 1024-bit integer == 16 digits ;-)
+ */
+#define TFM_SMALL_SET
+
+/* do we want huge code 
+   Enable these if you are doing 32, 48 or 64 digit multiplications (useful for RSA)
+   Less important on 64-bit machines as 32 digits == 2048 bits
+ */
 #define TFM_HUGE

-/* imply TFM_LARGE as required */
-#if defined(TFM_HUGE)
-   #if !defined(TFM_LARGE)
-      #define TFM_LARGE
-   #endif
+/* do we want some overflow checks
+   Not required if you make sure your numbers are within range (e.g. by default a modulus for fp_exptmod() can only be up to 2048 bits long)
+ */
+/* #define TFM_CHECK */
+
+/* Is the target a P4 Prescott
+ */
+/* #define TFM_PRESCOTT */
+
 #endif

 /* Max size of any number in bits.  Basically the largest size you will be multiplying
 * should be half [or smaller] of FP_MAX_SIZE-four_digit
 *
- * You can externally define this or it defaults to 4096-bits.
+ * You can externally define this or it defaults to 4096-bits [allowing multiplications up to 2048x2048 bits]
 */
 #ifndef FP_MAX_SIZE
   #define FP_MAX_SIZE           (4096+(4*DIGIT_BIT))
@ -76,9 +92,9 @@
   #endif
 #endif

-/* make sure we're 32-bit for x86-32/sse/arm */
-#if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT)
-   #warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining)
+/* make sure we're 32-bit for x86-32/sse/arm/ppc32 */
+#if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM) || defined(TFM_PPC32)) && defined(FP_64BIT)
+   #warning x86-32, SSE2 and ARM, PPC32 optimizations require 32-bit digits (undefining)
   #undef FP_64BIT
 #endif

@ -104,6 +120,12 @@
   #endif
   #define TFM_ASM
 #endif
+#ifdef TFM_PPC32
+   #ifdef TFM_ASM
+      #error TFM_ASM already defined!
+   #endif
+   #define TFM_ASM
+#endif

 /* we want no asm? */
 #ifdef TFM_NO_ASM
@ -111,6 +133,7 @@
   #undef TFM_X86_64
   #undef TFM_SSE2
   #undef TFM_ARM
+   #undef TFM_PPC32
   #undef TFM_ASM   
 #endif

@ -179,8 +202,8 @@ const char *fp_ident(void);

 /* zero/even/odd ? */
 #define fp_iszero(a) (((a)->used == 0) ? FP_YES : FP_NO)
-#define fp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? FP_YES : FP_NO)
-#define fp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? FP_YES : FP_NO)
+#define fp_iseven(a) (((a)->used >= 0 && (((a)->dp[0] & 1) == 0)) ? FP_YES : FP_NO)
+#define fp_isodd(a)  (((a)->used > 0  && (((a)->dp[0] & 1) == 1)) ? FP_YES : FP_NO)

 /* set to a small digit */
 void fp_set(fp_int *a, fp_digit b);
@ -335,24 +358,22 @@ void bn_reverse(unsigned char *s, int len);
 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
 #ifdef TFM_HUGE
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
+void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C);
+void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C);
 #endif
-#ifdef TFM_LARGE
-void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C);
-#endif
-void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
-void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
+void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);

-void fp_sqr_comba(fp_int *A, fp_int *B);
-void fp_sqr_comba4(fp_int *A, fp_int *B);
-void fp_sqr_comba8(fp_int *A, fp_int *B);
-#ifdef TFM_LARGE
-void fp_sqr_comba16(fp_int *A, fp_int *B);
-#endif
+void fp_sqr_comba_small(fp_int *A, fp_int *B);
 #ifdef TFM_HUGE
 void fp_sqr_comba32(fp_int *A, fp_int *B);
+void fp_sqr_comba48(fp_int *A, fp_int *B);
 void fp_sqr_comba64(fp_int *A, fp_int *B);
 #endif
 extern const char *fp_s_rmap;

 #endif

+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/tfm.idx
+++ b/tfm.idx
@ -0,0 +1,29 @@
+\indexentry{fp\_init|hyperpage}{6}
+\indexentry{fp\_set|hyperpage}{6}
+\indexentry{fp\_init\_copy|hyperpage}{6}
+\indexentry{fp\_iszero|hyperpage}{7}
+\indexentry{fp\_iseven|hyperpage}{7}
+\indexentry{fp\_isodd|hyperpage}{7}
+\indexentry{fp\_neg|hyperpage}{7}
+\indexentry{fp\_abs|hyperpage}{7}
+\indexentry{fp\_cmp|hyperpage}{8}
+\indexentry{fp\_cmp\_mag|hyperpage}{8}
+\indexentry{fp\_lshd|hyperpage}{8}
+\indexentry{fp\_rshd|hyperpage}{8}
+\indexentry{fp\_div\_2d|hyperpage}{8}
+\indexentry{fp\_mod\_2d|hyperpage}{8}
+\indexentry{fp\_mul\_2d|hyperpage}{8}
+\indexentry{fp\_div\_2|hyperpage}{8}
+\indexentry{fp\_mul\_2|hyperpage}{8}
+\indexentry{fp\_cnt\_lsb|hyperpage}{8}
+\indexentry{fp\_add|hyperpage}{9}
+\indexentry{fp\_sub|hyperpage}{9}
+\indexentry{fp\_mul|hyperpage}{9}
+\indexentry{fp\_sqr|hyperpage}{9}
+\indexentry{fp\_div|hyperpage}{9}
+\indexentry{fp\_mod|hyperpage}{9}
+\indexentry{fp\_exptmod|hyperpage}{9}
+\indexentry{fp\_invmod|hyperpage}{9}
+\indexentry{fp\_gcd|hyperpage}{9}
+\indexentry{fp\_lcm|hyperpage}{9}
+\indexentry{fp\_isprime|hyperpage}{10}
--- a/tfm.ilg
+++ b/tfm.ilg
@ -0,0 +1,6 @@
+This is makeindex, version 2.14 [02-Oct-2002] (kpathsea + Thai support).
+Scanning input file tfm.idx....done (29 entries accepted, 0 rejected).
+Sorting entries....done (137 comparisons).
+Generating output file tfm.ind....done (33 lines written, 0 warnings).
+Output written in tfm.ind.
+Transcript written in tfm.ilg.
--- a/tfm.ind
+++ b/tfm.ind
@ -0,0 +1,33 @@
+\begin{theindex}
+
+  \item fp\_abs, \hyperpage{7}
+  \item fp\_add, \hyperpage{9}
+  \item fp\_cmp, \hyperpage{8}
+  \item fp\_cmp\_mag, \hyperpage{8}
+  \item fp\_cnt\_lsb, \hyperpage{8}
+  \item fp\_div, \hyperpage{9}
+  \item fp\_div\_2, \hyperpage{8}
+  \item fp\_div\_2d, \hyperpage{8}
+  \item fp\_exptmod, \hyperpage{9}
+  \item fp\_gcd, \hyperpage{9}
+  \item fp\_init, \hyperpage{6}
+  \item fp\_init\_copy, \hyperpage{6}
+  \item fp\_invmod, \hyperpage{9}
+  \item fp\_iseven, \hyperpage{7}
+  \item fp\_isodd, \hyperpage{7}
+  \item fp\_isprime, \hyperpage{10}
+  \item fp\_iszero, \hyperpage{7}
+  \item fp\_lcm, \hyperpage{9}
+  \item fp\_lshd, \hyperpage{8}
+  \item fp\_mod, \hyperpage{9}
+  \item fp\_mod\_2d, \hyperpage{8}
+  \item fp\_mul, \hyperpage{9}
+  \item fp\_mul\_2, \hyperpage{8}
+  \item fp\_mul\_2d, \hyperpage{8}
+  \item fp\_neg, \hyperpage{7}
+  \item fp\_rshd, \hyperpage{8}
+  \item fp\_set, \hyperpage{6}
+  \item fp\_sqr, \hyperpage{9}
+  \item fp\_sub, \hyperpage{9}
+
+\end{theindex}
--- a/tfm.lof
+++ b/tfm.lof
@ -0,0 +1,5 @@
+\addvspace {10\p@ }
+\contentsline {figure}{\numberline {1.1}{\ignorespaces Recommended Build Modes}}{4}{figure.1.1}
+\addvspace {10\p@ }
+\addvspace {10\p@ }
+\addvspace {10\p@ }
--- a/tfm.log
+++ b/tfm.log
@ -0,0 +1,332 @@
+This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10)  23 JUL 2005 07:42
+entering extended mode
+**tfm
+(./tfm.tex
+LaTeX2e <2003/12/01>
+Babel <v3.8d> and hyphenation patterns for american, french, german, ngerman, b
+ahasa, basque, bulgarian, catalan, croatian, czech, danish, dutch, esperanto, e
+stonian, finnish, greek, icelandic, irish, italian, latin, magyar, norsk, polis
+h, portuges, romanian, russian, serbian, slovak, slovene, spanish, swedish, tur
+kish, ukrainian, nohyphenation, loaded.
+(/usr/share/texmf/tex/latex/base/book.cls
+Document Class: book 2004/02/16 v1.4f Standard LaTeX document class
+(/usr/share/texmf/tex/latex/base/bk10.clo
+File: bk10.clo 2004/02/16 v1.4f Standard LaTeX file (size option)
+)
+\c@part=\count79
+\c@chapter=\count80
+\c@section=\count81
+\c@subsection=\count82
+\c@subsubsection=\count83
+\c@paragraph=\count84
+\c@subparagraph=\count85
+\c@figure=\count86
+\c@table=\count87
+\abovecaptionskip=\skip41
+\belowcaptionskip=\skip42
+\bibindent=\dimen102
+)
+(/usr/share/texmf/tex/latex/hyperref/hyperref.sty
+Package: hyperref 2003/11/30 v6.74m Hypertext links for LaTeX
+
+(/usr/share/texmf/tex/latex/graphics/keyval.sty
+Package: keyval 1999/03/16 v1.13 key=value parser (DPC)
+\KV@toks@=\toks14
+)
+\@linkdim=\dimen103
+\Hy@linkcounter=\count88
+\Hy@pagecounter=\count89
+
+(/usr/share/texmf/tex/latex/hyperref/pd1enc.def
+File: pd1enc.def 2003/11/30 v6.74m Hyperref: PDFDocEncoding definition (HO)
+)
+(/usr/share/texmf/tex/latex/hyperref/hyperref.cfg
+File: hyperref.cfg 2002/06/06 v1.2 hyperref configuration of TeXLive and teTeX
+)
+Package hyperref Info: Hyper figures OFF on input line 1880.
+Package hyperref Info: Link nesting OFF on input line 1885.
+Package hyperref Info: Hyper index ON on input line 1888.
+Package hyperref Info: Plain pages ON on input line 1893.
+Package hyperref Info: Backreferencing OFF on input line 1900.
+
+Implicit mode ON; LaTeX internals redefined
+Package hyperref Info: Bookmarks ON on input line 2004.
+(/usr/share/texmf/tex/latex/html/url.sty
+Package: url 1999/03/02  ver 1.4  Verb mode for urls, email addresses, and file
+ names
+)
+LaTeX Info: Redefining \url on input line 2143.
+\Fld@menulength=\count90
+\Field@Width=\dimen104
+\Fld@charsize=\dimen105
+\Choice@toks=\toks15
+\Field@toks=\toks16
+Package hyperref Info: Hyper figures OFF on input line 2618.
+Package hyperref Info: Link nesting OFF on input line 2623.
+Package hyperref Info: Hyper index ON on input line 2626.
+Package hyperref Info: backreferencing OFF on input line 2633.
+Package hyperref Info: Link coloring OFF on input line 2638.
+\c@Item=\count91
+\c@Hfootnote=\count92
+)
+*hyperref using default driver hypertex*
+(/usr/share/texmf/tex/latex/hyperref/hypertex.def
+File: hypertex.def 2003/11/30 v6.74m Hyperref driver for HyperTeX specials
+)
+(/usr/share/texmf/tex/latex/base/makeidx.sty
+Package: makeidx 2000/03/29 v1.0m Standard LaTeX package
+)
+(/usr/share/texmf/tex/latex/amsfonts/amssymb.sty
+Package: amssymb 2002/01/22 v2.2d
+
+(/usr/share/texmf/tex/latex/amsfonts/amsfonts.sty
+Package: amsfonts 2001/10/25 v2.2f
+\@emptytoks=\toks17
+\symAMSa=\mathgroup4
+\symAMSb=\mathgroup5
+LaTeX Font Info:    Overwriting math alphabet `\mathfrak' in version `bold'
+(Font)                  U/euf/m/n --> U/euf/b/n on input line 132.
+))
+(/usr/share/texmf/tex/latex/graphics/color.sty
+Package: color 1999/02/16 v1.0i Standard LaTeX Color (DPC)
+
+(/usr/share/texmf/tex/latex/graphics/color.cfg
+File: color.cfg 2005/02/03 v1.3 color configuration of teTeX/TeXLive
+)
+Package color Info: Driver file: dvips.def on input line 125.
+
+(/usr/share/texmf/tex/latex/graphics/dvips.def
+File: dvips.def 1999/02/16 v3.0i Driver-dependant file (DPC,SPQR)
+)
+(/usr/share/texmf/tex/latex/graphics/dvipsnam.def
+File: dvipsnam.def 1999/02/16 v3.0i Driver-dependant file (DPC,SPQR)
+))
+(/usr/share/texmf/tex/latex/base/alltt.sty
+Package: alltt 1997/06/16 v2.0g defines alltt environment
+)
+(/usr/share/texmf/tex/latex/graphics/graphicx.sty
+Package: graphicx 1999/02/16 v1.0f Enhanced LaTeX Graphics (DPC,SPQR)
+
+(/usr/share/texmf/tex/latex/graphics/graphics.sty
+Package: graphics 2001/07/07 v1.0n Standard LaTeX Graphics (DPC,SPQR)
+
+(/usr/share/texmf/tex/latex/graphics/trig.sty
+Package: trig 1999/03/16 v1.09 sin cos tan (DPC)
+)
+(/usr/share/texmf/tex/latex/graphics/graphics.cfg
+File: graphics.cfg 2005/02/03 v1.3 graphics configuration of teTeX/TeXLive
+)
+Package graphics Info: Driver file: dvips.def on input line 80.
+)
+\Gin@req@height=\dimen106
+\Gin@req@width=\dimen107
+)
+(/usr/share/texmf/tex/latex/tools/layout.sty
+Package: layout 2000/09/25 v1.2c Show layout parameters
+\oneinch=\count93
+\cnt@paperwidth=\count94
+\cnt@paperheight=\count95
+\cnt@hoffset=\count96
+\cnt@voffset=\count97
+\cnt@textheight=\count98
+\cnt@textwidth=\count99
+\cnt@topmargin=\count100
+\cnt@oddsidemargin=\count101
+\cnt@evensidemargin=\count102
+\cnt@headheight=\count103
+\cnt@headsep=\count104
+\cnt@marginparsep=\count105
+\cnt@marginparwidth=\count106
+\cnt@marginparpush=\count107
+\cnt@footskip=\count108
+\fheight=\count109
+\ref@top=\count110
+\ref@hoffset=\count111
+\ref@voffset=\count112
+\ref@head=\count113
+\ref@body=\count114
+\ref@foot=\count115
+\ref@margin=\count116
+\ref@marginwidth=\count117
+\ref@marginpar=\count118
+\Interval=\count119
+\ExtraYPos=\count120
+\PositionX=\count121
+\PositionY=\count122
+\ArrowLength=\count123
+)
+\@indexfile=\write3
+\openout3 = `tfm.idx'.
+
+
+Writing index file tfm.idx
+(./tfm.aux)
+\openout1 = `tfm.aux'.
+
+LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line 49.
+LaTeX Font Info:    ... okay on input line 49.
+LaTeX Font Info:    Checking defaults for T1/cmr/m/n on input line 49.
+LaTeX Font Info:    ... okay on input line 49.
+LaTeX Font Info:    Checking defaults for OT1/cmr/m/n on input line 49.
+LaTeX Font Info:    ... okay on input line 49.
+LaTeX Font Info:    Checking defaults for OMS/cmsy/m/n on input line 49.
+LaTeX Font Info:    ... okay on input line 49.
+LaTeX Font Info:    Checking defaults for OMX/cmex/m/n on input line 49.
+LaTeX Font Info:    ... okay on input line 49.
+LaTeX Font Info:    Checking defaults for U/cmr/m/n on input line 49.
+LaTeX Font Info:    ... okay on input line 49.
+LaTeX Font Info:    Checking defaults for PD1/pdf/m/n on input line 49.
+LaTeX Font Info:    ... okay on input line 49.
+Package hyperref Info: Link coloring OFF on input line 49.
+ (/usr/share/texmf/tex/latex/hyperref/nameref.sty
+Package: nameref 2003/12/03 v2.21 Cross-referencing by name of section
+\c@section@level=\count124
+)
+LaTeX Info: Redefining \ref on input line 49.
+LaTeX Info: Redefining \pageref on input line 49.
+LaTeX Font Info:    Try loading font information for U+msa on input line 55.
+
+(/usr/share/texmf/tex/latex/amsfonts/umsa.fd
+File: umsa.fd 2002/01/19 v2.2g AMS font definitions
+)
+LaTeX Font Info:    Try loading font information for U+msb on input line 55.
+
+(/usr/share/texmf/tex/latex/amsfonts/umsb.fd
+File: umsb.fd 2002/01/19 v2.2g AMS font definitions
+) [1
+
+
+
+] [2] (./tfm.toc [3
+
+])
+\tf@toc=\write4
+\openout4 = `tfm.toc'.
+
+ [4]
+(./tfm.lof)
+\tf@lof=\write5
+\openout5 = `tfm.lof'.
+
+ [5
+
+] [6
+
+]
+Chapter 1.
+[1
+
+] [2] [3] [4]
+Chapter 2.
+
+Underfull \vbox (badness 7649) has occurred while \output is active []
+
+ [5
+
+]
+[6]
+Chapter 3.
+[7
+
+] [8] [9] [10]
+Chapter 4.
+[11
+
+] [12] [13]
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
+[]\OT1/cmtt/m/n/10 #define SQRADDSC(i, j)                                      
+                   \[] 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
+[]   \OT1/cmtt/m/n/10 do { fp_word t;                                          
+                   \[] 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
+[]      \OT1/cmtt/m/n/10 t =  ((fp_word)i) * ((fp_word)j);                     
+                   \[] 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
+[]      \OT1/cmtt/m/n/10 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;   
+                   \[] 
+ []
+
+
+Overfull \hbox (25.129pt too wide) in paragraph at lines 548--549
+\OT1/cmr/m/n/10 This com-putes a prod-uct and stores it in the ``sec-ondary'' c
+arry reg-is-ters $[]$. 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
+[]\OT1/cmtt/m/n/10 #define SQRADDAC(i, j)                                      
+                   \[] 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
+[]   \OT1/cmtt/m/n/10 do { fp_word t;                                          
+                   \[] 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
+[]   \OT1/cmtt/m/n/10 t = sc0 + ((fp_word)i) * ((fp_word)j);  sc0 = t;         
+                   \[] 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
+[]   \OT1/cmtt/m/n/10 t = sc1 + (t >> DIGIT_BIT);             sc1 = t; sc2 += t
+ >> DIGIT_BIT;     \[] 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566
+[]\OT1/cmtt/m/n/10 #define SQRADDDB                                            
+                   \[] 
+ []
+
+
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566
+[]   \OT1/cmtt/m/n/10 do { fp_word t;                                          
+                   \[] 
+ []
+
+
+Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
+[]   \OT1/cmtt/m/n/10 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t;        
+                                         \[] 
+ []
+
+
+Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
+[]   \OT1/cmtt/m/n/10 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BI
+T); c1 = t;                              \[] 
+ []
+
+
+Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
+[]   \OT1/cmtt/m/n/10 c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_B
+IT);                                     \[] 
+ []
+
+[14] [15] (./tfm.ind [16] [17
+
+
+]) (./tfm.aux) ) 
+Here is how much of TeX's memory you used:
+ 2712 strings out of 49501
+ 35892 string characters out of 426789
+ 81342 words of memory out of 1100000
+ 5856 multiletter control sequences out of 10000+15000
+ 15453 words of font info for 59 fonts, out of 400000 for 2000
+ 580 hyphenation exceptions out of 1000
+ 25i,9n,25p,195b,321s stack positions out of 1500i,500n,1500p,200000b,5000s
+
+Output written on tfm.dvi (23 pages, 49708 bytes).
--- a/tfm.tex
+++ b/tfm.tex
@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{TomsFastMath User Manual \\ v0.03}
+\title{TomsFastMath User Manual \\ v0.04}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text and library are all hereby placed in the public domain.  This book has been formatted for B5 
@ -143,6 +143,10 @@ TFM\_X86 and TFM\_SSE2 at the same time.   This mode only works with 32--bit dig
 mode fp\_digit is 32--bits and fp\_word is 64--bits.  While this mode will work on the AMD Athlon64 
 series of processors it is less efficient than the native ``x86--64'' mode and not recommended.

+There is an additional ``TFM\_PRESCOTT'' flag that you can define for P4 Prescott processors.  This causes
+the mul/sqr functions to use x86\_32 and the montgomery reduction to use SSE2 which is (so far) the fastest
+combination.  If you are using an older (e.g. Northwood) generation P4 don't define this.
+
 \subsubsection{x86--64}  The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires a 
 ``x86--64'' capable processor (Athlon64 and future Pentium processors).  It requires GCC to
 build and only works with 64--bit digits.  Note that by enabling this mode it will automatically
@ -150,12 +154,16 @@ enable 64--bit digits.  In this mode fp\_digit is 64--bits and fp\_word is 128--
 be autodetected when building with GCC to an ``x86--64'' target.  You can override this behaviour by defining
 TFM\_NO\_ASM.

-\subsubsection{ARM}  The ``ARM'' mode is defined by ``TFM\_ARM'' and requires a ARMv4 or higher
-processor.  It requires GCC and works with 32--bit digits.  In this mode fp\_digit is 32--bits and 
+\subsubsection{ARM}  The ``ARM'' mode is defined by ``TFM\_ARM'' and requires an ARMv4 or higher processor with the M instructions
+(enhanced multipliers).  It requires GCC and works with 32--bit digits.  In this mode fp\_digit is 32--bits and 
 fp\_word is 64--bits.

+\subsubsection{PPC32} The ``PPC32'' mode is defined by ``TFM\_PPC32'' and requires a standard PPC processor.  It doesn't 
+use altivec or other extensions so it should work on all compliant implementations of PPC.  It requires GCC and works
+with 32--bit digits.  In this mode fp\_digit is 32--bits and fp\_word is 64--bits.
+
 \subsubsection{Future Releases}  Future releases will support additional platform optimizations.
-Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline patches 
+Developers of MIPS and SPARC platforms are encouraged to submit GCC asm inline patches 
 (see chapter \ref{chap:asmops} for more information).

 \begin{figure}[here]
@ -165,8 +173,10 @@ Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline pat
 \hline \textbf{Processor} & \textbf{Recommended Mode} \\
 \hline All 32--bit x86 platforms  & TFM\_X86 \\
 \hline Pentium 4                  & TFM\_SSE2 \\
+\hline Pentium 4 Prescott         & TFM\_SSE2 + TFM\_PRESCOTT \\
 \hline Athlon64                   & TFM\_X86\_64 \\
-\hline ARMv4 or higher            & TFM\_ARM \\
+\hline ARMv4 or higher with M     & TFM\_ARM \\
+\hline G3/G4 (32-bit PPC)         & TFM\_PPC32 \\
 \hline &\\
 \hline x86--32 or x86--64 (with GCC) & Leave blank and let autodetect work \\
 \hline
@ -589,26 +599,26 @@ This computes the $\mu$ value for the inner loop.  You can safely alias $mu$ and
 a register if you want.

 \begin{verbatim}
-#define INNERMUL \
-   t = ((fp_word)mu) * ((fp_word)*tmpm++);                \
-   _c[OFF0] += t;                                         \
-   if (_c[OFF0] < (fp_digit)t)              ++_c[OFF1];   \
-   _c[OFF1] += (t>>DIGIT_BIT);                            \
-   if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2];   
+#define INNERMUL                                      \
+   do { fp_word t;                                    \
+   _c[0] = t  = ((fp_word)_c[0] + (fp_word)cy) +      \
+                (((fp_word)mu) * ((fp_word)*tmpm++)); \
+   cy = (t >> DIGIT_BIT);                             \
+   } while (0)
 \end{verbatim}

-This computes the inner product and adds it to the correct set of carry variables.  The variable
-$\_c$ is a pointer alias to $c[x+y]$ and used to simplify the code.
+This computes the inner product, adds it and the incoming carry $cy$ to the destination digit, and leaves the new carry in $cy$.
+It uses the $mu$ value computed above (which may already be in a register) and
+$cy$, which is a chaining carry.  Inside the INNERMUL loop the $cy$ value can be kept
+in a register (hint: it always starts as $cy = 0$ in the first iteration).

-You can safely alias $\_c$ to a register for INNERMUL by setting it equal to ``c + x''
-\footnote{Where ``c'' is an array on the stack.} by modifying LOOP\_START.
+Upon completion of the inner loop the macro LOOP\_END is called, which is used to fetch
+$cy$ into the variable the C program can see.  If you cached $cy$ in a register, this is
+where you would copy it to the locally accessible C variable.

 \begin{verbatim}
 #define PROPCARRY \
-   _c[OFF0+1] += _c[OFF1];                                \
-   if (_c[OFF0+1] < _c[OFF1])       ++_c[OFF1+1];         \
-   _c[OFF1+1] += _c[OFF2];                                \
-   if (_c[OFF1+1] < _c[OFF2])       ++_c[OFF2+1];         
+   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
 \end{verbatim}

 This propagates the carry upwards by one digit.  
--- a/tfm.toc
+++ b/tfm.toc
@ -0,0 +1,33 @@
+\contentsline {chapter}{\numberline {1}Introduction}{1}{chapter.1}
+\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}
+\contentsline {section}{\numberline {1.2}License}{2}{section.1.2}
+\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}
+\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1}
+\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2}
+\contentsline {subsubsection}{x86--32}{3}{section*.3}
+\contentsline {subsubsection}{SSE2}{3}{section*.4}
+\contentsline {subsubsection}{x86--64}{3}{section*.5}
+\contentsline {subsubsection}{ARM}{3}{section*.6}
+\contentsline {subsubsection}{PPC32}{3}{section*.7}
+\contentsline {subsubsection}{Future Releases}{4}{section*.8}
+\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3}
+\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2}
+\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1}
+\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2}
+\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1}
+\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2}
+\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3}
+\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3}
+\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1}
+\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2}
+\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3}
+\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4}
+\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5}
+\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6}
+\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7}
+\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8}
+\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4}
+\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1}
+\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2}
+\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3}
+\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4}