added tomsfastmath-0.02

2004-09-19 01:31:44 +00:00 · 2004-09-19 01:31:44 +00:00 · 6bb413fd72
commit 6bb413fd72
parent 5e92ed2a59
23 changed files with 689 additions and 184 deletions
--- a/changes.txt
+++ b/changes.txt
@ -1,2 +1,21 @@
+September 18th, 2004
+0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
+        This also helps prevent killing the cache on smaller cpus.
+     -- Cast memset to void in fp_init() to catch people who misuse the function (e.g. expect return)
+        Thanks to Johan Lindh
+     -- Cleaned up x86-64 support [faster montgomery reductions]
+     -- Autodetects x86-32 and x86-64 and enables it's asm now 
+     -- Made test demo build cleaner in multilib platforms [e.g. mixed 32/64 bits]
+     -- Fix to fp_mod to ensure that remainder is of the same sign as the modulus.
+     -- Fixed bug in fp_montgomery_calc_normalization for single digit moduli
+     -- cleaned up ISO C macros in comba/mont to avoid branches [works best with GCC 3.4.x branch]
+     -- Added more testing to tfm.h to help detect misconfigured builds
+     -- Added TFM_NO_ASM which forces ASM off [even if it was autodetected].
+     -- Added fp_radix_size() to API
+     -- Cleaned up demo/test.c to build with far fewer warnings (mostly %d => %lu fixes)
+     -- fp_exptmod() now supports negative exponent and base>modulus cases
+     -- Added fp_ident() which gives a string showing how TFM was configured.  Useful for debuging... 
+     -- fix gen.pl script so it includes the whole source tree now 
+
 August 25th, 2004
-TFM 0.01  -- Initial Release
+0.01 -- Initial Release
--- a/comba_mult_gen.c
+++ b/comba_mult_gen.c
@ -10,7 +10,6 @@ int main(int argc, char **argv)
 printf(
 "void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n"
 "{\n"
-"   fp_word t;\n"
 "   fp_digit c0, c1, c2, at[%d];\n"
 "\n"
 "   memcpy(at, A->dp, %d * sizeof(fp_digit));\n"
--- a/comba_sqr_gen.c
+++ b/comba_sqr_gen.c
@ -9,7 +9,6 @@ int main(int argc, char **argv)
 printf(
 "void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
 "{\n"
-"   fp_word t;\n"
 "   fp_digit *a, b[%d], c0, c1, c2;\n"
 "\n"
 "   a = A->dp;\n"
--- a/delme.c
+++ b/delme.c
@ -0,0 +1,24 @@
+#include "tfm.h"
+
+int main(void)
+{
+    fp_int a;
+    char buf[4096];
+
+    fp_init(&a);
+    fp_read_radix( &a,
+        "///////////93zgY8MZ2DCJ6Oek0t1pHAG9E28fdp7G22xwcEnER8b5A27cED0JT"
+        "xvKPiyqwGnimAmfjybyKDq/XDMrjKS95v8MrTc9UViRqJ4BffZVjQml/NBRq1hVj"
+        "xZXh+rg9dwMkdoGHV4iVvaaePb7iv5izmW1ykA5ZlmMOsaWs75NJccaMFwZz9CzV"
+        "WsLT8zoZhPOSOlDM88LIkvxLAGTmbfPjPmmrJagyc0JnT6m8oXWXV3AGNaOkDiux"
+        "uvvtB1WEXWER9uEYx0UYZxN5NV1lJ5B9tYlBzfLO5nWvbKbywfLgvHNI9XYO+WKG"
+        "5NAEMeggn2sjCnSD151wCwXL8QlV7BfaxFk515ZRxmgAwd5NNGOCVREN3uMcuUJ7"
+        "g/MkZDi9CzSUZ9JWIYLXdSxZqYOQqkvhyI/w1jcA26JOTW9pFiXgP58VAnWNUo0C"
+        "k+4NLtfXNMnt2OZ0kjb6uWZYJw1qvQinGzjR/E3z48vBWj4WgJhIol//////////",
+        64 );
+
+    if( fp_isprime( &a ) ) printf("It's prime.\n");
+    else printf( "Not prime.\n");
+
+    return 0;
+}
--- a/demo/test.c
+++ b/demo/test.c
@ -60,6 +60,8 @@ int main(void)
                 div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
   ulong64 t1, t2;

+
+  printf("TFM Ident string:\n%s\n\n", fp_ident());
  fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); 
  fp_zero(&a); draw(&a);

@ -133,9 +135,30 @@ int main(void)
  printf("Testing read_radix\n");
  fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);

+#if 1
  /* test mont */
-  printf("Montgomery test\n");
-  fp_set(&a, 1);
+  printf("Montgomery test #1\n");
+  fp_set(&a, 0x1234567ULL);
+  fp_montgomery_setup(&a, &fp);
+  fp_montgomery_calc_normalization(&b, &a);
+
+  fp_read_radix(&d, "123456789123", 16);
+  for (n = 0; n < 100000; n++) {
+      fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d); 
+      fp_mul(&d, &b, &c);
+      fp_montgomery_reduce(&c, &a, fp);
+      if (fp_cmp(&c, &d) != FP_EQ) {
+         printf("Failed mont %d\n", n);
+         draw(&a);
+         draw(&d);
+         draw(&c);
+         return EXIT_FAILURE;
+      }
+  }
+  printf("Passed.\n");
+
+  printf("Montgomery test #2\n");
+  fp_set(&a, 0x1234567ULL);
  fp_lshd(&a, 4);
  fp_add_d(&a, 1, &a);
  fp_montgomery_setup(&a, &fp);
@ -158,19 +181,19 @@ int main(void)

   /* test for size */
   for (ix = 8*DIGIT_BIT; ix < 10*DIGIT_BIT; ix++) {
-       printf("Testing (not safe-prime): %9d bits    \r", ix); fflush(stdout);
+       printf("Testing (not safe-prime): %9lu bits    \r", ix); fflush(stdout);
       err = fp_prime_random_ex(&a, 8, ix, (rand()&1)?TFM_PRIME_2MSB_OFF:TFM_PRIME_2MSB_ON, myrng, NULL);
       if (err != FP_OKAY) {
          printf("failed with err code %d\n", err);
          return EXIT_FAILURE;
       }
-       if (fp_count_bits(&a) != ix) {
-          printf("Prime is %d not %d bits!!!\n", fp_count_bits(&a), ix);
+       if ((unsigned long)fp_count_bits(&a) != ix) {
+          printf("Prime is %d not %lu bits!!!\n", fp_count_bits(&a), ix);
          return EXIT_FAILURE;
       }
   }
   printf("\n\n");
-
+#endif

 #if 0
 /* do some timings... */
@ -280,7 +303,7 @@ int main(void)
      c.used = t;

     t2 = -1;
-     for (ix = 0; ix < 50; ++ix) {
+     for (ix = 0; ix < 1024; ++ix) {
          t1 = TIMFUNC();
          fp_exptmod(&c, &b, &a, &d);
          fp_exptmod(&c, &b, &a, &d);
@ -293,7 +316,6 @@ int main(void)
  }
 #endif

-
   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;

@ -304,27 +326,27 @@ int main(void)
       printf("%s  ]\r",cmd); fflush(stdout);
       if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
          fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
+          fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr);
          fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);

          fp_mul_2d(&a, rr, &a);
          a.sign = b.sign;
          if (fp_cmp(&a, &b) != FP_EQ) {
-             printf("mul2d failed, rr == %d\n",rr);
+             printf("mul2d failed, rr == %lu\n",rr);
             draw(&a);
             draw(&b);
             return 0;
          }
       } else if (!strcmp(cmd, "div2d")) { ++div2d_n;
          fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
+          fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr);
          fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);

          fp_div_2d(&a, rr, &a, &e);
          a.sign = b.sign;
          if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; }
          if (fp_cmp(&a, &b) != FP_EQ) {
-             printf("div2d failed, rr == %d\n",rr);
+             printf("div2d failed, rr == %lu\n",rr);
             draw(&a);
             draw(&b);
             return 0;
@ -492,7 +514,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
             }
       } else if (!strcmp(cmd, "add_d")) { ++add_d_n;
              fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
-              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
+              fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_add_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
@ -500,12 +522,12 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                 draw(&a);
                 draw(&b);
                 draw(&c);
-                 printf("d == %d\n", ix);
+                 printf("d == %lu\n", ix);
                 return 0;
              }
       } else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
              fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
-              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
+              fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_sub_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
@ -513,12 +535,12 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                 draw(&a);
                 draw(&b);
                 draw(&c);
-                 printf("d == %d\n", ix);
+                 printf("d == %lu\n", ix);
                 return 0;
              }
       } else if (!strcmp(cmd, "mul_d")) { ++mul_d_n;
              fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
-              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
+              fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_mul_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
@ -526,7 +548,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                 draw(&a);
                 draw(&b);
                 draw(&c);
-                 printf("d == %d\n", ix);
+                 printf("d == %lu\n", ix);
                 return 0;
              }
       }
--- a/doc/tfm.pdf
+++ b/doc/tfm.pdf
--- a/fp_exptmod.c
+++ b/fp_exptmod.c
@ -13,7 +13,7 @@
 * Some restrictions... x must be positive and < b
 */

-int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
+static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
  fp_int   M[64], res;
  fp_digit buf, mp;
@ -34,7 +34,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  } 

  /* init M array */
-  memset(M, 0, sizeof(fp_int)*(1<<winsize));
+  memset(M, 0, sizeof(M));	

  /* now setup montgomery  */
  if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
@ -168,3 +168,24 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  fp_copy (&res, Y);
  return FP_OKAY;
 }
+
+int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
+{
+   fp_int tmp;
+   int    err;
+
+   /* is X negative?  */
+   if (X->sign == FP_NEG) {
+      /* yes, copy G and invmod it */
+      fp_copy(G, &tmp);
+      if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
+         return err;
+      }
+      /* _fp_exptmod doesn't care about the sign of X */
+      return _fp_exptmod(&tmp, X, P, Y);
+   } else {
+      /* Positive exponent so just exptmod */
+      return _fp_exptmod(G, X, P, Y);
+   }
+}
+
--- a/fp_ident.c
+++ b/fp_ident.c
@ -0,0 +1,66 @@
+#include "tfm.h"
+
+const char *fp_ident(void)
+{
+   static char buf[1024];
+
+   memset(buf, 0, sizeof(buf));
+   snprintf(buf, sizeof(buf)-1,
+"TomsFastMath (%s)\n"
+"\n"
+"Sizeofs\n"
+"\tfp_digit = %u\n"
+"\tfp_word  = %u\n"
+"\n"
+"FP_MAX_SIZE = %u\n"
+"\n"
+"Defines: \n"
+#ifdef __i386__
+" __i386__ "
+#endif
+#ifdef __x86_64__
+" __x86_64__ "
+#endif
+#ifdef TFM_X86
+" TFM_X86 "
+#endif
+#ifdef TFM_X86_64
+" TFM_X86_64 "
+#endif
+#ifdef TFM_SSE2
+" TFM_SSE2 "
+#endif
+#ifdef TFM_ARM
+" TFM_ARM "
+#endif
+#ifdef TFM_NO_ASM
+" TFM_NO_ASM "
+#endif
+#ifdef FP_64BIT
+" FP_64BIT "
+#endif
+#ifdef TFM_LARGE
+" TFM_LARGE "
+#endif
+#ifdef TFM_HUGE
+" TFM_HUGE "
+#endif
+"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE);
+
+   if (sizeof(fp_digit) == sizeof(fp_word)) {
+      strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n", 
+              sizeof(buf)-1);
+   }
+   return buf;
+}
+
+#ifdef STANDALONE
+
+int main(void)
+{
+   printf("%s\n", fp_ident());
+   return 0;
+}
+
+#endif
+
--- a/fp_mod.c
+++ b/fp_mod.c
@ -12,7 +12,19 @@
 /* c = a mod b, 0 <= c < b  */
 int fp_mod(fp_int *a, fp_int *b, fp_int *c)
 {
-   return fp_div(a, b, NULL, c);
+   fp_int t;
+   int    err;
+
+   fp_zero(&t);
+   if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
+      return err;
+   }
+   if (t.sign != b->sign) {
+      fp_add(&t, b, c);
+   } else {
+      fp_copy(&t, c);
+  }
+  return FP_OKAY;
 }


--- a/fp_montgomery_calc_normalization.c
+++ b/fp_montgomery_calc_normalization.c
@ -24,7 +24,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
     fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
  } else {
     fp_set(a, 1);
-     ++bits;
+     bits = 1;
  }

  /* now compute C = A * B mod b */
--- a/fp_montgomery_reduce.c
+++ b/fp_montgomery_reduce.c
@ -66,13 +66,13 @@ asm(
 #define PROPCARRY \
 asm(                                                                                               \
 "movq %1,%%rax                \n\t"                                                                \
+"movq %2,%%rbx                \n\t"                                                                \
 "addq  %%rax,%6               \n\t"                                                                \
-"movq %2,%%rax                \n\t"                                                                \
-"adcq  %%rax,%7               \n\t"                                                                \
+"adcq  %%rbx,%7               \n\t"                                                                \
 "adcq $0,%8                   \n\t"                                                                \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
-: "%rax", "%cc");
+: "%rax", "%rbx", "%cc");

 #elif defined(TFM_SSE2)

@ -88,7 +88,7 @@ asm("emms");
 asm(\
 "movd %0,%%mm1                \n\t" \
 "pmuludq %%mm2,%%mm1          \n\t" \
-:: "g"(c[x]), "g"(mp));
+:: "g"(c[x]));

 #define INNERMUL \
 asm(                                                                                          \
@ -112,7 +112,7 @@ asm(
 "adcl  %%eax,%7               \n\t"                                                                \
 "adcl $0,%8                   \n\t"                                                                \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
-                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
+                                                "g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1])  \
 : "%eax", "%cc");

 #elif defined(TFM_ARM)
@ -166,14 +166,18 @@ asm(                                             \
   mu = c[x] * mp;

 #define INNERMUL \
-   t = ((fp_word)mu) * ((fp_word)*tmpm++);                                             \
-   _c[OFF0] += t;               if (_c[OFF0] < (fp_digit)t)              ++_c[OFF1];   \
-   _c[OFF1] += (t>>DIGIT_BIT);  if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2];   \
+   do { fp_word t;                                                           \
+   t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
+   t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT);                   _c[OFF1] = t; \
+   _c[OFF2] += (t >> DIGIT_BIT);                                             \
+   } while (0);

 #define PROPCARRY \
-   _c[OFF0+1] += _c[OFF1];          if (_c[OFF0+1] < _c[OFF1])                ++_c[OFF1+1]; \
-   _c[OFF1+1] += _c[OFF2];          if (_c[OFF1+1] < _c[OFF2])                ++_c[OFF2+1];
-
+   do { fp_word t;                                                           \
+   t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1];                    _c[OFF0+1] = t; \
+   t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
+   _c[OFF2+1] += (t >> DIGIT_BIT);                                           \
+   } while (0);

 #endif

@ -187,7 +191,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
 {
   fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
   int      oldused, x, y, pa;
-   fp_word  t;

   /* now zero the buff */
   pa = m->used;
@ -221,13 +224,13 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)

  /* fix the rest of the carries */
  _c = c + pa;
-  for (; x < pa * 2 + 2; x++) {
+  for (x = pa; x < pa * 2 + 2; x++) {
     PROPCARRY;
     ++_c;
  }

  /* now copy out */
-  _c = c + pa;
+  _c   = c + pa;
  tmpm = a->dp;
  for (x = 0; x < pa+1; x++) {
     *tmpm++ = *_c++;
--- a/fp_mul.c
+++ b/fp_mul.c
@ -28,9 +28,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
           fp_mul_comba4(A,B,C);
        } else if (y <= 8) {
           fp_mul_comba8(A,B,C);
+#if defined(TFM_LARGE)
        } else if (y <= 16 && y >= 12) {
           fp_mul_comba16(A,B,C);
-#ifdef TFM_HUGE
+#endif
+#if defined(TFM_HUGE)
        } else if (y <= 32 && y >= 28) {
           fp_mul_comba32(A,B,C);
 #endif
--- a/fp_mul_comba.c
+++ b/fp_mul_comba.c
@ -27,7 +27,7 @@

 /* forward the carry to the next digit */
 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 /* store the first sum */
 #define COMBA_STORE(x) \
@ -42,7 +42,7 @@

 /* this should multiply i and j  */
 #define MULADD(i, j)                                      \
-asm volatile (                                            \
+asm (                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -62,7 +62,7 @@ asm volatile (                                            \

 /* forward the carry to the next digit */
 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 /* store the first sum */
 #define COMBA_STORE(x) \
@ -77,13 +77,13 @@ asm volatile (                                            \

 /* this should multiply i and j  */
 #define MULADD(i, j)                                      \
-asm volatile (                                            \
+asm  (                                                    \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");

 #elif defined(TFM_SSE2)
 /* use SSE2 optimizations */
@ -97,7 +97,7 @@ asm volatile (                                            \

 /* forward the carry to the next digit */
 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 /* store the first sum */
 #define COMBA_STORE(x) \
@ -134,7 +134,7 @@ asm volatile (                                            \
   c0 = c1 = c2 = 0;

 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_STORE(x) \
   x = c0;
@ -155,13 +155,13 @@ asm(                                                          \
 #else
 /* ISO C code */

-#define COMBA_START 
+#define COMBA_START

 #define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_STORE(x) \
   x = c0;
@ -169,12 +169,13 @@ asm(                                                          \
 #define COMBA_STORE2(x) \
   x = c1;

-#define COMBA_FINI
-
-#define MULADD(i, j)                                          \
-   t  = ((fp_word)i) * ((fp_word)j);                          \
-   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
-   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
+#define COMBA_FINI 
+   
+#define MULADD(i, j)                                                              \
+   do { fp_word t;                                                                \
+   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t;                         \
+   t = (fp_word)c1 + (t >> DIGIT_BIT);            c1 = t; c2 += t >> DIGIT_BIT;   \
+   } while (0);

 #endif

@ -184,7 +185,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
 {
   int       ix, iy, iz, tx, ty, pa;
   fp_digit  c0, c1, c2, *tmpx, *tmpy;
-   fp_word   t;
   fp_int    tmp, *dst;

   COMBA_START;
@ -239,7 +239,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)

 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
 {
-   fp_word t;
   fp_digit c0, c1, c2, at[8];

   memcpy(at, A->dp, 4 * sizeof(fp_digit));
@ -284,7 +283,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)

 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
 {
-   fp_word t;
   fp_digit c0, c1, c2, at[16];

   memcpy(at, A->dp, 8 * sizeof(fp_digit));
@ -358,10 +356,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }

+#if defined(TFM_LARGE)

 void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
 {
-   fp_word t;
   fp_digit c0, c1, c2, at[32];

   memcpy(at, A->dp, 16 * sizeof(fp_digit));
@ -499,11 +497,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }

+#endif /* TFM_LARGE */
+
 #ifdef TFM_HUGE

 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
 {
-   fp_word t;
   fp_digit c0, c1, c2, at[64];

   memcpy(at, A->dp, 32 * sizeof(fp_digit));
--- a/fp_radix_size.c
+++ b/fp_radix_size.c
@ -11,4 +11,39 @@

 int fp_radix_size(fp_int *a, int radix, int *size)
 {
+  int     digs;
+  fp_int  t;
+  fp_digit d;
+   
+  *size = 0;
+
+  /* check range of the radix */
+  if (radix < 2 || radix > 64) {
+    return FP_VAL;
+  }
+
+  /* quick out if its zero */
+  if (fp_iszero(a) == 1) {
+     *size = 2;
+     return FP_OKAY;
+  }
+
+  fp_init_copy(&t, a);
+
+  /* if it is negative output a - */
+  if (t.sign == FP_NEG) {
+    *size++;
+    t.sign = FP_ZPOS;
+  }
+
+  digs = 0;
+  while (fp_iszero (&t) == FP_NO) {
+    fp_div_d (&t, (fp_digit) radix, &t, &d);
+    *size++;
+  }
+
+  /* append a NULL so the string is properly terminated */
+  *size++;
+  return FP_OKAY;
+
 }
--- a/fp_sqr.c
+++ b/fp_sqr.c
@ -21,9 +21,11 @@ void fp_sqr(fp_int *A, fp_int *B)
           fp_sqr_comba4(A,B);
        } else if (y <= 8) {
           fp_sqr_comba8(A,B);
+#if defined(TFM_LARGE)
        } else if (y <= 16 && y >= 12) {
           fp_sqr_comba16(A,B);
-#ifdef TFM_HUGE
+#endif
+#if defined(TFM_HUGE)
        } else if (y <= 32 && y >= 28) {
           fp_sqr_comba32(A,B);
 #endif
--- a/fp_sqr_comba.c
+++ b/fp_sqr_comba.c
@ -28,7 +28,7 @@
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_FINI

@ -68,21 +68,21 @@ asm volatile (                                            \
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_FINI

 #define SQRADD(i, j)                                      \
-asm volatile (                                            \
+asm (                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %%rax        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");

 #define SQRADD2(i, j)                                     \
-asm volatile (                                            \
+asm (                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
@ -91,7 +91,7 @@ asm volatile (                                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");


 #elif defined(TFM_SSE2)
@ -109,7 +109,7 @@ asm volatile (                                            \
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_FINI \
   asm("emms");
@ -120,11 +120,11 @@ asm volatile (                                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
-     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
-     "adcl  %%edx,%1     \n\t"                            \
+     "movd  %%mm0,%%eax  \n\t"                            \
+     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");

 #define SQRADD2(i, j)                                     \
 asm volatile (                                            \
@ -158,7 +158,7 @@ asm volatile (                                            \
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_FINI

@ -187,7 +187,8 @@ asm(                                                             \

 /* ISO C portable code */

-#define COMBA_START
+#define COMBA_START \
+   { fp_word tt; 

 #define CLEAR_CARRY \
   c0 = c1 = c2 = 0;
@ -199,23 +200,28 @@ asm(                                                             \
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

-#define COMBA_FINI
+#define COMBA_FINI \
+   }

 /* multiplies point i and j, updates carry "c1" and digit c2 */
-#define SQRADD(i, j)                       \
-   t  = ((fp_word)i) * ((fp_word)j);       \
-   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
-   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; 
+#define SQRADD(i, j)                                 \
+   do { fp_word t;                                   \
+   t = c0 + ((fp_word)i) * ((fp_word)j);  c0 = t;    \
+   t = c1 + (t >> DIGIT_BIT);             c1 = t; c2 += t >> DIGIT_BIT; \
+   } while (0);
+  

 /* for squaring some of the terms are doubled... */
-#define SQRADD2(i, j)                       \
-   t  = ((fp_word)i) * ((fp_word)j);       \
-   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
-   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \
-   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
-   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; 
+#define SQRADD2(i, j)                                                 \
+   do { fp_word t;                                                    \
+   t  = ((fp_word)i) * ((fp_word)j);                                  \
+   tt = (fp_word)c0 + t;                 c0 = tt;                              \
+   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT;       \
+   tt = (fp_word)c0 + t;                 c0 = tt;                              \
+   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT;       \
+   } while (0);

 #endif

@ -225,7 +231,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
  int       pa, ix, iz;
  fp_digit  c0, c1, c2;
  fp_int    tmp, *dst;
-  fp_word   t;

  /* get size of output and trim */
  pa = A->used + A->used;
@ -298,7 +303,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)

 void fp_sqr_comba4(fp_int *A, fp_int *B)
 {
-   fp_word t;
   fp_digit *a, b[8], c0, c1, c2;

   a = A->dp;
@ -352,7 +356,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B)

 void fp_sqr_comba8(fp_int *A, fp_int *B)
 {
-   fp_word t;
   fp_digit *a, b[16], c0, c1, c2;

   a = A->dp;
@ -443,10 +446,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B)
   fp_clamp(B);
 }

+#if defined(TFM_LARGE)

 void fp_sqr_comba16(fp_int *A, fp_int *B)
 {
-   fp_word t;
   fp_digit *a, b[32], c0, c1, c2;

   a = A->dp;
@ -617,11 +620,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B)
   fp_clamp(B);
 }

+#endif /* TFM_LARGE */
+
 #ifdef TFM_HUGE

 void fp_sqr_comba32(fp_int *A, fp_int *B)
 {
-   fp_word t;
   fp_digit *a, b[64], c0, c1, c2;

   a = A->dp;
--- a/gen.pl
+++ b/gen.pl
@ -6,7 +6,7 @@
 use strict;

 open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
-foreach my $filename (glob "fp_*.c") {
+foreach my $filename (glob "*fp_*.c") {
   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
   print OUT "/* Start: $filename */\n";
   print OUT while <SRC>;
@ -14,4 +14,4 @@ foreach my $filename (glob "fp_*.c") {
   close SRC or die "Error closing $filename after reading: $!";
 }
 print OUT "\n/* EOF */\n";
-close OUT or die "Error closing mpi.c after writing: $!";
+close OUT or die "Error closing mpi.c after writing: $!";
--- a/9
+++ b/9
@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
 #speed
 CFLAGS += -fomit-frame-pointer

-VERSION=0.01
+VERSION=0.02

 default: libtfm.a

@ -38,8 +38,9 @@ fp_cmp.o fp_cmp_mag.o \
 \
 fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
 fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
-fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
+fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
 \
+fp_ident.o 

 libtfm.a: $(OBJECTS)
 	$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
@ -49,7 +50,7 @@ mtest/mtest: mtest/mtest.c
 	cd mtest ; make mtest

 test: libtfm.a demo/test.o mtest/mtest
-	$(CC) demo/test.o libtfm.a $(PROF) -o test
+	$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test

 stest: libtfm.a demo/stest.o 
 	$(CC) demo/stest.o libtfm.a -o stest
@ -67,7 +68,7 @@ docs: docdvi
 	mv -f tfm.pdf doc

 clean:
-	rm -f $(OBJECTS) *.a demo/*.o test tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc stest
+	rm -f $(OBJECTS) *.a demo/*.o test tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc stest *~
 	cd mtest ; make clean

 zipup: docs clean
--- a/mtest/makefile
+++ b/mtest/makefile
@ -6,4 +6,4 @@ mtest: mtest.o
 	$(CC) mtest.o -ltommath -o mtest

 clean:
-	rm -f *.o mtest
+	rm -f *.o mtest *~
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -235,10 +235,9 @@ int main(void)
      rand_num2(&a);
      rand_num2(&b);
      rand_num2(&c);
-//      if (c.dp[0]&1) mp_add_d(&c, 1, &c);
      a.sign = b.sign = c.sign = 0;
      c.dp[0] |= 1;
-      if (c.used <= 2) continue;
+//      if (c.used <= 4) continue;
 //      if (mp_cmp(&a, &c) != MP_LT) continue;
 //      if (mp_cmp(&b, &c) != MP_LT) continue;
      mp_exptmod(&a, &b, &c, &d);
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@ -709,7 +709,7 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
 * Some restrictions... x must be positive and < b
 */

-int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
+static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
  fp_int   M[64], res;
  fp_digit buf, mp;
@ -730,7 +730,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  } 

  /* init M array */
-  memset(M, 0, sizeof(fp_int)*(1<<winsize));
+  memset(M, 0, sizeof(M));	

  /* now setup montgomery  */
  if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
@ -865,6 +865,27 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  return FP_OKAY;
 }

+int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
+{
+   fp_int tmp;
+   int    err;
+
+   /* is X negative?  */
+   if (X->sign == FP_NEG) {
+      /* yes, copy G and invmod it */
+      fp_copy(G, &tmp);
+      if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
+         return err;
+      }
+      /* _fp_exptmod doesn't care about the sign of X */
+      return _fp_exptmod(&tmp, X, P, Y);
+   } else {
+      /* Positive exponent so just exptmod */
+      return _fp_exptmod(G, X, P, Y);
+   }
+}
+
+
 /* End: fp_exptmod.c */

 /* Start: fp_gcd.c */
@ -922,6 +943,76 @@ void fp_gcd(fp_int *a, fp_int *b, fp_int *c)

 /* End: fp_gcd.c */

+/* Start: fp_ident.c */
+#include "tfm.h"
+
+const char *fp_ident(void)
+{
+   static char buf[1024];
+
+   memset(buf, 0, sizeof(buf));
+   snprintf(buf, sizeof(buf)-1,
+"TomsFastMath (%s)\n"
+"\n"
+"Sizeofs\n"
+"\tfp_digit = %u\n"
+"\tfp_word  = %u\n"
+"\n"
+"FP_MAX_SIZE = %u\n"
+"\n"
+"Defines: \n"
+#ifdef __i386__
+" __i386__ "
+#endif
+#ifdef __x86_64__
+" __x86_64__ "
+#endif
+#ifdef TFM_X86
+" TFM_X86 "
+#endif
+#ifdef TFM_X86_64
+" TFM_X86_64 "
+#endif
+#ifdef TFM_SSE2
+" TFM_SSE2 "
+#endif
+#ifdef TFM_ARM
+" TFM_ARM "
+#endif
+#ifdef TFM_NO_ASM
+" TFM_NO_ASM "
+#endif
+#ifdef FP_64BIT
+" FP_64BIT "
+#endif
+#ifdef TFM_LARGE
+" TFM_LARGE "
+#endif
+#ifdef TFM_HUGE
+" TFM_HUGE "
+#endif
+"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE);
+
+   if (sizeof(fp_digit) == sizeof(fp_word)) {
+      strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n", 
+              sizeof(buf)-1);
+   }
+   return buf;
+}
+
+#ifdef STANDALONE
+
+int main(void)
+{
+   printf("%s\n", fp_ident());
+   return 0;
+}
+
+#endif
+
+
+/* End: fp_ident.c */
+
 /* Start: fp_invmod.c */
 /* TomsFastMath, a fast ISO C bignum library.
 * 
@ -1186,7 +1277,19 @@ void fp_lshd(fp_int *a, int x)
 /* c = a mod b, 0 <= c < b  */
 int fp_mod(fp_int *a, fp_int *b, fp_int *c)
 {
-   return fp_div(a, b, NULL, c);
+   fp_int t;
+   int    err;
+
+   fp_zero(&t);
+   if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
+      return err;
+   }
+   if (t.sign != b->sign) {
+      fp_add(&t, b, c);
+   } else {
+      fp_copy(&t, c);
+  }
+  return FP_OKAY;
 }


@ -1282,7 +1385,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
     fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
  } else {
     fp_set(a, 1);
-     ++bits;
+     bits = 1;
  }

  /* now compute C = A * B mod b */
@ -1366,13 +1469,13 @@ asm(
 #define PROPCARRY \
 asm(                                                                                               \
 "movq %1,%%rax                \n\t"                                                                \
+"movq %2,%%rbx                \n\t"                                                                \
 "addq  %%rax,%6               \n\t"                                                                \
-"movq %2,%%rax                \n\t"                                                                \
-"adcq  %%rax,%7               \n\t"                                                                \
+"adcq  %%rbx,%7               \n\t"                                                                \
 "adcq $0,%8                   \n\t"                                                                \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
-: "%rax", "%cc");
+: "%rax", "%rbx", "%cc");

 #elif defined(TFM_SSE2)

@ -1388,7 +1491,7 @@ asm("emms");
 asm(\
 "movd %0,%%mm1                \n\t" \
 "pmuludq %%mm2,%%mm1          \n\t" \
-:: "g"(c[x]), "g"(mp));
+:: "g"(c[x]));

 #define INNERMUL \
 asm(                                                                                          \
@ -1412,7 +1515,7 @@ asm(
 "adcl  %%eax,%7               \n\t"                                                                \
 "adcl $0,%8                   \n\t"                                                                \
 :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]),       \
-                                                "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1])  \
+                                                "g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1])  \
 : "%eax", "%cc");

 #elif defined(TFM_ARM)
@ -1466,14 +1569,18 @@ asm(                                             \
   mu = c[x] * mp;

 #define INNERMUL \
-   t = ((fp_word)mu) * ((fp_word)*tmpm++);                                             \
-   _c[OFF0] += t;               if (_c[OFF0] < (fp_digit)t)              ++_c[OFF1];   \
-   _c[OFF1] += (t>>DIGIT_BIT);  if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2];   \
+   do { fp_word t;                                                           \
+   t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
+   t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT);                   _c[OFF1] = t; \
+   _c[OFF2] += (t >> DIGIT_BIT);                                             \
+   } while (0);

 #define PROPCARRY \
-   _c[OFF0+1] += _c[OFF1];          if (_c[OFF0+1] < _c[OFF1])                ++_c[OFF1+1]; \
-   _c[OFF1+1] += _c[OFF2];          if (_c[OFF1+1] < _c[OFF2])                ++_c[OFF2+1];
-
+   do { fp_word t;                                                           \
+   t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1];                    _c[OFF0+1] = t; \
+   t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
+   _c[OFF2+1] += (t >> DIGIT_BIT);                                           \
+   } while (0);

 #endif

@ -1487,7 +1594,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
 {
   fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
   int      oldused, x, y, pa;
-   fp_word  t;

   /* now zero the buff */
   pa = m->used;
@ -1521,13 +1627,13 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)

  /* fix the rest of the carries */
  _c = c + pa;
-  for (; x < pa * 2 + 2; x++) {
+  for (x = pa; x < pa * 2 + 2; x++) {
     PROPCARRY;
     ++_c;
  }

  /* now copy out */
-  _c = c + pa;
+  _c   = c + pa;
  tmpm = a->dp;
  for (x = 0; x < pa+1; x++) {
     *tmpm++ = *_c++;
@ -1629,9 +1735,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
           fp_mul_comba4(A,B,C);
        } else if (y <= 8) {
           fp_mul_comba8(A,B,C);
+#if defined(TFM_LARGE)
        } else if (y <= 16 && y >= 12) {
           fp_mul_comba16(A,B,C);
-#ifdef TFM_HUGE
+#endif
+#if defined(TFM_HUGE)
        } else if (y <= 32 && y >= 28) {
           fp_mul_comba32(A,B,C);
 #endif
@ -1880,7 +1988,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)

 /* forward the carry to the next digit */
 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 /* store the first sum */
 #define COMBA_STORE(x) \
@ -1895,7 +2003,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)

 /* this should multiply i and j  */
 #define MULADD(i, j)                                      \
-asm volatile (                                            \
+asm (                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -1915,7 +2023,7 @@ asm volatile (                                            \

 /* forward the carry to the next digit */
 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 /* store the first sum */
 #define COMBA_STORE(x) \
@ -1930,13 +2038,13 @@ asm volatile (                                            \

 /* this should multiply i and j  */
 #define MULADD(i, j)                                      \
-asm volatile (                                            \
+asm  (                                                    \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");

 #elif defined(TFM_SSE2)
 /* use SSE2 optimizations */
@ -1950,7 +2058,7 @@ asm volatile (                                            \

 /* forward the carry to the next digit */
 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 /* store the first sum */
 #define COMBA_STORE(x) \
@ -1987,7 +2095,7 @@ asm volatile (                                            \
   c0 = c1 = c2 = 0;

 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_STORE(x) \
   x = c0;
@ -2008,13 +2116,13 @@ asm(                                                          \
 #else
 /* ISO C code */

-#define COMBA_START 
+#define COMBA_START

 #define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

 #define COMBA_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_STORE(x) \
   x = c0;
@ -2022,12 +2130,13 @@ asm(                                                          \
 #define COMBA_STORE2(x) \
   x = c1;

-#define COMBA_FINI
-
-#define MULADD(i, j)                                          \
-   t  = ((fp_word)i) * ((fp_word)j);                          \
-   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
-   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
+#define COMBA_FINI 
+   
+#define MULADD(i, j)                                                              \
+   do { fp_word t;                                                                \
+   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t;                         \
+   t = (fp_word)c1 + (t >> DIGIT_BIT);            c1 = t; c2 += t >> DIGIT_BIT;   \
+   } while (0);

 #endif

@ -2037,7 +2146,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
 {
   int       ix, iy, iz, tx, ty, pa;
   fp_digit  c0, c1, c2, *tmpx, *tmpy;
-   fp_word   t;
   fp_int    tmp, *dst;

   COMBA_START;
@ -2092,7 +2200,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)

 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
 {
-   fp_word t;
   fp_digit c0, c1, c2, at[8];

   memcpy(at, A->dp, 4 * sizeof(fp_digit));
@ -2137,7 +2244,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)

 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
 {
-   fp_word t;
   fp_digit c0, c1, c2, at[16];

   memcpy(at, A->dp, 8 * sizeof(fp_digit));
@ -2211,10 +2317,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }

+#if defined(TFM_LARGE)

 void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
 {
-   fp_word t;
   fp_digit c0, c1, c2, at[32];

   memcpy(at, A->dp, 16 * sizeof(fp_digit));
@ -2352,11 +2458,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }

+#endif /* TFM_LARGE */
+
 #ifdef TFM_HUGE

 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
 {
-   fp_word t;
   fp_digit c0, c1, c2, at[64];

   memcpy(at, A->dp, 32 * sizeof(fp_digit));
@ -2880,6 +2987,41 @@ error:

 int fp_radix_size(fp_int *a, int radix, int *size)
 {
+  int     digs;
+  fp_int  t;
+  fp_digit d;
+   
+  *size = 0;
+
+  /* check range of the radix */
+  if (radix < 2 || radix > 64) {
+    return FP_VAL;
+  }
+
+  /* quick out if its zero */
+  if (fp_iszero(a) == 1) {
+     *size = 2;
+     return FP_OKAY;
+  }
+
+  fp_init_copy(&t, a);
+
+  /* if it is negative output a - */
+  if (t.sign == FP_NEG) {
+    *size++;
+    t.sign = FP_ZPOS;
+  }
+
+  digs = 0;
+  while (fp_iszero (&t) == FP_NO) {
+    fp_div_d (&t, (fp_digit) radix, &t, &d);
+    *size++;
+  }
+
+  /* append a NULL so the string is properly terminated */
+  *size++;
+  return FP_OKAY;
+
 }

 /* End: fp_radix_size.c */
@ -3161,9 +3303,11 @@ void fp_sqr(fp_int *A, fp_int *B)
           fp_sqr_comba4(A,B);
        } else if (y <= 8) {
           fp_sqr_comba8(A,B);
+#if defined(TFM_LARGE)
        } else if (y <= 16 && y >= 12) {
           fp_sqr_comba16(A,B);
-#ifdef TFM_HUGE
+#endif
+#if defined(TFM_HUGE)
        } else if (y <= 32 && y >= 28) {
           fp_sqr_comba32(A,B);
 #endif
@ -3279,7 +3423,7 @@ Obvious points of optimization
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_FINI

@ -3319,21 +3463,21 @@ asm volatile (                                            \
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_FINI

 #define SQRADD(i, j)                                      \
-asm volatile (                                            \
+asm (                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %%rax        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");

 #define SQRADD2(i, j)                                     \
-asm volatile (                                            \
+asm (                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
@ -3342,7 +3486,7 @@ asm volatile (                                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");


 #elif defined(TFM_SSE2)
@ -3360,7 +3504,7 @@ asm volatile (                                            \
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_FINI \
   asm("emms");
@ -3371,11 +3515,11 @@ asm volatile (                                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
-     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
-     "adcl  %%edx,%1     \n\t"                            \
+     "movd  %%mm0,%%eax  \n\t"                            \
+     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");

 #define SQRADD2(i, j)                                     \
 asm volatile (                                            \
@ -3409,7 +3553,7 @@ asm volatile (                                            \
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

 #define COMBA_FINI

@ -3438,7 +3582,8 @@ asm(                                                             \

 /* ISO C portable code */

-#define COMBA_START
+#define COMBA_START \
+   { fp_word tt; 

 #define CLEAR_CARRY \
   c0 = c1 = c2 = 0;
@ -3450,23 +3595,28 @@ asm(                                                             \
   x = c1;

 #define CARRY_FORWARD \
-   c0 = c1; c1 = c2; c2 = 0;
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

-#define COMBA_FINI
+#define COMBA_FINI \
+   }

 /* multiplies point i and j, updates carry "c1" and digit c2 */
-#define SQRADD(i, j)                       \
-   t  = ((fp_word)i) * ((fp_word)j);       \
-   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
-   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; 
+#define SQRADD(i, j)                                 \
+   do { fp_word t;                                   \
+   t = c0 + ((fp_word)i) * ((fp_word)j);  c0 = t;    \
+   t = c1 + (t >> DIGIT_BIT);             c1 = t; c2 += t >> DIGIT_BIT; \
+   } while (0);
+  

 /* for squaring some of the terms are doubled... */
-#define SQRADD2(i, j)                       \
-   t  = ((fp_word)i) * ((fp_word)j);       \
-   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
-   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \
-   c0 = (c0 + t);              if (c0 < ((fp_digit)t))  ++c1; \
-   c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; 
+#define SQRADD2(i, j)                                                 \
+   do { fp_word t;                                                    \
+   t  = ((fp_word)i) * ((fp_word)j);                                  \
+   tt = (fp_word)c0 + t;                 c0 = tt;                              \
+   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT;       \
+   tt = (fp_word)c0 + t;                 c0 = tt;                              \
+   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT;       \
+   } while (0);

 #endif

@ -3476,7 +3626,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
  int       pa, ix, iz;
  fp_digit  c0, c1, c2;
  fp_int    tmp, *dst;
-  fp_word   t;

  /* get size of output and trim */
  pa = A->used + A->used;
@ -3549,7 +3698,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)

 void fp_sqr_comba4(fp_int *A, fp_int *B)
 {
-   fp_word t;
   fp_digit *a, b[8], c0, c1, c2;

   a = A->dp;
@ -3603,7 +3751,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B)

 void fp_sqr_comba8(fp_int *A, fp_int *B)
 {
-   fp_word t;
   fp_digit *a, b[16], c0, c1, c2;

   a = A->dp;
@ -3694,10 +3841,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B)
   fp_clamp(B);
 }

+#if defined(TFM_LARGE)

 void fp_sqr_comba16(fp_int *A, fp_int *B)
 {
-   fp_word t;
   fp_digit *a, b[32], c0, c1, c2;

   a = A->dp;
@ -3868,11 +4015,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B)
   fp_clamp(B);
 }

+#endif /* TFM_LARGE */
+
 #ifdef TFM_HUGE

 void fp_sqr_comba32(fp_int *A, fp_int *B)
 {
-   fp_word t;
   fp_digit *a, b[64], c0, c1, c2;

   a = A->dp;
@ -4455,5 +4603,81 @@ int fp_unsigned_bin_size(fp_int *a)

 /* End: fp_unsigned_bin_size.c */

+/* Start: s_fp_add.c */
+/* TomsFastMath, a fast ISO C bignum library.
+ * 
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ * 
+ * Tom St Denis, tomstdenis@iahu.ca
+ */
+#include <tfm.h>
+
+/* unsigned addition */
+void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
+{
+  int      x, y, oldused;
+  fp_word  t;
+
+  y       = MAX(a->used, b->used);
+  oldused = c->used;
+  c->used = y;
+ 
+  t = 0;
+  for (x = 0; x < y; x++) {
+      t         += ((fp_word)a->dp[x]) + ((fp_word)b->dp[x]);
+      c->dp[x]   = (fp_digit)t;
+      t        >>= DIGIT_BIT;
+  }
+  if (t != 0 && x != FP_SIZE) {
+     c->dp[c->used++] = (fp_digit)t;
+     ++x;
+  }
+  
+  for (; x < oldused; x++) {
+     c->dp[x] = 0;
+  }
+  fp_clamp(c);
+}
+
+/* End: s_fp_add.c */
+
+/* Start: s_fp_sub.c */
+/* TomsFastMath, a fast ISO C bignum library.
+ * 
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ * 
+ * Tom St Denis, tomstdenis@iahu.ca
+ */
+#include <tfm.h>
+
+/* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
+void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
+{
+  int      x, oldused;
+  fp_word  t;
+
+  oldused = c->used;
+  c->used = a->used;
+  t       = 0;
+  for (x = 0; x < a->used; x++) {
+      t         = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t);
+      c->dp[x]  = (fp_digit)t;
+      t         = (t >> DIGIT_BIT) & 1;
+  }
+  
+  for (; x < oldused; x++) {
+     c->dp[x] = 0;
+  }
+  fp_clamp(c);
+}
+
+/* End: s_fp_sub.c */
+

 /* EOF */
--- a/tfm.h
+++ b/tfm.h
@ -21,9 +21,19 @@
 #undef MAX
 #define MAX(x,y) ((x)>(y)?(x):(y))

-/* do we want huge code?  The answer is, yes. */
+/* do we want large code? */
+#define TFM_LARGE
+
+/* do we want huge code (implies large)?  The answer is, yes. */
 #define TFM_HUGE

+/* imply TFM_LARGE as required */
+#if defined(TFM_HUGE)
+   #if !defined(TFM_LARGE)
+      #define TFM_LARGE
+   #endif
+#endif
+
 /* Max size of any number in bits.  Basically the largest size you will be multiplying
 * should be half [or smaller] of FP_MAX_SIZE-four_digit
 *
@ -41,19 +51,69 @@
   #error FP_MAX_SIZE must be a multiple of CHAR_BIT
 #endif

-/* make sure we are using 64-bit digits with x86-64 asm */
+/* autodetect x86-64 and make sure we are using 64-bit digits with x86-64 asm */
+#if defined(__x86_64__)
+   #if defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM) 
+       #error x86-64 detected, x86-32/SSE2/ARM optimizations are not valid!
+   #endif
+   #if !defined(TFM_X86_64) && !defined(TFM_NO_ASM)
+      #define TFM_X86_64
+   #endif
+#endif
 #if defined(TFM_X86_64)
-    #ifndef FP_64BIT
+    #if !defined(FP_64BIT)
       #define FP_64BIT
    #endif
 #endif

+/* try to detect x86-32 */
+#if defined(__i386__) && !defined(TFM_SSE2)
+   #if defined(TFM_X86_64) || defined(TFM_ARM) 
+       #error x86-32 detected, x86-64/ARM optimizations are not valid!
+   #endif
+   #if !defined(TFM_X86) && !defined(TFM_NO_ASM)
+      #define TFM_X86
+   #endif
+#endif
+
 /* make sure we're 32-bit for x86-32/sse/arm */
 #if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT)
   #warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining)
   #undef FP_64BIT
 #endif

+/* multi asms? */
+#ifdef TFM_X86
+   #define TFM_ASM
+#endif
+#ifdef TFM_X86_64
+   #ifdef TFM_ASM
+      #error TFM_ASM already defined!
+   #endif
+   #define TFM_ASM
+#endif
+#ifdef TFM_SSE2
+   #ifdef TFM_ASM
+      #error TFM_ASM already defined!
+   #endif
+   #define TFM_ASM
+#endif
+#ifdef TFM_ARM
+   #ifdef TFM_ASM
+      #error TFM_ASM already defined!
+   #endif
+   #define TFM_ASM
+#endif
+
+/* we want no asm? */
+#ifdef TFM_NO_ASM
+	#undef TFM_X86
+	#undef TFM_X86_64
+	#undef TFM_SSE2
+	#undef TFM_ARM
+	#undef TFM_ASM   
+#endif
+
 /* some default configurations.
 */
 #if defined(FP_64BIT)
@ -110,8 +170,11 @@ typedef struct {

 /* functions */

+/* returns a TFM ident string useful for debugging... */
+const char *fp_ident(void);
+
 /* initialize [or zero] an fp int */
-#define fp_init(a)  memset((a), 0, sizeof(fp_int))
+#define fp_init(a)  (void)memset((a), 0, sizeof(fp_int))
 #define fp_zero(a)  fp_init(a)

 /* zero/even/odd ? */
@ -123,7 +186,7 @@ typedef struct {
 void fp_set(fp_int *a, fp_digit b);

 /* copy from a to b */
-#define fp_copy(a, b)     (((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
+#define fp_copy(a, b)      (void)(((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
 #define fp_init_copy(a, b) fp_copy(b, a)

 /* negate and absolute */
@ -139,10 +202,10 @@ void fp_rshd(fp_int *a, int x);
 /* left shift x digits */
 void fp_lshd(fp_int *a, int x);

-/* signed comparisonm */
+/* signed comparison */
 int fp_cmp(fp_int *a, fp_int *b);

-/* unsigned comparisonm */
+/* unsigned comparison */
 int fp_cmp_mag(fp_int *a, fp_int *b);

 /* power of 2 operations */
@ -273,14 +336,18 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
 #ifdef TFM_HUGE
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
 #endif
+#ifdef TFM_LARGE
 void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C);
+#endif
 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);

 void fp_sqr_comba(fp_int *A, fp_int *B);
 void fp_sqr_comba4(fp_int *A, fp_int *B);
 void fp_sqr_comba8(fp_int *A, fp_int *B);
+#ifdef TFM_LARGE
 void fp_sqr_comba16(fp_int *A, fp_int *B);
+#endif
 #ifdef TFM_HUGE
 void fp_sqr_comba32(fp_int *A, fp_int *B);
 #endif
--- a/tfm.tex
+++ b/tfm.tex
@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{TomsFastMath User Manual \\ v0.01}
+\title{TomsFastMath User Manual \\ v0.02}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text and library are all hereby placed in the public domain.  This book has been formatted for B5 
@ -128,23 +128,27 @@ several ``CFLAGS'' defines.
 For example, to build with with SSE2 optimizations type 

 \begin{verbatim}
-export CFLAGS=-DTFM_SSE2
-make clean libtfm.a
+CFLAGS=-DTFM_SSE2 make clean libtfm.a
 \end{verbatim}

 \subsubsection{x86--32}  The ``x86--32'' mode is defined by ``TFM\_X86'' and covers all
 i386 and beyond processors.  It requires GCC to build and only works with 32--bit digits.  In this 
-mode fp\_digit is 32--bits and fp\_word is 64--bits.
+mode fp\_digit is 32--bits and fp\_word is 64--bits.  This mode will be autodetected when building 
+with GCC to an  ``i386'' target.  You can override this behaviour by defining TFM\_NO\_ASM or 
+another optimization mode (such as SSE2).

 \subsubsection{SSE2} The ``SSE2'' mode is defined by ``TFM\_SSE2'' and requires a Pentium 4, Pentium
 M or Athlon64 processor.  It requires GCC to build.  Note that you shouldn't define both
 TFM\_X86 and TFM\_SSE2 at the same time.   This mode only works with 32--bit digits.  In this 
-mode fp\_digit is 32--bits and fp\_word is 64--bits.
+mode fp\_digit is 32--bits and fp\_word is 64--bits.  While this mode will work on the AMD Athlon64 
+series of processors it is less efficient than the native ``x86--64'' mode and not recommended.

 \subsubsection{x86--64}  The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires a 
 ``x86--64'' capable processor (Athlon64 and future Pentium processors).  It requires GCC to
 build and only works with 64--bit digits.  Note that by enabling this mode it will automatically
-enable 64--bit digits.  In this mode fp\_digit is 64--bits and fp\_word is 128--bits.
+enable 64--bit digits.  In this mode fp\_digit is 64--bits and fp\_word is 128--bits.  This mode will
+be autodetected when building with GCC to an ``x86--64'' target.  You can override this behaviour by defining
+TFM\_NO\_ASM.

 \subsubsection{ARM}  The ``ARM'' mode is defined by ``TFM\_ARM'' and requires a ARMv4 or higher
 processor.  It requires GCC and works with 32--bit digits.  In this mode fp\_digit is 32--bits and 
@ -163,6 +167,8 @@ Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline pat
 \hline Pentium 4                  & TFM\_SSE2 \\
 \hline Athlon64                   & TFM\_X86\_64 \\
 \hline ARMv4 or higher            & TFM\_ARM \\
+\hline &\\
+\hline x86--32 or x86--64 (with GCC) & Leave blank and let autodetect work \\
 \hline
 \end{tabular}
 \caption{Recommended Build Modes}
@ -339,9 +345,10 @@ To compute a modular exponentiation use the following function.
 \begin{verbatim}
 int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 \end{verbatim}
-This computes $d \equiv a^b \mbox{ (mod }c)$ for any odd $c$ and positive $b$.  The size of $c$
-must be half of the maximum precision used during the build of the library.  For example,
-by default $c$ must be less than $2^{2048}$.  
+This computes $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for any odd $c$ and $b$.  $b$ may be negative so long as 
+$a^{-1} \mbox{ (mod }c\mbox{)}$ exists.  The initial value of $a$ may be larger than $c$.  The size of $c$ must be 
+half of the maximum precision used during the build of the library.  For example, by default $c$ must be less 
+than $2^{2048}$.  

 \section{Number Theoretic}