added tomsfastmath-0.05

2005-08-01 16:37:35 +00:00 · 2005-08-01 16:37:35 +00:00 · a6c4c5a261
commit a6c4c5a261
parent f91cf2d1cf
21 changed files with 830 additions and 310 deletions
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,10 @@
+August 1st, 2005
+0.05 -- Quick fix to the fp_invmod.c code to let it handle even moduli [required for LTC]
+     -- Added makefile.shared to make shared objects [required for LTC]
+     -- Improved makefiles to make them way more configurable
+     -- Added timing resistant fp_exptmod() enabled with TFM_TIMING_RESISTANT
+
+July 23rd, 2005
 0.04 -- Fixed bugs in the SSE2 squaring code
     -- Rewrote the multipliers to be optimized for small inputs 
     -- Nelson Bolyard of the NSS crew submitted [among other things] new faster Montgomery reduction
--- a/comba_mont_gen.c
+++ b/comba_mont_gen.c
@ -1,59 +1,112 @@
-/* generate montgomery reductions for m->used = 1...16 */
-
 #include <stdio.h>

 int main(void)
 {
-   int N;
-   
-   for (N = 1; N <= 16; N++) {
-       
-printf("void fp_montgomery_reduce_%d(fp_int *a, fp_int *m, fp_digit mp)\n", N);
+   int x, y, z;
+
 printf(
+#if 0
+"#ifdef TFM_SMALL_SET\n"
+"/* computes x/R == x (mod N) via Montgomery Reduction */\n"
+"void fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp)\n"
 "{\n"
-"   fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;\n"
-"   int      oldused, x, y;\n"
+"   fp_digit c[FP_SIZE], *_c, *tmpm, mu, cy;\n"
+"   int      oldused, x, y, pa;\n"
 "\n"
+"#if defined(USE_MEMSET)\n"
 "   /* now zero the buff */\n"
-"   memset(c, 0, sizeof(c));\n"
+"   memset(c, 0, sizeof c);\n"
+"#endif\n"
+"   pa = m->used;\n"
 "\n"
 "   /* copy the input */\n"
 "   oldused = a->used;\n"
 "   for (x = 0; x < oldused; x++) {\n"
 "       c[x] = a->dp[x];\n"
 "   }\n"
-"\n"
+"#if !defined(USE_MEMSET)\n"
+"   for (; x < 2*pa+3; x++) {\n"
+"       c[x] = 0;\n"
+"   }\n"
+"#endif\n"
 "   MONT_START;\n"
+#endif
 "\n"
-"   /* now let's get bizz-sy! */\n"
-"   for (x = 0; x < %d; x++) {\n"
-"       /* get Mu for this round */\n"
-"       LOOP_START;\n"
-"\n"
-"       /* our friendly neighbourhood alias */\n"
-"       _c   = c + x;\n"
-"       tmpm = m->dp;\n"
-"\n"
-"       for (y = 0; y < %d; y++) {\n"
-"          INNERMUL;\n"
-"          ++_c;\n"
-"       }\n"
-"       /* send carry up man... */\n"
-"       _c = c + x;\n"
-"       PROPCARRY;\n"
-"   }         \n"
-"\n"
-"  /* fix the rest of the carries */\n"
-"  _c = c + %d;\n"
-"  for (x = %d; x < %d * 2 + 2; x++) {\n"
-"     PROPCARRY;\n"
-"     ++_c;\n"
+"   switch (pa) {\n");
+
+for (x = 1; x <= 64; x++) {
+if (x > 16 && (x != 32 && x != 48 && x != 64)) continue;
+if (x > 16) printf("#ifdef TFM_HUGE\n");
+
+
+
+printf("      case %d:\n", x);
+
+for (y = 0; y < x; y++) {
+
+printf("            x = %d; cy   = 0;\n"
+       "            LOOP_START;\n"
+       "            _c   = c + %d;\n"
+       "            tmpm = m->dp;\n", y, y);
+
+printf("#ifdef INNERMUL8\n");
+for (z = 0; z+8 <= x; z += 8) {
+printf("            INNERMUL8; _c += 8; tmpm += 8;\n");
+}
+for (; z < x; z++) {
+printf("            INNERMUL; ++_c;\n");
+}
+printf("#else\n");
+for (z = 0; z < x; z++) {
+printf("            INNERMUL; ++_c;\n");
+}
+printf("#endif\n");
+printf("            LOOP_END;\n"
+       "            while (cy) {\n"
+       "               PROPCARRY;\n"
+       "               ++_c;\n"
+       "            }\n");
+}
+//printf("         }\n");
+printf("         break;\n");
+
+
+
+#define LOOP_MACRO(stride)                                 \
+   for (x = 0; x < stride; x++) {                          \
+       fp_digit cy = 0;                                    \
+       /* get Mu for this round */                         \
+       LOOP_START;                                         \
+       _c   = c + x;                                       \
+       tmpm = m->dp;                                       \
+       for (y = 0; y < stride; y++) {                      \
+          INNERMUL;                                        \
+          ++_c;                                            \
+       }                                                   \
+       LOOP_END;                                           \
+       while (cy) {                                        \
+           PROPCARRY;                                      \
+           ++_c;                                           \
+       }                                                   \
+  }         
+
+
+
+
+
+if (x > 16) printf("#endif /* TFM_HUGE */\n");
+
+
+}
+
+#if 0
+
+printf(
 "  }\n"
-"\n"
 "  /* now copy out */\n"
-"  _c   = c + %d;\n"
+"  _c   = c + pa;\n"
 "  tmpm = a->dp;\n"
-"  for (x = 0; x < %d+1; x++) {\n"
+"  for (x = 0; x < pa+1; x++) {\n"
 "     *tmpm++ = *_c++;\n"
 "  }\n"
 "\n"
@ -63,19 +116,17 @@ printf(
 "\n"
 "  MONT_FINI;\n"
 "\n"
-"  a->used = %d+1;\n"
+"  a->used = pa+1;\n"
 "  fp_clamp(a);\n"
 "\n"  
 "  /* if A >= m then A = A - m */\n"
 "  if (fp_cmp_mag (a, m) != FP_LT) {\n"
 "    s_fp_sub (a, m, a);\n"
 "  }\n"
-"}\n", N,N,N,N,N,N,N,N);
-}
+"}\n\n#endif\n");
+
+#endif
+

 return 0;
 }
-
-
-
-
--- a/demo/test.c
+++ b/demo/test.c
@ -213,7 +213,7 @@ t1 = TIMFUNC();
 sleep(1);
 printf("Ticks per second: %llu\n", TIMFUNC() - t1);

-goto expttime;
+goto multtime;
 /* do some timings... */
  printf("Addition:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
--- a/doc/tfm.pdf
+++ b/doc/tfm.pdf
--- a/fp_exptmod.c
+++ b/fp_exptmod.c
@ -9,6 +9,75 @@
 */
 #include <tfm.h>

+#ifdef TFM_TIMING_RESISTANT
+
+/* timing resistant montgomery ladder based exptmod 
+
+   Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded Systems, CHES 2002
+
+   G is the base, X the exponent, P the modulus; the result is copied into Y.
+   Returns the error from fp_montgomery_setup() if that fails, otherwise FP_OKAY.
+
+   Every bit of every used digit of X (leading zero bits of the top digit
+   included) costs one fp_mul and one fp_sqr, each followed by a Montgomery
+   reduction, so the sequence of operations does not depend on the bit values.
+*/
+static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
+{
+  fp_int   R[2];
+  fp_digit buf, mp;
+  int      err, bitcnt, digidx, y;
+
+  /* now setup montgomery (mp is filled in by fp_montgomery_setup and passed to every reduction) */
+  if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
+     return err;
+  }
+
+  fp_init(&R[0]);   
+  fp_init(&R[1]);   
+   
+  /* now we need R mod m; it is stored in R[0] */
+  fp_montgomery_calc_normalization (&R[0], P);
+
+  /* now set R[1] to G * R mod m */
+  if (fp_cmp_mag(P, G) != FP_GT) {
+     /* |G| >= |P| so we reduce it first (return value of fp_mod is not checked) */
+     fp_mod(G, P, &R[1]);
+  } else {
+     fp_copy(G, &R[1]);
+  }
+  fp_mulmod (&R[1], &R[0], P, &R[1]);
+
+  /* for j = t-1 downto 0 do
+        r_!k = R0*R1; r_k = r_k^2
+     where k is the current exponent bit (held in y below) and !k its complement
+  */
+  
+  /* set initial bit count, digit buffer and digit index;
+     bitcnt = 1 makes the first pass through the loop fetch the top digit */
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      /* if digidx == -1 we are out of digits so break */
+      if (digidx == -1) {
+        break;
+      }
+      /* read next digit and reset bitcnt */
+      buf    = X->dp[digidx--];
+      bitcnt = (int)DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent and shift it out of buf */
+    y     = (fp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (fp_digit)1;
+
+    /* ladder step: the product goes to R[y^1], then R[y] is squared;
+       both operations are executed for every bit regardless of its value */
+    fp_mul(&R[0], &R[1], &R[y^1]); fp_montgomery_reduce(&R[y^1], P, mp);
+    fp_sqr(&R[y], &R[y]);          fp_montgomery_reduce(&R[y], P, mp);
+  }
+
+   fp_montgomery_reduce(&R[0], P, mp); /* one last reduction of R[0] before it is copied out */
+   fp_copy(&R[0], Y);
+   return FP_OKAY;
+}   
+
+#else
+
 /* y = g**x (mod b) 
 * Some restrictions... x must be positive and < b
 */
@ -168,6 +237,8 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  return FP_OKAY;
 }

+#endif
+

 int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
--- a/fp_invmod.c
+++ b/fp_invmod.c
@ -9,6 +9,111 @@
 */
 #include <tfm.h>

+static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
+{
+  fp_int  x, y, u, v, A, B, C, D;
+  int     res;
+
+  /* c = 1/a (mod b) for a positive modulus b; fp_invmod() hands even
+     moduli to this routine.  Returns FP_VAL when b is negative or zero,
+     when (a mod b) and b are both even, or when the final v is not 1;
+     returns the fp_mod() error if the initial reduction fails; otherwise
+     stores the inverse in c and returns FP_OKAY.
+     NOTE(review): the step numbers below look like HAC Algorithm 14.61
+     (binary extended gcd) -- confirm.
+
+     b must be strictly positive: reject negative or zero moduli */
+  if (b->sign == FP_NEG || fp_iszero(b) == 1) {
+    return FP_VAL;
+  }
+
+  /* init temps */
+  fp_init(&x);    fp_init(&y);
+  fp_init(&u);    fp_init(&v);
+  fp_init(&A);    fp_init(&B);
+  fp_init(&C);    fp_init(&D);
+
+  /* x = a mod b, y = b */
+  if ((res = fp_mod(a, b, &x)) != FP_OKAY) {
+      return res;
+  }
+  fp_copy(b, &y);
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (fp_iseven (&x) == 1 && fp_iseven (&y) == 1) {
+    return FP_VAL;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1
+        (B and C are not assigned here; presumably fp_init left them zero -- confirm) */
+  fp_copy (&x, &u);
+  fp_copy (&y, &v);
+  fp_set (&A, 1);
+  fp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (fp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    fp_div_2 (&u, &u);
+
+    /* 4.2 if A or B is odd then */
+    if (fp_isodd (&A) == 1 || fp_isodd (&B) == 1) {
+      /* A = A+y, B = B-x (the halving happens just below) */
+      fp_add (&A, &y, &A);
+      fp_sub (&B, &x, &B);
+    }
+    /* A = A/2, B = B/2 */
+    fp_div_2 (&A, &A);
+    fp_div_2 (&B, &B);
+  }
+
+  /* 5.  while v is even do */
+  while (fp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    fp_div_2 (&v, &v);
+
+    /* 5.2 if C or D is odd then */
+    if (fp_isodd (&C) == 1 || fp_isodd (&D) == 1) {
+      /* C = C+y, D = D-x (the halving happens just below) */
+      fp_add (&C, &y, &C);
+      fp_sub (&D, &x, &D);
+    }
+    /* C = C/2, D = D/2 */
+    fp_div_2 (&C, &C);
+    fp_div_2 (&D, &D);
+  }
+
+  /* 6.  if u >= v then */
+  if (fp_cmp (&u, &v) != FP_LT) {
+    /* u = u - v, A = A - C, B = B - D */
+    fp_sub (&u, &v, &u);
+    fp_sub (&A, &C, &A);
+    fp_sub (&B, &D, &B);
+  } else {
+    /* v = v - u, C = C - A, D = D - B */
+    fp_sub (&v, &u, &v);
+    fp_sub (&C, &A, &C);
+    fp_sub (&D, &B, &D);
+  }
+
+  /* if u is not zero goto step 4 */
+  if (fp_iszero (&u) == 0)
+    goto top;
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (fp_cmp_d (&v, 1) != FP_EQ) {
+    return FP_VAL;
+  }
+
+  /* if C is negative, add b until it is no longer below zero */
+  while (fp_cmp_d(&C, 0) == FP_LT) {
+      fp_add(&C, b, &C);
+  }
+  
+  /* if C is not smaller than b in magnitude, subtract b until it is */
+  while (fp_cmp_mag(&C, b) != FP_LT) {
+      fp_sub(&C, b, &C);
+  }
+  
+  /* C is now the inverse */
+  fp_copy(&C, c);
+  return FP_OKAY;
+}
+
 /* c = 1/a (mod b); an even b is handed off to fp_invmod_slow() */
 int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
 {
@ -17,7 +122,7 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c)

  /* 2. [modified] b must be odd   */
  if (fp_iseven (b) == FP_YES) {
-    return FP_VAL;
+    return fp_invmod_slow(a,b,c);
  }

  /* init all our temps */
--- a/fp_montgomery_reduce.c
+++ b/fp_montgomery_reduce.c
@ -299,8 +299,6 @@ asm(                                 \


 #define LO  0
-#define HI  1
-#define CY  2

 /* computes x/R == x (mod N) via Montgomery Reduction */
 void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
@ -347,7 +345,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
       }
       LOOP_END;
       while (cy) {
-           PROPCARRY; //  cy = cy > (*_c += cy);
+           PROPCARRY;
           ++_c;
       }
  }         
@ -374,7 +372,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
  }
 }

+
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
-
--- a/fp_mul_comba.c
+++ b/fp_mul_comba.c
@ -47,7 +47,7 @@

 /* this should multiply i and j  */
 #define MULADD(i, j)                                      \
-asm (                                                     \
+asm(                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -118,7 +118,7 @@ asm  (                                                    \

 /* this should multiply i and j  */
   #define MULADD(i, j)                                      \
-   asm volatile (                                            \
+   asm(                                            \
        "movd  %6,%%mm0     \n\t"                            \
        "movd  %7,%%mm1     \n\t"                            \
        "pmuludq %%mm1,%%mm0\n\t"                            \
--- a/fp_sqr_comba.c
+++ b/fp_sqr_comba.c
@ -36,7 +36,7 @@
 #define COMBA_FINI

 #define SQRADD(i, j)                                      \
-asm volatile (                                            \
+asm(                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %%eax        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -45,7 +45,7 @@ asm volatile (                                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");

 #define SQRADD2(i, j)                                     \
-asm volatile (                                            \
+asm(                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -57,7 +57,7 @@ asm volatile (                                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");

 #define SQRADDSC(i, j)                                    \
-asm (                                                     \
+asm(                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "movl  %%eax,%0     \n\t"                            \
@ -66,7 +66,7 @@ asm (                                                     \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");

 #define SQRADDAC(i, j)                                    \
-asm (                                                     \
+asm(                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -75,7 +75,7 @@ asm (                                                     \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");

 #define SQRADDDB                                          \
-asm (                                                     \
+asm(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
@ -104,7 +104,7 @@ asm (                                                     \
 #define COMBA_FINI

 #define SQRADD(i, j)                                      \
-asm (                                                     \
+asm(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %%rax        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
@ -113,7 +113,7 @@ asm (                                                     \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");

 #define SQRADD2(i, j)                                     \
-asm (                                                     \
+asm(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
@ -125,7 +125,7 @@ asm (                                                     \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");

 #define SQRADDSC(i, j)                                    \
-asm (                                                     \
+asm(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "movq  %%rax,%0     \n\t"                            \
@ -134,7 +134,7 @@ asm (                                                     \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");

 #define SQRADDAC(i, j)                                                         \
-asm (                                                     \
+asm(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
@ -143,7 +143,7 @@ asm (                                                     \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");

 #define SQRADDDB                                          \
-asm (                                                     \
+asm(                                                     \
     "addq %6,%0         \n\t"                            \
     "adcq %7,%1         \n\t"                            \
     "adcq %8,%2         \n\t"                            \
@ -173,7 +173,7 @@ asm (                                                     \
   asm("emms");

 #define SQRADD(i, j)                                      \
-asm volatile (                                            \
+asm(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
@ -185,7 +185,7 @@ asm volatile (                                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");

 #define SQRADD2(i, j)                                     \
-asm volatile (                                            \
+asm(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
@ -201,7 +201,7 @@ asm volatile (                                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");

 #define SQRADDSC(i, j)                                                         \
-asm volatile (                                            \
+asm(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
@ -212,7 +212,7 @@ asm volatile (                                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j));

 #define SQRADDAC(i, j)                                                         \
-asm volatile (                                            \
+asm(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
@ -225,7 +225,7 @@ asm volatile (                                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","%cc");

 #define SQRADDDB                                          \
-asm (                                                     \
+asm(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
--- a/67
+++ b/67
@ -1,7 +1,13 @@
 #makefile for TomsFastMath
 #
 #
-CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
+VERSION=0.05
+
+CFLAGS += -Wall -W -Wshadow -I./ 
+
+ifndef IGNORE_SPEED
+
+CFLAGS += -O3 -funroll-all-loops

 #profiling
 #PROF=-pg -g
@ -10,9 +16,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
 #speed
 CFLAGS += -fomit-frame-pointer

-VERSION=0.04
-
-default: libtfm.a
+endif

 OBJECTS = \
 fp_set.o \
@ -52,23 +56,29 @@ ifndef INCPATH
   INCPATH=/usr/include
 endif

-ifndef TFM_GROUP
+ifndef INSTALL_GROUP
   GROUP=wheel
+else
+   GROUP=$(INSTALL_GROUP)
 endif

-ifndef TFM_USER
+ifndef INSTALL_USER
   USER=root
+else
+   USER=$(INSTALL_USER)
 endif

 ifndef LIBNAME
 	LIBNAME=libtfm.a
 endif

-$(LIBNAME): $(OBJECTS)
-	$(AR) $(ARFLAGS) $(LIBNAME) $(OBJECTS)
-	ranlib $(LIBNAME)
+default: $(LIBNAME)

-install: libtfm.a
+$(LIBNAME): $(OBJECTS)
+	$(AR) $(ARFLAGS) $@ $(OBJECTS)
+	ranlib $@
+
+install: $(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
 	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
@ -77,17 +87,17 @@ install: libtfm.a
 mtest/mtest: mtest/mtest.c
 	cd mtest ; make mtest

-test: libtfm.a demo/test.o mtest/mtest
-	$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test
+test: $(LIBNAME) demo/test.o mtest/mtest
+	$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test

-timing: libtfm.a demo/test.o
-	$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test
+timing: $(LIBNAME) demo/test.o
+	$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test
 	
-stest: libtfm.a demo/stest.o 
-	$(CC) $(CFLAGS) demo/stest.o libtfm.a -o stest
+stest: $(LIBNAME) demo/stest.o 
+	$(CC) $(CFLAGS) demo/stest.o $(LIBNAME) -o stest

-rsatest: libtfm.a demo/rsa.o
-	$(CC) $(CFLAGS) demo/rsa.o libtfm.a -o rsatest
+rsatest: $(LIBNAME) demo/rsa.o
+	$(CC) $(CFLAGS) demo/rsa.o $(LIBNAME) -o rsatest

 docdvi: tfm.tex
 	touch tfm.ind
@ -101,8 +111,23 @@ docs: docdvi
 	dvipdf tfm
 	mv -f tfm.pdf doc

+#This rule cleans the source tree of all compiled code, not including the pdf
+#documentation.
 clean:
-	rm -f $(OBJECTS) *.a demo/*.o test tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc stest *~ rsatest *.gcda *.gcno demo/*.gcda demo/*.gcno mtest/*.gcno mtest/*.gcda
+	rm -f `find . -type f | grep "[.]o" | xargs`
+	rm -f `find . -type f | grep "[.]lo"  | xargs`
+	rm -f `find . -type f | grep "[.]a" | xargs`
+	rm -f `find . -type f | grep "[.]la"  | xargs`
+	rm -f `find . -type f | grep "[.]obj" | xargs`
+	rm -f `find . -type f | grep "[.]lib" | xargs`
+	rm -f `find . -type f | grep "[.]exe" | xargs`
+	rm -f `find . -type f | grep "[.]gcda" | xargs`
+	rm -f `find . -type f | grep "[.]gcno" | xargs`
+	rm -f `find . -type f | grep "[.]il" | xargs`
+	rm -f `find . -type f | grep "[.]dyn" | xargs`
+	rm -f `find . -type f | grep "[.]dpi" | xargs`
+	rm -rf `find . -type d | grep "[.]libs" | xargs`
+	rm -f tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc 
 	cd mtest ; make clean

 no_oops: clean
@ -116,3 +141,7 @@ zipup: no_oops docs clean
 	cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \
 	tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 ; \
 	zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/*
+
+# $Source: /cvs/libtom/tomsfastmath/makefile,v $ 
+# $Revision: 1.17 $ 
+# $Date: 2005/07/30 04:23:55 $ 
--- a/makefile.gba
+++ b/makefile.gba
@ -1,55 +0,0 @@
-#makefile for TomsFastMath
-#
-#For the GameboyAdance... er.... ARMv4
-SFLAGS = $(CFLAGS) -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -mthumb -mthumb-interwork -I../devkitadv/mylib/lib
-CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -marm -mthumb-interwork -I../devkitadv/mylib/lib
-
-#profiling
-#PROF=-pg -g
-#CFLAGS += $(PROF)
-
-#speed
-CFLAGS += -fomit-frame-pointer
-
-VERSION=0.01
-
-default: libtfm.a
-
-OBJECTS = \
-fp_set.o \
-\
-fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
-fp_mul_2.o fp_div_2.o  \
-\
-fp_cnt_lsb.o \
-\
-fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
-s_fp_add.o s_fp_sub.o \
-\
-fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
-fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
-fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
-fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
-\
-fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
-\
-fp_exptmod.o \
-\
-fp_cmp.o fp_cmp_mag.o \
-\
-fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
-fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
-fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
-\
-
-libtfm.a: $(OBJECTS)
-	$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
-	ranlib libtfm.a
-
-demo/stest.o: demo/stest.c
-	$(CC) $(SFLAGS) -DGBA_MODE demo/stest.c -c -o demo/stest.o
-
-stest: libtfm.a demo/stest.o 
-	$(CC) -mthumb -mthumb-interwork demo/stest.o libtfm.a ../devkitadv/mylib/lib/gba.a -o stest.elf
-	objcopy -O binary stest.elf stest.bin
-
--- a/makefile.shared
+++ b/makefile.shared
@ -0,0 +1,109 @@
+#makefile for TomsFastMath
+#
+#
+
+CC=libtool --mode=compile gcc
+
+CFLAGS += -Wall -W -Wshadow -I./ 
+
+ifndef IGNORE_SPEED
+
+CFLAGS += -O3 -funroll-all-loops
+
+#profiling
+#PROF=-pg -g
+#CFLAGS += $(PROF)
+
+#speed
+CFLAGS += -fomit-frame-pointer
+
+endif
+
+VERSION=0:5
+
+OBJECTS = \
+fp_set.o \
+\
+fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
+fp_mul_2.o fp_div_2.o  \
+\
+fp_cnt_lsb.o \
+\
+fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
+s_fp_add.o s_fp_sub.o \
+\
+fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
+fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
+fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
+fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
+\
+fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
+\
+fp_exptmod.o \
+\
+fp_cmp.o fp_cmp_mag.o \
+\
+fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
+fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
+fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
+\
+fp_ident.o 
+
+HEADERS=tfm.h
+
+ifndef LIBPATH
+   LIBPATH=/usr/lib
+endif
+
+ifndef INCPATH
+   INCPATH=/usr/include
+endif
+
+ifndef INSTALL_GROUP
+   GROUP=wheel
+else
+   GROUP=$(INSTALL_GROUP)
+endif
+
+ifndef INSTALL_USER
+   USER=root
+else
+   USER=$(INSTALL_USER)
+endif
+
+ifndef LIBNAME
+	LIBNAME=libtfm.la
+endif
+
+ifndef LIBNAME_S
+	LIBNAME_S=libtfm.a
+endif
+
+default: $(LIBNAME)
+
+$(LIBNAME): $(OBJECTS)
+
+install: $(LIBNAME)
+	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
+	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]o" | xargs` -o $(LIBNAME_S)
+	ranlib $(LIBNAME_S)
+	libtool --silent --mode=install install -c $(LIBNAME) $(LIBPATH)/$(LIBNAME)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
+
+mtest/mtest: mtest/mtest.c
+	cd mtest ; make mtest
+
+test: $(LIBNAME) demo/test.o mtest/mtest
+	$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
+
+timing: $(LIBNAME) demo/test.o
+	$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
+	
+stest: $(LIBNAME) demo/stest.o 
+	$(CC) $(CFLAGS) demo/stest.o $(LIBNAME_S) -o stest
+
+# $Source: /cvs/libtom/tomsfastmath/makefile.shared,v $ 
+# $Revision: 1.4 $ 
+# $Date: 2005/07/28 03:08:35 $ 
+
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@ -757,6 +757,75 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
 */
 #include <tfm.h>

+#ifdef TFM_TIMING_RESISTANT
+
+/* timing resistant montgomery ladder based exptmod 
+
+   Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded Systems, CHES 2002
+
+   G is the base, X the exponent, P the modulus; the result is copied into Y.
+   Returns the error from fp_montgomery_setup() if that fails, otherwise FP_OKAY.
+
+   Every bit of every used digit of X (leading zero bits of the top digit
+   included) costs one fp_mul and one fp_sqr, each followed by a Montgomery
+   reduction, so the sequence of operations does not depend on the bit values.
+*/
+static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
+{
+  fp_int   R[2];
+  fp_digit buf, mp;
+  int      err, bitcnt, digidx, y;
+
+  /* now setup montgomery (mp is filled in by fp_montgomery_setup and passed to every reduction) */
+  if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
+     return err;
+  }
+
+  fp_init(&R[0]);   
+  fp_init(&R[1]);   
+   
+  /* now we need R mod m; it is stored in R[0] */
+  fp_montgomery_calc_normalization (&R[0], P);
+
+  /* now set R[1] to G * R mod m */
+  if (fp_cmp_mag(P, G) != FP_GT) {
+     /* |G| >= |P| so we reduce it first (return value of fp_mod is not checked) */
+     fp_mod(G, P, &R[1]);
+  } else {
+     fp_copy(G, &R[1]);
+  }
+  fp_mulmod (&R[1], &R[0], P, &R[1]);
+
+  /* for j = t-1 downto 0 do
+        r_!k = R0*R1; r_k = r_k^2
+     where k is the current exponent bit (held in y below) and !k its complement
+  */
+  
+  /* set initial bit count, digit buffer and digit index;
+     bitcnt = 1 makes the first pass through the loop fetch the top digit */
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      /* if digidx == -1 we are out of digits so break */
+      if (digidx == -1) {
+        break;
+      }
+      /* read next digit and reset bitcnt */
+      buf    = X->dp[digidx--];
+      bitcnt = (int)DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent and shift it out of buf */
+    y     = (fp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (fp_digit)1;
+
+    /* ladder step: the product goes to R[y^1], then R[y] is squared;
+       both operations are executed for every bit regardless of its value */
+    fp_mul(&R[0], &R[1], &R[y^1]); fp_montgomery_reduce(&R[y^1], P, mp);
+    fp_sqr(&R[y], &R[y]);          fp_montgomery_reduce(&R[y], P, mp);
+  }
+
+   fp_montgomery_reduce(&R[0], P, mp); /* one last reduction of R[0] before it is copied out */
+   fp_copy(&R[0], Y);
+   return FP_OKAY;
+}   
+
+#else
+
 /* y = g**x (mod b) 
 * Some restrictions... x must be positive and < b
 */
@ -916,6 +985,8 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  return FP_OKAY;
 }

+#endif
+

 int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
@ -1105,6 +1176,111 @@ int main(void)
 */
 #include <tfm.h>

+static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
+{
+  fp_int  x, y, u, v, A, B, C, D;
+  int     res;
+
+  /* c = 1/a (mod b) for a positive modulus b; fp_invmod() hands even
+     moduli to this routine.  Returns FP_VAL when b is negative or zero,
+     when (a mod b) and b are both even, or when the final v is not 1;
+     returns the fp_mod() error if the initial reduction fails; otherwise
+     stores the inverse in c and returns FP_OKAY.
+     NOTE(review): the step numbers below look like HAC Algorithm 14.61
+     (binary extended gcd) -- confirm.
+
+     b must be strictly positive: reject negative or zero moduli */
+  if (b->sign == FP_NEG || fp_iszero(b) == 1) {
+    return FP_VAL;
+  }
+
+  /* init temps */
+  fp_init(&x);    fp_init(&y);
+  fp_init(&u);    fp_init(&v);
+  fp_init(&A);    fp_init(&B);
+  fp_init(&C);    fp_init(&D);
+
+  /* x = a mod b, y = b */
+  if ((res = fp_mod(a, b, &x)) != FP_OKAY) {
+      return res;
+  }
+  fp_copy(b, &y);
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (fp_iseven (&x) == 1 && fp_iseven (&y) == 1) {
+    return FP_VAL;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1
+        (B and C are not assigned here; presumably fp_init left them zero -- confirm) */
+  fp_copy (&x, &u);
+  fp_copy (&y, &v);
+  fp_set (&A, 1);
+  fp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (fp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    fp_div_2 (&u, &u);
+
+    /* 4.2 if A or B is odd then */
+    if (fp_isodd (&A) == 1 || fp_isodd (&B) == 1) {
+      /* A = A+y, B = B-x (the halving happens just below) */
+      fp_add (&A, &y, &A);
+      fp_sub (&B, &x, &B);
+    }
+    /* A = A/2, B = B/2 */
+    fp_div_2 (&A, &A);
+    fp_div_2 (&B, &B);
+  }
+
+  /* 5.  while v is even do */
+  while (fp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    fp_div_2 (&v, &v);
+
+    /* 5.2 if C or D is odd then */
+    if (fp_isodd (&C) == 1 || fp_isodd (&D) == 1) {
+      /* C = C+y, D = D-x (the halving happens just below) */
+      fp_add (&C, &y, &C);
+      fp_sub (&D, &x, &D);
+    }
+    /* C = C/2, D = D/2 */
+    fp_div_2 (&C, &C);
+    fp_div_2 (&D, &D);
+  }
+
+  /* 6.  if u >= v then */
+  if (fp_cmp (&u, &v) != FP_LT) {
+    /* u = u - v, A = A - C, B = B - D */
+    fp_sub (&u, &v, &u);
+    fp_sub (&A, &C, &A);
+    fp_sub (&B, &D, &B);
+  } else {
+    /* v = v - u, C = C - A, D = D - B */
+    fp_sub (&v, &u, &v);
+    fp_sub (&C, &A, &C);
+    fp_sub (&D, &B, &D);
+  }
+
+  /* if u is not zero goto step 4 */
+  if (fp_iszero (&u) == 0)
+    goto top;
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (fp_cmp_d (&v, 1) != FP_EQ) {
+    return FP_VAL;
+  }
+
+  /* if C is negative, add b until it is no longer below zero */
+  while (fp_cmp_d(&C, 0) == FP_LT) {
+      fp_add(&C, b, &C);
+  }
+  
+  /* if C is not smaller than b in magnitude, subtract b until it is */
+  while (fp_cmp_mag(&C, b) != FP_LT) {
+      fp_sub(&C, b, &C);
+  }
+  
+  /* C is now the inverse */
+  fp_copy(&C, c);
+  return FP_OKAY;
+}
+
 /* c = 1/a (mod b); an even b is handed off to fp_invmod_slow() */
 int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
 {
@ -1113,7 +1289,7 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c)

  /* 2. [modified] b must be odd   */
  if (fp_iseven (b) == FP_YES) {
-    return FP_VAL;
+    return fp_invmod_slow(a,b,c);
  }

  /* init all our temps */
@ -1814,8 +1990,6 @@ asm(                                 \


 #define LO  0
-#define HI  1
-#define CY  2

 /* computes x/R == x (mod N) via Montgomery Reduction */
 void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
@ -1862,7 +2036,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
       }
       LOOP_END;
       while (cy) {
-           PROPCARRY; //  cy = cy > (*_c += cy);
+           PROPCARRY;
           ++_c;
       }
  }         
@ -1889,10 +2063,10 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
  }
 }

+
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
-

 /* End: fp_montgomery_reduce.c */

@ -2270,7 +2444,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)

 /* this should multiply i and j  */
 #define MULADD(i, j)                                      \
-asm (                                                     \
+asm(                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -2341,7 +2515,7 @@ asm  (                                                    \

 /* this should multiply i and j  */
   #define MULADD(i, j)                                      \
-   asm volatile (                                            \
+   asm(                                            \
        "movd  %6,%%mm0     \n\t"                            \
        "movd  %7,%%mm1     \n\t"                            \
        "pmuludq %%mm1,%%mm0\n\t"                            \
@ -5678,7 +5852,7 @@ Obvious points of optimization
 #define COMBA_FINI

 #define SQRADD(i, j)                                      \
-asm volatile (                                            \
+asm(                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %%eax        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -5687,7 +5861,7 @@ asm volatile (                                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");

 #define SQRADD2(i, j)                                     \
-asm volatile (                                            \
+asm(                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -5699,7 +5873,7 @@ asm volatile (                                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");

 #define SQRADDSC(i, j)                                    \
-asm (                                                     \
+asm(                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "movl  %%eax,%0     \n\t"                            \
@ -5708,7 +5882,7 @@ asm (                                                     \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");

 #define SQRADDAC(i, j)                                    \
-asm (                                                     \
+asm(                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
@ -5717,7 +5891,7 @@ asm (                                                     \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");

 #define SQRADDDB                                          \
-asm (                                                     \
+asm(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
@ -5746,7 +5920,7 @@ asm (                                                     \
 #define COMBA_FINI

 #define SQRADD(i, j)                                      \
-asm (                                                     \
+asm(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %%rax        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
@ -5755,7 +5929,7 @@ asm (                                                     \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");

 #define SQRADD2(i, j)                                     \
-asm (                                                     \
+asm(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
@ -5767,7 +5941,7 @@ asm (                                                     \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");

 #define SQRADDSC(i, j)                                    \
-asm (                                                     \
+asm(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "movq  %%rax,%0     \n\t"                            \
@ -5776,7 +5950,7 @@ asm (                                                     \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");

 #define SQRADDAC(i, j)                                                         \
-asm (                                                     \
+asm(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
@ -5785,7 +5959,7 @@ asm (                                                     \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");

 #define SQRADDDB                                          \
-asm (                                                     \
+asm(                                                     \
     "addq %6,%0         \n\t"                            \
     "adcq %7,%1         \n\t"                            \
     "adcq %8,%2         \n\t"                            \
@ -5815,7 +5989,7 @@ asm (                                                     \
   asm("emms");

 #define SQRADD(i, j)                                      \
-asm volatile (                                            \
+asm(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
@ -5827,7 +6001,7 @@ asm volatile (                                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");

 #define SQRADD2(i, j)                                     \
-asm volatile (                                            \
+asm(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
@ -5843,7 +6017,7 @@ asm volatile (                                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");

 #define SQRADDSC(i, j)                                                         \
-asm volatile (                                            \
+asm(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
@ -5854,7 +6028,7 @@ asm volatile (                                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j));

 #define SQRADDAC(i, j)                                                         \
-asm volatile (                                            \
+asm(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
@ -5867,7 +6041,7 @@ asm volatile (                                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","%cc");

 #define SQRADDDB                                          \
-asm (                                                     \
+asm(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
--- a/tfm.aux
+++ b/tfm.aux
@ -17,40 +17,42 @@
 \@writefile{toc}{\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}}
 \@writefile{toc}{\contentsline {section}{\numberline {1.2}License}{2}{section.1.2}}
 \@writefile{toc}{\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Intel CC}{2}{subsection.1.3.1}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}MSVC}{2}{subsection.1.3.2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Build Limitations}{3}{subsection.1.3.3}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.4}Optimization Configuration}{3}{subsection.1.3.4}}
 \@writefile{toc}{\contentsline {subsubsection}{x86--32}{3}{section*.3}}
 \@writefile{toc}{\contentsline {subsubsection}{SSE2}{3}{section*.4}}
-\@writefile{toc}{\contentsline {subsubsection}{x86--64}{3}{section*.5}}
-\@writefile{toc}{\contentsline {subsubsection}{ARM}{3}{section*.6}}
-\@writefile{toc}{\contentsline {subsubsection}{PPC32}{3}{section*.7}}
+\@writefile{toc}{\contentsline {subsubsection}{x86--64}{4}{section*.5}}
+\@writefile{toc}{\contentsline {subsubsection}{ARM}{4}{section*.6}}
+\@writefile{toc}{\contentsline {subsubsection}{PPC32}{4}{section*.7}}
 \@writefile{toc}{\contentsline {subsubsection}{Future Releases}{4}{section*.8}}
 \@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Recommended Build Modes}}{4}{figure.1.1}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3}}
-\@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.5}Precision Configuration}{5}{subsection.1.3.5}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{7}{chapter.2}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1}}
-\@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3}}
-\@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3}}
+\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{7}{section.2.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{8}{section.2.2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{8}{subsection.2.2.1}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{8}{subsection.2.2.2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{8}{subsection.2.2.3}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{9}{chapter.3}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8}}
-\@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.1}Odds and Evens}{9}{section.3.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{9}{section.3.2}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{10}{section.3.3}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{10}{section.3.4}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{11}{section.3.5}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{11}{section.3.6}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{11}{section.3.7}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{12}{section.3.8}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{13}{chapter.4}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\newlabel{chap:asmops}{{4}{11}{Porting TomsFastMath\relax }{chapter.4}{}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4}}
+\newlabel{chap:asmops}{{4}{13}{Porting TomsFastMath\relax }{chapter.4}{}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{13}{section.4.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{13}{section.4.2}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{15}{section.4.3}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{17}{section.4.4}}
--- a/tfm.dvi
+++ b/tfm.dvi
--- a/tfm.h
+++ b/tfm.h
@ -48,6 +48,11 @@
 */
 /* #define TFM_PRESCOTT */

+/* Do we want timing resistant fp_exptmod() ?
+ * This makes it slower but also timing invariant with respect to the exponent 
+ */
+/* #define TFM_TIMING_RESISTANT */
+
 #endif

 /* Max size of any number in bits.  Basically the largest size you will be multiplying
@ -355,15 +360,25 @@ int fp_toradix_n(fp_int * a, char *str, int radix, int maxlen);
 void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
 void bn_reverse(unsigned char *s, int len);
+
 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
+
+#ifdef TFM_SMALL_SET
+void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
+#endif
+
 #ifdef TFM_HUGE
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
 void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C);
 void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C);
 #endif
-void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);

+void fp_sqr_comba(fp_int *A, fp_int *B);
+
+#ifdef TFM_SMALL_SET
 void fp_sqr_comba_small(fp_int *A, fp_int *B);
+#endif
+
 #ifdef TFM_HUGE
 void fp_sqr_comba32(fp_int *A, fp_int *B);
 void fp_sqr_comba48(fp_int *A, fp_int *B);
--- a/tfm.idx
+++ b/tfm.idx
@ -1,29 +1,29 @@
-\indexentry{fp\_init|hyperpage}{6}
-\indexentry{fp\_set|hyperpage}{6}
-\indexentry{fp\_init\_copy|hyperpage}{6}
-\indexentry{fp\_iszero|hyperpage}{7}
-\indexentry{fp\_iseven|hyperpage}{7}
-\indexentry{fp\_isodd|hyperpage}{7}
-\indexentry{fp\_neg|hyperpage}{7}
-\indexentry{fp\_abs|hyperpage}{7}
-\indexentry{fp\_cmp|hyperpage}{8}
-\indexentry{fp\_cmp\_mag|hyperpage}{8}
-\indexentry{fp\_lshd|hyperpage}{8}
-\indexentry{fp\_rshd|hyperpage}{8}
-\indexentry{fp\_div\_2d|hyperpage}{8}
-\indexentry{fp\_mod\_2d|hyperpage}{8}
-\indexentry{fp\_mul\_2d|hyperpage}{8}
-\indexentry{fp\_div\_2|hyperpage}{8}
-\indexentry{fp\_mul\_2|hyperpage}{8}
-\indexentry{fp\_cnt\_lsb|hyperpage}{8}
-\indexentry{fp\_add|hyperpage}{9}
-\indexentry{fp\_sub|hyperpage}{9}
-\indexentry{fp\_mul|hyperpage}{9}
-\indexentry{fp\_sqr|hyperpage}{9}
-\indexentry{fp\_div|hyperpage}{9}
-\indexentry{fp\_mod|hyperpage}{9}
-\indexentry{fp\_exptmod|hyperpage}{9}
-\indexentry{fp\_invmod|hyperpage}{9}
-\indexentry{fp\_gcd|hyperpage}{9}
-\indexentry{fp\_lcm|hyperpage}{9}
-\indexentry{fp\_isprime|hyperpage}{10}
+\indexentry{fp\_init|hyperpage}{8}
+\indexentry{fp\_set|hyperpage}{8}
+\indexentry{fp\_init\_copy|hyperpage}{8}
+\indexentry{fp\_iszero|hyperpage}{9}
+\indexentry{fp\_iseven|hyperpage}{9}
+\indexentry{fp\_isodd|hyperpage}{9}
+\indexentry{fp\_neg|hyperpage}{9}
+\indexentry{fp\_abs|hyperpage}{9}
+\indexentry{fp\_cmp|hyperpage}{10}
+\indexentry{fp\_cmp\_mag|hyperpage}{10}
+\indexentry{fp\_lshd|hyperpage}{10}
+\indexentry{fp\_rshd|hyperpage}{10}
+\indexentry{fp\_div\_2d|hyperpage}{10}
+\indexentry{fp\_mod\_2d|hyperpage}{10}
+\indexentry{fp\_mul\_2d|hyperpage}{10}
+\indexentry{fp\_div\_2|hyperpage}{10}
+\indexentry{fp\_mul\_2|hyperpage}{10}
+\indexentry{fp\_cnt\_lsb|hyperpage}{10}
+\indexentry{fp\_add|hyperpage}{11}
+\indexentry{fp\_sub|hyperpage}{11}
+\indexentry{fp\_mul|hyperpage}{11}
+\indexentry{fp\_sqr|hyperpage}{11}
+\indexentry{fp\_div|hyperpage}{11}
+\indexentry{fp\_mod|hyperpage}{11}
+\indexentry{fp\_exptmod|hyperpage}{11}
+\indexentry{fp\_invmod|hyperpage}{11}
+\indexentry{fp\_gcd|hyperpage}{11}
+\indexentry{fp\_lcm|hyperpage}{11}
+\indexentry{fp\_isprime|hyperpage}{12}
--- a/tfm.ind
+++ b/tfm.ind
@ -1,33 +1,33 @@
 \begin{theindex}

-  \item fp\_abs, \hyperpage{7}
-  \item fp\_add, \hyperpage{9}
-  \item fp\_cmp, \hyperpage{8}
-  \item fp\_cmp\_mag, \hyperpage{8}
-  \item fp\_cnt\_lsb, \hyperpage{8}
-  \item fp\_div, \hyperpage{9}
-  \item fp\_div\_2, \hyperpage{8}
-  \item fp\_div\_2d, \hyperpage{8}
-  \item fp\_exptmod, \hyperpage{9}
-  \item fp\_gcd, \hyperpage{9}
-  \item fp\_init, \hyperpage{6}
-  \item fp\_init\_copy, \hyperpage{6}
-  \item fp\_invmod, \hyperpage{9}
-  \item fp\_iseven, \hyperpage{7}
-  \item fp\_isodd, \hyperpage{7}
-  \item fp\_isprime, \hyperpage{10}
-  \item fp\_iszero, \hyperpage{7}
-  \item fp\_lcm, \hyperpage{9}
-  \item fp\_lshd, \hyperpage{8}
-  \item fp\_mod, \hyperpage{9}
-  \item fp\_mod\_2d, \hyperpage{8}
-  \item fp\_mul, \hyperpage{9}
-  \item fp\_mul\_2, \hyperpage{8}
-  \item fp\_mul\_2d, \hyperpage{8}
-  \item fp\_neg, \hyperpage{7}
-  \item fp\_rshd, \hyperpage{8}
-  \item fp\_set, \hyperpage{6}
-  \item fp\_sqr, \hyperpage{9}
-  \item fp\_sub, \hyperpage{9}
+  \item fp\_abs, \hyperpage{9}
+  \item fp\_add, \hyperpage{11}
+  \item fp\_cmp, \hyperpage{10}
+  \item fp\_cmp\_mag, \hyperpage{10}
+  \item fp\_cnt\_lsb, \hyperpage{10}
+  \item fp\_div, \hyperpage{11}
+  \item fp\_div\_2, \hyperpage{10}
+  \item fp\_div\_2d, \hyperpage{10}
+  \item fp\_exptmod, \hyperpage{11}
+  \item fp\_gcd, \hyperpage{11}
+  \item fp\_init, \hyperpage{8}
+  \item fp\_init\_copy, \hyperpage{8}
+  \item fp\_invmod, \hyperpage{11}
+  \item fp\_iseven, \hyperpage{9}
+  \item fp\_isodd, \hyperpage{9}
+  \item fp\_isprime, \hyperpage{12}
+  \item fp\_iszero, \hyperpage{9}
+  \item fp\_lcm, \hyperpage{11}
+  \item fp\_lshd, \hyperpage{10}
+  \item fp\_mod, \hyperpage{11}
+  \item fp\_mod\_2d, \hyperpage{10}
+  \item fp\_mul, \hyperpage{11}
+  \item fp\_mul\_2, \hyperpage{10}
+  \item fp\_mul\_2d, \hyperpage{10}
+  \item fp\_neg, \hyperpage{9}
+  \item fp\_rshd, \hyperpage{10}
+  \item fp\_set, \hyperpage{8}
+  \item fp\_sqr, \hyperpage{11}
+  \item fp\_sub, \hyperpage{11}

 \end{theindex}
--- a/tfm.log
+++ b/tfm.log
@ -1,4 +1,4 @@
-This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10)  23 JUL 2005 07:42
+This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10)  1 AUG 2005 13:34
 entering extended mode
 **tfm
 (./tfm.tex
@ -216,107 +216,107 @@ File: umsb.fd 2002/01/19 v2.2g AMS font definitions
 Chapter 1.
 [1

-] [2] [3] [4]
+] [2] [3] [4] [5] [6
+
+]
 Chapter 2.

 Underfull \vbox (badness 7649) has occurred while \output is active []

- [5
-
-]
-[6]
+ [7]
+[8]
 Chapter 3.
-[7
+[9

-] [8] [9] [10]
+] [10] [11] [12]
 Chapter 4.
-[11
+[13

-] [12] [13]
-Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
+] [14] [15]
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
 []\OT1/cmtt/m/n/10 #define SQRADDSC(i, j)                                      
                   \[] 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
 []   \OT1/cmtt/m/n/10 do { fp_word t;                                          
                   \[] 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
 []      \OT1/cmtt/m/n/10 t =  ((fp_word)i) * ((fp_word)j);                     
                   \[] 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
 []      \OT1/cmtt/m/n/10 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;   
                   \[] 
 []


-Overfull \hbox (25.129pt too wide) in paragraph at lines 548--549
+Overfull \hbox (25.129pt too wide) in paragraph at lines 560--561
 \OT1/cmr/m/n/10 This com-putes a prod-uct and stores it in the ``sec-ondary'' c
 arry reg-is-ters $[]$. 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
 []\OT1/cmtt/m/n/10 #define SQRADDAC(i, j)                                      
                   \[] 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
 []   \OT1/cmtt/m/n/10 do { fp_word t;                                          
                   \[] 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
 []   \OT1/cmtt/m/n/10 t = sc0 + ((fp_word)i) * ((fp_word)j);  sc0 = t;         
                   \[] 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
 []   \OT1/cmtt/m/n/10 t = sc1 + (t >> DIGIT_BIT);             sc1 = t; sc2 += t
 >> DIGIT_BIT;     \[] 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 578--578
 []\OT1/cmtt/m/n/10 #define SQRADDDB                                            
                   \[] 
 []


-Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566
+Overfull \hbox (74.99634pt too wide) in paragraph at lines 578--578
 []   \OT1/cmtt/m/n/10 do { fp_word t;                                          
                   \[] 
 []


-Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
+Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
 []   \OT1/cmtt/m/n/10 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t;        
                                         \[] 
 []


-Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
+Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
 []   \OT1/cmtt/m/n/10 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BI
 T); c1 = t;                              \[] 
 []


-Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
+Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
 []   \OT1/cmtt/m/n/10 c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_B
 IT);                                     \[] 
 []

-[14] [15] (./tfm.ind [16] [17
+[16] [17] (./tfm.ind [18] [19


 ]) (./tfm.aux) ) 
@ -329,4 +329,4 @@ Here is how much of TeX's memory you used:
 580 hyphenation exceptions out of 1000
 25i,9n,25p,195b,321s stack positions out of 1500i,500n,1500p,200000b,5000s

-Output written on tfm.dvi (23 pages, 49708 bytes).
+Output written on tfm.dvi (25 pages, 51612 bytes).
--- a/tfm.tex
+++ b/tfm.tex
@ -49,8 +49,8 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{TomsFastMath User Manual \\ v0.04}
-\author{Tom St Denis \\ tomstdenis@iahu.ca}
+\title{TomsFastMath User Manual \\ v0.05}
+\author{Tom St Denis \\ tomstdenis@gmail.com}
 \maketitle
 This text and library are all hereby placed in the public domain.  This book has been formatted for B5 
 [176x250] paper using the \LaTeX{} {\em book} macro package.
@ -101,14 +101,26 @@ fast multiplication and squaring and has the side effect of speeding up ECC oper
 TomsFastMath is public domain.

 \section{Building}
-Currently only a GCC makefile has been provided.  To build the library simply type
-``make''.  The library is a bit too new to put into production so no install
-scripts exist yet.  You can build the test program with ``make test''.
+To build the library simply type ``make''.  To install it in the typical Unix--like directories use
+``make install''.  Similarly, a shared library can be built with ``make -f makefile.shared install''.

-To perform simple static testing (useful to test out new assembly ports) use the stest
-program.  Type ``make stest'' and run it on your target.  The program will perform three
-multiplications, squarings and montgomery reductions.  Likely if your assembly 
-code is invalid this code will exhibit the bug.
+You can build the test program with ``make test''.  To perform simple static testing (useful to 
+test out new assembly ports) use the stest program.  Type ``make stest'' and run it on your 
+target.  The program will perform three multiplications, squarings and montgomery reductions.  
+If your assembly code is invalid, this program will most likely expose the bug.
+
+\subsection{Intel CC}
+In theory you should be able to build the library with
+
+\begin{verbatim}
+CFLAGS="-O3 -ip" CC=icc make IGNORE_SPEED=1
+\end{verbatim}
+
+However, Intel's inline assembler is far less advanced than GCC's.  As a result, the library doesn't compile.  
+Fortunately, it doesn't really matter.
+
+\subsection{MSVC}
+The library doesn't build with MSVC.  Imagine that.

 \subsection{Build Limitations}
 TomsFastMath has the following build requirements which are non--portable but under most 
--- a/tfm.toc
+++ b/tfm.toc
@ -2,32 +2,34 @@
 \contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}
 \contentsline {section}{\numberline {1.2}License}{2}{section.1.2}
 \contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}
-\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1}
-\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2}
+\contentsline {subsection}{\numberline {1.3.1}Intel CC}{2}{subsection.1.3.1}
+\contentsline {subsection}{\numberline {1.3.2}MSVC}{2}{subsection.1.3.2}
+\contentsline {subsection}{\numberline {1.3.3}Build Limitations}{3}{subsection.1.3.3}
+\contentsline {subsection}{\numberline {1.3.4}Optimization Configuration}{3}{subsection.1.3.4}
 \contentsline {subsubsection}{x86--32}{3}{section*.3}
 \contentsline {subsubsection}{SSE2}{3}{section*.4}
-\contentsline {subsubsection}{x86--64}{3}{section*.5}
-\contentsline {subsubsection}{ARM}{3}{section*.6}
-\contentsline {subsubsection}{PPC32}{3}{section*.7}
+\contentsline {subsubsection}{x86--64}{4}{section*.5}
+\contentsline {subsubsection}{ARM}{4}{section*.6}
+\contentsline {subsubsection}{PPC32}{4}{section*.7}
 \contentsline {subsubsection}{Future Releases}{4}{section*.8}
-\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3}
-\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2}
-\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1}
-\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2}
-\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1}
-\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2}
-\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3}
-\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3}
-\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1}
-\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2}
-\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3}
-\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4}
-\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5}
-\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6}
-\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7}
-\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8}
-\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4}
-\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1}
-\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2}
-\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3}
-\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4}
+\contentsline {subsection}{\numberline {1.3.5}Precision Configuration}{5}{subsection.1.3.5}
+\contentsline {chapter}{\numberline {2}Getting Started}{7}{chapter.2}
+\contentsline {section}{\numberline {2.1}Data Types}{7}{section.2.1}
+\contentsline {section}{\numberline {2.2}Initialization}{8}{section.2.2}
+\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{8}{subsection.2.2.1}
+\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{8}{subsection.2.2.2}
+\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{8}{subsection.2.2.3}
+\contentsline {chapter}{\numberline {3}Arithmetic Operations}{9}{chapter.3}
+\contentsline {section}{\numberline {3.1}Odds and Evens}{9}{section.3.1}
+\contentsline {section}{\numberline {3.2}Sign Manipulation}{9}{section.3.2}
+\contentsline {section}{\numberline {3.3}Comparisons}{10}{section.3.3}
+\contentsline {section}{\numberline {3.4}Shifting}{10}{section.3.4}
+\contentsline {section}{\numberline {3.5}Basic Algebra}{11}{section.3.5}
+\contentsline {section}{\numberline {3.6}Modular Exponentiation}{11}{section.3.6}
+\contentsline {section}{\numberline {3.7}Number Theoretic}{11}{section.3.7}
+\contentsline {section}{\numberline {3.8}Prime Numbers}{12}{section.3.8}
+\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{13}{chapter.4}
+\contentsline {section}{\numberline {4.1}Getting Started}{13}{section.4.1}
+\contentsline {section}{\numberline {4.2}Multiply with Comba}{13}{section.4.2}
+\contentsline {section}{\numberline {4.3}Squaring with Comba}{15}{section.4.3}
+\contentsline {section}{\numberline {4.4}Montgomery with Comba}{17}{section.4.4}