Merge branch 'release/0.13.0'

2015-10-24 14:56:46 +02:00 · 2015-10-24 14:56:46 +02:00 · 5c395e04e6
commit 5c395e04e6
parent da88c2d42f a170b64106
124 changed files with 16680 additions and 2783 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,27 @@
+*.[ao]
+*.aux
+*.dvi
+*.idx
+*.ilg
+*.ind
+*.lof
+*.log
+*.toc
+*.out
+*.l[ao]
+*.orig
+.project
+.cproject
+/.libs
+test_*.txt
+
+test
+test.exe
+mtest
+mtest.exe
+stest
+stest.exe
+rsatest
+rsatest.exe
+timing
+timing.exe
--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,32 @@
+language: c
+compiler:
+  - gcc
+script: CC="${MYCC}" make ${SHARED} test_standalone >test_gcc_1.txt 2>test_gcc_2.txt && ./test >test_std.txt 2>test_err.txt
+env:
+  - MYCC="gcc" SHARED=""
+  - MYCC="gcc -m32" SHARED=""
+  - MYCC="gcc-4.8" SHARED=""
+  - MYCC="gcc-4.8 -m32" SHARED=""
+  - MYCC="gcc-4.9" SHARED=""
+  - MYCC="gcc-4.9 -m32" SHARED=""
+  - MYCC="gcc" SHARED="-f makefile.shared"
+  - MYCC="gcc -m32" SHARED="-f makefile.shared"
+  - MYCC="gcc-4.8" SHARED="-f makefile.shared"
+  - MYCC="gcc-4.8 -m32" SHARED="-f makefile.shared"
+  - MYCC="gcc-4.9" SHARED="-f makefile.shared"
+  - MYCC="gcc-4.9 -m32" SHARED="-f makefile.shared"
+matrix:
+  fast_finish: true
+before_script:
+  - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+  - sudo apt-get -qq update
+  - sudo apt-get install gcc-4.9-multilib gcc-4.8-multilib gcc-multilib build-essential
+after_failure:
+  - cat test_gcc_1.txt
+  - cat test_std.txt
+  - cat test_err.txt
+after_script:
+  - cat test_gcc_2.txt
+notifications:
+  irc: "chat.freenode.net#libtom"
+
--- a/35
+++ b/35
@ -1,7 +1,36 @@
-TomsFastMath is public domain.
+TomsFastMath is licensed under DUAL licensing terms.
+
+Choose and use the license of your needs.
+
+[LICENSE #1]
+
+TomsFastMath is public domain.  As should all quality software be.
+
+Tom St Denis
+
+[/LICENSE #1]
+
+[LICENSE #2]
+
+            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+                    Version 2, December 2004
+
+ Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
+
+ Everyone is permitted to copy and distribute verbatim or modified
+ copies of this license document, and changing it is allowed as long
+ as the name is changed.
+
+            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. You just DO WHAT THE FUCK YOU WANT TO. 
+
+[/LICENSE #2]
+
+-- Mark Karpelès & Steffen Jaeckel

 Note some ideas were borrowed from LibTomMath and OpenSSL.  All of the code is original or ported
-from LibTomMath [no code was ported from OpenSSL].  As such the origins and status of this code
-are both public domain.
+from LibTomMath [no code was ported from OpenSSL].

 -- Tom St Denis
--- a/README.md
+++ b/README.md
@ -0,0 +1,11 @@
+tomsfastmath
+============
+
+See doc/tfm.pdf for detailed documentation.
+
+
+Project Status
+==============
+
+master: [![Build Status](https://travis-ci.org/libtom/tomsfastmath.svg?branch=master)](https://travis-ci.org/libtom/tomsfastmath)
+
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,15 @@
+October 24th, 2015
+v0.13.0
+     -- Add fp_rand()
+     -- Fix bug in fp_sub() reported by Martins Mozeiko
+     -- Fix bugs/apply patches in fp_mul() and fp_sqr() reported by rasky
+     -- Fix bugs in fp_read_radix()
+     -- Fix build issues for Linux x32 ABI
+     -- Sebastian Siewior provided fp_toradix_n(),
+        reported multiple issues on behalf of ClamAV
+        and did most of the testing work to be able to push this release out.
+     -- Fix a load of compiler warnings.
+
 March 14th, 2007
 0.12 -- Christophe Devine contributed MIPS asm w00t
     ++ quick release to get the MIPS code out there
--- a/demo/stest.c
+++ b/demo/stest.c
@ -8,8 +8,32 @@

 #ifndef DISPLAY
   #define DISPLAY(x) printf(x)
+   #define DISPLAY_P(...) printf(__VA_ARGS__)
+#else
+   #define DISPLAY_P(...) (void)0
+   #define fp_dump(n,p) do{}while(0)
 #endif

+#ifndef fp_dump
+void fp_dump(const char* n, fp_int* p)
+{
+  int sz;
+  if (fp_radix_size(p, 2, &sz) != FP_OKAY)
+    return;
+  char* str = malloc(sz);
+  if (!str)
+    return;
+#ifdef STEST_VERBOSE
+  fp_toradix(p, str, 2);
+  DISPLAY_P("%s = 0b%s\n", n, str);
+  fp_toradix(p, str, 16);
+  DISPLAY_P("%s = 0x%s\n", n, str);
+#endif
+  fp_toradix(p, str, 10);
+  DISPLAY_P("%s = %s\n", n, str);
+  free(str);
+}
+#endif

 #ifdef GBA_MODE
 int c_main(void)
@ -33,6 +57,8 @@ int main(void)
   modetxt_gotoxy(0,0);
 #endif

+   DISPLAY_P("TFM Ident string:\n%s\n\n", fp_ident());
+
   /* test multiplication */
   fp_read_radix(&a, "3453534534535345345341230891273", 10);
   fp_read_radix(&b, "2394873294871238934718923" , 10);
@ -40,7 +66,7 @@ int main(void)
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("mul passed\n");
   }
@ -52,7 +78,7 @@ int main(void)
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("mul passed\n");
   }
@ -64,7 +90,7 @@ int main(void)
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("mul passed\n");
   }
@ -75,7 +101,7 @@ int main(void)
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("sqr passed\n");
   }
@ -85,7 +111,7 @@ int main(void)
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("sqr passed\n");
   }
@ -95,7 +121,7 @@ int main(void)
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("sqr passed\n");
   }
@ -104,12 +130,19 @@ int main(void)
   /* montgomery reductions */
   fp_read_radix(&a, "234892374892374893489123428937892781237863278637826327367637836278362783627836783678363", 10);
   fp_read_radix(&b, "4447823492749823749234123489273987393983289319382762756425425425642727352327452374521", 10);
+#ifdef FP_64BIT
+   fp_read_radix(&c, "942974496560863503657226741422301598807235487941674147660989764036913926327577165648", 10);
+#else
   fp_read_radix(&c, "2396271882990732698083317035605836523697277786556053771759862552557086442129695099100", 10);
-   fp_montgomery_setup(&b, &dp);
+#endif
+   if (fp_montgomery_setup(&b, &dp) != FP_OKAY)
+      DISPLAY("mont setup failed\n");
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
-      return 0;
+      fp_dump("a (is    )", &a);
+      fp_dump("c (should)", &c);
+      return -1;
   } else {
      DISPLAY("mont passed\n");
   }
@ -117,11 +150,14 @@ int main(void)
   fp_read_radix(&a, "2348923748923748934891234456645654645645684576353428937892781237863278637826327367637836278362783627836783678363", 10);
   fp_read_radix(&b, "444782349274982374923412348927398739398328931938276275642542542564272735232745237452123424324324444121111119", 10);
   fp_read_radix(&c, "45642613844554582908652603086180267403823312390990082328515008314514368668691233331246183943400359349283420", 10);
-   fp_montgomery_setup(&b, &dp);
+   if (fp_montgomery_setup(&b, &dp) != FP_OKAY)
+      DISPLAY("mont setup failed\n");
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
-      return 0;
+      fp_dump("a (is    )", &a);
+      fp_dump("c (should)", &c);
+      return -1;
   } else {
      DISPLAY("mont passed\n");
   }
@ -129,11 +165,14 @@ int main(void)
   fp_read_radix(&a, "234823424242342923748923748934891234456645654645645684576353424972378234762378623891236834132352375235378462378489378927812378632786378263273676378362783627555555555539568389052478124618461834763837685723645827529034853490580134568947341278498542893481762349723907847892983627836783678363", 10);
   fp_read_radix(&b, "44478234927456563455982374923412348927398739398328931938276275642485623481638279025465891276312903262837562349056234783648712314678120389173890128905425242424239784256427", 10);
   fp_read_radix(&c, "33160865265453361650564031464519042126185632333462754084489985719613480783282357410514898819797738034600484519472656152351777186694609218202276509271061460265488348645081", 10);
-   fp_montgomery_setup(&b, &dp);
+   if (fp_montgomery_setup(&b, &dp) != FP_OKAY)
+      DISPLAY("mont setup failed\n");
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
-      return 0;
+      fp_dump("a (is    )", &a);
+      fp_dump("c (should)", &c);
+      return -1;
   } else {
      DISPLAY("mont passed\n");
   }
--- a/demo/test.c
+++ b/demo/test.c
@ -1,12 +1,23 @@
 /* TFM demo program */
 #include <tfm.h>
+#include <time.h>
+#include <unistd.h>
+
+
+#ifndef TFM_DEMO_TEST_VS_MTEST
+#define TFM_DEMO_TEST_VS_MTEST 1
+#endif

 void draw(fp_int *a)
 {
  int x;
  printf("%d, %d, ", a->used, a->sign);
  for (x = a->used - 1; x >= 0; x--) {
+#if SIZEOF_FP_DIGIT == 4
      printf("%08lx ", a->dp[x]);
+#else
+      printf("%016llx ", a->dp[x]);
+#endif
  }
  printf("\n");
 }
@ -14,71 +25,33 @@ void draw(fp_int *a)
 int myrng(unsigned char *dst, int len, void *dat)
 {
   int x;
+   (void)dat;
   for (x = 0; x < len; x++) dst[x] = rand() & 0xFF;
   return len;
 }

-/* RDTSC from Scott Duplichan */
-static ulong64 TIMFUNC (void)
-   {
-   #if defined __GNUC__
-      #if defined(INTEL_CC)
-	 ulong64 a;
-         asm ("rdtsc":"=A"(a));
-         return a;
-      #elif defined(__i386__) || defined(__x86_64__)
-         ulong64 a;
-         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
-         return a;
-      #elif defined(TFM_PPC32) 
-         unsigned long a, b;
-         __asm__ __volatile__ ("mftbu %1 \nmftb %0\n":"=r"(a), "=r"(b));
-         return (((ulong64)b) << 32ULL) | ((ulong64)a);
-      #elif defined(TFM_AVR32) 
-	 FILE *in;
-         char buf[20];
-	 in = fopen("/sys/devices/system/cpu/cpu0/pccycles", "r");
-	 fgets(buf, 20, in);
-	 fclose(in);
-	 return strtoul(buf, NULL, 10);
-      #else /* gcc-IA64 version */
-         unsigned long result;
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         while (__builtin_expect ((int) result == -1, 0))
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         return result;
-      #endif
-
-   // Microsoft and Intel Windows compilers
-   #elif defined _M_IX86
-     __asm rdtsc
-   #elif defined _M_AMD64
-     return __rdtsc ();
-   #elif defined _M_IA64
-     #if defined __INTEL_COMPILER
-       #include <ia64intrin.h>
-     #endif
-      return __getReg (3116);
-   #else
-     #error need rdtsc function for this build
-   #endif
-   }
-
   char cmd[4096], buf[4096];

 int main(void)
 {
  fp_int a,b,c,d,e,f;
+  unsigned long ix;
+#if TFM_DEMO_TEST_VS_MTEST
+  unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
+                 div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, cnt, rr;
+#else
  fp_digit fp;
  int n, err;
-   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
-                 div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
-   ulong64 t1, t2;
+#endif

  srand(time(NULL));
  printf("TFM Ident string:\n%s\n\n", fp_ident());
  fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
-  fp_zero(&a); draw(&a);
+  fp_zero(&a);
+
+#if TFM_DEMO_TEST_VS_MTEST == 0
+
+  draw(&a);

  /* test set and simple shifts */
  printf("Testing mul/div 2\n");
@ -134,6 +107,10 @@ int main(void)
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
+  fp_read_radix(&a, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF000000000000000000000001", 16); draw(&a);
+  fp_sub_d(&a, 3, &b); draw(&b);
+  fp_read_radix(&a, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFE", 16);
+  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);

  /* test mul_d */
  printf("Testing mul_d and div_d\n");
@ -150,7 +127,6 @@ int main(void)
  printf("Testing read_radix\n");
  fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);

-#if 0
  /* test mont */
  printf("Montgomery test #1\n");
  fp_set(&a, 0x1234567ULL);
@ -208,421 +184,10 @@ int main(void)
       }
   }
   printf("\n\n");
-#endif
-
-#ifdef TESTING
-goto testing;
-#endif
-
-#if 1
-
-t1 = TIMFUNC();
-sleep(1);
-printf("Ticks per second: %llu\n", TIMFUNC() - t1);
-
-goto multtime;
- /* do some timings... */
-  printf("Addition:\n");
-  for (t = 2; t <= FP_SIZE/2; t += 2) {
-      fp_zero(&a);
-      fp_zero(&b);
-      fp_zero(&c);
-      for (ix = 0; ix < t; ix++) {
-          a.dp[ix] = ix;
-          b.dp[ix] = ix;
-      }
-      a.used = t;
-      b.used = t;
-      t2 = -1;
-      for (ix = 0; ix < 25000; ++ix) {
-          t1 = TIMFUNC();
-          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
-          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
-          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
-          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
-          t2 = (TIMFUNC() - t1)>>3;
-          if (t1<t2) { --ix; t2 = t1; }
-      }
-      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
-  }
-multtime:
-  printf("Multiplication:\n");
-  for (t = 2; t < FP_SIZE/2; t += 2) {
-      fp_zero(&a);
-      fp_zero(&b);
-      fp_zero(&c);
-      for (ix = 0; ix < t; ix++) {
-          a.dp[ix] = ix;
-          b.dp[ix] = ix;
-      }
-      a.used = t;
-      b.used = t;
-      t2 = -1;
-      for (ix = 0; ix < 100; ++ix) {
-          t1 = TIMFUNC();
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
-          t2 = (TIMFUNC() - t1)>>7;
-          if (t1<t2) { --ix; t2 = t1; }
-      }
-      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
-  }
-//#else
-sqrtime:
-  printf("Squaring:\n");
-  for (t = 2; t < FP_SIZE/2; t += 2) {
-      fp_zero(&a);
-      fp_zero(&b);
-      for (ix = 0; ix < t; ix++) {
-          a.dp[ix] = ix;
-      }
-      a.used = t;
-      t2 = -1;
-      for (ix = 0; ix < 100; ++ix) {
-          t1 = TIMFUNC();
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          fp_sqr(&a, &b); fp_sqr(&a, &b);
-          t2 = (TIMFUNC() - t1)>>7;
-          if (t1<t2) { --ix; t2 = t1; }
-      }
-      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
-  }
-invmodtime:
-  printf("Invmod:\n");
-  for (t = 2; t < FP_SIZE/2; t += 2) {
-     fp_zero(&a);
-     for (ix = 0; ix < t; ix++) {
-         a.dp[ix] = ix | 1;
-     }
-     a.used = t;
-     fp_zero(&b);
-     for (ix = 0; ix < t; ix++) {
-         b.dp[ix] = rand();
-     }
-     b.used = t;
-     fp_clamp(&b);
-     fp_zero(&c);
-     t2 = -1;
-     for (ix = 0; ix < 100; ++ix) {
-          t1 = TIMFUNC();
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          fp_invmod(&b, &a, &c);
-          t2 = (TIMFUNC() - t1)>>6;
-          if (t1<t2) { --ix; t2 = t1; }
-      }
-      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
-  }
-//#else
-monttime:
-  printf("Montgomery:\n");
-  for (t = 2; t <= (FP_SIZE/2)-4; t += 2) {
-//      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
-      fp_zero(&a);
-      for (ix = 0; ix < t; ix++) {
-          a.dp[ix] = ix | 1;
-      }
-      a.used = t;
-
-     fp_montgomery_setup(&a, &fp);
-     fp_sub_d(&a, 3, &b);
-     fp_sqr(&b, &b);      
-     fp_copy(&b, &c);      
-     fp_copy(&b, &d);      
-
-     t2 = -1;
-     for (ix = 0; ix < 100; ++ix) {
-          t1 = TIMFUNC();
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          fp_montgomery_reduce(&c, &a, &fp);
-          fp_montgomery_reduce(&d, &a, &fp);
-          t2 = (TIMFUNC() - t1)>>6;
-          fp_copy(&b, &c);      
-          fp_copy(&b, &d);      
-          if (t1<t2) { --ix; t2 = t1; }
-      }
-      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
-  }
-//#else
-expttime:
-  printf("Exptmod:\n");
- 
-  for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += 256/DIGIT_BIT) {
-      fp_zero(&a);
-      fp_zero(&b);
-      fp_zero(&c);
-      for (ix = 0; ix < t; ix++) {
-          a.dp[ix] = ix+1;
-          b.dp[ix] = (fp_digit)rand() * (fp_digit)rand();
-          c.dp[ix] = ix;
-      }
-      a.used = t;
-      b.used = t;
-      c.used = t;
-
-     t2 = -1;
-     for (ix = 0; ix < 500; ++ix) {
-          t1 = TIMFUNC();
-          fp_exptmod(&c, &b, &a, &d);
-          fp_exptmod(&c, &b, &a, &d);
-          t2 = (TIMFUNC() - t1)>>1;
-          fp_copy(&b, &c);      
-          fp_copy(&b, &d);      
-          if (t1<t2) { t2 = t1; --ix; }
-     }
-     printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
-  }
-  return 0;
-#endif

 return 0;
-testing:
+
+#else

  fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); fp_zero(&a);

@ -643,7 +208,7 @@ testing:
          fp_mul_2d(&a, rr, &a);
          a.sign = b.sign;
          if (fp_cmp(&a, &b) != FP_EQ) {
-             printf("mul2d failed, rr == %lu\n",rr);
+             printf("\nmul2d failed, rr == %lu\n",rr);
             draw(&a);
             draw(&b);
             return 0;
@ -657,7 +222,7 @@ testing:
          a.sign = b.sign;
          if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; }
          if (fp_cmp(&a, &b) != FP_EQ) {
-             printf("div2d failed, rr == %lu\n",rr);
+             printf("\ndiv2d failed, rr == %lu\n",rr);
             draw(&a);
             draw(&b);
             return 0;
@ -669,7 +234,7 @@ testing:
          fp_copy(&a, &d);
          fp_add(&d, &b, &d);
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("add %lu failure!\n", add_n);
+             printf("\nadd %lu failure!\n", add_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
@ -681,7 +246,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
          fp_read_signed_bin(&d, (unsigned char *)cmd, rr);
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("fp_signed_bin failure!\n");
+             printf("\nfp_signed_bin failure!\n");
             draw(&c);
             draw(&d);
             return 0;
@ -692,7 +257,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
          fp_read_unsigned_bin(&d, (unsigned char *)cmd, rr);
          if (fp_cmp_mag(&c, &d) != FP_EQ) {
-             printf("fp_unsigned_bin failure!\n");
+             printf("\nfp_unsigned_bin failure!\n");
             draw(&c);
             draw(&d);
             return 0;
@ -705,98 +270,98 @@ draw(&a);draw(&b);draw(&c);draw(&d);
          fp_copy(&a, &d);
          fp_sub(&d, &b, &d);
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("sub %lu failure!\n", sub_n);
+             printf("\nsub %lu failure!\n", sub_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
-       } else if (!strcmp(cmd, "mul")) { 
+       } else if (!strcmp(cmd, "mul")) { ++mul_n;
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
          fp_copy(&a, &d);
-          fp_mul(&d, &b, &d); ++mul_n;
+          fp_mul(&d, &b, &d);
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("mul %lu failure!\n", mul_n);
+             printf("\nmul %lu failure!\n", mul_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
-       } else if (!strcmp(cmd, "div")) { 
+       } else if (!strcmp(cmd, "div")) { ++div_n;
          fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&c, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&d, buf, 64);
 // continue;
-          fp_div(&a, &b, &e, &f); ++div_n;
+          fp_div(&a, &b, &e, &f);
          if (fp_cmp(&c, &e) != FP_EQ || fp_cmp(&d, &f) != FP_EQ) {
-             printf("div %lu failure!\n", div_n);
+             printf("\ndiv %lu failure!\n", div_n);
 draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
             return 0;
          }

-       } else if (!strcmp(cmd, "sqr")) { 
+       } else if (!strcmp(cmd, "sqr")) { ++sqr_n;
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
 // continue;
          fp_copy(&a, &c);
-          fp_sqr(&c, &c); ++sqr_n;
+          fp_sqr(&c, &c);
          if (fp_cmp(&b, &c) != FP_EQ) {
-             printf("sqr %lu failure!\n", sqr_n);
+             printf("\nsqr %lu failure!\n", sqr_n);
 draw(&a);draw(&b);draw(&c);
             return 0;
          }
-       } else if (!strcmp(cmd, "gcd")) { 
+       } else if (!strcmp(cmd, "gcd")) { ++gcd_n;
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 // continue;
          fp_copy(&a, &d);
-          fp_gcd(&d, &b, &d); ++gcd_n;
+          fp_gcd(&d, &b, &d);
          d.sign = c.sign;
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("gcd %lu failure!\n", gcd_n);
+             printf("\ngcd %lu failure!\n", gcd_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
-       } else if (!strcmp(cmd, "lcm")) { 
+       } else if (!strcmp(cmd, "lcm")) { ++lcm_n;
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
             fp_copy(&a, &d);
-             fp_lcm(&d, &b, &d); ++lcm_n;
+             fp_lcm(&d, &b, &d);
             d.sign = c.sign;
             if (fp_cmp(&c, &d) != FP_EQ) {
-                printf("lcm %lu failure!\n", lcm_n);
+                printf("\nlcm %lu failure!\n", lcm_n);
   draw(&a);draw(&b);draw(&c);draw(&d);
                return 0;
             }
-       } else if (!strcmp(cmd, "expt")) {  
+       } else if (!strcmp(cmd, "expt")) { ++expt_n;
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&d, buf, 64);
 // continue;
             fp_copy(&a, &e);
-             fp_exptmod(&e, &b, &c, &e); ++expt_n;
+             fp_exptmod(&e, &b, &c, &e);
             if (fp_cmp(&d, &e) != FP_EQ) {
-                printf("expt %lu failure!\n", expt_n);
+                printf("\nexpt %lu failure!\n", expt_n);
   draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);
                return 0;
             }
-       } else if (!strcmp(cmd, "invmod")) {  
+       } else if (!strcmp(cmd, "invmod")) { ++inv_n;
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
             fp_invmod(&a, &b, &d);
 #if 1
-             fp_mulmod(&d,&a,&b,&e); ++inv_n;
+             fp_mulmod(&d,&a,&b,&e);
             if (fp_cmp_d(&e, 1) != FP_EQ) {
 #else
             if (fp_cmp(&d, &c) != FP_EQ) {
 #endif
-                printf("inv [wrong value from MPI?!] failure\n");
+                printf("\ninv [wrong value from MPI?!] failure\n");
                draw(&a);draw(&b);draw(&c);draw(&d);
                return 0;
             }
@ -806,7 +371,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fp_div_2(&a, &c);
             if (fp_cmp(&c, &b) != FP_EQ) {
-                 printf("div_2 %lu failure\n", div2_n);
+                 printf("\ndiv_2 %lu failure\n", div2_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -817,7 +382,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fp_mul_2(&a, &c);
             if (fp_cmp(&c, &b) != FP_EQ) {
-                 printf("mul_2 %lu failure\n", mul2_n);
+                 printf("\nmul_2 %lu failure\n", mul2_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -829,7 +394,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_add_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
-                 printf("add_d %lu failure\n", add_d_n);
+                 printf("\nadd_d %lu failure\n", add_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -842,7 +407,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_sub_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
-                 printf("sub_d %lu failure\n", sub_d_n);
+                 printf("\nsub_d %lu failure\n", sub_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -855,7 +420,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_mul_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
-                 printf("mul_d %lu failure\n", sub_d_n);
+                 printf("\nmul_d %lu failure\n", mul_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -865,6 +430,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
       }

   }
+#endif
 }


--- a/demo/timing.c
+++ b/demo/timing.c
@ -0,0 +1,625 @@
+/* TFM timing analysis */
+#include <tfm.h>
+#include <time.h>
+#include <unistd.h>
+
+/* RDTSC from Scott Duplichan
+ *
+ * Returns the current reading of a free-running tick counter.  The source is
+ * picked at compile time: rdtsc on x86 (GCC, Intel, MSVC), the mftbu/mftb
+ * pair on TFM_PPC32, the pccycles sysfs file on TFM_AVR32, and ar.itc as the
+ * GCC fallback.  A non-GCC toolchain that matches none of the _M_* tests is
+ * rejected with #error.
+ */
+static ulong64 TIMFUNC(void)
+{
+#if defined __GNUC__
+   #if defined(INTEL_CC)
+ ulong64 a;
+      asm ("rdtsc":"=A"(a));
+      return a;
+   #elif defined(__i386__) || defined(__x86_64__)
+      /* version from http://www.mcs.anl.gov/~kazutomo/rdtsc.html
+       * the old code always got a warning issued by gcc, clang did not complain...
+       */
+      unsigned hi, lo;
+      __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+      return ((ulong64)lo)|( ((ulong64)hi)<<32);   /* combine the two 32-bit halves */
+   #elif defined(TFM_PPC32)
+      unsigned long a, b;   /* a: result of mftb (low word), b: result of mftbu (high word) */
+      __asm__ __volatile__ ("mftbu %1 \nmftb %0\n":"=r"(a), "=r"(b));
+      return (((ulong64)b) << 32ULL) | ((ulong64)a);
+   #elif defined(TFM_AVR32)
+ FILE *in;
+      char buf[20];
+ in = fopen("/sys/devices/system/cpu/cpu0/pccycles", "r");   /* NOTE(review): result is not checked before use */
+ fgets(buf, 20, in);
+ fclose(in);
+ return strtoul(buf, NULL, 10);   /* counter is read as decimal text */
+   #else /* gcc-IA64 version */
+      unsigned long result;
+      __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+      /* re-read while the value truncated to int equals -1; presumably an erratum workaround - TODO confirm */
+      while (__builtin_expect ((int) result == -1, 0))
+      __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+      return result;
+   #endif
+
+// Microsoft and Intel Windows compilers
+#elif defined _M_IX86
+  __asm rdtsc   /* no return statement: presumably relies on the result being left in EDX:EAX - TODO confirm */
+#elif defined _M_AMD64
+  return __rdtsc ();
+#elif defined _M_IA64
+  #if defined __INTEL_COMPILER
+    #include <ia64intrin.h>
+  #endif
+   return __getReg (3116);   /* 3116 is presumably the register index of ar.itc - TODO confirm */
+#else
+  #error need rdtsc function for this build
+#endif
+}
+
+static ulong64 ticks;       /* TIMFUNC() delta measured across sleep(1) in main(); first column of every row */
+static const char* p_str;   /* label stored by print_start() and printed by print_line() */
+
+static void print_start(const char* s)   /* remember s as the label for subsequent print_line() rows; the pointer is stored, not copied */
+{
+   p_str = s;
+}
+
+/* Print one semicolon-separated result row on stdout:
+ *   <ticks>;<label set by print_start()>;<b>;<t>
+ * main() passes the operand size in bits as b and the measured tick count
+ * as t.  ulong64 is a project typedef, so every value is cast to the exact
+ * type that the %llu conversion requires.
+ */
+static void print_line(ulong64 b, ulong64 t)
+{
+   printf("%llu;%s;%llu;%llu\n",
+          (unsigned long long) ticks, p_str,
+          (unsigned long long) b, (unsigned long long) t);
+}
+
+/*
+ * Timing driver: calibrates the tick counter against sleep(1), then prints
+ * one "ticks;algorithm;bits;time" row per operand size for fp_add, fp_mul,
+ * fp_sqr, fp_invmod, fp_montgomery_reduce and fp_exptmod.  Each sample times
+ * an unrolled batch of calls and divides by the batch size with a shift; the
+ * smallest sample seen for a given size is the one reported.
+ * Always returns 0.
+ */
+int main(void)
+{
+   fp_int a,b,c,d;
+   ulong64 t1, t2;        /* t1: start stamp, then the per-call sample; t2: smallest sample so far */
+   fp_digit fp;           /* value filled in by fp_montgomery_setup() */
+   unsigned long t, ix;   /* t: operand size in digits; ix: digit index / sample counter */
+
+   /* calibration: how many ticks elapse during a one-second sleep */
+   t1 = TIMFUNC();
+   sleep(1);
+   ticks = TIMFUNC() - t1;
+   fprintf(stderr, "Ticks per second: %llu\n", (unsigned long long) ticks);
+
+   printf("Ticks/sec;Algorithm;bits;time\n");
+   /* do some timings... */
+   print_start("Addition");
+   for (t = 2; t <= FP_SIZE / 2; t += 2) {
+      fp_zero(&a);
+      fp_zero(&b);
+      fp_zero(&c);
+      for (ix = 0; ix < t; ix++) {
+         a.dp[ix] = ix;
+         b.dp[ix] = ix;
+      }
+      a.used = t;
+      b.used = t;
+      t2 = -1;   /* all-ones start value so the first sample is always accepted */
+      for (ix = 0; ix < 25000; ++ix) {
+         t1 = TIMFUNC();
+         fp_add(&a, &b, &c);
+         fp_add(&a, &b, &c);
+         fp_add(&a, &b, &c);
+         fp_add(&a, &b, &c);
+         fp_add(&a, &b, &c);
+         fp_add(&a, &b, &c);
+         fp_add(&a, &b, &c);
+         fp_add(&a, &b, &c);
+         t1 = (TIMFUNC() - t1) >> 3;   /* 8 calls per sample */
+         if (t1 < t2) {
+            /* new minimum: keep it and do not count this pass, so the loop
+             * ends only after 25000 passes that failed to improve on t2.
+             * (--ix at ix == 0 wraps and the ++ix that follows restores 0.) */
+            --ix;
+            t2 = t1;
+         }
+      }
+      print_line(t * DIGIT_BIT, t2);
+   }
+   print_start("Multiplication");
+   for (t = 2; t < FP_SIZE / 2; t += 2) {
+      fp_zero(&a);
+      fp_zero(&b);
+      fp_zero(&c);
+      for (ix = 0; ix < t; ix++) {
+         a.dp[ix] = ix;
+         b.dp[ix] = ix;
+      }
+      a.used = t;
+      b.used = t;
+      t2 = -1;
+      for (ix = 0; ix < 100; ++ix) {
+         t1 = TIMFUNC();
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);   /* 32 */
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);   /* 64 */
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);   /* 96 */
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);
+         fp_mul(&a, &b, &c);   /* 128 */
+         t1 = (TIMFUNC() - t1) >> 7;   /* 128 calls per sample */
+         if (t1 < t2) {
+            /* new minimum: keep it and repeat this pass */
+            --ix;
+            t2 = t1;
+         }
+      }
+      print_line(t * DIGIT_BIT, t2);
+   }
+
+   print_start("Squaring");
+   for (t = 2; t < FP_SIZE / 2; t += 2) {
+      fp_zero(&a);
+      fp_zero(&b);
+      for (ix = 0; ix < t; ix++) {
+         a.dp[ix] = ix;
+      }
+      a.used = t;
+      t2 = -1;
+      for (ix = 0; ix < 100; ++ix) {
+         t1 = TIMFUNC();
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);   /* 32 */
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);   /* 64 */
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);   /* 96 */
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);
+         fp_sqr(&a, &b);   /* 128 */
+         t1 = (TIMFUNC() - t1) >> 7;   /* 128 calls per sample */
+         if (t1 < t2) {
+            /* new minimum: keep it and repeat this pass */
+            --ix;
+            t2 = t1;
+         }
+      }
+      print_line(t * DIGIT_BIT, t2);
+   }
+
+   print_start("Invmod");
+   for (t = 2; t < FP_SIZE / 2; t += 2) {
+      fp_zero(&a);
+      for (ix = 0; ix < t; ix++) {
+         a.dp[ix] = ix | 1;   /* every digit of the modulus is forced odd */
+      }
+      a.used = t;
+      fp_zero(&b);
+      for (ix = 0; ix < t; ix++) {
+         b.dp[ix] = rand();
+      }
+      b.used = t;
+      fp_clamp(&b);
+      fp_zero(&c);
+      t2 = -1;
+      for (ix = 0; ix < 100; ++ix) {
+         t1 = TIMFUNC();
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);   /* 32 */
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);
+         fp_invmod(&b, &a, &c);   /* 64 */
+         t1 = (TIMFUNC() - t1) >> 6;   /* 64 calls per sample */
+         if (t1 < t2) {
+            /* new minimum: keep it and repeat this pass */
+            --ix;
+            t2 = t1;
+         }
+      }
+      print_line(t * DIGIT_BIT, t2);
+   }
+
+   print_start("Montgomery");
+   for (t = 2; t <= (FP_SIZE / 2) - 4; t += 2) {
+      /* modulus a: every digit is forced odd by ix | 1 */
+      fp_zero(&a);
+      for (ix = 0; ix < t; ix++) {
+         a.dp[ix] = ix | 1;
+      }
+      a.used = t;
+
+      /* b = (a - 3)^2 is the value to reduce; c and d are working copies
+       * that are restored from b after every sample */
+      fp_montgomery_setup(&a, &fp);
+      fp_sub_d(&a, 3, &b);
+      fp_sqr(&b, &b);
+      fp_copy(&b, &c);
+      fp_copy(&b, &d);
+
+      t2 = -1;
+      for (ix = 0; ix < 100; ++ix) {
+         t1 = TIMFUNC();
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);   /* 32 */
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);
+         fp_montgomery_reduce(&c, &a, fp);
+         fp_montgomery_reduce(&d, &a, fp);   /* 64 */
+         t1 = (TIMFUNC() - t1) >> 6;   /* 64 calls per sample */
+         fp_copy(&b, &c);
+         fp_copy(&b, &d);
+         if (t1 < t2) {
+            /* new minimum: keep it and repeat this pass */
+            --ix;
+            t2 = t1;
+         }
+      }
+      print_line(t * DIGIT_BIT, t2);
+   }
+
+   print_start("Exptmod");
+
+   /* operand sizes start at 512 bits and grow in 256-bit steps */
+   for (t = 512 / DIGIT_BIT; t <= (FP_SIZE / 2) - 2; t += 256 / DIGIT_BIT) {
+      fp_zero(&a);
+      fp_zero(&b);
+      fp_zero(&c);
+      for (ix = 0; ix < t; ix++) {
+         a.dp[ix] = ix + 1;
+         b.dp[ix] = (fp_digit) rand() * (fp_digit) rand();
+         c.dp[ix] = ix;
+      }
+      a.used = t;
+      b.used = t;
+      c.used = t;
+
+      t2 = -1;
+      for (ix = 0; ix < 500; ++ix) {
+         t1 = TIMFUNC();
+         fp_exptmod(&c, &b, &a, &d);
+         fp_exptmod(&c, &b, &a, &d);
+         t1 = (TIMFUNC() - t1) >> 1;   /* 2 calls per sample */
+         /* NOTE(review): from the second sample on, the base c is a copy of
+          * the exponent b; these copies mirror the Montgomery loop - confirm
+          * that this is intended */
+         fp_copy(&b, &c);
+         fp_copy(&b, &d);
+         if (t1 < t2) {
+            /* new minimum: keep it and repeat this pass */
+            t2 = t1;
+            --ix;
+         }
+      }
+      print_line(t * DIGIT_BIT, t2);
+   }
+   return 0;
+}
--- a/doc/tfm.pdf
+++ b/doc/tfm.pdf
--- a/gen.pl
+++ b/gen.pl
@ -6,7 +6,7 @@
 use strict;

 open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
-foreach my $filename (glob "*fp_*.c") {
+foreach my $filename (glob "src/*/*fp_*.c") {
   next if ($filename eq "fp_sqr_comba_generic.c");
   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
   print OUT "/* Start: $filename */\n";
--- a/libtfm.symbols
+++ b/libtfm.symbols
@ -0,0 +1,49 @@
+fp_2expt
+fp_add
+fp_add_d
+fp_addmod
+fp_cmp
+fp_cmp_d
+fp_cmp_mag
+fp_cnt_lsb
+fp_count_bits
+fp_div
+fp_div_2
+fp_div_2d
+fp_div_d
+fp_exptmod
+fp_gcd
+fp_ident
+fp_invmod
+fp_isprime
+fp_lcm
+fp_lshd
+fp_mod
+fp_mod_2d
+fp_mod_d
+fp_montgomery_calc_normalization
+fp_montgomery_reduce
+fp_montgomery_setup
+fp_mul
+fp_mul_2
+fp_mul_2d
+fp_mul_d
+fp_mulmod
+fp_prime_random_ex
+fp_radix_size
+fp_read_radix
+fp_read_signed_bin
+fp_read_unsigned_bin
+fp_rshd
+fp_set
+fp_signed_bin_size
+fp_sqr
+fp_sqrmod
+fp_sub
+fp_sub_d
+fp_submod
+fp_to_signed_bin
+fp_to_unsigned_bin
+fp_toradix
+fp_toradix_n
+fp_unsigned_bin_size
--- a/156
+++ b/156
@ -1,10 +1,22 @@
 #makefile for TomsFastMath
 #
 #
-VERSION=0.12
+VERSION=0.13

 CFLAGS += -Wall -W -Wshadow -Isrc/headers

+# Compiler and Linker Names
+ifndef PREFIX
+  PREFIX=
+endif
+
+ifeq ($(CC),cc)
+  CC = $(PREFIX)gcc
+endif
+LD=$(PREFIX)ld
+AR=$(PREFIX)ar
+RANLIB=$(PREFIX)ranlib
+
 ifndef MAKE
   MAKE=make
 endif
@ -27,27 +39,29 @@ OBJECTS=src/addsub/fp_add.o src/addsub/fp_add_d.o src/addsub/fp_addmod.o src/add
 src/addsub/fp_cmp_d.o src/addsub/fp_cmp_mag.o src/addsub/fp_sub.o src/addsub/fp_sub_d.o \
 src/addsub/fp_submod.o src/addsub/s_fp_add.o src/addsub/s_fp_sub.o src/bin/fp_radix_size.o \
 src/bin/fp_read_radix.o src/bin/fp_read_signed_bin.o src/bin/fp_read_unsigned_bin.o \
-src/bin/fp_reverse.o src/bin/fp_s_rmap.o src/bin/fp_signed_bin_size.o src/bin/fp_to_signed_bin.o \
-src/bin/fp_to_unsigned_bin.o src/bin/fp_toradix.o src/bin/fp_unsigned_bin_size.o src/bit/fp_cnt_lsb.o \
-src/bit/fp_count_bits.o src/bit/fp_div_2.o src/bit/fp_div_2d.o src/bit/fp_lshd.o src/bit/fp_mod_2d.o \
-src/bit/fp_rshd.o src/divide/fp_div.o src/divide/fp_div_d.o src/divide/fp_mod.o src/divide/fp_mod_d.o \
-src/exptmod/fp_2expt.o src/exptmod/fp_exptmod.o src/misc/fp_ident.o src/misc/fp_set.o \
+src/bin/fp_reverse.o src/bin/fp_signed_bin_size.o src/bin/fp_s_rmap.o src/bin/fp_toradix.o \
+src/bin/fp_toradix_n.o src/bin/fp_to_signed_bin.o src/bin/fp_to_unsigned_bin.o \
+src/bin/fp_unsigned_bin_size.o src/bit/fp_cnt_lsb.o src/bit/fp_count_bits.o src/bit/fp_div_2.o \
+src/bit/fp_div_2d.o src/bit/fp_lshd.o src/bit/fp_mod_2d.o src/bit/fp_rshd.o src/divide/fp_div.o \
+src/divide/fp_div_d.o src/divide/fp_mod.o src/divide/fp_mod_d.o src/exptmod/fp_2expt.o \
+src/exptmod/fp_exptmod.o src/misc/fp_ident.o src/misc/fp_rand.o src/misc/fp_set.o \
 src/mont/fp_montgomery_calc_normalization.o src/mont/fp_montgomery_reduce.o \
-src/mont/fp_montgomery_setup.o src/mul/fp_mul.o src/mul/fp_mul_2.o src/mul/fp_mul_2d.o \
-src/mul/fp_mul_comba.o src/mul/fp_mul_comba_12.o src/mul/fp_mul_comba_17.o src/mul/fp_mul_comba_20.o \
-src/mul/fp_mul_comba_24.o src/mul/fp_mul_comba_28.o src/mul/fp_mul_comba_3.o src/mul/fp_mul_comba_32.o \
-src/mul/fp_mul_comba_4.o src/mul/fp_mul_comba_48.o src/mul/fp_mul_comba_6.o src/mul/fp_mul_comba_64.o \
-src/mul/fp_mul_comba_7.o src/mul/fp_mul_comba_8.o src/mul/fp_mul_comba_9.o \
+src/mont/fp_montgomery_setup.o src/mul/fp_mul_2.o src/mul/fp_mul_2d.o src/mul/fp_mul.o \
+src/mul/fp_mul_comba_12.o src/mul/fp_mul_comba_17.o src/mul/fp_mul_comba_20.o src/mul/fp_mul_comba_24.o \
+src/mul/fp_mul_comba_28.o src/mul/fp_mul_comba_32.o src/mul/fp_mul_comba_3.o src/mul/fp_mul_comba_48.o \
+src/mul/fp_mul_comba_4.o src/mul/fp_mul_comba_64.o src/mul/fp_mul_comba_6.o src/mul/fp_mul_comba_7.o \
+src/mul/fp_mul_comba_8.o src/mul/fp_mul_comba_9.o src/mul/fp_mul_comba.o \
 src/mul/fp_mul_comba_small_set.o src/mul/fp_mul_d.o src/mul/fp_mulmod.o src/numtheory/fp_gcd.o \
-src/numtheory/fp_invmod.o src/numtheory/fp_isprime.o src/numtheory/fp_lcm.o \
-src/numtheory/fp_prime_miller_rabin.o src/numtheory/fp_prime_random_ex.o src/sqr/fp_sqr.o \
-src/sqr/fp_sqr_comba.o src/sqr/fp_sqr_comba_12.o src/sqr/fp_sqr_comba_17.o src/sqr/fp_sqr_comba_20.o \
-src/sqr/fp_sqr_comba_24.o src/sqr/fp_sqr_comba_28.o src/sqr/fp_sqr_comba_3.o src/sqr/fp_sqr_comba_32.o \
-src/sqr/fp_sqr_comba_4.o src/sqr/fp_sqr_comba_48.o src/sqr/fp_sqr_comba_6.o src/sqr/fp_sqr_comba_64.o \
-src/sqr/fp_sqr_comba_7.o src/sqr/fp_sqr_comba_8.o src/sqr/fp_sqr_comba_9.o \
+src/numtheory/fp_invmod.o src/numtheory/fp_isprime.o src/numtheory/fp_isprime_ex.o \
+src/numtheory/fp_lcm.o src/numtheory/fp_prime_miller_rabin.o src/numtheory/fp_prime_random_ex.o \
+src/sqr/fp_sqr.o src/sqr/fp_sqr_comba_12.o src/sqr/fp_sqr_comba_17.o src/sqr/fp_sqr_comba_20.o \
+src/sqr/fp_sqr_comba_24.o src/sqr/fp_sqr_comba_28.o src/sqr/fp_sqr_comba_32.o src/sqr/fp_sqr_comba_3.o \
+src/sqr/fp_sqr_comba_48.o src/sqr/fp_sqr_comba_4.o src/sqr/fp_sqr_comba_64.o src/sqr/fp_sqr_comba_6.o \
+src/sqr/fp_sqr_comba_7.o src/sqr/fp_sqr_comba_8.o src/sqr/fp_sqr_comba_9.o src/sqr/fp_sqr_comba.o \
 src/sqr/fp_sqr_comba_generic.o src/sqr/fp_sqr_comba_small_set.o src/sqr/fp_sqrmod.o

-HEADERS=src/headers/tfm.h 
+HEADERS_PUB:=src/headers/tfm.h
+HEADERS=src/headers/tfm_private.h $(HEADERS_PUB)

 #END_INS

@ -77,32 +91,44 @@ endif

 default: $(LIBNAME)

+$(OBJECTS): $(HEADERS)
+
 $(LIBNAME): $(OBJECTS)
 	$(AR) $(ARFLAGS) $@ $(OBJECTS)
-	ranlib $@
+	$(RANLIB) $@

 install: $(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
 	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
-	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS_PUB) $(DESTDIR)$(INCPATH)

-mtest/mtest: mtest/mtest.o
-	cd mtest ; CFLAGS="$(CFLAGS) -I../" MAKE=${MAKE} ${MAKE} mtest
+.PHONY: mtest
+mtest: $(LIBNAME)
+	cd mtest; CC="$(CC)" CFLAGS="$(CFLAGS) -I../" MAKE=${MAKE} ${MAKE} mtest

-test: $(LIBNAME) demo/test.o mtest/mtest
+demo/test.o: CFLAGS+=-Wno-unused-result
+
+.PHONY: test
+test: $(LIBNAME) demo/test.o
 	$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test

-timing: $(LIBNAME) demo/test.o
+test_standalone: CFLAGS+=-DTFM_DEMO_TEST_VS_MTEST=0
+
+.PHONY: test_standalone
+test_standalone: $(LIBNAME) demo/test.o
 	$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test

+timing: $(LIBNAME) demo/timing.o
+	$(CC) $(CFLAGS) demo/timing.o $(LIBNAME) $(PROF) -o timing
+
 profiled:
-	CFLAGS="${CFLAGS} -fprofile-generate" MAKE=${MAKE} ${MAKE} timing
+	CC="$(CC)" PREFIX="${PREFIX} CFLAGS="${CFLAGS} -fprofile-generate" MAKE=${MAKE} ${MAKE} timing
 	./test
-	rm -f `find . -type f | grep "[.]o" | xargs`
-	rm -f `find . -type f | grep "[.]a" | xargs`
+	rm -f `find . -type f -name "*.o" | xargs`
+	rm -f `find . -type f -name "*.a" | xargs`
 	rm -f test
-	CFLAGS="${CFLAGS} -fprofile-use" MAKE=${MAKE} ${MAKE} timing
+	CC=$(CC) PREFIX="${PREFIX} CFLAGS="${CFLAGS} -fprofile-use" MAKE=${MAKE} ${MAKE} timing
 	
 stest: $(LIBNAME) demo/stest.o
 	$(CC) $(CFLAGS) demo/stest.o $(LIBNAME) -o stest
@ -111,6 +137,15 @@ rsatest: $(LIBNAME) demo/rsa.o
 	$(CC) $(CFLAGS) demo/rsa.o $(LIBNAME) -o rsatest

 docdvi: tfm.tex
+	cp tfm.tex tfm.bak
+	touch --reference=tfm.tex tfm.bak
+	(printf "%s" "\def\fixedpdfdate{"; date +'D:%Y%m%d%H%M%S%:z' -d @$$(stat --format=%Y tfm.tex) | sed "s/:\([0-9][0-9]\)$$/'\1'}/g") > tfm-deterministic.tex
+	printf "%s\n" "\pdfinfo{" >> tfm-deterministic.tex
+	printf "%s\n" "  /CreationDate (\fixedpdfdate)" >> tfm-deterministic.tex
+	printf "%s\n}\n" "  /ModDate (\fixedpdfdate)" >> tfm-deterministic.tex
+	cat tfm.tex >> tfm-deterministic.tex
+	mv tfm-deterministic.tex tfm.tex
+	touch --reference=tfm.bak tfm.tex
 	touch tfm.ind
 	latex tfm >/dev/null
 	latex tfm >/dev/null
@ -119,41 +154,48 @@ docdvi: tfm.tex

 docs: docdvi
 	latex tfm >/dev/null
-	dvipdf tfm
+	pdflatex tfm >/dev/null
+	sed -b -i 's,^/ID \[.*\]$$,/ID [<0> <0>],g' tfm.pdf
+	mv tfm.bak tfm.tex
 	mv -f tfm.pdf doc

 #This rule cleans the source tree of all compiled code, not including the pdf
 #documentation.
 clean:
-	rm -f `find . -type f | grep "[.]o" | xargs`
-	rm -f `find . -type f | grep "[.]lo"  | xargs`
-	rm -f `find . -type f | grep "[.]a" | xargs`
-	rm -f `find . -type f | grep "[.]la"  | xargs`
-	rm -f `find . -type f | grep "[.]obj" | xargs`
-	rm -f `find . -type f | grep "[.]lib" | xargs`
-	rm -f `find . -type f | grep "[.]exe" | xargs`
-	rm -f `find . -type f | grep "[.]gcda" | xargs`
-	rm -f `find . -type f | grep "[.]gcno" | xargs`
-	rm -f `find . -type f | grep "[.]il" | xargs`
-	rm -f `find . -type f | grep "[.]dyn" | xargs`
-	rm -f `find . -type f | grep "[.]dpi" | xargs`
-	rm -rf `find . -type d | grep "[.]libs" | xargs`
-	rm -f tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc test mtest/mtest
+	rm -f `find . -type f -name "*.o" | xargs`
+	rm -f `find . -type f -name "*.lo"  | xargs`
+	rm -f `find . -type f -name "*.a" | xargs`
+	rm -f `find . -type f -name "*.la"  | xargs`
+	rm -f `find . -type f -name "*.obj" | xargs`
+	rm -f `find . -type f -name "*.lib" | xargs`
+	rm -f `find . -type f -name "*.exe" | xargs`
+	rm -f `find . -type f -name "*.gcov" | xargs`
+	rm -f `find . -type f -name "*.gcda" | xargs`
+	rm -f `find . -type f -name "*.gcno" | xargs`
+	rm -f `find . -type f -name "*.il" | xargs`
+	rm -f `find . -type f -name "*.dyn" | xargs`
+	rm -f `find . -type f -name "*.dpi" | xargs`
+	rm -rf `find . -type d -name "*.libs" | xargs`
+	rm -f tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.out  tfm.toc  test  test.exe
 	cd mtest; MAKE=${MAKE} ${MAKE} clean

-no_oops: clean
-	cd .. ; cvs commit
-	echo Scanning for scratch/dirty files
-	find . -type f | grep -v CVS | xargs -n 1 bash mess.sh
+.PHONY: pre_gen
+pre_gen:
+	perl gen.pl
+	sed -e 's/[[:blank:]]*$$//' mpi.c > pre_gen/mpi.c
+	rm mpi.c

-zipup: no_oops docs clean
-	perl gen.pl ; mv mpi.c pre_gen/ ; \
-	cd .. ; rm -rf tfm* tomsfastmath-$(VERSION) ; mkdir tomsfastmath-$(VERSION) ; \
-	cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \
-	tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 ; \
-	zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/* ; \
-	mv -f tfm* ~ ; rm -rf tomsfastmath-$(VERSION)
+zipup:
+	rm -rf ../tomsfastmath-$(VERSION) && rm -f ../tfm-$(VERSION).zip ../tfm-$(VERSION).tar.bz2 && \
+	expsrc.sh -i . -o ../tomsfastmath-$(VERSION) --svntags --no-fetch -p '*.c' -p '*.h' && \
+	MAKE=${MAKE} ${MAKE} -C ../tomsfastmath-$(VERSION) docs && \
+	tar -c ../tomsfastmath-$(VERSION)/* | bzip2 -9vvc > ../tfm-$(VERSION).tar.bz2 && \
+	zip -9 -r ../tfm-$(VERSION).zip ../tomsfastmath-$(VERSION)/* && \
+	gpg -b -a ../tfm-$(VERSION).tar.bz2 && gpg -b -a ../tfm-$(VERSION).zip

-# $Source: /cvs/libtom/tomsfastmath/makefile,v $ 
-# $Revision: 1.38 $ 
-# $Date: 2007/03/13 01:23:03 $ 
+new_file:
+	bash updatemakes.sh
+
+# $Source$
+# $Revision$
+# $Date$
--- a/makefile.shared
+++ b/makefile.shared
@ -1,9 +1,10 @@
 #makefile for TomsFastMath
 #
 #
-VERSION=0:12
+VERSION=1:0:0

-CC=libtool --mode=compile --tag=CC gcc
+LT  ?= libtool
+LTCOMPILE = $(LT) --mode=compile --tag=CC $(CC)

 CFLAGS += -Wall -W -Wshadow -Isrc/headers

@ -25,24 +26,25 @@ OBJECTS=src/addsub/fp_add.o src/addsub/fp_add_d.o src/addsub/fp_addmod.o src/add
 src/addsub/fp_cmp_d.o src/addsub/fp_cmp_mag.o src/addsub/fp_sub.o src/addsub/fp_sub_d.o \
 src/addsub/fp_submod.o src/addsub/s_fp_add.o src/addsub/s_fp_sub.o src/bin/fp_radix_size.o \
 src/bin/fp_read_radix.o src/bin/fp_read_signed_bin.o src/bin/fp_read_unsigned_bin.o \
-src/bin/fp_reverse.o src/bin/fp_s_rmap.o src/bin/fp_signed_bin_size.o src/bin/fp_to_signed_bin.o \
-src/bin/fp_to_unsigned_bin.o src/bin/fp_toradix.o src/bin/fp_unsigned_bin_size.o src/bit/fp_cnt_lsb.o \
-src/bit/fp_count_bits.o src/bit/fp_div_2.o src/bit/fp_div_2d.o src/bit/fp_lshd.o src/bit/fp_mod_2d.o \
-src/bit/fp_rshd.o src/divide/fp_div.o src/divide/fp_div_d.o src/divide/fp_mod.o src/divide/fp_mod_d.o \
-src/exptmod/fp_2expt.o src/exptmod/fp_exptmod.o src/misc/fp_ident.o src/misc/fp_set.o \
+src/bin/fp_reverse.o src/bin/fp_signed_bin_size.o src/bin/fp_s_rmap.o src/bin/fp_toradix.o \
+src/bin/fp_toradix_n.o src/bin/fp_to_signed_bin.o src/bin/fp_to_unsigned_bin.o \
+src/bin/fp_unsigned_bin_size.o src/bit/fp_cnt_lsb.o src/bit/fp_count_bits.o src/bit/fp_div_2.o \
+src/bit/fp_div_2d.o src/bit/fp_lshd.o src/bit/fp_mod_2d.o src/bit/fp_rshd.o src/divide/fp_div.o \
+src/divide/fp_div_d.o src/divide/fp_mod.o src/divide/fp_mod_d.o src/exptmod/fp_2expt.o \
+src/exptmod/fp_exptmod.o src/misc/fp_ident.o src/misc/fp_rand.o src/misc/fp_set.o \
 src/mont/fp_montgomery_calc_normalization.o src/mont/fp_montgomery_reduce.o \
-src/mont/fp_montgomery_setup.o src/mul/fp_mul.o src/mul/fp_mul_2.o src/mul/fp_mul_2d.o \
-src/mul/fp_mul_comba.o src/mul/fp_mul_comba_12.o src/mul/fp_mul_comba_17.o src/mul/fp_mul_comba_20.o \
-src/mul/fp_mul_comba_24.o src/mul/fp_mul_comba_28.o src/mul/fp_mul_comba_3.o src/mul/fp_mul_comba_32.o \
-src/mul/fp_mul_comba_4.o src/mul/fp_mul_comba_48.o src/mul/fp_mul_comba_6.o src/mul/fp_mul_comba_64.o \
-src/mul/fp_mul_comba_7.o src/mul/fp_mul_comba_8.o src/mul/fp_mul_comba_9.o \
+src/mont/fp_montgomery_setup.o src/mul/fp_mul_2.o src/mul/fp_mul_2d.o src/mul/fp_mul.o \
+src/mul/fp_mul_comba_12.o src/mul/fp_mul_comba_17.o src/mul/fp_mul_comba_20.o src/mul/fp_mul_comba_24.o \
+src/mul/fp_mul_comba_28.o src/mul/fp_mul_comba_32.o src/mul/fp_mul_comba_3.o src/mul/fp_mul_comba_48.o \
+src/mul/fp_mul_comba_4.o src/mul/fp_mul_comba_64.o src/mul/fp_mul_comba_6.o src/mul/fp_mul_comba_7.o \
+src/mul/fp_mul_comba_8.o src/mul/fp_mul_comba_9.o src/mul/fp_mul_comba.o \
 src/mul/fp_mul_comba_small_set.o src/mul/fp_mul_d.o src/mul/fp_mulmod.o src/numtheory/fp_gcd.o \
-src/numtheory/fp_invmod.o src/numtheory/fp_isprime.o src/numtheory/fp_lcm.o \
-src/numtheory/fp_prime_miller_rabin.o src/numtheory/fp_prime_random_ex.o src/sqr/fp_sqr.o \
-src/sqr/fp_sqr_comba.o src/sqr/fp_sqr_comba_12.o src/sqr/fp_sqr_comba_17.o src/sqr/fp_sqr_comba_20.o \
-src/sqr/fp_sqr_comba_24.o src/sqr/fp_sqr_comba_28.o src/sqr/fp_sqr_comba_3.o src/sqr/fp_sqr_comba_32.o \
-src/sqr/fp_sqr_comba_4.o src/sqr/fp_sqr_comba_48.o src/sqr/fp_sqr_comba_6.o src/sqr/fp_sqr_comba_64.o \
-src/sqr/fp_sqr_comba_7.o src/sqr/fp_sqr_comba_8.o src/sqr/fp_sqr_comba_9.o \
+src/numtheory/fp_invmod.o src/numtheory/fp_isprime.o src/numtheory/fp_isprime_ex.o \
+src/numtheory/fp_lcm.o src/numtheory/fp_prime_miller_rabin.o src/numtheory/fp_prime_random_ex.o \
+src/sqr/fp_sqr.o src/sqr/fp_sqr_comba_12.o src/sqr/fp_sqr_comba_17.o src/sqr/fp_sqr_comba_20.o \
+src/sqr/fp_sqr_comba_24.o src/sqr/fp_sqr_comba_28.o src/sqr/fp_sqr_comba_32.o src/sqr/fp_sqr_comba_3.o \
+src/sqr/fp_sqr_comba_48.o src/sqr/fp_sqr_comba_4.o src/sqr/fp_sqr_comba_64.o src/sqr/fp_sqr_comba_6.o \
+src/sqr/fp_sqr_comba_7.o src/sqr/fp_sqr_comba_8.o src/sqr/fp_sqr_comba_9.o src/sqr/fp_sqr_comba.o \
 src/sqr/fp_sqr_comba_generic.o src/sqr/fp_sqr_comba_small_set.o src/sqr/fp_sqrmod.o

 HEADERS=src/headers/tfm.h
@ -80,10 +82,13 @@ endif

 default: $(LIBNAME)

-objs: $(OBJECTS)
+$(OBJECTS): $(HEADERS)
+
+.c.o:
+	$(LTCOMPILE) $(CFLAGS) $(LDFLAGS) -o $@ -c $<

 $(LIBNAME): $(OBJECTS)
-	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
+	$(LT) --silent --mode=link --tag=CC $(CC) $(CFLAGS) $(LDFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION) -export-symbols libtfm.symbols

 install: $(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
@ -94,16 +99,26 @@ install: $(LIBNAME)
 mtest/mtest: mtest/mtest.c
 	cd mtest ; make mtest

-test: $(LIBNAME) demo/test.o mtest/mtest
-	$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
+demo/test.o: CFLAGS+=-Wno-unused-result

-timing: $(LIBNAME) demo/test.o
-	$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
+.PHONY: test
+test: $(LIBNAME) demo/test.o
+	$(LT) --mode=link --tag=CC $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o test demo/test.o $(LIBNAME)
+
+test_standalone: CFLAGS+=-DTFM_DEMO_TEST_VS_MTEST=0
+
+.PHONY: test_standalone
+test_standalone: $(LIBNAME) demo/test.o
+	$(LT) --mode=link --tag=CC $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o test demo/test.o $(LIBNAME)

 stest: $(LIBNAME) demo/stest.o
-	$(CC) $(CFLAGS) demo/stest.o $(LIBNAME_S) -o stest
+	$(LT) --mode=link --tag=CC $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o stest demo/stest.o $(LIBNAME)

-# $Source: /cvs/libtom/tomsfastmath/makefile.shared,v $ 
-# $Revision: 1.19 $ 
-# $Date: 2007/03/13 01:23:03 $ 
+.PHONY: timing
+timing: $(LIBNAME) demo/timing.o
+	$(LT) --mode=link --tag=CC $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o timing demo/timing.o $(LIBNAME)
+
+# $Source$
+# $Revision$
+# $Date$

--- a/mtest/makefile
+++ b/mtest/makefile
@ -1,9 +1,10 @@
-CFLAGS += -Wall -W -O3 
+CFLAGS += -Wall -W -O3 -Wno-unused-result

 default: mtest

+.PHONY: mtest
 mtest: mtest.o
 	$(CC) $(CFLAGS) mtest.o -ltommath -o mtest

 clean:
-	rm -f *.o mtest *~
+	rm -f *.o mtest *~ mtest.exe
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -39,6 +39,7 @@ mulmod
 #include <time.h>
 #include <tommath.h>
 #define CRYPT
+#undef DIGIT_BIT
 #include "../src/headers/tfm.h"

 FILE *rng;
@ -46,8 +47,8 @@ FILE *rng;
 /* 1-2048 bit numbers */
 void rand_num(mp_int *a)
 {
-   int n, size;
-   unsigned char buf[2048];
+   int size;
+   unsigned char buf[(FP_MAX_SIZE/16 - DIGIT_BIT/2) + 1];

   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % (FP_MAX_SIZE/16 - DIGIT_BIT/2);
   buf[0] = (fgetc(rng)&1)?1:0;
@ -59,8 +60,8 @@ void rand_num(mp_int *a)
 /* 1-256 bit numbers (to test things like exptmod) */
 void rand_num2(mp_int *a)
 {
-   int n, size;
-   unsigned char buf[2048];
+   int size;
+   unsigned char buf[(FP_MAX_SIZE/16 - DIGIT_BIT/2) + 1];

   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % (FP_MAX_SIZE/16 - DIGIT_BIT/2);
   buf[0] = (fgetc(rng)&1)?1:0;
@ -69,13 +70,15 @@ void rand_num2(mp_int *a)
   mp_read_raw(a, buf, 1+size);
 }

-#define mp_to64(a, b) mp_toradix(a, b, 64)
+#define mp_to64(a, b) mp_toradix_n(a, b, 64, sizeof(b))

 int main(void)
 {
   int n, tmp;
   mp_int a, b, c, d, e;
+#ifdef MTEST_NO_FULLSPEED
   clock_t t1;
+#endif
   char buf[4096];

   mp_init(&a);
@ -88,7 +91,7 @@ int main(void)
   /* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */
 /*
   mp_set(&a, 1);
-   for (n = 1; n < 8192; n++) {
+   for (n = 1; n < ((FP_MAX_SIZE-(8*DIGIT_BIT))/2); n++) {
       mp_mul(&a, &a, &c);
       printf("mul\n");
       mp_to64(&a, buf);
@ -111,9 +114,11 @@ int main(void)
      }
   }

+#ifdef MTEST_NO_FULLSPEED
   t1 = clock();
+#endif
   for (;;) {
-#if 0
+#ifdef MTEST_NO_FULLSPEED
      if (clock() - t1 > CLOCKS_PER_SEC) {
         sleep(2);
         t1 = clock();
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
--- a/src/addsub/fp_add.c
+++ b/src/addsub/fp_add.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_add(fp_int *a, fp_int *b, fp_int *c)
 {
--- a/src/addsub/fp_add_d.c
+++ b/src/addsub/fp_add_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a + b */
 void fp_add_d(fp_int *a, fp_digit b, fp_int *c)
--- a/src/addsub/fp_addmod.c
+++ b/src/addsub/fp_addmod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* d = a + b (mod c) */
 int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
--- a/src/addsub/fp_cmp.c
+++ b/src/addsub/fp_cmp.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 int fp_cmp(fp_int *a, fp_int *b)
 {
--- a/src/addsub/fp_cmp_d.c
+++ b/src/addsub/fp_cmp_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* compare against a single digit */
 int fp_cmp_d(fp_int *a, fp_digit b)
--- a/src/addsub/fp_cmp_mag.c
+++ b/src/addsub/fp_cmp_mag.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 int fp_cmp_mag(fp_int *a, fp_int *b)
 {
--- a/src/addsub/fp_sub.c
+++ b/src/addsub/fp_sub.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a - b */
 void fp_sub(fp_int *a, fp_int *b, fp_int *c)
--- a/src/addsub/fp_sub_d.c
+++ b/src/addsub/fp_sub_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a - b */
 void fp_sub_d(fp_int *a, fp_digit b, fp_int *c)
--- a/src/addsub/fp_submod.c
+++ b/src/addsub/fp_submod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* d = a - b (mod c) */
 int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
--- a/src/addsub/s_fp_add.c
+++ b/src/addsub/s_fp_add.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* unsigned addition */
 void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
@ -16,7 +16,7 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
  register fp_word  t;

  y       = MAX(a->used, b->used);
-  oldused = c->used;
+  oldused = MIN(c->used, FP_SIZE);
  c->used = y;
 
  t = 0;
--- a/src/addsub/s_fp_sub.c
+++ b/src/addsub/s_fp_sub.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
@ -27,7 +27,7 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
  for (; x < a->used; x++) {
     t         = ((fp_word)a->dp[x]) - t;
     c->dp[x]  = (fp_digit)t;
-     t         = (t >> DIGIT_BIT);
+     t         = (t >> DIGIT_BIT)&1;
   }
  for (; x < oldused; x++) {
     c->dp[x] = 0;
--- a/src/bin/fp_radix_size.c
+++ b/src/bin/fp_radix_size.c
@ -7,11 +7,10 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 int fp_radix_size(fp_int *a, int radix, int *size)
 {
-  int     digs;
  fp_int  t;
  fp_digit d;

@ -36,7 +35,6 @@ int fp_radix_size(fp_int *a, int radix, int *size)
    t.sign = FP_ZPOS;
  }

-  digs = 0;
  while (fp_iszero (&t) == FP_NO) {
    fp_div_d (&t, (fp_digit) radix, &t, &d);
    (*size)++;
--- a/src/bin/fp_read_radix.c
+++ b/src/bin/fp_read_radix.c
@ -7,13 +7,16 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 int fp_read_radix(fp_int *a, char *str, int radix)
 {
  int     y, neg;
  char    ch;

+  /* set the integer to the default of zero */
+  fp_zero (a);
+
  /* make sure the radix is ok */
  if (radix < 2 || radix > 64) {
    return FP_VAL;
@ -29,16 +32,13 @@ int fp_read_radix(fp_int *a, char *str, int radix)
    neg = FP_ZPOS;
  }

-  /* set the integer to the default of zero */
-  fp_zero (a);
-
  /* process each digit of the string */
  while (*str) {
    /* if the radix < 36 the conversion is case insensitive
     * this allows numbers like 1AB and 1ab to represent the same  value
     * [e.g. in hex]
     */
-    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    ch = (char) ((radix <= 36) ? toupper ((unsigned char)*str) : *str);
    for (y = 0; y < 64; y++) {
      if (ch == fp_s_rmap[y]) {
         break;
--- a/src/bin/fp_read_signed_bin.c
+++ b/src/bin/fp_read_signed_bin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_read_signed_bin(fp_int *a, unsigned char *b, int c)
 {
--- a/src/bin/fp_read_unsigned_bin.c
+++ b/src/bin/fp_read_unsigned_bin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_read_unsigned_bin(fp_int *a, unsigned char *b, int c)
 {
--- a/src/bin/fp_reverse.c
+++ b/src/bin/fp_reverse.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* reverse an array, used for radix code */
 void fp_reverse (unsigned char *s, int len)
--- a/src/bin/fp_s_rmap.c
+++ b/src/bin/fp_s_rmap.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* chars used in radix conversions */
 const char *fp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
--- a/src/bin/fp_signed_bin_size.c
+++ b/src/bin/fp_signed_bin_size.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 int fp_signed_bin_size(fp_int *a)
 {
--- a/src/bin/fp_to_signed_bin.c
+++ b/src/bin/fp_to_signed_bin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_to_signed_bin(fp_int *a, unsigned char *b)
 {
--- a/src/bin/fp_to_unsigned_bin.c
+++ b/src/bin/fp_to_unsigned_bin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_to_unsigned_bin(fp_int *a, unsigned char *b)
 {
--- a/src/bin/fp_toradix.c
+++ b/src/bin/fp_toradix.c
@ -7,51 +7,23 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

+/**
+ * a:		pointer to fp_int representing the input number
+ * str:		output buffer
+ * radix:	base of the output encoding (number of distinct digit characters)
+ *
+ * The radix value can be in the range 2 to 64. This function converts the
+ * number a into the string str. Avoid this function: the size of str is not
+ * passed in, so a buffer that is too small overflows without being detected.
+ * Please use fp_toradix_n() instead.
+ *
+ * Return: FP_VAL on error, FP_OKAY on success.
+ */
 int fp_toradix(fp_int *a, char *str, int radix)
 {
-  int     digs;
-  fp_int  t;
-  fp_digit d;
-  char   *_s = str;
-
-  /* check range of the radix */
-  if (radix < 2 || radix > 64) {
-    return FP_VAL;
-  }
-
-  /* quick out if its zero */
-  if (fp_iszero(a) == 1) {
-     *str++ = '0';
-     *str = '\0';
-     return FP_OKAY;
-  }
-
-  fp_init_copy(&t, a);
-
-  /* if it is negative output a - */
-  if (t.sign == FP_NEG) {
-    ++_s;
-    *str++ = '-';
-    t.sign = FP_ZPOS;
-  }
-
-  digs = 0;
-  while (fp_iszero (&t) == FP_NO) {
-    fp_div_d (&t, (fp_digit) radix, &t, &d);
-    *str++ = fp_s_rmap[d];
-    ++digs;
-  }
-
-  /* reverse the digits of the string.  In this case _s points
-   * to the first digit [exluding the sign] of the number]
-   */
-  fp_reverse ((unsigned char *)_s, digs);
-
-  /* append a NULL so the string is properly terminated */
-  *str = '\0';
-  return FP_OKAY;
+   return fp_toradix_n(a, str, radix, INT_MAX);
 }

 /* $Source$ */
--- a/src/bin/fp_toradix_n.c
+++ b/src/bin/fp_toradix_n.c
@ -0,0 +1,71 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ *
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ *
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+#include <tfm_private.h>
+/* Write a in base radix (2..64) to str as a NUL-terminated string, using at most maxlen bytes. */
+int fp_toradix_n(fp_int *a, char *str, int radix, int maxlen)
+{
+   int digs;
+   fp_int t;
+   fp_digit d;
+   char *_s = str;
+
+   /* need room for at least one character plus the NUL, and a radix of 2..64 */
+   if (maxlen < 2 || radix < 2 || radix > 64)
+      return FP_VAL;
+
+   /* zero is emitted directly as "0" */
+   if (fp_iszero(a) == FP_YES) {
+      *str++ = '0';
+      *str = '\0';
+      return FP_OKAY;
+   }
+
+   fp_init_copy(&t, a);
+
+   /* if the working copy is negative, output a leading '-' */
+   if (t.sign == FP_NEG) {
+      /* the digits get reversed later, the sign must not be: skip over it */
+      ++_s;
+
+      /* store the sign and continue with the magnitude only */
+      *str++ = '-';
+      t.sign = FP_ZPOS;
+
+      /* the sign used up one byte of the buffer */
+      --maxlen;
+   }
+
+   digs = 0;
+   while (fp_iszero (&t) == FP_NO) {
+      if (--maxlen < 1) {
+         /* only the byte for the NUL is left; output is truncated, FP_VAL is returned below */
+         break;
+      }
+      fp_div_d(&t, (fp_digit) radix, &t, &d);
+      *str++ = fp_s_rmap[d];
+      ++digs;
+   }
+
+   /* reverse the digits of the string.  In this case _s points
+    * to the first digit [excluding the sign] of the number
+    */
+   fp_reverse((unsigned char *) _s, digs);
+
+   /* append a NUL so the string is properly terminated */
+   *str = '\0';
+   /* maxlen < 1 here means the loop above ran out of room before t reached zero */
+   if (maxlen < 1)
+      return FP_VAL;
+   return FP_OKAY;
+}
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/bin/fp_unsigned_bin_size.c
+++ b/src/bin/fp_unsigned_bin_size.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 int fp_unsigned_bin_size(fp_int *a)
 {
--- a/src/bit/fp_cnt_lsb.c
+++ b/src/bit/fp_cnt_lsb.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 static const int lnz[16] = {
   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
--- a/src/bit/fp_count_bits.c
+++ b/src/bit/fp_count_bits.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 int fp_count_bits (fp_int * a)
 {
--- a/src/bit/fp_div_2.c
+++ b/src/bit/fp_div_2.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* b = a/2 */
 void fp_div_2(fp_int * a, fp_int * b)
--- a/src/bit/fp_div_2d.c
+++ b/src/bit/fp_div_2d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a / 2**b */
 void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d)
--- a/src/bit/fp_lshd.c
+++ b/src/bit/fp_lshd.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_lshd(fp_int *a, int x)
 {
--- a/src/bit/fp_mod_2d.c
+++ b/src/bit/fp_mod_2d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a mod 2**d */
 void fp_mod_2d(fp_int *a, int b, fp_int *c)
--- a/src/bit/fp_rshd.c
+++ b/src/bit/fp_rshd.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_rshd(fp_int *a, int x)
 {
--- a/src/divide/fp_div.c
+++ b/src/divide/fp_div.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* a/b => cb + d == a */
 int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
--- a/src/divide/fp_div_d.c
+++ b/src/divide/fp_div_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 static int s_is_power_of_two(fp_digit b, int *p)
 {
--- a/src/divide/fp_mod.c
+++ b/src/divide/fp_mod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a mod b, 0 <= c < b  */
 int fp_mod(fp_int *a, fp_int *b, fp_int *c)
--- a/src/divide/fp_mod_d.c
+++ b/src/divide/fp_mod_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a mod b, 0 <= c < b  */
 int fp_mod_d(fp_int *a, fp_digit b, fp_digit *c)
--- a/src/exptmod/fp_2expt.c
+++ b/src/exptmod/fp_2expt.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* computes a = 2**b */
 void fp_2expt(fp_int *a, int b)
--- a/src/exptmod/fp_exptmod.c
+++ b/src/exptmod/fp_exptmod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 #ifdef TFM_TIMING_RESISTANT

--- a/src/generators/.gitignore
+++ b/src/generators/.gitignore
@ -0,0 +1,8 @@
+comba_mult_gen
+comba_mult_smallgen
+comba_sqr_gen
+comba_sqr_smallgen
+comba_mult_gen.exe
+comba_mult_smallgen.exe
+comba_sqr_gen.exe
+comba_sqr_smallgen.exe
--- a/src/generators/comba_mult_gen.c
+++ b/src/generators/comba_mult_gen.c
@ -18,6 +18,10 @@ int main(int argc, char **argv)

   /* print out preamble */
 printf(
+"#define TFM_DEFINES\n"
+"#include \"fp_mul_comba.c\"\n"
+"\n"
+"#if defined(TFM_MUL%d) && FP_SIZE >= %d\n"
 "void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n"
 "{\n"
 "   fp_digit c0, c1, c2, at[%d];\n"
@ -26,7 +30,7 @@ printf(
 "   memcpy(at+%d, B->dp, %d * sizeof(fp_digit));\n"
 "   COMBA_START;\n"
 "\n"
-"   COMBA_CLEAR;\n", N, N+N, N, N, N);
+"   COMBA_CLEAR;\n", N, N+N, N, N+N, N, N, N);

   /* now do the rows */
   for (x = 0; x < (N+N-1); x++) {
@ -53,7 +57,11 @@ printf(
 "   C->sign = A->sign ^ B->sign;\n"
 "   fp_clamp(C);\n"
 "   COMBA_FINI;\n"
-"}\n\n\n", N+N-1, N+N);
+"}\n#endif\n\n\n"
+"/* $Source$ */\n"
+"/* $Revision$ */\n"
+"/* $Date$ */\n"
+, N+N-1, N+N);

  return 0;
 }
--- a/src/generators/comba_mult_smallgen.c
+++ b/src/generators/comba_mult_smallgen.c
@ -7,6 +7,10 @@ int main(int argc, char **argv)

   /* print out preamble */
 printf(
+"#define TFM_DEFINES\n"
+"#include \"fp_mul_comba.c\"\n"
+"\n"
+"#if defined(TFM_SMALL_SET)\n"
 "void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)\n"
 "{\n"
 "   fp_digit c0, c1, c2, at[32];\n"
@ -51,7 +55,10 @@ printf(
 "      COMBA_FINI;\n"
 "      break;\n", N+N-1, N+N);
 }
-printf("   }\n}\n\n");
+printf("   }\n}\n\n#endif\n\n\n"
+"/* $Source$ */\n"
+"/* $Revision$ */\n"
+"/* $Date$ */\n");

  return 0;
 }
--- a/src/generators/comba_sqr_gen.c
+++ b/src/generators/comba_sqr_gen.c
@ -16,10 +16,16 @@ int main(int argc, char **argv)
   N = atoi(argv[1]);

 printf(
-"#ifdef TFM_SQR%d\n"
+"#define TFM_DEFINES\n"
+"#include \"fp_sqr_comba.c\"\n"
+"\n"
+"#if defined(TFM_SQR%d) && FP_SIZE >= %d\n"
 "void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
 "{\n"
 "   fp_digit *a, b[%d], c0, c1, c2, sc0, sc1, sc2;\n"
+"#ifdef TFM_ISO\n"
+"   fp_word tt;\n"
+"#endif\n"
 "\n"
 "   a = A->dp;\n"
 "   COMBA_START;\n"
@ -29,7 +35,7 @@ printf(
 "\n"
 "   /* output 0 */\n"
 "   SQRADD(a[0],a[0]);\n"
-"   COMBA_STORE(b[0]);\n", N, N, N+N);
+"   COMBA_STORE(b[0]);\n", N, N+N, N, N+N);

   for (x = 1; x < N+N-1; x++) {
 printf(
@ -91,7 +97,11 @@ printf(
 "   B->sign = FP_ZPOS;\n"
 "   memcpy(B->dp, b, %d * sizeof(fp_digit));\n"
 "   fp_clamp(B);\n"
-"}\n#endif\n\n\n", N+N, N+N);
+"}\n#endif\n\n\n"
+"/* $Source$ */\n"
+"/* $Revision$ */\n"
+"/* $Date$ */\n"
+, N+N, N+N);

  return 0;
 }
--- a/src/generators/comba_sqr_smallgen.c
+++ b/src/generators/comba_sqr_smallgen.c
@ -16,9 +16,16 @@ int main(int argc, char **argv)
   int x, y, z, N, f;

 printf(
+"#define TFM_DEFINES\n"
+"#include \"fp_sqr_comba.c\"\n"
+"\n"
+"#if defined(TFM_SMALL_SET)\n"
 "void fp_sqr_comba_small(fp_int *A, fp_int *B)\n"
 "{\n"
 "   fp_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;\n"
+"#ifdef TFM_ISO\n"
+"   fp_word tt;\n"
+"#endif\n"
 );

 printf("   switch (A->used) { \n");
@ -99,7 +106,11 @@ printf(
 "      break;\n\n", N+N, N+N);
 }

-printf("}\n\n}\n");
+printf("}\n}\n\n#endif /* TFM_SMALL_SET */\n\n"
+"/* $Source$ */\n"
+"/* $Revision$ */\n"
+"/* $Date$ */\n"
+);

  return 0;
 }
--- a/src/generators/makefile
+++ b/src/generators/makefile
@ -0,0 +1,31 @@
+# build all four generators, matching what "clean" removes and "regen" runs
+all: comba_mult_gen comba_mult_smallgen comba_sqr_gen comba_sqr_smallgen
+clean:
+	rm -f comba_mult_gen
+	rm -f comba_mult_gen.exe
+	rm -f comba_mult_smallgen
+	rm -f comba_mult_smallgen.exe
+	rm -f comba_sqr_gen
+	rm -f comba_sqr_gen.exe
+	rm -f comba_sqr_smallgen
+	rm -f comba_sqr_smallgen.exe
+
+comba_mult_gen: comba_mult_gen.c
+	gcc -o comba_mult_gen comba_mult_gen.c
+comba_mult_smallgen: comba_mult_smallgen.c
+	gcc -o comba_mult_smallgen comba_mult_smallgen.c
+comba_sqr_gen: comba_sqr_gen.c
+	gcc -o comba_sqr_gen comba_sqr_gen.c
+comba_sqr_smallgen: comba_sqr_smallgen.c
+	gcc -o comba_sqr_smallgen comba_sqr_smallgen.c
+
+regen: comba_mult_gen comba_mult_smallgen comba_sqr_gen comba_sqr_smallgen
+	for i in 3 4 6 7 8 9 12 17 20 24 28 32 48 64; do \
+		./comba_mult_gen $$i | sed -e 's/ *$$//' > ../mul/fp_mul_comba_$$i.c; \
+	done
+	./comba_mult_smallgen > ../mul/fp_mul_comba_small_set.c
+	for i in 3 4 6 7 8 9 12 17 20 24 28 32 48 64; do \
+		./comba_sqr_gen $$i | sed -e 's/ *$$//' > ../sqr/fp_sqr_comba_$$i.c; \
+	done
+	./comba_sqr_smallgen > ../sqr/fp_sqr_comba_small_set.c
+
--- a/src/headers/tfm.h
+++ b/src/headers/tfm.h
@ -16,6 +16,15 @@
 #include <ctype.h>
 #include <limits.h>

+/* 0xMaMiPaXX
+ * Major
+ * Minor
+ * Patch
+ * XX - undefined
+ */
+#define TFM_VERSION     0x000D0000
+#define TFM_VERSION_S   "v0.13.0"
+
 #ifndef MIN
   #define MIN(x,y) ((x)<(y)?(x):(y))
 #endif
@ -104,6 +113,10 @@
   #error FP_MAX_SIZE must be a multiple of CHAR_BIT
 #endif

+#if __SIZEOF_LONG__ == 8
+	#define FP_64BIT
+#endif
+
 /* autodetect x86-64 and make sure we are using 64-bit digits with x86-64 asm */
 #if defined(__x86_64__)
   #if defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)
@ -245,11 +258,15 @@
 #if defined(FP_64BIT)
   /* for GCC only on supported platforms */
 #ifndef CRYPT
-   typedef unsigned long ulong64;
-#endif
+   typedef unsigned long long ulong64;
+#endif /* CRYPT */
+
   typedef ulong64            fp_digit;
+#define SIZEOF_FP_DIGIT 8
   typedef unsigned long      fp_word __attribute__ ((mode(TI)));
+
 #else
+
   /* this is to make porting into LibTomCrypt easier :-) */
 #ifndef CRYPT
   #if defined(_MSC_VER) || defined(__BORLANDC__)
@ -258,14 +275,16 @@
   #else
      typedef unsigned long long ulong64;
      typedef signed long long   long64;
-   #endif
-#endif
-   typedef unsigned long      fp_digit;
+   #endif /* defined(_MSC_VER) ... */
+#endif /* CRYPT */
+
+   typedef unsigned int       fp_digit;
+#define SIZEOF_FP_DIGIT 4
   typedef ulong64            fp_word;
-#endif
+#endif /* FP_64BIT */

 /* # of digits this is */
-#define DIGIT_BIT  (int)((CHAR_BIT) * sizeof(fp_digit))
+#define DIGIT_BIT  ((CHAR_BIT) * SIZEOF_FP_DIGIT)
 #define FP_MASK    (fp_digit)(-1)
 #define FP_SIZE    (FP_MAX_SIZE/DIGIT_BIT)

@ -311,6 +330,9 @@ const char *fp_ident(void);
 /* set to a small digit */
 void fp_set(fp_int *a, fp_digit b);

+/* makes a pseudo-random int of a given size */
+void fp_rand(fp_int *a, int digits);
+
 /* copy from a to b */
 #define fp_copy(a, b)      (void)(((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
 #define fp_init_copy(a, b) fp_copy(b, a)
@ -422,8 +444,11 @@ int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 /* perform a Miller-Rabin test of a to the base b and store result in "result" */
 void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result);

+#define FP_PRIME_SIZE      256
 /* 256 trial divisions + 8 Miller-Rabins, returns FP_YES if probable prime  */
 int fp_isprime(fp_int *a);
+/* extended version of fp_isprime, do 't' Miller-Rabins instead of only 8 */
+int fp_isprime_ex(fp_int *a, int t);

 /* Primality generation flags */
 #define TFM_PRIME_BBS      0x0001 /* BBS style prime */
@ -450,119 +475,13 @@ void fp_read_signed_bin(fp_int *a, unsigned char *b, int c);
 void fp_to_signed_bin(fp_int *a, unsigned char *b);

 int fp_read_radix(fp_int *a, char *str, int radix);
+
+int fp_radix_size(fp_int *a, int radix, int *size);
 int fp_toradix(fp_int *a, char *str, int radix);
 int fp_toradix_n(fp_int * a, char *str, int radix, int maxlen);

-
-/* VARIOUS LOW LEVEL STUFFS */
-void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
-void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
-void fp_reverse(unsigned char *s, int len);
-
-void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
-
-#ifdef TFM_SMALL_SET
-void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
 #endif

-#ifdef TFM_MUL3
-void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL4
-void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL6
-void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL7
-void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL8
-void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL9
-void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL12
-void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL17
-void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C);
-#endif
-
-#ifdef TFM_MUL20
-void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL24
-void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL28
-void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL32
-void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL48
-void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C);
-#endif
-#ifdef TFM_MUL64
-void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C);
-#endif
-
-void fp_sqr_comba(fp_int *A, fp_int *B);
-
-#ifdef TFM_SMALL_SET
-void fp_sqr_comba_small(fp_int *A, fp_int *B);
-#endif
-
-#ifdef TFM_SQR3
-void fp_sqr_comba3(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR4
-void fp_sqr_comba4(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR6
-void fp_sqr_comba6(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR7
-void fp_sqr_comba7(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR8
-void fp_sqr_comba8(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR9
-void fp_sqr_comba9(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR12
-void fp_sqr_comba12(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR17
-void fp_sqr_comba17(fp_int *A, fp_int *B);
-#endif
-
-#ifdef TFM_SQR20
-void fp_sqr_comba20(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR24
-void fp_sqr_comba24(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR28
-void fp_sqr_comba28(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR32
-void fp_sqr_comba32(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR48
-void fp_sqr_comba48(fp_int *A, fp_int *B);
-#endif
-#ifdef TFM_SQR64
-void fp_sqr_comba64(fp_int *A, fp_int *B);
-#endif
-extern const char *fp_s_rmap;
-
-#endif
-
-
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/headers/tfm_private.h
+++ b/src/headers/tfm_private.h
@ -0,0 +1,125 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ *
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ *
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+#ifndef TFM_PRIVATE_H_
+#define TFM_PRIVATE_H_
+
+#include <tfm.h>
+
+/* various low-level helpers */
+void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
+void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
+void fp_reverse(unsigned char *s, int len);
+
+void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
+
+#ifdef TFM_SMALL_SET
+void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
+#endif
+
+#ifdef TFM_MUL3
+void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL4
+void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL6
+void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL7
+void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL8
+void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL9
+void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL12
+void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL17
+void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C);
+#endif
+
+#ifdef TFM_MUL20
+void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL24
+void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL28
+void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL32
+void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL48
+void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C);
+#endif
+#ifdef TFM_MUL64
+void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C);
+#endif
+
+void fp_sqr_comba(fp_int *A, fp_int *B);
+
+#ifdef TFM_SMALL_SET
+void fp_sqr_comba_small(fp_int *A, fp_int *B);
+#endif
+
+#ifdef TFM_SQR3
+void fp_sqr_comba3(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR4
+void fp_sqr_comba4(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR6
+void fp_sqr_comba6(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR7
+void fp_sqr_comba7(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR8
+void fp_sqr_comba8(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR9
+void fp_sqr_comba9(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR12
+void fp_sqr_comba12(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR17
+void fp_sqr_comba17(fp_int *A, fp_int *B);
+#endif
+
+#ifdef TFM_SQR20
+void fp_sqr_comba20(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR24
+void fp_sqr_comba24(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR28
+void fp_sqr_comba28(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR32
+void fp_sqr_comba32(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR48
+void fp_sqr_comba48(fp_int *A, fp_int *B);
+#endif
+#ifdef TFM_SQR64
+void fp_sqr_comba64(fp_int *A, fp_int *B);
+#endif
+extern const char *fp_s_rmap;
+
+#endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/misc/fp_ident.c
+++ b/src/misc/fp_ident.c
@ -7,7 +7,7 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include "tfm.h"
+#include <tfm_private.h>

 const char *fp_ident(void)
 {
@ -15,11 +15,14 @@ const char *fp_ident(void)

   memset(buf, 0, sizeof(buf));
   snprintf(buf, sizeof(buf)-1,
-"TomsFastMath (%s)\n"
+"TomsFastMath " TFM_VERSION_S "\n"
+#if defined(TFM_IDENT_BUILD_DATE)
+"Built on " __DATE__ " at " __TIME__ "\n"
+#endif
 "\n"
 "Sizeofs\n"
-"\tfp_digit = %u\n"
-"\tfp_word  = %u\n"
+"\tfp_digit = %lu\n"
+"\tfp_word  = %lu\n"
 "\n"
 "FP_MAX_SIZE = %u\n"
 "\n"
@ -70,11 +73,11 @@ const char *fp_ident(void)
 #ifdef TFM_HUGE
 " TFM_HUGE "
 #endif
-"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE);
+"\n", (unsigned long)sizeof(fp_digit), (unsigned long)sizeof(fp_word), FP_MAX_SIZE);

   if (sizeof(fp_digit) == sizeof(fp_word)) {
      strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n",
-              sizeof(buf)-1);
+              sizeof(buf) - strlen(buf) - 1);
   }
   return buf;
 }
--- a/src/misc/fp_rand.c
+++ b/src/misc/fp_rand.c
@ -0,0 +1,41 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ *
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ *
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+#include <tfm_private.h>
+
+/* Returns one pseudo-random digit built from rand().
+ *
+ * The C standard only guarantees RAND_MAX >= 32767, i.e. 15 usable bits per
+ * call, so a single rand() cannot fill a whole fp_digit.  15-bit chunks are
+ * concatenated until every bit selected by FP_MASK has been covered.
+ */
+static fp_digit s_fp_rand_digit(void)
+{
+   fp_digit d = 0, covered = 0;
+
+   do {
+     d       = (d << 15) | (((fp_digit) rand ()) & 0x7FFF);
+     covered = (covered << 15) | 0x7FFF;
+   } while ((covered & FP_MASK) != FP_MASK);
+
+   return d & FP_MASK;
+}
+
+/* Makes a pseudo-random int of a given size.
+ *
+ * a      - receives the result; it is set to zero when digits <= 0
+ * digits - number of digits to generate
+ *
+ * The randomness comes from rand(), so the result is only as good as the
+ * C library generator and must not be used for cryptographic purposes.
+ */
+void fp_rand(fp_int *a, int digits)
+{
+   fp_digit d;
+
+   fp_zero(a);
+   if (digits <= 0) {
+     return;
+   }
+
+   /* the first digit placed is shifted up by every later fp_lshd, so it
+    * must be non-zero for the value to keep its requested length */
+   do {
+     d = s_fp_rand_digit();
+   } while (d == 0);
+
+   fp_add_d (a, d, a);
+
+   /* append the remaining digits: shift up one digit, fill the new low one */
+   while (--digits > 0) {
+     fp_lshd (a, 1);
+     fp_add_d (a, s_fp_rand_digit(), a);
+   }
+}
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/misc/fp_set.c
+++ b/src/misc/fp_set.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_set(fp_int *a, fp_digit b)
 {
--- a/src/mont/fp_montgomery_calc_normalization.c
+++ b/src/mont/fp_montgomery_calc_normalization.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* computes a = B**n mod b without division or multiplication useful for
 * normalizing numbers in a Montgomery system.
--- a/src/mont/fp_montgomery_reduce.c
+++ b/src/mont/fp_montgomery_reduce.c
@ -7,7 +7,7 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /******************************************************************/
 #if defined(TFM_X86) && !defined(TFM_SSE2)
@ -29,8 +29,8 @@ asm(                                                      \
   "adcl $0,%%edx \n\t"                                   \
   "movl %%edx,%1 \n\t"                                   \
 :"=g"(_c[LO]), "=r"(cy)                                   \
-:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
-: "%eax", "%edx", "%cc")
+:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
+: "%eax", "%edx", "cc")

 #define PROPCARRY                           \
 asm(                                        \
@ -39,7 +39,7 @@ asm(                                        \
   "movzbl %%al,%1 \n\t"                    \
 :"=g"(_c[LO]), "=r"(cy)                     \
 :"0"(_c[LO]), "1"(cy)                       \
-: "%eax", "%cc")
+: "%eax", "cc")

 /******************************************************************/
 #elif defined(TFM_X86_64)
@ -62,7 +62,7 @@ asm(                                                      \
   "movq %%rdx,%1 \n\t"                                   \
 :"=g"(_c[LO]), "=r"(cy)                                   \
 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
-: "%rax", "%rdx", "%cc")
+: "%rax", "%rdx", "cc")

 #define INNERMUL8 \
 asm(                  \
@ -155,7 +155,7 @@ asm(                                                      \
 \
 :"=r"(_c), "=r"(cy)                    \
 : "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
-: "%rax", "%rdx", "%r10", "%r11", "%cc")
+: "%rax", "%rdx", "%r10", "%r11", "cc")


 #define PROPCARRY                           \
@ -165,7 +165,7 @@ asm(                                        \
   "movzbq %%al,%1 \n\t"                    \
 :"=g"(_c[LO]), "=r"(cy)                     \
 :"0"(_c[LO]), "1"(cy)                       \
-: "%rax", "%cc")
+: "%rax", "cc")

 /******************************************************************/
 #elif defined(TFM_SSE2)
@ -280,7 +280,7 @@ asm(                                        \
   "movzbl %%al,%1 \n\t"                    \
 :"=g"(_c[LO]), "=r"(cy)                     \
 :"0"(_c[LO]), "1"(cy)                       \
-: "%eax", "%cc")
+: "%eax", "cc")

 /******************************************************************/
 #elif defined(TFM_ARM)
@ -300,7 +300,7 @@ asm(                                \
    " MOVCC  %0,#0            \n\t" \
    " UMLAL  r0,%0,%3,%4      \n\t" \
    " STR    r0,%1            \n\t" \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");

 #define PROPCARRY                  \
 asm(                               \
@ -309,7 +309,7 @@ asm(                               \
    " STR   r0,%1            \n\t" \
    " MOVCS %0,#1            \n\t" \
    " MOVCC %0,#0            \n\t" \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");

 /******************************************************************/
 #elif defined(TFM_PPC32)
@ -325,22 +325,18 @@ asm(                               \
 asm(                                 \
   " mullw    16,%3,%4       \n\t"   \
   " mulhwu   17,%3,%4       \n\t"   \
-   " addc     16,16,%0       \n\t"   \
+   " addc     16,16,%2       \n\t"   \
   " addze    17,17          \n\t"   \
-   " lwz      18,%1          \n\t"   \
-   " addc     16,16,18       \n\t"   \
+   " addc     %1,16,%5       \n\t"   \
   " addze    %0,17          \n\t"   \
-   " stw      16,%1          \n\t"   \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","%cc"); ++tmpm;
+:"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;

 #define PROPCARRY                    \
 asm(                                 \
-   " lwz      16,%1         \n\t"    \
-   " addc     16,16,%0      \n\t"    \
-   " stw      16,%1         \n\t"    \
-   " xor      %0,%0,%0      \n\t"    \
-   " addze    %0,%0         \n\t"    \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","%cc");
+   " addc     %1,%3,%2      \n\t"    \
+   " xor      %0,%2,%2      \n\t"    \
+   " addze    %0,%2         \n\t"    \
+:"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");

 /******************************************************************/
 #elif defined(TFM_PPC64)
@ -362,7 +358,7 @@ asm(                                 \
   " addc     r16,r16,r18       \n\t"   \
   " addze    %0,r17          \n\t"   \
   " sdx      r16,0,%1        \n\t"   \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","%cc"); ++tmpm;
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;

 #define PROPCARRY                    \
 asm(                                 \
@ -371,7 +367,7 @@ asm(                                 \
   " sdx      r16,0,%1       \n\t"    \
   " xor      %0,%0,%0      \n\t"    \
   " addze    %0,%0         \n\t"    \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","%cc");
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");

 /******************************************************************/
 #elif defined(TFM_AVR32)
@ -401,7 +397,7 @@ asm(                                 \
   " st.w     %1,r2         \n\t"    \
   " eor      %0,%0         \n\t"    \
   " acr      %0            \n\t"    \
-:"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","%cc");
+:"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");

 /******************************************************************/
 #elif defined(TFM_MIPS)
@ -509,7 +505,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
       _c   = c + x;
       tmpm = m->dp;
       y = 0;
-       #if (defined(TFM_SSE2) || defined(TFM_X86_64))
+       #if defined(INNERMUL8)
        for (; y < (pa & ~7); y += 8) {
              INNERMUL8;
              _c   += 8;
--- a/src/mont/fp_montgomery_setup.c
+++ b/src/mont/fp_montgomery_setup.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* setups the montgomery reduction */
 int fp_montgomery_setup(fp_int *a, fp_digit *rho)
--- a/src/mul/fp_mul.c
+++ b/src/mul/fp_mul.c
@ -7,122 +7,133 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a * b */
 void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 {
-    int   y, yy;
+    int   y, old_used;
+#if FP_SIZE >= 48
+    int   yy;
+#endif
+
+    old_used = C->used;

    /* call generic if we're out of range */
    if (A->used + B->used > FP_SIZE) {
       fp_mul_comba(A, B, C);
-       return ;
+       goto clean;
    }

     y  = MAX(A->used, B->used);
+#if FP_SIZE >= 48
     yy = MIN(A->used, B->used);
+#endif
    /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
       of the largest input.  We also want to avoid doing excess mults if the
       inputs are not close to the next power of two.  That is, for example,
       if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications
    */

-#ifdef TFM_MUL3
+#if defined(TFM_MUL3) && FP_SIZE >= 6
        if (y <= 3) {
           fp_mul_comba3(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL4
+#if defined(TFM_MUL4) && FP_SIZE >= 8
        if (y == 4) {
           fp_mul_comba4(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL6
+#if defined(TFM_MUL6) && FP_SIZE >= 12
        if (y <= 6) {
           fp_mul_comba6(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL7
+#if defined(TFM_MUL7) && FP_SIZE >= 14
        if (y == 7) {
           fp_mul_comba7(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL8
+#if defined(TFM_MUL8) && FP_SIZE >= 16
        if (y == 8) {
           fp_mul_comba8(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL9
+#if defined(TFM_MUL9) && FP_SIZE >= 18
        if (y == 9) {
           fp_mul_comba9(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL12
+#if defined(TFM_MUL12) && FP_SIZE >= 24
        if (y <= 12) {
           fp_mul_comba12(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL17
+#if defined(TFM_MUL17) && FP_SIZE >= 34
        if (y <= 17) {
           fp_mul_comba17(A,B,C);
-           return;
+           goto clean;
        }
 #endif

-#ifdef TFM_SMALL_SET
+#if defined(TFM_SMALL_SET) && FP_SIZE >= 32
        if (y <= 16) {
           fp_mul_comba_small(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL20)
+#if defined(TFM_MUL20) && FP_SIZE >= 40
        if (y <= 20) {
           fp_mul_comba20(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL24)
+#if defined(TFM_MUL24) && FP_SIZE >= 48
        if (yy >= 16 && y <= 24) {
           fp_mul_comba24(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL28)
+#if defined(TFM_MUL28) && FP_SIZE >= 56
        if (yy >= 20 && y <= 28) {
           fp_mul_comba28(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL32)
+#if defined(TFM_MUL32) && FP_SIZE >= 64
        if (yy >= 24 && y <= 32) {
           fp_mul_comba32(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL48)
+#if defined(TFM_MUL48) && FP_SIZE >= 96
        if (yy >= 40 && y <= 48) {
           fp_mul_comba48(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL64)
+#if defined(TFM_MUL64) && FP_SIZE >= 128
        if (yy >= 56 && y <= 64) {
           fp_mul_comba64(A,B,C);
-           return;
+           goto clean;
        }
 #endif
        fp_mul_comba(A,B,C);
+clean:
+    for (y = C->used; y < old_used; y++) {
+       C->dp[y] = 0;
+    }
 }


-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
+/* $Source: /cvs/libtom/tomsfastmath/src/mul/fp_mul.c,v $ */
+/* $Revision: 1.1 $ */
+/* $Date: 2006/12/31 21:25:53 $ */
--- a/src/mul/fp_mul_2.c
+++ b/src/mul/fp_mul_2.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 void fp_mul_2(fp_int * a, fp_int * b)
 {
--- a/src/mul/fp_mul_2d.c
+++ b/src/mul/fp_mul_2d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a * 2**d */
 void fp_mul_2d(fp_int *a, int b, fp_int *c)
--- a/src/mul/fp_mul_comba.c
+++ b/src/mul/fp_mul_comba.c
@ -12,7 +12,7 @@

 */

-#include <tfm.h>
+#include <tfm_private.h>

 #if defined(TFM_PRESCOTT) && defined(TFM_SSE2)
   #undef TFM_SSE2
@ -53,7 +53,7 @@ asm(                                                      \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");

 #elif defined(TFM_X86_64)
 /* x86-64 optimized */
@ -88,7 +88,7 @@ asm  (                                                    \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");

 #elif defined(TFM_SSE2)
 /* use SSE2 optimizations */
@ -128,7 +128,7 @@ asm(                                                     \
    "movd  %%mm0,%%eax  \n\t"                            \
    "adcl  %%eax,%1     \n\t"                            \
    "adcl  $0,%2        \n\t"                            \
-    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%cc");
+    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","cc");

 #elif defined(TFM_ARM)
 /* ARM code */
@ -155,7 +155,7 @@ asm(                                                          \
 "  ADDS   %0,%0,r0              \n\t"                         \
 "  ADCS   %1,%1,r1              \n\t"                         \
 "  ADC    %2,%2,#0              \n\t"                         \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

 #elif defined(TFM_PPC32)
 /* For 32-bit PPC */
@ -297,8 +297,11 @@ asm(                              \

 #define MULADD(i, j)                                    \
   do { fp_word t;                                      \
-   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t;                         \
-   t = (fp_word)c1 + (t >> DIGIT_BIT);            c1 = t; c2 += t >> DIGIT_BIT;   \
+   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j);       \
+   c0 = t;                                              \
+   t = (fp_word)c1 + (t >> DIGIT_BIT);                  \
+   c1 = t;                                              \
+   c2 += t >> DIGIT_BIT;                                \
   } while (0);

 #endif
@ -346,7 +349,9 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
      /* execute loop */
      COMBA_FORWARD;
      for (iz = 0; iz < iy; ++iz) {
-          MULADD(*tmpx++, *tmpy--);
+          fp_digit _tmpx = *tmpx++;
+          fp_digit _tmpy = *tmpy--;
+          MULADD(_tmpx, _tmpy);
      }

      /* store term */
--- a/src/mul/fp_mul_comba_12.c
+++ b/src/mul/fp_mul_comba_12.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL12
+#if defined(TFM_MUL12) && FP_SIZE >= 24
 void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[24];
@ -109,3 +109,8 @@ void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_17.c
+++ b/src/mul/fp_mul_comba_17.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL17
+#if defined(TFM_MUL17) && FP_SIZE >= 34
 void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[34];
@ -149,3 +149,8 @@ void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_20.c
+++ b/src/mul/fp_mul_comba_20.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL20
+#if defined(TFM_MUL20) && FP_SIZE >= 40
 void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[40];
@ -173,3 +173,8 @@ void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_24.c
+++ b/src/mul/fp_mul_comba_24.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL24
+#if defined(TFM_MUL24) && FP_SIZE >= 48
 void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[48];
@ -205,3 +205,8 @@ void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_28.c
+++ b/src/mul/fp_mul_comba_28.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL28
+#if defined(TFM_MUL28) && FP_SIZE >= 56
 void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[56];
@ -237,3 +237,8 @@ void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_3.c
+++ b/src/mul/fp_mul_comba_3.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL3
+#if defined(TFM_MUL3) && FP_SIZE >= 6
 void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[6];
@ -37,3 +37,8 @@ void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_32.c
+++ b/src/mul/fp_mul_comba_32.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL32
+#if defined(TFM_MUL32) && FP_SIZE >= 64
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[64];
@ -283,3 +283,8 @@ void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_4.c
+++ b/src/mul/fp_mul_comba_4.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL4
+#if defined(TFM_MUL4) && FP_SIZE >= 8
 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[8];
@ -45,3 +45,8 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_48.c
+++ b/src/mul/fp_mul_comba_48.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL48
+#if defined(TFM_MUL48) && FP_SIZE >= 96
 void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[96];
@ -397,3 +397,8 @@ void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_6.c
+++ b/src/mul/fp_mul_comba_6.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL6
+#if defined(TFM_MUL6) && FP_SIZE >= 12
 void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[12];
@ -61,3 +61,8 @@ void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_64.c
+++ b/src/mul/fp_mul_comba_64.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL64
+#if defined(TFM_MUL64) && FP_SIZE >= 128
 void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[128];
@ -525,3 +525,8 @@ void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_7.c
+++ b/src/mul/fp_mul_comba_7.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL7
+#if defined(TFM_MUL7) && FP_SIZE >= 14
 void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[14];
@ -69,3 +69,8 @@ void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_8.c
+++ b/src/mul/fp_mul_comba_8.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL8
+#if defined(TFM_MUL8) && FP_SIZE >= 16
 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[16];
@ -77,3 +77,8 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_9.c
+++ b/src/mul/fp_mul_comba_9.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"

-#ifdef TFM_MUL9
+#if defined(TFM_MUL9) && FP_SIZE >= 18
 void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[18];
@ -85,3 +85,8 @@ void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_comba_small_set.c
+++ b/src/mul/fp_mul_comba_small_set.c
@ -1226,3 +1226,8 @@ void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)
 }

 #endif
+
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/mul/fp_mul_d.c
+++ b/src/mul/fp_mul_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = a * b */
 void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
--- a/src/mul/fp_mulmod.c
+++ b/src/mul/fp_mulmod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* d = a * b (mod c) */
 int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 {
--- a/src/numtheory/fp_gcd.c
+++ b/src/numtheory/fp_gcd.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = (a, b) */
 void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
--- a/src/numtheory/fp_invmod.c
+++ b/src/numtheory/fp_invmod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
 {
--- a/src/numtheory/fp_isprime.c
+++ b/src/numtheory/fp_isprime.c
@ -7,71 +7,11 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
-
-/* a few primes */
-static const fp_digit primes[256] = {
-  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
-  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
-  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
-  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
-  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
-  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
-  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
-
-  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
-  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
-  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
-  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
-  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
-  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
-  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
-  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
-
-  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
-  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
-  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
-  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
-  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
-  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
-  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
-  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
-
-  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
-  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
-  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
-  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
-  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
-  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
-  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
-  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
-};
+#include <tfm_private.h>

 int fp_isprime(fp_int *a)
 {
-   fp_int   b;
-   fp_digit d;
-   int      r, res;
-
-   /* do trial division */
-   for (r = 0; r < 256; r++) {
-       fp_mod_d(a, primes[r], &d);
-       if (d == 0) {
-          return FP_NO;
-       }
-   }
-
-   /* now do 8 miller rabins */
-   fp_init(&b);
-   for (r = 0; r < 8; r++) {
-       fp_set(&b, primes[r]);
-       fp_prime_miller_rabin(a, &b, &res);
-       if (res == FP_NO) {
-          return FP_NO;
-       }
-   }
-   return FP_YES;
+  return fp_isprime_ex(a, 8);
 }

 /* $Source$ */
--- a/src/numtheory/fp_isprime_ex.c
+++ b/src/numtheory/fp_isprime_ex.c
@ -0,0 +1,83 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ *
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ *
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+#include <tfm_private.h>
+
+/* a few primes */
+static const fp_digit primes[FP_PRIME_SIZE] = {
+  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
+  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+
+  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+
+  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+
+  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+};
+
+/* Probabilistic primality test.
+ *
+ * a - the value to test
+ * t - number of Miller-Rabin rounds; round r uses primes[r] as its base,
+ *     so t must be in 1..FP_PRIME_SIZE
+ *
+ * Returns FP_YES if a passed trial division and all t rounds (or is itself
+ * one of the table primes), FP_NO otherwise, including when t is out of range.
+ */
+int fp_isprime_ex(fp_int *a, int t)
+{
+   fp_int   b;
+   fp_digit d;
+   int      r, res;
+
+   if (t <= 0 || t > FP_PRIME_SIZE) {
+     return FP_NO;
+   }
+
+   /* do trial division */
+   for (r = 0; r < FP_PRIME_SIZE; r++) {
+       fp_mod_d(a, primes[r], &d);
+       if (d == 0) {
+          /* a table prime divides a: a is prime only if it *is* that prime,
+           * otherwise it has a proper factor (this also rejects zero) */
+          return (fp_cmp_d(a, primes[r]) == FP_EQ) ? FP_YES : FP_NO;
+       }
+   }
+
+   /* now do 't' miller rabins */
+   fp_init(&b);
+   for (r = 0; r < t; r++) {
+       fp_set(&b, primes[r]);
+       fp_prime_miller_rabin(a, &b, &res);
+       if (res == FP_NO) {
+          return FP_NO;
+       }
+   }
+   return FP_YES;
+}
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
--- a/src/numtheory/fp_lcm.c
+++ b/src/numtheory/fp_lcm.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* c = [a, b] */
 void fp_lcm(fp_int *a, fp_int *b, fp_int *c)
--- a/src/numtheory/fp_prime_miller_rabin.c
+++ b/src/numtheory/fp_prime_miller_rabin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* Miller-Rabin test of "a" to the base of "b" as described in 
 * HAC pp. 139 Algorithm 4.24
--- a/src/numtheory/fp_prime_random_ex.c
+++ b/src/numtheory/fp_prime_random_ex.c
@ -7,7 +7,7 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* This is possibly the mother of all prime generation functions, muahahahahaha! */
 int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback cb, void *dat)
@ -16,7 +16,7 @@ int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback
   int res, err, bsize, maskOR_msb_offset;

   /* sanity check the input */
-   if (size <= 1 || t <= 0) {
+   if (size <= 1 || cb == NULL || t <= 0 || t > FP_PRIME_SIZE) {
      return FP_VAL;
   }

@ -35,7 +35,7 @@ int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback
   }

   /* calc the maskAND value for the MSbyte*/
-   maskAND = 0xFF >> (8 - (size & 7));
+   maskAND = 0xFF >> ((8 - (size & 7)) & 7);

   /* calc the maskOR_msb */
   maskOR_msb        = 0;
@ -71,7 +71,7 @@ int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback
      fp_read_unsigned_bin(a, tmp, bsize);

      /* is it prime? */
-      res = fp_isprime(a);
+      res = fp_isprime_ex(a, t);
      if (res == FP_NO) continue;

      if (flags & TFM_PRIME_SAFE) {
@ -80,7 +80,7 @@ int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback
         fp_div_2(a, a);

         /* is it prime? */
-         res = fp_isprime(a);
+         res = fp_isprime_ex(a, t);
      }
   } while (res == FP_NO);

--- a/src/sqr/fp_sqr.c
+++ b/src/sqr/fp_sqr.c
@ -7,114 +7,120 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 /* b = a*a  */
 void fp_sqr(fp_int *A, fp_int *B)
 {
-    int     y;
+    int     y, old_used;   /* old_used: value of B->used on entry */
+
+    old_used = B->used;   /* saved so the 'clean' step knows how many old digits B had */

    /* call generic if we're out of range */
    if (A->used + A->used > FP_SIZE) {
       fp_sqr_comba(A, B);
-       return ;
+       goto clean;   /* every exit now funnels through the cleanup at 'clean' */
    }

    y = A->used;
-#if defined(TFM_SQR3)
+#if defined(TFM_SQR3) && FP_SIZE >= 6   /* each unrolled N-digit path below is additionally gated on FP_SIZE >= 2*N */
        if (y <= 3) {
           fp_sqr_comba3(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR4)
+#if defined(TFM_SQR4) && FP_SIZE >= 8
        if (y == 4) {
           fp_sqr_comba4(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR6)
+#if defined(TFM_SQR6) && FP_SIZE >= 12
        if (y <= 6) {
           fp_sqr_comba6(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR7)
+#if defined(TFM_SQR7) && FP_SIZE >= 14
        if (y == 7) {
           fp_sqr_comba7(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR8)
+#if defined(TFM_SQR8) && FP_SIZE >= 16
        if (y == 8) {
           fp_sqr_comba8(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR9)
+#if defined(TFM_SQR9) && FP_SIZE >= 18
        if (y == 9) {
           fp_sqr_comba9(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR12)
+#if defined(TFM_SQR12) && FP_SIZE >= 24
        if (y <= 12) {
           fp_sqr_comba12(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR17)
+#if defined(TFM_SQR17) && FP_SIZE >= 34
        if (y <= 17) {
           fp_sqr_comba17(A,B);
-           return;
+           goto clean;
        }
 #endif
 #if defined(TFM_SMALL_SET)
        if (y <= 16) {
           fp_sqr_comba_small(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR20)
+#if defined(TFM_SQR20) && FP_SIZE >= 40
        if (y <= 20) {
           fp_sqr_comba20(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR24)
+#if defined(TFM_SQR24) && FP_SIZE >= 48
        if (y <= 24) {
           fp_sqr_comba24(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR28)
+#if defined(TFM_SQR28) && FP_SIZE >= 56
        if (y <= 28) {
           fp_sqr_comba28(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR32)
+#if defined(TFM_SQR32) && FP_SIZE >= 64
        if (y <= 32) {
           fp_sqr_comba32(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR48)
+#if defined(TFM_SQR48) && FP_SIZE >= 96
        if (y <= 48) {
           fp_sqr_comba48(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR64)
+#if defined(TFM_SQR64) && FP_SIZE >= 128
        if (y <= 64) {
           fp_sqr_comba64(A,B);
-           return;
+           goto clean;
        }
 #endif
       fp_sqr_comba(A, B);
+clean:   /* common exit for all squaring paths */
+    for (y = B->used; y < old_used; y++) {   /* zero the digits of B from its new used count up to its old one */
+       B->dp[y] = 0;
+    }
 }


-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
+/* $Source: /cvs/libtom/tomsfastmath/src/sqr/fp_sqr.c,v $ */
+/* $Revision: 1.1 $ */
+/* $Date: 2006/12/31 21:25:53 $ */
--- a/src/sqr/fp_sqr_comba.c
+++ b/src/sqr/fp_sqr_comba.c
@ -7,7 +7,7 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>

 #if defined(TFM_PRESCOTT) && defined(TFM_SSE2)
   #undef TFM_SSE2
@ -41,7 +41,7 @@ asm(                                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");

 #define SQRADD2(i, j)                                     \
 asm(                                            \
@ -53,16 +53,16 @@ asm(                                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");

 #define SQRADDSC(i, j)                                    \
 asm(                                                     \
-     "movl  %6,%%eax     \n\t"                            \
-     "mull  %7           \n\t"                            \
+     "movl  %3,%%eax     \n\t"                            \
+     "mull  %4           \n\t"                            \
     "movl  %%eax,%0     \n\t"                            \
     "movl  %%edx,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");

 #define SQRADDAC(i, j)                                    \
 asm(                                                     \
@ -71,7 +71,7 @@ asm(                                                     \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");

 #define SQRADDDB                                          \
 asm(                                                     \
@ -81,7 +81,7 @@ asm(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

 #elif defined(TFM_X86_64)
 /* x86-64 optimized */
@ -109,7 +109,7 @@ asm(                                                     \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");

 #define SQRADD2(i, j)                                     \
 asm(                                                     \
@ -121,16 +121,16 @@ asm(                                                     \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");

 #define SQRADDSC(i, j)                                    \
 asm(                                                     \
-     "movq  %6,%%rax     \n\t"                            \
-     "mulq  %7           \n\t"                            \
+     "movq  %3,%%rax     \n\t"                            \
+     "mulq  %4           \n\t"                            \
     "movq  %%rax,%0     \n\t"                            \
     "movq  %%rdx,%1     \n\t"                            \
     "xorq  %2,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");

 #define SQRADDAC(i, j)                                                         \
 asm(                                                     \
@ -139,7 +139,7 @@ asm(                                                     \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

 #define SQRADDDB                                          \
 asm(                                                     \
@ -149,7 +149,7 @@ asm(                                                     \
     "addq %6,%0         \n\t"                            \
     "adcq %7,%1         \n\t"                            \
     "adcq %8,%2         \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

 #elif defined(TFM_SSE2)

@ -181,7 +181,7 @@ asm(                                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");

 #define SQRADD2(i, j)                                     \
 asm(                                            \
@ -197,7 +197,7 @@ asm(                                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");

 #define SQRADDSC(i, j)                                                         \
 asm(                                            \
@ -221,7 +221,7 @@ asm(                                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","cc");

 #define SQRADDDB                                          \
 asm(                                                     \
@ -231,7 +231,7 @@ asm(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

 #elif defined(TFM_ARM)

@ -260,7 +260,7 @@ asm(                                                             \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");

 /* for squaring some of the terms are doubled... */
 #define SQRADD2(i, j)                                            \
@ -272,13 +272,13 @@ asm(                                                             \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

 #define SQRADDSC(i, j)                                           \
 asm(                                                             \
 "  UMULL  %0,%1,%6,%7              \n\t"                         \
 "  SUB    %2,%2,%2                 \n\t"                         \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "cc");

 #define SQRADDAC(i, j)                                           \
 asm(                                                             \
@ -286,7 +286,7 @@ asm(                                                             \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");

 #define SQRADDDB                                                 \
 asm(                                                             \
@ -296,7 +296,7 @@ asm(                                                             \
 "  ADDS  %0,%0,%3                     \n\t"                      \
 "  ADCS  %1,%1,%4                     \n\t"                      \
 "  ADC   %2,%2,%5                     \n\t"                      \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

 #elif defined(TFM_PPC32)

@ -326,7 +326,7 @@ asm(                             \
   " mulhwu 16,%6,%6       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");

 /* for squaring some of the terms are doubled... */
 #define SQRADD2(i, j)            \
@ -339,14 +339,14 @@ asm(                             \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");

 #define SQRADDSC(i, j)            \
 asm(                              \
   " mullw  %0,%6,%7        \n\t" \
   " mulhwu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

 #define SQRADDAC(i, j)           \
 asm(                             \
@ -355,7 +355,7 @@ asm(                             \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");

 #define SQRADDDB                  \
 asm(                              \
@ -365,7 +365,7 @@ asm(                              \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

 #elif defined(TFM_PPC64)
 /* PPC64 */
@ -394,7 +394,7 @@ asm(                             \
   " mulhdu r16,%6,%6       \n\t" \
   " adde   %1,%1,r16       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc");

 /* for squaring some of the terms are doubled... */
 #define SQRADD2(i, j)            \
@ -407,14 +407,14 @@ asm(                             \
   " addc   %0,%0,r16       \n\t" \
   " adde   %1,%1,r17       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc");

 #define SQRADDSC(i, j)            \
 asm(                              \
   " mulld  %0,%6,%7        \n\t" \
   " mulhdu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

 #define SQRADDAC(i, j)           \
 asm(                             \
@ -423,7 +423,7 @@ asm(                             \
   " mulhdu r16,%6,%7       \n\t" \
   " adde   %1,%1,r16       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc");

 #define SQRADDDB                  \
 asm(                              \
@ -433,7 +433,7 @@ asm(                              \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");


 #elif defined(TFM_AVR32)
@ -501,7 +501,7 @@ asm(                              \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

 #elif defined(TFM_MIPS)

@ -571,7 +571,7 @@ asm(                              \
   " mflo   %0             \n\t"  \
   " mfhi   %1             \n\t"  \
   " xor    %2,%2,%2       \n\t"  \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

 #define SQRADDAC(i, j)           \
 asm(                             \
--- a/src/sqr/fp_sqr_comba_12.c
+++ b/src/sqr/fp_sqr_comba_12.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_sqr_comba.c"

-#ifdef TFM_SQR12
+#if defined(TFM_SQR12) && FP_SIZE >= 24
 void fp_sqr_comba12(fp_int *A, fp_int *B)
 {
   fp_digit *a, b[24], c0, c1, c2, sc0, sc1, sc2;
--- a/src/sqr/fp_sqr_comba_17.c
+++ b/src/sqr/fp_sqr_comba_17.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_sqr_comba.c"

-#ifdef TFM_SQR17
+#if defined(TFM_SQR17) && FP_SIZE >= 34
 void fp_sqr_comba17(fp_int *A, fp_int *B)
 {
   fp_digit *a, b[34], c0, c1, c2, sc0, sc1, sc2;
--- a/src/sqr/fp_sqr_comba_20.c
+++ b/src/sqr/fp_sqr_comba_20.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_sqr_comba.c"

-#ifdef TFM_SQR20
+#if defined(TFM_SQR20) && FP_SIZE >= 40
 void fp_sqr_comba20(fp_int *A, fp_int *B)
 {
   fp_digit *a, b[40], c0, c1, c2, sc0, sc1, sc2;
--- a/src/sqr/fp_sqr_comba_24.c
+++ b/src/sqr/fp_sqr_comba_24.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_sqr_comba.c"

-#ifdef TFM_SQR24
+#if defined(TFM_SQR24) && FP_SIZE >= 48
 void fp_sqr_comba24(fp_int *A, fp_int *B)
 {
   fp_digit *a, b[48], c0, c1, c2, sc0, sc1, sc2;
--- a/Show More
+++ b/Show More