Merge branch 'release/0.13.0'

2015-10-24 14:56:46 +02:00 · 2015-10-24 14:56:46 +02:00 · 5c395e04e6
commit 5c395e04e6
parent da88c2d42f a170b64106
124 changed files with 16680 additions and 2783 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,27 @@
 *.[ao]
 *.aux
 *.dvi
 *.idx
 *.ilg
 *.ind
 *.lof
 *.log
 *.toc
 *.out
 *.l[ao]
 *.orig
 .project
 .cproject
 /.libs
 test_*.txt
 test
 test.exe
 mtest
 mtest.exe
 stest
 stest.exe
 rsatest
 rsatest.exe
 timing
 timing.exe
--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,32 @@
 language: c
 compiler:
  - gcc
 script: CC="${MYCC}" make ${SHARED} test_standalone >test_gcc_1.txt 2>test_gcc_2.txt && ./test >test_std.txt 2>test_err.txt
 env:
  - MYCC="gcc" SHARED=""
  - MYCC="gcc -m32" SHARED=""
  - MYCC="gcc-4.8" SHARED=""
  - MYCC="gcc-4.8 -m32" SHARED=""
  - MYCC="gcc-4.9" SHARED=""
  - MYCC="gcc-4.9 -m32" SHARED=""
  - MYCC="gcc" SHARED="-f makefile.shared"
  - MYCC="gcc -m32" SHARED="-f makefile.shared"
  - MYCC="gcc-4.8" SHARED="-f makefile.shared"
  - MYCC="gcc-4.8 -m32" SHARED="-f makefile.shared"
  - MYCC="gcc-4.9" SHARED="-f makefile.shared"
  - MYCC="gcc-4.9 -m32" SHARED="-f makefile.shared"
 matrix:
  fast_finish: true
 before_script:
  - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
  - sudo apt-get -qq update
  - sudo apt-get install gcc-4.9-multilib gcc-4.8-multilib gcc-multilib build-essential
 after_failure:
  - cat test_gcc_1.txt
  - cat test_std.txt
  - cat test_err.txt
 after_script:
  - cat test_gcc_2.txt
 notifications:
  irc: "chat.freenode.net#libtom"
--- a/35
+++ b/35
@ -1,7 +1,36 @@
-TomsFastMath is public domain.
+TomsFastMath is licensed under DUAL licensing terms.
 Choose and use the license of your needs.
 [LICENSE #1]
 TomsFastMath is public domain.  As should all quality software be.
 Tom St Denis
 [/LICENSE #1]
 [LICENSE #2]
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004
 Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
  0. You just DO WHAT THE FUCK YOU WANT TO. 
 [/LICENSE #2]
 -- Mark Karpelès & Steffen Jaeckel
 Note some ideas were borrowed from LibTomMath and OpenSSL.  All of the code is original or ported
-from LibTomMath [no code was ported from OpenSSL].  As such the origins and status of this code
+from LibTomMath [no code was ported from OpenSSL].
 are both public domain.
 -- Tom St Denis
--- a/README.md
+++ b/README.md
@ -0,0 +1,11 @@
 tomsfastmath
 ============
 See doc/tfm.pdf for detailed documentation.
 Project Status
 ==============
 master: [![Build Status](https://travis-ci.org/libtom/tomsfastmath.svg?branch=master)](https://travis-ci.org/libtom/tomsfastmath)
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,15 @@
 October 24th, 2015
 v0.13.0
     -- Add fp_rand()
     -- Fix bug in fp_sub() reported by Martins Mozeiko
     -- Fix bugs/apply patches in fp_mul() and fp_sqr() reported by rasky
     -- Fix bugs in fp_read_radix()
     -- Fix build issues for Linux x32 ABI
     -- Sebastian Siewior provided fp_toradix_n(),
        reported multiple issues on behalf of ClamAV
        and did most of the testing work to be able to push this release out.
     -- Fix a load of compiler warnings.
 March 14th, 2007
 0.12 -- Christophe Devine contributed MIPS asm w00t
     ++ quick release to get the MIPS code out there
--- a/demo/stest.c
+++ b/demo/stest.c
@ -8,8 +8,32 @@
 #ifndef DISPLAY
   #define DISPLAY(x) printf(x)
   #define DISPLAY_P(...) printf(__VA_ARGS__)
 #else
   #define DISPLAY_P(...) (void)0
   #define fp_dump(n,p) do{}while(0)
 #endif
 #ifndef fp_dump
 /* Debug helper: print the value of p, labelled with n, through DISPLAY_P.
  *
  * The text buffer is sized by asking fp_radix_size() for the radix-2 length
  * and is then reused for every conversion below (presumably binary is the
  * longest representation, so it also fits hex and decimal -- confirm against
  * fp_radix_size()).  The decimal form is always printed; when STEST_VERBOSE
  * is defined the binary and hexadecimal forms are printed first.
  *
  * Nothing is printed if the size query does not return FP_OKAY or if the
  * allocation fails.
  */
 void fp_dump(const char* n, fp_int* p)
 {
    int   len;
    char *digits;

    if (fp_radix_size(p, 2, &len) == FP_OKAY) {
       digits = malloc(len);
       if (digits) {
 #ifdef STEST_VERBOSE
          fp_toradix(p, digits, 2);
          DISPLAY_P("%s = 0b%s\n", n, digits);
          fp_toradix(p, digits, 16);
          DISPLAY_P("%s = 0x%s\n", n, digits);
 #endif
          fp_toradix(p, digits, 10);
          DISPLAY_P("%s = %s\n", n, digits);
          free(digits);
       }
    }
 }
 #endif
 #ifdef GBA_MODE
 int c_main(void)
@ -33,6 +57,8 @@ int main(void)
   modetxt_gotoxy(0,0);
 #endif
   DISPLAY_P("TFM Ident string:\n%s\n\n", fp_ident());
   /* test multiplication */
   fp_read_radix(&a, "3453534534535345345341230891273", 10);
   fp_read_radix(&b, "2394873294871238934718923" , 10);
@ -40,7 +66,7 @@ int main(void)
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("mul passed\n");
   }
@ -52,7 +78,7 @@ int main(void)
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("mul passed\n");
   }
@ -64,7 +90,7 @@ int main(void)
   fp_mul(&a, &b, &d);
   if (fp_cmp(&c, &d)) {
      DISPLAY("mul failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("mul passed\n");
   }
@ -75,7 +101,7 @@ int main(void)
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("sqr passed\n");
   }
@ -85,7 +111,7 @@ int main(void)
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("sqr passed\n");
   }
@ -95,7 +121,7 @@ int main(void)
   fp_sqr(&a, &c);
   if (fp_cmp(&c, &b)) {
      DISPLAY("sqr failed\n");
-      return 0;
+      return -1;
   } else {
      DISPLAY("sqr passed\n");
   }
@ -104,12 +130,19 @@ int main(void)
   /* montgomery reductions */
   fp_read_radix(&a, "234892374892374893489123428937892781237863278637826327367637836278362783627836783678363", 10);
   fp_read_radix(&b, "4447823492749823749234123489273987393983289319382762756425425425642727352327452374521", 10);
 #ifdef FP_64BIT
   fp_read_radix(&c, "942974496560863503657226741422301598807235487941674147660989764036913926327577165648", 10);
 #else
   fp_read_radix(&c, "2396271882990732698083317035605836523697277786556053771759862552557086442129695099100", 10);
-   fp_montgomery_setup(&b, &dp);
+#endif
   if (fp_montgomery_setup(&b, &dp) != FP_OKAY)
      DISPLAY("mont setup failed\n");
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
-      return 0;
+      fp_dump("a (is    )", &a);
      fp_dump("c (should)", &c);
      return -1;
   } else {
      DISPLAY("mont passed\n");
   }
@ -117,11 +150,14 @@ int main(void)
   fp_read_radix(&a, "2348923748923748934891234456645654645645684576353428937892781237863278637826327367637836278362783627836783678363", 10);
   fp_read_radix(&b, "444782349274982374923412348927398739398328931938276275642542542564272735232745237452123424324324444121111119", 10);
   fp_read_radix(&c, "45642613844554582908652603086180267403823312390990082328515008314514368668691233331246183943400359349283420", 10);
-   fp_montgomery_setup(&b, &dp);
+   if (fp_montgomery_setup(&b, &dp) != FP_OKAY)
      DISPLAY("mont setup failed\n");
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
-      return 0;
+      fp_dump("a (is    )", &a);
      fp_dump("c (should)", &c);
      return -1;
   } else {
      DISPLAY("mont passed\n");
   }
@ -129,11 +165,14 @@ int main(void)
   fp_read_radix(&a, "234823424242342923748923748934891234456645654645645684576353424972378234762378623891236834132352375235378462378489378927812378632786378263273676378362783627555555555539568389052478124618461834763837685723645827529034853490580134568947341278498542893481762349723907847892983627836783678363", 10);
   fp_read_radix(&b, "44478234927456563455982374923412348927398739398328931938276275642485623481638279025465891276312903262837562349056234783648712314678120389173890128905425242424239784256427", 10);
   fp_read_radix(&c, "33160865265453361650564031464519042126185632333462754084489985719613480783282357410514898819797738034600484519472656152351777186694609218202276509271061460265488348645081", 10);
-   fp_montgomery_setup(&b, &dp);
+   if (fp_montgomery_setup(&b, &dp) != FP_OKAY)
      DISPLAY("mont setup failed\n");
   fp_montgomery_reduce(&a, &b, dp);
   if (fp_cmp(&a, &c)) {
      DISPLAY("mont failed\n");
-      return 0;
+      fp_dump("a (is    )", &a);
      fp_dump("c (should)", &c);
      return -1;
   } else {
      DISPLAY("mont passed\n");
   }
--- a/demo/test.c
+++ b/demo/test.c
@ -1,12 +1,23 @@
 /* TFM demo program */
 #include <tfm.h>
 #include <time.h>
 #include <unistd.h>
 #ifndef TFM_DEMO_TEST_VS_MTEST
 #define TFM_DEMO_TEST_VS_MTEST 1
 #endif
 /* Print a one-line dump of an fp_int to stdout: the `used` count and the
  * `sign` field, followed by the digits dp[used-1] down to dp[0] in
  * zero-padded hex (8 hex chars when SIZEOF_FP_DIGIT == 4, otherwise 16),
  * terminated by a newline. */
 void draw(fp_int *a)
 {
    int idx = a->used;

    printf("%d, %d, ", a->used, a->sign);
    /* walk the digit array from the highest used index down to index 0 */
    while (idx-- > 0) {
 #if SIZEOF_FP_DIGIT == 4
       printf("%08lx ", a->dp[idx]);
 #else
       printf("%016llx ", a->dp[idx]);
 #endif
    }
    printf("\n");
 }
@ -14,71 +25,33 @@ void draw(fp_int *a)
 /* Fill the first `len` bytes of `dst` with the low 8 bits of successive
  * rand() results.  `dat` is accepted but not used.  Returns `len` unchanged
  * (a non-positive `len` writes nothing). */
 int myrng(unsigned char *dst, int len, void *dat)
 {
    unsigned char *out = dst;
    int remaining;

    (void)dat;
    for (remaining = len; remaining > 0; remaining--) {
       *out++ = rand() & 0xFF;
    }
    return len;
 }
 /* RDTSC from Scott Duplichan */
 /* Return a raw tick count read from a platform-specific counter.
  * The implementation is selected at compile time from the compiler and
  * architecture macros below; a build that matches none of them stops at
  * the #error at the bottom.  The unit of the returned value depends on
  * the branch taken. */
 static ulong64 TIMFUNC (void)
   {
   #if defined __GNUC__
      #if defined(INTEL_CC)
         /* rdtsc with the result bound through the "=A" output constraint */
 	 ulong64 a;
         asm ("rdtsc":"=A"(a));
         return a;
      #elif defined(__i386__) || defined(__x86_64__)
         /* rdtsc, then EAX is stored at the address of `a` and EDX at
          * offset 4 from it.
          * NOTE(review): `a` is passed as an *input* "m" operand although
          * the asm writes to it, and no "memory" clobber is listed --
          * confirm the compiler cannot reorder or drop the store. */
         ulong64 a;
         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
         return a;
      #elif defined(TFM_PPC32) 
         /* mftb result goes to `a` (operand %0), mftbu result to `b`
          * (operand %1); `b` becomes the upper 32 bits of the return value. */
         unsigned long a, b;
         __asm__ __volatile__ ("mftbu %1 \nmftb %0\n":"=r"(a), "=r"(b));
         return (((ulong64)b) << 32ULL) | ((ulong64)a);
      #elif defined(TFM_AVR32) 
         /* Reads a decimal number from the sysfs file below and returns it.
          * NOTE(review): the fopen() and fgets() results are not checked. */
 	 FILE *in;
         char buf[20];
 	 in = fopen("/sys/devices/system/cpu/cpu0/pccycles", "r");
 	 fgets(buf, 20, in);
 	 fclose(in);
 	 return strtoul(buf, NULL, 10);
      #else /* gcc-IA64 version */
         /* Reads ar.itc, and reads it again for as long as the value
          * truncated to int equals -1 (presumably guarding against a bad
          * read -- confirm). */
         unsigned long result;
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
         while (__builtin_expect ((int) result == -1, 0))
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
         return result;
      #endif
   // Microsoft and Intel Windows compilers
   #elif defined _M_IX86
     /* NOTE(review): this branch has no return statement; it presumably
      * relies on the value rdtsc leaves in EDX:EAX being used as the
      * function result -- confirm for the compiler in use. */
     __asm rdtsc
   #elif defined _M_AMD64
     return __rdtsc ();
   #elif defined _M_IA64
     #if defined __INTEL_COMPILER
       #include <ia64intrin.h>
     #endif
      /* NOTE(review): 3116 is presumably the register index of the interval
       * time counter -- confirm against the intrinsic's documentation. */
      return __getReg (3116);
   #else
     #error need rdtsc function for this build
   #endif
   }
   char cmd[4096], buf[4096];
 int main(void)
 {
  fp_int a,b,c,d,e,f;
  unsigned long ix;
 #if TFM_DEMO_TEST_VS_MTEST
  unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
                 div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, cnt, rr;
 #else
  fp_digit fp;
  int n, err;
-   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
+#endif
                 div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
   ulong64 t1, t2;
  srand(time(NULL));
  printf("TFM Ident string:\n%s\n\n", fp_ident());
  fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
-  fp_zero(&a); draw(&a);
+  fp_zero(&a);
 #if TFM_DEMO_TEST_VS_MTEST == 0
  draw(&a);
  /* test set and simple shifts */
  printf("Testing mul/div 2\n");
@ -134,6 +107,10 @@ int main(void)
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  fp_read_radix(&a, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF000000000000000000000001", 16); draw(&a);
  fp_sub_d(&a, 3, &b); draw(&b);
  fp_read_radix(&a, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFE", 16);
  printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
  /* test mul_d */
  printf("Testing mul_d and div_d\n");
@ -150,7 +127,6 @@ int main(void)
  printf("Testing read_radix\n");
  fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
 #if 0
  /* test mont */
  printf("Montgomery test #1\n");
  fp_set(&a, 0x1234567ULL);
@ -208,421 +184,10 @@ int main(void)
       }
   }
   printf("\n\n");
 #endif
 #ifdef TESTING
 goto testing;
 #endif
 #if 1
 t1 = TIMFUNC();
 sleep(1);
 printf("Ticks per second: %llu\n", TIMFUNC() - t1);
 goto multtime;
 /* do some timings... */
  printf("Addition:\n");
  for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix;
          b.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      t2 = -1;
      for (ix = 0; ix < 25000; ++ix) {
          t1 = TIMFUNC();
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          fp_add(&a, &b, &c); fp_add(&a, &b, &c);
          t2 = (TIMFUNC() - t1)>>3;
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 multtime:
  printf("Multiplication:\n");
  for (t = 2; t < FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix;
          b.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      t2 = -1;
      for (ix = 0; ix < 100; ++ix) {
          t1 = TIMFUNC();
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
          t2 = (TIMFUNC() - t1)>>7;
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 //#else
 sqrtime:
  printf("Squaring:\n");
  for (t = 2; t < FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix;
      }
      a.used = t;
      t2 = -1;
      for (ix = 0; ix < 100; ++ix) {
          t1 = TIMFUNC();
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          fp_sqr(&a, &b); fp_sqr(&a, &b);
          t2 = (TIMFUNC() - t1)>>7;
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 invmodtime:
  printf("Invmod:\n");
  for (t = 2; t < FP_SIZE/2; t += 2) {
     fp_zero(&a);
     for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix | 1;
     }
     a.used = t;
     fp_zero(&b);
     for (ix = 0; ix < t; ix++) {
         b.dp[ix] = rand();
     }
     b.used = t;
     fp_clamp(&b);
     fp_zero(&c);
     t2 = -1;
     for (ix = 0; ix < 100; ++ix) {
          t1 = TIMFUNC();
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          fp_invmod(&b, &a, &c);
          t2 = (TIMFUNC() - t1)>>6;
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 //#else
 monttime:
  printf("Montgomery:\n");
  for (t = 2; t <= (FP_SIZE/2)-4; t += 2) {
 //      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
      fp_zero(&a);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix | 1;
      }
      a.used = t;
     fp_montgomery_setup(&a, &fp);
     fp_sub_d(&a, 3, &b);
     fp_sqr(&b, &b);      
     fp_copy(&b, &c);      
     fp_copy(&b, &d);      
     t2 = -1;
     for (ix = 0; ix < 100; ++ix) {
          t1 = TIMFUNC();
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          fp_montgomery_reduce(&c, &a, &fp);
          fp_montgomery_reduce(&d, &a, &fp);
          t2 = (TIMFUNC() - t1)>>6;
          fp_copy(&b, &c);      
          fp_copy(&b, &d);      
          if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
 //#else
 expttime:
  printf("Exptmod:\n");
  for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += 256/DIGIT_BIT) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
          a.dp[ix] = ix+1;
          b.dp[ix] = (fp_digit)rand() * (fp_digit)rand();
          c.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      c.used = t;
     t2 = -1;
     for (ix = 0; ix < 500; ++ix) {
          t1 = TIMFUNC();
          fp_exptmod(&c, &b, &a, &d);
          fp_exptmod(&c, &b, &a, &d);
          t2 = (TIMFUNC() - t1)>>1;
          fp_copy(&b, &c);      
          fp_copy(&b, &d);      
          if (t1<t2) { t2 = t1; --ix; }
     }
     printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
  }
  return 0;
 #endif
 return 0;
-testing:
+
 #else
  fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); fp_zero(&a);
@ -643,7 +208,7 @@ testing:
          fp_mul_2d(&a, rr, &a);
          a.sign = b.sign;
          if (fp_cmp(&a, &b) != FP_EQ) {
-             printf("mul2d failed, rr == %lu\n",rr);
+             printf("\nmul2d failed, rr == %lu\n",rr);
             draw(&a);
             draw(&b);
             return 0;
@ -657,7 +222,7 @@ testing:
          a.sign = b.sign;
          if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; }
          if (fp_cmp(&a, &b) != FP_EQ) {
-             printf("div2d failed, rr == %lu\n",rr);
+             printf("\ndiv2d failed, rr == %lu\n",rr);
             draw(&a);
             draw(&b);
             return 0;
@ -669,7 +234,7 @@ testing:
          fp_copy(&a, &d);
          fp_add(&d, &b, &d);
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("add %lu failure!\n", add_n);
+             printf("\nadd %lu failure!\n", add_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
@ -681,7 +246,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
          fp_read_signed_bin(&d, (unsigned char *)cmd, rr);
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("fp_signed_bin failure!\n");
+             printf("f\np_signed_bin failure!\n");
             draw(&c);
             draw(&d);
             return 0;
@ -692,7 +257,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
          fp_read_unsigned_bin(&d, (unsigned char *)cmd, rr);
          if (fp_cmp_mag(&c, &d) != FP_EQ) {
-             printf("fp_unsigned_bin failure!\n");
+             printf("\nfp_unsigned_bin failure!\n");
             draw(&c);
             draw(&d);
             return 0;
@ -705,98 +270,98 @@ draw(&a);draw(&b);draw(&c);draw(&d);
          fp_copy(&a, &d);
          fp_sub(&d, &b, &d);
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("sub %lu failure!\n", sub_n);
+             printf("\nsub %lu failure!\n", sub_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
-       } else if (!strcmp(cmd, "mul")) { 
+       } else if (!strcmp(cmd, "mul")) { ++mul_n;
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
          fp_copy(&a, &d);
-          fp_mul(&d, &b, &d); ++mul_n;
+          fp_mul(&d, &b, &d);
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("mul %lu failure!\n", mul_n);
+             printf("\nmul %lu failure!\n", mul_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
-       } else if (!strcmp(cmd, "div")) { 
+       } else if (!strcmp(cmd, "div")) { ++div_n;
          fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&c, buf, 64);
          fgets(buf, 4095, stdin); fp_read_radix(&d, buf, 64);
 // continue;
-          fp_div(&a, &b, &e, &f); ++div_n;
+          fp_div(&a, &b, &e, &f);
          if (fp_cmp(&c, &e) != FP_EQ || fp_cmp(&d, &f) != FP_EQ) {
-             printf("div %lu failure!\n", div_n);
+             printf("\ndiv %lu failure!\n", div_n);
 draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
             return 0;
          }
-       } else if (!strcmp(cmd, "sqr")) { 
+       } else if (!strcmp(cmd, "sqr")) { ++sqr_n;
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
 // continue;
          fp_copy(&a, &c);
-          fp_sqr(&c, &c); ++sqr_n;
+          fp_sqr(&c, &c);
          if (fp_cmp(&b, &c) != FP_EQ) {
-             printf("sqr %lu failure!\n", sqr_n);
+             printf("\nsqr %lu failure!\n", sqr_n);
 draw(&a);draw(&b);draw(&c);
             return 0;
          }
-       } else if (!strcmp(cmd, "gcd")) { 
+       } else if (!strcmp(cmd, "gcd")) { ++gcd_n;
          fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
          fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 // continue;
          fp_copy(&a, &d);
-          fp_gcd(&d, &b, &d); ++gcd_n;
+          fp_gcd(&d, &b, &d);
          d.sign = c.sign;
          if (fp_cmp(&c, &d) != FP_EQ) {
-             printf("gcd %lu failure!\n", gcd_n);
+             printf("\ngcd %lu failure!\n", gcd_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
-       } else if (!strcmp(cmd, "lcm")) { 
+       } else if (!strcmp(cmd, "lcm")) { ++lcm_n;
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
             fp_copy(&a, &d);
-             fp_lcm(&d, &b, &d); ++lcm_n;
+             fp_lcm(&d, &b, &d);
             d.sign = c.sign;
             if (fp_cmp(&c, &d) != FP_EQ) {
-                printf("lcm %lu failure!\n", lcm_n);
+                printf("\nlcm %lu failure!\n", lcm_n);
   draw(&a);draw(&b);draw(&c);draw(&d);
                return 0;
             }
-       } else if (!strcmp(cmd, "expt")) {  
+       } else if (!strcmp(cmd, "expt")) { ++expt_n;
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&d, buf, 64);
 // continue;
             fp_copy(&a, &e);
-             fp_exptmod(&e, &b, &c, &e); ++expt_n;
+             fp_exptmod(&e, &b, &c, &e);
             if (fp_cmp(&d, &e) != FP_EQ) {
-                printf("expt %lu failure!\n", expt_n);
+                printf("\nexpt %lu failure!\n", expt_n);
   draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);
                return 0;
             }
-       } else if (!strcmp(cmd, "invmod")) {  
+       } else if (!strcmp(cmd, "invmod")) { ++inv_n;
             fgets(buf, 4095, stdin);  fp_read_radix(&a, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fgets(buf, 4095, stdin);  fp_read_radix(&c, buf, 64);
 //continue;
             fp_invmod(&a, &b, &d);
 #if 1
-             fp_mulmod(&d,&a,&b,&e); ++inv_n;
+             fp_mulmod(&d,&a,&b,&e);
             if (fp_cmp_d(&e, 1) != FP_EQ) {
 #else
             if (fp_cmp(&d, &c) != FP_EQ) {
 #endif
-                printf("inv [wrong value from MPI?!] failure\n");
+                printf("\ninv [wrong value from MPI?!] failure\n");
                draw(&a);draw(&b);draw(&c);draw(&d);
                return 0;
             }
@ -806,7 +371,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fp_div_2(&a, &c);
             if (fp_cmp(&c, &b) != FP_EQ) {
-                 printf("div_2 %lu failure\n", div2_n);
+                 printf("\ndiv_2 %lu failure\n", div2_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -817,7 +382,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
             fgets(buf, 4095, stdin);  fp_read_radix(&b, buf, 64);
             fp_mul_2(&a, &c);
             if (fp_cmp(&c, &b) != FP_EQ) {
-                 printf("mul_2 %lu failure\n", mul2_n);
+                 printf("\nmul_2 %lu failure\n", mul2_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -829,7 +394,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_add_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
-                 printf("add_d %lu failure\n", add_d_n);
+                 printf("\nadd_d %lu failure\n", add_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -842,7 +407,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_sub_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
-                 printf("sub_d %lu failure\n", sub_d_n);
+                 printf("\nsub_d %lu failure\n", sub_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -855,7 +420,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
              fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
              fp_mul_d(&a, ix, &c);
              if (fp_cmp(&b, &c) != FP_EQ) {
-                 printf("mul_d %lu failure\n", sub_d_n);
+                 printf("\nmul_d %lu failure\n", mul_d_n);
                 draw(&a);
                 draw(&b);
                 draw(&c);
@ -865,6 +430,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
       }
   }
 #endif
 }
--- a/demo/timing.c
+++ b/demo/timing.c
@ -0,0 +1,625 @@
 /* TFM timing analysis */
 #include <tfm.h>
 #include <time.h>
 #include <unistd.h>
 /* RDTSC from Scott Duplichan */
 /* Read a free-running cycle / time-base counter for the current target.
  * Each preprocessor branch supplies one platform-specific way of getting
  * the counter; a build that matches none of them stops with #error.
  * The TFM_AVR32 branch returns 0 when the counter file cannot be read.
  */
 static ulong64 TIMFUNC(void)
 {
 #if defined __GNUC__
   #if defined(INTEL_CC)
      ulong64 a;
      asm ("rdtsc":"=A"(a));
      return a;
   #elif defined(__i386__) || defined(__x86_64__)
      /* version from http://www.mcs.anl.gov/~kazutomo/rdtsc.html
       * the old code always got a warning issued by gcc, clang did not complain...
       */
      unsigned hi, lo;
      __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
      return ((ulong64)lo)|( ((ulong64)hi)<<32);
   #elif defined(TFM_PPC32)
      /* The counter is fetched as two separate halves.  Re-read the upper
       * half afterwards and retry if it changed, so a carry out of the lower
       * half between the reads cannot yield a torn 64-bit value.
       */
      unsigned long hi, lo, chk;
      do {
         __asm__ __volatile__ ("mftbu %0 \nmftb %1 \nmftbu %2\n":"=r"(hi), "=r"(lo), "=r"(chk));
      } while (hi != chk);
      return (((ulong64)hi) << 32ULL) | ((ulong64)lo);
   #elif defined(TFM_AVR32)
      /* The counter is exposed as decimal text in a sysfs file. */
      FILE *in;
      char buf[20];
      unsigned long cycles = 0;
      in = fopen("/sys/devices/system/cpu/cpu0/pccycles", "r");
      if (in != NULL) {
         /* only parse buf when fgets() actually filled it */
         if (fgets(buf, sizeof(buf), in) != NULL) {
            cycles = strtoul(buf, NULL, 10);
         }
         fclose(in);
      }
      return cycles;
   #else /* gcc-IA64 version */
      unsigned long result;
      __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
      /* re-read while the low 32 bits of the value are all ones */
      while (__builtin_expect ((int) result == -1, 0))
      __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
      return result;
   #endif
 // Microsoft and Intel Windows compilers
 #elif defined _M_IX86
   /* no return statement: presumably relies on the value rdtsc leaves in EDX:EAX */
  __asm rdtsc
 #elif defined _M_AMD64
  return __rdtsc ();
 #elif defined _M_IA64
  #if defined __INTEL_COMPILER
    #include <ia64intrin.h>
  #endif
   return __getReg (3116);
 #else
  #error need rdtsc function for this build
 #endif
 }
 static ulong64 ticks;
 /* Label of the benchmark currently running; print_line() writes it into each record. */
 static const char *p_str;
 /* Select the label used by subsequent print_line() calls.
  * Only the pointer is stored, so the string must stay valid while it is in use.
  */
 static void print_start(const char *s)
 {
    p_str = s;
 }
 /* Write one semicolon-separated result record to stdout:
  *    <ticks>;<current label>;<b>;<t>
  * main() passes the operand size in bits as b and the measured tick count as t.
  * The values are cast explicitly so that the arguments match the %llu
  * conversions whatever integer type ulong64 is defined as.
  */
 static void print_line(ulong64 b, ulong64 t)
 {
    printf("%llu;%s;%llu;%llu\n",
           (unsigned long long)ticks, p_str,
           (unsigned long long)b, (unsigned long long)t);
 }
 /* Expand a statement 2, 4, ... 128 times in place, so the timed regions in
  * main() remain straight-line code with no loop overhead between the calls.
  */
 #define TIMING_REP2(x)   x x
 #define TIMING_REP4(x)   TIMING_REP2(x) TIMING_REP2(x)
 #define TIMING_REP8(x)   TIMING_REP4(x) TIMING_REP4(x)
 #define TIMING_REP16(x)  TIMING_REP8(x) TIMING_REP8(x)
 #define TIMING_REP32(x)  TIMING_REP16(x) TIMING_REP16(x)
 #define TIMING_REP64(x)  TIMING_REP32(x) TIMING_REP32(x)
 #define TIMING_REP128(x) TIMING_REP64(x) TIMING_REP64(x)
 /* Benchmark driver.
  * Measures the tick rate across a one second sleep, then times addition,
  * multiplication, squaring, modular inversion, Montgomery reduction and
  * modular exponentiation over a range of operand sizes, printing one
  * semicolon-separated record per size to stdout.
  *
  * Every measurement keeps the smallest per-call tick count seen: t2 starts
  * at the maximum value, each sample is stored in t1, and an improving sample
  * replaces t2 and does not count towards the iteration limit (--ix cancels
  * the loop's ++ix; ix is unsigned, so the wrap at zero is well defined).
  *
  * Returns 0.
  */
 int main(void)
 {
   fp_int a,b,c,d;
   ulong64 t1, t2;
   fp_digit fp;
   unsigned long t, ix;
   /* calibrate: ticks elapsed across a one second sleep */
   t1 = TIMFUNC();
   sleep(1);
   ticks = TIMFUNC() - t1;
   fprintf(stderr, "Ticks per second: %llu\n", (unsigned long long)ticks);
   printf("Ticks/sec;Algorithm;bits;time\n");
   /* do some timings... */
   print_start("Addition");
   for (t = 2; t <= FP_SIZE / 2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix;
         b.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      t2 = -1;
      for (ix = 0; ix < 25000; ++ix) {
         t1 = TIMFUNC();
         TIMING_REP8(fp_add(&a, &b, &c);)
         t1 = (TIMFUNC() - t1) >> 3;   /* 8 calls per sample */
         if (t1 < t2) {
            --ix;
            t2 = t1;
         }
      }
      print_line(t * DIGIT_BIT, t2);
   }
   print_start("Multiplication");
   for (t = 2; t < FP_SIZE / 2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix;
         b.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      t2 = -1;
      for (ix = 0; ix < 100; ++ix) {
         t1 = TIMFUNC();
         TIMING_REP128(fp_mul(&a, &b, &c);)
         t1 = (TIMFUNC() - t1) >> 7;   /* 128 calls per sample */
         if (t1 < t2) {
            --ix;
            t2 = t1;
         }
      }
      print_line(t * DIGIT_BIT, t2);
   }
   print_start("Squaring");
   for (t = 2; t < FP_SIZE / 2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix;
      }
      a.used = t;
      t2 = -1;
      for (ix = 0; ix < 100; ++ix) {
         t1 = TIMFUNC();
         TIMING_REP128(fp_sqr(&a, &b);)
         t1 = (TIMFUNC() - t1) >> 7;   /* 128 calls per sample */
         if (t1 < t2) {
            --ix;
            t2 = t1;
         }
      }
      print_line(t * DIGIT_BIT, t2);
   }
   print_start("Invmod");
   for (t = 2; t < FP_SIZE / 2; t += 2) {
      /* a: every digit odd; b: digits taken from rand() */
      fp_zero(&a);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix | 1;
      }
      a.used = t;
      fp_zero(&b);
      for (ix = 0; ix < t; ix++) {
         b.dp[ix] = rand();
      }
      b.used = t;
      fp_clamp(&b);
      fp_zero(&c);
      t2 = -1;
      for (ix = 0; ix < 100; ++ix) {
         t1 = TIMFUNC();
         TIMING_REP64(fp_invmod(&b, &a, &c);)
         t1 = (TIMFUNC() - t1) >> 6;   /* 64 calls per sample */
         if (t1 < t2) {
            --ix;
            t2 = t1;
         }
      }
      print_line(t * DIGIT_BIT, t2);
   }
   print_start("Montgomery");
   for (t = 2; t <= (FP_SIZE / 2) - 4; t += 2) {
      fp_zero(&a);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix | 1;
      }
      a.used = t;
      fp_montgomery_setup(&a, &fp);
      /* b = (a - 3)^2 is the value handed to the reductions via c and d */
      fp_sub_d(&a, 3, &b);
      fp_sqr(&b, &b);
      fp_copy(&b, &c);
      fp_copy(&b, &d);
      t2 = -1;
      for (ix = 0; ix < 100; ++ix) {
         t1 = TIMFUNC();
         TIMING_REP32(fp_montgomery_reduce(&c, &a, fp); fp_montgomery_reduce(&d, &a, fp);)
         t1 = (TIMFUNC() - t1) >> 6;   /* 64 calls per sample */
         /* restore the operands for the next sample */
         fp_copy(&b, &c);
         fp_copy(&b, &d);
         if (t1 < t2) {
            --ix;
            t2 = t1;
         }
      }
      print_line(t * DIGIT_BIT, t2);
   }
   print_start("Exptmod");
   for (t = 512 / DIGIT_BIT; t <= (FP_SIZE / 2) - 2; t += 256 / DIGIT_BIT) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix + 1;
         b.dp[ix] = (fp_digit) rand() * (fp_digit) rand();
         c.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      c.used = t;
      t2 = -1;
      for (ix = 0; ix < 500; ++ix) {
         t1 = TIMFUNC();
         TIMING_REP2(fp_exptmod(&c, &b, &a, &d);)
         t1 = (TIMFUNC() - t1) >> 1;   /* 2 calls per sample */
         /* NOTE(review): these copies overwrite c with b after the first
          * sample; they look carried over from the Montgomery section -
          * confirm whether that is intended. */
         fp_copy(&b, &c);
         fp_copy(&b, &d);
         if (t1 < t2) {
            t2 = t1;
            --ix;
         }
      }
      print_line(t * DIGIT_BIT, t2);
   }
   return 0;
 }
--- a/doc/tfm.pdf
+++ b/doc/tfm.pdf
--- a/gen.pl
+++ b/gen.pl
@ -6,7 +6,7 @@
 use strict;
 open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
-foreach my $filename (glob "*fp_*.c") {
+foreach my $filename (glob "src/*/*fp_*.c") {
   next if ($filename eq "fp_sqr_comba_generic.c");
   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
   print OUT "/* Start: $filename */\n";
--- a/libtfm.symbols
+++ b/libtfm.symbols
@ -0,0 +1,49 @@
 fp_2expt
 fp_add
 fp_add_d
 fp_addmod
 fp_cmp
 fp_cmp_d
 fp_cmp_mag
 fp_cnt_lsb
 fp_count_bits
 fp_div
 fp_div_2
 fp_div_2d
 fp_div_d
 fp_exptmod
 fp_gcd
 fp_ident
 fp_invmod
 fp_isprime
 fp_lcm
 fp_lshd
 fp_mod
 fp_mod_2d
 fp_mod_d
 fp_montgomery_calc_normalization
 fp_montgomery_reduce
 fp_montgomery_setup
 fp_mul
 fp_mul_2
 fp_mul_2d
 fp_mul_d
 fp_mulmod
 fp_prime_random_ex
 fp_radix_size
 fp_read_radix
 fp_read_signed_bin
 fp_read_unsigned_bin
 fp_rshd
 fp_set
 fp_signed_bin_size
 fp_sqr
 fp_sqrmod
 fp_sub
 fp_sub_d
 fp_submod
 fp_to_signed_bin
 fp_to_unsigned_bin
 fp_toradix
 fp_toradix_n
 fp_unsigned_bin_size
--- a/158
+++ b/158
@ -1,10 +1,22 @@
 #makefile for TomsFastMath
 #
 #
-VERSION=0.12
+VERSION=0.13
 CFLAGS += -Wall -W -Wshadow -Isrc/headers
 # Compiler and Linker Names
 ifndef PREFIX
  PREFIX=
 endif
 ifeq ($(CC),cc)
  CC = $(PREFIX)gcc
 endif
 LD=$(PREFIX)ld
 AR=$(PREFIX)ar
 RANLIB=$(PREFIX)ranlib
 ifndef MAKE
   MAKE=make
 endif
@ -27,27 +39,29 @@ OBJECTS=src/addsub/fp_add.o src/addsub/fp_add_d.o src/addsub/fp_addmod.o src/add
 src/addsub/fp_cmp_d.o src/addsub/fp_cmp_mag.o src/addsub/fp_sub.o src/addsub/fp_sub_d.o \
 src/addsub/fp_submod.o src/addsub/s_fp_add.o src/addsub/s_fp_sub.o src/bin/fp_radix_size.o \
 src/bin/fp_read_radix.o src/bin/fp_read_signed_bin.o src/bin/fp_read_unsigned_bin.o \
-src/bin/fp_reverse.o src/bin/fp_s_rmap.o src/bin/fp_signed_bin_size.o src/bin/fp_to_signed_bin.o \
+src/bin/fp_reverse.o src/bin/fp_signed_bin_size.o src/bin/fp_s_rmap.o src/bin/fp_toradix.o \
-src/bin/fp_to_unsigned_bin.o src/bin/fp_toradix.o src/bin/fp_unsigned_bin_size.o src/bit/fp_cnt_lsb.o \
+src/bin/fp_toradix_n.o src/bin/fp_to_signed_bin.o src/bin/fp_to_unsigned_bin.o \
-src/bit/fp_count_bits.o src/bit/fp_div_2.o src/bit/fp_div_2d.o src/bit/fp_lshd.o src/bit/fp_mod_2d.o \
+src/bin/fp_unsigned_bin_size.o src/bit/fp_cnt_lsb.o src/bit/fp_count_bits.o src/bit/fp_div_2.o \
-src/bit/fp_rshd.o src/divide/fp_div.o src/divide/fp_div_d.o src/divide/fp_mod.o src/divide/fp_mod_d.o \
+src/bit/fp_div_2d.o src/bit/fp_lshd.o src/bit/fp_mod_2d.o src/bit/fp_rshd.o src/divide/fp_div.o \
-src/exptmod/fp_2expt.o src/exptmod/fp_exptmod.o src/misc/fp_ident.o src/misc/fp_set.o \
+src/divide/fp_div_d.o src/divide/fp_mod.o src/divide/fp_mod_d.o src/exptmod/fp_2expt.o \
 src/exptmod/fp_exptmod.o src/misc/fp_ident.o src/misc/fp_rand.o src/misc/fp_set.o \
 src/mont/fp_montgomery_calc_normalization.o src/mont/fp_montgomery_reduce.o \
-src/mont/fp_montgomery_setup.o src/mul/fp_mul.o src/mul/fp_mul_2.o src/mul/fp_mul_2d.o \
+src/mont/fp_montgomery_setup.o src/mul/fp_mul_2.o src/mul/fp_mul_2d.o src/mul/fp_mul.o \
-src/mul/fp_mul_comba.o src/mul/fp_mul_comba_12.o src/mul/fp_mul_comba_17.o src/mul/fp_mul_comba_20.o \
+src/mul/fp_mul_comba_12.o src/mul/fp_mul_comba_17.o src/mul/fp_mul_comba_20.o src/mul/fp_mul_comba_24.o \
-src/mul/fp_mul_comba_24.o src/mul/fp_mul_comba_28.o src/mul/fp_mul_comba_3.o src/mul/fp_mul_comba_32.o \
+src/mul/fp_mul_comba_28.o src/mul/fp_mul_comba_32.o src/mul/fp_mul_comba_3.o src/mul/fp_mul_comba_48.o \
-src/mul/fp_mul_comba_4.o src/mul/fp_mul_comba_48.o src/mul/fp_mul_comba_6.o src/mul/fp_mul_comba_64.o \
+src/mul/fp_mul_comba_4.o src/mul/fp_mul_comba_64.o src/mul/fp_mul_comba_6.o src/mul/fp_mul_comba_7.o \
-src/mul/fp_mul_comba_7.o src/mul/fp_mul_comba_8.o src/mul/fp_mul_comba_9.o \
+src/mul/fp_mul_comba_8.o src/mul/fp_mul_comba_9.o src/mul/fp_mul_comba.o \
 src/mul/fp_mul_comba_small_set.o src/mul/fp_mul_d.o src/mul/fp_mulmod.o src/numtheory/fp_gcd.o \
-src/numtheory/fp_invmod.o src/numtheory/fp_isprime.o src/numtheory/fp_lcm.o \
+src/numtheory/fp_invmod.o src/numtheory/fp_isprime.o src/numtheory/fp_isprime_ex.o \
-src/numtheory/fp_prime_miller_rabin.o src/numtheory/fp_prime_random_ex.o src/sqr/fp_sqr.o \
+src/numtheory/fp_lcm.o src/numtheory/fp_prime_miller_rabin.o src/numtheory/fp_prime_random_ex.o \
-src/sqr/fp_sqr_comba.o src/sqr/fp_sqr_comba_12.o src/sqr/fp_sqr_comba_17.o src/sqr/fp_sqr_comba_20.o \
+src/sqr/fp_sqr.o src/sqr/fp_sqr_comba_12.o src/sqr/fp_sqr_comba_17.o src/sqr/fp_sqr_comba_20.o \
-src/sqr/fp_sqr_comba_24.o src/sqr/fp_sqr_comba_28.o src/sqr/fp_sqr_comba_3.o src/sqr/fp_sqr_comba_32.o \
+src/sqr/fp_sqr_comba_24.o src/sqr/fp_sqr_comba_28.o src/sqr/fp_sqr_comba_32.o src/sqr/fp_sqr_comba_3.o \
-src/sqr/fp_sqr_comba_4.o src/sqr/fp_sqr_comba_48.o src/sqr/fp_sqr_comba_6.o src/sqr/fp_sqr_comba_64.o \
+src/sqr/fp_sqr_comba_48.o src/sqr/fp_sqr_comba_4.o src/sqr/fp_sqr_comba_64.o src/sqr/fp_sqr_comba_6.o \
-src/sqr/fp_sqr_comba_7.o src/sqr/fp_sqr_comba_8.o src/sqr/fp_sqr_comba_9.o \
+src/sqr/fp_sqr_comba_7.o src/sqr/fp_sqr_comba_8.o src/sqr/fp_sqr_comba_9.o src/sqr/fp_sqr_comba.o \
 src/sqr/fp_sqr_comba_generic.o src/sqr/fp_sqr_comba_small_set.o src/sqr/fp_sqrmod.o
-HEADERS=src/headers/tfm.h 
+HEADERS_PUB:=src/headers/tfm.h
 HEADERS=src/headers/tfm_private.h $(HEADERS_PUB)
 #END_INS
@ -77,32 +91,44 @@ endif
 default: $(LIBNAME)
 $(OBJECTS): $(HEADERS)
 $(LIBNAME): $(OBJECTS)
 	$(AR) $(ARFLAGS) $@ $(OBJECTS)
-	ranlib $@
+	$(RANLIB) $@
 install: $(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
 	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
-	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS_PUB) $(DESTDIR)$(INCPATH)
-mtest/mtest: mtest/mtest.o
+.PHONY: mtest
-	cd mtest ; CFLAGS="$(CFLAGS) -I../" MAKE=${MAKE} ${MAKE} mtest
+mtest: $(LIBNAME)
 	cd mtest; CC="$(CC)" CFLAGS="$(CFLAGS) -I../" MAKE=${MAKE} ${MAKE} mtest
-test: $(LIBNAME) demo/test.o mtest/mtest
+demo/test.o: CFLAGS+=-Wno-unused-result
 .PHONY: test
 test: $(LIBNAME) demo/test.o
 	$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test
-timing: $(LIBNAME) demo/test.o
+test_standalone: CFLAGS+=-DTFM_DEMO_TEST_VS_MTEST=0
 .PHONY: test_standalone
 test_standalone: $(LIBNAME) demo/test.o
 	$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test
 timing: $(LIBNAME) demo/timing.o
 	$(CC) $(CFLAGS) demo/timing.o $(LIBNAME) $(PROF) -o timing
 profiled:
-	CFLAGS="${CFLAGS} -fprofile-generate" MAKE=${MAKE} ${MAKE} timing
+	CC="$(CC)" PREFIX="${PREFIX} CFLAGS="${CFLAGS} -fprofile-generate" MAKE=${MAKE} ${MAKE} timing
 	./test
-	rm -f `find . -type f | grep "[.]o" | xargs`
+	rm -f `find . -type f -name "*.o" | xargs`
-	rm -f `find . -type f | grep "[.]a" | xargs`
+	rm -f `find . -type f -name "*.a" | xargs`
 	rm -f test
-	CFLAGS="${CFLAGS} -fprofile-use" MAKE=${MAKE} ${MAKE} timing
+	CC=$(CC) PREFIX="${PREFIX} CFLAGS="${CFLAGS} -fprofile-use" MAKE=${MAKE} ${MAKE} timing
 stest: $(LIBNAME) demo/stest.o
 	$(CC) $(CFLAGS) demo/stest.o $(LIBNAME) -o stest
@ -111,6 +137,15 @@ rsatest: $(LIBNAME) demo/rsa.o
 	$(CC) $(CFLAGS) demo/rsa.o $(LIBNAME) -o rsatest
 docdvi: tfm.tex
 	cp tfm.tex tfm.bak
 	touch --reference=tfm.tex tfm.bak
 	(printf "%s" "\def\fixedpdfdate{"; date +'D:%Y%m%d%H%M%S%:z' -d @$$(stat --format=%Y tfm.tex) | sed "s/:\([0-9][0-9]\)$$/'\1'}/g") > tfm-deterministic.tex
 	printf "%s\n" "\pdfinfo{" >> tfm-deterministic.tex
 	printf "%s\n" "  /CreationDate (\fixedpdfdate)" >> tfm-deterministic.tex
 	printf "%s\n}\n" "  /ModDate (\fixedpdfdate)" >> tfm-deterministic.tex
 	cat tfm.tex >> tfm-deterministic.tex
 	mv tfm-deterministic.tex tfm.tex
 	touch --reference=tfm.bak tfm.tex
 	touch tfm.ind
 	latex tfm >/dev/null
 	latex tfm >/dev/null
@ -119,41 +154,48 @@ docdvi: tfm.tex
 docs: docdvi
 	latex tfm >/dev/null
-	dvipdf tfm
+	pdflatex tfm >/dev/null
 	sed -b -i 's,^/ID \[.*\]$$,/ID [<0> <0>],g' tfm.pdf
 	mv tfm.bak tfm.tex
 	mv -f tfm.pdf doc
 #This rule cleans the source tree of all compiled code, not including the pdf
 #documentation.
 clean:
-	rm -f `find . -type f | grep "[.]o" | xargs`
+	rm -f `find . -type f -name "*.o" | xargs`
-	rm -f `find . -type f | grep "[.]lo"  | xargs`
+	rm -f `find . -type f -name "*.lo"  | xargs`
-	rm -f `find . -type f | grep "[.]a" | xargs`
+	rm -f `find . -type f -name "*.a" | xargs`
-	rm -f `find . -type f | grep "[.]la"  | xargs`
+	rm -f `find . -type f -name "*.la"  | xargs`
-	rm -f `find . -type f | grep "[.]obj" | xargs`
+	rm -f `find . -type f -name "*.obj" | xargs`
-	rm -f `find . -type f | grep "[.]lib" | xargs`
+	rm -f `find . -type f -name "*.lib" | xargs`
-	rm -f `find . -type f | grep "[.]exe" | xargs`
+	rm -f `find . -type f -name "*.exe" | xargs`
-	rm -f `find . -type f | grep "[.]gcda" | xargs`
+	rm -f `find . -type f -name "*.gcov" | xargs`
-	rm -f `find . -type f | grep "[.]gcno" | xargs`
+	rm -f `find . -type f -name "*.gcda" | xargs`
-	rm -f `find . -type f | grep "[.]il" | xargs`
+	rm -f `find . -type f -name "*.gcno" | xargs`
-	rm -f `find . -type f | grep "[.]dyn" | xargs`
+	rm -f `find . -type f -name "*.il" | xargs`
-	rm -f `find . -type f | grep "[.]dpi" | xargs`
+	rm -f `find . -type f -name "*.dyn" | xargs`
-	rm -rf `find . -type d | grep "[.]libs" | xargs`
+	rm -f `find . -type f -name "*.dpi" | xargs`
-	rm -f tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.toc test mtest/mtest
+	rm -rf `find . -type d -name "*.libs" | xargs`
-	cd mtest ; MAKE=${MAKE} ${MAKE} clean
+	rm -f tfm.aux  tfm.dvi  tfm.idx  tfm.ilg  tfm.ind  tfm.lof  tfm.log  tfm.out  tfm.toc  test  test.exe
 	cd mtest; MAKE=${MAKE} ${MAKE} clean
-no_oops: clean
+.PHONY: pre_gen
-	cd .. ; cvs commit
+pre_gen:
-	echo Scanning for scratch/dirty files
+	perl gen.pl
-	find . -type f | grep -v CVS | xargs -n 1 bash mess.sh
+	sed -e 's/[[:blank:]]*$$//' mpi.c > pre_gen/mpi.c
 	rm mpi.c
-zipup: no_oops docs clean
+zipup:
-	perl gen.pl ; mv mpi.c pre_gen/ ; \
+	rm -rf ../tomsfastmath-$(VERSION) && rm -f ../tfm-$(VERSION).zip ../tfm-$(VERSION).tar.bz2 && \
-	cd .. ; rm -rf tfm* tomsfastmath-$(VERSION) ; mkdir tomsfastmath-$(VERSION) ; \
+	expsrc.sh -i . -o ../tomsfastmath-$(VERSION) --svntags --no-fetch -p '*.c' -p '*.h' && \
-	cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \
+	MAKE=${MAKE} ${MAKE} -C ../tomsfastmath-$(VERSION) docs && \
-	tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 ; \
+	tar -c ../tomsfastmath-$(VERSION)/* | bzip2 -9vvc > ../tfm-$(VERSION).tar.bz2 && \
-	zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/* ; \
+	zip -9 -r ../tfm-$(VERSION).zip ../tomsfastmath-$(VERSION)/* && \
-	mv -f tfm* ~ ; rm -rf tomsfastmath-$(VERSION)
+	gpg -b -a ../tfm-$(VERSION).tar.bz2 && gpg -b -a ../tfm-$(VERSION).zip
-# $Source: /cvs/libtom/tomsfastmath/makefile,v $ 
+new_file:
-# $Revision: 1.38 $ 
+	bash updatemakes.sh
-# $Date: 2007/03/13 01:23:03 $ 
+
 # $Source$
 # $Revision$
 # $Date$
--- a/makefile.shared
+++ b/makefile.shared
@ -1,9 +1,10 @@
 #makefile for TomsFastMath
 #
 #
-VERSION=0:12
+VERSION=1:0:0
-CC=libtool --mode=compile --tag=CC gcc
+LT  ?= libtool
 LTCOMPILE = $(LT) --mode=compile --tag=CC $(CC)
 CFLAGS += -Wall -W -Wshadow -Isrc/headers
@ -25,24 +26,25 @@ OBJECTS=src/addsub/fp_add.o src/addsub/fp_add_d.o src/addsub/fp_addmod.o src/add
 src/addsub/fp_cmp_d.o src/addsub/fp_cmp_mag.o src/addsub/fp_sub.o src/addsub/fp_sub_d.o \
 src/addsub/fp_submod.o src/addsub/s_fp_add.o src/addsub/s_fp_sub.o src/bin/fp_radix_size.o \
 src/bin/fp_read_radix.o src/bin/fp_read_signed_bin.o src/bin/fp_read_unsigned_bin.o \
-src/bin/fp_reverse.o src/bin/fp_s_rmap.o src/bin/fp_signed_bin_size.o src/bin/fp_to_signed_bin.o \
+src/bin/fp_reverse.o src/bin/fp_signed_bin_size.o src/bin/fp_s_rmap.o src/bin/fp_toradix.o \
-src/bin/fp_to_unsigned_bin.o src/bin/fp_toradix.o src/bin/fp_unsigned_bin_size.o src/bit/fp_cnt_lsb.o \
+src/bin/fp_toradix_n.o src/bin/fp_to_signed_bin.o src/bin/fp_to_unsigned_bin.o \
-src/bit/fp_count_bits.o src/bit/fp_div_2.o src/bit/fp_div_2d.o src/bit/fp_lshd.o src/bit/fp_mod_2d.o \
+src/bin/fp_unsigned_bin_size.o src/bit/fp_cnt_lsb.o src/bit/fp_count_bits.o src/bit/fp_div_2.o \
-src/bit/fp_rshd.o src/divide/fp_div.o src/divide/fp_div_d.o src/divide/fp_mod.o src/divide/fp_mod_d.o \
+src/bit/fp_div_2d.o src/bit/fp_lshd.o src/bit/fp_mod_2d.o src/bit/fp_rshd.o src/divide/fp_div.o \
-src/exptmod/fp_2expt.o src/exptmod/fp_exptmod.o src/misc/fp_ident.o src/misc/fp_set.o \
+src/divide/fp_div_d.o src/divide/fp_mod.o src/divide/fp_mod_d.o src/exptmod/fp_2expt.o \
 src/exptmod/fp_exptmod.o src/misc/fp_ident.o src/misc/fp_rand.o src/misc/fp_set.o \
 src/mont/fp_montgomery_calc_normalization.o src/mont/fp_montgomery_reduce.o \
-src/mont/fp_montgomery_setup.o src/mul/fp_mul.o src/mul/fp_mul_2.o src/mul/fp_mul_2d.o \
+src/mont/fp_montgomery_setup.o src/mul/fp_mul_2.o src/mul/fp_mul_2d.o src/mul/fp_mul.o \
-src/mul/fp_mul_comba.o src/mul/fp_mul_comba_12.o src/mul/fp_mul_comba_17.o src/mul/fp_mul_comba_20.o \
+src/mul/fp_mul_comba_12.o src/mul/fp_mul_comba_17.o src/mul/fp_mul_comba_20.o src/mul/fp_mul_comba_24.o \
-src/mul/fp_mul_comba_24.o src/mul/fp_mul_comba_28.o src/mul/fp_mul_comba_3.o src/mul/fp_mul_comba_32.o \
+src/mul/fp_mul_comba_28.o src/mul/fp_mul_comba_32.o src/mul/fp_mul_comba_3.o src/mul/fp_mul_comba_48.o \
-src/mul/fp_mul_comba_4.o src/mul/fp_mul_comba_48.o src/mul/fp_mul_comba_6.o src/mul/fp_mul_comba_64.o \
+src/mul/fp_mul_comba_4.o src/mul/fp_mul_comba_64.o src/mul/fp_mul_comba_6.o src/mul/fp_mul_comba_7.o \
-src/mul/fp_mul_comba_7.o src/mul/fp_mul_comba_8.o src/mul/fp_mul_comba_9.o \
+src/mul/fp_mul_comba_8.o src/mul/fp_mul_comba_9.o src/mul/fp_mul_comba.o \
 src/mul/fp_mul_comba_small_set.o src/mul/fp_mul_d.o src/mul/fp_mulmod.o src/numtheory/fp_gcd.o \
-src/numtheory/fp_invmod.o src/numtheory/fp_isprime.o src/numtheory/fp_lcm.o \
+src/numtheory/fp_invmod.o src/numtheory/fp_isprime.o src/numtheory/fp_isprime_ex.o \
-src/numtheory/fp_prime_miller_rabin.o src/numtheory/fp_prime_random_ex.o src/sqr/fp_sqr.o \
+src/numtheory/fp_lcm.o src/numtheory/fp_prime_miller_rabin.o src/numtheory/fp_prime_random_ex.o \
-src/sqr/fp_sqr_comba.o src/sqr/fp_sqr_comba_12.o src/sqr/fp_sqr_comba_17.o src/sqr/fp_sqr_comba_20.o \
+src/sqr/fp_sqr.o src/sqr/fp_sqr_comba_12.o src/sqr/fp_sqr_comba_17.o src/sqr/fp_sqr_comba_20.o \
-src/sqr/fp_sqr_comba_24.o src/sqr/fp_sqr_comba_28.o src/sqr/fp_sqr_comba_3.o src/sqr/fp_sqr_comba_32.o \
+src/sqr/fp_sqr_comba_24.o src/sqr/fp_sqr_comba_28.o src/sqr/fp_sqr_comba_32.o src/sqr/fp_sqr_comba_3.o \
-src/sqr/fp_sqr_comba_4.o src/sqr/fp_sqr_comba_48.o src/sqr/fp_sqr_comba_6.o src/sqr/fp_sqr_comba_64.o \
+src/sqr/fp_sqr_comba_48.o src/sqr/fp_sqr_comba_4.o src/sqr/fp_sqr_comba_64.o src/sqr/fp_sqr_comba_6.o \
-src/sqr/fp_sqr_comba_7.o src/sqr/fp_sqr_comba_8.o src/sqr/fp_sqr_comba_9.o \
+src/sqr/fp_sqr_comba_7.o src/sqr/fp_sqr_comba_8.o src/sqr/fp_sqr_comba_9.o src/sqr/fp_sqr_comba.o \
 src/sqr/fp_sqr_comba_generic.o src/sqr/fp_sqr_comba_small_set.o src/sqr/fp_sqrmod.o
 HEADERS=src/headers/tfm.h
@ -80,10 +82,13 @@ endif
 default: $(LIBNAME)
-objs: $(OBJECTS)
+$(OBJECTS): $(HEADERS)
 .c.o:
 	$(LTCOMPILE) $(CFLAGS) $(LDFLAGS) -o $@ -c $<
 $(LIBNAME): $(OBJECTS)
-	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
+	libtool --silent --mode=link --tag=CC $(CC) $(CFLAGS) $(LDFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION) -export-symbols libtfm.symbols
 install: $(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
@ -94,16 +99,26 @@ install: $(LIBNAME)
 mtest/mtest: mtest/mtest.c
 	cd mtest ; make mtest
-test: $(LIBNAME) demo/test.o mtest/mtest
+demo/test.o: CFLAGS+=-Wno-unused-result
 	$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
-timing: $(LIBNAME) demo/test.o
+.PHONY: test
-	$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
+test: $(LIBNAME) demo/test.o
 	$(LT) --mode=link --tag=CC $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o test demo/test.o $(LIBNAME)
 test_standalone: CFLAGS+=-DTFM_DEMO_TEST_VS_MTEST=0
 .PHONY: test_standalone
 test_standalone: $(LIBNAME) demo/test.o
 	$(LT) --mode=link --tag=CC $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o test demo/test.o $(LIBNAME)
 stest: $(LIBNAME) demo/stest.o
-	$(CC) $(CFLAGS) demo/stest.o $(LIBNAME_S) -o stest
+	$(LT) --mode=link --tag=CC $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o stest demo/stest.o $(LIBNAME)
-# $Source: /cvs/libtom/tomsfastmath/makefile.shared,v $ 
+.PHONY: timing
-# $Revision: 1.19 $ 
+timing: $(LIBNAME) demo/timing.o
-# $Date: 2007/03/13 01:23:03 $ 
+	$(LT) --mode=link --tag=CC $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o timing demo/timing.o $(LIBNAME)
 # $Source$
 # $Revision$
 # $Date$
--- a/mtest/makefile
+++ b/mtest/makefile
@ -1,9 +1,10 @@
-CFLAGS += -Wall -W -O3 
+CFLAGS += -Wall -W -O3 -Wno-unused-result
 default: mtest
 .PHONY: mtest
 mtest: mtest.o
 	$(CC) $(CFLAGS) mtest.o -ltommath -o mtest
 clean:
-	rm -f *.o mtest *~
+	rm -f *.o mtest *~ mtest.exe
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -39,6 +39,7 @@ mulmod
 #include <time.h>
 #include <tommath.h>
 #define CRYPT
 #undef DIGIT_BIT
 #include "../src/headers/tfm.h"
 FILE *rng;
@ -46,8 +47,8 @@ FILE *rng;
 /* 1-2048 bit numbers */
 void rand_num(mp_int *a)
 {
-   int n, size;
+   int size;
-   unsigned char buf[2048];
+   unsigned char buf[(FP_MAX_SIZE/16 - DIGIT_BIT/2) + 1];
   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % (FP_MAX_SIZE/16 - DIGIT_BIT/2);
   buf[0] = (fgetc(rng)&1)?1:0;
@ -59,8 +60,8 @@ void rand_num(mp_int *a)
 /* 1-256 bit numbers (to test things like exptmod) */
 void rand_num2(mp_int *a)
 {
-   int n, size;
+   int size;
-   unsigned char buf[2048];
+   unsigned char buf[(FP_MAX_SIZE/16 - DIGIT_BIT/2) + 1];
   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % (FP_MAX_SIZE/16 - DIGIT_BIT/2);
   buf[0] = (fgetc(rng)&1)?1:0;
@ -69,13 +70,15 @@ void rand_num2(mp_int *a)
   mp_read_raw(a, buf, 1+size);
 }
-#define mp_to64(a, b) mp_toradix(a, b, 64)
+#define mp_to64(a, b) mp_toradix_n(a, b, 64, sizeof(b))
 int main(void)
 {
   int n, tmp;
   mp_int a, b, c, d, e;
 #ifdef MTEST_NO_FULLSPEED
   clock_t t1;
 #endif
   char buf[4096];
   mp_init(&a);
@ -88,7 +91,7 @@ int main(void)
   /* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */
 /*
   mp_set(&a, 1);
-   for (n = 1; n < 8192; n++) {
+   for (n = 1; n < ((FP_MAX_SIZE-(8*DIGIT_BIT))/2); n++) {
       mp_mul(&a, &a, &c);
       printf("mul\n");
       mp_to64(&a, buf);
@ -111,9 +114,11 @@ int main(void)
      }
   }
 #ifdef MTEST_NO_FULLSPEED
   t1 = clock();
 #endif
   for (;;) {
-#if 0
+#ifdef MTEST_NO_FULLSPEED
      if (clock() - t1 > CLOCKS_PER_SEC) {
         sleep(2);
         t1 = clock();
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
--- a/src/addsub/fp_add.c
+++ b/src/addsub/fp_add.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_add(fp_int *a, fp_int *b, fp_int *c)
 {
--- a/src/addsub/fp_add_d.c
+++ b/src/addsub/fp_add_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a + b */
 void fp_add_d(fp_int *a, fp_digit b, fp_int *c)
--- a/src/addsub/fp_addmod.c
+++ b/src/addsub/fp_addmod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* d = a + b (mod c) */
 int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
--- a/src/addsub/fp_cmp.c
+++ b/src/addsub/fp_cmp.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 int fp_cmp(fp_int *a, fp_int *b)
 {
--- a/src/addsub/fp_cmp_d.c
+++ b/src/addsub/fp_cmp_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* compare against a single digit */
 int fp_cmp_d(fp_int *a, fp_digit b)
--- a/src/addsub/fp_cmp_mag.c
+++ b/src/addsub/fp_cmp_mag.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 int fp_cmp_mag(fp_int *a, fp_int *b)
 {
--- a/src/addsub/fp_sub.c
+++ b/src/addsub/fp_sub.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a - b */
 void fp_sub(fp_int *a, fp_int *b, fp_int *c)
--- a/src/addsub/fp_sub_d.c
+++ b/src/addsub/fp_sub_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a - b */
 void fp_sub_d(fp_int *a, fp_digit b, fp_int *c)
--- a/src/addsub/fp_submod.c
+++ b/src/addsub/fp_submod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* d = a - b (mod c) */
 int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
--- a/src/addsub/s_fp_add.c
+++ b/src/addsub/s_fp_add.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* unsigned addition */
 void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
@ -16,7 +16,7 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
  register fp_word  t;
  y       = MAX(a->used, b->used);
-  oldused = c->used;
+  oldused = MIN(c->used, FP_SIZE);
  c->used = y;
  t = 0;
--- a/src/addsub/s_fp_sub.c
+++ b/src/addsub/s_fp_sub.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
@ -27,7 +27,7 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
  for (; x < a->used; x++) {
     t         = ((fp_word)a->dp[x]) - t;
     c->dp[x]  = (fp_digit)t;
-     t         = (t >> DIGIT_BIT);
+     t         = (t >> DIGIT_BIT)&1;
   }
  for (; x < oldused; x++) {
     c->dp[x] = 0;
--- a/src/bin/fp_radix_size.c
+++ b/src/bin/fp_radix_size.c
@ -7,11 +7,10 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 int fp_radix_size(fp_int *a, int radix, int *size)
 {
  int     digs;
  fp_int  t;
  fp_digit d;
@ -36,7 +35,6 @@ int fp_radix_size(fp_int *a, int radix, int *size)
    t.sign = FP_ZPOS;
  }
  digs = 0;
  while (fp_iszero (&t) == FP_NO) {
    fp_div_d (&t, (fp_digit) radix, &t, &d);
    (*size)++;
--- a/src/bin/fp_read_radix.c
+++ b/src/bin/fp_read_radix.c
@ -7,13 +7,16 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 int fp_read_radix(fp_int *a, char *str, int radix)
 {
  int     y, neg;
  char    ch;
  /* set the integer to the default of zero */
  fp_zero (a);
  /* make sure the radix is ok */
  if (radix < 2 || radix > 64) {
    return FP_VAL;
@ -29,16 +32,13 @@ int fp_read_radix(fp_int *a, char *str, int radix)
    neg = FP_ZPOS;
  }
  /* set the integer to the default of zero */
  fp_zero (a);
  /* process each digit of the string */
  while (*str) {
     /* if the radix <= 36 the conversion is case insensitive
     * this allows numbers like 1AB and 1ab to represent the same  value
     * [e.g. in hex]
     */
-    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    ch = (char) ((radix <= 36) ? toupper ((int)*str) : *str);
    for (y = 0; y < 64; y++) {
      if (ch == fp_s_rmap[y]) {
         break;
--- a/src/bin/fp_read_signed_bin.c
+++ b/src/bin/fp_read_signed_bin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_read_signed_bin(fp_int *a, unsigned char *b, int c)
 {
--- a/src/bin/fp_read_unsigned_bin.c
+++ b/src/bin/fp_read_unsigned_bin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_read_unsigned_bin(fp_int *a, unsigned char *b, int c)
 {
--- a/src/bin/fp_reverse.c
+++ b/src/bin/fp_reverse.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* reverse an array, used for radix code */
 void fp_reverse (unsigned char *s, int len)
--- a/src/bin/fp_s_rmap.c
+++ b/src/bin/fp_s_rmap.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* chars used in radix conversions */
 const char *fp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
--- a/src/bin/fp_signed_bin_size.c
+++ b/src/bin/fp_signed_bin_size.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 int fp_signed_bin_size(fp_int *a)
 {
--- a/src/bin/fp_to_signed_bin.c
+++ b/src/bin/fp_to_signed_bin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_to_signed_bin(fp_int *a, unsigned char *b)
 {
--- a/src/bin/fp_to_unsigned_bin.c
+++ b/src/bin/fp_to_unsigned_bin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_to_unsigned_bin(fp_int *a, unsigned char *b)
 {
--- a/src/bin/fp_toradix.c
+++ b/src/bin/fp_toradix.c
@ -7,51 +7,23 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /**
 * a:		pointer to fp_int representing the input number
 * str:		output buffer
 * radix:	number of distinct characters (the base) used to encode the number
 *
 * The radix value can be in the range 2 to 64. This function converts the
 * number a into the string str. Avoid this function: if the str buffer is
 * too small, the write overflows it and the overflow cannot be detected.
 * Use fp_toradix_n() instead.
 *
 * Return: FP_VAL on error, FP_OKAY on success.
 */
 int fp_toradix(fp_int *a, char *str, int radix)
 {
-  int     digs;
+   return fp_toradix_n(a, str, radix, INT_MAX);
  fp_int  t;
  fp_digit d;
  char   *_s = str;
  /* check range of the radix */
  if (radix < 2 || radix > 64) {
    return FP_VAL;
  }
  /* quick out if its zero */
  if (fp_iszero(a) == 1) {
     *str++ = '0';
     *str = '\0';
     return FP_OKAY;
  }
  fp_init_copy(&t, a);
  /* if it is negative output a - */
  if (t.sign == FP_NEG) {
    ++_s;
    *str++ = '-';
    t.sign = FP_ZPOS;
  }
  digs = 0;
  while (fp_iszero (&t) == FP_NO) {
    fp_div_d (&t, (fp_digit) radix, &t, &d);
    *str++ = fp_s_rmap[d];
    ++digs;
  }
  /* reverse the digits of the string.  In this case _s points
   * to the first digit [exluding the sign] of the number]
   */
  fp_reverse ((unsigned char *)_s, digs);
  /* append a NULL so the string is properly terminated */
  *str = '\0';
  return FP_OKAY;
 }
 /* $Source$ */
--- a/src/bin/fp_toradix_n.c
+++ b/src/bin/fp_toradix_n.c
@ -0,0 +1,71 @@
 /* TomsFastMath, a fast ISO C bignum library.
 *
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm_private.h>
 /**
  * Convert a into a NUL-terminated string in the given radix, writing at most
  * maxlen characters (terminator included) into str.
  *
  * a:		number to convert
  * str:		output buffer
  * radix:	base of the output, 2 to 64
  * maxlen:	room available in str, at least 2
  *
  * Return: FP_VAL if maxlen or radix is out of range, or if the digits did not
  * fit into maxlen characters (str is still terminated in that case, but holds
  * only the digits produced so far); FP_OKAY otherwise.
  */
 int fp_toradix_n(fp_int *a, char *str, int radix, int maxlen)
 {
   fp_int work;
   fp_digit rem;
   char *first_digit = str;
   int ndigits = 0;
   /* reject an unusable buffer size or radix up front */
   if (maxlen < 2 || radix < 2 || radix > 64) {
      return FP_VAL;
   }
   /* zero is emitted directly; maxlen >= 2 guarantees room for "0" */
   if (fp_iszero(a) == FP_YES) {
      str[0] = '0';
      str[1] = '\0';
      return FP_OKAY;
   }
   /* divide a scratch copy so the caller's number is left alone */
   fp_init_copy(&work, a);
   if (work.sign == FP_NEG) {
      /* emit the sign, then continue with the magnitude */
      *str++ = '-';
      work.sign = FP_ZPOS;
      /* the digit reversal below must start after the '-' */
      first_digit = str;
      /* the sign used up one character of the budget */
      --maxlen;
   }
   /* each division by the radix leaves the next digit (lowest first) in rem;
    * stop early once only the terminator's slot is left
    */
   while (fp_iszero (&work) == FP_NO && --maxlen >= 1) {
      fp_div_d(&work, (fp_digit) radix, &work, &rem);
      *str++ = fp_s_rmap[rem];
      ++ndigits;
   }
   /* digits were produced lowest first: flip them, leaving any sign in place */
   fp_reverse((unsigned char *) first_digit, ndigits);
   /* terminate the string */
   *str = '\0';
   /* maxlen only drops below 1 when the loop ran out of room */
   return (maxlen < 1) ? FP_VAL : FP_OKAY;
 }
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/bin/fp_unsigned_bin_size.c
+++ b/src/bin/fp_unsigned_bin_size.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 int fp_unsigned_bin_size(fp_int *a)
 {
--- a/src/bit/fp_cnt_lsb.c
+++ b/src/bit/fp_cnt_lsb.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 static const int lnz[16] = {
   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
--- a/src/bit/fp_count_bits.c
+++ b/src/bit/fp_count_bits.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 int fp_count_bits (fp_int * a)
 {
--- a/src/bit/fp_div_2.c
+++ b/src/bit/fp_div_2.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* b = a/2 */
 void fp_div_2(fp_int * a, fp_int * b)
--- a/src/bit/fp_div_2d.c
+++ b/src/bit/fp_div_2d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a / 2**b */
 void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d)
--- a/src/bit/fp_lshd.c
+++ b/src/bit/fp_lshd.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_lshd(fp_int *a, int x)
 {
--- a/src/bit/fp_mod_2d.c
+++ b/src/bit/fp_mod_2d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a mod 2**d */
 void fp_mod_2d(fp_int *a, int b, fp_int *c)
--- a/src/bit/fp_rshd.c
+++ b/src/bit/fp_rshd.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_rshd(fp_int *a, int x)
 {
--- a/src/divide/fp_div.c
+++ b/src/divide/fp_div.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* a/b => cb + d == a */
 int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
--- a/src/divide/fp_div_d.c
+++ b/src/divide/fp_div_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 static int s_is_power_of_two(fp_digit b, int *p)
 {
--- a/src/divide/fp_mod.c
+++ b/src/divide/fp_mod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a mod b, 0 <= c < b  */
 int fp_mod(fp_int *a, fp_int *b, fp_int *c)
--- a/src/divide/fp_mod_d.c
+++ b/src/divide/fp_mod_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a mod b, 0 <= c < b  */
 int fp_mod_d(fp_int *a, fp_digit b, fp_digit *c)
--- a/src/exptmod/fp_2expt.c
+++ b/src/exptmod/fp_2expt.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* computes a = 2**b */
 void fp_2expt(fp_int *a, int b)
--- a/src/exptmod/fp_exptmod.c
+++ b/src/exptmod/fp_exptmod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 #ifdef TFM_TIMING_RESISTANT
--- a/src/generators/.gitignore
+++ b/src/generators/.gitignore
@ -0,0 +1,8 @@
 comba_mult_gen
 comba_mult_smallgen
 comba_sqr_gen
 comba_sqr_smallgen
 comba_mult_gen.exe
 comba_mult_smallgen.exe
 comba_sqr_gen.exe
 comba_sqr_smallgen.exe
--- a/src/generators/comba_mult_gen.c
+++ b/src/generators/comba_mult_gen.c
@ -18,6 +18,10 @@ int main(int argc, char **argv)
   /* print out preamble */
 printf(
 "#define TFM_DEFINES\n"
 "#include \"fp_mul_comba.c\"\n"
 "\n"
 "#if defined(TFM_MUL%d) && FP_SIZE >= %d\n"
 "void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n"
 "{\n"
 "   fp_digit c0, c1, c2, at[%d];\n"
@ -26,7 +30,7 @@ printf(
 "   memcpy(at+%d, B->dp, %d * sizeof(fp_digit));\n"
 "   COMBA_START;\n"
 "\n"
-"   COMBA_CLEAR;\n", N, N+N, N, N, N);
+"   COMBA_CLEAR;\n", N, N+N, N, N+N, N, N, N);
   /* now do the rows */
   for (x = 0; x < (N+N-1); x++) {
@ -53,7 +57,11 @@ printf(
 "   C->sign = A->sign ^ B->sign;\n"
 "   fp_clamp(C);\n"
 "   COMBA_FINI;\n"
-"}\n\n\n", N+N-1, N+N);
+"}\n#endif\n\n\n"
 "/* $Source$ */\n"
 "/* $Revision$ */\n"
 "/* $Date$ */\n"
 , N+N-1, N+N);
  return 0;
 }
--- a/src/generators/comba_mult_smallgen.c
+++ b/src/generators/comba_mult_smallgen.c
@ -7,6 +7,10 @@ int main(int argc, char **argv)
   /* print out preamble */
 printf(
 "#define TFM_DEFINES\n"
 "#include \"fp_mul_comba.c\"\n"
 "\n"
 "#if defined(TFM_SMALL_SET)\n"
 "void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)\n"
 "{\n"
 "   fp_digit c0, c1, c2, at[32];\n"
@ -51,7 +55,10 @@ printf(
 "      COMBA_FINI;\n"
 "      break;\n", N+N-1, N+N);
 }
-printf("   }\n}\n\n");
+printf("   }\n}\n\n#endif\n\n\n"
 "/* $Source$ */\n"
 "/* $Revision$ */\n"
 "/* $Date$ */\n");
  return 0;
 }
--- a/src/generators/comba_sqr_gen.c
+++ b/src/generators/comba_sqr_gen.c
@ -16,20 +16,26 @@ int main(int argc, char **argv)
   N = atoi(argv[1]);
 printf(
-"#ifdef TFM_SQR%d\n"
+"#define TFM_DEFINES\n"
 "#include \"fp_sqr_comba.c\"\n"
 "\n"
 "#if defined(TFM_SQR%d) && FP_SIZE >= %d\n"
 "void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
 "{\n"
 "   fp_digit *a, b[%d], c0, c1, c2, sc0, sc1, sc2;\n"
 "#ifdef TFM_ISO\n"
 "   fp_word tt;\n"
 "#endif\n"
 "\n"
 "   a = A->dp;\n"
-"   COMBA_START; \n"
+"   COMBA_START;\n"
 "\n"
 "   /* clear carries */\n"
 "   CLEAR_CARRY;\n"
 "\n"
 "   /* output 0 */\n"
 "   SQRADD(a[0],a[0]);\n"
-"   COMBA_STORE(b[0]);\n", N, N, N+N);
+"   COMBA_STORE(b[0]);\n", N, N+N, N, N+N);
   for (x = 1; x < N+N-1; x++) {
 printf(
@ -91,7 +97,11 @@ printf(
 "   B->sign = FP_ZPOS;\n"
 "   memcpy(B->dp, b, %d * sizeof(fp_digit));\n"
 "   fp_clamp(B);\n"
-"}\n#endif\n\n\n", N+N, N+N);
+"}\n#endif\n\n\n"
 "/* $Source$ */\n"
 "/* $Revision$ */\n"
 "/* $Date$ */\n"
 , N+N, N+N);
  return 0;
 }
--- a/src/generators/comba_sqr_smallgen.c
+++ b/src/generators/comba_sqr_smallgen.c
@ -16,9 +16,16 @@ int main(int argc, char **argv)
   int x, y, z, N, f;
 printf(
 "#define TFM_DEFINES\n"
 "#include \"fp_sqr_comba.c\"\n"
 "\n"
 "#if defined(TFM_SMALL_SET)\n"
 "void fp_sqr_comba_small(fp_int *A, fp_int *B)\n"
 "{\n"
 "   fp_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;\n"
 "#ifdef TFM_ISO\n"
 "   fp_word tt;\n"
 "#endif\n"
 );
 printf("   switch (A->used) { \n");
@ -99,7 +106,11 @@ printf(
 "      break;\n\n", N+N, N+N);
 }
-printf("}\n\n}\n");
+printf("}\n}\n\n#endif /* TFM_SMALL_SET */\n\n"
 "/* $Source$ */\n"
 "/* $Revision$ */\n"
 "/* $Date$ */\n"
 );
  return 0;
 }
--- a/src/generators/makefile
+++ b/src/generators/makefile
@ -0,0 +1,31 @@
 # Default target: build every generator that `regen` needs and `clean` removes.
 # all/clean/regen never produce a file of that name, so mark them phony.
 .PHONY: all clean regen
 all: comba_mult_gen comba_mult_smallgen comba_sqr_gen comba_sqr_smallgen
 clean:
 	rm -f comba_mult_gen
 	rm -f comba_mult_gen.exe
 	rm -f comba_mult_smallgen
 	rm -f comba_mult_smallgen.exe
 	rm -f comba_sqr_gen
 	rm -f comba_sqr_gen.exe
 	rm -f comba_sqr_smallgen
 	rm -f comba_sqr_smallgen.exe
 comba_mult_gen: comba_mult_gen.c
 	gcc -o comba_mult_gen comba_mult_gen.c
 comba_mult_smallgen: comba_mult_smallgen.c
 	gcc -o comba_mult_smallgen comba_mult_smallgen.c
 comba_sqr_gen: comba_sqr_gen.c
 	gcc -o comba_sqr_gen comba_sqr_gen.c
 comba_sqr_smallgen: comba_sqr_smallgen.c
 	gcc -o comba_sqr_smallgen comba_sqr_smallgen.c
 regen: comba_mult_gen comba_mult_smallgen comba_sqr_gen comba_sqr_smallgen
 	for i in 3 4 6 7 8 9 12 17 20 24 28 32 48 64; do \
 		./comba_mult_gen $$i | sed -e 's/ *$$//' > ../mul/fp_mul_comba_$$i.c; \
 	done
 	./comba_mult_smallgen > ../mul/fp_mul_comba_small_set.c
 	for i in 3 4 6 7 8 9 12 17 20 24 28 32 48 64; do \
 		./comba_sqr_gen $$i | sed -e 's/ *$$//' > ../sqr/fp_sqr_comba_$$i.c; \
 	done
 	./comba_sqr_smallgen > ../sqr/fp_sqr_comba_small_set.c
--- a/src/headers/tfm.h
+++ b/src/headers/tfm.h
@ -16,6 +16,15 @@
 #include <ctype.h>
 #include <limits.h>
 /* 0xMaMiPaXX
 * Major
 * Minor
 * Patch
 * XX - undefined
 */
 #define TFM_VERSION     0x000D0000
 #define TFM_VERSION_S   "v0.13.0"
 #ifndef MIN
   #define MIN(x,y) ((x)<(y)?(x):(y))
 #endif
@ -104,6 +113,10 @@
   #error FP_MAX_SIZE must be a multiple of CHAR_BIT
 #endif
 #if __SIZEOF_LONG__ == 8
 	#define FP_64BIT
 #endif
 /* autodetect x86-64 and make sure we are using 64-bit digits with x86-64 asm */
 #if defined(__x86_64__)
   #if defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)
@ -245,11 +258,15 @@
 #if defined(FP_64BIT)
   /* for GCC only on supported platforms */
 #ifndef CRYPT
-   typedef unsigned long ulong64;
+   typedef unsigned long long ulong64;
-#endif
+#endif /* CRYPT */
   typedef ulong64            fp_digit;
 #define SIZEOF_FP_DIGIT 8
   typedef unsigned long      fp_word __attribute__ ((mode(TI)));
 #else
   /* this is to make porting into LibTomCrypt easier :-) */
 #ifndef CRYPT
   #if defined(_MSC_VER) || defined(__BORLANDC__)
@ -258,14 +275,16 @@
   #else
      typedef unsigned long long ulong64;
      typedef signed long long   long64;
-   #endif
+   #endif /* defined(_MSC_VER) ... */
-#endif
+#endif /* CRYPT */
-   typedef unsigned long      fp_digit;
+
   typedef unsigned int       fp_digit;
 #define SIZEOF_FP_DIGIT 4
   typedef ulong64            fp_word;
-#endif
+#endif /* FP_64BIT */
 /* bits per digit (DIGIT_BIT), all-ones digit mask (FP_MASK), # of digits that fit in FP_MAX_SIZE bits (FP_SIZE) */
-#define DIGIT_BIT  (int)((CHAR_BIT) * sizeof(fp_digit))
+#define DIGIT_BIT  ((CHAR_BIT) * SIZEOF_FP_DIGIT)
 #define FP_MASK    (fp_digit)(-1)
 #define FP_SIZE    (FP_MAX_SIZE/DIGIT_BIT)
@ -311,6 +330,9 @@ const char *fp_ident(void);
 /* set to a small digit */
 void fp_set(fp_int *a, fp_digit b);
 /* makes a pseudo-random int of a given size */
 void fp_rand(fp_int *a, int digits);
 /* copy from a to b */
 #define fp_copy(a, b)      (void)(((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
 #define fp_init_copy(a, b) fp_copy(b, a)
@ -422,8 +444,11 @@ int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
 /* perform a Miller-Rabin test of a to the base b and store result in "result" */
 void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result);
 #define FP_PRIME_SIZE      256
 /* 256 trial divisions + 8 Miller-Rabins, returns FP_YES if probable prime  */
 int fp_isprime(fp_int *a);
 /* extended version of fp_isprime, do 't' Miller-Rabins instead of only 8 */
 int fp_isprime_ex(fp_int *a, int t);
 /* Primality generation flags */
 #define TFM_PRIME_BBS      0x0001 /* BBS style prime */
@ -450,119 +475,13 @@ void fp_read_signed_bin(fp_int *a, unsigned char *b, int c);
 void fp_to_signed_bin(fp_int *a, unsigned char *b);
 int fp_read_radix(fp_int *a, char *str, int radix);
 int fp_radix_size(fp_int *a, int radix, int *size);
 int fp_toradix(fp_int *a, char *str, int radix);
 int fp_toradix_n(fp_int * a, char *str, int radix, int maxlen);
 /* VARIOUS LOW LEVEL STUFFS */
 void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
 void fp_reverse(unsigned char *s, int len);
 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
 #ifdef TFM_SMALL_SET
 void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL3
 void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL4
 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL6
 void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL7
 void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL8
 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL9
 void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL12
 void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL17
 void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL20
 void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL24
 void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL28
 void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL32
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL48
 void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL64
 void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C);
 #endif
 void fp_sqr_comba(fp_int *A, fp_int *B);
 #ifdef TFM_SMALL_SET
 void fp_sqr_comba_small(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR3
 void fp_sqr_comba3(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR4
 void fp_sqr_comba4(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR6
 void fp_sqr_comba6(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR7
 void fp_sqr_comba7(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR8
 void fp_sqr_comba8(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR9
 void fp_sqr_comba9(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR12
 void fp_sqr_comba12(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR17
 void fp_sqr_comba17(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR20
 void fp_sqr_comba20(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR24
 void fp_sqr_comba24(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR28
 void fp_sqr_comba28(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR32
 void fp_sqr_comba32(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR48
 void fp_sqr_comba48(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR64
 void fp_sqr_comba64(fp_int *A, fp_int *B);
 #endif
 extern const char *fp_s_rmap;
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/headers/tfm_private.h
+++ b/src/headers/tfm_private.h
@ -0,0 +1,125 @@
 /* TomsFastMath, a fast ISO C bignum library.
 *
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
 /* Declarations that are not part of the public tfm.h interface.
  * The public header is included first, so including this file alone
  * is enough to see both sets of declarations.
  */
 #ifndef TFM_PRIVATE_H_
 #define TFM_PRIVATE_H_
 #include <tfm.h>
 /* VARIOUS LOW LEVEL STUFFS */
 /* unsigned addition */
 void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
 /* unsigned subtraction, requires ||a|| >= ||b|| */
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
 /* reverse an array of len bytes in place, used by the radix code */
 void fp_reverse(unsigned char *s, int len);
 /* Comba multipliers: the general routine is always declared; each variant
  * below is declared only when its TFM_SMALL_SET / TFM_MULn option is defined.
  * NOTE(review): the numeric suffix looks like the operand size in digits --
  * confirm against the generated fp_mul_comba_N.c files.
  */
 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
 #ifdef TFM_SMALL_SET
 void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL3
 void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL4
 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL6
 void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL7
 void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL8
 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL9
 void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL12
 void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL17
 void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL20
 void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL24
 void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL28
 void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL32
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL48
 void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C);
 #endif
 #ifdef TFM_MUL64
 void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C);
 #endif
 /* Comba squaring: same layout as the multipliers above, selected by
  * TFM_SMALL_SET / TFM_SQRn.
  */
 void fp_sqr_comba(fp_int *A, fp_int *B);
 #ifdef TFM_SMALL_SET
 void fp_sqr_comba_small(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR3
 void fp_sqr_comba3(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR4
 void fp_sqr_comba4(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR6
 void fp_sqr_comba6(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR7
 void fp_sqr_comba7(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR8
 void fp_sqr_comba8(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR9
 void fp_sqr_comba9(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR12
 void fp_sqr_comba12(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR17
 void fp_sqr_comba17(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR20
 void fp_sqr_comba20(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR24
 void fp_sqr_comba24(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR28
 void fp_sqr_comba28(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR32
 void fp_sqr_comba32(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR48
 void fp_sqr_comba48(fp_int *A, fp_int *B);
 #endif
 #ifdef TFM_SQR64
 void fp_sqr_comba64(fp_int *A, fp_int *B);
 #endif
 /* characters used in radix conversions, indexed by digit value */
 extern const char *fp_s_rmap;
 #endif /* TFM_PRIVATE_H_ */
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/misc/fp_ident.c
+++ b/src/misc/fp_ident.c
@ -7,7 +7,7 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include "tfm.h"
+#include <tfm_private.h>
 const char *fp_ident(void)
 {
@ -15,11 +15,14 @@ const char *fp_ident(void)
   memset(buf, 0, sizeof(buf));
   snprintf(buf, sizeof(buf)-1,
-"TomsFastMath (%s)\n"
+"TomsFastMath " TFM_VERSION_S "\n"
 #if defined(TFM_IDENT_BUILD_DATE)
 "Built on " __DATE__ " at " __TIME__ "\n"
 #endif
 "\n"
 "Sizeofs\n"
-"\tfp_digit = %u\n"
+"\tfp_digit = %lu\n"
-"\tfp_word  = %u\n"
+"\tfp_word  = %lu\n"
 "\n"
 "FP_MAX_SIZE = %u\n"
 "\n"
@ -70,11 +73,11 @@ const char *fp_ident(void)
 #ifdef TFM_HUGE
 " TFM_HUGE "
 #endif
-"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE);
+"\n", (unsigned long)sizeof(fp_digit), (unsigned long)sizeof(fp_word), FP_MAX_SIZE);
   if (sizeof(fp_digit) == sizeof(fp_word)) {
      strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n",
-              sizeof(buf)-1);
+              sizeof(buf) - strlen(buf) - 1);
   }
   return buf;
 }
--- a/src/misc/fp_rand.c
+++ b/src/misc/fp_rand.c
@ -0,0 +1,41 @@
 /* TomsFastMath, a fast ISO C bignum library.
 *
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm_private.h>
 /* makes a pseudo-random int of a given size */
 void fp_rand(fp_int *a, int digits)
 {
   fp_digit d;
   fp_zero(a);
   if (digits <= 0) {
     return;
   }
   /* first place a random non-zero digit */
   do {
     d = ((fp_digit) abs (rand ())) & FP_MASK;
   } while (d == 0);
   fp_add_d (a, d, a);
   while (--digits > 0) {
     fp_lshd (a, 1);
     fp_add_d (a, ((fp_digit) abs (rand ())), a);
   }
   return;
 }
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/misc/fp_set.c
+++ b/src/misc/fp_set.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_set(fp_int *a, fp_digit b)
 {
--- a/src/mont/fp_montgomery_calc_normalization.c
+++ b/src/mont/fp_montgomery_calc_normalization.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* computes a = B**n mod b without division or multiplication useful for
 * normalizing numbers in a Montgomery system.
--- a/src/mont/fp_montgomery_reduce.c
+++ b/src/mont/fp_montgomery_reduce.c
@ -7,7 +7,7 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /******************************************************************/
 #if defined(TFM_X86) && !defined(TFM_SSE2)
@ -29,8 +29,8 @@ asm(                                                      \
   "adcl $0,%%edx \n\t"                                   \
   "movl %%edx,%1 \n\t"                                   \
 :"=g"(_c[LO]), "=r"(cy)                                   \
-:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
+:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
-: "%eax", "%edx", "%cc")
+: "%eax", "%edx", "cc")
 #define PROPCARRY                           \
 asm(                                        \
@ -39,7 +39,7 @@ asm(                                        \
   "movzbl %%al,%1 \n\t"                    \
 :"=g"(_c[LO]), "=r"(cy)                     \
 :"0"(_c[LO]), "1"(cy)                       \
-: "%eax", "%cc")
+: "%eax", "cc")
 /******************************************************************/
 #elif defined(TFM_X86_64)
@ -62,7 +62,7 @@ asm(                                                      \
   "movq %%rdx,%1 \n\t"                                   \
 :"=g"(_c[LO]), "=r"(cy)                                   \
 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
-: "%rax", "%rdx", "%cc")
+: "%rax", "%rdx", "cc")
 #define INNERMUL8 \
 asm(                  \
@ -155,7 +155,7 @@ asm(                                                      \
 \
 :"=r"(_c), "=r"(cy)                    \
 : "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
-: "%rax", "%rdx", "%r10", "%r11", "%cc")
+: "%rax", "%rdx", "%r10", "%r11", "cc")
 #define PROPCARRY                           \
@ -165,7 +165,7 @@ asm(                                        \
   "movzbq %%al,%1 \n\t"                    \
 :"=g"(_c[LO]), "=r"(cy)                     \
 :"0"(_c[LO]), "1"(cy)                       \
-: "%rax", "%cc")
+: "%rax", "cc")
 /******************************************************************/
 #elif defined(TFM_SSE2)
@ -280,7 +280,7 @@ asm(                                        \
   "movzbl %%al,%1 \n\t"                    \
 :"=g"(_c[LO]), "=r"(cy)                     \
 :"0"(_c[LO]), "1"(cy)                       \
-: "%eax", "%cc")
+: "%eax", "cc")
 /******************************************************************/
 #elif defined(TFM_ARM)
@ -300,7 +300,7 @@ asm(                                \
    " MOVCC  %0,#0            \n\t" \
    " UMLAL  r0,%0,%3,%4      \n\t" \
    " STR    r0,%1            \n\t" \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
 #define PROPCARRY                  \
 asm(                               \
@ -309,7 +309,7 @@ asm(                               \
    " STR   r0,%1            \n\t" \
    " MOVCS %0,#1            \n\t" \
    " MOVCC %0,#0            \n\t" \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
 /******************************************************************/
 #elif defined(TFM_PPC32)
@ -325,22 +325,18 @@ asm(                               \
 asm(                                 \
   " mullw    16,%3,%4       \n\t"   \
   " mulhwu   17,%3,%4       \n\t"   \
-   " addc     16,16,%0       \n\t"   \
+   " addc     16,16,%2       \n\t"   \
   " addze    17,17          \n\t"   \
-   " lwz      18,%1          \n\t"   \
+   " addc     %1,16,%5       \n\t"   \
   " addc     16,16,18       \n\t"   \
   " addze    %0,17          \n\t"   \
-   " stw      16,%1          \n\t"   \
+:"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;
 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","%cc"); ++tmpm;
 #define PROPCARRY                    \
 asm(                                 \
-   " lwz      16,%1         \n\t"    \
+   " addc     %1,%3,%2      \n\t"    \
-   " addc     16,16,%0      \n\t"    \
+   " xor      %0,%2,%2      \n\t"    \
-   " stw      16,%1         \n\t"    \
+   " addze    %0,%2         \n\t"    \
-   " xor      %0,%0,%0      \n\t"    \
+:"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");
   " addze    %0,%0         \n\t"    \
 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","%cc");
 /******************************************************************/
 #elif defined(TFM_PPC64)
@ -362,7 +358,7 @@ asm(                                 \
   " addc     r16,r16,r18       \n\t"   \
   " addze    %0,r17          \n\t"   \
   " sdx      r16,0,%1        \n\t"   \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","%cc"); ++tmpm;
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;
 #define PROPCARRY                    \
 asm(                                 \
@ -371,7 +367,7 @@ asm(                                 \
   " sdx      r16,0,%1       \n\t"    \
   " xor      %0,%0,%0      \n\t"    \
   " addze    %0,%0         \n\t"    \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","%cc");
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");
 /******************************************************************/
 #elif defined(TFM_AVR32)
@ -401,7 +397,7 @@ asm(                                 \
   " st.w     %1,r2         \n\t"    \
   " eor      %0,%0         \n\t"    \
   " acr      %0            \n\t"    \
-:"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","%cc");
+:"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
 /******************************************************************/
 #elif defined(TFM_MIPS)
@ -509,7 +505,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
       _c   = c + x;
       tmpm = m->dp;
       y = 0;
-       #if (defined(TFM_SSE2) || defined(TFM_X86_64))
+       #if defined(INNERMUL8)
        for (; y < (pa & ~7); y += 8) {
              INNERMUL8;
              _c   += 8;
--- a/src/mont/fp_montgomery_setup.c
+++ b/src/mont/fp_montgomery_setup.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* sets up the Montgomery reduction */
 int fp_montgomery_setup(fp_int *a, fp_digit *rho)
--- a/src/mul/fp_mul.c
+++ b/src/mul/fp_mul.c
@ -7,122 +7,133 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a * b */
 void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 {
-    int   y, yy;
+    int   y, old_used;
 #if FP_SIZE >= 48
    int   yy;
 #endif
    old_used = C->used;
    /* call generic if we're out of range */
    if (A->used + B->used > FP_SIZE) {
       fp_mul_comba(A, B, C);
-       return ;
+       goto clean;
    }
     y  = MAX(A->used, B->used);
 #if FP_SIZE >= 48
     yy = MIN(A->used, B->used);
 #endif
    /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
       of the largest input.  We also want to avoid doing excess mults if the
       inputs are not close to the next power of two.  That is, for example,
       if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications
    */
-#ifdef TFM_MUL3
+#if defined(TFM_MUL3) && FP_SIZE >= 6
        if (y <= 3) {
           fp_mul_comba3(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL4
+#if defined(TFM_MUL4) && FP_SIZE >= 8
        if (y == 4) {
           fp_mul_comba4(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL6
+#if defined(TFM_MUL6) && FP_SIZE >= 12
        if (y <= 6) {
           fp_mul_comba6(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL7
+#if defined(TFM_MUL7) && FP_SIZE >= 14
        if (y == 7) {
           fp_mul_comba7(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL8
+#if defined(TFM_MUL8) && FP_SIZE >= 16
        if (y == 8) {
           fp_mul_comba8(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL9
+#if defined(TFM_MUL9) && FP_SIZE >= 18
        if (y == 9) {
           fp_mul_comba9(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL12
+#if defined(TFM_MUL12) && FP_SIZE >= 24
        if (y <= 12) {
           fp_mul_comba12(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_MUL17
+#if defined(TFM_MUL17) && FP_SIZE >= 34
        if (y <= 17) {
           fp_mul_comba17(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#ifdef TFM_SMALL_SET
+#if defined(TFM_SMALL_SET) && FP_SIZE >= 32
        if (y <= 16) {
           fp_mul_comba_small(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL20)
+#if defined(TFM_MUL20) && FP_SIZE >= 40
        if (y <= 20) {
           fp_mul_comba20(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL24)
+#if defined(TFM_MUL24) && FP_SIZE >= 48
        if (yy >= 16 && y <= 24) {
           fp_mul_comba24(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL28)
+#if defined(TFM_MUL28) && FP_SIZE >= 56
        if (yy >= 20 && y <= 28) {
           fp_mul_comba28(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL32)
+#if defined(TFM_MUL32) && FP_SIZE >= 64
        if (yy >= 24 && y <= 32) {
           fp_mul_comba32(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL48)
+#if defined(TFM_MUL48) && FP_SIZE >= 96
        if (yy >= 40 && y <= 48) {
           fp_mul_comba48(A,B,C);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_MUL64)
+#if defined(TFM_MUL64) && FP_SIZE >= 128
        if (yy >= 56 && y <= 64) {
           fp_mul_comba64(A,B,C);
-           return;
+           goto clean;
        }
 #endif
        fp_mul_comba(A,B,C);
 clean:
    for (y = C->used; y < old_used; y++) {
       C->dp[y] = 0;
    }
 }
-/* $Source$ */
+/* $Source: /cvs/libtom/tomsfastmath/src/mul/fp_mul.c,v $ */
-/* $Revision$ */
+/* $Revision: 1.1 $ */
-/* $Date$ */
+/* $Date: 2006/12/31 21:25:53 $ */
--- a/src/mul/fp_mul_2.c
+++ b/src/mul/fp_mul_2.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 void fp_mul_2(fp_int * a, fp_int * b)
 {
--- a/src/mul/fp_mul_2d.c
+++ b/src/mul/fp_mul_2d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a * 2**d */
 void fp_mul_2d(fp_int *a, int b, fp_int *c)
--- a/src/mul/fp_mul_comba.c
+++ b/src/mul/fp_mul_comba.c
@ -12,7 +12,7 @@
 */
-#include <tfm.h>
+#include <tfm_private.h>
 #if defined(TFM_PRESCOTT) && defined(TFM_SSE2)
   #undef TFM_SSE2
@ -53,7 +53,7 @@ asm(                                                      \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
 #elif defined(TFM_X86_64)
 /* x86-64 optimized */
@ -88,7 +88,7 @@ asm  (                                                    \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
 #elif defined(TFM_SSE2)
 /* use SSE2 optimizations */
@ -128,7 +128,7 @@ asm(                                                     \
    "movd  %%mm0,%%eax  \n\t"                            \
    "adcl  %%eax,%1     \n\t"                            \
    "adcl  $0,%2        \n\t"                            \
-    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%cc");
+    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","cc");
 #elif defined(TFM_ARM)
 /* ARM code */
@ -155,7 +155,7 @@ asm(                                                          \
 "  ADDS   %0,%0,r0              \n\t"                         \
 "  ADCS   %1,%1,r1              \n\t"                         \
 "  ADC    %2,%2,#0              \n\t"                         \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
 #elif defined(TFM_PPC32)
 /* For 32-bit PPC */
@ -297,8 +297,11 @@ asm(                              \
 #define MULADD(i, j)                                    \
   do { fp_word t;                                      \
-   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t;                         \
+   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j);       \
-   t = (fp_word)c1 + (t >> DIGIT_BIT);            c1 = t; c2 += t >> DIGIT_BIT;   \
+   c0 = t;                                              \
   t = (fp_word)c1 + (t >> DIGIT_BIT);                  \
   c1 = t;                                              \
   c2 += t >> DIGIT_BIT;                                \
   } while (0);
 #endif
@ -346,7 +349,9 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
      /* execute loop */
      COMBA_FORWARD;
      for (iz = 0; iz < iy; ++iz) {
-          MULADD(*tmpx++, *tmpy--);
+          fp_digit _tmpx = *tmpx++;
          fp_digit _tmpy = *tmpy--;
          MULADD(_tmpx, _tmpy);
      }
      /* store term */
--- a/src/mul/fp_mul_comba_12.c
+++ b/src/mul/fp_mul_comba_12.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL12
+#if defined(TFM_MUL12) && FP_SIZE >= 24
 void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[24];
@ -109,3 +109,8 @@ void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_17.c
+++ b/src/mul/fp_mul_comba_17.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL17
+#if defined(TFM_MUL17) && FP_SIZE >= 34
 void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[34];
@ -149,3 +149,8 @@ void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_20.c
+++ b/src/mul/fp_mul_comba_20.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL20
+#if defined(TFM_MUL20) && FP_SIZE >= 40
 void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[40];
@ -173,3 +173,8 @@ void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_24.c
+++ b/src/mul/fp_mul_comba_24.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL24
+#if defined(TFM_MUL24) && FP_SIZE >= 48
 void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[48];
@ -205,3 +205,8 @@ void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_28.c
+++ b/src/mul/fp_mul_comba_28.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL28
+#if defined(TFM_MUL28) && FP_SIZE >= 56
 void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[56];
@ -237,3 +237,8 @@ void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_3.c
+++ b/src/mul/fp_mul_comba_3.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL3
+#if defined(TFM_MUL3) && FP_SIZE >= 6
 void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[6];
@ -37,3 +37,8 @@ void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_32.c
+++ b/src/mul/fp_mul_comba_32.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL32
+#if defined(TFM_MUL32) && FP_SIZE >= 64
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[64];
@ -283,3 +283,8 @@ void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_4.c
+++ b/src/mul/fp_mul_comba_4.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL4
+#if defined(TFM_MUL4) && FP_SIZE >= 8
 void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[8];
@ -45,3 +45,8 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_48.c
+++ b/src/mul/fp_mul_comba_48.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL48
+#if defined(TFM_MUL48) && FP_SIZE >= 96
 void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[96];
@ -397,3 +397,8 @@ void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_6.c
+++ b/src/mul/fp_mul_comba_6.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL6
+#if defined(TFM_MUL6) && FP_SIZE >= 12
 void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[12];
@ -61,3 +61,8 @@ void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_64.c
+++ b/src/mul/fp_mul_comba_64.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL64
+#if defined(TFM_MUL64) && FP_SIZE >= 128
 void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[128];
@ -525,3 +525,8 @@ void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_7.c
+++ b/src/mul/fp_mul_comba_7.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL7
+#if defined(TFM_MUL7) && FP_SIZE >= 14
 void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[14];
@ -69,3 +69,8 @@ void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_8.c
+++ b/src/mul/fp_mul_comba_8.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL8
+#if defined(TFM_MUL8) && FP_SIZE >= 16
 void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[16];
@ -77,3 +77,8 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_9.c
+++ b/src/mul/fp_mul_comba_9.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_mul_comba.c"
-#ifdef TFM_MUL9
+#if defined(TFM_MUL9) && FP_SIZE >= 18
 void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C)
 {
   fp_digit c0, c1, c2, at[18];
@ -85,3 +85,8 @@ void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_comba_small_set.c
+++ b/src/mul/fp_mul_comba_small_set.c
@ -1226,3 +1226,8 @@ void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)
 }
 #endif
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/mul/fp_mul_d.c
+++ b/src/mul/fp_mul_d.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = a * b */
 void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
--- a/src/mul/fp_mulmod.c
+++ b/src/mul/fp_mulmod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* d = a * b (mod c) */
 int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 {
--- a/src/numtheory/fp_gcd.c
+++ b/src/numtheory/fp_gcd.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = (a, b) */
 void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
--- a/src/numtheory/fp_invmod.c
+++ b/src/numtheory/fp_invmod.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
 {
--- a/src/numtheory/fp_isprime.c
+++ b/src/numtheory/fp_isprime.c
@ -7,71 +7,11 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* a few primes */
 static const fp_digit primes[256] = {
  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
 };
 int fp_isprime(fp_int *a)
 {
-   fp_int   b;
+  return fp_isprime_ex(a, 8);
   fp_digit d;
   int      r, res;
   /* do trial division */
   for (r = 0; r < 256; r++) {
       fp_mod_d(a, primes[r], &d);
       if (d == 0) {
          return FP_NO;
       }
   }
   /* now do 8 miller rabins */
   fp_init(&b);
   for (r = 0; r < 8; r++) {
       fp_set(&b, primes[r]);
       fp_prime_miller_rabin(a, &b, &res);
       if (res == FP_NO) {
          return FP_NO;
       }
   }
   return FP_YES;
 }
 /* $Source$ */
--- a/src/numtheory/fp_isprime_ex.c
+++ b/src/numtheory/fp_isprime_ex.c
@ -0,0 +1,83 @@
 /* TomsFastMath, a fast ISO C bignum library.
 *
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
 #include <tfm_private.h>
 /* Table of small primes in ascending order, 0x0002 through 0x0653
  * (32 rows of 8 = 256 initializers).  fp_isprime_ex() below uses the entries
  * as trial divisors and the first 't' of them as Miller-Rabin bases. */
 static const fp_digit primes[FP_PRIME_SIZE] = {
  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
 };
 /* Probabilistic primality test: trial division followed by Miller-Rabin.
  *
  * a - number to test
  * t - number of Miller-Rabin rounds; round r uses primes[r] as its base,
  *     so t must lie in 1..FP_PRIME_SIZE
  *
  * Returns FP_YES when 'a' is itself one of the table primes, or when no
  * table prime divides it and it passes 't' Miller-Rabin rounds.  Returns
  * FP_NO otherwise, including when 't' is out of range or a < 2.
  */
 int fp_isprime_ex(fp_int *a, int t)
 {
   fp_int   b;
   fp_digit d;
   int      r, res;
   if (t <= 0 || t > FP_PRIME_SIZE) {
     return FP_NO;
   }
   /* 0, 1 and negative values are never prime */
   if (fp_cmp_d(a, 1) != FP_GT) {
     return FP_NO;
   }
   /* do trial division */
   for (r = 0; r < FP_PRIME_SIZE; r++) {
       fp_mod_d(a, primes[r], &d);
       if (d == 0) {
          /* divisible by primes[r]: 'a' is prime only if it *is* primes[r] */
          return (fp_cmp_d(a, primes[r]) == FP_EQ) ? FP_YES : FP_NO;
       }
   }
   /* now do 't' miller rabins */
   fp_init(&b);
   for (r = 0; r < t; r++) {
       fp_set(&b, primes[r]);
       fp_prime_miller_rabin(a, &b, &res);
       if (res == FP_NO) {
          return FP_NO;
       }
   }
   return FP_YES;
 }
 /* $Source$ */
 /* $Revision$ */
 /* $Date$ */
--- a/src/numtheory/fp_lcm.c
+++ b/src/numtheory/fp_lcm.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* c = [a, b] */
 void fp_lcm(fp_int *a, fp_int *b, fp_int *c)
--- a/src/numtheory/fp_prime_miller_rabin.c
+++ b/src/numtheory/fp_prime_miller_rabin.c
@ -7,7 +7,7 @@
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* Miller-Rabin test of "a" to the base of "b" as described in 
 * HAC pp. 139 Algorithm 4.24
--- a/src/numtheory/fp_prime_random_ex.c
+++ b/src/numtheory/fp_prime_random_ex.c
@ -7,7 +7,7 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* This is possibly the mother of all prime generation functions, muahahahahaha! */
 int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback cb, void *dat)
@ -16,7 +16,7 @@ int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback
   int res, err, bsize, maskOR_msb_offset;
   /* sanity check the input */
-   if (size <= 1 || t <= 0) {
+   if (size <= 1 || cb == NULL || t <= 0 || t > FP_PRIME_SIZE) {
      return FP_VAL;
   }
@ -35,7 +35,7 @@ int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback
   }
   /* calc the maskAND value for the MSbyte*/
-   maskAND = 0xFF >> (8 - (size & 7));
+   maskAND = 0xFF >> ((8 - (size & 7)) & 7);
   /* calc the maskOR_msb */
   maskOR_msb        = 0;
@ -71,7 +71,7 @@ int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback
      fp_read_unsigned_bin(a, tmp, bsize);
      /* is it prime? */
-      res = fp_isprime(a);
+      res = fp_isprime_ex(a, t);
      if (res == FP_NO) continue;
      if (flags & TFM_PRIME_SAFE) {
@ -80,7 +80,7 @@ int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback
         fp_div_2(a, a);
         /* is it prime? */
-         res = fp_isprime(a);
+         res = fp_isprime_ex(a, t);
      }
   } while (res == FP_NO);
--- a/src/sqr/fp_sqr.c
+++ b/src/sqr/fp_sqr.c
@ -7,114 +7,120 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 /* b = a*a  */
 void fp_sqr(fp_int *A, fp_int *B)
 {
-    int     y;
+    int     y, old_used;
+    old_used = B->used;
    /* call generic if we're out of range */
    if (A->used + A->used > FP_SIZE) {
       fp_sqr_comba(A, B);
-       return ;
+       goto clean;
    }
    y = A->used;
-#if defined(TFM_SQR3)
+#if defined(TFM_SQR3) && FP_SIZE >= 6
        if (y <= 3) {
           fp_sqr_comba3(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR4)
+#if defined(TFM_SQR4) && FP_SIZE >= 8
        if (y == 4) {
           fp_sqr_comba4(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR6)
+#if defined(TFM_SQR6) && FP_SIZE >= 12
        if (y <= 6) {
           fp_sqr_comba6(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR7)
+#if defined(TFM_SQR7) && FP_SIZE >= 14
        if (y == 7) {
           fp_sqr_comba7(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR8)
+#if defined(TFM_SQR8) && FP_SIZE >= 16
        if (y == 8) {
           fp_sqr_comba8(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR9)
+#if defined(TFM_SQR9) && FP_SIZE >= 18
        if (y == 9) {
           fp_sqr_comba9(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR12)
+#if defined(TFM_SQR12) && FP_SIZE >= 24
        if (y <= 12) {
           fp_sqr_comba12(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR17)
+#if defined(TFM_SQR17) && FP_SIZE >= 34
        if (y <= 17) {
           fp_sqr_comba17(A,B);
-           return;
+           goto clean;
        }
 #endif
 #if defined(TFM_SMALL_SET)
        if (y <= 16) {
           fp_sqr_comba_small(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR20)
+#if defined(TFM_SQR20) && FP_SIZE >= 40
        if (y <= 20) {
           fp_sqr_comba20(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR24)
+#if defined(TFM_SQR24) && FP_SIZE >= 48
        if (y <= 24) {
           fp_sqr_comba24(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR28)
+#if defined(TFM_SQR28) && FP_SIZE >= 56
        if (y <= 28) {
           fp_sqr_comba28(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR32)
+#if defined(TFM_SQR32) && FP_SIZE >= 64
        if (y <= 32) {
           fp_sqr_comba32(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR48)
+#if defined(TFM_SQR48) && FP_SIZE >= 96
        if (y <= 48) {
           fp_sqr_comba48(A,B);
-           return;
+           goto clean;
        }
 #endif
-#if defined(TFM_SQR64)
+#if defined(TFM_SQR64) && FP_SIZE >= 128
        if (y <= 64) {
           fp_sqr_comba64(A,B);
-           return;
+           goto clean;
        }
 #endif
       fp_sqr_comba(A, B);
+clean:
+    for (y = B->used; y < old_used; y++) {
+       B->dp[y] = 0;
+    }
 }
-/* $Source$ */
+/* $Source: /cvs/libtom/tomsfastmath/src/sqr/fp_sqr.c,v $ */
-/* $Revision$ */
+/* $Revision: 1.1 $ */
-/* $Date$ */
+/* $Date: 2006/12/31 21:25:53 $ */
--- a/src/sqr/fp_sqr_comba.c
+++ b/src/sqr/fp_sqr_comba.c
@ -7,7 +7,7 @@
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
-#include <tfm.h>
+#include <tfm_private.h>
 #if defined(TFM_PRESCOTT) && defined(TFM_SSE2)
   #undef TFM_SSE2
@ -41,7 +41,7 @@ asm(                                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
 #define SQRADD2(i, j)                                     \
 asm(                                            \
@ -53,16 +53,16 @@ asm(                                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
 #define SQRADDSC(i, j)                                    \
 asm(                                                     \
-     "movl  %6,%%eax     \n\t"                            \
+     "movl  %3,%%eax     \n\t"                            \
-     "mull  %7           \n\t"                            \
+     "mull  %4           \n\t"                            \
     "movl  %%eax,%0     \n\t"                            \
     "movl  %%edx,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
 #define SQRADDAC(i, j)                                    \
 asm(                                                     \
@ -71,7 +71,7 @@ asm(                                                     \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
 #define SQRADDDB                                          \
 asm(                                                     \
@ -81,7 +81,7 @@ asm(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
 #elif defined(TFM_X86_64)
 /* x86-64 optimized */
@ -109,7 +109,7 @@ asm(                                                     \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");
 #define SQRADD2(i, j)                                     \
 asm(                                                     \
@ -121,16 +121,16 @@ asm(                                                     \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
 #define SQRADDSC(i, j)                                    \
 asm(                                                     \
-     "movq  %6,%%rax     \n\t"                            \
+     "movq  %3,%%rax     \n\t"                            \
-     "mulq  %7           \n\t"                            \
+     "mulq  %4           \n\t"                            \
     "movq  %%rax,%0     \n\t"                            \
     "movq  %%rdx,%1     \n\t"                            \
     "xorq  %2,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
 #define SQRADDAC(i, j)                                                         \
 asm(                                                     \
@ -139,7 +139,7 @@ asm(                                                     \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
 #define SQRADDDB                                          \
 asm(                                                     \
@ -149,7 +149,7 @@ asm(                                                     \
     "addq %6,%0         \n\t"                            \
     "adcq %7,%1         \n\t"                            \
     "adcq %8,%2         \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
 #elif defined(TFM_SSE2)
@ -181,7 +181,7 @@ asm(                                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
 #define SQRADD2(i, j)                                     \
 asm(                                            \
@ -197,7 +197,7 @@ asm(                                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
 #define SQRADDSC(i, j)                                                         \
 asm(                                            \
@ -221,7 +221,7 @@ asm(                                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
-     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","cc");
 #define SQRADDDB                                          \
 asm(                                                     \
@ -231,7 +231,7 @@ asm(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
-     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
+     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
 #elif defined(TFM_ARM)
@ -260,7 +260,7 @@ asm(                                                             \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
 /* for squaring some of the terms are doubled... */
 #define SQRADD2(i, j)                                            \
@ -272,13 +272,13 @@ asm(                                                             \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
 #define SQRADDSC(i, j)                                           \
 asm(                                                             \
 "  UMULL  %0,%1,%6,%7              \n\t"                         \
 "  SUB    %2,%2,%2                 \n\t"                         \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "cc");
 #define SQRADDAC(i, j)                                           \
 asm(                                                             \
@ -286,7 +286,7 @@ asm(                                                             \
 "  ADDS   %0,%0,r0                 \n\t"                         \
 "  ADCS   %1,%1,r1                 \n\t"                         \
 "  ADC    %2,%2,#0                 \n\t"                         \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
 #define SQRADDDB                                                 \
 asm(                                                             \
@ -296,7 +296,7 @@ asm(                                                             \
 "  ADDS  %0,%0,%3                     \n\t"                      \
 "  ADCS  %1,%1,%4                     \n\t"                      \
 "  ADC   %2,%2,%5                     \n\t"                      \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
 #elif defined(TFM_PPC32)
@ -326,7 +326,7 @@ asm(                             \
   " mulhwu 16,%6,%6       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
 /* for squaring some of the terms are doubled... */
 #define SQRADD2(i, j)            \
@ -339,14 +339,14 @@ asm(                             \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
 #define SQRADDSC(i, j)            \
 asm(                              \
   " mullw  %0,%6,%7        \n\t" \
   " mulhwu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
 #define SQRADDAC(i, j)           \
 asm(                             \
@ -355,7 +355,7 @@ asm(                             \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
 #define SQRADDDB                  \
 asm(                              \
@ -365,7 +365,7 @@ asm(                              \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
 #elif defined(TFM_PPC64)
 /* PPC64 */
@ -394,7 +394,7 @@ asm(                             \
   " mulhdu r16,%6,%6       \n\t" \
   " adde   %1,%1,r16       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc");
 /* for squaring some of the terms are doubled... */
 #define SQRADD2(i, j)            \
@ -407,14 +407,14 @@ asm(                             \
   " addc   %0,%0,r16       \n\t" \
   " adde   %1,%1,r17       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc");
 #define SQRADDSC(i, j)            \
 asm(                              \
   " mulld  %0,%6,%7        \n\t" \
   " mulhdu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
 #define SQRADDAC(i, j)           \
 asm(                             \
@ -423,7 +423,7 @@ asm(                             \
   " mulhdu r16,%6,%7       \n\t" \
   " adde   %1,%1,r16       \n\t" \
   " addze  %2,%2          \n\t" \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc");
 #define SQRADDDB                  \
 asm(                              \
@ -433,7 +433,7 @@ asm(                              \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
 #elif defined(TFM_AVR32)
@ -501,7 +501,7 @@ asm(                              \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
-:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
 #elif defined(TFM_MIPS)
@ -571,7 +571,7 @@ asm(                              \
   " mflo   %0             \n\t"  \
   " mfhi   %1             \n\t"  \
   " xor    %2,%2,%2       \n\t"  \
-:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
+:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
 #define SQRADDAC(i, j)           \
 asm(                             \
--- a/src/sqr/fp_sqr_comba_12.c
+++ b/src/sqr/fp_sqr_comba_12.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_sqr_comba.c"
-#ifdef TFM_SQR12
+#if defined(TFM_SQR12) && FP_SIZE >= 24
 void fp_sqr_comba12(fp_int *A, fp_int *B)
 {
   fp_digit *a, b[24], c0, c1, c2, sc0, sc1, sc2;
--- a/src/sqr/fp_sqr_comba_17.c
+++ b/src/sqr/fp_sqr_comba_17.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_sqr_comba.c"
-#ifdef TFM_SQR17
+#if defined(TFM_SQR17) && FP_SIZE >= 34
 void fp_sqr_comba17(fp_int *A, fp_int *B)
 {
   fp_digit *a, b[34], c0, c1, c2, sc0, sc1, sc2;
--- a/src/sqr/fp_sqr_comba_20.c
+++ b/src/sqr/fp_sqr_comba_20.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_sqr_comba.c"
-#ifdef TFM_SQR20
+#if defined(TFM_SQR20) && FP_SIZE >= 40
 void fp_sqr_comba20(fp_int *A, fp_int *B)
 {
   fp_digit *a, b[40], c0, c1, c2, sc0, sc1, sc2;
--- a/src/sqr/fp_sqr_comba_24.c
+++ b/src/sqr/fp_sqr_comba_24.c
@ -1,7 +1,7 @@
 #define TFM_DEFINES
 #include "fp_sqr_comba.c"
-#ifdef TFM_SQR24
+#if defined(TFM_SQR24) && FP_SIZE >= 48
 void fp_sqr_comba24(fp_int *A, fp_int *B)
 {
   fp_digit *a, b[48], c0, c1, c2, sc0, sc1, sc2;
--- a/Show More
+++ b/Show More