diff --git a/changes.txt b/changes.txt
index 525e2d4..4fab275 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,9 @@
+October 31st, 2005
+0.06 -- fixed fp_mul() and fp_sqr() to trim digits when overflows would occur.  Produces numerically inprecise results
+        (e.g. the lower FP_SIZE digits) but shouldn't segfault at least ;-)
+     -- Updated the combas so you can turn on and off specific unrolled loops at build time 
+     -- Michael Heyman reported a bug in s_fp_sub() that was pretty substantial and a bug in fp_montgomery_calc_normalization().  Fixed.
+
 August 1st, 2005
 0.05 -- Quick fix to the fp_invmod.c code to let it handle even moduli [required for LTC]
      -- Added makefile.shared to make shared objects [required for LTC]
diff --git a/demo/test.c b/demo/test.c
index bf1c288..d79759c 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -283,7 +283,8 @@ sqrtime:
 //#else
 monttime:
   printf("Montgomery:\n");
-  for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
+  for (t = 2; t <= (FP_SIZE/2)-4; t += 2) {
+//      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
       fp_zero(&a);
       for (ix = 0; ix < t; ix++) {
           a.dp[ix] = ix | 1;
@@ -343,6 +344,9 @@ expttime:
 return;
 testing:
 
+  fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); fp_zero(&a);
+
+
    div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
    sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;
 
diff --git a/doc/tfm.pdf b/doc/tfm.pdf
index a85837d..1f6b759 100644
Binary files a/doc/tfm.pdf and b/doc/tfm.pdf differ
diff --git a/fp_montgomery_calc_normalization.c b/fp_montgomery_calc_normalization.c
index 1ac28b7..2949f2b 100644
--- a/fp_montgomery_calc_normalization.c
+++ b/fp_montgomery_calc_normalization.c
@@ -18,6 +18,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
 
   /* how many bits of last digit does b use */
   bits = fp_count_bits (b) % DIGIT_BIT;
+  if (!bits) bits = DIGIT_BIT;
 
   /* compute A = B^(n-1) * 2^(bits-1) */
   if (b->used > 1) {
diff --git a/fp_montgomery_reduce.c b/fp_montgomery_reduce.c
index 53d6773..b0c2305 100644
--- a/fp_montgomery_reduce.c
+++ b/fp_montgomery_reduce.c
@@ -64,6 +64,8 @@ asm(                                                      \
 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
 : "%rax", "%rdx", "%cc")
 
+#ifdef TFM_HUGE
+
 #define INNERMUL8 \
  asm(                  \
  "movq 0(%5),%%rax    \n\t"  \
@@ -157,6 +159,8 @@ asm(                                                      \
 : "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
 : "%rax", "%rdx", "%r10", "%r11", "%cc")
 
+#endif
+
 
 #define PROPCARRY                           \
 asm(                                        \
@@ -306,6 +310,11 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
    fp_digit c[FP_SIZE], *_c, *tmpm, mu;
    int      oldused, x, y, pa;
 
+   /* bail if too large */
+   if (m->used > (FP_SIZE/2)) {
+      return;
+   }
+
 #if defined(USE_MEMSET)
    /* now zero the buff */
    memset(c, 0, sizeof c);
@@ -331,7 +340,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
        _c   = c + x;
        tmpm = m->dp;
        y = 0;
-       #if defined(TFM_X86_64)
+       #if defined(TFM_X86_64) && defined(TFM_HUGE)
            for (; y < (pa & ~7); y += 8) {
               INNERMUL8;
               _c   += 8;
diff --git a/fp_mul.c b/fp_mul.c
index 71c5f47..a8b42da 100644
--- a/fp_mul.c
+++ b/fp_mul.c
@@ -15,6 +15,12 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
     int    r, y, yy, s;
     fp_int ac, bd, comp, amb, cmd, t1, t2;
 
+    /* call generic if we're out of range */
+    if (A->used + B->used > FP_SIZE) {
+       fp_mul_comba(A, B, C);
+       return ;
+    }
+
      y  = MAX(A->used, B->used);
      yy = MIN(A->used, B->used);
      if (yy <= 8 || y <= 64) {
@@ -31,11 +37,15 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 #elif defined(TFM_HUGE)
         if (0) { 1;
 #endif
-#if defined(TFM_HUGE)
+#if defined(TFM_MUL32)
         } else if (y <= 32) {
            fp_mul_comba32(A,B,C);
+#endif
+#if defined(TFM_MUL48)
         } else if (y <= 48) {
            fp_mul_comba48(A,B,C);
+#endif
+#if defined(TFM_MUL64)
         } else if (y <= 64) {
            fp_mul_comba64(A,B,C);
 #endif
diff --git a/fp_mul_comba.c b/fp_mul_comba.c
index cf51eba..8aa99cc 100644
--- a/fp_mul_comba.c
+++ b/fp_mul_comba.c
@@ -47,7 +47,7 @@
 
 /* this should multiply i and j  */
 #define MULADD(i, j)                                      \
-asm(                                                     \
+asm(                                                      \
      "movl  %6,%%eax     \n\t"                            \
      "mull  %7           \n\t"                            \
      "addl  %%eax,%0     \n\t"                            \
@@ -266,8 +266,8 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 
   dst->used = pa;
+  dst->sign = A->sign ^ B->sign;
   fp_clamp(dst);
-  dst->sign = dst->used ? A->sign ^ B->sign : FP_ZPOS;
   fp_copy(dst, C);
 }
 
@@ -1497,8 +1497,7 @@ void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)
 
 #endif
 
-#ifdef TFM_HUGE
-
+#ifdef TFM_MUL32
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
 {
    fp_digit c0, c1, c2, at[64];
@@ -1765,7 +1764,9 @@ void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
    fp_clamp(C);
    COMBA_FINI;
 }
+#endif
 
+#ifdef TFM_MUL64
 void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
 {
    fp_digit c0, c1, c2, at[128];
@@ -2288,7 +2289,9 @@ void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
    fp_clamp(C);
    COMBA_FINI;
 }
+#endif
 
+#ifdef TFM_MUL48
 void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
 {
    fp_digit c0, c1, c2, at[96];
@@ -2683,8 +2686,6 @@ void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
    fp_clamp(C);
    COMBA_FINI;
 }
-
-
 #endif
 
 
diff --git a/fp_set.c b/fp_set.c
index e8c849b..39f4cf4 100644
--- a/fp_set.c
+++ b/fp_set.c
@@ -13,7 +13,7 @@ void fp_set(fp_int *a, fp_digit b)
 {
    fp_zero(a);
    a->dp[0] = b;
-   a->used  = b ? 1 : 0;
+   a->used  = a->dp[0] ? 1 : 0;
 }
 
 /* $Source$ */
diff --git a/fp_sqr.c b/fp_sqr.c
index da3c944..c95fe7b 100644
--- a/fp_sqr.c
+++ b/fp_sqr.c
@@ -15,6 +15,12 @@ void fp_sqr(fp_int *A, fp_int *B)
     int    r, y, s;
     fp_int aa, bb, comp, amb, t1;
 
+    /* call generic if we're out of range */
+    if (A->used + A->used > FP_SIZE) {
+       fp_sqr_comba(A, B);
+       return ;
+    }
+
     y = A->used;
     if (y <= 64) { 
 
@@ -24,11 +30,15 @@ void fp_sqr(fp_int *A, fp_int *B)
 #elif defined(TFM_HUGE)
         if (0) { 1; 
 #endif
-#if defined(TFM_HUGE)
+#if defined(TFM_SQR32)
         } else if (y <= 32) {
            fp_sqr_comba32(A,B);
+#endif
+#if defined(TFM_SQR48)
         } else if (y <= 48) {
            fp_sqr_comba48(A,B);
+#endif
+#if defined(TFM_SQR64)
         } else if (y <= 64) {
            fp_sqr_comba64(A,B);
 #endif
diff --git a/fp_sqr_comba.c b/fp_sqr_comba.c
index d5a39e5..443f910 100644
--- a/fp_sqr_comba.c
+++ b/fp_sqr_comba.c
@@ -1945,7 +1945,7 @@ void fp_sqr_comba_small(fp_int *A, fp_int *B)
 #endif /* TFM_SMALL_SET */
 
 
-#ifdef TFM_HUGE
+#ifdef TFM_SQR32
 void fp_sqr_comba32(fp_int *A, fp_int *B)
 {
    fp_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
@@ -2272,16 +2272,14 @@ void fp_sqr_comba32(fp_int *A, fp_int *B)
    COMBA_STORE2(b[63]);
    COMBA_FINI;
 
+   memcpy(B->dp, b, 64 * sizeof(fp_digit));
    B->used = 64;
    B->sign = FP_ZPOS;
-   memcpy(B->dp, b, 64 * sizeof(fp_digit));
    fp_clamp(B);
 }
-
-
 #endif
 
-#ifdef TFM_HUGE
+#ifdef TFM_SQR64
 void fp_sqr_comba64(fp_int *A, fp_int *B)
 {
    fp_digit *a, b[128], c0, c1, c2, sc0, sc1, sc2;
@@ -2933,7 +2931,9 @@ void fp_sqr_comba64(fp_int *A, fp_int *B)
    memcpy(B->dp, b, 128 * sizeof(fp_digit));
    fp_clamp(B);
 }
+#endif
 
+#ifdef TFM_SQR48
 void fp_sqr_comba48(fp_int *A, fp_int *B)
 {
    fp_digit *a, b[96], c0, c1, c2, sc0, sc1, sc2;
@@ -2985,7 +2985,7 @@ void fp_sqr_comba48(fp_int *A, fp_int *B)
 
    /* output 8 */
    CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
+         SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
    COMBA_STORE(b[8]);
 
    /* output 9 */
@@ -3420,9 +3420,9 @@ void fp_sqr_comba48(fp_int *A, fp_int *B)
    COMBA_STORE2(b[95]);
    COMBA_FINI;
 
+   memcpy(B->dp, b, 96 * sizeof(fp_digit));
    B->used = 96;
    B->sign = FP_ZPOS;
-   memcpy(B->dp, b, 96 * sizeof(fp_digit));
    fp_clamp(B);
 }
 
diff --git a/makefile b/makefile
index a9ad6b2..b74b0bb 100644
--- a/makefile
+++ b/makefile
@@ -1,7 +1,7 @@
 #makefile for TomsFastMath
 #
 #
-VERSION=0.05
+VERSION=0.06
 
 CFLAGS += -Wall -W -Wshadow -I./ 
 
@@ -85,7 +85,7 @@ install: $(LIBNAME)
 	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
 
 mtest/mtest: mtest/mtest.c
-	cd mtest ; make mtest
+	cd mtest ; CFLAGS="$(CFLAGS) -I../" make mtest
 
 test: $(LIBNAME) demo/test.o mtest/mtest
 	$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test
@@ -143,5 +143,5 @@ zipup: no_oops docs clean
 	zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/*
 
 # $Source: /cvs/libtom/tomsfastmath/makefile,v $ 
-# $Revision: 1.17 $ 
-# $Date: 2005/07/30 04:23:55 $ 
+# $Revision: 1.19 $ 
+# $Date: 2005/08/25 23:53:40 $ 
diff --git a/makefile.shared b/makefile.shared
index 74fa873..2ccac42 100644
--- a/makefile.shared
+++ b/makefile.shared
@@ -1,6 +1,7 @@
 #makefile for TomsFastMath
 #
 #
+VERSION=0:6
 
 CC=libtool --mode=compile gcc
 
@@ -19,7 +20,6 @@ CFLAGS += -fomit-frame-pointer
 
 endif
 
-VERSION=0:5
 
 OBJECTS = \
 fp_set.o \
@@ -81,12 +81,12 @@ endif
 
 default: $(LIBNAME)
 
+objs: $(OBJECTS)
+
 $(LIBNAME): $(OBJECTS)
+	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
 
 install: $(LIBNAME)
-	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
-	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]o" | xargs` -o $(LIBNAME_S)
-	ranlib $(LIBNAME_S)
 	libtool --silent --mode=install install -c $(LIBNAME) $(LIBPATH)/$(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
 	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
@@ -104,6 +104,6 @@ stest: $(LIBNAME) demo/stest.o
 	$(CC) $(CFLAGS) demo/stest.o $(LIBNAME_S) -o stest
 
 # $Source: /cvs/libtom/tomsfastmath/makefile.shared,v $ 
-# $Revision: 1.4 $ 
-# $Date: 2005/07/28 03:08:35 $ 
+# $Revision: 1.7 $ 
+# $Date: 2005/10/06 23:31:17 $ 
 
diff --git a/mtest/mtest.c b/mtest/mtest.c
index 02fd483..66a2621 100644
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@@ -38,6 +38,8 @@ mulmod
 #include <stdlib.h>
 #include <time.h>
 #include <tommath.h>
+#define CRYPT
+#include "../tfm.h"
 
 FILE *rng;
 
@@ -47,7 +49,7 @@ void rand_num(mp_int *a)
    int n, size;
    unsigned char buf[2048];
 
-   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 256;
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % (FP_MAX_SIZE/16 - DIGIT_BIT/2);
    buf[0] = (fgetc(rng)&1)?1:0;
    fread(buf+1, 1, size, rng);
    while (buf[1] == 0) buf[1] = fgetc(rng);
@@ -60,7 +62,7 @@ void rand_num2(mp_int *a)
    int n, size;
    unsigned char buf[2048];
 
-   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 256;
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % (FP_MAX_SIZE/16 - DIGIT_BIT/2);
    buf[0] = (fgetc(rng)&1)?1:0;
    fread(buf+1, 1, size, rng);
    while (buf[1] == 0) buf[1] = fgetc(rng);
@@ -118,7 +120,6 @@ int main(void)
       }
 #endif
        n = fgetc(rng) % 16;
-
    if (n == 0) {
        /* add tests */
        rand_num(&a);
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index 132c1e2..4a8a033 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -1663,6 +1663,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
 
   /* how many bits of last digit does b use */
   bits = fp_count_bits (b) % DIGIT_BIT;
+  if (!bits) bits = DIGIT_BIT;
 
   /* compute A = B^(n-1) * 2^(bits-1) */
   if (b->used > 1) {
@@ -1755,6 +1756,8 @@ asm(                                                      \
 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
 : "%rax", "%rdx", "%cc")
 
+#ifdef TFM_HUGE
+
 #define INNERMUL8 \
  asm(                  \
  "movq 0(%5),%%rax    \n\t"  \
@@ -1848,6 +1851,8 @@ asm(                                                      \
 : "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
 : "%rax", "%rdx", "%r10", "%r11", "%cc")
 
+#endif
+
 
 #define PROPCARRY                           \
 asm(                                        \
@@ -1997,6 +2002,11 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
    fp_digit c[FP_SIZE], *_c, *tmpm, mu;
    int      oldused, x, y, pa;
 
+   /* bail if too large */
+   if (m->used > (FP_SIZE/2)) {
+      return;
+   }
+
 #if defined(USE_MEMSET)
    /* now zero the buff */
    memset(c, 0, sizeof c);
@@ -2022,7 +2032,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
        _c   = c + x;
        tmpm = m->dp;
        y = 0;
-       #if defined(TFM_X86_64)
+       #if defined(TFM_X86_64) && defined(TFM_HUGE)
            for (; y < (pa & ~7); y += 8) {
               INNERMUL8;
               _c   += 8;
@@ -2140,6 +2150,12 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
     int    r, y, yy, s;
     fp_int ac, bd, comp, amb, cmd, t1, t2;
 
+    /* call generic if we're out of range */
+    if (A->used + B->used > FP_SIZE) {
+       fp_mul_comba(A, B, C);
+       return ;
+    }
+
      y  = MAX(A->used, B->used);
      yy = MIN(A->used, B->used);
      if (yy <= 8 || y <= 64) {
@@ -2156,11 +2172,15 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 #elif defined(TFM_HUGE)
         if (0) { 1;
 #endif
-#if defined(TFM_HUGE)
+#if defined(TFM_MUL32)
         } else if (y <= 32) {
            fp_mul_comba32(A,B,C);
+#endif
+#if defined(TFM_MUL48)
         } else if (y <= 48) {
            fp_mul_comba48(A,B,C);
+#endif
+#if defined(TFM_MUL64)
         } else if (y <= 64) {
            fp_mul_comba64(A,B,C);
 #endif
@@ -2444,7 +2464,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
 
 /* this should multiply i and j  */
 #define MULADD(i, j)                                      \
-asm(                                                     \
+asm(                                                      \
      "movl  %6,%%eax     \n\t"                            \
      "mull  %7           \n\t"                            \
      "addl  %%eax,%0     \n\t"                            \
@@ -2663,8 +2683,8 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
   COMBA_FINI;
 
   dst->used = pa;
+  dst->sign = A->sign ^ B->sign;
   fp_clamp(dst);
-  dst->sign = dst->used ? A->sign ^ B->sign : FP_ZPOS;
   fp_copy(dst, C);
 }
 
@@ -3894,8 +3914,7 @@ void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)
 
 #endif
 
-#ifdef TFM_HUGE
-
+#ifdef TFM_MUL32
 void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
 {
    fp_digit c0, c1, c2, at[64];
@@ -4162,7 +4181,9 @@ void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
    fp_clamp(C);
    COMBA_FINI;
 }
+#endif
 
+#ifdef TFM_MUL64
 void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
 {
    fp_digit c0, c1, c2, at[128];
@@ -4685,7 +4706,9 @@ void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
    fp_clamp(C);
    COMBA_FINI;
 }
+#endif
 
+#ifdef TFM_MUL48
 void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
 {
    fp_digit c0, c1, c2, at[96];
@@ -5080,8 +5103,6 @@ void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
    fp_clamp(C);
    COMBA_FINI;
 }
-
-
 #endif
 
 
@@ -5657,7 +5678,7 @@ void fp_set(fp_int *a, fp_digit b)
 {
    fp_zero(a);
    a->dp[0] = b;
-   a->used  = b ? 1 : 0;
+   a->used  = a->dp[0] ? 1 : 0;
 }
 
 /* $Source$ */
@@ -5707,6 +5728,12 @@ void fp_sqr(fp_int *A, fp_int *B)
     int    r, y, s;
     fp_int aa, bb, comp, amb, t1;
 
+    /* call generic if we're out of range */
+    if (A->used + A->used > FP_SIZE) {
+       fp_sqr_comba(A, B);
+       return ;
+    }
+
     y = A->used;
     if (y <= 64) { 
 
@@ -5716,11 +5743,15 @@ void fp_sqr(fp_int *A, fp_int *B)
 #elif defined(TFM_HUGE)
         if (0) { 1; 
 #endif
-#if defined(TFM_HUGE)
+#if defined(TFM_SQR32)
         } else if (y <= 32) {
            fp_sqr_comba32(A,B);
+#endif
+#if defined(TFM_SQR48)
         } else if (y <= 48) {
            fp_sqr_comba48(A,B);
+#endif
+#if defined(TFM_SQR64)
         } else if (y <= 64) {
            fp_sqr_comba64(A,B);
 #endif
@@ -7761,7 +7792,7 @@ void fp_sqr_comba_small(fp_int *A, fp_int *B)
 #endif /* TFM_SMALL_SET */
 
 
-#ifdef TFM_HUGE
+#ifdef TFM_SQR32
 void fp_sqr_comba32(fp_int *A, fp_int *B)
 {
    fp_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
@@ -8088,16 +8119,14 @@ void fp_sqr_comba32(fp_int *A, fp_int *B)
    COMBA_STORE2(b[63]);
    COMBA_FINI;
 
+   memcpy(B->dp, b, 64 * sizeof(fp_digit));
    B->used = 64;
    B->sign = FP_ZPOS;
-   memcpy(B->dp, b, 64 * sizeof(fp_digit));
    fp_clamp(B);
 }
-
-
 #endif
 
-#ifdef TFM_HUGE
+#ifdef TFM_SQR64
 void fp_sqr_comba64(fp_int *A, fp_int *B)
 {
    fp_digit *a, b[128], c0, c1, c2, sc0, sc1, sc2;
@@ -8749,7 +8778,9 @@ void fp_sqr_comba64(fp_int *A, fp_int *B)
    memcpy(B->dp, b, 128 * sizeof(fp_digit));
    fp_clamp(B);
 }
+#endif
 
+#ifdef TFM_SQR48
 void fp_sqr_comba48(fp_int *A, fp_int *B)
 {
    fp_digit *a, b[96], c0, c1, c2, sc0, sc1, sc2;
@@ -8801,7 +8832,7 @@ void fp_sqr_comba48(fp_int *A, fp_int *B)
 
    /* output 8 */
    CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
+         SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
    COMBA_STORE(b[8]);
 
    /* output 9 */
@@ -9236,9 +9267,9 @@ void fp_sqr_comba48(fp_int *A, fp_int *B)
    COMBA_STORE2(b[95]);
    COMBA_FINI;
 
+   memcpy(B->dp, b, 96 * sizeof(fp_digit));
    B->used = 96;
    B->sign = FP_ZPOS;
-   memcpy(B->dp, b, 96 * sizeof(fp_digit));
    fp_clamp(B);
 }
 
@@ -9652,11 +9683,11 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
       c->dp[x]   = (fp_digit)t;
       t        >>= DIGIT_BIT;
   }
-  if (t != 0 && x != FP_SIZE) {
+  if (t != 0 && x < FP_SIZE) {
      c->dp[c->used++] = (fp_digit)t;
      ++x;
   }
-  
+  c->used = x;
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
@@ -9684,18 +9715,23 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
 /* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
 {
-  int      x, oldused;
+  int      x, oldbused, oldused;
   fp_word  t;
 
-  oldused = c->used;
-  c->used = a->used;
+  oldused  = c->used;
+  oldbused = b->used;
+  c->used  = a->used;
   t       = 0;
-  for (x = 0; x < a->used; x++) {
-      t         = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t);
-      c->dp[x]  = (fp_digit)t;
-      t         = (t >> DIGIT_BIT) & 1;
+  for (x = 0; x < oldbused; x++) {
+     t         = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t);
+     c->dp[x]  = (fp_digit)t;
+     t         = (t >> DIGIT_BIT)&1;
   }
-  
+  for (; x < a->used; x++) {
+     t         = ((fp_word)a->dp[x]) - t;
+     c->dp[x]  = (fp_digit)t;
+     t         = (t >> DIGIT_BIT);
+   }
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
diff --git a/s_fp_add.c b/s_fp_add.c
index 3a874f2..7b882e3 100644
--- a/s_fp_add.c
+++ b/s_fp_add.c
@@ -25,11 +25,11 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
       c->dp[x]   = (fp_digit)t;
       t        >>= DIGIT_BIT;
   }
-  if (t != 0 && x != FP_SIZE) {
+  if (t != 0 && x < FP_SIZE) {
      c->dp[c->used++] = (fp_digit)t;
      ++x;
   }
-  
+  c->used = x;
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
diff --git a/s_fp_sub.c b/s_fp_sub.c
index 41cc6c4..ffabee8 100644
--- a/s_fp_sub.c
+++ b/s_fp_sub.c
@@ -12,18 +12,23 @@
 /* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
 {
-  int      x, oldused;
+  int      x, oldbused, oldused;
   fp_word  t;
 
-  oldused = c->used;
-  c->used = a->used;
+  oldused  = c->used;
+  oldbused = b->used;
+  c->used  = a->used;
   t       = 0;
-  for (x = 0; x < a->used; x++) {
-      t         = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t);
-      c->dp[x]  = (fp_digit)t;
-      t         = (t >> DIGIT_BIT) & 1;
+  for (x = 0; x < oldbused; x++) {
+     t         = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t);
+     c->dp[x]  = (fp_digit)t;
+     t         = (t >> DIGIT_BIT)&1;
   }
-  
+  for (; x < a->used; x++) {
+     t         = ((fp_word)a->dp[x]) - t;
+     c->dp[x]  = (fp_digit)t;
+     t         = (t >> DIGIT_BIT);
+   }
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
diff --git a/tfm.dvi b/tfm.dvi
index 7766aaa..1b76490 100644
Binary files a/tfm.dvi and b/tfm.dvi differ
diff --git a/tfm.h b/tfm.h
index da91e81..c17a1d0 100644
--- a/tfm.h
+++ b/tfm.h
@@ -37,7 +37,12 @@
    Enable these if you are doing 32, 48 or 64 digit multiplications (useful for RSA)
    Less important on 64-bit machines as 32 digits == 2048 bits
  */
-#define TFM_HUGE
+#define TFM_MUL32
+#define TFM_MUL48
+#define TFM_MUL64
+#define TFM_SQR32
+#define TFM_SQR48
+#define TFM_SQR64
 
 /* do we want some overflow checks
    Not required if you make sure your numbers are within range (e.g. by default a modulus for fp_exptmod() can only be upto 2048 bits long)
@@ -61,7 +66,7 @@
  * You can externally define this or it defaults to 4096-bits [allowing multiplications upto 2048x2048 bits ]
  */
 #ifndef FP_MAX_SIZE
-   #define FP_MAX_SIZE           (4096+(4*DIGIT_BIT))
+   #define FP_MAX_SIZE           (4096+(8*DIGIT_BIT))
 #endif
 
 /* will this lib work? */
diff --git a/tfm.log b/tfm.log
index e8dbba0..282bc8b 100644
--- a/tfm.log
+++ b/tfm.log
@@ -1,4 +1,4 @@
-This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10)  1 AUG 2005 13:34
+This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.9.20)  31 OCT 2005 11:30
 entering extended mode
 **tfm
 (./tfm.tex
@@ -329,4 +329,4 @@ Here is how much of TeX's memory you used:
  580 hyphenation exceptions out of 1000
  25i,9n,25p,195b,321s stack positions out of 1500i,500n,1500p,200000b,5000s
 
-Output written on tfm.dvi (25 pages, 51612 bytes).
+Output written on tfm.dvi (25 pages, 51616 bytes).
diff --git a/tfm.tex b/tfm.tex
index 761d95f..2985500 100644
--- a/tfm.tex
+++ b/tfm.tex
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{TomsFastMath User Manual \\ v0.05}
+\title{TomsFastMath User Manual \\ v0.06}
 \author{Tom St Denis \\ tomstdenis@gmail.com}
 \maketitle
 This text and library are all hereby placed in the public domain.  This book has been formatted for B5