diff --git a/changes.txt b/changes.txt index 5ad2fb7..525e2d4 100644 --- a/changes.txt +++ b/changes.txt @@ -1,3 +1,10 @@ +August 1st, 2005 +0.05 -- Quick fix to the fp_invmod.c code to let it handle even moduli [required for LTC] + -- Added makefile.shared to make shared objects [required for LTC] + -- Improved makefiles to make them way more configurable + -- Added timing resistant fp_exptmod() enabled with TFM_TIMING_RESISTANT + +July 23rd, 2005 0.04 -- Fixed bugs in the SSE2 squaring code -- Rewrote the multipliers to be optimized for small inputs -- Nelson Bolyard of the NSS crew submitted [among other things] new faster Montgomery reduction diff --git a/comba_mont_gen.c b/comba_mont_gen.c index 7b5e6fb..34c9e05 100644 --- a/comba_mont_gen.c +++ b/comba_mont_gen.c @@ -1,59 +1,112 @@ -/* generate montgomery reductions for m->used = 1...16 */ - #include int main(void) { - int N; - - for (N = 1; N <= 16; N++) { - -printf("void fp_montgomery_reduce_%d(fp_int *a, fp_int *m, fp_digit mp)\n", N); + int x, y, z; + printf( +#if 0 +"#ifdef TFM_SMALL_SET\n" +"/* computes x/R == x (mod N) via Montgomery Reduction */\n" +"void fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp)\n" "{\n" -" fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;\n" -" int oldused, x, y;\n" +" fp_digit c[FP_SIZE], *_c, *tmpm, mu, cy;\n" +" int oldused, x, y, pa;\n" "\n" +"#if defined(USE_MEMSET)\n" " /* now zero the buff */\n" -" memset(c, 0, sizeof(c));\n" +" memset(c, 0, sizeof c);\n" +"#endif\n" +" pa = m->used;\n" "\n" " /* copy the input */\n" " oldused = a->used;\n" " for (x = 0; x < oldused; x++) {\n" " c[x] = a->dp[x];\n" " }\n" -"\n" +"#if !defined(USE_MEMSET)\n" +" for (; x < 2*pa+3; x++) {\n" +" c[x] = 0;\n" +" }\n" +"#endif\n" " MONT_START;\n" +#endif "\n" -" /* now let's get bizz-sy! 
*/\n" -" for (x = 0; x < %d; x++) {\n" -" /* get Mu for this round */\n" -" LOOP_START;\n" -"\n" -" /* our friendly neighbourhood alias */\n" -" _c = c + x;\n" -" tmpm = m->dp;\n" -"\n" -" for (y = 0; y < %d; y++) {\n" -" INNERMUL;\n" -" ++_c;\n" -" }\n" -" /* send carry up man... */\n" -" _c = c + x;\n" -" PROPCARRY;\n" -" } \n" -"\n" -" /* fix the rest of the carries */\n" -" _c = c + %d;\n" -" for (x = %d; x < %d * 2 + 2; x++) {\n" -" PROPCARRY;\n" -" ++_c;\n" +" switch (pa) {\n"); + +for (x = 1; x <= 64; x++) { +if (x > 16 && (x != 32 && x != 48 && x != 64)) continue; +if (x > 16) printf("#ifdef TFM_HUGE\n"); + + + +printf(" case %d:\n", x); + +for (y = 0; y < x; y++) { + +printf(" x = %d; cy = 0;\n" + " LOOP_START;\n" + " _c = c + %d;\n" + " tmpm = m->dp;\n", y, y); + +printf("#ifdef INNERMUL8\n"); +for (z = 0; z+8 <= x; z += 8) { +printf(" INNERMUL8; _c += 8; tmpm += 8;\n"); +} +for (; z < x; z++) { +printf(" INNERMUL; ++_c;\n"); +} +printf("#else\n"); +for (z = 0; z < x; z++) { +printf(" INNERMUL; ++_c;\n"); +} +printf("#endif\n"); +printf(" LOOP_END;\n" + " while (cy) {\n" + " PROPCARRY;\n" + " ++_c;\n" + " }\n"); +} +//printf(" }\n"); +printf(" break;\n"); + + + +#define LOOP_MACRO(stride) \ + for (x = 0; x < stride; x++) { \ + fp_digit cy = 0; \ + /* get Mu for this round */ \ + LOOP_START; \ + _c = c + x; \ + tmpm = m->dp; \ + for (y = 0; y < stride; y++) { \ + INNERMUL; \ + ++_c; \ + } \ + LOOP_END; \ + while (cy) { \ + PROPCARRY; \ + ++_c; \ + } \ + } + + + + + +if (x > 16) printf("#endif /* TFM_HUGE */\n"); + + +} + +#if 0 + +printf( " }\n" -"\n" " /* now copy out */\n" -" _c = c + %d;\n" +" _c = c + pa;\n" " tmpm = a->dp;\n" -" for (x = 0; x < %d+1; x++) {\n" +" for (x = 0; x < pa+1; x++) {\n" " *tmpm++ = *_c++;\n" " }\n" "\n" @@ -63,19 +116,17 @@ printf( "\n" " MONT_FINI;\n" "\n" -" a->used = %d+1;\n" +" a->used = pa+1;\n" " fp_clamp(a);\n" "\n" " /* if A >= m then A = A - m */\n" " if (fp_cmp_mag (a, m) != FP_LT) {\n" " s_fp_sub (a, m, a);\n" " 
}\n" -"}\n", N,N,N,N,N,N,N,N); -} +"}\n\n#endif\n"); + +#endif + return 0; } - - - - diff --git a/demo/test.c b/demo/test.c index fa87605..bf1c288 100644 --- a/demo/test.c +++ b/demo/test.c @@ -213,7 +213,7 @@ t1 = TIMFUNC(); sleep(1); printf("Ticks per second: %llu\n", TIMFUNC() - t1); -goto expttime; +goto multtime; /* do some timings... */ printf("Addition:\n"); for (t = 2; t <= FP_SIZE/2; t += 2) { diff --git a/doc/tfm.pdf b/doc/tfm.pdf index 0830505..a85837d 100644 Binary files a/doc/tfm.pdf and b/doc/tfm.pdf differ diff --git a/fp_exptmod.c b/fp_exptmod.c index a4a2e8d..69a06d4 100644 --- a/fp_exptmod.c +++ b/fp_exptmod.c @@ -9,6 +9,75 @@ */ #include +#ifdef TFM_TIMING_RESISTANT + +/* timing resistant montgomery ladder based exptmod + + Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded Systems, CHES 2002 +*/ +static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) +{ + fp_int R[2]; + fp_digit buf, mp; + int err, bitcnt, digidx, y; + + /* now setup montgomery */ + if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) { + return err; + } + + fp_init(&R[0]); + fp_init(&R[1]); + + /* now we need R mod m */ + fp_montgomery_calc_normalization (&R[0], P); + + /* now set R[0][1] to G * R mod m */ + if (fp_cmp_mag(P, G) != FP_GT) { + /* G > P so we reduce it first */ + fp_mod(G, P, &R[1]); + } else { + fp_copy(G, &R[1]); + } + fp_mulmod (&R[1], &R[0], P, &R[1]); + + /* for j = t-1 downto 0 do + r_!k = R0*R1; r_k = r_k^2 + */ + + /* set initial mode and bit cnt */ + bitcnt = 1; + buf = 0; + digidx = X->used - 1; + + for (;;) { + /* grab next digit as required */ + if (--bitcnt == 0) { + /* if digidx == -1 we are out of digits so break */ + if (digidx == -1) { + break; + } + /* read next digit and reset bitcnt */ + buf = X->dp[digidx--]; + bitcnt = (int)DIGIT_BIT; + } + + /* grab the next msb from the exponent */ + y = (fp_digit)(buf >> (DIGIT_BIT - 1)) & 1; + buf <<= (fp_digit)1; + + /* do ops 
*/ + fp_mul(&R[0], &R[1], &R[y^1]); fp_montgomery_reduce(&R[y^1], P, mp); + fp_sqr(&R[y], &R[y]); fp_montgomery_reduce(&R[y], P, mp); + } + + fp_montgomery_reduce(&R[0], P, mp); + fp_copy(&R[0], Y); + return FP_OKAY; +} + +#else + /* y = g**x (mod b) * Some restrictions... x must be positive and < b */ @@ -168,6 +237,8 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) return FP_OKAY; } +#endif + int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) { diff --git a/fp_invmod.c b/fp_invmod.c index 17ddb69..6086029 100644 --- a/fp_invmod.c +++ b/fp_invmod.c @@ -9,6 +9,111 @@ */ #include +static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c) +{ + fp_int x, y, u, v, A, B, C, D; + int res; + + /* b cannot be negative */ + if (b->sign == FP_NEG || fp_iszero(b) == 1) { + return FP_VAL; + } + + /* init temps */ + fp_init(&x); fp_init(&y); + fp_init(&u); fp_init(&v); + fp_init(&A); fp_init(&B); + fp_init(&C); fp_init(&D); + + /* x = a, y = b */ + if ((res = fp_mod(a, b, &x)) != FP_OKAY) { + return res; + } + fp_copy(b, &y); + + /* 2. [modified] if x,y are both even then return an error! */ + if (fp_iseven (&x) == 1 && fp_iseven (&y) == 1) { + return FP_VAL; + } + + /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */ + fp_copy (&x, &u); + fp_copy (&y, &v); + fp_set (&A, 1); + fp_set (&D, 1); + +top: + /* 4. while u is even do */ + while (fp_iseven (&u) == 1) { + /* 4.1 u = u/2 */ + fp_div_2 (&u, &u); + + /* 4.2 if A or B is odd then */ + if (fp_isodd (&A) == 1 || fp_isodd (&B) == 1) { + /* A = (A+y)/2, B = (B-x)/2 */ + fp_add (&A, &y, &A); + fp_sub (&B, &x, &B); + } + /* A = A/2, B = B/2 */ + fp_div_2 (&A, &A); + fp_div_2 (&B, &B); + } + + /* 5. 
while v is even do */ + while (fp_iseven (&v) == 1) { + /* 5.1 v = v/2 */ + fp_div_2 (&v, &v); + + /* 5.2 if C or D is odd then */ + if (fp_isodd (&C) == 1 || fp_isodd (&D) == 1) { + /* C = (C+y)/2, D = (D-x)/2 */ + fp_add (&C, &y, &C); + fp_sub (&D, &x, &D); + } + /* C = C/2, D = D/2 */ + fp_div_2 (&C, &C); + fp_div_2 (&D, &D); + } + + /* 6. if u >= v then */ + if (fp_cmp (&u, &v) != FP_LT) { + /* u = u - v, A = A - C, B = B - D */ + fp_sub (&u, &v, &u); + fp_sub (&A, &C, &A); + fp_sub (&B, &D, &B); + } else { + /* v - v - u, C = C - A, D = D - B */ + fp_sub (&v, &u, &v); + fp_sub (&C, &A, &C); + fp_sub (&D, &B, &D); + } + + /* if not zero goto step 4 */ + if (fp_iszero (&u) == 0) + goto top; + + /* now a = C, b = D, gcd == g*v */ + + /* if v != 1 then there is no inverse */ + if (fp_cmp_d (&v, 1) != FP_EQ) { + return FP_VAL; + } + + /* if its too low */ + while (fp_cmp_d(&C, 0) == FP_LT) { + fp_add(&C, b, &C); + } + + /* too big */ + while (fp_cmp_mag(&C, b) != FP_LT) { + fp_sub(&C, b, &C); + } + + /* C is now the inverse */ + fp_copy(&C, c); + return FP_OKAY; +} + /* c = 1/a (mod b) for odd b only */ int fp_invmod(fp_int *a, fp_int *b, fp_int *c) { @@ -17,7 +122,7 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c) /* 2. 
[modified] b must be odd */ if (fp_iseven (b) == FP_YES) { - return FP_VAL; + return fp_invmod_slow(a,b,c); } /* init all our temps */ diff --git a/fp_montgomery_reduce.c b/fp_montgomery_reduce.c index 185c9f6..53d6773 100644 --- a/fp_montgomery_reduce.c +++ b/fp_montgomery_reduce.c @@ -299,8 +299,6 @@ asm( \ #define LO 0 -#define HI 1 -#define CY 2 /* computes x/R == x (mod N) via Montgomery Reduction */ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) @@ -347,7 +345,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) } LOOP_END; while (cy) { - PROPCARRY; // cy = cy > (*_c += cy); + PROPCARRY; ++_c; } } @@ -374,7 +372,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) } } + /* $Source$ */ /* $Revision$ */ /* $Date$ */ - diff --git a/fp_mul_comba.c b/fp_mul_comba.c index 472b2d3..cf51eba 100644 --- a/fp_mul_comba.c +++ b/fp_mul_comba.c @@ -47,7 +47,7 @@ /* this should multiply i and j */ #define MULADD(i, j) \ -asm ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ @@ -118,7 +118,7 @@ asm ( \ /* this should multiply i and j */ #define MULADD(i, j) \ - asm volatile ( \ + asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ diff --git a/fp_sqr_comba.c b/fp_sqr_comba.c index 9b72493..d5a39e5 100644 --- a/fp_sqr_comba.c +++ b/fp_sqr_comba.c @@ -36,7 +36,7 @@ #define COMBA_FINI #define SQRADD(i, j) \ -asm volatile ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %%eax \n\t" \ "addl %%eax,%0 \n\t" \ @@ -45,7 +45,7 @@ asm volatile ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); #define SQRADD2(i, j) \ -asm volatile ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ @@ -57,7 +57,7 @@ asm volatile ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); #define SQRADDSC(i, j) \ -asm ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "movl %%eax,%0 \n\t" \ @@ -66,7 +66,7 @@ asm ( \ 
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); #define SQRADDAC(i, j) \ -asm ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ @@ -75,7 +75,7 @@ asm ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); #define SQRADDDB \ -asm ( \ +asm( \ "addl %6,%0 \n\t" \ "adcl %7,%1 \n\t" \ "adcl %8,%2 \n\t" \ @@ -104,7 +104,7 @@ asm ( \ #define COMBA_FINI #define SQRADD(i, j) \ -asm ( \ +asm( \ "movq %6,%%rax \n\t" \ "mulq %%rax \n\t" \ "addq %%rax,%0 \n\t" \ @@ -113,7 +113,7 @@ asm ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc"); #define SQRADD2(i, j) \ -asm ( \ +asm( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ @@ -125,7 +125,7 @@ asm ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #define SQRADDSC(i, j) \ -asm ( \ +asm( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "movq %%rax,%0 \n\t" \ @@ -134,7 +134,7 @@ asm ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #define SQRADDAC(i, j) \ -asm ( \ +asm( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ @@ -143,7 +143,7 @@ asm ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #define SQRADDDB \ -asm ( \ +asm( \ "addq %6,%0 \n\t" \ "adcq %7,%1 \n\t" \ "adcq %8,%2 \n\t" \ @@ -173,7 +173,7 @@ asm ( \ asm("emms"); #define SQRADD(i, j) \ -asm volatile ( \ +asm( \ "movd %6,%%mm0 \n\t" \ "pmuludq %%mm0,%%mm0\n\t" \ "movd %%mm0,%%eax \n\t" \ @@ -185,7 +185,7 @@ asm volatile ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc"); #define SQRADD2(i, j) \ -asm volatile ( \ +asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ @@ -201,7 +201,7 @@ asm volatile ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), 
"m"(i), "m"(j) :"%eax","%edx","%cc"); #define SQRADDSC(i, j) \ -asm volatile ( \ +asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ @@ -212,7 +212,7 @@ asm volatile ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)); #define SQRADDAC(i, j) \ -asm volatile ( \ +asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ @@ -225,7 +225,7 @@ asm volatile ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc"); #define SQRADDDB \ -asm ( \ +asm( \ "addl %6,%0 \n\t" \ "adcl %7,%1 \n\t" \ "adcl %8,%2 \n\t" \ diff --git a/makefile b/makefile index 74a712c..a9ad6b2 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,13 @@ #makefile for TomsFastMath # # -CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops +VERSION=0.05 + +CFLAGS += -Wall -W -Wshadow -I./ + +ifndef IGNORE_SPEED + +CFLAGS += -O3 -funroll-all-loops #profiling #PROF=-pg -g @@ -10,9 +16,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops #speed CFLAGS += -fomit-frame-pointer -VERSION=0.04 - -default: libtfm.a +endif OBJECTS = \ fp_set.o \ @@ -52,23 +56,29 @@ ifndef INCPATH INCPATH=/usr/include endif -ifndef TFM_GROUP +ifndef INSTALL_GROUP GROUP=wheel +else + GROUP=$(INSTALL_GROUP) endif -ifndef TFM_USER +ifndef INSTALL_USER USER=root +else + USER=$(INSTALL_USER) endif ifndef LIBNAME LIBNAME=libtfm.a endif -$(LIBNAME): $(OBJECTS) - $(AR) $(ARFLAGS) $(LIBNAME) $(OBJECTS) - ranlib $(LIBNAME) +default: $(LIBNAME) -install: libtfm.a +$(LIBNAME): $(OBJECTS) + $(AR) $(ARFLAGS) $@ $(OBJECTS) + ranlib $@ + +install: $(LIBNAME) install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH) install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH) install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH) @@ -77,17 +87,17 @@ install: libtfm.a mtest/mtest: mtest/mtest.c cd mtest ; make mtest -test: libtfm.a demo/test.o mtest/mtest - $(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test +test: 
$(LIBNAME) demo/test.o mtest/mtest + $(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test -timing: libtfm.a demo/test.o - $(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test +timing: $(LIBNAME) demo/test.o + $(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test -stest: libtfm.a demo/stest.o - $(CC) $(CFLAGS) demo/stest.o libtfm.a -o stest +stest: $(LIBNAME) demo/stest.o + $(CC) $(CFLAGS) demo/stest.o $(LIBNAME) -o stest -rsatest: libtfm.a demo/rsa.o - $(CC) $(CFLAGS) demo/rsa.o libtfm.a -o rsatest +rsatest: $(LIBNAME) demo/rsa.o + $(CC) $(CFLAGS) demo/rsa.o $(LIBNAME) -o rsatest docdvi: tfm.tex touch tfm.ind @@ -101,8 +111,23 @@ docs: docdvi dvipdf tfm mv -f tfm.pdf doc +#This rule cleans the source tree of all compiled code, not including the pdf +#documentation. clean: - rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest *~ rsatest *.gcda *.gcno demo/*.gcda demo/*.gcno mtest/*.gcno mtest/*.gcda + rm -f `find . -type f | grep "[.]o" | xargs` + rm -f `find . -type f | grep "[.]lo" | xargs` + rm -f `find . -type f | grep "[.]a" | xargs` + rm -f `find . -type f | grep "[.]la" | xargs` + rm -f `find . -type f | grep "[.]obj" | xargs` + rm -f `find . -type f | grep "[.]lib" | xargs` + rm -f `find . -type f | grep "[.]exe" | xargs` + rm -f `find . -type f | grep "[.]gcda" | xargs` + rm -f `find . -type f | grep "[.]gcno" | xargs` + rm -f `find . -type f | grep "[.]il" | xargs` + rm -f `find . -type f | grep "[.]dyn" | xargs` + rm -f `find . -type f | grep "[.]dpi" | xargs` + rm -rf `find . 
-type d | grep "[.]libs" | xargs` + rm -f tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc cd mtest ; make clean no_oops: clean @@ -116,3 +141,7 @@ zipup: no_oops docs clean cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \ tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 ; \ zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/* + +# $Source: /cvs/libtom/tomsfastmath/makefile,v $ +# $Revision: 1.17 $ +# $Date: 2005/07/30 04:23:55 $ diff --git a/makefile.gba b/makefile.gba deleted file mode 100644 index 89e3451..0000000 --- a/makefile.gba +++ /dev/null @@ -1,55 +0,0 @@ -#makefile for TomsFastMath -# -#For the GameboyAdance... er.... ARMv4 -SFLAGS = $(CFLAGS) -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -mthumb -mthumb-interwork -I../devkitadv/mylib/lib -CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -marm -mthumb-interwork -I../devkitadv/mylib/lib - -#profiling -#PROF=-pg -g -#CFLAGS += $(PROF) - -#speed -CFLAGS += -fomit-frame-pointer - -VERSION=0.01 - -default: libtfm.a - -OBJECTS = \ -fp_set.o \ -\ -fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \ -fp_mul_2.o fp_div_2.o \ -\ -fp_cnt_lsb.o \ -\ -fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \ -s_fp_add.o s_fp_sub.o \ -\ -fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \ -fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \ -fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \ -fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \ -\ -fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \ -\ -fp_exptmod.o \ -\ -fp_cmp.o fp_cmp_mag.o \ -\ -fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \ -fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \ -fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \ -\ - -libtfm.a: $(OBJECTS) - $(AR) $(ARFLAGS) libtfm.a $(OBJECTS) - ranlib libtfm.a - -demo/stest.o: demo/stest.c - $(CC) $(SFLAGS) -DGBA_MODE 
demo/stest.c -c -o demo/stest.o - -stest: libtfm.a demo/stest.o - $(CC) -mthumb -mthumb-interwork demo/stest.o libtfm.a ../devkitadv/mylib/lib/gba.a -o stest.elf - objcopy -O binary stest.elf stest.bin - diff --git a/makefile.shared b/makefile.shared new file mode 100644 index 0000000..74fa873 --- /dev/null +++ b/makefile.shared @@ -0,0 +1,109 @@ +#makefile for TomsFastMath +# +# + +CC=libtool --mode=compile gcc + +CFLAGS += -Wall -W -Wshadow -I./ + +ifndef IGNORE_SPEED + +CFLAGS += -O3 -funroll-all-loops + +#profiling +#PROF=-pg -g +#CFLAGS += $(PROF) + +#speed +CFLAGS += -fomit-frame-pointer + +endif + +VERSION=0:5 + +OBJECTS = \ +fp_set.o \ +\ +fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \ +fp_mul_2.o fp_div_2.o \ +\ +fp_cnt_lsb.o \ +\ +fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \ +s_fp_add.o s_fp_sub.o \ +\ +fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \ +fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \ +fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \ +fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \ +\ +fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \ +\ +fp_exptmod.o \ +\ +fp_cmp.o fp_cmp_mag.o \ +\ +fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \ +fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \ +fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \ +\ +fp_ident.o + +HEADERS=tfm.h + +ifndef LIBPATH + LIBPATH=/usr/lib +endif + +ifndef INCPATH + INCPATH=/usr/include +endif + +ifndef INSTALL_GROUP + GROUP=wheel +else + GROUP=$(INSTALL_GROUP) +endif + +ifndef INSTALL_USER + USER=root +else + USER=$(INSTALL_USER) +endif + +ifndef LIBNAME + LIBNAME=libtfm.la +endif + +ifndef LIBNAME_S + LIBNAME_S=libtfm.a +endif + +default: $(LIBNAME) + +$(LIBNAME): $(OBJECTS) + +install: $(LIBNAME) + libtool --silent --mode=link gcc $(CFLAGS) `find . 
-type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION) + libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]o" | xargs` -o $(LIBNAME_S) + ranlib $(LIBNAME_S) + libtool --silent --mode=install install -c $(LIBNAME) $(LIBPATH)/$(LIBNAME) + install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH) + install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH) + +mtest/mtest: mtest/mtest.c + cd mtest ; make mtest + +test: $(LIBNAME) demo/test.o mtest/mtest + $(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test + +timing: $(LIBNAME) demo/test.o + $(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test + +stest: $(LIBNAME) demo/stest.o + $(CC) $(CFLAGS) demo/stest.o $(LIBNAME_S) -o stest + +# $Source: /cvs/libtom/tomsfastmath/makefile.shared,v $ +# $Revision: 1.4 $ +# $Date: 2005/07/28 03:08:35 $ + diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c index 08eaaa0..132c1e2 100644 --- a/pre_gen/mpi.c +++ b/pre_gen/mpi.c @@ -757,6 +757,75 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d) */ #include +#ifdef TFM_TIMING_RESISTANT + +/* timing resistant montgomery ladder based exptmod + + Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded Systems, CHES 2002 +*/ +static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) +{ + fp_int R[2]; + fp_digit buf, mp; + int err, bitcnt, digidx, y; + + /* now setup montgomery */ + if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) { + return err; + } + + fp_init(&R[0]); + fp_init(&R[1]); + + /* now we need R mod m */ + fp_montgomery_calc_normalization (&R[0], P); + + /* now set R[0][1] to G * R mod m */ + if (fp_cmp_mag(P, G) != FP_GT) { + /* G > P so we reduce it first */ + fp_mod(G, P, &R[1]); + } else { + fp_copy(G, &R[1]); + } + fp_mulmod (&R[1], &R[0], P, &R[1]); + + /* for j = t-1 downto 0 do + r_!k = R0*R1; r_k = r_k^2 + */ + + /* set initial mode and bit cnt */ + bitcnt = 1; + buf = 0; + digidx = 
X->used - 1; + + for (;;) { + /* grab next digit as required */ + if (--bitcnt == 0) { + /* if digidx == -1 we are out of digits so break */ + if (digidx == -1) { + break; + } + /* read next digit and reset bitcnt */ + buf = X->dp[digidx--]; + bitcnt = (int)DIGIT_BIT; + } + + /* grab the next msb from the exponent */ + y = (fp_digit)(buf >> (DIGIT_BIT - 1)) & 1; + buf <<= (fp_digit)1; + + /* do ops */ + fp_mul(&R[0], &R[1], &R[y^1]); fp_montgomery_reduce(&R[y^1], P, mp); + fp_sqr(&R[y], &R[y]); fp_montgomery_reduce(&R[y], P, mp); + } + + fp_montgomery_reduce(&R[0], P, mp); + fp_copy(&R[0], Y); + return FP_OKAY; +} + +#else + /* y = g**x (mod b) * Some restrictions... x must be positive and < b */ @@ -916,6 +985,8 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) return FP_OKAY; } +#endif + int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) { @@ -1105,6 +1176,111 @@ int main(void) */ #include +static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c) +{ + fp_int x, y, u, v, A, B, C, D; + int res; + + /* b cannot be negative */ + if (b->sign == FP_NEG || fp_iszero(b) == 1) { + return FP_VAL; + } + + /* init temps */ + fp_init(&x); fp_init(&y); + fp_init(&u); fp_init(&v); + fp_init(&A); fp_init(&B); + fp_init(&C); fp_init(&D); + + /* x = a, y = b */ + if ((res = fp_mod(a, b, &x)) != FP_OKAY) { + return res; + } + fp_copy(b, &y); + + /* 2. [modified] if x,y are both even then return an error! */ + if (fp_iseven (&x) == 1 && fp_iseven (&y) == 1) { + return FP_VAL; + } + + /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */ + fp_copy (&x, &u); + fp_copy (&y, &v); + fp_set (&A, 1); + fp_set (&D, 1); + +top: + /* 4. 
while u is even do */ + while (fp_iseven (&u) == 1) { + /* 4.1 u = u/2 */ + fp_div_2 (&u, &u); + + /* 4.2 if A or B is odd then */ + if (fp_isodd (&A) == 1 || fp_isodd (&B) == 1) { + /* A = (A+y)/2, B = (B-x)/2 */ + fp_add (&A, &y, &A); + fp_sub (&B, &x, &B); + } + /* A = A/2, B = B/2 */ + fp_div_2 (&A, &A); + fp_div_2 (&B, &B); + } + + /* 5. while v is even do */ + while (fp_iseven (&v) == 1) { + /* 5.1 v = v/2 */ + fp_div_2 (&v, &v); + + /* 5.2 if C or D is odd then */ + if (fp_isodd (&C) == 1 || fp_isodd (&D) == 1) { + /* C = (C+y)/2, D = (D-x)/2 */ + fp_add (&C, &y, &C); + fp_sub (&D, &x, &D); + } + /* C = C/2, D = D/2 */ + fp_div_2 (&C, &C); + fp_div_2 (&D, &D); + } + + /* 6. if u >= v then */ + if (fp_cmp (&u, &v) != FP_LT) { + /* u = u - v, A = A - C, B = B - D */ + fp_sub (&u, &v, &u); + fp_sub (&A, &C, &A); + fp_sub (&B, &D, &B); + } else { + /* v - v - u, C = C - A, D = D - B */ + fp_sub (&v, &u, &v); + fp_sub (&C, &A, &C); + fp_sub (&D, &B, &D); + } + + /* if not zero goto step 4 */ + if (fp_iszero (&u) == 0) + goto top; + + /* now a = C, b = D, gcd == g*v */ + + /* if v != 1 then there is no inverse */ + if (fp_cmp_d (&v, 1) != FP_EQ) { + return FP_VAL; + } + + /* if its too low */ + while (fp_cmp_d(&C, 0) == FP_LT) { + fp_add(&C, b, &C); + } + + /* too big */ + while (fp_cmp_mag(&C, b) != FP_LT) { + fp_sub(&C, b, &C); + } + + /* C is now the inverse */ + fp_copy(&C, c); + return FP_OKAY; +} + /* c = 1/a (mod b) for odd b only */ int fp_invmod(fp_int *a, fp_int *b, fp_int *c) { @@ -1113,7 +1289,7 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c) /* 2. 
[modified] b must be odd */ if (fp_iseven (b) == FP_YES) { - return FP_VAL; + return fp_invmod_slow(a,b,c); } /* init all our temps */ @@ -1814,8 +1990,6 @@ asm( \ #define LO 0 -#define HI 1 -#define CY 2 /* computes x/R == x (mod N) via Montgomery Reduction */ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) @@ -1862,7 +2036,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) } LOOP_END; while (cy) { - PROPCARRY; // cy = cy > (*_c += cy); + PROPCARRY; ++_c; } } @@ -1889,10 +2063,10 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) } } + /* $Source$ */ /* $Revision$ */ /* $Date$ */ - /* End: fp_montgomery_reduce.c */ @@ -2270,7 +2444,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c) /* this should multiply i and j */ #define MULADD(i, j) \ -asm ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ @@ -2341,7 +2515,7 @@ asm ( \ /* this should multiply i and j */ #define MULADD(i, j) \ - asm volatile ( \ + asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ @@ -5678,7 +5852,7 @@ Obvious points of optimization #define COMBA_FINI #define SQRADD(i, j) \ -asm volatile ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %%eax \n\t" \ "addl %%eax,%0 \n\t" \ @@ -5687,7 +5861,7 @@ asm volatile ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); #define SQRADD2(i, j) \ -asm volatile ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ @@ -5699,7 +5873,7 @@ asm volatile ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); #define SQRADDSC(i, j) \ -asm ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "movl %%eax,%0 \n\t" \ @@ -5708,7 +5882,7 @@ asm ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); #define SQRADDAC(i, j) \ -asm ( \ +asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ @@ -5717,7 +5891,7 @@ asm ( \ 
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); #define SQRADDDB \ -asm ( \ +asm( \ "addl %6,%0 \n\t" \ "adcl %7,%1 \n\t" \ "adcl %8,%2 \n\t" \ @@ -5746,7 +5920,7 @@ asm ( \ #define COMBA_FINI #define SQRADD(i, j) \ -asm ( \ +asm( \ "movq %6,%%rax \n\t" \ "mulq %%rax \n\t" \ "addq %%rax,%0 \n\t" \ @@ -5755,7 +5929,7 @@ asm ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc"); #define SQRADD2(i, j) \ -asm ( \ +asm( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ @@ -5767,7 +5941,7 @@ asm ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #define SQRADDSC(i, j) \ -asm ( \ +asm( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "movq %%rax,%0 \n\t" \ @@ -5776,7 +5950,7 @@ asm ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #define SQRADDAC(i, j) \ -asm ( \ +asm( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ @@ -5785,7 +5959,7 @@ asm ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #define SQRADDDB \ -asm ( \ +asm( \ "addq %6,%0 \n\t" \ "adcq %7,%1 \n\t" \ "adcq %8,%2 \n\t" \ @@ -5815,7 +5989,7 @@ asm ( \ asm("emms"); #define SQRADD(i, j) \ -asm volatile ( \ +asm( \ "movd %6,%%mm0 \n\t" \ "pmuludq %%mm0,%%mm0\n\t" \ "movd %%mm0,%%eax \n\t" \ @@ -5827,7 +6001,7 @@ asm volatile ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc"); #define SQRADD2(i, j) \ -asm volatile ( \ +asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ @@ -5843,7 +6017,7 @@ asm volatile ( \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); #define SQRADDSC(i, j) \ -asm volatile ( \ +asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ @@ -5854,7 +6028,7 @@ asm volatile ( \ :"=r"(sc0), 
"=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)); #define SQRADDAC(i, j) \ -asm volatile ( \ +asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ @@ -5867,7 +6041,7 @@ asm volatile ( \ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc"); #define SQRADDDB \ -asm ( \ +asm( \ "addl %6,%0 \n\t" \ "adcl %7,%1 \n\t" \ "adcl %8,%2 \n\t" \ diff --git a/tfm.aux b/tfm.aux index 865325f..2db70b6 100644 --- a/tfm.aux +++ b/tfm.aux @@ -17,40 +17,42 @@ \@writefile{toc}{\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}} \@writefile{toc}{\contentsline {section}{\numberline {1.2}License}{2}{section.1.2}} \@writefile{toc}{\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}} -\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1}} -\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2}} +\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Intel CC}{2}{subsection.1.3.1}} +\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}MSVC}{2}{subsection.1.3.2}} +\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Build Limitations}{3}{subsection.1.3.3}} +\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.4}Optimization Configuration}{3}{subsection.1.3.4}} \@writefile{toc}{\contentsline {subsubsection}{x86--32}{3}{section*.3}} \@writefile{toc}{\contentsline {subsubsection}{SSE2}{3}{section*.4}} -\@writefile{toc}{\contentsline {subsubsection}{x86--64}{3}{section*.5}} -\@writefile{toc}{\contentsline {subsubsection}{ARM}{3}{section*.6}} -\@writefile{toc}{\contentsline {subsubsection}{PPC32}{3}{section*.7}} +\@writefile{toc}{\contentsline {subsubsection}{x86--64}{4}{section*.5}} +\@writefile{toc}{\contentsline {subsubsection}{ARM}{4}{section*.6}} +\@writefile{toc}{\contentsline {subsubsection}{PPC32}{4}{section*.7}} 
\@writefile{toc}{\contentsline {subsubsection}{Future Releases}{4}{section*.8}} \@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Recommended Build Modes}}{4}{figure.1.1}} -\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3}} -\@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2}} +\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.5}Precision Configuration}{5}{subsection.1.3.5}} +\@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{7}{chapter.2}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1}} -\@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3}} -\@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3}} +\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{7}{section.2.1}} +\@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{8}{section.2.2}} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{8}{subsection.2.2.1}} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{8}{subsection.2.2.2}} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{8}{subsection.2.2.3}} +\@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{9}{chapter.3}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline 
{section}{\numberline {3.1}Odds and Evens}{7}{section.3.1}} -\@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2}} -\@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3}} -\@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4}} -\@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5}} -\@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6}} -\@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7}} -\@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8}} -\@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4}} +\@writefile{toc}{\contentsline {section}{\numberline {3.1}Odds and Evens}{9}{section.3.1}} +\@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{9}{section.3.2}} +\@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{10}{section.3.3}} +\@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{10}{section.3.4}} +\@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{11}{section.3.5}} +\@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{11}{section.3.6}} +\@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{11}{section.3.7}} +\@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{12}{section.3.8}} +\@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{13}{chapter.4}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{chap:asmops}{{4}{11}{Porting TomsFastMath\relax }{chapter.4}{}} -\@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1}} -\@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2}} 
-\@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3}} -\@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4}} +\newlabel{chap:asmops}{{4}{13}{Porting TomsFastMath\relax }{chapter.4}{}} +\@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{13}{section.4.1}} +\@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{13}{section.4.2}} +\@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{15}{section.4.3}} +\@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{17}{section.4.4}} diff --git a/tfm.dvi b/tfm.dvi index d1c3630..7766aaa 100644 Binary files a/tfm.dvi and b/tfm.dvi differ diff --git a/tfm.h b/tfm.h index f73a08d..da91e81 100644 --- a/tfm.h +++ b/tfm.h @@ -48,6 +48,11 @@ */ /* #define TFM_PRESCOTT */ +/* Do we want timing resistant fp_exptmod() ? + * This makes it slower but also timing invariant with respect to the exponent + */ +/* #define TFM_TIMING_RESISTANT */ + #endif /* Max size of any number in bits. 
Basically the largest size you will be multiplying @@ -355,15 +360,25 @@ int fp_toradix_n(fp_int * a, char *str, int radix, int maxlen); void s_fp_add(fp_int *a, fp_int *b, fp_int *c); void s_fp_sub(fp_int *a, fp_int *b, fp_int *c); void bn_reverse(unsigned char *s, int len); + void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C); + +#ifdef TFM_SMALL_SET +void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C); +#endif + #ifdef TFM_HUGE void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C); #endif -void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C); +void fp_sqr_comba(fp_int *A, fp_int *B); + +#ifdef TFM_SMALL_SET void fp_sqr_comba_small(fp_int *A, fp_int *B); +#endif + #ifdef TFM_HUGE void fp_sqr_comba32(fp_int *A, fp_int *B); void fp_sqr_comba48(fp_int *A, fp_int *B); diff --git a/tfm.idx b/tfm.idx index 65f1bf8..757d2e7 100644 --- a/tfm.idx +++ b/tfm.idx @@ -1,29 +1,29 @@ -\indexentry{fp\_init|hyperpage}{6} -\indexentry{fp\_set|hyperpage}{6} -\indexentry{fp\_init\_copy|hyperpage}{6} -\indexentry{fp\_iszero|hyperpage}{7} -\indexentry{fp\_iseven|hyperpage}{7} -\indexentry{fp\_isodd|hyperpage}{7} -\indexentry{fp\_neg|hyperpage}{7} -\indexentry{fp\_abs|hyperpage}{7} -\indexentry{fp\_cmp|hyperpage}{8} -\indexentry{fp\_cmp\_mag|hyperpage}{8} -\indexentry{fp\_lshd|hyperpage}{8} -\indexentry{fp\_rshd|hyperpage}{8} -\indexentry{fp\_div\_2d|hyperpage}{8} -\indexentry{fp\_mod\_2d|hyperpage}{8} -\indexentry{fp\_mul\_2d|hyperpage}{8} -\indexentry{fp\_div\_2|hyperpage}{8} -\indexentry{fp\_mul\_2|hyperpage}{8} -\indexentry{fp\_cnt\_lsb|hyperpage}{8} -\indexentry{fp\_add|hyperpage}{9} -\indexentry{fp\_sub|hyperpage}{9} -\indexentry{fp\_mul|hyperpage}{9} -\indexentry{fp\_sqr|hyperpage}{9} -\indexentry{fp\_div|hyperpage}{9} -\indexentry{fp\_mod|hyperpage}{9} -\indexentry{fp\_exptmod|hyperpage}{9} -\indexentry{fp\_invmod|hyperpage}{9} -\indexentry{fp\_gcd|hyperpage}{9} 
-\indexentry{fp\_lcm|hyperpage}{9} -\indexentry{fp\_isprime|hyperpage}{10} +\indexentry{fp\_init|hyperpage}{8} +\indexentry{fp\_set|hyperpage}{8} +\indexentry{fp\_init\_copy|hyperpage}{8} +\indexentry{fp\_iszero|hyperpage}{9} +\indexentry{fp\_iseven|hyperpage}{9} +\indexentry{fp\_isodd|hyperpage}{9} +\indexentry{fp\_neg|hyperpage}{9} +\indexentry{fp\_abs|hyperpage}{9} +\indexentry{fp\_cmp|hyperpage}{10} +\indexentry{fp\_cmp\_mag|hyperpage}{10} +\indexentry{fp\_lshd|hyperpage}{10} +\indexentry{fp\_rshd|hyperpage}{10} +\indexentry{fp\_div\_2d|hyperpage}{10} +\indexentry{fp\_mod\_2d|hyperpage}{10} +\indexentry{fp\_mul\_2d|hyperpage}{10} +\indexentry{fp\_div\_2|hyperpage}{10} +\indexentry{fp\_mul\_2|hyperpage}{10} +\indexentry{fp\_cnt\_lsb|hyperpage}{10} +\indexentry{fp\_add|hyperpage}{11} +\indexentry{fp\_sub|hyperpage}{11} +\indexentry{fp\_mul|hyperpage}{11} +\indexentry{fp\_sqr|hyperpage}{11} +\indexentry{fp\_div|hyperpage}{11} +\indexentry{fp\_mod|hyperpage}{11} +\indexentry{fp\_exptmod|hyperpage}{11} +\indexentry{fp\_invmod|hyperpage}{11} +\indexentry{fp\_gcd|hyperpage}{11} +\indexentry{fp\_lcm|hyperpage}{11} +\indexentry{fp\_isprime|hyperpage}{12} diff --git a/tfm.ind b/tfm.ind index a2a2a57..8a527bd 100644 --- a/tfm.ind +++ b/tfm.ind @@ -1,33 +1,33 @@ \begin{theindex} - \item fp\_abs, \hyperpage{7} - \item fp\_add, \hyperpage{9} - \item fp\_cmp, \hyperpage{8} - \item fp\_cmp\_mag, \hyperpage{8} - \item fp\_cnt\_lsb, \hyperpage{8} - \item fp\_div, \hyperpage{9} - \item fp\_div\_2, \hyperpage{8} - \item fp\_div\_2d, \hyperpage{8} - \item fp\_exptmod, \hyperpage{9} - \item fp\_gcd, \hyperpage{9} - \item fp\_init, \hyperpage{6} - \item fp\_init\_copy, \hyperpage{6} - \item fp\_invmod, \hyperpage{9} - \item fp\_iseven, \hyperpage{7} - \item fp\_isodd, \hyperpage{7} - \item fp\_isprime, \hyperpage{10} - \item fp\_iszero, \hyperpage{7} - \item fp\_lcm, \hyperpage{9} - \item fp\_lshd, \hyperpage{8} - \item fp\_mod, \hyperpage{9} - \item fp\_mod\_2d, \hyperpage{8} - 
\item fp\_mul, \hyperpage{9} - \item fp\_mul\_2, \hyperpage{8} - \item fp\_mul\_2d, \hyperpage{8} - \item fp\_neg, \hyperpage{7} - \item fp\_rshd, \hyperpage{8} - \item fp\_set, \hyperpage{6} - \item fp\_sqr, \hyperpage{9} - \item fp\_sub, \hyperpage{9} + \item fp\_abs, \hyperpage{9} + \item fp\_add, \hyperpage{11} + \item fp\_cmp, \hyperpage{10} + \item fp\_cmp\_mag, \hyperpage{10} + \item fp\_cnt\_lsb, \hyperpage{10} + \item fp\_div, \hyperpage{11} + \item fp\_div\_2, \hyperpage{10} + \item fp\_div\_2d, \hyperpage{10} + \item fp\_exptmod, \hyperpage{11} + \item fp\_gcd, \hyperpage{11} + \item fp\_init, \hyperpage{8} + \item fp\_init\_copy, \hyperpage{8} + \item fp\_invmod, \hyperpage{11} + \item fp\_iseven, \hyperpage{9} + \item fp\_isodd, \hyperpage{9} + \item fp\_isprime, \hyperpage{12} + \item fp\_iszero, \hyperpage{9} + \item fp\_lcm, \hyperpage{11} + \item fp\_lshd, \hyperpage{10} + \item fp\_mod, \hyperpage{11} + \item fp\_mod\_2d, \hyperpage{10} + \item fp\_mul, \hyperpage{11} + \item fp\_mul\_2, \hyperpage{10} + \item fp\_mul\_2d, \hyperpage{10} + \item fp\_neg, \hyperpage{9} + \item fp\_rshd, \hyperpage{10} + \item fp\_set, \hyperpage{8} + \item fp\_sqr, \hyperpage{11} + \item fp\_sub, \hyperpage{11} \end{theindex} diff --git a/tfm.log b/tfm.log index d46bd4a..e8dbba0 100644 --- a/tfm.log +++ b/tfm.log @@ -1,4 +1,4 @@ -This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10) 23 JUL 2005 07:42 +This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10) 1 AUG 2005 13:34 entering extended mode **tfm (./tfm.tex @@ -216,107 +216,107 @@ File: umsb.fd 2002/01/19 v2.2g AMS font definitions Chapter 1. [1 -] [2] [3] [4] +] [2] [3] [4] [5] [6 + +] Chapter 2. Underfull \vbox (badness 7649) has occurred while \output is active [] - [5 - -] -[6] + [7] +[8] Chapter 3. -[7 +[9 -] [8] [9] [10] +] [10] [11] [12] Chapter 4. 
-[11 +[13 -] [12] [13] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547 +] [14] [15] +Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559 []\OT1/cmtt/m/n/10 #define SQRADDSC(i, j) \[] [] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559 [] \OT1/cmtt/m/n/10 do { fp_word t; \[] [] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559 [] \OT1/cmtt/m/n/10 t = ((fp_word)i) * ((fp_word)j); \[] [] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559 [] \OT1/cmtt/m/n/10 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \[] [] -Overfull \hbox (25.129pt too wide) in paragraph at lines 548--549 +Overfull \hbox (25.129pt too wide) in paragraph at lines 560--561 \OT1/cmr/m/n/10 This com-putes a prod-uct and stores it in the ``sec-ondary'' c arry reg-is-ters $[]$. 
[] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568 []\OT1/cmtt/m/n/10 #define SQRADDAC(i, j) \[] [] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568 [] \OT1/cmtt/m/n/10 do { fp_word t; \[] [] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568 [] \OT1/cmtt/m/n/10 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t; \[] [] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568 [] \OT1/cmtt/m/n/10 t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t >> DIGIT_BIT; \[] [] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 578--578 []\OT1/cmtt/m/n/10 #define SQRADDDB \[] [] -Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566 +Overfull \hbox (74.99634pt too wide) in paragraph at lines 578--578 [] \OT1/cmtt/m/n/10 do { fp_word t; \[] [] -Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566 +Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578 [] \OT1/cmtt/m/n/10 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t; \[] [] -Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566 +Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578 [] \OT1/cmtt/m/n/10 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BI T); c1 = t; \[] [] -Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566 +Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578 [] \OT1/cmtt/m/n/10 c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_B IT); \[] [] -[14] [15] (./tfm.ind [16] [17 +[16] [17] (./tfm.ind [18] [19 ]) (./tfm.aux) ) @@ -329,4 +329,4 @@ Here is how much of TeX's memory you used: 
580 hyphenation exceptions out of 1000 25i,9n,25p,195b,321s stack positions out of 1500i,500n,1500p,200000b,5000s -Output written on tfm.dvi (23 pages, 49708 bytes). +Output written on tfm.dvi (25 pages, 51612 bytes). diff --git a/tfm.tex b/tfm.tex index 08d7bfb..761d95f 100644 --- a/tfm.tex +++ b/tfm.tex @@ -49,8 +49,8 @@ \begin{document} \frontmatter \pagestyle{empty} -\title{TomsFastMath User Manual \\ v0.04} -\author{Tom St Denis \\ tomstdenis@iahu.ca} +\title{TomsFastMath User Manual \\ v0.05} +\author{Tom St Denis \\ tomstdenis@gmail.com} \maketitle This text and library are all hereby placed in the public domain. This book has been formatted for B5 [176x250] paper using the \LaTeX{} {\em book} macro package. @@ -101,14 +101,26 @@ fast multiplication and squaring and has the side effect of speeding up ECC oper TomsFastMath is public domain. \section{Building} -Currently only a GCC makefile has been provided. To build the library simply type -``make''. The library is a bit too new to put into production so no install -scripts exist yet. You can build the test program with ``make test''. +To build the library simply type ``make''. Or to install in typical *unix like directories use +``make install''. Similarly a shared library can be built with ``make -f makefile.shared install''. -To perform simple static testing (useful to test out new assembly ports) use the stest -program. Type ``make stest'' and run it on your target. The program will perform three -multiplications, squarings and montgomery reductions. Likely if your assembly -code is invalid this code will exhibit the bug. +You can build the test program with ``make test''. To perform simple static testing (useful to +test out new assembly ports) use the stest program. Type ``make stest'' and run it on your +target. The program will perform three multiplications, squarings and montgomery reductions. +Likely if your assembly code is invalid this code will exhibit the bug. 
+ +\subsection{Intel CC} +In theory you should be able to build the library with + +\begin{verbatim} +CFLAGS="-O3 -ip" CC=icc make IGNORE_SPEED=1 +\end{verbatim} + +However, Intel's inline assembler is way less advanced than GCC's. As a result it doesn't compile. +Fortunately it doesn't really matter. + +\subsection{MSVC} +The library doesn't build with MSVC. Imagine that. \subsection{Build Limitations} TomsFastMath has the following build requirements which are non--portable but under most diff --git a/tfm.toc b/tfm.toc index 83d12c4..acf180c 100644 --- a/tfm.toc +++ b/tfm.toc @@ -2,32 +2,34 @@ \contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1} \contentsline {section}{\numberline {1.2}License}{2}{section.1.2} \contentsline {section}{\numberline {1.3}Building}{2}{section.1.3} -\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1} -\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2} +\contentsline {subsection}{\numberline {1.3.1}Intel CC}{2}{subsection.1.3.1} +\contentsline {subsection}{\numberline {1.3.2}MSVC}{2}{subsection.1.3.2} +\contentsline {subsection}{\numberline {1.3.3}Build Limitations}{3}{subsection.1.3.3} +\contentsline {subsection}{\numberline {1.3.4}Optimization Configuration}{3}{subsection.1.3.4} \contentsline {subsubsection}{x86--32}{3}{section*.3} \contentsline {subsubsection}{SSE2}{3}{section*.4} -\contentsline {subsubsection}{x86--64}{3}{section*.5} -\contentsline {subsubsection}{ARM}{3}{section*.6} -\contentsline {subsubsection}{PPC32}{3}{section*.7} +\contentsline {subsubsection}{x86--64}{4}{section*.5} +\contentsline {subsubsection}{ARM}{4}{section*.6} +\contentsline {subsubsection}{PPC32}{4}{section*.7} \contentsline {subsubsection}{Future Releases}{4}{section*.8} -\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3} -\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2} -\contentsline 
{section}{\numberline {2.1}Data Types}{5}{section.2.1} -\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2} -\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1} -\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2} -\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3} -\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3} -\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1} -\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2} -\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3} -\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4} -\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5} -\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6} -\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7} -\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8} -\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4} -\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1} -\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2} -\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3} -\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4} +\contentsline {subsection}{\numberline {1.3.5}Precision Configuration}{5}{subsection.1.3.5} +\contentsline {chapter}{\numberline {2}Getting Started}{7}{chapter.2} +\contentsline {section}{\numberline {2.1}Data Types}{7}{section.2.1} +\contentsline {section}{\numberline {2.2}Initialization}{8}{section.2.2} +\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{8}{subsection.2.2.1} +\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{8}{subsection.2.2.2} +\contentsline 
{subsection}{\numberline {2.2.3}Initialize Copy}{8}{subsection.2.2.3} +\contentsline {chapter}{\numberline {3}Arithmetic Operations}{9}{chapter.3} +\contentsline {section}{\numberline {3.1}Odds and Evens}{9}{section.3.1} +\contentsline {section}{\numberline {3.2}Sign Manipulation}{9}{section.3.2} +\contentsline {section}{\numberline {3.3}Comparisons}{10}{section.3.3} +\contentsline {section}{\numberline {3.4}Shifting}{10}{section.3.4} +\contentsline {section}{\numberline {3.5}Basic Algebra}{11}{section.3.5} +\contentsline {section}{\numberline {3.6}Modular Exponentiation}{11}{section.3.6} +\contentsline {section}{\numberline {3.7}Number Theoretic}{11}{section.3.7} +\contentsline {section}{\numberline {3.8}Prime Numbers}{12}{section.3.8} +\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{13}{chapter.4} +\contentsline {section}{\numberline {4.1}Getting Started}{13}{section.4.1} +\contentsline {section}{\numberline {4.2}Multiply with Comba}{13}{section.4.2} +\contentsline {section}{\numberline {4.3}Squaring with Comba}{15}{section.4.3} +\contentsline {section}{\numberline {4.4}Montgomery with Comba}{17}{section.4.4}