added tomsfastmath-0.05

This commit is contained in:
Tom St Denis 2005-08-01 16:37:35 +00:00 committed by Steffen Jaeckel
parent f91cf2d1cf
commit a6c4c5a261
21 changed files with 830 additions and 310 deletions

View File

@ -1,3 +1,10 @@
August 1st, 2005
0.05 -- Quick fix to the fp_invmod.c code to let it handle even moduli [required for LTC]
-- Added makefile.shared to make shared objects [required for LTC]
-- Improved makefiles to make them way more configurable
-- Added timing resistant fp_exptmod() enabled with TFM_TIMING_RESISTANT
July 23rd, 2005
0.04 -- Fixed bugs in the SSE2 squaring code 0.04 -- Fixed bugs in the SSE2 squaring code
-- Rewrote the multipliers to be optimized for small inputs -- Rewrote the multipliers to be optimized for small inputs
-- Nelson Bolyard of the NSS crew submitted [among other things] new faster Montgomery reduction -- Nelson Bolyard of the NSS crew submitted [among other things] new faster Montgomery reduction

View File

@ -1,59 +1,112 @@
/* generate montgomery reductions for m->used = 1...16 */
#include <stdio.h> #include <stdio.h>
int main(void) int main(void)
{ {
int N; int x, y, z;
for (N = 1; N <= 16; N++) {
printf("void fp_montgomery_reduce_%d(fp_int *a, fp_int *m, fp_digit mp)\n", N);
printf( printf(
#if 0
"#ifdef TFM_SMALL_SET\n"
"/* computes x/R == x (mod N) via Montgomery Reduction */\n"
"void fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp)\n"
"{\n" "{\n"
" fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;\n" " fp_digit c[FP_SIZE], *_c, *tmpm, mu, cy;\n"
" int oldused, x, y;\n" " int oldused, x, y, pa;\n"
"\n" "\n"
"#if defined(USE_MEMSET)\n"
" /* now zero the buff */\n" " /* now zero the buff */\n"
" memset(c, 0, sizeof(c));\n" " memset(c, 0, sizeof c);\n"
"#endif\n"
" pa = m->used;\n"
"\n" "\n"
" /* copy the input */\n" " /* copy the input */\n"
" oldused = a->used;\n" " oldused = a->used;\n"
" for (x = 0; x < oldused; x++) {\n" " for (x = 0; x < oldused; x++) {\n"
" c[x] = a->dp[x];\n" " c[x] = a->dp[x];\n"
" }\n" " }\n"
"\n" "#if !defined(USE_MEMSET)\n"
" for (; x < 2*pa+3; x++) {\n"
" c[x] = 0;\n"
" }\n"
"#endif\n"
" MONT_START;\n" " MONT_START;\n"
#endif
"\n" "\n"
" /* now let's get bizz-sy! */\n" " switch (pa) {\n");
" for (x = 0; x < %d; x++) {\n"
" /* get Mu for this round */\n" for (x = 1; x <= 64; x++) {
" LOOP_START;\n" if (x > 16 && (x != 32 && x != 48 && x != 64)) continue;
"\n" if (x > 16) printf("#ifdef TFM_HUGE\n");
" /* our friendly neighbourhood alias */\n"
" _c = c + x;\n"
" tmpm = m->dp;\n"
"\n" printf(" case %d:\n", x);
" for (y = 0; y < %d; y++) {\n"
" INNERMUL;\n" for (y = 0; y < x; y++) {
" ++_c;\n"
" }\n" printf(" x = %d; cy = 0;\n"
" /* send carry up man... */\n" " LOOP_START;\n"
" _c = c + x;\n" " _c = c + %d;\n"
" PROPCARRY;\n" " tmpm = m->dp;\n", y, y);
" } \n"
"\n" printf("#ifdef INNERMUL8\n");
" /* fix the rest of the carries */\n" for (z = 0; z+8 <= x; z += 8) {
" _c = c + %d;\n" printf(" INNERMUL8; _c += 8; tmpm += 8;\n");
" for (x = %d; x < %d * 2 + 2; x++) {\n" }
" PROPCARRY;\n" for (; z < x; z++) {
" ++_c;\n" printf(" INNERMUL; ++_c;\n");
}
printf("#else\n");
for (z = 0; z < x; z++) {
printf(" INNERMUL; ++_c;\n");
}
printf("#endif\n");
printf(" LOOP_END;\n"
" while (cy) {\n"
" PROPCARRY;\n"
" ++_c;\n"
" }\n");
}
//printf(" }\n");
printf(" break;\n");
#define LOOP_MACRO(stride) \
for (x = 0; x < stride; x++) { \
fp_digit cy = 0; \
/* get Mu for this round */ \
LOOP_START; \
_c = c + x; \
tmpm = m->dp; \
for (y = 0; y < stride; y++) { \
INNERMUL; \
++_c; \
} \
LOOP_END; \
while (cy) { \
PROPCARRY; \
++_c; \
} \
}
if (x > 16) printf("#endif /* TFM_HUGE */\n");
}
#if 0
printf(
" }\n" " }\n"
"\n"
" /* now copy out */\n" " /* now copy out */\n"
" _c = c + %d;\n" " _c = c + pa;\n"
" tmpm = a->dp;\n" " tmpm = a->dp;\n"
" for (x = 0; x < %d+1; x++) {\n" " for (x = 0; x < pa+1; x++) {\n"
" *tmpm++ = *_c++;\n" " *tmpm++ = *_c++;\n"
" }\n" " }\n"
"\n" "\n"
@ -63,19 +116,17 @@ printf(
"\n" "\n"
" MONT_FINI;\n" " MONT_FINI;\n"
"\n" "\n"
" a->used = %d+1;\n" " a->used = pa+1;\n"
" fp_clamp(a);\n" " fp_clamp(a);\n"
"\n" "\n"
" /* if A >= m then A = A - m */\n" " /* if A >= m then A = A - m */\n"
" if (fp_cmp_mag (a, m) != FP_LT) {\n" " if (fp_cmp_mag (a, m) != FP_LT) {\n"
" s_fp_sub (a, m, a);\n" " s_fp_sub (a, m, a);\n"
" }\n" " }\n"
"}\n", N,N,N,N,N,N,N,N); "}\n\n#endif\n");
}
#endif
return 0; return 0;
} }

View File

@ -213,7 +213,7 @@ t1 = TIMFUNC();
sleep(1); sleep(1);
printf("Ticks per second: %llu\n", TIMFUNC() - t1); printf("Ticks per second: %llu\n", TIMFUNC() - t1);
goto expttime; goto multtime;
/* do some timings... */ /* do some timings... */
printf("Addition:\n"); printf("Addition:\n");
for (t = 2; t <= FP_SIZE/2; t += 2) { for (t = 2; t <= FP_SIZE/2; t += 2) {

Binary file not shown.

View File

@ -9,6 +9,75 @@
*/ */
#include <tfm.h> #include <tfm.h>
#ifdef TFM_TIMING_RESISTANT
/* Timing resistant modular exponentiation, Y = G**X (mod P), built on a
   Montgomery powering ladder.
   Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded Systems, CHES 2002

   Each exponent bit triggers exactly one fp_mul and one fp_sqr, both followed
   by a Montgomery reduction; the bit only selects which of the two ladder
   registers receives which result.

   Returns FP_OKAY, or the error code reported by fp_montgomery_setup().
*/
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
   fp_int   R[2];
   fp_digit window, mp;
   int      err, digit_idx, bits_left, bit;

   /* compute the Montgomery setup value for P; give up if that fails */
   err = fp_montgomery_setup (P, &mp);
   if (err != FP_OKAY) {
      return err;
   }

   fp_init(&R[0]);
   fp_init(&R[1]);

   /* R[0] receives the Montgomery normalization value for P */
   fp_montgomery_calc_normalization (&R[0], P);

   /* R[1] = G * R[0] mod P; G is reduced first unless |P| > |G| */
   if (fp_cmp_mag(P, G) == FP_GT) {
      fp_copy(G, &R[1]);
   } else {
      fp_mod(G, P, &R[1]);
   }
   fp_mulmod (&R[1], &R[0], P, &R[1]);

   /* walk the exponent from its most significant digit down to digit 0 */
   for (digit_idx = X->used - 1; digit_idx >= 0; digit_idx--) {
      window = X->dp[digit_idx];

      /* consume the digit one bit at a time, top bit first */
      for (bits_left = (int)DIGIT_BIT; bits_left > 0; bits_left--) {
         bit      = (int)((window >> (DIGIT_BIT - 1)) & 1);
         window <<= (fp_digit)1;

         /* ladder step: R[!bit] = R[0]*R[1], then R[bit] = R[bit]^2 */
         fp_mul(&R[0], &R[1], &R[bit ^ 1]);
         fp_montgomery_reduce(&R[bit ^ 1], P, mp);
         fp_sqr(&R[bit], &R[bit]);
         fp_montgomery_reduce(&R[bit], P, mp);
      }
   }

   /* one last reduction on R[0], then hand the result back */
   fp_montgomery_reduce(&R[0], P, mp);
   fp_copy(&R[0], Y);
   return FP_OKAY;
}
#else
/* y = g**x (mod b) /* y = g**x (mod b)
* Some restrictions... x must be positive and < b * Some restrictions... x must be positive and < b
*/ */
@ -168,6 +237,8 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
return FP_OKAY; return FP_OKAY;
} }
#endif
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{ {

View File

@ -9,6 +9,111 @@
*/ */
#include <tfm.h> #include <tfm.h>
/* Modular inverse fallback, c = 1/a (mod b), used when b is even.
 *
 * Returns FP_OKAY on success. Returns FP_VAL when b is negative or zero,
 * when (a mod b) and b are both even, or when the loop ends with v != 1
 * (no inverse exists). An error from fp_mod() is passed through unchanged.
 */
static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
{
   fp_int x, y, u, v, A, B, C, D;
   int    err;

   /* the modulus has to be strictly positive */
   if (b->sign == FP_NEG || fp_iszero(b) == 1) {
      return FP_VAL;
   }

   /* clear every temporary */
   fp_init(&x);  fp_init(&y);
   fp_init(&u);  fp_init(&v);
   fp_init(&A);  fp_init(&B);
   fp_init(&C);  fp_init(&D);

   /* x = a mod b, y = b */
   err = fp_mod(a, b, &x);
   if (err != FP_OKAY) {
      return err;
   }
   fp_copy(b, &y);

   /* 2. [modified] x and y both even: report an error */
   if (fp_iseven (&x) == 1 && fp_iseven (&y) == 1) {
      return FP_VAL;
   }

   /* 3. u = x, v = y, A = 1, B = 0, C = 0, D = 1
    *    (B and C are already zero from fp_init) */
   fp_copy (&x, &u);
   fp_copy (&y, &v);
   fp_set (&A, 1);
   fp_set (&D, 1);

   do {
      /* 4. strip factors of two from u */
      while (fp_iseven (&u) == 1) {
         /* 4.1 u = u/2 */
         fp_div_2 (&u, &u);

         /* 4.2 when A or B is odd, first A = A + y and B = B - x */
         if (fp_isodd (&A) == 1 || fp_isodd (&B) == 1) {
            fp_add (&A, &y, &A);
            fp_sub (&B, &x, &B);
         }

         /* then A = A/2, B = B/2 */
         fp_div_2 (&A, &A);
         fp_div_2 (&B, &B);
      }

      /* 5. strip factors of two from v */
      while (fp_iseven (&v) == 1) {
         /* 5.1 v = v/2 */
         fp_div_2 (&v, &v);

         /* 5.2 when C or D is odd, first C = C + y and D = D - x */
         if (fp_isodd (&C) == 1 || fp_isodd (&D) == 1) {
            fp_add (&C, &y, &C);
            fp_sub (&D, &x, &D);
         }

         /* then C = C/2, D = D/2 */
         fp_div_2 (&C, &C);
         fp_div_2 (&D, &D);
      }

      /* 6. subtract the smaller of u, v from the larger */
      if (fp_cmp (&u, &v) == FP_LT) {
         /* u < v: v = v - u, C = C - A, D = D - B */
         fp_sub (&v, &u, &v);
         fp_sub (&C, &A, &C);
         fp_sub (&D, &B, &D);
      } else {
         /* u >= v: u = u - v, A = A - C, B = B - D */
         fp_sub (&u, &v, &u);
         fp_sub (&A, &C, &A);
         fp_sub (&B, &D, &B);
      }

      /* repeat from step 4 until u reaches zero */
   } while (fp_iszero (&u) == 0);

   /* now a = C, b = D, gcd == g*v */

   /* v != 1 means there is no inverse */
   if (fp_cmp_d (&v, 1) != FP_EQ) {
      return FP_VAL;
   }

   /* bring C up while it compares below zero */
   while (fp_cmp_d(&C, 0) == FP_LT) {
      fp_add(&C, b, &C);
   }

   /* bring C down while its magnitude is not below b's */
   while (fp_cmp_mag(&C, b) != FP_LT) {
      fp_sub(&C, b, &C);
   }

   /* C now holds the inverse */
   fp_copy(&C, c);
   return FP_OKAY;
}
/* c = 1/a (mod b) for odd b only */ /* c = 1/a (mod b) for odd b only */
int fp_invmod(fp_int *a, fp_int *b, fp_int *c) int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
{ {
@ -17,7 +122,7 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
/* 2. [modified] b must be odd */ /* 2. [modified] b must be odd */
if (fp_iseven (b) == FP_YES) { if (fp_iseven (b) == FP_YES) {
return FP_VAL; return fp_invmod_slow(a,b,c);
} }
/* init all our temps */ /* init all our temps */

View File

@ -299,8 +299,6 @@ asm( \
#define LO 0 #define LO 0
#define HI 1
#define CY 2
/* computes x/R == x (mod N) via Montgomery Reduction */ /* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
@ -347,7 +345,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
} }
LOOP_END; LOOP_END;
while (cy) { while (cy) {
PROPCARRY; // cy = cy > (*_c += cy); PROPCARRY;
++_c; ++_c;
} }
} }
@ -374,7 +372,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
} }
} }
/* $Source$ */ /* $Source$ */
/* $Revision$ */ /* $Revision$ */
/* $Date$ */ /* $Date$ */

View File

@ -47,7 +47,7 @@
/* this should multiply i and j */ /* this should multiply i and j */
#define MULADD(i, j) \ #define MULADD(i, j) \
asm ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -118,7 +118,7 @@ asm ( \
/* this should multiply i and j */ /* this should multiply i and j */
#define MULADD(i, j) \ #define MULADD(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \ "movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \ "pmuludq %%mm1,%%mm0\n\t" \

View File

@ -36,7 +36,7 @@
#define COMBA_FINI #define COMBA_FINI
#define SQRADD(i, j) \ #define SQRADD(i, j) \
asm volatile ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %%eax \n\t" \ "mull %%eax \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -45,7 +45,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm volatile ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -57,7 +57,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j) \ #define SQRADDSC(i, j) \
asm ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"movl %%eax,%0 \n\t" \ "movl %%eax,%0 \n\t" \
@ -66,7 +66,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
#define SQRADDAC(i, j) \ #define SQRADDAC(i, j) \
asm ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -75,7 +75,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
#define SQRADDDB \ #define SQRADDDB \
asm ( \ asm( \
"addl %6,%0 \n\t" \ "addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \ "adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \ "adcl %8,%2 \n\t" \
@ -104,7 +104,7 @@ asm ( \
#define COMBA_FINI #define COMBA_FINI
#define SQRADD(i, j) \ #define SQRADD(i, j) \
asm ( \ asm( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \ "mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
@ -113,7 +113,7 @@ asm ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm ( \ asm( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
@ -125,7 +125,7 @@ asm ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDSC(i, j) \ #define SQRADDSC(i, j) \
asm ( \ asm( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"movq %%rax,%0 \n\t" \ "movq %%rax,%0 \n\t" \
@ -134,7 +134,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDAC(i, j) \ #define SQRADDAC(i, j) \
asm ( \ asm( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
@ -143,7 +143,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDDB \ #define SQRADDDB \
asm ( \ asm( \
"addq %6,%0 \n\t" \ "addq %6,%0 \n\t" \
"adcq %7,%1 \n\t" \ "adcq %7,%1 \n\t" \
"adcq %8,%2 \n\t" \ "adcq %8,%2 \n\t" \
@ -173,7 +173,7 @@ asm ( \
asm("emms"); asm("emms");
#define SQRADD(i, j) \ #define SQRADD(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"pmuludq %%mm0,%%mm0\n\t" \ "pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \ "movd %%mm0,%%eax \n\t" \
@ -185,7 +185,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \ "movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \ "pmuludq %%mm1,%%mm0\n\t" \
@ -201,7 +201,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j) \ #define SQRADDSC(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \ "movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \ "pmuludq %%mm1,%%mm0\n\t" \
@ -212,7 +212,7 @@ asm volatile ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j));
#define SQRADDAC(i, j) \ #define SQRADDAC(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \ "movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \ "pmuludq %%mm1,%%mm0\n\t" \
@ -225,7 +225,7 @@ asm volatile ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDDB \ #define SQRADDDB \
asm ( \ asm( \
"addl %6,%0 \n\t" \ "addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \ "adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \ "adcl %8,%2 \n\t" \

View File

@ -1,7 +1,13 @@
#makefile for TomsFastMath #makefile for TomsFastMath
# #
# #
CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops VERSION=0.05
CFLAGS += -Wall -W -Wshadow -I./
ifndef IGNORE_SPEED
CFLAGS += -O3 -funroll-all-loops
#profiling #profiling
#PROF=-pg -g #PROF=-pg -g
@ -10,9 +16,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
#speed #speed
CFLAGS += -fomit-frame-pointer CFLAGS += -fomit-frame-pointer
VERSION=0.04 endif
default: libtfm.a
OBJECTS = \ OBJECTS = \
fp_set.o \ fp_set.o \
@ -52,23 +56,29 @@ ifndef INCPATH
INCPATH=/usr/include INCPATH=/usr/include
endif endif
ifndef TFM_GROUP ifndef INSTALL_GROUP
GROUP=wheel GROUP=wheel
else
GROUP=$(INSTALL_GROUP)
endif endif
ifndef TFM_USER ifndef INSTALL_USER
USER=root USER=root
else
USER=$(INSTALL_USER)
endif endif
ifndef LIBNAME ifndef LIBNAME
LIBNAME=libtfm.a LIBNAME=libtfm.a
endif endif
$(LIBNAME): $(OBJECTS) default: $(LIBNAME)
$(AR) $(ARFLAGS) $(LIBNAME) $(OBJECTS)
ranlib $(LIBNAME)
install: libtfm.a $(LIBNAME): $(OBJECTS)
$(AR) $(ARFLAGS) $@ $(OBJECTS)
ranlib $@
install: $(LIBNAME)
install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH) install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH) install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH) install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
@ -77,17 +87,17 @@ install: libtfm.a
mtest/mtest: mtest/mtest.c mtest/mtest: mtest/mtest.c
cd mtest ; make mtest cd mtest ; make mtest
test: libtfm.a demo/test.o mtest/mtest test: $(LIBNAME) demo/test.o mtest/mtest
$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test $(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test
timing: libtfm.a demo/test.o timing: $(LIBNAME) demo/test.o
$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test $(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test
stest: libtfm.a demo/stest.o stest: $(LIBNAME) demo/stest.o
$(CC) $(CFLAGS) demo/stest.o libtfm.a -o stest $(CC) $(CFLAGS) demo/stest.o $(LIBNAME) -o stest
rsatest: libtfm.a demo/rsa.o rsatest: $(LIBNAME) demo/rsa.o
$(CC) $(CFLAGS) demo/rsa.o libtfm.a -o rsatest $(CC) $(CFLAGS) demo/rsa.o $(LIBNAME) -o rsatest
docdvi: tfm.tex docdvi: tfm.tex
touch tfm.ind touch tfm.ind
@ -101,8 +111,23 @@ docs: docdvi
dvipdf tfm dvipdf tfm
mv -f tfm.pdf doc mv -f tfm.pdf doc
#This rule cleans the source tree of all compiled code, not including the pdf
#documentation.
clean: clean:
rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest *~ rsatest *.gcda *.gcno demo/*.gcda demo/*.gcno mtest/*.gcno mtest/*.gcda rm -f `find . -type f | grep "[.]o" | xargs`
rm -f `find . -type f | grep "[.]lo" | xargs`
rm -f `find . -type f | grep "[.]a" | xargs`
rm -f `find . -type f | grep "[.]la" | xargs`
rm -f `find . -type f | grep "[.]obj" | xargs`
rm -f `find . -type f | grep "[.]lib" | xargs`
rm -f `find . -type f | grep "[.]exe" | xargs`
rm -f `find . -type f | grep "[.]gcda" | xargs`
rm -f `find . -type f | grep "[.]gcno" | xargs`
rm -f `find . -type f | grep "[.]il" | xargs`
rm -f `find . -type f | grep "[.]dyn" | xargs`
rm -f `find . -type f | grep "[.]dpi" | xargs`
rm -rf `find . -type d | grep "[.]libs" | xargs`
rm -f tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc
cd mtest ; make clean cd mtest ; make clean
no_oops: clean no_oops: clean
@ -116,3 +141,7 @@ zipup: no_oops docs clean
cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \ cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \
tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 ; \ tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 ; \
zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/* zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/*
# $Source: /cvs/libtom/tomsfastmath/makefile,v $
# $Revision: 1.17 $
# $Date: 2005/07/30 04:23:55 $

View File

@ -1,55 +0,0 @@
#makefile for TomsFastMath
#
#For the GameboyAdance... er.... ARMv4
SFLAGS = $(CFLAGS) -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -mthumb -mthumb-interwork -I../devkitadv/mylib/lib
CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -marm -mthumb-interwork -I../devkitadv/mylib/lib
#profiling
#PROF=-pg -g
#CFLAGS += $(PROF)
#speed
CFLAGS += -fomit-frame-pointer
VERSION=0.01
default: libtfm.a
OBJECTS = \
fp_set.o \
\
fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
fp_mul_2.o fp_div_2.o \
\
fp_cnt_lsb.o \
\
fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
s_fp_add.o s_fp_sub.o \
\
fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
\
fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
\
fp_exptmod.o \
\
fp_cmp.o fp_cmp_mag.o \
\
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
\
libtfm.a: $(OBJECTS)
$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
ranlib libtfm.a
demo/stest.o: demo/stest.c
$(CC) $(SFLAGS) -DGBA_MODE demo/stest.c -c -o demo/stest.o
stest: libtfm.a demo/stest.o
$(CC) -mthumb -mthumb-interwork demo/stest.o libtfm.a ../devkitadv/mylib/lib/gba.a -o stest.elf
objcopy -O binary stest.elf stest.bin

109
makefile.shared Normal file
View File

@ -0,0 +1,109 @@
#makefile for TomsFastMath
#
# Libtool-based build: compiles the objects through libtool and links both
# $(LIBNAME) (libtool library) and $(LIBNAME_S) (static archive).
#
# Every compile goes through libtool's compile mode.
CC=libtool --mode=compile gcc
CFLAGS += -Wall -W -Wshadow -I./
# Define IGNORE_SPEED to build without the optimization flags below.
ifndef IGNORE_SPEED
CFLAGS += -O3 -funroll-all-loops
#profiling
#PROF=-pg -g
#CFLAGS += $(PROF)
#speed
CFLAGS += -fomit-frame-pointer
endif
# Passed to libtool's -version-info option at link time.
VERSION=0:5
OBJECTS = \
fp_set.o \
\
fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
fp_mul_2.o fp_div_2.o \
\
fp_cnt_lsb.o \
\
fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
s_fp_add.o s_fp_sub.o \
\
fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
\
fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
\
fp_exptmod.o \
\
fp_cmp.o fp_cmp_mag.o \
\
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
\
fp_ident.o
# Headers copied into $(INCPATH) by the install target.
HEADERS=tfm.h
# Install locations; set LIBPATH / INCPATH beforehand to override the defaults.
ifndef LIBPATH
LIBPATH=/usr/lib
endif
ifndef INCPATH
INCPATH=/usr/include
endif
# Group and owner given to the install commands; override with
# INSTALL_GROUP / INSTALL_USER.
ifndef INSTALL_GROUP
GROUP=wheel
else
GROUP=$(INSTALL_GROUP)
endif
ifndef INSTALL_USER
USER=root
else
USER=$(INSTALL_USER)
endif
# Output names: LIBNAME is the libtool library, LIBNAME_S the static archive.
ifndef LIBNAME
LIBNAME=libtfm.la
endif
ifndef LIBNAME_S
LIBNAME_S=libtfm.a
endif
default: $(LIBNAME)
# NOTE(review): as listed here $(LIBNAME) has no recipe of its own and the
# link commands sit under "install" -- confirm against the real file layout
# (tabs and blank lines are not visible in this listing).
$(LIBNAME): $(OBJECTS)
install: $(LIBNAME)
libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]o" | xargs` -o $(LIBNAME_S)
ranlib $(LIBNAME_S)
libtool --silent --mode=install install -c $(LIBNAME) $(LIBPATH)/$(LIBNAME)
install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
# The mtest generator is built by the makefile in its own directory.
mtest/mtest: mtest/mtest.c
cd mtest ; make mtest
# The demo programs below link against the static archive $(LIBNAME_S).
test: $(LIBNAME) demo/test.o mtest/mtest
$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
timing: $(LIBNAME) demo/test.o
$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
stest: $(LIBNAME) demo/stest.o
$(CC) $(CFLAGS) demo/stest.o $(LIBNAME_S) -o stest
# $Source: /cvs/libtom/tomsfastmath/makefile.shared,v $
# $Revision: 1.4 $
# $Date: 2005/07/28 03:08:35 $

View File

@ -757,6 +757,75 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
*/ */
#include <tfm.h> #include <tfm.h>
#ifdef TFM_TIMING_RESISTANT
/* Timing resistant modular exponentiation, Y = G**X (mod P), built on a
   Montgomery powering ladder.
   Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded Systems, CHES 2002

   Each exponent bit triggers exactly one fp_mul and one fp_sqr, both followed
   by a Montgomery reduction; the bit only selects which of the two ladder
   registers receives which result.

   Returns FP_OKAY, or the error code reported by fp_montgomery_setup().
*/
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
   fp_int   R[2];
   fp_digit window, mp;
   int      err, digit_idx, bits_left, bit;

   /* compute the Montgomery setup value for P; give up if that fails */
   err = fp_montgomery_setup (P, &mp);
   if (err != FP_OKAY) {
      return err;
   }

   fp_init(&R[0]);
   fp_init(&R[1]);

   /* R[0] receives the Montgomery normalization value for P */
   fp_montgomery_calc_normalization (&R[0], P);

   /* R[1] = G * R[0] mod P; G is reduced first unless |P| > |G| */
   if (fp_cmp_mag(P, G) == FP_GT) {
      fp_copy(G, &R[1]);
   } else {
      fp_mod(G, P, &R[1]);
   }
   fp_mulmod (&R[1], &R[0], P, &R[1]);

   /* walk the exponent from its most significant digit down to digit 0 */
   for (digit_idx = X->used - 1; digit_idx >= 0; digit_idx--) {
      window = X->dp[digit_idx];

      /* consume the digit one bit at a time, top bit first */
      for (bits_left = (int)DIGIT_BIT; bits_left > 0; bits_left--) {
         bit      = (int)((window >> (DIGIT_BIT - 1)) & 1);
         window <<= (fp_digit)1;

         /* ladder step: R[!bit] = R[0]*R[1], then R[bit] = R[bit]^2 */
         fp_mul(&R[0], &R[1], &R[bit ^ 1]);
         fp_montgomery_reduce(&R[bit ^ 1], P, mp);
         fp_sqr(&R[bit], &R[bit]);
         fp_montgomery_reduce(&R[bit], P, mp);
      }
   }

   /* one last reduction on R[0], then hand the result back */
   fp_montgomery_reduce(&R[0], P, mp);
   fp_copy(&R[0], Y);
   return FP_OKAY;
}
#else
/* y = g**x (mod b) /* y = g**x (mod b)
* Some restrictions... x must be positive and < b * Some restrictions... x must be positive and < b
*/ */
@ -916,6 +985,8 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
return FP_OKAY; return FP_OKAY;
} }
#endif
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{ {
@ -1105,6 +1176,111 @@ int main(void)
*/ */
#include <tfm.h> #include <tfm.h>
/* Modular inverse fallback, c = 1/a (mod b), used when b is even.
 *
 * Returns FP_OKAY on success. Returns FP_VAL when b is negative or zero,
 * when (a mod b) and b are both even, or when the loop ends with v != 1
 * (no inverse exists). An error from fp_mod() is passed through unchanged.
 */
static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
{
   fp_int x, y, u, v, A, B, C, D;
   int    err;

   /* the modulus has to be strictly positive */
   if (b->sign == FP_NEG || fp_iszero(b) == 1) {
      return FP_VAL;
   }

   /* clear every temporary */
   fp_init(&x);  fp_init(&y);
   fp_init(&u);  fp_init(&v);
   fp_init(&A);  fp_init(&B);
   fp_init(&C);  fp_init(&D);

   /* x = a mod b, y = b */
   err = fp_mod(a, b, &x);
   if (err != FP_OKAY) {
      return err;
   }
   fp_copy(b, &y);

   /* 2. [modified] x and y both even: report an error */
   if (fp_iseven (&x) == 1 && fp_iseven (&y) == 1) {
      return FP_VAL;
   }

   /* 3. u = x, v = y, A = 1, B = 0, C = 0, D = 1
    *    (B and C are already zero from fp_init) */
   fp_copy (&x, &u);
   fp_copy (&y, &v);
   fp_set (&A, 1);
   fp_set (&D, 1);

   do {
      /* 4. strip factors of two from u */
      while (fp_iseven (&u) == 1) {
         /* 4.1 u = u/2 */
         fp_div_2 (&u, &u);

         /* 4.2 when A or B is odd, first A = A + y and B = B - x */
         if (fp_isodd (&A) == 1 || fp_isodd (&B) == 1) {
            fp_add (&A, &y, &A);
            fp_sub (&B, &x, &B);
         }

         /* then A = A/2, B = B/2 */
         fp_div_2 (&A, &A);
         fp_div_2 (&B, &B);
      }

      /* 5. strip factors of two from v */
      while (fp_iseven (&v) == 1) {
         /* 5.1 v = v/2 */
         fp_div_2 (&v, &v);

         /* 5.2 when C or D is odd, first C = C + y and D = D - x */
         if (fp_isodd (&C) == 1 || fp_isodd (&D) == 1) {
            fp_add (&C, &y, &C);
            fp_sub (&D, &x, &D);
         }

         /* then C = C/2, D = D/2 */
         fp_div_2 (&C, &C);
         fp_div_2 (&D, &D);
      }

      /* 6. subtract the smaller of u, v from the larger */
      if (fp_cmp (&u, &v) == FP_LT) {
         /* u < v: v = v - u, C = C - A, D = D - B */
         fp_sub (&v, &u, &v);
         fp_sub (&C, &A, &C);
         fp_sub (&D, &B, &D);
      } else {
         /* u >= v: u = u - v, A = A - C, B = B - D */
         fp_sub (&u, &v, &u);
         fp_sub (&A, &C, &A);
         fp_sub (&B, &D, &B);
      }

      /* repeat from step 4 until u reaches zero */
   } while (fp_iszero (&u) == 0);

   /* now a = C, b = D, gcd == g*v */

   /* v != 1 means there is no inverse */
   if (fp_cmp_d (&v, 1) != FP_EQ) {
      return FP_VAL;
   }

   /* bring C up while it compares below zero */
   while (fp_cmp_d(&C, 0) == FP_LT) {
      fp_add(&C, b, &C);
   }

   /* bring C down while its magnitude is not below b's */
   while (fp_cmp_mag(&C, b) != FP_LT) {
      fp_sub(&C, b, &C);
   }

   /* C now holds the inverse */
   fp_copy(&C, c);
   return FP_OKAY;
}
/* c = 1/a (mod b) for odd b only */ /* c = 1/a (mod b) for odd b only */
int fp_invmod(fp_int *a, fp_int *b, fp_int *c) int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
{ {
@ -1113,7 +1289,7 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
/* 2. [modified] b must be odd */ /* 2. [modified] b must be odd */
if (fp_iseven (b) == FP_YES) { if (fp_iseven (b) == FP_YES) {
return FP_VAL; return fp_invmod_slow(a,b,c);
} }
/* init all our temps */ /* init all our temps */
@ -1814,8 +1990,6 @@ asm( \
#define LO 0 #define LO 0
#define HI 1
#define CY 2
/* computes x/R == x (mod N) via Montgomery Reduction */ /* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
@ -1862,7 +2036,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
} }
LOOP_END; LOOP_END;
while (cy) { while (cy) {
PROPCARRY; // cy = cy > (*_c += cy); PROPCARRY;
++_c; ++_c;
} }
} }
@ -1889,11 +2063,11 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
} }
} }
/* $Source$ */ /* $Source$ */
/* $Revision$ */ /* $Revision$ */
/* $Date$ */ /* $Date$ */
/* End: fp_montgomery_reduce.c */ /* End: fp_montgomery_reduce.c */
/* Start: fp_montgomery_setup.c */ /* Start: fp_montgomery_setup.c */
@ -2270,7 +2444,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
/* this should multiply i and j */ /* this should multiply i and j */
#define MULADD(i, j) \ #define MULADD(i, j) \
asm ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -2341,7 +2515,7 @@ asm ( \
/* this should multiply i and j */ /* this should multiply i and j */
#define MULADD(i, j) \ #define MULADD(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \ "movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \ "pmuludq %%mm1,%%mm0\n\t" \
@ -5678,7 +5852,7 @@ Obvious points of optimization
#define COMBA_FINI #define COMBA_FINI
#define SQRADD(i, j) \ #define SQRADD(i, j) \
asm volatile ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %%eax \n\t" \ "mull %%eax \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -5687,7 +5861,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm volatile ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -5699,7 +5873,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j) \ #define SQRADDSC(i, j) \
asm ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"movl %%eax,%0 \n\t" \ "movl %%eax,%0 \n\t" \
@ -5708,7 +5882,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
#define SQRADDAC(i, j) \ #define SQRADDAC(i, j) \
asm ( \ asm( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -5717,7 +5891,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
#define SQRADDDB \ #define SQRADDDB \
asm ( \ asm( \
"addl %6,%0 \n\t" \ "addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \ "adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \ "adcl %8,%2 \n\t" \
@ -5746,7 +5920,7 @@ asm ( \
#define COMBA_FINI #define COMBA_FINI
#define SQRADD(i, j) \ #define SQRADD(i, j) \
asm ( \ asm( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \ "mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
@ -5755,7 +5929,7 @@ asm ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm ( \ asm( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
@ -5767,7 +5941,7 @@ asm ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDSC(i, j) \ #define SQRADDSC(i, j) \
asm ( \ asm( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"movq %%rax,%0 \n\t" \ "movq %%rax,%0 \n\t" \
@ -5776,7 +5950,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDAC(i, j) \ #define SQRADDAC(i, j) \
asm ( \ asm( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
@ -5785,7 +5959,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDDB \ #define SQRADDDB \
asm ( \ asm( \
"addq %6,%0 \n\t" \ "addq %6,%0 \n\t" \
"adcq %7,%1 \n\t" \ "adcq %7,%1 \n\t" \
"adcq %8,%2 \n\t" \ "adcq %8,%2 \n\t" \
@ -5815,7 +5989,7 @@ asm ( \
asm("emms"); asm("emms");
#define SQRADD(i, j) \ #define SQRADD(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"pmuludq %%mm0,%%mm0\n\t" \ "pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \ "movd %%mm0,%%eax \n\t" \
@ -5827,7 +6001,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \ "movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \ "pmuludq %%mm1,%%mm0\n\t" \
@ -5843,7 +6017,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j) \ #define SQRADDSC(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \ "movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \ "pmuludq %%mm1,%%mm0\n\t" \
@ -5854,7 +6028,7 @@ asm volatile ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j));
#define SQRADDAC(i, j) \ #define SQRADDAC(i, j) \
asm volatile ( \ asm( \
"movd %6,%%mm0 \n\t" \ "movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \ "movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \ "pmuludq %%mm1,%%mm0\n\t" \
@ -5867,7 +6041,7 @@ asm volatile ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc"); :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDDB \ #define SQRADDDB \
asm ( \ asm( \
"addl %6,%0 \n\t" \ "addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \ "adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \ "adcl %8,%2 \n\t" \

56
tfm.aux
View File

@ -17,40 +17,42 @@
\@writefile{toc}{\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}} \@writefile{toc}{\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}}
\@writefile{toc}{\contentsline {section}{\numberline {1.2}License}{2}{section.1.2}} \@writefile{toc}{\contentsline {section}{\numberline {1.2}License}{2}{section.1.2}}
\@writefile{toc}{\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}} \@writefile{toc}{\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1}} \@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Intel CC}{2}{subsection.1.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2}} \@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}MSVC}{2}{subsection.1.3.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Build Limitations}{3}{subsection.1.3.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.4}Optimization Configuration}{3}{subsection.1.3.4}}
\@writefile{toc}{\contentsline {subsubsection}{x86--32}{3}{section*.3}} \@writefile{toc}{\contentsline {subsubsection}{x86--32}{3}{section*.3}}
\@writefile{toc}{\contentsline {subsubsection}{SSE2}{3}{section*.4}} \@writefile{toc}{\contentsline {subsubsection}{SSE2}{3}{section*.4}}
\@writefile{toc}{\contentsline {subsubsection}{x86--64}{3}{section*.5}} \@writefile{toc}{\contentsline {subsubsection}{x86--64}{4}{section*.5}}
\@writefile{toc}{\contentsline {subsubsection}{ARM}{3}{section*.6}} \@writefile{toc}{\contentsline {subsubsection}{ARM}{4}{section*.6}}
\@writefile{toc}{\contentsline {subsubsection}{PPC32}{3}{section*.7}} \@writefile{toc}{\contentsline {subsubsection}{PPC32}{4}{section*.7}}
\@writefile{toc}{\contentsline {subsubsection}{Future Releases}{4}{section*.8}} \@writefile{toc}{\contentsline {subsubsection}{Future Releases}{4}{section*.8}}
\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Recommended Build Modes}}{4}{figure.1.1}} \@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Recommended Build Modes}}{4}{figure.1.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3}} \@writefile{toc}{\contentsline {subsection}{\numberline {1.3.5}Precision Configuration}{5}{subsection.1.3.5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2}} \@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{7}{chapter.2}}
\@writefile{lof}{\addvspace {10\p@ }} \@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1}} \@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{7}{section.2.1}}
\@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2}} \@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{8}{section.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{8}{subsection.2.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{8}{subsection.2.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{8}{subsection.2.2.3}}
\@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3}} \@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{9}{chapter.3}}
\@writefile{lof}{\addvspace {10\p@ }} \@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1}} \@writefile{toc}{\contentsline {section}{\numberline {3.1}Odds and Evens}{9}{section.3.1}}
\@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2}} \@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{9}{section.3.2}}
\@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3}} \@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{10}{section.3.3}}
\@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4}} \@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{10}{section.3.4}}
\@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5}} \@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{11}{section.3.5}}
\@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6}} \@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{11}{section.3.6}}
\@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7}} \@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{11}{section.3.7}}
\@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8}} \@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{12}{section.3.8}}
\@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4}} \@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{13}{chapter.4}}
\@writefile{lof}{\addvspace {10\p@ }} \@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }}
\newlabel{chap:asmops}{{4}{11}{Porting TomsFastMath\relax }{chapter.4}{}} \newlabel{chap:asmops}{{4}{13}{Porting TomsFastMath\relax }{chapter.4}{}}
\@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1}} \@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{13}{section.4.1}}
\@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2}} \@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{13}{section.4.2}}
\@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3}} \@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{15}{section.4.3}}
\@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4}} \@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{17}{section.4.4}}

BIN
tfm.dvi

Binary file not shown.

17
tfm.h
View File

@ -48,6 +48,11 @@
*/ */
/* #define TFM_PRESCOTT */ /* #define TFM_PRESCOTT */
/* Do we want a timing-resistant fp_exptmod()?
 * This makes it slower, but its running time no longer depends on the exponent.
 */
/* #define TFM_TIMING_RESISTANT */
#endif #endif
/* Max size of any number in bits. Basically the largest size you will be multiplying /* Max size of any number in bits. Basically the largest size you will be multiplying
@ -355,15 +360,25 @@ int fp_toradix_n(fp_int * a, char *str, int radix, int maxlen);
void s_fp_add(fp_int *a, fp_int *b, fp_int *c); void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
void s_fp_sub(fp_int *a, fp_int *b, fp_int *c); void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
void bn_reverse(unsigned char *s, int len); void bn_reverse(unsigned char *s, int len);
void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
#ifdef TFM_SMALL_SET
void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
#endif
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C);
void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C);
#endif #endif
void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
void fp_sqr_comba(fp_int *A, fp_int *B);
#ifdef TFM_SMALL_SET
void fp_sqr_comba_small(fp_int *A, fp_int *B); void fp_sqr_comba_small(fp_int *A, fp_int *B);
#endif
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B); void fp_sqr_comba32(fp_int *A, fp_int *B);
void fp_sqr_comba48(fp_int *A, fp_int *B); void fp_sqr_comba48(fp_int *A, fp_int *B);

58
tfm.idx
View File

@ -1,29 +1,29 @@
\indexentry{fp\_init|hyperpage}{6} \indexentry{fp\_init|hyperpage}{8}
\indexentry{fp\_set|hyperpage}{6} \indexentry{fp\_set|hyperpage}{8}
\indexentry{fp\_init\_copy|hyperpage}{6} \indexentry{fp\_init\_copy|hyperpage}{8}
\indexentry{fp\_iszero|hyperpage}{7} \indexentry{fp\_iszero|hyperpage}{9}
\indexentry{fp\_iseven|hyperpage}{7} \indexentry{fp\_iseven|hyperpage}{9}
\indexentry{fp\_isodd|hyperpage}{7} \indexentry{fp\_isodd|hyperpage}{9}
\indexentry{fp\_neg|hyperpage}{7} \indexentry{fp\_neg|hyperpage}{9}
\indexentry{fp\_abs|hyperpage}{7} \indexentry{fp\_abs|hyperpage}{9}
\indexentry{fp\_cmp|hyperpage}{8} \indexentry{fp\_cmp|hyperpage}{10}
\indexentry{fp\_cmp\_mag|hyperpage}{8} \indexentry{fp\_cmp\_mag|hyperpage}{10}
\indexentry{fp\_lshd|hyperpage}{8} \indexentry{fp\_lshd|hyperpage}{10}
\indexentry{fp\_rshd|hyperpage}{8} \indexentry{fp\_rshd|hyperpage}{10}
\indexentry{fp\_div\_2d|hyperpage}{8} \indexentry{fp\_div\_2d|hyperpage}{10}
\indexentry{fp\_mod\_2d|hyperpage}{8} \indexentry{fp\_mod\_2d|hyperpage}{10}
\indexentry{fp\_mul\_2d|hyperpage}{8} \indexentry{fp\_mul\_2d|hyperpage}{10}
\indexentry{fp\_div\_2|hyperpage}{8} \indexentry{fp\_div\_2|hyperpage}{10}
\indexentry{fp\_mul\_2|hyperpage}{8} \indexentry{fp\_mul\_2|hyperpage}{10}
\indexentry{fp\_cnt\_lsb|hyperpage}{8} \indexentry{fp\_cnt\_lsb|hyperpage}{10}
\indexentry{fp\_add|hyperpage}{9} \indexentry{fp\_add|hyperpage}{11}
\indexentry{fp\_sub|hyperpage}{9} \indexentry{fp\_sub|hyperpage}{11}
\indexentry{fp\_mul|hyperpage}{9} \indexentry{fp\_mul|hyperpage}{11}
\indexentry{fp\_sqr|hyperpage}{9} \indexentry{fp\_sqr|hyperpage}{11}
\indexentry{fp\_div|hyperpage}{9} \indexentry{fp\_div|hyperpage}{11}
\indexentry{fp\_mod|hyperpage}{9} \indexentry{fp\_mod|hyperpage}{11}
\indexentry{fp\_exptmod|hyperpage}{9} \indexentry{fp\_exptmod|hyperpage}{11}
\indexentry{fp\_invmod|hyperpage}{9} \indexentry{fp\_invmod|hyperpage}{11}
\indexentry{fp\_gcd|hyperpage}{9} \indexentry{fp\_gcd|hyperpage}{11}
\indexentry{fp\_lcm|hyperpage}{9} \indexentry{fp\_lcm|hyperpage}{11}
\indexentry{fp\_isprime|hyperpage}{10} \indexentry{fp\_isprime|hyperpage}{12}

58
tfm.ind
View File

@ -1,33 +1,33 @@
\begin{theindex} \begin{theindex}
\item fp\_abs, \hyperpage{7} \item fp\_abs, \hyperpage{9}
\item fp\_add, \hyperpage{9} \item fp\_add, \hyperpage{11}
\item fp\_cmp, \hyperpage{8} \item fp\_cmp, \hyperpage{10}
\item fp\_cmp\_mag, \hyperpage{8} \item fp\_cmp\_mag, \hyperpage{10}
\item fp\_cnt\_lsb, \hyperpage{8} \item fp\_cnt\_lsb, \hyperpage{10}
\item fp\_div, \hyperpage{9} \item fp\_div, \hyperpage{11}
\item fp\_div\_2, \hyperpage{8} \item fp\_div\_2, \hyperpage{10}
\item fp\_div\_2d, \hyperpage{8} \item fp\_div\_2d, \hyperpage{10}
\item fp\_exptmod, \hyperpage{9} \item fp\_exptmod, \hyperpage{11}
\item fp\_gcd, \hyperpage{9} \item fp\_gcd, \hyperpage{11}
\item fp\_init, \hyperpage{6} \item fp\_init, \hyperpage{8}
\item fp\_init\_copy, \hyperpage{6} \item fp\_init\_copy, \hyperpage{8}
\item fp\_invmod, \hyperpage{9} \item fp\_invmod, \hyperpage{11}
\item fp\_iseven, \hyperpage{7} \item fp\_iseven, \hyperpage{9}
\item fp\_isodd, \hyperpage{7} \item fp\_isodd, \hyperpage{9}
\item fp\_isprime, \hyperpage{10} \item fp\_isprime, \hyperpage{12}
\item fp\_iszero, \hyperpage{7} \item fp\_iszero, \hyperpage{9}
\item fp\_lcm, \hyperpage{9} \item fp\_lcm, \hyperpage{11}
\item fp\_lshd, \hyperpage{8} \item fp\_lshd, \hyperpage{10}
\item fp\_mod, \hyperpage{9} \item fp\_mod, \hyperpage{11}
\item fp\_mod\_2d, \hyperpage{8} \item fp\_mod\_2d, \hyperpage{10}
\item fp\_mul, \hyperpage{9} \item fp\_mul, \hyperpage{11}
\item fp\_mul\_2, \hyperpage{8} \item fp\_mul\_2, \hyperpage{10}
\item fp\_mul\_2d, \hyperpage{8} \item fp\_mul\_2d, \hyperpage{10}
\item fp\_neg, \hyperpage{7} \item fp\_neg, \hyperpage{9}
\item fp\_rshd, \hyperpage{8} \item fp\_rshd, \hyperpage{10}
\item fp\_set, \hyperpage{6} \item fp\_set, \hyperpage{8}
\item fp\_sqr, \hyperpage{9} \item fp\_sqr, \hyperpage{11}
\item fp\_sub, \hyperpage{9} \item fp\_sub, \hyperpage{11}
\end{theindex} \end{theindex}

52
tfm.log
View File

@ -1,4 +1,4 @@
This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10) 23 JUL 2005 07:42 This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10) 1 AUG 2005 13:34
entering extended mode entering extended mode
**tfm **tfm
(./tfm.tex (./tfm.tex
@ -216,107 +216,107 @@ File: umsb.fd 2002/01/19 v2.2g AMS font definitions
Chapter 1. Chapter 1.
[1 [1
] [2] [3] [4] ] [2] [3] [4] [5] [6
]
Chapter 2. Chapter 2.
Underfull \vbox (badness 7649) has occurred while \output is active [] Underfull \vbox (badness 7649) has occurred while \output is active []
[5 [7]
[8]
]
[6]
Chapter 3. Chapter 3.
[7 [9
] [8] [9] [10] ] [10] [11] [12]
Chapter 4. Chapter 4.
[11 [13
] [12] [13] ] [14] [15]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547 Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
[]\OT1/cmtt/m/n/10 #define SQRADDSC(i, j) []\OT1/cmtt/m/n/10 #define SQRADDSC(i, j)
\[] \[]
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547 Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
[] \OT1/cmtt/m/n/10 do { fp_word t; [] \OT1/cmtt/m/n/10 do { fp_word t;
\[] \[]
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547 Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
[] \OT1/cmtt/m/n/10 t = ((fp_word)i) * ((fp_word)j); [] \OT1/cmtt/m/n/10 t = ((fp_word)i) * ((fp_word)j);
\[] \[]
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547 Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
[] \OT1/cmtt/m/n/10 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; [] \OT1/cmtt/m/n/10 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;
\[] \[]
[] []
Overfull \hbox (25.129pt too wide) in paragraph at lines 548--549 Overfull \hbox (25.129pt too wide) in paragraph at lines 560--561
\OT1/cmr/m/n/10 This com-putes a prod-uct and stores it in the ``sec-ondary'' c \OT1/cmr/m/n/10 This com-putes a prod-uct and stores it in the ``sec-ondary'' c
arry reg-is-ters $[]$. arry reg-is-ters $[]$.
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556 Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
[]\OT1/cmtt/m/n/10 #define SQRADDAC(i, j) []\OT1/cmtt/m/n/10 #define SQRADDAC(i, j)
\[] \[]
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556 Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
[] \OT1/cmtt/m/n/10 do { fp_word t; [] \OT1/cmtt/m/n/10 do { fp_word t;
\[] \[]
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556 Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
[] \OT1/cmtt/m/n/10 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t; [] \OT1/cmtt/m/n/10 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t;
\[] \[]
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556 Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
[] \OT1/cmtt/m/n/10 t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t [] \OT1/cmtt/m/n/10 t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t
>> DIGIT_BIT; \[] >> DIGIT_BIT; \[]
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566 Overfull \hbox (74.99634pt too wide) in paragraph at lines 578--578
[]\OT1/cmtt/m/n/10 #define SQRADDDB []\OT1/cmtt/m/n/10 #define SQRADDDB
\[] \[]
[] []
Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566 Overfull \hbox (74.99634pt too wide) in paragraph at lines 578--578
[] \OT1/cmtt/m/n/10 do { fp_word t; [] \OT1/cmtt/m/n/10 do { fp_word t;
\[] \[]
[] []
Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566 Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
[] \OT1/cmtt/m/n/10 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t; [] \OT1/cmtt/m/n/10 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t;
\[] \[]
[] []
Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566 Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
[] \OT1/cmtt/m/n/10 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BI [] \OT1/cmtt/m/n/10 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BI
T); c1 = t; \[] T); c1 = t; \[]
[] []
Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566 Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
[] \OT1/cmtt/m/n/10 c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_B [] \OT1/cmtt/m/n/10 c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_B
IT); \[] IT); \[]
[] []
[14] [15] (./tfm.ind [16] [17 [16] [17] (./tfm.ind [18] [19
]) (./tfm.aux) ) ]) (./tfm.aux) )
@ -329,4 +329,4 @@ Here is how much of TeX's memory you used:
580 hyphenation exceptions out of 1000 580 hyphenation exceptions out of 1000
25i,9n,25p,195b,321s stack positions out of 1500i,500n,1500p,200000b,5000s 25i,9n,25p,195b,321s stack positions out of 1500i,500n,1500p,200000b,5000s
Output written on tfm.dvi (23 pages, 49708 bytes). Output written on tfm.dvi (25 pages, 51612 bytes).

30
tfm.tex
View File

@ -49,8 +49,8 @@
\begin{document} \begin{document}
\frontmatter \frontmatter
\pagestyle{empty} \pagestyle{empty}
\title{TomsFastMath User Manual \\ v0.04} \title{TomsFastMath User Manual \\ v0.05}
\author{Tom St Denis \\ tomstdenis@iahu.ca} \author{Tom St Denis \\ tomstdenis@gmail.com}
\maketitle \maketitle
This text and library are all hereby placed in the public domain. This book has been formatted for B5 This text and library are all hereby placed in the public domain. This book has been formatted for B5
[176x250] paper using the \LaTeX{} {\em book} macro package. [176x250] paper using the \LaTeX{} {\em book} macro package.
@ -101,14 +101,26 @@ fast multiplication and squaring and has the side effect of speeding up ECC oper
TomsFastMath is public domain. TomsFastMath is public domain.
\section{Building} \section{Building}
Currently only a GCC makefile has been provided. To build the library simply type To build the library simply type ``make''. Or to install in typical *unix like directories use
``make''. The library is a bit too new to put into production so no install ``make install''. Similarly a shared library can be built with ``make -f makefile.shared install''.
scripts exist yet. You can build the test program with ``make test''.
To perform simple static testing (useful to test out new assembly ports) use the stest You can build the test program with ``make test''. To perform simple static testing (useful to
program. Type ``make stest'' and run it on your target. The program will perform three test out new assembly ports) use the stest program. Type ``make stest'' and run it on your
multiplications, squarings and montgomery reductions. Likely if your assembly target. The program will perform three multiplications, squarings and montgomery reductions.
code is invalid this code will exhibit the bug. Likely if your assembly code is invalid this code will exhibit the bug.
\subsection{Intel CC}
In theory you should be able to build the library with
\begin{verbatim}
CFLAGS="-O3 -ip" CC=icc make IGNORE_SPEED=1
\end{verbatim}
However, Intel's inline assembler is way less advanced than GCC's. As a result the library doesn't compile with it.
Fortunately it doesn't really matter.
\subsection{MSVC}
The library doesn't build with MSVC. Imagine that.
\subsection{Build Limitations} \subsection{Build Limitations}
TomsFastMath has the following build requirements which are non--portable but under most TomsFastMath has the following build requirements which are non--portable but under most

54
tfm.toc
View File

@ -2,32 +2,34 @@
\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1} \contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}
\contentsline {section}{\numberline {1.2}License}{2}{section.1.2} \contentsline {section}{\numberline {1.2}License}{2}{section.1.2}
\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3} \contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}
\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1} \contentsline {subsection}{\numberline {1.3.1}Intel CC}{2}{subsection.1.3.1}
\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2} \contentsline {subsection}{\numberline {1.3.2}MSVC}{2}{subsection.1.3.2}
\contentsline {subsection}{\numberline {1.3.3}Build Limitations}{3}{subsection.1.3.3}
\contentsline {subsection}{\numberline {1.3.4}Optimization Configuration}{3}{subsection.1.3.4}
\contentsline {subsubsection}{x86--32}{3}{section*.3} \contentsline {subsubsection}{x86--32}{3}{section*.3}
\contentsline {subsubsection}{SSE2}{3}{section*.4} \contentsline {subsubsection}{SSE2}{3}{section*.4}
\contentsline {subsubsection}{x86--64}{3}{section*.5} \contentsline {subsubsection}{x86--64}{4}{section*.5}
\contentsline {subsubsection}{ARM}{3}{section*.6} \contentsline {subsubsection}{ARM}{4}{section*.6}
\contentsline {subsubsection}{PPC32}{3}{section*.7} \contentsline {subsubsection}{PPC32}{4}{section*.7}
\contentsline {subsubsection}{Future Releases}{4}{section*.8} \contentsline {subsubsection}{Future Releases}{4}{section*.8}
\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3} \contentsline {subsection}{\numberline {1.3.5}Precision Configuration}{5}{subsection.1.3.5}
\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2} \contentsline {chapter}{\numberline {2}Getting Started}{7}{chapter.2}
\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1} \contentsline {section}{\numberline {2.1}Data Types}{7}{section.2.1}
\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2} \contentsline {section}{\numberline {2.2}Initialization}{8}{section.2.2}
\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1} \contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{8}{subsection.2.2.1}
\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2} \contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{8}{subsection.2.2.2}
\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3} \contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{8}{subsection.2.2.3}
\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3} \contentsline {chapter}{\numberline {3}Arithmetic Operations}{9}{chapter.3}
\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1} \contentsline {section}{\numberline {3.1}Odds and Evens}{9}{section.3.1}
\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2} \contentsline {section}{\numberline {3.2}Sign Manipulation}{9}{section.3.2}
\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3} \contentsline {section}{\numberline {3.3}Comparisons}{10}{section.3.3}
\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4} \contentsline {section}{\numberline {3.4}Shifting}{10}{section.3.4}
\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5} \contentsline {section}{\numberline {3.5}Basic Algebra}{11}{section.3.5}
\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6} \contentsline {section}{\numberline {3.6}Modular Exponentiation}{11}{section.3.6}
\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7} \contentsline {section}{\numberline {3.7}Number Theoretic}{11}{section.3.7}
\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8} \contentsline {section}{\numberline {3.8}Prime Numbers}{12}{section.3.8}
\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4} \contentsline {chapter}{\numberline {4}Porting TomsFastMath}{13}{chapter.4}
\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1} \contentsline {section}{\numberline {4.1}Getting Started}{13}{section.4.1}
\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2} \contentsline {section}{\numberline {4.2}Multiply with Comba}{13}{section.4.2}
\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3} \contentsline {section}{\numberline {4.3}Squaring with Comba}{15}{section.4.3}
\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4} \contentsline {section}{\numberline {4.4}Montgomery with Comba}{17}{section.4.4}