added tomsfastmath-0.05

This commit is contained in:
Tom St Denis 2005-08-01 16:37:35 +00:00 committed by Steffen Jaeckel
parent f91cf2d1cf
commit a6c4c5a261
21 changed files with 830 additions and 310 deletions

View File

@ -1,3 +1,10 @@
August 1st, 2005
0.05 -- Quick fix to the fp_invmod.c code to let it handle even moduli [required for LTC]
-- Added makefile.shared to make shared objects [required for LTC]
-- Improved makefiles to make them way more configurable
-- Added timing resistant fp_exptmod() enabled with TFM_TIMING_RESISTANT
July 23rd, 2005
0.04 -- Fixed bugs in the SSE2 squaring code
-- Rewrote the multipliers to be optimized for small inputs
-- Nelson Bolyard of the NSS crew submitted [among other things] new faster Montgomery reduction

View File

@ -1,59 +1,112 @@
/* generate montgomery reductions for m->used = 1...16 */
#include <stdio.h>
int main(void)
{
int N;
for (N = 1; N <= 16; N++) {
printf("void fp_montgomery_reduce_%d(fp_int *a, fp_int *m, fp_digit mp)\n", N);
int x, y, z;
printf(
#if 0
"#ifdef TFM_SMALL_SET\n"
"/* computes x/R == x (mod N) via Montgomery Reduction */\n"
"void fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp)\n"
"{\n"
" fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;\n"
" int oldused, x, y;\n"
" fp_digit c[FP_SIZE], *_c, *tmpm, mu, cy;\n"
" int oldused, x, y, pa;\n"
"\n"
"#if defined(USE_MEMSET)\n"
" /* now zero the buff */\n"
" memset(c, 0, sizeof(c));\n"
" memset(c, 0, sizeof c);\n"
"#endif\n"
" pa = m->used;\n"
"\n"
" /* copy the input */\n"
" oldused = a->used;\n"
" for (x = 0; x < oldused; x++) {\n"
" c[x] = a->dp[x];\n"
" }\n"
"\n"
"#if !defined(USE_MEMSET)\n"
" for (; x < 2*pa+3; x++) {\n"
" c[x] = 0;\n"
" }\n"
"#endif\n"
" MONT_START;\n"
#endif
"\n"
" /* now let's get bizz-sy! */\n"
" for (x = 0; x < %d; x++) {\n"
" /* get Mu for this round */\n"
" LOOP_START;\n"
"\n"
" /* our friendly neighbourhood alias */\n"
" _c = c + x;\n"
" tmpm = m->dp;\n"
"\n"
" for (y = 0; y < %d; y++) {\n"
" INNERMUL;\n"
" ++_c;\n"
" }\n"
" /* send carry up man... */\n"
" _c = c + x;\n"
" PROPCARRY;\n"
" } \n"
"\n"
" /* fix the rest of the carries */\n"
" _c = c + %d;\n"
" for (x = %d; x < %d * 2 + 2; x++) {\n"
" PROPCARRY;\n"
" ++_c;\n"
" switch (pa) {\n");
for (x = 1; x <= 64; x++) {
if (x > 16 && (x != 32 && x != 48 && x != 64)) continue;
if (x > 16) printf("#ifdef TFM_HUGE\n");
printf(" case %d:\n", x);
for (y = 0; y < x; y++) {
printf(" x = %d; cy = 0;\n"
" LOOP_START;\n"
" _c = c + %d;\n"
" tmpm = m->dp;\n", y, y);
printf("#ifdef INNERMUL8\n");
for (z = 0; z+8 <= x; z += 8) {
printf(" INNERMUL8; _c += 8; tmpm += 8;\n");
}
for (; z < x; z++) {
printf(" INNERMUL; ++_c;\n");
}
printf("#else\n");
for (z = 0; z < x; z++) {
printf(" INNERMUL; ++_c;\n");
}
printf("#endif\n");
printf(" LOOP_END;\n"
" while (cy) {\n"
" PROPCARRY;\n"
" ++_c;\n"
" }\n");
}
//printf(" }\n");
printf(" break;\n");
#define LOOP_MACRO(stride) \
for (x = 0; x < stride; x++) { \
fp_digit cy = 0; \
/* get Mu for this round */ \
LOOP_START; \
_c = c + x; \
tmpm = m->dp; \
for (y = 0; y < stride; y++) { \
INNERMUL; \
++_c; \
} \
LOOP_END; \
while (cy) { \
PROPCARRY; \
++_c; \
} \
}
if (x > 16) printf("#endif /* TFM_HUGE */\n");
}
#if 0
printf(
" }\n"
"\n"
" /* now copy out */\n"
" _c = c + %d;\n"
" _c = c + pa;\n"
" tmpm = a->dp;\n"
" for (x = 0; x < %d+1; x++) {\n"
" for (x = 0; x < pa+1; x++) {\n"
" *tmpm++ = *_c++;\n"
" }\n"
"\n"
@ -63,19 +116,17 @@ printf(
"\n"
" MONT_FINI;\n"
"\n"
" a->used = %d+1;\n"
" a->used = pa+1;\n"
" fp_clamp(a);\n"
"\n"
" /* if A >= m then A = A - m */\n"
" if (fp_cmp_mag (a, m) != FP_LT) {\n"
" s_fp_sub (a, m, a);\n"
" }\n"
"}\n", N,N,N,N,N,N,N,N);
}
"}\n\n#endif\n");
#endif
return 0;
}

View File

@ -213,7 +213,7 @@ t1 = TIMFUNC();
sleep(1);
printf("Ticks per second: %llu\n", TIMFUNC() - t1);
goto expttime;
goto multtime;
/* do some timings... */
printf("Addition:\n");
for (t = 2; t <= FP_SIZE/2; t += 2) {

Binary file not shown.

View File

@ -9,6 +9,75 @@
*/
#include <tfm.h>
#ifdef TFM_TIMING_RESISTANT
/* Exponent-timing-resistant modular exponentiation: Y = G**X (mod P).
 *
 * Uses the Montgomery powering ladder (Marc Joye, Sung-Ming Yen,
 * "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded
 * Systems, CHES 2002).  Every exponent bit triggers exactly one multiply
 * and one square, each followed by a Montgomery reduction, so the sequence
 * of operations is the same whatever the bit's value.
 *
 * Returns FP_OKAY on success, or the error reported by
 * fp_montgomery_setup().
 */
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
   fp_int   ladder[2];
   fp_digit window, rho;
   int      status, digit, bits, bit;

   /* set up Montgomery reduction for P; give up if that is not possible */
   status = fp_montgomery_setup(P, &rho);
   if (status != FP_OKAY) {
      return status;
   }

   fp_init(&ladder[0]);
   fp_init(&ladder[1]);

   /* ladder[0] = R mod P */
   fp_montgomery_calc_normalization(&ladder[0], P);

   /* ladder[1] = G * R mod P; G is reduced first unless |P| > |G| */
   if (fp_cmp_mag(P, G) == FP_GT) {
      fp_copy(G, &ladder[1]);
   } else {
      fp_mod(G, P, &ladder[1]);
   }
   fp_mulmod(&ladder[1], &ladder[0], P, &ladder[1]);

   /* walk the exponent from its most significant digit down to digit 0 */
   for (digit = X->used - 1; digit != -1; digit--) {
      window = X->dp[digit];

      /* consume all DIGIT_BIT bits of this digit, msb first */
      for (bits = (int)DIGIT_BIT; bits != 0; bits--) {
         bit      = (fp_digit)(window >> (DIGIT_BIT - 1)) & 1;
         window <<= (fp_digit)1;

         /* ladder[!bit] = ladder[0] * ladder[1] */
         fp_mul(&ladder[0], &ladder[1], &ladder[bit ^ 1]);
         fp_montgomery_reduce(&ladder[bit ^ 1], P, rho);

         /* ladder[bit] = ladder[bit]^2 */
         fp_sqr(&ladder[bit], &ladder[bit]);
         fp_montgomery_reduce(&ladder[bit], P, rho);
      }
   }

   /* one final reduction of ladder[0] (presumably strips the remaining
      Montgomery factor R), then hand the result to the caller */
   fp_montgomery_reduce(&ladder[0], P, rho);
   fp_copy(&ladder[0], Y);
   return FP_OKAY;
}
#else
/* y = g**x (mod b)
* Some restrictions... x must be positive and < b
*/
@ -168,6 +237,8 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
return FP_OKAY;
}
#endif
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{

View File

@ -9,6 +9,111 @@
*/
#include <tfm.h>
/* c = 1/a (mod b), using a binary extended-GCD style loop (repeated
 * halving of u and v with matching cofactor updates).
 *
 * Returns FP_VAL if b is negative or zero, if (a mod b) and b are both
 * even, or if the loop ends with v != 1 (no inverse exists).  Returns the
 * error from fp_mod() if that call fails, and FP_OKAY otherwise.
 */
static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
{
   fp_int x, y;         /* x = a mod b, y = b                    */
   fp_int u, v;         /* working values, seeded from x and y   */
   fp_int A, B, C, D;   /* cofactors carried along with u and v  */
   int    res;

   /* the modulus must be strictly positive */
   if (b->sign == FP_NEG || fp_iszero(b) == 1) {
      return FP_VAL;
   }

   /* init temps */
   fp_init(&x);
   fp_init(&y);
   fp_init(&u);
   fp_init(&v);
   fp_init(&A);
   fp_init(&B);
   fp_init(&C);
   fp_init(&D);

   /* x = a mod b, y = b */
   res = fp_mod(a, b, &x);
   if (res != FP_OKAY) {
      return res;
   }
   fp_copy(b, &y);

   /* 2. [modified] if x and y are both even there is no inverse */
   if (fp_iseven(&x) == 1 && fp_iseven(&y) == 1) {
      return FP_VAL;
   }

   /* 3. u = x, v = y, A = 1, B = 0, C = 0, D = 1
         (B and C are left as fp_init() set them) */
   fp_copy(&x, &u);
   fp_copy(&y, &v);
   fp_set(&A, 1);
   fp_set(&D, 1);

   do {
      /* 4. while u is even: u = u/2 and halve (A, B) */
      while (fp_iseven(&u) == 1) {
         fp_div_2(&u, &u);

         /* 4.2 if A or B is odd, first make both even:
                A = A + y, B = B - x */
         if (fp_isodd(&A) == 1 || fp_isodd(&B) == 1) {
            fp_add(&A, &y, &A);
            fp_sub(&B, &x, &B);
         }
         fp_div_2(&A, &A);
         fp_div_2(&B, &B);
      }

      /* 5. while v is even: v = v/2 and halve (C, D) */
      while (fp_iseven(&v) == 1) {
         fp_div_2(&v, &v);

         /* 5.2 if C or D is odd, first make both even:
                C = C + y, D = D - x */
         if (fp_isodd(&C) == 1 || fp_isodd(&D) == 1) {
            fp_add(&C, &y, &C);
            fp_sub(&D, &x, &D);
         }
         fp_div_2(&C, &C);
         fp_div_2(&D, &D);
      }

      /* 6. subtract the smaller of u, v from the larger */
      if (fp_cmp(&u, &v) == FP_LT) {
         /* v = v - u, C = C - A, D = D - B */
         fp_sub(&v, &u, &v);
         fp_sub(&C, &A, &C);
         fp_sub(&D, &B, &D);
      } else {
         /* u = u - v, A = A - C, B = B - D */
         fp_sub(&u, &v, &u);
         fp_sub(&A, &C, &A);
         fp_sub(&B, &D, &B);
      }

      /* repeat from step 4 until u reaches zero */
   } while (fp_iszero(&u) == 0);

   /* now a = C, b = D, gcd == g*v */

   /* if v != 1 then there is no inverse */
   if (fp_cmp_d(&v, 1) != FP_EQ) {
      return FP_VAL;
   }

   /* bring C up into range if it is negative */
   while (fp_cmp_d(&C, 0) == FP_LT) {
      fp_add(&C, b, &C);
   }

   /* bring C down into range if its magnitude is >= b */
   while (fp_cmp_mag(&C, b) != FP_LT) {
      fp_sub(&C, b, &C);
   }

   /* C is now the inverse */
   fp_copy(&C, c);
   return FP_OKAY;
}
/* c = 1/a (mod b); an even b is handed off to fp_invmod_slow() */
int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
{
@ -17,7 +122,7 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
/* 2. [modified] b must be odd */
if (fp_iseven (b) == FP_YES) {
return FP_VAL;
return fp_invmod_slow(a,b,c);
}
/* init all our temps */

View File

@ -299,8 +299,6 @@ asm( \
#define LO 0
#define HI 1
#define CY 2
/* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
@ -347,7 +345,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
}
LOOP_END;
while (cy) {
PROPCARRY; // cy = cy > (*_c += cy);
PROPCARRY;
++_c;
}
}
@ -374,7 +372,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
}
}
/* $Source$ */
/* $Revision$ */
/* $Date$ */

View File

@ -47,7 +47,7 @@
/* this should multiply i and j */
#define MULADD(i, j) \
asm ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
@ -118,7 +118,7 @@ asm ( \
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \

View File

@ -36,7 +36,7 @@
#define COMBA_FINI
#define SQRADD(i, j) \
asm volatile ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %%eax \n\t" \
"addl %%eax,%0 \n\t" \
@ -45,7 +45,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
#define SQRADD2(i, j) \
asm volatile ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
@ -57,7 +57,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j) \
asm ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"movl %%eax,%0 \n\t" \
@ -66,7 +66,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
#define SQRADDAC(i, j) \
asm ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
@ -75,7 +75,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
#define SQRADDDB \
asm ( \
asm( \
"addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \
@ -104,7 +104,7 @@ asm ( \
#define COMBA_FINI
#define SQRADD(i, j) \
asm ( \
asm( \
"movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \
@ -113,7 +113,7 @@ asm ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j) \
asm ( \
asm( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
@ -125,7 +125,7 @@ asm ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDSC(i, j) \
asm ( \
asm( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"movq %%rax,%0 \n\t" \
@ -134,7 +134,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDAC(i, j) \
asm ( \
asm( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
@ -143,7 +143,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDDB \
asm ( \
asm( \
"addq %6,%0 \n\t" \
"adcq %7,%1 \n\t" \
"adcq %8,%2 \n\t" \
@ -173,7 +173,7 @@ asm ( \
asm("emms");
#define SQRADD(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \
@ -185,7 +185,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
@ -201,7 +201,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
@ -212,7 +212,7 @@ asm volatile ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j));
#define SQRADDAC(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
@ -225,7 +225,7 @@ asm volatile ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDDB \
asm ( \
asm( \
"addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \

View File

@ -1,7 +1,13 @@
#makefile for TomsFastMath
#
#
CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
VERSION=0.05
CFLAGS += -Wall -W -Wshadow -I./
ifndef IGNORE_SPEED
CFLAGS += -O3 -funroll-all-loops
#profiling
#PROF=-pg -g
@ -10,9 +16,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
#speed
CFLAGS += -fomit-frame-pointer
VERSION=0.04
default: libtfm.a
endif
OBJECTS = \
fp_set.o \
@ -52,23 +56,29 @@ ifndef INCPATH
INCPATH=/usr/include
endif
ifndef TFM_GROUP
ifndef INSTALL_GROUP
GROUP=wheel
else
GROUP=$(INSTALL_GROUP)
endif
ifndef TFM_USER
ifndef INSTALL_USER
USER=root
else
USER=$(INSTALL_USER)
endif
ifndef LIBNAME
LIBNAME=libtfm.a
endif
$(LIBNAME): $(OBJECTS)
$(AR) $(ARFLAGS) $(LIBNAME) $(OBJECTS)
ranlib $(LIBNAME)
default: $(LIBNAME)
install: libtfm.a
$(LIBNAME): $(OBJECTS)
$(AR) $(ARFLAGS) $@ $(OBJECTS)
ranlib $@
install: $(LIBNAME)
install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
@ -77,17 +87,17 @@ install: libtfm.a
mtest/mtest: mtest/mtest.c
cd mtest ; make mtest
test: libtfm.a demo/test.o mtest/mtest
$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test
test: $(LIBNAME) demo/test.o mtest/mtest
$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test
timing: libtfm.a demo/test.o
$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test
timing: $(LIBNAME) demo/test.o
$(CC) $(CFLAGS) demo/test.o $(LIBNAME) $(PROF) -o test
stest: libtfm.a demo/stest.o
$(CC) $(CFLAGS) demo/stest.o libtfm.a -o stest
stest: $(LIBNAME) demo/stest.o
$(CC) $(CFLAGS) demo/stest.o $(LIBNAME) -o stest
rsatest: libtfm.a demo/rsa.o
$(CC) $(CFLAGS) demo/rsa.o libtfm.a -o rsatest
rsatest: $(LIBNAME) demo/rsa.o
$(CC) $(CFLAGS) demo/rsa.o $(LIBNAME) -o rsatest
docdvi: tfm.tex
touch tfm.ind
@ -101,8 +111,23 @@ docs: docdvi
dvipdf tfm
mv -f tfm.pdf doc
#This rule cleans the source tree of all compiled code, not including the pdf
#documentation.
clean:
rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest *~ rsatest *.gcda *.gcno demo/*.gcda demo/*.gcno mtest/*.gcno mtest/*.gcda
rm -f `find . -type f | grep "[.]o" | xargs`
rm -f `find . -type f | grep "[.]lo" | xargs`
rm -f `find . -type f | grep "[.]a" | xargs`
rm -f `find . -type f | grep "[.]la" | xargs`
rm -f `find . -type f | grep "[.]obj" | xargs`
rm -f `find . -type f | grep "[.]lib" | xargs`
rm -f `find . -type f | grep "[.]exe" | xargs`
rm -f `find . -type f | grep "[.]gcda" | xargs`
rm -f `find . -type f | grep "[.]gcno" | xargs`
rm -f `find . -type f | grep "[.]il" | xargs`
rm -f `find . -type f | grep "[.]dyn" | xargs`
rm -f `find . -type f | grep "[.]dpi" | xargs`
rm -rf `find . -type d | grep "[.]libs" | xargs`
rm -f tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc
cd mtest ; make clean
no_oops: clean
@ -116,3 +141,7 @@ zipup: no_oops docs clean
cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ ; \
tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 ; \
zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/*
# $Source: /cvs/libtom/tomsfastmath/makefile,v $
# $Revision: 1.17 $
# $Date: 2005/07/30 04:23:55 $

View File

@ -1,55 +0,0 @@
#makefile for TomsFastMath
#
#For the GameBoy Advance... er.... ARMv4
SFLAGS = $(CFLAGS) -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -mthumb -mthumb-interwork -I../devkitadv/mylib/lib
CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -marm -mthumb-interwork -I../devkitadv/mylib/lib
#profiling
#PROF=-pg -g
#CFLAGS += $(PROF)
#speed
CFLAGS += -fomit-frame-pointer
VERSION=0.01
default: libtfm.a
OBJECTS = \
fp_set.o \
\
fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
fp_mul_2.o fp_div_2.o \
\
fp_cnt_lsb.o \
\
fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
s_fp_add.o s_fp_sub.o \
\
fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
\
fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
\
fp_exptmod.o \
\
fp_cmp.o fp_cmp_mag.o \
\
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
\
libtfm.a: $(OBJECTS)
$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
ranlib libtfm.a
demo/stest.o: demo/stest.c
$(CC) $(SFLAGS) -DGBA_MODE demo/stest.c -c -o demo/stest.o
stest: libtfm.a demo/stest.o
$(CC) -mthumb -mthumb-interwork demo/stest.o libtfm.a ../devkitadv/mylib/lib/gba.a -o stest.elf
objcopy -O binary stest.elf stest.bin

109
makefile.shared Normal file
View File

@ -0,0 +1,109 @@
#makefile for TomsFastMath
#
#
CC=libtool --mode=compile gcc
CFLAGS += -Wall -W -Wshadow -I./
ifndef IGNORE_SPEED
CFLAGS += -O3 -funroll-all-loops
#profiling
#PROF=-pg -g
#CFLAGS += $(PROF)
#speed
CFLAGS += -fomit-frame-pointer
endif
VERSION=0:5
OBJECTS = \
fp_set.o \
\
fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
fp_mul_2.o fp_div_2.o \
\
fp_cnt_lsb.o \
\
fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
s_fp_add.o s_fp_sub.o \
\
fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
\
fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
\
fp_exptmod.o \
\
fp_cmp.o fp_cmp_mag.o \
\
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
\
fp_ident.o
HEADERS=tfm.h
ifndef LIBPATH
LIBPATH=/usr/lib
endif
ifndef INCPATH
INCPATH=/usr/include
endif
ifndef INSTALL_GROUP
GROUP=wheel
else
GROUP=$(INSTALL_GROUP)
endif
ifndef INSTALL_USER
USER=root
else
USER=$(INSTALL_USER)
endif
ifndef LIBNAME
LIBNAME=libtfm.la
endif
ifndef LIBNAME_S
LIBNAME_S=libtfm.a
endif
default: $(LIBNAME)
# Link the libtool library ($(LIBNAME)) from every .lo file found below the
# current directory, then build the companion static archive ($(LIBNAME_S))
# from the .o files.  The test/timing/stest targets link against
# $(LIBNAME_S), so it has to be produced here and not at install time.
$(LIBNAME): $(OBJECTS)
	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]lo" | xargs` -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
	libtool --silent --mode=link gcc $(CFLAGS) `find . -type f | grep "[.]o" | xargs` -o $(LIBNAME_S)
	ranlib $(LIBNAME_S)

# Install the libtool library into $(LIBPATH) and the public header(s)
# into $(DESTDIR)$(INCPATH), owned by $(USER):$(GROUP).
install: $(LIBNAME)
	libtool --silent --mode=install install -c $(LIBNAME) $(LIBPATH)/$(LIBNAME)
	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
mtest/mtest: mtest/mtest.c
cd mtest ; make mtest
test: $(LIBNAME) demo/test.o mtest/mtest
$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
timing: $(LIBNAME) demo/test.o
$(CC) $(CFLAGS) demo/test.o $(LIBNAME_S) $(PROF) -o test
stest: $(LIBNAME) demo/stest.o
$(CC) $(CFLAGS) demo/stest.o $(LIBNAME_S) -o stest
# $Source: /cvs/libtom/tomsfastmath/makefile.shared,v $
# $Revision: 1.4 $
# $Date: 2005/07/28 03:08:35 $

View File

@ -757,6 +757,75 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
*/
#include <tfm.h>
#ifdef TFM_TIMING_RESISTANT
/* Exponent-timing-resistant modular exponentiation: Y = G**X (mod P).
 *
 * Uses the Montgomery powering ladder (Marc Joye, Sung-Ming Yen,
 * "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded
 * Systems, CHES 2002).  Every exponent bit triggers exactly one multiply
 * and one square, each followed by a Montgomery reduction, so the sequence
 * of operations is the same whatever the bit's value.
 *
 * Returns FP_OKAY on success, or the error reported by
 * fp_montgomery_setup().
 */
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
   fp_int   ladder[2];
   fp_digit window, rho;
   int      status, digit, bits, bit;

   /* set up Montgomery reduction for P; give up if that is not possible */
   status = fp_montgomery_setup(P, &rho);
   if (status != FP_OKAY) {
      return status;
   }

   fp_init(&ladder[0]);
   fp_init(&ladder[1]);

   /* ladder[0] = R mod P */
   fp_montgomery_calc_normalization(&ladder[0], P);

   /* ladder[1] = G * R mod P; G is reduced first unless |P| > |G| */
   if (fp_cmp_mag(P, G) == FP_GT) {
      fp_copy(G, &ladder[1]);
   } else {
      fp_mod(G, P, &ladder[1]);
   }
   fp_mulmod(&ladder[1], &ladder[0], P, &ladder[1]);

   /* walk the exponent from its most significant digit down to digit 0 */
   for (digit = X->used - 1; digit != -1; digit--) {
      window = X->dp[digit];

      /* consume all DIGIT_BIT bits of this digit, msb first */
      for (bits = (int)DIGIT_BIT; bits != 0; bits--) {
         bit      = (fp_digit)(window >> (DIGIT_BIT - 1)) & 1;
         window <<= (fp_digit)1;

         /* ladder[!bit] = ladder[0] * ladder[1] */
         fp_mul(&ladder[0], &ladder[1], &ladder[bit ^ 1]);
         fp_montgomery_reduce(&ladder[bit ^ 1], P, rho);

         /* ladder[bit] = ladder[bit]^2 */
         fp_sqr(&ladder[bit], &ladder[bit]);
         fp_montgomery_reduce(&ladder[bit], P, rho);
      }
   }

   /* one final reduction of ladder[0] (presumably strips the remaining
      Montgomery factor R), then hand the result to the caller */
   fp_montgomery_reduce(&ladder[0], P, rho);
   fp_copy(&ladder[0], Y);
   return FP_OKAY;
}
#else
/* y = g**x (mod b)
* Some restrictions... x must be positive and < b
*/
@ -916,6 +985,8 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
return FP_OKAY;
}
#endif
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
@ -1105,6 +1176,111 @@ int main(void)
*/
#include <tfm.h>
/* c = 1/a (mod b), using a binary extended-GCD style loop (repeated
 * halving of u and v with matching cofactor updates).
 *
 * Returns FP_VAL if b is negative or zero, if (a mod b) and b are both
 * even, or if the loop ends with v != 1 (no inverse exists).  Returns the
 * error from fp_mod() if that call fails, and FP_OKAY otherwise.
 */
static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
{
   fp_int x, y;         /* x = a mod b, y = b                    */
   fp_int u, v;         /* working values, seeded from x and y   */
   fp_int A, B, C, D;   /* cofactors carried along with u and v  */
   int    res;

   /* the modulus must be strictly positive */
   if (b->sign == FP_NEG || fp_iszero(b) == 1) {
      return FP_VAL;
   }

   /* init temps */
   fp_init(&x);
   fp_init(&y);
   fp_init(&u);
   fp_init(&v);
   fp_init(&A);
   fp_init(&B);
   fp_init(&C);
   fp_init(&D);

   /* x = a mod b, y = b */
   res = fp_mod(a, b, &x);
   if (res != FP_OKAY) {
      return res;
   }
   fp_copy(b, &y);

   /* 2. [modified] if x and y are both even there is no inverse */
   if (fp_iseven(&x) == 1 && fp_iseven(&y) == 1) {
      return FP_VAL;
   }

   /* 3. u = x, v = y, A = 1, B = 0, C = 0, D = 1
         (B and C are left as fp_init() set them) */
   fp_copy(&x, &u);
   fp_copy(&y, &v);
   fp_set(&A, 1);
   fp_set(&D, 1);

   do {
      /* 4. while u is even: u = u/2 and halve (A, B) */
      while (fp_iseven(&u) == 1) {
         fp_div_2(&u, &u);

         /* 4.2 if A or B is odd, first make both even:
                A = A + y, B = B - x */
         if (fp_isodd(&A) == 1 || fp_isodd(&B) == 1) {
            fp_add(&A, &y, &A);
            fp_sub(&B, &x, &B);
         }
         fp_div_2(&A, &A);
         fp_div_2(&B, &B);
      }

      /* 5. while v is even: v = v/2 and halve (C, D) */
      while (fp_iseven(&v) == 1) {
         fp_div_2(&v, &v);

         /* 5.2 if C or D is odd, first make both even:
                C = C + y, D = D - x */
         if (fp_isodd(&C) == 1 || fp_isodd(&D) == 1) {
            fp_add(&C, &y, &C);
            fp_sub(&D, &x, &D);
         }
         fp_div_2(&C, &C);
         fp_div_2(&D, &D);
      }

      /* 6. subtract the smaller of u, v from the larger */
      if (fp_cmp(&u, &v) == FP_LT) {
         /* v = v - u, C = C - A, D = D - B */
         fp_sub(&v, &u, &v);
         fp_sub(&C, &A, &C);
         fp_sub(&D, &B, &D);
      } else {
         /* u = u - v, A = A - C, B = B - D */
         fp_sub(&u, &v, &u);
         fp_sub(&A, &C, &A);
         fp_sub(&B, &D, &B);
      }

      /* repeat from step 4 until u reaches zero */
   } while (fp_iszero(&u) == 0);

   /* now a = C, b = D, gcd == g*v */

   /* if v != 1 then there is no inverse */
   if (fp_cmp_d(&v, 1) != FP_EQ) {
      return FP_VAL;
   }

   /* bring C up into range if it is negative */
   while (fp_cmp_d(&C, 0) == FP_LT) {
      fp_add(&C, b, &C);
   }

   /* bring C down into range if its magnitude is >= b */
   while (fp_cmp_mag(&C, b) != FP_LT) {
      fp_sub(&C, b, &C);
   }

   /* C is now the inverse */
   fp_copy(&C, c);
   return FP_OKAY;
}
/* c = 1/a (mod b); an even b is handed off to fp_invmod_slow() */
int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
{
@ -1113,7 +1289,7 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
/* 2. [modified] b must be odd */
if (fp_iseven (b) == FP_YES) {
return FP_VAL;
return fp_invmod_slow(a,b,c);
}
/* init all our temps */
@ -1814,8 +1990,6 @@ asm( \
#define LO 0
#define HI 1
#define CY 2
/* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
@ -1862,7 +2036,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
}
LOOP_END;
while (cy) {
PROPCARRY; // cy = cy > (*_c += cy);
PROPCARRY;
++_c;
}
}
@ -1889,10 +2063,10 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
}
}
/* $Source$ */
/* $Revision$ */
/* $Date$ */
/* End: fp_montgomery_reduce.c */
@ -2270,7 +2444,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
/* this should multiply i and j */
#define MULADD(i, j) \
asm ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
@ -2341,7 +2515,7 @@ asm ( \
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
@ -5678,7 +5852,7 @@ Obvious points of optimization
#define COMBA_FINI
#define SQRADD(i, j) \
asm volatile ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %%eax \n\t" \
"addl %%eax,%0 \n\t" \
@ -5687,7 +5861,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
#define SQRADD2(i, j) \
asm volatile ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
@ -5699,7 +5873,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j) \
asm ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"movl %%eax,%0 \n\t" \
@ -5708,7 +5882,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
#define SQRADDAC(i, j) \
asm ( \
asm( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
@ -5717,7 +5891,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
#define SQRADDDB \
asm ( \
asm( \
"addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \
@ -5746,7 +5920,7 @@ asm ( \
#define COMBA_FINI
#define SQRADD(i, j) \
asm ( \
asm( \
"movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \
@ -5755,7 +5929,7 @@ asm ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j) \
asm ( \
asm( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
@ -5767,7 +5941,7 @@ asm ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDSC(i, j) \
asm ( \
asm( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"movq %%rax,%0 \n\t" \
@ -5776,7 +5950,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDAC(i, j) \
asm ( \
asm( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
@ -5785,7 +5959,7 @@ asm ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDDB \
asm ( \
asm( \
"addq %6,%0 \n\t" \
"adcq %7,%1 \n\t" \
"adcq %8,%2 \n\t" \
@ -5815,7 +5989,7 @@ asm ( \
asm("emms");
#define SQRADD(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \
@ -5827,7 +6001,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
@ -5843,7 +6017,7 @@ asm volatile ( \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
@ -5854,7 +6028,7 @@ asm volatile ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j));
#define SQRADDAC(i, j) \
asm volatile ( \
asm( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
@ -5867,7 +6041,7 @@ asm volatile ( \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDDB \
asm ( \
asm( \
"addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \

56
tfm.aux
View File

@ -17,40 +17,42 @@
\@writefile{toc}{\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}}
\@writefile{toc}{\contentsline {section}{\numberline {1.2}License}{2}{section.1.2}}
\@writefile{toc}{\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.1}Intel CC}{2}{subsection.1.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.2}MSVC}{2}{subsection.1.3.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Build Limitations}{3}{subsection.1.3.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.4}Optimization Configuration}{3}{subsection.1.3.4}}
\@writefile{toc}{\contentsline {subsubsection}{x86--32}{3}{section*.3}}
\@writefile{toc}{\contentsline {subsubsection}{SSE2}{3}{section*.4}}
\@writefile{toc}{\contentsline {subsubsection}{x86--64}{3}{section*.5}}
\@writefile{toc}{\contentsline {subsubsection}{ARM}{3}{section*.6}}
\@writefile{toc}{\contentsline {subsubsection}{PPC32}{3}{section*.7}}
\@writefile{toc}{\contentsline {subsubsection}{x86--64}{4}{section*.5}}
\@writefile{toc}{\contentsline {subsubsection}{ARM}{4}{section*.6}}
\@writefile{toc}{\contentsline {subsubsection}{PPC32}{4}{section*.7}}
\@writefile{toc}{\contentsline {subsubsection}{Future Releases}{4}{section*.8}}
\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Recommended Build Modes}}{4}{figure.1.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3}}
\@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3.5}Precision Configuration}{5}{subsection.1.3.5}}
\@writefile{toc}{\contentsline {chapter}{\numberline {2}Getting Started}{7}{chapter.2}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1}}
\@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3}}
\@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3}}
\@writefile{toc}{\contentsline {section}{\numberline {2.1}Data Types}{7}{section.2.1}}
\@writefile{toc}{\contentsline {section}{\numberline {2.2}Initialization}{8}{section.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{8}{subsection.2.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{8}{subsection.2.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{8}{subsection.2.2.3}}
\@writefile{toc}{\contentsline {chapter}{\numberline {3}Arithmetic Operations}{9}{chapter.3}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{toc}{\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1}}
\@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2}}
\@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3}}
\@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4}}
\@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5}}
\@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6}}
\@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7}}
\@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8}}
\@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4}}
\@writefile{toc}{\contentsline {section}{\numberline {3.1}Odds and Evens}{9}{section.3.1}}
\@writefile{toc}{\contentsline {section}{\numberline {3.2}Sign Manipulation}{9}{section.3.2}}
\@writefile{toc}{\contentsline {section}{\numberline {3.3}Comparisons}{10}{section.3.3}}
\@writefile{toc}{\contentsline {section}{\numberline {3.4}Shifting}{10}{section.3.4}}
\@writefile{toc}{\contentsline {section}{\numberline {3.5}Basic Algebra}{11}{section.3.5}}
\@writefile{toc}{\contentsline {section}{\numberline {3.6}Modular Exponentiation}{11}{section.3.6}}
\@writefile{toc}{\contentsline {section}{\numberline {3.7}Number Theoretic}{11}{section.3.7}}
\@writefile{toc}{\contentsline {section}{\numberline {3.8}Prime Numbers}{12}{section.3.8}}
\@writefile{toc}{\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{13}{chapter.4}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\newlabel{chap:asmops}{{4}{11}{Porting TomsFastMath\relax }{chapter.4}{}}
\@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1}}
\@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2}}
\@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3}}
\@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4}}
\newlabel{chap:asmops}{{4}{13}{Porting TomsFastMath\relax }{chapter.4}{}}
\@writefile{toc}{\contentsline {section}{\numberline {4.1}Getting Started}{13}{section.4.1}}
\@writefile{toc}{\contentsline {section}{\numberline {4.2}Multiply with Comba}{13}{section.4.2}}
\@writefile{toc}{\contentsline {section}{\numberline {4.3}Squaring with Comba}{15}{section.4.3}}
\@writefile{toc}{\contentsline {section}{\numberline {4.4}Montgomery with Comba}{17}{section.4.4}}

BIN
tfm.dvi

Binary file not shown.

17
tfm.h
View File

@ -48,6 +48,11 @@
*/
/* #define TFM_PRESCOTT */
/* Do we want timing resistant fp_exptmod() ?
* This makes it slower but also timing invariant with respect to the exponent
*/
/* #define TFM_TIMING_RESISTANT */
#endif
/* Max size of any number in bits. Basically the largest size you will be multiplying
@ -355,15 +360,25 @@ int fp_toradix_n(fp_int * a, char *str, int radix, int maxlen);
void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
void bn_reverse(unsigned char *s, int len);
void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
#ifdef TFM_SMALL_SET
void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
#endif
#ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C);
void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C);
#endif
void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C);
void fp_sqr_comba(fp_int *A, fp_int *B);
#ifdef TFM_SMALL_SET
void fp_sqr_comba_small(fp_int *A, fp_int *B);
#endif
#ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B);
void fp_sqr_comba48(fp_int *A, fp_int *B);

58
tfm.idx
View File

@ -1,29 +1,29 @@
\indexentry{fp\_init|hyperpage}{6}
\indexentry{fp\_set|hyperpage}{6}
\indexentry{fp\_init\_copy|hyperpage}{6}
\indexentry{fp\_iszero|hyperpage}{7}
\indexentry{fp\_iseven|hyperpage}{7}
\indexentry{fp\_isodd|hyperpage}{7}
\indexentry{fp\_neg|hyperpage}{7}
\indexentry{fp\_abs|hyperpage}{7}
\indexentry{fp\_cmp|hyperpage}{8}
\indexentry{fp\_cmp\_mag|hyperpage}{8}
\indexentry{fp\_lshd|hyperpage}{8}
\indexentry{fp\_rshd|hyperpage}{8}
\indexentry{fp\_div\_2d|hyperpage}{8}
\indexentry{fp\_mod\_2d|hyperpage}{8}
\indexentry{fp\_mul\_2d|hyperpage}{8}
\indexentry{fp\_div\_2|hyperpage}{8}
\indexentry{fp\_mul\_2|hyperpage}{8}
\indexentry{fp\_cnt\_lsb|hyperpage}{8}
\indexentry{fp\_add|hyperpage}{9}
\indexentry{fp\_sub|hyperpage}{9}
\indexentry{fp\_mul|hyperpage}{9}
\indexentry{fp\_sqr|hyperpage}{9}
\indexentry{fp\_div|hyperpage}{9}
\indexentry{fp\_mod|hyperpage}{9}
\indexentry{fp\_exptmod|hyperpage}{9}
\indexentry{fp\_invmod|hyperpage}{9}
\indexentry{fp\_gcd|hyperpage}{9}
\indexentry{fp\_lcm|hyperpage}{9}
\indexentry{fp\_isprime|hyperpage}{10}
\indexentry{fp\_init|hyperpage}{8}
\indexentry{fp\_set|hyperpage}{8}
\indexentry{fp\_init\_copy|hyperpage}{8}
\indexentry{fp\_iszero|hyperpage}{9}
\indexentry{fp\_iseven|hyperpage}{9}
\indexentry{fp\_isodd|hyperpage}{9}
\indexentry{fp\_neg|hyperpage}{9}
\indexentry{fp\_abs|hyperpage}{9}
\indexentry{fp\_cmp|hyperpage}{10}
\indexentry{fp\_cmp\_mag|hyperpage}{10}
\indexentry{fp\_lshd|hyperpage}{10}
\indexentry{fp\_rshd|hyperpage}{10}
\indexentry{fp\_div\_2d|hyperpage}{10}
\indexentry{fp\_mod\_2d|hyperpage}{10}
\indexentry{fp\_mul\_2d|hyperpage}{10}
\indexentry{fp\_div\_2|hyperpage}{10}
\indexentry{fp\_mul\_2|hyperpage}{10}
\indexentry{fp\_cnt\_lsb|hyperpage}{10}
\indexentry{fp\_add|hyperpage}{11}
\indexentry{fp\_sub|hyperpage}{11}
\indexentry{fp\_mul|hyperpage}{11}
\indexentry{fp\_sqr|hyperpage}{11}
\indexentry{fp\_div|hyperpage}{11}
\indexentry{fp\_mod|hyperpage}{11}
\indexentry{fp\_exptmod|hyperpage}{11}
\indexentry{fp\_invmod|hyperpage}{11}
\indexentry{fp\_gcd|hyperpage}{11}
\indexentry{fp\_lcm|hyperpage}{11}
\indexentry{fp\_isprime|hyperpage}{12}

58
tfm.ind
View File

@ -1,33 +1,33 @@
\begin{theindex}
\item fp\_abs, \hyperpage{7}
\item fp\_add, \hyperpage{9}
\item fp\_cmp, \hyperpage{8}
\item fp\_cmp\_mag, \hyperpage{8}
\item fp\_cnt\_lsb, \hyperpage{8}
\item fp\_div, \hyperpage{9}
\item fp\_div\_2, \hyperpage{8}
\item fp\_div\_2d, \hyperpage{8}
\item fp\_exptmod, \hyperpage{9}
\item fp\_gcd, \hyperpage{9}
\item fp\_init, \hyperpage{6}
\item fp\_init\_copy, \hyperpage{6}
\item fp\_invmod, \hyperpage{9}
\item fp\_iseven, \hyperpage{7}
\item fp\_isodd, \hyperpage{7}
\item fp\_isprime, \hyperpage{10}
\item fp\_iszero, \hyperpage{7}
\item fp\_lcm, \hyperpage{9}
\item fp\_lshd, \hyperpage{8}
\item fp\_mod, \hyperpage{9}
\item fp\_mod\_2d, \hyperpage{8}
\item fp\_mul, \hyperpage{9}
\item fp\_mul\_2, \hyperpage{8}
\item fp\_mul\_2d, \hyperpage{8}
\item fp\_neg, \hyperpage{7}
\item fp\_rshd, \hyperpage{8}
\item fp\_set, \hyperpage{6}
\item fp\_sqr, \hyperpage{9}
\item fp\_sub, \hyperpage{9}
\item fp\_abs, \hyperpage{9}
\item fp\_add, \hyperpage{11}
\item fp\_cmp, \hyperpage{10}
\item fp\_cmp\_mag, \hyperpage{10}
\item fp\_cnt\_lsb, \hyperpage{10}
\item fp\_div, \hyperpage{11}
\item fp\_div\_2, \hyperpage{10}
\item fp\_div\_2d, \hyperpage{10}
\item fp\_exptmod, \hyperpage{11}
\item fp\_gcd, \hyperpage{11}
\item fp\_init, \hyperpage{8}
\item fp\_init\_copy, \hyperpage{8}
\item fp\_invmod, \hyperpage{11}
\item fp\_iseven, \hyperpage{9}
\item fp\_isodd, \hyperpage{9}
\item fp\_isprime, \hyperpage{12}
\item fp\_iszero, \hyperpage{9}
\item fp\_lcm, \hyperpage{11}
\item fp\_lshd, \hyperpage{10}
\item fp\_mod, \hyperpage{11}
\item fp\_mod\_2d, \hyperpage{10}
\item fp\_mul, \hyperpage{11}
\item fp\_mul\_2, \hyperpage{10}
\item fp\_mul\_2d, \hyperpage{10}
\item fp\_neg, \hyperpage{9}
\item fp\_rshd, \hyperpage{10}
\item fp\_set, \hyperpage{8}
\item fp\_sqr, \hyperpage{11}
\item fp\_sub, \hyperpage{11}
\end{theindex}

52
tfm.log
View File

@ -1,4 +1,4 @@
This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10) 23 JUL 2005 07:42
This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) (format=latex 2005.4.10) 1 AUG 2005 13:34
entering extended mode
**tfm
(./tfm.tex
@ -216,107 +216,107 @@ File: umsb.fd 2002/01/19 v2.2g AMS font definitions
Chapter 1.
[1
] [2] [3] [4]
] [2] [3] [4] [5] [6
]
Chapter 2.
Underfull \vbox (badness 7649) has occurred while \output is active []
[5
]
[6]
[7]
[8]
Chapter 3.
[7
[9
] [8] [9] [10]
] [10] [11] [12]
Chapter 4.
[11
[13
] [12] [13]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
] [14] [15]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
[]\OT1/cmtt/m/n/10 #define SQRADDSC(i, j)
\[]
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
[] \OT1/cmtt/m/n/10 do { fp_word t;
\[]
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
[] \OT1/cmtt/m/n/10 t = ((fp_word)i) * ((fp_word)j);
\[]
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 547--547
Overfull \hbox (74.99634pt too wide) in paragraph at lines 559--559
[] \OT1/cmtt/m/n/10 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;
\[]
[]
Overfull \hbox (25.129pt too wide) in paragraph at lines 548--549
Overfull \hbox (25.129pt too wide) in paragraph at lines 560--561
\OT1/cmr/m/n/10 This com-putes a prod-uct and stores it in the ``sec-ondary'' c
arry reg-is-ters $[]$.
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
[]\OT1/cmtt/m/n/10 #define SQRADDAC(i, j)
\[]
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
[] \OT1/cmtt/m/n/10 do { fp_word t;
\[]
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
[] \OT1/cmtt/m/n/10 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t;
\[]
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 556--556
Overfull \hbox (74.99634pt too wide) in paragraph at lines 568--568
[] \OT1/cmtt/m/n/10 t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t
>> DIGIT_BIT; \[]
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566
Overfull \hbox (74.99634pt too wide) in paragraph at lines 578--578
[]\OT1/cmtt/m/n/10 #define SQRADDDB
\[]
[]
Overfull \hbox (74.99634pt too wide) in paragraph at lines 566--566
Overfull \hbox (74.99634pt too wide) in paragraph at lines 578--578
[] \OT1/cmtt/m/n/10 do { fp_word t;
\[]
[]
Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
[] \OT1/cmtt/m/n/10 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t;
\[]
[]
Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
[] \OT1/cmtt/m/n/10 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BI
T); c1 = t; \[]
[]
Overfull \hbox (190.49533pt too wide) in paragraph at lines 566--566
Overfull \hbox (190.49533pt too wide) in paragraph at lines 578--578
[] \OT1/cmtt/m/n/10 c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_B
IT); \[]
[]
[14] [15] (./tfm.ind [16] [17
[16] [17] (./tfm.ind [18] [19
]) (./tfm.aux) )
@ -329,4 +329,4 @@ Here is how much of TeX's memory you used:
580 hyphenation exceptions out of 1000
25i,9n,25p,195b,321s stack positions out of 1500i,500n,1500p,200000b,5000s
Output written on tfm.dvi (23 pages, 49708 bytes).
Output written on tfm.dvi (25 pages, 51612 bytes).

30
tfm.tex
View File

@ -49,8 +49,8 @@
\begin{document}
\frontmatter
\pagestyle{empty}
\title{TomsFastMath User Manual \\ v0.04}
\author{Tom St Denis \\ tomstdenis@iahu.ca}
\title{TomsFastMath User Manual \\ v0.05}
\author{Tom St Denis \\ tomstdenis@gmail.com}
\maketitle
This text and library are all hereby placed in the public domain. This book has been formatted for B5
[176x250] paper using the \LaTeX{} {\em book} macro package.
@ -101,14 +101,26 @@ fast multiplication and squaring and has the side effect of speeding up ECC oper
TomsFastMath is public domain.
\section{Building}
Currently only a GCC makefile has been provided. To build the library simply type
``make''. The library is a bit too new to put into production so no install
scripts exist yet. You can build the test program with ``make test''.
To build the library simply type ``make''. To install it into the typical Unix--like directories use
``make install''. Similarly, a shared library can be built and installed with ``make -f makefile.shared install''.
To perform simple static testing (useful to test out new assembly ports) use the stest
program. Type ``make stest'' and run it on your target. The program will perform three
multiplications, squarings and montgomery reductions. Likely if your assembly
code is invalid this code will exhibit the bug.
You can build the test program with ``make test''. To perform simple static testing (useful to
test out new assembly ports) use the stest program. Type ``make stest'' and run it on your
target. The program will perform three multiplications, squarings and Montgomery reductions.
If your assembly code is invalid, this program will most likely expose the bug.
\subsection{Intel CC}
In theory you should be able to build the library with
\begin{verbatim}
CFLAGS="-O3 -ip" CC=icc make IGNORE_SPEED=1
\end{verbatim}
However, Intel's inline assembler is way less advanced than GCC's. As a result it doesn't compile.
Fortunately it doesn't really matter.
\subsection{MSVC}
The library doesn't build with MSVC. Imagine that.
\subsection{Build Limitations}
TomsFastMath has the following build requirements which are non--portable but under most

54
tfm.toc
View File

@ -2,32 +2,34 @@
\contentsline {section}{\numberline {1.1}What is TomsFastMath?}{1}{section.1.1}
\contentsline {section}{\numberline {1.2}License}{2}{section.1.2}
\contentsline {section}{\numberline {1.3}Building}{2}{section.1.3}
\contentsline {subsection}{\numberline {1.3.1}Build Limitations}{2}{subsection.1.3.1}
\contentsline {subsection}{\numberline {1.3.2}Optimization Configuration}{2}{subsection.1.3.2}
\contentsline {subsection}{\numberline {1.3.1}Intel CC}{2}{subsection.1.3.1}
\contentsline {subsection}{\numberline {1.3.2}MSVC}{2}{subsection.1.3.2}
\contentsline {subsection}{\numberline {1.3.3}Build Limitations}{3}{subsection.1.3.3}
\contentsline {subsection}{\numberline {1.3.4}Optimization Configuration}{3}{subsection.1.3.4}
\contentsline {subsubsection}{x86--32}{3}{section*.3}
\contentsline {subsubsection}{SSE2}{3}{section*.4}
\contentsline {subsubsection}{x86--64}{3}{section*.5}
\contentsline {subsubsection}{ARM}{3}{section*.6}
\contentsline {subsubsection}{PPC32}{3}{section*.7}
\contentsline {subsubsection}{x86--64}{4}{section*.5}
\contentsline {subsubsection}{ARM}{4}{section*.6}
\contentsline {subsubsection}{PPC32}{4}{section*.7}
\contentsline {subsubsection}{Future Releases}{4}{section*.8}
\contentsline {subsection}{\numberline {1.3.3}Precision Configuration}{4}{subsection.1.3.3}
\contentsline {chapter}{\numberline {2}Getting Started}{5}{chapter.2}
\contentsline {section}{\numberline {2.1}Data Types}{5}{section.2.1}
\contentsline {section}{\numberline {2.2}Initialization}{6}{section.2.2}
\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{6}{subsection.2.2.1}
\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{6}{subsection.2.2.2}
\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{6}{subsection.2.2.3}
\contentsline {chapter}{\numberline {3}Arithmetic Operations}{7}{chapter.3}
\contentsline {section}{\numberline {3.1}Odds and Evens}{7}{section.3.1}
\contentsline {section}{\numberline {3.2}Sign Manipulation}{7}{section.3.2}
\contentsline {section}{\numberline {3.3}Comparisons}{8}{section.3.3}
\contentsline {section}{\numberline {3.4}Shifting}{8}{section.3.4}
\contentsline {section}{\numberline {3.5}Basic Algebra}{9}{section.3.5}
\contentsline {section}{\numberline {3.6}Modular Exponentiation}{9}{section.3.6}
\contentsline {section}{\numberline {3.7}Number Theoretic}{9}{section.3.7}
\contentsline {section}{\numberline {3.8}Prime Numbers}{10}{section.3.8}
\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{11}{chapter.4}
\contentsline {section}{\numberline {4.1}Getting Started}{11}{section.4.1}
\contentsline {section}{\numberline {4.2}Multiply with Comba}{11}{section.4.2}
\contentsline {section}{\numberline {4.3}Squaring with Comba}{13}{section.4.3}
\contentsline {section}{\numberline {4.4}Montgomery with Comba}{15}{section.4.4}
\contentsline {subsection}{\numberline {1.3.5}Precision Configuration}{5}{subsection.1.3.5}
\contentsline {chapter}{\numberline {2}Getting Started}{7}{chapter.2}
\contentsline {section}{\numberline {2.1}Data Types}{7}{section.2.1}
\contentsline {section}{\numberline {2.2}Initialization}{8}{section.2.2}
\contentsline {subsection}{\numberline {2.2.1}Simple Initialization}{8}{subsection.2.2.1}
\contentsline {subsection}{\numberline {2.2.2}Initialize Small Constants}{8}{subsection.2.2.2}
\contentsline {subsection}{\numberline {2.2.3}Initialize Copy}{8}{subsection.2.2.3}
\contentsline {chapter}{\numberline {3}Arithmetic Operations}{9}{chapter.3}
\contentsline {section}{\numberline {3.1}Odds and Evens}{9}{section.3.1}
\contentsline {section}{\numberline {3.2}Sign Manipulation}{9}{section.3.2}
\contentsline {section}{\numberline {3.3}Comparisons}{10}{section.3.3}
\contentsline {section}{\numberline {3.4}Shifting}{10}{section.3.4}
\contentsline {section}{\numberline {3.5}Basic Algebra}{11}{section.3.5}
\contentsline {section}{\numberline {3.6}Modular Exponentiation}{11}{section.3.6}
\contentsline {section}{\numberline {3.7}Number Theoretic}{11}{section.3.7}
\contentsline {section}{\numberline {3.8}Prime Numbers}{12}{section.3.8}
\contentsline {chapter}{\numberline {4}Porting TomsFastMath}{13}{chapter.4}
\contentsline {section}{\numberline {4.1}Getting Started}{13}{section.4.1}
\contentsline {section}{\numberline {4.2}Multiply with Comba}{13}{section.4.2}
\contentsline {section}{\numberline {4.3}Squaring with Comba}{15}{section.4.3}
\contentsline {section}{\numberline {4.4}Montgomery with Comba}{17}{section.4.4}