added tomsfastmath-0.03

This commit is contained in:
Tom St Denis 2005-03-01 23:00:09 +00:00 committed by Steffen Jaeckel
parent 6bb413fd72
commit ca551d4c5e
15 changed files with 2587 additions and 859 deletions

14
TODO
View File

@ -1,6 +1,20 @@
---
0. IMPORTANT... why are you doubling the "even" terms individually? STUPID!
- make it so you have four new macros that use an additional 3 carry variables
- SQRADDSC - store first mult [ simple store, no carry ]
- SQRADDAC - add subsequent mults [ 3n word add ]
- SQRADDDB - double the carry [ 3n word add ]
- SQRADDFC - forward the doubles into the main [ 3n word add, note, x86_32 may need "g" instead of "r" ]
- only use the four macro pattern for rows with >= 3 "doubles"
- otherwise use the existing SQRADD
1. Write more documentation ;-) 1. Write more documentation ;-)
2. Ports to PPC and MIPS 2. Ports to PPC and MIPS
3. Fix any lingering bugs, add additional requested functionality. 3. Fix any lingering bugs, add additional requested functionality.
4. Unrolled copies of montgomery will speed it up a bit
5.
NOTE: The library is still fairly new. I've tested it quite a bit but that doesn't mean surprises NOTE: The library is still fairly new. I've tested it quite a bit but that doesn't mean surprises
can't happen. Please test the results you get for correctness. can't happen. Please test the results you get for correctness.

View File

@ -1,3 +1,8 @@
March 1st, 2005
0.03 -- Optimized squaring
--
September 18th, 2004 September 18th, 2004
0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space. 0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
This also helps prevent killing the cache on smaller cpus. This also helps prevent killing the cache on smaller cpus.

View File

@ -3,13 +3,16 @@
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int x, y, z, N; int x, y, z, N, f;
N = atoi(argv[1]); N = atoi(argv[1]);
if (N >= 16 && N < 32) printf("#ifdef TFM_LARGE\n");
if (N >= 32) printf("#ifdef TFM_HUGE\n");
printf( printf(
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n" "void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
"{\n" "{\n"
" fp_digit *a, b[%d], c0, c1, c2;\n" " fp_digit *a, b[%d], c0, c1, c2, sc0, sc1, sc2;\n"
"\n" "\n"
" a = A->dp;\n" " a = A->dp;\n"
" COMBA_START; \n" " COMBA_START; \n"
@ -25,6 +28,16 @@ printf(
printf( printf(
"\n /* output %d */\n" "\n /* output %d */\n"
" CARRY_FORWARD;\n ", x); " CARRY_FORWARD;\n ", x);
for (f = y = 0; y < N; y++) {
for (z = 0; z < N; z++) {
if (z != y && z + y == x && y <= z) {
++f;
}
}
}
if (f <= 2) {
for (y = 0; y < N; y++) { for (y = 0; y < N; y++) {
for (z = 0; z < N; z++) { for (z = 0; z < N; z++) {
if (y<=z && (y+z)==x) { if (y<=z && (y+z)==x) {
@ -36,6 +49,30 @@ printf(
} }
} }
} }
} else {
// new method
/* do evens first */
f = 0;
for (y = 0; y < N; y++) {
for (z = 0; z < N; z++) {
if (z != y && z + y == x && y <= z) {
if (f == 0) {
// first double
printf("SQRADDSC(a[%d], a[%d]); ", y, z);
f = 1;
} else {
printf("SQRADDAC(a[%d], a[%d]); ", y, z);
}
}
}
}
// forward the carry
printf("SQRADDDB; ");
if ((x&1) == 0) {
// add the square
printf("SQRADD(a[%d], a[%d]); ", x/2, x/2);
}
}
printf("\n COMBA_STORE(b[%d]);\n", x); printf("\n COMBA_STORE(b[%d]);\n", x);
} }
printf(" COMBA_STORE2(b[%d]);\n", N+N-1); printf(" COMBA_STORE2(b[%d]);\n", N+N-1);
@ -49,5 +86,7 @@ printf(
" fp_clamp(B);\n" " fp_clamp(B);\n"
"}\n\n\n", N+N, N+N); "}\n\n\n", N+N, N+N);
if (N >= 16) printf("#endif\n");
return 0; return 0;
} }

View File

@ -23,7 +23,7 @@ static ulong64 TIMFUNC (void)
{ {
#if defined __GNUC__ #if defined __GNUC__
#if defined(__i386__) || defined(__x86_64__) #if defined(__i386__) || defined(__x86_64__)
unsigned long long a; ulong64 a;
__asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx"); __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
return a; return a;
#else /* gcc-IA64 version */ #else /* gcc-IA64 version */
@ -60,7 +60,7 @@ int main(void)
div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix; div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
ulong64 t1, t2; ulong64 t1, t2;
srand(time(NULL));
printf("TFM Ident string:\n%s\n\n", fp_ident()); printf("TFM Ident string:\n%s\n\n", fp_ident());
fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
fp_zero(&a); draw(&a); fp_zero(&a); draw(&a);
@ -135,6 +135,8 @@ int main(void)
printf("Testing read_radix\n"); printf("Testing read_radix\n");
fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a); fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
goto testing;
#if 1 #if 1
/* test mont */ /* test mont */
printf("Montgomery test #1\n"); printf("Montgomery test #1\n");
@ -143,7 +145,7 @@ int main(void)
fp_montgomery_calc_normalization(&b, &a); fp_montgomery_calc_normalization(&b, &a);
fp_read_radix(&d, "123456789123", 16); fp_read_radix(&d, "123456789123", 16);
for (n = 0; n < 100000; n++) { for (n = 0; n < 1000000; n++) {
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d); fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
fp_mul(&d, &b, &c); fp_mul(&d, &b, &c);
fp_montgomery_reduce(&c, &a, fp); fp_montgomery_reduce(&c, &a, fp);
@ -165,7 +167,7 @@ int main(void)
fp_montgomery_calc_normalization(&b, &a); fp_montgomery_calc_normalization(&b, &a);
fp_read_radix(&d, "123456789123", 16); fp_read_radix(&d, "123456789123", 16);
for (n = 0; n < 100000; n++) { for (n = 0; n < 1000000; n++) {
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d); fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
fp_mul(&d, &b, &c); fp_mul(&d, &b, &c);
fp_montgomery_reduce(&c, &a, fp); fp_montgomery_reduce(&c, &a, fp);
@ -195,7 +197,7 @@ int main(void)
printf("\n\n"); printf("\n\n");
#endif #endif
#if 0 #if 1
/* do some timings... */ /* do some timings... */
printf("Addition:\n"); printf("Addition:\n");
for (t = 2; t <= FP_SIZE/2; t += 2) { for (t = 2; t <= FP_SIZE/2; t += 2) {
@ -242,6 +244,7 @@ int main(void)
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2); printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
} }
//#else //#else
sqrtime:
printf("Squaring:\n"); printf("Squaring:\n");
for (t = 2; t <= FP_SIZE/2; t += 2) { for (t = 2; t <= FP_SIZE/2; t += 2) {
fp_zero(&a); fp_zero(&a);
@ -260,6 +263,7 @@ int main(void)
} }
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2); printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
} }
return;
//#else //#else
printf("Montgomery:\n"); printf("Montgomery:\n");
for (t = 2; t <= (FP_SIZE/2)-2; t += 2) { for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
@ -288,7 +292,9 @@ int main(void)
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2); printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
} }
//#else //#else
expttime:
printf("Exptmod:\n"); printf("Exptmod:\n");
for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) { for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
fp_zero(&a); fp_zero(&a);
fp_zero(&b); fp_zero(&b);
@ -303,7 +309,7 @@ int main(void)
c.used = t; c.used = t;
t2 = -1; t2 = -1;
for (ix = 0; ix < 1024; ++ix) { for (ix = 0; ix < 256; ++ix) {
t1 = TIMFUNC(); t1 = TIMFUNC();
fp_exptmod(&c, &b, &a, &d); fp_exptmod(&c, &b, &a, &d);
fp_exptmod(&c, &b, &a, &d); fp_exptmod(&c, &b, &a, &d);
@ -311,11 +317,15 @@ int main(void)
fp_copy(&b, &c); fp_copy(&b, &c);
fp_copy(&b, &d); fp_copy(&b, &d);
if (t1<t2) { t2 = t1; --ix; } if (t1<t2) { t2 = t1; --ix; }
} }
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2); printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
} }
return;
#endif #endif
testing:
div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n = div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0; sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;

Binary file not shown.

View File

@ -12,7 +12,6 @@
/* y = g**x (mod b) /* y = g**x (mod b)
* Some restrictions... x must be positive and < b * Some restrictions... x must be positive and < b
*/ */
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{ {
fp_int M[64], res; fp_int M[64], res;
@ -169,6 +168,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
return FP_OKAY; return FP_OKAY;
} }
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{ {
fp_int tmp; fp_int tmp;
@ -181,11 +181,12 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) { if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
return err; return err;
} }
/* _fp_exptmod doesn't care about the sign of X */ X->sign = FP_ZPOS;
return _fp_exptmod(&tmp, X, P, Y); err = _fp_exptmod(&tmp, X, P, Y);
X->sign = FP_NEG;
return err;
} else { } else {
/* Positive exponent so just exptmod */ /* Positive exponent so just exptmod */
return _fp_exptmod(G, X, P, Y); return _fp_exptmod(G, X, P, Y);
} }
} }

View File

@ -29,11 +29,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
} else if (y <= 8) { } else if (y <= 8) {
fp_mul_comba8(A,B,C); fp_mul_comba8(A,B,C);
#if defined(TFM_LARGE) #if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) { } else if (y <= 16 && y >= 10) {
fp_mul_comba16(A,B,C); fp_mul_comba16(A,B,C);
#endif #endif
#if defined(TFM_HUGE) #if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) { } else if (y <= 32 && y >= 24) {
fp_mul_comba32(A,B,C); fp_mul_comba32(A,B,C);
#endif #endif
} else { } else {

View File

@ -16,7 +16,7 @@ void fp_sqr(fp_int *A, fp_int *B)
fp_int aa, bb, comp, amb, t1; fp_int aa, bb, comp, amb, t1;
y = A->used; y = A->used;
if (y <= 48) { if (y <= 64) {
if (y <= 4) { if (y <= 4) {
fp_sqr_comba4(A,B); fp_sqr_comba4(A,B);
} else if (y <= 8) { } else if (y <= 8) {
@ -26,8 +26,10 @@ void fp_sqr(fp_int *A, fp_int *B)
fp_sqr_comba16(A,B); fp_sqr_comba16(A,B);
#endif #endif
#if defined(TFM_HUGE) #if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) { } else if (y <= 32 && y >= 20) {
fp_sqr_comba32(A,B); fp_sqr_comba32(A,B);
} else if (y <= 64 && y >= 48) {
fp_sqr_comba64(A,B);
#endif #endif
} else { } else {
fp_sqr_comba(A, B); fp_sqr_comba(A, B);

File diff suppressed because it is too large Load Diff

75
fp_sqr_comba_generic.c Normal file
View File

@ -0,0 +1,75 @@
/* generic comba squarer */
void fp_sqr_comba(fp_int *A, fp_int *B)
{
int pa, ix, iz;
fp_digit c0, c1, c2;
fp_int tmp, *dst;
/* get size of output and trim */
pa = A->used + A->used;
if (pa >= FP_SIZE) {
pa = FP_SIZE-1;
}
/* number of output digits to produce */
COMBA_START;
CLEAR_CARRY;
if (A == B) {
fp_zero(&tmp);
dst = &tmp;
} else {
fp_zero(B);
dst = B;
}
for (ix = 0; ix < pa; ix++) {
int tx, ty, iy;
fp_digit *tmpy, *tmpx;
/* get offsets into the two bignums */
ty = MIN(A->used-1, ix);
tx = ix - ty;
/* setup temp aliases */
tmpx = A->dp + tx;
tmpy = A->dp + ty;
/* this is the number of times the loop will iterrate, essentially its
while (tx++ < a->used && ty-- >= 0) { ... }
*/
iy = MIN(A->used-tx, ty+1);
/* now for squaring tx can never equal ty
* we halve the distance since they approach at a rate of 2x
* and we have to round because odd cases need to be executed
*/
iy = MIN(iy, (ty-tx+1)>>1);
/* forward carries */
CARRY_FORWARD;
/* execute loop */
for (iz = 0; iz < iy; iz++) {
SQRADD2(*tmpx++, *tmpy--);
}
/* even columns have the square term in them */
if ((ix&1) == 0) {
SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
}
/* store it */
COMBA_STORE(dst->dp[ix]);
}
COMBA_STORE2(dst->dp[ix]);
COMBA_FINI;
/* setup dest */
dst->used = pa;
fp_clamp (dst);
if (dst != B) {
fp_copy(dst, B);
}
}

View File

@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
#speed #speed
CFLAGS += -fomit-frame-pointer CFLAGS += -fomit-frame-pointer
VERSION=0.02 VERSION=0.03
default: libtfm.a default: libtfm.a

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,36 @@
New code added in TFM v0.03
OLD 64-bit...[athlon64]
Squaring:
256-bit: 89
512-bit: 234
1024-bit: 815
2048-bit: 2851
NEW 64-bit ...
Squaring:
256-bit: 89
512-bit: 228
1024-bit: 691
2048-bit: 2228
OLD 32-bit [athlonxp]
Squaring:
256-bit: 327
512-bit: 1044
1024-bit: 3646
2048-bit: 17055
NEW 32-bit
Squaring:
256-bit: 332
512-bit: 894
1024-bit: 2983
2048-bit: 10385

11
tfm.h
View File

@ -107,11 +107,11 @@
/* we want no asm? */ /* we want no asm? */
#ifdef TFM_NO_ASM #ifdef TFM_NO_ASM
#undef TFM_X86 #undef TFM_X86
#undef TFM_X86_64 #undef TFM_X86_64
#undef TFM_SSE2 #undef TFM_SSE2
#undef TFM_ARM #undef TFM_ARM
#undef TFM_ASM #undef TFM_ASM
#endif #endif
/* some default configurations. /* some default configurations.
@ -350,6 +350,7 @@ void fp_sqr_comba16(fp_int *A, fp_int *B);
#endif #endif
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B); void fp_sqr_comba32(fp_int *A, fp_int *B);
void fp_sqr_comba64(fp_int *A, fp_int *B);
#endif #endif
extern const char *fp_s_rmap; extern const char *fp_s_rmap;

33
tfm.tex
View File

@ -49,7 +49,7 @@
\begin{document} \begin{document}
\frontmatter \frontmatter
\pagestyle{empty} \pagestyle{empty}
\title{TomsFastMath User Manual \\ v0.02} \title{TomsFastMath User Manual \\ v0.03}
\author{Tom St Denis \\ tomstdenis@iahu.ca} \author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle \maketitle
This text and library are all hereby placed in the public domain. This book has been formatted for B5 This text and library are all hereby placed in the public domain. This book has been formatted for B5
@ -525,6 +525,37 @@ This is essentially the MULADD macro from the multiplication code.
This is like SQRADD except it adds the produce twice. It's similar to This is like SQRADD except it adds the produce twice. It's similar to
computing SQRADD(i, j*2). computing SQRADD(i, j*2).
To further make things interesting the squaring code also has ``doubles'' (see my LTM book chapter five...) which are
handled with these macros.
\begin{verbatim}
#define SQRADDSC(i, j) \
do { fp_word t; \
t = ((fp_word)i) * ((fp_word)j); \
sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
} while (0);
\end{verbatim}
This computes a product and stores it in the ``secondary'' carry registers $\left < sc0, sc1, sc2 \right >$.
\begin{verbatim}
#define SQRADDAC(i, j) \
do { fp_word t; \
t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t; \
t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t >> DIGIT_BIT; \
} while (0);
\end{verbatim}
This computes a product and adds it to the ``secondary'' carry registers.
\begin{verbatim}
#define SQRADDDB \
do { fp_word t; \
t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t; \
t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); c1 = t; \
c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT); \
} while (0);
\end{verbatim}
This doubles the ``secondary'' carry registers and adds the sum to the main carry registers. Really complicated.
\section{Montgomery with Comba} \section{Montgomery with Comba}
Montgomery reduction is used in modular exponentiation and is most called function during Montgomery reduction is used in modular exponentiation and is most called function during
that operation. It's important to make sure this routine is very fast or all is lost. that operation. It's important to make sure this routine is very fast or all is lost.