3
0
Fork 0

added tomsfastmath-0.03

This commit is contained in:
Tom St Denis 2005-03-01 23:00:09 +00:00 committed by Steffen Jaeckel
parent 6bb413fd72
commit ca551d4c5e
15 changed files with 2587 additions and 859 deletions

14
TODO
View File

@ -1,6 +1,20 @@
---
0. IMPORTANT... why are you doubling the "even" terms individually? STUPID!
- make it so you have four new macros that use an additional 3 carry variables
- SQRADDSC - store first mult [ simple store, no carry ]
- SQRADDAC - add subsequent mults [ 3n word add ]
- SQRADDDB - double the carry [ 3n word add ]
- SQRADDFC - forward the doubles into the main [ 3n word add, note, x86_32 may need "g" instead of "r" ]
- only use the four macro pattern for rows with >= 3 "doubles"
- otherwise use the existing SQRADD
1. Write more documentation ;-)
2. Ports to PPC and MIPS
3. Fix any lingering bugs, add additional requested functionality.
4. Unrolled copies of montgomery will speed it up a bit
5.
NOTE: The library is still fairly new. I've tested it quite a bit but that doesn't mean surprises
can't happen. Please test the results you get for correctness.

View File

@ -1,3 +1,8 @@
March 1st, 2005
0.03 -- Optimized squaring
--
September 18th, 2004
0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
This also helps prevent killing the cache on smaller cpus.

View File

@ -3,13 +3,16 @@
int main(int argc, char **argv)
{
int x, y, z, N;
int x, y, z, N, f;
N = atoi(argv[1]);
if (N >= 16 && N < 32) printf("#ifdef TFM_LARGE\n");
if (N >= 32) printf("#ifdef TFM_HUGE\n");
printf(
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
"{\n"
" fp_digit *a, b[%d], c0, c1, c2;\n"
" fp_digit *a, b[%d], c0, c1, c2, sc0, sc1, sc2;\n"
"\n"
" a = A->dp;\n"
" COMBA_START; \n"
@ -25,6 +28,16 @@ printf(
printf(
"\n /* output %d */\n"
" CARRY_FORWARD;\n ", x);
for (f = y = 0; y < N; y++) {
for (z = 0; z < N; z++) {
if (z != y && z + y == x && y <= z) {
++f;
}
}
}
if (f <= 2) {
for (y = 0; y < N; y++) {
for (z = 0; z < N; z++) {
if (y<=z && (y+z)==x) {
@ -36,6 +49,30 @@ printf(
}
}
}
} else {
// new method
/* do evens first */
f = 0;
for (y = 0; y < N; y++) {
for (z = 0; z < N; z++) {
if (z != y && z + y == x && y <= z) {
if (f == 0) {
// first double
printf("SQRADDSC(a[%d], a[%d]); ", y, z);
f = 1;
} else {
printf("SQRADDAC(a[%d], a[%d]); ", y, z);
}
}
}
}
// forward the carry
printf("SQRADDDB; ");
if ((x&1) == 0) {
// add the square
printf("SQRADD(a[%d], a[%d]); ", x/2, x/2);
}
}
printf("\n COMBA_STORE(b[%d]);\n", x);
}
printf(" COMBA_STORE2(b[%d]);\n", N+N-1);
@ -49,5 +86,7 @@ printf(
" fp_clamp(B);\n"
"}\n\n\n", N+N, N+N);
if (N >= 16) printf("#endif\n");
return 0;
}

View File

@ -23,7 +23,7 @@ static ulong64 TIMFUNC (void)
{
#if defined __GNUC__
#if defined(__i386__) || defined(__x86_64__)
unsigned long long a;
ulong64 a;
__asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
return a;
#else /* gcc-IA64 version */
@ -60,7 +60,7 @@ int main(void)
div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
ulong64 t1, t2;
srand(time(NULL));
printf("TFM Ident string:\n%s\n\n", fp_ident());
fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
fp_zero(&a); draw(&a);
@ -135,6 +135,8 @@ int main(void)
printf("Testing read_radix\n");
fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
goto testing;
#if 1
/* test mont */
printf("Montgomery test #1\n");
@ -143,7 +145,7 @@ int main(void)
fp_montgomery_calc_normalization(&b, &a);
fp_read_radix(&d, "123456789123", 16);
for (n = 0; n < 100000; n++) {
for (n = 0; n < 1000000; n++) {
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
fp_mul(&d, &b, &c);
fp_montgomery_reduce(&c, &a, fp);
@ -165,7 +167,7 @@ int main(void)
fp_montgomery_calc_normalization(&b, &a);
fp_read_radix(&d, "123456789123", 16);
for (n = 0; n < 100000; n++) {
for (n = 0; n < 1000000; n++) {
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
fp_mul(&d, &b, &c);
fp_montgomery_reduce(&c, &a, fp);
@ -194,8 +196,8 @@ int main(void)
}
printf("\n\n");
#endif
#if 0
#if 1
/* do some timings... */
printf("Addition:\n");
for (t = 2; t <= FP_SIZE/2; t += 2) {
@ -242,6 +244,7 @@ int main(void)
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
}
//#else
sqrtime:
printf("Squaring:\n");
for (t = 2; t <= FP_SIZE/2; t += 2) {
fp_zero(&a);
@ -260,6 +263,7 @@ int main(void)
}
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
}
return;
//#else
printf("Montgomery:\n");
for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
@ -288,7 +292,9 @@ int main(void)
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
}
//#else
expttime:
printf("Exptmod:\n");
for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
fp_zero(&a);
fp_zero(&b);
@ -303,7 +309,7 @@ int main(void)
c.used = t;
t2 = -1;
for (ix = 0; ix < 1024; ++ix) {
for (ix = 0; ix < 256; ++ix) {
t1 = TIMFUNC();
fp_exptmod(&c, &b, &a, &d);
fp_exptmod(&c, &b, &a, &d);
@ -311,11 +317,15 @@ int main(void)
fp_copy(&b, &c);
fp_copy(&b, &d);
if (t1<t2) { t2 = t1; --ix; }
}
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
}
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
}
return;
#endif
testing:
div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;

Binary file not shown.

View File

@ -12,7 +12,6 @@
/* y = g**x (mod b)
* Some restrictions... x must be positive and < b
*/
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
fp_int M[64], res;
@ -34,7 +33,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
}
/* init M array */
memset(M, 0, sizeof(M));
memset(M, 0, sizeof(M));
/* now setup montgomery */
if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
@ -169,11 +168,12 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
return FP_OKAY;
}
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
fp_int tmp;
int err;
/* is X negative? */
if (X->sign == FP_NEG) {
/* yes, copy G and invmod it */
@ -181,11 +181,12 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
return err;
}
/* _fp_exptmod doesn't care about the sign of X */
return _fp_exptmod(&tmp, X, P, Y);
X->sign = FP_ZPOS;
err = _fp_exptmod(&tmp, X, P, Y);
X->sign = FP_NEG;
return err;
} else {
/* Positive exponent so just exptmod */
return _fp_exptmod(G, X, P, Y);
}
}

View File

@ -29,11 +29,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
} else if (y <= 8) {
fp_mul_comba8(A,B,C);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) {
} else if (y <= 16 && y >= 10) {
fp_mul_comba16(A,B,C);
#endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) {
} else if (y <= 32 && y >= 24) {
fp_mul_comba32(A,B,C);
#endif
} else {

View File

@ -16,7 +16,7 @@ void fp_sqr(fp_int *A, fp_int *B)
fp_int aa, bb, comp, amb, t1;
y = A->used;
if (y <= 48) {
if (y <= 64) {
if (y <= 4) {
fp_sqr_comba4(A,B);
} else if (y <= 8) {
@ -26,8 +26,10 @@ void fp_sqr(fp_int *A, fp_int *B)
fp_sqr_comba16(A,B);
#endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) {
} else if (y <= 32 && y >= 20) {
fp_sqr_comba32(A,B);
} else if (y <= 64 && y >= 48) {
fp_sqr_comba64(A,B);
#endif
} else {
fp_sqr_comba(A, B);

File diff suppressed because it is too large Load Diff

75
fp_sqr_comba_generic.c Normal file
View File

@ -0,0 +1,75 @@
/* generic comba squarer */
void fp_sqr_comba(fp_int *A, fp_int *B)
{
int pa, ix, iz;
fp_digit c0, c1, c2;
fp_int tmp, *dst;
/* get size of output and trim */
pa = A->used + A->used;
if (pa >= FP_SIZE) {
pa = FP_SIZE-1;
}
/* number of output digits to produce */
COMBA_START;
CLEAR_CARRY;
if (A == B) {
fp_zero(&tmp);
dst = &tmp;
} else {
fp_zero(B);
dst = B;
}
for (ix = 0; ix < pa; ix++) {
int tx, ty, iy;
fp_digit *tmpy, *tmpx;
/* get offsets into the two bignums */
ty = MIN(A->used-1, ix);
tx = ix - ty;
/* setup temp aliases */
tmpx = A->dp + tx;
tmpy = A->dp + ty;
/* this is the number of times the loop will iterrate, essentially its
while (tx++ < a->used && ty-- >= 0) { ... }
*/
iy = MIN(A->used-tx, ty+1);
/* now for squaring tx can never equal ty
* we halve the distance since they approach at a rate of 2x
* and we have to round because odd cases need to be executed
*/
iy = MIN(iy, (ty-tx+1)>>1);
/* forward carries */
CARRY_FORWARD;
/* execute loop */
for (iz = 0; iz < iy; iz++) {
SQRADD2(*tmpx++, *tmpy--);
}
/* even columns have the square term in them */
if ((ix&1) == 0) {
SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
}
/* store it */
COMBA_STORE(dst->dp[ix]);
}
COMBA_STORE2(dst->dp[ix]);
COMBA_FINI;
/* setup dest */
dst->used = pa;
fp_clamp (dst);
if (dst != B) {
fp_copy(dst, B);
}
}

View File

@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
#speed
CFLAGS += -fomit-frame-pointer
VERSION=0.02
VERSION=0.03
default: libtfm.a

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,36 @@
New code added in TFM v0.03
OLD 64-bit...[athlon64]
Squaring:
256-bit: 89
512-bit: 234
1024-bit: 815
2048-bit: 2851
NEW 64-bit ...
Squaring:
256-bit: 89
512-bit: 228
1024-bit: 691
2048-bit: 2228
OLD 32-bit [athlonxp]
Squaring:
256-bit: 327
512-bit: 1044
1024-bit: 3646
2048-bit: 17055
NEW 32-bit
Squaring:
256-bit: 332
512-bit: 894
1024-bit: 2983
2048-bit: 10385

11
tfm.h
View File

@ -107,11 +107,11 @@
/* we want no asm? */
#ifdef TFM_NO_ASM
#undef TFM_X86
#undef TFM_X86_64
#undef TFM_SSE2
#undef TFM_ARM
#undef TFM_ASM
#undef TFM_X86
#undef TFM_X86_64
#undef TFM_SSE2
#undef TFM_ARM
#undef TFM_ASM
#endif
/* some default configurations.
@ -350,6 +350,7 @@ void fp_sqr_comba16(fp_int *A, fp_int *B);
#endif
#ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B);
void fp_sqr_comba64(fp_int *A, fp_int *B);
#endif
extern const char *fp_s_rmap;

33
tfm.tex
View File

@ -49,7 +49,7 @@
\begin{document}
\frontmatter
\pagestyle{empty}
\title{TomsFastMath User Manual \\ v0.02}
\title{TomsFastMath User Manual \\ v0.03}
\author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle
This text and library are all hereby placed in the public domain. This book has been formatted for B5
@ -525,6 +525,37 @@ This is essentially the MULADD macro from the multiplication code.
This is like SQRADD except it adds the produce twice. It's similar to
computing SQRADD(i, j*2).
To further make things interesting the squaring code also has ``doubles'' (see my LTM book chapter five...) which are
handled with these macros.
\begin{verbatim}
#define SQRADDSC(i, j) \
do { fp_word t; \
t = ((fp_word)i) * ((fp_word)j); \
sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
} while (0);
\end{verbatim}
This computes a product and stores it in the ``secondary'' carry registers $\left < sc0, sc1, sc2 \right >$.
\begin{verbatim}
#define SQRADDAC(i, j) \
do { fp_word t; \
t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t; \
t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t >> DIGIT_BIT; \
} while (0);
\end{verbatim}
This computes a product and adds it to the ``secondary'' carry registers.
\begin{verbatim}
#define SQRADDDB \
do { fp_word t; \
t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t; \
t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); c1 = t; \
c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT); \
} while (0);
\end{verbatim}
This doubles the ``secondary'' carry registers and adds the sum to the main carry registers. Really complicated.
\section{Montgomery with Comba}
Montgomery reduction is used in modular exponentiation and is most called function during
that operation. It's important to make sure this routine is very fast or all is lost.