added tomsfastmath-0.03
This commit is contained in:
parent
6bb413fd72
commit
ca551d4c5e
14
TODO
14
TODO
@ -1,6 +1,20 @@
|
||||
---
|
||||
0. IMPORTANT... why are you doubling the "even" terms individually? STUPID!
|
||||
- make it so you have four new macros that use an additional 3 carry variables
|
||||
- SQRADDSC - store first mult [ simple store, no carry ]
|
||||
- SQRADDAC - add subsequent mults [ 3n word add ]
|
||||
- SQRADDDB - double the carry [ 3n word add ]
|
||||
- SQRADDFC - forward the doubles into the main [ 3n word add, note, x86_32 may need "g" instead of "r" ]
|
||||
- only use the four macro pattern for rows with >= 3 "doubles"
|
||||
- otherwise use the existing SQRADD
|
||||
|
||||
|
||||
1. Write more documentation ;-)
|
||||
2. Ports to PPC and MIPS
|
||||
3. Fix any lingering bugs, add additional requested functionality.
|
||||
4. Unrolled copies of montgomery will speed it up a bit
|
||||
5.
|
||||
|
||||
|
||||
NOTE: The library is still fairly new. I've tested it quite a bit but that doesn't mean surprises
|
||||
can't happen. Please test the results you get for correctness.
|
||||
|
@ -1,3 +1,8 @@
|
||||
March 1st, 2005
|
||||
0.03 -- Optimized squaring
|
||||
--
|
||||
|
||||
|
||||
September 18th, 2004
|
||||
0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
|
||||
This also helps prevent killing the cache on smaller cpus.
|
||||
|
@ -3,13 +3,16 @@
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int x, y, z, N;
|
||||
int x, y, z, N, f;
|
||||
N = atoi(argv[1]);
|
||||
|
||||
if (N >= 16 && N < 32) printf("#ifdef TFM_LARGE\n");
|
||||
if (N >= 32) printf("#ifdef TFM_HUGE\n");
|
||||
|
||||
printf(
|
||||
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
|
||||
"{\n"
|
||||
" fp_digit *a, b[%d], c0, c1, c2;\n"
|
||||
" fp_digit *a, b[%d], c0, c1, c2, sc0, sc1, sc2;\n"
|
||||
"\n"
|
||||
" a = A->dp;\n"
|
||||
" COMBA_START; \n"
|
||||
@ -25,6 +28,16 @@ printf(
|
||||
printf(
|
||||
"\n /* output %d */\n"
|
||||
" CARRY_FORWARD;\n ", x);
|
||||
|
||||
for (f = y = 0; y < N; y++) {
|
||||
for (z = 0; z < N; z++) {
|
||||
if (z != y && z + y == x && y <= z) {
|
||||
++f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (f <= 2) {
|
||||
for (y = 0; y < N; y++) {
|
||||
for (z = 0; z < N; z++) {
|
||||
if (y<=z && (y+z)==x) {
|
||||
@ -36,6 +49,30 @@ printf(
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// new method
|
||||
/* do evens first */
|
||||
f = 0;
|
||||
for (y = 0; y < N; y++) {
|
||||
for (z = 0; z < N; z++) {
|
||||
if (z != y && z + y == x && y <= z) {
|
||||
if (f == 0) {
|
||||
// first double
|
||||
printf("SQRADDSC(a[%d], a[%d]); ", y, z);
|
||||
f = 1;
|
||||
} else {
|
||||
printf("SQRADDAC(a[%d], a[%d]); ", y, z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// forward the carry
|
||||
printf("SQRADDDB; ");
|
||||
if ((x&1) == 0) {
|
||||
// add the square
|
||||
printf("SQRADD(a[%d], a[%d]); ", x/2, x/2);
|
||||
}
|
||||
}
|
||||
printf("\n COMBA_STORE(b[%d]);\n", x);
|
||||
}
|
||||
printf(" COMBA_STORE2(b[%d]);\n", N+N-1);
|
||||
@ -49,5 +86,7 @@ printf(
|
||||
" fp_clamp(B);\n"
|
||||
"}\n\n\n", N+N, N+N);
|
||||
|
||||
if (N >= 16) printf("#endif\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
26
demo/test.c
26
demo/test.c
@ -23,7 +23,7 @@ static ulong64 TIMFUNC (void)
|
||||
{
|
||||
#if defined __GNUC__
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
unsigned long long a;
|
||||
ulong64 a;
|
||||
__asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
|
||||
return a;
|
||||
#else /* gcc-IA64 version */
|
||||
@ -60,7 +60,7 @@ int main(void)
|
||||
div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
|
||||
ulong64 t1, t2;
|
||||
|
||||
|
||||
srand(time(NULL));
|
||||
printf("TFM Ident string:\n%s\n\n", fp_ident());
|
||||
fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
|
||||
fp_zero(&a); draw(&a);
|
||||
@ -135,6 +135,8 @@ int main(void)
|
||||
printf("Testing read_radix\n");
|
||||
fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
|
||||
|
||||
goto testing;
|
||||
|
||||
#if 1
|
||||
/* test mont */
|
||||
printf("Montgomery test #1\n");
|
||||
@ -143,7 +145,7 @@ int main(void)
|
||||
fp_montgomery_calc_normalization(&b, &a);
|
||||
|
||||
fp_read_radix(&d, "123456789123", 16);
|
||||
for (n = 0; n < 100000; n++) {
|
||||
for (n = 0; n < 1000000; n++) {
|
||||
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
|
||||
fp_mul(&d, &b, &c);
|
||||
fp_montgomery_reduce(&c, &a, fp);
|
||||
@ -165,7 +167,7 @@ int main(void)
|
||||
fp_montgomery_calc_normalization(&b, &a);
|
||||
|
||||
fp_read_radix(&d, "123456789123", 16);
|
||||
for (n = 0; n < 100000; n++) {
|
||||
for (n = 0; n < 1000000; n++) {
|
||||
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
|
||||
fp_mul(&d, &b, &c);
|
||||
fp_montgomery_reduce(&c, &a, fp);
|
||||
@ -195,7 +197,7 @@ int main(void)
|
||||
printf("\n\n");
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
/* do some timings... */
|
||||
printf("Addition:\n");
|
||||
for (t = 2; t <= FP_SIZE/2; t += 2) {
|
||||
@ -242,6 +244,7 @@ int main(void)
|
||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||
}
|
||||
//#else
|
||||
sqrtime:
|
||||
printf("Squaring:\n");
|
||||
for (t = 2; t <= FP_SIZE/2; t += 2) {
|
||||
fp_zero(&a);
|
||||
@ -260,6 +263,7 @@ int main(void)
|
||||
}
|
||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||
}
|
||||
return;
|
||||
//#else
|
||||
printf("Montgomery:\n");
|
||||
for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
|
||||
@ -288,7 +292,9 @@ int main(void)
|
||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||
}
|
||||
//#else
|
||||
expttime:
|
||||
printf("Exptmod:\n");
|
||||
|
||||
for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
|
||||
fp_zero(&a);
|
||||
fp_zero(&b);
|
||||
@ -303,7 +309,7 @@ int main(void)
|
||||
c.used = t;
|
||||
|
||||
t2 = -1;
|
||||
for (ix = 0; ix < 1024; ++ix) {
|
||||
for (ix = 0; ix < 256; ++ix) {
|
||||
t1 = TIMFUNC();
|
||||
fp_exptmod(&c, &b, &a, &d);
|
||||
fp_exptmod(&c, &b, &a, &d);
|
||||
@ -311,11 +317,15 @@ int main(void)
|
||||
fp_copy(&b, &c);
|
||||
fp_copy(&b, &d);
|
||||
if (t1<t2) { t2 = t1; --ix; }
|
||||
}
|
||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||
}
|
||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||
}
|
||||
return;
|
||||
|
||||
#endif
|
||||
|
||||
testing:
|
||||
|
||||
div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
|
||||
sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;
|
||||
|
||||
|
BIN
doc/tfm.pdf
BIN
doc/tfm.pdf
Binary file not shown.
@ -12,7 +12,6 @@
|
||||
/* y = g**x (mod b)
|
||||
* Some restrictions... x must be positive and < b
|
||||
*/
|
||||
|
||||
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
{
|
||||
fp_int M[64], res;
|
||||
@ -169,6 +168,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
return FP_OKAY;
|
||||
}
|
||||
|
||||
|
||||
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
{
|
||||
fp_int tmp;
|
||||
@ -181,11 +181,12 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
|
||||
return err;
|
||||
}
|
||||
/* _fp_exptmod doesn't care about the sign of X */
|
||||
return _fp_exptmod(&tmp, X, P, Y);
|
||||
X->sign = FP_ZPOS;
|
||||
err = _fp_exptmod(&tmp, X, P, Y);
|
||||
X->sign = FP_NEG;
|
||||
return err;
|
||||
} else {
|
||||
/* Positive exponent so just exptmod */
|
||||
return _fp_exptmod(G, X, P, Y);
|
||||
}
|
||||
}
|
||||
|
||||
|
4
fp_mul.c
4
fp_mul.c
@ -29,11 +29,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
|
||||
} else if (y <= 8) {
|
||||
fp_mul_comba8(A,B,C);
|
||||
#if defined(TFM_LARGE)
|
||||
} else if (y <= 16 && y >= 12) {
|
||||
} else if (y <= 16 && y >= 10) {
|
||||
fp_mul_comba16(A,B,C);
|
||||
#endif
|
||||
#if defined(TFM_HUGE)
|
||||
} else if (y <= 32 && y >= 28) {
|
||||
} else if (y <= 32 && y >= 24) {
|
||||
fp_mul_comba32(A,B,C);
|
||||
#endif
|
||||
} else {
|
||||
|
6
fp_sqr.c
6
fp_sqr.c
@ -16,7 +16,7 @@ void fp_sqr(fp_int *A, fp_int *B)
|
||||
fp_int aa, bb, comp, amb, t1;
|
||||
|
||||
y = A->used;
|
||||
if (y <= 48) {
|
||||
if (y <= 64) {
|
||||
if (y <= 4) {
|
||||
fp_sqr_comba4(A,B);
|
||||
} else if (y <= 8) {
|
||||
@ -26,8 +26,10 @@ void fp_sqr(fp_int *A, fp_int *B)
|
||||
fp_sqr_comba16(A,B);
|
||||
#endif
|
||||
#if defined(TFM_HUGE)
|
||||
} else if (y <= 32 && y >= 28) {
|
||||
} else if (y <= 32 && y >= 20) {
|
||||
fp_sqr_comba32(A,B);
|
||||
} else if (y <= 64 && y >= 48) {
|
||||
fp_sqr_comba64(A,B);
|
||||
#endif
|
||||
} else {
|
||||
fp_sqr_comba(A, B);
|
||||
|
1040
fp_sqr_comba.c
1040
fp_sqr_comba.c
File diff suppressed because it is too large
Load Diff
75
fp_sqr_comba_generic.c
Normal file
75
fp_sqr_comba_generic.c
Normal file
@ -0,0 +1,75 @@
|
||||
/* generic comba squarer */
|
||||
void fp_sqr_comba(fp_int *A, fp_int *B)
|
||||
{
|
||||
int pa, ix, iz;
|
||||
fp_digit c0, c1, c2;
|
||||
fp_int tmp, *dst;
|
||||
|
||||
/* get size of output and trim */
|
||||
pa = A->used + A->used;
|
||||
if (pa >= FP_SIZE) {
|
||||
pa = FP_SIZE-1;
|
||||
}
|
||||
|
||||
/* number of output digits to produce */
|
||||
COMBA_START;
|
||||
CLEAR_CARRY;
|
||||
|
||||
if (A == B) {
|
||||
fp_zero(&tmp);
|
||||
dst = &tmp;
|
||||
} else {
|
||||
fp_zero(B);
|
||||
dst = B;
|
||||
}
|
||||
|
||||
for (ix = 0; ix < pa; ix++) {
|
||||
int tx, ty, iy;
|
||||
fp_digit *tmpy, *tmpx;
|
||||
|
||||
/* get offsets into the two bignums */
|
||||
ty = MIN(A->used-1, ix);
|
||||
tx = ix - ty;
|
||||
|
||||
/* setup temp aliases */
|
||||
tmpx = A->dp + tx;
|
||||
tmpy = A->dp + ty;
|
||||
|
||||
/* this is the number of times the loop will iterrate, essentially its
|
||||
while (tx++ < a->used && ty-- >= 0) { ... }
|
||||
*/
|
||||
iy = MIN(A->used-tx, ty+1);
|
||||
|
||||
/* now for squaring tx can never equal ty
|
||||
* we halve the distance since they approach at a rate of 2x
|
||||
* and we have to round because odd cases need to be executed
|
||||
*/
|
||||
iy = MIN(iy, (ty-tx+1)>>1);
|
||||
|
||||
/* forward carries */
|
||||
CARRY_FORWARD;
|
||||
|
||||
/* execute loop */
|
||||
for (iz = 0; iz < iy; iz++) {
|
||||
SQRADD2(*tmpx++, *tmpy--);
|
||||
}
|
||||
|
||||
/* even columns have the square term in them */
|
||||
if ((ix&1) == 0) {
|
||||
SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
|
||||
}
|
||||
|
||||
/* store it */
|
||||
COMBA_STORE(dst->dp[ix]);
|
||||
}
|
||||
COMBA_STORE2(dst->dp[ix]);
|
||||
|
||||
COMBA_FINI;
|
||||
|
||||
/* setup dest */
|
||||
dst->used = pa;
|
||||
fp_clamp (dst);
|
||||
if (dst != B) {
|
||||
fp_copy(dst, B);
|
||||
}
|
||||
}
|
2
makefile
2
makefile
@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
|
||||
#speed
|
||||
CFLAGS += -fomit-frame-pointer
|
||||
|
||||
VERSION=0.02
|
||||
VERSION=0.03
|
||||
|
||||
default: libtfm.a
|
||||
|
||||
|
2132
pre_gen/mpi.c
2132
pre_gen/mpi.c
File diff suppressed because it is too large
Load Diff
36
random_txt_files/newsqr.txt
Normal file
36
random_txt_files/newsqr.txt
Normal file
@ -0,0 +1,36 @@
|
||||
New code added in TFM v0.03
|
||||
|
||||
OLD 64-bit...[athlon64]
|
||||
|
||||
Squaring:
|
||||
256-bit: 89
|
||||
512-bit: 234
|
||||
1024-bit: 815
|
||||
2048-bit: 2851
|
||||
|
||||
NEW 64-bit ...
|
||||
|
||||
Squaring:
|
||||
256-bit: 89
|
||||
512-bit: 228
|
||||
1024-bit: 691
|
||||
2048-bit: 2228
|
||||
|
||||
|
||||
OLD 32-bit [athlonxp]
|
||||
|
||||
Squaring:
|
||||
|
||||
256-bit: 327
|
||||
512-bit: 1044
|
||||
1024-bit: 3646
|
||||
2048-bit: 17055
|
||||
|
||||
NEW 32-bit
|
||||
|
||||
Squaring:
|
||||
|
||||
256-bit: 332
|
||||
512-bit: 894
|
||||
1024-bit: 2983
|
||||
2048-bit: 10385
|
11
tfm.h
11
tfm.h
@ -107,11 +107,11 @@
|
||||
|
||||
/* we want no asm? */
|
||||
#ifdef TFM_NO_ASM
|
||||
#undef TFM_X86
|
||||
#undef TFM_X86_64
|
||||
#undef TFM_SSE2
|
||||
#undef TFM_ARM
|
||||
#undef TFM_ASM
|
||||
#undef TFM_X86
|
||||
#undef TFM_X86_64
|
||||
#undef TFM_SSE2
|
||||
#undef TFM_ARM
|
||||
#undef TFM_ASM
|
||||
#endif
|
||||
|
||||
/* some default configurations.
|
||||
@ -350,6 +350,7 @@ void fp_sqr_comba16(fp_int *A, fp_int *B);
|
||||
#endif
|
||||
#ifdef TFM_HUGE
|
||||
void fp_sqr_comba32(fp_int *A, fp_int *B);
|
||||
void fp_sqr_comba64(fp_int *A, fp_int *B);
|
||||
#endif
|
||||
extern const char *fp_s_rmap;
|
||||
|
||||
|
33
tfm.tex
33
tfm.tex
@ -49,7 +49,7 @@
|
||||
\begin{document}
|
||||
\frontmatter
|
||||
\pagestyle{empty}
|
||||
\title{TomsFastMath User Manual \\ v0.02}
|
||||
\title{TomsFastMath User Manual \\ v0.03}
|
||||
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
||||
\maketitle
|
||||
This text and library are all hereby placed in the public domain. This book has been formatted for B5
|
||||
@ -525,6 +525,37 @@ This is essentially the MULADD macro from the multiplication code.
|
||||
This is like SQRADD except it adds the produce twice. It's similar to
|
||||
computing SQRADD(i, j*2).
|
||||
|
||||
To further make things interesting the squaring code also has ``doubles'' (see my LTM book chapter five...) which are
|
||||
handled with these macros.
|
||||
|
||||
\begin{verbatim}
|
||||
#define SQRADDSC(i, j) \
|
||||
do { fp_word t; \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
|
||||
} while (0);
|
||||
\end{verbatim}
|
||||
This computes a product and stores it in the ``secondary'' carry registers $\left < sc0, sc1, sc2 \right >$.
|
||||
|
||||
\begin{verbatim}
|
||||
#define SQRADDAC(i, j) \
|
||||
do { fp_word t; \
|
||||
t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t; \
|
||||
t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t >> DIGIT_BIT; \
|
||||
} while (0);
|
||||
\end{verbatim}
|
||||
This computes a product and adds it to the ``secondary'' carry registers.
|
||||
|
||||
\begin{verbatim}
|
||||
#define SQRADDDB \
|
||||
do { fp_word t; \
|
||||
t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t; \
|
||||
t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); c1 = t; \
|
||||
c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT); \
|
||||
} while (0);
|
||||
\end{verbatim}
|
||||
This doubles the ``secondary'' carry registers and adds the sum to the main carry registers. Really complicated.
|
||||
|
||||
\section{Montgomery with Comba}
|
||||
Montgomery reduction is used in modular exponentiation and is most called function during
|
||||
that operation. It's important to make sure this routine is very fast or all is lost.
|
||||
|
Loading…
Reference in New Issue
Block a user