forked from ibphoenix/tomsfastmath
added tomsfastmath-0.03
This commit is contained in:
parent
6bb413fd72
commit
ca551d4c5e
14
TODO
14
TODO
@ -1,6 +1,20 @@
|
|||||||
|
---
|
||||||
|
0. IMPORTANT... why are you doubling the "even" terms individually? STUPID!
|
||||||
|
- make it so you have four new macros that use an additional 3 carry variables
|
||||||
|
- SQRADDSC - store first mult [ simple store, no carry ]
|
||||||
|
- SQRADDAC - add subsequent mults [ 3n word add ]
|
||||||
|
- SQRADDDB - double the carry [ 3n word add ]
|
||||||
|
- SQRADDFC - forward the doubles into the main [ 3n word add, note, x86_32 may need "g" instead of "r" ]
|
||||||
|
- only use the four macro pattern for rows with >= 3 "doubles"
|
||||||
|
- otherwise use the existing SQRADD
|
||||||
|
|
||||||
|
|
||||||
1. Write more documentation ;-)
|
1. Write more documentation ;-)
|
||||||
2. Ports to PPC and MIPS
|
2. Ports to PPC and MIPS
|
||||||
3. Fix any lingering bugs, add additional requested functionality.
|
3. Fix any lingering bugs, add additional requested functionality.
|
||||||
|
4. Unrolled copies of montgomery will speed it up a bit
|
||||||
|
5.
|
||||||
|
|
||||||
|
|
||||||
NOTE: The library is still fairly new. I've tested it quite a bit but that doesn't mean surprises
|
NOTE: The library is still fairly new. I've tested it quite a bit but that doesn't mean surprises
|
||||||
can't happen. Please test the results you get for correctness.
|
can't happen. Please test the results you get for correctness.
|
||||||
|
@ -1,3 +1,8 @@
|
|||||||
|
March 1st, 2005
|
||||||
|
0.03 -- Optimized squaring
|
||||||
|
--
|
||||||
|
|
||||||
|
|
||||||
September 18th, 2004
|
September 18th, 2004
|
||||||
0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
|
0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
|
||||||
This also helps prevent killing the cache on smaller cpus.
|
This also helps prevent killing the cache on smaller cpus.
|
||||||
|
@ -3,13 +3,16 @@
|
|||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
int x, y, z, N;
|
int x, y, z, N, f;
|
||||||
N = atoi(argv[1]);
|
N = atoi(argv[1]);
|
||||||
|
|
||||||
|
if (N >= 16 && N < 32) printf("#ifdef TFM_LARGE\n");
|
||||||
|
if (N >= 32) printf("#ifdef TFM_HUGE\n");
|
||||||
|
|
||||||
printf(
|
printf(
|
||||||
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
|
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
" fp_digit *a, b[%d], c0, c1, c2;\n"
|
" fp_digit *a, b[%d], c0, c1, c2, sc0, sc1, sc2;\n"
|
||||||
"\n"
|
"\n"
|
||||||
" a = A->dp;\n"
|
" a = A->dp;\n"
|
||||||
" COMBA_START; \n"
|
" COMBA_START; \n"
|
||||||
@ -25,6 +28,16 @@ printf(
|
|||||||
printf(
|
printf(
|
||||||
"\n /* output %d */\n"
|
"\n /* output %d */\n"
|
||||||
" CARRY_FORWARD;\n ", x);
|
" CARRY_FORWARD;\n ", x);
|
||||||
|
|
||||||
|
for (f = y = 0; y < N; y++) {
|
||||||
|
for (z = 0; z < N; z++) {
|
||||||
|
if (z != y && z + y == x && y <= z) {
|
||||||
|
++f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (f <= 2) {
|
||||||
for (y = 0; y < N; y++) {
|
for (y = 0; y < N; y++) {
|
||||||
for (z = 0; z < N; z++) {
|
for (z = 0; z < N; z++) {
|
||||||
if (y<=z && (y+z)==x) {
|
if (y<=z && (y+z)==x) {
|
||||||
@ -36,6 +49,30 @@ printf(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// new method
|
||||||
|
/* do evens first */
|
||||||
|
f = 0;
|
||||||
|
for (y = 0; y < N; y++) {
|
||||||
|
for (z = 0; z < N; z++) {
|
||||||
|
if (z != y && z + y == x && y <= z) {
|
||||||
|
if (f == 0) {
|
||||||
|
// first double
|
||||||
|
printf("SQRADDSC(a[%d], a[%d]); ", y, z);
|
||||||
|
f = 1;
|
||||||
|
} else {
|
||||||
|
printf("SQRADDAC(a[%d], a[%d]); ", y, z);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// forward the carry
|
||||||
|
printf("SQRADDDB; ");
|
||||||
|
if ((x&1) == 0) {
|
||||||
|
// add the square
|
||||||
|
printf("SQRADD(a[%d], a[%d]); ", x/2, x/2);
|
||||||
|
}
|
||||||
|
}
|
||||||
printf("\n COMBA_STORE(b[%d]);\n", x);
|
printf("\n COMBA_STORE(b[%d]);\n", x);
|
||||||
}
|
}
|
||||||
printf(" COMBA_STORE2(b[%d]);\n", N+N-1);
|
printf(" COMBA_STORE2(b[%d]);\n", N+N-1);
|
||||||
@ -49,5 +86,7 @@ printf(
|
|||||||
" fp_clamp(B);\n"
|
" fp_clamp(B);\n"
|
||||||
"}\n\n\n", N+N, N+N);
|
"}\n\n\n", N+N, N+N);
|
||||||
|
|
||||||
|
if (N >= 16) printf("#endif\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
26
demo/test.c
26
demo/test.c
@ -23,7 +23,7 @@ static ulong64 TIMFUNC (void)
|
|||||||
{
|
{
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
#if defined(__i386__) || defined(__x86_64__)
|
#if defined(__i386__) || defined(__x86_64__)
|
||||||
unsigned long long a;
|
ulong64 a;
|
||||||
__asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
|
__asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
|
||||||
return a;
|
return a;
|
||||||
#else /* gcc-IA64 version */
|
#else /* gcc-IA64 version */
|
||||||
@ -60,7 +60,7 @@ int main(void)
|
|||||||
div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
|
div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
|
||||||
ulong64 t1, t2;
|
ulong64 t1, t2;
|
||||||
|
|
||||||
|
srand(time(NULL));
|
||||||
printf("TFM Ident string:\n%s\n\n", fp_ident());
|
printf("TFM Ident string:\n%s\n\n", fp_ident());
|
||||||
fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
|
fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
|
||||||
fp_zero(&a); draw(&a);
|
fp_zero(&a); draw(&a);
|
||||||
@ -135,6 +135,8 @@ int main(void)
|
|||||||
printf("Testing read_radix\n");
|
printf("Testing read_radix\n");
|
||||||
fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
|
fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
|
||||||
|
|
||||||
|
goto testing;
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
/* test mont */
|
/* test mont */
|
||||||
printf("Montgomery test #1\n");
|
printf("Montgomery test #1\n");
|
||||||
@ -143,7 +145,7 @@ int main(void)
|
|||||||
fp_montgomery_calc_normalization(&b, &a);
|
fp_montgomery_calc_normalization(&b, &a);
|
||||||
|
|
||||||
fp_read_radix(&d, "123456789123", 16);
|
fp_read_radix(&d, "123456789123", 16);
|
||||||
for (n = 0; n < 100000; n++) {
|
for (n = 0; n < 1000000; n++) {
|
||||||
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
|
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
|
||||||
fp_mul(&d, &b, &c);
|
fp_mul(&d, &b, &c);
|
||||||
fp_montgomery_reduce(&c, &a, fp);
|
fp_montgomery_reduce(&c, &a, fp);
|
||||||
@ -165,7 +167,7 @@ int main(void)
|
|||||||
fp_montgomery_calc_normalization(&b, &a);
|
fp_montgomery_calc_normalization(&b, &a);
|
||||||
|
|
||||||
fp_read_radix(&d, "123456789123", 16);
|
fp_read_radix(&d, "123456789123", 16);
|
||||||
for (n = 0; n < 100000; n++) {
|
for (n = 0; n < 1000000; n++) {
|
||||||
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
|
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
|
||||||
fp_mul(&d, &b, &c);
|
fp_mul(&d, &b, &c);
|
||||||
fp_montgomery_reduce(&c, &a, fp);
|
fp_montgomery_reduce(&c, &a, fp);
|
||||||
@ -195,7 +197,7 @@ int main(void)
|
|||||||
printf("\n\n");
|
printf("\n\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if 0
|
#if 1
|
||||||
/* do some timings... */
|
/* do some timings... */
|
||||||
printf("Addition:\n");
|
printf("Addition:\n");
|
||||||
for (t = 2; t <= FP_SIZE/2; t += 2) {
|
for (t = 2; t <= FP_SIZE/2; t += 2) {
|
||||||
@ -242,6 +244,7 @@ int main(void)
|
|||||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||||
}
|
}
|
||||||
//#else
|
//#else
|
||||||
|
sqrtime:
|
||||||
printf("Squaring:\n");
|
printf("Squaring:\n");
|
||||||
for (t = 2; t <= FP_SIZE/2; t += 2) {
|
for (t = 2; t <= FP_SIZE/2; t += 2) {
|
||||||
fp_zero(&a);
|
fp_zero(&a);
|
||||||
@ -260,6 +263,7 @@ int main(void)
|
|||||||
}
|
}
|
||||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
//#else
|
//#else
|
||||||
printf("Montgomery:\n");
|
printf("Montgomery:\n");
|
||||||
for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
|
for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
|
||||||
@ -288,7 +292,9 @@ int main(void)
|
|||||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||||
}
|
}
|
||||||
//#else
|
//#else
|
||||||
|
expttime:
|
||||||
printf("Exptmod:\n");
|
printf("Exptmod:\n");
|
||||||
|
|
||||||
for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
|
for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
|
||||||
fp_zero(&a);
|
fp_zero(&a);
|
||||||
fp_zero(&b);
|
fp_zero(&b);
|
||||||
@ -303,7 +309,7 @@ int main(void)
|
|||||||
c.used = t;
|
c.used = t;
|
||||||
|
|
||||||
t2 = -1;
|
t2 = -1;
|
||||||
for (ix = 0; ix < 1024; ++ix) {
|
for (ix = 0; ix < 256; ++ix) {
|
||||||
t1 = TIMFUNC();
|
t1 = TIMFUNC();
|
||||||
fp_exptmod(&c, &b, &a, &d);
|
fp_exptmod(&c, &b, &a, &d);
|
||||||
fp_exptmod(&c, &b, &a, &d);
|
fp_exptmod(&c, &b, &a, &d);
|
||||||
@ -311,11 +317,15 @@ int main(void)
|
|||||||
fp_copy(&b, &c);
|
fp_copy(&b, &c);
|
||||||
fp_copy(&b, &d);
|
fp_copy(&b, &d);
|
||||||
if (t1<t2) { t2 = t1; --ix; }
|
if (t1<t2) { t2 = t1; --ix; }
|
||||||
}
|
}
|
||||||
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
testing:
|
||||||
|
|
||||||
div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
|
div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
|
||||||
sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;
|
sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;
|
||||||
|
|
||||||
|
BIN
doc/tfm.pdf
BIN
doc/tfm.pdf
Binary file not shown.
@ -12,7 +12,6 @@
|
|||||||
/* y = g**x (mod b)
|
/* y = g**x (mod b)
|
||||||
* Some restrictions... x must be positive and < b
|
* Some restrictions... x must be positive and < b
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||||
{
|
{
|
||||||
fp_int M[64], res;
|
fp_int M[64], res;
|
||||||
@ -169,6 +168,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
|||||||
return FP_OKAY;
|
return FP_OKAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||||
{
|
{
|
||||||
fp_int tmp;
|
fp_int tmp;
|
||||||
@ -181,11 +181,12 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
|||||||
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
|
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
/* _fp_exptmod doesn't care about the sign of X */
|
X->sign = FP_ZPOS;
|
||||||
return _fp_exptmod(&tmp, X, P, Y);
|
err = _fp_exptmod(&tmp, X, P, Y);
|
||||||
|
X->sign = FP_NEG;
|
||||||
|
return err;
|
||||||
} else {
|
} else {
|
||||||
/* Positive exponent so just exptmod */
|
/* Positive exponent so just exptmod */
|
||||||
return _fp_exptmod(G, X, P, Y);
|
return _fp_exptmod(G, X, P, Y);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
4
fp_mul.c
4
fp_mul.c
@ -29,11 +29,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
|
|||||||
} else if (y <= 8) {
|
} else if (y <= 8) {
|
||||||
fp_mul_comba8(A,B,C);
|
fp_mul_comba8(A,B,C);
|
||||||
#if defined(TFM_LARGE)
|
#if defined(TFM_LARGE)
|
||||||
} else if (y <= 16 && y >= 12) {
|
} else if (y <= 16 && y >= 10) {
|
||||||
fp_mul_comba16(A,B,C);
|
fp_mul_comba16(A,B,C);
|
||||||
#endif
|
#endif
|
||||||
#if defined(TFM_HUGE)
|
#if defined(TFM_HUGE)
|
||||||
} else if (y <= 32 && y >= 28) {
|
} else if (y <= 32 && y >= 24) {
|
||||||
fp_mul_comba32(A,B,C);
|
fp_mul_comba32(A,B,C);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
|
6
fp_sqr.c
6
fp_sqr.c
@ -16,7 +16,7 @@ void fp_sqr(fp_int *A, fp_int *B)
|
|||||||
fp_int aa, bb, comp, amb, t1;
|
fp_int aa, bb, comp, amb, t1;
|
||||||
|
|
||||||
y = A->used;
|
y = A->used;
|
||||||
if (y <= 48) {
|
if (y <= 64) {
|
||||||
if (y <= 4) {
|
if (y <= 4) {
|
||||||
fp_sqr_comba4(A,B);
|
fp_sqr_comba4(A,B);
|
||||||
} else if (y <= 8) {
|
} else if (y <= 8) {
|
||||||
@ -26,8 +26,10 @@ void fp_sqr(fp_int *A, fp_int *B)
|
|||||||
fp_sqr_comba16(A,B);
|
fp_sqr_comba16(A,B);
|
||||||
#endif
|
#endif
|
||||||
#if defined(TFM_HUGE)
|
#if defined(TFM_HUGE)
|
||||||
} else if (y <= 32 && y >= 28) {
|
} else if (y <= 32 && y >= 20) {
|
||||||
fp_sqr_comba32(A,B);
|
fp_sqr_comba32(A,B);
|
||||||
|
} else if (y <= 64 && y >= 48) {
|
||||||
|
fp_sqr_comba64(A,B);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
fp_sqr_comba(A, B);
|
fp_sqr_comba(A, B);
|
||||||
|
1040
fp_sqr_comba.c
1040
fp_sqr_comba.c
File diff suppressed because it is too large
Load Diff
75
fp_sqr_comba_generic.c
Normal file
75
fp_sqr_comba_generic.c
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
/* generic comba squarer */
|
||||||
|
void fp_sqr_comba(fp_int *A, fp_int *B)
|
||||||
|
{
|
||||||
|
int pa, ix, iz;
|
||||||
|
fp_digit c0, c1, c2;
|
||||||
|
fp_int tmp, *dst;
|
||||||
|
|
||||||
|
/* get size of output and trim */
|
||||||
|
pa = A->used + A->used;
|
||||||
|
if (pa >= FP_SIZE) {
|
||||||
|
pa = FP_SIZE-1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* number of output digits to produce */
|
||||||
|
COMBA_START;
|
||||||
|
CLEAR_CARRY;
|
||||||
|
|
||||||
|
if (A == B) {
|
||||||
|
fp_zero(&tmp);
|
||||||
|
dst = &tmp;
|
||||||
|
} else {
|
||||||
|
fp_zero(B);
|
||||||
|
dst = B;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (ix = 0; ix < pa; ix++) {
|
||||||
|
int tx, ty, iy;
|
||||||
|
fp_digit *tmpy, *tmpx;
|
||||||
|
|
||||||
|
/* get offsets into the two bignums */
|
||||||
|
ty = MIN(A->used-1, ix);
|
||||||
|
tx = ix - ty;
|
||||||
|
|
||||||
|
/* setup temp aliases */
|
||||||
|
tmpx = A->dp + tx;
|
||||||
|
tmpy = A->dp + ty;
|
||||||
|
|
||||||
|
/* this is the number of times the loop will iterate; essentially it's
|
||||||
|
while (tx++ < a->used && ty-- >= 0) { ... }
|
||||||
|
*/
|
||||||
|
iy = MIN(A->used-tx, ty+1);
|
||||||
|
|
||||||
|
/* now for squaring tx can never equal ty
|
||||||
|
* we halve the distance since they approach at a rate of 2x
|
||||||
|
* and we have to round because odd cases need to be executed
|
||||||
|
*/
|
||||||
|
iy = MIN(iy, (ty-tx+1)>>1);
|
||||||
|
|
||||||
|
/* forward carries */
|
||||||
|
CARRY_FORWARD;
|
||||||
|
|
||||||
|
/* execute loop */
|
||||||
|
for (iz = 0; iz < iy; iz++) {
|
||||||
|
SQRADD2(*tmpx++, *tmpy--);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* even columns have the square term in them */
|
||||||
|
if ((ix&1) == 0) {
|
||||||
|
SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* store it */
|
||||||
|
COMBA_STORE(dst->dp[ix]);
|
||||||
|
}
|
||||||
|
COMBA_STORE2(dst->dp[ix]);
|
||||||
|
|
||||||
|
COMBA_FINI;
|
||||||
|
|
||||||
|
/* setup dest */
|
||||||
|
dst->used = pa;
|
||||||
|
fp_clamp (dst);
|
||||||
|
if (dst != B) {
|
||||||
|
fp_copy(dst, B);
|
||||||
|
}
|
||||||
|
}
|
2
makefile
2
makefile
@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
|
|||||||
#speed
|
#speed
|
||||||
CFLAGS += -fomit-frame-pointer
|
CFLAGS += -fomit-frame-pointer
|
||||||
|
|
||||||
VERSION=0.02
|
VERSION=0.03
|
||||||
|
|
||||||
default: libtfm.a
|
default: libtfm.a
|
||||||
|
|
||||||
|
2132
pre_gen/mpi.c
2132
pre_gen/mpi.c
File diff suppressed because it is too large
Load Diff
36
random_txt_files/newsqr.txt
Normal file
36
random_txt_files/newsqr.txt
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
New code added in TFM v0.03
|
||||||
|
|
||||||
|
OLD 64-bit...[athlon64]
|
||||||
|
|
||||||
|
Squaring:
|
||||||
|
256-bit: 89
|
||||||
|
512-bit: 234
|
||||||
|
1024-bit: 815
|
||||||
|
2048-bit: 2851
|
||||||
|
|
||||||
|
NEW 64-bit ...
|
||||||
|
|
||||||
|
Squaring:
|
||||||
|
256-bit: 89
|
||||||
|
512-bit: 228
|
||||||
|
1024-bit: 691
|
||||||
|
2048-bit: 2228
|
||||||
|
|
||||||
|
|
||||||
|
OLD 32-bit [athlonxp]
|
||||||
|
|
||||||
|
Squaring:
|
||||||
|
|
||||||
|
256-bit: 327
|
||||||
|
512-bit: 1044
|
||||||
|
1024-bit: 3646
|
||||||
|
2048-bit: 17055
|
||||||
|
|
||||||
|
NEW 32-bit
|
||||||
|
|
||||||
|
Squaring:
|
||||||
|
|
||||||
|
256-bit: 332
|
||||||
|
512-bit: 894
|
||||||
|
1024-bit: 2983
|
||||||
|
2048-bit: 10385
|
11
tfm.h
11
tfm.h
@ -107,11 +107,11 @@
|
|||||||
|
|
||||||
/* we want no asm? */
|
/* we want no asm? */
|
||||||
#ifdef TFM_NO_ASM
|
#ifdef TFM_NO_ASM
|
||||||
#undef TFM_X86
|
#undef TFM_X86
|
||||||
#undef TFM_X86_64
|
#undef TFM_X86_64
|
||||||
#undef TFM_SSE2
|
#undef TFM_SSE2
|
||||||
#undef TFM_ARM
|
#undef TFM_ARM
|
||||||
#undef TFM_ASM
|
#undef TFM_ASM
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* some default configurations.
|
/* some default configurations.
|
||||||
@ -350,6 +350,7 @@ void fp_sqr_comba16(fp_int *A, fp_int *B);
|
|||||||
#endif
|
#endif
|
||||||
#ifdef TFM_HUGE
|
#ifdef TFM_HUGE
|
||||||
void fp_sqr_comba32(fp_int *A, fp_int *B);
|
void fp_sqr_comba32(fp_int *A, fp_int *B);
|
||||||
|
void fp_sqr_comba64(fp_int *A, fp_int *B);
|
||||||
#endif
|
#endif
|
||||||
extern const char *fp_s_rmap;
|
extern const char *fp_s_rmap;
|
||||||
|
|
||||||
|
33
tfm.tex
33
tfm.tex
@ -49,7 +49,7 @@
|
|||||||
\begin{document}
|
\begin{document}
|
||||||
\frontmatter
|
\frontmatter
|
||||||
\pagestyle{empty}
|
\pagestyle{empty}
|
||||||
\title{TomsFastMath User Manual \\ v0.02}
|
\title{TomsFastMath User Manual \\ v0.03}
|
||||||
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
||||||
\maketitle
|
\maketitle
|
||||||
This text and library are all hereby placed in the public domain. This book has been formatted for B5
|
This text and library are all hereby placed in the public domain. This book has been formatted for B5
|
||||||
@ -525,6 +525,37 @@ This is essentially the MULADD macro from the multiplication code.
|
|||||||
This is like SQRADD except it adds the product twice. It's similar to
|
This is like SQRADD except it adds the product twice. It's similar to
|
||||||
computing SQRADD(i, j*2).
|
computing SQRADD(i, j*2).
|
||||||
|
|
||||||
|
To further make things interesting the squaring code also has ``doubles'' (see my LTM book chapter five...) which are
|
||||||
|
handled with these macros.
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
#define SQRADDSC(i, j) \
|
||||||
|
do { fp_word t; \
|
||||||
|
t = ((fp_word)i) * ((fp_word)j); \
|
||||||
|
sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
|
||||||
|
} while (0);
|
||||||
|
\end{verbatim}
|
||||||
|
This computes a product and stores it in the ``secondary'' carry registers $\left < sc0, sc1, sc2 \right >$.
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
#define SQRADDAC(i, j) \
|
||||||
|
do { fp_word t; \
|
||||||
|
t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t; \
|
||||||
|
t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t >> DIGIT_BIT; \
|
||||||
|
} while (0);
|
||||||
|
\end{verbatim}
|
||||||
|
This computes a product and adds it to the ``secondary'' carry registers.
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
#define SQRADDDB \
|
||||||
|
do { fp_word t; \
|
||||||
|
t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t; \
|
||||||
|
t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); c1 = t; \
|
||||||
|
c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT); \
|
||||||
|
} while (0);
|
||||||
|
\end{verbatim}
|
||||||
|
This doubles the ``secondary'' carry registers and adds the sum to the main carry registers. Really complicated.
|
||||||
|
|
||||||
\section{Montgomery with Comba}
|
\section{Montgomery with Comba}
|
||||||
Montgomery reduction is used in modular exponentiation and is the most-called function during
|
Montgomery reduction is used in modular exponentiation and is the most-called function during
|
||||||
that operation. It's important to make sure this routine is very fast or all is lost.
|
that operation. It's important to make sure this routine is very fast or all is lost.
|
||||||
|
Loading…
Reference in New Issue
Block a user