diff --git a/changes.txt b/changes.txt index 41322cd..809fc76 100644 --- a/changes.txt +++ b/changes.txt @@ -1,2 +1,21 @@ +September 18th, 2004 +0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space. + This also helps prevent killing the cache on smaller cpus. + -- Cast memset to void in fp_init() to catch people who misuse the function (e.g. expect return) + Thanks to Johan Lindh + -- Cleaned up x86-64 support [faster montgomery reductions] + -- Autodetects x86-32 and x86-64 and enables it's asm now + -- Made test demo build cleaner in multilib platforms [e.g. mixed 32/64 bits] + -- Fix to fp_mod to ensure that remainder is of the same sign as the modulus. + -- Fixed bug in fp_montgomery_calc_normalization for single digit moduli + -- cleaned up ISO C macros in comba/mont to avoid branches [works best with GCC 3.4.x branch] + -- Added more testing to tfm.h to help detect misconfigured builds + -- Added TFM_NO_ASM which forces ASM off [even if it was autodetected]. + -- Added fp_radix_size() to API + -- Cleaned up demo/test.c to build with far fewer warnings (mostly %d => %lu fixes) + -- fp_exptmod() now supports negative exponent and base>modulus cases + -- Added fp_ident() which gives a string showing how TFM was configured. Useful for debuging... + -- fix gen.pl script so it includes the whole source tree now + August 25th, 2004 -TFM 0.01 -- Initial Release +0.01 -- Initial Release diff --git a/comba_mult_gen.c b/comba_mult_gen.c index 0dcc8b5..b43471d 100644 --- a/comba_mult_gen.c +++ b/comba_mult_gen.c @@ -10,7 +10,6 @@ int main(int argc, char **argv) printf( "void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n" "{\n" -" fp_word t;\n" " fp_digit c0, c1, c2, at[%d];\n" "\n" " memcpy(at, A->dp, %d * sizeof(fp_digit));\n" diff --git a/comba_sqr_gen.c b/comba_sqr_gen.c index 8c34f13..eb5949c 100644 --- a/comba_sqr_gen.c +++ b/comba_sqr_gen.c @@ -9,7 +9,6 @@ int main(int argc, char **argv) printf( "void fp_sqr_comba%d(fp_int *A, fp_int *B)\n" "{\n" -" fp_word t;\n" " fp_digit *a, b[%d], c0, c1, c2;\n" "\n" " a = A->dp;\n" diff --git a/delme.c b/delme.c new file mode 100644 index 0000000..7fb0ad5 --- /dev/null +++ b/delme.c @@ -0,0 +1,24 @@ +#include "tfm.h" + +int main(void) +{ + fp_int a; + char buf[4096]; + + fp_init(&a); + fp_read_radix( &a, + "///////////93zgY8MZ2DCJ6Oek0t1pHAG9E28fdp7G22xwcEnER8b5A27cED0JT" + "xvKPiyqwGnimAmfjybyKDq/XDMrjKS95v8MrTc9UViRqJ4BffZVjQml/NBRq1hVj" + "xZXh+rg9dwMkdoGHV4iVvaaePb7iv5izmW1ykA5ZlmMOsaWs75NJccaMFwZz9CzV" + "WsLT8zoZhPOSOlDM88LIkvxLAGTmbfPjPmmrJagyc0JnT6m8oXWXV3AGNaOkDiux" + "uvvtB1WEXWER9uEYx0UYZxN5NV1lJ5B9tYlBzfLO5nWvbKbywfLgvHNI9XYO+WKG" + "5NAEMeggn2sjCnSD151wCwXL8QlV7BfaxFk515ZRxmgAwd5NNGOCVREN3uMcuUJ7" + "g/MkZDi9CzSUZ9JWIYLXdSxZqYOQqkvhyI/w1jcA26JOTW9pFiXgP58VAnWNUo0C" + "k+4NLtfXNMnt2OZ0kjb6uWZYJw1qvQinGzjR/E3z48vBWj4WgJhIol//////////", + 64 ); + + if( fp_isprime( &a ) ) printf("It's prime.\n"); + else printf( "Not prime.\n"); + + return 0; +} diff --git a/demo/test.c b/demo/test.c index 486bb03..947aeb6 100644 --- a/demo/test.c +++ b/demo/test.c @@ -60,6 +60,8 @@ int main(void) div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix; ulong64 t1, t2; + + printf("TFM Ident string:\n%s\n\n", fp_ident()); fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); fp_zero(&a); draw(&a); @@ -133,9 +135,30 @@ int main(void) printf("Testing read_radix\n"); fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a); +#if 1 /* test mont */ - printf("Montgomery test\n"); - fp_set(&a, 1); + printf("Montgomery test #1\n"); + fp_set(&a, 0x1234567ULL); + fp_montgomery_setup(&a, &fp); + fp_montgomery_calc_normalization(&b, &a); + + fp_read_radix(&d, "123456789123", 16); + for (n = 0; n < 100000; n++) { + fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d); + fp_mul(&d, &b, &c); + fp_montgomery_reduce(&c, &a, fp); + if (fp_cmp(&c, &d) != FP_EQ) { + printf("Failed mont %d\n", n); + draw(&a); + draw(&d); + draw(&c); + return EXIT_FAILURE; + } + } + printf("Passed.\n"); + + printf("Montgomery test #2\n"); + fp_set(&a, 0x1234567ULL); fp_lshd(&a, 4); fp_add_d(&a, 1, &a); fp_montgomery_setup(&a, &fp); @@ -158,19 +181,19 @@ int main(void) /* test for size */ for (ix = 8*DIGIT_BIT; ix < 10*DIGIT_BIT; ix++) { - printf("Testing (not safe-prime): %9d bits \r", ix); fflush(stdout); + printf("Testing (not safe-prime): %9lu bits \r", ix); fflush(stdout); err = fp_prime_random_ex(&a, 8, ix, (rand()&1)?TFM_PRIME_2MSB_OFF:TFM_PRIME_2MSB_ON, myrng, NULL); if (err != FP_OKAY) { printf("failed with err code %d\n", err); return EXIT_FAILURE; } - if (fp_count_bits(&a) != ix) { - printf("Prime is %d not %d bits!!!\n", fp_count_bits(&a), ix); + if ((unsigned long)fp_count_bits(&a) != ix) { + printf("Prime is %d not %lu bits!!!\n", fp_count_bits(&a), ix); return EXIT_FAILURE; } } printf("\n\n"); - +#endif #if 0 /* do some timings... */ @@ -280,7 +303,7 @@ int main(void) c.used = t; t2 = -1; - for (ix = 0; ix < 50; ++ix) { + for (ix = 0; ix < 1024; ++ix) { t1 = TIMFUNC(); fp_exptmod(&c, &b, &a, &d); fp_exptmod(&c, &b, &a, &d); @@ -293,7 +316,6 @@ int main(void) } #endif - div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0; @@ -304,27 +326,27 @@ int main(void) printf("%s ]\r",cmd); fflush(stdout); if (!strcmp(cmd, "mul2d")) { ++mul2d_n; fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); - fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr); + fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fp_mul_2d(&a, rr, &a); a.sign = b.sign; if (fp_cmp(&a, &b) != FP_EQ) { - printf("mul2d failed, rr == %d\n",rr); + printf("mul2d failed, rr == %lu\n",rr); draw(&a); draw(&b); return 0; } } else if (!strcmp(cmd, "div2d")) { ++div2d_n; fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); - fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr); + fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fp_div_2d(&a, rr, &a, &e); a.sign = b.sign; if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; } if (fp_cmp(&a, &b) != FP_EQ) { - printf("div2d failed, rr == %d\n",rr); + printf("div2d failed, rr == %lu\n",rr); draw(&a); draw(&b); return 0; @@ -492,7 +514,7 @@ draw(&a);draw(&b);draw(&c);draw(&d); } } else if (!strcmp(cmd, "add_d")) { ++add_d_n; fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); - fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix); + fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fp_add_d(&a, ix, &c); if (fp_cmp(&b, &c) != FP_EQ) { @@ -500,12 +522,12 @@ draw(&a);draw(&b);draw(&c);draw(&d); draw(&a); draw(&b); draw(&c); - printf("d == %d\n", ix); + printf("d == %lu\n", ix); return 0; } } else if (!strcmp(cmd, "sub_d")) { ++sub_d_n; fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); - fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix); + fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fp_sub_d(&a, ix, &c); if (fp_cmp(&b, &c) != FP_EQ) { @@ -513,12 +535,12 @@ draw(&a);draw(&b);draw(&c);draw(&d); draw(&a); draw(&b); draw(&c); - printf("d == %d\n", ix); + printf("d == %lu\n", ix); return 0; } } else if (!strcmp(cmd, "mul_d")) { ++mul_d_n; fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); - fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix); + fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fp_mul_d(&a, ix, &c); if (fp_cmp(&b, &c) != FP_EQ) { @@ -526,7 +548,7 @@ draw(&a);draw(&b);draw(&c);draw(&d); draw(&a); draw(&b); draw(&c); - printf("d == %d\n", ix); + printf("d == %lu\n", ix); return 0; } } diff --git a/doc/tfm.pdf b/doc/tfm.pdf index 60e869c..38c2f39 100644 Binary files a/doc/tfm.pdf and b/doc/tfm.pdf differ diff --git a/fp_exptmod.c b/fp_exptmod.c index eb5457f..5b08da5 100644 --- a/fp_exptmod.c +++ b/fp_exptmod.c @@ -13,7 +13,7 @@ * Some restrictions... x must be positive and < b */ -int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) +static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) { fp_int M[64], res; fp_digit buf, mp; @@ -34,7 +34,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) } /* init M array */ - memset(M, 0, sizeof(fp_int)*(1<sign == FP_NEG) { + /* yes, copy G and invmod it */ + fp_copy(G, &tmp); + if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) { + return err; + } + /* _fp_exptmod doesn't care about the sign of X */ + return _fp_exptmod(&tmp, X, P, Y); + } else { + /* Positive exponent so just exptmod */ + return _fp_exptmod(G, X, P, Y); + } +} + diff --git a/fp_ident.c b/fp_ident.c new file mode 100644 index 0000000..105e3d2 --- /dev/null +++ b/fp_ident.c @@ -0,0 +1,66 @@ +#include "tfm.h" + +const char *fp_ident(void) +{ + static char buf[1024]; + + memset(buf, 0, sizeof(buf)); + snprintf(buf, sizeof(buf)-1, +"TomsFastMath (%s)\n" +"\n" +"Sizeofs\n" +"\tfp_digit = %u\n" +"\tfp_word = %u\n" +"\n" +"FP_MAX_SIZE = %u\n" +"\n" +"Defines: \n" +#ifdef __i386__ +" __i386__ " +#endif +#ifdef __x86_64__ +" __x86_64__ " +#endif +#ifdef TFM_X86 +" TFM_X86 " +#endif +#ifdef TFM_X86_64 +" TFM_X86_64 " +#endif +#ifdef TFM_SSE2 +" TFM_SSE2 " +#endif +#ifdef TFM_ARM +" TFM_ARM " +#endif +#ifdef TFM_NO_ASM +" TFM_NO_ASM " +#endif +#ifdef FP_64BIT +" FP_64BIT " +#endif +#ifdef TFM_LARGE +" TFM_LARGE " +#endif +#ifdef TFM_HUGE +" TFM_HUGE " +#endif +"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE); + + if (sizeof(fp_digit) == sizeof(fp_word)) { + strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n", + sizeof(buf)-1); + } + return buf; +} + +#ifdef STANDALONE + +int main(void) +{ + printf("%s\n", fp_ident()); + return 0; +} + +#endif + diff --git a/fp_mod.c b/fp_mod.c index bec0593..8fb027b 100644 --- a/fp_mod.c +++ b/fp_mod.c @@ -12,7 +12,19 @@ /* c = a mod b, 0 <= c < b */ int fp_mod(fp_int *a, fp_int *b, fp_int *c) { - return fp_div(a, b, NULL, c); + fp_int t; + int err; + + fp_zero(&t); + if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) { + return err; + } + if (t.sign != b->sign) { + fp_add(&t, b, c); + } else { + fp_copy(&t, c); + } + return FP_OKAY; } diff --git a/fp_montgomery_calc_normalization.c b/fp_montgomery_calc_normalization.c index 3f3331d..f6791af 100644 --- a/fp_montgomery_calc_normalization.c +++ b/fp_montgomery_calc_normalization.c @@ -24,7 +24,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b) fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1); } else { fp_set(a, 1); - ++bits; + bits = 1; } /* now compute C = A * B mod b */ diff --git a/fp_montgomery_reduce.c b/fp_montgomery_reduce.c index 2f4fbe7..778a433 100644 --- a/fp_montgomery_reduce.c +++ b/fp_montgomery_reduce.c @@ -66,13 +66,13 @@ asm( #define PROPCARRY \ asm( \ "movq %1,%%rax \n\t" \ +"movq %2,%%rbx \n\t" \ "addq %%rax,%6 \n\t" \ -"movq %2,%%rax \n\t" \ -"adcq %%rax,%7 \n\t" \ +"adcq %%rbx,%7 \n\t" \ "adcq $0,%8 \n\t" \ :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \ "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \ -: "%rax", "%cc"); +: "%rax", "%rbx", "%cc"); #elif defined(TFM_SSE2) @@ -88,7 +88,7 @@ asm("emms"); asm(\ "movd %0,%%mm1 \n\t" \ "pmuludq %%mm2,%%mm1 \n\t" \ -:: "g"(c[x]), "g"(mp)); +:: "g"(c[x])); #define INNERMUL \ asm( \ @@ -112,7 +112,7 @@ asm( "adcl %%eax,%7 \n\t" \ "adcl $0,%8 \n\t" \ :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \ - "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \ + "g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \ : "%eax", "%cc"); #elif defined(TFM_ARM) @@ -166,14 +166,18 @@ asm( \ mu = c[x] * mp; #define INNERMUL \ - t = ((fp_word)mu) * ((fp_word)*tmpm++); \ - _c[OFF0] += t; if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \ - _c[OFF1] += (t>>DIGIT_BIT); if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2]; \ + do { fp_word t; \ + t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \ + t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \ + _c[OFF2] += (t >> DIGIT_BIT); \ + } while (0); #define PROPCARRY \ - _c[OFF0+1] += _c[OFF1]; if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \ - _c[OFF1+1] += _c[OFF2]; if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1]; - + do { fp_word t; \ + t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \ + t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \ + _c[OFF2+1] += (t >> DIGIT_BIT); \ + } while (0); #endif @@ -187,7 +191,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) { fp_digit c[3*FP_SIZE], *_c, *tmpm, mu; int oldused, x, y, pa; - fp_word t; /* now zero the buff */ pa = m->used; @@ -221,13 +224,13 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) /* fix the rest of the carries */ _c = c + pa; - for (; x < pa * 2 + 2; x++) { + for (x = pa; x < pa * 2 + 2; x++) { PROPCARRY; ++_c; } /* now copy out */ - _c = c + pa; + _c = c + pa; tmpm = a->dp; for (x = 0; x < pa+1; x++) { *tmpm++ = *_c++; diff --git a/fp_mul.c b/fp_mul.c index 06459b5..70174d5 100644 --- a/fp_mul.c +++ b/fp_mul.c @@ -28,9 +28,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C) fp_mul_comba4(A,B,C); } else if (y <= 8) { fp_mul_comba8(A,B,C); +#if defined(TFM_LARGE) } else if (y <= 16 && y >= 12) { fp_mul_comba16(A,B,C); -#ifdef TFM_HUGE +#endif +#if defined(TFM_HUGE) } else if (y <= 32 && y >= 28) { fp_mul_comba32(A,B,C); #endif diff --git a/fp_mul_comba.c b/fp_mul_comba.c index a6146cf..ddd0cb3 100644 --- a/fp_mul_comba.c +++ b/fp_mul_comba.c @@ -27,7 +27,7 @@ /* forward the carry to the next digit */ #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ @@ -42,7 +42,7 @@ /* this should multiply i and j */ #define MULADD(i, j) \ -asm volatile ( \ +asm ( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ @@ -62,7 +62,7 @@ asm volatile ( \ /* forward the carry to the next digit */ #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ @@ -77,13 +77,13 @@ asm volatile ( \ /* this should multiply i and j */ #define MULADD(i, j) \ -asm volatile ( \ +asm ( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ "adcq %%rdx,%1 \n\t" \ "adcq $0,%2 \n\t" \ - :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc"); + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #elif defined(TFM_SSE2) /* use SSE2 optimizations */ @@ -97,7 +97,7 @@ asm volatile ( \ /* forward the carry to the next digit */ #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ @@ -134,7 +134,7 @@ asm volatile ( \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; @@ -155,13 +155,13 @@ asm( \ #else /* ISO C code */ -#define COMBA_START +#define COMBA_START #define COMBA_CLEAR \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; @@ -169,12 +169,13 @@ asm( \ #define COMBA_STORE2(x) \ x = c1; -#define COMBA_FINI - -#define MULADD(i, j) \ - t = ((fp_word)i) * ((fp_word)j); \ - c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ - c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; +#define COMBA_FINI + +#define MULADD(i, j) \ + do { fp_word t; \ + t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \ + t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \ + } while (0); #endif @@ -184,7 +185,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) { int ix, iy, iz, tx, ty, pa; fp_digit c0, c1, c2, *tmpx, *tmpy; - fp_word t; fp_int tmp, *dst; COMBA_START; @@ -239,7 +239,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) { - fp_word t; fp_digit c0, c1, c2, at[8]; memcpy(at, A->dp, 4 * sizeof(fp_digit)); @@ -284,7 +283,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) { - fp_word t; fp_digit c0, c1, c2, at[16]; memcpy(at, A->dp, 8 * sizeof(fp_digit)); @@ -358,10 +356,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) COMBA_FINI; } +#if defined(TFM_LARGE) void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C) { - fp_word t; fp_digit c0, c1, c2, at[32]; memcpy(at, A->dp, 16 * sizeof(fp_digit)); @@ -499,11 +497,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C) COMBA_FINI; } +#endif /* TFM_LARGE */ + #ifdef TFM_HUGE void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) { - fp_word t; fp_digit c0, c1, c2, at[64]; memcpy(at, A->dp, 32 * sizeof(fp_digit)); diff --git a/fp_radix_size.c b/fp_radix_size.c index c163fec..d632e97 100644 --- a/fp_radix_size.c +++ b/fp_radix_size.c @@ -11,4 +11,39 @@ int fp_radix_size(fp_int *a, int radix, int *size) { + int digs; + fp_int t; + fp_digit d; + + *size = 0; + + /* check range of the radix */ + if (radix < 2 || radix > 64) { + return FP_VAL; + } + + /* quick out if its zero */ + if (fp_iszero(a) == 1) { + *size = 2; + return FP_OKAY; + } + + fp_init_copy(&t, a); + + /* if it is negative output a - */ + if (t.sign == FP_NEG) { + *size++; + t.sign = FP_ZPOS; + } + + digs = 0; + while (fp_iszero (&t) == FP_NO) { + fp_div_d (&t, (fp_digit) radix, &t, &d); + *size++; + } + + /* append a NULL so the string is properly terminated */ + *size++; + return FP_OKAY; + } diff --git a/fp_sqr.c b/fp_sqr.c index 663c444..b7c7f64 100644 --- a/fp_sqr.c +++ b/fp_sqr.c @@ -21,9 +21,11 @@ void fp_sqr(fp_int *A, fp_int *B) fp_sqr_comba4(A,B); } else if (y <= 8) { fp_sqr_comba8(A,B); +#if defined(TFM_LARGE) } else if (y <= 16 && y >= 12) { fp_sqr_comba16(A,B); -#ifdef TFM_HUGE +#endif +#if defined(TFM_HUGE) } else if (y <= 32 && y >= 28) { fp_sqr_comba32(A,B); #endif diff --git a/fp_sqr_comba.c b/fp_sqr_comba.c index 84de74f..4a73e2a 100644 --- a/fp_sqr_comba.c +++ b/fp_sqr_comba.c @@ -28,7 +28,7 @@ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_FINI @@ -68,21 +68,21 @@ asm volatile ( \ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_FINI #define SQRADD(i, j) \ -asm volatile ( \ +asm ( \ "movq %6,%%rax \n\t" \ "mulq %%rax \n\t" \ "addq %%rax,%0 \n\t" \ "adcq %%rdx,%1 \n\t" \ "adcq $0,%2 \n\t" \ - :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc"); + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc"); #define SQRADD2(i, j) \ -asm volatile ( \ +asm ( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ @@ -91,7 +91,7 @@ asm volatile ( \ "addq %%rax,%0 \n\t" \ "adcq %%rdx,%1 \n\t" \ "adcq $0,%2 \n\t" \ - :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc"); + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #elif defined(TFM_SSE2) @@ -109,7 +109,7 @@ asm volatile ( \ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_FINI \ asm("emms"); @@ -120,11 +120,11 @@ asm volatile ( \ "pmuludq %%mm0,%%mm0\n\t" \ "movd %%mm0,%%eax \n\t" \ "psrlq $32,%%mm0 \n\t" \ - "movd %%mm0,%%edx \n\t" \ "addl %%eax,%0 \n\t" \ - "adcl %%edx,%1 \n\t" \ + "movd %%mm0,%%eax \n\t" \ + "adcl %%eax,%1 \n\t" \ "adcl $0,%2 \n\t" \ - :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc"); #define SQRADD2(i, j) \ asm volatile ( \ @@ -158,7 +158,7 @@ asm volatile ( \ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_FINI @@ -187,7 +187,8 @@ asm( \ /* ISO C portable code */ -#define COMBA_START +#define COMBA_START \ + { fp_word tt; #define CLEAR_CARRY \ c0 = c1 = c2 = 0; @@ -199,23 +200,28 @@ asm( \ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); -#define COMBA_FINI +#define COMBA_FINI \ + } /* multiplies point i and j, updates carry "c1" and digit c2 */ -#define SQRADD(i, j) \ - t = ((fp_word)i) * ((fp_word)j); \ - c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ - c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; +#define SQRADD(i, j) \ + do { fp_word t; \ + t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \ + t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \ + } while (0); + /* for squaring some of the terms are doubled... */ -#define SQRADD2(i, j) \ - t = ((fp_word)i) * ((fp_word)j); \ - c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ - c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \ - c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ - c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; +#define SQRADD2(i, j) \ + do { fp_word t; \ + t = ((fp_word)i) * ((fp_word)j); \ + tt = (fp_word)c0 + t; c0 = tt; \ + tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \ + tt = (fp_word)c0 + t; c0 = tt; \ + tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \ + } while (0); #endif @@ -225,7 +231,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B) int pa, ix, iz; fp_digit c0, c1, c2; fp_int tmp, *dst; - fp_word t; /* get size of output and trim */ pa = A->used + A->used; @@ -298,7 +303,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B) void fp_sqr_comba4(fp_int *A, fp_int *B) { - fp_word t; fp_digit *a, b[8], c0, c1, c2; a = A->dp; @@ -352,7 +356,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B) void fp_sqr_comba8(fp_int *A, fp_int *B) { - fp_word t; fp_digit *a, b[16], c0, c1, c2; a = A->dp; @@ -443,10 +446,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B) fp_clamp(B); } +#if defined(TFM_LARGE) void fp_sqr_comba16(fp_int *A, fp_int *B) { - fp_word t; fp_digit *a, b[32], c0, c1, c2; a = A->dp; @@ -617,11 +620,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B) fp_clamp(B); } +#endif /* TFM_LARGE */ + #ifdef TFM_HUGE void fp_sqr_comba32(fp_int *A, fp_int *B) { - fp_word t; fp_digit *a, b[64], c0, c1, c2; a = A->dp; diff --git a/gen.pl b/gen.pl index 127ec6f..c48dbab 100644 --- a/gen.pl +++ b/gen.pl @@ -6,7 +6,7 @@ use strict; open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!"; -foreach my $filename (glob "fp_*.c") { +foreach my $filename (glob "*fp_*.c") { open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!"; print OUT "/* Start: $filename */\n"; print OUT while ; @@ -14,4 +14,4 @@ foreach my $filename (glob "fp_*.c") { close SRC or die "Error closing $filename after reading: $!"; } print OUT "\n/* EOF */\n"; -close OUT or die "Error closing mpi.c after writing: $!"; \ No newline at end of file +close OUT or die "Error closing mpi.c after writing: $!"; diff --git a/makefile b/makefile index 967fa57..353f69a 100644 --- a/makefile +++ b/makefile @@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops #speed CFLAGS += -fomit-frame-pointer -VERSION=0.01 +VERSION=0.02 default: libtfm.a @@ -38,8 +38,9 @@ fp_cmp.o fp_cmp_mag.o \ \ fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \ fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \ -fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \ +fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \ \ +fp_ident.o libtfm.a: $(OBJECTS) $(AR) $(ARFLAGS) libtfm.a $(OBJECTS) @@ -49,7 +50,7 @@ mtest/mtest: mtest/mtest.c cd mtest ; make mtest test: libtfm.a demo/test.o mtest/mtest - $(CC) demo/test.o libtfm.a $(PROF) -o test + $(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test stest: libtfm.a demo/stest.o $(CC) demo/stest.o libtfm.a -o stest @@ -67,7 +68,7 @@ docs: docdvi mv -f tfm.pdf doc clean: - rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest + rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest *~ cd mtest ; make clean zipup: docs clean diff --git a/mtest/makefile b/mtest/makefile index a0cdf72..8e42029 100644 --- a/mtest/makefile +++ b/mtest/makefile @@ -6,4 +6,4 @@ mtest: mtest.o $(CC) mtest.o -ltommath -o mtest clean: - rm -f *.o mtest + rm -f *.o mtest *~ diff --git a/mtest/mtest.c b/mtest/mtest.c index be85362..3747406 100644 --- a/mtest/mtest.c +++ b/mtest/mtest.c @@ -235,10 +235,9 @@ int main(void) rand_num2(&a); rand_num2(&b); rand_num2(&c); -// if (c.dp[0]&1) mp_add_d(&c, 1, &c); a.sign = b.sign = c.sign = 0; c.dp[0] |= 1; - if (c.used <= 2) continue; +// if (c.used <= 4) continue; // if (mp_cmp(&a, &c) != MP_LT) continue; // if (mp_cmp(&b, &c) != MP_LT) continue; mp_exptmod(&a, &b, &c, &d); diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c index 3ac1f64..14ea49c 100644 --- a/pre_gen/mpi.c +++ b/pre_gen/mpi.c @@ -709,7 +709,7 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d) * Some restrictions... x must be positive and < b */ -int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) +static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) { fp_int M[64], res; fp_digit buf, mp; @@ -730,7 +730,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) } /* init M array */ - memset(M, 0, sizeof(fp_int)*(1<sign == FP_NEG) { + /* yes, copy G and invmod it */ + fp_copy(G, &tmp); + if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) { + return err; + } + /* _fp_exptmod doesn't care about the sign of X */ + return _fp_exptmod(&tmp, X, P, Y); + } else { + /* Positive exponent so just exptmod */ + return _fp_exptmod(G, X, P, Y); + } +} + + /* End: fp_exptmod.c */ /* Start: fp_gcd.c */ @@ -922,6 +943,76 @@ void fp_gcd(fp_int *a, fp_int *b, fp_int *c) /* End: fp_gcd.c */ +/* Start: fp_ident.c */ +#include "tfm.h" + +const char *fp_ident(void) +{ + static char buf[1024]; + + memset(buf, 0, sizeof(buf)); + snprintf(buf, sizeof(buf)-1, +"TomsFastMath (%s)\n" +"\n" +"Sizeofs\n" +"\tfp_digit = %u\n" +"\tfp_word = %u\n" +"\n" +"FP_MAX_SIZE = %u\n" +"\n" +"Defines: \n" +#ifdef __i386__ +" __i386__ " +#endif +#ifdef __x86_64__ +" __x86_64__ " +#endif +#ifdef TFM_X86 +" TFM_X86 " +#endif +#ifdef TFM_X86_64 +" TFM_X86_64 " +#endif +#ifdef TFM_SSE2 +" TFM_SSE2 " +#endif +#ifdef TFM_ARM +" TFM_ARM " +#endif +#ifdef TFM_NO_ASM +" TFM_NO_ASM " +#endif +#ifdef FP_64BIT +" FP_64BIT " +#endif +#ifdef TFM_LARGE +" TFM_LARGE " +#endif +#ifdef TFM_HUGE +" TFM_HUGE " +#endif +"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE); + + if (sizeof(fp_digit) == sizeof(fp_word)) { + strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n", + sizeof(buf)-1); + } + return buf; +} + +#ifdef STANDALONE + +int main(void) +{ + printf("%s\n", fp_ident()); + return 0; +} + +#endif + + +/* End: fp_ident.c */ + /* Start: fp_invmod.c */ /* TomsFastMath, a fast ISO C bignum library. * @@ -1186,7 +1277,19 @@ void fp_lshd(fp_int *a, int x) /* c = a mod b, 0 <= c < b */ int fp_mod(fp_int *a, fp_int *b, fp_int *c) { - return fp_div(a, b, NULL, c); + fp_int t; + int err; + + fp_zero(&t); + if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) { + return err; + } + if (t.sign != b->sign) { + fp_add(&t, b, c); + } else { + fp_copy(&t, c); + } + return FP_OKAY; } @@ -1282,7 +1385,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b) fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1); } else { fp_set(a, 1); - ++bits; + bits = 1; } /* now compute C = A * B mod b */ @@ -1366,13 +1469,13 @@ asm( #define PROPCARRY \ asm( \ "movq %1,%%rax \n\t" \ +"movq %2,%%rbx \n\t" \ "addq %%rax,%6 \n\t" \ -"movq %2,%%rax \n\t" \ -"adcq %%rax,%7 \n\t" \ +"adcq %%rbx,%7 \n\t" \ "adcq $0,%8 \n\t" \ :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \ "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \ -: "%rax", "%cc"); +: "%rax", "%rbx", "%cc"); #elif defined(TFM_SSE2) @@ -1388,7 +1491,7 @@ asm("emms"); asm(\ "movd %0,%%mm1 \n\t" \ "pmuludq %%mm2,%%mm1 \n\t" \ -:: "g"(c[x]), "g"(mp)); +:: "g"(c[x])); #define INNERMUL \ asm( \ @@ -1412,7 +1515,7 @@ asm( "adcl %%eax,%7 \n\t" \ "adcl $0,%8 \n\t" \ :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \ - "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \ + "g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \ : "%eax", "%cc"); #elif defined(TFM_ARM) @@ -1466,14 +1569,18 @@ asm( \ mu = c[x] * mp; #define INNERMUL \ - t = ((fp_word)mu) * ((fp_word)*tmpm++); \ - _c[OFF0] += t; if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \ - _c[OFF1] += (t>>DIGIT_BIT); if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2]; \ + do { fp_word t; \ + t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \ + t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \ + _c[OFF2] += (t >> DIGIT_BIT); \ + } while (0); #define PROPCARRY \ - _c[OFF0+1] += _c[OFF1]; if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \ - _c[OFF1+1] += _c[OFF2]; if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1]; - + do { fp_word t; \ + t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \ + t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \ + _c[OFF2+1] += (t >> DIGIT_BIT); \ + } while (0); #endif @@ -1487,7 +1594,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) { fp_digit c[3*FP_SIZE], *_c, *tmpm, mu; int oldused, x, y, pa; - fp_word t; /* now zero the buff */ pa = m->used; @@ -1521,13 +1627,13 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) /* fix the rest of the carries */ _c = c + pa; - for (; x < pa * 2 + 2; x++) { + for (x = pa; x < pa * 2 + 2; x++) { PROPCARRY; ++_c; } /* now copy out */ - _c = c + pa; + _c = c + pa; tmpm = a->dp; for (x = 0; x < pa+1; x++) { *tmpm++ = *_c++; @@ -1629,9 +1735,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C) fp_mul_comba4(A,B,C); } else if (y <= 8) { fp_mul_comba8(A,B,C); +#if defined(TFM_LARGE) } else if (y <= 16 && y >= 12) { fp_mul_comba16(A,B,C); -#ifdef TFM_HUGE +#endif +#if defined(TFM_HUGE) } else if (y <= 32 && y >= 28) { fp_mul_comba32(A,B,C); #endif @@ -1880,7 +1988,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c) /* forward the carry to the next digit */ #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ @@ -1895,7 +2003,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c) /* this should multiply i and j */ #define MULADD(i, j) \ -asm volatile ( \ +asm ( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ @@ -1915,7 +2023,7 @@ asm volatile ( \ /* forward the carry to the next digit */ #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ @@ -1930,13 +2038,13 @@ asm volatile ( \ /* this should multiply i and j */ #define MULADD(i, j) \ -asm volatile ( \ +asm ( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ "adcq %%rdx,%1 \n\t" \ "adcq $0,%2 \n\t" \ - :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc"); + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #elif defined(TFM_SSE2) /* use SSE2 optimizations */ @@ -1950,7 +2058,7 @@ asm volatile ( \ /* forward the carry to the next digit */ #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ @@ -1987,7 +2095,7 @@ asm volatile ( \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; @@ -2008,13 +2116,13 @@ asm( \ #else /* ISO C code */ -#define COMBA_START +#define COMBA_START #define COMBA_CLEAR \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; @@ -2022,12 +2130,13 @@ asm( \ #define COMBA_STORE2(x) \ x = c1; -#define COMBA_FINI - -#define MULADD(i, j) \ - t = ((fp_word)i) * ((fp_word)j); \ - c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ - c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; +#define COMBA_FINI + +#define MULADD(i, j) \ + do { fp_word t; \ + t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \ + t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \ + } while (0); #endif @@ -2037,7 +2146,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) { int ix, iy, iz, tx, ty, pa; fp_digit c0, c1, c2, *tmpx, *tmpy; - fp_word t; fp_int tmp, *dst; COMBA_START; @@ -2092,7 +2200,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) { - fp_word t; fp_digit c0, c1, c2, at[8]; memcpy(at, A->dp, 4 * sizeof(fp_digit)); @@ -2137,7 +2244,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) { - fp_word t; fp_digit c0, c1, c2, at[16]; memcpy(at, A->dp, 8 * sizeof(fp_digit)); @@ -2211,10 +2317,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) COMBA_FINI; } +#if defined(TFM_LARGE) void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C) { - fp_word t; fp_digit c0, c1, c2, at[32]; memcpy(at, A->dp, 16 * sizeof(fp_digit)); @@ -2352,11 +2458,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C) COMBA_FINI; } +#endif /* TFM_LARGE */ + #ifdef TFM_HUGE void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) { - fp_word t; fp_digit c0, c1, c2, at[64]; memcpy(at, A->dp, 32 * sizeof(fp_digit)); @@ -2880,6 +2987,41 @@ error: int fp_radix_size(fp_int *a, int radix, int *size) { + int digs; + fp_int t; + fp_digit d; + + *size = 0; + + /* check range of the radix */ + if (radix < 2 || radix > 64) { + return FP_VAL; + } + + /* quick out if its zero */ + if (fp_iszero(a) == 1) { + *size = 2; + return FP_OKAY; + } + + fp_init_copy(&t, a); + + /* if it is negative output a - */ + if (t.sign == FP_NEG) { + *size++; + t.sign = FP_ZPOS; + } + + digs = 0; + while (fp_iszero (&t) == FP_NO) { + fp_div_d (&t, (fp_digit) radix, &t, &d); + *size++; + } + + /* append a NULL so the string is properly terminated */ + *size++; + return FP_OKAY; + } /* End: fp_radix_size.c */ @@ -3161,9 +3303,11 @@ void fp_sqr(fp_int *A, fp_int *B) fp_sqr_comba4(A,B); } else if (y <= 8) { fp_sqr_comba8(A,B); +#if defined(TFM_LARGE) } else if (y <= 16 && y >= 12) { fp_sqr_comba16(A,B); -#ifdef TFM_HUGE +#endif +#if defined(TFM_HUGE) } else if (y <= 32 && y >= 28) { fp_sqr_comba32(A,B); #endif @@ -3279,7 +3423,7 @@ Obvious points of optimization x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_FINI @@ -3319,21 +3463,21 @@ asm volatile ( \ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_FINI #define SQRADD(i, j) \ -asm volatile ( \ +asm ( \ "movq %6,%%rax \n\t" \ "mulq %%rax \n\t" \ "addq %%rax,%0 \n\t" \ "adcq %%rdx,%1 \n\t" \ "adcq $0,%2 \n\t" \ - :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc"); + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc"); #define SQRADD2(i, j) \ -asm volatile ( \ +asm ( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ @@ -3342,7 +3486,7 @@ asm volatile ( \ "addq %%rax,%0 \n\t" \ "adcq %%rdx,%1 \n\t" \ "adcq $0,%2 \n\t" \ - :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc"); + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc"); #elif defined(TFM_SSE2) @@ -3360,7 +3504,7 @@ asm volatile ( \ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_FINI \ asm("emms"); @@ -3371,11 +3515,11 @@ asm volatile ( \ "pmuludq %%mm0,%%mm0\n\t" \ "movd %%mm0,%%eax \n\t" \ "psrlq $32,%%mm0 \n\t" \ - "movd %%mm0,%%edx \n\t" \ "addl %%eax,%0 \n\t" \ - "adcl %%edx,%1 \n\t" \ + "movd %%mm0,%%eax \n\t" \ + "adcl %%eax,%1 \n\t" \ "adcl $0,%2 \n\t" \ - :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); + :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc"); #define SQRADD2(i, j) \ asm volatile ( \ @@ -3409,7 +3553,7 @@ asm volatile ( \ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_FINI @@ -3438,7 +3582,8 @@ asm( \ /* ISO C portable code */ -#define COMBA_START +#define COMBA_START \ + { fp_word tt; #define CLEAR_CARRY \ c0 = c1 = c2 = 0; @@ -3450,23 +3595,28 @@ asm( \ x = c1; #define CARRY_FORWARD \ - c0 = c1; c1 = c2; c2 = 0; + do { c0 = c1; c1 = c2; c2 = 0; } while (0); -#define COMBA_FINI +#define COMBA_FINI \ + } /* multiplies point i and j, updates carry "c1" and digit c2 */ -#define SQRADD(i, j) \ - t = ((fp_word)i) * ((fp_word)j); \ - c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ - c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; +#define SQRADD(i, j) \ + do { fp_word t; \ + t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \ + t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \ + } while (0); + /* for squaring some of the terms are doubled... */ -#define SQRADD2(i, j) \ - t = ((fp_word)i) * ((fp_word)j); \ - c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ - c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \ - c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ - c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; +#define SQRADD2(i, j) \ + do { fp_word t; \ + t = ((fp_word)i) * ((fp_word)j); \ + tt = (fp_word)c0 + t; c0 = tt; \ + tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \ + tt = (fp_word)c0 + t; c0 = tt; \ + tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \ + } while (0); #endif @@ -3476,7 +3626,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B) int pa, ix, iz; fp_digit c0, c1, c2; fp_int tmp, *dst; - fp_word t; /* get size of output and trim */ pa = A->used + A->used; @@ -3549,7 +3698,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B) void fp_sqr_comba4(fp_int *A, fp_int *B) { - fp_word t; fp_digit *a, b[8], c0, c1, c2; a = A->dp; @@ -3603,7 +3751,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B) void fp_sqr_comba8(fp_int *A, fp_int *B) { - fp_word t; fp_digit *a, b[16], c0, c1, c2; a = A->dp; @@ -3694,10 +3841,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B) fp_clamp(B); } +#if defined(TFM_LARGE) void fp_sqr_comba16(fp_int *A, fp_int *B) { - fp_word t; fp_digit *a, b[32], c0, c1, c2; a = A->dp; @@ -3868,11 +4015,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B) fp_clamp(B); } +#endif /* TFM_LARGE */ + #ifdef TFM_HUGE void fp_sqr_comba32(fp_int *A, fp_int *B) { - fp_word t; fp_digit *a, b[64], c0, c1, c2; a = A->dp; @@ -4455,5 +4603,81 @@ int fp_unsigned_bin_size(fp_int *a) /* End: fp_unsigned_bin_size.c */ +/* Start: s_fp_add.c */ +/* TomsFastMath, a fast ISO C bignum library. + * + * This project is meant to fill in where LibTomMath + * falls short. That is speed ;-) + * + * This project is public domain and free for all purposes. + * + * Tom St Denis, tomstdenis@iahu.ca + */ +#include + +/* unsigned addition */ +void s_fp_add(fp_int *a, fp_int *b, fp_int *c) +{ + int x, y, oldused; + fp_word t; + + y = MAX(a->used, b->used); + oldused = c->used; + c->used = y; + + t = 0; + for (x = 0; x < y; x++) { + t += ((fp_word)a->dp[x]) + ((fp_word)b->dp[x]); + c->dp[x] = (fp_digit)t; + t >>= DIGIT_BIT; + } + if (t != 0 && x != FP_SIZE) { + c->dp[c->used++] = (fp_digit)t; + ++x; + } + + for (; x < oldused; x++) { + c->dp[x] = 0; + } + fp_clamp(c); +} + +/* End: s_fp_add.c */ + +/* Start: s_fp_sub.c */ +/* TomsFastMath, a fast ISO C bignum library. + * + * This project is meant to fill in where LibTomMath + * falls short. That is speed ;-) + * + * This project is public domain and free for all purposes. + * + * Tom St Denis, tomstdenis@iahu.ca + */ +#include + +/* unsigned subtraction ||a|| >= ||b|| ALWAYS! */ +void s_fp_sub(fp_int *a, fp_int *b, fp_int *c) +{ + int x, oldused; + fp_word t; + + oldused = c->used; + c->used = a->used; + t = 0; + for (x = 0; x < a->used; x++) { + t = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t); + c->dp[x] = (fp_digit)t; + t = (t >> DIGIT_BIT) & 1; + } + + for (; x < oldused; x++) { + c->dp[x] = 0; + } + fp_clamp(c); +} + +/* End: s_fp_sub.c */ + /* EOF */ diff --git a/tfm.h b/tfm.h index 9b12946..c927bae 100644 --- a/tfm.h +++ b/tfm.h @@ -21,9 +21,19 @@ #undef MAX #define MAX(x,y) ((x)>(y)?(x):(y)) -/* do we want huge code? The answer is, yes. */ +/* do we want large code? */ +#define TFM_LARGE + +/* do we want huge code (implies large)? The answer is, yes. */ #define TFM_HUGE +/* imply TFM_LARGE as required */ +#if defined(TFM_HUGE) + #if !defined(TFM_LARGE) + #define TFM_LARGE + #endif +#endif + /* Max size of any number in bits. Basically the largest size you will be multiplying * should be half [or smaller] of FP_MAX_SIZE-four_digit * @@ -41,19 +51,69 @@ #error FP_MAX_SIZE must be a multiple of CHAR_BIT #endif -/* make sure we are using 64-bit digits with x86-64 asm */ +/* autodetect x86-64 and make sure we are using 64-bit digits with x86-64 asm */ +#if defined(__x86_64__) + #if defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM) + #error x86-64 detected, x86-32/SSE2/ARM optimizations are not valid! + #endif + #if !defined(TFM_X86_64) && !defined(TFM_NO_ASM) + #define TFM_X86_64 + #endif +#endif #if defined(TFM_X86_64) - #ifndef FP_64BIT + #if !defined(FP_64BIT) #define FP_64BIT #endif #endif +/* try to detect x86-32 */ +#if defined(__i386__) && !defined(TFM_SSE2) + #if defined(TFM_X86_64) || defined(TFM_ARM) + #error x86-32 detected, x86-64/ARM optimizations are not valid! + #endif + #if !defined(TFM_X86) && !defined(TFM_NO_ASM) + #define TFM_X86 + #endif +#endif + /* make sure we're 32-bit for x86-32/sse/arm */ #if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT) #warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining) #undef FP_64BIT #endif +/* multi asms? */ +#ifdef TFM_X86 + #define TFM_ASM +#endif +#ifdef TFM_X86_64 + #ifdef TFM_ASM + #error TFM_ASM already defined! + #endif + #define TFM_ASM +#endif +#ifdef TFM_SSE2 + #ifdef TFM_ASM + #error TFM_ASM already defined! + #endif + #define TFM_ASM +#endif +#ifdef TFM_ARM + #ifdef TFM_ASM + #error TFM_ASM already defined! + #endif + #define TFM_ASM +#endif + +/* we want no asm? */ +#ifdef TFM_NO_ASM + #undef TFM_X86 + #undef TFM_X86_64 + #undef TFM_SSE2 + #undef TFM_ARM + #undef TFM_ASM +#endif + /* some default configurations. */ #if defined(FP_64BIT) @@ -110,8 +170,11 @@ typedef struct { /* functions */ +/* returns a TFM ident string useful for debugging... */ +const char *fp_ident(void); + /* initialize [or zero] an fp int */ -#define fp_init(a) memset((a), 0, sizeof(fp_int)) +#define fp_init(a) (void)memset((a), 0, sizeof(fp_int)) #define fp_zero(a) fp_init(a) /* zero/even/odd ? */ @@ -123,7 +186,7 @@ typedef struct { void fp_set(fp_int *a, fp_digit b); /* copy from a to b */ -#define fp_copy(a, b) (((a) != (b)) && memcpy((b), (a), sizeof(fp_int))) +#define fp_copy(a, b) (void)(((a) != (b)) && memcpy((b), (a), sizeof(fp_int))) #define fp_init_copy(a, b) fp_copy(b, a) /* negate and absolute */ @@ -139,10 +202,10 @@ void fp_rshd(fp_int *a, int x); /* left shift x digits */ void fp_lshd(fp_int *a, int x); -/* signed comparisonm */ +/* signed comparison */ int fp_cmp(fp_int *a, fp_int *b); -/* unsigned comparisonm */ +/* unsigned comparison */ int fp_cmp_mag(fp_int *a, fp_int *b); /* power of 2 operations */ @@ -273,14 +336,18 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C); #ifdef TFM_HUGE void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C); #endif +#ifdef TFM_LARGE void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C); +#endif void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C); void fp_sqr_comba(fp_int *A, fp_int *B); void fp_sqr_comba4(fp_int *A, fp_int *B); void fp_sqr_comba8(fp_int *A, fp_int *B); +#ifdef TFM_LARGE void fp_sqr_comba16(fp_int *A, fp_int *B); +#endif #ifdef TFM_HUGE void fp_sqr_comba32(fp_int *A, fp_int *B); #endif diff --git a/tfm.tex b/tfm.tex index 914208c..db09a6e 100644 --- a/tfm.tex +++ b/tfm.tex @@ -49,7 +49,7 @@ \begin{document} \frontmatter \pagestyle{empty} -\title{TomsFastMath User Manual \\ v0.01} +\title{TomsFastMath User Manual \\ v0.02} \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle This text and library are all hereby placed in the public domain. This book has been formatted for B5 @@ -128,23 +128,27 @@ several ``CFLAGS'' defines. For example, to build with with SSE2 optimizations type \begin{verbatim} -export CFLAGS=-DTFM_SSE2 -make clean libtfm.a +CFLAGS=-DTFM_SSE2 make clean libtfm.a \end{verbatim} \subsubsection{x86--32} The ``x86--32'' mode is defined by ``TFM\_X86'' and covers all i386 and beyond processors. It requires GCC to build and only works with 32--bit digits. In this -mode fp\_digit is 32--bits and fp\_word is 64--bits. +mode fp\_digit is 32--bits and fp\_word is 64--bits. This mode will be autodetected when building +with GCC to an ``i386'' target. You can override this behaviour by defining TFM\_NO\_ASM or +another optimization mode (such as SSE2). \subsubsection{SSE2} The ``SSE2'' mode is defined by ``TFM\_SSE2'' and requires a Pentium 4, Pentium M or Athlon64 processor. It requires GCC to build. Note that you shouldn't define both TFM\_X86 and TFM\_SSE2 at the same time. This mode only works with 32--bit digits. In this -mode fp\_digit is 32--bits and fp\_word is 64--bits. +mode fp\_digit is 32--bits and fp\_word is 64--bits. While this mode will work on the AMD Athlon64 +series of processors it is less efficient than the native ``x86--64'' mode and not recommended. \subsubsection{x86--64} The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires a ``x86--64'' capable processor (Athlon64 and future Pentium processors). It requires GCC to build and only works with 64--bit digits. Note that by enabling this mode it will automatically -enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits. +enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits. This mode will +be autodetected when building with GCC to an ``x86--64'' target. You can override this behaviour by defining +TFM\_NO\_ASM. \subsubsection{ARM} The ``ARM'' mode is defined by ``TFM\_ARM'' and requires a ARMv4 or higher processor. It requires GCC and works with 32--bit digits. In this mode fp\_digit is 32--bits and @@ -163,6 +167,8 @@ Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline pat \hline Pentium 4 & TFM\_SSE2 \\ \hline Athlon64 & TFM\_X86\_64 \\ \hline ARMv4 or higher & TFM\_ARM \\ +\hline &\\ +\hline x86--32 or x86--64 (with GCC) & Leave blank and let autodetect work \\ \hline \end{tabular} \caption{Recommended Build Modes} @@ -339,9 +345,10 @@ To compute a modular exponentiation use the following function. \begin{verbatim} int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d); \end{verbatim} -This computes $d \equiv a^b \mbox{ (mod }c)$ for any odd $c$ and positive $b$. The size of $c$ -must be half of the maximum precision used during the build of the library. For example, -by default $c$ must be less than $2^{2048}$. +This computes $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for any odd $c$ and $b$. $b$ may be negative so long as +$a^{-1} \mbox{ (mod }c\mbox{)}$ exists. The initial value of $a$ may be larger than $c$. The size of $c$ must be +half of the maximum precision used during the build of the library. For example, by default $c$ must be less +than $2^{2048}$. \section{Number Theoretic}