added tomsfastmath-0.02

This commit is contained in:
Tom St Denis 2004-09-19 01:31:44 +00:00 committed by Steffen Jaeckel
parent 5e92ed2a59
commit 6bb413fd72
23 changed files with 689 additions and 184 deletions

View File

@ -1,2 +1,21 @@
September 18th, 2004
0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
This also helps prevent killing the cache on smaller cpus.
-- Cast memset to void in fp_init() to catch people who misuse the function (e.g. expect return)
Thanks to Johan Lindh
-- Cleaned up x86-64 support [faster montgomery reductions]
-- Autodetects x86-32 and x86-64 and enables its asm now
-- Made test demo build cleaner in multilib platforms [e.g. mixed 32/64 bits]
-- Fix to fp_mod to ensure that remainder is of the same sign as the modulus.
-- Fixed bug in fp_montgomery_calc_normalization for single digit moduli
-- cleaned up ISO C macros in comba/mont to avoid branches [works best with GCC 3.4.x branch]
-- Added more testing to tfm.h to help detect misconfigured builds
-- Added TFM_NO_ASM which forces ASM off [even if it was autodetected].
-- Added fp_radix_size() to API
-- Cleaned up demo/test.c to build with far fewer warnings (mostly %d => %lu fixes)
-- fp_exptmod() now supports negative exponent and base>modulus cases
-- Added fp_ident() which gives a string showing how TFM was configured. Useful for debugging...
-- fix gen.pl script so it includes the whole source tree now
August 25th, 2004 August 25th, 2004
TFM 0.01 -- Initial Release 0.01 -- Initial Release

View File

@ -10,7 +10,6 @@ int main(int argc, char **argv)
printf( printf(
"void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n" "void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n"
"{\n" "{\n"
" fp_word t;\n"
" fp_digit c0, c1, c2, at[%d];\n" " fp_digit c0, c1, c2, at[%d];\n"
"\n" "\n"
" memcpy(at, A->dp, %d * sizeof(fp_digit));\n" " memcpy(at, A->dp, %d * sizeof(fp_digit));\n"

View File

@ -9,7 +9,6 @@ int main(int argc, char **argv)
printf( printf(
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n" "void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
"{\n" "{\n"
" fp_word t;\n"
" fp_digit *a, b[%d], c0, c1, c2;\n" " fp_digit *a, b[%d], c0, c1, c2;\n"
"\n" "\n"
" a = A->dp;\n" " a = A->dp;\n"

24
delme.c Normal file
View File

@ -0,0 +1,24 @@
#include "tfm.h"
/* Scratch driver: parses a fixed radix-64 constant into an fp_int and
 * prints whether fp_isprime() reports it as prime.
 *
 * Always returns 0; the verdict is only written to stdout.
 */
int main(void)
{
   fp_int a;

   fp_init(&a);

   /* the number under test, written in radix 64 */
   fp_read_radix( &a,
      "///////////93zgY8MZ2DCJ6Oek0t1pHAG9E28fdp7G22xwcEnER8b5A27cED0JT"
      "xvKPiyqwGnimAmfjybyKDq/XDMrjKS95v8MrTc9UViRqJ4BffZVjQml/NBRq1hVj"
      "xZXh+rg9dwMkdoGHV4iVvaaePb7iv5izmW1ykA5ZlmMOsaWs75NJccaMFwZz9CzV"
      "WsLT8zoZhPOSOlDM88LIkvxLAGTmbfPjPmmrJagyc0JnT6m8oXWXV3AGNaOkDiux"
      "uvvtB1WEXWER9uEYx0UYZxN5NV1lJ5B9tYlBzfLO5nWvbKbywfLgvHNI9XYO+WKG"
      "5NAEMeggn2sjCnSD151wCwXL8QlV7BfaxFk515ZRxmgAwd5NNGOCVREN3uMcuUJ7"
      "g/MkZDi9CzSUZ9JWIYLXdSxZqYOQqkvhyI/w1jcA26JOTW9pFiXgP58VAnWNUo0C"
      "k+4NLtfXNMnt2OZ0kjb6uWZYJw1qvQinGzjR/E3z48vBWj4WgJhIol//////////",
      64 );

   if( fp_isprime( &a ) ) {
      printf("It's prime.\n");
   } else {
      printf( "Not prime.\n");
   }
   return 0;
}

View File

@ -60,6 +60,8 @@ int main(void)
div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix; div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
ulong64 t1, t2; ulong64 t1, t2;
printf("TFM Ident string:\n%s\n\n", fp_ident());
fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f); fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
fp_zero(&a); draw(&a); fp_zero(&a); draw(&a);
@ -133,9 +135,30 @@ int main(void)
printf("Testing read_radix\n"); printf("Testing read_radix\n");
fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a); fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
#if 1
/* test mont */ /* test mont */
printf("Montgomery test\n"); printf("Montgomery test #1\n");
fp_set(&a, 1); fp_set(&a, 0x1234567ULL);
fp_montgomery_setup(&a, &fp);
fp_montgomery_calc_normalization(&b, &a);
fp_read_radix(&d, "123456789123", 16);
for (n = 0; n < 100000; n++) {
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
fp_mul(&d, &b, &c);
fp_montgomery_reduce(&c, &a, fp);
if (fp_cmp(&c, &d) != FP_EQ) {
printf("Failed mont %d\n", n);
draw(&a);
draw(&d);
draw(&c);
return EXIT_FAILURE;
}
}
printf("Passed.\n");
printf("Montgomery test #2\n");
fp_set(&a, 0x1234567ULL);
fp_lshd(&a, 4); fp_lshd(&a, 4);
fp_add_d(&a, 1, &a); fp_add_d(&a, 1, &a);
fp_montgomery_setup(&a, &fp); fp_montgomery_setup(&a, &fp);
@ -158,19 +181,19 @@ int main(void)
/* test for size */ /* test for size */
for (ix = 8*DIGIT_BIT; ix < 10*DIGIT_BIT; ix++) { for (ix = 8*DIGIT_BIT; ix < 10*DIGIT_BIT; ix++) {
printf("Testing (not safe-prime): %9d bits \r", ix); fflush(stdout); printf("Testing (not safe-prime): %9lu bits \r", ix); fflush(stdout);
err = fp_prime_random_ex(&a, 8, ix, (rand()&1)?TFM_PRIME_2MSB_OFF:TFM_PRIME_2MSB_ON, myrng, NULL); err = fp_prime_random_ex(&a, 8, ix, (rand()&1)?TFM_PRIME_2MSB_OFF:TFM_PRIME_2MSB_ON, myrng, NULL);
if (err != FP_OKAY) { if (err != FP_OKAY) {
printf("failed with err code %d\n", err); printf("failed with err code %d\n", err);
return EXIT_FAILURE; return EXIT_FAILURE;
} }
if (fp_count_bits(&a) != ix) { if ((unsigned long)fp_count_bits(&a) != ix) {
printf("Prime is %d not %d bits!!!\n", fp_count_bits(&a), ix); printf("Prime is %d not %lu bits!!!\n", fp_count_bits(&a), ix);
return EXIT_FAILURE; return EXIT_FAILURE;
} }
} }
printf("\n\n"); printf("\n\n");
#endif
#if 0 #if 0
/* do some timings... */ /* do some timings... */
@ -280,7 +303,7 @@ int main(void)
c.used = t; c.used = t;
t2 = -1; t2 = -1;
for (ix = 0; ix < 50; ++ix) { for (ix = 0; ix < 1024; ++ix) {
t1 = TIMFUNC(); t1 = TIMFUNC();
fp_exptmod(&c, &b, &a, &d); fp_exptmod(&c, &b, &a, &d);
fp_exptmod(&c, &b, &a, &d); fp_exptmod(&c, &b, &a, &d);
@ -293,7 +316,6 @@ int main(void)
} }
#endif #endif
div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n = div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0; sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;
@ -304,27 +326,27 @@ int main(void)
printf("%s ]\r",cmd); fflush(stdout); printf("%s ]\r",cmd); fflush(stdout);
if (!strcmp(cmd, "mul2d")) { ++mul2d_n; if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr); fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_mul_2d(&a, rr, &a); fp_mul_2d(&a, rr, &a);
a.sign = b.sign; a.sign = b.sign;
if (fp_cmp(&a, &b) != FP_EQ) { if (fp_cmp(&a, &b) != FP_EQ) {
printf("mul2d failed, rr == %d\n",rr); printf("mul2d failed, rr == %lu\n",rr);
draw(&a); draw(&a);
draw(&b); draw(&b);
return 0; return 0;
} }
} else if (!strcmp(cmd, "div2d")) { ++div2d_n; } else if (!strcmp(cmd, "div2d")) { ++div2d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr); fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_div_2d(&a, rr, &a, &e); fp_div_2d(&a, rr, &a, &e);
a.sign = b.sign; a.sign = b.sign;
if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; } if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; }
if (fp_cmp(&a, &b) != FP_EQ) { if (fp_cmp(&a, &b) != FP_EQ) {
printf("div2d failed, rr == %d\n",rr); printf("div2d failed, rr == %lu\n",rr);
draw(&a); draw(&a);
draw(&b); draw(&b);
return 0; return 0;
@ -492,7 +514,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
} }
} else if (!strcmp(cmd, "add_d")) { ++add_d_n; } else if (!strcmp(cmd, "add_d")) { ++add_d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix); fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_add_d(&a, ix, &c); fp_add_d(&a, ix, &c);
if (fp_cmp(&b, &c) != FP_EQ) { if (fp_cmp(&b, &c) != FP_EQ) {
@ -500,12 +522,12 @@ draw(&a);draw(&b);draw(&c);draw(&d);
draw(&a); draw(&a);
draw(&b); draw(&b);
draw(&c); draw(&c);
printf("d == %d\n", ix); printf("d == %lu\n", ix);
return 0; return 0;
} }
} else if (!strcmp(cmd, "sub_d")) { ++sub_d_n; } else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix); fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_sub_d(&a, ix, &c); fp_sub_d(&a, ix, &c);
if (fp_cmp(&b, &c) != FP_EQ) { if (fp_cmp(&b, &c) != FP_EQ) {
@ -513,12 +535,12 @@ draw(&a);draw(&b);draw(&c);draw(&d);
draw(&a); draw(&a);
draw(&b); draw(&b);
draw(&c); draw(&c);
printf("d == %d\n", ix); printf("d == %lu\n", ix);
return 0; return 0;
} }
} else if (!strcmp(cmd, "mul_d")) { ++mul_d_n; } else if (!strcmp(cmd, "mul_d")) { ++mul_d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix); fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64); fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_mul_d(&a, ix, &c); fp_mul_d(&a, ix, &c);
if (fp_cmp(&b, &c) != FP_EQ) { if (fp_cmp(&b, &c) != FP_EQ) {
@ -526,7 +548,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
draw(&a); draw(&a);
draw(&b); draw(&b);
draw(&c); draw(&c);
printf("d == %d\n", ix); printf("d == %lu\n", ix);
return 0; return 0;
} }
} }

Binary file not shown.

View File

@ -13,7 +13,7 @@
* Some restrictions... x must be positive and < b * Some restrictions... x must be positive and < b
*/ */
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{ {
fp_int M[64], res; fp_int M[64], res;
fp_digit buf, mp; fp_digit buf, mp;
@ -34,7 +34,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
} }
/* init M array */ /* init M array */
memset(M, 0, sizeof(fp_int)*(1<<winsize)); memset(M, 0, sizeof(M));
/* now setup montgomery */ /* now setup montgomery */
if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) { if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
@ -168,3 +168,24 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
fp_copy (&res, Y); fp_copy (&res, Y);
return FP_OKAY; return FP_OKAY;
} }
/* Public modular exponentiation entry point: Y = G**X mod P.
 *
 * A non-negative exponent is forwarded unchanged to _fp_exptmod().
 * For a negative exponent a copy of G is first replaced by its inverse
 * modulo P, and that inverse is exponentiated instead.
 *
 * Returns the error code of fp_invmod() if the inversion fails, otherwise
 * whatever _fp_exptmod() returns.
 */
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
   fp_int inverse;
   int    status;

   /* common case first: exponent is not negative, nothing to prepare */
   if (X->sign != FP_NEG) {
      return _fp_exptmod(G, X, P, Y);
   }

   /* negative exponent: work on a copy so the caller's G is left untouched */
   fp_copy(G, &inverse);
   status = fp_invmod(&inverse, P, &inverse);
   if (status != FP_OKAY) {
      return status;
   }

   /* X is passed through as-is; per the original note _fp_exptmod
    * doesn't care about the sign of X */
   return _fp_exptmod(&inverse, X, P, Y);
}

66
fp_ident.c Normal file
View File

@ -0,0 +1,66 @@
#include "tfm.h"
/* Describe how this build of TFM was configured.
 *
 * The text contains the build date (__DATE__), sizeof(fp_digit),
 * sizeof(fp_word), FP_MAX_SIZE and the name of every configuration macro
 * from the list below that was defined at compile time.  A warning line is
 * appended when fp_digit and fp_word have the same size.
 *
 * Returns a pointer to a function-local static buffer.  The buffer is
 * cleared and rebuilt on every call, so the result must be copied if it has
 * to survive a later call.
 */
const char *fp_ident(void)
{
   static char buf[1024];

   memset(buf, 0, sizeof(buf));
   /* sizeof yields size_t, which has no portable conversion in C89 printf;
    * cast to unsigned long and print with %lu so argument and format agree */
   snprintf(buf, sizeof(buf)-1,
"TomsFastMath (%s)\n"
"\n"
"Sizeofs\n"
"\tfp_digit = %lu\n"
"\tfp_word = %lu\n"
"\n"
"FP_MAX_SIZE = %lu\n"
"\n"
"Defines: \n"
#ifdef __i386__
" __i386__ "
#endif
#ifdef __x86_64__
" __x86_64__ "
#endif
#ifdef TFM_X86
" TFM_X86 "
#endif
#ifdef TFM_X86_64
" TFM_X86_64 "
#endif
#ifdef TFM_SSE2
" TFM_SSE2 "
#endif
#ifdef TFM_ARM
" TFM_ARM "
#endif
#ifdef TFM_NO_ASM
" TFM_NO_ASM "
#endif
#ifdef FP_64BIT
" FP_64BIT "
#endif
#ifdef TFM_LARGE
" TFM_LARGE "
#endif
#ifdef TFM_HUGE
" TFM_HUGE "
#endif
"\n", __DATE__, (unsigned long)sizeof(fp_digit), (unsigned long)sizeof(fp_word),
      (unsigned long)FP_MAX_SIZE);

   if (sizeof(fp_digit) == sizeof(fp_word)) {
      /* strncat's bound is the number of characters it may append, not the
       * size of the destination, so pass only the space that is still free */
      strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n",
              sizeof(buf) - strlen(buf) - 1);
   }
   return buf;
}
#ifdef STANDALONE
/* Optional driver, compiled only when STANDALONE is defined:
 * prints the fp_ident() text followed by a newline and exits with 0.
 */
int main(void)
{
   const char *ident = fp_ident();

   printf("%s\n", ident);
   return 0;
}
#endif

View File

@ -12,7 +12,19 @@
/* c = a mod b, 0 <= c < b */ /* c = a mod b, 0 <= c < b */
/* c = a mod b, where a non-zero result takes the sign of the modulus b.
 *
 * The remainder is computed by fp_div() into a temporary.  If it is
 * non-zero and its sign differs from b's, b is added once so the result
 * lands on b's side of zero.  A zero remainder is returned as zero.
 *
 * Returns the error code of fp_div() on failure, FP_OKAY otherwise.
 */
int fp_mod(fp_int *a, fp_int *b, fp_int *c)
{
   fp_int t;
   int    err;

   fp_zero(&t);
   if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
      return err;
   }

   /* a zero remainder is already the answer; without this test a negative
    * modulus would get b itself back instead of 0 */
   if (fp_iszero(&t) == FP_NO && t.sign != b->sign) {
      fp_add(&t, b, c);
   } else {
      fp_copy(&t, c);
   }
   return FP_OKAY;
}

View File

@ -24,7 +24,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1); fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
} else { } else {
fp_set(a, 1); fp_set(a, 1);
++bits; bits = 1;
} }
/* now compute C = A * B mod b */ /* now compute C = A * B mod b */

View File

@ -66,13 +66,13 @@ asm(
#define PROPCARRY \ #define PROPCARRY \
asm( \ asm( \
"movq %1,%%rax \n\t" \ "movq %1,%%rax \n\t" \
"movq %2,%%rbx \n\t" \
"addq %%rax,%6 \n\t" \ "addq %%rax,%6 \n\t" \
"movq %2,%%rax \n\t" \ "adcq %%rbx,%7 \n\t" \
"adcq %%rax,%7 \n\t" \
"adcq $0,%8 \n\t" \ "adcq $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \ :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \ "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%rax", "%cc"); : "%rax", "%rbx", "%cc");
#elif defined(TFM_SSE2) #elif defined(TFM_SSE2)
@ -88,7 +88,7 @@ asm("emms");
asm(\ asm(\
"movd %0,%%mm1 \n\t" \ "movd %0,%%mm1 \n\t" \
"pmuludq %%mm2,%%mm1 \n\t" \ "pmuludq %%mm2,%%mm1 \n\t" \
:: "g"(c[x]), "g"(mp)); :: "g"(c[x]));
#define INNERMUL \ #define INNERMUL \
asm( \ asm( \
@ -112,7 +112,7 @@ asm(
"adcl %%eax,%7 \n\t" \ "adcl %%eax,%7 \n\t" \
"adcl $0,%8 \n\t" \ "adcl $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \ :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \ "g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \
: "%eax", "%cc"); : "%eax", "%cc");
#elif defined(TFM_ARM) #elif defined(TFM_ARM)
@ -166,14 +166,18 @@ asm( \
mu = c[x] * mp; mu = c[x] * mp;
#define INNERMUL \ #define INNERMUL \
t = ((fp_word)mu) * ((fp_word)*tmpm++); \ do { fp_word t; \
_c[OFF0] += t; if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \ t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
_c[OFF1] += (t>>DIGIT_BIT); if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2]; \ t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \
_c[OFF2] += (t >> DIGIT_BIT); \
} while (0);
#define PROPCARRY \ #define PROPCARRY \
_c[OFF0+1] += _c[OFF1]; if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \ do { fp_word t; \
_c[OFF1+1] += _c[OFF2]; if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1]; t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \
t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
_c[OFF2+1] += (t >> DIGIT_BIT); \
} while (0);
#endif #endif
@ -187,7 +191,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{ {
fp_digit c[3*FP_SIZE], *_c, *tmpm, mu; fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
int oldused, x, y, pa; int oldused, x, y, pa;
fp_word t;
/* now zero the buff */ /* now zero the buff */
pa = m->used; pa = m->used;
@ -221,7 +224,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
/* fix the rest of the carries */ /* fix the rest of the carries */
_c = c + pa; _c = c + pa;
for (; x < pa * 2 + 2; x++) { for (x = pa; x < pa * 2 + 2; x++) {
PROPCARRY; PROPCARRY;
++_c; ++_c;
} }

View File

@ -28,9 +28,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
fp_mul_comba4(A,B,C); fp_mul_comba4(A,B,C);
} else if (y <= 8) { } else if (y <= 8) {
fp_mul_comba8(A,B,C); fp_mul_comba8(A,B,C);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) { } else if (y <= 16 && y >= 12) {
fp_mul_comba16(A,B,C); fp_mul_comba16(A,B,C);
#ifdef TFM_HUGE #endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) { } else if (y <= 32 && y >= 28) {
fp_mul_comba32(A,B,C); fp_mul_comba32(A,B,C);
#endif #endif

View File

@ -27,7 +27,7 @@
/* forward the carry to the next digit */ /* forward the carry to the next digit */
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */ /* store the first sum */
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
@ -42,7 +42,7 @@
/* this should multiply i and j */ /* this should multiply i and j */
#define MULADD(i, j) \ #define MULADD(i, j) \
asm volatile ( \ asm ( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -62,7 +62,7 @@ asm volatile ( \
/* forward the carry to the next digit */ /* forward the carry to the next digit */
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */ /* store the first sum */
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
@ -77,13 +77,13 @@ asm volatile ( \
/* this should multiply i and j */ /* this should multiply i and j */
#define MULADD(i, j) \ #define MULADD(i, j) \
asm volatile ( \ asm ( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \ "adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \ "adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2) #elif defined(TFM_SSE2)
/* use SSE2 optimizations */ /* use SSE2 optimizations */
@ -97,7 +97,7 @@ asm volatile ( \
/* forward the carry to the next digit */ /* forward the carry to the next digit */
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */ /* store the first sum */
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
@ -134,7 +134,7 @@ asm volatile ( \
c0 = c1 = c2 = 0; c0 = c1 = c2 = 0;
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
x = c0; x = c0;
@ -161,7 +161,7 @@ asm( \
c0 = c1 = c2 = 0; c0 = c1 = c2 = 0;
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
x = c0; x = c0;
@ -172,9 +172,10 @@ asm( \
#define COMBA_FINI #define COMBA_FINI
#define MULADD(i, j) \ #define MULADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \ do { fp_word t; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
} while (0);
#endif #endif
@ -184,7 +185,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
{ {
int ix, iy, iz, tx, ty, pa; int ix, iy, iz, tx, ty, pa;
fp_digit c0, c1, c2, *tmpx, *tmpy; fp_digit c0, c1, c2, *tmpx, *tmpy;
fp_word t;
fp_int tmp, *dst; fp_int tmp, *dst;
COMBA_START; COMBA_START;
@ -239,7 +239,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
{ {
fp_word t;
fp_digit c0, c1, c2, at[8]; fp_digit c0, c1, c2, at[8];
memcpy(at, A->dp, 4 * sizeof(fp_digit)); memcpy(at, A->dp, 4 * sizeof(fp_digit));
@ -284,7 +283,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
{ {
fp_word t;
fp_digit c0, c1, c2, at[16]; fp_digit c0, c1, c2, at[16];
memcpy(at, A->dp, 8 * sizeof(fp_digit)); memcpy(at, A->dp, 8 * sizeof(fp_digit));
@ -358,10 +356,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
COMBA_FINI; COMBA_FINI;
} }
#if defined(TFM_LARGE)
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
{ {
fp_word t;
fp_digit c0, c1, c2, at[32]; fp_digit c0, c1, c2, at[32];
memcpy(at, A->dp, 16 * sizeof(fp_digit)); memcpy(at, A->dp, 16 * sizeof(fp_digit));
@ -499,11 +497,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
COMBA_FINI; COMBA_FINI;
} }
#endif /* TFM_LARGE */
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
{ {
fp_word t;
fp_digit c0, c1, c2, at[64]; fp_digit c0, c1, c2, at[64];
memcpy(at, A->dp, 32 * sizeof(fp_digit)); memcpy(at, A->dp, 32 * sizeof(fp_digit));

View File

@ -11,4 +11,39 @@
/* Compute how many characters are needed to hold a in the given radix.
 *
 * a     - value to measure (not modified; a copy is divided down)
 * radix - base of the representation, must be in 2..64
 * size  - out: one character per digit, plus one for a leading '-' when a
 *         is negative, plus one for the terminating NUL.  Zero reports 2.
 *         Set to 0 when the radix is rejected.
 *
 * Returns FP_VAL for a radix outside 2..64, FP_OKAY otherwise.
 */
int fp_radix_size(fp_int *a, int radix, int *size)
{
   fp_int   t;
   fp_digit d;

   *size = 0;

   /* check range of the radix */
   if (radix < 2 || radix > 64) {
      return FP_VAL;
   }

   /* quick out if its zero */
   if (fp_iszero(a) == 1) {
      *size = 2;
      return FP_OKAY;
   }

   fp_init_copy(&t, a);

   /* if it is negative output a - */
   if (t.sign == FP_NEG) {
      /* parentheses matter: *size++ would advance the pointer instead of
       * incrementing the count it points at */
      (*size)++;
      t.sign = FP_ZPOS;
   }

   /* one character per radix digit: divide until nothing is left */
   while (fp_iszero (&t) == FP_NO) {
      fp_div_d (&t, (fp_digit) radix, &t, &d);
      (*size)++;
   }

   /* account for the NUL that terminates the string */
   (*size)++;
   return FP_OKAY;
}

View File

@ -21,9 +21,11 @@ void fp_sqr(fp_int *A, fp_int *B)
fp_sqr_comba4(A,B); fp_sqr_comba4(A,B);
} else if (y <= 8) { } else if (y <= 8) {
fp_sqr_comba8(A,B); fp_sqr_comba8(A,B);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) { } else if (y <= 16 && y >= 12) {
fp_sqr_comba16(A,B); fp_sqr_comba16(A,B);
#ifdef TFM_HUGE #endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) { } else if (y <= 32 && y >= 28) {
fp_sqr_comba32(A,B); fp_sqr_comba32(A,B);
#endif #endif

View File

@ -28,7 +28,7 @@
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI #define COMBA_FINI
@ -68,21 +68,21 @@ asm volatile ( \
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI #define COMBA_FINI
#define SQRADD(i, j) \ #define SQRADD(i, j) \
asm volatile ( \ asm ( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \ "mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \ "adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \ "adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm volatile ( \ asm ( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
@ -91,7 +91,7 @@ asm volatile ( \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \ "adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \ "adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2) #elif defined(TFM_SSE2)
@ -109,7 +109,7 @@ asm volatile ( \
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI \ #define COMBA_FINI \
asm("emms"); asm("emms");
@ -120,11 +120,11 @@ asm volatile ( \
"pmuludq %%mm0,%%mm0\n\t" \ "pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \ "movd %%mm0,%%eax \n\t" \
"psrlq $32,%%mm0 \n\t" \ "psrlq $32,%%mm0 \n\t" \
"movd %%mm0,%%edx \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \ "movd %%mm0,%%eax \n\t" \
"adcl %%eax,%1 \n\t" \
"adcl $0,%2 \n\t" \ "adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm volatile ( \ asm volatile ( \
@ -158,7 +158,7 @@ asm volatile ( \
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI #define COMBA_FINI
@ -187,7 +187,8 @@ asm( \
/* ISO C portable code */ /* ISO C portable code */
#define COMBA_START #define COMBA_START \
{ fp_word tt;
#define CLEAR_CARRY \ #define CLEAR_CARRY \
c0 = c1 = c2 = 0; c0 = c1 = c2 = 0;
@ -199,23 +200,28 @@ asm( \
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI #define COMBA_FINI \
}
/* multiplies point i and j, updates carry "c1" and digit c2 */ /* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \ #define SQRADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \ do { fp_word t; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
} while (0);
/* for squaring some of the terms are doubled... */ /* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
do { fp_word t; \
t = ((fp_word)i) * ((fp_word)j); \ t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ tt = (fp_word)c0 + t; c0 = tt; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \ tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ tt = (fp_word)c0 + t; c0 = tt; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
} while (0);
#endif #endif
@ -225,7 +231,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
int pa, ix, iz; int pa, ix, iz;
fp_digit c0, c1, c2; fp_digit c0, c1, c2;
fp_int tmp, *dst; fp_int tmp, *dst;
fp_word t;
/* get size of output and trim */ /* get size of output and trim */
pa = A->used + A->used; pa = A->used + A->used;
@ -298,7 +303,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
void fp_sqr_comba4(fp_int *A, fp_int *B) void fp_sqr_comba4(fp_int *A, fp_int *B)
{ {
fp_word t;
fp_digit *a, b[8], c0, c1, c2; fp_digit *a, b[8], c0, c1, c2;
a = A->dp; a = A->dp;
@ -352,7 +356,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B)
void fp_sqr_comba8(fp_int *A, fp_int *B) void fp_sqr_comba8(fp_int *A, fp_int *B)
{ {
fp_word t;
fp_digit *a, b[16], c0, c1, c2; fp_digit *a, b[16], c0, c1, c2;
a = A->dp; a = A->dp;
@ -443,10 +446,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B)
fp_clamp(B); fp_clamp(B);
} }
#if defined(TFM_LARGE)
void fp_sqr_comba16(fp_int *A, fp_int *B) void fp_sqr_comba16(fp_int *A, fp_int *B)
{ {
fp_word t;
fp_digit *a, b[32], c0, c1, c2; fp_digit *a, b[32], c0, c1, c2;
a = A->dp; a = A->dp;
@ -617,11 +620,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B)
fp_clamp(B); fp_clamp(B);
} }
#endif /* TFM_LARGE */
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B) void fp_sqr_comba32(fp_int *A, fp_int *B)
{ {
fp_word t;
fp_digit *a, b[64], c0, c1, c2; fp_digit *a, b[64], c0, c1, c2;
a = A->dp; a = A->dp;

2
gen.pl
View File

@ -6,7 +6,7 @@
use strict; use strict;
open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!"; open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
foreach my $filename (glob "fp_*.c") { foreach my $filename (glob "*fp_*.c") {
open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!"; open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
print OUT "/* Start: $filename */\n"; print OUT "/* Start: $filename */\n";
print OUT while <SRC>; print OUT while <SRC>;

View File

@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
#speed #speed
CFLAGS += -fomit-frame-pointer CFLAGS += -fomit-frame-pointer
VERSION=0.01 VERSION=0.02
default: libtfm.a default: libtfm.a
@ -38,8 +38,9 @@ fp_cmp.o fp_cmp_mag.o \
\ \
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \ fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \ fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \ fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
\ \
fp_ident.o
libtfm.a: $(OBJECTS) libtfm.a: $(OBJECTS)
$(AR) $(ARFLAGS) libtfm.a $(OBJECTS) $(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
@ -49,7 +50,7 @@ mtest/mtest: mtest/mtest.c
cd mtest ; make mtest cd mtest ; make mtest
test: libtfm.a demo/test.o mtest/mtest test: libtfm.a demo/test.o mtest/mtest
$(CC) demo/test.o libtfm.a $(PROF) -o test $(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test
stest: libtfm.a demo/stest.o stest: libtfm.a demo/stest.o
$(CC) demo/stest.o libtfm.a -o stest $(CC) demo/stest.o libtfm.a -o stest
@ -67,7 +68,7 @@ docs: docdvi
mv -f tfm.pdf doc mv -f tfm.pdf doc
clean: clean:
rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest *~
cd mtest ; make clean cd mtest ; make clean
zipup: docs clean zipup: docs clean

View File

@ -6,4 +6,4 @@ mtest: mtest.o
$(CC) mtest.o -ltommath -o mtest $(CC) mtest.o -ltommath -o mtest
clean: clean:
rm -f *.o mtest rm -f *.o mtest *~

View File

@ -235,10 +235,9 @@ int main(void)
rand_num2(&a); rand_num2(&a);
rand_num2(&b); rand_num2(&b);
rand_num2(&c); rand_num2(&c);
// if (c.dp[0]&1) mp_add_d(&c, 1, &c);
a.sign = b.sign = c.sign = 0; a.sign = b.sign = c.sign = 0;
c.dp[0] |= 1; c.dp[0] |= 1;
if (c.used <= 2) continue; // if (c.used <= 4) continue;
// if (mp_cmp(&a, &c) != MP_LT) continue; // if (mp_cmp(&a, &c) != MP_LT) continue;
// if (mp_cmp(&b, &c) != MP_LT) continue; // if (mp_cmp(&b, &c) != MP_LT) continue;
mp_exptmod(&a, &b, &c, &d); mp_exptmod(&a, &b, &c, &d);

View File

@ -709,7 +709,7 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
* Some restrictions... x must be positive and < b * Some restrictions... x must be positive and < b
*/ */
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{ {
fp_int M[64], res; fp_int M[64], res;
fp_digit buf, mp; fp_digit buf, mp;
@ -730,7 +730,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
} }
/* init M array */ /* init M array */
memset(M, 0, sizeof(fp_int)*(1<<winsize)); memset(M, 0, sizeof(M));
/* now setup montgomery */ /* now setup montgomery */
if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) { if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
@ -865,6 +865,27 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
return FP_OKAY; return FP_OKAY;
} }
/* Public modular exponentiation entry point: Y = G**X mod P.
 *
 * A non-negative exponent is forwarded unchanged to _fp_exptmod().
 * For a negative exponent a copy of G is first replaced by its inverse
 * modulo P, and that inverse is exponentiated instead.
 *
 * Returns the error code of fp_invmod() if the inversion fails, otherwise
 * whatever _fp_exptmod() returns.
 */
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
   fp_int inverse;
   int    status;

   /* common case first: exponent is not negative, nothing to prepare */
   if (X->sign != FP_NEG) {
      return _fp_exptmod(G, X, P, Y);
   }

   /* negative exponent: work on a copy so the caller's G is left untouched */
   fp_copy(G, &inverse);
   status = fp_invmod(&inverse, P, &inverse);
   if (status != FP_OKAY) {
      return status;
   }

   /* X is passed through as-is; per the original note _fp_exptmod
    * doesn't care about the sign of X */
   return _fp_exptmod(&inverse, X, P, Y);
}
/* End: fp_exptmod.c */ /* End: fp_exptmod.c */
/* Start: fp_gcd.c */ /* Start: fp_gcd.c */
@ -922,6 +943,76 @@ void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
/* End: fp_gcd.c */ /* End: fp_gcd.c */
/* Start: fp_ident.c */
#include "tfm.h"
/* Describe how this build of TFM was configured.
 *
 * The text contains the build date (__DATE__), sizeof(fp_digit),
 * sizeof(fp_word), FP_MAX_SIZE and the name of every configuration macro
 * from the list below that was defined at compile time.  A warning line is
 * appended when fp_digit and fp_word have the same size.
 *
 * Returns a pointer to a function-local static buffer.  The buffer is
 * cleared and rebuilt on every call, so the result must be copied if it has
 * to survive a later call.
 */
const char *fp_ident(void)
{
   static char buf[1024];

   memset(buf, 0, sizeof(buf));
   /* sizeof yields size_t, which has no portable conversion in C89 printf;
    * cast to unsigned long and print with %lu so argument and format agree */
   snprintf(buf, sizeof(buf)-1,
"TomsFastMath (%s)\n"
"\n"
"Sizeofs\n"
"\tfp_digit = %lu\n"
"\tfp_word = %lu\n"
"\n"
"FP_MAX_SIZE = %lu\n"
"\n"
"Defines: \n"
#ifdef __i386__
" __i386__ "
#endif
#ifdef __x86_64__
" __x86_64__ "
#endif
#ifdef TFM_X86
" TFM_X86 "
#endif
#ifdef TFM_X86_64
" TFM_X86_64 "
#endif
#ifdef TFM_SSE2
" TFM_SSE2 "
#endif
#ifdef TFM_ARM
" TFM_ARM "
#endif
#ifdef TFM_NO_ASM
" TFM_NO_ASM "
#endif
#ifdef FP_64BIT
" FP_64BIT "
#endif
#ifdef TFM_LARGE
" TFM_LARGE "
#endif
#ifdef TFM_HUGE
" TFM_HUGE "
#endif
"\n", __DATE__, (unsigned long)sizeof(fp_digit), (unsigned long)sizeof(fp_word),
      (unsigned long)FP_MAX_SIZE);

   if (sizeof(fp_digit) == sizeof(fp_word)) {
      /* strncat's bound is the number of characters it may append, not the
       * size of the destination, so pass only the space that is still free */
      strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n",
              sizeof(buf) - strlen(buf) - 1);
   }
   return buf;
}
#ifdef STANDALONE
/* STANDALONE demo: print the configuration report followed by a newline */
int main(void)
{
   puts(fp_ident());
   return 0;
}
#endif
/* End: fp_ident.c */
/* Start: fp_invmod.c */ /* Start: fp_invmod.c */
/* TomsFastMath, a fast ISO C bignum library. /* TomsFastMath, a fast ISO C bignum library.
* *
@ -1186,7 +1277,19 @@ void fp_lshd(fp_int *a, int x)
/* c = a mod b, 0 <= c < b */ /* c = a mod b, 0 <= c < b */
int fp_mod(fp_int *a, fp_int *b, fp_int *c) int fp_mod(fp_int *a, fp_int *b, fp_int *c)
{ {
return fp_div(a, b, NULL, c); fp_int t;
int err;
fp_zero(&t);
if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
return err;
}
if (t.sign != b->sign) {
fp_add(&t, b, c);
} else {
fp_copy(&t, c);
}
return FP_OKAY;
} }
@ -1282,7 +1385,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1); fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
} else { } else {
fp_set(a, 1); fp_set(a, 1);
++bits; bits = 1;
} }
/* now compute C = A * B mod b */ /* now compute C = A * B mod b */
@ -1366,13 +1469,13 @@ asm(
#define PROPCARRY \ #define PROPCARRY \
asm( \ asm( \
"movq %1,%%rax \n\t" \ "movq %1,%%rax \n\t" \
"movq %2,%%rbx \n\t" \
"addq %%rax,%6 \n\t" \ "addq %%rax,%6 \n\t" \
"movq %2,%%rax \n\t" \ "adcq %%rbx,%7 \n\t" \
"adcq %%rax,%7 \n\t" \
"adcq $0,%8 \n\t" \ "adcq $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \ :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \ "m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%rax", "%cc"); : "%rax", "%rbx", "%cc");
#elif defined(TFM_SSE2) #elif defined(TFM_SSE2)
@ -1388,7 +1491,7 @@ asm("emms");
asm(\ asm(\
"movd %0,%%mm1 \n\t" \ "movd %0,%%mm1 \n\t" \
"pmuludq %%mm2,%%mm1 \n\t" \ "pmuludq %%mm2,%%mm1 \n\t" \
:: "g"(c[x]), "g"(mp)); :: "g"(c[x]));
#define INNERMUL \ #define INNERMUL \
asm( \ asm( \
@ -1412,7 +1515,7 @@ asm(
"adcl %%eax,%7 \n\t" \ "adcl %%eax,%7 \n\t" \
"adcl $0,%8 \n\t" \ "adcl $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \ :"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \ "g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \
: "%eax", "%cc"); : "%eax", "%cc");
#elif defined(TFM_ARM) #elif defined(TFM_ARM)
@ -1466,14 +1569,18 @@ asm( \
mu = c[x] * mp; mu = c[x] * mp;
#define INNERMUL \ #define INNERMUL \
t = ((fp_word)mu) * ((fp_word)*tmpm++); \ do { fp_word t; \
_c[OFF0] += t; if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \ t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
_c[OFF1] += (t>>DIGIT_BIT); if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2]; \ t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \
_c[OFF2] += (t >> DIGIT_BIT); \
} while (0);
#define PROPCARRY \ #define PROPCARRY \
_c[OFF0+1] += _c[OFF1]; if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \ do { fp_word t; \
_c[OFF1+1] += _c[OFF2]; if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1]; t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \
t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
_c[OFF2+1] += (t >> DIGIT_BIT); \
} while (0);
#endif #endif
@ -1487,7 +1594,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{ {
fp_digit c[3*FP_SIZE], *_c, *tmpm, mu; fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
int oldused, x, y, pa; int oldused, x, y, pa;
fp_word t;
/* now zero the buff */ /* now zero the buff */
pa = m->used; pa = m->used;
@ -1521,7 +1627,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
/* fix the rest of the carries */ /* fix the rest of the carries */
_c = c + pa; _c = c + pa;
for (; x < pa * 2 + 2; x++) { for (x = pa; x < pa * 2 + 2; x++) {
PROPCARRY; PROPCARRY;
++_c; ++_c;
} }
@ -1629,9 +1735,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
fp_mul_comba4(A,B,C); fp_mul_comba4(A,B,C);
} else if (y <= 8) { } else if (y <= 8) {
fp_mul_comba8(A,B,C); fp_mul_comba8(A,B,C);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) { } else if (y <= 16 && y >= 12) {
fp_mul_comba16(A,B,C); fp_mul_comba16(A,B,C);
#ifdef TFM_HUGE #endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) { } else if (y <= 32 && y >= 28) {
fp_mul_comba32(A,B,C); fp_mul_comba32(A,B,C);
#endif #endif
@ -1880,7 +1988,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
/* forward the carry to the next digit */ /* forward the carry to the next digit */
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */ /* store the first sum */
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
@ -1895,7 +2003,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
/* this should multiply i and j */ /* this should multiply i and j */
#define MULADD(i, j) \ #define MULADD(i, j) \
asm volatile ( \ asm ( \
"movl %6,%%eax \n\t" \ "movl %6,%%eax \n\t" \
"mull %7 \n\t" \ "mull %7 \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
@ -1915,7 +2023,7 @@ asm volatile ( \
/* forward the carry to the next digit */ /* forward the carry to the next digit */
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */ /* store the first sum */
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
@ -1930,13 +2038,13 @@ asm volatile ( \
/* this should multiply i and j */ /* this should multiply i and j */
#define MULADD(i, j) \ #define MULADD(i, j) \
asm volatile ( \ asm ( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \ "adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \ "adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2) #elif defined(TFM_SSE2)
/* use SSE2 optimizations */ /* use SSE2 optimizations */
@ -1950,7 +2058,7 @@ asm volatile ( \
/* forward the carry to the next digit */ /* forward the carry to the next digit */
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */ /* store the first sum */
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
@ -1987,7 +2095,7 @@ asm volatile ( \
c0 = c1 = c2 = 0; c0 = c1 = c2 = 0;
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
x = c0; x = c0;
@ -2014,7 +2122,7 @@ asm( \
c0 = c1 = c2 = 0; c0 = c1 = c2 = 0;
#define COMBA_FORWARD \ #define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_STORE(x) \ #define COMBA_STORE(x) \
x = c0; x = c0;
@ -2025,9 +2133,10 @@ asm( \
#define COMBA_FINI #define COMBA_FINI
#define MULADD(i, j) \ #define MULADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \ do { fp_word t; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
} while (0);
#endif #endif
@ -2037,7 +2146,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
{ {
int ix, iy, iz, tx, ty, pa; int ix, iy, iz, tx, ty, pa;
fp_digit c0, c1, c2, *tmpx, *tmpy; fp_digit c0, c1, c2, *tmpx, *tmpy;
fp_word t;
fp_int tmp, *dst; fp_int tmp, *dst;
COMBA_START; COMBA_START;
@ -2092,7 +2200,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
{ {
fp_word t;
fp_digit c0, c1, c2, at[8]; fp_digit c0, c1, c2, at[8];
memcpy(at, A->dp, 4 * sizeof(fp_digit)); memcpy(at, A->dp, 4 * sizeof(fp_digit));
@ -2137,7 +2244,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
{ {
fp_word t;
fp_digit c0, c1, c2, at[16]; fp_digit c0, c1, c2, at[16];
memcpy(at, A->dp, 8 * sizeof(fp_digit)); memcpy(at, A->dp, 8 * sizeof(fp_digit));
@ -2211,10 +2317,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
COMBA_FINI; COMBA_FINI;
} }
#if defined(TFM_LARGE)
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
{ {
fp_word t;
fp_digit c0, c1, c2, at[32]; fp_digit c0, c1, c2, at[32];
memcpy(at, A->dp, 16 * sizeof(fp_digit)); memcpy(at, A->dp, 16 * sizeof(fp_digit));
@ -2352,11 +2458,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
COMBA_FINI; COMBA_FINI;
} }
#endif /* TFM_LARGE */
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
{ {
fp_word t;
fp_digit c0, c1, c2, at[64]; fp_digit c0, c1, c2, at[64];
memcpy(at, A->dp, 32 * sizeof(fp_digit)); memcpy(at, A->dp, 32 * sizeof(fp_digit));
@ -2880,6 +2987,41 @@ error:
/* Computes how many bytes are needed to hold the string form of "a" in the
 * given radix, counting a leading '-' for negative values and the
 * terminating NUL.
 *
 * a      the number to measure (not modified; a copy is divided down)
 * radix  the base, 2..64 inclusive
 * size   [out] receives the byte count; set to 0 when the radix is rejected
 *
 * Returns FP_VAL if radix is out of range, FP_OKAY otherwise.
 */
int fp_radix_size(fp_int *a, int radix, int *size)
{
   fp_int   t;
   fp_digit d;

   *size = 0;

   /* check range of the radix */
   if (radix < 2 || radix > 64) {
      return FP_VAL;
   }

   /* quick out if its zero: one digit plus the terminator */
   if (fp_iszero(a) == 1) {
      *size = 2;
      return FP_OKAY;
   }

   fp_init_copy(&t, a);

   /* if it is negative output a - */
   if (t.sign == FP_NEG) {
      /* note: "*size++" would advance the pointer, not the count */
      (*size)++;
      t.sign = FP_ZPOS;
   }

   /* one character per digit produced by repeated division by the radix */
   while (fp_iszero (&t) == FP_NO) {
      fp_div_d (&t, (fp_digit) radix, &t, &d);
      (*size)++;
   }

   /* append a NULL so the string is properly terminated */
   (*size)++;
   return FP_OKAY;
}
/* End: fp_radix_size.c */ /* End: fp_radix_size.c */
@ -3161,9 +3303,11 @@ void fp_sqr(fp_int *A, fp_int *B)
fp_sqr_comba4(A,B); fp_sqr_comba4(A,B);
} else if (y <= 8) { } else if (y <= 8) {
fp_sqr_comba8(A,B); fp_sqr_comba8(A,B);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) { } else if (y <= 16 && y >= 12) {
fp_sqr_comba16(A,B); fp_sqr_comba16(A,B);
#ifdef TFM_HUGE #endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) { } else if (y <= 32 && y >= 28) {
fp_sqr_comba32(A,B); fp_sqr_comba32(A,B);
#endif #endif
@ -3279,7 +3423,7 @@ Obvious points of optimization
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI #define COMBA_FINI
@ -3319,21 +3463,21 @@ asm volatile ( \
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI #define COMBA_FINI
#define SQRADD(i, j) \ #define SQRADD(i, j) \
asm volatile ( \ asm ( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \ "mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \ "adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \ "adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm volatile ( \ asm ( \
"movq %6,%%rax \n\t" \ "movq %6,%%rax \n\t" \
"mulq %7 \n\t" \ "mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
@ -3342,7 +3486,7 @@ asm volatile ( \
"addq %%rax,%0 \n\t" \ "addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \ "adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \ "adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2) #elif defined(TFM_SSE2)
@ -3360,7 +3504,7 @@ asm volatile ( \
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI \ #define COMBA_FINI \
asm("emms"); asm("emms");
@ -3371,11 +3515,11 @@ asm volatile ( \
"pmuludq %%mm0,%%mm0\n\t" \ "pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \ "movd %%mm0,%%eax \n\t" \
"psrlq $32,%%mm0 \n\t" \ "psrlq $32,%%mm0 \n\t" \
"movd %%mm0,%%edx \n\t" \
"addl %%eax,%0 \n\t" \ "addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \ "movd %%mm0,%%eax \n\t" \
"adcl %%eax,%1 \n\t" \
"adcl $0,%2 \n\t" \ "adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc"); :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
asm volatile ( \ asm volatile ( \
@ -3409,7 +3553,7 @@ asm volatile ( \
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI #define COMBA_FINI
@ -3438,7 +3582,8 @@ asm( \
/* ISO C portable code */ /* ISO C portable code */
#define COMBA_START #define COMBA_START \
{ fp_word tt;
#define CLEAR_CARRY \ #define CLEAR_CARRY \
c0 = c1 = c2 = 0; c0 = c1 = c2 = 0;
@ -3450,23 +3595,28 @@ asm( \
x = c1; x = c1;
#define CARRY_FORWARD \ #define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0; do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI #define COMBA_FINI \
}
/* multiplies point i and j, updates carry "c1" and digit c2 */ /* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \ #define SQRADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \ do { fp_word t; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
} while (0);
/* for squaring some of the terms are doubled... */ /* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \ #define SQRADD2(i, j) \
do { fp_word t; \
t = ((fp_word)i) * ((fp_word)j); \ t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ tt = (fp_word)c0 + t; c0 = tt; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \ tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \ tt = (fp_word)c0 + t; c0 = tt; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
} while (0);
#endif #endif
@ -3476,7 +3626,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
int pa, ix, iz; int pa, ix, iz;
fp_digit c0, c1, c2; fp_digit c0, c1, c2;
fp_int tmp, *dst; fp_int tmp, *dst;
fp_word t;
/* get size of output and trim */ /* get size of output and trim */
pa = A->used + A->used; pa = A->used + A->used;
@ -3549,7 +3698,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
void fp_sqr_comba4(fp_int *A, fp_int *B) void fp_sqr_comba4(fp_int *A, fp_int *B)
{ {
fp_word t;
fp_digit *a, b[8], c0, c1, c2; fp_digit *a, b[8], c0, c1, c2;
a = A->dp; a = A->dp;
@ -3603,7 +3751,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B)
void fp_sqr_comba8(fp_int *A, fp_int *B) void fp_sqr_comba8(fp_int *A, fp_int *B)
{ {
fp_word t;
fp_digit *a, b[16], c0, c1, c2; fp_digit *a, b[16], c0, c1, c2;
a = A->dp; a = A->dp;
@ -3694,10 +3841,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B)
fp_clamp(B); fp_clamp(B);
} }
#if defined(TFM_LARGE)
void fp_sqr_comba16(fp_int *A, fp_int *B) void fp_sqr_comba16(fp_int *A, fp_int *B)
{ {
fp_word t;
fp_digit *a, b[32], c0, c1, c2; fp_digit *a, b[32], c0, c1, c2;
a = A->dp; a = A->dp;
@ -3868,11 +4015,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B)
fp_clamp(B); fp_clamp(B);
} }
#endif /* TFM_LARGE */
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B) void fp_sqr_comba32(fp_int *A, fp_int *B)
{ {
fp_word t;
fp_digit *a, b[64], c0, c1, c2; fp_digit *a, b[64], c0, c1, c2;
a = A->dp; a = A->dp;
@ -4455,5 +4603,81 @@ int fp_unsigned_bin_size(fp_int *a)
/* End: fp_unsigned_bin_size.c */ /* End: fp_unsigned_bin_size.c */
/* Start: s_fp_add.c */
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* unsigned addition */
/* Low level addition of magnitudes: c = |a| + |b|.
 *
 * Signs are not consulted.  Digits are summed in a double-width accumulator
 * whose upper half carries into the next position.  A carry out of the top
 * digit is stored only if there is still room (index != FP_SIZE); otherwise
 * it is dropped.  Digits of c above the new length that were in use before
 * the call are cleared, then the result is clamped.
 */
void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
{
   int     i, ndigits, prev_used;
   fp_word carry;

   /* remember how long c used to be so stale digits can be wiped later */
   prev_used = c->used;
   ndigits   = MAX(a->used, b->used);
   c->used   = ndigits;

   carry = 0;
   i     = 0;
   while (i < ndigits) {
      carry   += ((fp_word)a->dp[i]) + ((fp_word)b->dp[i]);
      c->dp[i] = (fp_digit)carry;
      carry  >>= DIGIT_BIT;
      ++i;
   }

   /* final carry becomes a new top digit when it fits */
   if (carry != 0 && i != FP_SIZE) {
      c->dp[c->used++] = (fp_digit)carry;
      ++i;
   }

   /* zero whatever is left over from the previous contents of c */
   while (i < prev_used) {
      c->dp[i] = 0;
      ++i;
   }
   fp_clamp(c);
}
/* End: s_fp_add.c */
/* Start: s_fp_sub.c */
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
/* Low level subtraction of magnitudes: c = |a| - |b|.
 *
 * The caller must guarantee ||a|| >= ||b||.  Each digit difference is formed
 * in a double-width word; the bit just above the digit is extracted as the
 * borrow for the next position.  Digits of c above a->used that were in use
 * before the call are cleared, then the result is clamped.
 */
void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
{
   int     i, prev_used;
   fp_word borrow;

   prev_used = c->used;
   c->used   = a->used;

   borrow = 0;
   for (i = 0; i < a->used; ++i) {
      fp_word diff;

      diff     = ((fp_word)a->dp[i]) - (((fp_word)b->dp[i]) + borrow);
      c->dp[i] = (fp_digit)diff;
      /* a wrapped difference sets the bit above the digit: that is the borrow */
      borrow   = (diff >> DIGIT_BIT) & 1;
   }

   /* zero whatever is left over from the previous contents of c */
   while (i < prev_used) {
      c->dp[i] = 0;
      ++i;
   }
   fp_clamp(c);
}
/* End: s_fp_sub.c */
/* EOF */ /* EOF */

81
tfm.h
View File

@ -21,9 +21,19 @@
#undef MAX #undef MAX
#define MAX(x,y) ((x)>(y)?(x):(y)) #define MAX(x,y) ((x)>(y)?(x):(y))
/* do we want huge code? The answer is, yes. */ /* do we want large code? */
#define TFM_LARGE
/* do we want huge code (implies large)? The answer is, yes. */
#define TFM_HUGE #define TFM_HUGE
/* imply TFM_LARGE as required */
#if defined(TFM_HUGE)
#if !defined(TFM_LARGE)
#define TFM_LARGE
#endif
#endif
/* Max size of any number in bits. Basically the largest size you will be multiplying /* Max size of any number in bits. Basically the largest size you will be multiplying
* should be half [or smaller] of FP_MAX_SIZE-four_digit * should be half [or smaller] of FP_MAX_SIZE-four_digit
* *
@ -41,19 +51,69 @@
#error FP_MAX_SIZE must be a multiple of CHAR_BIT #error FP_MAX_SIZE must be a multiple of CHAR_BIT
#endif #endif
/* make sure we are using 64-bit digits with x86-64 asm */ /* autodetect x86-64 and make sure we are using 64-bit digits with x86-64 asm */
#if defined(__x86_64__)
#if defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)
#error x86-64 detected, x86-32/SSE2/ARM optimizations are not valid!
#endif
#if !defined(TFM_X86_64) && !defined(TFM_NO_ASM)
#define TFM_X86_64
#endif
#endif
#if defined(TFM_X86_64) #if defined(TFM_X86_64)
#ifndef FP_64BIT #if !defined(FP_64BIT)
#define FP_64BIT #define FP_64BIT
#endif #endif
#endif #endif
/* try to detect x86-32 */
#if defined(__i386__) && !defined(TFM_SSE2)
#if defined(TFM_X86_64) || defined(TFM_ARM)
#error x86-32 detected, x86-64/ARM optimizations are not valid!
#endif
#if !defined(TFM_X86) && !defined(TFM_NO_ASM)
#define TFM_X86
#endif
#endif
/* make sure we're 32-bit for x86-32/sse/arm */ /* make sure we're 32-bit for x86-32/sse/arm */
#if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT) #if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT)
#warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining) #warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining)
#undef FP_64BIT #undef FP_64BIT
#endif #endif
/* multi asms? */
#ifdef TFM_X86
#define TFM_ASM
#endif
#ifdef TFM_X86_64
#ifdef TFM_ASM
#error TFM_ASM already defined!
#endif
#define TFM_ASM
#endif
#ifdef TFM_SSE2
#ifdef TFM_ASM
#error TFM_ASM already defined!
#endif
#define TFM_ASM
#endif
#ifdef TFM_ARM
#ifdef TFM_ASM
#error TFM_ASM already defined!
#endif
#define TFM_ASM
#endif
/* we want no asm? */
#ifdef TFM_NO_ASM
#undef TFM_X86
#undef TFM_X86_64
#undef TFM_SSE2
#undef TFM_ARM
#undef TFM_ASM
#endif
/* some default configurations. /* some default configurations.
*/ */
#if defined(FP_64BIT) #if defined(FP_64BIT)
@ -110,8 +170,11 @@ typedef struct {
/* functions */ /* functions */
/* returns a TFM ident string useful for debugging... */
const char *fp_ident(void);
/* initialize [or zero] an fp int */ /* initialize [or zero] an fp int */
#define fp_init(a) memset((a), 0, sizeof(fp_int)) #define fp_init(a) (void)memset((a), 0, sizeof(fp_int))
#define fp_zero(a) fp_init(a) #define fp_zero(a) fp_init(a)
/* zero/even/odd ? */ /* zero/even/odd ? */
@ -123,7 +186,7 @@ typedef struct {
void fp_set(fp_int *a, fp_digit b); void fp_set(fp_int *a, fp_digit b);
/* copy from a to b */ /* copy from a to b */
#define fp_copy(a, b) (((a) != (b)) && memcpy((b), (a), sizeof(fp_int))) #define fp_copy(a, b) (void)(((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
#define fp_init_copy(a, b) fp_copy(b, a) #define fp_init_copy(a, b) fp_copy(b, a)
/* negate and absolute */ /* negate and absolute */
@ -139,10 +202,10 @@ void fp_rshd(fp_int *a, int x);
/* left shift x digits */ /* left shift x digits */
void fp_lshd(fp_int *a, int x); void fp_lshd(fp_int *a, int x);
/* signed comparisonm */ /* signed comparison */
int fp_cmp(fp_int *a, fp_int *b); int fp_cmp(fp_int *a, fp_int *b);
/* unsigned comparisonm */ /* unsigned comparison */
int fp_cmp_mag(fp_int *a, fp_int *b); int fp_cmp_mag(fp_int *a, fp_int *b);
/* power of 2 operations */ /* power of 2 operations */
@ -273,14 +336,18 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
#endif #endif
#ifdef TFM_LARGE
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C);
#endif
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C); void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
void fp_sqr_comba(fp_int *A, fp_int *B); void fp_sqr_comba(fp_int *A, fp_int *B);
void fp_sqr_comba4(fp_int *A, fp_int *B); void fp_sqr_comba4(fp_int *A, fp_int *B);
void fp_sqr_comba8(fp_int *A, fp_int *B); void fp_sqr_comba8(fp_int *A, fp_int *B);
#ifdef TFM_LARGE
void fp_sqr_comba16(fp_int *A, fp_int *B); void fp_sqr_comba16(fp_int *A, fp_int *B);
#endif
#ifdef TFM_HUGE #ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B); void fp_sqr_comba32(fp_int *A, fp_int *B);
#endif #endif

25
tfm.tex
View File

@ -49,7 +49,7 @@
\begin{document} \begin{document}
\frontmatter \frontmatter
\pagestyle{empty} \pagestyle{empty}
\title{TomsFastMath User Manual \\ v0.01} \title{TomsFastMath User Manual \\ v0.02}
\author{Tom St Denis \\ tomstdenis@iahu.ca} \author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle \maketitle
This text and library are all hereby placed in the public domain. This book has been formatted for B5 This text and library are all hereby placed in the public domain. This book has been formatted for B5
@ -128,23 +128,27 @@ several ``CFLAGS'' defines.
For example, to build with SSE2 optimizations type For example, to build with SSE2 optimizations type
\begin{verbatim} \begin{verbatim}
export CFLAGS=-DTFM_SSE2 CFLAGS=-DTFM_SSE2 make clean libtfm.a
make clean libtfm.a
\end{verbatim} \end{verbatim}
\subsubsection{x86--32} The ``x86--32'' mode is defined by ``TFM\_X86'' and covers all \subsubsection{x86--32} The ``x86--32'' mode is defined by ``TFM\_X86'' and covers all
i386 and beyond processors. It requires GCC to build and only works with 32--bit digits. In this i386 and beyond processors. It requires GCC to build and only works with 32--bit digits. In this
mode fp\_digit is 32--bits and fp\_word is 64--bits. mode fp\_digit is 32--bits and fp\_word is 64--bits. This mode will be autodetected when building
with GCC to an ``i386'' target. You can override this behaviour by defining TFM\_NO\_ASM or
another optimization mode (such as SSE2).
\subsubsection{SSE2} The ``SSE2'' mode is defined by ``TFM\_SSE2'' and requires a Pentium 4, Pentium \subsubsection{SSE2} The ``SSE2'' mode is defined by ``TFM\_SSE2'' and requires a Pentium 4, Pentium
M or Athlon64 processor. It requires GCC to build. Note that you shouldn't define both M or Athlon64 processor. It requires GCC to build. Note that you shouldn't define both
TFM\_X86 and TFM\_SSE2 at the same time. This mode only works with 32--bit digits. In this TFM\_X86 and TFM\_SSE2 at the same time. This mode only works with 32--bit digits. In this
mode fp\_digit is 32--bits and fp\_word is 64--bits. mode fp\_digit is 32--bits and fp\_word is 64--bits. While this mode will work on the AMD Athlon64
series of processors it is less efficient than the native ``x86--64'' mode and not recommended.
\subsubsection{x86--64} The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires a \subsubsection{x86--64} The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires a
``x86--64'' capable processor (Athlon64 and future Pentium processors). It requires GCC to ``x86--64'' capable processor (Athlon64 and future Pentium processors). It requires GCC to
build and only works with 64--bit digits. Note that by enabling this mode it will automatically build and only works with 64--bit digits. Note that by enabling this mode it will automatically
enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits. enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits. This mode will
be autodetected when building with GCC to an ``x86--64'' target. You can override this behaviour by defining
TFM\_NO\_ASM.
\subsubsection{ARM} The ``ARM'' mode is defined by ``TFM\_ARM'' and requires an ARMv4 or higher \subsubsection{ARM} The ``ARM'' mode is defined by ``TFM\_ARM'' and requires an ARMv4 or higher
processor. It requires GCC and works with 32--bit digits. In this mode fp\_digit is 32--bits and processor. It requires GCC and works with 32--bit digits. In this mode fp\_digit is 32--bits and
@ -163,6 +167,8 @@ Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline pat
\hline Pentium 4 & TFM\_SSE2 \\ \hline Pentium 4 & TFM\_SSE2 \\
\hline Athlon64 & TFM\_X86\_64 \\ \hline Athlon64 & TFM\_X86\_64 \\
\hline ARMv4 or higher & TFM\_ARM \\ \hline ARMv4 or higher & TFM\_ARM \\
\hline &\\
\hline x86--32 or x86--64 (with GCC) & Leave blank and let autodetect work \\
\hline \hline
\end{tabular} \end{tabular}
\caption{Recommended Build Modes} \caption{Recommended Build Modes}
@ -339,9 +345,10 @@ To compute a modular exponentiation use the following function.
\begin{verbatim} \begin{verbatim}
int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d); int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
\end{verbatim} \end{verbatim}
This computes $d \equiv a^b \mbox{ (mod }c)$ for any odd $c$ and positive $b$. The size of $c$ This computes $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for any odd $c$ and $b$. $b$ may be negative so long as
must be half of the maximum precision used during the build of the library. For example, $a^{-1} \mbox{ (mod }c\mbox{)}$ exists. The initial value of $a$ may be larger than $c$. The size of $c$ must be
by default $c$ must be less than $2^{2048}$. half of the maximum precision used during the build of the library. For example, by default $c$ must be less
than $2^{2048}$.
\section{Number Theoretic} \section{Number Theoretic}