added tomsfastmath-0.02

This commit is contained in:
Tom St Denis 2004-09-19 01:31:44 +00:00 committed by Steffen Jaeckel
parent 5e92ed2a59
commit 6bb413fd72
23 changed files with 689 additions and 184 deletions

View File

@ -1,2 +1,21 @@
September 18th, 2004
0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
This also helps prevent killing the cache on smaller cpus.
-- Cast memset to void in fp_init() to catch people who misuse the function (e.g. expect return)
Thanks to Johan Lindh
-- Cleaned up x86-64 support [faster montgomery reductions]
-- Autodetects x86-32 and x86-64 and enables it's asm now
-- Made test demo build cleaner in multilib platforms [e.g. mixed 32/64 bits]
-- Fix to fp_mod to ensure that remainder is of the same sign as the modulus.
-- Fixed bug in fp_montgomery_calc_normalization for single digit moduli
-- cleaned up ISO C macros in comba/mont to avoid branches [works best with GCC 3.4.x branch]
-- Added more testing to tfm.h to help detect misconfigured builds
-- Added TFM_NO_ASM which forces ASM off [even if it was autodetected].
-- Added fp_radix_size() to API
-- Cleaned up demo/test.c to build with far fewer warnings (mostly %d => %lu fixes)
-- fp_exptmod() now supports negative exponent and base>modulus cases
-- Added fp_ident() which gives a string showing how TFM was configured. Useful for debuging...
-- fix gen.pl script so it includes the whole source tree now
August 25th, 2004
TFM 0.01 -- Initial Release
0.01 -- Initial Release

View File

@ -10,7 +10,6 @@ int main(int argc, char **argv)
printf(
"void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n"
"{\n"
" fp_word t;\n"
" fp_digit c0, c1, c2, at[%d];\n"
"\n"
" memcpy(at, A->dp, %d * sizeof(fp_digit));\n"

View File

@ -9,7 +9,6 @@ int main(int argc, char **argv)
printf(
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
"{\n"
" fp_word t;\n"
" fp_digit *a, b[%d], c0, c1, c2;\n"
"\n"
" a = A->dp;\n"

24
delme.c Normal file
View File

@ -0,0 +1,24 @@
#include "tfm.h"
int main(void)
{
fp_int a;
char buf[4096];
fp_init(&a);
fp_read_radix( &a,
"///////////93zgY8MZ2DCJ6Oek0t1pHAG9E28fdp7G22xwcEnER8b5A27cED0JT"
"xvKPiyqwGnimAmfjybyKDq/XDMrjKS95v8MrTc9UViRqJ4BffZVjQml/NBRq1hVj"
"xZXh+rg9dwMkdoGHV4iVvaaePb7iv5izmW1ykA5ZlmMOsaWs75NJccaMFwZz9CzV"
"WsLT8zoZhPOSOlDM88LIkvxLAGTmbfPjPmmrJagyc0JnT6m8oXWXV3AGNaOkDiux"
"uvvtB1WEXWER9uEYx0UYZxN5NV1lJ5B9tYlBzfLO5nWvbKbywfLgvHNI9XYO+WKG"
"5NAEMeggn2sjCnSD151wCwXL8QlV7BfaxFk515ZRxmgAwd5NNGOCVREN3uMcuUJ7"
"g/MkZDi9CzSUZ9JWIYLXdSxZqYOQqkvhyI/w1jcA26JOTW9pFiXgP58VAnWNUo0C"
"k+4NLtfXNMnt2OZ0kjb6uWZYJw1qvQinGzjR/E3z48vBWj4WgJhIol//////////",
64 );
if( fp_isprime( &a ) ) printf("It's prime.\n");
else printf( "Not prime.\n");
return 0;
}

View File

@ -60,6 +60,8 @@ int main(void)
div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
ulong64 t1, t2;
printf("TFM Ident string:\n%s\n\n", fp_ident());
fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
fp_zero(&a); draw(&a);
@ -133,9 +135,30 @@ int main(void)
printf("Testing read_radix\n");
fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
#if 1
/* test mont */
printf("Montgomery test\n");
fp_set(&a, 1);
printf("Montgomery test #1\n");
fp_set(&a, 0x1234567ULL);
fp_montgomery_setup(&a, &fp);
fp_montgomery_calc_normalization(&b, &a);
fp_read_radix(&d, "123456789123", 16);
for (n = 0; n < 100000; n++) {
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
fp_mul(&d, &b, &c);
fp_montgomery_reduce(&c, &a, fp);
if (fp_cmp(&c, &d) != FP_EQ) {
printf("Failed mont %d\n", n);
draw(&a);
draw(&d);
draw(&c);
return EXIT_FAILURE;
}
}
printf("Passed.\n");
printf("Montgomery test #2\n");
fp_set(&a, 0x1234567ULL);
fp_lshd(&a, 4);
fp_add_d(&a, 1, &a);
fp_montgomery_setup(&a, &fp);
@ -158,19 +181,19 @@ int main(void)
/* test for size */
for (ix = 8*DIGIT_BIT; ix < 10*DIGIT_BIT; ix++) {
printf("Testing (not safe-prime): %9d bits \r", ix); fflush(stdout);
printf("Testing (not safe-prime): %9lu bits \r", ix); fflush(stdout);
err = fp_prime_random_ex(&a, 8, ix, (rand()&1)?TFM_PRIME_2MSB_OFF:TFM_PRIME_2MSB_ON, myrng, NULL);
if (err != FP_OKAY) {
printf("failed with err code %d\n", err);
return EXIT_FAILURE;
}
if (fp_count_bits(&a) != ix) {
printf("Prime is %d not %d bits!!!\n", fp_count_bits(&a), ix);
if ((unsigned long)fp_count_bits(&a) != ix) {
printf("Prime is %d not %lu bits!!!\n", fp_count_bits(&a), ix);
return EXIT_FAILURE;
}
}
printf("\n\n");
#endif
#if 0
/* do some timings... */
@ -280,7 +303,7 @@ int main(void)
c.used = t;
t2 = -1;
for (ix = 0; ix < 50; ++ix) {
for (ix = 0; ix < 1024; ++ix) {
t1 = TIMFUNC();
fp_exptmod(&c, &b, &a, &d);
fp_exptmod(&c, &b, &a, &d);
@ -293,7 +316,6 @@ int main(void)
}
#endif
div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;
@ -304,27 +326,27 @@ int main(void)
printf("%s ]\r",cmd); fflush(stdout);
if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_mul_2d(&a, rr, &a);
a.sign = b.sign;
if (fp_cmp(&a, &b) != FP_EQ) {
printf("mul2d failed, rr == %d\n",rr);
printf("mul2d failed, rr == %lu\n",rr);
draw(&a);
draw(&b);
return 0;
}
} else if (!strcmp(cmd, "div2d")) { ++div2d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_div_2d(&a, rr, &a, &e);
a.sign = b.sign;
if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; }
if (fp_cmp(&a, &b) != FP_EQ) {
printf("div2d failed, rr == %d\n",rr);
printf("div2d failed, rr == %lu\n",rr);
draw(&a);
draw(&b);
return 0;
@ -492,7 +514,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
}
} else if (!strcmp(cmd, "add_d")) { ++add_d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_add_d(&a, ix, &c);
if (fp_cmp(&b, &c) != FP_EQ) {
@ -500,12 +522,12 @@ draw(&a);draw(&b);draw(&c);draw(&d);
draw(&a);
draw(&b);
draw(&c);
printf("d == %d\n", ix);
printf("d == %lu\n", ix);
return 0;
}
} else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_sub_d(&a, ix, &c);
if (fp_cmp(&b, &c) != FP_EQ) {
@ -513,12 +535,12 @@ draw(&a);draw(&b);draw(&c);draw(&d);
draw(&a);
draw(&b);
draw(&c);
printf("d == %d\n", ix);
printf("d == %lu\n", ix);
return 0;
}
} else if (!strcmp(cmd, "mul_d")) { ++mul_d_n;
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
fp_mul_d(&a, ix, &c);
if (fp_cmp(&b, &c) != FP_EQ) {
@ -526,7 +548,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
draw(&a);
draw(&b);
draw(&c);
printf("d == %d\n", ix);
printf("d == %lu\n", ix);
return 0;
}
}

Binary file not shown.

View File

@ -13,7 +13,7 @@
* Some restrictions... x must be positive and < b
*/
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
fp_int M[64], res;
fp_digit buf, mp;
@ -34,7 +34,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
}
/* init M array */
memset(M, 0, sizeof(fp_int)*(1<<winsize));
memset(M, 0, sizeof(M));
/* now setup montgomery */
if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
@ -168,3 +168,24 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
fp_copy (&res, Y);
return FP_OKAY;
}
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
fp_int tmp;
int err;
/* is X negative? */
if (X->sign == FP_NEG) {
/* yes, copy G and invmod it */
fp_copy(G, &tmp);
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
return err;
}
/* _fp_exptmod doesn't care about the sign of X */
return _fp_exptmod(&tmp, X, P, Y);
} else {
/* Positive exponent so just exptmod */
return _fp_exptmod(G, X, P, Y);
}
}

66
fp_ident.c Normal file
View File

@ -0,0 +1,66 @@
#include "tfm.h"
const char *fp_ident(void)
{
static char buf[1024];
memset(buf, 0, sizeof(buf));
snprintf(buf, sizeof(buf)-1,
"TomsFastMath (%s)\n"
"\n"
"Sizeofs\n"
"\tfp_digit = %u\n"
"\tfp_word = %u\n"
"\n"
"FP_MAX_SIZE = %u\n"
"\n"
"Defines: \n"
#ifdef __i386__
" __i386__ "
#endif
#ifdef __x86_64__
" __x86_64__ "
#endif
#ifdef TFM_X86
" TFM_X86 "
#endif
#ifdef TFM_X86_64
" TFM_X86_64 "
#endif
#ifdef TFM_SSE2
" TFM_SSE2 "
#endif
#ifdef TFM_ARM
" TFM_ARM "
#endif
#ifdef TFM_NO_ASM
" TFM_NO_ASM "
#endif
#ifdef FP_64BIT
" FP_64BIT "
#endif
#ifdef TFM_LARGE
" TFM_LARGE "
#endif
#ifdef TFM_HUGE
" TFM_HUGE "
#endif
"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE);
if (sizeof(fp_digit) == sizeof(fp_word)) {
strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n",
sizeof(buf)-1);
}
return buf;
}
#ifdef STANDALONE
int main(void)
{
printf("%s\n", fp_ident());
return 0;
}
#endif

View File

@ -12,7 +12,19 @@
/* c = a mod b, 0 <= c < b */
int fp_mod(fp_int *a, fp_int *b, fp_int *c)
{
return fp_div(a, b, NULL, c);
fp_int t;
int err;
fp_zero(&t);
if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
return err;
}
if (t.sign != b->sign) {
fp_add(&t, b, c);
} else {
fp_copy(&t, c);
}
return FP_OKAY;
}

View File

@ -24,7 +24,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
} else {
fp_set(a, 1);
++bits;
bits = 1;
}
/* now compute C = A * B mod b */

View File

@ -66,13 +66,13 @@ asm(
#define PROPCARRY \
asm( \
"movq %1,%%rax \n\t" \
"movq %2,%%rbx \n\t" \
"addq %%rax,%6 \n\t" \
"movq %2,%%rax \n\t" \
"adcq %%rax,%7 \n\t" \
"adcq %%rbx,%7 \n\t" \
"adcq $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%rax", "%cc");
: "%rax", "%rbx", "%cc");
#elif defined(TFM_SSE2)
@ -88,7 +88,7 @@ asm("emms");
asm(\
"movd %0,%%mm1 \n\t" \
"pmuludq %%mm2,%%mm1 \n\t" \
:: "g"(c[x]), "g"(mp));
:: "g"(c[x]));
#define INNERMUL \
asm( \
@ -112,7 +112,7 @@ asm(
"adcl %%eax,%7 \n\t" \
"adcl $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
"g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \
: "%eax", "%cc");
#elif defined(TFM_ARM)
@ -166,14 +166,18 @@ asm( \
mu = c[x] * mp;
#define INNERMUL \
t = ((fp_word)mu) * ((fp_word)*tmpm++); \
_c[OFF0] += t; if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \
_c[OFF1] += (t>>DIGIT_BIT); if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2]; \
do { fp_word t; \
t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \
_c[OFF2] += (t >> DIGIT_BIT); \
} while (0);
#define PROPCARRY \
_c[OFF0+1] += _c[OFF1]; if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \
_c[OFF1+1] += _c[OFF2]; if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1];
do { fp_word t; \
t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \
t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
_c[OFF2+1] += (t >> DIGIT_BIT); \
} while (0);
#endif
@ -187,7 +191,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
int oldused, x, y, pa;
fp_word t;
/* now zero the buff */
pa = m->used;
@ -221,13 +224,13 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
/* fix the rest of the carries */
_c = c + pa;
for (; x < pa * 2 + 2; x++) {
for (x = pa; x < pa * 2 + 2; x++) {
PROPCARRY;
++_c;
}
/* now copy out */
_c = c + pa;
_c = c + pa;
tmpm = a->dp;
for (x = 0; x < pa+1; x++) {
*tmpm++ = *_c++;

View File

@ -28,9 +28,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
fp_mul_comba4(A,B,C);
} else if (y <= 8) {
fp_mul_comba8(A,B,C);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) {
fp_mul_comba16(A,B,C);
#ifdef TFM_HUGE
#endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) {
fp_mul_comba32(A,B,C);
#endif

View File

@ -27,7 +27,7 @@
/* forward the carry to the next digit */
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */
#define COMBA_STORE(x) \
@ -42,7 +42,7 @@
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
asm ( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
@ -62,7 +62,7 @@ asm volatile ( \
/* forward the carry to the next digit */
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */
#define COMBA_STORE(x) \
@ -77,13 +77,13 @@ asm volatile ( \
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2)
/* use SSE2 optimizations */
@ -97,7 +97,7 @@ asm volatile ( \
/* forward the carry to the next digit */
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */
#define COMBA_STORE(x) \
@ -134,7 +134,7 @@ asm volatile ( \
c0 = c1 = c2 = 0;
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_STORE(x) \
x = c0;
@ -155,13 +155,13 @@ asm( \
#else
/* ISO C code */
#define COMBA_START
#define COMBA_START
#define COMBA_CLEAR \
c0 = c1 = c2 = 0;
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_STORE(x) \
x = c0;
@ -169,12 +169,13 @@ asm( \
#define COMBA_STORE2(x) \
x = c1;
#define COMBA_FINI
#define MULADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
#define COMBA_FINI
#define MULADD(i, j) \
do { fp_word t; \
t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
} while (0);
#endif
@ -184,7 +185,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
{
int ix, iy, iz, tx, ty, pa;
fp_digit c0, c1, c2, *tmpx, *tmpy;
fp_word t;
fp_int tmp, *dst;
COMBA_START;
@ -239,7 +239,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[8];
memcpy(at, A->dp, 4 * sizeof(fp_digit));
@ -284,7 +283,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[16];
memcpy(at, A->dp, 8 * sizeof(fp_digit));
@ -358,10 +356,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
COMBA_FINI;
}
#if defined(TFM_LARGE)
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[32];
memcpy(at, A->dp, 16 * sizeof(fp_digit));
@ -499,11 +497,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
COMBA_FINI;
}
#endif /* TFM_LARGE */
#ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[64];
memcpy(at, A->dp, 32 * sizeof(fp_digit));

View File

@ -11,4 +11,39 @@
int fp_radix_size(fp_int *a, int radix, int *size)
{
int digs;
fp_int t;
fp_digit d;
*size = 0;
/* check range of the radix */
if (radix < 2 || radix > 64) {
return FP_VAL;
}
/* quick out if its zero */
if (fp_iszero(a) == 1) {
*size = 2;
return FP_OKAY;
}
fp_init_copy(&t, a);
/* if it is negative output a - */
if (t.sign == FP_NEG) {
*size++;
t.sign = FP_ZPOS;
}
digs = 0;
while (fp_iszero (&t) == FP_NO) {
fp_div_d (&t, (fp_digit) radix, &t, &d);
*size++;
}
/* append a NULL so the string is properly terminated */
*size++;
return FP_OKAY;
}

View File

@ -21,9 +21,11 @@ void fp_sqr(fp_int *A, fp_int *B)
fp_sqr_comba4(A,B);
} else if (y <= 8) {
fp_sqr_comba8(A,B);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) {
fp_sqr_comba16(A,B);
#ifdef TFM_HUGE
#endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) {
fp_sqr_comba32(A,B);
#endif

View File

@ -28,7 +28,7 @@
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI
@ -68,21 +68,21 @@ asm volatile ( \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI
#define SQRADD(i, j) \
asm volatile ( \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc");
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j) \
asm volatile ( \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
@ -91,7 +91,7 @@ asm volatile ( \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2)
@ -109,7 +109,7 @@ asm volatile ( \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI \
asm("emms");
@ -120,11 +120,11 @@ asm volatile ( \
"pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \
"psrlq $32,%%mm0 \n\t" \
"movd %%mm0,%%edx \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"movd %%mm0,%%eax \n\t" \
"adcl %%eax,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j) \
asm volatile ( \
@ -158,7 +158,7 @@ asm volatile ( \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI
@ -187,7 +187,8 @@ asm( \
/* ISO C portable code */
#define COMBA_START
#define COMBA_START \
{ fp_word tt;
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;
@ -199,23 +200,28 @@ asm( \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI
#define COMBA_FINI \
}
/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
#define SQRADD(i, j) \
do { fp_word t; \
t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
} while (0);
/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
#define SQRADD2(i, j) \
do { fp_word t; \
t = ((fp_word)i) * ((fp_word)j); \
tt = (fp_word)c0 + t; c0 = tt; \
tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
tt = (fp_word)c0 + t; c0 = tt; \
tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
} while (0);
#endif
@ -225,7 +231,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
int pa, ix, iz;
fp_digit c0, c1, c2;
fp_int tmp, *dst;
fp_word t;
/* get size of output and trim */
pa = A->used + A->used;
@ -298,7 +303,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
void fp_sqr_comba4(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[8], c0, c1, c2;
a = A->dp;
@ -352,7 +356,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B)
void fp_sqr_comba8(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[16], c0, c1, c2;
a = A->dp;
@ -443,10 +446,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B)
fp_clamp(B);
}
#if defined(TFM_LARGE)
void fp_sqr_comba16(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[32], c0, c1, c2;
a = A->dp;
@ -617,11 +620,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B)
fp_clamp(B);
}
#endif /* TFM_LARGE */
#ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[64], c0, c1, c2;
a = A->dp;

4
gen.pl
View File

@ -6,7 +6,7 @@
use strict;
open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
foreach my $filename (glob "fp_*.c") {
foreach my $filename (glob "*fp_*.c") {
open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
print OUT "/* Start: $filename */\n";
print OUT while <SRC>;
@ -14,4 +14,4 @@ foreach my $filename (glob "fp_*.c") {
close SRC or die "Error closing $filename after reading: $!";
}
print OUT "\n/* EOF */\n";
close OUT or die "Error closing mpi.c after writing: $!";
close OUT or die "Error closing mpi.c after writing: $!";

View File

@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
#speed
CFLAGS += -fomit-frame-pointer
VERSION=0.01
VERSION=0.02
default: libtfm.a
@ -38,8 +38,9 @@ fp_cmp.o fp_cmp_mag.o \
\
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
\
fp_ident.o
libtfm.a: $(OBJECTS)
$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
@ -49,7 +50,7 @@ mtest/mtest: mtest/mtest.c
cd mtest ; make mtest
test: libtfm.a demo/test.o mtest/mtest
$(CC) demo/test.o libtfm.a $(PROF) -o test
$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test
stest: libtfm.a demo/stest.o
$(CC) demo/stest.o libtfm.a -o stest
@ -67,7 +68,7 @@ docs: docdvi
mv -f tfm.pdf doc
clean:
rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest
rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest *~
cd mtest ; make clean
zipup: docs clean

View File

@ -6,4 +6,4 @@ mtest: mtest.o
$(CC) mtest.o -ltommath -o mtest
clean:
rm -f *.o mtest
rm -f *.o mtest *~

View File

@ -235,10 +235,9 @@ int main(void)
rand_num2(&a);
rand_num2(&b);
rand_num2(&c);
// if (c.dp[0]&1) mp_add_d(&c, 1, &c);
a.sign = b.sign = c.sign = 0;
c.dp[0] |= 1;
if (c.used <= 2) continue;
// if (c.used <= 4) continue;
// if (mp_cmp(&a, &c) != MP_LT) continue;
// if (mp_cmp(&b, &c) != MP_LT) continue;
mp_exptmod(&a, &b, &c, &d);

View File

@ -709,7 +709,7 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
* Some restrictions... x must be positive and < b
*/
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
fp_int M[64], res;
fp_digit buf, mp;
@ -730,7 +730,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
}
/* init M array */
memset(M, 0, sizeof(fp_int)*(1<<winsize));
memset(M, 0, sizeof(M));
/* now setup montgomery */
if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
@ -865,6 +865,27 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
return FP_OKAY;
}
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
fp_int tmp;
int err;
/* is X negative? */
if (X->sign == FP_NEG) {
/* yes, copy G and invmod it */
fp_copy(G, &tmp);
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
return err;
}
/* _fp_exptmod doesn't care about the sign of X */
return _fp_exptmod(&tmp, X, P, Y);
} else {
/* Positive exponent so just exptmod */
return _fp_exptmod(G, X, P, Y);
}
}
/* End: fp_exptmod.c */
/* Start: fp_gcd.c */
@ -922,6 +943,76 @@ void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
/* End: fp_gcd.c */
/* Start: fp_ident.c */
#include "tfm.h"
const char *fp_ident(void)
{
static char buf[1024];
memset(buf, 0, sizeof(buf));
snprintf(buf, sizeof(buf)-1,
"TomsFastMath (%s)\n"
"\n"
"Sizeofs\n"
"\tfp_digit = %u\n"
"\tfp_word = %u\n"
"\n"
"FP_MAX_SIZE = %u\n"
"\n"
"Defines: \n"
#ifdef __i386__
" __i386__ "
#endif
#ifdef __x86_64__
" __x86_64__ "
#endif
#ifdef TFM_X86
" TFM_X86 "
#endif
#ifdef TFM_X86_64
" TFM_X86_64 "
#endif
#ifdef TFM_SSE2
" TFM_SSE2 "
#endif
#ifdef TFM_ARM
" TFM_ARM "
#endif
#ifdef TFM_NO_ASM
" TFM_NO_ASM "
#endif
#ifdef FP_64BIT
" FP_64BIT "
#endif
#ifdef TFM_LARGE
" TFM_LARGE "
#endif
#ifdef TFM_HUGE
" TFM_HUGE "
#endif
"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE);
if (sizeof(fp_digit) == sizeof(fp_word)) {
strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n",
sizeof(buf)-1);
}
return buf;
}
#ifdef STANDALONE
int main(void)
{
printf("%s\n", fp_ident());
return 0;
}
#endif
/* End: fp_ident.c */
/* Start: fp_invmod.c */
/* TomsFastMath, a fast ISO C bignum library.
*
@ -1186,7 +1277,19 @@ void fp_lshd(fp_int *a, int x)
/* c = a mod b, 0 <= c < b */
int fp_mod(fp_int *a, fp_int *b, fp_int *c)
{
return fp_div(a, b, NULL, c);
fp_int t;
int err;
fp_zero(&t);
if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
return err;
}
if (t.sign != b->sign) {
fp_add(&t, b, c);
} else {
fp_copy(&t, c);
}
return FP_OKAY;
}
@ -1282,7 +1385,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
} else {
fp_set(a, 1);
++bits;
bits = 1;
}
/* now compute C = A * B mod b */
@ -1366,13 +1469,13 @@ asm(
#define PROPCARRY \
asm( \
"movq %1,%%rax \n\t" \
"movq %2,%%rbx \n\t" \
"addq %%rax,%6 \n\t" \
"movq %2,%%rax \n\t" \
"adcq %%rax,%7 \n\t" \
"adcq %%rbx,%7 \n\t" \
"adcq $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%rax", "%cc");
: "%rax", "%rbx", "%cc");
#elif defined(TFM_SSE2)
@ -1388,7 +1491,7 @@ asm("emms");
asm(\
"movd %0,%%mm1 \n\t" \
"pmuludq %%mm2,%%mm1 \n\t" \
:: "g"(c[x]), "g"(mp));
:: "g"(c[x]));
#define INNERMUL \
asm( \
@ -1412,7 +1515,7 @@ asm(
"adcl %%eax,%7 \n\t" \
"adcl $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
"g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \
: "%eax", "%cc");
#elif defined(TFM_ARM)
@ -1466,14 +1569,18 @@ asm( \
mu = c[x] * mp;
#define INNERMUL \
t = ((fp_word)mu) * ((fp_word)*tmpm++); \
_c[OFF0] += t; if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \
_c[OFF1] += (t>>DIGIT_BIT); if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2]; \
do { fp_word t; \
t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \
_c[OFF2] += (t >> DIGIT_BIT); \
} while (0);
#define PROPCARRY \
_c[OFF0+1] += _c[OFF1]; if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \
_c[OFF1+1] += _c[OFF2]; if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1];
do { fp_word t; \
t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \
t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
_c[OFF2+1] += (t >> DIGIT_BIT); \
} while (0);
#endif
@ -1487,7 +1594,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
int oldused, x, y, pa;
fp_word t;
/* now zero the buff */
pa = m->used;
@ -1521,13 +1627,13 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
/* fix the rest of the carries */
_c = c + pa;
for (; x < pa * 2 + 2; x++) {
for (x = pa; x < pa * 2 + 2; x++) {
PROPCARRY;
++_c;
}
/* now copy out */
_c = c + pa;
_c = c + pa;
tmpm = a->dp;
for (x = 0; x < pa+1; x++) {
*tmpm++ = *_c++;
@ -1629,9 +1735,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
fp_mul_comba4(A,B,C);
} else if (y <= 8) {
fp_mul_comba8(A,B,C);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) {
fp_mul_comba16(A,B,C);
#ifdef TFM_HUGE
#endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) {
fp_mul_comba32(A,B,C);
#endif
@ -1880,7 +1988,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
/* forward the carry to the next digit */
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */
#define COMBA_STORE(x) \
@ -1895,7 +2003,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
asm ( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
@ -1915,7 +2023,7 @@ asm volatile ( \
/* forward the carry to the next digit */
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */
#define COMBA_STORE(x) \
@ -1930,13 +2038,13 @@ asm volatile ( \
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2)
/* use SSE2 optimizations */
@ -1950,7 +2058,7 @@ asm volatile ( \
/* forward the carry to the next digit */
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* store the first sum */
#define COMBA_STORE(x) \
@ -1987,7 +2095,7 @@ asm volatile ( \
c0 = c1 = c2 = 0;
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_STORE(x) \
x = c0;
@ -2008,13 +2116,13 @@ asm( \
#else
/* ISO C code */
#define COMBA_START
#define COMBA_START
#define COMBA_CLEAR \
c0 = c1 = c2 = 0;
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_STORE(x) \
x = c0;
@ -2022,12 +2130,13 @@ asm( \
#define COMBA_STORE2(x) \
x = c1;
#define COMBA_FINI
#define MULADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
#define COMBA_FINI
#define MULADD(i, j) \
do { fp_word t; \
t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
} while (0);
#endif
@ -2037,7 +2146,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
{
int ix, iy, iz, tx, ty, pa;
fp_digit c0, c1, c2, *tmpx, *tmpy;
fp_word t;
fp_int tmp, *dst;
COMBA_START;
@ -2092,7 +2200,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[8];
memcpy(at, A->dp, 4 * sizeof(fp_digit));
@ -2137,7 +2244,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[16];
memcpy(at, A->dp, 8 * sizeof(fp_digit));
@ -2211,10 +2317,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
COMBA_FINI;
}
#if defined(TFM_LARGE)
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[32];
memcpy(at, A->dp, 16 * sizeof(fp_digit));
@ -2352,11 +2458,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
COMBA_FINI;
}
#endif /* TFM_LARGE */
#ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[64];
memcpy(at, A->dp, 32 * sizeof(fp_digit));
@ -2880,6 +2987,41 @@ error:
int fp_radix_size(fp_int *a, int radix, int *size)
{
int digs;
fp_int t;
fp_digit d;
*size = 0;
/* check range of the radix */
if (radix < 2 || radix > 64) {
return FP_VAL;
}
/* quick out if its zero */
if (fp_iszero(a) == 1) {
*size = 2;
return FP_OKAY;
}
fp_init_copy(&t, a);
/* if it is negative output a - */
if (t.sign == FP_NEG) {
*size++;
t.sign = FP_ZPOS;
}
digs = 0;
while (fp_iszero (&t) == FP_NO) {
fp_div_d (&t, (fp_digit) radix, &t, &d);
*size++;
}
/* append a NULL so the string is properly terminated */
*size++;
return FP_OKAY;
}
/* End: fp_radix_size.c */
@ -3161,9 +3303,11 @@ void fp_sqr(fp_int *A, fp_int *B)
fp_sqr_comba4(A,B);
} else if (y <= 8) {
fp_sqr_comba8(A,B);
#if defined(TFM_LARGE)
} else if (y <= 16 && y >= 12) {
fp_sqr_comba16(A,B);
#ifdef TFM_HUGE
#endif
#if defined(TFM_HUGE)
} else if (y <= 32 && y >= 28) {
fp_sqr_comba32(A,B);
#endif
@ -3279,7 +3423,7 @@ Obvious points of optimization
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI
@ -3319,21 +3463,21 @@ asm volatile ( \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI
#define SQRADD(i, j) \
asm volatile ( \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc");
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j) \
asm volatile ( \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
@ -3342,7 +3486,7 @@ asm volatile ( \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2)
@ -3360,7 +3504,7 @@ asm volatile ( \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI \
asm("emms");
@ -3371,11 +3515,11 @@ asm volatile ( \
"pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \
"psrlq $32,%%mm0 \n\t" \
"movd %%mm0,%%edx \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"movd %%mm0,%%eax \n\t" \
"adcl %%eax,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j) \
asm volatile ( \
@ -3409,7 +3553,7 @@ asm volatile ( \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI
@ -3438,7 +3582,8 @@ asm( \
/* ISO C portable code */
#define COMBA_START
#define COMBA_START \
{ fp_word tt;
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;
@ -3450,23 +3595,28 @@ asm( \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define COMBA_FINI
#define COMBA_FINI \
}
/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
#define SQRADD(i, j) \
do { fp_word t; \
t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
} while (0);
/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
#define SQRADD2(i, j) \
do { fp_word t; \
t = ((fp_word)i) * ((fp_word)j); \
tt = (fp_word)c0 + t; c0 = tt; \
tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
tt = (fp_word)c0 + t; c0 = tt; \
tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
} while (0);
#endif
@ -3476,7 +3626,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
int pa, ix, iz;
fp_digit c0, c1, c2;
fp_int tmp, *dst;
fp_word t;
/* get size of output and trim */
pa = A->used + A->used;
@ -3549,7 +3698,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
void fp_sqr_comba4(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[8], c0, c1, c2;
a = A->dp;
@ -3603,7 +3751,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B)
void fp_sqr_comba8(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[16], c0, c1, c2;
a = A->dp;
@ -3694,10 +3841,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B)
fp_clamp(B);
}
#if defined(TFM_LARGE)
void fp_sqr_comba16(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[32], c0, c1, c2;
a = A->dp;
@ -3868,11 +4015,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B)
fp_clamp(B);
}
#endif /* TFM_LARGE */
#ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[64], c0, c1, c2;
a = A->dp;
@ -4455,5 +4603,81 @@ int fp_unsigned_bin_size(fp_int *a)
/* End: fp_unsigned_bin_size.c */
/* Start: s_fp_add.c */
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* unsigned addition */
void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
{
int x, y, oldused;
fp_word t;
y = MAX(a->used, b->used);
oldused = c->used;
c->used = y;
t = 0;
for (x = 0; x < y; x++) {
t += ((fp_word)a->dp[x]) + ((fp_word)b->dp[x]);
c->dp[x] = (fp_digit)t;
t >>= DIGIT_BIT;
}
if (t != 0 && x != FP_SIZE) {
c->dp[c->used++] = (fp_digit)t;
++x;
}
for (; x < oldused; x++) {
c->dp[x] = 0;
}
fp_clamp(c);
}
/* End: s_fp_add.c */
/* Start: s_fp_sub.c */
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
{
int x, oldused;
fp_word t;
oldused = c->used;
c->used = a->used;
t = 0;
for (x = 0; x < a->used; x++) {
t = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t);
c->dp[x] = (fp_digit)t;
t = (t >> DIGIT_BIT) & 1;
}
for (; x < oldused; x++) {
c->dp[x] = 0;
}
fp_clamp(c);
}
/* End: s_fp_sub.c */
/* EOF */

81
tfm.h
View File

@ -21,9 +21,19 @@
#undef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
/* do we want huge code? The answer is, yes. */
/* do we want large code? */
#define TFM_LARGE
/* do we want huge code (implies large)? The answer is, yes. */
#define TFM_HUGE
/* imply TFM_LARGE as required */
#if defined(TFM_HUGE)
#if !defined(TFM_LARGE)
#define TFM_LARGE
#endif
#endif
/* Max size of any number in bits. Basically the largest size you will be multiplying
* should be half [or smaller] of FP_MAX_SIZE-four_digit
*
@ -41,19 +51,69 @@
#error FP_MAX_SIZE must be a multiple of CHAR_BIT
#endif
/* make sure we are using 64-bit digits with x86-64 asm */
/* autodetect x86-64 and make sure we are using 64-bit digits with x86-64 asm */
#if defined(__x86_64__)
#if defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)
#error x86-64 detected, x86-32/SSE2/ARM optimizations are not valid!
#endif
#if !defined(TFM_X86_64) && !defined(TFM_NO_ASM)
#define TFM_X86_64
#endif
#endif
#if defined(TFM_X86_64)
#ifndef FP_64BIT
#if !defined(FP_64BIT)
#define FP_64BIT
#endif
#endif
/* try to detect x86-32 */
#if defined(__i386__) && !defined(TFM_SSE2)
#if defined(TFM_X86_64) || defined(TFM_ARM)
#error x86-32 detected, x86-64/ARM optimizations are not valid!
#endif
#if !defined(TFM_X86) && !defined(TFM_NO_ASM)
#define TFM_X86
#endif
#endif
/* make sure we're 32-bit for x86-32/sse/arm */
#if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT)
#warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining)
#undef FP_64BIT
#endif
/* multi asms? */
#ifdef TFM_X86
#define TFM_ASM
#endif
#ifdef TFM_X86_64
#ifdef TFM_ASM
#error TFM_ASM already defined!
#endif
#define TFM_ASM
#endif
#ifdef TFM_SSE2
#ifdef TFM_ASM
#error TFM_ASM already defined!
#endif
#define TFM_ASM
#endif
#ifdef TFM_ARM
#ifdef TFM_ASM
#error TFM_ASM already defined!
#endif
#define TFM_ASM
#endif
/* we want no asm? */
#ifdef TFM_NO_ASM
#undef TFM_X86
#undef TFM_X86_64
#undef TFM_SSE2
#undef TFM_ARM
#undef TFM_ASM
#endif
/* some default configurations.
*/
#if defined(FP_64BIT)
@ -110,8 +170,11 @@ typedef struct {
/* functions */
/* returns a TFM ident string useful for debugging... */
const char *fp_ident(void);
/* initialize [or zero] an fp int */
#define fp_init(a) memset((a), 0, sizeof(fp_int))
#define fp_init(a) (void)memset((a), 0, sizeof(fp_int))
#define fp_zero(a) fp_init(a)
/* zero/even/odd ? */
@ -123,7 +186,7 @@ typedef struct {
void fp_set(fp_int *a, fp_digit b);
/* copy from a to b */
#define fp_copy(a, b) (((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
#define fp_copy(a, b) (void)(((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
#define fp_init_copy(a, b) fp_copy(b, a)
/* negate and absolute */
@ -139,10 +202,10 @@ void fp_rshd(fp_int *a, int x);
/* left shift x digits */
void fp_lshd(fp_int *a, int x);
/* signed comparisonm */
/* signed comparison */
int fp_cmp(fp_int *a, fp_int *b);
/* unsigned comparisonm */
/* unsigned comparison */
int fp_cmp_mag(fp_int *a, fp_int *b);
/* power of 2 operations */
@ -273,14 +336,18 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
#ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
#endif
#ifdef TFM_LARGE
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C);
#endif
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
void fp_sqr_comba(fp_int *A, fp_int *B);
void fp_sqr_comba4(fp_int *A, fp_int *B);
void fp_sqr_comba8(fp_int *A, fp_int *B);
#ifdef TFM_LARGE
void fp_sqr_comba16(fp_int *A, fp_int *B);
#endif
#ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B);
#endif

25
tfm.tex
View File

@ -49,7 +49,7 @@
\begin{document}
\frontmatter
\pagestyle{empty}
\title{TomsFastMath User Manual \\ v0.01}
\title{TomsFastMath User Manual \\ v0.02}
\author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle
This text and library are all hereby placed in the public domain. This book has been formatted for B5
@ -128,23 +128,27 @@ several ``CFLAGS'' defines.
For example, to build with with SSE2 optimizations type
\begin{verbatim}
export CFLAGS=-DTFM_SSE2
make clean libtfm.a
CFLAGS=-DTFM_SSE2 make clean libtfm.a
\end{verbatim}
\subsubsection{x86--32} The ``x86--32'' mode is defined by ``TFM\_X86'' and covers all
i386 and beyond processors. It requires GCC to build and only works with 32--bit digits. In this
mode fp\_digit is 32--bits and fp\_word is 64--bits.
mode fp\_digit is 32--bits and fp\_word is 64--bits. This mode will be autodetected when building
with GCC to an ``i386'' target. You can override this behaviour by defining TFM\_NO\_ASM or
another optimization mode (such as SSE2).
\subsubsection{SSE2} The ``SSE2'' mode is defined by ``TFM\_SSE2'' and requires a Pentium 4, Pentium
M or Athlon64 processor. It requires GCC to build. Note that you shouldn't define both
TFM\_X86 and TFM\_SSE2 at the same time. This mode only works with 32--bit digits. In this
mode fp\_digit is 32--bits and fp\_word is 64--bits.
mode fp\_digit is 32--bits and fp\_word is 64--bits. While this mode will work on the AMD Athlon64
series of processors it is less efficient than the native ``x86--64'' mode and not recommended.
\subsubsection{x86--64} The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires a
``x86--64'' capable processor (Athlon64 and future Pentium processors). It requires GCC to
build and only works with 64--bit digits. Note that by enabling this mode it will automatically
enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits.
enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits. This mode will
be autodetected when building with GCC to an ``x86--64'' target. You can override this behaviour by defining
TFM\_NO\_ASM.
\subsubsection{ARM} The ``ARM'' mode is defined by ``TFM\_ARM'' and requires a ARMv4 or higher
processor. It requires GCC and works with 32--bit digits. In this mode fp\_digit is 32--bits and
@ -163,6 +167,8 @@ Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline pat
\hline Pentium 4 & TFM\_SSE2 \\
\hline Athlon64 & TFM\_X86\_64 \\
\hline ARMv4 or higher & TFM\_ARM \\
\hline &\\
\hline x86--32 or x86--64 (with GCC) & Leave blank and let autodetect work \\
\hline
\end{tabular}
\caption{Recommended Build Modes}
@ -339,9 +345,10 @@ To compute a modular exponentiation use the following function.
\begin{verbatim}
int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
\end{verbatim}
This computes $d \equiv a^b \mbox{ (mod }c)$ for any odd $c$ and positive $b$. The size of $c$
must be half of the maximum precision used during the build of the library. For example,
by default $c$ must be less than $2^{2048}$.
This computes $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for any odd $c$ and $b$. $b$ may be negative so long as
$a^{-1} \mbox{ (mod }c\mbox{)}$ exists. The initial value of $a$ may be larger than $c$. The size of $c$ must be
half of the maximum precision used during the build of the library. For example, by default $c$ must be less
than $2^{2048}$.
\section{Number Theoretic}