forked from ibphoenix/tomsfastmath
added tomsfastmath-0.02
This commit is contained in:
parent
5e92ed2a59
commit
6bb413fd72
21
changes.txt
21
changes.txt
@ -1,2 +1,21 @@
|
||||
September 18th, 2004
|
||||
0.02 -- Added TFM_LARGE to turn on/off 16x combas to save even more space.
|
||||
This also helps prevent killing the cache on smaller cpus.
|
||||
-- Cast memset to void in fp_init() to catch people who misuse the function (e.g. expect return)
|
||||
Thanks to Johan Lindh
|
||||
-- Cleaned up x86-64 support [faster montgomery reductions]
|
||||
-- Autodetects x86-32 and x86-64 and enables it's asm now
|
||||
-- Made test demo build cleaner in multilib platforms [e.g. mixed 32/64 bits]
|
||||
-- Fix to fp_mod to ensure that remainder is of the same sign as the modulus.
|
||||
-- Fixed bug in fp_montgomery_calc_normalization for single digit moduli
|
||||
-- cleaned up ISO C macros in comba/mont to avoid branches [works best with GCC 3.4.x branch]
|
||||
-- Added more testing to tfm.h to help detect misconfigured builds
|
||||
-- Added TFM_NO_ASM which forces ASM off [even if it was autodetected].
|
||||
-- Added fp_radix_size() to API
|
||||
-- Cleaned up demo/test.c to build with far fewer warnings (mostly %d => %lu fixes)
|
||||
-- fp_exptmod() now supports negative exponent and base>modulus cases
|
||||
-- Added fp_ident() which gives a string showing how TFM was configured. Useful for debuging...
|
||||
-- fix gen.pl script so it includes the whole source tree now
|
||||
|
||||
August 25th, 2004
|
||||
TFM 0.01 -- Initial Release
|
||||
0.01 -- Initial Release
|
||||
|
@ -10,7 +10,6 @@ int main(int argc, char **argv)
|
||||
printf(
|
||||
"void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n"
|
||||
"{\n"
|
||||
" fp_word t;\n"
|
||||
" fp_digit c0, c1, c2, at[%d];\n"
|
||||
"\n"
|
||||
" memcpy(at, A->dp, %d * sizeof(fp_digit));\n"
|
||||
|
@ -9,7 +9,6 @@ int main(int argc, char **argv)
|
||||
printf(
|
||||
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
|
||||
"{\n"
|
||||
" fp_word t;\n"
|
||||
" fp_digit *a, b[%d], c0, c1, c2;\n"
|
||||
"\n"
|
||||
" a = A->dp;\n"
|
||||
|
24
delme.c
Normal file
24
delme.c
Normal file
@ -0,0 +1,24 @@
|
||||
#include "tfm.h"
|
||||
|
||||
int main(void)
|
||||
{
|
||||
fp_int a;
|
||||
char buf[4096];
|
||||
|
||||
fp_init(&a);
|
||||
fp_read_radix( &a,
|
||||
"///////////93zgY8MZ2DCJ6Oek0t1pHAG9E28fdp7G22xwcEnER8b5A27cED0JT"
|
||||
"xvKPiyqwGnimAmfjybyKDq/XDMrjKS95v8MrTc9UViRqJ4BffZVjQml/NBRq1hVj"
|
||||
"xZXh+rg9dwMkdoGHV4iVvaaePb7iv5izmW1ykA5ZlmMOsaWs75NJccaMFwZz9CzV"
|
||||
"WsLT8zoZhPOSOlDM88LIkvxLAGTmbfPjPmmrJagyc0JnT6m8oXWXV3AGNaOkDiux"
|
||||
"uvvtB1WEXWER9uEYx0UYZxN5NV1lJ5B9tYlBzfLO5nWvbKbywfLgvHNI9XYO+WKG"
|
||||
"5NAEMeggn2sjCnSD151wCwXL8QlV7BfaxFk515ZRxmgAwd5NNGOCVREN3uMcuUJ7"
|
||||
"g/MkZDi9CzSUZ9JWIYLXdSxZqYOQqkvhyI/w1jcA26JOTW9pFiXgP58VAnWNUo0C"
|
||||
"k+4NLtfXNMnt2OZ0kjb6uWZYJw1qvQinGzjR/E3z48vBWj4WgJhIol//////////",
|
||||
64 );
|
||||
|
||||
if( fp_isprime( &a ) ) printf("It's prime.\n");
|
||||
else printf( "Not prime.\n");
|
||||
|
||||
return 0;
|
||||
}
|
58
demo/test.c
58
demo/test.c
@ -60,6 +60,8 @@ int main(void)
|
||||
div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
|
||||
ulong64 t1, t2;
|
||||
|
||||
|
||||
printf("TFM Ident string:\n%s\n\n", fp_ident());
|
||||
fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
|
||||
fp_zero(&a); draw(&a);
|
||||
|
||||
@ -133,9 +135,30 @@ int main(void)
|
||||
printf("Testing read_radix\n");
|
||||
fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);
|
||||
|
||||
#if 1
|
||||
/* test mont */
|
||||
printf("Montgomery test\n");
|
||||
fp_set(&a, 1);
|
||||
printf("Montgomery test #1\n");
|
||||
fp_set(&a, 0x1234567ULL);
|
||||
fp_montgomery_setup(&a, &fp);
|
||||
fp_montgomery_calc_normalization(&b, &a);
|
||||
|
||||
fp_read_radix(&d, "123456789123", 16);
|
||||
for (n = 0; n < 100000; n++) {
|
||||
fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
|
||||
fp_mul(&d, &b, &c);
|
||||
fp_montgomery_reduce(&c, &a, fp);
|
||||
if (fp_cmp(&c, &d) != FP_EQ) {
|
||||
printf("Failed mont %d\n", n);
|
||||
draw(&a);
|
||||
draw(&d);
|
||||
draw(&c);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
printf("Passed.\n");
|
||||
|
||||
printf("Montgomery test #2\n");
|
||||
fp_set(&a, 0x1234567ULL);
|
||||
fp_lshd(&a, 4);
|
||||
fp_add_d(&a, 1, &a);
|
||||
fp_montgomery_setup(&a, &fp);
|
||||
@ -158,19 +181,19 @@ int main(void)
|
||||
|
||||
/* test for size */
|
||||
for (ix = 8*DIGIT_BIT; ix < 10*DIGIT_BIT; ix++) {
|
||||
printf("Testing (not safe-prime): %9d bits \r", ix); fflush(stdout);
|
||||
printf("Testing (not safe-prime): %9lu bits \r", ix); fflush(stdout);
|
||||
err = fp_prime_random_ex(&a, 8, ix, (rand()&1)?TFM_PRIME_2MSB_OFF:TFM_PRIME_2MSB_ON, myrng, NULL);
|
||||
if (err != FP_OKAY) {
|
||||
printf("failed with err code %d\n", err);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if (fp_count_bits(&a) != ix) {
|
||||
printf("Prime is %d not %d bits!!!\n", fp_count_bits(&a), ix);
|
||||
if ((unsigned long)fp_count_bits(&a) != ix) {
|
||||
printf("Prime is %d not %lu bits!!!\n", fp_count_bits(&a), ix);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
/* do some timings... */
|
||||
@ -280,7 +303,7 @@ int main(void)
|
||||
c.used = t;
|
||||
|
||||
t2 = -1;
|
||||
for (ix = 0; ix < 50; ++ix) {
|
||||
for (ix = 0; ix < 1024; ++ix) {
|
||||
t1 = TIMFUNC();
|
||||
fp_exptmod(&c, &b, &a, &d);
|
||||
fp_exptmod(&c, &b, &a, &d);
|
||||
@ -293,7 +316,6 @@ int main(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
|
||||
sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;
|
||||
|
||||
@ -304,27 +326,27 @@ int main(void)
|
||||
printf("%s ]\r",cmd); fflush(stdout);
|
||||
if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr);
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
|
||||
|
||||
fp_mul_2d(&a, rr, &a);
|
||||
a.sign = b.sign;
|
||||
if (fp_cmp(&a, &b) != FP_EQ) {
|
||||
printf("mul2d failed, rr == %d\n",rr);
|
||||
printf("mul2d failed, rr == %lu\n",rr);
|
||||
draw(&a);
|
||||
draw(&b);
|
||||
return 0;
|
||||
}
|
||||
} else if (!strcmp(cmd, "div2d")) { ++div2d_n;
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &rr);
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
|
||||
|
||||
fp_div_2d(&a, rr, &a, &e);
|
||||
a.sign = b.sign;
|
||||
if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; }
|
||||
if (fp_cmp(&a, &b) != FP_EQ) {
|
||||
printf("div2d failed, rr == %d\n",rr);
|
||||
printf("div2d failed, rr == %lu\n",rr);
|
||||
draw(&a);
|
||||
draw(&b);
|
||||
return 0;
|
||||
@ -492,7 +514,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
|
||||
}
|
||||
} else if (!strcmp(cmd, "add_d")) { ++add_d_n;
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
|
||||
fp_add_d(&a, ix, &c);
|
||||
if (fp_cmp(&b, &c) != FP_EQ) {
|
||||
@ -500,12 +522,12 @@ draw(&a);draw(&b);draw(&c);draw(&d);
|
||||
draw(&a);
|
||||
draw(&b);
|
||||
draw(&c);
|
||||
printf("d == %d\n", ix);
|
||||
printf("d == %lu\n", ix);
|
||||
return 0;
|
||||
}
|
||||
} else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
|
||||
fp_sub_d(&a, ix, &c);
|
||||
if (fp_cmp(&b, &c) != FP_EQ) {
|
||||
@ -513,12 +535,12 @@ draw(&a);draw(&b);draw(&c);draw(&d);
|
||||
draw(&a);
|
||||
draw(&b);
|
||||
draw(&c);
|
||||
printf("d == %d\n", ix);
|
||||
printf("d == %lu\n", ix);
|
||||
return 0;
|
||||
}
|
||||
} else if (!strcmp(cmd, "mul_d")) { ++mul_d_n;
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&a, buf, 64);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
|
||||
fgets(buf, 4095, stdin); sscanf(buf, "%lu", &ix);
|
||||
fgets(buf, 4095, stdin); fp_read_radix(&b, buf, 64);
|
||||
fp_mul_d(&a, ix, &c);
|
||||
if (fp_cmp(&b, &c) != FP_EQ) {
|
||||
@ -526,7 +548,7 @@ draw(&a);draw(&b);draw(&c);draw(&d);
|
||||
draw(&a);
|
||||
draw(&b);
|
||||
draw(&c);
|
||||
printf("d == %d\n", ix);
|
||||
printf("d == %lu\n", ix);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
BIN
doc/tfm.pdf
BIN
doc/tfm.pdf
Binary file not shown.
25
fp_exptmod.c
25
fp_exptmod.c
@ -13,7 +13,7 @@
|
||||
* Some restrictions... x must be positive and < b
|
||||
*/
|
||||
|
||||
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
{
|
||||
fp_int M[64], res;
|
||||
fp_digit buf, mp;
|
||||
@ -34,7 +34,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
}
|
||||
|
||||
/* init M array */
|
||||
memset(M, 0, sizeof(fp_int)*(1<<winsize));
|
||||
memset(M, 0, sizeof(M));
|
||||
|
||||
/* now setup montgomery */
|
||||
if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
|
||||
@ -168,3 +168,24 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
fp_copy (&res, Y);
|
||||
return FP_OKAY;
|
||||
}
|
||||
|
||||
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
{
|
||||
fp_int tmp;
|
||||
int err;
|
||||
|
||||
/* is X negative? */
|
||||
if (X->sign == FP_NEG) {
|
||||
/* yes, copy G and invmod it */
|
||||
fp_copy(G, &tmp);
|
||||
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
|
||||
return err;
|
||||
}
|
||||
/* _fp_exptmod doesn't care about the sign of X */
|
||||
return _fp_exptmod(&tmp, X, P, Y);
|
||||
} else {
|
||||
/* Positive exponent so just exptmod */
|
||||
return _fp_exptmod(G, X, P, Y);
|
||||
}
|
||||
}
|
||||
|
||||
|
66
fp_ident.c
Normal file
66
fp_ident.c
Normal file
@ -0,0 +1,66 @@
|
||||
#include "tfm.h"
|
||||
|
||||
const char *fp_ident(void)
|
||||
{
|
||||
static char buf[1024];
|
||||
|
||||
memset(buf, 0, sizeof(buf));
|
||||
snprintf(buf, sizeof(buf)-1,
|
||||
"TomsFastMath (%s)\n"
|
||||
"\n"
|
||||
"Sizeofs\n"
|
||||
"\tfp_digit = %u\n"
|
||||
"\tfp_word = %u\n"
|
||||
"\n"
|
||||
"FP_MAX_SIZE = %u\n"
|
||||
"\n"
|
||||
"Defines: \n"
|
||||
#ifdef __i386__
|
||||
" __i386__ "
|
||||
#endif
|
||||
#ifdef __x86_64__
|
||||
" __x86_64__ "
|
||||
#endif
|
||||
#ifdef TFM_X86
|
||||
" TFM_X86 "
|
||||
#endif
|
||||
#ifdef TFM_X86_64
|
||||
" TFM_X86_64 "
|
||||
#endif
|
||||
#ifdef TFM_SSE2
|
||||
" TFM_SSE2 "
|
||||
#endif
|
||||
#ifdef TFM_ARM
|
||||
" TFM_ARM "
|
||||
#endif
|
||||
#ifdef TFM_NO_ASM
|
||||
" TFM_NO_ASM "
|
||||
#endif
|
||||
#ifdef FP_64BIT
|
||||
" FP_64BIT "
|
||||
#endif
|
||||
#ifdef TFM_LARGE
|
||||
" TFM_LARGE "
|
||||
#endif
|
||||
#ifdef TFM_HUGE
|
||||
" TFM_HUGE "
|
||||
#endif
|
||||
"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE);
|
||||
|
||||
if (sizeof(fp_digit) == sizeof(fp_word)) {
|
||||
strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n",
|
||||
sizeof(buf)-1);
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
#ifdef STANDALONE
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("%s\n", fp_ident());
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
14
fp_mod.c
14
fp_mod.c
@ -12,7 +12,19 @@
|
||||
/* c = a mod b, 0 <= c < b */
|
||||
int fp_mod(fp_int *a, fp_int *b, fp_int *c)
|
||||
{
|
||||
return fp_div(a, b, NULL, c);
|
||||
fp_int t;
|
||||
int err;
|
||||
|
||||
fp_zero(&t);
|
||||
if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
|
||||
return err;
|
||||
}
|
||||
if (t.sign != b->sign) {
|
||||
fp_add(&t, b, c);
|
||||
} else {
|
||||
fp_copy(&t, c);
|
||||
}
|
||||
return FP_OKAY;
|
||||
}
|
||||
|
||||
|
||||
|
@ -24,7 +24,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
|
||||
fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
|
||||
} else {
|
||||
fp_set(a, 1);
|
||||
++bits;
|
||||
bits = 1;
|
||||
}
|
||||
|
||||
/* now compute C = A * B mod b */
|
||||
|
@ -66,13 +66,13 @@ asm(
|
||||
#define PROPCARRY \
|
||||
asm( \
|
||||
"movq %1,%%rax \n\t" \
|
||||
"movq %2,%%rbx \n\t" \
|
||||
"addq %%rax,%6 \n\t" \
|
||||
"movq %2,%%rax \n\t" \
|
||||
"adcq %%rax,%7 \n\t" \
|
||||
"adcq %%rbx,%7 \n\t" \
|
||||
"adcq $0,%8 \n\t" \
|
||||
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
|
||||
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
|
||||
: "%rax", "%cc");
|
||||
: "%rax", "%rbx", "%cc");
|
||||
|
||||
#elif defined(TFM_SSE2)
|
||||
|
||||
@ -88,7 +88,7 @@ asm("emms");
|
||||
asm(\
|
||||
"movd %0,%%mm1 \n\t" \
|
||||
"pmuludq %%mm2,%%mm1 \n\t" \
|
||||
:: "g"(c[x]), "g"(mp));
|
||||
:: "g"(c[x]));
|
||||
|
||||
#define INNERMUL \
|
||||
asm( \
|
||||
@ -112,7 +112,7 @@ asm(
|
||||
"adcl %%eax,%7 \n\t" \
|
||||
"adcl $0,%8 \n\t" \
|
||||
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
|
||||
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
|
||||
"g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \
|
||||
: "%eax", "%cc");
|
||||
|
||||
#elif defined(TFM_ARM)
|
||||
@ -166,14 +166,18 @@ asm( \
|
||||
mu = c[x] * mp;
|
||||
|
||||
#define INNERMUL \
|
||||
t = ((fp_word)mu) * ((fp_word)*tmpm++); \
|
||||
_c[OFF0] += t; if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \
|
||||
_c[OFF1] += (t>>DIGIT_BIT); if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2]; \
|
||||
do { fp_word t; \
|
||||
t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
|
||||
t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \
|
||||
_c[OFF2] += (t >> DIGIT_BIT); \
|
||||
} while (0);
|
||||
|
||||
#define PROPCARRY \
|
||||
_c[OFF0+1] += _c[OFF1]; if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \
|
||||
_c[OFF1+1] += _c[OFF2]; if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1];
|
||||
|
||||
do { fp_word t; \
|
||||
t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \
|
||||
t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
|
||||
_c[OFF2+1] += (t >> DIGIT_BIT); \
|
||||
} while (0);
|
||||
|
||||
#endif
|
||||
|
||||
@ -187,7 +191,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
|
||||
{
|
||||
fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
|
||||
int oldused, x, y, pa;
|
||||
fp_word t;
|
||||
|
||||
/* now zero the buff */
|
||||
pa = m->used;
|
||||
@ -221,13 +224,13 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
|
||||
|
||||
/* fix the rest of the carries */
|
||||
_c = c + pa;
|
||||
for (; x < pa * 2 + 2; x++) {
|
||||
for (x = pa; x < pa * 2 + 2; x++) {
|
||||
PROPCARRY;
|
||||
++_c;
|
||||
}
|
||||
|
||||
/* now copy out */
|
||||
_c = c + pa;
|
||||
_c = c + pa;
|
||||
tmpm = a->dp;
|
||||
for (x = 0; x < pa+1; x++) {
|
||||
*tmpm++ = *_c++;
|
||||
|
4
fp_mul.c
4
fp_mul.c
@ -28,9 +28,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
|
||||
fp_mul_comba4(A,B,C);
|
||||
} else if (y <= 8) {
|
||||
fp_mul_comba8(A,B,C);
|
||||
#if defined(TFM_LARGE)
|
||||
} else if (y <= 16 && y >= 12) {
|
||||
fp_mul_comba16(A,B,C);
|
||||
#ifdef TFM_HUGE
|
||||
#endif
|
||||
#if defined(TFM_HUGE)
|
||||
} else if (y <= 32 && y >= 28) {
|
||||
fp_mul_comba32(A,B,C);
|
||||
#endif
|
||||
|
@ -27,7 +27,7 @@
|
||||
|
||||
/* forward the carry to the next digit */
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
/* store the first sum */
|
||||
#define COMBA_STORE(x) \
|
||||
@ -42,7 +42,7 @@
|
||||
|
||||
/* this should multiply i and j */
|
||||
#define MULADD(i, j) \
|
||||
asm volatile ( \
|
||||
asm ( \
|
||||
"movl %6,%%eax \n\t" \
|
||||
"mull %7 \n\t" \
|
||||
"addl %%eax,%0 \n\t" \
|
||||
@ -62,7 +62,7 @@ asm volatile ( \
|
||||
|
||||
/* forward the carry to the next digit */
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
/* store the first sum */
|
||||
#define COMBA_STORE(x) \
|
||||
@ -77,13 +77,13 @@ asm volatile ( \
|
||||
|
||||
/* this should multiply i and j */
|
||||
#define MULADD(i, j) \
|
||||
asm volatile ( \
|
||||
asm ( \
|
||||
"movq %6,%%rax \n\t" \
|
||||
"mulq %7 \n\t" \
|
||||
"addq %%rax,%0 \n\t" \
|
||||
"adcq %%rdx,%1 \n\t" \
|
||||
"adcq $0,%2 \n\t" \
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
|
||||
|
||||
#elif defined(TFM_SSE2)
|
||||
/* use SSE2 optimizations */
|
||||
@ -97,7 +97,7 @@ asm volatile ( \
|
||||
|
||||
/* forward the carry to the next digit */
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
/* store the first sum */
|
||||
#define COMBA_STORE(x) \
|
||||
@ -134,7 +134,7 @@ asm volatile ( \
|
||||
c0 = c1 = c2 = 0;
|
||||
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_STORE(x) \
|
||||
x = c0;
|
||||
@ -155,13 +155,13 @@ asm( \
|
||||
#else
|
||||
/* ISO C code */
|
||||
|
||||
#define COMBA_START
|
||||
#define COMBA_START
|
||||
|
||||
#define COMBA_CLEAR \
|
||||
c0 = c1 = c2 = 0;
|
||||
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_STORE(x) \
|
||||
x = c0;
|
||||
@ -169,12 +169,13 @@ asm( \
|
||||
#define COMBA_STORE2(x) \
|
||||
x = c1;
|
||||
|
||||
#define COMBA_FINI
|
||||
|
||||
#define MULADD(i, j) \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
|
||||
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
|
||||
#define COMBA_FINI
|
||||
|
||||
#define MULADD(i, j) \
|
||||
do { fp_word t; \
|
||||
t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
|
||||
t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
|
||||
} while (0);
|
||||
|
||||
#endif
|
||||
|
||||
@ -184,7 +185,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
int ix, iy, iz, tx, ty, pa;
|
||||
fp_digit c0, c1, c2, *tmpx, *tmpy;
|
||||
fp_word t;
|
||||
fp_int tmp, *dst;
|
||||
|
||||
COMBA_START;
|
||||
@ -239,7 +239,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
|
||||
|
||||
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit c0, c1, c2, at[8];
|
||||
|
||||
memcpy(at, A->dp, 4 * sizeof(fp_digit));
|
||||
@ -284,7 +283,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
|
||||
|
||||
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit c0, c1, c2, at[16];
|
||||
|
||||
memcpy(at, A->dp, 8 * sizeof(fp_digit));
|
||||
@ -358,10 +356,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
|
||||
COMBA_FINI;
|
||||
}
|
||||
|
||||
#if defined(TFM_LARGE)
|
||||
|
||||
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit c0, c1, c2, at[32];
|
||||
|
||||
memcpy(at, A->dp, 16 * sizeof(fp_digit));
|
||||
@ -499,11 +497,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
|
||||
COMBA_FINI;
|
||||
}
|
||||
|
||||
#endif /* TFM_LARGE */
|
||||
|
||||
#ifdef TFM_HUGE
|
||||
|
||||
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit c0, c1, c2, at[64];
|
||||
|
||||
memcpy(at, A->dp, 32 * sizeof(fp_digit));
|
||||
|
@ -11,4 +11,39 @@
|
||||
|
||||
int fp_radix_size(fp_int *a, int radix, int *size)
|
||||
{
|
||||
int digs;
|
||||
fp_int t;
|
||||
fp_digit d;
|
||||
|
||||
*size = 0;
|
||||
|
||||
/* check range of the radix */
|
||||
if (radix < 2 || radix > 64) {
|
||||
return FP_VAL;
|
||||
}
|
||||
|
||||
/* quick out if its zero */
|
||||
if (fp_iszero(a) == 1) {
|
||||
*size = 2;
|
||||
return FP_OKAY;
|
||||
}
|
||||
|
||||
fp_init_copy(&t, a);
|
||||
|
||||
/* if it is negative output a - */
|
||||
if (t.sign == FP_NEG) {
|
||||
*size++;
|
||||
t.sign = FP_ZPOS;
|
||||
}
|
||||
|
||||
digs = 0;
|
||||
while (fp_iszero (&t) == FP_NO) {
|
||||
fp_div_d (&t, (fp_digit) radix, &t, &d);
|
||||
*size++;
|
||||
}
|
||||
|
||||
/* append a NULL so the string is properly terminated */
|
||||
*size++;
|
||||
return FP_OKAY;
|
||||
|
||||
}
|
||||
|
4
fp_sqr.c
4
fp_sqr.c
@ -21,9 +21,11 @@ void fp_sqr(fp_int *A, fp_int *B)
|
||||
fp_sqr_comba4(A,B);
|
||||
} else if (y <= 8) {
|
||||
fp_sqr_comba8(A,B);
|
||||
#if defined(TFM_LARGE)
|
||||
} else if (y <= 16 && y >= 12) {
|
||||
fp_sqr_comba16(A,B);
|
||||
#ifdef TFM_HUGE
|
||||
#endif
|
||||
#if defined(TFM_HUGE)
|
||||
} else if (y <= 32 && y >= 28) {
|
||||
fp_sqr_comba32(A,B);
|
||||
#endif
|
||||
|
@ -28,7 +28,7 @@
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI
|
||||
|
||||
@ -68,21 +68,21 @@ asm volatile ( \
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI
|
||||
|
||||
#define SQRADD(i, j) \
|
||||
asm volatile ( \
|
||||
asm ( \
|
||||
"movq %6,%%rax \n\t" \
|
||||
"mulq %%rax \n\t" \
|
||||
"addq %%rax,%0 \n\t" \
|
||||
"adcq %%rdx,%1 \n\t" \
|
||||
"adcq $0,%2 \n\t" \
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc");
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
|
||||
|
||||
#define SQRADD2(i, j) \
|
||||
asm volatile ( \
|
||||
asm ( \
|
||||
"movq %6,%%rax \n\t" \
|
||||
"mulq %7 \n\t" \
|
||||
"addq %%rax,%0 \n\t" \
|
||||
@ -91,7 +91,7 @@ asm volatile ( \
|
||||
"addq %%rax,%0 \n\t" \
|
||||
"adcq %%rdx,%1 \n\t" \
|
||||
"adcq $0,%2 \n\t" \
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
|
||||
|
||||
|
||||
#elif defined(TFM_SSE2)
|
||||
@ -109,7 +109,7 @@ asm volatile ( \
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI \
|
||||
asm("emms");
|
||||
@ -120,11 +120,11 @@ asm volatile ( \
|
||||
"pmuludq %%mm0,%%mm0\n\t" \
|
||||
"movd %%mm0,%%eax \n\t" \
|
||||
"psrlq $32,%%mm0 \n\t" \
|
||||
"movd %%mm0,%%edx \n\t" \
|
||||
"addl %%eax,%0 \n\t" \
|
||||
"adcl %%edx,%1 \n\t" \
|
||||
"movd %%mm0,%%eax \n\t" \
|
||||
"adcl %%eax,%1 \n\t" \
|
||||
"adcl $0,%2 \n\t" \
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
|
||||
|
||||
#define SQRADD2(i, j) \
|
||||
asm volatile ( \
|
||||
@ -158,7 +158,7 @@ asm volatile ( \
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI
|
||||
|
||||
@ -187,7 +187,8 @@ asm( \
|
||||
|
||||
/* ISO C portable code */
|
||||
|
||||
#define COMBA_START
|
||||
#define COMBA_START \
|
||||
{ fp_word tt;
|
||||
|
||||
#define CLEAR_CARRY \
|
||||
c0 = c1 = c2 = 0;
|
||||
@ -199,23 +200,28 @@ asm( \
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI
|
||||
#define COMBA_FINI \
|
||||
}
|
||||
|
||||
/* multiplies point i and j, updates carry "c1" and digit c2 */
|
||||
#define SQRADD(i, j) \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
|
||||
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
|
||||
#define SQRADD(i, j) \
|
||||
do { fp_word t; \
|
||||
t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
|
||||
t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
|
||||
} while (0);
|
||||
|
||||
|
||||
/* for squaring some of the terms are doubled... */
|
||||
#define SQRADD2(i, j) \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
|
||||
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \
|
||||
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
|
||||
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
|
||||
#define SQRADD2(i, j) \
|
||||
do { fp_word t; \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
tt = (fp_word)c0 + t; c0 = tt; \
|
||||
tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
|
||||
tt = (fp_word)c0 + t; c0 = tt; \
|
||||
tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
|
||||
} while (0);
|
||||
|
||||
#endif
|
||||
|
||||
@ -225,7 +231,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
|
||||
int pa, ix, iz;
|
||||
fp_digit c0, c1, c2;
|
||||
fp_int tmp, *dst;
|
||||
fp_word t;
|
||||
|
||||
/* get size of output and trim */
|
||||
pa = A->used + A->used;
|
||||
@ -298,7 +303,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
|
||||
|
||||
void fp_sqr_comba4(fp_int *A, fp_int *B)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit *a, b[8], c0, c1, c2;
|
||||
|
||||
a = A->dp;
|
||||
@ -352,7 +356,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B)
|
||||
|
||||
void fp_sqr_comba8(fp_int *A, fp_int *B)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit *a, b[16], c0, c1, c2;
|
||||
|
||||
a = A->dp;
|
||||
@ -443,10 +446,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B)
|
||||
fp_clamp(B);
|
||||
}
|
||||
|
||||
#if defined(TFM_LARGE)
|
||||
|
||||
void fp_sqr_comba16(fp_int *A, fp_int *B)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit *a, b[32], c0, c1, c2;
|
||||
|
||||
a = A->dp;
|
||||
@ -617,11 +620,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B)
|
||||
fp_clamp(B);
|
||||
}
|
||||
|
||||
#endif /* TFM_LARGE */
|
||||
|
||||
#ifdef TFM_HUGE
|
||||
|
||||
void fp_sqr_comba32(fp_int *A, fp_int *B)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit *a, b[64], c0, c1, c2;
|
||||
|
||||
a = A->dp;
|
||||
|
4
gen.pl
4
gen.pl
@ -6,7 +6,7 @@
|
||||
use strict;
|
||||
|
||||
open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
|
||||
foreach my $filename (glob "fp_*.c") {
|
||||
foreach my $filename (glob "*fp_*.c") {
|
||||
open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
|
||||
print OUT "/* Start: $filename */\n";
|
||||
print OUT while <SRC>;
|
||||
@ -14,4 +14,4 @@ foreach my $filename (glob "fp_*.c") {
|
||||
close SRC or die "Error closing $filename after reading: $!";
|
||||
}
|
||||
print OUT "\n/* EOF */\n";
|
||||
close OUT or die "Error closing mpi.c after writing: $!";
|
||||
close OUT or die "Error closing mpi.c after writing: $!";
|
||||
|
9
makefile
9
makefile
@ -10,7 +10,7 @@ CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops
|
||||
#speed
|
||||
CFLAGS += -fomit-frame-pointer
|
||||
|
||||
VERSION=0.01
|
||||
VERSION=0.02
|
||||
|
||||
default: libtfm.a
|
||||
|
||||
@ -38,8 +38,9 @@ fp_cmp.o fp_cmp_mag.o \
|
||||
\
|
||||
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
|
||||
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
|
||||
fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
|
||||
fp_read_radix.o fp_toradix.o fp_radix_size.o fp_count_bits.o fp_reverse.o fp_s_rmap.o \
|
||||
\
|
||||
fp_ident.o
|
||||
|
||||
libtfm.a: $(OBJECTS)
|
||||
$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
|
||||
@ -49,7 +50,7 @@ mtest/mtest: mtest/mtest.c
|
||||
cd mtest ; make mtest
|
||||
|
||||
test: libtfm.a demo/test.o mtest/mtest
|
||||
$(CC) demo/test.o libtfm.a $(PROF) -o test
|
||||
$(CC) $(CFLAGS) demo/test.o libtfm.a $(PROF) -o test
|
||||
|
||||
stest: libtfm.a demo/stest.o
|
||||
$(CC) demo/stest.o libtfm.a -o stest
|
||||
@ -67,7 +68,7 @@ docs: docdvi
|
||||
mv -f tfm.pdf doc
|
||||
|
||||
clean:
|
||||
rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest
|
||||
rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest *~
|
||||
cd mtest ; make clean
|
||||
|
||||
zipup: docs clean
|
||||
|
@ -6,4 +6,4 @@ mtest: mtest.o
|
||||
$(CC) mtest.o -ltommath -o mtest
|
||||
|
||||
clean:
|
||||
rm -f *.o mtest
|
||||
rm -f *.o mtest *~
|
||||
|
@ -235,10 +235,9 @@ int main(void)
|
||||
rand_num2(&a);
|
||||
rand_num2(&b);
|
||||
rand_num2(&c);
|
||||
// if (c.dp[0]&1) mp_add_d(&c, 1, &c);
|
||||
a.sign = b.sign = c.sign = 0;
|
||||
c.dp[0] |= 1;
|
||||
if (c.used <= 2) continue;
|
||||
// if (c.used <= 4) continue;
|
||||
// if (mp_cmp(&a, &c) != MP_LT) continue;
|
||||
// if (mp_cmp(&b, &c) != MP_LT) continue;
|
||||
mp_exptmod(&a, &b, &c, &d);
|
||||
|
362
pre_gen/mpi.c
362
pre_gen/mpi.c
@ -709,7 +709,7 @@ int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
|
||||
* Some restrictions... x must be positive and < b
|
||||
*/
|
||||
|
||||
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
{
|
||||
fp_int M[64], res;
|
||||
fp_digit buf, mp;
|
||||
@ -730,7 +730,7 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
}
|
||||
|
||||
/* init M array */
|
||||
memset(M, 0, sizeof(fp_int)*(1<<winsize));
|
||||
memset(M, 0, sizeof(M));
|
||||
|
||||
/* now setup montgomery */
|
||||
if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
|
||||
@ -865,6 +865,27 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
return FP_OKAY;
|
||||
}
|
||||
|
||||
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
|
||||
{
|
||||
fp_int tmp;
|
||||
int err;
|
||||
|
||||
/* is X negative? */
|
||||
if (X->sign == FP_NEG) {
|
||||
/* yes, copy G and invmod it */
|
||||
fp_copy(G, &tmp);
|
||||
if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
|
||||
return err;
|
||||
}
|
||||
/* _fp_exptmod doesn't care about the sign of X */
|
||||
return _fp_exptmod(&tmp, X, P, Y);
|
||||
} else {
|
||||
/* Positive exponent so just exptmod */
|
||||
return _fp_exptmod(G, X, P, Y);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* End: fp_exptmod.c */
|
||||
|
||||
/* Start: fp_gcd.c */
|
||||
@ -922,6 +943,76 @@ void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
|
||||
|
||||
/* End: fp_gcd.c */
|
||||
|
||||
/* Start: fp_ident.c */
|
||||
#include "tfm.h"
|
||||
|
||||
const char *fp_ident(void)
|
||||
{
|
||||
static char buf[1024];
|
||||
|
||||
memset(buf, 0, sizeof(buf));
|
||||
snprintf(buf, sizeof(buf)-1,
|
||||
"TomsFastMath (%s)\n"
|
||||
"\n"
|
||||
"Sizeofs\n"
|
||||
"\tfp_digit = %u\n"
|
||||
"\tfp_word = %u\n"
|
||||
"\n"
|
||||
"FP_MAX_SIZE = %u\n"
|
||||
"\n"
|
||||
"Defines: \n"
|
||||
#ifdef __i386__
|
||||
" __i386__ "
|
||||
#endif
|
||||
#ifdef __x86_64__
|
||||
" __x86_64__ "
|
||||
#endif
|
||||
#ifdef TFM_X86
|
||||
" TFM_X86 "
|
||||
#endif
|
||||
#ifdef TFM_X86_64
|
||||
" TFM_X86_64 "
|
||||
#endif
|
||||
#ifdef TFM_SSE2
|
||||
" TFM_SSE2 "
|
||||
#endif
|
||||
#ifdef TFM_ARM
|
||||
" TFM_ARM "
|
||||
#endif
|
||||
#ifdef TFM_NO_ASM
|
||||
" TFM_NO_ASM "
|
||||
#endif
|
||||
#ifdef FP_64BIT
|
||||
" FP_64BIT "
|
||||
#endif
|
||||
#ifdef TFM_LARGE
|
||||
" TFM_LARGE "
|
||||
#endif
|
||||
#ifdef TFM_HUGE
|
||||
" TFM_HUGE "
|
||||
#endif
|
||||
"\n", __DATE__, sizeof(fp_digit), sizeof(fp_word), FP_MAX_SIZE);
|
||||
|
||||
if (sizeof(fp_digit) == sizeof(fp_word)) {
|
||||
strncat(buf, "WARNING: sizeof(fp_digit) == sizeof(fp_word), this build is likely to not work properly.\n",
|
||||
sizeof(buf)-1);
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
#ifdef STANDALONE
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("%s\n", fp_ident());
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* End: fp_ident.c */
|
||||
|
||||
/* Start: fp_invmod.c */
|
||||
/* TomsFastMath, a fast ISO C bignum library.
|
||||
*
|
||||
@ -1186,7 +1277,19 @@ void fp_lshd(fp_int *a, int x)
|
||||
/* c = a mod b, 0 <= c < b */
|
||||
int fp_mod(fp_int *a, fp_int *b, fp_int *c)
|
||||
{
|
||||
return fp_div(a, b, NULL, c);
|
||||
fp_int t;
|
||||
int err;
|
||||
|
||||
fp_zero(&t);
|
||||
if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
|
||||
return err;
|
||||
}
|
||||
if (t.sign != b->sign) {
|
||||
fp_add(&t, b, c);
|
||||
} else {
|
||||
fp_copy(&t, c);
|
||||
}
|
||||
return FP_OKAY;
|
||||
}
|
||||
|
||||
|
||||
@ -1282,7 +1385,7 @@ void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
|
||||
fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
|
||||
} else {
|
||||
fp_set(a, 1);
|
||||
++bits;
|
||||
bits = 1;
|
||||
}
|
||||
|
||||
/* now compute C = A * B mod b */
|
||||
@ -1366,13 +1469,13 @@ asm(
|
||||
#define PROPCARRY \
|
||||
asm( \
|
||||
"movq %1,%%rax \n\t" \
|
||||
"movq %2,%%rbx \n\t" \
|
||||
"addq %%rax,%6 \n\t" \
|
||||
"movq %2,%%rax \n\t" \
|
||||
"adcq %%rax,%7 \n\t" \
|
||||
"adcq %%rbx,%7 \n\t" \
|
||||
"adcq $0,%8 \n\t" \
|
||||
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
|
||||
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
|
||||
: "%rax", "%cc");
|
||||
: "%rax", "%rbx", "%cc");
|
||||
|
||||
#elif defined(TFM_SSE2)
|
||||
|
||||
@ -1388,7 +1491,7 @@ asm("emms");
|
||||
asm(\
|
||||
"movd %0,%%mm1 \n\t" \
|
||||
"pmuludq %%mm2,%%mm1 \n\t" \
|
||||
:: "g"(c[x]), "g"(mp));
|
||||
:: "g"(c[x]));
|
||||
|
||||
#define INNERMUL \
|
||||
asm( \
|
||||
@ -1412,7 +1515,7 @@ asm(
|
||||
"adcl %%eax,%7 \n\t" \
|
||||
"adcl $0,%8 \n\t" \
|
||||
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
|
||||
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
|
||||
"g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \
|
||||
: "%eax", "%cc");
|
||||
|
||||
#elif defined(TFM_ARM)
|
||||
@ -1466,14 +1569,18 @@ asm( \
|
||||
mu = c[x] * mp;
|
||||
|
||||
#define INNERMUL \
|
||||
t = ((fp_word)mu) * ((fp_word)*tmpm++); \
|
||||
_c[OFF0] += t; if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \
|
||||
_c[OFF1] += (t>>DIGIT_BIT); if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2]; \
|
||||
do { fp_word t; \
|
||||
t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
|
||||
t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \
|
||||
_c[OFF2] += (t >> DIGIT_BIT); \
|
||||
} while (0);
|
||||
|
||||
#define PROPCARRY \
|
||||
_c[OFF0+1] += _c[OFF1]; if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \
|
||||
_c[OFF1+1] += _c[OFF2]; if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1];
|
||||
|
||||
do { fp_word t; \
|
||||
t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \
|
||||
t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
|
||||
_c[OFF2+1] += (t >> DIGIT_BIT); \
|
||||
} while (0);
|
||||
|
||||
#endif
|
||||
|
||||
@ -1487,7 +1594,6 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
|
||||
{
|
||||
fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
|
||||
int oldused, x, y, pa;
|
||||
fp_word t;
|
||||
|
||||
/* now zero the buff */
|
||||
pa = m->used;
|
||||
@ -1521,13 +1627,13 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
|
||||
|
||||
/* fix the rest of the carries */
|
||||
_c = c + pa;
|
||||
for (; x < pa * 2 + 2; x++) {
|
||||
for (x = pa; x < pa * 2 + 2; x++) {
|
||||
PROPCARRY;
|
||||
++_c;
|
||||
}
|
||||
|
||||
/* now copy out */
|
||||
_c = c + pa;
|
||||
_c = c + pa;
|
||||
tmpm = a->dp;
|
||||
for (x = 0; x < pa+1; x++) {
|
||||
*tmpm++ = *_c++;
|
||||
@ -1629,9 +1735,11 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
|
||||
fp_mul_comba4(A,B,C);
|
||||
} else if (y <= 8) {
|
||||
fp_mul_comba8(A,B,C);
|
||||
#if defined(TFM_LARGE)
|
||||
} else if (y <= 16 && y >= 12) {
|
||||
fp_mul_comba16(A,B,C);
|
||||
#ifdef TFM_HUGE
|
||||
#endif
|
||||
#if defined(TFM_HUGE)
|
||||
} else if (y <= 32 && y >= 28) {
|
||||
fp_mul_comba32(A,B,C);
|
||||
#endif
|
||||
@ -1880,7 +1988,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
|
||||
|
||||
/* forward the carry to the next digit */
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
/* store the first sum */
|
||||
#define COMBA_STORE(x) \
|
||||
@ -1895,7 +2003,7 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c)
|
||||
|
||||
/* this should multiply i and j */
|
||||
#define MULADD(i, j) \
|
||||
asm volatile ( \
|
||||
asm ( \
|
||||
"movl %6,%%eax \n\t" \
|
||||
"mull %7 \n\t" \
|
||||
"addl %%eax,%0 \n\t" \
|
||||
@ -1915,7 +2023,7 @@ asm volatile ( \
|
||||
|
||||
/* forward the carry to the next digit */
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
/* store the first sum */
|
||||
#define COMBA_STORE(x) \
|
||||
@ -1930,13 +2038,13 @@ asm volatile ( \
|
||||
|
||||
/* this should multiply i and j */
|
||||
#define MULADD(i, j) \
|
||||
asm volatile ( \
|
||||
asm ( \
|
||||
"movq %6,%%rax \n\t" \
|
||||
"mulq %7 \n\t" \
|
||||
"addq %%rax,%0 \n\t" \
|
||||
"adcq %%rdx,%1 \n\t" \
|
||||
"adcq $0,%2 \n\t" \
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
|
||||
|
||||
#elif defined(TFM_SSE2)
|
||||
/* use SSE2 optimizations */
|
||||
@ -1950,7 +2058,7 @@ asm volatile ( \
|
||||
|
||||
/* forward the carry to the next digit */
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
/* store the first sum */
|
||||
#define COMBA_STORE(x) \
|
||||
@ -1987,7 +2095,7 @@ asm volatile ( \
|
||||
c0 = c1 = c2 = 0;
|
||||
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_STORE(x) \
|
||||
x = c0;
|
||||
@ -2008,13 +2116,13 @@ asm( \
|
||||
#else
|
||||
/* ISO C code */
|
||||
|
||||
#define COMBA_START
|
||||
#define COMBA_START
|
||||
|
||||
#define COMBA_CLEAR \
|
||||
c0 = c1 = c2 = 0;
|
||||
|
||||
#define COMBA_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_STORE(x) \
|
||||
x = c0;
|
||||
@ -2022,12 +2130,13 @@ asm( \
|
||||
#define COMBA_STORE2(x) \
|
||||
x = c1;
|
||||
|
||||
#define COMBA_FINI
|
||||
|
||||
#define MULADD(i, j) \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
|
||||
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
|
||||
#define COMBA_FINI
|
||||
|
||||
#define MULADD(i, j) \
|
||||
do { fp_word t; \
|
||||
t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
|
||||
t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
|
||||
} while (0);
|
||||
|
||||
#endif
|
||||
|
||||
@ -2037,7 +2146,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
int ix, iy, iz, tx, ty, pa;
|
||||
fp_digit c0, c1, c2, *tmpx, *tmpy;
|
||||
fp_word t;
|
||||
fp_int tmp, *dst;
|
||||
|
||||
COMBA_START;
|
||||
@ -2092,7 +2200,6 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
|
||||
|
||||
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit c0, c1, c2, at[8];
|
||||
|
||||
memcpy(at, A->dp, 4 * sizeof(fp_digit));
|
||||
@ -2137,7 +2244,6 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
|
||||
|
||||
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit c0, c1, c2, at[16];
|
||||
|
||||
memcpy(at, A->dp, 8 * sizeof(fp_digit));
|
||||
@ -2211,10 +2317,10 @@ void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
|
||||
COMBA_FINI;
|
||||
}
|
||||
|
||||
#if defined(TFM_LARGE)
|
||||
|
||||
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit c0, c1, c2, at[32];
|
||||
|
||||
memcpy(at, A->dp, 16 * sizeof(fp_digit));
|
||||
@ -2352,11 +2458,12 @@ void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
|
||||
COMBA_FINI;
|
||||
}
|
||||
|
||||
#endif /* TFM_LARGE */
|
||||
|
||||
#ifdef TFM_HUGE
|
||||
|
||||
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit c0, c1, c2, at[64];
|
||||
|
||||
memcpy(at, A->dp, 32 * sizeof(fp_digit));
|
||||
@ -2880,6 +2987,41 @@ error:
|
||||
|
||||
int fp_radix_size(fp_int *a, int radix, int *size)
|
||||
{
|
||||
int digs;
|
||||
fp_int t;
|
||||
fp_digit d;
|
||||
|
||||
*size = 0;
|
||||
|
||||
/* check range of the radix */
|
||||
if (radix < 2 || radix > 64) {
|
||||
return FP_VAL;
|
||||
}
|
||||
|
||||
/* quick out if its zero */
|
||||
if (fp_iszero(a) == 1) {
|
||||
*size = 2;
|
||||
return FP_OKAY;
|
||||
}
|
||||
|
||||
fp_init_copy(&t, a);
|
||||
|
||||
/* if it is negative output a - */
|
||||
if (t.sign == FP_NEG) {
|
||||
*size++;
|
||||
t.sign = FP_ZPOS;
|
||||
}
|
||||
|
||||
digs = 0;
|
||||
while (fp_iszero (&t) == FP_NO) {
|
||||
fp_div_d (&t, (fp_digit) radix, &t, &d);
|
||||
*size++;
|
||||
}
|
||||
|
||||
/* append a NULL so the string is properly terminated */
|
||||
*size++;
|
||||
return FP_OKAY;
|
||||
|
||||
}
|
||||
|
||||
/* End: fp_radix_size.c */
|
||||
@ -3161,9 +3303,11 @@ void fp_sqr(fp_int *A, fp_int *B)
|
||||
fp_sqr_comba4(A,B);
|
||||
} else if (y <= 8) {
|
||||
fp_sqr_comba8(A,B);
|
||||
#if defined(TFM_LARGE)
|
||||
} else if (y <= 16 && y >= 12) {
|
||||
fp_sqr_comba16(A,B);
|
||||
#ifdef TFM_HUGE
|
||||
#endif
|
||||
#if defined(TFM_HUGE)
|
||||
} else if (y <= 32 && y >= 28) {
|
||||
fp_sqr_comba32(A,B);
|
||||
#endif
|
||||
@ -3279,7 +3423,7 @@ Obvious points of optimization
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI
|
||||
|
||||
@ -3319,21 +3463,21 @@ asm volatile ( \
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI
|
||||
|
||||
#define SQRADD(i, j) \
|
||||
asm volatile ( \
|
||||
asm ( \
|
||||
"movq %6,%%rax \n\t" \
|
||||
"mulq %%rax \n\t" \
|
||||
"addq %%rax,%0 \n\t" \
|
||||
"adcq %%rdx,%1 \n\t" \
|
||||
"adcq $0,%2 \n\t" \
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc");
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
|
||||
|
||||
#define SQRADD2(i, j) \
|
||||
asm volatile ( \
|
||||
asm ( \
|
||||
"movq %6,%%rax \n\t" \
|
||||
"mulq %7 \n\t" \
|
||||
"addq %%rax,%0 \n\t" \
|
||||
@ -3342,7 +3486,7 @@ asm volatile ( \
|
||||
"addq %%rax,%0 \n\t" \
|
||||
"adcq %%rdx,%1 \n\t" \
|
||||
"adcq $0,%2 \n\t" \
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
|
||||
|
||||
|
||||
#elif defined(TFM_SSE2)
|
||||
@ -3360,7 +3504,7 @@ asm volatile ( \
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI \
|
||||
asm("emms");
|
||||
@ -3371,11 +3515,11 @@ asm volatile ( \
|
||||
"pmuludq %%mm0,%%mm0\n\t" \
|
||||
"movd %%mm0,%%eax \n\t" \
|
||||
"psrlq $32,%%mm0 \n\t" \
|
||||
"movd %%mm0,%%edx \n\t" \
|
||||
"addl %%eax,%0 \n\t" \
|
||||
"adcl %%edx,%1 \n\t" \
|
||||
"movd %%mm0,%%eax \n\t" \
|
||||
"adcl %%eax,%1 \n\t" \
|
||||
"adcl $0,%2 \n\t" \
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
|
||||
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
|
||||
|
||||
#define SQRADD2(i, j) \
|
||||
asm volatile ( \
|
||||
@ -3409,7 +3553,7 @@ asm volatile ( \
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI
|
||||
|
||||
@ -3438,7 +3582,8 @@ asm( \
|
||||
|
||||
/* ISO C portable code */
|
||||
|
||||
#define COMBA_START
|
||||
#define COMBA_START \
|
||||
{ fp_word tt;
|
||||
|
||||
#define CLEAR_CARRY \
|
||||
c0 = c1 = c2 = 0;
|
||||
@ -3450,23 +3595,28 @@ asm( \
|
||||
x = c1;
|
||||
|
||||
#define CARRY_FORWARD \
|
||||
c0 = c1; c1 = c2; c2 = 0;
|
||||
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
||||
|
||||
#define COMBA_FINI
|
||||
#define COMBA_FINI \
|
||||
}
|
||||
|
||||
/* multiplies point i and j, updates carry "c1" and digit c2 */
|
||||
#define SQRADD(i, j) \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
|
||||
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
|
||||
#define SQRADD(i, j) \
|
||||
do { fp_word t; \
|
||||
t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
|
||||
t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
|
||||
} while (0);
|
||||
|
||||
|
||||
/* for squaring some of the terms are doubled... */
|
||||
#define SQRADD2(i, j) \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
|
||||
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \
|
||||
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
|
||||
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
|
||||
#define SQRADD2(i, j) \
|
||||
do { fp_word t; \
|
||||
t = ((fp_word)i) * ((fp_word)j); \
|
||||
tt = (fp_word)c0 + t; c0 = tt; \
|
||||
tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
|
||||
tt = (fp_word)c0 + t; c0 = tt; \
|
||||
tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
|
||||
} while (0);
|
||||
|
||||
#endif
|
||||
|
||||
@ -3476,7 +3626,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
|
||||
int pa, ix, iz;
|
||||
fp_digit c0, c1, c2;
|
||||
fp_int tmp, *dst;
|
||||
fp_word t;
|
||||
|
||||
/* get size of output and trim */
|
||||
pa = A->used + A->used;
|
||||
@ -3549,7 +3698,6 @@ void fp_sqr_comba(fp_int *A, fp_int *B)
|
||||
|
||||
void fp_sqr_comba4(fp_int *A, fp_int *B)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit *a, b[8], c0, c1, c2;
|
||||
|
||||
a = A->dp;
|
||||
@ -3603,7 +3751,6 @@ void fp_sqr_comba4(fp_int *A, fp_int *B)
|
||||
|
||||
void fp_sqr_comba8(fp_int *A, fp_int *B)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit *a, b[16], c0, c1, c2;
|
||||
|
||||
a = A->dp;
|
||||
@ -3694,10 +3841,10 @@ void fp_sqr_comba8(fp_int *A, fp_int *B)
|
||||
fp_clamp(B);
|
||||
}
|
||||
|
||||
#if defined(TFM_LARGE)
|
||||
|
||||
void fp_sqr_comba16(fp_int *A, fp_int *B)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit *a, b[32], c0, c1, c2;
|
||||
|
||||
a = A->dp;
|
||||
@ -3868,11 +4015,12 @@ void fp_sqr_comba16(fp_int *A, fp_int *B)
|
||||
fp_clamp(B);
|
||||
}
|
||||
|
||||
#endif /* TFM_LARGE */
|
||||
|
||||
#ifdef TFM_HUGE
|
||||
|
||||
void fp_sqr_comba32(fp_int *A, fp_int *B)
|
||||
{
|
||||
fp_word t;
|
||||
fp_digit *a, b[64], c0, c1, c2;
|
||||
|
||||
a = A->dp;
|
||||
@ -4455,5 +4603,81 @@ int fp_unsigned_bin_size(fp_int *a)
|
||||
|
||||
/* End: fp_unsigned_bin_size.c */
|
||||
|
||||
/* Start: s_fp_add.c */
|
||||
/* TomsFastMath, a fast ISO C bignum library.
|
||||
*
|
||||
* This project is meant to fill in where LibTomMath
|
||||
* falls short. That is speed ;-)
|
||||
*
|
||||
* This project is public domain and free for all purposes.
|
||||
*
|
||||
* Tom St Denis, tomstdenis@iahu.ca
|
||||
*/
|
||||
#include <tfm.h>
|
||||
|
||||
/* unsigned addition */
|
||||
void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
|
||||
{
|
||||
int x, y, oldused;
|
||||
fp_word t;
|
||||
|
||||
y = MAX(a->used, b->used);
|
||||
oldused = c->used;
|
||||
c->used = y;
|
||||
|
||||
t = 0;
|
||||
for (x = 0; x < y; x++) {
|
||||
t += ((fp_word)a->dp[x]) + ((fp_word)b->dp[x]);
|
||||
c->dp[x] = (fp_digit)t;
|
||||
t >>= DIGIT_BIT;
|
||||
}
|
||||
if (t != 0 && x != FP_SIZE) {
|
||||
c->dp[c->used++] = (fp_digit)t;
|
||||
++x;
|
||||
}
|
||||
|
||||
for (; x < oldused; x++) {
|
||||
c->dp[x] = 0;
|
||||
}
|
||||
fp_clamp(c);
|
||||
}
|
||||
|
||||
/* End: s_fp_add.c */
|
||||
|
||||
/* Start: s_fp_sub.c */
|
||||
/* TomsFastMath, a fast ISO C bignum library.
|
||||
*
|
||||
* This project is meant to fill in where LibTomMath
|
||||
* falls short. That is speed ;-)
|
||||
*
|
||||
* This project is public domain and free for all purposes.
|
||||
*
|
||||
* Tom St Denis, tomstdenis@iahu.ca
|
||||
*/
|
||||
#include <tfm.h>
|
||||
|
||||
/* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
|
||||
void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
|
||||
{
|
||||
int x, oldused;
|
||||
fp_word t;
|
||||
|
||||
oldused = c->used;
|
||||
c->used = a->used;
|
||||
t = 0;
|
||||
for (x = 0; x < a->used; x++) {
|
||||
t = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t);
|
||||
c->dp[x] = (fp_digit)t;
|
||||
t = (t >> DIGIT_BIT) & 1;
|
||||
}
|
||||
|
||||
for (; x < oldused; x++) {
|
||||
c->dp[x] = 0;
|
||||
}
|
||||
fp_clamp(c);
|
||||
}
|
||||
|
||||
/* End: s_fp_sub.c */
|
||||
|
||||
|
||||
/* EOF */
|
||||
|
81
tfm.h
81
tfm.h
@ -21,9 +21,19 @@
|
||||
#undef MAX
|
||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
|
||||
/* do we want huge code? The answer is, yes. */
|
||||
/* do we want large code? */
|
||||
#define TFM_LARGE
|
||||
|
||||
/* do we want huge code (implies large)? The answer is, yes. */
|
||||
#define TFM_HUGE
|
||||
|
||||
/* imply TFM_LARGE as required */
|
||||
#if defined(TFM_HUGE)
|
||||
#if !defined(TFM_LARGE)
|
||||
#define TFM_LARGE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Max size of any number in bits. Basically the largest size you will be multiplying
|
||||
* should be half [or smaller] of FP_MAX_SIZE-four_digit
|
||||
*
|
||||
@ -41,19 +51,69 @@
|
||||
#error FP_MAX_SIZE must be a multiple of CHAR_BIT
|
||||
#endif
|
||||
|
||||
/* make sure we are using 64-bit digits with x86-64 asm */
|
||||
/* autodetect x86-64 and make sure we are using 64-bit digits with x86-64 asm */
|
||||
#if defined(__x86_64__)
|
||||
#if defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)
|
||||
#error x86-64 detected, x86-32/SSE2/ARM optimizations are not valid!
|
||||
#endif
|
||||
#if !defined(TFM_X86_64) && !defined(TFM_NO_ASM)
|
||||
#define TFM_X86_64
|
||||
#endif
|
||||
#endif
|
||||
#if defined(TFM_X86_64)
|
||||
#ifndef FP_64BIT
|
||||
#if !defined(FP_64BIT)
|
||||
#define FP_64BIT
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* try to detect x86-32 */
|
||||
#if defined(__i386__) && !defined(TFM_SSE2)
|
||||
#if defined(TFM_X86_64) || defined(TFM_ARM)
|
||||
#error x86-32 detected, x86-64/ARM optimizations are not valid!
|
||||
#endif
|
||||
#if !defined(TFM_X86) && !defined(TFM_NO_ASM)
|
||||
#define TFM_X86
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* make sure we're 32-bit for x86-32/sse/arm */
|
||||
#if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT)
|
||||
#warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining)
|
||||
#undef FP_64BIT
|
||||
#endif
|
||||
|
||||
/* multi asms? */
|
||||
#ifdef TFM_X86
|
||||
#define TFM_ASM
|
||||
#endif
|
||||
#ifdef TFM_X86_64
|
||||
#ifdef TFM_ASM
|
||||
#error TFM_ASM already defined!
|
||||
#endif
|
||||
#define TFM_ASM
|
||||
#endif
|
||||
#ifdef TFM_SSE2
|
||||
#ifdef TFM_ASM
|
||||
#error TFM_ASM already defined!
|
||||
#endif
|
||||
#define TFM_ASM
|
||||
#endif
|
||||
#ifdef TFM_ARM
|
||||
#ifdef TFM_ASM
|
||||
#error TFM_ASM already defined!
|
||||
#endif
|
||||
#define TFM_ASM
|
||||
#endif
|
||||
|
||||
/* we want no asm? */
|
||||
#ifdef TFM_NO_ASM
|
||||
#undef TFM_X86
|
||||
#undef TFM_X86_64
|
||||
#undef TFM_SSE2
|
||||
#undef TFM_ARM
|
||||
#undef TFM_ASM
|
||||
#endif
|
||||
|
||||
/* some default configurations.
|
||||
*/
|
||||
#if defined(FP_64BIT)
|
||||
@ -110,8 +170,11 @@ typedef struct {
|
||||
|
||||
/* functions */
|
||||
|
||||
/* returns a TFM ident string useful for debugging... */
|
||||
const char *fp_ident(void);
|
||||
|
||||
/* initialize [or zero] an fp int */
|
||||
#define fp_init(a) memset((a), 0, sizeof(fp_int))
|
||||
#define fp_init(a) (void)memset((a), 0, sizeof(fp_int))
|
||||
#define fp_zero(a) fp_init(a)
|
||||
|
||||
/* zero/even/odd ? */
|
||||
@ -123,7 +186,7 @@ typedef struct {
|
||||
void fp_set(fp_int *a, fp_digit b);
|
||||
|
||||
/* copy from a to b */
|
||||
#define fp_copy(a, b) (((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
|
||||
#define fp_copy(a, b) (void)(((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
|
||||
#define fp_init_copy(a, b) fp_copy(b, a)
|
||||
|
||||
/* negate and absolute */
|
||||
@ -139,10 +202,10 @@ void fp_rshd(fp_int *a, int x);
|
||||
/* left shift x digits */
|
||||
void fp_lshd(fp_int *a, int x);
|
||||
|
||||
/* signed comparisonm */
|
||||
/* signed comparison */
|
||||
int fp_cmp(fp_int *a, fp_int *b);
|
||||
|
||||
/* unsigned comparisonm */
|
||||
/* unsigned comparison */
|
||||
int fp_cmp_mag(fp_int *a, fp_int *b);
|
||||
|
||||
/* power of 2 operations */
|
||||
@ -273,14 +336,18 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
|
||||
#ifdef TFM_HUGE
|
||||
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
|
||||
#endif
|
||||
#ifdef TFM_LARGE
|
||||
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C);
|
||||
#endif
|
||||
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
|
||||
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);
|
||||
|
||||
void fp_sqr_comba(fp_int *A, fp_int *B);
|
||||
void fp_sqr_comba4(fp_int *A, fp_int *B);
|
||||
void fp_sqr_comba8(fp_int *A, fp_int *B);
|
||||
#ifdef TFM_LARGE
|
||||
void fp_sqr_comba16(fp_int *A, fp_int *B);
|
||||
#endif
|
||||
#ifdef TFM_HUGE
|
||||
void fp_sqr_comba32(fp_int *A, fp_int *B);
|
||||
#endif
|
||||
|
25
tfm.tex
25
tfm.tex
@ -49,7 +49,7 @@
|
||||
\begin{document}
|
||||
\frontmatter
|
||||
\pagestyle{empty}
|
||||
\title{TomsFastMath User Manual \\ v0.01}
|
||||
\title{TomsFastMath User Manual \\ v0.02}
|
||||
\author{Tom St Denis \\ tomstdenis@iahu.ca}
|
||||
\maketitle
|
||||
This text and library are all hereby placed in the public domain. This book has been formatted for B5
|
||||
@ -128,23 +128,27 @@ several ``CFLAGS'' defines.
|
||||
For example, to build with with SSE2 optimizations type
|
||||
|
||||
\begin{verbatim}
|
||||
export CFLAGS=-DTFM_SSE2
|
||||
make clean libtfm.a
|
||||
CFLAGS=-DTFM_SSE2 make clean libtfm.a
|
||||
\end{verbatim}
|
||||
|
||||
\subsubsection{x86--32} The ``x86--32'' mode is defined by ``TFM\_X86'' and covers all
|
||||
i386 and beyond processors. It requires GCC to build and only works with 32--bit digits. In this
|
||||
mode fp\_digit is 32--bits and fp\_word is 64--bits.
|
||||
mode fp\_digit is 32--bits and fp\_word is 64--bits. This mode will be autodetected when building
|
||||
with GCC to an ``i386'' target. You can override this behaviour by defining TFM\_NO\_ASM or
|
||||
another optimization mode (such as SSE2).
|
||||
|
||||
\subsubsection{SSE2} The ``SSE2'' mode is defined by ``TFM\_SSE2'' and requires a Pentium 4, Pentium
|
||||
M or Athlon64 processor. It requires GCC to build. Note that you shouldn't define both
|
||||
TFM\_X86 and TFM\_SSE2 at the same time. This mode only works with 32--bit digits. In this
|
||||
mode fp\_digit is 32--bits and fp\_word is 64--bits.
|
||||
mode fp\_digit is 32--bits and fp\_word is 64--bits. While this mode will work on the AMD Athlon64
|
||||
series of processors it is less efficient than the native ``x86--64'' mode and not recommended.
|
||||
|
||||
\subsubsection{x86--64} The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires a
|
||||
``x86--64'' capable processor (Athlon64 and future Pentium processors). It requires GCC to
|
||||
build and only works with 64--bit digits. Note that by enabling this mode it will automatically
|
||||
enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits.
|
||||
enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits. This mode will
|
||||
be autodetected when building with GCC to an ``x86--64'' target. You can override this behaviour by defining
|
||||
TFM\_NO\_ASM.
|
||||
|
||||
\subsubsection{ARM} The ``ARM'' mode is defined by ``TFM\_ARM'' and requires a ARMv4 or higher
|
||||
processor. It requires GCC and works with 32--bit digits. In this mode fp\_digit is 32--bits and
|
||||
@ -163,6 +167,8 @@ Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline pat
|
||||
\hline Pentium 4 & TFM\_SSE2 \\
|
||||
\hline Athlon64 & TFM\_X86\_64 \\
|
||||
\hline ARMv4 or higher & TFM\_ARM \\
|
||||
\hline &\\
|
||||
\hline x86--32 or x86--64 (with GCC) & Leave blank and let autodetect work \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\caption{Recommended Build Modes}
|
||||
@ -339,9 +345,10 @@ To compute a modular exponentiation use the following function.
|
||||
\begin{verbatim}
|
||||
int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
|
||||
\end{verbatim}
|
||||
This computes $d \equiv a^b \mbox{ (mod }c)$ for any odd $c$ and positive $b$. The size of $c$
|
||||
must be half of the maximum precision used during the build of the library. For example,
|
||||
by default $c$ must be less than $2^{2048}$.
|
||||
This computes $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for any odd $c$ and $b$. $b$ may be negative so long as
|
||||
$a^{-1} \mbox{ (mod }c\mbox{)}$ exists. The initial value of $a$ may be larger than $c$. The size of $c$ must be
|
||||
half of the maximum precision used during the build of the library. For example, by default $c$ must be less
|
||||
than $2^{2048}$.
|
||||
|
||||
\section{Number Theoretic}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user