3
0
Fork 0

added tomsfastmath-0.01

This commit is contained in:
Tom St Denis 2004-08-25 02:43:43 +00:00 committed by Steffen Jaeckel
commit 5e92ed2a59
75 changed files with 11069 additions and 0 deletions

7
LICENSE Normal file
View File

@ -0,0 +1,7 @@
TomsFastMath is public domain.
Note some ideas were borrowed from LibTomMath and OpenSSL. All of the code is original or ported
from LibTomMath [no code was ported from OpenSSL]. As such the origins and status of this code
are both public domain.
-- Tom St Denis

5
SPONSORS Normal file
View File

@ -0,0 +1,5 @@
Development of TomsFastMath was sponsored by three groups: two companies that use LTC and LTM commercially,
and one individual who decided he wanted to help out by being generous.
Thanks go to them [though they wished to remain anonymous] and people like them.

6
TODO Normal file
View File

@ -0,0 +1,6 @@
1. Write more documentation ;-)
2. Ports to PPC and MIPS
3. Fix any lingering bugs, add additional requested functionality.
NOTE: The library is still fairly new. I've tested it quite a bit but that doesn't mean surprises
can't happen. Please test the results you get for correctness.

2
changes.txt Normal file
View File

@ -0,0 +1,2 @@
August 25th, 2004
TFM 0.01 -- Initial Release

50
comba_mult_gen.c Normal file
View File

@ -0,0 +1,50 @@
/* program emits a NxN comba multiplier */
#include <stdio.h>
/* Writes to stdout the C source of an unrolled NxN comba multiplier named
 * fp_mul_combaN.
 *
 * Usage: comba_mult_gen N     (N = digits per operand, N >= 1)
 *
 * Returns 0 on success, 1 when N is missing or is not a positive integer.
 */
int main(int argc, char **argv)
{
   int N, x, y, y_lo, y_hi;

   /* argv[1] only exists when an argument was actually given */
   if (argc < 2 || sscanf(argv[1], "%d", &N) != 1 || N < 1) {
      fprintf(stderr, "usage: %s N   (N = digits per operand, N >= 1)\n",
              (argc > 0 && argv[0] != NULL) ? argv[0] : "comba_mult_gen");
      return 1;
   }

   /* print out preamble */
   printf(
"void fp_mul_comba%d(fp_int *A, fp_int *B, fp_int *C)\n"
"{\n"
" fp_word t;\n"
" fp_digit c0, c1, c2, at[%d];\n"
"\n"
" memcpy(at, A->dp, %d * sizeof(fp_digit));\n"
" memcpy(at+%d, B->dp, %d * sizeof(fp_digit));\n"
" COMBA_START;\n"
"\n"
" COMBA_CLEAR;\n", N, N+N, N, N, N);

   /* now do the rows: output digit x collects every product A[y]*B[z] with y+z == x */
   for (x = 0; x < (N+N-1); x++) {
      printf(
" /* %d */\n", x);
      if (x > 0) {
         printf(
" COMBA_FORWARD;\n");
      }

      /* z = x - y must lie in [0, N), which bounds y to [x-N+1, x] within [0, N) */
      y_lo = (x >= N) ? (x - N + 1) : 0;
      y_hi = (x < N)  ? x           : (N - 1);
      for (y = y_lo; y <= y_hi; y++) {
         /* B's digits live in the upper half of at[], hence the +N */
         printf(" MULADD(at[%d], at[%d]); ", y, (x - y) + N);
      }

      printf(
"\n"
" COMBA_STORE(C->dp[%d]);\n", x);
   }

   /* final carry digit and epilogue */
   printf(
" COMBA_STORE2(C->dp[%d]);\n"
" C->used = %d;\n"
" C->sign = A->sign ^ B->sign;\n"
" fp_clamp(C);\n"
" COMBA_FINI;\n"
"}\n\n\n", N+N-1, N+N);

   return 0;
}

54
comba_sqr_gen.c Normal file
View File

@ -0,0 +1,54 @@
/* Generates squaring comba code... it learns it knows our secrets! */
#include <stdio.h>
/* Writes to stdout the C source of an unrolled N-digit comba squaring routine
 * named fp_sqr_combaN.
 *
 * Usage: comba_sqr_gen N     (N = digits in the operand, N >= 1)
 *
 * Returns 0 on success, 1 when N is missing or is not a positive integer.
 */
int main(int argc, char **argv)
{
   int x, y, y_lo, N;

   /* argv[1] only exists when an argument was actually given */
   if (argc < 2 || sscanf(argv[1], "%d", &N) != 1 || N < 1) {
      fprintf(stderr, "usage: %s N   (N = digits in the operand, N >= 1)\n",
              (argc > 0 && argv[0] != NULL) ? argv[0] : "comba_sqr_gen");
      return 1;
   }

   /* preamble plus output digit 0, which has the single term a[0]*a[0] */
   printf(
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
"{\n"
" fp_word t;\n"
" fp_digit *a, b[%d], c0, c1, c2;\n"
"\n"
" a = A->dp;\n"
" COMBA_START; \n"
"\n"
" /* clear carries */\n"
" CLEAR_CARRY;\n"
"\n"
" /* output 0 */\n"
" SQRADD(a[0],a[0]);\n"
" COMBA_STORE(b[0]);\n", N, N+N);

   /* output digit x collects the pairs (y, z) with y <= z and y + z == x */
   for (x = 1; x < N+N-1; x++) {
      printf(
"\n /* output %d */\n"
" CARRY_FORWARD;\n ", x);

      /* z = x - y must satisfy y <= z < N, i.e. x-N+1 <= y <= x/2 */
      y_lo = (x >= N) ? (x - N + 1) : 0;
      for (y = y_lo; y + y <= x; y++) {
         if (y + y == x) {
            /* diagonal term, counted once */
            printf("SQRADD(a[%d], a[%d]); ", y, y);
         } else {
            /* off-diagonal term, emitted through the SQRADD2 macro */
            printf("SQRADD2(a[%d], a[%d]); ", y, x - y);
         }
      }
      printf("\n COMBA_STORE(b[%d]);\n", x);
   }

   /* final carry digit and epilogue */
   printf(" COMBA_STORE2(b[%d]);\n", N+N-1);
   printf(
" COMBA_FINI;\n"
"\n"
" B->used = %d;\n"
" B->sign = FP_ZPOS;\n"
" memcpy(B->dp, b, %d * sizeof(fp_digit));\n"
" fp_clamp(B);\n"
"}\n\n\n", N+N, N+N);

   return 0;
}

144
demo/stest.c Normal file
View File

@ -0,0 +1,144 @@
/* A simple static test program. */
#include <tfm.h>
/* DISPLAY(x) emits one status message.
 * When GBA_MODE is defined the message is routed through modetxt_puts()
 * (declared, presumably, by <gba.h> -- confirm against that header). */
#ifdef GBA_MODE
#include <gba.h>
#define DISPLAY(x) modetxt_puts(vfb, x, 1)
#endif
/* Fallback when nothing above (or the build) defined DISPLAY: plain printf.
 * Note that x is used as the format string itself, so callers must only pass
 * text that contains no '%' conversions. */
#ifndef DISPLAY
#define DISPLAY(x) printf(x)
#endif
/* Static self-test: runs fixed multiplication, squaring and Montgomery
 * reduction vectors (all written in base 10) and reports each result through
 * DISPLAY().  The run stops at the first failing vector.  The function
 * returns 0 in every case.  The entry point is named c_main when GBA_MODE is
 * defined and main otherwise. */
#ifdef GBA_MODE
int c_main(void)
#else
int main(void)
#endif
{
   /* { multiplicand, multiplier, expected product } */
   static char *const mul_vec[3][3] = {
      { "3453534534535345345341230891273",
        "2394873294871238934718923",
        "8270777629674273015508507050766235312931312159028658979" },
      { "30481290320498235987349712308523652378643912563478232907782361237864278207235782364578264891274789264278634289739",
        "48761478126387263782638276327836287632836278362837627838736278362923698724823749238732",
        "1486312771227034563307950634490737985563993459700941115664257275795366623795590136120579100118233580357115074068815507257715906295105536107921754177810976863679300283932188006885811950341132768970948" },
      { "115792089237316195423570985008687907853269984665640564039457584007913129639935",
        "174224571863520493293247799005065324265471",
        "20173827172553973356686868531273530268200710714389071377794102651988800859098544338487575161443744102709980552583184385" }
   };
   /* { operand, expected square } */
   static char *const sqr_vec[3][2] = {
      { "298723982748923478923473927489237289347238947238947238947238972893",
        "89236017869379132235512787068367546521309689412262624434964313994127411682542855190667724226920696163962644836740110835385588789449" },
      { "397823894238973128942895123894327123941724927848927349274897238978927593487012378490184789429812734982738972389",
        "158263850827461677491961439999264901067636282938352531932899298293270945997930087353471903166601507321298827087008336951419604640736464667188494668962822678461626245753696845719301945679092882499787869509090904187704367321" },
      { "13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006084095",
        "179769313486231590772930519078902473361797697894230657273430081157732675805500963132708477322407536021120113879871393357658789768814416622492847430639474097562152033539671286128252223189553839160721441767298250321715263238814402734379959506792230903356495130620869925267845538430714092411695463462326211969025" }
   };
   /* { value to reduce, modulus, expected result } */
   static char *const mont_vec[3][3] = {
      { "234892374892374893489123428937892781237863278637826327367637836278362783627836783678363",
        "4447823492749823749234123489273987393983289319382762756425425425642727352327452374521",
        "2396271882990732698083317035605836523697277786556053771759862552557086442129695099100" },
      { "2348923748923748934891234456645654645645684576353428937892781237863278637826327367637836278362783627836783678363",
        "444782349274982374923412348927398739398328931938276275642542542564272735232745237452123424324324444121111119",
        "45642613844554582908652603086180267403823312390990082328515008314514368668691233331246183943400359349283420" },
      { "234823424242342923748923748934891234456645654645645684576353424972378234762378623891236834132352375235378462378489378927812378632786378263273676378362783627555555555539568389052478124618461834763837685723645827529034853490580134568947341278498542893481762349723907847892983627836783678363",
        "44478234927456563455982374923412348927398739398328931938276275642485623481638279025465891276312903262837562349056234783648712314678120389173890128905425242424239784256427",
        "33160865265453361650564031464519042126185632333462754084489985719613480783282357410514898819797738034600484519472656152351777186694609218202276509271061460265488348645081" }
   };

   fp_int a,b,c,d,e,f;
   fp_digit dp;
   int i;

   fp_init(&a);
   fp_init(&b);
   fp_init(&c);
   fp_init(&d);
   fp_init(&e);
   fp_init(&f);

#ifdef GBA_MODE
   install_common();
   modetxt_init();
   modetxt_gotoxy(0,0);
#endif

   /* multiplication: d = a * b must equal c */
   for (i = 0; i < 3; i++) {
      fp_read_radix(&a, mul_vec[i][0], 10);
      fp_read_radix(&b, mul_vec[i][1], 10);
      fp_read_radix(&c, mul_vec[i][2], 10);
      fp_mul(&a, &b, &d);
      if (fp_cmp(&c, &d)) {
         DISPLAY("mul failed\n");
         return 0;
      }
      DISPLAY("mul passed\n");
   }

   /* squaring: c = a * a must equal b */
   for (i = 0; i < 3; i++) {
      fp_read_radix(&a, sqr_vec[i][0], 10);
      fp_read_radix(&b, sqr_vec[i][1], 10);
      fp_sqr(&a, &c);
      if (fp_cmp(&c, &b)) {
         DISPLAY("sqr failed\n");
         return 0;
      }
      DISPLAY("sqr passed\n");
   }

   /* montgomery reductions: a reduced in place modulo b must equal c */
   for (i = 0; i < 3; i++) {
      fp_read_radix(&a, mont_vec[i][0], 10);
      fp_read_radix(&b, mont_vec[i][1], 10);
      fp_read_radix(&c, mont_vec[i][2], 10);
      fp_montgomery_setup(&b, &dp);
      fp_montgomery_reduce(&a, &b, dp);
      if (fp_cmp(&a, &c)) {
         DISPLAY("mont failed\n");
         return 0;
      }
      DISPLAY("mont passed\n");
   }

   return 0;
}

537
demo/test.c Normal file
View File

@ -0,0 +1,537 @@
/* TFM demo program */
#include <tfm.h>
/* Prints one line describing a: its used count, its sign, and then every
 * used digit in hexadecimal, highest index first. */
void draw(fp_int *a)
{
   int idx = a->used;

   printf("%d, %d, ", a->used, a->sign);
   /* walk from digit used-1 down to digit 0 */
   while (idx-- > 0) {
      printf("%08lx ", a->dp[idx]);
   }
   printf("\n");
}
/* Random-byte callback: stores the low 8 bits of successive rand() results
 * into dst[0 .. len-1] and returns len.  The dat argument is not used. */
int myrng(unsigned char *dst, int len, void *dat)
{
   unsigned char *out = dst;
   int remaining;

   for (remaining = len; remaining > 0; --remaining) {
      *out++ = rand() & 0xFF;
   }
   return len;
}
/* RDTSC from Scott Duplichan */
static ulong64 TIMFUNC (void)
{
#if defined __GNUC__
#if defined(__i386__) || defined(__x86_64__)
unsigned long long a;
__asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
return a;
#else /* gcc-IA64 version */
unsigned long result;
__asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
while (__builtin_expect ((int) result == -1, 0))
__asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
return result;
#endif
// Microsoft and Intel Windows compilers
#elif defined _M_IX86
__asm rdtsc
#elif defined _M_AMD64
return __rdtsc ();
#elif defined _M_IA64
#if defined __INTEL_COMPILER
#include <ia64intrin.h>
#endif
return __getReg (3116);
#else
#error need rdtsc function for this build
#endif
}
/* cmd holds the current command name (and doubles as scratch space for the
 * binary import/export round trip); buf holds the operand line being parsed. */
char cmd[4096], buf[4096];

/* Reads the next line of stdin into buf.  Returns 1 on success, 0 on EOF or read error. */
static int next_line(void)
{
   return fgets(buf, sizeof(buf), stdin) != NULL;
}

/* Reads the next line of stdin and parses it as a radix-64 number into x.
 * Returns 1 on success, 0 when no line could be read. */
static int next_num(fp_int *x)
{
   if (!next_line()) {
      return 0;
   }
   fp_read_radix(x, buf, 64);
   return 1;
}

/* Reads the next line of stdin and parses it as a decimal unsigned long into *v.
 * Returns 1 on success, 0 when no line could be read or it held no number. */
static int next_ulong(unsigned long *v)
{
   return next_line() && sscanf(buf, "%lu", v) == 1;
}

/* TFM demo / regression driver.
 *
 * First runs a series of built-in checks (shifts, negation, comparisons,
 * add/sub, single-digit mul/div, radix input, Montgomery reduction and random
 * prime sizes).  It then reads test records from stdin until end of input.
 * Each record is a command name on one line followed by its operands and the
 * expected result, one per line; numbers are in radix 64, shift counts and
 * single digits in decimal.
 *
 * Returns 0 when stdin is exhausted without a mismatch, EXIT_FAILURE when a
 * check fails or a record is cut short.
 */
int main(void)
{
   fp_int a,b,c,d,e,f;
   fp_digit fp;
   int n, err;
   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
                 div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n, t, cnt, rr, ix;
   ulong64 t1, t2;
   size_t len;

   fp_zero(&b); fp_zero(&c); fp_zero(&d); fp_zero(&e); fp_zero(&f);
   fp_zero(&a); draw(&a);

   /* test set and simple shifts */
   printf("Testing mul/div 2\n");
   fp_set(&a, 1); draw(&a);
   for (n = 0; n <= DIGIT_BIT; n++) {
      fp_mul_2(&a, &a); printf("(%d) ", fp_count_bits(&a));
      draw(&a);
   }
   for (n = 0; n <= (DIGIT_BIT + 1); n++) {
      fp_div_2(&a, &a);
      draw(&a);
   }
   fp_set(&a, 1);

   /* test lshd/rshd */
   printf("testing lshd/rshd\n");
   fp_lshd(&a, 3); draw(&a);
   fp_rshd(&a, 3); draw(&a);

   /* test more complicated shifts */
   printf("Testing mul/div 2d\n");
   fp_mul_2d(&a, DIGIT_BIT/2, &a); draw(&a);
   fp_div_2d(&a, DIGIT_BIT/2, &a, NULL); draw(&a);
   fp_mul_2d(&a, DIGIT_BIT + DIGIT_BIT/2, &a); draw(&a);
   fp_div_2d(&a, DIGIT_BIT + DIGIT_BIT/2, &a, NULL); draw(&a);

   /* test neg/abs */
   printf("testing neg/abs\n");
   fp_neg(&a, &a); draw(&a);
   fp_neg(&a, &a); draw(&a);
   fp_neg(&a, &a); draw(&a);
   fp_abs(&a, &a); draw(&a);

   /* test comparisons */
   fp_set(&b, 3);
   fp_set(&c, 4); fp_neg(&c, &c);
   fp_set(&d, 1);
   printf("Testing compares\n%d, %d, %d, %d\n", fp_cmp(&a, &b), fp_cmp(&a, &c), fp_cmp(&a, &d), fp_cmp(&b, &c));

   /* test add/sub */
   printf("Testing add/sub \n");
   fp_set(&a, ((fp_digit)1)<<(DIGIT_BIT-1)); draw(&a);
   fp_set(&b, ((fp_digit)1)<<(DIGIT_BIT-2));
   fp_add(&a, &b, &a); draw(&a);
   fp_add(&a, &b, &a); draw(&a);
   fp_add(&a, &b, &a); draw(&a);
   printf("sub...\n");
   printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
   printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
   printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
   printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
   printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);
   printf("cmp returns: %d, ", fp_cmp(&a, &b)); fp_sub(&a, &b, &a); draw(&a);

   /* test mul_d */
   printf("Testing mul_d and div_d\n");
   fp_set(&a, 1);
   fp_mul_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a); draw(&a);
   fp_mul_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a); draw(&a);
   fp_mul_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a); draw(&a);
   printf("div_d\n");
   fp_div_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a, NULL); draw(&a);
   fp_div_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a, NULL); draw(&a);
   fp_div_d(&a, ((fp_digit)1)<<(DIGIT_BIT/2), &a, NULL); draw(&a);

   /* testing read radix */
   printf("Testing read_radix\n");
   fp_read_radix(&a, "123456789012345678901234567890", 16); draw(&a);

   /* test mont */
   printf("Montgomery test\n");
   fp_set(&a, 1);
   fp_lshd(&a, 4);
   fp_add_d(&a, 1, &a);
   fp_montgomery_setup(&a, &fp);
   fp_montgomery_calc_normalization(&b, &a);
   fp_read_radix(&d, "123456789123", 16);
   for (n = 0; n < 100000; n++) {
      fp_add_d(&d, 1, &d); fp_sqrmod(&d, &a, &d);
      fp_mul(&d, &b, &c);
      fp_montgomery_reduce(&c, &a, fp);
      if (fp_cmp(&c, &d) != FP_EQ) {
         printf("Failed mont %d\n", n);
         draw(&a);
         draw(&d);
         draw(&c);
         return EXIT_FAILURE;
      }
   }
   printf("Passed.\n");

   /* test for size */
   for (ix = 8*DIGIT_BIT; ix < 10*DIGIT_BIT; ix++) {
      /* ix is unsigned long, so it must be printed with %lu */
      printf("Testing (not safe-prime): %9lu bits \r", ix); fflush(stdout);
      err = fp_prime_random_ex(&a, 8, ix, (rand()&1)?TFM_PRIME_2MSB_OFF:TFM_PRIME_2MSB_ON, myrng, NULL);
      if (err != FP_OKAY) {
         printf("failed with err code %d\n", err);
         return EXIT_FAILURE;
      }
      if ((unsigned long)fp_count_bits(&a) != ix) {
         printf("Prime is %d not %lu bits!!!\n", fp_count_bits(&a), ix);
         return EXIT_FAILURE;
      }
   }
   printf("\n\n");

#if 0
   /* do some timings... */
   printf("Addition:\n");
   for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix;
         b.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      t2 = -1;
      for (ix = 0; ix < 2500; ++ix) {
         t1 = TIMFUNC();
         fp_add(&a, &b, &c); fp_add(&a, &b, &c);
         fp_add(&a, &b, &c); fp_add(&a, &b, &c);
         fp_add(&a, &b, &c); fp_add(&a, &b, &c);
         fp_add(&a, &b, &c); fp_add(&a, &b, &c);
         t2 = (TIMFUNC() - t1)>>3;
         if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
   }
   printf("Multiplication:\n");
   for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix;
         b.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      t2 = -1;
      for (ix = 0; ix < 10000; ++ix) {
         t1 = TIMFUNC();
         fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
         fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
         t2 = (TIMFUNC() - t1)>>2;
         if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
   }
//#else
   printf("Squaring:\n");
   for (t = 2; t <= FP_SIZE/2; t += 2) {
      fp_zero(&a);
      fp_zero(&b);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix;
      }
      a.used = t;
      t2 = -1;
      for (ix = 0; ix < 10000; ++ix) {
         t1 = TIMFUNC();
         fp_sqr(&a, &b); fp_sqr(&a, &b);
         fp_sqr(&a, &b); fp_sqr(&a, &b);
         t2 = (TIMFUNC() - t1)>>2;
         if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
   }
//#else
   printf("Montgomery:\n");
   for (t = 2; t <= (FP_SIZE/2)-2; t += 2) {
      fp_zero(&a);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix | 1;
      }
      a.used = t;
      fp_montgomery_setup(&a, &fp);
      fp_sub_d(&a, 3, &b);
      fp_sqr(&b, &b);
      fp_copy(&b, &c);
      fp_copy(&b, &d);
      t2 = -1;
      for (ix = 0; ix < 10000; ++ix) {
         t1 = TIMFUNC();
         /* the reduction constant is passed by value, as in the live test above */
         fp_montgomery_reduce(&c, &a, fp);
         fp_montgomery_reduce(&d, &a, fp);
         t2 = (TIMFUNC() - t1)>>1;
         fp_copy(&b, &c);
         fp_copy(&b, &d);
         if (t1<t2) { --ix; t2 = t1; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
   }
//#else
   printf("Exptmod:\n");
   for (t = 512/DIGIT_BIT; t <= (FP_SIZE/2)-2; t += t) {
      fp_zero(&a);
      fp_zero(&b);
      fp_zero(&c);
      for (ix = 0; ix < t; ix++) {
         a.dp[ix] = ix+1;
         b.dp[ix] = (fp_digit)rand() * (fp_digit)rand();
         c.dp[ix] = ix;
      }
      a.used = t;
      b.used = t;
      c.used = t;
      t2 = -1;
      for (ix = 0; ix < 50; ++ix) {
         t1 = TIMFUNC();
         fp_exptmod(&c, &b, &a, &d);
         fp_exptmod(&c, &b, &a, &d);
         t2 = (TIMFUNC() - t1)>>1;
         fp_copy(&b, &c);
         fp_copy(&b, &d);
         if (t1<t2) { t2 = t1; --ix; }
      }
      printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
   }
#endif

   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= mul_d_n = 0;

   for (;;) {
      printf("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n, mul_d_n);

      /* end of input is the normal way out of this loop */
      if (fgets(cmd, sizeof(cmd), stdin) == NULL) {
         break;
      }
      /* drop the trailing newline, but only if there is one */
      len = strlen(cmd);
      if (len > 0 && cmd[len-1] == '\n') {
         cmd[len-1] = 0;
      }
      printf("%s ]\r",cmd); fflush(stdout);

      if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
         if (!next_num(&a) || !next_ulong(&rr) || !next_num(&b)) { goto truncated; }
         fp_mul_2d(&a, (int)rr, &a);
         a.sign = b.sign;
         if (fp_cmp(&a, &b) != FP_EQ) {
            printf("mul2d failed, rr == %lu\n",rr);
            draw(&a);
            draw(&b);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "div2d")) { ++div2d_n;
         if (!next_num(&a) || !next_ulong(&rr) || !next_num(&b)) { goto truncated; }
         fp_div_2d(&a, (int)rr, &a, &e);
         a.sign = b.sign;
         if (a.used == b.used && a.used == 0) { a.sign = b.sign = FP_ZPOS; }
         if (fp_cmp(&a, &b) != FP_EQ) {
            printf("div2d failed, rr == %lu\n",rr);
            draw(&a);
            draw(&b);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "add")) { ++add_n;
         if (!next_num(&a) || !next_num(&b) || !next_num(&c)) { goto truncated; }
         fp_copy(&a, &d);
         fp_add(&d, &b, &d);
         if (fp_cmp(&c, &d) != FP_EQ) {
            printf("add %lu failure!\n", add_n);
            draw(&a);draw(&b);draw(&c);draw(&d);
            return EXIT_FAILURE;
         }

         /* test the sign/unsigned storage functions */
         rr = fp_signed_bin_size(&c);
         fp_to_signed_bin(&c, (unsigned char *)cmd);
         /* fill the rest of the buffer with one random byte value so the
          * read must rely on the length rr alone */
         memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
         fp_read_signed_bin(&d, (unsigned char *)cmd, rr);
         if (fp_cmp(&c, &d) != FP_EQ) {
            printf("fp_signed_bin failure!\n");
            draw(&c);
            draw(&d);
            return EXIT_FAILURE;
         }

         rr = fp_unsigned_bin_size(&c);
         fp_to_unsigned_bin(&c, (unsigned char *)cmd);
         memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
         fp_read_unsigned_bin(&d, (unsigned char *)cmd, rr);
         if (fp_cmp_mag(&c, &d) != FP_EQ) {
            printf("fp_unsigned_bin failure!\n");
            draw(&c);
            draw(&d);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "sub")) { ++sub_n;
         if (!next_num(&a) || !next_num(&b) || !next_num(&c)) { goto truncated; }
         fp_copy(&a, &d);
         fp_sub(&d, &b, &d);
         if (fp_cmp(&c, &d) != FP_EQ) {
            printf("sub %lu failure!\n", sub_n);
            draw(&a);draw(&b);draw(&c);draw(&d);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "mul")) {
         if (!next_num(&a) || !next_num(&b) || !next_num(&c)) { goto truncated; }
         //continue;
         fp_copy(&a, &d);
         fp_mul(&d, &b, &d); ++mul_n;
         if (fp_cmp(&c, &d) != FP_EQ) {
            printf("mul %lu failure!\n", mul_n);
            draw(&a);draw(&b);draw(&c);draw(&d);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "div")) {
         if (!next_num(&a) || !next_num(&b) || !next_num(&c) || !next_num(&d)) { goto truncated; }
         // continue;
         fp_div(&a, &b, &e, &f); ++div_n;
         if (fp_cmp(&c, &e) != FP_EQ || fp_cmp(&d, &f) != FP_EQ) {
            printf("div %lu failure!\n", div_n);
            draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "sqr")) {
         if (!next_num(&a) || !next_num(&b)) { goto truncated; }
         // continue;
         fp_copy(&a, &c);
         fp_sqr(&c, &c); ++sqr_n;
         if (fp_cmp(&b, &c) != FP_EQ) {
            printf("sqr %lu failure!\n", sqr_n);
            draw(&a);draw(&b);draw(&c);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "gcd")) {
         if (!next_num(&a) || !next_num(&b) || !next_num(&c)) { goto truncated; }
         // continue;
         fp_copy(&a, &d);
         fp_gcd(&d, &b, &d); ++gcd_n;
         d.sign = c.sign;
         if (fp_cmp(&c, &d) != FP_EQ) {
            printf("gcd %lu failure!\n", gcd_n);
            draw(&a);draw(&b);draw(&c);draw(&d);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "lcm")) {
         if (!next_num(&a) || !next_num(&b) || !next_num(&c)) { goto truncated; }
         //continue;
         fp_copy(&a, &d);
         fp_lcm(&d, &b, &d); ++lcm_n;
         d.sign = c.sign;
         if (fp_cmp(&c, &d) != FP_EQ) {
            printf("lcm %lu failure!\n", lcm_n);
            draw(&a);draw(&b);draw(&c);draw(&d);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "expt")) {
         if (!next_num(&a) || !next_num(&b) || !next_num(&c) || !next_num(&d)) { goto truncated; }
         // continue;
         fp_copy(&a, &e);
         fp_exptmod(&e, &b, &c, &e); ++expt_n;
         if (fp_cmp(&d, &e) != FP_EQ) {
            printf("expt %lu failure!\n", expt_n);
            draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "invmod")) {
         if (!next_num(&a) || !next_num(&b) || !next_num(&c)) { goto truncated; }
         //continue;
         fp_invmod(&a, &b, &d);
#if 1
         /* verify the inverse by multiplying back rather than comparing to c */
         fp_mulmod(&d,&a,&b,&e); ++inv_n;
         if (fp_cmp_d(&e, 1) != FP_EQ) {
#else
         if (fp_cmp(&d, &c) != FP_EQ) {
#endif
            printf("inv [wrong value from MPI?!] failure\n");
            draw(&a);draw(&b);draw(&c);draw(&d);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "div2")) { ++div2_n;
         if (!next_num(&a) || !next_num(&b)) { goto truncated; }
         fp_div_2(&a, &c);
         if (fp_cmp(&c, &b) != FP_EQ) {
            printf("div_2 %lu failure\n", div2_n);
            draw(&a);
            draw(&b);
            draw(&c);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "mul2")) { ++mul2_n;
         if (!next_num(&a) || !next_num(&b)) { goto truncated; }
         fp_mul_2(&a, &c);
         if (fp_cmp(&c, &b) != FP_EQ) {
            printf("mul_2 %lu failure\n", mul2_n);
            draw(&a);
            draw(&b);
            draw(&c);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "add_d")) { ++add_d_n;
         if (!next_num(&a) || !next_ulong(&ix) || !next_num(&b)) { goto truncated; }
         fp_add_d(&a, (fp_digit)ix, &c);
         if (fp_cmp(&b, &c) != FP_EQ) {
            printf("add_d %lu failure\n", add_d_n);
            draw(&a);
            draw(&b);
            draw(&c);
            printf("d == %lu\n", ix);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
         if (!next_num(&a) || !next_ulong(&ix) || !next_num(&b)) { goto truncated; }
         fp_sub_d(&a, (fp_digit)ix, &c);
         if (fp_cmp(&b, &c) != FP_EQ) {
            printf("sub_d %lu failure\n", sub_d_n);
            draw(&a);
            draw(&b);
            draw(&c);
            printf("d == %lu\n", ix);
            return EXIT_FAILURE;
         }
      } else if (!strcmp(cmd, "mul_d")) { ++mul_d_n;
         if (!next_num(&a) || !next_ulong(&ix) || !next_num(&b)) { goto truncated; }
         fp_mul_d(&a, (fp_digit)ix, &c);
         if (fp_cmp(&b, &c) != FP_EQ) {
            printf("mul_d %lu failure\n", mul_d_n);
            draw(&a);
            draw(&b);
            draw(&c);
            printf("d == %lu\n", ix);
            return EXIT_FAILURE;
         }
      }
   }

   /* stdin exhausted with no mismatch */
   printf("\n");
   return 0;

truncated:
   /* a record ended before all of its operand lines were read */
   printf("\nunexpected end of input while reading operands of \"%s\"\n", cmd);
   return EXIT_FAILURE;
}

BIN
doc/tfm.pdf Normal file

Binary file not shown.

35
fp_2expt.c Normal file
View File

@ -0,0 +1,35 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Sets a to 2**b.
 * a is left at zero when b is negative or when the digit that would hold
 * bit b has an index of FP_SIZE or more. */
void fp_2expt(fp_int *a, int b)
{
   int digit;

   /* start from zero so that every rejected input leaves a == 0 */
   fp_zero (a);

   if (b >= 0) {
      digit = b / DIGIT_BIT;
      if (digit < FP_SIZE) {
         /* one bit in digit `digit`, and that digit is now the top one */
         a->dp[digit] = ((fp_digit)1) << (b % DIGIT_BIT);
         a->used      = digit + 1;
      }
   }
}

39
fp_add.c Normal file
View File

@ -0,0 +1,39 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Signed addition: c = a + b.
 * The sign of c is written before the magnitude helper is called. */
void fp_add(fp_int *a, fp_int *b, fp_int *c)
{
   /* read both signs before c is touched */
   const int sign_a = a->sign;
   const int sign_b = b->sign;

   /* equal signs: add the magnitudes and keep the common sign */
   if (sign_a == sign_b) {
      c->sign = sign_a;
      s_fp_add (a, b, c);
      return;
   }

   /* opposite signs: subtract the smaller magnitude from the larger one;
    * the result takes the sign of the operand with the larger magnitude */
   if (fp_cmp_mag (a, b) == FP_LT) {
      c->sign = sign_b;
      s_fp_sub (b, a, c);
      return;
   }

   c->sign = sign_a;
   s_fp_sub (a, b, c);
}

18
fp_add_d.c Normal file
View File

@ -0,0 +1,18 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* c = a + b, where b is a single digit.
 * The digit is first loaded into a temporary fp_int with fp_set() and the
 * sum is then computed by fp_add(). */
void fp_add_d(fp_int *a, fp_digit b, fp_int *c)
{
   fp_int addend;

   fp_set(&addend, b);
   fp_add(a, &addend, c);
}

19
fp_addmod.c Normal file
View File

@ -0,0 +1,19 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* d = (a + b) mod c.
 * Returns whatever fp_mod() returns for the reduction step. */
int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
{
   fp_int sum;
   int    status;

   /* form the full sum in a scratch value, then reduce it into d */
   fp_zero(&sum);
   fp_add(a, b, &sum);
   status = fp_mod(&sum, c, d);
   return status;
}

27
fp_cmp.c Normal file
View File

@ -0,0 +1,27 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Signed comparison of a and b.
 * Returns FP_LT or FP_GT directly when the signs are FP_NEG / FP_ZPOS in
 * opposite order; otherwise returns the result of fp_cmp_mag(), with the
 * operands swapped when a is negative. */
int fp_cmp(fp_int *a, fp_int *b)
{
   /* negative versus non-negative */
   if (a->sign == FP_NEG && b->sign == FP_ZPOS) {
      return FP_LT;
   }
   /* non-negative versus negative */
   if (a->sign == FP_ZPOS && b->sign == FP_NEG) {
      return FP_GT;
   }
   /* otherwise compare digits; for negatives the larger magnitude is the smaller value */
   return (a->sign == FP_NEG) ? fp_cmp_mag(b, a) : fp_cmp_mag(a, b);
}

34
fp_cmp_d.c Normal file
View File

@ -0,0 +1,34 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Compares a against the single digit b.
 * Returns FP_LT, FP_GT or FP_EQ. */
int fp_cmp_d(fp_int *a, fp_digit b)
{
   fp_digit low;

   /* a negative a, or an a with no digits against a non-zero b, is smaller */
   if (a->sign == FP_NEG || (a->used == 0 && b)) {
      return FP_LT;
   }

   /* more than one digit in use: a is larger than any single digit */
   if (a->used > 1) {
      return FP_GT;
   }

   /* at most one digit in use: compare it with b directly */
   low = a->dp[0];
   if (low == b) {
      return FP_EQ;
   }
   return (low > b) ? FP_GT : FP_LT;
}

31
fp_cmp_mag.c Normal file
View File

@ -0,0 +1,31 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Compares the magnitudes of a and b, ignoring their signs.
 * Returns FP_GT, FP_LT or FP_EQ. */
int fp_cmp_mag(fp_int *a, fp_int *b)
{
   int i;

   /* the operand with more digits in use is the larger one */
   if (a->used != b->used) {
      return (a->used > b->used) ? FP_GT : FP_LT;
   }

   /* same length: the first differing digit from the top decides */
   i = a->used;
   while (i-- > 0) {
      if (a->dp[i] != b->dp[i]) {
         return (a->dp[i] > b->dp[i]) ? FP_GT : FP_LT;
      }
   }
   return FP_EQ;
}

42
fp_cnt_lsb.c Normal file
View File

@ -0,0 +1,42 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* lnz[v] = number of trailing zero bits in the 4-bit value v
 * (the entry for v == 0 is 4: the whole nibble is zero) */
static const int lnz[16] = {
   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};

/* Counts the zero bits below the least significant one bit of a.
 * Returns 0 when fp_iszero(a) reports that a is zero. */
int fp_cnt_lsb(fp_int *a)
{
   int digit, bits;
   fp_digit word, nibble;

   /* easy out */
   if (fp_iszero(a) == 1) {
      return 0;
   }

   /* skip the low digits that are entirely zero */
   digit = 0;
   while (digit < a->used && a->dp[digit] == 0) {
      ++digit;
   }
   word = a->dp[digit];
   bits = digit * DIGIT_BIT;

   /* lowest bit already set: nothing more to count in this digit */
   if ((word & 1) != 0) {
      return bits;
   }

   /* consume the digit four bits at a time until a non-zero nibble is seen */
   do {
      nibble = word & 15;
      bits  += lnz[nibble];
      word >>= 4;
   } while (nibble == 0);

   return bits;
}

32
fp_count_bits.c Normal file
View File

@ -0,0 +1,32 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Returns the number of bits needed to represent the magnitude of a,
 * or 0 when a has no digits in use. */
int fp_count_bits (fp_int * a)
{
   int bits;
   fp_digit top;

   /* shortcut */
   if (a->used == 0) {
      return 0;
   }

   /* every digit below the top one contributes DIGIT_BIT bits */
   top  = a->dp[a->used - 1];
   bits = (a->used - 1) * DIGIT_BIT;

   /* plus the significant bits of the top digit */
   for (; top > ((fp_digit) 0); top >>= 1) {
      ++bits;
   }
   return bits;
}

153
fp_div.c Normal file
View File

@ -0,0 +1,153 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* a/b => cb + d == a */
/* Integer division with remainder.
 *
 * a   dividend
 * b   divisor
 * c   receives the quotient; may be NULL when it is not wanted
 * d   receives the remainder; may be NULL when it is not wanted
 *
 * Returns FP_VAL when b is zero, FP_OKAY otherwise.
 *
 * The quotient is negative when a and b have different signs; a non-zero
 * remainder takes the sign of a.  The step numbers in the comments below
 * ("step 3", "step 3.1", ...) and the mention of "hac" refer to the reference
 * algorithm this routine follows.
 */
int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
{
fp_int q, x, y, t1, t2;
int n, t, i, norm, neg;
/* is divisor zero ? */
if (fp_iszero (b) == 1) {
return FP_VAL;
}
/* if a < b then q=0, r = a */
if (fp_cmp_mag (a, b) == FP_LT) {
if (d != NULL) {
fp_copy (a, d);
}
if (c != NULL) {
fp_zero (c);
}
return FP_OKAY;
}
/* q accumulates the quotient digits; x and y are working copies of a and b */
fp_init(&q);
/* NOTE(review): nothing visible here checks a->used + 2 against FP_SIZE -- confirm */
q.used = a->used + 2;
fp_init(&t1);
fp_init(&t2);
fp_init_copy(&x, a);
fp_init_copy(&y, b);
/* fix the sign */
/* the quotient sign is remembered in neg; the division itself runs on magnitudes */
neg = (a->sign == b->sign) ? FP_ZPOS : FP_NEG;
x.sign = y.sign = FP_ZPOS;
/* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
/* both are shifted left by the same amount so that the bit count of y becomes
 * congruent to DIGIT_BIT-1 modulo DIGIT_BIT; norm records the shift so the
 * remainder can be shifted back at the end */
norm = fp_count_bits(&y) % DIGIT_BIT;
if (norm < (int)(DIGIT_BIT-1)) {
norm = (DIGIT_BIT-1) - norm;
fp_mul_2d (&x, norm, &x);
fp_mul_2d (&y, norm, &y);
} else {
norm = 0;
}
/* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
n = x.used - 1;
t = y.used - 1;
/* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
fp_lshd (&y, n - t); /* y = y*b**{n-t} */
while (fp_cmp (&x, &y) != FP_LT) {
++(q.dp[n - t]);
fp_sub (&x, &y, &x);
}
/* reset y by shifting it back down */
fp_rshd (&y, n - t);
/* step 3. for i from n down to (t + 1) */
for (i = n; i >= (t + 1); i--) {
if (i > x.used) {
continue;
}
/* step 3.1 if xi == yt then set q{i-t-1} to b-1,
* otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
if (x.dp[i] == y.dp[t]) {
q.dp[i - t - 1] = ((((fp_word)1) << DIGIT_BIT) - 1);
} else {
/* two-digit by one-digit estimate, computed in the wider fp_word type */
fp_word tmp;
tmp = ((fp_word) x.dp[i]) << ((fp_word) DIGIT_BIT);
tmp |= ((fp_word) x.dp[i - 1]);
tmp /= ((fp_word) y.dp[t]);
q.dp[i - t - 1] = (fp_digit) (tmp);
}
/* while (q{i-t-1} * (yt * b + y{t-1})) >
xi * b**2 + xi-1 * b + xi-2
do q{i-t-1} -= 1;
*/
/* the estimate is bumped up by one first because the do-loop below
 * decrements it before every comparison, including the first */
q.dp[i - t - 1] = (q.dp[i - t - 1] + 1);
do {
q.dp[i - t - 1] = (q.dp[i - t - 1] - 1);
/* find left hand */
fp_zero (&t1);
t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
t1.dp[1] = y.dp[t];
t1.used = 2;
fp_mul_d (&t1, q.dp[i - t - 1], &t1);
/* find right hand */
t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
t2.dp[2] = x.dp[i];
t2.used = 3;
} while (fp_cmp_mag(&t1, &t2) == FP_GT);
/* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
fp_mul_d (&y, q.dp[i - t - 1], &t1);
fp_lshd (&t1, i - t - 1);
fp_sub (&x, &t1, &x);
/* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
if (x.sign == FP_NEG) {
fp_copy (&y, &t1);
fp_lshd (&t1, i - t - 1);
fp_add (&x, &t1, &x);
q.dp[i - t - 1] = q.dp[i - t - 1] - 1;
}
}
/* now q is the quotient and x is the remainder
* [which we have to normalize]
*/
/* get sign before writing to c */
/* (c may be the same object as a, so a->sign has to be read first) */
x.sign = x.used == 0 ? FP_ZPOS : a->sign;
if (c != NULL) {
fp_clamp (&q);
fp_copy (&q, c);
c->sign = neg;
}
if (d != NULL) {
/* undo the normalization shift applied to x above */
fp_div_2d (&x, norm, &x, NULL);
/* the following is a kludge, essentially we were seeing the right remainder but
with excess digits that should have been zero
*/
for (i = b->used; i < x.used; i++) {
x.dp[i] = 0;
}
fp_clamp(&x);
fp_copy (&x, d);
}
return FP_OKAY;
}

49
fp_div_2.c Normal file
View File

@ -0,0 +1,49 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Halve a bignum: b = a / 2, i.e. shift the magnitude right by one bit.
 *
 * a - input value
 * b - receives the result; may be the same object as a
 *
 * The sign of a is copied to b and b is clamped afterwards.
 */
void fp_div_2(fp_int * a, fp_int * b)
{
  fp_digit carry_in, dropped;
  int      idx, prev_used;

  /* remember how many digits b held so leftovers can be cleared */
  prev_used = b->used;
  b->used   = a->used;

  /* walk from the most significant digit down; the bit shifted out of
   * each digit becomes the top bit of the next lower digit
   */
  carry_in = 0;
  idx      = b->used;
  while (idx-- > 0) {
    dropped    = a->dp[idx] & 1;
    b->dp[idx] = (a->dp[idx] >> 1) | (carry_in << (DIGIT_BIT - 1));
    carry_in   = dropped;
  }

  /* wipe any digits b used before that are now above the result */
  for (idx = b->used; idx < prev_used; idx++) {
    b->dp[idx] = 0;
  }

  b->sign = a->sign;
  fp_clamp (b);
}

75
fp_div_2d.c Normal file
View File

@ -0,0 +1,75 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Divide by a power of two: c = a / 2**b, optionally d = a mod 2**b.
 *
 * a - input value
 * b - shift count in bits; a count <= 0 copies a to c and zeroes d
 * c - receives the quotient
 * d - receives the remainder, or NULL if it is not wanted
 */
void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d)
{
  fp_int   rem;
  fp_digit nbits, low_mask, up_shift, carry, saved;
  int      i;

  /* nothing to shift */
  if (b <= 0) {
    fp_copy (a, c);
    if (d != NULL) {
      fp_zero (d);
    }
    return;
  }

  /* the remainder is computed first, into a temporary, so that it is
   * still correct when the outputs alias the input
   */
  fp_init(&rem);
  if (d != NULL) {
    fp_mod_2d (a, b, &rem);
  }

  /* whole-digit part of the shift */
  fp_copy(a, c);
  if (b >= (int)DIGIT_BIT) {
    fp_rshd (c, b / DIGIT_BIT);
  }

  /* remaining sub-digit part of the shift */
  nbits = (fp_digit) (b % DIGIT_BIT);
  if (nbits != 0) {
    low_mask = (((fp_digit)1) << nbits) - 1;   /* bits that fall out of a digit */
    up_shift = DIGIT_BIT - nbits;              /* where they land in the digit below */
    carry    = 0;
    for (i = c->used - 1; i >= 0; i--) {
      saved    = c->dp[i] & low_mask;
      c->dp[i] = (c->dp[i] >> nbits) | (carry << up_shift);
      carry    = saved;
    }
  }
  fp_clamp (c);

  if (d != NULL) {
    fp_copy (&rem, d);
  }
}

89
fp_div_d.c Normal file
View File

@ -0,0 +1,89 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Test whether b equals 2**k for some k in 1 .. DIGIT_BIT-1.
 *
 * On success stores k in *p and returns 1; otherwise returns 0 and leaves
 * *p untouched.  Note that b == 1 (k == 0) is deliberately not matched.
 */
static int s_is_power_of_two(fp_digit b, int *p)
{
  int shift = 1;

  while (shift < DIGIT_BIT) {
    if ((((fp_digit)1) << shift) == b) {
      *p = shift;
      return 1;
    }
    ++shift;
  }
  return 0;
}
/* Divide a bignum by a single digit so that c*b + d == a.
 *
 * a - dividend
 * b - single-digit divisor
 * c - receives the quotient, or NULL if it is not wanted
 * d - receives the remainder digit, or NULL if it is not wanted
 *
 * Returns FP_VAL when b is zero, FP_OKAY otherwise.
 */
int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
{
  fp_int   quot;
  fp_word  acc;
  fp_digit qd;
  int      pos;

  /* division by zero is an error */
  if (b == 0) {
    return FP_VAL;
  }

  /* dividing by one, or dividing zero: quotient is a, remainder is 0 */
  if (b == 1 || fp_iszero(a) == 1) {
    if (d != NULL) {
      *d = 0;
    }
    if (c != NULL) {
      fp_copy(a, c);
    }
    return FP_OKAY;
  }

  /* divisor is 2**pos: mask for the remainder, shift for the quotient */
  if (s_is_power_of_two(b, &pos) == 1) {
    if (d != NULL) {
      *d = a->dp[0] & ((((fp_digit)1)<<pos) - 1);
    }
    if (c != NULL) {
      fp_div_2d(a, pos, c, NULL);
    }
    return FP_OKAY;
  }

  /* schoolbook long division, one digit at a time from the top;
   * acc carries the running remainder into the next digit
   */
  fp_init(&quot);
  quot.used = a->used;
  quot.sign = a->sign;

  acc = 0;
  pos = a->used;
  while (--pos >= 0) {
    acc = (acc << ((fp_word)DIGIT_BIT)) | ((fp_word)a->dp[pos]);
    qd  = 0;
    if (acc >= b) {
      qd   = (fp_digit)(acc / b);
      acc -= ((fp_word)qd) * ((fp_word)b);
    }
    quot.dp[pos] = (fp_digit)qd;
  }

  if (d != NULL) {
    *d = (fp_digit)acc;
  }
  if (c != NULL) {
    fp_clamp(&quot);
    fp_copy(&quot, c);
  }
  return FP_OKAY;
}

170
fp_exptmod.c Normal file
View File

@ -0,0 +1,170 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Modular exponentiation: Y = G**X (mod P)
 *
 * G - base
 * X - exponent
 * P - modulus
 * Y - receives the result
 *
 * Uses a sliding window over the bits of X with all intermediate values kept
 * in Montgomery form.  Returns FP_OKAY, or the error code handed back by
 * fp_montgomery_setup (that routine, as defined in this library, rejects a
 * modulus whose low digit is even).
 *
 * Some restrictions... x must be positive and < b  [original author's note;
 * neither condition is checked here]
 */
int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
{
/* M holds the window table; 64 entries covers the largest window (2**6) */
fp_int M[64], res;
fp_digit buf, mp;
int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
/* find window size from the bit length of the exponent */
x = fp_count_bits (X);
if (x <= 7) {
winsize = 2;
} else if (x <= 36) {
winsize = 3;
} else if (x <= 140) {
winsize = 4;
} else if (x <= 450) {
winsize = 5;
} else {
winsize = 6;
}
/* init M array (only the 2**winsize entries that can be indexed) */
memset(M, 0, sizeof(fp_int)*(1<<winsize));
/* now setup montgomery */
if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
return err;
}
/* setup result */
fp_init(&res);
/* create M table
*
* The M table contains powers of the input base, e.g. M[x] = G^x mod P
* (held in Montgomery form).
*
* The first half of the table is not computed, except for M[1]; M[0] stays
* zero from the memset above.
*/
/* now we need R mod m; res starts out as the Montgomery form of 1 */
fp_montgomery_calc_normalization (&res, P);
/* now set M[1] to G * R mod m */
if (fp_cmp_mag(P, G) != FP_GT) {
/* |G| >= |P| so we reduce it first */
fp_mod(G, P, &M[1]);
} else {
fp_copy(G, &M[1]);
}
fp_mulmod (&M[1], &res, P, &M[1]);
/* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
fp_copy (&M[1], &M[1 << (winsize - 1)]);
for (x = 0; x < (winsize - 1); x++) {
fp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)]);
fp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp);
}
/* create upper table: each entry is the previous one times M[1] */
for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
fp_mul(&M[x - 1], &M[1], &M[x]);
fp_montgomery_reduce(&M[x], P, mp);
}
/* set initial mode and bit cnt
*
* mode 0: still skipping leading zero bits of the exponent
* mode 1: between windows (a zero bit just costs one squaring)
* mode 2: currently collecting bits into a window
*/
mode = 0;
bitcnt = 1;
buf = 0;
digidx = X->used - 1;
bitcpy = 0;
bitbuf = 0;
for (;;) {
/* grab next digit as required */
if (--bitcnt == 0) {
/* if digidx == -1 we are out of digits so break */
if (digidx == -1) {
break;
}
/* read next digit and reset bitcnt */
buf = X->dp[digidx--];
bitcnt = (int)DIGIT_BIT;
}
/* grab the next msb from the exponent */
y = (fp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
buf <<= (fp_digit)1;
/* if the bit is zero and mode == 0 then we ignore it
* These represent the leading zero bits before the first 1 bit
* in the exponent. Technically this opt is not required but it
* does lower the # of trivial squaring/reductions used
*/
if (mode == 0 && y == 0) {
continue;
}
/* if the bit is zero and mode == 1 then we square */
if (mode == 1 && y == 0) {
fp_sqr(&res, &res);
fp_montgomery_reduce(&res, P, mp);
continue;
}
/* else we add it to the window (bits fill the window from the top down) */
bitbuf |= (y << (winsize - ++bitcpy));
mode = 2;
if (bitcpy == winsize) {
/* ok window is filled so square as required and multiply */
/* square first */
for (x = 0; x < winsize; x++) {
fp_sqr(&res, &res);
fp_montgomery_reduce(&res, P, mp);
}
/* then multiply by the table entry selected by the window */
fp_mul(&res, &M[bitbuf], &res);
fp_montgomery_reduce(&res, P, mp);
/* empty window and reset */
bitcpy = 0;
bitbuf = 0;
mode = 1;
}
}
/* if bits remain in a partially filled window then square/multiply
* them one bit at a time
*/
if (mode == 2 && bitcpy > 0) {
/* square then multiply if the bit is set */
for (x = 0; x < bitcpy; x++) {
fp_sqr(&res, &res);
fp_montgomery_reduce(&res, P, mp);
/* get next bit of the window */
bitbuf <<= 1;
if ((bitbuf & (1 << winsize)) != 0) {
/* then multiply */
fp_mul(&res, &M[1], &res);
fp_montgomery_reduce(&res, P, mp);
}
}
}
/* fixup result if Montgomery reduction is used
* recall that any value in a Montgomery system is
* actually multiplied by R mod n. So we have
* to reduce one more time to cancel out the factor
* of R.
*/
fp_montgomery_reduce(&res, P, mp);
/* copy res into Y */
fp_copy (&res, Y);
return FP_OKAY;
}

51
fp_gcd.c Normal file
View File

@ -0,0 +1,51 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Greatest common divisor: c = (a, b)
 *
 * a, b - inputs (either sign)
 * c    - receives the gcd, which is always non-negative
 *
 * gcd(0, 0) is defined here as 0; gcd(x, 0) is |x|.
 */
void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
{
  fp_int u, v, r;

  /* if exactly one input is zero the gcd is the magnitude of the other */
  if (fp_iszero (a) == 1 && fp_iszero (b) == 0) {
    fp_abs (b, c);
    return;
  }
  if (fp_iszero (a) == 0 && fp_iszero (b) == 1) {
    fp_abs (a, c);
    return;
  }

  /* optimized.  At this point if a == 0 then
   * b must equal zero too
   */
  if (fp_iszero (a) == 1) {
    fp_zero(c);
    return;
  }

  /* sort inputs so that |u| >= |v| */
  if (fp_cmp_mag(a, b) != FP_LT) {
    fp_init_copy(&u, a);
    fp_init_copy(&v, b);
  } else {
    fp_init_copy(&u, b);
    fp_init_copy(&v, a);
  }

  /* Work on magnitudes only.  The remainder produced by fp_div carries the
   * sign of the dividend, so leaving the input signs in place could yield a
   * negative gcd, disagreeing with the fp_abs() results returned above.
   */
  u.sign = FP_ZPOS;
  v.sign = FP_ZPOS;

  /* Euclid: (u, v) <- (v, u mod v) until v vanishes */
  fp_zero(&r);
  while (fp_iszero(&v) == FP_NO) {
    fp_mod(&u, &v, &r);
    fp_copy(&v, &u);
    fp_copy(&r, &v);
  }
  fp_copy(&u, c);
}

98
fp_invmod.c Normal file
View File

@ -0,0 +1,98 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Modular inverse: c = 1/a (mod b), for odd b only.
 *
 * a - value to invert (its magnitude is inverted; see the sign note below)
 * b - modulus, must be odd
 * c - receives the inverse
 *
 * Returns FP_VAL if b is even or if no inverse exists (gcd(|a|, b) != 1),
 * FP_OKAY otherwise.
 *
 * This is the binary extended Euclidean algorithm specialised for an odd
 * modulus, which lets it track only the B and D coefficients.  The step
 * numbers in the comments are those used by the original author.
 *
 * Sign note: as in the original code, a non-zero result is given the sign of
 * a, so for negative a the value returned is minus the inverse of |a|.
 */
int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
{
  fp_int  x, y, u, v, B, D;
  int     neg;

  /* 2. [modified] b must be odd */
  if (fp_iseven (b) == FP_YES) {
    return FP_VAL;
  }

  /* init all our temps */
  fp_init(&x);  fp_init(&y);
  fp_init(&u);  fp_init(&v);
  fp_init(&B);  fp_init(&D);

  /* x == modulus, y == value to invert */
  fp_copy(b, &x);

  /* we need y = |a| */
  fp_abs(a, &y);

  /* 3. u=x, v=y, A=1, B=0, C=0, D=1 */
  fp_copy(&x, &u);
  fp_copy(&y, &v);
  fp_set (&D, 1);

  do {
    /* 4.  while u is even do */
    while (fp_iseven (&u) == FP_YES) {
      /* 4.1 u = u/2 */
      fp_div_2 (&u, &u);

      /* 4.2 if B is odd then B = B - x (makes it even) */
      if (fp_isodd (&B) == FP_YES) {
        fp_sub (&B, &x, &B);
      }
      /* B = B/2 */
      fp_div_2 (&B, &B);
    }

    /* 5.  while v is even do */
    while (fp_iseven (&v) == FP_YES) {
      /* 5.1 v = v/2 */
      fp_div_2 (&v, &v);

      /* 5.2 if D is odd then D = D - x (makes it even) */
      if (fp_isodd (&D) == FP_YES) {
        fp_sub (&D, &x, &D);
      }
      /* D = D/2 */
      fp_div_2 (&D, &D);
    }

    /* 6.  if u >= v then */
    if (fp_cmp (&u, &v) != FP_LT) {
      /* u = u - v, B = B - D */
      fp_sub (&u, &v, &u);
      fp_sub (&B, &D, &B);
    } else {
      /* v = v - u, D = D - B */
      fp_sub (&v, &u, &v);
      fp_sub (&D, &B, &D);
    }

    /* if not zero go back to step 4 */
  } while (fp_iszero (&u) == FP_NO);

  /* now D is the candidate inverse and v holds the gcd */

  /* if v != 1 then there is no inverse */
  if (fp_cmp_d (&v, 1) != FP_EQ) {
    return FP_VAL;
  }

  /* D is now the inverse, but not necessarily in the range [0, b) */
  neg = a->sign;

  /* too small: lift a negative D up into range */
  while (D.sign == FP_NEG) {
    fp_add (&D, b, &D);
  }

  /* too big: B - D and D - B above are never reduced, so D may also have
   * grown to b or beyond.  Only attempted for a positive modulus, where
   * the subtraction is guaranteed to make progress.
   */
  while (b->sign == FP_ZPOS && fp_cmp_mag (&D, b) != FP_LT) {
    fp_sub (&D, b, &D);
  }

  fp_copy (&D, c);

  /* never hand back a negative zero (possible when b == 1) */
  c->sign = (c->used == 0) ? FP_ZPOS : neg;
  return FP_OKAY;
}

74
fp_isprime.c Normal file
View File

@ -0,0 +1,74 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* a few primes
 *
 * 256 small primes in ascending order, from 0x0002 up to 0x0653 (1619),
 * intended to be the first 256 primes.  fp_isprime uses the whole table for
 * trial division and the first eight entries as Miller-Rabin bases.
 */
static const fp_digit primes[256] = {
0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
};
/* Probabilistic primality test.
 *
 * a - candidate
 *
 * Returns FP_YES if a is probably prime, FP_NO if it is definitely not.
 * The candidate is first trial-divided by every entry of the primes[] table
 * and then put through eight Miller-Rabin rounds whose bases are the first
 * eight table entries.
 */
int fp_isprime(fp_int *a)
{
  fp_int   b;
  fp_digit d;
  int      r, res;

  /* zero, one and negative values are not prime */
  if (fp_cmp_d(a, 1) != FP_GT) {
    return FP_NO;
  }

  /* do trial division */
  for (r = 0; r < 256; r++) {
    fp_mod_d(a, primes[r], &d);
    if (d == 0) {
      /* Divisible by a table prime: composite, unless a IS that prime.
       * Without this check every prime up to the last table entry would
       * be reported as composite.
       */
      return (fp_cmp_d(a, primes[r]) == FP_EQ) ? FP_YES : FP_NO;
    }
  }

  /* now do 8 miller rabins */
  for (r = 0; r < 8; r++) {
    fp_set(&b, primes[r]);
    fp_prime_miller_rabin(a, &b, &res);
    if (res == FP_NO) {
      return FP_NO;
    }
  }
  return FP_YES;
}

27
fp_lcm.c Normal file
View File

@ -0,0 +1,27 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Least common multiple: c = [a, b]
 *
 * a, b - inputs
 * c    - receives the lcm, which is always non-negative
 *
 * Computed as (larger / gcd) * smaller so the intermediate quotient is as
 * small as possible.  Return codes of fp_div are not checked, as in the
 * original.
 */
void fp_lcm(fp_int *a, fp_int *b, fp_int *c)
{
  fp_int  t1, t2;

  fp_init(&t1);
  fp_init(&t2);

  /* t1 = gcd(a, b) */
  fp_gcd(a, b, &t1);

  /* divide the input of larger magnitude by the gcd, multiply by the other */
  if (fp_cmp_mag(a, b) == FP_GT) {
    fp_div(a, &t1, &t2, NULL);
    fp_mul(b, &t2, c);
  } else {
    fp_div(b, &t1, &t2, NULL);
    fp_mul(a, &t2, c);
  }

  /* an lcm is non-negative by definition; the product above inherits the
   * signs of the inputs, so fix the sign here
   */
  c->sign = FP_ZPOS;
}

34
fp_lshd.c Normal file
View File

@ -0,0 +1,34 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Shift a left by x whole digits, in place (a = a * B**x for digit base B).
 *
 * a - value to shift
 * x - number of digits to shift by; a negative count is ignored
 *
 * Digits pushed beyond the fixed FP_SIZE capacity are silently dropped.
 */
void fp_lshd(fp_int *a, int x)
{
  int y;

  /* A negative count would make the move loop below run down to y == x and
   * store to a->dp[y] with y < 0, i.e. in front of the digit array.
   */
  if (x < 0) {
    return;
  }

  /* move up and truncate as required */
  y = MIN(a->used + x - 1, (int)(FP_SIZE-1));

  /* store new size */
  a->used = y + 1;

  /* move digits, highest first so nothing is overwritten before it is read */
  for (; y >= x; y--) {
    a->dp[y] = a->dp[y-x];
  }

  /* zero lower digits */
  for (; y >= 0; y--) {
    a->dp[y] = 0;
  }

  /* clamp digits */
  fp_clamp(a);
}

18
fp_mod.c Normal file
View File

@ -0,0 +1,18 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* c = a mod b, 0 <= c < b
 *
 * a - dividend
 * b - modulus
 * c - receives the residue
 *
 * Returns whatever fp_div returns on failure, FP_OKAY otherwise.
 *
 * The remainder produced by fp_div takes the sign of the dividend, so for a
 * negative a it lies in (-b, 0].  To honour the documented range a non-zero
 * remainder whose sign differs from that of b is moved into range by adding
 * b once.
 */
int fp_mod(fp_int *a, fp_int *b, fp_int *c)
{
  fp_int t;
  int    err;

  fp_init(&t);
  if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
    return err;
  }

  if (t.used != 0 && t.sign != b->sign) {
    fp_add(&t, b, c);
  } else {
    fp_copy(&t, c);
  }
  return FP_OKAY;
}

38
fp_mod_2d.c Normal file
View File

@ -0,0 +1,38 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* c = a mod 2**b
 *
 * a - input value
 * b - number of low bits to keep; b <= 0 yields zero
 * c - receives the low b bits of a (sign copied from a by fp_copy)
 */
void fp_mod_2d(fp_int *a, int b, fp_int *c)
{
  int x, bits;

  /* zero if count less than or equal to zero */
  if (b <= 0) {
    fp_zero(c);
    return;
  }

  /* get copy of input */
  fp_copy(a, c);

  /* if 2**b is larger than a then a is already the answer */
  if (b >= (DIGIT_BIT * a->used)) {
    return;
  }

  /* zero digits above the last digit of the modulus */
  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
    c->dp[x] = 0;
  }

  /* Clear the upper part of the digit that straddles the 2**b boundary.
   * Only the bit offset inside that digit matters; the previous expression
   * shifted by (DIGIT_BIT - b), which is a negative shift count - undefined
   * behaviour - as soon as b exceeds DIGIT_BIT.  When b is an exact multiple
   * of DIGIT_BIT the boundary digit was already zeroed by the loop above.
   */
  bits = (int)(b % DIGIT_BIT);
  if (bits != 0) {
    c->dp[b / DIGIT_BIT] &= (((fp_digit)1) << bits) - 1;
  }
  fp_clamp (c);
}

16
fp_mod_d.c Normal file
View File

@ -0,0 +1,16 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Single-digit modulus: *c = a mod b, 0 <= *c < b.
 *
 * Thin wrapper around fp_div_d that discards the quotient; the status code
 * of fp_div_d is passed straight through.
 */
int fp_mod_d(fp_int *a, fp_digit b, fp_digit *c)
{
  int status;

  status = fp_div_d(a, b, NULL, c);
  return status;
}

View File

@ -0,0 +1,38 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Montgomery normalization constant: a = B**n mod b, where B is the digit
 * base and n the number of digits in b.  Uses only shifts, compares and
 * subtractions - no division or multiplication.
 *
 * a - receives the result
 * b - modulus
 */
void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
{
  int top_bits, doublings;

  /* how many bits of its last digit does b use */
  top_bits = fp_count_bits (b) % DIGIT_BIT;

  /* start from a power of two just below b ... */
  if (b->used > 1) {
    /* a = B^(n-1) * 2^(top_bits-1) */
    fp_2expt (a, (b->used - 1) * DIGIT_BIT + top_bits - 1);
  } else {
    fp_set(a, 1);
    ++top_bits;
  }

  /* ... then double the rest of the way up to B**n, subtracting b whenever
   * the running value reaches it
   */
  for (doublings = (int)DIGIT_BIT - (top_bits - 1); doublings > 0; doublings--) {
    fp_mul_2 (a, a);
    if (fp_cmp_mag (a, b) != FP_LT) {
      s_fp_sub (a, b, a);
    }
  }
}

249
fp_montgomery_reduce.c Normal file
View File

@ -0,0 +1,249 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Building blocks for fp_montgomery_reduce.  Every variant supplies the same
 * five macros, which expect these names in the surrounding scope:
 *
 *   c, x, mp, mu  - work buffer, round index, Montgomery constant, and the
 *                   per-round multiplier
 *   _c            - cursor into c
 *   tmpm          - cursor into the modulus digits
 *   t             - fp_word scratch (ISO C variant only)
 *
 *   MONT_START / MONT_FINI - one-time setup / teardown
 *   LOOP_START             - compute this round's multiplier from c[x] and mp
 *   INNERMUL               - (_c[OFF2]:_c[OFF1]:_c[OFF0]) += mu * *tmpm++
 *   PROPCARRY              - _c[OFF0+1] += _c[OFF1]; _c[OFF1+1] += _c[OFF2]
 *                            plus carry; final carry goes into _c[OFF2+1]
 */
#if defined(TFM_X86)
/* x86-32 code */
#define MONT_START
#define MONT_FINI
#define LOOP_START \
   mu = c[x] * mp;
#define INNERMUL \
asm( \
"movl %7,%%eax \n\t" \
"mull %6 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"g"(mu), "g"(*tmpm++) \
: "%eax", "%edx", "%cc");
#define PROPCARRY \
asm( \
"movl %1,%%eax \n\t" \
"addl %%eax,%6 \n\t" \
"movl %2,%%eax \n\t" \
"adcl %%eax,%7 \n\t" \
"adcl $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%eax", "%cc");
#elif defined(TFM_X86_64)
/* x86-64 code */
#define MONT_START
#define MONT_FINI
#define LOOP_START \
   mu = c[x] * mp;
#define INNERMUL \
asm( \
"movq %7,%%rax \n\t" \
"mulq %6 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"g"(mu), "g"(*tmpm++) \
: "%rax", "%rdx", "%cc");
#define PROPCARRY \
asm( \
"movq %1,%%rax \n\t" \
"addq %%rax,%6 \n\t" \
"movq %2,%%rax \n\t" \
"adcq %%rax,%7 \n\t" \
"adcq $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%rax", "%cc");
#elif defined(TFM_SSE2)
/* SSE2 code: mp is parked in mm2 for the whole reduction and the per-round
 * multiplier lives in mm1 instead of the C variable mu
 */
#define MONT_START \
asm("movd %0,%%mm2"::"g"(mp));
#define MONT_FINI \
asm("emms");
#define LOOP_START \
asm(\
"movd %0,%%mm1 \n\t" \
"pmuludq %%mm2,%%mm1 \n\t" \
:: "g"(c[x]), "g"(mp));
#define INNERMUL \
asm( \
"movd %6,%%mm0 \n\t" \
"pmuludq %%mm1,%%mm0 \n\t" \
"movd %%mm0,%%eax \n\t" \
"psrlq $32, %%mm0 \n\t" \
"addl %%eax,%0 \n\t" \
"movd %%mm0,%%eax \n\t" \
"adcl %%eax,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"g"(*tmpm++) \
: "%eax", "%cc");
#define PROPCARRY \
asm( \
"movl %1,%%eax \n\t" \
"addl %%eax,%6 \n\t" \
"movl %2,%%eax \n\t" \
"adcl %%eax,%7 \n\t" \
"adcl $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%eax", "%cc");
#elif defined(TFM_ARM)
/* ARM code */
#define MONT_START
#define MONT_FINI
#define LOOP_START \
   mu = c[x] * mp;
/* NOTE: later write it using two regs instead of three for _c + ... */
#define INNERMUL \
asm( \
"UMULL r0,r1,%0,%1 \n\t" \
"LDR r2,[%2] \n\t" \
"ADDS r2,r2,r0 \n\t" \
"STR r2,[%2] \n\t" \
"LDR r2,[%3] \n\t" \
"ADCS r2,r2,r1 \n\t" \
"STR r2,[%3] \n\t" \
"LDR r2,[%4] \n\t" \
"ADC r2,r2,#0 \n\t" \
"STR r2,[%4] \n\t" \
::"r"(mu),"r"(*tmpm++),"r"(_c + OFF0),"r"(_c + OFF1),"r"(_c + OFF2):"r0", "r1", "r2", "%cc");
#define PROPCARRY \
asm( \
"LDR r0,[%1] \n\t" \
"LDR r1,[%0,#4] \n\t" \
"ADDS r0,r0,r1 \n\t" \
"STR r0,[%0,#4] \n\t" \
"LDR r0,[%2] \n\t" \
"LDR r1,[%1,#4] \n\t" \
"ADCS r0,r0,r1 \n\t" \
"STR r0,[%1,#4] \n\t" \
"LDR r0,[%2,#4] \n\t" \
"ADC r0,r0,#0 \n\t" \
"STR r0,[%2,#4] \n\t" \
::"r"(_c + OFF0),"r"(_c + OFF1),"r"(_c + OFF2):"r0", "r1", "%cc");
#else
/* ISO C code
 *
 * Both macros do their additions in the double-width type fp_word so that
 * every carry is propagated exactly as the add/adc chains of the assembler
 * variants do.  (Bumping _c[OFF1] with ++ and then comparing after a second
 * addition loses the carry into _c[OFF2] whenever the ++ itself wraps.)
 * The last line of INNERMUL must not end in a line continuation, or the
 * following #define becomes part of its body.
 */
#define MONT_START
#define MONT_FINI
#define LOOP_START \
   mu = c[x] * mp;
#define INNERMUL \
   t = ((fp_word)_c[OFF0]) + ((fp_word)mu) * ((fp_word)*tmpm++); \
   _c[OFF0] = (fp_digit)t; \
   t = ((fp_word)_c[OFF1]) + (t >> DIGIT_BIT); \
   _c[OFF1] = (fp_digit)t; \
   _c[OFF2] += (fp_digit)(t >> DIGIT_BIT);
#define PROPCARRY \
   t = ((fp_word)_c[OFF0+1]) + ((fp_word)_c[OFF1]); \
   _c[OFF0+1] = (fp_digit)t; \
   t = ((fp_word)_c[OFF1+1]) + ((fp_word)_c[OFF2]) + (t >> DIGIT_BIT); \
   _c[OFF1+1] = (fp_digit)t; \
   _c[OFF2+1] += (fp_digit)(t >> DIGIT_BIT);
#endif
/* offsets of the three FP_SIZE-digit rows inside the work buffer */
#define OFF0 (0)
#define OFF1 (FP_SIZE)
#define OFF2 (FP_SIZE+FP_SIZE)
/* computes x/R == x (mod N) via Montgomery Reduction
 *
 * a  - value to reduce; overwritten in place with the result
 * m  - modulus; its m->used digits drive both loops below
 * mp - Montgomery constant for m (fp_exptmod obtains it from
 *      fp_montgomery_setup)
 *
 * The work buffer c is used as three rows of FP_SIZE digits each, selected
 * by OFF0/OFF1/OFF2: the running value, first-level carries and second-level
 * carries.  INNERMUL accumulates into the rows, PROPCARRY folds the carry
 * rows one digit upward.
 *
 * NOTE(review): the carry fix-up loop lets PROPCARRY touch row indices up to
 * roughly 2*m->used + 2, and nothing here checks that against FP_SIZE -
 * presumably callers keep the modulus below half the fp_int capacity; confirm.
 */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
int oldused, x, y, pa;
/* scratch for the ISO C macro variants */
fp_word t;
/* now zero the buff */
pa = m->used;
memset(c, 0, sizeof(c));
/* copy the input into the first row */
oldused = a->used;
for (x = 0; x < oldused; x++) {
c[x] = a->dp[x];
}
MONT_START;
/* now let's get bizz-sy! */
for (x = 0; x < pa; x++) {
/* get Mu for this round */
LOOP_START;
/* our friendly neighbourhood alias */
_c = c + x;
tmpm = m->dp;
/* add mu * m into the buffer starting at digit x */
for (y = 0; y < pa; y++) {
INNERMUL;
++_c;
}
/* send carry up man... */
_c = c + x;
PROPCARRY;
}
/* fix the rest of the carries (x continues from pa) */
_c = c + pa;
for (; x < pa * 2 + 2; x++) {
PROPCARRY;
++_c;
}
/* now copy out: the result is the pa+1 digits starting at digit pa,
* i.e. the buffer shifted down by pa digits
*/
_c = c + pa;
tmpm = a->dp;
for (x = 0; x < pa+1; x++) {
*tmpm++ = *_c++;
}
/* clear whatever a held above the new result */
for (; x < oldused; x++) {
*tmpm++ = 0;
}
MONT_FINI;
a->used = pa+1;
fp_clamp(a);
/* if A >= m then A = A - m */
if (fp_cmp_mag (a, m) != FP_LT) {
s_fp_sub (a, m, a);
}
}

44
fp_montgomery_setup.c Normal file
View File

@ -0,0 +1,44 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Sets up Montgomery reduction for modulus a.
 *
 * a   - modulus; its lowest digit must be odd
 * rho - receives -1/a mod 2**DIGIT_BIT
 *
 * Returns FP_VAL if the lowest digit of a is even, FP_OKAY otherwise.
 */
int fp_montgomery_setup(fp_int *a, fp_digit *rho)
{
  fp_digit inv, low;

  /* fast inversion mod 2**k
   *
   * Based on the fact that
   *
   * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
   *                    =>  2*X*A - X*X*A*A = 1
   *                    =>  2*(1) - (1)     = 1
   *
   * so every refinement step below doubles the number of correct low bits.
   */
  low = a->dp[0];

  /* an even value has no inverse modulo a power of two */
  if ((low & 1) == 0) {
    return FP_VAL;
  }

  inv  = (((low + 2) & 4) << 1) + low; /* inv*low == 1 mod 2**4  */
  inv *= 2 - low * inv;                /* inv*low == 1 mod 2**8  */
  inv *= 2 - low * inv;                /* inv*low == 1 mod 2**16 */
  inv *= 2 - low * inv;                /* inv*low == 1 mod 2**32 */
#ifdef FP_64BIT
  inv *= 2 - low * inv;                /* inv*low == 1 mod 2**64 */
#endif

  /* rho = -1/m mod b */
  *rho = (((fp_word) 1 << ((fp_word) DIGIT_BIT)) - ((fp_word)inv));

  return FP_OKAY;
}

134
fp_mul.c Normal file
View File

@ -0,0 +1,134 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* c = a * b
 *
 * A, B - factors
 * C    - receives the product; may alias A or B
 *
 * Small or lopsided operands go to one of the comba multipliers; when the
 * shorter operand has more than 8 digits and the longer more than 64, one
 * level of Karatsuba is applied and the sub-products recurse through fp_mul.
 */
void fp_mul(fp_int *A, fp_int *B, fp_int *C)
{
  int    r, y, yy, s, old_used;
  fp_int ac, bd, comp, amb, cmd, t1, t2;

  /* digits C holds on entry; needed to clear leftovers after the unrolled
   * comba routines, which only write their fixed number of output digits
   */
  old_used = C->used;

  y  = MAX(A->used, B->used);
  yy = MIN(A->used, B->used);
  if (yy <= 8 || y <= 64) {
    /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
       of the largest input.  We also want to avoid doing excess mults if the
       inputs are not close to the next power of two.  That is, for example,
       if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications
    */
    if (y <= 4) {
      fp_mul_comba4(A,B,C);
    } else if (y <= 8) {
      fp_mul_comba8(A,B,C);
    } else if (y <= 16 && y >= 12) {
      fp_mul_comba16(A,B,C);
#ifdef TFM_HUGE
    } else if (y <= 32 && y >= 28) {
      fp_mul_comba32(A,B,C);
#endif
    } else {
      fp_mul_comba(A,B,C);
    }

    /* pad the top of C with zeroes: if C previously held a longer value,
     * its old upper digits would otherwise survive above C->used
     */
    for (s = C->used; s < old_used; s++) {
      C->dp[s] = 0;
    }
  } else {
    /* do the karatsuba action

       if A = ab and B = cd for ||a|| = r we need to solve

       ac*r^2 + (-(a-b)(c-d) + ac + bd)*r + bd

       So we solve for the three products then we form the final result with careful shifting
       and addition.

       Obvious points of optimization
       - "ac" parts can be memcpy'ed with an offset [all you have to do is zero upto the next 8 digits]
       - Similarly the "bd" parts can be memcpy'ed and zeroed to 8
    */
    /* get our value of r */
    r = yy >> 1;

    /* now solve for ac */
    /* t1 = upper part of A; same as fp_copy(A, &t1); fp_rshd(&t1, r); */
    for (s = 0; s < A->used - r; s++) {
      t1.dp[s] = A->dp[s+r];
    }
    for (; s < FP_SIZE; s++) {
      t1.dp[s] = 0;
    }
    if (A->used >= r) {
      t1.used = A->used - r;
    } else {
      t1.used = 0;
    }
    t1.sign = A->sign;

    /* t2 = upper part of B; same as fp_copy(B, &t2); fp_rshd(&t2, r); */
    for (s = 0; s < B->used - r; s++) {
      t2.dp[s] = B->dp[s+r];
    }
    for (; s < FP_SIZE; s++) {
      t2.dp[s] = 0;
    }
    if (B->used >= r) {
      t2.used = B->used - r;
    } else {
      t2.used = 0;
    }
    t2.sign = B->sign;

    /* keep copies of the upper parts for the (a-b) and (c-d) terms */
    fp_copy(&t1, &amb); fp_copy(&t2, &cmd);
    fp_zero(&ac);
    fp_mul(&t1, &t2, &ac);

    /* now solve for bd */
    /* t1, t2 = lower r digits of A, B; same as fp_mod_2d(X, r * DIGIT_BIT, &tN) */
    for (s = 0; s < r; s++) {
      t1.dp[s] = A->dp[s];
      t2.dp[s] = B->dp[s];
    }
    for (; s < FP_SIZE; s++) {
      t1.dp[s] = 0;
      t2.dp[s] = 0;
    }
    t1.used = r;
    t2.used = r;
    fp_clamp(&t1);
    fp_clamp(&t2);

    fp_sub(&amb, &t1, &amb); fp_sub(&cmd, &t2, &cmd);
    fp_zero(&bd);
    fp_mul(&t1, &t2, &bd);

    /* now get the (a-b)(c-d) term */
    fp_zero(&comp);
    fp_mul(&amb, &cmd, &comp);

    /* now solve the system, do the middle term first */
    comp.sign ^= 1;
    fp_add(&comp, &ac, &comp);
    fp_add(&comp, &bd, &comp);
    fp_lshd(&comp, r);

    /* leading term */
    fp_lshd(&ac, r+r);

    /* now sum them together; the sign is read before C is cleared because
     * C may be the same object as A or B
     */
    s = A->sign ^ B->sign;
    fp_zero(C);
    fp_add(&ac, &comp, C);
    fp_add(&bd, C, C);
    C->sign = C->used ? s : FP_ZPOS;
  }
}

63
fp_mul_2.c Normal file
View File

@ -0,0 +1,63 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Double a bignum: b = a * 2, i.e. shift the magnitude left by one bit.
 *
 * a - input value
 * b - receives the result; may be the same object as a
 *
 * If the shift carries out of the top digit a new leading digit is appended,
 * unless b already holds FP_SIZE-1 digits, in which case the bit is dropped.
 */
void fp_mul_2(fp_int * a, fp_int * b)
{
  fp_digit carry, next_carry;
  int      i, prev_used;

  prev_used = b->used;
  b->used   = a->used;

  /* low to high: the top bit of each digit moves into the digit above */
  carry = 0;
  for (i = 0; i < a->used; i++) {
    next_carry = a->dp[i] >> ((fp_digit)(DIGIT_BIT - 1));
    b->dp[i]   = ((a->dp[i] << ((fp_digit)1)) | carry);
    carry      = next_carry;
  }

  /* new leading digit?  At this point it can only be 1. */
  if (carry != 0 && b->used != (FP_SIZE-1)) {
    b->dp[b->used] = 1;
    ++(b->used);
  }

  /* clear digits the destination used before but the result does not */
  for (i = b->used; i < prev_used; i++) {
    b->dp[i] = 0;
  }

  b->sign = a->sign;
}

43
fp_mul_2d.c Normal file
View File

@ -0,0 +1,43 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Multiply by a power of two: c = a * 2**b.
 *
 * a - input value
 * b - shift count in bits
 * c - receives the result
 *
 * A carry out of the top digit is stored only while there is room for
 * another digit (fewer than FP_SIZE in use).
 */
void fp_mul_2d(fp_int *a, int b, fp_int *c)
{
  fp_digit  spill, next_spill, down;
  fp_digit *dig;
  int       x;

  /* work on a copy of the input */
  fp_copy(a, c);

  /* whole digits first */
  if (b >= DIGIT_BIT) {
    fp_lshd(c, b/DIGIT_BIT);
  }
  b %= DIGIT_BIT;

  /* then the leftover bits, if any */
  if (b != 0) {
    spill = 0;
    down  = DIGIT_BIT - b;
    dig   = c->dp;
    for (x = 0; x < c->used; x++, dig++) {
      /* bits pushed out of the top of this digit go into the next one */
      next_spill = *dig >> down;
      *dig       = (*dig << b) + spill;
      spill      = next_spill;
    }
    /* store last carry if room */
    if (spill && x < FP_SIZE) {
      c->dp[c->used] = spill;
      c->used       += 1;
    }
  }
  fp_clamp(c);
}

772
fp_mul_comba.c Normal file
View File

@ -0,0 +1,772 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
/* About this file...
*/
#include <tfm.h>
/* these are the combas.  Worship them.
 *
 * Every variant supplies the same seven macros.  They operate on a
 * three-digit accumulator c2:c1:c0 that must be declared by the function
 * using them (the ISO C variant additionally needs an fp_word named t):
 *
 *   COMBA_START / COMBA_FINI - anything needed at the start / end
 *   COMBA_CLEAR              - zero the accumulator
 *   COMBA_FORWARD            - shift the accumulator down one digit
 *   COMBA_STORE(x)           - x = c0
 *   COMBA_STORE2(x)          - x = c1
 *   MULADD(i, j)             - c2:c1:c0 += i * j
 */
#if defined(TFM_X86)
/* Generic x86 optimized code */
/* anything you need at the start */
#define COMBA_START
/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
/* forward the carry to the next digit */
#define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;
/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;
/* anything you need at the end */
#define COMBA_FINI
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#elif defined(TFM_X86_64)
/* x86-64 optimized */
/* anything you need at the start */
#define COMBA_START
/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
/* forward the carry to the next digit */
#define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;
/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;
/* anything you need at the end */
#define COMBA_FINI
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2)
/* use SSE2 optimizations */
/* anything you need at the start */
#define COMBA_START
/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
/* forward the carry to the next digit */
#define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;
/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;
/* anything you need at the end */
#define COMBA_FINI \
asm("emms");
/* this should multiply i and j */
#define MULADD(i, j) \
asm volatile ( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \
"psrlq $32,%%mm0 \n\t" \
"addl %%eax,%0 \n\t" \
"movd %%mm0,%%eax \n\t" \
"adcl %%eax,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%cc");
#elif defined(TFM_ARM)
/* ARM code */
#define COMBA_START
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
#define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
#define COMBA_STORE(x) \
   x = c0;
#define COMBA_STORE2(x) \
   x = c1;
#define COMBA_FINI
#define MULADD(i, j) \
asm( \
" UMULL r0,r1,%6,%7 \n\t" \
" ADDS %0,%0,r0 \n\t" \
" ADCS %1,%1,r1 \n\t" \
" ADC %2, %2, #0 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
#else
/* ISO C code */
#define COMBA_START
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;
#define COMBA_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;
#define COMBA_STORE(x) \
   x = c0;
#define COMBA_STORE2(x) \
   x = c1;
#define COMBA_FINI
/* The sums are formed in the double-width type so that both carries are
 * exact, matching the add/adc/adc chains of the assembler variants.
 * (Incrementing c1 and then testing for overflow after a second addition
 * drops the carry into c2 whenever the increment itself wraps c1 to zero.)
 * Each argument is still evaluated exactly once, which matters because
 * callers pass expressions with side effects such as *tmpx++.
 */
#define MULADD(i, j) \
   t = ((fp_word)c0) + ((fp_word)(i)) * ((fp_word)(j)); c0 = (fp_digit)t; \
   t = ((fp_word)c1) + (t >> DIGIT_BIT);                c1 = (fp_digit)t; \
   c2 += (fp_digit)(t >> DIGIT_BIT);
#endif
/* generic PxQ multiplier
 *
 * Computes C = A * B one output digit (column) at a time: for every output
 * index ix all products A[tx] * B[ty] with tx + ty == ix are summed into the
 * three-digit accumulator c0/c1/c2 before the low digit is stored.
 *
 * A, B - factors
 * C    - receives the product; may alias A or B (a temporary is used then)
 *
 * The output is limited to FP_SIZE-1 digits; anything above is dropped.
 */
void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
{
int ix, iy, iz, tx, ty, pa;
fp_digit c0, c1, c2, *tmpx, *tmpy;
/* scratch for the ISO C MULADD variant */
fp_word t;
fp_int tmp, *dst;
COMBA_START;
COMBA_CLEAR;
/* get size of output and trim */
pa = A->used + B->used;
if (pa >= FP_SIZE) {
pa = FP_SIZE-1;
}
/* write into a temporary when the destination is also an input, so the
* inputs are not overwritten while they are still being read
*/
if (A == C || B == C) {
fp_zero(&tmp);
dst = &tmp;
} else {
fp_zero(C);
dst = C;
}
for (ix = 0; ix < pa; ix++) {
/* get offsets into the two bignums */
ty = MIN(ix, B->used-1);
tx = ix - ty;
/* setup temp aliases; tmpx walks up through A while tmpy walks down B */
tmpx = A->dp + tx;
tmpy = B->dp + ty;
/* this is the number of times the loop will iterate, essentially it is
while (tx++ < a->used && ty-- >= 0) { ... }
*/
iy = MIN(A->used-tx, ty+1);
/* execute loop */
COMBA_FORWARD;
for (iz = 0; iz < iy; ++iz) {
MULADD(*tmpx++, *tmpy--);
}
/* store term */
COMBA_STORE(dst->dp[ix]);
}
/* store final carry (ix == pa here, one past the last column) */
COMBA_STORE2(dst->dp[ix]);
COMBA_FINI;
dst->used = pa;
fp_clamp(dst);
/* a zero product is always given a positive sign */
dst->sign = dst->used ? A->sign ^ B->sign : FP_ZPOS;
fp_copy(dst, C);
}
/* Fully unrolled 4x4-digit comba multiplier: C = A * B.
 *
 * Exactly four digits are read from A->dp and four from B->dp (the ->used
 * fields of the inputs are not consulted) and eight digits are written to
 * C->dp.  Both operands are copied into the local array `at` before any
 * digit of C is stored: A occupies at[0..3] and B occupies at[4..7].
 * NOTE(review): the up-front copy presumably exists so that C may alias A
 * or B -- confirm against the callers.
 */
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
{
/* t is only used as scratch by the ISO C variant of MULADD */
fp_word t;
fp_digit c0, c1, c2, at[8];
memcpy(at, A->dp, 4 * sizeof(fp_digit));
memcpy(at+4, B->dp, 4 * sizeof(fp_digit));
COMBA_START;
COMBA_CLEAR;
/* Each numbered section below is one output column k: it accumulates every
   product at[i] * at[4+j] with i + j == k, stores the low accumulator digit
   into C->dp[k], and COMBA_FORWARD carries the remainder into column k+1. */
/* 0 */
MULADD(at[0], at[4]);
COMBA_STORE(C->dp[0]);
/* 1 */
COMBA_FORWARD;
MULADD(at[0], at[5]); MULADD(at[1], at[4]);
COMBA_STORE(C->dp[1]);
/* 2 */
COMBA_FORWARD;
MULADD(at[0], at[6]); MULADD(at[1], at[5]); MULADD(at[2], at[4]);
COMBA_STORE(C->dp[2]);
/* 3 */
COMBA_FORWARD;
MULADD(at[0], at[7]); MULADD(at[1], at[6]); MULADD(at[2], at[5]); MULADD(at[3], at[4]);
COMBA_STORE(C->dp[3]);
/* 4 */
COMBA_FORWARD;
MULADD(at[1], at[7]); MULADD(at[2], at[6]); MULADD(at[3], at[5]);
COMBA_STORE(C->dp[4]);
/* 5 */
COMBA_FORWARD;
MULADD(at[2], at[7]); MULADD(at[3], at[6]);
COMBA_STORE(C->dp[5]);
/* 6 */
COMBA_FORWARD;
MULADD(at[3], at[7]);
COMBA_STORE(C->dp[6]);
/* top digit: the carry left over after column 6 */
COMBA_STORE2(C->dp[7]);
/* claim all 8 digits, then let fp_clamp (defined elsewhere; presumably it
   trims leading zero digits -- confirm) adjust the length */
C->used = 8;
C->sign = A->sign ^ B->sign;
fp_clamp(C);
COMBA_FINI;
}
/* Fully unrolled 8x8-digit comba multiplier: C = A * B.
 *
 * Exactly eight digits are read from A->dp and eight from B->dp (the ->used
 * fields of the inputs are not consulted) and sixteen digits are written to
 * C->dp.  Both operands are copied into the local array `at` before any
 * digit of C is stored: A occupies at[0..7] and B occupies at[8..15].
 * NOTE(review): the up-front copy presumably exists so that C may alias A
 * or B -- confirm against the callers.
 */
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
{
/* t is only used as scratch by the ISO C variant of MULADD */
fp_word t;
fp_digit c0, c1, c2, at[16];
memcpy(at, A->dp, 8 * sizeof(fp_digit));
memcpy(at+8, B->dp, 8 * sizeof(fp_digit));
COMBA_START;
COMBA_CLEAR;
/* Each numbered section below is one output column k: it accumulates every
   product at[i] * at[8+j] with i + j == k, stores the low accumulator digit
   into C->dp[k], and COMBA_FORWARD carries the remainder into column k+1. */
/* 0 */
MULADD(at[0], at[8]);
COMBA_STORE(C->dp[0]);
/* 1 */
COMBA_FORWARD;
MULADD(at[0], at[9]); MULADD(at[1], at[8]);
COMBA_STORE(C->dp[1]);
/* 2 */
COMBA_FORWARD;
MULADD(at[0], at[10]); MULADD(at[1], at[9]); MULADD(at[2], at[8]);
COMBA_STORE(C->dp[2]);
/* 3 */
COMBA_FORWARD;
MULADD(at[0], at[11]); MULADD(at[1], at[10]); MULADD(at[2], at[9]); MULADD(at[3], at[8]);
COMBA_STORE(C->dp[3]);
/* 4 */
COMBA_FORWARD;
MULADD(at[0], at[12]); MULADD(at[1], at[11]); MULADD(at[2], at[10]); MULADD(at[3], at[9]); MULADD(at[4], at[8]);
COMBA_STORE(C->dp[4]);
/* 5 */
COMBA_FORWARD;
MULADD(at[0], at[13]); MULADD(at[1], at[12]); MULADD(at[2], at[11]); MULADD(at[3], at[10]); MULADD(at[4], at[9]); MULADD(at[5], at[8]);
COMBA_STORE(C->dp[5]);
/* 6 */
COMBA_FORWARD;
MULADD(at[0], at[14]); MULADD(at[1], at[13]); MULADD(at[2], at[12]); MULADD(at[3], at[11]); MULADD(at[4], at[10]); MULADD(at[5], at[9]); MULADD(at[6], at[8]);
COMBA_STORE(C->dp[6]);
/* 7 */
COMBA_FORWARD;
MULADD(at[0], at[15]); MULADD(at[1], at[14]); MULADD(at[2], at[13]); MULADD(at[3], at[12]); MULADD(at[4], at[11]); MULADD(at[5], at[10]); MULADD(at[6], at[9]); MULADD(at[7], at[8]);
COMBA_STORE(C->dp[7]);
/* 8 */
COMBA_FORWARD;
MULADD(at[1], at[15]); MULADD(at[2], at[14]); MULADD(at[3], at[13]); MULADD(at[4], at[12]); MULADD(at[5], at[11]); MULADD(at[6], at[10]); MULADD(at[7], at[9]);
COMBA_STORE(C->dp[8]);
/* 9 */
COMBA_FORWARD;
MULADD(at[2], at[15]); MULADD(at[3], at[14]); MULADD(at[4], at[13]); MULADD(at[5], at[12]); MULADD(at[6], at[11]); MULADD(at[7], at[10]);
COMBA_STORE(C->dp[9]);
/* 10 */
COMBA_FORWARD;
MULADD(at[3], at[15]); MULADD(at[4], at[14]); MULADD(at[5], at[13]); MULADD(at[6], at[12]); MULADD(at[7], at[11]);
COMBA_STORE(C->dp[10]);
/* 11 */
COMBA_FORWARD;
MULADD(at[4], at[15]); MULADD(at[5], at[14]); MULADD(at[6], at[13]); MULADD(at[7], at[12]);
COMBA_STORE(C->dp[11]);
/* 12 */
COMBA_FORWARD;
MULADD(at[5], at[15]); MULADD(at[6], at[14]); MULADD(at[7], at[13]);
COMBA_STORE(C->dp[12]);
/* 13 */
COMBA_FORWARD;
MULADD(at[6], at[15]); MULADD(at[7], at[14]);
COMBA_STORE(C->dp[13]);
/* 14 */
COMBA_FORWARD;
MULADD(at[7], at[15]);
COMBA_STORE(C->dp[14]);
/* top digit: the carry left over after column 14 */
COMBA_STORE2(C->dp[15]);
/* claim all 16 digits, then let fp_clamp (defined elsewhere; presumably it
   trims leading zero digits -- confirm) adjust the length */
C->used = 16;
C->sign = A->sign ^ B->sign;
fp_clamp(C);
COMBA_FINI;
}
/* Fully unrolled 16x16-digit comba multiplier: C = A * B.
 *
 * Exactly sixteen digits are read from A->dp and sixteen from B->dp (the
 * ->used fields of the inputs are not consulted) and thirty-two digits are
 * written to C->dp.  Both operands are copied into the local array `at`
 * before any digit of C is stored: A occupies at[0..15] and B occupies
 * at[16..31].
 * NOTE(review): the up-front copy presumably exists so that C may alias A
 * or B -- confirm against the callers.
 */
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C)
{
/* t is only used as scratch by the ISO C variant of MULADD */
fp_word t;
fp_digit c0, c1, c2, at[32];
memcpy(at, A->dp, 16 * sizeof(fp_digit));
memcpy(at+16, B->dp, 16 * sizeof(fp_digit));
COMBA_START;
COMBA_CLEAR;
/* Each numbered section below is one output column k: it accumulates every
   product at[i] * at[16+j] with i + j == k, stores the low accumulator digit
   into C->dp[k], and COMBA_FORWARD carries the remainder into column k+1. */
/* 0 */
MULADD(at[0], at[16]);
COMBA_STORE(C->dp[0]);
/* 1 */
COMBA_FORWARD;
MULADD(at[0], at[17]); MULADD(at[1], at[16]);
COMBA_STORE(C->dp[1]);
/* 2 */
COMBA_FORWARD;
MULADD(at[0], at[18]); MULADD(at[1], at[17]); MULADD(at[2], at[16]);
COMBA_STORE(C->dp[2]);
/* 3 */
COMBA_FORWARD;
MULADD(at[0], at[19]); MULADD(at[1], at[18]); MULADD(at[2], at[17]); MULADD(at[3], at[16]);
COMBA_STORE(C->dp[3]);
/* 4 */
COMBA_FORWARD;
MULADD(at[0], at[20]); MULADD(at[1], at[19]); MULADD(at[2], at[18]); MULADD(at[3], at[17]); MULADD(at[4], at[16]);
COMBA_STORE(C->dp[4]);
/* 5 */
COMBA_FORWARD;
MULADD(at[0], at[21]); MULADD(at[1], at[20]); MULADD(at[2], at[19]); MULADD(at[3], at[18]); MULADD(at[4], at[17]); MULADD(at[5], at[16]);
COMBA_STORE(C->dp[5]);
/* 6 */
COMBA_FORWARD;
MULADD(at[0], at[22]); MULADD(at[1], at[21]); MULADD(at[2], at[20]); MULADD(at[3], at[19]); MULADD(at[4], at[18]); MULADD(at[5], at[17]); MULADD(at[6], at[16]);
COMBA_STORE(C->dp[6]);
/* 7 */
COMBA_FORWARD;
MULADD(at[0], at[23]); MULADD(at[1], at[22]); MULADD(at[2], at[21]); MULADD(at[3], at[20]); MULADD(at[4], at[19]); MULADD(at[5], at[18]); MULADD(at[6], at[17]); MULADD(at[7], at[16]);
COMBA_STORE(C->dp[7]);
/* 8 */
COMBA_FORWARD;
MULADD(at[0], at[24]); MULADD(at[1], at[23]); MULADD(at[2], at[22]); MULADD(at[3], at[21]); MULADD(at[4], at[20]); MULADD(at[5], at[19]); MULADD(at[6], at[18]); MULADD(at[7], at[17]); MULADD(at[8], at[16]);
COMBA_STORE(C->dp[8]);
/* 9 */
COMBA_FORWARD;
MULADD(at[0], at[25]); MULADD(at[1], at[24]); MULADD(at[2], at[23]); MULADD(at[3], at[22]); MULADD(at[4], at[21]); MULADD(at[5], at[20]); MULADD(at[6], at[19]); MULADD(at[7], at[18]); MULADD(at[8], at[17]); MULADD(at[9], at[16]);
COMBA_STORE(C->dp[9]);
/* 10 */
COMBA_FORWARD;
MULADD(at[0], at[26]); MULADD(at[1], at[25]); MULADD(at[2], at[24]); MULADD(at[3], at[23]); MULADD(at[4], at[22]); MULADD(at[5], at[21]); MULADD(at[6], at[20]); MULADD(at[7], at[19]); MULADD(at[8], at[18]); MULADD(at[9], at[17]); MULADD(at[10], at[16]);
COMBA_STORE(C->dp[10]);
/* 11 */
COMBA_FORWARD;
MULADD(at[0], at[27]); MULADD(at[1], at[26]); MULADD(at[2], at[25]); MULADD(at[3], at[24]); MULADD(at[4], at[23]); MULADD(at[5], at[22]); MULADD(at[6], at[21]); MULADD(at[7], at[20]); MULADD(at[8], at[19]); MULADD(at[9], at[18]); MULADD(at[10], at[17]); MULADD(at[11], at[16]);
COMBA_STORE(C->dp[11]);
/* 12 */
COMBA_FORWARD;
MULADD(at[0], at[28]); MULADD(at[1], at[27]); MULADD(at[2], at[26]); MULADD(at[3], at[25]); MULADD(at[4], at[24]); MULADD(at[5], at[23]); MULADD(at[6], at[22]); MULADD(at[7], at[21]); MULADD(at[8], at[20]); MULADD(at[9], at[19]); MULADD(at[10], at[18]); MULADD(at[11], at[17]); MULADD(at[12], at[16]);
COMBA_STORE(C->dp[12]);
/* 13 */
COMBA_FORWARD;
MULADD(at[0], at[29]); MULADD(at[1], at[28]); MULADD(at[2], at[27]); MULADD(at[3], at[26]); MULADD(at[4], at[25]); MULADD(at[5], at[24]); MULADD(at[6], at[23]); MULADD(at[7], at[22]); MULADD(at[8], at[21]); MULADD(at[9], at[20]); MULADD(at[10], at[19]); MULADD(at[11], at[18]); MULADD(at[12], at[17]); MULADD(at[13], at[16]);
COMBA_STORE(C->dp[13]);
/* 14 */
COMBA_FORWARD;
MULADD(at[0], at[30]); MULADD(at[1], at[29]); MULADD(at[2], at[28]); MULADD(at[3], at[27]); MULADD(at[4], at[26]); MULADD(at[5], at[25]); MULADD(at[6], at[24]); MULADD(at[7], at[23]); MULADD(at[8], at[22]); MULADD(at[9], at[21]); MULADD(at[10], at[20]); MULADD(at[11], at[19]); MULADD(at[12], at[18]); MULADD(at[13], at[17]); MULADD(at[14], at[16]);
COMBA_STORE(C->dp[14]);
/* 15 */
COMBA_FORWARD;
MULADD(at[0], at[31]); MULADD(at[1], at[30]); MULADD(at[2], at[29]); MULADD(at[3], at[28]); MULADD(at[4], at[27]); MULADD(at[5], at[26]); MULADD(at[6], at[25]); MULADD(at[7], at[24]); MULADD(at[8], at[23]); MULADD(at[9], at[22]); MULADD(at[10], at[21]); MULADD(at[11], at[20]); MULADD(at[12], at[19]); MULADD(at[13], at[18]); MULADD(at[14], at[17]); MULADD(at[15], at[16]);
COMBA_STORE(C->dp[15]);
/* 16 */
COMBA_FORWARD;
MULADD(at[1], at[31]); MULADD(at[2], at[30]); MULADD(at[3], at[29]); MULADD(at[4], at[28]); MULADD(at[5], at[27]); MULADD(at[6], at[26]); MULADD(at[7], at[25]); MULADD(at[8], at[24]); MULADD(at[9], at[23]); MULADD(at[10], at[22]); MULADD(at[11], at[21]); MULADD(at[12], at[20]); MULADD(at[13], at[19]); MULADD(at[14], at[18]); MULADD(at[15], at[17]);
COMBA_STORE(C->dp[16]);
/* 17 */
COMBA_FORWARD;
MULADD(at[2], at[31]); MULADD(at[3], at[30]); MULADD(at[4], at[29]); MULADD(at[5], at[28]); MULADD(at[6], at[27]); MULADD(at[7], at[26]); MULADD(at[8], at[25]); MULADD(at[9], at[24]); MULADD(at[10], at[23]); MULADD(at[11], at[22]); MULADD(at[12], at[21]); MULADD(at[13], at[20]); MULADD(at[14], at[19]); MULADD(at[15], at[18]);
COMBA_STORE(C->dp[17]);
/* 18 */
COMBA_FORWARD;
MULADD(at[3], at[31]); MULADD(at[4], at[30]); MULADD(at[5], at[29]); MULADD(at[6], at[28]); MULADD(at[7], at[27]); MULADD(at[8], at[26]); MULADD(at[9], at[25]); MULADD(at[10], at[24]); MULADD(at[11], at[23]); MULADD(at[12], at[22]); MULADD(at[13], at[21]); MULADD(at[14], at[20]); MULADD(at[15], at[19]);
COMBA_STORE(C->dp[18]);
/* 19 */
COMBA_FORWARD;
MULADD(at[4], at[31]); MULADD(at[5], at[30]); MULADD(at[6], at[29]); MULADD(at[7], at[28]); MULADD(at[8], at[27]); MULADD(at[9], at[26]); MULADD(at[10], at[25]); MULADD(at[11], at[24]); MULADD(at[12], at[23]); MULADD(at[13], at[22]); MULADD(at[14], at[21]); MULADD(at[15], at[20]);
COMBA_STORE(C->dp[19]);
/* 20 */
COMBA_FORWARD;
MULADD(at[5], at[31]); MULADD(at[6], at[30]); MULADD(at[7], at[29]); MULADD(at[8], at[28]); MULADD(at[9], at[27]); MULADD(at[10], at[26]); MULADD(at[11], at[25]); MULADD(at[12], at[24]); MULADD(at[13], at[23]); MULADD(at[14], at[22]); MULADD(at[15], at[21]);
COMBA_STORE(C->dp[20]);
/* 21 */
COMBA_FORWARD;
MULADD(at[6], at[31]); MULADD(at[7], at[30]); MULADD(at[8], at[29]); MULADD(at[9], at[28]); MULADD(at[10], at[27]); MULADD(at[11], at[26]); MULADD(at[12], at[25]); MULADD(at[13], at[24]); MULADD(at[14], at[23]); MULADD(at[15], at[22]);
COMBA_STORE(C->dp[21]);
/* 22 */
COMBA_FORWARD;
MULADD(at[7], at[31]); MULADD(at[8], at[30]); MULADD(at[9], at[29]); MULADD(at[10], at[28]); MULADD(at[11], at[27]); MULADD(at[12], at[26]); MULADD(at[13], at[25]); MULADD(at[14], at[24]); MULADD(at[15], at[23]);
COMBA_STORE(C->dp[22]);
/* 23 */
COMBA_FORWARD;
MULADD(at[8], at[31]); MULADD(at[9], at[30]); MULADD(at[10], at[29]); MULADD(at[11], at[28]); MULADD(at[12], at[27]); MULADD(at[13], at[26]); MULADD(at[14], at[25]); MULADD(at[15], at[24]);
COMBA_STORE(C->dp[23]);
/* 24 */
COMBA_FORWARD;
MULADD(at[9], at[31]); MULADD(at[10], at[30]); MULADD(at[11], at[29]); MULADD(at[12], at[28]); MULADD(at[13], at[27]); MULADD(at[14], at[26]); MULADD(at[15], at[25]);
COMBA_STORE(C->dp[24]);
/* 25 */
COMBA_FORWARD;
MULADD(at[10], at[31]); MULADD(at[11], at[30]); MULADD(at[12], at[29]); MULADD(at[13], at[28]); MULADD(at[14], at[27]); MULADD(at[15], at[26]);
COMBA_STORE(C->dp[25]);
/* 26 */
COMBA_FORWARD;
MULADD(at[11], at[31]); MULADD(at[12], at[30]); MULADD(at[13], at[29]); MULADD(at[14], at[28]); MULADD(at[15], at[27]);
COMBA_STORE(C->dp[26]);
/* 27 */
COMBA_FORWARD;
MULADD(at[12], at[31]); MULADD(at[13], at[30]); MULADD(at[14], at[29]); MULADD(at[15], at[28]);
COMBA_STORE(C->dp[27]);
/* 28 */
COMBA_FORWARD;
MULADD(at[13], at[31]); MULADD(at[14], at[30]); MULADD(at[15], at[29]);
COMBA_STORE(C->dp[28]);
/* 29 */
COMBA_FORWARD;
MULADD(at[14], at[31]); MULADD(at[15], at[30]);
COMBA_STORE(C->dp[29]);
/* 30 */
COMBA_FORWARD;
MULADD(at[15], at[31]);
COMBA_STORE(C->dp[30]);
/* top digit: the carry left over after column 30 */
COMBA_STORE2(C->dp[31]);
/* claim all 32 digits, then let fp_clamp (defined elsewhere; presumably it
   trims leading zero digits -- confirm) adjust the length */
C->used = 32;
C->sign = A->sign ^ B->sign;
fp_clamp(C);
COMBA_FINI;
}
#ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
{
fp_word t;
fp_digit c0, c1, c2, at[64];
memcpy(at, A->dp, 32 * sizeof(fp_digit));
memcpy(at+32, B->dp, 32 * sizeof(fp_digit));
COMBA_START;
COMBA_CLEAR;
/* 0 */
MULADD(at[0], at[32]);
COMBA_STORE(C->dp[0]);
/* 1 */
COMBA_FORWARD;
MULADD(at[0], at[33]); MULADD(at[1], at[32]);
COMBA_STORE(C->dp[1]);
/* 2 */
COMBA_FORWARD;
MULADD(at[0], at[34]); MULADD(at[1], at[33]); MULADD(at[2], at[32]);
COMBA_STORE(C->dp[2]);
/* 3 */
COMBA_FORWARD;
MULADD(at[0], at[35]); MULADD(at[1], at[34]); MULADD(at[2], at[33]); MULADD(at[3], at[32]);
COMBA_STORE(C->dp[3]);
/* 4 */
COMBA_FORWARD;
MULADD(at[0], at[36]); MULADD(at[1], at[35]); MULADD(at[2], at[34]); MULADD(at[3], at[33]); MULADD(at[4], at[32]);
COMBA_STORE(C->dp[4]);
/* 5 */
COMBA_FORWARD;
MULADD(at[0], at[37]); MULADD(at[1], at[36]); MULADD(at[2], at[35]); MULADD(at[3], at[34]); MULADD(at[4], at[33]); MULADD(at[5], at[32]);
COMBA_STORE(C->dp[5]);
/* 6 */
COMBA_FORWARD;
MULADD(at[0], at[38]); MULADD(at[1], at[37]); MULADD(at[2], at[36]); MULADD(at[3], at[35]); MULADD(at[4], at[34]); MULADD(at[5], at[33]); MULADD(at[6], at[32]);
COMBA_STORE(C->dp[6]);
/* 7 */
COMBA_FORWARD;
MULADD(at[0], at[39]); MULADD(at[1], at[38]); MULADD(at[2], at[37]); MULADD(at[3], at[36]); MULADD(at[4], at[35]); MULADD(at[5], at[34]); MULADD(at[6], at[33]); MULADD(at[7], at[32]);
COMBA_STORE(C->dp[7]);
/* 8 */
COMBA_FORWARD;
MULADD(at[0], at[40]); MULADD(at[1], at[39]); MULADD(at[2], at[38]); MULADD(at[3], at[37]); MULADD(at[4], at[36]); MULADD(at[5], at[35]); MULADD(at[6], at[34]); MULADD(at[7], at[33]); MULADD(at[8], at[32]);
COMBA_STORE(C->dp[8]);
/* 9 */
COMBA_FORWARD;
MULADD(at[0], at[41]); MULADD(at[1], at[40]); MULADD(at[2], at[39]); MULADD(at[3], at[38]); MULADD(at[4], at[37]); MULADD(at[5], at[36]); MULADD(at[6], at[35]); MULADD(at[7], at[34]); MULADD(at[8], at[33]); MULADD(at[9], at[32]);
COMBA_STORE(C->dp[9]);
/* 10 */
COMBA_FORWARD;
MULADD(at[0], at[42]); MULADD(at[1], at[41]); MULADD(at[2], at[40]); MULADD(at[3], at[39]); MULADD(at[4], at[38]); MULADD(at[5], at[37]); MULADD(at[6], at[36]); MULADD(at[7], at[35]); MULADD(at[8], at[34]); MULADD(at[9], at[33]); MULADD(at[10], at[32]);
COMBA_STORE(C->dp[10]);
/* 11 */
COMBA_FORWARD;
MULADD(at[0], at[43]); MULADD(at[1], at[42]); MULADD(at[2], at[41]); MULADD(at[3], at[40]); MULADD(at[4], at[39]); MULADD(at[5], at[38]); MULADD(at[6], at[37]); MULADD(at[7], at[36]); MULADD(at[8], at[35]); MULADD(at[9], at[34]); MULADD(at[10], at[33]); MULADD(at[11], at[32]);
COMBA_STORE(C->dp[11]);
/* 12 */
COMBA_FORWARD;
MULADD(at[0], at[44]); MULADD(at[1], at[43]); MULADD(at[2], at[42]); MULADD(at[3], at[41]); MULADD(at[4], at[40]); MULADD(at[5], at[39]); MULADD(at[6], at[38]); MULADD(at[7], at[37]); MULADD(at[8], at[36]); MULADD(at[9], at[35]); MULADD(at[10], at[34]); MULADD(at[11], at[33]); MULADD(at[12], at[32]);
COMBA_STORE(C->dp[12]);
/* 13 */
COMBA_FORWARD;
MULADD(at[0], at[45]); MULADD(at[1], at[44]); MULADD(at[2], at[43]); MULADD(at[3], at[42]); MULADD(at[4], at[41]); MULADD(at[5], at[40]); MULADD(at[6], at[39]); MULADD(at[7], at[38]); MULADD(at[8], at[37]); MULADD(at[9], at[36]); MULADD(at[10], at[35]); MULADD(at[11], at[34]); MULADD(at[12], at[33]); MULADD(at[13], at[32]);
COMBA_STORE(C->dp[13]);
/* 14 */
COMBA_FORWARD;
MULADD(at[0], at[46]); MULADD(at[1], at[45]); MULADD(at[2], at[44]); MULADD(at[3], at[43]); MULADD(at[4], at[42]); MULADD(at[5], at[41]); MULADD(at[6], at[40]); MULADD(at[7], at[39]); MULADD(at[8], at[38]); MULADD(at[9], at[37]); MULADD(at[10], at[36]); MULADD(at[11], at[35]); MULADD(at[12], at[34]); MULADD(at[13], at[33]); MULADD(at[14], at[32]);
COMBA_STORE(C->dp[14]);
/* 15 */
COMBA_FORWARD;
MULADD(at[0], at[47]); MULADD(at[1], at[46]); MULADD(at[2], at[45]); MULADD(at[3], at[44]); MULADD(at[4], at[43]); MULADD(at[5], at[42]); MULADD(at[6], at[41]); MULADD(at[7], at[40]); MULADD(at[8], at[39]); MULADD(at[9], at[38]); MULADD(at[10], at[37]); MULADD(at[11], at[36]); MULADD(at[12], at[35]); MULADD(at[13], at[34]); MULADD(at[14], at[33]); MULADD(at[15], at[32]);
COMBA_STORE(C->dp[15]);
/* 16 */
COMBA_FORWARD;
MULADD(at[0], at[48]); MULADD(at[1], at[47]); MULADD(at[2], at[46]); MULADD(at[3], at[45]); MULADD(at[4], at[44]); MULADD(at[5], at[43]); MULADD(at[6], at[42]); MULADD(at[7], at[41]); MULADD(at[8], at[40]); MULADD(at[9], at[39]); MULADD(at[10], at[38]); MULADD(at[11], at[37]); MULADD(at[12], at[36]); MULADD(at[13], at[35]); MULADD(at[14], at[34]); MULADD(at[15], at[33]); MULADD(at[16], at[32]);
COMBA_STORE(C->dp[16]);
/* 17 */
COMBA_FORWARD;
MULADD(at[0], at[49]); MULADD(at[1], at[48]); MULADD(at[2], at[47]); MULADD(at[3], at[46]); MULADD(at[4], at[45]); MULADD(at[5], at[44]); MULADD(at[6], at[43]); MULADD(at[7], at[42]); MULADD(at[8], at[41]); MULADD(at[9], at[40]); MULADD(at[10], at[39]); MULADD(at[11], at[38]); MULADD(at[12], at[37]); MULADD(at[13], at[36]); MULADD(at[14], at[35]); MULADD(at[15], at[34]); MULADD(at[16], at[33]); MULADD(at[17], at[32]);
COMBA_STORE(C->dp[17]);
/* 18 */
COMBA_FORWARD;
MULADD(at[0], at[50]); MULADD(at[1], at[49]); MULADD(at[2], at[48]); MULADD(at[3], at[47]); MULADD(at[4], at[46]); MULADD(at[5], at[45]); MULADD(at[6], at[44]); MULADD(at[7], at[43]); MULADD(at[8], at[42]); MULADD(at[9], at[41]); MULADD(at[10], at[40]); MULADD(at[11], at[39]); MULADD(at[12], at[38]); MULADD(at[13], at[37]); MULADD(at[14], at[36]); MULADD(at[15], at[35]); MULADD(at[16], at[34]); MULADD(at[17], at[33]); MULADD(at[18], at[32]);
COMBA_STORE(C->dp[18]);
/* 19 */
COMBA_FORWARD;
MULADD(at[0], at[51]); MULADD(at[1], at[50]); MULADD(at[2], at[49]); MULADD(at[3], at[48]); MULADD(at[4], at[47]); MULADD(at[5], at[46]); MULADD(at[6], at[45]); MULADD(at[7], at[44]); MULADD(at[8], at[43]); MULADD(at[9], at[42]); MULADD(at[10], at[41]); MULADD(at[11], at[40]); MULADD(at[12], at[39]); MULADD(at[13], at[38]); MULADD(at[14], at[37]); MULADD(at[15], at[36]); MULADD(at[16], at[35]); MULADD(at[17], at[34]); MULADD(at[18], at[33]); MULADD(at[19], at[32]);
COMBA_STORE(C->dp[19]);
/* 20 */
COMBA_FORWARD;
MULADD(at[0], at[52]); MULADD(at[1], at[51]); MULADD(at[2], at[50]); MULADD(at[3], at[49]); MULADD(at[4], at[48]); MULADD(at[5], at[47]); MULADD(at[6], at[46]); MULADD(at[7], at[45]); MULADD(at[8], at[44]); MULADD(at[9], at[43]); MULADD(at[10], at[42]); MULADD(at[11], at[41]); MULADD(at[12], at[40]); MULADD(at[13], at[39]); MULADD(at[14], at[38]); MULADD(at[15], at[37]); MULADD(at[16], at[36]); MULADD(at[17], at[35]); MULADD(at[18], at[34]); MULADD(at[19], at[33]); MULADD(at[20], at[32]);
COMBA_STORE(C->dp[20]);
/* 21 */
COMBA_FORWARD;
MULADD(at[0], at[53]); MULADD(at[1], at[52]); MULADD(at[2], at[51]); MULADD(at[3], at[50]); MULADD(at[4], at[49]); MULADD(at[5], at[48]); MULADD(at[6], at[47]); MULADD(at[7], at[46]); MULADD(at[8], at[45]); MULADD(at[9], at[44]); MULADD(at[10], at[43]); MULADD(at[11], at[42]); MULADD(at[12], at[41]); MULADD(at[13], at[40]); MULADD(at[14], at[39]); MULADD(at[15], at[38]); MULADD(at[16], at[37]); MULADD(at[17], at[36]); MULADD(at[18], at[35]); MULADD(at[19], at[34]); MULADD(at[20], at[33]); MULADD(at[21], at[32]);
COMBA_STORE(C->dp[21]);
/* 22 */
COMBA_FORWARD;
MULADD(at[0], at[54]); MULADD(at[1], at[53]); MULADD(at[2], at[52]); MULADD(at[3], at[51]); MULADD(at[4], at[50]); MULADD(at[5], at[49]); MULADD(at[6], at[48]); MULADD(at[7], at[47]); MULADD(at[8], at[46]); MULADD(at[9], at[45]); MULADD(at[10], at[44]); MULADD(at[11], at[43]); MULADD(at[12], at[42]); MULADD(at[13], at[41]); MULADD(at[14], at[40]); MULADD(at[15], at[39]); MULADD(at[16], at[38]); MULADD(at[17], at[37]); MULADD(at[18], at[36]); MULADD(at[19], at[35]); MULADD(at[20], at[34]); MULADD(at[21], at[33]); MULADD(at[22], at[32]);
COMBA_STORE(C->dp[22]);
/* 23 */
COMBA_FORWARD;
MULADD(at[0], at[55]); MULADD(at[1], at[54]); MULADD(at[2], at[53]); MULADD(at[3], at[52]); MULADD(at[4], at[51]); MULADD(at[5], at[50]); MULADD(at[6], at[49]); MULADD(at[7], at[48]); MULADD(at[8], at[47]); MULADD(at[9], at[46]); MULADD(at[10], at[45]); MULADD(at[11], at[44]); MULADD(at[12], at[43]); MULADD(at[13], at[42]); MULADD(at[14], at[41]); MULADD(at[15], at[40]); MULADD(at[16], at[39]); MULADD(at[17], at[38]); MULADD(at[18], at[37]); MULADD(at[19], at[36]); MULADD(at[20], at[35]); MULADD(at[21], at[34]); MULADD(at[22], at[33]); MULADD(at[23], at[32]);
COMBA_STORE(C->dp[23]);
/* 24 */
COMBA_FORWARD;
MULADD(at[0], at[56]); MULADD(at[1], at[55]); MULADD(at[2], at[54]); MULADD(at[3], at[53]); MULADD(at[4], at[52]); MULADD(at[5], at[51]); MULADD(at[6], at[50]); MULADD(at[7], at[49]); MULADD(at[8], at[48]); MULADD(at[9], at[47]); MULADD(at[10], at[46]); MULADD(at[11], at[45]); MULADD(at[12], at[44]); MULADD(at[13], at[43]); MULADD(at[14], at[42]); MULADD(at[15], at[41]); MULADD(at[16], at[40]); MULADD(at[17], at[39]); MULADD(at[18], at[38]); MULADD(at[19], at[37]); MULADD(at[20], at[36]); MULADD(at[21], at[35]); MULADD(at[22], at[34]); MULADD(at[23], at[33]); MULADD(at[24], at[32]);
COMBA_STORE(C->dp[24]);
/* 25 */
COMBA_FORWARD;
MULADD(at[0], at[57]); MULADD(at[1], at[56]); MULADD(at[2], at[55]); MULADD(at[3], at[54]); MULADD(at[4], at[53]); MULADD(at[5], at[52]); MULADD(at[6], at[51]); MULADD(at[7], at[50]); MULADD(at[8], at[49]); MULADD(at[9], at[48]); MULADD(at[10], at[47]); MULADD(at[11], at[46]); MULADD(at[12], at[45]); MULADD(at[13], at[44]); MULADD(at[14], at[43]); MULADD(at[15], at[42]); MULADD(at[16], at[41]); MULADD(at[17], at[40]); MULADD(at[18], at[39]); MULADD(at[19], at[38]); MULADD(at[20], at[37]); MULADD(at[21], at[36]); MULADD(at[22], at[35]); MULADD(at[23], at[34]); MULADD(at[24], at[33]); MULADD(at[25], at[32]);
COMBA_STORE(C->dp[25]);
/* 26 */
COMBA_FORWARD;
MULADD(at[0], at[58]); MULADD(at[1], at[57]); MULADD(at[2], at[56]); MULADD(at[3], at[55]); MULADD(at[4], at[54]); MULADD(at[5], at[53]); MULADD(at[6], at[52]); MULADD(at[7], at[51]); MULADD(at[8], at[50]); MULADD(at[9], at[49]); MULADD(at[10], at[48]); MULADD(at[11], at[47]); MULADD(at[12], at[46]); MULADD(at[13], at[45]); MULADD(at[14], at[44]); MULADD(at[15], at[43]); MULADD(at[16], at[42]); MULADD(at[17], at[41]); MULADD(at[18], at[40]); MULADD(at[19], at[39]); MULADD(at[20], at[38]); MULADD(at[21], at[37]); MULADD(at[22], at[36]); MULADD(at[23], at[35]); MULADD(at[24], at[34]); MULADD(at[25], at[33]); MULADD(at[26], at[32]);
COMBA_STORE(C->dp[26]);
/* 27 */
COMBA_FORWARD;
MULADD(at[0], at[59]); MULADD(at[1], at[58]); MULADD(at[2], at[57]); MULADD(at[3], at[56]); MULADD(at[4], at[55]); MULADD(at[5], at[54]); MULADD(at[6], at[53]); MULADD(at[7], at[52]); MULADD(at[8], at[51]); MULADD(at[9], at[50]); MULADD(at[10], at[49]); MULADD(at[11], at[48]); MULADD(at[12], at[47]); MULADD(at[13], at[46]); MULADD(at[14], at[45]); MULADD(at[15], at[44]); MULADD(at[16], at[43]); MULADD(at[17], at[42]); MULADD(at[18], at[41]); MULADD(at[19], at[40]); MULADD(at[20], at[39]); MULADD(at[21], at[38]); MULADD(at[22], at[37]); MULADD(at[23], at[36]); MULADD(at[24], at[35]); MULADD(at[25], at[34]); MULADD(at[26], at[33]); MULADD(at[27], at[32]);
COMBA_STORE(C->dp[27]);
/* 28 */
COMBA_FORWARD;
MULADD(at[0], at[60]); MULADD(at[1], at[59]); MULADD(at[2], at[58]); MULADD(at[3], at[57]); MULADD(at[4], at[56]); MULADD(at[5], at[55]); MULADD(at[6], at[54]); MULADD(at[7], at[53]); MULADD(at[8], at[52]); MULADD(at[9], at[51]); MULADD(at[10], at[50]); MULADD(at[11], at[49]); MULADD(at[12], at[48]); MULADD(at[13], at[47]); MULADD(at[14], at[46]); MULADD(at[15], at[45]); MULADD(at[16], at[44]); MULADD(at[17], at[43]); MULADD(at[18], at[42]); MULADD(at[19], at[41]); MULADD(at[20], at[40]); MULADD(at[21], at[39]); MULADD(at[22], at[38]); MULADD(at[23], at[37]); MULADD(at[24], at[36]); MULADD(at[25], at[35]); MULADD(at[26], at[34]); MULADD(at[27], at[33]); MULADD(at[28], at[32]);
COMBA_STORE(C->dp[28]);
/* 29 */
COMBA_FORWARD;
MULADD(at[0], at[61]); MULADD(at[1], at[60]); MULADD(at[2], at[59]); MULADD(at[3], at[58]); MULADD(at[4], at[57]); MULADD(at[5], at[56]); MULADD(at[6], at[55]); MULADD(at[7], at[54]); MULADD(at[8], at[53]); MULADD(at[9], at[52]); MULADD(at[10], at[51]); MULADD(at[11], at[50]); MULADD(at[12], at[49]); MULADD(at[13], at[48]); MULADD(at[14], at[47]); MULADD(at[15], at[46]); MULADD(at[16], at[45]); MULADD(at[17], at[44]); MULADD(at[18], at[43]); MULADD(at[19], at[42]); MULADD(at[20], at[41]); MULADD(at[21], at[40]); MULADD(at[22], at[39]); MULADD(at[23], at[38]); MULADD(at[24], at[37]); MULADD(at[25], at[36]); MULADD(at[26], at[35]); MULADD(at[27], at[34]); MULADD(at[28], at[33]); MULADD(at[29], at[32]);
COMBA_STORE(C->dp[29]);
/* 30 */
COMBA_FORWARD;
MULADD(at[0], at[62]); MULADD(at[1], at[61]); MULADD(at[2], at[60]); MULADD(at[3], at[59]); MULADD(at[4], at[58]); MULADD(at[5], at[57]); MULADD(at[6], at[56]); MULADD(at[7], at[55]); MULADD(at[8], at[54]); MULADD(at[9], at[53]); MULADD(at[10], at[52]); MULADD(at[11], at[51]); MULADD(at[12], at[50]); MULADD(at[13], at[49]); MULADD(at[14], at[48]); MULADD(at[15], at[47]); MULADD(at[16], at[46]); MULADD(at[17], at[45]); MULADD(at[18], at[44]); MULADD(at[19], at[43]); MULADD(at[20], at[42]); MULADD(at[21], at[41]); MULADD(at[22], at[40]); MULADD(at[23], at[39]); MULADD(at[24], at[38]); MULADD(at[25], at[37]); MULADD(at[26], at[36]); MULADD(at[27], at[35]); MULADD(at[28], at[34]); MULADD(at[29], at[33]); MULADD(at[30], at[32]);
COMBA_STORE(C->dp[30]);
/* 31 */
COMBA_FORWARD;
MULADD(at[0], at[63]); MULADD(at[1], at[62]); MULADD(at[2], at[61]); MULADD(at[3], at[60]); MULADD(at[4], at[59]); MULADD(at[5], at[58]); MULADD(at[6], at[57]); MULADD(at[7], at[56]); MULADD(at[8], at[55]); MULADD(at[9], at[54]); MULADD(at[10], at[53]); MULADD(at[11], at[52]); MULADD(at[12], at[51]); MULADD(at[13], at[50]); MULADD(at[14], at[49]); MULADD(at[15], at[48]); MULADD(at[16], at[47]); MULADD(at[17], at[46]); MULADD(at[18], at[45]); MULADD(at[19], at[44]); MULADD(at[20], at[43]); MULADD(at[21], at[42]); MULADD(at[22], at[41]); MULADD(at[23], at[40]); MULADD(at[24], at[39]); MULADD(at[25], at[38]); MULADD(at[26], at[37]); MULADD(at[27], at[36]); MULADD(at[28], at[35]); MULADD(at[29], at[34]); MULADD(at[30], at[33]); MULADD(at[31], at[32]);
COMBA_STORE(C->dp[31]);
/* 32 */
COMBA_FORWARD;
MULADD(at[1], at[63]); MULADD(at[2], at[62]); MULADD(at[3], at[61]); MULADD(at[4], at[60]); MULADD(at[5], at[59]); MULADD(at[6], at[58]); MULADD(at[7], at[57]); MULADD(at[8], at[56]); MULADD(at[9], at[55]); MULADD(at[10], at[54]); MULADD(at[11], at[53]); MULADD(at[12], at[52]); MULADD(at[13], at[51]); MULADD(at[14], at[50]); MULADD(at[15], at[49]); MULADD(at[16], at[48]); MULADD(at[17], at[47]); MULADD(at[18], at[46]); MULADD(at[19], at[45]); MULADD(at[20], at[44]); MULADD(at[21], at[43]); MULADD(at[22], at[42]); MULADD(at[23], at[41]); MULADD(at[24], at[40]); MULADD(at[25], at[39]); MULADD(at[26], at[38]); MULADD(at[27], at[37]); MULADD(at[28], at[36]); MULADD(at[29], at[35]); MULADD(at[30], at[34]); MULADD(at[31], at[33]);
COMBA_STORE(C->dp[32]);
/* 33 */
COMBA_FORWARD;
MULADD(at[2], at[63]); MULADD(at[3], at[62]); MULADD(at[4], at[61]); MULADD(at[5], at[60]); MULADD(at[6], at[59]); MULADD(at[7], at[58]); MULADD(at[8], at[57]); MULADD(at[9], at[56]); MULADD(at[10], at[55]); MULADD(at[11], at[54]); MULADD(at[12], at[53]); MULADD(at[13], at[52]); MULADD(at[14], at[51]); MULADD(at[15], at[50]); MULADD(at[16], at[49]); MULADD(at[17], at[48]); MULADD(at[18], at[47]); MULADD(at[19], at[46]); MULADD(at[20], at[45]); MULADD(at[21], at[44]); MULADD(at[22], at[43]); MULADD(at[23], at[42]); MULADD(at[24], at[41]); MULADD(at[25], at[40]); MULADD(at[26], at[39]); MULADD(at[27], at[38]); MULADD(at[28], at[37]); MULADD(at[29], at[36]); MULADD(at[30], at[35]); MULADD(at[31], at[34]);
COMBA_STORE(C->dp[33]);
/* 34 */
COMBA_FORWARD;
MULADD(at[3], at[63]); MULADD(at[4], at[62]); MULADD(at[5], at[61]); MULADD(at[6], at[60]); MULADD(at[7], at[59]); MULADD(at[8], at[58]); MULADD(at[9], at[57]); MULADD(at[10], at[56]); MULADD(at[11], at[55]); MULADD(at[12], at[54]); MULADD(at[13], at[53]); MULADD(at[14], at[52]); MULADD(at[15], at[51]); MULADD(at[16], at[50]); MULADD(at[17], at[49]); MULADD(at[18], at[48]); MULADD(at[19], at[47]); MULADD(at[20], at[46]); MULADD(at[21], at[45]); MULADD(at[22], at[44]); MULADD(at[23], at[43]); MULADD(at[24], at[42]); MULADD(at[25], at[41]); MULADD(at[26], at[40]); MULADD(at[27], at[39]); MULADD(at[28], at[38]); MULADD(at[29], at[37]); MULADD(at[30], at[36]); MULADD(at[31], at[35]);
COMBA_STORE(C->dp[34]);
/* 35 */
COMBA_FORWARD;
MULADD(at[4], at[63]); MULADD(at[5], at[62]); MULADD(at[6], at[61]); MULADD(at[7], at[60]); MULADD(at[8], at[59]); MULADD(at[9], at[58]); MULADD(at[10], at[57]); MULADD(at[11], at[56]); MULADD(at[12], at[55]); MULADD(at[13], at[54]); MULADD(at[14], at[53]); MULADD(at[15], at[52]); MULADD(at[16], at[51]); MULADD(at[17], at[50]); MULADD(at[18], at[49]); MULADD(at[19], at[48]); MULADD(at[20], at[47]); MULADD(at[21], at[46]); MULADD(at[22], at[45]); MULADD(at[23], at[44]); MULADD(at[24], at[43]); MULADD(at[25], at[42]); MULADD(at[26], at[41]); MULADD(at[27], at[40]); MULADD(at[28], at[39]); MULADD(at[29], at[38]); MULADD(at[30], at[37]); MULADD(at[31], at[36]);
COMBA_STORE(C->dp[35]);
/* 36 */
COMBA_FORWARD;
MULADD(at[5], at[63]); MULADD(at[6], at[62]); MULADD(at[7], at[61]); MULADD(at[8], at[60]); MULADD(at[9], at[59]); MULADD(at[10], at[58]); MULADD(at[11], at[57]); MULADD(at[12], at[56]); MULADD(at[13], at[55]); MULADD(at[14], at[54]); MULADD(at[15], at[53]); MULADD(at[16], at[52]); MULADD(at[17], at[51]); MULADD(at[18], at[50]); MULADD(at[19], at[49]); MULADD(at[20], at[48]); MULADD(at[21], at[47]); MULADD(at[22], at[46]); MULADD(at[23], at[45]); MULADD(at[24], at[44]); MULADD(at[25], at[43]); MULADD(at[26], at[42]); MULADD(at[27], at[41]); MULADD(at[28], at[40]); MULADD(at[29], at[39]); MULADD(at[30], at[38]); MULADD(at[31], at[37]);
COMBA_STORE(C->dp[36]);
/* 37 */
COMBA_FORWARD;
MULADD(at[6], at[63]); MULADD(at[7], at[62]); MULADD(at[8], at[61]); MULADD(at[9], at[60]); MULADD(at[10], at[59]); MULADD(at[11], at[58]); MULADD(at[12], at[57]); MULADD(at[13], at[56]); MULADD(at[14], at[55]); MULADD(at[15], at[54]); MULADD(at[16], at[53]); MULADD(at[17], at[52]); MULADD(at[18], at[51]); MULADD(at[19], at[50]); MULADD(at[20], at[49]); MULADD(at[21], at[48]); MULADD(at[22], at[47]); MULADD(at[23], at[46]); MULADD(at[24], at[45]); MULADD(at[25], at[44]); MULADD(at[26], at[43]); MULADD(at[27], at[42]); MULADD(at[28], at[41]); MULADD(at[29], at[40]); MULADD(at[30], at[39]); MULADD(at[31], at[38]);
COMBA_STORE(C->dp[37]);
/* 38 */
COMBA_FORWARD;
MULADD(at[7], at[63]); MULADD(at[8], at[62]); MULADD(at[9], at[61]); MULADD(at[10], at[60]); MULADD(at[11], at[59]); MULADD(at[12], at[58]); MULADD(at[13], at[57]); MULADD(at[14], at[56]); MULADD(at[15], at[55]); MULADD(at[16], at[54]); MULADD(at[17], at[53]); MULADD(at[18], at[52]); MULADD(at[19], at[51]); MULADD(at[20], at[50]); MULADD(at[21], at[49]); MULADD(at[22], at[48]); MULADD(at[23], at[47]); MULADD(at[24], at[46]); MULADD(at[25], at[45]); MULADD(at[26], at[44]); MULADD(at[27], at[43]); MULADD(at[28], at[42]); MULADD(at[29], at[41]); MULADD(at[30], at[40]); MULADD(at[31], at[39]);
COMBA_STORE(C->dp[38]);
/* 39 */
COMBA_FORWARD;
MULADD(at[8], at[63]); MULADD(at[9], at[62]); MULADD(at[10], at[61]); MULADD(at[11], at[60]); MULADD(at[12], at[59]); MULADD(at[13], at[58]); MULADD(at[14], at[57]); MULADD(at[15], at[56]); MULADD(at[16], at[55]); MULADD(at[17], at[54]); MULADD(at[18], at[53]); MULADD(at[19], at[52]); MULADD(at[20], at[51]); MULADD(at[21], at[50]); MULADD(at[22], at[49]); MULADD(at[23], at[48]); MULADD(at[24], at[47]); MULADD(at[25], at[46]); MULADD(at[26], at[45]); MULADD(at[27], at[44]); MULADD(at[28], at[43]); MULADD(at[29], at[42]); MULADD(at[30], at[41]); MULADD(at[31], at[40]);
COMBA_STORE(C->dp[39]);
/* 40 */
COMBA_FORWARD;
MULADD(at[9], at[63]); MULADD(at[10], at[62]); MULADD(at[11], at[61]); MULADD(at[12], at[60]); MULADD(at[13], at[59]); MULADD(at[14], at[58]); MULADD(at[15], at[57]); MULADD(at[16], at[56]); MULADD(at[17], at[55]); MULADD(at[18], at[54]); MULADD(at[19], at[53]); MULADD(at[20], at[52]); MULADD(at[21], at[51]); MULADD(at[22], at[50]); MULADD(at[23], at[49]); MULADD(at[24], at[48]); MULADD(at[25], at[47]); MULADD(at[26], at[46]); MULADD(at[27], at[45]); MULADD(at[28], at[44]); MULADD(at[29], at[43]); MULADD(at[30], at[42]); MULADD(at[31], at[41]);
COMBA_STORE(C->dp[40]);
/* 41 */
COMBA_FORWARD;
MULADD(at[10], at[63]); MULADD(at[11], at[62]); MULADD(at[12], at[61]); MULADD(at[13], at[60]); MULADD(at[14], at[59]); MULADD(at[15], at[58]); MULADD(at[16], at[57]); MULADD(at[17], at[56]); MULADD(at[18], at[55]); MULADD(at[19], at[54]); MULADD(at[20], at[53]); MULADD(at[21], at[52]); MULADD(at[22], at[51]); MULADD(at[23], at[50]); MULADD(at[24], at[49]); MULADD(at[25], at[48]); MULADD(at[26], at[47]); MULADD(at[27], at[46]); MULADD(at[28], at[45]); MULADD(at[29], at[44]); MULADD(at[30], at[43]); MULADD(at[31], at[42]);
COMBA_STORE(C->dp[41]);
/* 42 */
COMBA_FORWARD;
MULADD(at[11], at[63]); MULADD(at[12], at[62]); MULADD(at[13], at[61]); MULADD(at[14], at[60]); MULADD(at[15], at[59]); MULADD(at[16], at[58]); MULADD(at[17], at[57]); MULADD(at[18], at[56]); MULADD(at[19], at[55]); MULADD(at[20], at[54]); MULADD(at[21], at[53]); MULADD(at[22], at[52]); MULADD(at[23], at[51]); MULADD(at[24], at[50]); MULADD(at[25], at[49]); MULADD(at[26], at[48]); MULADD(at[27], at[47]); MULADD(at[28], at[46]); MULADD(at[29], at[45]); MULADD(at[30], at[44]); MULADD(at[31], at[43]);
COMBA_STORE(C->dp[42]);
/* 43 */
COMBA_FORWARD;
MULADD(at[12], at[63]); MULADD(at[13], at[62]); MULADD(at[14], at[61]); MULADD(at[15], at[60]); MULADD(at[16], at[59]); MULADD(at[17], at[58]); MULADD(at[18], at[57]); MULADD(at[19], at[56]); MULADD(at[20], at[55]); MULADD(at[21], at[54]); MULADD(at[22], at[53]); MULADD(at[23], at[52]); MULADD(at[24], at[51]); MULADD(at[25], at[50]); MULADD(at[26], at[49]); MULADD(at[27], at[48]); MULADD(at[28], at[47]); MULADD(at[29], at[46]); MULADD(at[30], at[45]); MULADD(at[31], at[44]);
COMBA_STORE(C->dp[43]);
/* 44 */
COMBA_FORWARD;
MULADD(at[13], at[63]); MULADD(at[14], at[62]); MULADD(at[15], at[61]); MULADD(at[16], at[60]); MULADD(at[17], at[59]); MULADD(at[18], at[58]); MULADD(at[19], at[57]); MULADD(at[20], at[56]); MULADD(at[21], at[55]); MULADD(at[22], at[54]); MULADD(at[23], at[53]); MULADD(at[24], at[52]); MULADD(at[25], at[51]); MULADD(at[26], at[50]); MULADD(at[27], at[49]); MULADD(at[28], at[48]); MULADD(at[29], at[47]); MULADD(at[30], at[46]); MULADD(at[31], at[45]);
COMBA_STORE(C->dp[44]);
/* 45 */
COMBA_FORWARD;
MULADD(at[14], at[63]); MULADD(at[15], at[62]); MULADD(at[16], at[61]); MULADD(at[17], at[60]); MULADD(at[18], at[59]); MULADD(at[19], at[58]); MULADD(at[20], at[57]); MULADD(at[21], at[56]); MULADD(at[22], at[55]); MULADD(at[23], at[54]); MULADD(at[24], at[53]); MULADD(at[25], at[52]); MULADD(at[26], at[51]); MULADD(at[27], at[50]); MULADD(at[28], at[49]); MULADD(at[29], at[48]); MULADD(at[30], at[47]); MULADD(at[31], at[46]);
COMBA_STORE(C->dp[45]);
/* 46 */
COMBA_FORWARD;
MULADD(at[15], at[63]); MULADD(at[16], at[62]); MULADD(at[17], at[61]); MULADD(at[18], at[60]); MULADD(at[19], at[59]); MULADD(at[20], at[58]); MULADD(at[21], at[57]); MULADD(at[22], at[56]); MULADD(at[23], at[55]); MULADD(at[24], at[54]); MULADD(at[25], at[53]); MULADD(at[26], at[52]); MULADD(at[27], at[51]); MULADD(at[28], at[50]); MULADD(at[29], at[49]); MULADD(at[30], at[48]); MULADD(at[31], at[47]);
COMBA_STORE(C->dp[46]);
/* 47 */
COMBA_FORWARD;
MULADD(at[16], at[63]); MULADD(at[17], at[62]); MULADD(at[18], at[61]); MULADD(at[19], at[60]); MULADD(at[20], at[59]); MULADD(at[21], at[58]); MULADD(at[22], at[57]); MULADD(at[23], at[56]); MULADD(at[24], at[55]); MULADD(at[25], at[54]); MULADD(at[26], at[53]); MULADD(at[27], at[52]); MULADD(at[28], at[51]); MULADD(at[29], at[50]); MULADD(at[30], at[49]); MULADD(at[31], at[48]);
COMBA_STORE(C->dp[47]);
/* 48 */
COMBA_FORWARD;
MULADD(at[17], at[63]); MULADD(at[18], at[62]); MULADD(at[19], at[61]); MULADD(at[20], at[60]); MULADD(at[21], at[59]); MULADD(at[22], at[58]); MULADD(at[23], at[57]); MULADD(at[24], at[56]); MULADD(at[25], at[55]); MULADD(at[26], at[54]); MULADD(at[27], at[53]); MULADD(at[28], at[52]); MULADD(at[29], at[51]); MULADD(at[30], at[50]); MULADD(at[31], at[49]);
COMBA_STORE(C->dp[48]);
/* 49 */
COMBA_FORWARD;
MULADD(at[18], at[63]); MULADD(at[19], at[62]); MULADD(at[20], at[61]); MULADD(at[21], at[60]); MULADD(at[22], at[59]); MULADD(at[23], at[58]); MULADD(at[24], at[57]); MULADD(at[25], at[56]); MULADD(at[26], at[55]); MULADD(at[27], at[54]); MULADD(at[28], at[53]); MULADD(at[29], at[52]); MULADD(at[30], at[51]); MULADD(at[31], at[50]);
COMBA_STORE(C->dp[49]);
/* 50 */
COMBA_FORWARD;
MULADD(at[19], at[63]); MULADD(at[20], at[62]); MULADD(at[21], at[61]); MULADD(at[22], at[60]); MULADD(at[23], at[59]); MULADD(at[24], at[58]); MULADD(at[25], at[57]); MULADD(at[26], at[56]); MULADD(at[27], at[55]); MULADD(at[28], at[54]); MULADD(at[29], at[53]); MULADD(at[30], at[52]); MULADD(at[31], at[51]);
COMBA_STORE(C->dp[50]);
/* 51 */
COMBA_FORWARD;
MULADD(at[20], at[63]); MULADD(at[21], at[62]); MULADD(at[22], at[61]); MULADD(at[23], at[60]); MULADD(at[24], at[59]); MULADD(at[25], at[58]); MULADD(at[26], at[57]); MULADD(at[27], at[56]); MULADD(at[28], at[55]); MULADD(at[29], at[54]); MULADD(at[30], at[53]); MULADD(at[31], at[52]);
COMBA_STORE(C->dp[51]);
/* 52 */
COMBA_FORWARD;
MULADD(at[21], at[63]); MULADD(at[22], at[62]); MULADD(at[23], at[61]); MULADD(at[24], at[60]); MULADD(at[25], at[59]); MULADD(at[26], at[58]); MULADD(at[27], at[57]); MULADD(at[28], at[56]); MULADD(at[29], at[55]); MULADD(at[30], at[54]); MULADD(at[31], at[53]);
COMBA_STORE(C->dp[52]);
/* 53 */
COMBA_FORWARD;
MULADD(at[22], at[63]); MULADD(at[23], at[62]); MULADD(at[24], at[61]); MULADD(at[25], at[60]); MULADD(at[26], at[59]); MULADD(at[27], at[58]); MULADD(at[28], at[57]); MULADD(at[29], at[56]); MULADD(at[30], at[55]); MULADD(at[31], at[54]);
COMBA_STORE(C->dp[53]);
/* 54 */
COMBA_FORWARD;
MULADD(at[23], at[63]); MULADD(at[24], at[62]); MULADD(at[25], at[61]); MULADD(at[26], at[60]); MULADD(at[27], at[59]); MULADD(at[28], at[58]); MULADD(at[29], at[57]); MULADD(at[30], at[56]); MULADD(at[31], at[55]);
COMBA_STORE(C->dp[54]);
/* 55 */
COMBA_FORWARD;
MULADD(at[24], at[63]); MULADD(at[25], at[62]); MULADD(at[26], at[61]); MULADD(at[27], at[60]); MULADD(at[28], at[59]); MULADD(at[29], at[58]); MULADD(at[30], at[57]); MULADD(at[31], at[56]);
COMBA_STORE(C->dp[55]);
/* 56 */
COMBA_FORWARD;
MULADD(at[25], at[63]); MULADD(at[26], at[62]); MULADD(at[27], at[61]); MULADD(at[28], at[60]); MULADD(at[29], at[59]); MULADD(at[30], at[58]); MULADD(at[31], at[57]);
COMBA_STORE(C->dp[56]);
/* 57 */
COMBA_FORWARD;
MULADD(at[26], at[63]); MULADD(at[27], at[62]); MULADD(at[28], at[61]); MULADD(at[29], at[60]); MULADD(at[30], at[59]); MULADD(at[31], at[58]);
COMBA_STORE(C->dp[57]);
/* 58 */
COMBA_FORWARD;
MULADD(at[27], at[63]); MULADD(at[28], at[62]); MULADD(at[29], at[61]); MULADD(at[30], at[60]); MULADD(at[31], at[59]);
COMBA_STORE(C->dp[58]);
/* 59 */
COMBA_FORWARD;
MULADD(at[28], at[63]); MULADD(at[29], at[62]); MULADD(at[30], at[61]); MULADD(at[31], at[60]);
COMBA_STORE(C->dp[59]);
/* 60 */
COMBA_FORWARD;
MULADD(at[29], at[63]); MULADD(at[30], at[62]); MULADD(at[31], at[61]);
COMBA_STORE(C->dp[60]);
/* 61 */
COMBA_FORWARD;
MULADD(at[30], at[63]); MULADD(at[31], at[62]);
COMBA_STORE(C->dp[61]);
/* 62 */
COMBA_FORWARD;
MULADD(at[31], at[63]);
COMBA_STORE(C->dp[62]);
COMBA_STORE2(C->dp[63]);
C->used = 64;
C->sign = A->sign ^ B->sign;
fp_clamp(C);
COMBA_FINI;
}
#endif

36
fp_mul_d.c Normal file
View File

@ -0,0 +1,36 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* c = a * b, where b is a single digit.
 *
 * The result takes the sign of a.  A final carry is appended as an extra
 * digit unless a already occupies FP_SIZE digits, in which case it is
 * dropped.  Digits of c left over from its previous, longer value are zeroed.
 */
void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
{
   fp_word carry     = 0;
   int     prev_used = c->used;   /* how many digits c held on entry */
   int     ndigits   = a->used;
   int     i;

   c->used = ndigits;
   c->sign = a->sign;

   /* schoolbook single-digit multiply with a running carry */
   for (i = 0; i < ndigits; i++) {
      fp_word prod = ((fp_word)a->dp[i]) * ((fp_word)b) + carry;

      c->dp[i] = (fp_digit)prod;
      carry    = prod >> DIGIT_BIT;
   }

   /* store the final carry when there is still room for one more digit */
   if (carry != 0 && ndigits != FP_SIZE) {
      c->dp[c->used++] = carry;
      ++i;
   }

   /* wipe whatever is left of the old value of c */
   while (i < prev_used) {
      c->dp[i++] = 0;
   }

   fp_clamp(c);
}

18
fp_mulmod.c Normal file
View File

@ -0,0 +1,18 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* d = (a * b) mod c
 *
 * The full product is formed in a local temporary and then reduced;
 * the return value is whatever fp_mod() reports.
 */
int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
{
   fp_int product;
   int    status;

   fp_zero(&product);
   fp_mul(a, b, &product);

   status = fp_mod(&product, c, d);
   return status;
}

73
fp_prime_miller_rabin.c Normal file
View File

@ -0,0 +1,73 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* One round of the Miller-Rabin test on "a" using the base "b"
 * (HAC pp. 139, Algorithm 4.24).
 *
 * *result is set to FP_NO when "a" is found composite (or when b <= 1,
 * which is rejected up front) and to FP_YES when "a" passes this round,
 * i.e. is probably prime.  For a random base the chance of a composite
 * passing is no more than 1/4 and often very much lower.
 */
void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result)
{
   fp_int a_minus_1, odd_part, witness;
   int    twos, round;

   /* pessimistic default: every early return below means "composite" */
   *result = FP_NO;

   /* the base must be greater than one */
   if (fp_cmp_d(b, 1) != FP_GT) {
      return;
   }

   /* a_minus_1 = a - 1 */
   fp_init_copy(&a_minus_1, a);
   fp_sub_d(&a_minus_1, 1, &a_minus_1);

   /* write a - 1 = 2**twos * odd_part */
   fp_init_copy(&odd_part, &a_minus_1);
   twos = fp_cnt_lsb(&odd_part);
   fp_div_2d (&odd_part, twos, &odd_part, NULL);

   /* witness = b**odd_part mod a */
   fp_init(&witness);
   fp_exptmod(b, &odd_part, a, &witness);

   /* 1 or a-1 straight away: this base cannot expose a as composite */
   if (fp_cmp_d (&witness, 1) == FP_EQ || fp_cmp (&witness, &a_minus_1) == FP_EQ) {
      *result = FP_YES;
      return;
   }

   /* square up to twos-1 times, stopping early once a-1 shows up */
   for (round = 1; (round < twos) && fp_cmp (&witness, &a_minus_1) != FP_EQ; ++round) {
      fp_sqrmod (&witness, a, &witness);

      /* reaching 1 without passing through a-1 proves a is composite */
      if (fp_cmp_d (&witness, 1) == FP_EQ) {
         return;
      }
   }

   /* never hit a-1: composite */
   if (fp_cmp (&witness, &a_minus_1) != FP_EQ) {
      return;
   }

   /* probably prime now */
   *result = FP_YES;
}

97
fp_prime_random_ex.c Normal file
View File

@ -0,0 +1,97 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Generate a random probable prime of exactly "size" bits.
 *
 * a     - receives the prime
 * t     - must be > 0; only validated here, fp_isprime() takes no round count
 * size  - bit length of the prime, must be > 1
 * flags - TFM_PRIME_BBS       force the prime to be 3 mod 4
 *         TFM_PRIME_SAFE      require (a-1)/2 to be prime as well (implies BBS)
 *         TFM_PRIME_2MSB_ON   force the second most significant bit to one
 *         TFM_PRIME_2MSB_OFF  force the second most significant bit to zero
 *                             (ignored when TFM_PRIME_2MSB_ON is also given)
 * cb    - random source, called as cb(buf, len, dat); it must return len
 * dat   - opaque pointer handed through to cb
 *
 * Returns FP_OKAY on success, FP_VAL for bad arguments or a short read from
 * the callback, and FP_MEM when the scratch buffer cannot be allocated.
 *
 * The candidate is built in a byte buffer that fp_read_unsigned_bin()
 * consumes most-significant byte first, so tmp[0] holds the top bits and
 * tmp[bsize-1] the bottom bits.
 */
int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback cb, void *dat)
{
   unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb, maskAND_2msb, bit_2msb;
   int res, err, bsize, maskOR_msb_offset;

   /* sanity check the input */
   if (size <= 1 || t <= 0) {
      return FP_VAL;
   }

   /* TFM_PRIME_SAFE implies TFM_PRIME_BBS */
   if (flags & TFM_PRIME_SAFE) {
      flags |= TFM_PRIME_BBS;
   }

   /* calc the byte size */
   bsize = (size>>3) + ((size&7) ? 1 : 0);

   /* we need a buffer of bsize bytes */
   tmp = malloc((size_t)bsize);
   if (tmp == NULL) {
      return FP_MEM;
   }

   /* Mask that trims the MSbyte down to the bits that belong to the number.
    * When size is a multiple of 8 the whole byte is in use; the plain
    * 0xFF >> (8 - 0) would give 0 and wipe out all of its random bits.
    */
   maskAND = ((size & 7) == 0) ? 0xFF : (unsigned char)(0xFF >> (8 - (size & 7)));

   /* Locate the second most significant bit.  It shares tmp[0] with the top
    * bit unless the top bit is the only bit in tmp[0] (size % 8 == 1), in
    * which case it is the high bit of tmp[1].
    */
   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
   bit_2msb          = (unsigned char)(1 << ((size - 2) & 7));

   maskOR_msb   = 0;
   maskAND_2msb = 0xFF;
   if (flags & TFM_PRIME_2MSB_ON) {
      maskOR_msb = bit_2msb;
   } else if (flags & TFM_PRIME_2MSB_OFF) {
      maskAND_2msb = (unsigned char)~bit_2msb;
   }

   /* the candidate is always odd; BBS primes are additionally 3 mod 4 */
   maskOR_lsb = 1;
   if (flags & TFM_PRIME_BBS) {
      maskOR_lsb |= 3;
   }

   do {
      /* read the bytes */
      if (cb(tmp, bsize, dat) != bsize) {
         err = FP_VAL;
         goto error;
      }

      /* work over the MSbyte: drop excess bits, force the top bit on */
      tmp[0] &= maskAND;
      tmp[0] |= (unsigned char)(1 << ((size - 1) & 7));

      /* apply the second-MSB request, then force the low bits */
      tmp[maskOR_msb_offset] &= maskAND_2msb;
      tmp[maskOR_msb_offset] |= maskOR_msb;
      tmp[bsize-1]           |= maskOR_lsb;

      /* read it in */
      fp_read_unsigned_bin(a, tmp, bsize);

      /* is it prime? */
      res = fp_isprime(a);
      if (res == FP_NO) continue;

      if (flags & TFM_PRIME_SAFE) {
         /* see if (a-1)/2 is prime */
         fp_sub_d(a, 1, a);
         fp_div_2(a, a);

         /* is it prime? */
         res = fp_isprime(a);
      }
   } while (res == FP_NO);

   if (flags & TFM_PRIME_SAFE) {
      /* a currently holds (p-1)/2; restore a to the original value */
      fp_mul_2(a, a);
      fp_add_d(a, 1, a);
   }

   err = FP_OKAY;
error:
   free(tmp);
   return err;
}

14
fp_radix_size.c Normal file
View File

@ -0,0 +1,14 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Compute the buffer size needed to hold "a" as a string in the given radix.
 *
 * a     - value to measure
 * radix - base, 2..64 (the same range fp_read_radix accepts)
 * size  - receives the number of chars required: one per digit, one for a
 *         leading '-' when a is negative, and one for the terminating NUL
 *
 * Returns FP_VAL (with *size == 0) for an unsupported radix, else FP_OKAY.
 *
 * NOTE(review): the digit count is obtained by repeated division with
 * fp_div_d(), the library's single-digit divide; confirm it is declared in
 * tfm.h for this release.
 */
int fp_radix_size(fp_int *a, int radix, int *size)
{
   fp_int   t;
   fp_digit d;

   *size = 0;

   /* check range of the radix */
   if (radix < 2 || radix > 64) {
      return FP_VAL;
   }

   /* zero is the single digit "0" plus the terminator */
   if (fp_iszero(a) == FP_YES) {
      *size = 2;
      return FP_OKAY;
   }

   /* work on a copy so the caller's value is left untouched */
   fp_init_copy(&t, a);

   /* a negative value needs room for the '-' */
   if (t.sign == FP_NEG) {
      (*size)++;
      t.sign = FP_ZPOS;
   }

   /* strip one radix digit per pass; the remainder itself is not needed */
   while (fp_iszero(&t) != FP_YES) {
      fp_div_d(&t, (fp_digit) radix, &t, &d);
      (*size)++;
   }

   /* room for the terminating NUL */
   (*size)++;
   return FP_OKAY;
}

66
fp_read_radix.c Normal file
View File

@ -0,0 +1,66 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Parse a string in the given radix into "a".
 *
 * a     - receives the value
 * str   - NUL terminated text, optionally starting with '-'
 * radix - base, 2..64; digit glyphs are taken from fp_s_rmap
 *
 * Parsing stops quietly at the first character that is not a valid digit
 * for the radix; whatever was read up to that point is kept.
 *
 * Returns FP_VAL for an unsupported radix, otherwise FP_OKAY.
 */
int fp_read_radix(fp_int *a, char *str, int radix)
{
   int  y, neg;
   char ch;

   /* make sure the radix is ok */
   if (radix < 2 || radix > 64) {
      return FP_VAL;
   }

   /* if the leading digit is a
    * minus set the sign to negative.
    */
   if (*str == '-') {
      ++str;
      neg = FP_NEG;
   } else {
      neg = FP_ZPOS;
   }

   /* set the integer to the default of zero */
   fp_zero (a);

   /* process each digit of the string */
   while (*str) {
      /* Up to and including radix 36 only the glyphs 0-9 and A-Z are in
       * range, so the conversion is case insensitive: 1AB and 1ab are the
       * same value [e.g. in hex].  toupper() is only defined for values
       * representable as unsigned char, hence the cast.
       */
      ch = (char) ((radix <= 36) ? toupper ((unsigned char) *str) : *str);

      /* look the glyph up; y ends up as its digit value, or 64 if absent */
      for (y = 0; y < 64; y++) {
         if (ch == fp_s_rmap[y]) {
            break;
         }
      }

      /* not a digit of this radix: stop parsing */
      if (y >= radix) {
         break;
      }

      /* a = a * radix + digit */
      fp_mul_d (a, (fp_digit) radix, a);
      fp_add_d (a, (fp_digit) y, a);
      ++str;
   }

   /* set the sign only if a != 0 */
   if (fp_iszero(a) != FP_YES) {
      a->sign = neg;
   }
   return FP_OKAY;
}

23
fp_read_signed_bin.c Normal file
View File

@ -0,0 +1,23 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Read a signed big-endian byte string into "a".
 *
 * b[0] is the sign byte (0 for positive, non-zero for negative) and the
 * remaining c-1 bytes are the magnitude, most significant byte first.
 *
 * An empty input (c <= 0) yields zero without touching b.  A zero magnitude
 * is always stored as positive so that no "negative zero" is produced,
 * matching what fp_read_radix does.
 */
void fp_read_signed_bin(fp_int *a, unsigned char *b, int c)
{
   /* nothing to read, not even a sign byte */
   if (c <= 0) {
      fp_zero (a);
      return;
   }

   /* read magnitude */
   fp_read_unsigned_bin (a, b + 1, c - 1);

   /* first byte is 0 for positive, non-zero for negative */
   if (b[0] != 0 && fp_iszero(a) != FP_YES) {
      a->sign = FP_NEG;
   } else {
      a->sign = FP_ZPOS;
   }
}

24
fp_read_unsigned_bin.c Normal file
View File

@ -0,0 +1,24 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Read c bytes from b, most significant byte first, as an unsigned value. */
void fp_read_unsigned_bin(fp_int *a, unsigned char *b, int c)
{
   int i;

   /* start from zero */
   fp_zero (a);

   /* a = (a << 8) | next byte, for every input byte */
   for (i = 0; i < c; i++) {
      fp_mul_2d (a, 8, a);
      a->dp[0] |= b[i];
      /* NOTE(review): used is bumped on every byte and only trimmed by the
       * final clamp -- confirm this cannot run past FP_SIZE for long inputs.
       */
      a->used += 1;
   }

   fp_clamp (a);
}

27
fp_reverse.c Normal file
View File

@ -0,0 +1,27 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Reverse the first len bytes of s in place (helper for the radix code).
 * Lengths below two leave the buffer untouched.
 */
void bn_reverse (unsigned char *s, int len)
{
   unsigned char *lo, *hi;

   if (len < 2) {
      return;
   }

   /* walk inwards from both ends, swapping as we go */
   for (lo = s, hi = s + (len - 1); lo < hi; ++lo, --hi) {
      unsigned char swap = *lo;

      *lo = *hi;
      *hi = swap;
   }
}

36
fp_rshd.c Normal file
View File

@ -0,0 +1,36 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Shift "a" right by x whole digits, in place. */
void fp_rshd(fp_int *a, int x)
{
   int pos, remaining;

   /* shifting out every digit leaves zero */
   if (a->used <= x) {
      fp_zero(a);
      return;
   }

   remaining = a->used - x;

   /* slide the surviving digits down */
   for (pos = 0; pos < remaining; pos++) {
      a->dp[pos] = a->dp[pos + x];
   }

   /* clear the vacated top digits */
   while (pos < a->used) {
      a->dp[pos++] = 0;
   }

   a->used = remaining;
   fp_clamp(a);
}

13
fp_s_rmap.c Normal file
View File

@ -0,0 +1,13 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Digit alphabet used in radix conversions: the character at index v is
 * the glyph for digit value v.  64 entries in the order 0-9, A-Z, a-z,
 * '+', '/'.
 */
const char *fp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";

17
fp_set.c Normal file
View File

@ -0,0 +1,17 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Set "a" to the single-digit value b. */
void fp_set(fp_int *a, fp_digit b)
{
   fp_zero(a);
   a->dp[0] = b;

   /* zero is represented with no digits in use */
   if (b != 0) {
      a->used = 1;
   } else {
      a->used = 0;
   }
}

15
fp_signed_bin_size.c Normal file
View File

@ -0,0 +1,15 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Bytes needed for the signed binary form of "a": the unsigned size
 * plus one leading sign byte.
 */
int fp_signed_bin_size(fp_int *a)
{
   int magnitude_bytes = fp_unsigned_bin_size (a);

   return magnitude_bytes + 1;
}

107
fp_sqr.c Normal file
View File

@ -0,0 +1,107 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* B = A * A
 *
 * Inputs of up to 48 digits go to one of the comba squarers, selected by
 * the digit count of A; larger inputs use one level of Karatsuba that
 * recurses into fp_sqr for the three half-size squares.
 */
void fp_sqr(fp_int *A, fp_int *B)
{
int r, y, s;
fp_int aa, bb, comp, amb, t1;
y = A->used;
if (y <= 48) {
/* The unrolled squarers are used for digit counts at or just below their
 * width (1-4, 5-8, 12-16, 28-32); everything else in this range takes the
 * generic loop.
 * NOTE(review): the fixed-width routines read every digit up to their
 * width -- presumably digits above A->used are zero; confirm.
 */
if (y <= 4) {
fp_sqr_comba4(A,B);
} else if (y <= 8) {
fp_sqr_comba8(A,B);
} else if (y <= 16 && y >= 12) {
fp_sqr_comba16(A,B);
#ifdef TFM_HUGE
} else if (y <= 32 && y >= 28) {
fp_sqr_comba32(A,B);
#endif
} else {
fp_sqr_comba(A, B);
}
} else {
/* Karatsuba squaring.
Split A at digit r into a high part a and a low part b, A = a*X + b
where X stands for a shift by r digits.  Then
A^2 = a^2*X^2 + (a^2 + b^2 - (a-b)^2)*X + b^2
so three half-size squares are enough; the result is assembled from
them with digit shifts and additions.
Obvious points of optimization
- the a^2 part could be memcpy'ed with an offset [all you have to do is zero up to the next 8 digits]
- similarly the b^2 part could be memcpy'ed and zeroed to 8
*/
/* get our value of r */
r = y >> 1;
/* high part: t1 = digits r.. of A (same effect as fp_copy + fp_rshd by r) */
for (s = 0; s < A->used - r; s++) {
t1.dp[s] = A->dp[s+r];
}
for (; s < FP_SIZE; s++) {
t1.dp[s] = 0;
}
/* r is half of A->used, so the first branch is always the one taken */
if (A->used >= r) {
t1.used = A->used - r;
} else {
t1.used = 0;
}
t1.sign = A->sign;
/* keep a copy of the high part in amb, then aa = a^2 */
fp_copy(&t1, &amb);
fp_zero(&aa);
fp_sqr(&t1, &aa);
/* low part: t1 = digits 0..r-1 of A (same effect as fp_mod_2d by r*DIGIT_BIT) */
for (s = 0; s < r; s++) {
t1.dp[s] = A->dp[s];
}
for (; s < FP_SIZE; s++) {
t1.dp[s] = 0;
}
t1.used = r;
fp_clamp(&t1);
/* amb = a - b, bb = b^2 */
fp_sub(&amb, &t1, &amb);
fp_zero(&bb);
fp_sqr(&t1, &bb);
/* now get the (a-b) term: comp = (a-b)^2 */
fp_zero(&comp);
fp_sqr(&amb, &comp);
/* now solve the system, do the middle term first:
 * comp = (-(a-b)^2 + a^2 + b^2), shifted up by r digits
 */
/* toggle the sign field to negate comp (assumes the two sign values differ only in bit 0 -- confirm) */
comp.sign ^= 1;
fp_add(&comp, &aa, &comp);
fp_add(&comp, &bb, &comp);
fp_lshd(&comp, r);
/* leading term: a^2 shifted up by 2r digits */
fp_lshd(&aa, r+r);
/* now sum them together; a square is never negative */
fp_zero(B);
fp_add(&aa, &comp, B);
fp_add(&bb, B, B);
B->sign = FP_ZPOS;
}
}

956
fp_sqr_comba.c Normal file
View File

@ -0,0 +1,956 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* About this file...
*
* Column-wise ("comba") squaring.  Every architecture branch below defines
* the same macro set, working on a three digit accumulator c2:c1:c0 that
* the squaring functions declare as locals:
*
*   COMBA_START    - setup hook (empty in every branch here)
*   CLEAR_CARRY    - zero the accumulator
*   COMBA_STORE(x) - store the low accumulator digit c0 into x
*   COMBA_STORE2(x)- store the middle accumulator digit c1 into x
*   CARRY_FORWARD  - move on to the next column: c0 = c1, c1 = c2, c2 = 0
*   COMBA_FINI     - teardown hook (only the SSE2 branch needs one)
*   SQRADD(i, j)   - add one product to the accumulator; used for the
*                    square terms, the asm variants read only i
*   SQRADD2(i, j)  - add the product i*j to the accumulator twice
*/
#if defined(TFM_X86)
/* x86-32 optimized */
#define COMBA_START
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;
#define COMBA_STORE(x) \
x = c0;
#define COMBA_STORE2(x) \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
#define COMBA_FINI
/* edx:eax = i*i, then add/adc it into c0, c1 and ripple the carry into c2 */
#define SQRADD(i, j) \
asm volatile ( \
"movl %6,%%eax \n\t" \
"mull %%eax \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
/* edx:eax = i*j, accumulated twice with the same add/adc chain */
#define SQRADD2(i, j) \
asm volatile ( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#elif defined(TFM_X86_64)
/* x86-64 optimized: same sequences as the x86-32 branch with 64-bit registers */
#define COMBA_START
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;
#define COMBA_STORE(x) \
x = c0;
#define COMBA_STORE2(x) \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
#define COMBA_FINI
/* rdx:rax = i*i, added into the accumulator once */
#define SQRADD(i, j) \
asm volatile ( \
"movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%rax","%rdx","%cc");
/* rdx:rax = i*j, added into the accumulator twice */
#define SQRADD2(i, j) \
asm volatile ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2)
/* SSE2 Optimized: the multiply is done with pmuludq in the mm registers,
 * the accumulation with the ordinary 32-bit add/adc chain
 */
#define COMBA_START
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;
#define COMBA_STORE(x) \
x = c0;
#define COMBA_STORE2(x) \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
/* issue emms once the mm registers are no longer needed */
#define COMBA_FINI \
asm("emms");
/* mm0 = i*i, split into eax (low) and edx (high), added once.
 * NOTE(review): mm0 is written but not listed as clobbered -- confirm this is intended.
 */
#define SQRADD(i, j) \
asm volatile ( \
"movd %6,%%mm0 \n\t" \
"pmuludq %%mm0,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \
"psrlq $32,%%mm0 \n\t" \
"movd %%mm0,%%edx \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
/* mm0 = i*j, split into eax (low) and edx (high), added twice */
#define SQRADD2(i, j) \
asm volatile ( \
"movd %6,%%mm0 \n\t" \
"movd %7,%%mm1 \n\t" \
"pmuludq %%mm1,%%mm0\n\t" \
"movd %%mm0,%%eax \n\t" \
"psrlq $32,%%mm0 \n\t" \
"movd %%mm0,%%edx \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#elif defined(TFM_ARM)
/* ARM code */
#define COMBA_START
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;
#define COMBA_STORE(x) \
x = c0;
#define COMBA_STORE2(x) \
x = c1;
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
#define COMBA_FINI
/* r1:r0 = i*i (UMULL), added into c0 and c1 with the carry rippling into c2 */
#define SQRADD(i, j) \
asm( \
" UMULL r0,r1,%6,%6 \n\t" \
" ADDS %0,%0,r0 \n\t" \
" ADCS %1,%1,r1 \n\t" \
" ADC %2,%2,#0 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
/* for squaring some of the terms are doubled: r1:r0 = i*j, added twice */
#define SQRADD2(i, j) \
asm( \
" UMULL r0,r1,%6,%7 \n\t" \
" ADDS %0,%0,r0 \n\t" \
" ADCS %1,%1,r1 \n\t" \
" ADC %2,%2,#0 \n\t" \
" ADDS %0,%0,r0 \n\t" \
" ADCS %1,%1,r1 \n\t" \
" ADC %2,%2,#0 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
#else
/* ISO C portable code
 *
 * The accumulator is the three digits c2:c1:c0; the macros also use the
 * fp_word scratch variable "t" that every squaring function declares.
 * Additions are carried out in fp_word (twice the width of a digit) so the
 * carry out of each digit is simply the upper half of the sum.  Detecting
 * the carry by comparison and bumping c1 with ++ would lose the carry into
 * c2 whenever c1 wraps around to zero.
 *
 * The bodies end in ';' like every other macro in this file, so existing
 * call sites (which add their own ';') keep compiling unchanged.
 */
#define COMBA_START

/* zero the accumulator */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* store the low accumulator digit */
#define COMBA_STORE(x) \
   x = c0;

/* store the middle accumulator digit */
#define COMBA_STORE2(x) \
   x = c1;

/* step to the next column: drop c0, shift the upper digits down */
#define CARRY_FORWARD \
   c0 = c1; c1 = c2; c2 = 0;

#define COMBA_FINI

/* c2:c1:c0 += i * j */
#define SQRADD(i, j)                                              \
   do {                                                           \
      t   = ((fp_word)c0) + ((fp_word)(i)) * ((fp_word)(j));      \
      c0  = (fp_digit)t;                                          \
      t   = ((fp_word)c1) + (t >> DIGIT_BIT);                     \
      c1  = (fp_digit)t;                                          \
      c2 += (fp_digit)(t >> DIGIT_BIT);                           \
   } while (0);

/* for squaring some of the terms are doubled: c2:c1:c0 += 2 * i * j.
 * The product is formed once and accumulated twice.
 */
#define SQRADD2(i, j)                                             \
   do {                                                           \
      fp_word sqradd2_sum;                                        \
      t           = ((fp_word)(i)) * ((fp_word)(j));              \
      sqradd2_sum = ((fp_word)c0) + t;                            \
      c0          = (fp_digit)sqradd2_sum;                        \
      sqradd2_sum = ((fp_word)c1) + (sqradd2_sum >> DIGIT_BIT);   \
      c1          = (fp_digit)sqradd2_sum;                        \
      c2         += (fp_digit)(sqradd2_sum >> DIGIT_BIT);         \
      sqradd2_sum = ((fp_word)c0) + t;                            \
      c0          = (fp_digit)sqradd2_sum;                        \
      sqradd2_sum = ((fp_word)c1) + (sqradd2_sum >> DIGIT_BIT);   \
      c1          = (fp_digit)sqradd2_sum;                        \
      c2         += (fp_digit)(sqradd2_sum >> DIGIT_BIT);         \
   } while (0);
#endif
/* Generic comba squarer: B = A * A for any digit count.
 *
 * The result is produced one output column at a time.  In each column the
 * off-diagonal products A[lo]*A[hi] (lo < hi) are added twice and, for
 * even columns, the square of the middle digit is added once.  The output
 * is limited to FP_SIZE-1 columns.  A and B may be the same object; the
 * result is then built in a local temporary and copied over at the end.
 */
void fp_sqr_comba(fp_int *A, fp_int *B)
{
   int       ncols, col;
   fp_digit  c0, c1, c2;
   fp_int    scratch, *out;
   fp_word   t;

   /* number of output digits to produce, trimmed to what fits */
   ncols = A->used + A->used;
   if (ncols >= FP_SIZE) {
      ncols = FP_SIZE-1;
   }

   COMBA_START;
   CLEAR_CARRY;

   /* never write into the operand while it is still being read */
   out = (A == B) ? &scratch : B;
   fp_zero(out);

   for (col = 0; col < ncols; col++) {
      int lo, hi, pairs, half, k;

      /* first pair of this column: hi is the larger index, lo + hi == col */
      hi = (col < A->used - 1) ? col : A->used - 1;
      lo = col - hi;

      /* pairs on the anti-diagonal before either index leaves the number */
      pairs = A->used - lo;
      if (hi + 1 < pairs) {
         pairs = hi + 1;
      }

      /* lo and hi move towards each other, so only half of those pairs
       * are distinct; the lo == hi term is handled separately below
       */
      half = (hi - lo + 1) >> 1;
      if (half < pairs) {
         pairs = half;
      }

      /* forward carries */
      CARRY_FORWARD;

      /* doubled cross products */
      for (k = 0; k < pairs; k++) {
         SQRADD2(A->dp[lo + k], A->dp[hi - k]);
      }

      /* even columns have the square term in them */
      if ((col & 1) == 0) {
         SQRADD(A->dp[col>>1], A->dp[col>>1]);
      }

      /* store it */
      COMBA_STORE(out->dp[col]);
   }
   COMBA_STORE2(out->dp[col]);
   COMBA_FINI;

   /* setup dest */
   out->used = ncols;
   fp_clamp (out);
   if (out != B) {
      fp_copy(out, B);
   }
}
/* Fully unrolled comba squarer for 4-digit inputs: B = A * A.
 *
 * Digits a[0]..a[3] are always read, whatever A->used says (presumably the
 * digits above A->used are zero -- confirm with the callers).  The eight
 * result digits are built in the local array b and copied into B at the
 * end, so A and B may be the same object.  B is set positive and clamped.
 */
void fp_sqr_comba4(fp_int *A, fp_int *B)
{
/* t is scratch for the portable SQRADD/SQRADD2 macros */
fp_word t;
fp_digit *a, b[8], c0, c1, c2;
a = A->dp;
COMBA_START;
/* clear carries */
CLEAR_CARRY;
/* output 0 */
SQRADD(a[0],a[0]);
COMBA_STORE(b[0]);
/* output 1 */
CARRY_FORWARD;
SQRADD2(a[0], a[1]);
COMBA_STORE(b[1]);
/* output 2 */
CARRY_FORWARD;
SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
COMBA_STORE(b[2]);
/* output 3 */
CARRY_FORWARD;
SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
COMBA_STORE(b[3]);
/* output 4 */
CARRY_FORWARD;
SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
COMBA_STORE(b[4]);
/* output 5 */
CARRY_FORWARD;
SQRADD2(a[2], a[3]);
COMBA_STORE(b[5]);
/* output 6 */
CARRY_FORWARD;
SQRADD(a[3], a[3]);
COMBA_STORE(b[6]);
/* output 7 is the carry left over from column 6 */
COMBA_STORE2(b[7]);
COMBA_FINI;
B->used = 8;
B->sign = FP_ZPOS;
memcpy(B->dp, b, 8 * sizeof(fp_digit));
fp_clamp(B);
}
/* Fully unrolled comba squarer for 8-digit inputs: B = A * A.
 *
 * Digits a[0]..a[7] are always read, whatever A->used says (presumably the
 * digits above A->used are zero -- confirm with the callers).  The sixteen
 * result digits are built in the local array b and copied into B at the
 * end, so A and B may be the same object.  B is set positive and clamped.
 */
void fp_sqr_comba8(fp_int *A, fp_int *B)
{
/* t is scratch for the portable SQRADD/SQRADD2 macros */
fp_word t;
fp_digit *a, b[16], c0, c1, c2;
a = A->dp;
COMBA_START;
/* clear carries */
CLEAR_CARRY;
/* output 0 */
SQRADD(a[0],a[0]);
COMBA_STORE(b[0]);
/* output 1 */
CARRY_FORWARD;
SQRADD2(a[0], a[1]);
COMBA_STORE(b[1]);
/* output 2 */
CARRY_FORWARD;
SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
COMBA_STORE(b[2]);
/* output 3 */
CARRY_FORWARD;
SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
COMBA_STORE(b[3]);
/* output 4 */
CARRY_FORWARD;
SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
COMBA_STORE(b[4]);
/* output 5 */
CARRY_FORWARD;
SQRADD2(a[0], a[5]); SQRADD2(a[1], a[4]); SQRADD2(a[2], a[3]);
COMBA_STORE(b[5]);
/* output 6 */
CARRY_FORWARD;
SQRADD2(a[0], a[6]); SQRADD2(a[1], a[5]); SQRADD2(a[2], a[4]); SQRADD(a[3], a[3]);
COMBA_STORE(b[6]);
/* output 7 */
CARRY_FORWARD;
SQRADD2(a[0], a[7]); SQRADD2(a[1], a[6]); SQRADD2(a[2], a[5]); SQRADD2(a[3], a[4]);
COMBA_STORE(b[7]);
/* output 8 */
CARRY_FORWARD;
SQRADD2(a[1], a[7]); SQRADD2(a[2], a[6]); SQRADD2(a[3], a[5]); SQRADD(a[4], a[4]);
COMBA_STORE(b[8]);
/* output 9 */
CARRY_FORWARD;
SQRADD2(a[2], a[7]); SQRADD2(a[3], a[6]); SQRADD2(a[4], a[5]);
COMBA_STORE(b[9]);
/* output 10 */
CARRY_FORWARD;
SQRADD2(a[3], a[7]); SQRADD2(a[4], a[6]); SQRADD(a[5], a[5]);
COMBA_STORE(b[10]);
/* output 11 */
CARRY_FORWARD;
SQRADD2(a[4], a[7]); SQRADD2(a[5], a[6]);
COMBA_STORE(b[11]);
/* output 12 */
CARRY_FORWARD;
SQRADD2(a[5], a[7]); SQRADD(a[6], a[6]);
COMBA_STORE(b[12]);
/* output 13 */
CARRY_FORWARD;
SQRADD2(a[6], a[7]);
COMBA_STORE(b[13]);
/* output 14 */
CARRY_FORWARD;
SQRADD(a[7], a[7]);
COMBA_STORE(b[14]);
/* output 15 is the carry left over from column 14 */
COMBA_STORE2(b[15]);
COMBA_FINI;
B->used = 16;
B->sign = FP_ZPOS;
memcpy(B->dp, b, 16 * sizeof(fp_digit));
fp_clamp(B);
}
/* Fully unrolled comba squarer for 16-digit inputs: B = A * A.
 *
 * Digits a[0]..a[15] are always read, whatever A->used says (presumably the
 * digits above A->used are zero -- confirm with the callers).  The 32
 * result digits are built in the local array b and copied into B at the
 * end, so A and B may be the same object.  B is set positive and clamped.
 */
void fp_sqr_comba16(fp_int *A, fp_int *B)
{
/* t is scratch for the portable SQRADD/SQRADD2 macros */
fp_word t;
fp_digit *a, b[32], c0, c1, c2;
a = A->dp;
COMBA_START;
/* clear carries */
CLEAR_CARRY;
/* output 0 */
SQRADD(a[0],a[0]);
COMBA_STORE(b[0]);
/* output 1 */
CARRY_FORWARD;
SQRADD2(a[0], a[1]);
COMBA_STORE(b[1]);
/* output 2 */
CARRY_FORWARD;
SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
COMBA_STORE(b[2]);
/* output 3 */
CARRY_FORWARD;
SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
COMBA_STORE(b[3]);
/* output 4 */
CARRY_FORWARD;
SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
COMBA_STORE(b[4]);
/* output 5 */
CARRY_FORWARD;
SQRADD2(a[0], a[5]); SQRADD2(a[1], a[4]); SQRADD2(a[2], a[3]);
COMBA_STORE(b[5]);
/* output 6 */
CARRY_FORWARD;
SQRADD2(a[0], a[6]); SQRADD2(a[1], a[5]); SQRADD2(a[2], a[4]); SQRADD(a[3], a[3]);
COMBA_STORE(b[6]);
/* output 7 */
CARRY_FORWARD;
SQRADD2(a[0], a[7]); SQRADD2(a[1], a[6]); SQRADD2(a[2], a[5]); SQRADD2(a[3], a[4]);
COMBA_STORE(b[7]);
/* output 8 */
CARRY_FORWARD;
SQRADD2(a[0], a[8]); SQRADD2(a[1], a[7]); SQRADD2(a[2], a[6]); SQRADD2(a[3], a[5]); SQRADD(a[4], a[4]);
COMBA_STORE(b[8]);
/* output 9 */
CARRY_FORWARD;
SQRADD2(a[0], a[9]); SQRADD2(a[1], a[8]); SQRADD2(a[2], a[7]); SQRADD2(a[3], a[6]); SQRADD2(a[4], a[5]);
COMBA_STORE(b[9]);
/* output 10 */
CARRY_FORWARD;
SQRADD2(a[0], a[10]); SQRADD2(a[1], a[9]); SQRADD2(a[2], a[8]); SQRADD2(a[3], a[7]); SQRADD2(a[4], a[6]); SQRADD(a[5], a[5]);
COMBA_STORE(b[10]);
/* output 11 */
CARRY_FORWARD;
SQRADD2(a[0], a[11]); SQRADD2(a[1], a[10]); SQRADD2(a[2], a[9]); SQRADD2(a[3], a[8]); SQRADD2(a[4], a[7]); SQRADD2(a[5], a[6]);
COMBA_STORE(b[11]);
/* output 12 */
CARRY_FORWARD;
SQRADD2(a[0], a[12]); SQRADD2(a[1], a[11]); SQRADD2(a[2], a[10]); SQRADD2(a[3], a[9]); SQRADD2(a[4], a[8]); SQRADD2(a[5], a[7]); SQRADD(a[6], a[6]);
COMBA_STORE(b[12]);
/* output 13 */
CARRY_FORWARD;
SQRADD2(a[0], a[13]); SQRADD2(a[1], a[12]); SQRADD2(a[2], a[11]); SQRADD2(a[3], a[10]); SQRADD2(a[4], a[9]); SQRADD2(a[5], a[8]); SQRADD2(a[6], a[7]);
COMBA_STORE(b[13]);
/* output 14 */
CARRY_FORWARD;
SQRADD2(a[0], a[14]); SQRADD2(a[1], a[13]); SQRADD2(a[2], a[12]); SQRADD2(a[3], a[11]); SQRADD2(a[4], a[10]); SQRADD2(a[5], a[9]); SQRADD2(a[6], a[8]); SQRADD(a[7], a[7]);
COMBA_STORE(b[14]);
/* output 15 */
CARRY_FORWARD;
SQRADD2(a[0], a[15]); SQRADD2(a[1], a[14]); SQRADD2(a[2], a[13]); SQRADD2(a[3], a[12]); SQRADD2(a[4], a[11]); SQRADD2(a[5], a[10]); SQRADD2(a[6], a[9]); SQRADD2(a[7], a[8]);
COMBA_STORE(b[15]);
/* output 16 */
CARRY_FORWARD;
SQRADD2(a[1], a[15]); SQRADD2(a[2], a[14]); SQRADD2(a[3], a[13]); SQRADD2(a[4], a[12]); SQRADD2(a[5], a[11]); SQRADD2(a[6], a[10]); SQRADD2(a[7], a[9]); SQRADD(a[8], a[8]);
COMBA_STORE(b[16]);
/* output 17 */
CARRY_FORWARD;
SQRADD2(a[2], a[15]); SQRADD2(a[3], a[14]); SQRADD2(a[4], a[13]); SQRADD2(a[5], a[12]); SQRADD2(a[6], a[11]); SQRADD2(a[7], a[10]); SQRADD2(a[8], a[9]);
COMBA_STORE(b[17]);
/* output 18 */
CARRY_FORWARD;
SQRADD2(a[3], a[15]); SQRADD2(a[4], a[14]); SQRADD2(a[5], a[13]); SQRADD2(a[6], a[12]); SQRADD2(a[7], a[11]); SQRADD2(a[8], a[10]); SQRADD(a[9], a[9]);
COMBA_STORE(b[18]);
/* output 19 */
CARRY_FORWARD;
SQRADD2(a[4], a[15]); SQRADD2(a[5], a[14]); SQRADD2(a[6], a[13]); SQRADD2(a[7], a[12]); SQRADD2(a[8], a[11]); SQRADD2(a[9], a[10]);
COMBA_STORE(b[19]);
/* output 20 */
CARRY_FORWARD;
SQRADD2(a[5], a[15]); SQRADD2(a[6], a[14]); SQRADD2(a[7], a[13]); SQRADD2(a[8], a[12]); SQRADD2(a[9], a[11]); SQRADD(a[10], a[10]);
COMBA_STORE(b[20]);
/* output 21 */
CARRY_FORWARD;
SQRADD2(a[6], a[15]); SQRADD2(a[7], a[14]); SQRADD2(a[8], a[13]); SQRADD2(a[9], a[12]); SQRADD2(a[10], a[11]);
COMBA_STORE(b[21]);
/* output 22 */
CARRY_FORWARD;
SQRADD2(a[7], a[15]); SQRADD2(a[8], a[14]); SQRADD2(a[9], a[13]); SQRADD2(a[10], a[12]); SQRADD(a[11], a[11]);
COMBA_STORE(b[22]);
/* output 23 */
CARRY_FORWARD;
SQRADD2(a[8], a[15]); SQRADD2(a[9], a[14]); SQRADD2(a[10], a[13]); SQRADD2(a[11], a[12]);
COMBA_STORE(b[23]);
/* output 24 */
CARRY_FORWARD;
SQRADD2(a[9], a[15]); SQRADD2(a[10], a[14]); SQRADD2(a[11], a[13]); SQRADD(a[12], a[12]);
COMBA_STORE(b[24]);
/* output 25 */
CARRY_FORWARD;
SQRADD2(a[10], a[15]); SQRADD2(a[11], a[14]); SQRADD2(a[12], a[13]);
COMBA_STORE(b[25]);
/* output 26 */
CARRY_FORWARD;
SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
COMBA_STORE(b[26]);
/* output 27 */
CARRY_FORWARD;
SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
COMBA_STORE(b[27]);
/* output 28 */
CARRY_FORWARD;
SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
COMBA_STORE(b[28]);
/* output 29 */
CARRY_FORWARD;
SQRADD2(a[14], a[15]);
COMBA_STORE(b[29]);
/* output 30 */
CARRY_FORWARD;
SQRADD(a[15], a[15]);
COMBA_STORE(b[30]);
/* output 31 is the carry left over from column 30 */
COMBA_STORE2(b[31]);
COMBA_FINI;
B->used = 32;
B->sign = FP_ZPOS;
memcpy(B->dp, b, 32 * sizeof(fp_digit));
fp_clamp(B);
}
#ifdef TFM_HUGE
/* Fully unrolled comba squaring of a 32-digit operand: B = A * A.
 *
 * The digits a[0] .. a[31] are read directly from A->dp and the 64 result
 * digits are built in the local array b[], one output column at a time
 * ("output k" below).  For column k every pair a[i], a[j] with i + j == k
 * and i <= j is passed to SQRADD when i == j and to SQRADD2 when i < j.
 * The finished digits are copied into B->dp only after the last column, so
 * every read of A->dp happens before B->dp is written.  B is then marked
 * non-negative with 64 digits in use and handed to fp_clamp().
 *
 * All 32 input digits are read unconditionally, whatever A->used says.
 *
 * NOTE(review): COMBA_START, CLEAR_CARRY, CARRY_FORWARD, SQRADD, SQRADD2,
 * COMBA_STORE, COMBA_STORE2 and COMBA_FINI are macros defined outside this
 * chunk.  Presumably SQRADD2 accumulates the product twice (the two equal
 * cross terms of a square) and the macros work on the locals t, c0, c1 and
 * c2 -- confirm against their definitions.
 */
void fp_sqr_comba32(fp_int *A, fp_int *B)
{
fp_word t;
fp_digit *a, b[64], c0, c1, c2;
a = A->dp;
COMBA_START;
/* clear carries */
CLEAR_CARRY;
/* output 0 */
SQRADD(a[0],a[0]);
COMBA_STORE(b[0]);
/* output 1 */
CARRY_FORWARD;
SQRADD2(a[0], a[1]);
COMBA_STORE(b[1]);
/* output 2 */
CARRY_FORWARD;
SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
COMBA_STORE(b[2]);
/* output 3 */
CARRY_FORWARD;
SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
COMBA_STORE(b[3]);
/* output 4 */
CARRY_FORWARD;
SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
COMBA_STORE(b[4]);
/* output 5 */
CARRY_FORWARD;
SQRADD2(a[0], a[5]); SQRADD2(a[1], a[4]); SQRADD2(a[2], a[3]);
COMBA_STORE(b[5]);
/* output 6 */
CARRY_FORWARD;
SQRADD2(a[0], a[6]); SQRADD2(a[1], a[5]); SQRADD2(a[2], a[4]); SQRADD(a[3], a[3]);
COMBA_STORE(b[6]);
/* output 7 */
CARRY_FORWARD;
SQRADD2(a[0], a[7]); SQRADD2(a[1], a[6]); SQRADD2(a[2], a[5]); SQRADD2(a[3], a[4]);
COMBA_STORE(b[7]);
/* output 8 */
CARRY_FORWARD;
SQRADD2(a[0], a[8]); SQRADD2(a[1], a[7]); SQRADD2(a[2], a[6]); SQRADD2(a[3], a[5]); SQRADD(a[4], a[4]);
COMBA_STORE(b[8]);
/* output 9 */
CARRY_FORWARD;
SQRADD2(a[0], a[9]); SQRADD2(a[1], a[8]); SQRADD2(a[2], a[7]); SQRADD2(a[3], a[6]); SQRADD2(a[4], a[5]);
COMBA_STORE(b[9]);
/* output 10 */
CARRY_FORWARD;
SQRADD2(a[0], a[10]); SQRADD2(a[1], a[9]); SQRADD2(a[2], a[8]); SQRADD2(a[3], a[7]); SQRADD2(a[4], a[6]); SQRADD(a[5], a[5]);
COMBA_STORE(b[10]);
/* output 11 */
CARRY_FORWARD;
SQRADD2(a[0], a[11]); SQRADD2(a[1], a[10]); SQRADD2(a[2], a[9]); SQRADD2(a[3], a[8]); SQRADD2(a[4], a[7]); SQRADD2(a[5], a[6]);
COMBA_STORE(b[11]);
/* output 12 */
CARRY_FORWARD;
SQRADD2(a[0], a[12]); SQRADD2(a[1], a[11]); SQRADD2(a[2], a[10]); SQRADD2(a[3], a[9]); SQRADD2(a[4], a[8]); SQRADD2(a[5], a[7]); SQRADD(a[6], a[6]);
COMBA_STORE(b[12]);
/* output 13 */
CARRY_FORWARD;
SQRADD2(a[0], a[13]); SQRADD2(a[1], a[12]); SQRADD2(a[2], a[11]); SQRADD2(a[3], a[10]); SQRADD2(a[4], a[9]); SQRADD2(a[5], a[8]); SQRADD2(a[6], a[7]);
COMBA_STORE(b[13]);
/* output 14 */
CARRY_FORWARD;
SQRADD2(a[0], a[14]); SQRADD2(a[1], a[13]); SQRADD2(a[2], a[12]); SQRADD2(a[3], a[11]); SQRADD2(a[4], a[10]); SQRADD2(a[5], a[9]); SQRADD2(a[6], a[8]); SQRADD(a[7], a[7]);
COMBA_STORE(b[14]);
/* output 15 */
CARRY_FORWARD;
SQRADD2(a[0], a[15]); SQRADD2(a[1], a[14]); SQRADD2(a[2], a[13]); SQRADD2(a[3], a[12]); SQRADD2(a[4], a[11]); SQRADD2(a[5], a[10]); SQRADD2(a[6], a[9]); SQRADD2(a[7], a[8]);
COMBA_STORE(b[15]);
/* output 16 */
CARRY_FORWARD;
SQRADD2(a[0], a[16]); SQRADD2(a[1], a[15]); SQRADD2(a[2], a[14]); SQRADD2(a[3], a[13]); SQRADD2(a[4], a[12]); SQRADD2(a[5], a[11]); SQRADD2(a[6], a[10]); SQRADD2(a[7], a[9]); SQRADD(a[8], a[8]);
COMBA_STORE(b[16]);
/* output 17 */
CARRY_FORWARD;
SQRADD2(a[0], a[17]); SQRADD2(a[1], a[16]); SQRADD2(a[2], a[15]); SQRADD2(a[3], a[14]); SQRADD2(a[4], a[13]); SQRADD2(a[5], a[12]); SQRADD2(a[6], a[11]); SQRADD2(a[7], a[10]); SQRADD2(a[8], a[9]);
COMBA_STORE(b[17]);
/* output 18 */
CARRY_FORWARD;
SQRADD2(a[0], a[18]); SQRADD2(a[1], a[17]); SQRADD2(a[2], a[16]); SQRADD2(a[3], a[15]); SQRADD2(a[4], a[14]); SQRADD2(a[5], a[13]); SQRADD2(a[6], a[12]); SQRADD2(a[7], a[11]); SQRADD2(a[8], a[10]); SQRADD(a[9], a[9]);
COMBA_STORE(b[18]);
/* output 19 */
CARRY_FORWARD;
SQRADD2(a[0], a[19]); SQRADD2(a[1], a[18]); SQRADD2(a[2], a[17]); SQRADD2(a[3], a[16]); SQRADD2(a[4], a[15]); SQRADD2(a[5], a[14]); SQRADD2(a[6], a[13]); SQRADD2(a[7], a[12]); SQRADD2(a[8], a[11]); SQRADD2(a[9], a[10]);
COMBA_STORE(b[19]);
/* output 20 */
CARRY_FORWARD;
SQRADD2(a[0], a[20]); SQRADD2(a[1], a[19]); SQRADD2(a[2], a[18]); SQRADD2(a[3], a[17]); SQRADD2(a[4], a[16]); SQRADD2(a[5], a[15]); SQRADD2(a[6], a[14]); SQRADD2(a[7], a[13]); SQRADD2(a[8], a[12]); SQRADD2(a[9], a[11]); SQRADD(a[10], a[10]);
COMBA_STORE(b[20]);
/* output 21 */
CARRY_FORWARD;
SQRADD2(a[0], a[21]); SQRADD2(a[1], a[20]); SQRADD2(a[2], a[19]); SQRADD2(a[3], a[18]); SQRADD2(a[4], a[17]); SQRADD2(a[5], a[16]); SQRADD2(a[6], a[15]); SQRADD2(a[7], a[14]); SQRADD2(a[8], a[13]); SQRADD2(a[9], a[12]); SQRADD2(a[10], a[11]);
COMBA_STORE(b[21]);
/* output 22 */
CARRY_FORWARD;
SQRADD2(a[0], a[22]); SQRADD2(a[1], a[21]); SQRADD2(a[2], a[20]); SQRADD2(a[3], a[19]); SQRADD2(a[4], a[18]); SQRADD2(a[5], a[17]); SQRADD2(a[6], a[16]); SQRADD2(a[7], a[15]); SQRADD2(a[8], a[14]); SQRADD2(a[9], a[13]); SQRADD2(a[10], a[12]); SQRADD(a[11], a[11]);
COMBA_STORE(b[22]);
/* output 23 */
CARRY_FORWARD;
SQRADD2(a[0], a[23]); SQRADD2(a[1], a[22]); SQRADD2(a[2], a[21]); SQRADD2(a[3], a[20]); SQRADD2(a[4], a[19]); SQRADD2(a[5], a[18]); SQRADD2(a[6], a[17]); SQRADD2(a[7], a[16]); SQRADD2(a[8], a[15]); SQRADD2(a[9], a[14]); SQRADD2(a[10], a[13]); SQRADD2(a[11], a[12]);
COMBA_STORE(b[23]);
/* output 24 */
CARRY_FORWARD;
SQRADD2(a[0], a[24]); SQRADD2(a[1], a[23]); SQRADD2(a[2], a[22]); SQRADD2(a[3], a[21]); SQRADD2(a[4], a[20]); SQRADD2(a[5], a[19]); SQRADD2(a[6], a[18]); SQRADD2(a[7], a[17]); SQRADD2(a[8], a[16]); SQRADD2(a[9], a[15]); SQRADD2(a[10], a[14]); SQRADD2(a[11], a[13]); SQRADD(a[12], a[12]);
COMBA_STORE(b[24]);
/* output 25 */
CARRY_FORWARD;
SQRADD2(a[0], a[25]); SQRADD2(a[1], a[24]); SQRADD2(a[2], a[23]); SQRADD2(a[3], a[22]); SQRADD2(a[4], a[21]); SQRADD2(a[5], a[20]); SQRADD2(a[6], a[19]); SQRADD2(a[7], a[18]); SQRADD2(a[8], a[17]); SQRADD2(a[9], a[16]); SQRADD2(a[10], a[15]); SQRADD2(a[11], a[14]); SQRADD2(a[12], a[13]);
COMBA_STORE(b[25]);
/* output 26 */
CARRY_FORWARD;
SQRADD2(a[0], a[26]); SQRADD2(a[1], a[25]); SQRADD2(a[2], a[24]); SQRADD2(a[3], a[23]); SQRADD2(a[4], a[22]); SQRADD2(a[5], a[21]); SQRADD2(a[6], a[20]); SQRADD2(a[7], a[19]); SQRADD2(a[8], a[18]); SQRADD2(a[9], a[17]); SQRADD2(a[10], a[16]); SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
COMBA_STORE(b[26]);
/* output 27 */
CARRY_FORWARD;
SQRADD2(a[0], a[27]); SQRADD2(a[1], a[26]); SQRADD2(a[2], a[25]); SQRADD2(a[3], a[24]); SQRADD2(a[4], a[23]); SQRADD2(a[5], a[22]); SQRADD2(a[6], a[21]); SQRADD2(a[7], a[20]); SQRADD2(a[8], a[19]); SQRADD2(a[9], a[18]); SQRADD2(a[10], a[17]); SQRADD2(a[11], a[16]); SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
COMBA_STORE(b[27]);
/* output 28 */
CARRY_FORWARD;
SQRADD2(a[0], a[28]); SQRADD2(a[1], a[27]); SQRADD2(a[2], a[26]); SQRADD2(a[3], a[25]); SQRADD2(a[4], a[24]); SQRADD2(a[5], a[23]); SQRADD2(a[6], a[22]); SQRADD2(a[7], a[21]); SQRADD2(a[8], a[20]); SQRADD2(a[9], a[19]); SQRADD2(a[10], a[18]); SQRADD2(a[11], a[17]); SQRADD2(a[12], a[16]); SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
COMBA_STORE(b[28]);
/* output 29 */
CARRY_FORWARD;
SQRADD2(a[0], a[29]); SQRADD2(a[1], a[28]); SQRADD2(a[2], a[27]); SQRADD2(a[3], a[26]); SQRADD2(a[4], a[25]); SQRADD2(a[5], a[24]); SQRADD2(a[6], a[23]); SQRADD2(a[7], a[22]); SQRADD2(a[8], a[21]); SQRADD2(a[9], a[20]); SQRADD2(a[10], a[19]); SQRADD2(a[11], a[18]); SQRADD2(a[12], a[17]); SQRADD2(a[13], a[16]); SQRADD2(a[14], a[15]);
COMBA_STORE(b[29]);
/* output 30 */
CARRY_FORWARD;
SQRADD2(a[0], a[30]); SQRADD2(a[1], a[29]); SQRADD2(a[2], a[28]); SQRADD2(a[3], a[27]); SQRADD2(a[4], a[26]); SQRADD2(a[5], a[25]); SQRADD2(a[6], a[24]); SQRADD2(a[7], a[23]); SQRADD2(a[8], a[22]); SQRADD2(a[9], a[21]); SQRADD2(a[10], a[20]); SQRADD2(a[11], a[19]); SQRADD2(a[12], a[18]); SQRADD2(a[13], a[17]); SQRADD2(a[14], a[16]); SQRADD(a[15], a[15]);
COMBA_STORE(b[30]);
/* output 31 */
CARRY_FORWARD;
SQRADD2(a[0], a[31]); SQRADD2(a[1], a[30]); SQRADD2(a[2], a[29]); SQRADD2(a[3], a[28]); SQRADD2(a[4], a[27]); SQRADD2(a[5], a[26]); SQRADD2(a[6], a[25]); SQRADD2(a[7], a[24]); SQRADD2(a[8], a[23]); SQRADD2(a[9], a[22]); SQRADD2(a[10], a[21]); SQRADD2(a[11], a[20]); SQRADD2(a[12], a[19]); SQRADD2(a[13], a[18]); SQRADD2(a[14], a[17]); SQRADD2(a[15], a[16]);
COMBA_STORE(b[31]);
/* output 32 */
CARRY_FORWARD;
SQRADD2(a[1], a[31]); SQRADD2(a[2], a[30]); SQRADD2(a[3], a[29]); SQRADD2(a[4], a[28]); SQRADD2(a[5], a[27]); SQRADD2(a[6], a[26]); SQRADD2(a[7], a[25]); SQRADD2(a[8], a[24]); SQRADD2(a[9], a[23]); SQRADD2(a[10], a[22]); SQRADD2(a[11], a[21]); SQRADD2(a[12], a[20]); SQRADD2(a[13], a[19]); SQRADD2(a[14], a[18]); SQRADD2(a[15], a[17]); SQRADD(a[16], a[16]);
COMBA_STORE(b[32]);
/* output 33 */
CARRY_FORWARD;
SQRADD2(a[2], a[31]); SQRADD2(a[3], a[30]); SQRADD2(a[4], a[29]); SQRADD2(a[5], a[28]); SQRADD2(a[6], a[27]); SQRADD2(a[7], a[26]); SQRADD2(a[8], a[25]); SQRADD2(a[9], a[24]); SQRADD2(a[10], a[23]); SQRADD2(a[11], a[22]); SQRADD2(a[12], a[21]); SQRADD2(a[13], a[20]); SQRADD2(a[14], a[19]); SQRADD2(a[15], a[18]); SQRADD2(a[16], a[17]);
COMBA_STORE(b[33]);
/* output 34 */
CARRY_FORWARD;
SQRADD2(a[3], a[31]); SQRADD2(a[4], a[30]); SQRADD2(a[5], a[29]); SQRADD2(a[6], a[28]); SQRADD2(a[7], a[27]); SQRADD2(a[8], a[26]); SQRADD2(a[9], a[25]); SQRADD2(a[10], a[24]); SQRADD2(a[11], a[23]); SQRADD2(a[12], a[22]); SQRADD2(a[13], a[21]); SQRADD2(a[14], a[20]); SQRADD2(a[15], a[19]); SQRADD2(a[16], a[18]); SQRADD(a[17], a[17]);
COMBA_STORE(b[34]);
/* output 35 */
CARRY_FORWARD;
SQRADD2(a[4], a[31]); SQRADD2(a[5], a[30]); SQRADD2(a[6], a[29]); SQRADD2(a[7], a[28]); SQRADD2(a[8], a[27]); SQRADD2(a[9], a[26]); SQRADD2(a[10], a[25]); SQRADD2(a[11], a[24]); SQRADD2(a[12], a[23]); SQRADD2(a[13], a[22]); SQRADD2(a[14], a[21]); SQRADD2(a[15], a[20]); SQRADD2(a[16], a[19]); SQRADD2(a[17], a[18]);
COMBA_STORE(b[35]);
/* output 36 */
CARRY_FORWARD;
SQRADD2(a[5], a[31]); SQRADD2(a[6], a[30]); SQRADD2(a[7], a[29]); SQRADD2(a[8], a[28]); SQRADD2(a[9], a[27]); SQRADD2(a[10], a[26]); SQRADD2(a[11], a[25]); SQRADD2(a[12], a[24]); SQRADD2(a[13], a[23]); SQRADD2(a[14], a[22]); SQRADD2(a[15], a[21]); SQRADD2(a[16], a[20]); SQRADD2(a[17], a[19]); SQRADD(a[18], a[18]);
COMBA_STORE(b[36]);
/* output 37 */
CARRY_FORWARD;
SQRADD2(a[6], a[31]); SQRADD2(a[7], a[30]); SQRADD2(a[8], a[29]); SQRADD2(a[9], a[28]); SQRADD2(a[10], a[27]); SQRADD2(a[11], a[26]); SQRADD2(a[12], a[25]); SQRADD2(a[13], a[24]); SQRADD2(a[14], a[23]); SQRADD2(a[15], a[22]); SQRADD2(a[16], a[21]); SQRADD2(a[17], a[20]); SQRADD2(a[18], a[19]);
COMBA_STORE(b[37]);
/* output 38 */
CARRY_FORWARD;
SQRADD2(a[7], a[31]); SQRADD2(a[8], a[30]); SQRADD2(a[9], a[29]); SQRADD2(a[10], a[28]); SQRADD2(a[11], a[27]); SQRADD2(a[12], a[26]); SQRADD2(a[13], a[25]); SQRADD2(a[14], a[24]); SQRADD2(a[15], a[23]); SQRADD2(a[16], a[22]); SQRADD2(a[17], a[21]); SQRADD2(a[18], a[20]); SQRADD(a[19], a[19]);
COMBA_STORE(b[38]);
/* output 39 */
CARRY_FORWARD;
SQRADD2(a[8], a[31]); SQRADD2(a[9], a[30]); SQRADD2(a[10], a[29]); SQRADD2(a[11], a[28]); SQRADD2(a[12], a[27]); SQRADD2(a[13], a[26]); SQRADD2(a[14], a[25]); SQRADD2(a[15], a[24]); SQRADD2(a[16], a[23]); SQRADD2(a[17], a[22]); SQRADD2(a[18], a[21]); SQRADD2(a[19], a[20]);
COMBA_STORE(b[39]);
/* output 40 */
CARRY_FORWARD;
SQRADD2(a[9], a[31]); SQRADD2(a[10], a[30]); SQRADD2(a[11], a[29]); SQRADD2(a[12], a[28]); SQRADD2(a[13], a[27]); SQRADD2(a[14], a[26]); SQRADD2(a[15], a[25]); SQRADD2(a[16], a[24]); SQRADD2(a[17], a[23]); SQRADD2(a[18], a[22]); SQRADD2(a[19], a[21]); SQRADD(a[20], a[20]);
COMBA_STORE(b[40]);
/* output 41 */
CARRY_FORWARD;
SQRADD2(a[10], a[31]); SQRADD2(a[11], a[30]); SQRADD2(a[12], a[29]); SQRADD2(a[13], a[28]); SQRADD2(a[14], a[27]); SQRADD2(a[15], a[26]); SQRADD2(a[16], a[25]); SQRADD2(a[17], a[24]); SQRADD2(a[18], a[23]); SQRADD2(a[19], a[22]); SQRADD2(a[20], a[21]);
COMBA_STORE(b[41]);
/* output 42 */
CARRY_FORWARD;
SQRADD2(a[11], a[31]); SQRADD2(a[12], a[30]); SQRADD2(a[13], a[29]); SQRADD2(a[14], a[28]); SQRADD2(a[15], a[27]); SQRADD2(a[16], a[26]); SQRADD2(a[17], a[25]); SQRADD2(a[18], a[24]); SQRADD2(a[19], a[23]); SQRADD2(a[20], a[22]); SQRADD(a[21], a[21]);
COMBA_STORE(b[42]);
/* output 43 */
CARRY_FORWARD;
SQRADD2(a[12], a[31]); SQRADD2(a[13], a[30]); SQRADD2(a[14], a[29]); SQRADD2(a[15], a[28]); SQRADD2(a[16], a[27]); SQRADD2(a[17], a[26]); SQRADD2(a[18], a[25]); SQRADD2(a[19], a[24]); SQRADD2(a[20], a[23]); SQRADD2(a[21], a[22]);
COMBA_STORE(b[43]);
/* output 44 */
CARRY_FORWARD;
SQRADD2(a[13], a[31]); SQRADD2(a[14], a[30]); SQRADD2(a[15], a[29]); SQRADD2(a[16], a[28]); SQRADD2(a[17], a[27]); SQRADD2(a[18], a[26]); SQRADD2(a[19], a[25]); SQRADD2(a[20], a[24]); SQRADD2(a[21], a[23]); SQRADD(a[22], a[22]);
COMBA_STORE(b[44]);
/* output 45 */
CARRY_FORWARD;
SQRADD2(a[14], a[31]); SQRADD2(a[15], a[30]); SQRADD2(a[16], a[29]); SQRADD2(a[17], a[28]); SQRADD2(a[18], a[27]); SQRADD2(a[19], a[26]); SQRADD2(a[20], a[25]); SQRADD2(a[21], a[24]); SQRADD2(a[22], a[23]);
COMBA_STORE(b[45]);
/* output 46 */
CARRY_FORWARD;
SQRADD2(a[15], a[31]); SQRADD2(a[16], a[30]); SQRADD2(a[17], a[29]); SQRADD2(a[18], a[28]); SQRADD2(a[19], a[27]); SQRADD2(a[20], a[26]); SQRADD2(a[21], a[25]); SQRADD2(a[22], a[24]); SQRADD(a[23], a[23]);
COMBA_STORE(b[46]);
/* output 47 */
CARRY_FORWARD;
SQRADD2(a[16], a[31]); SQRADD2(a[17], a[30]); SQRADD2(a[18], a[29]); SQRADD2(a[19], a[28]); SQRADD2(a[20], a[27]); SQRADD2(a[21], a[26]); SQRADD2(a[22], a[25]); SQRADD2(a[23], a[24]);
COMBA_STORE(b[47]);
/* output 48 */
CARRY_FORWARD;
SQRADD2(a[17], a[31]); SQRADD2(a[18], a[30]); SQRADD2(a[19], a[29]); SQRADD2(a[20], a[28]); SQRADD2(a[21], a[27]); SQRADD2(a[22], a[26]); SQRADD2(a[23], a[25]); SQRADD(a[24], a[24]);
COMBA_STORE(b[48]);
/* output 49 */
CARRY_FORWARD;
SQRADD2(a[18], a[31]); SQRADD2(a[19], a[30]); SQRADD2(a[20], a[29]); SQRADD2(a[21], a[28]); SQRADD2(a[22], a[27]); SQRADD2(a[23], a[26]); SQRADD2(a[24], a[25]);
COMBA_STORE(b[49]);
/* output 50 */
CARRY_FORWARD;
SQRADD2(a[19], a[31]); SQRADD2(a[20], a[30]); SQRADD2(a[21], a[29]); SQRADD2(a[22], a[28]); SQRADD2(a[23], a[27]); SQRADD2(a[24], a[26]); SQRADD(a[25], a[25]);
COMBA_STORE(b[50]);
/* output 51 */
CARRY_FORWARD;
SQRADD2(a[20], a[31]); SQRADD2(a[21], a[30]); SQRADD2(a[22], a[29]); SQRADD2(a[23], a[28]); SQRADD2(a[24], a[27]); SQRADD2(a[25], a[26]);
COMBA_STORE(b[51]);
/* output 52 */
CARRY_FORWARD;
SQRADD2(a[21], a[31]); SQRADD2(a[22], a[30]); SQRADD2(a[23], a[29]); SQRADD2(a[24], a[28]); SQRADD2(a[25], a[27]); SQRADD(a[26], a[26]);
COMBA_STORE(b[52]);
/* output 53 */
CARRY_FORWARD;
SQRADD2(a[22], a[31]); SQRADD2(a[23], a[30]); SQRADD2(a[24], a[29]); SQRADD2(a[25], a[28]); SQRADD2(a[26], a[27]);
COMBA_STORE(b[53]);
/* output 54 */
CARRY_FORWARD;
SQRADD2(a[23], a[31]); SQRADD2(a[24], a[30]); SQRADD2(a[25], a[29]); SQRADD2(a[26], a[28]); SQRADD(a[27], a[27]);
COMBA_STORE(b[54]);
/* output 55 */
CARRY_FORWARD;
SQRADD2(a[24], a[31]); SQRADD2(a[25], a[30]); SQRADD2(a[26], a[29]); SQRADD2(a[27], a[28]);
COMBA_STORE(b[55]);
/* output 56 */
CARRY_FORWARD;
SQRADD2(a[25], a[31]); SQRADD2(a[26], a[30]); SQRADD2(a[27], a[29]); SQRADD(a[28], a[28]);
COMBA_STORE(b[56]);
/* output 57 */
CARRY_FORWARD;
SQRADD2(a[26], a[31]); SQRADD2(a[27], a[30]); SQRADD2(a[28], a[29]);
COMBA_STORE(b[57]);
/* output 58 */
CARRY_FORWARD;
SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
COMBA_STORE(b[58]);
/* output 59 */
CARRY_FORWARD;
SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
COMBA_STORE(b[59]);
/* output 60 */
CARRY_FORWARD;
SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
COMBA_STORE(b[60]);
/* output 61 */
CARRY_FORWARD;
SQRADD2(a[30], a[31]);
COMBA_STORE(b[61]);
/* output 62 */
CARRY_FORWARD;
SQRADD(a[31], a[31]);
COMBA_STORE(b[62]);
/* NOTE(review): COMBA_STORE2 presumably writes the carry left over from
 * column 62 into the top digit -- confirm against the macro definition. */
COMBA_STORE2(b[63]);
COMBA_FINI;
/* publish the result: 64 digits, non-negative; fp_clamp() is called last */
B->used = 64;
B->sign = FP_ZPOS;
memcpy(B->dp, b, 64 * sizeof(fp_digit));
fp_clamp(B);
}
#endif

19
fp_sqrmod.c Normal file
View File

@ -0,0 +1,19 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Modular squaring: c = (a * a) mod b.
 *
 * The square is formed in a zeroed local temporary and then reduced by
 * fp_mod(); the value returned is whatever fp_mod() returns.
 */
int fp_sqrmod(fp_int *a, fp_int *b, fp_int *c)
{
   int    status;
   fp_int squared;

   fp_zero(&squared);
   fp_sqr(a, &squared);

   status = fp_mod(&squared, b, c);
   return status;
}

46
fp_sub.c Normal file
View File

@ -0,0 +1,46 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Signed subtraction: c = a - b.
 *
 * The sign of c is always assigned before the magnitude helper
 * (s_fp_add / s_fp_sub) is called.
 */
void fp_sub(fp_int *a, fp_int *b, fp_int *c)
{
   const int sign_a = a->sign;
   const int sign_b = b->sign;

   /* Operands of opposite sign: the magnitudes add up and the result
    * carries the sign of the first operand. */
   if (sign_a != sign_b) {
      c->sign = sign_a;
      s_fp_add (a, b, c);
      return;
   }

   /* Same sign and |a| < |b|: compute |b| - |a| and flip the sign of the
    * first operand. */
   if (fp_cmp_mag (a, b) == FP_LT) {
      if (sign_a == FP_ZPOS) {
         c->sign = FP_NEG;
      } else {
         c->sign = FP_ZPOS;
      }
      s_fp_sub (b, a, c);
      return;
   }

   /* Same sign and |a| >= |b|: compute |a| - |b| and keep the sign of the
    * first operand. */
   c->sign = sign_a;
   s_fp_sub (a, b, c);
}

18
fp_sub_d.c Normal file
View File

@ -0,0 +1,18 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Subtract a single digit from a bignum: c = a - b.
 *
 * The digit is loaded into a temporary fp_int with fp_set() and the work is
 * handed to fp_sub().
 */
void fp_sub_d(fp_int *a, fp_digit b, fp_int *c)
{
   fp_int digit_value;

   fp_set(&digit_value, b);
   fp_sub(a, &digit_value, c);
}

20
fp_submod.c Normal file
View File

@ -0,0 +1,20 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Modular subtraction: d = (a - b) mod c.
 *
 * The difference is formed in a zeroed local temporary and then reduced by
 * fp_mod(); the value returned is whatever fp_mod() returns.
 */
int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
{
   int    status;
   fp_int diff;

   fp_zero(&diff);
   fp_sub(a, b, &diff);

   status = fp_mod(&diff, c, d);
   return status;
}

16
fp_to_signed_bin.c Normal file
View File

@ -0,0 +1,16 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Store a in signed binary form: b[0] is the sign byte (0 when a->sign is
 * FP_ZPOS, 1 otherwise) and the bytes written by fp_to_unsigned_bin()
 * follow from b[1] on.
 */
void fp_to_signed_bin(fp_int *a, unsigned char *b)
{
   unsigned char sign_byte;

   /* magnitude first, starting one byte in */
   fp_to_unsigned_bin (a, &b[1]);

   if (a->sign == FP_ZPOS) {
      sign_byte = 0;
   } else {
      sign_byte = 1;
   }
   b[0] = sign_byte;
}

25
fp_to_unsigned_bin.c Normal file
View File

@ -0,0 +1,25 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Write the value of a into b as raw bytes.
 *
 * A working copy of a is consumed eight bits at a time, least significant
 * byte first; bn_reverse() is then applied to the bytes that were written.
 * Nothing is written when a is zero.
 */
void fp_to_unsigned_bin(fp_int *a, unsigned char *b)
{
   fp_int rest;
   int    nbytes;

   fp_init_copy(&rest, a);

   for (nbytes = 0; fp_iszero (&rest) == FP_NO; ++nbytes) {
      /* low eight bits of what is left, then shift them out */
      b[nbytes] = (unsigned char) (rest.dp[0] & 255);
      fp_div_2d (&rest, 8, &rest, NULL);
   }

   bn_reverse (b, nbytes);
}

55
fp_toradix.c Normal file
View File

@ -0,0 +1,55 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Convert a to a NUL-terminated string in the given radix.
 *
 * Returns FP_VAL when radix is outside 2..64 and FP_OKAY otherwise.  Zero
 * is written as "0"; a negative value gets a leading '-'.  Digit characters
 * are looked up in fp_s_rmap.
 */
int fp_toradix(fp_int *a, char *str, int radix)
{
   fp_int   work;
   fp_digit rem;
   char    *first_digit;
   int      ndigits;

   /* only radices 2 through 64 are accepted */
   if (!(radix >= 2 && radix <= 64)) {
      return FP_VAL;
   }

   /* zero needs no division loop */
   if (fp_iszero(a) == 1) {
      str[0] = '0';
      str[1] = '\0';
      return FP_OKAY;
   }

   fp_init_copy(&work, a);

   /* emit the sign, if any, and continue with the magnitude */
   first_digit = str;
   if (work.sign == FP_NEG) {
      *str++      = '-';
      first_digit = str;
      work.sign   = FP_ZPOS;
   }

   /* peel off digits, least significant first */
   for (ndigits = 0; fp_iszero (&work) == FP_NO; ++ndigits) {
      fp_div_d (&work, (fp_digit) radix, &work, &rem);
      *str++ = fp_s_rmap[rem];
   }

   /* the digits were produced backwards; first_digit points just past the
    * sign character, if there is one */
   bn_reverse ((unsigned char *) first_digit, ndigits);

   /* terminate the string */
   *str = '\0';
   return FP_OKAY;
}

16
fp_unsigned_bin_size.c Normal file
View File

@ -0,0 +1,16 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* Number of bytes needed for fp_count_bits(a) bits, rounded up to a whole
 * byte.
 */
int fp_unsigned_bin_size(fp_int *a)
{
   int bits  = fp_count_bits (a);
   int bytes = bits / 8;

   /* a partial trailing byte still needs a full byte */
   if ((bits & 7) != 0) {
      ++bytes;
   }
   return bytes;
}

17
gen.pl Normal file
View File

@ -0,0 +1,17 @@
#!/usr/bin/perl -w
#
# Generates a "single file" you can use to quickly
# add the whole source without any makefile troubles
#
# Every library source is concatenated into mpi.c, each one wrapped in
# "Start:" / "End:" marker comments.  Both fp_*.c and s_fp_*.c are picked
# up: the makefile builds s_fp_add.o and s_fp_sub.o into the library, so
# the single-file build needs those sources as well.
#
use strict;

# three-argument open and lexical handles: the file name is never
# interpreted as a mode or a pipe
open( my $out, '>', "mpi.c" ) or die "Couldn't open mpi.c for writing: $!";

foreach my $filename (glob "fp_*.c s_fp_*.c") {
   open( my $src, '<', $filename ) or die "Couldn't open $filename for reading: $!";
   print {$out} "/* Start: $filename */\n";
   print {$out} $_ while <$src>;
   print {$out} "\n/* End: $filename */\n\n";
   close $src or die "Error closing $filename after reading: $!";
}

print {$out} "\n/* EOF */\n";
close $out or die "Error closing mpi.c after writing: $!";

78
makefile Normal file
View File

@ -0,0 +1,78 @@
#makefile for TomsFastMath
#
#
CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops

#profiling
#PROF=-pg -g
#CFLAGS += $(PROF)

#speed
CFLAGS += -fomit-frame-pointer

VERSION=0.01

# archive indexer; a variable so a different toolchain can supply its own
RANLIB ?= ranlib

# targets that do not name a file they create
.PHONY: default docdvi docs clean zipup

default: libtfm.a

OBJECTS = \
fp_set.o \
\
fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
fp_mul_2.o fp_div_2.o \
\
fp_cnt_lsb.o \
\
fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
s_fp_add.o s_fp_sub.o \
\
fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
\
fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
\
fp_exptmod.o \
\
fp_cmp.o fp_cmp_mag.o \
\
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o

libtfm.a: $(OBJECTS)
	$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
	$(RANLIB) libtfm.a

# "&&" so the sub-make never runs in this directory if the cd fails;
# $(MAKE) so flags and the jobserver are passed down
mtest/mtest: mtest/mtest.c
	cd mtest && $(MAKE) mtest

test: libtfm.a demo/test.o mtest/mtest
	$(CC) demo/test.o libtfm.a $(PROF) -o test

stest: libtfm.a demo/stest.o
	$(CC) demo/stest.o libtfm.a -o stest

docdvi: tfm.tex
	touch tfm.ind
	latex tfm >/dev/null
	latex tfm >/dev/null
	makeindex tfm
	latex tfm >/dev/null

docs: docdvi
	latex tfm >/dev/null
	dvipdf tfm
	mv -f tfm.pdf doc

clean:
	rm -f $(OBJECTS) *.a demo/*.o test tfm.aux tfm.dvi tfm.idx tfm.ilg tfm.ind tfm.lof tfm.log tfm.toc stest
	cd mtest && $(MAKE) clean

# every step is chained with "&&": in particular the rm -rf must not run
# unless the preceding "cd .." succeeded
zipup: docs clean
	perl gen.pl && mv mpi.c pre_gen/ && \
	cd .. && rm -rf tfm* tomsfastmath-$(VERSION) && mkdir tomsfastmath-$(VERSION) && \
	cp -R ./tomsfastmath/* ./tomsfastmath-$(VERSION)/ && \
	tar -c tomsfastmath-$(VERSION)/* | bzip2 -9vvc > tfm-$(VERSION).tar.bz2 && \
	zip -9r tfm-$(VERSION).zip tomsfastmath-$(VERSION)/*

55
makefile.gba Normal file
View File

@ -0,0 +1,55 @@
#makefile for TomsFastMath
#
#For the Gameboy Advance... er.... ARMv4
SFLAGS = $(CFLAGS) -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -mthumb -mthumb-interwork -I../devkitadv/mylib/lib
CFLAGS += -Wall -W -Wshadow -I./ -O3 -funroll-all-loops -marm -mthumb-interwork -I../devkitadv/mylib/lib

#profiling
#PROF=-pg -g
#CFLAGS += $(PROF)

#speed
CFLAGS += -fomit-frame-pointer

VERSION=0.01

# binutils helpers; variables so the cross toolchain's own tools can be
# selected (e.g. make -f makefile.gba RANLIB=arm-elf-ranlib OBJCOPY=arm-elf-objcopy)
RANLIB ?= ranlib
OBJCOPY ?= objcopy

# "stest" produces stest.elf / stest.bin, never a file called stest
.PHONY: default stest

default: libtfm.a

OBJECTS = \
fp_set.o \
\
fp_rshd.o fp_lshd.o fp_div_2d.o fp_mod_2d.o fp_mul_2d.o fp_2expt.o \
fp_mul_2.o fp_div_2.o \
\
fp_cnt_lsb.o \
\
fp_add.o fp_sub.o fp_mul.o fp_sqr.o fp_div.o fp_mod.o \
s_fp_add.o s_fp_sub.o \
\
fp_cmp_d.o fp_add_d.o fp_sub_d.o fp_mul_d.o fp_div_d.o fp_mod_d.o \
fp_addmod.o fp_submod.o fp_mulmod.o fp_sqrmod.o fp_invmod.o \
fp_gcd.o fp_lcm.o fp_prime_miller_rabin.o fp_isprime.o \
fp_prime_random_ex.o fp_mul_comba.o fp_sqr_comba.o \
\
fp_montgomery_setup.o fp_montgomery_calc_normalization.o fp_montgomery_reduce.o \
\
fp_exptmod.o \
\
fp_cmp.o fp_cmp_mag.o \
\
fp_unsigned_bin_size.o fp_read_unsigned_bin.o fp_to_unsigned_bin.o \
fp_signed_bin_size.o fp_read_signed_bin.o fp_to_signed_bin.o \
fp_read_radix.o fp_toradix.o fp_count_bits.o fp_reverse.o fp_s_rmap.o

libtfm.a: $(OBJECTS)
	$(AR) $(ARFLAGS) libtfm.a $(OBJECTS)
	$(RANLIB) libtfm.a

# the demo is built with SFLAGS rather than CFLAGS
demo/stest.o: demo/stest.c
	$(CC) $(SFLAGS) -DGBA_MODE demo/stest.c -c -o demo/stest.o

stest: libtfm.a demo/stest.o
	$(CC) -mthumb -mthumb-interwork demo/stest.o libtfm.a ../devkitadv/mylib/lib/gba.a -o stest.elf
	$(OBJCOPY) -O binary stest.elf stest.bin

9
mtest/makefile Normal file
View File

@ -0,0 +1,9 @@
# builds mtest, the test-vector generator (links against LibTomMath)
CFLAGS += -Wall -W -O3

# neither target names a file it creates
.PHONY: default clean

default: mtest

# LDFLAGS lets the caller point the linker at a non-default libtommath
mtest: mtest.o
	$(CC) $(LDFLAGS) mtest.o -ltommath -o mtest

clean:
	rm -f *.o mtest

320
mtest/mtest.c Normal file
View File

@ -0,0 +1,320 @@
/* makes a bignum test harness with NUM tests per operation
*
* the output is made in the following format [one parameter per line]
operation
operand1
operand2
[... operandN]
result1
result2
[... resultN]
So for example "a * b mod n" would be
mulmod
a
b
n
a*b mod n
e.g. if a=3, b=4 n=11 then
mulmod
3
4
11
1
*/
#ifdef MP_8BIT
#define THE_MASK 127
#else
#define THE_MASK 32767
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <tommath.h>
FILE *rng;
/* Report a failure of the random source and stop: the generator cannot
 * produce meaningful vectors without random input. */
static void rng_fail(void)
{
   fprintf(stderr, "\nError: could not read from the random source\n");
   exit(EXIT_FAILURE);
}

/* Read a random signed number with a magnitude of 1..max_bytes bytes from
 * rng into a.  max_bytes must be between 1 and 2047 so the magnitude plus
 * the sign byte fits the local buffer.
 *
 * The stream is consumed in this order: two bytes that select the size,
 * one byte whose low bit becomes the sign byte, then the magnitude bytes.
 * The leading magnitude byte is redrawn until it is non-zero.  Every read
 * is checked so that neither EOF nor uninitialised buffer contents can end
 * up in the number.
 */
static void rand_num_sized(mp_int *a, int max_bytes)
{
   unsigned char buf[2048];
   int hi, lo, sign, size, c;

   hi   = fgetc(rng);
   lo   = fgetc(rng);
   sign = fgetc(rng);
   if (hi == EOF || lo == EOF || sign == EOF) {
      rng_fail();
   }

   size   = 1 + ((hi << 8) + lo) % max_bytes;
   buf[0] = (sign & 1) ? 1 : 0;

   if (fread(buf + 1, 1, (size_t)size, rng) != (size_t)size) {
      rng_fail();
   }

   /* force a non-zero leading magnitude byte */
   while (buf[1] == 0) {
      c = fgetc(rng);
      if (c == EOF) {
         rng_fail();
      }
      buf[1] = (unsigned char)c;
   }

   if (mp_read_raw(a, buf, 1 + size) != MP_OKAY) {
      fprintf(stderr, "\nError: mp_read_raw failed\n");
      exit(EXIT_FAILURE);
   }
}

/* 1-2048 bit numbers */
void rand_num(mp_int *a)
{
   rand_num_sized(a, 256);
}

/* 1-256 bit numbers (to test things like exptmod) */
void rand_num2(mp_int *a)
{
   rand_num_sized(a, 32);
}
#define mp_to64(a, b) mp_toradix(a, b, 64)
int main(void)
{
int n, tmp;
mp_int a, b, c, d, e;
clock_t t1;
char buf[4096];
mp_init(&a);
mp_init(&b);
mp_init(&c);
mp_init(&d);
mp_init(&e);
/* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */
/*
mp_set(&a, 1);
for (n = 1; n < 8192; n++) {
mp_mul(&a, &a, &c);
printf("mul\n");
mp_to64(&a, buf);
printf("%s\n%s\n", buf, buf);
mp_to64(&c, buf);
printf("%s\n", buf);
mp_add_d(&a, 1, &a);
mp_mul_2(&a, &a);
mp_sub_d(&a, 1, &a);
}
*/
rng = fopen("/dev/urandom", "rb");
if (rng == NULL) {
rng = fopen("/dev/random", "rb");
if (rng == NULL) {
fprintf(stderr, "\nWarning: stdin used as random source\n\n");
rng = stdin;
}
}
t1 = clock();
for (;;) {
#if 0
if (clock() - t1 > CLOCKS_PER_SEC) {
sleep(2);
t1 = clock();
}
#endif
n = fgetc(rng) % 16;
if (n == 0) {
/* add tests */
rand_num(&a);
rand_num(&b);
mp_add(&a, &b, &c);
printf("add\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
mp_to64(&c, buf);
printf("%s\n", buf);
} else if (n == 1) {
/* sub tests */
rand_num(&a);
rand_num(&b);
mp_sub(&a, &b, &c);
printf("sub\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
mp_to64(&c, buf);
printf("%s\n", buf);
} else if (n == 2) {
/* mul tests */
rand_num(&a);
rand_num(&b);
mp_mul(&a, &b, &c);
printf("mul\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
mp_to64(&c, buf);
printf("%s\n", buf);
} else if (n == 3) {
/* div tests */
rand_num(&a);
rand_num(&b);
mp_div(&a, &b, &c, &d);
printf("div\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
mp_to64(&c, buf);
printf("%s\n", buf);
mp_to64(&d, buf);
printf("%s\n", buf);
} else if (n == 4) {
/* sqr tests */
rand_num(&a);
mp_sqr(&a, &b);
printf("sqr\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
} else if (n == 5) {
/* mul_2d test */
rand_num(&a);
mp_copy(&a, &b);
n = fgetc(rng) & 63;
mp_mul_2d(&b, n, &b);
mp_to64(&a, buf);
printf("mul2d\n");
printf("%s\n", buf);
printf("%d\n", n);
mp_to64(&b, buf);
printf("%s\n", buf);
} else if (n == 6) {
/* div_2d test */
rand_num(&a);
mp_copy(&a, &b);
n = fgetc(rng) & 63;
mp_div_2d(&b, n, &b, NULL);
mp_to64(&a, buf);
printf("div2d\n");
printf("%s\n", buf);
printf("%d\n", n);
mp_to64(&b, buf);
printf("%s\n", buf);
} else if (n == 7) {
/* gcd test */
rand_num(&a);
rand_num(&b);
a.sign = MP_ZPOS;
b.sign = MP_ZPOS;
mp_gcd(&a, &b, &c);
printf("gcd\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
mp_to64(&c, buf);
printf("%s\n", buf);
} else if (n == 8) {
/* lcm test */
rand_num(&a);
rand_num(&b);
a.sign = MP_ZPOS;
b.sign = MP_ZPOS;
mp_lcm(&a, &b, &c);
printf("lcm\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
mp_to64(&c, buf);
printf("%s\n", buf);
} else if (n == 9) {
/* exptmod test */
rand_num2(&a);
rand_num2(&b);
rand_num2(&c);
// if (c.dp[0]&1) mp_add_d(&c, 1, &c);
a.sign = b.sign = c.sign = 0;
c.dp[0] |= 1;
if (c.used <= 2) continue;
// if (mp_cmp(&a, &c) != MP_LT) continue;
// if (mp_cmp(&b, &c) != MP_LT) continue;
mp_exptmod(&a, &b, &c, &d);
printf("expt\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
mp_to64(&c, buf);
printf("%s\n", buf);
mp_to64(&d, buf);
printf("%s\n", buf);
} else if (n == 10) {
/* invmod test */
rand_num2(&a);
rand_num2(&b);
b.dp[0] |= 1;
b.sign = MP_ZPOS;
a.sign = MP_ZPOS;
mp_gcd(&a, &b, &c);
if (mp_cmp_d(&c, 1) != 0) continue;
if (mp_cmp_d(&b, 1) == 0) continue;
mp_invmod(&a, &b, &c);
printf("invmod\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
mp_to64(&c, buf);
printf("%s\n", buf);
} else if (n == 11) {
rand_num(&a);
mp_mul_2(&a, &a);
mp_div_2(&a, &b);
printf("div2\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
} else if (n == 12) {
rand_num(&a);
mp_mul_2(&a, &b);
printf("mul2\n");
mp_to64(&a, buf);
printf("%s\n", buf);
mp_to64(&b, buf);
printf("%s\n", buf);
} else if (n == 13) {
rand_num(&a);
tmp = abs(rand()) & THE_MASK;
mp_add_d(&a, tmp, &b);
printf("add_d\n");
mp_to64(&a, buf);
printf("%s\n%d\n", buf, tmp);
mp_to64(&b, buf);
printf("%s\n", buf);
} else if (n == 14) {
rand_num(&a);
tmp = abs(rand()) & THE_MASK;
mp_sub_d(&a, tmp, &b);
printf("sub_d\n");
mp_to64(&a, buf);
printf("%s\n%d\n", buf, tmp);
mp_to64(&b, buf);
printf("%s\n", buf);
} else if (n == 15) {
rand_num(&a);
tmp = abs(rand()) & THE_MASK;
mp_mul_d(&a, tmp, &b);
printf("mul_d\n");
mp_to64(&a, buf);
printf("%s\n%d\n", buf, tmp);
mp_to64(&b, buf);
printf("%s\n", buf);
}
}
fclose(rng);
return 0;
}

4459
pre_gen/mpi.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,43 @@
AMD64 timings
using ISO C
mult
512-bit: 496
1024-bit: 1717
2048-bit: 7200
sqr
512-bit: 448
1024-bit: 1760
2048-bit: 7099
mont
512-bit: 1416
1024-bit: 5156
2048-bit: 20820
expt
512-bit: 1520207
1024-bit: 10603520
2048-bit: 84893649
using amd64
mult
512-bit: 292
1024-bit: 945
2048-bit: 3620
sqr
512-bit: 238
1024-bit: 801
2048-bit: 2853
mont
512-bit: 731
1024-bit: 1730
2048-bit: 5462
Exptmod:
512-bit: 641743
1024-bit: 3167406
2048-bit: 20158609
LTM exptmods
Exponentiating 513-bit => 825/sec, 2183028 cycles
Exponentiating 1025-bit => 151/sec, 11900720 cycles
Exponentiating 2049-bit => 24/sec, 72376416 cycles

View File

@ -0,0 +1,45 @@
LTM timings:
Athlon Barton
Exponentiating 513-bit => 561/sec, 3909824 cycles
Exponentiating 1025-bit => 103/sec, 21175496 cycles
Exponentiating 2049-bit => 16/sec, 129845554 cycles
P4 Northwood
Exponentiating 513-bit => 284/sec, 9884722 cycles
Exponentiating 1025-bit => 47/sec, 59090432 cycles
Exponentiating 2049-bit => 6/sec, 427456070 cycles
TFM timings:
Athlon Barton
512-bit: 2289257
1024-bit: 12871373
2048-bit: 97211357
P4 Northwood [x86-32]
512-bit: 8015598
1024-bit: 55559304
2048-bit: 409861746
P4 Northwood [SSE2]
512-bit: 5895000
1024-bit: 39648730
2048-bit: 304110670
<center>
<table border=1 width=100%>
<tr><td>Processor</td><td>Size in bits</td><td>x86-32</td> <td>x86-64</td><td>SSE2</td><td>LTM</td></tr>
<tr><td>P4 </td><td>512 </td><td>8015598</td><td></td> <td>5895000</td><td>9884722</td></tr>
<tr><td> </td><td>1024 </td><td>55559304</td><td></td> <td>39648730</td><td>59090432</td></tr>
<tr><td> </td><td>2048 </td><td>409861746</td><td></td> <td>304110670</td><td>427456070</td></tr>
<tr><td>Athlon Barton</td><td>512 </td><td>2289257</td><td></td><td></td><td>3909824</td></tr>
<tr><td> </td><td>1024 </td><td>12871373</td><td></td><td></td><td>21175496</td></tr>
<tr><td> </td><td>2048 </td><td>97211357</td><td></td><td></td><td>129845554</td></tr>
<tr><td>Athlon64 </td><td>512 </td><td></td><td>641743</td><td></td><td>2183028</td></tr>
<tr><td> </td><td>1024 </td><td></td><td>3167406</td><td></td><td>11900720</td></tr>
<tr><td> </td><td>2048 </td><td></td><td>20158609</td><td></td><td>72376416</td></tr>
</table>
<b>Cycles per operation</b>
</center>

View File

@ -0,0 +1,37 @@
LTM Timings...
Multiplying 140-bit => 2950763/sec, 952 cycles
Multiplying 196-bit => 2150939/sec, 1306 cycles
Multiplying 252-bit => 1357066/sec, 2070 cycles
Multiplying 308-bit => 1055269/sec, 2662 cycles
Multiplying 364-bit => 817557/sec, 3436 cycles
Multiplying 420-bit => 636413/sec, 4414 cycles
Multiplying 475-bit => 536912/sec, 5232 cycles
Multiplying 531-bit => 433641/sec, 6478 cycles
Multiplying 588-bit => 372069/sec, 7550 cycles
Multiplying 644-bit => 322813/sec, 8702 cycles
Multiplying 698-bit => 275566/sec, 10194 cycles
Multiplying 753-bit => 242082/sec, 11604 cycles
Multiplying 809-bit => 214797/sec, 13078 cycles
Multiplying 867-bit => 189626/sec, 14814 cycles
Multiplying 921-bit => 168858/sec, 16636 cycles
Multiplying 978-bit => 151598/sec, 18530 cycles
Multiplying 1036-bit => 137580/sec, 20418 cycles
Multiplying 1091-bit => 124661/sec, 22534 cycles
Multiplying 1148-bit => 111677/sec, 25154 cycles
Multiplying 1199-bit => 102762/sec, 27336 cycles
Multiplying 1258-bit => 94519/sec, 29720 cycles
Multiplying 1316-bit => 86975/sec, 32298 cycles
Multiplying 1371-bit => 79754/sec, 35222 cycles
Multiplying 1427-bit => 74473/sec, 37720 cycles
Multiplying 1483-bit => 68827/sec, 40814 cycles
Multiplying 1537-bit => 63644/sec, 44138 cycles
Multiplying 1595-bit => 59646/sec, 47096 cycles
Multiplying 1651-bit => 56469/sec, 49746 cycles
Multiplying 1708-bit => 52640/sec, 53364 cycles
Multiplying 1764-bit => 49823/sec, 56382 cycles
Multiplying 1819-bit => 46856/sec, 59952 cycles
Multiplying 1875-bit => 44264/sec, 63462 cycles
Multiplying 1929-bit => 41641/sec, 67460 cycles
Multiplying 1985-bit => 39539/sec, 71046 cycles
Multiplying 2044-bit => 37591/sec, 74728 cycles

View File

@ -0,0 +1,14 @@
I started with:
512-bit: 16338
1024-bit: 51020
2048-bit: 142718
My x86-32
512-bit: 2864
1024-bit: 10615
2048-bit: 41807
My SSE2
512-bit: 2168
1024-bit: 7727
2048-bit: 33163

37
s_fp_add.c Normal file
View File

@ -0,0 +1,37 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* unsigned addition: c = |a| + |b|, signs are not examined here.
 *
 * Digits are summed from least to most significant through a double-width
 * accumulator.  A final carry is stored as an extra digit unless the result
 * already occupies all FP_SIZE digits, in which case it is dropped.
 */
void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
{
   int     limit, prev_used, i;
   fp_word acc;

   /* read both lengths before touching c, since c may alias a or b */
   limit     = MAX(a->used, b->used);
   prev_used = c->used;
   c->used   = limit;

   acc = 0;
   i   = 0;
   while (i < limit) {
      acc     += ((fp_word)a->dp[i]) + ((fp_word)b->dp[i]);
      c->dp[i] = (fp_digit)acc;      /* low half is the result digit */
      acc    >>= DIGIT_BIT;          /* high half is the carry       */
      ++i;
   }

   /* carry out of the top digit: append it when there is still room */
   if (acc != 0 && i != FP_SIZE) {
      c->dp[c->used] = (fp_digit)acc;
      c->used       += 1;
      i             += 1;
   }

   /* wipe digits left over from the previous, longer value of c */
   while (i < prev_used) {
      c->dp[i] = 0;
      ++i;
   }

   fp_clamp(c);
}

31
s_fp_sub.c Normal file
View File

@ -0,0 +1,31 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
/* unsigned subtraction: c = |a| - |b|, the caller must ensure ||a|| >= ||b|| ALWAYS!
 *
 * Each digit difference is computed in the double-width type; when it wraps
 * below zero, bit DIGIT_BIT of the wrapped value is set and becomes the
 * borrow for the next digit.
 */
void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
{
   fp_word borrow, diff;
   int     count, prev_used, i;

   /* snapshot the lengths before writing c->used, since c may alias a or b */
   prev_used = c->used;
   count     = a->used;
   c->used   = count;

   borrow = 0;
   for (i = 0; i < count; ++i) {
      diff     = ((fp_word)a->dp[i]) - (((fp_word)b->dp[i]) + borrow);
      c->dp[i] = (fp_digit)diff;
      borrow   = (diff >> DIGIT_BIT) & 1;
   }

   /* wipe digits left over from the previous, longer value of c */
   while (i < prev_used) {
      c->dp[i] = 0;
      ++i;
   }

   fp_clamp(c);
}

290
tfm.h Normal file
View File

@ -0,0 +1,290 @@
/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#ifndef TFM_H_
#define TFM_H_

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <limits.h>

/* two-argument minimum / maximum; note that the arguments are evaluated twice */
#undef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#undef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))

/* do we want huge code?  The answer is, yes. */
#define TFM_HUGE

/* Max size of any number in bits.  Basically the largest size you will be multiplying
 * should be half [or smaller] of FP_MAX_SIZE-four_digit
 *
 * You can externally define this or it defaults to 4096-bits.
 */
#ifndef FP_MAX_SIZE
   #define FP_MAX_SIZE           (4096+(4*DIGIT_BIT))
#endif

/* will this lib work? */
#if (CHAR_BIT & 7)
   #error CHAR_BIT must be a multiple of eight.
#endif

/* NOTE: DIGIT_BIT is only defined further down, so with the default
 * FP_MAX_SIZE the preprocessor treats it as 0 in this test.
 */
#if FP_MAX_SIZE % CHAR_BIT
   #error FP_MAX_SIZE must be a multiple of CHAR_BIT
#endif

/* make sure we are using 64-bit digits with x86-64 asm */
#if defined(TFM_X86_64)
   #ifndef FP_64BIT
      #define FP_64BIT
   #endif
#endif

/* make sure we're 32-bit for x86-32/sse/arm */
#if (defined(TFM_X86) || defined(TFM_SSE2) || defined(TFM_ARM)) && defined(FP_64BIT)
   #warning x86-32, SSE2 and ARM optimizations require 32-bit digits (undefining)
   #undef FP_64BIT
#endif

/* some default configurations.
 *
 * fp_digit is a single digit of an fp_int, fp_word is the double-width type
 * used to hold digit products and sums with carry.
 */
#if defined(FP_64BIT)
   /* for GCC only on supported platforms */
#ifndef CRYPT
   typedef unsigned long ulong64;
#endif
   typedef ulong64            fp_digit;
   typedef unsigned long      fp_word __attribute__ ((mode(TI)));
#else
   /* this is to make porting into LibTomCrypt easier :-) */
#ifndef CRYPT
   #if defined(_MSC_VER) || defined(__BORLANDC__)
      typedef unsigned __int64   ulong64;
      typedef signed __int64     long64;
   #else
      typedef unsigned long long ulong64;
      typedef signed long long   long64;
   #endif
#endif
   /* The digit must be half the width of fp_word (ulong64).  "unsigned long"
    * is kept wherever it already was the digit type; only where long is not
    * 32 bits wide but int is (e.g. LP64 targets) is "unsigned int" used
    * instead, otherwise fp_word would be no wider than fp_digit there.
    */
   #if (UINT_MAX == 0xFFFFFFFFUL) && (ULONG_MAX != 0xFFFFFFFFUL)
      typedef unsigned int       fp_digit;
   #else
      typedef unsigned long      fp_digit;
   #endif
   typedef ulong64            fp_word;
#endif

/* # of bits in a digit */
#define DIGIT_BIT  ((int)((CHAR_BIT) * sizeof(fp_digit)))
/* a digit with every bit set */
#define FP_MASK    ((fp_digit)(-1))
/* # of digits in an fp_int */
#define FP_SIZE    (FP_MAX_SIZE/DIGIT_BIT)

/* signs */
#define FP_ZPOS     0
#define FP_NEG      1

/* return codes */
#define FP_OKAY     0
#define FP_VAL      1
#define FP_MEM      2

/* equalities */
#define FP_LT        (-1)  /* less than */
#define FP_EQ         0    /* equal to */
#define FP_GT         1    /* greater than */

/* replies */
#define FP_YES        1   /* yes response */
#define FP_NO         0   /* no response */

/* a FP type: dp[] holds the digits with dp[0] the least significant,
 * "used" is the number of digits in use and "sign" is FP_ZPOS or FP_NEG.
 */
typedef struct {
   fp_digit dp[FP_SIZE];
   int      used,
            sign;
} fp_int;

/* functions */

/* initialize [or zero] an fp int */
#define fp_init(a)  memset((a), 0, sizeof(fp_int))
#define fp_zero(a)  fp_init(a)

/* zero/even/odd ?  (a value with used == 0 reports FP_NO for both even and odd) */
#define fp_iszero(a) (((a)->used == 0) ? FP_YES : FP_NO)
#define fp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? FP_YES : FP_NO)
#define fp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? FP_YES : FP_NO)

/* set to a small digit */
void fp_set(fp_int *a, fp_digit b);

/* copy from a to b (nothing is done when a and b are the same object) */
#define fp_copy(a, b)      (((a) != (b)) && memcpy((b), (a), sizeof(fp_int)))
/* initialize a as a copy of b [note the swapped argument order vs. fp_copy] */
#define fp_init_copy(a, b) fp_copy(b, a)

/* negate and absolute, result goes to b.  fp_neg clamps after flipping the
 * sign so that negating zero leaves the sign at FP_ZPOS.
 */
#define fp_neg(a, b)  { fp_copy(a, b); (b)->sign ^= 1; fp_clamp(b); }
#define fp_abs(a, b)  { fp_copy(a, b); (b)->sign  = FP_ZPOS; }

/* clamp digits: drop leading zero digits and force the sign of zero to FP_ZPOS */
#define fp_clamp(a)   { while ((a)->used && (a)->dp[(a)->used-1] == 0) --((a)->used); \
                        (a)->sign = (a)->used ? (a)->sign : FP_ZPOS; }

/* right shift x digits */
void fp_rshd(fp_int *a, int x);

/* left shift x digits */
void fp_lshd(fp_int *a, int x);

/* signed comparison */
int fp_cmp(fp_int *a, fp_int *b);

/* unsigned comparison */
int fp_cmp_mag(fp_int *a, fp_int *b);

/* power of 2 operations */
void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d);
void fp_mod_2d(fp_int *a, int b, fp_int *c);
void fp_mul_2d(fp_int *a, int b, fp_int *c);
void fp_2expt (fp_int *a, int b);
void fp_mul_2(fp_int *a, fp_int *c);
void fp_div_2(fp_int *a, fp_int *c);

/* Counts the number of lsbs which are zero before the first one bit */
int fp_cnt_lsb(fp_int *a);

/* c = a + b */
void fp_add(fp_int *a, fp_int *b, fp_int *c);

/* c = a - b */
void fp_sub(fp_int *a, fp_int *b, fp_int *c);

/* c = a * b */
void fp_mul(fp_int *a, fp_int *b, fp_int *c);

/* b = a*a  */
void fp_sqr(fp_int *a, fp_int *b);

/* a/b => cb + d == a */
int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d);

/* c = a mod b, 0 <= c < b  */
int fp_mod(fp_int *a, fp_int *b, fp_int *c);

/* compare against a single digit */
int fp_cmp_d(fp_int *a, fp_digit b);

/* c = a + b */
void fp_add_d(fp_int *a, fp_digit b, fp_int *c);

/* c = a - b */
void fp_sub_d(fp_int *a, fp_digit b, fp_int *c);

/* c = a * b */
void fp_mul_d(fp_int *a, fp_digit b, fp_int *c);

/* a/b => cb + d == a */
int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d);

/* c = a mod b, 0 <= c < b  */
int fp_mod_d(fp_int *a, fp_digit b, fp_digit *c);

/* ---> number theory <--- */
/* d = a + b (mod c) */
int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);

/* d = a - b (mod c) */
int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);

/* d = a * b (mod c) */
int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);

/* c = a * a (mod b) */
int fp_sqrmod(fp_int *a, fp_int *b, fp_int *c);

/* c = 1/a (mod b) */
int fp_invmod(fp_int *a, fp_int *b, fp_int *c);

/* c = (a, b) */
void fp_gcd(fp_int *a, fp_int *b, fp_int *c);

/* c = [a, b] */
void fp_lcm(fp_int *a, fp_int *b, fp_int *c);

/* sets up the montgomery reduction */
int fp_montgomery_setup(fp_int *a, fp_digit *mp);

/* computes a = B**n mod b without division or multiplication useful for
 * normalizing numbers in a Montgomery system.
 */
void fp_montgomery_calc_normalization(fp_int *a, fp_int *b);

/* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp);

/* d = a**b (mod c) */
int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);

/* primality stuff */

/* perform a Miller-Rabin test of a to the base b and store result in "result" */
void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result);

/* 256 trial divisions + 8 Miller-Rabins, returns FP_YES if probable prime  */
int fp_isprime(fp_int *a);

/* Primality generation flags */
#define TFM_PRIME_BBS      0x0001 /* BBS style prime */
#define TFM_PRIME_SAFE     0x0002 /* Safe prime (p-1)/2 == prime */
#define TFM_PRIME_2MSB_OFF 0x0004 /* force 2nd MSB to 0 */
#define TFM_PRIME_2MSB_ON  0x0008 /* force 2nd MSB to 1 */

/* callback for fp_prime_random, should fill dst with random bytes and return how many read [upto len] */
typedef int tfm_prime_callback(unsigned char *dst, int len, void *dat);

/* convenience wrapper around fp_prime_random_ex: passes (size*8)+1 as the size
 * argument and TFM_PRIME_BBS as the flags when bbs == 1, otherwise no flags
 */
#define fp_prime_random(a, t, size, bbs, cb, dat) fp_prime_random_ex(a, t, ((size) * 8) + 1, ((bbs)==1)?TFM_PRIME_BBS:0, cb, dat)
int fp_prime_random_ex(fp_int *a, int t, int size, int flags, tfm_prime_callback cb, void *dat);

/* radix conversions */
int fp_count_bits(fp_int *a);

int fp_unsigned_bin_size(fp_int *a);
void fp_read_unsigned_bin(fp_int *a, unsigned char *b, int c);
void fp_to_unsigned_bin(fp_int *a, unsigned char *b);

int fp_signed_bin_size(fp_int *a);
void fp_read_signed_bin(fp_int *a, unsigned char *b, int c);
void fp_to_signed_bin(fp_int *a, unsigned char *b);

int fp_read_radix(fp_int *a, char *str, int radix);
int fp_toradix(fp_int *a, char *str, int radix);
int fp_toradix_n(fp_int * a, char *str, int radix, int maxlen);

/* VARIOUS LOW LEVEL STUFFS */

/* unsigned add / subtract of magnitudes (s_fp_sub requires ||a|| >= ||b||) */
void s_fp_add(fp_int *a, fp_int *b, fp_int *c);
void s_fp_sub(fp_int *a, fp_int *b, fp_int *c);
void bn_reverse(unsigned char *s, int len);

/* comba multipliers; the 32 digit variant is only declared with TFM_HUGE */
void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C);
#ifdef TFM_HUGE
void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C);
#endif
void fp_mul_comba16(fp_int *A, fp_int *B, fp_int *C);
void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C);
void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C);

/* comba squarers; the 32 digit variant is only declared with TFM_HUGE */
void fp_sqr_comba(fp_int *A, fp_int *B);
void fp_sqr_comba4(fp_int *A, fp_int *B);
void fp_sqr_comba8(fp_int *A, fp_int *B);
void fp_sqr_comba16(fp_int *A, fp_int *B);
#ifdef TFM_HUGE
void fp_sqr_comba32(fp_int *A, fp_int *B);
#endif

extern const char *fp_s_rmap;

#endif

580
tfm.tex Normal file
View File

@ -0,0 +1,580 @@
\documentclass[b5paper]{book}
\usepackage{hyperref}
\usepackage{makeidx}
\usepackage{amssymb}
\usepackage{color}
\usepackage{alltt}
\usepackage{graphicx}
\usepackage{layout}
\def\union{\cup}
\def\intersect{\cap}
\def\getsrandom{\stackrel{\rm R}{\gets}}
\def\cross{\times}
\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
\def\catn{$\|$}
\def\divides{\hspace{0.3em} | \hspace{0.3em}}
\def\nequiv{\not\equiv}
\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
\def\lcm{{\rm lcm}}
\def\gcd{{\rm gcd}}
\def\log{{\rm log}}
\def\ord{{\rm ord}}
\def\abs{{\mathit abs}}
\def\rep{{\mathit rep}}
\def\mod{{\mathit\ mod\ }}
\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
\def\Or{{\rm\ or\ }}
\def\And{{\rm\ and\ }}
\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
\def\implies{\Rightarrow}
\def\undefined{{\rm ``undefined"}}
\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
\let\oldphi\phi
\def\phi{\varphi}
\def\Pr{{\rm Pr}}
\newcommand{\str}[1]{{\mathbf{#1}}}
\def\F{{\mathbb F}}
\def\N{{\mathbb N}}
\def\Z{{\mathbb Z}}
\def\R{{\mathbb R}}
\def\C{{\mathbb C}}
\def\Q{{\mathbb Q}}
\definecolor{DGray}{gray}{0.5}
\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
\def\gap{\vspace{0.5ex}}
\makeindex
\begin{document}
\frontmatter
\pagestyle{empty}
\title{TomsFastMath User Manual \\ v0.01}
\author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle
This text and library are all hereby placed in the public domain. This book has been formatted for B5
[176x250] paper using the \LaTeX{} {\em book} macro package.
\vspace{13cm}
\begin{flushleft}This project was sponsored in part by
Secure Science Corporation \url{http://www.securescience.net}.
\end{flushleft}
\tableofcontents
\listoffigures
\mainmatter
\pagestyle{headings}
\chapter{Introduction}
\section{What is TomsFastMath?}
TomsFastMath is meant to be a very fast yet still fairly portable and easy to port large
integer arithmetic library written in ISO C. The goal specifically is to be able to perform
very fast modular exponentiations and other related functions required for ECC, DH and RSA
cryptosystems.
Most of the library is pure ISO C portable source code while a small portion (three files) contain
a mixture of ISO C and assembler inline fragments. Compared to LibTomMath this new library is
meant to be much faster while sacrificing flexibility. This is accomplished through several means.
\begin{enumerate}
\item The new code is slightly messier and contains asm blocks.
\item This uses fixed not multiple precision integers.
\item It is designed only for fast modular exponentiations [e.g. less flexibility].
\end{enumerate}
To mitigate some of the problems that arise from using assembler it has been carefully and
appropriately used where it would make the most gain in performance. Also we use macros
for assembler code which allows new ports to be inserted easily.
The new code uses fixed precision arithmetic which means at compile time you choose a maximum
precision and all numbers are limited to that. This has the benefit of not requiring any
memory heap operations (which are slow) in any of the functions. It has the downside that
integers that are too large are truncated.
The goal of this library is to be able to perform modular exponentiations (with an odd modulus) very
fast. This is what takes the most time in systems such as RSA and DH. This also requires
fast multiplication and squaring and has the side effect of speeding up ECC operations as well.
\section{License}
TomsFastMath is public domain.
\section{Building}
Currently only a GCC makefile has been provided. To build the library simply type
``make''. The library is a bit too new to put into production so no install
scripts exist yet. You can build the test program with ``make test''.
To perform simple static testing (useful to test out new assembly ports) use the stest
program. Type ``make stest'' and run it on your target. The program will perform three
multiplications, squarings and montgomery reductions. Likely if your assembly
code is invalid this code will exhibit the bug.
\subsection{Build Limitations}
TomsFastMath has the following build requirements which are non--portable but under most
circumstances not problematic.
\begin{enumerate}
\item ``CHAR\_BIT'' must be eight.
\item The ``fp\_digit'' type must be a multiple of eight bits long.
\item The ``fp\_word'' must be at least twice the length of fp\_digit.
\end{enumerate}
\subsection{Optimization Configuration}
By default TFM is configured for 32--bit digits using ISO C source code. This mode while portable
is not very efficient. While building the library (from scratch) you can define one of
several ``CFLAGS'' defines.
For example, to build with SSE2 optimizations type
\begin{verbatim}
export CFLAGS=-DTFM_SSE2
make clean libtfm.a
\end{verbatim}
\subsubsection{x86--32} The ``x86--32'' mode is defined by ``TFM\_X86'' and covers all
i386 and beyond processors. It requires GCC to build and only works with 32--bit digits. In this
mode fp\_digit is 32--bits and fp\_word is 64--bits.
\subsubsection{SSE2} The ``SSE2'' mode is defined by ``TFM\_SSE2'' and requires a Pentium 4, Pentium
M or Athlon64 processor. It requires GCC to build. Note that you shouldn't define both
TFM\_X86 and TFM\_SSE2 at the same time. This mode only works with 32--bit digits. In this
mode fp\_digit is 32--bits and fp\_word is 64--bits.
\subsubsection{x86--64} The ``x86--64'' mode is defined by ``TFM\_X86\_64'' and requires a
``x86--64'' capable processor (Athlon64 and future Pentium processors). It requires GCC to
build and only works with 64--bit digits. Note that by enabling this mode it will automatically
enable 64--bit digits. In this mode fp\_digit is 64--bits and fp\_word is 128--bits.
\subsubsection{ARM} The ``ARM'' mode is defined by ``TFM\_ARM'' and requires a ARMv4 or higher
processor. It requires GCC and works with 32--bit digits. In this mode fp\_digit is 32--bits and
fp\_word is 64--bits.
\subsubsection{Future Releases} Future releases will support additional platform optimizations.
Developers of MIPS and PPC platforms are encouraged to submit GCC asm inline patches
(see chapter \ref{chap:asmops} for more information).
\begin{figure}[here]
\begin{small}
\begin{center}
\begin{tabular}{|l|l|}
\hline \textbf{Processor} & \textbf{Recommended Mode} \\
\hline All 32--bit x86 platforms & TFM\_X86 \\
\hline Pentium 4 & TFM\_SSE2 \\
\hline Athlon64 & TFM\_X86\_64 \\
\hline ARMv4 or higher & TFM\_ARM \\
\hline
\end{tabular}
\caption{Recommended Build Modes}
\end{center}
\end{small}
\end{figure}
\subsection{Precision Configuration}
The precision of all integers in this library are fixed to a limited precision. Essentially
the rule of setting the precision is if you plan on doing modular exponentiation with $k$--bit
numbers then the precision must be fixed to $2k$--bits plus four digits.
This is changed by altering the value of ``FP\_MAX\_SIZE'' in tfm.h to your desired size. By default,
the library is configured to handle up to 2048--bit inputs to the modular exponentiator.
\chapter{Getting Started}
\section{Data Types}
TomsFastMath is a large fixed precision integer library. It provides the functionality to
manipulate large signed integers through a relatively trivial api and a single data type.
The ``fp\_int'' or fixed precision integer is the data type that the functions operate with.
\begin{verbatim}
typedef struct {
fp_digit dp[FP_SIZE];
int used,
sign;
} fp_int;
\end{verbatim}
The \textbf{dp} member is the array of digits that forms the number. It must always be zero
padded. The \textbf{used} member is the count of digits used in the array. Although the
precision is fixed the algorithms are still tuned to not process the entire array if it
does not have to. The \textbf{sign} indicates the sign of the integer. It is \textbf{FP\_ZPOS} (0)
if the integer is zero or positive and \textbf{FP\_NEG} (1) otherwise.
\section{Initialization}
\subsection{Simple Initialization}
To initialize an integer to the default state of zero use the fp\_init() function.
\index{fp\_init}
\begin{verbatim}
void fp_init(fp_int *a);
\end{verbatim}
This will initialize the fp\_int $a$ to zero. Note that the function fp\_zero() is an alias
for fp\_init().
\subsection{Initialize Small Constants}
To initialize an integer with a small single digit value use the fp\_set() function.
\index{fp\_set}
\begin{verbatim}
void fp_set(fp_int *a, fp_digit b);
\end{verbatim}
This will initialize $a$ and set it equal to the digit $b$.
\subsection{Initialize Copy}
To initialize an integer with a copy of another integer use the fp\_init\_copy() function.
\index{fp\_init\_copy}
\begin{verbatim}
void fp_init_copy(fp_int *a, fp_int *b);
\end{verbatim}
This will initialize $a$ as a copy of $b$. Note that for compatibility with LibTomMath the function
fp\_copy() is also provided.
\chapter{Arithmetic Operations}
\section{Odds and Evens}
To quickly and easily tell if an integer is zero, odd or even use the following functions.
\index{fp\_iszero} \index{fp\_iseven} \index{fp\_isodd}
\begin{verbatim}
int fp_iszero(fp_int *a);
int fp_iseven(fp_int *a);
int fp_isodd(fp_int *a);
\end{verbatim}
These will return \textbf{FP\_YES} if the answer to their respective questions is yes. Otherwise they
return \textbf{FP\_NO}. Note that these are implemented as macros and as such you should avoid using
++ or --~-- operators on the input operand.
\section{Sign Manipulation}
To negate or compute the absolute of an integer use the following functions.
\index{fp\_neg} \index{fp\_abs}
\begin{verbatim}
void fp_neg(fp_int *a, fp_int *b);
void fp_abs(fp_int *a, fp_int *b);
\end{verbatim}
This will compute the negation (or absolute) of $a$ and store the result in $b$. Note that these
are implemented as macros and as such you should avoid using ++ or --~-- operators on the input
operand.
\section{Comparisons}
To perform signed or unsigned comparisons use the following functions.
\index{fp\_cmp} \index{fp\_cmp\_mag}
\begin{verbatim}
int fp_cmp(fp_int *a, fp_int *b);
int fp_cmp_mag(fp_int *a, fp_int *b);
\end{verbatim}
These will compare $a$ to $b$. They will return \textbf{FP\_GT} if $a$ is larger than $b$,
\textbf{FP\_EQ} if they are equal and \textbf{FP\_LT} if $a$ is less than $b$.
The function fp\_cmp performs signed comparisons while the other performs unsigned comparisons.
\section{Shifting}
To shift the digits of an fp\_int left or right use the following functions.
\index{fp\_lshd} \index{fp\_rshd}
\begin{verbatim}
void fp_lshd(fp_int *a, int x);
void fp_rshd(fp_int *a, int x);
\end{verbatim}
These will shift the digits of $a$ left (or right respectively) $x$ digits.
To shift individual bits of an fp\_int use the following functions.
\index{fp\_div\_2d} \index{fp\_mod\_2d} \index{fp\_mul\_2d} \index{fp\_div\_2} \index{fp\_mul\_2}
\begin{verbatim}
void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d);
void fp_mod_2d(fp_int *a, int b, fp_int *c);
void fp_mul_2d(fp_int *a, int b, fp_int *c);
void fp_mul_2(fp_int *a, fp_int *c);
void fp_div_2(fp_int *a, fp_int *c);
void fp_2expt(fp_int *a, int b);
\end{verbatim}
fp\_div\_2d() will divide $a$ by $2^b$ and store the quotient in $c$ and remainder in $d$. Either of
$c$ or $d$ can be \textbf{NULL} if their value is not required. fp\_mod\_2d() is a shortcut to
compute the remainder directly. fp\_mul\_2d() will multiply $a$ by $2^b$ and store the result in $c$.
The fp\_mul\_2() and fp\_div\_2() functions are optimized multiplication and divisions by two. The
function fp\_2expt() will compute $a = 2^b$ quickly.
To quickly count the number of least significant bits that are zero use the following function.
\index{fp\_cnt\_lsb}
\begin{verbatim}
int fp_cnt_lsb(fp_int *a);
\end{verbatim}
This will return the number of adjacent least significant bits that are zero. This is equivalent
to the number of times two evenly divides $a$.
\section{Basic Algebra}
The following functions round out the basic algebraic functionality of the library.
\index{fp\_add} \index{fp\_sub} \index{fp\_mul} \index{fp\_sqr} \index{fp\_div} \index{fp\_mod}
\begin{verbatim}
void fp_add(fp_int *a, fp_int *b, fp_int *c);
void fp_sub(fp_int *a, fp_int *b, fp_int *c);
void fp_mul(fp_int *a, fp_int *b, fp_int *c);
void fp_sqr(fp_int *a, fp_int *b);
int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
int fp_mod(fp_int *a, fp_int *b, fp_int *c);
\end{verbatim}
The functions fp\_add(), fp\_sub() and fp\_mul() perform their respective operations on $a$ and
$b$ and store the result in $c$. The function fp\_sqr() computes $b = a^2$ and is faster than
using fp\_mul() to perform the same operation.
The function fp\_div() divides $a$ by $b$ and stores the quotient in $c$ and remainder in $d$. Either
of $c$ and $d$ can be \textbf{NULL} if the result is not required. The function fp\_mod() is a simple
shortcut to find the remainder.
\section{Modular Exponentiation}
To compute a modular exponentiation use the following function.
\index{fp\_exptmod}
\begin{verbatim}
int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d);
\end{verbatim}
This computes $d \equiv a^b \mbox{ (mod }c)$ for any odd $c$ and positive $b$. The size of $c$
must be at most half of the maximum precision used during the build of the library. For example,
by default $c$ must be less than $2^{2048}$.
\section{Number Theoretic}
To perform modular inverses, greatest common divisor or least common multiples use the following
functions.
\index{fp\_invmod} \index{fp\_gcd} \index{fp\_lcm}
\begin{verbatim}
int fp_invmod(fp_int *a, fp_int *b, fp_int *c);
void fp_gcd(fp_int *a, fp_int *b, fp_int *c);
void fp_lcm(fp_int *a, fp_int *b, fp_int *c);
\end{verbatim}
The fp\_invmod() function will find the modular inverse of $a$ modulo an odd modulus $b$ and store
it in $c$ (provided it exists). The function fp\_gcd() will compute the greatest common
divisor of $a$ and $b$ and store it in $c$. Similarly the fp\_lcm() function will compute
the least common multiple of $a$ and $b$ and store it in $c$.
\section{Prime Numbers}
To quickly test a number for primality call this function.
\index{fp\_isprime}
\begin{verbatim}
int fp_isprime(fp_int *a);
\end{verbatim}
This will return \textbf{FP\_YES} if $a$ is probably prime. It uses 256 trial divisions and
eight rounds of Rabin-Miller testing. Note that this routine performs modular exponentiations
which means that $a$ must be in a valid range of precision.
\chapter{Porting TomsFastMath}
\label{chap:asmops}
\section{Getting Started}
Porting TomsFastMath to a given processor target is usually a simple procedure. For the most part
assembly is used to get around the lack of a ``add with carry'' operation in the C language. To
make matters simpler the use of assembler is through macro blocks.
Each ``port'' is defined by a block of code that re-defines the portable ISO C macros with assembler
inline blocks. To add a new port you must designate a TFM\_XXX define that will enable your
port when built.
\section{Multiply with Comba}
The file ``fp\_mul\_comba.c'' is responsible for providing the fast multiplication within the
library. This comba multiplication is fairly simple. It uses a sliding three digit carry
system with the variables $c0$, $c1$, $c2$. For every digit of output $c0$ is what will
be that digit, $c1$ will carry into the next digit and $c2$ will be the ``c1'' carry for
the next digit. For every ``next'' digit effectively $c0$ is stored as output, $c1$ moves into
$c0$, $c2$ into $c1$ and zero into $c2$.
The following macros define the assembler interface to the code.
\begin{verbatim}
#define COMBA_START
\end{verbatim}
This is issued at the beginning of the multiplication function. This is in place to allow you to
initialize any registers or machine words required. You can leave it blank if you do not need
it.
\begin{verbatim}
#define COMBA_CLEAR \
c0 = c1 = c2 = 0;
\end{verbatim}
This clears the three comba carries. If you are going to place carries in registers then
zero the appropriate registers. Note that the functions do not use $c0$, $c1$ or $c2$ directly
so you are free to ignore these variables and use registers directly.
\begin{verbatim}
#define COMBA_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
\end{verbatim}
This propagates the carries after a digit has been produced.
\begin{verbatim}
#define COMBA_STORE(x) \
x = c0;
\end{verbatim}
This stores the $c0$ digit in the memory location specified by $x$. Note that if you manually
aliased $c0$ with a register then just store that register in $x$.
\begin{verbatim}
#define COMBA_STORE2(x) \
x = c1;
\end{verbatim}
This stores the $c1$ digit in the memory location specified by $x$. Note that if you manually
aliased $c1$ with a register then just store that register in $x$.
\begin{verbatim}
#define COMBA_FINI
\end{verbatim}
If at the end of the function you need to perform some action fill this macro in.
\begin{verbatim}
#define MULADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
\end{verbatim}
This macro performs the ``multiply and add'' step that is central to the comba
multiplier. It multiplies the fp\_digits $i$ and $j$ to produce a fp\_word result. Effectively
the double--digit value is added to the three-digit carry formed by $c0$, $c1$, $c2$ where $c0$
is the least significant digit.
\section{Squaring with Comba}
Squaring is similar to multiplication except that it uses a special ``multiply and add twice'' macro
that replaces multiplications that are not required.
\begin{verbatim}
#define COMBA_START
\end{verbatim}
This allows for any initialization code you might have.
\begin{verbatim}
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;
\end{verbatim}
This will clear the carries. Like multiplication you can safely alias the three carry variables
to registers if you can/want to.
\begin{verbatim}
#define COMBA_STORE(x) \
x = c0;
\end{verbatim}
Store the $c0$ carry to a given memory location.
\begin{verbatim}
#define COMBA_STORE2(x) \
x = c1;
\end{verbatim}
Store the $c1$ carry to a given memory location.
\begin{verbatim}
#define CARRY_FORWARD \
c0 = c1; c1 = c2; c2 = 0;
\end{verbatim}
Forward propagate all three carry variables.
\begin{verbatim}
#define COMBA_FINI
\end{verbatim}
If you need to clean up at the end of the function.
\begin{verbatim}
/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
\end{verbatim}
This is essentially the MULADD macro from the multiplication code.
\begin{verbatim}
/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
t = ((fp_word)i) * ((fp_word)j); \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2; \
c0 = (c0 + t); if (c0 < ((fp_digit)t)) ++c1; \
c1 = (c1 + (t>>DIGIT_BIT)); if (c1 < (t>>DIGIT_BIT)) ++c2;
\end{verbatim}
This is like SQRADD except it adds the product twice. It's similar to
computing SQRADD(i, j*2).
\section{Montgomery with Comba}
Montgomery reduction is used in modular exponentiation and is the most called function during
that operation. It's important to make sure this routine is very fast or all is lost.
Unlike the two other comba routines this one does not use a single three--digit carry
system. It does have three--digit carries except that the routine steps through them
in the inner loop. This means you cannot alias them to registers (at all).
To make matters simple though the three arrays of carries are stored in one array. The
``c0'' array resides in $c[0 \ldots OFF1-1]$, ``c1'' in $c[OFF1 \ldots OFF2-1]$ and ``c2'' in
$c[OFF2 \ldots OFF2+FP\_SIZE-1]$.
\begin{verbatim}
#define MONT_START
\end{verbatim}
This allows you to insert anything at the start that you need.
\begin{verbatim}
#define MONT_FINI
\end{verbatim}
This allows you to insert anything at the end that you need.
\begin{verbatim}
#define LOOP_START \
mu = c[x] * mp;
\end{verbatim}
This computes the $\mu$ value for the inner loop. You can safely alias $mu$ and $mp$ to
a register if you want.
\begin{verbatim}
#define INNERMUL \
t = ((fp_word)mu) * ((fp_word)*tmpm++); \
_c[OFF0] += t; \
if (_c[OFF0] < (fp_digit)t) ++_c[OFF1]; \
_c[OFF1] += (t>>DIGIT_BIT); \
if (_c[OFF1] < (fp_digit)(t>>DIGIT_BIT)) ++_c[OFF2];
\end{verbatim}
This computes the inner product and adds it to the correct set of carry variables. The variable
$\_c$ is a pointer alias to $c[x+y]$ and used to simplify the code.
You can safely alias $\_c$ to a register for INNERMUL by setting it equal to ``c + x''
\footnote{Where ``c'' is an array on the stack.} by modifying LOOP\_START.
\begin{verbatim}
#define PROPCARRY \
_c[OFF0+1] += _c[OFF1]; \
if (_c[OFF0+1] < _c[OFF1]) ++_c[OFF1+1]; \
_c[OFF1+1] += _c[OFF2]; \
if (_c[OFF1+1] < _c[OFF2]) ++_c[OFF2+1];
\end{verbatim}
This propagates the carry upwards by one digit.
\input{tfm.ind}
\end{document}