tomsfastmath/fp_montgomery_reduce.c
2010-07-22 10:06:22 +02:00

253 lines
9.6 KiB
C

/* TomsFastMath, a fast ISO C bignum library.
*
* This project is meant to fill in where LibTomMath
* falls short. That is speed ;-)
*
* This project is public domain and free for all purposes.
*
* Tom St Denis, tomstdenis@iahu.ca
*/
#include <tfm.h>
#if defined(TFM_X86)
/* x86-32 code */
#define MONT_START
#define MONT_FINI
#define LOOP_START \
mu = c[x] * mp;
#define INNERMUL \
asm( \
"movl %7,%%eax \n\t" \
"mull %6 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"g"(mu), "g"(*tmpm++) \
: "%eax", "%edx", "%cc");
#define PROPCARRY \
asm( \
"movl %1,%%eax \n\t" \
"addl %%eax,%6 \n\t" \
"movl %2,%%eax \n\t" \
"adcl %%eax,%7 \n\t" \
"adcl $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%eax", "%cc");
#elif defined(TFM_X86_64)
/* x86-64 code */
#define MONT_START
#define MONT_FINI
#define LOOP_START \
mu = c[x] * mp;
#define INNERMUL \
asm( \
"movq %7,%%rax \n\t" \
"mulq %6 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"g"(mu), "g"(*tmpm++) \
: "%rax", "%rdx", "%cc");
#define PROPCARRY \
asm( \
"movq %1,%%rax \n\t" \
"movq %2,%%rbx \n\t" \
"addq %%rax,%6 \n\t" \
"adcq %%rbx,%7 \n\t" \
"adcq $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"m"(_c[OFF0+1]), "m"(_c[OFF1+1]), "m"(_c[OFF2+1]) \
: "%rax", "%rbx", "%cc");
#elif defined(TFM_SSE2)
/* SSE2 code */
#define MONT_START \
asm("movd %0,%%mm2"::"g"(mp));
#define MONT_FINI \
asm("emms");
#define LOOP_START \
asm(\
"movd %0,%%mm1 \n\t" \
"pmuludq %%mm2,%%mm1 \n\t" \
:: "g"(c[x]));
#define INNERMUL \
asm( \
"movd %6,%%mm0 \n\t" \
"pmuludq %%mm1,%%mm0 \n\t" \
"movd %%mm0,%%eax \n\t" \
"psrlq $32, %%mm0 \n\t" \
"addl %%eax,%0 \n\t" \
"movd %%mm0,%%eax \n\t" \
"adcl %%eax,%1 \n\t" \
"adcl $0,%2 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"g"(*tmpm++) \
: "%eax", "%cc");
#define PROPCARRY \
asm( \
"movl %1,%%eax \n\t" \
"addl %%eax,%6 \n\t" \
"movl %2,%%eax \n\t" \
"adcl %%eax,%7 \n\t" \
"adcl $0,%8 \n\t" \
:"=g"(_c[OFF0]), "=g"(_c[OFF1]), "=g"(_c[OFF2]):"0"(_c[OFF0]), "1"(_c[OFF1]), "2"(_c[OFF2]), \
"g"(_c[OFF0+1]), "g"(_c[OFF1+1]), "g"(_c[OFF2+1]) \
: "%eax", "%cc");
#elif defined(TFM_ARM)
/* ISO C code */
#define MONT_START
#define MONT_FINI
#define LOOP_START \
mu = c[x] * mp;
/* NOTE: later write it using two regs instead of three for _c + ... */
#define INNERMUL \
asm( \
"UMULL r0,r1,%0,%1 \n\t" \
"LDR r2,[%2] \n\t" \
"ADDS r2,r2,r0 \n\t" \
"STR r2,[%2] \n\t" \
"LDR r2,[%3] \n\t" \
"ADCS r2,r2,r1 \n\t" \
"STR r2,[%3] \n\t" \
"LDR r2,[%4] \n\t" \
"ADC r2,r2,#0 \n\t" \
"STR r2,[%4] \n\t" \
::"r"(mu),"r"(*tmpm++),"r"(_c + OFF0),"r"(_c + OFF1),"r"(_c + OFF2):"r0", "r1", "r2", "%cc");
#define PROPCARRY \
asm( \
"LDR r0,[%1] \n\t" \
"LDR r1,[%0,#4] \n\t" \
"ADDS r0,r0,r1 \n\t" \
"STR r0,[%0,#4] \n\t" \
"LDR r0,[%2] \n\t" \
"LDR r1,[%1,#4] \n\t" \
"ADCS r0,r0,r1 \n\t" \
"STR r0,[%1,#4] \n\t" \
"LDR r0,[%2,#4] \n\t" \
"ADC r0,r0,#0 \n\t" \
"STR r0,[%2,#4] \n\t" \
::"r"(_c + OFF0),"r"(_c + OFF1),"r"(_c + OFF2):"r0", "r1", "%cc");
#else
/* ISO C code */
#define MONT_START
#define MONT_FINI
#define LOOP_START \
mu = c[x] * mp;
#define INNERMUL \
do { fp_word t; \
t = (fp_word)_c[OFF0] + ((fp_word)mu) * ((fp_word)*tmpm++); _c[OFF0] = t; \
t = (fp_word)_c[OFF1] + (t >> DIGIT_BIT); _c[OFF1] = t; \
_c[OFF2] += (t >> DIGIT_BIT); \
} while (0);
#define PROPCARRY \
do { fp_word t; \
t = (fp_word)_c[OFF0+1] + (fp_word)_c[OFF1]; _c[OFF0+1] = t; \
t = (fp_word)_c[OFF1+1] + (t >> DIGIT_BIT) + (fp_word)_c[OFF2]; _c[OFF1+1] = t; \
_c[OFF2+1] += (t >> DIGIT_BIT); \
} while (0);
#endif
#define OFF0 (0)
#define OFF1 (FP_SIZE)
#define OFF2 (FP_SIZE+FP_SIZE)
/* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
fp_digit c[3*FP_SIZE], *_c, *tmpm, mu;
int oldused, x, y, pa;
/* now zero the buff */
pa = m->used;
memset(c, 0, sizeof(c));
/* copy the input */
oldused = a->used;
for (x = 0; x < oldused; x++) {
c[x] = a->dp[x];
}
MONT_START;
/* now let's get bizz-sy! */
for (x = 0; x < pa; x++) {
/* get Mu for this round */
LOOP_START;
/* our friendly neighbourhood alias */
_c = c + x;
tmpm = m->dp;
for (y = 0; y < pa; y++) {
INNERMUL;
++_c;
}
/* send carry up man... */
_c = c + x;
PROPCARRY;
}
/* fix the rest of the carries */
_c = c + pa;
for (x = pa; x < pa * 2 + 2; x++) {
PROPCARRY;
++_c;
}
/* now copy out */
_c = c + pa;
tmpm = a->dp;
for (x = 0; x < pa+1; x++) {
*tmpm++ = *_c++;
}
for (; x < oldused; x++) {
*tmpm++ = 0;
}
MONT_FINI;
a->used = pa+1;
fp_clamp(a);
/* if A >= m then A = A - m */
if (fp_cmp_mag (a, m) != FP_LT) {
s_fp_sub (a, m, a);
}
}