tomsfastmath/fp_mul.c

/* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
#include <tfm.h>

/* c = a * b */
void fp_mul(fp_int *A, fp_int *B, fp_int *C)
{
    int    r, y, yy, s;
    fp_int ac, bd, comp, amb, cmd, t1, t2;

     y  = MAX(A->used, B->used);
     yy = MIN(A->used, B->used);
     if (yy <= 8 || y <= 64) {

    /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
       of the largest input.  We also want to avoid doing excess mults if the 
       inputs are not close to the next power of two.  That is, for example,
       if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications 
    */

#ifdef TFM_SMALL_SET
        if (y <= 16) {
           fp_mul_comba_small(A,B,C);
#elif defined(TFM_HUGE)
        if (0) { 1;
#endif
#if defined(TFM_HUGE)
        } else if (y <= 32) {
           fp_mul_comba32(A,B,C);
        } else if (y <= 48) {
           fp_mul_comba48(A,B,C);
        } else if (y <= 64) {
           fp_mul_comba64(A,B,C);
#endif
#if !defined(TFM_HUGE) && !defined(TFM_SMALL_SET)
        {
#else
        } else {
#endif
           fp_mul_comba(A,B,C);
        }
    } else {
        /* do the karatsuba action 

           if A = ab and B = cd for ||a|| = r we need to solve 

           ac*r^2 + ((a+b)(c+d) - (ac + bd))*r + bd

           So we solve for the three products then we form the final result with careful shifting 
           and addition.

Obvious points of optimization

- "ac" parts can be memcpy'ed with an offset [all you have to do is zero upto the next 8 digits]
- Similarly the "bd" parts can be memcpy'ed and zeroed to 8
- 

        */
        /* get our value of r */
        r = yy >> 1;

        /* now solve for ac */
//        fp_copy(A, &t1); fp_rshd(&t1, r); 
        for (s = 0; s < A->used - r; s++) {
            t1.dp[s] = A->dp[s+r];
        }
        for (; s < FP_SIZE; s++) {
            t1.dp[s] = 0; 
        }
        if (A->used >= r) {
           t1.used = A->used - r;
        } else {
           t1.used = 0;
        }
        t1.sign = 0;

//        fp_copy(B, &t2); fp_rshd(&t2, r); 
        for (s = 0; s < B->used - r; s++) {
            t2.dp[s] = B->dp[s+r];
        }
        for (; s < FP_SIZE; s++) {
            t2.dp[s] = 0; 
        }
        if (B->used >= r) {
           t2.used = B->used - r;
        } else {
           t2.used = 0;
        }
        t2.sign = 0;

        fp_copy(&t1, &amb); fp_copy(&t2, &cmd);
        fp_zero(&ac);
        fp_mul(&t1, &t2, &ac);

        /* now solve for bd */
//        fp_mod_2d(A, r * DIGIT_BIT, &t1);
//        fp_mod_2d(B, r * DIGIT_BIT, &t2);
        for (s = 0; s < r; s++) {
            t1.dp[s] = A->dp[s];
            t2.dp[s] = B->dp[s];
        }
        for (; s < FP_SIZE; s++) {
            t1.dp[s] = 0; 
            t2.dp[s] = 0; 
        }
        t1.used = r;
        t2.used = r;
        fp_clamp(&t1);
        fp_clamp(&t2);
        
        s_fp_add(&amb, &t1, &amb); s_fp_add(&cmd, &t2, &cmd);
        fp_zero(&bd);
        fp_mul(&t1, &t2, &bd);

        /* now get the (a+b)(c+d) term */
        fp_zero(&comp);
        fp_mul(&amb, &cmd, &comp);

        /* now solve the system, do the middle term first */
        s_fp_sub(&comp, &ac, &comp);
        s_fp_sub(&comp, &bd, &comp);
        fp_lshd(&comp, r);
  
        /* leading term */
        fp_lshd(&ac, r+r);

        /* now sum them together */
        s = A->sign ^ B->sign;
        fp_zero(C);
        fp_add(&ac, &comp, C);
        fp_add(&bd, C, C);    
        C->sign = C->used ? s : FP_ZPOS;
    }
}


/* $Source$ */
/* $Revision$ */
/* $Date$ */
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`/* TomsFastMath, a fast ISO C bignum library.`
			`*`
			`* This project is meant to fill in where LibTomMath`
			`* falls short. That is speed ;-)`
			`*`
			`* This project is public domain and free for all purposes.`
			`*`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`* Tom St Denis, tomstdenis@gmail.com`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`*/`
			`#include <tfm.h>`

			`/* c = a * b */`
			`void fp_mul(fp_int A, fp_int B, fp_int *C)`
			`{`
			`int r, y, yy, s;`
			`fp_int ac, bd, comp, amb, cmd, t1, t2;`

			`y = MAX(A->used, B->used);`
			`yy = MIN(A->used, B->used);`
			`if (yy <= 8 \|\| y <= 64) {`

			`/* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size`
			`of the largest input. We also want to avoid doing excess mults if the`
			`inputs are not close to the next power of two. That is, for example,`
			`if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications`
			`*/`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00
			`#ifdef TFM_SMALL_SET`
			`if (y <= 16) {`
			`fp_mul_comba_small(A,B,C);`
			`#elif defined(TFM_HUGE)`
			`if (0) { 1;`
added tomsfastmath-0.02 2004-09-19 03:31:44 +02:00			`#endif`
			`#if defined(TFM_HUGE)`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`} else if (y <= 32) {`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`fp_mul_comba32(A,B,C);`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`} else if (y <= 48) {`
			`fp_mul_comba48(A,B,C);`
			`} else if (y <= 64) {`
			`fp_mul_comba64(A,B,C);`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`#endif`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`#if !defined(TFM_HUGE) && !defined(TFM_SMALL_SET)`
			`{`
			`#else`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`} else {`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`#endif`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`fp_mul_comba(A,B,C);`
			`}`
			`} else {`
			`/* do the karatsuba action`

			`if A = ab and B = cd for \|\|a\|\| = r we need to solve`

added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`acr^2 + ((a+b)(c+d) - (ac + bd))r + bd`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00
			`So we solve for the three products then we form the final result with careful shifting`
			`and addition.`

			`Obvious points of optimization`

			`- "ac" parts can be memcpy'ed with an offset [all you have to do is zero upto the next 8 digits]`
			`- Similarly the "bd" parts can be memcpy'ed and zeroed to 8`
			`-`

			`*/`
			`/* get our value of r */`
			`r = yy >> 1;`

			`/* now solve for ac */`
			`// fp_copy(A, &t1); fp_rshd(&t1, r);`
			`for (s = 0; s < A->used - r; s++) {`
			`t1.dp[s] = A->dp[s+r];`
			`}`
			`for (; s < FP_SIZE; s++) {`
			`t1.dp[s] = 0;`
			`}`
			`if (A->used >= r) {`
			`t1.used = A->used - r;`
			`} else {`
			`t1.used = 0;`
			`}`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`t1.sign = 0;`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00
			`// fp_copy(B, &t2); fp_rshd(&t2, r);`
			`for (s = 0; s < B->used - r; s++) {`
			`t2.dp[s] = B->dp[s+r];`
			`}`
			`for (; s < FP_SIZE; s++) {`
			`t2.dp[s] = 0;`
			`}`
			`if (B->used >= r) {`
			`t2.used = B->used - r;`
			`} else {`
			`t2.used = 0;`
			`}`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`t2.sign = 0;`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00
			`fp_copy(&t1, &amb); fp_copy(&t2, &cmd);`
			`fp_zero(&ac);`
			`fp_mul(&t1, &t2, &ac);`

			`/* now solve for bd */`
			`// fp_mod_2d(A, r * DIGIT_BIT, &t1);`
			`// fp_mod_2d(B, r * DIGIT_BIT, &t2);`
			`for (s = 0; s < r; s++) {`
			`t1.dp[s] = A->dp[s];`
			`t2.dp[s] = B->dp[s];`
			`}`
			`for (; s < FP_SIZE; s++) {`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`t1.dp[s] = 0;`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`t2.dp[s] = 0;`
			`}`
			`t1.used = r;`
			`t2.used = r;`
			`fp_clamp(&t1);`
			`fp_clamp(&t2);`

added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`s_fp_add(&amb, &t1, &amb); s_fp_add(&cmd, &t2, &cmd);`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`fp_zero(&bd);`
			`fp_mul(&t1, &t2, &bd);`

added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`/* now get the (a+b)(c+d) term */`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`fp_zero(&comp);`
			`fp_mul(&amb, &cmd, &comp);`

			`/* now solve the system, do the middle term first */`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`s_fp_sub(&comp, &ac, &comp);`
			`s_fp_sub(&comp, &bd, &comp);`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`fp_lshd(&comp, r);`

			`/* leading term */`
			`fp_lshd(&ac, r+r);`

			`/* now sum them together */`
			`s = A->sign ^ B->sign;`
			`fp_zero(C);`
			`fp_add(&ac, &comp, C);`
			`fp_add(&bd, C, C);`
			`C->sign = C->used ? s : FP_ZPOS;`
			`}`
			`}`

added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00
			`/* $Source$ */`
			`/* $Revision$ */`
			`/* $Date$ */`