tomsfastmath/fp_sqr.c

/* TomsFastMath, a fast ISO C bignum library.
 * 
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 * 
 * Tom St Denis, tomstdenis@gmail.com
 */
#include <tfm.h>

/* b = a*a  */
void fp_sqr(fp_int *A, fp_int *B)
{
    int    r, y, s;
    fp_int aa, bb, comp, amb, t1;

    /* call generic if we're out of range */
    if (A->used + A->used > FP_SIZE) {
       fp_sqr_comba(A, B);
       return ;
    }

    y = A->used;
    if (y <= 64) { 

#if defined(TFM_SMALL_SET)
        if (y <= 16) {
           fp_sqr_comba_small(A,B);
           return;
        }
#endif
#if defined(TFM_SQR32)
        if (y <= 32) {
           fp_sqr_comba32(A,B);
           return;
        }
#endif
#if defined(TFM_SQR48)
        if (y <= 48) {
           fp_sqr_comba48(A,B);
           return;
        }
#endif
#if defined(TFM_SQR64)
        if (y <= 64) {
           fp_sqr_comba64(A,B);
           return;
        }
#endif
       fp_sqr_comba(A, B);
    } else {
        /* do the karatsuba action 

           if A = ab ||a|| = r we need to solve 

           a^2*r^2 + (-(a-b)^2 + a^2 + b^2)*r + b^2

           So we solve for the three products then we form the final result with careful shifting 
           and addition.

Obvious points of optimization

- "ac" parts can be memcpy'ed with an offset [all you have to do is zero upto the next 8 digits]
- Similarly the "bd" parts can be memcpy'ed and zeroed to 8
- 

        */
        /* get our value of r */
        r = y >> 1;

        /* now solve for ac */
//        fp_copy(A, &t1); fp_rshd(&t1, r); 
        for (s = 0; s < A->used - r; s++) {
            t1.dp[s] = A->dp[s+r];
        }
        for (; s < FP_SIZE; s++) {
            t1.dp[s] = 0; 
        }
        if (A->used >= r) {
           t1.used = A->used - r;
        } else {
           t1.used = 0;
        }
        t1.sign = A->sign;
        fp_copy(&t1, &amb); 
        fp_zero(&aa);
        fp_sqr(&t1, &aa);

        /* now solve for bd */
//        fp_mod_2d(A, r * DIGIT_BIT, &t1);
        for (s = 0; s < r; s++) {
            t1.dp[s] = A->dp[s];
        }
        for (; s < FP_SIZE; s++) {
            t1.dp[s]   = 0; 
        }
        t1.used = r;
        fp_clamp(&t1);

        fp_sub(&amb, &t1, &amb); 
        fp_zero(&bb);
        fp_sqr(&t1, &bb);

        /* now get the (a-b) term */
        fp_zero(&comp);
        fp_sqr(&amb, &comp);

        /* now solve the system, do the middle term first */
        comp.sign ^= 1;
        fp_add(&comp, &aa, &comp);
        fp_add(&comp, &bb, &comp);
        fp_lshd(&comp, r);
  
        /* leading term */
        fp_lshd(&aa, r+r);

        /* now sum them together */
        fp_zero(B);
        fp_add(&aa, &comp, B);
        fp_add(&bb, B, B);    
        B->sign = FP_ZPOS;
    }
}


/* $Source$ */
/* $Revision$ */
/* $Date$ */
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`/* TomsFastMath, a fast ISO C bignum library.`
			`*`
			`* This project is meant to fill in where LibTomMath`
			`* falls short. That is speed ;-)`
			`*`
			`* This project is public domain and free for all purposes.`
			`*`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`* Tom St Denis, tomstdenis@gmail.com`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`*/`
			`#include <tfm.h>`

			`/* b = aa /`
			`void fp_sqr(fp_int A, fp_int B)`
			`{`
			`int r, y, s;`
			`fp_int aa, bb, comp, amb, t1;`

added tomsfastmath-0.06 2005-10-31 16:32:05 +01:00			`/* call generic if we're out of range */`
			`if (A->used + A->used > FP_SIZE) {`
			`fp_sqr_comba(A, B);`
			`return ;`
			`}`

added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`y = A->used;`
added tomsfastmath-0.03 2005-03-02 00:00:09 +01:00			`if (y <= 64) {`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00
			`#if defined(TFM_SMALL_SET)`
			`if (y <= 16) {`
			`fp_sqr_comba_small(A,B);`
added tomsfastmath-0.07 2005-11-18 06:16:25 +01:00			`return;`
			`}`
added tomsfastmath-0.02 2004-09-19 03:31:44 +02:00			`#endif`
added tomsfastmath-0.06 2005-10-31 16:32:05 +01:00			`#if defined(TFM_SQR32)`
added tomsfastmath-0.07 2005-11-18 06:16:25 +01:00			`if (y <= 32) {`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`fp_sqr_comba32(A,B);`
added tomsfastmath-0.07 2005-11-18 06:16:25 +01:00			`return;`
			`}`
added tomsfastmath-0.06 2005-10-31 16:32:05 +01:00			`#endif`
			`#if defined(TFM_SQR48)`
added tomsfastmath-0.07 2005-11-18 06:16:25 +01:00			`if (y <= 48) {`
added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00			`fp_sqr_comba48(A,B);`
added tomsfastmath-0.07 2005-11-18 06:16:25 +01:00			`return;`
			`}`
added tomsfastmath-0.06 2005-10-31 16:32:05 +01:00			`#endif`
			`#if defined(TFM_SQR64)`
added tomsfastmath-0.07 2005-11-18 06:16:25 +01:00			`if (y <= 64) {`
added tomsfastmath-0.03 2005-03-02 00:00:09 +01:00			`fp_sqr_comba64(A,B);`
added tomsfastmath-0.07 2005-11-18 06:16:25 +01:00			`return;`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`}`
added tomsfastmath-0.07 2005-11-18 06:16:25 +01:00			`#endif`
			`fp_sqr_comba(A, B);`
added tomsfastmath-0.01 2004-08-25 04:43:43 +02:00			`} else {`
			`/* do the karatsuba action`

			`if A = ab \|\|a\|\| = r we need to solve`

			`a^2r^2 + (-(a-b)^2 + a^2 + b^2)r + b^2`

			`So we solve for the three products then we form the final result with careful shifting`
			`and addition.`

			`Obvious points of optimization`

			`- "ac" parts can be memcpy'ed with an offset [all you have to do is zero upto the next 8 digits]`
			`- Similarly the "bd" parts can be memcpy'ed and zeroed to 8`
			`-`

			`*/`
			`/* get our value of r */`
			`r = y >> 1;`

			`/* now solve for ac */`
			`// fp_copy(A, &t1); fp_rshd(&t1, r);`
			`for (s = 0; s < A->used - r; s++) {`
			`t1.dp[s] = A->dp[s+r];`
			`}`
			`for (; s < FP_SIZE; s++) {`
			`t1.dp[s] = 0;`
			`}`
			`if (A->used >= r) {`
			`t1.used = A->used - r;`
			`} else {`
			`t1.used = 0;`
			`}`
			`t1.sign = A->sign;`
			`fp_copy(&t1, &amb);`
			`fp_zero(&aa);`
			`fp_sqr(&t1, &aa);`

			`/* now solve for bd */`
			`// fp_mod_2d(A, r * DIGIT_BIT, &t1);`
			`for (s = 0; s < r; s++) {`
			`t1.dp[s] = A->dp[s];`
			`}`
			`for (; s < FP_SIZE; s++) {`
			`t1.dp[s] = 0;`
			`}`
			`t1.used = r;`
			`fp_clamp(&t1);`

			`fp_sub(&amb, &t1, &amb);`
			`fp_zero(&bb);`
			`fp_sqr(&t1, &bb);`

			`/* now get the (a-b) term */`
			`fp_zero(&comp);`
			`fp_sqr(&amb, &comp);`

			`/* now solve the system, do the middle term first */`
			`comp.sign ^= 1;`
			`fp_add(&comp, &aa, &comp);`
			`fp_add(&comp, &bb, &comp);`
			`fp_lshd(&comp, r);`

			`/* leading term */`
			`fp_lshd(&aa, r+r);`

			`/* now sum them together */`
			`fp_zero(B);`
			`fp_add(&aa, &comp, B);`
			`fp_add(&bb, B, B);`
			`B->sign = FP_ZPOS;`
			`}`
			`}`

added tomsfastmath-0.04 2005-07-23 12:43:03 +02:00
			`/* $Source$ */`
			`/* $Revision$ */`
			`/* $Date$ */`