/* TomsFastMath, a fast ISO C bignum library. * * This project is meant to fill in where LibTomMath * falls short. That is speed ;-) * * This project is public domain and free for all purposes. * * Tom St Denis, tomstdenis@gmail.com */ #include /* c = a * b */ void fp_mul(fp_int *A, fp_int *B, fp_int *C) { int r, y, yy, s; fp_int ac, bd, comp, amb, cmd, t1, t2; y = MAX(A->used, B->used); yy = MIN(A->used, B->used); if (yy <= 8 || y <= 64) { /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size of the largest input. We also want to avoid doing excess mults if the inputs are not close to the next power of two. That is, for example, if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications */ #ifdef TFM_SMALL_SET if (y <= 16) { fp_mul_comba_small(A,B,C); #elif defined(TFM_HUGE) if (0) { 1; #endif #if defined(TFM_HUGE) } else if (y <= 32) { fp_mul_comba32(A,B,C); } else if (y <= 48) { fp_mul_comba48(A,B,C); } else if (y <= 64) { fp_mul_comba64(A,B,C); #endif #if !defined(TFM_HUGE) && !defined(TFM_SMALL_SET) { #else } else { #endif fp_mul_comba(A,B,C); } } else { /* do the karatsuba action if A = ab and B = cd for ||a|| = r we need to solve ac*r^2 + ((a+b)(c+d) - (ac + bd))*r + bd So we solve for the three products then we form the final result with careful shifting and addition. Obvious points of optimization - "ac" parts can be memcpy'ed with an offset [all you have to do is zero upto the next 8 digits] - Similarly the "bd" parts can be memcpy'ed and zeroed to 8 - */ /* get our value of r */ r = yy >> 1; /* now solve for ac */ // fp_copy(A, &t1); fp_rshd(&t1, r); for (s = 0; s < A->used - r; s++) { t1.dp[s] = A->dp[s+r]; } for (; s < FP_SIZE; s++) { t1.dp[s] = 0; } if (A->used >= r) { t1.used = A->used - r; } else { t1.used = 0; } t1.sign = 0; // fp_copy(B, &t2); fp_rshd(&t2, r); for (s = 0; s < B->used - r; s++) { t2.dp[s] = B->dp[s+r]; } for (; s < FP_SIZE; s++) { t2.dp[s] = 0; } if (B->used >= r) { t2.used = B->used - r; } else { t2.used = 0; } t2.sign = 0; fp_copy(&t1, &amb); fp_copy(&t2, &cmd); fp_zero(&ac); fp_mul(&t1, &t2, &ac); /* now solve for bd */ // fp_mod_2d(A, r * DIGIT_BIT, &t1); // fp_mod_2d(B, r * DIGIT_BIT, &t2); for (s = 0; s < r; s++) { t1.dp[s] = A->dp[s]; t2.dp[s] = B->dp[s]; } for (; s < FP_SIZE; s++) { t1.dp[s] = 0; t2.dp[s] = 0; } t1.used = r; t2.used = r; fp_clamp(&t1); fp_clamp(&t2); s_fp_add(&amb, &t1, &amb); s_fp_add(&cmd, &t2, &cmd); fp_zero(&bd); fp_mul(&t1, &t2, &bd); /* now get the (a+b)(c+d) term */ fp_zero(&comp); fp_mul(&amb, &cmd, &comp); /* now solve the system, do the middle term first */ s_fp_sub(&comp, &ac, &comp); s_fp_sub(&comp, &bd, &comp); fp_lshd(&comp, r); /* leading term */ fp_lshd(&ac, r+r); /* now sum them together */ s = A->sign ^ B->sign; fp_zero(C); fp_add(&ac, &comp, C); fp_add(&bd, C, C); C->sign = C->used ? s : FP_ZPOS; } } /* $Source$ */ /* $Revision$ */ /* $Date$ */