| OLD | NEW |
| (Empty) |
| 1 /* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times | |
| 2 as large as bn. Or more accurately, (5/2)bn < an < 6bn. | |
| 3 | |
| 4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. | |
| 5 | |
| 6 The idea of applying toom to unbalanced multiplication is due to by Marco | |
| 7 Bodrato and Alberto Zanoni. | |
| 8 | |
| 9 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY | |
| 10 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST | |
| 11 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. | |
| 12 | |
| 13 Copyright 2006, 2007, 2008 Free Software Foundation, Inc. | |
| 14 | |
| 15 This file is part of the GNU MP Library. | |
| 16 | |
| 17 The GNU MP Library is free software; you can redistribute it and/or modify | |
| 18 it under the terms of the GNU Lesser General Public License as published by | |
| 19 the Free Software Foundation; either version 3 of the License, or (at your | |
| 20 option) any later version. | |
| 21 | |
| 22 The GNU MP Library is distributed in the hope that it will be useful, but | |
| 23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
| 24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
| 25 License for more details. | |
| 26 | |
| 27 You should have received a copy of the GNU Lesser General Public License | |
| 28 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ | |
| 29 | |
| 30 | |
| 31 /* | |
| 32 Things to work on: | |
| 33 | |
| 34 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be | |
| 35 avoided by instead reusing the pp area and the scratch allocation. | |
| 36 */ | |
| 37 | |
| 38 #include "gmp.h" | |
| 39 #include "gmp-impl.h" | |
| 40 | |
| 41 | |
| 42 /* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf | |
| 43 | |
| 44 <-s-><--n--><--n--><--n--> | |
| 45 ___ ______ ______ ______ ______ ______ | |
| 46 |a5_|___a4_|___a3_|___a2_|___a1_|___a0_| | |
| 47 |_b1_|___b0_| | |
| 48 <-t--><--n--> | |
| 49 | |
| 50 v0 = a0 * b0 # A(0)*B(0) | |
| 51 v1 = ( a0+ a1+ a2+ a3+ a4+ a5)*( b0+ b1) # A(1)*B(1) ah <= 5 b
h <= 1 | |
| 52 vm1 = ( a0- a1+ a2- a3+ a4- a5)*( b0- b1) # A(-1)*B(-1) |ah| <= 2 b
h = 0 | |
| 53 v2 = ( a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) # A(2)*B(2) ah <= 62 b
h <= 2 | |
| 54 vh = (32a0+16a1+8a2+4a3+ 2a4+ a5)*(2b0+ b1) # A(1/2)*B(1/2) ah <= 62 b
h <= 2 | |
| 55 vmh = (32a0-16a1+8a2-4a3+ 2a4- a5)*(2b0- b1) # A(-1/2)*B(-1/2) -20<=ah<=41 0
<=bh<=1 | |
| 56 vinf= a5 * b1 # A(inf)*B(inf) | |
| 57 */ | |
| 58 | |
| 59 void | |
| 60 mpn_toom62_mul (mp_ptr pp, | |
| 61 mp_srcptr ap, mp_size_t an, | |
| 62 mp_srcptr bp, mp_size_t bn, | |
| 63 mp_ptr scratch) | |
| 64 { | |
| 65 mp_size_t n, s, t; | |
| 66 int vm1_neg, vmh_neg, bsm_neg; | |
| 67 mp_limb_t cy; | |
| 68 mp_ptr a0_a2, a1_a3; | |
| 69 mp_ptr as1, asm1, as2, ash, asmh; | |
| 70 mp_ptr bs1, bsm1, bs2, bsh, bsmh; | |
| 71 enum toom4_flags flags; | |
| 72 TMP_DECL; | |
| 73 | |
| 74 #define a0 ap | |
| 75 #define a1 (ap + n) | |
| 76 #define a2 (ap + 2*n) | |
| 77 #define a3 (ap + 3*n) | |
| 78 #define a4 (ap + 4*n) | |
| 79 #define a5 (ap + 5*n) | |
| 80 #define b0 bp | |
| 81 #define b1 (bp + n) | |
| 82 | |
| 83 n = 1 + (an >= 3 * bn ? (an - 1) / (unsigned long) 6 : (bn - 1) >> 1); | |
| 84 | |
| 85 s = an - 5 * n; | |
| 86 t = bn - n; | |
| 87 | |
| 88 ASSERT (0 < s && s <= n); | |
| 89 ASSERT (0 < t && t <= n); | |
| 90 | |
| 91 TMP_MARK; | |
| 92 | |
| 93 as1 = TMP_SALLOC_LIMBS (n + 1); | |
| 94 asm1 = TMP_SALLOC_LIMBS (n + 1); | |
| 95 as2 = TMP_SALLOC_LIMBS (n + 1); | |
| 96 ash = TMP_SALLOC_LIMBS (n + 1); | |
| 97 asmh = TMP_SALLOC_LIMBS (n + 1); | |
| 98 | |
| 99 bs1 = TMP_SALLOC_LIMBS (n + 1); | |
| 100 bsm1 = TMP_SALLOC_LIMBS (n); | |
| 101 bs2 = TMP_SALLOC_LIMBS (n + 1); | |
| 102 bsh = TMP_SALLOC_LIMBS (n + 1); | |
| 103 bsmh = TMP_SALLOC_LIMBS (n + 1); | |
| 104 | |
| 105 a0_a2 = pp; | |
| 106 a1_a3 = pp + n + 1; | |
| 107 | |
| 108 /* Compute as1 and asm1. */ | |
| 109 a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n); | |
| 110 a0_a2[n] += mpn_add_n (a0_a2, a0_a2, a4, n); | |
| 111 a1_a3[n] = mpn_add_n (a1_a3, a1, a3, n); | |
| 112 a1_a3[n] += mpn_add (a1_a3, a1_a3, n, a5, s); | |
| 113 #if HAVE_NATIVE_mpn_addsub_n | |
| 114 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) | |
| 115 { | |
| 116 mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1); | |
| 117 vm1_neg = 1; | |
| 118 } | |
| 119 else | |
| 120 { | |
| 121 mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1); | |
| 122 vm1_neg = 0; | |
| 123 } | |
| 124 #else | |
| 125 mpn_add_n (as1, a0_a2, a1_a3, n + 1); | |
| 126 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) | |
| 127 { | |
| 128 mpn_sub_n (asm1, a1_a3, a0_a2, n + 1); | |
| 129 vm1_neg = 1; | |
| 130 } | |
| 131 else | |
| 132 { | |
| 133 mpn_sub_n (asm1, a0_a2, a1_a3, n + 1); | |
| 134 vm1_neg = 0; | |
| 135 } | |
| 136 #endif | |
| 137 | |
| 138 /* Compute as2. */ | |
| 139 #if HAVE_NATIVE_mpn_addlsh1_n | |
| 140 cy = mpn_addlsh1_n (as2, a4, a5, s); | |
| 141 if (s != n) | |
| 142 cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy); | |
| 143 cy = 2 * cy + mpn_addlsh1_n (as2, a3, as2, n); | |
| 144 cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n); | |
| 145 cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); | |
| 146 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); | |
| 147 #else | |
| 148 cy = mpn_lshift (as2, a5, s, 1); | |
| 149 cy += mpn_add_n (as2, a4, as2, s); | |
| 150 if (s != n) | |
| 151 cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy); | |
| 152 cy = 2 * cy + mpn_lshift (as2, as2, n, 1); | |
| 153 cy += mpn_add_n (as2, a3, as2, n); | |
| 154 cy = 2 * cy + mpn_lshift (as2, as2, n, 1); | |
| 155 cy += mpn_add_n (as2, a2, as2, n); | |
| 156 cy = 2 * cy + mpn_lshift (as2, as2, n, 1); | |
| 157 cy += mpn_add_n (as2, a1, as2, n); | |
| 158 cy = 2 * cy + mpn_lshift (as2, as2, n, 1); | |
| 159 cy += mpn_add_n (as2, a0, as2, n); | |
| 160 #endif | |
| 161 as2[n] = cy; | |
| 162 | |
| 163 /* Compute ash and asmh. */ | |
| 164 #if HAVE_NATIVE_mpn_addlsh_n | |
| 165 cy = mpn_addlsh_n (a0_a2, a2, a0, n, 2); /* 4a0 + a2 */ | |
| 166 cy = 4 * cy + mpn_addlsh_n (a0_a2, a4, a0_a2, n, 2); /* 16a0 + 4a2 + a4 */ | |
| 167 cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1); /* 32a0 + 8a2 + 2a4 */ | |
| 168 a0_a2[n] = cy; | |
| 169 cy = mpn_addlsh_n (a1_a3, a3, a1, n, 2); /* 4a1 */ | |
| 170 cy = 4 * cy + mpn_addlsh_n (a1_a3, a5, a1_a3, n, 2); /* 16a1 + 4a3 */ | |
| 171 a1_a3[n] = cy; | |
| 172 #else | |
| 173 cy = mpn_lshift (a0_a2, a0, n, 2); /* 4a0 */ | |
| 174 cy += mpn_add_n (a0_a2, a2, a0_a2, n); /* 4a0 + a2 */ | |
| 175 cy = 4 * cy + mpn_lshift (a0_a2, a0_a2, n, 2); /* 16a0 + 4a2 */ | |
| 176 cy += mpn_add_n (a0_a2, a4, a0_a2, n); /* 16a0 + 4a2 + a4 */ | |
| 177 cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1); /* 32a0 + 8a2 + 2a4 */ | |
| 178 a0_a2[n] = cy; | |
| 179 cy = mpn_lshift (a1_a3, a1, n, 2); /* 4a1 */ | |
| 180 cy += mpn_add_n (a1_a3, a3, a1_a3, n); /* 4a1 + a3 */ | |
| 181 cy = 4 * cy + mpn_lshift (a1_a3, a1_a3, n, 2); /* 16a1 + 4a3 */ | |
| 182 cy += mpn_add (a1_a3, a1_a3, n, a5, s); /* 16a1 + 4a3 + a5 */ | |
| 183 a1_a3[n] = cy; | |
| 184 #endif | |
| 185 #if HAVE_NATIVE_mpn_addsub_n | |
| 186 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) | |
| 187 { | |
| 188 mpn_addsub_n (ash, asmh, a1_a3, a0_a2, n + 1); | |
| 189 vmh_neg = 1; | |
| 190 } | |
| 191 else | |
| 192 { | |
| 193 mpn_addsub_n (ash, asmh, a0_a2, a1_a3, n + 1); | |
| 194 vmh_neg = 0; | |
| 195 } | |
| 196 #else | |
| 197 mpn_add_n (ash, a0_a2, a1_a3, n + 1); | |
| 198 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) | |
| 199 { | |
| 200 mpn_sub_n (asmh, a1_a3, a0_a2, n + 1); | |
| 201 vmh_neg = 1; | |
| 202 } | |
| 203 else | |
| 204 { | |
| 205 mpn_sub_n (asmh, a0_a2, a1_a3, n + 1); | |
| 206 vmh_neg = 0; | |
| 207 } | |
| 208 #endif | |
| 209 | |
| 210 /* Compute bs1 and bsm1. */ | |
| 211 if (t == n) | |
| 212 { | |
| 213 #if HAVE_NATIVE_mpn_addsub_n | |
| 214 if (mpn_cmp (b0, b1, n) < 0) | |
| 215 { | |
| 216 cy = mpn_addsub_n (bs1, bsm1, b1, b0, n); | |
| 217 bsm_neg = 1; | |
| 218 } | |
| 219 else | |
| 220 { | |
| 221 cy = mpn_addsub_n (bs1, bsm1, b0, b1, n); | |
| 222 bsm_neg = 0; | |
| 223 } | |
| 224 bs1[n] = cy >> 1; | |
| 225 #else | |
| 226 bs1[n] = mpn_add_n (bs1, b0, b1, n); | |
| 227 if (mpn_cmp (b0, b1, n) < 0) | |
| 228 { | |
| 229 mpn_sub_n (bsm1, b1, b0, n); | |
| 230 bsm_neg = 1; | |
| 231 } | |
| 232 else | |
| 233 { | |
| 234 mpn_sub_n (bsm1, b0, b1, n); | |
| 235 bsm_neg = 0; | |
| 236 } | |
| 237 #endif | |
| 238 } | |
| 239 else | |
| 240 { | |
| 241 bs1[n] = mpn_add (bs1, b0, n, b1, t); | |
| 242 if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) | |
| 243 { | |
| 244 mpn_sub_n (bsm1, b1, b0, t); | |
| 245 MPN_ZERO (bsm1 + t, n - t); | |
| 246 bsm_neg = 1; | |
| 247 } | |
| 248 else | |
| 249 { | |
| 250 mpn_sub (bsm1, b0, n, b1, t); | |
| 251 bsm_neg = 0; | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 vm1_neg ^= bsm_neg; | |
| 256 | |
| 257 /* Compute bs2, recycling bs1. bs2=bs1+b1 */ | |
| 258 mpn_add (bs2, bs1, n + 1, b1, t); | |
| 259 | |
| 260 /* Compute bsh and bsmh, recycling bs1 and bsm1. bsh=bs1+b0; bsmh=bsmh+b0 */ | |
| 261 if (bsm_neg == 1) | |
| 262 { | |
| 263 bsmh[n] = 0; | |
| 264 if (mpn_cmp (bsm1, b0, n) < 0) | |
| 265 { | |
| 266 bsm_neg = 0; | |
| 267 mpn_sub_n (bsmh, b0, bsm1, n); | |
| 268 } | |
| 269 else | |
| 270 mpn_sub_n (bsmh, bsm1, b0, n); | |
| 271 } | |
| 272 else | |
| 273 bsmh[n] = mpn_add_n (bsmh, bsm1, b0, n); | |
| 274 mpn_add (bsh, bs1, n + 1, b0, n); | |
| 275 vmh_neg ^= bsm_neg; | |
| 276 | |
| 277 | |
| 278 ASSERT (as1[n] <= 5); | |
| 279 ASSERT (bs1[n] <= 1); | |
| 280 ASSERT (asm1[n] <= 2); | |
| 281 /*ASSERT (bsm1[n] == 0);*/ | |
| 282 ASSERT (as2[n] <= 62); | |
| 283 ASSERT (bs2[n] <= 2); | |
| 284 ASSERT (ash[n] <= 62); | |
| 285 ASSERT (bsh[n] <= 2); | |
| 286 ASSERT (asmh[n] <= 41); | |
| 287 ASSERT (bsmh[n] <= 1); | |
| 288 | |
| 289 #define v0 pp /* 2n */ | |
| 290 #define v1 (scratch + 6 * n + 6) /* 2n+1 */ | |
| 291 #define vinf (pp + 6 * n) /* s+t */ | |
| 292 #define vm1 scratch /* 2n+1 */ | |
| 293 #define v2 (scratch + 2 * n + 2) /* 2n+1 */ | |
| 294 #define vh (pp + 2 * n) /* 2n+1 */ | |
| 295 #define vmh (scratch + 4 * n + 4) | |
| 296 | |
| 297 /* vm1, 2n+1 limbs */ | |
| 298 mpn_mul_n (vm1, asm1, bsm1, n); | |
| 299 cy = 0; | |
| 300 if (asm1[n] == 1) | |
| 301 { | |
| 302 cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n); | |
| 303 } | |
| 304 else if (asm1[n] == 2) | |
| 305 { | |
| 306 #if HAVE_NATIVE_mpn_addlsh1_n | |
| 307 cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n); | |
| 308 #else | |
| 309 cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2)); | |
| 310 #endif | |
| 311 } | |
| 312 vm1[2 * n] = cy; | |
| 313 | |
| 314 mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */ | |
| 315 | |
| 316 /* vinf, s+t limbs */ | |
| 317 if (s > t) mpn_mul (vinf, a5, s, b1, t); | |
| 318 else mpn_mul (vinf, b1, t, a5, s); | |
| 319 | |
| 320 /* v1, 2n+1 limbs */ | |
| 321 mpn_mul_n (v1, as1, bs1, n); | |
| 322 if (as1[n] == 1) | |
| 323 { | |
| 324 cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); | |
| 325 } | |
| 326 else if (as1[n] == 2) | |
| 327 { | |
| 328 #if HAVE_NATIVE_mpn_addlsh1_n | |
| 329 cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n); | |
| 330 #else | |
| 331 cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); | |
| 332 #endif | |
| 333 } | |
| 334 else if (as1[n] != 0) | |
| 335 { | |
| 336 cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]); | |
| 337 } | |
| 338 else | |
| 339 cy = 0; | |
| 340 if (bs1[n] != 0) | |
| 341 cy += mpn_add_n (v1 + n, v1 + n, as1, n); | |
| 342 v1[2 * n] = cy; | |
| 343 | |
| 344 mpn_mul_n (vh, ash, bsh, n + 1); | |
| 345 | |
| 346 mpn_mul_n (vmh, asmh, bsmh, n + 1); | |
| 347 | |
| 348 mpn_mul_n (v0, ap, bp, n); /* v0, 2n limbs */ | |
| 349 | |
| 350 flags = vm1_neg ? toom4_w3_neg : 0; | |
| 351 flags |= vmh_neg ? toom4_w1_neg : 0; | |
| 352 | |
| 353 mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch + 8
* n + 8); | |
| 354 | |
| 355 TMP_FREE; | |
| 356 } | |
| OLD | NEW |