| OLD | NEW |
| (Empty) |
| 1 /* mpn_mu_div_qr, mpn_preinv_mu_div_qr. | |
| 2 | |
| 3 Compute Q = floor(N / D) and R = N-QD. N is nn limbs and D is dn limbs and | |
| 4 must be normalized, and Q must be nn-dn limbs. The requirement that Q is | |
| 5 nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to | |
| 6 let N be unmodified during the operation. | |
| 7 | |
| 8 Contributed to the GNU project by Torbjorn Granlund. | |
| 9 | |
| 10 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS | |
| 11 ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS | |
| 12 ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP | |
| 13 RELEASE. | |
| 14 | |
| 15 Copyright 2005, 2006, 2007 Free Software Foundation, Inc. | |
| 16 | |
| 17 This file is part of the GNU MP Library. | |
| 18 | |
| 19 The GNU MP Library is free software; you can redistribute it and/or modify | |
| 20 it under the terms of the GNU Lesser General Public License as published by | |
| 21 the Free Software Foundation; either version 3 of the License, or (at your | |
| 22 option) any later version. | |
| 23 | |
| 24 The GNU MP Library is distributed in the hope that it will be useful, but | |
| 25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
| 26 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
| 27 License for more details. | |
| 28 | |
| 29 You should have received a copy of the GNU Lesser General Public License | |
| 30 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ | |
| 31 | |
| 32 | |
| 33 /* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann | |
| 34 and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of | |
| 35 Jebelean's bidirectional exact division algorithm. | |
| 36 | |
| 37 The idea of this algorithm is to compute a smaller inverted value than used | |
| 38 in the standard Barrett algorithm, and thus save time in the Newton | |
| 39 iterations, and pay just a small price when using the inverted value for | |
| 40 developing quotient bits. | |
| 41 | |
| 42 Written by Torbjorn Granlund. Paul Zimmermann suggested the use of the | |
| 43 "wrap around" trick. Based on the GMP divexact code and inspired by code | |
| 44 contributed to GMP by Karl Hasselstroem. | |
| 45 */ | |
| 46 | |
| 47 | |
| 48 /* CAUTION: This code and the code in mu_divappr_q.c should be edited in lockste
p. | |
| 49 | |
| 50 Things to work on: | |
| 51 | |
| 52 * Passing k isn't a great interface. Either 'in' should be passed, or | |
| 53 determined by the code. | |
| 54 | |
| 55 * The current mpn_mu_div_qr_itch isn't exactly scientifically written. | |
| 56 Scratch space buffer overruns are not unlikely before some analysis is | |
| 57 applied. Since scratch requirements are expected to change, such an | |
| 58 analysis will have to wait til things settle. | |
| 59 | |
| 60 * This isn't optimal when the remainder isn't needed, since the final | |
| 61 multiplication could be made special and take O(1) time on average, in that | |
| 62 case. This is particularly bad when qn << dn. At some level, code as in | |
| 63 GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading 2qn | |
| 64 dividend limbs by the qn divisor limbs. | |
| 65 | |
| 66 * This isn't optimal when the quotient isn't needed, as it might take a lot | |
| 67 of space. The computation is always needed, though, so there is not time | |
| 68 to save with special code. | |
| 69 | |
| 70 * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, | |
| 71 demonstrated by the fact that the mpn_inv function's scratch needs means | |
| 72 that we need to keep a large allocation long after it is needed. Things | |
| 73 are worse as mpn_mul_fft does not accept any scratch parameter, which means | |
| 74 we'll have a large memory hole while in mpn_mul_fft. In general, a peak | |
| 75 scratch need in the beginning of a function isn't well-handled by the | |
| 76 itch/scratch scheme. | |
| 77 | |
| 78 * Some ideas from comments in divexact.c apply to this code too. | |
| 79 */ | |
| 80 | |
| 81 /* the NOSTAT stuff handles properly the case where files are concatenated */ | |
| 82 #ifdef NOSTAT | |
| 83 #undef STAT | |
| 84 #endif | |
| 85 | |
| 86 #ifdef STAT | |
| 87 #undef STAT | |
| 88 #define STAT(x) x | |
| 89 #else | |
| 90 #define NOSTAT | |
| 91 #define STAT(x) | |
| 92 #endif | |
| 93 | |
| 94 #include <stdlib.h> /* for NULL */ | |
| 95 #include "gmp.h" | |
| 96 #include "gmp-impl.h" | |
| 97 | |
| 98 | |
| 99 /* In case k=0 (automatic choice), we distinguish 3 cases: | |
| 100 (a) dn < qn: in = ceil(qn / ceil(qn/dn)) | |
| 101 (b) dn/3 < qn <= dn: in = ceil(qn / 2) | |
| 102 (c) qn < dn/3: in = qn | |
| 103 In all cases we have in <= dn. | |
| 104 */ | |
| 105 mp_size_t | |
| 106 mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k) | |
| 107 { | |
| 108 mp_size_t in; | |
| 109 | |
| 110 if (k == 0) | |
| 111 { | |
| 112 mp_size_t b; | |
| 113 if (qn > dn) | |
| 114 { | |
| 115 /* Compute an inverse size that is a nice partition of the quotient.
*/ | |
| 116 b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ | |
| 117 in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) *
/ | |
| 118 } | |
| 119 else if (3 * qn > dn) | |
| 120 { | |
| 121 in = (qn - 1) / 2 + 1; /* b = 2 */ | |
| 122 } | |
| 123 else | |
| 124 { | |
| 125 in = (qn - 1) / 1 + 1; /* b = 1 */ | |
| 126 } | |
| 127 } | |
| 128 else | |
| 129 { | |
| 130 mp_size_t xn; | |
| 131 xn = MIN (dn, qn); | |
| 132 in = (xn - 1) / k + 1; | |
| 133 } | |
| 134 | |
| 135 return in; | |
| 136 } | |
| 137 | |
| 138 static mp_limb_t | |
| 139 mpn_mu_div_qr2 (mp_ptr qp, | |
| 140 mp_ptr rp, | |
| 141 mp_ptr np, | |
| 142 mp_size_t nn, | |
| 143 mp_srcptr dp, | |
| 144 mp_size_t dn, | |
| 145 mp_ptr scratch) | |
| 146 { | |
| 147 mp_size_t qn, in; | |
| 148 mp_limb_t cy; | |
| 149 mp_ptr ip, tp; | |
| 150 | |
| 151 /* FIXME: We should probably not handle tiny operands, but do it for now. */ | |
| 152 if (dn == 1) | |
| 153 { | |
| 154 rp[0] = mpn_divrem_1 (scratch, 0L, np, nn, dp[0]); | |
| 155 MPN_COPY (qp, scratch, nn - 1); | |
| 156 return scratch[nn - 1]; | |
| 157 } | |
| 158 | |
| 159 qn = nn - dn; | |
| 160 | |
| 161 /* Compute the inverse size. */ | |
| 162 in = mpn_mu_div_qr_choose_in (qn, dn, 0); | |
| 163 ASSERT (in <= dn); | |
| 164 | |
| 165 #if 1 | |
| 166 /* This alternative inverse computation method gets slightly more accurate | |
| 167 results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function | |
| 168 not adapted (3) mpn_invert scratch needs not met. */ | |
| 169 ip = scratch; | |
| 170 tp = scratch + in + 1; | |
| 171 | |
| 172 /* compute an approximate inverse on (in+1) limbs */ | |
| 173 if (dn == in) | |
| 174 { | |
| 175 MPN_COPY (tp + 1, dp, in); | |
| 176 tp[0] = 1; | |
| 177 mpn_invert (ip, tp, in + 1, NULL); | |
| 178 MPN_COPY_INCR (ip, ip + 1, in); | |
| 179 } | |
| 180 else | |
| 181 { | |
| 182 cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1); | |
| 183 if (UNLIKELY (cy != 0)) | |
| 184 MPN_ZERO (ip, in); | |
| 185 else | |
| 186 { | |
| 187 mpn_invert (ip, tp, in + 1, NULL); | |
| 188 MPN_COPY_INCR (ip, ip + 1, in); | |
| 189 } | |
| 190 } | |
| 191 #else | |
| 192 /* This older inverse computation method gets slightly worse results than the | |
| 193 one above. */ | |
| 194 ip = scratch; | |
| 195 tp = scratch + in; | |
| 196 | |
| 197 /* Compute inverse of D to in+1 limbs, then round to 'in' limbs. Ideally the | |
| 198 inversion function should do this automatically. */ | |
| 199 if (dn == in) | |
| 200 { | |
| 201 tp[in + 1] = 0; | |
| 202 MPN_COPY (tp + in + 2, dp, in); | |
| 203 mpn_invert (tp, tp + in + 1, in + 1, NULL); | |
| 204 } | |
| 205 else | |
| 206 { | |
| 207 mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL); | |
| 208 } | |
| 209 cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT); | |
| 210 if (UNLIKELY (cy != 0)) | |
| 211 MPN_ZERO (tp + 1, in); | |
| 212 MPN_COPY (ip, tp + 1, in); | |
| 213 #endif | |
| 214 | |
| 215 /* We can't really handle qh = 1 like this since we'd here clobber N, which is | |
| 216 not allowed in the way we've defined this function's API. */ | |
| 217 #if 0 | |
| 218 qh = mpn_cmp (np + qn, dp, dn) >= 0; | |
| 219 if (qh != 0) | |
| 220 mpn_sub_n (np + qn, np + qn, dp, dn); | |
| 221 #endif | |
| 222 | |
| 223 mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in); | |
| 224 | |
| 225 /* return qh; */ | |
| 226 return 0; | |
| 227 } | |
| 228 | |
| 229 void | |
| 230 mpn_preinv_mu_div_qr (mp_ptr qp, | |
| 231 mp_ptr rp, | |
| 232 mp_ptr np, | |
| 233 mp_size_t nn, | |
| 234 mp_srcptr dp, | |
| 235 mp_size_t dn, | |
| 236 mp_srcptr ip, | |
| 237 mp_size_t in, | |
| 238 mp_ptr scratch) | |
| 239 { | |
| 240 mp_size_t qn; | |
| 241 mp_limb_t cy; | |
| 242 mp_ptr tp; | |
| 243 mp_limb_t r; | |
| 244 | |
| 245 qn = nn - dn; | |
| 246 | |
| 247 if (qn == 0) | |
| 248 { | |
| 249 MPN_COPY (rp, np, dn); | |
| 250 return; | |
| 251 } | |
| 252 | |
| 253 tp = scratch; | |
| 254 | |
| 255 np += qn; | |
| 256 qp += qn; | |
| 257 | |
| 258 MPN_COPY (rp, np, dn); | |
| 259 | |
| 260 while (qn > 0) | |
| 261 { | |
| 262 if (qn < in) | |
| 263 { | |
| 264 ip += in - qn; | |
| 265 in = qn; | |
| 266 } | |
| 267 np -= in; | |
| 268 qp -= in; | |
| 269 | |
| 270 /* Compute the next block of quotient limbs by multiplying the inverse I | |
| 271 by the upper part of the partial remainder R. */ | |
| 272 mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */ | |
| 273 cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */ | |
| 274 ASSERT_ALWAYS (cy == 0); /* FIXME */ | |
| 275 | |
| 276 /* Compute the product of the quotient block and the divisor D, to be | |
| 277 subtracted from the partial remainder combined with new limbs from the | |
| 278 dividend N. We only really need the low dn limbs. */ | |
| 279 #if WANT_FFT | |
| 280 if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD)) | |
| 281 { | |
| 282 /* Use the wrap-around trick. */ | |
| 283 mp_size_t m, wn; | |
| 284 int k; | |
| 285 | |
| 286 k = mpn_fft_best_k (dn + 1, 0); | |
| 287 m = mpn_fft_next_size (dn + 1, k); | |
| 288 wn = dn + in - m; /* number of wrapped limbs */ | |
| 289 | |
| 290 mpn_mul_fft (tp, m, dp, dn, qp, in, k); | |
| 291 | |
| 292 if (wn > 0) | |
| 293 { | |
| 294 cy = mpn_add_n (tp, tp, rp + dn - wn, wn); | |
| 295 mpn_incr_u (tp + wn, cy); | |
| 296 | |
| 297 cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0; | |
| 298 mpn_decr_u (tp, cy); | |
| 299 } | |
| 300 } | |
| 301 else | |
| 302 #endif | |
| 303 mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancel
s */ | |
| 304 | |
| 305 r = rp[dn - in] - tp[dn]; | |
| 306 | |
| 307 /* Subtract the product from the partial remainder combined with new | |
| 308 limbs from the dividend N, generating a new partial remainder R. */ | |
| 309 if (dn != in) | |
| 310 { | |
| 311 cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */ | |
| 312 cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy); | |
| 313 MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */ | |
| 314 } | |
| 315 else | |
| 316 { | |
| 317 cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */ | |
| 318 } | |
| 319 | |
| 320 STAT (int i; int err = 0; | |
| 321 static int errarr[5]; static int err_rec; static int tot); | |
| 322 | |
| 323 /* Check the remainder R and adjust the quotient as needed. */ | |
| 324 r -= cy; | |
| 325 while (r != 0) | |
| 326 { | |
| 327 /* We loop 0 times with about 69% probability, 1 time with about 31% | |
| 328 probability, 2 times with about 0.6% probability, if inverse is | |
| 329 computed as recommended. */ | |
| 330 mpn_incr_u (qp, 1); | |
| 331 cy = mpn_sub_n (rp, rp, dp, dn); | |
| 332 r -= cy; | |
| 333 STAT (err++); | |
| 334 } | |
| 335 if (mpn_cmp (rp, dp, dn) >= 0) | |
| 336 { | |
| 337 /* This is executed with about 76% probability. */ | |
| 338 mpn_incr_u (qp, 1); | |
| 339 cy = mpn_sub_n (rp, rp, dp, dn); | |
| 340 STAT (err++); | |
| 341 } | |
| 342 | |
| 343 STAT ( | |
| 344 tot++; | |
| 345 errarr[err]++; | |
| 346 if (err > err_rec) | |
| 347 err_rec = err; | |
| 348 if (tot % 0x10000 == 0) | |
| 349 { | |
| 350 for (i = 0; i <= err_rec; i++) | |
| 351 printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot); | |
| 352 printf ("\n"); | |
| 353 } | |
| 354 ); | |
| 355 | |
| 356 qn -= in; | |
| 357 } | |
| 358 } | |
| 359 | |
| 360 #define THRES 100 /* FIXME: somewhat arbitrary */ | |
| 361 | |
| 362 #ifdef CHECK | |
| 363 #undef THRES | |
| 364 #define THRES 1 | |
| 365 #endif | |
| 366 | |
| 367 mp_limb_t | |
| 368 mpn_mu_div_qr (mp_ptr qp, | |
| 369 mp_ptr rp, | |
| 370 mp_ptr np, | |
| 371 mp_size_t nn, | |
| 372 mp_srcptr dp, | |
| 373 mp_size_t dn, | |
| 374 mp_ptr scratch) | |
| 375 { | |
| 376 mp_size_t qn; | |
| 377 | |
| 378 qn = nn - dn; | |
| 379 if (qn + THRES < dn) | |
| 380 { | |
| 381 /* |______________|________| dividend nn | |
| 382 |_______|________| divisor dn | |
| 383 | |
| 384 |______| quotient (prel) qn | |
| 385 | |
| 386 |_______________| quotient * ignored-part-of(divisor) dn-1 | |
| 387 */ | |
| 388 | |
| 389 mp_limb_t cy, x; | |
| 390 | |
| 391 if (mpn_cmp (np + nn - (qn + 1), dp + dn - (qn + 1), qn + 1) >= 0) | |
| 392 { | |
| 393 /* Quotient is 111...111, could optimize this rare case at some point.
*/ | |
| 394 mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch); | |
| 395 return 0; | |
| 396 } | |
| 397 | |
| 398 /* Compute a preliminary quotient and a partial remainder by dividing the | |
| 399 most significant limbs of each operand. */ | |
| 400 mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1), | |
| 401 np + nn - (2 * qn + 1), 2 * qn + 1, | |
| 402 dp + dn - (qn + 1), qn + 1, | |
| 403 scratch); | |
| 404 | |
| 405 /* Multiply the quotient by the divisor limbs ignored above. */ | |
| 406 if (dn - (qn + 1) > qn) | |
| 407 mpn_mul (scratch, dp, dn - (qn + 1), qp, qn); /* prod is dn-1 limbs */ | |
| 408 else | |
| 409 mpn_mul (scratch, qp, qn, dp, dn - (qn + 1)); /* prod is dn-1 limbs */ | |
| 410 | |
| 411 cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1)); | |
| 412 cy = mpn_sub_nc (rp + nn - (2 * qn + 1), | |
| 413 rp + nn - (2 * qn + 1), | |
| 414 scratch + nn - (2 * qn + 1), | |
| 415 qn, cy); | |
| 416 x = rp[dn - 1]; | |
| 417 rp[dn - 1] = x - cy; | |
| 418 if (cy > x) | |
| 419 { | |
| 420 mpn_decr_u (qp, 1); | |
| 421 mpn_add_n (rp, rp, dp, dn); | |
| 422 } | |
| 423 } | |
| 424 else | |
| 425 { | |
| 426 return mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch); | |
| 427 } | |
| 428 | |
| 429 return 0; /* FIXME */ | |
| 430 } | |
| 431 | |
| 432 mp_size_t | |
| 433 mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k) | |
| 434 { | |
| 435 mp_size_t qn, m; | |
| 436 int k; | |
| 437 | |
| 438 /* FIXME: This isn't very carefully written, and might grossly overestimate | |
| 439 the amount of scratch needed, and might perhaps also underestimate it, | |
| 440 leading to potential buffer overruns. In particular k=0 might lead to | |
| 441 gross overestimates. */ | |
| 442 | |
| 443 if (dn == 1) | |
| 444 return nn; | |
| 445 | |
| 446 qn = nn - dn; | |
| 447 if (qn >= dn) | |
| 448 { | |
| 449 k = mpn_fft_best_k (dn + 1, 0); | |
| 450 m = mpn_fft_next_size (dn + 1, k); | |
| 451 return (mua_k <= 1 | |
| 452 ? 6 * dn | |
| 453 : m + 2 * dn); | |
| 454 } | |
| 455 else | |
| 456 { | |
| 457 k = mpn_fft_best_k (dn + 1, 0); | |
| 458 m = mpn_fft_next_size (dn + 1, k); | |
| 459 return (mua_k <= 1 | |
| 460 ? m + 4 * qn | |
| 461 : m + 2 * qn); | |
| 462 } | |
| 463 } | |
| OLD | NEW |