OLD | NEW |
| (Empty) |
1 /* mpn_mu_div_qr, mpn_preinv_mu_div_qr. | |
2 | |
3 Compute Q = floor(N / D) and R = N-QD. N is nn limbs and D is dn limbs and | |
4 must be normalized, and Q must be nn-dn limbs. The requirement that Q is | |
5 nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to | |
6 let N be unmodified during the operation. | |
7 | |
8 Contributed to the GNU project by Torbjorn Granlund. | |
9 | |
10 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS | |
11 ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS | |
12 ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP | |
13 RELEASE. | |
14 | |
15 Copyright 2005, 2006, 2007 Free Software Foundation, Inc. | |
16 | |
17 This file is part of the GNU MP Library. | |
18 | |
19 The GNU MP Library is free software; you can redistribute it and/or modify | |
20 it under the terms of the GNU Lesser General Public License as published by | |
21 the Free Software Foundation; either version 3 of the License, or (at your | |
22 option) any later version. | |
23 | |
24 The GNU MP Library is distributed in the hope that it will be useful, but | |
25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
26 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
27 License for more details. | |
28 | |
29 You should have received a copy of the GNU Lesser General Public License | |
30 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ | |
31 | |
32 | |
33 /* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann | |
34 and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of | |
35 Jebelean's bidirectional exact division algorithm. | |
36 | |
37 The idea of this algorithm is to compute a smaller inverted value than used | |
38 in the standard Barrett algorithm, and thus save time in the Newton | |
39 iterations, and pay just a small price when using the inverted value for | |
40 developing quotient bits. | |
41 | |
42 Written by Torbjorn Granlund. Paul Zimmermann suggested the use of the | |
43 "wrap around" trick. Based on the GMP divexact code and inspired by code | |
44 contributed to GMP by Karl Hasselstroem. | |
45 */ | |
46 | |
47 | |
/* CAUTION: This code and the code in mu_divappr_q.c should be edited in lockstep.
49 | |
50 Things to work on: | |
51 | |
52 * Passing k isn't a great interface. Either 'in' should be passed, or | |
53 determined by the code. | |
54 | |
55 * The current mpn_mu_div_qr_itch isn't exactly scientifically written. | |
56 Scratch space buffer overruns are not unlikely before some analysis is | |
57 applied. Since scratch requirements are expected to change, such an | |
58 analysis will have to wait til things settle. | |
59 | |
60 * This isn't optimal when the remainder isn't needed, since the final | |
61 multiplication could be made special and take O(1) time on average, in that | |
62 case. This is particularly bad when qn << dn. At some level, code as in | |
63 GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading 2qn | |
64 dividend limbs by the qn divisor limbs. | |
65 | |
66 * This isn't optimal when the quotient isn't needed, as it might take a lot | |
67 of space. The computation is always needed, though, so there is not time | |
68 to save with special code. | |
69 | |
70 * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, | |
71 demonstrated by the fact that the mpn_inv function's scratch needs means | |
72 that we need to keep a large allocation long after it is needed. Things | |
73 are worse as mpn_mul_fft does not accept any scratch parameter, which means | |
74 we'll have a large memory hole while in mpn_mul_fft. In general, a peak | |
75 scratch need in the beginning of a function isn't well-handled by the | |
76 itch/scratch scheme. | |
77 | |
78 * Some ideas from comments in divexact.c apply to this code too. | |
79 */ | |
80 | |
/* the NOSTAT stuff handles properly the case where files are concatenated */
#ifdef NOSTAT
#undef STAT
#endif

/* STAT(x) expands to its argument when statistics gathering is compiled in
   (STAT defined at build time), and to nothing otherwise.  */
#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define NOSTAT
#define STAT(x)
#endif
93 | |
94 #include <stdlib.h> /* for NULL */ | |
95 #include "gmp.h" | |
96 #include "gmp-impl.h" | |
97 | |
98 | |
99 /* In case k=0 (automatic choice), we distinguish 3 cases: | |
100 (a) dn < qn: in = ceil(qn / ceil(qn/dn)) | |
101 (b) dn/3 < qn <= dn: in = ceil(qn / 2) | |
102 (c) qn < dn/3: in = qn | |
103 In all cases we have in <= dn. | |
104 */ | |
105 mp_size_t | |
106 mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k) | |
107 { | |
108 mp_size_t in; | |
109 | |
110 if (k == 0) | |
111 { | |
112 mp_size_t b; | |
113 if (qn > dn) | |
114 { | |
115 /* Compute an inverse size that is a nice partition of the quotient.
*/ | |
116 b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ | |
117 in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) *
/ | |
118 } | |
119 else if (3 * qn > dn) | |
120 { | |
121 in = (qn - 1) / 2 + 1; /* b = 2 */ | |
122 } | |
123 else | |
124 { | |
125 in = (qn - 1) / 1 + 1; /* b = 1 */ | |
126 } | |
127 } | |
128 else | |
129 { | |
130 mp_size_t xn; | |
131 xn = MIN (dn, qn); | |
132 in = (xn - 1) / k + 1; | |
133 } | |
134 | |
135 return in; | |
136 } | |
137 | |
/* Divide {np,nn} by the normalized divisor {dp,dn}, writing the nn-dn
   quotient limbs at qp and the dn remainder limbs at rp.  This builds an
   'in'-limb approximate inverse of the divisor's high limbs and then
   delegates to mpn_preinv_mu_div_qr.  N is left unmodified.  The return
   value is the high quotient limb; currently always 0 except in the
   dn == 1 case (see the disabled qh code below).  */
static mp_limb_t
mpn_mu_div_qr2 (mp_ptr qp,
		mp_ptr rp,
		mp_ptr np,
		mp_size_t nn,
		mp_srcptr dp,
		mp_size_t dn,
		mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy;
  mp_ptr ip, tp;

  /* FIXME: We should probably not handle tiny operands, but do it for now.  */
  if (dn == 1)
    {
      /* Single-limb divisor: schoolbook division into scratch, then split
	 the nn quotient limbs into qp plus the returned top limb.  */
      rp[0] = mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
      MPN_COPY (qp, scratch, nn - 1);
      return scratch[nn - 1];
    }

  qn = nn - dn;

  /* Compute the inverse size.  */
  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invert scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      /* Whole divisor fits: prepend a low limb of 1, invert the in+1 limb
	 value, then drop the low inverse limb to keep 'in' limbs.  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invert (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      /* Invert the top in+1 divisor limbs incremented by 1, so the result
	 is a lower bound on the true inverse.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	/* Increment overflowed (divisor top was all ones); the usable
	   inverse limbs are all zero.  */
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invert (ip, tp, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invert (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  /* Round the in+1 limb inverse down before keeping its high 'in' limbs.  */
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  /* We can't really handle qh = 1 like this since we'd here clobber N, which is
     not allowed in the way we've defined this function's API.  */
#if 0
  qh = mpn_cmp (np + qn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np + qn, np + qn, dp, dn);
#endif

  mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);

  /* return qh; */
  return 0;
}
228 | |
/* Divide {np,nn} by {dp,dn} given a precomputed 'in'-limb approximate
   inverse {ip,in} of the divisor's most significant limbs.  The nn-dn
   quotient limbs are written at qp and the dn remainder limbs at rp;
   N is read but never written.  Quotient limbs are developed 'in' at a
   time from the most significant end, Barrett style.  Scratch needs are
   not precisely analysed — see the comments at the top of this file.  */
void
mpn_preinv_mu_div_qr (mp_ptr qp,
		      mp_ptr rp,
		      mp_ptr np,
		      mp_size_t nn,
		      mp_srcptr dp,
		      mp_size_t dn,
		      mp_srcptr ip,
		      mp_size_t in,
		      mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;
  mp_limb_t r;

  qn = nn - dn;

  if (qn == 0)
    {
      /* No quotient limbs to develop; the dividend is the remainder.  */
      MPN_COPY (rp, np, dn);
      return;
    }

  tp = scratch;

  /* Point just above the limbs still to be consumed (np) and produced (qp);
     both are stepped down by 'in' limbs per iteration.  */
  np += qn;
  qp += qn;

  /* Initial partial remainder R = high dn limbs of N.  */
  MPN_COPY (rp, np, dn);

  while (qn > 0)
    {
      if (qn < in)
	{
	  /* Fewer than 'in' quotient limbs remain: shrink the working
	     inverse, keeping its most significant limbs.  */
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);				/* FIXME */

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
	{
	  /* Use the wrap-around trick.  */
	  mp_size_t m, wn;
	  int k;

	  k = mpn_fft_best_k (dn + 1, 0);
	  m = mpn_fft_next_size (dn + 1, k);
	  wn = dn + in - m;			/* number of wrapped limbs */

	  mpn_mul_fft (tp, m, dp, dn, qp, in, k);

	  if (wn > 0)
	    {
	      /* Compensate for the modular reduction performed by
		 mpn_mul_fft, which folded the top wn product limbs into
		 the low end.  NOTE(review): relies on mpn_mul_fft's
		 mod-(B^m+1) semantics — confirm against mul_fft.c.  */
	      cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
	      mpn_incr_u (tp + wn, cy);

	      cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
	      mpn_decr_u (tp, cy);
	    }
	}
      else
#endif
	mpn_mul (tp, dp, dn, qp, in);	/* dn+in limbs, high 'in' cancels */

      /* Borrow-tracking limb: difference at position dn before the block
	 subtraction below.  */
      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      /* Optional statistics: histogram of adjustment counts, dumped every
	 0x10000 quotient blocks.  */
      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );

      qn -= in;
    }
}
359 | |
/* Margin, in limbs, by which dn must exceed qn before the truncated-operand
   path in mpn_mu_div_qr is taken.  */
#define THRES 100		/* FIXME: somewhat arbitrary */

#ifdef CHECK
#undef THRES
#define THRES 1			/* exercise the truncated path in test builds */
#endif

/* Compute Q = floor(N / D) and R = N - Q*D, with N = {np,nn} and the
   normalized divisor D = {dp,dn}.  The nn-dn quotient limbs go to qp and
   the dn remainder limbs to rp; N is left unmodified.  Returns the high
   quotient limb — currently always 0 (see FIXME at the bottom).  */
mp_limb_t
mpn_mu_div_qr (mp_ptr qp,
	       mp_ptr rp,
	       mp_ptr np,
	       mp_size_t nn,
	       mp_srcptr dp,
	       mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;

  qn = nn - dn;
  if (qn + THRES < dn)
    {
      /* The quotient is much shorter than the divisor.  Divide using only
	 the operands' high limbs, then correct with the product of the
	 quotient and the ignored low divisor limbs:

	 |______________|________|	dividend			nn
		|_______|________|	divisor				dn

			  |______|	quotient (prel)			qn

	  |_______________|		quotient * ignored-part-of(divisor)  dn-1
      */

      mp_limb_t cy, x;

      if (mpn_cmp (np + nn - (qn + 1), dp + dn - (qn + 1), qn + 1) >= 0)
	{
	  /* Quotient is 111...111, could optimize this rare case at some point.  */
	  mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
	  return 0;
	}

      /* Compute a preliminary quotient and a partial remainder by dividing the
	 most significant limbs of each operand.  */
      mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
		      np + nn - (2 * qn + 1), 2 * qn + 1,
		      dp + dn - (qn + 1), qn + 1,
		      scratch);

      /* Multiply the quotient by the divisor limbs ignored above.  */
      if (dn - (qn + 1) > qn)
	mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
      else
	mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */

      /* Subtract the correction product from the untouched low dividend
	 limbs concatenated with the partial remainder.  */
      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
		       rp + nn - (2 * qn + 1),
		       scratch + nn - (2 * qn + 1),
		       qn, cy);
      x = rp[dn - 1];
      rp[dn - 1] = x - cy;
      if (cy > x)
	{
	  /* The subtraction borrowed out of R: the preliminary quotient was
	     one too large, so decrement it and add D back into R.  */
	  mpn_decr_u (qp, 1);
	  mpn_add_n (rp, rp, dp, dn);
	}
    }
  else
    {
      return mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
    }

  return 0;			/* FIXME */
}
431 | |
432 mp_size_t | |
433 mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k) | |
434 { | |
435 mp_size_t qn, m; | |
436 int k; | |
437 | |
438 /* FIXME: This isn't very carefully written, and might grossly overestimate | |
439 the amount of scratch needed, and might perhaps also underestimate it, | |
440 leading to potential buffer overruns. In particular k=0 might lead to | |
441 gross overestimates. */ | |
442 | |
443 if (dn == 1) | |
444 return nn; | |
445 | |
446 qn = nn - dn; | |
447 if (qn >= dn) | |
448 { | |
449 k = mpn_fft_best_k (dn + 1, 0); | |
450 m = mpn_fft_next_size (dn + 1, k); | |
451 return (mua_k <= 1 | |
452 ? 6 * dn | |
453 : m + 2 * dn); | |
454 } | |
455 else | |
456 { | |
457 k = mpn_fft_best_k (dn + 1, 0); | |
458 m = mpn_fft_next_size (dn + 1, k); | |
459 return (mua_k <= 1 | |
460 ? m + 4 * qn | |
461 : m + 2 * qn); | |
462 } | |
463 } | |
OLD | NEW |