OLD | NEW |
| (Empty) |
1 /* mpn_mu_div_qr, mpn_preinv_mu_div_qr. | |
2 | |
3 Compute Q = floor(N / D) and R = N-QD. N is nn limbs and D is dn limbs and | |
4 must be normalized, and Q must be nn-dn limbs. The requirement that Q is | |
5 nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to | |
6 let N be unmodified during the operation. | |
7 | |
8 Contributed to the GNU project by Torbjorn Granlund. | |
9 | |
10 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS | |
11 ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS | |
12 ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP | |
13 RELEASE. | |
14 | |
15 Copyright 2005, 2006, 2007 Free Software Foundation, Inc. | |
16 | |
17 This file is part of the GNU MP Library. | |
18 | |
19 The GNU MP Library is free software; you can redistribute it and/or modify | |
20 it under the terms of the GNU Lesser General Public License as published by | |
21 the Free Software Foundation; either version 3 of the License, or (at your | |
22 option) any later version. | |
23 | |
24 The GNU MP Library is distributed in the hope that it will be useful, but | |
25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
26 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
27 License for more details. | |
28 | |
29 You should have received a copy of the GNU Lesser General Public License | |
30 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ | |
31 | |
32 | |
33 /* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann | |
34 and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of | |
35 Jebelean's bidirectional exact division algorithm. | |
36 | |
37 The idea of this algorithm is to compute a smaller inverted value than used | |
38 in the standard Barrett algorithm, and thus save time in the Newton | |
39 iterations, and pay just a small price when using the inverted value for | |
40 developing quotient bits. | |
41 | |
42 Written by Torbjorn Granlund. Paul Zimmermann suggested the use of the | |
43 "wrap around" trick. Based on the GMP divexact code and inspired by code | |
44 contributed to GMP by Karl Hasselstroem. | |
45 */ | |
46 | |
47 | |
/* CAUTION: This code and the code in mu_divappr_q.c should be edited in lockstep.
49 | |
50 Things to work on: | |
51 | |
52 * Passing k isn't a great interface. Either 'in' should be passed, or | |
53 determined by the code. | |
54 | |
55 * The current mpn_mu_div_qr_itch isn't exactly scientifically written. | |
56 Scratch space buffer overruns are not unlikely before some analysis is | |
57 applied. Since scratch requirements are expected to change, such an | |
58 analysis will have to wait til things settle. | |
59 | |
60 * This isn't optimal when the remainder isn't needed, since the final | |
61 multiplication could be made special and take O(1) time on average, in that | |
62 case. This is particularly bad when qn << dn. At some level, code as in | |
63 GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading 2qn | |
64 dividend limbs by the qn divisor limbs. | |
65 | |
66 * This isn't optimal when the quotient isn't needed, as it might take a lot | |
67 of space. The computation is always needed, though, so there is not time | |
68 to save with special code. | |
69 | |
70 * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, | |
71 demonstrated by the fact that the mpn_inv function's scratch needs means | |
72 that we need to keep a large allocation long after it is needed. Things | |
73 are worse as mpn_mul_fft does not accept any scratch parameter, which means | |
74 we'll have a large memory hole while in mpn_mul_fft. In general, a peak | |
75 scratch need in the beginning of a function isn't well-handled by the | |
76 itch/scratch scheme. | |
77 | |
78 * Some ideas from comments in divexact.c apply to this code too. | |
79 */ | |
80 | |
/* the NOSTAT stuff handles properly the case where files are concatenated */
#ifdef NOSTAT
#undef STAT
#endif

/* STAT(x) expands to its argument when statistics gathering is compiled in
   (STAT defined at build time), and to nothing otherwise.  */
#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define NOSTAT
#define STAT(x)
#endif
93 | |
94 #include <stdlib.h> /* for NULL */ | |
95 #include "gmp.h" | |
96 #include "gmp-impl.h" | |
97 | |
98 | |
99 /* In case k=0 (automatic choice), we distinguish 3 cases: | |
100 (a) dn < qn: in = ceil(qn / ceil(qn/dn)) | |
101 (b) dn/3 < qn <= dn: in = ceil(qn / 2) | |
102 (c) qn < dn/3: in = qn | |
103 In all cases we have in <= dn. | |
104 */ | |
105 mp_size_t | |
106 mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k) | |
107 { | |
108 mp_size_t in; | |
109 | |
110 if (k == 0) | |
111 { | |
112 mp_size_t b; | |
113 if (qn > dn) | |
114 { | |
115 /* Compute an inverse size that is a nice partition of the quotient.
*/ | |
116 b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ | |
117 in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) *
/ | |
118 } | |
119 else if (3 * qn > dn) | |
120 { | |
121 in = (qn - 1) / 2 + 1; /* b = 2 */ | |
122 } | |
123 else | |
124 { | |
125 in = (qn - 1) / 1 + 1; /* b = 1 */ | |
126 } | |
127 } | |
128 else | |
129 { | |
130 mp_size_t xn; | |
131 xn = MIN (dn, qn); | |
132 in = (xn - 1) / k + 1; | |
133 } | |
134 | |
135 return in; | |
136 } | |
137 | |
/* Divide {np,nn} by the normalized divisor {dp,dn}, writing the nn-dn
   quotient limbs at qp and the dn remainder limbs at rp.  This builds an
   'in'-limb approximate inverse of the divisor's high limbs and then
   delegates to mpn_preinv_mu_div_qr.  N is left unmodified.  The return
   value is the high quotient limb; currently always 0 except in the
   dn == 1 case (see the disabled qh code below).  */
static mp_limb_t
mpn_mu_div_qr2 (mp_ptr qp,
		mp_ptr rp,
		mp_ptr np,
		mp_size_t nn,
		mp_srcptr dp,
		mp_size_t dn,
		mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy;
  mp_ptr ip, tp;

  /* FIXME: We should probably not handle tiny operands, but do it for now.  */
  if (dn == 1)
    {
      /* Single-limb divisor: schoolbook division into scratch, then split
	 the nn quotient limbs into qp plus the returned top limb.  */
      rp[0] = mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
      MPN_COPY (qp, scratch, nn - 1);
      return scratch[nn - 1];
    }

  qn = nn - dn;

  /* Compute the inverse size.  */
  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invert scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      /* Whole divisor fits: prepend a low limb of 1, invert the in+1 limb
	 value, then drop the low inverse limb to keep 'in' limbs.  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invert (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      /* Invert the top in+1 divisor limbs incremented by 1, so the result
	 is a lower bound on the true inverse.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	/* Increment overflowed (divisor top was all ones); the usable
	   inverse limbs are all zero.  */
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invert (ip, tp, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invert (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  /* Round the in+1 limb inverse down before keeping its high 'in' limbs.  */
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  /* We can't really handle qh = 1 like this since we'd here clobber N, which is
     not allowed in the way we've defined this function's API.  */
#if 0
  qh = mpn_cmp (np + qn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np + qn, np + qn, dp, dn);
#endif

  mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);

  /* return qh; */
  return 0;
}
228 | |
/* Divide {np,nn} by {dp,dn} given a precomputed 'in'-limb approximate
   inverse {ip,in} of the divisor's most significant limbs.  The nn-dn
   quotient limbs are written at qp and the dn remainder limbs at rp;
   N is read but never written.  Quotient limbs are developed 'in' at a
   time from the most significant end, Barrett style.  Scratch needs are
   not precisely analysed — see the comments at the top of this file.  */
void
mpn_preinv_mu_div_qr (mp_ptr qp,
		      mp_ptr rp,
		      mp_ptr np,
		      mp_size_t nn,
		      mp_srcptr dp,
		      mp_size_t dn,
		      mp_srcptr ip,
		      mp_size_t in,
		      mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;
  mp_limb_t r;

  qn = nn - dn;

  if (qn == 0)
    {
      /* No quotient limbs to develop; the dividend is the remainder.  */
      MPN_COPY (rp, np, dn);
      return;
    }

  tp = scratch;

  /* Point just above the limbs still to be consumed (np) and produced (qp);
     both are stepped down by 'in' limbs per iteration.  */
  np += qn;
  qp += qn;

  /* Initial partial remainder R = high dn limbs of N.  */
  MPN_COPY (rp, np, dn);

  while (qn > 0)
    {
      if (qn < in)
	{
	  /* Fewer than 'in' quotient limbs remain: shrink the working
	     inverse, keeping its most significant limbs.  */
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);				/* FIXME */

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
	{
	  /* Use the wrap-around trick.  */
	  mp_size_t m, wn;
	  int k;

	  k = mpn_fft_best_k (dn + 1, 0);
	  m = mpn_fft_next_size (dn + 1, k);
	  wn = dn + in - m;			/* number of wrapped limbs */

	  mpn_mul_fft (tp, m, dp, dn, qp, in, k);

	  if (wn > 0)
	    {
	      /* Compensate for the modular reduction performed by
		 mpn_mul_fft, which folded the top wn product limbs into
		 the low end.  NOTE(review): relies on mpn_mul_fft's
		 mod-(B^m+1) semantics — confirm against mul_fft.c.  */
	      cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
	      mpn_incr_u (tp + wn, cy);

	      cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
	      mpn_decr_u (tp, cy);
	    }
	}
      else
#endif
	mpn_mul (tp, dp, dn, qp, in);	/* dn+in limbs, high 'in' cancels */

      /* Borrow-tracking limb: difference at position dn before the block
	 subtraction below.  */
      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      /* Optional statistics: histogram of adjustment counts, dumped every
	 0x10000 quotient blocks.  */
      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );

      qn -= in;
    }
}
359 | |
/* Margin, in limbs, by which dn must exceed qn before the truncated-operand
   path in mpn_mu_div_qr is taken.  */
#define THRES 100		/* FIXME: somewhat arbitrary */

#ifdef CHECK
#undef THRES
#define THRES 1			/* exercise the truncated path in test builds */
#endif

/* Compute Q = floor(N / D) and R = N - Q*D, with N = {np,nn} and the
   normalized divisor D = {dp,dn}.  The nn-dn quotient limbs go to qp and
   the dn remainder limbs to rp; N is left unmodified.  Returns the high
   quotient limb — currently always 0 (see FIXME at the bottom).  */
mp_limb_t
mpn_mu_div_qr (mp_ptr qp,
	       mp_ptr rp,
	       mp_ptr np,
	       mp_size_t nn,
	       mp_srcptr dp,
	       mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;

  qn = nn - dn;
  if (qn + THRES < dn)
    {
      /* The quotient is much shorter than the divisor.  Divide using only
	 the operands' high limbs, then correct with the product of the
	 quotient and the ignored low divisor limbs:

	 |______________|________|	dividend			nn
		|_______|________|	divisor				dn

			  |______|	quotient (prel)			qn

	  |_______________|		quotient * ignored-part-of(divisor)  dn-1
      */

      mp_limb_t cy, x;

      if (mpn_cmp (np + nn - (qn + 1), dp + dn - (qn + 1), qn + 1) >= 0)
	{
	  /* Quotient is 111...111, could optimize this rare case at some point.  */
	  mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
	  return 0;
	}

      /* Compute a preliminary quotient and a partial remainder by dividing the
	 most significant limbs of each operand.  */
      mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
		      np + nn - (2 * qn + 1), 2 * qn + 1,
		      dp + dn - (qn + 1), qn + 1,
		      scratch);

      /* Multiply the quotient by the divisor limbs ignored above.  */
      if (dn - (qn + 1) > qn)
	mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
      else
	mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */

      /* Subtract the correction product from the untouched low dividend
	 limbs concatenated with the partial remainder.  */
      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
		       rp + nn - (2 * qn + 1),
		       scratch + nn - (2 * qn + 1),
		       qn, cy);
      x = rp[dn - 1];
      rp[dn - 1] = x - cy;
      if (cy > x)
	{
	  /* The subtraction borrowed out of R: the preliminary quotient was
	     one too large, so decrement it and add D back into R.  */
	  mpn_decr_u (qp, 1);
	  mpn_add_n (rp, rp, dp, dn);
	}
    }
  else
    {
      return mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
    }

  return 0;			/* FIXME */
}
431 | |
432 mp_size_t | |
433 mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k) | |
434 { | |
435 mp_size_t qn, m; | |
436 int k; | |
437 | |
438 /* FIXME: This isn't very carefully written, and might grossly overestimate | |
439 the amount of scratch needed, and might perhaps also underestimate it, | |
440 leading to potential buffer overruns. In particular k=0 might lead to | |
441 gross overestimates. */ | |
442 | |
443 if (dn == 1) | |
444 return nn; | |
445 | |
446 qn = nn - dn; | |
447 if (qn >= dn) | |
448 { | |
449 k = mpn_fft_best_k (dn + 1, 0); | |
450 m = mpn_fft_next_size (dn + 1, k); | |
451 return (mua_k <= 1 | |
452 ? 6 * dn | |
453 : m + 2 * dn); | |
454 } | |
455 else | |
456 { | |
457 k = mpn_fft_best_k (dn + 1, 0); | |
458 m = mpn_fft_next_size (dn + 1, k); | |
459 return (mua_k <= 1 | |
460 ? m + 4 * qn | |
461 : m + 2 * qn); | |
462 } | |
463 } | |
OLD | NEW |