Chromium Code Reviews

Side by Side Diff: gcc/gmp/mpn/generic/toom62_mul.c

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed)
Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 4 months ago
/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying Toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
  Things to work on:

  1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
     avoided by instead reusing the pp area and the scratch allocation.
*/

#include "gmp.h"
#include "gmp-impl.h"


/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf

  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
                             |_b1_|___b0_|
                             <-t--><--n-->

  v0   =    a0                         *  b0       # A(0)*B(0)
  v1   = (  a0+  a1+ a2+ a3+  a4+  a5) * ( b0+ b1) # A(1)*B(1)        ah <= 5,   bh <= 1
  vm1  = (  a0-  a1+ a2- a3+  a4-  a5) * ( b0- b1) # A(-1)*B(-1)     |ah| <= 2,  bh  = 0
  v2   = (  a0+ 2a1+4a2+8a3+16a4+32a5) * ( b0+2b1) # A(2)*B(2)        ah <= 62,  bh <= 2
  vh   = (32a0+16a1+8a2+4a3+ 2a4+  a5) * (2b0+ b1) # A(1/2)*B(1/2)    ah <= 62,  bh <= 2
  vmh  = (32a0-16a1+8a2-4a3+ 2a4-  a5) * (2b0- b1) # A(-1/2)*B(-1/2)  -20 <= ah <= 41,  0 <= bh <= 1
  vinf =    a5                         *        b1 # A(inf)*B(inf)
*/
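
/* The product C(x) = A(x)*B(x) has degree 6, so the seven values above
   determine its coefficients.  The half-point products come out scaled:
   vh = 2^6 * A(1/2)*B(1/2) and vmh = 2^6 * A(-1/2)*B(-1/2), since A's
   evaluation is multiplied through by 2^5 and B's by 2 to stay integral.  */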

void
mpn_toom62_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg, vmh_neg, bsm_neg;
  mp_limb_t cy;
  mp_ptr a0_a2, a1_a3;
  mp_ptr as1, asm1, as2, ash, asmh;
  mp_ptr bs1, bsm1, bs2, bsh, bsmh;
  enum toom4_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define a5  (ap + 5*n)
#define b0  bp
#define b1  (bp + n)

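  /* n = ceil(an/6) when the a operand dominates (an >= 3*bn), and
     n = ceil(bn/2) otherwise; a then splits into five full n-limb blocks
     plus a final block of s limbs, and b into one n-limb block plus a
     final block of t limbs.  */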
  n = 1 + (an >= 3 * bn ? (an - 1) / (unsigned long) 6 : (bn - 1) >> 1);

  s = an - 5 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);

  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);
  bsmh = TMP_SALLOC_LIMBS (n + 1);

  a0_a2 = pp;
  a1_a3 = pp + n + 1;

  /* Compute as1 and asm1.  */
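  /* a0_a2 accumulates the even-indexed blocks a0 + a2 + a4 and a1_a3 the
     odd-indexed blocks a1 + a3 + a5 (both reuse the pp area), so their sum
     is A(1) and their absolute difference is |A(-1)|, with the sign of
     A(-1) recorded in vm1_neg.  */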
  a0_a2[n]  = mpn_add_n (a0_a2, a0, a2, n);
  a0_a2[n] += mpn_add_n (a0_a2, a0_a2, a4, n);
  a1_a3[n]  = mpn_add_n (a1_a3, a1, a3, n);
  a1_a3[n] += mpn_add (a1_a3, a1_a3, n, a5, s);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, a0_a2, a1_a3, n + 1);
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_sub_n (asm1, a1_a3, a0_a2, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, a0_a2, a1_a3, n + 1);
      vm1_neg = 0;
    }
#endif

  /* Compute as2 = A(2), by Horner's rule from the most significant block:
     as2 = a0 + 2*(a1 + 2*(a2 + 2*(a3 + 2*(a4 + 2*a5)))).  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a4, a5, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a3, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_lshift (as2, a5, s, 1);
  cy += mpn_add_n (as2, a4, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a3, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a0, as2, n);
#endif
  as2[n] = cy;

  /* Compute ash and asmh.  */
#if HAVE_NATIVE_mpn_addlsh_n
  cy = mpn_addlsh_n (a0_a2, a2, a0, n, 2);              /* 4a0 + a2 */
  cy = 4 * cy + mpn_addlsh_n (a0_a2, a4, a0_a2, n, 2);  /* 16a0 + 4a2 + a4 */
  cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1);        /* 32a0 + 8a2 + 2a4 */
  a0_a2[n] = cy;
  cy = mpn_addlsh_n (a1_a3, a3, a1, n, 2);              /* 4a1 + a3 */
  cy = 4 * cy + mpn_addlsh_n (a1_a3, a5, a1_a3, n, 2);  /* 16a1 + 4a3 + a5 */
  a1_a3[n] = cy;
#else
  cy = mpn_lshift (a0_a2, a0, n, 2);                    /* 4a0 */
  cy += mpn_add_n (a0_a2, a2, a0_a2, n);                /* 4a0 + a2 */
  cy = 4 * cy + mpn_lshift (a0_a2, a0_a2, n, 2);        /* 16a0 + 4a2 */
  cy += mpn_add_n (a0_a2, a4, a0_a2, n);                /* 16a0 + 4a2 + a4 */
  cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1);        /* 32a0 + 8a2 + 2a4 */
  a0_a2[n] = cy;
  cy = mpn_lshift (a1_a3, a1, n, 2);                    /* 4a1 */
  cy += mpn_add_n (a1_a3, a3, a1_a3, n);                /* 4a1 + a3 */
  cy = 4 * cy + mpn_lshift (a1_a3, a1_a3, n, 2);        /* 16a1 + 4a3 */
  cy += mpn_add (a1_a3, a1_a3, n, a5, s);               /* 16a1 + 4a3 + a5 */
  a1_a3[n] = cy;
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, a1_a3, a0_a2, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_addsub_n (ash, asmh, a0_a2, a1_a3, n + 1);
      vmh_neg = 0;
    }
#else
  mpn_add_n (ash, a0_a2, a1_a3, n + 1);
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_sub_n (asmh, a1_a3, a0_a2, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_sub_n (asmh, a0_a2, a1_a3, n + 1);
      vmh_neg = 0;
    }
#endif

  /* Compute bs1 and bsm1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_addsub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
          bsm_neg = 1;
        }
      else
        {
          cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
          bsm_neg = 0;
        }
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);
      if (mpn_cmp (b0, b1, n) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, n);
          bsm_neg = 1;
        }
      else
        {
          mpn_sub_n (bsm1, b0, b1, n);
          bsm_neg = 0;
        }
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, t);
          MPN_ZERO (bsm1 + t, n - t);
          bsm_neg = 1;
        }
      else
        {
          mpn_sub (bsm1, b0, n, b1, t);
          bsm_neg = 0;
        }
    }

  vm1_neg ^= bsm_neg;

  /* Compute bs2, recycling bs1.  bs2 = bs1 + b1 = b0 + 2*b1.  */
  mpn_add (bs2, bs1, n + 1, b1, t);

  /* Compute bsh and bsmh, recycling bs1 and bsm1.  bsh = bs1 + b0;
     bsmh = b0 ± bsm1, with the sign taken from bsm_neg.  */
  if (bsm_neg == 1)
    {
      bsmh[n] = 0;
      if (mpn_cmp (bsm1, b0, n) < 0)
        {
          bsm_neg = 0;
          mpn_sub_n (bsmh, b0, bsm1, n);
        }
      else
        mpn_sub_n (bsmh, bsm1, b0, n);
    }
  else
    bsmh[n] = mpn_add_n (bsmh, bsm1, b0, n);
  mpn_add (bsh, bs1, n + 1, b0, n);
  vmh_neg ^= bsm_neg;


  ASSERT (as1[n] <= 5);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 2);
  /*ASSERT (bsm1[n] == 0);*/
  ASSERT (as2[n] <= 62);
  ASSERT (bs2[n] <= 2);
  ASSERT (ash[n] <= 62);
  ASSERT (bsh[n] <= 2);
  ASSERT (asmh[n] <= 41);
  ASSERT (bsmh[n] <= 1);

#define v0    pp                            /* 2n */
#define v1    (scratch + 6 * n + 6)         /* 2n+1 */
#define vinf  (pp + 6 * n)                  /* s+t */
#define vm1   scratch                       /* 2n+1 */
#define v2    (scratch + 2 * n + 2)         /* 2n+1 */
#define vh    (pp + 2 * n)                  /* 2n+1 */
#define vmh   (scratch + 4 * n + 4)         /* 2n+1 */

  /* vm1, 2n+1 limbs */
  mpn_mul_n (vm1, asm1, bsm1, n);
  cy = 0;
  if (asm1[n] == 1)
    {
      cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  vm1[2 * n] = cy;

  mpn_mul_n (v2, as2, bs2, n + 1);          /* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)
    mpn_mul (vinf, a5, s, b1, t);
  else
    mpn_mul (vinf, b1, t, a5, s);

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  mpn_mul_n (vh, ash, bsh, n + 1);

  mpn_mul_n (vmh, asmh, bsmh, n + 1);

  mpn_mul_n (v0, ap, bp, n);                /* v0, 2n limbs */

  flags = vm1_neg ? toom4_w3_neg : 0;
  flags |= vmh_neg ? toom4_w1_neg : 0;

  mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t,
                             scratch + 8 * n + 8);

  TMP_FREE;
}
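
To sanity-check the evaluation identities in the header comment, here is a
small standalone program (an illustrative sketch, not part of GMP or of this
change) that verifies vh = 2^6 * C(1/2) for one set of small single-word
coefficients:

/* Hypothetical check program, separate from toom62_mul.c. */
#include <stdio.h>

int
main (void)
{
  long a[6] = { 3, 1, 4, 1, 5, 9 };  /* A(x) = 3 + x + 4x^2 + x^3 + 5x^4 + 9x^5 */
  long b[2] = { 2, 7 };              /* B(x) = 2 + 7x */
  long c[7] = { 0 };                 /* C(x) = A(x)*B(x), degree 6 */
  long vh, chk;
  int i, j;

  /* Schoolbook product of the two coefficient vectors. */
  for (i = 0; i < 6; i++)
    for (j = 0; j < 2; j++)
      c[i + j] += a[i] * b[j];

  /* vh exactly as defined in the header comment above. */
  vh = (32*a[0] + 16*a[1] + 8*a[2] + 4*a[3] + 2*a[4] + a[5]) * (2*b[0] + b[1]);

  /* 2^6 * C(1/2) = c0*2^6 + c1*2^5 + ... + c6, by Horner's rule. */
  chk = 0;
  for (i = 0; i <= 6; i++)
    chk = 2 * chk + c[i];

  printf ("vh = %ld, 2^6*C(1/2) = %ld\n", vh, chk);  /* both print 1837 */
  return vh != chk;
}

The other six products can be checked the same way; mpn_toom_interpolate_7pts
then recovers the seven coefficients of C from exactly these seven values.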