/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying Toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
  Things to work on:

  1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
     avoided by instead reusing the pp area and the scratch allocation.
*/

#include "gmp.h"
#include "gmp-impl.h"


/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf

  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
                             |_b1_|___b0_|
                             <-t--><--n-->

  v0  =    a0                        *  b0      #   A(0)*B(0)
  v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1) #   A(1)*B(1)      ah <= 5     bh <= 1
  vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1) #  A(-1)*B(-1)    |ah| <= 2    bh  = 0
  v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) #   A(2)*B(2)      ah <= 62    bh <= 2
  vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1) # A(1/2)*B(1/2)    ah <= 62    bh <= 2
  vmh = (32a0-16a1+8a2-4a3+ 2a4-  a5)*(2b0- b1) # A(-1/2)*B(-1/2) -20<=ah<=41  0<=bh<=1
  vinf=                           a5 *      b1  # A(inf)*B(inf)
*/
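
/* A(x) has degree 5 and B(x) degree 1, so the product C(x) = A(x)*B(x) has
   degree 6 and is determined by the 7 point values above.  The evaluations
   at +-1/2 are premultiplied by powers of 2 (32 for A, 2 for B) so that all
   values stay integral; mpn_toom_interpolate_7pts recombines the 7 products
   into the coefficients of C at the end.  */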

void
mpn_toom62_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg, vmh_neg, bsm_neg;
  mp_limb_t cy;
  mp_ptr a0_a2, a1_a3;
  mp_ptr as1, asm1, as2, ash, asmh;
  mp_ptr bs1, bsm1, bs2, bsh, bsmh;
  enum toom4_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define a5  (ap + 5*n)
#define b0  bp
#define b1  (bp + n)

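  /* n is the block size: each of a0..a4 and b0 is n limbs, while the top
     blocks a5 and b1 hold the remaining s = an - 5*n and t = bn - n limbs.  */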
  n = 1 + (an >= 3 * bn ? (an - 1) / (unsigned long) 6 : (bn - 1) >> 1);

  s = an - 5 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1 = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2 = TMP_SALLOC_LIMBS (n + 1);
  ash = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);

  bs1 = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n);
  bs2 = TMP_SALLOC_LIMBS (n + 1);
  bsh = TMP_SALLOC_LIMBS (n + 1);
  bsmh = TMP_SALLOC_LIMBS (n + 1);

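  /* The low part of the product area pp is borrowed as workspace for the
     evaluation sums; a0_a2 and a1_a3 are fully consumed before any product
     is written into pp.  */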
  a0_a2 = pp;
  a1_a3 = pp + n + 1;

  /* Compute as1 and asm1.  */
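  /* a0_a2 accumulates the even blocks a0+a2+a4, a1_a3 the odd blocks
     a1+a3+a5, so that as1 = A(1) is their sum and asm1 = |A(-1)| their
     difference, with the sign recorded in vm1_neg.  */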
  a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n);
  a0_a2[n] += mpn_add_n (a0_a2, a0_a2, a4, n);
  a1_a3[n] = mpn_add_n (a1_a3, a1, a3, n);
  a1_a3[n] += mpn_add (a1_a3, a1_a3, n, a5, s);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, a0_a2, a1_a3, n + 1);
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_sub_n (asm1, a1_a3, a0_a2, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, a0_a2, a1_a3, n + 1);
      vm1_neg = 0;
    }
#endif

  /* Compute as2.  */
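  /* as2 = A(2) = a0 + 2*a1 + 4*a2 + 8*a3 + 16*a4 + 32*a5, evaluated by a
     Horner scheme from the most significant block downwards.  */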
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a4, a5, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a3, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_lshift (as2, a5, s, 1);
  cy += mpn_add_n (as2, a4, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a3, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a0, as2, n);
#endif
  as2[n] = cy;

  /* Compute ash and asmh.  */
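  /* ash = 32*A(1/2) = 32a0 + 16a1 + 8a2 + 4a3 + 2a4 + a5 and
     asmh = |32*A(-1/2)|, formed from the even part 32a0 + 8a2 + 2a4 (built
     in a0_a2) and the odd part 16a1 + 4a3 + a5 (built in a1_a3); the sign
     of A(-1/2) is recorded in vmh_neg.  */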
#if HAVE_NATIVE_mpn_addlsh_n
  cy = mpn_addlsh_n (a0_a2, a2, a0, n, 2);              /* 4a0 + a2 */
  cy = 4 * cy + mpn_addlsh_n (a0_a2, a4, a0_a2, n, 2);  /* 16a0 + 4a2 + a4 */
  cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1);        /* 32a0 + 8a2 + 2a4 */
  a0_a2[n] = cy;
  cy = mpn_addlsh_n (a1_a3, a3, a1, n, 2);              /* 4a1 + a3 */
  cy = 4 * cy + mpn_addlsh_n (a1_a3, a5, a1_a3, n, 2);  /* 16a1 + 4a3 + a5 */
  a1_a3[n] = cy;
#else
  cy = mpn_lshift (a0_a2, a0, n, 2);                    /* 4a0 */
  cy += mpn_add_n (a0_a2, a2, a0_a2, n);                /* 4a0 + a2 */
  cy = 4 * cy + mpn_lshift (a0_a2, a0_a2, n, 2);        /* 16a0 + 4a2 */
  cy += mpn_add_n (a0_a2, a4, a0_a2, n);                /* 16a0 + 4a2 + a4 */
  cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1);        /* 32a0 + 8a2 + 2a4 */
  a0_a2[n] = cy;
  cy = mpn_lshift (a1_a3, a1, n, 2);                    /* 4a1 */
  cy += mpn_add_n (a1_a3, a3, a1_a3, n);                /* 4a1 + a3 */
  cy = 4 * cy + mpn_lshift (a1_a3, a1_a3, n, 2);        /* 16a1 + 4a3 */
  cy += mpn_add (a1_a3, a1_a3, n, a5, s);               /* 16a1 + 4a3 + a5 */
  a1_a3[n] = cy;
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, a1_a3, a0_a2, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_addsub_n (ash, asmh, a0_a2, a1_a3, n + 1);
      vmh_neg = 0;
    }
#else
  mpn_add_n (ash, a0_a2, a1_a3, n + 1);
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_sub_n (asmh, a1_a3, a0_a2, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_sub_n (asmh, a0_a2, a1_a3, n + 1);
      vmh_neg = 0;
    }
#endif

  /* Compute bs1 and bsm1.  */
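  /* bs1 = b0 + b1 = B(1) and bsm1 = |b0 - b1| = |B(-1)|, with the sign of
     B(-1) recorded in bsm_neg.  When t < n, the high n-t limbs of b1 are
     treated as zero.  */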
  if (t == n)
    {
#if HAVE_NATIVE_mpn_addsub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
          bsm_neg = 1;
        }
      else
        {
          cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
          bsm_neg = 0;
        }
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);
      if (mpn_cmp (b0, b1, n) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, n);
          bsm_neg = 1;
        }
      else
        {
          mpn_sub_n (bsm1, b0, b1, n);
          bsm_neg = 0;
        }
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, t);
          MPN_ZERO (bsm1 + t, n - t);
          bsm_neg = 1;
        }
      else
        {
          mpn_sub (bsm1, b0, n, b1, t);
          bsm_neg = 0;
        }
    }

  vm1_neg ^= bsm_neg;

  /* Compute bs2, recycling bs1.  bs2=bs1+b1 */
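  /* bs2 = b0 + 2*b1 = B(2).  */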
  mpn_add (bs2, bs1, n + 1, b1, t);

  /* Compute bsh and bsmh, recycling bs1 and bsm1.  bsh=bs1+b0; bsmh=bsm1+b0 */
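  /* bsh = bs1 + b0 = 2*b0 + b1 = 2*B(1/2), and bsmh = |2*b0 - b1| =
     2*|B(-1/2)|, obtained by adding b0 to the signed value behind bsm1;
     bsm_neg is updated accordingly and then folded into vmh_neg.  */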
  if (bsm_neg == 1)
    {
      bsmh[n] = 0;
      if (mpn_cmp (bsm1, b0, n) < 0)
        {
          bsm_neg = 0;
          mpn_sub_n (bsmh, b0, bsm1, n);
        }
      else
        mpn_sub_n (bsmh, bsm1, b0, n);
    }
  else
    bsmh[n] = mpn_add_n (bsmh, bsm1, b0, n);
  mpn_add (bsh, bs1, n + 1, b0, n);
  vmh_neg ^= bsm_neg;


  ASSERT (as1[n] <= 5);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 2);
/*ASSERT (bsm1[n] == 0);*/
  ASSERT (as2[n] <= 62);
  ASSERT (bs2[n] <= 2);
  ASSERT (ash[n] <= 62);
  ASSERT (bsh[n] <= 2);
  ASSERT (asmh[n] <= 41);
  ASSERT (bsmh[n] <= 1);

#define v0    pp                          /* 2n */
#define v1    (scratch + 6 * n + 6)       /* 2n+1 */
#define vinf  (pp + 6 * n)                /* s+t */
#define vm1   scratch                     /* 2n+1 */
#define v2    (scratch + 2 * n + 2)       /* 2n+1 */
#define vh    (pp + 2 * n)                /* 2n+1 */
#define vmh   (scratch + 4 * n + 4)

  /* vm1, 2n+1 limbs */
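  /* bsm1 has no high limb, so only the high limb asm1[n] (at most 2) needs
     to be folded in, by adding asm1[n] * bsm1 at offset n.  */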
  mpn_mul_n (vm1, asm1, bsm1, n);
  cy = 0;
  if (asm1[n] == 1)
    {
      cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  vm1[2 * n] = cy;

  mpn_mul_n (v2, as2, bs2, n + 1);      /* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
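  /* mpn_mul requires its first operand to be at least as long as the second,
     hence the branch on s > t.  */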
  if (s > t)  mpn_mul (vinf, a5, s, b1, t);
  else        mpn_mul (vinf, b1, t, a5, s);

  /* v1, 2n+1 limbs */
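  /* The high limbs as1[n] (at most 5) and bs1[n] (at most 1) are folded in
     separately: first as1[n] * bs1 (plus the as1[n]*bs1[n] cross term), then
     bs1[n] * as1.  */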
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  mpn_mul_n (vh, ash, bsh, n + 1);

  mpn_mul_n (vmh, asmh, bsmh, n + 1);

  mpn_mul_n (v0, ap, bp, n);            /* v0, 2n limbs */

  flags = vm1_neg ? toom4_w3_neg : 0;
  flags |= vmh_neg ? toom4_w1_neg : 0;

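  /* Combine the seven point values (v0, vh and vinf live in pp; vm1, v2,
     vmh and v1 in scratch) into the an+bn limb product at pp.  The flags
     tell the interpolation routine the signs of vm1 and vmh.  */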
  mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t,
                             scratch + 8 * n + 8);

  TMP_FREE;
}