OLD | NEW |
| (Empty) |
1 /* crypto/bn/bn_asm.c */ | |
2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | |
3 * All rights reserved. | |
4 * | |
5 * This package is an SSL implementation written | |
6 * by Eric Young (eay@cryptsoft.com). | |
7 * The implementation was written so as to conform with Netscapes SSL. | |
8 * | |
9 * This library is free for commercial and non-commercial use as long as | |
10 * the following conditions are aheared to. The following conditions | |
11 * apply to all code found in this distribution, be it the RC4, RSA, | |
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation | |
13 * included with this distribution is covered by the same copyright terms | |
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com). | |
15 * | |
16 * Copyright remains Eric Young's, and as such any Copyright notices in | |
17 * the code are not to be removed. | |
18 * If this package is used in a product, Eric Young should be given attribution | |
19 * as the author of the parts of the library used. | |
20 * This can be in the form of a textual message at program startup or | |
21 * in documentation (online or textual) provided with the package. | |
22 * | |
23 * Redistribution and use in source and binary forms, with or without | |
24 * modification, are permitted provided that the following conditions | |
25 * are met: | |
26 * 1. Redistributions of source code must retain the copyright | |
27 * notice, this list of conditions and the following disclaimer. | |
28 * 2. Redistributions in binary form must reproduce the above copyright | |
29 * notice, this list of conditions and the following disclaimer in the | |
30 * documentation and/or other materials provided with the distribution. | |
31 * 3. All advertising materials mentioning features or use of this software | |
32 * must display the following acknowledgement: | |
33 * "This product includes cryptographic software written by | |
34 * Eric Young (eay@cryptsoft.com)" | |
35 * The word 'cryptographic' can be left out if the rouines from the library | |
36 * being used are not cryptographic related :-). | |
37 * 4. If you include any Windows specific code (or a derivative thereof) from | |
38 * the apps directory (application code) you must include an acknowledgement: | |
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | |
40 * | |
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | |
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
51 * SUCH DAMAGE. | |
52 * | |
53 * The licence and distribution terms for any publically available version or | |
54 * derivative of this code cannot be changed. i.e. this code cannot simply be | |
55 * copied and put under another distribution licence | |
56 * [including the GNU Public Licence.] | |
57 */ | |
58 | |
59 #ifndef BN_DEBUG | |
60 # undef NDEBUG /* avoid conflicting definitions */ | |
61 # define NDEBUG | |
62 #endif | |
63 | |
64 #include <stdio.h> | |
65 #include <assert.h> | |
66 #include "cryptlib.h" | |
67 #include "bn_lcl.h" | |
68 | |
69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) | |
70 | |
71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | |
72 { | |
73 BN_ULONG c1=0; | |
74 | |
75 assert(num >= 0); | |
76 if (num <= 0) return(c1); | |
77 | |
78 #ifndef OPENSSL_SMALL_FOOTPRINT | |
79 while (num&~3) | |
80 { | |
81 mul_add(rp[0],ap[0],w,c1); | |
82 mul_add(rp[1],ap[1],w,c1); | |
83 mul_add(rp[2],ap[2],w,c1); | |
84 mul_add(rp[3],ap[3],w,c1); | |
85 ap+=4; rp+=4; num-=4; | |
86 } | |
87 #endif | |
88 while (num) | |
89 { | |
90 mul_add(rp[0],ap[0],w,c1); | |
91 ap++; rp++; num--; | |
92 } | |
93 | |
94 return(c1); | |
95 } | |
96 | |
97 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | |
98 { | |
99 BN_ULONG c1=0; | |
100 | |
101 assert(num >= 0); | |
102 if (num <= 0) return(c1); | |
103 | |
104 #ifndef OPENSSL_SMALL_FOOTPRINT | |
105 while (num&~3) | |
106 { | |
107 mul(rp[0],ap[0],w,c1); | |
108 mul(rp[1],ap[1],w,c1); | |
109 mul(rp[2],ap[2],w,c1); | |
110 mul(rp[3],ap[3],w,c1); | |
111 ap+=4; rp+=4; num-=4; | |
112 } | |
113 #endif | |
114 while (num) | |
115 { | |
116 mul(rp[0],ap[0],w,c1); | |
117 ap++; rp++; num--; | |
118 } | |
119 return(c1); | |
120 } | |
121 | |
122 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | |
123 { | |
124 assert(n >= 0); | |
125 if (n <= 0) return; | |
126 | |
127 #ifndef OPENSSL_SMALL_FOOTPRINT | |
128 while (n&~3) | |
129 { | |
130 sqr(r[0],r[1],a[0]); | |
131 sqr(r[2],r[3],a[1]); | |
132 sqr(r[4],r[5],a[2]); | |
133 sqr(r[6],r[7],a[3]); | |
134 a+=4; r+=8; n-=4; | |
135 } | |
136 #endif | |
137 while (n) | |
138 { | |
139 sqr(r[0],r[1],a[0]); | |
140 a++; r+=2; n--; | |
141 } | |
142 } | |
143 | |
144 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ | |
145 | |
146 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | |
147 { | |
148 BN_ULONG c=0; | |
149 BN_ULONG bl,bh; | |
150 | |
151 assert(num >= 0); | |
152 if (num <= 0) return((BN_ULONG)0); | |
153 | |
154 bl=LBITS(w); | |
155 bh=HBITS(w); | |
156 | |
157 #ifndef OPENSSL_SMALL_FOOTPRINT | |
158 while (num&~3) | |
159 { | |
160 mul_add(rp[0],ap[0],bl,bh,c); | |
161 mul_add(rp[1],ap[1],bl,bh,c); | |
162 mul_add(rp[2],ap[2],bl,bh,c); | |
163 mul_add(rp[3],ap[3],bl,bh,c); | |
164 ap+=4; rp+=4; num-=4; | |
165 } | |
166 #endif | |
167 while (num) | |
168 { | |
169 mul_add(rp[0],ap[0],bl,bh,c); | |
170 ap++; rp++; num--; | |
171 } | |
172 return(c); | |
173 } | |
174 | |
175 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | |
176 { | |
177 BN_ULONG carry=0; | |
178 BN_ULONG bl,bh; | |
179 | |
180 assert(num >= 0); | |
181 if (num <= 0) return((BN_ULONG)0); | |
182 | |
183 bl=LBITS(w); | |
184 bh=HBITS(w); | |
185 | |
186 #ifndef OPENSSL_SMALL_FOOTPRINT | |
187 while (num&~3) | |
188 { | |
189 mul(rp[0],ap[0],bl,bh,carry); | |
190 mul(rp[1],ap[1],bl,bh,carry); | |
191 mul(rp[2],ap[2],bl,bh,carry); | |
192 mul(rp[3],ap[3],bl,bh,carry); | |
193 ap+=4; rp+=4; num-=4; | |
194 } | |
195 #endif | |
196 while (num) | |
197 { | |
198 mul(rp[0],ap[0],bl,bh,carry); | |
199 ap++; rp++; num--; | |
200 } | |
201 return(carry); | |
202 } | |
203 | |
204 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | |
205 { | |
206 assert(n >= 0); | |
207 if (n <= 0) return; | |
208 | |
209 #ifndef OPENSSL_SMALL_FOOTPRINT | |
210 while (n&~3) | |
211 { | |
212 sqr64(r[0],r[1],a[0]); | |
213 sqr64(r[2],r[3],a[1]); | |
214 sqr64(r[4],r[5],a[2]); | |
215 sqr64(r[6],r[7],a[3]); | |
216 a+=4; r+=8; n-=4; | |
217 } | |
218 #endif | |
219 while (n) | |
220 { | |
221 sqr64(r[0],r[1],a[0]); | |
222 a++; r+=2; n--; | |
223 } | |
224 } | |
225 | |
226 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ | |
227 | |
228 #if defined(BN_LLONG) && defined(BN_DIV2W) | |
229 | |
230 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | |
231 { | |
232 return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d)); | |
233 } | |
234 | |
235 #else | |
236 | |
237 /* Divide h,l by d and return the result. */ | |
238 /* I need to test this some more :-( */ | |
/*
 * Divide the two-word value (h,l) by d and return the one-word quotient.
 * Fallback for platforms without a double-width integer type: the quotient
 * is produced as two half-word "digits", each estimated from the top half
 * words and then corrected downwards (Knuth-style schoolbook division).
 * Returns BN_MASK2 (all-ones) on division by zero.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
	{
	BN_ULONG dh,dl,q,ret=0,th,tl,t;
	int i,count=2;	/* two half-word quotient digits are produced */

	if (d == 0) return(BN_MASK2);

	i=BN_num_bits_word(d);
	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));

	i=BN_BITS2-i;
	if (h >= d) h-=d;

	/* Normalize: shift d so its top bit is set, shifting (h,l) in step. */
	if (i)
		{
		d<<=i;
		h=(h<<i)|(l>>(BN_BITS2-i));
		l<<=i;
		}
	/* Split the normalized divisor into high/low half words. */
	dh=(d&BN_MASK2h)>>BN_BITS4;
	dl=(d&BN_MASK2l);
	for (;;)
		{
		/* Estimate the next quotient digit from the top half words. */
		if ((h>>BN_BITS4) == dh)
			q=BN_MASK2l;
		else
			q=h/dh;

		th=q*dh;
		tl=dl*q;
		/* Correct the over-estimate downwards (a few steps at most). */
		for (;;)
			{
			t=h-th;
			if ((t&BN_MASK2h) ||
				((tl) <= (
					(t<<BN_BITS4)|
					((l&BN_MASK2h)>>BN_BITS4))))
				break;
			q--;
			th-=dh;
			tl-=dl;
			}
		/* Subtract q*d, held as (th,tl), from the remainder (h,l). */
		t=(tl>>BN_BITS4);
		tl=(tl<<BN_BITS4)&BN_MASK2h;
		th+=t;

		if (l < tl) th++;
		l-=tl;
		if (h < th)
			{
			h+=d;	/* estimate was still one too high; add back */
			q--;
			}
		h-=th;

		if (--count == 0) break;

		/* Store digit one and shift the next half word of l into h. */
		ret=q<<BN_BITS4;
		h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
		l=(l&BN_MASK2l)<<BN_BITS4;
		}
	ret|=q;
	return(ret);
	}
303 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ | |
304 | |
305 #ifdef BN_LLONG | |
306 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | |
307 { | |
308 BN_ULLONG ll=0; | |
309 | |
310 assert(n >= 0); | |
311 if (n <= 0) return((BN_ULONG)0); | |
312 | |
313 #ifndef OPENSSL_SMALL_FOOTPRINT | |
314 while (n&~3) | |
315 { | |
316 ll+=(BN_ULLONG)a[0]+b[0]; | |
317 r[0]=(BN_ULONG)ll&BN_MASK2; | |
318 ll>>=BN_BITS2; | |
319 ll+=(BN_ULLONG)a[1]+b[1]; | |
320 r[1]=(BN_ULONG)ll&BN_MASK2; | |
321 ll>>=BN_BITS2; | |
322 ll+=(BN_ULLONG)a[2]+b[2]; | |
323 r[2]=(BN_ULONG)ll&BN_MASK2; | |
324 ll>>=BN_BITS2; | |
325 ll+=(BN_ULLONG)a[3]+b[3]; | |
326 r[3]=(BN_ULONG)ll&BN_MASK2; | |
327 ll>>=BN_BITS2; | |
328 a+=4; b+=4; r+=4; n-=4; | |
329 } | |
330 #endif | |
331 while (n) | |
332 { | |
333 ll+=(BN_ULLONG)a[0]+b[0]; | |
334 r[0]=(BN_ULONG)ll&BN_MASK2; | |
335 ll>>=BN_BITS2; | |
336 a++; b++; r++; n--; | |
337 } | |
338 return((BN_ULONG)ll); | |
339 } | |
340 #else /* !BN_LLONG */ | |
341 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | |
342 { | |
343 BN_ULONG c,l,t; | |
344 | |
345 assert(n >= 0); | |
346 if (n <= 0) return((BN_ULONG)0); | |
347 | |
348 c=0; | |
349 #ifndef OPENSSL_SMALL_FOOTPRINT | |
350 while (n&~3) | |
351 { | |
352 t=a[0]; | |
353 t=(t+c)&BN_MASK2; | |
354 c=(t < c); | |
355 l=(t+b[0])&BN_MASK2; | |
356 c+=(l < t); | |
357 r[0]=l; | |
358 t=a[1]; | |
359 t=(t+c)&BN_MASK2; | |
360 c=(t < c); | |
361 l=(t+b[1])&BN_MASK2; | |
362 c+=(l < t); | |
363 r[1]=l; | |
364 t=a[2]; | |
365 t=(t+c)&BN_MASK2; | |
366 c=(t < c); | |
367 l=(t+b[2])&BN_MASK2; | |
368 c+=(l < t); | |
369 r[2]=l; | |
370 t=a[3]; | |
371 t=(t+c)&BN_MASK2; | |
372 c=(t < c); | |
373 l=(t+b[3])&BN_MASK2; | |
374 c+=(l < t); | |
375 r[3]=l; | |
376 a+=4; b+=4; r+=4; n-=4; | |
377 } | |
378 #endif | |
379 while(n) | |
380 { | |
381 t=a[0]; | |
382 t=(t+c)&BN_MASK2; | |
383 c=(t < c); | |
384 l=(t+b[0])&BN_MASK2; | |
385 c+=(l < t); | |
386 r[0]=l; | |
387 a++; b++; r++; n--; | |
388 } | |
389 return((BN_ULONG)c); | |
390 } | |
391 #endif /* !BN_LLONG */ | |
392 | |
393 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | |
394 { | |
395 BN_ULONG t1,t2; | |
396 int c=0; | |
397 | |
398 assert(n >= 0); | |
399 if (n <= 0) return((BN_ULONG)0); | |
400 | |
401 #ifndef OPENSSL_SMALL_FOOTPRINT | |
402 while (n&~3) | |
403 { | |
404 t1=a[0]; t2=b[0]; | |
405 r[0]=(t1-t2-c)&BN_MASK2; | |
406 if (t1 != t2) c=(t1 < t2); | |
407 t1=a[1]; t2=b[1]; | |
408 r[1]=(t1-t2-c)&BN_MASK2; | |
409 if (t1 != t2) c=(t1 < t2); | |
410 t1=a[2]; t2=b[2]; | |
411 r[2]=(t1-t2-c)&BN_MASK2; | |
412 if (t1 != t2) c=(t1 < t2); | |
413 t1=a[3]; t2=b[3]; | |
414 r[3]=(t1-t2-c)&BN_MASK2; | |
415 if (t1 != t2) c=(t1 < t2); | |
416 a+=4; b+=4; r+=4; n-=4; | |
417 } | |
418 #endif | |
419 while (n) | |
420 { | |
421 t1=a[0]; t2=b[0]; | |
422 r[0]=(t1-t2-c)&BN_MASK2; | |
423 if (t1 != t2) c=(t1 < t2); | |
424 a++; b++; r++; n--; | |
425 } | |
426 return(c); | |
427 } | |
428 | |
429 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) | |
430 | |
431 #undef bn_mul_comba8 | |
432 #undef bn_mul_comba4 | |
433 #undef bn_sqr_comba8 | |
434 #undef bn_sqr_comba4 | |
435 | |
436 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ | |
437 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ | |
438 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ | |
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
440 | |
#ifdef BN_LLONG
/*
 * Double-word variants: form the full 2*BN_BITS2-bit product in a
 * BN_ULLONG and split it with Lw()/Hw().  The expansions use temporaries
 * t, t1, t2 (and tt for mul_add_c2) that the calling function declares.
 */
#define mul_add_c(a,b,c0,c1,c2) \
	t=(BN_ULLONG)a*b; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

/* Doubling (tt=2*t) happens in the double-word mask BN_MASK, with the
 * overflow bit folded straight into c2. */
#define mul_add_c2(a,b,c0,c1,c2) \
	t=(BN_ULLONG)a*b; \
	tt=(t+t)&BN_MASK; \
	if (tt < t) c2++; \
	t1=(BN_ULONG)Lw(tt); \
	t2=(BN_ULONG)Hw(tt); \
	c0=(c0+t1)&BN_MASK2; \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c(a,i,c0,c1,c2) \
	t=(BN_ULLONG)a[i]*a[i]; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_LOHI)

/* Variants for platforms with a combined low/high multiply primitive. */
#define mul_add_c(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b); \
	BN_UMULT_LOHI(t1,t2,ta,tb); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define mul_add_c2(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b),t0; \
	BN_UMULT_LOHI(t0,t1,ta,tb); \
	t2 = t1+t1; c2 += (t2<t1)?1:0; \
	t1 = t0+t0; t2 += (t1<t0)?1:0; \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define sqr_add_c(a,i,c0,c1,c2) { \
	BN_ULONG ta=(a)[i]; \
	BN_UMULT_LOHI(t1,t2,ta,ta); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)

/* Variants for platforms exposing only the high half of the product;
 * the low half comes from an ordinary truncating multiply. */
#define mul_add_c(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b); \
	t1 = ta * tb; \
	t2 = BN_UMULT_HIGH(ta,tb); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define mul_add_c2(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b),t0; \
	t1 = BN_UMULT_HIGH(ta,tb); \
	t0 = ta * tb; \
	t2 = t1+t1; c2 += (t2<t1)?1:0; \
	t1 = t0+t0; t2 += (t1<t0)?1:0; \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define sqr_add_c(a,i,c0,c1,c2) { \
	BN_ULONG ta=(a)[i]; \
	t1 = ta * ta; \
	t2 = BN_UMULT_HIGH(ta,ta); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#else /* !BN_LLONG */
/* Half-word fallback: products via mul64()/sqr64() on LBITS/HBITS
 * halves.  These expansions also use caller-declared bl,bh. */
#define mul_add_c(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define mul_add_c2(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	if (t2 & BN_TBIT) c2++; \
	t2=(t2+t2)&BN_MASK2; \
	if (t1 & BN_TBIT) t2++; \
	t1=(t1+t1)&BN_MASK2; \
	c0=(c0+t1)&BN_MASK2; \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c(a,i,c0,c1,c2) \
	sqr64(t1,t2,(a)[i]); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
#endif /* !BN_LLONG */
556 | |
/*
 * Full 16-word product r = a*b of two 8-word numbers, computed column by
 * column (Comba multiplication).  Column k sums every a[i]*b[j] with
 * i+j == k.  c1,c2,c3 form a rotating three-word accumulator: after a
 * column is stored, the freed word is zeroed and becomes the new top word.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
#ifdef BN_LLONG
	BN_ULLONG t;	/* scratch for the mul_add_c macros */
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	/* From here the column index exceeds 7, so i and j both stay >= 1. */
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;	/* top column cannot carry further */
	}
665 | |
/*
 * Full 8-word product r = a*b of two 4-word numbers, column by column
 * (Comba).  c1,c2,c3 are the rotating three-word column accumulator;
 * see bn_mul_comba8 for the scheme.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
#ifdef BN_LLONG
	BN_ULLONG t;	/* scratch for the mul_add_c macros */
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;	/* top column cannot carry further */
	}
710 | |
/*
 * 16-word square r = a^2 of an 8-word number, column by column (Comba).
 * Off-diagonal terms a[i]*a[j] (i!=j) appear twice in a square, so they
 * use the doubling macro sqr_add_c2; diagonal terms a[i]^2 use sqr_add_c.
 * c1,c2,c3 form the rotating three-word column accumulator.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
	{
#ifdef BN_LLONG
	BN_ULLONG t,tt;	/* scratch for the sqr/mul_add_c macros */
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;	/* top column cannot carry further */
	}
791 | |
/*
 * 8-word square r = a^2 of a 4-word number, column by column (Comba).
 * Off-diagonal terms occur twice and use sqr_add_c2; diagonal squares
 * use sqr_add_c.  See bn_sqr_comba8 for the accumulator scheme.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
	{
#ifdef BN_LLONG
	BN_ULLONG t,tt;	/* scratch for the sqr/mul_add_c macros */
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	r[7]=c2;	/* top column cannot carry further */
	}
830 | |
831 #ifdef OPENSSL_NO_ASM | |
832 #ifdef OPENSSL_BN_ASM_MONT | |
833 #include <alloca.h> | |
834 /* | |
835 * This is essentially reference implementation, which may or may not | |
836 * result in performance improvement. E.g. on IA-32 this routine was | |
837 * observed to give 40% faster rsa1024 private key operations and 10% | |
838 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only | |
839 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a | |
840 * reference implementation, one to be used as starting point for | |
841 * platform-specific assembler. Mentioned numbers apply to compiler | |
842 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and | |
843 * can vary not only from platform to platform, but even for compiler | |
844 * versions. Assembler vs. assembler improvement coefficients can | |
845 * [and are known to] differ and are to be documented elsewhere. | |
846 */ | |
/*
 * Word-level reference Montgomery multiplication:
 * rp = ap * bp * R^-1 mod np, where R = 2^(num*BN_BITS2) and *n0p is the
 * precomputed -np^-1 mod 2^BN_BITS2 word.  Always returns 1 (result was
 * produced; 0 would tell the caller to fall back to the generic path).
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
	{
	BN_ULONG c0,c1,ml,*tp,n0;
#ifdef mul64
	BN_ULONG mh;
#endif
	/* vp aliases tp as volatile so the final zeroization of the
	 * secret-dependent scratch buffer cannot be optimized away. */
	volatile BN_ULONG *vp;
	int i=0,j;

#if 0 /* template for platform-specific implementation */
	if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
	/* num product words plus one carry word plus one overflow word. */
	vp = tp = alloca((num+2)*sizeof(BN_ULONG));

	n0 = *n0p;

	/* Peeled first outer iteration (i==0): initialize tp = ap*bp[0]
	 * directly instead of zero-filling, then jump into the shared
	 * reduction code at the "enter" label. */
	c0 = 0;
	ml = bp[0];
#ifdef mul64
	mh = HBITS(ml);
	ml = LBITS(ml);
	for (j=0;j<num;++j)
		mul(tp[j],ap[j],ml,mh,c0);
#else
	for (j=0;j<num;++j)
		mul(tp[j],ap[j],ml,c0);
#endif

	tp[num] = c0;
	tp[num+1] = 0;
	goto enter;

	for(i=0;i<num;i++)
		{
		/* tp += ap * bp[i]; carry goes into the two top scratch words */
		c0 = 0;
		ml = bp[i];
#ifdef mul64
		mh = HBITS(ml);
		ml = LBITS(ml);
		for (j=0;j<num;++j)
			mul_add(tp[j],ap[j],ml,mh,c0);
#else
		for (j=0;j<num;++j)
			mul_add(tp[j],ap[j],ml,c0);
#endif
		c1 = (tp[num] + c0)&BN_MASK2;
		tp[num] = c1;
		tp[num+1] = (c1<c0?1:0);
	enter:
		c1  = tp[0];
		/* ml is the multiple of np that makes tp[0] vanish mod 2^BN_BITS2 */
		ml = (c1*n0)&BN_MASK2;
		c0 = 0;
#ifdef mul64
		mh = HBITS(ml);
		ml = LBITS(ml);
		mul_add(c1,np[0],ml,mh,c0);
#else
		mul_add(c1,ml,np[0],c0);
#endif
		/* tp[0] is now zero by construction, so the result is shifted
		 * down one word as it is rebuilt. */
		for(j=1;j<num;j++)
			{
			c1 = tp[j];
#ifdef mul64
			mul_add(c1,np[j],ml,mh,c0);
#else
			mul_add(c1,ml,np[j],c0);
#endif
			tp[j-1] = c1&BN_MASK2;
			}
		c1        = (tp[num] + c0)&BN_MASK2;
		tp[num-1] = c1;
		tp[num]   = tp[num+1] + (c1<c0?1:0);
		}

	/* Final reduction: try tp - np when the result may be >= np, and
	 * keep the subtracted copy unless the subtraction borrowed while
	 * the overflow word was zero. */
	if (tp[num]!=0 || tp[num-1]>=np[num-1])
		{
		c0 = bn_sub_words(rp,tp,np,num);
		if (tp[num]!=0 || c0==0)
			{
			for(i=0;i<num+2;i++) vp[i] = 0;
			return 1;
			}
		}
	/* Copy out and wipe the scratch buffer. */
	for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
	vp[num]   = 0;
	vp[num+1] = 0;
	return 1;
	}
935 #else | |
936 /* | |
937 * Return value of 0 indicates that multiplication/convolution was not | |
938 * performed to signal the caller to fall down to alternative/original | |
939 * code-path. | |
940 */ | |
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
/* Stub: 0 means "not performed" -- see the comment above. */
{ return 0; }
943 #endif /* OPENSSL_BN_ASM_MONT */ | |
944 #endif | |
945 | |
946 #else /* !BN_MUL_COMBA */ | |
947 | |
948 /* hmm... is it faster just to do a multiply? */ | |
949 #undef bn_sqr_comba4 | |
950 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) | |
951 { | |
952 BN_ULONG t[8]; | |
953 bn_sqr_normal(r,a,4,t); | |
954 } | |
955 | |
956 #undef bn_sqr_comba8 | |
957 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) | |
958 { | |
959 BN_ULONG t[16]; | |
960 bn_sqr_normal(r,a,8,t); | |
961 } | |
962 | |
963 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
964 { | |
965 r[4]=bn_mul_words( &(r[0]),a,4,b[0]); | |
966 r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]); | |
967 r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]); | |
968 r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]); | |
969 } | |
970 | |
971 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
972 { | |
973 r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); | |
974 r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); | |
975 r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); | |
976 r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); | |
977 r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); | |
978 r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); | |
979 r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); | |
980 r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); | |
981 } | |
982 | |
983 #ifdef OPENSSL_NO_ASM | |
984 #ifdef OPENSSL_BN_ASM_MONT | |
985 #include <alloca.h> | |
/*
 * Montgomery multiplication built on bn_mul_add_words():
 * rp = ap * bp * R^-1 mod np with R = 2^(num*BN_BITS2); *n0p holds the
 * precomputed -np^-1 mod 2^BN_BITS2 word.  Always returns 1.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
	{
	BN_ULONG c0,c1,*tp,n0=*n0p;
	/* vp aliases tp as volatile so the closing wipe of the
	 * secret-dependent scratch buffer is not optimized away. */
	volatile BN_ULONG *vp;
	int i=0,j;

	/* num product words plus one carry word plus one overflow word. */
	vp = tp = alloca((num+2)*sizeof(BN_ULONG));

	for(i=0;i<=num;i++) tp[i]=0;

	for(i=0;i<num;i++)
		{
		/* tp += ap * bp[i]; carry lands in the two top scratch words */
		c0 = bn_mul_add_words(tp,ap,num,bp[i]);
		c1 = (tp[num] + c0)&BN_MASK2;
		tp[num]   = c1;
		tp[num+1] = (c1<c0?1:0);

		/* Montgomery step: add the multiple of np that zeroes tp[0],
		 * then drop the low word by shifting everything down one. */
		c0 = bn_mul_add_words(tp,np,num,tp[0]*n0);
		c1 = (tp[num] + c0)&BN_MASK2;
		tp[num]   = c1;
		tp[num+1] += (c1<c0?1:0);
		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
		}

	/* Final reduction: try tp - np when the result may be >= np, and
	 * keep the subtracted copy unless the subtraction borrowed while
	 * the overflow word was zero. */
	if (tp[num]!=0 || tp[num-1]>=np[num-1])
		{
		c0 = bn_sub_words(rp,tp,np,num);
		if (tp[num]!=0 || c0==0)
			{
			for(i=0;i<num+2;i++)	vp[i] = 0;
			return 1;
			}
		}
	/* Copy out and wipe the scratch buffer. */
	for(i=0;i<num;i++)	rp[i] = tp[i], vp[i] = 0;
	vp[num]   = 0;
	vp[num+1] = 0;
	return 1;
	}
1024 #else | |
/* Stub: returning 0 signals that the multiplication was not performed,
 * so the caller falls back to the alternative/original code path. */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
{ return 0; }
1027 #endif /* OPENSSL_BN_ASM_MONT */ | |
1028 #endif | |
1029 | |
1030 #endif /* !BN_MUL_COMBA */ | |
OLD | NEW |