OLD | NEW |
1 /* crypto/bn/bn_asm.c */ | 1 /* crypto/bn/bn_asm.c */ |
2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | 2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) |
3 * All rights reserved. | 3 * All rights reserved. |
4 * | 4 * |
5 * This package is an SSL implementation written | 5 * This package is an SSL implementation written |
6 * by Eric Young (eay@cryptsoft.com). | 6 * by Eric Young (eay@cryptsoft.com). |
7 * The implementation was written so as to conform with Netscapes SSL. | 7 * The implementation was written so as to conform with Netscapes SSL. |
8 * | 8 * |
9 * This library is free for commercial and non-commercial use as long as | 9 * This library is free for commercial and non-commercial use as long as |
10 * the following conditions are aheared to. The following conditions | 10 * the following conditions are aheared to. The following conditions |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
68 | 68 |
69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) | 69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) |
70 | 70 |
71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | 71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
72 { | 72 { |
73 BN_ULONG c1=0; | 73 BN_ULONG c1=0; |
74 | 74 |
75 assert(num >= 0); | 75 assert(num >= 0); |
76 if (num <= 0) return(c1); | 76 if (num <= 0) return(c1); |
77 | 77 |
| 78 #ifndef OPENSSL_SMALL_FOOTPRINT |
78 while (num&~3) | 79 while (num&~3) |
79 { | 80 { |
80 mul_add(rp[0],ap[0],w,c1); | 81 mul_add(rp[0],ap[0],w,c1); |
81 mul_add(rp[1],ap[1],w,c1); | 82 mul_add(rp[1],ap[1],w,c1); |
82 mul_add(rp[2],ap[2],w,c1); | 83 mul_add(rp[2],ap[2],w,c1); |
83 mul_add(rp[3],ap[3],w,c1); | 84 mul_add(rp[3],ap[3],w,c1); |
84 ap+=4; rp+=4; num-=4; | 85 ap+=4; rp+=4; num-=4; |
85 } | 86 } |
86 » if (num) | 87 #endif |
| 88 » while (num) |
87 { | 89 { |
88 » » mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; | 90 » » mul_add(rp[0],ap[0],w,c1); |
89 » » mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; | 91 » » ap++; rp++; num--; |
90 » » mul_add(rp[2],ap[2],w,c1); return c1; | |
91 } | 92 } |
92 | 93 |
93 return(c1); | 94 return(c1); |
94 } | 95 } |
95 | 96 |
96 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | 97 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
97 { | 98 { |
98 BN_ULONG c1=0; | 99 BN_ULONG c1=0; |
99 | 100 |
100 assert(num >= 0); | 101 assert(num >= 0); |
101 if (num <= 0) return(c1); | 102 if (num <= 0) return(c1); |
102 | 103 |
| 104 #ifndef OPENSSL_SMALL_FOOTPRINT |
103 while (num&~3) | 105 while (num&~3) |
104 { | 106 { |
105 mul(rp[0],ap[0],w,c1); | 107 mul(rp[0],ap[0],w,c1); |
106 mul(rp[1],ap[1],w,c1); | 108 mul(rp[1],ap[1],w,c1); |
107 mul(rp[2],ap[2],w,c1); | 109 mul(rp[2],ap[2],w,c1); |
108 mul(rp[3],ap[3],w,c1); | 110 mul(rp[3],ap[3],w,c1); |
109 ap+=4; rp+=4; num-=4; | 111 ap+=4; rp+=4; num-=4; |
110 } | 112 } |
111 » if (num) | 113 #endif |
| 114 » while (num) |
112 { | 115 { |
113 » » mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; | 116 » » mul(rp[0],ap[0],w,c1); |
114 » » mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; | 117 » » ap++; rp++; num--; |
115 » » mul(rp[2],ap[2],w,c1); | |
116 } | 118 } |
117 return(c1); | 119 return(c1); |
118 } | 120 } |
119 | 121 |
120 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | 122 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
121 { | 123 { |
122 assert(n >= 0); | 124 assert(n >= 0); |
123 if (n <= 0) return; | 125 if (n <= 0) return; |
| 126 |
| 127 #ifndef OPENSSL_SMALL_FOOTPRINT |
124 while (n&~3) | 128 while (n&~3) |
125 { | 129 { |
126 sqr(r[0],r[1],a[0]); | 130 sqr(r[0],r[1],a[0]); |
127 sqr(r[2],r[3],a[1]); | 131 sqr(r[2],r[3],a[1]); |
128 sqr(r[4],r[5],a[2]); | 132 sqr(r[4],r[5],a[2]); |
129 sqr(r[6],r[7],a[3]); | 133 sqr(r[6],r[7],a[3]); |
130 a+=4; r+=8; n-=4; | 134 a+=4; r+=8; n-=4; |
131 } | 135 } |
132 » if (n) | 136 #endif |
| 137 » while (n) |
133 { | 138 { |
134 » » sqr(r[0],r[1],a[0]); if (--n == 0) return; | 139 » » sqr(r[0],r[1],a[0]); |
135 » » sqr(r[2],r[3],a[1]); if (--n == 0) return; | 140 » » a++; r+=2; n--; |
136 » » sqr(r[4],r[5],a[2]); | |
137 } | 141 } |
138 } | 142 } |
139 | 143 |
140 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ | 144 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ |
141 | 145 |
142 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | 146 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
143 { | 147 { |
144 BN_ULONG c=0; | 148 BN_ULONG c=0; |
145 BN_ULONG bl,bh; | 149 BN_ULONG bl,bh; |
146 | 150 |
147 assert(num >= 0); | 151 assert(num >= 0); |
148 if (num <= 0) return((BN_ULONG)0); | 152 if (num <= 0) return((BN_ULONG)0); |
149 | 153 |
150 bl=LBITS(w); | 154 bl=LBITS(w); |
151 bh=HBITS(w); | 155 bh=HBITS(w); |
152 | 156 |
153 » for (;;) | 157 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 158 » while (num&~3) |
154 { | 159 { |
155 mul_add(rp[0],ap[0],bl,bh,c); | 160 mul_add(rp[0],ap[0],bl,bh,c); |
156 if (--num == 0) break; | |
157 mul_add(rp[1],ap[1],bl,bh,c); | 161 mul_add(rp[1],ap[1],bl,bh,c); |
158 if (--num == 0) break; | |
159 mul_add(rp[2],ap[2],bl,bh,c); | 162 mul_add(rp[2],ap[2],bl,bh,c); |
160 if (--num == 0) break; | |
161 mul_add(rp[3],ap[3],bl,bh,c); | 163 mul_add(rp[3],ap[3],bl,bh,c); |
162 » » if (--num == 0) break; | 164 » » ap+=4; rp+=4; num-=4; |
163 » » ap+=4; | 165 » » } |
164 » » rp+=4; | 166 #endif |
| 167 » while (num) |
| 168 » » { |
| 169 » » mul_add(rp[0],ap[0],bl,bh,c); |
| 170 » » ap++; rp++; num--; |
165 } | 171 } |
166 return(c); | 172 return(c); |
167 } | 173 } |
168 | 174 |
169 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | 175 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
170 { | 176 { |
171 BN_ULONG carry=0; | 177 BN_ULONG carry=0; |
172 BN_ULONG bl,bh; | 178 BN_ULONG bl,bh; |
173 | 179 |
174 assert(num >= 0); | 180 assert(num >= 0); |
175 if (num <= 0) return((BN_ULONG)0); | 181 if (num <= 0) return((BN_ULONG)0); |
176 | 182 |
177 bl=LBITS(w); | 183 bl=LBITS(w); |
178 bh=HBITS(w); | 184 bh=HBITS(w); |
179 | 185 |
180 » for (;;) | 186 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 187 » while (num&~3) |
181 { | 188 { |
182 mul(rp[0],ap[0],bl,bh,carry); | 189 mul(rp[0],ap[0],bl,bh,carry); |
183 if (--num == 0) break; | |
184 mul(rp[1],ap[1],bl,bh,carry); | 190 mul(rp[1],ap[1],bl,bh,carry); |
185 if (--num == 0) break; | |
186 mul(rp[2],ap[2],bl,bh,carry); | 191 mul(rp[2],ap[2],bl,bh,carry); |
187 if (--num == 0) break; | |
188 mul(rp[3],ap[3],bl,bh,carry); | 192 mul(rp[3],ap[3],bl,bh,carry); |
189 » » if (--num == 0) break; | 193 » » ap+=4; rp+=4; num-=4; |
190 » » ap+=4; | 194 » » } |
191 » » rp+=4; | 195 #endif |
| 196 » while (num) |
| 197 » » { |
| 198 » » mul(rp[0],ap[0],bl,bh,carry); |
| 199 » » ap++; rp++; num--; |
192 } | 200 } |
193 return(carry); | 201 return(carry); |
194 } | 202 } |
195 | 203 |
196 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | 204 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
197 { | 205 { |
198 assert(n >= 0); | 206 assert(n >= 0); |
199 if (n <= 0) return; | 207 if (n <= 0) return; |
200 » for (;;) | 208 |
| 209 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 210 » while (n&~3) |
201 { | 211 { |
202 sqr64(r[0],r[1],a[0]); | 212 sqr64(r[0],r[1],a[0]); |
203 if (--n == 0) break; | |
204 | |
205 sqr64(r[2],r[3],a[1]); | 213 sqr64(r[2],r[3],a[1]); |
206 if (--n == 0) break; | |
207 | |
208 sqr64(r[4],r[5],a[2]); | 214 sqr64(r[4],r[5],a[2]); |
209 if (--n == 0) break; | |
210 | |
211 sqr64(r[6],r[7],a[3]); | 215 sqr64(r[6],r[7],a[3]); |
212 » » if (--n == 0) break; | 216 » » a+=4; r+=8; n-=4; |
213 | 217 » » } |
214 » » a+=4; | 218 #endif |
215 » » r+=8; | 219 » while (n) |
| 220 » » { |
| 221 » » sqr64(r[0],r[1],a[0]); |
| 222 » » a++; r+=2; n--; |
216 } | 223 } |
217 } | 224 } |
218 | 225 |
219 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ | 226 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ |
220 | 227 |
221 #if defined(BN_LLONG) && defined(BN_DIV2W) | 228 #if defined(BN_LLONG) && defined(BN_DIV2W) |
222 | 229 |
223 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | 230 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
224 { | 231 { |
225 return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d)); | 232 return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d)); |
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
296 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ | 303 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ |
297 | 304 |
298 #ifdef BN_LLONG | 305 #ifdef BN_LLONG |
299 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | 306 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) |
300 { | 307 { |
301 BN_ULLONG ll=0; | 308 BN_ULLONG ll=0; |
302 | 309 |
303 assert(n >= 0); | 310 assert(n >= 0); |
304 if (n <= 0) return((BN_ULONG)0); | 311 if (n <= 0) return((BN_ULONG)0); |
305 | 312 |
306 » for (;;) | 313 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 314 » while (n&~3) |
307 { | 315 { |
308 ll+=(BN_ULLONG)a[0]+b[0]; | 316 ll+=(BN_ULLONG)a[0]+b[0]; |
309 r[0]=(BN_ULONG)ll&BN_MASK2; | 317 r[0]=(BN_ULONG)ll&BN_MASK2; |
310 ll>>=BN_BITS2; | 318 ll>>=BN_BITS2; |
311 if (--n <= 0) break; | |
312 | |
313 ll+=(BN_ULLONG)a[1]+b[1]; | 319 ll+=(BN_ULLONG)a[1]+b[1]; |
314 r[1]=(BN_ULONG)ll&BN_MASK2; | 320 r[1]=(BN_ULONG)ll&BN_MASK2; |
315 ll>>=BN_BITS2; | 321 ll>>=BN_BITS2; |
316 if (--n <= 0) break; | |
317 | |
318 ll+=(BN_ULLONG)a[2]+b[2]; | 322 ll+=(BN_ULLONG)a[2]+b[2]; |
319 r[2]=(BN_ULONG)ll&BN_MASK2; | 323 r[2]=(BN_ULONG)ll&BN_MASK2; |
320 ll>>=BN_BITS2; | 324 ll>>=BN_BITS2; |
321 if (--n <= 0) break; | |
322 | |
323 ll+=(BN_ULLONG)a[3]+b[3]; | 325 ll+=(BN_ULLONG)a[3]+b[3]; |
324 r[3]=(BN_ULONG)ll&BN_MASK2; | 326 r[3]=(BN_ULONG)ll&BN_MASK2; |
325 ll>>=BN_BITS2; | 327 ll>>=BN_BITS2; |
326 » » if (--n <= 0) break; | 328 » » a+=4; b+=4; r+=4; n-=4; |
327 | 329 » » } |
328 » » a+=4; | 330 #endif |
329 » » b+=4; | 331 » while (n) |
330 » » r+=4; | 332 » » { |
| 333 » » ll+=(BN_ULLONG)a[0]+b[0]; |
| 334 » » r[0]=(BN_ULONG)ll&BN_MASK2; |
| 335 » » ll>>=BN_BITS2; |
| 336 » » a++; b++; r++; n--; |
331 } | 337 } |
332 return((BN_ULONG)ll); | 338 return((BN_ULONG)ll); |
333 } | 339 } |
334 #else /* !BN_LLONG */ | 340 #else /* !BN_LLONG */ |
335 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | 341 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) |
336 { | 342 { |
337 BN_ULONG c,l,t; | 343 BN_ULONG c,l,t; |
338 | 344 |
339 assert(n >= 0); | 345 assert(n >= 0); |
340 if (n <= 0) return((BN_ULONG)0); | 346 if (n <= 0) return((BN_ULONG)0); |
341 | 347 |
342 c=0; | 348 c=0; |
343 » for (;;) | 349 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 350 » while (n&~3) |
344 { | 351 { |
345 t=a[0]; | 352 t=a[0]; |
346 t=(t+c)&BN_MASK2; | 353 t=(t+c)&BN_MASK2; |
347 c=(t < c); | 354 c=(t < c); |
348 l=(t+b[0])&BN_MASK2; | 355 l=(t+b[0])&BN_MASK2; |
349 c+=(l < t); | 356 c+=(l < t); |
350 r[0]=l; | 357 r[0]=l; |
351 if (--n <= 0) break; | |
352 | |
353 t=a[1]; | 358 t=a[1]; |
354 t=(t+c)&BN_MASK2; | 359 t=(t+c)&BN_MASK2; |
355 c=(t < c); | 360 c=(t < c); |
356 l=(t+b[1])&BN_MASK2; | 361 l=(t+b[1])&BN_MASK2; |
357 c+=(l < t); | 362 c+=(l < t); |
358 r[1]=l; | 363 r[1]=l; |
359 if (--n <= 0) break; | |
360 | |
361 t=a[2]; | 364 t=a[2]; |
362 t=(t+c)&BN_MASK2; | 365 t=(t+c)&BN_MASK2; |
363 c=(t < c); | 366 c=(t < c); |
364 l=(t+b[2])&BN_MASK2; | 367 l=(t+b[2])&BN_MASK2; |
365 c+=(l < t); | 368 c+=(l < t); |
366 r[2]=l; | 369 r[2]=l; |
367 if (--n <= 0) break; | |
368 | |
369 t=a[3]; | 370 t=a[3]; |
370 t=(t+c)&BN_MASK2; | 371 t=(t+c)&BN_MASK2; |
371 c=(t < c); | 372 c=(t < c); |
372 l=(t+b[3])&BN_MASK2; | 373 l=(t+b[3])&BN_MASK2; |
373 c+=(l < t); | 374 c+=(l < t); |
374 r[3]=l; | 375 r[3]=l; |
375 » » if (--n <= 0) break; | 376 » » a+=4; b+=4; r+=4; n-=4; |
376 | 377 » » } |
377 » » a+=4; | 378 #endif |
378 » » b+=4; | 379 » while(n) |
379 » » r+=4; | 380 » » { |
| 381 » » t=a[0]; |
| 382 » » t=(t+c)&BN_MASK2; |
| 383 » » c=(t < c); |
| 384 » » l=(t+b[0])&BN_MASK2; |
| 385 » » c+=(l < t); |
| 386 » » r[0]=l; |
| 387 » » a++; b++; r++; n--; |
380 } | 388 } |
381 return((BN_ULONG)c); | 389 return((BN_ULONG)c); |
382 } | 390 } |
383 #endif /* !BN_LLONG */ | 391 #endif /* !BN_LLONG */ |
384 | 392 |
385 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | 393 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) |
386 { | 394 { |
387 BN_ULONG t1,t2; | 395 BN_ULONG t1,t2; |
388 int c=0; | 396 int c=0; |
389 | 397 |
390 assert(n >= 0); | 398 assert(n >= 0); |
391 if (n <= 0) return((BN_ULONG)0); | 399 if (n <= 0) return((BN_ULONG)0); |
392 | 400 |
393 » for (;;) | 401 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 402 » while (n&~3) |
394 { | 403 { |
395 t1=a[0]; t2=b[0]; | 404 t1=a[0]; t2=b[0]; |
396 r[0]=(t1-t2-c)&BN_MASK2; | 405 r[0]=(t1-t2-c)&BN_MASK2; |
397 if (t1 != t2) c=(t1 < t2); | 406 if (t1 != t2) c=(t1 < t2); |
398 if (--n <= 0) break; | |
399 | |
400 t1=a[1]; t2=b[1]; | 407 t1=a[1]; t2=b[1]; |
401 r[1]=(t1-t2-c)&BN_MASK2; | 408 r[1]=(t1-t2-c)&BN_MASK2; |
402 if (t1 != t2) c=(t1 < t2); | 409 if (t1 != t2) c=(t1 < t2); |
403 if (--n <= 0) break; | |
404 | |
405 t1=a[2]; t2=b[2]; | 410 t1=a[2]; t2=b[2]; |
406 r[2]=(t1-t2-c)&BN_MASK2; | 411 r[2]=(t1-t2-c)&BN_MASK2; |
407 if (t1 != t2) c=(t1 < t2); | 412 if (t1 != t2) c=(t1 < t2); |
408 if (--n <= 0) break; | |
409 | |
410 t1=a[3]; t2=b[3]; | 413 t1=a[3]; t2=b[3]; |
411 r[3]=(t1-t2-c)&BN_MASK2; | 414 r[3]=(t1-t2-c)&BN_MASK2; |
412 if (t1 != t2) c=(t1 < t2); | 415 if (t1 != t2) c=(t1 < t2); |
413 » » if (--n <= 0) break; | 416 » » a+=4; b+=4; r+=4; n-=4; |
414 | 417 » » } |
415 » » a+=4; | 418 #endif |
416 » » b+=4; | 419 » while (n) |
417 » » r+=4; | 420 » » { |
| 421 » » t1=a[0]; t2=b[0]; |
| 422 » » r[0]=(t1-t2-c)&BN_MASK2; |
| 423 » » if (t1 != t2) c=(t1 < t2); |
| 424 » » a++; b++; r++; n--; |
418 } | 425 } |
419 return(c); | 426 return(c); |
420 } | 427 } |
421 | 428 |
422 #ifdef BN_MUL_COMBA | 429 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) |
423 | 430 |
424 #undef bn_mul_comba8 | 431 #undef bn_mul_comba8 |
425 #undef bn_mul_comba4 | 432 #undef bn_mul_comba4 |
426 #undef bn_sqr_comba8 | 433 #undef bn_sqr_comba8 |
427 #undef bn_sqr_comba4 | 434 #undef bn_sqr_comba4 |
428 | 435 |
429 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ | 436 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
430 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ | 437 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
431 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ | 438 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
432 /* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
*/ | 439 /* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
*/ |
(...skipping 380 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
813 sqr_add_c2(a,3,1,c2,c3,c1); | 820 sqr_add_c2(a,3,1,c2,c3,c1); |
814 r[4]=c2; | 821 r[4]=c2; |
815 c2=0; | 822 c2=0; |
816 sqr_add_c2(a,3,2,c3,c1,c2); | 823 sqr_add_c2(a,3,2,c3,c1,c2); |
817 r[5]=c3; | 824 r[5]=c3; |
818 c3=0; | 825 c3=0; |
819 sqr_add_c(a,3,c1,c2,c3); | 826 sqr_add_c(a,3,c1,c2,c3); |
820 r[6]=c1; | 827 r[6]=c1; |
821 r[7]=c2; | 828 r[7]=c2; |
822 } | 829 } |
| 830 |
| 831 #ifdef OPENSSL_NO_ASM |
| 832 #ifdef OPENSSL_BN_ASM_MONT |
| 833 #include <alloca.h> |
| 834 /* |
| 835 * This is essentially a reference implementation, which may or may not |
| 836 * result in a performance improvement. E.g. on IA-32 this routine was |
| 837 * observed to give 40% faster rsa1024 private key operations and 10% |
| 838 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only |
| 839 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a |
| 840 * reference implementation, one to be used as a starting point for |
| 841 * platform-specific assembler. The numbers above apply to compiler |
| 842 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and |
| 843 * can vary not only from platform to platform, but even between compiler |
| 844 * versions. Assembler vs. assembler improvement coefficients can |
| 845 * [and are known to] differ and are to be documented elsewhere. |
| 846 */ |
| 847 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
LONG *np,const BN_ULONG *n0p, int num) |
| 848 { |
| 849 BN_ULONG c0,c1,ml,*tp,n0; |
| 850 #ifdef mul64 |
| 851 BN_ULONG mh; |
| 852 #endif |
| 853 volatile BN_ULONG *vp; |
| 854 int i=0,j; |
| 855 |
| 856 #if 0 /* template for platform-specific implementation */ |
| 857 if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num); |
| 858 #endif |
| 859 vp = tp = alloca((num+2)*sizeof(BN_ULONG)); |
| 860 |
| 861 n0 = *n0p; |
| 862 |
| 863 c0 = 0; |
| 864 ml = bp[0]; |
| 865 #ifdef mul64 |
| 866 mh = HBITS(ml); |
| 867 ml = LBITS(ml); |
| 868 for (j=0;j<num;++j) |
| 869 mul(tp[j],ap[j],ml,mh,c0); |
| 870 #else |
| 871 for (j=0;j<num;++j) |
| 872 mul(tp[j],ap[j],ml,c0); |
| 873 #endif |
| 874 |
| 875 tp[num] = c0; |
| 876 tp[num+1] = 0; |
| 877 goto enter; |
| 878 |
| 879 for(i=0;i<num;i++) |
| 880 { |
| 881 c0 = 0; |
| 882 ml = bp[i]; |
| 883 #ifdef mul64 |
| 884 mh = HBITS(ml); |
| 885 ml = LBITS(ml); |
| 886 for (j=0;j<num;++j) |
| 887 mul_add(tp[j],ap[j],ml,mh,c0); |
| 888 #else |
| 889 for (j=0;j<num;++j) |
| 890 mul_add(tp[j],ap[j],ml,c0); |
| 891 #endif |
| 892 c1 = (tp[num] + c0)&BN_MASK2; |
| 893 tp[num] = c1; |
| 894 tp[num+1] = (c1<c0?1:0); |
| 895 enter: |
| 896 c1 = tp[0]; |
| 897 ml = (c1*n0)&BN_MASK2; |
| 898 c0 = 0; |
| 899 #ifdef mul64 |
| 900 mh = HBITS(ml); |
| 901 ml = LBITS(ml); |
| 902 mul_add(c1,np[0],ml,mh,c0); |
| 903 #else |
| 904 mul_add(c1,ml,np[0],c0); |
| 905 #endif |
| 906 for(j=1;j<num;j++) |
| 907 { |
| 908 c1 = tp[j]; |
| 909 #ifdef mul64 |
| 910 mul_add(c1,np[j],ml,mh,c0); |
| 911 #else |
| 912 mul_add(c1,ml,np[j],c0); |
| 913 #endif |
| 914 tp[j-1] = c1&BN_MASK2; |
| 915 } |
| 916 c1 = (tp[num] + c0)&BN_MASK2; |
| 917 tp[num-1] = c1; |
| 918 tp[num] = tp[num+1] + (c1<c0?1:0); |
| 919 } |
| 920 |
| 921 if (tp[num]!=0 || tp[num-1]>=np[num-1]) |
| 922 { |
| 923 c0 = bn_sub_words(rp,tp,np,num); |
| 924 if (tp[num]!=0 || c0==0) |
| 925 { |
| 926 for(i=0;i<num+2;i++) vp[i] = 0; |
| 927 return 1; |
| 928 } |
| 929 } |
| 930 for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0; |
| 931 vp[num] = 0; |
| 932 vp[num+1] = 0; |
| 933 return 1; |
| 934 } |
| 935 #else |
| 936 /* |
| 937 * A return value of 0 indicates that multiplication/convolution was not |
| 938 * performed, signalling the caller to fall back to the alternative/original |
| 939 * code-path. |
| 940 */ |
| 941 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
LONG *np,const BN_ULONG *n0, int num) |
| 942 { return 0; } |
| 943 #endif /* OPENSSL_BN_ASM_MONT */ |
| 944 #endif |
| 945 |
823 #else /* !BN_MUL_COMBA */ | 946 #else /* !BN_MUL_COMBA */ |
824 | 947 |
825 /* hmm... is it faster just to do a multiply? */ | 948 /* hmm... is it faster just to do a multiply? */ |
826 #undef bn_sqr_comba4 | 949 #undef bn_sqr_comba4 |
827 void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | 950 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
828 { | 951 { |
829 BN_ULONG t[8]; | 952 BN_ULONG t[8]; |
830 bn_sqr_normal(r,a,4,t); | 953 bn_sqr_normal(r,a,4,t); |
831 } | 954 } |
832 | 955 |
833 #undef bn_sqr_comba8 | 956 #undef bn_sqr_comba8 |
834 void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | 957 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
835 { | 958 { |
836 BN_ULONG t[16]; | 959 BN_ULONG t[16]; |
837 bn_sqr_normal(r,a,8,t); | 960 bn_sqr_normal(r,a,8,t); |
838 } | 961 } |
839 | 962 |
840 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | 963 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
841 { | 964 { |
842 r[4]=bn_mul_words( &(r[0]),a,4,b[0]); | 965 r[4]=bn_mul_words( &(r[0]),a,4,b[0]); |
843 r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]); | 966 r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]); |
844 r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]); | 967 r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]); |
845 r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]); | 968 r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]); |
846 } | 969 } |
847 | 970 |
848 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | 971 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
849 { | 972 { |
850 r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); | 973 r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); |
851 r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); | 974 r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); |
852 r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); | 975 r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); |
853 r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); | 976 r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); |
854 r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); | 977 r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); |
855 r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); | 978 r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); |
856 r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); | 979 r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); |
857 r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); | 980 r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); |
858 } | 981 } |
859 | 982 |
| 983 #ifdef OPENSSL_NO_ASM |
| 984 #ifdef OPENSSL_BN_ASM_MONT |
| 985 #include <alloca.h> |
| 986 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
LONG *np,const BN_ULONG *n0p, int num) |
| 987 { |
| 988 BN_ULONG c0,c1,*tp,n0=*n0p; |
| 989 volatile BN_ULONG *vp; |
| 990 int i=0,j; |
| 991 |
| 992 vp = tp = alloca((num+2)*sizeof(BN_ULONG)); |
| 993 |
| 994 for(i=0;i<=num;i++) tp[i]=0; |
| 995 |
| 996 for(i=0;i<num;i++) |
| 997 { |
| 998 c0 = bn_mul_add_words(tp,ap,num,bp[i]); |
| 999 c1 = (tp[num] + c0)&BN_MASK2; |
| 1000 tp[num] = c1; |
| 1001 tp[num+1] = (c1<c0?1:0); |
| 1002 |
| 1003 c0 = bn_mul_add_words(tp,np,num,tp[0]*n0); |
| 1004 c1 = (tp[num] + c0)&BN_MASK2; |
| 1005 tp[num] = c1; |
| 1006 tp[num+1] += (c1<c0?1:0); |
| 1007 for(j=0;j<=num;j++) tp[j]=tp[j+1]; |
| 1008 } |
| 1009 |
| 1010 if (tp[num]!=0 || tp[num-1]>=np[num-1]) |
| 1011 { |
| 1012 c0 = bn_sub_words(rp,tp,np,num); |
| 1013 if (tp[num]!=0 || c0==0) |
| 1014 { |
| 1015 for(i=0;i<num+2;i++) vp[i] = 0; |
| 1016 return 1; |
| 1017 } |
| 1018 } |
| 1019 for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0; |
| 1020 vp[num] = 0; |
| 1021 vp[num+1] = 0; |
| 1022 return 1; |
| 1023 } |
| 1024 #else |
| 1025 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
LONG *np,const BN_ULONG *n0, int num) |
| 1026 { return 0; } |
| 1027 #endif /* OPENSSL_BN_ASM_MONT */ |
| 1028 #endif |
| 1029 |
860 #endif /* !BN_MUL_COMBA */ | 1030 #endif /* !BN_MUL_COMBA */ |
OLD | NEW |