| OLD | NEW |
| 1 /* crypto/bn/bn_asm.c */ | 1 /* crypto/bn/bn_asm.c */ |
| 2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | 2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) |
| 3 * All rights reserved. | 3 * All rights reserved. |
| 4 * | 4 * |
| 5 * This package is an SSL implementation written | 5 * This package is an SSL implementation written |
| 6 * by Eric Young (eay@cryptsoft.com). | 6 * by Eric Young (eay@cryptsoft.com). |
| 7 * The implementation was written so as to conform with Netscapes SSL. | 7 * The implementation was written so as to conform with Netscapes SSL. |
| 8 * | 8 * |
| 9 * This library is free for commercial and non-commercial use as long as | 9 * This library is free for commercial and non-commercial use as long as |
| 10 * the following conditions are aheared to. The following conditions | 10 * the following conditions are aheared to. The following conditions |
| (...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 68 | 68 |
| 69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) | 69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) |
| 70 | 70 |
| 71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | 71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
| 72 { | 72 { |
| 73 BN_ULONG c1=0; | 73 BN_ULONG c1=0; |
| 74 | 74 |
| 75 assert(num >= 0); | 75 assert(num >= 0); |
| 76 if (num <= 0) return(c1); | 76 if (num <= 0) return(c1); |
| 77 | 77 |
| 78 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 78 while (num&~3) | 79 while (num&~3) |
| 79 { | 80 { |
| 80 mul_add(rp[0],ap[0],w,c1); | 81 mul_add(rp[0],ap[0],w,c1); |
| 81 mul_add(rp[1],ap[1],w,c1); | 82 mul_add(rp[1],ap[1],w,c1); |
| 82 mul_add(rp[2],ap[2],w,c1); | 83 mul_add(rp[2],ap[2],w,c1); |
| 83 mul_add(rp[3],ap[3],w,c1); | 84 mul_add(rp[3],ap[3],w,c1); |
| 84 ap+=4; rp+=4; num-=4; | 85 ap+=4; rp+=4; num-=4; |
| 85 } | 86 } |
| 86 » if (num) | 87 #endif |
| 88 » while (num) |
| 87 { | 89 { |
| 88 » » mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; | 90 » » mul_add(rp[0],ap[0],w,c1); |
| 89 » » mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; | 91 » » ap++; rp++; num--; |
| 90 » » mul_add(rp[2],ap[2],w,c1); return c1; | |
| 91 } | 92 } |
| 92 | 93 |
| 93 return(c1); | 94 return(c1); |
| 94 } | 95 } |
| 95 | 96 |
| 96 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | 97 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
| 97 { | 98 { |
| 98 BN_ULONG c1=0; | 99 BN_ULONG c1=0; |
| 99 | 100 |
| 100 assert(num >= 0); | 101 assert(num >= 0); |
| 101 if (num <= 0) return(c1); | 102 if (num <= 0) return(c1); |
| 102 | 103 |
| 104 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 103 while (num&~3) | 105 while (num&~3) |
| 104 { | 106 { |
| 105 mul(rp[0],ap[0],w,c1); | 107 mul(rp[0],ap[0],w,c1); |
| 106 mul(rp[1],ap[1],w,c1); | 108 mul(rp[1],ap[1],w,c1); |
| 107 mul(rp[2],ap[2],w,c1); | 109 mul(rp[2],ap[2],w,c1); |
| 108 mul(rp[3],ap[3],w,c1); | 110 mul(rp[3],ap[3],w,c1); |
| 109 ap+=4; rp+=4; num-=4; | 111 ap+=4; rp+=4; num-=4; |
| 110 } | 112 } |
| 111 » if (num) | 113 #endif |
| 114 » while (num) |
| 112 { | 115 { |
| 113 » » mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; | 116 » » mul(rp[0],ap[0],w,c1); |
| 114 » » mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; | 117 » » ap++; rp++; num--; |
| 115 » » mul(rp[2],ap[2],w,c1); | |
| 116 } | 118 } |
| 117 return(c1); | 119 return(c1); |
| 118 } | 120 } |
| 119 | 121 |
| 120 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | 122 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
| 121 { | 123 { |
| 122 assert(n >= 0); | 124 assert(n >= 0); |
| 123 if (n <= 0) return; | 125 if (n <= 0) return; |
| 126 |
| 127 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 124 while (n&~3) | 128 while (n&~3) |
| 125 { | 129 { |
| 126 sqr(r[0],r[1],a[0]); | 130 sqr(r[0],r[1],a[0]); |
| 127 sqr(r[2],r[3],a[1]); | 131 sqr(r[2],r[3],a[1]); |
| 128 sqr(r[4],r[5],a[2]); | 132 sqr(r[4],r[5],a[2]); |
| 129 sqr(r[6],r[7],a[3]); | 133 sqr(r[6],r[7],a[3]); |
| 130 a+=4; r+=8; n-=4; | 134 a+=4; r+=8; n-=4; |
| 131 } | 135 } |
| 132 » if (n) | 136 #endif |
| 137 » while (n) |
| 133 { | 138 { |
| 134 » » sqr(r[0],r[1],a[0]); if (--n == 0) return; | 139 » » sqr(r[0],r[1],a[0]); |
| 135 » » sqr(r[2],r[3],a[1]); if (--n == 0) return; | 140 » » a++; r+=2; n--; |
| 136 » » sqr(r[4],r[5],a[2]); | |
| 137 } | 141 } |
| 138 } | 142 } |
| 139 | 143 |
| 140 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ | 144 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ |
| 141 | 145 |
| 142 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | 146 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
| 143 { | 147 { |
| 144 BN_ULONG c=0; | 148 BN_ULONG c=0; |
| 145 BN_ULONG bl,bh; | 149 BN_ULONG bl,bh; |
| 146 | 150 |
| 147 assert(num >= 0); | 151 assert(num >= 0); |
| 148 if (num <= 0) return((BN_ULONG)0); | 152 if (num <= 0) return((BN_ULONG)0); |
| 149 | 153 |
| 150 bl=LBITS(w); | 154 bl=LBITS(w); |
| 151 bh=HBITS(w); | 155 bh=HBITS(w); |
| 152 | 156 |
| 153 » for (;;) | 157 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 158 » while (num&~3) |
| 154 { | 159 { |
| 155 mul_add(rp[0],ap[0],bl,bh,c); | 160 mul_add(rp[0],ap[0],bl,bh,c); |
| 156 if (--num == 0) break; | |
| 157 mul_add(rp[1],ap[1],bl,bh,c); | 161 mul_add(rp[1],ap[1],bl,bh,c); |
| 158 if (--num == 0) break; | |
| 159 mul_add(rp[2],ap[2],bl,bh,c); | 162 mul_add(rp[2],ap[2],bl,bh,c); |
| 160 if (--num == 0) break; | |
| 161 mul_add(rp[3],ap[3],bl,bh,c); | 163 mul_add(rp[3],ap[3],bl,bh,c); |
| 162 » » if (--num == 0) break; | 164 » » ap+=4; rp+=4; num-=4; |
| 163 » » ap+=4; | 165 » » } |
| 164 » » rp+=4; | 166 #endif |
| 167 » while (num) |
| 168 » » { |
| 169 » » mul_add(rp[0],ap[0],bl,bh,c); |
| 170 » » ap++; rp++; num--; |
| 165 } | 171 } |
| 166 return(c); | 172 return(c); |
| 167 } | 173 } |
| 168 | 174 |
| 169 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | 175 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
| 170 { | 176 { |
| 171 BN_ULONG carry=0; | 177 BN_ULONG carry=0; |
| 172 BN_ULONG bl,bh; | 178 BN_ULONG bl,bh; |
| 173 | 179 |
| 174 assert(num >= 0); | 180 assert(num >= 0); |
| 175 if (num <= 0) return((BN_ULONG)0); | 181 if (num <= 0) return((BN_ULONG)0); |
| 176 | 182 |
| 177 bl=LBITS(w); | 183 bl=LBITS(w); |
| 178 bh=HBITS(w); | 184 bh=HBITS(w); |
| 179 | 185 |
| 180 » for (;;) | 186 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 187 » while (num&~3) |
| 181 { | 188 { |
| 182 mul(rp[0],ap[0],bl,bh,carry); | 189 mul(rp[0],ap[0],bl,bh,carry); |
| 183 if (--num == 0) break; | |
| 184 mul(rp[1],ap[1],bl,bh,carry); | 190 mul(rp[1],ap[1],bl,bh,carry); |
| 185 if (--num == 0) break; | |
| 186 mul(rp[2],ap[2],bl,bh,carry); | 191 mul(rp[2],ap[2],bl,bh,carry); |
| 187 if (--num == 0) break; | |
| 188 mul(rp[3],ap[3],bl,bh,carry); | 192 mul(rp[3],ap[3],bl,bh,carry); |
| 189 » » if (--num == 0) break; | 193 » » ap+=4; rp+=4; num-=4; |
| 190 » » ap+=4; | 194 » » } |
| 191 » » rp+=4; | 195 #endif |
| 196 » while (num) |
| 197 » » { |
| 198 » » mul(rp[0],ap[0],bl,bh,carry); |
| 199 » » ap++; rp++; num--; |
| 192 } | 200 } |
| 193 return(carry); | 201 return(carry); |
| 194 } | 202 } |
| 195 | 203 |
| 196 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | 204 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
| 197 { | 205 { |
| 198 assert(n >= 0); | 206 assert(n >= 0); |
| 199 if (n <= 0) return; | 207 if (n <= 0) return; |
| 200 » for (;;) | 208 |
| 209 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 210 » while (n&~3) |
| 201 { | 211 { |
| 202 sqr64(r[0],r[1],a[0]); | 212 sqr64(r[0],r[1],a[0]); |
| 203 if (--n == 0) break; | |
| 204 | |
| 205 sqr64(r[2],r[3],a[1]); | 213 sqr64(r[2],r[3],a[1]); |
| 206 if (--n == 0) break; | |
| 207 | |
| 208 sqr64(r[4],r[5],a[2]); | 214 sqr64(r[4],r[5],a[2]); |
| 209 if (--n == 0) break; | |
| 210 | |
| 211 sqr64(r[6],r[7],a[3]); | 215 sqr64(r[6],r[7],a[3]); |
| 212 » » if (--n == 0) break; | 216 » » a+=4; r+=8; n-=4; |
| 213 | 217 » » } |
| 214 » » a+=4; | 218 #endif |
| 215 » » r+=8; | 219 » while (n) |
| 220 » » { |
| 221 » » sqr64(r[0],r[1],a[0]); |
| 222 » » a++; r+=2; n--; |
| 216 } | 223 } |
| 217 } | 224 } |
| 218 | 225 |
| 219 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ | 226 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ |
| 220 | 227 |
| 221 #if defined(BN_LLONG) && defined(BN_DIV2W) | 228 #if defined(BN_LLONG) && defined(BN_DIV2W) |
| 222 | 229 |
| 223 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | 230 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
| 224 { | 231 { |
| 225 return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d)); | 232 return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d)); |
| (...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 296 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ | 303 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ |
| 297 | 304 |
| 298 #ifdef BN_LLONG | 305 #ifdef BN_LLONG |
| 299 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | 306 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) |
| 300 { | 307 { |
| 301 BN_ULLONG ll=0; | 308 BN_ULLONG ll=0; |
| 302 | 309 |
| 303 assert(n >= 0); | 310 assert(n >= 0); |
| 304 if (n <= 0) return((BN_ULONG)0); | 311 if (n <= 0) return((BN_ULONG)0); |
| 305 | 312 |
| 306 » for (;;) | 313 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 314 » while (n&~3) |
| 307 { | 315 { |
| 308 ll+=(BN_ULLONG)a[0]+b[0]; | 316 ll+=(BN_ULLONG)a[0]+b[0]; |
| 309 r[0]=(BN_ULONG)ll&BN_MASK2; | 317 r[0]=(BN_ULONG)ll&BN_MASK2; |
| 310 ll>>=BN_BITS2; | 318 ll>>=BN_BITS2; |
| 311 if (--n <= 0) break; | |
| 312 | |
| 313 ll+=(BN_ULLONG)a[1]+b[1]; | 319 ll+=(BN_ULLONG)a[1]+b[1]; |
| 314 r[1]=(BN_ULONG)ll&BN_MASK2; | 320 r[1]=(BN_ULONG)ll&BN_MASK2; |
| 315 ll>>=BN_BITS2; | 321 ll>>=BN_BITS2; |
| 316 if (--n <= 0) break; | |
| 317 | |
| 318 ll+=(BN_ULLONG)a[2]+b[2]; | 322 ll+=(BN_ULLONG)a[2]+b[2]; |
| 319 r[2]=(BN_ULONG)ll&BN_MASK2; | 323 r[2]=(BN_ULONG)ll&BN_MASK2; |
| 320 ll>>=BN_BITS2; | 324 ll>>=BN_BITS2; |
| 321 if (--n <= 0) break; | |
| 322 | |
| 323 ll+=(BN_ULLONG)a[3]+b[3]; | 325 ll+=(BN_ULLONG)a[3]+b[3]; |
| 324 r[3]=(BN_ULONG)ll&BN_MASK2; | 326 r[3]=(BN_ULONG)ll&BN_MASK2; |
| 325 ll>>=BN_BITS2; | 327 ll>>=BN_BITS2; |
| 326 » » if (--n <= 0) break; | 328 » » a+=4; b+=4; r+=4; n-=4; |
| 327 | 329 » » } |
| 328 » » a+=4; | 330 #endif |
| 329 » » b+=4; | 331 » while (n) |
| 330 » » r+=4; | 332 » » { |
| 333 » » ll+=(BN_ULLONG)a[0]+b[0]; |
| 334 » » r[0]=(BN_ULONG)ll&BN_MASK2; |
| 335 » » ll>>=BN_BITS2; |
| 336 » » a++; b++; r++; n--; |
| 331 } | 337 } |
| 332 return((BN_ULONG)ll); | 338 return((BN_ULONG)ll); |
| 333 } | 339 } |
| 334 #else /* !BN_LLONG */ | 340 #else /* !BN_LLONG */ |
| 335 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | 341 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) |
| 336 { | 342 { |
| 337 BN_ULONG c,l,t; | 343 BN_ULONG c,l,t; |
| 338 | 344 |
| 339 assert(n >= 0); | 345 assert(n >= 0); |
| 340 if (n <= 0) return((BN_ULONG)0); | 346 if (n <= 0) return((BN_ULONG)0); |
| 341 | 347 |
| 342 c=0; | 348 c=0; |
| 343 » for (;;) | 349 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 350 » while (n&~3) |
| 344 { | 351 { |
| 345 t=a[0]; | 352 t=a[0]; |
| 346 t=(t+c)&BN_MASK2; | 353 t=(t+c)&BN_MASK2; |
| 347 c=(t < c); | 354 c=(t < c); |
| 348 l=(t+b[0])&BN_MASK2; | 355 l=(t+b[0])&BN_MASK2; |
| 349 c+=(l < t); | 356 c+=(l < t); |
| 350 r[0]=l; | 357 r[0]=l; |
| 351 if (--n <= 0) break; | |
| 352 | |
| 353 t=a[1]; | 358 t=a[1]; |
| 354 t=(t+c)&BN_MASK2; | 359 t=(t+c)&BN_MASK2; |
| 355 c=(t < c); | 360 c=(t < c); |
| 356 l=(t+b[1])&BN_MASK2; | 361 l=(t+b[1])&BN_MASK2; |
| 357 c+=(l < t); | 362 c+=(l < t); |
| 358 r[1]=l; | 363 r[1]=l; |
| 359 if (--n <= 0) break; | |
| 360 | |
| 361 t=a[2]; | 364 t=a[2]; |
| 362 t=(t+c)&BN_MASK2; | 365 t=(t+c)&BN_MASK2; |
| 363 c=(t < c); | 366 c=(t < c); |
| 364 l=(t+b[2])&BN_MASK2; | 367 l=(t+b[2])&BN_MASK2; |
| 365 c+=(l < t); | 368 c+=(l < t); |
| 366 r[2]=l; | 369 r[2]=l; |
| 367 if (--n <= 0) break; | |
| 368 | |
| 369 t=a[3]; | 370 t=a[3]; |
| 370 t=(t+c)&BN_MASK2; | 371 t=(t+c)&BN_MASK2; |
| 371 c=(t < c); | 372 c=(t < c); |
| 372 l=(t+b[3])&BN_MASK2; | 373 l=(t+b[3])&BN_MASK2; |
| 373 c+=(l < t); | 374 c+=(l < t); |
| 374 r[3]=l; | 375 r[3]=l; |
| 375 » » if (--n <= 0) break; | 376 » » a+=4; b+=4; r+=4; n-=4; |
| 376 | 377 » » } |
| 377 » » a+=4; | 378 #endif |
| 378 » » b+=4; | 379 » while(n) |
| 379 » » r+=4; | 380 » » { |
| 381 » » t=a[0]; |
| 382 » » t=(t+c)&BN_MASK2; |
| 383 » » c=(t < c); |
| 384 » » l=(t+b[0])&BN_MASK2; |
| 385 » » c+=(l < t); |
| 386 » » r[0]=l; |
| 387 » » a++; b++; r++; n--; |
| 380 } | 388 } |
| 381 return((BN_ULONG)c); | 389 return((BN_ULONG)c); |
| 382 } | 390 } |
| 383 #endif /* !BN_LLONG */ | 391 #endif /* !BN_LLONG */ |
| 384 | 392 |
| 385 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | 393 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) |
| 386 { | 394 { |
| 387 BN_ULONG t1,t2; | 395 BN_ULONG t1,t2; |
| 388 int c=0; | 396 int c=0; |
| 389 | 397 |
| 390 assert(n >= 0); | 398 assert(n >= 0); |
| 391 if (n <= 0) return((BN_ULONG)0); | 399 if (n <= 0) return((BN_ULONG)0); |
| 392 | 400 |
| 393 » for (;;) | 401 #ifndef OPENSSL_SMALL_FOOTPRINT |
| 402 » while (n&~3) |
| 394 { | 403 { |
| 395 t1=a[0]; t2=b[0]; | 404 t1=a[0]; t2=b[0]; |
| 396 r[0]=(t1-t2-c)&BN_MASK2; | 405 r[0]=(t1-t2-c)&BN_MASK2; |
| 397 if (t1 != t2) c=(t1 < t2); | 406 if (t1 != t2) c=(t1 < t2); |
| 398 if (--n <= 0) break; | |
| 399 | |
| 400 t1=a[1]; t2=b[1]; | 407 t1=a[1]; t2=b[1]; |
| 401 r[1]=(t1-t2-c)&BN_MASK2; | 408 r[1]=(t1-t2-c)&BN_MASK2; |
| 402 if (t1 != t2) c=(t1 < t2); | 409 if (t1 != t2) c=(t1 < t2); |
| 403 if (--n <= 0) break; | |
| 404 | |
| 405 t1=a[2]; t2=b[2]; | 410 t1=a[2]; t2=b[2]; |
| 406 r[2]=(t1-t2-c)&BN_MASK2; | 411 r[2]=(t1-t2-c)&BN_MASK2; |
| 407 if (t1 != t2) c=(t1 < t2); | 412 if (t1 != t2) c=(t1 < t2); |
| 408 if (--n <= 0) break; | |
| 409 | |
| 410 t1=a[3]; t2=b[3]; | 413 t1=a[3]; t2=b[3]; |
| 411 r[3]=(t1-t2-c)&BN_MASK2; | 414 r[3]=(t1-t2-c)&BN_MASK2; |
| 412 if (t1 != t2) c=(t1 < t2); | 415 if (t1 != t2) c=(t1 < t2); |
| 413 » » if (--n <= 0) break; | 416 » » a+=4; b+=4; r+=4; n-=4; |
| 414 | 417 » » } |
| 415 » » a+=4; | 418 #endif |
| 416 » » b+=4; | 419 » while (n) |
| 417 » » r+=4; | 420 » » { |
| 421 » » t1=a[0]; t2=b[0]; |
| 422 » » r[0]=(t1-t2-c)&BN_MASK2; |
| 423 » » if (t1 != t2) c=(t1 < t2); |
| 424 » » a++; b++; r++; n--; |
| 418 } | 425 } |
| 419 return(c); | 426 return(c); |
| 420 } | 427 } |
| 421 | 428 |
| 422 #ifdef BN_MUL_COMBA | 429 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) |
| 423 | 430 |
| 424 #undef bn_mul_comba8 | 431 #undef bn_mul_comba8 |
| 425 #undef bn_mul_comba4 | 432 #undef bn_mul_comba4 |
| 426 #undef bn_sqr_comba8 | 433 #undef bn_sqr_comba8 |
| 427 #undef bn_sqr_comba4 | 434 #undef bn_sqr_comba4 |
| 428 | 435 |
| 429 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ | 436 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
| 430 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ | 437 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
| 431 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ | 438 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
| 432 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ | 439 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ |
| (...skipping 380 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 813 sqr_add_c2(a,3,1,c2,c3,c1); | 820 sqr_add_c2(a,3,1,c2,c3,c1); |
| 814 r[4]=c2; | 821 r[4]=c2; |
| 815 c2=0; | 822 c2=0; |
| 816 sqr_add_c2(a,3,2,c3,c1,c2); | 823 sqr_add_c2(a,3,2,c3,c1,c2); |
| 817 r[5]=c3; | 824 r[5]=c3; |
| 818 c3=0; | 825 c3=0; |
| 819 sqr_add_c(a,3,c1,c2,c3); | 826 sqr_add_c(a,3,c1,c2,c3); |
| 820 r[6]=c1; | 827 r[6]=c1; |
| 821 r[7]=c2; | 828 r[7]=c2; |
| 822 } | 829 } |
| 830 |
| 831 #ifdef OPENSSL_NO_ASM |
| 832 #ifdef OPENSSL_BN_ASM_MONT |
| 833 #include <alloca.h> |
| 834 /* |
| 835 * This is essentially reference implementation, which may or may not |
| 836 * result in performance improvement. E.g. on IA-32 this routine was |
| 837 * observed to give 40% faster rsa1024 private key operations and 10% |
| 838 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only |
| 839 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a |
| 840 * reference implementation, one to be used as starting point for |
| 841 * platform-specific assembler. Mentioned numbers apply to compiler |
| 842 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and |
| 843 * can vary not only from platform to platform, but even for compiler |
| 844 * versions. Assembler vs. assembler improvement coefficients can |
| 845 * [and are known to] differ and are to be documented elsewhere. |
| 846 */ |
| 847 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num) |
| 848 { |
| 849 BN_ULONG c0,c1,ml,*tp,n0; |
| 850 #ifdef mul64 |
| 851 BN_ULONG mh; |
| 852 #endif |
| 853 volatile BN_ULONG *vp; |
| 854 int i=0,j; |
| 855 |
| 856 #if 0 /* template for platform-specific implementation */ |
| 857 if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num); |
| 858 #endif |
| 859 vp = tp = alloca((num+2)*sizeof(BN_ULONG)); |
| 860 |
| 861 n0 = *n0p; |
| 862 |
| 863 c0 = 0; |
| 864 ml = bp[0]; |
| 865 #ifdef mul64 |
| 866 mh = HBITS(ml); |
| 867 ml = LBITS(ml); |
| 868 for (j=0;j<num;++j) |
| 869 mul(tp[j],ap[j],ml,mh,c0); |
| 870 #else |
| 871 for (j=0;j<num;++j) |
| 872 mul(tp[j],ap[j],ml,c0); |
| 873 #endif |
| 874 |
| 875 tp[num] = c0; |
| 876 tp[num+1] = 0; |
| 877 goto enter; |
| 878 |
| 879 for(i=0;i<num;i++) |
| 880 { |
| 881 c0 = 0; |
| 882 ml = bp[i]; |
| 883 #ifdef mul64 |
| 884 mh = HBITS(ml); |
| 885 ml = LBITS(ml); |
| 886 for (j=0;j<num;++j) |
| 887 mul_add(tp[j],ap[j],ml,mh,c0); |
| 888 #else |
| 889 for (j=0;j<num;++j) |
| 890 mul_add(tp[j],ap[j],ml,c0); |
| 891 #endif |
| 892 c1 = (tp[num] + c0)&BN_MASK2; |
| 893 tp[num] = c1; |
| 894 tp[num+1] = (c1<c0?1:0); |
| 895 enter: |
| 896 c1 = tp[0]; |
| 897 ml = (c1*n0)&BN_MASK2; |
| 898 c0 = 0; |
| 899 #ifdef mul64 |
| 900 mh = HBITS(ml); |
| 901 ml = LBITS(ml); |
| 902 mul_add(c1,np[0],ml,mh,c0); |
| 903 #else |
| 904 mul_add(c1,ml,np[0],c0); |
| 905 #endif |
| 906 for(j=1;j<num;j++) |
| 907 { |
| 908 c1 = tp[j]; |
| 909 #ifdef mul64 |
| 910 mul_add(c1,np[j],ml,mh,c0); |
| 911 #else |
| 912 mul_add(c1,ml,np[j],c0); |
| 913 #endif |
| 914 tp[j-1] = c1&BN_MASK2; |
| 915 } |
| 916 c1 = (tp[num] + c0)&BN_MASK2; |
| 917 tp[num-1] = c1; |
| 918 tp[num] = tp[num+1] + (c1<c0?1:0); |
| 919 } |
| 920 |
| 921 if (tp[num]!=0 || tp[num-1]>=np[num-1]) |
| 922 { |
| 923 c0 = bn_sub_words(rp,tp,np,num); |
| 924 if (tp[num]!=0 || c0==0) |
| 925 { |
| 926 for(i=0;i<num+2;i++) vp[i] = 0; |
| 927 return 1; |
| 928 } |
| 929 } |
| 930 for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0; |
| 931 vp[num] = 0; |
| 932 vp[num+1] = 0; |
| 933 return 1; |
| 934 } |
| 935 #else |
| 936 /* |
| 937 * Return value of 0 indicates that multiplication/convolution was not |
| 938 * performed to signal the caller to fall down to alternative/original |
| 939 * code-path. |
| 940 */ |
| 941 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) |
| 942 { return 0; } |
| 943 #endif /* OPENSSL_BN_ASM_MONT */ |
| 944 #endif |
| 945 |
| 823 #else /* !BN_MUL_COMBA */ | 946 #else /* !BN_MUL_COMBA */ |
| 824 | 947 |
| 825 /* hmm... is it faster just to do a multiply? */ | 948 /* hmm... is it faster just to do a multiply? */ |
| 826 #undef bn_sqr_comba4 | 949 #undef bn_sqr_comba4 |
| 827 void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | 950 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
| 828 { | 951 { |
| 829 BN_ULONG t[8]; | 952 BN_ULONG t[8]; |
| 830 bn_sqr_normal(r,a,4,t); | 953 bn_sqr_normal(r,a,4,t); |
| 831 } | 954 } |
| 832 | 955 |
| 833 #undef bn_sqr_comba8 | 956 #undef bn_sqr_comba8 |
| 834 void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | 957 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
| 835 { | 958 { |
| 836 BN_ULONG t[16]; | 959 BN_ULONG t[16]; |
| 837 bn_sqr_normal(r,a,8,t); | 960 bn_sqr_normal(r,a,8,t); |
| 838 } | 961 } |
| 839 | 962 |
| 840 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | 963 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
| 841 { | 964 { |
| 842 r[4]=bn_mul_words( &(r[0]),a,4,b[0]); | 965 r[4]=bn_mul_words( &(r[0]),a,4,b[0]); |
| 843 r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]); | 966 r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]); |
| 844 r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]); | 967 r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]); |
| 845 r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]); | 968 r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]); |
| 846 } | 969 } |
| 847 | 970 |
| 848 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | 971 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
| 849 { | 972 { |
| 850 r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); | 973 r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); |
| 851 r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); | 974 r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); |
| 852 r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); | 975 r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); |
| 853 r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); | 976 r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); |
| 854 r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); | 977 r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); |
| 855 r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); | 978 r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); |
| 856 r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); | 979 r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); |
| 857 r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); | 980 r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); |
| 858 } | 981 } |
| 859 | 982 |
| 983 #ifdef OPENSSL_NO_ASM |
| 984 #ifdef OPENSSL_BN_ASM_MONT |
| 985 #include <alloca.h> |
| 986 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num) |
| 987 { |
| 988 BN_ULONG c0,c1,*tp,n0=*n0p; |
| 989 volatile BN_ULONG *vp; |
| 990 int i=0,j; |
| 991 |
| 992 vp = tp = alloca((num+2)*sizeof(BN_ULONG)); |
| 993 |
| 994 for(i=0;i<=num;i++) tp[i]=0; |
| 995 |
| 996 for(i=0;i<num;i++) |
| 997 { |
| 998 c0 = bn_mul_add_words(tp,ap,num,bp[i]); |
| 999 c1 = (tp[num] + c0)&BN_MASK2; |
| 1000 tp[num] = c1; |
| 1001 tp[num+1] = (c1<c0?1:0); |
| 1002 |
| 1003 c0 = bn_mul_add_words(tp,np,num,tp[0]*n0); |
| 1004 c1 = (tp[num] + c0)&BN_MASK2; |
| 1005 tp[num] = c1; |
| 1006 tp[num+1] += (c1<c0?1:0); |
| 1007 for(j=0;j<=num;j++) tp[j]=tp[j+1]; |
| 1008 } |
| 1009 |
| 1010 if (tp[num]!=0 || tp[num-1]>=np[num-1]) |
| 1011 { |
| 1012 c0 = bn_sub_words(rp,tp,np,num); |
| 1013 if (tp[num]!=0 || c0==0) |
| 1014 { |
| 1015 for(i=0;i<num+2;i++) vp[i] = 0; |
| 1016 return 1; |
| 1017 } |
| 1018 } |
| 1019 for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0; |
| 1020 vp[num] = 0; |
| 1021 vp[num+1] = 0; |
| 1022 return 1; |
| 1023 } |
| 1024 #else |
| 1025 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) |
| 1026 { return 0; } |
| 1027 #endif /* OPENSSL_BN_ASM_MONT */ |
| 1028 #endif |
| 1029 |
| 860 #endif /* !BN_MUL_COMBA */ | 1030 #endif /* !BN_MUL_COMBA */ |
| OLD | NEW |