OLD | NEW |
| (Empty) |
1 /* crypto/bn/bn_asm.c */ | |
2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | |
3 * All rights reserved. | |
4 * | |
5 * This package is an SSL implementation written | |
6 * by Eric Young (eay@cryptsoft.com). | |
7 * The implementation was written so as to conform with Netscapes SSL. | |
8 * | |
9 * This library is free for commercial and non-commercial use as long as | |
10 * the following conditions are aheared to. The following conditions | |
11 * apply to all code found in this distribution, be it the RC4, RSA, | |
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation | |
13 * included with this distribution is covered by the same copyright terms | |
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com). | |
15 * | |
16 * Copyright remains Eric Young's, and as such any Copyright notices in | |
17 * the code are not to be removed. | |
18 * If this package is used in a product, Eric Young should be given attribution | |
19 * as the author of the parts of the library used. | |
20 * This can be in the form of a textual message at program startup or | |
21 * in documentation (online or textual) provided with the package. | |
22 * | |
23 * Redistribution and use in source and binary forms, with or without | |
24 * modification, are permitted provided that the following conditions | |
25 * are met: | |
26 * 1. Redistributions of source code must retain the copyright | |
27 * notice, this list of conditions and the following disclaimer. | |
28 * 2. Redistributions in binary form must reproduce the above copyright | |
29 * notice, this list of conditions and the following disclaimer in the | |
30 * documentation and/or other materials provided with the distribution. | |
31 * 3. All advertising materials mentioning features or use of this software | |
32 * must display the following acknowledgement: | |
33 * "This product includes cryptographic software written by | |
34 * Eric Young (eay@cryptsoft.com)" | |
35 * The word 'cryptographic' can be left out if the rouines from the library | |
36 * being used are not cryptographic related :-). | |
37 * 4. If you include any Windows specific code (or a derivative thereof) from | |
38 * the apps directory (application code) you must include an acknowledgement: | |
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | |
40 * | |
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | |
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
51 * SUCH DAMAGE. | |
52 * | |
53 * The licence and distribution terms for any publically available version or | |
54 * derivative of this code cannot be changed. i.e. this code cannot simply be | |
55 * copied and put under another distribution licence | |
56 * [including the GNU Public Licence.] | |
57 */ | |
58 | |
59 #ifndef BN_DEBUG | |
60 # undef NDEBUG /* avoid conflicting definitions */ | |
61 # define NDEBUG | |
62 #endif | |
63 | |
64 #include <stdio.h> | |
65 #include <assert.h> | |
66 #include "cryptlib.h" | |
67 #include "bn_lcl.h" | |
68 | |
69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) | |
70 | |
71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | |
72 { | |
73 BN_ULONG c1=0; | |
74 | |
75 assert(num >= 0); | |
76 if (num <= 0) return(c1); | |
77 | |
78 #ifndef OPENSSL_SMALL_FOOTPRINT | |
79 while (num&~3) | |
80 { | |
81 mul_add(rp[0],ap[0],w,c1); | |
82 mul_add(rp[1],ap[1],w,c1); | |
83 mul_add(rp[2],ap[2],w,c1); | |
84 mul_add(rp[3],ap[3],w,c1); | |
85 ap+=4; rp+=4; num-=4; | |
86 } | |
87 #endif | |
88 while (num) | |
89 { | |
90 mul_add(rp[0],ap[0],w,c1); | |
91 ap++; rp++; num--; | |
92 } | |
93 | |
94 return(c1); | |
95 } | |
96 | |
97 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | |
98 { | |
99 BN_ULONG c1=0; | |
100 | |
101 assert(num >= 0); | |
102 if (num <= 0) return(c1); | |
103 | |
104 #ifndef OPENSSL_SMALL_FOOTPRINT | |
105 while (num&~3) | |
106 { | |
107 mul(rp[0],ap[0],w,c1); | |
108 mul(rp[1],ap[1],w,c1); | |
109 mul(rp[2],ap[2],w,c1); | |
110 mul(rp[3],ap[3],w,c1); | |
111 ap+=4; rp+=4; num-=4; | |
112 } | |
113 #endif | |
114 while (num) | |
115 { | |
116 mul(rp[0],ap[0],w,c1); | |
117 ap++; rp++; num--; | |
118 } | |
119 return(c1); | |
120 } | |
121 | |
122 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | |
123 { | |
124 assert(n >= 0); | |
125 if (n <= 0) return; | |
126 | |
127 #ifndef OPENSSL_SMALL_FOOTPRINT | |
128 while (n&~3) | |
129 { | |
130 sqr(r[0],r[1],a[0]); | |
131 sqr(r[2],r[3],a[1]); | |
132 sqr(r[4],r[5],a[2]); | |
133 sqr(r[6],r[7],a[3]); | |
134 a+=4; r+=8; n-=4; | |
135 } | |
136 #endif | |
137 while (n) | |
138 { | |
139 sqr(r[0],r[1],a[0]); | |
140 a++; r+=2; n--; | |
141 } | |
142 } | |
143 | |
144 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ | |
145 | |
146 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | |
147 { | |
148 BN_ULONG c=0; | |
149 BN_ULONG bl,bh; | |
150 | |
151 assert(num >= 0); | |
152 if (num <= 0) return((BN_ULONG)0); | |
153 | |
154 bl=LBITS(w); | |
155 bh=HBITS(w); | |
156 | |
157 #ifndef OPENSSL_SMALL_FOOTPRINT | |
158 while (num&~3) | |
159 { | |
160 mul_add(rp[0],ap[0],bl,bh,c); | |
161 mul_add(rp[1],ap[1],bl,bh,c); | |
162 mul_add(rp[2],ap[2],bl,bh,c); | |
163 mul_add(rp[3],ap[3],bl,bh,c); | |
164 ap+=4; rp+=4; num-=4; | |
165 } | |
166 #endif | |
167 while (num) | |
168 { | |
169 mul_add(rp[0],ap[0],bl,bh,c); | |
170 ap++; rp++; num--; | |
171 } | |
172 return(c); | |
173 } | |
174 | |
175 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | |
176 { | |
177 BN_ULONG carry=0; | |
178 BN_ULONG bl,bh; | |
179 | |
180 assert(num >= 0); | |
181 if (num <= 0) return((BN_ULONG)0); | |
182 | |
183 bl=LBITS(w); | |
184 bh=HBITS(w); | |
185 | |
186 #ifndef OPENSSL_SMALL_FOOTPRINT | |
187 while (num&~3) | |
188 { | |
189 mul(rp[0],ap[0],bl,bh,carry); | |
190 mul(rp[1],ap[1],bl,bh,carry); | |
191 mul(rp[2],ap[2],bl,bh,carry); | |
192 mul(rp[3],ap[3],bl,bh,carry); | |
193 ap+=4; rp+=4; num-=4; | |
194 } | |
195 #endif | |
196 while (num) | |
197 { | |
198 mul(rp[0],ap[0],bl,bh,carry); | |
199 ap++; rp++; num--; | |
200 } | |
201 return(carry); | |
202 } | |
203 | |
204 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | |
205 { | |
206 assert(n >= 0); | |
207 if (n <= 0) return; | |
208 | |
209 #ifndef OPENSSL_SMALL_FOOTPRINT | |
210 while (n&~3) | |
211 { | |
212 sqr64(r[0],r[1],a[0]); | |
213 sqr64(r[2],r[3],a[1]); | |
214 sqr64(r[4],r[5],a[2]); | |
215 sqr64(r[6],r[7],a[3]); | |
216 a+=4; r+=8; n-=4; | |
217 } | |
218 #endif | |
219 while (n) | |
220 { | |
221 sqr64(r[0],r[1],a[0]); | |
222 a++; r+=2; n--; | |
223 } | |
224 } | |
225 | |
226 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ | |
227 | |
228 #if defined(BN_LLONG) && defined(BN_DIV2W) | |
229 | |
230 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | |
231 { | |
232 return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d)); | |
233 } | |
234 | |
235 #else | |
236 | |
237 /* Divide h,l by d and return the result. */ | |
238 /* I need to test this some more :-( */ | |
/*
 * Divide the two-word value (h,l) by d and return the one-word quotient.
 * Fallback for platforms without a double-width integer type: the quotient
 * is produced as two half-word "digits", each estimated from the top half
 * words and then corrected downwards (Knuth-style schoolbook division).
 * Returns BN_MASK2 (all-ones) on division by zero.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
	{
	BN_ULONG dh,dl,q,ret=0,th,tl,t;
	int i,count=2;	/* two half-word quotient digits are produced */

	if (d == 0) return(BN_MASK2);

	i=BN_num_bits_word(d);
	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));

	i=BN_BITS2-i;
	if (h >= d) h-=d;

	/* Normalize: shift d so its top bit is set, shifting (h,l) in step. */
	if (i)
		{
		d<<=i;
		h=(h<<i)|(l>>(BN_BITS2-i));
		l<<=i;
		}
	/* Split the normalized divisor into high/low half words. */
	dh=(d&BN_MASK2h)>>BN_BITS4;
	dl=(d&BN_MASK2l);
	for (;;)
		{
		/* Estimate the next quotient digit from the top half words. */
		if ((h>>BN_BITS4) == dh)
			q=BN_MASK2l;
		else
			q=h/dh;

		th=q*dh;
		tl=dl*q;
		/* Correct the over-estimate downwards (a few steps at most). */
		for (;;)
			{
			t=h-th;
			if ((t&BN_MASK2h) ||
				((tl) <= (
					(t<<BN_BITS4)|
					((l&BN_MASK2h)>>BN_BITS4))))
				break;
			q--;
			th-=dh;
			tl-=dl;
			}
		/* Subtract q*d, held as (th,tl), from the remainder (h,l). */
		t=(tl>>BN_BITS4);
		tl=(tl<<BN_BITS4)&BN_MASK2h;
		th+=t;

		if (l < tl) th++;
		l-=tl;
		if (h < th)
			{
			h+=d;	/* estimate was still one too high; add back */
			q--;
			}
		h-=th;

		if (--count == 0) break;

		/* Store digit one and shift the next half word of l into h. */
		ret=q<<BN_BITS4;
		h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
		l=(l&BN_MASK2l)<<BN_BITS4;
		}
	ret|=q;
	return(ret);
	}
303 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ | |
304 | |
305 #ifdef BN_LLONG | |
306 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | |
307 { | |
308 BN_ULLONG ll=0; | |
309 | |
310 assert(n >= 0); | |
311 if (n <= 0) return((BN_ULONG)0); | |
312 | |
313 #ifndef OPENSSL_SMALL_FOOTPRINT | |
314 while (n&~3) | |
315 { | |
316 ll+=(BN_ULLONG)a[0]+b[0]; | |
317 r[0]=(BN_ULONG)ll&BN_MASK2; | |
318 ll>>=BN_BITS2; | |
319 ll+=(BN_ULLONG)a[1]+b[1]; | |
320 r[1]=(BN_ULONG)ll&BN_MASK2; | |
321 ll>>=BN_BITS2; | |
322 ll+=(BN_ULLONG)a[2]+b[2]; | |
323 r[2]=(BN_ULONG)ll&BN_MASK2; | |
324 ll>>=BN_BITS2; | |
325 ll+=(BN_ULLONG)a[3]+b[3]; | |
326 r[3]=(BN_ULONG)ll&BN_MASK2; | |
327 ll>>=BN_BITS2; | |
328 a+=4; b+=4; r+=4; n-=4; | |
329 } | |
330 #endif | |
331 while (n) | |
332 { | |
333 ll+=(BN_ULLONG)a[0]+b[0]; | |
334 r[0]=(BN_ULONG)ll&BN_MASK2; | |
335 ll>>=BN_BITS2; | |
336 a++; b++; r++; n--; | |
337 } | |
338 return((BN_ULONG)ll); | |
339 } | |
340 #else /* !BN_LLONG */ | |
341 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | |
342 { | |
343 BN_ULONG c,l,t; | |
344 | |
345 assert(n >= 0); | |
346 if (n <= 0) return((BN_ULONG)0); | |
347 | |
348 c=0; | |
349 #ifndef OPENSSL_SMALL_FOOTPRINT | |
350 while (n&~3) | |
351 { | |
352 t=a[0]; | |
353 t=(t+c)&BN_MASK2; | |
354 c=(t < c); | |
355 l=(t+b[0])&BN_MASK2; | |
356 c+=(l < t); | |
357 r[0]=l; | |
358 t=a[1]; | |
359 t=(t+c)&BN_MASK2; | |
360 c=(t < c); | |
361 l=(t+b[1])&BN_MASK2; | |
362 c+=(l < t); | |
363 r[1]=l; | |
364 t=a[2]; | |
365 t=(t+c)&BN_MASK2; | |
366 c=(t < c); | |
367 l=(t+b[2])&BN_MASK2; | |
368 c+=(l < t); | |
369 r[2]=l; | |
370 t=a[3]; | |
371 t=(t+c)&BN_MASK2; | |
372 c=(t < c); | |
373 l=(t+b[3])&BN_MASK2; | |
374 c+=(l < t); | |
375 r[3]=l; | |
376 a+=4; b+=4; r+=4; n-=4; | |
377 } | |
378 #endif | |
379 while(n) | |
380 { | |
381 t=a[0]; | |
382 t=(t+c)&BN_MASK2; | |
383 c=(t < c); | |
384 l=(t+b[0])&BN_MASK2; | |
385 c+=(l < t); | |
386 r[0]=l; | |
387 a++; b++; r++; n--; | |
388 } | |
389 return((BN_ULONG)c); | |
390 } | |
391 #endif /* !BN_LLONG */ | |
392 | |
393 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) | |
394 { | |
395 BN_ULONG t1,t2; | |
396 int c=0; | |
397 | |
398 assert(n >= 0); | |
399 if (n <= 0) return((BN_ULONG)0); | |
400 | |
401 #ifndef OPENSSL_SMALL_FOOTPRINT | |
402 while (n&~3) | |
403 { | |
404 t1=a[0]; t2=b[0]; | |
405 r[0]=(t1-t2-c)&BN_MASK2; | |
406 if (t1 != t2) c=(t1 < t2); | |
407 t1=a[1]; t2=b[1]; | |
408 r[1]=(t1-t2-c)&BN_MASK2; | |
409 if (t1 != t2) c=(t1 < t2); | |
410 t1=a[2]; t2=b[2]; | |
411 r[2]=(t1-t2-c)&BN_MASK2; | |
412 if (t1 != t2) c=(t1 < t2); | |
413 t1=a[3]; t2=b[3]; | |
414 r[3]=(t1-t2-c)&BN_MASK2; | |
415 if (t1 != t2) c=(t1 < t2); | |
416 a+=4; b+=4; r+=4; n-=4; | |
417 } | |
418 #endif | |
419 while (n) | |
420 { | |
421 t1=a[0]; t2=b[0]; | |
422 r[0]=(t1-t2-c)&BN_MASK2; | |
423 if (t1 != t2) c=(t1 < t2); | |
424 a++; b++; r++; n--; | |
425 } | |
426 return(c); | |
427 } | |
428 | |
429 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) | |
430 | |
431 #undef bn_mul_comba8 | |
432 #undef bn_mul_comba4 | |
433 #undef bn_sqr_comba8 | |
434 #undef bn_sqr_comba4 | |
435 | |
436 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ | |
437 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ | |
438 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ | |
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
440 | |
#ifdef BN_LLONG
/*
 * Double-word variants: form the full 2*BN_BITS2-bit product in a
 * BN_ULLONG and split it with Lw()/Hw().  The expansions use temporaries
 * t, t1, t2 (and tt for mul_add_c2) that the calling function declares.
 */
#define mul_add_c(a,b,c0,c1,c2) \
	t=(BN_ULLONG)a*b; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

/* Doubling (tt=2*t) happens in the double-word mask BN_MASK, with the
 * overflow bit folded straight into c2. */
#define mul_add_c2(a,b,c0,c1,c2) \
	t=(BN_ULLONG)a*b; \
	tt=(t+t)&BN_MASK; \
	if (tt < t) c2++; \
	t1=(BN_ULONG)Lw(tt); \
	t2=(BN_ULONG)Hw(tt); \
	c0=(c0+t1)&BN_MASK2; \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c(a,i,c0,c1,c2) \
	t=(BN_ULLONG)a[i]*a[i]; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_LOHI)

/* Variants for platforms with a combined low/high multiply primitive. */
#define mul_add_c(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b); \
	BN_UMULT_LOHI(t1,t2,ta,tb); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define mul_add_c2(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b),t0; \
	BN_UMULT_LOHI(t0,t1,ta,tb); \
	t2 = t1+t1; c2 += (t2<t1)?1:0; \
	t1 = t0+t0; t2 += (t1<t0)?1:0; \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define sqr_add_c(a,i,c0,c1,c2) { \
	BN_ULONG ta=(a)[i]; \
	BN_UMULT_LOHI(t1,t2,ta,ta); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)

/* Variants for platforms exposing only the high half of the product;
 * the low half comes from an ordinary truncating multiply. */
#define mul_add_c(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b); \
	t1 = ta * tb; \
	t2 = BN_UMULT_HIGH(ta,tb); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define mul_add_c2(a,b,c0,c1,c2) { \
	BN_ULONG ta=(a),tb=(b),t0; \
	t1 = BN_UMULT_HIGH(ta,tb); \
	t0 = ta * tb; \
	t2 = t1+t1; c2 += (t2<t1)?1:0; \
	t1 = t0+t0; t2 += (t1<t0)?1:0; \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define sqr_add_c(a,i,c0,c1,c2) { \
	BN_ULONG ta=(a)[i]; \
	t1 = ta * ta; \
	t2 = BN_UMULT_HIGH(ta,ta); \
	c0 += t1; t2 += (c0<t1)?1:0; \
	c1 += t2; c2 += (c1<t2)?1:0; \
	}

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#else /* !BN_LLONG */
/* Half-word fallback: products via mul64()/sqr64() on LBITS/HBITS
 * halves.  These expansions also use caller-declared bl,bh. */
#define mul_add_c(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define mul_add_c2(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	if (t2 & BN_TBIT) c2++; \
	t2=(t2+t2)&BN_MASK2; \
	if (t1 & BN_TBIT) t2++; \
	t1=(t1+t1)&BN_MASK2; \
	c0=(c0+t1)&BN_MASK2; \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c(a,i,c0,c1,c2) \
	sqr64(t1,t2,(a)[i]); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
#endif /* !BN_LLONG */
556 | |
/*
 * Full 16-word product r = a*b of two 8-word numbers, computed column by
 * column (Comba multiplication).  Column k sums every a[i]*b[j] with
 * i+j == k.  c1,c2,c3 form a rotating three-word accumulator: after a
 * column is stored, the freed word is zeroed and becomes the new top word.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
#ifdef BN_LLONG
	BN_ULLONG t;	/* scratch for the mul_add_c macros */
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	/* From here the column index exceeds 7, so i and j both stay >= 1. */
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;	/* top column cannot carry further */
	}
665 | |
/*
 * Full 8-word product r = a*b of two 4-word numbers, column by column
 * (Comba).  c1,c2,c3 are the rotating three-word column accumulator;
 * see bn_mul_comba8 for the scheme.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
#ifdef BN_LLONG
	BN_ULLONG t;	/* scratch for the mul_add_c macros */
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;	/* top column cannot carry further */
	}
710 | |
/*
 * 16-word square r = a^2 of an 8-word number, column by column (Comba).
 * Off-diagonal terms a[i]*a[j] (i!=j) appear twice in a square, so they
 * use the doubling macro sqr_add_c2; diagonal terms a[i]^2 use sqr_add_c.
 * c1,c2,c3 form the rotating three-word column accumulator.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
	{
#ifdef BN_LLONG
	BN_ULLONG t,tt;	/* scratch for the sqr/mul_add_c macros */
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;	/* top column cannot carry further */
	}
791 | |
/*
 * 8-word square r = a^2 of a 4-word number, column by column (Comba).
 * Off-diagonal terms occur twice and use sqr_add_c2; diagonal squares
 * use sqr_add_c.  See bn_sqr_comba8 for the accumulator scheme.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
	{
#ifdef BN_LLONG
	BN_ULLONG t,tt;	/* scratch for the sqr/mul_add_c macros */
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	r[7]=c2;	/* top column cannot carry further */
	}
830 | |
831 #ifdef OPENSSL_NO_ASM | |
832 #ifdef OPENSSL_BN_ASM_MONT | |
833 #include <alloca.h> | |
834 /* | |
835 * This is essentially reference implementation, which may or may not | |
836 * result in performance improvement. E.g. on IA-32 this routine was | |
837 * observed to give 40% faster rsa1024 private key operations and 10% | |
838 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only | |
839 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a | |
840 * reference implementation, one to be used as starting point for | |
841 * platform-specific assembler. Mentioned numbers apply to compiler | |
842 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and | |
843 * can vary not only from platform to platform, but even for compiler | |
844 * versions. Assembler vs. assembler improvement coefficients can | |
845 * [and are known to] differ and are to be documented elsewhere. | |
846 */ | |
/*
 * Word-level reference Montgomery multiplication:
 * rp = ap * bp * R^-1 mod np, where R = 2^(num*BN_BITS2) and *n0p is the
 * precomputed -np^-1 mod 2^BN_BITS2 word.  Always returns 1 (result was
 * produced; 0 would tell the caller to fall back to the generic path).
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
	{
	BN_ULONG c0,c1,ml,*tp,n0;
#ifdef mul64
	BN_ULONG mh;
#endif
	/* vp aliases tp as volatile so the final zeroization of the
	 * secret-dependent scratch buffer cannot be optimized away. */
	volatile BN_ULONG *vp;
	int i=0,j;

#if 0 /* template for platform-specific implementation */
	if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
	/* num product words plus one carry word plus one overflow word. */
	vp = tp = alloca((num+2)*sizeof(BN_ULONG));

	n0 = *n0p;

	/* Peeled first outer iteration (i==0): initialize tp = ap*bp[0]
	 * directly instead of zero-filling, then jump into the shared
	 * reduction code at the "enter" label. */
	c0 = 0;
	ml = bp[0];
#ifdef mul64
	mh = HBITS(ml);
	ml = LBITS(ml);
	for (j=0;j<num;++j)
		mul(tp[j],ap[j],ml,mh,c0);
#else
	for (j=0;j<num;++j)
		mul(tp[j],ap[j],ml,c0);
#endif

	tp[num] = c0;
	tp[num+1] = 0;
	goto enter;

	for(i=0;i<num;i++)
		{
		/* tp += ap * bp[i]; carry goes into the two top scratch words */
		c0 = 0;
		ml = bp[i];
#ifdef mul64
		mh = HBITS(ml);
		ml = LBITS(ml);
		for (j=0;j<num;++j)
			mul_add(tp[j],ap[j],ml,mh,c0);
#else
		for (j=0;j<num;++j)
			mul_add(tp[j],ap[j],ml,c0);
#endif
		c1 = (tp[num] + c0)&BN_MASK2;
		tp[num] = c1;
		tp[num+1] = (c1<c0?1:0);
	enter:
		c1  = tp[0];
		/* ml is the multiple of np that makes tp[0] vanish mod 2^BN_BITS2 */
		ml = (c1*n0)&BN_MASK2;
		c0 = 0;
#ifdef mul64
		mh = HBITS(ml);
		ml = LBITS(ml);
		mul_add(c1,np[0],ml,mh,c0);
#else
		mul_add(c1,ml,np[0],c0);
#endif
		/* tp[0] is now zero by construction, so the result is shifted
		 * down one word as it is rebuilt. */
		for(j=1;j<num;j++)
			{
			c1 = tp[j];
#ifdef mul64
			mul_add(c1,np[j],ml,mh,c0);
#else
			mul_add(c1,ml,np[j],c0);
#endif
			tp[j-1] = c1&BN_MASK2;
			}
		c1        = (tp[num] + c0)&BN_MASK2;
		tp[num-1] = c1;
		tp[num]   = tp[num+1] + (c1<c0?1:0);
		}

	/* Final reduction: try tp - np when the result may be >= np, and
	 * keep the subtracted copy unless the subtraction borrowed while
	 * the overflow word was zero. */
	if (tp[num]!=0 || tp[num-1]>=np[num-1])
		{
		c0 = bn_sub_words(rp,tp,np,num);
		if (tp[num]!=0 || c0==0)
			{
			for(i=0;i<num+2;i++) vp[i] = 0;
			return 1;
			}
		}
	/* Copy out and wipe the scratch buffer. */
	for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
	vp[num]   = 0;
	vp[num+1] = 0;
	return 1;
	}
935 #else | |
936 /* | |
937 * Return value of 0 indicates that multiplication/convolution was not | |
938 * performed to signal the caller to fall down to alternative/original | |
939 * code-path. | |
940 */ | |
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
/* Stub: 0 means "not performed" -- see the comment above. */
{ return 0; }
943 #endif /* OPENSSL_BN_ASM_MONT */ | |
944 #endif | |
945 | |
946 #else /* !BN_MUL_COMBA */ | |
947 | |
948 /* hmm... is it faster just to do a multiply? */ | |
949 #undef bn_sqr_comba4 | |
950 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) | |
951 { | |
952 BN_ULONG t[8]; | |
953 bn_sqr_normal(r,a,4,t); | |
954 } | |
955 | |
956 #undef bn_sqr_comba8 | |
957 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) | |
958 { | |
959 BN_ULONG t[16]; | |
960 bn_sqr_normal(r,a,8,t); | |
961 } | |
962 | |
963 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
964 { | |
965 r[4]=bn_mul_words( &(r[0]),a,4,b[0]); | |
966 r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]); | |
967 r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]); | |
968 r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]); | |
969 } | |
970 | |
971 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
972 { | |
973 r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); | |
974 r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); | |
975 r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); | |
976 r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); | |
977 r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); | |
978 r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); | |
979 r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); | |
980 r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); | |
981 } | |
982 | |
983 #ifdef OPENSSL_NO_ASM | |
984 #ifdef OPENSSL_BN_ASM_MONT | |
985 #include <alloca.h> | |
/*
 * Montgomery multiplication built on bn_mul_add_words():
 * rp = ap * bp * R^-1 mod np with R = 2^(num*BN_BITS2); *n0p holds the
 * precomputed -np^-1 mod 2^BN_BITS2 word.  Always returns 1.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
	{
	BN_ULONG c0,c1,*tp,n0=*n0p;
	/* vp aliases tp as volatile so the closing wipe of the
	 * secret-dependent scratch buffer is not optimized away. */
	volatile BN_ULONG *vp;
	int i=0,j;

	/* num product words plus one carry word plus one overflow word. */
	vp = tp = alloca((num+2)*sizeof(BN_ULONG));

	for(i=0;i<=num;i++) tp[i]=0;

	for(i=0;i<num;i++)
		{
		/* tp += ap * bp[i]; carry lands in the two top scratch words */
		c0 = bn_mul_add_words(tp,ap,num,bp[i]);
		c1 = (tp[num] + c0)&BN_MASK2;
		tp[num]   = c1;
		tp[num+1] = (c1<c0?1:0);

		/* Montgomery step: add the multiple of np that zeroes tp[0],
		 * then drop the low word by shifting everything down one. */
		c0 = bn_mul_add_words(tp,np,num,tp[0]*n0);
		c1 = (tp[num] + c0)&BN_MASK2;
		tp[num]   = c1;
		tp[num+1] += (c1<c0?1:0);
		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
		}

	/* Final reduction: try tp - np when the result may be >= np, and
	 * keep the subtracted copy unless the subtraction borrowed while
	 * the overflow word was zero. */
	if (tp[num]!=0 || tp[num-1]>=np[num-1])
		{
		c0 = bn_sub_words(rp,tp,np,num);
		if (tp[num]!=0 || c0==0)
			{
			for(i=0;i<num+2;i++)	vp[i] = 0;
			return 1;
			}
		}
	/* Copy out and wipe the scratch buffer. */
	for(i=0;i<num;i++)	rp[i] = tp[i], vp[i] = 0;
	vp[num]   = 0;
	vp[num+1] = 0;
	return 1;
	}
1024 #else | |
/* Stub: returning 0 signals that the multiplication was not performed,
 * so the caller falls back to the alternative/original code path. */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
{ return 0; }
1027 #endif /* OPENSSL_BN_ASM_MONT */ | |
1028 #endif | |
1029 | |
1030 #endif /* !BN_MUL_COMBA */ | |
OLD | NEW |