openssl/crypto/aes/asm/aesni-sha1-x86_64.pl - Issue 2072073002: Delete bundled copy of OpenSSL and replace with README.

Side by Side Diff: openssl/crypto/aes/asm/aesni-sha1-x86_64.pl

Issue 2072073002: Delete bundled copy of OpenSSL and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/openssl@master

Patch Set: Delete bundled copy of OpenSSL and replace with README. Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #!/usr/bin/env perl

2 #

3 # ====================================================================

4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL

5 # project. The module is, however, dual licensed under OpenSSL and

6 # CRYPTOGAMS licenses depending on where you obtain it. For further

7 # details see http://www.openssl.org/~appro/cryptogams/.

8 # ====================================================================

9 #

10 # June 2011

11 #

12 # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled

13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is

14 # that since AESNI-CBC encrypt exhibit very low instruction-level

15 # parallelism, interleaving it with another algorithm would allow to

16 # utilize processor resources better and achieve better performance.

17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and

18 # AESNI code is weaved into it. Below are performance numbers in

19 # cycles per processed byte, less is better, for standalone AESNI-CBC

20 # encrypt, sum of the latter and standalone SHA1, and "stitched"

21 # subroutine:

22 #

23 # AES-128-CBC +SHA1 stitch gain

24 # Westmere 3.77[+5.6] 9.37 6.65 +41%

25 # Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)

26 #

27 # AES-192-CBC

28 # Westmere 4.51 10.11 6.97 +45%

29 # Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)

30 #

31 # AES-256-CBC

32 # Westmere 5.25 10.85 7.25 +50%

33 # Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)

34 #

35 # (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for

36 # background information. Above numbers in parentheses are SSSE3

37 # results collected on AVX-capable CPU, i.e. apply on OSes that

38 # don't support AVX.

39 #

40 # Needless to mention that it makes no sense to implement "stitched"

41 # decrypt subroutine. Because both AESNI-CBC decrypt and SHA1

42 # fully utilize parallelism, so stitching would not give any gain

43 # anyway. Well, there might be some, e.g. because of better cache

44 # locality... For reference, here are performance results for

45 # standalone AESNI-CBC decrypt:

46 #

47 # AES-128-CBC AES-192-CBC AES-256-CBC

48 # Westmere 1.31 1.55 1.80

49 # Sandy Bridge 0.93 1.06 1.22

50

51 $flavour = shift; # first argument: perlasm flavour (the checks below look for nasm, masm, mingw64)

52 $output = shift; # second argument: output file name

53 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a first argument containing "." is taken as the output file

54

55 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

56

57 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path

58 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or

59 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or

60 die "can't locate x86_64-xlate.pl";

61

62 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`

63 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&

64 $1>=2.19); # GNU assembler 2.19 or newer enables the AVX code path

65 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&

66 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&

67 $1>=2.09); # win64 only: nasm 2.09 or newer

68 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&

69 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&

70 $1>=10); # win64 only: ml64 version 10 or newer

71

72 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # pipe into the translator script

73 *STDOUT=*OUT; # alias STDOUT to that pipe via typeglob assignment

74

75 # void aesni_cbc_sha1_enc(const void *inp,

76 # void *out,

77 # size_t length,

78 # const AES_KEY *key,

79 # unsigned char *iv,

80 # SHA_CTX *ctx,

81 # const void *in0);

82

83 $code.=<<___; # entry point: reads OPENSSL_ia32cap_P and jumps to one of the two implementations

84 .text

85 .extern OPENSSL_ia32cap_P

86

87 .globl aesni_cbc_sha1_enc

88 .type aesni_cbc_sha1_enc,\@abi-omnipotent

89 .align 16

90 aesni_cbc_sha1_enc:

91 # caller should check for SSSE3 and AES-NI bits

92 mov OPENSSL_ia32cap_P+0(%rip),%r10d

93 mov OPENSSL_ia32cap_P+4(%rip),%r11d

94 ___

95 $code.=<<___ if ($avx); # AVX dispatch is emitted only when the assembler check set $avx

96 and \$`1<<28`,%r11d # mask AVX bit

97 and \$`1<<30`,%r10d # mask "Intel CPU" bit

98 or %r11d,%r10d

99 cmp \$`1<<28|1<<30`,%r10d

100 je aesni_cbc_sha1_enc_avx

101 ___

102 $code.=<<___; # otherwise fall through to the SSSE3 implementation

103 jmp aesni_cbc_sha1_enc_ssse3

104 ret

105 .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc

106 ___

107

108 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9" ,"%r10"); # argument registers; $inp is loaded from the stack as the 7th argument

109

110 my $Xi=4; # index of the next X[] vector to produce

111 my @X=map("%xmm$_",(4..7,0..3)); # rotating window of X[] registers

112 my @Tx=map("%xmm$_",(8..10)); # rotating SIMD temporaries

113 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization

114 my @T=("%esi","%edi"); # scalar temporaries, swapped every round

115 my $j=0; my $jj=0; my $r=0; my $sn=0; # $j: round counter (X[]+K slot), $jj: body_* call counter, $r: $aesenc call counter, $sn: label serial

116 my $K_XX_XX="%r11"; # pointer to the K_XX_XX constant table

117 my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); # CBC state, current input block, round key 0

118 my @rndkey=("%xmm14","%xmm15"); # two alternating round-key registers

119

120 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm: a call to any undefined &opcode(...) appends one instruction line to $code

121 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; # strip the package qualifier, leaving the bare mnemonic

122 my $arg = pop; # last argument of the call

123 $arg = "\$$arg" if ($arg*1 eq $arg); # a purely numeric argument gets a "$" prefix

124 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; # operands are written in reverse of the call order

125 }

126

127 my $_rol=sub { &rol(@_) }; # rotate helpers used by the body_* snippets; here plain rol/ror

128 my $_ror=sub { &ror(@_) };

129

130 $code.=<<___;

131 .type aesni_cbc_sha1_enc_ssse3,\@function,6

132 .align 16

133 aesni_cbc_sha1_enc_ssse3:

134 mov `($win64?56:8)`(%rsp),$inp # load 7th argument

135 #shr \$6,$len # debugging artefact

136 #jz .Lepilogue_ssse3 # debugging artefact

137 push %rbx

138 push %rbp

139 push %r12

140 push %r13

141 push %r14

142 push %r15

143 lea `-104-($win64?10*16:0)`(%rsp),%rsp

144 #mov $in0,$inp # debugging artefact

145 #lea 64(%rsp),$ctx # debugging artefact

146 ___

147 $code.=<<___ if ($win64);

148 movaps %xmm6,96+0(%rsp)

149 movaps %xmm7,96+16(%rsp)

150 movaps %xmm8,96+32(%rsp)

151 movaps %xmm9,96+48(%rsp)

152 movaps %xmm10,96+64(%rsp)

153 movaps %xmm11,96+80(%rsp)

154 movaps %xmm12,96+96(%rsp)

155 movaps %xmm13,96+112(%rsp)

156 movaps %xmm14,96+128(%rsp)

157 movaps %xmm15,96+144(%rsp)

158 .Lprologue_ssse3:

159 ___

160 $code.=<<___;

161 mov $in0,%r12 # reassign arguments

162 mov $out,%r13

163 mov $len,%r14

164 mov $key,%r15

165 movdqu ($ivp),$iv # load IV

166 mov $ivp,88(%rsp) # save $ivp

167 ___

168 my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments

169 my $rounds="${ivp}d";

170 $code.=<<___;

171 shl \$6,$len

172 sub $in0,$out

173 mov 240($key),$rounds

174 add $inp,$len # end of input

175

176 lea K_XX_XX(%rip),$K_XX_XX

177 mov 0($ctx),$A # load context

178 mov 4($ctx),$B

179 mov 8($ctx),$C

180 mov 12($ctx),$D

181 mov $B,@T[0] # magic seed

182 mov 16($ctx),$E

183

184 movdqa 64($K_XX_XX),@X[2] # pbswap mask

185 movdqa 0($K_XX_XX),@Tx[1] # K_00_19

186 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]

187 movdqu 16($inp),@X[-3&7]

188 movdqu 32($inp),@X[-2&7]

189 movdqu 48($inp),@X[-1&7]

190 pshufb @X[2],@X[-4&7] # byte swap

191 add \$64,$inp

192 pshufb @X[2],@X[-3&7]

193 pshufb @X[2],@X[-2&7]

194 pshufb @X[2],@X[-1&7]

195 paddd @Tx[1],@X[-4&7] # add K_00_19

196 paddd @Tx[1],@X[-3&7]

197 paddd @Tx[1],@X[-2&7]

198 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU

199 psubd @Tx[1],@X[-4&7] # restore X[]

200 movdqa @X[-3&7],16(%rsp)

201 psubd @Tx[1],@X[-3&7]

202 movdqa @X[-2&7],32(%rsp)

203 psubd @Tx[1],@X[-2&7]

204 movups ($key),$rndkey0 # $key[0]

205 movups 16($key),$rndkey[0] # forward reference

206 jmp .Loop_ssse3

207 ___

208

209 my $aesenc=sub { # appends one AES-CBC step to $code per call; $r counts the calls

210 use integer;

211 my ($n,$k)=($r/10,$r%10); # $n selects the 16-byte block (offset 16*$n), $k the step within it

212 if ($k==0) { # first step of a block: load input, xor, first aesenc

213 $code.=<<___;

214 movups `16*$n`($in0),$in # load input

215 xorps $rndkey0,$in

216 ___

217 $code.=<<___ if ($n); # not the first block: store the previous block's result

218 movups $iv,`16*($n-1)`($out,$in0) # write output

219 ___

220 $code.=<<___;

221 xorps $in,$iv

222 aesenc $rndkey[0],$iv

223 movups `32+16*$k`($key),$rndkey[1]

224 ___

225 } elsif ($k==9) { # last step: extra aesenc pairs when $rounds is 11 or more, then aesenclast

226 $sn++; # unique suffix for the .Laesenclast label

227 $code.=<<___;

228 cmp \$11,$rounds

229 jb .Laesenclast$sn

230 movups `32+16*($k+0)`($key),$rndkey[1]

231 aesenc $rndkey[0],$iv

232 movups `32+16*($k+1)`($key),$rndkey[0]

233 aesenc $rndkey[1],$iv

234 je .Laesenclast$sn

235 movups `32+16*($k+2)`($key),$rndkey[1]

236 aesenc $rndkey[0],$iv

237 movups `32+16*($k+3)`($key),$rndkey[0]

238 aesenc $rndkey[1],$iv

239 .Laesenclast$sn:

240 aesenclast $rndkey[0],$iv

241 movups 16($key),$rndkey[1] # forward reference

242 ___

243 } else { # middle steps: one aesenc plus a load of the following round key

244 $code.=<<___;

245 aesenc $rndkey[0],$iv

246 movups `32+16*$k`($key),$rndkey[1]

247 ___

248 }

249 $r++; unshift(@rndkey,pop(@rndkey)); # count the step and swap the two round-key registers

250 };

251

252 sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4

253 { use integer;

254 my $body = shift; # code ref to a body_* snippet generator

255 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions

256 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

257

258 &movdqa (@X[0],@X[-3&7]);

259 eval(shift(@insns));

260 eval(shift(@insns));

261 &movdqa (@Tx[0],@X[-1&7]);

262 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"

263 eval(shift(@insns));

264 eval(shift(@insns));

265

266 &paddd (@Tx[1],@X[-1&7]);

267 eval(shift(@insns));

268 eval(shift(@insns));

269 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords

270 eval(shift(@insns));

271 eval(shift(@insns));

272 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"

273 eval(shift(@insns));

274 eval(shift(@insns));

275

276 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"

277 eval(shift(@insns));

278 eval(shift(@insns));

279 eval(shift(@insns));

280 eval(shift(@insns));

281

282 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"

283 eval(shift(@insns));

284 eval(shift(@insns));

285 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU

286 eval(shift(@insns));

287 eval(shift(@insns));

288

289 &movdqa (@Tx[2],@X[0]);

290 &movdqa (@Tx[0],@X[0]);

291 eval(shift(@insns));

292 eval(shift(@insns));

293 eval(shift(@insns));

294 eval(shift(@insns));

295

296 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword

297 &paddd (@X[0],@X[0]);

298 eval(shift(@insns));

299 eval(shift(@insns));

300 eval(shift(@insns));

301 eval(shift(@insns));

302

303 &psrld (@Tx[0],31);

304 eval(shift(@insns));

305 eval(shift(@insns));

306 &movdqa (@Tx[1],@Tx[2]);

307 eval(shift(@insns));

308 eval(shift(@insns));

309

310 &psrld (@Tx[2],30);

311 &por (@X[0],@Tx[0]); # "X[0]"<<<=1

312 eval(shift(@insns));

313 eval(shift(@insns));

314 eval(shift(@insns));

315 eval(shift(@insns));

316

317 &pslld (@Tx[1],2);

318 &pxor (@X[0],@Tx[2]);

319 eval(shift(@insns));

320 eval(shift(@insns));

321 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX

322 eval(shift(@insns));

323 eval(shift(@insns));

324

325 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2

326

327 foreach (@insns) { eval; } # remaining instructions [if any]

328

329 $Xi++; push(@X,shift(@X)); # "rotate" X[]

330 push(@Tx,shift(@Tx));

331 }

332

333 sub Xupdate_ssse3_32_79() # produces the next X[] vector while interleaving four rounds of $body

334 { use integer;

335 my $body = shift; # code ref to a body_* snippet generator

336 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions

337 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

338

339 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);

340 eval(shift(@insns)); # body_20_39

341 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"

342 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"

343 eval(shift(@insns));

344 eval(shift(@insns));

345 eval(shift(@insns)); # rol

346

347 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"

348 eval(shift(@insns));

349 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); # consume one more snippet unless it is a rotate

350 if ($Xi%5) {

351 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...

352 } else { # ... or load next one

353 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");

354 }

355 &paddd (@Tx[1],@X[-1&7]);

356 eval(shift(@insns)); # ror

357 eval(shift(@insns));

358

359 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"

360 eval(shift(@insns)); # body_20_39

361 eval(shift(@insns));

362 eval(shift(@insns));

363 eval(shift(@insns)); # rol

364

365 &movdqa (@Tx[0],@X[0]);

366 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU

367 eval(shift(@insns));

368 eval(shift(@insns));

369 eval(shift(@insns)); # ror

370 eval(shift(@insns));

371

372 &pslld (@X[0],2);

373 eval(shift(@insns)); # body_20_39

374 eval(shift(@insns));

375 &psrld (@Tx[0],30);

376 eval(shift(@insns));

377 eval(shift(@insns)); # rol

378 eval(shift(@insns));

379 eval(shift(@insns));

380 eval(shift(@insns)); # ror

381 eval(shift(@insns));

382

383 &por (@X[0],@Tx[0]); # "X[0]"<<<=2

384 eval(shift(@insns)); # body_20_39

385 eval(shift(@insns));

386 &movdqa (@Tx[1],@X[0]) if ($Xi<19); # copy is skipped once $Xi reaches 19

387 eval(shift(@insns));

388 eval(shift(@insns)); # rol

389 eval(shift(@insns));

390 eval(shift(@insns));

391 eval(shift(@insns)); # rol

392 eval(shift(@insns));

393

394 foreach (@insns) { eval; } # remaining instructions

395

396 $Xi++; push(@X,shift(@X)); # "rotate" X[]

397 push(@Tx,shift(@Tx));

398 }

399

400 sub Xuplast_ssse3_80() # last X[]+K transfer, end-of-input test, then start loading the next input

401 { use integer;

402 my $body = shift; # code ref to a body_* snippet generator

403 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions

404 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

405

406 eval(shift(@insns));

407 &paddd (@Tx[1],@X[-1&7]);

408 eval(shift(@insns));

409 eval(shift(@insns));

410 eval(shift(@insns));

411 eval(shift(@insns));

412

413 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU

414

415 foreach (@insns) { eval; } # remaining instructions

416

417 &cmp ($inp,$len); # $len holds the end-of-input pointer

418 &je (".Ldone_ssse3");

419

420 unshift(@Tx,pop(@Tx)); # rotate @Tx back by one position

421

422 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask

423 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19

424 &movdqu (@X[-4&7],"0($inp)"); # load input

425 &movdqu (@X[-3&7],"16($inp)");

426 &movdqu (@X[-2&7],"32($inp)");

427 &movdqu (@X[-1&7],"48($inp)");

428 &pshufb (@X[-4&7],@X[2]); # byte swap

429 &add ($inp,64);

430

431 $Xi=0; # Xloop_ssse3 counts up from 0

432 }

433

434 sub Xloop_ssse3() # prepares one vector of the freshly loaded input while interleaving four rounds of $body

435 { use integer;

436 my $body = shift; # code ref to a body_* snippet generator

437 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions

438 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

439

440 eval(shift(@insns));

441 eval(shift(@insns));

442 &pshufb (@X[($Xi-3)&7],@X[2]); # @X[2] holds the pbswap mask loaded in Xuplast_ssse3_80

443 eval(shift(@insns));

444 eval(shift(@insns));

445 &paddd (@X[($Xi-4)&7],@Tx[1]); # @Tx[1] holds K_00_19 loaded in Xuplast_ssse3_80

446 eval(shift(@insns));

447 eval(shift(@insns));

448 eval(shift(@insns));

449 eval(shift(@insns));

450 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU

451 eval(shift(@insns));

452 eval(shift(@insns));

453 &psubd (@X[($Xi-4)&7],@Tx[1]); # undo the paddd above, leaving X[] without K

454

455 foreach (@insns) { eval; } # remaining instructions

456 $Xi++;

457 }

458

459 sub Xtail_ssse3() # emits four rounds of $body with no SIMD work interleaved

460 { use integer;

461 my $body = shift; # code ref to a body_* snippet generator

462 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions

463 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

464

465 foreach (@insns) { eval; } # run every snippet in order

466 }

467

468 sub body_00_19 () { # returns the list of code snippets for one round; callers eval them one at a time

469 use integer;

470 my ($k,$n);

471 my @r=( # each element is Perl source that emits instructions when eval'ed

472 '($a,$b,$c,$d,$e)=@V;'.

473 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer

474 '&xor ($c,$d);',

475 '&mov (@T[1],$a);', # $b in next round

476 '&$_rol ($a,5);',

477 '&and (@T[0],$c);', # ($b&($c^$d))

478 '&xor ($c,$d);', # restore $c

479 '&xor (@T[0],$d);',

480 '&add ($e,$a);',

481 '&$_ror ($b,$j?7:2);', # $b>>>2

482 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T)) ;'

483 );

484 $n = scalar(@r); # snippets per round

485 $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds

486 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); # attach an AES step to snippet $k%$n when it falls in this round

487 $jj++; # one more round generated

488 return @r;

489 }

490

491 sub body_20_39 () { # returns the list of code snippets for one round; callers eval them one at a time

492 use integer;

493 my ($k,$n);

494 my @r=( # each element is Perl source that emits instructions when eval'ed

495 '($a,$b,$c,$d,$e)=@V;'.

496 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer

497 '&xor (@T[0],$d);', # ($b^$d)

498 '&mov (@T[1],$a);', # $b in next round

499 '&$_rol ($a,5);',

500 '&xor (@T[0],$c);', # ($b^$d^$c)

501 '&add ($e,$a);',

502 '&$_ror ($b,7);', # $b>>>2

503 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'

504 );

505 $n = scalar(@r); # snippets per round

506 $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds

507 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); # attach an AES step to snippet $k%$n when it falls in this round

508 $jj++; # one more round generated

509 return @r;

510 }

511

512 sub body_40_59 () { # returns the list of code snippets for one round; callers eval them one at a time

513 use integer;

514 my ($k,$n);

515 my @r=( # each element is Perl source that emits instructions when eval'ed

516 '($a,$b,$c,$d,$e)=@V;'.

517 '&mov (@T[1],$c);',

518 '&xor ($c,$d);',

519 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer

520 '&and (@T[1],$d);',

521 '&and (@T[0],$c);', # ($b&($c^$d))

522 '&$_ror ($b,7);', # $b>>>2

523 '&add ($e,@T[1]);',

524 '&mov (@T[1],$a);', # $b in next round

525 '&$_rol ($a,5);',

526 '&add ($e,@T[0]);',

527 '&xor ($c,$d);', # restore $c

528 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'

529 );

530 $n = scalar(@r); # snippets per round

531 $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds

532 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); # attach an AES step to snippet $k%$n when it falls in this round

533 $jj++; # one more round generated

534 return @r;

535 }

536 $code.=<<___;

537 .align 16

538 .Loop_ssse3:

539 ___

540 &Xupdate_ssse3_16_31(\&body_00_19);

541 &Xupdate_ssse3_16_31(\&body_00_19);

542 &Xupdate_ssse3_16_31(\&body_00_19);

543 &Xupdate_ssse3_16_31(\&body_00_19);

544 &Xupdate_ssse3_32_79(\&body_00_19);

545 &Xupdate_ssse3_32_79(\&body_20_39);

546 &Xupdate_ssse3_32_79(\&body_20_39);

547 &Xupdate_ssse3_32_79(\&body_20_39);

548 &Xupdate_ssse3_32_79(\&body_20_39);

549 &Xupdate_ssse3_32_79(\&body_20_39);

550 &Xupdate_ssse3_32_79(\&body_40_59);

551 &Xupdate_ssse3_32_79(\&body_40_59);

552 &Xupdate_ssse3_32_79(\&body_40_59);

553 &Xupdate_ssse3_32_79(\&body_40_59);

554 &Xupdate_ssse3_32_79(\&body_40_59);

555 &Xupdate_ssse3_32_79(\&body_20_39);

556 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"

557

558 $saved_j=$j; @saved_V=@V;

559 $saved_r=$r; @saved_rndkey=@rndkey;

560

561 &Xloop_ssse3(\&body_20_39);

562 &Xloop_ssse3(\&body_20_39);

563 &Xloop_ssse3(\&body_20_39);

564

565 $code.=<<___;

566 movups $iv,48($out,$in0) # write output

567 lea 64($in0),$in0

568

569 add 0($ctx),$A # update context

570 add 4($ctx),@T[0]

571 add 8($ctx),$C

572 add 12($ctx),$D

573 mov $A,0($ctx)

574 add 16($ctx),$E

575 mov @T[0],4($ctx)

576 mov @T[0],$B # magic seed

577 mov $C,8($ctx)

578 mov $D,12($ctx)

579 mov $E,16($ctx)

580 jmp .Loop_ssse3

581

582 .align 16

583 .Ldone_ssse3:

584 ___

585 $jj=$j=$saved_j; @V=@saved_V;

586 $r=$saved_r; @rndkey=@saved_rndkey;

587

588 &Xtail_ssse3(\&body_20_39);

589 &Xtail_ssse3(\&body_20_39);

590 &Xtail_ssse3(\&body_20_39);

591

592 $code.=<<___;

593 movups $iv,48($out,$in0) # write output

594 mov 88(%rsp),$ivp # restore $ivp

595

596 add 0($ctx),$A # update context

597 add 4($ctx),@T[0]

598 add 8($ctx),$C

599 mov $A,0($ctx)

600 add 12($ctx),$D

601 mov @T[0],4($ctx)

602 add 16($ctx),$E

603 mov $C,8($ctx)

604 mov $D,12($ctx)

605 mov $E,16($ctx)

606 movups $iv,($ivp) # write IV

607 ___

608 $code.=<<___ if ($win64);

609 movaps 96+0(%rsp),%xmm6

610 movaps 96+16(%rsp),%xmm7

611 movaps 96+32(%rsp),%xmm8

612 movaps 96+48(%rsp),%xmm9

613 movaps 96+64(%rsp),%xmm10

614 movaps 96+80(%rsp),%xmm11

615 movaps 96+96(%rsp),%xmm12

616 movaps 96+112(%rsp),%xmm13

617 movaps 96+128(%rsp),%xmm14

618 movaps 96+144(%rsp),%xmm15

619 ___

620 $code.=<<___;

621 lea `104+($win64?10*16:0)`(%rsp),%rsi

622 mov 0(%rsi),%r15

623 mov 8(%rsi),%r14

624 mov 16(%rsi),%r13

625 mov 24(%rsi),%r12

626 mov 32(%rsi),%rbp

627 mov 40(%rsi),%rbx

628 lea 48(%rsi),%rsp

629 .Lepilogue_ssse3:

630 ret

631 .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3

632 ___

633

634 $j=$jj=$r=$sn=0;

635

636 if ($avx) {

637 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9" ,"%r10"); # argument registers; $inp is loaded from the stack as the 7th argument

638

639 my $Xi=4; # index of the next X[] vector to produce

640 my @X=map("%xmm$_",(4..7,0..3)); # rotating window of X[] registers

641 my @Tx=map("%xmm$_",(8..10)); # rotating SIMD temporaries

642 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization

643 my @T=("%esi","%edi"); # scalar temporaries, swapped every round

644

645 my $_rol=sub { &shld(@_[0],@_) }; # the register is passed twice, so shld acts as a rotate

646 my $_ror=sub { &shrd(@_[0],@_) }; # likewise for shrd

647

648 $code.=<<___;

649 .type aesni_cbc_sha1_enc_avx,\@function,6

650 .align 16

651 aesni_cbc_sha1_enc_avx:

652 mov `($win64?56:8)`(%rsp),$inp # load 7th argument

653 #shr \$6,$len # debugging artefact

654 #jz .Lepilogue_avx # debugging artefact

655 push %rbx

656 push %rbp

657 push %r12

658 push %r13

659 push %r14

660 push %r15

661 lea `-104-($win64?10*16:0)`(%rsp),%rsp

662 #mov $in0,$inp # debugging artefact

663 #lea 64(%rsp),$ctx # debugging artefact

664 ___

665 $code.=<<___ if ($win64);

666 movaps %xmm6,96+0(%rsp)

667 movaps %xmm7,96+16(%rsp)

668 movaps %xmm8,96+32(%rsp)

669 movaps %xmm9,96+48(%rsp)

670 movaps %xmm10,96+64(%rsp)

671 movaps %xmm11,96+80(%rsp)

672 movaps %xmm12,96+96(%rsp)

673 movaps %xmm13,96+112(%rsp)

674 movaps %xmm14,96+128(%rsp)

675 movaps %xmm15,96+144(%rsp)

676 .Lprologue_avx:

677 ___

678 $code.=<<___;

679 vzeroall

680 mov $in0,%r12 # reassign arguments

681 mov $out,%r13

682 mov $len,%r14

683 mov $key,%r15

684 vmovdqu ($ivp),$iv # load IV

685 mov $ivp,88(%rsp) # save $ivp

686 ___

687 my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments

688 my $rounds="${ivp}d";

689 $code.=<<___;

690 shl \$6,$len

691 sub $in0,$out

692 mov 240($key),$rounds

693 add \$112,$key # size optimization

694 add $inp,$len # end of input

695

696 lea K_XX_XX(%rip),$K_XX_XX

697 mov 0($ctx),$A # load context

698 mov 4($ctx),$B

699 mov 8($ctx),$C

700 mov 12($ctx),$D

701 mov $B,@T[0] # magic seed

702 mov 16($ctx),$E

703

704 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask

705 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19

706 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]

707 vmovdqu 16($inp),@X[-3&7]

708 vmovdqu 32($inp),@X[-2&7]

709 vmovdqu 48($inp),@X[-1&7]

710 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap

711 add \$64,$inp

712 vpshufb @X[2],@X[-3&7],@X[-3&7]

713 vpshufb @X[2],@X[-2&7],@X[-2&7]

714 vpshufb @X[2],@X[-1&7],@X[-1&7]

715 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19

716 vpaddd @Tx[1],@X[-3&7],@X[1]

717 vpaddd @Tx[1],@X[-2&7],@X[2]

718 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU

719 vmovdqa @X[1],16(%rsp)

720 vmovdqa @X[2],32(%rsp)

721 vmovups -112($key),$rndkey0 # $key[0]

722 vmovups 16-112($key),$rndkey[0] # forward reference

723 jmp .Loop_avx

724 ___

725

726 my $aesenc=sub { # appends one AES-CBC step to $code per call; $r counts the calls

727 use integer;

728 my ($n,$k)=($r/10,$r%10); # $n selects the 16-byte block (offset 16*$n), $k the step within it

729 if ($k==0) { # first step of a block: load input, xor, first vaesenc

730 $code.=<<___;

731 vmovups `16*$n`($in0),$in # load input

732 vxorps $rndkey0,$in,$in

733 ___

734 $code.=<<___ if ($n); # not the first block: store the previous block's result

735 vmovups $iv,`16*($n-1)`($out,$in0) # write output

736 ___

737 $code.=<<___;

738 vxorps $in,$iv,$iv

739 vaesenc $rndkey[0],$iv,$iv

740 vmovups `32+16*$k-112`($key),$rndkey[1]

741 ___

742 } elsif ($k==9) { # last step: extra vaesenc pairs when $rounds is 11 or more, then vaesenclast

743 $sn++; # unique suffix for the .Lvaesenclast label

744 $code.=<<___;

745 cmp \$11,$rounds

746 jb .Lvaesenclast$sn

747 vaesenc $rndkey[0],$iv,$iv

748 vmovups `32+16*($k+0)-112`($key),$rndkey[1]

749 vaesenc $rndkey[1],$iv,$iv

750 vmovups `32+16*($k+1)-112`($key),$rndkey[0]

751 je .Lvaesenclast$sn

752 vaesenc $rndkey[0],$iv,$iv

753 vmovups `32+16*($k+2)-112`($key),$rndkey[1]

754 vaesenc $rndkey[1],$iv,$iv

755 vmovups `32+16*($k+3)-112`($key),$rndkey[0]

756 .Lvaesenclast$sn:

757 vaesenclast $rndkey[0],$iv,$iv

758 vmovups 16-112($key),$rndkey[1] # forward reference

759 ___

760 } else { # middle steps: one vaesenc plus a load of the following round key

761 $code.=<<___;

762 vaesenc $rndkey[0],$iv,$iv

763 vmovups `32+16*$k-112`($key),$rndkey[1]

764 ___

765 }

766 $r++; unshift(@rndkey,pop(@rndkey)); # count the step and swap the two round-key registers

767 };

768

769 sub Xupdate_avx_16_31() # recall that $Xi starts with 4

770 { use integer;

771 my $body = shift; # code ref to a body_* snippet generator

772 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions

773 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

774

775 eval(shift(@insns));

776 eval(shift(@insns));

777 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"

778 eval(shift(@insns));

779 eval(shift(@insns));

780

781 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);

782 eval(shift(@insns));

783 eval(shift(@insns));

784 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords

785 eval(shift(@insns));

786 eval(shift(@insns));

787 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"

788 eval(shift(@insns));

789 eval(shift(@insns));

790

791 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"

792 eval(shift(@insns));

793 eval(shift(@insns));

794 eval(shift(@insns));

795 eval(shift(@insns));

796

797 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"

798 eval(shift(@insns));

799 eval(shift(@insns));

800 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU

801 eval(shift(@insns));

802 eval(shift(@insns));

803

804 &vpsrld (@Tx[0],@X[0],31);

805 eval(shift(@insns));

806 eval(shift(@insns));

807 eval(shift(@insns));

808 eval(shift(@insns));

809

810 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword

811 &vpaddd (@X[0],@X[0],@X[0]);

812 eval(shift(@insns));

813 eval(shift(@insns));

814 eval(shift(@insns));

815 eval(shift(@insns));

816

817 &vpsrld (@Tx[1],@Tx[2],30);

818 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1

819 eval(shift(@insns));

820 eval(shift(@insns));

821 eval(shift(@insns));

822 eval(shift(@insns));

823

824 &vpslld (@Tx[2],@Tx[2],2);

825 &vpxor (@X[0],@X[0],@Tx[1]);

826 eval(shift(@insns));

827 eval(shift(@insns));

828 eval(shift(@insns));

829 eval(shift(@insns));

830

831 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2

832 eval(shift(@insns));

833 eval(shift(@insns));

834 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX

835 eval(shift(@insns));

836 eval(shift(@insns));

837

838

839 foreach (@insns) { eval; } # remaining instructions [if any]

840

841 $Xi++; push(@X,shift(@X)); # "rotate" X[]

842 push(@Tx,shift(@Tx));

843 }

844

845 sub Xupdate_avx_32_79() # produces the next X[] vector while interleaving four rounds of $body

846 { use integer;

847 my $body = shift; # code ref to a body_* snippet generator

848 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions

849 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

850

851 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"

852 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"

853 eval(shift(@insns)); # body_20_39

854 eval(shift(@insns));

855 eval(shift(@insns));

856 eval(shift(@insns)); # rol

857

858 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"

859 eval(shift(@insns));

860 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); # consume one more snippet unless it is a rotate

861 if ($Xi%5) {

862 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...

863 } else { # ... or load next one

864 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");

865 }

866 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);

867 eval(shift(@insns)); # ror

868 eval(shift(@insns));

869

870 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"

871 eval(shift(@insns)); # body_20_39

872 eval(shift(@insns));

873 eval(shift(@insns));

874 eval(shift(@insns)); # rol

875

876 &vpsrld (@Tx[0],@X[0],30);

877 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU

878 eval(shift(@insns));

879 eval(shift(@insns));

880 eval(shift(@insns)); # ror

881 eval(shift(@insns));

882

883 &vpslld (@X[0],@X[0],2);

884 eval(shift(@insns)); # body_20_39

885 eval(shift(@insns));

886 eval(shift(@insns));

887 eval(shift(@insns)); # rol

888 eval(shift(@insns));

889 eval(shift(@insns));

890 eval(shift(@insns)); # ror

891 eval(shift(@insns));

892

893 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2

894 eval(shift(@insns)); # body_20_39

895 eval(shift(@insns));

896 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); # copy is skipped once $Xi reaches 19

897 eval(shift(@insns));

898 eval(shift(@insns)); # rol

899 eval(shift(@insns));

900 eval(shift(@insns));

901 eval(shift(@insns)); # rol

902 eval(shift(@insns));

903

904 foreach (@insns) { eval; } # remaining instructions

905

906 $Xi++; push(@X,shift(@X)); # "rotate" X[]

907 push(@Tx,shift(@Tx));

908 }

909

910 sub Xuplast_avx_80() # last X[]+K transfer, end-of-input test, then start loading the next input

911 { use integer;

912 my $body = shift; # code ref to a body_* snippet generator

913 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions

914 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

915

916 eval(shift(@insns));

917 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);

918 eval(shift(@insns));

919 eval(shift(@insns));

920 eval(shift(@insns));

921 eval(shift(@insns));

922

923 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU; VEX form, matching the other AVX routines

924

925 foreach (@insns) { eval; } # remaining instructions

926

927 &cmp ($inp,$len); # $len holds the end-of-input pointer

928 &je (".Ldone_avx");

929

930 unshift(@Tx,pop(@Tx)); # rotate @Tx back by one position

931

932 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask

933 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19

934 &vmovdqu(@X[-4&7],"0($inp)"); # load input

935 &vmovdqu(@X[-3&7],"16($inp)");

936 &vmovdqu(@X[-2&7],"32($inp)");

937 &vmovdqu(@X[-1&7],"48($inp)");

938 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap

939 &add ($inp,64);

940

941 $Xi=0; # Xloop_avx counts up from 0

942 }

943

944 sub Xloop_avx() # prepares one vector of the freshly loaded input while interleaving four rounds of $body

945 { use integer;

946 my $body = shift; # code ref to a body_* snippet generator

947 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions

948 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

949

950 eval(shift(@insns));

951 eval(shift(@insns));

952 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); # @X[2] holds the pbswap mask loaded in Xuplast_avx_80

953 eval(shift(@insns));

954 eval(shift(@insns));

955 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); # sum goes to a separate register, so X[] itself stays intact

956 eval(shift(@insns));

957 eval(shift(@insns));

958 eval(shift(@insns));

959 eval(shift(@insns));

960 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU

961 eval(shift(@insns));

962 eval(shift(@insns));

963

964 foreach (@insns) { eval; } # remaining instructions

965 $Xi++;

966 }

967

968 sub Xtail_avx() # emits four rounds of $body with no SIMD work interleaved

969 { use integer;

970 my $body = shift; # code ref to a body_* snippet generator

971 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions

972 my ($a,$b,$c,$d,$e); # assigned from @V inside the eval'ed snippets

973

974 foreach (@insns) { eval; } # run every snippet in order

975 }

976

977 $code.=<<___;

978 .align 16

979 .Loop_avx:

980 ___

981 &Xupdate_avx_16_31(\&body_00_19);

982 &Xupdate_avx_16_31(\&body_00_19);

983 &Xupdate_avx_16_31(\&body_00_19);

984 &Xupdate_avx_16_31(\&body_00_19);

985 &Xupdate_avx_32_79(\&body_00_19);

986 &Xupdate_avx_32_79(\&body_20_39);

987 &Xupdate_avx_32_79(\&body_20_39);

988 &Xupdate_avx_32_79(\&body_20_39);

989 &Xupdate_avx_32_79(\&body_20_39);

990 &Xupdate_avx_32_79(\&body_20_39);

991 &Xupdate_avx_32_79(\&body_40_59);

992 &Xupdate_avx_32_79(\&body_40_59);

993 &Xupdate_avx_32_79(\&body_40_59);

994 &Xupdate_avx_32_79(\&body_40_59);

995 &Xupdate_avx_32_79(\&body_40_59);

996 &Xupdate_avx_32_79(\&body_20_39);

997 &Xuplast_avx_80(\&body_20_39); # can jump to "done"

998

999 $saved_j=$j; @saved_V=@V;

1000 $saved_r=$r; @saved_rndkey=@rndkey;

1001

1002 &Xloop_avx(\&body_20_39);

1003 &Xloop_avx(\&body_20_39);

1004 &Xloop_avx(\&body_20_39);

1005

1006 $code.=<<___;

1007 vmovups $iv,48($out,$in0) # write output

1008 lea 64($in0),$in0

1009

1010 add 0($ctx),$A # update context

1011 add 4($ctx),@T[0]

1012 add 8($ctx),$C

1013 add 12($ctx),$D

1014 mov $A,0($ctx)

1015 add 16($ctx),$E

1016 mov @T[0],4($ctx)

1017 mov @T[0],$B # magic seed

1018 mov $C,8($ctx)

1019 mov $D,12($ctx)

1020 mov $E,16($ctx)

1021 jmp .Loop_avx

1022

1023 .align 16

1024 .Ldone_avx:

1025 ___

1026 $jj=$j=$saved_j; @V=@saved_V;

1027 $r=$saved_r; @rndkey=@saved_rndkey;

1028

1029 &Xtail_avx(\&body_20_39);

1030 &Xtail_avx(\&body_20_39);

1031 &Xtail_avx(\&body_20_39);

1032

1033 $code.=<<___;

1034 vmovups $iv,48($out,$in0) # write output

1035 mov 88(%rsp),$ivp # restore $ivp

1036

1037 add 0($ctx),$A # update context

1038 add 4($ctx),@T[0]

1039 add 8($ctx),$C

1040 mov $A,0($ctx)

1041 add 12($ctx),$D

1042 mov @T[0],4($ctx)

1043 add 16($ctx),$E

1044 mov $C,8($ctx)

1045 mov $D,12($ctx)

1046 mov $E,16($ctx)

1047 vmovups $iv,($ivp) # write IV

1048 vzeroall

1049 ___

1050 $code.=<<___ if ($win64);

1051 movaps 96+0(%rsp),%xmm6

1052 movaps 96+16(%rsp),%xmm7

1053 movaps 96+32(%rsp),%xmm8

1054 movaps 96+48(%rsp),%xmm9

1055 movaps 96+64(%rsp),%xmm10

1056 movaps 96+80(%rsp),%xmm11

1057 movaps 96+96(%rsp),%xmm12

1058 movaps 96+112(%rsp),%xmm13

1059 movaps 96+128(%rsp),%xmm14

1060 movaps 96+144(%rsp),%xmm15

1061 ___

1062 $code.=<<___;

1063 lea `104+($win64?10*16:0)`(%rsp),%rsi

1064 mov 0(%rsi),%r15

1065 mov 8(%rsi),%r14

1066 mov 16(%rsi),%r13

1067 mov 24(%rsi),%r12

1068 mov 32(%rsi),%rbp

1069 mov 40(%rsi),%rbx

1070 lea 48(%rsi),%rsp

1071 .Lepilogue_avx:

1072 ret

1073 .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx

1074 ___

1075 }

# Append the 64-byte-aligned constant pool labelled K_XX_XX to the generated
# assembly.  Each of the first four rows repeats one 32-bit constant four
# times (rows are tagged K_00_19 .. K_60_79 in the assembler comments); the
# fifth row is tagged as the pbswap mask.  An identification string and a
# trailing 64-byte alignment directive follow the table.
1076 $code.=<<___;

1077 .align 64

1078 K_XX_XX:

1079 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19

1080 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39

1081 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59

1082 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79

1083 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask

1084

1085 .asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

1086 .align 64

1087 ___

1088

# Win64-only section: emits the structured-exception handler ssse3_handler
# together with the .pdata/.xdata tables that reference it.  The same handler
# is registered for the SSSE3 entry point and, when $avx is set, for the AVX
# entry point; the two differ only in the prologue/epilogue labels passed as
# HandlerData[].
#
# Handler outline, as emitted below:
#   * If context->Rip is below the prologue label, or at/after the epilogue
#     label, it goes straight to .Lcommon_seh_tail.
#   * Otherwise it copies 20 quadwords from 96(%rax) into the CONTEXT at
#     offset 512 (annotated &context.Xmm6), advances %rax by 104+10*16, and
#     reloads r15, r14, r13, r12, rbp and rbx from that frame into the CONTEXT.
#   * The common tail stores Rsp/Rsi/Rdi into the CONTEXT, copies the CONTEXT
#     to disp->ContextRecord, calls RtlVirtualUnwind and returns 1
#     (ExceptionContinueSearch).
1089 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,

1090 # CONTEXT *context,DISPATCHER_CONTEXT *disp)

1091 if ($win64) {

1092 $rec="%rcx";

1093 $frame="%rdx";

1094 $context="%r8";

1095 $disp="%r9";

1096

1097 $code.=<<___;

1098 .extern __imp_RtlVirtualUnwind

1099 .type ssse3_handler,\@abi-omnipotent

1100 .align 16

1101 ssse3_handler:

1102 push %rsi

1103 push %rdi

1104 push %rbx

1105 push %rbp

1106 push %r12

1107 push %r13

1108 push %r14

1109 push %r15

1110 pushfq

1111 sub \$64,%rsp

1112

1113 mov 120($context),%rax # pull context->Rax

1114 mov 248($context),%rbx # pull context->Rip

1115

1116 mov 8($disp),%rsi # disp->ImageBase

1117 mov 56($disp),%r11 # disp->HandlerData

1118

1119 mov 0(%r11),%r10d # HandlerData[0]

1120 lea (%rsi,%r10),%r10 # prologue label

1121 cmp %r10,%rbx # context->Rip<prologue label

1122 jb .Lcommon_seh_tail

1123

1124 mov 152($context),%rax # pull context->Rsp

1125

1126 mov 4(%r11),%r10d # HandlerData[1]

1127 lea (%rsi,%r10),%r10 # epilogue label

1128 cmp %r10,%rbx # context->Rip>=epilogue label

1129 jae .Lcommon_seh_tail

1130

1131 lea 96(%rax),%rsi

1132 lea 512($context),%rdi # &context.Xmm6

1133 mov \$20,%ecx

1134 .long 0xa548f3fc # cld; rep movsq

1135 lea `104+10*16`(%rax),%rax # adjust stack pointer

1136

1137 mov 0(%rax),%r15

1138 mov 8(%rax),%r14

1139 mov 16(%rax),%r13

1140 mov 24(%rax),%r12

1141 mov 32(%rax),%rbp

1142 mov 40(%rax),%rbx

1143 lea 48(%rax),%rax

1144 mov %rbx,144($context) # restore context->Rbx

1145 mov %rbp,160($context) # restore context->Rbp

1146 mov %r12,216($context) # restore context->R12

1147 mov %r13,224($context) # restore context->R13

1148 mov %r14,232($context) # restore context->R14

1149 mov %r15,240($context) # restore context->R15

1150

1151 .Lcommon_seh_tail:

1152 mov 8(%rax),%rdi

1153 mov 16(%rax),%rsi

1154 mov %rax,152($context) # restore context->Rsp

1155 mov %rsi,168($context) # restore context->Rsi

1156 mov %rdi,176($context) # restore context->Rdi

1157

1158 mov 40($disp),%rdi # disp->ContextRecord

1159 mov $context,%rsi # context

1160 mov \$154,%ecx # sizeof(CONTEXT)

1161 .long 0xa548f3fc # cld; rep movsq

1162

1163 mov $disp,%rsi

1164 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER

1165 mov 8(%rsi),%rdx # arg2, disp->ImageBase

1166 mov 0(%rsi),%r8 # arg3, disp->ControlPc

1167 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry

1168 mov 40(%rsi),%r10 # disp->ContextRecord

1169 lea 56(%rsi),%r11 # &disp->HandlerData

1170 lea 24(%rsi),%r12 # &disp->EstablisherFrame

1171 mov %r10,32(%rsp) # arg5

1172 mov %r11,40(%rsp) # arg6

1173 mov %r12,48(%rsp) # arg7

1174 mov %rcx,56(%rsp) # arg8, (NULL)

1175 call *__imp_RtlVirtualUnwind(%rip)

1176

1177 mov \$1,%eax # ExceptionContinueSearch

1178 add \$64,%rsp

1179 popfq

1180 pop %r15

1181 pop %r14

1182 pop %r13

1183 pop %r12

1184 pop %rbp

1185 pop %rbx

1186 pop %rdi

1187 pop %rsi

1188 ret

1189 .size ssse3_handler,.-ssse3_handler

1190

1191 .section .pdata

1192 .align 4

1193 .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3

1194 .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3

1195 .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3

1196 ___

# The AVX function gets its own .pdata begin/end/info triple only when the
# AVX code path was generated.
1197 $code.=<<___ if ($avx);

1198 .rva .LSEH_begin_aesni_cbc_sha1_enc_avx

1199 .rva .LSEH_end_aesni_cbc_sha1_enc_avx

1200 .rva .LSEH_info_aesni_cbc_sha1_enc_avx

1201 ___

# .xdata record for the SSSE3 function: names ssse3_handler and supplies the
# prologue/epilogue labels the handler reads as HandlerData[0] and [1].
1202 $code.=<<___;

1203 .section .xdata

1204 .align 8

1205 .LSEH_info_aesni_cbc_sha1_enc_ssse3:

1206 .byte 9,0,0,0

1207 .rva ssse3_handler

1208 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]

1209 ___

# Matching .xdata record for the AVX function; it reuses ssse3_handler with
# the AVX prologue/epilogue labels.
1210 $code.=<<___ if ($avx);

1211 .LSEH_info_aesni_cbc_sha1_enc_avx:

1212 .byte 9,0,0,0

1213 .rva ssse3_handler

1214 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]

1215 ___

1216 }

1217

1218 ####################################################################

# rex(\@opcode, $dst, $src)
#
# Appends an x86-64 REX prefix byte to the opcode array referenced by the
# first argument, but only when one of the register numbers requires it:
#   $dst >= 8  sets bit 0x04 (REX.R, extends the ModR/M reg field)
#   $src >= 8  sets bit 0x01 (REX.B, extends the ModR/M r/m field)
# When both register numbers are below 8 the array is left untouched.
#
# The array is modified in place through the reference; callers do not use
# the return value.
sub rex {
    my ($opcode_ref, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);
    $rex |= 0x01 if ($src >= 8);

    # 0x40 is the fixed high nibble that marks the byte as a REX prefix.
    push @{$opcode_ref}, $rex | 0x40 if ($rex);
}

1228

# aesni($line)
#
# Translates one register-to-register AES-NI instruction written in AT&T
# syntax ("aesenc %xmmS,%xmmD") into an explicit ".byte" directive, so the
# output assembles even with tool chains that do not know the mnemonic.
#
# Returns:
#   * ".byte\t<comma-separated decimal bytes>" for aesenc / aesenclast;
#   * undef when the line looks like an AES-NI instruction but the mnemonic
#     is not in the table below;
#   * $line unchanged when it does not match the instruction pattern at all.
sub aesni {
    my $line = shift;

    if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures immediately so later calls cannot disturb them.
        my ($mnemonic, $src, $dst) = ($1, $2, $3);

        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd
        );
        return undef if (!defined($opcodelet{$mnemonic}));

        my @opcode = (0x66);                    # mandatory operand-size prefix
        rex(\@opcode, $dst, $src);              # optional REX for xmm8..xmm15
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    return $line;
}

1245

# Post-process the accumulated assembly text and write it out.

# Evaluate every `...` span as a Perl expression and splice in the result
# (used in the templates for constant arithmetic such as stack offsets).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Hand each AES-NI instruction, from the mnemonic through its last %xmm
# operand, to aesni(); anything after that operand on the same line is
# dropped along with the original instruction text.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Buffered write errors only surface at close, so do not ignore its result.
close STDOUT or die "error closing STDOUT: $!";

OLD	NEW

« no previous file with comments | « openssl/crypto/aes/asm/aesni-sha1-x86_64.S ('k') | openssl/crypto/aes/asm/aesni-x86.S » ('j') | no next file with comments »