OLD | NEW |
1 #!/usr/bin/env perl | 1 #!/usr/bin/env perl |
2 # | 2 # |
3 # ==================================================================== | 3 # ==================================================================== |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
5 # project. Rights for redistribution and usage in source and binary | 5 # project. The module is, however, dual licensed under OpenSSL and |
6 # forms are granted according to the OpenSSL license. | 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. |
7 # ==================================================================== | 8 # ==================================================================== |
8 # | 9 # |
9 # Version 1.2. | 10 # Version 2.1. |
10 # | 11 # |
11 # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on | 12 # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on |
12 # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version | 13 # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version |
13 # [you'll notice a lot of resemblance], such as compressed S-boxes | 14 # [you'll notice a lot of resemblance], such as compressed S-boxes |
14 # in little-endian byte order, prefetch of these tables in CBC mode, | 15 # in little-endian byte order, prefetch of these tables in CBC mode, |
15 # as well as avoiding L1 cache aliasing between stack frame and key | 16 # as well as avoiding L1 cache aliasing between stack frame and key |
16 # schedule and already mentioned tables, compressed Td4... | 17 # schedule and already mentioned tables, compressed Td4... |
17 # | 18 # |
18 # Performance in number of cycles per processed byte for 128-bit key: | 19 # Performance in number of cycles per processed byte for 128-bit key: |
19 # | 20 # |
20 #» » ECB» » CBC encrypt | 21 #» » ECB encrypt» ECB decrypt» CBC large chunk |
21 # AMD64»» 13.7» » 13.0(*) | 22 # AMD64»» 33» » 41» » 13.0 |
22 # EM64T»» 20.2» » 18.6(*) | 23 # EM64T»» 38» » 59» » 18.6(*) |
| 24 # Core 2» 30» » 43» » 14.5(*) |
23 # | 25 # |
24 # (*)» CBC benchmarks are better than ECB thanks to custom ABI used | 26 # (*) with hyper-threading off |
25 #» by the private block encryption function. | 27 |
| 28 $flavour = shift; |
| 29 $output = shift; |
| 30 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| 31 |
| 32 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| 33 |
| 34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 35 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 36 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 37 die "can't locate x86_64-xlate.pl"; |
| 38 |
| 39 open STDOUT,"| $^X $xlate $flavour $output"; |
26 | 40 |
27 $verticalspin=1; # unlike 32-bit version $verticalspin performs | 41 $verticalspin=1; # unlike 32-bit version $verticalspin performs |
28 # ~15% better on both AMD and Intel cores | 42 # ~15% better on both AMD and Intel cores |
29 $output=shift; | 43 $speed_limit=512;» # see aes-586.pl for details |
30 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; | |
31 | 44 |
32 $code=".text\n"; | 45 $code=".text\n"; |
33 | 46 |
34 $s0="%eax"; | 47 $s0="%eax"; |
35 $s1="%ebx"; | 48 $s1="%ebx"; |
36 $s2="%ecx"; | 49 $s2="%ecx"; |
37 $s3="%edx"; | 50 $s3="%edx"; |
38 $acc0="%esi"; | 51 $acc0="%esi";» $mask80="%rsi"; |
39 $acc1="%edi"; | 52 $acc1="%edi";» $maskfe="%rdi"; |
40 $acc2="%ebp"; | 53 $acc2="%ebp";» $mask1b="%rbp"; |
41 $inp="%r8"; | 54 $inp="%r8"; |
42 $out="%r9"; | 55 $out="%r9"; |
43 $t0="%r10d"; | 56 $t0="%r10d"; |
44 $t1="%r11d"; | 57 $t1="%r11d"; |
45 $t2="%r12d"; | 58 $t2="%r12d"; |
46 $rnds="%r13d"; | 59 $rnds="%r13d"; |
47 $sbox="%r14"; | 60 $sbox="%r14"; |
48 $key="%r15"; | 61 $key="%r15"; |
49 | 62 |
50 sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } | 63 sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } |
51 sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; | 64 sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; |
52 $r =~ s/%[er]([sd]i)/%\1l/; | 65 $r =~ s/%[er]([sd]i)/%\1l/; |
53 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } | 66 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } |
| 67 sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/; |
| 68 $r =~ s/%r([0-9]+)/%r\1d/; $r; } |
54 sub _data_word() | 69 sub _data_word() |
55 { my $i; | 70 { my $i; |
56 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } | 71 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } |
57 } | 72 } |
58 sub data_word() | 73 sub data_word() |
59 { my $i; | 74 { my $i; |
60 my $last=pop(@_); | 75 my $last=pop(@_); |
61 $code.=".long\t"; | 76 $code.=".long\t"; |
62 while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; } | 77 while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; } |
63 $code.=sprintf"0x%08x\n",$last; | 78 $code.=sprintf"0x%08x\n",$last; |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
131 ___ | 146 ___ |
132 } | 147 } |
133 | 148 |
134 sub enclastvert() | 149 sub enclastvert() |
135 { my $t3="%r8d"; # zaps $inp! | 150 { my $t3="%r8d"; # zaps $inp! |
136 | 151 |
137 $code.=<<___; | 152 $code.=<<___; |
138 movzb `&lo("$s0")`,$acc0 | 153 movzb `&lo("$s0")`,$acc0 |
139 movzb `&lo("$s1")`,$acc1 | 154 movzb `&lo("$s1")`,$acc1 |
140 movzb `&lo("$s2")`,$acc2 | 155 movzb `&lo("$s2")`,$acc2 |
141 » mov» 2($sbox,$acc0,8),$t0 | 156 » movzb» 2($sbox,$acc0,8),$t0 |
142 » mov» 2($sbox,$acc1,8),$t1 | 157 » movzb» 2($sbox,$acc1,8),$t1 |
143 » mov» 2($sbox,$acc2,8),$t2 | 158 » movzb» 2($sbox,$acc2,8),$t2 |
144 | |
145 » and» \$0x000000ff,$t0 | |
146 » and» \$0x000000ff,$t1 | |
147 » and» \$0x000000ff,$t2 | |
148 | 159 |
149 movzb `&lo("$s3")`,$acc0 | 160 movzb `&lo("$s3")`,$acc0 |
150 movzb `&hi("$s1")`,$acc1 | 161 movzb `&hi("$s1")`,$acc1 |
151 movzb `&hi("$s2")`,$acc2 | 162 movzb `&hi("$s2")`,$acc2 |
152 » mov» 2($sbox,$acc0,8),$t3 | 163 » movzb» 2($sbox,$acc0,8),$t3 |
153 mov 0($sbox,$acc1,8),$acc1 #$t0 | 164 mov 0($sbox,$acc1,8),$acc1 #$t0 |
154 mov 0($sbox,$acc2,8),$acc2 #$t1 | 165 mov 0($sbox,$acc2,8),$acc2 #$t1 |
155 | 166 |
156 and \$0x000000ff,$t3 | |
157 and \$0x0000ff00,$acc1 | 167 and \$0x0000ff00,$acc1 |
158 and \$0x0000ff00,$acc2 | 168 and \$0x0000ff00,$acc2 |
159 | 169 |
160 xor $acc1,$t0 | 170 xor $acc1,$t0 |
161 xor $acc2,$t1 | 171 xor $acc2,$t1 |
162 shr \$16,$s2 | 172 shr \$16,$s2 |
163 | 173 |
164 movzb `&hi("$s3")`,$acc0 | 174 movzb `&hi("$s3")`,$acc0 |
165 movzb `&hi("$s0")`,$acc1 | 175 movzb `&hi("$s0")`,$acc1 |
166 shr \$16,$s3 | 176 shr \$16,$s3 |
(...skipping 171 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
338 xor 16+4($key),$s1 | 348 xor 16+4($key),$s1 |
339 xor 16+8($key),$s2 | 349 xor 16+8($key),$s2 |
340 xor 16+12($key),$s3 | 350 xor 16+12($key),$s3 |
341 ___ | 351 ___ |
342 } | 352 } |
343 $code.=<<___; | 353 $code.=<<___; |
344 .byte 0xf3,0xc3 # rep ret | 354 .byte 0xf3,0xc3 # rep ret |
345 .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt | 355 .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt |
346 ___ | 356 ___ |
347 | 357 |
| 358 # it's possible to implement this by shifting tN by 8, filling least |
| 359 # significant byte with byte load and finally bswap-ing at the end, |
| 360 # but such partial register load kills Core 2... |
| 361 sub enccompactvert() |
| 362 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); |
| 363 |
| 364 $code.=<<___; |
| 365 movzb `&lo("$s0")`,$t0 |
| 366 movzb `&lo("$s1")`,$t1 |
| 367 movzb `&lo("$s2")`,$t2 |
| 368 movzb ($sbox,$t0,1),$t0 |
| 369 movzb ($sbox,$t1,1),$t1 |
| 370 movzb ($sbox,$t2,1),$t2 |
| 371 |
| 372 movzb `&lo("$s3")`,$t3 |
| 373 movzb `&hi("$s1")`,$acc0 |
| 374 movzb `&hi("$s2")`,$acc1 |
| 375 movzb ($sbox,$t3,1),$t3 |
| 376 movzb ($sbox,$acc0,1),$t4 #$t0 |
| 377 movzb ($sbox,$acc1,1),$t5 #$t1 |
| 378 |
| 379 movzb `&hi("$s3")`,$acc2 |
| 380 movzb `&hi("$s0")`,$acc0 |
| 381 shr \$16,$s2 |
| 382 movzb ($sbox,$acc2,1),$acc2 #$t2 |
| 383 movzb ($sbox,$acc0,1),$acc0 #$t3 |
| 384 shr \$16,$s3 |
| 385 |
| 386 movzb `&lo("$s2")`,$acc1 |
| 387 shl \$8,$t4 |
| 388 shl \$8,$t5 |
| 389 movzb ($sbox,$acc1,1),$acc1 #$t0 |
| 390 xor $t4,$t0 |
| 391 xor $t5,$t1 |
| 392 |
| 393 movzb `&lo("$s3")`,$t4 |
| 394 shr \$16,$s0 |
| 395 shr \$16,$s1 |
| 396 movzb `&lo("$s0")`,$t5 |
| 397 shl \$8,$acc2 |
| 398 shl \$8,$acc0 |
| 399 movzb ($sbox,$t4,1),$t4 #$t1 |
| 400 movzb ($sbox,$t5,1),$t5 #$t2 |
| 401 xor $acc2,$t2 |
| 402 xor $acc0,$t3 |
| 403 |
| 404 movzb `&lo("$s1")`,$acc2 |
| 405 movzb `&hi("$s3")`,$acc0 |
| 406 shl \$16,$acc1 |
| 407 movzb ($sbox,$acc2,1),$acc2 #$t3 |
| 408 movzb ($sbox,$acc0,1),$acc0 #$t0 |
| 409 xor $acc1,$t0 |
| 410 |
| 411 movzb `&hi("$s0")`,$acc1 |
| 412 shr \$8,$s2 |
| 413 shr \$8,$s1 |
| 414 movzb ($sbox,$acc1,1),$acc1 #$t1 |
| 415 movzb ($sbox,$s2,1),$s3 #$t3 |
| 416 movzb ($sbox,$s1,1),$s2 #$t2 |
| 417 shl \$16,$t4 |
| 418 shl \$16,$t5 |
| 419 shl \$16,$acc2 |
| 420 xor $t4,$t1 |
| 421 xor $t5,$t2 |
| 422 xor $acc2,$t3 |
| 423 |
| 424 shl \$24,$acc0 |
| 425 shl \$24,$acc1 |
| 426 shl \$24,$s3 |
| 427 xor $acc0,$t0 |
| 428 shl \$24,$s2 |
| 429 xor $acc1,$t1 |
| 430 mov $t0,$s0 |
| 431 mov $t1,$s1 |
| 432 xor $t2,$s2 |
| 433 xor $t3,$s3 |
| 434 ___ |
| 435 } |
| 436 |
| 437 sub enctransform_ref() |
| 438 { my $sn = shift; |
| 439 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d"); |
| 440 |
| 441 $code.=<<___; |
| 442 mov $sn,$acc |
| 443 and \$0x80808080,$acc |
| 444 mov $acc,$tmp |
| 445 shr \$7,$tmp |
| 446 lea ($sn,$sn),$r2 |
| 447 sub $tmp,$acc |
| 448 and \$0xfefefefe,$r2 |
| 449 and \$0x1b1b1b1b,$acc |
| 450 mov $sn,$tmp |
| 451 xor $acc,$r2 |
| 452 |
| 453 xor $r2,$sn |
| 454 rol \$24,$sn |
| 455 xor $r2,$sn |
| 456 ror \$16,$tmp |
| 457 xor $tmp,$sn |
| 458 ror \$8,$tmp |
| 459 xor $tmp,$sn |
| 460 ___ |
| 461 } |
| 462 |
| 463 # unlike decrypt case it does not pay off to parallelize enctransform |
| 464 sub enctransform() |
| 465 { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); |
| 466 |
| 467 $code.=<<___; |
| 468 mov $s0,$acc0 |
| 469 mov $s1,$acc1 |
| 470 and \$0x80808080,$acc0 |
| 471 and \$0x80808080,$acc1 |
| 472 mov $acc0,$t0 |
| 473 mov $acc1,$t1 |
| 474 shr \$7,$t0 |
| 475 lea ($s0,$s0),$r20 |
| 476 shr \$7,$t1 |
| 477 lea ($s1,$s1),$r21 |
| 478 sub $t0,$acc0 |
| 479 sub $t1,$acc1 |
| 480 and \$0xfefefefe,$r20 |
| 481 and \$0xfefefefe,$r21 |
| 482 and \$0x1b1b1b1b,$acc0 |
| 483 and \$0x1b1b1b1b,$acc1 |
| 484 mov $s0,$t0 |
| 485 mov $s1,$t1 |
| 486 xor $acc0,$r20 |
| 487 xor $acc1,$r21 |
| 488 |
| 489 xor $r20,$s0 |
| 490 xor $r21,$s1 |
| 491 mov $s2,$acc0 |
| 492 mov $s3,$acc1 |
| 493 rol \$24,$s0 |
| 494 rol \$24,$s1 |
| 495 and \$0x80808080,$acc0 |
| 496 and \$0x80808080,$acc1 |
| 497 xor $r20,$s0 |
| 498 xor $r21,$s1 |
| 499 mov $acc0,$t2 |
| 500 mov $acc1,$t3 |
| 501 ror \$16,$t0 |
| 502 ror \$16,$t1 |
| 503 shr \$7,$t2 |
| 504 lea ($s2,$s2),$r20 |
| 505 xor $t0,$s0 |
| 506 xor $t1,$s1 |
| 507 shr \$7,$t3 |
| 508 lea ($s3,$s3),$r21 |
| 509 ror \$8,$t0 |
| 510 ror \$8,$t1 |
| 511 sub $t2,$acc0 |
| 512 sub $t3,$acc1 |
| 513 xor $t0,$s0 |
| 514 xor $t1,$s1 |
| 515 |
| 516 and \$0xfefefefe,$r20 |
| 517 and \$0xfefefefe,$r21 |
| 518 and \$0x1b1b1b1b,$acc0 |
| 519 and \$0x1b1b1b1b,$acc1 |
| 520 mov $s2,$t2 |
| 521 mov $s3,$t3 |
| 522 xor $acc0,$r20 |
| 523 xor $acc1,$r21 |
| 524 |
| 525 xor $r20,$s2 |
| 526 xor $r21,$s3 |
| 527 rol \$24,$s2 |
| 528 rol \$24,$s3 |
| 529 xor $r20,$s2 |
| 530 xor $r21,$s3 |
| 531 mov 0($sbox),$acc0 # prefetch Te4 |
| 532 ror \$16,$t2 |
| 533 ror \$16,$t3 |
| 534 mov 64($sbox),$acc1 |
| 535 xor $t2,$s2 |
| 536 xor $t3,$s3 |
| 537 mov 128($sbox),$r20 |
| 538 ror \$8,$t2 |
| 539 ror \$8,$t3 |
| 540 mov 192($sbox),$r21 |
| 541 xor $t2,$s2 |
| 542 xor $t3,$s3 |
| 543 ___ |
| 544 } |
| 545 |
| 546 $code.=<<___; |
| 547 .type _x86_64_AES_encrypt_compact,\@abi-omnipotent |
| 548 .align 16 |
| 549 _x86_64_AES_encrypt_compact: |
| 550 lea 128($sbox),$inp # size optimization |
| 551 mov 0-128($inp),$acc1 # prefetch Te4 |
| 552 mov 32-128($inp),$acc2 |
| 553 mov 64-128($inp),$t0 |
| 554 mov 96-128($inp),$t1 |
| 555 mov 128-128($inp),$acc1 |
| 556 mov 160-128($inp),$acc2 |
| 557 mov 192-128($inp),$t0 |
| 558 mov 224-128($inp),$t1 |
| 559 jmp .Lenc_loop_compact |
| 560 .align 16 |
| 561 .Lenc_loop_compact: |
| 562 xor 0($key),$s0 # xor with key |
| 563 xor 4($key),$s1 |
| 564 xor 8($key),$s2 |
| 565 xor 12($key),$s3 |
| 566 lea 16($key),$key |
| 567 ___ |
| 568 &enccompactvert(); |
| 569 $code.=<<___; |
| 570 cmp 16(%rsp),$key |
| 571 je .Lenc_compact_done |
| 572 ___ |
| 573 &enctransform(); |
| 574 $code.=<<___; |
| 575 jmp .Lenc_loop_compact |
| 576 .align 16 |
| 577 .Lenc_compact_done: |
| 578 xor 0($key),$s0 |
| 579 xor 4($key),$s1 |
| 580 xor 8($key),$s2 |
| 581 xor 12($key),$s3 |
| 582 .byte 0xf3,0xc3 # rep ret |
| 583 .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact |
| 584 ___ |
| 585 |
348 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); | 586 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); |
349 $code.=<<___; | 587 $code.=<<___; |
350 .globl AES_encrypt | 588 .globl AES_encrypt |
351 .type AES_encrypt,\@function,3 | 589 .type AES_encrypt,\@function,3 |
352 .align 16 | 590 .align 16 |
353 AES_encrypt: | 591 AES_encrypt: |
354 push %rbx | 592 push %rbx |
355 push %rbp | 593 push %rbp |
356 push %r12 | 594 push %r12 |
357 push %r13 | 595 push %r13 |
358 push %r14 | 596 push %r14 |
359 push %r15 | 597 push %r15 |
360 | 598 |
| 599 # allocate frame "above" key schedule |
| 600 mov %rsp,%r10 |
| 601 lea -63(%rdx),%rcx # %rdx is key argument |
| 602 and \$-64,%rsp |
| 603 sub %rsp,%rcx |
| 604 neg %rcx |
| 605 and \$0x3c0,%rcx |
| 606 sub %rcx,%rsp |
| 607 sub \$32,%rsp |
| 608 |
| 609 mov %rsi,16(%rsp) # save out |
| 610 mov %r10,24(%rsp) # save real stack pointer |
| 611 .Lenc_prologue: |
| 612 |
361 mov %rdx,$key | 613 mov %rdx,$key |
362 » mov» %rdi,$inp | 614 » mov» 240($key),$rnds»# load rounds |
363 » mov» %rsi,$out | |
364 | 615 |
365 » .picmeup» $sbox | 616 » mov» 0(%rdi),$s0» # load input vector |
366 » lea» AES_Te-.($sbox),$sbox | 617 » mov» 4(%rdi),$s1 |
| 618 » mov» 8(%rdi),$s2 |
| 619 » mov» 12(%rdi),$s3 |
367 | 620 |
368 » mov» 0($inp),$s0 | 621 » shl» \$4,$rnds |
369 » mov» 4($inp),$s1 | 622 » lea» ($key,$rnds),%rbp |
370 » mov» 8($inp),$s2 | 623 » mov» $key,(%rsp)» # key schedule |
371 » mov» 12($inp),$s3 | 624 » mov» %rbp,8(%rsp)» # end of key schedule |
372 | 625 |
373 » call» _x86_64_AES_encrypt | 626 » # pick Te4 copy which can't "overlap" with stack frame or key schedule |
| 627 » lea» .LAES_Te+2048(%rip),$sbox |
| 628 » lea» 768(%rsp),%rbp |
| 629 » sub» $sbox,%rbp |
| 630 » and» \$0x300,%rbp |
| 631 » lea» ($sbox,%rbp),$sbox |
374 | 632 |
375 » mov» $s0,0($out) | 633 » call» _x86_64_AES_encrypt_compact |
| 634 |
| 635 » mov» 16(%rsp),$out» # restore out |
| 636 » mov» 24(%rsp),%rsi» # restore saved stack pointer |
| 637 » mov» $s0,0($out)» # write output vector |
376 mov $s1,4($out) | 638 mov $s1,4($out) |
377 mov $s2,8($out) | 639 mov $s2,8($out) |
378 mov $s3,12($out) | 640 mov $s3,12($out) |
379 | 641 |
380 » pop» %r15 | 642 » mov» (%rsi),%r15 |
381 » pop» %r14 | 643 » mov» 8(%rsi),%r14 |
382 » pop» %r13 | 644 » mov» 16(%rsi),%r13 |
383 » pop» %r12 | 645 » mov» 24(%rsi),%r12 |
384 » pop» %rbp | 646 » mov» 32(%rsi),%rbp |
385 » pop» %rbx | 647 » mov» 40(%rsi),%rbx |
| 648 » lea» 48(%rsi),%rsp |
| 649 .Lenc_epilogue: |
386 ret | 650 ret |
387 .size AES_encrypt,.-AES_encrypt | 651 .size AES_encrypt,.-AES_encrypt |
388 ___ | 652 ___ |
389 | 653 |
390 #------------------------------------------------------------------# | 654 #------------------------------------------------------------------# |
391 | 655 |
392 sub decvert() | 656 sub decvert() |
393 { my $t3="%r8d"; # zaps $inp! | 657 { my $t3="%r8d"; # zaps $inp! |
394 | 658 |
395 $code.=<<___; | 659 $code.=<<___; |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
446 xor $t2,$s2 | 710 xor $t2,$s2 |
447 xor $t1,$s1 | 711 xor $t1,$s1 |
448 xor $t3,$s3 | 712 xor $t3,$s3 |
449 ___ | 713 ___ |
450 } | 714 } |
451 | 715 |
452 sub declastvert() | 716 sub declastvert() |
453 { my $t3="%r8d"; # zaps $inp! | 717 { my $t3="%r8d"; # zaps $inp! |
454 | 718 |
455 $code.=<<___; | 719 $code.=<<___; |
| 720 lea 2048($sbox),$sbox # size optimization |
456 movzb `&lo("$s0")`,$acc0 | 721 movzb `&lo("$s0")`,$acc0 |
457 movzb `&lo("$s1")`,$acc1 | 722 movzb `&lo("$s1")`,$acc1 |
458 movzb `&lo("$s2")`,$acc2 | 723 movzb `&lo("$s2")`,$acc2 |
459 » movzb» 2048($sbox,$acc0,1),$t0 | 724 » movzb» ($sbox,$acc0,1),$t0 |
460 » movzb» 2048($sbox,$acc1,1),$t1 | 725 » movzb» ($sbox,$acc1,1),$t1 |
461 » movzb» 2048($sbox,$acc2,1),$t2 | 726 » movzb» ($sbox,$acc2,1),$t2 |
462 | 727 |
463 movzb `&lo("$s3")`,$acc0 | 728 movzb `&lo("$s3")`,$acc0 |
464 movzb `&hi("$s3")`,$acc1 | 729 movzb `&hi("$s3")`,$acc1 |
465 movzb `&hi("$s0")`,$acc2 | 730 movzb `&hi("$s0")`,$acc2 |
466 » movzb» 2048($sbox,$acc0,1),$t3 | 731 » movzb» ($sbox,$acc0,1),$t3 |
467 » movzb» 2048($sbox,$acc1,1),$acc1» #$t0 | 732 » movzb» ($sbox,$acc1,1),$acc1» #$t0 |
468 » movzb» 2048($sbox,$acc2,1),$acc2» #$t1 | 733 » movzb» ($sbox,$acc2,1),$acc2» #$t1 |
469 | 734 |
470 shl \$8,$acc1 | 735 shl \$8,$acc1 |
471 shl \$8,$acc2 | 736 shl \$8,$acc2 |
472 | 737 |
473 xor $acc1,$t0 | 738 xor $acc1,$t0 |
474 xor $acc2,$t1 | 739 xor $acc2,$t1 |
475 shr \$16,$s3 | 740 shr \$16,$s3 |
476 | 741 |
477 movzb `&hi("$s1")`,$acc0 | 742 movzb `&hi("$s1")`,$acc0 |
478 movzb `&hi("$s2")`,$acc1 | 743 movzb `&hi("$s2")`,$acc1 |
479 shr \$16,$s0 | 744 shr \$16,$s0 |
480 » movzb» 2048($sbox,$acc0,1),$acc0» #$t2 | 745 » movzb» ($sbox,$acc0,1),$acc0» #$t2 |
481 » movzb» 2048($sbox,$acc1,1),$acc1» #$t3 | 746 » movzb» ($sbox,$acc1,1),$acc1» #$t3 |
482 | 747 |
483 shl \$8,$acc0 | 748 shl \$8,$acc0 |
484 shl \$8,$acc1 | 749 shl \$8,$acc1 |
485 shr \$16,$s1 | 750 shr \$16,$s1 |
486 xor $acc0,$t2 | 751 xor $acc0,$t2 |
487 xor $acc1,$t3 | 752 xor $acc1,$t3 |
488 shr \$16,$s2 | 753 shr \$16,$s2 |
489 | 754 |
490 movzb `&lo("$s2")`,$acc0 | 755 movzb `&lo("$s2")`,$acc0 |
491 movzb `&lo("$s3")`,$acc1 | 756 movzb `&lo("$s3")`,$acc1 |
492 movzb `&lo("$s0")`,$acc2 | 757 movzb `&lo("$s0")`,$acc2 |
493 » movzb» 2048($sbox,$acc0,1),$acc0» #$t0 | 758 » movzb» ($sbox,$acc0,1),$acc0» #$t0 |
494 » movzb» 2048($sbox,$acc1,1),$acc1» #$t1 | 759 » movzb» ($sbox,$acc1,1),$acc1» #$t1 |
495 » movzb» 2048($sbox,$acc2,1),$acc2» #$t2 | 760 » movzb» ($sbox,$acc2,1),$acc2» #$t2 |
496 | 761 |
497 shl \$16,$acc0 | 762 shl \$16,$acc0 |
498 shl \$16,$acc1 | 763 shl \$16,$acc1 |
499 shl \$16,$acc2 | 764 shl \$16,$acc2 |
500 | 765 |
501 xor $acc0,$t0 | 766 xor $acc0,$t0 |
502 xor $acc1,$t1 | 767 xor $acc1,$t1 |
503 xor $acc2,$t2 | 768 xor $acc2,$t2 |
504 | 769 |
505 movzb `&lo("$s1")`,$acc0 | 770 movzb `&lo("$s1")`,$acc0 |
506 movzb `&hi("$s1")`,$acc1 | 771 movzb `&hi("$s1")`,$acc1 |
507 movzb `&hi("$s2")`,$acc2 | 772 movzb `&hi("$s2")`,$acc2 |
508 » movzb» 2048($sbox,$acc0,1),$acc0» #$t3 | 773 » movzb» ($sbox,$acc0,1),$acc0» #$t3 |
509 » movzb» 2048($sbox,$acc1,1),$acc1» #$t0 | 774 » movzb» ($sbox,$acc1,1),$acc1» #$t0 |
510 » movzb» 2048($sbox,$acc2,1),$acc2» #$t1 | 775 » movzb» ($sbox,$acc2,1),$acc2» #$t1 |
511 | 776 |
512 shl \$16,$acc0 | 777 shl \$16,$acc0 |
513 shl \$24,$acc1 | 778 shl \$24,$acc1 |
514 shl \$24,$acc2 | 779 shl \$24,$acc2 |
515 | 780 |
516 xor $acc0,$t3 | 781 xor $acc0,$t3 |
517 xor $acc1,$t0 | 782 xor $acc1,$t0 |
518 xor $acc2,$t1 | 783 xor $acc2,$t1 |
519 | 784 |
520 movzb `&hi("$s3")`,$acc0 | 785 movzb `&hi("$s3")`,$acc0 |
521 movzb `&hi("$s0")`,$acc1 | 786 movzb `&hi("$s0")`,$acc1 |
522 mov 16+12($key),$s3 | 787 mov 16+12($key),$s3 |
523 » movzb» 2048($sbox,$acc0,1),$acc0» #$t2 | 788 » movzb» ($sbox,$acc0,1),$acc0» #$t2 |
524 » movzb» 2048($sbox,$acc1,1),$acc1» #$t3 | 789 » movzb» ($sbox,$acc1,1),$acc1» #$t3 |
525 mov 16+0($key),$s0 | 790 mov 16+0($key),$s0 |
526 | 791 |
527 shl \$24,$acc0 | 792 shl \$24,$acc0 |
528 shl \$24,$acc1 | 793 shl \$24,$acc1 |
529 | 794 |
530 xor $acc0,$t2 | 795 xor $acc0,$t2 |
531 xor $acc1,$t3 | 796 xor $acc1,$t3 |
532 | 797 |
533 mov 16+4($key),$s1 | 798 mov 16+4($key),$s1 |
534 mov 16+8($key),$s2 | 799 mov 16+8($key),$s2 |
| 800 lea -2048($sbox),$sbox |
535 xor $t0,$s0 | 801 xor $t0,$s0 |
536 xor $t1,$s1 | 802 xor $t1,$s1 |
537 xor $t2,$s2 | 803 xor $t2,$s2 |
538 xor $t3,$s3 | 804 xor $t3,$s3 |
539 ___ | 805 ___ |
540 } | 806 } |
541 | 807 |
542 sub decstep() | 808 sub decstep() |
543 { my ($i,@s) = @_; | 809 { my ($i,@s) = @_; |
544 my $tmp0=$acc0; | 810 my $tmp0=$acc0; |
(...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
652 xor 16+4($key),$s1 | 918 xor 16+4($key),$s1 |
653 xor 16+8($key),$s2 | 919 xor 16+8($key),$s2 |
654 xor 16+12($key),$s3 | 920 xor 16+12($key),$s3 |
655 ___ | 921 ___ |
656 } | 922 } |
657 $code.=<<___; | 923 $code.=<<___; |
658 .byte 0xf3,0xc3 # rep ret | 924 .byte 0xf3,0xc3 # rep ret |
659 .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt | 925 .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt |
660 ___ | 926 ___ |
661 | 927 |
| 928 sub deccompactvert() |
| 929 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); |
| 930 |
| 931 $code.=<<___; |
| 932 movzb `&lo("$s0")`,$t0 |
| 933 movzb `&lo("$s1")`,$t1 |
| 934 movzb `&lo("$s2")`,$t2 |
| 935 movzb ($sbox,$t0,1),$t0 |
| 936 movzb ($sbox,$t1,1),$t1 |
| 937 movzb ($sbox,$t2,1),$t2 |
| 938 |
| 939 movzb `&lo("$s3")`,$t3 |
| 940 movzb `&hi("$s3")`,$acc0 |
| 941 movzb `&hi("$s0")`,$acc1 |
| 942 movzb ($sbox,$t3,1),$t3 |
| 943 movzb ($sbox,$acc0,1),$t4 #$t0 |
| 944 movzb ($sbox,$acc1,1),$t5 #$t1 |
| 945 |
| 946 movzb `&hi("$s1")`,$acc2 |
| 947 movzb `&hi("$s2")`,$acc0 |
| 948 shr \$16,$s2 |
| 949 movzb ($sbox,$acc2,1),$acc2 #$t2 |
| 950 movzb ($sbox,$acc0,1),$acc0 #$t3 |
| 951 shr \$16,$s3 |
| 952 |
| 953 movzb `&lo("$s2")`,$acc1 |
| 954 shl \$8,$t4 |
| 955 shl \$8,$t5 |
| 956 movzb ($sbox,$acc1,1),$acc1 #$t0 |
| 957 xor $t4,$t0 |
| 958 xor $t5,$t1 |
| 959 |
| 960 movzb `&lo("$s3")`,$t4 |
| 961 shr \$16,$s0 |
| 962 shr \$16,$s1 |
| 963 movzb `&lo("$s0")`,$t5 |
| 964 shl \$8,$acc2 |
| 965 shl \$8,$acc0 |
| 966 movzb ($sbox,$t4,1),$t4 #$t1 |
| 967 movzb ($sbox,$t5,1),$t5 #$t2 |
| 968 xor $acc2,$t2 |
| 969 xor $acc0,$t3 |
| 970 |
| 971 movzb `&lo("$s1")`,$acc2 |
| 972 movzb `&hi("$s1")`,$acc0 |
| 973 shl \$16,$acc1 |
| 974 movzb ($sbox,$acc2,1),$acc2 #$t3 |
| 975 movzb ($sbox,$acc0,1),$acc0 #$t0 |
| 976 xor $acc1,$t0 |
| 977 |
| 978 movzb `&hi("$s2")`,$acc1 |
| 979 shl \$16,$t4 |
| 980 shl \$16,$t5 |
| 981 movzb ($sbox,$acc1,1),$s1 #$t1 |
| 982 xor $t4,$t1 |
| 983 xor $t5,$t2 |
| 984 |
| 985 movzb `&hi("$s3")`,$acc1 |
| 986 shr \$8,$s0 |
| 987 shl \$16,$acc2 |
| 988 movzb ($sbox,$acc1,1),$s2 #$t2 |
| 989 movzb ($sbox,$s0,1),$s3 #$t3 |
| 990 xor $acc2,$t3 |
| 991 |
| 992 shl \$24,$acc0 |
| 993 shl \$24,$s1 |
| 994 shl \$24,$s2 |
| 995 xor $acc0,$t0 |
| 996 shl \$24,$s3 |
| 997 xor $t1,$s1 |
| 998 mov $t0,$s0 |
| 999 xor $t2,$s2 |
| 1000 xor $t3,$s3 |
| 1001 ___ |
| 1002 } |
| 1003 |
| 1004 # parallelized version! input is pair of 64-bit values: %rax=s1.s0 |
| 1005 # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1, |
| 1006 # %ecx=s2 and %edx=s3. |
| 1007 sub dectransform() |
| 1008 { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx"); |
| 1009 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx"); |
| 1010 my $prefetch = shift; |
| 1011 |
| 1012 $code.=<<___; |
| 1013 mov $tp10,$acc0 |
| 1014 mov $tp18,$acc8 |
| 1015 and $mask80,$acc0 |
| 1016 and $mask80,$acc8 |
| 1017 mov $acc0,$tp40 |
| 1018 mov $acc8,$tp48 |
| 1019 shr \$7,$tp40 |
| 1020 lea ($tp10,$tp10),$tp20 |
| 1021 shr \$7,$tp48 |
| 1022 lea ($tp18,$tp18),$tp28 |
| 1023 sub $tp40,$acc0 |
| 1024 sub $tp48,$acc8 |
| 1025 and $maskfe,$tp20 |
| 1026 and $maskfe,$tp28 |
| 1027 and $mask1b,$acc0 |
| 1028 and $mask1b,$acc8 |
| 1029 xor $tp20,$acc0 |
| 1030 xor $tp28,$acc8 |
| 1031 mov $acc0,$tp20 |
| 1032 mov $acc8,$tp28 |
| 1033 |
| 1034 and $mask80,$acc0 |
| 1035 and $mask80,$acc8 |
| 1036 mov $acc0,$tp80 |
| 1037 mov $acc8,$tp88 |
| 1038 shr \$7,$tp80 |
| 1039 lea ($tp20,$tp20),$tp40 |
| 1040 shr \$7,$tp88 |
| 1041 lea ($tp28,$tp28),$tp48 |
| 1042 sub $tp80,$acc0 |
| 1043 sub $tp88,$acc8 |
| 1044 and $maskfe,$tp40 |
| 1045 and $maskfe,$tp48 |
| 1046 and $mask1b,$acc0 |
| 1047 and $mask1b,$acc8 |
| 1048 xor $tp40,$acc0 |
| 1049 xor $tp48,$acc8 |
| 1050 mov $acc0,$tp40 |
| 1051 mov $acc8,$tp48 |
| 1052 |
| 1053 and $mask80,$acc0 |
| 1054 and $mask80,$acc8 |
| 1055 mov $acc0,$tp80 |
| 1056 mov $acc8,$tp88 |
| 1057 shr \$7,$tp80 |
| 1058 xor $tp10,$tp20 # tp2^=tp1 |
| 1059 shr \$7,$tp88 |
| 1060 xor $tp18,$tp28 # tp2^=tp1 |
| 1061 sub $tp80,$acc0 |
| 1062 sub $tp88,$acc8 |
| 1063 lea ($tp40,$tp40),$tp80 |
| 1064 lea ($tp48,$tp48),$tp88 |
| 1065 xor $tp10,$tp40 # tp4^=tp1 |
| 1066 xor $tp18,$tp48 # tp4^=tp1 |
| 1067 and $maskfe,$tp80 |
| 1068 and $maskfe,$tp88 |
| 1069 and $mask1b,$acc0 |
| 1070 and $mask1b,$acc8 |
| 1071 xor $acc0,$tp80 |
| 1072 xor $acc8,$tp88 |
| 1073 |
| 1074 xor $tp80,$tp10 # tp1^=tp8 |
| 1075 xor $tp88,$tp18 # tp1^=tp8 |
| 1076 xor $tp80,$tp20 # tp2^tp1^=tp8 |
| 1077 xor $tp88,$tp28 # tp2^tp1^=tp8 |
| 1078 mov $tp10,$acc0 |
| 1079 mov $tp18,$acc8 |
| 1080 xor $tp80,$tp40 # tp4^tp1^=tp8 |
| 1081 xor $tp88,$tp48 # tp4^tp1^=tp8 |
| 1082 shr \$32,$acc0 |
| 1083 shr \$32,$acc8 |
| 1084 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 |
| 1085 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 |
| 1086 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) |
| 1087 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) |
| 1088 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 |
| 1089 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 |
| 1090 |
| 1091 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) |
| 1092 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) |
| 1093 xor `&LO("$tp80")`,`&LO("$tp10")` |
| 1094 xor `&LO("$tp88")`,`&LO("$tp18")` |
| 1095 shr \$32,$tp80 |
| 1096 shr \$32,$tp88 |
| 1097 xor `&LO("$tp80")`,`&LO("$acc0")` |
| 1098 xor `&LO("$tp88")`,`&LO("$acc8")` |
| 1099 |
| 1100 mov $tp20,$tp80 |
| 1101 mov $tp28,$tp88 |
| 1102 shr \$32,$tp80 |
| 1103 shr \$32,$tp88 |
| 1104 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) |
| 1105 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) |
| 1106 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) |
| 1107 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) |
| 1108 xor `&LO("$tp20")`,`&LO("$tp10")` |
| 1109 xor `&LO("$tp28")`,`&LO("$tp18")` |
| 1110 mov $tp40,$tp20 |
| 1111 mov $tp48,$tp28 |
| 1112 xor `&LO("$tp80")`,`&LO("$acc0")` |
| 1113 xor `&LO("$tp88")`,`&LO("$acc8")` |
| 1114 |
| 1115 `"mov 0($sbox),$mask80" if ($prefetch)` |
| 1116 shr \$32,$tp20 |
| 1117 shr \$32,$tp28 |
| 1118 `"mov 64($sbox),$maskfe" if ($prefetch)` |
| 1119 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) |
| 1120 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) |
| 1121 `"mov 128($sbox),$mask1b" if ($prefetch)` |
| 1122 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) |
| 1123 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) |
| 1124 `"mov 192($sbox),$tp80" if ($prefetch)` |
| 1125 xor `&LO("$tp40")`,`&LO("$tp10")` |
| 1126 xor `&LO("$tp48")`,`&LO("$tp18")` |
| 1127 `"mov 256($sbox),$tp88" if ($prefetch)` |
| 1128 xor `&LO("$tp20")`,`&LO("$acc0")` |
| 1129 xor `&LO("$tp28")`,`&LO("$acc8")` |
| 1130 ___ |
| 1131 } |
| 1132 |
| 1133 $code.=<<___; |
| 1134 .type _x86_64_AES_decrypt_compact,\@abi-omnipotent |
| 1135 .align 16 |
| 1136 _x86_64_AES_decrypt_compact: |
| 1137 lea 128($sbox),$inp # size optimization |
| 1138 mov 0-128($inp),$acc1 # prefetch Td4 |
| 1139 mov 32-128($inp),$acc2 |
| 1140 mov 64-128($inp),$t0 |
| 1141 mov 96-128($inp),$t1 |
| 1142 mov 128-128($inp),$acc1 |
| 1143 mov 160-128($inp),$acc2 |
| 1144 mov 192-128($inp),$t0 |
| 1145 mov 224-128($inp),$t1 |
| 1146 jmp .Ldec_loop_compact |
| 1147 |
| 1148 .align 16 |
| 1149 .Ldec_loop_compact: |
| 1150 xor 0($key),$s0 # xor with key |
| 1151 xor 4($key),$s1 |
| 1152 xor 8($key),$s2 |
| 1153 xor 12($key),$s3 |
| 1154 lea 16($key),$key |
| 1155 ___ |
| 1156 &deccompactvert(); |
| 1157 $code.=<<___; |
| 1158 cmp 16(%rsp),$key |
| 1159 je .Ldec_compact_done |
| 1160 |
| 1161 mov 256+0($sbox),$mask80 |
| 1162 shl \$32,%rbx |
| 1163 shl \$32,%rdx |
| 1164 mov 256+8($sbox),$maskfe |
| 1165 or %rbx,%rax |
| 1166 or %rdx,%rcx |
| 1167 mov 256+16($sbox),$mask1b |
| 1168 ___ |
| 1169 &dectransform(1); |
| 1170 $code.=<<___; |
| 1171 jmp .Ldec_loop_compact |
| 1172 .align 16 |
| 1173 .Ldec_compact_done: |
| 1174 xor 0($key),$s0 |
| 1175 xor 4($key),$s1 |
| 1176 xor 8($key),$s2 |
| 1177 xor 12($key),$s3 |
| 1178 .byte 0xf3,0xc3 # rep ret |
| 1179 .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact |
| 1180 ___ |
| 1181 |
662 # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); | 1182 # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); |
663 $code.=<<___; | 1183 $code.=<<___; |
664 .globl AES_decrypt | 1184 .globl AES_decrypt |
665 .type AES_decrypt,\@function,3 | 1185 .type AES_decrypt,\@function,3 |
666 .align 16 | 1186 .align 16 |
667 AES_decrypt: | 1187 AES_decrypt: |
668 push %rbx | 1188 push %rbx |
669 push %rbp | 1189 push %rbp |
670 push %r12 | 1190 push %r12 |
671 push %r13 | 1191 push %r13 |
672 push %r14 | 1192 push %r14 |
673 push %r15 | 1193 push %r15 |
674 | 1194 |
| 1195 # allocate frame "above" key schedule |
| 1196 mov %rsp,%r10 |
| 1197 lea -63(%rdx),%rcx # %rdx is key argument |
| 1198 and \$-64,%rsp |
| 1199 sub %rsp,%rcx |
| 1200 neg %rcx |
| 1201 and \$0x3c0,%rcx |
| 1202 sub %rcx,%rsp |
| 1203 sub \$32,%rsp |
| 1204 |
| 1205 mov %rsi,16(%rsp) # save out |
| 1206 mov %r10,24(%rsp) # save real stack pointer |
| 1207 .Ldec_prologue: |
| 1208 |
675 mov %rdx,$key | 1209 mov %rdx,$key |
676 » mov» %rdi,$inp | 1210 » mov» 240($key),$rnds»# load rounds |
677 » mov» %rsi,$out | |
678 | 1211 |
679 » .picmeup» $sbox | 1212 » mov» 0(%rdi),$s0» # load input vector |
680 » lea» AES_Td-.($sbox),$sbox | 1213 » mov» 4(%rdi),$s1 |
| 1214 » mov» 8(%rdi),$s2 |
| 1215 » mov» 12(%rdi),$s3 |
681 | 1216 |
682 » # prefetch Td4 | 1217 » shl» \$4,$rnds |
683 » lea» 2048+128($sbox),$sbox; | 1218 » lea» ($key,$rnds),%rbp |
684 » mov» 0-128($sbox),$s0 | 1219 » mov» $key,(%rsp)» # key schedule |
685 » mov» 32-128($sbox),$s1 | 1220 » mov» %rbp,8(%rsp)» # end of key schedule |
686 » mov» 64-128($sbox),$s2 | |
687 » mov» 96-128($sbox),$s3 | |
688 » mov» 128-128($sbox),$s0 | |
689 » mov» 160-128($sbox),$s1 | |
690 » mov» 192-128($sbox),$s2 | |
691 » mov» 224-128($sbox),$s3 | |
692 » lea» -2048-128($sbox),$sbox; | |
693 | 1221 |
694 » mov» 0($inp),$s0 | 1222 » # pick Td4 copy which can't "overlap" with stack frame or key schedule |
695 » mov» 4($inp),$s1 | 1223 » lea» .LAES_Td+2048(%rip),$sbox |
696 » mov» 8($inp),$s2 | 1224 » lea» 768(%rsp),%rbp |
697 » mov» 12($inp),$s3 | 1225 » sub» $sbox,%rbp |
| 1226 » and» \$0x300,%rbp |
| 1227 » lea» ($sbox,%rbp),$sbox |
| 1228 » shr» \$3,%rbp» # recall "magic" constants! |
| 1229 » add» %rbp,$sbox |
698 | 1230 |
699 » call» _x86_64_AES_decrypt | 1231 » call» _x86_64_AES_decrypt_compact |
700 | 1232 |
701 » mov» $s0,0($out) | 1233 » mov» 16(%rsp),$out» # restore out |
| 1234 » mov» 24(%rsp),%rsi» # restore saved stack pointer |
| 1235 » mov» $s0,0($out)» # write output vector |
702 mov $s1,4($out) | 1236 mov $s1,4($out) |
703 mov $s2,8($out) | 1237 mov $s2,8($out) |
704 mov $s3,12($out) | 1238 mov $s3,12($out) |
705 | 1239 |
706 » pop» %r15 | 1240 » mov» (%rsi),%r15 |
707 » pop» %r14 | 1241 » mov» 8(%rsi),%r14 |
708 » pop» %r13 | 1242 » mov» 16(%rsi),%r13 |
709 » pop» %r12 | 1243 » mov» 24(%rsi),%r12 |
710 » pop» %rbp | 1244 » mov» 32(%rsi),%rbp |
711 » pop» %rbx | 1245 » mov» 40(%rsi),%rbx |
| 1246 » lea» 48(%rsi),%rsp |
| 1247 .Ldec_epilogue: |
712 ret | 1248 ret |
713 .size AES_decrypt,.-AES_decrypt | 1249 .size AES_decrypt,.-AES_decrypt |
714 ___ | 1250 ___ |
715 #------------------------------------------------------------------# | 1251 #------------------------------------------------------------------# |
716 | 1252 |
717 sub enckey() | 1253 sub enckey() |
718 { | 1254 { |
719 $code.=<<___; | 1255 $code.=<<___; |
720 movz %dl,%esi # rk[i]>>0 | 1256 movz %dl,%esi # rk[i]>>0 |
721 » mov» 2(%rbp,%rsi,8),%ebx | 1257 » movzb» -128(%rbp,%rsi),%ebx |
722 movz %dh,%esi # rk[i]>>8 | 1258 movz %dh,%esi # rk[i]>>8 |
723 » and» \$0xFF000000,%ebx | 1259 » shl» \$24,%ebx |
724 xor %ebx,%eax | 1260 xor %ebx,%eax |
725 | 1261 |
726 » mov» 2(%rbp,%rsi,8),%ebx | 1262 » movzb» -128(%rbp,%rsi),%ebx |
727 shr \$16,%edx | 1263 shr \$16,%edx |
728 and \$0x000000FF,%ebx | |
729 movz %dl,%esi # rk[i]>>16 | 1264 movz %dl,%esi # rk[i]>>16 |
730 xor %ebx,%eax | 1265 xor %ebx,%eax |
731 | 1266 |
732 » mov» 0(%rbp,%rsi,8),%ebx | 1267 » movzb» -128(%rbp,%rsi),%ebx |
733 movz %dh,%esi # rk[i]>>24 | 1268 movz %dh,%esi # rk[i]>>24 |
734 » and» \$0x0000FF00,%ebx | 1269 » shl» \$8,%ebx |
735 xor %ebx,%eax | 1270 xor %ebx,%eax |
736 | 1271 |
737 » mov» 0(%rbp,%rsi,8),%ebx | 1272 » movzb» -128(%rbp,%rsi),%ebx |
738 » and» \$0x00FF0000,%ebx | 1273 » shl» \$16,%ebx |
739 xor %ebx,%eax | 1274 xor %ebx,%eax |
740 | 1275 |
741 » xor» 2048(%rbp,%rcx,4),%eax» » # rcon | 1276 » xor» 1024-128(%rbp,%rcx,4),%eax» » # rcon |
742 ___ | 1277 ___ |
743 } | 1278 } |
744 | 1279 |
745 # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 1280 # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
746 # AES_KEY *key) | 1281 # AES_KEY *key) |
747 $code.=<<___; | 1282 $code.=<<___; |
748 .globl AES_set_encrypt_key | 1283 .globl AES_set_encrypt_key |
749 .type AES_set_encrypt_key,\@function,3 | 1284 .type AES_set_encrypt_key,\@function,3 |
750 .align 16 | 1285 .align 16 |
751 AES_set_encrypt_key: | 1286 AES_set_encrypt_key: |
752 push %rbx | 1287 push %rbx |
753 push %rbp | 1288 push %rbp |
| 1289 push %r12 # redundant, but allows to share |
| 1290 push %r13 # exception handler... |
| 1291 push %r14 |
| 1292 push %r15 |
754 sub \$8,%rsp | 1293 sub \$8,%rsp |
| 1294 .Lenc_key_prologue: |
755 | 1295 |
756 call _x86_64_AES_set_encrypt_key | 1296 call _x86_64_AES_set_encrypt_key |
757 | 1297 |
758 » mov» 8(%rsp),%rbp | 1298 » mov» 8(%rsp),%r15 |
759 » mov» 16(%rsp),%rbx | 1299 » mov» 16(%rsp),%r14 |
760 » add» \$24,%rsp | 1300 » mov» 24(%rsp),%r13 |
| 1301 » mov» 32(%rsp),%r12 |
| 1302 » mov» 40(%rsp),%rbp |
| 1303 » mov» 48(%rsp),%rbx |
| 1304 » add» \$56,%rsp |
| 1305 .Lenc_key_epilogue: |
761 ret | 1306 ret |
762 .size AES_set_encrypt_key,.-AES_set_encrypt_key | 1307 .size AES_set_encrypt_key,.-AES_set_encrypt_key |
763 | 1308 |
764 .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent | 1309 .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent |
765 .align 16 | 1310 .align 16 |
766 _x86_64_AES_set_encrypt_key: | 1311 _x86_64_AES_set_encrypt_key: |
767 mov %esi,%ecx # %ecx=bits | 1312 mov %esi,%ecx # %ecx=bits |
768 mov %rdi,%rsi # %rsi=userKey | 1313 mov %rdi,%rsi # %rsi=userKey |
769 mov %rdx,%rdi # %rdi=key | 1314 mov %rdx,%rdi # %rdi=key |
770 | 1315 |
771 test \$-1,%rsi | 1316 test \$-1,%rsi |
772 jz .Lbadpointer | 1317 jz .Lbadpointer |
773 test \$-1,%rdi | 1318 test \$-1,%rdi |
774 jz .Lbadpointer | 1319 jz .Lbadpointer |
775 | 1320 |
776 » .picmeup %rbp | 1321 » lea» .LAES_Te(%rip),%rbp |
777 » lea» AES_Te-.(%rbp),%rbp | 1322 » lea» 2048+128(%rbp),%rbp |
| 1323 |
| 1324 » # prefetch Te4 |
| 1325 » mov» 0-128(%rbp),%eax |
| 1326 » mov» 32-128(%rbp),%ebx |
| 1327 » mov» 64-128(%rbp),%r8d |
| 1328 » mov» 96-128(%rbp),%edx |
| 1329 » mov» 128-128(%rbp),%eax |
| 1330 » mov» 160-128(%rbp),%ebx |
| 1331 » mov» 192-128(%rbp),%r8d |
| 1332 » mov» 224-128(%rbp),%edx |
778 | 1333 |
779 cmp \$128,%ecx | 1334 cmp \$128,%ecx |
780 je .L10rounds | 1335 je .L10rounds |
781 cmp \$192,%ecx | 1336 cmp \$192,%ecx |
782 je .L12rounds | 1337 je .L12rounds |
783 cmp \$256,%ecx | 1338 cmp \$256,%ecx |
784 je .L14rounds | 1339 je .L14rounds |
785 mov \$-2,%rax # invalid number of bits | 1340 mov \$-2,%rax # invalid number of bits |
786 jmp .Lexit | 1341 jmp .Lexit |
787 | 1342 |
788 .L10rounds: | 1343 .L10rounds: |
789 » mov» 0(%rsi),%eax» » » # copy first 4 dwords | 1344 » mov» 0(%rsi),%rax» » » # copy first 4 dwords |
790 » mov» 4(%rsi),%ebx | 1345 » mov» 8(%rsi),%rdx |
791 » mov» 8(%rsi),%ecx | 1346 » mov» %rax,0(%rdi) |
792 » mov» 12(%rsi),%edx | 1347 » mov» %rdx,8(%rdi) |
793 » mov» %eax,0(%rdi) | |
794 » mov» %ebx,4(%rdi) | |
795 » mov» %ecx,8(%rdi) | |
796 » mov» %edx,12(%rdi) | |
797 | 1348 |
| 1349 shr \$32,%rdx |
798 xor %ecx,%ecx | 1350 xor %ecx,%ecx |
799 jmp .L10shortcut | 1351 jmp .L10shortcut |
800 .align 4 | 1352 .align 4 |
801 .L10loop: | 1353 .L10loop: |
802 mov 0(%rdi),%eax # rk[0] | 1354 mov 0(%rdi),%eax # rk[0] |
803 mov 12(%rdi),%edx # rk[3] | 1355 mov 12(%rdi),%edx # rk[3] |
804 .L10shortcut: | 1356 .L10shortcut: |
805 ___ | 1357 ___ |
806 &enckey (); | 1358 &enckey (); |
807 $code.=<<___; | 1359 $code.=<<___; |
808 mov %eax,16(%rdi) # rk[4] | 1360 mov %eax,16(%rdi) # rk[4] |
809 xor 4(%rdi),%eax | 1361 xor 4(%rdi),%eax |
810 mov %eax,20(%rdi) # rk[5] | 1362 mov %eax,20(%rdi) # rk[5] |
811 xor 8(%rdi),%eax | 1363 xor 8(%rdi),%eax |
812 mov %eax,24(%rdi) # rk[6] | 1364 mov %eax,24(%rdi) # rk[6] |
813 xor 12(%rdi),%eax | 1365 xor 12(%rdi),%eax |
814 mov %eax,28(%rdi) # rk[7] | 1366 mov %eax,28(%rdi) # rk[7] |
815 add \$1,%ecx | 1367 add \$1,%ecx |
816 lea 16(%rdi),%rdi | 1368 lea 16(%rdi),%rdi |
817 cmp \$10,%ecx | 1369 cmp \$10,%ecx |
818 jl .L10loop | 1370 jl .L10loop |
819 | 1371 |
820 movl \$10,80(%rdi) # setup number of rounds | 1372 movl \$10,80(%rdi) # setup number of rounds |
821 xor %rax,%rax | 1373 xor %rax,%rax |
822 jmp .Lexit | 1374 jmp .Lexit |
823 | 1375 |
824 .L12rounds: | 1376 .L12rounds: |
825 » mov» 0(%rsi),%eax» » » # copy first 6 dwords | 1377 » mov» 0(%rsi),%rax» » » # copy first 6 dwords |
826 » mov» 4(%rsi),%ebx | 1378 » mov» 8(%rsi),%rbx |
827 » mov» 8(%rsi),%ecx | 1379 » mov» 16(%rsi),%rdx |
828 » mov» 12(%rsi),%edx | 1380 » mov» %rax,0(%rdi) |
829 » mov» %eax,0(%rdi) | 1381 » mov» %rbx,8(%rdi) |
830 » mov» %ebx,4(%rdi) | 1382 » mov» %rdx,16(%rdi) |
831 » mov» %ecx,8(%rdi) | |
832 » mov» %edx,12(%rdi) | |
833 » mov» 16(%rsi),%ecx | |
834 » mov» 20(%rsi),%edx | |
835 » mov» %ecx,16(%rdi) | |
836 » mov» %edx,20(%rdi) | |
837 | 1383 |
| 1384 shr \$32,%rdx |
838 xor %ecx,%ecx | 1385 xor %ecx,%ecx |
839 jmp .L12shortcut | 1386 jmp .L12shortcut |
840 .align 4 | 1387 .align 4 |
841 .L12loop: | 1388 .L12loop: |
842 mov 0(%rdi),%eax # rk[0] | 1389 mov 0(%rdi),%eax # rk[0] |
843 mov 20(%rdi),%edx # rk[5] | 1390 mov 20(%rdi),%edx # rk[5] |
844 .L12shortcut: | 1391 .L12shortcut: |
845 ___ | 1392 ___ |
846 &enckey (); | 1393 &enckey (); |
847 $code.=<<___; | 1394 $code.=<<___; |
(...skipping 15 matching lines...) Expand all Loading... |
863 mov %eax,44(%rdi) # rk[11] | 1410 mov %eax,44(%rdi) # rk[11] |
864 | 1411 |
865 lea 24(%rdi),%rdi | 1412 lea 24(%rdi),%rdi |
866 jmp .L12loop | 1413 jmp .L12loop |
867 .L12break: | 1414 .L12break: |
868 movl \$12,72(%rdi) # setup number of rounds | 1415 movl \$12,72(%rdi) # setup number of rounds |
869 xor %rax,%rax | 1416 xor %rax,%rax |
870 jmp .Lexit | 1417 jmp .Lexit |
871 | 1418 |
872 .L14rounds: | 1419 .L14rounds: |
873 » mov» 0(%rsi),%eax» » » # copy first 8 dwords | 1420 » mov» 0(%rsi),%rax» » » # copy first 8 dwords |
874 » mov» 4(%rsi),%ebx | 1421 » mov» 8(%rsi),%rbx |
875 » mov» 8(%rsi),%ecx | 1422 » mov» 16(%rsi),%rcx |
876 » mov» 12(%rsi),%edx | 1423 » mov» 24(%rsi),%rdx |
877 » mov» %eax,0(%rdi) | 1424 » mov» %rax,0(%rdi) |
878 » mov» %ebx,4(%rdi) | 1425 » mov» %rbx,8(%rdi) |
879 » mov» %ecx,8(%rdi) | 1426 » mov» %rcx,16(%rdi) |
880 » mov» %edx,12(%rdi) | 1427 » mov» %rdx,24(%rdi) |
881 » mov» 16(%rsi),%eax | |
882 » mov» 20(%rsi),%ebx | |
883 » mov» 24(%rsi),%ecx | |
884 » mov» 28(%rsi),%edx | |
885 » mov» %eax,16(%rdi) | |
886 » mov» %ebx,20(%rdi) | |
887 » mov» %ecx,24(%rdi) | |
888 » mov» %edx,28(%rdi) | |
889 | 1428 |
| 1429 shr \$32,%rdx |
890 xor %ecx,%ecx | 1430 xor %ecx,%ecx |
891 jmp .L14shortcut | 1431 jmp .L14shortcut |
892 .align 4 | 1432 .align 4 |
893 .L14loop: | 1433 .L14loop: |
| 1434 mov 0(%rdi),%eax # rk[0] |
894 mov 28(%rdi),%edx # rk[7] | 1435 mov 28(%rdi),%edx # rk[7] |
895 .L14shortcut: | 1436 .L14shortcut: |
896 mov 0(%rdi),%eax # rk[0] | |
897 ___ | 1437 ___ |
898 &enckey (); | 1438 &enckey (); |
899 $code.=<<___; | 1439 $code.=<<___; |
900 mov %eax,32(%rdi) # rk[8] | 1440 mov %eax,32(%rdi) # rk[8] |
901 xor 4(%rdi),%eax | 1441 xor 4(%rdi),%eax |
902 mov %eax,36(%rdi) # rk[9] | 1442 mov %eax,36(%rdi) # rk[9] |
903 xor 8(%rdi),%eax | 1443 xor 8(%rdi),%eax |
904 mov %eax,40(%rdi) # rk[10] | 1444 mov %eax,40(%rdi) # rk[10] |
905 xor 12(%rdi),%eax | 1445 xor 12(%rdi),%eax |
906 mov %eax,44(%rdi) # rk[11] | 1446 mov %eax,44(%rdi) # rk[11] |
907 | 1447 |
908 cmp \$6,%ecx | 1448 cmp \$6,%ecx |
909 je .L14break | 1449 je .L14break |
910 add \$1,%ecx | 1450 add \$1,%ecx |
911 | 1451 |
912 mov %eax,%edx | 1452 mov %eax,%edx |
913 mov 16(%rdi),%eax # rk[4] | 1453 mov 16(%rdi),%eax # rk[4] |
914 movz %dl,%esi # rk[11]>>0 | 1454 movz %dl,%esi # rk[11]>>0 |
915 » » mov» 2(%rbp,%rsi,8),%ebx | 1455 » » movzb» -128(%rbp,%rsi),%ebx |
916 movz %dh,%esi # rk[11]>>8 | 1456 movz %dh,%esi # rk[11]>>8 |
917 and \$0x000000FF,%ebx | |
918 xor %ebx,%eax | 1457 xor %ebx,%eax |
919 | 1458 |
920 » » mov» 0(%rbp,%rsi,8),%ebx | 1459 » » movzb» -128(%rbp,%rsi),%ebx |
921 shr \$16,%edx | 1460 shr \$16,%edx |
922 » » and» \$0x0000FF00,%ebx | 1461 » » shl» \$8,%ebx |
923 movz %dl,%esi # rk[11]>>16 | 1462 movz %dl,%esi # rk[11]>>16 |
924 xor %ebx,%eax | 1463 xor %ebx,%eax |
925 | 1464 |
926 » » mov» 0(%rbp,%rsi,8),%ebx | 1465 » » movzb» -128(%rbp,%rsi),%ebx |
927 movz %dh,%esi # rk[11]>>24 | 1466 movz %dh,%esi # rk[11]>>24 |
928 » » and» \$0x00FF0000,%ebx | 1467 » » shl» \$16,%ebx |
929 xor %ebx,%eax | 1468 xor %ebx,%eax |
930 | 1469 |
931 » » mov» 2(%rbp,%rsi,8),%ebx | 1470 » » movzb» -128(%rbp,%rsi),%ebx |
932 » » and» \$0xFF000000,%ebx | 1471 » » shl» \$24,%ebx |
933 xor %ebx,%eax | 1472 xor %ebx,%eax |
934 | 1473 |
935 mov %eax,48(%rdi) # rk[12] | 1474 mov %eax,48(%rdi) # rk[12] |
936 xor 20(%rdi),%eax | 1475 xor 20(%rdi),%eax |
937 mov %eax,52(%rdi) # rk[13] | 1476 mov %eax,52(%rdi) # rk[13] |
938 xor 24(%rdi),%eax | 1477 xor 24(%rdi),%eax |
939 mov %eax,56(%rdi) # rk[14] | 1478 mov %eax,56(%rdi) # rk[14] |
940 xor 28(%rdi),%eax | 1479 xor 28(%rdi),%eax |
941 mov %eax,60(%rdi) # rk[15] | 1480 mov %eax,60(%rdi) # rk[15] |
942 | 1481 |
943 lea 32(%rdi),%rdi | 1482 lea 32(%rdi),%rdi |
944 jmp .L14loop | 1483 jmp .L14loop |
945 .L14break: | 1484 .L14break: |
946 movl \$14,48(%rdi) # setup number of rounds | 1485 movl \$14,48(%rdi) # setup number of rounds |
947 xor %rax,%rax | 1486 xor %rax,%rax |
948 jmp .Lexit | 1487 jmp .Lexit |
949 | 1488 |
950 .Lbadpointer: | 1489 .Lbadpointer: |
951 mov \$-1,%rax | 1490 mov \$-1,%rax |
952 .Lexit: | 1491 .Lexit: |
953 » .byte» 0xf3,0xc3» » # rep ret | 1492 » .byte» 0xf3,0xc3» » » # rep ret |
954 .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key | 1493 .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key |
955 ___ | 1494 ___ |
956 | 1495 |
957 sub deckey() | 1496 sub deckey_ref() |
958 { my ($i,$ptr,$te,$td) = @_; | 1497 { my ($i,$ptr,$te,$td) = @_; |
| 1498 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d"); |
959 $code.=<<___; | 1499 $code.=<<___; |
960 » mov» $i($ptr),%eax | 1500 » mov» $i($ptr),$tp1 |
961 » mov» %eax,%edx | 1501 » mov» $tp1,$acc |
962 » movz» %ah,%ebx | 1502 » and» \$0x80808080,$acc |
963 » shr» \$16,%edx | 1503 » mov» $acc,$tp4 |
964 » and» \$0xFF,%eax | 1504 » shr» \$7,$tp4 |
965 » movzb» 2($te,%rax,8),%rax | 1505 » lea» 0($tp1,$tp1),$tp2 |
966 » movzb» 2($te,%rbx,8),%rbx | 1506 » sub» $tp4,$acc |
967 » mov» 0($td,%rax,8),%eax | 1507 » and» \$0xfefefefe,$tp2 |
968 » xor» 3($td,%rbx,8),%eax | 1508 » and» \$0x1b1b1b1b,$acc |
969 » movzb» %dh,%ebx | 1509 » xor» $tp2,$acc |
970 » and» \$0xFF,%edx | 1510 » mov» $acc,$tp2 |
971 » movzb» 2($te,%rdx,8),%rdx | 1511 |
972 » movzb» 2($te,%rbx,8),%rbx | 1512 » and» \$0x80808080,$acc |
973 » xor» 2($td,%rdx,8),%eax | 1513 » mov» $acc,$tp8 |
974 » xor» 1($td,%rbx,8),%eax | 1514 » shr» \$7,$tp8 |
975 » mov» %eax,$i($ptr) | 1515 » lea» 0($tp2,$tp2),$tp4 |
| 1516 » sub» $tp8,$acc |
| 1517 » and» \$0xfefefefe,$tp4 |
| 1518 » and» \$0x1b1b1b1b,$acc |
| 1519 » xor» $tp1,$tp2» » # tp2^tp1 |
| 1520 » xor» $tp4,$acc |
| 1521 » mov» $acc,$tp4 |
| 1522 |
| 1523 » and» \$0x80808080,$acc |
| 1524 » mov» $acc,$tp8 |
| 1525 » shr» \$7,$tp8 |
| 1526 » sub» $tp8,$acc |
| 1527 » lea» 0($tp4,$tp4),$tp8 |
| 1528 » xor» $tp1,$tp4» » # tp4^tp1 |
| 1529 » and» \$0xfefefefe,$tp8 |
| 1530 » and» \$0x1b1b1b1b,$acc |
| 1531 » xor» $acc,$tp8 |
| 1532 |
| 1533 » xor» $tp8,$tp1» » # tp1^tp8 |
| 1534 » rol» \$8,$tp1» » # ROTATE(tp1^tp8,8) |
| 1535 » xor» $tp8,$tp2» » # tp2^tp1^tp8 |
| 1536 » xor» $tp8,$tp4» » # tp4^tp1^tp8 |
| 1537 » xor» $tp2,$tp8 |
| 1538 » xor» $tp4,$tp8» » # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2 |
| 1539 |
| 1540 » xor» $tp8,$tp1 |
| 1541 » rol» \$24,$tp2» » # ROTATE(tp2^tp1^tp8,24) |
| 1542 » xor» $tp2,$tp1 |
| 1543 » rol» \$16,$tp4» » # ROTATE(tp4^tp1^tp8,16) |
| 1544 » xor» $tp4,$tp1 |
| 1545 |
| 1546 » mov» $tp1,$i($ptr) |
976 ___ | 1547 ___ |
977 } | 1548 } |
978 | 1549 |
979 # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 1550 # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
980 # AES_KEY *key) | 1551 # AES_KEY *key) |
981 $code.=<<___; | 1552 $code.=<<___; |
982 .globl AES_set_decrypt_key | 1553 .globl AES_set_decrypt_key |
983 .type AES_set_decrypt_key,\@function,3 | 1554 .type AES_set_decrypt_key,\@function,3 |
984 .align 16 | 1555 .align 16 |
985 AES_set_decrypt_key: | 1556 AES_set_decrypt_key: |
986 push %rbx | 1557 push %rbx |
987 push %rbp | 1558 push %rbp |
| 1559 push %r12 |
| 1560 push %r13 |
| 1561 push %r14 |
| 1562 push %r15 |
988 push %rdx # save key schedule | 1563 push %rdx # save key schedule |
| 1564 .Ldec_key_prologue: |
989 | 1565 |
990 call _x86_64_AES_set_encrypt_key | 1566 call _x86_64_AES_set_encrypt_key |
991 mov (%rsp),%r8 # restore key schedule | 1567 mov (%rsp),%r8 # restore key schedule |
992 cmp \$0,%eax | 1568 cmp \$0,%eax |
993 jne .Labort | 1569 jne .Labort |
994 | 1570 |
995 » mov» 240(%r8),%ecx» » # pull number of rounds | 1571 » mov» 240(%r8),%r14d» » # pull number of rounds |
996 xor %rdi,%rdi | 1572 xor %rdi,%rdi |
997 » lea» (%rdi,%rcx,4),%rcx | 1573 » lea» (%rdi,%r14d,4),%rcx |
998 mov %r8,%rsi | 1574 mov %r8,%rsi |
999 lea (%r8,%rcx,4),%rdi # pointer to last chunk | 1575 lea (%r8,%rcx,4),%rdi # pointer to last chunk |
1000 .align 4 | 1576 .align 4 |
1001 .Linvert: | 1577 .Linvert: |
1002 mov 0(%rsi),%rax | 1578 mov 0(%rsi),%rax |
1003 mov 8(%rsi),%rbx | 1579 mov 8(%rsi),%rbx |
1004 mov 0(%rdi),%rcx | 1580 mov 0(%rdi),%rcx |
1005 mov 8(%rdi),%rdx | 1581 mov 8(%rdi),%rdx |
1006 mov %rax,0(%rdi) | 1582 mov %rax,0(%rdi) |
1007 mov %rbx,8(%rdi) | 1583 mov %rbx,8(%rdi) |
1008 mov %rcx,0(%rsi) | 1584 mov %rcx,0(%rsi) |
1009 mov %rdx,8(%rsi) | 1585 mov %rdx,8(%rsi) |
1010 lea 16(%rsi),%rsi | 1586 lea 16(%rsi),%rsi |
1011 lea -16(%rdi),%rdi | 1587 lea -16(%rdi),%rdi |
1012 cmp %rsi,%rdi | 1588 cmp %rsi,%rdi |
1013 jne .Linvert | 1589 jne .Linvert |
1014 | 1590 |
1015 » .picmeup %r9 | 1591 » lea» .LAES_Te+2048+1024(%rip),%rax» # rcon |
1016 » lea» AES_Td-.(%r9),%rdi | |
1017 » lea» AES_Te-AES_Td(%rdi),%r9 | |
1018 | 1592 |
1019 » mov» %r8,%rsi | 1593 » mov» 40(%rax),$mask80 |
1020 » mov» 240(%r8),%ecx» » # pull number of rounds | 1594 » mov» 48(%rax),$maskfe |
1021 » sub» \$1,%ecx | 1595 » mov» 56(%rax),$mask1b |
| 1596 |
| 1597 » mov» %r8,$key |
| 1598 » sub» \$1,%r14d |
1022 .align 4 | 1599 .align 4 |
1023 .Lpermute: | 1600 .Lpermute: |
1024 » » lea» 16(%rsi),%rsi | 1601 » » lea» 16($key),$key |
| 1602 » » mov» 0($key),%rax |
| 1603 » » mov» 8($key),%rcx |
1025 ___ | 1604 ___ |
1026 » » &deckey»(0,"%rsi","%r9","%rdi"); | 1605 » » &dectransform (); |
1027 » » &deckey»(4,"%rsi","%r9","%rdi"); | |
1028 » » &deckey»(8,"%rsi","%r9","%rdi"); | |
1029 » » &deckey»(12,"%rsi","%r9","%rdi"); | |
1030 $code.=<<___; | 1606 $code.=<<___; |
1031 » » sub» \$1,%ecx | 1607 » » mov» %eax,0($key) |
| 1608 » » mov» %ebx,4($key) |
| 1609 » » mov» %ecx,8($key) |
| 1610 » » mov» %edx,12($key) |
| 1611 » » sub» \$1,%r14d |
1032 jnz .Lpermute | 1612 jnz .Lpermute |
1033 | 1613 |
1034 xor %rax,%rax | 1614 xor %rax,%rax |
1035 .Labort: | 1615 .Labort: |
1036 » mov» 8(%rsp),%rbp | 1616 » mov» 8(%rsp),%r15 |
1037 » mov» 16(%rsp),%rbx | 1617 » mov» 16(%rsp),%r14 |
1038 » add» \$24,%rsp | 1618 » mov» 24(%rsp),%r13 |
| 1619 » mov» 32(%rsp),%r12 |
| 1620 » mov» 40(%rsp),%rbp |
| 1621 » mov» 48(%rsp),%rbx |
| 1622 » add» \$56,%rsp |
| 1623 .Ldec_key_epilogue: |
1039 ret | 1624 ret |
1040 .size AES_set_decrypt_key,.-AES_set_decrypt_key | 1625 .size AES_set_decrypt_key,.-AES_set_decrypt_key |
1041 ___ | 1626 ___ |
1042 | 1627 |
1043 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, | 1628 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, |
1044 # size_t length, const AES_KEY *key, | 1629 # size_t length, const AES_KEY *key, |
1045 # unsigned char *ivp,const int enc); | 1630 # unsigned char *ivp,const int enc); |
1046 { | 1631 { |
1047 # stack frame layout | 1632 # stack frame layout |
1048 # -8(%rsp) return address | 1633 # -8(%rsp) return address |
1049 my $_rsp="0(%rsp)";» » # saved %rsp | 1634 my $keyp="0(%rsp)";» » # one to pass as $key |
1050 my $_len="8(%rsp)";» » # copy of 3rd parameter, length | 1635 my $keyend="8(%rsp)";» » # &(keyp->rd_key[4*keyp->rounds]) |
1051 my $_key="16(%rsp)";» » # copy of 4th parameter, key | 1636 my $_rsp="16(%rsp)";» » # saved %rsp |
1052 my $_ivp="24(%rsp)";» » # copy of 5th parameter, ivp | 1637 my $_inp="24(%rsp)";» » # copy of 1st parameter, inp |
1053 my $keyp="32(%rsp)";» » # one to pass as $key | 1638 my $_out="32(%rsp)";» » # copy of 2nd parameter, out |
1054 my $ivec="40(%rsp)";» » # ivec[16] | 1639 my $_len="40(%rsp)";» » # copy of 3rd parameter, length |
1055 my $aes_key="56(%rsp)";»» # copy of aes_key | 1640 my $_key="48(%rsp)";» » # copy of 4th parameter, key |
1056 my $mark="56+240(%rsp)";» # copy of aes_key->rounds | 1641 my $_ivp="56(%rsp)";» » # copy of 5th parameter, ivp |
| 1642 my $ivec="64(%rsp)";» » # ivec[16] |
| 1643 my $aes_key="80(%rsp)";»» # copy of aes_key |
| 1644 my $mark="80+240(%rsp)";» # copy of aes_key->rounds |
1057 | 1645 |
1058 $code.=<<___; | 1646 $code.=<<___; |
1059 .globl AES_cbc_encrypt | 1647 .globl AES_cbc_encrypt |
1060 .type AES_cbc_encrypt,\@function,6 | 1648 .type AES_cbc_encrypt,\@function,6 |
1061 .align 16 | 1649 .align 16 |
| 1650 .extern OPENSSL_ia32cap_P |
1062 AES_cbc_encrypt: | 1651 AES_cbc_encrypt: |
1063 cmp \$0,%rdx # check length | 1652 cmp \$0,%rdx # check length |
1064 » je» .Lcbc_just_ret | 1653 » je» .Lcbc_epilogue |
| 1654 » pushfq |
1065 push %rbx | 1655 push %rbx |
1066 push %rbp | 1656 push %rbp |
1067 push %r12 | 1657 push %r12 |
1068 push %r13 | 1658 push %r13 |
1069 push %r14 | 1659 push %r14 |
1070 push %r15 | 1660 push %r15 |
1071 » pushfq | 1661 .Lcbc_prologue: |
| 1662 |
1072 cld | 1663 cld |
1073 mov %r9d,%r9d # clear upper half of enc | 1664 mov %r9d,%r9d # clear upper half of enc |
1074 | 1665 |
1075 » .picmeup $sbox | 1666 » lea» .LAES_Te(%rip),$sbox |
1076 .Lcbc_pic_point: | 1667 » cmp» \$0,%r9 |
| 1668 » jne» .Lcbc_picked_te |
| 1669 » lea» .LAES_Td(%rip),$sbox |
| 1670 .Lcbc_picked_te: |
1077 | 1671 |
1078 » cmp» \$0,%r9 | 1672 » mov» OPENSSL_ia32cap_P(%rip),%r10d |
1079 » je» .LDECRYPT | 1673 » cmp» \$$speed_limit,%rdx |
1080 | 1674 » jb» .Lcbc_slow_prologue |
1081 » lea» AES_Te-.Lcbc_pic_point($sbox),$sbox | 1675 » test» \$15,%rdx |
| 1676 » jnz» .Lcbc_slow_prologue |
| 1677 » bt» \$28,%r10d |
| 1678 » jc» .Lcbc_slow_prologue |
1082 | 1679 |
1083 # allocate aligned stack frame... | 1680 # allocate aligned stack frame... |
1084 » lea» -64-248(%rsp),$key | 1681 » lea» -88-248(%rsp),$key |
1085 and \$-64,$key | 1682 and \$-64,$key |
1086 | 1683 |
1087 » # ... and make it doesn't alias with AES_Te modulo 4096 | 1684 » # ... and make sure it doesn't alias with AES_T[ed] modulo 4096 |
1088 mov $sbox,%r10 | 1685 mov $sbox,%r10 |
1089 » lea» 2048($sbox),%r11 | 1686 » lea» 2304($sbox),%r11 |
1090 mov $key,%r12 | 1687 mov $key,%r12 |
1091 and \$0xFFF,%r10 # s = $sbox&0xfff | 1688 and \$0xFFF,%r10 # s = $sbox&0xfff |
1092 and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff | 1689 and \$0xFFF,%r11 # e = ($sbox+2304)&0xfff |
1093 and \$0xFFF,%r12 # p = %rsp&0xfff | 1690 and \$0xFFF,%r12 # p = %rsp&0xfff |
1094 | 1691 |
1095 cmp %r11,%r12 # if (p>=e) %rsp -= (p-e); | 1693 cmp %r11,%r12 # if (p>=e) %rsp -= (p-e); |
1096 jb .Lcbc_te_break_out | 1693 jb .Lcbc_te_break_out |
1097 sub %r11,%r12 | 1694 sub %r11,%r12 |
1098 sub %r12,$key | 1695 sub %r12,$key |
1099 jmp .Lcbc_te_ok | 1696 jmp .Lcbc_te_ok |
1100 .Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz | 1697 .Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz |
1101 sub %r10,%r12 | 1698 sub %r10,%r12 |
1102 and \$0xFFF,%r12 | 1699 and \$0xFFF,%r12 |
1103 add \$320,%r12 | 1700 add \$320,%r12 |
1104 sub %r12,$key | 1701 sub %r12,$key |
1105 .align 4 | 1702 .align 4 |
1106 .Lcbc_te_ok: | 1703 .Lcbc_te_ok: |
1107 | 1704 |
1108 xchg %rsp,$key | 1705 xchg %rsp,$key |
1109 » add» \$8,%rsp» # reserve for return address! | 1706 » #add» \$8,%rsp» # reserve for return address! |
1110 mov $key,$_rsp # save %rsp | 1707 mov $key,$_rsp # save %rsp |
| 1708 .Lcbc_fast_body: |
| 1709 mov %rdi,$_inp # save copy of inp |
| 1710 mov %rsi,$_out # save copy of out |
1111 mov %rdx,$_len # save copy of len | 1711 mov %rdx,$_len # save copy of len |
1112 mov %rcx,$_key # save copy of key | 1712 mov %rcx,$_key # save copy of key |
1113 mov %r8,$_ivp # save copy of ivp | 1713 mov %r8,$_ivp # save copy of ivp |
1114 movl \$0,$mark # copy of aes_key->rounds = 0; | 1714 movl \$0,$mark # copy of aes_key->rounds = 0; |
1115 mov %r8,%rbp # rearrange input arguments | 1715 mov %r8,%rbp # rearrange input arguments |
| 1716 mov %r9,%rbx |
1116 mov %rsi,$out | 1717 mov %rsi,$out |
1117 mov %rdi,$inp | 1718 mov %rdi,$inp |
1118 mov %rcx,$key | 1719 mov %rcx,$key |
1119 | 1720 |
| 1721 mov 240($key),%eax # key->rounds |
1120 # do we copy key schedule to stack? | 1722 # do we copy key schedule to stack? |
1121 mov $key,%r10 | 1723 mov $key,%r10 |
1122 sub $sbox,%r10 | 1724 sub $sbox,%r10 |
1123 and \$0xfff,%r10 | 1725 and \$0xfff,%r10 |
1124 » cmp» \$2048,%r10 | 1726 » cmp» \$2304,%r10 |
1125 jb .Lcbc_do_ecopy | 1727 jb .Lcbc_do_ecopy |
1126 cmp \$4096-248,%r10 | 1728 cmp \$4096-248,%r10 |
1127 jb .Lcbc_skip_ecopy | 1729 jb .Lcbc_skip_ecopy |
1128 .align 4 | 1730 .align 4 |
1129 .Lcbc_do_ecopy: | 1731 .Lcbc_do_ecopy: |
1130 mov $key,%rsi | 1732 mov $key,%rsi |
1131 lea $aes_key,%rdi | 1733 lea $aes_key,%rdi |
1132 lea $aes_key,$key | 1734 lea $aes_key,$key |
1133 mov \$240/8,%ecx | 1735 mov \$240/8,%ecx |
1134 .long 0x90A548F3 # rep movsq | 1736 .long 0x90A548F3 # rep movsq |
1135 » » mov» (%rsi),%eax» # copy aes_key->rounds | 1737 » » mov» %eax,(%rdi)» # copy aes_key->rounds |
1136 » » mov» %eax,(%rdi) | |
1137 .Lcbc_skip_ecopy: | 1738 .Lcbc_skip_ecopy: |
1138 mov $key,$keyp # save key pointer | 1739 mov $key,$keyp # save key pointer |
1139 | 1740 |
1140 » mov» \$16,%ecx | 1741 » mov» \$18,%ecx |
1141 .align 4 | 1742 .align 4 |
1142 .Lcbc_prefetch_te: | 1743 .Lcbc_prefetch_te: |
1143 mov 0($sbox),%r10 | 1744 mov 0($sbox),%r10 |
1144 mov 32($sbox),%r11 | 1745 mov 32($sbox),%r11 |
1145 mov 64($sbox),%r12 | 1746 mov 64($sbox),%r12 |
1146 mov 96($sbox),%r13 | 1747 mov 96($sbox),%r13 |
1147 lea 128($sbox),$sbox | 1748 lea 128($sbox),$sbox |
1148 sub \$1,%ecx | 1749 sub \$1,%ecx |
1149 jnz .Lcbc_prefetch_te | 1750 jnz .Lcbc_prefetch_te |
1150 » sub» \$2048,$sbox | 1751 » lea» -2304($sbox),$sbox |
1151 | 1752 |
1152 » test» \$-16,%rdx» » # check upon length | 1753 » cmp» \$0,%rbx |
1153 » mov» %rdx,%r10 | 1754 » je» .LFAST_DECRYPT |
| 1755 |
| 1756 #----------------------------- ENCRYPT -----------------------------# |
1154 mov 0(%rbp),$s0 # load iv | 1757 mov 0(%rbp),$s0 # load iv |
1155 mov 4(%rbp),$s1 | 1758 mov 4(%rbp),$s1 |
1156 mov 8(%rbp),$s2 | 1759 mov 8(%rbp),$s2 |
1157 mov 12(%rbp),$s3 | 1760 mov 12(%rbp),$s3 |
1158 jz .Lcbc_enc_tail # short input... | |
1159 | 1761 |
1160 .align 4 | 1762 .align 4 |
1161 .Lcbc_enc_loop: | 1763 .Lcbc_fast_enc_loop: |
1162 xor 0($inp),$s0 | 1764 xor 0($inp),$s0 |
1163 xor 4($inp),$s1 | 1765 xor 4($inp),$s1 |
1164 xor 8($inp),$s2 | 1766 xor 8($inp),$s2 |
1165 xor 12($inp),$s3 | 1767 xor 12($inp),$s3 |
1166 » » mov» $inp,$ivec» # if ($verticalspin) save inp | 1768 » » mov» $keyp,$key» # restore key |
| 1769 » » mov» $inp,$_inp» # if ($verticalspin) save inp |
1167 | 1770 |
1168 mov $keyp,$key # restore key | |
1169 call _x86_64_AES_encrypt | 1771 call _x86_64_AES_encrypt |
1170 | 1772 |
1171 » » mov» $ivec,$inp» # if ($verticalspin) restore inp | 1773 » » mov» $_inp,$inp» # if ($verticalspin) restore inp |
| 1774 » » mov» $_len,%r10 |
1172 mov $s0,0($out) | 1775 mov $s0,0($out) |
1173 mov $s1,4($out) | 1776 mov $s1,4($out) |
1174 mov $s2,8($out) | 1777 mov $s2,8($out) |
1175 mov $s3,12($out) | 1778 mov $s3,12($out) |
1176 | 1779 |
1177 mov $_len,%r10 | |
1178 lea 16($inp),$inp | 1780 lea 16($inp),$inp |
1179 lea 16($out),$out | 1781 lea 16($out),$out |
1180 sub \$16,%r10 | 1782 sub \$16,%r10 |
1181 test \$-16,%r10 | 1783 test \$-16,%r10 |
1182 mov %r10,$_len | 1784 mov %r10,$_len |
1183 » jnz» .Lcbc_enc_loop | 1785 » jnz» .Lcbc_fast_enc_loop |
1184 » test» \$15,%r10 | |
1185 » jnz» .Lcbc_enc_tail | |
1186 mov $_ivp,%rbp # restore ivp | 1786 mov $_ivp,%rbp # restore ivp |
1187 mov $s0,0(%rbp) # save ivec | 1787 mov $s0,0(%rbp) # save ivec |
1188 mov $s1,4(%rbp) | 1788 mov $s1,4(%rbp) |
1189 mov $s2,8(%rbp) | 1789 mov $s2,8(%rbp) |
1190 mov $s3,12(%rbp) | 1790 mov $s3,12(%rbp) |
1191 | 1791 |
1192 .align» 4 | 1792 » jmp» .Lcbc_fast_cleanup |
1193 .Lcbc_cleanup: | 1793 |
1194 » cmpl» \$0,$mark» # was the key schedule copied? | |
1195 » lea» $aes_key,%rdi | |
1196 » je» .Lcbc_exit | |
1197 » » mov» \$240/8,%ecx | |
1198 » » xor» %rax,%rax | |
1199 » » .long» 0x90AB48F3» # rep stosq | |
1200 .Lcbc_exit: | |
1201 » mov» $_rsp,%rsp | |
1202 » popfq | |
1203 » pop» %r15 | |
1204 » pop» %r14 | |
1205 » pop» %r13 | |
1206 » pop» %r12 | |
1207 » pop» %rbp | |
1208 » pop» %rbx | |
1209 .Lcbc_just_ret: | |
1210 » ret | |
1211 .align» 4 | |
1212 .Lcbc_enc_tail: | |
1213 » mov» %rax,%r11 | |
1214 » mov» %rcx,%r12 | |
1215 » mov» %r10,%rcx | |
1216 » mov» $inp,%rsi | |
1217 » mov» $out,%rdi | |
1218 » .long» 0xF689A4F3» » # rep movsb | |
1219 » mov» \$16,%rcx» » # zero tail | |
1220 » sub» %r10,%rcx | |
1221 » xor» %rax,%rax | |
1222 » .long» 0xF689AAF3» » # rep stosb | |
1223 » mov» $out,$inp» » # this is not a mistake! | |
1224 » movq» \$16,$_len» » # len=16 | |
1225 » mov» %r11,%rax | |
1226 » mov» %r12,%rcx | |
1227 » jmp» .Lcbc_enc_loop» » # one more spin... | |
1228 #----------------------------- DECRYPT -----------------------------# | 1794 #----------------------------- DECRYPT -----------------------------# |
1229 .align 16 | 1795 .align 16 |
1230 .LDECRYPT: | 1796 .LFAST_DECRYPT: |
1231 » lea» AES_Td-.Lcbc_pic_point($sbox),$sbox | |
1232 | |
1233 » # allocate aligned stack frame... | |
1234 » lea» -64-248(%rsp),$key | |
1235 » and» \$-64,$key | |
1236 | |
1237 » # ... and make it doesn't alias with AES_Td modulo 4096 | |
1238 » mov» $sbox,%r10 | |
1239 » lea» 2304($sbox),%r11 | |
1240 » mov» $key,%r12 | |
1241 » and» \$0xFFF,%r10» # s = $sbox&0xfff | |
1242 » and» \$0xFFF,%r11» # e = ($sbox+2048+256)&0xfff | |
1243 » and» \$0xFFF,%r12» # p = %rsp&0xfff | |
1244 | |
1245 » cmp» %r11,%r12» # if (p>=e) %rsp -= (p-e); | |
1246 » jb» .Lcbc_td_break_out | |
1247 » sub» %r11,%r12 | |
1248 » sub» %r12,$key | |
1249 » jmp» .Lcbc_td_ok | |
1250 .Lcbc_td_break_out:» » # else %rsp -= (p-s)&0xfff + framesz | |
1251 » sub» %r10,%r12 | |
1252 » and» \$0xFFF,%r12 | |
1253 » add» \$320,%r12 | |
1254 » sub» %r12,$key | |
1255 .align» 4 | |
1256 .Lcbc_td_ok: | |
1257 | |
1258 » xchg» %rsp,$key | |
1259 » add» \$8,%rsp» # reserve for return address! | |
1260 » mov» $key,$_rsp» # save %rsp | |
1261 » mov» %rdx,$_len» # save copy of len | |
1262 » mov» %rcx,$_key» # save copy of key | |
1263 » mov» %r8,$_ivp» # save copy of ivp | |
1264 » movl» \$0,$mark» # copy of aes_key->rounds = 0; | |
1265 » mov» %r8,%rbp» # rearrange input arguments | |
1266 » mov» %rsi,$out | |
1267 » mov» %rdi,$inp | |
1268 » mov» %rcx,$key | |
1269 | |
1270 » # do we copy key schedule to stack? | |
1271 » mov» $key,%r10 | |
1272 » sub» $sbox,%r10 | |
1273 » and» \$0xfff,%r10 | |
1274 » cmp» \$2304,%r10 | |
1275 » jb» .Lcbc_do_dcopy | |
1276 » cmp» \$4096-248,%r10 | |
1277 » jb» .Lcbc_skip_dcopy | |
1278 .align» 4 | |
1279 .Lcbc_do_dcopy: | |
1280 » » mov» $key,%rsi | |
1281 » » lea» $aes_key,%rdi | |
1282 » » lea» $aes_key,$key | |
1283 » » mov» \$240/8,%ecx | |
1284 » » .long» 0x90A548F3» # rep movsq | |
1285 » » mov» (%rsi),%eax» # copy aes_key->rounds | |
1286 » » mov» %eax,(%rdi) | |
1287 .Lcbc_skip_dcopy: | |
1288 » mov» $key,$keyp» # save key pointer | |
1289 | |
1290 » mov» \$18,%ecx | |
1291 .align» 4 | |
1292 .Lcbc_prefetch_td: | |
1293 » » mov» 0($sbox),%r10 | |
1294 » » mov» 32($sbox),%r11 | |
1295 » » mov» 64($sbox),%r12 | |
1296 » » mov» 96($sbox),%r13 | |
1297 » » lea» 128($sbox),$sbox | |
1298 » » sub» \$1,%ecx | |
1299 » jnz» .Lcbc_prefetch_td | |
1300 » sub» \$2304,$sbox | |
1301 | |
1302 cmp $inp,$out | 1797 cmp $inp,$out |
1303 » je» .Lcbc_dec_in_place | 1798 » je» .Lcbc_fast_dec_in_place |
1304 | 1799 |
1305 mov %rbp,$ivec | 1800 mov %rbp,$ivec |
1306 .align 4 | 1801 .align 4 |
1307 .Lcbc_dec_loop: | 1802 .Lcbc_fast_dec_loop: |
1308 » » mov» 0($inp),$s0» » # read input | 1803 » » mov» 0($inp),$s0» # read input |
1309 mov 4($inp),$s1 | 1804 mov 4($inp),$s1 |
1310 mov 8($inp),$s2 | 1805 mov 8($inp),$s2 |
1311 mov 12($inp),$s3 | 1806 mov 12($inp),$s3 |
1312 » » mov» $inp,8+$ivec» # if ($verticalspin) save inp | 1807 » » mov» $keyp,$key» # restore key |
| 1808 » » mov» $inp,$_inp» # if ($verticalspin) save inp |
1313 | 1809 |
1314 mov $keyp,$key # restore key | |
1315 call _x86_64_AES_decrypt | 1810 call _x86_64_AES_decrypt |
1316 | 1811 |
1317 mov $ivec,%rbp # load ivp | 1812 mov $ivec,%rbp # load ivp |
1318 » » mov» 8+$ivec,$inp» # if ($verticalspin) restore inp | 1813 » » mov» $_inp,$inp» # if ($verticalspin) restore inp |
| 1814 » » mov» $_len,%r10» # load len |
1319 xor 0(%rbp),$s0 # xor iv | 1815 xor 0(%rbp),$s0 # xor iv |
1320 xor 4(%rbp),$s1 | 1816 xor 4(%rbp),$s1 |
1321 xor 8(%rbp),$s2 | 1817 xor 8(%rbp),$s2 |
1322 xor 12(%rbp),$s3 | 1818 xor 12(%rbp),$s3 |
1323 mov $inp,%rbp # current input, next iv | 1819 mov $inp,%rbp # current input, next iv |
1324 | 1820 |
1325 mov $_len,%r10 # load len | |
1326 sub \$16,%r10 | 1821 sub \$16,%r10 |
1327 jc .Lcbc_dec_partial | |
1328 mov %r10,$_len # update len | 1822 mov %r10,$_len # update len |
1329 mov %rbp,$ivec # update ivp | 1823 mov %rbp,$ivec # update ivp |
1330 | 1824 |
1331 mov $s0,0($out) # write output | 1825 mov $s0,0($out) # write output |
1332 mov $s1,4($out) | 1826 mov $s1,4($out) |
1333 mov $s2,8($out) | 1827 mov $s2,8($out) |
1334 mov $s3,12($out) | 1828 mov $s3,12($out) |
1335 | 1829 |
1336 lea 16($inp),$inp | 1830 lea 16($inp),$inp |
1337 lea 16($out),$out | 1831 lea 16($out),$out |
1338 » jnz» .Lcbc_dec_loop | 1832 » jnz» .Lcbc_fast_dec_loop |
1339 .Lcbc_dec_end: | |
1340 mov $_ivp,%r12 # load user ivp | 1833 mov $_ivp,%r12 # load user ivp |
1341 mov 0(%rbp),%r10 # load iv | 1834 mov 0(%rbp),%r10 # load iv |
1342 mov 8(%rbp),%r11 | 1835 mov 8(%rbp),%r11 |
1343 mov %r10,0(%r12) # copy back to user | 1836 mov %r10,0(%r12) # copy back to user |
1344 mov %r11,8(%r12) | 1837 mov %r11,8(%r12) |
1345 » jmp» .Lcbc_cleanup | 1838 » jmp» .Lcbc_fast_cleanup |
1346 | 1839 |
1347 .align» 4 | 1840 .align» 16 |
1348 .Lcbc_dec_partial: | 1841 .Lcbc_fast_dec_in_place: |
1349 » mov» $s0,0+$ivec» » # dump output to stack | 1842 » mov» 0(%rbp),%r10» » # copy iv to stack |
1350 » mov» $s1,4+$ivec | 1843 » mov» 8(%rbp),%r11 |
1351 » mov» $s2,8+$ivec | 1844 » mov» %r10,0+$ivec |
1352 » mov» $s3,12+$ivec | 1845 » mov» %r11,8+$ivec |
1353 » mov» $out,%rdi | 1846 .align» 4 |
1354 » lea» $ivec,%rsi | 1847 .Lcbc_fast_dec_in_place_loop: |
1355 » mov» \$16,%rcx | |
1356 » add» %r10,%rcx» » # number of bytes to copy | |
1357 » .long» 0xF689A4F3» » # rep movsb | |
1358 » jmp» .Lcbc_dec_end | |
1359 | |
1360 .align» 16 | |
1361 .Lcbc_dec_in_place: | |
1362 mov 0($inp),$s0 # load input | 1848 mov 0($inp),$s0 # load input |
1363 mov 4($inp),$s1 | 1849 mov 4($inp),$s1 |
1364 mov 8($inp),$s2 | 1850 mov 8($inp),$s2 |
1365 mov 12($inp),$s3 | 1851 mov 12($inp),$s3 |
1366 | 1852 » » mov» $keyp,$key» # restore key |
1367 » » mov» $inp,$ivec» # if ($verticalspin) save inp | 1853 » » mov» $inp,$_inp» # if ($verticalspin) save inp |
1368 » » mov» $keyp,$key | 1854 |
1369 call _x86_64_AES_decrypt | 1855 call _x86_64_AES_decrypt |
1370 | 1856 |
1371 » » mov» $ivec,$inp» # if ($verticalspin) restore inp | 1857 » » mov» $_inp,$inp» # if ($verticalspin) restore inp |
1372 » » mov» $_ivp,%rbp | 1858 » » mov» $_len,%r10 |
1373 » » xor» 0(%rbp),$s0 | 1859 » » xor» 0+$ivec,$s0 |
1374 » » xor» 4(%rbp),$s1 | 1860 » » xor» 4+$ivec,$s1 |
1375 » » xor» 8(%rbp),$s2 | 1861 » » xor» 8+$ivec,$s2 |
1376 » » xor» 12(%rbp),$s3 | 1862 » » xor» 12+$ivec,$s3 |
1377 | 1863 |
1378 » » mov» 0($inp),%r10» # copy input to iv | 1864 » » mov» 0($inp),%r11» # load input |
1379 » » mov» 8($inp),%r11 | 1865 » » mov» 8($inp),%r12 |
1380 » » mov» %r10,0(%rbp) | 1866 » » sub» \$16,%r10 |
1381 » » mov» %r11,8(%rbp) | 1867 » » jz» .Lcbc_fast_dec_in_place_done |
| 1868 |
| 1869 » » mov» %r11,0+$ivec» # copy input to iv |
| 1870 » » mov» %r12,8+$ivec |
1382 | 1871 |
1383 mov $s0,0($out) # save output [zaps input] | 1872 mov $s0,0($out) # save output [zaps input] |
1384 mov $s1,4($out) | 1873 mov $s1,4($out) |
1385 mov $s2,8($out) | 1874 mov $s2,8($out) |
1386 mov $s3,12($out) | 1875 mov $s3,12($out) |
1387 | 1876 |
1388 » » mov» $_len,%rcx | 1877 » » lea» 16($inp),$inp |
1389 » » lea» 16($inp),$inp | 1878 » » lea» 16($out),$out |
1390 » » lea» 16($out),$out | 1879 » » mov» %r10,$_len |
1391 » » sub» \$16,%rcx | 1880 » jmp» .Lcbc_fast_dec_in_place_loop |
1392 » » jc» .Lcbc_dec_in_place_partial | 1881 .Lcbc_fast_dec_in_place_done: |
1393 » » mov» %rcx,$_len | 1882 » mov» $_ivp,%rdi |
1394 » jnz» .Lcbc_dec_in_place | 1883 » mov» %r11,0(%rdi)» # copy iv back to user |
1395 » jmp» .Lcbc_cleanup | 1884 » mov» %r12,8(%rdi) |
1396 | 1885 |
1397 .align» 4 | 1886 » mov» $s0,0($out)» # save output [zaps input] |
1398 .Lcbc_dec_in_place_partial: | 1887 » mov» $s1,4($out) |
1399 » # one can argue if this is actually required | 1888 » mov» $s2,8($out) |
1400 » lea» ($out,%rcx),%rdi | 1889 » mov» $s3,12($out) |
1401 » lea» (%rbp,%rcx),%rsi | 1890 |
1402 » neg» %rcx | 1891 .align» 4 |
1403 » .long» 0xF689A4F3» # rep movsb» # restore tail | 1892 .Lcbc_fast_cleanup: |
1404 » jmp» .Lcbc_cleanup | 1893 » cmpl» \$0,$mark» # was the key schedule copied? |
| 1894 » lea» $aes_key,%rdi |
| 1895 » je» .Lcbc_exit |
| 1896 » » mov» \$240/8,%ecx |
| 1897 » » xor» %rax,%rax |
| 1898 » » .long» 0x90AB48F3» # rep stosq |
| 1899 |
| 1900 » jmp» .Lcbc_exit |
| 1901 |
| 1902 #--------------------------- SLOW ROUTINE ---------------------------# |
| 1903 .align» 16 |
| 1904 .Lcbc_slow_prologue: |
| 1905 » # allocate aligned stack frame... |
| 1906 » lea» -88(%rsp),%rbp |
| 1907 » and» \$-64,%rbp |
| 1908 » # ... just "above" key schedule |
| 1909 » lea» -88-63(%rcx),%r10 |
| 1910 » sub» %rbp,%r10 |
| 1911 » neg» %r10 |
| 1912 » and» \$0x3c0,%r10 |
| 1913 » sub» %r10,%rbp |
| 1914 |
| 1915 » xchg» %rsp,%rbp |
| 1916 » #add» \$8,%rsp» # reserve for return address! |
| 1917 » mov» %rbp,$_rsp» # save %rsp |
| 1918 .Lcbc_slow_body: |
| 1919 » #mov» %rdi,$_inp» # save copy of inp |
| 1920 » #mov» %rsi,$_out» # save copy of out |
| 1921 » #mov» %rdx,$_len» # save copy of len |
| 1922 » #mov» %rcx,$_key» # save copy of key |
| 1923 » mov» %r8,$_ivp» # save copy of ivp |
| 1924 » mov» %r8,%rbp» # rearrange input arguments |
| 1925 » mov» %r9,%rbx |
| 1926 » mov» %rsi,$out |
| 1927 » mov» %rdi,$inp |
| 1928 » mov» %rcx,$key |
| 1929 » mov» %rdx,%r10 |
| 1930 |
| 1931 » mov» 240($key),%eax |
| 1932 » mov» $key,$keyp» # save key pointer |
| 1933 » shl» \$4,%eax |
| 1934 » lea» ($key,%rax),%rax |
| 1935 » mov» %rax,$keyend |
| 1936 |
| 1937 » # pick Te4 copy which can't "overlap" with stack frame or key schedule |
| 1938 » lea» 2048($sbox),$sbox |
| 1939 » lea» 768-8(%rsp),%rax |
| 1940 » sub» $sbox,%rax |
| 1941 » and» \$0x300,%rax |
| 1942 » lea» ($sbox,%rax),$sbox |
| 1943 |
| 1944 » cmp» \$0,%rbx |
| 1945 » je» .LSLOW_DECRYPT |
| 1946 |
| 1947 #--------------------------- SLOW ENCRYPT ---------------------------# |
| 1948 » test» \$-16,%r10» » # check upon length |
| 1949 » mov» 0(%rbp),$s0» » # load iv |
| 1950 » mov» 4(%rbp),$s1 |
| 1951 » mov» 8(%rbp),$s2 |
| 1952 » mov» 12(%rbp),$s3 |
| 1953 » jz» .Lcbc_slow_enc_tail» # short input... |
| 1954 |
| 1955 .align» 4 |
| 1956 .Lcbc_slow_enc_loop: |
| 1957 » » xor» 0($inp),$s0 |
| 1958 » » xor» 4($inp),$s1 |
| 1959 » » xor» 8($inp),$s2 |
| 1960 » » xor» 12($inp),$s3 |
| 1961 » » mov» $keyp,$key» # restore key |
| 1962 » » mov» $inp,$_inp» # save inp |
| 1963 » » mov» $out,$_out» # save out |
| 1964 » » mov» %r10,$_len» # save len |
| 1965 |
| 1966 » » call» _x86_64_AES_encrypt_compact |
| 1967 |
| 1968 » » mov» $_inp,$inp» # restore inp |
| 1969 » » mov» $_out,$out» # restore out |
| 1970 » » mov» $_len,%r10» # restore len |
| 1971 » » mov» $s0,0($out) |
| 1972 » » mov» $s1,4($out) |
| 1973 » » mov» $s2,8($out) |
| 1974 » » mov» $s3,12($out) |
| 1975 |
| 1976 » » lea» 16($inp),$inp |
| 1977 » » lea» 16($out),$out |
| 1978 » » sub» \$16,%r10 |
| 1979 » » test» \$-16,%r10 |
| 1980 » jnz» .Lcbc_slow_enc_loop |
| 1981 » test» \$15,%r10 |
| 1982 » jnz» .Lcbc_slow_enc_tail |
| 1983 » mov» $_ivp,%rbp» # restore ivp |
| 1984 » mov» $s0,0(%rbp)» # save ivec |
| 1985 » mov» $s1,4(%rbp) |
| 1986 » mov» $s2,8(%rbp) |
| 1987 » mov» $s3,12(%rbp) |
| 1988 |
| 1989 » jmp» .Lcbc_exit |
| 1990 |
| 1991 .align» 4 |
| 1992 .Lcbc_slow_enc_tail: |
| 1993 » mov» %rax,%r11 |
| 1994 » mov» %rcx,%r12 |
| 1995 » mov» %r10,%rcx |
| 1996 » mov» $inp,%rsi |
| 1997 » mov» $out,%rdi |
| 1998 » .long» 0x9066A4F3» » # rep movsb |
| 1999 » mov» \$16,%rcx» » # zero tail |
| 2000 » sub» %r10,%rcx |
| 2001 » xor» %rax,%rax |
| 2002 » .long» 0x9066AAF3» » # rep stosb |
| 2003 » mov» $out,$inp» » # this is not a mistake! |
| 2004 » mov» \$16,%r10» » # len=16 |
| 2005 » mov» %r11,%rax |
| 2006 » mov» %r12,%rcx |
| 2007 » jmp» .Lcbc_slow_enc_loop» # one more spin... |
| 2008 #--------------------------- SLOW DECRYPT ---------------------------# |
| 2009 .align» 16 |
| 2010 .LSLOW_DECRYPT: |
| 2011 » shr» \$3,%rax |
| 2012 » add» %rax,$sbox» » # recall "magic" constants! |
| 2013 |
| 2014 » mov» 0(%rbp),%r11» » # copy iv to stack |
| 2015 » mov» 8(%rbp),%r12 |
| 2016 » mov» %r11,0+$ivec |
| 2017 » mov» %r12,8+$ivec |
| 2018 |
| 2019 .align» 4 |
| 2020 .Lcbc_slow_dec_loop: |
| 2021 » » mov» 0($inp),$s0» # load input |
| 2022 » » mov» 4($inp),$s1 |
| 2023 » » mov» 8($inp),$s2 |
| 2024 » » mov» 12($inp),$s3 |
| 2025 » » mov» $keyp,$key» # restore key |
| 2026 » » mov» $inp,$_inp» # save inp |
| 2027 » » mov» $out,$_out» # save out |
| 2028 » » mov» %r10,$_len» # save len |
| 2029 |
| 2030 » » call» _x86_64_AES_decrypt_compact |
| 2031 |
| 2032 » » mov» $_inp,$inp» # restore inp |
| 2033 » » mov» $_out,$out» # restore out |
| 2034 » » mov» $_len,%r10 |
| 2035 » » xor» 0+$ivec,$s0 |
| 2036 » » xor» 4+$ivec,$s1 |
| 2037 » » xor» 8+$ivec,$s2 |
| 2038 » » xor» 12+$ivec,$s3 |
| 2039 |
| 2040 » » mov» 0($inp),%r11» # load input |
| 2041 » » mov» 8($inp),%r12 |
| 2042 » » sub» \$16,%r10 |
| 2043 » » jc» .Lcbc_slow_dec_partial |
| 2044 » » jz» .Lcbc_slow_dec_done |
| 2045 |
| 2046 » » mov» %r11,0+$ivec» # copy input to iv |
| 2047 » » mov» %r12,8+$ivec |
| 2048 |
| 2049 » » mov» $s0,0($out)» # save output [can zap input] |
| 2050 » » mov» $s1,4($out) |
| 2051 » » mov» $s2,8($out) |
| 2052 » » mov» $s3,12($out) |
| 2053 |
| 2054 » » lea» 16($inp),$inp |
| 2055 » » lea» 16($out),$out |
| 2056 » jmp» .Lcbc_slow_dec_loop |
| 2057 .Lcbc_slow_dec_done: |
| 2058 » mov» $_ivp,%rdi |
| 2059 » mov» %r11,0(%rdi)» » # copy iv back to user |
| 2060 » mov» %r12,8(%rdi) |
| 2061 |
| 2062 » mov» $s0,0($out)» » # save output [can zap input] |
| 2063 » mov» $s1,4($out) |
| 2064 » mov» $s2,8($out) |
| 2065 » mov» $s3,12($out) |
| 2066 |
| 2067 » jmp» .Lcbc_exit |
| 2068 |
| 2069 .align» 4 |
| 2070 .Lcbc_slow_dec_partial: |
| 2071 » mov» $_ivp,%rdi |
| 2072 » mov» %r11,0(%rdi)» » # copy iv back to user |
| 2073 » mov» %r12,8(%rdi) |
| 2074 |
| 2075 » mov» $s0,0+$ivec» » # save output to stack |
| 2076 » mov» $s1,4+$ivec |
| 2077 » mov» $s2,8+$ivec |
| 2078 » mov» $s3,12+$ivec |
| 2079 |
| 2080 » mov» $out,%rdi |
| 2081 » lea» $ivec,%rsi |
| 2082 » lea» 16(%r10),%rcx |
| 2083 » .long» 0x9066A4F3» # rep movsb |
| 2084 » jmp» .Lcbc_exit |
| 2085 |
| 2086 .align» 16 |
| 2087 .Lcbc_exit: |
| 2088 » mov» $_rsp,%rsi |
| 2089 » mov» (%rsi),%r15 |
| 2090 » mov» 8(%rsi),%r14 |
| 2091 » mov» 16(%rsi),%r13 |
| 2092 » mov» 24(%rsi),%r12 |
| 2093 » mov» 32(%rsi),%rbp |
| 2094 » mov» 40(%rsi),%rbx |
| 2095 » lea» 48(%rsi),%rsp |
| 2096 .Lcbc_popfq: |
| 2097 » popfq |
| 2098 .Lcbc_epilogue: |
| 2099 » ret |
1405 .size AES_cbc_encrypt,.-AES_cbc_encrypt | 2100 .size AES_cbc_encrypt,.-AES_cbc_encrypt |
1406 ___ | 2101 ___ |
1407 } | 2102 } |
1408 | 2103 |
1409 $code.=<<___; | 2104 $code.=<<___; |
1410 .globl AES_Te | |
1411 .align 64 | 2105 .align 64 |
1412 AES_Te: | 2106 .LAES_Te: |
1413 ___ | 2107 ___ |
1414 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); | 2108 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); |
1415 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); | 2109 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); |
1416 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56); | 2110 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56); |
1417 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec); | 2111 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec); |
1418 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa); | 2112 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa); |
1419 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb); | 2113 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb); |
1420 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45); | 2114 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45); |
1421 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b); | 2115 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b); |
1422 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c); | 2116 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c); |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1468 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); | 2162 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); |
1469 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); | 2163 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); |
1470 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); | 2164 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); |
1471 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); | 2165 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); |
1472 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); | 2166 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); |
1473 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); | 2167 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); |
1474 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); | 2168 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); |
1475 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | 2169 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); |
1476 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | 2170 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); |
1477 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | 2171 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); |
| 2172 |
| 2173 #Te4 # four copies of Te4 to choose from to avoid L1 aliasing |
| 2174 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
| 2175 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); |
| 2176 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); |
| 2177 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); |
| 2178 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); |
| 2179 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); |
| 2180 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); |
| 2181 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); |
| 2182 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); |
| 2183 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); |
| 2184 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); |
| 2185 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); |
| 2186 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); |
| 2187 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); |
| 2188 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); |
| 2189 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); |
| 2190 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); |
| 2191 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); |
| 2192 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); |
| 2193 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); |
| 2194 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); |
| 2195 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); |
| 2196 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); |
| 2197 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); |
| 2198 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); |
| 2199 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); |
| 2200 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); |
| 2201 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); |
| 2202 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); |
| 2203 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); |
| 2204 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); |
| 2205 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); |
| 2206 |
| 2207 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
| 2208 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); |
| 2209 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); |
| 2210 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); |
| 2211 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); |
| 2212 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); |
| 2213 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); |
| 2214 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); |
| 2215 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); |
| 2216 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); |
| 2217 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); |
| 2218 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); |
| 2219 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); |
| 2220 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); |
| 2221 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); |
| 2222 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); |
| 2223 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); |
| 2224 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); |
| 2225 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); |
| 2226 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); |
| 2227 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); |
| 2228 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); |
| 2229 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); |
| 2230 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); |
| 2231 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); |
| 2232 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); |
| 2233 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); |
| 2234 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); |
| 2235 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); |
| 2236 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); |
| 2237 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); |
| 2238 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); |
| 2239 |
| 2240 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
| 2241 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); |
| 2242 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); |
| 2243 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); |
| 2244 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); |
| 2245 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); |
| 2246 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); |
| 2247 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); |
| 2248 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); |
| 2249 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); |
| 2250 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); |
| 2251 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); |
| 2252 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); |
| 2253 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); |
| 2254 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); |
| 2255 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); |
| 2256 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); |
| 2257 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); |
| 2258 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); |
| 2259 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); |
| 2260 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); |
| 2261 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); |
| 2262 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); |
| 2263 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); |
| 2264 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); |
| 2265 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); |
| 2266 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); |
| 2267 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); |
| 2268 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); |
| 2269 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); |
| 2270 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); |
| 2271 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); |
| 2272 |
| 2273 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
| 2274 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); |
| 2275 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); |
| 2276 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); |
| 2277 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); |
| 2278 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); |
| 2279 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); |
| 2280 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); |
| 2281 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); |
| 2282 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); |
| 2283 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); |
| 2284 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); |
| 2285 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); |
| 2286 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); |
| 2287 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); |
| 2288 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); |
| 2289 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); |
| 2290 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); |
| 2291 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); |
| 2292 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); |
| 2293 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); |
| 2294 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); |
| 2295 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); |
| 2296 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); |
| 2297 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); |
| 2298 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); |
| 2299 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); |
| 2300 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); |
| 2301 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); |
| 2302 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); |
| 2303 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); |
| 2304 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); |
1478 #rcon: | 2305 #rcon: |
1479 $code.=<<___; | 2306 $code.=<<___; |
1480 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 | 2307 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 |
1481 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 | 2308 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 |
1482 » .long» 0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0 | 2309 » .long» 0x0000001b, 0x00000036, 0x80808080, 0x80808080 |
| 2310 » .long» 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b |
1483 ___ | 2311 ___ |
1484 $code.=<<___; | 2312 $code.=<<___; |
1485 .globl AES_Td | |
1486 .align 64 | 2313 .align 64 |
1487 AES_Td: | 2314 .LAES_Td: |
1488 ___ | 2315 ___ |
1489 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); | 2316 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); |
1490 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); | 2317 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); |
1491 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5); | 2318 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5); |
1492 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5); | 2319 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5); |
1493 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d); | 2320 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d); |
1494 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b); | 2321 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b); |
1495 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295); | 2322 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295); |
1496 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e); | 2323 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e); |
1497 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927); | 2324 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927); |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1543 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); | 2370 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); |
1544 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); | 2371 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); |
1545 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); | 2372 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); |
1546 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); | 2373 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); |
1547 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); | 2374 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); |
1548 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); | 2375 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); |
1549 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); | 2376 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); |
1550 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | 2377 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); |
1551 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | 2378 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); |
1552 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | 2379 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); |
1553 #Td4: | 2380 |
| 2381 #Td4:» # four copies of Td4 to choose from to avoid L1 aliasing |
1554 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | 2382 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
1555 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | 2383 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
1556 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | 2384 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
1557 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | 2385 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); |
1558 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | 2386 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); |
1559 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | 2387 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); |
1560 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | 2388 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); |
1561 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | 2389 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); |
1562 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | 2390 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); |
1563 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | 2391 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); |
(...skipping 12 matching lines...) Expand all Loading... |
1576 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | 2404 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); |
1577 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | 2405 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); |
1578 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | 2406 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); |
1579 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | 2407 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); |
1580 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | 2408 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); |
1581 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | 2409 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); |
1582 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | 2410 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); |
1583 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | 2411 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
1584 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | 2412 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
1585 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | 2413 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
| 2414 $code.=<<___; |
| 2415 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe |
| 2416 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 |
| 2417 ___ |
| 2418 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 2419 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 2420 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| 2421 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); |
| 2422 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); |
| 2423 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); |
| 2424 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); |
| 2425 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); |
| 2426 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); |
| 2427 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); |
| 2428 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); |
| 2429 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); |
| 2430 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); |
| 2431 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); |
| 2432 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); |
| 2433 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); |
| 2434 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); |
| 2435 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); |
| 2436 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); |
| 2437 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); |
| 2438 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); |
| 2439 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); |
| 2440 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); |
| 2441 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); |
| 2442 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); |
| 2443 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); |
| 2444 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); |
| 2445 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); |
| 2446 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); |
| 2447 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
| 2448 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
| 2449 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
| 2450 $code.=<<___; |
| 2451 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe |
| 2452 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 |
| 2453 ___ |
| 2454 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 2455 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 2456 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| 2457 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); |
| 2458 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); |
| 2459 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); |
| 2460 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); |
| 2461 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); |
| 2462 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); |
| 2463 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); |
| 2464 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); |
| 2465 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); |
| 2466 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); |
| 2467 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); |
| 2468 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); |
| 2469 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); |
| 2470 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); |
| 2471 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); |
| 2472 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); |
| 2473 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); |
| 2474 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); |
| 2475 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); |
| 2476 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); |
| 2477 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); |
| 2478 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); |
| 2479 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); |
| 2480 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); |
| 2481 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); |
| 2482 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); |
| 2483 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
| 2484 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
| 2485 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
| 2486 $code.=<<___; |
| 2487 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe |
| 2488 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 |
| 2489 ___ |
| 2490 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 2491 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 2492 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| 2493 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); |
| 2494 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); |
| 2495 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); |
| 2496 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); |
| 2497 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); |
| 2498 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); |
| 2499 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); |
| 2500 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); |
| 2501 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); |
| 2502 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); |
| 2503 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); |
| 2504 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); |
| 2505 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); |
| 2506 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); |
| 2507 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); |
| 2508 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); |
| 2509 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); |
| 2510 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); |
| 2511 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); |
| 2512 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); |
| 2513 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); |
| 2514 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); |
| 2515 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); |
| 2516 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); |
| 2517 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); |
| 2518 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); |
| 2519 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
| 2520 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
| 2521 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
| 2522 $code.=<<___; |
| 2523 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe |
| 2524 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 |
| 2525 .asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 2526 .align 64 |
| 2527 ___ |
| 2528 |
# Win64-only section: structured-exception (SEH) unwind support for the
# routines generated by this script.  Nothing here is emitted unless $win64
# is set.
#
# Three handlers are generated; all of them finish through the shared
# .Lcommon_seh_exit tail, which is located at the end of cbc_se_handler:
#   block_se_handler - referenced by the AES_encrypt / AES_decrypt .xdata records
#   key_se_handler   - referenced by the AES_set_encrypt_key /
#                      AES_set_decrypt_key .xdata records
#   cbc_se_handler   - referenced by the AES_cbc_encrypt .xdata record
#
# block_se_handler and key_se_handler take their prologue/epilogue labels from
# HandlerData[] (two image-relative offsets stored right after the handler
# reference in .xdata, rebased with disp->ImageBase); cbc_se_handler compares
# context->Rip against the .Lcbc_* labels directly.
#
# Each handler stores the recovered Rsp, Rsi and Rdi into *context and, unless
# context->Rip is before the prologue label or at/after the epilogue label
# (for cbc: at/after .Lcbc_popfq), also Rbx, Rbp and R12-R15.  The common tail
# then copies *context to disp->ContextRecord, calls RtlVirtualUnwind and
# returns 1 (ExceptionContinueSearch).
#
# NOTE(review): the ".byte 9,0,0,0" that opens every .xdata record is
# presumably the UNWIND_INFO header marking a handler-only record - confirm
# against the Win64 unwind data documentation.
#
# Handler prototype; of the four register aliases set up below only $context
# and $disp are referenced by the handlers in this block:
| 2529 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
| 2530 # CONTEXT *context,DISPATCHER_CONTEXT *disp) |
| 2531 if ($win64) { |
| 2532 $rec="%rcx"; |
| 2533 $frame="%rdx"; |
| 2534 $context="%r8"; |
| 2535 $disp="%r9"; |
| 2536 |
| 2537 $code.=<<___; |
| 2538 .extern __imp_RtlVirtualUnwind |
| 2539 .type block_se_handler,\@abi-omnipotent |
| 2540 .align 16 |
| 2541 block_se_handler: |
| 2542 push %rsi |
| 2543 push %rdi |
| 2544 push %rbx |
| 2545 push %rbp |
| 2546 push %r12 |
| 2547 push %r13 |
| 2548 push %r14 |
| 2549 push %r15 |
| 2550 pushfq |
| 2551 sub \$64,%rsp |
| 2552 |
| 2553 mov 120($context),%rax # pull context->Rax |
| 2554 mov 248($context),%rbx # pull context->Rip |
| 2555 |
| 2556 mov 8($disp),%rsi # disp->ImageBase |
| 2557 mov 56($disp),%r11 # disp->HandlerData |
| 2558 |
| 2559 mov 0(%r11),%r10d # HandlerData[0] |
| 2560 lea (%rsi,%r10),%r10 # prologue label |
| 2561 cmp %r10,%rbx # context->Rip<prologue label |
| 2562 jb .Lin_block_prologue |
| 2563 |
| 2564 mov 152($context),%rax # pull context->Rsp |
| 2565 |
| 2566 mov 4(%r11),%r10d # HandlerData[1] |
| 2567 lea (%rsi,%r10),%r10 # epilogue label |
| 2568 cmp %r10,%rbx # context->Rip>=epilogue label |
| 2569 jae .Lin_block_prologue |
| 2570 |
| 2571 mov 24(%rax),%rax # pull saved real stack pointer |
| 2572 lea 48(%rax),%rax # adjust... |
| 2573 |
| 2574 mov -8(%rax),%rbx |
| 2575 mov -16(%rax),%rbp |
| 2576 mov -24(%rax),%r12 |
| 2577 mov -32(%rax),%r13 |
| 2578 mov -40(%rax),%r14 |
| 2579 mov -48(%rax),%r15 |
| 2580 mov %rbx,144($context) # restore context->Rbx |
| 2581 mov %rbp,160($context) # restore context->Rbp |
| 2582 mov %r12,216($context) # restore context->R12 |
| 2583 mov %r13,224($context) # restore context->R13 |
| 2584 mov %r14,232($context) # restore context->R14 |
| 2585 mov %r15,240($context) # restore context->R15 |
| 2586 |
| 2587 .Lin_block_prologue: |
| 2588 mov 8(%rax),%rdi |
| 2589 mov 16(%rax),%rsi |
| 2590 mov %rax,152($context) # restore context->Rsp |
| 2591 mov %rsi,168($context) # restore context->Rsi |
| 2592 mov %rdi,176($context) # restore context->Rdi |
| 2593 |
| 2594 jmp .Lcommon_seh_exit |
| 2595 .size block_se_handler,.-block_se_handler |
| 2596 |
| 2597 .type key_se_handler,\@abi-omnipotent |
| 2598 .align 16 |
| 2599 key_se_handler: |
| 2600 push %rsi |
| 2601 push %rdi |
| 2602 push %rbx |
| 2603 push %rbp |
| 2604 push %r12 |
| 2605 push %r13 |
| 2606 push %r14 |
| 2607 push %r15 |
| 2608 pushfq |
| 2609 sub \$64,%rsp |
| 2610 |
| 2611 mov 120($context),%rax # pull context->Rax |
| 2612 mov 248($context),%rbx # pull context->Rip |
| 2613 |
| 2614 mov 8($disp),%rsi # disp->ImageBase |
| 2615 mov 56($disp),%r11 # disp->HandlerData |
| 2616 |
| 2617 mov 0(%r11),%r10d # HandlerData[0] |
| 2618 lea (%rsi,%r10),%r10 # prologue label |
| 2619 cmp %r10,%rbx # context->Rip<prologue label |
| 2620 jb .Lin_key_prologue |
| 2621 |
| 2622 mov 152($context),%rax # pull context->Rsp |
| 2623 |
| 2624 mov 4(%r11),%r10d # HandlerData[1] |
| 2625 lea (%rsi,%r10),%r10 # epilogue label |
| 2626 cmp %r10,%rbx # context->Rip>=epilogue label |
| 2627 jae .Lin_key_prologue |
| 2628 |
| 2629 lea 56(%rax),%rax |
| 2630 |
| 2631 mov -8(%rax),%rbx |
| 2632 mov -16(%rax),%rbp |
| 2633 mov -24(%rax),%r12 |
| 2634 mov -32(%rax),%r13 |
| 2635 mov -40(%rax),%r14 |
| 2636 mov -48(%rax),%r15 |
| 2637 mov %rbx,144($context) # restore context->Rbx |
| 2638 mov %rbp,160($context) # restore context->Rbp |
| 2639 mov %r12,216($context) # restore context->R12 |
| 2640 mov %r13,224($context) # restore context->R13 |
| 2641 mov %r14,232($context) # restore context->R14 |
| 2642 mov %r15,240($context) # restore context->R15 |
| 2643 |
| 2644 .Lin_key_prologue: |
| 2645 mov 8(%rax),%rdi |
| 2646 mov 16(%rax),%rsi |
| 2647 mov %rax,152($context) # restore context->Rsp |
| 2648 mov %rsi,168($context) # restore context->Rsi |
| 2649 mov %rdi,176($context) # restore context->Rdi |
| 2650 |
| 2651 jmp .Lcommon_seh_exit |
| 2652 .size key_se_handler,.-key_se_handler |
| 2653 |
| 2654 .type cbc_se_handler,\@abi-omnipotent |
| 2655 .align 16 |
| 2656 cbc_se_handler: |
| 2657 push %rsi |
| 2658 push %rdi |
| 2659 push %rbx |
| 2660 push %rbp |
| 2661 push %r12 |
| 2662 push %r13 |
| 2663 push %r14 |
| 2664 push %r15 |
| 2665 pushfq |
| 2666 sub \$64,%rsp |
| 2667 |
| 2668 mov 120($context),%rax # pull context->Rax |
| 2669 mov 248($context),%rbx # pull context->Rip |
| 2670 |
| 2671 lea .Lcbc_prologue(%rip),%r10 |
| 2672 cmp %r10,%rbx # context->Rip<.Lcbc_prologue |
| 2673 jb .Lin_cbc_prologue |
| 2674 |
| 2675 lea .Lcbc_fast_body(%rip),%r10 |
| 2676 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body |
| 2677 jb .Lin_cbc_frame_setup |
| 2678 |
| 2679 lea .Lcbc_slow_prologue(%rip),%r10 |
| 2680 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue |
| 2681 jb .Lin_cbc_body |
| 2682 |
| 2683 lea .Lcbc_slow_body(%rip),%r10 |
| 2684 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body |
| 2685 jb .Lin_cbc_frame_setup |
| 2686 |
| 2687 .Lin_cbc_body: |
| 2688 mov 152($context),%rax # pull context->Rsp |
| 2689 |
| 2690 lea .Lcbc_epilogue(%rip),%r10 |
| 2691 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue |
| 2692 jae .Lin_cbc_prologue |
| 2693 |
| 2694 lea 8(%rax),%rax |
| 2695 |
| 2696 lea .Lcbc_popfq(%rip),%r10 |
| 2697 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq |
| 2698 jae .Lin_cbc_prologue |
| 2699 |
| 2700 mov `16-8`(%rax),%rax # biased $_rsp |
| 2701 lea 56(%rax),%rax |
| 2702 |
| 2703 .Lin_cbc_frame_setup: |
| 2704 mov -16(%rax),%rbx |
| 2705 mov -24(%rax),%rbp |
| 2706 mov -32(%rax),%r12 |
| 2707 mov -40(%rax),%r13 |
| 2708 mov -48(%rax),%r14 |
| 2709 mov -56(%rax),%r15 |
| 2710 mov %rbx,144($context) # restore context->Rbx |
| 2711 mov %rbp,160($context) # restore context->Rbp |
| 2712 mov %r12,216($context) # restore context->R12 |
| 2713 mov %r13,224($context) # restore context->R13 |
| 2714 mov %r14,232($context) # restore context->R14 |
| 2715 mov %r15,240($context) # restore context->R15 |
| 2716 |
| 2717 .Lin_cbc_prologue: |
| 2718 mov 8(%rax),%rdi |
| 2719 mov 16(%rax),%rsi |
| 2720 mov %rax,152($context) # restore context->Rsp |
| 2721 mov %rsi,168($context) # restore context->Rsi |
| 2722 mov %rdi,176($context) # restore context->Rdi |
| 2723 |
| 2724 .Lcommon_seh_exit: |
| 2725 |
| 2726 mov 40($disp),%rdi # disp->ContextRecord |
| 2727 mov $context,%rsi # context |
| 2728 mov \$`1232/8`,%ecx # sizeof(CONTEXT) |
| 2729 .long 0xa548f3fc # cld; rep movsq |
| 2730 |
| 2731 mov $disp,%rsi |
| 2732 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER |
| 2733 mov 8(%rsi),%rdx # arg2, disp->ImageBase |
| 2734 mov 0(%rsi),%r8 # arg3, disp->ControlPc |
| 2735 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry |
| 2736 mov 40(%rsi),%r10 # disp->ContextRecord |
| 2737 lea 56(%rsi),%r11 # &disp->HandlerData |
| 2738 lea 24(%rsi),%r12 # &disp->EstablisherFrame |
| 2739 mov %r10,32(%rsp) # arg5 |
| 2740 mov %r11,40(%rsp) # arg6 |
| 2741 mov %r12,48(%rsp) # arg7 |
| 2742 mov %rcx,56(%rsp) # arg8, (NULL) |
| 2743 call *__imp_RtlVirtualUnwind(%rip) |
| 2744 |
| 2745 mov \$1,%eax # ExceptionContinueSearch |
| 2746 add \$64,%rsp |
| 2747 popfq |
| 2748 pop %r15 |
| 2749 pop %r14 |
| 2750 pop %r13 |
| 2751 pop %r12 |
| 2752 pop %rbp |
| 2753 pop %rbx |
| 2754 pop %rdi |
| 2755 pop %rsi |
| 2756 ret |
| 2757 .size cbc_se_handler,.-cbc_se_handler |
| 2758 |
| 2759 .section .pdata |
| 2760 .align 4 |
| 2761 .rva .LSEH_begin_AES_encrypt |
| 2762 .rva .LSEH_end_AES_encrypt |
| 2763 .rva .LSEH_info_AES_encrypt |
| 2764 |
| 2765 .rva .LSEH_begin_AES_decrypt |
| 2766 .rva .LSEH_end_AES_decrypt |
| 2767 .rva .LSEH_info_AES_decrypt |
| 2768 |
| 2769 .rva .LSEH_begin_AES_set_encrypt_key |
| 2770 .rva .LSEH_end_AES_set_encrypt_key |
| 2771 .rva .LSEH_info_AES_set_encrypt_key |
| 2772 |
| 2773 .rva .LSEH_begin_AES_set_decrypt_key |
| 2774 .rva .LSEH_end_AES_set_decrypt_key |
| 2775 .rva .LSEH_info_AES_set_decrypt_key |
| 2776 |
| 2777 .rva .LSEH_begin_AES_cbc_encrypt |
| 2778 .rva .LSEH_end_AES_cbc_encrypt |
| 2779 .rva .LSEH_info_AES_cbc_encrypt |
| 2780 |
| 2781 .section .xdata |
| 2782 .align 8 |
| 2783 .LSEH_info_AES_encrypt: |
| 2784 .byte 9,0,0,0 |
| 2785 .rva block_se_handler |
| 2786 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] |
| 2787 .LSEH_info_AES_decrypt: |
| 2788 .byte 9,0,0,0 |
| 2789 .rva block_se_handler |
| 2790 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] |
| 2791 .LSEH_info_AES_set_encrypt_key: |
| 2792 .byte 9,0,0,0 |
| 2793 .rva key_se_handler |
| 2794 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] |
| 2795 .LSEH_info_AES_set_decrypt_key: |
| 2796 .byte 9,0,0,0 |
| 2797 .rva key_se_handler |
| 2798 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] |
| 2799 .LSEH_info_AES_cbc_encrypt: |
| 2800 .byte 9,0,0,0 |
| 2801 .rva cbc_se_handler |
| 2802 ___ |
| 2803 } |
1586 | 2804 |
# Post-process the accumulated assembly text: every backtick-quoted span in
# $code is taken as a Perl expression, evaluated (/e) and replaced by its
# result, for all occurrences (/g).
# NOTE(review): this is a string eval; confirm that $code is only ever built
# from this script's own text.
1587 $code =~ s/\`([^\`]*)\`/eval($1)/gem; | 2805 $code =~ s/\`([^\`]*)\`/eval($1)/gem; |
1588 | 2806 |
# Write the finished assembly to STDOUT.
1589 print $code; | 2807 print $code; |
1590 | 2808 |
# Close STDOUT explicitly.
# NOTE(review): the result of close is not checked, so a failure to flush the
# output would go unreported.
1591 close STDOUT; | 2809 close STDOUT; |
OLD | NEW |