| Index: openssl/crypto/md5/asm/md5-x86_64.pl
|
| ===================================================================
|
| --- openssl/crypto/md5/asm/md5-x86_64.pl (revision 105093)
|
| +++ openssl/crypto/md5/asm/md5-x86_64.pl (working copy)
|
| @@ -15,11 +15,10 @@
|
| # dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
|
| # %r10d = X[k_next]
|
| # %r11d = z' (copy of z for the next step)
|
| -# Each round1_step() takes about 5.71 clocks (9 instructions, 1.58 IPC)
|
| +# Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC)
|
| sub round1_step
|
| {
|
| my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
|
| - $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
|
| $code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1);
|
| $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
|
| $code .= <<EOF;
|
| @@ -38,23 +37,26 @@
|
| # round2_step() does:
|
| # dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
|
| # %r10d = X[k_next]
|
| -# %r11d = y' (copy of y for the next step)
|
| -# Each round2_step() takes about 6.22 clocks (9 instructions, 1.45 IPC)
|
| +# %r11d = z' (copy of z for the next step; complemented in-step to form y & ~z)
|
| +# %r12d = z' (second copy of z for the next step; masked in-step to form x & z)
|
| +# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
|
| sub round2_step
|
| {
|
| my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
|
| - $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
|
| $code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1);
|
| - $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
|
| + $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
|
| + $code .= " mov %edx, %r12d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
|
| $code .= <<EOF;
|
| - xor $x, %r11d /* x ^ ... */
|
| + not %r11d /* not z */
|
| lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
|
| - and $z, %r11d /* z & ... */
|
| - xor $y, %r11d /* y ^ ... */
|
| + and $x, %r12d /* x & z */
|
| + and $y, %r11d /* y & (not z) */
|
| mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
|
| - add %r11d, $dst /* dst += ... */
|
| + or %r11d, %r12d /* (y & (not z)) | (x & z) */
|
| + mov $y, %r11d /* (NEXT STEP) z' = $y */
|
| + add %r12d, $dst /* dst += ... */
|
| + mov $y, %r12d /* (NEXT STEP) z' = $y */
|
| rol \$$s, $dst /* dst <<< s */
|
| - mov $x, %r11d /* (NEXT STEP) y' = $x */
|
| add $x, $dst /* dst += x */
|
| EOF
|
| }
|
| @@ -63,11 +65,10 @@
|
| # dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
|
| # %r10d = X[k_next]
|
| # %r11d = y' (copy of y for the next step)
|
| -# Each round3_step() takes about 4.26 clocks (8 instructions, 1.88 IPC)
|
| +# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
|
| sub round3_step
|
| {
|
| my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
|
| - $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
|
| $code .= " mov 5*4(%rsi), %r10d /* (NEXT STEP) X[5] */\n" if ($pos == -1);
|
| $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
|
| $code .= <<EOF;
|
| @@ -86,11 +87,10 @@
|
| # dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
|
| # %r10d = X[k_next]
|
| # %r11d = not z' (copy of not z for the next step)
|
| -# Each round4_step() takes about 5.27 clocks (9 instructions, 1.71 IPC)
|
| +# Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC)
|
| sub round4_step
|
| {
|
| my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
|
| - $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
|
| $code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1);
|
| $code .= " mov \$0xffffffff, %r11d\n" if ($pos == -1);
|
| $code .= " xor %edx, %r11d /* (NEXT STEP) not z' = not %edx*/\n"
|
| @@ -108,9 +108,20 @@
|
| EOF
|
| }
|
|
|
| -my $output = shift;
|
| -open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
|
| +no warnings qw(uninitialized); # $flavour/$output are optional and may be undef below
|
| +my $flavour = shift; # optional flavour argument, passed on to x86_64-xlate.pl
|
| +my $output = shift; # optional output file name
|
| +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a lone argument containing '.' is the output file
|
|
|
| +my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
| +# Locate x86_64-xlate.pl next to this script, else under ../../perlasm/ relative to it.
|
| +$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
|
| +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
| +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
| +die "can't locate x86_64-xlate.pl";
|
| +# Pipe everything printed on STDOUT through the translator; stop if the pipe cannot be opened.
|
| +open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
|
| +
|
| $code .= <<EOF;
|
| .text
|
| .align 16
|
| @@ -120,8 +131,10 @@
|
| md5_block_asm_data_order:
|
| push %rbp
|
| push %rbx
|
| + push %r12
|
| push %r14
|
| push %r15
|
| +.Lprologue:
|
|
|
| # rdi = arg #1 (ctx, MD5_CTX pointer)
|
| # rsi = arg #2 (ptr, data pointer)
|
| @@ -236,14 +249,121 @@
|
| mov %ecx, 2*4(%rbp) # ctx->C = C
|
| mov %edx, 3*4(%rbp) # ctx->D = D
|
|
|
| + mov (%rsp),%r15
|
| + mov 8(%rsp),%r14
|
| + mov 16(%rsp),%r12
|
| + mov 24(%rsp),%rbx
|
| + mov 32(%rsp),%rbp
|
| + add \$40,%rsp
|
| +.Lepilogue:
|
| + ret
|
| +.size md5_block_asm_data_order,.-md5_block_asm_data_order
|
| +EOF
|
| +
|
| +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
| +# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
| +if ($win64) {
|
| +my $rec="%rcx";
|
| +my $frame="%rdx";
|
| +my $context="%r8";
|
| +my $disp="%r9";
|
| +
|
| +$code.=<<___;
|
| +.extern __imp_RtlVirtualUnwind
|
| +.type se_handler,\@abi-omnipotent
|
| +.align 16
|
| +se_handler:
|
| + push %rsi
|
| + push %rdi
|
| + push %rbx
|
| + push %rbp
|
| + push %r12
|
| + push %r13
|
| + push %r14
|
| + push %r15
|
| + pushfq
|
| + sub \$64,%rsp
|
| +
|
| + mov 120($context),%rax # pull context->Rax
|
| + mov 248($context),%rbx # pull context->Rip
|
| +
|
| + lea .Lprologue(%rip),%r10
|
| + cmp %r10,%rbx # context->Rip<.Lprologue
|
| + jb .Lin_prologue
|
| +
|
| + mov 152($context),%rax # pull context->Rsp
|
| +
|
| + lea .Lepilogue(%rip),%r10
|
| + cmp %r10,%rbx # context->Rip>=.Lepilogue
|
| + jae .Lin_prologue
|
| +
|
| + lea 40(%rax),%rax
|
| +
|
| + mov -8(%rax),%rbp
|
| + mov -16(%rax),%rbx
|
| + mov -24(%rax),%r12
|
| + mov -32(%rax),%r14
|
| + mov -40(%rax),%r15
|
| + mov %rbx,144($context) # restore context->Rbx
|
| + mov %rbp,160($context) # restore context->Rbp
|
| + mov %r12,216($context) # restore context->R12
|
| + mov %r14,232($context) # restore context->R14
|
| + mov %r15,240($context) # restore context->R15
|
| +
|
| +.Lin_prologue:
|
| + mov 8(%rax),%rdi
|
| + mov 16(%rax),%rsi
|
| + mov %rax,152($context) # restore context->Rsp
|
| + mov %rsi,168($context) # restore context->Rsi
|
| + mov %rdi,176($context) # restore context->Rdi
|
| +
|
| + mov 40($disp),%rdi # disp->ContextRecord
|
| + mov $context,%rsi # context
|
| + mov \$154,%ecx # sizeof(CONTEXT)
|
| + .long 0xa548f3fc # cld; rep movsq
|
| +
|
| + mov $disp,%rsi
|
| + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
| + mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
| + mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
| + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
| + mov 40(%rsi),%r10 # disp->ContextRecord
|
| + lea 56(%rsi),%r11 # &disp->HandlerData
|
| + lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
| + mov %r10,32(%rsp) # arg5
|
| + mov %r11,40(%rsp) # arg6
|
| + mov %r12,48(%rsp) # arg7
|
| + mov %rcx,56(%rsp) # arg8, (NULL)
|
| + call *__imp_RtlVirtualUnwind(%rip)
|
| +
|
| + mov \$1,%eax # ExceptionContinueSearch
|
| + add \$64,%rsp
|
| + popfq
|
| pop %r15
|
| pop %r14
|
| + pop %r13
|
| + pop %r12
|
| + pop %rbp
|
| pop %rbx
|
| - pop %rbp
|
| + pop %rdi
|
| + pop %rsi
|
| ret
|
| -.size md5_block_asm_data_order,.-md5_block_asm_data_order
|
| -EOF
|
| +.size se_handler,.-se_handler
|
|
|
| +.section .pdata
|
| +.align 4
|
| + .rva .LSEH_begin_md5_block_asm_data_order
|
| + .rva .LSEH_end_md5_block_asm_data_order
|
| + .rva .LSEH_info_md5_block_asm_data_order
|
| +
|
| +.section .xdata
|
| +.align 8
|
| +.LSEH_info_md5_block_asm_data_order:
|
| + .byte 9,0,0,0
|
| + .rva se_handler
|
| +___
|
| +}
|
| +
|
| print $code;
|
|
|
| close STDOUT;
|
|
|