Index: openssl/crypto/sha/asm/sha1-ppc.pl
===================================================================
--- openssl/crypto/sha/asm/sha1-ppc.pl (revision 0)
+++ openssl/crypto/sha/asm/sha1-ppc.pl (revision 0)
@@ -0,0 +1,319 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# I let the hardware handle unaligned input(*), except on page boundaries
+# (see below for details). Otherwise it is a straightforward implementation
+# with the X vector kept in the register bank. The module is big-endian
+# [which is not a big deal, as there are no little-endian targets left
+# around].
+#
+# (*) this presumably means that this module is inappropriate for
+# PPC403; does anybody know whether pre-POWER3 can sustain unaligned
+# loads?
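+#
+# (For illustration only -- a sketch of ours, not part of the module: on
+#  this big-endian target a plain lwz from the input stream yields the same
+#  32-bit value a portable implementation would obtain from a network-order
+#  unpack, e.g. the hypothetical one-liner
+#	sub be32_ref { my ($buf,$off)=@_; unpack("N",substr($buf,$off,4)); }
+#  which is why no byte-swapping is emitted anywhere below.)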
+
+#			-m64	-m32
+# ----------------------------------
+# PPC970,gcc-4.0.0	+76%	+59%
+# Power6,xlc-7		+68%	+33%
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $UCMP ="cmpld";
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $UCMP ="cmplw";
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
+
+$FRAME=24*$SIZE_T;
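+# 24 pointer-sized slots: room for the saved link register, r15-r31 and a
+# spill slot for $inp used on the cross-page path; the $STU below allocates
+# an extra 64 bytes beyond $FRAME as an aligned copy buffer for input
+# blocks that straddle a page boundary (see Lcross_page).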
+
+$K ="r0";
+$sp ="r1";
+$toc="r2";
+$ctx="r3";
+$inp="r4";
+$num="r5";
+$t0 ="r15";
+$t1 ="r6";
+
+$A ="r7";
+$B ="r8";
+$C ="r9";
+$D ="r10";
+$E ="r11";
+$T ="r12";
+
+@V=($A,$B,$C,$D,$E,$T);
+@X=("r16","r17","r18","r19","r20","r21","r22","r23",
+ "r24","r25","r26","r27","r28","r29","r30","r31");
+
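+# For reference only -- a hypothetical Perl model (the helper name is ours
+# and the sub is never called by the generator) of the boolean functions
+# the BODY_* subroutines below encode as and/andc/or (Ch, rounds 0..19),
+# xor/xor (parity, rounds 20..39 and 60..79) and and/or/and (Maj, rounds
+# 40..59) instruction sequences:
+sub sha1_ftn_ref {
+    my ($round,$b,$c,$d) = @_;
+    return (($b & $c) | (~$b & $d)) & 0xffffffff if ($round < 20); # Ch
+    return  $b ^ $c ^ $d if ($round < 40 || $round >= 60);         # parity
+    return ($b & $c) | (($b | $c) & $d);                           # Maj
+}
+
+# BODY_00_19 also interleaves the initial lwz loads of the 16 input words
+# and, from round 15 onwards, the in-place schedule update
+#	X[j%16] = rotl(X[j%16] ^ X[(j+2)%16] ^ X[(j+8)%16] ^ X[(j+13)%16], 1)
+# where rotl denotes a 32-bit left rotation.
+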
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+ lwz @X[$i],`$i*4`($inp)
+___
+$code.=<<___ if ($i<15);
+ lwz @X[$j],`$j*4`($inp)
+ add $f,$K,$e
+ rotlwi $e,$a,5
+ add $f,$f,@X[$i]
+ and $t0,$c,$b
+ add $f,$f,$e
+ andc $t1,$d,$b
+ rotlwi $b,$b,30
+ or $t0,$t0,$t1
+ add $f,$f,$t0
+___
+$code.=<<___ if ($i>=15);
+ add $f,$K,$e
+ rotlwi $e,$a,5
+ xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
+ add $f,$f,@X[$i%16]
+ and $t0,$c,$b
+ xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
+ add $f,$f,$e
+ andc $t1,$d,$b
+ rotlwi $b,$b,30
+ or $t0,$t0,$t1
+ xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
+ add $f,$f,$t0
+ rotlwi @X[$j%16],@X[$j%16],1
+___
+}
+
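+# BODY_20_39 covers the "parity" rounds, F(b,c,d) = b xor c xor d; the same
+# body is reused for rounds 60..79.  Its $i==79 variant skips the no-longer-
+# needed schedule update and instead pre-loads the five chaining words from
+# $ctx into r16..r20, ready for the final additions.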
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+ add $f,$K,$e
+ rotlwi $e,$a,5
+ xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
+ add $f,$f,@X[$i%16]
+ xor $t0,$b,$c
+ xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
+ add $f,$f,$e
+ rotlwi $b,$b,30
+ xor $t0,$t0,$d
+ xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
+ add $f,$f,$t0
+ rotlwi @X[$j%16],@X[$j%16],1
+___
+$code.=<<___ if ($i==79);
+ add $f,$K,$e
+ rotlwi $e,$a,5
+ lwz r16,0($ctx)
+ add $f,$f,@X[$i%16]
+ xor $t0,$b,$c
+ lwz r17,4($ctx)
+ add $f,$f,$e
+ rotlwi $b,$b,30
+ lwz r18,8($ctx)
+ xor $t0,$t0,$d
+ lwz r19,12($ctx)
+ add $f,$f,$t0
+ lwz r20,16($ctx)
+___
+}
+
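+# BODY_40_59 implements the majority rounds: Maj(b,c,d) = (b&c)|(b&d)|(c&d),
+# computed here as (b&c)|((b|c)&d) to save one instruction.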
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my $j=$i+1;
+$code.=<<___;
+ add $f,$K,$e
+ rotlwi $e,$a,5
+ xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
+ add $f,$f,@X[$i%16]
+ and $t0,$b,$c
+ xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
+ add $f,$f,$e
+ or $t1,$b,$c
+ rotlwi $b,$b,30
+ xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
+ and $t1,$t1,$d
+ or $t0,$t0,$t1
+ rotlwi @X[$j%16],@X[$j%16],1
+ add $f,$f,$t0
+___
+}
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl .sha1_block_data_order
+.align 4
+.sha1_block_data_order:
+ mflr r0
+ $STU $sp,`-($FRAME+64)`($sp)
+ $PUSH r0,`$FRAME-$SIZE_T*18`($sp)
+ $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
+ $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
+ $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
+ $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
+ $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
+ $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
+ $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
+ $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
+ $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
+ $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
+ $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
+ $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ lwz $A,0($ctx)
+ lwz $B,4($ctx)
+ lwz $C,8($ctx)
+ lwz $D,12($ctx)
+ lwz $E,16($ctx)
+ andi. r0,$inp,3
+ bne Lunaligned
+Laligned:
+ mtctr $num
+ bl Lsha1_block_private
+Ldone:
+ $POP r0,`$FRAME-$SIZE_T*18`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,`$FRAME+64`
+ blr
+___
+
+# The PowerPC specification allows an implementation to be ill-behaved
+# upon an unaligned access that crosses a page boundary. The "better
+# safe than sorry" principle makes me treat such accesses specially.
+# However, I don't look for the particular offending word, but rather
+# for the 64-byte input block that crosses the boundary. Once found,
+# that block is copied to an aligned buffer and hashed separately...
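+# For reference, a hypothetical Perl model of that arithmetic (the helper
+# name is ours and the sub is never called by the generator); the
+# subfic/andi./srwi. triplet in Lunaligned computes the same quantity:
+sub blocks_to_page_ref {
+    my ($inp) = @_;                     # input pointer taken as an integer
+    my $dist = (4096 - $inp) & 4095;    # bytes to the next 4KB page boundary
+    return $dist >> 6;                  # ...expressed in whole 64-byte blocks
+}
+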
+$code.=<<___;
+.align 4
+Lunaligned:
+ subfic $t1,$inp,4096
+ andi. $t1,$t1,4095 ; distance to the next page boundary
+ srwi. $t1,$t1,6 ; t1/=64
+ beq Lcross_page
+ $UCMP $num,$t1
+ ble- Laligned ; didn't cross the page boundary
+ mtctr $t1
+ subfc $num,$t1,$num
+ bl Lsha1_block_private
+Lcross_page:
+ li $t1,16
+ mtctr $t1
+ addi r20,$sp,$FRAME ; spot below the frame
+Lmemcpy:
+ lbz r16,0($inp)
+ lbz r17,1($inp)
+ lbz r18,2($inp)
+ lbz r19,3($inp)
+ addi $inp,$inp,4
+ stb r16,0(r20)
+ stb r17,1(r20)
+ stb r18,2(r20)
+ stb r19,3(r20)
+ addi r20,r20,4
+ bdnz Lmemcpy
+
+ $PUSH $inp,`$FRAME-$SIZE_T*19`($sp)
+ li $t1,1
+ addi $inp,$sp,$FRAME
+ mtctr $t1
+ bl Lsha1_block_private
+ $POP $inp,`$FRAME-$SIZE_T*19`($sp)
+ addic. $num,$num,-1
+ bne- Lunaligned
+ b Ldone
+___
+
+# This is the private block function, which uses a tailored calling
+# interface: upon entry the SHA_CTX words are pre-loaded into the
+# designated registers and the counter register holds the number of
+# 64-byte chunks to digest...
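+# Concretely: $A..$E (r7-r11) carry the five chaining words, CTR the block
+# count, and $inp points at word-aligned input; on return the updated
+# chaining values have been stored back to $ctx and are also left in
+# $A..$E, with $inp advanced by 64 bytes per block processed.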
+$code.=<<___;
+.align 4
+Lsha1_block_private:
+___
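+# The four for() loops below emit all 80 rounds fully unrolled.  @V starts
+# out as ($A,$B,$C,$D,$E,$T); after each round unshift(@V,pop(@V)) rotates
+# the assignment, so the register that has just received the freshly
+# computed value becomes the next round's $a, while the outgoing $e
+# register is recycled as the next round's destination ($f).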
+$code.=<<___; # load K_00_19
+ lis $K,0x5a82
+ ori $K,$K,0x7999
+___
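+# (The lis/ori pair above and the three below just split each 32-bit round
+#  constant into its 16-bit halves, e.g. 0x5a827999 == (0x5a82<<16)|0x7999,
+#  and likewise for 0x6ed9eba1, 0x8f1bbcdc and 0xca62c1d6.)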
+for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___; # load K_20_39
+ lis $K,0x6ed9
+ ori $K,$K,0xeba1
+___
+for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___; # load K_40_59
+ lis $K,0x8f1b
+ ori $K,$K,0xbcdc
+___
+for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___; # load K_60_79
+ lis $K,0xca62
+ ori $K,$K,0xc1d6
+___
+for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ add r16,r16,$E
+ add r17,r17,$T
+ add r18,r18,$A
+ add r19,r19,$B
+ add r20,r20,$C
+ stw r16,0($ctx)
+ mr $A,r16
+ stw r17,4($ctx)
+ mr $B,r17
+ stw r18,8($ctx)
+ mr $C,r18
+ stw r19,12($ctx)
+ mr $D,r19
+ stw r20,16($ctx)
+ mr $E,r20
+ addi $inp,$inp,`16*4`
+ bdnz- Lsha1_block_private
+ blr
+___
+$code.=<<___;
+.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;