OLD | NEW |
| (Empty) |
1 #!/usr/bin/env perl | |
2 | |
3 # ==================================================================== | |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 # project. The module is, however, dual licensed under OpenSSL and | |
6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 # details see http://www.openssl.org/~appro/cryptogams/. | |
8 # ==================================================================== | |
9 | |
10 # I let hardware handle unaligned input(*), except on page boundaries | |
11 # (see below for details). Otherwise straightforward implementation | |
12 # with X vector in register bank. The module is big-endian [which is | |
13 # not big deal as there're no little-endian targets left around]. | |
14 # | |
15 # (*) this means that this module is inappropriate for PPC403? Does | |
16 # anybody know if pre-POWER3 can sustain unaligned load? | |
17 | |
18 # -m64 -m32 | |
19 # ---------------------------------- | |
20 # PPC970,gcc-4.0.0 +76% +59% | |
21 # Power6,xlc-7 +68% +33% | |
22 | |
# First command-line argument selects the target ABI flavour (anything
# containing "64" or "32"; "64" wins if both appear).
$flavour = shift;

# Per-width instruction mnemonics and sizes used by the generated code:
#   SIZE_T - size in bytes of a saved register slot / pointer
#   UCMP   - unsigned compare of pointer-sized values
#   STU    - store-with-update, used to allocate the stack frame
#   POP    - load  of a pointer-sized value
#   PUSH   - store of a pointer-sized value
my %abi_for = (
    64 => { SIZE_T => 8, UCMP => "cmpld", STU => "stdu", POP => "ld",  PUSH => "std" },
    32 => { SIZE_T => 4, UCMP => "cmplw", STU => "stwu", POP => "lwz", PUSH => "stw" },
);

my ($width) = grep { $flavour =~ /$_/ } (64, 32);
defined $width or die "nonsense $flavour";

($SIZE_T, $UCMP, $STU, $POP, $PUSH) =
    @{ $abi_for{$width} }{qw(SIZE_T UCMP STU POP PUSH)};

# Offset of the link-register save slot relative to the caller's frame:
# two slots up on 64-bit, one slot up on 32-bit.
$LRSAVE = ($width == 64 ? 2 : 1) * $SIZE_T;
40 | |
# Locate the ppc-xlate.pl translator: first in the directory this script
# lives in, then in ../../perlasm relative to it.  If $0 carries no
# directory component the search is relative to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Second command-line argument (optional) is handed to the translator
# verbatim as its output argument.
my $output = shift;
$output = "" unless defined $output;

# Everything printed to STDOUT from here on is piped through the
# translator.  Note the low-precedence `or`: with `||` the die would bind
# to the (always true) command string and a failed open would go
# unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
47 | |
# Stack frame geometry.  The generated code stores r15-r31 (and a spilled
# copy of the input pointer) in the top 18 SIZE_T slots of the frame and
# uses 64 bytes starting at $sp+$LOCALS as a bounce buffer for a
# page-crossing input block.
$LOCALS = 6*$SIZE_T;
$FRAME  = 24*$SIZE_T+64;

# Fixed register roles.
($K, $sp, $toc, $ctx, $inp, $num) = map { "r$_" } 0 .. 5;
$t1 = "r6";     # scratch
$t0 = "r15";    # scratch

# Working variables; @V is rotated by one position after every round so
# that each round body always sees them as (a,b,c,d,e,f).
@V = map { "r$_" } 7 .. 12;
($A, $B, $C, $D, $E, $T) = @V;

# 16-word message schedule window held entirely in registers.
@X = map { "r$_" } 16 .. 31;
70 | |
# Append the code for one of rounds 0..19 to $code.
#
# Arguments: round number, followed by the six registers currently playing
# the roles a, b, c, d, e and f (f receives the new working value).
# Round function: (b AND c) OR (d AND NOT b).
#
# Rounds 0..14 also load the next input word from $inp; round 0
# additionally loads word 0.  Rounds 15..19 instead compute the next
# message-schedule word in place inside the 16-register @X window.
sub BODY_00_19 {
    my ($round, $ra, $rb, $rc, $rd, $re, $rf) = @_;
    my $next = $round + 1;

    if ($round == 0) {
        $code .= <<___;
lwz $X[$round],`$round*4`($inp)
___
    }

    if ($round < 15) {
        $code .= <<___;
lwz $X[$next],`$next*4`($inp)
add $rf,$K,$re
rotlwi $re,$ra,5
add $rf,$rf,$X[$round]
and $t0,$rc,$rb
add $rf,$rf,$re
andc $t1,$rd,$rb
rotlwi $rb,$rb,30
or $t0,$t0,$t1
add $rf,$rf,$t0
___
    }
    else {
        # Registers of the schedule window: current word, the word being
        # produced for the next round, and the three words mixed into it.
        my ($x_cur, $x_new, $x_2, $x_8, $x_13) =
            @X[ map { $_ % 16 } $round, $next, $next + 2, $next + 8, $next + 13 ];

        $code .= <<___;
add $rf,$K,$re
rotlwi $re,$ra,5
xor $x_new,$x_new,$x_2
add $rf,$rf,$x_cur
and $t0,$rc,$rb
xor $x_new,$x_new,$x_8
add $rf,$rf,$re
andc $t1,$rd,$rb
rotlwi $rb,$rb,30
or $t0,$t0,$t1
xor $x_new,$x_new,$x_13
add $rf,$rf,$t0
rotlwi $x_new,$x_new,1
___
    }
}
105 | |
# Append the code for one parity round to $code; the generator uses this
# emitter for rounds 20..39 and again for rounds 60..79.
#
# Arguments: round number, followed by the six registers currently playing
# the roles a, b, c, d, e and f (f receives the new working value).
# Round function: b XOR c XOR d.
#
# Every round before 79 also computes the next message-schedule word in
# place.  Round 79 has no successor, so the free slots are used to reload
# the five hash state words from $ctx into r16..r20 for the final
# accumulation.  Round numbers above 79 emit nothing.
sub BODY_20_39 {
    my ($round, $ra, $rb, $rc, $rd, $re, $rf) = @_;
    my $next  = $round + 1;
    my $x_cur = $X[$round % 16];

    if ($round < 79) {
        my ($x_new, $x_2, $x_8, $x_13) =
            @X[ map { $_ % 16 } $next, $next + 2, $next + 8, $next + 13 ];

        $code .= <<___;
add $rf,$K,$re
rotlwi $re,$ra,5
xor $x_new,$x_new,$x_2
add $rf,$rf,$x_cur
xor $t0,$rb,$rc
xor $x_new,$x_new,$x_8
add $rf,$rf,$re
rotlwi $rb,$rb,30
xor $t0,$t0,$rd
xor $x_new,$x_new,$x_13
add $rf,$rf,$t0
rotlwi $x_new,$x_new,1
___
    }
    elsif ($round == 79) {
        $code .= <<___;
add $rf,$K,$re
rotlwi $re,$ra,5
lwz r16,0($ctx)
add $rf,$rf,$x_cur
xor $t0,$rb,$rc
lwz r17,4($ctx)
add $rf,$rf,$re
rotlwi $rb,$rb,30
lwz r18,8($ctx)
xor $t0,$t0,$rd
lwz r19,12($ctx)
add $rf,$rf,$t0
lwz r20,16($ctx)
___
    }
}
139 | |
# Append the code for one of rounds 40..59 to $code.
#
# Arguments: round number, followed by the six registers currently playing
# the roles a, b, c, d, e and f (f receives the new working value).
# Round function (majority): (b AND c) OR ((b OR c) AND d).
# The next message-schedule word is computed in place alongside.
sub BODY_40_59 {
    my ($round, $ra, $rb, $rc, $rd, $re, $rf) = @_;
    my $next = $round + 1;

    # Registers of the schedule window: current word, the word being
    # produced for the next round, and the three words mixed into it.
    my ($x_cur, $x_new, $x_2, $x_8, $x_13) =
        @X[ map { $_ % 16 } $round, $next, $next + 2, $next + 8, $next + 13 ];

    $code .= <<___;
add $rf,$K,$re
rotlwi $re,$ra,5
xor $x_new,$x_new,$x_2
add $rf,$rf,$x_cur
and $t0,$rb,$rc
xor $x_new,$x_new,$x_8
add $rf,$rf,$re
or $t1,$rb,$rc
rotlwi $rb,$rb,30
xor $x_new,$x_new,$x_13
and $t1,$t1,$rd
or $t0,$t0,$t1
rotlwi $x_new,$x_new,1
add $rf,$rf,$t0
___
}
160 | |
# Public entry point .sha1_block_data_order.  Arguments arrive in $ctx
# (hash state), $inp (input pointer) and $num (number of 64-byte blocks).
#
# The prologue allocates a $FRAME-byte stack frame, saves r15-r31 in its
# top slots plus the link register, and loads the five state words into
# $A..$E.  Word-aligned input goes straight to Lsha1_block_private with the
# block count in CTR.  Unaligned input is handed over in runs that stop
# short of the next 4096-byte page boundary; the block that straddles the
# boundary is first copied bytewise into the frame at $sp+$LOCALS and
# hashed from there.

# Save/restore sequences for r15..r31: register rN lives (32-N) slots
# below the top of the frame.
my @saved_gprs   = (15 .. 31);
my $save_gprs    = join "\n",
    map { "$PUSH r$_,`$FRAME-$SIZE_T*@{[32-$_]}`($sp)" } @saved_gprs;
my $restore_gprs = join "\n",
    map { "$POP r$_,`$FRAME-$SIZE_T*@{[32-$_]}`($sp)" } @saved_gprs;

$code=<<___;
.machine "any"
.text

.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
$STU $sp,-$FRAME($sp)
mflr r0
$save_gprs
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
lwz $D,12($ctx)
lwz $E,16($ctx)
andi. r0,$inp,3
bne Lunaligned
Laligned:
mtctr $num
bl Lsha1_block_private
b Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,4095 ; distance to closest page boundary
srwi. $t1,$t1,6 ; t1/=64
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
mtctr $t1
subfc $num,$t1,$num
bl Lsha1_block_private
Lcross_page:
li $t1,16
mtctr $t1
addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy

$PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
$POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne- Lunaligned

Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$restore_gprs
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
269 | |
# Private block function with a tailored calling interface: on entry the
# SHA_CTX words are already loaded into the working registers and the
# counter register holds the number of 64-byte chunks to digest.
$code.=<<___;
.align 4
Lsha1_block_private:
___

# The 80 rounds come in four stages of 20.  Each stage first loads its
# round constant into $K (high half with lis, low half with ori) and then
# emits its rounds with the listed emitter.
my @stages = (
    # [ first round of the following stage, K high, K low, round emitter ]
    [ 20, "0x5a82", "0x7999", \&BODY_00_19 ],    # K_00_19
    [ 40, "0x6ed9", "0xeba1", \&BODY_20_39 ],    # K_20_39
    [ 60, "0x8f1b", "0xbcdc", \&BODY_40_59 ],    # K_40_59
    [ 80, "0xca62", "0xc1d6", \&BODY_20_39 ],    # K_60_79
);

$i = 0;
foreach my $stage (@stages) {
    my ($stop, $k_hi, $k_lo, $emit_round) = @$stage;

    $code .= <<___;
lis $K,$k_hi
ori $K,$K,$k_lo
___

    for (; $i < $stop; $i++) {
        $emit_round->($i, @V);
        # Rotate the register roles instead of moving data between
        # registers: the last entry of @V becomes the first.
        unshift(@V, pop(@V));
    }
}

# @V has six entries and was rotated 80 times, so the working values
# a..e now live in $E,$T,$A,$B,$C.  They are added to the state words that
# round 79 reloaded into r16..r20, stored back to $ctx, and copied into
# $A..$E for the next chunk.  The final .asciz is an identification string.
$code.=<<___;
add r16,r16,$E
add r17,r17,$T
add r18,r18,$A
add r19,r19,$B
add r20,r20,$C
stw r16,0($ctx)
mr $A,r16
stw r17,4($ctx)
mr $B,r17
stw r18,8($ctx)
mr $C,r18
stw r19,12($ctx)
mr $D,r19
stw r20,16($ctx)
mr $E,r20
addi $inp,$inp,`16*4`
bdnz- Lsha1_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
323 | |
# Replace every `expr` in the generated text with the value of the Perl
# expression between the backticks (used above for frame offsets and
# input displacements), then send the result down the translator pipe.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code or die "error writing to STDOUT: $!";

# STDOUT is a pipe to the translator: write errors and a failing
# translator only surface when the pipe is closed, so the result of close
# must be checked or a truncated/empty output would pass as success.
close STDOUT
    or die "error closing STDOUT: ", ($! ? $! : "translator exited with status $?");
OLD | NEW |