OLD | NEW |
| (Empty) |
1 #!/usr/bin/env perl | |
2 | |
3 # ==================================================================== | |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 # project. The module is, however, dual licensed under OpenSSL and | |
6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 # details see http://www.openssl.org/~appro/cryptogams/. | |
8 # ==================================================================== | |
9 | |
10 # SHA1 block procedure for MIPS. | |
11 | |
12 # Performance improvement is 30% on unaligned input. The "secret" is | |
13 # to deploy lwl/lwr pair to load unaligned input. One could have | |
14 # vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- | |
15 # compatible subroutine. There is room for minor optimization on | |
16 # little-endian platforms... | |
17 | |
18 ###################################################################### | |
19 # There are a number of MIPS ABIs in use; O32 and N32/64 are the most | |
20 # widely used. Then there is a new contender: NUBI. It appears that if | |
21 # one picks the latter, it's possible to arrange code in an ABI-neutral | |
22 # manner. Therefore let's stick to the NUBI register layout: | |
23 # | |
# NUBI register layout: each symbolic name is bound to the string "$N",
# where N is the hardware register number used in the emitted assembly.
($zero, $at, $t0, $t1, $t2) = map { sprintf '$%d', $_ } 0 .. 2, 24, 25;
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { sprintf '$%d', $_ } 4 .. 11;
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
    map { sprintf '$%d', $_ } 12 .. 23;
($gp, $tp, $sp, $fp, $ra) = map { sprintf '$%d', $_ } 3, 28 .. 31;
28 # | |
29 # The return value is placed in $a0. The following coding rules | |
30 # facilitate interoperability: | |
31 # | |
32 # - never ever touch $tp, "thread pointer", former $gp; | |
33 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | |
34 # old code]; | |
35 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | |
36 # | |
37 # For reference here is register layout for N32/64 MIPS ABIs: | |
38 # | |
39 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | |
40 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | |
41 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | |
42 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | |
43 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | |
44 # | |
# The first command-line argument selects the ABI flavour; supported
# flavours are o32, n32, 64, nubi32 and nubi64.
$flavour = shift;

# Pick the pointer-arithmetic and register save/restore mnemonics plus
# the register size in bytes.  Any flavour matching "64" or "n32" gets
# the doubleword forms (the d-prefixed arithmetic incidentally works
# even on n32); everything else gets the 32-bit forms.
($PTR_ADD, $PTR_SUB, $PTR_SLL, $REG_S, $REG_L, $SZREG) =
    ($flavour =~ /64|n32/i)
    ? ("dadd", "dsub", "dsll", "sd", "ld", 8)
    : ("add",  "sub",  "sll",  "sw", "lw", 4);
62 # | |
63 # <appro@openssl.org> | |
64 # | |
65 ###################################################################### | |
66 | |
# Determine the endianness of the *target* by feeding the token MIPSEL
# through the C preprocessor named by $ENV{CC}: if the token survives
# unchanged the target is treated as big-endian (presumably because
# little-endian MIPS compilers predefine MIPSEL -- confirm against the
# toolchain).  The probe is only attempted when CC is actually set, so
# that $big_endian stays undefined otherwise and the host-based fallback
# below can take over instead of running a malformed shell pipeline.
if ($ENV{CC}) {
    $big_endian = (`echo MIPSEL | $ENV{CC} -E -P -` =~ /MIPSEL/) ? 1 : 0;
}

# The last argument that looks like a plain "name.ext" file name is the
# output file.  Redirect STDOUT only when one was given (otherwise the
# generated code goes to the inherited STDOUT) and fail loudly if the
# file cannot be created.
for (@ARGV) { $output = $_ if (/^\w[\w\-]*\.\w+$/); }
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# No compiler to ask: assume the target has the byte order of the host
# this script is running on.
if (!defined($big_endian)) {
    $big_endian = (unpack('L', pack('N', 1)) == 1);
}

# offsets of the Most and Least Significant Bytes
$MSB = $big_endian ? 0 : 3;
$LSB = 3 & ~$MSB;
78 | |
# Sixteen registers holding the message schedule window X[0..15].
@X = map { sprintf '$%d', $_ } 8 .. 23;    # a4-a7,s0-s11

# Incoming arguments: hash context, input pointer, block count.
($ctx, $inp, $num) = ($a0, $a1, $a2);

# The five SHA-1 working variables; @V is rotated by the round
# generators so each round sees them in a different role.
($A, $B, $C, $D, $E) = ('$1', '$2', '$3', '$7', '$24');
@V = ($A, $B, $C, $D, $E);

# Scratch registers and the round-constant register.  These override
# the NUBI $t0-$t2 names: $t1 reuses $num's register ($num itself is
# offloaded to the stack), $t2 is fp and $K is ra.
($t0, $t1, $t2) = ('$25', $num, '$30');
$K = '$31';
93 | |
# Emit one SHA-1 round for rounds 0..14.
#
# Arguments: the round index followed by the five registers that play
# the roles a, b, c, d, e in this round.  The generated code is appended
# to $code.  It adds K, the two shifted halves of a rotated left by 5,
# ((c^d)&b)^d and X[i] into e, rotates b left by 30 in place, and
# interleaves an lwl/lwr load of the next input word X[i+1].  On
# little-endian targets X[i] is byte-swapped first.
sub BODY_00_14 {
    my ($round,$va,$vb,$vc,$vd,$ve)=@_;
    my $next=$round+1;
    my ($cur,$nxt)=@X[$round,$next];

    if (!$big_endian) {
        $code.=<<___;
srl $t0,$cur,24 # byte swap($round)
srl $t1,$cur,8
andi $t2,$cur,0xFF00
sll $cur,$cur,24
andi $t1,0xFF00
sll $t2,$t2,8
or $cur,$t0
or $t1,$t2
or $cur,$t1
___
    }
    $code.=<<___;
lwl $nxt,$next*4+$MSB($inp)
sll $t0,$va,5 # $round
addu $ve,$K
lwr $nxt,$next*4+$LSB($inp)
srl $t1,$va,27
addu $ve,$t0
xor $t0,$vc,$vd
addu $ve,$t1
sll $t2,$vb,30
and $t0,$vb
srl $vb,$vb,2
xor $t0,$vd
addu $ve,$cur
or $vb,$t2
addu $ve,$t0
___
}
126 | |
# Emit one SHA-1 round for rounds 15..19.
#
# Arguments: the round index followed by the registers acting as
# a, b, c, d, e.  Same round function as rounds 0..14, but instead of
# loading input the code computes the next schedule word in place:
# X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13] (indices mod 16, j = i+1), then
# rotates it left by one bit.  On little-endian targets the last loaded
# input word, X[15], is byte-swapped when i == 15.
sub BODY_15_19 {
    my ($round,$va,$vb,$vc,$vd,$ve)=@_;
    my $next=$round+1;
    my $cur=$X[$round%16];
    my $nxt=$X[$next%16];
    my ($x2,$x8,$x13)=map { $X[($next+$_)%16] } (2,8,13);

    if (!$big_endian && $round==15) {
        $code.=<<___;
srl $t0,$cur,24 # byte swap($round)
srl $t1,$cur,8
andi $t2,$cur,0xFF00
sll $cur,$cur,24
andi $t1,0xFF00
sll $t2,$t2,8
or $cur,$t0
or $cur,$t1
or $cur,$t2
___
    }
    $code.=<<___;
xor $nxt,$x2
sll $t0,$va,5 # $round
addu $ve,$K
srl $t1,$va,27
addu $ve,$t0
xor $nxt,$x8
xor $t0,$vc,$vd
addu $ve,$t1
xor $nxt,$x13
sll $t2,$vb,30
and $t0,$vb
srl $t1,$nxt,31
addu $nxt,$nxt
srl $vb,$vb,2
xor $t0,$vd
or $nxt,$t1
addu $ve,$cur
or $vb,$t2
addu $ve,$t0
___
}
164 | |
# Emit one SHA-1 round using the parity function b^c^d.
#
# Arguments: the round index followed by the registers acting as
# a, b, c, d, e.  For rounds below 79 the next schedule word is updated
# in place (X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13], rotated left by one,
# indices mod 16, j = i+1).  Round 79 needs no further schedule word, so
# its slots are used to load the five context words from $ctx into
# X[0..4] instead.
sub BODY_20_39 {
    my ($round,$va,$vb,$vc,$vd,$ve)=@_;
    my $next=$round+1;
    my $cur=$X[$round%16];
    my $nxt=$X[$next%16];
    my ($x2,$x8,$x13)=map { $X[($next+$_)%16] } (2,8,13);

    if ($round<79) {
        $code.=<<___;
xor $nxt,$x2
sll $t0,$va,5 # $round
addu $ve,$K
srl $t1,$va,27
addu $ve,$t0
xor $nxt,$x8
xor $t0,$vc,$vd
addu $ve,$t1
xor $nxt,$x13
sll $t2,$vb,30
xor $t0,$vb
srl $t1,$nxt,31
addu $nxt,$nxt
srl $vb,$vb,2
addu $ve,$cur
or $nxt,$t1
or $vb,$t2
addu $ve,$t0
___
    }
    elsif ($round==79) {
        $code.=<<___;
lw $X[0],0($ctx)
sll $t0,$va,5 # $round
addu $ve,$K
lw $X[1],4($ctx)
srl $t1,$va,27
addu $ve,$t0
lw $X[2],8($ctx)
xor $t0,$vc,$vd
addu $ve,$t1
lw $X[3],12($ctx)
sll $t2,$vb,30
xor $t0,$vb
lw $X[4],16($ctx)
srl $vb,$vb,2
addu $ve,$cur
or $vb,$t2
addu $ve,$t0
___
    }
}
208 | |
# Emit one SHA-1 round using the majority function, computed as
# (c&d) + ((c^d)&b) and accumulated into e in two steps.
#
# Arguments: the round index followed by the registers acting as
# a, b, c, d, e.  The next schedule word is updated in place exactly as
# in the other schedule-updating rounds (X[j] ^= X[j+2] ^ X[j+8] ^
# X[j+13], rotated left by one, indices mod 16, j = i+1).  Nothing is
# emitted for a round index of 79 or above.
sub BODY_40_59 {
    my ($round,$va,$vb,$vc,$vd,$ve)=@_;
    my $next=$round+1;
    my $cur=$X[$round%16];
    my $nxt=$X[$next%16];
    my ($x2,$x8,$x13)=map { $X[($next+$_)%16] } (2,8,13);

    if ($round<79) {
        $code.=<<___;
xor $nxt,$x2
sll $t0,$va,5 # $round
addu $ve,$K
srl $t1,$va,27
addu $ve,$t0
xor $nxt,$x8
and $t0,$vc,$vd
addu $ve,$t1
xor $nxt,$x13
sll $t2,$vb,30
addu $ve,$t0
srl $t1,$nxt,31
xor $t0,$vc,$vd
addu $nxt,$nxt
and $t0,$vb
srl $vb,$vb,2
or $nxt,$t1
addu $ve,$cur
or $vb,$t2
addu $ve,$t0
___
    }
}
235 | |
# Stack frame size in register-sized slots; large enough to accommodate
# the NUBI saved registers.
$FRAMESIZE=16;

# Register bitmask handed to the .mask directive.  The NUBI flavours
# save five more registers than the others (see the save list below).
$SAVED_REGS_MASK = 0xc0ff0000;
$SAVED_REGS_MASK = 0xc0fff008 if ($flavour =~ /nubi/i);

# Function header and stack frame allocation.
$code=<<___;
#ifdef OPENSSL_FIPSCANISTER
# include <openssl/fipssyms.h>
#endif

.text

.set noat
.set noreorder
.align 5
.globl sha1_block_data_order
.ent sha1_block_data_order
sha1_block_data_order:
.frame $sp,$FRAMESIZE*$SZREG,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
$PTR_SUB $sp,$FRAMESIZE*$SZREG
___

# Save registers from the top of the frame downwards, one slot each.
# The extra five are stored only for the NUBI flavours, which keeps the
# non-NUBI prologue shorter.
{
    my @saved = ($ra,$fp,$s11,$s10,$s9,$s8,$s7,$s6,$s5,$s4);
    push @saved, ($s3,$s2,$s1,$s0,$gp) if ($flavour =~ /nubi/i);
    my $slot = 0;
    for my $reg (@saved) {
        $slot++;
        $code .= "$REG_S $reg,($FRAMESIZE-$slot)*$SZREG($sp)\n";
    }
}

# Turn the block count into an end-of-input pointer and park it on the
# stack (its register doubles as a scratch register inside the loop),
# load the five hash words, then start the per-block loop by loading
# the first input word and the constant for rounds 0..19.
$code.=<<___;
$PTR_SLL $num,6
$PTR_ADD $num,$inp
$REG_S $num,0($sp)
lw $A,0($ctx)
lw $B,4($ctx)
lw $C,8($ctx)
lw $D,12($ctx)
b .Loop
lw $E,16($ctx)
.align 4
.Loop:
.set reorder
lwl $X[0],$MSB($inp)
lui $K,0x5a82
lwr $X[0],$LSB($inp)
ori $K,0x7999 # K_00_19
___
# Generate all 80 rounds.  After each round the register list @V is
# rotated by one position (last element moved to the front), so the
# a..e roles shift between registers without any move instructions.
# $i is deliberately shared across the stages so each one resumes where
# the previous one stopped.
$i = 0;

# Rounds 0..14: load-as-you-go rounds.
while ($i < 15) { BODY_00_14($i, @V); unshift(@V, pop(@V)); $i++; }

# Rounds 15..19: same round function, schedule now computed in place.
while ($i < 20) { BODY_15_19($i, @V); unshift(@V, pop(@V)); $i++; }

$code.=<<___;
lui $K,0x6ed9
ori $K,0xeba1 # K_20_39
___
while ($i < 40) { BODY_20_39($i, @V); unshift(@V, pop(@V)); $i++; }

$code.=<<___;
lui $K,0x8f1b
ori $K,0xbcdc # K_40_59
___
while ($i < 60) { BODY_40_59($i, @V); unshift(@V, pop(@V)); $i++; }

# Rounds 60..79 reuse the rounds 20..39 generator with a new constant.
$code.=<<___;
lui $K,0xca62
ori $K,0xc1d6 # K_60_79
___
while ($i < 80) { BODY_20_39($i, @V); unshift(@V, pop(@V)); $i++; }
# End of the per-block loop: advance the input pointer, reload the
# end-of-input pointer from the stack, add the previous hash words
# (loaded into X[0..4] during round 79) to the working variables, store
# the result back to the context and loop while input remains.
$code.=<<___;
$PTR_ADD $inp,64
$REG_L $num,0($sp)

addu $A,$X[0]
addu $B,$X[1]
sw $A,0($ctx)
addu $C,$X[2]
addu $D,$X[3]
sw $B,4($ctx)
addu $E,$X[4]
sw $C,8($ctx)
sw $D,12($ctx)
sw $E,16($ctx)
.set noreorder
bne $inp,$num,.Loop
nop

.set noreorder
___

# Restore the saved registers from the same slots and in the same order
# in which the prologue stored them; the last five only for NUBI.
{
    my @saved = ($ra,$fp,$s11,$s10,$s9,$s8,$s7,$s6,$s5,$s4);
    push @saved, ($s3,$s2,$s1,$s0,$gp) if ($flavour =~ /nubi/i);
    my $slot = 0;
    for my $reg (@saved) {
        $slot++;
        $code .= "$REG_L $reg,($FRAMESIZE-$slot)*$SZREG($sp)\n";
    }
}

# Return; the stack frame is released in the branch delay slot.
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end sha1_block_data_order
.rdata
.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
# Write the generated assembly to STDOUT.  Output is buffered, so a
# failed write (full disk, closed pipe, ...) may only surface when the
# handle is closed; check the close so a truncated file is reported
# instead of being silently accepted.
print $code;
close STDOUT or die "error closing STDOUT: $!";
OLD | NEW |