OLD | NEW |
| (Empty) |
1 #!/usr/bin/env perl | |
2 | |
3 # ==================================================================== | |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 # project. The module is, however, dual licensed under OpenSSL and | |
6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 # details see http://www.openssl.org/~appro/cryptogams/. | |
8 # ==================================================================== | |
9 | |
10 # RC4 for PA-RISC. | |
11 | |
12 # June 2009. | |
13 # | |
14 # Performance is 33% better than gcc 3.2 generated code on PA-7100LC. | |
15 # For reference, [4x] unrolled loop is >40% faster than folded one. | |
16 # It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement | |
17 # is believed to be not sufficient to justify the effort... | |
18 # | |
19 # Special thanks to polarhome.com for providing HP-UX account. | |
20 | |
# Directory part of this script's path, including the trailing separator.
# It is used further down to locate opensslconf.h.  When the script is run
# without any directory component the match fails, so fall back to an empty
# prefix explicitly instead of reading a stale/undefined $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> <output file>
$flavour = shift;
$output  = shift;

# Send everything printed later to the requested output file.  A failed
# open used to go unnoticed: STDOUT ended up closed and the generated
# assembly was silently dropped.  Fail loudly instead.  When no output
# file is given, keep writing to the inherited STDOUT.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output)
        or die "$0: cannot open '$output' for writing: $!\n";
}
26 | |
# ABI-dependent parameters.  A flavour containing "64" selects the
# .LEVEL 2.0W set (8-byte $SIZE_T, std/ldd stack accesses); anything
# else selects the .LEVEL 1.0 set (4-byte $SIZE_T, stw/ldw).
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP,
 $PUSH,  $PUSHMA, $POP,          $POPMB) =
    ($flavour =~ /64/)
    ? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb")
    : ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm");

# Stack frame size: 4 saved regs + frame marker [+ argument transfer].
$FRAME = $FRAME_MARKER + 4*$SIZE_T;
# $SZ is the size in bytes of one RC4 table element as seen by the
# generated code: 1 when RC4_INT is a char type, 4 otherwise.  It is taken
# from the first "#define RC4_INT ..." line of ../../opensslconf.h
# (relative to this script); when that file cannot be opened, or has no
# such line, the default of 1 is kept.
$SZ = 1;    # defaults to RC4_CHAR
if (open(my $conf, '<', "${dir}../../opensslconf.h")) {
    while (my $line = <$conf>) {
        next unless $line =~ m/#\s*define\s+RC4_INT\s+(.*)/;

        # Strip trailing blanks and a possible CR (CRLF checkouts) before
        # testing the type name; otherwise "unsigned char\r" would fail
        # the /char$/ test and be mistaken for an int-sized element.
        (my $type = $1) =~ s/\s+$//;
        $SZ = ($type =~ /char$/) ? 1 : 4;
        last;
    }
    close($conf);
}
59 | |
# Mnemonics used to access one RC4 table element, chosen by element size:
#   $LD  - load with displacement     $LDX - indexed load
#   $MKX - form the address of element [index]   $ST - store
($LD, $LDX, $MKX, $ST) =
    ($SZ == 1)
    ? ("ldb", "ldbx",   "addl",    "stb")   # RC4_CHAR
    : ("ldw", "ldwx,s", "sh2addl", "stw");  # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
71 | |
# Register allocation used by all templates below.

# Argument registers (the .EXPORT lines declare ARGW0..ARGW3 as GR).
($key, $len, $inp, $out) = ("%r26", "%r25", "%r24", "%r23");

# Cipher state: @XX holds the x index and its successor, @TX the table
# values loaded through them; $YY is the y index, $TY the value loaded
# through it.  @XX and @TX are rotated by unrolledloopbody().
@XX = ("%r19", "%r20");
@TX = ("%r21", "%r22");
($YY, $TY) = ("%r28", "%r29");

# Scratch registers and the 0xff mask.
($acc, $ix, $iy)     = ("%r1", "%r2", "%r3");
($dat0, $dat1, $rem) = ("%r4", "%r5", "%r6");
$mask = "%r31";
89 | |
# unrolledloopbody() appends four consecutive copies of the RC4 round to
# $code.  Takes no arguments and returns nothing useful; its effects are
# the text appended to $code and the rotation of @XX and @TX.
#
# After each copy @TX and @XX are rotated by one position, so the registers
# a copy loaded as "next x" / "next table value" ($XX[1], $TX[1]) become
# the current ones ($XX[0], $TX[0]) of the following copy.
#
# Copies 1..3 additionally carry two backtick expressions: an indexed load
# through $TY into $dat1, and a deposit of that byte into $acc at bit
# position 8*($i-1)+7 ("zdep" for copy 1, "dep" for copies 2 and 3).  The
# load through $TY for the fourth copy is not emitted here; the templates
# that follow each call issue it themselves.
90 sub unrolledloopbody { | |
91 for ($i=0;$i<4;$i++) { | |
# The heredoc interpolates, so $i is expanded to the literal copy number
# inside the backtick expressions, and "%$TY" becomes "%%r29", which
# sprintf later prints as "%r29".  The backtick expressions themselves are
# only evaluated by the `...` substitution at the end of this script.
92 $code.=<<___; | |
93 ldo 1($XX[0]),$XX[1] | |
94 `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` | |
95 and $mask,$XX[1],$XX[1] | |
96 $LDX $YY($key),$TY | |
97 $MKX $YY,$key,$ix | |
98 $LDX $XX[1]($key),$TX[1] | |
99 $MKX $XX[0],$key,$iy | |
100 $ST $TX[0],0($ix) | |
101 comclr,<> $XX[1],$YY,%r0 ; conditional | |
102 copy $TX[0],$TX[1] ; move | |
103 `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` | |
104 $ST $TY,0($iy) | |
105 addl $TX[0],$TY,$TY | |
106 addl $TX[1],$YY,$YY | |
107 and $mask,$TY,$TY | |
108 and $mask,$YY,$YY | |
109 ___ | |
110 push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers | |
111 } } | |
112 | |
# foldedloop(label, count) appends a loop to $code that processes one byte
# per pass.
#
#   label - text emitted on its own line as the loop label and used as the
#           branch target of the closing addib
#   count - register that addib decrements by one on every pass; the loop
#           repeats while the result is non-zero (addib,<>)
#
# Each pass stores $TX[0] and $TY back into the table at each other's
# index, advances and masks $XX[0], loads one input byte with
# "ldbx $inp($out)", xors it with the table value loaded through $TY, and
# writes the result with stb to the byte just before the incremented $out.
# NOTE(review): the "$inp($out)" addressing relies on $inp holding the
# distance between the input and output pointers, which the RC4 template
# sets up before any of these loops runs.
113 sub foldedloop { | |
114 my ($label,$count)=@_; | |
115 $code.=<<___; | |
116 $label | |
117 $MKX $YY,$key,$iy | |
118 $LDX $YY($key),$TY | |
119 $MKX $XX[0],$key,$ix | |
120 $ST $TX[0],0($iy) | |
121 ldo 1($XX[0]),$XX[0] | |
122 $ST $TY,0($ix) | |
123 addl $TX[0],$TY,$TY | |
124 ldbx $inp($out),$dat1 | |
125 and $mask,$TY,$TY | |
126 and $mask,$XX[0],$XX[0] | |
127 $LDX $TY($key),$acc | |
128 $LDX $XX[0]($key),$TX[0] | |
129 ldo 1($out),$out | |
130 xor $dat1,$acc,$acc | |
131 addl $TX[0],$YY,$YY | |
132 stb $acc,-1($out) | |
133 addib,<> -1,$count,$label ; $count is always small | |
134 and $mask,$YY,$YY | |
135 ___ | |
136 } | |
137 | |
# RC4 entry point.  Arguments arrive in $key, $len, $inp and $out.
#
# This first part emits the prologue (saving %r2..%r6), returns through
# L$abort when $len is zero, turns $inp into the distance between the
# input and output pointers, loads x and y from the two elements at $key
# and advances $key past them to the table, and runs the "warm up" step.
# Inputs the template considers too short to bother with branch straight
# to the byte loop at L$oop1; otherwise the alignment of $out is checked.
138 $code=<<___; | |
139 .LEVEL $LEVEL | |
140 .SPACE \$TEXT\$ | |
141 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | |
142 | |
143 .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | |
144 RC4 | |
145 .PROC | |
146 .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 | |
147 .ENTRY | |
148 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | |
149 $PUSHMA %r3,$FRAME(%sp) | |
150 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | |
151 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | |
152 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | |
153 | |
154 cmpib,*= 0,$len,L\$abort | |
155 sub $inp,$out,$inp ; distance between $inp and $out | |
156 | |
157 $LD `0*$SZ`($key),$XX[0] | |
158 $LD `1*$SZ`($key),$YY | |
159 ldo `2*$SZ`($key),$key | |
160 | |
161 ldi 0xff,$mask | |
162 ldi 3,$dat0 | |
163 | |
164 ldo 1($XX[0]),$XX[0] ; warm up loop | |
165 and $mask,$XX[0],$XX[0] | |
166 $LDX $XX[0]($key),$TX[0] | |
167 addl $TX[0],$YY,$YY | |
168 cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? | |
169 and $mask,$YY,$YY | |
170 | |
171 and,<> $out,$dat0,$rem ; is $out aligned? | |
172 b L\$alignedout | |
173 subi 4,$rem,$rem | |
174 sub $len,$rem,$len | |
175 ___ | |
# Byte loop counted by $rem; $len has already been reduced by $rem above.
176 &foldedloop("L\$alignout",$rem); # process till $out is aligned | |
177 | |
# With $out aligned, pick one of the two 4x unrolled loops depending on
# whether $inp is aligned as well.  For the misaligned case the shift
# amount for vshd is loaded into %cr11 and the first input word is
# preloaded into $dat0.
178 $code.=<<___; | |
179 L\$alignedout ; $len is at least 4 here | |
180 and,<> $inp,$dat0,$acc ; is $inp aligned? | |
181 b L\$oop4 | |
182 sub $inp,$acc,$rem ; align $inp | |
183 | |
184 sh3addl $acc,%r0,$acc | |
185 subi 32,$acc,$acc | |
186 mtctl $acc,%cr11 ; load %sar with vshd align factor | |
187 ldwx $rem($out),$dat0 | |
188 ldo 4($rem),$rem | |
189 L\$oop4misalignedinp | |
190 ___ | |
# Body of L$oop4misalignedinp: four rounds, then the tail below fetches the
# fourth byte through $TY, merges two input words with vshd, xors and
# stores one output word per pass.
191 &unrolledloopbody(); | |
192 $code.=<<___; | |
193 $LDX $TY($key),$ix | |
194 ldwx $rem($out),$dat1 | |
195 ldo -4($len),$len | |
196 or $ix,$acc,$acc ; last piece, no need to dep | |
197 vshd $dat0,$dat1,$iy ; align data | |
198 copy $dat1,$dat0 | |
199 xor $iy,$acc,$acc | |
200 stw $acc,0($out) | |
201 cmpib,*<< 3,$len,L\$oop4misalignedinp | |
202 ldo 4($out),$out | |
203 cmpib,*= 0,$len,L\$done | |
204 nop | |
205 b L\$oop1 | |
206 nop | |
207 | |
208 .ALIGN 8 | |
209 L\$oop4 | |
210 ___ | |
# Body of L$oop4 (input aligned): same four rounds, but the input word is
# loaded directly with "ldwx $inp($out)" and needs no vshd.
211 &unrolledloopbody(); | |
212 $code.=<<___; | |
213 $LDX $TY($key),$ix | |
214 ldwx $inp($out),$dat0 | |
215 ldo -4($len),$len | |
216 or $ix,$acc,$acc ; last piece, no need to dep | |
217 xor $dat0,$acc,$acc | |
218 stw $acc,0($out) | |
219 cmpib,*<< 3,$len,L\$oop4 | |
220 ldo 4($out),$out | |
221 cmpib,*= 0,$len,L\$done | |
222 nop | |
223 ___ | |
# Byte loop counted by $len: handles whatever the unrolled loops left over
# and is also the target of the early branch for short inputs.
224 &foldedloop("L\$oop1",$len); | |
# Epilogue.  The "chill out" step reverses the warm-up (x-1, y-$TX[0], both
# masked), x and y are stored back into the two elements just below the
# table, the saved registers are reloaded, and control returns through %r2
# with %r3 restored in the delay slot.
225 $code.=<<___; | |
226 L\$done | |
227 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 | |
228 ldo -1($XX[0]),$XX[0] ; chill out loop | |
229 sub $YY,$TX[0],$YY | |
230 and $mask,$XX[0],$XX[0] | |
231 and $mask,$YY,$YY | |
232 $ST $XX[0],`-2*$SZ`($key) | |
233 $ST $YY,`-1*$SZ`($key) | |
234 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | |
235 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | |
236 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | |
237 L\$abort | |
238 bv (%r2) | |
239 .EXIT | |
240 $POPMB -$FRAME(%sp),%r3 | |
241 .PROCEND | |
242 ___ | |
243 | |
# private_RC4_set_key and RC4_options.
#
# private_RC4_set_key: zeroes the two elements at $key, advances $key past
# them, and fills the following 256 elements with 0..255 (L$1st).  $key is
# then rewound to the start of the table, $inp is moved to the end of the
# key data ($inp + $len) and %r23 is used as a negative index into it.
# L$2nd runs for x = 0..255: it adds the table value at x and one key byte
# to the running index in @XX[1] (masked with 0xff) and exchanges the two
# table elements.  The template marks the addi,nuv / sub pair as
# "increment and conditional" / "inverse index", i.e. %r23 is reset to
# -$len when the key data is used up - presumably so short keys are
# reused cyclically; confirm against the PA-RISC nullification rules.
#
# The template spells the registers @XX[0], @XX[1], @TX[0], @TX[1]; these
# one-element slices interpolate to the same text as $XX[0] etc.  @XX and
# @TX are in their initial order here, because unrolledloopbody() has been
# called twice and rotates each of them four times per call.
#
# RC4_options: computes the address of the L$opts string relative to the
# L$pic label (blr / andcm / ldo) and leaves it in %r28 before returning.
# The first .STRINGZ reports "char" or "int" depending on $SZ.
244 $code.=<<___; | |
245 | |
246 .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | |
247 .ALIGN 8 | |
248 private_RC4_set_key | |
249 .PROC | |
250 .CALLINFO NO_CALLS | |
251 .ENTRY | |
252 $ST %r0,`0*$SZ`($key) | |
253 $ST %r0,`1*$SZ`($key) | |
254 ldo `2*$SZ`($key),$key | |
255 copy %r0,@XX[0] | |
256 L\$1st | |
257 $ST @XX[0],0($key) | |
258 ldo 1(@XX[0]),@XX[0] | |
259 bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 | |
260 ldo $SZ($key),$key | |
261 | |
262 ldo `-256*$SZ`($key),$key ; rewind $key | |
263 addl $len,$inp,$inp ; $inp to point at the end | |
264 sub %r0,$len,%r23 ; inverse index | |
265 copy %r0,@XX[0] | |
266 copy %r0,@XX[1] | |
267 ldi 0xff,$mask | |
268 | |
269 L\$2nd | |
270 $LDX @XX[0]($key),@TX[0] | |
271 ldbx %r23($inp),@TX[1] | |
272 addi,nuv 1,%r23,%r23 ; increment and conditional | |
273 sub %r0,$len,%r23 ; inverse index | |
274 addl @TX[0],@XX[1],@XX[1] | |
275 addl @TX[1],@XX[1],@XX[1] | |
276 and $mask,@XX[1],@XX[1] | |
277 $MKX @XX[0],$key,$TY | |
278 $LDX @XX[1]($key),@TX[1] | |
279 $MKX @XX[1],$key,$YY | |
280 ldo 1(@XX[0]),@XX[0] | |
281 $ST @TX[0],0($YY) | |
282 bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 | |
283 $ST @TX[1],0($TY) | |
284 | |
285 bv,n (%r2) | |
286 .EXIT | |
287 nop | |
288 .PROCEND | |
289 | |
290 .EXPORT RC4_options,ENTRY | |
291 .ALIGN 8 | |
292 RC4_options | |
293 .PROC | |
294 .CALLINFO NO_CALLS | |
295 .ENTRY | |
296 blr %r0,%r28 | |
297 ldi 3,%r1 | |
298 L\$pic | |
299 andcm %r28,%r1,%r28 | |
300 bv (%r2) | |
301 .EXIT | |
302 ldo L\$opts-L\$pic(%r28),%r28 | |
303 .PROCEND | |
304 .ALIGN 8 | |
305 L\$opts | |
306 .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" | |
307 .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | |
308 ___ | |
# Post-process the assembled template and write it out.

# Replace every `...` span with the value of the Perl expression inside it.
# An expression that fails to compile or run used to be replaced by an
# empty string without any diagnostic, yielding broken assembly; abort
# instead so the problem is visible at generation time.
$code =~ s{\`([^\`]*)\`}{
    my $expr  = $1;
    my $value = eval $expr;
    die "$0: cannot evaluate '$expr': $@" if $@;
    $value;
}ge;

# With a 4-byte $SIZE_T the "cmpib,*" forms used in the templates are
# rewritten to "comib,".
$code =~ s/cmpib,\*/comib,/g if ($SIZE_T==4);

# Write errors on a buffered handle may only surface at close time, so
# check both calls: a truncated output file must not pass for success.
print $code          or die "$0: error writing output: $!\n";
close STDOUT         or die "$0: error closing STDOUT: $!\n";
OLD | NEW |