| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 # ==================================================================== | |
| 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
| 5 # project. The module is, however, dual licensed under OpenSSL and | |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. | |
| 8 # ==================================================================== | |
| 9 | |
| 10 # RC4 for PA-RISC. | |
| 11 | |
| 12 # June 2009. | |
| 13 # | |
| 14 # Performance is 33% better than gcc 3.2 generated code on PA-7100LC. | |
| 15 # For reference, [4x] unrolled loop is >40% faster than folded one. | |
| 16 # It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement | |
| 17 # is believed to be not sufficient to justify the effort... | |
| 18 # | |
| 19 # Special thanks to polarhome.com for providing HP-UX account. | |
| 20 | |
# Directory part of this script's path, including the trailing separator.
# It is used further down to locate opensslconf.h.  When $0 carries no
# directory part the prefix is simply empty.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> <output file>
$flavour = shift;
$output = shift;

# Send the generated assembly to $output.  The three-argument form keeps
# the file name from being interpreted as part of the open mode, and a
# failure to create the file is reported instead of being ignored.  When
# no output name is given the inherited STDOUT is used unchanged.
if (defined($output) && $output ne "") {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
| 26 | |
# Per-ABI parameters, chosen by whether the flavour string contains "64":
#   $LEVEL         argument of the .LEVEL directive
#   $SIZE_T        size in bytes of one register save slot
#   $FRAME_MARKER  size in bytes of the frame marker
#   $SAVED_RP      distance below %sp at which the prologue stores %r2
#   $PUSH/$PUSHMA  store / store-and-modify mnemonics
#   $POP/$POPMB    load / load-and-modify mnemonics
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP, $PUSH, $PUSHMA, $POP, $POPMB) =
    ($flavour =~ /64/)
        ? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb")
        : ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm");
| 46 | |
$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
				#                 [+ argument transfer]

# $SZ is the size in bytes of one element of the key schedule.  It has to
# agree with the RC4_INT type the C code is compiled with, so the value is
# taken from opensslconf.h when that header can be read; otherwise the
# RC4_CHAR layout is assumed.
$SZ=1;				# defaults to RC4_CHAR
if (open(my $conf, "<", "${dir}../../opensslconf.h")) {
    while (my $line = <$conf>) {
        if ($line =~ m/#\s*define\s+RC4_INT\s+(.*)/) {
            # Allow trailing blanks or a CR after the type name; a bare
            # /char$/ would classify "unsigned char " as the 4-byte layout.
            $SZ = ($1 =~ /char\s*$/) ? 1 : 4;
            last;
        }
    }
    close($conf);
}
| 59 | |
# Mnemonics matching the element size selected by $SZ:
#   $LD   load with displacement
#   $LDX  indexed load
#   $MKX  add an index to a base to form an element address
#   $ST   store with displacement
($LD, $LDX, $MKX, $ST) = ($SZ == 1)
    ? ("ldb", "ldbx",   "addl",    "stb")	# RC4_CHAR
    : ("ldw", "ldwx,s", "sh2addl", "stw");	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
| 71 | |
# Register map shared by all generated routines.

# Arguments of RC4, named after their roles: key schedule, byte count,
# input and output.
($key, $len, $inp, $out) = ("%r26", "%r25", "%r24", "%r23");

# Cipher state.  @XX and @TX hold two registers each so that the unrolled
# loop can alternate between them from one round to the next.
@XX = ("%r19", "%r20");		# x index
@TX = ("%r21", "%r22");		# table element loaded at index x
($YY, $TY) = ("%r28", "%r29");	# y index, table element loaded at index y

# Work registers.  %r2 through %r6 are stored by RC4's prologue before
# they are used under these names.
($acc, $ix, $iy, $dat0, $dat1, $rem, $mask) =
    ("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r31");
| 89 | |
# unrolledloopbody() - append four RC4 rounds to $code.
#
# Takes no arguments; everything is read from the file-level register and
# mnemonic variables.  Each round emits the same instruction template and
# then swaps the two @XX and the two @TX registers, so the following round
# continues with the values prepared by the previous one.  After the four
# rounds both arrays are back in their initial order.
#
# The two back-quoted lines are Perl expressions that are expanded by the
# eval substitution applied to $code at the end of the script.  $i is
# interpolated into them here, so in round 0 they expand to nothing, and in
# rounds 1..3 they load the table element selected by the previous round's
# $TY into $dat1 and deposit it into $acc (zdep for the first deposit, dep
# afterwards).  The element selected by the last round is left in $TY for
# the caller to load and merge.
| 90 sub unrolledloopbody { | |
| 91 for ($i=0;$i<4;$i++) { | |
| 92 $code.=<<___; | |
| 93 ldo 1($XX[0]),$XX[1] | |
| 94 `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` | |
| 95 and $mask,$XX[1],$XX[1] | |
| 96 $LDX $YY($key),$TY | |
| 97 $MKX $YY,$key,$ix | |
| 98 $LDX $XX[1]($key),$TX[1] | |
| 99 $MKX $XX[0],$key,$iy | |
| 100 $ST $TX[0],0($ix) | |
| 101 comclr,<> $XX[1],$YY,%r0 ; conditional | |
| 102 copy $TX[0],$TX[1] ; move | |
| 103 `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` | |
| 104 $ST $TY,0($iy) | |
| 105 addl $TX[0],$TY,$TY | |
| 106 addl $TX[1],$YY,$YY | |
| 107 and $mask,$TY,$TY | |
| 108 and $mask,$YY,$YY | |
| 109 ___ | |
# Swap the register pairs so the next round reads what this one prepared.
| 110 push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers | |
| 111 } } | |
| 112 | |
# foldedloop($label, $count) - append a loop to $code that processes one
# byte per iteration.
#
#   $label  text emitted as the label at the top of the loop and used as
#           the target of the closing branch
#   $count  register holding the number of iterations; the closing addib
#           adds -1 to it and branches back while the result is non-zero
#
# One iteration exchanges the table elements at indices $XX[0] and $YY,
# advances $XX[0], loads the input byte through $inp($out), XORs it with
# the table element at the masked sum of the two exchanged elements and
# stores the result at -1($out) after $out has been advanced.  The last
# "and" sits in the delay slot of the addib and masks the updated $YY.
| 113 sub foldedloop { | |
| 114 my ($label,$count)=@_; | |
| 115 $code.=<<___; | |
| 116 $label | |
| 117 $MKX $YY,$key,$iy | |
| 118 $LDX $YY($key),$TY | |
| 119 $MKX $XX[0],$key,$ix | |
| 120 $ST $TX[0],0($iy) | |
| 121 ldo 1($XX[0]),$XX[0] | |
| 122 $ST $TY,0($ix) | |
| 123 addl $TX[0],$TY,$TY | |
| 124 ldbx $inp($out),$dat1 | |
| 125 and $mask,$TY,$TY | |
| 126 and $mask,$XX[0],$XX[0] | |
| 127 $LDX $TY($key),$acc | |
| 128 $LDX $XX[0]($key),$TX[0] | |
| 129 ldo 1($out),$out | |
| 130 xor $dat1,$acc,$acc | |
| 131 addl $TX[0],$YY,$YY | |
| 132 stb $acc,-1($out) | |
| 133 addib,<> -1,$count,$label ; $count is always small | |
| 134 and $mask,$YY,$YY | |
| 135 ___ | |
| 136 } | |
| 137 | |
# Template of the RC4 routine.  Outline of the generated code:
#  - the prologue stores %r2..%r6, which are used as work registers below;
#  - a zero $len branches straight to L$abort;
#  - $inp is replaced by its distance from $out, so a single advancing
#    pointer ($out) addresses both buffers;
#  - x and y are loaded from the first two elements at $key, and $key is
#    advanced past them to the table that x and y index;
#  - after the warm-up step, lengths of 6 or less go directly to the
#    byte-at-a-time loop L$oop1;
#  - otherwise $out is brought to a 4-byte boundary and the data is handled
#    a word at a time, with a separate loop for misaligned input;
#  - L$done undoes the warm-up step, stores x and y back in front of the
#    table and restores the saved registers.
| 138 $code=<<___; | |
| 139 .LEVEL $LEVEL | |
| 140 .SPACE \$TEXT\$ | |
| 141 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | |
| 142 | |
| 143 .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | |
| 144 RC4 | |
| 145 .PROC | |
| 146 .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 | |
| 147 .ENTRY | |
| 148 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | |
| 149 $PUSHMA %r3,$FRAME(%sp) | |
| 150 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | |
| 151 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | |
| 152 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | |
| 153 | |
| 154 cmpib,*= 0,$len,L\$abort | |
| 155 sub $inp,$out,$inp ; distance between $inp and $out | |
| 156 | |
| 157 $LD `0*$SZ`($key),$XX[0] | |
| 158 $LD `1*$SZ`($key),$YY | |
| 159 ldo `2*$SZ`($key),$key | |
| 160 | |
| 161 ldi 0xff,$mask | |
| 162 ldi 3,$dat0 | |
| 163 | |
| 164 ldo 1($XX[0]),$XX[0] ; warm up loop | |
| 165 and $mask,$XX[0],$XX[0] | |
| 166 $LDX $XX[0]($key),$TX[0] | |
| 167 addl $TX[0],$YY,$YY | |
| 168 cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? | |
| 169 and $mask,$YY,$YY | |
| 170 | |
| 171 and,<> $out,$dat0,$rem ; is $out aligned? | |
| 172 b L\$alignedout | |
| 173 subi 4,$rem,$rem | |
| 174 sub $len,$rem,$len | |
| 175 ___ | |
# Byte loop executed $rem times; $len has already been reduced by $rem.
| 176 &foldedloop("L\$alignout",$rem); # process till $out is aligned | |
| 177 | |
| 178 $code.=<<___; | |
| 179 L\$alignedout ; $len is at least 4 here | |
| 180 and,<> $inp,$dat0,$acc ; is $inp aligned? | |
| 181 b L\$oop4 | |
| 182 sub $inp,$acc,$rem ; align $inp | |
| 183 | |
| 184 sh3addl $acc,%r0,$acc | |
| 185 subi 32,$acc,$acc | |
| 186 mtctl $acc,%cr11 ; load %sar with vshd align factor | |
| 187 ldwx $rem($out),$dat0 | |
| 188 ldo 4($rem),$rem | |
| 189 L\$oop4misalignedinp | |
| 190 ___ | |
# Word-at-a-time loop for misaligned input: four rounds, then the two most
# recently loaded input words are combined with vshd before the XOR.
| 191 &unrolledloopbody(); | |
| 192 $code.=<<___; | |
| 193 $LDX $TY($key),$ix | |
| 194 ldwx $rem($out),$dat1 | |
| 195 ldo -4($len),$len | |
| 196 or $ix,$acc,$acc ; last piece, no need to dep | |
| 197 vshd $dat0,$dat1,$iy ; align data | |
| 198 copy $dat1,$dat0 | |
| 199 xor $iy,$acc,$acc | |
| 200 stw $acc,0($out) | |
| 201 cmpib,*<< 3,$len,L\$oop4misalignedinp | |
| 202 ldo 4($out),$out | |
| 203 cmpib,*= 0,$len,L\$done | |
| 204 nop | |
| 205 b L\$oop1 | |
| 206 nop | |
| 207 | |
| 208 .ALIGN 8 | |
| 209 L\$oop4 | |
| 210 ___ | |
# Word-at-a-time loop for aligned input: four rounds, then one word is
# loaded, XORed and stored.
| 211 &unrolledloopbody(); | |
| 212 $code.=<<___; | |
| 213 $LDX $TY($key),$ix | |
| 214 ldwx $inp($out),$dat0 | |
| 215 ldo -4($len),$len | |
| 216 or $ix,$acc,$acc ; last piece, no need to dep | |
| 217 xor $dat0,$acc,$acc | |
| 218 stw $acc,0($out) | |
| 219 cmpib,*<< 3,$len,L\$oop4 | |
| 220 ldo 4($out),$out | |
| 221 cmpib,*= 0,$len,L\$done | |
| 222 nop | |
| 223 ___ | |
# Byte loop for whatever is left; also the branch target for short inputs.
| 224 &foldedloop("L\$oop1",$len); | |
| 225 $code.=<<___; | |
| 226 L\$done | |
| 227 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 | |
| 228 ldo -1($XX[0]),$XX[0] ; chill out loop | |
| 229 sub $YY,$TX[0],$YY | |
| 230 and $mask,$XX[0],$XX[0] | |
| 231 and $mask,$YY,$YY | |
| 232 $ST $XX[0],`-2*$SZ`($key) | |
| 233 $ST $YY,`-1*$SZ`($key) | |
| 234 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | |
| 235 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | |
| 236 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | |
| 237 L\$abort | |
| 238 bv (%r2) | |
| 239 .EXIT | |
| 240 $POPMB -$FRAME(%sp),%r3 | |
| 241 .PROCEND | |
| 242 ___ | |
| 243 | |
# Template of private_RC4_set_key and RC4_options, followed by the
# identification strings.
#
# private_RC4_set_key reuses the register names of RC4: $key addresses the
# key schedule, $len holds the key length and $inp addresses the key bytes.
# (@XX[0], @TX[1] and so on are one-element array slices; inside this
# here-document they interpolate to the same text as $XX[0], $TX[1].)
#  - the two leading elements, which RC4 reads as x and y, are zeroed and
#    $key is advanced past them;
#  - L$1st stores 0..255 into consecutive table elements, after which $key
#    is rewound to the start of the table;
#  - L$2nd runs 256 times, adding a table element and a key byte to a
#    running index and exchanging two table elements; %r23 counts from
#    -$len up towards zero as the offset of the current key byte from the
#    end of the key and is reset to -$len when it reaches zero, so the key
#    bytes are used cyclically.
#
# RC4_options returns in %r28 the address of the string at L$opts, formed
# from the address that blr leaves in %r28 (with its two low bits cleared)
# plus the assembly-time distance L$opts-L$pic.
| 244 $code.=<<___; | |
| 245 | |
| 246 .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | |
| 247 .ALIGN 8 | |
| 248 private_RC4_set_key | |
| 249 .PROC | |
| 250 .CALLINFO NO_CALLS | |
| 251 .ENTRY | |
| 252 $ST %r0,`0*$SZ`($key) | |
| 253 $ST %r0,`1*$SZ`($key) | |
| 254 ldo `2*$SZ`($key),$key | |
| 255 copy %r0,@XX[0] | |
| 256 L\$1st | |
| 257 $ST @XX[0],0($key) | |
| 258 ldo 1(@XX[0]),@XX[0] | |
| 259 bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 | |
| 260 ldo $SZ($key),$key | |
| 261 | |
| 262 ldo `-256*$SZ`($key),$key ; rewind $key | |
| 263 addl $len,$inp,$inp ; $inp to point at the end | |
| 264 sub %r0,$len,%r23 ; inverse index | |
| 265 copy %r0,@XX[0] | |
| 266 copy %r0,@XX[1] | |
| 267 ldi 0xff,$mask | |
| 268 | |
| 269 L\$2nd | |
| 270 $LDX @XX[0]($key),@TX[0] | |
| 271 ldbx %r23($inp),@TX[1] | |
| 272 addi,nuv 1,%r23,%r23 ; increment and conditional | |
| 273 sub %r0,$len,%r23 ; inverse index | |
| 274 addl @TX[0],@XX[1],@XX[1] | |
| 275 addl @TX[1],@XX[1],@XX[1] | |
| 276 and $mask,@XX[1],@XX[1] | |
| 277 $MKX @XX[0],$key,$TY | |
| 278 $LDX @XX[1]($key),@TX[1] | |
| 279 $MKX @XX[1],$key,$YY | |
| 280 ldo 1(@XX[0]),@XX[0] | |
| 281 $ST @TX[0],0($YY) | |
| 282 bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 | |
| 283 $ST @TX[1],0($TY) | |
| 284 | |
| 285 bv,n (%r2) | |
| 286 .EXIT | |
| 287 nop | |
| 288 .PROCEND | |
| 289 | |
| 290 .EXPORT RC4_options,ENTRY | |
| 291 .ALIGN 8 | |
| 292 RC4_options | |
| 293 .PROC | |
| 294 .CALLINFO NO_CALLS | |
| 295 .ENTRY | |
| 296 blr %r0,%r28 | |
| 297 ldi 3,%r1 | |
| 298 L\$pic | |
| 299 andcm %r28,%r1,%r28 | |
| 300 bv (%r2) | |
| 301 .EXIT | |
| 302 ldo L\$opts-L\$pic(%r28),%r28 | |
| 303 .PROCEND | |
| 304 .ALIGN 8 | |
| 305 L\$opts | |
| 306 .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" | |
| 307 .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | |
| 308 ___ | |
# Post-process the template and write it out.
#
# 1. Every `...` span is replaced by the result of evaluating its contents
#    as Perl; this resolves the frame offsets, the $SZ multiples and the
#    conditional sprintf lines produced by unrolledloopbody.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. The template is written with the 64-bit "cmpib,*" spelling; for the
#    32-bit flavour it is rewritten to comib.
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
# NOTE(review): later upstream revisions of this module also rewrite "bv"
# to "bve" for the 64-bit flavour - confirm whether that is wanted here.

print $code;
# Buffered write errors (full disk, broken pipe) only surface at close, so
# the result has to be checked or a truncated file would pass as success.
close STDOUT or die "error closing STDOUT: $!";
| OLD | NEW |