| OLD | NEW |
| 1 #!/usr/local/bin/perl | 1 #!/usr/bin/env perl |
| 2 | 2 |
| 3 # ==================================================================== |
| 4 # [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 # project. The module is, however, dual licensed under OpenSSL and |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. |
| 8 # ==================================================================== |
| 9 |
| 3 # At some point it became apparent that the original SSLeay RC4 | 10 # At some point it became apparent that the original SSLeay RC4 |
| 4 # assembler implementation performs suboptimaly on latest IA-32 | 11 # assembler implementation performs suboptimally on latest IA-32 |
| 5 # microarchitectures. After re-tuning performance has changed as | 12 # microarchitectures. After re-tuning performance has changed as |
| 6 # follows: | 13 # follows: |
| 7 # | 14 # |
| 8 # Pentium» +0% | 15 # Pentium» -10% |
| 9 # Pentium III» +17% | 16 # Pentium III» +12% |
| 10 # AMD» » +52%(*) | 17 # AMD» » +50%(*) |
| 11 # P4» » +180%(**) | 18 # P4» » +250%(**) |
| 12 # | 19 # |
| 13 # (*) This number is actually a trade-off:-) It's possible to | 20 # (*) This number is actually a trade-off:-) It's possible to |
| 14 # achieve +72%, but at the cost of -48% off PIII performance. | 21 # achieve +72%, but at the cost of -48% off PIII performance. |
| 15 # In other words code performing further 13% faster on AMD | 22 # In other words code performing further 13% faster on AMD |
| 16 # would perform almost 2 times slower on Intel PIII... | 23 # would perform almost 2 times slower on Intel PIII... |
| 17 # For reference! This code delivers ~80% of rc4-amd64.pl | 24 # For reference! This code delivers ~80% of rc4-amd64.pl |
| 18 # performance on the same Opteron machine. | 25 # performance on the same Opteron machine. |
| 19 # (**) This number requires compressed key schedule set up by | 26 # (**) This number requires compressed key schedule set up by |
| 20 #» RC4_set_key and therefore doesn't apply to 0.9.7 [option for | 27 #» RC4_set_key [see commentary below for further details]. |
| 21 #» compressed key schedule is implemented in 0.9.8 and later, | |
| 22 #» see commentary section in rc4_skey.c for further details]. | |
| 23 # | 28 # |
| 24 # <appro@fy.chalmers.se> | 29 # <appro@fy.chalmers.se> |
| 25 | 30 |
| 26 push(@INC,"perlasm","../../perlasm"); | 31 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 32 push(@INC,"${dir}","${dir}../../perlasm"); |
| 27 require "x86asm.pl"; | 33 require "x86asm.pl"; |
| 28 | 34 |
| 29 &asm_init($ARGV[0],"rc4-586.pl"); | 35 &asm_init($ARGV[0],"rc4-586.pl"); |
| 30 | 36 |
| 31 $x="eax"; | 37 $xx="eax"; |
| 32 $y="ebx"; | 38 $yy="ebx"; |
| 33 $tx="ecx"; | 39 $tx="ecx"; |
| 34 $ty="edx"; | 40 $ty="edx"; |
| 35 $in="esi"; | 41 $inp="esi"; |
| 42 $out="ebp"; |
| 43 $dat="edi"; |
| 44 |
| 45 sub RC4_loop {» # emit one round of the 4x-unrolled loop; $i is the round index (called with 0..3) |
| 46 my $i=shift; |
| 47 my $func = ($i==0)?*mov:*or;» # round 0 overwrites $out, later rounds or into it |
| 48 |
| 49 » &add» (&LB($yy),&LB($tx));» » # y += $tx [8-bit]; $tx was preloaded with data[x] |
| 50 » &mov» ($ty,&DWP(0,$dat,$yy,4));» # $ty = data[y] |
| 51 » &mov» (&DWP(0,$dat,$yy,4),$tx);» # data[y] = old data[x] ... |
| 52 » &mov» (&DWP(0,$dat,$xx,4),$ty);» # ... and data[x] = old data[y], i.e. swap |
| 53 » &add» ($ty,$tx); |
| 54 » &inc» (&LB($xx));» » » # x++ [8-bit] for the next round |
| 55 » &and» ($ty,0xff);» » » # $ty = (data[x]+data[y]) & 0xff, index of the output entry |
| 56 » &ror» ($out,8)» if ($i!=0);» # rotate previously collected bytes before merging the next one |
| 57 » if ($i<3) { |
| 58 » &mov» ($tx,&DWP(0,$dat,$xx,4));» # preload data[x] for the next round |
| 59 » } else { |
| 60 » &mov» ($tx,&wparam(3));» # round 3: $tx gets [re-biased] out, which the caller saved in wparam(3) |
| 61 » } |
| 62 » &$func» ($out,&DWP(0,$dat,$ty,4));» # mov [round 0] or or [rounds 1..3] data[$ty] into $out |
| 63 } |
| 64 |
| 65 # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); - XORs len bytes at inp with the key stream into out and saves the updated key->x/key->y |
| 66 &function_begin("RC4"); |
| 67 » &mov» ($dat,&wparam(0));» # wparam(0): key schedule pointer [RC4_KEY *key] |
| 68 » &mov» ($ty, &wparam(1));» # wparam(1): len in bytes |
| 69 » &mov» ($inp,&wparam(2));» # wparam(2): inp, the source bytes |
| 70 » &mov» ($out,&wparam(3));» # wparam(3): out, the destination bytes |
| 71 |
| 72 » &xor» ($xx,$xx);» » # clear full registers, only their low bytes are loaded below [avoids partial register stalls] |
| 73 » &xor» ($yy,$yy); |
| 74 |
| 75 » &cmp» ($ty,0);» » # safety net: nothing to do if len is 0 |
| 76 » &je» (&label("abort")); |
| 77 |
| 78 » &mov» (&LB($xx),&BP(0,$dat));»# load key->x [byte at offset 0] |
| 79 » &mov» (&LB($yy),&BP(4,$dat));»# load key->y [byte at offset 4] |
| 80 » &add» ($dat,8); |
| 81 |
| 82 » &lea» ($tx,&DWP(0,$inp,$ty)); |
| 83 » &sub» ($out,$inp);» » # re-bias out: $out=out-inp, so that output is addressed as ($out,$inp) |
| 84 » &mov» (&wparam(1),$tx);» # wparam(1) now holds inp+len, the end of input |
| 85 |
| 86 » &inc» (&LB($xx)); |
| 87 |
| 88 » # detect compressed [8-bit] key schedule, which RC4_set_key marks with -1 at data+256... |
| 89 » &cmp» (&DWP(256,$dat),-1); |
| 90 » &je» (&label("RC4_CHAR")); |
| 91 |
| 92 » &mov» ($tx,&DWP(0,$dat,$xx,4)); |
| 93 |
| 94 » &and» ($ty,-4);» » # round len down to a multiple of 4; zero means no full 4-byte chunk |
| 95 » &jz» (&label("loop1")); |
| 96 |
| 97 » &lea» ($ty,&DWP(-4,$inp,$ty)); |
| 98 » &mov» (&wparam(2),$ty);» # wparam(2) = inp+(len/4)*4-4, address of the last 4-byte chunk |
| 99 » &mov» (&wparam(3),$out);» # stash re-biased out, as $out serves as 4-byte accumulator in loop4 |
| 100 |
| 101 » &set_label("loop4",16); |
| 102 » » for ($i=0;$i<4;$i++) { RC4_loop($i); } |
| 103 » » &ror» ($out,8); |
| 104 » » &xor» ($out,&DWP(0,$inp)); |
| 105 » » &cmp» ($inp,&wparam(2));» # compare to inp+(len/4)*4-4; mov/lea below leave the flags intact for jb |
| 106 » » &mov» (&DWP(0,$tx,$inp),$out);# store 4 bytes; $tx holds re-biased out here [reloaded in round 3] |
| 107 » » &lea» ($inp,&DWP(4,$inp)); |
| 108 » » &mov» ($tx,&DWP(0,$dat,$xx,4)); |
| 109 » &jb» (&label("loop4")); |
| 110 |
| 111 » &cmp» ($inp,&wparam(1));» # compare to inp+len: are we done, i.e. was len a multiple of 4? |
| 112 » &je» (&label("done")); |
| 113 » &mov» ($out,&wparam(3));» # restore re-biased out for the byte-wise tail loop |
| 114 |
| 115 » &set_label("loop1",16); |
| 116 » » &add» (&LB($yy),&LB($tx)); |
| 117 » » &mov» ($ty,&DWP(0,$dat,$yy,4)); |
| 118 » » &mov» (&DWP(0,$dat,$yy,4),$tx); |
| 119 » » &mov» (&DWP(0,$dat,$xx,4),$ty); |
| 120 » » &add» ($ty,$tx); |
| 121 » » &inc» (&LB($xx)); |
| 122 » » &and» ($ty,0xff); |
| 123 » » &mov» ($ty,&DWP(0,$dat,$ty,4)); |
| 124 » » &xor» (&LB($ty),&BP(0,$inp)); |
| 125 » » &lea» ($inp,&DWP(1,$inp)); |
| 126 » » &mov» ($tx,&DWP(0,$dat,$xx,4)); |
| 127 » » &cmp» ($inp,&wparam(1));» # compare to inp+len [saved in wparam(1)] |
| 128 » » &mov» (&BP(-1,$out,$inp),&LB($ty)); |
| 129 » &jb» (&label("loop1")); |
| 130 |
| 131 » &jmp» (&label("done")); |
| 132 |
| 133 # this is essentially Intel P4 specific codepath, working on byte-sized [compressed] schedule entries... |
| 134 &set_label("RC4_CHAR",16); |
| 135 » &movz» ($tx,&BP(0,$dat,$xx)); |
| 136 » # strangely enough unrolled loop performs over 20% slower... |
| 137 » &set_label("cloop1"); |
| 138 » » &add» (&LB($yy),&LB($tx)); |
| 139 » » &movz» ($ty,&BP(0,$dat,$yy)); |
| 140 » » &mov» (&BP(0,$dat,$yy),&LB($tx)); |
| 141 » » &mov» (&BP(0,$dat,$xx),&LB($ty)); |
| 142 » » &add» (&LB($ty),&LB($tx)); |
| 143 » » &movz» ($ty,&BP(0,$dat,$ty)); |
| 144 » » &add» (&LB($xx),1); |
| 145 » » &xor» (&LB($ty),&BP(0,$inp)); |
| 146 » » &lea» ($inp,&DWP(1,$inp)); |
| 147 » » &movz» ($tx,&BP(0,$dat,$xx)); |
| 148 » » &cmp» ($inp,&wparam(1)); |
| 149 » » &mov» (&BP(-1,$out,$inp),&LB($ty)); |
| 150 » &jb» (&label("cloop1")); |
| 151 |
| 152 &set_label("done"); |
| 153 » &dec» (&LB($xx)); |
| 154 » &mov» (&BP(-4,$dat),&LB($yy));» # save key->y [$dat was advanced by 8 above] |
| 155 » &mov» (&BP(-8,$dat),&LB($xx));» # save key->x [after undoing the look-ahead increment] |
| 156 &set_label("abort"); |
| 157 &function_end("RC4"); |
| 158 |
| 159 ######################################################################## |
| 160 |
| 161 $inp="esi"; |
| 36 $out="edi"; | 162 $out="edi"; |
| 37 $d="ebp"; | 163 $idi="ebp"; |
| 38 | 164 $ido="ecx"; |
| 39 &RC4("RC4"); | 165 $idx="edx"; |
| 166 |
| 167 &external_label("OPENSSL_ia32cap_P"); |
| 168 |
| 169 # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); - sets up key->data from len bytes at data [32-bit or compressed 8-bit entries] and zeroes key->x/key->y |
| 170 &function_begin("RC4_set_key"); |
| 171 » &mov» ($out,&wparam(0));» » # wparam(0): RC4_KEY *key |
| 172 » &mov» ($idi,&wparam(1));» » # wparam(1): len of user key in bytes |
| 173 » &mov» ($inp,&wparam(2));» » # wparam(2): data, the user key bytes |
| 174 » &picmeup($idx,"OPENSSL_ia32cap_P"); |
| 175 |
| 176 » &lea» ($out,&DWP(2*4,$out));» » # $out = &key->data [skip 2*4 bytes of x and y] |
| 177 » &lea» ($inp,&DWP(0,$inp,$idi));» # $inp to point at the end of data, indexed with negative $idi |
| 178 » &neg» ($idi); |
| 179 » &xor» ("eax","eax"); |
| 180 » &mov» (&DWP(-4,$out),$idi);» » # borrow key->y to keep -len, re-loaded into $idi on wrap |
| 181 |
| 182 » &bt» (&DWP(0,$idx),20);» » # check for bit#20 of OPENSSL_ia32cap_P [P4 detection, see commentary below] |
| 183 » &jc» (&label("c1stloop")); |
| 184 |
| 185 &set_label("w1stloop",16); |
| 186 » &mov» (&DWP(0,$out,"eax",4),"eax");» # key->data[i]=i; [32-bit entries] |
| 187 » &add» (&LB("eax"),1);»» » # i++; carry out of the 8-bit add ends the loop after 256 entries |
| 188 » &jnc» (&label("w1stloop")); |
| 189 |
| 190 » &xor» ($ido,$ido); |
| 191 » &xor» ($idx,$idx); |
| 192 |
| 193 &set_label("w2ndloop",16); |
| 194 » &mov» ("eax",&DWP(0,$out,$ido,4)); |
| 195 » &add» (&LB($idx),&BP(0,$inp,$idi)); |
| 196 » &add» (&LB($idx),&LB("eax")); |
| 197 » &add» ($idi,1); |
| 198 » &mov» ("ebx",&DWP(0,$out,$idx,4)); |
| 199 » &jnz» (&label("wnowrap")); |
| 200 » &mov» ($idi,&DWP(-4,$out)); |
| 201 » &set_label("wnowrap"); |
| 202 » &mov» (&DWP(0,$out,$idx,4),"eax"); |
| 203 » &mov» (&DWP(0,$out,$ido,4),"ebx"); |
| 204 » &add» (&LB($ido),1); |
| 205 » &jnc» (&label("w2ndloop")); |
| 206 &jmp» (&label("exit")); |
| 207 |
| 208 # Unlike all other x86 [and x86_64] implementations, Intel P4 core |
| 209 # [including EM64T] was found to perform poorly with above "32-bit" key |
| 210 # schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded |
| 211 # assembler turned out to be 3.5x if re-coded for compressed 8-bit one, |
| 212 # a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit |
| 213 # schedule for x86[_64], because non-P4 implementations suffer from |
| 214 # significant performance losses then, e.g. PIII exhibits >2x |
| 215 # deterioration, and so does Opteron. In order to assure optimal |
| 216 # all-round performance, we detect P4 at run-time and set up compressed |
| 217 # key schedule, which is recognized by RC4 procedure. |
| 218 |
| 219 &set_label("c1stloop",16); |
| 220 » &mov» (&BP(0,$out,"eax"),&LB("eax"));»# key->data[i]=i; [8-bit entries] |
| 221 » &add» (&LB("eax"),1);»» » # i++; carry out of the 8-bit add ends the loop after 256 entries |
| 222 » &jnc» (&label("c1stloop")); |
| 223 |
| 224 » &xor» ($ido,$ido); |
| 225 » &xor» ($idx,$idx); |
| 226 » &xor» ("ebx","ebx"); |
| 227 |
| 228 &set_label("c2ndloop",16); |
| 229 » &mov» (&LB("eax"),&BP(0,$out,$ido)); |
| 230 » &add» (&LB($idx),&BP(0,$inp,$idi)); |
| 231 » &add» (&LB($idx),&LB("eax")); |
| 232 » &add» ($idi,1); |
| 233 » &mov» (&LB("ebx"),&BP(0,$out,$idx)); |
| 234 » &jnz» (&label("cnowrap")); |
| 235 » &mov» ($idi,&DWP(-4,$out)); |
| 236 » &set_label("cnowrap"); |
| 237 » &mov» (&BP(0,$out,$idx),&LB("eax")); |
| 238 » &mov» (&BP(0,$out,$ido),&LB("ebx")); |
| 239 » &add» (&LB($ido),1); |
| 240 » &jnc» (&label("c2ndloop")); |
| 241 |
| 242 » &mov» (&DWP(256,$out),-1);» » # mark schedule as compressed [RC4 tests data+256 for -1] |
| 243 |
| 244 &set_label("exit"); |
| 245 » &xor» ("eax","eax"); |
| 246 » &mov» (&DWP(-8,$out),"eax");» » # key->x=0; |
| 247 » &mov» (&DWP(-4,$out),"eax");» » # key->y=0; [also discards the borrowed -len] |
| 248 &function_end("RC4_set_key"); |
| 249 |
| 250 # const char *RC4_options(void); - eax points at "rc4(4x,int)", or 12 bytes further [at "rc4(1x,char)", presuming asciz appends a NUL] if bit#20 of OPENSSL_ia32cap_P is set |
| 251 &function_begin_B("RC4_options"); |
| 252 » &call» (&label("pic_point")); |
| 253 &set_label("pic_point"); |
| 254 » &blindpop("eax"); |
| 255 » &lea» ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); |
| 256 » &picmeup("edx","OPENSSL_ia32cap_P"); |
| 257 » &bt» (&DWP(0,"edx"),20); |
| 258 » &jnc» (&label("skip")); |
| 259 » &add» ("eax",12); |
| 260 » &set_label("skip"); |
| 261 » &ret» (); |
| 262 &set_label("opts",64); |
| 263 &asciz» ("rc4(4x,int)"); |
| 264 &asciz» ("rc4(1x,char)"); |
| 265 &asciz» ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>"); |
| 266 &align» (64); |
| 267 &function_end_B("RC4_options"); |
| 40 | 268 |
| 41 &asm_finish(); | 269 &asm_finish(); |
| 42 | 270 |
| 43 sub RC4_loop | |
| 44 { | |
| 45 local($n,$p,$char)=@_; | |
| 46 | |
| 47 &comment("Round $n"); | |
| 48 | |
| 49 if ($char) | |
| 50 { | |
| 51 if ($p >= 0) | |
| 52 { | |
| 53 &mov($ty, &swtmp(2)); | |
| 54 &cmp($ty, $in); | |
| 55 &jbe(&label("finished")); | |
| 56 &inc($in); | |
| 57 } | |
| 58 else | |
| 59 { | |
| 60 &add($ty, 8); | |
| 61 &inc($in); | |
| 62 &cmp($ty, $in); | |
| 63 &jb(&label("finished")); | |
| 64 &mov(&swtmp(2), $ty); | |
| 65 } | |
| 66 } | |
| 67 # Moved out [callers preload $tx before the first round] | |
| 68 # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0; | |
| 69 | |
| 70 &add( &LB($y), &LB($tx)); | |
| 71 &mov( $ty, &DWP(0,$d,$y,4)); | |
| 72 # XXX | |
| 73 &mov( &DWP(0,$d,$x,4),$ty); | |
| 74 &add( $ty, $tx); | |
| 75 &mov( &DWP(0,$d,$y,4),$tx); | |
| 76 &and( $ty, 0xff); | |
| 77 &inc( &LB($x)); # NEXT ROUND: advance x | |
| 78 &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND: preload next $tx unless $p is 1 [callers pass 1 on the last round] | |
| 79 &mov( $ty, &DWP(0,$d,$ty,4)); | |
| 80 | |
| 81 if (!$char) | |
| 82 { | |
| 83 # out+=8 moved up into last round [$p >= 1] | |
| 84 if ($p >= 1) | |
| 85 { | |
| 86 &add( $out, 8) | |
| 87 } | |
| 88 &movb( &BP($n,"esp","",0), &LB($ty)); | |
| 89 } | |
| 90 else | |
| 91 { | |
| 92 # Note in has already been advanced, hence the -1 offset | |
| 93 &movb( &HB($ty), &BP(-1,$in,"",0)); | |
| 94 # XXX | |
| 95 &xorb(&LB($ty), &HB($ty)); | |
| 96 # XXX | |
| 97 &movb(&BP($n,$out,"",0),&LB($ty)); | |
| 98 } | |
| 99 } | |
| 100 | |
| 101 | |
| 102 sub RC4 | |
| 103 { | |
| 104 local($name)=@_; | |
| 105 | |
| 106 &function_begin_B($name,""); | |
| 107 | |
| 108 &mov($ty,&wparam(1)); # len; return right away if it is zero | |
| 109 &cmp($ty,0); | |
| 110 &jne(&label("proceed")); | |
| 111 &ret(); | |
| 112 &set_label("proceed"); | |
| 113 | |
| 114 &comment(""); | |
| 115 | |
| 116 &push("ebp"); | |
| 117 &push("ebx"); | |
| 118 &push("esi"); | |
| 119 &xor( $x, $x); # avoid partial register stalls | |
| 120 &push("edi"); | |
| 121 &xor( $y, $y); # avoid partial register stalls | |
| 122 &mov( $d, &wparam(0)); # key | |
| 123 &mov( $in, &wparam(2)); | |
| 124 | |
| 125 &movb( &LB($x), &BP(0,$d,"",1)); | |
| 126 &movb( &LB($y), &BP(4,$d,"",1)); | |
| 127 | |
| 128 &mov( $out, &wparam(3)); | |
| 129 &inc( &LB($x)); | |
| 130 | |
| 131 &stack_push(3); # 3 temp variables: swtmp(2) holds the end pointer, swtmp(0)/swtmp(1) are XORed with input below | |
| 132 &add( $d, 8); | |
| 133 | |
| 134 # detect compressed schedule, see commentary section in rc4_skey.c... | |
| 135 # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, | |
| 136 # as compressed key schedule is set up in 0.9.8 and later. | |
| 137 &cmp(&DWP(256,$d),-1); | |
| 138 &je(&label("RC4_CHAR")); | |
| 139 | |
| 140 &lea( $ty, &DWP(-8,$ty,$in)); | |
| 141 | |
| 142 # check for 0 length input | |
| 143 | |
| 144 &mov( &swtmp(2), $ty); # swtmp(2) = in+len-8, the address to exit the 8-byte loop at | |
| 145 &mov( $tx, &DWP(0,$d,$x,4)); | |
| 146 | |
| 147 &cmp( $ty, $in); | |
| 148 &jb( &label("end")); # less than 8 bytes | |
| 149 | |
| 150 &set_label("start"); | |
| 151 | |
| 152 # filling DELAY SLOT | |
| 153 &add( $in, 8); | |
| 154 | |
| 155 &RC4_loop(0,-1,0); | |
| 156 &RC4_loop(1,0,0); | |
| 157 &RC4_loop(2,0,0); | |
| 158 &RC4_loop(3,0,0); | |
| 159 &RC4_loop(4,0,0); | |
| 160 &RC4_loop(5,0,0); | |
| 161 &RC4_loop(6,0,0); | |
| 162 &RC4_loop(7,1,0); | |
| 163 | |
| 164 &comment("apply the cipher text"); | |
| 165 # xor the cipher data with input | |
| 166 | |
| 167 #&add( $out, 8); #moved up into last round | |
| 168 | |
| 169 &mov( $tx, &swtmp(0)); | |
| 170 &mov( $ty, &DWP(-8,$in,"",0)); | |
| 171 &xor( $tx, $ty); | |
| 172 &mov( $ty, &DWP(-4,$in,"",0)); | |
| 173 &mov( &DWP(-8,$out,"",0), $tx); | |
| 174 &mov( $tx, &swtmp(1)); | |
| 175 &xor( $tx, $ty); | |
| 176 &mov( $ty, &swtmp(2)); # load end ptr; | |
| 177 &mov( &DWP(-4,$out,"",0), $tx); | |
| 178 &mov( $tx, &DWP(0,$d,$x,4)); | |
| 179 &cmp($in, $ty); | |
| 180 &jbe(&label("start")); | |
| 181 | |
| 182 &set_label("end"); | |
| 183 | |
| 184 # There is quite a bit of extra crap in RC4_loop() for this | |
| 185 # first round | |
| 186 &RC4_loop(0,-1,1); | |
| 187 &RC4_loop(1,0,1); | |
| 188 &RC4_loop(2,0,1); | |
| 189 &RC4_loop(3,0,1); | |
| 190 &RC4_loop(4,0,1); | |
| 191 &RC4_loop(5,0,1); | |
| 192 &RC4_loop(6,1,1); | |
| 193 | |
| 194 &jmp(&label("finished")); | |
| 195 | |
| 196 &align(16); | |
| 197 # this is essentially Intel P4 specific codepath, see rc4_skey.c, | |
| 198 # and is engaged in 0.9.8 and later context... | |
| 199 &set_label("RC4_CHAR"); | |
| 200 | |
| 201 &lea ($ty,&DWP(0,$in,$ty)); | |
| 202 &mov (&swtmp(2),$ty); | |
| 203 &movz ($tx,&BP(0,$d,$x)); | |
| 204 | |
| 205 # strangely enough unrolled loop performs over 20% slower... | |
| 206 &set_label("RC4_CHAR_loop"); | |
| 207 &add (&LB($y),&LB($tx)); | |
| 208 &movz ($ty,&BP(0,$d,$y)); | |
| 209 &movb (&BP(0,$d,$y),&LB($tx)); | |
| 210 &movb (&BP(0,$d,$x),&LB($ty)); | |
| 211 &add (&LB($ty),&LB($tx)); | |
| 212 &movz ($ty,&BP(0,$d,$ty)); | |
| 213 &add (&LB($x),1); | |
| 214 &xorb (&LB($ty),&BP(0,$in)); | |
| 215 &lea ($in,&DWP(1,$in)); | |
| 216 &movz ($tx,&BP(0,$d,$x)); | |
| 217 &cmp ($in,&swtmp(2)); | |
| 218 &movb (&BP(0,$out),&LB($ty)); | |
| 219 &lea ($out,&DWP(1,$out)); | |
| 220 &jb (&label("RC4_CHAR_loop")); | |
| 221 | |
| 222 &set_label("finished"); | |
| 223 &dec( $x); | |
| 224 &stack_pop(3); | |
| 225 &movb( &BP(-4,$d,"",0),&LB($y)); | |
| 226 &movb( &BP(-8,$d,"",0),&LB($x)); | |
| 227 | |
| 228 &function_end($name); | |
| 229 } | |
| 230 | |
| OLD | NEW |