OLD | NEW |
1 #!/usr/local/bin/perl | 1 #!/usr/bin/env perl |
2 | 2 |
| 3 # ==================================================================== |
| 4 # [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 # project. The module is, however, dual licensed under OpenSSL and |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. |
| 8 # ==================================================================== |
| 9 |
3 # At some point it became apparent that the original SSLeay RC4 | 10 # At some point it became apparent that the original SSLeay RC4 |
4 # assembler implementation performs suboptimaly on latest IA-32 | 11 # assembler implementation performs suboptimally on latest IA-32 |
5 # microarchitectures. After re-tuning, performance has changed as | 12 # microarchitectures. After re-tuning, performance has changed as |
6 # follows: | 13 # follows: |
7 # | 14 # |
8 # Pentium» +0% | 15 # Pentium» -10% |
9 # Pentium III» +17% | 16 # Pentium III» +12% |
10 # AMD» » +52%(*) | 17 # AMD» » +50%(*) |
11 # P4» » +180%(**) | 18 # P4» » +250%(**) |
12 # | 19 # |
13 # (*) This number is actually a trade-off:-) It's possible to | 20 # (*) This number is actually a trade-off:-) It's possible to |
14 # achieve +72%, but at the cost of -48% off PIII performance. | 21 # achieve +72%, but at the cost of -48% off PIII performance. |
15 # In other words code performing further 13% faster on AMD | 22 # In other words code performing further 13% faster on AMD |
16 # would perform almost 2 times slower on Intel PIII... | 23 # would perform almost 2 times slower on Intel PIII... |
17 # For reference! This code delivers ~80% of rc4-amd64.pl | 24 # For reference! This code delivers ~80% of rc4-amd64.pl |
18 # performance on the same Opteron machine. | 25 # performance on the same Opteron machine. |
19 # (**) This number requires compressed key schedule set up by | 26 # (**) This number requires compressed key schedule set up by |
20 #» RC4_set_key and therefore doesn't apply to 0.9.7 [option for | 27 #» RC4_set_key [see commentary below for further details]. |
21 #» compressed key schedule is implemented in 0.9.8 and later, | |
22 #» see commentary section in rc4_skey.c for further details]. | |
23 # | 28 # |
24 # <appro@fy.chalmers.se> | 29 # <appro@fy.chalmers.se> |
25 | 30 |
26 push(@INC,"perlasm","../../perlasm"); | 31 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 32 push(@INC,"${dir}","${dir}../../perlasm"); |
27 require "x86asm.pl"; | 33 require "x86asm.pl"; |
28 | 34 |
29 &asm_init($ARGV[0],"rc4-586.pl"); | 35 &asm_init($ARGV[0],"rc4-586.pl"); |
30 | 36 |
31 $x="eax"; | 37 $xx="eax"; |
32 $y="ebx"; | 38 $yy="ebx"; |
33 $tx="ecx"; | 39 $tx="ecx"; |
34 $ty="edx"; | 40 $ty="edx"; |
35 $in="esi"; | 41 $inp="esi"; |
| 42 $out="ebp"; |
| 43 $dat="edi"; |
| 44 |
| 45 sub RC4_loop { |
| 46 my $i=shift; |
| 47 my $func = ($i==0)?*mov:*or; |
| 48 |
| 49 » &add» (&LB($yy),&LB($tx)); |
| 50 » &mov» ($ty,&DWP(0,$dat,$yy,4)); |
| 51 » &mov» (&DWP(0,$dat,$yy,4),$tx); |
| 52 » &mov» (&DWP(0,$dat,$xx,4),$ty); |
| 53 » &add» ($ty,$tx); |
| 54 » &inc» (&LB($xx)); |
| 55 » &and» ($ty,0xff); |
| 56 » &ror» ($out,8)» if ($i!=0); |
| 57 » if ($i<3) { |
| 58 » &mov» ($tx,&DWP(0,$dat,$xx,4)); |
| 59 » } else { |
| 60 » &mov» ($tx,&wparam(3));» # reload [re-biased] out |
| 61 » } |
| 62 » &$func» ($out,&DWP(0,$dat,$ty,4)); |
| 63 } |
| 64 |
| 65 # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); |
| 66 &function_begin("RC4"); |
| 67 » &mov» ($dat,&wparam(0));» # load key schedule pointer |
| 68 » &mov» ($ty, &wparam(1));» # load len |
| 69 » &mov» ($inp,&wparam(2));» # load inp |
| 70 » &mov» ($out,&wparam(3));» # load out |
| 71 |
| 72 » &xor» ($xx,$xx);» » # avoid partial register stalls |
| 73 » &xor» ($yy,$yy); |
| 74 |
| 75 » &cmp» ($ty,0);» » # safety net |
| 76 » &je» (&label("abort")); |
| 77 |
| 78 » &mov» (&LB($xx),&BP(0,$dat));»# load key->x |
| 79 » &mov» (&LB($yy),&BP(4,$dat));»# load key->y |
| 80 » &add» ($dat,8); |
| 81 |
| 82 » &lea» ($tx,&DWP(0,$inp,$ty)); |
| 83 » &sub» ($out,$inp);» » # re-bias out |
| 84 » &mov» (&wparam(1),$tx);» # save input+len |
| 85 |
| 86 » &inc» (&LB($xx)); |
| 87 |
| 88 » # detect compressed key schedule... |
| 89 » &cmp» (&DWP(256,$dat),-1); |
| 90 » &je» (&label("RC4_CHAR")); |
| 91 |
| 92 » &mov» ($tx,&DWP(0,$dat,$xx,4)); |
| 93 |
| 94 » &and» ($ty,-4);» » # how many 4-byte chunks? |
| 95 » &jz» (&label("loop1")); |
| 96 |
| 97 » &lea» ($ty,&DWP(-4,$inp,$ty)); |
| 98 » &mov» (&wparam(2),$ty);» # save input+(len/4)*4-4 |
| 99 » &mov» (&wparam(3),$out);» # $out as accumulator in this loop |
| 100 |
| 101 » &set_label("loop4",16); |
| 102 » » for ($i=0;$i<4;$i++) { RC4_loop($i); } |
| 103 » » &ror» ($out,8); |
| 104 » » &xor» ($out,&DWP(0,$inp)); |
| 105 » » &cmp» ($inp,&wparam(2));» # compare to input+(len/4)*4-4 |
| 106 » » &mov» (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here |
| 107 » » &lea» ($inp,&DWP(4,$inp)); |
| 108 » » &mov» ($tx,&DWP(0,$dat,$xx,4)); |
| 109 » &jb» (&label("loop4")); |
| 110 |
| 111 » &cmp» ($inp,&wparam(1));» # compare to input+len |
| 112 » &je» (&label("done")); |
| 113 » &mov» ($out,&wparam(3));» # restore $out |
| 114 |
| 115 » &set_label("loop1",16); |
| 116 » » &add» (&LB($yy),&LB($tx)); |
| 117 » » &mov» ($ty,&DWP(0,$dat,$yy,4)); |
| 118 » » &mov» (&DWP(0,$dat,$yy,4),$tx); |
| 119 » » &mov» (&DWP(0,$dat,$xx,4),$ty); |
| 120 » » &add» ($ty,$tx); |
| 121 » » &inc» (&LB($xx)); |
| 122 » » &and» ($ty,0xff); |
| 123 » » &mov» ($ty,&DWP(0,$dat,$ty,4)); |
| 124 » » &xor» (&LB($ty),&BP(0,$inp)); |
| 125 » » &lea» ($inp,&DWP(1,$inp)); |
| 126 » » &mov» ($tx,&DWP(0,$dat,$xx,4)); |
| 127 » » &cmp» ($inp,&wparam(1));» # compare to input+len |
| 128 » » &mov» (&BP(-1,$out,$inp),&LB($ty)); |
| 129 » &jb» (&label("loop1")); |
| 130 |
| 131 » &jmp» (&label("done")); |
| 132 |
| 133 # this is essentially Intel P4 specific codepath... |
| 134 &set_label("RC4_CHAR",16); |
| 135 » &movz» ($tx,&BP(0,$dat,$xx)); |
| 136 » # strangely enough unrolled loop performs over 20% slower... |
| 137 » &set_label("cloop1"); |
| 138 » » &add» (&LB($yy),&LB($tx)); |
| 139 » » &movz» ($ty,&BP(0,$dat,$yy)); |
| 140 » » &mov» (&BP(0,$dat,$yy),&LB($tx)); |
| 141 » » &mov» (&BP(0,$dat,$xx),&LB($ty)); |
| 142 » » &add» (&LB($ty),&LB($tx)); |
| 143 » » &movz» ($ty,&BP(0,$dat,$ty)); |
| 144 » » &add» (&LB($xx),1); |
| 145 » » &xor» (&LB($ty),&BP(0,$inp)); |
| 146 » » &lea» ($inp,&DWP(1,$inp)); |
| 147 » » &movz» ($tx,&BP(0,$dat,$xx)); |
| 148 » » &cmp» ($inp,&wparam(1)); |
| 149 » » &mov» (&BP(-1,$out,$inp),&LB($ty)); |
| 150 » &jb» (&label("cloop1")); |
| 151 |
| 152 &set_label("done"); |
| 153 » &dec» (&LB($xx)); |
| 154 » &mov» (&BP(-4,$dat),&LB($yy));» # save key->y |
| 155 » &mov» (&BP(-8,$dat),&LB($xx));» # save key->x |
| 156 &set_label("abort"); |
| 157 &function_end("RC4"); |
| 158 |
| 159 ######################################################################## |
| 160 |
| 161 $inp="esi"; |
36 $out="edi"; | 162 $out="edi"; |
37 $d="ebp"; | 163 $idi="ebp"; |
38 | 164 $ido="ecx"; |
39 &RC4("RC4"); | 165 $idx="edx"; |
| 166 |
| 167 &external_label("OPENSSL_ia32cap_P"); |
| 168 |
| 169 # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); |
| 170 &function_begin("RC4_set_key"); |
| 171 » &mov» ($out,&wparam(0));» » # load key |
| 172 » &mov» ($idi,&wparam(1));» » # load len |
| 173 » &mov» ($inp,&wparam(2));» » # load data |
| 174 » &picmeup($idx,"OPENSSL_ia32cap_P"); |
| 175 |
| 176 » &lea» ($out,&DWP(2*4,$out));» » # &key->data |
| 177 » &lea» ($inp,&DWP(0,$inp,$idi));» # $inp to point at the end |
| 178 » &neg» ($idi); |
| 179 » &xor» ("eax","eax"); |
| 180 » &mov» (&DWP(-4,$out),$idi);» » # borrow key->y |
| 181 |
| 182 » &bt» (&DWP(0,$idx),20);» » # check for bit#20 |
| 183 » &jc» (&label("c1stloop")); |
| 184 |
| 185 &set_label("w1stloop",16); |
| 186 » &mov» (&DWP(0,$out,"eax",4),"eax");» # key->data[i]=i; |
| 187 » &add» (&LB("eax"),1);»» » # i++; |
| 188 » &jnc» (&label("w1stloop")); |
| 189 |
| 190 » &xor» ($ido,$ido); |
| 191 » &xor» ($idx,$idx); |
| 192 |
| 193 &set_label("w2ndloop",16); |
| 194 » &mov» ("eax",&DWP(0,$out,$ido,4)); |
| 195 » &add» (&LB($idx),&BP(0,$inp,$idi)); |
| 196 » &add» (&LB($idx),&LB("eax")); |
| 197 » &add» ($idi,1); |
| 198 » &mov» ("ebx",&DWP(0,$out,$idx,4)); |
| 199 » &jnz» (&label("wnowrap")); |
| 200 » &mov» ($idi,&DWP(-4,$out)); |
| 201 » &set_label("wnowrap"); |
| 202 » &mov» (&DWP(0,$out,$idx,4),"eax"); |
| 203 » &mov» (&DWP(0,$out,$ido,4),"ebx"); |
| 204 » &add» (&LB($ido),1); |
| 205 » &jnc» (&label("w2ndloop")); |
| 206 &jmp» (&label("exit")); |
| 207 |
| 208 # Unlike all other x86 [and x86_64] implementations, Intel P4 core |
| 209 # [including EM64T] was found to perform poorly with above "32-bit" key |
| 210 # schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded |
| 211 # assembler turned out to be 3.5x if re-coded for compressed 8-bit one, |
| 212 # a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit |
| 213 # schedule for x86[_64], because non-P4 implementations suffer from |
| 214 # significant performance losses then, e.g. PIII exhibits >2x |
| 215 # deterioration, and so does Opteron. In order to assure optimal |
| 216 # all-round performance, we detect P4 at run-time and set up compressed |
| 217 # key schedule, which is recognized by RC4 procedure. |
| 218 |
| 219 &set_label("c1stloop",16); |
| 220 » &mov» (&BP(0,$out,"eax"),&LB("eax"));»# key->data[i]=i; |
| 221 » &add» (&LB("eax"),1);»» » # i++; |
| 222 » &jnc» (&label("c1stloop")); |
| 223 |
| 224 » &xor» ($ido,$ido); |
| 225 » &xor» ($idx,$idx); |
| 226 » &xor» ("ebx","ebx"); |
| 227 |
| 228 &set_label("c2ndloop",16); |
| 229 » &mov» (&LB("eax"),&BP(0,$out,$ido)); |
| 230 » &add» (&LB($idx),&BP(0,$inp,$idi)); |
| 231 » &add» (&LB($idx),&LB("eax")); |
| 232 » &add» ($idi,1); |
| 233 » &mov» (&LB("ebx"),&BP(0,$out,$idx)); |
| 234 » &jnz» (&label("cnowrap")); |
| 235 » &mov» ($idi,&DWP(-4,$out)); |
| 236 » &set_label("cnowrap"); |
| 237 » &mov» (&BP(0,$out,$idx),&LB("eax")); |
| 238 » &mov» (&BP(0,$out,$ido),&LB("ebx")); |
| 239 » &add» (&LB($ido),1); |
| 240 » &jnc» (&label("c2ndloop")); |
| 241 |
| 242 » &mov» (&DWP(256,$out),-1);» » # mark schedule as compressed |
| 243 |
| 244 &set_label("exit"); |
| 245 » &xor» ("eax","eax"); |
| 246 » &mov» (&DWP(-8,$out),"eax");» » # key->x=0; |
| 247 » &mov» (&DWP(-4,$out),"eax");» » # key->y=0; |
| 248 &function_end("RC4_set_key"); |
| 249 |
| 250 # const char *RC4_options(void); |
| 251 &function_begin_B("RC4_options"); |
| 252 » &call» (&label("pic_point")); |
| 253 &set_label("pic_point"); |
| 254 » &blindpop("eax"); |
| 255 » &lea» ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); |
| 256 » &picmeup("edx","OPENSSL_ia32cap_P"); |
| 257 » &bt» (&DWP(0,"edx"),20); |
| 258 » &jnc» (&label("skip")); |
| 259 » &add» ("eax",12); |
| 260 » &set_label("skip"); |
| 261 » &ret» (); |
| 262 &set_label("opts",64); |
| 263 &asciz» ("rc4(4x,int)"); |
| 264 &asciz» ("rc4(1x,char)"); |
| 265 &asciz» ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>"); |
| 266 &align» (64); |
| 267 &function_end_B("RC4_options"); |
40 | 268 |
41 &asm_finish(); | 269 &asm_finish(); |
42 | 270 |
43 sub RC4_loop | |
44 { | |
45 local($n,$p,$char)=@_; | |
46 | |
47 &comment("Round $n"); | |
48 | |
49 if ($char) | |
50 { | |
51 if ($p >= 0) | |
52 { | |
53 &mov($ty, &swtmp(2)); | |
54 &cmp($ty, $in); | |
55 &jbe(&label("finished")); | |
56 &inc($in); | |
57 } | |
58 else | |
59 { | |
60 &add($ty, 8); | |
61 &inc($in); | |
62 &cmp($ty, $in); | |
63 &jb(&label("finished")); | |
64 &mov(&swtmp(2), $ty); | |
65 } | |
66 } | |
67 # Moved out | |
68 # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0; | |
69 | |
70 &add( &LB($y), &LB($tx)); | |
71 &mov( $ty, &DWP(0,$d,$y,4)); | |
72 # XXX | |
73 &mov( &DWP(0,$d,$x,4),$ty); | |
74 &add( $ty, $tx); | |
75 &mov( &DWP(0,$d,$y,4),$tx); | |
76 &and( $ty, 0xff); | |
77 &inc( &LB($x)); # NEXT ROUND | |
78 &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND | |
79 &mov( $ty, &DWP(0,$d,$ty,4)); | |
80 | |
81 if (!$char) | |
82 { | |
83 #moved up into last round | |
84 if ($p >= 1) | |
85 { | |
86 &add( $out, 8) | |
87 } | |
88 &movb( &BP($n,"esp","",0), &LB($ty)); | |
89 } | |
90 else | |
91 { | |
92 # Note in+=8 has occurred | |
93 &movb( &HB($ty), &BP(-1,$in,"",0)); | |
94 # XXX | |
95 &xorb(&LB($ty), &HB($ty)); | |
96 # XXX | |
97 &movb(&BP($n,$out,"",0),&LB($ty)); | |
98 } | |
99 } | |
100 | |
101 | |
102 sub RC4 | |
103 { | |
104 local($name)=@_; | |
105 | |
106 &function_begin_B($name,""); | |
107 | |
108 &mov($ty,&wparam(1)); # len | |
109 &cmp($ty,0); | |
110 &jne(&label("proceed")); | |
111 &ret(); | |
112 &set_label("proceed"); | |
113 | |
114 &comment(""); | |
115 | |
116 &push("ebp"); | |
117 &push("ebx"); | |
118 &push("esi"); | |
119 &xor( $x, $x); # avoid partial register stalls | |
120 &push("edi"); | |
121 &xor( $y, $y); # avoid partial register stalls | |
122 &mov( $d, &wparam(0)); # key | |
123 &mov( $in, &wparam(2)); | |
124 | |
125 &movb( &LB($x), &BP(0,$d,"",1)); | |
126 &movb( &LB($y), &BP(4,$d,"",1)); | |
127 | |
128 &mov( $out, &wparam(3)); | |
129 &inc( &LB($x)); | |
130 | |
131 &stack_push(3); # 3 temp variables | |
132 &add( $d, 8); | |
133 | |
134 # detect compressed schedule, see commentary section in rc4_skey.c... | |
135 # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, | |
136 # as compressed key schedule is set up in 0.9.8 and later. | |
137 &cmp(&DWP(256,$d),-1); | |
138 &je(&label("RC4_CHAR")); | |
139 | |
140 &lea( $ty, &DWP(-8,$ty,$in)); | |
141 | |
142 # check for 0 length input | |
143 | |
144 &mov( &swtmp(2), $ty); # this is now address to exit at | |
145 &mov( $tx, &DWP(0,$d,$x,4)); | |
146 | |
147 &cmp( $ty, $in); | |
148 &jb( &label("end")); # less than 8 bytes | |
149 | |
150 &set_label("start"); | |
151 | |
152 # filling DELAY SLOT | |
153 &add( $in, 8); | |
154 | |
155 &RC4_loop(0,-1,0); | |
156 &RC4_loop(1,0,0); | |
157 &RC4_loop(2,0,0); | |
158 &RC4_loop(3,0,0); | |
159 &RC4_loop(4,0,0); | |
160 &RC4_loop(5,0,0); | |
161 &RC4_loop(6,0,0); | |
162 &RC4_loop(7,1,0); | |
163 | |
164 &comment("apply the cipher text"); | |
165 # xor the cipher data with input | |
166 | |
167 #&add( $out, 8); #moved up into last round | |
168 | |
169 &mov( $tx, &swtmp(0)); | |
170 &mov( $ty, &DWP(-8,$in,"",0)); | |
171 &xor( $tx, $ty); | |
172 &mov( $ty, &DWP(-4,$in,"",0)); | |
173 &mov( &DWP(-8,$out,"",0), $tx); | |
174 &mov( $tx, &swtmp(1)); | |
175 &xor( $tx, $ty); | |
176 &mov( $ty, &swtmp(2)); # load end ptr; | |
177 &mov( &DWP(-4,$out,"",0), $tx); | |
178 &mov( $tx, &DWP(0,$d,$x,4)); | |
179 &cmp($in, $ty); | |
180 &jbe(&label("start")); | |
181 | |
182 &set_label("end"); | |
183 | |
184 # There is quite a bit of extra crap in RC4_loop() for this | |
185 # first round | |
186 &RC4_loop(0,-1,1); | |
187 &RC4_loop(1,0,1); | |
188 &RC4_loop(2,0,1); | |
189 &RC4_loop(3,0,1); | |
190 &RC4_loop(4,0,1); | |
191 &RC4_loop(5,0,1); | |
192 &RC4_loop(6,1,1); | |
193 | |
194 &jmp(&label("finished")); | |
195 | |
196 &align(16); | |
197 # this is essentially Intel P4 specific codepath, see rc4_skey.c, | |
198 # and is engaged in 0.9.8 and later context... | |
199 &set_label("RC4_CHAR"); | |
200 | |
201 &lea ($ty,&DWP(0,$in,$ty)); | |
202 &mov (&swtmp(2),$ty); | |
203 &movz ($tx,&BP(0,$d,$x)); | |
204 | |
205 # strangely enough unrolled loop performs over 20% slower... | |
206 &set_label("RC4_CHAR_loop"); | |
207 &add (&LB($y),&LB($tx)); | |
208 &movz ($ty,&BP(0,$d,$y)); | |
209 &movb (&BP(0,$d,$y),&LB($tx)); | |
210 &movb (&BP(0,$d,$x),&LB($ty)); | |
211 &add (&LB($ty),&LB($tx)); | |
212 &movz ($ty,&BP(0,$d,$ty)); | |
213 &add (&LB($x),1); | |
214 &xorb (&LB($ty),&BP(0,$in)); | |
215 &lea ($in,&DWP(1,$in)); | |
216 &movz ($tx,&BP(0,$d,$x)); | |
217 &cmp ($in,&swtmp(2)); | |
218 &movb (&BP(0,$out),&LB($ty)); | |
219 &lea ($out,&DWP(1,$out)); | |
220 &jb (&label("RC4_CHAR_loop")); | |
221 | |
222 &set_label("finished"); | |
223 &dec( $x); | |
224 &stack_pop(3); | |
225 &movb( &BP(-4,$d,"",0),&LB($y)); | |
226 &movb( &BP(-8,$d,"",0),&LB($x)); | |
227 | |
228 &function_end($name); | |
229 } | |
230 | |
OLD | NEW |