OLD | NEW |
| (Empty) |
1 #!/usr/bin/env perl | |
2 # | |
3 # ==================================================================== | |
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 # project. The module is, however, dual licensed under OpenSSL and | |
6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 # details see http://www.openssl.org/~appro/cryptogams/. | |
8 # ==================================================================== | |
9 # | |
10 # May 2011 | |
11 # | |
12 # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | |
13 # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | |
14 # the time being... Except that it has three code paths: pure integer | |
15 # code suitable for any x86 CPU, MMX code suitable for PIII and later | |
16 # and PCLMULQDQ suitable for Westmere and later. Improvement varies | |
17 # from one benchmark and µ-arch to another. Below are interval values | |
18 # for 163- and 571-bit ECDH benchmarks relative to compiler-generated | |
19 # code: | |
20 # | |
21 # PIII 16%-30% | |
22 # P4 12%-12% | |
23 # Opteron 18%-40% | |
24 # Core2 19%-44% | |
25 # Atom 38%-64% | |
26 # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | |
27 # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | |
28 # | |
29 # Note that above improvement coefficients are not coefficients for | |
30 # bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result | |
31 # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark | |
32 # is more and more dominated by other subroutines, most notably by | |
33 # BN_GF2m_mod[_mul]_arr... | |
34 | |
35 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
36 push(@INC,"${dir}","${dir}../../perlasm"); | |
37 require "x86asm.pl"; | |
38 | |
# Initialise the perlasm backend. $x86only becomes true when the last
# command-line argument is exactly "386"; the MMX and PCLMULQDQ code paths
# further down are only generated when it is false.
39 &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); | |
40 | |
# $sse2 is set when any command-line argument contains -DOPENSSL_IA32_SSE2;
# it gates the PCLMULQDQ path and the external_label declaration below.
41 $sse2=0; | |
42 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |
43 | |
44 &external_label("OPENSSL_ia32cap_P") if ($sse2); | |
45 | |
# Register names shared by the code generators below.
# $a/$b: the two 32-bit operands handed to the _mul_1x1_* subroutines.
46 $a="eax"; | |
47 $b="ebx"; | |
# $a1/$a2/$a4: scratch registers used while building the 8-entry lookup table.
48 ($a1,$a2,$a4)=("ecx","edx","ebp"); | |
49 | |
# MMX registers: $R accumulates the result, @T are temporaries, $A/$B hold
# copies of the operands and $B30/$B31 the corrections for the top two bits
# of $a. Note that $A and @T[1] both name mm2.
50 $R="mm0"; | |
51 @T=("mm1","mm2"); | |
52 ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); | |
# @i: the two registers that alternately hold a 3-bit table index.
53 @i=("esi","edi"); | |
54 | |
# _mul_1x1_mmx: MMX code path for one 32x32-bit polynomial (XOR-accumulated)
# multiplication. Generated only when $x86only is false.
#   in:  $a (eax) and $b (ebx) hold the two 32-bit operands
#   out: the product is accumulated in $R (mm0)
# The routine modifies ecx, edx, ebp, esi, edi, ebx and mm1-mm5 and uses
# 32+4 bytes of stack for an 8-entry dword table.
#
# Method: a table of the XOR combinations of a1=$a&0x3fffffff,
# a2=($a<<1)&0x7fffffff and a4=$a<<2 is stored at 0..7*4(%esp). $b is then
# consumed three bits at a time; each 3-bit group indexes the table and the
# entry is shifted left by 3*n and XORed into $R (11 lookups, n=0..10).
# Bits 30 and 31 of $a are masked out of a1/a2, so their contribution is
# added separately through $B30 and $B31.
55 if (!$x86only) { | |
56 &function_begin_B("_mul_1x1_mmx"); | |
57 &sub ("esp",32+4); | |
58 &mov ($a1,$a); | |
59 &lea ($a2,&DWP(0,$a,$a)); | |
60 &and ($a1,0x3fffffff); | |
61 &lea ($a4,&DWP(0,$a2,$a2)); | |
62 &mov (&DWP(0*4,"esp"),0); | |
63 &and ($a2,0x7fffffff); | |
64 &movd ($A,$a); | |
65 &movd ($B,$b); | |
66 &mov (&DWP(1*4,"esp"),$a1); # a1 | |
67 &xor ($a1,$a2); # a1^a2 | |
68 &pxor ($B31,$B31); | |
69 &pxor ($B30,$B30); | |
70 &mov (&DWP(2*4,"esp"),$a2); # a2 | |
71 &xor ($a2,$a4); # a2^a4 | |
72 &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | |
73 &pcmpgtd($B31,$A); # broadcast 31st bit | |
74 &paddd ($A,$A); # $A<<=1 | |
75 &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | |
76 &mov (&DWP(4*4,"esp"),$a4); # a4 | |
77 &xor ($a4,$a2); # a2=a4^a2^a4 | |
78 &pand ($B31,$B); | |
79 &pcmpgtd($B30,$A); # broadcast 30th bit | |
80 &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | |
81 &xor ($a4,$a1); # a1^a2^a4 | |
82 &psllq ($B31,31); | |
83 &pand ($B30,$B); | |
84 &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | |
85 &mov (@i[0],0x7); | |
86 &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | |
# The table is complete; from here on $a4 is reused to hold the 0x7 index mask.
87 &mov ($a4,@i[0]); | |
88 &and (@i[0],$b); | |
89 &shr ($b,3); | |
90 &mov (@i[1],$a4); | |
91 &psllq ($B30,30); | |
92 &and (@i[1],$b); | |
93 &shr ($b,3); | |
# First lookup (n=0) initialises $R directly, no shift needed.
94 &movd ($R,&DWP(0,"esp",@i[0],4)); | |
95 &mov (@i[0],$a4); | |
96 &and (@i[0],$b); | |
97 &shr ($b,3); | |
# Lookups n=1..8: fetch the entry for the current index, shift it by 3*n and
# XOR it into $R while the index after next is already being extracted.
98 for($n=1;$n<9;$n++) { | |
99 &movd (@T[1],&DWP(0,"esp",@i[1],4)); | |
100 &mov (@i[1],$a4); | |
101 &psllq (@T[1],3*$n); | |
102 &and (@i[1],$b); | |
103 &shr ($b,3); | |
104 &pxor ($R,@T[1]); | |
105 | |
# Generation-time rotation of the register-name lists, so that successive
# unrolled iterations alternate between esi/edi and mm1/mm2.
106 push(@i,shift(@i)); push(@T,shift(@T)); | |
107 } | |
# Last two lookups (n=9 and n=10), interleaved with folding in $B30 and $B31.
108 &movd (@T[1],&DWP(0,"esp",@i[1],4)); | |
109 &pxor ($R,$B30); | |
110 &psllq (@T[1],3*$n++); | |
111 &pxor ($R,@T[1]); | |
112 | |
113 &movd (@T[0],&DWP(0,"esp",@i[0],4)); | |
114 &pxor ($R,$B31); | |
115 &psllq (@T[0],3*$n); | |
116 &add ("esp",32+4); | |
117 &pxor ($R,@T[0]); | |
118 &ret (); | |
119 &function_end_B("_mul_1x1_mmx"); | |
120 } | |
121 | |
# _mul_1x1_ialu: integer-only code path for one 32x32-bit polynomial
# (XOR-accumulated) multiplication; generated unconditionally.
#   in:  $a (eax) and $b (ebx) hold the two 32-bit operands
#   out: 64-bit product in $hi:$lo (edx:eax)
# The routine modifies ecx, ebp, esi, edi and ebx as well, and uses 32+4
# bytes of stack for the same 8-entry table as the MMX variant: XOR
# combinations of a1=$a&0x3fffffff, a2=($a<<1)&0x7fffffff and a4=$a<<2.
#
# Register-name aliasing to be aware of: $lo is eax (same as $a), $hi is edx
# (same as $a2), and @T reuses ecx/ebp ($a1/$a4) once the table is built.
122 ($lo,$hi)=("eax","edx"); | |
123 @T=("ecx","ebp"); | |
124 | |
125 &function_begin_B("_mul_1x1_ialu"); | |
126 &sub ("esp",32+4); | |
127 &mov ($a1,$a); | |
128 &lea ($a2,&DWP(0,$a,$a)); | |
129 &lea ($a4,&DWP(0,"",$a,4)); | |
130 &and ($a1,0x3fffffff); | |
# @i[1] = $a<<1, so its sign bit is bit 30 of $a (broadcast further down).
131 &lea (@i[1],&DWP(0,$lo,$lo)); | |
132 &sar ($lo,31); # broadcast 31st bit | |
133 &mov (&DWP(0*4,"esp"),0); | |
134 &and ($a2,0x7fffffff); | |
135 &mov (&DWP(1*4,"esp"),$a1); # a1 | |
136 &xor ($a1,$a2); # a1^a2 | |
137 &mov (&DWP(2*4,"esp"),$a2); # a2 | |
138 &xor ($a2,$a4); # a2^a4 | |
139 &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | |
140 &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | |
141 &mov (&DWP(4*4,"esp"),$a4); # a4 | |
142 &xor ($a4,$a2); # a2=a4^a2^a4 | |
143 &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | |
144 &xor ($a4,$a1); # a1^a2^a4 | |
145 &sar (@i[1],31); # broadcast 30th bit | |
146 &and ($lo,$b); | |
147 &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | |
148 &and (@i[1],$b); | |
149 &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | |
# Contribution of bit 31 of $a: ($b or 0) shifted left by 31, split into
# the low word ($lo<<31) and the high word ($hi>>1).
150 &mov ($hi,$lo); | |
151 &shl ($lo,31); | |
152 &mov (@T[0],@i[1]); | |
153 &shr ($hi,1); | |
154 | |
# Contribution of bit 30 of $a: ($b or 0) shifted left by 30, split into
# @i[1]<<30 (low word) and @T[0]>>2 (high word); meanwhile the first
# 3-bit table index is extracted into @i[0].
155 &mov (@i[0],0x7); | |
156 &shl (@i[1],30); | |
157 &and (@i[0],$b); | |
158 &shr (@T[0],2); | |
159 &xor ($lo,@i[1]); | |
160 | |
161 &shr ($b,3); | |
162 &mov (@i[1],0x7); # 5-byte instruction!? | |
163 &and (@i[1],$b); | |
164 &shr ($b,3); | |
165 &xor ($hi,@T[0]); | |
# Lookup n=0: the table entry goes into the low word unshifted.
166 &xor ($lo,&DWP(0,"esp",@i[0],4)); | |
167 &mov (@i[0],0x7); | |
168 &and (@i[0],$b); | |
169 &shr ($b,3); | |
# Lookups n=1..8: each table entry is split into a low part (entry<<3n) and
# a high part (entry>>(32-3n)) and XORed into $lo and $hi respectively.
170 for($n=1;$n<9;$n++) { | |
171 &mov (@T[1],&DWP(0,"esp",@i[1],4)); | |
172 &mov (@i[1],0x7); | |
173 &mov (@T[0],@T[1]); | |
174 &shl (@T[1],3*$n); | |
175 &and (@i[1],$b); | |
176 &shr (@T[0],32-3*$n); | |
177 &xor ($lo,@T[1]); | |
178 &shr ($b,3); | |
179 &xor ($hi,@T[0]); | |
180 | |
# Generation-time rotation of the register-name lists between iterations.
181 push(@i,shift(@i)); push(@T,shift(@T)); | |
182 } | |
# Last two lookups (n=9, then n=10 after the in-line $n++); the index
# registers themselves are reused as temporaries for the final entry.
183 &mov (@T[1],&DWP(0,"esp",@i[1],4)); | |
184 &mov (@T[0],@T[1]); | |
185 &shl (@T[1],3*$n); | |
186 &mov (@i[1],&DWP(0,"esp",@i[0],4)); | |
187 &shr (@T[0],32-3*$n); $n++; | |
188 &mov (@i[0],@i[1]); | |
189 &xor ($lo,@T[1]); | |
190 &shl (@i[1],3*$n); | |
191 &xor ($hi,@T[0]); | |
192 &shr (@i[0],32-3*$n); | |
193 &xor ($lo,@i[1]); | |
194 &xor ($hi,@i[0]); | |
195 | |
196 &add ("esp",32+4); | |
197 &ret (); | |
198 &function_end_B("_mul_1x1_ialu"); | |
199 | |
200 # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); | |
# bn_GF2m_mul_2x2: multiplies the two-word polynomials a1:a0 and b1:b0 and
# stores the four-word result at r (wparam(0)); a1,a0,b1,b0 are
# wparam(1)..wparam(4).
#
# Up to three code paths are generated:
#   * PCLMULQDQ (only if $sse2 and !$x86only): a single carry-less multiply;
#   * MMX (only if !$x86only): three calls to _mul_1x1_mmx;
#   * integer ("ialu" label, always generated): three calls to _mul_1x1_ialu.
# The MMX and integer paths compute a1*b1, a0*b0 and (a0+a1)*(b0+b1) and
# combine them Karatsuba-style, with "+" and "-" both being XOR.
# The path is chosen at run time from the two dwords read from
# OPENSSL_ia32cap_P.
201 &function_begin_B("bn_GF2m_mul_2x2"); | |
202 if (!$x86only) { | |
# eax = first capability dword, edx = second capability dword.
203 &picmeup("edx","OPENSSL_ia32cap_P"); | |
204 &mov ("eax",&DWP(0,"edx")); | |
205 &mov ("edx",&DWP(4,"edx")); | |
206 &test ("eax",1<<23); # check MMX bit | |
207 &jz (&label("ialu")); | |
208 if ($sse2) { | |
209 &test ("eax",1<<24); # check FXSR bit | |
210 &jz (&label("mmx")); | |
211 &test ("edx",1<<1); # check PCLMULQDQ bit | |
212 &jz (&label("mmx")); | |
213 | |
# PCLMULQDQ path. Nothing has been pushed yet, so 4(%esp) is r and the 16
# bytes at 8(%esp) are a1,a0,b1,b0. The shuffle swaps the dwords inside each
# 64-bit half (giving a0,a1,b0,b1), then the two halves of xmm0 are
# multiplied with each other and the 128-bit result is stored at r.
214 &movups ("xmm0",&QWP(8,"esp")); | |
215 &shufps ("xmm0","xmm0",0b10110001); | |
216 &pclmulqdq ("xmm0","xmm0",1); | |
217 &mov ("eax",&DWP(4,"esp")); | |
218 &movups (&QWP(0,"eax"),"xmm0"); | |
219 &ret (); | |
220 | |
221 &set_label("mmx",16); | |
222 } | |
# MMX path: save the registers that _mul_1x1_mmx modifies, then keep the
# partial products in mm7 (a1*b1) and mm6 (a0*b0).
223 &push ("ebp"); | |
224 &push ("ebx"); | |
225 &push ("esi"); | |
226 &push ("edi"); | |
227 &mov ($a,&wparam(1)); | |
228 &mov ($b,&wparam(3)); | |
229 &call ("_mul_1x1_mmx"); # a1·b1 | |
230 &movq ("mm7",$R); | |
231 | |
232 &mov ($a,&wparam(2)); | |
233 &mov ($b,&wparam(4)); | |
234 &call ("_mul_1x1_mmx"); # a0·b0 | |
235 &movq ("mm6",$R); | |
236 | |
237 &mov ($a,&wparam(1)); | |
238 &mov ($b,&wparam(3)); | |
239 &xor ($a,&wparam(2)); | |
240 &xor ($b,&wparam(4)); | |
241 &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) | |
242 &pxor ($R,"mm7"); | |
243 &mov ($a,&wparam(0)); | |
244 &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 | |
245 | |
# Split the middle term across the two result halves: its low 32 bits are
# shifted up into the low qword (XORed with a0*b0), its high 32 bits are
# shifted down into the high qword (XORed with a1*b1). Register restores
# are interleaved with the MMX instructions.
246 &movq ($A,$R); | |
247 &psllq ($R,32); | |
248 &pop ("edi"); | |
249 &psrlq ($A,32); | |
250 &pop ("esi"); | |
251 &pxor ($R,"mm6"); | |
252 &pop ("ebx"); | |
253 &pxor ($A,"mm7"); | |
254 &movq (&QWP(0,$a),$R); | |
255 &pop ("ebp"); | |
256 &movq (&QWP(8,$a),$A); | |
257 &emms (); | |
258 &ret (); | |
259 &set_label("ialu",16); | |
260 } | |
# Integer path: save registers and reserve 4+1 dwords of stack; 0..12(%esp)
# hold the two 64-bit partial products between the calls.
261 &push ("ebp"); | |
262 &push ("ebx"); | |
263 &push ("esi"); | |
264 &push ("edi"); | |
265 &stack_push(4+1); | |
266 | |
267 &mov ($a,&wparam(1)); | |
268 &mov ($b,&wparam(3)); | |
269 &call ("_mul_1x1_ialu"); # a1·b1 | |
270 &mov (&DWP(8,"esp"),$lo); | |
271 &mov (&DWP(12,"esp"),$hi); | |
272 | |
273 &mov ($a,&wparam(2)); | |
274 &mov ($b,&wparam(4)); | |
275 &call ("_mul_1x1_ialu"); # a0·b0 | |
276 &mov (&DWP(0,"esp"),$lo); | |
277 &mov (&DWP(4,"esp"),$hi); | |
278 | |
279 &mov ($a,&wparam(1)); | |
280 &mov ($b,&wparam(3)); | |
281 &xor ($a,&wparam(2)); | |
282 &xor ($b,&wparam(4)); | |
283 &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) | |
284 | |
# Reload the partial products: @r[0],@r[1] = a0*b0 (low, high) and
# @r[2],@r[3] = a1*b1 (low, high); $hi:$lo still holds (a0+a1)*(b0+b1).
285 &mov ("ebp",&wparam(0)); | |
286 @r=("ebx","ecx","edi","esi"); | |
287 &mov (@r[0],&DWP(0,"esp")); | |
288 &mov (@r[1],&DWP(4,"esp")); | |
289 &mov (@r[2],&DWP(8,"esp")); | |
290 &mov (@r[3],&DWP(12,"esp")); | |
291 | |
# Result words 0 and 3 are @r[0] and @r[3] unchanged. The XOR sequence
# below leaves word 2 in $hi (mid.hi^@r[1]^@r[2]^@r[3]) and word 1 in $lo
# (mid.lo^@r[0]^@r[1]^@r[2]).
292 &xor ($lo,$hi); | |
293 &xor ($hi,@r[1]); | |
294 &xor ($lo,@r[0]); | |
295 &mov (&DWP(0,"ebp"),@r[0]); | |
296 &xor ($hi,@r[2]); | |
297 &mov (&DWP(12,"ebp"),@r[3]); | |
298 &xor ($lo,@r[3]); | |
299 &stack_pop(4+1); | |
300 &xor ($hi,@r[3]); | |
301 &pop ("edi"); | |
302 &xor ($lo,$hi); | |
303 &pop ("esi"); | |
304 &mov (&DWP(8,"ebp"),$hi); | |
305 &pop ("ebx"); | |
306 &mov (&DWP(4,"ebp"),$lo); | |
307 &pop ("ebp"); | |
308 &ret (); | |
309 &function_end_B("bn_GF2m_mul_2x2"); | |
310 | |
# Identification string for the generated module, followed by the call that
# ends code generation. Both helpers come from x86asm.pl; presumably asciz
# emits the text as a NUL-terminated string and asm_finish writes out the
# accumulated assembly -- confirm against x86asm.pl.
311 &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | |
312 | |
313 &asm_finish(); | |
OLD | NEW |