| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env perl | |
| 2 # | |
| 3 # ==================================================================== | |
| 4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
| 5 # project. The module is, however, dual licensed under OpenSSL and | |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. | |
| 8 # ==================================================================== | |
| 9 # | |
| 10 # May 2011 | |
| 11 # | |
| 12 # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | |
| 13 # in bn_gf2m.c. It's kind of a low-hanging mechanical port from C for | |
| 14 # the time being... Except that it has three code paths: pure integer | |
| 15 # code suitable for any x86 CPU, MMX code suitable for PIII and later | |
| 16 # and PCLMULQDQ suitable for Westmere and later. Improvement varies | |
| 17 # from one benchmark and µ-arch to another. Below are interval values | |
| 18 # for 163- and 571-bit ECDH benchmarks relative to compiler-generated | |
| 19 # code: | |
| 20 # | |
| 21 # PIII 16%-30% | |
| 22 # P4 12%-12% | |
| 23 # Opteron 18%-40% | |
| 24 # Core2 19%-44% | |
| 25 # Atom 38%-64% | |
| 26 # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | |
| 27 # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | |
| 28 # | |
| 29 # Note that above improvement coefficients are not coefficients for | |
| 30 # bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result | |
| 31 # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark | |
| 32 # is more and more dominated by other subroutines, most notably by | |
| 33 # BN_GF2m_mod[_mul]_arr... | |
| 34 | |
# Derive this script's directory from $0 and add both it and the perlasm
# directory two levels up to @INC before loading x86asm.pl.
| 35 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
| 36 push(@INC,"${dir}","${dir}../../perlasm"); | |
| 37 require "x86asm.pl"; | |
| 38 | |
# $x86only is true when the last command-line argument is "386"; the MMX
# and PCLMULQDQ code paths further down are emitted only when it is false.
| 39 &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); | |
| 40 | |
# $sse2 is set when any argument matches -DOPENSSL_IA32_SSE2.  It gates the
# external label below and the PCLMULQDQ path in bn_GF2m_mul_2x2.
| 41 $sse2=0; | |
| 42 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |
| 43 | |
| 44 &external_label("OPENSSL_ia32cap_P") if ($sse2); | |
| 45 | |
# Integer register aliases used by both 1x1 helpers: $a and $b carry the two
# input words, $a1/$a2/$a4 are scratch registers for building the lookup table.
| 46 $a="eax"; | |
| 47 $b="ebx"; | |
| 48 ($a1,$a2,$a4)=("ecx","edx","ebp"); | |
| 49 | |
# MMX register aliases: $R accumulates the product, @T holds two temporaries
# and @i two table-index registers.  Note that $A and $T[1] both name mm2.
| 50 $R="mm0"; | |
| 51 @T=("mm1","mm2"); | |
| 52 ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); | |
| 53 @i=("esi","edi"); | |
| 54 | |
# _mul_1x1_mmx -- emitted only when $x86only is false.
# Input:  $a (eax) and $b (ebx), two 32-bit polynomial operands.
# Output: $R (mm0), their 64-bit carry-less product.
# Overwrites ebx, ecx, edx, ebp, esi, edi and mm1-mm5; eax is only read.
# Uses 32+4 bytes of stack, of which eight dwords form a lookup table.
| 55 if (!$x86only) { | |
| 56 &function_begin_B("_mul_1x1_mmx"); | |
| 57 &sub ("esp",32+4); | |
# Build the table tab[0..7] at 0..7*4(esp), with the MMX setup interleaved.
# tab[k] is the XOR of those of a1, a2, a4 selected by the bits of k, where
# a1 = a & 0x3fffffff, a2 = (a<<1) & 0x7fffffff and a4 = a<<2.
# Bits 31 and 30 of a are therefore absent from the table; their
# contributions (b<<31 and b<<30) are prepared in $B31/$B30 and XORed into
# $R near the end.
| 58 &mov ($a1,$a); | |
| 59 &lea ($a2,&DWP(0,$a,$a)); | |
| 60 &and ($a1,0x3fffffff); | |
| 61 &lea ($a4,&DWP(0,$a2,$a2)); | |
| 62 &mov (&DWP(0*4,"esp"),0); | |
| 63 &and ($a2,0x7fffffff); | |
| 64 &movd ($A,$a); | |
| 65 &movd ($B,$b); | |
| 66 &mov (&DWP(1*4,"esp"),$a1); # a1 | |
| 67 &xor ($a1,$a2); # a1^a2 | |
| 68 &pxor ($B31,$B31); | |
| 69 &pxor ($B30,$B30); | |
| 70 &mov (&DWP(2*4,"esp"),$a2); # a2 | |
| 71 &xor ($a2,$a4); # a2^a4 | |
| 72 &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | |
| 73 &pcmpgtd($B31,$A); # broadcast 31st bit | |
| 74 &paddd ($A,$A); # $A<<=1 | |
| 75 &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | |
| 76 &mov (&DWP(4*4,"esp"),$a4); # a4 | |
| 77 &xor ($a4,$a2); # a2=a4^a2^a4 | |
| 78 &pand ($B31,$B); | |
| 79 &pcmpgtd($B30,$A); # broadcast 30th bit | |
| 80 &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | |
| 81 &xor ($a4,$a1); # a1^a2^a4 | |
| 82 &psllq ($B31,31); | |
| 83 &pand ($B30,$B); | |
| 84 &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | |
# From here b is consumed in 3-bit windows, low bits first.  After its last
# table value is stored, $a4 is recycled to hold the window mask 7.
| 85 &mov (@i[0],0x7); | |
| 86 &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | |
| 87 &mov ($a4,@i[0]); | |
| 88 &and (@i[0],$b); | |
| 89 &shr ($b,3); | |
| 90 &mov (@i[1],$a4); | |
| 91 &psllq ($B30,30); | |
| 92 &and (@i[1],$b); | |
| 93 &shr ($b,3); | |
| 94 &movd ($R,&DWP(0,"esp",@i[0],4)); | |
| 95 &mov (@i[0],$a4); | |
| 96 &and (@i[0],$b); | |
| 97 &shr ($b,3); | |
# Windows 1..8: load the table entry, extract the index of a later window,
# shift the entry left by 3*n and fold it into $R.  The Perl-side rotation of
# @i and @T swaps the roles of the two index and two temporary registers on
# every unrolled iteration.
| 98 for($n=1;$n<9;$n++) { | |
| 99 &movd (@T[1],&DWP(0,"esp",@i[1],4)); | |
| 100 &mov (@i[1],$a4); | |
| 101 &psllq (@T[1],3*$n); | |
| 102 &and (@i[1],$b); | |
| 103 &shr ($b,3); | |
| 104 &pxor ($R,@T[1]); | |
| 105 | |
| 106 push(@i,shift(@i)); push(@T,shift(@T)); | |
| 107 } | |
# Windows 9 and 10 (shifts of 27 and 30): no further index to extract.  The
# b<<30 and b<<31 terms are folded in here as well.
| 108 &movd (@T[1],&DWP(0,"esp",@i[1],4)); | |
| 109 &pxor ($R,$B30); | |
| 110 &psllq (@T[1],3*$n++); | |
| 111 &pxor ($R,@T[1]); | |
| 112 | |
| 113 &movd (@T[0],&DWP(0,"esp",@i[0],4)); | |
| 114 &pxor ($R,$B31); | |
| 115 &psllq (@T[0],3*$n); | |
| 116 &add ("esp",32+4); | |
| 117 &pxor ($R,@T[0]); | |
| 118 &ret (); | |
| 119 &function_end_B("_mul_1x1_mmx"); | |
| 120 } | |
| 121 | |
# Register aliases for the integer-only helper: the 64-bit result is returned
# in $hi:$lo (edx:eax), and @T is re-pointed at two integer temporaries.
# Note that $lo is the same register as $a, $hi as $a2, $T[0] as $a1 and
# $T[1] as $a4.
| 122 ($lo,$hi)=("eax","edx"); | |
| 123 @T=("ecx","ebp"); | |
| 124 | |
# _mul_1x1_ialu -- always emitted; uses integer instructions only.
# Input:  $a (eax) and $b (ebx), two 32-bit polynomial operands.
# Output: $hi:$lo (edx:eax), their 64-bit carry-less product.
# Overwrites ebx, ecx, ebp, esi and edi in addition to the result registers.
# Uses 32+4 bytes of stack, of which eight dwords form a lookup table.
| 125 &function_begin_B("_mul_1x1_ialu"); | |
| 126 &sub ("esp",32+4); | |
# Build the table tab[0..7] at 0..7*4(esp).  tab[k] is the XOR of those of
# a1, a2, a4 selected by the bits of k, where a1 = a & 0x3fffffff,
# a2 = (a<<1) & 0x7fffffff and a4 = a<<2.  Interleaved with that, bits 31 and
# 30 of a are broadcast into full-word masks ($lo and $i[1]) and ANDed with b,
# because the table entries do not carry those two bits.
| 127 &mov ($a1,$a); | |
| 128 &lea ($a2,&DWP(0,$a,$a)); | |
| 129 &lea ($a4,&DWP(0,"",$a,4)); | |
| 130 &and ($a1,0x3fffffff); | |
| 131 &lea (@i[1],&DWP(0,$lo,$lo)); | |
| 132 &sar ($lo,31); # broadcast 31st bit | |
| 133 &mov (&DWP(0*4,"esp"),0); | |
| 134 &and ($a2,0x7fffffff); | |
| 135 &mov (&DWP(1*4,"esp"),$a1); # a1 | |
| 136 &xor ($a1,$a2); # a1^a2 | |
| 137 &mov (&DWP(2*4,"esp"),$a2); # a2 | |
| 138 &xor ($a2,$a4); # a2^a4 | |
| 139 &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | |
| 140 &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | |
| 141 &mov (&DWP(4*4,"esp"),$a4); # a4 | |
| 142 &xor ($a4,$a2); # a2=a4^a2^a4 | |
| 143 &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | |
| 144 &xor ($a4,$a1); # a1^a2^a4 | |
| 145 &sar (@i[1],31); # broadcast 30th bit | |
| 146 &and ($lo,$b); | |
| 147 &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | |
| 148 &and (@i[1],$b); | |
| 149 &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | |
# Seed the result: $hi:$lo = (b if bit 31 of a was set) << 31, then XOR in
# (b if bit 30 of a was set) << 30.  Each 64-bit shift is done as a left
# shift for the low word and a complementary right shift for the high word.
| 150 &mov ($hi,$lo); | |
| 151 &shl ($lo,31); | |
| 152 &mov (@T[0],@i[1]); | |
| 153 &shr ($hi,1); | |
| 154 | |
| 155 &mov (@i[0],0x7); | |
| 156 &shl (@i[1],30); | |
| 157 &and (@i[0],$b); | |
| 158 &shr (@T[0],2); | |
| 159 &xor ($lo,@i[1]); | |
| 160 | |
# b is consumed in 3-bit windows, low bits first.  Window 0 needs no shift
# and is XORed into $lo directly from the table.
| 161 &shr ($b,3); | |
| 162 &mov (@i[1],0x7); # 5-byte instruction!? | |
| 163 &and (@i[1],$b); | |
| 164 &shr ($b,3); | |
| 165 &xor ($hi,@T[0]); | |
| 166 &xor ($lo,&DWP(0,"esp",@i[0],4)); | |
| 167 &mov (@i[0],0x7); | |
| 168 &and (@i[0],$b); | |
| 169 &shr ($b,3); | |
# Windows 1..8: load the table entry, extract the index of a later window,
# then XOR entry<<3n into $lo and entry>>(32-3n) into $hi.  The Perl-side
# rotation of @i and @T swaps register roles on every unrolled iteration.
| 170 for($n=1;$n<9;$n++) { | |
| 171 &mov (@T[1],&DWP(0,"esp",@i[1],4)); | |
| 172 &mov (@i[1],0x7); | |
| 173 &mov (@T[0],@T[1]); | |
| 174 &shl (@T[1],3*$n); | |
| 175 &and (@i[1],$b); | |
| 176 &shr (@T[0],32-3*$n); | |
| 177 &xor ($lo,@T[1]); | |
| 178 &shr ($b,3); | |
| 179 &xor ($hi,@T[0]); | |
| 180 | |
| 181 push(@i,shift(@i)); push(@T,shift(@T)); | |
| 182 } | |
# Windows 9 and 10 (shifts of 27 and 30).  With no further index to extract,
# the index registers themselves are reused to hold the last table entry.
| 183 &mov (@T[1],&DWP(0,"esp",@i[1],4)); | |
| 184 &mov (@T[0],@T[1]); | |
| 185 &shl (@T[1],3*$n); | |
| 186 &mov (@i[1],&DWP(0,"esp",@i[0],4)); | |
| 187 &shr (@T[0],32-3*$n); $n++; | |
| 188 &mov (@i[0],@i[1]); | |
| 189 &xor ($lo,@T[1]); | |
| 190 &shl (@i[1],3*$n); | |
| 191 &xor ($hi,@T[0]); | |
| 192 &shr (@i[0],32-3*$n); | |
| 193 &xor ($lo,@i[1]); | |
| 194 &xor ($hi,@i[0]); | |
| 195 | |
| 196 &add ("esp",32+4); | |
| 197 &ret (); | |
| 198 &function_end_B("_mul_1x1_ialu"); | |
| 199 | |
| 200 # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); | |
# bn_GF2m_mul_2x2(r, a1, a0, b1, b0) -- the public entry point.
# Multiplies the two-word polynomials a1:a0 and b1:b0 and stores the
# four-word product at r (r[0] lowest).  Up to three code paths are emitted:
#   PCLMULQDQ - only when $sse2 is set; taken when the FXSR and PCLMULQDQ
#               capability bits are both set at run time.
#   MMX       - only when $x86only is false; three calls to _mul_1x1_mmx.
#   integer   - always emitted; three calls to _mul_1x1_ialu.
# The MMX and integer paths form three 1x1 products (a1*b1, a0*b0 and
# (a0^a1)*(b0^b1)) and recombine them.
| 201 &function_begin_B("bn_GF2m_mul_2x2"); | |
| 202 if (!$x86only) { | |
# Run-time dispatch on the first two dwords of OPENSSL_ia32cap_P.
| 203 &picmeup("edx","OPENSSL_ia32cap_P"); | |
| 204 &mov ("eax",&DWP(0,"edx")); | |
| 205 &mov ("edx",&DWP(4,"edx")); | |
| 206 &test ("eax",1<<23); # check MMX bit | |
| 207 &jz (&label("ialu")); | |
| 208 if ($sse2) { | |
| 209 &test ("eax",1<<24); # check FXSR bit | |
| 210 &jz (&label("mmx")); | |
| 211 &test ("edx",1<<1); # check PCLMULQDQ bit | |
| 212 &jz (&label("mmx")); | |
| 213 | |
# PCLMULQDQ path.  Nothing has been pushed yet, so 4(esp) is r and the 16
# bytes at 8(esp) are, per the prototype, a1,a0,b1,b0.  The shuffle swaps the
# two dwords inside each 64-bit half (giving a0,a1,b0,b1), then one carry-less
# multiply of the two halves yields the 128-bit result, stored straight to r.
| 214 &movups ("xmm0",&QWP(8,"esp")); | |
| 215 &shufps ("xmm0","xmm0",0b10110001); | |
| 216 &pclmulqdq ("xmm0","xmm0",1); | |
| 217 &mov ("eax",&DWP(4,"esp")); | |
| 218 &movups (&QWP(0,"eax"),"xmm0"); | |
| 219 &ret (); | |
| 220 | |
| 221 &set_label("mmx",16); | |
| 222 } | |
# MMX path.  The four registers pushed here are popped again before the
# return below; the helper overwrites all of them except eax.
# NOTE(review): wparam() comes from x86asm.pl, which is not visible here;
# presumably it accounts for the pushes when addressing argument n -- confirm.
| 223 &push ("ebp"); | |
| 224 &push ("ebx"); | |
| 225 &push ("esi"); | |
| 226 &push ("edi"); | |
| 227 &mov ($a,&wparam(1)); | |
| 228 &mov ($b,&wparam(3)); | |
| 229 &call ("_mul_1x1_mmx"); # a1·b1 | |
| 230 &movq ("mm7",$R); | |
| 231 | |
| 232 &mov ($a,&wparam(2)); | |
| 233 &mov ($b,&wparam(4)); | |
| 234 &call ("_mul_1x1_mmx"); # a0·b0 | |
| 235 &movq ("mm6",$R); | |
| 236 | |
| 237 &mov ($a,&wparam(1)); | |
| 238 &mov ($b,&wparam(3)); | |
| 239 &xor ($a,&wparam(2)); | |
| 240 &xor ($b,&wparam(4)); | |
| 241 &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) | |
| 242 &pxor ($R,"mm7"); | |
| 243 &mov ($a,&wparam(0)); | |
| 244 &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 | |
| 245 | |
# $R now holds the middle term.  Its low half shifted up by 32 is XORed with
# a0*b0 to form r[0..1]; its high half shifted down by 32 is XORed with a1*b1
# to form r[2..3].  The register pops are interleaved with the MMX work.
| 246 &movq ($A,$R); | |
| 247 &psllq ($R,32); | |
| 248 &pop ("edi"); | |
| 249 &psrlq ($A,32); | |
| 250 &pop ("esi"); | |
| 251 &pxor ($R,"mm6"); | |
| 252 &pop ("ebx"); | |
| 253 &pxor ($A,"mm7"); | |
| 254 &movq (&QWP(0,$a),$R); | |
| 255 &pop ("ebp"); | |
| 256 &movq (&QWP(8,$a),$A); | |
| 257 &emms (); | |
| 258 &ret (); | |
| 259 &set_label("ialu",16); | |
| 260 } | |
# Integer path.  After the four pushes, stack_push(4+1) presumably reserves
# five dwords of scratch (defined in x86asm.pl -- confirm).  a0*b0 is parked
# at 0/4(esp) and a1*b1 at 8/12(esp) while the third product is computed.
| 261 &push ("ebp"); | |
| 262 &push ("ebx"); | |
| 263 &push ("esi"); | |
| 264 &push ("edi"); | |
| 265 &stack_push(4+1); | |
| 266 | |
| 267 &mov ($a,&wparam(1)); | |
| 268 &mov ($b,&wparam(3)); | |
| 269 &call ("_mul_1x1_ialu"); # a1·b1 | |
| 270 &mov (&DWP(8,"esp"),$lo); | |
| 271 &mov (&DWP(12,"esp"),$hi); | |
| 272 | |
| 273 &mov ($a,&wparam(2)); | |
| 274 &mov ($b,&wparam(4)); | |
| 275 &call ("_mul_1x1_ialu"); # a0·b0 | |
| 276 &mov (&DWP(0,"esp"),$lo); | |
| 277 &mov (&DWP(4,"esp"),$hi); | |
| 278 | |
| 279 &mov ($a,&wparam(1)); | |
| 280 &mov ($b,&wparam(3)); | |
| 281 &xor ($a,&wparam(2)); | |
| 282 &xor ($b,&wparam(4)); | |
| 283 &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) | |
| 284 | |
# Reload the parked products into @r (a0*b0 low/high, a1*b1 low/high) and
# combine them with the third product in $hi:$lo.  r[0] and r[3] are stored
# unchanged; the XOR chain leaves r[2] in $hi and r[1] in $lo.
| 285 &mov ("ebp",&wparam(0)); | |
| 286 @r=("ebx","ecx","edi","esi"); | |
| 287 &mov (@r[0],&DWP(0,"esp")); | |
| 288 &mov (@r[1],&DWP(4,"esp")); | |
| 289 &mov (@r[2],&DWP(8,"esp")); | |
| 290 &mov (@r[3],&DWP(12,"esp")); | |
| 291 | |
| 292 &xor ($lo,$hi); | |
| 293 &xor ($hi,@r[1]); | |
| 294 &xor ($lo,@r[0]); | |
| 295 &mov (&DWP(0,"ebp"),@r[0]); | |
| 296 &xor ($hi,@r[2]); | |
| 297 &mov (&DWP(12,"ebp"),@r[3]); | |
| 298 &xor ($lo,@r[3]); | |
| 299 &stack_pop(4+1); | |
| 300 &xor ($hi,@r[3]); | |
| 301 &pop ("edi"); | |
| 302 &xor ($lo,$hi); | |
| 303 &pop ("esi"); | |
| 304 &mov (&DWP(8,"ebp"),$hi); | |
| 305 &pop ("ebx"); | |
| 306 &mov (&DWP(4,"ebp"),$lo); | |
| 307 &pop ("ebp"); | |
| 308 &ret (); | |
| 309 &function_end_B("bn_GF2m_mul_2x2"); | |
| 310 | |
# Pass an identification string to asciz(), then finish with asm_finish().
# NOTE(review): both helpers come from x86asm.pl, which is not visible here;
# presumably the string is embedded in the output and asm_finish() writes the
# assembled text -- confirm.
| 311 &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | |
| 312 | |
| 313 &asm_finish(); | |
| OLD | NEW |