OLD | NEW |
(Empty) | |
; Select the code section according to the assembler's output format:
;   obj   - 32-bit segment named "code", class "code", 64-byte aligned.
;   win32 - ".text" aligned to 64 bytes. When assembled with Yasm, version
;           1.1.0 or later is required; with any other assembler the
;           symbol $@feat.00 is defined as 1 here.
;   other - plain ".text" code section.
| 1 %ifidn __OUTPUT_FORMAT__,obj |
| 2 section code use32 class=code align=64 |
| 3 %elifidn __OUTPUT_FORMAT__,win32 |
| 4 %ifdef __YASM_VERSION_ID__ |
| 5 %if __YASM_VERSION_ID__ < 01010000h |
| 6 %error yasm version 1.1.0 or later needed. |
| 7 %endif |
| 8 ; Yasm automatically includes $@feat.00 and complains about redefining it. |
| 9 ; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html |
| 10 %else |
| 11 $@feat.00 equ 1 |
| 12 %endif |
| 13 section .text code align=64 |
| 14 %else |
| 15 section .text code |
| 16 %endif |
| 17 ;extern _OPENSSL_ia32cap_P |
; ----------------------------------------------------------------------
; _bn_mul_mont - Montgomery multiplication for 32-bit x86 (see the
; identification string that follows the final `ret`).
;
; Arguments: six dwords on the stack. After the four register pushes they
; sit at [20+esp] .. [40+esp]. The names below are assumed to follow
; OpenSSL's bn_mul_mont(rp, ap, bp, np, n0, num) - confirm against the C
; prototype.
;   arg1 [20+esp]  destination word array (written in the common tail)
;   arg2 [24+esp]  first operand word array
;   arg3 [28+esp]  second operand word array
;   arg4 [32+esp]  word array subtracted from the result in the tail
;                  (the modulus of the Montgomery multiply)
;   arg5 [36+esp]  pointer; the dword it points to is the factor used to
;                  derive each reduction multiplier
;   arg6 [40+esp]  word count
;
; Returns: eax = 0 without touching memory when arg6 < 4 (signed compare),
;          otherwise eax = 1.
; Preserves ebp, ebx, esi, edi. Clobbers eax, ecx, edx and flags; the
; SSE2 path also uses mm0-mm7 and executes `emms` before the common tail.
;
; Private frame (esp is lowered and re-aligned; caller's esp is saved):
;   [ 0+esp]  scratch (squaring path: word count - 1)
;   [ 4+esp]  arg1        [ 8+esp]  arg2
;   [12+esp]  arg3 (non-SSE2 multiply path: advancing pointer into arg3;
;             squaring path: current outer index)
;   [16+esp]  arg4        [20+esp]  dword loaded through arg5
;   [24+esp]  stack pointer as it was after the four pushes
;   [28+esp]  non-SSE2 multiply path: end pointer of arg3
;   [32+esp]  temporary vector of (word count + 2) dwords
; ----------------------------------------------------------------------
| 18 global _bn_mul_mont |
| 19 align 16 |
| 20 _bn_mul_mont: |
| 21 L$_bn_mul_mont_begin: |
| 22 push ebp |
| 23 push ebx |
| 24 push esi |
| 25 push edi |
; eax = 0 is the return value of the early exit below.
| 26 xor eax,eax |
| 27 mov edi,DWORD [40+esp] |
| 28 cmp edi,4 |
| 29 jl NEAR L$000just_leave |
; esi -> arg1 slot, edx -> arg2 slot, ebp = stack pointer to restore later.
| 30 lea esi,[20+esp] |
| 31 lea edx,[24+esp] |
| 32 mov ebp,esp |
; Reserve 32 bytes plus (word count + 2) dwords; edi is negated only to
; form the negative offset and ends up as word count + 2.
| 33 add edi,2 |
| 34 neg edi |
| 35 lea esp,[edi*4+esp-32] |
| 36 neg edi |
; Lower esp further by amounts derived from its distance to the argument
; area (mod 2048, then the 2048 bit), and align it to 64 bytes.
; NOTE(review): presumably chosen to avoid cache aliasing between the
; frame and the caller's data - confirm against the generator script.
| 37 mov eax,esp |
| 38 sub eax,edx |
| 39 and eax,2047 |
| 40 sub esp,eax |
| 41 xor edx,esp |
| 42 and edx,2048 |
| 43 xor edx,2048 |
| 44 sub esp,edx |
| 45 and esp,-64 |
; Copy arg1..arg4 and the dword behind arg5 into the new frame.
| 46 mov eax,DWORD [esi] |
| 47 mov ebx,DWORD [4+esi] |
| 48 mov ecx,DWORD [8+esi] |
| 49 mov edx,DWORD [12+esi] |
| 50 mov esi,DWORD [16+esi] |
| 51 mov esi,DWORD [esi] |
| 52 mov DWORD [4+esp],eax |
| 53 mov DWORD [8+esp],ebx |
| 54 mov DWORD [12+esp],ecx |
| 55 mov DWORD [16+esp],edx |
| 56 mov DWORD [20+esp],esi |
; ebx = (word count + 2) - 3 = word count - 1, the index of the last word.
| 57 lea ebx,[edi-3] |
| 58 mov DWORD [24+esp],ebp |
; Dispatch on bit 26 of the first dword of _OPENSSL_ia32cap_P: when it is
; clear, take the integer-only path at L$001non_sse2.
| 59 lea eax,[_OPENSSL_ia32cap_P] |
| 60 bt DWORD [eax],26 |
| 61 jnc NEAR L$001non_sse2 |
; ---- MMX/SSE2 path ---------------------------------------------------
; mm7 = 0x00000000FFFFFFFF, used to isolate the low dword of a product.
| 62 mov eax,-1 |
| 63 movd mm7,eax |
; esi = arg2, edi = arg3, ebp = arg4; edx = outer index, ecx = inner index.
| 64 mov esi,DWORD [8+esp] |
| 65 mov edi,DWORD [12+esp] |
| 66 mov ebp,DWORD [16+esp] |
| 67 xor edx,edx |
| 68 xor ecx,ecx |
; First outer iteration: mm4 = arg3[0]; mm5 becomes the reduction
; multiplier (low dword of arg2[0]*arg3[0], times the dword at [20+esp]).
; mm2 carries the multiply chain, mm3 the reduction chain.
| 69 movd mm4,DWORD [edi] |
| 70 movd mm5,DWORD [esi] |
| 71 movd mm3,DWORD [ebp] |
| 72 pmuludq mm5,mm4 |
| 73 movq mm2,mm5 |
| 74 movq mm0,mm5 |
| 75 pand mm0,mm7 |
| 76 pmuludq mm5,[20+esp] |
| 77 pmuludq mm3,mm5 |
| 78 paddq mm3,mm0 |
| 79 movd mm1,DWORD [4+ebp] |
| 80 movd mm0,DWORD [4+esi] |
| 81 psrlq mm2,32 |
| 82 psrlq mm3,32 |
| 83 inc ecx |
| 84 align 16 |
; Inner loop of the first pass: each result word is stored one slot below
; its index ([28+ecx*4+esp]), i.e. the sum is shifted down by one word.
| 85 L$0021st: |
| 86 pmuludq mm0,mm4 |
| 87 pmuludq mm1,mm5 |
| 88 paddq mm2,mm0 |
| 89 paddq mm3,mm1 |
| 90 movq mm0,mm2 |
| 91 pand mm0,mm7 |
| 92 movd mm1,DWORD [4+ecx*4+ebp] |
| 93 paddq mm3,mm0 |
| 94 movd mm0,DWORD [4+ecx*4+esi] |
| 95 psrlq mm2,32 |
| 96 movd DWORD [28+ecx*4+esp],mm3 |
| 97 psrlq mm3,32 |
| 98 lea ecx,[1+ecx] |
| 99 cmp ecx,ebx |
| 100 jl NEAR L$0021st |
; Last word of the first pass; both carry chains are merged and written
; as a 64-bit value into the top two slots of the temporary vector.
| 101 pmuludq mm0,mm4 |
| 102 pmuludq mm1,mm5 |
| 103 paddq mm2,mm0 |
| 104 paddq mm3,mm1 |
| 105 movq mm0,mm2 |
| 106 pand mm0,mm7 |
| 107 paddq mm3,mm0 |
| 108 movd DWORD [28+ecx*4+esp],mm3 |
| 109 psrlq mm2,32 |
| 110 psrlq mm3,32 |
| 111 paddq mm3,mm2 |
| 112 movq [32+ebx*4+esp],mm3 |
| 113 inc edx |
; Outer loop over the remaining words of arg3 (edx = 1 .. word count - 1).
; Same scheme as the first pass, but the temporary vector is accumulated
; (mm6 brings in the previous contents).
| 114 L$003outer: |
| 115 xor ecx,ecx |
| 116 movd mm4,DWORD [edx*4+edi] |
| 117 movd mm5,DWORD [esi] |
| 118 movd mm6,DWORD [32+esp] |
| 119 movd mm3,DWORD [ebp] |
| 120 pmuludq mm5,mm4 |
| 121 paddq mm5,mm6 |
| 122 movq mm0,mm5 |
| 123 movq mm2,mm5 |
| 124 pand mm0,mm7 |
| 125 pmuludq mm5,[20+esp] |
| 126 pmuludq mm3,mm5 |
| 127 paddq mm3,mm0 |
| 128 movd mm6,DWORD [36+esp] |
| 129 movd mm1,DWORD [4+ebp] |
| 130 movd mm0,DWORD [4+esi] |
| 131 psrlq mm2,32 |
| 132 psrlq mm3,32 |
| 133 paddq mm2,mm6 |
| 134 inc ecx |
; ebx doubles as the inner-loop down-counter here; it is restored from
; ecx once the loop has finished.
| 135 dec ebx |
| 136 L$004inner: |
| 137 pmuludq mm0,mm4 |
| 138 pmuludq mm1,mm5 |
| 139 paddq mm2,mm0 |
| 140 paddq mm3,mm1 |
| 141 movq mm0,mm2 |
| 142 movd mm6,DWORD [36+ecx*4+esp] |
| 143 pand mm0,mm7 |
| 144 movd mm1,DWORD [4+ecx*4+ebp] |
| 145 paddq mm3,mm0 |
| 146 movd mm0,DWORD [4+ecx*4+esi] |
| 147 psrlq mm2,32 |
| 148 movd DWORD [28+ecx*4+esp],mm3 |
| 149 psrlq mm3,32 |
| 150 paddq mm2,mm6 |
; The jnz below tests the result of this dec; lea does not modify flags.
| 151 dec ebx |
| 152 lea ecx,[1+ecx] |
| 153 jnz NEAR L$004inner |
| 154 mov ebx,ecx |
| 155 pmuludq mm0,mm4 |
| 156 pmuludq mm1,mm5 |
| 157 paddq mm2,mm0 |
| 158 paddq mm3,mm1 |
| 159 movq mm0,mm2 |
| 160 pand mm0,mm7 |
| 161 paddq mm3,mm0 |
| 162 movd DWORD [28+ecx*4+esp],mm3 |
| 163 psrlq mm2,32 |
| 164 psrlq mm3,32 |
| 165 movd mm6,DWORD [36+ebx*4+esp] |
| 166 paddq mm3,mm2 |
| 167 paddq mm3,mm6 |
| 168 movq [32+ebx*4+esp],mm3 |
| 169 lea edx,[1+edx] |
| 170 cmp edx,ebx |
| 171 jle NEAR L$003outer |
; Leave MMX state before continuing with integer code.
| 172 emms |
| 173 jmp NEAR L$005common_tail |
| 174 align 16 |
; ---- Integer-only path -----------------------------------------------
; esi = arg2, edi = arg3. ebp = ((word count) & 1) | (arg2 - arg3) is zero
; only when the word count is even and both operands are the same array;
; in that case the dedicated squaring code is used. The `mov edi,[edi]`
; between the `or` and the `jz` does not alter flags.
| 175 L$001non_sse2: |
| 176 mov esi,DWORD [8+esp] |
| 177 lea ebp,[1+ebx] |
| 178 mov edi,DWORD [12+esp] |
| 179 xor ecx,ecx |
| 180 mov edx,esi |
| 181 and ebp,1 |
| 182 sub edx,edi |
| 183 lea eax,[4+ebx*4+edi] |
| 184 or ebp,edx |
| 185 mov edi,DWORD [edi] |
| 186 jz NEAR L$006bn_sqr_mont |
; [28+esp] = address one past the last word of arg3 (loop end marker).
| 187 mov DWORD [28+esp],eax |
| 188 mov eax,DWORD [esi] |
| 189 xor edx,edx |
| 190 align 16 |
; First pass: temporary vector = arg2 * arg3[0] (edi), edx = running carry.
| 191 L$007mull: |
| 192 mov ebp,edx |
| 193 mul edi |
| 194 add ebp,eax |
| 195 lea ecx,[1+ecx] |
| 196 adc edx,0 |
| 197 mov eax,DWORD [ecx*4+esi] |
| 198 cmp ecx,ebx |
| 199 mov DWORD [28+ecx*4+esp],ebp |
| 200 jl NEAR L$007mull |
; Last product word, then set up the reduction: edi = dword at [20+esp]
; times the lowest temporary word (low 32 bits), esi = arg4.
| 201 mov ebp,edx |
| 202 mul edi |
| 203 mov edi,DWORD [20+esp] |
| 204 add eax,ebp |
| 205 mov esi,DWORD [16+esp] |
| 206 adc edx,0 |
| 207 imul edi,DWORD [32+esp] |
| 208 mov DWORD [32+ebx*4+esp],eax |
| 209 xor ecx,ecx |
| 210 mov DWORD [36+ebx*4+esp],edx |
| 211 mov DWORD [40+ebx*4+esp],ecx |
| 212 mov eax,DWORD [esi] |
| 213 mul edi |
; Only the carry out of the lowest word is kept: eax is reloaded right
; after the add, and `mov` leaves CF intact for the following adc.
| 214 add eax,DWORD [32+esp] |
| 215 mov eax,DWORD [4+esi] |
| 216 adc edx,0 |
| 217 inc ecx |
| 218 jmp NEAR L$0082ndmadd |
| 219 align 16 |
; Multiply-accumulate pass for subsequent words of arg3:
; temporary vector += arg2 * edi.
| 220 L$0091stmadd: |
| 221 mov ebp,edx |
| 222 mul edi |
| 223 add ebp,DWORD [32+ecx*4+esp] |
| 224 lea ecx,[1+ecx] |
| 225 adc edx,0 |
| 226 add ebp,eax |
| 227 mov eax,DWORD [ecx*4+esi] |
| 228 adc edx,0 |
| 229 cmp ecx,ebx |
| 230 mov DWORD [28+ecx*4+esp],ebp |
| 231 jl NEAR L$0091stmadd |
| 232 mov ebp,edx |
| 233 mul edi |
| 234 add eax,DWORD [32+ebx*4+esp] |
| 235 mov edi,DWORD [20+esp] |
| 236 adc edx,0 |
| 237 mov esi,DWORD [16+esp] |
| 238 add ebp,eax |
| 239 adc edx,0 |
| 240 imul edi,DWORD [32+esp] |
| 241 xor ecx,ecx |
| 242 add edx,DWORD [36+ebx*4+esp] |
| 243 mov DWORD [32+ebx*4+esp],ebp |
| 244 adc ecx,0 |
| 245 mov eax,DWORD [esi] |
| 246 mov DWORD [36+ebx*4+esp],edx |
| 247 mov DWORD [40+ebx*4+esp],ecx |
| 248 mul edi |
| 249 add eax,DWORD [32+esp] |
| 250 mov eax,DWORD [4+esi] |
| 251 adc edx,0 |
| 252 mov ecx,1 |
| 253 align 16 |
; Reduction pass: temporary vector += arg4 * edi, with every result word
; stored one slot lower ([24+ecx*4+esp] after ecx was advanced).
| 254 L$0082ndmadd: |
| 255 mov ebp,edx |
| 256 mul edi |
| 257 add ebp,DWORD [32+ecx*4+esp] |
| 258 lea ecx,[1+ecx] |
| 259 adc edx,0 |
| 260 add ebp,eax |
| 261 mov eax,DWORD [ecx*4+esi] |
| 262 adc edx,0 |
| 263 cmp ecx,ebx |
| 264 mov DWORD [24+ecx*4+esp],ebp |
| 265 jl NEAR L$0082ndmadd |
| 266 mov ebp,edx |
| 267 mul edi |
| 268 add ebp,DWORD [32+ebx*4+esp] |
| 269 adc edx,0 |
| 270 add ebp,eax |
| 271 adc edx,0 |
| 272 mov DWORD [28+ebx*4+esp],ebp |
| 273 xor eax,eax |
; Advance the pointer into arg3 by one word and stop once it reaches the
; end marker saved at [28+esp].
| 274 mov ecx,DWORD [12+esp] |
| 275 add edx,DWORD [36+ebx*4+esp] |
| 276 adc eax,DWORD [40+ebx*4+esp] |
| 277 lea ecx,[4+ecx] |
| 278 mov DWORD [32+ebx*4+esp],edx |
| 279 cmp ecx,DWORD [28+esp] |
| 280 mov DWORD [36+ebx*4+esp],eax |
| 281 je NEAR L$005common_tail |
| 282 mov edi,DWORD [ecx] |
| 283 mov esi,DWORD [8+esp] |
| 284 mov DWORD [12+esp],ecx |
| 285 xor ecx,ecx |
| 286 xor edx,edx |
| 287 mov eax,DWORD [esi] |
| 288 jmp NEAR L$0091stmadd |
| 289 align 16 |
; ---- Squaring path (arg2 == arg3, even word count) ---------------------
; [esp] = word count - 1, [12+esp] = outer index (starts at 0; this
; overwrites the saved arg3 pointer). edi = first operand word.
| 290 L$006bn_sqr_mont: |
| 291 mov DWORD [esp],ebx |
| 292 mov DWORD [12+esp],ecx |
; Square the first word; the high half is split into its low bit (ebx)
; and the rest (edx) so that later cross products can be doubled.
| 293 mov eax,edi |
| 294 mul edi |
| 295 mov DWORD [32+esp],eax |
| 296 mov ebx,edx |
| 297 shr edx,1 |
| 298 and ebx,1 |
| 299 inc ecx |
| 300 align 16 |
; Cross products with the first word, doubled on the fly:
; stored word = 2*eax + bit carried from the previous word (ebx).
| 301 L$010sqr: |
| 302 mov eax,DWORD [ecx*4+esi] |
| 303 mov ebp,edx |
| 304 mul edi |
| 305 add eax,ebp |
| 306 lea ecx,[1+ecx] |
| 307 adc edx,0 |
| 308 lea ebp,[eax*2+ebx] |
| 309 shr eax,31 |
| 310 cmp ecx,DWORD [esp] |
| 311 mov ebx,eax |
| 312 mov DWORD [28+ecx*4+esp],ebp |
| 313 jl NEAR L$010sqr |
; Last cross product, then prepare the reduction multiplier in edi and
; point esi at arg4.
| 314 mov eax,DWORD [ecx*4+esi] |
| 315 mov ebp,edx |
| 316 mul edi |
| 317 add eax,ebp |
| 318 mov edi,DWORD [20+esp] |
| 319 adc edx,0 |
| 320 mov esi,DWORD [16+esp] |
| 321 lea ebp,[eax*2+ebx] |
| 322 imul edi,DWORD [32+esp] |
| 323 shr eax,31 |
| 324 mov DWORD [32+ecx*4+esp],ebp |
| 325 lea ebp,[edx*2+eax] |
| 326 mov eax,DWORD [esi] |
| 327 shr edx,31 |
| 328 mov DWORD [36+ecx*4+esp],ebp |
| 329 mov DWORD [40+ecx*4+esp],edx |
| 330 mul edi |
| 331 add eax,DWORD [32+esp] |
| 332 mov ebx,ecx |
| 333 adc edx,0 |
| 334 mov eax,DWORD [4+esi] |
| 335 mov ecx,1 |
| 336 align 16 |
; Reduction pass for the squaring path, two words per iteration; results
; are again stored one slot lower than their source index.
| 337 L$0113rdmadd: |
| 338 mov ebp,edx |
| 339 mul edi |
| 340 add ebp,DWORD [32+ecx*4+esp] |
| 341 adc edx,0 |
| 342 add ebp,eax |
| 343 mov eax,DWORD [4+ecx*4+esi] |
| 344 adc edx,0 |
| 345 mov DWORD [28+ecx*4+esp],ebp |
| 346 mov ebp,edx |
| 347 mul edi |
| 348 add ebp,DWORD [36+ecx*4+esp] |
| 349 lea ecx,[2+ecx] |
| 350 adc edx,0 |
| 351 add ebp,eax |
| 352 mov eax,DWORD [ecx*4+esi] |
| 353 adc edx,0 |
| 354 cmp ecx,ebx |
| 355 mov DWORD [24+ecx*4+esp],ebp |
| 356 jl NEAR L$0113rdmadd |
| 357 mov ebp,edx |
| 358 mul edi |
| 359 add ebp,DWORD [32+ebx*4+esp] |
| 360 adc edx,0 |
| 361 add ebp,eax |
| 362 adc edx,0 |
| 363 mov DWORD [28+ebx*4+esp],ebp |
; Reload the outer index; finished once it equals the last word index.
| 364 mov ecx,DWORD [12+esp] |
| 365 xor eax,eax |
| 366 mov esi,DWORD [8+esp] |
| 367 add edx,DWORD [36+ebx*4+esp] |
| 368 adc eax,DWORD [40+ebx*4+esp] |
| 369 mov DWORD [32+ebx*4+esp],edx |
| 370 cmp ecx,ebx |
| 371 mov DWORD [36+ebx*4+esp],eax |
| 372 je NEAR L$005common_tail |
; Next outer word: add its square into the temporary vector at the new
; outer index.
| 373 mov edi,DWORD [4+ecx*4+esi] |
| 374 lea ecx,[1+ecx] |
| 375 mov eax,edi |
| 376 mov DWORD [12+esp],ecx |
| 377 mul edi |
| 378 add eax,DWORD [32+ecx*4+esp] |
| 379 adc edx,0 |
| 380 mov DWORD [32+ecx*4+esp],eax |
| 381 xor ebp,ebp |
; The je below uses the flags of this cmp; lea does not modify them.
| 382 cmp ecx,ebx |
| 383 lea ecx,[1+ecx] |
| 384 je NEAR L$012sqrlast |
| 385 mov ebx,edx |
| 386 shr edx,1 |
| 387 and ebx,1 |
| 388 align 16 |
; Doubled cross products of the current outer word with the higher words,
; accumulated into the temporary vector.
| 389 L$013sqradd: |
| 390 mov eax,DWORD [ecx*4+esi] |
| 391 mov ebp,edx |
| 392 mul edi |
| 393 add eax,ebp |
| 394 lea ebp,[eax*1+eax] |
| 395 adc edx,0 |
| 396 shr eax,31 |
| 397 add ebp,DWORD [32+ecx*4+esp] |
| 398 lea ecx,[1+ecx] |
| 399 adc eax,0 |
| 400 add ebp,ebx |
| 401 adc eax,0 |
| 402 cmp ecx,DWORD [esp] |
| 403 mov DWORD [28+ecx*4+esp],ebp |
| 404 mov ebx,eax |
| 405 jle NEAR L$013sqradd |
| 406 mov ebp,edx |
| 407 add edx,edx |
| 408 shr ebp,31 |
| 409 add edx,ebx |
| 410 adc ebp,0 |
; Fold the final carry words into the top of the temporary vector, set
; up the reduction multiplier and re-enter the reduction loop.
| 411 L$012sqrlast: |
| 412 mov edi,DWORD [20+esp] |
| 413 mov esi,DWORD [16+esp] |
| 414 imul edi,DWORD [32+esp] |
| 415 add edx,DWORD [32+ecx*4+esp] |
| 416 mov eax,DWORD [esi] |
| 417 adc ebp,0 |
| 418 mov DWORD [32+ecx*4+esp],edx |
| 419 mov DWORD [36+ecx*4+esp],ebp |
| 420 mul edi |
| 421 add eax,DWORD [32+esp] |
| 422 lea ebx,[ecx-1] |
| 423 adc edx,0 |
| 424 mov ecx,1 |
| 425 mov eax,DWORD [4+esi] |
| 426 jmp NEAR L$0113rdmadd |
| 427 align 16 |
; ---- Common tail -------------------------------------------------------
; ebp = arg4, edi = arg1 (destination), esi = temporary vector,
; ebx = index of the last word. `xor edx,edx` also clears CF for the
; first sbb.
| 428 L$005common_tail: |
| 429 mov ebp,DWORD [16+esp] |
| 430 mov edi,DWORD [4+esp] |
| 431 lea esi,[32+esp] |
| 432 mov eax,DWORD [esi] |
| 433 mov ecx,ebx |
| 434 xor edx,edx |
| 435 align 16 |
; destination[i] = temporary[i] - arg4[i] with borrow. dec, mov and lea
; leave CF untouched, so the borrow propagates across iterations.
| 436 L$014sub: |
| 437 sbb eax,DWORD [edx*4+ebp] |
| 438 mov DWORD [edx*4+edi],eax |
| 439 dec ecx |
| 440 mov eax,DWORD [4+edx*4+esi] |
| 441 lea edx,[1+edx] |
| 442 jge NEAR L$014sub |
; eax = top temporary word minus the final borrow; used below as a
; select mask.
| 443 sbb eax,0 |
| 444 align 16 |
; Branch-free select per word: ((temporary ^ destination) & eax) ^
; destination yields the temporary word where eax bits are set and the
; subtracted word where they are clear. Each temporary word is then
; overwritten with the value left in ecx.
| 445 L$015copy: |
| 446 mov edx,DWORD [ebx*4+esi] |
| 447 mov ebp,DWORD [ebx*4+edi] |
| 448 xor edx,ebp |
| 449 and edx,eax |
| 450 xor edx,ebp |
| 451 mov DWORD [ebx*4+esi],ecx |
| 452 mov DWORD [ebx*4+edi],edx |
| 453 dec ebx |
| 454 jge NEAR L$015copy |
; Restore the stack pointer saved at [24+esp] and report success.
| 455 mov esp,DWORD [24+esp] |
| 456 mov eax,1 |
; Shared exit: the early-out reaches this label with eax = 0.
| 457 L$000just_leave: |
| 458 pop edi |
| 459 pop esi |
| 460 pop ebx |
| 461 pop ebp |
| 462 ret |
; NUL-terminated ASCII identification string:
; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
| 463 db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 |
| 464 db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 |
| 465 db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 |
| 466 db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 |
| 467 db 111,114,103,62,0 |
; _OPENSSL_ia32cap_P is declared as a 16-byte common symbol; the code
; above tests bit 26 of its first dword to choose the MMX/SSE2 path.
| 468 segment .bss |
| 469 common _OPENSSL_ia32cap_P 16 |
OLD | NEW |