| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions. | |
| 3 * | |
| 4 * This Source Code Form is subject to the terms of the Mozilla Public | |
| 5 * License, v. 2.0. If a copy of the MPL was not distributed with this | |
| 6 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | |
| 7 | |
| 8 #include "mpi-priv.h" | |
| 9 | |
| 10 static int is_sse = -1; /* <0: not probed yet; 0: use the x86 bodies; >0: use the SSE2 bodies */ | |
| 11 extern unsigned long s_mpi_is_sse2(); /* each dispatcher below stores its result in is_sse */ | |
| 12 | |
| 13 /* | |
| 14 * s_mpv_mul_d: c[i] = low word of (a[i] * b + carry) for i < a_len, then | |
| 15 * c[a_len] = final carry. Dispatches on is_sse to an x86 or an SSE2 body. | |
| 16 * x86 body frame ("sub esp,28" followed by three pushes): | |
| 17 * ebp - 40: caller's ebx | |
| 18 * ebp - 36: caller's esi | |
| 19 * ebp - 32: caller's edi | |
| 20 * ebp - 28 .. ebp - 4: reserved, never referenced | |
| 21 * (SSE2 body reserves no locals: caller's edi at ebp - 4, esi at ebp - 8) | |
| 22 * ebp + 0: caller's ebp | |
| 23 * ebp + 4: return address | |
| 24 * ebp + 8: a argument | |
| 25 * ebp + 12: a_len argument | |
| 26 * ebp + 16: b argument | |
| 27 * ebp + 20: c argument | |
| 28 * registers (x86 body): | |
| 29 * eax: a[i], then low half of the product (stosd stores all of eax) | |
| 30 * ebx: carry | |
| 31 * ecx: a_len (loop counter) | |
| 32 * edx: b, then high half of the product | |
| 33 * esi: a ptr | |
| 34 * edi: c ptr | |
| 35 * SSE2 body: mm0 = a[i] * b, mm1 = b, mm2 = running sum, shifted to carry | |
| 36 */ | |
| 37 __declspec(naked) void | |
| 38 s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) | |
| 39 { | |
| 40 __asm { | |
| 41 mov eax, is_sse | |
| 42 cmp eax, 0 | |
| 43 je s_mpv_mul_d_x86 | |
| 44 jg s_mpv_mul_d_sse2 | |
| 45 call s_mpi_is_sse2 /* reached only while is_sse is still negative */ | |
| 46 mov is_sse, eax | |
| 47 cmp eax, 0 | |
| 48 jg s_mpv_mul_d_sse2 | |
| 49 s_mpv_mul_d_x86: | |
| 50 push ebp | |
| 51 mov ebp,esp | |
| 52 sub esp,28 | |
| 53 push edi | |
| 54 push esi | |
| 55 push ebx | |
| 56 mov ebx,0 ; carry = 0 | |
| 57 mov ecx,[ebp+12] ; ecx = a_len | |
| 58 mov edi,[ebp+20] /* edi = c */ | |
| 59 cmp ecx,0 | |
| 60 je L_2 ; jmp if a_len == 0 | |
| 61 mov esi,[ebp+8] ; esi = a | |
| 62 cld | |
| 63 L_1: | |
| 64 lodsd ; eax = [ds:esi]; esi += 4 | |
| 65 mov edx,[ebp+16] ; edx = b | |
| 66 mul edx ; edx:eax = Phi:Plo = a_i * b | |
| 67 | |
| 68 add eax,ebx ; add carry (ebx) to edx:eax | |
| 69 adc edx,0 | |
| 70 mov ebx,edx ; high half of product becomes next carry | |
| 71 | |
| 72 stosd ; [es:edi] = ax; edi += 4; | |
| 73 dec ecx ; --a_len | |
| 74 jnz L_1 ; jmp if a_len != 0 | |
| 75 L_2: | |
| 76 mov [edi],ebx ; *c = carry | |
| 77 pop ebx | |
| 78 pop esi | |
| 79 pop edi | |
| 80 leave | |
| 81 ret | |
| 82 nop | |
| 83 s_mpv_mul_d_sse2: | |
| 84 push ebp | |
| 85 mov ebp, esp | |
| 86 push edi | |
| 87 push esi | |
| 88 psubq mm2, mm2 ; carry = 0 | |
| 89 mov ecx, [ebp+12] ; ecx = a_len | |
| 90 movd mm1, [ebp+16] ; mm1 = b | |
| 91 mov edi, [ebp+20] /* edi = c */ | |
| 92 cmp ecx, 0 | |
| 93 je L_6 ; jmp if a_len == 0 | |
| 94 mov esi, [ebp+8] ; esi = a | |
| 95 cld | |
| 96 L_5: | |
| 97 movd mm0, [esi] ; mm0 = *a++ | |
| 98 add esi, 4 | |
| 99 pmuludq mm0, mm1 ; mm0 = b * *a++ | |
| 100 paddq mm2, mm0 ; add the carry | |
| 101 movd [edi], mm2 ; store the 32bit result | |
| 102 add edi, 4 | |
| 103 psrlq mm2, 32 ; save the carry | |
| 104 dec ecx ; --a_len | |
| 105 jnz L_5 ; jmp if a_len != 0 | |
| 106 L_6: | |
| 107 movd [edi], mm2 ; *c = carry | |
| 108 emms | |
| 109 pop esi | |
| 110 pop edi | |
| 111 leave | |
| 112 ret | |
| 113 nop | |
| 114 } | |
| 115 } | |
| 116 | |
| 117 /* | |
| 118 * s_mpv_mul_d_add: c[i] = low word of (a[i] * b + carry + c[i]) for | |
| 119 * i < a_len, then c[a_len] = final carry (stored, not added). | |
| 120 * Dispatches on is_sse to an x86 or an SSE2 body. | |
| 121 * x86 body frame: ebp - 40: caller's ebx, ebp - 36: caller's esi, | |
| 122 * ebp - 32: caller's edi | |
| 123 * ebp - 28 .. ebp - 4: reserved by "sub esp,28", never referenced | |
| 124 * (SSE2 body reserves no locals: caller's edi at ebp - 4, esi at ebp - 8) | |
| 125 * SSE2 body: mm0 = a[i] * b, then c[i]; mm1 = b; mm2 = running sum / carry | |
| 126 * | |
| 127 * ebp + 0: caller's ebp | |
| 128 * ebp + 4: return address | |
| 129 * ebp + 8: a argument | |
| 130 * ebp + 12: a_len argument | |
| 131 * ebp + 16: b argument | |
| 132 * ebp + 20: c argument | |
| 133 * registers (x86 body): | |
| 134 * eax: a[i], then low half of the sum (stosd stores all of eax) | |
| 135 * ebx: carry; also scratch for the word read from *c | |
| 136 * ecx: a_len (loop counter) | |
| 137 * edx: b, then high half of the product plus carries | |
| 138 * esi: a ptr | |
| 139 * edi: c ptr | |
| 140 */ | |
| 141 __declspec(naked) void | |
| 142 s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) | |
| 143 { | |
| 144 __asm { | |
| 145 mov eax, is_sse | |
| 146 cmp eax, 0 | |
| 147 je s_mpv_mul_d_add_x86 | |
| 148 jg s_mpv_mul_d_add_sse2 | |
| 149 call s_mpi_is_sse2 /* reached only while is_sse is still negative */ | |
| 150 mov is_sse, eax | |
| 151 cmp eax, 0 | |
| 152 jg s_mpv_mul_d_add_sse2 | |
| 153 s_mpv_mul_d_add_x86: | |
| 154 push ebp | |
| 155 mov ebp,esp | |
| 156 sub esp,28 | |
| 157 push edi | |
| 158 push esi | |
| 159 push ebx | |
| 160 mov ebx,0 ; carry = 0 | |
| 161 mov ecx,[ebp+12] ; ecx = a_len | |
| 162 mov edi,[ebp+20] /* edi = c */ | |
| 163 cmp ecx,0 | |
| 164 je L_11 ; jmp if a_len == 0 | |
| 165 mov esi,[ebp+8] ; esi = a | |
| 166 cld | |
| 167 L_10: | |
| 168 lodsd ; eax = [ds:esi]; esi += 4 | |
| 169 mov edx,[ebp+16] ; edx = b | |
| 170 mul edx ; edx:eax = Phi:Plo = a_i * b | |
| 171 | |
| 172 add eax,ebx ; add carry (ebx) to edx:eax | |
| 173 adc edx,0 | |
| 174 mov ebx,[edi] ; add in current word from *c | |
| 175 add eax,ebx | |
| 176 adc edx,0 | |
| 177 mov ebx,edx ; high half of product becomes next carry | |
| 178 | |
| 179 stosd ; [es:edi] = ax; edi += 4; | |
| 180 dec ecx ; --a_len | |
| 181 jnz L_10 ; jmp if a_len != 0 | |
| 182 L_11: | |
| 183 mov [edi],ebx ; *c = carry | |
| 184 pop ebx | |
| 185 pop esi | |
| 186 pop edi | |
| 187 leave | |
| 188 ret | |
| 189 nop | |
| 190 s_mpv_mul_d_add_sse2: | |
| 191 push ebp | |
| 192 mov ebp, esp | |
| 193 push edi | |
| 194 push esi | |
| 195 psubq mm2, mm2 ; carry = 0 | |
| 196 mov ecx, [ebp+12] ; ecx = a_len | |
| 197 movd mm1, [ebp+16] ; mm1 = b | |
| 198 mov edi, [ebp+20] /* edi = c */ | |
| 199 cmp ecx, 0 | |
| 200 je L_16 ; jmp if a_len == 0 | |
| 201 mov esi, [ebp+8] ; esi = a | |
| 202 cld | |
| 203 L_15: | |
| 204 movd mm0, [esi] ; mm0 = *a++ | |
| 205 add esi, 4 | |
| 206 pmuludq mm0, mm1 ; mm0 = b * *a++ | |
| 207 paddq mm2, mm0 ; add the carry | |
| 208 movd mm0, [edi] /* mm0 = current word of *c */ | |
| 209 paddq mm2, mm0 ; add the carry /* correction: this adds the word just loaded from *c */ | |
| 210 movd [edi], mm2 ; store the 32bit result | |
| 211 add edi, 4 | |
| 212 psrlq mm2, 32 ; save the carry | |
| 213 dec ecx ; --a_len | |
| 214 jnz L_15 ; jmp if a_len != 0 | |
| 215 L_16: | |
| 216 movd [edi], mm2 ; *c = carry | |
| 217 emms | |
| 218 pop esi | |
| 219 pop edi | |
| 220 leave | |
| 221 ret | |
| 222 nop | |
| 223 } | |
| 224 } | |
| 225 | |
| 226 /* | |
| 227 * s_mpv_mul_d_add_prop: c[i] = low word of (a[i] * b + carry + c[i]) for | |
| 228 * i < a_len; a nonzero final carry is then added into c[a_len] and rippled | |
| 229 * into the following words for as long as an addition carries out. | |
| 230 * NOTE(review): the ripple loop has no length bound - confirm callers size c. | |
| 231 * x86 frame: ebp - 40: caller's ebx, ebp - 36: caller's esi, ebp - 32: caller's edi | |
| 232 * ebp - 28 .. ebp - 4: reserved by "sub esp,28", never referenced | |
| 233 * (SSE2 body: caller's edi at ebp - 4, esi at ebp - 8, ebx at ebp - 12) | |
| 234 * ebp + 0: caller's ebp | |
| 235 * ebp + 4: return address | |
| 236 * ebp + 8: a argument | |
| 237 * ebp + 12: a_len argument | |
| 238 * ebp + 16: b argument | |
| 239 * ebp + 20: c argument | |
| 240 * registers (x86 body): | |
| 241 * eax: a[i], then low half of the sum (stosd stores all of eax) | |
| 242 * ebx: carry; also scratch for the word read from *c | |
| 243 * ecx: a_len (loop counter) | |
| 244 * edx: b, then high half of the product plus carries | |
| 245 * esi: a ptr | |
| 246 * edi: c ptr | |
| 247 * SSE2 body: mm0 = a[i] * b, mm1 = b, mm2 = running sum / carry, mm3 = c[i]; | |
| 248 * ebx receives the final carry for the ripple loop | |
| 249 */ | |
| 250 __declspec(naked) void | |
| 251 s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) | |
| 252 { | |
| 253 __asm { | |
| 254 mov eax, is_sse | |
| 255 cmp eax, 0 | |
| 256 je s_mpv_mul_d_add_prop_x86 | |
| 257 jg s_mpv_mul_d_add_prop_sse2 | |
| 258 call s_mpi_is_sse2 /* reached only while is_sse is still negative */ | |
| 259 mov is_sse, eax | |
| 260 cmp eax, 0 | |
| 261 jg s_mpv_mul_d_add_prop_sse2 | |
| 262 s_mpv_mul_d_add_prop_x86: | |
| 263 push ebp | |
| 264 mov ebp,esp | |
| 265 sub esp,28 | |
| 266 push edi | |
| 267 push esi | |
| 268 push ebx | |
| 269 mov ebx,0 ; carry = 0 | |
| 270 mov ecx,[ebp+12] ; ecx = a_len | |
| 271 mov edi,[ebp+20] /* edi = c */ | |
| 272 cmp ecx,0 | |
| 273 je L_21 ; jmp if a_len == 0 | |
| 274 cld | |
| 275 mov esi,[ebp+8] ; esi = a | |
| 276 L_20: | |
| 277 lodsd ; eax = [ds:esi]; esi += 4 | |
| 278 mov edx,[ebp+16] ; edx = b | |
| 279 mul edx ; edx:eax = Phi:Plo = a_i * b | |
| 280 | |
| 281 add eax,ebx ; add carry (ebx) to edx:eax | |
| 282 adc edx,0 | |
| 283 mov ebx,[edi] ; add in current word from *c | |
| 284 add eax,ebx | |
| 285 adc edx,0 | |
| 286 mov ebx,edx ; high half of product becomes next carry | |
| 287 | |
| 288 stosd ; [es:edi] = ax; edi += 4; | |
| 289 dec ecx ; --a_len | |
| 290 jnz L_20 ; jmp if a_len != 0 | |
| 291 L_21: | |
| 292 cmp ebx,0 ; is carry zero? | |
| 293 jz L_23 | |
| 294 mov eax,[edi] ; add in current word from *c | |
| 295 add eax,ebx | |
| 296 stosd ; [es:edi] = ax; edi += 4; | |
| 297 jnc L_23 /* stosd leaves CF from the add above intact */ | |
| 298 L_22: | |
| 299 mov eax,[edi] ; add in current word from *c | |
| 300 adc eax,0 | |
| 301 stosd ; [es:edi] = ax; edi += 4; | |
| 302 jc L_22 | |
| 303 L_23: | |
| 304 pop ebx | |
| 305 pop esi | |
| 306 pop edi | |
| 307 leave | |
| 308 ret | |
| 309 nop | |
| 310 s_mpv_mul_d_add_prop_sse2: | |
| 311 push ebp | |
| 312 mov ebp, esp | |
| 313 push edi | |
| 314 push esi | |
| 315 push ebx | |
| 316 psubq mm2, mm2 ; carry = 0 | |
| 317 mov ecx, [ebp+12] ; ecx = a_len | |
| 318 movd mm1, [ebp+16] ; mm1 = b | |
| 319 mov edi, [ebp+20] /* edi = c */ | |
| 320 cmp ecx, 0 | |
| 321 je L_26 ; jmp if a_len == 0 | |
| 322 mov esi, [ebp+8] ; esi = a | |
| 323 cld | |
| 324 L_25: | |
| 325 movd mm0, [esi] ; mm0 = *a++ | |
| 326 movd mm3, [edi] ; fetch the sum | |
| 327 add esi, 4 | |
| 328 pmuludq mm0, mm1 ; mm0 = b * *a++ | |
| 329 paddq mm2, mm0 ; add the carry | |
| 330 paddq mm2, mm3 ; add *c++ | |
| 331 movd [edi], mm2 ; store the 32bit result | |
| 332 add edi, 4 | |
| 333 psrlq mm2, 32 ; save the carry | |
| 334 dec ecx ; --a_len | |
| 335 jnz L_25 ; jmp if a_len != 0 | |
| 336 L_26: | |
| 337 movd ebx, mm2 /* ebx = final carry, handed to the integer ripple loop */ | |
| 338 cmp ebx, 0 ; is carry zero? | |
| 339 jz L_28 | |
| 340 mov eax, [edi] | |
| 341 add eax, ebx | |
| 342 stosd | |
| 343 jnc L_28 /* stosd leaves CF from the add above intact */ | |
| 344 L_27: | |
| 345 mov eax, [edi] ; add in current word from *c | |
| 346 adc eax, 0 | |
| 347 stosd ; [es:edi] = ax; edi += 4; | |
| 348 jc L_27 | |
| 349 L_28: | |
| 350 emms | |
| 351 pop ebx | |
| 352 pop esi | |
| 353 pop edi | |
| 354 leave | |
| 355 ret | |
| 356 nop | |
| 357 } | |
| 358 } | |
| 359 | |
| 360 /* | |
| 361 * s_mpv_sqr_add_prop: for each i < a_len, add the 64-bit square of a[i] | |
| 362 * into the word pair sqrs[2i], sqrs[2i+1]; then ripple any final carry on. | |
| 363 * x86 frame: ebp - 24: caller's ebx, ebp - 20: caller's esi, ebp - 16: caller's edi | |
| 364 * ebp - 12 .. ebp - 4: reserved by "sub esp,12", never referenced | |
| 365 * (SSE2 body: caller's edi at ebp - 4, esi at ebp - 8, ebx at ebp - 12) | |
| 366 * ebp + 0: caller's ebp | |
| 367 * ebp + 4: return address | |
| 368 * ebp + 8: a argument (pa) | |
| 369 * ebp + 12: a_len argument | |
| 370 * ebp + 16: sqrs argument (ps) | |
| 371 * Dispatches on is_sse to an x86 or an SSE2 body. | |
| 372 * registers (x86 body): | |
| 373 * eax: a[i], then each result word as it is stored (stosd stores all of eax) | |
| 374 * ebx: carry; also scratch for the words read from sqrs | |
| 375 * ecx: a_len (loop counter) | |
| 376 * edx: high half of the square plus carries | |
| 377 * esi: a ptr | |
| 378 * edi: sqrs ptr | |
| 379 */ | |
| 380 __declspec(naked) void | |
| 381 s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs) | |
| 382 { | |
| 383 __asm { | |
| 384 mov eax, is_sse | |
| 385 cmp eax, 0 | |
| 386 je s_mpv_sqr_add_prop_x86 | |
| 387 jg s_mpv_sqr_add_prop_sse2 | |
| 388 call s_mpi_is_sse2 /* reached only while is_sse is still negative */ | |
| 389 mov is_sse, eax | |
| 390 cmp eax, 0 | |
| 391 jg s_mpv_sqr_add_prop_sse2 | |
| 392 s_mpv_sqr_add_prop_x86: | |
| 393 push ebp | |
| 394 mov ebp,esp | |
| 395 sub esp,12 | |
| 396 push edi | |
| 397 push esi | |
| 398 push ebx | |
| 399 mov ebx,0 ; carry = 0 | |
| 400 mov ecx,[ebp+12] ; a_len | |
| 401 mov edi,[ebp+16] ; edi = ps | |
| 402 cmp ecx,0 | |
| 403 je L_31 ; jump if a_len == 0 | |
| 404 cld | |
| 405 mov esi,[ebp+8] ; esi = pa | |
| 406 L_30: | |
| 407 lodsd ; eax = [ds:si]; si += 4; | |
| 408 mul eax | |
| 409 | |
| 410 add eax,ebx ; add "carry" | |
| 411 adc edx,0 | |
| 412 mov ebx,[edi] | |
| 413 add eax,ebx ; add low word from result | |
| 414 mov ebx,[edi+4] /* mov does not alter CF from the add above */ | |
| 415 stosd ; [es:di] = eax; di += 4; | |
| 416 adc edx,ebx ; add high word from result /* consumes CF from the low-word add; stosd kept it */ | |
| 417 mov ebx,0 | |
| 418 mov eax,edx | |
| 419 adc ebx,0 /* ebx = carry out of the high-word add */ | |
| 420 stosd ; [es:di] = eax; di += 4; | |
| 421 dec ecx ; --a_len | |
| 422 jnz L_30 ; jmp if a_len != 0 | |
| 423 L_31: | |
| 424 cmp ebx,0 ; is carry zero? | |
| 425 jz L_34 | |
| 426 mov eax,[edi] ; add in current word from *c | |
| 427 add eax,ebx | |
| 428 stosd ; [es:edi] = ax; edi += 4; | |
| 429 jnc L_34 | |
| 430 L_32: | |
| 431 mov eax,[edi] ; add in current word from *c | |
| 432 adc eax,0 | |
| 433 stosd ; [es:edi] = ax; edi += 4; | |
| 434 jc L_32 | |
| 435 L_34: | |
| 436 pop ebx | |
| 437 pop esi | |
| 438 pop edi | |
| 439 leave | |
| 440 ret | |
| 441 nop | |
| 442 s_mpv_sqr_add_prop_sse2: | |
| 443 push ebp | |
| 444 mov ebp, esp | |
| 445 push edi | |
| 446 push esi | |
| 447 push ebx | |
| 448 psubq mm2, mm2 ; carry = 0 | |
| 449 mov ecx, [ebp+12] ; ecx = a_len | |
| 450 mov edi, [ebp+16] /* edi = sqrs */ | |
| 451 cmp ecx, 0 | |
| 452 je L_36 ; jmp if a_len == 0 | |
| 453 mov esi, [ebp+8] ; esi = a | |
| 454 cld | |
| 455 L_35: | |
| 456 movd mm0, [esi] ; mm0 = *a | |
| 457 movd mm3, [edi] ; fetch the sum | |
| 458 add esi, 4 | |
| 459 pmuludq mm0, mm0 ; mm0 = sqr(a) | |
| 460 paddq mm2, mm0 ; add the carry | |
| 461 paddq mm2, mm3 ; add the low word | |
| 462 movd mm3, [edi+4] | |
| 463 movd [edi], mm2 ; store the 32bit result | |
| 464 psrlq mm2, 32 | |
| 465 paddq mm2, mm3 ; add the high word | |
| 466 movd [edi+4], mm2 ; store the 32bit result | |
| 467 psrlq mm2, 32 ; save the carry. | |
| 468 add edi, 8 | |
| 469 dec ecx ; --a_len | |
| 470 jnz L_35 ; jmp if a_len != 0 | |
| 471 L_36: | |
| 472 movd ebx, mm2 /* ebx = final carry, handed to the integer ripple loop */ | |
| 473 cmp ebx, 0 ; is carry zero? | |
| 474 jz L_38 | |
| 475 mov eax, [edi] | |
| 476 add eax, ebx | |
| 477 stosd | |
| 478 jnc L_38 | |
| 479 L_37: | |
| 480 mov eax, [edi] ; add in current word from *c | |
| 481 adc eax, 0 | |
| 482 stosd ; [es:edi] = ax; edi += 4; | |
| 483 jc L_37 | |
| 484 L_38: | |
| 485 emms | |
| 486 pop ebx | |
| 487 pop esi | |
| 488 pop edi | |
| 489 leave | |
| 490 ret | |
| 491 nop | |
| 492 } | |
| 493 } | |
| 494 | |
| 495 /* | |
| 496 * Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized | |
| 497 * so its high bit is 1. This code is from NSPR. | |
| 498 * | |
| 499 * Stores the quotient in *qp and the remainder in *rp; always returns 0. | |
| 500 * NOTE(review): no check that the quotient fits in 32 bits - confirm callers ensure Nhi < divisor. | |
| 501 * esp + 0: caller's ebx (after the initial push) | |
| 502 * esp + 4: return address | |
| 503 * esp + 8: Nhi argument | |
| 504 * esp + 12: Nlo argument | |
| 505 * esp + 16: divisor argument | |
| 506 * esp + 20: qp argument | |
| 507 * esp + 24: rp argument | |
| 508 * registers: | |
| 509 * eax: Nlo, then the quotient; zeroed for the return value | |
| 510 * ebx: divisor, then qp, then rp (caller's value is saved and restored) | |
| 511 * ecx: not used | |
| 512 * edx: Nhi, then the remainder | |
| 513 * esi: not used | |
| 514 * edi: not used | |
| 515 */ | |
| 516 __declspec(naked) mp_err | |
| 517 s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor, | |
| 518 mp_digit *qp, mp_digit *rp) | |
| 519 { | |
| 520 __asm { | |
| 521 push ebx | |
| 522 mov edx,[esp+8] /* edx = Nhi */ | |
| 523 mov eax,[esp+12] /* eax = Nlo */ | |
| 524 mov ebx,[esp+16] /* ebx = divisor */ | |
| 525 div ebx /* eax = quotient, edx = remainder of edx:eax / ebx */ | |
| 526 mov ebx,[esp+20] | |
| 527 mov [ebx],eax /* *qp = quotient */ | |
| 528 mov ebx,[esp+24] | |
| 529 mov [ebx],edx /* *rp = remainder */ | |
| 530 xor eax,eax ; return zero | |
| 531 pop ebx | |
| 532 ret | |
| 533 nop | |
| 534 } | |
| 535 } | |
| OLD | NEW |