OLD | NEW |
(Empty) | |
| 1 #if defined(__x86_64__) |
| 2 .text |
| 3 |
| 4 |
| 5 |
// _bn_mul_mont_gather5 -- exported entry point.
// Register roles as used by the code below:
//   %rdi      output word array (written in L$sub and L$copy)
//   %rsi      first input word array, called a[] in these notes
//   %rdx      base of the table one multiplier word is gathered from per pass
//   %rcx      second input word array, called n[] in these notes
//   %r8       pointer to a single word, loaded once and used to derive m
//   %r9d      word count (num)
//   8(%rsp)   32-bit table index (first stack argument at entry)
// Returns 1 in %rax.
// NOTE(review): the name suggests Montgomery multiplication with n[] as the
// modulus and (%r8) as n0 -- confirm against the C prototype.
| 6 .globl _bn_mul_mont_gather5 |
| 7 |
| 8 .p2align 6 |
| 9 _bn_mul_mont_gather5: |
// Dispatch: a word count with any of its low three bits set takes the generic
// loop below; a multiple of 8 goes to the 4x path at L$mul4x_enter.
| 10 testl $7,%r9d |
| 11 jnz L$mul_enter |
| 12 jmp L$mul4x_enter |
| 13 |
| 14 .p2align 4 |
| 15 L$mul_enter: |
// Zero-extend num, remember the entry %rsp in %rax, fetch the table index,
// then save the callee-saved registers.
| 16 movl %r9d,%r9d |
| 17 movq %rsp,%rax |
| 18 movl 8(%rsp),%r10d |
| 19 pushq %rbx |
| 20 pushq %rbp |
| 21 pushq %r12 |
| 22 pushq %r13 |
| 23 pushq %r14 |
| 24 pushq %r15 |
// Reserve num+2 qwords for the temporary t[] and round %rsp down to a
// 1024-byte boundary. The entry %rsp is kept in slot t[num+1].
| 25 leaq 2(%r9),%r11 |
| 26 negq %r11 |
| 27 leaq (%rsp,%r11,8),%rsp |
| 28 andq $-1024,%rsp |
| 29 |
| 30 movq %rax,8(%rsp,%r9,8) |
| 31 L$mul_body: |
// Gather setup: %r11 = index & 7 selects the qword inside each 64-byte row,
// and (~(index >> 3)) & 3 selects which window of four consecutive qwords of
// L$magic_masks (defined outside this block) is loaded into %xmm4..%xmm7.
| 32 movq %rdx,%r12 |
| 33 movq %r10,%r11 |
| 34 shrq $3,%r10 |
| 35 andq $7,%r11 |
| 36 notq %r10 |
| 37 leaq L$magic_masks(%rip),%rax |
| 38 andq $3,%r10 |
| 39 leaq 96(%r12,%r11,8),%r12 |
| 40 movq 0(%rax,%r10,8),%xmm4 |
| 41 movq 8(%rax,%r10,8),%xmm5 |
| 42 movq 16(%rax,%r10,8),%xmm6 |
| 43 movq 24(%rax,%r10,8),%xmm7 |
| 44 |
// Gather one table word: read four candidates 64 bytes apart, AND each with
// its mask, OR the results into %xmm0, then step %r12 to the next 256-byte
// group. NOTE(review): presumably exactly one mask is all-ones so the same
// addresses are touched for every index -- confirm against L$magic_masks.
| 45 movq -96(%r12),%xmm0 |
| 46 movq -32(%r12),%xmm1 |
| 47 pand %xmm4,%xmm0 |
| 48 movq 32(%r12),%xmm2 |
| 49 pand %xmm5,%xmm1 |
| 50 movq 96(%r12),%xmm3 |
| 51 pand %xmm6,%xmm2 |
| 52 por %xmm1,%xmm0 |
| 53 pand %xmm7,%xmm3 |
| 54 por %xmm2,%xmm0 |
| 55 leaq 256(%r12),%r12 |
| 56 por %xmm3,%xmm0 |
| 57 |
// Hand-encoded movq %xmm0,%rbx (66 48 0F 7E C3): %rbx = gathered word b.
| 58 .byte 102,72,15,126,195 |
| 59 |
// From here %r8 holds the word itself instead of its address; %rax = a[0].
| 60 movq (%r8),%r8 |
| 61 movq (%rsi),%rax |
| 62 |
// %r14 = outer pass counter, %r15 = inner word counter.
| 63 xorq %r14,%r14 |
| 64 xorq %r15,%r15 |
| 65 |
// First pass. The gather of the next table word is interleaved with the
// integer work and is moved into %rbx only after the inner loop finishes.
| 66 movq -96(%r12),%xmm0 |
| 67 movq -32(%r12),%xmm1 |
| 68 pand %xmm4,%xmm0 |
| 69 movq 32(%r12),%xmm2 |
| 70 pand %xmm5,%xmm1 |
| 71 |
// %r10 = low(a[0]*b), %r11 = high(a[0]*b).
| 72 movq %r8,%rbp |
| 73 mulq %rbx |
| 74 movq %rax,%r10 |
| 75 movq (%rcx),%rax |
| 76 |
| 77 movq 96(%r12),%xmm3 |
| 78 pand %xmm6,%xmm2 |
| 79 por %xmm1,%xmm0 |
| 80 pand %xmm7,%xmm3 |
| 81 |
// m = (%r8 word) * low(a[0]*b), kept in %rbp for this pass.
| 82 imulq %r10,%rbp |
| 83 movq %rdx,%r11 |
| 84 |
| 85 por %xmm2,%xmm0 |
| 86 leaq 256(%r12),%r12 |
| 87 por %xmm3,%xmm0 |
| 88 |
// Add n[0]*m to the low word; only the carry out of that sum is kept (in
// %r13), the low word itself is not stored.
| 89 mulq %rbp |
| 90 addq %rax,%r10 |
| 91 movq 8(%rsi),%rax |
| 92 adcq $0,%rdx |
| 93 movq %rdx,%r13 |
| 94 |
| 95 leaq 1(%r15),%r15 |
| 96 jmp L$1st_enter |
| 97 |
| 98 .p2align 4 |
// Inner loop of the first pass. Each trip folds one a[j]*b product and one
// n[j]*m product into the running carries. %r15 was already advanced when
// the store executes, so -16(%rsp,%r15,8) lands one slot behind the word
// just multiplied.
| 99 L$1st: |
| 100 addq %rax,%r13 |
| 101 movq (%rsi,%r15,8),%rax |
| 102 adcq $0,%rdx |
| 103 addq %r11,%r13 |
| 104 movq %r10,%r11 |
| 105 adcq $0,%rdx |
| 106 movq %r13,-16(%rsp,%r15,8) |
| 107 movq %rdx,%r13 |
| 108 |
| 109 L$1st_enter: |
| 110 mulq %rbx |
| 111 addq %rax,%r11 |
| 112 movq (%rcx,%r15,8),%rax |
| 113 adcq $0,%rdx |
| 114 leaq 1(%r15),%r15 |
| 115 movq %rdx,%r10 |
| 116 |
| 117 mulq %rbp |
| 118 cmpq %r9,%r15 |
| 119 jne L$1st |
| 120 |
// Hand-encoded movq %xmm0,%rbx: switch to the table word for the next pass.
| 121 .byte 102,72,15,126,195 |
| 122 |
// Drain the last products, then write t[num-2], t[num-1] and the carry word
// t[num]. %rax is reloaded with a[0] for the next pass.
| 123 addq %rax,%r13 |
| 124 movq (%rsi),%rax |
| 125 adcq $0,%rdx |
| 126 addq %r11,%r13 |
| 127 adcq $0,%rdx |
| 128 movq %r13,-16(%rsp,%r15,8) |
| 129 movq %rdx,%r13 |
| 130 movq %r10,%r11 |
| 131 |
| 132 xorq %rdx,%rdx |
| 133 addq %r11,%r13 |
| 134 adcq $0,%rdx |
| 135 movq %r13,-8(%rsp,%r9,8) |
| 136 movq %rdx,(%rsp,%r9,8) |
| 137 |
| 138 leaq 1(%r14),%r14 |
| 139 jmp L$outer |
| 140 .p2align 4 |
// Remaining passes (repeated while %r14 < num). Same shape as the first pass,
// except that the existing t[] words are added in as well.
| 141 L$outer: |
| 142 xorq %r15,%r15 |
| 143 movq %r8,%rbp |
| 144 movq (%rsp),%r10 |
| 145 |
| 146 movq -96(%r12),%xmm0 |
| 147 movq -32(%r12),%xmm1 |
| 148 pand %xmm4,%xmm0 |
| 149 movq 32(%r12),%xmm2 |
| 150 pand %xmm5,%xmm1 |
| 151 |
// %r10 = t[0] + low(a[0]*b); the carry goes into the high half.
| 152 mulq %rbx |
| 153 addq %rax,%r10 |
| 154 movq (%rcx),%rax |
| 155 adcq $0,%rdx |
| 156 |
| 157 movq 96(%r12),%xmm3 |
| 158 pand %xmm6,%xmm2 |
| 159 por %xmm1,%xmm0 |
| 160 pand %xmm7,%xmm3 |
| 161 |
// m = (%r8 word) * %r10 for this pass.
| 162 imulq %r10,%rbp |
| 163 movq %rdx,%r11 |
| 164 |
| 165 por %xmm2,%xmm0 |
| 166 leaq 256(%r12),%r12 |
| 167 por %xmm3,%xmm0 |
| 168 |
// Add n[0]*m; again only the carry survives. %r10 is then reloaded with t[1].
| 169 mulq %rbp |
| 170 addq %rax,%r10 |
| 171 movq 8(%rsi),%rax |
| 172 adcq $0,%rdx |
| 173 movq 8(%rsp),%r10 |
| 174 movq %rdx,%r13 |
| 175 |
| 176 leaq 1(%r15),%r15 |
| 177 jmp L$inner_enter |
| 178 |
| 179 .p2align 4 |
// Inner loop of the later passes: %r10 carries the previous t[] word that is
// being accumulated into, and the finished word is stored one slot behind.
| 180 L$inner: |
| 181 addq %rax,%r13 |
| 182 movq (%rsi,%r15,8),%rax |
| 183 adcq $0,%rdx |
| 184 addq %r10,%r13 |
| 185 movq (%rsp,%r15,8),%r10 |
| 186 adcq $0,%rdx |
| 187 movq %r13,-16(%rsp,%r15,8) |
| 188 movq %rdx,%r13 |
| 189 |
| 190 L$inner_enter: |
| 191 mulq %rbx |
| 192 addq %rax,%r11 |
| 193 movq (%rcx,%r15,8),%rax |
| 194 adcq $0,%rdx |
| 195 addq %r11,%r10 |
| 196 movq %rdx,%r11 |
| 197 adcq $0,%r11 |
| 198 leaq 1(%r15),%r15 |
| 199 |
| 200 mulq %rbp |
| 201 cmpq %r9,%r15 |
| 202 jne L$inner |
| 203 |
// Hand-encoded movq %xmm0,%rbx: table word for the next pass.
| 204 .byte 102,72,15,126,195 |
| 205 |
// Drain the pass: the last product words plus the old t[num] are written
// back as t[num-2], t[num-1] and the new carry word t[num].
| 206 addq %rax,%r13 |
| 207 movq (%rsi),%rax |
| 208 adcq $0,%rdx |
| 209 addq %r10,%r13 |
| 210 movq (%rsp,%r15,8),%r10 |
| 211 adcq $0,%rdx |
| 212 movq %r13,-16(%rsp,%r15,8) |
| 213 movq %rdx,%r13 |
| 214 |
| 215 xorq %rdx,%rdx |
| 216 addq %r11,%r13 |
| 217 adcq $0,%rdx |
| 218 addq %r10,%r13 |
| 219 adcq $0,%rdx |
| 220 movq %r13,-8(%rsp,%r9,8) |
| 221 movq %rdx,(%rsp,%r9,8) |
| 222 |
| 223 leaq 1(%r14),%r14 |
| 224 cmpq %r9,%r14 |
| 225 jb L$outer |
| 226 |
// Final subtraction: output[i] = t[i] - n[i], with the borrow chained through
// CF. The xorq clears CF for the first sbbq; the mov/lea/dec instructions
// between iterations leave CF unchanged. %rsi is repointed at t[].
| 227 xorq %r14,%r14 |
| 228 movq (%rsp),%rax |
| 229 leaq (%rsp),%rsi |
| 230 movq %r9,%r15 |
| 231 jmp L$sub |
| 232 .p2align 4 |
| 233 L$sub: sbbq (%rcx,%r14,8),%rax |
| 234 movq %rax,(%rdi,%r14,8) |
| 235 movq 8(%rsi,%r14,8),%rax |
| 236 leaq 1(%r14),%r14 |
| 237 decq %r15 |
| 238 jnz L$sub |
| 239 |
// %rax holds t[num] here (loaded on the last trip). Subtracting the final
// borrow from it produces the select mask used by L$copy.
// NOTE(review): expected to be either 0 or all-ones -- confirm.
| 240 sbbq $0,%rax |
| 241 xorq %r14,%r14 |
| 242 movq %r9,%r15 |
| 243 .p2align 4 |
// Branch-free select per word: output[i] = output[i] ^ ((t[i] ^ output[i]) & mask),
// i.e. t[i] where mask bits are 1 and the subtracted value where they are 0.
// t[i] is overwritten with the loop index, presumably to scrub the temporary.
| 244 L$copy: |
| 245 movq (%rsp,%r14,8),%rsi |
| 246 movq (%rdi,%r14,8),%rcx |
| 247 xorq %rcx,%rsi |
| 248 andq %rax,%rsi |
| 249 xorq %rcx,%rsi |
| 250 movq %r14,(%rsp,%r14,8) |
| 251 movq %rsi,(%rdi,%r14,8) |
| 252 leaq 1(%r14),%r14 |
| 253 subq $1,%r15 |
| 254 jnz L$copy |
| 255 |
// Epilogue: reload the entry %rsp saved in slot num+1, set the return value
// to 1, and restore the callee-saved registers from just below the entry %rsp.
| 256 movq 8(%rsp,%r9,8),%rsi |
| 257 movq $1,%rax |
| 258 movq -48(%rsi),%r15 |
| 259 movq -40(%rsi),%r14 |
| 260 movq -32(%rsi),%r13 |
| 261 movq -24(%rsi),%r12 |
| 262 movq -16(%rsi),%rbp |
| 263 movq -8(%rsi),%rbx |
| 264 leaq (%rsi),%rsp |
| 265 L$mul_epilogue: |
// F3 C3 = rep ret.
| 266 .byte 0xf3,0xc3 |
| 267 |
| 268 |
// bn_mul4x_mont_gather5 -- wrapper that builds a stack frame and calls
// mul4x_internal. It is entered through L$mul4x_enter, the target the
// _bn_mul_mont_gather5 dispatcher jumps to when the word count is a multiple
// of 8. The incoming registers are the ones _bn_mul_mont_gather5 received;
// only %r9 (word count) and %rsi are inspected here. Returns 1 in %rax.
| 269 .p2align 5 |
| 270 bn_mul4x_mont_gather5: |
| 271 L$mul4x_enter: |
// 0x67 is the address-size prefix; on the register-only instruction that
// follows it changes nothing (presumably code-alignment padding).
| 272 .byte 0x67 |
// %rax keeps the entry %rsp for the epilogue and for mul4x_internal.
| 273 movq %rsp,%rax |
| 274 pushq %rbx |
| 275 pushq %rbp |
| 276 pushq %r12 |
| 277 pushq %r13 |
| 278 pushq %r14 |
| 279 pushq %r15 |
| 280 .byte 0x67 |
// %r9 = -(8*num), the operand length in bytes, negated; %r10 = 32*num.
| 281 movl %r9d,%r10d |
| 282 shll $3,%r9d |
| 283 shll $3+2,%r10d |
| 284 negq %r9 |
| 285 |
| 286 |
| 287 |
| 288 |
| 289 |
| 290 |
| 291 |
| 292 |
// Frame placement. The candidate frame base is %rsp - 64 - 2*len, and
// %r11 = (candidate - %rsi) mod 4096. If 32*num >= %r11 the frame is lowered
// by %r11, so that frame base and %rsi agree modulo 4096 before the final
// 64-byte alignment. Otherwise control goes to L$mul4xsp_alt.
// NOTE(review): presumably done so the frame sits at a controlled distance
// from the operand at %rsi within a 4 KiB page -- confirm the intent.
| 293 leaq -64(%rsp,%r9,2),%r11 |
| 294 subq %rsi,%r11 |
| 295 andq $4095,%r11 |
| 296 cmpq %r11,%r10 |
| 297 jb L$mul4xsp_alt |
| 298 subq %r11,%rsp |
| 299 leaq -64(%rsp,%r9,2),%rsp |
| 300 jmp L$mul4xsp_done |
| 301 |
| 302 .p2align 5 |
// Alternative placement: lower the frame by max(0, %r11 - (4096 - 64 - 2*len)).
// The cmovc zeroes the adjustment when the subtraction borrows.
| 303 L$mul4xsp_alt: |
| 304 leaq 4096-64(,%r9,2),%r10 |
| 305 leaq -64(%rsp,%r9,2),%rsp |
| 306 subq %r10,%r11 |
| 307 movq $0,%r10 |
| 308 cmovcq %r10,%r11 |
| 309 subq %r11,%rsp |
| 310 L$mul4xsp_done: |
// Align the frame to 64 bytes and make %r9 positive again (8*num).
| 311 andq $-64,%rsp |
| 312 negq %r9 |
| 313 |
// Frame slot 40(%rsp) holds the entry %rsp.
| 314 movq %rax,40(%rsp) |
| 315 L$mul4x_body: |
| 316 |
// %rax still equals the entry %rsp here; mul4x_internal reads the table
// index through 8(%rax).
| 317 call mul4x_internal |
| 318 |
// Epilogue: recover the entry %rsp, return 1, and restore the callee-saved
// registers pushed on entry.
| 319 movq 40(%rsp),%rsi |
| 320 movq $1,%rax |
| 321 movq -48(%rsi),%r15 |
| 322 movq -40(%rsi),%r14 |
| 323 movq -32(%rsi),%r13 |
| 324 movq -24(%rsi),%r12 |
| 325 movq -16(%rsi),%rbp |
| 326 movq -8(%rsi),%rbx |
| 327 leaq (%rsi),%rsp |
| 328 L$mul4x_epilogue: |
// F3 C3 = rep ret.
| 329 .byte 0xf3,0xc3 |
| 330 |
| 331 |
| 332 |
// mul4x_internal -- multiply/accumulate body shared by bn_mul4x_mont_gather5
// and _bn_power5; it processes four word positions per loop trip.
// Inputs as read by the code below:
//   %rax   caller's entry %rsp, so 8(%rax) is the 32-bit table index
//   %rdx   table base
//   %r9    operand length in bytes (both callers pass 8*num)
//   %rsi   first input word array, called a[] in these notes
//   %rcx   second input array n[], read at a 16-byte stride (every other qword)
//   %r8    pointer to a single word, loaded once and used to derive m
//   %rdi   output pointer; only parked at 56+8(%rsp) and reloaded at the end
// Frame offsets are written as X+8, presumably because the call that entered
// this routine pushed a return address onto the caller's frame.
// Does not return by itself: it ends with a jump to L$sqr4x_sub, which is
// defined outside this block.
| 333 .p2align 5 |
| 334 mul4x_internal: |
// %r13 = table + 256 + 32*len: the value %r12 is compared against to end the
// outer loop. %r9 is shifted back to len afterwards.
| 335 shlq $5,%r9 |
| 336 movl 8(%rax),%r10d |
| 337 leaq 256(%rdx,%r9,1),%r13 |
| 338 shrq $5,%r9 |
// Gather setup: %r11 = index & 7 selects the qword inside each 64-byte row;
// (~(index >> 3)) & 3 selects the window of four consecutive L$magic_masks
// qwords (defined outside this block) loaded into %xmm4..%xmm7.
| 339 movq %r10,%r11 |
| 340 shrq $3,%r10 |
| 341 andq $7,%r11 |
| 342 notq %r10 |
| 343 leaq L$magic_masks(%rip),%rax |
| 344 andq $3,%r10 |
| 345 leaq 96(%rdx,%r11,8),%r12 |
| 346 movq 0(%rax,%r10,8),%xmm4 |
| 347 movq 8(%rax,%r10,8),%xmm5 |
// %r11 becomes ((index & 7) + 7) & 7; it offsets the start of t[] below.
| 348 addq $7,%r11 |
| 349 movq 16(%rax,%r10,8),%xmm6 |
| 350 movq 24(%rax,%r10,8),%xmm7 |
| 351 andq $7,%r11 |
| 352 |
// Gather the first table word into %xmm0 (four masked candidates 64 bytes
// apart, OR-ed together) while starting the gather of the second word from
// the next 256-byte group through %r14. The 0x67 bytes are address-size
// prefixes in front of register-only instructions (presumably padding).
| 353 movq -96(%r12),%xmm0 |
| 354 leaq 256(%r12),%r14 |
| 355 movq -32(%r12),%xmm1 |
| 356 pand %xmm4,%xmm0 |
| 357 movq 32(%r12),%xmm2 |
| 358 pand %xmm5,%xmm1 |
| 359 movq 96(%r12),%xmm3 |
| 360 pand %xmm6,%xmm2 |
| 361 .byte 0x67 |
| 362 por %xmm1,%xmm0 |
| 363 movq -96(%r14),%xmm1 |
| 364 .byte 0x67 |
| 365 pand %xmm7,%xmm3 |
| 366 .byte 0x67 |
| 367 por %xmm2,%xmm0 |
| 368 movq -32(%r14),%xmm2 |
| 369 .byte 0x67 |
| 370 pand %xmm4,%xmm1 |
| 371 .byte 0x67 |
| 372 por %xmm3,%xmm0 |
| 373 movq 32(%r14),%xmm3 |
| 374 |
// Hand-encoded movq %xmm0,%rbx (66 48 0F 7E C3): %rbx = first table word b.
| 375 .byte 102,72,15,126,195 |
| 376 movq 96(%r14),%xmm0 |
// Park the loop bound and the output pointer in the frame.
| 377 movq %r13,16+8(%rsp) |
| 378 movq %rdi,56+8(%rsp) |
| 379 |
// %r8 = the word itself; %rsi is moved to the end of a[] and %r9 negated, so
// (%rsi,%r9,1) addresses a[0] and negative offsets count up towards zero.
| 380 movq (%r8),%r8 |
| 381 movq (%rsi),%rax |
| 382 leaq (%rsi,%r9,1),%rsi |
| 383 negq %r9 |
| 384 |
// First pass: %r10 = low(a[0]*b).
| 385 movq %r8,%rbp |
| 386 mulq %rbx |
| 387 movq %rax,%r10 |
| 388 movq (%rcx),%rax |
| 389 |
| 390 pand %xmm5,%xmm2 |
| 391 pand %xmm6,%xmm3 |
| 392 por %xmm2,%xmm1 |
| 393 |
// m = (%r8 word) * low(a[0]*b), kept in %rbp for this pass.
| 394 imulq %r10,%rbp |
| 395 |
| 396 |
| 397 |
| 398 |
| 399 |
| 400 |
| 401 |
// %r14 = store pointer into the temporary t[], which starts %r11 qwords past
// frame offset 64+8. %r11 is then reused for high(a[0]*b).
| 402 leaq 64+8(%rsp,%r11,8),%r14 |
| 403 movq %rdx,%r11 |
| 404 |
// Finish gathering the second table word into %xmm0; two 256-byte groups
// have now been consumed, so %r12 advances by 512.
| 405 pand %xmm7,%xmm0 |
| 406 por %xmm3,%xmm1 |
| 407 leaq 512(%r12),%r12 |
| 408 por %xmm1,%xmm0 |
| 409 
| 410 mulq %rbp |
| 411 addq %rax,%r10 |
| 412 movq 8(%rsi,%r9,1),%rax |
| 413 adcq $0,%rdx |
| 414 movq %rdx,%rdi |
| 415 |
| 416 mulq %rbx |
| 417 addq %rax,%r11 |
| 418 movq 16(%rcx),%rax |
| 419 adcq $0,%rdx |
| 420 movq %rdx,%r10 |
| 421 |
| 422 mulq %rbp |
| 423 addq %rax,%rdi |
| 424 movq 16(%rsi,%r9,1),%rax |
| 425 adcq $0,%rdx |
| 426 addq %r11,%rdi |
// %r15 = -len + 32: negative byte counter for the unrolled loop.
| 427 leaq 32(%r9),%r15 |
| 428 leaq 64(%rcx),%rcx |
| 429 adcq $0,%rdx |
| 430 movq %rdi,(%r14) |
| 431 movq %rdx,%r13 |
| 432 jmp L$1st4x |
| 433 |
| 434 .p2align 5 |
// First-pass inner loop, unrolled four words per trip. %r10/%r11 alternate
// as the a[j]*b carry and %r13/%rdi alternate as the n[j]*m carry; finished
// words go to -24, -16, -8 and 0 relative to the advanced %r14. The loop
// ends when %r15 reaches zero.
| 435 L$1st4x: |
| 436 mulq %rbx |
| 437 addq %rax,%r10 |
| 438 movq -32(%rcx),%rax |
| 439 leaq 32(%r14),%r14 |
| 440 adcq $0,%rdx |
| 441 movq %rdx,%r11 |
| 442 |
| 443 mulq %rbp |
| 444 addq %rax,%r13 |
| 445 movq -8(%rsi,%r15,1),%rax |
| 446 adcq $0,%rdx |
| 447 addq %r10,%r13 |
| 448 adcq $0,%rdx |
| 449 movq %r13,-24(%r14) |
| 450 movq %rdx,%rdi |
| 451 |
| 452 mulq %rbx |
| 453 addq %rax,%r11 |
| 454 movq -16(%rcx),%rax |
| 455 adcq $0,%rdx |
| 456 movq %rdx,%r10 |
| 457 |
| 458 mulq %rbp |
| 459 addq %rax,%rdi |
| 460 movq (%rsi,%r15,1),%rax |
| 461 adcq $0,%rdx |
| 462 addq %r11,%rdi |
| 463 adcq $0,%rdx |
| 464 movq %rdi,-16(%r14) |
| 465 movq %rdx,%r13 |
| 466 |
| 467 mulq %rbx |
| 468 addq %rax,%r10 |
| 469 movq 0(%rcx),%rax |
| 470 adcq $0,%rdx |
| 471 movq %rdx,%r11 |
| 472 |
| 473 mulq %rbp |
| 474 addq %rax,%r13 |
| 475 movq 8(%rsi,%r15,1),%rax |
| 476 adcq $0,%rdx |
| 477 addq %r10,%r13 |
| 478 adcq $0,%rdx |
| 479 movq %r13,-8(%r14) |
| 480 movq %rdx,%rdi |
| 481 |
| 482 mulq %rbx |
| 483 addq %rax,%r11 |
| 484 movq 16(%rcx),%rax |
| 485 adcq $0,%rdx |
| 486 movq %rdx,%r10 |
| 487 |
| 488 mulq %rbp |
| 489 addq %rax,%rdi |
| 490 movq 16(%rsi,%r15,1),%rax |
| 491 adcq $0,%rdx |
| 492 addq %r11,%rdi |
| 493 leaq 64(%rcx),%rcx |
| 494 adcq $0,%rdx |
| 495 movq %rdi,(%r14) |
| 496 movq %rdx,%r13 |
| 497 |
| 498 addq $32,%r15 |
| 499 jnz L$1st4x |
| 500 |
// Tail of the first pass: the last products are drained and %rax is reloaded
// with a[0] for the next pass.
| 501 mulq %rbx |
| 502 addq %rax,%r10 |
| 503 movq -32(%rcx),%rax |
| 504 leaq 32(%r14),%r14 |
| 505 adcq $0,%rdx |
| 506 movq %rdx,%r11 |
| 507 |
| 508 mulq %rbp |
| 509 addq %rax,%r13 |
| 510 movq -8(%rsi),%rax |
| 511 adcq $0,%rdx |
| 512 addq %r10,%r13 |
| 513 adcq $0,%rdx |
| 514 movq %r13,-24(%r14) |
| 515 movq %rdx,%rdi |
| 516 |
| 517 mulq %rbx |
| 518 addq %rax,%r11 |
| 519 movq -16(%rcx),%rax |
| 520 adcq $0,%rdx |
| 521 movq %rdx,%r10 |
| 522 |
| 523 mulq %rbp |
| 524 addq %rax,%rdi |
| 525 movq (%rsi,%r9,1),%rax |
| 526 adcq $0,%rdx |
| 527 addq %r11,%rdi |
| 528 adcq $0,%rdx |
| 529 movq %rdi,-16(%r14) |
| 530 movq %rdx,%r13 |
| 531 |
// Hand-encoded movq %xmm0,%rbx: switch to the second table word. %rcx is
// rewound by 2*len bytes (%r9 is negative here), back to the start of n[].
| 532 .byte 102,72,15,126,195 |
| 533 leaq (%rcx,%r9,2),%rcx |
| 534 |
// %rdi = carry word of the pass; it is stored to (%r14) at the top of the
// next pass.
| 535 xorq %rdi,%rdi |
| 536 addq %r10,%r13 |
| 537 adcq $0,%rdi |
| 538 movq %r13,-8(%r14) |
| 539 |
| 540 jmp L$outer4x |
| 541 |
| 542 .p2align 5 |
// Remaining passes. %r14 points one word past the last t[] word written, so
// (%r14,%r9,1) is the first t[] word and (%r14) receives the previous pass's
// carry word. The next table word is gathered in parallel.
| 543 L$outer4x: |
| 544 movq (%r14,%r9,1),%r10 |
| 545 movq %r8,%rbp |
| 546 mulq %rbx |
| 547 addq %rax,%r10 |
| 548 movq (%rcx),%rax |
| 549 adcq $0,%rdx |
| 550 |
| 551 movq -96(%r12),%xmm0 |
| 552 movq -32(%r12),%xmm1 |
| 553 pand %xmm4,%xmm0 |
| 554 movq 32(%r12),%xmm2 |
| 555 pand %xmm5,%xmm1 |
| 556 movq 96(%r12),%xmm3 |
| 557 |
// m = (%r8 word) * (first t[] word + low(a[0]*b)) for this pass.
| 558 imulq %r10,%rbp |
| 559 .byte 0x67 |
| 560 movq %rdx,%r11 |
| 561 movq %rdi,(%r14) |
| 562 |
| 563 pand %xmm6,%xmm2 |
| 564 por %xmm1,%xmm0 |
| 565 pand %xmm7,%xmm3 |
| 566 por %xmm2,%xmm0 |
// Rewind %r14 to the start of t[] and advance the table pointer.
| 567 leaq (%r14,%r9,1),%r14 |
| 568 leaq 256(%r12),%r12 |
| 569 por %xmm3,%xmm0 |
| 570 |
| 571 mulq %rbp |
| 572 addq %rax,%r10 |
| 573 movq 8(%rsi,%r9,1),%rax |
| 574 adcq $0,%rdx |
| 575 movq %rdx,%rdi |
| 576 |
| 577 mulq %rbx |
| 578 addq %rax,%r11 |
| 579 movq 16(%rcx),%rax |
| 580 adcq $0,%rdx |
| 581 addq 8(%r14),%r11 |
| 582 adcq $0,%rdx |
| 583 movq %rdx,%r10 |
| 584 |
| 585 mulq %rbp |
| 586 addq %rax,%rdi |
| 587 movq 16(%rsi,%r9,1),%rax |
| 588 adcq $0,%rdx |
| 589 addq %r11,%rdi |
| 590 leaq 32(%r9),%r15 |
| 591 leaq 64(%rcx),%rcx |
| 592 adcq $0,%rdx |
| 593 movq %rdx,%r13 |
| 594 jmp L$inner4x |
| 595 |
| 596 .p2align 5 |
// Inner loop of the later passes, four words per trip. Same carry rotation as
// L$1st4x, with the existing t[] word added to each a[j]*b product; each
// finished word is written one slot below the t[] word most recently read.
| 597 L$inner4x: |
| 598 mulq %rbx |
| 599 addq %rax,%r10 |
| 600 movq -32(%rcx),%rax |
| 601 adcq $0,%rdx |
| 602 addq 16(%r14),%r10 |
| 603 leaq 32(%r14),%r14 |
| 604 adcq $0,%rdx |
| 605 movq %rdx,%r11 |
| 606 |
| 607 mulq %rbp |
| 608 addq %rax,%r13 |
| 609 movq -8(%rsi,%r15,1),%rax |
| 610 adcq $0,%rdx |
| 611 addq %r10,%r13 |
| 612 adcq $0,%rdx |
| 613 movq %rdi,-32(%r14) |
| 614 movq %rdx,%rdi |
| 615 |
| 616 mulq %rbx |
| 617 addq %rax,%r11 |
| 618 movq -16(%rcx),%rax |
| 619 adcq $0,%rdx |
| 620 addq -8(%r14),%r11 |
| 621 adcq $0,%rdx |
| 622 movq %rdx,%r10 |
| 623 |
| 624 mulq %rbp |
| 625 addq %rax,%rdi |
| 626 movq (%rsi,%r15,1),%rax |
| 627 adcq $0,%rdx |
| 628 addq %r11,%rdi |
| 629 adcq $0,%rdx |
| 630 movq %r13,-24(%r14) |
| 631 movq %rdx,%r13 |
| 632 |
| 633 mulq %rbx |
| 634 addq %rax,%r10 |
| 635 movq 0(%rcx),%rax |
| 636 adcq $0,%rdx |
| 637 addq (%r14),%r10 |
| 638 adcq $0,%rdx |
| 639 movq %rdx,%r11 |
| 640 |
| 641 mulq %rbp |
| 642 addq %rax,%r13 |
| 643 movq 8(%rsi,%r15,1),%rax |
| 644 adcq $0,%rdx |
| 645 addq %r10,%r13 |
| 646 adcq $0,%rdx |
| 647 movq %rdi,-16(%r14) |
| 648 movq %rdx,%rdi |
| 649 |
| 650 mulq %rbx |
| 651 addq %rax,%r11 |
| 652 movq 16(%rcx),%rax |
| 653 adcq $0,%rdx |
| 654 addq 8(%r14),%r11 |
| 655 adcq $0,%rdx |
| 656 movq %rdx,%r10 |
| 657 |
| 658 mulq %rbp |
| 659 addq %rax,%rdi |
| 660 movq 16(%rsi,%r15,1),%rax |
| 661 adcq $0,%rdx |
| 662 addq %r11,%rdi |
| 663 leaq 64(%rcx),%rcx |
| 664 adcq $0,%rdx |
| 665 movq %r13,-8(%r14) |
| 666 movq %rdx,%r13 |
| 667 |
| 668 addq $32,%r15 |
| 669 jnz L$inner4x |
| 670 |
// Tail of a later pass.
| 671 mulq %rbx |
| 672 addq %rax,%r10 |
| 673 movq -32(%rcx),%rax |
| 674 adcq $0,%rdx |
| 675 addq 16(%r14),%r10 |
| 676 leaq 32(%r14),%r14 |
| 677 adcq $0,%rdx |
| 678 movq %rdx,%r11 |
| 679 |
| 680 mulq %rbp |
| 681 addq %rax,%r13 |
| 682 movq -8(%rsi),%rax |
| 683 adcq $0,%rdx |
| 684 addq %r10,%r13 |
| 685 adcq $0,%rdx |
| 686 movq %rdi,-32(%r14) |
| 687 movq %rdx,%rdi |
| 688 |
// Operand swap for the last n-word product: m moves to %rax and the last
// n[] word is loaded into %rbp, so %rbp still holds that word after the
// loop (it is used by the comparison below).
| 689 mulq %rbx |
| 690 addq %rax,%r11 |
| 691 movq %rbp,%rax |
| 692 movq -16(%rcx),%rbp |
| 693 adcq $0,%rdx |
| 694 addq -8(%r14),%r11 |
| 695 adcq $0,%rdx |
| 696 movq %rdx,%r10 |
| 697 |
| 698 mulq %rbp |
| 699 addq %rax,%rdi |
| 700 movq (%rsi,%r9,1),%rax |
| 701 adcq $0,%rdx |
| 702 addq %r11,%rdi |
| 703 adcq $0,%rdx |
| 704 movq %r13,-24(%r14) |
| 705 movq %rdx,%r13 |
| 706 |
// Hand-encoded movq %xmm0,%rbx: table word for the next pass; rewind %rcx to
// the start of n[].
| 707 .byte 102,72,15,126,195 |
| 708 movq %rdi,-16(%r14) |
| 709 leaq (%rcx,%r9,2),%rcx |
| 710 |
// Top word = carries + old carry word at (%r14); %rdi = new carry word.
| 711 xorq %rdi,%rdi |
| 712 addq %r10,%r13 |
| 713 adcq $0,%rdi |
| 714 addq (%r14),%r13 |
| 715 adcq $0,%rdi |
| 716 movq %r13,-8(%r14) |
| 717 |
// Another pass while the table pointer is below the bound saved on entry.
| 718 cmpq 16+8(%rsp),%r12 |
| 719 jb L$outer4x |
// Hand-off to L$sqr4x_sub (outside this block). %rbp holds the last n[] word
// and %r13 the top accumulated word; the borrow of (%rbp - %r13) lands in
// %r15 (zero on loop exit), is OR-ed with the carry word in %rdi, and
// (%rdi ^ 1) then picks %rcx or %rcx + 8 as the pointer passed in %rbp.
// Also passed: %rbx = start of t[], %rcx = %r9 >> 5 (arithmetic shift of the
// negative byte length), %rdi = the output pointer saved on entry.
// NOTE(review): the contract of L$sqr4x_sub is not visible here -- confirm
// these register roles against its definition.
| 720 subq %r13,%rbp |
| 721 adcq %r15,%r15 |
| 722 orq %r15,%rdi |
| 723 xorq $1,%rdi |
| 724 leaq (%r14,%r9,1),%rbx |
| 725 leaq (%rcx,%rdi,8),%rbp |
| 726 movq %r9,%rcx |
| 727 sarq $3+2,%rcx |
| 728 movq 56+8(%rsp),%rdi |
| 729 jmp L$sqr4x_sub |
| 730 |
// _bn_power5 -- exported entry point.
// Visible behaviour: builds a 64-byte-aligned stack frame sized from the word
// count in %r9d, calls __bn_sqr8x_internal five times in a row, then calls
// mul4x_internal once and returns 1 in %rax.
// Inputs as used below: %rdi, %rcx and %rdx are stashed in xmm registers
// across the squarings; %rsi is used for frame placement; %r8 points to a
// single word that is copied into the frame; 8(%rsp) at entry is later read
// by mul4x_internal through %rax.
// NOTE(review): presumably five Montgomery squarings followed by one
// multiplication by a gathered table entry -- confirm with the C prototype.
| 731 .globl _bn_power5 |
| 732 |
| 733 .p2align 5 |
| 734 _bn_power5: |
// Remember the entry %rsp in %rax and save the callee-saved registers.
| 735 movq %rsp,%rax |
| 736 pushq %rbx |
| 737 pushq %rbp |
| 738 pushq %r12 |
| 739 pushq %r13 |
| 740 pushq %r14 |
| 741 pushq %r15 |
// %r9 = -(8*num), %r10 = 32*num, %r8 = the word %r8 pointed to.
| 742 movl %r9d,%r10d |
| 743 shll $3,%r9d |
| 744 shll $3+2,%r10d |
| 745 negq %r9 |
| 746 movq (%r8),%r8 |
| 747 |
| 748 |
| 749 |
| 750 |
| 751 |
| 752 |
| 753 |
// Frame placement, same scheme as L$mul4x_enter: %r11 = (candidate frame
// base - %rsi) mod 4096. When 32*num >= %r11 the frame is lowered by %r11;
// otherwise L$pwr_sp_alt lowers it by max(0, %r11 - (4096 - 64 - 2*len)).
| 754 leaq -64(%rsp,%r9,2),%r11 |
| 755 subq %rsi,%r11 |
| 756 andq $4095,%r11 |
| 757 cmpq %r11,%r10 |
| 758 jb L$pwr_sp_alt |
| 759 subq %r11,%rsp |
| 760 leaq -64(%rsp,%r9,2),%rsp |
| 761 jmp L$pwr_sp_done |
| 762 |
| 763 .p2align 5 |
| 764 L$pwr_sp_alt: |
| 765 leaq 4096-64(,%r9,2),%r10 |
| 766 leaq -64(%rsp,%r9,2),%rsp |
| 767 subq %r10,%r11 |
| 768 movq $0,%r10 |
| 769 cmovcq %r10,%r11 |
| 770 subq %r11,%rsp |
| 771 L$pwr_sp_done: |
// Align to 64 bytes. %r10 keeps the negative byte length, %r9 the positive.
| 772 andq $-64,%rsp |
| 773 movq %r9,%r10 |
| 774 negq %r9 |
| 775 |
| 776 |
| 777 |
| 778 |
| 779 |
| 780 |
| 781 |
| 782 |
| 783 |
| 784 |
// Frame slots: 32(%rsp) = the word loaded from (%r8), 40(%rsp) = entry %rsp.
| 785 movq %r8,32(%rsp) |
| 786 movq %rax,40(%rsp) |
| 787 L$power5_body: |
// Hand-encoded 64-bit GPR-to-xmm moves (66 REX.W 0F 6E /r), in order:
//   movq %rdi,%xmm1 ; movq %rcx,%xmm2 ; movq %r10,%xmm3 ; movq %rdx,%xmm4
| 788 .byte 102,72,15,110,207 |
| 789 .byte 102,72,15,110,209 |
| 790 .byte 102,73,15,110,218 |
| 791 .byte 102,72,15,110,226 |
| 792 |
// Five consecutive calls to the squaring routine.
| 793 call __bn_sqr8x_internal |
| 794 call __bn_sqr8x_internal |
| 795 call __bn_sqr8x_internal |
| 796 call __bn_sqr8x_internal |
| 797 call __bn_sqr8x_internal |
| 798 |
// Hand-encoded movq %xmm2,%rcx and movq %xmm4,%rdx: bring back the two
// pointers stashed before the squarings. Then set up mul4x_internal's
// inputs: %rdi = current %rsi, %rax = entry %rsp (so 8(%rax) is the table
// index), %r8 = address of the frame copy of the single word, because
// mul4x_internal dereferences %r8.
| 799 .byte 102,72,15,126,209 |
| 800 .byte 102,72,15,126,226 |
| 801 movq %rsi,%rdi |
| 802 movq 40(%rsp),%rax |
| 803 leaq 32(%rsp),%r8 |
| 804 |
| 805 call mul4x_internal |
| 806 |
// Epilogue: recover the entry %rsp, return 1, and restore the callee-saved
// registers pushed on entry.
| 807 movq 40(%rsp),%rsi |
| 808 movq $1,%rax |
| 809 movq -48(%rsi),%r15 |
| 810 movq -40(%rsi),%r14 |
| 811 movq -32(%rsi),%r13 |
| 812 movq -24(%rsi),%r12 |
| 813 movq -16(%rsi),%rbp |
| 814 movq -8(%rsi),%rbx |
| 815 leaq (%rsi),%rsp |
| 816 L$power5_epilogue: |
// F3 C3 = rep ret.
| 817 .byte 0xf3,0xc3 |
| 818 |
| 819 |
| 820 .globl _bn_sqr8x_internal |
| 821 .private_extern _bn_sqr8x_internal |
| 822 |
| 823 .p2align 5 |
| 824 _bn_sqr8x_internal: |
| 825 __bn_sqr8x_internal: |
| 826 |
| 827 |
| 828 |
| 829 |
| 830 |
| 831 |
| 832 |
| 833 |
| 834 |
| 835 |
| 836 |
| 837 |
| 838 |
| 839 |
| 840 |
| 841 |
| 842 |
| 843 |
| 844 |
| 845 |
| 846 |
| 847 |
| 848 |
| 849 |
| 850 |
| 851 |
| 852 |
| 853 |
| 854 |
| 855 |
| 856 |
| 857 |
| 858 |
| 859 |
| 860 |
| 861 |
| 862 |
| 863 |
| 864 |
| 865 |
| 866 |
| 867 |
| 868 |
| 869 |
| 870 |
| 871 |
| 872 |
| 873 |
| 874 |
| 875 |
| 876 |
| 877 |
| 878 |
| 879 |
| 880 |
| 881 |
| 882 |
| 883 |
| 884 |
| 885 |
| 886 |
| 887 |
| 888 |
| 889 |
| 890 |
| 891 |
| 892 |
| 893 |
| 894 |
| 895 |
| 896 |
| 897 |
| 898 |
| 899 leaq 32(%r10),%rbp |
| 900 leaq (%rsi,%r9,1),%rsi |
| 901 |
| 902 movq %r9,%rcx |
| 903 |
| 904 |
| 905 movq -32(%rsi,%rbp,1),%r14 |
| 906 leaq 48+8(%rsp,%r9,2),%rdi |
| 907 movq -24(%rsi,%rbp,1),%rax |
| 908 leaq -32(%rdi,%rbp,1),%rdi |
| 909 movq -16(%rsi,%rbp,1),%rbx |
| 910 movq %rax,%r15 |
| 911 |
| 912 mulq %r14 |
| 913 movq %rax,%r10 |
| 914 movq %rbx,%rax |
| 915 movq %rdx,%r11 |
| 916 movq %r10,-24(%rdi,%rbp,1) |
| 917 |
| 918 mulq %r14 |
| 919 addq %rax,%r11 |
| 920 movq %rbx,%rax |
| 921 adcq $0,%rdx |
| 922 movq %r11,-16(%rdi,%rbp,1) |
| 923 movq %rdx,%r10 |
| 924 |
| 925 |
| 926 movq -8(%rsi,%rbp,1),%rbx |
| 927 mulq %r15 |
| 928 movq %rax,%r12 |
| 929 movq %rbx,%rax |
| 930 movq %rdx,%r13 |
| 931 |
| 932 leaq (%rbp),%rcx |
| 933 mulq %r14 |
| 934 addq %rax,%r10 |
| 935 movq %rbx,%rax |
| 936 movq %rdx,%r11 |
| 937 adcq $0,%r11 |
| 938 addq %r12,%r10 |
| 939 adcq $0,%r11 |
| 940 movq %r10,-8(%rdi,%rcx,1) |
| 941 jmp L$sqr4x_1st |
| 942 |
| 943 .p2align 5 |
| 944 L$sqr4x_1st: |
| 945 movq (%rsi,%rcx,1),%rbx |
| 946 mulq %r15 |
| 947 addq %rax,%r13 |
| 948 movq %rbx,%rax |
| 949 movq %rdx,%r12 |
| 950 adcq $0,%r12 |
| 951 |
| 952 mulq %r14 |
| 953 addq %rax,%r11 |
| 954 movq %rbx,%rax |
| 955 movq 8(%rsi,%rcx,1),%rbx |
| 956 movq %rdx,%r10 |
| 957 adcq $0,%r10 |
| 958 addq %r13,%r11 |
| 959 adcq $0,%r10 |
| 960 |
| 961 |
| 962 mulq %r15 |
| 963 addq %rax,%r12 |
| 964 movq %rbx,%rax |
| 965 movq %r11,(%rdi,%rcx,1) |
| 966 movq %rdx,%r13 |
| 967 adcq $0,%r13 |
| 968 |
| 969 mulq %r14 |
| 970 addq %rax,%r10 |
| 971 movq %rbx,%rax |
| 972 movq 16(%rsi,%rcx,1),%rbx |
| 973 movq %rdx,%r11 |
| 974 adcq $0,%r11 |
| 975 addq %r12,%r10 |
| 976 adcq $0,%r11 |
| 977 |
| 978 mulq %r15 |
| 979 addq %rax,%r13 |
| 980 movq %rbx,%rax |
| 981 movq %r10,8(%rdi,%rcx,1) |
| 982 movq %rdx,%r12 |
| 983 adcq $0,%r12 |
| 984 |
| 985 mulq %r14 |
| 986 addq %rax,%r11 |
| 987 movq %rbx,%rax |
| 988 movq 24(%rsi,%rcx,1),%rbx |
| 989 movq %rdx,%r10 |
| 990 adcq $0,%r10 |
| 991 addq %r13,%r11 |
| 992 adcq $0,%r10 |
| 993 |
| 994 |
| 995 mulq %r15 |
| 996 addq %rax,%r12 |
| 997 movq %rbx,%rax |
| 998 movq %r11,16(%rdi,%rcx,1) |
| 999 movq %rdx,%r13 |
| 1000 adcq $0,%r13 |
| 1001 leaq 32(%rcx),%rcx |
| 1002 |
| 1003 mulq %r14 |
| 1004 addq %rax,%r10 |
| 1005 movq %rbx,%rax |
| 1006 movq %rdx,%r11 |
| 1007 adcq $0,%r11 |
| 1008 addq %r12,%r10 |
| 1009 adcq $0,%r11 |
| 1010 movq %r10,-8(%rdi,%rcx,1) |
| 1011 |
| 1012 cmpq $0,%rcx |
| 1013 jne L$sqr4x_1st |
| 1014 |
| 1015 mulq %r15 |
| 1016 addq %rax,%r13 |
| 1017 leaq 16(%rbp),%rbp |
| 1018 adcq $0,%rdx |
| 1019 addq %r11,%r13 |
| 1020 adcq $0,%rdx |
| 1021 |
| 1022 movq %r13,(%rdi) |
| 1023 movq %rdx,%r12 |
| 1024 movq %rdx,8(%rdi) |
| 1025 jmp L$sqr4x_outer |
| 1026 |
| 1027 .p2align 5 |
| 1028 L$sqr4x_outer: |
| 1029 movq -32(%rsi,%rbp,1),%r14 |
| 1030 leaq 48+8(%rsp,%r9,2),%rdi |
| 1031 movq -24(%rsi,%rbp,1),%rax |
| 1032 leaq -32(%rdi,%rbp,1),%rdi |
| 1033 movq -16(%rsi,%rbp,1),%rbx |
| 1034 movq %rax,%r15 |
| 1035 |
| 1036 mulq %r14 |
| 1037 movq -24(%rdi,%rbp,1),%r10 |
| 1038 addq %rax,%r10 |
| 1039 movq %rbx,%rax |
| 1040 adcq $0,%rdx |
| 1041 movq %r10,-24(%rdi,%rbp,1) |
| 1042 movq %rdx,%r11 |
| 1043 |
| 1044 mulq %r14 |
| 1045 addq %rax,%r11 |
| 1046 movq %rbx,%rax |
| 1047 adcq $0,%rdx |
| 1048 addq -16(%rdi,%rbp,1),%r11 |
| 1049 movq %rdx,%r10 |
| 1050 adcq $0,%r10 |
| 1051 movq %r11,-16(%rdi,%rbp,1) |
| 1052 |
| 1053 xorq %r12,%r12 |
| 1054 |
| 1055 movq -8(%rsi,%rbp,1),%rbx |
| 1056 mulq %r15 |
| 1057 addq %rax,%r12 |
| 1058 movq %rbx,%rax |
| 1059 adcq $0,%rdx |
| 1060 addq -8(%rdi,%rbp,1),%r12 |
| 1061 movq %rdx,%r13 |
| 1062 adcq $0,%r13 |
| 1063 |
| 1064 mulq %r14 |
| 1065 addq %rax,%r10 |
| 1066 movq %rbx,%rax |
| 1067 adcq $0,%rdx |
| 1068 addq %r12,%r10 |
| 1069 movq %rdx,%r11 |
| 1070 adcq $0,%r11 |
| 1071 movq %r10,-8(%rdi,%rbp,1) |
| 1072 |
| 1073 leaq (%rbp),%rcx |
| 1074 jmp L$sqr4x_inner |
| 1075 |
| 1076 .p2align 5 |
| 1077 L$sqr4x_inner: |
| 1078 movq (%rsi,%rcx,1),%rbx |
| 1079 mulq %r15 |
| 1080 addq %rax,%r13 |
| 1081 movq %rbx,%rax |
| 1082 movq %rdx,%r12 |
| 1083 adcq $0,%r12 |
| 1084 addq (%rdi,%rcx,1),%r13 |
| 1085 adcq $0,%r12 |
| 1086 |
| 1087 .byte 0x67 |
| 1088 mulq %r14 |
| 1089 addq %rax,%r11 |
| 1090 movq %rbx,%rax |
| 1091 movq 8(%rsi,%rcx,1),%rbx |
| 1092 movq %rdx,%r10 |
| 1093 adcq $0,%r10 |
| 1094 addq %r13,%r11 |
| 1095 adcq $0,%r10 |
| 1096 |
| 1097 mulq %r15 |
| 1098 addq %rax,%r12 |
| 1099 movq %r11,(%rdi,%rcx,1) |
| 1100 movq %rbx,%rax |
| 1101 movq %rdx,%r13 |
| 1102 adcq $0,%r13 |
| 1103 addq 8(%rdi,%rcx,1),%r12 |
| 1104 leaq 16(%rcx),%rcx |
| 1105 adcq $0,%r13 |
| 1106 |
| 1107 mulq %r14 |
| 1108 addq %rax,%r10 |
| 1109 movq %rbx,%rax |
| 1110 adcq $0,%rdx |
| 1111 addq %r12,%r10 |
| 1112 movq %rdx,%r11 |
| 1113 adcq $0,%r11 |
| 1114 movq %r10,-8(%rdi,%rcx,1) |
| 1115 |
| 1116 cmpq $0,%rcx |
| 1117 jne L$sqr4x_inner |
| 1118 |
| 1119 .byte 0x67 |
| 1120 mulq %r15 |
| 1121 addq %rax,%r13 |
| 1122 adcq $0,%rdx |
| 1123 addq %r11,%r13 |
| 1124 adcq $0,%rdx |
| 1125 |
| 1126 movq %r13,(%rdi) |
| 1127 movq %rdx,%r12 |
| 1128 movq %rdx,8(%rdi) |
| 1129 |
| 1130 addq $16,%rbp |
| 1131 jnz L$sqr4x_outer |
| 1132 |
| 1133 |
| 1134 movq -32(%rsi),%r14 |
| 1135 leaq 48+8(%rsp,%r9,2),%rdi |
| 1136 movq -24(%rsi),%rax |
| 1137 leaq -32(%rdi,%rbp,1),%rdi |
| 1138 movq -16(%rsi),%rbx |
| 1139 movq %rax,%r15 |
| 1140 |
| 1141 mulq %r14 |
| 1142 addq %rax,%r10 |
| 1143 movq %rbx,%rax |
| 1144 movq %rdx,%r11 |
| 1145 adcq $0,%r11 |
| 1146 |
| 1147 mulq %r14 |
| 1148 addq %rax,%r11 |
| 1149 movq %rbx,%rax |
| 1150 movq %r10,-24(%rdi) |
| 1151 movq %rdx,%r10 |
| 1152 adcq $0,%r10 |
| 1153 addq %r13,%r11 |
| 1154 movq -8(%rsi),%rbx |
| 1155 adcq $0,%r10 |
| 1156 |
| 1157 mulq %r15 |
| 1158 addq %rax,%r12 |
| 1159 movq %rbx,%rax |
| 1160 movq %r11,-16(%rdi) |
| 1161 movq %rdx,%r13 |
| 1162 adcq $0,%r13 |
| 1163 |
| 1164 mulq %r14 |
| 1165 addq %rax,%r10 |
| 1166 movq %rbx,%rax |
| 1167 movq %rdx,%r11 |
| 1168 adcq $0,%r11 |
| 1169 addq %r12,%r10 |
| 1170 adcq $0,%r11 |
| 1171 movq %r10,-8(%rdi) |
| 1172 |
| 1173 mulq %r15 |
| 1174 addq %rax,%r13 |
| 1175 movq -16(%rsi),%rax |
| 1176 adcq $0,%rdx |
| 1177 addq %r11,%r13 |
| 1178 adcq $0,%rdx |
| 1179 |
| 1180 movq %r13,(%rdi) |
| 1181 movq %rdx,%r12 |
| 1182 movq %rdx,8(%rdi) |
| 1183 |
| 1184 mulq %rbx |
| 1185 addq $16,%rbp |
| 1186 xorq %r14,%r14 |
| 1187 subq %r9,%rbp |
| 1188 xorq %r15,%r15 |
| 1189 |
| 1190 addq %r12,%rax |
| 1191 adcq $0,%rdx |
| 1192 movq %rax,8(%rdi) |
| 1193 movq %rdx,16(%rdi) |
| 1194 movq %r15,24(%rdi) |
| 1195 |
| 1196 movq -16(%rsi,%rbp,1),%rax |
| 1197 leaq 48+8(%rsp),%rdi |
| 1198 xorq %r10,%r10 |
| 1199 movq 8(%rdi),%r11 |
| 1200 |
| 1201 leaq (%r14,%r10,2),%r12 |
| 1202 shrq $63,%r10 |
| 1203 leaq (%rcx,%r11,2),%r13 |
| 1204 shrq $63,%r11 |
| 1205 orq %r10,%r13 |
| 1206 movq 16(%rdi),%r10 |
| 1207 movq %r11,%r14 |
| 1208 mulq %rax |
| 1209 negq %r15 |
| 1210 movq 24(%rdi),%r11 |
| 1211 adcq %rax,%r12 |
| 1212 movq -8(%rsi,%rbp,1),%rax |
| 1213 movq %r12,(%rdi) |
| 1214 adcq %rdx,%r13 |
| 1215 |
| 1216 leaq (%r14,%r10,2),%rbx |
| 1217 movq %r13,8(%rdi) |
| 1218 sbbq %r15,%r15 |
| 1219 shrq $63,%r10 |
| 1220 leaq (%rcx,%r11,2),%r8 |
| 1221 shrq $63,%r11 |
| 1222 orq %r10,%r8 |
| 1223 movq 32(%rdi),%r10 |
| 1224 movq %r11,%r14 |
| 1225 mulq %rax |
| 1226 negq %r15 |
| 1227 movq 40(%rdi),%r11 |
| 1228 adcq %rax,%rbx |
| 1229 movq 0(%rsi,%rbp,1),%rax |
| 1230 movq %rbx,16(%rdi) |
| 1231 adcq %rdx,%r8 |
| 1232 leaq 16(%rbp),%rbp |
| 1233 movq %r8,24(%rdi) |
| 1234 sbbq %r15,%r15 |
| 1235 leaq 64(%rdi),%rdi |
| 1236 jmp L$sqr4x_shift_n_add |
| 1237 |
| 1238 .p2align 5 |
| 1239 L$sqr4x_shift_n_add: |
| 1240 leaq (%r14,%r10,2),%r12 |
| 1241 shrq $63,%r10 |
| 1242 leaq (%rcx,%r11,2),%r13 |
| 1243 shrq $63,%r11 |
| 1244 orq %r10,%r13 |
| 1245 movq -16(%rdi),%r10 |
| 1246 movq %r11,%r14 |
| 1247 mulq %rax |
| 1248 negq %r15 |
| 1249 movq -8(%rdi),%r11 |
| 1250 adcq %rax,%r12 |
| 1251 movq -8(%rsi,%rbp,1),%rax |
| 1252 movq %r12,-32(%rdi) |
| 1253 adcq %rdx,%r13 |
| 1254 |
| 1255 leaq (%r14,%r10,2),%rbx |
| 1256 movq %r13,-24(%rdi) |
| 1257 sbbq %r15,%r15 |
| 1258 shrq $63,%r10 |
| 1259 leaq (%rcx,%r11,2),%r8 |
| 1260 shrq $63,%r11 |
| 1261 orq %r10,%r8 |
| 1262 movq 0(%rdi),%r10 |
| 1263 movq %r11,%r14 |
| 1264 mulq %rax |
| 1265 negq %r15 |
| 1266 movq 8(%rdi),%r11 |
| 1267 adcq %rax,%rbx |
| 1268 movq 0(%rsi,%rbp,1),%rax |
| 1269 movq %rbx,-16(%rdi) |
| 1270 adcq %rdx,%r8 |
| 1271 |
| 1272 leaq (%r14,%r10,2),%r12 |
| 1273 movq %r8,-8(%rdi) |
| 1274 sbbq %r15,%r15 |
| 1275 shrq $63,%r10 |
| 1276 leaq (%rcx,%r11,2),%r13 |
| 1277 shrq $63,%r11 |
| 1278 orq %r10,%r13 |
| 1279 movq 16(%rdi),%r10 |
| 1280 movq %r11,%r14 |
| 1281 mulq %rax |
| 1282 negq %r15 |
| 1283 movq 24(%rdi),%r11 |
| 1284 adcq %rax,%r12 |
| 1285 movq 8(%rsi,%rbp,1),%rax |
| 1286 movq %r12,0(%rdi) |
| 1287 adcq %rdx,%r13 |
| 1288 |
| 1289 leaq (%r14,%r10,2),%rbx |
| 1290 movq %r13,8(%rdi) |
| 1291 sbbq %r15,%r15 |
| 1292 shrq $63,%r10 |
| 1293 leaq (%rcx,%r11,2),%r8 |
| 1294 shrq $63,%r11 |
| 1295 orq %r10,%r8 |
| 1296 movq 32(%rdi),%r10 |
| 1297 movq %r11,%r14 |
| 1298 mulq %rax |
| 1299 negq %r15 |
| 1300 movq 40(%rdi),%r11 |
| 1301 adcq %rax,%rbx |
| 1302 movq 16(%rsi,%rbp,1),%rax |
| 1303 movq %rbx,16(%rdi) |
| 1304 adcq %rdx,%r8 |
| 1305 movq %r8,24(%rdi) |
| 1306 sbbq %r15,%r15 |
| 1307 leaq 64(%rdi),%rdi |
| 1308 addq $32,%rbp |
| 1309 jnz L$sqr4x_shift_n_add |
| 1310 |
| 1311 leaq (%r14,%r10,2),%r12 |
| 1312 .byte 0x67 |
| 1313 shrq $63,%r10 |
| 1314 leaq (%rcx,%r11,2),%r13 |
| 1315 shrq $63,%r11 |
| 1316 orq %r10,%r13 |
| 1317 movq -16(%rdi),%r10 |
| 1318 movq %r11,%r14 |
| 1319 mulq %rax |
| 1320 negq %r15 |
| 1321 movq -8(%rdi),%r11 |
| 1322 adcq %rax,%r12 |
| 1323 movq -8(%rsi),%rax |
| 1324 movq %r12,-32(%rdi) |
| 1325 adcq %rdx,%r13 |
| 1326 |
| 1327 leaq (%r14,%r10,2),%rbx |
| 1328 movq %r13,-24(%rdi) |
| 1329 sbbq %r15,%r15 |
| 1330 shrq $63,%r10 |
| 1331 leaq (%rcx,%r11,2),%r8 |
| 1332 shrq $63,%r11 |
| 1333 orq %r10,%r8 |
| 1334 mulq %rax |
| 1335 negq %r15 |
| 1336 adcq %rax,%rbx |
| 1337 adcq %rdx,%r8 |
| 1338 movq %rbx,-16(%rdi) |
| 1339 movq %r8,-8(%rdi) |
| 1340 .byte 102,72,15,126,213 |
| 1341 sqr8x_reduction: |
| 1342 xorq %rax,%rax |
| 1343 leaq (%rbp,%r9,2),%rcx |
| 1344 leaq 48+8(%rsp,%r9,2),%rdx |
| 1345 movq %rcx,0+8(%rsp) |
| 1346 leaq 48+8(%rsp,%r9,1),%rdi |
| 1347 movq %rdx,8+8(%rsp) |
| 1348 negq %r9 |
| 1349 jmp L$8x_reduction_loop |
| 1350 |
| 1351 .p2align 5 |
| 1352 L$8x_reduction_loop: |
| 1353 leaq (%rdi,%r9,1),%rdi |
| 1354 .byte 0x66 |
| 1355 movq 0(%rdi),%rbx |
| 1356 movq 8(%rdi),%r9 |
| 1357 movq 16(%rdi),%r10 |
| 1358 movq 24(%rdi),%r11 |
| 1359 movq 32(%rdi),%r12 |
| 1360 movq 40(%rdi),%r13 |
| 1361 movq 48(%rdi),%r14 |
| 1362 movq 56(%rdi),%r15 |
| 1363 movq %rax,(%rdx) |
| 1364 leaq 64(%rdi),%rdi |
| 1365 |
| 1366 .byte 0x67 |
| 1367 movq %rbx,%r8 |
| 1368 imulq 32+8(%rsp),%rbx |
| 1369 movq 0(%rbp),%rax |
| 1370 movl $8,%ecx |
| 1371 jmp L$8x_reduce |
| 1372 |
| 1373 .p2align 5 |
| 1374 L$8x_reduce: |
| 1375 mulq %rbx |
| 1376 movq 16(%rbp),%rax |
| 1377 negq %r8 |
| 1378 movq %rdx,%r8 |
| 1379 adcq $0,%r8 |
| 1380 |
| 1381 mulq %rbx |
| 1382 addq %rax,%r9 |
| 1383 movq 32(%rbp),%rax |
| 1384 adcq $0,%rdx |
| 1385 addq %r9,%r8 |
| 1386 movq %rbx,48-8+8(%rsp,%rcx,8) |
| 1387 movq %rdx,%r9 |
| 1388 adcq $0,%r9 |
| 1389 |
| 1390 mulq %rbx |
| 1391 addq %rax,%r10 |
| 1392 movq 48(%rbp),%rax |
| 1393 adcq $0,%rdx |
| 1394 addq %r10,%r9 |
| 1395 movq 32+8(%rsp),%rsi |
| 1396 movq %rdx,%r10 |
| 1397 adcq $0,%r10 |
| 1398 |
| 1399 mulq %rbx |
| 1400 addq %rax,%r11 |
| 1401 movq 64(%rbp),%rax |
| 1402 adcq $0,%rdx |
| 1403 imulq %r8,%rsi |
| 1404 addq %r11,%r10 |
| 1405 movq %rdx,%r11 |
| 1406 adcq $0,%r11 |
| 1407 |
| 1408 mulq %rbx |
| 1409 addq %rax,%r12 |
| 1410 movq 80(%rbp),%rax |
| 1411 adcq $0,%rdx |
| 1412 addq %r12,%r11 |
| 1413 movq %rdx,%r12 |
| 1414 adcq $0,%r12 |
| 1415 |
| 1416 mulq %rbx |
| 1417 addq %rax,%r13 |
| 1418 movq 96(%rbp),%rax |
| 1419 adcq $0,%rdx |
| 1420 addq %r13,%r12 |
| 1421 movq %rdx,%r13 |
| 1422 adcq $0,%r13 |
| 1423 |
| 1424 mulq %rbx |
| 1425 addq %rax,%r14 |
| 1426 movq 112(%rbp),%rax |
| 1427 adcq $0,%rdx |
| 1428 addq %r14,%r13 |
| 1429 movq %rdx,%r14 |
| 1430 adcq $0,%r14 |
| 1431 |
| 1432 mulq %rbx |
| 1433 movq %rsi,%rbx |
| 1434 addq %rax,%r15 |
| 1435 movq 0(%rbp),%rax |
| 1436 adcq $0,%rdx |
| 1437 addq %r15,%r14 |
| 1438 movq %rdx,%r15 |
| 1439 adcq $0,%r15 |
| 1440 |
| 1441 decl %ecx |
| 1442 jnz L$8x_reduce |
| 1443 |
| 1444 leaq 128(%rbp),%rbp |
| 1445 xorq %rax,%rax |
| 1446 movq 8+8(%rsp),%rdx |
| 1447 cmpq 0+8(%rsp),%rbp |
| 1448 jae L$8x_no_tail |
| 1449 |
| 1450 .byte 0x66 |
| 1451 addq 0(%rdi),%r8 |
| 1452 adcq 8(%rdi),%r9 |
| 1453 adcq 16(%rdi),%r10 |
| 1454 adcq 24(%rdi),%r11 |
| 1455 adcq 32(%rdi),%r12 |
| 1456 adcq 40(%rdi),%r13 |
| 1457 adcq 48(%rdi),%r14 |
| 1458 adcq 56(%rdi),%r15 |
| 1459 sbbq %rsi,%rsi |
| 1460 |
| 1461 movq 48+56+8(%rsp),%rbx |
| 1462 movl $8,%ecx |
| 1463 movq 0(%rbp),%rax |
| 1464 jmp L$8x_tail |
| 1465 |
| 1466 .p2align 5 |
| 1467 L$8x_tail: |
| 1468 mulq %rbx |
| 1469 addq %rax,%r8 |
| 1470 movq 16(%rbp),%rax |
| 1471 movq %r8,(%rdi) |
| 1472 movq %rdx,%r8 |
| 1473 adcq $0,%r8 |
| 1474 |
| 1475 mulq %rbx |
| 1476 addq %rax,%r9 |
| 1477 movq 32(%rbp),%rax |
| 1478 adcq $0,%rdx |
| 1479 addq %r9,%r8 |
| 1480 leaq 8(%rdi),%rdi |
| 1481 movq %rdx,%r9 |
| 1482 adcq $0,%r9 |
| 1483 |
| 1484 mulq %rbx |
| 1485 addq %rax,%r10 |
| 1486 movq 48(%rbp),%rax |
| 1487 adcq $0,%rdx |
| 1488 addq %r10,%r9 |
| 1489 movq %rdx,%r10 |
| 1490 adcq $0,%r10 |
| 1491 |
| 1492 mulq %rbx |
| 1493 addq %rax,%r11 |
| 1494 movq 64(%rbp),%rax |
| 1495 adcq $0,%rdx |
| 1496 addq %r11,%r10 |
| 1497 movq %rdx,%r11 |
| 1498 adcq $0,%r11 |
| 1499 |
| 1500 mulq %rbx |
| 1501 addq %rax,%r12 |
| 1502 movq 80(%rbp),%rax |
| 1503 adcq $0,%rdx |
| 1504 addq %r12,%r11 |
| 1505 movq %rdx,%r12 |
| 1506 adcq $0,%r12 |
| 1507 |
| 1508 mulq %rbx |
| 1509 addq %rax,%r13 |
| 1510 movq 96(%rbp),%rax |
| 1511 adcq $0,%rdx |
| 1512 addq %r13,%r12 |
| 1513 movq %rdx,%r13 |
| 1514 adcq $0,%r13 |
| 1515 |
| 1516 mulq %rbx |
| 1517 addq %rax,%r14 |
| 1518 movq 112(%rbp),%rax |
| 1519 adcq $0,%rdx |
| 1520 addq %r14,%r13 |
| 1521 movq %rdx,%r14 |
| 1522 adcq $0,%r14 |
| 1523 |
| 1524 mulq %rbx |
| 1525 movq 48-16+8(%rsp,%rcx,8),%rbx |
| 1526 addq %rax,%r15 |
| 1527 adcq $0,%rdx |
| 1528 addq %r15,%r14 |
| 1529 movq 0(%rbp),%rax |
| 1530 movq %rdx,%r15 |
| 1531 adcq $0,%r15 |
| 1532 |
| 1533 decl %ecx |
| 1534 jnz L$8x_tail |
| 1535 |
| 1536 leaq 128(%rbp),%rbp |
| 1537 movq 8+8(%rsp),%rdx |
| 1538 cmpq 0+8(%rsp),%rbp |
| 1539 jae L$8x_tail_done |
| 1540 |
| 1541 movq 48+56+8(%rsp),%rbx |
| 1542 negq %rsi |
| 1543 movq 0(%rbp),%rax |
| 1544 adcq 0(%rdi),%r8 |
| 1545 adcq 8(%rdi),%r9 |
| 1546 adcq 16(%rdi),%r10 |
| 1547 adcq 24(%rdi),%r11 |
| 1548 adcq 32(%rdi),%r12 |
| 1549 adcq 40(%rdi),%r13 |
| 1550 adcq 48(%rdi),%r14 |
| 1551 adcq 56(%rdi),%r15 |
| 1552 sbbq %rsi,%rsi |
| 1553 |
| 1554 movl $8,%ecx |
| 1555 jmp L$8x_tail |
| 1556 |
| 1557 .p2align 5 |
| 1558 L$8x_tail_done: |
| 1559 addq (%rdx),%r8 |
| 1560 xorq %rax,%rax |
| 1561 |
| 1562 negq %rsi |
| 1563 L$8x_no_tail: |
| 1564 adcq 0(%rdi),%r8 |
| 1565 adcq 8(%rdi),%r9 |
| 1566 adcq 16(%rdi),%r10 |
| 1567 adcq 24(%rdi),%r11 |
| 1568 adcq 32(%rdi),%r12 |
| 1569 adcq 40(%rdi),%r13 |
| 1570 adcq 48(%rdi),%r14 |
| 1571 adcq 56(%rdi),%r15 |
| 1572 adcq $0,%rax |
| 1573 movq -16(%rbp),%rcx |
| 1574 xorq %rsi,%rsi |
| 1575 |
| 1576 .byte 102,72,15,126,213 |
| 1577 |
| 1578 movq %r8,0(%rdi) |
| 1579 movq %r9,8(%rdi) |
| 1580 .byte 102,73,15,126,217 |
| 1581 movq %r10,16(%rdi) |
| 1582 movq %r11,24(%rdi) |
| 1583 movq %r12,32(%rdi) |
| 1584 movq %r13,40(%rdi) |
| 1585 movq %r14,48(%rdi) |
| 1586 movq %r15,56(%rdi) |
| 1587 leaq 64(%rdi),%rdi |
| 1588 |
| 1589 cmpq %rdx,%rdi |
| 1590 jb L$8x_reduction_loop |
| 1591 |
| 1592 subq %r15,%rcx |
| 1593 leaq (%rdi,%r9,1),%rbx |
| 1594 adcq %rsi,%rsi |
| 1595 movq %r9,%rcx |
| 1596 orq %rsi,%rax |
| 1597 .byte 102,72,15,126,207 |
| 1598 xorq $1,%rax |
| 1599 .byte 102,72,15,126,206 |
| 1600 leaq (%rbp,%rax,8),%rbp |
| 1601 sarq $3+2,%rcx |
| 1602 jmp L$sqr4x_sub |
| 1603 |
| 1604 .p2align 5 |
| 1605 L$sqr4x_sub: |
| 1606 .byte 0x66 |
| 1607 movq 0(%rbx),%r12 |
| 1608 movq 8(%rbx),%r13 |
| 1609 sbbq 0(%rbp),%r12 |
| 1610 movq 16(%rbx),%r14 |
| 1611 sbbq 16(%rbp),%r13 |
| 1612 movq 24(%rbx),%r15 |
| 1613 leaq 32(%rbx),%rbx |
| 1614 sbbq 32(%rbp),%r14 |
| 1615 movq %r12,0(%rdi) |
| 1616 sbbq 48(%rbp),%r15 |
| 1617 leaq 64(%rbp),%rbp |
| 1618 movq %r13,8(%rdi) |
| 1619 movq %r14,16(%rdi) |
| 1620 movq %r15,24(%rdi) |
| 1621 leaq 32(%rdi),%rdi |
| 1622 |
| 1623 incq %rcx |
| 1624 jnz L$sqr4x_sub |
| 1625 movq %r9,%r10 |
| 1626 negq %r9 |
| 1627 .byte 0xf3,0xc3 |
| 1628 |
/* _bn_from_montgomery: exported dispatcher keyed on the low three bits of %r9d. */
/*   (%r9d & 7) == 0 : jump to bn_from_mont8x with every argument register unchanged. */
/*   otherwise       : return immediately with %eax = 0. */
/* NOTE(review): %r9 is presumably the operand length in 64-bit words (sixth integer */
/* argument under the SysV AMD64 convention) -- confirm against the C prototype. */
| 1629 .globl _bn_from_montgomery |
| 1630 |
| 1631 .p2align 5 |
| 1632 _bn_from_montgomery: |
| 1633 testl $7,%r9d |
/* A jump, not a call: the ret at the end of bn_from_mont8x returns to our caller. */
| 1634 jz bn_from_mont8x |
| 1635 xorl %eax,%eax |
/* Bytes F3 C3 encode "rep ret". */
| 1636 .byte 0xf3,0xc3 |
| 1637 |
| 1638 |
| 1639 |
/* bn_from_mont8x: reached only via the jz in _bn_from_montgomery (the label is not exported). */
/* Visible behaviour: */
/*   1. save the callee-saved registers and carve a 64-byte-aligned scratch frame off the stack; */
/*   2. copy %r9*8 bytes from (%rsi) into the frame and zero the same number of bytes right after the copy; */
/*   3. call sqr8x_reduction on that buffer; */
/*   4. wipe the scratch buffer, restore registers and the stack pointer, return %rax = 1. */
/* Frame layout after setup: 32(%rsp) = qword loaded from (%r8), 40(%rsp) = caller %rsp, 48(%rsp).. = work buffer. */
/* NOTE(review): argument meaning (%rdi result, %rsi input, %rcx modulus, %r8 -> n0, %r9 word count) is */
/* inferred from usage only -- confirm against the C prototype. */
| 1640 .p2align 5 |
| 1641 bn_from_mont8x: |
/* 0x67 is the address-size prefix byte; it lands on the next instruction, which has no memory operand. */
/* Presumably padding emitted by the generator -- TODO confirm. */
| 1642 .byte 0x67 |
/* %rax keeps the entry %rsp so the epilogue can find the saved registers and unwind the frame. */
| 1643 movq %rsp,%rax |
| 1644 pushq %rbx |
| 1645 pushq %rbp |
| 1646 pushq %r12 |
| 1647 pushq %r13 |
| 1648 pushq %r14 |
| 1649 pushq %r15 |
| 1650 .byte 0x67 |
/* %r9 = -(count*8) (negated byte length), %r10 = count*32. The 32-bit shifts zero-extend into the full registers. */
| 1651 movl %r9d,%r10d |
| 1652 shll $3,%r9d |
| 1653 shll $3+2,%r10d |
| 1654 negq %r9 |
/* Replace the pointer in %r8 with the qword it points at. */
| 1655 movq (%r8),%r8 |
| 1656 |
| 1657 |
| 1658 |
| 1659 |
| 1660 |
| 1661 |
| 1662 |
/* Frame placement. %r11 = ((%rsp - 64 - 2*bytes) - %rsi) mod 4096, i.e. the page-offset distance */
/* between the tentative frame and the input buffer. */
/* NOTE(review): presumably this keeps the frame from aliasing the input modulo 4096 -- confirm against the generator source. */
| 1663 leaq -64(%rsp,%r9,2),%r11 |
| 1664 subq %rsi,%r11 |
| 1665 andq $4095,%r11 |
/* Unsigned test: take the alternate path when count*32 < %r11. */
| 1666 cmpq %r11,%r10 |
| 1667 jb L$from_sp_alt |
/* Default path: drop %rsp by %r11, then by 64 + 2*bytes. */
| 1668 subq %r11,%rsp |
| 1669 leaq -64(%rsp,%r9,2),%rsp |
| 1670 jmp L$from_sp_done |
| 1671 |
| 1672 .p2align 5 |
| 1673 L$from_sp_alt: |
/* Alternate path: %r10 = 4096 - 64 - 2*bytes; drop %rsp by 64 + 2*bytes, then by max(%r11 - %r10, 0). */
| 1674 leaq 4096-64(,%r9,2),%r10 |
| 1675 leaq -64(%rsp,%r9,2),%rsp |
| 1676 subq %r10,%r11 |
/* movq rather than xor so the carry flag from the subq above survives for cmovc. */
| 1677 movq $0,%r10 |
| 1678 cmovcq %r10,%r11 |
| 1679 subq %r11,%rsp |
| 1680 L$from_sp_done: |
/* Round the frame down to a 64-byte boundary; the movdqa stores below depend on 16-byte alignment of 48(%rsp). */
| 1681 andq $-64,%rsp |
/* %r10 = -(count*8), %r9 = +(count*8). */
| 1682 movq %r9,%r10 |
| 1683 negq %r9 |
| 1684 |
| 1685 |
| 1686 |
| 1687 |
| 1688 |
| 1689 |
| 1690 |
| 1691 |
| 1692 |
| 1693 |
/* Park the value loaded from (%r8) and the caller's %rsp in the frame. */
| 1694 movq %r8,32(%rsp) |
| 1695 movq %rax,40(%rsp) |
| 1696 L$from_body: |
/* %r11 = bytes left to copy, %rax = write cursor into the work buffer, %xmm0 = zero source. */
| 1697 movq %r9,%r11 |
| 1698 leaq 48(%rsp),%rax |
| 1699 pxor %xmm0,%xmm0 |
| 1700 jmp L$mul_by_1 |
| 1701 |
| 1702 .p2align 5 |
/* Each pass copies 64 input bytes to (%rax) and zeroes 64 bytes at (%rax + %r9), */
/* i.e. the matching slot in the region that starts %r9 bytes after the copy. */
/* The count is tested after the body, so a zero length is not handled here. */
| 1703 L$mul_by_1: |
| 1704 movdqu (%rsi),%xmm1 |
| 1705 movdqu 16(%rsi),%xmm2 |
| 1706 movdqu 32(%rsi),%xmm3 |
| 1707 movdqa %xmm0,(%rax,%r9,1) |
| 1708 movdqu 48(%rsi),%xmm4 |
| 1709 movdqa %xmm0,16(%rax,%r9,1) |
/* Hand-encoded leaq 64(%rsi),%rsi using a 32-bit displacement (7-byte form). */
| 1710 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 |
| 1711 movdqa %xmm1,(%rax) |
| 1712 movdqa %xmm0,32(%rax,%r9,1) |
| 1713 movdqa %xmm2,16(%rax) |
| 1714 movdqa %xmm0,48(%rax,%r9,1) |
| 1715 movdqa %xmm3,32(%rax) |
| 1716 movdqa %xmm4,48(%rax) |
| 1717 leaq 64(%rax),%rax |
| 1718 subq $64,%r11 |
| 1719 jnz L$mul_by_1 |
| 1720 |
/* Stage values in xmm registers before the call: */
/*   66 48 0F 6E CF = movq %rdi,%xmm1 */
/*   66 48 0F 6E D1 = movq %rcx,%xmm2 */
/*   66 49 0F 6E DA = movq %r10,%xmm3   (%r10 = -(count*8)) */
/* %rbp also receives %rcx. */
| 1721 .byte 102,72,15,110,207 |
| 1722 .byte 102,72,15,110,209 |
| 1723 .byte 0x67 |
| 1724 movq %rcx,%rbp |
| 1725 .byte 102,73,15,110,218 |
| 1726 call sqr8x_reduction |
| 1727 |
/* Cleanup: %rax = start of the work buffer, %rsi = caller %rsp saved at 40(%rsp). */
| 1728 pxor %xmm0,%xmm0 |
| 1729 leaq 48(%rsp),%rax |
| 1730 movq 40(%rsp),%rsi |
| 1731 jmp L$from_mont_zero |
| 1732 |
| 1733 .p2align 5 |
/* Zero the work buffer: 64 bytes are cleared for every 32 subtracted from %r9. */
/* NOTE(review): relies on %r9 holding a positive multiple of 32 when sqr8x_reduction returns -- confirm. */
| 1734 L$from_mont_zero: |
| 1735 movdqa %xmm0,0(%rax) |
| 1736 movdqa %xmm0,16(%rax) |
| 1737 movdqa %xmm0,32(%rax) |
| 1738 movdqa %xmm0,48(%rax) |
| 1739 leaq 64(%rax),%rax |
| 1740 subq $32,%r9 |
| 1741 jnz L$from_mont_zero |
| 1742 |
/* Return value 1, then reload the six registers pushed in the prologue (stored just below the caller %rsp). */
| 1743 movq $1,%rax |
| 1744 movq -48(%rsi),%r15 |
| 1745 movq -40(%rsi),%r14 |
| 1746 movq -32(%rsi),%r13 |
| 1747 movq -24(%rsi),%r12 |
| 1748 movq -16(%rsi),%rbp |
| 1749 movq -8(%rsi),%rbx |
/* Restore the caller's stack pointer in one step, discarding the whole scratch frame. */
| 1750 leaq (%rsi),%rsp |
| 1751 L$from_epilogue: |
/* Bytes F3 C3 encode "rep ret". */
| 1752 .byte 0xf3,0xc3 |
| 1753 |
/* _bn_get_bits5: extract a 5-bit field from a byte array. */
/*   In:  %rdi = base address, %esi = bit offset of the field. */
/*   Out: %eax = bits [offset, offset+5) as a value in 0..31. */
/*   Clobbers: %ecx, %esi, %r10, flags. */
/* A 16-bit word is loaded at byte index offset>>3, so the byte at index (offset>>3)+1 is always read too. */
| 1754 .globl _bn_get_bits5 |
| 1755 |
| 1756 .p2align 4 |
| 1757 _bn_get_bits5: |
| 1758 movq %rdi,%r10 |
| 1759 movl %esi,%ecx |
/* %esi = byte index of the field's first bit. */
| 1760 shrl $3,%esi |
| 1761 movzwl (%r10,%rsi,1),%eax |
/* %cl = bit position within that byte (0..7); the 16-bit load leaves at least 9 bits after the shift. */
| 1762 andl $7,%ecx |
| 1763 shrl %cl,%eax |
| 1764 andl $31,%eax |
/* Bytes F3 C3 encode "rep ret". */
| 1765 .byte 0xf3,0xc3 |
| 1766 |
| 1767 |
/* _bn_scatter5: store a run of 64-bit words into a strided table. */
/*   In:  %rdi = source words, %esi = number of words, %rdx = table base, %rcx = slot index. */
/*   Word j of the source is written to table + index*8 + j*256. */
/*   Returns immediately when %esi is zero. Clobbers %rax, %rdx, %rdi, %esi, flags. */
| 1768 .globl _bn_scatter5 |
| 1769 |
| 1770 .p2align 4 |
| 1771 _bn_scatter5: |
| 1772 cmpl $0,%esi |
| 1773 jz L$scatter_epilogue |
/* %rdx = address of the slot's first word. */
| 1774 leaq (%rdx,%rcx,8),%rdx |
| 1775 L$scatter: |
| 1776 movq (%rdi),%rax |
| 1777 leaq 8(%rdi),%rdi |
| 1778 movq %rax,(%rdx) |
/* Consecutive words of one slot sit 256 bytes (32 qwords) apart. */
| 1779 leaq 256(%rdx),%rdx |
| 1780 subl $1,%esi |
| 1781 jnz L$scatter |
| 1782 L$scatter_epilogue: |
/* Bytes F3 C3 encode "rep ret". */
| 1783 .byte 0xf3,0xc3 |
| 1784 |
| 1785 |
/* _bn_gather5: read one slot back out of the strided table written by _bn_scatter5. */
/*   In:  %rdi = destination, %esi = number of words, %rdx = table base, %ecx = slot index. */
/*   Only index bits 0..4 take part: bits 0..2 pick the qword offset, bits 3..4 pick one of four masks. */
/*   For each word j the four qwords at table + (index&7)*8 + {0,64,128,192} + j*256 are loaded, */
/*   AND-ed with the masks (exactly one is all-ones) and OR-ed together, giving table[(index&31)*8 + j*256]. */
/*   Clobbers %rax, %rcx, %rdx, %rdi, %esi, %r11, %xmm0-%xmm7, flags. */
/* The loop tests the count after the body, so unlike _bn_scatter5 a zero count is not guarded. */
/* NOTE(review): the load addresses depend on the low three index bits. If the index is secret, that */
/* access pattern is not hidden by the masking -- compare with the upstream gather rewrite for */
/* CVE-2016-0702 and confirm whether this copy needs it. */
| 1786 .globl _bn_gather5 |
| 1787 |
| 1788 .p2align 4 |
| 1789 _bn_gather5: |
/* %r11 = index & 7 ; %ecx = ~(index >> 3) & 3 = 3 - ((index >> 3) & 3). */
| 1790 movl %ecx,%r11d |
| 1791 shrl $3,%ecx |
| 1792 andq $7,%r11 |
| 1793 notl %ecx |
| 1794 leaq L$magic_masks(%rip),%rax |
| 1795 andl $3,%ecx |
/* Bias %rdx by 128 so the four loads below use displacements -128, -64, 0 and 64. */
| 1796 leaq 128(%rdx,%r11,8),%rdx |
/* Four consecutive qwords of L$magic_masks starting at entry %rcx: one is all-ones, the rest are zero. */
| 1797 movq 0(%rax,%rcx,8),%xmm4 |
| 1798 movq 8(%rax,%rcx,8),%xmm5 |
| 1799 movq 16(%rax,%rcx,8),%xmm6 |
| 1800 movq 24(%rax,%rcx,8),%xmm7 |
| 1801 jmp L$gather |
| 1802 .p2align 4 |
| 1803 L$gather: |
| 1804 movq -128(%rdx),%xmm0 |
| 1805 movq -64(%rdx),%xmm1 |
| 1806 pand %xmm4,%xmm0 |
| 1807 movq 0(%rdx),%xmm2 |
| 1808 pand %xmm5,%xmm1 |
| 1809 movq 64(%rdx),%xmm3 |
| 1810 pand %xmm6,%xmm2 |
| 1811 por %xmm1,%xmm0 |
| 1812 pand %xmm7,%xmm3 |
/* Two 0x67 prefix bytes ahead of the por below -- presumably padding from the generator; TODO confirm. */
| 1813 .byte 0x67,0x67 |
| 1814 por %xmm2,%xmm0 |
/* Advance to the next word of the slot (256-byte stride, matching _bn_scatter5). */
| 1815 leaq 256(%rdx),%rdx |
| 1816 por %xmm3,%xmm0 |
| 1817 |
| 1818 movq %xmm0,(%rdi) |
| 1819 leaq 8(%rdi),%rdi |
| 1820 subl $1,%esi |
| 1821 jnz L$gather |
/* Bytes F3 C3 encode "rep ret". */
| 1822 .byte 0xf3,0xc3 |
/* End-of-function marker label; nothing in the visible code references it. */
| 1823 L$SEH_end_bn_gather5: |
| 1824 |
/* L$magic_masks: eight qwords {0, 0, 0, -1, 0, 0, 0, 0}, 64-byte aligned. */
/* Users load four consecutive qwords starting at entry i (0..3); whichever window is chosen */
/* contains the single all-ones qword exactly once, at position 3 - i. */
| 1825 .p2align 6 |
| 1826 L$magic_masks: |
| 1827 .long 0,0, 0,0, 0,0, -1,-1 |
| 1828 .long 0,0, 0,0, 0,0, 0,0 |
/* NUL-terminated ASCII: "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>". */
| 1829 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105
,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97
,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71
,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,1
11,114,103,62,0 |
| 1830 #endif |
OLD | NEW |