Chromium Code Reviews

Side by Side Diff: third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S

Issue 1319703002: Breaking Change: merge BoringSSL branch into master (Closed)
Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: Created 5 years, 3 months ago
(Empty: no previous version of this file; it is added by this patch.)
1 #if defined(__x86_64__)
2 .text
3
4 .extern OPENSSL_ia32cap_P
5 .hidden OPENSSL_ia32cap_P
6
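/* [Editor annotation, not in the generated file] bn_mul_mont_gather5(rp, ap,
   table, np, n0, num, power) appears to compute rp = ap*table[power] mod np
   in Montgomery form, fetching table[power] with a cache-timing-safe SSE2
   gather. Word counts not divisible by 8 take the scalar path at
   .Lmul_enter; multiples of 8 branch to the 4x-unrolled .Lmul4x_enter. */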
7 .globl bn_mul_mont_gather5
8 .hidden bn_mul_mont_gather5
9 .type bn_mul_mont_gather5,@function
10 .align 64
11 bn_mul_mont_gather5:
12 testl $7,%r9d
13 jnz .Lmul_enter
14 jmp .Lmul4x_enter
15
16 .align 16
17 .Lmul_enter:
18 movl %r9d,%r9d
19 movq %rsp,%rax
20 movl 8(%rsp),%r10d
21 pushq %rbx
22 pushq %rbp
23 pushq %r12
24 pushq %r13
25 pushq %r14
26 pushq %r15
27 leaq 2(%r9),%r11
28 negq %r11
29 leaq (%rsp,%r11,8),%rsp
30 andq $-1024,%rsp
31
32 movq %rax,8(%rsp,%r9,8)
33 .Lmul_body:
34 movq %rdx,%r12
35 movq %r10,%r11
36 shrq $3,%r10
37 andq $7,%r11
38 notq %r10
39 leaq .Lmagic_masks(%rip),%rax
40 andq $3,%r10
41 leaq 96(%r12,%r11,8),%r12
42 movq 0(%rax,%r10,8),%xmm4
43 movq 8(%rax,%r10,8),%xmm5
44 movq 16(%rax,%r10,8),%xmm6
45 movq 24(%rax,%r10,8),%xmm7
46
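/* [Annotation] Constant-time gather: each 64-bit lane of the selected table
   entry is read from four candidate slots (-96, -32, 32, 96 bytes around
   %r12), masked with %xmm4..%xmm7 (exactly one of which is all-ones) and
   OR-ed together, so the set of cache lines touched is meant to be
   independent of the secret index. */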
47 movq -96(%r12),%xmm0
48 movq -32(%r12),%xmm1
49 pand %xmm4,%xmm0
50 movq 32(%r12),%xmm2
51 pand %xmm5,%xmm1
52 movq 96(%r12),%xmm3
53 pand %xmm6,%xmm2
54 por %xmm1,%xmm0
55 pand %xmm7,%xmm3
56 por %xmm2,%xmm0
57 leaq 256(%r12),%r12
58 por %xmm3,%xmm0
59
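/* [Annotation] The .byte sequence below hand-encodes movq %xmm0,%rbx
   (66 REX.W 0F 7E C3), moving the gathered b-word into %rbx. */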
60 .byte 102,72,15,126,195
61
62 movq (%r8),%r8
63 movq (%rsi),%rax
64
65 xorq %r14,%r14
66 xorq %r15,%r15
67
68 movq -96(%r12),%xmm0
69 movq -32(%r12),%xmm1
70 pand %xmm4,%xmm0
71 movq 32(%r12),%xmm2
72 pand %xmm5,%xmm1
73
74 movq %r8,%rbp
75 mulq %rbx
76 movq %rax,%r10
77 movq (%rcx),%rax
78
79 movq 96(%r12),%xmm3
80 pand %xmm6,%xmm2
81 por %xmm1,%xmm0
82 pand %xmm7,%xmm3
83
84 imulq %r10,%rbp
85 movq %rdx,%r11
86
87 por %xmm2,%xmm0
88 leaq 256(%r12),%r12
89 por %xmm3,%xmm0
90
91 mulq %rbp
92 addq %rax,%r10
93 movq 8(%rsi),%rax
94 adcq $0,%rdx
95 movq %rdx,%r13
96
97 leaq 1(%r15),%r15
98 jmp .L1st_enter
99
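/* [Annotation] First outer iteration: accumulate a[j]*b[0] + n[j]*m word by
   word, where m (%rbp) was chosen above so the running sum is divisible by
   2^64 and can be shifted down one word. */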
100 .align 16
101 .L1st:
102 addq %rax,%r13
103 movq (%rsi,%r15,8),%rax
104 adcq $0,%rdx
105 addq %r11,%r13
106 movq %r10,%r11
107 adcq $0,%rdx
108 movq %r13,-16(%rsp,%r15,8)
109 movq %rdx,%r13
110
111 .L1st_enter:
112 mulq %rbx
113 addq %rax,%r11
114 movq (%rcx,%r15,8),%rax
115 adcq $0,%rdx
116 leaq 1(%r15),%r15
117 movq %rdx,%r10
118
119 mulq %rbp
120 cmpq %r9,%r15
121 jne .L1st
122
123 .byte 102,72,15,126,195
124
125 addq %rax,%r13
126 movq (%rsi),%rax
127 adcq $0,%rdx
128 addq %r11,%r13
129 adcq $0,%rdx
130 movq %r13,-16(%rsp,%r15,8)
131 movq %rdx,%r13
132 movq %r10,%r11
133
134 xorq %rdx,%rdx
135 addq %r11,%r13
136 adcq $0,%rdx
137 movq %r13,-8(%rsp,%r9,8)
138 movq %rdx,(%rsp,%r9,8)
139
140 leaq 1(%r14),%r14
141 jmp .Louter
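/* [Annotation] Remaining outer iterations: each pass gathers the next
   b-word from the table and folds a[j]*b[i] + n[j]*m into the partial
   result kept on the stack. */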
142 .align 16
143 .Louter:
144 xorq %r15,%r15
145 movq %r8,%rbp
146 movq (%rsp),%r10
147
148 movq -96(%r12),%xmm0
149 movq -32(%r12),%xmm1
150 pand %xmm4,%xmm0
151 movq 32(%r12),%xmm2
152 pand %xmm5,%xmm1
153
154 mulq %rbx
155 addq %rax,%r10
156 movq (%rcx),%rax
157 adcq $0,%rdx
158
159 movq 96(%r12),%xmm3
160 pand %xmm6,%xmm2
161 por %xmm1,%xmm0
162 pand %xmm7,%xmm3
163
164 imulq %r10,%rbp
165 movq %rdx,%r11
166
167 por %xmm2,%xmm0
168 leaq 256(%r12),%r12
169 por %xmm3,%xmm0
170
171 mulq %rbp
172 addq %rax,%r10
173 movq 8(%rsi),%rax
174 adcq $0,%rdx
175 movq 8(%rsp),%r10
176 movq %rdx,%r13
177
178 leaq 1(%r15),%r15
179 jmp .Linner_enter
180
181 .align 16
182 .Linner:
183 addq %rax,%r13
184 movq (%rsi,%r15,8),%rax
185 adcq $0,%rdx
186 addq %r10,%r13
187 movq (%rsp,%r15,8),%r10
188 adcq $0,%rdx
189 movq %r13,-16(%rsp,%r15,8)
190 movq %rdx,%r13
191
192 .Linner_enter:
193 mulq %rbx
194 addq %rax,%r11
195 movq (%rcx,%r15,8),%rax
196 adcq $0,%rdx
197 addq %r11,%r10
198 movq %rdx,%r11
199 adcq $0,%r11
200 leaq 1(%r15),%r15
201
202 mulq %rbp
203 cmpq %r9,%r15
204 jne .Linner
205
206 .byte 102,72,15,126,195
207
208 addq %rax,%r13
209 movq (%rsi),%rax
210 adcq $0,%rdx
211 addq %r10,%r13
212 movq (%rsp,%r15,8),%r10
213 adcq $0,%rdx
214 movq %r13,-16(%rsp,%r15,8)
215 movq %rdx,%r13
216
217 xorq %rdx,%rdx
218 addq %r11,%r13
219 adcq $0,%rdx
220 addq %r10,%r13
221 adcq $0,%rdx
222 movq %r13,-8(%rsp,%r9,8)
223 movq %rdx,(%rsp,%r9,8)
224
225 leaq 1(%r14),%r14
226 cmpq %r9,%r14
227 jb .Louter
228
229 xorq %r14,%r14
230 movq (%rsp),%rax
231 leaq (%rsp),%rsi
232 movq %r9,%r15
233 jmp .Lsub
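/* [Annotation] Final reduction: .Lsub computes t - n with borrow
   propagation; the resulting borrow forms the mask used by .Lcopy below. */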
234 .align 16
235 .Lsub: sbbq (%rcx,%r14,8),%rax
236 movq %rax,(%rdi,%r14,8)
237 movq 8(%rsi,%r14,8),%rax
238 leaq 1(%r14),%r14
239 decq %r15
240 jnz .Lsub
241
242 sbbq $0,%rax
243 xorq %r14,%r14
244 movq %r9,%r15
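/* [Annotation] Constant-time select-and-wipe: rsi = ((t^u)&mask)^u picks
   the unsubtracted word t (mask all-ones) or u = t-n (mask zero) without a
   data-dependent branch, and each stack slot is overwritten as it is
   consumed. */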
245 .align 16
246 .Lcopy:
247 movq (%rsp,%r14,8),%rsi
248 movq (%rdi,%r14,8),%rcx
249 xorq %rcx,%rsi
250 andq %rax,%rsi
251 xorq %rcx,%rsi
252 movq %r14,(%rsp,%r14,8)
253 movq %rsi,(%rdi,%r14,8)
254 leaq 1(%r14),%r14
255 subq $1,%r15
256 jnz .Lcopy
257
258 movq 8(%rsp,%r9,8),%rsi
259 movq $1,%rax
260 movq -48(%rsi),%r15
261 movq -40(%rsi),%r14
262 movq -32(%rsi),%r13
263 movq -24(%rsi),%r12
264 movq -16(%rsi),%rbp
265 movq -8(%rsi),%rbx
266 leaq (%rsi),%rsp
267 .Lmul_epilogue:
268 .byte 0xf3,0xc3
269 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
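/* [Annotation] 4x-unrolled variant, reached from bn_mul_mont_gather5 when
   num is a multiple of 8. The lone 0x67 bytes here and below are
   address-size prefixes that the generator appears to emit as padding to
   tune instruction alignment. */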
270 .type bn_mul4x_mont_gather5,@function
271 .align 32
272 bn_mul4x_mont_gather5:
273 .Lmul4x_enter:
274 .byte 0x67
275 movq %rsp,%rax
276 pushq %rbx
277 pushq %rbp
278 pushq %r12
279 pushq %r13
280 pushq %r14
281 pushq %r15
282 .byte 0x67
283 movl %r9d,%r10d
284 shll $3,%r9d
285 shll $3+2,%r10d
286 negq %r9
287
288
289
290
291
292
293
294
295 leaq -64(%rsp,%r9,2),%r11
296 subq %rsi,%r11
297 andq $4095,%r11
298 cmpq %r11,%r10
299 jb .Lmul4xsp_alt
300 subq %r11,%rsp
301 leaq -64(%rsp,%r9,2),%rsp
302 jmp .Lmul4xsp_done
303
304 .align 32
305 .Lmul4xsp_alt:
306 leaq 4096-64(,%r9,2),%r10
307 leaq -64(%rsp,%r9,2),%rsp
308 subq %r10,%r11
309 movq $0,%r10
310 cmovcq %r10,%r11
311 subq %r11,%rsp
312 .Lmul4xsp_done:
313 andq $-64,%rsp
314 negq %r9
315
316 movq %rax,40(%rsp)
317 .Lmul4x_body:
318
319 call mul4x_internal
320
321 movq 40(%rsp),%rsi
322 movq $1,%rax
323 movq -48(%rsi),%r15
324 movq -40(%rsi),%r14
325 movq -32(%rsi),%r13
326 movq -24(%rsi),%r12
327 movq -16(%rsi),%rbp
328 movq -8(%rsi),%rbx
329 leaq (%rsi),%rsp
330 .Lmul4x_epilogue:
331 .byte 0xf3,0xc3
332 .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
333
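/* [Annotation] mul4x_internal is the shared worker for
   bn_mul4x_mont_gather5 and bn_power5: the same masked gather as above,
   with the multiply/reduce loops processing four words per iteration; the
   final conditional subtraction is delegated to .Lsqr4x_sub via the jmp at
   the end. */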
334 .type mul4x_internal,@function
335 .align 32
336 mul4x_internal:
337 shlq $5,%r9
338 movl 8(%rax),%r10d
339 leaq 256(%rdx,%r9,1),%r13
340 shrq $5,%r9
341 movq %r10,%r11
342 shrq $3,%r10
343 andq $7,%r11
344 notq %r10
345 leaq .Lmagic_masks(%rip),%rax
346 andq $3,%r10
347 leaq 96(%rdx,%r11,8),%r12
348 movq 0(%rax,%r10,8),%xmm4
349 movq 8(%rax,%r10,8),%xmm5
350 addq $7,%r11
351 movq 16(%rax,%r10,8),%xmm6
352 movq 24(%rax,%r10,8),%xmm7
353 andq $7,%r11
354
355 movq -96(%r12),%xmm0
356 leaq 256(%r12),%r14
357 movq -32(%r12),%xmm1
358 pand %xmm4,%xmm0
359 movq 32(%r12),%xmm2
360 pand %xmm5,%xmm1
361 movq 96(%r12),%xmm3
362 pand %xmm6,%xmm2
363 .byte 0x67
364 por %xmm1,%xmm0
365 movq -96(%r14),%xmm1
366 .byte 0x67
367 pand %xmm7,%xmm3
368 .byte 0x67
369 por %xmm2,%xmm0
370 movq -32(%r14),%xmm2
371 .byte 0x67
372 pand %xmm4,%xmm1
373 .byte 0x67
374 por %xmm3,%xmm0
375 movq 32(%r14),%xmm3
376
377 .byte 102,72,15,126,195
378 movq 96(%r14),%xmm0
379 movq %r13,16+8(%rsp)
380 movq %rdi,56+8(%rsp)
381
382 movq (%r8),%r8
383 movq (%rsi),%rax
384 leaq (%rsi,%r9,1),%rsi
385 negq %r9
386
387 movq %r8,%rbp
388 mulq %rbx
389 movq %rax,%r10
390 movq (%rcx),%rax
391
392 pand %xmm5,%xmm2
393 pand %xmm6,%xmm3
394 por %xmm2,%xmm1
395
396 imulq %r10,%rbp
397
398
399
400
401
402
403
404 leaq 64+8(%rsp,%r11,8),%r14
405 movq %rdx,%r11
406
407 pand %xmm7,%xmm0
408 por %xmm3,%xmm1
409 leaq 512(%r12),%r12
410 por %xmm1,%xmm0
411
412 mulq %rbp
413 addq %rax,%r10
414 movq 8(%rsi,%r9,1),%rax
415 adcq $0,%rdx
416 movq %rdx,%rdi
417
418 mulq %rbx
419 addq %rax,%r11
420 movq 16(%rcx),%rax
421 adcq $0,%rdx
422 movq %rdx,%r10
423
424 mulq %rbp
425 addq %rax,%rdi
426 movq 16(%rsi,%r9,1),%rax
427 adcq $0,%rdx
428 addq %r11,%rdi
429 leaq 32(%r9),%r15
430 leaq 64(%rcx),%rcx
431 adcq $0,%rdx
432 movq %rdi,(%r14)
433 movq %rdx,%r13
434 jmp .L1st4x
435
436 .align 32
437 .L1st4x:
438 mulq %rbx
439 addq %rax,%r10
440 movq -32(%rcx),%rax
441 leaq 32(%r14),%r14
442 adcq $0,%rdx
443 movq %rdx,%r11
444
445 mulq %rbp
446 addq %rax,%r13
447 movq -8(%rsi,%r15,1),%rax
448 adcq $0,%rdx
449 addq %r10,%r13
450 adcq $0,%rdx
451 movq %r13,-24(%r14)
452 movq %rdx,%rdi
453
454 mulq %rbx
455 addq %rax,%r11
456 movq -16(%rcx),%rax
457 adcq $0,%rdx
458 movq %rdx,%r10
459
460 mulq %rbp
461 addq %rax,%rdi
462 movq (%rsi,%r15,1),%rax
463 adcq $0,%rdx
464 addq %r11,%rdi
465 adcq $0,%rdx
466 movq %rdi,-16(%r14)
467 movq %rdx,%r13
468
469 mulq %rbx
470 addq %rax,%r10
471 movq 0(%rcx),%rax
472 adcq $0,%rdx
473 movq %rdx,%r11
474
475 mulq %rbp
476 addq %rax,%r13
477 movq 8(%rsi,%r15,1),%rax
478 adcq $0,%rdx
479 addq %r10,%r13
480 adcq $0,%rdx
481 movq %r13,-8(%r14)
482 movq %rdx,%rdi
483
484 mulq %rbx
485 addq %rax,%r11
486 movq 16(%rcx),%rax
487 adcq $0,%rdx
488 movq %rdx,%r10
489
490 mulq %rbp
491 addq %rax,%rdi
492 movq 16(%rsi,%r15,1),%rax
493 adcq $0,%rdx
494 addq %r11,%rdi
495 leaq 64(%rcx),%rcx
496 adcq $0,%rdx
497 movq %rdi,(%r14)
498 movq %rdx,%r13
499
500 addq $32,%r15
501 jnz .L1st4x
502
503 mulq %rbx
504 addq %rax,%r10
505 movq -32(%rcx),%rax
506 leaq 32(%r14),%r14
507 adcq $0,%rdx
508 movq %rdx,%r11
509
510 mulq %rbp
511 addq %rax,%r13
512 movq -8(%rsi),%rax
513 adcq $0,%rdx
514 addq %r10,%r13
515 adcq $0,%rdx
516 movq %r13,-24(%r14)
517 movq %rdx,%rdi
518
519 mulq %rbx
520 addq %rax,%r11
521 movq -16(%rcx),%rax
522 adcq $0,%rdx
523 movq %rdx,%r10
524
525 mulq %rbp
526 addq %rax,%rdi
527 movq (%rsi,%r9,1),%rax
528 adcq $0,%rdx
529 addq %r11,%rdi
530 adcq $0,%rdx
531 movq %rdi,-16(%r14)
532 movq %rdx,%r13
533
534 .byte 102,72,15,126,195
535 leaq (%rcx,%r9,2),%rcx
536
537 xorq %rdi,%rdi
538 addq %r10,%r13
539 adcq $0,%rdi
540 movq %r13,-8(%r14)
541
542 jmp .Louter4x
543
544 .align 32
545 .Louter4x:
546 movq (%r14,%r9,1),%r10
547 movq %r8,%rbp
548 mulq %rbx
549 addq %rax,%r10
550 movq (%rcx),%rax
551 adcq $0,%rdx
552
553 movq -96(%r12),%xmm0
554 movq -32(%r12),%xmm1
555 pand %xmm4,%xmm0
556 movq 32(%r12),%xmm2
557 pand %xmm5,%xmm1
558 movq 96(%r12),%xmm3
559
560 imulq %r10,%rbp
561 .byte 0x67
562 movq %rdx,%r11
563 movq %rdi,(%r14)
564
565 pand %xmm6,%xmm2
566 por %xmm1,%xmm0
567 pand %xmm7,%xmm3
568 por %xmm2,%xmm0
569 leaq (%r14,%r9,1),%r14
570 leaq 256(%r12),%r12
571 por %xmm3,%xmm0
572
573 mulq %rbp
574 addq %rax,%r10
575 movq 8(%rsi,%r9,1),%rax
576 adcq $0,%rdx
577 movq %rdx,%rdi
578
579 mulq %rbx
580 addq %rax,%r11
581 movq 16(%rcx),%rax
582 adcq $0,%rdx
583 addq 8(%r14),%r11
584 adcq $0,%rdx
585 movq %rdx,%r10
586
587 mulq %rbp
588 addq %rax,%rdi
589 movq 16(%rsi,%r9,1),%rax
590 adcq $0,%rdx
591 addq %r11,%rdi
592 leaq 32(%r9),%r15
593 leaq 64(%rcx),%rcx
594 adcq $0,%rdx
595 movq %rdx,%r13
596 jmp .Linner4x
597
598 .align 32
599 .Linner4x:
600 mulq %rbx
601 addq %rax,%r10
602 movq -32(%rcx),%rax
603 adcq $0,%rdx
604 addq 16(%r14),%r10
605 leaq 32(%r14),%r14
606 adcq $0,%rdx
607 movq %rdx,%r11
608
609 mulq %rbp
610 addq %rax,%r13
611 movq -8(%rsi,%r15,1),%rax
612 adcq $0,%rdx
613 addq %r10,%r13
614 adcq $0,%rdx
615 movq %rdi,-32(%r14)
616 movq %rdx,%rdi
617
618 mulq %rbx
619 addq %rax,%r11
620 movq -16(%rcx),%rax
621 adcq $0,%rdx
622 addq -8(%r14),%r11
623 adcq $0,%rdx
624 movq %rdx,%r10
625
626 mulq %rbp
627 addq %rax,%rdi
628 movq (%rsi,%r15,1),%rax
629 adcq $0,%rdx
630 addq %r11,%rdi
631 adcq $0,%rdx
632 movq %r13,-24(%r14)
633 movq %rdx,%r13
634
635 mulq %rbx
636 addq %rax,%r10
637 movq 0(%rcx),%rax
638 adcq $0,%rdx
639 addq (%r14),%r10
640 adcq $0,%rdx
641 movq %rdx,%r11
642
643 mulq %rbp
644 addq %rax,%r13
645 movq 8(%rsi,%r15,1),%rax
646 adcq $0,%rdx
647 addq %r10,%r13
648 adcq $0,%rdx
649 movq %rdi,-16(%r14)
650 movq %rdx,%rdi
651
652 mulq %rbx
653 addq %rax,%r11
654 movq 16(%rcx),%rax
655 adcq $0,%rdx
656 addq 8(%r14),%r11
657 adcq $0,%rdx
658 movq %rdx,%r10
659
660 mulq %rbp
661 addq %rax,%rdi
662 movq 16(%rsi,%r15,1),%rax
663 adcq $0,%rdx
664 addq %r11,%rdi
665 leaq 64(%rcx),%rcx
666 adcq $0,%rdx
667 movq %r13,-8(%r14)
668 movq %rdx,%r13
669
670 addq $32,%r15
671 jnz .Linner4x
672
673 mulq %rbx
674 addq %rax,%r10
675 movq -32(%rcx),%rax
676 adcq $0,%rdx
677 addq 16(%r14),%r10
678 leaq 32(%r14),%r14
679 adcq $0,%rdx
680 movq %rdx,%r11
681
682 mulq %rbp
683 addq %rax,%r13
684 movq -8(%rsi),%rax
685 adcq $0,%rdx
686 addq %r10,%r13
687 adcq $0,%rdx
688 movq %rdi,-32(%r14)
689 movq %rdx,%rdi
690
691 mulq %rbx
692 addq %rax,%r11
693 movq %rbp,%rax
694 movq -16(%rcx),%rbp
695 adcq $0,%rdx
696 addq -8(%r14),%r11
697 adcq $0,%rdx
698 movq %rdx,%r10
699
700 mulq %rbp
701 addq %rax,%rdi
702 movq (%rsi,%r9,1),%rax
703 adcq $0,%rdx
704 addq %r11,%rdi
705 adcq $0,%rdx
706 movq %r13,-24(%r14)
707 movq %rdx,%r13
708
709 .byte 102,72,15,126,195
710 movq %rdi,-16(%r14)
711 leaq (%rcx,%r9,2),%rcx
712
713 xorq %rdi,%rdi
714 addq %r10,%r13
715 adcq $0,%rdi
716 addq (%r14),%r13
717 adcq $0,%rdi
718 movq %r13,-8(%r14)
719
720 cmpq 16+8(%rsp),%r12
721 jb .Louter4x
722 subq %r13,%rbp
723 adcq %r15,%r15
724 orq %r15,%rdi
725 xorq $1,%rdi
726 leaq (%r14,%r9,1),%rbx
727 leaq (%rcx,%rdi,8),%rbp
728 movq %r9,%rcx
729 sarq $3+2,%rcx
730 movq 56+8(%rsp),%rdi
731 jmp .Lsqr4x_sub
732 .size mul4x_internal,.-mul4x_internal
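/* [Annotation] bn_power5(rp, ap, table, np, n0, num, power) performs five
   back-to-back Montgomery squarings of ap followed by one Montgomery
   multiplication by the gathered table[power]; this looks like one step of
   a fixed 5-bit-window modular exponentiation. */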
733 .globl bn_power5
734 .hidden bn_power5
735 .type bn_power5,@function
736 .align 32
737 bn_power5:
738 movq %rsp,%rax
739 pushq %rbx
740 pushq %rbp
741 pushq %r12
742 pushq %r13
743 pushq %r14
744 pushq %r15
745 movl %r9d,%r10d
746 shll $3,%r9d
747 shll $3+2,%r10d
748 negq %r9
749 movq (%r8),%r8
750
751
752
753
754
755
756
757 leaq -64(%rsp,%r9,2),%r11
758 subq %rsi,%r11
759 andq $4095,%r11
760 cmpq %r11,%r10
761 jb .Lpwr_sp_alt
762 subq %r11,%rsp
763 leaq -64(%rsp,%r9,2),%rsp
764 jmp .Lpwr_sp_done
765
766 .align 32
767 .Lpwr_sp_alt:
768 leaq 4096-64(,%r9,2),%r10
769 leaq -64(%rsp,%r9,2),%rsp
770 subq %r10,%r11
771 movq $0,%r10
772 cmovcq %r10,%r11
773 subq %r11,%rsp
774 .Lpwr_sp_done:
775 andq $-64,%rsp
776 movq %r9,%r10
777 negq %r9
778
779
780
781
782
783
784
785
786
787
788 movq %r8,32(%rsp)
789 movq %rax,40(%rsp)
790 .Lpower5_body:
791 .byte 102,72,15,110,207
792 .byte 102,72,15,110,209
793 .byte 102,73,15,110,218
794 .byte 102,72,15,110,226
795
796 call __bn_sqr8x_internal
797 call __bn_sqr8x_internal
798 call __bn_sqr8x_internal
799 call __bn_sqr8x_internal
800 call __bn_sqr8x_internal
801
802 .byte 102,72,15,126,209
803 .byte 102,72,15,126,226
804 movq %rsi,%rdi
805 movq 40(%rsp),%rax
806 leaq 32(%rsp),%r8
807
808 call mul4x_internal
809
810 movq 40(%rsp),%rsi
811 movq $1,%rax
812 movq -48(%rsi),%r15
813 movq -40(%rsi),%r14
814 movq -32(%rsi),%r13
815 movq -24(%rsi),%r12
816 movq -16(%rsi),%rbp
817 movq -8(%rsi),%rbx
818 leaq (%rsi),%rsp
819 .Lpower5_epilogue:
820 .byte 0xf3,0xc3
821 .size bn_power5,.-bn_power5
822
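/* [Annotation] bn_sqr8x_internal squares the input into a 2*num-word
   buffer: the .Lsqr4x_1st/.Lsqr4x_inner loops form the off-diagonal
   products, .Lsqr4x_shift_n_add doubles them while folding in the a[i]^2
   diagonal terms, and control then falls through into sqr8x_reduction. */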
823 .globl bn_sqr8x_internal
824 .hidden bn_sqr8x_internal
825 .hidden bn_sqr8x_internal
826 .type bn_sqr8x_internal,@function
827 .align 32
828 bn_sqr8x_internal:
829 __bn_sqr8x_internal:
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903 leaq 32(%r10),%rbp
904 leaq (%rsi,%r9,1),%rsi
905
906 movq %r9,%rcx
907
908
909 movq -32(%rsi,%rbp,1),%r14
910 leaq 48+8(%rsp,%r9,2),%rdi
911 movq -24(%rsi,%rbp,1),%rax
912 leaq -32(%rdi,%rbp,1),%rdi
913 movq -16(%rsi,%rbp,1),%rbx
914 movq %rax,%r15
915
916 mulq %r14
917 movq %rax,%r10
918 movq %rbx,%rax
919 movq %rdx,%r11
920 movq %r10,-24(%rdi,%rbp,1)
921
922 mulq %r14
923 addq %rax,%r11
924 movq %rbx,%rax
925 adcq $0,%rdx
926 movq %r11,-16(%rdi,%rbp,1)
927 movq %rdx,%r10
928
929
930 movq -8(%rsi,%rbp,1),%rbx
931 mulq %r15
932 movq %rax,%r12
933 movq %rbx,%rax
934 movq %rdx,%r13
935
936 leaq (%rbp),%rcx
937 mulq %r14
938 addq %rax,%r10
939 movq %rbx,%rax
940 movq %rdx,%r11
941 adcq $0,%r11
942 addq %r12,%r10
943 adcq $0,%r11
944 movq %r10,-8(%rdi,%rcx,1)
945 jmp .Lsqr4x_1st
946
947 .align 32
948 .Lsqr4x_1st:
949 movq (%rsi,%rcx,1),%rbx
950 mulq %r15
951 addq %rax,%r13
952 movq %rbx,%rax
953 movq %rdx,%r12
954 adcq $0,%r12
955
956 mulq %r14
957 addq %rax,%r11
958 movq %rbx,%rax
959 movq 8(%rsi,%rcx,1),%rbx
960 movq %rdx,%r10
961 adcq $0,%r10
962 addq %r13,%r11
963 adcq $0,%r10
964
965
966 mulq %r15
967 addq %rax,%r12
968 movq %rbx,%rax
969 movq %r11,(%rdi,%rcx,1)
970 movq %rdx,%r13
971 adcq $0,%r13
972
973 mulq %r14
974 addq %rax,%r10
975 movq %rbx,%rax
976 movq 16(%rsi,%rcx,1),%rbx
977 movq %rdx,%r11
978 adcq $0,%r11
979 addq %r12,%r10
980 adcq $0,%r11
981
982 mulq %r15
983 addq %rax,%r13
984 movq %rbx,%rax
985 movq %r10,8(%rdi,%rcx,1)
986 movq %rdx,%r12
987 adcq $0,%r12
988
989 mulq %r14
990 addq %rax,%r11
991 movq %rbx,%rax
992 movq 24(%rsi,%rcx,1),%rbx
993 movq %rdx,%r10
994 adcq $0,%r10
995 addq %r13,%r11
996 adcq $0,%r10
997
998
999 mulq %r15
1000 addq %rax,%r12
1001 movq %rbx,%rax
1002 movq %r11,16(%rdi,%rcx,1)
1003 movq %rdx,%r13
1004 adcq $0,%r13
1005 leaq 32(%rcx),%rcx
1006
1007 mulq %r14
1008 addq %rax,%r10
1009 movq %rbx,%rax
1010 movq %rdx,%r11
1011 adcq $0,%r11
1012 addq %r12,%r10
1013 adcq $0,%r11
1014 movq %r10,-8(%rdi,%rcx,1)
1015
1016 cmpq $0,%rcx
1017 jne .Lsqr4x_1st
1018
1019 mulq %r15
1020 addq %rax,%r13
1021 leaq 16(%rbp),%rbp
1022 adcq $0,%rdx
1023 addq %r11,%r13
1024 adcq $0,%rdx
1025
1026 movq %r13,(%rdi)
1027 movq %rdx,%r12
1028 movq %rdx,8(%rdi)
1029 jmp .Lsqr4x_outer
1030
1031 .align 32
1032 .Lsqr4x_outer:
1033 movq -32(%rsi,%rbp,1),%r14
1034 leaq 48+8(%rsp,%r9,2),%rdi
1035 movq -24(%rsi,%rbp,1),%rax
1036 leaq -32(%rdi,%rbp,1),%rdi
1037 movq -16(%rsi,%rbp,1),%rbx
1038 movq %rax,%r15
1039
1040 mulq %r14
1041 movq -24(%rdi,%rbp,1),%r10
1042 addq %rax,%r10
1043 movq %rbx,%rax
1044 adcq $0,%rdx
1045 movq %r10,-24(%rdi,%rbp,1)
1046 movq %rdx,%r11
1047
1048 mulq %r14
1049 addq %rax,%r11
1050 movq %rbx,%rax
1051 adcq $0,%rdx
1052 addq -16(%rdi,%rbp,1),%r11
1053 movq %rdx,%r10
1054 adcq $0,%r10
1055 movq %r11,-16(%rdi,%rbp,1)
1056
1057 xorq %r12,%r12
1058
1059 movq -8(%rsi,%rbp,1),%rbx
1060 mulq %r15
1061 addq %rax,%r12
1062 movq %rbx,%rax
1063 adcq $0,%rdx
1064 addq -8(%rdi,%rbp,1),%r12
1065 movq %rdx,%r13
1066 adcq $0,%r13
1067
1068 mulq %r14
1069 addq %rax,%r10
1070 movq %rbx,%rax
1071 adcq $0,%rdx
1072 addq %r12,%r10
1073 movq %rdx,%r11
1074 adcq $0,%r11
1075 movq %r10,-8(%rdi,%rbp,1)
1076
1077 leaq (%rbp),%rcx
1078 jmp .Lsqr4x_inner
1079
1080 .align 32
1081 .Lsqr4x_inner:
1082 movq (%rsi,%rcx,1),%rbx
1083 mulq %r15
1084 addq %rax,%r13
1085 movq %rbx,%rax
1086 movq %rdx,%r12
1087 adcq $0,%r12
1088 addq (%rdi,%rcx,1),%r13
1089 adcq $0,%r12
1090
1091 .byte 0x67
1092 mulq %r14
1093 addq %rax,%r11
1094 movq %rbx,%rax
1095 movq 8(%rsi,%rcx,1),%rbx
1096 movq %rdx,%r10
1097 adcq $0,%r10
1098 addq %r13,%r11
1099 adcq $0,%r10
1100
1101 mulq %r15
1102 addq %rax,%r12
1103 movq %r11,(%rdi,%rcx,1)
1104 movq %rbx,%rax
1105 movq %rdx,%r13
1106 adcq $0,%r13
1107 addq 8(%rdi,%rcx,1),%r12
1108 leaq 16(%rcx),%rcx
1109 adcq $0,%r13
1110
1111 mulq %r14
1112 addq %rax,%r10
1113 movq %rbx,%rax
1114 adcq $0,%rdx
1115 addq %r12,%r10
1116 movq %rdx,%r11
1117 adcq $0,%r11
1118 movq %r10,-8(%rdi,%rcx,1)
1119
1120 cmpq $0,%rcx
1121 jne .Lsqr4x_inner
1122
1123 .byte 0x67
1124 mulq %r15
1125 addq %rax,%r13
1126 adcq $0,%rdx
1127 addq %r11,%r13
1128 adcq $0,%rdx
1129
1130 movq %r13,(%rdi)
1131 movq %rdx,%r12
1132 movq %rdx,8(%rdi)
1133
1134 addq $16,%rbp
1135 jnz .Lsqr4x_outer
1136
1137
1138 movq -32(%rsi),%r14
1139 leaq 48+8(%rsp,%r9,2),%rdi
1140 movq -24(%rsi),%rax
1141 leaq -32(%rdi,%rbp,1),%rdi
1142 movq -16(%rsi),%rbx
1143 movq %rax,%r15
1144
1145 mulq %r14
1146 addq %rax,%r10
1147 movq %rbx,%rax
1148 movq %rdx,%r11
1149 adcq $0,%r11
1150
1151 mulq %r14
1152 addq %rax,%r11
1153 movq %rbx,%rax
1154 movq %r10,-24(%rdi)
1155 movq %rdx,%r10
1156 adcq $0,%r10
1157 addq %r13,%r11
1158 movq -8(%rsi),%rbx
1159 adcq $0,%r10
1160
1161 mulq %r15
1162 addq %rax,%r12
1163 movq %rbx,%rax
1164 movq %r11,-16(%rdi)
1165 movq %rdx,%r13
1166 adcq $0,%r13
1167
1168 mulq %r14
1169 addq %rax,%r10
1170 movq %rbx,%rax
1171 movq %rdx,%r11
1172 adcq $0,%r11
1173 addq %r12,%r10
1174 adcq $0,%r11
1175 movq %r10,-8(%rdi)
1176
1177 mulq %r15
1178 addq %rax,%r13
1179 movq -16(%rsi),%rax
1180 adcq $0,%rdx
1181 addq %r11,%r13
1182 adcq $0,%rdx
1183
1184 movq %r13,(%rdi)
1185 movq %rdx,%r12
1186 movq %rdx,8(%rdi)
1187
1188 mulq %rbx
1189 addq $16,%rbp
1190 xorq %r14,%r14
1191 subq %r9,%rbp
1192 xorq %r15,%r15
1193
1194 addq %r12,%rax
1195 adcq $0,%rdx
1196 movq %rax,8(%rdi)
1197 movq %rdx,16(%rdi)
1198 movq %r15,24(%rdi)
1199
1200 movq -16(%rsi,%rbp,1),%rax
1201 leaq 48+8(%rsp),%rdi
1202 xorq %r10,%r10
1203 movq 8(%rdi),%r11
1204
1205 leaq (%r14,%r10,2),%r12
1206 shrq $63,%r10
1207 leaq (%rcx,%r11,2),%r13
1208 shrq $63,%r11
1209 orq %r10,%r13
1210 movq 16(%rdi),%r10
1211 movq %r11,%r14
1212 mulq %rax
1213 negq %r15
1214 movq 24(%rdi),%r11
1215 adcq %rax,%r12
1216 movq -8(%rsi,%rbp,1),%rax
1217 movq %r12,(%rdi)
1218 adcq %rdx,%r13
1219
1220 leaq (%r14,%r10,2),%rbx
1221 movq %r13,8(%rdi)
1222 sbbq %r15,%r15
1223 shrq $63,%r10
1224 leaq (%rcx,%r11,2),%r8
1225 shrq $63,%r11
1226 orq %r10,%r8
1227 movq 32(%rdi),%r10
1228 movq %r11,%r14
1229 mulq %rax
1230 negq %r15
1231 movq 40(%rdi),%r11
1232 adcq %rax,%rbx
1233 movq 0(%rsi,%rbp,1),%rax
1234 movq %rbx,16(%rdi)
1235 adcq %rdx,%r8
1236 leaq 16(%rbp),%rbp
1237 movq %r8,24(%rdi)
1238 sbbq %r15,%r15
1239 leaq 64(%rdi),%rdi
1240 jmp .Lsqr4x_shift_n_add
1241
1242 .align 32
1243 .Lsqr4x_shift_n_add:
1244 leaq (%r14,%r10,2),%r12
1245 shrq $63,%r10
1246 leaq (%rcx,%r11,2),%r13
1247 shrq $63,%r11
1248 orq %r10,%r13
1249 movq -16(%rdi),%r10
1250 movq %r11,%r14
1251 mulq %rax
1252 negq %r15
1253 movq -8(%rdi),%r11
1254 adcq %rax,%r12
1255 movq -8(%rsi,%rbp,1),%rax
1256 movq %r12,-32(%rdi)
1257 adcq %rdx,%r13
1258
1259 leaq (%r14,%r10,2),%rbx
1260 movq %r13,-24(%rdi)
1261 sbbq %r15,%r15
1262 shrq $63,%r10
1263 leaq (%rcx,%r11,2),%r8
1264 shrq $63,%r11
1265 orq %r10,%r8
1266 movq 0(%rdi),%r10
1267 movq %r11,%r14
1268 mulq %rax
1269 negq %r15
1270 movq 8(%rdi),%r11
1271 adcq %rax,%rbx
1272 movq 0(%rsi,%rbp,1),%rax
1273 movq %rbx,-16(%rdi)
1274 adcq %rdx,%r8
1275
1276 leaq (%r14,%r10,2),%r12
1277 movq %r8,-8(%rdi)
1278 sbbq %r15,%r15
1279 shrq $63,%r10
1280 leaq (%rcx,%r11,2),%r13
1281 shrq $63,%r11
1282 orq %r10,%r13
1283 movq 16(%rdi),%r10
1284 movq %r11,%r14
1285 mulq %rax
1286 negq %r15
1287 movq 24(%rdi),%r11
1288 adcq %rax,%r12
1289 movq 8(%rsi,%rbp,1),%rax
1290 movq %r12,0(%rdi)
1291 adcq %rdx,%r13
1292
1293 leaq (%r14,%r10,2),%rbx
1294 movq %r13,8(%rdi)
1295 sbbq %r15,%r15
1296 shrq $63,%r10
1297 leaq (%rcx,%r11,2),%r8
1298 shrq $63,%r11
1299 orq %r10,%r8
1300 movq 32(%rdi),%r10
1301 movq %r11,%r14
1302 mulq %rax
1303 negq %r15
1304 movq 40(%rdi),%r11
1305 adcq %rax,%rbx
1306 movq 16(%rsi,%rbp,1),%rax
1307 movq %rbx,16(%rdi)
1308 adcq %rdx,%r8
1309 movq %r8,24(%rdi)
1310 sbbq %r15,%r15
1311 leaq 64(%rdi),%rdi
1312 addq $32,%rbp
1313 jnz .Lsqr4x_shift_n_add
1314
1315 leaq (%r14,%r10,2),%r12
1316 .byte 0x67
1317 shrq $63,%r10
1318 leaq (%rcx,%r11,2),%r13
1319 shrq $63,%r11
1320 orq %r10,%r13
1321 movq -16(%rdi),%r10
1322 movq %r11,%r14
1323 mulq %rax
1324 negq %r15
1325 movq -8(%rdi),%r11
1326 adcq %rax,%r12
1327 movq -8(%rsi),%rax
1328 movq %r12,-32(%rdi)
1329 adcq %rdx,%r13
1330
1331 leaq (%r14,%r10,2),%rbx
1332 movq %r13,-24(%rdi)
1333 sbbq %r15,%r15
1334 shrq $63,%r10
1335 leaq (%rcx,%r11,2),%r8
1336 shrq $63,%r11
1337 orq %r10,%r8
1338 mulq %rax
1339 negq %r15
1340 adcq %rax,%rbx
1341 adcq %rdx,%r8
1342 movq %rbx,-16(%rdi)
1343 movq %r8,-8(%rdi)
1344 .byte 102,72,15,126,213
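/* [Annotation] sqr8x_reduction performs the Montgomery reduction of the
   2*num-word square, eight limbs per pass (.L8x_reduce/.L8x_tail); %rbp,
   just restored from %xmm2 by the hand-encoded movq above, points at the
   modulus. */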
1345 sqr8x_reduction:
1346 xorq %rax,%rax
1347 leaq (%rbp,%r9,2),%rcx
1348 leaq 48+8(%rsp,%r9,2),%rdx
1349 movq %rcx,0+8(%rsp)
1350 leaq 48+8(%rsp,%r9,1),%rdi
1351 movq %rdx,8+8(%rsp)
1352 negq %r9
1353 jmp .L8x_reduction_loop
1354
1355 .align 32
1356 .L8x_reduction_loop:
1357 leaq (%rdi,%r9,1),%rdi
1358 .byte 0x66
1359 movq 0(%rdi),%rbx
1360 movq 8(%rdi),%r9
1361 movq 16(%rdi),%r10
1362 movq 24(%rdi),%r11
1363 movq 32(%rdi),%r12
1364 movq 40(%rdi),%r13
1365 movq 48(%rdi),%r14
1366 movq 56(%rdi),%r15
1367 movq %rax,(%rdx)
1368 leaq 64(%rdi),%rdi
1369
1370 .byte 0x67
1371 movq %rbx,%r8
1372 imulq 32+8(%rsp),%rbx
1373 movq 0(%rbp),%rax
1374 movl $8,%ecx
1375 jmp .L8x_reduce
1376
1377 .align 32
1378 .L8x_reduce:
1379 mulq %rbx
1380 movq 16(%rbp),%rax
1381 negq %r8
1382 movq %rdx,%r8
1383 adcq $0,%r8
1384
1385 mulq %rbx
1386 addq %rax,%r9
1387 movq 32(%rbp),%rax
1388 adcq $0,%rdx
1389 addq %r9,%r8
1390 movq %rbx,48-8+8(%rsp,%rcx,8)
1391 movq %rdx,%r9
1392 adcq $0,%r9
1393
1394 mulq %rbx
1395 addq %rax,%r10
1396 movq 48(%rbp),%rax
1397 adcq $0,%rdx
1398 addq %r10,%r9
1399 movq 32+8(%rsp),%rsi
1400 movq %rdx,%r10
1401 adcq $0,%r10
1402
1403 mulq %rbx
1404 addq %rax,%r11
1405 movq 64(%rbp),%rax
1406 adcq $0,%rdx
1407 imulq %r8,%rsi
1408 addq %r11,%r10
1409 movq %rdx,%r11
1410 adcq $0,%r11
1411
1412 mulq %rbx
1413 addq %rax,%r12
1414 movq 80(%rbp),%rax
1415 adcq $0,%rdx
1416 addq %r12,%r11
1417 movq %rdx,%r12
1418 adcq $0,%r12
1419
1420 mulq %rbx
1421 addq %rax,%r13
1422 movq 96(%rbp),%rax
1423 adcq $0,%rdx
1424 addq %r13,%r12
1425 movq %rdx,%r13
1426 adcq $0,%r13
1427
1428 mulq %rbx
1429 addq %rax,%r14
1430 movq 112(%rbp),%rax
1431 adcq $0,%rdx
1432 addq %r14,%r13
1433 movq %rdx,%r14
1434 adcq $0,%r14
1435
1436 mulq %rbx
1437 movq %rsi,%rbx
1438 addq %rax,%r15
1439 movq 0(%rbp),%rax
1440 adcq $0,%rdx
1441 addq %r15,%r14
1442 movq %rdx,%r15
1443 adcq $0,%r15
1444
1445 decl %ecx
1446 jnz .L8x_reduce
1447
1448 leaq 128(%rbp),%rbp
1449 xorq %rax,%rax
1450 movq 8+8(%rsp),%rdx
1451 cmpq 0+8(%rsp),%rbp
1452 jae .L8x_no_tail
1453
1454 .byte 0x66
1455 addq 0(%rdi),%r8
1456 adcq 8(%rdi),%r9
1457 adcq 16(%rdi),%r10
1458 adcq 24(%rdi),%r11
1459 adcq 32(%rdi),%r12
1460 adcq 40(%rdi),%r13
1461 adcq 48(%rdi),%r14
1462 adcq 56(%rdi),%r15
1463 sbbq %rsi,%rsi
1464
1465 movq 48+56+8(%rsp),%rbx
1466 movl $8,%ecx
1467 movq 0(%rbp),%rax
1468 jmp .L8x_tail
1469
1470 .align 32
1471 .L8x_tail:
1472 mulq %rbx
1473 addq %rax,%r8
1474 movq 16(%rbp),%rax
1475 movq %r8,(%rdi)
1476 movq %rdx,%r8
1477 adcq $0,%r8
1478
1479 mulq %rbx
1480 addq %rax,%r9
1481 movq 32(%rbp),%rax
1482 adcq $0,%rdx
1483 addq %r9,%r8
1484 leaq 8(%rdi),%rdi
1485 movq %rdx,%r9
1486 adcq $0,%r9
1487
1488 mulq %rbx
1489 addq %rax,%r10
1490 movq 48(%rbp),%rax
1491 adcq $0,%rdx
1492 addq %r10,%r9
1493 movq %rdx,%r10
1494 adcq $0,%r10
1495
1496 mulq %rbx
1497 addq %rax,%r11
1498 movq 64(%rbp),%rax
1499 adcq $0,%rdx
1500 addq %r11,%r10
1501 movq %rdx,%r11
1502 adcq $0,%r11
1503
1504 mulq %rbx
1505 addq %rax,%r12
1506 movq 80(%rbp),%rax
1507 adcq $0,%rdx
1508 addq %r12,%r11
1509 movq %rdx,%r12
1510 adcq $0,%r12
1511
1512 mulq %rbx
1513 addq %rax,%r13
1514 movq 96(%rbp),%rax
1515 adcq $0,%rdx
1516 addq %r13,%r12
1517 movq %rdx,%r13
1518 adcq $0,%r13
1519
1520 mulq %rbx
1521 addq %rax,%r14
1522 movq 112(%rbp),%rax
1523 adcq $0,%rdx
1524 addq %r14,%r13
1525 movq %rdx,%r14
1526 adcq $0,%r14
1527
1528 mulq %rbx
1529 movq 48-16+8(%rsp,%rcx,8),%rbx
1530 addq %rax,%r15
1531 adcq $0,%rdx
1532 addq %r15,%r14
1533 movq 0(%rbp),%rax
1534 movq %rdx,%r15
1535 adcq $0,%r15
1536
1537 decl %ecx
1538 jnz .L8x_tail
1539
1540 leaq 128(%rbp),%rbp
1541 movq 8+8(%rsp),%rdx
1542 cmpq 0+8(%rsp),%rbp
1543 jae .L8x_tail_done
1544
1545 movq 48+56+8(%rsp),%rbx
1546 negq %rsi
1547 movq 0(%rbp),%rax
1548 adcq 0(%rdi),%r8
1549 adcq 8(%rdi),%r9
1550 adcq 16(%rdi),%r10
1551 adcq 24(%rdi),%r11
1552 adcq 32(%rdi),%r12
1553 adcq 40(%rdi),%r13
1554 adcq 48(%rdi),%r14
1555 adcq 56(%rdi),%r15
1556 sbbq %rsi,%rsi
1557
1558 movl $8,%ecx
1559 jmp .L8x_tail
1560
1561 .align 32
1562 .L8x_tail_done:
1563 addq (%rdx),%r8
1564 xorq %rax,%rax
1565
1566 negq %rsi
1567 .L8x_no_tail:
1568 adcq 0(%rdi),%r8
1569 adcq 8(%rdi),%r9
1570 adcq 16(%rdi),%r10
1571 adcq 24(%rdi),%r11
1572 adcq 32(%rdi),%r12
1573 adcq 40(%rdi),%r13
1574 adcq 48(%rdi),%r14
1575 adcq 56(%rdi),%r15
1576 adcq $0,%rax
1577 movq -16(%rbp),%rcx
1578 xorq %rsi,%rsi
1579
1580 .byte 102,72,15,126,213
1581
1582 movq %r8,0(%rdi)
1583 movq %r9,8(%rdi)
1584 .byte 102,73,15,126,217
1585 movq %r10,16(%rdi)
1586 movq %r11,24(%rdi)
1587 movq %r12,32(%rdi)
1588 movq %r13,40(%rdi)
1589 movq %r14,48(%rdi)
1590 movq %r15,56(%rdi)
1591 leaq 64(%rdi),%rdi
1592
1593 cmpq %rdx,%rdi
1594 jb .L8x_reduction_loop
1595
1596 subq %r15,%rcx
1597 leaq (%rdi,%r9,1),%rbx
1598 adcq %rsi,%rsi
1599 movq %r9,%rcx
1600 orq %rsi,%rax
1601 .byte 102,72,15,126,207
1602 xorq $1,%rax
1603 .byte 102,72,15,126,206
1604 leaq (%rbp,%rax,8),%rbp
1605 sarq $3+2,%rcx
1606 jmp .Lsqr4x_sub
1607
1608 .align 32
1609 .Lsqr4x_sub:
1610 .byte 0x66
1611 movq 0(%rbx),%r12
1612 movq 8(%rbx),%r13
1613 sbbq 0(%rbp),%r12
1614 movq 16(%rbx),%r14
1615 sbbq 16(%rbp),%r13
1616 movq 24(%rbx),%r15
1617 leaq 32(%rbx),%rbx
1618 sbbq 32(%rbp),%r14
1619 movq %r12,0(%rdi)
1620 sbbq 48(%rbp),%r15
1621 leaq 64(%rbp),%rbp
1622 movq %r13,8(%rdi)
1623 movq %r14,16(%rdi)
1624 movq %r15,24(%rdi)
1625 leaq 32(%rdi),%rdi
1626
1627 incq %rcx
1628 jnz .Lsqr4x_sub
1629 movq %r9,%r10
1630 negq %r9
1631 .byte 0xf3,0xc3
1632 .size bn_sqr8x_internal,.-bn_sqr8x_internal
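/* [Annotation] bn_from_montgomery converts out of Montgomery form
   (rp = ap * R^-1 mod np). Only word counts divisible by 8 are handled,
   by bn_from_mont8x, whose .Lmul_by_1 loop copies the input to the stack
   and zero-pads it before reusing sqr8x_reduction; other sizes return 0 so
   the caller can fall back to a generic path. */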
1633 .globl bn_from_montgomery
1634 .hidden bn_from_montgomery
1635 .type bn_from_montgomery,@function
1636 .align 32
1637 bn_from_montgomery:
1638 testl $7,%r9d
1639 jz bn_from_mont8x
1640 xorl %eax,%eax
1641 .byte 0xf3,0xc3
1642 .size bn_from_montgomery,.-bn_from_montgomery
1643
1644 .type bn_from_mont8x,@function
1645 .align 32
1646 bn_from_mont8x:
1647 .byte 0x67
1648 movq %rsp,%rax
1649 pushq %rbx
1650 pushq %rbp
1651 pushq %r12
1652 pushq %r13
1653 pushq %r14
1654 pushq %r15
1655 .byte 0x67
1656 movl %r9d,%r10d
1657 shll $3,%r9d
1658 shll $3+2,%r10d
1659 negq %r9
1660 movq (%r8),%r8
1661
1662
1663
1664
1665
1666
1667
1668 leaq -64(%rsp,%r9,2),%r11
1669 subq %rsi,%r11
1670 andq $4095,%r11
1671 cmpq %r11,%r10
1672 jb .Lfrom_sp_alt
1673 subq %r11,%rsp
1674 leaq -64(%rsp,%r9,2),%rsp
1675 jmp .Lfrom_sp_done
1676
1677 .align 32
1678 .Lfrom_sp_alt:
1679 leaq 4096-64(,%r9,2),%r10
1680 leaq -64(%rsp,%r9,2),%rsp
1681 subq %r10,%r11
1682 movq $0,%r10
1683 cmovcq %r10,%r11
1684 subq %r11,%rsp
1685 .Lfrom_sp_done:
1686 andq $-64,%rsp
1687 movq %r9,%r10
1688 negq %r9
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699 movq %r8,32(%rsp)
1700 movq %rax,40(%rsp)
1701 .Lfrom_body:
1702 movq %r9,%r11
1703 leaq 48(%rsp),%rax
1704 pxor %xmm0,%xmm0
1705 jmp .Lmul_by_1
1706
1707 .align 32
1708 .Lmul_by_1:
1709 movdqu (%rsi),%xmm1
1710 movdqu 16(%rsi),%xmm2
1711 movdqu 32(%rsi),%xmm3
1712 movdqa %xmm0,(%rax,%r9,1)
1713 movdqu 48(%rsi),%xmm4
1714 movdqa %xmm0,16(%rax,%r9,1)
1715 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
1716 movdqa %xmm1,(%rax)
1717 movdqa %xmm0,32(%rax,%r9,1)
1718 movdqa %xmm2,16(%rax)
1719 movdqa %xmm0,48(%rax,%r9,1)
1720 movdqa %xmm3,32(%rax)
1721 movdqa %xmm4,48(%rax)
1722 leaq 64(%rax),%rax
1723 subq $64,%r11
1724 jnz .Lmul_by_1
1725
1726 .byte 102,72,15,110,207
1727 .byte 102,72,15,110,209
1728 .byte 0x67
1729 movq %rcx,%rbp
1730 .byte 102,73,15,110,218
1731 call sqr8x_reduction
1732
1733 pxor %xmm0,%xmm0
1734 leaq 48(%rsp),%rax
1735 movq 40(%rsp),%rsi
1736 jmp .Lfrom_mont_zero
1737
1738 .align 32
1739 .Lfrom_mont_zero:
1740 movdqa %xmm0,0(%rax)
1741 movdqa %xmm0,16(%rax)
1742 movdqa %xmm0,32(%rax)
1743 movdqa %xmm0,48(%rax)
1744 leaq 64(%rax),%rax
1745 subq $32,%r9
1746 jnz .Lfrom_mont_zero
1747
1748 movq $1,%rax
1749 movq -48(%rsi),%r15
1750 movq -40(%rsi),%r14
1751 movq -32(%rsi),%r13
1752 movq -24(%rsi),%r12
1753 movq -16(%rsi),%rbp
1754 movq -8(%rsi),%rbx
1755 leaq (%rsi),%rsp
1756 .Lfrom_epilogue:
1757 .byte 0xf3,0xc3
1758 .size bn_from_mont8x,.-bn_from_mont8x
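/* [Annotation] bn_scatter5(inp, num, table, power) stores the num words of
   inp at table + power*8 with a 256-byte stride, interleaving the 32 table
   entries so that each entry is spread across many cache lines. */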
1759 .globl bn_scatter5
1760 .hidden bn_scatter5
1761 .type bn_scatter5,@function
1762 .align 16
1763 bn_scatter5:
1764 cmpl $0,%esi
1765 jz .Lscatter_epilogue
1766 leaq (%rdx,%rcx,8),%rdx
1767 .Lscatter:
1768 movq (%rdi),%rax
1769 leaq 8(%rdi),%rdi
1770 movq %rax,(%rdx)
1771 leaq 256(%rdx),%rdx
1772 subl $1,%esi
1773 jnz .Lscatter
1774 .Lscatter_epilogue:
1775 .byte 0xf3,0xc3
1776 .size bn_scatter5,.-bn_scatter5
1777
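/* [Annotation] bn_gather5(out, num, table, power) is the standalone
   inverse of bn_scatter5, reconstructing entry `power` word by word with
   the same masked four-way read used by the inlined gathers above. */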
1778 .globl bn_gather5
1779 .hidden bn_gather5
1780 .type bn_gather5,@function
1781 .align 16
1782 bn_gather5:
1783 movl %ecx,%r11d
1784 shrl $3,%ecx
1785 andq $7,%r11
1786 notl %ecx
1787 leaq .Lmagic_masks(%rip),%rax
1788 andl $3,%ecx
1789 leaq 128(%rdx,%r11,8),%rdx
1790 movq 0(%rax,%rcx,8),%xmm4
1791 movq 8(%rax,%rcx,8),%xmm5
1792 movq 16(%rax,%rcx,8),%xmm6
1793 movq 24(%rax,%rcx,8),%xmm7
1794 jmp .Lgather
1795 .align 16
1796 .Lgather:
1797 movq -128(%rdx),%xmm0
1798 movq -64(%rdx),%xmm1
1799 pand %xmm4,%xmm0
1800 movq 0(%rdx),%xmm2
1801 pand %xmm5,%xmm1
1802 movq 64(%rdx),%xmm3
1803 pand %xmm6,%xmm2
1804 por %xmm1,%xmm0
1805 pand %xmm7,%xmm3
1806 .byte 0x67,0x67
1807 por %xmm2,%xmm0
1808 leaq 256(%rdx),%rdx
1809 por %xmm3,%xmm0
1810
1811 movq %xmm0,(%rdi)
1812 leaq 8(%rdi),%rdi
1813 subl $1,%esi
1814 jnz .Lgather
1815 .byte 0xf3,0xc3
1816 .LSEH_end_bn_gather5:
1817 .size bn_gather5,.-bn_gather5
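/* [Annotation] Of the eight 64-bit words below only the fourth is
   all-ones; the gather code loads a sliding window of four of them into
   %xmm4..%xmm7, so two bits of the secret index decide which single lane
   survives the pand/por selection. */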
1818 .align 64
1819 .Lmagic_masks:
1820 .long 0,0, 0,0, 0,0, -1,-1
1821 .long 0,0, 0,0, 0,0, 0,0
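/* [Annotation] The byte string below spells the ASCII banner "Montgomery
   Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
   <appro@openssl.org>". */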
1822 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1823 #endif
