Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(199)

Side by Side Diff: third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S

Issue 1319703002: Breaking Change: merge BoringSSL branch into master (Closed) Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #if defined(__x86_64__)
2 .text
3
4
5
/*
 * _bn_mul_mont_gather5(rp=%rdi, ap=%rsi, bp=%rdx, np=%rcx, n0=%r8,
 *                      num=%r9d, power=8(%rsp))
 * Montgomery multiplication where the b[] operand column is fetched with a
 * masked SSE2 gather (xmm4-xmm7 select one of the pre-scattered powers).
 * num not divisible by 8 takes the scalar 1x path below; otherwise it
 * tail-jumps to the 4x-unrolled variant.
 */
6 .globl _bn_mul_mont_gather5
7 .private_extern _bn_mul_mont_gather5
8
9 .p2align 6
10 _bn_mul_mont_gather5:
11 testl $7,%r9d
12 jnz L$mul_enter
13 jmp L$mul4x_enter
14
15 .p2align 4
16 L$mul_enter:
17 movl %r9d,%r9d
18 movq %rsp,%rax                 /* %rax = original %rsp, saved below for the epilogue */
19 movl 8(%rsp),%r10d             /* 7th argument: power/index into the scattered table */
20 pushq %rbx
21 pushq %rbp
22 pushq %r12
23 pushq %r13
24 pushq %r14
25 pushq %r15
26 leaq 2(%r9),%r11
27 negq %r11
28 leaq (%rsp,%r11,8),%rsp        /* carve num+2 qwords of scratch (tp) */
29 andq $-1024,%rsp               /* align scratch to 1024 */
30
31 movq %rax,8(%rsp,%r9,8)        /* stash original %rsp above tp[num] */
32 L$mul_body:
/* Derive the four 64-bit masks from L$magic_masks using power's low bits,
   then gather bp[power] from the 256-byte-strided scatter layout. */
33 movq %rdx,%r12
34 movq %r10,%r11
35 shrq $3,%r10
36 andq $7,%r11
37 notq %r10
38 leaq L$magic_masks(%rip),%rax
39 andq $3,%r10
40 leaq 96(%r12,%r11,8),%r12
41 movq 0(%rax,%r10,8),%xmm4
42 movq 8(%rax,%r10,8),%xmm5
43 movq 16(%rax,%r10,8),%xmm6
44 movq 24(%rax,%r10,8),%xmm7
45
46 movq -96(%r12),%xmm0
47 movq -32(%r12),%xmm1
48 pand %xmm4,%xmm0
49 movq 32(%r12),%xmm2
50 pand %xmm5,%xmm1
51 movq 96(%r12),%xmm3
52 pand %xmm6,%xmm2
53 por %xmm1,%xmm0
54 pand %xmm7,%xmm3
55 por %xmm2,%xmm0
56 leaq 256(%r12),%r12            /* advance to next scattered column */
57 por %xmm3,%xmm0
58
59 .byte 102,72,15,126,195        /* movq %xmm0,%rbx — gathered b[0] */
60
61 movq (%r8),%r8                 /* n0 value */
62 movq (%rsi),%rax               /* ap[0] */
63
64 xorq %r14,%r14                 /* i = 0 (outer counter) */
65 xorq %r15,%r15                 /* j = 0 (inner counter) */
66
/* Start gathering b[1] in the background while the first column runs. */
67 movq -96(%r12),%xmm0
68 movq -32(%r12),%xmm1
69 pand %xmm4,%xmm0
70 movq 32(%r12),%xmm2
71 pand %xmm5,%xmm1
72
73 movq %r8,%rbp
74 mulq %rbx
75 movq %rax,%r10
76 movq (%rcx),%rax
77
78 movq 96(%r12),%xmm3
79 pand %xmm6,%xmm2
80 por %xmm1,%xmm0
81 pand %xmm7,%xmm3
82
83 imulq %r10,%rbp                /* m = tp[0]*n0 mod 2^64 */
84 movq %rdx,%r11
85
86 por %xmm2,%xmm0
87 leaq 256(%r12),%r12
88 por %xmm3,%xmm0
89
90 mulq %rbp
91 addq %rax,%r10
92 movq 8(%rsi),%rax
93 adcq $0,%rdx
94 movq %rdx,%r13
95
96 leaq 1(%r15),%r15
97 jmp L$1st_enter
98
99 .p2align 4
/* First pass: tp[] = ap[]*b[0] + m*np[] (interleaved mul/reduce column). */
100 L$1st:
101 addq %rax,%r13
102 movq (%rsi,%r15,8),%rax
103 adcq $0,%rdx
104 addq %r11,%r13
105 movq %r10,%r11
106 adcq $0,%rdx
107 movq %r13,-16(%rsp,%r15,8)
108 movq %rdx,%r13
109
110 L$1st_enter:
111 mulq %rbx
112 addq %rax,%r11
113 movq (%rcx,%r15,8),%rax
114 adcq $0,%rdx
115 leaq 1(%r15),%r15
116 movq %rdx,%r10
117
118 mulq %rbp
119 cmpq %r9,%r15
120 jne L$1st
121
122 .byte 102,72,15,126,195        /* movq %xmm0,%rbx — gathered b[1] */
123
124 addq %rax,%r13
125 movq (%rsi),%rax
126 adcq $0,%rdx
127 addq %r11,%r13
128 adcq $0,%rdx
129 movq %r13,-16(%rsp,%r15,8)
130 movq %rdx,%r13
131 movq %r10,%r11
132
133 xorq %rdx,%rdx
134 addq %r11,%r13
135 adcq $0,%rdx
136 movq %r13,-8(%rsp,%r9,8)
137 movq %rdx,(%rsp,%r9,8)         /* top carry word */
138
139 leaq 1(%r14),%r14
140 jmp L$outer
141 .p2align 4
/* Outer loop: one pass per remaining b[i]; gathers b[i+1] in the shadow. */
142 L$outer:
143 xorq %r15,%r15
144 movq %r8,%rbp
145 movq (%rsp),%r10
146
147 movq -96(%r12),%xmm0
148 movq -32(%r12),%xmm1
149 pand %xmm4,%xmm0
150 movq 32(%r12),%xmm2
151 pand %xmm5,%xmm1
152
153 mulq %rbx
154 addq %rax,%r10
155 movq (%rcx),%rax
156 adcq $0,%rdx
157
158 movq 96(%r12),%xmm3
159 pand %xmm6,%xmm2
160 por %xmm1,%xmm0
161 pand %xmm7,%xmm3
162
163 imulq %r10,%rbp                /* m = tp[0]*n0 mod 2^64 */
164 movq %rdx,%r11
165
166 por %xmm2,%xmm0
167 leaq 256(%r12),%r12
168 por %xmm3,%xmm0
169
170 mulq %rbp
171 addq %rax,%r10
172 movq 8(%rsi),%rax
173 adcq $0,%rdx
174 movq 8(%rsp),%r10
175 movq %rdx,%r13
176
177 leaq 1(%r15),%r15
178 jmp L$inner_enter
179
180 .p2align 4
181 L$inner:
182 addq %rax,%r13
183 movq (%rsi,%r15,8),%rax
184 adcq $0,%rdx
185 addq %r10,%r13
186 movq (%rsp,%r15,8),%r10
187 adcq $0,%rdx
188 movq %r13,-16(%rsp,%r15,8)
189 movq %rdx,%r13
190
191 L$inner_enter:
192 mulq %rbx
193 addq %rax,%r11
194 movq (%rcx,%r15,8),%rax
195 adcq $0,%rdx
196 addq %r11,%r10
197 movq %rdx,%r11
198 adcq $0,%r11
199 leaq 1(%r15),%r15
200
201 mulq %rbp
202 cmpq %r9,%r15
203 jne L$inner
204
205 .byte 102,72,15,126,195        /* movq %xmm0,%rbx — next gathered b[i] */
206
207 addq %rax,%r13
208 movq (%rsi),%rax
209 adcq $0,%rdx
210 addq %r10,%r13
211 movq (%rsp,%r15,8),%r10
212 adcq $0,%rdx
213 movq %r13,-16(%rsp,%r15,8)
214 movq %rdx,%r13
215
216 xorq %rdx,%rdx
217 addq %r11,%r13
218 adcq $0,%rdx
219 addq %r10,%r13
220 adcq $0,%rdx
221 movq %r13,-8(%rsp,%r9,8)
222 movq %rdx,(%rsp,%r9,8)
223
224 leaq 1(%r14),%r14
225 cmpq %r9,%r14
226 jb L$outer
227
/* Final conditional subtraction: rp[] = tp[] - np[] with borrow tracking. */
228 xorq %r14,%r14
229 movq (%rsp),%rax
230 leaq (%rsp),%rsi
231 movq %r9,%r15
232 jmp L$sub
233 .p2align 4
234 L$sub: sbbq (%rcx,%r14,8),%rax
235 movq %rax,(%rdi,%r14,8)
236 movq 8(%rsi,%r14,8),%rax
237 leaq 1(%r14),%r14
238 decq %r15
239 jnz L$sub
240
241 sbbq $0,%rax                   /* %rax = 0 if tp >= np (keep difference), else all-ones */
242 xorq %r14,%r14
243 movq %r9,%r15
244 .p2align 4
/* Branch-free select between tp[] and the subtracted value already in rp[];
   also overwrites each tp word (with the counter) to scrub the scratch. */
245 L$copy:
246 movq (%rsp,%r14,8),%rsi
247 movq (%rdi,%r14,8),%rcx
248 xorq %rcx,%rsi
249 andq %rax,%rsi
250 xorq %rcx,%rsi                 /* %rsi = mask ? tp[i] : rp[i] */
251 movq %r14,(%rsp,%r14,8)
252 movq %rsi,(%rdi,%r14,8)
253 leaq 1(%r14),%r14
254 subq $1,%r15
255 jnz L$copy
256
/* Epilogue: recover the saved original %rsp and restore callee-saved regs. */
257 movq 8(%rsp,%r9,8),%rsi
258 movq $1,%rax
259 movq -48(%rsi),%r15
260 movq -40(%rsi),%r14
261 movq -32(%rsi),%r13
262 movq -24(%rsi),%r12
263 movq -16(%rsi),%rbp
264 movq -8(%rsi),%rbx
265 leaq (%rsi),%rsp
266 L$mul_epilogue:
267 .byte 0xf3,0xc3                /* rep ret */
268
269
270 .p2align 5
/*
 * bn_mul4x_mont_gather5 — 4x-unrolled variant; entered via L$mul4x_enter
 * from _bn_mul_mont_gather5 when num is a multiple of 8. Carves an aligned
 * scratch frame (avoiding a 4 KiB-aliasing distance to ap) and delegates
 * the arithmetic to mul4x_internal.
 */
271 bn_mul4x_mont_gather5:
272 L$mul4x_enter:
273 .byte 0x67                     /* addr-size prefix used as alignment padding */
274 movq %rsp,%rax                 /* remember original %rsp for the epilogue */
275 pushq %rbx
276 pushq %rbp
277 pushq %r12
278 pushq %r13
279 pushq %r14
280 pushq %r15
281 .byte 0x67
282 movl %r9d,%r10d
283 shll $3,%r9d                   /* %r9 = num in bytes */
284 shll $3+2,%r10d                /* %r10 = 4*num in bytes */
285 negq %r9
286
287
288
289
290
291
292
293
/* Pick a frame position whose page offset differs from ap's to dodge
   cache-bank/page aliasing; alt path clamps when the gap is too small. */
294 leaq -64(%rsp,%r9,2),%r11
295 subq %rsi,%r11
296 andq $4095,%r11
297 cmpq %r11,%r10
298 jb L$mul4xsp_alt
299 subq %r11,%rsp
300 leaq -64(%rsp,%r9,2),%rsp
301 jmp L$mul4xsp_done
302
303 .p2align 5
304 L$mul4xsp_alt:
305 leaq 4096-64(,%r9,2),%r10
306 leaq -64(%rsp,%r9,2),%rsp
307 subq %r10,%r11
308 movq $0,%r10
309 cmovcq %r10,%r11
310 subq %r11,%rsp
311 L$mul4xsp_done:
312 andq $-64,%rsp                 /* 64-byte align the frame */
313 negq %r9
314
315 movq %rax,40(%rsp)             /* saved original %rsp, reloaded after the call */
316 L$mul4x_body:
317
318 call mul4x_internal
319
/* Epilogue: restore callee-saved registers from the saved frame pointer. */
320 movq 40(%rsp),%rsi
321 movq $1,%rax
322 movq -48(%rsi),%r15
323 movq -40(%rsi),%r14
324 movq -32(%rsi),%r13
325 movq -24(%rsi),%r12
326 movq -16(%rsi),%rbp
327 movq -8(%rsi),%rbx
328 leaq (%rsi),%rsp
329 L$mul4x_epilogue:
330 .byte 0xf3,0xc3                /* rep ret */
331
332
333
334 .p2align 5
/*
 * mul4x_internal — 4x-unrolled Montgomery multiplication core. Called with
 * the frame already set up (bn_mul4x_mont_gather5, _bn_power5). Performs the
 * same gather of b[] powers as the 1x path, runs 4-limb-wide mul/reduce
 * passes, then tail-jumps into L$sqr4x_sub for the final conditional
 * subtraction. Saves the gather end pointer at 16+8(%rsp) and rp at
 * 56+8(%rsp) for later.
 */
335 mul4x_internal:
336 shlq $5,%r9
337 movl 8(%rax),%r10d             /* power/index argument from the saved stack */
338 leaq 256(%rdx,%r9,1),%r13      /* end of the scattered bp table */
339 shrq $5,%r9
340 movq %r10,%r11
341 shrq $3,%r10
342 andq $7,%r11
343 notq %r10
344 leaq L$magic_masks(%rip),%rax
345 andq $3,%r10
346 leaq 96(%rdx,%r11,8),%r12
347 movq 0(%rax,%r10,8),%xmm4
348 movq 8(%rax,%r10,8),%xmm5
349 addq $7,%r11
350 movq 16(%rax,%r10,8),%xmm6
351 movq 24(%rax,%r10,8),%xmm7
352 andq $7,%r11
353
/* Gather b[0] (via %r12) and start gathering b[1] (via %r14) in parallel. */
354 movq -96(%r12),%xmm0
355 leaq 256(%r12),%r14
356 movq -32(%r12),%xmm1
357 pand %xmm4,%xmm0
358 movq 32(%r12),%xmm2
359 pand %xmm5,%xmm1
360 movq 96(%r12),%xmm3
361 pand %xmm6,%xmm2
362 .byte 0x67
363 por %xmm1,%xmm0
364 movq -96(%r14),%xmm1
365 .byte 0x67
366 pand %xmm7,%xmm3
367 .byte 0x67
368 por %xmm2,%xmm0
369 movq -32(%r14),%xmm2
370 .byte 0x67
371 pand %xmm4,%xmm1
372 .byte 0x67
373 por %xmm3,%xmm0
374 movq 32(%r14),%xmm3
375
376 .byte 102,72,15,126,195        /* movq %xmm0,%rbx — gathered b[0] */
377 movq 96(%r14),%xmm0
378 movq %r13,16+8(%rsp)
379 movq %rdi,56+8(%rsp)
380
381 movq (%r8),%r8                 /* n0 */
382 movq (%rsi),%rax               /* ap[0] */
383 leaq (%rsi,%r9,1),%rsi         /* point %rsi past ap[]; negative %r9 indexes back */
384 negq %r9
385
386 movq %r8,%rbp
387 mulq %rbx
388 movq %rax,%r10
389 movq (%rcx),%rax
390
391 pand %xmm5,%xmm2
392 pand %xmm6,%xmm3
393 por %xmm2,%xmm1
394
395 imulq %r10,%rbp                /* m = tp[0]*n0 mod 2^64 */
396
397
398
399
400
401
402
403 leaq 64+8(%rsp,%r11,8),%r14    /* tp cursor */
404 movq %rdx,%r11
405
406 pand %xmm7,%xmm0
407 por %xmm3,%xmm1
408 leaq 512(%r12),%r12
409 por %xmm1,%xmm0
410
411 mulq %rbp
412 addq %rax,%r10
413 movq 8(%rsi,%r9,1),%rax
414 adcq $0,%rdx
415 movq %rdx,%rdi
416
417 mulq %rbx
418 addq %rax,%r11
419 movq 16(%rcx),%rax
420 adcq $0,%rdx
421 movq %rdx,%r10
422
423 mulq %rbp
424 addq %rax,%rdi
425 movq 16(%rsi,%r9,1),%rax
426 adcq $0,%rdx
427 addq %r11,%rdi
428 leaq 32(%r9),%r15
429 leaq 64(%rcx),%rcx
430 adcq $0,%rdx
431 movq %rdi,(%r14)
432 movq %rdx,%r13
433 jmp L$1st4x
434
435 .p2align 5
/* First pass, 4 limbs per iteration: tp[] = ap[]*b[0] + m*np[]. */
436 L$1st4x:
437 mulq %rbx
438 addq %rax,%r10
439 movq -32(%rcx),%rax
440 leaq 32(%r14),%r14
441 adcq $0,%rdx
442 movq %rdx,%r11
443
444 mulq %rbp
445 addq %rax,%r13
446 movq -8(%rsi,%r15,1),%rax
447 adcq $0,%rdx
448 addq %r10,%r13
449 adcq $0,%rdx
450 movq %r13,-24(%r14)
451 movq %rdx,%rdi
452
453 mulq %rbx
454 addq %rax,%r11
455 movq -16(%rcx),%rax
456 adcq $0,%rdx
457 movq %rdx,%r10
458
459 mulq %rbp
460 addq %rax,%rdi
461 movq (%rsi,%r15,1),%rax
462 adcq $0,%rdx
463 addq %r11,%rdi
464 adcq $0,%rdx
465 movq %rdi,-16(%r14)
466 movq %rdx,%r13
467
468 mulq %rbx
469 addq %rax,%r10
470 movq 0(%rcx),%rax
471 adcq $0,%rdx
472 movq %rdx,%r11
473
474 mulq %rbp
475 addq %rax,%r13
476 movq 8(%rsi,%r15,1),%rax
477 adcq $0,%rdx
478 addq %r10,%r13
479 adcq $0,%rdx
480 movq %r13,-8(%r14)
481 movq %rdx,%rdi
482
483 mulq %rbx
484 addq %rax,%r11
485 movq 16(%rcx),%rax
486 adcq $0,%rdx
487 movq %rdx,%r10
488
489 mulq %rbp
490 addq %rax,%rdi
491 movq 16(%rsi,%r15,1),%rax
492 adcq $0,%rdx
493 addq %r11,%rdi
494 leaq 64(%rcx),%rcx
495 adcq $0,%rdx
496 movq %rdi,(%r14)
497 movq %rdx,%r13
498
499 addq $32,%r15
500 jnz L$1st4x
501
/* Peeled tail of the first pass (last 2 limbs). */
502 mulq %rbx
503 addq %rax,%r10
504 movq -32(%rcx),%rax
505 leaq 32(%r14),%r14
506 adcq $0,%rdx
507 movq %rdx,%r11
508
509 mulq %rbp
510 addq %rax,%r13
511 movq -8(%rsi),%rax
512 adcq $0,%rdx
513 addq %r10,%r13
514 adcq $0,%rdx
515 movq %r13,-24(%r14)
516 movq %rdx,%rdi
517
518 mulq %rbx
519 addq %rax,%r11
520 movq -16(%rcx),%rax
521 adcq $0,%rdx
522 movq %rdx,%r10
523
524 mulq %rbp
525 addq %rax,%rdi
526 movq (%rsi,%r9,1),%rax
527 adcq $0,%rdx
528 addq %r11,%rdi
529 adcq $0,%rdx
530 movq %rdi,-16(%r14)
531 movq %rdx,%r13
532
533 .byte 102,72,15,126,195        /* movq %xmm0,%rbx — gathered b[1] */
534 leaq (%rcx,%r9,2),%rcx         /* rewind np */
535
536 xorq %rdi,%rdi
537 addq %r10,%r13
538 adcq $0,%rdi                   /* %rdi = top carry for this pass */
539 movq %r13,-8(%r14)
540
541 jmp L$outer4x
542
543 .p2align 5
/* Outer loop: one 4x pass per b[i], gathering b[i+1] in the shadow. */
544 L$outer4x:
545 movq (%r14,%r9,1),%r10
546 movq %r8,%rbp
547 mulq %rbx
548 addq %rax,%r10
549 movq (%rcx),%rax
550 adcq $0,%rdx
551
552 movq -96(%r12),%xmm0
553 movq -32(%r12),%xmm1
554 pand %xmm4,%xmm0
555 movq 32(%r12),%xmm2
556 pand %xmm5,%xmm1
557 movq 96(%r12),%xmm3
558
559 imulq %r10,%rbp                /* m = tp[0]*n0 mod 2^64 */
560 .byte 0x67
561 movq %rdx,%r11
562 movq %rdi,(%r14)               /* store previous pass's top carry */
563
564 pand %xmm6,%xmm2
565 por %xmm1,%xmm0
566 pand %xmm7,%xmm3
567 por %xmm2,%xmm0
568 leaq (%r14,%r9,1),%r14
569 leaq 256(%r12),%r12
570 por %xmm3,%xmm0
571
572 mulq %rbp
573 addq %rax,%r10
574 movq 8(%rsi,%r9,1),%rax
575 adcq $0,%rdx
576 movq %rdx,%rdi
577
578 mulq %rbx
579 addq %rax,%r11
580 movq 16(%rcx),%rax
581 adcq $0,%rdx
582 addq 8(%r14),%r11
583 adcq $0,%rdx
584 movq %rdx,%r10
585
586 mulq %rbp
587 addq %rax,%rdi
588 movq 16(%rsi,%r9,1),%rax
589 adcq $0,%rdx
590 addq %r11,%rdi
591 leaq 32(%r9),%r15
592 leaq 64(%rcx),%rcx
593 adcq $0,%rdx
594 movq %rdx,%r13
595 jmp L$inner4x
596
597 .p2align 5
/* Inner loop: tp[] += ap[]*b[i] + m*np[], 4 limbs per iteration. */
598 L$inner4x:
599 mulq %rbx
600 addq %rax,%r10
601 movq -32(%rcx),%rax
602 adcq $0,%rdx
603 addq 16(%r14),%r10
604 leaq 32(%r14),%r14
605 adcq $0,%rdx
606 movq %rdx,%r11
607
608 mulq %rbp
609 addq %rax,%r13
610 movq -8(%rsi,%r15,1),%rax
611 adcq $0,%rdx
612 addq %r10,%r13
613 adcq $0,%rdx
614 movq %rdi,-32(%r14)
615 movq %rdx,%rdi
616
617 mulq %rbx
618 addq %rax,%r11
619 movq -16(%rcx),%rax
620 adcq $0,%rdx
621 addq -8(%r14),%r11
622 adcq $0,%rdx
623 movq %rdx,%r10
624
625 mulq %rbp
626 addq %rax,%rdi
627 movq (%rsi,%r15,1),%rax
628 adcq $0,%rdx
629 addq %r11,%rdi
630 adcq $0,%rdx
631 movq %r13,-24(%r14)
632 movq %rdx,%r13
633
634 mulq %rbx
635 addq %rax,%r10
636 movq 0(%rcx),%rax
637 adcq $0,%rdx
638 addq (%r14),%r10
639 adcq $0,%rdx
640 movq %rdx,%r11
641
642 mulq %rbp
643 addq %rax,%r13
644 movq 8(%rsi,%r15,1),%rax
645 adcq $0,%rdx
646 addq %r10,%r13
647 adcq $0,%rdx
648 movq %rdi,-16(%r14)
649 movq %rdx,%rdi
650
651 mulq %rbx
652 addq %rax,%r11
653 movq 16(%rcx),%rax
654 adcq $0,%rdx
655 addq 8(%r14),%r11
656 adcq $0,%rdx
657 movq %rdx,%r10
658
659 mulq %rbp
660 addq %rax,%rdi
661 movq 16(%rsi,%r15,1),%rax
662 adcq $0,%rdx
663 addq %r11,%rdi
664 leaq 64(%rcx),%rcx
665 adcq $0,%rdx
666 movq %r13,-8(%r14)
667 movq %rdx,%r13
668
669 addq $32,%r15
670 jnz L$inner4x
671
/* Peeled tail of the inner loop. */
672 mulq %rbx
673 addq %rax,%r10
674 movq -32(%rcx),%rax
675 adcq $0,%rdx
676 addq 16(%r14),%r10
677 leaq 32(%r14),%r14
678 adcq $0,%rdx
679 movq %rdx,%r11
680
681 mulq %rbp
682 addq %rax,%r13
683 movq -8(%rsi),%rax
684 adcq $0,%rdx
685 addq %r10,%r13
686 adcq $0,%rdx
687 movq %rdi,-32(%r14)
688 movq %rdx,%rdi
689
690 mulq %rbx
691 addq %rax,%r11
692 movq %rbp,%rax
693 movq -16(%rcx),%rbp
694 adcq $0,%rdx
695 addq -8(%r14),%r11
696 adcq $0,%rdx
697 movq %rdx,%r10
698
699 mulq %rbp
700 addq %rax,%rdi
701 movq (%rsi,%r9,1),%rax
702 adcq $0,%rdx
703 addq %r11,%rdi
704 adcq $0,%rdx
705 movq %r13,-24(%r14)
706 movq %rdx,%r13
707
708 .byte 102,72,15,126,195        /* movq %xmm0,%rbx — next gathered b[i] */
709 movq %rdi,-16(%r14)
710 leaq (%rcx,%r9,2),%rcx         /* rewind np */
711
712 xorq %rdi,%rdi
713 addq %r10,%r13
714 adcq $0,%rdi
715 addq (%r14),%r13
716 adcq $0,%rdi
717 movq %r13,-8(%r14)
718
719 cmpq 16+8(%rsp),%r12           /* reached end of scattered bp table? */
720 jb L$outer4x
/* Build the borrow/carry select mask and hand off to the shared
   conditional-subtraction loop (L$sqr4x_sub). */
721 subq %r13,%rbp
722 adcq %r15,%r15
723 orq %r15,%rdi
724 xorq $1,%rdi
725 leaq (%r14,%r9,1),%rbx
726 leaq (%rcx,%rdi,8),%rbp
727 movq %r9,%rcx
728 sarq $3+2,%rcx                 /* %rcx = -num/4 (loop counter for L$sqr4x_sub) */
729 movq 56+8(%rsp),%rdi           /* restore rp */
730 jmp L$sqr4x_sub
731
732 .globl _bn_power5
733 .private_extern _bn_power5
734
735 .p2align 5
/*
 * _bn_power5 — computes a^(2^5) * b in Montgomery domain in one shot:
 * five back-to-back calls to __bn_sqr8x_internal followed by one
 * mul4x_internal with a gathered multiplier. Same frame-carving strategy
 * as bn_mul4x_mont_gather5; n0 is stashed at 32(%rsp), original %rsp at
 * 40(%rsp).
 */
736 _bn_power5:
737 movq %rsp,%rax
738 pushq %rbx
739 pushq %rbp
740 pushq %r12
741 pushq %r13
742 pushq %r14
743 pushq %r15
744 movl %r9d,%r10d
745 shll $3,%r9d                   /* num in bytes */
746 shll $3+2,%r10d
747 negq %r9
748 movq (%r8),%r8                 /* n0 value */
749
750
751
752
753
754
755
/* Pick a frame position avoiding 4 KiB aliasing with ap (see mul4x). */
756 leaq -64(%rsp,%r9,2),%r11
757 subq %rsi,%r11
758 andq $4095,%r11
759 cmpq %r11,%r10
760 jb L$pwr_sp_alt
761 subq %r11,%rsp
762 leaq -64(%rsp,%r9,2),%rsp
763 jmp L$pwr_sp_done
764
765 .p2align 5
766 L$pwr_sp_alt:
767 leaq 4096-64(,%r9,2),%r10
768 leaq -64(%rsp,%r9,2),%rsp
769 subq %r10,%r11
770 movq $0,%r10
771 cmovcq %r10,%r11
772 subq %r11,%rsp
773 L$pwr_sp_done:
774 andq $-64,%rsp
775 movq %r9,%r10
776 negq %r9
777
778
779
780
781
782
783
784
785
786
787 movq %r8,32(%rsp)              /* n0, read by sqr8x_reduction */
788 movq %rax,40(%rsp)             /* original %rsp for the epilogue */
789 L$power5_body:
/* Park arguments in xmm registers across the internal calls. */
790 .byte 102,72,15,110,207        /* movq %rdi,%xmm1 — rp */
791 .byte 102,72,15,110,209        /* movq %rcx,%xmm2 — np */
792 .byte 102,73,15,110,218        /* movq %r10,%xmm3 — num */
793 .byte 102,72,15,110,226        /* movq %rdx,%xmm4 — bp/table */
794
795 call __bn_sqr8x_internal
796 call __bn_sqr8x_internal
797 call __bn_sqr8x_internal
798 call __bn_sqr8x_internal
799 call __bn_sqr8x_internal       /* five squarings: a^(2^5) */
800
801 .byte 102,72,15,126,209        /* movq %xmm2,%rcx — restore np */
802 .byte 102,72,15,126,226        /* movq %xmm4,%rdx — restore bp */
803 movq %rsi,%rdi
804 movq 40(%rsp),%rax
805 leaq 32(%rsp),%r8
806
807 call mul4x_internal
808
/* Epilogue: restore callee-saved registers from the saved frame pointer. */
809 movq 40(%rsp),%rsi
810 movq $1,%rax
811 movq -48(%rsi),%r15
812 movq -40(%rsi),%r14
813 movq -32(%rsi),%r13
814 movq -24(%rsi),%r12
815 movq -16(%rsi),%rbp
816 movq -8(%rsi),%rbx
817 leaq (%rsi),%rsp
818 L$power5_epilogue:
819 .byte 0xf3,0xc3                /* rep ret */
820
821
822 .globl _bn_sqr8x_internal
823 .private_extern _bn_sqr8x_internal
824 .private_extern _bn_sqr8x_internal
825
826 .p2align 5
/*
 * _bn_sqr8x_internal / __bn_sqr8x_internal — schoolbook squaring of a
 * num-limb number into the 2*num-limb scratch at 48+8(%rsp,...), then
 * falls through into sqr8x_reduction below. The generator stripped this
 * file's original commentary, leaving the long run of blank lines.
 * Strategy: compute all cross products a[i]*a[j] (i<j) first
 * (L$sqr4x_1st / L$sqr4x_outer / L$sqr4x_inner), then double them and add
 * the squares a[i]^2 on the diagonal (L$sqr4x_shift_n_add).
 * NOTE(review): the duplicated .private_extern on the line above is emitted
 * twice by the generator; it is harmless to the assembler.
 */
827 _bn_sqr8x_internal:
828 __bn_sqr8x_internal:
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902 leaq 32(%r10),%rbp
903 leaq (%rsi,%r9,1),%rsi         /* %rsi points past a[]; negative offsets index back */
904
905 movq %r9,%rcx
906
907
908 movq -32(%rsi,%rbp,1),%r14     /* a[0] */
909 leaq 48+8(%rsp,%r9,2),%rdi
910 movq -24(%rsi,%rbp,1),%rax     /* a[1] */
911 leaq -32(%rdi,%rbp,1),%rdi
912 movq -16(%rsi,%rbp,1),%rbx     /* a[2] */
913 movq %rax,%r15
914
915 mulq %r14                      /* a[1]*a[0] */
916 movq %rax,%r10
917 movq %rbx,%rax
918 movq %rdx,%r11
919 movq %r10,-24(%rdi,%rbp,1)
920
921 mulq %r14                      /* a[2]*a[0] */
922 addq %rax,%r11
923 movq %rbx,%rax
924 adcq $0,%rdx
925 movq %r11,-16(%rdi,%rbp,1)
926 movq %rdx,%r10
927
928
929 movq -8(%rsi,%rbp,1),%rbx      /* a[3] */
930 mulq %r15                      /* a[2]*a[1] */
931 movq %rax,%r12
932 movq %rbx,%rax
933 movq %rdx,%r13
934
935 leaq (%rbp),%rcx
936 mulq %r14                      /* a[3]*a[0] */
937 addq %rax,%r10
938 movq %rbx,%rax
939 movq %rdx,%r11
940 adcq $0,%r11
941 addq %r12,%r10
942 adcq $0,%r11
943 movq %r10,-8(%rdi,%rcx,1)
944 jmp L$sqr4x_1st
945
946 .p2align 5
/* First cross-product pass: multiply remaining a[] by a[0] and a[1]. */
947 L$sqr4x_1st:
948 movq (%rsi,%rcx,1),%rbx
949 mulq %r15
950 addq %rax,%r13
951 movq %rbx,%rax
952 movq %rdx,%r12
953 adcq $0,%r12
954
955 mulq %r14
956 addq %rax,%r11
957 movq %rbx,%rax
958 movq 8(%rsi,%rcx,1),%rbx
959 movq %rdx,%r10
960 adcq $0,%r10
961 addq %r13,%r11
962 adcq $0,%r10
963
964
965 mulq %r15
966 addq %rax,%r12
967 movq %rbx,%rax
968 movq %r11,(%rdi,%rcx,1)
969 movq %rdx,%r13
970 adcq $0,%r13
971
972 mulq %r14
973 addq %rax,%r10
974 movq %rbx,%rax
975 movq 16(%rsi,%rcx,1),%rbx
976 movq %rdx,%r11
977 adcq $0,%r11
978 addq %r12,%r10
979 adcq $0,%r11
980
981 mulq %r15
982 addq %rax,%r13
983 movq %rbx,%rax
984 movq %r10,8(%rdi,%rcx,1)
985 movq %rdx,%r12
986 adcq $0,%r12
987
988 mulq %r14
989 addq %rax,%r11
990 movq %rbx,%rax
991 movq 24(%rsi,%rcx,1),%rbx
992 movq %rdx,%r10
993 adcq $0,%r10
994 addq %r13,%r11
995 adcq $0,%r10
996
997
998 mulq %r15
999 addq %rax,%r12
1000 movq %rbx,%rax
1001 movq %r11,16(%rdi,%rcx,1)
1002 movq %rdx,%r13
1003 adcq $0,%r13
1004 leaq 32(%rcx),%rcx
1005
1006 mulq %r14
1007 addq %rax,%r10
1008 movq %rbx,%rax
1009 movq %rdx,%r11
1010 adcq $0,%r11
1011 addq %r12,%r10
1012 adcq $0,%r11
1013 movq %r10,-8(%rdi,%rcx,1)
1014
1015 cmpq $0,%rcx
1016 jne L$sqr4x_1st
1017
1018 mulq %r15
1019 addq %rax,%r13
1020 leaq 16(%rbp),%rbp
1021 adcq $0,%rdx
1022 addq %r11,%r13
1023 adcq $0,%rdx
1024
1025 movq %r13,(%rdi)
1026 movq %rdx,%r12
1027 movq %rdx,8(%rdi)
1028 jmp L$sqr4x_outer
1029
1030 .p2align 5
/* Subsequent cross-product passes: two multiplier limbs per pass,
   accumulating into the partial results already in the scratch area. */
1031 L$sqr4x_outer:
1032 movq -32(%rsi,%rbp,1),%r14
1033 leaq 48+8(%rsp,%r9,2),%rdi
1034 movq -24(%rsi,%rbp,1),%rax
1035 leaq -32(%rdi,%rbp,1),%rdi
1036 movq -16(%rsi,%rbp,1),%rbx
1037 movq %rax,%r15
1038
1039 mulq %r14
1040 movq -24(%rdi,%rbp,1),%r10
1041 addq %rax,%r10
1042 movq %rbx,%rax
1043 adcq $0,%rdx
1044 movq %r10,-24(%rdi,%rbp,1)
1045 movq %rdx,%r11
1046
1047 mulq %r14
1048 addq %rax,%r11
1049 movq %rbx,%rax
1050 adcq $0,%rdx
1051 addq -16(%rdi,%rbp,1),%r11
1052 movq %rdx,%r10
1053 adcq $0,%r10
1054 movq %r11,-16(%rdi,%rbp,1)
1055
1056 xorq %r12,%r12
1057
1058 movq -8(%rsi,%rbp,1),%rbx
1059 mulq %r15
1060 addq %rax,%r12
1061 movq %rbx,%rax
1062 adcq $0,%rdx
1063 addq -8(%rdi,%rbp,1),%r12
1064 movq %rdx,%r13
1065 adcq $0,%r13
1066
1067 mulq %r14
1068 addq %rax,%r10
1069 movq %rbx,%rax
1070 adcq $0,%rdx
1071 addq %r12,%r10
1072 movq %rdx,%r11
1073 adcq $0,%r11
1074 movq %r10,-8(%rdi,%rbp,1)
1075
1076 leaq (%rbp),%rcx
1077 jmp L$sqr4x_inner
1078
1079 .p2align 5
1080 L$sqr4x_inner:
1081 movq (%rsi,%rcx,1),%rbx
1082 mulq %r15
1083 addq %rax,%r13
1084 movq %rbx,%rax
1085 movq %rdx,%r12
1086 adcq $0,%r12
1087 addq (%rdi,%rcx,1),%r13
1088 adcq $0,%r12
1089
1090 .byte 0x67
1091 mulq %r14
1092 addq %rax,%r11
1093 movq %rbx,%rax
1094 movq 8(%rsi,%rcx,1),%rbx
1095 movq %rdx,%r10
1096 adcq $0,%r10
1097 addq %r13,%r11
1098 adcq $0,%r10
1099
1100 mulq %r15
1101 addq %rax,%r12
1102 movq %r11,(%rdi,%rcx,1)
1103 movq %rbx,%rax
1104 movq %rdx,%r13
1105 adcq $0,%r13
1106 addq 8(%rdi,%rcx,1),%r12
1107 leaq 16(%rcx),%rcx
1108 adcq $0,%r13
1109
1110 mulq %r14
1111 addq %rax,%r10
1112 movq %rbx,%rax
1113 adcq $0,%rdx
1114 addq %r12,%r10
1115 movq %rdx,%r11
1116 adcq $0,%r11
1117 movq %r10,-8(%rdi,%rcx,1)
1118
1119 cmpq $0,%rcx
1120 jne L$sqr4x_inner
1121
1122 .byte 0x67
1123 mulq %r15
1124 addq %rax,%r13
1125 adcq $0,%rdx
1126 addq %r11,%r13
1127 adcq $0,%rdx
1128
1129 movq %r13,(%rdi)
1130 movq %rdx,%r12
1131 movq %rdx,8(%rdi)
1132
1133 addq $16,%rbp
1134 jnz L$sqr4x_outer
1135
1136
/* Peeled final cross products of the top four limbs. */
1137 movq -32(%rsi),%r14
1138 leaq 48+8(%rsp,%r9,2),%rdi
1139 movq -24(%rsi),%rax
1140 leaq -32(%rdi,%rbp,1),%rdi
1141 movq -16(%rsi),%rbx
1142 movq %rax,%r15
1143
1144 mulq %r14
1145 addq %rax,%r10
1146 movq %rbx,%rax
1147 movq %rdx,%r11
1148 adcq $0,%r11
1149
1150 mulq %r14
1151 addq %rax,%r11
1152 movq %rbx,%rax
1153 movq %r10,-24(%rdi)
1154 movq %rdx,%r10
1155 adcq $0,%r10
1156 addq %r13,%r11
1157 movq -8(%rsi),%rbx
1158 adcq $0,%r10
1159
1160 mulq %r15
1161 addq %rax,%r12
1162 movq %rbx,%rax
1163 movq %r11,-16(%rdi)
1164 movq %rdx,%r13
1165 adcq $0,%r13
1166
1167 mulq %r14
1168 addq %rax,%r10
1169 movq %rbx,%rax
1170 movq %rdx,%r11
1171 adcq $0,%r11
1172 addq %r12,%r10
1173 adcq $0,%r11
1174 movq %r10,-8(%rdi)
1175
1176 mulq %r15
1177 addq %rax,%r13
1178 movq -16(%rsi),%rax
1179 adcq $0,%rdx
1180 addq %r11,%r13
1181 adcq $0,%rdx
1182
1183 movq %r13,(%rdi)
1184 movq %rdx,%r12
1185 movq %rdx,8(%rdi)
1186
1187 mulq %rbx
1188 addq $16,%rbp
1189 xorq %r14,%r14
1190 subq %r9,%rbp
1191 xorq %r15,%r15
1192
1193 addq %r12,%rax
1194 adcq $0,%rdx
1195 movq %rax,8(%rdi)
1196 movq %rdx,16(%rdi)
1197 movq %r15,24(%rdi)
1198
/* Doubling phase: shift the cross-product area left by one bit and add
   the diagonal squares a[i]^2 (two limbs per mulq %rax). */
1199 movq -16(%rsi,%rbp,1),%rax
1200 leaq 48+8(%rsp),%rdi
1201 xorq %r10,%r10
1202 movq 8(%rdi),%r11
1203
1204 leaq (%r14,%r10,2),%r12
1205 shrq $63,%r10
1206 leaq (%rcx,%r11,2),%r13
1207 shrq $63,%r11
1208 orq %r10,%r13
1209 movq 16(%rdi),%r10
1210 movq %r11,%r14
1211 mulq %rax
1212 negq %r15                      /* re-materialize saved carry into CF */
1213 movq 24(%rdi),%r11
1214 adcq %rax,%r12
1215 movq -8(%rsi,%rbp,1),%rax
1216 movq %r12,(%rdi)
1217 adcq %rdx,%r13
1218
1219 leaq (%r14,%r10,2),%rbx
1220 movq %r13,8(%rdi)
1221 sbbq %r15,%r15                 /* preserve carry for the next group */
1222 shrq $63,%r10
1223 leaq (%rcx,%r11,2),%r8
1224 shrq $63,%r11
1225 orq %r10,%r8
1226 movq 32(%rdi),%r10
1227 movq %r11,%r14
1228 mulq %rax
1229 negq %r15
1230 movq 40(%rdi),%r11
1231 adcq %rax,%rbx
1232 movq 0(%rsi,%rbp,1),%rax
1233 movq %rbx,16(%rdi)
1234 adcq %rdx,%r8
1235 leaq 16(%rbp),%rbp
1236 movq %r8,24(%rdi)
1237 sbbq %r15,%r15
1238 leaq 64(%rdi),%rdi
1239 jmp L$sqr4x_shift_n_add
1240
1241 .p2align 5
1242 L$sqr4x_shift_n_add:
1243 leaq (%r14,%r10,2),%r12
1244 shrq $63,%r10
1245 leaq (%rcx,%r11,2),%r13
1246 shrq $63,%r11
1247 orq %r10,%r13
1248 movq -16(%rdi),%r10
1249 movq %r11,%r14
1250 mulq %rax
1251 negq %r15
1252 movq -8(%rdi),%r11
1253 adcq %rax,%r12
1254 movq -8(%rsi,%rbp,1),%rax
1255 movq %r12,-32(%rdi)
1256 adcq %rdx,%r13
1257
1258 leaq (%r14,%r10,2),%rbx
1259 movq %r13,-24(%rdi)
1260 sbbq %r15,%r15
1261 shrq $63,%r10
1262 leaq (%rcx,%r11,2),%r8
1263 shrq $63,%r11
1264 orq %r10,%r8
1265 movq 0(%rdi),%r10
1266 movq %r11,%r14
1267 mulq %rax
1268 negq %r15
1269 movq 8(%rdi),%r11
1270 adcq %rax,%rbx
1271 movq 0(%rsi,%rbp,1),%rax
1272 movq %rbx,-16(%rdi)
1273 adcq %rdx,%r8
1274
1275 leaq (%r14,%r10,2),%r12
1276 movq %r8,-8(%rdi)
1277 sbbq %r15,%r15
1278 shrq $63,%r10
1279 leaq (%rcx,%r11,2),%r13
1280 shrq $63,%r11
1281 orq %r10,%r13
1282 movq 16(%rdi),%r10
1283 movq %r11,%r14
1284 mulq %rax
1285 negq %r15
1286 movq 24(%rdi),%r11
1287 adcq %rax,%r12
1288 movq 8(%rsi,%rbp,1),%rax
1289 movq %r12,0(%rdi)
1290 adcq %rdx,%r13
1291
1292 leaq (%r14,%r10,2),%rbx
1293 movq %r13,8(%rdi)
1294 sbbq %r15,%r15
1295 shrq $63,%r10
1296 leaq (%rcx,%r11,2),%r8
1297 shrq $63,%r11
1298 orq %r10,%r8
1299 movq 32(%rdi),%r10
1300 movq %r11,%r14
1301 mulq %rax
1302 negq %r15
1303 movq 40(%rdi),%r11
1304 adcq %rax,%rbx
1305 movq 16(%rsi,%rbp,1),%rax
1306 movq %rbx,16(%rdi)
1307 adcq %rdx,%r8
1308 movq %r8,24(%rdi)
1309 sbbq %r15,%r15
1310 leaq 64(%rdi),%rdi
1311 addq $32,%rbp
1312 jnz L$sqr4x_shift_n_add
1313
/* Final (peeled) shift-and-add group for the top limbs, then fall through
   into sqr8x_reduction below. */
1314 leaq (%r14,%r10,2),%r12
1315 .byte 0x67
1316 shrq $63,%r10
1317 leaq (%rcx,%r11,2),%r13
1318 shrq $63,%r11
1319 orq %r10,%r13
1320 movq -16(%rdi),%r10
1321 movq %r11,%r14
1322 mulq %rax
1323 negq %r15
1324 movq -8(%rdi),%r11
1325 adcq %rax,%r12
1326 movq -8(%rsi),%rax
1327 movq %r12,-32(%rdi)
1328 adcq %rdx,%r13
1329
1330 leaq (%r14,%r10,2),%rbx
1331 movq %r13,-24(%rdi)
1332 sbbq %r15,%r15
1333 shrq $63,%r10
1334 leaq (%rcx,%r11,2),%r8
1335 shrq $63,%r11
1336 orq %r10,%r8
1337 mulq %rax
1338 negq %r15
1339 adcq %rax,%rbx
1340 adcq %rdx,%r8
1341 movq %rbx,-16(%rdi)
1342 movq %r8,-8(%rdi)
1343 .byte 102,72,15,126,213        /* movq %xmm2,%rbp — np pointer parked by the caller */
/*
 * sqr8x_reduction — Montgomery reduction of the 2*num-limb square sitting
 * in the stack scratch area, 8 limbs per block. Entered by fall-through
 * from __bn_sqr8x_internal and by `call sqr8x_reduction` from
 * bn_from_mont8x. Expects: %rbp = np, %r9 = num in bytes, n0 at 32+8(%rsp).
 * Ends with the conditional subtraction at L$sqr4x_sub (shared with
 * mul4x_internal) and a ret.
 */
1344 sqr8x_reduction:
1345 xorq %rax,%rax                 /* %rax accumulates the top carry */
1346 leaq (%rbp,%r9,2),%rcx
1347 leaq 48+8(%rsp,%r9,2),%rdx
1348 movq %rcx,0+8(%rsp)            /* end of np span */
1349 leaq 48+8(%rsp,%r9,1),%rdi
1350 movq %rdx,8+8(%rsp)            /* end of scratch span */
1351 negq %r9
1352 jmp L$8x_reduction_loop
1353
1354 .p2align 5
1355 L$8x_reduction_loop:
1356 leaq (%rdi,%r9,1),%rdi
1357 .byte 0x66
1358 movq 0(%rdi),%rbx              /* load next 8 limbs to reduce */
1359 movq 8(%rdi),%r9
1360 movq 16(%rdi),%r10
1361 movq 24(%rdi),%r11
1362 movq 32(%rdi),%r12
1363 movq 40(%rdi),%r13
1364 movq 48(%rdi),%r14
1365 movq 56(%rdi),%r15
1366 movq %rax,(%rdx)
1367 leaq 64(%rdi),%rdi
1368
1369 .byte 0x67
1370 movq %rbx,%r8
1371 imulq 32+8(%rsp),%rbx          /* m = limb * n0 mod 2^64 */
1372 movq 0(%rbp),%rax
1373 movl $8,%ecx
1374 jmp L$8x_reduce
1375
1376 .p2align 5
/* Inner reduce: fold 8 m*np products into the 8-limb window; the eight
   m values are saved at 48-8+8(%rsp,...) for the tail pass. */
1377 L$8x_reduce:
1378 mulq %rbx
1379 movq 16(%rbp),%rax
1380 negq %r8
1381 movq %rdx,%r8
1382 adcq $0,%r8
1383
1384 mulq %rbx
1385 addq %rax,%r9
1386 movq 32(%rbp),%rax
1387 adcq $0,%rdx
1388 addq %r9,%r8
1389 movq %rbx,48-8+8(%rsp,%rcx,8)
1390 movq %rdx,%r9
1391 adcq $0,%r9
1392
1393 mulq %rbx
1394 addq %rax,%r10
1395 movq 48(%rbp),%rax
1396 adcq $0,%rdx
1397 addq %r10,%r9
1398 movq 32+8(%rsp),%rsi           /* n0 */
1399 movq %rdx,%r10
1400 adcq $0,%r10
1401
1402 mulq %rbx
1403 addq %rax,%r11
1404 movq 64(%rbp),%rax
1405 adcq $0,%rdx
1406 imulq %r8,%rsi                 /* next m */
1407 addq %r11,%r10
1408 movq %rdx,%r11
1409 adcq $0,%r11
1410
1411 mulq %rbx
1412 addq %rax,%r12
1413 movq 80(%rbp),%rax
1414 adcq $0,%rdx
1415 addq %r12,%r11
1416 movq %rdx,%r12
1417 adcq $0,%r12
1418
1419 mulq %rbx
1420 addq %rax,%r13
1421 movq 96(%rbp),%rax
1422 adcq $0,%rdx
1423 addq %r13,%r12
1424 movq %rdx,%r13
1425 adcq $0,%r13
1426
1427 mulq %rbx
1428 addq %rax,%r14
1429 movq 112(%rbp),%rax
1430 adcq $0,%rdx
1431 addq %r14,%r13
1432 movq %rdx,%r14
1433 adcq $0,%r14
1434
1435 mulq %rbx
1436 movq %rsi,%rbx
1437 addq %rax,%r15
1438 movq 0(%rbp),%rax
1439 adcq $0,%rdx
1440 addq %r15,%r14
1441 movq %rdx,%r15
1442 adcq $0,%r15
1443
1444 decl %ecx
1445 jnz L$8x_reduce
1446
1447 leaq 128(%rbp),%rbp
1448 xorq %rax,%rax
1449 movq 8+8(%rsp),%rdx
1450 cmpq 0+8(%rsp),%rbp            /* ran out of modulus limbs? */
1451 jae L$8x_no_tail
1452
1453 .byte 0x66
1454 addq 0(%rdi),%r8
1455 adcq 8(%rdi),%r9
1456 adcq 16(%rdi),%r10
1457 adcq 24(%rdi),%r11
1458 adcq 32(%rdi),%r12
1459 adcq 40(%rdi),%r13
1460 adcq 48(%rdi),%r14
1461 adcq 56(%rdi),%r15
1462 sbbq %rsi,%rsi                 /* save carry as mask */
1463
1464 movq 48+56+8(%rsp),%rbx        /* first saved m */
1465 movl $8,%ecx
1466 movq 0(%rbp),%rax
1467 jmp L$8x_tail
1468
1469 .p2align 5
/* Tail pass: propagate the saved m values across the remaining np limbs. */
1470 L$8x_tail:
1471 mulq %rbx
1472 addq %rax,%r8
1473 movq 16(%rbp),%rax
1474 movq %r8,(%rdi)
1475 movq %rdx,%r8
1476 adcq $0,%r8
1477
1478 mulq %rbx
1479 addq %rax,%r9
1480 movq 32(%rbp),%rax
1481 adcq $0,%rdx
1482 addq %r9,%r8
1483 leaq 8(%rdi),%rdi
1484 movq %rdx,%r9
1485 adcq $0,%r9
1486
1487 mulq %rbx
1488 addq %rax,%r10
1489 movq 48(%rbp),%rax
1490 adcq $0,%rdx
1491 addq %r10,%r9
1492 movq %rdx,%r10
1493 adcq $0,%r10
1494
1495 mulq %rbx
1496 addq %rax,%r11
1497 movq 64(%rbp),%rax
1498 adcq $0,%rdx
1499 addq %r11,%r10
1500 movq %rdx,%r11
1501 adcq $0,%r11
1502
1503 mulq %rbx
1504 addq %rax,%r12
1505 movq 80(%rbp),%rax
1506 adcq $0,%rdx
1507 addq %r12,%r11
1508 movq %rdx,%r12
1509 adcq $0,%r12
1510
1511 mulq %rbx
1512 addq %rax,%r13
1513 movq 96(%rbp),%rax
1514 adcq $0,%rdx
1515 addq %r13,%r12
1516 movq %rdx,%r13
1517 adcq $0,%r13
1518
1519 mulq %rbx
1520 addq %rax,%r14
1521 movq 112(%rbp),%rax
1522 adcq $0,%rdx
1523 addq %r14,%r13
1524 movq %rdx,%r14
1525 adcq $0,%r14
1526
1527 mulq %rbx
1528 movq 48-16+8(%rsp,%rcx,8),%rbx /* next saved m */
1529 addq %rax,%r15
1530 adcq $0,%rdx
1531 addq %r15,%r14
1532 movq 0(%rbp),%rax
1533 movq %rdx,%r15
1534 adcq $0,%r15
1535
1536 decl %ecx
1537 jnz L$8x_tail
1538
1539 leaq 128(%rbp),%rbp
1540 movq 8+8(%rsp),%rdx
1541 cmpq 0+8(%rsp),%rbp
1542 jae L$8x_tail_done
1543
1544 movq 48+56+8(%rsp),%rbx
1545 negq %rsi                      /* restore saved carry */
1546 movq 0(%rbp),%rax
1547 adcq 0(%rdi),%r8
1548 adcq 8(%rdi),%r9
1549 adcq 16(%rdi),%r10
1550 adcq 24(%rdi),%r11
1551 adcq 32(%rdi),%r12
1552 adcq 40(%rdi),%r13
1553 adcq 48(%rdi),%r14
1554 adcq 56(%rdi),%r15
1555 sbbq %rsi,%rsi
1556
1557 movl $8,%ecx
1558 jmp L$8x_tail
1559
1560 .p2align 5
1561 L$8x_tail_done:
1562 addq (%rdx),%r8                /* fold in the top-most carry word */
1563 xorq %rax,%rax
1564
1565 negq %rsi
1566 L$8x_no_tail:
1567 adcq 0(%rdi),%r8
1568 adcq 8(%rdi),%r9
1569 adcq 16(%rdi),%r10
1570 adcq 24(%rdi),%r11
1571 adcq 32(%rdi),%r12
1572 adcq 40(%rdi),%r13
1573 adcq 48(%rdi),%r14
1574 adcq 56(%rdi),%r15
1575 adcq $0,%rax                   /* final top carry */
1576 movq -16(%rbp),%rcx
1577 xorq %rsi,%rsi
1578
1579 .byte 102,72,15,126,213        /* movq %xmm2,%rbp — reload np */
1580
1581 movq %r8,0(%rdi)               /* store reduced 8-limb block */
1582 movq %r9,8(%rdi)
1583 .byte 102,73,15,126,217        /* movq %xmm3,%r9 — reload num */
1584 movq %r10,16(%rdi)
1585 movq %r11,24(%rdi)
1586 movq %r12,32(%rdi)
1587 movq %r13,40(%rdi)
1588 movq %r14,48(%rdi)
1589 movq %r15,56(%rdi)
1590 leaq 64(%rdi),%rdi
1591
1592 cmpq %rdx,%rdi
1593 jb L$8x_reduction_loop
1594
/* Build the subtract-or-not mask and run the conditional subtraction. */
1595 subq %r15,%rcx
1596 leaq (%rdi,%r9,1),%rbx
1597 adcq %rsi,%rsi
1598 movq %r9,%rcx
1599 orq %rsi,%rax
1600 .byte 102,72,15,126,207        /* movq %xmm1,%rdi — reload rp */
1601 xorq $1,%rax
1602 .byte 102,72,15,126,206        /* movq %xmm1,%rsi */
1603 leaq (%rbp,%rax,8),%rbp
1604 sarq $3+2,%rcx                 /* %rcx = -num/4 */
1605 jmp L$sqr4x_sub
1606
1607 .p2align 5
/* Subtract np (stride-8 layout via the offset trick above) from the result,
   4 limbs per iteration; shared tail for mul4x_internal as well. */
1608 L$sqr4x_sub:
1609 .byte 0x66
1610 movq 0(%rbx),%r12
1611 movq 8(%rbx),%r13
1612 sbbq 0(%rbp),%r12
1613 movq 16(%rbx),%r14
1614 sbbq 16(%rbp),%r13
1615 movq 24(%rbx),%r15
1616 leaq 32(%rbx),%rbx
1617 sbbq 32(%rbp),%r14
1618 movq %r12,0(%rdi)
1619 sbbq 48(%rbp),%r15
1620 leaq 64(%rbp),%rbp
1621 movq %r13,8(%rdi)
1622 movq %r14,16(%rdi)
1623 movq %r15,24(%rdi)
1624 leaq 32(%rdi),%rdi
1625
1626 incq %rcx
1627 jnz L$sqr4x_sub
1628 movq %r9,%r10
1629 negq %r9
1630 .byte 0xf3,0xc3                /* rep ret */
1631
1632 .globl _bn_from_montgomery
1633 .private_extern _bn_from_montgomery
1634
1635 .p2align 5
/*
 * _bn_from_montgomery — convert out of Montgomery domain. Only handles
 * num divisible by 8 (dispatches to bn_from_mont8x); otherwise returns 0
 * so the caller can fall back to a generic path.
 */
1636 _bn_from_montgomery:
1637 testl $7,%r9d
1638 jz bn_from_mont8x
1639 xorl %eax,%eax                 /* return 0: not handled here */
1640 .byte 0xf3,0xc3                /* rep ret */
1641
1642
1643
1644 .p2align 5
/*
 * bn_from_mont8x — out-of-Montgomery conversion for num%8==0: copies the
 * input into a zero-extended 2*num scratch (L$mul_by_1), runs one
 * sqr8x_reduction pass, then wipes the scratch. Frame carving mirrors
 * bn_mul4x_mont_gather5.
 */
1645 bn_from_mont8x:
1646 .byte 0x67
1647 movq %rsp,%rax                 /* original %rsp, saved at 40(%rsp) below */
1648 pushq %rbx
1649 pushq %rbp
1650 pushq %r12
1651 pushq %r13
1652 pushq %r14
1653 pushq %r15
1654 .byte 0x67
1655 movl %r9d,%r10d
1656 shll $3,%r9d                   /* num in bytes */
1657 shll $3+2,%r10d
1658 negq %r9
1659 movq (%r8),%r8                 /* n0 value */
1660
1661
1662
1663
1664
1665
1666
/* Pick a frame position avoiding 4 KiB aliasing with ap (see mul4x). */
1667 leaq -64(%rsp,%r9,2),%r11
1668 subq %rsi,%r11
1669 andq $4095,%r11
1670 cmpq %r11,%r10
1671 jb L$from_sp_alt
1672 subq %r11,%rsp
1673 leaq -64(%rsp,%r9,2),%rsp
1674 jmp L$from_sp_done
1675
1676 .p2align 5
1677 L$from_sp_alt:
1678 leaq 4096-64(,%r9,2),%r10
1679 leaq -64(%rsp,%r9,2),%rsp
1680 subq %r10,%r11
1681 movq $0,%r10
1682 cmovcq %r10,%r11
1683 subq %r11,%rsp
1684 L$from_sp_done:
1685 andq $-64,%rsp
1686 movq %r9,%r10
1687 negq %r9
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698 movq %r8,32(%rsp)              /* n0 for sqr8x_reduction */
1699 movq %rax,40(%rsp)             /* original %rsp for the epilogue */
1700 L$from_body:
1701 movq %r9,%r11
1702 leaq 48(%rsp),%rax
1703 pxor %xmm0,%xmm0
1704 jmp L$mul_by_1
1705
1706 .p2align 5
/* Copy ap into the low half of scratch while zeroing the high half. */
1707 L$mul_by_1:
1708 movdqu (%rsi),%xmm1
1709 movdqu 16(%rsi),%xmm2
1710 movdqu 32(%rsi),%xmm3
1711 movdqa %xmm0,(%rax,%r9,1)
1712 movdqu 48(%rsi),%xmm4
1713 movdqa %xmm0,16(%rax,%r9,1)
1714 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00   /* leaq 0x40(%rsi),%rsi */
1715 movdqa %xmm1,(%rax)
1716 movdqa %xmm0,32(%rax,%r9,1)
1717 movdqa %xmm2,16(%rax)
1718 movdqa %xmm0,48(%rax,%r9,1)
1719 movdqa %xmm3,32(%rax)
1720 movdqa %xmm4,48(%rax)
1721 leaq 64(%rax),%rax
1722 subq $64,%r11
1723 jnz L$mul_by_1
1724
/* Park rp/np/num in xmm registers, as sqr8x_reduction expects. */
1725 .byte 102,72,15,110,207        /* movq %rdi,%xmm1 — rp */
1726 .byte 102,72,15,110,209        /* movq %rcx,%xmm2 — np */
1727 .byte 0x67
1728 movq %rcx,%rbp
1729 .byte 102,73,15,110,218        /* movq %r10,%xmm3 — num */
1730 call sqr8x_reduction
1731
1732 pxor %xmm0,%xmm0
1733 leaq 48(%rsp),%rax
1734 movq 40(%rsp),%rsi
1735 jmp L$from_mont_zero
1736
1737 .p2align 5
/* Scrub the scratch area before returning. */
1738 L$from_mont_zero:
1739 movdqa %xmm0,0(%rax)
1740 movdqa %xmm0,16(%rax)
1741 movdqa %xmm0,32(%rax)
1742 movdqa %xmm0,48(%rax)
1743 leaq 64(%rax),%rax
1744 subq $32,%r9
1745 jnz L$from_mont_zero
1746
1747 movq $1,%rax
1748 movq -48(%rsi),%r15
1749 movq -40(%rsi),%r14
1750 movq -32(%rsi),%r13
1751 movq -24(%rsi),%r12
1752 movq -16(%rsi),%rbp
1753 movq -8(%rsi),%rbx
1754 leaq (%rsi),%rsp
1755 L$from_epilogue:
1756 .byte 0xf3,0xc3                /* rep ret */
1757
1758 .globl _bn_scatter5
1759 .private_extern _bn_scatter5
1760
1761 .p2align 4
/*
 * _bn_scatter5(inp=%rdi, num=%esi, table=%rdx, power=%rcx)
 * Scatter num words of inp into column `power` of the table, one word every
 * 256 bytes — the layout _bn_gather5 and the gather code above read back.
 */
1762 _bn_scatter5:
1763 cmpl $0,%esi
1764 jz L$scatter_epilogue          /* num == 0: nothing to do */
1765 leaq (%rdx,%rcx,8),%rdx
1766 L$scatter:
1767 movq (%rdi),%rax
1768 leaq 8(%rdi),%rdi
1769 movq %rax,(%rdx)
1770 leaq 256(%rdx),%rdx            /* 256-byte column stride */
1771 subl $1,%esi
1772 jnz L$scatter
1773 L$scatter_epilogue:
1774 .byte 0xf3,0xc3                /* rep ret */
1775
1776
1777 .globl _bn_gather5
1778 .private_extern _bn_gather5
1779
1780 .p2align 4
/*
 * _bn_gather5(out=%rdi, num=%esi, table=%rdx, power=%ecx)
 * Inverse of _bn_scatter5: read column `power` back out of the 256-byte
 * strided table using the L$magic_masks mask-and-or selection, num words.
 */
1781 _bn_gather5:
1782 movl %ecx,%r11d
1783 shrl $3,%ecx
1784 andq $7,%r11
1785 notl %ecx
1786 leaq L$magic_masks(%rip),%rax
1787 andl $3,%ecx
1788 leaq 128(%rdx,%r11,8),%rdx
1789 movq 0(%rax,%rcx,8),%xmm4      /* four 64-bit select masks */
1790 movq 8(%rax,%rcx,8),%xmm5
1791 movq 16(%rax,%rcx,8),%xmm6
1792 movq 24(%rax,%rcx,8),%xmm7
1793 jmp L$gather
1794 .p2align 4
1795 L$gather:
1796 movq -128(%rdx),%xmm0
1797 movq -64(%rdx),%xmm1
1798 pand %xmm4,%xmm0
1799 movq 0(%rdx),%xmm2
1800 pand %xmm5,%xmm1
1801 movq 64(%rdx),%xmm3
1802 pand %xmm6,%xmm2
1803 por %xmm1,%xmm0
1804 pand %xmm7,%xmm3
1805 .byte 0x67,0x67
1806 por %xmm2,%xmm0
1807 leaq 256(%rdx),%rdx            /* next word of the selected column */
1808 por %xmm3,%xmm0
1809
1810 movq %xmm0,(%rdi)
1811 leaq 8(%rdi),%rdi
1812 subl $1,%esi
1813 jnz L$gather
1814 .byte 0xf3,0xc3                /* rep ret */
1815 L$SEH_end_bn_gather5:
1816
1817 .p2align 6
/* Select masks used by the gather code: indexed by bits of the requested
   power, exactly one of the four loaded qword masks is all-ones. */
1818 L$magic_masks:
1819 .long 0,0, 0,0, 0,0, -1,-1
1820 .long 0,0, 0,0, 0,0, 0,0
/* ASCII: "Montgomery Multiplication with scatter/gather for x86_64,
   CRYPTOGAMS by <appro@openssl.org>\0". The original line had spurious
   spaces word-wrapped into the numeric operands (e.g. "1 11" for 111),
   which does not assemble; restored as a contiguous operand list. */
1821 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1822 #endif
OLDNEW
« no previous file with comments | « third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S ('k') | third_party/boringssl/mac-x86_64/crypto/cpu-x86_64-asm.S » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698