Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(145)

Side by Side Diff: third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S

Issue 377783004: Add BoringSSL GYP files. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Final Python fix. Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 #if defined(__x86_64__)
2 .text
3
4
5
/*
 * bn_mul_mont_gather5(rp, ap, bp_table, np, n0, num, idx)
 * SysV AMD64: rdi=rp, rsi=ap, rdx=interleaved bp table, rcx=np, r8=&n0,
 * r9d=num (word count); 7th argument (the gather index) is read from the
 * stack at 8(%rsp) before any push.  Montgomery multiplication where each
 * multiplier word is fetched from a power table with SSE mask/OR selection
 * (constant-time gather) rather than a secret-dependent load address.
 * Dispatch: if num is a multiple of 8 take the unrolled 4x path.
 */
6 .globl _bn_mul_mont_gather5

8 .p2align 6
9 _bn_mul_mont_gather5:
10 testl $7,%r9d
11 jnz L$mul_enter
12 jmp L$mul4x_enter

14 .p2align 4
/* Scalar (1x) path: save callee-saved regs, carve num+2 qwords of
 * 1024-byte-aligned scratch on the stack, stash caller's rsp above it. */
15 L$mul_enter:
16 movl %r9d,%r9d
17 movq %rsp,%rax
18 movl 8(%rsp),%r10d
19 pushq %rbx
20 pushq %rbp
21 pushq %r12
22 pushq %r13
23 pushq %r14
24 pushq %r15
25 leaq 2(%r9),%r11
26 negq %r11
27 leaq (%rsp,%r11,8),%rsp
28 andq $-1024,%rsp
29
30 movq %rax,8(%rsp,%r9,8)
31 L$mul_body:
/* Derive the four L$magic_masks selectors from the gather index (r10d)
 * and point r12 into the interleaved bp table. */
32 movq %rdx,%r12
33 movq %r10,%r11
34 shrq $3,%r10
35 andq $7,%r11
36 notq %r10
37 leaq L$magic_masks(%rip),%rax
38 andq $3,%r10
39 leaq 96(%r12,%r11,8),%r12
40 movq 0(%rax,%r10,8),%xmm4
41 movq 8(%rax,%r10,8),%xmm5
42 movq 16(%rax,%r10,8),%xmm6
43 movq 24(%rax,%r10,8),%xmm7
44
/* Constant-time gather of the first multiplier word: mask the four
 * candidate table qwords with xmm4-7 and OR them together into xmm0. */
45 movq -96(%r12),%xmm0
46 movq -32(%r12),%xmm1
47 pand %xmm4,%xmm0
48 movq 32(%r12),%xmm2
49 pand %xmm5,%xmm1
50 movq 96(%r12),%xmm3
51 pand %xmm6,%xmm2
52 por %xmm1,%xmm0
53 pand %xmm7,%xmm3
54 por %xmm2,%xmm0
55 leaq 256(%r12),%r12
56 por %xmm3,%xmm0
57
/* 66 48 0F 7E C3 = movq %xmm0,%rbx (gathered word -> rbx) */
58 .byte 102,72,15,126,195

60 movq (%r8),%r8
61 movq (%rsi),%rax
62
63 xorq %r14,%r14
64 xorq %r15,%r15
65
/* Start gathering the next bp word in parallel with the arithmetic. */
66 movq -96(%r12),%xmm0
67 movq -32(%r12),%xmm1
68 pand %xmm4,%xmm0
69 movq 32(%r12),%xmm2
70 pand %xmm5,%xmm1
71
/* m = (tp[0]) * n0 mod 2^64 (Montgomery reduction multiplier in rbp) */
72 movq %r8,%rbp
73 mulq %rbx
74 movq %rax,%r10
75 movq (%rcx),%rax
76
77 movq 96(%r12),%xmm3
78 pand %xmm6,%xmm2
79 por %xmm1,%xmm0
80 pand %xmm7,%xmm3
81
82 imulq %r10,%rbp
83 movq %rdx,%r11
84
85 por %xmm2,%xmm0
86 leaq 256(%r12),%r12
87 por %xmm3,%xmm0
88
89 mulq %rbp
90 addq %rax,%r10
91 movq 8(%rsi),%rax
92 adcq $0,%rdx
93 movq %rdx,%r13
94
95 leaq 1(%r15),%r15
96 jmp L$1st_enter

98 .p2align 4
/* First-iteration loop: tp[j] = ap[j]*bp[0] + np[j]*m, carries in r11/r13. */
99 L$1st:
100 addq %rax,%r13
101 movq (%rsi,%r15,8),%rax
102 adcq $0,%rdx
103 addq %r11,%r13
104 movq %r10,%r11
105 adcq $0,%rdx
106 movq %r13,-16(%rsp,%r15,8)
107 movq %rdx,%r13

109 L$1st_enter:
110 mulq %rbx
111 addq %rax,%r11
112 movq (%rcx,%r15,8),%rax
113 adcq $0,%rdx
114 leaq 1(%r15),%r15
115 movq %rdx,%r10
116
117 mulq %rbp
118 cmpq %r9,%r15
119 jne L$1st
120
/* 66 48 0F 7E C3 = movq %xmm0,%rbx (next gathered bp word) */
121 .byte 102,72,15,126,195

123 addq %rax,%r13
124 movq (%rsi),%rax
125 adcq $0,%rdx
126 addq %r11,%r13
127 adcq $0,%rdx
128 movq %r13,-16(%rsp,%r15,8)
129 movq %rdx,%r13
130 movq %r10,%r11
131
132 xorq %rdx,%rdx
133 addq %r11,%r13
134 adcq $0,%rdx
135 movq %r13,-8(%rsp,%r9,8)
136 movq %rdx,(%rsp,%r9,8)
137
138 leaq 1(%r14),%r14
139 jmp L$outer
140 .p2align 4
/* Outer loop over remaining bp words (r14 = i), accumulating into tp. */
141 L$outer:
142 xorq %r15,%r15
143 movq %r8,%rbp
144 movq (%rsp),%r10
145
146 movq -96(%r12),%xmm0
147 movq -32(%r12),%xmm1
148 pand %xmm4,%xmm0
149 movq 32(%r12),%xmm2
150 pand %xmm5,%xmm1
151
152 mulq %rbx
153 addq %rax,%r10
154 movq (%rcx),%rax
155 adcq $0,%rdx
156
157 movq 96(%r12),%xmm3
158 pand %xmm6,%xmm2
159 por %xmm1,%xmm0
160 pand %xmm7,%xmm3
161
162 imulq %r10,%rbp
163 movq %rdx,%r11
164
165 por %xmm2,%xmm0
166 leaq 256(%r12),%r12
167 por %xmm3,%xmm0
168
169 mulq %rbp
170 addq %rax,%r10
171 movq 8(%rsi),%rax
172 adcq $0,%rdx
173 movq 8(%rsp),%r10
174 movq %rdx,%r13
175
176 leaq 1(%r15),%r15
177 jmp L$inner_enter

179 .p2align 4
/* Inner loop: tp[j] = tp[j] + ap[j]*bp[i] + np[j]*m, two carry chains. */
180 L$inner:
181 addq %rax,%r13
182 movq (%rsi,%r15,8),%rax
183 adcq $0,%rdx
184 addq %r10,%r13
185 movq (%rsp,%r15,8),%r10
186 adcq $0,%rdx
187 movq %r13,-16(%rsp,%r15,8)
188 movq %rdx,%r13

190 L$inner_enter:
191 mulq %rbx
192 addq %rax,%r11
193 movq (%rcx,%r15,8),%rax
194 adcq $0,%rdx
195 addq %r11,%r10
196 movq %rdx,%r11
197 adcq $0,%r11
198 leaq 1(%r15),%r15
199
200 mulq %rbp
201 cmpq %r9,%r15
202 jne L$inner
203
/* 66 48 0F 7E C3 = movq %xmm0,%rbx (next gathered bp word) */
204 .byte 102,72,15,126,195

206 addq %rax,%r13
207 movq (%rsi),%rax
208 adcq $0,%rdx
209 addq %r10,%r13
210 movq (%rsp,%r15,8),%r10
211 adcq $0,%rdx
212 movq %r13,-16(%rsp,%r15,8)
213 movq %rdx,%r13
214
215 xorq %rdx,%rdx
216 addq %r11,%r13
217 adcq $0,%rdx
218 addq %r10,%r13
219 adcq $0,%rdx
220 movq %r13,-8(%rsp,%r9,8)
221 movq %rdx,(%rsp,%r9,8)
222
223 leaq 1(%r14),%r14
224 cmpq %r9,%r14
225 jb L$outer
226
/* Final conditional subtraction: rax accumulates borrow from tp - np. */
227 xorq %r14,%r14
228 movq (%rsp),%rax
229 leaq (%rsp),%rsi
230 movq %r9,%r15
231 jmp L$sub
232 .p2align 4
233 L$sub: sbbq (%rcx,%r14,8),%rax
234 movq %rax,(%rdi,%r14,8)
235 movq 8(%rsi,%r14,8),%rax
236 leaq 1(%r14),%r14
237 decq %r15
238 jnz L$sub

240 sbbq $0,%rax
241 xorq %r14,%r14
242 movq %r9,%r15
243 .p2align 4
/* Branch-free select: rax is an all-ones/all-zero mask choosing between
 * rp[] (subtracted) and tp[] via xor/and/xor; tp is wiped as we go. */
244 L$copy:
245 movq (%rsp,%r14,8),%rsi
246 movq (%rdi,%r14,8),%rcx
247 xorq %rcx,%rsi
248 andq %rax,%rsi
249 xorq %rcx,%rsi
250 movq %r14,(%rsp,%r14,8)
251 movq %rsi,(%rdi,%r14,8)
252 leaq 1(%r14),%r14
253 subq $1,%r15
254 jnz L$copy

/* Restore saved registers from above the scratch area; return 1 in rax. */
256 movq 8(%rsp,%r9,8),%rsi
257 movq $1,%rax
258 movq -48(%rsi),%r15
259 movq -40(%rsi),%r14
260 movq -32(%rsi),%r13
261 movq -24(%rsi),%r12
262 movq -16(%rsi),%rbp
263 movq -8(%rsi),%rbx
264 leaq (%rsi),%rsp
265 L$mul_epilogue:
/* F3 C3 = rep ret */
266 .byte 0xf3,0xc3
267
268
/*
 * bn_mul4x_mont_gather5 — 4x-unrolled wrapper (taken when num % 8 == 0).
 * Saves callee-saved regs, allocates a 64-byte-aligned scratch frame sized
 * relative to ap (%rsi) to dodge 4 KiB aliasing, stashes the caller's rsp
 * at 40(%rsp), then delegates the arithmetic to mul4x_internal.
 * 0x67 bytes are address-size prefixes, presumably emitted as padding for
 * instruction alignment — TODO confirm against the perlasm source.
 */
269 .p2align 5
270 bn_mul4x_mont_gather5:
271 L$mul4x_enter:
272 .byte 0x67
273 movq %rsp,%rax
274 pushq %rbx
275 pushq %rbp
276 pushq %r12
277 pushq %r13
278 pushq %r14
279 pushq %r15
280 .byte 0x67
281 movl %r9d,%r10d
282 shll $3,%r9d
283 shll $3+2,%r10d
284 negq %r9
285







/* Choose frame placement: avoid ap and the stack frame sharing the same
 * page-offset (4 KiB alias) to keep cache/TLB behavior predictable. */
293 leaq -64(%rsp,%r9,2),%r11
294 subq %rsi,%r11
295 andq $4095,%r11
296 cmpq %r11,%r10
297 jb L$mul4xsp_alt
298 subq %r11,%rsp
299 leaq -64(%rsp,%r9,2),%rsp
300 jmp L$mul4xsp_done

302 .p2align 5
303 L$mul4xsp_alt:
304 leaq 4096-64(,%r9,2),%r10
305 leaq -64(%rsp,%r9,2),%rsp
306 subq %r10,%r11
307 movq $0,%r10
308 cmovcq %r10,%r11
309 subq %r11,%rsp
310 L$mul4xsp_done:
311 andq $-64,%rsp
312 negq %r9
313
/* Save original rsp where the epilogue (and mul4x_internal) expect it. */
314 movq %rax,40(%rsp)
315 L$mul4x_body:

317 call mul4x_internal

/* Restore callee-saved registers and return 1. */
319 movq 40(%rsp),%rsi
320 movq $1,%rax
321 movq -48(%rsi),%r15
322 movq -40(%rsi),%r14
323 movq -32(%rsi),%r13
324 movq -24(%rsi),%r12
325 movq -16(%rsi),%rbp
326 movq -8(%rsi),%rbx
327 leaq (%rsi),%rsp
328 L$mul4x_epilogue:
/* F3 C3 = rep ret */
329 .byte 0xf3,0xc3
330
331
332
/*
 * mul4x_internal — 4x-unrolled Montgomery multiply body, shared by
 * bn_mul4x_mont_gather5 and bn_power5.  Expects the caller's frame layout:
 * rax points at the saved-rsp slot area (the gather index is read from
 * 8(%rax)), rdx = interleaved bp power table, rcx = np, r8 = &n0, r9 = num
 * scaled by the caller.  Exits by tail-jumping to L$sqr4x_sub for the final
 * conditional subtraction.  Uses the same L$magic_masks constant-time SSE
 * gather as the 1x path.
 */
333 .p2align 5
334 mul4x_internal:
335 shlq $5,%r9
336 movl 8(%rax),%r10d
337 leaq 256(%rdx,%r9,1),%r13
338 shrq $5,%r9
339 movq %r10,%r11
340 shrq $3,%r10
341 andq $7,%r11
342 notq %r10
343 leaq L$magic_masks(%rip),%rax
344 andq $3,%r10
345 leaq 96(%rdx,%r11,8),%r12
346 movq 0(%rax,%r10,8),%xmm4
347 movq 8(%rax,%r10,8),%xmm5
348 addq $7,%r11
349 movq 16(%rax,%r10,8),%xmm6
350 movq 24(%rax,%r10,8),%xmm7
351 andq $7,%r11

/* Gather bp[0] (via r12) and start gathering bp[1] (via r14) in parallel. */
353 movq -96(%r12),%xmm0
354 leaq 256(%r12),%r14
355 movq -32(%r12),%xmm1
356 pand %xmm4,%xmm0
357 movq 32(%r12),%xmm2
358 pand %xmm5,%xmm1
359 movq 96(%r12),%xmm3
360 pand %xmm6,%xmm2
361 .byte 0x67
362 por %xmm1,%xmm0
363 movq -96(%r14),%xmm1
364 .byte 0x67
365 pand %xmm7,%xmm3
366 .byte 0x67
367 por %xmm2,%xmm0
368 movq -32(%r14),%xmm2
369 .byte 0x67
370 pand %xmm4,%xmm1
371 .byte 0x67
372 por %xmm3,%xmm0
373 movq 32(%r14),%xmm3

/* 66 48 0F 7E C3 = movq %xmm0,%rbx (bp[0] -> rbx) */
375 .byte 102,72,15,126,195
376 movq 96(%r14),%xmm0
377 movq %r13,16+8(%rsp)
378 movq %rdi,56+8(%rsp)

380 movq (%r8),%r8
381 movq (%rsi),%rax
382 leaq (%rsi,%r9,1),%rsi
383 negq %r9

/* m = tp[0]*n0; first limb of ap*bp[0] + np*m. */
385 movq %r8,%rbp
386 mulq %rbx
387 movq %rax,%r10
388 movq (%rcx),%rax

390 pand %xmm5,%xmm2
391 pand %xmm6,%xmm3
392 por %xmm2,%xmm1

394 imulq %r10,%rbp







402 leaq 64+8(%rsp,%r11,8),%r14
403 movq %rdx,%r11

405 pand %xmm7,%xmm0
406 por %xmm3,%xmm1
407 leaq 512(%r12),%r12
408 por %xmm1,%xmm0

410 mulq %rbp
411 addq %rax,%r10
412 movq 8(%rsi,%r9,1),%rax
413 adcq $0,%rdx
414 movq %rdx,%rdi

416 mulq %rbx
417 addq %rax,%r11
418 movq 16(%rcx),%rax
419 adcq $0,%rdx
420 movq %rdx,%r10

422 mulq %rbp
423 addq %rax,%rdi
424 movq 16(%rsi,%r9,1),%rax
425 adcq $0,%rdx
426 addq %r11,%rdi
427 leaq 32(%r9),%r15
428 leaq 64(%rcx),%rcx
429 adcq $0,%rdx
430 movq %rdi,(%r14)
431 movq %rdx,%r13
432 jmp L$1st4x

434 .p2align 5
/* First pass, 4 limbs per iteration: tp[j] = ap[j]*bp[0] + np[j]*m. */
435 L$1st4x:
436 mulq %rbx
437 addq %rax,%r10
438 movq -32(%rcx),%rax
439 leaq 32(%r14),%r14
440 adcq $0,%rdx
441 movq %rdx,%r11

443 mulq %rbp
444 addq %rax,%r13
445 movq -8(%rsi,%r15,1),%rax
446 adcq $0,%rdx
447 addq %r10,%r13
448 adcq $0,%rdx
449 movq %r13,-24(%r14)
450 movq %rdx,%rdi

452 mulq %rbx
453 addq %rax,%r11
454 movq -16(%rcx),%rax
455 adcq $0,%rdx
456 movq %rdx,%r10

458 mulq %rbp
459 addq %rax,%rdi
460 movq (%rsi,%r15,1),%rax
461 adcq $0,%rdx
462 addq %r11,%rdi
463 adcq $0,%rdx
464 movq %rdi,-16(%r14)
465 movq %rdx,%r13

467 mulq %rbx
468 addq %rax,%r10
469 movq 0(%rcx),%rax
470 adcq $0,%rdx
471 movq %rdx,%r11

473 mulq %rbp
474 addq %rax,%r13
475 movq 8(%rsi,%r15,1),%rax
476 adcq $0,%rdx
477 addq %r10,%r13
478 adcq $0,%rdx
479 movq %r13,-8(%r14)
480 movq %rdx,%rdi

482 mulq %rbx
483 addq %rax,%r11
484 movq 16(%rcx),%rax
485 adcq $0,%rdx
486 movq %rdx,%r10

488 mulq %rbp
489 addq %rax,%rdi
490 movq 16(%rsi,%r15,1),%rax
491 adcq $0,%rdx
492 addq %r11,%rdi
493 leaq 64(%rcx),%rcx
494 adcq $0,%rdx
495 movq %rdi,(%r14)
496 movq %rdx,%r13

498 addq $32,%r15
499 jnz L$1st4x

/* Tail of the first pass (last 2 limbs) and carry flush. */
501 mulq %rbx
502 addq %rax,%r10
503 movq -32(%rcx),%rax
504 leaq 32(%r14),%r14
505 adcq $0,%rdx
506 movq %rdx,%r11

508 mulq %rbp
509 addq %rax,%r13
510 movq -8(%rsi),%rax
511 adcq $0,%rdx
512 addq %r10,%r13
513 adcq $0,%rdx
514 movq %r13,-24(%r14)
515 movq %rdx,%rdi

517 mulq %rbx
518 addq %rax,%r11
519 movq -16(%rcx),%rax
520 adcq $0,%rdx
521 movq %rdx,%r10

523 mulq %rbp
524 addq %rax,%rdi
525 movq (%rsi,%r9,1),%rax
526 adcq $0,%rdx
527 addq %r11,%rdi
528 adcq $0,%rdx
529 movq %rdi,-16(%r14)
530 movq %rdx,%r13

/* 66 48 0F 7E C3 = movq %xmm0,%rbx (next gathered bp word) */
532 .byte 102,72,15,126,195
533 leaq (%rcx,%r9,2),%rcx

535 xorq %rdi,%rdi
536 addq %r10,%r13
537 adcq $0,%rdi
538 movq %r13,-8(%r14)

540 jmp L$outer4x

542 .p2align 5
/* Outer loop over remaining bp words; inner loop 4 limbs per iteration. */
543 L$outer4x:
544 movq (%r14,%r9,1),%r10
545 movq %r8,%rbp
546 mulq %rbx
547 addq %rax,%r10
548 movq (%rcx),%rax
549 adcq $0,%rdx

551 movq -96(%r12),%xmm0
552 movq -32(%r12),%xmm1
553 pand %xmm4,%xmm0
554 movq 32(%r12),%xmm2
555 pand %xmm5,%xmm1
556 movq 96(%r12),%xmm3

558 imulq %r10,%rbp
559 .byte 0x67
560 movq %rdx,%r11
561 movq %rdi,(%r14)

563 pand %xmm6,%xmm2
564 por %xmm1,%xmm0
565 pand %xmm7,%xmm3
566 por %xmm2,%xmm0
567 leaq (%r14,%r9,1),%r14
568 leaq 256(%r12),%r12
569 por %xmm3,%xmm0

571 mulq %rbp
572 addq %rax,%r10
573 movq 8(%rsi,%r9,1),%rax
574 adcq $0,%rdx
575 movq %rdx,%rdi

577 mulq %rbx
578 addq %rax,%r11
579 movq 16(%rcx),%rax
580 adcq $0,%rdx
581 addq 8(%r14),%r11
582 adcq $0,%rdx
583 movq %rdx,%r10

585 mulq %rbp
586 addq %rax,%rdi
587 movq 16(%rsi,%r9,1),%rax
588 adcq $0,%rdx
589 addq %r11,%rdi
590 leaq 32(%r9),%r15
591 leaq 64(%rcx),%rcx
592 adcq $0,%rdx
593 movq %rdx,%r13
594 jmp L$inner4x

596 .p2align 5
597 L$inner4x:
598 mulq %rbx
599 addq %rax,%r10
600 movq -32(%rcx),%rax
601 adcq $0,%rdx
602 addq 16(%r14),%r10
603 leaq 32(%r14),%r14
604 adcq $0,%rdx
605 movq %rdx,%r11

607 mulq %rbp
608 addq %rax,%r13
609 movq -8(%rsi,%r15,1),%rax
610 adcq $0,%rdx
611 addq %r10,%r13
612 adcq $0,%rdx
613 movq %rdi,-32(%r14)
614 movq %rdx,%rdi

616 mulq %rbx
617 addq %rax,%r11
618 movq -16(%rcx),%rax
619 adcq $0,%rdx
620 addq -8(%r14),%r11
621 adcq $0,%rdx
622 movq %rdx,%r10

624 mulq %rbp
625 addq %rax,%rdi
626 movq (%rsi,%r15,1),%rax
627 adcq $0,%rdx
628 addq %r11,%rdi
629 adcq $0,%rdx
630 movq %r13,-24(%r14)
631 movq %rdx,%r13

633 mulq %rbx
634 addq %rax,%r10
635 movq 0(%rcx),%rax
636 adcq $0,%rdx
637 addq (%r14),%r10
638 adcq $0,%rdx
639 movq %rdx,%r11

641 mulq %rbp
642 addq %rax,%r13
643 movq 8(%rsi,%r15,1),%rax
644 adcq $0,%rdx
645 addq %r10,%r13
646 adcq $0,%rdx
647 movq %rdi,-16(%r14)
648 movq %rdx,%rdi

650 mulq %rbx
651 addq %rax,%r11
652 movq 16(%rcx),%rax
653 adcq $0,%rdx
654 addq 8(%r14),%r11
655 adcq $0,%rdx
656 movq %rdx,%r10

658 mulq %rbp
659 addq %rax,%rdi
660 movq 16(%rsi,%r15,1),%rax
661 adcq $0,%rdx
662 addq %r11,%rdi
663 leaq 64(%rcx),%rcx
664 adcq $0,%rdx
665 movq %r13,-8(%r14)
666 movq %rdx,%r13

668 addq $32,%r15
669 jnz L$inner4x

/* Outer-loop tail: last 2 limbs, then fold carries into tp[num]. */
671 mulq %rbx
672 addq %rax,%r10
673 movq -32(%rcx),%rax
674 adcq $0,%rdx
675 addq 16(%r14),%r10
676 leaq 32(%r14),%r14
677 adcq $0,%rdx
678 movq %rdx,%r11

680 mulq %rbp
681 addq %rax,%r13
682 movq -8(%rsi),%rax
683 adcq $0,%rdx
684 addq %r10,%r13
685 adcq $0,%rdx
686 movq %rdi,-32(%r14)
687 movq %rdx,%rdi

689 mulq %rbx
690 addq %rax,%r11
691 movq %rbp,%rax
692 movq -16(%rcx),%rbp
693 adcq $0,%rdx
694 addq -8(%r14),%r11
695 adcq $0,%rdx
696 movq %rdx,%r10

698 mulq %rbp
699 addq %rax,%rdi
700 movq (%rsi,%r9,1),%rax
701 adcq $0,%rdx
702 addq %r11,%rdi
703 adcq $0,%rdx
704 movq %r13,-24(%r14)
705 movq %rdx,%r13

/* 66 48 0F 7E C3 = movq %xmm0,%rbx (next gathered bp word) */
707 .byte 102,72,15,126,195
708 movq %rdi,-16(%r14)
709 leaq (%rcx,%r9,2),%rcx

711 xorq %rdi,%rdi
712 addq %r10,%r13
713 adcq $0,%rdi
714 addq (%r14),%r13
715 adcq $0,%rdi
716 movq %r13,-8(%r14)

/* Loop until r12 passes the end of the bp table saved at 16+8(%rsp);
 * then set up rbx/rbp/rcx/rdi for the shared L$sqr4x_sub subtraction. */
718 cmpq 16+8(%rsp),%r12
719 jb L$outer4x
720 subq %r13,%rbp
721 adcq %r15,%r15
722 orq %r15,%rdi
723 xorq $1,%rdi
724 leaq (%r14,%r9,1),%rbx
725 leaq (%rcx,%rdi,8),%rbp
726 movq %r9,%rcx
727 sarq $3+2,%rcx
728 movq 56+8(%rsp),%rdi
729 jmp L$sqr4x_sub
730
/*
 * bn_power5 — computes a fifth power in Montgomery form: five chained
 * squarings (__bn_sqr8x_internal) followed by one gathered multiply
 * (mul4x_internal).  Same frame-placement scheme as bn_mul4x_mont_gather5:
 * 64-byte-aligned scratch sized to avoid 4 KiB aliasing with ap; n0 is
 * saved at 32(%rsp) and the caller's rsp at 40(%rsp).
 */
731 .globl _bn_power5

733 .p2align 5
734 _bn_power5:
735 movq %rsp,%rax
736 pushq %rbx
737 pushq %rbp
738 pushq %r12
739 pushq %r13
740 pushq %r14
741 pushq %r15
742 movl %r9d,%r10d
743 shll $3,%r9d
744 shll $3+2,%r10d
745 negq %r9
746 movq (%r8),%r8







/* Frame placement, avoiding a 4 KiB page-offset alias with ap. */
754 leaq -64(%rsp,%r9,2),%r11
755 subq %rsi,%r11
756 andq $4095,%r11
757 cmpq %r11,%r10
758 jb L$pwr_sp_alt
759 subq %r11,%rsp
760 leaq -64(%rsp,%r9,2),%rsp
761 jmp L$pwr_sp_done

763 .p2align 5
764 L$pwr_sp_alt:
765 leaq 4096-64(,%r9,2),%r10
766 leaq -64(%rsp,%r9,2),%rsp
767 subq %r10,%r11
768 movq $0,%r10
769 cmovcq %r10,%r11
770 subq %r11,%rsp
771 L$pwr_sp_done:
772 andq $-64,%rsp
773 movq %r9,%r10
774 negq %r9










785 movq %r8,32(%rsp)
786 movq %rax,40(%rsp)
787 L$power5_body:
/* 66 48/49 0F 6E = movq gpr->xmm: stash rdi,rcx,r10,rdx in xmm1-4 so they
 * survive across the internal calls (which clobber the gp registers). */
788 .byte 102,72,15,110,207
789 .byte 102,72,15,110,209
790 .byte 102,73,15,110,218
791 .byte 102,72,15,110,226

/* Five Montgomery squarings in place. */
793 call __bn_sqr8x_internal
794 call __bn_sqr8x_internal
795 call __bn_sqr8x_internal
796 call __bn_sqr8x_internal
797 call __bn_sqr8x_internal

/* 66 48 0F 7E = movq xmm->gpr: recover rcx, rdx; then one gathered mul. */
799 .byte 102,72,15,126,209
800 .byte 102,72,15,126,226
801 movq %rsi,%rdi
802 movq 40(%rsp),%rax
803 leaq 32(%rsp),%r8

805 call mul4x_internal

/* Restore callee-saved registers and return 1. */
807 movq 40(%rsp),%rsi
808 movq $1,%rax
809 movq -48(%rsi),%r15
810 movq -40(%rsi),%r14
811 movq -32(%rsi),%r13
812 movq -24(%rsi),%r12
813 movq -16(%rsi),%rbp
814 movq -8(%rsi),%rbx
815 leaq (%rsi),%rsp
816 L$power5_epilogue:
/* F3 C3 = rep ret */
817 .byte 0xf3,0xc3
818
819
/*
 * bn_sqr8x_internal — Montgomery squaring core.
 * Phase 1: compute the off-diagonal products a[i]*a[j] (i<j) into the
 *          scratch area (L$sqr4x_1st / L$sqr4x_outer / L$sqr4x_inner).
 * Phase 2: double them and add the diagonal squares a[i]^2
 *          (L$sqr4x_shift_n_add).
 * Phase 3: Montgomery-reduce the double-width result 8 limbs at a time
 *          (sqr8x_reduction / L$8x_reduce / L$8x_tail), then do the final
 *          conditional subtraction (L$sqr4x_sub).
 * Register/stack conventions are those set up by the callers (bn_power5,
 * bn_from_mont8x): n0 at 32+8(%rsp), frame bounds at 0+8/8+8(%rsp).
 */
820 .globl _bn_sqr8x_internal
821 .private_extern _bn_sqr8x_internal

823 .p2align 5
824 _bn_sqr8x_internal:
825 __bn_sqr8x_internal:








































































/* Phase 1 setup: r14/r15/rbx hold the current multiplicand limbs, rdi
 * points into the scratch product area, rbp/rcx are negative indices. */
899 leaq 32(%r10),%rbp
900 leaq (%rsi,%r9,1),%rsi

902 movq %r9,%rcx


905 movq -32(%rsi,%rbp,1),%r14
906 leaq 48+8(%rsp,%r9,2),%rdi
907 movq -24(%rsi,%rbp,1),%rax
908 leaq -32(%rdi,%rbp,1),%rdi
909 movq -16(%rsi,%rbp,1),%rbx
910 movq %rax,%r15

912 mulq %r14
913 movq %rax,%r10
914 movq %rbx,%rax
915 movq %rdx,%r11
916 movq %r10,-24(%rdi,%rbp,1)

918 mulq %r14
919 addq %rax,%r11
920 movq %rbx,%rax
921 adcq $0,%rdx
922 movq %r11,-16(%rdi,%rbp,1)
923 movq %rdx,%r10


925 movq -8(%rsi,%rbp,1),%rbx
926 mulq %r15
927 movq %rax,%r12
928 movq %rbx,%rax
929 movq %rdx,%r13

931 leaq (%rbp),%rcx
932 mulq %r14
933 addq %rax,%r10
934 movq %rbx,%rax
935 movq %rdx,%r11
936 adcq $0,%r11
937 addq %r12,%r10
938 adcq $0,%r11
939 movq %r10,-8(%rdi,%rcx,1)
940 jmp L$sqr4x_1st

942 .p2align 5
/* First off-diagonal row, 4 limbs per iteration, dual carry chains. */
943 L$sqr4x_1st:
944 movq (%rsi,%rcx,1),%rbx
945 mulq %r15
946 addq %rax,%r13
947 movq %rbx,%rax
948 movq %rdx,%r12
949 adcq $0,%r12

951 mulq %r14
952 addq %rax,%r11
953 movq %rbx,%rax
954 movq 8(%rsi,%rcx,1),%rbx
955 movq %rdx,%r10
956 adcq $0,%r10
957 addq %r13,%r11
958 adcq $0,%r10


961 mulq %r15
962 addq %rax,%r12
963 movq %rbx,%rax
964 movq %r11,(%rdi,%rcx,1)
965 movq %rdx,%r13
966 adcq $0,%r13

968 mulq %r14
969 addq %rax,%r10
970 movq %rbx,%rax
971 movq 16(%rsi,%rcx,1),%rbx
972 movq %rdx,%r11
973 adcq $0,%r11
974 addq %r12,%r10
975 adcq $0,%r11

977 mulq %r15
978 addq %rax,%r13
979 movq %rbx,%rax
980 movq %r10,8(%rdi,%rcx,1)
981 movq %rdx,%r12
982 adcq $0,%r12

984 mulq %r14
985 addq %rax,%r11
986 movq %rbx,%rax
987 movq 24(%rsi,%rcx,1),%rbx
988 movq %rdx,%r10
989 adcq $0,%r10
990 addq %r13,%r11
991 adcq $0,%r10


994 mulq %r15
995 addq %rax,%r12
996 movq %rbx,%rax
997 movq %r11,16(%rdi,%rcx,1)
998 movq %rdx,%r13
999 adcq $0,%r13
1000 leaq 32(%rcx),%rcx

1002 mulq %r14
1003 addq %rax,%r10
1004 movq %rbx,%rax
1005 movq %rdx,%r11
1006 adcq $0,%r11
1007 addq %r12,%r10
1008 adcq $0,%r11
1009 movq %r10,-8(%rdi,%rcx,1)

1011 cmpq $0,%rcx
1012 jne L$sqr4x_1st

1014 mulq %r15
1015 addq %rax,%r13
1016 leaq 16(%rbp),%rbp
1017 adcq $0,%rdx
1018 addq %r11,%r13
1019 adcq $0,%rdx

1020 movq %r13,(%rdi)
1022 movq %rdx,%r12
1023 movq %rdx,8(%rdi)
1024 jmp L$sqr4x_outer

1026 .p2align 5
/* Subsequent off-diagonal rows: like the first row but also accumulates
 * into the partial products already in the scratch area. */
1027 L$sqr4x_outer:
1028 movq -32(%rsi,%rbp,1),%r14
1029 leaq 48+8(%rsp,%r9,2),%rdi
1030 movq -24(%rsi,%rbp,1),%rax
1031 leaq -32(%rdi,%rbp,1),%rdi
1032 movq -16(%rsi,%rbp,1),%rbx
1033 movq %rax,%r15

1035 mulq %r14
1036 movq -24(%rdi,%rbp,1),%r10
1037 addq %rax,%r10
1038 movq %rbx,%rax
1039 adcq $0,%rdx
1040 movq %r10,-24(%rdi,%rbp,1)
1041 movq %rdx,%r11

1043 mulq %r14
1044 addq %rax,%r11
1045 movq %rbx,%rax
1046 adcq $0,%rdx
1047 addq -16(%rdi,%rbp,1),%r11
1048 movq %rdx,%r10
1049 adcq $0,%r10
1050 movq %r11,-16(%rdi,%rbp,1)

1052 xorq %r12,%r12

1054 movq -8(%rsi,%rbp,1),%rbx
1055 mulq %r15
1056 addq %rax,%r12
1057 movq %rbx,%rax
1058 adcq $0,%rdx
1059 addq -8(%rdi,%rbp,1),%r12
1060 movq %rdx,%r13
1061 adcq $0,%r13

1063 mulq %r14
1064 addq %rax,%r10
1065 movq %rbx,%rax
1066 adcq $0,%rdx
1067 addq %r12,%r10
1068 movq %rdx,%r11
1069 adcq $0,%r11
1070 movq %r10,-8(%rdi,%rbp,1)

1072 leaq (%rbp),%rcx
1073 jmp L$sqr4x_inner

1075 .p2align 5
1076 L$sqr4x_inner:
1077 movq (%rsi,%rcx,1),%rbx
1078 mulq %r15
1079 addq %rax,%r13
1080 movq %rbx,%rax
1081 movq %rdx,%r12
1082 adcq $0,%r12
1083 addq (%rdi,%rcx,1),%r13
1084 adcq $0,%r12

1086 .byte 0x67
1087 mulq %r14
1088 addq %rax,%r11
1089 movq %rbx,%rax
1090 movq 8(%rsi,%rcx,1),%rbx
1091 movq %rdx,%r10
1092 adcq $0,%r10
1093 addq %r13,%r11
1094 adcq $0,%r10

1096 mulq %r15
1097 addq %rax,%r12
1098 movq %r11,(%rdi,%rcx,1)
1099 movq %rbx,%rax
1100 movq %rdx,%r13
1101 adcq $0,%r13
1102 addq 8(%rdi,%rcx,1),%r12
1103 leaq 16(%rcx),%rcx
1104 adcq $0,%r13

1106 mulq %r14
1107 addq %rax,%r10
1108 movq %rbx,%rax
1109 adcq $0,%rdx
1110 addq %r12,%r10
1111 movq %rdx,%r11
1112 adcq $0,%r11
1113 movq %r10,-8(%rdi,%rcx,1)

1115 cmpq $0,%rcx
1116 jne L$sqr4x_inner

1118 .byte 0x67
1119 mulq %r15
1120 addq %rax,%r13
1121 adcq $0,%rdx
1122 addq %r11,%r13
1123 adcq $0,%rdx

1125 movq %r13,(%rdi)
1126 movq %rdx,%r12
1127 movq %rdx,8(%rdi)

1129 addq $16,%rbp
1130 jnz L$sqr4x_outer

/* Last off-diagonal 3x3 corner, done straight-line. */
1133 movq -32(%rsi),%r14
1134 leaq 48+8(%rsp,%r9,2),%rdi
1135 movq -24(%rsi),%rax
1136 leaq -32(%rdi,%rbp,1),%rdi
1137 movq -16(%rsi),%rbx
1138 movq %rax,%r15

1140 mulq %r14
1141 addq %rax,%r10
1142 movq %rbx,%rax
1143 movq %rdx,%r11
1144 adcq $0,%r11

1146 mulq %r14
1147 addq %rax,%r11
1148 movq %rbx,%rax
1149 movq %r10,-24(%rdi)
1150 movq %rdx,%r10
1151 adcq $0,%r10
1152 addq %r13,%r11
1153 movq -8(%rsi),%rbx
1154 adcq $0,%r10

1156 mulq %r15
1157 addq %rax,%r12
1158 movq %rbx,%rax
1159 movq %r11,-16(%rdi)
1160 movq %rdx,%r13
1161 adcq $0,%r13

1163 mulq %r14
1164 addq %rax,%r10
1165 movq %rbx,%rax
1166 movq %rdx,%r11
1167 adcq $0,%r11
1168 addq %r12,%r10
1169 adcq $0,%r11
1170 movq %r10,-8(%rdi)

1172 mulq %r15
1173 addq %rax,%r13
1174 movq -16(%rsi),%rax
1175 adcq $0,%rdx
1176 addq %r11,%r13
1177 adcq $0,%rdx

1179 movq %r13,(%rdi)
1180 movq %rdx,%r12
1181 movq %rdx,8(%rdi)

1183 mulq %rbx
1184 addq $16,%rbp
1185 xorq %r14,%r14
1186 subq %r9,%rbp
1187 xorq %r15,%r15

1189 addq %r12,%rax
1190 adcq $0,%rdx
1191 movq %rax,8(%rdi)
1192 movq %rdx,16(%rdi)
1193 movq %r15,24(%rdi)

/* Phase 2: double the off-diagonal sum (shift left by 1 across limbs,
 * carries tracked in r10/r11/r14) and add the diagonal squares a[i]^2. */
1195 movq -16(%rsi,%rbp,1),%rax
1196 leaq 48+8(%rsp),%rdi
1197 xorq %r10,%r10
1198 movq 8(%rdi),%r11

1200 leaq (%r14,%r10,2),%r12
1201 shrq $63,%r10
1202 leaq (%rcx,%r11,2),%r13
1203 shrq $63,%r11
1204 orq %r10,%r13
1205 movq 16(%rdi),%r10
1206 movq %r11,%r14
1207 mulq %rax
1208 negq %r15
1209 movq 24(%rdi),%r11
1210 adcq %rax,%r12
1211 movq -8(%rsi,%rbp,1),%rax
1212 movq %r12,(%rdi)
1213 adcq %rdx,%r13

1215 leaq (%r14,%r10,2),%rbx
1216 movq %r13,8(%rdi)
1217 sbbq %r15,%r15
1218 shrq $63,%r10
1219 leaq (%rcx,%r11,2),%r8
1220 shrq $63,%r11
1221 orq %r10,%r8
1222 movq 32(%rdi),%r10
1223 movq %r11,%r14
1224 mulq %rax
1225 negq %r15
1226 movq 40(%rdi),%r11
1227 adcq %rax,%rbx
1228 movq 0(%rsi,%rbp,1),%rax
1229 movq %rbx,16(%rdi)
1230 adcq %rdx,%r8
1231 leaq 16(%rbp),%rbp
1232 movq %r8,24(%rdi)
1233 sbbq %r15,%r15
1234 leaq 64(%rdi),%rdi
1235 jmp L$sqr4x_shift_n_add

1237 .p2align 5
1238 L$sqr4x_shift_n_add:
1239 leaq (%r14,%r10,2),%r12
1240 shrq $63,%r10
1241 leaq (%rcx,%r11,2),%r13
1242 shrq $63,%r11
1243 orq %r10,%r13
1244 movq -16(%rdi),%r10
1245 movq %r11,%r14
1246 mulq %rax
1247 negq %r15
1248 movq -8(%rdi),%r11
1249 adcq %rax,%r12
1250 movq -8(%rsi,%rbp,1),%rax
1251 movq %r12,-32(%rdi)
1252 adcq %rdx,%r13

1254 leaq (%r14,%r10,2),%rbx
1255 movq %r13,-24(%rdi)
1256 sbbq %r15,%r15
1257 shrq $63,%r10
1258 leaq (%rcx,%r11,2),%r8
1259 shrq $63,%r11
1260 orq %r10,%r8
1261 movq 0(%rdi),%r10
1262 movq %r11,%r14
1263 mulq %rax
1264 negq %r15
1265 movq 8(%rdi),%r11
1266 adcq %rax,%rbx
1267 movq 0(%rsi,%rbp,1),%rax
1268 movq %rbx,-16(%rdi)
1269 adcq %rdx,%r8

1271 leaq (%r14,%r10,2),%r12
1272 movq %r8,-8(%rdi)
1273 sbbq %r15,%r15
1274 shrq $63,%r10
1275 leaq (%rcx,%r11,2),%r13
1276 shrq $63,%r11
1277 orq %r10,%r13
1278 movq 16(%rdi),%r10
1279 movq %r11,%r14
1280 mulq %rax
1281 negq %r15
1282 movq 24(%rdi),%r11
1283 adcq %rax,%r12
1284 movq 8(%rsi,%rbp,1),%rax
1285 movq %r12,0(%rdi)
1286 adcq %rdx,%r13

1288 leaq (%r14,%r10,2),%rbx
1289 movq %r13,8(%rdi)
1290 sbbq %r15,%r15
1291 shrq $63,%r10
1292 leaq (%rcx,%r11,2),%r8
1293 shrq $63,%r11
1294 orq %r10,%r8
1295 movq 32(%rdi),%r10
1296 movq %r11,%r14
1297 mulq %rax
1298 negq %r15
1299 movq 40(%rdi),%r11
1300 adcq %rax,%rbx
1301 movq 16(%rsi,%rbp,1),%rax
1302 movq %rbx,16(%rdi)
1303 adcq %rdx,%r8
1304 movq %r8,24(%rdi)
1305 sbbq %r15,%r15
1306 leaq 64(%rdi),%rdi
1307 addq $32,%rbp
1308 jnz L$sqr4x_shift_n_add

/* Tail of the shift-and-add pass: final two doubled limbs + last square. */
1311 leaq (%r14,%r10,2),%r12
1312 .byte 0x67
1313 shrq $63,%r10
1314 leaq (%rcx,%r11,2),%r13
1315 shrq $63,%r11
1316 orq %r10,%r13
1317 movq -16(%rdi),%r10
1318 movq %r11,%r14
1319 mulq %rax
1320 negq %r15
1321 movq -8(%rdi),%r11
1322 adcq %rax,%r12
1323 movq -8(%rsi),%rax
1324 movq %r12,-32(%rdi)
1325 adcq %rdx,%r13

1327 leaq (%r14,%r10,2),%rbx
1328 movq %r13,-24(%rdi)
1329 sbbq %r15,%r15
1330 shrq $63,%r10
1331 leaq (%rcx,%r11,2),%r8
1332 shrq $63,%r11
1333 orq %r10,%r8
1334 mulq %rax
1335 negq %r15
1336 adcq %rax,%rbx
1337 adcq %rdx,%r8
1338 movq %rbx,-16(%rdi)
1339 movq %r8,-8(%rdi)
/* 66 48 0F 7E D5 = movq %xmm2,%rbp (recover modulus pointer) */
1340 .byte 102,72,15,126,213
/* Phase 3: Montgomery reduction of the 2*num-limb square, 8 limbs/round.
 * 0+8(%rsp) = end of np, 8+8(%rsp) = end of scratch, n0 at 32+8(%rsp). */
1341 sqr8x_reduction:
1342 xorq %rax,%rax
1343 leaq (%rbp,%r9,2),%rcx
1344 leaq 48+8(%rsp,%r9,2),%rdx
1345 movq %rcx,0+8(%rsp)
1346 leaq 48+8(%rsp,%r9,1),%rdi
1347 movq %rdx,8+8(%rsp)
1348 negq %r9
1349 jmp L$8x_reduction_loop

1351 .p2align 5
1352 L$8x_reduction_loop:
1353 leaq (%rdi,%r9,1),%rdi
1354 .byte 0x66
1355 movq 0(%rdi),%rbx
1356 movq 8(%rdi),%r9
1357 movq 16(%rdi),%r10
1358 movq 24(%rdi),%r11
1359 movq 32(%rdi),%r12
1360 movq 40(%rdi),%r13
1361 movq 48(%rdi),%r14
1362 movq 56(%rdi),%r15
1363 movq %rax,(%rdx)
1364 leaq 64(%rdi),%rdi

1366 .byte 0x67
1367 movq %rbx,%r8
1368 imulq 32+8(%rsp),%rbx
1369 movq 0(%rbp),%rax
1370 movl $8,%ecx
1371 jmp L$8x_reduce

1373 .p2align 5
/* Eliminate 8 low limbs: m = limb*n0, add m*np; saves each m on the
 * stack (48-8+8(%rsp,%rcx,8)) for reuse by the L$8x_tail pass. */
1374 L$8x_reduce:
1375 mulq %rbx
1376 movq 16(%rbp),%rax
1377 negq %r8
1378 movq %rdx,%r8
1379 adcq $0,%r8

1381 mulq %rbx
1382 addq %rax,%r9
1383 movq 32(%rbp),%rax
1384 adcq $0,%rdx
1385 addq %r9,%r8
1386 movq %rbx,48-8+8(%rsp,%rcx,8)
1387 movq %rdx,%r9
1388 adcq $0,%r9

1390 mulq %rbx
1391 addq %rax,%r10
1392 movq 48(%rbp),%rax
1393 adcq $0,%rdx
1394 addq %r10,%r9
1395 movq 32+8(%rsp),%rsi
1396 movq %rdx,%r10
1397 adcq $0,%r10

1399 mulq %rbx
1400 addq %rax,%r11
1401 movq 64(%rbp),%rax
1402 adcq $0,%rdx
1403 imulq %r8,%rsi
1404 addq %r11,%r10
1405 movq %rdx,%r11
1406 adcq $0,%r11

1408 mulq %rbx
1409 addq %rax,%r12
1410 movq 80(%rbp),%rax
1411 adcq $0,%rdx
1412 addq %r12,%r11
1413 movq %rdx,%r12
1414 adcq $0,%r12

1416 mulq %rbx
1417 addq %rax,%r13
1418 movq 96(%rbp),%rax
1419 adcq $0,%rdx
1420 addq %r13,%r12
1421 movq %rdx,%r13
1422 adcq $0,%r13

1424 mulq %rbx
1425 addq %rax,%r14
1426 movq 112(%rbp),%rax
1427 adcq $0,%rdx
1428 addq %r14,%r13
1429 movq %rdx,%r14
1430 adcq $0,%r14

1432 mulq %rbx
1433 movq %rsi,%rbx
1434 addq %rax,%r15
1435 movq 0(%rbp),%rax
1436 adcq $0,%rdx
1437 addq %r15,%r14
1438 movq %rdx,%r15
1439 adcq $0,%r15

1441 decl %ecx
1442 jnz L$8x_reduce

1444 leaq 128(%rbp),%rbp
1445 xorq %rax,%rax
1446 movq 8+8(%rsp),%rdx
1447 cmpq 0+8(%rsp),%rbp
1448 jae L$8x_no_tail

/* Tail: propagate the saved m values across the remaining np limbs. */
1450 .byte 0x66
1451 addq 0(%rdi),%r8
1452 adcq 8(%rdi),%r9
1453 adcq 16(%rdi),%r10
1454 adcq 24(%rdi),%r11
1455 adcq 32(%rdi),%r12
1456 adcq 40(%rdi),%r13
1457 adcq 48(%rdi),%r14
1458 adcq 56(%rdi),%r15
1459 sbbq %rsi,%rsi

1461 movq 48+56+8(%rsp),%rbx
1462 movl $8,%ecx
1463 movq 0(%rbp),%rax
1464 jmp L$8x_tail

1466 .p2align 5
1467 L$8x_tail:
1468 mulq %rbx
1469 addq %rax,%r8
1470 movq 16(%rbp),%rax
1471 movq %r8,(%rdi)
1472 movq %rdx,%r8
1473 adcq $0,%r8

1475 mulq %rbx
1476 addq %rax,%r9
1477 movq 32(%rbp),%rax
1478 adcq $0,%rdx
1479 addq %r9,%r8
1480 leaq 8(%rdi),%rdi
1481 movq %rdx,%r9
1482 adcq $0,%r9

1484 mulq %rbx
1485 addq %rax,%r10
1486 movq 48(%rbp),%rax
1487 adcq $0,%rdx
1488 addq %r10,%r9
1489 movq %rdx,%r10
1490 adcq $0,%r10

1492 mulq %rbx
1493 addq %rax,%r11
1494 movq 64(%rbp),%rax
1495 adcq $0,%rdx
1496 addq %r11,%r10
1497 movq %rdx,%r11
1498 adcq $0,%r11

1500 mulq %rbx
1501 addq %rax,%r12
1502 movq 80(%rbp),%rax
1503 adcq $0,%rdx
1504 addq %r12,%r11
1505 movq %rdx,%r12
1506 adcq $0,%r12

1508 mulq %rbx
1509 addq %rax,%r13
1510 movq 96(%rbp),%rax
1511 adcq $0,%rdx
1512 addq %r13,%r12
1513 movq %rdx,%r13
1514 adcq $0,%r13

1516 mulq %rbx
1517 addq %rax,%r14
1518 movq 112(%rbp),%rax
1519 adcq $0,%rdx
1520 addq %r14,%r13
1521 movq %rdx,%r14
1522 adcq $0,%r14

1524 mulq %rbx
1525 movq 48-16+8(%rsp,%rcx,8),%rbx
1526 addq %rax,%r15
1527 adcq $0,%rdx
1528 addq %r15,%r14
1529 movq 0(%rbp),%rax
1530 movq %rdx,%r15
1531 adcq $0,%r15

1533 decl %ecx
1534 jnz L$8x_tail

1536 leaq 128(%rbp),%rbp
1537 movq 8+8(%rsp),%rdx
1538 cmpq 0+8(%rsp),%rbp
1539 jae L$8x_tail_done

1541 movq 48+56+8(%rsp),%rbx
1542 negq %rsi
1543 movq 0(%rbp),%rax
1544 adcq 0(%rdi),%r8
1545 adcq 8(%rdi),%r9
1546 adcq 16(%rdi),%r10
1547 adcq 24(%rdi),%r11
1548 adcq 32(%rdi),%r12
1549 adcq 40(%rdi),%r13
1550 adcq 48(%rdi),%r14
1551 adcq 56(%rdi),%r15
1552 sbbq %rsi,%rsi

1554 movl $8,%ecx
1555 jmp L$8x_tail

1557 .p2align 5
1558 L$8x_tail_done:
1559 addq (%rdx),%r8
1560 xorq %rax,%rax

1562 negq %rsi
1563 L$8x_no_tail:
1564 adcq 0(%rdi),%r8
1565 adcq 8(%rdi),%r9
1566 adcq 16(%rdi),%r10
1567 adcq 24(%rdi),%r11
1568 adcq 32(%rdi),%r12
1569 adcq 40(%rdi),%r13
1570 adcq 48(%rdi),%r14
1571 adcq 56(%rdi),%r15
1572 adcq $0,%rax
1573 movq -16(%rbp),%rcx
1574 xorq %rsi,%rsi

/* 66 48 0F 7E D5 = movq %xmm2,%rbp (re-fetch modulus pointer) */
1576 .byte 102,72,15,126,213

1578 movq %r8,0(%rdi)
1579 movq %r9,8(%rdi)
/* 66 49 0F 7E D9 = movq %xmm3,%r9 (re-fetch num) */
1580 .byte 102,73,15,126,217
1581 movq %r10,16(%rdi)
1582 movq %r11,24(%rdi)
1583 movq %r12,32(%rdi)
1584 movq %r13,40(%rdi)
1585 movq %r14,48(%rdi)
1586 movq %r15,56(%rdi)
1587 leaq 64(%rdi),%rdi

1589 cmpq %rdx,%rdi
1590 jb L$8x_reduction_loop

/* Prepare final conditional subtraction: rax becomes 0 or 1 selecting
 * whether np is actually subtracted (borrow-based, branch-free). */
1592 subq %r15,%rcx
1593 leaq (%rdi,%r9,1),%rbx
1594 adcq %rsi,%rsi
1595 movq %r9,%rcx
1596 orq %rsi,%rax
/* 66 48 0F 7E CF = movq %xmm1,%rdi (result pointer) */
1597 .byte 102,72,15,126,207
1598 xorq $1,%rax
/* 66 48 0F 7E CE = movq %xmm1,%rsi */
1599 .byte 102,72,15,126,206
1600 leaq (%rbp,%rax,8),%rbp
1601 sarq $3+2,%rcx
1602 jmp L$sqr4x_sub

1604 .p2align 5
/* Shared tail (also jumped to by mul4x_internal): rp = tp - np?, 4 limbs
 * per iteration; rbp points either at np or at an interleaved zero run. */
1605 L$sqr4x_sub:
1606 .byte 0x66
1607 movq 0(%rbx),%r12
1608 movq 8(%rbx),%r13
1609 sbbq 0(%rbp),%r12
1610 movq 16(%rbx),%r14
1611 sbbq 16(%rbp),%r13
1612 movq 24(%rbx),%r15
1613 leaq 32(%rbx),%rbx
1614 sbbq 32(%rbp),%r14
1615 movq %r12,0(%rdi)
1616 sbbq 48(%rbp),%r15
1617 leaq 64(%rbp),%rbp
1618 movq %r13,8(%rdi)
1619 movq %r14,16(%rdi)
1620 movq %r15,24(%rdi)
1621 leaq 32(%rdi),%rdi

1623 incq %rcx
1624 jnz L$sqr4x_sub
1625 movq %r9,%r10
1626 negq %r9
/* F3 C3 = rep ret */
1627 .byte 0xf3,0xc3
1628
/*
 * bn_from_montgomery — convert out of Montgomery form.  Only handles
 * num % 8 == 0 via bn_from_mont8x (tail-jump); otherwise returns 0 in eax
 * so the caller falls back to a generic path.
 */
1629 .globl _bn_from_montgomery

1631 .p2align 5
1632 _bn_from_montgomery:
1633 testl $7,%r9d
1634 jz bn_from_mont8x
1635 xorl %eax,%eax
/* F3 C3 = rep ret */
1636 .byte 0xf3,0xc3
1637
1638
1639
/*
 * bn_from_mont8x — Montgomery conversion (effectively multiply by 1 then
 * reduce).  Same frame-placement scheme as the other 8x entry points;
 * copies the input into the scratch area while zeroing the upper half
 * (L$mul_by_1), runs sqr8x_reduction, then wipes the scratch
 * (L$from_mont_zero) before restoring registers.
 */
1640 .p2align 5
1641 bn_from_mont8x:
1642 .byte 0x67
1643 movq %rsp,%rax
1644 pushq %rbx
1645 pushq %rbp
1646 pushq %r12
1647 pushq %r13
1648 pushq %r14
1649 pushq %r15
1650 .byte 0x67
1651 movl %r9d,%r10d
1652 shll $3,%r9d
1653 shll $3+2,%r10d
1654 negq %r9
1655 movq (%r8),%r8







/* Frame placement, avoiding a 4 KiB page-offset alias with ap. */
1663 leaq -64(%rsp,%r9,2),%r11
1664 subq %rsi,%r11
1665 andq $4095,%r11
1666 cmpq %r11,%r10
1667 jb L$from_sp_alt
1668 subq %r11,%rsp
1669 leaq -64(%rsp,%r9,2),%rsp
1670 jmp L$from_sp_done

1672 .p2align 5
1673 L$from_sp_alt:
1674 leaq 4096-64(,%r9,2),%r10
1675 leaq -64(%rsp,%r9,2),%rsp
1676 subq %r10,%r11
1677 movq $0,%r10
1678 cmovcq %r10,%r11
1679 subq %r11,%rsp
1680 L$from_sp_done:
1681 andq $-64,%rsp
1682 movq %r9,%r10
1683 negq %r9










1694 movq %r8,32(%rsp)
1695 movq %rax,40(%rsp)
1696 L$from_body:
1697 movq %r9,%r11
1698 leaq 48(%rsp),%rax
1699 pxor %xmm0,%xmm0
1700 jmp L$mul_by_1

1702 .p2align 5
/* Copy input (64 bytes per pass) into the low half of the scratch and
 * zero the high half, interleaved for throughput. */
1703 L$mul_by_1:
1704 movdqu (%rsi),%xmm1
1705 movdqu 16(%rsi),%xmm2
1706 movdqu 32(%rsi),%xmm3
1707 movdqa %xmm0,(%rax,%r9,1)
1708 movdqu 48(%rsi),%xmm4
1709 movdqa %xmm0,16(%rax,%r9,1)
/* 48 8D B6 40 00 00 00 = leaq 0x40(%rsi),%rsi (long-form encoding) */
1710 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
1711 movdqa %xmm1,(%rax)
1712 movdqa %xmm0,32(%rax,%r9,1)
1713 movdqa %xmm2,16(%rax)
1714 movdqa %xmm0,48(%rax,%r9,1)
1715 movdqa %xmm3,32(%rax)
1716 movdqa %xmm4,48(%rax)
1717 leaq 64(%rax),%rax
1718 subq $64,%r11
1719 jnz L$mul_by_1

/* 66 48/49 0F 6E = movq gpr->xmm: preserve rdi,rcx,r10 across reduction. */
1721 .byte 102,72,15,110,207
1722 .byte 102,72,15,110,209
1723 .byte 0x67
1724 movq %rcx,%rbp
1725 .byte 102,73,15,110,218
1726 call sqr8x_reduction

/* Wipe the scratch area (64 bytes per pass). */
1728 pxor %xmm0,%xmm0
1729 leaq 48(%rsp),%rax
1730 movq 40(%rsp),%rsi
1731 jmp L$from_mont_zero

1733 .p2align 5
1734 L$from_mont_zero:
1735 movdqa %xmm0,0(%rax)
1736 movdqa %xmm0,16(%rax)
1737 movdqa %xmm0,32(%rax)
1738 movdqa %xmm0,48(%rax)
1739 leaq 64(%rax),%rax
1740 subq $32,%r9
1741 jnz L$from_mont_zero

/* Restore callee-saved registers and return 1. */
1743 movq $1,%rax
1744 movq -48(%rsi),%r15
1745 movq -40(%rsi),%r14
1746 movq -32(%rsi),%r13
1747 movq -24(%rsi),%r12
1748 movq -16(%rsi),%rbp
1749 movq -8(%rsi),%rbx
1750 leaq (%rsi),%rsp
1751 L$from_epilogue:
/* F3 C3 = rep ret */
1752 .byte 0xf3,0xc3
1753
/*
 * bn_get_bits5(buf, bitpos) — rdi = byte buffer, esi = bit offset.
 * Loads the 16-bit word containing the bit offset, shifts the in-byte
 * remainder out, and masks to 5 bits.  Result in eax.
 */
1754 .globl _bn_get_bits5

1756 .p2align 4
1757 _bn_get_bits5:
1758 movq %rdi,%r10
1759 movl %esi,%ecx
1760 shrl $3,%esi
1761 movzwl (%r10,%rsi,1),%eax
1762 andl $7,%ecx
1763 shrl %cl,%eax
1764 andl $31,%eax
/* F3 C3 = rep ret */
1765 .byte 0xf3,0xc3
1766
1767
/*
 * bn_scatter5(in, num, table, idx) — rdi = source words, esi = count,
 * rdx = power table, rcx = slot index.  Writes each source qword into the
 * table at a stride of 256 bytes, interleaving this power with the others
 * (the layout bn_gather5 reads back).  No-op when num is 0.
 */
1768 .globl _bn_scatter5

1770 .p2align 4
1771 _bn_scatter5:
1772 cmpl $0,%esi
1773 jz L$scatter_epilogue
1774 leaq (%rdx,%rcx,8),%rdx
1775 L$scatter:
1776 movq (%rdi),%rax
1777 leaq 8(%rdi),%rdi
1778 movq %rax,(%rdx)
1779 leaq 256(%rdx),%rdx
1780 subl $1,%esi
1781 jnz L$scatter
1782 L$scatter_epilogue:
/* F3 C3 = rep ret */
1783 .byte 0xf3,0xc3
1784
1785
/*
 * bn_gather5(out, num, table, idx) — rdi = destination, esi = word count,
 * rdx = interleaved power table, ecx = slot index.  Reconstructs one
 * scattered value using L$magic_masks pand/por selection so that the
 * loaded cache lines do not depend on which of the four mask groups is
 * live (constant-time with respect to the index bits folded into the
 * masks; the low 3 index bits select the 8-byte lane via rdx).
 */
1786 .globl _bn_gather5

1788 .p2align 4
1789 _bn_gather5:
1790 movl %ecx,%r11d
1791 shrl $3,%ecx
1792 andq $7,%r11
1793 notl %ecx
1794 leaq L$magic_masks(%rip),%rax
1795 andl $3,%ecx
1796 leaq 128(%rdx,%r11,8),%rdx
1797 movq 0(%rax,%rcx,8),%xmm4
1798 movq 8(%rax,%rcx,8),%xmm5
1799 movq 16(%rax,%rcx,8),%xmm6
1800 movq 24(%rax,%rcx,8),%xmm7
1801 jmp L$gather
1802 .p2align 4
/* Mask the four candidate qwords and OR them into xmm0; one output
 * word per iteration, table stride 256 bytes. */
1803 L$gather:
1804 movq -128(%rdx),%xmm0
1805 movq -64(%rdx),%xmm1
1806 pand %xmm4,%xmm0
1807 movq 0(%rdx),%xmm2
1808 pand %xmm5,%xmm1
1809 movq 64(%rdx),%xmm3
1810 pand %xmm6,%xmm2
1811 por %xmm1,%xmm0
1812 pand %xmm7,%xmm3
1813 .byte 0x67,0x67
1814 por %xmm2,%xmm0
1815 leaq 256(%rdx),%rdx
1816 por %xmm3,%xmm0

1818 movq %xmm0,(%rdi)
1819 leaq 8(%rdi),%rdi
1820 subl $1,%esi
1821 jnz L$gather
/* F3 C3 = rep ret */
1822 .byte 0xf3,0xc3
1823 L$SEH_end_bn_gather5:
1824
/*
 * L$magic_masks — four 64-bit pand masks per selector value; exactly one
 * of the four qwords loaded into xmm4-7 is all-ones, so the pand/por
 * gather sequences above select a single table entry branch-free.
 */
1825 .p2align 6
1826 L$magic_masks:
1827 .long 0,0, 0,0, 0,0, -1,-1
1828 .long 0,0, 0,0, 0,0, 0,0
/* CRYPTOGAMS identification string:
 * "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS
 *  by <appro@openssl.org>" (NUL-terminated).  Rejoined onto one line:
 * the extracted copy had line-wrap spaces inserted mid-list, including
 * one that split the value 111 into "1 11", which is invalid syntax. */
1829 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1830 #endif
OLDNEW
« no previous file with comments | « third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S ('k') | third_party/boringssl/mac-x86_64/crypto/cpu-x86_64-asm.S » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698