Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(58)

Side by Side Diff: third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S

Issue 2354623003: Pull boringssl generated source from boringssl_gen (Closed)
Patch Set: . Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
(Empty)
1 #if defined(__x86_64__)
2 .text
3
4
5
/*
 * bn_mul_mont_gather5 — Montgomery multiplication with a cache-timing-safe
 * ("gather5") table lookup of the multiplier.  Generated by OpenSSL perlasm
 * (x86_64-mont5.pl) for Mach-O; AT&T syntax, SysV AMD64 ABI.
 * NOTE(review): presumed C signature, from the perlasm source — confirm:
 *   int bn_mul_mont_gather5(BN_ULONG *rp /+rdi+/, const BN_ULONG *ap /+rsi+/,
 *                           const BN_ULONG *table /+rdx+/, const BN_ULONG *np /+rcx+/,
 *                           const BN_ULONG *n0 /+r8+/, int num /+r9+/, int power /+stack+/);
 * num not a multiple of 8 takes the scalar path below; otherwise it tail-jumps
 * to the 4x-unrolled variant (L$mul4x_enter).
 */
6 .globl _bn_mul_mont_gather5
7 .private_extern _bn_mul_mont_gather5
8
9 .p2align 6
10 _bn_mul_mont_gather5:
11 testl $7,%r9d
12 jnz L$mul_enter
13 jmp L$mul4x_enter
14
15 .p2align 4
16 L$mul_enter:
/* Scalar path.  Save %rsp in %rax, load the table index ("power") from the
 * stack into %xmm5, save callee-saved registers, then carve an aligned
 * scratch frame below %rsp; original %rsp is stashed at 8(%rsp,%r9,8). */
17 movl %r9d,%r9d
18 movq %rsp,%rax
19 movd 8(%rsp),%xmm5
20 leaq L$inc(%rip),%r10
21 pushq %rbx
22 pushq %rbp
23 pushq %r12
24 pushq %r13
25 pushq %r14
26 pushq %r15
27
28 leaq 2(%r9),%r11
29 negq %r11
30 leaq -264(%rsp,%r11,8),%rsp
31 andq $-1024,%rsp
32
33 movq %rax,8(%rsp,%r9,8)
34 L$mul_body:
/* Build 32 compare masks on the stack: broadcast the requested index into
 * %xmm5, then pcmpeqd it against the running counters 0..31 (seeded from
 * L$inc).  Exactly one mask is all-ones; the pand/por ladder that follows
 * selects the matching 256-byte-strided table entry without a data-dependent
 * load address (constant-time gather).  The stray ".byte 0x67" lines are
 * address-size prefixes inserted by perlasm for instruction alignment. */
35 leaq 128(%rdx),%r12
36 movdqa 0(%r10),%xmm0
37 movdqa 16(%r10),%xmm1
38 leaq 24-112(%rsp,%r9,8),%r10
39 andq $-16,%r10
40
41 pshufd $0,%xmm5,%xmm5
42 movdqa %xmm1,%xmm4
43 movdqa %xmm1,%xmm2
44 paddd %xmm0,%xmm1
45 pcmpeqd %xmm5,%xmm0
46 .byte 0x67
47 movdqa %xmm4,%xmm3
48 paddd %xmm1,%xmm2
49 pcmpeqd %xmm5,%xmm1
50 movdqa %xmm0,112(%r10)
51 movdqa %xmm4,%xmm0
52
53 paddd %xmm2,%xmm3
54 pcmpeqd %xmm5,%xmm2
55 movdqa %xmm1,128(%r10)
56 movdqa %xmm4,%xmm1
57
58 paddd %xmm3,%xmm0
59 pcmpeqd %xmm5,%xmm3
60 movdqa %xmm2,144(%r10)
61 movdqa %xmm4,%xmm2
62
63 paddd %xmm0,%xmm1
64 pcmpeqd %xmm5,%xmm0
65 movdqa %xmm3,160(%r10)
66 movdqa %xmm4,%xmm3
67 paddd %xmm1,%xmm2
68 pcmpeqd %xmm5,%xmm1
69 movdqa %xmm0,176(%r10)
70 movdqa %xmm4,%xmm0
71
72 paddd %xmm2,%xmm3
73 pcmpeqd %xmm5,%xmm2
74 movdqa %xmm1,192(%r10)
75 movdqa %xmm4,%xmm1
76
77 paddd %xmm3,%xmm0
78 pcmpeqd %xmm5,%xmm3
79 movdqa %xmm2,208(%r10)
80 movdqa %xmm4,%xmm2
81
82 paddd %xmm0,%xmm1
83 pcmpeqd %xmm5,%xmm0
84 movdqa %xmm3,224(%r10)
85 movdqa %xmm4,%xmm3
86 paddd %xmm1,%xmm2
87 pcmpeqd %xmm5,%xmm1
88 movdqa %xmm0,240(%r10)
89 movdqa %xmm4,%xmm0
90
91 paddd %xmm2,%xmm3
92 pcmpeqd %xmm5,%xmm2
93 movdqa %xmm1,256(%r10)
94 movdqa %xmm4,%xmm1
95
96 paddd %xmm3,%xmm0
97 pcmpeqd %xmm5,%xmm3
98 movdqa %xmm2,272(%r10)
99 movdqa %xmm4,%xmm2
100
101 paddd %xmm0,%xmm1
102 pcmpeqd %xmm5,%xmm0
103 movdqa %xmm3,288(%r10)
104 movdqa %xmm4,%xmm3
105 paddd %xmm1,%xmm2
106 pcmpeqd %xmm5,%xmm1
107 movdqa %xmm0,304(%r10)
108
109 paddd %xmm2,%xmm3
110 .byte 0x67
111 pcmpeqd %xmm5,%xmm2
112 movdqa %xmm1,320(%r10)
113
114 pcmpeqd %xmm5,%xmm3
115 movdqa %xmm2,336(%r10)
116 pand 64(%r12),%xmm0
117
118 pand 80(%r12),%xmm1
119 pand 96(%r12),%xmm2
120 movdqa %xmm3,352(%r10)
121 pand 112(%r12),%xmm3
122 por %xmm2,%xmm0
123 por %xmm3,%xmm1
124 movdqa -128(%r12),%xmm4
125 movdqa -112(%r12),%xmm5
126 movdqa -96(%r12),%xmm2
127 pand 112(%r10),%xmm4
128 movdqa -80(%r12),%xmm3
129 pand 128(%r10),%xmm5
130 por %xmm4,%xmm0
131 pand 144(%r10),%xmm2
132 por %xmm5,%xmm1
133 pand 160(%r10),%xmm3
134 por %xmm2,%xmm0
135 por %xmm3,%xmm1
136 movdqa -64(%r12),%xmm4
137 movdqa -48(%r12),%xmm5
138 movdqa -32(%r12),%xmm2
139 pand 176(%r10),%xmm4
140 movdqa -16(%r12),%xmm3
141 pand 192(%r10),%xmm5
142 por %xmm4,%xmm0
143 pand 208(%r10),%xmm2
144 por %xmm5,%xmm1
145 pand 224(%r10),%xmm3
146 por %xmm2,%xmm0
147 por %xmm3,%xmm1
148 movdqa 0(%r12),%xmm4
149 movdqa 16(%r12),%xmm5
150 movdqa 32(%r12),%xmm2
151 pand 240(%r10),%xmm4
152 movdqa 48(%r12),%xmm3
153 pand 256(%r10),%xmm5
154 por %xmm4,%xmm0
155 pand 272(%r10),%xmm2
156 por %xmm5,%xmm1
157 pand 288(%r10),%xmm3
158 por %xmm2,%xmm0
159 por %xmm3,%xmm1
160 por %xmm1,%xmm0
161 pshufd $0x4e,%xmm0,%xmm1
162 por %xmm1,%xmm0
163 leaq 256(%r12),%r12
/* .byte 102,72,15,126,195 encodes movq %xmm0,%rbx: the gathered b[0] word. */
164 .byte 102,72,15,126,195
165
/* First pass (i = 0): tp[] = a[] * b[0], folding in the Montgomery reduction
 * word m = tp[0] * n0 (mod 2^64) on the fly.  %r14 = outer index i,
 * %r15 = inner index j, %rbp = m, %r13/%r11/%r10 carry the chain. */
166 movq (%r8),%r8
167 movq (%rsi),%rax
168
169 xorq %r14,%r14
170 xorq %r15,%r15
171
172 movq %r8,%rbp
173 mulq %rbx
174 movq %rax,%r10
175 movq (%rcx),%rax
176
177 imulq %r10,%rbp
178 movq %rdx,%r11
179
180 mulq %rbp
181 addq %rax,%r10
182 movq 8(%rsi),%rax
183 adcq $0,%rdx
184 movq %rdx,%r13
185
186 leaq 1(%r15),%r15
187 jmp L$1st_enter
188
189 .p2align 4
190 L$1st:
191 addq %rax,%r13
192 movq (%rsi,%r15,8),%rax
193 adcq $0,%rdx
194 addq %r11,%r13
195 movq %r10,%r11
196 adcq $0,%rdx
197 movq %r13,-16(%rsp,%r15,8)
198 movq %rdx,%r13
199
200 L$1st_enter:
201 mulq %rbx
202 addq %rax,%r11
203 movq (%rcx,%r15,8),%rax
204 adcq $0,%rdx
205 leaq 1(%r15),%r15
206 movq %rdx,%r10
207
208 mulq %rbp
209 cmpq %r9,%r15
210 jne L$1st
211
212
213 addq %rax,%r13
214 adcq $0,%rdx
215 addq %r11,%r13
216 adcq $0,%rdx
217 movq %r13,-16(%rsp,%r9,8)
218 movq %rdx,%r13
219 movq %r10,%r11
220
221 xorq %rdx,%rdx
222 addq %r11,%r13
223 adcq $0,%rdx
224 movq %r13,-8(%rsp,%r9,8)
225 movq %rdx,(%rsp,%r9,8)
226
227 leaq 1(%r14),%r14
228 jmp L$outer
229 .p2align 4
/* Outer loop (i = 1..num-1): re-gather b[i] from the table with the same
 * mask-and-pand/por technique (masks were precomputed above, addressed via
 * %rdx), then accumulate a[]*b[i] plus the reduction pass into tp[]. */
230 L$outer:
231 leaq 24+128(%rsp,%r9,8),%rdx
232 andq $-16,%rdx
233 pxor %xmm4,%xmm4
234 pxor %xmm5,%xmm5
235 movdqa -128(%r12),%xmm0
236 movdqa -112(%r12),%xmm1
237 movdqa -96(%r12),%xmm2
238 movdqa -80(%r12),%xmm3
239 pand -128(%rdx),%xmm0
240 pand -112(%rdx),%xmm1
241 por %xmm0,%xmm4
242 pand -96(%rdx),%xmm2
243 por %xmm1,%xmm5
244 pand -80(%rdx),%xmm3
245 por %xmm2,%xmm4
246 por %xmm3,%xmm5
247 movdqa -64(%r12),%xmm0
248 movdqa -48(%r12),%xmm1
249 movdqa -32(%r12),%xmm2
250 movdqa -16(%r12),%xmm3
251 pand -64(%rdx),%xmm0
252 pand -48(%rdx),%xmm1
253 por %xmm0,%xmm4
254 pand -32(%rdx),%xmm2
255 por %xmm1,%xmm5
256 pand -16(%rdx),%xmm3
257 por %xmm2,%xmm4
258 por %xmm3,%xmm5
259 movdqa 0(%r12),%xmm0
260 movdqa 16(%r12),%xmm1
261 movdqa 32(%r12),%xmm2
262 movdqa 48(%r12),%xmm3
263 pand 0(%rdx),%xmm0
264 pand 16(%rdx),%xmm1
265 por %xmm0,%xmm4
266 pand 32(%rdx),%xmm2
267 por %xmm1,%xmm5
268 pand 48(%rdx),%xmm3
269 por %xmm2,%xmm4
270 por %xmm3,%xmm5
271 movdqa 64(%r12),%xmm0
272 movdqa 80(%r12),%xmm1
273 movdqa 96(%r12),%xmm2
274 movdqa 112(%r12),%xmm3
275 pand 64(%rdx),%xmm0
276 pand 80(%rdx),%xmm1
277 por %xmm0,%xmm4
278 pand 96(%rdx),%xmm2
279 por %xmm1,%xmm5
280 pand 112(%rdx),%xmm3
281 por %xmm2,%xmm4
282 por %xmm3,%xmm5
283 por %xmm5,%xmm4
284 pshufd $0x4e,%xmm4,%xmm0
285 por %xmm4,%xmm0
286 leaq 256(%r12),%r12
287
288 movq (%rsi),%rax
/* movq %xmm0,%rbx — the gathered b[i] word. */
289 .byte 102,72,15,126,195
290
291 xorq %r15,%r15
292 movq %r8,%rbp
293 movq (%rsp),%r10
294
295 mulq %rbx
296 addq %rax,%r10
297 movq (%rcx),%rax
298 adcq $0,%rdx
299
300 imulq %r10,%rbp
301 movq %rdx,%r11
302
303 mulq %rbp
304 addq %rax,%r10
305 movq 8(%rsi),%rax
306 adcq $0,%rdx
307 movq 8(%rsp),%r10
308 movq %rdx,%r13
309
310 leaq 1(%r15),%r15
311 jmp L$inner_enter
312
313 .p2align 4
314 L$inner:
315 addq %rax,%r13
316 movq (%rsi,%r15,8),%rax
317 adcq $0,%rdx
318 addq %r10,%r13
319 movq (%rsp,%r15,8),%r10
320 adcq $0,%rdx
321 movq %r13,-16(%rsp,%r15,8)
322 movq %rdx,%r13
323
324 L$inner_enter:
325 mulq %rbx
326 addq %rax,%r11
327 movq (%rcx,%r15,8),%rax
328 adcq $0,%rdx
329 addq %r11,%r10
330 movq %rdx,%r11
331 adcq $0,%r11
332 leaq 1(%r15),%r15
333
334 mulq %rbp
335 cmpq %r9,%r15
336 jne L$inner
337
338 addq %rax,%r13
339 adcq $0,%rdx
340 addq %r10,%r13
341 movq (%rsp,%r9,8),%r10
342 adcq $0,%rdx
343 movq %r13,-16(%rsp,%r9,8)
344 movq %rdx,%r13
345
346 xorq %rdx,%rdx
347 addq %r11,%r13
348 adcq $0,%rdx
349 addq %r10,%r13
350 adcq $0,%rdx
351 movq %r13,-8(%rsp,%r9,8)
352 movq %rdx,(%rsp,%r9,8)
353
354 leaq 1(%r14),%r14
355 cmpq %r9,%r14
356 jb L$outer
357
/* Final conditional subtraction: compute tp - n into rp[] unconditionally,
 * producing a borrow mask in %rax. */
358 xorq %r14,%r14
359 movq (%rsp),%rax
360 leaq (%rsp),%rsi
361 movq %r9,%r15
362 jmp L$sub
363 .p2align 4
364 L$sub: sbbq (%rcx,%r14,8),%rax
365 movq %rax,(%rdi,%r14,8)
366 movq 8(%rsi,%r14,8),%rax
367 leaq 1(%r14),%r14
368 decq %r15
369 jnz L$sub
370
371 sbbq $0,%rax
372 xorq %r14,%r14
373 movq %r9,%r15
374 .p2align 4
/* Branchless select via xor/and/xor: keep tp[i] or the subtracted value
 * depending on the borrow mask in %rax; also overwrites each tp slot
 * (with the loop counter), wiping the secret-dependent scratch. */
375 L$copy:
376 movq (%rsp,%r14,8),%rsi
377 movq (%rdi,%r14,8),%rcx
378 xorq %rcx,%rsi
379 andq %rax,%rsi
380 xorq %rcx,%rsi
381 movq %r14,(%rsp,%r14,8)
382 movq %rsi,(%rdi,%r14,8)
383 leaq 1(%r14),%r14
384 subq $1,%r15
385 jnz L$copy
386
/* Epilogue: recover the saved %rsp, restore callee-saved registers,
 * return 1 in %rax.  .byte 0xf3,0xc3 encodes "rep ret". */
387 movq 8(%rsp,%r9,8),%rsi
388 movq $1,%rax
389
390 movq -48(%rsi),%r15
391 movq -40(%rsi),%r14
392 movq -32(%rsi),%r13
393 movq -24(%rsi),%r12
394 movq -16(%rsi),%rbp
395 movq -8(%rsi),%rbx
396 leaq (%rsi),%rsp
397 L$mul_epilogue:
398 .byte 0xf3,0xc3
399
400
/*
 * bn_mul4x_mont_gather5 — 4x-unrolled Montgomery multiply wrapper, reached
 * via L$mul4x_enter when num is a multiple of 8 (see _bn_mul_mont_gather5).
 * Saves registers, carves a 64-byte-aligned scratch frame whose placement
 * is biased by (%rsp - %rdi) & 4095 — NOTE(review): presumably to avoid the
 * frame aliasing rp in the same 4 KB page/cache set, per the perlasm source
 * — then delegates the real work to mul4x_internal.
 */
401 .p2align 5
402 bn_mul4x_mont_gather5:
403 L$mul4x_enter:
404 .byte 0x67
405 movq %rsp,%rax
406 pushq %rbx
407 pushq %rbp
408 pushq %r12
409 pushq %r13
410 pushq %r14
411 pushq %r15
412
/* %r9 = num*8 (bytes), then negated; %r10 = 3*num*8 used as the frame-size
 * threshold for the alternate placement path. */
413 .byte 0x67
414 shll $3,%r9d
415 leaq (%r9,%r9,2),%r10
416 negq %r9
417
418
419
420
421
422
423
424
425
426
427 leaq -320(%rsp,%r9,2),%r11
428 subq %rdi,%r11
429 andq $4095,%r11
430 cmpq %r11,%r10
431 jb L$mul4xsp_alt
432 subq %r11,%rsp
433 leaq -320(%rsp,%r9,2),%rsp
434 jmp L$mul4xsp_done
435
436 .p2align 5
437 L$mul4xsp_alt:
/* Alternate placement when the bias would underflow the needed frame size. */
438 leaq 4096-320(,%r9,2),%r10
439 leaq -320(%rsp,%r9,2),%rsp
440 subq %r10,%r11
441 movq $0,%r10
442 cmovcq %r10,%r11
443 subq %r11,%rsp
444 L$mul4xsp_done:
445 andq $-64,%rsp
446 negq %r9
447
/* Original %rsp saved at 40(%rsp) for the epilogue below. */
448 movq %rax,40(%rsp)
449 L$mul4x_body:
450
451 call mul4x_internal
452
453 movq 40(%rsp),%rsi
454 movq $1,%rax
455
456 movq -48(%rsi),%r15
457 movq -40(%rsi),%r14
458 movq -32(%rsi),%r13
459 movq -24(%rsi),%r12
460 movq -16(%rsi),%rbp
461 movq -8(%rsi),%rbx
462 leaq (%rsi),%rsp
463 L$mul4x_epilogue:
/* rep ret */
464 .byte 0xf3,0xc3
465
466
467
/*
 * mul4x_internal — core of the 4x-unrolled Montgomery multiplication.
 * Called by bn_mul4x_mont_gather5 and bn_power5 with the frame already set
 * up.  Performs the same constant-time mask-based gather of b[i] as the
 * scalar path, then runs four-limb-wide multiply/reduce passes.  Ends by
 * tail-jumping to L$sqr4x_sub_entry (defined later in this file) for the
 * final conditional subtraction.  NOTE(review): register roles follow the
 * perlasm source (%r9 = -num*8, %r12 = table cursor, %rbx = gathered b[i],
 * %rbp = n0*tp[0] reduction word) — confirm against x86_64-mont5.pl.
 */
468 .p2align 5
469 mul4x_internal:
470 shlq $5,%r9
471 movd 8(%rax),%xmm5
472 leaq L$inc(%rip),%rax
473 leaq 128(%rdx,%r9,1),%r13
474 shrq $5,%r9
/* Build the 32 selection masks (pcmpeqd against counters 0..31 broadcast
 * from the requested index in %xmm5), stored at 112(%r10)..352(%r10). */
475 movdqa 0(%rax),%xmm0
476 movdqa 16(%rax),%xmm1
477 leaq 88-112(%rsp,%r9,1),%r10
478 leaq 128(%rdx),%r12
479
480 pshufd $0,%xmm5,%xmm5
481 movdqa %xmm1,%xmm4
482 .byte 0x67,0x67
483 movdqa %xmm1,%xmm2
484 paddd %xmm0,%xmm1
485 pcmpeqd %xmm5,%xmm0
486 .byte 0x67
487 movdqa %xmm4,%xmm3
488 paddd %xmm1,%xmm2
489 pcmpeqd %xmm5,%xmm1
490 movdqa %xmm0,112(%r10)
491 movdqa %xmm4,%xmm0
492
493 paddd %xmm2,%xmm3
494 pcmpeqd %xmm5,%xmm2
495 movdqa %xmm1,128(%r10)
496 movdqa %xmm4,%xmm1
497
498 paddd %xmm3,%xmm0
499 pcmpeqd %xmm5,%xmm3
500 movdqa %xmm2,144(%r10)
501 movdqa %xmm4,%xmm2
502
503 paddd %xmm0,%xmm1
504 pcmpeqd %xmm5,%xmm0
505 movdqa %xmm3,160(%r10)
506 movdqa %xmm4,%xmm3
507 paddd %xmm1,%xmm2
508 pcmpeqd %xmm5,%xmm1
509 movdqa %xmm0,176(%r10)
510 movdqa %xmm4,%xmm0
511
512 paddd %xmm2,%xmm3
513 pcmpeqd %xmm5,%xmm2
514 movdqa %xmm1,192(%r10)
515 movdqa %xmm4,%xmm1
516
517 paddd %xmm3,%xmm0
518 pcmpeqd %xmm5,%xmm3
519 movdqa %xmm2,208(%r10)
520 movdqa %xmm4,%xmm2
521
522 paddd %xmm0,%xmm1
523 pcmpeqd %xmm5,%xmm0
524 movdqa %xmm3,224(%r10)
525 movdqa %xmm4,%xmm3
526 paddd %xmm1,%xmm2
527 pcmpeqd %xmm5,%xmm1
528 movdqa %xmm0,240(%r10)
529 movdqa %xmm4,%xmm0
530
531 paddd %xmm2,%xmm3
532 pcmpeqd %xmm5,%xmm2
533 movdqa %xmm1,256(%r10)
534 movdqa %xmm4,%xmm1
535
536 paddd %xmm3,%xmm0
537 pcmpeqd %xmm5,%xmm3
538 movdqa %xmm2,272(%r10)
539 movdqa %xmm4,%xmm2
540
541 paddd %xmm0,%xmm1
542 pcmpeqd %xmm5,%xmm0
543 movdqa %xmm3,288(%r10)
544 movdqa %xmm4,%xmm3
545 paddd %xmm1,%xmm2
546 pcmpeqd %xmm5,%xmm1
547 movdqa %xmm0,304(%r10)
548
549 paddd %xmm2,%xmm3
550 .byte 0x67
551 pcmpeqd %xmm5,%xmm2
552 movdqa %xmm1,320(%r10)
553
554 pcmpeqd %xmm5,%xmm3
555 movdqa %xmm2,336(%r10)
/* Constant-time gather of b[0]: AND every table entry with its mask and OR
 * the results together — touches all entries, so no secret-indexed load. */
556 pand 64(%r12),%xmm0
557
558 pand 80(%r12),%xmm1
559 pand 96(%r12),%xmm2
560 movdqa %xmm3,352(%r10)
561 pand 112(%r12),%xmm3
562 por %xmm2,%xmm0
563 por %xmm3,%xmm1
564 movdqa -128(%r12),%xmm4
565 movdqa -112(%r12),%xmm5
566 movdqa -96(%r12),%xmm2
567 pand 112(%r10),%xmm4
568 movdqa -80(%r12),%xmm3
569 pand 128(%r10),%xmm5
570 por %xmm4,%xmm0
571 pand 144(%r10),%xmm2
572 por %xmm5,%xmm1
573 pand 160(%r10),%xmm3
574 por %xmm2,%xmm0
575 por %xmm3,%xmm1
576 movdqa -64(%r12),%xmm4
577 movdqa -48(%r12),%xmm5
578 movdqa -32(%r12),%xmm2
579 pand 176(%r10),%xmm4
580 movdqa -16(%r12),%xmm3
581 pand 192(%r10),%xmm5
582 por %xmm4,%xmm0
583 pand 208(%r10),%xmm2
584 por %xmm5,%xmm1
585 pand 224(%r10),%xmm3
586 por %xmm2,%xmm0
587 por %xmm3,%xmm1
588 movdqa 0(%r12),%xmm4
589 movdqa 16(%r12),%xmm5
590 movdqa 32(%r12),%xmm2
591 pand 240(%r10),%xmm4
592 movdqa 48(%r12),%xmm3
593 pand 256(%r10),%xmm5
594 por %xmm4,%xmm0
595 pand 272(%r10),%xmm2
596 por %xmm5,%xmm1
597 pand 288(%r10),%xmm3
598 por %xmm2,%xmm0
599 por %xmm3,%xmm1
600 por %xmm1,%xmm0
601 pshufd $0x4e,%xmm0,%xmm1
602 por %xmm1,%xmm0
603 leaq 256(%r12),%r12
/* movq %xmm0,%rbx — the gathered b[0]. */
604 .byte 102,72,15,126,195
605
/* Stash the table end pointer and rp for later; load n0 and a[0]. */
606 movq %r13,16+8(%rsp)
607 movq %rdi,56+8(%rsp)
608
609 movq (%r8),%r8
610 movq (%rsi),%rax
611 leaq (%rsi,%r9,1),%rsi
612 negq %r9
613
/* First pass (b[0]): four-limb unrolled multiply-and-reduce into the
 * scratch area at 64+8(%rsp); %rbp = tp[0]*n0 reduction multiplier. */
614 movq %r8,%rbp
615 mulq %rbx
616 movq %rax,%r10
617 movq (%rcx),%rax
618
619 imulq %r10,%rbp
620 leaq 64+8(%rsp),%r14
621 movq %rdx,%r11
622
623 mulq %rbp
624 addq %rax,%r10
625 movq 8(%rsi,%r9,1),%rax
626 adcq $0,%rdx
627 movq %rdx,%rdi
628
629 mulq %rbx
630 addq %rax,%r11
631 movq 8(%rcx),%rax
632 adcq $0,%rdx
633 movq %rdx,%r10
634
635 mulq %rbp
636 addq %rax,%rdi
637 movq 16(%rsi,%r9,1),%rax
638 adcq $0,%rdx
639 addq %r11,%rdi
640 leaq 32(%r9),%r15
641 leaq 32(%rcx),%rcx
642 adcq $0,%rdx
643 movq %rdi,(%r14)
644 movq %rdx,%r13
645 jmp L$1st4x
646
647 .p2align 5
648 L$1st4x:
/* Body handles four limbs per iteration; each pair of mulq %rbx / mulq %rbp
 * interleaves the a[]*b[0] product with the modulus reduction product. */
649 mulq %rbx
650 addq %rax,%r10
651 movq -16(%rcx),%rax
652 leaq 32(%r14),%r14
653 adcq $0,%rdx
654 movq %rdx,%r11
655
656 mulq %rbp
657 addq %rax,%r13
658 movq -8(%rsi,%r15,1),%rax
659 adcq $0,%rdx
660 addq %r10,%r13
661 adcq $0,%rdx
662 movq %r13,-24(%r14)
663 movq %rdx,%rdi
664
665 mulq %rbx
666 addq %rax,%r11
667 movq -8(%rcx),%rax
668 adcq $0,%rdx
669 movq %rdx,%r10
670
671 mulq %rbp
672 addq %rax,%rdi
673 movq (%rsi,%r15,1),%rax
674 adcq $0,%rdx
675 addq %r11,%rdi
676 adcq $0,%rdx
677 movq %rdi,-16(%r14)
678 movq %rdx,%r13
679
680 mulq %rbx
681 addq %rax,%r10
682 movq 0(%rcx),%rax
683 adcq $0,%rdx
684 movq %rdx,%r11
685
686 mulq %rbp
687 addq %rax,%r13
688 movq 8(%rsi,%r15,1),%rax
689 adcq $0,%rdx
690 addq %r10,%r13
691 adcq $0,%rdx
692 movq %r13,-8(%r14)
693 movq %rdx,%rdi
694
695 mulq %rbx
696 addq %rax,%r11
697 movq 8(%rcx),%rax
698 adcq $0,%rdx
699 movq %rdx,%r10
700
701 mulq %rbp
702 addq %rax,%rdi
703 movq 16(%rsi,%r15,1),%rax
704 adcq $0,%rdx
705 addq %r11,%rdi
706 leaq 32(%rcx),%rcx
707 adcq $0,%rdx
708 movq %rdi,(%r14)
709 movq %rdx,%r13
710
711 addq $32,%r15
712 jnz L$1st4x
713
/* Peeled tail of the first pass (last four limbs). */
714 mulq %rbx
715 addq %rax,%r10
716 movq -16(%rcx),%rax
717 leaq 32(%r14),%r14
718 adcq $0,%rdx
719 movq %rdx,%r11
720
721 mulq %rbp
722 addq %rax,%r13
723 movq -8(%rsi),%rax
724 adcq $0,%rdx
725 addq %r10,%r13
726 adcq $0,%rdx
727 movq %r13,-24(%r14)
728 movq %rdx,%rdi
729
730 mulq %rbx
731 addq %rax,%r11
732 movq -8(%rcx),%rax
733 adcq $0,%rdx
734 movq %rdx,%r10
735
736 mulq %rbp
737 addq %rax,%rdi
738 movq (%rsi,%r9,1),%rax
739 adcq $0,%rdx
740 addq %r11,%rdi
741 adcq $0,%rdx
742 movq %rdi,-16(%r14)
743 movq %rdx,%r13
744
745 leaq (%rcx,%r9,1),%rcx
746
747 xorq %rdi,%rdi
748 addq %r10,%r13
749 adcq $0,%rdi
750 movq %r13,-8(%r14)
751
752 jmp L$outer4x
753
754 .p2align 5
755 L$outer4x:
/* Outer loop: gather the next b[i] (masks live at 16+128(%r14)), then run
 * the same 4x multiply/reduce pass accumulating into the scratch tp[]. */
756 leaq 16+128(%r14),%rdx
757 pxor %xmm4,%xmm4
758 pxor %xmm5,%xmm5
759 movdqa -128(%r12),%xmm0
760 movdqa -112(%r12),%xmm1
761 movdqa -96(%r12),%xmm2
762 movdqa -80(%r12),%xmm3
763 pand -128(%rdx),%xmm0
764 pand -112(%rdx),%xmm1
765 por %xmm0,%xmm4
766 pand -96(%rdx),%xmm2
767 por %xmm1,%xmm5
768 pand -80(%rdx),%xmm3
769 por %xmm2,%xmm4
770 por %xmm3,%xmm5
771 movdqa -64(%r12),%xmm0
772 movdqa -48(%r12),%xmm1
773 movdqa -32(%r12),%xmm2
774 movdqa -16(%r12),%xmm3
775 pand -64(%rdx),%xmm0
776 pand -48(%rdx),%xmm1
777 por %xmm0,%xmm4
778 pand -32(%rdx),%xmm2
779 por %xmm1,%xmm5
780 pand -16(%rdx),%xmm3
781 por %xmm2,%xmm4
782 por %xmm3,%xmm5
783 movdqa 0(%r12),%xmm0
784 movdqa 16(%r12),%xmm1
785 movdqa 32(%r12),%xmm2
786 movdqa 48(%r12),%xmm3
787 pand 0(%rdx),%xmm0
788 pand 16(%rdx),%xmm1
789 por %xmm0,%xmm4
790 pand 32(%rdx),%xmm2
791 por %xmm1,%xmm5
792 pand 48(%rdx),%xmm3
793 por %xmm2,%xmm4
794 por %xmm3,%xmm5
795 movdqa 64(%r12),%xmm0
796 movdqa 80(%r12),%xmm1
797 movdqa 96(%r12),%xmm2
798 movdqa 112(%r12),%xmm3
799 pand 64(%rdx),%xmm0
800 pand 80(%rdx),%xmm1
801 por %xmm0,%xmm4
802 pand 96(%rdx),%xmm2
803 por %xmm1,%xmm5
804 pand 112(%rdx),%xmm3
805 por %xmm2,%xmm4
806 por %xmm3,%xmm5
807 por %xmm5,%xmm4
808 pshufd $0x4e,%xmm4,%xmm0
809 por %xmm4,%xmm0
810 leaq 256(%r12),%r12
/* movq %xmm0,%rbx — the gathered b[i]. */
811 .byte 102,72,15,126,195
812
813 movq (%r14,%r9,1),%r10
814 movq %r8,%rbp
815 mulq %rbx
816 addq %rax,%r10
817 movq (%rcx),%rax
818 adcq $0,%rdx
819
820 imulq %r10,%rbp
821 movq %rdx,%r11
822 movq %rdi,(%r14)
823
824 leaq (%r14,%r9,1),%r14
825
826 mulq %rbp
827 addq %rax,%r10
828 movq 8(%rsi,%r9,1),%rax
829 adcq $0,%rdx
830 movq %rdx,%rdi
831
832 mulq %rbx
833 addq %rax,%r11
834 movq 8(%rcx),%rax
835 adcq $0,%rdx
836 addq 8(%r14),%r11
837 adcq $0,%rdx
838 movq %rdx,%r10
839
840 mulq %rbp
841 addq %rax,%rdi
842 movq 16(%rsi,%r9,1),%rax
843 adcq $0,%rdx
844 addq %r11,%rdi
845 leaq 32(%r9),%r15
846 leaq 32(%rcx),%rcx
847 adcq $0,%rdx
848 movq %rdx,%r13
849 jmp L$inner4x
850
851 .p2align 5
852 L$inner4x:
/* Inner loop: like L$1st4x but also folds in the previous tp[] contents
 * read through %r14. */
853 mulq %rbx
854 addq %rax,%r10
855 movq -16(%rcx),%rax
856 adcq $0,%rdx
857 addq 16(%r14),%r10
858 leaq 32(%r14),%r14
859 adcq $0,%rdx
860 movq %rdx,%r11
861
862 mulq %rbp
863 addq %rax,%r13
864 movq -8(%rsi,%r15,1),%rax
865 adcq $0,%rdx
866 addq %r10,%r13
867 adcq $0,%rdx
868 movq %rdi,-32(%r14)
869 movq %rdx,%rdi
870
871 mulq %rbx
872 addq %rax,%r11
873 movq -8(%rcx),%rax
874 adcq $0,%rdx
875 addq -8(%r14),%r11
876 adcq $0,%rdx
877 movq %rdx,%r10
878
879 mulq %rbp
880 addq %rax,%rdi
881 movq (%rsi,%r15,1),%rax
882 adcq $0,%rdx
883 addq %r11,%rdi
884 adcq $0,%rdx
885 movq %r13,-24(%r14)
886 movq %rdx,%r13
887
888 mulq %rbx
889 addq %rax,%r10
890 movq 0(%rcx),%rax
891 adcq $0,%rdx
892 addq (%r14),%r10
893 adcq $0,%rdx
894 movq %rdx,%r11
895
896 mulq %rbp
897 addq %rax,%r13
898 movq 8(%rsi,%r15,1),%rax
899 adcq $0,%rdx
900 addq %r10,%r13
901 adcq $0,%rdx
902 movq %rdi,-16(%r14)
903 movq %rdx,%rdi
904
905 mulq %rbx
906 addq %rax,%r11
907 movq 8(%rcx),%rax
908 adcq $0,%rdx
909 addq 8(%r14),%r11
910 adcq $0,%rdx
911 movq %rdx,%r10
912
913 mulq %rbp
914 addq %rax,%rdi
915 movq 16(%rsi,%r15,1),%rax
916 adcq $0,%rdx
917 addq %r11,%rdi
918 leaq 32(%rcx),%rcx
919 adcq $0,%rdx
920 movq %r13,-8(%r14)
921 movq %rdx,%r13
922
923 addq $32,%r15
924 jnz L$inner4x
925
/* Peeled tail of the outer pass. */
926 mulq %rbx
927 addq %rax,%r10
928 movq -16(%rcx),%rax
929 adcq $0,%rdx
930 addq 16(%r14),%r10
931 leaq 32(%r14),%r14
932 adcq $0,%rdx
933 movq %rdx,%r11
934
935 mulq %rbp
936 addq %rax,%r13
937 movq -8(%rsi),%rax
938 adcq $0,%rdx
939 addq %r10,%r13
940 adcq $0,%rdx
941 movq %rdi,-32(%r14)
942 movq %rdx,%rdi
943
944 mulq %rbx
945 addq %rax,%r11
946 movq %rbp,%rax
947 movq -8(%rcx),%rbp
948 adcq $0,%rdx
949 addq -8(%r14),%r11
950 adcq $0,%rdx
951 movq %rdx,%r10
952
953 mulq %rbp
954 addq %rax,%rdi
955 movq (%rsi,%r9,1),%rax
956 adcq $0,%rdx
957 addq %r11,%rdi
958 adcq $0,%rdx
959 movq %r13,-24(%r14)
960 movq %rdx,%r13
961
962 movq %rdi,-16(%r14)
963 leaq (%rcx,%r9,1),%rcx
964
965 xorq %rdi,%rdi
966 addq %r10,%r13
967 adcq $0,%rdi
968 addq (%r14),%r13
969 adcq $0,%rdi
970 movq %r13,-8(%r14)
971
/* Loop while the table cursor hasn't reached the end pointer saved at
 * 16+8(%rsp); afterwards set up the borrow mask and pointers, then fall
 * into the shared subtraction code at L$sqr4x_sub_entry (defined later). */
972 cmpq 16+8(%rsp),%r12
973 jb L$outer4x
974 xorq %rax,%rax
975 subq %r13,%rbp
976 adcq %r15,%r15
977 orq %r15,%rdi
978 subq %rdi,%rax
979 leaq (%r14,%r9,1),%rbx
980 movq (%rcx),%r12
981 leaq (%rcx),%rbp
982 movq %r9,%rcx
983 sarq $3+2,%rcx
984 movq 56+8(%rsp),%rdi
985 decq %r12
986 xorq %r10,%r10
987 movq 8(%rbp),%r13
988 movq 16(%rbp),%r14
989 movq 24(%rbp),%r15
990 jmp L$sqr4x_sub_entry
991
/*
 * bn_power5 — computes a^(2^5 * ...) style modular exponentiation step:
 * five back-to-back squarings (call __bn_sqr8x_internal/__bn_post4x_internal
 * pairs below) followed by one Montgomery multiply via mul4x_internal.
 * Same register-saving prologue, frame-placement trick, and epilogue as
 * bn_mul4x_mont_gather5.  SysV AMD64 ABI; args in rdi/rsi/rdx/rcx/r8/r9.
 */
992 .globl _bn_power5
993 .private_extern _bn_power5
994
995 .p2align 5
996 _bn_power5:
997 movq %rsp,%rax
998 pushq %rbx
999 pushq %rbp
1000 pushq %r12
1001 pushq %r13
1002 pushq %r14
1003 pushq %r15
1004
/* %r9 = num*8, %r10d = 3*num*8 threshold; n0 dereferenced from %r8. */
1005 shll $3,%r9d
1006 leal (%r9,%r9,2),%r10d
1007 negq %r9
1008 movq (%r8),%r8
1009
1010
1011
1012
1013
1014
1015
1016
/* Place the scratch frame, biased by (%rsp - rp) & 4095 as in
 * bn_mul4x_mont_gather5, then align it to 64 bytes. */
1017 leaq -320(%rsp,%r9,2),%r11
1018 subq %rdi,%r11
1019 andq $4095,%r11
1020 cmpq %r11,%r10
1021 jb L$pwr_sp_alt
1022 subq %r11,%rsp
1023 leaq -320(%rsp,%r9,2),%rsp
1024 jmp L$pwr_sp_done
1025
1026 .p2align 5
1027 L$pwr_sp_alt:
1028 leaq 4096-320(,%r9,2),%r10
1029 leaq -320(%rsp,%r9,2),%rsp
1030 subq %r10,%r11
1031 movq $0,%r10
1032 cmovcq %r10,%r11
1033 subq %r11,%rsp
1034 L$pwr_sp_done:
1035 andq $-64,%rsp
1036 movq %r9,%r10
1037 negq %r9
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
/* Save n0 word and original %rsp in the frame for the callees/epilogue. */
1048 movq %r8,32(%rsp)
1049 movq %rax,40(%rsp)
1050 L$power5_body:
/* The .byte sequences encode movq of rdi/rcx/r10/rdx into xmm9/xmm10/xmm11/
 * xmm12 — NOTE(review): parking pointers in xmm regs across the calls, per
 * the perlasm source; confirm exact register mapping. */
1051 .byte 102,72,15,110,207
1052 .byte 102,72,15,110,209
1053 .byte 102,73,15,110,218
1054 .byte 102,72,15,110,226
1055
/* Five squarings: each __bn_sqr8x_internal is followed by the
 * __bn_post4x_internal reduction/cleanup pass. */
1056 call __bn_sqr8x_internal
1057 call __bn_post4x_internal
1058 call __bn_sqr8x_internal
1059 call __bn_post4x_internal
1060 call __bn_sqr8x_internal
1061 call __bn_post4x_internal
1062 call __bn_sqr8x_internal
1063 call __bn_post4x_internal
1064 call __bn_sqr8x_internal
1065 call __bn_post4x_internal
1066
/* Restore parked values from xmm (movq xmm->gpr encodings), then perform
 * the final Montgomery multiply. */
1067 .byte 102,72,15,126,209
1068 .byte 102,72,15,126,226
1069 movq %rsi,%rdi
1070 movq 40(%rsp),%rax
1071 leaq 32(%rsp),%r8
1072
1073 call mul4x_internal
1074
/* Epilogue: restore callee-saved registers from the saved frame pointer
 * and return 1.  .byte 0xf3,0xc3 is "rep ret". */
1075 movq 40(%rsp),%rsi
1076 movq $1,%rax
1077 movq -48(%rsi),%r15
1078 movq -40(%rsi),%r14
1079 movq -32(%rsi),%r13
1080 movq -24(%rsi),%r12
1081 movq -16(%rsi),%rbp
1082 movq -8(%rsi),%rbx
1083 leaq (%rsi),%rsp
1084 L$power5_epilogue:
1085 .byte 0xf3,0xc3
1086
1087
1088 .globl _bn_sqr8x_internal
1089 .private_extern _bn_sqr8x_internal
1090 .private_extern _bn_sqr8x_internal
1091
1092 .p2align 5
1093 _bn_sqr8x_internal:
1094 __bn_sqr8x_internal:
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168 leaq 32(%r10),%rbp
1169 leaq (%rsi,%r9,1),%rsi
1170
1171 movq %r9,%rcx
1172
1173
1174 movq -32(%rsi,%rbp,1),%r14
1175 leaq 48+8(%rsp,%r9,2),%rdi
1176 movq -24(%rsi,%rbp,1),%rax
1177 leaq -32(%rdi,%rbp,1),%rdi
1178 movq -16(%rsi,%rbp,1),%rbx
1179 movq %rax,%r15
1180
1181 mulq %r14
1182 movq %rax,%r10
1183 movq %rbx,%rax
1184 movq %rdx,%r11
1185 movq %r10,-24(%rdi,%rbp,1)
1186
1187 mulq %r14
1188 addq %rax,%r11
1189 movq %rbx,%rax
1190 adcq $0,%rdx
1191 movq %r11,-16(%rdi,%rbp,1)
1192 movq %rdx,%r10
1193
1194
1195 movq -8(%rsi,%rbp,1),%rbx
1196 mulq %r15
1197 movq %rax,%r12
1198 movq %rbx,%rax
1199 movq %rdx,%r13
1200
1201 leaq (%rbp),%rcx
1202 mulq %r14
1203 addq %rax,%r10
1204 movq %rbx,%rax
1205 movq %rdx,%r11
1206 adcq $0,%r11
1207 addq %r12,%r10
1208 adcq $0,%r11
1209 movq %r10,-8(%rdi,%rcx,1)
1210 jmp L$sqr4x_1st
1211
1212 .p2align 5
1213 L$sqr4x_1st:
1214 movq (%rsi,%rcx,1),%rbx
1215 mulq %r15
1216 addq %rax,%r13
1217 movq %rbx,%rax
1218 movq %rdx,%r12
1219 adcq $0,%r12
1220
1221 mulq %r14
1222 addq %rax,%r11
1223 movq %rbx,%rax
1224 movq 8(%rsi,%rcx,1),%rbx
1225 movq %rdx,%r10
1226 adcq $0,%r10
1227 addq %r13,%r11
1228 adcq $0,%r10
1229
1230
1231 mulq %r15
1232 addq %rax,%r12
1233 movq %rbx,%rax
1234 movq %r11,(%rdi,%rcx,1)
1235 movq %rdx,%r13
1236 adcq $0,%r13
1237
1238 mulq %r14
1239 addq %rax,%r10
1240 movq %rbx,%rax
1241 movq 16(%rsi,%rcx,1),%rbx
1242 movq %rdx,%r11
1243 adcq $0,%r11
1244 addq %r12,%r10
1245 adcq $0,%r11
1246
1247 mulq %r15
1248 addq %rax,%r13
1249 movq %rbx,%rax
1250 movq %r10,8(%rdi,%rcx,1)
1251 movq %rdx,%r12
1252 adcq $0,%r12
1253
1254 mulq %r14
1255 addq %rax,%r11
1256 movq %rbx,%rax
1257 movq 24(%rsi,%rcx,1),%rbx
1258 movq %rdx,%r10
1259 adcq $0,%r10
1260 addq %r13,%r11
1261 adcq $0,%r10
1262
1263
1264 mulq %r15
1265 addq %rax,%r12
1266 movq %rbx,%rax
1267 movq %r11,16(%rdi,%rcx,1)
1268 movq %rdx,%r13
1269 adcq $0,%r13
1270 leaq 32(%rcx),%rcx
1271
1272 mulq %r14
1273 addq %rax,%r10
1274 movq %rbx,%rax
1275 movq %rdx,%r11
1276 adcq $0,%r11
1277 addq %r12,%r10
1278 adcq $0,%r11
1279 movq %r10,-8(%rdi,%rcx,1)
1280
1281 cmpq $0,%rcx
1282 jne L$sqr4x_1st
1283
1284 mulq %r15
1285 addq %rax,%r13
1286 leaq 16(%rbp),%rbp
1287 adcq $0,%rdx
1288 addq %r11,%r13
1289 adcq $0,%rdx
1290
1291 movq %r13,(%rdi)
1292 movq %rdx,%r12
1293 movq %rdx,8(%rdi)
1294 jmp L$sqr4x_outer
1295
1296 .p2align 5
1297 L$sqr4x_outer:
1298 movq -32(%rsi,%rbp,1),%r14
1299 leaq 48+8(%rsp,%r9,2),%rdi
1300 movq -24(%rsi,%rbp,1),%rax
1301 leaq -32(%rdi,%rbp,1),%rdi
1302 movq -16(%rsi,%rbp,1),%rbx
1303 movq %rax,%r15
1304
1305 mulq %r14
1306 movq -24(%rdi,%rbp,1),%r10
1307 addq %rax,%r10
1308 movq %rbx,%rax
1309 adcq $0,%rdx
1310 movq %r10,-24(%rdi,%rbp,1)
1311 movq %rdx,%r11
1312
1313 mulq %r14
1314 addq %rax,%r11
1315 movq %rbx,%rax
1316 adcq $0,%rdx
1317 addq -16(%rdi,%rbp,1),%r11
1318 movq %rdx,%r10
1319 adcq $0,%r10
1320 movq %r11,-16(%rdi,%rbp,1)
1321
1322 xorq %r12,%r12
1323
1324 movq -8(%rsi,%rbp,1),%rbx
1325 mulq %r15
1326 addq %rax,%r12
1327 movq %rbx,%rax
1328 adcq $0,%rdx
1329 addq -8(%rdi,%rbp,1),%r12
1330 movq %rdx,%r13
1331 adcq $0,%r13
1332
1333 mulq %r14
1334 addq %rax,%r10
1335 movq %rbx,%rax
1336 adcq $0,%rdx
1337 addq %r12,%r10
1338 movq %rdx,%r11
1339 adcq $0,%r11
1340 movq %r10,-8(%rdi,%rbp,1)
1341
1342 leaq (%rbp),%rcx
1343 jmp L$sqr4x_inner
1344
1345 .p2align 5
1346 L$sqr4x_inner:
1347 movq (%rsi,%rcx,1),%rbx
1348 mulq %r15
1349 addq %rax,%r13
1350 movq %rbx,%rax
1351 movq %rdx,%r12
1352 adcq $0,%r12
1353 addq (%rdi,%rcx,1),%r13
1354 adcq $0,%r12
1355
1356 .byte 0x67
1357 mulq %r14
1358 addq %rax,%r11
1359 movq %rbx,%rax
1360 movq 8(%rsi,%rcx,1),%rbx
1361 movq %rdx,%r10
1362 adcq $0,%r10
1363 addq %r13,%r11
1364 adcq $0,%r10
1365
1366 mulq %r15
1367 addq %rax,%r12
1368 movq %r11,(%rdi,%rcx,1)
1369 movq %rbx,%rax
1370 movq %rdx,%r13
1371 adcq $0,%r13
1372 addq 8(%rdi,%rcx,1),%r12
1373 leaq 16(%rcx),%rcx
1374 adcq $0,%r13
1375
1376 mulq %r14
1377 addq %rax,%r10
1378 movq %rbx,%rax
1379 adcq $0,%rdx
1380 addq %r12,%r10
1381 movq %rdx,%r11
1382 adcq $0,%r11
1383 movq %r10,-8(%rdi,%rcx,1)
1384
1385 cmpq $0,%rcx
1386 jne L$sqr4x_inner
1387
1388 .byte 0x67
1389 mulq %r15
1390 addq %rax,%r13
1391 adcq $0,%rdx
1392 addq %r11,%r13
1393 adcq $0,%rdx
1394
1395 movq %r13,(%rdi)
1396 movq %rdx,%r12
1397 movq %rdx,8(%rdi)
1398
1399 addq $16,%rbp
1400 jnz L$sqr4x_outer
1401
1402
1403 movq -32(%rsi),%r14
1404 leaq 48+8(%rsp,%r9,2),%rdi
1405 movq -24(%rsi),%rax
1406 leaq -32(%rdi,%rbp,1),%rdi
1407 movq -16(%rsi),%rbx
1408 movq %rax,%r15
1409
1410 mulq %r14
1411 addq %rax,%r10
1412 movq %rbx,%rax
1413 movq %rdx,%r11
1414 adcq $0,%r11
1415
1416 mulq %r14
1417 addq %rax,%r11
1418 movq %rbx,%rax
1419 movq %r10,-24(%rdi)
1420 movq %rdx,%r10
1421 adcq $0,%r10
1422 addq %r13,%r11
1423 movq -8(%rsi),%rbx
1424 adcq $0,%r10
1425
1426 mulq %r15
1427 addq %rax,%r12
1428 movq %rbx,%rax
1429 movq %r11,-16(%rdi)
1430 movq %rdx,%r13
1431 adcq $0,%r13
1432
1433 mulq %r14
1434 addq %rax,%r10
1435 movq %rbx,%rax
1436 movq %rdx,%r11
1437 adcq $0,%r11
1438 addq %r12,%r10
1439 adcq $0,%r11
1440 movq %r10,-8(%rdi)
1441
1442 mulq %r15
1443 addq %rax,%r13
1444 movq -16(%rsi),%rax
1445 adcq $0,%rdx
1446 addq %r11,%r13
1447 adcq $0,%rdx
1448
1449 movq %r13,(%rdi)
1450 movq %rdx,%r12
1451 movq %rdx,8(%rdi)
1452
1453 mulq %rbx
1454 addq $16,%rbp
1455 xorq %r14,%r14
1456 subq %r9,%rbp
1457 xorq %r15,%r15
1458
1459 addq %r12,%rax
1460 adcq $0,%rdx
1461 movq %rax,8(%rdi)
1462 movq %rdx,16(%rdi)
1463 movq %r15,24(%rdi)
1464
1465 movq -16(%rsi,%rbp,1),%rax
1466 leaq 48+8(%rsp),%rdi
1467 xorq %r10,%r10
1468 movq 8(%rdi),%r11
1469
1470 leaq (%r14,%r10,2),%r12
1471 shrq $63,%r10
1472 leaq (%rcx,%r11,2),%r13
1473 shrq $63,%r11
1474 orq %r10,%r13
1475 movq 16(%rdi),%r10
1476 movq %r11,%r14
1477 mulq %rax
1478 negq %r15
1479 movq 24(%rdi),%r11
1480 adcq %rax,%r12
1481 movq -8(%rsi,%rbp,1),%rax
1482 movq %r12,(%rdi)
1483 adcq %rdx,%r13
1484
1485 leaq (%r14,%r10,2),%rbx
1486 movq %r13,8(%rdi)
1487 sbbq %r15,%r15
1488 shrq $63,%r10
1489 leaq (%rcx,%r11,2),%r8
1490 shrq $63,%r11
1491 orq %r10,%r8
1492 movq 32(%rdi),%r10
1493 movq %r11,%r14
1494 mulq %rax
1495 negq %r15
1496 movq 40(%rdi),%r11
1497 adcq %rax,%rbx
1498 movq 0(%rsi,%rbp,1),%rax
1499 movq %rbx,16(%rdi)
1500 adcq %rdx,%r8
1501 leaq 16(%rbp),%rbp
1502 movq %r8,24(%rdi)
1503 sbbq %r15,%r15
1504 leaq 64(%rdi),%rdi
1505 jmp L$sqr4x_shift_n_add
1506
1507 .p2align 5
1508 L$sqr4x_shift_n_add:
1509 leaq (%r14,%r10,2),%r12
1510 shrq $63,%r10
1511 leaq (%rcx,%r11,2),%r13
1512 shrq $63,%r11
1513 orq %r10,%r13
1514 movq -16(%rdi),%r10
1515 movq %r11,%r14
1516 mulq %rax
1517 negq %r15
1518 movq -8(%rdi),%r11
1519 adcq %rax,%r12
1520 movq -8(%rsi,%rbp,1),%rax
1521 movq %r12,-32(%rdi)
1522 adcq %rdx,%r13
1523
1524 leaq (%r14,%r10,2),%rbx
1525 movq %r13,-24(%rdi)
1526 sbbq %r15,%r15
1527 shrq $63,%r10
1528 leaq (%rcx,%r11,2),%r8
1529 shrq $63,%r11
1530 orq %r10,%r8
1531 movq 0(%rdi),%r10
1532 movq %r11,%r14
1533 mulq %rax
1534 negq %r15
1535 movq 8(%rdi),%r11
1536 adcq %rax,%rbx
1537 movq 0(%rsi,%rbp,1),%rax
1538 movq %rbx,-16(%rdi)
1539 adcq %rdx,%r8
1540
1541 leaq (%r14,%r10,2),%r12
1542 movq %r8,-8(%rdi)
1543 sbbq %r15,%r15
1544 shrq $63,%r10
1545 leaq (%rcx,%r11,2),%r13
1546 shrq $63,%r11
1547 orq %r10,%r13
1548 movq 16(%rdi),%r10
1549 movq %r11,%r14
1550 mulq %rax
1551 negq %r15
1552 movq 24(%rdi),%r11
1553 adcq %rax,%r12
1554 movq 8(%rsi,%rbp,1),%rax
1555 movq %r12,0(%rdi)
1556 adcq %rdx,%r13
1557
1558 leaq (%r14,%r10,2),%rbx
1559 movq %r13,8(%rdi)
1560 sbbq %r15,%r15
1561 shrq $63,%r10
1562 leaq (%rcx,%r11,2),%r8
1563 shrq $63,%r11
1564 orq %r10,%r8
1565 movq 32(%rdi),%r10
1566 movq %r11,%r14
1567 mulq %rax
1568 negq %r15
1569 movq 40(%rdi),%r11
1570 adcq %rax,%rbx
1571 movq 16(%rsi,%rbp,1),%rax
1572 movq %rbx,16(%rdi)
1573 adcq %rdx,%r8
1574 movq %r8,24(%rdi)
1575 sbbq %r15,%r15
1576 leaq 64(%rdi),%rdi
1577 addq $32,%rbp
1578 jnz L$sqr4x_shift_n_add
1579
1580 leaq (%r14,%r10,2),%r12
1581 .byte 0x67
1582 shrq $63,%r10
1583 leaq (%rcx,%r11,2),%r13
1584 shrq $63,%r11
1585 orq %r10,%r13
1586 movq -16(%rdi),%r10
1587 movq %r11,%r14
1588 mulq %rax
1589 negq %r15
1590 movq -8(%rdi),%r11
1591 adcq %rax,%r12
1592 movq -8(%rsi),%rax
1593 movq %r12,-32(%rdi)
1594 adcq %rdx,%r13
1595
1596 leaq (%r14,%r10,2),%rbx
1597 movq %r13,-24(%rdi)
1598 sbbq %r15,%r15
1599 shrq $63,%r10
1600 leaq (%rcx,%r11,2),%r8
1601 shrq $63,%r11
1602 orq %r10,%r8
1603 mulq %rax
1604 negq %r15
1605 adcq %rax,%rbx
1606 adcq %rdx,%r8
1607 movq %rbx,-16(%rdi)
1608 movq %r8,-8(%rdi)
1609 .byte 102,72,15,126,213
/*
 * __bn_sqr8x_reduction — Montgomery reduction of the double-width squaring
 * result, processed eight 64-bit limbs at a time.  Generated code
 * (perlasm x86_64-mont5.pl); statement order and the adc/sbb carry chains
 * are exact — do not hand-edit the logic.
 *
 * NOTE(review): register/stack roles are inferred from this listing and
 * should be confirmed against the bn_sqr8x_internal caller:
 *   %rbp        - assumed pointer into the modulus n[]
 *   %r9         - assumed byte length of the operand (negated below)
 *   32+8(%rsp)  - assumed n0 = -n^{-1} mod 2^64 (Montgomery constant),
 *                 used by the imulq that derives each reduction factor
 *   %rdi/%rdx   - cursors into the 48+8(%rsp) scratch area
 * On return %rax holds the accumulated top carry and %rbp is reloaded
 * from an XMM register via a hand-encoded movq.
 */
1610 __bn_sqr8x_reduction:
1611 xorq %rax,%rax                /* top-word carry accumulator = 0 */
1612 leaq (%r9,%rbp,1),%rcx       /* end-of-modulus sentinel */
1613 leaq 48+8(%rsp,%r9,2),%rdx
1614 movq %rcx,0+8(%rsp)          /* save sentinel for the loop compares */
1615 leaq 48+8(%rsp,%r9,1),%rdi
1616 movq %rdx,8+8(%rsp)          /* save end-of-tail pointer */
1617 negq %r9
1618 jmp L$8x_reduction_loop
1619
1620 .p2align 5
1621 L$8x_reduction_loop:
1622 leaq (%rdi,%r9,1),%rdi       /* rewind to the chunk being reduced */
1623 .byte 0x66                   /* padding prefix (decoder alignment) */
1624 movq 0(%rdi),%rbx            /* load the next 8 limbs */
1625 movq 8(%rdi),%r9
1626 movq 16(%rdi),%r10
1627 movq 24(%rdi),%r11
1628 movq 32(%rdi),%r12
1629 movq 40(%rdi),%r13
1630 movq 48(%rdi),%r14
1631 movq 56(%rdi),%r15
1632 movq %rax,(%rdx)             /* park previous chunk's carry */
1633 leaq 64(%rdi),%rdi
1634
1635 .byte 0x67                   /* padding prefix (decoder alignment) */
1636 movq %rbx,%r8
1637 imulq 32+8(%rsp),%rbx        /* m = limb * n0 mod 2^64 */
1638 movq 0(%rbp),%rax
1639 movl $8,%ecx                 /* 8 inner iterations per chunk */
1640 jmp L$8x_reduce
1641
1642 .p2align 5
/* Inner loop: add m * n[0..7] so the bottom limb cancels; each pass also
 * computes the next m from n0 (kept in %rsi) and saves m for the tail. */
1643 L$8x_reduce:
1644 mulq %rbx
1645 movq 8(%rbp),%rax
1646 negq %r8                     /* r8 + m*n[0] == 0 mod 2^64; keep carry */
1647 movq %rdx,%r8
1648 adcq $0,%r8
1649
1650 mulq %rbx
1651 addq %rax,%r9
1652 movq 16(%rbp),%rax
1653 adcq $0,%rdx
1654 addq %r9,%r8
1655 movq %rbx,48-8+8(%rsp,%rcx,8) /* save m_i for the tail pass */
1656 movq %rdx,%r9
1657 adcq $0,%r9
1658
1659 mulq %rbx
1660 addq %rax,%r10
1661 movq 24(%rbp),%rax
1662 adcq $0,%rdx
1663 addq %r10,%r9
1664 movq 32+8(%rsp),%rsi         /* reload n0 */
1665 movq %rdx,%r10
1666 adcq $0,%r10
1667
1668 mulq %rbx
1669 addq %rax,%r11
1670 movq 32(%rbp),%rax
1671 adcq $0,%rdx
1672 imulq %r8,%rsi               /* next m = new bottom limb * n0 */
1673 addq %r11,%r10
1674 movq %rdx,%r11
1675 adcq $0,%r11
1676
1677 mulq %rbx
1678 addq %rax,%r12
1679 movq 40(%rbp),%rax
1680 adcq $0,%rdx
1681 addq %r12,%r11
1682 movq %rdx,%r12
1683 adcq $0,%r12
1684
1685 mulq %rbx
1686 addq %rax,%r13
1687 movq 48(%rbp),%rax
1688 adcq $0,%rdx
1689 addq %r13,%r12
1690 movq %rdx,%r13
1691 adcq $0,%r13
1692
1693 mulq %rbx
1694 addq %rax,%r14
1695 movq 56(%rbp),%rax
1696 adcq $0,%rdx
1697 addq %r14,%r13
1698 movq %rdx,%r14
1699 adcq $0,%r14
1700
1701 mulq %rbx
1702 movq %rsi,%rbx               /* commit next m */
1703 addq %rax,%r15
1704 movq 0(%rbp),%rax
1705 adcq $0,%rdx
1706 addq %r15,%r14
1707 movq %rdx,%r15
1708 adcq $0,%r15
1709
1710 decl %ecx
1711 jnz L$8x_reduce
1712
1713 leaq 64(%rbp),%rbp           /* advance to the next modulus window */
1714 xorq %rax,%rax
1715 movq 8+8(%rsp),%rdx
1716 cmpq 0+8(%rsp),%rbp          /* past the end of the modulus? */
1717 jae L$8x_no_tail
1718
1719 .byte 0x66                   /* padding prefix (decoder alignment) */
1720 addq 0(%rdi),%r8             /* fold in next 8 limbs of the source */
1721 adcq 8(%rdi),%r9
1722 adcq 16(%rdi),%r10
1723 adcq 24(%rdi),%r11
1724 adcq 32(%rdi),%r12
1725 adcq 40(%rdi),%r13
1726 adcq 48(%rdi),%r14
1727 adcq 56(%rdi),%r15
1728 sbbq %rsi,%rsi               /* %rsi = 0 or -1: captured borrow/carry */
1729
1730 movq 48+56+8(%rsp),%rbx      /* reload first saved m_i */
1731 movl $8,%ecx
1732 movq 0(%rbp),%rax
1733 jmp L$8x_tail
1734
1735 .p2align 5
/* Tail: replay the saved m_i values against the remaining modulus words,
 * storing finished limbs back through %rdi. */
1736 L$8x_tail:
1737 mulq %rbx
1738 addq %rax,%r8
1739 movq 8(%rbp),%rax
1740 movq %r8,(%rdi)              /* emit one reduced limb */
1741 movq %rdx,%r8
1742 adcq $0,%r8
1743
1744 mulq %rbx
1745 addq %rax,%r9
1746 movq 16(%rbp),%rax
1747 adcq $0,%rdx
1748 addq %r9,%r8
1749 leaq 8(%rdi),%rdi
1750 movq %rdx,%r9
1751 adcq $0,%r9
1752
1753 mulq %rbx
1754 addq %rax,%r10
1755 movq 24(%rbp),%rax
1756 adcq $0,%rdx
1757 addq %r10,%r9
1758 movq %rdx,%r10
1759 adcq $0,%r10
1760
1761 mulq %rbx
1762 addq %rax,%r11
1763 movq 32(%rbp),%rax
1764 adcq $0,%rdx
1765 addq %r11,%r10
1766 movq %rdx,%r11
1767 adcq $0,%r11
1768
1769 mulq %rbx
1770 addq %rax,%r12
1771 movq 40(%rbp),%rax
1772 adcq $0,%rdx
1773 addq %r12,%r11
1774 movq %rdx,%r12
1775 adcq $0,%r12
1776
1777 mulq %rbx
1778 addq %rax,%r13
1779 movq 48(%rbp),%rax
1780 adcq $0,%rdx
1781 addq %r13,%r12
1782 movq %rdx,%r13
1783 adcq $0,%r13
1784
1785 mulq %rbx
1786 addq %rax,%r14
1787 movq 56(%rbp),%rax
1788 adcq $0,%rdx
1789 addq %r14,%r13
1790 movq %rdx,%r14
1791 adcq $0,%r14
1792
1793 mulq %rbx
1794 movq 48-16+8(%rsp,%rcx,8),%rbx /* next saved m_i */
1795 addq %rax,%r15
1796 adcq $0,%rdx
1797 addq %r15,%r14
1798 movq 0(%rbp),%rax
1799 movq %rdx,%r15
1800 adcq $0,%r15
1801
1802 decl %ecx
1803 jnz L$8x_tail
1804
1805 leaq 64(%rbp),%rbp
1806 movq 8+8(%rsp),%rdx
1807 cmpq 0+8(%rsp),%rbp          /* modulus exhausted? */
1808 jae L$8x_tail_done
1809
1810 movq 48+56+8(%rsp),%rbx      /* restart tail with first saved m_i */
1811 negq %rsi                    /* restore carry captured in %rsi */
1812 movq 0(%rbp),%rax
1813 adcq 0(%rdi),%r8
1814 adcq 8(%rdi),%r9
1815 adcq 16(%rdi),%r10
1816 adcq 24(%rdi),%r11
1817 adcq 32(%rdi),%r12
1818 adcq 40(%rdi),%r13
1819 adcq 48(%rdi),%r14
1820 adcq 56(%rdi),%r15
1821 sbbq %rsi,%rsi               /* re-capture carry for the next round */
1822
1823 movl $8,%ecx
1824 jmp L$8x_tail
1825
1826 .p2align 5
1827 L$8x_tail_done:
1828 addq (%rdx),%r8              /* fold in the parked chunk carry */
1829 adcq $0,%r9
1830 adcq $0,%r10
1831 adcq $0,%r11
1832 adcq $0,%r12
1833 adcq $0,%r13
1834 adcq $0,%r14
1835 adcq $0,%r15
1836
1837
1838 xorq %rax,%rax
1839
1840 negq %rsi                    /* restore pending carry */
1841 L$8x_no_tail:
1842 adcq 0(%rdi),%r8             /* add remaining source limbs + carry */
1843 adcq 8(%rdi),%r9
1844 adcq 16(%rdi),%r10
1845 adcq 24(%rdi),%r11
1846 adcq 32(%rdi),%r12
1847 adcq 40(%rdi),%r13
1848 adcq 48(%rdi),%r14
1849 adcq 56(%rdi),%r15
1850 adcq $0,%rax                 /* %rax = final top carry */
1851 movq -8(%rbp),%rcx
1852 xorq %rsi,%rsi
1853
1854 .byte 102,72,15,126,213      /* movq %xmm2,%rbp (hand-encoded) */
1855
1856 movq %r8,0(%rdi)             /* store the reduced chunk */
1857 movq %r9,8(%rdi)
1858 .byte 102,73,15,126,217      /* movq %xmm3,%r9 (hand-encoded) */
1859 movq %r10,16(%rdi)
1860 movq %r11,24(%rdi)
1861 movq %r12,32(%rdi)
1862 movq %r13,40(%rdi)
1863 movq %r14,48(%rdi)
1864 movq %r15,56(%rdi)
1865 leaq 64(%rdi),%rdi
1866
1867 cmpq %rdx,%rdi               /* more chunks to reduce? */
1868 jb L$8x_reduction_loop
1869 .byte 0xf3,0xc3              /* repz ret */
1870
1871
/*
 * __bn_post4x_internal — constant-time final conditional subtraction.
 * Subtracts the modulus iff the reduced value overflowed, four limbs per
 * iteration, without any data-dependent branch: the modulus words are
 * complemented and masked with %rax (expected all-ones or zero), then
 * added with the running borrow, so the memory access pattern is
 * independent of the secret comparison result.
 *
 * NOTE(review): inferred from this listing — %rbp = modulus, %r9 = byte
 * length, %rax = subtract-mask; %xmm1 is assumed to carry the output
 * pointer (confirm against the mul4x/sqr8x callers).
 */
1872 .p2align 5
1873 __bn_post4x_internal:
1874 movq 0(%rbp),%r12
1875 leaq (%rdi,%r9,1),%rbx       /* %rbx = end of the reduced value */
1876 movq %r9,%rcx
1877 .byte 102,72,15,126,207      /* movq %xmm1,%rdi (hand-encoded) */
1878 negq %rax                    /* turn carry into an all-ones/zero mask */
1879 .byte 102,72,15,126,206      /* movq %xmm1,%rsi (hand-encoded) */
1880 sarq $3+2,%rcx               /* %rcx = num/32: loop count in 4-limb groups */
1881 decq %r12                    /* pre-bias so adc below computes tmp - n */
1882 xorq %r10,%r10               /* clear running borrow */
1883 movq 8(%rbp),%r13
1884 movq 16(%rbp),%r14
1885 movq 24(%rbp),%r15
1886 jmp L$sqr4x_sub_entry
1887
1888 .p2align 4
1889 L$sqr4x_sub:
1890 movq 0(%rbp),%r12            /* next four modulus limbs */
1891 movq 8(%rbp),%r13
1892 movq 16(%rbp),%r14
1893 movq 24(%rbp),%r15
1894 L$sqr4x_sub_entry:
1895 leaq 32(%rbp),%rbp
1896 notq %r12                    /* ~n, masked: adds either -n-1 or 0 */
1897 notq %r13
1898 notq %r14
1899 notq %r15
1900 andq %rax,%r12               /* apply the constant-time mask */
1901 andq %rax,%r13
1902 andq %rax,%r14
1903 andq %rax,%r15
1904
1905 negq %r10                    /* restore borrow into CF */
1906 adcq 0(%rbx),%r12
1907 adcq 8(%rbx),%r13
1908 adcq 16(%rbx),%r14
1909 adcq 24(%rbx),%r15
1910 movq %r12,0(%rdi)            /* write result limbs */
1911 leaq 32(%rbx),%rbx
1912 movq %r13,8(%rdi)
1913 sbbq %r10,%r10               /* capture borrow for the next group */
1914 movq %r14,16(%rdi)
1915 movq %r15,24(%rdi)
1916 leaq 32(%rdi),%rdi
1917
1918 incq %rcx                    /* %rcx counts up from -num/32 to 0 */
1919 jnz L$sqr4x_sub
1920
1921 movq %r9,%r10
1922 negq %r9                     /* restore %r9 to its negated form */
1923 .byte 0xf3,0xc3              /* repz ret */
1924
/*
 * bn_from_montgomery — convert a value out of Montgomery form.
 * Dispatcher: the optimized path only handles limb counts divisible by 8;
 * otherwise return 0 (eax) so the caller falls back to another route.
 * NOTE(review): %r9d is assumed to be the limb count per the SysV
 * argument order of the sibling entry points — confirm with the C caller.
 */
1925 .globl _bn_from_montgomery
1926 .private_extern _bn_from_montgomery
1927
1928 .p2align 5
1929 _bn_from_montgomery:
1930 testl $7,%r9d                /* num % 8 == 0 ? */
1931 jz bn_from_mont8x            /* yes: take the 8x conversion path */
1932 xorl %eax,%eax               /* no: return 0 ("not handled") */
1933 .byte 0xf3,0xc3              /* repz ret */
1934
1935
1936
/*
 * bn_from_mont8x — out-of-Montgomery conversion for num % 8 == 0.
 * Copies the input into a freshly zero-padded scratch frame ("multiply
 * by 1"), runs __bn_sqr8x_reduction + __bn_post4x_internal over it, then
 * wipes the scratch area and restores callee-saved registers.
 *
 * NOTE(review): SysV argument roles inferred from usage here — %rdi=rp,
 * %rsi=ap, %rcx=np, (%r8)=n0, %r9d=num (limbs) — confirm against the
 * bn_from_montgomery prototype.  The stack-placement dance below chooses
 * between two layouts to avoid the frame aliasing %rdi modulo 4096
 * (4KB-page cache-aliasing avoidance).  Returns 1 in %rax.
 */
1937 .p2align 5
1938 bn_from_mont8x:
1939 .byte 0x67                   /* padding prefix (decoder alignment) */
1940 movq %rsp,%rax               /* remember original %rsp for epilogue */
1941 pushq %rbx                   /* save all callee-saved registers */
1942 pushq %rbp
1943 pushq %r12
1944 pushq %r13
1945 pushq %r14
1946 pushq %r15
1947
1948 shll $3,%r9d                 /* num *= 8: limbs -> bytes */
1949 leaq (%r9,%r9,2),%r10        /* %r10 = 3*num bytes */
1950 negq %r9
1951 movq (%r8),%r8               /* load n0 value */
1952
1953
1954
1955
1956
1957
1958
1959
1960 leaq -320(%rsp,%r9,2),%r11   /* candidate frame base */
1961 subq %rdi,%r11
1962 andq $4095,%r11              /* distance from rp within a 4KB page */
1963 cmpq %r11,%r10
1964 jb L$from_sp_alt
1965 subq %r11,%rsp               /* shift frame to dodge rp aliasing */
1966 leaq -320(%rsp,%r9,2),%rsp
1967 jmp L$from_sp_done
1968
1969 .p2align 5
1970 L$from_sp_alt:
1971 leaq 4096-320(,%r9,2),%r10
1972 leaq -320(%rsp,%r9,2),%rsp
1973 subq %r10,%r11
1974 movq $0,%r10
1975 cmovcq %r10,%r11             /* clamp adjustment at zero */
1976 subq %r11,%rsp
1977 L$from_sp_done:
1978 andq $-64,%rsp               /* 64-byte align the scratch frame */
1979 movq %r9,%r10
1980 negq %r9
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991 movq %r8,32(%rsp)            /* stash n0 where the reduction expects it */
1992 movq %rax,40(%rsp)           /* stash original %rsp for unwinding */
1993 L$from_body:
1994 movq %r9,%r11
1995 leaq 48(%rsp),%rax
1996 pxor %xmm0,%xmm0             /* zero source for the padding stores */
1997 jmp L$mul_by_1
1998
1999 .p2align 5
/* Copy ap into the scratch area while zeroing the upper half ("*1"). */
2000 L$mul_by_1:
2001 movdqu (%rsi),%xmm1
2002 movdqu 16(%rsi),%xmm2
2003 movdqu 32(%rsi),%xmm3
2004 movdqa %xmm0,(%rax,%r9,1)    /* zero the mirrored upper half */
2005 movdqu 48(%rsi),%xmm4
2006 movdqa %xmm0,16(%rax,%r9,1)
2007 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 /* leaq 64(%rsi),%rsi (long encoding) */
2008 movdqa %xmm1,(%rax)
2009 movdqa %xmm0,32(%rax,%r9,1)
2010 movdqa %xmm2,16(%rax)
2011 movdqa %xmm0,48(%rax,%r9,1)
2012 movdqa %xmm3,32(%rax)
2013 movdqa %xmm4,48(%rax)
2014 leaq 64(%rax),%rax
2015 subq $64,%r11
2016 jnz L$mul_by_1
2017
2018 .byte 102,72,15,110,207      /* movq %rdi,%xmm1 (hand-encoded): rp */
2019 .byte 102,72,15,110,209      /* movq %rcx,%xmm2 (hand-encoded): np */
2020 .byte 0x67                   /* padding prefix (decoder alignment) */
2021 movq %rcx,%rbp               /* %rbp = modulus for the reduction */
2022 .byte 102,73,15,110,218      /* movq %r10,%xmm3 (hand-encoded): num */
2023 call __bn_sqr8x_reduction
2024 call __bn_post4x_internal
2025
2026 pxor %xmm0,%xmm0
2027 leaq 48(%rsp),%rax
2028 movq 40(%rsp),%rsi           /* recover original %rsp */
2029 jmp L$from_mont_zero
2030
2031 .p2align 5
/* Wipe the scratch frame before returning (no secrets left on stack). */
2032 L$from_mont_zero:
2033 movdqa %xmm0,0(%rax)
2034 movdqa %xmm0,16(%rax)
2035 movdqa %xmm0,32(%rax)
2036 movdqa %xmm0,48(%rax)
2037 leaq 64(%rax),%rax
2038 subq $32,%r9
2039 jnz L$from_mont_zero
2040
2041 movq $1,%rax                 /* return 1: handled */
2042 movq -48(%rsi),%r15          /* restore callee-saved registers */
2043 movq -40(%rsi),%r14
2044 movq -32(%rsi),%r13
2045 movq -24(%rsi),%r12
2046 movq -16(%rsi),%rbp
2047 movq -8(%rsi),%rbx
2048 leaq (%rsi),%rsp             /* unwind to the caller's stack */
2049 L$from_epilogue:
2050 .byte 0xf3,0xc3              /* repz ret */
2051
/*
 * bn_scatter5 — write a big number into the scatter/gather table.
 * Stores %esi words from (%rdi) at a 256-byte stride starting at
 * table + index*8, so each table "row" interleaves one limb of all 32
 * window entries (layout consumed by bn_gather5 below).
 * NOTE(review): SysV args inferred — %rdi=src, %esi=num words,
 * %rdx=table, %rcx=power index — confirm with the C prototype.
 */
2052 .globl _bn_scatter5
2053 .private_extern _bn_scatter5
2054
2055 .p2align 4
2056 _bn_scatter5:
2057 cmpl $0,%esi                 /* nothing to do for zero length */
2058 jz L$scatter_epilogue
2059 leaq (%rdx,%rcx,8),%rdx      /* column for this power index */
2060 L$scatter:
2061 movq (%rdi),%rax
2062 leaq 8(%rdi),%rdi
2063 movq %rax,(%rdx)
2064 leaq 256(%rdx),%rdx          /* next row: stride = 32 entries * 8 bytes */
2065 subl $1,%esi
2066 jnz L$scatter
2067 L$scatter_epilogue:
2068 .byte 0xf3,0xc3              /* repz ret */
2069
2070
/*
 * bn_gather5 — constant-time gather from the scatter table.
 * Reads every one of the 32 table entries and masks all but the requested
 * one to zero, so the memory access pattern is independent of the secret
 * index (cache-timing defense).  First a 256-byte mask table is built on
 * the stack with pcmpeqd against the L$inc counter vectors: exactly the
 * slot pair matching the index becomes all-ones.
 * NOTE(review): SysV args inferred — %rdi=out, %esi=num words,
 * %rdx=table, %ecx=index (0..31) — confirm with the C prototype.
 */
2071 .globl _bn_gather5
2072 .private_extern _bn_gather5
2073
2074 .p2align 5
2075 _bn_gather5:
2076 L$SEH_begin_bn_gather5:
2077
2078 .byte 0x4c,0x8d,0x14,0x24    /* leaq (%rsp),%r10 (hand-encoded) */
2079 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 /* subq $0x108,%rsp (hand-encoded) */
2080 leaq L$inc(%rip),%rax
2081 andq $-16,%rsp               /* 16-byte align for movdqa stores */
2082
2083 movd %ecx,%xmm5              /* broadcast the secret index */
2084 movdqa 0(%rax),%xmm0         /* {0,0,1,1} */
2085 movdqa 16(%rax),%xmm1        /* {2,2,2,2} increment */
2086 leaq 128(%rdx),%r11          /* bias table/mask pointers so all */
2087 leaq 128(%rsp),%rax          /* offsets fit in signed 8-bit disp */
2088
2089 pshufd $0,%xmm5,%xmm5        /* index in every dword lane */
2090 movdqa %xmm1,%xmm4
2091 movdqa %xmm1,%xmm2
/* Unrolled mask build: compare index against 0..31 (pairs per vector),
 * storing sixteen 16-byte masks at -128(%rax)..112(%rax). */
2092 paddd %xmm0,%xmm1
2093 pcmpeqd %xmm5,%xmm0          /* mask for entries 0,1 */
2094 movdqa %xmm4,%xmm3
2095
2096 paddd %xmm1,%xmm2
2097 pcmpeqd %xmm5,%xmm1          /* mask for entries 2,3 */
2098 movdqa %xmm0,-128(%rax)
2099 movdqa %xmm4,%xmm0
2100
2101 paddd %xmm2,%xmm3
2102 pcmpeqd %xmm5,%xmm2
2103 movdqa %xmm1,-112(%rax)
2104 movdqa %xmm4,%xmm1
2105
2106 paddd %xmm3,%xmm0
2107 pcmpeqd %xmm5,%xmm3
2108 movdqa %xmm2,-96(%rax)
2109 movdqa %xmm4,%xmm2
2110 paddd %xmm0,%xmm1
2111 pcmpeqd %xmm5,%xmm0
2112 movdqa %xmm3,-80(%rax)
2113 movdqa %xmm4,%xmm3
2114
2115 paddd %xmm1,%xmm2
2116 pcmpeqd %xmm5,%xmm1
2117 movdqa %xmm0,-64(%rax)
2118 movdqa %xmm4,%xmm0
2119
2120 paddd %xmm2,%xmm3
2121 pcmpeqd %xmm5,%xmm2
2122 movdqa %xmm1,-48(%rax)
2123 movdqa %xmm4,%xmm1
2124
2125 paddd %xmm3,%xmm0
2126 pcmpeqd %xmm5,%xmm3
2127 movdqa %xmm2,-32(%rax)
2128 movdqa %xmm4,%xmm2
2129 paddd %xmm0,%xmm1
2130 pcmpeqd %xmm5,%xmm0
2131 movdqa %xmm3,-16(%rax)
2132 movdqa %xmm4,%xmm3
2133
2134 paddd %xmm1,%xmm2
2135 pcmpeqd %xmm5,%xmm1
2136 movdqa %xmm0,0(%rax)
2137 movdqa %xmm4,%xmm0
2138
2139 paddd %xmm2,%xmm3
2140 pcmpeqd %xmm5,%xmm2
2141 movdqa %xmm1,16(%rax)
2142 movdqa %xmm4,%xmm1
2143
2144 paddd %xmm3,%xmm0
2145 pcmpeqd %xmm5,%xmm3
2146 movdqa %xmm2,32(%rax)
2147 movdqa %xmm4,%xmm2
2148 paddd %xmm0,%xmm1
2149 pcmpeqd %xmm5,%xmm0
2150 movdqa %xmm3,48(%rax)
2151 movdqa %xmm4,%xmm3
2152
2153 paddd %xmm1,%xmm2
2154 pcmpeqd %xmm5,%xmm1
2155 movdqa %xmm0,64(%rax)
2156 movdqa %xmm4,%xmm0
2157
2158 paddd %xmm2,%xmm3
2159 pcmpeqd %xmm5,%xmm2
2160 movdqa %xmm1,80(%rax)
2161 movdqa %xmm4,%xmm1
2162
2163 paddd %xmm3,%xmm0
2164 pcmpeqd %xmm5,%xmm3
2165 movdqa %xmm2,96(%rax)
2166 movdqa %xmm4,%xmm2
2167 movdqa %xmm3,112(%rax)       /* mask for entries 30,31 */
2168 jmp L$gather
2169
2170 .p2align 5
/* Per-word gather: AND each 256-byte table row against the mask table
 * and OR everything together — touches all 32 entries every time. */
2171 L$gather:
2172 pxor %xmm4,%xmm4
2173 pxor %xmm5,%xmm5
2174 movdqa -128(%r11),%xmm0
2175 movdqa -112(%r11),%xmm1
2176 movdqa -96(%r11),%xmm2
2177 pand -128(%rax),%xmm0
2178 movdqa -80(%r11),%xmm3
2179 pand -112(%rax),%xmm1
2180 por %xmm0,%xmm4
2181 pand -96(%rax),%xmm2
2182 por %xmm1,%xmm5
2183 pand -80(%rax),%xmm3
2184 por %xmm2,%xmm4
2185 por %xmm3,%xmm5
2186 movdqa -64(%r11),%xmm0
2187 movdqa -48(%r11),%xmm1
2188 movdqa -32(%r11),%xmm2
2189 pand -64(%rax),%xmm0
2190 movdqa -16(%r11),%xmm3
2191 pand -48(%rax),%xmm1
2192 por %xmm0,%xmm4
2193 pand -32(%rax),%xmm2
2194 por %xmm1,%xmm5
2195 pand -16(%rax),%xmm3
2196 por %xmm2,%xmm4
2197 por %xmm3,%xmm5
2198 movdqa 0(%r11),%xmm0
2199 movdqa 16(%r11),%xmm1
2200 movdqa 32(%r11),%xmm2
2201 pand 0(%rax),%xmm0
2202 movdqa 48(%r11),%xmm3
2203 pand 16(%rax),%xmm1
2204 por %xmm0,%xmm4
2205 pand 32(%rax),%xmm2
2206 por %xmm1,%xmm5
2207 pand 48(%rax),%xmm3
2208 por %xmm2,%xmm4
2209 por %xmm3,%xmm5
2210 movdqa 64(%r11),%xmm0
2211 movdqa 80(%r11),%xmm1
2212 movdqa 96(%r11),%xmm2
2213 pand 64(%rax),%xmm0
2214 movdqa 112(%r11),%xmm3
2215 pand 80(%rax),%xmm1
2216 por %xmm0,%xmm4
2217 pand 96(%rax),%xmm2
2218 por %xmm1,%xmm5
2219 pand 112(%rax),%xmm3
2220 por %xmm2,%xmm4
2221 por %xmm3,%xmm5
2222 por %xmm5,%xmm4              /* collapse the two accumulators */
2223 leaq 256(%r11),%r11          /* next table row */
2224 pshufd $0x4e,%xmm4,%xmm0     /* fold high qword onto low qword */
2225 por %xmm4,%xmm0
2226 movq %xmm0,(%rdi)            /* emit the selected word */
2227 leaq 8(%rdi),%rdi
2228 subl $1,%esi
2229 jnz L$gather
2230
2231 leaq (%r10),%rsp             /* restore pre-call stack pointer */
2232 .byte 0xf3,0xc3              /* repz ret */
2233 L$SEH_end_bn_gather5:
2234
/*
 * L$inc — counter seed vectors for bn_gather5's mask construction:
 * {0,0,1,1} starts the per-pair index counter, {2,2,2,2} is the step
 * added between pcmpeqd comparisons (each mask vector covers two
 * consecutive table indices, duplicated per dword lane).
 * The trailing .byte line is the NUL-terminated CRYPTOGAMS credit string
 * "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS
 * by <appro@openssl.org>".  It was previously line-wrapped by the review
 * tool, splitting numeric operands ("105 ,99", "46,1 11") into invalid
 * tokens; restored here to a single valid operand list (same bytes).
 */
2235 .p2align 6
2236 L$inc:
2237 .long 0,0, 1,1
2238 .long 2,2, 2,2
2239 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2240 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698