OLD | NEW |
| (Empty) |
1 #if defined(__x86_64__) | |
2 .text | |
3 | |
4 .extern OPENSSL_ia32cap_P | |
5 .hidden OPENSSL_ia32cap_P | |
6 | |
7 .globl bn_mul_mont_gather5 | |
8 .hidden bn_mul_mont_gather5 | |
9 .type bn_mul_mont_gather5,@function | |
10 .align 64 | |
11 bn_mul_mont_gather5: | |
12 testl $7,%r9d | |
13 jnz .Lmul_enter | |
14 jmp .Lmul4x_enter | |
15 | |
16 .align 16 | |
17 .Lmul_enter: | |
18 movl %r9d,%r9d | |
19 movq %rsp,%rax | |
20 movd 8(%rsp),%xmm5 | |
21 leaq .Linc(%rip),%r10 | |
22 pushq %rbx | |
23 pushq %rbp | |
24 pushq %r12 | |
25 pushq %r13 | |
26 pushq %r14 | |
27 pushq %r15 | |
28 | |
29 leaq 2(%r9),%r11 | |
30 negq %r11 | |
31 leaq -264(%rsp,%r11,8),%rsp | |
32 andq $-1024,%rsp | |
33 | |
34 movq %rax,8(%rsp,%r9,8) | |
35 .Lmul_body: | |
36 leaq 128(%rdx),%r12 | |
37 movdqa 0(%r10),%xmm0 | |
38 movdqa 16(%r10),%xmm1 | |
39 leaq 24-112(%rsp,%r9,8),%r10 | |
40 andq $-16,%r10 | |
41 | |
42 pshufd $0,%xmm5,%xmm5 | |
43 movdqa %xmm1,%xmm4 | |
44 movdqa %xmm1,%xmm2 | |
45 paddd %xmm0,%xmm1 | |
46 pcmpeqd %xmm5,%xmm0 | |
47 .byte 0x67 | |
48 movdqa %xmm4,%xmm3 | |
49 paddd %xmm1,%xmm2 | |
50 pcmpeqd %xmm5,%xmm1 | |
51 movdqa %xmm0,112(%r10) | |
52 movdqa %xmm4,%xmm0 | |
53 | |
54 paddd %xmm2,%xmm3 | |
55 pcmpeqd %xmm5,%xmm2 | |
56 movdqa %xmm1,128(%r10) | |
57 movdqa %xmm4,%xmm1 | |
58 | |
59 paddd %xmm3,%xmm0 | |
60 pcmpeqd %xmm5,%xmm3 | |
61 movdqa %xmm2,144(%r10) | |
62 movdqa %xmm4,%xmm2 | |
63 | |
64 paddd %xmm0,%xmm1 | |
65 pcmpeqd %xmm5,%xmm0 | |
66 movdqa %xmm3,160(%r10) | |
67 movdqa %xmm4,%xmm3 | |
68 paddd %xmm1,%xmm2 | |
69 pcmpeqd %xmm5,%xmm1 | |
70 movdqa %xmm0,176(%r10) | |
71 movdqa %xmm4,%xmm0 | |
72 | |
73 paddd %xmm2,%xmm3 | |
74 pcmpeqd %xmm5,%xmm2 | |
75 movdqa %xmm1,192(%r10) | |
76 movdqa %xmm4,%xmm1 | |
77 | |
78 paddd %xmm3,%xmm0 | |
79 pcmpeqd %xmm5,%xmm3 | |
80 movdqa %xmm2,208(%r10) | |
81 movdqa %xmm4,%xmm2 | |
82 | |
83 paddd %xmm0,%xmm1 | |
84 pcmpeqd %xmm5,%xmm0 | |
85 movdqa %xmm3,224(%r10) | |
86 movdqa %xmm4,%xmm3 | |
87 paddd %xmm1,%xmm2 | |
88 pcmpeqd %xmm5,%xmm1 | |
89 movdqa %xmm0,240(%r10) | |
90 movdqa %xmm4,%xmm0 | |
91 | |
92 paddd %xmm2,%xmm3 | |
93 pcmpeqd %xmm5,%xmm2 | |
94 movdqa %xmm1,256(%r10) | |
95 movdqa %xmm4,%xmm1 | |
96 | |
97 paddd %xmm3,%xmm0 | |
98 pcmpeqd %xmm5,%xmm3 | |
99 movdqa %xmm2,272(%r10) | |
100 movdqa %xmm4,%xmm2 | |
101 | |
102 paddd %xmm0,%xmm1 | |
103 pcmpeqd %xmm5,%xmm0 | |
104 movdqa %xmm3,288(%r10) | |
105 movdqa %xmm4,%xmm3 | |
106 paddd %xmm1,%xmm2 | |
107 pcmpeqd %xmm5,%xmm1 | |
108 movdqa %xmm0,304(%r10) | |
109 | |
110 paddd %xmm2,%xmm3 | |
111 .byte 0x67 | |
112 pcmpeqd %xmm5,%xmm2 | |
113 movdqa %xmm1,320(%r10) | |
114 | |
115 pcmpeqd %xmm5,%xmm3 | |
116 movdqa %xmm2,336(%r10) | |
117 pand 64(%r12),%xmm0 | |
118 | |
119 pand 80(%r12),%xmm1 | |
120 pand 96(%r12),%xmm2 | |
121 movdqa %xmm3,352(%r10) | |
122 pand 112(%r12),%xmm3 | |
123 por %xmm2,%xmm0 | |
124 por %xmm3,%xmm1 | |
125 movdqa -128(%r12),%xmm4 | |
126 movdqa -112(%r12),%xmm5 | |
127 movdqa -96(%r12),%xmm2 | |
128 pand 112(%r10),%xmm4 | |
129 movdqa -80(%r12),%xmm3 | |
130 pand 128(%r10),%xmm5 | |
131 por %xmm4,%xmm0 | |
132 pand 144(%r10),%xmm2 | |
133 por %xmm5,%xmm1 | |
134 pand 160(%r10),%xmm3 | |
135 por %xmm2,%xmm0 | |
136 por %xmm3,%xmm1 | |
137 movdqa -64(%r12),%xmm4 | |
138 movdqa -48(%r12),%xmm5 | |
139 movdqa -32(%r12),%xmm2 | |
140 pand 176(%r10),%xmm4 | |
141 movdqa -16(%r12),%xmm3 | |
142 pand 192(%r10),%xmm5 | |
143 por %xmm4,%xmm0 | |
144 pand 208(%r10),%xmm2 | |
145 por %xmm5,%xmm1 | |
146 pand 224(%r10),%xmm3 | |
147 por %xmm2,%xmm0 | |
148 por %xmm3,%xmm1 | |
149 movdqa 0(%r12),%xmm4 | |
150 movdqa 16(%r12),%xmm5 | |
151 movdqa 32(%r12),%xmm2 | |
152 pand 240(%r10),%xmm4 | |
153 movdqa 48(%r12),%xmm3 | |
154 pand 256(%r10),%xmm5 | |
155 por %xmm4,%xmm0 | |
156 pand 272(%r10),%xmm2 | |
157 por %xmm5,%xmm1 | |
158 pand 288(%r10),%xmm3 | |
159 por %xmm2,%xmm0 | |
160 por %xmm3,%xmm1 | |
161 por %xmm1,%xmm0 | |
162 pshufd $0x4e,%xmm0,%xmm1 | |
163 por %xmm1,%xmm0 | |
164 leaq 256(%r12),%r12 | |
165 .byte 102,72,15,126,195 | |
166 | |
167 movq (%r8),%r8 | |
168 movq (%rsi),%rax | |
169 | |
170 xorq %r14,%r14 | |
171 xorq %r15,%r15 | |
172 | |
173 movq %r8,%rbp | |
174 mulq %rbx | |
175 movq %rax,%r10 | |
176 movq (%rcx),%rax | |
177 | |
178 imulq %r10,%rbp | |
179 movq %rdx,%r11 | |
180 | |
181 mulq %rbp | |
182 addq %rax,%r10 | |
183 movq 8(%rsi),%rax | |
184 adcq $0,%rdx | |
185 movq %rdx,%r13 | |
186 | |
187 leaq 1(%r15),%r15 | |
188 jmp .L1st_enter | |
189 | |
190 .align 16 | |
191 .L1st: | |
192 addq %rax,%r13 | |
193 movq (%rsi,%r15,8),%rax | |
194 adcq $0,%rdx | |
195 addq %r11,%r13 | |
196 movq %r10,%r11 | |
197 adcq $0,%rdx | |
198 movq %r13,-16(%rsp,%r15,8) | |
199 movq %rdx,%r13 | |
200 | |
201 .L1st_enter: | |
202 mulq %rbx | |
203 addq %rax,%r11 | |
204 movq (%rcx,%r15,8),%rax | |
205 adcq $0,%rdx | |
206 leaq 1(%r15),%r15 | |
207 movq %rdx,%r10 | |
208 | |
209 mulq %rbp | |
210 cmpq %r9,%r15 | |
211 jne .L1st | |
212 | |
213 | |
214 addq %rax,%r13 | |
215 adcq $0,%rdx | |
216 addq %r11,%r13 | |
217 adcq $0,%rdx | |
218 movq %r13,-16(%rsp,%r9,8) | |
219 movq %rdx,%r13 | |
220 movq %r10,%r11 | |
221 | |
222 xorq %rdx,%rdx | |
223 addq %r11,%r13 | |
224 adcq $0,%rdx | |
225 movq %r13,-8(%rsp,%r9,8) | |
226 movq %rdx,(%rsp,%r9,8) | |
227 | |
228 leaq 1(%r14),%r14 | |
229 jmp .Louter | |
230 .align 16 | |
231 .Louter: | |
232 leaq 24+128(%rsp,%r9,8),%rdx | |
233 andq $-16,%rdx | |
234 pxor %xmm4,%xmm4 | |
235 pxor %xmm5,%xmm5 | |
236 movdqa -128(%r12),%xmm0 | |
237 movdqa -112(%r12),%xmm1 | |
238 movdqa -96(%r12),%xmm2 | |
239 movdqa -80(%r12),%xmm3 | |
240 pand -128(%rdx),%xmm0 | |
241 pand -112(%rdx),%xmm1 | |
242 por %xmm0,%xmm4 | |
243 pand -96(%rdx),%xmm2 | |
244 por %xmm1,%xmm5 | |
245 pand -80(%rdx),%xmm3 | |
246 por %xmm2,%xmm4 | |
247 por %xmm3,%xmm5 | |
248 movdqa -64(%r12),%xmm0 | |
249 movdqa -48(%r12),%xmm1 | |
250 movdqa -32(%r12),%xmm2 | |
251 movdqa -16(%r12),%xmm3 | |
252 pand -64(%rdx),%xmm0 | |
253 pand -48(%rdx),%xmm1 | |
254 por %xmm0,%xmm4 | |
255 pand -32(%rdx),%xmm2 | |
256 por %xmm1,%xmm5 | |
257 pand -16(%rdx),%xmm3 | |
258 por %xmm2,%xmm4 | |
259 por %xmm3,%xmm5 | |
260 movdqa 0(%r12),%xmm0 | |
261 movdqa 16(%r12),%xmm1 | |
262 movdqa 32(%r12),%xmm2 | |
263 movdqa 48(%r12),%xmm3 | |
264 pand 0(%rdx),%xmm0 | |
265 pand 16(%rdx),%xmm1 | |
266 por %xmm0,%xmm4 | |
267 pand 32(%rdx),%xmm2 | |
268 por %xmm1,%xmm5 | |
269 pand 48(%rdx),%xmm3 | |
270 por %xmm2,%xmm4 | |
271 por %xmm3,%xmm5 | |
272 movdqa 64(%r12),%xmm0 | |
273 movdqa 80(%r12),%xmm1 | |
274 movdqa 96(%r12),%xmm2 | |
275 movdqa 112(%r12),%xmm3 | |
276 pand 64(%rdx),%xmm0 | |
277 pand 80(%rdx),%xmm1 | |
278 por %xmm0,%xmm4 | |
279 pand 96(%rdx),%xmm2 | |
280 por %xmm1,%xmm5 | |
281 pand 112(%rdx),%xmm3 | |
282 por %xmm2,%xmm4 | |
283 por %xmm3,%xmm5 | |
284 por %xmm5,%xmm4 | |
285 pshufd $0x4e,%xmm4,%xmm0 | |
286 por %xmm4,%xmm0 | |
287 leaq 256(%r12),%r12 | |
288 | |
289 movq (%rsi),%rax | |
290 .byte 102,72,15,126,195 | |
291 | |
292 xorq %r15,%r15 | |
293 movq %r8,%rbp | |
294 movq (%rsp),%r10 | |
295 | |
296 mulq %rbx | |
297 addq %rax,%r10 | |
298 movq (%rcx),%rax | |
299 adcq $0,%rdx | |
300 | |
301 imulq %r10,%rbp | |
302 movq %rdx,%r11 | |
303 | |
304 mulq %rbp | |
305 addq %rax,%r10 | |
306 movq 8(%rsi),%rax | |
307 adcq $0,%rdx | |
308 movq 8(%rsp),%r10 | |
309 movq %rdx,%r13 | |
310 | |
311 leaq 1(%r15),%r15 | |
312 jmp .Linner_enter | |
313 | |
314 .align 16 | |
315 .Linner: | |
316 addq %rax,%r13 | |
317 movq (%rsi,%r15,8),%rax | |
318 adcq $0,%rdx | |
319 addq %r10,%r13 | |
320 movq (%rsp,%r15,8),%r10 | |
321 adcq $0,%rdx | |
322 movq %r13,-16(%rsp,%r15,8) | |
323 movq %rdx,%r13 | |
324 | |
325 .Linner_enter: | |
326 mulq %rbx | |
327 addq %rax,%r11 | |
328 movq (%rcx,%r15,8),%rax | |
329 adcq $0,%rdx | |
330 addq %r11,%r10 | |
331 movq %rdx,%r11 | |
332 adcq $0,%r11 | |
333 leaq 1(%r15),%r15 | |
334 | |
335 mulq %rbp | |
336 cmpq %r9,%r15 | |
337 jne .Linner | |
338 | |
339 addq %rax,%r13 | |
340 adcq $0,%rdx | |
341 addq %r10,%r13 | |
342 movq (%rsp,%r9,8),%r10 | |
343 adcq $0,%rdx | |
344 movq %r13,-16(%rsp,%r9,8) | |
345 movq %rdx,%r13 | |
346 | |
347 xorq %rdx,%rdx | |
348 addq %r11,%r13 | |
349 adcq $0,%rdx | |
350 addq %r10,%r13 | |
351 adcq $0,%rdx | |
352 movq %r13,-8(%rsp,%r9,8) | |
353 movq %rdx,(%rsp,%r9,8) | |
354 | |
355 leaq 1(%r14),%r14 | |
356 cmpq %r9,%r14 | |
357 jb .Louter | |
358 | |
359 xorq %r14,%r14 | |
360 movq (%rsp),%rax | |
361 leaq (%rsp),%rsi | |
362 movq %r9,%r15 | |
363 jmp .Lsub | |
364 .align 16 | |
365 .Lsub: sbbq (%rcx,%r14,8),%rax | |
366 movq %rax,(%rdi,%r14,8) | |
367 movq 8(%rsi,%r14,8),%rax | |
368 leaq 1(%r14),%r14 | |
369 decq %r15 | |
370 jnz .Lsub | |
371 | |
372 sbbq $0,%rax | |
373 xorq %r14,%r14 | |
374 movq %r9,%r15 | |
375 .align 16 | |
376 .Lcopy: | |
377 movq (%rsp,%r14,8),%rsi | |
378 movq (%rdi,%r14,8),%rcx | |
379 xorq %rcx,%rsi | |
380 andq %rax,%rsi | |
381 xorq %rcx,%rsi | |
382 movq %r14,(%rsp,%r14,8) | |
383 movq %rsi,(%rdi,%r14,8) | |
384 leaq 1(%r14),%r14 | |
385 subq $1,%r15 | |
386 jnz .Lcopy | |
387 | |
388 movq 8(%rsp,%r9,8),%rsi | |
389 movq $1,%rax | |
390 | |
391 movq -48(%rsi),%r15 | |
392 movq -40(%rsi),%r14 | |
393 movq -32(%rsi),%r13 | |
394 movq -24(%rsi),%r12 | |
395 movq -16(%rsi),%rbp | |
396 movq -8(%rsi),%rbx | |
397 leaq (%rsi),%rsp | |
398 .Lmul_epilogue: | |
399 .byte 0xf3,0xc3 | |
400 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 | |
401 .type bn_mul4x_mont_gather5,@function | |
402 .align 32 | |
403 bn_mul4x_mont_gather5: | |
404 .Lmul4x_enter: | |
405 .byte 0x67 | |
406 movq %rsp,%rax | |
407 pushq %rbx | |
408 pushq %rbp | |
409 pushq %r12 | |
410 pushq %r13 | |
411 pushq %r14 | |
412 pushq %r15 | |
413 | |
414 .byte 0x67 | |
415 shll $3,%r9d | |
416 leaq (%r9,%r9,2),%r10 | |
417 negq %r9 | |
418 | |
419 | |
420 | |
421 | |
422 | |
423 | |
424 | |
425 | |
426 | |
427 | |
428 leaq -320(%rsp,%r9,2),%r11 | |
429 subq %rdi,%r11 | |
430 andq $4095,%r11 | |
431 cmpq %r11,%r10 | |
432 jb .Lmul4xsp_alt | |
433 subq %r11,%rsp | |
434 leaq -320(%rsp,%r9,2),%rsp | |
435 jmp .Lmul4xsp_done | |
436 | |
437 .align 32 | |
438 .Lmul4xsp_alt: | |
439 leaq 4096-320(,%r9,2),%r10 | |
440 leaq -320(%rsp,%r9,2),%rsp | |
441 subq %r10,%r11 | |
442 movq $0,%r10 | |
443 cmovcq %r10,%r11 | |
444 subq %r11,%rsp | |
445 .Lmul4xsp_done: | |
446 andq $-64,%rsp | |
447 negq %r9 | |
448 | |
449 movq %rax,40(%rsp) | |
450 .Lmul4x_body: | |
451 | |
452 call mul4x_internal | |
453 | |
454 movq 40(%rsp),%rsi | |
455 movq $1,%rax | |
456 | |
457 movq -48(%rsi),%r15 | |
458 movq -40(%rsi),%r14 | |
459 movq -32(%rsi),%r13 | |
460 movq -24(%rsi),%r12 | |
461 movq -16(%rsi),%rbp | |
462 movq -8(%rsi),%rbx | |
463 leaq (%rsi),%rsp | |
464 .Lmul4x_epilogue: | |
465 .byte 0xf3,0xc3 | |
466 .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 | |
467 | |
468 .type mul4x_internal,@function | |
469 .align 32 | |
470 mul4x_internal: | |
471 shlq $5,%r9 | |
472 movd 8(%rax),%xmm5 | |
473 leaq .Linc(%rip),%rax | |
474 leaq 128(%rdx,%r9,1),%r13 | |
475 shrq $5,%r9 | |
476 movdqa 0(%rax),%xmm0 | |
477 movdqa 16(%rax),%xmm1 | |
478 leaq 88-112(%rsp,%r9,1),%r10 | |
479 leaq 128(%rdx),%r12 | |
480 | |
481 pshufd $0,%xmm5,%xmm5 | |
482 movdqa %xmm1,%xmm4 | |
483 .byte 0x67,0x67 | |
484 movdqa %xmm1,%xmm2 | |
485 paddd %xmm0,%xmm1 | |
486 pcmpeqd %xmm5,%xmm0 | |
487 .byte 0x67 | |
488 movdqa %xmm4,%xmm3 | |
489 paddd %xmm1,%xmm2 | |
490 pcmpeqd %xmm5,%xmm1 | |
491 movdqa %xmm0,112(%r10) | |
492 movdqa %xmm4,%xmm0 | |
493 | |
494 paddd %xmm2,%xmm3 | |
495 pcmpeqd %xmm5,%xmm2 | |
496 movdqa %xmm1,128(%r10) | |
497 movdqa %xmm4,%xmm1 | |
498 | |
499 paddd %xmm3,%xmm0 | |
500 pcmpeqd %xmm5,%xmm3 | |
501 movdqa %xmm2,144(%r10) | |
502 movdqa %xmm4,%xmm2 | |
503 | |
504 paddd %xmm0,%xmm1 | |
505 pcmpeqd %xmm5,%xmm0 | |
506 movdqa %xmm3,160(%r10) | |
507 movdqa %xmm4,%xmm3 | |
508 paddd %xmm1,%xmm2 | |
509 pcmpeqd %xmm5,%xmm1 | |
510 movdqa %xmm0,176(%r10) | |
511 movdqa %xmm4,%xmm0 | |
512 | |
513 paddd %xmm2,%xmm3 | |
514 pcmpeqd %xmm5,%xmm2 | |
515 movdqa %xmm1,192(%r10) | |
516 movdqa %xmm4,%xmm1 | |
517 | |
518 paddd %xmm3,%xmm0 | |
519 pcmpeqd %xmm5,%xmm3 | |
520 movdqa %xmm2,208(%r10) | |
521 movdqa %xmm4,%xmm2 | |
522 | |
523 paddd %xmm0,%xmm1 | |
524 pcmpeqd %xmm5,%xmm0 | |
525 movdqa %xmm3,224(%r10) | |
526 movdqa %xmm4,%xmm3 | |
527 paddd %xmm1,%xmm2 | |
528 pcmpeqd %xmm5,%xmm1 | |
529 movdqa %xmm0,240(%r10) | |
530 movdqa %xmm4,%xmm0 | |
531 | |
532 paddd %xmm2,%xmm3 | |
533 pcmpeqd %xmm5,%xmm2 | |
534 movdqa %xmm1,256(%r10) | |
535 movdqa %xmm4,%xmm1 | |
536 | |
537 paddd %xmm3,%xmm0 | |
538 pcmpeqd %xmm5,%xmm3 | |
539 movdqa %xmm2,272(%r10) | |
540 movdqa %xmm4,%xmm2 | |
541 | |
542 paddd %xmm0,%xmm1 | |
543 pcmpeqd %xmm5,%xmm0 | |
544 movdqa %xmm3,288(%r10) | |
545 movdqa %xmm4,%xmm3 | |
546 paddd %xmm1,%xmm2 | |
547 pcmpeqd %xmm5,%xmm1 | |
548 movdqa %xmm0,304(%r10) | |
549 | |
550 paddd %xmm2,%xmm3 | |
551 .byte 0x67 | |
552 pcmpeqd %xmm5,%xmm2 | |
553 movdqa %xmm1,320(%r10) | |
554 | |
555 pcmpeqd %xmm5,%xmm3 | |
556 movdqa %xmm2,336(%r10) | |
557 pand 64(%r12),%xmm0 | |
558 | |
559 pand 80(%r12),%xmm1 | |
560 pand 96(%r12),%xmm2 | |
561 movdqa %xmm3,352(%r10) | |
562 pand 112(%r12),%xmm3 | |
563 por %xmm2,%xmm0 | |
564 por %xmm3,%xmm1 | |
565 movdqa -128(%r12),%xmm4 | |
566 movdqa -112(%r12),%xmm5 | |
567 movdqa -96(%r12),%xmm2 | |
568 pand 112(%r10),%xmm4 | |
569 movdqa -80(%r12),%xmm3 | |
570 pand 128(%r10),%xmm5 | |
571 por %xmm4,%xmm0 | |
572 pand 144(%r10),%xmm2 | |
573 por %xmm5,%xmm1 | |
574 pand 160(%r10),%xmm3 | |
575 por %xmm2,%xmm0 | |
576 por %xmm3,%xmm1 | |
577 movdqa -64(%r12),%xmm4 | |
578 movdqa -48(%r12),%xmm5 | |
579 movdqa -32(%r12),%xmm2 | |
580 pand 176(%r10),%xmm4 | |
581 movdqa -16(%r12),%xmm3 | |
582 pand 192(%r10),%xmm5 | |
583 por %xmm4,%xmm0 | |
584 pand 208(%r10),%xmm2 | |
585 por %xmm5,%xmm1 | |
586 pand 224(%r10),%xmm3 | |
587 por %xmm2,%xmm0 | |
588 por %xmm3,%xmm1 | |
589 movdqa 0(%r12),%xmm4 | |
590 movdqa 16(%r12),%xmm5 | |
591 movdqa 32(%r12),%xmm2 | |
592 pand 240(%r10),%xmm4 | |
593 movdqa 48(%r12),%xmm3 | |
594 pand 256(%r10),%xmm5 | |
595 por %xmm4,%xmm0 | |
596 pand 272(%r10),%xmm2 | |
597 por %xmm5,%xmm1 | |
598 pand 288(%r10),%xmm3 | |
599 por %xmm2,%xmm0 | |
600 por %xmm3,%xmm1 | |
601 por %xmm1,%xmm0 | |
602 pshufd $0x4e,%xmm0,%xmm1 | |
603 por %xmm1,%xmm0 | |
604 leaq 256(%r12),%r12 | |
605 .byte 102,72,15,126,195 | |
606 | |
607 movq %r13,16+8(%rsp) | |
608 movq %rdi,56+8(%rsp) | |
609 | |
610 movq (%r8),%r8 | |
611 movq (%rsi),%rax | |
612 leaq (%rsi,%r9,1),%rsi | |
613 negq %r9 | |
614 | |
615 movq %r8,%rbp | |
616 mulq %rbx | |
617 movq %rax,%r10 | |
618 movq (%rcx),%rax | |
619 | |
620 imulq %r10,%rbp | |
621 leaq 64+8(%rsp),%r14 | |
622 movq %rdx,%r11 | |
623 | |
624 mulq %rbp | |
625 addq %rax,%r10 | |
626 movq 8(%rsi,%r9,1),%rax | |
627 adcq $0,%rdx | |
628 movq %rdx,%rdi | |
629 | |
630 mulq %rbx | |
631 addq %rax,%r11 | |
632 movq 8(%rcx),%rax | |
633 adcq $0,%rdx | |
634 movq %rdx,%r10 | |
635 | |
636 mulq %rbp | |
637 addq %rax,%rdi | |
638 movq 16(%rsi,%r9,1),%rax | |
639 adcq $0,%rdx | |
640 addq %r11,%rdi | |
641 leaq 32(%r9),%r15 | |
642 leaq 32(%rcx),%rcx | |
643 adcq $0,%rdx | |
644 movq %rdi,(%r14) | |
645 movq %rdx,%r13 | |
646 jmp .L1st4x | |
647 | |
648 .align 32 | |
649 .L1st4x: | |
650 mulq %rbx | |
651 addq %rax,%r10 | |
652 movq -16(%rcx),%rax | |
653 leaq 32(%r14),%r14 | |
654 adcq $0,%rdx | |
655 movq %rdx,%r11 | |
656 | |
657 mulq %rbp | |
658 addq %rax,%r13 | |
659 movq -8(%rsi,%r15,1),%rax | |
660 adcq $0,%rdx | |
661 addq %r10,%r13 | |
662 adcq $0,%rdx | |
663 movq %r13,-24(%r14) | |
664 movq %rdx,%rdi | |
665 | |
666 mulq %rbx | |
667 addq %rax,%r11 | |
668 movq -8(%rcx),%rax | |
669 adcq $0,%rdx | |
670 movq %rdx,%r10 | |
671 | |
672 mulq %rbp | |
673 addq %rax,%rdi | |
674 movq (%rsi,%r15,1),%rax | |
675 adcq $0,%rdx | |
676 addq %r11,%rdi | |
677 adcq $0,%rdx | |
678 movq %rdi,-16(%r14) | |
679 movq %rdx,%r13 | |
680 | |
681 mulq %rbx | |
682 addq %rax,%r10 | |
683 movq 0(%rcx),%rax | |
684 adcq $0,%rdx | |
685 movq %rdx,%r11 | |
686 | |
687 mulq %rbp | |
688 addq %rax,%r13 | |
689 movq 8(%rsi,%r15,1),%rax | |
690 adcq $0,%rdx | |
691 addq %r10,%r13 | |
692 adcq $0,%rdx | |
693 movq %r13,-8(%r14) | |
694 movq %rdx,%rdi | |
695 | |
696 mulq %rbx | |
697 addq %rax,%r11 | |
698 movq 8(%rcx),%rax | |
699 adcq $0,%rdx | |
700 movq %rdx,%r10 | |
701 | |
702 mulq %rbp | |
703 addq %rax,%rdi | |
704 movq 16(%rsi,%r15,1),%rax | |
705 adcq $0,%rdx | |
706 addq %r11,%rdi | |
707 leaq 32(%rcx),%rcx | |
708 adcq $0,%rdx | |
709 movq %rdi,(%r14) | |
710 movq %rdx,%r13 | |
711 | |
712 addq $32,%r15 | |
713 jnz .L1st4x | |
714 | |
715 mulq %rbx | |
716 addq %rax,%r10 | |
717 movq -16(%rcx),%rax | |
718 leaq 32(%r14),%r14 | |
719 adcq $0,%rdx | |
720 movq %rdx,%r11 | |
721 | |
722 mulq %rbp | |
723 addq %rax,%r13 | |
724 movq -8(%rsi),%rax | |
725 adcq $0,%rdx | |
726 addq %r10,%r13 | |
727 adcq $0,%rdx | |
728 movq %r13,-24(%r14) | |
729 movq %rdx,%rdi | |
730 | |
731 mulq %rbx | |
732 addq %rax,%r11 | |
733 movq -8(%rcx),%rax | |
734 adcq $0,%rdx | |
735 movq %rdx,%r10 | |
736 | |
737 mulq %rbp | |
738 addq %rax,%rdi | |
739 movq (%rsi,%r9,1),%rax | |
740 adcq $0,%rdx | |
741 addq %r11,%rdi | |
742 adcq $0,%rdx | |
743 movq %rdi,-16(%r14) | |
744 movq %rdx,%r13 | |
745 | |
746 leaq (%rcx,%r9,1),%rcx | |
747 | |
748 xorq %rdi,%rdi | |
749 addq %r10,%r13 | |
750 adcq $0,%rdi | |
751 movq %r13,-8(%r14) | |
752 | |
753 jmp .Louter4x | |
754 | |
755 .align 32 | |
756 .Louter4x: | |
757 leaq 16+128(%r14),%rdx | |
758 pxor %xmm4,%xmm4 | |
759 pxor %xmm5,%xmm5 | |
760 movdqa -128(%r12),%xmm0 | |
761 movdqa -112(%r12),%xmm1 | |
762 movdqa -96(%r12),%xmm2 | |
763 movdqa -80(%r12),%xmm3 | |
764 pand -128(%rdx),%xmm0 | |
765 pand -112(%rdx),%xmm1 | |
766 por %xmm0,%xmm4 | |
767 pand -96(%rdx),%xmm2 | |
768 por %xmm1,%xmm5 | |
769 pand -80(%rdx),%xmm3 | |
770 por %xmm2,%xmm4 | |
771 por %xmm3,%xmm5 | |
772 movdqa -64(%r12),%xmm0 | |
773 movdqa -48(%r12),%xmm1 | |
774 movdqa -32(%r12),%xmm2 | |
775 movdqa -16(%r12),%xmm3 | |
776 pand -64(%rdx),%xmm0 | |
777 pand -48(%rdx),%xmm1 | |
778 por %xmm0,%xmm4 | |
779 pand -32(%rdx),%xmm2 | |
780 por %xmm1,%xmm5 | |
781 pand -16(%rdx),%xmm3 | |
782 por %xmm2,%xmm4 | |
783 por %xmm3,%xmm5 | |
784 movdqa 0(%r12),%xmm0 | |
785 movdqa 16(%r12),%xmm1 | |
786 movdqa 32(%r12),%xmm2 | |
787 movdqa 48(%r12),%xmm3 | |
788 pand 0(%rdx),%xmm0 | |
789 pand 16(%rdx),%xmm1 | |
790 por %xmm0,%xmm4 | |
791 pand 32(%rdx),%xmm2 | |
792 por %xmm1,%xmm5 | |
793 pand 48(%rdx),%xmm3 | |
794 por %xmm2,%xmm4 | |
795 por %xmm3,%xmm5 | |
796 movdqa 64(%r12),%xmm0 | |
797 movdqa 80(%r12),%xmm1 | |
798 movdqa 96(%r12),%xmm2 | |
799 movdqa 112(%r12),%xmm3 | |
800 pand 64(%rdx),%xmm0 | |
801 pand 80(%rdx),%xmm1 | |
802 por %xmm0,%xmm4 | |
803 pand 96(%rdx),%xmm2 | |
804 por %xmm1,%xmm5 | |
805 pand 112(%rdx),%xmm3 | |
806 por %xmm2,%xmm4 | |
807 por %xmm3,%xmm5 | |
808 por %xmm5,%xmm4 | |
809 pshufd $0x4e,%xmm4,%xmm0 | |
810 por %xmm4,%xmm0 | |
811 leaq 256(%r12),%r12 | |
812 .byte 102,72,15,126,195 | |
813 | |
814 movq (%r14,%r9,1),%r10 | |
815 movq %r8,%rbp | |
816 mulq %rbx | |
817 addq %rax,%r10 | |
818 movq (%rcx),%rax | |
819 adcq $0,%rdx | |
820 | |
821 imulq %r10,%rbp | |
822 movq %rdx,%r11 | |
823 movq %rdi,(%r14) | |
824 | |
825 leaq (%r14,%r9,1),%r14 | |
826 | |
827 mulq %rbp | |
828 addq %rax,%r10 | |
829 movq 8(%rsi,%r9,1),%rax | |
830 adcq $0,%rdx | |
831 movq %rdx,%rdi | |
832 | |
833 mulq %rbx | |
834 addq %rax,%r11 | |
835 movq 8(%rcx),%rax | |
836 adcq $0,%rdx | |
837 addq 8(%r14),%r11 | |
838 adcq $0,%rdx | |
839 movq %rdx,%r10 | |
840 | |
841 mulq %rbp | |
842 addq %rax,%rdi | |
843 movq 16(%rsi,%r9,1),%rax | |
844 adcq $0,%rdx | |
845 addq %r11,%rdi | |
846 leaq 32(%r9),%r15 | |
847 leaq 32(%rcx),%rcx | |
848 adcq $0,%rdx | |
849 movq %rdx,%r13 | |
850 jmp .Linner4x | |
851 | |
852 .align 32 | |
853 .Linner4x: | |
854 mulq %rbx | |
855 addq %rax,%r10 | |
856 movq -16(%rcx),%rax | |
857 adcq $0,%rdx | |
858 addq 16(%r14),%r10 | |
859 leaq 32(%r14),%r14 | |
860 adcq $0,%rdx | |
861 movq %rdx,%r11 | |
862 | |
863 mulq %rbp | |
864 addq %rax,%r13 | |
865 movq -8(%rsi,%r15,1),%rax | |
866 adcq $0,%rdx | |
867 addq %r10,%r13 | |
868 adcq $0,%rdx | |
869 movq %rdi,-32(%r14) | |
870 movq %rdx,%rdi | |
871 | |
872 mulq %rbx | |
873 addq %rax,%r11 | |
874 movq -8(%rcx),%rax | |
875 adcq $0,%rdx | |
876 addq -8(%r14),%r11 | |
877 adcq $0,%rdx | |
878 movq %rdx,%r10 | |
879 | |
880 mulq %rbp | |
881 addq %rax,%rdi | |
882 movq (%rsi,%r15,1),%rax | |
883 adcq $0,%rdx | |
884 addq %r11,%rdi | |
885 adcq $0,%rdx | |
886 movq %r13,-24(%r14) | |
887 movq %rdx,%r13 | |
888 | |
889 mulq %rbx | |
890 addq %rax,%r10 | |
891 movq 0(%rcx),%rax | |
892 adcq $0,%rdx | |
893 addq (%r14),%r10 | |
894 adcq $0,%rdx | |
895 movq %rdx,%r11 | |
896 | |
897 mulq %rbp | |
898 addq %rax,%r13 | |
899 movq 8(%rsi,%r15,1),%rax | |
900 adcq $0,%rdx | |
901 addq %r10,%r13 | |
902 adcq $0,%rdx | |
903 movq %rdi,-16(%r14) | |
904 movq %rdx,%rdi | |
905 | |
906 mulq %rbx | |
907 addq %rax,%r11 | |
908 movq 8(%rcx),%rax | |
909 adcq $0,%rdx | |
910 addq 8(%r14),%r11 | |
911 adcq $0,%rdx | |
912 movq %rdx,%r10 | |
913 | |
914 mulq %rbp | |
915 addq %rax,%rdi | |
916 movq 16(%rsi,%r15,1),%rax | |
917 adcq $0,%rdx | |
918 addq %r11,%rdi | |
919 leaq 32(%rcx),%rcx | |
920 adcq $0,%rdx | |
921 movq %r13,-8(%r14) | |
922 movq %rdx,%r13 | |
923 | |
924 addq $32,%r15 | |
925 jnz .Linner4x | |
926 | |
927 mulq %rbx | |
928 addq %rax,%r10 | |
929 movq -16(%rcx),%rax | |
930 adcq $0,%rdx | |
931 addq 16(%r14),%r10 | |
932 leaq 32(%r14),%r14 | |
933 adcq $0,%rdx | |
934 movq %rdx,%r11 | |
935 | |
936 mulq %rbp | |
937 addq %rax,%r13 | |
938 movq -8(%rsi),%rax | |
939 adcq $0,%rdx | |
940 addq %r10,%r13 | |
941 adcq $0,%rdx | |
942 movq %rdi,-32(%r14) | |
943 movq %rdx,%rdi | |
944 | |
945 mulq %rbx | |
946 addq %rax,%r11 | |
947 movq %rbp,%rax | |
948 movq -8(%rcx),%rbp | |
949 adcq $0,%rdx | |
950 addq -8(%r14),%r11 | |
951 adcq $0,%rdx | |
952 movq %rdx,%r10 | |
953 | |
954 mulq %rbp | |
955 addq %rax,%rdi | |
956 movq (%rsi,%r9,1),%rax | |
957 adcq $0,%rdx | |
958 addq %r11,%rdi | |
959 adcq $0,%rdx | |
960 movq %r13,-24(%r14) | |
961 movq %rdx,%r13 | |
962 | |
963 movq %rdi,-16(%r14) | |
964 leaq (%rcx,%r9,1),%rcx | |
965 | |
966 xorq %rdi,%rdi | |
967 addq %r10,%r13 | |
968 adcq $0,%rdi | |
969 addq (%r14),%r13 | |
970 adcq $0,%rdi | |
971 movq %r13,-8(%r14) | |
972 | |
973 cmpq 16+8(%rsp),%r12 | |
974 jb .Louter4x | |
975 xorq %rax,%rax | |
976 subq %r13,%rbp | |
977 adcq %r15,%r15 | |
978 orq %r15,%rdi | |
979 subq %rdi,%rax | |
980 leaq (%r14,%r9,1),%rbx | |
981 movq (%rcx),%r12 | |
982 leaq (%rcx),%rbp | |
983 movq %r9,%rcx | |
984 sarq $3+2,%rcx | |
985 movq 56+8(%rsp),%rdi | |
986 decq %r12 | |
987 xorq %r10,%r10 | |
988 movq 8(%rbp),%r13 | |
989 movq 16(%rbp),%r14 | |
990 movq 24(%rbp),%r15 | |
991 jmp .Lsqr4x_sub_entry | |
992 .size mul4x_internal,.-mul4x_internal | |
993 .globl bn_power5 | |
994 .hidden bn_power5 | |
995 .type bn_power5,@function | |
996 .align 32 | |
997 bn_power5: | |
998 movq %rsp,%rax | |
999 pushq %rbx | |
1000 pushq %rbp | |
1001 pushq %r12 | |
1002 pushq %r13 | |
1003 pushq %r14 | |
1004 pushq %r15 | |
1005 | |
1006 shll $3,%r9d | |
1007 leal (%r9,%r9,2),%r10d | |
1008 negq %r9 | |
1009 movq (%r8),%r8 | |
1010 | |
1011 | |
1012 | |
1013 | |
1014 | |
1015 | |
1016 | |
1017 | |
1018 leaq -320(%rsp,%r9,2),%r11 | |
1019 subq %rdi,%r11 | |
1020 andq $4095,%r11 | |
1021 cmpq %r11,%r10 | |
1022 jb .Lpwr_sp_alt | |
1023 subq %r11,%rsp | |
1024 leaq -320(%rsp,%r9,2),%rsp | |
1025 jmp .Lpwr_sp_done | |
1026 | |
1027 .align 32 | |
1028 .Lpwr_sp_alt: | |
1029 leaq 4096-320(,%r9,2),%r10 | |
1030 leaq -320(%rsp,%r9,2),%rsp | |
1031 subq %r10,%r11 | |
1032 movq $0,%r10 | |
1033 cmovcq %r10,%r11 | |
1034 subq %r11,%rsp | |
1035 .Lpwr_sp_done: | |
1036 andq $-64,%rsp | |
1037 movq %r9,%r10 | |
1038 negq %r9 | |
1039 | |
1040 | |
1041 | |
1042 | |
1043 | |
1044 | |
1045 | |
1046 | |
1047 | |
1048 | |
1049 movq %r8,32(%rsp) | |
1050 movq %rax,40(%rsp) | |
1051 .Lpower5_body: | |
1052 .byte 102,72,15,110,207 | |
1053 .byte 102,72,15,110,209 | |
1054 .byte 102,73,15,110,218 | |
1055 .byte 102,72,15,110,226 | |
1056 | |
1057 call __bn_sqr8x_internal | |
1058 call __bn_post4x_internal | |
1059 call __bn_sqr8x_internal | |
1060 call __bn_post4x_internal | |
1061 call __bn_sqr8x_internal | |
1062 call __bn_post4x_internal | |
1063 call __bn_sqr8x_internal | |
1064 call __bn_post4x_internal | |
1065 call __bn_sqr8x_internal | |
1066 call __bn_post4x_internal | |
1067 | |
1068 .byte 102,72,15,126,209 | |
1069 .byte 102,72,15,126,226 | |
1070 movq %rsi,%rdi | |
1071 movq 40(%rsp),%rax | |
1072 leaq 32(%rsp),%r8 | |
1073 | |
1074 call mul4x_internal | |
1075 | |
1076 movq 40(%rsp),%rsi | |
1077 movq $1,%rax | |
1078 movq -48(%rsi),%r15 | |
1079 movq -40(%rsi),%r14 | |
1080 movq -32(%rsi),%r13 | |
1081 movq -24(%rsi),%r12 | |
1082 movq -16(%rsi),%rbp | |
1083 movq -8(%rsi),%rbx | |
1084 leaq (%rsi),%rsp | |
1085 .Lpower5_epilogue: | |
1086 .byte 0xf3,0xc3 | |
1087 .size bn_power5,.-bn_power5 | |
1088 | |
1089 .globl bn_sqr8x_internal | |
1090 .hidden bn_sqr8x_internal | |
1091 .hidden bn_sqr8x_internal | |
1092 .type bn_sqr8x_internal,@function | |
1093 .align 32 | |
1094 bn_sqr8x_internal: | |
1095 __bn_sqr8x_internal: | |
1096 | |
1097 | |
1098 | |
1099 | |
1100 | |
1101 | |
1102 | |
1103 | |
1104 | |
1105 | |
1106 | |
1107 | |
1108 | |
1109 | |
1110 | |
1111 | |
1112 | |
1113 | |
1114 | |
1115 | |
1116 | |
1117 | |
1118 | |
1119 | |
1120 | |
1121 | |
1122 | |
1123 | |
1124 | |
1125 | |
1126 | |
1127 | |
1128 | |
1129 | |
1130 | |
1131 | |
1132 | |
1133 | |
1134 | |
1135 | |
1136 | |
1137 | |
1138 | |
1139 | |
1140 | |
1141 | |
1142 | |
1143 | |
1144 | |
1145 | |
1146 | |
1147 | |
1148 | |
1149 | |
1150 | |
1151 | |
1152 | |
1153 | |
1154 | |
1155 | |
1156 | |
1157 | |
1158 | |
1159 | |
1160 | |
1161 | |
1162 | |
1163 | |
1164 | |
1165 | |
1166 | |
1167 | |
1168 | |
1169 leaq 32(%r10),%rbp | |
1170 leaq (%rsi,%r9,1),%rsi | |
1171 | |
1172 movq %r9,%rcx | |
1173 | |
1174 | |
1175 movq -32(%rsi,%rbp,1),%r14 | |
1176 leaq 48+8(%rsp,%r9,2),%rdi | |
1177 movq -24(%rsi,%rbp,1),%rax | |
1178 leaq -32(%rdi,%rbp,1),%rdi | |
1179 movq -16(%rsi,%rbp,1),%rbx | |
1180 movq %rax,%r15 | |
1181 | |
1182 mulq %r14 | |
1183 movq %rax,%r10 | |
1184 movq %rbx,%rax | |
1185 movq %rdx,%r11 | |
1186 movq %r10,-24(%rdi,%rbp,1) | |
1187 | |
1188 mulq %r14 | |
1189 addq %rax,%r11 | |
1190 movq %rbx,%rax | |
1191 adcq $0,%rdx | |
1192 movq %r11,-16(%rdi,%rbp,1) | |
1193 movq %rdx,%r10 | |
1194 | |
1195 | |
1196 movq -8(%rsi,%rbp,1),%rbx | |
1197 mulq %r15 | |
1198 movq %rax,%r12 | |
1199 movq %rbx,%rax | |
1200 movq %rdx,%r13 | |
1201 | |
1202 leaq (%rbp),%rcx | |
1203 mulq %r14 | |
1204 addq %rax,%r10 | |
1205 movq %rbx,%rax | |
1206 movq %rdx,%r11 | |
1207 adcq $0,%r11 | |
1208 addq %r12,%r10 | |
1209 adcq $0,%r11 | |
1210 movq %r10,-8(%rdi,%rcx,1) | |
1211 jmp .Lsqr4x_1st | |
1212 | |
1213 .align 32 | |
1214 .Lsqr4x_1st: | |
1215 movq (%rsi,%rcx,1),%rbx | |
1216 mulq %r15 | |
1217 addq %rax,%r13 | |
1218 movq %rbx,%rax | |
1219 movq %rdx,%r12 | |
1220 adcq $0,%r12 | |
1221 | |
1222 mulq %r14 | |
1223 addq %rax,%r11 | |
1224 movq %rbx,%rax | |
1225 movq 8(%rsi,%rcx,1),%rbx | |
1226 movq %rdx,%r10 | |
1227 adcq $0,%r10 | |
1228 addq %r13,%r11 | |
1229 adcq $0,%r10 | |
1230 | |
1231 | |
1232 mulq %r15 | |
1233 addq %rax,%r12 | |
1234 movq %rbx,%rax | |
1235 movq %r11,(%rdi,%rcx,1) | |
1236 movq %rdx,%r13 | |
1237 adcq $0,%r13 | |
1238 | |
1239 mulq %r14 | |
1240 addq %rax,%r10 | |
1241 movq %rbx,%rax | |
1242 movq 16(%rsi,%rcx,1),%rbx | |
1243 movq %rdx,%r11 | |
1244 adcq $0,%r11 | |
1245 addq %r12,%r10 | |
1246 adcq $0,%r11 | |
1247 | |
1248 mulq %r15 | |
1249 addq %rax,%r13 | |
1250 movq %rbx,%rax | |
1251 movq %r10,8(%rdi,%rcx,1) | |
1252 movq %rdx,%r12 | |
1253 adcq $0,%r12 | |
1254 | |
1255 mulq %r14 | |
1256 addq %rax,%r11 | |
1257 movq %rbx,%rax | |
1258 movq 24(%rsi,%rcx,1),%rbx | |
1259 movq %rdx,%r10 | |
1260 adcq $0,%r10 | |
1261 addq %r13,%r11 | |
1262 adcq $0,%r10 | |
1263 | |
1264 | |
1265 mulq %r15 | |
1266 addq %rax,%r12 | |
1267 movq %rbx,%rax | |
1268 movq %r11,16(%rdi,%rcx,1) | |
1269 movq %rdx,%r13 | |
1270 adcq $0,%r13 | |
1271 leaq 32(%rcx),%rcx | |
1272 | |
1273 mulq %r14 | |
1274 addq %rax,%r10 | |
1275 movq %rbx,%rax | |
1276 movq %rdx,%r11 | |
1277 adcq $0,%r11 | |
1278 addq %r12,%r10 | |
1279 adcq $0,%r11 | |
1280 movq %r10,-8(%rdi,%rcx,1) | |
1281 | |
1282 cmpq $0,%rcx | |
1283 jne .Lsqr4x_1st | |
1284 | |
1285 mulq %r15 | |
1286 addq %rax,%r13 | |
1287 leaq 16(%rbp),%rbp | |
1288 adcq $0,%rdx | |
1289 addq %r11,%r13 | |
1290 adcq $0,%rdx | |
1291 | |
1292 movq %r13,(%rdi) | |
1293 movq %rdx,%r12 | |
1294 movq %rdx,8(%rdi) | |
1295 jmp .Lsqr4x_outer | |
1296 | |
1297 .align 32 | |
1298 .Lsqr4x_outer: | |
1299 movq -32(%rsi,%rbp,1),%r14 | |
1300 leaq 48+8(%rsp,%r9,2),%rdi | |
1301 movq -24(%rsi,%rbp,1),%rax | |
1302 leaq -32(%rdi,%rbp,1),%rdi | |
1303 movq -16(%rsi,%rbp,1),%rbx | |
1304 movq %rax,%r15 | |
1305 | |
1306 mulq %r14 | |
1307 movq -24(%rdi,%rbp,1),%r10 | |
1308 addq %rax,%r10 | |
1309 movq %rbx,%rax | |
1310 adcq $0,%rdx | |
1311 movq %r10,-24(%rdi,%rbp,1) | |
1312 movq %rdx,%r11 | |
1313 | |
1314 mulq %r14 | |
1315 addq %rax,%r11 | |
1316 movq %rbx,%rax | |
1317 adcq $0,%rdx | |
1318 addq -16(%rdi,%rbp,1),%r11 | |
1319 movq %rdx,%r10 | |
1320 adcq $0,%r10 | |
1321 movq %r11,-16(%rdi,%rbp,1) | |
1322 | |
1323 xorq %r12,%r12 | |
1324 | |
1325 movq -8(%rsi,%rbp,1),%rbx | |
1326 mulq %r15 | |
1327 addq %rax,%r12 | |
1328 movq %rbx,%rax | |
1329 adcq $0,%rdx | |
1330 addq -8(%rdi,%rbp,1),%r12 | |
1331 movq %rdx,%r13 | |
1332 adcq $0,%r13 | |
1333 | |
1334 mulq %r14 | |
1335 addq %rax,%r10 | |
1336 movq %rbx,%rax | |
1337 adcq $0,%rdx | |
1338 addq %r12,%r10 | |
1339 movq %rdx,%r11 | |
1340 adcq $0,%r11 | |
1341 movq %r10,-8(%rdi,%rbp,1) | |
1342 | |
1343 leaq (%rbp),%rcx | |
1344 jmp .Lsqr4x_inner | |
1345 | |
1346 .align 32 | |
1347 .Lsqr4x_inner: | |
1348 movq (%rsi,%rcx,1),%rbx | |
1349 mulq %r15 | |
1350 addq %rax,%r13 | |
1351 movq %rbx,%rax | |
1352 movq %rdx,%r12 | |
1353 adcq $0,%r12 | |
1354 addq (%rdi,%rcx,1),%r13 | |
1355 adcq $0,%r12 | |
1356 | |
1357 .byte 0x67 | |
1358 mulq %r14 | |
1359 addq %rax,%r11 | |
1360 movq %rbx,%rax | |
1361 movq 8(%rsi,%rcx,1),%rbx | |
1362 movq %rdx,%r10 | |
1363 adcq $0,%r10 | |
1364 addq %r13,%r11 | |
1365 adcq $0,%r10 | |
1366 | |
1367 mulq %r15 | |
1368 addq %rax,%r12 | |
1369 movq %r11,(%rdi,%rcx,1) | |
1370 movq %rbx,%rax | |
1371 movq %rdx,%r13 | |
1372 adcq $0,%r13 | |
1373 addq 8(%rdi,%rcx,1),%r12 | |
1374 leaq 16(%rcx),%rcx | |
1375 adcq $0,%r13 | |
1376 | |
1377 mulq %r14 | |
1378 addq %rax,%r10 | |
1379 movq %rbx,%rax | |
1380 adcq $0,%rdx | |
1381 addq %r12,%r10 | |
1382 movq %rdx,%r11 | |
1383 adcq $0,%r11 | |
1384 movq %r10,-8(%rdi,%rcx,1) | |
1385 | |
1386 cmpq $0,%rcx | |
1387 jne .Lsqr4x_inner | |
1388 | |
1389 .byte 0x67 | |
1390 mulq %r15 | |
1391 addq %rax,%r13 | |
1392 adcq $0,%rdx | |
1393 addq %r11,%r13 | |
1394 adcq $0,%rdx | |
1395 | |
1396 movq %r13,(%rdi) | |
1397 movq %rdx,%r12 | |
1398 movq %rdx,8(%rdi) | |
1399 | |
1400 addq $16,%rbp | |
1401 jnz .Lsqr4x_outer | |
1402 | |
1403 | |
1404 movq -32(%rsi),%r14 | |
1405 leaq 48+8(%rsp,%r9,2),%rdi | |
1406 movq -24(%rsi),%rax | |
1407 leaq -32(%rdi,%rbp,1),%rdi | |
1408 movq -16(%rsi),%rbx | |
1409 movq %rax,%r15 | |
1410 | |
1411 mulq %r14 | |
1412 addq %rax,%r10 | |
1413 movq %rbx,%rax | |
1414 movq %rdx,%r11 | |
1415 adcq $0,%r11 | |
1416 | |
1417 mulq %r14 | |
1418 addq %rax,%r11 | |
1419 movq %rbx,%rax | |
1420 movq %r10,-24(%rdi) | |
1421 movq %rdx,%r10 | |
1422 adcq $0,%r10 | |
1423 addq %r13,%r11 | |
1424 movq -8(%rsi),%rbx | |
1425 adcq $0,%r10 | |
1426 | |
1427 mulq %r15 | |
1428 addq %rax,%r12 | |
1429 movq %rbx,%rax | |
1430 movq %r11,-16(%rdi) | |
1431 movq %rdx,%r13 | |
1432 adcq $0,%r13 | |
1433 | |
1434 mulq %r14 | |
1435 addq %rax,%r10 | |
1436 movq %rbx,%rax | |
1437 movq %rdx,%r11 | |
1438 adcq $0,%r11 | |
1439 addq %r12,%r10 | |
1440 adcq $0,%r11 | |
1441 movq %r10,-8(%rdi) | |
1442 | |
1443 mulq %r15 | |
1444 addq %rax,%r13 | |
1445 movq -16(%rsi),%rax | |
1446 adcq $0,%rdx | |
1447 addq %r11,%r13 | |
1448 adcq $0,%rdx | |
1449 | |
1450 movq %r13,(%rdi) | |
1451 movq %rdx,%r12 | |
1452 movq %rdx,8(%rdi) | |
1453 | |
1454 mulq %rbx | |
1455 addq $16,%rbp | |
1456 xorq %r14,%r14 | |
1457 subq %r9,%rbp | |
1458 xorq %r15,%r15 | |
1459 | |
1460 addq %r12,%rax | |
1461 adcq $0,%rdx | |
1462 movq %rax,8(%rdi) | |
1463 movq %rdx,16(%rdi) | |
1464 movq %r15,24(%rdi) | |
1465 | |
1466 movq -16(%rsi,%rbp,1),%rax | |
1467 leaq 48+8(%rsp),%rdi | |
1468 xorq %r10,%r10 | |
1469 movq 8(%rdi),%r11 | |
1470 | |
1471 leaq (%r14,%r10,2),%r12 | |
1472 shrq $63,%r10 | |
1473 leaq (%rcx,%r11,2),%r13 | |
1474 shrq $63,%r11 | |
1475 orq %r10,%r13 | |
1476 movq 16(%rdi),%r10 | |
1477 movq %r11,%r14 | |
1478 mulq %rax | |
1479 negq %r15 | |
1480 movq 24(%rdi),%r11 | |
1481 adcq %rax,%r12 | |
1482 movq -8(%rsi,%rbp,1),%rax | |
1483 movq %r12,(%rdi) | |
1484 adcq %rdx,%r13 | |
1485 | |
1486 leaq (%r14,%r10,2),%rbx | |
1487 movq %r13,8(%rdi) | |
1488 sbbq %r15,%r15 | |
1489 shrq $63,%r10 | |
1490 leaq (%rcx,%r11,2),%r8 | |
1491 shrq $63,%r11 | |
1492 orq %r10,%r8 | |
1493 movq 32(%rdi),%r10 | |
1494 movq %r11,%r14 | |
1495 mulq %rax | |
1496 negq %r15 | |
1497 movq 40(%rdi),%r11 | |
1498 adcq %rax,%rbx | |
1499 movq 0(%rsi,%rbp,1),%rax | |
1500 movq %rbx,16(%rdi) | |
1501 adcq %rdx,%r8 | |
1502 leaq 16(%rbp),%rbp | |
1503 movq %r8,24(%rdi) | |
1504 sbbq %r15,%r15 | |
1505 leaq 64(%rdi),%rdi | |
1506 jmp .Lsqr4x_shift_n_add | |
1507 | |
1508 .align 32 | |
1509 .Lsqr4x_shift_n_add: | |
1510 leaq (%r14,%r10,2),%r12 | |
1511 shrq $63,%r10 | |
1512 leaq (%rcx,%r11,2),%r13 | |
1513 shrq $63,%r11 | |
1514 orq %r10,%r13 | |
1515 movq -16(%rdi),%r10 | |
1516 movq %r11,%r14 | |
1517 mulq %rax | |
1518 negq %r15 | |
1519 movq -8(%rdi),%r11 | |
1520 adcq %rax,%r12 | |
1521 movq -8(%rsi,%rbp,1),%rax | |
1522 movq %r12,-32(%rdi) | |
1523 adcq %rdx,%r13 | |
1524 | |
1525 leaq (%r14,%r10,2),%rbx | |
1526 movq %r13,-24(%rdi) | |
1527 sbbq %r15,%r15 | |
1528 shrq $63,%r10 | |
1529 leaq (%rcx,%r11,2),%r8 | |
1530 shrq $63,%r11 | |
1531 orq %r10,%r8 | |
1532 movq 0(%rdi),%r10 | |
1533 movq %r11,%r14 | |
1534 mulq %rax | |
1535 negq %r15 | |
1536 movq 8(%rdi),%r11 | |
1537 adcq %rax,%rbx | |
1538 movq 0(%rsi,%rbp,1),%rax | |
1539 movq %rbx,-16(%rdi) | |
1540 adcq %rdx,%r8 | |
1541 | |
1542 leaq (%r14,%r10,2),%r12 | |
1543 movq %r8,-8(%rdi) | |
1544 sbbq %r15,%r15 | |
1545 shrq $63,%r10 | |
1546 leaq (%rcx,%r11,2),%r13 | |
1547 shrq $63,%r11 | |
1548 orq %r10,%r13 | |
1549 movq 16(%rdi),%r10 | |
1550 movq %r11,%r14 | |
1551 mulq %rax | |
1552 negq %r15 | |
1553 movq 24(%rdi),%r11 | |
1554 adcq %rax,%r12 | |
1555 movq 8(%rsi,%rbp,1),%rax | |
1556 movq %r12,0(%rdi) | |
1557 adcq %rdx,%r13 | |
1558 | |
1559 leaq (%r14,%r10,2),%rbx | |
1560 movq %r13,8(%rdi) | |
1561 sbbq %r15,%r15 | |
1562 shrq $63,%r10 | |
1563 leaq (%rcx,%r11,2),%r8 | |
1564 shrq $63,%r11 | |
1565 orq %r10,%r8 | |
1566 movq 32(%rdi),%r10 | |
1567 movq %r11,%r14 | |
1568 mulq %rax | |
1569 negq %r15 | |
1570 movq 40(%rdi),%r11 | |
1571 adcq %rax,%rbx | |
1572 movq 16(%rsi,%rbp,1),%rax | |
1573 movq %rbx,16(%rdi) | |
1574 adcq %rdx,%r8 | |
1575 movq %r8,24(%rdi) | |
1576 sbbq %r15,%r15 | |
1577 leaq 64(%rdi),%rdi | |
1578 addq $32,%rbp | |
1579 jnz .Lsqr4x_shift_n_add | |
1580 | |
1581 leaq (%r14,%r10,2),%r12 | |
1582 .byte 0x67 | |
1583 shrq $63,%r10 | |
1584 leaq (%rcx,%r11,2),%r13 | |
1585 shrq $63,%r11 | |
1586 orq %r10,%r13 | |
1587 movq -16(%rdi),%r10 | |
1588 movq %r11,%r14 | |
1589 mulq %rax | |
1590 negq %r15 | |
1591 movq -8(%rdi),%r11 | |
1592 adcq %rax,%r12 | |
1593 movq -8(%rsi),%rax | |
1594 movq %r12,-32(%rdi) | |
1595 adcq %rdx,%r13 | |
1596 | |
1597 leaq (%r14,%r10,2),%rbx | |
1598 movq %r13,-24(%rdi) | |
1599 sbbq %r15,%r15 | |
1600 shrq $63,%r10 | |
1601 leaq (%rcx,%r11,2),%r8 | |
1602 shrq $63,%r11 | |
1603 orq %r10,%r8 | |
1604 mulq %rax | |
1605 negq %r15 | |
1606 adcq %rax,%rbx | |
1607 adcq %rdx,%r8 | |
1608 movq %rbx,-16(%rdi) | |
1609 movq %r8,-8(%rdi) | |
1610 .byte 102,72,15,126,213 | |
# __bn_sqr8x_reduction — Montgomery reduction of a 2*num-word value, 8 words
# per outer iteration (internal entry point, reached by fall-through from
# bn_sqr8x_internal or by `call` from bn_from_mont8x).
#
# On entry (established by the callers visible in this file):
#   %rbp        = modulus pointer n[]
#   %r9         = num*8 (byte count, positive; negated below)
#   %rdi        = points into the temp vector tp[] on the stack
#   32+8(%rsp)  = n0 (Montgomery -n^-1 mod 2^64; stored by bn_from_mont8x)
#   %xmm2       = saved modulus pointer, %xmm3 = saved -num*8 (stashed by
#                 bn_from_mont8x before the call, since %rbp/%r9 are reused
#                 as scratch inside the loops)
# On exit: %rax = top-most carry of the reduced result.
__bn_sqr8x_reduction:
xorq %rax,%rax                  # %rax accumulates the final top carry
leaq (%r9,%rbp,1),%rcx          # %rcx = end of modulus = n + num*8
leaq 48+8(%rsp,%r9,2),%rdx      # %rdx = end of temp vector tp[2*num]
movq %rcx,0+8(%rsp)             # save end-of-modulus for loop bounds
leaq 48+8(%rsp,%r9,1),%rdi
movq %rdx,8+8(%rsp)             # save end-of-tp for the outer loop test
negq %r9                        # %r9 = -num*8 from here on
jmp .L8x_reduction_loop

.align 32
.L8x_reduction_loop:
leaq (%rdi,%r9,1),%rdi          # step back to the low end of current window
.byte 0x66                      # padding prefix (code alignment)
movq 0(%rdi),%rbx               # load 8 words of the running value
movq 8(%rdi),%r9                # NOTE: %r9 reused as data here; restored
movq 16(%rdi),%r10              # from %xmm3 at the bottom of the loop
movq 24(%rdi),%r11
movq 32(%rdi),%r12
movq 40(%rdi),%r13
movq 48(%rdi),%r14
movq 56(%rdi),%r15
movq %rax,(%rdx)                # store previous iteration's top carry
leaq 64(%rdi),%rdi

.byte 0x67                      # padding prefix (code alignment)
movq %rbx,%r8
imulq 32+8(%rsp),%rbx           # %rbx = tp[0]*n0 — first reduction multiplier
movq 0(%rbp),%rax
movl $8,%ecx                    # 8 inner iterations
jmp .L8x_reduce

# Inner loop: fold one multiplier into 8 modulus words per pass; each
# multiplier is saved on the stack (48-8+8(%rsp,%rcx,8)) so .L8x_tail can
# replay it against the upper modulus words.
.align 32
.L8x_reduce:
mulq %rbx
movq 8(%rbp),%rax
negq %r8                        # low word cancels by construction; take carry
movq %rdx,%r8
adcq $0,%r8

mulq %rbx
addq %rax,%r9
movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rbx,48-8+8(%rsp,%rcx,8)   # save multiplier for the tail pass
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r10
movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq 32+8(%rsp),%rsi            # %rsi = n0
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r11
movq 32(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi                  # next multiplier = n0 * new low word
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r12
movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r13
movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r14
movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
movq %rsi,%rbx                  # rotate in the next multiplier
addq %rax,%r15
movq 0(%rbp),%rax
adcq $0,%rdx
addq %r15,%r14
movq %rdx,%r15
adcq $0,%r15

decl %ecx
jnz .L8x_reduce

leaq 64(%rbp),%rbp              # advance to next 8 modulus words
xorq %rax,%rax
movq 8+8(%rsp),%rdx             # reload end-of-tp
cmpq 0+8(%rsp),%rbp             # reached end of modulus?
jae .L8x_no_tail

.byte 0x66                      # padding prefix (code alignment)
addq 0(%rdi),%r8                # fold in next 8 words of the value
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
sbbq %rsi,%rsi                  # %rsi = -carry (borrow mask)

movq 48+56+8(%rsp),%rbx         # first saved multiplier
movl $8,%ecx
movq 0(%rbp),%rax
jmp .L8x_tail

# Tail: replay the 8 saved multipliers against the remaining (upper)
# modulus words, storing results back through %rdi.
.align 32
.L8x_tail:
mulq %rbx
addq %rax,%r8
movq 8(%rbp),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8

mulq %rbx
addq %rax,%r9
movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
leaq 8(%rdi),%rdi
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r10
movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r11
movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r12
movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r13
movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r14
movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
movq 48-16+8(%rsp,%rcx,8),%rbx  # next saved multiplier
addq %rax,%r15
adcq $0,%rdx
addq %r15,%r14
movq 0(%rbp),%rax
movq %rdx,%r15
adcq $0,%r15

decl %ecx
jnz .L8x_tail

leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp             # more modulus words left?
jae .L8x_tail_done

movq 48+56+8(%rsp),%rbx         # restart with the first saved multiplier
negq %rsi                       # re-materialize saved carry into CF
movq 0(%rbp),%rax
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
sbbq %rsi,%rsi                  # save carry again

movl $8,%ecx
jmp .L8x_tail

.align 32
.L8x_tail_done:
addq (%rdx),%r8                 # fold in top carry stored earlier
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15


xorq %rax,%rax

negq %rsi                       # restore pending carry into CF
.L8x_no_tail:
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax                    # capture top-most carry in %rax
movq -8(%rbp),%rcx              # top word of modulus (used by caller path)
xorq %rsi,%rsi

.byte 102,72,15,126,213         # movq %xmm2,%rbp — restore modulus pointer

movq %r8,0(%rdi)                # store reduced 8-word window
movq %r9,8(%rdi)
.byte 102,73,15,126,217         # movq %xmm3,%r9 — restore -num*8
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
leaq 64(%rdi),%rdi

cmpq %rdx,%rdi                  # processed the whole 2*num-word value?
jb .L8x_reduction_loop
.byte 0xf3,0xc3                 # rep ret (branch-predictor-friendly return)
.size bn_sqr8x_internal,.-bn_sqr8x_internal
# __bn_post4x_internal — final constant-time conditional subtraction after
# Montgomery reduction: writes (tp - n) if the top carry/compare mask in
# %rax is all-ones, else tp, 4 words per iteration, into rp.
#
# On entry: %rbp = modulus n[], %rax = 0 or top carry (turned into a 0/-1
# mask by the negq below), %r9 = -num*8, %rdi points at tp,
# %xmm1 = saved rp pointer (stashed by bn_from_mont8x).
# Clobbers %rbx,%rcx,%r10,%r12-%r15; on exit %r9 = +num*8, %r10 = -num*8.
.type __bn_post4x_internal,@function
.align 32
__bn_post4x_internal:
movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx          # %rbx = low end of tp (source)
movq %r9,%rcx
.byte 102,72,15,126,207         # movq %xmm1,%rdi — destination rp
negq %rax                       # %rax: 0/1 carry -> 0/-1 subtract mask
.byte 102,72,15,126,206         # movq %xmm1,%rsi
sarq $3+2,%rcx                  # %rcx = -num/4 (4 words per iteration)
decq %r12                       # with notq below: first word becomes -n[0]
xorq %r10,%r10                  # clear borrow
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
jmp .Lsqr4x_sub_entry

.align 16
.Lsqr4x_sub:
movq 0(%rbp),%r12               # next 4 modulus words
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
.Lsqr4x_sub_entry:
leaq 32(%rbp),%rbp
notq %r12                       # ~n & mask: adds -n (two's complement)
notq %r13                       # when mask is all-ones, adds 0 otherwise —
notq %r14                       # branch-free, constant-time select
notq %r15
andq %rax,%r12
andq %rax,%r13
andq %rax,%r14
andq %rax,%r15

negq %r10                       # restore borrow into CF
adcq 0(%rbx),%r12
adcq 8(%rbx),%r13
adcq 16(%rbx),%r14
adcq 24(%rbx),%r15
movq %r12,0(%rdi)
leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
sbbq %r10,%r10                  # save borrow as 0/-1
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi

incq %rcx
jnz .Lsqr4x_sub

movq %r9,%r10                   # %r10 = -num*8
negq %r9                        # %r9 = +num*8 for the caller
.byte 0xf3,0xc3                 # rep ret
.size __bn_post4x_internal,.-__bn_post4x_internal
# bn_from_montgomery — public dispatcher: if the word count (%r9d) is a
# multiple of 8, tail-jump to the optimized bn_from_mont8x path; otherwise
# return 0 ("not handled") so the caller falls back to generic code.
.globl bn_from_montgomery
.hidden bn_from_montgomery
.type bn_from_montgomery,@function
.align 32
bn_from_montgomery:
testl $7,%r9d                   # num % 8 == 0 ?
jz bn_from_mont8x               # tail call; bn_from_mont8x returns 1
xorl %eax,%eax                  # unsupported size: return 0
.byte 0xf3,0xc3                 # rep ret
.size bn_from_montgomery,.-bn_from_montgomery
1936 | |
# bn_from_mont8x — convert a value out of Montgomery form (effectively a
# Montgomery multiplication by 1) for num ≡ 0 (mod 8).
# SysV args: %rdi=rp, %rsi=ap, %rdx=bp(unused here), %rcx=np, %r8=&n0,
# %r9d=num (words). Returns 1 in %rax.
.type bn_from_mont8x,@function
.align 32
bn_from_mont8x:
.byte 0x67                      # padding prefix (code alignment)
movq %rsp,%rax                  # keep original %rsp for the epilogue
pushq %rbx                      # save all callee-saved GPRs
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15

shll $3,%r9d                    # num words -> num*8 bytes
leaq (%r9,%r9,2),%r10           # %r10 = num*24, frame-placement heuristic
negq %r9
movq (%r8),%r8                  # %r8 = n0 value








# Pick a stack frame for tp[] whose 4 KiB-page offset is steered away from
# rp (%rdi) — presumably a cache/page-aliasing defense; confirm against the
# perlasm source comments.
leaq -320(%rsp,%r9,2),%r11
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lfrom_sp_alt
subq %r11,%rsp
leaq -320(%rsp,%r9,2),%rsp
jmp .Lfrom_sp_done

.align 32
.Lfrom_sp_alt:
leaq 4096-320(,%r9,2),%r10
leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rsp
.Lfrom_sp_done:
andq $-64,%rsp                  # 64-byte align the frame
movq %r9,%r10                   # %r10 = -num*8
negq %r9                        # %r9 = +num*8










movq %r8,32(%rsp)               # stash n0 (read by __bn_sqr8x_reduction)
movq %rax,40(%rsp)              # stash original %rsp for the epilogue
.Lfrom_body:
movq %r9,%r11
leaq 48(%rsp),%rax              # %rax = tp[]
pxor %xmm0,%xmm0
jmp .Lmul_by_1

# Copy a[] into the low half of tp[] and zero the high half, 64 bytes per
# iteration, so the reduction sees the 2*num-word value a || 0.
.align 32
.Lmul_by_1:
movdqu (%rsi),%xmm1
movdqu 16(%rsi),%xmm2
movdqu 32(%rsi),%xmm3
movdqa %xmm0,(%rax,%r9,1)       # zero upper half
movdqu 48(%rsi),%xmm4
movdqa %xmm0,16(%rax,%r9,1)
.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00        # leaq 64(%rsi),%rsi
movdqa %xmm1,(%rax)             # copy lower half
movdqa %xmm0,32(%rax,%r9,1)
movdqa %xmm2,16(%rax)
movdqa %xmm0,48(%rax,%r9,1)
movdqa %xmm3,32(%rax)
movdqa %xmm4,48(%rax)
leaq 64(%rax),%rax
subq $64,%r11
jnz .Lmul_by_1

.byte 102,72,15,110,207         # movq %rdi,%xmm1 — stash rp across calls
.byte 102,72,15,110,209         # movq %rcx,%xmm2 — stash np
.byte 0x67                      # padding prefix (code alignment)
movq %rcx,%rbp                  # %rbp = np for the reduction
.byte 102,73,15,110,218         # movq %r10,%xmm3 — stash -num*8
call __bn_sqr8x_reduction
call __bn_post4x_internal

pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
movq 40(%rsp),%rsi              # original %rsp
jmp .Lfrom_mont_zero

# Wipe the stack temporaries (they held secret intermediates).
.align 32
.Lfrom_mont_zero:
movdqa %xmm0,0(%rax)
movdqa %xmm0,16(%rax)
movdqa %xmm0,32(%rax)
movdqa %xmm0,48(%rax)
leaq 64(%rax),%rax
subq $32,%r9
jnz .Lfrom_mont_zero

movq $1,%rax                    # return 1 (handled)
movq -48(%rsi),%r15             # restore callee-saved regs
movq -40(%rsi),%r14
movq -32(%rsi),%r13
movq -24(%rsi),%r12
movq -16(%rsi),%rbp
movq -8(%rsi),%rbx
leaq (%rsi),%rsp                # restore caller's stack pointer
.Lfrom_epilogue:
.byte 0xf3,0xc3                 # rep ret
.size bn_from_mont8x,.-bn_from_mont8x
# bn_scatter5 — store num (%esi) 64-bit words from inp (%rdi) into the
# pre-computation table tbl (%rdx) at column idx (%rcx), one word every
# 256 bytes. The 256-byte stride interleaves the 32 table entries so that
# bn_gather5 can read them back without secret-dependent addressing.
.globl bn_scatter5
.hidden bn_scatter5
.type bn_scatter5,@function
.align 16
bn_scatter5:
cmpl $0,%esi                    # nothing to do for num == 0
jz .Lscatter_epilogue
leaq (%rdx,%rcx,8),%rdx         # &tbl[idx]
.Lscatter:
movq (%rdi),%rax
leaq 8(%rdi),%rdi
movq %rax,(%rdx)
leaq 256(%rdx),%rdx             # next word of this entry, 32 entries apart
subl $1,%esi
jnz .Lscatter
.Lscatter_epilogue:
.byte 0xf3,0xc3                 # rep ret
.size bn_scatter5,.-bn_scatter5
2071 | |
# bn_gather5 — constant-time gather: read num (%esi) 64-bit words of entry
# idx (%ecx) from the 256-byte-strided table tbl (%rdx) into out (%rdi).
# Instead of indexing with idx, it builds sixteen 128-bit masks (one per
# pair of the 32 entries) by comparing idx against 0..15 with PCMPEQD,
# then AND/ORs every entry through the masks, so the memory access pattern
# is independent of the secret index (cache-timing defense).
.globl bn_gather5
.hidden bn_gather5
.type bn_gather5,@function
.align 32
bn_gather5:
.LSEH_begin_bn_gather5:

.byte 0x4c,0x8d,0x14,0x24               # leaq (%rsp),%r10 — save stack ptr
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00        # subq $0x108,%rsp
leaq .Linc(%rip),%rax
andq $-16,%rsp                  # 16-byte align for movdqa stores

movd %ecx,%xmm5                 # %xmm5 = idx
movdqa 0(%rax),%xmm0            # {0,0,1,1}
movdqa 16(%rax),%xmm1           # {2,2,2,2} — per-step increment
leaq 128(%rdx),%r11             # bias table/mask pointers so all offsets
leaq 128(%rsp),%rax             # fit in one signed byte

pshufd $0,%xmm5,%xmm5           # broadcast idx to all 4 dword lanes
movdqa %xmm1,%xmm4
movdqa %xmm1,%xmm2
# Unrolled 16x: compute mask[i] = (idx == {2i,2i,2i+1,2i+1}) and store at
# -128+16*i(%rax); three running counters rotate through xmm0-xmm3.
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,-128(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,-112(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,-96(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,-80(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,-64(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,-48(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,-32(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,-16(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,0(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,16(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,32(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,48(%rax)
movdqa %xmm4,%xmm3

paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,64(%rax)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,80(%rax)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,96(%rax)
movdqa %xmm4,%xmm2
movdqa %xmm3,112(%rax)
jmp .Lgather

# Per output word: touch all 32 table entries (256 bytes), AND each with
# its mask, OR everything together; exactly one mask is non-zero.
.align 32
.Lgather:
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
movdqa -128(%r11),%xmm0
movdqa -112(%r11),%xmm1
movdqa -96(%r11),%xmm2
pand -128(%rax),%xmm0
movdqa -80(%r11),%xmm3
pand -112(%rax),%xmm1
por %xmm0,%xmm4
pand -96(%rax),%xmm2
por %xmm1,%xmm5
pand -80(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%r11),%xmm0
movdqa -48(%r11),%xmm1
movdqa -32(%r11),%xmm2
pand -64(%rax),%xmm0
movdqa -16(%r11),%xmm3
pand -48(%rax),%xmm1
por %xmm0,%xmm4
pand -32(%rax),%xmm2
por %xmm1,%xmm5
pand -16(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%r11),%xmm0
movdqa 16(%r11),%xmm1
movdqa 32(%r11),%xmm2
pand 0(%rax),%xmm0
movdqa 48(%r11),%xmm3
pand 16(%rax),%xmm1
por %xmm0,%xmm4
pand 32(%rax),%xmm2
por %xmm1,%xmm5
pand 48(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%r11),%xmm0
movdqa 80(%r11),%xmm1
movdqa 96(%r11),%xmm2
pand 64(%rax),%xmm0
movdqa 112(%r11),%xmm3
pand 80(%rax),%xmm1
por %xmm0,%xmm4
pand 96(%rax),%xmm2
por %xmm1,%xmm5
pand 112(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4
leaq 256(%r11),%r11             # next word of every entry
pshufd $0x4e,%xmm4,%xmm0        # fold high qword onto low qword
por %xmm4,%xmm0
movq %xmm0,(%rdi)               # emit the selected 64-bit word
leaq 8(%rdi),%rdi
subl $1,%esi
jnz .Lgather

leaq (%r10),%rsp                # restore caller's stack pointer
.byte 0xf3,0xc3                 # rep ret
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
# .Linc — PADDD increment constants for the mask generators above (and in
# bn_mul_mont_gather5): {0,0,1,1} seeds the first lane pair, {2,2,2,2}
# advances both lanes by 2 each step.
.align 64
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
2240 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105
,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97
,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71
,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,1
11,114,103,62,0 | |
2241 #endif | |
OLD | NEW |