OLD | NEW |
| (Empty) |
1 #if defined(__x86_64__) | |
2 .text | |
3 | |
4 | |
5 | |
6 .globl _bn_mul_mont_gather5 | |
7 .private_extern _bn_mul_mont_gather5 | |
8 | |
9 .p2align 6 | |
10 _bn_mul_mont_gather5: | |
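// bn_mul_mont_gather5(rp=%rdi, ap=%rsi, table=%rdx, np=%rcx, n0=%r8,
// num=%r9d, power=8(%rsp)): Montgomery multiplication where the b operand
// is fetched from a scatter/gather table in constant time.  Sizes that are
// a multiple of 8 limbs fall through to the 4x-unrolled path.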
11 testl $7,%r9d | |
12 jnz L$mul_enter | |
13 jmp L$mul4x_enter | |
14 | |
15 .p2align 4 | |
16 L$mul_enter: | |
17 movl %r9d,%r9d | |
18 movq %rsp,%rax | |
19 movd 8(%rsp),%xmm5 | |
20 leaq L$inc(%rip),%r10 | |
21 pushq %rbx | |
22 pushq %rbp | |
23 pushq %r12 | |
24 pushq %r13 | |
25 pushq %r14 | |
26 pushq %r15 | |
27 | |
28 leaq 2(%r9),%r11 | |
29 negq %r11 | |
30 leaq -264(%rsp,%r11,8),%rsp | |
31 andq $-1024,%rsp | |
32 | |
33 movq %rax,8(%rsp,%r9,8) | |
34 L$mul_body: | |
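// Constant-time gather: build 16 SSE2 comparison masks on the stack from
// L$inc ({0,0,1,1} plus the {2,2,2,2} increment), covering table indices
// 0..31 two 64-bit slots per 16-byte mask, then AND/OR them against the
// first 256-byte block of the table (word 0 of all 32 powers) so that only
// the word belonging to 'power' survives.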
35 leaq 128(%rdx),%r12 | |
36 movdqa 0(%r10),%xmm0 | |
37 movdqa 16(%r10),%xmm1 | |
38 leaq 24-112(%rsp,%r9,8),%r10 | |
39 andq $-16,%r10 | |
40 | |
41 pshufd $0,%xmm5,%xmm5 | |
42 movdqa %xmm1,%xmm4 | |
43 movdqa %xmm1,%xmm2 | |
44 paddd %xmm0,%xmm1 | |
45 pcmpeqd %xmm5,%xmm0 | |
46 .byte 0x67 | |
47 movdqa %xmm4,%xmm3 | |
48 paddd %xmm1,%xmm2 | |
49 pcmpeqd %xmm5,%xmm1 | |
50 movdqa %xmm0,112(%r10) | |
51 movdqa %xmm4,%xmm0 | |
52 | |
53 paddd %xmm2,%xmm3 | |
54 pcmpeqd %xmm5,%xmm2 | |
55 movdqa %xmm1,128(%r10) | |
56 movdqa %xmm4,%xmm1 | |
57 | |
58 paddd %xmm3,%xmm0 | |
59 pcmpeqd %xmm5,%xmm3 | |
60 movdqa %xmm2,144(%r10) | |
61 movdqa %xmm4,%xmm2 | |
62 | |
63 paddd %xmm0,%xmm1 | |
64 pcmpeqd %xmm5,%xmm0 | |
65 movdqa %xmm3,160(%r10) | |
66 movdqa %xmm4,%xmm3 | |
67 paddd %xmm1,%xmm2 | |
68 pcmpeqd %xmm5,%xmm1 | |
69 movdqa %xmm0,176(%r10) | |
70 movdqa %xmm4,%xmm0 | |
71 | |
72 paddd %xmm2,%xmm3 | |
73 pcmpeqd %xmm5,%xmm2 | |
74 movdqa %xmm1,192(%r10) | |
75 movdqa %xmm4,%xmm1 | |
76 | |
77 paddd %xmm3,%xmm0 | |
78 pcmpeqd %xmm5,%xmm3 | |
79 movdqa %xmm2,208(%r10) | |
80 movdqa %xmm4,%xmm2 | |
81 | |
82 paddd %xmm0,%xmm1 | |
83 pcmpeqd %xmm5,%xmm0 | |
84 movdqa %xmm3,224(%r10) | |
85 movdqa %xmm4,%xmm3 | |
86 paddd %xmm1,%xmm2 | |
87 pcmpeqd %xmm5,%xmm1 | |
88 movdqa %xmm0,240(%r10) | |
89 movdqa %xmm4,%xmm0 | |
90 | |
91 paddd %xmm2,%xmm3 | |
92 pcmpeqd %xmm5,%xmm2 | |
93 movdqa %xmm1,256(%r10) | |
94 movdqa %xmm4,%xmm1 | |
95 | |
96 paddd %xmm3,%xmm0 | |
97 pcmpeqd %xmm5,%xmm3 | |
98 movdqa %xmm2,272(%r10) | |
99 movdqa %xmm4,%xmm2 | |
100 | |
101 paddd %xmm0,%xmm1 | |
102 pcmpeqd %xmm5,%xmm0 | |
103 movdqa %xmm3,288(%r10) | |
104 movdqa %xmm4,%xmm3 | |
105 paddd %xmm1,%xmm2 | |
106 pcmpeqd %xmm5,%xmm1 | |
107 movdqa %xmm0,304(%r10) | |
108 | |
109 paddd %xmm2,%xmm3 | |
110 .byte 0x67 | |
111 pcmpeqd %xmm5,%xmm2 | |
112 movdqa %xmm1,320(%r10) | |
113 | |
114 pcmpeqd %xmm5,%xmm3 | |
115 movdqa %xmm2,336(%r10) | |
116 pand 64(%r12),%xmm0 | |
117 | |
118 pand 80(%r12),%xmm1 | |
119 pand 96(%r12),%xmm2 | |
120 movdqa %xmm3,352(%r10) | |
121 pand 112(%r12),%xmm3 | |
122 por %xmm2,%xmm0 | |
123 por %xmm3,%xmm1 | |
124 movdqa -128(%r12),%xmm4 | |
125 movdqa -112(%r12),%xmm5 | |
126 movdqa -96(%r12),%xmm2 | |
127 pand 112(%r10),%xmm4 | |
128 movdqa -80(%r12),%xmm3 | |
129 pand 128(%r10),%xmm5 | |
130 por %xmm4,%xmm0 | |
131 pand 144(%r10),%xmm2 | |
132 por %xmm5,%xmm1 | |
133 pand 160(%r10),%xmm3 | |
134 por %xmm2,%xmm0 | |
135 por %xmm3,%xmm1 | |
136 movdqa -64(%r12),%xmm4 | |
137 movdqa -48(%r12),%xmm5 | |
138 movdqa -32(%r12),%xmm2 | |
139 pand 176(%r10),%xmm4 | |
140 movdqa -16(%r12),%xmm3 | |
141 pand 192(%r10),%xmm5 | |
142 por %xmm4,%xmm0 | |
143 pand 208(%r10),%xmm2 | |
144 por %xmm5,%xmm1 | |
145 pand 224(%r10),%xmm3 | |
146 por %xmm2,%xmm0 | |
147 por %xmm3,%xmm1 | |
148 movdqa 0(%r12),%xmm4 | |
149 movdqa 16(%r12),%xmm5 | |
150 movdqa 32(%r12),%xmm2 | |
151 pand 240(%r10),%xmm4 | |
152 movdqa 48(%r12),%xmm3 | |
153 pand 256(%r10),%xmm5 | |
154 por %xmm4,%xmm0 | |
155 pand 272(%r10),%xmm2 | |
156 por %xmm5,%xmm1 | |
157 pand 288(%r10),%xmm3 | |
158 por %xmm2,%xmm0 | |
159 por %xmm3,%xmm1 | |
160 por %xmm1,%xmm0 | |
161 pshufd $0x4e,%xmm0,%xmm1 | |
162 por %xmm1,%xmm0 | |
163 leaq 256(%r12),%r12 | |
164 .byte 102,72,15,126,195 | |
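// The .byte sequence above is a hand-encoded movq %xmm0,%rbx: %rbx now
// holds word 0 of the selected power.  First pass: tp[] accumulates
// a[]*b0 interleaved with m*n[], where m = a[0]*b0*n0 mod 2^64 (%rbp).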
165 | |
166 movq (%r8),%r8 | |
167 movq (%rsi),%rax | |
168 | |
169 xorq %r14,%r14 | |
170 xorq %r15,%r15 | |
171 | |
172 movq %r8,%rbp | |
173 mulq %rbx | |
174 movq %rax,%r10 | |
175 movq (%rcx),%rax | |
176 | |
177 imulq %r10,%rbp | |
178 movq %rdx,%r11 | |
179 | |
180 mulq %rbp | |
181 addq %rax,%r10 | |
182 movq 8(%rsi),%rax | |
183 adcq $0,%rdx | |
184 movq %rdx,%r13 | |
185 | |
186 leaq 1(%r15),%r15 | |
187 jmp L$1st_enter | |
188 | |
189 .p2align 4 | |
190 L$1st: | |
191 addq %rax,%r13 | |
192 movq (%rsi,%r15,8),%rax | |
193 adcq $0,%rdx | |
194 addq %r11,%r13 | |
195 movq %r10,%r11 | |
196 adcq $0,%rdx | |
197 movq %r13,-16(%rsp,%r15,8) | |
198 movq %rdx,%r13 | |
199 | |
200 L$1st_enter: | |
201 mulq %rbx | |
202 addq %rax,%r11 | |
203 movq (%rcx,%r15,8),%rax | |
204 adcq $0,%rdx | |
205 leaq 1(%r15),%r15 | |
206 movq %rdx,%r10 | |
207 | |
208 mulq %rbp | |
209 cmpq %r9,%r15 | |
210 jne L$1st | |
211 | |
212 | |
213 addq %rax,%r13 | |
214 adcq $0,%rdx | |
215 addq %r11,%r13 | |
216 adcq $0,%rdx | |
217 movq %r13,-16(%rsp,%r9,8) | |
218 movq %rdx,%r13 | |
219 movq %r10,%r11 | |
220 | |
221 xorq %rdx,%rdx | |
222 addq %r11,%r13 | |
223 adcq $0,%rdx | |
224 movq %r13,-8(%rsp,%r9,8) | |
225 movq %rdx,(%rsp,%r9,8) | |
226 | |
227 leaq 1(%r14),%r14 | |
228 jmp L$outer | |
229 .p2align 4 | |
230 L$outer: | |
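// One iteration per remaining word of b: gather b[i] in constant time,
// then tp = (tp + a[]*b[i] + m*n[]) / 2^64 with m = (tp[0]+a[0]*b[i])*n0.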
231 leaq 24+128(%rsp,%r9,8),%rdx | |
232 andq $-16,%rdx | |
233 pxor %xmm4,%xmm4 | |
234 pxor %xmm5,%xmm5 | |
235 movdqa -128(%r12),%xmm0 | |
236 movdqa -112(%r12),%xmm1 | |
237 movdqa -96(%r12),%xmm2 | |
238 movdqa -80(%r12),%xmm3 | |
239 pand -128(%rdx),%xmm0 | |
240 pand -112(%rdx),%xmm1 | |
241 por %xmm0,%xmm4 | |
242 pand -96(%rdx),%xmm2 | |
243 por %xmm1,%xmm5 | |
244 pand -80(%rdx),%xmm3 | |
245 por %xmm2,%xmm4 | |
246 por %xmm3,%xmm5 | |
247 movdqa -64(%r12),%xmm0 | |
248 movdqa -48(%r12),%xmm1 | |
249 movdqa -32(%r12),%xmm2 | |
250 movdqa -16(%r12),%xmm3 | |
251 pand -64(%rdx),%xmm0 | |
252 pand -48(%rdx),%xmm1 | |
253 por %xmm0,%xmm4 | |
254 pand -32(%rdx),%xmm2 | |
255 por %xmm1,%xmm5 | |
256 pand -16(%rdx),%xmm3 | |
257 por %xmm2,%xmm4 | |
258 por %xmm3,%xmm5 | |
259 movdqa 0(%r12),%xmm0 | |
260 movdqa 16(%r12),%xmm1 | |
261 movdqa 32(%r12),%xmm2 | |
262 movdqa 48(%r12),%xmm3 | |
263 pand 0(%rdx),%xmm0 | |
264 pand 16(%rdx),%xmm1 | |
265 por %xmm0,%xmm4 | |
266 pand 32(%rdx),%xmm2 | |
267 por %xmm1,%xmm5 | |
268 pand 48(%rdx),%xmm3 | |
269 por %xmm2,%xmm4 | |
270 por %xmm3,%xmm5 | |
271 movdqa 64(%r12),%xmm0 | |
272 movdqa 80(%r12),%xmm1 | |
273 movdqa 96(%r12),%xmm2 | |
274 movdqa 112(%r12),%xmm3 | |
275 pand 64(%rdx),%xmm0 | |
276 pand 80(%rdx),%xmm1 | |
277 por %xmm0,%xmm4 | |
278 pand 96(%rdx),%xmm2 | |
279 por %xmm1,%xmm5 | |
280 pand 112(%rdx),%xmm3 | |
281 por %xmm2,%xmm4 | |
282 por %xmm3,%xmm5 | |
283 por %xmm5,%xmm4 | |
284 pshufd $0x4e,%xmm4,%xmm0 | |
285 por %xmm4,%xmm0 | |
286 leaq 256(%r12),%r12 | |
287 | |
288 movq (%rsi),%rax | |
289 .byte 102,72,15,126,195 | |
290 | |
291 xorq %r15,%r15 | |
292 movq %r8,%rbp | |
293 movq (%rsp),%r10 | |
294 | |
295 mulq %rbx | |
296 addq %rax,%r10 | |
297 movq (%rcx),%rax | |
298 adcq $0,%rdx | |
299 | |
300 imulq %r10,%rbp | |
301 movq %rdx,%r11 | |
302 | |
303 mulq %rbp | |
304 addq %rax,%r10 | |
305 movq 8(%rsi),%rax | |
306 adcq $0,%rdx | |
307 movq 8(%rsp),%r10 | |
308 movq %rdx,%r13 | |
309 | |
310 leaq 1(%r15),%r15 | |
311 jmp L$inner_enter | |
312 | |
313 .p2align 4 | |
314 L$inner: | |
315 addq %rax,%r13 | |
316 movq (%rsi,%r15,8),%rax | |
317 adcq $0,%rdx | |
318 addq %r10,%r13 | |
319 movq (%rsp,%r15,8),%r10 | |
320 adcq $0,%rdx | |
321 movq %r13,-16(%rsp,%r15,8) | |
322 movq %rdx,%r13 | |
323 | |
324 L$inner_enter: | |
325 mulq %rbx | |
326 addq %rax,%r11 | |
327 movq (%rcx,%r15,8),%rax | |
328 adcq $0,%rdx | |
329 addq %r11,%r10 | |
330 movq %rdx,%r11 | |
331 adcq $0,%r11 | |
332 leaq 1(%r15),%r15 | |
333 | |
334 mulq %rbp | |
335 cmpq %r9,%r15 | |
336 jne L$inner | |
337 | |
338 addq %rax,%r13 | |
339 adcq $0,%rdx | |
340 addq %r10,%r13 | |
341 movq (%rsp,%r9,8),%r10 | |
342 adcq $0,%rdx | |
343 movq %r13,-16(%rsp,%r9,8) | |
344 movq %rdx,%r13 | |
345 | |
346 xorq %rdx,%rdx | |
347 addq %r11,%r13 | |
348 adcq $0,%rdx | |
349 addq %r10,%r13 | |
350 adcq $0,%rdx | |
351 movq %r13,-8(%rsp,%r9,8) | |
352 movq %rdx,(%rsp,%r9,8) | |
353 | |
354 leaq 1(%r14),%r14 | |
355 cmpq %r9,%r14 | |
356 jb L$outer | |
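// All outer iterations done.  Subtract the modulus into rp, then use the
// final borrow as a mask so rp ends up with tp-n when tp >= n and with tp
// otherwise (branch-free final reduction).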
357 | |
358 xorq %r14,%r14 | |
359 movq (%rsp),%rax | |
360 leaq (%rsp),%rsi | |
361 movq %r9,%r15 | |
362 jmp L$sub | |
363 .p2align 4 | |
364 L$sub: sbbq (%rcx,%r14,8),%rax | |
365 movq %rax,(%rdi,%r14,8) | |
366 movq 8(%rsi,%r14,8),%rax | |
367 leaq 1(%r14),%r14 | |
368 decq %r15 | |
369 jnz L$sub | |
370 | |
371 sbbq $0,%rax | |
372 xorq %r14,%r14 | |
373 movq %r9,%r15 | |
374 .p2align 4 | |
375 L$copy: | |
376 movq (%rsp,%r14,8),%rsi | |
377 movq (%rdi,%r14,8),%rcx | |
378 xorq %rcx,%rsi | |
379 andq %rax,%rsi | |
380 xorq %rcx,%rsi | |
381 movq %r14,(%rsp,%r14,8) | |
382 movq %rsi,(%rdi,%r14,8) | |
383 leaq 1(%r14),%r14 | |
384 subq $1,%r15 | |
385 jnz L$copy | |
386 | |
387 movq 8(%rsp,%r9,8),%rsi | |
388 movq $1,%rax | |
389 | |
390 movq -48(%rsi),%r15 | |
391 movq -40(%rsi),%r14 | |
392 movq -32(%rsi),%r13 | |
393 movq -24(%rsi),%r12 | |
394 movq -16(%rsi),%rbp | |
395 movq -8(%rsi),%rbx | |
396 leaq (%rsi),%rsp | |
397 L$mul_epilogue: | |
398 .byte 0xf3,0xc3 | |
399 | |
400 | |
401 .p2align 5 | |
402 bn_mul4x_mont_gather5: | |
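// 4x-unrolled variant, used when num is a multiple of 8; same calling
// convention as bn_mul_mont_gather5.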
403 L$mul4x_enter: | |
404 .byte 0x67 | |
405 movq %rsp,%rax | |
406 pushq %rbx | |
407 pushq %rbp | |
408 pushq %r12 | |
409 pushq %r13 | |
410 pushq %r14 | |
411 pushq %r15 | |
412 | |
413 .byte 0x67 | |
414 shll $3,%r9d | |
415 leaq (%r9,%r9,2),%r10 | |
416 negq %r9 | |
417 | |
418 | |
419 | |
420 | |
421 | |
422 | |
423 | |
424 | |
425 | |
426 | |
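// Pick %rsp so the scratch frame does not land at the same offset modulo
// 4096 as rp's ret/a/n area (3*num words); L$mul4xsp_alt covers the case
// where there is not enough headroom below the current stack.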
427 leaq -320(%rsp,%r9,2),%r11 | |
428 subq %rdi,%r11 | |
429 andq $4095,%r11 | |
430 cmpq %r11,%r10 | |
431 jb L$mul4xsp_alt | |
432 subq %r11,%rsp | |
433 leaq -320(%rsp,%r9,2),%rsp | |
434 jmp L$mul4xsp_done | |
435 | |
436 .p2align 5 | |
437 L$mul4xsp_alt: | |
438 leaq 4096-320(,%r9,2),%r10 | |
439 leaq -320(%rsp,%r9,2),%rsp | |
440 subq %r10,%r11 | |
441 movq $0,%r10 | |
442 cmovcq %r10,%r11 | |
443 subq %r11,%rsp | |
444 L$mul4xsp_done: | |
445 andq $-64,%rsp | |
446 negq %r9 | |
447 | |
448 movq %rax,40(%rsp) | |
449 L$mul4x_body: | |
450 | |
451 call mul4x_internal | |
452 | |
453 movq 40(%rsp),%rsi | |
454 movq $1,%rax | |
455 | |
456 movq -48(%rsi),%r15 | |
457 movq -40(%rsi),%r14 | |
458 movq -32(%rsi),%r13 | |
459 movq -24(%rsi),%r12 | |
460 movq -16(%rsi),%rbp | |
461 movq -8(%rsi),%rbx | |
462 leaq (%rsi),%rsp | |
463 L$mul4x_epilogue: | |
464 .byte 0xf3,0xc3 | |
465 | |
466 | |
467 | |
468 .p2align 5 | |
469 mul4x_internal: | |
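// mul4x_internal: core of the 4x code.  Expects %r9 = num*8, %rdx = the
// gather table, and %rax pointing at the caller's original stack frame so
// that 8(%rax) is the 'power' argument; shared by bn_mul4x_mont_gather5
// and bn_power5.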
470 shlq $5,%r9 | |
471 movd 8(%rax),%xmm5 | |
472 leaq L$inc(%rip),%rax | |
473 leaq 128(%rdx,%r9,1),%r13 | |
474 shrq $5,%r9 | |
475 movdqa 0(%rax),%xmm0 | |
476 movdqa 16(%rax),%xmm1 | |
477 leaq 88-112(%rsp,%r9,1),%r10 | |
478 leaq 128(%rdx),%r12 | |
479 | |
480 pshufd $0,%xmm5,%xmm5 | |
481 movdqa %xmm1,%xmm4 | |
482 .byte 0x67,0x67 | |
483 movdqa %xmm1,%xmm2 | |
484 paddd %xmm0,%xmm1 | |
485 pcmpeqd %xmm5,%xmm0 | |
486 .byte 0x67 | |
487 movdqa %xmm4,%xmm3 | |
488 paddd %xmm1,%xmm2 | |
489 pcmpeqd %xmm5,%xmm1 | |
490 movdqa %xmm0,112(%r10) | |
491 movdqa %xmm4,%xmm0 | |
492 | |
493 paddd %xmm2,%xmm3 | |
494 pcmpeqd %xmm5,%xmm2 | |
495 movdqa %xmm1,128(%r10) | |
496 movdqa %xmm4,%xmm1 | |
497 | |
498 paddd %xmm3,%xmm0 | |
499 pcmpeqd %xmm5,%xmm3 | |
500 movdqa %xmm2,144(%r10) | |
501 movdqa %xmm4,%xmm2 | |
502 | |
503 paddd %xmm0,%xmm1 | |
504 pcmpeqd %xmm5,%xmm0 | |
505 movdqa %xmm3,160(%r10) | |
506 movdqa %xmm4,%xmm3 | |
507 paddd %xmm1,%xmm2 | |
508 pcmpeqd %xmm5,%xmm1 | |
509 movdqa %xmm0,176(%r10) | |
510 movdqa %xmm4,%xmm0 | |
511 | |
512 paddd %xmm2,%xmm3 | |
513 pcmpeqd %xmm5,%xmm2 | |
514 movdqa %xmm1,192(%r10) | |
515 movdqa %xmm4,%xmm1 | |
516 | |
517 paddd %xmm3,%xmm0 | |
518 pcmpeqd %xmm5,%xmm3 | |
519 movdqa %xmm2,208(%r10) | |
520 movdqa %xmm4,%xmm2 | |
521 | |
522 paddd %xmm0,%xmm1 | |
523 pcmpeqd %xmm5,%xmm0 | |
524 movdqa %xmm3,224(%r10) | |
525 movdqa %xmm4,%xmm3 | |
526 paddd %xmm1,%xmm2 | |
527 pcmpeqd %xmm5,%xmm1 | |
528 movdqa %xmm0,240(%r10) | |
529 movdqa %xmm4,%xmm0 | |
530 | |
531 paddd %xmm2,%xmm3 | |
532 pcmpeqd %xmm5,%xmm2 | |
533 movdqa %xmm1,256(%r10) | |
534 movdqa %xmm4,%xmm1 | |
535 | |
536 paddd %xmm3,%xmm0 | |
537 pcmpeqd %xmm5,%xmm3 | |
538 movdqa %xmm2,272(%r10) | |
539 movdqa %xmm4,%xmm2 | |
540 | |
541 paddd %xmm0,%xmm1 | |
542 pcmpeqd %xmm5,%xmm0 | |
543 movdqa %xmm3,288(%r10) | |
544 movdqa %xmm4,%xmm3 | |
545 paddd %xmm1,%xmm2 | |
546 pcmpeqd %xmm5,%xmm1 | |
547 movdqa %xmm0,304(%r10) | |
548 | |
549 paddd %xmm2,%xmm3 | |
550 .byte 0x67 | |
551 pcmpeqd %xmm5,%xmm2 | |
552 movdqa %xmm1,320(%r10) | |
553 | |
554 pcmpeqd %xmm5,%xmm3 | |
555 movdqa %xmm2,336(%r10) | |
556 pand 64(%r12),%xmm0 | |
557 | |
558 pand 80(%r12),%xmm1 | |
559 pand 96(%r12),%xmm2 | |
560 movdqa %xmm3,352(%r10) | |
561 pand 112(%r12),%xmm3 | |
562 por %xmm2,%xmm0 | |
563 por %xmm3,%xmm1 | |
564 movdqa -128(%r12),%xmm4 | |
565 movdqa -112(%r12),%xmm5 | |
566 movdqa -96(%r12),%xmm2 | |
567 pand 112(%r10),%xmm4 | |
568 movdqa -80(%r12),%xmm3 | |
569 pand 128(%r10),%xmm5 | |
570 por %xmm4,%xmm0 | |
571 pand 144(%r10),%xmm2 | |
572 por %xmm5,%xmm1 | |
573 pand 160(%r10),%xmm3 | |
574 por %xmm2,%xmm0 | |
575 por %xmm3,%xmm1 | |
576 movdqa -64(%r12),%xmm4 | |
577 movdqa -48(%r12),%xmm5 | |
578 movdqa -32(%r12),%xmm2 | |
579 pand 176(%r10),%xmm4 | |
580 movdqa -16(%r12),%xmm3 | |
581 pand 192(%r10),%xmm5 | |
582 por %xmm4,%xmm0 | |
583 pand 208(%r10),%xmm2 | |
584 por %xmm5,%xmm1 | |
585 pand 224(%r10),%xmm3 | |
586 por %xmm2,%xmm0 | |
587 por %xmm3,%xmm1 | |
588 movdqa 0(%r12),%xmm4 | |
589 movdqa 16(%r12),%xmm5 | |
590 movdqa 32(%r12),%xmm2 | |
591 pand 240(%r10),%xmm4 | |
592 movdqa 48(%r12),%xmm3 | |
593 pand 256(%r10),%xmm5 | |
594 por %xmm4,%xmm0 | |
595 pand 272(%r10),%xmm2 | |
596 por %xmm5,%xmm1 | |
597 pand 288(%r10),%xmm3 | |
598 por %xmm2,%xmm0 | |
599 por %xmm3,%xmm1 | |
600 por %xmm1,%xmm0 | |
601 pshufd $0x4e,%xmm0,%xmm1 | |
602 por %xmm1,%xmm0 | |
603 leaq 256(%r12),%r12 | |
604 .byte 102,72,15,126,195 | |
605 | |
606 movq %r13,16+8(%rsp) | |
607 movq %rdi,56+8(%rsp) | |
608 | |
609 movq (%r8),%r8 | |
610 movq (%rsi),%rax | |
611 leaq (%rsi,%r9,1),%rsi | |
612 negq %r9 | |
613 | |
614 movq %r8,%rbp | |
615 mulq %rbx | |
616 movq %rax,%r10 | |
617 movq (%rcx),%rax | |
618 | |
619 imulq %r10,%rbp | |
620 leaq 64+8(%rsp),%r14 | |
621 movq %rdx,%r11 | |
622 | |
623 mulq %rbp | |
624 addq %rax,%r10 | |
625 movq 8(%rsi,%r9,1),%rax | |
626 adcq $0,%rdx | |
627 movq %rdx,%rdi | |
628 | |
629 mulq %rbx | |
630 addq %rax,%r11 | |
631 movq 8(%rcx),%rax | |
632 adcq $0,%rdx | |
633 movq %rdx,%r10 | |
634 | |
635 mulq %rbp | |
636 addq %rax,%rdi | |
637 movq 16(%rsi,%r9,1),%rax | |
638 adcq $0,%rdx | |
639 addq %r11,%rdi | |
640 leaq 32(%r9),%r15 | |
641 leaq 32(%rcx),%rcx | |
642 adcq $0,%rdx | |
643 movq %rdi,(%r14) | |
644 movq %rdx,%r13 | |
645 jmp L$1st4x | |
646 | |
647 .p2align 5 | |
648 L$1st4x: | |
649 mulq %rbx | |
650 addq %rax,%r10 | |
651 movq -16(%rcx),%rax | |
652 leaq 32(%r14),%r14 | |
653 adcq $0,%rdx | |
654 movq %rdx,%r11 | |
655 | |
656 mulq %rbp | |
657 addq %rax,%r13 | |
658 movq -8(%rsi,%r15,1),%rax | |
659 adcq $0,%rdx | |
660 addq %r10,%r13 | |
661 adcq $0,%rdx | |
662 movq %r13,-24(%r14) | |
663 movq %rdx,%rdi | |
664 | |
665 mulq %rbx | |
666 addq %rax,%r11 | |
667 movq -8(%rcx),%rax | |
668 adcq $0,%rdx | |
669 movq %rdx,%r10 | |
670 | |
671 mulq %rbp | |
672 addq %rax,%rdi | |
673 movq (%rsi,%r15,1),%rax | |
674 adcq $0,%rdx | |
675 addq %r11,%rdi | |
676 adcq $0,%rdx | |
677 movq %rdi,-16(%r14) | |
678 movq %rdx,%r13 | |
679 | |
680 mulq %rbx | |
681 addq %rax,%r10 | |
682 movq 0(%rcx),%rax | |
683 adcq $0,%rdx | |
684 movq %rdx,%r11 | |
685 | |
686 mulq %rbp | |
687 addq %rax,%r13 | |
688 movq 8(%rsi,%r15,1),%rax | |
689 adcq $0,%rdx | |
690 addq %r10,%r13 | |
691 adcq $0,%rdx | |
692 movq %r13,-8(%r14) | |
693 movq %rdx,%rdi | |
694 | |
695 mulq %rbx | |
696 addq %rax,%r11 | |
697 movq 8(%rcx),%rax | |
698 adcq $0,%rdx | |
699 movq %rdx,%r10 | |
700 | |
701 mulq %rbp | |
702 addq %rax,%rdi | |
703 movq 16(%rsi,%r15,1),%rax | |
704 adcq $0,%rdx | |
705 addq %r11,%rdi | |
706 leaq 32(%rcx),%rcx | |
707 adcq $0,%rdx | |
708 movq %rdi,(%r14) | |
709 movq %rdx,%r13 | |
710 | |
711 addq $32,%r15 | |
712 jnz L$1st4x | |
713 | |
714 mulq %rbx | |
715 addq %rax,%r10 | |
716 movq -16(%rcx),%rax | |
717 leaq 32(%r14),%r14 | |
718 adcq $0,%rdx | |
719 movq %rdx,%r11 | |
720 | |
721 mulq %rbp | |
722 addq %rax,%r13 | |
723 movq -8(%rsi),%rax | |
724 adcq $0,%rdx | |
725 addq %r10,%r13 | |
726 adcq $0,%rdx | |
727 movq %r13,-24(%r14) | |
728 movq %rdx,%rdi | |
729 | |
730 mulq %rbx | |
731 addq %rax,%r11 | |
732 movq -8(%rcx),%rax | |
733 adcq $0,%rdx | |
734 movq %rdx,%r10 | |
735 | |
736 mulq %rbp | |
737 addq %rax,%rdi | |
738 movq (%rsi,%r9,1),%rax | |
739 adcq $0,%rdx | |
740 addq %r11,%rdi | |
741 adcq $0,%rdx | |
742 movq %rdi,-16(%r14) | |
743 movq %rdx,%r13 | |
744 | |
745 leaq (%rcx,%r9,1),%rcx | |
746 | |
747 xorq %rdi,%rdi | |
748 addq %r10,%r13 | |
749 adcq $0,%rdi | |
750 movq %r13,-8(%r14) | |
751 | |
752 jmp L$outer4x | |
753 | |
754 .p2align 5 | |
755 L$outer4x: | |
756 leaq 16+128(%r14),%rdx | |
757 pxor %xmm4,%xmm4 | |
758 pxor %xmm5,%xmm5 | |
759 movdqa -128(%r12),%xmm0 | |
760 movdqa -112(%r12),%xmm1 | |
761 movdqa -96(%r12),%xmm2 | |
762 movdqa -80(%r12),%xmm3 | |
763 pand -128(%rdx),%xmm0 | |
764 pand -112(%rdx),%xmm1 | |
765 por %xmm0,%xmm4 | |
766 pand -96(%rdx),%xmm2 | |
767 por %xmm1,%xmm5 | |
768 pand -80(%rdx),%xmm3 | |
769 por %xmm2,%xmm4 | |
770 por %xmm3,%xmm5 | |
771 movdqa -64(%r12),%xmm0 | |
772 movdqa -48(%r12),%xmm1 | |
773 movdqa -32(%r12),%xmm2 | |
774 movdqa -16(%r12),%xmm3 | |
775 pand -64(%rdx),%xmm0 | |
776 pand -48(%rdx),%xmm1 | |
777 por %xmm0,%xmm4 | |
778 pand -32(%rdx),%xmm2 | |
779 por %xmm1,%xmm5 | |
780 pand -16(%rdx),%xmm3 | |
781 por %xmm2,%xmm4 | |
782 por %xmm3,%xmm5 | |
783 movdqa 0(%r12),%xmm0 | |
784 movdqa 16(%r12),%xmm1 | |
785 movdqa 32(%r12),%xmm2 | |
786 movdqa 48(%r12),%xmm3 | |
787 pand 0(%rdx),%xmm0 | |
788 pand 16(%rdx),%xmm1 | |
789 por %xmm0,%xmm4 | |
790 pand 32(%rdx),%xmm2 | |
791 por %xmm1,%xmm5 | |
792 pand 48(%rdx),%xmm3 | |
793 por %xmm2,%xmm4 | |
794 por %xmm3,%xmm5 | |
795 movdqa 64(%r12),%xmm0 | |
796 movdqa 80(%r12),%xmm1 | |
797 movdqa 96(%r12),%xmm2 | |
798 movdqa 112(%r12),%xmm3 | |
799 pand 64(%rdx),%xmm0 | |
800 pand 80(%rdx),%xmm1 | |
801 por %xmm0,%xmm4 | |
802 pand 96(%rdx),%xmm2 | |
803 por %xmm1,%xmm5 | |
804 pand 112(%rdx),%xmm3 | |
805 por %xmm2,%xmm4 | |
806 por %xmm3,%xmm5 | |
807 por %xmm5,%xmm4 | |
808 pshufd $0x4e,%xmm4,%xmm0 | |
809 por %xmm4,%xmm0 | |
810 leaq 256(%r12),%r12 | |
811 .byte 102,72,15,126,195 | |
812 | |
813 movq (%r14,%r9,1),%r10 | |
814 movq %r8,%rbp | |
815 mulq %rbx | |
816 addq %rax,%r10 | |
817 movq (%rcx),%rax | |
818 adcq $0,%rdx | |
819 | |
820 imulq %r10,%rbp | |
821 movq %rdx,%r11 | |
822 movq %rdi,(%r14) | |
823 | |
824 leaq (%r14,%r9,1),%r14 | |
825 | |
826 mulq %rbp | |
827 addq %rax,%r10 | |
828 movq 8(%rsi,%r9,1),%rax | |
829 adcq $0,%rdx | |
830 movq %rdx,%rdi | |
831 | |
832 mulq %rbx | |
833 addq %rax,%r11 | |
834 movq 8(%rcx),%rax | |
835 adcq $0,%rdx | |
836 addq 8(%r14),%r11 | |
837 adcq $0,%rdx | |
838 movq %rdx,%r10 | |
839 | |
840 mulq %rbp | |
841 addq %rax,%rdi | |
842 movq 16(%rsi,%r9,1),%rax | |
843 adcq $0,%rdx | |
844 addq %r11,%rdi | |
845 leaq 32(%r9),%r15 | |
846 leaq 32(%rcx),%rcx | |
847 adcq $0,%rdx | |
848 movq %rdx,%r13 | |
849 jmp L$inner4x | |
850 | |
851 .p2align 5 | |
852 L$inner4x: | |
853 mulq %rbx | |
854 addq %rax,%r10 | |
855 movq -16(%rcx),%rax | |
856 adcq $0,%rdx | |
857 addq 16(%r14),%r10 | |
858 leaq 32(%r14),%r14 | |
859 adcq $0,%rdx | |
860 movq %rdx,%r11 | |
861 | |
862 mulq %rbp | |
863 addq %rax,%r13 | |
864 movq -8(%rsi,%r15,1),%rax | |
865 adcq $0,%rdx | |
866 addq %r10,%r13 | |
867 adcq $0,%rdx | |
868 movq %rdi,-32(%r14) | |
869 movq %rdx,%rdi | |
870 | |
871 mulq %rbx | |
872 addq %rax,%r11 | |
873 movq -8(%rcx),%rax | |
874 adcq $0,%rdx | |
875 addq -8(%r14),%r11 | |
876 adcq $0,%rdx | |
877 movq %rdx,%r10 | |
878 | |
879 mulq %rbp | |
880 addq %rax,%rdi | |
881 movq (%rsi,%r15,1),%rax | |
882 adcq $0,%rdx | |
883 addq %r11,%rdi | |
884 adcq $0,%rdx | |
885 movq %r13,-24(%r14) | |
886 movq %rdx,%r13 | |
887 | |
888 mulq %rbx | |
889 addq %rax,%r10 | |
890 movq 0(%rcx),%rax | |
891 adcq $0,%rdx | |
892 addq (%r14),%r10 | |
893 adcq $0,%rdx | |
894 movq %rdx,%r11 | |
895 | |
896 mulq %rbp | |
897 addq %rax,%r13 | |
898 movq 8(%rsi,%r15,1),%rax | |
899 adcq $0,%rdx | |
900 addq %r10,%r13 | |
901 adcq $0,%rdx | |
902 movq %rdi,-16(%r14) | |
903 movq %rdx,%rdi | |
904 | |
905 mulq %rbx | |
906 addq %rax,%r11 | |
907 movq 8(%rcx),%rax | |
908 adcq $0,%rdx | |
909 addq 8(%r14),%r11 | |
910 adcq $0,%rdx | |
911 movq %rdx,%r10 | |
912 | |
913 mulq %rbp | |
914 addq %rax,%rdi | |
915 movq 16(%rsi,%r15,1),%rax | |
916 adcq $0,%rdx | |
917 addq %r11,%rdi | |
918 leaq 32(%rcx),%rcx | |
919 adcq $0,%rdx | |
920 movq %r13,-8(%r14) | |
921 movq %rdx,%r13 | |
922 | |
923 addq $32,%r15 | |
924 jnz L$inner4x | |
925 | |
926 mulq %rbx | |
927 addq %rax,%r10 | |
928 movq -16(%rcx),%rax | |
929 adcq $0,%rdx | |
930 addq 16(%r14),%r10 | |
931 leaq 32(%r14),%r14 | |
932 adcq $0,%rdx | |
933 movq %rdx,%r11 | |
934 | |
935 mulq %rbp | |
936 addq %rax,%r13 | |
937 movq -8(%rsi),%rax | |
938 adcq $0,%rdx | |
939 addq %r10,%r13 | |
940 adcq $0,%rdx | |
941 movq %rdi,-32(%r14) | |
942 movq %rdx,%rdi | |
943 | |
944 mulq %rbx | |
945 addq %rax,%r11 | |
946 movq %rbp,%rax | |
947 movq -8(%rcx),%rbp | |
948 adcq $0,%rdx | |
949 addq -8(%r14),%r11 | |
950 adcq $0,%rdx | |
951 movq %rdx,%r10 | |
952 | |
953 mulq %rbp | |
954 addq %rax,%rdi | |
955 movq (%rsi,%r9,1),%rax | |
956 adcq $0,%rdx | |
957 addq %r11,%rdi | |
958 adcq $0,%rdx | |
959 movq %r13,-24(%r14) | |
960 movq %rdx,%r13 | |
961 | |
962 movq %rdi,-16(%r14) | |
963 leaq (%rcx,%r9,1),%rcx | |
964 | |
965 xorq %rdi,%rdi | |
966 addq %r10,%r13 | |
967 adcq $0,%rdi | |
968 addq (%r14),%r13 | |
969 adcq $0,%rdi | |
970 movq %r13,-8(%r14) | |
971 | |
972 cmpq 16+8(%rsp),%r12 | |
973 jb L$outer4x | |
974 xorq %rax,%rax | |
975 subq %r13,%rbp | |
976 adcq %r15,%r15 | |
977 orq %r15,%rdi | |
978 subq %rdi,%rax | |
979 leaq (%r14,%r9,1),%rbx | |
980 movq (%rcx),%r12 | |
981 leaq (%rcx),%rbp | |
982 movq %r9,%rcx | |
983 sarq $3+2,%rcx | |
984 movq 56+8(%rsp),%rdi | |
985 decq %r12 | |
986 xorq %r10,%r10 | |
987 movq 8(%rbp),%r13 | |
988 movq 16(%rbp),%r14 | |
989 movq 24(%rbp),%r15 | |
990 jmp L$sqr4x_sub_entry | |
991 | |
992 .globl _bn_power5 | |
993 .private_extern _bn_power5 | |
994 | |
995 .p2align 5 | |
996 _bn_power5: | |
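// bn_power5(rp, ap, table, np, n0, num, power): five Montgomery squarings
// of ap followed by one Montgomery multiplication by the table entry
// selected by 'power', i.e. one 5-bit window step of a fixed-window
// modular exponentiation, all in constant time.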
997 movq %rsp,%rax | |
998 pushq %rbx | |
999 pushq %rbp | |
1000 pushq %r12 | |
1001 pushq %r13 | |
1002 pushq %r14 | |
1003 pushq %r15 | |
1004 | |
1005 shll $3,%r9d | |
1006 leal (%r9,%r9,2),%r10d | |
1007 negq %r9 | |
1008 movq (%r8),%r8 | |
1009 | |
1010 | |
1011 | |
1012 | |
1013 | |
1014 | |
1015 | |
1016 | |
1017 leaq -320(%rsp,%r9,2),%r11 | |
1018 subq %rdi,%r11 | |
1019 andq $4095,%r11 | |
1020 cmpq %r11,%r10 | |
1021 jb L$pwr_sp_alt | |
1022 subq %r11,%rsp | |
1023 leaq -320(%rsp,%r9,2),%rsp | |
1024 jmp L$pwr_sp_done | |
1025 | |
1026 .p2align 5 | |
1027 L$pwr_sp_alt: | |
1028 leaq 4096-320(,%r9,2),%r10 | |
1029 leaq -320(%rsp,%r9,2),%rsp | |
1030 subq %r10,%r11 | |
1031 movq $0,%r10 | |
1032 cmovcq %r10,%r11 | |
1033 subq %r11,%rsp | |
1034 L$pwr_sp_done: | |
1035 andq $-64,%rsp | |
1036 movq %r9,%r10 | |
1037 negq %r9 | |
1038 | |
1039 | |
1040 | |
1041 | |
1042 | |
1043 | |
1044 | |
1045 | |
1046 | |
1047 | |
1048 movq %r8,32(%rsp) | |
1049 movq %rax,40(%rsp) | |
1050 L$power5_body: | |
1051 .byte 102,72,15,110,207 | |
1052 .byte 102,72,15,110,209 | |
1053 .byte 102,73,15,110,218 | |
1054 .byte 102,72,15,110,226 | |
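// The four .byte sequences above are hand-encoded movq r64->xmm: they
// stash rp, np, the (negated) byte length and the table pointer in
// %xmm1..%xmm4 so they survive the squaring/reduction calls below.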
1055 | |
1056 call __bn_sqr8x_internal | |
1057 call __bn_post4x_internal | |
1058 call __bn_sqr8x_internal | |
1059 call __bn_post4x_internal | |
1060 call __bn_sqr8x_internal | |
1061 call __bn_post4x_internal | |
1062 call __bn_sqr8x_internal | |
1063 call __bn_post4x_internal | |
1064 call __bn_sqr8x_internal | |
1065 call __bn_post4x_internal | |
1066 | |
1067 .byte 102,72,15,126,209 | |
1068 .byte 102,72,15,126,226 | |
1069 movq %rsi,%rdi | |
1070 movq 40(%rsp),%rax | |
1071 leaq 32(%rsp),%r8 | |
1072 | |
1073 call mul4x_internal | |
1074 | |
1075 movq 40(%rsp),%rsi | |
1076 movq $1,%rax | |
1077 movq -48(%rsi),%r15 | |
1078 movq -40(%rsi),%r14 | |
1079 movq -32(%rsi),%r13 | |
1080 movq -24(%rsi),%r12 | |
1081 movq -16(%rsi),%rbp | |
1082 movq -8(%rsi),%rbx | |
1083 leaq (%rsi),%rsp | |
1084 L$power5_epilogue: | |
1085 .byte 0xf3,0xc3 | |
1086 | |
1087 | |
1088 .globl _bn_sqr8x_internal | |
1089 .private_extern _bn_sqr8x_internal | |
1091 | |
1092 .p2align 5 | |
1093 _bn_sqr8x_internal: | |
1094 __bn_sqr8x_internal: | |
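// Computes the full 2*num-limb square of the num-limb input into the tp[]
// scratch area: cross products first, then L$sqr4x_shift_n_add doubles
// them and adds the a[i]^2 diagonal; execution falls through into
// __bn_sqr8x_reduction.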
1095 | |
1096 | |
1097 | |
1098 | |
1099 | |
1100 | |
1101 | |
1102 | |
1103 | |
1104 | |
1105 | |
1106 | |
1107 | |
1108 | |
1109 | |
1110 | |
1111 | |
1112 | |
1113 | |
1114 | |
1115 | |
1116 | |
1117 | |
1118 | |
1119 | |
1120 | |
1121 | |
1122 | |
1123 | |
1124 | |
1125 | |
1126 | |
1127 | |
1128 | |
1129 | |
1130 | |
1131 | |
1132 | |
1133 | |
1134 | |
1135 | |
1136 | |
1137 | |
1138 | |
1139 | |
1140 | |
1141 | |
1142 | |
1143 | |
1144 | |
1145 | |
1146 | |
1147 | |
1148 | |
1149 | |
1150 | |
1151 | |
1152 | |
1153 | |
1154 | |
1155 | |
1156 | |
1157 | |
1158 | |
1159 | |
1160 | |
1161 | |
1162 | |
1163 | |
1164 | |
1165 | |
1166 | |
1167 | |
1168 leaq 32(%r10),%rbp | |
1169 leaq (%rsi,%r9,1),%rsi | |
1170 | |
1171 movq %r9,%rcx | |
1172 | |
1173 | |
1174 movq -32(%rsi,%rbp,1),%r14 | |
1175 leaq 48+8(%rsp,%r9,2),%rdi | |
1176 movq -24(%rsi,%rbp,1),%rax | |
1177 leaq -32(%rdi,%rbp,1),%rdi | |
1178 movq -16(%rsi,%rbp,1),%rbx | |
1179 movq %rax,%r15 | |
1180 | |
1181 mulq %r14 | |
1182 movq %rax,%r10 | |
1183 movq %rbx,%rax | |
1184 movq %rdx,%r11 | |
1185 movq %r10,-24(%rdi,%rbp,1) | |
1186 | |
1187 mulq %r14 | |
1188 addq %rax,%r11 | |
1189 movq %rbx,%rax | |
1190 adcq $0,%rdx | |
1191 movq %r11,-16(%rdi,%rbp,1) | |
1192 movq %rdx,%r10 | |
1193 | |
1194 | |
1195 movq -8(%rsi,%rbp,1),%rbx | |
1196 mulq %r15 | |
1197 movq %rax,%r12 | |
1198 movq %rbx,%rax | |
1199 movq %rdx,%r13 | |
1200 | |
1201 leaq (%rbp),%rcx | |
1202 mulq %r14 | |
1203 addq %rax,%r10 | |
1204 movq %rbx,%rax | |
1205 movq %rdx,%r11 | |
1206 adcq $0,%r11 | |
1207 addq %r12,%r10 | |
1208 adcq $0,%r11 | |
1209 movq %r10,-8(%rdi,%rcx,1) | |
1210 jmp L$sqr4x_1st | |
1211 | |
1212 .p2align 5 | |
1213 L$sqr4x_1st: | |
1214 movq (%rsi,%rcx,1),%rbx | |
1215 mulq %r15 | |
1216 addq %rax,%r13 | |
1217 movq %rbx,%rax | |
1218 movq %rdx,%r12 | |
1219 adcq $0,%r12 | |
1220 | |
1221 mulq %r14 | |
1222 addq %rax,%r11 | |
1223 movq %rbx,%rax | |
1224 movq 8(%rsi,%rcx,1),%rbx | |
1225 movq %rdx,%r10 | |
1226 adcq $0,%r10 | |
1227 addq %r13,%r11 | |
1228 adcq $0,%r10 | |
1229 | |
1230 | |
1231 mulq %r15 | |
1232 addq %rax,%r12 | |
1233 movq %rbx,%rax | |
1234 movq %r11,(%rdi,%rcx,1) | |
1235 movq %rdx,%r13 | |
1236 adcq $0,%r13 | |
1237 | |
1238 mulq %r14 | |
1239 addq %rax,%r10 | |
1240 movq %rbx,%rax | |
1241 movq 16(%rsi,%rcx,1),%rbx | |
1242 movq %rdx,%r11 | |
1243 adcq $0,%r11 | |
1244 addq %r12,%r10 | |
1245 adcq $0,%r11 | |
1246 | |
1247 mulq %r15 | |
1248 addq %rax,%r13 | |
1249 movq %rbx,%rax | |
1250 movq %r10,8(%rdi,%rcx,1) | |
1251 movq %rdx,%r12 | |
1252 adcq $0,%r12 | |
1253 | |
1254 mulq %r14 | |
1255 addq %rax,%r11 | |
1256 movq %rbx,%rax | |
1257 movq 24(%rsi,%rcx,1),%rbx | |
1258 movq %rdx,%r10 | |
1259 adcq $0,%r10 | |
1260 addq %r13,%r11 | |
1261 adcq $0,%r10 | |
1262 | |
1263 | |
1264 mulq %r15 | |
1265 addq %rax,%r12 | |
1266 movq %rbx,%rax | |
1267 movq %r11,16(%rdi,%rcx,1) | |
1268 movq %rdx,%r13 | |
1269 adcq $0,%r13 | |
1270 leaq 32(%rcx),%rcx | |
1271 | |
1272 mulq %r14 | |
1273 addq %rax,%r10 | |
1274 movq %rbx,%rax | |
1275 movq %rdx,%r11 | |
1276 adcq $0,%r11 | |
1277 addq %r12,%r10 | |
1278 adcq $0,%r11 | |
1279 movq %r10,-8(%rdi,%rcx,1) | |
1280 | |
1281 cmpq $0,%rcx | |
1282 jne L$sqr4x_1st | |
1283 | |
1284 mulq %r15 | |
1285 addq %rax,%r13 | |
1286 leaq 16(%rbp),%rbp | |
1287 adcq $0,%rdx | |
1288 addq %r11,%r13 | |
1289 adcq $0,%rdx | |
1290 | |
1291 movq %r13,(%rdi) | |
1292 movq %rdx,%r12 | |
1293 movq %rdx,8(%rdi) | |
1294 jmp L$sqr4x_outer | |
1295 | |
1296 .p2align 5 | |
1297 L$sqr4x_outer: | |
1298 movq -32(%rsi,%rbp,1),%r14 | |
1299 leaq 48+8(%rsp,%r9,2),%rdi | |
1300 movq -24(%rsi,%rbp,1),%rax | |
1301 leaq -32(%rdi,%rbp,1),%rdi | |
1302 movq -16(%rsi,%rbp,1),%rbx | |
1303 movq %rax,%r15 | |
1304 | |
1305 mulq %r14 | |
1306 movq -24(%rdi,%rbp,1),%r10 | |
1307 addq %rax,%r10 | |
1308 movq %rbx,%rax | |
1309 adcq $0,%rdx | |
1310 movq %r10,-24(%rdi,%rbp,1) | |
1311 movq %rdx,%r11 | |
1312 | |
1313 mulq %r14 | |
1314 addq %rax,%r11 | |
1315 movq %rbx,%rax | |
1316 adcq $0,%rdx | |
1317 addq -16(%rdi,%rbp,1),%r11 | |
1318 movq %rdx,%r10 | |
1319 adcq $0,%r10 | |
1320 movq %r11,-16(%rdi,%rbp,1) | |
1321 | |
1322 xorq %r12,%r12 | |
1323 | |
1324 movq -8(%rsi,%rbp,1),%rbx | |
1325 mulq %r15 | |
1326 addq %rax,%r12 | |
1327 movq %rbx,%rax | |
1328 adcq $0,%rdx | |
1329 addq -8(%rdi,%rbp,1),%r12 | |
1330 movq %rdx,%r13 | |
1331 adcq $0,%r13 | |
1332 | |
1333 mulq %r14 | |
1334 addq %rax,%r10 | |
1335 movq %rbx,%rax | |
1336 adcq $0,%rdx | |
1337 addq %r12,%r10 | |
1338 movq %rdx,%r11 | |
1339 adcq $0,%r11 | |
1340 movq %r10,-8(%rdi,%rbp,1) | |
1341 | |
1342 leaq (%rbp),%rcx | |
1343 jmp L$sqr4x_inner | |
1344 | |
1345 .p2align 5 | |
1346 L$sqr4x_inner: | |
1347 movq (%rsi,%rcx,1),%rbx | |
1348 mulq %r15 | |
1349 addq %rax,%r13 | |
1350 movq %rbx,%rax | |
1351 movq %rdx,%r12 | |
1352 adcq $0,%r12 | |
1353 addq (%rdi,%rcx,1),%r13 | |
1354 adcq $0,%r12 | |
1355 | |
1356 .byte 0x67 | |
1357 mulq %r14 | |
1358 addq %rax,%r11 | |
1359 movq %rbx,%rax | |
1360 movq 8(%rsi,%rcx,1),%rbx | |
1361 movq %rdx,%r10 | |
1362 adcq $0,%r10 | |
1363 addq %r13,%r11 | |
1364 adcq $0,%r10 | |
1365 | |
1366 mulq %r15 | |
1367 addq %rax,%r12 | |
1368 movq %r11,(%rdi,%rcx,1) | |
1369 movq %rbx,%rax | |
1370 movq %rdx,%r13 | |
1371 adcq $0,%r13 | |
1372 addq 8(%rdi,%rcx,1),%r12 | |
1373 leaq 16(%rcx),%rcx | |
1374 adcq $0,%r13 | |
1375 | |
1376 mulq %r14 | |
1377 addq %rax,%r10 | |
1378 movq %rbx,%rax | |
1379 adcq $0,%rdx | |
1380 addq %r12,%r10 | |
1381 movq %rdx,%r11 | |
1382 adcq $0,%r11 | |
1383 movq %r10,-8(%rdi,%rcx,1) | |
1384 | |
1385 cmpq $0,%rcx | |
1386 jne L$sqr4x_inner | |
1387 | |
1388 .byte 0x67 | |
1389 mulq %r15 | |
1390 addq %rax,%r13 | |
1391 adcq $0,%rdx | |
1392 addq %r11,%r13 | |
1393 adcq $0,%rdx | |
1394 | |
1395 movq %r13,(%rdi) | |
1396 movq %rdx,%r12 | |
1397 movq %rdx,8(%rdi) | |
1398 | |
1399 addq $16,%rbp | |
1400 jnz L$sqr4x_outer | |
1401 | |
1402 | |
1403 movq -32(%rsi),%r14 | |
1404 leaq 48+8(%rsp,%r9,2),%rdi | |
1405 movq -24(%rsi),%rax | |
1406 leaq -32(%rdi,%rbp,1),%rdi | |
1407 movq -16(%rsi),%rbx | |
1408 movq %rax,%r15 | |
1409 | |
1410 mulq %r14 | |
1411 addq %rax,%r10 | |
1412 movq %rbx,%rax | |
1413 movq %rdx,%r11 | |
1414 adcq $0,%r11 | |
1415 | |
1416 mulq %r14 | |
1417 addq %rax,%r11 | |
1418 movq %rbx,%rax | |
1419 movq %r10,-24(%rdi) | |
1420 movq %rdx,%r10 | |
1421 adcq $0,%r10 | |
1422 addq %r13,%r11 | |
1423 movq -8(%rsi),%rbx | |
1424 adcq $0,%r10 | |
1425 | |
1426 mulq %r15 | |
1427 addq %rax,%r12 | |
1428 movq %rbx,%rax | |
1429 movq %r11,-16(%rdi) | |
1430 movq %rdx,%r13 | |
1431 adcq $0,%r13 | |
1432 | |
1433 mulq %r14 | |
1434 addq %rax,%r10 | |
1435 movq %rbx,%rax | |
1436 movq %rdx,%r11 | |
1437 adcq $0,%r11 | |
1438 addq %r12,%r10 | |
1439 adcq $0,%r11 | |
1440 movq %r10,-8(%rdi) | |
1441 | |
1442 mulq %r15 | |
1443 addq %rax,%r13 | |
1444 movq -16(%rsi),%rax | |
1445 adcq $0,%rdx | |
1446 addq %r11,%r13 | |
1447 adcq $0,%rdx | |
1448 | |
1449 movq %r13,(%rdi) | |
1450 movq %rdx,%r12 | |
1451 movq %rdx,8(%rdi) | |
1452 | |
1453 mulq %rbx | |
1454 addq $16,%rbp | |
1455 xorq %r14,%r14 | |
1456 subq %r9,%rbp | |
1457 xorq %r15,%r15 | |
1458 | |
1459 addq %r12,%rax | |
1460 adcq $0,%rdx | |
1461 movq %rax,8(%rdi) | |
1462 movq %rdx,16(%rdi) | |
1463 movq %r15,24(%rdi) | |
1464 | |
1465 movq -16(%rsi,%rbp,1),%rax | |
1466 leaq 48+8(%rsp),%rdi | |
1467 xorq %r10,%r10 | |
1468 movq 8(%rdi),%r11 | |
1469 | |
1470 leaq (%r14,%r10,2),%r12 | |
1471 shrq $63,%r10 | |
1472 leaq (%rcx,%r11,2),%r13 | |
1473 shrq $63,%r11 | |
1474 orq %r10,%r13 | |
1475 movq 16(%rdi),%r10 | |
1476 movq %r11,%r14 | |
1477 mulq %rax | |
1478 negq %r15 | |
1479 movq 24(%rdi),%r11 | |
1480 adcq %rax,%r12 | |
1481 movq -8(%rsi,%rbp,1),%rax | |
1482 movq %r12,(%rdi) | |
1483 adcq %rdx,%r13 | |
1484 | |
1485 leaq (%r14,%r10,2),%rbx | |
1486 movq %r13,8(%rdi) | |
1487 sbbq %r15,%r15 | |
1488 shrq $63,%r10 | |
1489 leaq (%rcx,%r11,2),%r8 | |
1490 shrq $63,%r11 | |
1491 orq %r10,%r8 | |
1492 movq 32(%rdi),%r10 | |
1493 movq %r11,%r14 | |
1494 mulq %rax | |
1495 negq %r15 | |
1496 movq 40(%rdi),%r11 | |
1497 adcq %rax,%rbx | |
1498 movq 0(%rsi,%rbp,1),%rax | |
1499 movq %rbx,16(%rdi) | |
1500 adcq %rdx,%r8 | |
1501 leaq 16(%rbp),%rbp | |
1502 movq %r8,24(%rdi) | |
1503 sbbq %r15,%r15 | |
1504 leaq 64(%rdi),%rdi | |
1505 jmp L$sqr4x_shift_n_add | |
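// Shift-and-add pass: double the accumulated cross products (shift the
// whole vector left by one bit) and add in the diagonal squares a[i]^2.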
1506 | |
1507 .p2align 5 | |
1508 L$sqr4x_shift_n_add: | |
1509 leaq (%r14,%r10,2),%r12 | |
1510 shrq $63,%r10 | |
1511 leaq (%rcx,%r11,2),%r13 | |
1512 shrq $63,%r11 | |
1513 orq %r10,%r13 | |
1514 movq -16(%rdi),%r10 | |
1515 movq %r11,%r14 | |
1516 mulq %rax | |
1517 negq %r15 | |
1518 movq -8(%rdi),%r11 | |
1519 adcq %rax,%r12 | |
1520 movq -8(%rsi,%rbp,1),%rax | |
1521 movq %r12,-32(%rdi) | |
1522 adcq %rdx,%r13 | |
1523 | |
1524 leaq (%r14,%r10,2),%rbx | |
1525 movq %r13,-24(%rdi) | |
1526 sbbq %r15,%r15 | |
1527 shrq $63,%r10 | |
1528 leaq (%rcx,%r11,2),%r8 | |
1529 shrq $63,%r11 | |
1530 orq %r10,%r8 | |
1531 movq 0(%rdi),%r10 | |
1532 movq %r11,%r14 | |
1533 mulq %rax | |
1534 negq %r15 | |
1535 movq 8(%rdi),%r11 | |
1536 adcq %rax,%rbx | |
1537 movq 0(%rsi,%rbp,1),%rax | |
1538 movq %rbx,-16(%rdi) | |
1539 adcq %rdx,%r8 | |
1540 | |
1541 leaq (%r14,%r10,2),%r12 | |
1542 movq %r8,-8(%rdi) | |
1543 sbbq %r15,%r15 | |
1544 shrq $63,%r10 | |
1545 leaq (%rcx,%r11,2),%r13 | |
1546 shrq $63,%r11 | |
1547 orq %r10,%r13 | |
1548 movq 16(%rdi),%r10 | |
1549 movq %r11,%r14 | |
1550 mulq %rax | |
1551 negq %r15 | |
1552 movq 24(%rdi),%r11 | |
1553 adcq %rax,%r12 | |
1554 movq 8(%rsi,%rbp,1),%rax | |
1555 movq %r12,0(%rdi) | |
1556 adcq %rdx,%r13 | |
1557 | |
1558 leaq (%r14,%r10,2),%rbx | |
1559 movq %r13,8(%rdi) | |
1560 sbbq %r15,%r15 | |
1561 shrq $63,%r10 | |
1562 leaq (%rcx,%r11,2),%r8 | |
1563 shrq $63,%r11 | |
1564 orq %r10,%r8 | |
1565 movq 32(%rdi),%r10 | |
1566 movq %r11,%r14 | |
1567 mulq %rax | |
1568 negq %r15 | |
1569 movq 40(%rdi),%r11 | |
1570 adcq %rax,%rbx | |
1571 movq 16(%rsi,%rbp,1),%rax | |
1572 movq %rbx,16(%rdi) | |
1573 adcq %rdx,%r8 | |
1574 movq %r8,24(%rdi) | |
1575 sbbq %r15,%r15 | |
1576 leaq 64(%rdi),%rdi | |
1577 addq $32,%rbp | |
1578 jnz L$sqr4x_shift_n_add | |
1579 | |
1580 leaq (%r14,%r10,2),%r12 | |
1581 .byte 0x67 | |
1582 shrq $63,%r10 | |
1583 leaq (%rcx,%r11,2),%r13 | |
1584 shrq $63,%r11 | |
1585 orq %r10,%r13 | |
1586 movq -16(%rdi),%r10 | |
1587 movq %r11,%r14 | |
1588 mulq %rax | |
1589 negq %r15 | |
1590 movq -8(%rdi),%r11 | |
1591 adcq %rax,%r12 | |
1592 movq -8(%rsi),%rax | |
1593 movq %r12,-32(%rdi) | |
1594 adcq %rdx,%r13 | |
1595 | |
1596 leaq (%r14,%r10,2),%rbx | |
1597 movq %r13,-24(%rdi) | |
1598 sbbq %r15,%r15 | |
1599 shrq $63,%r10 | |
1600 leaq (%rcx,%r11,2),%r8 | |
1601 shrq $63,%r11 | |
1602 orq %r10,%r8 | |
1603 mulq %rax | |
1604 negq %r15 | |
1605 adcq %rax,%rbx | |
1606 adcq %rdx,%r8 | |
1607 movq %rbx,-16(%rdi) | |
1608 movq %r8,-8(%rdi) | |
1609 .byte 102,72,15,126,213 | |
1610 __bn_sqr8x_reduction: | |
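// Montgomery-reduce the 2*num-limb value in tp[] by the modulus at %rbp,
// eight limbs per outer iteration, using n0 from 32+8(%rsp).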
1611 xorq %rax,%rax | |
1612 leaq (%r9,%rbp,1),%rcx | |
1613 leaq 48+8(%rsp,%r9,2),%rdx | |
1614 movq %rcx,0+8(%rsp) | |
1615 leaq 48+8(%rsp,%r9,1),%rdi | |
1616 movq %rdx,8+8(%rsp) | |
1617 negq %r9 | |
1618 jmp L$8x_reduction_loop | |
1619 | |
1620 .p2align 5 | |
1621 L$8x_reduction_loop: | |
1622 leaq (%rdi,%r9,1),%rdi | |
1623 .byte 0x66 | |
1624 movq 0(%rdi),%rbx | |
1625 movq 8(%rdi),%r9 | |
1626 movq 16(%rdi),%r10 | |
1627 movq 24(%rdi),%r11 | |
1628 movq 32(%rdi),%r12 | |
1629 movq 40(%rdi),%r13 | |
1630 movq 48(%rdi),%r14 | |
1631 movq 56(%rdi),%r15 | |
1632 movq %rax,(%rdx) | |
1633 leaq 64(%rdi),%rdi | |
1634 | |
1635 .byte 0x67 | |
1636 movq %rbx,%r8 | |
1637 imulq 32+8(%rsp),%rbx | |
1638 movq 0(%rbp),%rax | |
1639 movl $8,%ecx | |
1640 jmp L$8x_reduce | |
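// Inner reduction: for eight consecutive limbs compute m = tp[j]*n0
// mod 2^64 (kept in %rbx and saved on the stack for the tail pass) and
// accumulate m*n[0..7].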
1641 | |
1642 .p2align 5 | |
1643 L$8x_reduce: | |
1644 mulq %rbx | |
1645 movq 8(%rbp),%rax | |
1646 negq %r8 | |
1647 movq %rdx,%r8 | |
1648 adcq $0,%r8 | |
1649 | |
1650 mulq %rbx | |
1651 addq %rax,%r9 | |
1652 movq 16(%rbp),%rax | |
1653 adcq $0,%rdx | |
1654 addq %r9,%r8 | |
1655 movq %rbx,48-8+8(%rsp,%rcx,8) | |
1656 movq %rdx,%r9 | |
1657 adcq $0,%r9 | |
1658 | |
1659 mulq %rbx | |
1660 addq %rax,%r10 | |
1661 movq 24(%rbp),%rax | |
1662 adcq $0,%rdx | |
1663 addq %r10,%r9 | |
1664 movq 32+8(%rsp),%rsi | |
1665 movq %rdx,%r10 | |
1666 adcq $0,%r10 | |
1667 | |
1668 mulq %rbx | |
1669 addq %rax,%r11 | |
1670 movq 32(%rbp),%rax | |
1671 adcq $0,%rdx | |
1672 imulq %r8,%rsi | |
1673 addq %r11,%r10 | |
1674 movq %rdx,%r11 | |
1675 adcq $0,%r11 | |
1676 | |
1677 mulq %rbx | |
1678 addq %rax,%r12 | |
1679 movq 40(%rbp),%rax | |
1680 adcq $0,%rdx | |
1681 addq %r12,%r11 | |
1682 movq %rdx,%r12 | |
1683 adcq $0,%r12 | |
1684 | |
1685 mulq %rbx | |
1686 addq %rax,%r13 | |
1687 movq 48(%rbp),%rax | |
1688 adcq $0,%rdx | |
1689 addq %r13,%r12 | |
1690 movq %rdx,%r13 | |
1691 adcq $0,%r13 | |
1692 | |
1693 mulq %rbx | |
1694 addq %rax,%r14 | |
1695 movq 56(%rbp),%rax | |
1696 adcq $0,%rdx | |
1697 addq %r14,%r13 | |
1698 movq %rdx,%r14 | |
1699 adcq $0,%r14 | |
1700 | |
1701 mulq %rbx | |
1702 movq %rsi,%rbx | |
1703 addq %rax,%r15 | |
1704 movq 0(%rbp),%rax | |
1705 adcq $0,%rdx | |
1706 addq %r15,%r14 | |
1707 movq %rdx,%r15 | |
1708 adcq $0,%r15 | |
1709 | |
1710 decl %ecx | |
1711 jnz L$8x_reduce | |
1712 | |
1713 leaq 64(%rbp),%rbp | |
1714 xorq %rax,%rax | |
1715 movq 8+8(%rsp),%rdx | |
1716 cmpq 0+8(%rsp),%rbp | |
1717 jae L$8x_no_tail | |
1718 | |
1719 .byte 0x66 | |
1720 addq 0(%rdi),%r8 | |
1721 adcq 8(%rdi),%r9 | |
1722 adcq 16(%rdi),%r10 | |
1723 adcq 24(%rdi),%r11 | |
1724 adcq 32(%rdi),%r12 | |
1725 adcq 40(%rdi),%r13 | |
1726 adcq 48(%rdi),%r14 | |
1727 adcq 56(%rdi),%r15 | |
1728 sbbq %rsi,%rsi | |
1729 | |
1730 movq 48+56+8(%rsp),%rbx | |
1731 movl $8,%ecx | |
1732 movq 0(%rbp),%rax | |
1733 jmp L$8x_tail | |
1734 | |
1735 .p2align 5 | |
1736 L$8x_tail: | |
1737 mulq %rbx | |
1738 addq %rax,%r8 | |
1739 movq 8(%rbp),%rax | |
1740 movq %r8,(%rdi) | |
1741 movq %rdx,%r8 | |
1742 adcq $0,%r8 | |
1743 | |
1744 mulq %rbx | |
1745 addq %rax,%r9 | |
1746 movq 16(%rbp),%rax | |
1747 adcq $0,%rdx | |
1748 addq %r9,%r8 | |
1749 leaq 8(%rdi),%rdi | |
1750 movq %rdx,%r9 | |
1751 adcq $0,%r9 | |
1752 | |
1753 mulq %rbx | |
1754 addq %rax,%r10 | |
1755 movq 24(%rbp),%rax | |
1756 adcq $0,%rdx | |
1757 addq %r10,%r9 | |
1758 movq %rdx,%r10 | |
1759 adcq $0,%r10 | |
1760 | |
1761 mulq %rbx | |
1762 addq %rax,%r11 | |
1763 movq 32(%rbp),%rax | |
1764 adcq $0,%rdx | |
1765 addq %r11,%r10 | |
1766 movq %rdx,%r11 | |
1767 adcq $0,%r11 | |
1768 | |
1769 mulq %rbx | |
1770 addq %rax,%r12 | |
1771 movq 40(%rbp),%rax | |
1772 adcq $0,%rdx | |
1773 addq %r12,%r11 | |
1774 movq %rdx,%r12 | |
1775 adcq $0,%r12 | |
1776 | |
1777 mulq %rbx | |
1778 addq %rax,%r13 | |
1779 movq 48(%rbp),%rax | |
1780 adcq $0,%rdx | |
1781 addq %r13,%r12 | |
1782 movq %rdx,%r13 | |
1783 adcq $0,%r13 | |
1784 | |
1785 mulq %rbx | |
1786 addq %rax,%r14 | |
1787 movq 56(%rbp),%rax | |
1788 adcq $0,%rdx | |
1789 addq %r14,%r13 | |
1790 movq %rdx,%r14 | |
1791 adcq $0,%r14 | |
1792 | |
1793 mulq %rbx | |
1794 movq 48-16+8(%rsp,%rcx,8),%rbx | |
1795 addq %rax,%r15 | |
1796 adcq $0,%rdx | |
1797 addq %r15,%r14 | |
1798 movq 0(%rbp),%rax | |
1799 movq %rdx,%r15 | |
1800 adcq $0,%r15 | |
1801 | |
1802 decl %ecx | |
1803 jnz L$8x_tail | |
1804 | |
1805 leaq 64(%rbp),%rbp | |
1806 movq 8+8(%rsp),%rdx | |
1807 cmpq 0+8(%rsp),%rbp | |
1808 jae L$8x_tail_done | |
1809 | |
1810 movq 48+56+8(%rsp),%rbx | |
1811 negq %rsi | |
1812 movq 0(%rbp),%rax | |
1813 adcq 0(%rdi),%r8 | |
1814 adcq 8(%rdi),%r9 | |
1815 adcq 16(%rdi),%r10 | |
1816 adcq 24(%rdi),%r11 | |
1817 adcq 32(%rdi),%r12 | |
1818 adcq 40(%rdi),%r13 | |
1819 adcq 48(%rdi),%r14 | |
1820 adcq 56(%rdi),%r15 | |
1821 sbbq %rsi,%rsi | |
1822 | |
1823 movl $8,%ecx | |
1824 jmp L$8x_tail | |
1825 | |
1826 .p2align 5 | |
1827 L$8x_tail_done: | |
1828 addq (%rdx),%r8 | |
1829 adcq $0,%r9 | |
1830 adcq $0,%r10 | |
1831 adcq $0,%r11 | |
1832 adcq $0,%r12 | |
1833 adcq $0,%r13 | |
1834 adcq $0,%r14 | |
1835 adcq $0,%r15 | |
1836 | |
1837 | |
1838 xorq %rax,%rax | |
1839 | |
1840 negq %rsi | |
1841 L$8x_no_tail: | |
1842 adcq 0(%rdi),%r8 | |
1843 adcq 8(%rdi),%r9 | |
1844 adcq 16(%rdi),%r10 | |
1845 adcq 24(%rdi),%r11 | |
1846 adcq 32(%rdi),%r12 | |
1847 adcq 40(%rdi),%r13 | |
1848 adcq 48(%rdi),%r14 | |
1849 adcq 56(%rdi),%r15 | |
1850 adcq $0,%rax | |
1851 movq -8(%rbp),%rcx | |
1852 xorq %rsi,%rsi | |
1853 | |
1854 .byte 102,72,15,126,213 | |
1855 | |
1856 movq %r8,0(%rdi) | |
1857 movq %r9,8(%rdi) | |
1858 .byte 102,73,15,126,217 | |
1859 movq %r10,16(%rdi) | |
1860 movq %r11,24(%rdi) | |
1861 movq %r12,32(%rdi) | |
1862 movq %r13,40(%rdi) | |
1863 movq %r14,48(%rdi) | |
1864 movq %r15,56(%rdi) | |
1865 leaq 64(%rdi),%rdi | |
1866 | |
1867 cmpq %rdx,%rdi | |
1868 jb L$8x_reduction_loop | |
1869 .byte 0xf3,0xc3 | |
1870 | |
1871 | |
1872 .p2align 5 | |
1873 __bn_post4x_internal: | |
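// Final branch-free conditional subtraction: subtract the modulus from the
// reduced value only when needed (mask derived from the reduction carry)
// and write the result to rp.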
1874 movq 0(%rbp),%r12 | |
1875 leaq (%rdi,%r9,1),%rbx | |
1876 movq %r9,%rcx | |
1877 .byte 102,72,15,126,207 | |
1878 negq %rax | |
1879 .byte 102,72,15,126,206 | |
1880 sarq $3+2,%rcx | |
1881 decq %r12 | |
1882 xorq %r10,%r10 | |
1883 movq 8(%rbp),%r13 | |
1884 movq 16(%rbp),%r14 | |
1885 movq 24(%rbp),%r15 | |
1886 jmp L$sqr4x_sub_entry | |
1887 | |
1888 .p2align 4 | |
1889 L$sqr4x_sub: | |
1890 movq 0(%rbp),%r12 | |
1891 movq 8(%rbp),%r13 | |
1892 movq 16(%rbp),%r14 | |
1893 movq 24(%rbp),%r15 | |
1894 L$sqr4x_sub_entry: | |
1895 leaq 32(%rbp),%rbp | |
1896 notq %r12 | |
1897 notq %r13 | |
1898 notq %r14 | |
1899 notq %r15 | |
1900 andq %rax,%r12 | |
1901 andq %rax,%r13 | |
1902 andq %rax,%r14 | |
1903 andq %rax,%r15 | |
1904 | |
1905 negq %r10 | |
1906 adcq 0(%rbx),%r12 | |
1907 adcq 8(%rbx),%r13 | |
1908 adcq 16(%rbx),%r14 | |
1909 adcq 24(%rbx),%r15 | |
1910 movq %r12,0(%rdi) | |
1911 leaq 32(%rbx),%rbx | |
1912 movq %r13,8(%rdi) | |
1913 sbbq %r10,%r10 | |
1914 movq %r14,16(%rdi) | |
1915 movq %r15,24(%rdi) | |
1916 leaq 32(%rdi),%rdi | |
1917 | |
1918 incq %rcx | |
1919 jnz L$sqr4x_sub | |
1920 | |
1921 movq %r9,%r10 | |
1922 negq %r9 | |
1923 .byte 0xf3,0xc3 | |
1924 | |
1925 .globl _bn_from_montgomery | |
1926 .private_extern _bn_from_montgomery | |
1927 | |
1928 .p2align 5 | |
1929 _bn_from_montgomery: | |
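// bn_from_montgomery: converts the input out of Montgomery form
// (multiplies by 1 and Montgomery-reduces).  Only num divisible by 8 is
// handled here; other sizes return 0 so the caller can fall back.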
1930 testl $7,%r9d | |
1931 jz bn_from_mont8x | |
1932 xorl %eax,%eax | |
1933 .byte 0xf3,0xc3 | |
1934 | |
1935 | |
1936 | |
1937 .p2align 5 | |
1938 bn_from_mont8x: | |
1939 .byte 0x67 | |
1940 movq %rsp,%rax | |
1941 pushq %rbx | |
1942 pushq %rbp | |
1943 pushq %r12 | |
1944 pushq %r13 | |
1945 pushq %r14 | |
1946 pushq %r15 | |
1947 | |
1948 shll $3,%r9d | |
1949 leaq (%r9,%r9,2),%r10 | |
1950 negq %r9 | |
1951 movq (%r8),%r8 | |
1952 | |
1953 | |
1954 | |
1955 | |
1956 | |
1957 | |
1958 | |
1959 | |
1960 leaq -320(%rsp,%r9,2),%r11 | |
1961 subq %rdi,%r11 | |
1962 andq $4095,%r11 | |
1963 cmpq %r11,%r10 | |
1964 jb L$from_sp_alt | |
1965 subq %r11,%rsp | |
1966 leaq -320(%rsp,%r9,2),%rsp | |
1967 jmp L$from_sp_done | |
1968 | |
1969 .p2align 5 | |
1970 L$from_sp_alt: | |
1971 leaq 4096-320(,%r9,2),%r10 | |
1972 leaq -320(%rsp,%r9,2),%rsp | |
1973 subq %r10,%r11 | |
1974 movq $0,%r10 | |
1975 cmovcq %r10,%r11 | |
1976 subq %r11,%rsp | |
1977 L$from_sp_done: | |
1978 andq $-64,%rsp | |
1979 movq %r9,%r10 | |
1980 negq %r9 | |
1981 | |
1982 | |
1983 | |
1984 | |
1985 | |
1986 | |
1987 | |
1988 | |
1989 | |
1990 | |
1991 movq %r8,32(%rsp) | |
1992 movq %rax,40(%rsp) | |
1993 L$from_body: | |
1994 movq %r9,%r11 | |
1995 leaq 48(%rsp),%rax | |
1996 pxor %xmm0,%xmm0 | |
1997 jmp L$mul_by_1 | |
1998 | |
1999 .p2align 5 | |
2000 L$mul_by_1: | |
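// Copy the input into the low half of the scratch area and zero the high
// half, so the 8x reduction below effectively multiplies by 1.  The .byte
// sequence in this loop is a hand-encoded leaq 64(%rsi),%rsi.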
2001 movdqu (%rsi),%xmm1 | |
2002 movdqu 16(%rsi),%xmm2 | |
2003 movdqu 32(%rsi),%xmm3 | |
2004 movdqa %xmm0,(%rax,%r9,1) | |
2005 movdqu 48(%rsi),%xmm4 | |
2006 movdqa %xmm0,16(%rax,%r9,1) | |
2007 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 | |
2008 movdqa %xmm1,(%rax) | |
2009 movdqa %xmm0,32(%rax,%r9,1) | |
2010 movdqa %xmm2,16(%rax) | |
2011 movdqa %xmm0,48(%rax,%r9,1) | |
2012 movdqa %xmm3,32(%rax) | |
2013 movdqa %xmm4,48(%rax) | |
2014 leaq 64(%rax),%rax | |
2015 subq $64,%r11 | |
2016 jnz L$mul_by_1 | |
2017 | |
2018 .byte 102,72,15,110,207 | |
2019 .byte 102,72,15,110,209 | |
2020 .byte 0x67 | |
2021 movq %rcx,%rbp | |
2022 .byte 102,73,15,110,218 | |
2023 call __bn_sqr8x_reduction | |
2024 call __bn_post4x_internal | |
2025 | |
2026 pxor %xmm0,%xmm0 | |
2027 leaq 48(%rsp),%rax | |
2028 movq 40(%rsp),%rsi | |
2029 jmp L$from_mont_zero | |
2030 | |
2031 .p2align 5 | |
2032 L$from_mont_zero: | |
2033 movdqa %xmm0,0(%rax) | |
2034 movdqa %xmm0,16(%rax) | |
2035 movdqa %xmm0,32(%rax) | |
2036 movdqa %xmm0,48(%rax) | |
2037 leaq 64(%rax),%rax | |
2038 subq $32,%r9 | |
2039 jnz L$from_mont_zero | |
2040 | |
2041 movq $1,%rax | |
2042 movq -48(%rsi),%r15 | |
2043 movq -40(%rsi),%r14 | |
2044 movq -32(%rsi),%r13 | |
2045 movq -24(%rsi),%r12 | |
2046 movq -16(%rsi),%rbp | |
2047 movq -8(%rsi),%rbx | |
2048 leaq (%rsi),%rsp | |
2049 L$from_epilogue: | |
2050 .byte 0xf3,0xc3 | |
2051 | |
2052 .globl _bn_scatter5 | |
2053 .private_extern _bn_scatter5 | |
2054 | |
2055 .p2align 4 | |
2056 _bn_scatter5: | |
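// bn_scatter5(in=%rdi, num=%esi, table=%rdx, power=%rcx): store num words
// at table+power*8 with a 256-byte stride, building one column of the
// 32-entry window table.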
2057 cmpl $0,%esi | |
2058 jz L$scatter_epilogue | |
2059 leaq (%rdx,%rcx,8),%rdx | |
2060 L$scatter: | |
2061 movq (%rdi),%rax | |
2062 leaq 8(%rdi),%rdi | |
2063 movq %rax,(%rdx) | |
2064 leaq 256(%rdx),%rdx | |
2065 subl $1,%esi | |
2066 jnz L$scatter | |
2067 L$scatter_epilogue: | |
2068 .byte 0xf3,0xc3 | |
2069 | |
2070 | |
2071 .globl _bn_gather5 | |
2072 .private_extern _bn_gather5 | |
2073 | |
2074 .p2align 5 | |
2075 _bn_gather5: | |
2076 L$SEH_begin_bn_gather5: | |
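// bn_gather5(out=%rdi, num=%esi, table=%rdx, power=%ecx): constant-time
// read of one num-word table column using the same mask technique as
// above.  The two .byte sequences below are hand-encoded
// leaq (%rsp),%r10 and subq $264,%rsp.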
2077 | |
2078 .byte 0x4c,0x8d,0x14,0x24 | |
2079 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 | |
2080 leaq L$inc(%rip),%rax | |
2081 andq $-16,%rsp | |
2082 | |
2083 movd %ecx,%xmm5 | |
2084 movdqa 0(%rax),%xmm0 | |
2085 movdqa 16(%rax),%xmm1 | |
2086 leaq 128(%rdx),%r11 | |
2087 leaq 128(%rsp),%rax | |
2088 | |
2089 pshufd $0,%xmm5,%xmm5 | |
2090 movdqa %xmm1,%xmm4 | |
2091 movdqa %xmm1,%xmm2 | |
2092 paddd %xmm0,%xmm1 | |
2093 pcmpeqd %xmm5,%xmm0 | |
2094 movdqa %xmm4,%xmm3 | |
2095 | |
2096 paddd %xmm1,%xmm2 | |
2097 pcmpeqd %xmm5,%xmm1 | |
2098 movdqa %xmm0,-128(%rax) | |
2099 movdqa %xmm4,%xmm0 | |
2100 | |
2101 paddd %xmm2,%xmm3 | |
2102 pcmpeqd %xmm5,%xmm2 | |
2103 movdqa %xmm1,-112(%rax) | |
2104 movdqa %xmm4,%xmm1 | |
2105 | |
2106 paddd %xmm3,%xmm0 | |
2107 pcmpeqd %xmm5,%xmm3 | |
2108 movdqa %xmm2,-96(%rax) | |
2109 movdqa %xmm4,%xmm2 | |
2110 paddd %xmm0,%xmm1 | |
2111 pcmpeqd %xmm5,%xmm0 | |
2112 movdqa %xmm3,-80(%rax) | |
2113 movdqa %xmm4,%xmm3 | |
2114 | |
2115 paddd %xmm1,%xmm2 | |
2116 pcmpeqd %xmm5,%xmm1 | |
2117 movdqa %xmm0,-64(%rax) | |
2118 movdqa %xmm4,%xmm0 | |
2119 | |
2120 paddd %xmm2,%xmm3 | |
2121 pcmpeqd %xmm5,%xmm2 | |
2122 movdqa %xmm1,-48(%rax) | |
2123 movdqa %xmm4,%xmm1 | |
2124 | |
2125 paddd %xmm3,%xmm0 | |
2126 pcmpeqd %xmm5,%xmm3 | |
2127 movdqa %xmm2,-32(%rax) | |
2128 movdqa %xmm4,%xmm2 | |
2129 paddd %xmm0,%xmm1 | |
2130 pcmpeqd %xmm5,%xmm0 | |
2131 movdqa %xmm3,-16(%rax) | |
2132 movdqa %xmm4,%xmm3 | |
2133 | |
2134 paddd %xmm1,%xmm2 | |
2135 pcmpeqd %xmm5,%xmm1 | |
2136 movdqa %xmm0,0(%rax) | |
2137 movdqa %xmm4,%xmm0 | |
2138 | |
2139 paddd %xmm2,%xmm3 | |
2140 pcmpeqd %xmm5,%xmm2 | |
2141 movdqa %xmm1,16(%rax) | |
2142 movdqa %xmm4,%xmm1 | |
2143 | |
2144 paddd %xmm3,%xmm0 | |
2145 pcmpeqd %xmm5,%xmm3 | |
2146 movdqa %xmm2,32(%rax) | |
2147 movdqa %xmm4,%xmm2 | |
2148 paddd %xmm0,%xmm1 | |
2149 pcmpeqd %xmm5,%xmm0 | |
2150 movdqa %xmm3,48(%rax) | |
2151 movdqa %xmm4,%xmm3 | |
2152 | |
2153 paddd %xmm1,%xmm2 | |
2154 pcmpeqd %xmm5,%xmm1 | |
2155 movdqa %xmm0,64(%rax) | |
2156 movdqa %xmm4,%xmm0 | |
2157 | |
2158 paddd %xmm2,%xmm3 | |
2159 pcmpeqd %xmm5,%xmm2 | |
2160 movdqa %xmm1,80(%rax) | |
2161 movdqa %xmm4,%xmm1 | |
2162 | |
2163 paddd %xmm3,%xmm0 | |
2164 pcmpeqd %xmm5,%xmm3 | |
2165 movdqa %xmm2,96(%rax) | |
2166 movdqa %xmm4,%xmm2 | |
2167 movdqa %xmm3,112(%rax) | |
2168 jmp L$gather | |
2169 | |
2170 .p2align 5 | |
2171 L$gather: | |
2172 pxor %xmm4,%xmm4 | |
2173 pxor %xmm5,%xmm5 | |
2174 movdqa -128(%r11),%xmm0 | |
2175 movdqa -112(%r11),%xmm1 | |
2176 movdqa -96(%r11),%xmm2 | |
2177 pand -128(%rax),%xmm0 | |
2178 movdqa -80(%r11),%xmm3 | |
2179 pand -112(%rax),%xmm1 | |
2180 por %xmm0,%xmm4 | |
2181 pand -96(%rax),%xmm2 | |
2182 por %xmm1,%xmm5 | |
2183 pand -80(%rax),%xmm3 | |
2184 por %xmm2,%xmm4 | |
2185 por %xmm3,%xmm5 | |
2186 movdqa -64(%r11),%xmm0 | |
2187 movdqa -48(%r11),%xmm1 | |
2188 movdqa -32(%r11),%xmm2 | |
2189 pand -64(%rax),%xmm0 | |
2190 movdqa -16(%r11),%xmm3 | |
2191 pand -48(%rax),%xmm1 | |
2192 por %xmm0,%xmm4 | |
2193 pand -32(%rax),%xmm2 | |
2194 por %xmm1,%xmm5 | |
2195 pand -16(%rax),%xmm3 | |
2196 por %xmm2,%xmm4 | |
2197 por %xmm3,%xmm5 | |
2198 movdqa 0(%r11),%xmm0 | |
2199 movdqa 16(%r11),%xmm1 | |
2200 movdqa 32(%r11),%xmm2 | |
2201 pand 0(%rax),%xmm0 | |
2202 movdqa 48(%r11),%xmm3 | |
2203 pand 16(%rax),%xmm1 | |
2204 por %xmm0,%xmm4 | |
2205 pand 32(%rax),%xmm2 | |
2206 por %xmm1,%xmm5 | |
2207 pand 48(%rax),%xmm3 | |
2208 por %xmm2,%xmm4 | |
2209 por %xmm3,%xmm5 | |
2210 movdqa 64(%r11),%xmm0 | |
2211 movdqa 80(%r11),%xmm1 | |
2212 movdqa 96(%r11),%xmm2 | |
2213 pand 64(%rax),%xmm0 | |
2214 movdqa 112(%r11),%xmm3 | |
2215 pand 80(%rax),%xmm1 | |
2216 por %xmm0,%xmm4 | |
2217 pand 96(%rax),%xmm2 | |
2218 por %xmm1,%xmm5 | |
2219 pand 112(%rax),%xmm3 | |
2220 por %xmm2,%xmm4 | |
2221 por %xmm3,%xmm5 | |
2222 por %xmm5,%xmm4 | |
2223 leaq 256(%r11),%r11 | |
2224 pshufd $0x4e,%xmm4,%xmm0 | |
2225 por %xmm4,%xmm0 | |
2226 movq %xmm0,(%rdi) | |
2227 leaq 8(%rdi),%rdi | |
2228 subl $1,%esi | |
2229 jnz L$gather | |
2230 | |
2231 leaq (%r10),%rsp | |
2232 .byte 0xf3,0xc3 | |
2233 L$SEH_end_bn_gather5: | |
2234 | |
2235 .p2align 6 | |
2236 L$inc: | |
2237 .long 0,0, 1,1 | |
2238 .long 2,2, 2,2 | |
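// The .byte string below is the ASCII attribution: "Montgomery
// Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
// <appro@openssl.org>".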
2239 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 | |
2240 #endif | |