OLD | NEW |
| (Empty) |
1 .text | |
2 | |
# bn_mul_mont_gather5 -- Montgomery multiply/reduce, one word per inner step,
# with the multiplier words fetched from a 256-byte-stride table by a masked
# gather.  x86-64, AT&T syntax.
#
# Register use on entry, as read by the code below:
#   %rdi     result vector (written by .Lsub / .Lcopy)
#   %rsi     first operand words
#   %rdx     gather table base
#   %rcx     modulus words (multiplied by m, subtracted in .Lsub)
#   %r8      pointer to one word, dereferenced once (the reduction constant)
#   %r9d     word count
#   8(%rsp)  32-bit table index (stack argument)
# Returns 1 in %rax.  Saves/restores %rbx, %rbp, %r12-%r15; uses %xmm0-%xmm7.
# NOTE(review): presumably the C prototype is
#   int bn_mul_mont_gather5(rp, ap, table, np, n0, num, power) -- confirm.
3 .globl bn_mul_mont_gather5 | |
4 .type bn_mul_mont_gather5,@function | |
5 .align 64 | |
6 bn_mul_mont_gather5: | |
# Dispatch: a count that is a multiple of 4 and at least 8 goes to the
# 4x-unrolled path; every other count uses the generic loop below.
7 testl $3,%r9d | |
8 jnz .Lmul_enter | |
9 cmpl $8,%r9d | |
10 jb .Lmul_enter | |
11 jmp .Lmul4x_enter | |
12 | |
13 .align 16 | |
14 .Lmul_enter: | |
# Zero-extend the 32-bit count, and fetch the stack argument before the
# pushes below move %rsp.
15 movl %r9d,%r9d | |
16 movl 8(%rsp),%r10d | |
17 pushq %rbx | |
18 pushq %rbp | |
19 pushq %r12 | |
20 pushq %r13 | |
21 pushq %r14 | |
22 pushq %r15 | |
# Carve out (count+2) qwords of temp space and round %rsp down to a
# 1024-byte boundary.  The pre-allocation %rsp is parked in slot count+1.
23 movq %rsp,%rax | |
24 leaq 2(%r9),%r11 | |
25 negq %r11 | |
26 leaq (%rsp,%r11,8),%rsp | |
27 andq $-1024,%rsp | |
28 | |
29 movq %rax,8(%rsp,%r9,8) | |
30 .Lmul_body: | |
# Build the gather state from the index in %r10:
#   %r11 = index & 7            -> qword column inside each 64-byte group
#   %r10 = ~(index >> 3) & 3    -> starting qword in .Lmagic_masks
# Four consecutive mask qwords are loaded into %xmm4..%xmm7; given the
# table contents exactly one of them is all-ones, chosen by index bits 3-4.
# %r12 is biased by +96 so the four candidate loads sit at -96/-32/+32/+96.
31 movq %rdx,%r12 | |
32 movq %r10,%r11 | |
33 shrq $3,%r10 | |
34 andq $7,%r11 | |
35 notq %r10 | |
36 leaq .Lmagic_masks(%rip),%rax | |
37 andq $3,%r10 | |
38 leaq 96(%r12,%r11,8),%r12 | |
39 movq 0(%rax,%r10,8),%xmm4 | |
40 movq 8(%rax,%r10,8),%xmm5 | |
41 movq 16(%rax,%r10,8),%xmm6 | |
42 movq 24(%rax,%r10,8),%xmm7 | |
43 | |
# Gather multiplier word 0: load all four candidates, mask, OR together,
# then step %r12 by 256 bytes to the next word's row.
# NOTE(review): bits 3-4 of the index are hidden by the masks, but the load
# addresses still depend on (index & 7) -- check against cache-timing
# requirements of callers.
44 movq -96(%r12),%xmm0 | |
45 movq -32(%r12),%xmm1 | |
46 pand %xmm4,%xmm0 | |
47 movq 32(%r12),%xmm2 | |
48 pand %xmm5,%xmm1 | |
49 movq 96(%r12),%xmm3 | |
50 pand %xmm6,%xmm2 | |
51 por %xmm1,%xmm0 | |
52 pand %xmm7,%xmm3 | |
53 por %xmm2,%xmm0 | |
54 leaq 256(%r12),%r12 | |
55 por %xmm3,%xmm0 | |
56 | |
# The byte sequence 66 48 0F 7E C3 encodes: movq %xmm0,%rbx
57 .byte 102,72,15,126,195 | |
58 | |
# %r8 = the word %r8 pointed at; %rax = first operand word 0.
59 movq (%r8),%r8 | |
60 movq (%rsi),%rax | |
61 | |
# %r14 = outer (multiplier word) counter, %r15 = inner word counter.
62 xorq %r14,%r14 | |
63 xorq %r15,%r15 | |
64 | |
# The gather of the NEXT multiplier word is interleaved with the first
# multiply/reduce step below.
65 movq -96(%r12),%xmm0 | |
66 movq -32(%r12),%xmm1 | |
67 pand %xmm4,%xmm0 | |
68 movq 32(%r12),%xmm2 | |
69 pand %xmm5,%xmm1 | |
70 | |
71 movq %r8,%rbp | |
72 mulq %rbx | |
73 movq %rax,%r10 | |
74 movq (%rcx),%rax | |
75 | |
76 movq 96(%r12),%xmm3 | |
77 pand %xmm6,%xmm2 | |
78 por %xmm1,%xmm0 | |
79 pand %xmm7,%xmm3 | |
80 | |
# %rbp = m = (reduction constant) * (low word of the product), low 64 bits.
81 imulq %r10,%rbp | |
82 movq %rdx,%r11 | |
83 | |
84 por %xmm2,%xmm0 | |
85 leaq 256(%r12),%r12 | |
86 por %xmm3,%xmm0 | |
87 | |
# modulus[0]*m is added to the low word; only the carry out of that sum is
# kept (%r10 is overwritten before being read again).
88 mulq %rbp | |
89 addq %rax,%r10 | |
90 movq 8(%rsi),%rax | |
91 adcq $0,%rdx | |
92 movq %rdx,%r13 | |
93 | |
94 leaq 1(%r15),%r15 | |
95 jmp .L1st_enter | |
96 | |
97 .align 16 | |
# First pass (multiplier word 0): the temp vector is written, not read.
# %r11/%r10 carry the operand*multiplier chain, %r13 the modulus*m chain.
98 .L1st: | |
99 addq %rax,%r13 | |
100 movq (%rsi,%r15,8),%rax | |
101 adcq $0,%rdx | |
102 addq %r11,%r13 | |
103 movq %r10,%r11 | |
104 adcq $0,%rdx | |
105 movq %r13,-16(%rsp,%r15,8) | |
106 movq %rdx,%r13 | |
107 | |
108 .L1st_enter: | |
109 mulq %rbx | |
110 addq %rax,%r11 | |
111 movq (%rcx,%r15,8),%rax | |
112 adcq $0,%rdx | |
113 leaq 1(%r15),%r15 | |
114 movq %rdx,%r10 | |
115 | |
116 mulq %rbp | |
117 cmpq %r9,%r15 | |
118 jne .L1st | |
119 | |
# movq %xmm0,%rbx -- switch to the multiplier word gathered above.
120 .byte 102,72,15,126,195 | |
121 | |
122 addq %rax,%r13 | |
123 movq (%rsi),%rax | |
124 adcq $0,%rdx | |
125 addq %r11,%r13 | |
126 adcq $0,%rdx | |
127 movq %r13,-16(%rsp,%r15,8) | |
128 movq %rdx,%r13 | |
129 movq %r10,%r11 | |
130 | |
# Fold the two carry chains into temp slots count-1 and count (top carry).
131 xorq %rdx,%rdx | |
132 addq %r11,%r13 | |
133 adcq $0,%rdx | |
134 movq %r13,-8(%rsp,%r9,8) | |
135 movq %rdx,(%rsp,%r9,8) | |
136 | |
137 leaq 1(%r14),%r14 | |
138 jmp .Louter | |
139 .align 16 | |
# Outer loop: one iteration per remaining multiplier word.  Same shape as the
# first pass, except the previous temp vector is added in (%r10 holds the
# temp word being accumulated).
140 .Louter: | |
141 xorq %r15,%r15 | |
142 movq %r8,%rbp | |
143 movq (%rsp),%r10 | |
144 | |
145 movq -96(%r12),%xmm0 | |
146 movq -32(%r12),%xmm1 | |
147 pand %xmm4,%xmm0 | |
148 movq 32(%r12),%xmm2 | |
149 pand %xmm5,%xmm1 | |
150 | |
151 mulq %rbx | |
152 addq %rax,%r10 | |
153 movq (%rcx),%rax | |
154 adcq $0,%rdx | |
155 | |
156 movq 96(%r12),%xmm3 | |
157 pand %xmm6,%xmm2 | |
158 por %xmm1,%xmm0 | |
159 pand %xmm7,%xmm3 | |
160 | |
161 imulq %r10,%rbp | |
162 movq %rdx,%r11 | |
163 | |
164 por %xmm2,%xmm0 | |
165 leaq 256(%r12),%r12 | |
166 por %xmm3,%xmm0 | |
167 | |
168 mulq %rbp | |
169 addq %rax,%r10 | |
170 movq 8(%rsi),%rax | |
171 adcq $0,%rdx | |
172 movq 8(%rsp),%r10 | |
173 movq %rdx,%r13 | |
174 | |
175 leaq 1(%r15),%r15 | |
176 jmp .Linner_enter | |
177 | |
178 .align 16 | |
179 .Linner: | |
180 addq %rax,%r13 | |
181 movq (%rsi,%r15,8),%rax | |
182 adcq $0,%rdx | |
183 addq %r10,%r13 | |
184 movq (%rsp,%r15,8),%r10 | |
185 adcq $0,%rdx | |
186 movq %r13,-16(%rsp,%r15,8) | |
187 movq %rdx,%r13 | |
188 | |
189 .Linner_enter: | |
190 mulq %rbx | |
191 addq %rax,%r11 | |
192 movq (%rcx,%r15,8),%rax | |
193 adcq $0,%rdx | |
194 addq %r11,%r10 | |
195 movq %rdx,%r11 | |
196 adcq $0,%r11 | |
197 leaq 1(%r15),%r15 | |
198 | |
199 mulq %rbp | |
200 cmpq %r9,%r15 | |
201 jne .Linner | |
202 | |
# movq %xmm0,%rbx -- next multiplier word.
203 .byte 102,72,15,126,195 | |
204 | |
205 addq %rax,%r13 | |
206 movq (%rsi),%rax | |
207 adcq $0,%rdx | |
208 addq %r10,%r13 | |
209 movq (%rsp,%r15,8),%r10 | |
210 adcq $0,%rdx | |
211 movq %r13,-16(%rsp,%r15,8) | |
212 movq %rdx,%r13 | |
213 | |
# Top of the vector: add both carry chains plus the previous top word.
214 xorq %rdx,%rdx | |
215 addq %r11,%r13 | |
216 adcq $0,%rdx | |
217 addq %r10,%r13 | |
218 adcq $0,%rdx | |
219 movq %r13,-8(%rsp,%r9,8) | |
220 movq %rdx,(%rsp,%r9,8) | |
221 | |
222 leaq 1(%r14),%r14 | |
223 cmpq %r9,%r14 | |
224 jl .Louter | |
225 | |
# Final step: write (temp - modulus) to the result with a borrow chain.
# The xorq also clears CF, so the first sbbq starts with no borrow; nothing
# between here and the loop's sbbq touches CF (decq leaves CF alone).
226 xorq %r14,%r14 | |
227 movq (%rsp),%rax | |
228 leaq (%rsp),%rsi | |
229 movq %r9,%r15 | |
230 jmp .Lsub | |
231 .align 16 | |
232 .Lsub: sbbq (%rcx,%r14,8),%rax | |
233 movq %rax,(%rdi,%r14,8) | |
234 movq 8(%rsi,%r14,8),%rax | |
235 leaq 1(%r14),%r14 | |
236 decq %r15 | |
237 jnz .Lsub | |
238 | |
# %rax holds the top temp word; subtract the final borrow and use the result
# as a mask (expected to be 0 or all-ones here) for a branch-free pointer
# select:  %rsi = (temp & mask) | (result & ~mask).
239 sbbq $0,%rax | |
240 xorq %r14,%r14 | |
241 andq %rax,%rsi | |
242 notq %rax | |
243 movq %rdi,%rcx | |
244 andq %rax,%rcx | |
245 movq %r9,%r15 | |
246 orq %rcx,%rsi | |
247 .align 16 | |
# Copy count words from the selected source into the result, overwriting
# each temp word with its own index as it goes (wipes the intermediate).
248 .Lcopy: | |
249 movq (%rsi,%r14,8),%rax | |
250 movq %r14,(%rsp,%r14,8) | |
251 movq %rax,(%rdi,%r14,8) | |
252 leaq 1(%r14),%r14 | |
253 subq $1,%r15 | |
254 jnz .Lcopy | |
255 | |
# Epilogue: fetch the pre-allocation %rsp from slot count+1, return 1,
# reload the six saved registers and drop the frame.
256 movq 8(%rsp,%r9,8),%rsi | |
257 movq $1,%rax | |
258 movq (%rsi),%r15 | |
259 movq 8(%rsi),%r14 | |
260 movq 16(%rsi),%r13 | |
261 movq 24(%rsi),%r12 | |
262 movq 32(%rsi),%rbp | |
263 movq 40(%rsi),%rbx | |
264 leaq 48(%rsi),%rsp | |
265 .Lmul_epilogue: | |
# 0xf3,0xc3 = "rep ret"
266 .byte 0xf3,0xc3 | |
267 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 | |
# bn_mul4x_mont_gather5 -- 4x-unrolled variant of the multiply/reduce loop.
# It has no .globl; within this listing it is reached via the jump to
# .Lmul4x_enter from bn_mul_mont_gather5, with the same register/stack
# arguments (%rdi result, %rsi operand, %rdx table, %rcx modulus, %r8 pointer
# to the reduction constant, %r9d word count, 8(%rsp) table index).
# The dispatcher only sends counts that are multiples of 4 and >= 8 here.
# Returns 1 in %rax.  %rdi is reused as an accumulator, so the result
# pointer is spilled to the frame and reloaded before the final subtraction.
268 .type bn_mul4x_mont_gather5,@function | |
269 .align 16 | |
270 bn_mul4x_mont_gather5: | |
271 .Lmul4x_enter: | |
# Zero-extend the count; fetch the stack argument before pushing.
272 movl %r9d,%r9d | |
273 movl 8(%rsp),%r10d | |
274 pushq %rbx | |
275 pushq %rbp | |
276 pushq %r12 | |
277 pushq %r13 | |
278 pushq %r14 | |
279 pushq %r15 | |
# Frame: (count+4) qwords, %rsp rounded down to 1024 bytes.
# Slot count+1 = pre-allocation %rsp, slot count+2 = result pointer.
280 movq %rsp,%rax | |
281 leaq 4(%r9),%r11 | |
282 negq %r11 | |
283 leaq (%rsp,%r11,8),%rsp | |
284 andq $-1024,%rsp | |
285 | |
286 movq %rax,8(%rsp,%r9,8) | |
287 .Lmul4x_body: | |
288 movq %rdi,16(%rsp,%r9,8) | |
# Gather setup: %r11 = index & 7 (column), %r10 = ~(index >> 3) & 3 (mask
# start); %xmm4..%xmm7 receive four consecutive mask qwords, and %r12 is
# biased by +96 so candidates sit at -96/-32/+32/+96.
289 movq %rdx,%r12 | |
290 movq %r10,%r11 | |
291 shrq $3,%r10 | |
292 andq $7,%r11 | |
293 notq %r10 | |
294 leaq .Lmagic_masks(%rip),%rax | |
295 andq $3,%r10 | |
296 leaq 96(%r12,%r11,8),%r12 | |
297 movq 0(%rax,%r10,8),%xmm4 | |
298 movq 8(%rax,%r10,8),%xmm5 | |
299 movq 16(%rax,%r10,8),%xmm6 | |
300 movq 24(%rax,%r10,8),%xmm7 | |
301 | |
# Gather multiplier word 0 (mask + OR of four candidates), then advance the
# table pointer by 256 bytes.
302 movq -96(%r12),%xmm0 | |
303 movq -32(%r12),%xmm1 | |
304 pand %xmm4,%xmm0 | |
305 movq 32(%r12),%xmm2 | |
306 pand %xmm5,%xmm1 | |
307 movq 96(%r12),%xmm3 | |
308 pand %xmm6,%xmm2 | |
309 por %xmm1,%xmm0 | |
310 pand %xmm7,%xmm3 | |
311 por %xmm2,%xmm0 | |
312 leaq 256(%r12),%r12 | |
313 por %xmm3,%xmm0 | |
314 | |
# 66 48 0F 7E C3 = movq %xmm0,%rbx
315 .byte 102,72,15,126,195 | |
316 movq (%r8),%r8 | |
317 movq (%rsi),%rax | |
318 | |
# %r14 = outer counter, %r15 = inner word counter.
319 xorq %r14,%r14 | |
320 xorq %r15,%r15 | |
321 | |
# Gather of the next multiplier word, interleaved with the first products.
322 movq -96(%r12),%xmm0 | |
323 movq -32(%r12),%xmm1 | |
324 pand %xmm4,%xmm0 | |
325 movq 32(%r12),%xmm2 | |
326 pand %xmm5,%xmm1 | |
327 | |
328 movq %r8,%rbp | |
329 mulq %rbx | |
330 movq %rax,%r10 | |
331 movq (%rcx),%rax | |
332 | |
333 movq 96(%r12),%xmm3 | |
334 pand %xmm6,%xmm2 | |
335 por %xmm1,%xmm0 | |
336 pand %xmm7,%xmm3 | |
337 | |
# %rbp = m = (reduction constant) * (low product word), low 64 bits.
338 imulq %r10,%rbp | |
339 movq %rdx,%r11 | |
340 | |
341 por %xmm2,%xmm0 | |
342 leaq 256(%r12),%r12 | |
343 por %xmm3,%xmm0 | |
344 | |
# Only the carry out of (low word + modulus[0]*m) is kept.
345 mulq %rbp | |
346 addq %rax,%r10 | |
347 movq 8(%rsi),%rax | |
348 adcq $0,%rdx | |
349 movq %rdx,%rdi | |
350 | |
351 mulq %rbx | |
352 addq %rax,%r11 | |
353 movq 8(%rcx),%rax | |
354 adcq $0,%rdx | |
355 movq %rdx,%r10 | |
356 | |
357 mulq %rbp | |
358 addq %rax,%rdi | |
359 movq 16(%rsi),%rax | |
360 adcq $0,%rdx | |
361 addq %r11,%rdi | |
362 leaq 4(%r15),%r15 | |
363 adcq $0,%rdx | |
364 movq %rdi,(%rsp) | |
365 movq %rdx,%r13 | |
366 jmp .L1st4x | |
367 .align 16 | |
# First pass, four words per iteration.  %r10/%r11 alternate as the
# operand*multiplier carry, %r13/%rdi alternate as the modulus*m accumulator.
368 .L1st4x: | |
369 mulq %rbx | |
370 addq %rax,%r10 | |
371 movq -16(%rcx,%r15,8),%rax | |
372 adcq $0,%rdx | |
373 movq %rdx,%r11 | |
374 | |
375 mulq %rbp | |
376 addq %rax,%r13 | |
377 movq -8(%rsi,%r15,8),%rax | |
378 adcq $0,%rdx | |
379 addq %r10,%r13 | |
380 adcq $0,%rdx | |
381 movq %r13,-24(%rsp,%r15,8) | |
382 movq %rdx,%rdi | |
383 | |
384 mulq %rbx | |
385 addq %rax,%r11 | |
386 movq -8(%rcx,%r15,8),%rax | |
387 adcq $0,%rdx | |
388 movq %rdx,%r10 | |
389 | |
390 mulq %rbp | |
391 addq %rax,%rdi | |
392 movq (%rsi,%r15,8),%rax | |
393 adcq $0,%rdx | |
394 addq %r11,%rdi | |
395 adcq $0,%rdx | |
396 movq %rdi,-16(%rsp,%r15,8) | |
397 movq %rdx,%r13 | |
398 | |
399 mulq %rbx | |
400 addq %rax,%r10 | |
401 movq (%rcx,%r15,8),%rax | |
402 adcq $0,%rdx | |
403 movq %rdx,%r11 | |
404 | |
405 mulq %rbp | |
406 addq %rax,%r13 | |
407 movq 8(%rsi,%r15,8),%rax | |
408 adcq $0,%rdx | |
409 addq %r10,%r13 | |
410 adcq $0,%rdx | |
411 movq %r13,-8(%rsp,%r15,8) | |
412 movq %rdx,%rdi | |
413 | |
414 mulq %rbx | |
415 addq %rax,%r11 | |
416 movq 8(%rcx,%r15,8),%rax | |
417 adcq $0,%rdx | |
418 leaq 4(%r15),%r15 | |
419 movq %rdx,%r10 | |
420 | |
421 mulq %rbp | |
422 addq %rax,%rdi | |
423 movq -16(%rsi,%r15,8),%rax | |
424 adcq $0,%rdx | |
425 addq %r11,%rdi | |
426 adcq $0,%rdx | |
427 movq %rdi,-32(%rsp,%r15,8) | |
428 movq %rdx,%r13 | |
429 cmpq %r9,%r15 | |
430 jl .L1st4x | |
431 | |
# Loop tail: the last two words of the first pass.
432 mulq %rbx | |
433 addq %rax,%r10 | |
434 movq -16(%rcx,%r15,8),%rax | |
435 adcq $0,%rdx | |
436 movq %rdx,%r11 | |
437 | |
438 mulq %rbp | |
439 addq %rax,%r13 | |
440 movq -8(%rsi,%r15,8),%rax | |
441 adcq $0,%rdx | |
442 addq %r10,%r13 | |
443 adcq $0,%rdx | |
444 movq %r13,-24(%rsp,%r15,8) | |
445 movq %rdx,%rdi | |
446 | |
447 mulq %rbx | |
448 addq %rax,%r11 | |
449 movq -8(%rcx,%r15,8),%rax | |
450 adcq $0,%rdx | |
451 movq %rdx,%r10 | |
452 | |
453 mulq %rbp | |
454 addq %rax,%rdi | |
455 movq (%rsi),%rax | |
456 adcq $0,%rdx | |
457 addq %r11,%rdi | |
458 adcq $0,%rdx | |
459 movq %rdi,-16(%rsp,%r15,8) | |
460 movq %rdx,%r13 | |
461 | |
# movq %xmm0,%rbx -- switch to the multiplier word gathered above.
462 .byte 102,72,15,126,195 | |
463 | |
# Combine the carry chains into temp slots count-1 and count.
464 xorq %rdi,%rdi | |
465 addq %r10,%r13 | |
466 adcq $0,%rdi | |
467 movq %r13,-8(%rsp,%r15,8) | |
468 movq %rdi,(%rsp,%r15,8) | |
469 | |
470 leaq 1(%r14),%r14 | |
471 .align 4 | |
# Outer loop: one iteration per remaining multiplier word; the previous temp
# vector is added in with explicit addq from the stack.
472 .Louter4x: | |
473 xorq %r15,%r15 | |
474 movq -96(%r12),%xmm0 | |
475 movq -32(%r12),%xmm1 | |
476 pand %xmm4,%xmm0 | |
477 movq 32(%r12),%xmm2 | |
478 pand %xmm5,%xmm1 | |
479 | |
480 movq (%rsp),%r10 | |
481 movq %r8,%rbp | |
482 mulq %rbx | |
483 addq %rax,%r10 | |
484 movq (%rcx),%rax | |
485 adcq $0,%rdx | |
486 | |
487 movq 96(%r12),%xmm3 | |
488 pand %xmm6,%xmm2 | |
489 por %xmm1,%xmm0 | |
490 pand %xmm7,%xmm3 | |
491 | |
492 imulq %r10,%rbp | |
493 movq %rdx,%r11 | |
494 | |
495 por %xmm2,%xmm0 | |
496 leaq 256(%r12),%r12 | |
497 por %xmm3,%xmm0 | |
498 | |
499 mulq %rbp | |
500 addq %rax,%r10 | |
501 movq 8(%rsi),%rax | |
502 adcq $0,%rdx | |
503 movq %rdx,%rdi | |
504 | |
505 mulq %rbx | |
506 addq %rax,%r11 | |
507 movq 8(%rcx),%rax | |
508 adcq $0,%rdx | |
509 addq 8(%rsp),%r11 | |
510 adcq $0,%rdx | |
511 movq %rdx,%r10 | |
512 | |
513 mulq %rbp | |
514 addq %rax,%rdi | |
515 movq 16(%rsi),%rax | |
516 adcq $0,%rdx | |
517 addq %r11,%rdi | |
518 leaq 4(%r15),%r15 | |
519 adcq $0,%rdx | |
520 movq %rdx,%r13 | |
521 jmp .Linner4x | |
522 .align 16 | |
# Inner loop, four words per iteration.  Each result word is stored one step
# after it is computed, so the stores lag the arithmetic.
523 .Linner4x: | |
524 mulq %rbx | |
525 addq %rax,%r10 | |
526 movq -16(%rcx,%r15,8),%rax | |
527 adcq $0,%rdx | |
528 addq -16(%rsp,%r15,8),%r10 | |
529 adcq $0,%rdx | |
530 movq %rdx,%r11 | |
531 | |
532 mulq %rbp | |
533 addq %rax,%r13 | |
534 movq -8(%rsi,%r15,8),%rax | |
535 adcq $0,%rdx | |
536 addq %r10,%r13 | |
537 adcq $0,%rdx | |
538 movq %rdi,-32(%rsp,%r15,8) | |
539 movq %rdx,%rdi | |
540 | |
541 mulq %rbx | |
542 addq %rax,%r11 | |
543 movq -8(%rcx,%r15,8),%rax | |
544 adcq $0,%rdx | |
545 addq -8(%rsp,%r15,8),%r11 | |
546 adcq $0,%rdx | |
547 movq %rdx,%r10 | |
548 | |
549 mulq %rbp | |
550 addq %rax,%rdi | |
551 movq (%rsi,%r15,8),%rax | |
552 adcq $0,%rdx | |
553 addq %r11,%rdi | |
554 adcq $0,%rdx | |
555 movq %r13,-24(%rsp,%r15,8) | |
556 movq %rdx,%r13 | |
557 | |
558 mulq %rbx | |
559 addq %rax,%r10 | |
560 movq (%rcx,%r15,8),%rax | |
561 adcq $0,%rdx | |
562 addq (%rsp,%r15,8),%r10 | |
563 adcq $0,%rdx | |
564 movq %rdx,%r11 | |
565 | |
566 mulq %rbp | |
567 addq %rax,%r13 | |
568 movq 8(%rsi,%r15,8),%rax | |
569 adcq $0,%rdx | |
570 addq %r10,%r13 | |
571 adcq $0,%rdx | |
572 movq %rdi,-16(%rsp,%r15,8) | |
573 movq %rdx,%rdi | |
574 | |
575 mulq %rbx | |
576 addq %rax,%r11 | |
577 movq 8(%rcx,%r15,8),%rax | |
578 adcq $0,%rdx | |
579 addq 8(%rsp,%r15,8),%r11 | |
580 adcq $0,%rdx | |
581 leaq 4(%r15),%r15 | |
582 movq %rdx,%r10 | |
583 | |
584 mulq %rbp | |
585 addq %rax,%rdi | |
586 movq -16(%rsi,%r15,8),%rax | |
587 adcq $0,%rdx | |
588 addq %r11,%rdi | |
589 adcq $0,%rdx | |
590 movq %r13,-40(%rsp,%r15,8) | |
591 movq %rdx,%r13 | |
592 cmpq %r9,%r15 | |
593 jl .Linner4x | |
594 | |
# Inner-loop tail: last two words; the outer counter is bumped here.
595 mulq %rbx | |
596 addq %rax,%r10 | |
597 movq -16(%rcx,%r15,8),%rax | |
598 adcq $0,%rdx | |
599 addq -16(%rsp,%r15,8),%r10 | |
600 adcq $0,%rdx | |
601 movq %rdx,%r11 | |
602 | |
603 mulq %rbp | |
604 addq %rax,%r13 | |
605 movq -8(%rsi,%r15,8),%rax | |
606 adcq $0,%rdx | |
607 addq %r10,%r13 | |
608 adcq $0,%rdx | |
609 movq %rdi,-32(%rsp,%r15,8) | |
610 movq %rdx,%rdi | |
611 | |
612 mulq %rbx | |
613 addq %rax,%r11 | |
614 movq -8(%rcx,%r15,8),%rax | |
615 adcq $0,%rdx | |
616 addq -8(%rsp,%r15,8),%r11 | |
617 adcq $0,%rdx | |
618 leaq 1(%r14),%r14 | |
619 movq %rdx,%r10 | |
620 | |
621 mulq %rbp | |
622 addq %rax,%rdi | |
623 movq (%rsi),%rax | |
624 adcq $0,%rdx | |
625 addq %r11,%rdi | |
626 adcq $0,%rdx | |
627 movq %r13,-24(%rsp,%r15,8) | |
628 movq %rdx,%r13 | |
629 | |
# movq %xmm0,%rbx -- next multiplier word.
630 .byte 102,72,15,126,195 | |
631 movq %rdi,-16(%rsp,%r15,8) | |
632 | |
# Top of the vector: carries plus the previous top word (slot count).
633 xorq %rdi,%rdi | |
634 addq %r10,%r13 | |
635 adcq $0,%rdi | |
636 addq (%rsp,%r9,8),%r13 | |
637 adcq $0,%rdi | |
638 movq %r13,-8(%rsp,%r15,8) | |
639 movq %rdi,(%rsp,%r15,8) | |
640 | |
641 cmpq %r9,%r14 | |
642 jl .Louter4x | |
# Final subtraction, four words per iteration.  Reload the result pointer,
# zero %xmm0 (used below to wipe the temp vector), and turn the count into a
# count of 4-word groups.  The subq starts the borrow chain; only
# CF-preserving instructions (movq/leaq/decq) sit between the sbbq's.
643 movq 16(%rsp,%r9,8),%rdi | |
644 movq 0(%rsp),%rax | |
645 pxor %xmm0,%xmm0 | |
646 movq 8(%rsp),%rdx | |
647 shrq $2,%r9 | |
648 leaq (%rsp),%rsi | |
649 xorq %r14,%r14 | |
650 | |
651 subq 0(%rcx),%rax | |
652 movq 16(%rsi),%rbx | |
653 movq 24(%rsi),%rbp | |
654 sbbq 8(%rcx),%rdx | |
655 leaq -1(%r9),%r15 | |
656 jmp .Lsub4x | |
657 .align 16 | |
658 .Lsub4x: | |
659 movq %rax,0(%rdi,%r14,8) | |
660 movq %rdx,8(%rdi,%r14,8) | |
661 sbbq 16(%rcx,%r14,8),%rbx | |
662 movq 32(%rsi,%r14,8),%rax | |
663 movq 40(%rsi,%r14,8),%rdx | |
664 sbbq 24(%rcx,%r14,8),%rbp | |
665 movq %rbx,16(%rdi,%r14,8) | |
666 movq %rbp,24(%rdi,%r14,8) | |
667 sbbq 32(%rcx,%r14,8),%rax | |
668 movq 48(%rsi,%r14,8),%rbx | |
669 movq 56(%rsi,%r14,8),%rbp | |
670 sbbq 40(%rcx,%r14,8),%rdx | |
671 leaq 4(%r14),%r14 | |
672 decq %r15 | |
673 jnz .Lsub4x | |
674 | |
# Last group.  The load at 32(%rsi,%r14,8) fetches the top temp word, which
# then absorbs the final borrow to form the select mask.
675 movq %rax,0(%rdi,%r14,8) | |
676 movq 32(%rsi,%r14,8),%rax | |
677 sbbq 16(%rcx,%r14,8),%rbx | |
678 movq %rdx,8(%rdi,%r14,8) | |
679 sbbq 24(%rcx,%r14,8),%rbp | |
680 movq %rbx,16(%rdi,%r14,8) | |
681 | |
# Branch-free select: %rsi = (temp & mask) | (result & ~mask), with the mask
# expected to be 0 or all-ones here.
682 sbbq $0,%rax | |
683 movq %rbp,24(%rdi,%r14,8) | |
684 xorq %r14,%r14 | |
685 andq %rax,%rsi | |
686 notq %rax | |
687 movq %rdi,%rcx | |
688 andq %rax,%rcx | |
689 leaq -1(%r9),%r15 | |
690 orq %rcx,%rsi | |
691 | |
# Copy the selected vector into the result 16 bytes at a time while storing
# zeros over the temp vector.  %r14 is a BYTE offset in this loop.  The
# movdqa stores rely on %rsp being 16-byte aligned (it was rounded to 1024).
692 movdqu (%rsi),%xmm1 | |
693 movdqa %xmm0,(%rsp) | |
694 movdqu %xmm1,(%rdi) | |
695 jmp .Lcopy4x | |
696 .align 16 | |
697 .Lcopy4x: | |
698 movdqu 16(%rsi,%r14,1),%xmm2 | |
699 movdqu 32(%rsi,%r14,1),%xmm1 | |
700 movdqa %xmm0,16(%rsp,%r14,1) | |
701 movdqu %xmm2,16(%rdi,%r14,1) | |
702 movdqa %xmm0,32(%rsp,%r14,1) | |
703 movdqu %xmm1,32(%rdi,%r14,1) | |
704 leaq 32(%r14),%r14 | |
705 decq %r15 | |
706 jnz .Lcopy4x | |
707 | |
# Restore the word count (undo the shrq above), copy the final 16 bytes,
# then run the epilogue: saved %rsp from slot count+1, return 1, reload the
# six saved registers.
708 shlq $2,%r9 | |
709 movdqu 16(%rsi,%r14,1),%xmm2 | |
710 movdqa %xmm0,16(%rsp,%r14,1) | |
711 movdqu %xmm2,16(%rdi,%r14,1) | |
712 movq 8(%rsp,%r9,8),%rsi | |
713 movq $1,%rax | |
714 movq (%rsi),%r15 | |
715 movq 8(%rsi),%r14 | |
716 movq 16(%rsi),%r13 | |
717 movq 24(%rsi),%r12 | |
718 movq 32(%rsi),%rbp | |
719 movq 40(%rsi),%rbx | |
720 leaq 48(%rsi),%rsp | |
721 .Lmul4x_epilogue: | |
# 0xf3,0xc3 = "rep ret"
722 .byte 0xf3,0xc3 | |
723 .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 | |
# bn_scatter5 -- store a vector of qwords into one column of a table whose
# rows are 256 bytes apart.
#   %rdi  source words
#   %rsi  number of words (returns immediately when zero)
#   %rdx  table base
#   %rcx  column index (scaled by 8 bytes)
# Clobbers %rax, %rdx, %rdi, %rsi and flags.
# NOTE(review): the index is not range-checked here; presumably callers pass
# 0..31 to match the 256-byte row stride -- confirm.
724 .globl bn_scatter5 | |
725 .type bn_scatter5,@function | |
726 .align 16 | |
727 bn_scatter5: | |
728 cmpq $0,%rsi | |
729 jz .Lscatter_epilogue | |
# %rdx = address of the selected column in row 0.
730 leaq (%rdx,%rcx,8),%rdx | |
731 .Lscatter: | |
# One word per iteration: source advances 8 bytes, table advances one row.
732 movq (%rdi),%rax | |
733 leaq 8(%rdi),%rdi | |
734 movq %rax,(%rdx) | |
735 leaq 256(%rdx),%rdx | |
736 subq $1,%rsi | |
737 jnz .Lscatter | |
738 .Lscatter_epilogue: | |
# 0xf3,0xc3 = "rep ret"
739 .byte 0xf3,0xc3 | |
740 .size bn_scatter5,.-bn_scatter5 | |
741 | |
# bn_gather5 -- read one column of the 256-byte-stride table back into a
# contiguous vector, using a masked select instead of an indexed load for
# index bits 3-4.
#   %rdi  destination words
#   %rsi  number of words
#   %rdx  table base
#   %rcx  column index
# Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r11, %xmm0-%xmm7 and flags.
# NOTE(review): unlike bn_scatter5 there is no zero-count guard; with
# %rsi == 0 the subq/jnz loop would wrap around -- confirm callers never
# pass 0.
# NOTE(review): the load addresses still depend on (index & 7); only bits
# 3-4 are hidden by the masks -- check against cache-timing requirements.
742 .globl bn_gather5 | |
743 .type bn_gather5,@function | |
744 .align 16 | |
745 bn_gather5: | |
# %r11 = index & 7 (qword inside each 64-byte group);
# %rcx = ~(index >> 3) & 3 selects which four consecutive qwords of
# .Lmagic_masks are loaded, so exactly one of %xmm4..%xmm7 is all-ones.
746 movq %rcx,%r11 | |
747 shrq $3,%rcx | |
748 andq $7,%r11 | |
749 notq %rcx | |
750 leaq .Lmagic_masks(%rip),%rax | |
751 andq $3,%rcx | |
# Bias by +96 so the four candidates sit at -96/-32/+32/+96.
752 leaq 96(%rdx,%r11,8),%rdx | |
753 movq 0(%rax,%rcx,8),%xmm4 | |
754 movq 8(%rax,%rcx,8),%xmm5 | |
755 movq 16(%rax,%rcx,8),%xmm6 | |
756 movq 24(%rax,%rcx,8),%xmm7 | |
757 jmp .Lgather | |
758 .align 16 | |
759 .Lgather: | |
# Load all four candidates from the current row, mask each, OR them into
# %xmm0, then step to the next row.
760 movq -96(%rdx),%xmm0 | |
761 movq -32(%rdx),%xmm1 | |
762 pand %xmm4,%xmm0 | |
763 movq 32(%rdx),%xmm2 | |
764 pand %xmm5,%xmm1 | |
765 movq 96(%rdx),%xmm3 | |
766 pand %xmm6,%xmm2 | |
767 por %xmm1,%xmm0 | |
768 pand %xmm7,%xmm3 | |
769 por %xmm2,%xmm0 | |
770 leaq 256(%rdx),%rdx | |
771 por %xmm3,%xmm0 | |
772 | |
# Store the selected word and advance the destination.
773 movq %xmm0,(%rdi) | |
774 leaq 8(%rdi),%rdi | |
775 subq $1,%rsi | |
776 jnz .Lgather | |
# 0xf3,0xc3 = "rep ret"
777 .byte 0xf3,0xc3 | |
778 .LSEH_end_bn_gather5: | |
779 .size bn_gather5,.-bn_gather5 | |
# .Lmagic_masks -- eight qwords laid out as 0, 0, 0, all-ones, 0, 0, 0, 0.
# The gather code indexes this with a value in 0..3 and loads four
# consecutive qwords, so exactly one of the four loaded masks is all-ones.
# The .byte list after the masks is the NUL-terminated ASCII ident string
# "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
# <appro@openssl.org>".
# NOTE(review): in this listing the .byte list is wrapped across several
# rows (one break falls in the middle of a number); it is a single
# directive in the underlying source.
780 .align 64 | |
781 .Lmagic_masks: | |
782 .long 0,0, 0,0, 0,0, -1,-1 | |
783 .long 0,0, 0,0, 0,0, 0,0 | |
784 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105
,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97
,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71
,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,1
11,114,103,62,0 | |
OLD | NEW |