OLD | NEW |
| (Empty) |
1 #if defined(__x86_64__) | |
2 .text | |
3 | |
4 | |
5 | |
// _rsaz_512_sqr: repeated 512-bit squaring, each pass followed by
// __rsaz_512_reduce and __rsaz_512_subtract.
// Register arguments as used below:
//   rdi  destination, 8 qwords (written by __rsaz_512_subtract and reused
//        as the source of the next pass)
//   rsi  source operand a[0..7], 8 qwords
//   rdx  pointer kept in rbp for __rsaz_512_reduce / __rsaz_512_subtract
//        (presumably the modulus -- confirm against the C prototype)
//   rcx  64-bit multiplier read by __rsaz_512_reduce (presumably the
//        Montgomery constant -- confirm)
//   r8d  number of passes; the counter is only tested after a pass, so a
//        count of 0 is not handled (decl would wrap)
// Frame: (%rsp)..120(%rsp) 16-qword square, 128(%rsp) saved rcx,
// 128+8(%rsp) remaining-pass counter.
6 .globl _rsaz_512_sqr | |
7 .private_extern _rsaz_512_sqr | |
8 | |
9 .p2align 5 | |
10 _rsaz_512_sqr: | |
// Save the callee-saved registers that are used as accumulators.
11 pushq %rbx | |
12 pushq %rbp | |
13 pushq %r12 | |
14 pushq %r13 | |
15 pushq %r14 | |
16 pushq %r15 | |
17 | |
18 subq $128+24,%rsp | |
19 L$sqr_body: | |
// rbp = third argument; rdx = a[0], rax = a[1] as expected at the loop head.
20 movq %rdx,%rbp | |
21 movq (%rsi),%rdx | |
22 movq 8(%rsi),%rax | |
23 movq %rcx,128(%rsp) | |
24 jmp L$oop_sqr | |
25 | |
26 .p2align 5 | |
27 L$oop_sqr: | |
// Spill the pass counter; r8 is reused as an accumulator below.
28 movl %r8d,128+8(%rsp) | |
29 | |
// Row 0: r8..r15 = a[0] * a[1..7] (cross products for limbs 1..8).
30 movq %rdx,%rbx | |
31 mulq %rdx | |
32 movq %rax,%r8 | |
33 movq 16(%rsi),%rax | |
34 movq %rdx,%r9 | |
35 | |
36 mulq %rbx | |
37 addq %rax,%r9 | |
38 movq 24(%rsi),%rax | |
39 movq %rdx,%r10 | |
40 adcq $0,%r10 | |
41 | |
42 mulq %rbx | |
43 addq %rax,%r10 | |
44 movq 32(%rsi),%rax | |
45 movq %rdx,%r11 | |
46 adcq $0,%r11 | |
47 | |
48 mulq %rbx | |
49 addq %rax,%r11 | |
50 movq 40(%rsi),%rax | |
51 movq %rdx,%r12 | |
52 adcq $0,%r12 | |
53 | |
54 mulq %rbx | |
55 addq %rax,%r12 | |
56 movq 48(%rsi),%rax | |
57 movq %rdx,%r13 | |
58 adcq $0,%r13 | |
59 | |
60 mulq %rbx | |
61 addq %rax,%r13 | |
62 movq 56(%rsi),%rax | |
63 movq %rdx,%r14 | |
64 adcq $0,%r14 | |
65 | |
66 mulq %rbx | |
67 addq %rax,%r14 | |
68 movq %rbx,%rax | |
69 movq %rdx,%r15 | |
70 adcq $0,%r15 | |
71 | |
// Double limbs 1 and 2 (r8, r9). rcx keeps the pre-doubling r9 so that its
// top bit can be fed into limb 3 later.
72 addq %r8,%r8 | |
73 movq %r9,%rcx | |
74 adcq %r9,%r9 | |
75 | |
// Add a[0]^2: the low word is limb 0, the high word is added to limb 1.
// NOTE(review): the carry out of the adcq into r9 below is not captured
// (the next flag-writing instruction is the shrq). The same shape repeats
// after every diagonal square in this routine. This looks like the carry
// propagation issue described in the OpenSSL advisory for CVE-2019-1551 --
// confirm, and prefer the upstream-fixed routine.
76 mulq %rax | |
77 movq %rax,(%rsp) | |
78 addq %rdx,%r8 | |
79 adcq $0,%r9 | |
80 | |
81 movq %r8,8(%rsp) | |
82 shrq $63,%rcx | |
83 | |
84 | |
// Row 1: add a[1] * a[2..7] into r10..r15; r8 becomes the new top limb.
85 movq 8(%rsi),%r8 | |
86 movq 16(%rsi),%rax | |
87 mulq %r8 | |
88 addq %rax,%r10 | |
89 movq 24(%rsi),%rax | |
90 movq %rdx,%rbx | |
91 adcq $0,%rbx | |
92 | |
93 mulq %r8 | |
94 addq %rax,%r11 | |
95 movq 32(%rsi),%rax | |
96 adcq $0,%rdx | |
97 addq %rbx,%r11 | |
98 movq %rdx,%rbx | |
99 adcq $0,%rbx | |
100 | |
101 mulq %r8 | |
102 addq %rax,%r12 | |
103 movq 40(%rsi),%rax | |
104 adcq $0,%rdx | |
105 addq %rbx,%r12 | |
106 movq %rdx,%rbx | |
107 adcq $0,%rbx | |
108 | |
109 mulq %r8 | |
110 addq %rax,%r13 | |
111 movq 48(%rsi),%rax | |
112 adcq $0,%rdx | |
113 addq %rbx,%r13 | |
114 movq %rdx,%rbx | |
115 adcq $0,%rbx | |
116 | |
117 mulq %r8 | |
118 addq %rax,%r14 | |
119 movq 56(%rsi),%rax | |
120 adcq $0,%rdx | |
121 addq %rbx,%r14 | |
122 movq %rdx,%rbx | |
123 adcq $0,%rbx | |
124 | |
125 mulq %r8 | |
126 addq %rax,%r15 | |
127 movq %r8,%rax | |
128 adcq $0,%rdx | |
129 addq %rbx,%r15 | |
130 movq %rdx,%r8 | |
131 movq %r10,%rdx | |
132 adcq $0,%r8 | |
133 | |
// Double limbs 3 and 4: the addq on the r10 copy only produces CF (top bit
// of r10); leaq computes 2*r10 + rcx without touching flags; adcq doubles
// r11 with that CF. rbx keeps the pre-doubling r11.
134 addq %rdx,%rdx | |
135 leaq (%rcx,%r10,2),%r10 | |
136 movq %r11,%rbx | |
137 adcq %r11,%r11 | |
138 | |
// Add a[1]^2 and store limbs 2 and 3.
139 mulq %rax | |
140 addq %rax,%r9 | |
141 adcq %rdx,%r10 | |
142 adcq $0,%r11 | |
143 | |
144 movq %r9,16(%rsp) | |
145 movq %r10,24(%rsp) | |
146 shrq $63,%rbx | |
147 | |
148 | |
// Row 2: add a[2] * a[3..7] into r12..r15 and r8; r9 becomes the new top
// limb. The doubling of r12 and r13 is interleaved with the multiplies.
149 movq 16(%rsi),%r9 | |
150 movq 24(%rsi),%rax | |
151 mulq %r9 | |
152 addq %rax,%r12 | |
153 movq 32(%rsi),%rax | |
154 movq %rdx,%rcx | |
155 adcq $0,%rcx | |
156 | |
157 mulq %r9 | |
158 addq %rax,%r13 | |
159 movq 40(%rsi),%rax | |
160 adcq $0,%rdx | |
161 addq %rcx,%r13 | |
162 movq %rdx,%rcx | |
163 adcq $0,%rcx | |
164 | |
165 mulq %r9 | |
166 addq %rax,%r14 | |
167 movq 48(%rsi),%rax | |
168 adcq $0,%rdx | |
169 addq %rcx,%r14 | |
170 movq %rdx,%rcx | |
171 adcq $0,%rcx | |
172 | |
173 mulq %r9 | |
174 movq %r12,%r10 | |
175 leaq (%rbx,%r12,2),%r12 | |
176 addq %rax,%r15 | |
177 movq 56(%rsi),%rax | |
178 adcq $0,%rdx | |
179 addq %rcx,%r15 | |
180 movq %rdx,%rcx | |
181 adcq $0,%rcx | |
182 | |
183 mulq %r9 | |
184 shrq $63,%r10 | |
185 addq %rax,%r8 | |
186 movq %r9,%rax | |
187 adcq $0,%rdx | |
188 addq %rcx,%r8 | |
189 movq %rdx,%r9 | |
190 adcq $0,%r9 | |
191 | |
192 movq %r13,%rcx | |
193 leaq (%r10,%r13,2),%r13 | |
194 | |
// Add a[2]^2 and store limbs 4 and 5.
195 mulq %rax | |
196 addq %rax,%r11 | |
197 adcq %rdx,%r12 | |
198 adcq $0,%r13 | |
199 | |
200 movq %r11,32(%rsp) | |
201 movq %r12,40(%rsp) | |
202 shrq $63,%rcx | |
203 | |
204 | |
// Row 3: add a[3] * a[4..7] into r14, r15, r8, r9; r10 becomes the new top
// limb. r14 and r15 are doubled along the way.
205 movq 24(%rsi),%r10 | |
206 movq 32(%rsi),%rax | |
207 mulq %r10 | |
208 addq %rax,%r14 | |
209 movq 40(%rsi),%rax | |
210 movq %rdx,%rbx | |
211 adcq $0,%rbx | |
212 | |
213 mulq %r10 | |
214 addq %rax,%r15 | |
215 movq 48(%rsi),%rax | |
216 adcq $0,%rdx | |
217 addq %rbx,%r15 | |
218 movq %rdx,%rbx | |
219 adcq $0,%rbx | |
220 | |
221 mulq %r10 | |
222 movq %r14,%r12 | |
223 leaq (%rcx,%r14,2),%r14 | |
224 addq %rax,%r8 | |
225 movq 56(%rsi),%rax | |
226 adcq $0,%rdx | |
227 addq %rbx,%r8 | |
228 movq %rdx,%rbx | |
229 adcq $0,%rbx | |
230 | |
231 mulq %r10 | |
232 shrq $63,%r12 | |
233 addq %rax,%r9 | |
234 movq %r10,%rax | |
235 adcq $0,%rdx | |
236 addq %rbx,%r9 | |
237 movq %rdx,%r10 | |
238 adcq $0,%r10 | |
239 | |
240 movq %r15,%rbx | |
241 leaq (%r12,%r15,2),%r15 | |
242 | |
// Add a[3]^2 and store limbs 6 and 7.
243 mulq %rax | |
244 addq %rax,%r13 | |
245 adcq %rdx,%r14 | |
246 adcq $0,%r15 | |
247 | |
248 movq %r13,48(%rsp) | |
249 movq %r14,56(%rsp) | |
250 shrq $63,%rbx | |
251 | |
252 | |
// Row 4: add a[4] * a[5..7] into r8, r9, r10; r11 becomes the new top limb.
// r8 and r9 are doubled along the way.
253 movq 32(%rsi),%r11 | |
254 movq 40(%rsi),%rax | |
255 mulq %r11 | |
256 addq %rax,%r8 | |
257 movq 48(%rsi),%rax | |
258 movq %rdx,%rcx | |
259 adcq $0,%rcx | |
260 | |
261 mulq %r11 | |
262 addq %rax,%r9 | |
263 movq 56(%rsi),%rax | |
264 adcq $0,%rdx | |
265 movq %r8,%r12 | |
266 leaq (%rbx,%r8,2),%r8 | |
267 addq %rcx,%r9 | |
268 movq %rdx,%rcx | |
269 adcq $0,%rcx | |
270 | |
271 mulq %r11 | |
272 shrq $63,%r12 | |
273 addq %rax,%r10 | |
274 movq %r11,%rax | |
275 adcq $0,%rdx | |
276 addq %rcx,%r10 | |
277 movq %rdx,%r11 | |
278 adcq $0,%r11 | |
279 | |
280 movq %r9,%rcx | |
281 leaq (%r12,%r9,2),%r9 | |
282 | |
// Add a[4]^2 and store limbs 8 and 9.
283 mulq %rax | |
284 addq %rax,%r15 | |
285 adcq %rdx,%r8 | |
286 adcq $0,%r9 | |
287 | |
288 movq %r15,64(%rsp) | |
289 movq %r8,72(%rsp) | |
290 shrq $63,%rcx | |
291 | |
292 | |
// Row 5: add a[5] * a[6..7] into r10, r11; r12 becomes the new top limb.
// r10 and r11 are doubled; rbx keeps the pre-doubling r11.
293 movq 40(%rsi),%r12 | |
294 movq 48(%rsi),%rax | |
295 mulq %r12 | |
296 addq %rax,%r10 | |
297 movq 56(%rsi),%rax | |
298 movq %rdx,%rbx | |
299 adcq $0,%rbx | |
300 | |
301 mulq %r12 | |
302 addq %rax,%r11 | |
303 movq %r12,%rax | |
304 movq %r10,%r15 | |
305 leaq (%rcx,%r10,2),%r10 | |
306 adcq $0,%rdx | |
307 shrq $63,%r15 | |
308 addq %rbx,%r11 | |
309 movq %rdx,%r12 | |
310 adcq $0,%r12 | |
311 | |
312 movq %r11,%rbx | |
313 leaq (%r15,%r11,2),%r11 | |
314 | |
// Add a[5]^2 and store limbs 10 and 11.
315 mulq %rax | |
316 addq %rax,%r9 | |
317 adcq %rdx,%r10 | |
318 adcq $0,%r11 | |
319 | |
320 movq %r9,80(%rsp) | |
321 movq %r10,88(%rsp) | |
322 | |
323 | |
// Row 6: add a[6] * a[7] into r12; r13 is the new top limb.
324 movq 48(%rsi),%r13 | |
325 movq 56(%rsi),%rax | |
326 mulq %r13 | |
327 addq %rax,%r12 | |
328 movq %r13,%rax | |
329 movq %rdx,%r13 | |
330 adcq $0,%r13 | |
331 | |
// Final doubling: shlq moves the top bit of the saved pre-doubling r11 into
// CF, then r12 and r13 are doubled through the carry chain and r14 collects
// the bit shifted out of r13.
332 xorq %r14,%r14 | |
333 shlq $1,%rbx | |
334 adcq %r12,%r12 | |
335 adcq %r13,%r13 | |
336 adcq %r14,%r14 | |
337 | |
// Add a[6]^2 and store limbs 12 and 13.
338 mulq %rax | |
339 addq %rax,%r11 | |
340 adcq %rdx,%r12 | |
341 adcq $0,%r13 | |
342 | |
343 movq %r11,96(%rsp) | |
344 movq %r12,104(%rsp) | |
345 | |
346 | |
// Add a[7]^2 and store limbs 14 and 15.
347 movq 56(%rsi),%rax | |
348 mulq %rax | |
349 addq %rax,%r13 | |
350 adcq $0,%rdx | |
351 | |
352 addq %rdx,%r14 | |
353 | |
354 movq %r13,112(%rsp) | |
355 movq %r14,120(%rsp) | |
356 | |
// Reload the low eight limbs of the square as input to the reduction.
357 movq (%rsp),%r8 | |
358 movq 8(%rsp),%r9 | |
359 movq 16(%rsp),%r10 | |
360 movq 24(%rsp),%r11 | |
361 movq 32(%rsp),%r12 | |
362 movq 40(%rsp),%r13 | |
363 movq 48(%rsp),%r14 | |
364 movq 56(%rsp),%r15 | |
365 | |
366 call __rsaz_512_reduce | |
367 | |
// Add the high eight limbs; sbbq turns the final carry into rcx = 0 or -1,
// the mask consumed by __rsaz_512_subtract.
368 addq 64(%rsp),%r8 | |
369 adcq 72(%rsp),%r9 | |
370 adcq 80(%rsp),%r10 | |
371 adcq 88(%rsp),%r11 | |
372 adcq 96(%rsp),%r12 | |
373 adcq 104(%rsp),%r13 | |
374 adcq 112(%rsp),%r14 | |
375 adcq 120(%rsp),%r15 | |
376 sbbq %rcx,%rcx | |
377 | |
378 call __rsaz_512_subtract | |
379 | |
// Set up the next pass: rdx = result limb 0, rax = result limb 1, and the
// destination becomes the source.
380 movq %r8,%rdx | |
381 movq %r9,%rax | |
382 movl 128+8(%rsp),%r8d | |
383 movq %rdi,%rsi | |
384 | |
385 decl %r8d | |
386 jnz L$oop_sqr | |
387 | |
// Epilogue: rax = address just above the six saved registers; restore them
// and release the frame.
388 leaq 128+24+48(%rsp),%rax | |
389 movq -48(%rax),%r15 | |
390 movq -40(%rax),%r14 | |
391 movq -32(%rax),%r13 | |
392 movq -24(%rax),%r12 | |
393 movq -16(%rax),%rbp | |
394 movq -8(%rax),%rbx | |
395 leaq (%rax),%rsp | |
396 L$sqr_epilogue: | |
// 0xf3,0xc3 encodes "rep ret".
397 .byte 0xf3,0xc3 | |
398 | |
// _rsaz_512_mul: one 512x512-bit multiplication followed by
// __rsaz_512_reduce and __rsaz_512_subtract.
// Register arguments as used below:
//   rdi  destination, 8 qwords
//   rsi  first operand a[0..7]
//   rdx  second operand b[0..7]
//   rcx  pointer handed to reduce/subtract in rbp (presumably the modulus
//        -- confirm against the C prototype)
//   r8   64-bit multiplier read by __rsaz_512_reduce
// Frame: (%rsp)..120(%rsp) 16-qword product, 128(%rsp) saved r8.
399 .globl _rsaz_512_mul | |
400 .private_extern _rsaz_512_mul | |
401 | |
402 .p2align 5 | |
403 _rsaz_512_mul: | |
404 pushq %rbx | |
405 pushq %rbp | |
406 pushq %r12 | |
407 pushq %r13 | |
408 pushq %r14 | |
409 pushq %r15 | |
410 | |
411 subq $128+24,%rsp | |
412 L$mul_body: | |
// Raw encodings: movq %rdi,%xmm0 and movq %rcx,%xmm1. Both registers are
// overwritten by __rsaz_512_mul, so they are parked in xmm registers.
413 .byte 102,72,15,110,199 | |
414 .byte 102,72,15,110,201 | |
415 movq %r8,128(%rsp) | |
// __rsaz_512_mul expects rbx = b[0] and rbp = b.
416 movq (%rdx),%rbx | |
417 movq %rdx,%rbp | |
418 call __rsaz_512_mul | |
419 | |
// Raw encodings: movq %xmm0,%rdi and movq %xmm1,%rbp (rbp = original rcx).
420 .byte 102,72,15,126,199 | |
421 .byte 102,72,15,126,205 | |
422 | |
// Low eight limbs of the product feed the reduction.
423 movq (%rsp),%r8 | |
424 movq 8(%rsp),%r9 | |
425 movq 16(%rsp),%r10 | |
426 movq 24(%rsp),%r11 | |
427 movq 32(%rsp),%r12 | |
428 movq 40(%rsp),%r13 | |
429 movq 48(%rsp),%r14 | |
430 movq 56(%rsp),%r15 | |
431 | |
432 call __rsaz_512_reduce | |
// Add the high eight limbs; rcx = 0 or -1 from the final carry.
433 addq 64(%rsp),%r8 | |
434 adcq 72(%rsp),%r9 | |
435 adcq 80(%rsp),%r10 | |
436 adcq 88(%rsp),%r11 | |
437 adcq 96(%rsp),%r12 | |
438 adcq 104(%rsp),%r13 | |
439 adcq 112(%rsp),%r14 | |
440 adcq 120(%rsp),%r15 | |
441 sbbq %rcx,%rcx | |
442 | |
443 call __rsaz_512_subtract | |
444 | |
// Epilogue: restore callee-saved registers and release the frame.
445 leaq 128+24+48(%rsp),%rax | |
446 movq -48(%rax),%r15 | |
447 movq -40(%rax),%r14 | |
448 movq -32(%rax),%r13 | |
449 movq -24(%rax),%r12 | |
450 movq -16(%rax),%rbp | |
451 movq -8(%rax),%rbx | |
452 leaq (%rax),%rsp | |
453 L$mul_epilogue: | |
// 0xf3,0xc3 encodes "rep ret".
454 .byte 0xf3,0xc3 | |
455 | |
// _rsaz_512_mul_gather4: multiply a[0..7] by an operand whose limbs are
// gathered from a table, then reduce and conditionally subtract.
// Register arguments as used below:
//   rdi  destination, 8 qwords
//   rsi  operand a[0..7]
//   rdx  table: eight rows of 128 bytes, row j holding limb j of each of
//        16 entries as consecutive qwords; read with movdqa, so it must be
//        16-byte aligned
//   rcx  pointer handed to reduce/subtract in rbp (presumably the modulus
//        -- confirm against the C prototype)
//   r8   64-bit multiplier read by __rsaz_512_reduce
//   r9d  table index; values 0..15 select an entry, any other value yields
//        all-zero masks and therefore a zero operand
// Every 16-byte chunk of a row is loaded regardless of the index, so the
// memory access pattern does not depend on r9d (presumably deliberate, to
// keep the index out of the cache footprint).
// Frame: (%rsp)..120(%rsp) product, 128(%rsp) r8, 128+8(%rsp) rdi,
// 128+16(%rsp) rcx.
456 .globl _rsaz_512_mul_gather4 | |
457 .private_extern _rsaz_512_mul_gather4 | |
458 | |
459 .p2align 5 | |
460 _rsaz_512_mul_gather4: | |
461 pushq %rbx | |
462 pushq %rbp | |
463 pushq %r12 | |
464 pushq %r13 | |
465 pushq %r14 | |
466 pushq %r15 | |
467 | |
468 subq $152,%rsp | |
469 L$mul_gather4_body: | |
// xmm8 = index broadcast to four dwords; xmm1 = {2,2,2,2}, xmm0 = {0,0,1,1}.
470 movd %r9d,%xmm8 | |
471 movdqa L$inc+16(%rip),%xmm1 | |
472 movdqa L$inc(%rip),%xmm0 | |
473 | |
// Build xmm0..xmm7 as equality masks against {0,0,1,1}, {2,2,3,3}, ...,
// {14,14,15,15}: each 64-bit lane is all-ones only for the selected index.
474 pshufd $0,%xmm8,%xmm8 | |
475 movdqa %xmm1,%xmm7 | |
476 movdqa %xmm1,%xmm2 | |
477 paddd %xmm0,%xmm1 | |
478 pcmpeqd %xmm8,%xmm0 | |
479 movdqa %xmm7,%xmm3 | |
480 paddd %xmm1,%xmm2 | |
481 pcmpeqd %xmm8,%xmm1 | |
482 movdqa %xmm7,%xmm4 | |
483 paddd %xmm2,%xmm3 | |
484 pcmpeqd %xmm8,%xmm2 | |
485 movdqa %xmm7,%xmm5 | |
486 paddd %xmm3,%xmm4 | |
487 pcmpeqd %xmm8,%xmm3 | |
488 movdqa %xmm7,%xmm6 | |
489 paddd %xmm4,%xmm5 | |
490 pcmpeqd %xmm8,%xmm4 | |
491 paddd %xmm5,%xmm6 | |
492 pcmpeqd %xmm8,%xmm5 | |
493 paddd %xmm6,%xmm7 | |
494 pcmpeqd %xmm8,%xmm6 | |
495 pcmpeqd %xmm8,%xmm7 | |
496 | |
// Gather limb 0: load the whole first row, mask, and OR everything together.
// rbp is left pointing at the second row.
497 movdqa 0(%rdx),%xmm8 | |
498 movdqa 16(%rdx),%xmm9 | |
499 movdqa 32(%rdx),%xmm10 | |
500 movdqa 48(%rdx),%xmm11 | |
501 pand %xmm0,%xmm8 | |
502 movdqa 64(%rdx),%xmm12 | |
503 pand %xmm1,%xmm9 | |
504 movdqa 80(%rdx),%xmm13 | |
505 pand %xmm2,%xmm10 | |
506 movdqa 96(%rdx),%xmm14 | |
507 pand %xmm3,%xmm11 | |
508 movdqa 112(%rdx),%xmm15 | |
509 leaq 128(%rdx),%rbp | |
510 pand %xmm4,%xmm12 | |
511 pand %xmm5,%xmm13 | |
512 pand %xmm6,%xmm14 | |
513 pand %xmm7,%xmm15 | |
514 por %xmm10,%xmm8 | |
515 por %xmm11,%xmm9 | |
516 por %xmm12,%xmm8 | |
517 por %xmm13,%xmm9 | |
518 por %xmm14,%xmm8 | |
519 por %xmm15,%xmm9 | |
520 | |
// Fold the two qword lanes together (pshufd $0x4e swaps them).
521 por %xmm9,%xmm8 | |
522 pshufd $0x4e,%xmm8,%xmm9 | |
523 por %xmm9,%xmm8 | |
// Raw encoding: movq %xmm8,%rbx (rbx = gathered limb 0).
524 .byte 102,76,15,126,195 | |
525 | |
// Spill the arguments whose registers are reused below.
526 movq %r8,128(%rsp) | |
527 movq %rdi,128+8(%rsp) | |
528 movq %rcx,128+16(%rsp) | |
529 | |
// First row: a[0..7] * rbx. Limb 0 goes to (%rsp); r8..r15 hold limbs 1..8.
530 movq (%rsi),%rax | |
531 movq 8(%rsi),%rcx | |
532 mulq %rbx | |
533 movq %rax,(%rsp) | |
534 movq %rcx,%rax | |
535 movq %rdx,%r8 | |
536 | |
537 mulq %rbx | |
538 addq %rax,%r8 | |
539 movq 16(%rsi),%rax | |
540 movq %rdx,%r9 | |
541 adcq $0,%r9 | |
542 | |
543 mulq %rbx | |
544 addq %rax,%r9 | |
545 movq 24(%rsi),%rax | |
546 movq %rdx,%r10 | |
547 adcq $0,%r10 | |
548 | |
549 mulq %rbx | |
550 addq %rax,%r10 | |
551 movq 32(%rsi),%rax | |
552 movq %rdx,%r11 | |
553 adcq $0,%r11 | |
554 | |
555 mulq %rbx | |
556 addq %rax,%r11 | |
557 movq 40(%rsi),%rax | |
558 movq %rdx,%r12 | |
559 adcq $0,%r12 | |
560 | |
561 mulq %rbx | |
562 addq %rax,%r12 | |
563 movq 48(%rsi),%rax | |
564 movq %rdx,%r13 | |
565 adcq $0,%r13 | |
566 | |
567 mulq %rbx | |
568 addq %rax,%r13 | |
569 movq 56(%rsi),%rax | |
570 movq %rdx,%r14 | |
571 adcq $0,%r14 | |
572 | |
573 mulq %rbx | |
574 addq %rax,%r14 | |
// rax = a[0] is preloaded for the first multiply of the next row.
575 movq (%rsi),%rax | |
576 movq %rdx,%r15 | |
577 adcq $0,%r15 | |
578 | |
// rdi walks the product buffer from limb 1; seven more rows follow.
579 leaq 8(%rsp),%rdi | |
580 movl $7,%ecx | |
581 jmp L$oop_mul_gather | |
582 | |
583 .p2align 5 | |
584 L$oop_mul_gather: | |
// Gather the next operand limb from the row at rbp, then advance rbp.
585 movdqa 0(%rbp),%xmm8 | |
586 movdqa 16(%rbp),%xmm9 | |
587 movdqa 32(%rbp),%xmm10 | |
588 movdqa 48(%rbp),%xmm11 | |
589 pand %xmm0,%xmm8 | |
590 movdqa 64(%rbp),%xmm12 | |
591 pand %xmm1,%xmm9 | |
592 movdqa 80(%rbp),%xmm13 | |
593 pand %xmm2,%xmm10 | |
594 movdqa 96(%rbp),%xmm14 | |
595 pand %xmm3,%xmm11 | |
596 movdqa 112(%rbp),%xmm15 | |
597 leaq 128(%rbp),%rbp | |
598 pand %xmm4,%xmm12 | |
599 pand %xmm5,%xmm13 | |
600 pand %xmm6,%xmm14 | |
601 pand %xmm7,%xmm15 | |
602 por %xmm10,%xmm8 | |
603 por %xmm11,%xmm9 | |
604 por %xmm12,%xmm8 | |
605 por %xmm13,%xmm9 | |
606 por %xmm14,%xmm8 | |
607 por %xmm15,%xmm9 | |
608 | |
609 por %xmm9,%xmm8 | |
610 pshufd $0x4e,%xmm8,%xmm9 | |
611 por %xmm9,%xmm8 | |
// Raw encoding: movq %xmm8,%rbx.
612 .byte 102,76,15,126,195 | |
613 | |
// Multiply-accumulate row: the completed low limb is stored at (%rdi) and
// the accumulator window r8..r15 shifts down by one limb.
614 mulq %rbx | |
615 addq %rax,%r8 | |
616 movq 8(%rsi),%rax | |
617 movq %r8,(%rdi) | |
618 movq %rdx,%r8 | |
619 adcq $0,%r8 | |
620 | |
621 mulq %rbx | |
622 addq %rax,%r9 | |
623 movq 16(%rsi),%rax | |
624 adcq $0,%rdx | |
625 addq %r9,%r8 | |
626 movq %rdx,%r9 | |
627 adcq $0,%r9 | |
628 | |
629 mulq %rbx | |
630 addq %rax,%r10 | |
631 movq 24(%rsi),%rax | |
632 adcq $0,%rdx | |
633 addq %r10,%r9 | |
634 movq %rdx,%r10 | |
635 adcq $0,%r10 | |
636 | |
637 mulq %rbx | |
638 addq %rax,%r11 | |
639 movq 32(%rsi),%rax | |
640 adcq $0,%rdx | |
641 addq %r11,%r10 | |
642 movq %rdx,%r11 | |
643 adcq $0,%r11 | |
644 | |
645 mulq %rbx | |
646 addq %rax,%r12 | |
647 movq 40(%rsi),%rax | |
648 adcq $0,%rdx | |
649 addq %r12,%r11 | |
650 movq %rdx,%r12 | |
651 adcq $0,%r12 | |
652 | |
653 mulq %rbx | |
654 addq %rax,%r13 | |
655 movq 48(%rsi),%rax | |
656 adcq $0,%rdx | |
657 addq %r13,%r12 | |
658 movq %rdx,%r13 | |
659 adcq $0,%r13 | |
660 | |
661 mulq %rbx | |
662 addq %rax,%r14 | |
663 movq 56(%rsi),%rax | |
664 adcq $0,%rdx | |
665 addq %r14,%r13 | |
666 movq %rdx,%r14 | |
667 adcq $0,%r14 | |
668 | |
669 mulq %rbx | |
670 addq %rax,%r15 | |
671 movq (%rsi),%rax | |
672 adcq $0,%rdx | |
673 addq %r15,%r14 | |
674 movq %rdx,%r15 | |
675 adcq $0,%r15 | |
676 | |
677 leaq 8(%rdi),%rdi | |
678 | |
679 decl %ecx | |
680 jnz L$oop_mul_gather | |
681 | |
// Store the top eight limbs of the product.
682 movq %r8,(%rdi) | |
683 movq %r9,8(%rdi) | |
684 movq %r10,16(%rdi) | |
685 movq %r11,24(%rdi) | |
686 movq %r12,32(%rdi) | |
687 movq %r13,40(%rdi) | |
688 movq %r14,48(%rdi) | |
689 movq %r15,56(%rdi) | |
690 | |
// Restore the destination pointer; rbp = original rcx for reduce/subtract.
691 movq 128+8(%rsp),%rdi | |
692 movq 128+16(%rsp),%rbp | |
693 | |
694 movq (%rsp),%r8 | |
695 movq 8(%rsp),%r9 | |
696 movq 16(%rsp),%r10 | |
697 movq 24(%rsp),%r11 | |
698 movq 32(%rsp),%r12 | |
699 movq 40(%rsp),%r13 | |
700 movq 48(%rsp),%r14 | |
701 movq 56(%rsp),%r15 | |
702 | |
703 call __rsaz_512_reduce | |
// Add the high eight limbs; rcx = 0 or -1 from the final carry.
704 addq 64(%rsp),%r8 | |
705 adcq 72(%rsp),%r9 | |
706 adcq 80(%rsp),%r10 | |
707 adcq 88(%rsp),%r11 | |
708 adcq 96(%rsp),%r12 | |
709 adcq 104(%rsp),%r13 | |
710 adcq 112(%rsp),%r14 | |
711 adcq 120(%rsp),%r15 | |
712 sbbq %rcx,%rcx | |
713 | |
714 call __rsaz_512_subtract | |
715 | |
// Epilogue: restore callee-saved registers and release the frame.
716 leaq 128+24+48(%rsp),%rax | |
717 movq -48(%rax),%r15 | |
718 movq -40(%rax),%r14 | |
719 movq -32(%rax),%r13 | |
720 movq -24(%rax),%r12 | |
721 movq -16(%rax),%rbp | |
722 movq -8(%rax),%rbx | |
723 leaq (%rax),%rsp | |
724 L$mul_gather4_epilogue: | |
// 0xf3,0xc3 encodes "rep ret".
725 .byte 0xf3,0xc3 | |
726 | |
// _rsaz_512_mul_scatter4: multiply, reduce, conditionally subtract, then
// write the result both to the in/out operand and into a strided table.
// Register arguments as used below:
//   rdi  in/out operand, 8 qwords (one multiplicand and the destination)
//   rsi  other multiplicand, 8 qwords
//   rdx  pointer handed to reduce/subtract in rbp (presumably the modulus
//        -- confirm against the C prototype)
//   rcx  64-bit multiplier read by __rsaz_512_reduce
//   r8   table base
//   r9d  table index (zero-extended before use)
// Frame: (%rsp)..120(%rsp) 16-qword product, 128(%rsp) saved rcx.
727 .globl _rsaz_512_mul_scatter4 | |
728 .private_extern _rsaz_512_mul_scatter4 | |
729 | |
730 .p2align 5 | |
731 _rsaz_512_mul_scatter4: | |
732 pushq %rbx | |
733 pushq %rbp | |
734 pushq %r12 | |
735 pushq %r13 | |
736 pushq %r14 | |
737 pushq %r15 | |
738 | |
// Writing r9d clears the upper 32 bits of r9 before it is used as an index.
739 movl %r9d,%r9d | |
740 subq $128+24,%rsp | |
741 L$mul_scatter4_body: | |
// r8 = address of the first table slot for this index (qword stride 8).
742 leaq (%r8,%r9,8),%r8 | |
// Raw encodings: movq %rdi,%xmm0 ; movq %rdx,%xmm1 ; movq %r8,%xmm2.
743 .byte 102,72,15,110,199 | |
744 .byte 102,72,15,110,202 | |
745 .byte 102,73,15,110,208 | |
746 movq %rcx,128(%rsp) | |
747 | |
// __rsaz_512_mul expects rbp = second operand and rbx = its first limb.
748 movq %rdi,%rbp | |
749 movq (%rdi),%rbx | |
750 call __rsaz_512_mul | |
751 | |
// Raw encodings: movq %xmm0,%rdi ; movq %xmm1,%rbp (rbp = original rdx).
752 .byte 102,72,15,126,199 | |
753 .byte 102,72,15,126,205 | |
754 | |
755 movq (%rsp),%r8 | |
756 movq 8(%rsp),%r9 | |
757 movq 16(%rsp),%r10 | |
758 movq 24(%rsp),%r11 | |
759 movq 32(%rsp),%r12 | |
760 movq 40(%rsp),%r13 | |
761 movq 48(%rsp),%r14 | |
762 movq 56(%rsp),%r15 | |
763 | |
764 call __rsaz_512_reduce | |
// Add the high eight limbs of the product.
765 addq 64(%rsp),%r8 | |
766 adcq 72(%rsp),%r9 | |
767 adcq 80(%rsp),%r10 | |
768 adcq 88(%rsp),%r11 | |
769 adcq 96(%rsp),%r12 | |
770 adcq 104(%rsp),%r13 | |
771 adcq 112(%rsp),%r14 | |
772 adcq 120(%rsp),%r15 | |
// Raw encoding: movq %xmm2,%rsi (table slot address). The move does not
// alter flags, so the carry is still intact for the sbbq that follows.
773 .byte 102,72,15,126,214 | |
774 sbbq %rcx,%rcx | |
775 | |
776 call __rsaz_512_subtract | |
777 | |
// Scatter the eight result limbs into the table with a 128-byte stride.
778 movq %r8,0(%rsi) | |
779 movq %r9,128(%rsi) | |
780 movq %r10,256(%rsi) | |
781 movq %r11,384(%rsi) | |
782 movq %r12,512(%rsi) | |
783 movq %r13,640(%rsi) | |
784 movq %r14,768(%rsi) | |
785 movq %r15,896(%rsi) | |
786 | |
// Epilogue: restore callee-saved registers and release the frame.
787 leaq 128+24+48(%rsp),%rax | |
788 movq -48(%rax),%r15 | |
789 movq -40(%rax),%r14 | |
790 movq -32(%rax),%r13 | |
791 movq -24(%rax),%r12 | |
792 movq -16(%rax),%rbp | |
793 movq -8(%rax),%rbx | |
794 leaq (%rax),%rsp | |
795 L$mul_scatter4_epilogue: | |
// 0xf3,0xc3 encodes "rep ret".
796 .byte 0xf3,0xc3 | |
797 | |
// _rsaz_512_mul_by_one: run __rsaz_512_reduce directly on an 8-qword input
// and store the result; no multiplication and no call to
// __rsaz_512_subtract is made here.
// Register arguments as used below:
//   rdi  destination, 8 qwords
//   rsi  source, 8 qwords
//   rdx  pointer handed to reduce in rbp (presumably the modulus -- confirm
//        against the C prototype)
//   rcx  64-bit multiplier read by __rsaz_512_reduce
798 .globl _rsaz_512_mul_by_one | |
799 .private_extern _rsaz_512_mul_by_one | |
800 | |
801 .p2align 5 | |
802 _rsaz_512_mul_by_one: | |
803 pushq %rbx | |
804 pushq %rbp | |
805 pushq %r12 | |
806 pushq %r13 | |
807 pushq %r14 | |
808 pushq %r15 | |
809 | |
810 subq $128+24,%rsp | |
811 L$mul_by_one_body: | |
812 movq %rdx,%rbp | |
813 movq %rcx,128(%rsp) | |
814 | |
// Load the input limbs straight into the reduction accumulators.
815 movq (%rsi),%r8 | |
816 pxor %xmm0,%xmm0 | |
817 movq 8(%rsi),%r9 | |
818 movq 16(%rsi),%r10 | |
819 movq 24(%rsi),%r11 | |
820 movq 32(%rsi),%r12 | |
821 movq 40(%rsi),%r13 | |
822 movq 48(%rsi),%r14 | |
823 movq 56(%rsi),%r15 | |
824 | |
// Zero the first 112 bytes of the frame. movdqa needs a 16-byte-aligned
// address; six pushes plus the 152-byte frame give that alignment if rsp
// was ABI-aligned on entry (assumption -- confirm).
825 movdqa %xmm0,(%rsp) | |
826 movdqa %xmm0,16(%rsp) | |
827 movdqa %xmm0,32(%rsp) | |
828 movdqa %xmm0,48(%rsp) | |
829 movdqa %xmm0,64(%rsp) | |
830 movdqa %xmm0,80(%rsp) | |
831 movdqa %xmm0,96(%rsp) | |
832 call __rsaz_512_reduce | |
// Store the reduced limbs.
833 movq %r8,(%rdi) | |
834 movq %r9,8(%rdi) | |
835 movq %r10,16(%rdi) | |
836 movq %r11,24(%rdi) | |
837 movq %r12,32(%rdi) | |
838 movq %r13,40(%rdi) | |
839 movq %r14,48(%rdi) | |
840 movq %r15,56(%rdi) | |
841 | |
// Epilogue: restore callee-saved registers and release the frame.
842 leaq 128+24+48(%rsp),%rax | |
843 movq -48(%rax),%r15 | |
844 movq -40(%rax),%r14 | |
845 movq -32(%rax),%r13 | |
846 movq -24(%rax),%r12 | |
847 movq -16(%rax),%rbp | |
848 movq -8(%rax),%rbx | |
849 leaq (%rax),%rsp | |
850 L$mul_by_one_epilogue: | |
// 0xf3,0xc3 encodes "rep ret".
851 .byte 0xf3,0xc3 | |
852 | |
853 | |
// __rsaz_512_reduce: internal helper, eight rounds of word-wise reduction
// (the shape matches Montgomery reduction -- confirm).
// In:   r8..r15 = eight input limbs (low to high)
//       rbp     = pointer to eight qwords m[0..7]
//       128+8(%rsp) = 64-bit multiplier k; relative to the caller this is
//                 128(%rsp), the extra 8 bytes being the return address
// Out:  r8..r15 = eight result limbs
// Clobbers: rax, rbx, rcx, rdx, rsi, flags.
// Each round computes q = low64(r8 * k), adds q * m[0..7] to the
// accumulator, and shifts the accumulator window down by one limb.
854 .p2align 5 | |
855 __rsaz_512_reduce: | |
// rbx = q for the first round; rax = m[0]; ecx = round counter.
856 movq %r8,%rbx | |
857 imulq 128+8(%rsp),%rbx | |
858 movq 0(%rbp),%rax | |
859 movl $8,%ecx | |
860 jmp L$reduction_loop | |
861 | |
862 .p2align 5 | |
863 L$reduction_loop: | |
// The low word of q*m[0] is never added: negq sets CF exactly when r8 is
// nonzero, and only that carry is folded into the high word (presumably
// because the low word cancels r8 by construction of k -- confirm).
864 mulq %rbx | |
865 movq 8(%rbp),%rax | |
866 negq %r8 | |
867 movq %rdx,%r8 | |
868 adcq $0,%r8 | |
869 | |
870 mulq %rbx | |
871 addq %rax,%r9 | |
872 movq 16(%rbp),%rax | |
873 adcq $0,%rdx | |
874 addq %r9,%r8 | |
875 movq %rdx,%r9 | |
876 adcq $0,%r9 | |
877 | |
878 mulq %rbx | |
879 addq %rax,%r10 | |
880 movq 24(%rbp),%rax | |
881 adcq $0,%rdx | |
882 addq %r10,%r9 | |
883 movq %rdx,%r10 | |
884 adcq $0,%r10 | |
885 | |
886 mulq %rbx | |
887 addq %rax,%r11 | |
888 movq 32(%rbp),%rax | |
889 adcq $0,%rdx | |
890 addq %r11,%r10 | |
// Reload k; the load does not disturb the carry used by the adcq below.
891 movq 128+8(%rsp),%rsi | |
892 | |
893 | |
894 adcq $0,%rdx | |
895 movq %rdx,%r11 | |
896 | |
897 mulq %rbx | |
898 addq %rax,%r12 | |
899 movq 40(%rbp),%rax | |
900 adcq $0,%rdx | |
// rsi = q for the next round (new r8 times k); imulq overwrites flags, and
// the addq that follows sets them again before the next adcq.
901 imulq %r8,%rsi | |
902 addq %r12,%r11 | |
903 movq %rdx,%r12 | |
904 adcq $0,%r12 | |
905 | |
906 mulq %rbx | |
907 addq %rax,%r13 | |
908 movq 48(%rbp),%rax | |
909 adcq $0,%rdx | |
910 addq %r13,%r12 | |
911 movq %rdx,%r13 | |
912 adcq $0,%r13 | |
913 | |
914 mulq %rbx | |
915 addq %rax,%r14 | |
916 movq 56(%rbp),%rax | |
917 adcq $0,%rdx | |
918 addq %r14,%r13 | |
919 movq %rdx,%r14 | |
920 adcq $0,%r14 | |
921 | |
922 mulq %rbx | |
// Switch to the next q; rax is reloaded with m[0] for the next round.
923 movq %rsi,%rbx | |
924 addq %rax,%r15 | |
925 movq 0(%rbp),%rax | |
926 adcq $0,%rdx | |
927 addq %r15,%r14 | |
928 movq %rdx,%r15 | |
929 adcq $0,%r15 | |
930 | |
931 decl %ecx | |
932 jne L$reduction_loop | |
933 | |
// 0xf3,0xc3 encodes "rep ret".
934 .byte 0xf3,0xc3 | |
935 | |
936 | |
// __rsaz_512_subtract: internal helper, branch-free conditional subtraction.
// In:   r8..r15 = eight value limbs
//       rdi     = destination, 8 qwords
//       rbp     = pointer to eight qwords m[0..7]
//       rcx     = mask, 0 or all-ones (callers build it with sbbq)
// Out:  (%rdi) and r8..r15 = value - m when rcx is all-ones, the value
//       unchanged when rcx is 0 (result taken modulo 2^512).
// The subtraction is done by adding the masked negation of m: limb 0 is
// negated and limbs 1..7 are complemented, which equals the 512-bit
// two's-complement negation as long as m[0] is nonzero.
937 .p2align 5 | |
938 __rsaz_512_subtract: | |
// Park the value in the destination so r8..r15 can hold the masked negation.
939 movq %r8,(%rdi) | |
940 movq %r9,8(%rdi) | |
941 movq %r10,16(%rdi) | |
942 movq %r11,24(%rdi) | |
943 movq %r12,32(%rdi) | |
944 movq %r13,40(%rdi) | |
945 movq %r14,48(%rdi) | |
946 movq %r15,56(%rdi) | |
947 | |
// r8..r15 = (-m) & rcx, built limb by limb.
948 movq 0(%rbp),%r8 | |
949 movq 8(%rbp),%r9 | |
950 negq %r8 | |
951 notq %r9 | |
952 andq %rcx,%r8 | |
953 movq 16(%rbp),%r10 | |
954 andq %rcx,%r9 | |
955 notq %r10 | |
956 movq 24(%rbp),%r11 | |
957 andq %rcx,%r10 | |
958 notq %r11 | |
959 movq 32(%rbp),%r12 | |
960 andq %rcx,%r11 | |
961 notq %r12 | |
962 movq 40(%rbp),%r13 | |
963 andq %rcx,%r12 | |
964 notq %r13 | |
965 movq 48(%rbp),%r14 | |
966 andq %rcx,%r13 | |
967 notq %r14 | |
968 movq 56(%rbp),%r15 | |
969 andq %rcx,%r14 | |
970 notq %r15 | |
971 andq %rcx,%r15 | |
972 | |
// Add the parked value with a full carry chain.
973 addq (%rdi),%r8 | |
974 adcq 8(%rdi),%r9 | |
975 adcq 16(%rdi),%r10 | |
976 adcq 24(%rdi),%r11 | |
977 adcq 32(%rdi),%r12 | |
978 adcq 40(%rdi),%r13 | |
979 adcq 48(%rdi),%r14 | |
980 adcq 56(%rdi),%r15 | |
981 | |
// Write the final limbs back; they also remain in r8..r15 for the caller.
982 movq %r8,(%rdi) | |
983 movq %r9,8(%rdi) | |
984 movq %r10,16(%rdi) | |
985 movq %r11,24(%rdi) | |
986 movq %r12,32(%rdi) | |
987 movq %r13,40(%rdi) | |
988 movq %r14,48(%rdi) | |
989 movq %r15,56(%rdi) | |
990 | |
// 0xf3,0xc3 encodes "rep ret".
991 .byte 0xf3,0xc3 | |
992 | |
993 | |
// __rsaz_512_mul: internal helper, schoolbook 512x512-bit multiplication.
// In:   rsi = a[0..7]
//       rbp = b[0..7]
//       rbx = b[0] (preloaded by the caller)
// Out:  16-qword product at the caller's (%rsp)..120(%rsp); inside this
//       routine that buffer starts at 8(%rsp) because of the return address.
// Clobbers: rax, rbx, rcx, rdx, rdi, rbp, r8..r15, flags.
994 .p2align 5 | |
995 __rsaz_512_mul: | |
// rdi = start of the caller's product buffer.
996 leaq 8(%rsp),%rdi | |
997 | |
// First row: a[0..7] * b[0]. Limb 0 is stored; r8..r15 hold limbs 1..8.
998 movq (%rsi),%rax | |
999 mulq %rbx | |
1000 movq %rax,(%rdi) | |
1001 movq 8(%rsi),%rax | |
1002 movq %rdx,%r8 | |
1003 | |
1004 mulq %rbx | |
1005 addq %rax,%r8 | |
1006 movq 16(%rsi),%rax | |
1007 movq %rdx,%r9 | |
1008 adcq $0,%r9 | |
1009 | |
1010 mulq %rbx | |
1011 addq %rax,%r9 | |
1012 movq 24(%rsi),%rax | |
1013 movq %rdx,%r10 | |
1014 adcq $0,%r10 | |
1015 | |
1016 mulq %rbx | |
1017 addq %rax,%r10 | |
1018 movq 32(%rsi),%rax | |
1019 movq %rdx,%r11 | |
1020 adcq $0,%r11 | |
1021 | |
1022 mulq %rbx | |
1023 addq %rax,%r11 | |
1024 movq 40(%rsi),%rax | |
1025 movq %rdx,%r12 | |
1026 adcq $0,%r12 | |
1027 | |
1028 mulq %rbx | |
1029 addq %rax,%r12 | |
1030 movq 48(%rsi),%rax | |
1031 movq %rdx,%r13 | |
1032 adcq $0,%r13 | |
1033 | |
1034 mulq %rbx | |
1035 addq %rax,%r13 | |
1036 movq 56(%rsi),%rax | |
1037 movq %rdx,%r14 | |
1038 adcq $0,%r14 | |
1039 | |
1040 mulq %rbx | |
1041 addq %rax,%r14 | |
// rax = a[0] is preloaded for the first multiply of the next row.
1042 movq (%rsi),%rax | |
1043 movq %rdx,%r15 | |
1044 adcq $0,%r15 | |
1045 | |
// Advance to b[1] and to product limb 1; seven more rows follow.
1046 leaq 8(%rbp),%rbp | |
1047 leaq 8(%rdi),%rdi | |
1048 | |
1049 movl $7,%ecx | |
1050 jmp L$oop_mul | |
1051 | |
1052 .p2align 5 | |
1053 L$oop_mul: | |
// Row i: accumulate a[0..7] * b[i]; the completed low limb is stored at
// (%rdi) and the accumulator window r8..r15 shifts down by one limb.
1054 movq (%rbp),%rbx | |
1055 mulq %rbx | |
1056 addq %rax,%r8 | |
1057 movq 8(%rsi),%rax | |
1058 movq %r8,(%rdi) | |
1059 movq %rdx,%r8 | |
1060 adcq $0,%r8 | |
1061 | |
1062 mulq %rbx | |
1063 addq %rax,%r9 | |
1064 movq 16(%rsi),%rax | |
1065 adcq $0,%rdx | |
1066 addq %r9,%r8 | |
1067 movq %rdx,%r9 | |
1068 adcq $0,%r9 | |
1069 | |
1070 mulq %rbx | |
1071 addq %rax,%r10 | |
1072 movq 24(%rsi),%rax | |
1073 adcq $0,%rdx | |
1074 addq %r10,%r9 | |
1075 movq %rdx,%r10 | |
1076 adcq $0,%r10 | |
1077 | |
1078 mulq %rbx | |
1079 addq %rax,%r11 | |
1080 movq 32(%rsi),%rax | |
1081 adcq $0,%rdx | |
1082 addq %r11,%r10 | |
1083 movq %rdx,%r11 | |
1084 adcq $0,%r11 | |
1085 | |
1086 mulq %rbx | |
1087 addq %rax,%r12 | |
1088 movq 40(%rsi),%rax | |
1089 adcq $0,%rdx | |
1090 addq %r12,%r11 | |
1091 movq %rdx,%r12 | |
1092 adcq $0,%r12 | |
1093 | |
1094 mulq %rbx | |
1095 addq %rax,%r13 | |
1096 movq 48(%rsi),%rax | |
1097 adcq $0,%rdx | |
1098 addq %r13,%r12 | |
1099 movq %rdx,%r13 | |
1100 adcq $0,%r13 | |
1101 | |
1102 mulq %rbx | |
1103 addq %rax,%r14 | |
1104 movq 56(%rsi),%rax | |
1105 adcq $0,%rdx | |
1106 addq %r14,%r13 | |
1107 movq %rdx,%r14 | |
// Advance to the next b limb; leaq leaves the pending carry untouched.
1108 leaq 8(%rbp),%rbp | |
1109 adcq $0,%r14 | |
1110 | |
1111 mulq %rbx | |
1112 addq %rax,%r15 | |
1113 movq (%rsi),%rax | |
1114 adcq $0,%rdx | |
1115 addq %r15,%r14 | |
1116 movq %rdx,%r15 | |
1117 adcq $0,%r15 | |
1118 | |
1119 leaq 8(%rdi),%rdi | |
1120 | |
1121 decl %ecx | |
1122 jnz L$oop_mul | |
1123 | |
// Store the top eight limbs of the product.
1124 movq %r8,(%rdi) | |
1125 movq %r9,8(%rdi) | |
1126 movq %r10,16(%rdi) | |
1127 movq %r11,24(%rdi) | |
1128 movq %r12,32(%rdi) | |
1129 movq %r13,40(%rdi) | |
1130 movq %r14,48(%rdi) | |
1131 movq %r15,56(%rdi) | |
1132 | |
// 0xf3,0xc3 encodes "rep ret".
1133 .byte 0xf3,0xc3 | |
1134 | |
// _rsaz_512_scatter4: copy eight consecutive qwords into a strided table.
// Register arguments as used below:
//   rdi  table base
//   rsi  source, 8 qwords
//   rdx  entry index; limb j is written at table + 8*index + 128*j
// NOTE(review): all 64 bits of rdx are used as the index here, whereas
// _rsaz_512_mul_scatter4 zero-extends its 32-bit index first -- confirm
// that callers pass a value whose upper 32 bits are clean.
1135 .globl _rsaz_512_scatter4 | |
1136 .private_extern _rsaz_512_scatter4 | |
1137 | |
1138 .p2align 4 | |
1139 _rsaz_512_scatter4: | |
1140 leaq (%rdi,%rdx,8),%rdi | |
1141 movl $8,%r9d | |
1142 jmp L$oop_scatter | |
1143 .p2align 4 | |
1144 L$oop_scatter: | |
// One limb per pass: source advances by 8 bytes, table by 128 bytes.
1145 movq (%rsi),%rax | |
1146 leaq 8(%rsi),%rsi | |
1147 movq %rax,(%rdi) | |
1148 leaq 128(%rdi),%rdi | |
1149 decl %r9d | |
1150 jnz L$oop_scatter | |
// 0xf3,0xc3 encodes "rep ret".
1151 .byte 0xf3,0xc3 | |
1152 | |
1153 | |
// _rsaz_512_gather4: read one 8-qword entry out of a strided table.
// Register arguments as used below:
//   rdi  destination, 8 qwords
//   rsi  table: eight rows of 128 bytes, read with movdqa (must be
//        16-byte aligned)
//   edx  entry index; values 0..15 select an entry, any other value yields
//        all-zero masks and therefore zero output
// Every 16-byte chunk of each row is loaded regardless of the index, so the
// memory access pattern does not depend on edx (presumably deliberate, to
// keep the index out of the cache footprint).
1154 .globl _rsaz_512_gather4 | |
1155 .private_extern _rsaz_512_gather4 | |
1156 | |
1157 .p2align 4 | |
1158 _rsaz_512_gather4: | |
// xmm8 = index broadcast to four dwords; xmm1 = {2,2,2,2}, xmm0 = {0,0,1,1}.
1159 movd %edx,%xmm8 | |
1160 movdqa L$inc+16(%rip),%xmm1 | |
1161 movdqa L$inc(%rip),%xmm0 | |
1162 | |
// Build xmm0..xmm7 as equality masks against {0,0,1,1}, {2,2,3,3}, ...,
// {14,14,15,15}: each 64-bit lane is all-ones only for the selected index.
1163 pshufd $0,%xmm8,%xmm8 | |
1164 movdqa %xmm1,%xmm7 | |
1165 movdqa %xmm1,%xmm2 | |
1166 paddd %xmm0,%xmm1 | |
1167 pcmpeqd %xmm8,%xmm0 | |
1168 movdqa %xmm7,%xmm3 | |
1169 paddd %xmm1,%xmm2 | |
1170 pcmpeqd %xmm8,%xmm1 | |
1171 movdqa %xmm7,%xmm4 | |
1172 paddd %xmm2,%xmm3 | |
1173 pcmpeqd %xmm8,%xmm2 | |
1174 movdqa %xmm7,%xmm5 | |
1175 paddd %xmm3,%xmm4 | |
1176 pcmpeqd %xmm8,%xmm3 | |
1177 movdqa %xmm7,%xmm6 | |
1178 paddd %xmm4,%xmm5 | |
1179 pcmpeqd %xmm8,%xmm4 | |
1180 paddd %xmm5,%xmm6 | |
1181 pcmpeqd %xmm8,%xmm5 | |
1182 paddd %xmm6,%xmm7 | |
1183 pcmpeqd %xmm8,%xmm6 | |
1184 pcmpeqd %xmm8,%xmm7 | |
// Eight rows, one output limb per row.
1185 movl $8,%r9d | |
1186 jmp L$oop_gather | |
1187 .p2align 4 | |
1188 L$oop_gather: | |
// Load the whole row, mask each chunk, and OR everything together.
1189 movdqa 0(%rsi),%xmm8 | |
1190 movdqa 16(%rsi),%xmm9 | |
1191 movdqa 32(%rsi),%xmm10 | |
1192 movdqa 48(%rsi),%xmm11 | |
1193 pand %xmm0,%xmm8 | |
1194 movdqa 64(%rsi),%xmm12 | |
1195 pand %xmm1,%xmm9 | |
1196 movdqa 80(%rsi),%xmm13 | |
1197 pand %xmm2,%xmm10 | |
1198 movdqa 96(%rsi),%xmm14 | |
1199 pand %xmm3,%xmm11 | |
1200 movdqa 112(%rsi),%xmm15 | |
1201 leaq 128(%rsi),%rsi | |
1202 pand %xmm4,%xmm12 | |
1203 pand %xmm5,%xmm13 | |
1204 pand %xmm6,%xmm14 | |
1205 pand %xmm7,%xmm15 | |
1206 por %xmm10,%xmm8 | |
1207 por %xmm11,%xmm9 | |
1208 por %xmm12,%xmm8 | |
1209 por %xmm13,%xmm9 | |
1210 por %xmm14,%xmm8 | |
1211 por %xmm15,%xmm9 | |
1212 | |
// Fold the two qword lanes (pshufd $0x4e swaps them) and store the low one.
1213 por %xmm9,%xmm8 | |
1214 pshufd $0x4e,%xmm8,%xmm9 | |
1215 por %xmm9,%xmm8 | |
1216 movq %xmm8,(%rdi) | |
1217 leaq 8(%rdi),%rdi | |
1218 decl %r9d | |
1219 jnz L$oop_gather | |
// 0xf3,0xc3 encodes "rep ret".
1220 .byte 0xf3,0xc3 | |
// End-of-function marker label; nothing in this file view references it.
1221 L$SEH_end_rsaz_512_gather4: | |
1222 | |
1223 | |
// L$inc: 64-byte-aligned seed constants for the gather index masks.
// The first 16 bytes are the dwords {0,0,1,1}; the next 16 bytes (L$inc+16)
// are the dwords {2,2,2,2}, the per-register increment.
1224 .p2align 6 | |
1225 L$inc: | |
1226 .long 0,0, 1,1 | |
1227 .long 2,2, 2,2 | |
1228 #endif | |
OLD | NEW |