#if defined(__x86_64__)
.text

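/*
 * 512-bit (8-limb) modular arithmetic primitives in the RSAZ style:
 * squaring, multiplication, Montgomery reduction, and constant-time
 * scatter/gather into a power table.  This appears to be generated
 * (perlasm-style) code, so the comments below only describe register
 * usage and behaviour that is visible in the instructions themselves.
 */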
.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

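/*
 * rsaz_512_sqr: repeated 512-bit Montgomery squaring.
 * Register roles as used below: %rdi = output, %rsi = input, %rdx = modulus
 * (moved to %rbp), %rcx = n0 (the Montgomery constant, saved at 128(%rsp)),
 * %r8d = number of squarings.  Each pass computes the full 16-limb square
 * on the stack, Montgomery-reduces it, and squares the result again.
 */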
.globl rsaz_512_sqr
.hidden rsaz_512_sqr
.type rsaz_512_sqr,@function
.align 32
rsaz_512_sqr:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15

subq $128+24,%rsp
.Lsqr_body:
movq %rdx,%rbp
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
jmp .Loop_sqr

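/*
 * One squaring pass: accumulate the off-diagonal products a[i]*a[j] (j > i),
 * double them, and add the diagonal squares a[i]^2, storing the 16-limb
 * result at (%rsp)..120(%rsp).
 */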
.align 32
.Loop_sqr:
movl %r8d,128+8(%rsp)

movq %rdx,%rbx
mulq %rdx
movq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9

mulq %rbx
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r12
movq 48(%rsi),%rax
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
addq %rax,%r14
movq %rbx,%rax
movq %rdx,%r15
adcq $0,%r15

addq %r8,%r8
movq %r9,%rcx
adcq %r9,%r9

mulq %rax
movq %rax,(%rsp)
addq %rdx,%r8
adcq $0,%r9

movq %r8,8(%rsp)
shrq $63,%rcx


movq 8(%rsi),%r8
movq 16(%rsi),%rax
mulq %r8
addq %rax,%r10
movq 24(%rsi),%rax
movq %rdx,%rbx
adcq $0,%rbx

mulq %r8
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r11
movq %rdx,%rbx
adcq $0,%rbx

mulq %r8
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r12
movq %rdx,%rbx
adcq $0,%rbx

mulq %r8
addq %rax,%r13
movq 48(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r13
movq %rdx,%rbx
adcq $0,%rbx

mulq %r8
addq %rax,%r14
movq 56(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r14
movq %rdx,%rbx
adcq $0,%rbx

mulq %r8
addq %rax,%r15
movq %r8,%rax
adcq $0,%rdx
addq %rbx,%r15
movq %rdx,%r8
movq %r10,%rdx
adcq $0,%r8

addq %rdx,%rdx
leaq (%rcx,%r10,2),%r10
movq %r11,%rbx
adcq %r11,%r11

mulq %rax
addq %rax,%r9
adcq %rdx,%r10
adcq $0,%r11

movq %r9,16(%rsp)
movq %r10,24(%rsp)
shrq $63,%rbx


movq 16(%rsi),%r9
movq 24(%rsi),%rax
mulq %r9
addq %rax,%r12
movq 32(%rsi),%rax
movq %rdx,%rcx
adcq $0,%rcx

mulq %r9
addq %rax,%r13
movq 40(%rsi),%rax
adcq $0,%rdx
addq %rcx,%r13
movq %rdx,%rcx
adcq $0,%rcx

mulq %r9
addq %rax,%r14
movq 48(%rsi),%rax
adcq $0,%rdx
addq %rcx,%r14
movq %rdx,%rcx
adcq $0,%rcx

mulq %r9
movq %r12,%r10
leaq (%rbx,%r12,2),%r12
addq %rax,%r15
movq 56(%rsi),%rax
adcq $0,%rdx
addq %rcx,%r15
movq %rdx,%rcx
adcq $0,%rcx

mulq %r9
shrq $63,%r10
addq %rax,%r8
movq %r9,%rax
adcq $0,%rdx
addq %rcx,%r8
movq %rdx,%r9
adcq $0,%r9

movq %r13,%rcx
leaq (%r10,%r13,2),%r13

mulq %rax
addq %rax,%r11
adcq %rdx,%r12
adcq $0,%r13

movq %r11,32(%rsp)
movq %r12,40(%rsp)
shrq $63,%rcx


movq 24(%rsi),%r10
movq 32(%rsi),%rax
mulq %r10
addq %rax,%r14
movq 40(%rsi),%rax
movq %rdx,%rbx
adcq $0,%rbx

mulq %r10
addq %rax,%r15
movq 48(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r15
movq %rdx,%rbx
adcq $0,%rbx

mulq %r10
movq %r14,%r12
leaq (%rcx,%r14,2),%r14
addq %rax,%r8
movq 56(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r8
movq %rdx,%rbx
adcq $0,%rbx

mulq %r10
shrq $63,%r12
addq %rax,%r9
movq %r10,%rax
adcq $0,%rdx
addq %rbx,%r9
movq %rdx,%r10
adcq $0,%r10

movq %r15,%rbx
leaq (%r12,%r15,2),%r15

mulq %rax
addq %rax,%r13
adcq %rdx,%r14
adcq $0,%r15

movq %r13,48(%rsp)
movq %r14,56(%rsp)
shrq $63,%rbx


movq 32(%rsi),%r11
movq 40(%rsi),%rax
mulq %r11
addq %rax,%r8
movq 48(%rsi),%rax
movq %rdx,%rcx
adcq $0,%rcx

mulq %r11
addq %rax,%r9
movq 56(%rsi),%rax
adcq $0,%rdx
movq %r8,%r12
leaq (%rbx,%r8,2),%r8
addq %rcx,%r9
movq %rdx,%rcx
adcq $0,%rcx

mulq %r11
shrq $63,%r12
addq %rax,%r10
movq %r11,%rax
adcq $0,%rdx
addq %rcx,%r10
movq %rdx,%r11
adcq $0,%r11

movq %r9,%rcx
leaq (%r12,%r9,2),%r9

mulq %rax
addq %rax,%r15
adcq %rdx,%r8
adcq $0,%r9

movq %r15,64(%rsp)
movq %r8,72(%rsp)
shrq $63,%rcx


movq 40(%rsi),%r12
movq 48(%rsi),%rax
mulq %r12
addq %rax,%r10
movq 56(%rsi),%rax
movq %rdx,%rbx
adcq $0,%rbx

mulq %r12
addq %rax,%r11
movq %r12,%rax
movq %r10,%r15
leaq (%rcx,%r10,2),%r10
adcq $0,%rdx
shrq $63,%r15
addq %rbx,%r11
movq %rdx,%r12
adcq $0,%r12

movq %r11,%rbx
leaq (%r15,%r11,2),%r11

mulq %rax
addq %rax,%r9
adcq %rdx,%r10
adcq $0,%r11

movq %r9,80(%rsp)
movq %r10,88(%rsp)


movq 48(%rsi),%r13
movq 56(%rsi),%rax
mulq %r13
addq %rax,%r12
movq %r13,%rax
movq %rdx,%r13
adcq $0,%r13

xorq %r14,%r14
shlq $1,%rbx
adcq %r12,%r12
adcq %r13,%r13
adcq %r14,%r14

mulq %rax
addq %rax,%r11
adcq %rdx,%r12
adcq $0,%r13

movq %r11,96(%rsp)
movq %r12,104(%rsp)


movq 56(%rsi),%rax
mulq %rax
addq %rax,%r13
adcq $0,%rdx

addq %rdx,%r14

movq %r13,112(%rsp)
movq %r14,120(%rsp)

movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15

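/*
 * Montgomery-reduce the low half of the square (now in %r8..%r15), add the
 * high half from 64(%rsp)..120(%rsp); the carry captured by sbbq %rcx,%rcx
 * tells __rsaz_512_subtract whether to subtract the modulus once.
 */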
call __rsaz_512_reduce

addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx

call __rsaz_512_subtract

movq %r8,%rdx
movq %r9,%rax
movl 128+8(%rsp),%r8d
movq %rdi,%rsi

decl %r8d
jnz .Loop_sqr

leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Lsqr_epilogue:
.byte 0xf3,0xc3
.size rsaz_512_sqr,.-rsaz_512_sqr
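/*
 * rsaz_512_mul: 512x512-bit Montgomery multiplication.
 * Register roles as used below: %rdi = output, %rsi = multiplicand,
 * %rdx = multiplier, %rcx = modulus, %r8 = n0.  The output and modulus
 * pointers are parked in %xmm0/%xmm1 across __rsaz_512_mul (the .byte
 * sequences encode movq between general-purpose and XMM registers).
 */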
.globl rsaz_512_mul
.hidden rsaz_512_mul
.type rsaz_512_mul,@function
.align 32
rsaz_512_mul:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15

subq $128+24,%rsp
.Lmul_body:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul

.byte 102,72,15,126,199
.byte 102,72,15,126,205

movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15

call __rsaz_512_reduce
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx

call __rsaz_512_subtract

leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Lmul_epilogue:
.byte 0xf3,0xc3
.size rsaz_512_mul,.-rsaz_512_mul
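/*
 * rsaz_512_mul_gather4: Montgomery multiplication by one entry of a power
 * table.  Register roles as used below: %rdi = output, %rsi = multiplicand,
 * %rdx = table, %rcx = modulus, %r8 = n0, %r9d = entry index.  The gather
 * reads all 16 candidate entries for every limb and selects with PAND/POR
 * masks, so the memory access pattern does not depend on the index.
 */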
.globl rsaz_512_mul_gather4
.hidden rsaz_512_mul_gather4
.type rsaz_512_mul_gather4,@function
.align 32
rsaz_512_mul_gather4:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15

subq $152,%rsp
.Lmul_gather4_body:
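/* Build eight XMM compare masks from the .Linc increments (two candidate
   indices per register, 16 in total); for an index in 0..15 exactly one
   64-bit lane becomes all-ones.  The .byte sequence after each fold encodes
   movq %xmm8,%rbx, moving the selected limb into %rbx. */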
movd %r9d,%xmm8
movdqa .Linc+16(%rip),%xmm1
movdqa .Linc(%rip),%xmm0

pshufd $0,%xmm8,%xmm8
movdqa %xmm1,%xmm7
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm8,%xmm0
movdqa %xmm7,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm8,%xmm1
movdqa %xmm7,%xmm4
paddd %xmm2,%xmm3
pcmpeqd %xmm8,%xmm2
movdqa %xmm7,%xmm5
paddd %xmm3,%xmm4
pcmpeqd %xmm8,%xmm3
movdqa %xmm7,%xmm6
paddd %xmm4,%xmm5
pcmpeqd %xmm8,%xmm4
paddd %xmm5,%xmm6
pcmpeqd %xmm8,%xmm5
paddd %xmm6,%xmm7
pcmpeqd %xmm8,%xmm6
pcmpeqd %xmm8,%xmm7

movdqa 0(%rdx),%xmm8
movdqa 16(%rdx),%xmm9
movdqa 32(%rdx),%xmm10
movdqa 48(%rdx),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rdx),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rdx),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rdx),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rdx),%xmm15
leaq 128(%rdx),%rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9

por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
.byte 102,76,15,126,195

movq %r8,128(%rsp)
movq %rdi,128+8(%rsp)
movq %rcx,128+16(%rsp)

movq (%rsi),%rax
movq 8(%rsi),%rcx
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8

mulq %rbx
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r12
movq 48(%rsi),%rax
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
adcq $0,%r15

leaq 8(%rsp),%rdi
movl $7,%ecx
jmp .Loop_mul_gather

.align 32
.Loop_mul_gather:
movdqa 0(%rbp),%xmm8
movdqa 16(%rbp),%xmm9
movdqa 32(%rbp),%xmm10
movdqa 48(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rbp),%xmm15
leaq 128(%rbp),%rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9

por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
.byte 102,76,15,126,195

mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8

mulq %rbx
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r13
movq 48(%rsi),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r14
movq 56(%rsi),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
addq %r15,%r14
movq %rdx,%r15
adcq $0,%r15

leaq 8(%rdi),%rdi

decl %ecx
jnz .Loop_mul_gather

movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)

movq 128+8(%rsp),%rdi
movq 128+16(%rsp),%rbp

movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15

call __rsaz_512_reduce
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx

call __rsaz_512_subtract

leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Lmul_gather4_epilogue:
.byte 0xf3,0xc3
.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
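/*
 * rsaz_512_mul_scatter4: Montgomery-multiply the 8 limbs at %rsi by the 8
 * limbs at %rdi, reduce modulo the modulus in %rdx (n0 in %rcx), write the
 * result to %rdi and also scatter it into the table at %r8, entry %r9d,
 * with a 128-byte stride between limbs.  The output, modulus, and scatter
 * addresses are parked in %xmm0/%xmm1/%xmm2 (the .byte sequences encode
 * movq between general-purpose and XMM registers).
 */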
.globl rsaz_512_mul_scatter4
.hidden rsaz_512_mul_scatter4
.type rsaz_512_mul_scatter4,@function
.align 32
rsaz_512_mul_scatter4:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15

movl %r9d,%r9d
subq $128+24,%rsp
.Lmul_scatter4_body:
leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
movq %rcx,128(%rsp)

movq %rdi,%rbp
movq (%rdi),%rbx
call __rsaz_512_mul

.byte 102,72,15,126,199
.byte 102,72,15,126,205

movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15

call __rsaz_512_reduce
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
.byte 102,72,15,126,214
sbbq %rcx,%rcx

call __rsaz_512_subtract

movq %r8,0(%rsi)
movq %r9,128(%rsi)
movq %r10,256(%rsi)
movq %r11,384(%rsi)
movq %r12,512(%rsi)
movq %r13,640(%rsi)
movq %r14,768(%rsi)
movq %r15,896(%rsi)

leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Lmul_scatter4_epilogue:
.byte 0xf3,0xc3
.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
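/*
 * rsaz_512_mul_by_one: Montgomery-reduce the 8-limb value at %rsi (i.e.
 * multiply by 1), converting it out of Montgomery form.  %rdi = output,
 * %rdx = modulus, %rcx = n0.  The stack scratch area is zeroed first.
 */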
.globl rsaz_512_mul_by_one
.hidden rsaz_512_mul_by_one
.type rsaz_512_mul_by_one,@function
.align 32
rsaz_512_mul_by_one:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15

subq $128+24,%rsp
.Lmul_by_one_body:
movq %rdx,%rbp
movq %rcx,128(%rsp)

movq (%rsi),%r8
pxor %xmm0,%xmm0
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq 48(%rsi),%r14
movq 56(%rsi),%r15

movdqa %xmm0,(%rsp)
movdqa %xmm0,16(%rsp)
movdqa %xmm0,32(%rsp)
movdqa %xmm0,48(%rsp)
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
call __rsaz_512_reduce
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)

leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Lmul_by_one_epilogue:
.byte 0xf3,0xc3
.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
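/*
 * __rsaz_512_reduce: eight rounds of word-by-word Montgomery reduction of
 * the low limbs in %r8..%r15 against the modulus at (%rbp).  n0 is read
 * from the caller's 128(%rsp) slot, which is 128+8(%rsp) here because of
 * the return address pushed by the call.
 */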
.type __rsaz_512_reduce,@function
.align 32
__rsaz_512_reduce:
movq %r8,%rbx
imulq 128+8(%rsp),%rbx
movq 0(%rbp),%rax
movl $8,%ecx
jmp .Lreduction_loop

.align 32
.Lreduction_loop:
mulq %rbx
movq 8(%rbp),%rax
negq %r8
movq %rdx,%r8
adcq $0,%r8

mulq %rbx
addq %rax,%r9
movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r10
movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r11
movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq 128+8(%rsp),%rsi


adcq $0,%rdx
movq %rdx,%r11

mulq %rbx
addq %rax,%r12
movq 40(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r13
movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r14
movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
movq %rsi,%rbx
addq %rax,%r15
movq 0(%rbp),%rax
adcq $0,%rdx
addq %r15,%r14
movq %rdx,%r15
adcq $0,%r15

decl %ecx
jne .Lreduction_loop

.byte 0xf3,0xc3
.size __rsaz_512_reduce,.-__rsaz_512_reduce
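/*
 * __rsaz_512_subtract: store %r8..%r15 to (%rdi), then conditionally
 * subtract the modulus at (%rbp): its two's complement is masked with %rcx
 * (all-ones or zero, set by the caller's sbbq) and added back in.
 */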
.type __rsaz_512_subtract,@function
.align 32
__rsaz_512_subtract:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)

movq 0(%rbp),%r8
movq 8(%rbp),%r9
negq %r8
notq %r9
andq %rcx,%r8
movq 16(%rbp),%r10
andq %rcx,%r9
notq %r10
movq 24(%rbp),%r11
andq %rcx,%r10
notq %r11
movq 32(%rbp),%r12
andq %rcx,%r11
notq %r12
movq 40(%rbp),%r13
andq %rcx,%r12
notq %r13
movq 48(%rbp),%r14
andq %rcx,%r13
notq %r14
movq 56(%rbp),%r15
andq %rcx,%r14
notq %r15
andq %rcx,%r15

addq (%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15

movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)

.byte 0xf3,0xc3
.size __rsaz_512_subtract,.-__rsaz_512_subtract
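/*
 * __rsaz_512_mul: schoolbook 8x8-limb multiplication of the value at %rsi
 * by the value at %rbp (first limb preloaded in %rbx by the caller).  The
 * 16-limb product is written to the caller's stack at (%rsp); the
 * leaq 8(%rsp) below compensates for the return address pushed by the call.
 */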
.type __rsaz_512_mul,@function
.align 32
__rsaz_512_mul:
leaq 8(%rsp),%rdi

movq (%rsi),%rax
mulq %rbx
movq %rax,(%rdi)
movq 8(%rsi),%rax
movq %rdx,%r8

mulq %rbx
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r12
movq 48(%rsi),%rax
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14

mulq %rbx
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
adcq $0,%r15

leaq 8(%rbp),%rbp
leaq 8(%rdi),%rdi

movl $7,%ecx
jmp .Loop_mul

.align 32
.Loop_mul:
movq (%rbp),%rbx
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8

mulq %rbx
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rdx,%r9
adcq $0,%r9

mulq %rbx
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
adcq $0,%r10

mulq %rbx
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11

mulq %rbx
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12

mulq %rbx
addq %rax,%r13
movq 48(%rsi),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13

mulq %rbx
addq %rax,%r14
movq 56(%rsi),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
leaq 8(%rbp),%rbp
adcq $0,%r14

mulq %rbx
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
addq %r15,%r14
movq %rdx,%r15
adcq $0,%r15

leaq 8(%rdi),%rdi

decl %ecx
jnz .Loop_mul

movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)

.byte 0xf3,0xc3
.size __rsaz_512_mul,.-__rsaz_512_mul
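/*
 * rsaz_512_scatter4: store the 8 limbs at %rsi into the table at %rdi,
 * entry %rdx, one limb every 128 bytes (so each 128-byte row interleaves
 * one limb of 16 consecutive entries).
 */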
.globl rsaz_512_scatter4
.hidden rsaz_512_scatter4
.type rsaz_512_scatter4,@function
.align 16
rsaz_512_scatter4:
leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz .Loop_scatter
.byte 0xf3,0xc3
.size rsaz_512_scatter4,.-rsaz_512_scatter4

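/*
 * rsaz_512_gather4: constant-time counterpart of rsaz_512_scatter4.  Loads
 * entry %edx from the table at %rsi into %rdi by reading every candidate
 * limb in each 128-byte row and selecting with PAND/POR masks built from
 * .Linc, so the access pattern is independent of the index.
 */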
.globl rsaz_512_gather4
.hidden rsaz_512_gather4
.type rsaz_512_gather4,@function
.align 16
rsaz_512_gather4:
movd %edx,%xmm8
movdqa .Linc+16(%rip),%xmm1
movdqa .Linc(%rip),%xmm0

pshufd $0,%xmm8,%xmm8
movdqa %xmm1,%xmm7
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm8,%xmm0
movdqa %xmm7,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm8,%xmm1
movdqa %xmm7,%xmm4
paddd %xmm2,%xmm3
pcmpeqd %xmm8,%xmm2
movdqa %xmm7,%xmm5
paddd %xmm3,%xmm4
pcmpeqd %xmm8,%xmm3
movdqa %xmm7,%xmm6
paddd %xmm4,%xmm5
pcmpeqd %xmm8,%xmm4
paddd %xmm5,%xmm6
pcmpeqd %xmm8,%xmm5
paddd %xmm6,%xmm7
pcmpeqd %xmm8,%xmm6
pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp .Loop_gather
.align 16
.Loop_gather:
movdqa 0(%rsi),%xmm8
movdqa 16(%rsi),%xmm9
movdqa 32(%rsi),%xmm10
movdqa 48(%rsi),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rsi),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rsi),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rsi),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9

por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz .Loop_gather
.byte 0xf3,0xc3
.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4

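/* Increment vectors used to build the 16 per-index compare masks above. */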
.align 64
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
#endif