# NOTE(review): the two lines below were a diff-viewer export header ("OLD | NEW |")
# — an artifact of how this file was captured, not part of the assembly source.
1 #if defined(__x86_64__) | |
2 .text | |
3 | |
4 | |
5 | |
# Constant pool for the NIST P-256 (secp256r1) field arithmetic below.
# L$poly is the field prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1, stored as
# four little-endian 64-bit limbs (least significant first).
.p2align	6
L$poly:
.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

# Broadcast dword constants used by the SSE table-select routines.
L$One:
.long	1,1,1,1,1,1,1,1
L$Two:
.long	2,2,2,2,2,2,2,2
L$Three:
.long	3,3,3,3,3,3,3,3
# 1 in Montgomery form, i.e. 2^256 mod p (per the label; limbs little-endian).
L$ONE_mont:
.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
18 | |
19 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_mul_by_2(uint64_t res[4], const uint64_t a[4])
# File-local (no .globl). SysV AMD64: rdi = res, rsi = a.
# Computes 2*a with a conditional subtraction of p, branch-free (cmov).
# Clobbers: rax, rcx, rdx, rsi, flags; saves/restores r12, r13.
# NOTE(review): the select tests only the carry out of the doubling (r13);
# the borrow of the p-subtraction is discarded, so when 2a is in [p, 2^256)
# the result is only reduced mod 2^256, not fully mod p. This matches the
# pre-"stricter reduction" upstream code — confirm against current upstream.
#------------------------------------------------------------------------------
.p2align	6
ecp_nistz256_mul_by_2:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%r8		# load a, doubling interleaved with loads
	movq	8(%rsi),%r9
	addq	%r8,%r8
	movq	16(%rsi),%r10
	adcq	%r9,%r9
	movq	24(%rsi),%r11
	leaq	L$poly(%rip),%rsi	# rsi now points at p
	movq	%r8,%rax		# stash unreduced limbs in rax/rdx/rcx/r12
	adcq	%r10,%r10
	adcq	%r11,%r11
	movq	%r9,%rdx
	sbbq	%r13,%r13		# r13 = -(carry out of 2a)

	subq	0(%rsi),%r8		# tentatively subtract p
	movq	%r10,%rcx
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r11,%r12
	sbbq	24(%rsi),%r11
	testq	%r13,%r13		# ZF = (no carry out of doubling)

	cmovzq	%rax,%r8		# keep unreduced value unless doubling carried
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret (branch-predictor-friendly return)
58 | |
59 | |
60 | |
61 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_neg(uint64_t res[4], const uint64_t a[4])
# SysV AMD64: rdi = res, rsi = a. Computes (-a) mod p, branch-free:
# 0 - a, then add p back, selecting the pre-add value only when a == 0
# (no borrow). Clobbers: rax, rcx, rdx, rsi, flags; saves r12, r13.
#------------------------------------------------------------------------------
.globl	_ecp_nistz256_neg
.private_extern	_ecp_nistz256_neg

.p2align	5
_ecp_nistz256_neg:
	pushq	%r12
	pushq	%r13

	xorq	%r8,%r8			# acc = 0
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r13,%r13		# borrow accumulator

	subq	0(%rsi),%r8		# acc = 0 - a
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r8,%rax		# stash 0-a limbs
	sbbq	24(%rsi),%r11
	leaq	L$poly(%rip),%rsi
	movq	%r9,%rdx
	sbbq	$0,%r13			# r13 = -(borrow), i.e. -(a != 0)

	addq	0(%rsi),%r8		# acc += p
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	testq	%r13,%r13		# ZF = (a was zero)

	cmovzq	%rax,%r8		# a == 0: result stays 0 (don't emit p)
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
105 | |
106 | |
107 | |
108 | |
109 | |
110 | |
111 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_mul_mont(uint64_t res[4], const uint64_t a[4],
#                            const uint64_t b[4])
# SysV AMD64: rdi = res, rsi = a, rdx = b. Montgomery multiplication
# res = a*b*2^-256 mod p. Thin wrapper: loads b's pointer into rbx and a's
# limbs into r9..r12 per __ecp_nistz256_mul_montq's register contract.
#------------------------------------------------------------------------------
.globl	_ecp_nistz256_mul_mont
.private_extern	_ecp_nistz256_mul_mont

.p2align	5
_ecp_nistz256_mul_mont:
L$mul_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rdx,%rbx		# rbx = b
	movq	0(%rdx),%rax		# rax = b[0]
	movq	0(%rsi),%r9		# r9..r12 = a[0..3]
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12

	call	__ecp_nistz256_mul_montq
L$mul_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		# rep ret
140 | |
141 | |
142 | |
#------------------------------------------------------------------------------
# __ecp_nistz256_mul_montq — Montgomery multiplication core (mul/adc path).
# In:  rax = b[0], rbx -> b, rsi -> a, r9..r12 = a[0..3], rdi -> res.
# Out: reduced product stored at 0..24(%rdi).
# Interleaves each word-by-vector multiply with a reduction step that
# exploits p's special form (multiples of 2^32 handled by shl/shr,
# only L$poly+24 needs a real mulq). Fully reduces at the end (cmovc).
# Clobbers: rax, rbp, rbx, rcx, rdx, r8-r15, flags.
#------------------------------------------------------------------------------
.p2align	5
__ecp_nistz256_mul_montq:

	# acc = a * b[0]
	movq	%rax,%rbp
	mulq	%r9
	movq	L$poly+8(%rip),%r14	# r14 = p[1]
	movq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%r9

	mulq	%r10
	movq	L$poly+24(%rip),%r15	# r15 = p[3]
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%r11
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r12
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	xorq	%r13,%r13
	movq	%rdx,%r12

	# reduction step 0: fold r8 (times p) into the accumulator
	movq	%r8,%rbp
	shlq	$32,%r8
	mulq	%r15
	shrq	$32,%rbp
	addq	%r8,%r9
	adcq	%rbp,%r10
	adcq	%rax,%r11
	movq	8(%rbx),%rax		# next multiplier: b[1]
	adcq	%rdx,%r12
	adcq	$0,%r13
	xorq	%r8,%r8

	# acc += a * b[1]
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	adcq	$0,%r8

	# reduction step 1
	movq	%r9,%rbp
	shlq	$32,%r9
	mulq	%r15
	shrq	$32,%rbp
	addq	%r9,%r10
	adcq	%rbp,%r11
	adcq	%rax,%r12
	movq	16(%rbx),%rax		# next multiplier: b[2]
	adcq	%rdx,%r13
	adcq	$0,%r8
	xorq	%r9,%r9

	# acc += a * b[2]
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%r10,%rax
	adcq	%rdx,%r8
	adcq	$0,%r9

	# reduction step 2
	movq	%r10,%rbp
	shlq	$32,%r10
	mulq	%r15
	shrq	$32,%rbp
	addq	%r10,%r11
	adcq	%rbp,%r12
	adcq	%rax,%r13
	movq	24(%rbx),%rax		# next multiplier: b[3]
	adcq	%rdx,%r8
	adcq	$0,%r9
	xorq	%r10,%r10

	# acc += a * b[3]
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r8
	adcq	$0,%rdx
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	adcq	$0,%r10

	# reduction step 3
	movq	%r11,%rbp
	shlq	$32,%r11
	mulq	%r15
	shrq	$32,%rbp
	addq	%r11,%r12
	adcq	%rbp,%r13
	movq	%r12,%rcx		# stash unreduced result in rcx/rbp/rbx/rdx
	adcq	%rax,%r8
	adcq	%rdx,%r9
	movq	%r13,%rbp
	adcq	$0,%r10

	# final conditional subtraction of p (full reduction; borrow via r10)
	subq	$-1,%r12		# sub p[0] == 0xfff...fff <=> sub -1
	movq	%r8,%rbx
	sbbq	%r14,%r13
	sbbq	$0,%r8			# p[2] == 0
	movq	%r9,%rdx
	sbbq	%r15,%r9
	sbbq	$0,%r10			# CF = result was < p

	cmovcq	%rcx,%r12		# restore pre-subtraction value on borrow
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rbx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%rdx,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
356 | |
357 | |
358 | |
359 | |
360 | |
361 | |
362 | |
363 | |
364 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_sqr_mont(uint64_t res[4], const uint64_t a[4])
# SysV AMD64: rdi = res, rsi = a. Montgomery squaring res = a^2*2^-256 mod p.
# Thin wrapper loading a's limbs per __ecp_nistz256_sqr_montq's contract
# (rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3]).
#------------------------------------------------------------------------------
.globl	_ecp_nistz256_sqr_mont
.private_extern	_ecp_nistz256_sqr_mont

.p2align	5
_ecp_nistz256_sqr_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	0(%rsi),%rax
	movq	8(%rsi),%r14
	movq	16(%rsi),%r15
	movq	24(%rsi),%r8

	call	__ecp_nistz256_sqr_montq
L$sqr_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		# rep ret
390 | |
391 | |
392 | |
#------------------------------------------------------------------------------
# __ecp_nistz256_sqr_montq — Montgomery squaring core (mul/adc path).
# In:  rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3], rsi -> a, rdi -> res.
# Out: reduced square stored at 0..24(%rdi).
# Computes the off-diagonal products once, doubles them, adds the diagonal
# squares, then performs four reduction steps and a final full conditional
# subtraction of p. Clobbers: rax, rbp, rcx, rdx, rsi, r8-r15, flags.
#------------------------------------------------------------------------------
.p2align	5
__ecp_nistz256_sqr_montq:
	# off-diagonal products a[i]*a[j], i<j
	movq	%rax,%r13		# r13 = a[0]
	mulq	%r14			# a[0]*a[1]
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10

	mulq	%r13			# a[2]*a[0]
	addq	%rax,%r10
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r13			# a[3]*a[0]
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12

	mulq	%r14			# a[2]*a[1]
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	%r14			# a[3]*a[1]
	addq	%rax,%r12
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbp,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r15			# a[3]*a[2]
	xorq	%r15,%r15
	addq	%rax,%r13
	movq	0(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	# double the off-diagonal sum
	addq	%r9,%r9
	adcq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	$0,%r15

	# add the diagonal squares a[i]^2
	mulq	%rax
	movq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r9
	adcq	%rax,%r10
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r11
	adcq	%rax,%r12
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r13
	adcq	%rax,%r14
	movq	%r8,%rax
	adcq	%rdx,%r15

	movq	L$poly+8(%rip),%rsi	# rsi = p[1] (a-pointer no longer needed)
	movq	L$poly+24(%rip),%rbp	# rbp = p[3]

	# reduction step 0
	movq	%r8,%rcx
	shlq	$32,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx

	# reduction step 1
	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx

	# reduction step 2
	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx

	# reduction step 3
	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	adcq	%rax,%r10
	adcq	$0,%rdx
	xorq	%r11,%r11

	# add the reduced low half into the high half of the square
	addq	%r8,%r12
	adcq	%r9,%r13
	movq	%r12,%r8		# stash pre-subtraction limbs
	adcq	%r10,%r14
	adcq	%rdx,%r15
	movq	%r13,%r9
	adcq	$0,%r11

	# final full conditional subtraction of p
	subq	$-1,%r12
	movq	%r14,%r10
	sbbq	%rsi,%r13
	sbbq	$0,%r14
	movq	%r15,%rcx
	sbbq	%rbp,%r15
	sbbq	$0,%r11			# CF = result was < p

	cmovcq	%r8,%r12
	cmovcq	%r9,%r13
	movq	%r12,0(%rdi)
	cmovcq	%r10,%r14
	movq	%r13,8(%rdi)
	cmovcq	%rcx,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
552 | |
553 | |
554 | |
555 | |
556 | |
557 | |
558 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_from_mont(uint64_t res[4], const uint64_t in[4])
# SysV AMD64: rdi = res, rsi = in. Converts out of Montgomery form:
# res = in * 2^-256 mod p (i.e. Montgomery-multiplies by 1): four reduction
# steps, then one conditional subtraction of p.
# Clobbers: rax, rcx, rdx, rsi, flags; saves r12, r13.
#------------------------------------------------------------------------------
.globl	_ecp_nistz256_from_mont
.private_extern	_ecp_nistz256_from_mont

.p2align	5
_ecp_nistz256_from_mont:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%rax
	movq	L$poly+24(%rip),%r13	# r13 = p[3]
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	%rax,%r8
	movq	L$poly+8(%rip),%r12	# r12 = p[1]

	# reduction step 0
	movq	%rax,%rcx
	shlq	$32,%r8
	mulq	%r13
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx

	# reduction step 1
	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%r13
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx

	# reduction step 2
	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%r13
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx

	# reduction step 3
	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%r13
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	movq	%r8,%rcx		# stash pre-subtraction limbs 0,1,2
	adcq	%rax,%r10
	movq	%r9,%rsi
	adcq	$0,%rdx

	# conditional subtraction of p. Note the asymmetry below: r8/r9/r10 hold
	# the SUBTRACTED limbs (restored via cmovnz on borrow), while r11 holds
	# the PRE-subtraction top limb and receives the subtracted rdx via cmovz
	# when there was no borrow. The cmovnz/cmovz mix is therefore intentional.
	subq	$-1,%r8
	movq	%r10,%rax
	sbbq	%r12,%r9
	sbbq	$0,%r10
	movq	%rdx,%r11
	sbbq	%r13,%rdx
	sbbq	%r13,%r13		# r13 = borrow mask; ZF = no borrow

	cmovnzq	%rcx,%r8
	cmovnzq	%rsi,%r9
	movq	%r8,0(%rdi)
	cmovnzq	%rax,%r10
	movq	%r9,8(%rdi)
	cmovzq	%rdx,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
647 | |
648 | |
649 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT *in_t, int idx)
# SysV AMD64: rdi = val, rsi = table, edx = idx.
# Constant-time select of entry `idx` (1-based) from a 16-entry table of
# 96-byte points: scans ALL entries, masking each with pcmpeqd so memory
# access pattern is independent of idx. idx == 0 yields all-zero output.
# Clobbers: rax, rsi, xmm0-xmm15, flags.
#------------------------------------------------------------------------------
.globl	_ecp_nistz256_select_w5
.private_extern	_ecp_nistz256_select_w5

.p2align	5
_ecp_nistz256_select_w5:
	movdqa	L$One(%rip),%xmm0	# xmm0 = {1,1,1,1}
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		# accumulators for the 96-byte result
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7

	movdqa	%xmm0,%xmm8		# xmm8 = running entry counter, starts at 1
	pshufd	$0,%xmm1,%xmm1		# broadcast idx to all lanes

	movq	$16,%rax		# 16 table entries
L$select_loop_sse_w5:

	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	pcmpeqd	%xmm1,%xmm15		# xmm15 = all-ones iff counter == idx

	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	movdqa	64(%rsi),%xmm13
	movdqa	80(%rsi),%xmm14
	leaq	96(%rsi),%rsi

	pand	%xmm15,%xmm9		# mask entry, OR into accumulators
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	pand	%xmm15,%xmm13
	por	%xmm12,%xmm5
	pand	%xmm15,%xmm14
	por	%xmm13,%xmm6
	por	%xmm14,%xmm7

	decq	%rax
	jnz	L$select_loop_sse_w5

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm7,80(%rdi)
	.byte	0xf3,0xc3		# rep ret
706 | |
707 | |
708 | |
709 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_select_w7(P256_POINT_AFFINE *val,
#                             const P256_POINT_AFFINE *in_t, int idx)
# SysV AMD64: rdi = val, rsi = table, edx = idx.
# Constant-time select of entry `idx` (1-based) from a 64-entry table of
# 64-byte affine points; same masked-scan technique as select_w5.
# Clobbers: rax, rsi, xmm0-xmm5, xmm8-xmm12, xmm15, flags.
#------------------------------------------------------------------------------
.globl	_ecp_nistz256_select_w7
.private_extern	_ecp_nistz256_select_w7

.p2align	5
_ecp_nistz256_select_w7:
	movdqa	L$One(%rip),%xmm8	# counter, starts at 1
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		# result accumulators
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5

	movdqa	%xmm8,%xmm0		# xmm0 = increment {1,1,1,1}
	pshufd	$0,%xmm1,%xmm1		# broadcast idx
	movq	$64,%rax		# 64 table entries

L$select_loop_sse_w7:
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	pcmpeqd	%xmm1,%xmm15		# mask = (counter == idx)
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	leaq	64(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	prefetcht0	255(%rsi)	# prefetch upcoming table lines
	por	%xmm12,%xmm5

	decq	%rax
	jnz	L$select_loop_sse_w7

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	.byte	0xf3,0xc3		# rep ret
755 | |
# ecp_nistz256_avx2_select_w7 — AVX2 variant deliberately not provided in this
# build: traps with ud2 (0x0f,0x0b) if ever reached. Callers must not dispatch
# here (presumably gated by a capability check elsewhere — confirm in caller).
.globl	_ecp_nistz256_avx2_select_w7
.private_extern	_ecp_nistz256_avx2_select_w7

.p2align	5
_ecp_nistz256_avx2_select_w7:
	.byte	0x0f,0x0b		# ud2
	.byte	0xf3,0xc3		# rep ret (unreachable)
763 | |
764 | |
#------------------------------------------------------------------------------
# __ecp_nistz256_add_toq — r12:r13:r8:r9 += mem[rbx], conditionally - p.
# In:  r12,r13,r8,r9 = a; rbx -> b; r14 = p[1], r15 = p[3]; rdi -> res.
# Out: result in r12,r13,r8,r9 and stored at 0..24(%rdi).
# Clobbers: rax, rbp, rcx, r10, r11, flags.
# NOTE(review): like mul_by_2 above, the select tests only the addition
# carry (r11) and discards the p-subtraction borrow — pre-"stricter
# reduction" upstream behavior; confirm against current upstream.
#------------------------------------------------------------------------------
.p2align	5
__ecp_nistz256_add_toq:
	addq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	movq	%r12,%rax		# stash unreduced sum
	adcq	16(%rbx),%r8
	adcq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11		# r11 = -(carry out of the add)

	subq	$-1,%r12		# tentatively subtract p
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	testq	%r11,%r11		# ZF = no carry -> keep unreduced sum

	cmovzq	%rax,%r12
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
793 | |
794 | |
795 | |
#------------------------------------------------------------------------------
# __ecp_nistz256_sub_fromq — r12:r13:r8:r9 -= mem[rbx], conditionally + p.
# In:  r12,r13,r8,r9 = a; rbx -> b; r14 = p[1], r15 = p[3]; rdi -> res.
# Out: (a - b) mod p in r12,r13,r8,r9 and stored at 0..24(%rdi).
# Adds p back and keeps the pre-add value only when no borrow occurred.
# Clobbers: rax, rbp, rcx, r10, r11, flags.
#------------------------------------------------------------------------------
.p2align	5
__ecp_nistz256_sub_fromq:
	subq	0(%rbx),%r12
	sbbq	8(%rbx),%r13
	movq	%r12,%rax		# stash raw difference
	sbbq	16(%rbx),%r8
	sbbq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11		# r11 = -(borrow)

	addq	$-1,%r12		# tentatively add p back
	movq	%r8,%rcx
	adcq	%r14,%r13
	adcq	$0,%r8
	movq	%r9,%r10
	adcq	%r15,%r9
	testq	%r11,%r11		# ZF = no borrow -> raw difference is correct

	cmovzq	%rax,%r12
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
824 | |
825 | |
826 | |
#------------------------------------------------------------------------------
# __ecp_nistz256_subq — register-only modular subtraction, no store.
# In:  rax,rbp,rcx,r10 = a; r12,r13,r8,r9 = b; r14 = p[1], r15 = p[3].
# Out: (a - b) mod p left in r12,r13,r8,r9 (note: result replaces b's regs).
# Clobbers: rax, rbp, rcx, r10, r11, flags.
#------------------------------------------------------------------------------
.p2align	5
__ecp_nistz256_subq:
	subq	%r12,%rax		# a -= b
	sbbq	%r13,%rbp
	movq	%rax,%r12		# stash raw difference in result regs
	sbbq	%r8,%rcx
	sbbq	%r9,%r10
	movq	%rbp,%r13
	sbbq	%r11,%r11		# r11 = -(borrow)

	addq	$-1,%rax		# tentatively add p back
	movq	%rcx,%r8
	adcq	%r14,%rbp
	adcq	$0,%rcx
	movq	%r10,%r9
	adcq	%r15,%r10
	testq	%r11,%r11		# NZ = borrow -> corrected (a-b+p) is right

	cmovnzq	%rax,%r12
	cmovnzq	%rbp,%r13
	cmovnzq	%rcx,%r8
	cmovnzq	%r10,%r9

	.byte	0xf3,0xc3		# rep ret
851 | |
852 | |
853 | |
#------------------------------------------------------------------------------
# __ecp_nistz256_mul_by_2q — register-input doubling mod p with store.
# In:  r12,r13,r8,r9 = a; r14 = p[1], r15 = p[3]; rdi -> res.
# Out: 2a (conditionally - p) in r12,r13,r8,r9 and at 0..24(%rdi).
# Clobbers: rax, rbp, rcx, r10, r11, flags.
# NOTE(review): same partial-reduction caveat as ecp_nistz256_mul_by_2 /
# __ecp_nistz256_add_toq — only the doubling carry is tested.
#------------------------------------------------------------------------------
.p2align	5
__ecp_nistz256_mul_by_2q:
	addq	%r12,%r12		# 2a
	adcq	%r13,%r13
	movq	%r12,%rax		# stash unreduced value
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11		# r11 = -(carry)

	subq	$-1,%r12		# tentatively subtract p
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	testq	%r11,%r11		# ZF = no carry -> keep unreduced

	cmovzq	%rax,%r12
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
882 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a)
# SysV AMD64: rdi = r, rsi = a (Jacobian X|Y|Z, 32 bytes each, Montgomery).
# Uses 160+8 bytes of stack scratch; field ops are delegated to the
# __ecp_nistz256_* helpers (which take p[1]/p[3] in r14/r15). GPR<->XMM
# moves are emitted as raw byte sequences (102,72/73,15,110/126,NN =
# movq r64<->xmm) — presumably for old-assembler compatibility.
# L$point_double_shortcutq is also entered by point_add's doubling path.
#------------------------------------------------------------------------------
.globl	_ecp_nistz256_point_double
.private_extern	_ecp_nistz256_point_double

.p2align	5
_ecp_nistz256_point_double:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$160+8,%rsp

L$point_double_shortcutq:
	movdqu	0(%rsi),%xmm0		# copy a->X, load a->Y into GPRs
	movq	%rsi,%rbx
	movdqu	16(%rsi),%xmm1
	movq	32+0(%rsi),%r12
	movq	32+8(%rsi),%r13
	movq	32+16(%rsi),%r8
	movq	32+24(%rsi),%r9
	movq	L$poly+8(%rip),%r14	# helper contract: r14/r15 = p[1]/p[3]
	movq	L$poly+24(%rip),%r15
	movdqa	%xmm0,96(%rsp)		# in_x saved at 96(%rsp)
	movdqa	%xmm1,96+16(%rsp)
	leaq	32(%rdi),%r10
	leaq	64(%rdi),%r11
	.byte	102,72,15,110,199	# movq %rax,%xmm0  (res_x ptr)
	.byte	102,73,15,110,202	# movq %r10,%xmm1  (res_y ptr)
	.byte	102,73,15,110,211	# movq %r11,%xmm2  (res_z ptr)

	leaq	0(%rsp),%rdi		# S = 2*Y
	call	__ecp_nistz256_mul_by_2q

	movq	64+0(%rsi),%rax		# Zsqr = Z^2
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	leaq	64-0(%rsi),%rsi
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	0+0(%rsp),%rax		# S = S^2
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	32(%rbx),%rax		# Z' = 2*Y*Z
	movq	64+0(%rbx),%r9
	movq	64+8(%rbx),%r10
	movq	64+16(%rbx),%r11
	movq	64+24(%rbx),%r12
	leaq	64-0(%rbx),%rsi
	leaq	32(%rbx),%rbx
	.byte	102,72,15,126,215	# movq %xmm2,%rdi  (res_z)
	call	__ecp_nistz256_mul_montq
	call	__ecp_nistz256_mul_by_2q

	movq	96+0(%rsp),%r12		# M = X + Zsqr
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96+0(%rsp),%r12		# Zsqr = X - Zsqr
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax		# S = S^2 (= 4*Y^4 * ...)
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	.byte	102,72,15,126,207	# movq %xmm1,%rdi  (res_y)
	call	__ecp_nistz256_sqr_montq
	# halve S in place: add p if odd, then shift right by one
	xorq	%r9,%r9
	movq	%r12,%rax
	addq	$-1,%r12		# tentatively add p (rsi/rbp = p[1]/p[3]
	movq	%r13,%r10		# after sqr_montq)
	adcq	%rsi,%r13
	movq	%r14,%rcx
	adcq	$0,%r14
	movq	%r15,%r8
	adcq	%rbp,%r15
	adcq	$0,%r9
	xorq	%rsi,%rsi
	testq	$1,%rax			# ZF = value was even

	cmovzq	%rax,%r12		# even: keep original, carry limb = 0
	cmovzq	%r10,%r13
	cmovzq	%rcx,%r14
	cmovzq	%r8,%r15
	cmovzq	%rsi,%r9

	movq	%r13,%rax		# 257-bit shift right by 1
	shrq	$1,%r12
	shlq	$63,%rax
	movq	%r14,%r10
	shrq	$1,%r13
	orq	%rax,%r12
	shlq	$63,%r10
	movq	%r15,%rcx
	shrq	$1,%r14
	orq	%r10,%r13
	shlq	$63,%rcx
	movq	%r12,0(%rdi)
	shrq	$1,%r15
	movq	%r13,8(%rdi)
	shlq	$63,%r9
	orq	%rcx,%r14
	orq	%r9,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	movq	64(%rsp),%rax		# M = M * (X - Zsqr)
	leaq	64(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi		# tmp = 2*M
	call	__ecp_nistz256_mul_by_2q

	leaq	32(%rsp),%rbx		# M = 3*M
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96(%rsp),%rax		# S = S * X
	leaq	96(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi		# tmp = 2*S
	call	__ecp_nistz256_mul_by_2q

	movq	0+32(%rsp),%rax		# X' = M^2
	movq	8+32(%rsp),%r14
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r15
	movq	24+32(%rsp),%r8
	.byte	102,72,15,126,199	# movq %xmm0,%rdi  (res_x)
	call	__ecp_nistz256_sqr_montq

	leaq	128(%rsp),%rbx		# X' -= 2*S
	movq	%r14,%r8		# repack sqr_montq outputs for sub_fromq
	movq	%r15,%r9
	movq	%rsi,%r14		# rsi/rbp still hold p[1]/p[3]
	movq	%rbp,%r15
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax		# S - X' (register form)
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_subq

	movq	32(%rsp),%rax		# Y' = M*(S - X') - ...
	leaq	32(%rsp),%rbx
	movq	%r12,%r14
	xorl	%ecx,%ecx		# sets ZF; the cmovz below always move
	movq	%r12,0+0(%rsp)
	movq	%r13,%r10
	movq	%r13,0+8(%rsp)
	cmovzq	%r8,%r11
	movq	%r8,0+16(%rsp)
	leaq	0-0(%rsp),%rsi
	cmovzq	%r9,%r12
	movq	%r9,0+24(%rsp)
	movq	%r14,%r9
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	.byte	102,72,15,126,203	# movq %xmm1,%rbx  (res_y)
	.byte	102,72,15,126,207	# movq %xmm1,%rdi — wait: 207 -> %rdi
	call	__ecp_nistz256_sub_fromq

	addq	$160+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		# rep ret
1085 | |
#------------------------------------------------------------------------------
# void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
#                             const P256_POINT *b)
# SysV AMD64: rdi = r, rsi = a, rdx = b. Jacobian point addition.
# Uses 576+8 bytes of stack scratch. xmm4/xmm5 are built as all-ones masks
# for "b is infinity" / "a is infinity" (Z == 0 detected via por/pcmpeqd)
# and drive the branchless final selection. If the affine x and y
# coordinates both match (U1==U2, S1==S2) the routine tail-calls into
# L$point_double_shortcutq; if only x matches it returns the
# point-at-infinity (all zeros). `.byte 0x3e` before jnz is a DS-prefix
# branch hint; 102,7x,15,110/126 byte sequences are movq GPR<->XMM.
#------------------------------------------------------------------------------
.globl	_ecp_nistz256_point_add
.private_extern	_ecp_nistz256_point_add

.p2align	5
_ecp_nistz256_point_add:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$576+8,%rsp

	movdqu	0(%rsi),%xmm0		# copy a to 384..464(%rsp)
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	%rsi,%rbx		# rbx = a, rsi = b from here on
	movq	%rdx,%rsi
	movdqa	%xmm0,384(%rsp)
	movdqa	%xmm1,384+16(%rsp)
	por	%xmm0,%xmm1
	movdqa	%xmm2,416(%rsp)
	movdqa	%xmm3,416+16(%rsp)
	por	%xmm2,%xmm3
	movdqa	%xmm4,448(%rsp)
	movdqa	%xmm5,448+16(%rsp)
	por	%xmm1,%xmm3		# xmm3 = OR of all a limbs (infinity test)

	movdqu	0(%rsi),%xmm0		# copy b to 480..560(%rsp)
	pshufd	$0xb1,%xmm3,%xmm5
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rsi),%xmm3
	movq	64+0(%rsi),%rax		# b->Z for Z2^2
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,480(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,480+16(%rsp)
	por	%xmm0,%xmm1
	.byte	102,72,15,110,199	# movq %rax,%xmm0
	movdqa	%xmm2,512(%rsp)
	movdqa	%xmm3,512+16(%rsp)
	por	%xmm2,%xmm3
	por	%xmm4,%xmm5		# xmm5 folding toward "a == inf" mask
	pxor	%xmm4,%xmm4
	por	%xmm1,%xmm3		# xmm3 = OR of b's x|y limbs

	leaq	64-0(%rsi),%rsi		# Z2sqr = (b->Z)^2 -> 96(%rsp)
	movq	%rax,544+0(%rsp)
	movq	%r14,544+8(%rsp)
	movq	%r15,544+16(%rsp)
	movq	%r8,544+24(%rsp)
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	pcmpeqd	%xmm4,%xmm5		# xmm5 = broadcast(a == infinity)
	pshufd	$0xb1,%xmm3,%xmm4
	por	%xmm3,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	pcmpeqd	%xmm3,%xmm4		# xmm4 = broadcast(b == infinity)
	pshufd	$0,%xmm4,%xmm4
	movq	64+0(%rbx),%rax		# Z1sqr = (a->Z)^2 -> 32(%rsp)
	movq	64+8(%rbx),%r14
	movq	64+16(%rbx),%r15
	movq	64+24(%rbx),%r8
	.byte	102,72,15,110,203	# movq %rbx,%xmm1

	leaq	64-0(%rbx),%rsi
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax		# S1 = b->Z * Z2sqr -> 224(%rsp)
	leaq	544(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	448(%rsp),%rax		# S2 = a->Z * Z1sqr -> 256(%rsp)
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	416(%rsp),%rax		# S1 *= a->Y
	leaq	416(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	512(%rsp),%rax		# S2 *= b->Y
	leaq	512(%rsp),%rbx
	movq	0+256(%rsp),%r9
	movq	8+256(%rsp),%r10
	leaq	0+256(%rsp),%rsi
	movq	16+256(%rsp),%r11
	movq	24+256(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	224(%rsp),%rbx		# R = S2 - S1 -> 64(%rsp)
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	orq	%r13,%r12		# r12 = OR of R limbs (is R zero?)
	movdqa	%xmm4,%xmm2
	orq	%r8,%r12
	orq	%r9,%r12
	por	%xmm5,%xmm2		# either input at infinity?
	.byte	102,73,15,110,220	# movq %r12,%xmm3

	movq	384(%rsp),%rax		# U1 = a->X * Z2sqr -> 160(%rsp)
	leaq	384(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	480(%rsp),%rax		# U2 = b->X * Z1sqr -> 192(%rsp)
	leaq	480(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	160(%rsp),%rbx		# H = U2 - U1 -> 0(%rsp)
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	orq	%r13,%r12		# is H zero (same x-coordinate)?
	orq	%r8,%r12
	orq	%r9,%r12

	.byte	0x3e			# DS prefix: branch hint
	jnz	L$add_proceedq		# H != 0: generic addition
	.byte	102,73,15,126,208	# movq %xmm2,%r8  (infinity flags)
	.byte	102,73,15,126,217	# movq %xmm3,%r9  (R == 0 flag)
	testq	%r8,%r8
	jnz	L$add_proceedq		# an input was infinity: masks handle it
	testq	%r9,%r9
	jz	L$add_doubleq		# H == 0 and R == 0: a == b, double

	# H == 0 but R != 0: a == -b, result is the point at infinity
	.byte	102,72,15,126,199	# movq %xmm0,%rdi — result pointer
	pxor	%xmm0,%xmm0
	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm0,16(%rdi)
	movdqu	%xmm0,32(%rdi)
	movdqu	%xmm0,48(%rdi)
	movdqu	%xmm0,64(%rdi)
	movdqu	%xmm0,80(%rdi)
	jmp	L$add_doneq

.p2align	5
L$add_doubleq:
	.byte	102,72,15,126,206	# movq %xmm1,%rsi  (a)
	.byte	102,72,15,126,199	# movq %xmm0,%rdi  (result)
	addq	$416,%rsp		# shrink frame to point_double's 160+8
	jmp	L$point_double_shortcutq

.p2align	5
L$add_proceedq:
	movq	0+64(%rsp),%rax		# Rsqr = R^2 -> 96(%rsp)
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	448(%rsp),%rax		# res_z = H * a->Z -> 352(%rsp)
	leaq	448(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0+0(%rsp),%rax		# Hsqr = H^2 -> 32(%rsp)
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax		# res_z *= b->Z
	leaq	544(%rsp),%rbx
	movq	0+352(%rsp),%r9
	movq	8+352(%rsp),%r10
	leaq	0+352(%rsp),%rsi
	movq	16+352(%rsp),%r11
	movq	24+352(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0(%rsp),%rax		# Hcub = H * Hsqr -> 128(%rsp)
	leaq	0(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	160(%rsp),%rax		# U2 = U1 * Hsqr -> 192(%rsp)
	leaq	160(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	# Hsqr = 2*U2 (mod p), inline doubling with conditional subtraction
	addq	%r12,%r12
	leaq	96(%rsp),%rsi		# rsi -> Rsqr for the following subq
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	testq	%r11,%r11

	cmovzq	%rax,%r12
	movq	0(%rsi),%rax		# load Rsqr for res_x = Rsqr - 2*U2
	cmovzq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovzq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovzq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subq

	leaq	128(%rsp),%rbx		# res_x -= Hcub -> 288(%rsp)
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	192+0(%rsp),%rax	# res_y = U2 - res_x -> 320(%rsp)
	movq	192+8(%rsp),%rbp
	movq	192+16(%rsp),%rcx
	movq	192+24(%rsp),%r10
	leaq	320(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	128(%rsp),%rax		# S2 = S1 * Hcub -> 256(%rsp)
	leaq	128(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	320(%rsp),%rax		# res_y *= R
	leaq	320(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	256(%rsp),%rbx		# res_y -= S2
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	.byte	102,72,15,126,199	# movq %xmm0,%rdi — result pointer

	# Branchless output select per coordinate: computed value, or the
	# other operand's coordinate when one input was the point at infinity
	# (xmm5 = a==inf mask, xmm4 = b==inf mask).
	movdqa	%xmm5,%xmm0		# Z coordinate
	movdqa	%xmm5,%xmm1
	pandn	352(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	352+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	544(%rsp),%xmm2
	pand	544+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0		# X coordinate
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	480(%rsp),%xmm2
	pand	480+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0		# Y coordinate
	movdqa	%xmm5,%xmm1
	pandn	320(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	320+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	512(%rsp),%xmm2
	pand	512+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

L$add_doneq:
	addq	$576+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		# rep ret
1483 | |
/*
 * void ecp_nistz256_point_add_affine(P256_POINT *r,
 *                                    const P256_POINT *a,
 *                                    const P256_POINT_AFFINE *b);
 *
 * Mixed point addition on the NIST P-256 curve: r = a + b, where a is
 * a Jacobian point (x1, y1, z1; 32 bytes per coordinate) and b is an
 * affine point (x2, y2).  Field elements are in Montgomery form.
 * ABI: System V AMD64, Mach-O (rdi = r, rsi = a, rdx = b).
 * Constant-time: the infinity cases are resolved by masked selects,
 * never by branches.
 *
 * Helper conventions, as used below (defined elsewhere in this file;
 * NOTE(review): inferred from call-site usage -- confirm against the
 * helper bodies):
 *   __ecp_nistz256_mul_montq : (%rdi) = (%rsi) * (%rbx); caller
 *       preloads rax = (%rbx)[0] and r9..r12 = limbs of (%rsi).
 *   __ecp_nistz256_sqr_montq : (%rdi) = (%rsi)^2; caller preloads
 *       rax, r14, r15, r8 = limbs of (%rsi).
 *   __ecp_nistz256_sub_fromq : (%rdi) = (%rbx) - previous result.
 *   __ecp_nistz256_subq      : subtract r12:r13:r8:r9 from the value
 *       preloaded in rax:rbp:rcx:r10 (mod p).
 *
 * Stack frame (480 bytes of scratch, one 32-byte slot per element):
 *     0  U2 = x2*Z1^2      224  res_x          320  in1_x
 *    32  S2 = y2*Z1^3      256  res_y          352  in1_y
 *    64  H  = U2 - x1      288  res_z          384  in1_z
 *    96  R  = S2 - y1      416  in2_x          448  in2_y
 *   128  H^2    160  H^3    192  R^2
 */
.globl	_ecp_nistz256_point_add_affine
.private_extern	_ecp_nistz256_point_add_affine

.p2align	5
_ecp_nistz256_point_add_affine:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$480+8,%rsp		/* +8 keeps rsp 16-byte aligned at calls */

	/* Copy a = (x1, y1, z1) into the frame; OR x1|y1 together so the
	   all-zero (point-at-infinity) encoding can be detected without
	   branching. */
	movdqu	0(%rsi),%xmm0
	movq	%rdx,%rbx
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	64+0(%rsi),%rax		/* z1 limbs for __ecp_nistz256_sqr_montq */
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,320(%rsp)
	movdqa	%xmm1,320+16(%rsp)
	por	%xmm0,%xmm1
	movdqa	%xmm2,352(%rsp)
	movdqa	%xmm3,352+16(%rsp)
	por	%xmm2,%xmm3
	movdqa	%xmm4,384(%rsp)
	movdqa	%xmm5,384+16(%rsp)
	por	%xmm1,%xmm3		/* xmm3 = x1 | y1 */

	/* Copy b = (x2, y2) into the frame; fold xmm3 toward a's
	   is-infinity mask (xmm5), start the same OR-reduction for b,
	   and stash the output pointer in xmm0 across the helper calls. */
	movdqu	0(%rbx),%xmm0
	pshufd	$0xb1,%xmm3,%xmm5
	movdqu	16(%rbx),%xmm1
	movdqu	32(%rbx),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rbx),%xmm3
	movdqa	%xmm0,416(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,416+16(%rsp)
	por	%xmm0,%xmm1
.byte	102,72,15,110,199	/* movq %rdi,%xmm0: save output pointer */
	movdqa	%xmm2,448(%rsp)
	movdqa	%xmm3,448+16(%rsp)
	por	%xmm2,%xmm3
	por	%xmm4,%xmm5
	pxor	%xmm4,%xmm4
	por	%xmm1,%xmm3		/* xmm3 = x2 | y2 */

	/* Z1^2 -> 32(%rsp). */
	leaq	64-0(%rsi),%rsi
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* xmm5 = all-ones iff a is at infinity; build the matching mask
	   for b in xmm4, interleaved with operand setup for U2. */
	pcmpeqd	%xmm4,%xmm5
	pshufd	$0xb1,%xmm3,%xmm4
	movq	0(%rbx),%rax

	movq	%r12,%r9
	por	%xmm3,%xmm4
	pshufd	$0,%xmm5,%xmm5		/* broadcast in1infty mask */
	pshufd	$0x1e,%xmm4,%xmm3
	movq	%r13,%r10
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	movq	%r14,%r11
	pcmpeqd	%xmm3,%xmm4
	pshufd	$0,%xmm4,%xmm4		/* broadcast in2infty mask */

	/* U2 = x2 * Z1^2 -> 0(%rsp). */
	leaq	32-0(%rsp),%rsi
	movq	%r15,%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* H = U2 - x1 -> 64(%rsp). */
	leaq	320(%rsp),%rbx
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* Z1^3 = z1 * Z1^2 -> 32(%rsp). */
	movq	384(%rsp),%rax
	leaq	384(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* z3 = z1 * H -> 288(%rsp). */
	movq	384(%rsp),%rax
	leaq	384(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* S2 = y2 * Z1^3 -> 32(%rsp). */
	movq	448(%rsp),%rax
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* R = S2 - y1 -> 96(%rsp). */
	leaq	352(%rsp),%rbx
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* H^2 -> 128(%rsp). */
	movq	0+64(%rsp),%rax
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* R^2 -> 192(%rsp). */
	movq	0+96(%rsp),%rax
	movq	8+96(%rsp),%r14
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r15
	movq	24+96(%rsp),%r8
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* H^3 = H^2 * H -> 160(%rsp). */
	movq	128(%rsp),%rax
	leaq	128(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* U1*H^2 = x1 * H^2 -> 0(%rsp), also left in r12,r13,r8,r9. */
	movq	320(%rsp),%rax
	leaq	320(%rsp),%rbx
	movq	0+128(%rsp),%r9
	movq	8+128(%rsp),%r10
	leaq	0+128(%rsp),%rsi
	movq	16+128(%rsp),%r11
	movq	24+128(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* 2*U1*H^2 mod p, in registers: double, then conditionally
	   subtract p.  p[0] = 2^64-1 is the $-1 immediate, p[2] = 0 is
	   the literal $0; r14/r15 hold p[1]/p[3] here (preserved across
	   the Montgomery helpers). */
	addq	%r12,%r12
	leaq	192(%rsp),%rsi
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11		/* all-ones/zero carry mask */

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	testq	%r11,%r11

	cmovzq	%rax,%r12		/* no carry out: keep unreduced copy */
	movq	0(%rsi),%rax		/* preload R^2 as the minuend */
	cmovzq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovzq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovzq	%r10,%r9
	movq	24(%rsi),%r10

	/* R^2 - 2*U1*H^2 ... */
	call	__ecp_nistz256_subq

	/* ... then x3 = R^2 - 2*U1*H^2 - H^3 -> 224(%rsp). */
	leaq	160(%rsp),%rbx
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* U1*H^2 - x3 -> 64(%rsp). */
	movq	0+0(%rsp),%rax
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	64(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	/* y1*H^3 -> 32(%rsp). */
	movq	352(%rsp),%rax
	leaq	352(%rsp),%rbx
	movq	0+160(%rsp),%r9
	movq	8+160(%rsp),%r10
	leaq	0+160(%rsp),%rsi
	movq	16+160(%rsp),%r11
	movq	24+160(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* R * (U1*H^2 - x3) -> 64(%rsp). */
	movq	96(%rsp),%rax
	leaq	96(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* y3 = R*(U1*H^2 - x3) - y1*H^3 -> 256(%rsp). */
	leaq	32(%rsp),%rbx
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

.byte	102,72,15,126,199	/* movq %xmm0,%rdi: restore output pointer */

	/* Constant-time selection of each output coordinate: if a is at
	   infinity (xmm5 set) take b's coordinate, with z = 1 in
	   Montgomery form; else if b is at infinity (xmm4 set) take a's;
	   else take the computed value.  Z coordinate first: */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	L$ONE_mont(%rip),%xmm2
	pand	L$ONE_mont+16(%rip),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	/* X coordinate. */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	224(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	224+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	320(%rsp),%xmm2
	pand	320+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	/* Y coordinate. */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	256(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	256+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	352(%rsp),%xmm2
	pand	352+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

	/* Epilogue: release the frame and restore callee-saved regs in
	   reverse push order. */
	addq	$480+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
.byte	0xf3,0xc3	/* rep ret */
1787 | |
1788 #endif | |
OLD | NEW |