OLD | NEW |
| (Empty) |
1 #if defined(__x86_64__) | |
2 .text | |
3 | |
4 .extern OPENSSL_ia32cap_P | |
5 .hidden OPENSSL_ia32cap_P | |
6 | |
/*
 * bn_mul_mont -- entry point plus the generic word-by-word Montgomery
 * multiplication loop (the identification string at the end of the file
 * names the algorithm).
 *
 * Register use on entry, as read by the code below:
 *   rdi  destination vector rp[] (written by .Lsub / .Lcopy)
 *   rsi  first operand vector, read as a[j]
 *   rdx  second operand vector, read as b[i] (kept in r12)
 *   rcx  vector n[] that is multiplied by the per-row factor m and
 *        subtracted at the end; presumably the modulus -- confirm
 *        against the C prototype
 *   r8   pointer to one word (called n0 below) used to derive m
 *   r9d  number of 64-bit words (num) in each vector
 * Returns 1 in rax. rbx, rbp and r12-r15 are saved and restored.
 * "tp[]" below denotes the scratch vector carved out of the stack at (%rsp).
 */
7 .globl bn_mul_mont | |
8 .hidden bn_mul_mont | |
9 .type bn_mul_mont,@function | |
10 .align 16 | |
11 bn_mul_mont: | |
/* Dispatch on num (r9d) and on whether both operand pointers are equal:   */
/*   num % 4 != 0 or num < 8         -> generic loop (.Lmul_enter)         */
/*   rdx != rsi                      -> 4x unrolled path (.Lmul4x_enter)   */
/*   rdx == rsi and num % 8 == 0     -> squaring path (.Lsqr8x_enter)      */
/*   rdx == rsi and num % 8 != 0     -> 4x unrolled path                   */
12 testl $3,%r9d | |
13 jnz .Lmul_enter | |
14 cmpl $8,%r9d | |
15 jb .Lmul_enter | |
16 cmpq %rsi,%rdx | |
17 jne .Lmul4x_enter | |
18 testl $7,%r9d | |
19 jz .Lsqr8x_enter | |
20 jmp .Lmul4x_enter | |
21 | |
22 .align 16 | |
23 .Lmul_enter: | |
/* Prologue: save the six callee-saved registers used below. */
24 pushq %rbx | |
25 pushq %rbp | |
26 pushq %r12 | |
27 pushq %r13 | |
28 pushq %r14 | |
29 pushq %r15 | |
30 | |
/* Zero-extend num, reserve num+2 words below rsp, then round rsp down to */
/* a 1024-byte boundary. r11 keeps the pre-allocation rsp.                */
31 movl %r9d,%r9d | |
32 leaq 2(%r9),%r10 | |
33 movq %rsp,%r11 | |
34 negq %r10 | |
35 leaq (%rsp,%r10,8),%rsp | |
36 andq $-1024,%rsp | |
37 | |
/* tp[num+1] = rsp as it was after the pushes; the epilogue reloads it. */
38 movq %r11,8(%rsp,%r9,8) | |
39 .Lmul_body: | |
/* r12 = b pointer (rdx is consumed by mulq), r8 = n0 value,            */
/* rbx = b[0], rax = a[0]; r14 = outer index i, r15 = inner index j.    */
40 movq %rdx,%r12 | |
41 movq (%r8),%r8 | |
42 movq (%r12),%rbx | |
43 movq (%rsi),%rax | |
44 | |
45 xorq %r14,%r14 | |
46 xorq %r15,%r15 | |
47 | |
/* rdx:rax = a[0]*b[0]; r10 = low word, r11 = high word. */
48 movq %r8,%rbp | |
49 mulq %rbx | |
50 movq %rax,%r10 | |
51 movq (%rcx),%rax | |
52 | |
/* rbp = m = low64(n0 * low word); this multiplies every n[j] in the row. */
53 imulq %r10,%rbp | |
54 movq %rdx,%r11 | |
55 | |
/* Add low64(n[0]*m) to r10: the sum itself is not stored (r10 is         */
/* overwritten in .L1st_enter); only the carry flows on through rdx/r13.  */
56 mulq %rbp | |
57 addq %rax,%r10 | |
58 movq 8(%rsi),%rax | |
59 adcq $0,%rdx | |
60 movq %rdx,%r13 | |
61 | |
62 leaq 1(%r15),%r15 | |
63 jmp .L1st_enter | |
64 | |
65 .align 16 | |
/* First row (i = 0): for each j, combine a[j]*b[0] and n[j]*m and store  */
/* the result one slot down, at tp[j-1] (r15 is already j+1 at the store). */
66 .L1st: | |
67 addq %rax,%r13 | |
68 movq (%rsi,%r15,8),%rax | |
69 adcq $0,%rdx | |
70 addq %r11,%r13 | |
71 movq %r10,%r11 | |
72 adcq $0,%rdx | |
73 movq %r13,-16(%rsp,%r15,8) | |
74 movq %rdx,%r13 | |
75 | |
76 .L1st_enter: | |
/* rax holds a[j] here: multiply by b[0], then by m for n[j]. */
77 mulq %rbx | |
78 addq %rax,%r11 | |
79 movq (%rcx,%r15,8),%rax | |
80 adcq $0,%rdx | |
81 leaq 1(%r15),%r15 | |
82 movq %rdx,%r10 | |
83 | |
84 mulq %rbp | |
85 cmpq %r9,%r15 | |
86 jne .L1st | |
87 | |
/* Loop tail for the last word; rax is reloaded with a[0] for the next row. */
88 addq %rax,%r13 | |
89 movq (%rsi),%rax | |
90 adcq $0,%rdx | |
91 addq %r11,%r13 | |
92 adcq $0,%rdx | |
93 movq %r13,-16(%rsp,%r15,8) | |
94 movq %rdx,%r13 | |
95 movq %r10,%r11 | |
96 | |
/* tp[num-1] = r13 + r11, tp[num] = carry out of that addition. */
97 xorq %rdx,%rdx | |
98 addq %r11,%r13 | |
99 adcq $0,%rdx | |
100 movq %r13,-8(%rsp,%r9,8) | |
101 movq %rdx,(%rsp,%r9,8) | |
102 | |
103 leaq 1(%r14),%r14 | |
104 jmp .Louter | |
105 .align 16 | |
/* Rows i = 1 .. num-1: same as the first row, but the previous tp[] */
/* contents are added in as well.                                    */
106 .Louter: | |
107 movq (%r12,%r14,8),%rbx | |
108 xorq %r15,%r15 | |
109 movq %r8,%rbp | |
110 movq (%rsp),%r10 | |
/* r10 = tp[0] + low64(a[0]*b[i]); r11 = high word plus carry. */
111 mulq %rbx | |
112 addq %rax,%r10 | |
113 movq (%rcx),%rax | |
114 adcq $0,%rdx | |
115 | |
/* m = low64(n0 * r10) for this row. */
116 imulq %r10,%rbp | |
117 movq %rdx,%r11 | |
118 | |
/* As in the first row, only the carry of r10 + low64(n[0]*m) is kept; */
/* r10 is then reloaded with tp[1].                                     */
119 mulq %rbp | |
120 addq %rax,%r10 | |
121 movq 8(%rsi),%rax | |
122 adcq $0,%rdx | |
123 movq 8(%rsp),%r10 | |
124 movq %rdx,%r13 | |
125 | |
126 leaq 1(%r15),%r15 | |
127 jmp .Linner_enter | |
128 | |
129 .align 16 | |
/* Inner loop: r10 carries tp[j] + a[j]*b[i], r13 carries the n[j]*m sum; */
/* the combined word is written back to tp[j-1].                           */
130 .Linner: | |
131 addq %rax,%r13 | |
132 movq (%rsi,%r15,8),%rax | |
133 adcq $0,%rdx | |
134 addq %r10,%r13 | |
135 movq (%rsp,%r15,8),%r10 | |
136 adcq $0,%rdx | |
137 movq %r13,-16(%rsp,%r15,8) | |
138 movq %rdx,%r13 | |
139 | |
140 .Linner_enter: | |
141 mulq %rbx | |
142 addq %rax,%r11 | |
143 movq (%rcx,%r15,8),%rax | |
144 adcq $0,%rdx | |
145 addq %r11,%r10 | |
146 movq %rdx,%r11 | |
147 adcq $0,%r11 | |
148 leaq 1(%r15),%r15 | |
149 | |
150 mulq %rbp | |
151 cmpq %r9,%r15 | |
152 jne .Linner | |
153 | |
/* Row tail: last word, then fold the old top word tp[num] (now in r10) */
/* into the new tp[num-1] / tp[num].                                     */
154 addq %rax,%r13 | |
155 movq (%rsi),%rax | |
156 adcq $0,%rdx | |
157 addq %r10,%r13 | |
158 movq (%rsp,%r15,8),%r10 | |
159 adcq $0,%rdx | |
160 movq %r13,-16(%rsp,%r15,8) | |
161 movq %rdx,%r13 | |
162 | |
163 xorq %rdx,%rdx | |
164 addq %r11,%r13 | |
165 adcq $0,%rdx | |
166 addq %r10,%r13 | |
167 adcq $0,%rdx | |
168 movq %r13,-8(%rsp,%r9,8) | |
169 movq %rdx,(%rsp,%r9,8) | |
170 | |
171 leaq 1(%r14),%r14 | |
172 cmpq %r9,%r14 | |
173 jb .Louter | |
174 | |
/* Final subtraction: rp[] = tp[] - n[] with borrow. The xorq clears CF   */
/* before the first sbbq; the mov/lea/dec/jnz in the loop leave CF alone. */
175 xorq %r14,%r14 | |
176 movq (%rsp),%rax | |
177 leaq (%rsp),%rsi | |
178 movq %r9,%r15 | |
179 jmp .Lsub | |
180 .align 16 | |
181 .Lsub: sbbq (%rcx,%r14,8),%rax | |
182 movq %rax,(%rdi,%r14,8) | |
183 movq 8(%rsi,%r14,8),%rax | |
184 leaq 1(%r14),%r14 | |
185 decq %r15 | |
186 jnz .Lsub | |
187 | |
/* rax = tp[num] - borrow; used as the select mask in .Lcopy.              */
/* NOTE(review): presumably always 0 or all-ones at this point -- confirm. */
188 sbbq $0,%rax | |
189 xorq %r14,%r14 | |
190 movq %r9,%r15 | |
191 .align 16 | |
/* Branch-free select per word: rp[i] = ((tp[i] ^ rp[i]) & rax) ^ rp[i],  */
/* i.e. tp[i] where mask bits are set, otherwise the subtracted value.    */
/* tp[i] is overwritten with the loop index so the scratch data is gone.  */
192 .Lcopy: | |
193 movq (%rsp,%r14,8),%rsi | |
194 movq (%rdi,%r14,8),%rcx | |
195 xorq %rcx,%rsi | |
196 andq %rax,%rsi | |
197 xorq %rcx,%rsi | |
198 movq %r14,(%rsp,%r14,8) | |
199 movq %rsi,(%rdi,%r14,8) | |
200 leaq 1(%r14),%r14 | |
201 subq $1,%r15 | |
202 jnz .Lcopy | |
203 | |
/* Epilogue: rsi = saved rsp from tp[num+1]; return value 1; reload the */
/* pushed registers (r15 was pushed last, so it sits at the lowest slot) */
/* and release the frame.                                                */
204 movq 8(%rsp,%r9,8),%rsi | |
205 movq $1,%rax | |
206 movq (%rsi),%r15 | |
207 movq 8(%rsi),%r14 | |
208 movq 16(%rsi),%r13 | |
209 movq 24(%rsi),%r12 | |
210 movq 32(%rsi),%rbp | |
211 movq 40(%rsi),%rbx | |
212 leaq 48(%rsi),%rsp | |
213 .Lmul_epilogue: | |
/* 0xf3,0xc3 encodes "rep ret". */
214 .byte 0xf3,0xc3 | |
215 .size bn_mul_mont,.-bn_mul_mont | |
/*
 * bn_mul4x_mont -- Montgomery multiplication with the inner loops unrolled
 * four words per iteration. The symbol is not exported here; the visible
 * entry is the jump to .Lmul4x_enter from the bn_mul_mont dispatch, which
 * only comes here with num >= 8 and num a multiple of 4.
 *
 * Same incoming registers as bn_mul_mont: rdi = destination rp[],
 * rsi = a[], rdx = b[] (kept in r12), rcx = n[], r8 = pointer to n0,
 * r9d = num. Returns 1 in rax. rbx, rbp and r12-r15 are saved and restored.
 * "tp[]" below denotes the scratch vector carved out of the stack at (%rsp).
 */
216 .type bn_mul4x_mont,@function | |
217 .align 16 | |
218 bn_mul4x_mont: | |
219 .Lmul4x_enter: | |
/* Prologue: save the six callee-saved registers used below. */
220 pushq %rbx | |
221 pushq %rbp | |
222 pushq %r12 | |
223 pushq %r13 | |
224 pushq %r14 | |
225 pushq %r15 | |
226 | |
/* Zero-extend num, reserve num+4 words below rsp, then round rsp down to */
/* a 1024-byte boundary. r11 keeps the pre-allocation rsp.                */
227 movl %r9d,%r9d | |
228 leaq 4(%r9),%r10 | |
229 movq %rsp,%r11 | |
230 negq %r10 | |
231 leaq (%rsp,%r10,8),%rsp | |
232 andq $-1024,%rsp | |
233 | |
/* tp[num+1] = rsp as it was after the pushes; the epilogue reloads it. */
234 movq %r11,8(%rsp,%r9,8) | |
235 .Lmul4x_body: | |
/* tp[num+2] = rdi: the destination pointer is parked on the stack because */
/* rdi is reused as an accumulator until the final subtraction.            */
236 movq %rdi,16(%rsp,%r9,8) | |
/* r12 = b pointer, r8 = n0 value, rbx = b[0], rax = a[0];  */
/* r14 = outer index i, r15 = inner word index.             */
237 movq %rdx,%r12 | |
238 movq (%r8),%r8 | |
239 movq (%r12),%rbx | |
240 movq (%rsi),%rax | |
241 | |
242 xorq %r14,%r14 | |
243 xorq %r15,%r15 | |
244 | |
/* Word 0: rdx:rax = a[0]*b[0]; rbp = m = low64(n0 * low word). */
245 movq %r8,%rbp | |
246 mulq %rbx | |
247 movq %rax,%r10 | |
248 movq (%rcx),%rax | |
249 | |
250 imulq %r10,%rbp | |
251 movq %rdx,%r11 | |
252 | |
/* Only the carry of r10 + low64(n[0]*m) is kept (in rdi). */
253 mulq %rbp | |
254 addq %rax,%r10 | |
255 movq 8(%rsi),%rax | |
256 adcq $0,%rdx | |
257 movq %rdx,%rdi | |
258 | |
/* Word 1: a[1]*b[0] and n[1]*m; the result is stored as tp[0]. */
259 mulq %rbx | |
260 addq %rax,%r11 | |
261 movq 8(%rcx),%rax | |
262 adcq $0,%rdx | |
263 movq %rdx,%r10 | |
264 | |
265 mulq %rbp | |
266 addq %rax,%rdi | |
267 movq 16(%rsi),%rax | |
268 adcq $0,%rdx | |
269 addq %r11,%rdi | |
270 leaq 4(%r15),%r15 | |
271 adcq $0,%rdx | |
272 movq %rdi,(%rsp) | |
273 movq %rdx,%r13 | |
274 jmp .L1st4x | |
275 .align 16 | |
/* First row, four words per iteration. r10/r11 alternate as the running */
/* a[j]*b[0] term and r13/rdi alternate as the running n[j]*m term; each */
/* finished word is stored one slot below its index in tp[].             */
276 .L1st4x: | |
277 mulq %rbx | |
278 addq %rax,%r10 | |
279 movq -16(%rcx,%r15,8),%rax | |
280 adcq $0,%rdx | |
281 movq %rdx,%r11 | |
282 | |
283 mulq %rbp | |
284 addq %rax,%r13 | |
285 movq -8(%rsi,%r15,8),%rax | |
286 adcq $0,%rdx | |
287 addq %r10,%r13 | |
288 adcq $0,%rdx | |
289 movq %r13,-24(%rsp,%r15,8) | |
290 movq %rdx,%rdi | |
291 | |
292 mulq %rbx | |
293 addq %rax,%r11 | |
294 movq -8(%rcx,%r15,8),%rax | |
295 adcq $0,%rdx | |
296 movq %rdx,%r10 | |
297 | |
298 mulq %rbp | |
299 addq %rax,%rdi | |
300 movq (%rsi,%r15,8),%rax | |
301 adcq $0,%rdx | |
302 addq %r11,%rdi | |
303 adcq $0,%rdx | |
304 movq %rdi,-16(%rsp,%r15,8) | |
305 movq %rdx,%r13 | |
306 | |
307 mulq %rbx | |
308 addq %rax,%r10 | |
309 movq (%rcx,%r15,8),%rax | |
310 adcq $0,%rdx | |
311 movq %rdx,%r11 | |
312 | |
313 mulq %rbp | |
314 addq %rax,%r13 | |
315 movq 8(%rsi,%r15,8),%rax | |
316 adcq $0,%rdx | |
317 addq %r10,%r13 | |
318 adcq $0,%rdx | |
319 movq %r13,-8(%rsp,%r15,8) | |
320 movq %rdx,%rdi | |
321 | |
322 mulq %rbx | |
323 addq %rax,%r11 | |
324 movq 8(%rcx,%r15,8),%rax | |
325 adcq $0,%rdx | |
326 leaq 4(%r15),%r15 | |
327 movq %rdx,%r10 | |
328 | |
329 mulq %rbp | |
330 addq %rax,%rdi | |
331 movq -16(%rsi,%r15,8),%rax | |
332 adcq $0,%rdx | |
333 addq %r11,%rdi | |
334 adcq $0,%rdx | |
335 movq %rdi,-32(%rsp,%r15,8) | |
336 movq %rdx,%r13 | |
337 cmpq %r9,%r15 | |
338 jb .L1st4x | |
339 | |
/* First-row tail: the last two words, with rax reloaded to a[0] for the */
/* next row.                                                              */
340 mulq %rbx | |
341 addq %rax,%r10 | |
342 movq -16(%rcx,%r15,8),%rax | |
343 adcq $0,%rdx | |
344 movq %rdx,%r11 | |
345 | |
346 mulq %rbp | |
347 addq %rax,%r13 | |
348 movq -8(%rsi,%r15,8),%rax | |
349 adcq $0,%rdx | |
350 addq %r10,%r13 | |
351 adcq $0,%rdx | |
352 movq %r13,-24(%rsp,%r15,8) | |
353 movq %rdx,%rdi | |
354 | |
355 mulq %rbx | |
356 addq %rax,%r11 | |
357 movq -8(%rcx,%r15,8),%rax | |
358 adcq $0,%rdx | |
359 movq %rdx,%r10 | |
360 | |
361 mulq %rbp | |
362 addq %rax,%rdi | |
363 movq (%rsi),%rax | |
364 adcq $0,%rdx | |
365 addq %r11,%rdi | |
366 adcq $0,%rdx | |
367 movq %rdi,-16(%rsp,%r15,8) | |
368 movq %rdx,%r13 | |
369 | |
/* Top of the row: store r13 + r10 and, one slot above, the carry out. */
370 xorq %rdi,%rdi | |
371 addq %r10,%r13 | |
372 adcq $0,%rdi | |
373 movq %r13,-8(%rsp,%r15,8) | |
374 movq %rdi,(%rsp,%r15,8) | |
375 | |
376 leaq 1(%r14),%r14 | |
377 .align 4 | |
/* Rows i = 1 .. num-1: as the first row, with the previous tp[] added in. */
378 .Louter4x: | |
379 movq (%r12,%r14,8),%rbx | |
380 xorq %r15,%r15 | |
381 movq (%rsp),%r10 | |
382 movq %r8,%rbp | |
/* r10 = tp[0] + low64(a[0]*b[i]); m = low64(n0 * r10). */
383 mulq %rbx | |
384 addq %rax,%r10 | |
385 movq (%rcx),%rax | |
386 adcq $0,%rdx | |
387 | |
388 imulq %r10,%rbp | |
389 movq %rdx,%r11 | |
390 | |
391 mulq %rbp | |
392 addq %rax,%r10 | |
393 movq 8(%rsi),%rax | |
394 adcq $0,%rdx | |
395 movq %rdx,%rdi | |
396 | |
/* Word 1, including the old tp[1]; the result becomes the new tp[0]. */
397 mulq %rbx | |
398 addq %rax,%r11 | |
399 movq 8(%rcx),%rax | |
400 adcq $0,%rdx | |
401 addq 8(%rsp),%r11 | |
402 adcq $0,%rdx | |
403 movq %rdx,%r10 | |
404 | |
405 mulq %rbp | |
406 addq %rax,%rdi | |
407 movq 16(%rsi),%rax | |
408 adcq $0,%rdx | |
409 addq %r11,%rdi | |
410 leaq 4(%r15),%r15 | |
411 adcq $0,%rdx | |
412 movq %rdi,(%rsp) | |
413 movq %rdx,%r13 | |
414 jmp .Linner4x | |
415 .align 16 | |
/* Inner loop, four words per iteration: each step adds a[j]*b[i], the old */
/* tp[j] and n[j]*m, and writes the word back one slot lower in tp[].      */
416 .Linner4x: | |
417 mulq %rbx | |
418 addq %rax,%r10 | |
419 movq -16(%rcx,%r15,8),%rax | |
420 adcq $0,%rdx | |
421 addq -16(%rsp,%r15,8),%r10 | |
422 adcq $0,%rdx | |
423 movq %rdx,%r11 | |
424 | |
425 mulq %rbp | |
426 addq %rax,%r13 | |
427 movq -8(%rsi,%r15,8),%rax | |
428 adcq $0,%rdx | |
429 addq %r10,%r13 | |
430 adcq $0,%rdx | |
431 movq %r13,-24(%rsp,%r15,8) | |
432 movq %rdx,%rdi | |
433 | |
434 mulq %rbx | |
435 addq %rax,%r11 | |
436 movq -8(%rcx,%r15,8),%rax | |
437 adcq $0,%rdx | |
438 addq -8(%rsp,%r15,8),%r11 | |
439 adcq $0,%rdx | |
440 movq %rdx,%r10 | |
441 | |
442 mulq %rbp | |
443 addq %rax,%rdi | |
444 movq (%rsi,%r15,8),%rax | |
445 adcq $0,%rdx | |
446 addq %r11,%rdi | |
447 adcq $0,%rdx | |
448 movq %rdi,-16(%rsp,%r15,8) | |
449 movq %rdx,%r13 | |
450 | |
451 mulq %rbx | |
452 addq %rax,%r10 | |
453 movq (%rcx,%r15,8),%rax | |
454 adcq $0,%rdx | |
455 addq (%rsp,%r15,8),%r10 | |
456 adcq $0,%rdx | |
457 movq %rdx,%r11 | |
458 | |
459 mulq %rbp | |
460 addq %rax,%r13 | |
461 movq 8(%rsi,%r15,8),%rax | |
462 adcq $0,%rdx | |
463 addq %r10,%r13 | |
464 adcq $0,%rdx | |
465 movq %r13,-8(%rsp,%r15,8) | |
466 movq %rdx,%rdi | |
467 | |
468 mulq %rbx | |
469 addq %rax,%r11 | |
470 movq 8(%rcx,%r15,8),%rax | |
471 adcq $0,%rdx | |
472 addq 8(%rsp,%r15,8),%r11 | |
473 adcq $0,%rdx | |
474 leaq 4(%r15),%r15 | |
475 movq %rdx,%r10 | |
476 | |
477 mulq %rbp | |
478 addq %rax,%rdi | |
479 movq -16(%rsi,%r15,8),%rax | |
480 adcq $0,%rdx | |
481 addq %r11,%rdi | |
482 adcq $0,%rdx | |
483 movq %rdi,-32(%rsp,%r15,8) | |
484 movq %rdx,%r13 | |
485 cmpq %r9,%r15 | |
486 jb .Linner4x | |
487 | |
/* Row tail: the last two words. The outer index r14 is bumped in the */
/* middle of this sequence (leaq leaves the flags untouched).         */
488 mulq %rbx | |
489 addq %rax,%r10 | |
490 movq -16(%rcx,%r15,8),%rax | |
491 adcq $0,%rdx | |
492 addq -16(%rsp,%r15,8),%r10 | |
493 adcq $0,%rdx | |
494 movq %rdx,%r11 | |
495 | |
496 mulq %rbp | |
497 addq %rax,%r13 | |
498 movq -8(%rsi,%r15,8),%rax | |
499 adcq $0,%rdx | |
500 addq %r10,%r13 | |
501 adcq $0,%rdx | |
502 movq %r13,-24(%rsp,%r15,8) | |
503 movq %rdx,%rdi | |
504 | |
505 mulq %rbx | |
506 addq %rax,%r11 | |
507 movq -8(%rcx,%r15,8),%rax | |
508 adcq $0,%rdx | |
509 addq -8(%rsp,%r15,8),%r11 | |
510 adcq $0,%rdx | |
511 leaq 1(%r14),%r14 | |
512 movq %rdx,%r10 | |
513 | |
514 mulq %rbp | |
515 addq %rax,%rdi | |
516 movq (%rsi),%rax | |
517 adcq $0,%rdx | |
518 addq %r11,%rdi | |
519 adcq $0,%rdx | |
520 movq %rdi,-16(%rsp,%r15,8) | |
521 movq %rdx,%r13 | |
522 | |
/* Top of the row: r13 + r10 + old tp[num]; the carries collect in rdi and */
/* are stored one slot above.                                               */
523 xorq %rdi,%rdi | |
524 addq %r10,%r13 | |
525 adcq $0,%rdi | |
526 addq (%rsp,%r9,8),%r13 | |
527 adcq $0,%rdi | |
528 movq %r13,-8(%rsp,%r15,8) | |
529 movq %rdi,(%rsp,%r15,8) | |
530 | |
531 cmpq %r9,%r14 | |
532 jb .Louter4x | |
/* Final subtraction rp[] = tp[] - n[], four words per iteration.          */
/* rdi = destination pointer reloaded from tp[num+2]; r9 becomes num/4.    */
/* The subq at word 0 starts the borrow chain; the movq/leaq/decq/jnz that */
/* follow do not modify CF, so the sbbq chain runs across iterations.      */
533 movq 16(%rsp,%r9,8),%rdi | |
534 movq 0(%rsp),%rax | |
535 movq 8(%rsp),%rdx | |
536 shrq $2,%r9 | |
537 leaq (%rsp),%rsi | |
538 xorq %r14,%r14 | |
539 | |
540 subq 0(%rcx),%rax | |
541 movq 16(%rsi),%rbx | |
542 movq 24(%rsi),%rbp | |
543 sbbq 8(%rcx),%rdx | |
544 leaq -1(%r9),%r15 | |
545 jmp .Lsub4x | |
546 .align 16 | |
547 .Lsub4x: | |
548 movq %rax,0(%rdi,%r14,8) | |
549 movq %rdx,8(%rdi,%r14,8) | |
550 sbbq 16(%rcx,%r14,8),%rbx | |
551 movq 32(%rsi,%r14,8),%rax | |
552 movq 40(%rsi,%r14,8),%rdx | |
553 sbbq 24(%rcx,%r14,8),%rbp | |
554 movq %rbx,16(%rdi,%r14,8) | |
555 movq %rbp,24(%rdi,%r14,8) | |
556 sbbq 32(%rcx,%r14,8),%rax | |
557 movq 48(%rsi,%r14,8),%rbx | |
558 movq 56(%rsi,%r14,8),%rbp | |
559 sbbq 40(%rcx,%r14,8),%rdx | |
560 leaq 4(%r14),%r14 | |
561 decq %r15 | |
562 jnz .Lsub4x | |
563 | |
/* Last group: finish the four stores; rax is loaded with the top word */
/* (tp[num], since r14 == num-4 here).                                  */
564 movq %rax,0(%rdi,%r14,8) | |
565 movq 32(%rsi,%r14,8),%rax | |
566 sbbq 16(%rcx,%r14,8),%rbx | |
567 movq %rdx,8(%rdi,%r14,8) | |
568 sbbq 24(%rcx,%r14,8),%rbp | |
569 movq %rbx,16(%rdi,%r14,8) | |
570 | |
/* rax = top word - borrow, copied into both halves of xmm0 as the select */
/* mask. NOTE(review): presumably always 0 or all-ones -- confirm.        */
571 sbbq $0,%rax | |
572 movq %rax,%xmm0 | |
573 punpcklqdq %xmm0,%xmm0 | |
574 movq %rbp,24(%rdi,%r14,8) | |
575 xorq %r14,%r14 | |
576 | |
/* r14 is now a byte offset, r15 = num/4 iterations, xmm5 = 0. */
577 movq %r9,%r15 | |
578 pxor %xmm5,%xmm5 | |
579 jmp .Lcopy4x | |
580 .align 16 | |
/* Branch-free select, 32 bytes per iteration:                         */
/*   rp = ((tp ^ rp) & mask) ^ rp                                      */
/* and the tp[] words just read are overwritten with zeros. The movdqa */
/* stores rely on rsp having been aligned in the prologue.             */
581 .Lcopy4x: | |
582 movdqu (%rsp,%r14,1),%xmm2 | |
583 movdqu 16(%rsp,%r14,1),%xmm4 | |
584 movdqu (%rdi,%r14,1),%xmm1 | |
585 movdqu 16(%rdi,%r14,1),%xmm3 | |
586 pxor %xmm1,%xmm2 | |
587 pxor %xmm3,%xmm4 | |
588 pand %xmm0,%xmm2 | |
589 pand %xmm0,%xmm4 | |
590 pxor %xmm1,%xmm2 | |
591 pxor %xmm3,%xmm4 | |
592 movdqu %xmm2,(%rdi,%r14,1) | |
593 movdqu %xmm4,16(%rdi,%r14,1) | |
594 movdqa %xmm5,(%rsp,%r14,1) | |
595 movdqa %xmm5,16(%rsp,%r14,1) | |
596 | |
597 leaq 32(%r14),%r14 | |
598 decq %r15 | |
599 jnz .Lcopy4x | |
600 | |
/* Epilogue: restore r9 = num, reload the saved rsp from tp[num+1], set */
/* the return value to 1, reload the pushed registers, release the frame. */
601 shlq $2,%r9 | |
602 movq 8(%rsp,%r9,8),%rsi | |
603 movq $1,%rax | |
604 movq (%rsi),%r15 | |
605 movq 8(%rsi),%r14 | |
606 movq 16(%rsi),%r13 | |
607 movq 24(%rsi),%r12 | |
608 movq 32(%rsi),%rbp | |
609 movq 40(%rsi),%rbx | |
610 leaq 48(%rsi),%rsp | |
611 .Lmul4x_epilogue: | |
/* 0xf3,0xc3 encodes "rep ret". */
612 .byte 0xf3,0xc3 | |
613 .size bn_mul4x_mont,.-bn_mul4x_mont | |
614 .extern bn_sqr8x_internal | |
615 .hidden bn_sqr8x_internal | |
616 | |
/*
 * bn_sqr8x_mont -- squaring path: sets up a stack frame, calls the external
 * bn_sqr8x_internal, then performs the final subtraction and a branch-free
 * conditional copy into the destination. The symbol is not exported here;
 * the visible entry is the jump to .Lsqr8x_enter from the bn_mul_mont
 * dispatch, taken when both operand pointers are equal and num % 8 == 0.
 *
 * Incoming registers: rdi = destination, rsi = operand, rcx = n[],
 * r8 = pointer to n0, r9d = num. Returns 1 in rax. rbx, rbp and r12-r15
 * are saved and restored.
 * NOTE(review): bn_sqr8x_internal is not visible here; every statement
 * about the values it leaves in registers is an assumption to confirm.
 */
617 .type bn_sqr8x_mont,@function | |
618 .align 32 | |
619 bn_sqr8x_mont: | |
620 .Lsqr8x_enter: | |
/* rax = rsp before the pushes; the epilogue restores registers relative */
/* to this value.                                                         */
621 movq %rsp,%rax | |
622 pushq %rbx | |
623 pushq %rbp | |
624 pushq %r12 | |
625 pushq %r13 | |
626 pushq %r14 | |
627 pushq %r15 | |
628 | |
/* r10 = 32*num, r9 = -(8*num), i.e. minus the operand size in bytes. */
629 movl %r9d,%r10d | |
630 shll $3,%r9d | |
631 shlq $3+2,%r10 | |
632 negq %r9 | |
633 | |
634 | |
635 | |
636 | |
637 | |
638 | |
/* r11 = (candidate frame base - rsi) & 4095, where the candidate base is  */
/* rsp - 64 - 16*num. The frame is then placed by one of the two branches  */
/* depending on how r11 compares with 32*num.                              */
/* NOTE(review): presumably this controls how the frame and the operand    */
/* at rsi overlap modulo 4096 -- confirm against the generating script.    */
639 leaq -64(%rsp,%r9,2),%r11 | |
640 movq (%r8),%r8 | |
641 subq %rsi,%r11 | |
642 andq $4095,%r11 | |
643 cmpq %r11,%r10 | |
644 jb .Lsqr8x_sp_alt | |
645 subq %r11,%rsp | |
646 leaq -64(%rsp,%r9,2),%rsp | |
647 jmp .Lsqr8x_sp_done | |
648 | |
649 .align 32 | |
650 .Lsqr8x_sp_alt: | |
/* r11 = max(r11 - (4096 - 64 - 16*num), 0): cmovc zeroes it on borrow. */
651 leaq 4096-64(,%r9,2),%r10 | |
652 leaq -64(%rsp,%r9,2),%rsp | |
653 subq %r10,%r11 | |
654 movq $0,%r10 | |
655 cmovcq %r10,%r11 | |
656 subq %r11,%rsp | |
657 .Lsqr8x_sp_done: | |
/* Align the frame to 64 bytes; r10 = -(8*num), r9 = +8*num. */
658 andq $-64,%rsp | |
659 movq %r9,%r10 | |
660 negq %r9 | |
661 | |
/* Frame slots: 32(%rsp) = n0 value, 40(%rsp) = original rsp. */
662 movq %r8,32(%rsp) | |
663 movq %rax,40(%rsp) | |
664 .Lsqr8x_body: | |
665 | |
/* Hand-encoded SSE2 moves that park values in xmm registers across the call: */
/*   102,72,15,110,209 = movq %rcx,%xmm2                                       */
/*   102,72,15,110,207 = movq %rdi,%xmm1                                       */
/*   102,73,15,110,218 = movq %r10,%xmm3                                       */
666 .byte 102,72,15,110,209 | |
667 pxor %xmm0,%xmm0 | |
668 .byte 102,72,15,110,207 | |
669 .byte 102,73,15,110,218 | |
670 call bn_sqr8x_internal | |
671 | |
672 | |
673 | |
674 | |
/* NOTE(review): rdi, r9, rbp and rax are used below as left by the callee. */
/* The loops count rcx and r9 up towards zero, so r9 is expected to be      */
/* negative here (presumably -(8*num)) -- confirm.                          */
/* 102,72,15,126,207 = movq %xmm1,%rdi: reload the destination pointer.     */
675 leaq (%rdi,%r9,1),%rbx | |
676 movq %r9,%rcx | |
677 movq %r9,%rdx | |
678 .byte 102,72,15,126,207 | |
679 sarq $3+2,%rcx | |
680 jmp .Lsqr8x_sub | |
681 | |
682 .align 32 | |
/* Subtract 32 bytes at (%rbp) from 32 bytes at (%rbx) with borrow and    */
/* store the difference at (%rdi); leaq and incq leave CF untouched, so   */
/* the borrow chain runs across iterations. rcx counts up to zero.        */
683 .Lsqr8x_sub: | |
684 movq 0(%rbx),%r12 | |
685 movq 8(%rbx),%r13 | |
686 movq 16(%rbx),%r14 | |
687 movq 24(%rbx),%r15 | |
688 leaq 32(%rbx),%rbx | |
689 sbbq 0(%rbp),%r12 | |
690 sbbq 8(%rbp),%r13 | |
691 sbbq 16(%rbp),%r14 | |
692 sbbq 24(%rbp),%r15 | |
693 leaq 32(%rbp),%rbp | |
694 movq %r12,0(%rdi) | |
695 movq %r13,8(%rdi) | |
696 movq %r14,16(%rdi) | |
697 movq %r15,24(%rdi) | |
698 leaq 32(%rdi),%rdi | |
699 incq %rcx | |
700 jnz .Lsqr8x_sub | |
701 | |
/* Fold the final borrow into rax, then step rbx and rdi by r9            */
/* (presumably rewinding both to the start of their vectors).             */
702 sbbq $0,%rax | |
703 leaq (%rbx,%r9,1),%rbx | |
704 leaq (%rdi,%r9,1),%rdi | |
705 | |
/* 102,72,15,110,200 = movq %rax,%xmm1; pshufd $0 then replicates the low */
/* dword of rax into all four lanes as the select mask. rsi = original    */
/* rsp, loaded now for the epilogue.                                      */
706 .byte 102,72,15,110,200 | |
707 pxor %xmm0,%xmm0 | |
708 pshufd $0,%xmm1,%xmm1 | |
709 movq 40(%rsp),%rsi | |
710 jmp .Lsqr8x_cond_copy | |
711 | |
712 .align 32 | |
/* Branch-free select, 32 bytes per iteration: xmm0 = (xmm1 == 0) per      */
/* dword, result = (data at rbx & xmm1) | (data at rdi & xmm0), stored to  */
/* rdi. The bytes just read at rbx, and the bytes at the same offset from  */
/* rbx+rdx, are overwritten with zeros. r9 counts up to zero in steps of 32. */
713 .Lsqr8x_cond_copy: | |
714 movdqa 0(%rbx),%xmm2 | |
715 movdqa 16(%rbx),%xmm3 | |
716 leaq 32(%rbx),%rbx | |
717 movdqu 0(%rdi),%xmm4 | |
718 movdqu 16(%rdi),%xmm5 | |
719 leaq 32(%rdi),%rdi | |
720 movdqa %xmm0,-32(%rbx) | |
721 movdqa %xmm0,-16(%rbx) | |
722 movdqa %xmm0,-32(%rbx,%rdx,1) | |
723 movdqa %xmm0,-16(%rbx,%rdx,1) | |
724 pcmpeqd %xmm1,%xmm0 | |
725 pand %xmm1,%xmm2 | |
726 pand %xmm1,%xmm3 | |
727 pand %xmm0,%xmm4 | |
728 pand %xmm0,%xmm5 | |
729 pxor %xmm0,%xmm0 | |
730 por %xmm2,%xmm4 | |
731 por %xmm3,%xmm5 | |
732 movdqu %xmm4,-32(%rdi) | |
733 movdqu %xmm5,-16(%rdi) | |
734 addq $32,%r9 | |
735 jnz .Lsqr8x_cond_copy | |
736 | |
/* Epilogue: return value 1; reload the pushed registers from just below */
/* the original rsp (held in rsi) and restore rsp itself.                */
737 movq $1,%rax | |
738 movq -48(%rsi),%r15 | |
739 movq -40(%rsi),%r14 | |
740 movq -32(%rsi),%r13 | |
741 movq -24(%rsi),%r12 | |
742 movq -16(%rsi),%rbp | |
743 movq -8(%rsi),%rbx | |
744 leaq (%rsi),%rsp | |
745 .Lsqr8x_epilogue: | |
/* 0xf3,0xc3 encodes "rep ret". */
746 .byte 0xf3,0xc3 | |
747 .size bn_sqr8x_mont,.-bn_sqr8x_mont | |
/*
 * NUL-terminated ASCII identification string:
 *   "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
 * In this rendering the byte list is wrapped across lines, and the value
 * 108 ('l') is split as "10" / "8" at one of the wraps.
 */
748 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105
,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84
,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,10
8,46,111,114,103,62,0 | |
749 .align 16 | |
750 #endif | |
OLD | NEW |