OLD | NEW |
| (Empty) |
1 #if defined(__x86_64__) | |
2 .text | |
3 | |
4 | |
5 | |
# _bn_mul_mont: exported entry point.  It only selects one of three code
# paths; all of the arithmetic is done by the path that is jumped to.
# Register use on entry, as read by the code paths in this file:
#   %rdi  destination word array (written by the final subtract/copy loops)
#   %rsi  first operand word array
#   %rdx  second operand word array
#   %rcx  word array that is subtracted from the result at the end
#         (presumably the modulus - TODO confirm against the C prototype)
#   %r8   pointer to one 64-bit constant, fetched with movq (%r8),%r8
#   %r9d  number of 64-bit words in each array
6 .globl _bn_mul_mont | |
7 .private_extern _bn_mul_mont | |
8 | |
9 .p2align 4 | |
10 _bn_mul_mont: | |
# Word count not a multiple of 4: take the generic path.
11 testl $3,%r9d | |
12 jnz L$mul_enter | |
# Fewer than 8 words: take the generic path.
13 cmpl $8,%r9d | |
14 jb L$mul_enter | |
# Operand pointers differ, so this is not a squaring: take the 4x path.
15 cmpq %rsi,%rdx | |
16 jne L$mul4x_enter | |
# Both operands are the same array.  A word count that is a multiple of 8
# goes to the squaring path; anything left over goes to the 4x path.
17 testl $7,%r9d | |
18 jz L$sqr8x_enter | |
19 jmp L$mul4x_enter | |
20 | |
# L$mul_enter: generic one-word-at-a-time path.  The dispatch code at the
# function entry sends control here when the word count in %r9d is below 8
# or is not a multiple of 4.  Returns 1 in %rax.
21 .p2align 4 | |
22 L$mul_enter: | |
# Save the six callee-saved registers that the body overwrites.
23 pushq %rbx | |
24 pushq %rbp | |
25 pushq %r12 | |
26 pushq %r13 | |
27 pushq %r14 | |
28 pushq %r15 | |
29 | |
# Zero-extend the word count, reserve num+2 words of scratch space below
# the saved registers, then round %rsp down to a 1024-byte boundary.
30 movl %r9d,%r9d | |
31 leaq 2(%r9),%r10 | |
32 movq %rsp,%r11 | |
33 negq %r10 | |
34 leaq (%rsp,%r10,8),%rsp | |
35 andq $-1024,%rsp | |
36 | |
# Keep the pre-allocation %rsp in scratch word num+1 for the epilogue.
37 movq %r11,8(%rsp,%r9,8) | |
38 L$mul_body: | |
# Register roles from here on:
#   %r12 = second operand pointer (copied from %rdx, which mulq clobbers)
#   %r8  = the constant fetched through the pointer passed in %r8
#   %rbx = current word of the second operand
#   %rbp = per-pass multiplier applied to the (%rcx) words
#   %r14 = outer index, %r15 = inner index
39 movq %rdx,%r12 | |
40 movq (%r8),%r8 | |
41 movq (%r12),%rbx | |
42 movq (%rsi),%rax | |
43 | |
44 xorq %r14,%r14 | |
45 xorq %r15,%r15 | |
46 | |
# First pass (outer index 0): the scratch buffer is still empty, so no
# previous value is added in.  %rbp = %r8 * low word of (%rsi)[0] * %rbx.
47 movq %r8,%rbp | |
48 mulq %rbx | |
49 movq %rax,%r10 | |
50 movq (%rcx),%rax | |
51 | |
52 imulq %r10,%rbp | |
53 movq %rdx,%r11 | |
54 | |
# The low word of this sum is never stored; only its carry moves on in %r13.
55 mulq %rbp | |
56 addq %rax,%r10 | |
57 movq 8(%rsi),%rax | |
58 adcq $0,%rdx | |
59 movq %rdx,%r13 | |
60 | |
61 leaq 1(%r15),%r15 | |
62 jmp L$1st_enter | |
63 | |
# Loop body of the first pass.  On arrival %rdx:%rax holds the product of
# the previous (%rcx) word and %rbp; it is summed with the pending %rsi
# product in %r11 and stored to scratch word %r15-2.
64 .p2align 4 | |
65 L$1st: | |
66 addq %rax,%r13 | |
67 movq (%rsi,%r15,8),%rax | |
68 adcq $0,%rdx | |
69 addq %r11,%r13 | |
70 movq %r10,%r11 | |
71 adcq $0,%rdx | |
72 movq %r13,-16(%rsp,%r15,8) | |
73 movq %rdx,%r13 | |
74 | |
75 L$1st_enter: | |
76 mulq %rbx | |
77 addq %rax,%r11 | |
78 movq (%rcx,%r15,8),%rax | |
79 adcq $0,%rdx | |
80 leaq 1(%r15),%r15 | |
81 movq %rdx,%r10 | |
82 | |
83 mulq %rbp | |
84 cmpq %r9,%r15 | |
85 jne L$1st | |
86 | |
# Tail of the first pass: store the last regular word, reload (%rsi)[0]
# into %rax for the next pass, then write the top word and its carry to
# scratch words num-1 and num.
87 addq %rax,%r13 | |
88 movq (%rsi),%rax | |
89 adcq $0,%rdx | |
90 addq %r11,%r13 | |
91 adcq $0,%rdx | |
92 movq %r13,-16(%rsp,%r15,8) | |
93 movq %rdx,%r13 | |
94 movq %r10,%r11 | |
95 | |
96 xorq %rdx,%rdx | |
97 addq %r11,%r13 | |
98 adcq $0,%rdx | |
99 movq %r13,-8(%rsp,%r9,8) | |
100 movq %rdx,(%rsp,%r9,8) | |
101 | |
102 leaq 1(%r14),%r14 | |
103 jmp L$outer | |
# Remaining passes, one per word of the second operand.  Same shape as the
# first pass, except that the previous scratch contents are added in (%r10
# carries the scratch word being folded).
104 .p2align 4 | |
105 L$outer: | |
106 movq (%r12,%r14,8),%rbx | |
107 xorq %r15,%r15 | |
108 movq %r8,%rbp | |
109 movq (%rsp),%r10 | |
110 mulq %rbx | |
111 addq %rax,%r10 | |
112 movq (%rcx),%rax | |
113 adcq $0,%rdx | |
114 | |
# %rbp = %r8 * low word of (scratch[0] + (%rsi)[0] * %rbx).
115 imulq %r10,%rbp | |
116 movq %rdx,%r11 | |
117 | |
# As in the first pass, the low word of this sum is dropped and only the
# carry is kept; %r10 is then reloaded with scratch word 1.
118 mulq %rbp | |
119 addq %rax,%r10 | |
120 movq 8(%rsi),%rax | |
121 adcq $0,%rdx | |
122 movq 8(%rsp),%r10 | |
123 movq %rdx,%r13 | |
124 | |
125 leaq 1(%r15),%r15 | |
126 jmp L$inner_enter | |
127 | |
128 .p2align 4 | |
129 L$inner: | |
130 addq %rax,%r13 | |
131 movq (%rsi,%r15,8),%rax | |
132 adcq $0,%rdx | |
133 addq %r10,%r13 | |
134 movq (%rsp,%r15,8),%r10 | |
135 adcq $0,%rdx | |
136 movq %r13,-16(%rsp,%r15,8) | |
137 movq %rdx,%r13 | |
138 | |
139 L$inner_enter: | |
140 mulq %rbx | |
141 addq %rax,%r11 | |
142 movq (%rcx,%r15,8),%rax | |
143 adcq $0,%rdx | |
144 addq %r11,%r10 | |
145 movq %rdx,%r11 | |
146 adcq $0,%r11 | |
147 leaq 1(%r15),%r15 | |
148 | |
149 mulq %rbp | |
150 cmpq %r9,%r15 | |
151 jne L$inner | |
152 | |
# Tail of the pass: fold the last products together with scratch word num
# (the carry word left by the previous pass) into words num-1 and num.
153 addq %rax,%r13 | |
154 movq (%rsi),%rax | |
155 adcq $0,%rdx | |
156 addq %r10,%r13 | |
157 movq (%rsp,%r15,8),%r10 | |
158 adcq $0,%rdx | |
159 movq %r13,-16(%rsp,%r15,8) | |
160 movq %rdx,%r13 | |
161 | |
162 xorq %rdx,%rdx | |
163 addq %r11,%r13 | |
164 adcq $0,%rdx | |
165 addq %r10,%r13 | |
166 adcq $0,%rdx | |
167 movq %r13,-8(%rsp,%r9,8) | |
168 movq %rdx,(%rsp,%r9,8) | |
169 | |
170 leaq 1(%r14),%r14 | |
171 cmpq %r9,%r14 | |
172 jb L$outer | |
173 | |
# Subtract the (%rcx) array from the scratch buffer, writing the difference
# to (%rdi).  The xorq clears CF for the first sbbq; the movq, leaq and
# decq instructions inside the loop leave CF untouched, so the borrow
# chains across iterations.
174 xorq %r14,%r14 | |
175 movq (%rsp),%rax | |
176 leaq (%rsp),%rsi | |
177 movq %r9,%r15 | |
178 jmp L$sub | |
179 .p2align 4 | |
180 L$sub: sbbq (%rcx,%r14,8),%rax | |
181 movq %rax,(%rdi,%r14,8) | |
182 movq 8(%rsi,%r14,8),%rax | |
183 leaq 1(%r14),%r14 | |
184 decq %r15 | |
185 jnz L$sub | |
186 | |
# %rax = scratch word num minus the final borrow.  It serves as the select
# mask below (expected to be either 0 or all ones).
187 sbbq $0,%rax | |
188 xorq %r14,%r14 | |
189 movq %r9,%r15 | |
# Branch-free select per word: ((scratch ^ dest) & mask) ^ dest yields the
# scratch word where mask bits are set and the already stored difference
# where they are clear.  Each scratch word is then overwritten with the
# loop index, which removes the intermediate value from the stack.
190 .p2align 4 | |
191 L$copy: | |
192 movq (%rsp,%r14,8),%rsi | |
193 movq (%rdi,%r14,8),%rcx | |
194 xorq %rcx,%rsi | |
195 andq %rax,%rsi | |
196 xorq %rcx,%rsi | |
197 movq %r14,(%rsp,%r14,8) | |
198 movq %rsi,(%rdi,%r14,8) | |
199 leaq 1(%r14),%r14 | |
200 subq $1,%r15 | |
201 jnz L$copy | |
202 | |
# Epilogue: fetch the saved %rsp from scratch word num+1, return 1, reload
# the six pushed registers in reverse push order and release the frame.
203 movq 8(%rsp,%r9,8),%rsi | |
204 movq $1,%rax | |
205 movq (%rsi),%r15 | |
206 movq 8(%rsi),%r14 | |
207 movq 16(%rsi),%r13 | |
208 movq 24(%rsi),%r12 | |
209 movq 32(%rsi),%rbp | |
210 movq 40(%rsi),%rbx | |
211 leaq 48(%rsi),%rsp | |
212 L$mul_epilogue: | |
# Bytes F3 C3 encode a rep-prefixed ret.
213 .byte 0xf3,0xc3 | |
214 | |
215 | |
# bn_mul4x_mont: same computation as the generic path, with the inner loops
# unrolled to handle four words per iteration.  The dispatch code at the
# function entry jumps to L$mul4x_enter only when the word count is at
# least 8 and a multiple of 4.  Argument registers are those of the entry
# point.  Returns 1 in %rax.
216 .p2align 4 | |
217 bn_mul4x_mont: | |
218 L$mul4x_enter: | |
# Save the six callee-saved registers that the body overwrites.
219 pushq %rbx | |
220 pushq %rbp | |
221 pushq %r12 | |
222 pushq %r13 | |
223 pushq %r14 | |
224 pushq %r15 | |
225 | |
# Zero-extend the word count, reserve num+4 words of scratch space and
# round %rsp down to a 1024-byte boundary.
226 movl %r9d,%r9d | |
227 leaq 4(%r9),%r10 | |
228 movq %rsp,%r11 | |
229 negq %r10 | |
230 leaq (%rsp,%r10,8),%rsp | |
231 andq $-1024,%rsp | |
232 | |
# Scratch word num+1 keeps the pre-allocation %rsp for the epilogue.
233 movq %r11,8(%rsp,%r9,8) | |
234 L$mul4x_body: | |
# The destination pointer is parked in scratch word num+2 because %rdi is
# used as an accumulator in the loops below.  Other register roles:
#   %r12 = second operand pointer, %r8 = constant fetched through %r8,
#   %rbx = current word of the second operand, %rbp = per-pass multiplier
#   for the (%rcx) words, %r14 = outer index, %r15 = inner index.
235 movq %rdi,16(%rsp,%r9,8) | |
236 movq %rdx,%r12 | |
237 movq (%r8),%r8 | |
238 movq (%r12),%rbx | |
239 movq (%rsi),%rax | |
240 | |
241 xorq %r14,%r14 | |
242 xorq %r15,%r15 | |
243 | |
# First pass (outer index 0): nothing is added from the scratch buffer.
# %rbp = %r8 * low word of (%rsi)[0] * %rbx.
244 movq %r8,%rbp | |
245 mulq %rbx | |
246 movq %rax,%r10 | |
247 movq (%rcx),%rax | |
248 | |
249 imulq %r10,%rbp | |
250 movq %rdx,%r11 | |
251 | |
# The low word of this sum is never stored; only its carry moves on in %rdi.
252 mulq %rbp | |
253 addq %rax,%r10 | |
254 movq 8(%rsi),%rax | |
255 adcq $0,%rdx | |
256 movq %rdx,%rdi | |
257 | |
258 mulq %rbx | |
259 addq %rax,%r11 | |
260 movq 8(%rcx),%rax | |
261 adcq $0,%rdx | |
262 movq %rdx,%r10 | |
263 | |
264 mulq %rbp | |
265 addq %rax,%rdi | |
266 movq 16(%rsi),%rax | |
267 adcq $0,%rdx | |
268 addq %r11,%rdi | |
269 leaq 4(%r15),%r15 | |
270 adcq $0,%rdx | |
271 movq %rdi,(%rsp) | |
272 movq %rdx,%r13 | |
273 jmp L$1st4x | |
# Unrolled loop of the first pass.  Each iteration multiplies four (%rsi)
# words by %rbx and four (%rcx) words by %rbp, alternating %r10/%r11 and
# %r13/%rdi as carry registers, and stores four scratch words.  %r15 is
# advanced by 4 in the middle of the body, which is why the displacements
# change between the two halves.
274 .p2align 4 | |
275 L$1st4x: | |
276 mulq %rbx | |
277 addq %rax,%r10 | |
278 movq -16(%rcx,%r15,8),%rax | |
279 adcq $0,%rdx | |
280 movq %rdx,%r11 | |
281 | |
282 mulq %rbp | |
283 addq %rax,%r13 | |
284 movq -8(%rsi,%r15,8),%rax | |
285 adcq $0,%rdx | |
286 addq %r10,%r13 | |
287 adcq $0,%rdx | |
288 movq %r13,-24(%rsp,%r15,8) | |
289 movq %rdx,%rdi | |
290 | |
291 mulq %rbx | |
292 addq %rax,%r11 | |
293 movq -8(%rcx,%r15,8),%rax | |
294 adcq $0,%rdx | |
295 movq %rdx,%r10 | |
296 | |
297 mulq %rbp | |
298 addq %rax,%rdi | |
299 movq (%rsi,%r15,8),%rax | |
300 adcq $0,%rdx | |
301 addq %r11,%rdi | |
302 adcq $0,%rdx | |
303 movq %rdi,-16(%rsp,%r15,8) | |
304 movq %rdx,%r13 | |
305 | |
306 mulq %rbx | |
307 addq %rax,%r10 | |
308 movq (%rcx,%r15,8),%rax | |
309 adcq $0,%rdx | |
310 movq %rdx,%r11 | |
311 | |
312 mulq %rbp | |
313 addq %rax,%r13 | |
314 movq 8(%rsi,%r15,8),%rax | |
315 adcq $0,%rdx | |
316 addq %r10,%r13 | |
317 adcq $0,%rdx | |
318 movq %r13,-8(%rsp,%r15,8) | |
319 movq %rdx,%rdi | |
320 | |
321 mulq %rbx | |
322 addq %rax,%r11 | |
323 movq 8(%rcx,%r15,8),%rax | |
324 adcq $0,%rdx | |
325 leaq 4(%r15),%r15 | |
326 movq %rdx,%r10 | |
327 | |
328 mulq %rbp | |
329 addq %rax,%rdi | |
330 movq -16(%rsi,%r15,8),%rax | |
331 adcq $0,%rdx | |
332 addq %r11,%rdi | |
333 adcq $0,%rdx | |
334 movq %rdi,-32(%rsp,%r15,8) | |
335 movq %rdx,%r13 | |
336 cmpq %r9,%r15 | |
337 jb L$1st4x | |
338 | |
# Tail of the first pass: the last two word pairs, then the top word and
# its carry.  (%rsi)[0] is reloaded into %rax for the next pass.
339 mulq %rbx | |
340 addq %rax,%r10 | |
341 movq -16(%rcx,%r15,8),%rax | |
342 adcq $0,%rdx | |
343 movq %rdx,%r11 | |
344 | |
345 mulq %rbp | |
346 addq %rax,%r13 | |
347 movq -8(%rsi,%r15,8),%rax | |
348 adcq $0,%rdx | |
349 addq %r10,%r13 | |
350 adcq $0,%rdx | |
351 movq %r13,-24(%rsp,%r15,8) | |
352 movq %rdx,%rdi | |
353 | |
354 mulq %rbx | |
355 addq %rax,%r11 | |
356 movq -8(%rcx,%r15,8),%rax | |
357 adcq $0,%rdx | |
358 movq %rdx,%r10 | |
359 | |
360 mulq %rbp | |
361 addq %rax,%rdi | |
362 movq (%rsi),%rax | |
363 adcq $0,%rdx | |
364 addq %r11,%rdi | |
365 adcq $0,%rdx | |
366 movq %rdi,-16(%rsp,%r15,8) | |
367 movq %rdx,%r13 | |
368 | |
369 xorq %rdi,%rdi | |
370 addq %r10,%r13 | |
371 adcq $0,%rdi | |
372 movq %r13,-8(%rsp,%r15,8) | |
373 movq %rdi,(%rsp,%r15,8) | |
374 | |
375 leaq 1(%r14),%r14 | |
# Remaining passes, one per word of the second operand.  Same structure as
# the first pass, with the previous scratch contents added into each word.
376 .p2align 2 | |
377 L$outer4x: | |
378 movq (%r12,%r14,8),%rbx | |
379 xorq %r15,%r15 | |
380 movq (%rsp),%r10 | |
381 movq %r8,%rbp | |
382 mulq %rbx | |
383 addq %rax,%r10 | |
384 movq (%rcx),%rax | |
385 adcq $0,%rdx | |
386 | |
# %rbp = %r8 * low word of (scratch[0] + (%rsi)[0] * %rbx).
387 imulq %r10,%rbp | |
388 movq %rdx,%r11 | |
389 | |
# The low word of this sum is dropped; only its carry is kept in %rdi.
390 mulq %rbp | |
391 addq %rax,%r10 | |
392 movq 8(%rsi),%rax | |
393 adcq $0,%rdx | |
394 movq %rdx,%rdi | |
395 | |
396 mulq %rbx | |
397 addq %rax,%r11 | |
398 movq 8(%rcx),%rax | |
399 adcq $0,%rdx | |
400 addq 8(%rsp),%r11 | |
401 adcq $0,%rdx | |
402 movq %rdx,%r10 | |
403 | |
404 mulq %rbp | |
405 addq %rax,%rdi | |
406 movq 16(%rsi),%rax | |
407 adcq $0,%rdx | |
408 addq %r11,%rdi | |
409 leaq 4(%r15),%r15 | |
410 adcq $0,%rdx | |
411 movq %rdi,(%rsp) | |
412 movq %rdx,%r13 | |
413 jmp L$inner4x | |
# Unrolled inner loop: as L$1st4x, plus an addq from the scratch buffer
# after every multiplication by %rbx.
414 .p2align 4 | |
415 L$inner4x: | |
416 mulq %rbx | |
417 addq %rax,%r10 | |
418 movq -16(%rcx,%r15,8),%rax | |
419 adcq $0,%rdx | |
420 addq -16(%rsp,%r15,8),%r10 | |
421 adcq $0,%rdx | |
422 movq %rdx,%r11 | |
423 | |
424 mulq %rbp | |
425 addq %rax,%r13 | |
426 movq -8(%rsi,%r15,8),%rax | |
427 adcq $0,%rdx | |
428 addq %r10,%r13 | |
429 adcq $0,%rdx | |
430 movq %r13,-24(%rsp,%r15,8) | |
431 movq %rdx,%rdi | |
432 | |
433 mulq %rbx | |
434 addq %rax,%r11 | |
435 movq -8(%rcx,%r15,8),%rax | |
436 adcq $0,%rdx | |
437 addq -8(%rsp,%r15,8),%r11 | |
438 adcq $0,%rdx | |
439 movq %rdx,%r10 | |
440 | |
441 mulq %rbp | |
442 addq %rax,%rdi | |
443 movq (%rsi,%r15,8),%rax | |
444 adcq $0,%rdx | |
445 addq %r11,%rdi | |
446 adcq $0,%rdx | |
447 movq %rdi,-16(%rsp,%r15,8) | |
448 movq %rdx,%r13 | |
449 | |
450 mulq %rbx | |
451 addq %rax,%r10 | |
452 movq (%rcx,%r15,8),%rax | |
453 adcq $0,%rdx | |
454 addq (%rsp,%r15,8),%r10 | |
455 adcq $0,%rdx | |
456 movq %rdx,%r11 | |
457 | |
458 mulq %rbp | |
459 addq %rax,%r13 | |
460 movq 8(%rsi,%r15,8),%rax | |
461 adcq $0,%rdx | |
462 addq %r10,%r13 | |
463 adcq $0,%rdx | |
464 movq %r13,-8(%rsp,%r15,8) | |
465 movq %rdx,%rdi | |
466 | |
467 mulq %rbx | |
468 addq %rax,%r11 | |
469 movq 8(%rcx,%r15,8),%rax | |
470 adcq $0,%rdx | |
471 addq 8(%rsp,%r15,8),%r11 | |
472 adcq $0,%rdx | |
473 leaq 4(%r15),%r15 | |
474 movq %rdx,%r10 | |
475 | |
476 mulq %rbp | |
477 addq %rax,%rdi | |
478 movq -16(%rsi,%r15,8),%rax | |
479 adcq $0,%rdx | |
480 addq %r11,%rdi | |
481 adcq $0,%rdx | |
482 movq %rdi,-32(%rsp,%r15,8) | |
483 movq %rdx,%r13 | |
484 cmpq %r9,%r15 | |
485 jb L$inner4x | |
486 | |
# Tail of the pass: last two word pairs, then the top word.  The outer
# index is advanced here (leaq leaves the flags alone) and (%rsi)[0] is
# reloaded into %rax for the next pass.
487 mulq %rbx | |
488 addq %rax,%r10 | |
489 movq -16(%rcx,%r15,8),%rax | |
490 adcq $0,%rdx | |
491 addq -16(%rsp,%r15,8),%r10 | |
492 adcq $0,%rdx | |
493 movq %rdx,%r11 | |
494 | |
495 mulq %rbp | |
496 addq %rax,%r13 | |
497 movq -8(%rsi,%r15,8),%rax | |
498 adcq $0,%rdx | |
499 addq %r10,%r13 | |
500 adcq $0,%rdx | |
501 movq %r13,-24(%rsp,%r15,8) | |
502 movq %rdx,%rdi | |
503 | |
504 mulq %rbx | |
505 addq %rax,%r11 | |
506 movq -8(%rcx,%r15,8),%rax | |
507 adcq $0,%rdx | |
508 addq -8(%rsp,%r15,8),%r11 | |
509 adcq $0,%rdx | |
510 leaq 1(%r14),%r14 | |
511 movq %rdx,%r10 | |
512 | |
513 mulq %rbp | |
514 addq %rax,%rdi | |
515 movq (%rsi),%rax | |
516 adcq $0,%rdx | |
517 addq %r11,%rdi | |
518 adcq $0,%rdx | |
519 movq %rdi,-16(%rsp,%r15,8) | |
520 movq %rdx,%r13 | |
521 | |
# Fold in scratch word num (the carry word left by the previous pass) and
# write the new top word and carry.
522 xorq %rdi,%rdi | |
523 addq %r10,%r13 | |
524 adcq $0,%rdi | |
525 addq (%rsp,%r9,8),%r13 | |
526 adcq $0,%rdi | |
527 movq %r13,-8(%rsp,%r15,8) | |
528 movq %rdi,(%rsp,%r15,8) | |
529 | |
530 cmpq %r9,%r14 | |
531 jb L$outer4x | |
# Final subtraction, four words per iteration.  The destination pointer is
# reloaded from scratch word num+2 and %r9 becomes num/4 for the loop
# counts.  The subq starts the borrow chain; every instruction between the
# sbbq steps (movq, leaq, decq) leaves CF untouched.
532 movq 16(%rsp,%r9,8),%rdi | |
533 movq 0(%rsp),%rax | |
534 movq 8(%rsp),%rdx | |
535 shrq $2,%r9 | |
536 leaq (%rsp),%rsi | |
537 xorq %r14,%r14 | |
538 | |
539 subq 0(%rcx),%rax | |
540 movq 16(%rsi),%rbx | |
541 movq 24(%rsi),%rbp | |
542 sbbq 8(%rcx),%rdx | |
543 leaq -1(%r9),%r15 | |
544 jmp L$sub4x | |
545 .p2align 4 | |
546 L$sub4x: | |
547 movq %rax,0(%rdi,%r14,8) | |
548 movq %rdx,8(%rdi,%r14,8) | |
549 sbbq 16(%rcx,%r14,8),%rbx | |
550 movq 32(%rsi,%r14,8),%rax | |
551 movq 40(%rsi,%r14,8),%rdx | |
552 sbbq 24(%rcx,%r14,8),%rbp | |
553 movq %rbx,16(%rdi,%r14,8) | |
554 movq %rbp,24(%rdi,%r14,8) | |
555 sbbq 32(%rcx,%r14,8),%rax | |
556 movq 48(%rsi,%r14,8),%rbx | |
557 movq 56(%rsi,%r14,8),%rbp | |
558 sbbq 40(%rcx,%r14,8),%rdx | |
559 leaq 4(%r14),%r14 | |
560 decq %r15 | |
561 jnz L$sub4x | |
562 | |
# Last group of four: store the pending words and load scratch word num
# (the top carry word) into %rax.
563 movq %rax,0(%rdi,%r14,8) | |
564 movq 32(%rsi,%r14,8),%rax | |
565 sbbq 16(%rcx,%r14,8),%rbx | |
566 movq %rdx,8(%rdi,%r14,8) | |
567 sbbq 24(%rcx,%r14,8),%rbp | |
568 movq %rbx,16(%rdi,%r14,8) | |
569 | |
# %rax = top carry word minus the final borrow; it becomes the select mask
# (expected to be either 0 or all ones) and is copied into both halves of
# %xmm0.
570 sbbq $0,%rax | |
571 movq %rax,%xmm0 | |
572 punpcklqdq %xmm0,%xmm0 | |
573 movq %rbp,24(%rdi,%r14,8) | |
574 xorq %r14,%r14 | |
575 | |
576 movq %r9,%r15 | |
577 pxor %xmm5,%xmm5 | |
578 jmp L$copy4x | |
# Branch-free select, 32 bytes per iteration with %r14 as a byte offset:
# ((scratch ^ dest) & mask) ^ dest yields the scratch data where mask bits
# are set and the stored difference where they are clear.  The scratch
# words are then zeroed from %xmm5; the aligned stores are valid because
# %rsp was rounded down to a 1024-byte boundary in the prologue.
579 .p2align 4 | |
580 L$copy4x: | |
581 movdqu (%rsp,%r14,1),%xmm2 | |
582 movdqu 16(%rsp,%r14,1),%xmm4 | |
583 movdqu (%rdi,%r14,1),%xmm1 | |
584 movdqu 16(%rdi,%r14,1),%xmm3 | |
585 pxor %xmm1,%xmm2 | |
586 pxor %xmm3,%xmm4 | |
587 pand %xmm0,%xmm2 | |
588 pand %xmm0,%xmm4 | |
589 pxor %xmm1,%xmm2 | |
590 pxor %xmm3,%xmm4 | |
591 movdqu %xmm2,(%rdi,%r14,1) | |
592 movdqu %xmm4,16(%rdi,%r14,1) | |
593 movdqa %xmm5,(%rsp,%r14,1) | |
594 movdqa %xmm5,16(%rsp,%r14,1) | |
595 | |
596 leaq 32(%r14),%r14 | |
597 decq %r15 | |
598 jnz L$copy4x | |
599 | |
# Epilogue: turn %r9 back into the word count, fetch the saved %rsp from
# scratch word num+1, return 1, reload the six pushed registers and release
# the frame.
600 shlq $2,%r9 | |
601 movq 8(%rsp,%r9,8),%rsi | |
602 movq $1,%rax | |
603 movq (%rsi),%r15 | |
604 movq 8(%rsi),%r14 | |
605 movq 16(%rsi),%r13 | |
606 movq 24(%rsi),%r12 | |
607 movq 32(%rsi),%rbp | |
608 movq 40(%rsi),%rbx | |
609 leaq 48(%rsi),%rsp | |
610 L$mul4x_epilogue: | |
# Bytes F3 C3 encode a rep-prefixed ret.
611 .byte 0xf3,0xc3 | |
612 | |
613 | |
614 | |
615 | |
# bn_sqr8x_mont: squaring path.  The dispatch code at the function entry
# jumps to L$sqr8x_enter only when both operand pointers are equal and the
# word count is a multiple of 8.  The arithmetic itself is delegated to
# _bn_sqr8x_internal, which is defined outside this block; this routine
# builds the stack frame, then performs the final subtraction and the
# masked copy into the destination.  Returns 1 in %rax.
616 .p2align 5 | |
617 bn_sqr8x_mont: | |
618 L$sqr8x_enter: | |
# %rax keeps the entry value of %rsp (taken before the pushes).
619 movq %rsp,%rax | |
620 pushq %rbx | |
621 pushq %rbp | |
622 pushq %r12 | |
623 pushq %r13 | |
624 pushq %r14 | |
625 pushq %r15 | |
626 | |
# %r10 = num * 32 and %r9 = -(num * 8), i.e. minus the operand size in bytes.
627 movl %r9d,%r10d | |
628 shll $3,%r9d | |
629 shlq $3+2,%r10 | |
630 negq %r9 | |
631 | |
632 | |
633 | |
634 | |
635 | |
636 | |
# Frame placement.  %r11 = (candidate frame bottom - %rsi) modulo 4096,
# where the candidate frame is 2*num words plus 64 bytes below %rsp.  When
# that offset is at most num*32, %rsp is first lowered by the offset;
# otherwise the alternative branch below is taken.
# NOTE(review): presumably this keeps the frame from aliasing the input at
# %rsi modulo the 4096-byte page size - confirm against the generator.
637 leaq -64(%rsp,%r9,2),%r11 | |
638 movq (%r8),%r8 | |
639 subq %rsi,%r11 | |
640 andq $4095,%r11 | |
641 cmpq %r11,%r10 | |
642 jb L$sqr8x_sp_alt | |
643 subq %r11,%rsp | |
644 leaq -64(%rsp,%r9,2),%rsp | |
645 jmp L$sqr8x_sp_done | |
646 | |
# Alternative placement: allocate the frame, then lower %rsp further by
# max(0, offset - (4096 - 64 - 16*num)).  The movq $0 does not change the
# flags, so cmovc still tests the borrow of the preceding subq.
647 .p2align 5 | |
648 L$sqr8x_sp_alt: | |
649 leaq 4096-64(,%r9,2),%r10 | |
650 leaq -64(%rsp,%r9,2),%rsp | |
651 subq %r10,%r11 | |
652 movq $0,%r10 | |
653 cmovcq %r10,%r11 | |
654 subq %r11,%rsp | |
655 L$sqr8x_sp_done: | |
# Align the frame to 64 bytes.  %r10 = -(num * 8), %r9 = num * 8.
656 andq $-64,%rsp | |
657 movq %r9,%r10 | |
658 negq %r9 | |
659 | |
# Frame slots: 32(%rsp) holds the constant fetched through %r8 and
# 40(%rsp) holds the entry value of %rsp for the epilogue.
660 movq %r8,32(%rsp) | |
661 movq %rax,40(%rsp) | |
662 L$sqr8x_body: | |
663 | |
# The .byte sequences are hand-encoded 64-bit movq transfers into SSE
# registers: %rcx to %xmm2, %rdi to %xmm1 (destination pointer, read back
# after the call) and %r10 to %xmm3.
664 .byte 102,72,15,110,209 | |
665 pxor %xmm0,%xmm0 | |
666 .byte 102,72,15,110,207 | |
667 .byte 102,73,15,110,218 | |
668 call _bn_sqr8x_internal | |
669 | |
670 | |
671 | |
672 | |
# NOTE(review): %rdi, %r9, %rbp and %rax are consumed below without being
# set in this block after the call, so they are presumably outputs of
# _bn_sqr8x_internal.  The loops below count %rcx and %r9 upward to zero,
# which implies that %r9 is negative here - confirm against that routine.
# The encoded instruction moves %xmm1 back into %rdi (destination pointer).
# For a count whose low five bits are zero, the sarq leaves CF clear for
# the first sbbq in the loop.
673 leaq (%rdi,%r9,1),%rbx | |
674 movq %r9,%rcx | |
675 movq %r9,%rdx | |
676 .byte 102,72,15,126,207 | |
677 sarq $3+2,%rcx | |
678 jmp L$sqr8x_sub | |
679 | |
# Subtract the words at (%rbp) from the words at (%rbx), four per
# iteration, storing the difference to (%rdi).  The movq, leaq and incq
# instructions leave CF untouched, so the borrow chains across iterations.
680 .p2align 5 | |
681 L$sqr8x_sub: | |
682 movq 0(%rbx),%r12 | |
683 movq 8(%rbx),%r13 | |
684 movq 16(%rbx),%r14 | |
685 movq 24(%rbx),%r15 | |
686 leaq 32(%rbx),%rbx | |
687 sbbq 0(%rbp),%r12 | |
688 sbbq 8(%rbp),%r13 | |
689 sbbq 16(%rbp),%r14 | |
690 sbbq 24(%rbp),%r15 | |
691 leaq 32(%rbp),%rbp | |
692 movq %r12,0(%rdi) | |
693 movq %r13,8(%rdi) | |
694 movq %r14,16(%rdi) | |
695 movq %r15,24(%rdi) | |
696 leaq 32(%rdi),%rdi | |
697 incq %rcx | |
698 jnz L$sqr8x_sub | |
699 | |
# %rax minus the final borrow becomes the select mask.  Both pointers are
# moved by %r9 back to the start of their arrays.
700 sbbq $0,%rax | |
701 leaq (%rbx,%r9,1),%rbx | |
702 leaq (%rdi,%r9,1),%rdi | |
703 | |
# The encoded instruction moves %rax into %xmm1; pshufd then replicates its
# low 32 bits into all four lanes.  %rsi = saved entry value of %rsp.
704 .byte 102,72,15,110,200 | |
705 pxor %xmm0,%xmm0 | |
706 pshufd $0,%xmm1,%xmm1 | |
707 movq 40(%rsp),%rsi | |
708 jmp L$sqr8x_cond_copy | |
709 | |
# Masked copy, 32 bytes per iteration.  The source words at (%rbx) and the
# block at the same position offset by %rdx are overwritten with zeros.
# pcmpeqd turns %xmm0 into the lane-wise test of %xmm1 against zero, which
# serves as the complementary mask: the result is (source AND mask) OR
# (stored difference AND complementary mask).
710 .p2align 5 | |
711 L$sqr8x_cond_copy: | |
712 movdqa 0(%rbx),%xmm2 | |
713 movdqa 16(%rbx),%xmm3 | |
714 leaq 32(%rbx),%rbx | |
715 movdqu 0(%rdi),%xmm4 | |
716 movdqu 16(%rdi),%xmm5 | |
717 leaq 32(%rdi),%rdi | |
718 movdqa %xmm0,-32(%rbx) | |
719 movdqa %xmm0,-16(%rbx) | |
720 movdqa %xmm0,-32(%rbx,%rdx,1) | |
721 movdqa %xmm0,-16(%rbx,%rdx,1) | |
722 pcmpeqd %xmm1,%xmm0 | |
723 pand %xmm1,%xmm2 | |
724 pand %xmm1,%xmm3 | |
725 pand %xmm0,%xmm4 | |
726 pand %xmm0,%xmm5 | |
727 pxor %xmm0,%xmm0 | |
728 por %xmm2,%xmm4 | |
729 por %xmm3,%xmm5 | |
730 movdqu %xmm4,-32(%rdi) | |
731 movdqu %xmm5,-16(%rdi) | |
732 addq $32,%r9 | |
733 jnz L$sqr8x_cond_copy | |
734 | |
# Epilogue: return 1, reload the six pushed registers from just below the
# saved entry %rsp, then restore %rsp itself.
735 movq $1,%rax | |
736 movq -48(%rsi),%r15 | |
737 movq -40(%rsi),%r14 | |
738 movq -32(%rsi),%r13 | |
739 movq -24(%rsi),%r12 | |
740 movq -16(%rsi),%rbp | |
741 movq -8(%rsi),%rbx | |
742 leaq (%rsi),%rsp | |
743 L$sqr8x_epilogue: | |
# Bytes F3 C3 encode a rep-prefixed ret.
744 .byte 0xf3,0xc3 | |
745 | |
746 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105
,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84
,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,10
8,46,111,114,103,62,0 | |
747 .p2align 4 | |
748 #endif | |
OLD | NEW |