Chromium Code Reviews

Side by Side Diff: third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S

Issue 2219933002: Land BoringSSL roll on master (Closed) Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: Created 4 years, 4 months ago
1 #if defined(__x86_64__)
2 .text
3
4
5
6 .p2align 6
7 L$zero:
8 .long 0,0,0,0
9 L$one:
10 .long 1,0,0,0
11 L$inc:
12 .long 0,1,2,3
13 L$four:
14 .long 4,4,4,4
15 L$incy:
16 .long 0,2,4,6,1,3,5,7
17 L$eight:
18 .long 8,8,8,8,8,8,8,8
19 L$rot16:
20 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
21 L$rot24:
22 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
23 L$sigma:
24 .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
25 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
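
Editor's note (not part of the generated file): the tables above are ChaCha20's constants. L$sigma is the NUL-terminated string "expand 32-byte k" followed by the CRYPTOGAMS builder string "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"; L$rot16 and L$rot24 are pshufb byte-shuffle masks for the 16-bit and 8-bit left rotations of the quarter round (rotate left by 8 equals rotate right by 24, hence the name); L$one, L$inc, L$four, L$incy and L$eight are counter increments for the scalar, 4x and 8x paths. A minimal C check of the sigma constant, valid on the little-endian hosts this file targets:

    /* Reference sketch, not part of this patch: the four 32-bit immediates the
     * scalar loop loads (0x61707865, 0x3320646e, 0x79622d32, 0x6b206574) are
     * the little-endian encoding of L$sigma's "expand 32-byte k". */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
      static const uint32_t sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
      char s[17] = {0};
      memcpy(s, sigma, 16);                      /* x86-64 is little-endian */
      assert(strcmp(s, "expand 32-byte k") == 0);
      return 0;
    }
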
26 .globl _ChaCha20_ctr32
27 .private_extern _ChaCha20_ctr32
28
29 .p2align 6
30 _ChaCha20_ctr32:
31 cmpq $0,%rdx
32 je L$no_data
33 movq _OPENSSL_ia32cap_P+4(%rip),%r10
34 testl $512,%r10d
35 jnz L$ChaCha20_ssse3
36
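Editor's note (not part of the generated file): the three instructions above dispatch on CPU features. _OPENSSL_ia32cap_P+4 is the ECX word of CPUID leaf 1 and $512 is 1 << 9, the SSSE3 feature bit, so SSSE3-capable CPUs branch to L$ChaCha20_ssse3 and everything else falls through to the integer code below. A hedged C equivalent, assuming the uint32_t[4] capability vector BoringSSL's CPU code fills in:

    /* Sketch only: mirrors "movq _OPENSSL_ia32cap_P+4(%rip),%r10; testl $512,%r10d". */
    #include <stdint.h>

    extern uint32_t OPENSSL_ia32cap_P[4];   /* filled in by BoringSSL's CPU init (assumed layout) */

    static int have_ssse3(void) {
      return (OPENSSL_ia32cap_P[1] & (1u << 9)) != 0;   /* CPUID.1:ECX bit 9 = SSSE3 */
    }
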
37 pushq %rbx
38 pushq %rbp
39 pushq %r12
40 pushq %r13
41 pushq %r14
42 pushq %r15
43 subq $64+24,%rsp
44
45
46 movdqu (%rcx),%xmm1
47 movdqu 16(%rcx),%xmm2
48 movdqu (%r8),%xmm3
49 movdqa L$one(%rip),%xmm4
50
51
52 movdqa %xmm1,16(%rsp)
53 movdqa %xmm2,32(%rsp)
54 movdqa %xmm3,48(%rsp)
55 movq %rdx,%rbp
56 jmp L$oop_outer
57
58 .p2align 5
59 L$oop_outer:
60 movl $0x61707865,%eax
61 movl $0x3320646e,%ebx
62 movl $0x79622d32,%ecx
63 movl $0x6b206574,%edx
64 movl 16(%rsp),%r8d
65 movl 20(%rsp),%r9d
66 movl 24(%rsp),%r10d
67 movl 28(%rsp),%r11d
68 movd %xmm3,%r12d
69 movl 52(%rsp),%r13d
70 movl 56(%rsp),%r14d
71 movl 60(%rsp),%r15d
72
73 movq %rbp,64+0(%rsp)
74 movl $10,%ebp
75 movq %rsi,64+8(%rsp)
76 .byte 102,72,15,126,214
77 movq %rdi,64+16(%rsp)
78 movq %rsi,%rdi
79 shrq $32,%rdi
80 jmp L$oop
81
82 .p2align 5
83 L$oop:
84 addl %r8d,%eax
85 xorl %eax,%r12d
86 roll $16,%r12d
87 addl %r9d,%ebx
88 xorl %ebx,%r13d
89 roll $16,%r13d
90 addl %r12d,%esi
91 xorl %esi,%r8d
92 roll $12,%r8d
93 addl %r13d,%edi
94 xorl %edi,%r9d
95 roll $12,%r9d
96 addl %r8d,%eax
97 xorl %eax,%r12d
98 roll $8,%r12d
99 addl %r9d,%ebx
100 xorl %ebx,%r13d
101 roll $8,%r13d
102 addl %r12d,%esi
103 xorl %esi,%r8d
104 roll $7,%r8d
105 addl %r13d,%edi
106 xorl %edi,%r9d
107 roll $7,%r9d
108 movl %esi,32(%rsp)
109 movl %edi,36(%rsp)
110 movl 40(%rsp),%esi
111 movl 44(%rsp),%edi
112 addl %r10d,%ecx
113 xorl %ecx,%r14d
114 roll $16,%r14d
115 addl %r11d,%edx
116 xorl %edx,%r15d
117 roll $16,%r15d
118 addl %r14d,%esi
119 xorl %esi,%r10d
120 roll $12,%r10d
121 addl %r15d,%edi
122 xorl %edi,%r11d
123 roll $12,%r11d
124 addl %r10d,%ecx
125 xorl %ecx,%r14d
126 roll $8,%r14d
127 addl %r11d,%edx
128 xorl %edx,%r15d
129 roll $8,%r15d
130 addl %r14d,%esi
131 xorl %esi,%r10d
132 roll $7,%r10d
133 addl %r15d,%edi
134 xorl %edi,%r11d
135 roll $7,%r11d
136 addl %r9d,%eax
137 xorl %eax,%r15d
138 roll $16,%r15d
139 addl %r10d,%ebx
140 xorl %ebx,%r12d
141 roll $16,%r12d
142 addl %r15d,%esi
143 xorl %esi,%r9d
144 roll $12,%r9d
145 addl %r12d,%edi
146 xorl %edi,%r10d
147 roll $12,%r10d
148 addl %r9d,%eax
149 xorl %eax,%r15d
150 roll $8,%r15d
151 addl %r10d,%ebx
152 xorl %ebx,%r12d
153 roll $8,%r12d
154 addl %r15d,%esi
155 xorl %esi,%r9d
156 roll $7,%r9d
157 addl %r12d,%edi
158 xorl %edi,%r10d
159 roll $7,%r10d
160 movl %esi,40(%rsp)
161 movl %edi,44(%rsp)
162 movl 32(%rsp),%esi
163 movl 36(%rsp),%edi
164 addl %r11d,%ecx
165 xorl %ecx,%r13d
166 roll $16,%r13d
167 addl %r8d,%edx
168 xorl %edx,%r14d
169 roll $16,%r14d
170 addl %r13d,%esi
171 xorl %esi,%r11d
172 roll $12,%r11d
173 addl %r14d,%edi
174 xorl %edi,%r8d
175 roll $12,%r8d
176 addl %r11d,%ecx
177 xorl %ecx,%r13d
178 roll $8,%r13d
179 addl %r8d,%edx
180 xorl %edx,%r14d
181 roll $8,%r14d
182 addl %r13d,%esi
183 xorl %esi,%r11d
184 roll $7,%r11d
185 addl %r14d,%edi
186 xorl %edi,%r8d
187 roll $7,%r8d
188 decl %ebp
189 jnz L$oop
190 movl %edi,36(%rsp)
191 movl %esi,32(%rsp)
192 movq 64(%rsp),%rbp
193 movdqa %xmm2,%xmm1
194 movq 64+8(%rsp),%rsi
195 paddd %xmm4,%xmm3
196 movq 64+16(%rsp),%rdi
197
198 addl $0x61707865,%eax
199 addl $0x3320646e,%ebx
200 addl $0x79622d32,%ecx
201 addl $0x6b206574,%edx
202 addl 16(%rsp),%r8d
203 addl 20(%rsp),%r9d
204 addl 24(%rsp),%r10d
205 addl 28(%rsp),%r11d
206 addl 48(%rsp),%r12d
207 addl 52(%rsp),%r13d
208 addl 56(%rsp),%r14d
209 addl 60(%rsp),%r15d
210 paddd 32(%rsp),%xmm1
211
212 cmpq $64,%rbp
213 jb L$tail
214
215 xorl 0(%rsi),%eax
216 xorl 4(%rsi),%ebx
217 xorl 8(%rsi),%ecx
218 xorl 12(%rsi),%edx
219 xorl 16(%rsi),%r8d
220 xorl 20(%rsi),%r9d
221 xorl 24(%rsi),%r10d
222 xorl 28(%rsi),%r11d
223 movdqu 32(%rsi),%xmm0
224 xorl 48(%rsi),%r12d
225 xorl 52(%rsi),%r13d
226 xorl 56(%rsi),%r14d
227 xorl 60(%rsi),%r15d
228 leaq 64(%rsi),%rsi
229 pxor %xmm1,%xmm0
230
231 movdqa %xmm2,32(%rsp)
232 movd %xmm3,48(%rsp)
233
234 movl %eax,0(%rdi)
235 movl %ebx,4(%rdi)
236 movl %ecx,8(%rdi)
237 movl %edx,12(%rdi)
238 movl %r8d,16(%rdi)
239 movl %r9d,20(%rdi)
240 movl %r10d,24(%rdi)
241 movl %r11d,28(%rdi)
242 movdqu %xmm0,32(%rdi)
243 movl %r12d,48(%rdi)
244 movl %r13d,52(%rdi)
245 movl %r14d,56(%rdi)
246 movl %r15d,60(%rdi)
247 leaq 64(%rdi),%rdi
248
249 subq $64,%rbp
250 jnz L$oop_outer
251
252 jmp L$done
253
254 .p2align 4
255 L$tail:
256 movl %eax,0(%rsp)
257 movl %ebx,4(%rsp)
258 xorq %rbx,%rbx
259 movl %ecx,8(%rsp)
260 movl %edx,12(%rsp)
261 movl %r8d,16(%rsp)
262 movl %r9d,20(%rsp)
263 movl %r10d,24(%rsp)
264 movl %r11d,28(%rsp)
265 movdqa %xmm1,32(%rsp)
266 movl %r12d,48(%rsp)
267 movl %r13d,52(%rsp)
268 movl %r14d,56(%rsp)
269 movl %r15d,60(%rsp)
270
271 L$oop_tail:
272 movzbl (%rsi,%rbx,1),%eax
273 movzbl (%rsp,%rbx,1),%edx
274 leaq 1(%rbx),%rbx
275 xorl %edx,%eax
276 movb %al,-1(%rdi,%rbx,1)
277 decq %rbp
278 jnz L$oop_tail
279
280 L$done:
281 addq $64+24,%rsp
282 popq %r15
283 popq %r14
284 popq %r13
285 popq %r12
286 popq %rbp
287 popq %rbx
288 L$no_data:
289 .byte 0xf3,0xc3
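
Editor's note (not part of the generated file): the integer path above keeps most of the 16-word ChaCha state in general-purpose registers, swapping the remaining words through the scratch area at 32(%rsp)-44(%rsp), runs %ebp = 10 iterations of the double round (20 rounds total), adds the input state back in, and XORs the resulting keystream with the input to produce output. Each addl/xorl/roll group with rotation counts 16, 12, 8, 7 is one quarter round; the closing `.byte 0xf3,0xc3` is the function's return (a rep-prefixed ret emitted as raw bytes). For reference, the quarter round in C:

    /* Reference sketch of the quarter round implemented by the addl/xorl/roll
     * sequences above; not part of this patch. */
    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, int n) {
      return (x << n) | (x >> (32 - n));
    }

    static void chacha_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
      *a += *b; *d ^= *a; *d = rotl32(*d, 16);
      *c += *d; *b ^= *c; *b = rotl32(*b, 12);
      *a += *b; *d ^= *a; *d = rotl32(*d, 8);
      *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }
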
290
291
292 .p2align 5
293 ChaCha20_ssse3:
294 L$ChaCha20_ssse3:
295 cmpq $128,%rdx
296 ja L$ChaCha20_4x
297
298 L$do_sse3_after_all:
299 pushq %rbx
300 pushq %rbp
301 pushq %r12
302 pushq %r13
303 pushq %r14
304 pushq %r15
305
306 subq $64+24,%rsp
307 movdqa L$sigma(%rip),%xmm0
308 movdqu (%rcx),%xmm1
309 movdqu 16(%rcx),%xmm2
310 movdqu (%r8),%xmm3
311 movdqa L$rot16(%rip),%xmm6
312 movdqa L$rot24(%rip),%xmm7
313
314 movdqa %xmm0,0(%rsp)
315 movdqa %xmm1,16(%rsp)
316 movdqa %xmm2,32(%rsp)
317 movdqa %xmm3,48(%rsp)
318 movl $10,%ebp
319 jmp L$oop_ssse3
320
321 .p2align 5
322 L$oop_outer_ssse3:
323 movdqa L$one(%rip),%xmm3
324 movdqa 0(%rsp),%xmm0
325 movdqa 16(%rsp),%xmm1
326 movdqa 32(%rsp),%xmm2
327 paddd 48(%rsp),%xmm3
328 movl $10,%ebp
329 movdqa %xmm3,48(%rsp)
330 jmp L$oop_ssse3
331
332 .p2align 5
333 L$oop_ssse3:
334 paddd %xmm1,%xmm0
335 pxor %xmm0,%xmm3
336 .byte 102,15,56,0,222
337 paddd %xmm3,%xmm2
338 pxor %xmm2,%xmm1
339 movdqa %xmm1,%xmm4
340 psrld $20,%xmm1
341 pslld $12,%xmm4
342 por %xmm4,%xmm1
343 paddd %xmm1,%xmm0
344 pxor %xmm0,%xmm3
345 .byte 102,15,56,0,223
346 paddd %xmm3,%xmm2
347 pxor %xmm2,%xmm1
348 movdqa %xmm1,%xmm4
349 psrld $25,%xmm1
350 pslld $7,%xmm4
351 por %xmm4,%xmm1
352 pshufd $78,%xmm2,%xmm2
353 pshufd $57,%xmm1,%xmm1
354 pshufd $147,%xmm3,%xmm3
355 nop
356 paddd %xmm1,%xmm0
357 pxor %xmm0,%xmm3
358 .byte 102,15,56,0,222
359 paddd %xmm3,%xmm2
360 pxor %xmm2,%xmm1
361 movdqa %xmm1,%xmm4
362 psrld $20,%xmm1
363 pslld $12,%xmm4
364 por %xmm4,%xmm1
365 paddd %xmm1,%xmm0
366 pxor %xmm0,%xmm3
367 .byte 102,15,56,0,223
368 paddd %xmm3,%xmm2
369 pxor %xmm2,%xmm1
370 movdqa %xmm1,%xmm4
371 psrld $25,%xmm1
372 pslld $7,%xmm4
373 por %xmm4,%xmm1
374 pshufd $78,%xmm2,%xmm2
375 pshufd $147,%xmm1,%xmm1
376 pshufd $57,%xmm3,%xmm3
377 decl %ebp
378 jnz L$oop_ssse3
379 paddd 0(%rsp),%xmm0
380 paddd 16(%rsp),%xmm1
381 paddd 32(%rsp),%xmm2
382 paddd 48(%rsp),%xmm3
383
384 cmpq $64,%rdx
385 jb L$tail_ssse3
386
387 movdqu 0(%rsi),%xmm4
388 movdqu 16(%rsi),%xmm5
389 pxor %xmm4,%xmm0
390 movdqu 32(%rsi),%xmm4
391 pxor %xmm5,%xmm1
392 movdqu 48(%rsi),%xmm5
393 leaq 64(%rsi),%rsi
394 pxor %xmm4,%xmm2
395 pxor %xmm5,%xmm3
396
397 movdqu %xmm0,0(%rdi)
398 movdqu %xmm1,16(%rdi)
399 movdqu %xmm2,32(%rdi)
400 movdqu %xmm3,48(%rdi)
401 leaq 64(%rdi),%rdi
402
403 subq $64,%rdx
404 jnz L$oop_outer_ssse3
405
406 jmp L$done_ssse3
407
408 .p2align 4
409 L$tail_ssse3:
410 movdqa %xmm0,0(%rsp)
411 movdqa %xmm1,16(%rsp)
412 movdqa %xmm2,32(%rsp)
413 movdqa %xmm3,48(%rsp)
414 xorq %rbx,%rbx
415
416 L$oop_tail_ssse3:
417 movzbl (%rsi,%rbx,1),%eax
418 movzbl (%rsp,%rbx,1),%ecx
419 leaq 1(%rbx),%rbx
420 xorl %ecx,%eax
421 movb %al,-1(%rdi,%rbx,1)
422 decq %rdx
423 jnz L$oop_tail_ssse3
424
425 L$done_ssse3:
426 addq $64+24,%rsp
427 popq %r15
428 popq %r14
429 popq %r13
430 popq %r12
431 popq %rbp
432 popq %rbx
433 .byte 0xf3,0xc3
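
Editor's note (not part of the generated file): the SSSE3 path above holds the state as four xmm rows. The hand-encoded `.byte 102,15,56,0,222` / `.byte 102,15,56,0,223` opcodes are `pshufb %xmm6,%xmm3` and `pshufb %xmm7,%xmm3`, applying the L$rot16/L$rot24 masks for the 16- and 8-bit rotations, while the 12- and 7-bit rotations use a pslld/psrld/por pair; the pshufd $57/$78/$147 shuffles rotate the rows into diagonal position and back. A sketch of the two rotation idioms with SSSE3 intrinsics (not part of this patch):

    /* Reference sketch only: 16-bit rotate via pshufb with the L$rot16 mask,
     * 12-bit rotate via shift-and-OR, as in L$oop_ssse3. */
    #include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

    static __m128i rotl_by_16(__m128i x) {
      const __m128i rot16 = _mm_setr_epi8(2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13);
      return _mm_shuffle_epi8(x, rot16);          /* pshufb with L$rot16 */
    }

    static __m128i rotl_by_12(__m128i x) {
      return _mm_or_si128(_mm_slli_epi32(x, 12),  /* pslld $12 */
                          _mm_srli_epi32(x, 20)); /* psrld $20 */
    }
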
434
435
436 .p2align 5
437 ChaCha20_4x:
438 L$ChaCha20_4x:
439 movq %r10,%r11
440 shrq $32,%r10
441 testq $32,%r10
442 jnz L$ChaCha20_8x
443 cmpq $192,%rdx
444 ja L$proceed4x
445
446 andq $71303168,%r11
447 cmpq $4194304,%r11
448 je L$do_sse3_after_all
449
450 L$proceed4x:
451 leaq -120(%rsp),%r11
452 subq $0x148+0,%rsp
453 movdqa L$sigma(%rip),%xmm11
454 movdqu (%rcx),%xmm15
455 movdqu 16(%rcx),%xmm7
456 movdqu (%r8),%xmm3
457 leaq 256(%rsp),%rcx
458 leaq L$rot16(%rip),%r10
459 leaq L$rot24(%rip),%r11
460
461 pshufd $0x00,%xmm11,%xmm8
462 pshufd $0x55,%xmm11,%xmm9
463 movdqa %xmm8,64(%rsp)
464 pshufd $0xaa,%xmm11,%xmm10
465 movdqa %xmm9,80(%rsp)
466 pshufd $0xff,%xmm11,%xmm11
467 movdqa %xmm10,96(%rsp)
468 movdqa %xmm11,112(%rsp)
469
470 pshufd $0x00,%xmm15,%xmm12
471 pshufd $0x55,%xmm15,%xmm13
472 movdqa %xmm12,128-256(%rcx)
473 pshufd $0xaa,%xmm15,%xmm14
474 movdqa %xmm13,144-256(%rcx)
475 pshufd $0xff,%xmm15,%xmm15
476 movdqa %xmm14,160-256(%rcx)
477 movdqa %xmm15,176-256(%rcx)
478
479 pshufd $0x00,%xmm7,%xmm4
480 pshufd $0x55,%xmm7,%xmm5
481 movdqa %xmm4,192-256(%rcx)
482 pshufd $0xaa,%xmm7,%xmm6
483 movdqa %xmm5,208-256(%rcx)
484 pshufd $0xff,%xmm7,%xmm7
485 movdqa %xmm6,224-256(%rcx)
486 movdqa %xmm7,240-256(%rcx)
487
488 pshufd $0x00,%xmm3,%xmm0
489 pshufd $0x55,%xmm3,%xmm1
490 paddd L$inc(%rip),%xmm0
491 pshufd $0xaa,%xmm3,%xmm2
492 movdqa %xmm1,272-256(%rcx)
493 pshufd $0xff,%xmm3,%xmm3
494 movdqa %xmm2,288-256(%rcx)
495 movdqa %xmm3,304-256(%rcx)
496
497 jmp L$oop_enter4x
498
499 .p2align 5
500 L$oop_outer4x:
501 movdqa 64(%rsp),%xmm8
502 movdqa 80(%rsp),%xmm9
503 movdqa 96(%rsp),%xmm10
504 movdqa 112(%rsp),%xmm11
505 movdqa 128-256(%rcx),%xmm12
506 movdqa 144-256(%rcx),%xmm13
507 movdqa 160-256(%rcx),%xmm14
508 movdqa 176-256(%rcx),%xmm15
509 movdqa 192-256(%rcx),%xmm4
510 movdqa 208-256(%rcx),%xmm5
511 movdqa 224-256(%rcx),%xmm6
512 movdqa 240-256(%rcx),%xmm7
513 movdqa 256-256(%rcx),%xmm0
514 movdqa 272-256(%rcx),%xmm1
515 movdqa 288-256(%rcx),%xmm2
516 movdqa 304-256(%rcx),%xmm3
517 paddd L$four(%rip),%xmm0
518
519 L$oop_enter4x:
520 movdqa %xmm6,32(%rsp)
521 movdqa %xmm7,48(%rsp)
522 movdqa (%r10),%xmm7
523 movl $10,%eax
524 movdqa %xmm0,256-256(%rcx)
525 jmp L$oop4x
526
527 .p2align 5
528 L$oop4x:
529 paddd %xmm12,%xmm8
530 paddd %xmm13,%xmm9
531 pxor %xmm8,%xmm0
532 pxor %xmm9,%xmm1
533 .byte 102,15,56,0,199
534 .byte 102,15,56,0,207
535 paddd %xmm0,%xmm4
536 paddd %xmm1,%xmm5
537 pxor %xmm4,%xmm12
538 pxor %xmm5,%xmm13
539 movdqa %xmm12,%xmm6
540 pslld $12,%xmm12
541 psrld $20,%xmm6
542 movdqa %xmm13,%xmm7
543 pslld $12,%xmm13
544 por %xmm6,%xmm12
545 psrld $20,%xmm7
546 movdqa (%r11),%xmm6
547 por %xmm7,%xmm13
548 paddd %xmm12,%xmm8
549 paddd %xmm13,%xmm9
550 pxor %xmm8,%xmm0
551 pxor %xmm9,%xmm1
552 .byte 102,15,56,0,198
553 .byte 102,15,56,0,206
554 paddd %xmm0,%xmm4
555 paddd %xmm1,%xmm5
556 pxor %xmm4,%xmm12
557 pxor %xmm5,%xmm13
558 movdqa %xmm12,%xmm7
559 pslld $7,%xmm12
560 psrld $25,%xmm7
561 movdqa %xmm13,%xmm6
562 pslld $7,%xmm13
563 por %xmm7,%xmm12
564 psrld $25,%xmm6
565 movdqa (%r10),%xmm7
566 por %xmm6,%xmm13
567 movdqa %xmm4,0(%rsp)
568 movdqa %xmm5,16(%rsp)
569 movdqa 32(%rsp),%xmm4
570 movdqa 48(%rsp),%xmm5
571 paddd %xmm14,%xmm10
572 paddd %xmm15,%xmm11
573 pxor %xmm10,%xmm2
574 pxor %xmm11,%xmm3
575 .byte 102,15,56,0,215
576 .byte 102,15,56,0,223
577 paddd %xmm2,%xmm4
578 paddd %xmm3,%xmm5
579 pxor %xmm4,%xmm14
580 pxor %xmm5,%xmm15
581 movdqa %xmm14,%xmm6
582 pslld $12,%xmm14
583 psrld $20,%xmm6
584 movdqa %xmm15,%xmm7
585 pslld $12,%xmm15
586 por %xmm6,%xmm14
587 psrld $20,%xmm7
588 movdqa (%r11),%xmm6
589 por %xmm7,%xmm15
590 paddd %xmm14,%xmm10
591 paddd %xmm15,%xmm11
592 pxor %xmm10,%xmm2
593 pxor %xmm11,%xmm3
594 .byte 102,15,56,0,214
595 .byte 102,15,56,0,222
596 paddd %xmm2,%xmm4
597 paddd %xmm3,%xmm5
598 pxor %xmm4,%xmm14
599 pxor %xmm5,%xmm15
600 movdqa %xmm14,%xmm7
601 pslld $7,%xmm14
602 psrld $25,%xmm7
603 movdqa %xmm15,%xmm6
604 pslld $7,%xmm15
605 por %xmm7,%xmm14
606 psrld $25,%xmm6
607 movdqa (%r10),%xmm7
608 por %xmm6,%xmm15
609 paddd %xmm13,%xmm8
610 paddd %xmm14,%xmm9
611 pxor %xmm8,%xmm3
612 pxor %xmm9,%xmm0
613 .byte 102,15,56,0,223
614 .byte 102,15,56,0,199
615 paddd %xmm3,%xmm4
616 paddd %xmm0,%xmm5
617 pxor %xmm4,%xmm13
618 pxor %xmm5,%xmm14
619 movdqa %xmm13,%xmm6
620 pslld $12,%xmm13
621 psrld $20,%xmm6
622 movdqa %xmm14,%xmm7
623 pslld $12,%xmm14
624 por %xmm6,%xmm13
625 psrld $20,%xmm7
626 movdqa (%r11),%xmm6
627 por %xmm7,%xmm14
628 paddd %xmm13,%xmm8
629 paddd %xmm14,%xmm9
630 pxor %xmm8,%xmm3
631 pxor %xmm9,%xmm0
632 .byte 102,15,56,0,222
633 .byte 102,15,56,0,198
634 paddd %xmm3,%xmm4
635 paddd %xmm0,%xmm5
636 pxor %xmm4,%xmm13
637 pxor %xmm5,%xmm14
638 movdqa %xmm13,%xmm7
639 pslld $7,%xmm13
640 psrld $25,%xmm7
641 movdqa %xmm14,%xmm6
642 pslld $7,%xmm14
643 por %xmm7,%xmm13
644 psrld $25,%xmm6
645 movdqa (%r10),%xmm7
646 por %xmm6,%xmm14
647 movdqa %xmm4,32(%rsp)
648 movdqa %xmm5,48(%rsp)
649 movdqa 0(%rsp),%xmm4
650 movdqa 16(%rsp),%xmm5
651 paddd %xmm15,%xmm10
652 paddd %xmm12,%xmm11
653 pxor %xmm10,%xmm1
654 pxor %xmm11,%xmm2
655 .byte 102,15,56,0,207
656 .byte 102,15,56,0,215
657 paddd %xmm1,%xmm4
658 paddd %xmm2,%xmm5
659 pxor %xmm4,%xmm15
660 pxor %xmm5,%xmm12
661 movdqa %xmm15,%xmm6
662 pslld $12,%xmm15
663 psrld $20,%xmm6
664 movdqa %xmm12,%xmm7
665 pslld $12,%xmm12
666 por %xmm6,%xmm15
667 psrld $20,%xmm7
668 movdqa (%r11),%xmm6
669 por %xmm7,%xmm12
670 paddd %xmm15,%xmm10
671 paddd %xmm12,%xmm11
672 pxor %xmm10,%xmm1
673 pxor %xmm11,%xmm2
674 .byte 102,15,56,0,206
675 .byte 102,15,56,0,214
676 paddd %xmm1,%xmm4
677 paddd %xmm2,%xmm5
678 pxor %xmm4,%xmm15
679 pxor %xmm5,%xmm12
680 movdqa %xmm15,%xmm7
681 pslld $7,%xmm15
682 psrld $25,%xmm7
683 movdqa %xmm12,%xmm6
684 pslld $7,%xmm12
685 por %xmm7,%xmm15
686 psrld $25,%xmm6
687 movdqa (%r10),%xmm7
688 por %xmm6,%xmm12
689 decl %eax
690 jnz L$oop4x
691
692 paddd 64(%rsp),%xmm8
693 paddd 80(%rsp),%xmm9
694 paddd 96(%rsp),%xmm10
695 paddd 112(%rsp),%xmm11
696
697 movdqa %xmm8,%xmm6
698 punpckldq %xmm9,%xmm8
699 movdqa %xmm10,%xmm7
700 punpckldq %xmm11,%xmm10
701 punpckhdq %xmm9,%xmm6
702 punpckhdq %xmm11,%xmm7
703 movdqa %xmm8,%xmm9
704 punpcklqdq %xmm10,%xmm8
705 movdqa %xmm6,%xmm11
706 punpcklqdq %xmm7,%xmm6
707 punpckhqdq %xmm10,%xmm9
708 punpckhqdq %xmm7,%xmm11
709 paddd 128-256(%rcx),%xmm12
710 paddd 144-256(%rcx),%xmm13
711 paddd 160-256(%rcx),%xmm14
712 paddd 176-256(%rcx),%xmm15
713
714 movdqa %xmm8,0(%rsp)
715 movdqa %xmm9,16(%rsp)
716 movdqa 32(%rsp),%xmm8
717 movdqa 48(%rsp),%xmm9
718
719 movdqa %xmm12,%xmm10
720 punpckldq %xmm13,%xmm12
721 movdqa %xmm14,%xmm7
722 punpckldq %xmm15,%xmm14
723 punpckhdq %xmm13,%xmm10
724 punpckhdq %xmm15,%xmm7
725 movdqa %xmm12,%xmm13
726 punpcklqdq %xmm14,%xmm12
727 movdqa %xmm10,%xmm15
728 punpcklqdq %xmm7,%xmm10
729 punpckhqdq %xmm14,%xmm13
730 punpckhqdq %xmm7,%xmm15
731 paddd 192-256(%rcx),%xmm4
732 paddd 208-256(%rcx),%xmm5
733 paddd 224-256(%rcx),%xmm8
734 paddd 240-256(%rcx),%xmm9
735
736 movdqa %xmm6,32(%rsp)
737 movdqa %xmm11,48(%rsp)
738
739 movdqa %xmm4,%xmm14
740 punpckldq %xmm5,%xmm4
741 movdqa %xmm8,%xmm7
742 punpckldq %xmm9,%xmm8
743 punpckhdq %xmm5,%xmm14
744 punpckhdq %xmm9,%xmm7
745 movdqa %xmm4,%xmm5
746 punpcklqdq %xmm8,%xmm4
747 movdqa %xmm14,%xmm9
748 punpcklqdq %xmm7,%xmm14
749 punpckhqdq %xmm8,%xmm5
750 punpckhqdq %xmm7,%xmm9
751 paddd 256-256(%rcx),%xmm0
752 paddd 272-256(%rcx),%xmm1
753 paddd 288-256(%rcx),%xmm2
754 paddd 304-256(%rcx),%xmm3
755
756 movdqa %xmm0,%xmm8
757 punpckldq %xmm1,%xmm0
758 movdqa %xmm2,%xmm7
759 punpckldq %xmm3,%xmm2
760 punpckhdq %xmm1,%xmm8
761 punpckhdq %xmm3,%xmm7
762 movdqa %xmm0,%xmm1
763 punpcklqdq %xmm2,%xmm0
764 movdqa %xmm8,%xmm3
765 punpcklqdq %xmm7,%xmm8
766 punpckhqdq %xmm2,%xmm1
767 punpckhqdq %xmm7,%xmm3
768 cmpq $256,%rdx
769 jb L$tail4x
770
771 movdqu 0(%rsi),%xmm6
772 movdqu 16(%rsi),%xmm11
773 movdqu 32(%rsi),%xmm2
774 movdqu 48(%rsi),%xmm7
775 pxor 0(%rsp),%xmm6
776 pxor %xmm12,%xmm11
777 pxor %xmm4,%xmm2
778 pxor %xmm0,%xmm7
779
780 movdqu %xmm6,0(%rdi)
781 movdqu 64(%rsi),%xmm6
782 movdqu %xmm11,16(%rdi)
783 movdqu 80(%rsi),%xmm11
784 movdqu %xmm2,32(%rdi)
785 movdqu 96(%rsi),%xmm2
786 movdqu %xmm7,48(%rdi)
787 movdqu 112(%rsi),%xmm7
788 leaq 128(%rsi),%rsi
789 pxor 16(%rsp),%xmm6
790 pxor %xmm13,%xmm11
791 pxor %xmm5,%xmm2
792 pxor %xmm1,%xmm7
793
794 movdqu %xmm6,64(%rdi)
795 movdqu 0(%rsi),%xmm6
796 movdqu %xmm11,80(%rdi)
797 movdqu 16(%rsi),%xmm11
798 movdqu %xmm2,96(%rdi)
799 movdqu 32(%rsi),%xmm2
800 movdqu %xmm7,112(%rdi)
801 leaq 128(%rdi),%rdi
802 movdqu 48(%rsi),%xmm7
803 pxor 32(%rsp),%xmm6
804 pxor %xmm10,%xmm11
805 pxor %xmm14,%xmm2
806 pxor %xmm8,%xmm7
807
808 movdqu %xmm6,0(%rdi)
809 movdqu 64(%rsi),%xmm6
810 movdqu %xmm11,16(%rdi)
811 movdqu 80(%rsi),%xmm11
812 movdqu %xmm2,32(%rdi)
813 movdqu 96(%rsi),%xmm2
814 movdqu %xmm7,48(%rdi)
815 movdqu 112(%rsi),%xmm7
816 leaq 128(%rsi),%rsi
817 pxor 48(%rsp),%xmm6
818 pxor %xmm15,%xmm11
819 pxor %xmm9,%xmm2
820 pxor %xmm3,%xmm7
821 movdqu %xmm6,64(%rdi)
822 movdqu %xmm11,80(%rdi)
823 movdqu %xmm2,96(%rdi)
824 movdqu %xmm7,112(%rdi)
825 leaq 128(%rdi),%rdi
826
827 subq $256,%rdx
828 jnz L$oop_outer4x
829
830 jmp L$done4x
831
832 L$tail4x:
833 cmpq $192,%rdx
834 jae L$192_or_more4x
835 cmpq $128,%rdx
836 jae L$128_or_more4x
837 cmpq $64,%rdx
838 jae L$64_or_more4x
839
840
841 xorq %r10,%r10
842
843 movdqa %xmm12,16(%rsp)
844 movdqa %xmm4,32(%rsp)
845 movdqa %xmm0,48(%rsp)
846 jmp L$oop_tail4x
847
848 .p2align 5
849 L$64_or_more4x:
850 movdqu 0(%rsi),%xmm6
851 movdqu 16(%rsi),%xmm11
852 movdqu 32(%rsi),%xmm2
853 movdqu 48(%rsi),%xmm7
854 pxor 0(%rsp),%xmm6
855 pxor %xmm12,%xmm11
856 pxor %xmm4,%xmm2
857 pxor %xmm0,%xmm7
858 movdqu %xmm6,0(%rdi)
859 movdqu %xmm11,16(%rdi)
860 movdqu %xmm2,32(%rdi)
861 movdqu %xmm7,48(%rdi)
862 je L$done4x
863
864 movdqa 16(%rsp),%xmm6
865 leaq 64(%rsi),%rsi
866 xorq %r10,%r10
867 movdqa %xmm6,0(%rsp)
868 movdqa %xmm13,16(%rsp)
869 leaq 64(%rdi),%rdi
870 movdqa %xmm5,32(%rsp)
871 subq $64,%rdx
872 movdqa %xmm1,48(%rsp)
873 jmp L$oop_tail4x
874
875 .p2align 5
876 L$128_or_more4x:
877 movdqu 0(%rsi),%xmm6
878 movdqu 16(%rsi),%xmm11
879 movdqu 32(%rsi),%xmm2
880 movdqu 48(%rsi),%xmm7
881 pxor 0(%rsp),%xmm6
882 pxor %xmm12,%xmm11
883 pxor %xmm4,%xmm2
884 pxor %xmm0,%xmm7
885
886 movdqu %xmm6,0(%rdi)
887 movdqu 64(%rsi),%xmm6
888 movdqu %xmm11,16(%rdi)
889 movdqu 80(%rsi),%xmm11
890 movdqu %xmm2,32(%rdi)
891 movdqu 96(%rsi),%xmm2
892 movdqu %xmm7,48(%rdi)
893 movdqu 112(%rsi),%xmm7
894 pxor 16(%rsp),%xmm6
895 pxor %xmm13,%xmm11
896 pxor %xmm5,%xmm2
897 pxor %xmm1,%xmm7
898 movdqu %xmm6,64(%rdi)
899 movdqu %xmm11,80(%rdi)
900 movdqu %xmm2,96(%rdi)
901 movdqu %xmm7,112(%rdi)
902 je L$done4x
903
904 movdqa 32(%rsp),%xmm6
905 leaq 128(%rsi),%rsi
906 xorq %r10,%r10
907 movdqa %xmm6,0(%rsp)
908 movdqa %xmm10,16(%rsp)
909 leaq 128(%rdi),%rdi
910 movdqa %xmm14,32(%rsp)
911 subq $128,%rdx
912 movdqa %xmm8,48(%rsp)
913 jmp L$oop_tail4x
914
915 .p2align 5
916 L$192_or_more4x:
917 movdqu 0(%rsi),%xmm6
918 movdqu 16(%rsi),%xmm11
919 movdqu 32(%rsi),%xmm2
920 movdqu 48(%rsi),%xmm7
921 pxor 0(%rsp),%xmm6
922 pxor %xmm12,%xmm11
923 pxor %xmm4,%xmm2
924 pxor %xmm0,%xmm7
925
926 movdqu %xmm6,0(%rdi)
927 movdqu 64(%rsi),%xmm6
928 movdqu %xmm11,16(%rdi)
929 movdqu 80(%rsi),%xmm11
930 movdqu %xmm2,32(%rdi)
931 movdqu 96(%rsi),%xmm2
932 movdqu %xmm7,48(%rdi)
933 movdqu 112(%rsi),%xmm7
934 leaq 128(%rsi),%rsi
935 pxor 16(%rsp),%xmm6
936 pxor %xmm13,%xmm11
937 pxor %xmm5,%xmm2
938 pxor %xmm1,%xmm7
939
940 movdqu %xmm6,64(%rdi)
941 movdqu 0(%rsi),%xmm6
942 movdqu %xmm11,80(%rdi)
943 movdqu 16(%rsi),%xmm11
944 movdqu %xmm2,96(%rdi)
945 movdqu 32(%rsi),%xmm2
946 movdqu %xmm7,112(%rdi)
947 leaq 128(%rdi),%rdi
948 movdqu 48(%rsi),%xmm7
949 pxor 32(%rsp),%xmm6
950 pxor %xmm10,%xmm11
951 pxor %xmm14,%xmm2
952 pxor %xmm8,%xmm7
953 movdqu %xmm6,0(%rdi)
954 movdqu %xmm11,16(%rdi)
955 movdqu %xmm2,32(%rdi)
956 movdqu %xmm7,48(%rdi)
957 je L$done4x
958
959 movdqa 48(%rsp),%xmm6
960 leaq 64(%rsi),%rsi
961 xorq %r10,%r10
962 movdqa %xmm6,0(%rsp)
963 movdqa %xmm15,16(%rsp)
964 leaq 64(%rdi),%rdi
965 movdqa %xmm9,32(%rsp)
966 subq $192,%rdx
967 movdqa %xmm3,48(%rsp)
968
969 L$oop_tail4x:
970 movzbl (%rsi,%r10,1),%eax
971 movzbl (%rsp,%r10,1),%ecx
972 leaq 1(%r10),%r10
973 xorl %ecx,%eax
974 movb %al,-1(%rdi,%r10,1)
975 decq %rdx
976 jnz L$oop_tail4x
977
978 L$done4x:
979 addq $0x148+0,%rsp
980 .byte 0xf3,0xc3
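
Editor's note (not part of the generated file): the 4x path above processes four 64-byte blocks per outer iteration with the state stored transposed, so each xmm register carries the same state word from four consecutive blocks and every paddd/pxor/pshufb advances all four blocks at once. The block counters start as counter + L$inc (0,1,2,3) and are bumped by L$four (4,4,4,4) in L$oop_outer4x; the punpckldq/punpckhdq/punpcklqdq/punpckhqdq ladder after the round loop transposes the results back into byte-contiguous blocks before XORing with the input. A sketch of the counter spreading (SSE2 intrinsics, not part of this patch):

    /* Sketch only: mirrors the pshufd $0x00 broadcast plus "paddd L$inc(%rip)". */
    #include <emmintrin.h>
    #include <stdint.h>

    static __m128i spread_counters_4x(uint32_t counter) {
      __m128i c   = _mm_set1_epi32((int)counter);   /* broadcast the 32-bit block counter */
      __m128i inc = _mm_setr_epi32(0, 1, 2, 3);     /* L$inc                               */
      return _mm_add_epi32(c, inc);                 /* lanes: n, n+1, n+2, n+3             */
    }
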
981
982
983 .p2align 5
984 ChaCha20_8x:
985 L$ChaCha20_8x:
986 movq %rsp,%r10
987 subq $0x280+8,%rsp
988 andq $-32,%rsp
989 vzeroupper
990 movq %r10,640(%rsp)
991
992
993
994
995
996
997
998
999
1000
1001 vbroadcasti128 L$sigma(%rip),%ymm11
1002 vbroadcasti128 (%rcx),%ymm3
1003 vbroadcasti128 16(%rcx),%ymm15
1004 vbroadcasti128 (%r8),%ymm7
1005 leaq 256(%rsp),%rcx
1006 leaq 512(%rsp),%rax
1007 leaq L$rot16(%rip),%r10
1008 leaq L$rot24(%rip),%r11
1009
1010 vpshufd $0x00,%ymm11,%ymm8
1011 vpshufd $0x55,%ymm11,%ymm9
1012 vmovdqa %ymm8,128-256(%rcx)
1013 vpshufd $0xaa,%ymm11,%ymm10
1014 vmovdqa %ymm9,160-256(%rcx)
1015 vpshufd $0xff,%ymm11,%ymm11
1016 vmovdqa %ymm10,192-256(%rcx)
1017 vmovdqa %ymm11,224-256(%rcx)
1018
1019 vpshufd $0x00,%ymm3,%ymm0
1020 vpshufd $0x55,%ymm3,%ymm1
1021 vmovdqa %ymm0,256-256(%rcx)
1022 vpshufd $0xaa,%ymm3,%ymm2
1023 vmovdqa %ymm1,288-256(%rcx)
1024 vpshufd $0xff,%ymm3,%ymm3
1025 vmovdqa %ymm2,320-256(%rcx)
1026 vmovdqa %ymm3,352-256(%rcx)
1027
1028 vpshufd $0x00,%ymm15,%ymm12
1029 vpshufd $0x55,%ymm15,%ymm13
1030 vmovdqa %ymm12,384-512(%rax)
1031 vpshufd $0xaa,%ymm15,%ymm14
1032 vmovdqa %ymm13,416-512(%rax)
1033 vpshufd $0xff,%ymm15,%ymm15
1034 vmovdqa %ymm14,448-512(%rax)
1035 vmovdqa %ymm15,480-512(%rax)
1036
1037 vpshufd $0x00,%ymm7,%ymm4
1038 vpshufd $0x55,%ymm7,%ymm5
1039 vpaddd L$incy(%rip),%ymm4,%ymm4
1040 vpshufd $0xaa,%ymm7,%ymm6
1041 vmovdqa %ymm5,544-512(%rax)
1042 vpshufd $0xff,%ymm7,%ymm7
1043 vmovdqa %ymm6,576-512(%rax)
1044 vmovdqa %ymm7,608-512(%rax)
1045
1046 jmp L$oop_enter8x
1047
1048 .p2align 5
1049 L$oop_outer8x:
1050 vmovdqa 128-256(%rcx),%ymm8
1051 vmovdqa 160-256(%rcx),%ymm9
1052 vmovdqa 192-256(%rcx),%ymm10
1053 vmovdqa 224-256(%rcx),%ymm11
1054 vmovdqa 256-256(%rcx),%ymm0
1055 vmovdqa 288-256(%rcx),%ymm1
1056 vmovdqa 320-256(%rcx),%ymm2
1057 vmovdqa 352-256(%rcx),%ymm3
1058 vmovdqa 384-512(%rax),%ymm12
1059 vmovdqa 416-512(%rax),%ymm13
1060 vmovdqa 448-512(%rax),%ymm14
1061 vmovdqa 480-512(%rax),%ymm15
1062 vmovdqa 512-512(%rax),%ymm4
1063 vmovdqa 544-512(%rax),%ymm5
1064 vmovdqa 576-512(%rax),%ymm6
1065 vmovdqa 608-512(%rax),%ymm7
1066 vpaddd L$eight(%rip),%ymm4,%ymm4
1067
1068 L$oop_enter8x:
1069 vmovdqa %ymm14,64(%rsp)
1070 vmovdqa %ymm15,96(%rsp)
1071 vbroadcasti128 (%r10),%ymm15
1072 vmovdqa %ymm4,512-512(%rax)
1073 movl $10,%eax
1074 jmp L$oop8x
1075
1076 .p2align 5
1077 L$oop8x:
1078 vpaddd %ymm0,%ymm8,%ymm8
1079 vpxor %ymm4,%ymm8,%ymm4
1080 vpshufb %ymm15,%ymm4,%ymm4
1081 vpaddd %ymm1,%ymm9,%ymm9
1082 vpxor %ymm5,%ymm9,%ymm5
1083 vpshufb %ymm15,%ymm5,%ymm5
1084 vpaddd %ymm4,%ymm12,%ymm12
1085 vpxor %ymm0,%ymm12,%ymm0
1086 vpslld $12,%ymm0,%ymm14
1087 vpsrld $20,%ymm0,%ymm0
1088 vpor %ymm0,%ymm14,%ymm0
1089 vbroadcasti128 (%r11),%ymm14
1090 vpaddd %ymm5,%ymm13,%ymm13
1091 vpxor %ymm1,%ymm13,%ymm1
1092 vpslld $12,%ymm1,%ymm15
1093 vpsrld $20,%ymm1,%ymm1
1094 vpor %ymm1,%ymm15,%ymm1
1095 vpaddd %ymm0,%ymm8,%ymm8
1096 vpxor %ymm4,%ymm8,%ymm4
1097 vpshufb %ymm14,%ymm4,%ymm4
1098 vpaddd %ymm1,%ymm9,%ymm9
1099 vpxor %ymm5,%ymm9,%ymm5
1100 vpshufb %ymm14,%ymm5,%ymm5
1101 vpaddd %ymm4,%ymm12,%ymm12
1102 vpxor %ymm0,%ymm12,%ymm0
1103 vpslld $7,%ymm0,%ymm15
1104 vpsrld $25,%ymm0,%ymm0
1105 vpor %ymm0,%ymm15,%ymm0
1106 vbroadcasti128 (%r10),%ymm15
1107 vpaddd %ymm5,%ymm13,%ymm13
1108 vpxor %ymm1,%ymm13,%ymm1
1109 vpslld $7,%ymm1,%ymm14
1110 vpsrld $25,%ymm1,%ymm1
1111 vpor %ymm1,%ymm14,%ymm1
1112 vmovdqa %ymm12,0(%rsp)
1113 vmovdqa %ymm13,32(%rsp)
1114 vmovdqa 64(%rsp),%ymm12
1115 vmovdqa 96(%rsp),%ymm13
1116 vpaddd %ymm2,%ymm10,%ymm10
1117 vpxor %ymm6,%ymm10,%ymm6
1118 vpshufb %ymm15,%ymm6,%ymm6
1119 vpaddd %ymm3,%ymm11,%ymm11
1120 vpxor %ymm7,%ymm11,%ymm7
1121 vpshufb %ymm15,%ymm7,%ymm7
1122 vpaddd %ymm6,%ymm12,%ymm12
1123 vpxor %ymm2,%ymm12,%ymm2
1124 vpslld $12,%ymm2,%ymm14
1125 vpsrld $20,%ymm2,%ymm2
1126 vpor %ymm2,%ymm14,%ymm2
1127 vbroadcasti128 (%r11),%ymm14
1128 vpaddd %ymm7,%ymm13,%ymm13
1129 vpxor %ymm3,%ymm13,%ymm3
1130 vpslld $12,%ymm3,%ymm15
1131 vpsrld $20,%ymm3,%ymm3
1132 vpor %ymm3,%ymm15,%ymm3
1133 vpaddd %ymm2,%ymm10,%ymm10
1134 vpxor %ymm6,%ymm10,%ymm6
1135 vpshufb %ymm14,%ymm6,%ymm6
1136 vpaddd %ymm3,%ymm11,%ymm11
1137 vpxor %ymm7,%ymm11,%ymm7
1138 vpshufb %ymm14,%ymm7,%ymm7
1139 vpaddd %ymm6,%ymm12,%ymm12
1140 vpxor %ymm2,%ymm12,%ymm2
1141 vpslld $7,%ymm2,%ymm15
1142 vpsrld $25,%ymm2,%ymm2
1143 vpor %ymm2,%ymm15,%ymm2
1144 vbroadcasti128 (%r10),%ymm15
1145 vpaddd %ymm7,%ymm13,%ymm13
1146 vpxor %ymm3,%ymm13,%ymm3
1147 vpslld $7,%ymm3,%ymm14
1148 vpsrld $25,%ymm3,%ymm3
1149 vpor %ymm3,%ymm14,%ymm3
1150 vpaddd %ymm1,%ymm8,%ymm8
1151 vpxor %ymm7,%ymm8,%ymm7
1152 vpshufb %ymm15,%ymm7,%ymm7
1153 vpaddd %ymm2,%ymm9,%ymm9
1154 vpxor %ymm4,%ymm9,%ymm4
1155 vpshufb %ymm15,%ymm4,%ymm4
1156 vpaddd %ymm7,%ymm12,%ymm12
1157 vpxor %ymm1,%ymm12,%ymm1
1158 vpslld $12,%ymm1,%ymm14
1159 vpsrld $20,%ymm1,%ymm1
1160 vpor %ymm1,%ymm14,%ymm1
1161 vbroadcasti128 (%r11),%ymm14
1162 vpaddd %ymm4,%ymm13,%ymm13
1163 vpxor %ymm2,%ymm13,%ymm2
1164 vpslld $12,%ymm2,%ymm15
1165 vpsrld $20,%ymm2,%ymm2
1166 vpor %ymm2,%ymm15,%ymm2
1167 vpaddd %ymm1,%ymm8,%ymm8
1168 vpxor %ymm7,%ymm8,%ymm7
1169 vpshufb %ymm14,%ymm7,%ymm7
1170 vpaddd %ymm2,%ymm9,%ymm9
1171 vpxor %ymm4,%ymm9,%ymm4
1172 vpshufb %ymm14,%ymm4,%ymm4
1173 vpaddd %ymm7,%ymm12,%ymm12
1174 vpxor %ymm1,%ymm12,%ymm1
1175 vpslld $7,%ymm1,%ymm15
1176 vpsrld $25,%ymm1,%ymm1
1177 vpor %ymm1,%ymm15,%ymm1
1178 vbroadcasti128 (%r10),%ymm15
1179 vpaddd %ymm4,%ymm13,%ymm13
1180 vpxor %ymm2,%ymm13,%ymm2
1181 vpslld $7,%ymm2,%ymm14
1182 vpsrld $25,%ymm2,%ymm2
1183 vpor %ymm2,%ymm14,%ymm2
1184 vmovdqa %ymm12,64(%rsp)
1185 vmovdqa %ymm13,96(%rsp)
1186 vmovdqa 0(%rsp),%ymm12
1187 vmovdqa 32(%rsp),%ymm13
1188 vpaddd %ymm3,%ymm10,%ymm10
1189 vpxor %ymm5,%ymm10,%ymm5
1190 vpshufb %ymm15,%ymm5,%ymm5
1191 vpaddd %ymm0,%ymm11,%ymm11
1192 vpxor %ymm6,%ymm11,%ymm6
1193 vpshufb %ymm15,%ymm6,%ymm6
1194 vpaddd %ymm5,%ymm12,%ymm12
1195 vpxor %ymm3,%ymm12,%ymm3
1196 vpslld $12,%ymm3,%ymm14
1197 vpsrld $20,%ymm3,%ymm3
1198 vpor %ymm3,%ymm14,%ymm3
1199 vbroadcasti128 (%r11),%ymm14
1200 vpaddd %ymm6,%ymm13,%ymm13
1201 vpxor %ymm0,%ymm13,%ymm0
1202 vpslld $12,%ymm0,%ymm15
1203 vpsrld $20,%ymm0,%ymm0
1204 vpor %ymm0,%ymm15,%ymm0
1205 vpaddd %ymm3,%ymm10,%ymm10
1206 vpxor %ymm5,%ymm10,%ymm5
1207 vpshufb %ymm14,%ymm5,%ymm5
1208 vpaddd %ymm0,%ymm11,%ymm11
1209 vpxor %ymm6,%ymm11,%ymm6
1210 vpshufb %ymm14,%ymm6,%ymm6
1211 vpaddd %ymm5,%ymm12,%ymm12
1212 vpxor %ymm3,%ymm12,%ymm3
1213 vpslld $7,%ymm3,%ymm15
1214 vpsrld $25,%ymm3,%ymm3
1215 vpor %ymm3,%ymm15,%ymm3
1216 vbroadcasti128 (%r10),%ymm15
1217 vpaddd %ymm6,%ymm13,%ymm13
1218 vpxor %ymm0,%ymm13,%ymm0
1219 vpslld $7,%ymm0,%ymm14
1220 vpsrld $25,%ymm0,%ymm0
1221 vpor %ymm0,%ymm14,%ymm0
1222 decl %eax
1223 jnz L$oop8x
1224
1225 leaq 512(%rsp),%rax
1226 vpaddd 128-256(%rcx),%ymm8,%ymm8
1227 vpaddd 160-256(%rcx),%ymm9,%ymm9
1228 vpaddd 192-256(%rcx),%ymm10,%ymm10
1229 vpaddd 224-256(%rcx),%ymm11,%ymm11
1230
1231 vpunpckldq %ymm9,%ymm8,%ymm14
1232 vpunpckldq %ymm11,%ymm10,%ymm15
1233 vpunpckhdq %ymm9,%ymm8,%ymm8
1234 vpunpckhdq %ymm11,%ymm10,%ymm10
1235 vpunpcklqdq %ymm15,%ymm14,%ymm9
1236 vpunpckhqdq %ymm15,%ymm14,%ymm14
1237 vpunpcklqdq %ymm10,%ymm8,%ymm11
1238 vpunpckhqdq %ymm10,%ymm8,%ymm8
1239 vpaddd 256-256(%rcx),%ymm0,%ymm0
1240 vpaddd 288-256(%rcx),%ymm1,%ymm1
1241 vpaddd 320-256(%rcx),%ymm2,%ymm2
1242 vpaddd 352-256(%rcx),%ymm3,%ymm3
1243
1244 vpunpckldq %ymm1,%ymm0,%ymm10
1245 vpunpckldq %ymm3,%ymm2,%ymm15
1246 vpunpckhdq %ymm1,%ymm0,%ymm0
1247 vpunpckhdq %ymm3,%ymm2,%ymm2
1248 vpunpcklqdq %ymm15,%ymm10,%ymm1
1249 vpunpckhqdq %ymm15,%ymm10,%ymm10
1250 vpunpcklqdq %ymm2,%ymm0,%ymm3
1251 vpunpckhqdq %ymm2,%ymm0,%ymm0
1252 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
1253 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
1254 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
1255 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
1256 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
1257 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
1258 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
1259 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
1260 vmovdqa %ymm15,0(%rsp)
1261 vmovdqa %ymm9,32(%rsp)
1262 vmovdqa 64(%rsp),%ymm15
1263 vmovdqa 96(%rsp),%ymm9
1264
1265 vpaddd 384-512(%rax),%ymm12,%ymm12
1266 vpaddd 416-512(%rax),%ymm13,%ymm13
1267 vpaddd 448-512(%rax),%ymm15,%ymm15
1268 vpaddd 480-512(%rax),%ymm9,%ymm9
1269
1270 vpunpckldq %ymm13,%ymm12,%ymm2
1271 vpunpckldq %ymm9,%ymm15,%ymm8
1272 vpunpckhdq %ymm13,%ymm12,%ymm12
1273 vpunpckhdq %ymm9,%ymm15,%ymm15
1274 vpunpcklqdq %ymm8,%ymm2,%ymm13
1275 vpunpckhqdq %ymm8,%ymm2,%ymm2
1276 vpunpcklqdq %ymm15,%ymm12,%ymm9
1277 vpunpckhqdq %ymm15,%ymm12,%ymm12
1278 vpaddd 512-512(%rax),%ymm4,%ymm4
1279 vpaddd 544-512(%rax),%ymm5,%ymm5
1280 vpaddd 576-512(%rax),%ymm6,%ymm6
1281 vpaddd 608-512(%rax),%ymm7,%ymm7
1282
1283 vpunpckldq %ymm5,%ymm4,%ymm15
1284 vpunpckldq %ymm7,%ymm6,%ymm8
1285 vpunpckhdq %ymm5,%ymm4,%ymm4
1286 vpunpckhdq %ymm7,%ymm6,%ymm6
1287 vpunpcklqdq %ymm8,%ymm15,%ymm5
1288 vpunpckhqdq %ymm8,%ymm15,%ymm15
1289 vpunpcklqdq %ymm6,%ymm4,%ymm7
1290 vpunpckhqdq %ymm6,%ymm4,%ymm4
1291 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
1292 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
1293 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
1294 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
1295 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
1296 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
1297 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
1298 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
1299 vmovdqa 0(%rsp),%ymm6
1300 vmovdqa 32(%rsp),%ymm12
1301
1302 cmpq $512,%rdx
1303 jb L$tail8x
1304
1305 vpxor 0(%rsi),%ymm6,%ymm6
1306 vpxor 32(%rsi),%ymm8,%ymm8
1307 vpxor 64(%rsi),%ymm1,%ymm1
1308 vpxor 96(%rsi),%ymm5,%ymm5
1309 leaq 128(%rsi),%rsi
1310 vmovdqu %ymm6,0(%rdi)
1311 vmovdqu %ymm8,32(%rdi)
1312 vmovdqu %ymm1,64(%rdi)
1313 vmovdqu %ymm5,96(%rdi)
1314 leaq 128(%rdi),%rdi
1315
1316 vpxor 0(%rsi),%ymm12,%ymm12
1317 vpxor 32(%rsi),%ymm13,%ymm13
1318 vpxor 64(%rsi),%ymm10,%ymm10
1319 vpxor 96(%rsi),%ymm15,%ymm15
1320 leaq 128(%rsi),%rsi
1321 vmovdqu %ymm12,0(%rdi)
1322 vmovdqu %ymm13,32(%rdi)
1323 vmovdqu %ymm10,64(%rdi)
1324 vmovdqu %ymm15,96(%rdi)
1325 leaq 128(%rdi),%rdi
1326
1327 vpxor 0(%rsi),%ymm14,%ymm14
1328 vpxor 32(%rsi),%ymm2,%ymm2
1329 vpxor 64(%rsi),%ymm3,%ymm3
1330 vpxor 96(%rsi),%ymm7,%ymm7
1331 leaq 128(%rsi),%rsi
1332 vmovdqu %ymm14,0(%rdi)
1333 vmovdqu %ymm2,32(%rdi)
1334 vmovdqu %ymm3,64(%rdi)
1335 vmovdqu %ymm7,96(%rdi)
1336 leaq 128(%rdi),%rdi
1337
1338 vpxor 0(%rsi),%ymm11,%ymm11
1339 vpxor 32(%rsi),%ymm9,%ymm9
1340 vpxor 64(%rsi),%ymm0,%ymm0
1341 vpxor 96(%rsi),%ymm4,%ymm4
1342 leaq 128(%rsi),%rsi
1343 vmovdqu %ymm11,0(%rdi)
1344 vmovdqu %ymm9,32(%rdi)
1345 vmovdqu %ymm0,64(%rdi)
1346 vmovdqu %ymm4,96(%rdi)
1347 leaq 128(%rdi),%rdi
1348
1349 subq $512,%rdx
1350 jnz L$oop_outer8x
1351
1352 jmp L$done8x
1353
1354 L$tail8x:
1355 cmpq $448,%rdx
1356 jae L$448_or_more8x
1357 cmpq $384,%rdx
1358 jae L$384_or_more8x
1359 cmpq $320,%rdx
1360 jae L$320_or_more8x
1361 cmpq $256,%rdx
1362 jae L$256_or_more8x
1363 cmpq $192,%rdx
1364 jae L$192_or_more8x
1365 cmpq $128,%rdx
1366 jae L$128_or_more8x
1367 cmpq $64,%rdx
1368 jae L$64_or_more8x
1369
1370 xorq %r10,%r10
1371 vmovdqa %ymm6,0(%rsp)
1372 vmovdqa %ymm8,32(%rsp)
1373 jmp L$oop_tail8x
1374
1375 .p2align 5
1376 L$64_or_more8x:
1377 vpxor 0(%rsi),%ymm6,%ymm6
1378 vpxor 32(%rsi),%ymm8,%ymm8
1379 vmovdqu %ymm6,0(%rdi)
1380 vmovdqu %ymm8,32(%rdi)
1381 je L$done8x
1382
1383 leaq 64(%rsi),%rsi
1384 xorq %r10,%r10
1385 vmovdqa %ymm1,0(%rsp)
1386 leaq 64(%rdi),%rdi
1387 subq $64,%rdx
1388 vmovdqa %ymm5,32(%rsp)
1389 jmp L$oop_tail8x
1390
1391 .p2align 5
1392 L$128_or_more8x:
1393 vpxor 0(%rsi),%ymm6,%ymm6
1394 vpxor 32(%rsi),%ymm8,%ymm8
1395 vpxor 64(%rsi),%ymm1,%ymm1
1396 vpxor 96(%rsi),%ymm5,%ymm5
1397 vmovdqu %ymm6,0(%rdi)
1398 vmovdqu %ymm8,32(%rdi)
1399 vmovdqu %ymm1,64(%rdi)
1400 vmovdqu %ymm5,96(%rdi)
1401 je L$done8x
1402
1403 leaq 128(%rsi),%rsi
1404 xorq %r10,%r10
1405 vmovdqa %ymm12,0(%rsp)
1406 leaq 128(%rdi),%rdi
1407 subq $128,%rdx
1408 vmovdqa %ymm13,32(%rsp)
1409 jmp L$oop_tail8x
1410
1411 .p2align 5
1412 L$192_or_more8x:
1413 vpxor 0(%rsi),%ymm6,%ymm6
1414 vpxor 32(%rsi),%ymm8,%ymm8
1415 vpxor 64(%rsi),%ymm1,%ymm1
1416 vpxor 96(%rsi),%ymm5,%ymm5
1417 vpxor 128(%rsi),%ymm12,%ymm12
1418 vpxor 160(%rsi),%ymm13,%ymm13
1419 vmovdqu %ymm6,0(%rdi)
1420 vmovdqu %ymm8,32(%rdi)
1421 vmovdqu %ymm1,64(%rdi)
1422 vmovdqu %ymm5,96(%rdi)
1423 vmovdqu %ymm12,128(%rdi)
1424 vmovdqu %ymm13,160(%rdi)
1425 je L$done8x
1426
1427 leaq 192(%rsi),%rsi
1428 xorq %r10,%r10
1429 vmovdqa %ymm10,0(%rsp)
1430 leaq 192(%rdi),%rdi
1431 subq $192,%rdx
1432 vmovdqa %ymm15,32(%rsp)
1433 jmp L$oop_tail8x
1434
1435 .p2align 5
1436 L$256_or_more8x:
1437 vpxor 0(%rsi),%ymm6,%ymm6
1438 vpxor 32(%rsi),%ymm8,%ymm8
1439 vpxor 64(%rsi),%ymm1,%ymm1
1440 vpxor 96(%rsi),%ymm5,%ymm5
1441 vpxor 128(%rsi),%ymm12,%ymm12
1442 vpxor 160(%rsi),%ymm13,%ymm13
1443 vpxor 192(%rsi),%ymm10,%ymm10
1444 vpxor 224(%rsi),%ymm15,%ymm15
1445 vmovdqu %ymm6,0(%rdi)
1446 vmovdqu %ymm8,32(%rdi)
1447 vmovdqu %ymm1,64(%rdi)
1448 vmovdqu %ymm5,96(%rdi)
1449 vmovdqu %ymm12,128(%rdi)
1450 vmovdqu %ymm13,160(%rdi)
1451 vmovdqu %ymm10,192(%rdi)
1452 vmovdqu %ymm15,224(%rdi)
1453 je L$done8x
1454
1455 leaq 256(%rsi),%rsi
1456 xorq %r10,%r10
1457 vmovdqa %ymm14,0(%rsp)
1458 leaq 256(%rdi),%rdi
1459 subq $256,%rdx
1460 vmovdqa %ymm2,32(%rsp)
1461 jmp L$oop_tail8x
1462
1463 .p2align 5
1464 L$320_or_more8x:
1465 vpxor 0(%rsi),%ymm6,%ymm6
1466 vpxor 32(%rsi),%ymm8,%ymm8
1467 vpxor 64(%rsi),%ymm1,%ymm1
1468 vpxor 96(%rsi),%ymm5,%ymm5
1469 vpxor 128(%rsi),%ymm12,%ymm12
1470 vpxor 160(%rsi),%ymm13,%ymm13
1471 vpxor 192(%rsi),%ymm10,%ymm10
1472 vpxor 224(%rsi),%ymm15,%ymm15
1473 vpxor 256(%rsi),%ymm14,%ymm14
1474 vpxor 288(%rsi),%ymm2,%ymm2
1475 vmovdqu %ymm6,0(%rdi)
1476 vmovdqu %ymm8,32(%rdi)
1477 vmovdqu %ymm1,64(%rdi)
1478 vmovdqu %ymm5,96(%rdi)
1479 vmovdqu %ymm12,128(%rdi)
1480 vmovdqu %ymm13,160(%rdi)
1481 vmovdqu %ymm10,192(%rdi)
1482 vmovdqu %ymm15,224(%rdi)
1483 vmovdqu %ymm14,256(%rdi)
1484 vmovdqu %ymm2,288(%rdi)
1485 je L$done8x
1486
1487 leaq 320(%rsi),%rsi
1488 xorq %r10,%r10
1489 vmovdqa %ymm3,0(%rsp)
1490 leaq 320(%rdi),%rdi
1491 subq $320,%rdx
1492 vmovdqa %ymm7,32(%rsp)
1493 jmp L$oop_tail8x
1494
1495 .p2align 5
1496 L$384_or_more8x:
1497 vpxor 0(%rsi),%ymm6,%ymm6
1498 vpxor 32(%rsi),%ymm8,%ymm8
1499 vpxor 64(%rsi),%ymm1,%ymm1
1500 vpxor 96(%rsi),%ymm5,%ymm5
1501 vpxor 128(%rsi),%ymm12,%ymm12
1502 vpxor 160(%rsi),%ymm13,%ymm13
1503 vpxor 192(%rsi),%ymm10,%ymm10
1504 vpxor 224(%rsi),%ymm15,%ymm15
1505 vpxor 256(%rsi),%ymm14,%ymm14
1506 vpxor 288(%rsi),%ymm2,%ymm2
1507 vpxor 320(%rsi),%ymm3,%ymm3
1508 vpxor 352(%rsi),%ymm7,%ymm7
1509 vmovdqu %ymm6,0(%rdi)
1510 vmovdqu %ymm8,32(%rdi)
1511 vmovdqu %ymm1,64(%rdi)
1512 vmovdqu %ymm5,96(%rdi)
1513 vmovdqu %ymm12,128(%rdi)
1514 vmovdqu %ymm13,160(%rdi)
1515 vmovdqu %ymm10,192(%rdi)
1516 vmovdqu %ymm15,224(%rdi)
1517 vmovdqu %ymm14,256(%rdi)
1518 vmovdqu %ymm2,288(%rdi)
1519 vmovdqu %ymm3,320(%rdi)
1520 vmovdqu %ymm7,352(%rdi)
1521 je L$done8x
1522
1523 leaq 384(%rsi),%rsi
1524 xorq %r10,%r10
1525 vmovdqa %ymm11,0(%rsp)
1526 leaq 384(%rdi),%rdi
1527 subq $384,%rdx
1528 vmovdqa %ymm9,32(%rsp)
1529 jmp L$oop_tail8x
1530
1531 .p2align 5
1532 L$448_or_more8x:
1533 vpxor 0(%rsi),%ymm6,%ymm6
1534 vpxor 32(%rsi),%ymm8,%ymm8
1535 vpxor 64(%rsi),%ymm1,%ymm1
1536 vpxor 96(%rsi),%ymm5,%ymm5
1537 vpxor 128(%rsi),%ymm12,%ymm12
1538 vpxor 160(%rsi),%ymm13,%ymm13
1539 vpxor 192(%rsi),%ymm10,%ymm10
1540 vpxor 224(%rsi),%ymm15,%ymm15
1541 vpxor 256(%rsi),%ymm14,%ymm14
1542 vpxor 288(%rsi),%ymm2,%ymm2
1543 vpxor 320(%rsi),%ymm3,%ymm3
1544 vpxor 352(%rsi),%ymm7,%ymm7
1545 vpxor 384(%rsi),%ymm11,%ymm11
1546 vpxor 416(%rsi),%ymm9,%ymm9
1547 vmovdqu %ymm6,0(%rdi)
1548 vmovdqu %ymm8,32(%rdi)
1549 vmovdqu %ymm1,64(%rdi)
1550 vmovdqu %ymm5,96(%rdi)
1551 vmovdqu %ymm12,128(%rdi)
1552 vmovdqu %ymm13,160(%rdi)
1553 vmovdqu %ymm10,192(%rdi)
1554 vmovdqu %ymm15,224(%rdi)
1555 vmovdqu %ymm14,256(%rdi)
1556 vmovdqu %ymm2,288(%rdi)
1557 vmovdqu %ymm3,320(%rdi)
1558 vmovdqu %ymm7,352(%rdi)
1559 vmovdqu %ymm11,384(%rdi)
1560 vmovdqu %ymm9,416(%rdi)
1561 je L$done8x
1562
1563 leaq 448(%rsi),%rsi
1564 xorq %r10,%r10
1565 vmovdqa %ymm0,0(%rsp)
1566 leaq 448(%rdi),%rdi
1567 subq $448,%rdx
1568 vmovdqa %ymm4,32(%rsp)
1569
1570 L$oop_tail8x:
1571 movzbl (%rsi,%r10,1),%eax
1572 movzbl (%rsp,%r10,1),%ecx
1573 leaq 1(%r10),%r10
1574 xorl %ecx,%eax
1575 movb %al,-1(%rdi,%r10,1)
1576 decq %rdx
1577 jnz L$oop_tail8x
1578
1579 L$done8x:
1580 vzeroall
1581 movq 640(%rsp),%rsp
1582 .byte 0xf3,0xc3
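
Editor's note (not part of the generated file): the AVX2 path above widens the same transposed scheme to eight blocks per iteration. vbroadcasti128 replicates the constant, key and counter/nonce rows across both 128-bit lanes, the counters are laid out per L$incy (0,2,4,6,1,3,5,7) so the vperm2i128 $0x20/$0x31 recombination after the rounds yields byte-contiguous output blocks, L$eight advances all eight counters per outer loop, and vzeroall clears the ymm registers before returning. A sketch of the counter layout (AVX2 intrinsics, not part of this patch):

    /* Sketch only: mirrors the broadcast plus "vpaddd L$incy(%rip),%ymm4,%ymm4" setup. */
    #include <immintrin.h>
    #include <stdint.h>

    static __m256i spread_counters_8x(uint32_t counter) {
      __m256i c    = _mm256_set1_epi32((int)counter);
      __m256i incy = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);   /* L$incy */
      return _mm256_add_epi32(c, incy);   /* low lanes: n,n+2,n+4,n+6; high lanes: n+1,n+3,n+5,n+7 */
    }
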
1583
1584 #endif