Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(324)

Side by Side Diff: third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S

Issue 2219933002: Land BoringSSL roll on master (Closed) Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
/* Everything below is assembled only for x86-64 targets. */
1 #if defined(__x86_64__)
2 .text
3
/* Capability words defined in another object; the entry points test bits in them to choose a code path. */
4 .extern OPENSSL_ia32cap_P
5 .hidden OPENSSL_ia32cap_P
6
/* Constant pool. The 64-byte alignment keeps every 16-byte entry aligned for movdqa loads. */
7 .align 64
/* All-zero vector (not referenced in the part of the file visible here). */
8 .Lzero:
9 .long 0,0,0,0
/* {1,0,0,0}: added to the counter block to step its low dword by one per 64-byte block. */
10 .Lone:
11 .long 1,0,0,0
/* Per-lane counter offsets for the four-block path. */
12 .Linc:
13 .long 0,1,2,3
/* Per-lane counter step between outer iterations of the four-block path. */
14 .Lfour:
15 .long 4,4,4,4
/* Per-lane counter offsets added once in the eight-block path setup. */
16 .Lincy:
17 .long 0,2,4,6,1,3,5,7
/* Per-lane counter step between outer iterations of the eight-block path. */
18 .Leight:
19 .long 8,8,8,8,8,8,8,8
/* pshufb mask: within each dword take bytes 2,3,0,1, i.e. rotate each dword by 16 bits. */
20 .Lrot16:
21 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
/* pshufb mask: within each dword take bytes 3,0,1,2, i.e. rotate each dword left by 8 bits (right by 24). */
22 .Lrot24:
23 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
/* ASCII "expand 32-byte k" plus a NUL: the first 16 bytes are loaded as state words 0..3. */
24 .Lsigma:
25 .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
/* ASCII identification string "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated. */
26 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67, 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110 ,115,115,108,46,111,114,103,62,0
/*
 * ChaCha20_ctr32 -- entry point and scalar (general-purpose register) path.
 *
 * Register use as read from the code below:
 *   %rdi = destination buffer          %rsi = source buffer
 *   %rdx = byte count (0 returns immediately)
 *   %rcx = 32 bytes loaded as state words 4..11 (presumably the key -- confirm with the C prototype)
 *   %r8  = 16 bytes loaded as state words 12..15; the low dword is stepped by 1 per 64-byte block
 * Output is source XOR keystream. No value is returned in a register.
 *
 * Stack frame (88 bytes below the six saved registers):
 *    0..15  scratch (keystream words 0..3 in the tail path)
 *   16..31  state words 4..7       32..47  state words 8..11
 *   48..63  state words 12..15     64/72/80  saved length / source / destination
 */
27 .globl ChaCha20_ctr32
28 .hidden ChaCha20_ctr32
29 .type ChaCha20_ctr32,@function
30 .align 64
31 ChaCha20_ctr32:
/* Zero-length request: nothing to do. */
32 cmpq $0,%rdx
33 je .Lno_data
/* Load 8 bytes of capability bits; if bit 9 (value 512) of the first dword is set, take the SSSE3 path. */
34 movq OPENSSL_ia32cap_P+4(%rip),%r10
35 testl $512,%r10d
36 jnz .LChaCha20_ssse3
37
/* Save the registers restored at .Ldone and reserve the 88-byte frame. */
38 pushq %rbx
39 pushq %rbp
40 pushq %r12
41 pushq %r13
42 pushq %r14
43 pushq %r15
44 subq $64+24,%rsp
45
46
/* xmm1:xmm2 = the 32 bytes at (%rcx), xmm3 = the 16 bytes at (%r8), xmm4 = {1,0,0,0}. */
47 movdqu (%rcx),%xmm1
48 movdqu 16(%rcx),%xmm2
49 movdqu (%r8),%xmm3
50 movdqa .Lone(%rip),%xmm4
51
52
/* Keep the initial state words 4..15 on the stack; %rbp carries the remaining byte count. */
53 movdqa %xmm1,16(%rsp)
54 movdqa %xmm2,32(%rsp)
55 movdqa %xmm3,48(%rsp)
56 movq %rdx,%rbp
57 jmp .Loop_outer
58
59 .align 32
/* One pass per 64-byte block. */
60 .Loop_outer:
/* State words 0..3: the four little-endian dwords of "expand 32-byte k". */
61 movl $0x61707865,%eax
62 movl $0x3320646e,%ebx
63 movl $0x79622d32,%ecx
64 movl $0x6b206574,%edx
/* State words 4..7 into r8d..r11d, words 12..15 into r12d..r15d (word 12 comes from xmm3). */
65 movl 16(%rsp),%r8d
66 movl 20(%rsp),%r9d
67 movl 24(%rsp),%r10d
68 movl 28(%rsp),%r11d
69 movd %xmm3,%r12d
70 movl 52(%rsp),%r13d
71 movl 56(%rsp),%r14d
72 movl 60(%rsp),%r15d
73
/* Spill length, source and destination so %rbp, %rsi and %rdi can be reused inside the round loop. */
74 movq %rbp,64+0(%rsp)
/* 10 loop iterations; each iteration below performs eight quarter-rounds. */
75 movl $10,%ebp
76 movq %rsi,64+8(%rsp)
/* Hand-encoded "movq %xmm2,%rsi": %rsi = state words 8 and 9 packed in one 64-bit value. */
77 .byte 102,72,15,126,214
78 movq %rdi,64+16(%rsp)
/* Split the pair: %esi = word 8, %edi = word 9. Words 10 and 11 stay at 40/44(%rsp). */
79 movq %rsi,%rdi
80 shrq $32,%rdi
81 jmp .Loop
82
83 .align 32
84 .Loop:
/* Quarter-rounds on words (0,4,8,12) and (1,5,9,13), interleaved; rotations are 16, 12, 8, 7. */
85 addl %r8d,%eax
86 xorl %eax,%r12d
87 roll $16,%r12d
88 addl %r9d,%ebx
89 xorl %ebx,%r13d
90 roll $16,%r13d
91 addl %r12d,%esi
92 xorl %esi,%r8d
93 roll $12,%r8d
94 addl %r13d,%edi
95 xorl %edi,%r9d
96 roll $12,%r9d
97 addl %r8d,%eax
98 xorl %eax,%r12d
99 roll $8,%r12d
100 addl %r9d,%ebx
101 xorl %ebx,%r13d
102 roll $8,%r13d
103 addl %r12d,%esi
104 xorl %esi,%r8d
105 roll $7,%r8d
106 addl %r13d,%edi
107 xorl %edi,%r9d
108 roll $7,%r9d
/* Swap the %esi/%edi pair: park words 8,9 on the stack and bring in words 10,11. */
109 movl %esi,32(%rsp)
110 movl %edi,36(%rsp)
111 movl 40(%rsp),%esi
112 movl 44(%rsp),%edi
/* Quarter-rounds on words (2,6,10,14) and (3,7,11,15). */
113 addl %r10d,%ecx
114 xorl %ecx,%r14d
115 roll $16,%r14d
116 addl %r11d,%edx
117 xorl %edx,%r15d
118 roll $16,%r15d
119 addl %r14d,%esi
120 xorl %esi,%r10d
121 roll $12,%r10d
122 addl %r15d,%edi
123 xorl %edi,%r11d
124 roll $12,%r11d
125 addl %r10d,%ecx
126 xorl %ecx,%r14d
127 roll $8,%r14d
128 addl %r11d,%edx
129 xorl %edx,%r15d
130 roll $8,%r15d
131 addl %r14d,%esi
132 xorl %esi,%r10d
133 roll $7,%r10d
134 addl %r15d,%edi
135 xorl %edi,%r11d
136 roll $7,%r11d
/* Quarter-rounds on words (0,5,10,15) and (1,6,11,12); %esi/%edi still hold words 10,11. */
137 addl %r9d,%eax
138 xorl %eax,%r15d
139 roll $16,%r15d
140 addl %r10d,%ebx
141 xorl %ebx,%r12d
142 roll $16,%r12d
143 addl %r15d,%esi
144 xorl %esi,%r9d
145 roll $12,%r9d
146 addl %r12d,%edi
147 xorl %edi,%r10d
148 roll $12,%r10d
149 addl %r9d,%eax
150 xorl %eax,%r15d
151 roll $8,%r15d
152 addl %r10d,%ebx
153 xorl %ebx,%r12d
154 roll $8,%r12d
155 addl %r15d,%esi
156 xorl %esi,%r9d
157 roll $7,%r9d
158 addl %r12d,%edi
159 xorl %edi,%r10d
160 roll $7,%r10d
/* Swap back: park words 10,11 and reload words 8,9. */
161 movl %esi,40(%rsp)
162 movl %edi,44(%rsp)
163 movl 32(%rsp),%esi
164 movl 36(%rsp),%edi
/* Quarter-rounds on words (2,7,8,13) and (3,4,9,14). */
165 addl %r11d,%ecx
166 xorl %ecx,%r13d
167 roll $16,%r13d
168 addl %r8d,%edx
169 xorl %edx,%r14d
170 roll $16,%r14d
171 addl %r13d,%esi
172 xorl %esi,%r11d
173 roll $12,%r11d
174 addl %r14d,%edi
175 xorl %edi,%r8d
176 roll $12,%r8d
177 addl %r11d,%ecx
178 xorl %ecx,%r13d
179 roll $8,%r13d
180 addl %r8d,%edx
181 xorl %edx,%r14d
182 roll $8,%r14d
183 addl %r13d,%esi
184 xorl %esi,%r11d
185 roll $7,%r11d
186 addl %r14d,%edi
187 xorl %edi,%r8d
188 roll $7,%r8d
189 decl %ebp
190 jnz .Loop
/* Finish the worked words 8..11 at 32..47(%rsp) by storing words 9 and 8 (10,11 were stored in the loop). */
191 movl %edi,36(%rsp)
192 movl %esi,32(%rsp)
/* Restore length/source/destination; xmm1 = initial words 8..11; advance the block counter in xmm3. */
193 movq 64(%rsp),%rbp
194 movdqa %xmm2,%xmm1
195 movq 64+8(%rsp),%rsi
196 paddd %xmm4,%xmm3
197 movq 64+16(%rsp),%rdi
198
/* Feed-forward: add the initial state to the worked state. 48(%rsp) still holds this block's counter. */
199 addl $0x61707865,%eax
200 addl $0x3320646e,%ebx
201 addl $0x79622d32,%ecx
202 addl $0x6b206574,%edx
203 addl 16(%rsp),%r8d
204 addl 20(%rsp),%r9d
205 addl 24(%rsp),%r10d
206 addl 28(%rsp),%r11d
207 addl 48(%rsp),%r12d
208 addl 52(%rsp),%r13d
209 addl 56(%rsp),%r14d
210 addl 60(%rsp),%r15d
/* xmm1 = initial words 8..11 + worked words 8..11 = keystream words 8..11. */
211 paddd 32(%rsp),%xmm1
212
/* Fewer than 64 bytes left: finish byte by byte in .Ltail. */
213 cmpq $64,%rbp
214 jb .Ltail
215
/* Full block: XOR the keystream with 64 source bytes. */
216 xorl 0(%rsi),%eax
217 xorl 4(%rsi),%ebx
218 xorl 8(%rsi),%ecx
219 xorl 12(%rsi),%edx
220 xorl 16(%rsi),%r8d
221 xorl 20(%rsi),%r9d
222 xorl 24(%rsi),%r10d
223 xorl 28(%rsi),%r11d
224 movdqu 32(%rsi),%xmm0
225 xorl 48(%rsi),%r12d
226 xorl 52(%rsi),%r13d
227 xorl 56(%rsi),%r14d
228 xorl 60(%rsi),%r15d
229 leaq 64(%rsi),%rsi
230 pxor %xmm1,%xmm0
231
/* Re-arm the stack copy for the next block: initial words 8..11 and the incremented counter word. */
232 movdqa %xmm2,32(%rsp)
233 movd %xmm3,48(%rsp)
234
/* Write the 64 result bytes. */
235 movl %eax,0(%rdi)
236 movl %ebx,4(%rdi)
237 movl %ecx,8(%rdi)
238 movl %edx,12(%rdi)
239 movl %r8d,16(%rdi)
240 movl %r9d,20(%rdi)
241 movl %r10d,24(%rdi)
242 movl %r11d,28(%rdi)
243 movdqu %xmm0,32(%rdi)
244 movl %r12d,48(%rdi)
245 movl %r13d,52(%rdi)
246 movl %r14d,56(%rdi)
247 movl %r15d,60(%rdi)
248 leaq 64(%rdi),%rdi
249
/* Loop while bytes remain. */
250 subq $64,%rbp
251 jnz .Loop_outer
252
253 jmp .Ldone
254
255 .align 16
/* Partial final block: spill the whole 64-byte keystream block to 0..63(%rsp); %rbx becomes the byte index. */
256 .Ltail:
257 movl %eax,0(%rsp)
258 movl %ebx,4(%rsp)
259 xorq %rbx,%rbx
260 movl %ecx,8(%rsp)
261 movl %edx,12(%rsp)
262 movl %r8d,16(%rsp)
263 movl %r9d,20(%rsp)
264 movl %r10d,24(%rsp)
265 movl %r11d,28(%rsp)
266 movdqa %xmm1,32(%rsp)
267 movl %r12d,48(%rsp)
268 movl %r13d,52(%rsp)
269 movl %r14d,56(%rsp)
270 movl %r15d,60(%rsp)
271
/* dst[i] = src[i] ^ keystream[i] for the remaining %rbp bytes (index is already incremented at the store, hence -1). */
272 .Loop_tail:
273 movzbl (%rsi,%rbx,1),%eax
274 movzbl (%rsp,%rbx,1),%edx
275 leaq 1(%rbx),%rbx
276 xorl %edx,%eax
277 movb %al,-1(%rdi,%rbx,1)
278 decq %rbp
279 jnz .Loop_tail
280
/* Release the frame and restore the saved registers. */
281 .Ldone:
282 addq $64+24,%rsp
283 popq %r15
284 popq %r14
285 popq %r13
286 popq %r12
287 popq %rbp
288 popq %rbx
289 .Lno_data:
/* Hand-encoded "rep ret". */
290 .byte 0xf3,0xc3
291 .size ChaCha20_ctr32,.-ChaCha20_ctr32
/*
 * ChaCha20_ssse3 -- one 64-byte block per iteration, with the 4x4 state held as four rows in xmm0..xmm3.
 * Reached by a jump from ChaCha20_ctr32 with the same registers:
 *   %rdi = destination, %rsi = source, %rdx = byte count (non-zero), %rcx = 32 key bytes, %r8 = 16 counter bytes.
 * Requests longer than 128 bytes are passed on to the four-block code.
 * Stack frame: 0..63(%rsp) holds the initial state rows (later the keystream for a partial last block).
 */
292 .type ChaCha20_ssse3,@function
293 .align 32
294 ChaCha20_ssse3:
295 .LChaCha20_ssse3:
296 cmpq $128,%rdx
297 ja .LChaCha20_4x
298
/* Also entered from ChaCha20_4x when it decides to fall back to this single-block code. */
299 .Ldo_sse3_after_all:
300 pushq %rbx
301 pushq %rbp
302 pushq %r12
303 pushq %r13
304 pushq %r14
305 pushq %r15
306
307 subq $64+24,%rsp
/* Rows: xmm0 = constants, xmm1/xmm2 = the 32 bytes at (%rcx), xmm3 = the 16 bytes at (%r8). */
308 movdqa .Lsigma(%rip),%xmm0
309 movdqu (%rcx),%xmm1
310 movdqu 16(%rcx),%xmm2
311 movdqu (%r8),%xmm3
/* xmm6 / xmm7 = byte-shuffle masks used below for the 16-bit and 8-bit rotations. */
312 movdqa .Lrot16(%rip),%xmm6
313 movdqa .Lrot24(%rip),%xmm7
314
/* Keep the initial rows on the stack for the feed-forward and for later blocks. */
315 movdqa %xmm0,0(%rsp)
316 movdqa %xmm1,16(%rsp)
317 movdqa %xmm2,32(%rsp)
318 movdqa %xmm3,48(%rsp)
/* 10 loop iterations per block. */
319 movl $10,%ebp
320 jmp .Loop_ssse3
321
322 .align 32
/* Next block: reload rows 0..2 and step the low dword of row 3 by one, storing it back. */
323 .Loop_outer_ssse3:
324 movdqa .Lone(%rip),%xmm3
325 movdqa 0(%rsp),%xmm0
326 movdqa 16(%rsp),%xmm1
327 movdqa 32(%rsp),%xmm2
328 paddd 48(%rsp),%xmm3
329 movl $10,%ebp
330 movdqa %xmm3,48(%rsp)
331 jmp .Loop_ssse3
332
333 .align 32
334 .Loop_ssse3:
/* First half: four quarter-rounds at once on the rows (a=xmm0, b=xmm1, c=xmm2, d=xmm3). */
335 paddd %xmm1,%xmm0
336 pxor %xmm0,%xmm3
/* Hand-encoded "pshufb %xmm6,%xmm3": rotate each dword of d by 16 bits. */
337 .byte 102,15,56,0,222
338 paddd %xmm3,%xmm2
339 pxor %xmm2,%xmm1
/* Rotate each dword of b left by 12 using a shift pair; xmm4 is the temporary. */
340 movdqa %xmm1,%xmm4
341 psrld $20,%xmm1
342 pslld $12,%xmm4
343 por %xmm4,%xmm1
344 paddd %xmm1,%xmm0
345 pxor %xmm0,%xmm3
/* Hand-encoded "pshufb %xmm7,%xmm3": rotate each dword of d left by 8 bits. */
346 .byte 102,15,56,0,223
347 paddd %xmm3,%xmm2
348 pxor %xmm2,%xmm1
/* Rotate each dword of b left by 7. */
349 movdqa %xmm1,%xmm4
350 psrld $25,%xmm1
351 pslld $7,%xmm4
352 por %xmm4,%xmm1
/* Rotate the lanes of rows c, b and d by 2, 1 and 3 positions so the same code next works on the diagonals. */
353 pshufd $78,%xmm2,%xmm2
354 pshufd $57,%xmm1,%xmm1
355 pshufd $147,%xmm3,%xmm3
356 nop
/* Second half: the same quarter-round sequence on the re-arranged rows. */
357 paddd %xmm1,%xmm0
358 pxor %xmm0,%xmm3
/* Hand-encoded "pshufb %xmm6,%xmm3". */
359 .byte 102,15,56,0,222
360 paddd %xmm3,%xmm2
361 pxor %xmm2,%xmm1
362 movdqa %xmm1,%xmm4
363 psrld $20,%xmm1
364 pslld $12,%xmm4
365 por %xmm4,%xmm1
366 paddd %xmm1,%xmm0
367 pxor %xmm0,%xmm3
/* Hand-encoded "pshufb %xmm7,%xmm3". */
368 .byte 102,15,56,0,223
369 paddd %xmm3,%xmm2
370 pxor %xmm2,%xmm1
371 movdqa %xmm1,%xmm4
372 psrld $25,%xmm1
373 pslld $7,%xmm4
374 por %xmm4,%xmm1
/* Undo the lane rotation (b and d use the opposite shuffles from above). */
375 pshufd $78,%xmm2,%xmm2
376 pshufd $147,%xmm1,%xmm1
377 pshufd $57,%xmm3,%xmm3
378 decl %ebp
379 jnz .Loop_ssse3
/* Feed-forward: add the initial rows; xmm0..xmm3 now hold 64 keystream bytes. */
380 paddd 0(%rsp),%xmm0
381 paddd 16(%rsp),%xmm1
382 paddd 32(%rsp),%xmm2
383 paddd 48(%rsp),%xmm3
384
/* Fewer than 64 bytes left: handle them byte by byte. */
385 cmpq $64,%rdx
386 jb .Ltail_ssse3
387
/* Full block: XOR 64 source bytes with the keystream and store them. */
388 movdqu 0(%rsi),%xmm4
389 movdqu 16(%rsi),%xmm5
390 pxor %xmm4,%xmm0
391 movdqu 32(%rsi),%xmm4
392 pxor %xmm5,%xmm1
393 movdqu 48(%rsi),%xmm5
394 leaq 64(%rsi),%rsi
395 pxor %xmm4,%xmm2
396 pxor %xmm5,%xmm3
397
398 movdqu %xmm0,0(%rdi)
399 movdqu %xmm1,16(%rdi)
400 movdqu %xmm2,32(%rdi)
401 movdqu %xmm3,48(%rdi)
402 leaq 64(%rdi),%rdi
403
/* Loop while bytes remain. */
404 subq $64,%rdx
405 jnz .Loop_outer_ssse3
406
407 jmp .Ldone_ssse3
408
409 .align 16
/* Partial final block: spill the keystream to the stack; %rbx becomes the byte index. */
410 .Ltail_ssse3:
411 movdqa %xmm0,0(%rsp)
412 movdqa %xmm1,16(%rsp)
413 movdqa %xmm2,32(%rsp)
414 movdqa %xmm3,48(%rsp)
415 xorq %rbx,%rbx
416
/* dst[i] = src[i] ^ keystream[i] for the remaining %rdx bytes. */
417 .Loop_tail_ssse3:
418 movzbl (%rsi,%rbx,1),%eax
419 movzbl (%rsp,%rbx,1),%ecx
420 leaq 1(%rbx),%rbx
421 xorl %ecx,%eax
422 movb %al,-1(%rdi,%rbx,1)
423 decq %rdx
424 jnz .Loop_tail_ssse3
425
/* Release the frame and restore the saved registers. */
426 .Ldone_ssse3:
427 addq $64+24,%rsp
428 popq %r15
429 popq %r14
430 popq %r13
431 popq %r12
432 popq %rbp
433 popq %rbx
/* Hand-encoded "rep ret". */
434 .byte 0xf3,0xc3
435 .size ChaCha20_ssse3,.-ChaCha20_ssse3
/*
 * ChaCha20_4x -- four 64-byte blocks per outer iteration using SSE registers.
 * Each xmm register holds one state word for four blocks (lane i belongs to block i).
 * Reached by a jump from ChaCha20_ssse3 with:
 *   %rdi = destination, %rsi = source, %rdx = byte count (> 128), %rcx = 32 key bytes, %r8 = 16 counter bytes,
 *   %r10 = the 8 capability bytes loaded in ChaCha20_ctr32.
 * Only %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11 and xmm registers are written, so nothing is pushed.
 *
 * Stack frame (0x148 bytes):
 *     0..63   scratch: spilled lanes of words 8..11 during rounds, then keystream rows
 *    64..127  broadcast words 0..3      128..191  broadcast words 4..7
 *   192..255  broadcast words 8..11     256..319  broadcast words 12..15 (word 12 lanes hold per-block counters)
 */
436 .type ChaCha20_4x,@function
437 .align 32
438 ChaCha20_4x:
439 .LChaCha20_4x:
/* %r11 keeps both capability dwords; %r10 becomes the second one. If its bit 5 is set, use the eight-block code. */
440 movq %r10,%r11
441 shrq $32,%r10
442 testq $32,%r10
443 jnz .LChaCha20_8x
/* More than 192 bytes: always use this four-block code. */
444 cmpq $192,%rdx
445 ja .Lproceed4x
446
/* 129..192 bytes: if bit 22 is set and bit 26 is clear in the first capability dword, use the single-block code instead. */
/* NOTE(review): this looks like a check for one specific CPU family -- confirm which against the generator script. */
447 andq $71303168,%r11
448 cmpq $4194304,%r11
449 je .Ldo_sse3_after_all
450
451 .Lproceed4x:
/* This %r11 value is overwritten by the .Lrot24 address below before it is used. */
452 leaq -120(%rsp),%r11
453 subq $0x148+0,%rsp
/* xmm11 = constants, xmm15/xmm7 = the 32 bytes at (%rcx), xmm3 = the 16 bytes at (%r8). */
454 movdqa .Lsigma(%rip),%xmm11
455 movdqu (%rcx),%xmm15
456 movdqu 16(%rcx),%xmm7
457 movdqu (%r8),%xmm3
/* %rcx = frame base + 256 (frame offsets are written as N-256(%rcx)); %r10/%r11 = addresses of the rotate masks. */
458 leaq 256(%rsp),%rcx
459 leaq .Lrot16(%rip),%r10
460 leaq .Lrot24(%rip),%r11
461
/* Broadcast each of words 0..3 to all four lanes and save the copies. */
462 pshufd $0x00,%xmm11,%xmm8
463 pshufd $0x55,%xmm11,%xmm9
464 movdqa %xmm8,64(%rsp)
465 pshufd $0xaa,%xmm11,%xmm10
466 movdqa %xmm9,80(%rsp)
467 pshufd $0xff,%xmm11,%xmm11
468 movdqa %xmm10,96(%rsp)
469 movdqa %xmm11,112(%rsp)
470
/* Same for words 4..7. */
471 pshufd $0x00,%xmm15,%xmm12
472 pshufd $0x55,%xmm15,%xmm13
473 movdqa %xmm12,128-256(%rcx)
474 pshufd $0xaa,%xmm15,%xmm14
475 movdqa %xmm13,144-256(%rcx)
476 pshufd $0xff,%xmm15,%xmm15
477 movdqa %xmm14,160-256(%rcx)
478 movdqa %xmm15,176-256(%rcx)
479
/* Same for words 8..11. */
480 pshufd $0x00,%xmm7,%xmm4
481 pshufd $0x55,%xmm7,%xmm5
482 movdqa %xmm4,192-256(%rcx)
483 pshufd $0xaa,%xmm7,%xmm6
484 movdqa %xmm5,208-256(%rcx)
485 pshufd $0xff,%xmm7,%xmm7
486 movdqa %xmm6,224-256(%rcx)
487 movdqa %xmm7,240-256(%rcx)
488
/* Words 12..15; word 12 gets +0,+1,+2,+3 per lane so each block has its own counter (stored at .Loop_enter4x). */
489 pshufd $0x00,%xmm3,%xmm0
490 pshufd $0x55,%xmm3,%xmm1
491 paddd .Linc(%rip),%xmm0
492 pshufd $0xaa,%xmm3,%xmm2
493 movdqa %xmm1,272-256(%rcx)
494 pshufd $0xff,%xmm3,%xmm3
495 movdqa %xmm2,288-256(%rcx)
496 movdqa %xmm3,304-256(%rcx)
497
498 jmp .Loop_enter4x
499
500 .align 32
/* Next group of four blocks: reload the broadcast state and advance every lane's counter by 4. */
501 .Loop_outer4x:
502 movdqa 64(%rsp),%xmm8
503 movdqa 80(%rsp),%xmm9
504 movdqa 96(%rsp),%xmm10
505 movdqa 112(%rsp),%xmm11
506 movdqa 128-256(%rcx),%xmm12
507 movdqa 144-256(%rcx),%xmm13
508 movdqa 160-256(%rcx),%xmm14
509 movdqa 176-256(%rcx),%xmm15
510 movdqa 192-256(%rcx),%xmm4
511 movdqa 208-256(%rcx),%xmm5
512 movdqa 224-256(%rcx),%xmm6
513 movdqa 240-256(%rcx),%xmm7
514 movdqa 256-256(%rcx),%xmm0
515 movdqa 272-256(%rcx),%xmm1
516 movdqa 288-256(%rcx),%xmm2
517 movdqa 304-256(%rcx),%xmm3
518 paddd .Lfour(%rip),%xmm0
519
520 .Loop_enter4x:
/* Spill words 10 and 11 so xmm6/xmm7 are free as temporaries and mask registers; xmm7 = rot16 mask. */
521 movdqa %xmm6,32(%rsp)
522 movdqa %xmm7,48(%rsp)
523 movdqa (%r10),%xmm7
/* 10 loop iterations; remember this group's counters for the feed-forward. */
524 movl $10,%eax
525 movdqa %xmm0,256-256(%rcx)
526 jmp .Loop4x
527
528 .align 32
/* Word-to-register map inside the loop: 0..3 = xmm8..xmm11, 4..7 = xmm12..xmm15, 12..15 = xmm0..xmm3; */
/* xmm4/xmm5 hold words 8,9 or 10,11, with the other pair parked at 0/16(%rsp) or 32/48(%rsp). */
529 .Loop4x:
/* Quarter-rounds on words (0,4,8,12) and (1,5,9,13). */
530 paddd %xmm12,%xmm8
531 paddd %xmm13,%xmm9
532 pxor %xmm8,%xmm0
533 pxor %xmm9,%xmm1
/* Hand-encoded "pshufb %xmm7,%xmm0" and "pshufb %xmm7,%xmm1" (16-bit rotate). */
534 .byte 102,15,56,0,199
535 .byte 102,15,56,0,207
536 paddd %xmm0,%xmm4
537 paddd %xmm1,%xmm5
538 pxor %xmm4,%xmm12
539 pxor %xmm5,%xmm13
/* Rotate left by 12 with shift pairs; xmm6 is then reloaded with the rot24 mask. */
540 movdqa %xmm12,%xmm6
541 pslld $12,%xmm12
542 psrld $20,%xmm6
543 movdqa %xmm13,%xmm7
544 pslld $12,%xmm13
545 por %xmm6,%xmm12
546 psrld $20,%xmm7
547 movdqa (%r11),%xmm6
548 por %xmm7,%xmm13
549 paddd %xmm12,%xmm8
550 paddd %xmm13,%xmm9
551 pxor %xmm8,%xmm0
552 pxor %xmm9,%xmm1
/* Hand-encoded "pshufb %xmm6,%xmm0" and "pshufb %xmm6,%xmm1" (8-bit left rotate). */
553 .byte 102,15,56,0,198
554 .byte 102,15,56,0,206
555 paddd %xmm0,%xmm4
556 paddd %xmm1,%xmm5
557 pxor %xmm4,%xmm12
558 pxor %xmm5,%xmm13
/* Rotate left by 7; xmm7 is then reloaded with the rot16 mask. */
559 movdqa %xmm12,%xmm7
560 pslld $7,%xmm12
561 psrld $25,%xmm7
562 movdqa %xmm13,%xmm6
563 pslld $7,%xmm13
564 por %xmm7,%xmm12
565 psrld $25,%xmm6
566 movdqa (%r10),%xmm7
567 por %xmm6,%xmm13
/* Park words 8,9 and bring in words 10,11. */
568 movdqa %xmm4,0(%rsp)
569 movdqa %xmm5,16(%rsp)
570 movdqa 32(%rsp),%xmm4
571 movdqa 48(%rsp),%xmm5
/* Quarter-rounds on words (2,6,10,14) and (3,7,11,15). */
572 paddd %xmm14,%xmm10
573 paddd %xmm15,%xmm11
574 pxor %xmm10,%xmm2
575 pxor %xmm11,%xmm3
/* Hand-encoded "pshufb %xmm7,%xmm2" and "pshufb %xmm7,%xmm3". */
576 .byte 102,15,56,0,215
577 .byte 102,15,56,0,223
578 paddd %xmm2,%xmm4
579 paddd %xmm3,%xmm5
580 pxor %xmm4,%xmm14
581 pxor %xmm5,%xmm15
582 movdqa %xmm14,%xmm6
583 pslld $12,%xmm14
584 psrld $20,%xmm6
585 movdqa %xmm15,%xmm7
586 pslld $12,%xmm15
587 por %xmm6,%xmm14
588 psrld $20,%xmm7
589 movdqa (%r11),%xmm6
590 por %xmm7,%xmm15
591 paddd %xmm14,%xmm10
592 paddd %xmm15,%xmm11
593 pxor %xmm10,%xmm2
594 pxor %xmm11,%xmm3
/* Hand-encoded "pshufb %xmm6,%xmm2" and "pshufb %xmm6,%xmm3". */
595 .byte 102,15,56,0,214
596 .byte 102,15,56,0,222
597 paddd %xmm2,%xmm4
598 paddd %xmm3,%xmm5
599 pxor %xmm4,%xmm14
600 pxor %xmm5,%xmm15
601 movdqa %xmm14,%xmm7
602 pslld $7,%xmm14
603 psrld $25,%xmm7
604 movdqa %xmm15,%xmm6
605 pslld $7,%xmm15
606 por %xmm7,%xmm14
607 psrld $25,%xmm6
608 movdqa (%r10),%xmm7
609 por %xmm6,%xmm15
/* Quarter-rounds on words (0,5,10,15) and (1,6,11,12); xmm4/xmm5 still hold words 10,11. */
610 paddd %xmm13,%xmm8
611 paddd %xmm14,%xmm9
612 pxor %xmm8,%xmm3
613 pxor %xmm9,%xmm0
/* Hand-encoded "pshufb %xmm7,%xmm3" and "pshufb %xmm7,%xmm0". */
614 .byte 102,15,56,0,223
615 .byte 102,15,56,0,199
616 paddd %xmm3,%xmm4
617 paddd %xmm0,%xmm5
618 pxor %xmm4,%xmm13
619 pxor %xmm5,%xmm14
620 movdqa %xmm13,%xmm6
621 pslld $12,%xmm13
622 psrld $20,%xmm6
623 movdqa %xmm14,%xmm7
624 pslld $12,%xmm14
625 por %xmm6,%xmm13
626 psrld $20,%xmm7
627 movdqa (%r11),%xmm6
628 por %xmm7,%xmm14
629 paddd %xmm13,%xmm8
630 paddd %xmm14,%xmm9
631 pxor %xmm8,%xmm3
632 pxor %xmm9,%xmm0
/* Hand-encoded "pshufb %xmm6,%xmm3" and "pshufb %xmm6,%xmm0". */
633 .byte 102,15,56,0,222
634 .byte 102,15,56,0,198
635 paddd %xmm3,%xmm4
636 paddd %xmm0,%xmm5
637 pxor %xmm4,%xmm13
638 pxor %xmm5,%xmm14
639 movdqa %xmm13,%xmm7
640 pslld $7,%xmm13
641 psrld $25,%xmm7
642 movdqa %xmm14,%xmm6
643 pslld $7,%xmm14
644 por %xmm7,%xmm13
645 psrld $25,%xmm6
646 movdqa (%r10),%xmm7
647 por %xmm6,%xmm14
/* Park words 10,11 and bring back words 8,9. */
648 movdqa %xmm4,32(%rsp)
649 movdqa %xmm5,48(%rsp)
650 movdqa 0(%rsp),%xmm4
651 movdqa 16(%rsp),%xmm5
/* Quarter-rounds on words (2,7,8,13) and (3,4,9,14). */
652 paddd %xmm15,%xmm10
653 paddd %xmm12,%xmm11
654 pxor %xmm10,%xmm1
655 pxor %xmm11,%xmm2
/* Hand-encoded "pshufb %xmm7,%xmm1" and "pshufb %xmm7,%xmm2". */
656 .byte 102,15,56,0,207
657 .byte 102,15,56,0,215
658 paddd %xmm1,%xmm4
659 paddd %xmm2,%xmm5
660 pxor %xmm4,%xmm15
661 pxor %xmm5,%xmm12
662 movdqa %xmm15,%xmm6
663 pslld $12,%xmm15
664 psrld $20,%xmm6
665 movdqa %xmm12,%xmm7
666 pslld $12,%xmm12
667 por %xmm6,%xmm15
668 psrld $20,%xmm7
669 movdqa (%r11),%xmm6
670 por %xmm7,%xmm12
671 paddd %xmm15,%xmm10
672 paddd %xmm12,%xmm11
673 pxor %xmm10,%xmm1
674 pxor %xmm11,%xmm2
/* Hand-encoded "pshufb %xmm6,%xmm1" and "pshufb %xmm6,%xmm2". */
675 .byte 102,15,56,0,206
676 .byte 102,15,56,0,214
677 paddd %xmm1,%xmm4
678 paddd %xmm2,%xmm5
679 pxor %xmm4,%xmm15
680 pxor %xmm5,%xmm12
681 movdqa %xmm15,%xmm7
682 pslld $7,%xmm15
683 psrld $25,%xmm7
684 movdqa %xmm12,%xmm6
685 pslld $7,%xmm12
686 por %xmm7,%xmm15
687 psrld $25,%xmm6
688 movdqa (%r10),%xmm7
689 por %xmm6,%xmm12
690 decl %eax
691 jnz .Loop4x
692
/* Feed-forward for words 0..3. */
693 paddd 64(%rsp),%xmm8
694 paddd 80(%rsp),%xmm9
695 paddd 96(%rsp),%xmm10
696 paddd 112(%rsp),%xmm11
697
/* 4x4 dword transpose: afterwards xmm8, xmm9, xmm6, xmm11 = words 0..3 of blocks 0, 1, 2, 3. */
698 movdqa %xmm8,%xmm6
699 punpckldq %xmm9,%xmm8
700 movdqa %xmm10,%xmm7
701 punpckldq %xmm11,%xmm10
702 punpckhdq %xmm9,%xmm6
703 punpckhdq %xmm11,%xmm7
704 movdqa %xmm8,%xmm9
705 punpcklqdq %xmm10,%xmm8
706 movdqa %xmm6,%xmm11
707 punpcklqdq %xmm7,%xmm6
708 punpckhqdq %xmm10,%xmm9
709 punpckhqdq %xmm7,%xmm11
/* Feed-forward for words 4..7. */
710 paddd 128-256(%rcx),%xmm12
711 paddd 144-256(%rcx),%xmm13
712 paddd 160-256(%rcx),%xmm14
713 paddd 176-256(%rcx),%xmm15
714
/* Park the first rows of blocks 0 and 1, then fetch the worked words 10,11 into xmm8/xmm9. */
715 movdqa %xmm8,0(%rsp)
716 movdqa %xmm9,16(%rsp)
717 movdqa 32(%rsp),%xmm8
718 movdqa 48(%rsp),%xmm9
719
/* Transpose words 4..7: xmm12, xmm13, xmm10, xmm15 = second rows of blocks 0, 1, 2, 3. */
720 movdqa %xmm12,%xmm10
721 punpckldq %xmm13,%xmm12
722 movdqa %xmm14,%xmm7
723 punpckldq %xmm15,%xmm14
724 punpckhdq %xmm13,%xmm10
725 punpckhdq %xmm15,%xmm7
726 movdqa %xmm12,%xmm13
727 punpcklqdq %xmm14,%xmm12
728 movdqa %xmm10,%xmm15
729 punpcklqdq %xmm7,%xmm10
730 punpckhqdq %xmm14,%xmm13
731 punpckhqdq %xmm7,%xmm15
/* Feed-forward for words 8..11. */
732 paddd 192-256(%rcx),%xmm4
733 paddd 208-256(%rcx),%xmm5
734 paddd 224-256(%rcx),%xmm8
735 paddd 240-256(%rcx),%xmm9
736
/* Park the first rows of blocks 2 and 3 (safe now that 32/48(%rsp) have been read above). */
737 movdqa %xmm6,32(%rsp)
738 movdqa %xmm11,48(%rsp)
739
/* Transpose words 8..11: xmm4, xmm5, xmm14, xmm9 = third rows of blocks 0, 1, 2, 3. */
740 movdqa %xmm4,%xmm14
741 punpckldq %xmm5,%xmm4
742 movdqa %xmm8,%xmm7
743 punpckldq %xmm9,%xmm8
744 punpckhdq %xmm5,%xmm14
745 punpckhdq %xmm9,%xmm7
746 movdqa %xmm4,%xmm5
747 punpcklqdq %xmm8,%xmm4
748 movdqa %xmm14,%xmm9
749 punpcklqdq %xmm7,%xmm14
750 punpckhqdq %xmm8,%xmm5
751 punpckhqdq %xmm7,%xmm9
/* Feed-forward for words 12..15 (word 12 uses this group's per-lane counters). */
752 paddd 256-256(%rcx),%xmm0
753 paddd 272-256(%rcx),%xmm1
754 paddd 288-256(%rcx),%xmm2
755 paddd 304-256(%rcx),%xmm3
756
/* Transpose words 12..15: xmm0, xmm1, xmm8, xmm3 = fourth rows of blocks 0, 1, 2, 3. */
757 movdqa %xmm0,%xmm8
758 punpckldq %xmm1,%xmm0
759 movdqa %xmm2,%xmm7
760 punpckldq %xmm3,%xmm2
761 punpckhdq %xmm1,%xmm8
762 punpckhdq %xmm3,%xmm7
763 movdqa %xmm0,%xmm1
764 punpcklqdq %xmm2,%xmm0
765 movdqa %xmm8,%xmm3
766 punpcklqdq %xmm7,%xmm8
767 punpckhqdq %xmm2,%xmm1
768 punpckhqdq %xmm7,%xmm3
/* Fewer than 256 bytes left: finish in the tail code. */
769 cmpq $256,%rdx
770 jb .Ltail4x
771
/* Block 0: keystream rows are 0(%rsp), xmm12, xmm4, xmm0. */
772 movdqu 0(%rsi),%xmm6
773 movdqu 16(%rsi),%xmm11
774 movdqu 32(%rsi),%xmm2
775 movdqu 48(%rsi),%xmm7
776 pxor 0(%rsp),%xmm6
777 pxor %xmm12,%xmm11
778 pxor %xmm4,%xmm2
779 pxor %xmm0,%xmm7
780
/* Store block 0 while loading block 1; block 1 rows are 16(%rsp), xmm13, xmm5, xmm1. */
781 movdqu %xmm6,0(%rdi)
782 movdqu 64(%rsi),%xmm6
783 movdqu %xmm11,16(%rdi)
784 movdqu 80(%rsi),%xmm11
785 movdqu %xmm2,32(%rdi)
786 movdqu 96(%rsi),%xmm2
787 movdqu %xmm7,48(%rdi)
788 movdqu 112(%rsi),%xmm7
789 leaq 128(%rsi),%rsi
790 pxor 16(%rsp),%xmm6
791 pxor %xmm13,%xmm11
792 pxor %xmm5,%xmm2
793 pxor %xmm1,%xmm7
794
/* Store block 1 while loading block 2; block 2 rows are 32(%rsp), xmm10, xmm14, xmm8. */
795 movdqu %xmm6,64(%rdi)
796 movdqu 0(%rsi),%xmm6
797 movdqu %xmm11,80(%rdi)
798 movdqu 16(%rsi),%xmm11
799 movdqu %xmm2,96(%rdi)
800 movdqu 32(%rsi),%xmm2
801 movdqu %xmm7,112(%rdi)
802 leaq 128(%rdi),%rdi
803 movdqu 48(%rsi),%xmm7
804 pxor 32(%rsp),%xmm6
805 pxor %xmm10,%xmm11
806 pxor %xmm14,%xmm2
807 pxor %xmm8,%xmm7
808
/* Store block 2 while loading block 3; block 3 rows are 48(%rsp), xmm15, xmm9, xmm3. */
809 movdqu %xmm6,0(%rdi)
810 movdqu 64(%rsi),%xmm6
811 movdqu %xmm11,16(%rdi)
812 movdqu 80(%rsi),%xmm11
813 movdqu %xmm2,32(%rdi)
814 movdqu 96(%rsi),%xmm2
815 movdqu %xmm7,48(%rdi)
816 movdqu 112(%rsi),%xmm7
817 leaq 128(%rsi),%rsi
818 pxor 48(%rsp),%xmm6
819 pxor %xmm15,%xmm11
820 pxor %xmm9,%xmm2
821 pxor %xmm3,%xmm7
822 movdqu %xmm6,64(%rdi)
823 movdqu %xmm11,80(%rdi)
824 movdqu %xmm2,96(%rdi)
825 movdqu %xmm7,112(%rdi)
826 leaq 128(%rdi),%rdi
827
/* Loop while bytes remain. */
828 subq $256,%rdx
829 jnz .Loop_outer4x
830
831 jmp .Ldone4x
832
/* 1..255 bytes remain: emit whole blocks with vector code, then the last partial block byte by byte. */
833 .Ltail4x:
834 cmpq $192,%rdx
835 jae .L192_or_more4x
836 cmpq $128,%rdx
837 jae .L128_or_more4x
838 cmpq $64,%rdx
839 jae .L64_or_more4x
840
841
/* Under 64 bytes: complete block 0's keystream at 0..63(%rsp) (row 0 is already at 0(%rsp)); %r10 = byte index. */
842 xorq %r10,%r10
843
844 movdqa %xmm12,16(%rsp)
845 movdqa %xmm4,32(%rsp)
846 movdqa %xmm0,48(%rsp)
847 jmp .Loop_tail4x
848
849 .align 32
/* 64..127 bytes: process block 0 in full. */
850 .L64_or_more4x:
851 movdqu 0(%rsi),%xmm6
852 movdqu 16(%rsi),%xmm11
853 movdqu 32(%rsi),%xmm2
854 movdqu 48(%rsi),%xmm7
855 pxor 0(%rsp),%xmm6
856 pxor %xmm12,%xmm11
857 pxor %xmm4,%xmm2
858 pxor %xmm0,%xmm7
859 movdqu %xmm6,0(%rdi)
860 movdqu %xmm11,16(%rdi)
861 movdqu %xmm2,32(%rdi)
862 movdqu %xmm7,48(%rdi)
/* ZF is still the result of "cmpq $64,%rdx": no instruction since then writes the flags. Exactly 64 bytes means done. */
863 je .Ldone4x
864
/* Stage block 1's keystream at 0..63(%rsp) for the byte loop. */
865 movdqa 16(%rsp),%xmm6
866 leaq 64(%rsi),%rsi
867 xorq %r10,%r10
868 movdqa %xmm6,0(%rsp)
869 movdqa %xmm13,16(%rsp)
870 leaq 64(%rdi),%rdi
871 movdqa %xmm5,32(%rsp)
872 subq $64,%rdx
873 movdqa %xmm1,48(%rsp)
874 jmp .Loop_tail4x
875
876 .align 32
/* 128..191 bytes: process blocks 0 and 1 in full. */
877 .L128_or_more4x:
878 movdqu 0(%rsi),%xmm6
879 movdqu 16(%rsi),%xmm11
880 movdqu 32(%rsi),%xmm2
881 movdqu 48(%rsi),%xmm7
882 pxor 0(%rsp),%xmm6
883 pxor %xmm12,%xmm11
884 pxor %xmm4,%xmm2
885 pxor %xmm0,%xmm7
886
887 movdqu %xmm6,0(%rdi)
888 movdqu 64(%rsi),%xmm6
889 movdqu %xmm11,16(%rdi)
890 movdqu 80(%rsi),%xmm11
891 movdqu %xmm2,32(%rdi)
892 movdqu 96(%rsi),%xmm2
893 movdqu %xmm7,48(%rdi)
894 movdqu 112(%rsi),%xmm7
895 pxor 16(%rsp),%xmm6
896 pxor %xmm13,%xmm11
897 pxor %xmm5,%xmm2
898 pxor %xmm1,%xmm7
899 movdqu %xmm6,64(%rdi)
900 movdqu %xmm11,80(%rdi)
901 movdqu %xmm2,96(%rdi)
902 movdqu %xmm7,112(%rdi)
/* ZF still comes from "cmpq $128,%rdx": exactly 128 bytes means done. */
903 je .Ldone4x
904
/* Stage block 2's keystream at 0..63(%rsp) for the byte loop. */
905 movdqa 32(%rsp),%xmm6
906 leaq 128(%rsi),%rsi
907 xorq %r10,%r10
908 movdqa %xmm6,0(%rsp)
909 movdqa %xmm10,16(%rsp)
910 leaq 128(%rdi),%rdi
911 movdqa %xmm14,32(%rsp)
912 subq $128,%rdx
913 movdqa %xmm8,48(%rsp)
914 jmp .Loop_tail4x
915
916 .align 32
/* 192..255 bytes: process blocks 0, 1 and 2 in full. */
917 .L192_or_more4x:
918 movdqu 0(%rsi),%xmm6
919 movdqu 16(%rsi),%xmm11
920 movdqu 32(%rsi),%xmm2
921 movdqu 48(%rsi),%xmm7
922 pxor 0(%rsp),%xmm6
923 pxor %xmm12,%xmm11
924 pxor %xmm4,%xmm2
925 pxor %xmm0,%xmm7
926
927 movdqu %xmm6,0(%rdi)
928 movdqu 64(%rsi),%xmm6
929 movdqu %xmm11,16(%rdi)
930 movdqu 80(%rsi),%xmm11
931 movdqu %xmm2,32(%rdi)
932 movdqu 96(%rsi),%xmm2
933 movdqu %xmm7,48(%rdi)
934 movdqu 112(%rsi),%xmm7
935 leaq 128(%rsi),%rsi
936 pxor 16(%rsp),%xmm6
937 pxor %xmm13,%xmm11
938 pxor %xmm5,%xmm2
939 pxor %xmm1,%xmm7
940
941 movdqu %xmm6,64(%rdi)
942 movdqu 0(%rsi),%xmm6
943 movdqu %xmm11,80(%rdi)
944 movdqu 16(%rsi),%xmm11
945 movdqu %xmm2,96(%rdi)
946 movdqu 32(%rsi),%xmm2
947 movdqu %xmm7,112(%rdi)
948 leaq 128(%rdi),%rdi
949 movdqu 48(%rsi),%xmm7
950 pxor 32(%rsp),%xmm6
951 pxor %xmm10,%xmm11
952 pxor %xmm14,%xmm2
953 pxor %xmm8,%xmm7
954 movdqu %xmm6,0(%rdi)
955 movdqu %xmm11,16(%rdi)
956 movdqu %xmm2,32(%rdi)
957 movdqu %xmm7,48(%rdi)
/* ZF still comes from "cmpq $192,%rdx" (lea does not write the flags): exactly 192 bytes means done. */
958 je .Ldone4x
959
/* Stage block 3's keystream at 0..63(%rsp); pointers already moved 128, so another 64 reaches offset 192. */
960 movdqa 48(%rsp),%xmm6
961 leaq 64(%rsi),%rsi
962 xorq %r10,%r10
963 movdqa %xmm6,0(%rsp)
964 movdqa %xmm15,16(%rsp)
965 leaq 64(%rdi),%rdi
966 movdqa %xmm9,32(%rsp)
967 subq $192,%rdx
968 movdqa %xmm3,48(%rsp)
969
/* dst[i] = src[i] ^ keystream[i] for the remaining %rdx bytes. */
970 .Loop_tail4x:
971 movzbl (%rsi,%r10,1),%eax
972 movzbl (%rsp,%r10,1),%ecx
973 leaq 1(%r10),%r10
974 xorl %ecx,%eax
975 movb %al,-1(%rdi,%r10,1)
976 decq %rdx
977 jnz .Loop_tail4x
978
/* Release the frame; hand-encoded "rep ret". */
979 .Ldone4x:
980 addq $0x148+0,%rsp
981 .byte 0xf3,0xc3
982 .size ChaCha20_4x,.-ChaCha20_4x
983 .type ChaCha20_8x,@function
984 .align 32
985 ChaCha20_8x:
986 .LChaCha20_8x:
987 movq %rsp,%r10
988 subq $0x280+8,%rsp
989 andq $-32,%rsp
990 vzeroupper
991 movq %r10,640(%rsp)
992
993
994
995
996
997
998
999
1000
1001
1002 vbroadcasti128 .Lsigma(%rip),%ymm11
1003 vbroadcasti128 (%rcx),%ymm3
1004 vbroadcasti128 16(%rcx),%ymm15
1005 vbroadcasti128 (%r8),%ymm7
1006 leaq 256(%rsp),%rcx
1007 leaq 512(%rsp),%rax
1008 leaq .Lrot16(%rip),%r10
1009 leaq .Lrot24(%rip),%r11
1010
1011 vpshufd $0x00,%ymm11,%ymm8
1012 vpshufd $0x55,%ymm11,%ymm9
1013 vmovdqa %ymm8,128-256(%rcx)
1014 vpshufd $0xaa,%ymm11,%ymm10
1015 vmovdqa %ymm9,160-256(%rcx)
1016 vpshufd $0xff,%ymm11,%ymm11
1017 vmovdqa %ymm10,192-256(%rcx)
1018 vmovdqa %ymm11,224-256(%rcx)
1019
1020 vpshufd $0x00,%ymm3,%ymm0
1021 vpshufd $0x55,%ymm3,%ymm1
1022 vmovdqa %ymm0,256-256(%rcx)
1023 vpshufd $0xaa,%ymm3,%ymm2
1024 vmovdqa %ymm1,288-256(%rcx)
1025 vpshufd $0xff,%ymm3,%ymm3
1026 vmovdqa %ymm2,320-256(%rcx)
1027 vmovdqa %ymm3,352-256(%rcx)
1028
1029 vpshufd $0x00,%ymm15,%ymm12
1030 vpshufd $0x55,%ymm15,%ymm13
1031 vmovdqa %ymm12,384-512(%rax)
1032 vpshufd $0xaa,%ymm15,%ymm14
1033 vmovdqa %ymm13,416-512(%rax)
1034 vpshufd $0xff,%ymm15,%ymm15
1035 vmovdqa %ymm14,448-512(%rax)
1036 vmovdqa %ymm15,480-512(%rax)
1037
1038 vpshufd $0x00,%ymm7,%ymm4
1039 vpshufd $0x55,%ymm7,%ymm5
1040 vpaddd .Lincy(%rip),%ymm4,%ymm4
1041 vpshufd $0xaa,%ymm7,%ymm6
1042 vmovdqa %ymm5,544-512(%rax)
1043 vpshufd $0xff,%ymm7,%ymm7
1044 vmovdqa %ymm6,576-512(%rax)
1045 vmovdqa %ymm7,608-512(%rax)
1046
1047 jmp .Loop_enter8x
1048
1049 .align 32
1050 .Loop_outer8x:
1051 vmovdqa 128-256(%rcx),%ymm8
1052 vmovdqa 160-256(%rcx),%ymm9
1053 vmovdqa 192-256(%rcx),%ymm10
1054 vmovdqa 224-256(%rcx),%ymm11
1055 vmovdqa 256-256(%rcx),%ymm0
1056 vmovdqa 288-256(%rcx),%ymm1
1057 vmovdqa 320-256(%rcx),%ymm2
1058 vmovdqa 352-256(%rcx),%ymm3
1059 vmovdqa 384-512(%rax),%ymm12
1060 vmovdqa 416-512(%rax),%ymm13
1061 vmovdqa 448-512(%rax),%ymm14
1062 vmovdqa 480-512(%rax),%ymm15
1063 vmovdqa 512-512(%rax),%ymm4
1064 vmovdqa 544-512(%rax),%ymm5
1065 vmovdqa 576-512(%rax),%ymm6
1066 vmovdqa 608-512(%rax),%ymm7
1067 vpaddd .Leight(%rip),%ymm4,%ymm4
1068
1069 .Loop_enter8x:
1070 vmovdqa %ymm14,64(%rsp)
1071 vmovdqa %ymm15,96(%rsp)
1072 vbroadcasti128 (%r10),%ymm15
1073 vmovdqa %ymm4,512-512(%rax)
1074 movl $10,%eax
1075 jmp .Loop8x
1076
1077 .align 32
1078 .Loop8x:
1079 vpaddd %ymm0,%ymm8,%ymm8
1080 vpxor %ymm4,%ymm8,%ymm4
1081 vpshufb %ymm15,%ymm4,%ymm4
1082 vpaddd %ymm1,%ymm9,%ymm9
1083 vpxor %ymm5,%ymm9,%ymm5
1084 vpshufb %ymm15,%ymm5,%ymm5
1085 vpaddd %ymm4,%ymm12,%ymm12
1086 vpxor %ymm0,%ymm12,%ymm0
1087 vpslld $12,%ymm0,%ymm14
1088 vpsrld $20,%ymm0,%ymm0
1089 vpor %ymm0,%ymm14,%ymm0
1090 vbroadcasti128 (%r11),%ymm14
1091 vpaddd %ymm5,%ymm13,%ymm13
1092 vpxor %ymm1,%ymm13,%ymm1
1093 vpslld $12,%ymm1,%ymm15
1094 vpsrld $20,%ymm1,%ymm1
1095 vpor %ymm1,%ymm15,%ymm1
1096 vpaddd %ymm0,%ymm8,%ymm8
1097 vpxor %ymm4,%ymm8,%ymm4
1098 vpshufb %ymm14,%ymm4,%ymm4
1099 vpaddd %ymm1,%ymm9,%ymm9
1100 vpxor %ymm5,%ymm9,%ymm5
1101 vpshufb %ymm14,%ymm5,%ymm5
1102 vpaddd %ymm4,%ymm12,%ymm12
1103 vpxor %ymm0,%ymm12,%ymm0
1104 vpslld $7,%ymm0,%ymm15
1105 vpsrld $25,%ymm0,%ymm0
1106 vpor %ymm0,%ymm15,%ymm0
1107 vbroadcasti128 (%r10),%ymm15
1108 vpaddd %ymm5,%ymm13,%ymm13
1109 vpxor %ymm1,%ymm13,%ymm1
1110 vpslld $7,%ymm1,%ymm14
1111 vpsrld $25,%ymm1,%ymm1
1112 vpor %ymm1,%ymm14,%ymm1
1113 vmovdqa %ymm12,0(%rsp)
1114 vmovdqa %ymm13,32(%rsp)
1115 vmovdqa 64(%rsp),%ymm12
1116 vmovdqa 96(%rsp),%ymm13
1117 vpaddd %ymm2,%ymm10,%ymm10
1118 vpxor %ymm6,%ymm10,%ymm6
1119 vpshufb %ymm15,%ymm6,%ymm6
1120 vpaddd %ymm3,%ymm11,%ymm11
1121 vpxor %ymm7,%ymm11,%ymm7
1122 vpshufb %ymm15,%ymm7,%ymm7
1123 vpaddd %ymm6,%ymm12,%ymm12
1124 vpxor %ymm2,%ymm12,%ymm2
1125 vpslld $12,%ymm2,%ymm14
1126 vpsrld $20,%ymm2,%ymm2
1127 vpor %ymm2,%ymm14,%ymm2
1128 vbroadcasti128 (%r11),%ymm14
1129 vpaddd %ymm7,%ymm13,%ymm13
1130 vpxor %ymm3,%ymm13,%ymm3
1131 vpslld $12,%ymm3,%ymm15
1132 vpsrld $20,%ymm3,%ymm3
1133 vpor %ymm3,%ymm15,%ymm3
1134 vpaddd %ymm2,%ymm10,%ymm10
1135 vpxor %ymm6,%ymm10,%ymm6
1136 vpshufb %ymm14,%ymm6,%ymm6
1137 vpaddd %ymm3,%ymm11,%ymm11
1138 vpxor %ymm7,%ymm11,%ymm7
1139 vpshufb %ymm14,%ymm7,%ymm7
1140 vpaddd %ymm6,%ymm12,%ymm12
1141 vpxor %ymm2,%ymm12,%ymm2
1142 vpslld $7,%ymm2,%ymm15
1143 vpsrld $25,%ymm2,%ymm2
1144 vpor %ymm2,%ymm15,%ymm2
1145 vbroadcasti128 (%r10),%ymm15
1146 vpaddd %ymm7,%ymm13,%ymm13
1147 vpxor %ymm3,%ymm13,%ymm3
1148 vpslld $7,%ymm3,%ymm14
1149 vpsrld $25,%ymm3,%ymm3
1150 vpor %ymm3,%ymm14,%ymm3
1151 vpaddd %ymm1,%ymm8,%ymm8
1152 vpxor %ymm7,%ymm8,%ymm7
1153 vpshufb %ymm15,%ymm7,%ymm7
1154 vpaddd %ymm2,%ymm9,%ymm9
1155 vpxor %ymm4,%ymm9,%ymm4
1156 vpshufb %ymm15,%ymm4,%ymm4
1157 vpaddd %ymm7,%ymm12,%ymm12
1158 vpxor %ymm1,%ymm12,%ymm1
1159 vpslld $12,%ymm1,%ymm14
1160 vpsrld $20,%ymm1,%ymm1
1161 vpor %ymm1,%ymm14,%ymm1
1162 vbroadcasti128 (%r11),%ymm14
1163 vpaddd %ymm4,%ymm13,%ymm13
1164 vpxor %ymm2,%ymm13,%ymm2
1165 vpslld $12,%ymm2,%ymm15
1166 vpsrld $20,%ymm2,%ymm2
1167 vpor %ymm2,%ymm15,%ymm2
1168 vpaddd %ymm1,%ymm8,%ymm8
1169 vpxor %ymm7,%ymm8,%ymm7
1170 vpshufb %ymm14,%ymm7,%ymm7
1171 vpaddd %ymm2,%ymm9,%ymm9
1172 vpxor %ymm4,%ymm9,%ymm4
1173 vpshufb %ymm14,%ymm4,%ymm4
1174 vpaddd %ymm7,%ymm12,%ymm12
1175 vpxor %ymm1,%ymm12,%ymm1
1176 vpslld $7,%ymm1,%ymm15
1177 vpsrld $25,%ymm1,%ymm1
1178 vpor %ymm1,%ymm15,%ymm1
1179 vbroadcasti128 (%r10),%ymm15
1180 vpaddd %ymm4,%ymm13,%ymm13
1181 vpxor %ymm2,%ymm13,%ymm2
1182 vpslld $7,%ymm2,%ymm14
1183 vpsrld $25,%ymm2,%ymm2
1184 vpor %ymm2,%ymm14,%ymm2
1185 vmovdqa %ymm12,64(%rsp)
1186 vmovdqa %ymm13,96(%rsp)
1187 vmovdqa 0(%rsp),%ymm12
1188 vmovdqa 32(%rsp),%ymm13
1189 vpaddd %ymm3,%ymm10,%ymm10
1190 vpxor %ymm5,%ymm10,%ymm5
1191 vpshufb %ymm15,%ymm5,%ymm5
1192 vpaddd %ymm0,%ymm11,%ymm11
1193 vpxor %ymm6,%ymm11,%ymm6
1194 vpshufb %ymm15,%ymm6,%ymm6
1195 vpaddd %ymm5,%ymm12,%ymm12
1196 vpxor %ymm3,%ymm12,%ymm3
1197 vpslld $12,%ymm3,%ymm14
1198 vpsrld $20,%ymm3,%ymm3
1199 vpor %ymm3,%ymm14,%ymm3
1200 vbroadcasti128 (%r11),%ymm14
1201 vpaddd %ymm6,%ymm13,%ymm13
1202 vpxor %ymm0,%ymm13,%ymm0
1203 vpslld $12,%ymm0,%ymm15
1204 vpsrld $20,%ymm0,%ymm0
1205 vpor %ymm0,%ymm15,%ymm0
1206 vpaddd %ymm3,%ymm10,%ymm10
1207 vpxor %ymm5,%ymm10,%ymm5
1208 vpshufb %ymm14,%ymm5,%ymm5
1209 vpaddd %ymm0,%ymm11,%ymm11
1210 vpxor %ymm6,%ymm11,%ymm6
1211 vpshufb %ymm14,%ymm6,%ymm6
1212 vpaddd %ymm5,%ymm12,%ymm12
1213 vpxor %ymm3,%ymm12,%ymm3
1214 vpslld $7,%ymm3,%ymm15
1215 vpsrld $25,%ymm3,%ymm3
1216 vpor %ymm3,%ymm15,%ymm3
1217 vbroadcasti128 (%r10),%ymm15
1218 vpaddd %ymm6,%ymm13,%ymm13
1219 vpxor %ymm0,%ymm13,%ymm0
1220 vpslld $7,%ymm0,%ymm14
1221 vpsrld $25,%ymm0,%ymm0
1222 vpor %ymm0,%ymm14,%ymm0
1223 decl %eax
1224 jnz .Loop8x
1225
1226 leaq 512(%rsp),%rax
1227 vpaddd 128-256(%rcx),%ymm8,%ymm8
1228 vpaddd 160-256(%rcx),%ymm9,%ymm9
1229 vpaddd 192-256(%rcx),%ymm10,%ymm10
1230 vpaddd 224-256(%rcx),%ymm11,%ymm11
1231
1232 vpunpckldq %ymm9,%ymm8,%ymm14
1233 vpunpckldq %ymm11,%ymm10,%ymm15
1234 vpunpckhdq %ymm9,%ymm8,%ymm8
1235 vpunpckhdq %ymm11,%ymm10,%ymm10
1236 vpunpcklqdq %ymm15,%ymm14,%ymm9
1237 vpunpckhqdq %ymm15,%ymm14,%ymm14
1238 vpunpcklqdq %ymm10,%ymm8,%ymm11
1239 vpunpckhqdq %ymm10,%ymm8,%ymm8
1240 vpaddd 256-256(%rcx),%ymm0,%ymm0
1241 vpaddd 288-256(%rcx),%ymm1,%ymm1
1242 vpaddd 320-256(%rcx),%ymm2,%ymm2
1243 vpaddd 352-256(%rcx),%ymm3,%ymm3
1244
1245 vpunpckldq %ymm1,%ymm0,%ymm10
1246 vpunpckldq %ymm3,%ymm2,%ymm15
1247 vpunpckhdq %ymm1,%ymm0,%ymm0
1248 vpunpckhdq %ymm3,%ymm2,%ymm2
1249 vpunpcklqdq %ymm15,%ymm10,%ymm1
1250 vpunpckhqdq %ymm15,%ymm10,%ymm10
1251 vpunpcklqdq %ymm2,%ymm0,%ymm3
1252 vpunpckhqdq %ymm2,%ymm0,%ymm0
1253 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
1254 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
1255 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
1256 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
1257 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
1258 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
1259 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
1260 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
1261 vmovdqa %ymm15,0(%rsp)
1262 vmovdqa %ymm9,32(%rsp)
1263 vmovdqa 64(%rsp),%ymm15
1264 vmovdqa 96(%rsp),%ymm9
1265
1266 vpaddd 384-512(%rax),%ymm12,%ymm12
1267 vpaddd 416-512(%rax),%ymm13,%ymm13
1268 vpaddd 448-512(%rax),%ymm15,%ymm15
1269 vpaddd 480-512(%rax),%ymm9,%ymm9
1270
1271 vpunpckldq %ymm13,%ymm12,%ymm2
1272 vpunpckldq %ymm9,%ymm15,%ymm8
1273 vpunpckhdq %ymm13,%ymm12,%ymm12
1274 vpunpckhdq %ymm9,%ymm15,%ymm15
1275 vpunpcklqdq %ymm8,%ymm2,%ymm13
1276 vpunpckhqdq %ymm8,%ymm2,%ymm2
1277 vpunpcklqdq %ymm15,%ymm12,%ymm9
1278 vpunpckhqdq %ymm15,%ymm12,%ymm12
1279 vpaddd 512-512(%rax),%ymm4,%ymm4
1280 vpaddd 544-512(%rax),%ymm5,%ymm5
1281 vpaddd 576-512(%rax),%ymm6,%ymm6
1282 vpaddd 608-512(%rax),%ymm7,%ymm7
1283
1284 vpunpckldq %ymm5,%ymm4,%ymm15
1285 vpunpckldq %ymm7,%ymm6,%ymm8
1286 vpunpckhdq %ymm5,%ymm4,%ymm4
1287 vpunpckhdq %ymm7,%ymm6,%ymm6
1288 vpunpcklqdq %ymm8,%ymm15,%ymm5
1289 vpunpckhqdq %ymm8,%ymm15,%ymm15
1290 vpunpcklqdq %ymm6,%ymm4,%ymm7
1291 vpunpckhqdq %ymm6,%ymm4,%ymm4
1292 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
1293 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
1294 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
1295 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
1296 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
1297 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
1298 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
1299 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
1300 vmovdqa 0(%rsp),%ymm6
1301 vmovdqa 32(%rsp),%ymm12
1302
/*
 * Full 512-byte path.
 * If fewer than 512 bytes remain in %rdx, branch to .Ltail8x. Otherwise XOR
 * sixteen ymm registers (16 x 32 = 512 bytes) with the data at (%rsi) and
 * store the result at (%rdi), in four groups of 128 bytes. Both pointers are
 * advanced by 128 after each group.
 * Register order of the 512 output bytes:
 *   ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10, ymm15,
 *   ymm14, ymm2, ymm3, ymm7, ymm11, ymm9, ymm0, ymm4
 * NOTE(review): presumably %rsi = input, %rdi = output, %rdx = bytes left and
 * the ymm registers hold keystream; the code that sets them up is outside
 * this excerpt, confirm against the function prologue.
 */
1303 cmpq $512,%rdx
1304 jb .Ltail8x   /* unsigned compare: rdx < 512 takes the partial path */
1305
1306 vpxor 0(%rsi),%ymm6,%ymm6   /* group 0: bytes 0..127 */
1307 vpxor 32(%rsi),%ymm8,%ymm8
1308 vpxor 64(%rsi),%ymm1,%ymm1
1309 vpxor 96(%rsi),%ymm5,%ymm5
1310 leaq 128(%rsi),%rsi
1311 vmovdqu %ymm6,0(%rdi)
1312 vmovdqu %ymm8,32(%rdi)
1313 vmovdqu %ymm1,64(%rdi)
1314 vmovdqu %ymm5,96(%rdi)
1315 leaq 128(%rdi),%rdi
1316
1317 vpxor 0(%rsi),%ymm12,%ymm12   /* group 1: bytes 128..255 */
1318 vpxor 32(%rsi),%ymm13,%ymm13
1319 vpxor 64(%rsi),%ymm10,%ymm10
1320 vpxor 96(%rsi),%ymm15,%ymm15
1321 leaq 128(%rsi),%rsi
1322 vmovdqu %ymm12,0(%rdi)
1323 vmovdqu %ymm13,32(%rdi)
1324 vmovdqu %ymm10,64(%rdi)
1325 vmovdqu %ymm15,96(%rdi)
1326 leaq 128(%rdi),%rdi
1327
1328 vpxor 0(%rsi),%ymm14,%ymm14   /* group 2: bytes 256..383 */
1329 vpxor 32(%rsi),%ymm2,%ymm2
1330 vpxor 64(%rsi),%ymm3,%ymm3
1331 vpxor 96(%rsi),%ymm7,%ymm7
1332 leaq 128(%rsi),%rsi
1333 vmovdqu %ymm14,0(%rdi)
1334 vmovdqu %ymm2,32(%rdi)
1335 vmovdqu %ymm3,64(%rdi)
1336 vmovdqu %ymm7,96(%rdi)
1337 leaq 128(%rdi),%rdi
1338
1339 vpxor 0(%rsi),%ymm11,%ymm11   /* group 3: bytes 384..511 */
1340 vpxor 32(%rsi),%ymm9,%ymm9
1341 vpxor 64(%rsi),%ymm0,%ymm0
1342 vpxor 96(%rsi),%ymm4,%ymm4
1343 leaq 128(%rsi),%rsi
1344 vmovdqu %ymm11,0(%rdi)
1345 vmovdqu %ymm9,32(%rdi)
1346 vmovdqu %ymm0,64(%rdi)
1347 vmovdqu %ymm4,96(%rdi)
1348 leaq 128(%rdi),%rdi
1349
1350 subq $512,%rdx   /* 512 bytes consumed */
1351 jnz .Loop_outer8x   /* more bytes left: run another outer iteration */
1352
1353 jmp .Ldone8x   /* rdx reached exactly zero */
1354
/*
 * Partial-block dispatch. The branch visible just above this label arrives
 * here with rdx < 512. The unsigned compare chain selects the handler for the
 * largest multiple of 64 that does not exceed rdx. Each handler's own
 * "je .Ldone8x" reuses the flags of the cmpq that selected it, so nothing
 * flag-writing may be inserted between these compares and that je.
 */
1355 .Ltail8x:
1356 cmpq $448,%rdx
1357 jae .L448_or_more8x
1358 cmpq $384,%rdx
1359 jae .L384_or_more8x
1360 cmpq $320,%rdx
1361 jae .L320_or_more8x
1362 cmpq $256,%rdx
1363 jae .L256_or_more8x
1364 cmpq $192,%rdx
1365 jae .L192_or_more8x
1366 cmpq $128,%rdx
1367 jae .L128_or_more8x
1368 cmpq $64,%rdx
1369 jae .L64_or_more8x
1370
/*
 * rdx < 64: no whole 64-byte chunk. Zero the byte index r10 and spill
 * ymm6:ymm8 (the first 64 bytes) to 0..63(%rsp), which .Loop_tail8x reads
 * byte by byte.
 * NOTE(review): vmovdqa needs 0(%rsp) to be 32-byte aligned, and the byte
 * loop needs rdx != 0; both are presumably guaranteed by code outside this
 * excerpt, confirm.
 */
1371 xorq %r10,%r10
1372 vmovdqa %ymm6,0(%rsp)
1373 vmovdqa %ymm8,32(%rsp)
1374 jmp .Loop_tail8x
1375
1376 .align 32
/*
 * Selected by the .Ltail8x compare chain when 64 <= rdx < 128.
 * XORs 64 bytes at (%rsi) with ymm6:ymm8 and stores them at (%rdi). If rdx
 * was exactly 64 the work is done; otherwise the next 64 bytes (ymm1:ymm5)
 * are spilled to the stack for the byte loop.
 */
1377 .L64_or_more8x:
1378 vpxor 0(%rsi),%ymm6,%ymm6
1379 vpxor 32(%rsi),%ymm8,%ymm8
1380 vmovdqu %ymm6,0(%rdi)
1381 vmovdqu %ymm8,32(%rdi)
1382 je .Ldone8x   /* flags still from "cmpq $64,%rdx": rdx == 64 */
1383
1384 leaq 64(%rsi),%rsi   /* lea leaves flags alone; skip the bytes just done */
1385 xorq %r10,%r10   /* byte index for .Loop_tail8x */
1386 vmovdqa %ymm1,0(%rsp)   /* stage bytes 0..31 of the leftover chunk */
1387 leaq 64(%rdi),%rdi
1388 subq $64,%rdx   /* 1..63 bytes remain */
1389 vmovdqa %ymm5,32(%rsp)   /* stage bytes 32..63 of the leftover chunk */
1390 jmp .Loop_tail8x
1391
1392 .align 32
/*
 * Selected by the .Ltail8x compare chain when 128 <= rdx < 192.
 * XORs 128 bytes at (%rsi) with ymm6, ymm8, ymm1, ymm5 and stores them at
 * (%rdi). If rdx was exactly 128 the work is done; otherwise ymm12:ymm13 are
 * spilled to the stack for the byte loop.
 */
1393 .L128_or_more8x:
1394 vpxor 0(%rsi),%ymm6,%ymm6
1395 vpxor 32(%rsi),%ymm8,%ymm8
1396 vpxor 64(%rsi),%ymm1,%ymm1
1397 vpxor 96(%rsi),%ymm5,%ymm5
1398 vmovdqu %ymm6,0(%rdi)
1399 vmovdqu %ymm8,32(%rdi)
1400 vmovdqu %ymm1,64(%rdi)
1401 vmovdqu %ymm5,96(%rdi)
1402 je .Ldone8x   /* flags still from "cmpq $128,%rdx": rdx == 128 */
1403
1404 leaq 128(%rsi),%rsi
1405 xorq %r10,%r10   /* byte index for .Loop_tail8x */
1406 vmovdqa %ymm12,0(%rsp)   /* stage the next 64 bytes at 0..63(%rsp) */
1407 leaq 128(%rdi),%rdi
1408 subq $128,%rdx   /* 1..63 bytes remain */
1409 vmovdqa %ymm13,32(%rsp)
1410 jmp .Loop_tail8x
1411
1412 .align 32
/*
 * Selected by the .Ltail8x compare chain when 192 <= rdx < 256.
 * XORs 192 bytes at (%rsi) with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13 and
 * stores them at (%rdi). If rdx was exactly 192 the work is done; otherwise
 * ymm10:ymm15 are spilled to the stack for the byte loop.
 */
1413 .L192_or_more8x:
1414 vpxor 0(%rsi),%ymm6,%ymm6
1415 vpxor 32(%rsi),%ymm8,%ymm8
1416 vpxor 64(%rsi),%ymm1,%ymm1
1417 vpxor 96(%rsi),%ymm5,%ymm5
1418 vpxor 128(%rsi),%ymm12,%ymm12
1419 vpxor 160(%rsi),%ymm13,%ymm13
1420 vmovdqu %ymm6,0(%rdi)
1421 vmovdqu %ymm8,32(%rdi)
1422 vmovdqu %ymm1,64(%rdi)
1423 vmovdqu %ymm5,96(%rdi)
1424 vmovdqu %ymm12,128(%rdi)
1425 vmovdqu %ymm13,160(%rdi)
1426 je .Ldone8x   /* flags still from "cmpq $192,%rdx": rdx == 192 */
1427
1428 leaq 192(%rsi),%rsi
1429 xorq %r10,%r10   /* byte index for .Loop_tail8x */
1430 vmovdqa %ymm10,0(%rsp)   /* stage the next 64 bytes at 0..63(%rsp) */
1431 leaq 192(%rdi),%rdi
1432 subq $192,%rdx   /* 1..63 bytes remain */
1433 vmovdqa %ymm15,32(%rsp)
1434 jmp .Loop_tail8x
1435
1436 .align 32
/*
 * Selected by the .Ltail8x compare chain when 256 <= rdx < 320.
 * XORs 256 bytes at (%rsi) with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10,
 * ymm15 and stores them at (%rdi). If rdx was exactly 256 the work is done;
 * otherwise ymm14:ymm2 are spilled to the stack for the byte loop.
 */
1437 .L256_or_more8x:
1438 vpxor 0(%rsi),%ymm6,%ymm6
1439 vpxor 32(%rsi),%ymm8,%ymm8
1440 vpxor 64(%rsi),%ymm1,%ymm1
1441 vpxor 96(%rsi),%ymm5,%ymm5
1442 vpxor 128(%rsi),%ymm12,%ymm12
1443 vpxor 160(%rsi),%ymm13,%ymm13
1444 vpxor 192(%rsi),%ymm10,%ymm10
1445 vpxor 224(%rsi),%ymm15,%ymm15
1446 vmovdqu %ymm6,0(%rdi)
1447 vmovdqu %ymm8,32(%rdi)
1448 vmovdqu %ymm1,64(%rdi)
1449 vmovdqu %ymm5,96(%rdi)
1450 vmovdqu %ymm12,128(%rdi)
1451 vmovdqu %ymm13,160(%rdi)
1452 vmovdqu %ymm10,192(%rdi)
1453 vmovdqu %ymm15,224(%rdi)
1454 je .Ldone8x   /* flags still from "cmpq $256,%rdx": rdx == 256 */
1455
1456 leaq 256(%rsi),%rsi
1457 xorq %r10,%r10   /* byte index for .Loop_tail8x */
1458 vmovdqa %ymm14,0(%rsp)   /* stage the next 64 bytes at 0..63(%rsp) */
1459 leaq 256(%rdi),%rdi
1460 subq $256,%rdx   /* 1..63 bytes remain */
1461 vmovdqa %ymm2,32(%rsp)
1462 jmp .Loop_tail8x
1463
1464 .align 32
/*
 * Selected by the .Ltail8x compare chain when 320 <= rdx < 384.
 * XORs 320 bytes at (%rsi) with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10,
 * ymm15, ymm14, ymm2 and stores them at (%rdi). If rdx was exactly 320 the
 * work is done; otherwise ymm3:ymm7 are spilled to the stack for the byte
 * loop.
 */
1465 .L320_or_more8x:
1466 vpxor 0(%rsi),%ymm6,%ymm6
1467 vpxor 32(%rsi),%ymm8,%ymm8
1468 vpxor 64(%rsi),%ymm1,%ymm1
1469 vpxor 96(%rsi),%ymm5,%ymm5
1470 vpxor 128(%rsi),%ymm12,%ymm12
1471 vpxor 160(%rsi),%ymm13,%ymm13
1472 vpxor 192(%rsi),%ymm10,%ymm10
1473 vpxor 224(%rsi),%ymm15,%ymm15
1474 vpxor 256(%rsi),%ymm14,%ymm14
1475 vpxor 288(%rsi),%ymm2,%ymm2
1476 vmovdqu %ymm6,0(%rdi)
1477 vmovdqu %ymm8,32(%rdi)
1478 vmovdqu %ymm1,64(%rdi)
1479 vmovdqu %ymm5,96(%rdi)
1480 vmovdqu %ymm12,128(%rdi)
1481 vmovdqu %ymm13,160(%rdi)
1482 vmovdqu %ymm10,192(%rdi)
1483 vmovdqu %ymm15,224(%rdi)
1484 vmovdqu %ymm14,256(%rdi)
1485 vmovdqu %ymm2,288(%rdi)
1486 je .Ldone8x   /* flags still from "cmpq $320,%rdx": rdx == 320 */
1487
1488 leaq 320(%rsi),%rsi
1489 xorq %r10,%r10   /* byte index for .Loop_tail8x */
1490 vmovdqa %ymm3,0(%rsp)   /* stage the next 64 bytes at 0..63(%rsp) */
1491 leaq 320(%rdi),%rdi
1492 subq $320,%rdx   /* 1..63 bytes remain */
1493 vmovdqa %ymm7,32(%rsp)
1494 jmp .Loop_tail8x
1495
1496 .align 32
/*
 * Selected by the .Ltail8x compare chain when 384 <= rdx < 448.
 * XORs 384 bytes at (%rsi) with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10,
 * ymm15, ymm14, ymm2, ymm3, ymm7 and stores them at (%rdi). If rdx was
 * exactly 384 the work is done; otherwise ymm11:ymm9 are spilled to the stack
 * for the byte loop.
 */
1497 .L384_or_more8x:
1498 vpxor 0(%rsi),%ymm6,%ymm6
1499 vpxor 32(%rsi),%ymm8,%ymm8
1500 vpxor 64(%rsi),%ymm1,%ymm1
1501 vpxor 96(%rsi),%ymm5,%ymm5
1502 vpxor 128(%rsi),%ymm12,%ymm12
1503 vpxor 160(%rsi),%ymm13,%ymm13
1504 vpxor 192(%rsi),%ymm10,%ymm10
1505 vpxor 224(%rsi),%ymm15,%ymm15
1506 vpxor 256(%rsi),%ymm14,%ymm14
1507 vpxor 288(%rsi),%ymm2,%ymm2
1508 vpxor 320(%rsi),%ymm3,%ymm3
1509 vpxor 352(%rsi),%ymm7,%ymm7
1510 vmovdqu %ymm6,0(%rdi)
1511 vmovdqu %ymm8,32(%rdi)
1512 vmovdqu %ymm1,64(%rdi)
1513 vmovdqu %ymm5,96(%rdi)
1514 vmovdqu %ymm12,128(%rdi)
1515 vmovdqu %ymm13,160(%rdi)
1516 vmovdqu %ymm10,192(%rdi)
1517 vmovdqu %ymm15,224(%rdi)
1518 vmovdqu %ymm14,256(%rdi)
1519 vmovdqu %ymm2,288(%rdi)
1520 vmovdqu %ymm3,320(%rdi)
1521 vmovdqu %ymm7,352(%rdi)
1522 je .Ldone8x   /* flags still from "cmpq $384,%rdx": rdx == 384 */
1523
1524 leaq 384(%rsi),%rsi
1525 xorq %r10,%r10   /* byte index for .Loop_tail8x */
1526 vmovdqa %ymm11,0(%rsp)   /* stage the next 64 bytes at 0..63(%rsp) */
1527 leaq 384(%rdi),%rdi
1528 subq $384,%rdx   /* 1..63 bytes remain */
1529 vmovdqa %ymm9,32(%rsp)
1530 jmp .Loop_tail8x
1531
1532 .align 32
/*
 * Selected by the .Ltail8x compare chain when rdx >= 448 (and, via the branch
 * into .Ltail8x, rdx < 512).
 * XORs 448 bytes at (%rsi) with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10,
 * ymm15, ymm14, ymm2, ymm3, ymm7, ymm11, ymm9 and stores them at (%rdi). If
 * rdx was exactly 448 the work is done; otherwise ymm0:ymm4 are spilled to
 * the stack and execution FALLS THROUGH into .Loop_tail8x (there is no jmp at
 * the end of this block).
 */
1533 .L448_or_more8x:
1534 vpxor 0(%rsi),%ymm6,%ymm6
1535 vpxor 32(%rsi),%ymm8,%ymm8
1536 vpxor 64(%rsi),%ymm1,%ymm1
1537 vpxor 96(%rsi),%ymm5,%ymm5
1538 vpxor 128(%rsi),%ymm12,%ymm12
1539 vpxor 160(%rsi),%ymm13,%ymm13
1540 vpxor 192(%rsi),%ymm10,%ymm10
1541 vpxor 224(%rsi),%ymm15,%ymm15
1542 vpxor 256(%rsi),%ymm14,%ymm14
1543 vpxor 288(%rsi),%ymm2,%ymm2
1544 vpxor 320(%rsi),%ymm3,%ymm3
1545 vpxor 352(%rsi),%ymm7,%ymm7
1546 vpxor 384(%rsi),%ymm11,%ymm11
1547 vpxor 416(%rsi),%ymm9,%ymm9
1548 vmovdqu %ymm6,0(%rdi)
1549 vmovdqu %ymm8,32(%rdi)
1550 vmovdqu %ymm1,64(%rdi)
1551 vmovdqu %ymm5,96(%rdi)
1552 vmovdqu %ymm12,128(%rdi)
1553 vmovdqu %ymm13,160(%rdi)
1554 vmovdqu %ymm10,192(%rdi)
1555 vmovdqu %ymm15,224(%rdi)
1556 vmovdqu %ymm14,256(%rdi)
1557 vmovdqu %ymm2,288(%rdi)
1558 vmovdqu %ymm3,320(%rdi)
1559 vmovdqu %ymm7,352(%rdi)
1560 vmovdqu %ymm11,384(%rdi)
1561 vmovdqu %ymm9,416(%rdi)
1562 je .Ldone8x   /* flags still from "cmpq $448,%rdx": rdx == 448 */
1563
1564 leaq 448(%rsi),%rsi
1565 xorq %r10,%r10   /* byte index for .Loop_tail8x */
1566 vmovdqa %ymm0,0(%rsp)   /* stage the last 64 bytes at 0..63(%rsp) */
1567 leaq 448(%rdi),%rdi
1568 subq $448,%rdx   /* 1..63 bytes remain */
1569 vmovdqa %ymm4,32(%rsp)
1570
/*
 * Byte-at-a-time tail: for each of the rdx remaining bytes,
 *   (%rdi)[r10] = (%rsi)[r10] XOR (%rsp)[r10]
 * where 0..63(%rsp) holds the 64 bytes staged by whichever block jumped (or
 * fell through) here, and r10 starts at zero. Every visible entry path
 * arrives with rdx < 64, so the reads stay inside the staged 64 bytes.
 * Clobbers eax, ecx, r10, rdx and flags.
 * NOTE(review): the loop is do-while shaped, so it relies on rdx != 0 at
 * entry; the sub-64-byte entry from .Ltail8x does not check that itself.
 */
1571 .Loop_tail8x:
1572 movzbl (%rsi,%r10,1),%eax   /* eax = source byte */
1573 movzbl (%rsp,%r10,1),%ecx   /* ecx = staged byte */
1574 leaq 1(%r10),%r10   /* r10++ without touching flags */
1575 xorl %ecx,%eax
1576 movb %al,-1(%rdi,%r10,1)   /* -1 compensates for the early r10++ */
1577 decq %rdx
1578 jnz .Loop_tail8x
1579
/*
 * Common exit of ChaCha20_8x: clear the vector registers, reload the stack
 * pointer and return.
 */
1580 .Ldone8x:
1581 vzeroall   /* zero every ymm register before returning */
1582 movq 640(%rsp),%rsp   /* reload rsp from the slot at 640(%rsp); presumably the entry rsp saved by the prologue (not visible here), confirm */
1583 .byte 0xf3,0xc3   /* raw encoding of "rep ret" (F3 = REP prefix, C3 = RET) */
1584 .size ChaCha20_8x,.-ChaCha20_8x
1585 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698