OLD | NEW |
1 #if defined(__x86_64__) | 1 #if defined(__x86_64__) |
2 .text | 2 .text |
3 | 3 |
4 .extern OPENSSL_ia32cap_P | 4 .extern OPENSSL_ia32cap_P |
5 .hidden OPENSSL_ia32cap_P | 5 .hidden OPENSSL_ia32cap_P |
6 | 6 |
7 .align 64 | 7 .align 64 |
8 .Lzero: | 8 .Lzero: |
9 .long 0,0,0,0 | 9 .long 0,0,0,0 |
10 .Lone: | 10 .Lone: |
11 .long 1,0,0,0 | 11 .long 1,0,0,0 |
12 .Linc: | 12 .Linc: |
13 .long 0,1,2,3 | 13 .long 0,1,2,3 |
14 .Lfour: | 14 .Lfour: |
15 .long 4,4,4,4 | 15 .long 4,4,4,4 |
16 .Lincy: | 16 .Lincy: |
17 .long 0,2,4,6,1,3,5,7 | 17 .long 0,2,4,6,1,3,5,7 |
18 .Leight: | 18 .Leight: |
19 .long 8,8,8,8,8,8,8,8 | 19 .long 8,8,8,8,8,8,8,8 |
20 .Lrot16: | 20 .Lrot16: |
21 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd | 21 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd |
22 .Lrot24: | 22 .Lrot24: |
23 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe | 23 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe |
24 .Lsigma: | 24 .Lsigma: |
25 .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 | 25 .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 |
| 26 .align 64 |
| 27 .Lzeroz: |
| 28 .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 |
| 29 .Lfourz: |
| 30 .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 |
| 31 .Lincz: |
| 32 .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 |
| 33 .Lsixteen: |
| 34 .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 |
26 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,
82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110
,115,115,108,46,111,114,103,62,0 | 35 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,
82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110
,115,115,108,46,111,114,103,62,0 |
27 .globl ChaCha20_ctr32 | 36 .globl ChaCha20_ctr32 |
28 .hidden ChaCha20_ctr32 | 37 .hidden ChaCha20_ctr32 |
29 .type ChaCha20_ctr32,@function | 38 .type ChaCha20_ctr32,@function |
30 .align 64 | 39 .align 64 |
31 ChaCha20_ctr32: | 40 ChaCha20_ctr32: |
32 cmpq $0,%rdx | 41 cmpq $0,%rdx |
33 je .Lno_data | 42 je .Lno_data |
34 movq OPENSSL_ia32cap_P+4(%rip),%r10 | 43 movq OPENSSL_ia32cap_P+4(%rip),%r10 |
35 testl $512,%r10d | 44 testl $512,%r10d |
(...skipping 273 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
309 movdqu (%rcx),%xmm1 | 318 movdqu (%rcx),%xmm1 |
310 movdqu 16(%rcx),%xmm2 | 319 movdqu 16(%rcx),%xmm2 |
311 movdqu (%r8),%xmm3 | 320 movdqu (%r8),%xmm3 |
312 movdqa .Lrot16(%rip),%xmm6 | 321 movdqa .Lrot16(%rip),%xmm6 |
313 movdqa .Lrot24(%rip),%xmm7 | 322 movdqa .Lrot24(%rip),%xmm7 |
314 | 323 |
315 movdqa %xmm0,0(%rsp) | 324 movdqa %xmm0,0(%rsp) |
316 movdqa %xmm1,16(%rsp) | 325 movdqa %xmm1,16(%rsp) |
317 movdqa %xmm2,32(%rsp) | 326 movdqa %xmm2,32(%rsp) |
318 movdqa %xmm3,48(%rsp) | 327 movdqa %xmm3,48(%rsp) |
319 » movl» $10,%ebp | 328 » movq» $10,%r8 |
320 jmp .Loop_ssse3 | 329 jmp .Loop_ssse3 |
321 | 330 |
322 .align 32 | 331 .align 32 |
323 .Loop_outer_ssse3: | 332 .Loop_outer_ssse3: |
324 movdqa .Lone(%rip),%xmm3 | 333 movdqa .Lone(%rip),%xmm3 |
325 movdqa 0(%rsp),%xmm0 | 334 movdqa 0(%rsp),%xmm0 |
326 movdqa 16(%rsp),%xmm1 | 335 movdqa 16(%rsp),%xmm1 |
327 movdqa 32(%rsp),%xmm2 | 336 movdqa 32(%rsp),%xmm2 |
328 paddd 48(%rsp),%xmm3 | 337 paddd 48(%rsp),%xmm3 |
329 » movl» $10,%ebp | 338 » movq» $10,%r8 |
330 movdqa %xmm3,48(%rsp) | 339 movdqa %xmm3,48(%rsp) |
331 jmp .Loop_ssse3 | 340 jmp .Loop_ssse3 |
332 | 341 |
333 .align 32 | 342 .align 32 |
334 .Loop_ssse3: | 343 .Loop_ssse3: |
335 paddd %xmm1,%xmm0 | 344 paddd %xmm1,%xmm0 |
336 pxor %xmm0,%xmm3 | 345 pxor %xmm0,%xmm3 |
337 .byte 102,15,56,0,222 | 346 .byte 102,15,56,0,222 |
338 paddd %xmm3,%xmm2 | 347 paddd %xmm3,%xmm2 |
339 pxor %xmm2,%xmm1 | 348 pxor %xmm2,%xmm1 |
(...skipping 28 matching lines...) Expand all Loading... |
368 .byte 102,15,56,0,223 | 377 .byte 102,15,56,0,223 |
369 paddd %xmm3,%xmm2 | 378 paddd %xmm3,%xmm2 |
370 pxor %xmm2,%xmm1 | 379 pxor %xmm2,%xmm1 |
371 movdqa %xmm1,%xmm4 | 380 movdqa %xmm1,%xmm4 |
372 psrld $25,%xmm1 | 381 psrld $25,%xmm1 |
373 pslld $7,%xmm4 | 382 pslld $7,%xmm4 |
374 por %xmm4,%xmm1 | 383 por %xmm4,%xmm1 |
375 pshufd $78,%xmm2,%xmm2 | 384 pshufd $78,%xmm2,%xmm2 |
376 pshufd $147,%xmm1,%xmm1 | 385 pshufd $147,%xmm1,%xmm1 |
377 pshufd $57,%xmm3,%xmm3 | 386 pshufd $57,%xmm3,%xmm3 |
378 » decl» %ebp | 387 » decq» %r8 |
379 jnz .Loop_ssse3 | 388 jnz .Loop_ssse3 |
380 paddd 0(%rsp),%xmm0 | 389 paddd 0(%rsp),%xmm0 |
381 paddd 16(%rsp),%xmm1 | 390 paddd 16(%rsp),%xmm1 |
382 paddd 32(%rsp),%xmm2 | 391 paddd 32(%rsp),%xmm2 |
383 paddd 48(%rsp),%xmm3 | 392 paddd 48(%rsp),%xmm3 |
384 | 393 |
385 cmpq $64,%rdx | 394 cmpq $64,%rdx |
386 jb .Ltail_ssse3 | 395 jb .Ltail_ssse3 |
387 | 396 |
388 movdqu 0(%rsi),%xmm4 | 397 movdqu 0(%rsi),%xmm4 |
(...skipping 16 matching lines...) Expand all Loading... |
405 jnz .Loop_outer_ssse3 | 414 jnz .Loop_outer_ssse3 |
406 | 415 |
407 jmp .Ldone_ssse3 | 416 jmp .Ldone_ssse3 |
408 | 417 |
409 .align 16 | 418 .align 16 |
410 .Ltail_ssse3: | 419 .Ltail_ssse3: |
411 movdqa %xmm0,0(%rsp) | 420 movdqa %xmm0,0(%rsp) |
412 movdqa %xmm1,16(%rsp) | 421 movdqa %xmm1,16(%rsp) |
413 movdqa %xmm2,32(%rsp) | 422 movdqa %xmm2,32(%rsp) |
414 movdqa %xmm3,48(%rsp) | 423 movdqa %xmm3,48(%rsp) |
415 » xorq» %rbx,%rbx | 424 » xorq» %r8,%r8 |
416 | 425 |
417 .Loop_tail_ssse3: | 426 .Loop_tail_ssse3: |
418 » movzbl» (%rsi,%rbx,1),%eax | 427 » movzbl» (%rsi,%r8,1),%eax |
419 » movzbl» (%rsp,%rbx,1),%ecx | 428 » movzbl» (%rsp,%r8,1),%ecx |
420 » leaq» 1(%rbx),%rbx | 429 » leaq» 1(%r8),%r8 |
421 xorl %ecx,%eax | 430 xorl %ecx,%eax |
422 » movb» %al,-1(%rdi,%rbx,1) | 431 » movb» %al,-1(%rdi,%r8,1) |
423 decq %rdx | 432 decq %rdx |
424 jnz .Loop_tail_ssse3 | 433 jnz .Loop_tail_ssse3 |
425 | 434 |
426 .Ldone_ssse3: | 435 .Ldone_ssse3: |
427 » addq» $64+24,%rsp | 436 » addq» $64+24+48,%rsp |
428 » popq» %r15 | |
429 » popq» %r14 | |
430 » popq» %r13 | |
431 » popq» %r12 | |
432 » popq» %rbp | |
433 » popq» %rbx | |
434 .byte 0xf3,0xc3 | 437 .byte 0xf3,0xc3 |
435 .size ChaCha20_ssse3,.-ChaCha20_ssse3 | 438 .size ChaCha20_ssse3,.-ChaCha20_ssse3 |
436 .type ChaCha20_4x,@function | 439 .type ChaCha20_4x,@function |
437 .align 32 | 440 .align 32 |
438 ChaCha20_4x: | 441 ChaCha20_4x: |
439 .LChaCha20_4x: | 442 .LChaCha20_4x: |
440 movq %r10,%r11 | 443 movq %r10,%r11 |
441 shrq $32,%r10 | 444 shrq $32,%r10 |
442 testq $32,%r10 | 445 testq $32,%r10 |
443 jnz .LChaCha20_8x | 446 jnz .LChaCha20_8x |
(...skipping 1132 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1576 movb %al,-1(%rdi,%r10,1) | 1579 movb %al,-1(%rdi,%r10,1) |
1577 decq %rdx | 1580 decq %rdx |
1578 jnz .Loop_tail8x | 1581 jnz .Loop_tail8x |
1579 | 1582 |
1580 .Ldone8x: | 1583 .Ldone8x: |
1581 vzeroall | 1584 vzeroall |
1582 movq 640(%rsp),%rsp | 1585 movq 640(%rsp),%rsp |
1583 .byte 0xf3,0xc3 | 1586 .byte 0xf3,0xc3 |
1584 .size ChaCha20_8x,.-ChaCha20_8x | 1587 .size ChaCha20_8x,.-ChaCha20_8x |
1585 #endif | 1588 #endif |
OLD | NEW |