Index: third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S |
diff --git a/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S b/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S |
deleted file mode 100644 |
index 6ff6bffb66bb2e2279def64813a9e09d2c432aa8..0000000000000000000000000000000000000000 |
--- a/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S |
+++ /dev/null |
@@ -1,1971 +0,0 @@ |
-#if defined(__aarch64__) |
-#include <openssl/arm_arch.h> |
- |
-.text |
- |
- |
- |
-.align 5 |
-.Lsigma: |
-.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral |
-.Lone: |
-.long 1,0,0,0 |
-.LOPENSSL_armcap_P: |
-#ifdef __ILP32__ |
-.long OPENSSL_armcap_P-. |
-#else |
-.quad OPENSSL_armcap_P-. |
-#endif |
-.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
-.align 2 |
- |
-.globl ChaCha20_ctr32 |
-.hidden ChaCha20_ctr32 |
-.type ChaCha20_ctr32,%function |
-.align 5 |
-ChaCha20_ctr32: |
- cbz x2,.Labort |
- adr x5,.LOPENSSL_armcap_P |
- cmp x2,#192 |
- b.lo .Lshort |
-#ifdef __ILP32__ |
- ldrsw x6,[x5] |
-#else |
- ldr x6,[x5] |
-#endif |
- ldr w17,[x6,x5] |
- tst w17,#ARMV7_NEON |
- b.ne ChaCha20_neon |
- |
-.Lshort: |
- stp x29,x30,[sp,#-96]! |
- add x29,sp,#0 |
- |
- adr x5,.Lsigma |
- stp x19,x20,[sp,#16] |
- stp x21,x22,[sp,#32] |
- stp x23,x24,[sp,#48] |
- stp x25,x26,[sp,#64] |
- stp x27,x28,[sp,#80] |
- sub sp,sp,#64 |
- |
- ldp x22,x23,[x5] // load sigma |
- ldp x24,x25,[x3] // load key |
- ldp x26,x27,[x3,#16] |
- ldp x28,x30,[x4] // load counter |
-#ifdef __ARMEB__ |
- ror x24,x24,#32 |
- ror x25,x25,#32 |
- ror x26,x26,#32 |
- ror x27,x27,#32 |
- ror x28,x28,#32 |
- ror x30,x30,#32 |
-#endif |
- |
-.Loop_outer: |
- mov w5,w22 // unpack key block |
- lsr x6,x22,#32 |
- mov w7,w23 |
- lsr x8,x23,#32 |
- mov w9,w24 |
- lsr x10,x24,#32 |
- mov w11,w25 |
- lsr x12,x25,#32 |
- mov w13,w26 |
- lsr x14,x26,#32 |
- mov w15,w27 |
- lsr x16,x27,#32 |
- mov w17,w28 |
- lsr x19,x28,#32 |
- mov w20,w30 |
- lsr x21,x30,#32 |
- |
- mov x4,#10 |
- subs x2,x2,#64 |
-.Loop: |
- sub x4,x4,#1 |
- add w5,w5,w9 |
- add w6,w6,w10 |
- add w7,w7,w11 |
- add w8,w8,w12 |
- eor w17,w17,w5 |
- eor w19,w19,w6 |
- eor w20,w20,w7 |
- eor w21,w21,w8 |
- ror w17,w17,#16 |
- ror w19,w19,#16 |
- ror w20,w20,#16 |
- ror w21,w21,#16 |
- add w13,w13,w17 |
- add w14,w14,w19 |
- add w15,w15,w20 |
- add w16,w16,w21 |
- eor w9,w9,w13 |
- eor w10,w10,w14 |
- eor w11,w11,w15 |
- eor w12,w12,w16 |
- ror w9,w9,#20 |
- ror w10,w10,#20 |
- ror w11,w11,#20 |
- ror w12,w12,#20 |
- add w5,w5,w9 |
- add w6,w6,w10 |
- add w7,w7,w11 |
- add w8,w8,w12 |
- eor w17,w17,w5 |
- eor w19,w19,w6 |
- eor w20,w20,w7 |
- eor w21,w21,w8 |
- ror w17,w17,#24 |
- ror w19,w19,#24 |
- ror w20,w20,#24 |
- ror w21,w21,#24 |
- add w13,w13,w17 |
- add w14,w14,w19 |
- add w15,w15,w20 |
- add w16,w16,w21 |
- eor w9,w9,w13 |
- eor w10,w10,w14 |
- eor w11,w11,w15 |
- eor w12,w12,w16 |
- ror w9,w9,#25 |
- ror w10,w10,#25 |
- ror w11,w11,#25 |
- ror w12,w12,#25 |
- add w5,w5,w10 |
- add w6,w6,w11 |
- add w7,w7,w12 |
- add w8,w8,w9 |
- eor w21,w21,w5 |
- eor w17,w17,w6 |
- eor w19,w19,w7 |
- eor w20,w20,w8 |
- ror w21,w21,#16 |
- ror w17,w17,#16 |
- ror w19,w19,#16 |
- ror w20,w20,#16 |
- add w15,w15,w21 |
- add w16,w16,w17 |
- add w13,w13,w19 |
- add w14,w14,w20 |
- eor w10,w10,w15 |
- eor w11,w11,w16 |
- eor w12,w12,w13 |
- eor w9,w9,w14 |
- ror w10,w10,#20 |
- ror w11,w11,#20 |
- ror w12,w12,#20 |
- ror w9,w9,#20 |
- add w5,w5,w10 |
- add w6,w6,w11 |
- add w7,w7,w12 |
- add w8,w8,w9 |
- eor w21,w21,w5 |
- eor w17,w17,w6 |
- eor w19,w19,w7 |
- eor w20,w20,w8 |
- ror w21,w21,#24 |
- ror w17,w17,#24 |
- ror w19,w19,#24 |
- ror w20,w20,#24 |
- add w15,w15,w21 |
- add w16,w16,w17 |
- add w13,w13,w19 |
- add w14,w14,w20 |
- eor w10,w10,w15 |
- eor w11,w11,w16 |
- eor w12,w12,w13 |
- eor w9,w9,w14 |
- ror w10,w10,#25 |
- ror w11,w11,#25 |
- ror w12,w12,#25 |
- ror w9,w9,#25 |
- cbnz x4,.Loop |
- |
- add w5,w5,w22 // accumulate key block |
- add x6,x6,x22,lsr#32 |
- add w7,w7,w23 |
- add x8,x8,x23,lsr#32 |
- add w9,w9,w24 |
- add x10,x10,x24,lsr#32 |
- add w11,w11,w25 |
- add x12,x12,x25,lsr#32 |
- add w13,w13,w26 |
- add x14,x14,x26,lsr#32 |
- add w15,w15,w27 |
- add x16,x16,x27,lsr#32 |
- add w17,w17,w28 |
- add x19,x19,x28,lsr#32 |
- add w20,w20,w30 |
- add x21,x21,x30,lsr#32 |
- |
- b.lo .Ltail |
- |
- add x5,x5,x6,lsl#32 // pack |
- add x7,x7,x8,lsl#32 |
- ldp x6,x8,[x1,#0] // load input |
- add x9,x9,x10,lsl#32 |
- add x11,x11,x12,lsl#32 |
- ldp x10,x12,[x1,#16] |
- add x13,x13,x14,lsl#32 |
- add x15,x15,x16,lsl#32 |
- ldp x14,x16,[x1,#32] |
- add x17,x17,x19,lsl#32 |
- add x20,x20,x21,lsl#32 |
- ldp x19,x21,[x1,#48] |
- add x1,x1,#64 |
-#ifdef __ARMEB__ |
- rev x5,x5 |
- rev x7,x7 |
- rev x9,x9 |
- rev x11,x11 |
- rev x13,x13 |
- rev x15,x15 |
- rev x17,x17 |
- rev x20,x20 |
-#endif |
- eor x5,x5,x6 |
- eor x7,x7,x8 |
- eor x9,x9,x10 |
- eor x11,x11,x12 |
- eor x13,x13,x14 |
- eor x15,x15,x16 |
- eor x17,x17,x19 |
- eor x20,x20,x21 |
- |
- stp x5,x7,[x0,#0] // store output |
- add x28,x28,#1 // increment counter |
- stp x9,x11,[x0,#16] |
- stp x13,x15,[x0,#32] |
- stp x17,x20,[x0,#48] |
- add x0,x0,#64 |
- |
- b.hi .Loop_outer |
- |
- ldp x19,x20,[x29,#16] |
- add sp,sp,#64 |
- ldp x21,x22,[x29,#32] |
- ldp x23,x24,[x29,#48] |
- ldp x25,x26,[x29,#64] |
- ldp x27,x28,[x29,#80] |
- ldp x29,x30,[sp],#96 |
-.Labort: |
- ret |
- |
-.align 4 |
-.Ltail: |
- add x2,x2,#64 |
-.Less_than_64: |
- sub x0,x0,#1 |
- add x1,x1,x2 |
- add x0,x0,x2 |
- add x4,sp,x2 |
- neg x2,x2 |
- |
- add x5,x5,x6,lsl#32 // pack |
- add x7,x7,x8,lsl#32 |
- add x9,x9,x10,lsl#32 |
- add x11,x11,x12,lsl#32 |
- add x13,x13,x14,lsl#32 |
- add x15,x15,x16,lsl#32 |
- add x17,x17,x19,lsl#32 |
- add x20,x20,x21,lsl#32 |
-#ifdef __ARMEB__ |
- rev x5,x5 |
- rev x7,x7 |
- rev x9,x9 |
- rev x11,x11 |
- rev x13,x13 |
- rev x15,x15 |
- rev x17,x17 |
- rev x20,x20 |
-#endif |
- stp x5,x7,[sp,#0] |
- stp x9,x11,[sp,#16] |
- stp x13,x15,[sp,#32] |
- stp x17,x20,[sp,#48] |
- |
-.Loop_tail: |
- ldrb w10,[x1,x2] |
- ldrb w11,[x4,x2] |
- add x2,x2,#1 |
- eor w10,w10,w11 |
- strb w10,[x0,x2] |
- cbnz x2,.Loop_tail |
- |
- stp xzr,xzr,[sp,#0] |
- stp xzr,xzr,[sp,#16] |
- stp xzr,xzr,[sp,#32] |
- stp xzr,xzr,[sp,#48] |
- |
- ldp x19,x20,[x29,#16] |
- add sp,sp,#64 |
- ldp x21,x22,[x29,#32] |
- ldp x23,x24,[x29,#48] |
- ldp x25,x26,[x29,#64] |
- ldp x27,x28,[x29,#80] |
- ldp x29,x30,[sp],#96 |
- ret |
-.size ChaCha20_ctr32,.-ChaCha20_ctr32 |
- |
-.type ChaCha20_neon,%function |
-.align 5 |
-ChaCha20_neon: |
- stp x29,x30,[sp,#-96]! |
- add x29,sp,#0 |
- |
- adr x5,.Lsigma |
- stp x19,x20,[sp,#16] |
- stp x21,x22,[sp,#32] |
- stp x23,x24,[sp,#48] |
- stp x25,x26,[sp,#64] |
- stp x27,x28,[sp,#80] |
- cmp x2,#512 |
- b.hs .L512_or_more_neon |
- |
- sub sp,sp,#64 |
- |
- ldp x22,x23,[x5] // load sigma |
- ld1 {v24.4s},[x5],#16 |
- ldp x24,x25,[x3] // load key |
- ldp x26,x27,[x3,#16] |
- ld1 {v25.4s,v26.4s},[x3] |
- ldp x28,x30,[x4] // load counter |
- ld1 {v27.4s},[x4] |
- ld1 {v31.4s},[x5] |
-#ifdef __ARMEB__ |
- rev64 v24.4s,v24.4s |
- ror x24,x24,#32 |
- ror x25,x25,#32 |
- ror x26,x26,#32 |
- ror x27,x27,#32 |
- ror x28,x28,#32 |
- ror x30,x30,#32 |
-#endif |
- add v27.4s,v27.4s,v31.4s // += 1 |
- add v28.4s,v27.4s,v31.4s |
- add v29.4s,v28.4s,v31.4s |
- shl v31.4s,v31.4s,#2 // 1 -> 4 |
- |
-.Loop_outer_neon: |
- mov w5,w22 // unpack key block |
- lsr x6,x22,#32 |
- mov v0.16b,v24.16b |
- mov w7,w23 |
- lsr x8,x23,#32 |
- mov v4.16b,v24.16b |
- mov w9,w24 |
- lsr x10,x24,#32 |
- mov v16.16b,v24.16b |
- mov w11,w25 |
- mov v1.16b,v25.16b |
- lsr x12,x25,#32 |
- mov v5.16b,v25.16b |
- mov w13,w26 |
- mov v17.16b,v25.16b |
- lsr x14,x26,#32 |
- mov v3.16b,v27.16b |
- mov w15,w27 |
- mov v7.16b,v28.16b |
- lsr x16,x27,#32 |
- mov v19.16b,v29.16b |
- mov w17,w28 |
- mov v2.16b,v26.16b |
- lsr x19,x28,#32 |
- mov v6.16b,v26.16b |
- mov w20,w30 |
- mov v18.16b,v26.16b |
- lsr x21,x30,#32 |
- |
- mov x4,#10 |
- subs x2,x2,#256 |
-.Loop_neon: |
- sub x4,x4,#1 |
- add v0.4s,v0.4s,v1.4s |
- add w5,w5,w9 |
- add v4.4s,v4.4s,v5.4s |
- add w6,w6,w10 |
- add v16.4s,v16.4s,v17.4s |
- add w7,w7,w11 |
- eor v3.16b,v3.16b,v0.16b |
- add w8,w8,w12 |
- eor v7.16b,v7.16b,v4.16b |
- eor w17,w17,w5 |
- eor v19.16b,v19.16b,v16.16b |
- eor w19,w19,w6 |
- rev32 v3.8h,v3.8h |
- eor w20,w20,w7 |
- rev32 v7.8h,v7.8h |
- eor w21,w21,w8 |
- rev32 v19.8h,v19.8h |
- ror w17,w17,#16 |
- add v2.4s,v2.4s,v3.4s |
- ror w19,w19,#16 |
- add v6.4s,v6.4s,v7.4s |
- ror w20,w20,#16 |
- add v18.4s,v18.4s,v19.4s |
- ror w21,w21,#16 |
- eor v20.16b,v1.16b,v2.16b |
- add w13,w13,w17 |
- eor v21.16b,v5.16b,v6.16b |
- add w14,w14,w19 |
- eor v22.16b,v17.16b,v18.16b |
- add w15,w15,w20 |
- ushr v1.4s,v20.4s,#20 |
- add w16,w16,w21 |
- ushr v5.4s,v21.4s,#20 |
- eor w9,w9,w13 |
- ushr v17.4s,v22.4s,#20 |
- eor w10,w10,w14 |
- sli v1.4s,v20.4s,#12 |
- eor w11,w11,w15 |
- sli v5.4s,v21.4s,#12 |
- eor w12,w12,w16 |
- sli v17.4s,v22.4s,#12 |
- ror w9,w9,#20 |
- add v0.4s,v0.4s,v1.4s |
- ror w10,w10,#20 |
- add v4.4s,v4.4s,v5.4s |
- ror w11,w11,#20 |
- add v16.4s,v16.4s,v17.4s |
- ror w12,w12,#20 |
- eor v20.16b,v3.16b,v0.16b |
- add w5,w5,w9 |
- eor v21.16b,v7.16b,v4.16b |
- add w6,w6,w10 |
- eor v22.16b,v19.16b,v16.16b |
- add w7,w7,w11 |
- ushr v3.4s,v20.4s,#24 |
- add w8,w8,w12 |
- ushr v7.4s,v21.4s,#24 |
- eor w17,w17,w5 |
- ushr v19.4s,v22.4s,#24 |
- eor w19,w19,w6 |
- sli v3.4s,v20.4s,#8 |
- eor w20,w20,w7 |
- sli v7.4s,v21.4s,#8 |
- eor w21,w21,w8 |
- sli v19.4s,v22.4s,#8 |
- ror w17,w17,#24 |
- add v2.4s,v2.4s,v3.4s |
- ror w19,w19,#24 |
- add v6.4s,v6.4s,v7.4s |
- ror w20,w20,#24 |
- add v18.4s,v18.4s,v19.4s |
- ror w21,w21,#24 |
- eor v20.16b,v1.16b,v2.16b |
- add w13,w13,w17 |
- eor v21.16b,v5.16b,v6.16b |
- add w14,w14,w19 |
- eor v22.16b,v17.16b,v18.16b |
- add w15,w15,w20 |
- ushr v1.4s,v20.4s,#25 |
- add w16,w16,w21 |
- ushr v5.4s,v21.4s,#25 |
- eor w9,w9,w13 |
- ushr v17.4s,v22.4s,#25 |
- eor w10,w10,w14 |
- sli v1.4s,v20.4s,#7 |
- eor w11,w11,w15 |
- sli v5.4s,v21.4s,#7 |
- eor w12,w12,w16 |
- sli v17.4s,v22.4s,#7 |
- ror w9,w9,#25 |
- ext v2.16b,v2.16b,v2.16b,#8 |
- ror w10,w10,#25 |
- ext v6.16b,v6.16b,v6.16b,#8 |
- ror w11,w11,#25 |
- ext v18.16b,v18.16b,v18.16b,#8 |
- ror w12,w12,#25 |
- ext v3.16b,v3.16b,v3.16b,#12 |
- ext v7.16b,v7.16b,v7.16b,#12 |
- ext v19.16b,v19.16b,v19.16b,#12 |
- ext v1.16b,v1.16b,v1.16b,#4 |
- ext v5.16b,v5.16b,v5.16b,#4 |
- ext v17.16b,v17.16b,v17.16b,#4 |
- add v0.4s,v0.4s,v1.4s |
- add w5,w5,w10 |
- add v4.4s,v4.4s,v5.4s |
- add w6,w6,w11 |
- add v16.4s,v16.4s,v17.4s |
- add w7,w7,w12 |
- eor v3.16b,v3.16b,v0.16b |
- add w8,w8,w9 |
- eor v7.16b,v7.16b,v4.16b |
- eor w21,w21,w5 |
- eor v19.16b,v19.16b,v16.16b |
- eor w17,w17,w6 |
- rev32 v3.8h,v3.8h |
- eor w19,w19,w7 |
- rev32 v7.8h,v7.8h |
- eor w20,w20,w8 |
- rev32 v19.8h,v19.8h |
- ror w21,w21,#16 |
- add v2.4s,v2.4s,v3.4s |
- ror w17,w17,#16 |
- add v6.4s,v6.4s,v7.4s |
- ror w19,w19,#16 |
- add v18.4s,v18.4s,v19.4s |
- ror w20,w20,#16 |
- eor v20.16b,v1.16b,v2.16b |
- add w15,w15,w21 |
- eor v21.16b,v5.16b,v6.16b |
- add w16,w16,w17 |
- eor v22.16b,v17.16b,v18.16b |
- add w13,w13,w19 |
- ushr v1.4s,v20.4s,#20 |
- add w14,w14,w20 |
- ushr v5.4s,v21.4s,#20 |
- eor w10,w10,w15 |
- ushr v17.4s,v22.4s,#20 |
- eor w11,w11,w16 |
- sli v1.4s,v20.4s,#12 |
- eor w12,w12,w13 |
- sli v5.4s,v21.4s,#12 |
- eor w9,w9,w14 |
- sli v17.4s,v22.4s,#12 |
- ror w10,w10,#20 |
- add v0.4s,v0.4s,v1.4s |
- ror w11,w11,#20 |
- add v4.4s,v4.4s,v5.4s |
- ror w12,w12,#20 |
- add v16.4s,v16.4s,v17.4s |
- ror w9,w9,#20 |
- eor v20.16b,v3.16b,v0.16b |
- add w5,w5,w10 |
- eor v21.16b,v7.16b,v4.16b |
- add w6,w6,w11 |
- eor v22.16b,v19.16b,v16.16b |
- add w7,w7,w12 |
- ushr v3.4s,v20.4s,#24 |
- add w8,w8,w9 |
- ushr v7.4s,v21.4s,#24 |
- eor w21,w21,w5 |
- ushr v19.4s,v22.4s,#24 |
- eor w17,w17,w6 |
- sli v3.4s,v20.4s,#8 |
- eor w19,w19,w7 |
- sli v7.4s,v21.4s,#8 |
- eor w20,w20,w8 |
- sli v19.4s,v22.4s,#8 |
- ror w21,w21,#24 |
- add v2.4s,v2.4s,v3.4s |
- ror w17,w17,#24 |
- add v6.4s,v6.4s,v7.4s |
- ror w19,w19,#24 |
- add v18.4s,v18.4s,v19.4s |
- ror w20,w20,#24 |
- eor v20.16b,v1.16b,v2.16b |
- add w15,w15,w21 |
- eor v21.16b,v5.16b,v6.16b |
- add w16,w16,w17 |
- eor v22.16b,v17.16b,v18.16b |
- add w13,w13,w19 |
- ushr v1.4s,v20.4s,#25 |
- add w14,w14,w20 |
- ushr v5.4s,v21.4s,#25 |
- eor w10,w10,w15 |
- ushr v17.4s,v22.4s,#25 |
- eor w11,w11,w16 |
- sli v1.4s,v20.4s,#7 |
- eor w12,w12,w13 |
- sli v5.4s,v21.4s,#7 |
- eor w9,w9,w14 |
- sli v17.4s,v22.4s,#7 |
- ror w10,w10,#25 |
- ext v2.16b,v2.16b,v2.16b,#8 |
- ror w11,w11,#25 |
- ext v6.16b,v6.16b,v6.16b,#8 |
- ror w12,w12,#25 |
- ext v18.16b,v18.16b,v18.16b,#8 |
- ror w9,w9,#25 |
- ext v3.16b,v3.16b,v3.16b,#4 |
- ext v7.16b,v7.16b,v7.16b,#4 |
- ext v19.16b,v19.16b,v19.16b,#4 |
- ext v1.16b,v1.16b,v1.16b,#12 |
- ext v5.16b,v5.16b,v5.16b,#12 |
- ext v17.16b,v17.16b,v17.16b,#12 |
- cbnz x4,.Loop_neon |
- |
- add w5,w5,w22 // accumulate key block |
- add v0.4s,v0.4s,v24.4s |
- add x6,x6,x22,lsr#32 |
- add v4.4s,v4.4s,v24.4s |
- add w7,w7,w23 |
- add v16.4s,v16.4s,v24.4s |
- add x8,x8,x23,lsr#32 |
- add v2.4s,v2.4s,v26.4s |
- add w9,w9,w24 |
- add v6.4s,v6.4s,v26.4s |
- add x10,x10,x24,lsr#32 |
- add v18.4s,v18.4s,v26.4s |
- add w11,w11,w25 |
- add v3.4s,v3.4s,v27.4s |
- add x12,x12,x25,lsr#32 |
- add w13,w13,w26 |
- add v7.4s,v7.4s,v28.4s |
- add x14,x14,x26,lsr#32 |
- add w15,w15,w27 |
- add v19.4s,v19.4s,v29.4s |
- add x16,x16,x27,lsr#32 |
- add w17,w17,w28 |
- add v1.4s,v1.4s,v25.4s |
- add x19,x19,x28,lsr#32 |
- add w20,w20,w30 |
- add v5.4s,v5.4s,v25.4s |
- add x21,x21,x30,lsr#32 |
- add v17.4s,v17.4s,v25.4s |
- |
- b.lo .Ltail_neon |
- |
- add x5,x5,x6,lsl#32 // pack |
- add x7,x7,x8,lsl#32 |
- ldp x6,x8,[x1,#0] // load input |
- add x9,x9,x10,lsl#32 |
- add x11,x11,x12,lsl#32 |
- ldp x10,x12,[x1,#16] |
- add x13,x13,x14,lsl#32 |
- add x15,x15,x16,lsl#32 |
- ldp x14,x16,[x1,#32] |
- add x17,x17,x19,lsl#32 |
- add x20,x20,x21,lsl#32 |
- ldp x19,x21,[x1,#48] |
- add x1,x1,#64 |
-#ifdef __ARMEB__ |
- rev x5,x5 |
- rev x7,x7 |
- rev x9,x9 |
- rev x11,x11 |
- rev x13,x13 |
- rev x15,x15 |
- rev x17,x17 |
- rev x20,x20 |
-#endif |
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
- eor x5,x5,x6 |
- eor x7,x7,x8 |
- eor x9,x9,x10 |
- eor x11,x11,x12 |
- eor x13,x13,x14 |
- eor v0.16b,v0.16b,v20.16b |
- eor x15,x15,x16 |
- eor v1.16b,v1.16b,v21.16b |
- eor x17,x17,x19 |
- eor v2.16b,v2.16b,v22.16b |
- eor x20,x20,x21 |
- eor v3.16b,v3.16b,v23.16b |
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
- |
- stp x5,x7,[x0,#0] // store output |
- add x28,x28,#4 // increment counter |
- stp x9,x11,[x0,#16] |
- add v27.4s,v27.4s,v31.4s // += 4 |
- stp x13,x15,[x0,#32] |
- add v28.4s,v28.4s,v31.4s |
- stp x17,x20,[x0,#48] |
- add v29.4s,v29.4s,v31.4s |
- add x0,x0,#64 |
- |
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 |
- |
- eor v4.16b,v4.16b,v20.16b |
- eor v5.16b,v5.16b,v21.16b |
- eor v6.16b,v6.16b,v22.16b |
- eor v7.16b,v7.16b,v23.16b |
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
- |
- eor v16.16b,v16.16b,v0.16b |
- eor v17.16b,v17.16b,v1.16b |
- eor v18.16b,v18.16b,v2.16b |
- eor v19.16b,v19.16b,v3.16b |
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 |
- |
- b.hi .Loop_outer_neon |
- |
- ldp x19,x20,[x29,#16] |
- add sp,sp,#64 |
- ldp x21,x22,[x29,#32] |
- ldp x23,x24,[x29,#48] |
- ldp x25,x26,[x29,#64] |
- ldp x27,x28,[x29,#80] |
- ldp x29,x30,[sp],#96 |
- ret |
- |
-.Ltail_neon: |
- add x2,x2,#256 |
- cmp x2,#64 |
- b.lo .Less_than_64 |
- |
- add x5,x5,x6,lsl#32 // pack |
- add x7,x7,x8,lsl#32 |
- ldp x6,x8,[x1,#0] // load input |
- add x9,x9,x10,lsl#32 |
- add x11,x11,x12,lsl#32 |
- ldp x10,x12,[x1,#16] |
- add x13,x13,x14,lsl#32 |
- add x15,x15,x16,lsl#32 |
- ldp x14,x16,[x1,#32] |
- add x17,x17,x19,lsl#32 |
- add x20,x20,x21,lsl#32 |
- ldp x19,x21,[x1,#48] |
- add x1,x1,#64 |
-#ifdef __ARMEB__ |
- rev x5,x5 |
- rev x7,x7 |
- rev x9,x9 |
- rev x11,x11 |
- rev x13,x13 |
- rev x15,x15 |
- rev x17,x17 |
- rev x20,x20 |
-#endif |
- eor x5,x5,x6 |
- eor x7,x7,x8 |
- eor x9,x9,x10 |
- eor x11,x11,x12 |
- eor x13,x13,x14 |
- eor x15,x15,x16 |
- eor x17,x17,x19 |
- eor x20,x20,x21 |
- |
- stp x5,x7,[x0,#0] // store output |
- add x28,x28,#4 // increment counter |
- stp x9,x11,[x0,#16] |
- stp x13,x15,[x0,#32] |
- stp x17,x20,[x0,#48] |
- add x0,x0,#64 |
- b.eq .Ldone_neon |
- sub x2,x2,#64 |
- cmp x2,#64 |
- b.lo .Less_than_128 |
- |
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
- eor v0.16b,v0.16b,v20.16b |
- eor v1.16b,v1.16b,v21.16b |
- eor v2.16b,v2.16b,v22.16b |
- eor v3.16b,v3.16b,v23.16b |
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
- b.eq .Ldone_neon |
- sub x2,x2,#64 |
- cmp x2,#64 |
- b.lo .Less_than_192 |
- |
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
- eor v4.16b,v4.16b,v20.16b |
- eor v5.16b,v5.16b,v21.16b |
- eor v6.16b,v6.16b,v22.16b |
- eor v7.16b,v7.16b,v23.16b |
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
- b.eq .Ldone_neon |
- sub x2,x2,#64 |
- |
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] |
- b .Last_neon |
- |
-.Less_than_128: |
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] |
- b .Last_neon |
-.Less_than_192: |
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] |
- b .Last_neon |
- |
-.align 4 |
-.Last_neon: |
- sub x0,x0,#1 |
- add x1,x1,x2 |
- add x0,x0,x2 |
- add x4,sp,x2 |
- neg x2,x2 |
- |
-.Loop_tail_neon: |
- ldrb w10,[x1,x2] |
- ldrb w11,[x4,x2] |
- add x2,x2,#1 |
- eor w10,w10,w11 |
- strb w10,[x0,x2] |
- cbnz x2,.Loop_tail_neon |
- |
- stp xzr,xzr,[sp,#0] |
- stp xzr,xzr,[sp,#16] |
- stp xzr,xzr,[sp,#32] |
- stp xzr,xzr,[sp,#48] |
- |
-.Ldone_neon: |
- ldp x19,x20,[x29,#16] |
- add sp,sp,#64 |
- ldp x21,x22,[x29,#32] |
- ldp x23,x24,[x29,#48] |
- ldp x25,x26,[x29,#64] |
- ldp x27,x28,[x29,#80] |
- ldp x29,x30,[sp],#96 |
- ret |
-.size ChaCha20_neon,.-ChaCha20_neon |
-.type ChaCha20_512_neon,%function |
-.align 5 |
-ChaCha20_512_neon: |
- stp x29,x30,[sp,#-96]! |
- add x29,sp,#0 |
- |
- adr x5,.Lsigma |
- stp x19,x20,[sp,#16] |
- stp x21,x22,[sp,#32] |
- stp x23,x24,[sp,#48] |
- stp x25,x26,[sp,#64] |
- stp x27,x28,[sp,#80] |
- |
-.L512_or_more_neon: |
- sub sp,sp,#128+64 |
- |
- ldp x22,x23,[x5] // load sigma |
- ld1 {v24.4s},[x5],#16 |
- ldp x24,x25,[x3] // load key |
- ldp x26,x27,[x3,#16] |
- ld1 {v25.4s,v26.4s},[x3] |
- ldp x28,x30,[x4] // load counter |
- ld1 {v27.4s},[x4] |
- ld1 {v31.4s},[x5] |
-#ifdef __ARMEB__ |
- rev64 v24.4s,v24.4s |
- ror x24,x24,#32 |
- ror x25,x25,#32 |
- ror x26,x26,#32 |
- ror x27,x27,#32 |
- ror x28,x28,#32 |
- ror x30,x30,#32 |
-#endif |
- add v27.4s,v27.4s,v31.4s // += 1 |
- stp q24,q25,[sp,#0] // off-load key block, invariant part |
- add v27.4s,v27.4s,v31.4s // not typo |
- str q26,[sp,#32] |
- add v28.4s,v27.4s,v31.4s |
- add v29.4s,v28.4s,v31.4s |
- add v30.4s,v29.4s,v31.4s |
- shl v31.4s,v31.4s,#2 // 1 -> 4 |
- |
- stp d8,d9,[sp,#128+0] // meet ABI requirements |
- stp d10,d11,[sp,#128+16] |
- stp d12,d13,[sp,#128+32] |
- stp d14,d15,[sp,#128+48] |
- |
- sub x2,x2,#512 // not typo |
- |
-.Loop_outer_512_neon: |
- mov v0.16b,v24.16b |
- mov v4.16b,v24.16b |
- mov v8.16b,v24.16b |
- mov v12.16b,v24.16b |
- mov v16.16b,v24.16b |
- mov v20.16b,v24.16b |
- mov v1.16b,v25.16b |
- mov w5,w22 // unpack key block |
- mov v5.16b,v25.16b |
- lsr x6,x22,#32 |
- mov v9.16b,v25.16b |
- mov w7,w23 |
- mov v13.16b,v25.16b |
- lsr x8,x23,#32 |
- mov v17.16b,v25.16b |
- mov w9,w24 |
- mov v21.16b,v25.16b |
- lsr x10,x24,#32 |
- mov v3.16b,v27.16b |
- mov w11,w25 |
- mov v7.16b,v28.16b |
- lsr x12,x25,#32 |
- mov v11.16b,v29.16b |
- mov w13,w26 |
- mov v15.16b,v30.16b |
- lsr x14,x26,#32 |
- mov v2.16b,v26.16b |
- mov w15,w27 |
- mov v6.16b,v26.16b |
- lsr x16,x27,#32 |
- add v19.4s,v3.4s,v31.4s // +4 |
- mov w17,w28 |
- add v23.4s,v7.4s,v31.4s // +4 |
- lsr x19,x28,#32 |
- mov v10.16b,v26.16b |
- mov w20,w30 |
- mov v14.16b,v26.16b |
- lsr x21,x30,#32 |
- mov v18.16b,v26.16b |
- stp q27,q28,[sp,#48] // off-load key block, variable part |
- mov v22.16b,v26.16b |
- str q29,[sp,#80] |
- |
- mov x4,#5 |
- subs x2,x2,#512 |
-.Loop_upper_neon: |
- sub x4,x4,#1 |
- add v0.4s,v0.4s,v1.4s |
- add w5,w5,w9 |
- add v4.4s,v4.4s,v5.4s |
- add w6,w6,w10 |
- add v8.4s,v8.4s,v9.4s |
- add w7,w7,w11 |
- add v12.4s,v12.4s,v13.4s |
- add w8,w8,w12 |
- add v16.4s,v16.4s,v17.4s |
- eor w17,w17,w5 |
- add v20.4s,v20.4s,v21.4s |
- eor w19,w19,w6 |
- eor v3.16b,v3.16b,v0.16b |
- eor w20,w20,w7 |
- eor v7.16b,v7.16b,v4.16b |
- eor w21,w21,w8 |
- eor v11.16b,v11.16b,v8.16b |
- ror w17,w17,#16 |
- eor v15.16b,v15.16b,v12.16b |
- ror w19,w19,#16 |
- eor v19.16b,v19.16b,v16.16b |
- ror w20,w20,#16 |
- eor v23.16b,v23.16b,v20.16b |
- ror w21,w21,#16 |
- rev32 v3.8h,v3.8h |
- add w13,w13,w17 |
- rev32 v7.8h,v7.8h |
- add w14,w14,w19 |
- rev32 v11.8h,v11.8h |
- add w15,w15,w20 |
- rev32 v15.8h,v15.8h |
- add w16,w16,w21 |
- rev32 v19.8h,v19.8h |
- eor w9,w9,w13 |
- rev32 v23.8h,v23.8h |
- eor w10,w10,w14 |
- add v2.4s,v2.4s,v3.4s |
- eor w11,w11,w15 |
- add v6.4s,v6.4s,v7.4s |
- eor w12,w12,w16 |
- add v10.4s,v10.4s,v11.4s |
- ror w9,w9,#20 |
- add v14.4s,v14.4s,v15.4s |
- ror w10,w10,#20 |
- add v18.4s,v18.4s,v19.4s |
- ror w11,w11,#20 |
- add v22.4s,v22.4s,v23.4s |
- ror w12,w12,#20 |
- eor v24.16b,v1.16b,v2.16b |
- add w5,w5,w9 |
- eor v25.16b,v5.16b,v6.16b |
- add w6,w6,w10 |
- eor v26.16b,v9.16b,v10.16b |
- add w7,w7,w11 |
- eor v27.16b,v13.16b,v14.16b |
- add w8,w8,w12 |
- eor v28.16b,v17.16b,v18.16b |
- eor w17,w17,w5 |
- eor v29.16b,v21.16b,v22.16b |
- eor w19,w19,w6 |
- ushr v1.4s,v24.4s,#20 |
- eor w20,w20,w7 |
- ushr v5.4s,v25.4s,#20 |
- eor w21,w21,w8 |
- ushr v9.4s,v26.4s,#20 |
- ror w17,w17,#24 |
- ushr v13.4s,v27.4s,#20 |
- ror w19,w19,#24 |
- ushr v17.4s,v28.4s,#20 |
- ror w20,w20,#24 |
- ushr v21.4s,v29.4s,#20 |
- ror w21,w21,#24 |
- sli v1.4s,v24.4s,#12 |
- add w13,w13,w17 |
- sli v5.4s,v25.4s,#12 |
- add w14,w14,w19 |
- sli v9.4s,v26.4s,#12 |
- add w15,w15,w20 |
- sli v13.4s,v27.4s,#12 |
- add w16,w16,w21 |
- sli v17.4s,v28.4s,#12 |
- eor w9,w9,w13 |
- sli v21.4s,v29.4s,#12 |
- eor w10,w10,w14 |
- add v0.4s,v0.4s,v1.4s |
- eor w11,w11,w15 |
- add v4.4s,v4.4s,v5.4s |
- eor w12,w12,w16 |
- add v8.4s,v8.4s,v9.4s |
- ror w9,w9,#25 |
- add v12.4s,v12.4s,v13.4s |
- ror w10,w10,#25 |
- add v16.4s,v16.4s,v17.4s |
- ror w11,w11,#25 |
- add v20.4s,v20.4s,v21.4s |
- ror w12,w12,#25 |
- eor v24.16b,v3.16b,v0.16b |
- add w5,w5,w10 |
- eor v25.16b,v7.16b,v4.16b |
- add w6,w6,w11 |
- eor v26.16b,v11.16b,v8.16b |
- add w7,w7,w12 |
- eor v27.16b,v15.16b,v12.16b |
- add w8,w8,w9 |
- eor v28.16b,v19.16b,v16.16b |
- eor w21,w21,w5 |
- eor v29.16b,v23.16b,v20.16b |
- eor w17,w17,w6 |
- ushr v3.4s,v24.4s,#24 |
- eor w19,w19,w7 |
- ushr v7.4s,v25.4s,#24 |
- eor w20,w20,w8 |
- ushr v11.4s,v26.4s,#24 |
- ror w21,w21,#16 |
- ushr v15.4s,v27.4s,#24 |
- ror w17,w17,#16 |
- ushr v19.4s,v28.4s,#24 |
- ror w19,w19,#16 |
- ushr v23.4s,v29.4s,#24 |
- ror w20,w20,#16 |
- sli v3.4s,v24.4s,#8 |
- add w15,w15,w21 |
- sli v7.4s,v25.4s,#8 |
- add w16,w16,w17 |
- sli v11.4s,v26.4s,#8 |
- add w13,w13,w19 |
- sli v15.4s,v27.4s,#8 |
- add w14,w14,w20 |
- sli v19.4s,v28.4s,#8 |
- eor w10,w10,w15 |
- sli v23.4s,v29.4s,#8 |
- eor w11,w11,w16 |
- add v2.4s,v2.4s,v3.4s |
- eor w12,w12,w13 |
- add v6.4s,v6.4s,v7.4s |
- eor w9,w9,w14 |
- add v10.4s,v10.4s,v11.4s |
- ror w10,w10,#20 |
- add v14.4s,v14.4s,v15.4s |
- ror w11,w11,#20 |
- add v18.4s,v18.4s,v19.4s |
- ror w12,w12,#20 |
- add v22.4s,v22.4s,v23.4s |
- ror w9,w9,#20 |
- eor v24.16b,v1.16b,v2.16b |
- add w5,w5,w10 |
- eor v25.16b,v5.16b,v6.16b |
- add w6,w6,w11 |
- eor v26.16b,v9.16b,v10.16b |
- add w7,w7,w12 |
- eor v27.16b,v13.16b,v14.16b |
- add w8,w8,w9 |
- eor v28.16b,v17.16b,v18.16b |
- eor w21,w21,w5 |
- eor v29.16b,v21.16b,v22.16b |
- eor w17,w17,w6 |
- ushr v1.4s,v24.4s,#25 |
- eor w19,w19,w7 |
- ushr v5.4s,v25.4s,#25 |
- eor w20,w20,w8 |
- ushr v9.4s,v26.4s,#25 |
- ror w21,w21,#24 |
- ushr v13.4s,v27.4s,#25 |
- ror w17,w17,#24 |
- ushr v17.4s,v28.4s,#25 |
- ror w19,w19,#24 |
- ushr v21.4s,v29.4s,#25 |
- ror w20,w20,#24 |
- sli v1.4s,v24.4s,#7 |
- add w15,w15,w21 |
- sli v5.4s,v25.4s,#7 |
- add w16,w16,w17 |
- sli v9.4s,v26.4s,#7 |
- add w13,w13,w19 |
- sli v13.4s,v27.4s,#7 |
- add w14,w14,w20 |
- sli v17.4s,v28.4s,#7 |
- eor w10,w10,w15 |
- sli v21.4s,v29.4s,#7 |
- eor w11,w11,w16 |
- ext v2.16b,v2.16b,v2.16b,#8 |
- eor w12,w12,w13 |
- ext v6.16b,v6.16b,v6.16b,#8 |
- eor w9,w9,w14 |
- ext v10.16b,v10.16b,v10.16b,#8 |
- ror w10,w10,#25 |
- ext v14.16b,v14.16b,v14.16b,#8 |
- ror w11,w11,#25 |
- ext v18.16b,v18.16b,v18.16b,#8 |
- ror w12,w12,#25 |
- ext v22.16b,v22.16b,v22.16b,#8 |
- ror w9,w9,#25 |
- ext v3.16b,v3.16b,v3.16b,#12 |
- ext v7.16b,v7.16b,v7.16b,#12 |
- ext v11.16b,v11.16b,v11.16b,#12 |
- ext v15.16b,v15.16b,v15.16b,#12 |
- ext v19.16b,v19.16b,v19.16b,#12 |
- ext v23.16b,v23.16b,v23.16b,#12 |
- ext v1.16b,v1.16b,v1.16b,#4 |
- ext v5.16b,v5.16b,v5.16b,#4 |
- ext v9.16b,v9.16b,v9.16b,#4 |
- ext v13.16b,v13.16b,v13.16b,#4 |
- ext v17.16b,v17.16b,v17.16b,#4 |
- ext v21.16b,v21.16b,v21.16b,#4 |
- add v0.4s,v0.4s,v1.4s |
- add w5,w5,w9 |
- add v4.4s,v4.4s,v5.4s |
- add w6,w6,w10 |
- add v8.4s,v8.4s,v9.4s |
- add w7,w7,w11 |
- add v12.4s,v12.4s,v13.4s |
- add w8,w8,w12 |
- add v16.4s,v16.4s,v17.4s |
- eor w17,w17,w5 |
- add v20.4s,v20.4s,v21.4s |
- eor w19,w19,w6 |
- eor v3.16b,v3.16b,v0.16b |
- eor w20,w20,w7 |
- eor v7.16b,v7.16b,v4.16b |
- eor w21,w21,w8 |
- eor v11.16b,v11.16b,v8.16b |
- ror w17,w17,#16 |
- eor v15.16b,v15.16b,v12.16b |
- ror w19,w19,#16 |
- eor v19.16b,v19.16b,v16.16b |
- ror w20,w20,#16 |
- eor v23.16b,v23.16b,v20.16b |
- ror w21,w21,#16 |
- rev32 v3.8h,v3.8h |
- add w13,w13,w17 |
- rev32 v7.8h,v7.8h |
- add w14,w14,w19 |
- rev32 v11.8h,v11.8h |
- add w15,w15,w20 |
- rev32 v15.8h,v15.8h |
- add w16,w16,w21 |
- rev32 v19.8h,v19.8h |
- eor w9,w9,w13 |
- rev32 v23.8h,v23.8h |
- eor w10,w10,w14 |
- add v2.4s,v2.4s,v3.4s |
- eor w11,w11,w15 |
- add v6.4s,v6.4s,v7.4s |
- eor w12,w12,w16 |
- add v10.4s,v10.4s,v11.4s |
- ror w9,w9,#20 |
- add v14.4s,v14.4s,v15.4s |
- ror w10,w10,#20 |
- add v18.4s,v18.4s,v19.4s |
- ror w11,w11,#20 |
- add v22.4s,v22.4s,v23.4s |
- ror w12,w12,#20 |
- eor v24.16b,v1.16b,v2.16b |
- add w5,w5,w9 |
- eor v25.16b,v5.16b,v6.16b |
- add w6,w6,w10 |
- eor v26.16b,v9.16b,v10.16b |
- add w7,w7,w11 |
- eor v27.16b,v13.16b,v14.16b |
- add w8,w8,w12 |
- eor v28.16b,v17.16b,v18.16b |
- eor w17,w17,w5 |
- eor v29.16b,v21.16b,v22.16b |
- eor w19,w19,w6 |
- ushr v1.4s,v24.4s,#20 |
- eor w20,w20,w7 |
- ushr v5.4s,v25.4s,#20 |
- eor w21,w21,w8 |
- ushr v9.4s,v26.4s,#20 |
- ror w17,w17,#24 |
- ushr v13.4s,v27.4s,#20 |
- ror w19,w19,#24 |
- ushr v17.4s,v28.4s,#20 |
- ror w20,w20,#24 |
- ushr v21.4s,v29.4s,#20 |
- ror w21,w21,#24 |
- sli v1.4s,v24.4s,#12 |
- add w13,w13,w17 |
- sli v5.4s,v25.4s,#12 |
- add w14,w14,w19 |
- sli v9.4s,v26.4s,#12 |
- add w15,w15,w20 |
- sli v13.4s,v27.4s,#12 |
- add w16,w16,w21 |
- sli v17.4s,v28.4s,#12 |
- eor w9,w9,w13 |
- sli v21.4s,v29.4s,#12 |
- eor w10,w10,w14 |
- add v0.4s,v0.4s,v1.4s |
- eor w11,w11,w15 |
- add v4.4s,v4.4s,v5.4s |
- eor w12,w12,w16 |
- add v8.4s,v8.4s,v9.4s |
- ror w9,w9,#25 |
- add v12.4s,v12.4s,v13.4s |
- ror w10,w10,#25 |
- add v16.4s,v16.4s,v17.4s |
- ror w11,w11,#25 |
- add v20.4s,v20.4s,v21.4s |
- ror w12,w12,#25 |
- eor v24.16b,v3.16b,v0.16b |
- add w5,w5,w10 |
- eor v25.16b,v7.16b,v4.16b |
- add w6,w6,w11 |
- eor v26.16b,v11.16b,v8.16b |
- add w7,w7,w12 |
- eor v27.16b,v15.16b,v12.16b |
- add w8,w8,w9 |
- eor v28.16b,v19.16b,v16.16b |
- eor w21,w21,w5 |
- eor v29.16b,v23.16b,v20.16b |
- eor w17,w17,w6 |
- ushr v3.4s,v24.4s,#24 |
- eor w19,w19,w7 |
- ushr v7.4s,v25.4s,#24 |
- eor w20,w20,w8 |
- ushr v11.4s,v26.4s,#24 |
- ror w21,w21,#16 |
- ushr v15.4s,v27.4s,#24 |
- ror w17,w17,#16 |
- ushr v19.4s,v28.4s,#24 |
- ror w19,w19,#16 |
- ushr v23.4s,v29.4s,#24 |
- ror w20,w20,#16 |
- sli v3.4s,v24.4s,#8 |
- add w15,w15,w21 |
- sli v7.4s,v25.4s,#8 |
- add w16,w16,w17 |
- sli v11.4s,v26.4s,#8 |
- add w13,w13,w19 |
- sli v15.4s,v27.4s,#8 |
- add w14,w14,w20 |
- sli v19.4s,v28.4s,#8 |
- eor w10,w10,w15 |
- sli v23.4s,v29.4s,#8 |
- eor w11,w11,w16 |
- add v2.4s,v2.4s,v3.4s |
- eor w12,w12,w13 |
- add v6.4s,v6.4s,v7.4s |
- eor w9,w9,w14 |
- add v10.4s,v10.4s,v11.4s |
- ror w10,w10,#20 |
- add v14.4s,v14.4s,v15.4s |
- ror w11,w11,#20 |
- add v18.4s,v18.4s,v19.4s |
- ror w12,w12,#20 |
- add v22.4s,v22.4s,v23.4s |
- ror w9,w9,#20 |
- eor v24.16b,v1.16b,v2.16b |
- add w5,w5,w10 |
- eor v25.16b,v5.16b,v6.16b |
- add w6,w6,w11 |
- eor v26.16b,v9.16b,v10.16b |
- add w7,w7,w12 |
- eor v27.16b,v13.16b,v14.16b |
- add w8,w8,w9 |
- eor v28.16b,v17.16b,v18.16b |
- eor w21,w21,w5 |
- eor v29.16b,v21.16b,v22.16b |
- eor w17,w17,w6 |
- ushr v1.4s,v24.4s,#25 |
- eor w19,w19,w7 |
- ushr v5.4s,v25.4s,#25 |
- eor w20,w20,w8 |
- ushr v9.4s,v26.4s,#25 |
- ror w21,w21,#24 |
- ushr v13.4s,v27.4s,#25 |
- ror w17,w17,#24 |
- ushr v17.4s,v28.4s,#25 |
- ror w19,w19,#24 |
- ushr v21.4s,v29.4s,#25 |
- ror w20,w20,#24 |
- sli v1.4s,v24.4s,#7 |
- add w15,w15,w21 |
- sli v5.4s,v25.4s,#7 |
- add w16,w16,w17 |
- sli v9.4s,v26.4s,#7 |
- add w13,w13,w19 |
- sli v13.4s,v27.4s,#7 |
- add w14,w14,w20 |
- sli v17.4s,v28.4s,#7 |
- eor w10,w10,w15 |
- sli v21.4s,v29.4s,#7 |
- eor w11,w11,w16 |
- ext v2.16b,v2.16b,v2.16b,#8 |
- eor w12,w12,w13 |
- ext v6.16b,v6.16b,v6.16b,#8 |
- eor w9,w9,w14 |
- ext v10.16b,v10.16b,v10.16b,#8 |
- ror w10,w10,#25 |
- ext v14.16b,v14.16b,v14.16b,#8 |
- ror w11,w11,#25 |
- ext v18.16b,v18.16b,v18.16b,#8 |
- ror w12,w12,#25 |
- ext v22.16b,v22.16b,v22.16b,#8 |
- ror w9,w9,#25 |
- ext v3.16b,v3.16b,v3.16b,#4 |
- ext v7.16b,v7.16b,v7.16b,#4 |
- ext v11.16b,v11.16b,v11.16b,#4 |
- ext v15.16b,v15.16b,v15.16b,#4 |
- ext v19.16b,v19.16b,v19.16b,#4 |
- ext v23.16b,v23.16b,v23.16b,#4 |
- ext v1.16b,v1.16b,v1.16b,#12 |
- ext v5.16b,v5.16b,v5.16b,#12 |
- ext v9.16b,v9.16b,v9.16b,#12 |
- ext v13.16b,v13.16b,v13.16b,#12 |
- ext v17.16b,v17.16b,v17.16b,#12 |
- ext v21.16b,v21.16b,v21.16b,#12 |
- cbnz x4,.Loop_upper_neon |
- |
- add w5,w5,w22 // accumulate key block |
- add x6,x6,x22,lsr#32 |
- add w7,w7,w23 |
- add x8,x8,x23,lsr#32 |
- add w9,w9,w24 |
- add x10,x10,x24,lsr#32 |
- add w11,w11,w25 |
- add x12,x12,x25,lsr#32 |
- add w13,w13,w26 |
- add x14,x14,x26,lsr#32 |
- add w15,w15,w27 |
- add x16,x16,x27,lsr#32 |
- add w17,w17,w28 |
- add x19,x19,x28,lsr#32 |
- add w20,w20,w30 |
- add x21,x21,x30,lsr#32 |
- |
- add x5,x5,x6,lsl#32 // pack |
- add x7,x7,x8,lsl#32 |
- ldp x6,x8,[x1,#0] // load input |
- add x9,x9,x10,lsl#32 |
- add x11,x11,x12,lsl#32 |
- ldp x10,x12,[x1,#16] |
- add x13,x13,x14,lsl#32 |
- add x15,x15,x16,lsl#32 |
- ldp x14,x16,[x1,#32] |
- add x17,x17,x19,lsl#32 |
- add x20,x20,x21,lsl#32 |
- ldp x19,x21,[x1,#48] |
- add x1,x1,#64 |
-#ifdef __ARMEB__ |
- rev x5,x5 |
- rev x7,x7 |
- rev x9,x9 |
- rev x11,x11 |
- rev x13,x13 |
- rev x15,x15 |
- rev x17,x17 |
- rev x20,x20 |
-#endif |
- eor x5,x5,x6 |
- eor x7,x7,x8 |
- eor x9,x9,x10 |
- eor x11,x11,x12 |
- eor x13,x13,x14 |
- eor x15,x15,x16 |
- eor x17,x17,x19 |
- eor x20,x20,x21 |
- |
- stp x5,x7,[x0,#0] // store output |
- add x28,x28,#1 // increment counter |
- mov w5,w22 // unpack key block |
- lsr x6,x22,#32 |
- stp x9,x11,[x0,#16] |
- mov w7,w23 |
- lsr x8,x23,#32 |
- stp x13,x15,[x0,#32] |
- mov w9,w24 |
- lsr x10,x24,#32 |
- stp x17,x20,[x0,#48] |
- add x0,x0,#64 |
- mov w11,w25 |
- lsr x12,x25,#32 |
- mov w13,w26 |
- lsr x14,x26,#32 |
- mov w15,w27 |
- lsr x16,x27,#32 |
- mov w17,w28 |
- lsr x19,x28,#32 |
- mov w20,w30 |
- lsr x21,x30,#32 |
- |
// "Lower" round loop: x4 counts five iterations (decremented at the top,
// tested by the cbnz at the bottom). Every iteration interleaves, instruction
// by instruction, two independent streams:
//   * scalar, on w5-w21: two full double rounds (column round followed by
//     diagonal round, twice);
//   * NEON, on six 4-register states (v0-v3, v4-v7, ..., v20-v23): one column
//     round, lane rotation into diagonal form, one diagonal round, and the
//     inverse lane rotation.
// Rotate idioms: scalar "ror #16/#20/#24/#25" equal left-rotates by
// 16/12/8/7. NEON uses rev32 on .8h lanes for the rotate by 16 and
// eor-into-temp + ushr + sli for 12/8/7. v24-v29 serve as those temporaries,
// so whatever they held on entry to the loop is overwritten.
- mov x4,#5 |
-.Loop_lower_neon: |
- sub x4,x4,#1 |
- add v0.4s,v0.4s,v1.4s |
- add w5,w5,w9 |
- add v4.4s,v4.4s,v5.4s |
- add w6,w6,w10 |
- add v8.4s,v8.4s,v9.4s |
- add w7,w7,w11 |
- add v12.4s,v12.4s,v13.4s |
- add w8,w8,w12 |
- add v16.4s,v16.4s,v17.4s |
- eor w17,w17,w5 |
- add v20.4s,v20.4s,v21.4s |
- eor w19,w19,w6 |
- eor v3.16b,v3.16b,v0.16b |
- eor w20,w20,w7 |
- eor v7.16b,v7.16b,v4.16b |
- eor w21,w21,w8 |
- eor v11.16b,v11.16b,v8.16b |
- ror w17,w17,#16 |
- eor v15.16b,v15.16b,v12.16b |
- ror w19,w19,#16 |
- eor v19.16b,v19.16b,v16.16b |
- ror w20,w20,#16 |
- eor v23.16b,v23.16b,v20.16b |
- ror w21,w21,#16 |
- rev32 v3.8h,v3.8h |
- add w13,w13,w17 |
- rev32 v7.8h,v7.8h |
- add w14,w14,w19 |
- rev32 v11.8h,v11.8h |
- add w15,w15,w20 |
- rev32 v15.8h,v15.8h |
- add w16,w16,w21 |
- rev32 v19.8h,v19.8h |
- eor w9,w9,w13 |
- rev32 v23.8h,v23.8h |
- eor w10,w10,w14 |
- add v2.4s,v2.4s,v3.4s |
- eor w11,w11,w15 |
- add v6.4s,v6.4s,v7.4s |
- eor w12,w12,w16 |
- add v10.4s,v10.4s,v11.4s |
- ror w9,w9,#20 |
- add v14.4s,v14.4s,v15.4s |
- ror w10,w10,#20 |
- add v18.4s,v18.4s,v19.4s |
- ror w11,w11,#20 |
- add v22.4s,v22.4s,v23.4s |
- ror w12,w12,#20 |
- eor v24.16b,v1.16b,v2.16b |
- add w5,w5,w9 |
- eor v25.16b,v5.16b,v6.16b |
- add w6,w6,w10 |
- eor v26.16b,v9.16b,v10.16b |
- add w7,w7,w11 |
- eor v27.16b,v13.16b,v14.16b |
- add w8,w8,w12 |
- eor v28.16b,v17.16b,v18.16b |
- eor w17,w17,w5 |
- eor v29.16b,v21.16b,v22.16b |
- eor w19,w19,w6 |
- ushr v1.4s,v24.4s,#20 |
- eor w20,w20,w7 |
- ushr v5.4s,v25.4s,#20 |
- eor w21,w21,w8 |
- ushr v9.4s,v26.4s,#20 |
- ror w17,w17,#24 |
- ushr v13.4s,v27.4s,#20 |
- ror w19,w19,#24 |
- ushr v17.4s,v28.4s,#20 |
- ror w20,w20,#24 |
- ushr v21.4s,v29.4s,#20 |
- ror w21,w21,#24 |
- sli v1.4s,v24.4s,#12 |
- add w13,w13,w17 |
- sli v5.4s,v25.4s,#12 |
- add w14,w14,w19 |
- sli v9.4s,v26.4s,#12 |
- add w15,w15,w20 |
- sli v13.4s,v27.4s,#12 |
- add w16,w16,w21 |
- sli v17.4s,v28.4s,#12 |
- eor w9,w9,w13 |
- sli v21.4s,v29.4s,#12 |
- eor w10,w10,w14 |
- add v0.4s,v0.4s,v1.4s |
- eor w11,w11,w15 |
- add v4.4s,v4.4s,v5.4s |
- eor w12,w12,w16 |
- add v8.4s,v8.4s,v9.4s |
- ror w9,w9,#25 |
- add v12.4s,v12.4s,v13.4s |
- ror w10,w10,#25 |
- add v16.4s,v16.4s,v17.4s |
- ror w11,w11,#25 |
- add v20.4s,v20.4s,v21.4s |
- ror w12,w12,#25 |
- eor v24.16b,v3.16b,v0.16b |
- add w5,w5,w10 |
- eor v25.16b,v7.16b,v4.16b |
- add w6,w6,w11 |
- eor v26.16b,v11.16b,v8.16b |
- add w7,w7,w12 |
- eor v27.16b,v15.16b,v12.16b |
- add w8,w8,w9 |
- eor v28.16b,v19.16b,v16.16b |
- eor w21,w21,w5 |
- eor v29.16b,v23.16b,v20.16b |
- eor w17,w17,w6 |
- ushr v3.4s,v24.4s,#24 |
- eor w19,w19,w7 |
- ushr v7.4s,v25.4s,#24 |
- eor w20,w20,w8 |
- ushr v11.4s,v26.4s,#24 |
- ror w21,w21,#16 |
- ushr v15.4s,v27.4s,#24 |
- ror w17,w17,#16 |
- ushr v19.4s,v28.4s,#24 |
- ror w19,w19,#16 |
- ushr v23.4s,v29.4s,#24 |
- ror w20,w20,#16 |
- sli v3.4s,v24.4s,#8 |
- add w15,w15,w21 |
- sli v7.4s,v25.4s,#8 |
- add w16,w16,w17 |
- sli v11.4s,v26.4s,#8 |
- add w13,w13,w19 |
- sli v15.4s,v27.4s,#8 |
- add w14,w14,w20 |
- sli v19.4s,v28.4s,#8 |
- eor w10,w10,w15 |
- sli v23.4s,v29.4s,#8 |
- eor w11,w11,w16 |
- add v2.4s,v2.4s,v3.4s |
- eor w12,w12,w13 |
- add v6.4s,v6.4s,v7.4s |
- eor w9,w9,w14 |
- add v10.4s,v10.4s,v11.4s |
- ror w10,w10,#20 |
- add v14.4s,v14.4s,v15.4s |
- ror w11,w11,#20 |
- add v18.4s,v18.4s,v19.4s |
- ror w12,w12,#20 |
- add v22.4s,v22.4s,v23.4s |
- ror w9,w9,#20 |
- eor v24.16b,v1.16b,v2.16b |
- add w5,w5,w10 |
- eor v25.16b,v5.16b,v6.16b |
- add w6,w6,w11 |
- eor v26.16b,v9.16b,v10.16b |
- add w7,w7,w12 |
- eor v27.16b,v13.16b,v14.16b |
- add w8,w8,w9 |
- eor v28.16b,v17.16b,v18.16b |
- eor w21,w21,w5 |
- eor v29.16b,v21.16b,v22.16b |
- eor w17,w17,w6 |
- ushr v1.4s,v24.4s,#25 |
- eor w19,w19,w7 |
- ushr v5.4s,v25.4s,#25 |
- eor w20,w20,w8 |
- ushr v9.4s,v26.4s,#25 |
- ror w21,w21,#24 |
- ushr v13.4s,v27.4s,#25 |
- ror w17,w17,#24 |
- ushr v17.4s,v28.4s,#25 |
- ror w19,w19,#24 |
- ushr v21.4s,v29.4s,#25 |
- ror w20,w20,#24 |
- sli v1.4s,v24.4s,#7 |
- add w15,w15,w21 |
- sli v5.4s,v25.4s,#7 |
- add w16,w16,w17 |
- sli v9.4s,v26.4s,#7 |
- add w13,w13,w19 |
- sli v13.4s,v27.4s,#7 |
- add w14,w14,w20 |
- sli v17.4s,v28.4s,#7 |
- eor w10,w10,w15 |
- sli v21.4s,v29.4s,#7 |
- eor w11,w11,w16 |
- ext v2.16b,v2.16b,v2.16b,#8 |
- eor w12,w12,w13 |
- ext v6.16b,v6.16b,v6.16b,#8 |
- eor w9,w9,w14 |
- ext v10.16b,v10.16b,v10.16b,#8 |
- ror w10,w10,#25 |
- ext v14.16b,v14.16b,v14.16b,#8 |
- ror w11,w11,#25 |
- ext v18.16b,v18.16b,v18.16b,#8 |
- ror w12,w12,#25 |
- ext v22.16b,v22.16b,v22.16b,#8 |
- ror w9,w9,#25 |
// NEON: the remaining lane rotations (12 bytes for v3/v7/.../v23, 4 bytes for
// v1/v5/.../v21) complete the move to diagonal form. The scalar stream has
// finished its first double round at this point.
- ext v3.16b,v3.16b,v3.16b,#12 |
- ext v7.16b,v7.16b,v7.16b,#12 |
- ext v11.16b,v11.16b,v11.16b,#12 |
- ext v15.16b,v15.16b,v15.16b,#12 |
- ext v19.16b,v19.16b,v19.16b,#12 |
- ext v23.16b,v23.16b,v23.16b,#12 |
- ext v1.16b,v1.16b,v1.16b,#4 |
- ext v5.16b,v5.16b,v5.16b,#4 |
- ext v9.16b,v9.16b,v9.16b,#4 |
- ext v13.16b,v13.16b,v13.16b,#4 |
- ext v17.16b,v17.16b,v17.16b,#4 |
- ext v21.16b,v21.16b,v21.16b,#4 |
// Second half of the iteration: scalar double round no. 2, NEON diagonal round.
- add v0.4s,v0.4s,v1.4s |
- add w5,w5,w9 |
- add v4.4s,v4.4s,v5.4s |
- add w6,w6,w10 |
- add v8.4s,v8.4s,v9.4s |
- add w7,w7,w11 |
- add v12.4s,v12.4s,v13.4s |
- add w8,w8,w12 |
- add v16.4s,v16.4s,v17.4s |
- eor w17,w17,w5 |
- add v20.4s,v20.4s,v21.4s |
- eor w19,w19,w6 |
- eor v3.16b,v3.16b,v0.16b |
- eor w20,w20,w7 |
- eor v7.16b,v7.16b,v4.16b |
- eor w21,w21,w8 |
- eor v11.16b,v11.16b,v8.16b |
- ror w17,w17,#16 |
- eor v15.16b,v15.16b,v12.16b |
- ror w19,w19,#16 |
- eor v19.16b,v19.16b,v16.16b |
- ror w20,w20,#16 |
- eor v23.16b,v23.16b,v20.16b |
- ror w21,w21,#16 |
- rev32 v3.8h,v3.8h |
- add w13,w13,w17 |
- rev32 v7.8h,v7.8h |
- add w14,w14,w19 |
- rev32 v11.8h,v11.8h |
- add w15,w15,w20 |
- rev32 v15.8h,v15.8h |
- add w16,w16,w21 |
- rev32 v19.8h,v19.8h |
- eor w9,w9,w13 |
- rev32 v23.8h,v23.8h |
- eor w10,w10,w14 |
- add v2.4s,v2.4s,v3.4s |
- eor w11,w11,w15 |
- add v6.4s,v6.4s,v7.4s |
- eor w12,w12,w16 |
- add v10.4s,v10.4s,v11.4s |
- ror w9,w9,#20 |
- add v14.4s,v14.4s,v15.4s |
- ror w10,w10,#20 |
- add v18.4s,v18.4s,v19.4s |
- ror w11,w11,#20 |
- add v22.4s,v22.4s,v23.4s |
- ror w12,w12,#20 |
- eor v24.16b,v1.16b,v2.16b |
- add w5,w5,w9 |
- eor v25.16b,v5.16b,v6.16b |
- add w6,w6,w10 |
- eor v26.16b,v9.16b,v10.16b |
- add w7,w7,w11 |
- eor v27.16b,v13.16b,v14.16b |
- add w8,w8,w12 |
- eor v28.16b,v17.16b,v18.16b |
- eor w17,w17,w5 |
- eor v29.16b,v21.16b,v22.16b |
- eor w19,w19,w6 |
- ushr v1.4s,v24.4s,#20 |
- eor w20,w20,w7 |
- ushr v5.4s,v25.4s,#20 |
- eor w21,w21,w8 |
- ushr v9.4s,v26.4s,#20 |
- ror w17,w17,#24 |
- ushr v13.4s,v27.4s,#20 |
- ror w19,w19,#24 |
- ushr v17.4s,v28.4s,#20 |
- ror w20,w20,#24 |
- ushr v21.4s,v29.4s,#20 |
- ror w21,w21,#24 |
- sli v1.4s,v24.4s,#12 |
- add w13,w13,w17 |
- sli v5.4s,v25.4s,#12 |
- add w14,w14,w19 |
- sli v9.4s,v26.4s,#12 |
- add w15,w15,w20 |
- sli v13.4s,v27.4s,#12 |
- add w16,w16,w21 |
- sli v17.4s,v28.4s,#12 |
- eor w9,w9,w13 |
- sli v21.4s,v29.4s,#12 |
- eor w10,w10,w14 |
- add v0.4s,v0.4s,v1.4s |
- eor w11,w11,w15 |
- add v4.4s,v4.4s,v5.4s |
- eor w12,w12,w16 |
- add v8.4s,v8.4s,v9.4s |
- ror w9,w9,#25 |
- add v12.4s,v12.4s,v13.4s |
- ror w10,w10,#25 |
- add v16.4s,v16.4s,v17.4s |
- ror w11,w11,#25 |
- add v20.4s,v20.4s,v21.4s |
- ror w12,w12,#25 |
- eor v24.16b,v3.16b,v0.16b |
- add w5,w5,w10 |
- eor v25.16b,v7.16b,v4.16b |
- add w6,w6,w11 |
- eor v26.16b,v11.16b,v8.16b |
- add w7,w7,w12 |
- eor v27.16b,v15.16b,v12.16b |
- add w8,w8,w9 |
- eor v28.16b,v19.16b,v16.16b |
- eor w21,w21,w5 |
- eor v29.16b,v23.16b,v20.16b |
- eor w17,w17,w6 |
- ushr v3.4s,v24.4s,#24 |
- eor w19,w19,w7 |
- ushr v7.4s,v25.4s,#24 |
- eor w20,w20,w8 |
- ushr v11.4s,v26.4s,#24 |
- ror w21,w21,#16 |
- ushr v15.4s,v27.4s,#24 |
- ror w17,w17,#16 |
- ushr v19.4s,v28.4s,#24 |
- ror w19,w19,#16 |
- ushr v23.4s,v29.4s,#24 |
- ror w20,w20,#16 |
- sli v3.4s,v24.4s,#8 |
- add w15,w15,w21 |
- sli v7.4s,v25.4s,#8 |
- add w16,w16,w17 |
- sli v11.4s,v26.4s,#8 |
- add w13,w13,w19 |
- sli v15.4s,v27.4s,#8 |
- add w14,w14,w20 |
- sli v19.4s,v28.4s,#8 |
- eor w10,w10,w15 |
- sli v23.4s,v29.4s,#8 |
- eor w11,w11,w16 |
- add v2.4s,v2.4s,v3.4s |
- eor w12,w12,w13 |
- add v6.4s,v6.4s,v7.4s |
- eor w9,w9,w14 |
- add v10.4s,v10.4s,v11.4s |
- ror w10,w10,#20 |
- add v14.4s,v14.4s,v15.4s |
- ror w11,w11,#20 |
- add v18.4s,v18.4s,v19.4s |
- ror w12,w12,#20 |
- add v22.4s,v22.4s,v23.4s |
- ror w9,w9,#20 |
- eor v24.16b,v1.16b,v2.16b |
- add w5,w5,w10 |
- eor v25.16b,v5.16b,v6.16b |
- add w6,w6,w11 |
- eor v26.16b,v9.16b,v10.16b |
- add w7,w7,w12 |
- eor v27.16b,v13.16b,v14.16b |
- add w8,w8,w9 |
- eor v28.16b,v17.16b,v18.16b |
- eor w21,w21,w5 |
- eor v29.16b,v21.16b,v22.16b |
- eor w17,w17,w6 |
- ushr v1.4s,v24.4s,#25 |
- eor w19,w19,w7 |
- ushr v5.4s,v25.4s,#25 |
- eor w20,w20,w8 |
- ushr v9.4s,v26.4s,#25 |
- ror w21,w21,#24 |
- ushr v13.4s,v27.4s,#25 |
- ror w17,w17,#24 |
- ushr v17.4s,v28.4s,#25 |
- ror w19,w19,#24 |
- ushr v21.4s,v29.4s,#25 |
- ror w20,w20,#24 |
- sli v1.4s,v24.4s,#7 |
- add w15,w15,w21 |
- sli v5.4s,v25.4s,#7 |
- add w16,w16,w17 |
- sli v9.4s,v26.4s,#7 |
- add w13,w13,w19 |
- sli v13.4s,v27.4s,#7 |
- add w14,w14,w20 |
- sli v17.4s,v28.4s,#7 |
- eor w10,w10,w15 |
- sli v21.4s,v29.4s,#7 |
- eor w11,w11,w16 |
- ext v2.16b,v2.16b,v2.16b,#8 |
- eor w12,w12,w13 |
- ext v6.16b,v6.16b,v6.16b,#8 |
- eor w9,w9,w14 |
- ext v10.16b,v10.16b,v10.16b,#8 |
- ror w10,w10,#25 |
- ext v14.16b,v14.16b,v14.16b,#8 |
- ror w11,w11,#25 |
- ext v18.16b,v18.16b,v18.16b,#8 |
- ror w12,w12,#25 |
- ext v22.16b,v22.16b,v22.16b,#8 |
- ror w9,w9,#25 |
// NEON: the opposite rotations (4 bytes for v3/v7/.../v23, 12 bytes for
// v1/v5/.../v21) put the rows back in column form for the next iteration.
- ext v3.16b,v3.16b,v3.16b,#4 |
- ext v7.16b,v7.16b,v7.16b,#4 |
- ext v11.16b,v11.16b,v11.16b,#4 |
- ext v15.16b,v15.16b,v15.16b,#4 |
- ext v19.16b,v19.16b,v19.16b,#4 |
- ext v23.16b,v23.16b,v23.16b,#4 |
- ext v1.16b,v1.16b,v1.16b,#12 |
- ext v5.16b,v5.16b,v5.16b,#12 |
- ext v9.16b,v9.16b,v9.16b,#12 |
- ext v13.16b,v13.16b,v13.16b,#12 |
- ext v17.16b,v17.16b,v17.16b,#12 |
- ext v21.16b,v21.16b,v21.16b,#12 |
- cbnz x4,.Loop_lower_neon |
- |
// Rounds are finished for the second scalar block and for all six NEON
// blocks. Add the input state back in, XOR with the input, and store.
// Scalar and NEON instructions stay interleaved.
//
// v24-v29 were used as scratch inside the round loop, so six vectors are
// reloaded from the stack save area at sp+0..95 first. After the reload:
//   v24 is added to v0/v4/.../v20,
//   v25 to v1/v5/.../v21,
//   v26 to v2/v6/.../v22,
// and the last register of each state gets its own addend: v27, v28, v29,
// v30 for the first four states, and v31 ("+4") plus v27 / v28 for the
// fifth and sixth.
- add w5,w5,w22 // accumulate key block |
- ldp q24,q25,[sp,#0] |
- add x6,x6,x22,lsr#32 |
- ldp q26,q27,[sp,#32] |
- add w7,w7,w23 |
- ldp q28,q29,[sp,#64] |
- add x8,x8,x23,lsr#32 |
- add v0.4s,v0.4s,v24.4s |
- add w9,w9,w24 |
- add v4.4s,v4.4s,v24.4s |
- add x10,x10,x24,lsr#32 |
- add v8.4s,v8.4s,v24.4s |
- add w11,w11,w25 |
- add v12.4s,v12.4s,v24.4s |
- add x12,x12,x25,lsr#32 |
- add v16.4s,v16.4s,v24.4s |
- add w13,w13,w26 |
- add v20.4s,v20.4s,v24.4s |
- add x14,x14,x26,lsr#32 |
- add v2.4s,v2.4s,v26.4s |
- add w15,w15,w27 |
- add v6.4s,v6.4s,v26.4s |
- add x16,x16,x27,lsr#32 |
- add v10.4s,v10.4s,v26.4s |
- add w17,w17,w28 |
- add v14.4s,v14.4s,v26.4s |
- add x19,x19,x28,lsr#32 |
- add v18.4s,v18.4s,v26.4s |
- add w20,w20,w30 |
- add v22.4s,v22.4s,v26.4s |
- add x21,x21,x30,lsr#32 |
- add v19.4s,v19.4s,v31.4s // +4 |
- add x5,x5,x6,lsl#32 // pack |
- add v23.4s,v23.4s,v31.4s // +4 |
- add x7,x7,x8,lsl#32 |
- add v3.4s,v3.4s,v27.4s |
- ldp x6,x8,[x1,#0] // load input |
- add v7.4s,v7.4s,v28.4s |
- add x9,x9,x10,lsl#32 |
- add v11.4s,v11.4s,v29.4s |
- add x11,x11,x12,lsl#32 |
- add v15.4s,v15.4s,v30.4s |
- ldp x10,x12,[x1,#16] |
- add v19.4s,v19.4s,v27.4s |
- add x13,x13,x14,lsl#32 |
- add v23.4s,v23.4s,v28.4s |
- add x15,x15,x16,lsl#32 |
- add v1.4s,v1.4s,v25.4s |
- ldp x14,x16,[x1,#32] |
- add v5.4s,v5.4s,v25.4s |
- add x17,x17,x19,lsl#32 |
- add v9.4s,v9.4s,v25.4s |
- add x20,x20,x21,lsl#32 |
- add v13.4s,v13.4s,v25.4s |
- ldp x19,x21,[x1,#48] |
- add v17.4s,v17.4s,v25.4s |
- add x1,x1,#64 |
- add v21.4s,v21.4s,v25.4s |
- |
// Big-endian builds only: byte-reverse the scalar keystream registers.
-#ifdef __ARMEB__ |
- rev x5,x5 |
- rev x7,x7 |
- rev x9,x9 |
- rev x11,x11 |
- rev x13,x13 |
- rev x15,x15 |
- rev x17,x17 |
- rev x20,x20 |
-#endif |
// From here on v24-v27 (and later v0-v15, once their own data has been
// stored) double as input buffers. Every ld1/st1 is post-indexed by 64, so
// x1 and x0 advance one block at a time.
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 |
- eor x5,x5,x6 |
- eor x7,x7,x8 |
- eor x9,x9,x10 |
- eor x11,x11,x12 |
- eor x13,x13,x14 |
- eor v0.16b,v0.16b,v24.16b |
- eor x15,x15,x16 |
- eor v1.16b,v1.16b,v25.16b |
- eor x17,x17,x19 |
- eor v2.16b,v2.16b,v26.16b |
- eor x20,x20,x21 |
- eor v3.16b,v3.16b,v27.16b |
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 |
- |
// Scalar block out (64 bytes), then the first NEON block. x28 is advanced by
// 7 here; with the +1 applied after the first scalar block, that makes 8 per
// pass through the outer loop: two scalar blocks plus six NEON blocks.
- stp x5,x7,[x0,#0] // store output |
- add x28,x28,#7 // increment counter |
- stp x9,x11,[x0,#16] |
- stp x13,x15,[x0,#32] |
- stp x17,x20,[x0,#48] |
- add x0,x0,#64 |
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
- |
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 |
- eor v4.16b,v4.16b,v24.16b |
- eor v5.16b,v5.16b,v25.16b |
- eor v6.16b,v6.16b,v26.16b |
- eor v7.16b,v7.16b,v27.16b |
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
- |
// v24-v27 are no longer needed as input buffers. They are reloaded from the
// stack save area, in between the remaining XORs.
- ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 |
- eor v8.16b,v8.16b,v0.16b |
- ldp q24,q25,[sp,#0] |
- eor v9.16b,v9.16b,v1.16b |
- ldp q26,q27,[sp,#32] |
- eor v10.16b,v10.16b,v2.16b |
- eor v11.16b,v11.16b,v3.16b |
- st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 |
- |
- ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 |
- eor v12.16b,v12.16b,v4.16b |
- eor v13.16b,v13.16b,v5.16b |
- eor v14.16b,v14.16b,v6.16b |
- eor v15.16b,v15.16b,v7.16b |
- st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 |
- |
- ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 |
- eor v16.16b,v16.16b,v8.16b |
- eor v17.16b,v17.16b,v9.16b |
- eor v18.16b,v18.16b,v10.16b |
- eor v19.16b,v19.16b,v11.16b |
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 |
- |
// v0 = v31 << 1. The existing annotations give v31 as 4, so v0 is 8: the
// per-pass step added to v27-v30 below.
- shl v0.4s,v31.4s,#1 // 4 -> 8 |
- eor v20.16b,v20.16b,v12.16b |
- eor v21.16b,v21.16b,v13.16b |
- eor v22.16b,v22.16b,v14.16b |
- eor v23.16b,v23.16b,v15.16b |
- st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 |
- |
- add v27.4s,v27.4s,v0.4s // += 8 |
- add v28.4s,v28.4s,v0.4s |
- add v29.4s,v29.4s,v0.4s |
- add v30.4s,v30.4s,v0.4s |
- |
// No instruction between the start of this excerpt and this branch writes the
// condition flags, so b.hs tests flags produced earlier in the outer loop.
// NOTE(review): presumably a "subs" on the remaining length x2 at the loop
// head - confirm against the part of the function not shown here.
- b.hs .Loop_outer_512_neon |
- |
// Fewer than 512 bytes remain. Add 512 back to x2 and record whether the
// result is exactly zero; none of the vector, ldp or stp instructions before
// the b.eq below alter the flags.
// NOTE(review): this presumably undoes a 512-byte subtraction made at the
// head of the outer loop (outside this excerpt) - confirm.
- adds x2,x2,#512 |
// v0 = v31 >> 2; with v31 annotated as 4, that is 1.
- ushr v0.4s,v31.4s,#2 // 4 -> 1 |
- |
// d8-d15 are the callee-saved low halves of v8-v15 under AAPCS64. They are
// reloaded from the save slots at sp+128..191.
- ldp d8,d9,[sp,#128+0] // meet ABI requirements |
- ldp d10,d11,[sp,#128+16] |
- ldp d12,d13,[sp,#128+32] |
- ldp d14,d15,[sp,#128+48] |
- |
// Overwrite the 96 bytes at sp+0..95 (the area the q24-q29 reloads read from)
// with copies of q24 and q31.
- stp q24,q31,[sp,#0] // wipe off-load area |
- stp q24,q31,[sp,#32] |
- stp q24,q31,[sp,#64] |
- |
- b.eq .Ldone_512_neon |
- |
// Data remains. Step v27-v29 back by one, release 128 bytes of stack, and
// continue at .Loop_outer_neon (defined outside this excerpt) when at least
// 192 bytes are left. The flags from the cmp survive the vector subs and the
// non-flag-setting add.
- cmp x2,#192 |
- sub v27.4s,v27.4s,v0.4s // -= 1 |
- sub v28.4s,v28.4s,v0.4s |
- sub v29.4s,v29.4s,v0.4s |
- add sp,sp,#128 |
- b.hs .Loop_outer_neon |
- |
// Under 192 bytes: zero v25-v30, then hand the remainder to the scalar loop
// at .Loop_outer.
// NOTE(review): the zeroing presumably scrubs key/counter material from the
// vector registers before the scalar-only path - confirm intent.
- eor v25.16b,v25.16b,v25.16b |
- eor v26.16b,v26.16b,v26.16b |
- eor v27.16b,v27.16b,v27.16b |
- eor v28.16b,v28.16b,v28.16b |
- eor v29.16b,v29.16b,v29.16b |
- eor v30.16b,v30.16b,v30.16b |
- b .Loop_outer |
- |
// Exit path for ChaCha20_512_neon, reached when no input remains.
// Reloads x19-x28 from offsets 16..95 of the frame addressed by x29, releases
// the 128+64 bytes of scratch/save space, then pops x29/x30 together with the
// 96-byte frame and returns.
// NOTE(review): assumes the prologue (not visible in this excerpt) set x29 to
// the base of that 96-byte frame and allocated 128+64 bytes below it -
// confirm.
-.Ldone_512_neon: |
- ldp x19,x20,[x29,#16] |
- add sp,sp,#128+64 |
- ldp x21,x22,[x29,#32] |
- ldp x23,x24,[x29,#48] |
- ldp x25,x26,[x29,#64] |
- ldp x27,x28,[x29,#80] |
- ldp x29,x30,[sp],#96 |
- ret |
-.size ChaCha20_512_neon,.-ChaCha20_512_neon |
-#endif |