Index: third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S |
diff --git a/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S b/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S |
new file mode 100644 |
index 0000000000000000000000000000000000000000..6ff6bffb66bb2e2279def64813a9e09d2c432aa8 |
--- /dev/null |
+++ b/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S |
@@ -0,0 +1,1971 @@ |
+#if defined(__aarch64__) |
+#include <openssl/arm_arch.h> |
+ |
+.text |
+ |
+ |
+ |
+.align 5 | // constant pool used by all ChaCha20 entry points in this file
+.Lsigma: | // first 16 bytes of the ChaCha state (row 0)
+.quad 0x3320646e61707865,0x6b20657479622d32 // ASCII "expand 32-byte k" reading each value from its least significant byte; endian-neutral |
+.Lone: | // must stay 16 bytes after .Lsigma: the NEON code reaches it by post-incrementing the .Lsigma pointer
+.long 1,0,0,0 | // vector {1,0,0,0}: added to the counter row to bump its first 32-bit word
+.LOPENSSL_armcap_P: | // PC-relative offset to OPENSSL_armcap_P, resolved at run time in ChaCha20_ctr32
+#ifdef __ILP32__ |
+.long OPENSSL_armcap_P-. |
+#else |
+.quad OPENSSL_armcap_P-. |
+#endif |
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 | // NUL-terminated ASCII credit string beginning "ChaCha20 for ARMv8, CRYPTOGAMS by"
+.align 2 | // restore 4-byte alignment after the odd-length string
+ |
+.globl ChaCha20_ctr32 | // ChaCha20_ctr32: XOR x2 bytes read from [x1] with ChaCha20 keystream and write them to [x0]
+.hidden ChaCha20_ctr32 | // In: x0 = output, x1 = input, x2 = length in bytes, x3 = 32 bytes of key, x4 = 16 bytes of counter data (state words 12-15)
+.type ChaCha20_ctr32,%function | // Lengths of 192 bytes or more go to ChaCha20_neon when the ARMV7_NEON capability bit is set
+.align 5 | // Saves and restores x19-x28, x29, x30; clobbers x0-x17 and flags
+ChaCha20_ctr32: |
+ cbz x2,.Labort | // zero length: return immediately
+ adr x5,.LOPENSSL_armcap_P | // x5 = address of the stored offset to OPENSSL_armcap_P
+ cmp x2,#192 |
+ b.lo .Lshort | // fewer than 192 bytes: stay on the scalar path without probing capabilities
+#ifdef __ILP32__ |
+ ldrsw x6,[x5] | // x6 = sign-extended 32-bit offset
+#else |
+ ldr x6,[x5] | // x6 = 64-bit offset
+#endif |
+ ldr w17,[x6,x5] | // w17 = word at (offset + address of offset) = OPENSSL_armcap_P
+ tst w17,#ARMV7_NEON |
+ b.ne ChaCha20_neon | // NEON bit set: branch away with x0-x4 still untouched
+ |
+.Lshort: |
+ stp x29,x30,[sp,#-96]! | // 96-byte frame: x29/x30 plus x19-x28
+ add x29,sp,#0 | // x29 = frame pointer; the epilogues restore through it
+ |
+ adr x5,.Lsigma |
+ stp x19,x20,[sp,#16] |
+ stp x21,x22,[sp,#32] |
+ stp x23,x24,[sp,#48] |
+ stp x25,x26,[sp,#64] |
+ stp x27,x28,[sp,#80] |
+ sub sp,sp,#64 | // 64-byte scratch area, written only on the partial-block tail path
+ |
+ ldp x22,x23,[x5] // load sigma |
+ ldp x24,x25,[x3] // load key |
+ ldp x26,x27,[x3,#16] |
+ ldp x28,x30,[x4] // load counter (x30 is used as a data register; its saved copy is reloaded on exit) |
+#ifdef __ARMEB__ |
+ ror x24,x24,#32 | // big-endian build: swap the two 32-bit halves of each loaded pair
+ ror x25,x25,#32 |
+ ror x26,x26,#32 |
+ ror x27,x27,#32 |
+ ror x28,x28,#32 |
+ ror x30,x30,#32 |
+#endif |
+ |
+.Loop_outer: | // one 64-byte block per iteration
+ mov w5,w22 // unpack key block: 8 packed 64-bit pairs become 16 32-bit state words |
+ lsr x6,x22,#32 |
+ mov w7,w23 |
+ lsr x8,x23,#32 |
+ mov w9,w24 |
+ lsr x10,x24,#32 |
+ mov w11,w25 |
+ lsr x12,x25,#32 |
+ mov w13,w26 |
+ lsr x14,x26,#32 |
+ mov w15,w27 |
+ lsr x16,x27,#32 |
+ mov w17,w28 |
+ lsr x19,x28,#32 |
+ mov w20,w30 |
+ lsr x21,x30,#32 |
+ |
+ mov x4,#10 | // 10 iterations of two rounds each = 20 rounds
+ subs x2,x2,#64 | // flags are consumed by b.lo/b.hi far below; nothing in between sets flags
+.Loop: |
+ sub x4,x4,#1 |
+ add w5,w5,w9 | // column round on (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21): a += b
+ add w6,w6,w10 |
+ add w7,w7,w11 |
+ add w8,w8,w12 |
+ eor w17,w17,w5 | // d ^= a
+ eor w19,w19,w6 |
+ eor w20,w20,w7 |
+ eor w21,w21,w8 |
+ ror w17,w17,#16 | // d = rotate by 16
+ ror w19,w19,#16 |
+ ror w20,w20,#16 |
+ ror w21,w21,#16 |
+ add w13,w13,w17 | // c += d
+ add w14,w14,w19 |
+ add w15,w15,w20 |
+ add w16,w16,w21 |
+ eor w9,w9,w13 | // b ^= c
+ eor w10,w10,w14 |
+ eor w11,w11,w15 |
+ eor w12,w12,w16 |
+ ror w9,w9,#20 | // b = rotate left by 12 (ror #20)
+ ror w10,w10,#20 |
+ ror w11,w11,#20 |
+ ror w12,w12,#20 |
+ add w5,w5,w9 | // a += b
+ add w6,w6,w10 |
+ add w7,w7,w11 |
+ add w8,w8,w12 |
+ eor w17,w17,w5 | // d ^= a
+ eor w19,w19,w6 |
+ eor w20,w20,w7 |
+ eor w21,w21,w8 |
+ ror w17,w17,#24 | // d = rotate left by 8 (ror #24)
+ ror w19,w19,#24 |
+ ror w20,w20,#24 |
+ ror w21,w21,#24 |
+ add w13,w13,w17 | // c += d
+ add w14,w14,w19 |
+ add w15,w15,w20 |
+ add w16,w16,w21 |
+ eor w9,w9,w13 | // b ^= c
+ eor w10,w10,w14 |
+ eor w11,w11,w15 |
+ eor w12,w12,w16 |
+ ror w9,w9,#25 | // b = rotate left by 7 (ror #25)
+ ror w10,w10,#25 |
+ ror w11,w11,#25 |
+ ror w12,w12,#25 |
+ add w5,w5,w10 | // diagonal round: same steps on (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20)
+ add w6,w6,w11 |
+ add w7,w7,w12 |
+ add w8,w8,w9 |
+ eor w21,w21,w5 |
+ eor w17,w17,w6 |
+ eor w19,w19,w7 |
+ eor w20,w20,w8 |
+ ror w21,w21,#16 |
+ ror w17,w17,#16 |
+ ror w19,w19,#16 |
+ ror w20,w20,#16 |
+ add w15,w15,w21 |
+ add w16,w16,w17 |
+ add w13,w13,w19 |
+ add w14,w14,w20 |
+ eor w10,w10,w15 |
+ eor w11,w11,w16 |
+ eor w12,w12,w13 |
+ eor w9,w9,w14 |
+ ror w10,w10,#20 |
+ ror w11,w11,#20 |
+ ror w12,w12,#20 |
+ ror w9,w9,#20 |
+ add w5,w5,w10 |
+ add w6,w6,w11 |
+ add w7,w7,w12 |
+ add w8,w8,w9 |
+ eor w21,w21,w5 |
+ eor w17,w17,w6 |
+ eor w19,w19,w7 |
+ eor w20,w20,w8 |
+ ror w21,w21,#24 |
+ ror w17,w17,#24 |
+ ror w19,w19,#24 |
+ ror w20,w20,#24 |
+ add w15,w15,w21 |
+ add w16,w16,w17 |
+ add w13,w13,w19 |
+ add w14,w14,w20 |
+ eor w10,w10,w15 |
+ eor w11,w11,w16 |
+ eor w12,w12,w13 |
+ eor w9,w9,w14 |
+ ror w10,w10,#25 |
+ ror w11,w11,#25 |
+ ror w12,w12,#25 |
+ ror w9,w9,#25 |
+ cbnz x4,.Loop | // cbnz leaves the flags from subs intact
+ |
+ add w5,w5,w22 // accumulate key block: add the input state back onto the permuted state |
+ add x6,x6,x22,lsr#32 | // 64-bit add of the high word; a carry into bit 32 is discarded by the lsl#32 when packing
+ add w7,w7,w23 |
+ add x8,x8,x23,lsr#32 |
+ add w9,w9,w24 |
+ add x10,x10,x24,lsr#32 |
+ add w11,w11,w25 |
+ add x12,x12,x25,lsr#32 |
+ add w13,w13,w26 |
+ add x14,x14,x26,lsr#32 |
+ add w15,w15,w27 |
+ add x16,x16,x27,lsr#32 |
+ add w17,w17,w28 |
+ add x19,x19,x28,lsr#32 |
+ add w20,w20,w30 |
+ add x21,x21,x30,lsr#32 |
+ |
+ b.lo .Ltail | // subs borrowed: fewer than 64 bytes were left, handle a partial block
+ |
+ add x5,x5,x6,lsl#32 // pack |
+ add x7,x7,x8,lsl#32 |
+ ldp x6,x8,[x1,#0] // load input |
+ add x9,x9,x10,lsl#32 |
+ add x11,x11,x12,lsl#32 |
+ ldp x10,x12,[x1,#16] |
+ add x13,x13,x14,lsl#32 |
+ add x15,x15,x16,lsl#32 |
+ ldp x14,x16,[x1,#32] |
+ add x17,x17,x19,lsl#32 |
+ add x20,x20,x21,lsl#32 |
+ ldp x19,x21,[x1,#48] |
+ add x1,x1,#64 |
+#ifdef __ARMEB__ |
+ rev x5,x5 | // big-endian build: byte-reverse each packed keystream pair before XORing with the loaded input
+ rev x7,x7 |
+ rev x9,x9 |
+ rev x11,x11 |
+ rev x13,x13 |
+ rev x15,x15 |
+ rev x17,x17 |
+ rev x20,x20 |
+#endif |
+ eor x5,x5,x6 | // keystream XOR input
+ eor x7,x7,x8 |
+ eor x9,x9,x10 |
+ eor x11,x11,x12 |
+ eor x13,x13,x14 |
+ eor x15,x15,x16 |
+ eor x17,x17,x19 |
+ eor x20,x20,x21 |
+ |
+ stp x5,x7,[x0,#0] // store output |
+ add x28,x28,#1 // increment counter (low 32 bits of x28 hold state word 12) |
+ stp x9,x11,[x0,#16] |
+ stp x13,x15,[x0,#32] |
+ stp x17,x20,[x0,#48] |
+ add x0,x0,#64 |
+ |
+ b.hi .Loop_outer | // still the flags from subs: more bytes remain
+ |
+ ldp x19,x20,[x29,#16] | // epilogue: reload callee-saved registers through the frame pointer
+ add sp,sp,#64 | // release the scratch area
+ ldp x21,x22,[x29,#32] |
+ ldp x23,x24,[x29,#48] |
+ ldp x25,x26,[x29,#64] |
+ ldp x27,x28,[x29,#80] |
+ ldp x29,x30,[sp],#96 |
+.Labort: |
+ ret |
+ |
+.align 4 |
+.Ltail: |
+ add x2,x2,#64 | // undo the subs: x2 = remaining byte count, 1..63
+.Less_than_64: | // also entered from ChaCha20_neon's tail with x2 below 64 and the scalar block still unpacked
+ sub x0,x0,#1 | // bias the output pointer by -1: the store below indexes with x2 after it was incremented
+ add x1,x1,x2 | // x1 = end of input
+ add x0,x0,x2 |
+ add x4,sp,x2 | // x4 = end of the used part of the keystream copy on the stack
+ neg x2,x2 | // x2 = -remaining; the byte loop counts it up to zero
+ |
+ add x5,x5,x6,lsl#32 // pack |
+ add x7,x7,x8,lsl#32 |
+ add x9,x9,x10,lsl#32 |
+ add x11,x11,x12,lsl#32 |
+ add x13,x13,x14,lsl#32 |
+ add x15,x15,x16,lsl#32 |
+ add x17,x17,x19,lsl#32 |
+ add x20,x20,x21,lsl#32 |
+#ifdef __ARMEB__ |
+ rev x5,x5 |
+ rev x7,x7 |
+ rev x9,x9 |
+ rev x11,x11 |
+ rev x13,x13 |
+ rev x15,x15 |
+ rev x17,x17 |
+ rev x20,x20 |
+#endif |
+ stp x5,x7,[sp,#0] | // spill the whole keystream block to the stack scratch
+ stp x9,x11,[sp,#16] |
+ stp x13,x15,[sp,#32] |
+ stp x17,x20,[sp,#48] |
+ |
+.Loop_tail: |
+ ldrb w10,[x1,x2] | // input byte
+ ldrb w11,[x4,x2] | // keystream byte
+ add x2,x2,#1 |
+ eor w10,w10,w11 |
+ strb w10,[x0,x2] | // x0 carries the -1 bias, so this lands on the matching output byte
+ cbnz x2,.Loop_tail |
+ |
+ stp xzr,xzr,[sp,#0] | // wipe the keystream copy from the stack
+ stp xzr,xzr,[sp,#16] |
+ stp xzr,xzr,[sp,#32] |
+ stp xzr,xzr,[sp,#48] |
+ |
+ ldp x19,x20,[x29,#16] | // same epilogue as the full-block path
+ add sp,sp,#64 |
+ ldp x21,x22,[x29,#32] |
+ ldp x23,x24,[x29,#48] |
+ ldp x25,x26,[x29,#64] |
+ ldp x27,x28,[x29,#80] |
+ ldp x29,x30,[sp],#96 |
+ ret |
+.size ChaCha20_ctr32,.-ChaCha20_ctr32 |
+ |
+.type ChaCha20_neon,%function | // ChaCha20_neon: NEON + scalar path, reached by branch from ChaCha20_ctr32 with the same arguments (x0 out, x1 in, x2 length, x3 key, x4 counter)
+.align 5 | // Each outer iteration produces 256 bytes: one block in scalar registers plus three blocks in v0-v3, v4-v7 and v16-v19
+ChaCha20_neon: | // Saves and restores x19-x28, x29, x30; lengths of 512 or more continue at .L512_or_more_neon
+ stp x29,x30,[sp,#-96]! | // same 96-byte frame layout as ChaCha20_ctr32
+ add x29,sp,#0 |
+ |
+ adr x5,.Lsigma |
+ stp x19,x20,[sp,#16] |
+ stp x21,x22,[sp,#32] |
+ stp x23,x24,[sp,#48] |
+ stp x25,x26,[sp,#64] |
+ stp x27,x28,[sp,#80] |
+ cmp x2,#512 |
+ b.hs .L512_or_more_neon | // 512 bytes or more: join ChaCha20_512_neon after its identical prologue
+ |
+ sub sp,sp,#64 | // 64-byte scratch area for a partial final block
+ |
+ ldp x22,x23,[x5] // load sigma |
+ ld1 {v24.4s},[x5],#16 | // v24 = sigma; x5 advances by 16 to .Lone
+ ldp x24,x25,[x3] // load key |
+ ldp x26,x27,[x3,#16] |
+ ld1 {v25.4s,v26.4s},[x3] | // v25,v26 = key
+ ldp x28,x30,[x4] // load counter |
+ ld1 {v27.4s},[x4] | // v27 = counter row
+ ld1 {v31.4s},[x5] | // v31 = {1,0,0,0}
+#ifdef __ARMEB__ |
+ rev64 v24.4s,v24.4s |
+ ror x24,x24,#32 |
+ ror x25,x25,#32 |
+ ror x26,x26,#32 |
+ ror x27,x27,#32 |
+ ror x28,x28,#32 |
+ ror x30,x30,#32 |
+#endif |
+ add v27.4s,v27.4s,v31.4s // += 1: first NEON block uses counter+1, the scalar block uses the unmodified counter |
+ add v28.4s,v27.4s,v31.4s | // second NEON block: counter+2
+ add v29.4s,v28.4s,v31.4s | // third NEON block: counter+3
+ shl v31.4s,v31.4s,#2 // 1 -> 4 |
+ |
+.Loop_outer_neon: |
+ mov w5,w22 // unpack key block |
+ lsr x6,x22,#32 |
+ mov v0.16b,v24.16b | // NEON blocks hold state rows 0..3 in (v0,v1,v2,v3), (v4,v5,v6,v7) and (v16,v17,v18,v19)
+ mov w7,w23 |
+ lsr x8,x23,#32 |
+ mov v4.16b,v24.16b |
+ mov w9,w24 |
+ lsr x10,x24,#32 |
+ mov v16.16b,v24.16b |
+ mov w11,w25 |
+ mov v1.16b,v25.16b |
+ lsr x12,x25,#32 |
+ mov v5.16b,v25.16b |
+ mov w13,w26 |
+ mov v17.16b,v25.16b |
+ lsr x14,x26,#32 |
+ mov v3.16b,v27.16b |
+ mov w15,w27 |
+ mov v7.16b,v28.16b |
+ lsr x16,x27,#32 |
+ mov v19.16b,v29.16b |
+ mov w17,w28 |
+ mov v2.16b,v26.16b |
+ lsr x19,x28,#32 |
+ mov v6.16b,v26.16b |
+ mov w20,w30 |
+ mov v18.16b,v26.16b |
+ lsr x21,x30,#32 |
+ |
+ mov x4,#10 | // 10 iterations of two rounds each = 20 rounds
+ subs x2,x2,#256 | // flags are consumed by b.lo .Ltail_neon and b.hi .Loop_outer_neon; nothing in between sets flags
+.Loop_neon: |
+ sub x4,x4,#1 |
+ add v0.4s,v0.4s,v1.4s | // NEON and scalar instructions are interleaved; the two streams work on separate blocks
+ add w5,w5,w9 |
+ add v4.4s,v4.4s,v5.4s |
+ add w6,w6,w10 |
+ add v16.4s,v16.4s,v17.4s |
+ add w7,w7,w11 |
+ eor v3.16b,v3.16b,v0.16b |
+ add w8,w8,w12 |
+ eor v7.16b,v7.16b,v4.16b |
+ eor w17,w17,w5 |
+ eor v19.16b,v19.16b,v16.16b |
+ eor w19,w19,w6 |
+ rev32 v3.8h,v3.8h | // swapping the 16-bit halves of each 32-bit lane = rotate by 16
+ eor w20,w20,w7 |
+ rev32 v7.8h,v7.8h |
+ eor w21,w21,w8 |
+ rev32 v19.8h,v19.8h |
+ ror w17,w17,#16 |
+ add v2.4s,v2.4s,v3.4s |
+ ror w19,w19,#16 |
+ add v6.4s,v6.4s,v7.4s |
+ ror w20,w20,#16 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w21,w21,#16 |
+ eor v20.16b,v1.16b,v2.16b | // v20-v22 are temporaries for the two-instruction rotates
+ add w13,w13,w17 |
+ eor v21.16b,v5.16b,v6.16b |
+ add w14,w14,w19 |
+ eor v22.16b,v17.16b,v18.16b |
+ add w15,w15,w20 |
+ ushr v1.4s,v20.4s,#20 | // ushr #20 followed by sli #12 = rotate left by 12
+ add w16,w16,w21 |
+ ushr v5.4s,v21.4s,#20 |
+ eor w9,w9,w13 |
+ ushr v17.4s,v22.4s,#20 |
+ eor w10,w10,w14 |
+ sli v1.4s,v20.4s,#12 |
+ eor w11,w11,w15 |
+ sli v5.4s,v21.4s,#12 |
+ eor w12,w12,w16 |
+ sli v17.4s,v22.4s,#12 |
+ ror w9,w9,#20 |
+ add v0.4s,v0.4s,v1.4s |
+ ror w10,w10,#20 |
+ add v4.4s,v4.4s,v5.4s |
+ ror w11,w11,#20 |
+ add v16.4s,v16.4s,v17.4s |
+ ror w12,w12,#20 |
+ eor v20.16b,v3.16b,v0.16b |
+ add w5,w5,w9 |
+ eor v21.16b,v7.16b,v4.16b |
+ add w6,w6,w10 |
+ eor v22.16b,v19.16b,v16.16b |
+ add w7,w7,w11 |
+ ushr v3.4s,v20.4s,#24 | // ushr #24 followed by sli #8 = rotate left by 8
+ add w8,w8,w12 |
+ ushr v7.4s,v21.4s,#24 |
+ eor w17,w17,w5 |
+ ushr v19.4s,v22.4s,#24 |
+ eor w19,w19,w6 |
+ sli v3.4s,v20.4s,#8 |
+ eor w20,w20,w7 |
+ sli v7.4s,v21.4s,#8 |
+ eor w21,w21,w8 |
+ sli v19.4s,v22.4s,#8 |
+ ror w17,w17,#24 |
+ add v2.4s,v2.4s,v3.4s |
+ ror w19,w19,#24 |
+ add v6.4s,v6.4s,v7.4s |
+ ror w20,w20,#24 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w21,w21,#24 |
+ eor v20.16b,v1.16b,v2.16b |
+ add w13,w13,w17 |
+ eor v21.16b,v5.16b,v6.16b |
+ add w14,w14,w19 |
+ eor v22.16b,v17.16b,v18.16b |
+ add w15,w15,w20 |
+ ushr v1.4s,v20.4s,#25 | // ushr #25 followed by sli #7 = rotate left by 7
+ add w16,w16,w21 |
+ ushr v5.4s,v21.4s,#25 |
+ eor w9,w9,w13 |
+ ushr v17.4s,v22.4s,#25 |
+ eor w10,w10,w14 |
+ sli v1.4s,v20.4s,#7 |
+ eor w11,w11,w15 |
+ sli v5.4s,v21.4s,#7 |
+ eor w12,w12,w16 |
+ sli v17.4s,v22.4s,#7 |
+ ror w9,w9,#25 |
+ ext v2.16b,v2.16b,v2.16b,#8 | // rotate row 2 by two lanes, row 3 by three (#12), row 1 by one (#4): columns become diagonals
+ ror w10,w10,#25 |
+ ext v6.16b,v6.16b,v6.16b,#8 |
+ ror w11,w11,#25 |
+ ext v18.16b,v18.16b,v18.16b,#8 |
+ ror w12,w12,#25 |
+ ext v3.16b,v3.16b,v3.16b,#12 |
+ ext v7.16b,v7.16b,v7.16b,#12 |
+ ext v19.16b,v19.16b,v19.16b,#12 |
+ ext v1.16b,v1.16b,v1.16b,#4 |
+ ext v5.16b,v5.16b,v5.16b,#4 |
+ ext v17.16b,v17.16b,v17.16b,#4 |
+ add v0.4s,v0.4s,v1.4s | // second half: the same sequence applied to the diagonals
+ add w5,w5,w10 |
+ add v4.4s,v4.4s,v5.4s |
+ add w6,w6,w11 |
+ add v16.4s,v16.4s,v17.4s |
+ add w7,w7,w12 |
+ eor v3.16b,v3.16b,v0.16b |
+ add w8,w8,w9 |
+ eor v7.16b,v7.16b,v4.16b |
+ eor w21,w21,w5 |
+ eor v19.16b,v19.16b,v16.16b |
+ eor w17,w17,w6 |
+ rev32 v3.8h,v3.8h |
+ eor w19,w19,w7 |
+ rev32 v7.8h,v7.8h |
+ eor w20,w20,w8 |
+ rev32 v19.8h,v19.8h |
+ ror w21,w21,#16 |
+ add v2.4s,v2.4s,v3.4s |
+ ror w17,w17,#16 |
+ add v6.4s,v6.4s,v7.4s |
+ ror w19,w19,#16 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w20,w20,#16 |
+ eor v20.16b,v1.16b,v2.16b |
+ add w15,w15,w21 |
+ eor v21.16b,v5.16b,v6.16b |
+ add w16,w16,w17 |
+ eor v22.16b,v17.16b,v18.16b |
+ add w13,w13,w19 |
+ ushr v1.4s,v20.4s,#20 |
+ add w14,w14,w20 |
+ ushr v5.4s,v21.4s,#20 |
+ eor w10,w10,w15 |
+ ushr v17.4s,v22.4s,#20 |
+ eor w11,w11,w16 |
+ sli v1.4s,v20.4s,#12 |
+ eor w12,w12,w13 |
+ sli v5.4s,v21.4s,#12 |
+ eor w9,w9,w14 |
+ sli v17.4s,v22.4s,#12 |
+ ror w10,w10,#20 |
+ add v0.4s,v0.4s,v1.4s |
+ ror w11,w11,#20 |
+ add v4.4s,v4.4s,v5.4s |
+ ror w12,w12,#20 |
+ add v16.4s,v16.4s,v17.4s |
+ ror w9,w9,#20 |
+ eor v20.16b,v3.16b,v0.16b |
+ add w5,w5,w10 |
+ eor v21.16b,v7.16b,v4.16b |
+ add w6,w6,w11 |
+ eor v22.16b,v19.16b,v16.16b |
+ add w7,w7,w12 |
+ ushr v3.4s,v20.4s,#24 |
+ add w8,w8,w9 |
+ ushr v7.4s,v21.4s,#24 |
+ eor w21,w21,w5 |
+ ushr v19.4s,v22.4s,#24 |
+ eor w17,w17,w6 |
+ sli v3.4s,v20.4s,#8 |
+ eor w19,w19,w7 |
+ sli v7.4s,v21.4s,#8 |
+ eor w20,w20,w8 |
+ sli v19.4s,v22.4s,#8 |
+ ror w21,w21,#24 |
+ add v2.4s,v2.4s,v3.4s |
+ ror w17,w17,#24 |
+ add v6.4s,v6.4s,v7.4s |
+ ror w19,w19,#24 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w20,w20,#24 |
+ eor v20.16b,v1.16b,v2.16b |
+ add w15,w15,w21 |
+ eor v21.16b,v5.16b,v6.16b |
+ add w16,w16,w17 |
+ eor v22.16b,v17.16b,v18.16b |
+ add w13,w13,w19 |
+ ushr v1.4s,v20.4s,#25 |
+ add w14,w14,w20 |
+ ushr v5.4s,v21.4s,#25 |
+ eor w10,w10,w15 |
+ ushr v17.4s,v22.4s,#25 |
+ eor w11,w11,w16 |
+ sli v1.4s,v20.4s,#7 |
+ eor w12,w12,w13 |
+ sli v5.4s,v21.4s,#7 |
+ eor w9,w9,w14 |
+ sli v17.4s,v22.4s,#7 |
+ ror w10,w10,#25 |
+ ext v2.16b,v2.16b,v2.16b,#8 | // rotate the rows back (#8, #4, #12): diagonals become columns again
+ ror w11,w11,#25 |
+ ext v6.16b,v6.16b,v6.16b,#8 |
+ ror w12,w12,#25 |
+ ext v18.16b,v18.16b,v18.16b,#8 |
+ ror w9,w9,#25 |
+ ext v3.16b,v3.16b,v3.16b,#4 |
+ ext v7.16b,v7.16b,v7.16b,#4 |
+ ext v19.16b,v19.16b,v19.16b,#4 |
+ ext v1.16b,v1.16b,v1.16b,#12 |
+ ext v5.16b,v5.16b,v5.16b,#12 |
+ ext v17.16b,v17.16b,v17.16b,#12 |
+ cbnz x4,.Loop_neon |
+ |
+ add w5,w5,w22 // accumulate key block: add the input state back, scalar and NEON blocks alike |
+ add v0.4s,v0.4s,v24.4s |
+ add x6,x6,x22,lsr#32 |
+ add v4.4s,v4.4s,v24.4s |
+ add w7,w7,w23 |
+ add v16.4s,v16.4s,v24.4s |
+ add x8,x8,x23,lsr#32 |
+ add v2.4s,v2.4s,v26.4s |
+ add w9,w9,w24 |
+ add v6.4s,v6.4s,v26.4s |
+ add x10,x10,x24,lsr#32 |
+ add v18.4s,v18.4s,v26.4s |
+ add w11,w11,w25 |
+ add v3.4s,v3.4s,v27.4s | // each NEON block adds back its own counter row (v27, v28, v29)
+ add x12,x12,x25,lsr#32 |
+ add w13,w13,w26 |
+ add v7.4s,v7.4s,v28.4s |
+ add x14,x14,x26,lsr#32 |
+ add w15,w15,w27 |
+ add v19.4s,v19.4s,v29.4s |
+ add x16,x16,x27,lsr#32 |
+ add w17,w17,w28 |
+ add v1.4s,v1.4s,v25.4s |
+ add x19,x19,x28,lsr#32 |
+ add w20,w20,w30 |
+ add v5.4s,v5.4s,v25.4s |
+ add x21,x21,x30,lsr#32 |
+ add v17.4s,v17.4s,v25.4s |
+ |
+ b.lo .Ltail_neon | // subs borrowed: fewer than 256 bytes were left
+ |
+ add x5,x5,x6,lsl#32 // pack |
+ add x7,x7,x8,lsl#32 |
+ ldp x6,x8,[x1,#0] // load input |
+ add x9,x9,x10,lsl#32 |
+ add x11,x11,x12,lsl#32 |
+ ldp x10,x12,[x1,#16] |
+ add x13,x13,x14,lsl#32 |
+ add x15,x15,x16,lsl#32 |
+ ldp x14,x16,[x1,#32] |
+ add x17,x17,x19,lsl#32 |
+ add x20,x20,x21,lsl#32 |
+ ldp x19,x21,[x1,#48] |
+ add x1,x1,#64 |
+#ifdef __ARMEB__ |
+ rev x5,x5 |
+ rev x7,x7 |
+ rev x9,x9 |
+ rev x11,x11 |
+ rev x13,x13 |
+ rev x15,x15 |
+ rev x17,x17 |
+ rev x20,x20 |
+#endif |
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 | // input for the first NEON block
+ eor x5,x5,x6 |
+ eor x7,x7,x8 |
+ eor x9,x9,x10 |
+ eor x11,x11,x12 |
+ eor x13,x13,x14 |
+ eor v0.16b,v0.16b,v20.16b |
+ eor x15,x15,x16 |
+ eor v1.16b,v1.16b,v21.16b |
+ eor x17,x17,x19 |
+ eor v2.16b,v2.16b,v22.16b |
+ eor x20,x20,x21 |
+ eor v3.16b,v3.16b,v23.16b |
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 | // input for the second NEON block
+ |
+ stp x5,x7,[x0,#0] // store output |
+ add x28,x28,#4 // increment counter: four blocks consumed per outer iteration |
+ stp x9,x11,[x0,#16] |
+ add v27.4s,v27.4s,v31.4s // += 4 |
+ stp x13,x15,[x0,#32] |
+ add v28.4s,v28.4s,v31.4s |
+ stp x17,x20,[x0,#48] |
+ add v29.4s,v29.4s,v31.4s |
+ add x0,x0,#64 |
+ |
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 | // v0-v3 are free after the store: reuse them for the third NEON block's input
+ |
+ eor v4.16b,v4.16b,v20.16b |
+ eor v5.16b,v5.16b,v21.16b |
+ eor v6.16b,v6.16b,v22.16b |
+ eor v7.16b,v7.16b,v23.16b |
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
+ |
+ eor v16.16b,v16.16b,v0.16b |
+ eor v17.16b,v17.16b,v1.16b |
+ eor v18.16b,v18.16b,v2.16b |
+ eor v19.16b,v19.16b,v3.16b |
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 |
+ |
+ b.hi .Loop_outer_neon | // still the flags from subs: more bytes remain
+ |
+ ldp x19,x20,[x29,#16] | // epilogue: reload callee-saved registers through the frame pointer
+ add sp,sp,#64 |
+ ldp x21,x22,[x29,#32] |
+ ldp x23,x24,[x29,#48] |
+ ldp x25,x26,[x29,#64] |
+ ldp x27,x28,[x29,#80] |
+ ldp x29,x30,[sp],#96 |
+ ret |
+ |
+.Ltail_neon: |
+ add x2,x2,#256 | // undo the subs: x2 = remaining byte count, 1..255
+ cmp x2,#64 |
+ b.lo .Less_than_64 | // under one block: finish in ChaCha20_ctr32's tail, which packs the scalar block still held in x5-x21
+ |
+ add x5,x5,x6,lsl#32 // pack |
+ add x7,x7,x8,lsl#32 |
+ ldp x6,x8,[x1,#0] // load input |
+ add x9,x9,x10,lsl#32 |
+ add x11,x11,x12,lsl#32 |
+ ldp x10,x12,[x1,#16] |
+ add x13,x13,x14,lsl#32 |
+ add x15,x15,x16,lsl#32 |
+ ldp x14,x16,[x1,#32] |
+ add x17,x17,x19,lsl#32 |
+ add x20,x20,x21,lsl#32 |
+ ldp x19,x21,[x1,#48] |
+ add x1,x1,#64 |
+#ifdef __ARMEB__ |
+ rev x5,x5 |
+ rev x7,x7 |
+ rev x9,x9 |
+ rev x11,x11 |
+ rev x13,x13 |
+ rev x15,x15 |
+ rev x17,x17 |
+ rev x20,x20 |
+#endif |
+ eor x5,x5,x6 |
+ eor x7,x7,x8 |
+ eor x9,x9,x10 |
+ eor x11,x11,x12 |
+ eor x13,x13,x14 |
+ eor x15,x15,x16 |
+ eor x17,x17,x19 |
+ eor x20,x20,x21 |
+ |
+ stp x5,x7,[x0,#0] // store output |
+ add x28,x28,#4 // increment counter |
+ stp x9,x11,[x0,#16] |
+ stp x13,x15,[x0,#32] |
+ stp x17,x20,[x0,#48] |
+ add x0,x0,#64 |
+ b.eq .Ldone_neon | // flags from cmp x2,#64 above: exactly 64 bytes remained
+ sub x2,x2,#64 |
+ cmp x2,#64 |
+ b.lo .Less_than_128 | // partial block: its keystream is the first NEON block
+ |
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
+ eor v0.16b,v0.16b,v20.16b |
+ eor v1.16b,v1.16b,v21.16b |
+ eor v2.16b,v2.16b,v22.16b |
+ eor v3.16b,v3.16b,v23.16b |
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
+ b.eq .Ldone_neon | // flags from the preceding cmp: exactly 64 bytes remained
+ sub x2,x2,#64 |
+ cmp x2,#64 |
+ b.lo .Less_than_192 | // partial block: its keystream is the second NEON block
+ |
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
+ eor v4.16b,v4.16b,v20.16b |
+ eor v5.16b,v5.16b,v21.16b |
+ eor v6.16b,v6.16b,v22.16b |
+ eor v7.16b,v7.16b,v23.16b |
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
+ b.eq .Ldone_neon |
+ sub x2,x2,#64 |
+ |
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] | // partial block: spill the third NEON block's keystream to the stack scratch
+ b .Last_neon |
+ |
+.Less_than_128: |
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] | // spill the first NEON block's keystream
+ b .Last_neon |
+.Less_than_192: |
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] | // spill the second NEON block's keystream
+ b .Last_neon |
+ |
+.align 4 |
+.Last_neon: | // x2 = bytes left in the partial block; keystream for it is at [sp]
+ sub x0,x0,#1 | // bias the output pointer by -1: the store below indexes with x2 after it was incremented
+ add x1,x1,x2 |
+ add x0,x0,x2 |
+ add x4,sp,x2 |
+ neg x2,x2 | // x2 = -remaining; the byte loop counts it up to zero
+ |
+.Loop_tail_neon: |
+ ldrb w10,[x1,x2] | // input byte
+ ldrb w11,[x4,x2] | // keystream byte
+ add x2,x2,#1 |
+ eor w10,w10,w11 |
+ strb w10,[x0,x2] |
+ cbnz x2,.Loop_tail_neon |
+ |
+ stp xzr,xzr,[sp,#0] | // wipe the keystream copy from the stack
+ stp xzr,xzr,[sp,#16] |
+ stp xzr,xzr,[sp,#32] |
+ stp xzr,xzr,[sp,#48] |
+ |
+.Ldone_neon: |
+ ldp x19,x20,[x29,#16] | // epilogue: reload callee-saved registers through the frame pointer
+ add sp,sp,#64 |
+ ldp x21,x22,[x29,#32] |
+ ldp x23,x24,[x29,#48] |
+ ldp x25,x26,[x29,#64] |
+ ldp x27,x28,[x29,#80] |
+ ldp x29,x30,[sp],#96 |
+ ret |
+.size ChaCha20_neon,.-ChaCha20_neon |
+.type ChaCha20_512_neon,%function |
+.align 5 |
+ChaCha20_512_neon: |
+ stp x29,x30,[sp,#-96]! |
+ add x29,sp,#0 |
+ |
+ adr x5,.Lsigma |
+ stp x19,x20,[sp,#16] |
+ stp x21,x22,[sp,#32] |
+ stp x23,x24,[sp,#48] |
+ stp x25,x26,[sp,#64] |
+ stp x27,x28,[sp,#80] |
+ |
+.L512_or_more_neon: |
+ sub sp,sp,#128+64 |
+ |
+ ldp x22,x23,[x5] // load sigma |
+ ld1 {v24.4s},[x5],#16 |
+ ldp x24,x25,[x3] // load key |
+ ldp x26,x27,[x3,#16] |
+ ld1 {v25.4s,v26.4s},[x3] |
+ ldp x28,x30,[x4] // load counter |
+ ld1 {v27.4s},[x4] |
+ ld1 {v31.4s},[x5] |
+#ifdef __ARMEB__ |
+ rev64 v24.4s,v24.4s |
+ ror x24,x24,#32 |
+ ror x25,x25,#32 |
+ ror x26,x26,#32 |
+ ror x27,x27,#32 |
+ ror x28,x28,#32 |
+ ror x30,x30,#32 |
+#endif |
+ add v27.4s,v27.4s,v31.4s // += 1 |
+ stp q24,q25,[sp,#0] // off-load key block, invariant part |
+ add v27.4s,v27.4s,v31.4s // not typo |
+ str q26,[sp,#32] |
+ add v28.4s,v27.4s,v31.4s |
+ add v29.4s,v28.4s,v31.4s |
+ add v30.4s,v29.4s,v31.4s |
+ shl v31.4s,v31.4s,#2 // 1 -> 4 |
+ |
+ stp d8,d9,[sp,#128+0] // meet ABI requirements |
+ stp d10,d11,[sp,#128+16] |
+ stp d12,d13,[sp,#128+32] |
+ stp d14,d15,[sp,#128+48] |
+ |
+ sub x2,x2,#512 // not typo |
+ |
+.Loop_outer_512_neon: |
+ mov v0.16b,v24.16b |
+ mov v4.16b,v24.16b |
+ mov v8.16b,v24.16b |
+ mov v12.16b,v24.16b |
+ mov v16.16b,v24.16b |
+ mov v20.16b,v24.16b |
+ mov v1.16b,v25.16b |
+ mov w5,w22 // unpack key block |
+ mov v5.16b,v25.16b |
+ lsr x6,x22,#32 |
+ mov v9.16b,v25.16b |
+ mov w7,w23 |
+ mov v13.16b,v25.16b |
+ lsr x8,x23,#32 |
+ mov v17.16b,v25.16b |
+ mov w9,w24 |
+ mov v21.16b,v25.16b |
+ lsr x10,x24,#32 |
+ mov v3.16b,v27.16b |
+ mov w11,w25 |
+ mov v7.16b,v28.16b |
+ lsr x12,x25,#32 |
+ mov v11.16b,v29.16b |
+ mov w13,w26 |
+ mov v15.16b,v30.16b |
+ lsr x14,x26,#32 |
+ mov v2.16b,v26.16b |
+ mov w15,w27 |
+ mov v6.16b,v26.16b |
+ lsr x16,x27,#32 |
+ add v19.4s,v3.4s,v31.4s // +4 |
+ mov w17,w28 |
+ add v23.4s,v7.4s,v31.4s // +4 |
+ lsr x19,x28,#32 |
+ mov v10.16b,v26.16b |
+ mov w20,w30 |
+ mov v14.16b,v26.16b |
+ lsr x21,x30,#32 |
+ mov v18.16b,v26.16b |
+ stp q27,q28,[sp,#48] // off-load key block, variable part |
+ mov v22.16b,v26.16b |
+ str q29,[sp,#80] |
+ |
+ mov x4,#5 |
+ subs x2,x2,#512 |
+.Loop_upper_neon: |
+ sub x4,x4,#1 |
+ add v0.4s,v0.4s,v1.4s |
+ add w5,w5,w9 |
+ add v4.4s,v4.4s,v5.4s |
+ add w6,w6,w10 |
+ add v8.4s,v8.4s,v9.4s |
+ add w7,w7,w11 |
+ add v12.4s,v12.4s,v13.4s |
+ add w8,w8,w12 |
+ add v16.4s,v16.4s,v17.4s |
+ eor w17,w17,w5 |
+ add v20.4s,v20.4s,v21.4s |
+ eor w19,w19,w6 |
+ eor v3.16b,v3.16b,v0.16b |
+ eor w20,w20,w7 |
+ eor v7.16b,v7.16b,v4.16b |
+ eor w21,w21,w8 |
+ eor v11.16b,v11.16b,v8.16b |
+ ror w17,w17,#16 |
+ eor v15.16b,v15.16b,v12.16b |
+ ror w19,w19,#16 |
+ eor v19.16b,v19.16b,v16.16b |
+ ror w20,w20,#16 |
+ eor v23.16b,v23.16b,v20.16b |
+ ror w21,w21,#16 |
+ rev32 v3.8h,v3.8h |
+ add w13,w13,w17 |
+ rev32 v7.8h,v7.8h |
+ add w14,w14,w19 |
+ rev32 v11.8h,v11.8h |
+ add w15,w15,w20 |
+ rev32 v15.8h,v15.8h |
+ add w16,w16,w21 |
+ rev32 v19.8h,v19.8h |
+ eor w9,w9,w13 |
+ rev32 v23.8h,v23.8h |
+ eor w10,w10,w14 |
+ add v2.4s,v2.4s,v3.4s |
+ eor w11,w11,w15 |
+ add v6.4s,v6.4s,v7.4s |
+ eor w12,w12,w16 |
+ add v10.4s,v10.4s,v11.4s |
+ ror w9,w9,#20 |
+ add v14.4s,v14.4s,v15.4s |
+ ror w10,w10,#20 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w11,w11,#20 |
+ add v22.4s,v22.4s,v23.4s |
+ ror w12,w12,#20 |
+ eor v24.16b,v1.16b,v2.16b |
+ add w5,w5,w9 |
+ eor v25.16b,v5.16b,v6.16b |
+ add w6,w6,w10 |
+ eor v26.16b,v9.16b,v10.16b |
+ add w7,w7,w11 |
+ eor v27.16b,v13.16b,v14.16b |
+ add w8,w8,w12 |
+ eor v28.16b,v17.16b,v18.16b |
+ eor w17,w17,w5 |
+ eor v29.16b,v21.16b,v22.16b |
+ eor w19,w19,w6 |
+ ushr v1.4s,v24.4s,#20 |
+ eor w20,w20,w7 |
+ ushr v5.4s,v25.4s,#20 |
+ eor w21,w21,w8 |
+ ushr v9.4s,v26.4s,#20 |
+ ror w17,w17,#24 |
+ ushr v13.4s,v27.4s,#20 |
+ ror w19,w19,#24 |
+ ushr v17.4s,v28.4s,#20 |
+ ror w20,w20,#24 |
+ ushr v21.4s,v29.4s,#20 |
+ ror w21,w21,#24 |
+ sli v1.4s,v24.4s,#12 |
+ add w13,w13,w17 |
+ sli v5.4s,v25.4s,#12 |
+ add w14,w14,w19 |
+ sli v9.4s,v26.4s,#12 |
+ add w15,w15,w20 |
+ sli v13.4s,v27.4s,#12 |
+ add w16,w16,w21 |
+ sli v17.4s,v28.4s,#12 |
+ eor w9,w9,w13 |
+ sli v21.4s,v29.4s,#12 |
+ eor w10,w10,w14 |
+ add v0.4s,v0.4s,v1.4s |
+ eor w11,w11,w15 |
+ add v4.4s,v4.4s,v5.4s |
+ eor w12,w12,w16 |
+ add v8.4s,v8.4s,v9.4s |
+ ror w9,w9,#25 |
+ add v12.4s,v12.4s,v13.4s |
+ ror w10,w10,#25 |
+ add v16.4s,v16.4s,v17.4s |
+ ror w11,w11,#25 |
+ add v20.4s,v20.4s,v21.4s |
+ ror w12,w12,#25 |
+ eor v24.16b,v3.16b,v0.16b |
+ add w5,w5,w10 |
+ eor v25.16b,v7.16b,v4.16b |
+ add w6,w6,w11 |
+ eor v26.16b,v11.16b,v8.16b |
+ add w7,w7,w12 |
+ eor v27.16b,v15.16b,v12.16b |
+ add w8,w8,w9 |
+ eor v28.16b,v19.16b,v16.16b |
+ eor w21,w21,w5 |
+ eor v29.16b,v23.16b,v20.16b |
+ eor w17,w17,w6 |
+ ushr v3.4s,v24.4s,#24 |
+ eor w19,w19,w7 |
+ ushr v7.4s,v25.4s,#24 |
+ eor w20,w20,w8 |
+ ushr v11.4s,v26.4s,#24 |
+ ror w21,w21,#16 |
+ ushr v15.4s,v27.4s,#24 |
+ ror w17,w17,#16 |
+ ushr v19.4s,v28.4s,#24 |
+ ror w19,w19,#16 |
+ ushr v23.4s,v29.4s,#24 |
+ ror w20,w20,#16 |
+ sli v3.4s,v24.4s,#8 |
+ add w15,w15,w21 |
+ sli v7.4s,v25.4s,#8 |
+ add w16,w16,w17 |
+ sli v11.4s,v26.4s,#8 |
+ add w13,w13,w19 |
+ sli v15.4s,v27.4s,#8 |
+ add w14,w14,w20 |
+ sli v19.4s,v28.4s,#8 |
+ eor w10,w10,w15 |
+ sli v23.4s,v29.4s,#8 |
+ eor w11,w11,w16 |
+ add v2.4s,v2.4s,v3.4s |
+ eor w12,w12,w13 |
+ add v6.4s,v6.4s,v7.4s |
+ eor w9,w9,w14 |
+ add v10.4s,v10.4s,v11.4s |
+ ror w10,w10,#20 |
+ add v14.4s,v14.4s,v15.4s |
+ ror w11,w11,#20 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w12,w12,#20 |
+ add v22.4s,v22.4s,v23.4s |
+ ror w9,w9,#20 |
+ eor v24.16b,v1.16b,v2.16b |
+ add w5,w5,w10 |
+ eor v25.16b,v5.16b,v6.16b |
+ add w6,w6,w11 |
+ eor v26.16b,v9.16b,v10.16b |
+ add w7,w7,w12 |
+ eor v27.16b,v13.16b,v14.16b |
+ add w8,w8,w9 |
+ eor v28.16b,v17.16b,v18.16b |
+ eor w21,w21,w5 |
+ eor v29.16b,v21.16b,v22.16b |
+ eor w17,w17,w6 |
+ ushr v1.4s,v24.4s,#25 |
+ eor w19,w19,w7 |
+ ushr v5.4s,v25.4s,#25 |
+ eor w20,w20,w8 |
+ ushr v9.4s,v26.4s,#25 |
+ ror w21,w21,#24 |
+ ushr v13.4s,v27.4s,#25 |
+ ror w17,w17,#24 |
+ ushr v17.4s,v28.4s,#25 |
+ ror w19,w19,#24 |
+ ushr v21.4s,v29.4s,#25 |
+ ror w20,w20,#24 |
+ sli v1.4s,v24.4s,#7 |
+ add w15,w15,w21 |
+ sli v5.4s,v25.4s,#7 |
+ add w16,w16,w17 |
+ sli v9.4s,v26.4s,#7 |
+ add w13,w13,w19 |
+ sli v13.4s,v27.4s,#7 |
+ add w14,w14,w20 |
+ sli v17.4s,v28.4s,#7 |
+ eor w10,w10,w15 |
+ sli v21.4s,v29.4s,#7 |
+ eor w11,w11,w16 |
+ ext v2.16b,v2.16b,v2.16b,#8 |
+ eor w12,w12,w13 |
+ ext v6.16b,v6.16b,v6.16b,#8 |
+ eor w9,w9,w14 |
+ ext v10.16b,v10.16b,v10.16b,#8 |
+ ror w10,w10,#25 |
+ ext v14.16b,v14.16b,v14.16b,#8 |
+ ror w11,w11,#25 |
+ ext v18.16b,v18.16b,v18.16b,#8 |
+ ror w12,w12,#25 |
+ ext v22.16b,v22.16b,v22.16b,#8 |
+ ror w9,w9,#25 |
+ ext v3.16b,v3.16b,v3.16b,#12 |
+ ext v7.16b,v7.16b,v7.16b,#12 |
+ ext v11.16b,v11.16b,v11.16b,#12 |
+ ext v15.16b,v15.16b,v15.16b,#12 |
+ ext v19.16b,v19.16b,v19.16b,#12 |
+ ext v23.16b,v23.16b,v23.16b,#12 |
+ ext v1.16b,v1.16b,v1.16b,#4 |
+ ext v5.16b,v5.16b,v5.16b,#4 |
+ ext v9.16b,v9.16b,v9.16b,#4 |
+ ext v13.16b,v13.16b,v13.16b,#4 |
+ ext v17.16b,v17.16b,v17.16b,#4 |
+ ext v21.16b,v21.16b,v21.16b,#4 |
+ add v0.4s,v0.4s,v1.4s |
+ add w5,w5,w9 |
+ add v4.4s,v4.4s,v5.4s |
+ add w6,w6,w10 |
+ add v8.4s,v8.4s,v9.4s |
+ add w7,w7,w11 |
+ add v12.4s,v12.4s,v13.4s |
+ add w8,w8,w12 |
+ add v16.4s,v16.4s,v17.4s |
+ eor w17,w17,w5 |
+ add v20.4s,v20.4s,v21.4s |
+ eor w19,w19,w6 |
+ eor v3.16b,v3.16b,v0.16b |
+ eor w20,w20,w7 |
+ eor v7.16b,v7.16b,v4.16b |
+ eor w21,w21,w8 |
+ eor v11.16b,v11.16b,v8.16b |
+ ror w17,w17,#16 |
+ eor v15.16b,v15.16b,v12.16b |
+ ror w19,w19,#16 |
+ eor v19.16b,v19.16b,v16.16b |
+ ror w20,w20,#16 |
+ eor v23.16b,v23.16b,v20.16b |
+ ror w21,w21,#16 |
+ rev32 v3.8h,v3.8h |
+ add w13,w13,w17 |
+ rev32 v7.8h,v7.8h |
+ add w14,w14,w19 |
+ rev32 v11.8h,v11.8h |
+ add w15,w15,w20 |
+ rev32 v15.8h,v15.8h |
+ add w16,w16,w21 |
+ rev32 v19.8h,v19.8h |
+ eor w9,w9,w13 |
+ rev32 v23.8h,v23.8h |
+ eor w10,w10,w14 |
+ add v2.4s,v2.4s,v3.4s |
+ eor w11,w11,w15 |
+ add v6.4s,v6.4s,v7.4s |
+ eor w12,w12,w16 |
+ add v10.4s,v10.4s,v11.4s |
+ ror w9,w9,#20 |
+ add v14.4s,v14.4s,v15.4s |
+ ror w10,w10,#20 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w11,w11,#20 |
+ add v22.4s,v22.4s,v23.4s |
+ ror w12,w12,#20 |
+ eor v24.16b,v1.16b,v2.16b |
+ add w5,w5,w9 |
+ eor v25.16b,v5.16b,v6.16b |
+ add w6,w6,w10 |
+ eor v26.16b,v9.16b,v10.16b |
+ add w7,w7,w11 |
+ eor v27.16b,v13.16b,v14.16b |
+ add w8,w8,w12 |
+ eor v28.16b,v17.16b,v18.16b |
+ eor w17,w17,w5 |
+ eor v29.16b,v21.16b,v22.16b |
+ eor w19,w19,w6 |
+ ushr v1.4s,v24.4s,#20 |
+ eor w20,w20,w7 |
+ ushr v5.4s,v25.4s,#20 |
+ eor w21,w21,w8 |
+ ushr v9.4s,v26.4s,#20 |
+ ror w17,w17,#24 |
+ ushr v13.4s,v27.4s,#20 |
+ ror w19,w19,#24 |
+ ushr v17.4s,v28.4s,#20 |
+ ror w20,w20,#24 |
+ ushr v21.4s,v29.4s,#20 |
+ ror w21,w21,#24 |
+ sli v1.4s,v24.4s,#12 |
+ add w13,w13,w17 |
+ sli v5.4s,v25.4s,#12 |
+ add w14,w14,w19 |
+ sli v9.4s,v26.4s,#12 |
+ add w15,w15,w20 |
+ sli v13.4s,v27.4s,#12 |
+ add w16,w16,w21 |
+ sli v17.4s,v28.4s,#12 |
+ eor w9,w9,w13 |
+ sli v21.4s,v29.4s,#12 |
+ eor w10,w10,w14 |
+ add v0.4s,v0.4s,v1.4s |
+ eor w11,w11,w15 |
+ add v4.4s,v4.4s,v5.4s |
+ eor w12,w12,w16 |
+ add v8.4s,v8.4s,v9.4s |
+ ror w9,w9,#25 |
+ add v12.4s,v12.4s,v13.4s |
+ ror w10,w10,#25 |
+ add v16.4s,v16.4s,v17.4s |
+ ror w11,w11,#25 |
+ add v20.4s,v20.4s,v21.4s |
+ ror w12,w12,#25 |
+ eor v24.16b,v3.16b,v0.16b |
+ add w5,w5,w10 |
+ eor v25.16b,v7.16b,v4.16b |
+ add w6,w6,w11 |
+ eor v26.16b,v11.16b,v8.16b |
+ add w7,w7,w12 |
+ eor v27.16b,v15.16b,v12.16b |
+ add w8,w8,w9 |
+ eor v28.16b,v19.16b,v16.16b |
+ eor w21,w21,w5 |
+ eor v29.16b,v23.16b,v20.16b |
+ eor w17,w17,w6 |
+ ushr v3.4s,v24.4s,#24 |
+ eor w19,w19,w7 |
+ ushr v7.4s,v25.4s,#24 |
+ eor w20,w20,w8 |
+ ushr v11.4s,v26.4s,#24 |
+ ror w21,w21,#16 |
+ ushr v15.4s,v27.4s,#24 |
+ ror w17,w17,#16 |
+ ushr v19.4s,v28.4s,#24 |
+ ror w19,w19,#16 |
+ ushr v23.4s,v29.4s,#24 |
+ ror w20,w20,#16 |
+ sli v3.4s,v24.4s,#8 |
+ add w15,w15,w21 |
+ sli v7.4s,v25.4s,#8 |
+ add w16,w16,w17 |
+ sli v11.4s,v26.4s,#8 |
+ add w13,w13,w19 |
+ sli v15.4s,v27.4s,#8 |
+ add w14,w14,w20 |
+ sli v19.4s,v28.4s,#8 |
+ eor w10,w10,w15 |
+ sli v23.4s,v29.4s,#8 |
+ eor w11,w11,w16 |
+ add v2.4s,v2.4s,v3.4s |
+ eor w12,w12,w13 |
+ add v6.4s,v6.4s,v7.4s |
+ eor w9,w9,w14 |
+ add v10.4s,v10.4s,v11.4s |
+ ror w10,w10,#20 |
+ add v14.4s,v14.4s,v15.4s |
+ ror w11,w11,#20 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w12,w12,#20 |
+ add v22.4s,v22.4s,v23.4s |
+ ror w9,w9,#20 |
+ eor v24.16b,v1.16b,v2.16b |
+ add w5,w5,w10 |
+ eor v25.16b,v5.16b,v6.16b |
+ add w6,w6,w11 |
+ eor v26.16b,v9.16b,v10.16b |
+ add w7,w7,w12 |
+ eor v27.16b,v13.16b,v14.16b |
+ add w8,w8,w9 |
+ eor v28.16b,v17.16b,v18.16b |
+ eor w21,w21,w5 |
+ eor v29.16b,v21.16b,v22.16b |
+ eor w17,w17,w6 |
+ ushr v1.4s,v24.4s,#25 |
+ eor w19,w19,w7 |
+ ushr v5.4s,v25.4s,#25 |
+ eor w20,w20,w8 |
+ ushr v9.4s,v26.4s,#25 |
+ ror w21,w21,#24 |
+ ushr v13.4s,v27.4s,#25 |
+ ror w17,w17,#24 |
+ ushr v17.4s,v28.4s,#25 |
+ ror w19,w19,#24 |
+ ushr v21.4s,v29.4s,#25 |
+ ror w20,w20,#24 |
+ sli v1.4s,v24.4s,#7 |
+ add w15,w15,w21 |
+ sli v5.4s,v25.4s,#7 |
+ add w16,w16,w17 |
+ sli v9.4s,v26.4s,#7 |
+ add w13,w13,w19 |
+ sli v13.4s,v27.4s,#7 |
+ add w14,w14,w20 |
+ sli v17.4s,v28.4s,#7 |
+ eor w10,w10,w15 |
+ sli v21.4s,v29.4s,#7 |
+ eor w11,w11,w16 |
+ ext v2.16b,v2.16b,v2.16b,#8 |
+ eor w12,w12,w13 |
+ ext v6.16b,v6.16b,v6.16b,#8 |
+ eor w9,w9,w14 |
+ ext v10.16b,v10.16b,v10.16b,#8 |
+ ror w10,w10,#25 |
+ ext v14.16b,v14.16b,v14.16b,#8 |
+ ror w11,w11,#25 |
+ ext v18.16b,v18.16b,v18.16b,#8 |
+ ror w12,w12,#25 |
+ ext v22.16b,v22.16b,v22.16b,#8 |
+ ror w9,w9,#25 |
+ ext v3.16b,v3.16b,v3.16b,#4 |
+ ext v7.16b,v7.16b,v7.16b,#4 |
+ ext v11.16b,v11.16b,v11.16b,#4 |
+ ext v15.16b,v15.16b,v15.16b,#4 |
+ ext v19.16b,v19.16b,v19.16b,#4 |
+ ext v23.16b,v23.16b,v23.16b,#4 |
+ ext v1.16b,v1.16b,v1.16b,#12 |
+ ext v5.16b,v5.16b,v5.16b,#12 |
+ ext v9.16b,v9.16b,v9.16b,#12 |
+ ext v13.16b,v13.16b,v13.16b,#12 |
+ ext v17.16b,v17.16b,v17.16b,#12 |
+ ext v21.16b,v21.16b,v21.16b,#12 |
+ cbnz x4,.Loop_upper_neon |
+ |
+ add w5,w5,w22 // accumulate key block |
+ add x6,x6,x22,lsr#32 |
+ add w7,w7,w23 |
+ add x8,x8,x23,lsr#32 |
+ add w9,w9,w24 |
+ add x10,x10,x24,lsr#32 |
+ add w11,w11,w25 |
+ add x12,x12,x25,lsr#32 |
+ add w13,w13,w26 |
+ add x14,x14,x26,lsr#32 |
+ add w15,w15,w27 |
+ add x16,x16,x27,lsr#32 |
+ add w17,w17,w28 |
+ add x19,x19,x28,lsr#32 |
+ add w20,w20,w30 |
+ add x21,x21,x30,lsr#32 |
+ |
+ add x5,x5,x6,lsl#32 // pack |
+ add x7,x7,x8,lsl#32 |
+ ldp x6,x8,[x1,#0] // load input |
+ add x9,x9,x10,lsl#32 |
+ add x11,x11,x12,lsl#32 |
+ ldp x10,x12,[x1,#16] |
+ add x13,x13,x14,lsl#32 |
+ add x15,x15,x16,lsl#32 |
+ ldp x14,x16,[x1,#32] |
+ add x17,x17,x19,lsl#32 |
+ add x20,x20,x21,lsl#32 |
+ ldp x19,x21,[x1,#48] |
+ add x1,x1,#64 |
+#ifdef __ARMEB__ |
+ rev x5,x5 |
+ rev x7,x7 |
+ rev x9,x9 |
+ rev x11,x11 |
+ rev x13,x13 |
+ rev x15,x15 |
+ rev x17,x17 |
+ rev x20,x20 |
+#endif |
+ eor x5,x5,x6 |
+ eor x7,x7,x8 |
+ eor x9,x9,x10 |
+ eor x11,x11,x12 |
+ eor x13,x13,x14 |
+ eor x15,x15,x16 |
+ eor x17,x17,x19 |
+ eor x20,x20,x21 |
+ |
+ stp x5,x7,[x0,#0] // store output |
+ add x28,x28,#1 // increment counter |
+ mov w5,w22 // unpack key block |
+ lsr x6,x22,#32 |
+ stp x9,x11,[x0,#16] |
+ mov w7,w23 |
+ lsr x8,x23,#32 |
+ stp x13,x15,[x0,#32] |
+ mov w9,w24 |
+ lsr x10,x24,#32 |
+ stp x17,x20,[x0,#48] |
+ add x0,x0,#64 |
+ mov w11,w25 |
+ lsr x12,x25,#32 |
+ mov w13,w26 |
+ lsr x14,x26,#32 |
+ mov w15,w27 |
+ lsr x16,x27,#32 |
+ mov w17,w28 |
+ lsr x19,x28,#32 |
+ mov w20,w30 |
+ lsr x21,x30,#32 |
+ |
+ mov x4,#5 |
+.Loop_lower_neon: |
+ sub x4,x4,#1 |
+ add v0.4s,v0.4s,v1.4s |
+ add w5,w5,w9 |
+ add v4.4s,v4.4s,v5.4s |
+ add w6,w6,w10 |
+ add v8.4s,v8.4s,v9.4s |
+ add w7,w7,w11 |
+ add v12.4s,v12.4s,v13.4s |
+ add w8,w8,w12 |
+ add v16.4s,v16.4s,v17.4s |
+ eor w17,w17,w5 |
+ add v20.4s,v20.4s,v21.4s |
+ eor w19,w19,w6 |
+ eor v3.16b,v3.16b,v0.16b |
+ eor w20,w20,w7 |
+ eor v7.16b,v7.16b,v4.16b |
+ eor w21,w21,w8 |
+ eor v11.16b,v11.16b,v8.16b |
+ ror w17,w17,#16 |
+ eor v15.16b,v15.16b,v12.16b |
+ ror w19,w19,#16 |
+ eor v19.16b,v19.16b,v16.16b |
+ ror w20,w20,#16 |
+ eor v23.16b,v23.16b,v20.16b |
+ ror w21,w21,#16 |
+ rev32 v3.8h,v3.8h |
+ add w13,w13,w17 |
+ rev32 v7.8h,v7.8h |
+ add w14,w14,w19 |
+ rev32 v11.8h,v11.8h |
+ add w15,w15,w20 |
+ rev32 v15.8h,v15.8h |
+ add w16,w16,w21 |
+ rev32 v19.8h,v19.8h |
+ eor w9,w9,w13 |
+ rev32 v23.8h,v23.8h |
+ eor w10,w10,w14 |
+ add v2.4s,v2.4s,v3.4s |
+ eor w11,w11,w15 |
+ add v6.4s,v6.4s,v7.4s |
+ eor w12,w12,w16 |
+ add v10.4s,v10.4s,v11.4s |
+ ror w9,w9,#20 |
+ add v14.4s,v14.4s,v15.4s |
+ ror w10,w10,#20 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w11,w11,#20 |
+ add v22.4s,v22.4s,v23.4s |
+ ror w12,w12,#20 |
+ eor v24.16b,v1.16b,v2.16b |
+ add w5,w5,w9 |
+ eor v25.16b,v5.16b,v6.16b |
+ add w6,w6,w10 |
+ eor v26.16b,v9.16b,v10.16b |
+ add w7,w7,w11 |
+ eor v27.16b,v13.16b,v14.16b |
+ add w8,w8,w12 |
+ eor v28.16b,v17.16b,v18.16b |
+ eor w17,w17,w5 |
+ eor v29.16b,v21.16b,v22.16b |
+ eor w19,w19,w6 |
+ ushr v1.4s,v24.4s,#20 |
+ eor w20,w20,w7 |
+ ushr v5.4s,v25.4s,#20 |
+ eor w21,w21,w8 |
+ ushr v9.4s,v26.4s,#20 |
+ ror w17,w17,#24 |
+ ushr v13.4s,v27.4s,#20 |
+ ror w19,w19,#24 |
+ ushr v17.4s,v28.4s,#20 |
+ ror w20,w20,#24 |
+ ushr v21.4s,v29.4s,#20 |
+ ror w21,w21,#24 |
+ sli v1.4s,v24.4s,#12 |
+ add w13,w13,w17 |
+ sli v5.4s,v25.4s,#12 |
+ add w14,w14,w19 |
+ sli v9.4s,v26.4s,#12 |
+ add w15,w15,w20 |
+ sli v13.4s,v27.4s,#12 |
+ add w16,w16,w21 |
+ sli v17.4s,v28.4s,#12 |
+ eor w9,w9,w13 |
+ sli v21.4s,v29.4s,#12 |
+ eor w10,w10,w14 |
+ add v0.4s,v0.4s,v1.4s |
+ eor w11,w11,w15 |
+ add v4.4s,v4.4s,v5.4s |
+ eor w12,w12,w16 |
+ add v8.4s,v8.4s,v9.4s |
+ ror w9,w9,#25 |
+ add v12.4s,v12.4s,v13.4s |
+ ror w10,w10,#25 |
+ add v16.4s,v16.4s,v17.4s |
+ ror w11,w11,#25 |
+ add v20.4s,v20.4s,v21.4s |
+ ror w12,w12,#25 |
+ eor v24.16b,v3.16b,v0.16b |
+ add w5,w5,w10 |
+ eor v25.16b,v7.16b,v4.16b |
+ add w6,w6,w11 |
+ eor v26.16b,v11.16b,v8.16b |
+ add w7,w7,w12 |
+ eor v27.16b,v15.16b,v12.16b |
+ add w8,w8,w9 |
+ eor v28.16b,v19.16b,v16.16b |
+ eor w21,w21,w5 |
+ eor v29.16b,v23.16b,v20.16b |
+ eor w17,w17,w6 |
+ ushr v3.4s,v24.4s,#24 |
+ eor w19,w19,w7 |
+ ushr v7.4s,v25.4s,#24 |
+ eor w20,w20,w8 |
+ ushr v11.4s,v26.4s,#24 |
+ ror w21,w21,#16 |
+ ushr v15.4s,v27.4s,#24 |
+ ror w17,w17,#16 |
+ ushr v19.4s,v28.4s,#24 |
+ ror w19,w19,#16 |
+ ushr v23.4s,v29.4s,#24 |
+ ror w20,w20,#16 |
+ sli v3.4s,v24.4s,#8 |
+ add w15,w15,w21 |
+ sli v7.4s,v25.4s,#8 |
+ add w16,w16,w17 |
+ sli v11.4s,v26.4s,#8 |
+ add w13,w13,w19 |
+ sli v15.4s,v27.4s,#8 |
+ add w14,w14,w20 |
+ sli v19.4s,v28.4s,#8 |
+ eor w10,w10,w15 |
+ sli v23.4s,v29.4s,#8 |
+ eor w11,w11,w16 |
+ add v2.4s,v2.4s,v3.4s |
+ eor w12,w12,w13 |
+ add v6.4s,v6.4s,v7.4s |
+ eor w9,w9,w14 |
+ add v10.4s,v10.4s,v11.4s |
+ ror w10,w10,#20 |
+ add v14.4s,v14.4s,v15.4s |
+ ror w11,w11,#20 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w12,w12,#20 |
+ add v22.4s,v22.4s,v23.4s |
+ ror w9,w9,#20 |
+ eor v24.16b,v1.16b,v2.16b |
+ add w5,w5,w10 |
+ eor v25.16b,v5.16b,v6.16b |
+ add w6,w6,w11 |
+ eor v26.16b,v9.16b,v10.16b |
+ add w7,w7,w12 |
+ eor v27.16b,v13.16b,v14.16b |
+ add w8,w8,w9 |
+ eor v28.16b,v17.16b,v18.16b |
+ eor w21,w21,w5 |
+ eor v29.16b,v21.16b,v22.16b |
+ eor w17,w17,w6 |
+ ushr v1.4s,v24.4s,#25 |
+ eor w19,w19,w7 |
+ ushr v5.4s,v25.4s,#25 |
+ eor w20,w20,w8 |
+ ushr v9.4s,v26.4s,#25 |
+ ror w21,w21,#24 |
+ ushr v13.4s,v27.4s,#25 |
+ ror w17,w17,#24 |
+ ushr v17.4s,v28.4s,#25 |
+ ror w19,w19,#24 |
+ ushr v21.4s,v29.4s,#25 |
+ ror w20,w20,#24 |
+ sli v1.4s,v24.4s,#7 |
+ add w15,w15,w21 |
+ sli v5.4s,v25.4s,#7 |
+ add w16,w16,w17 |
+ sli v9.4s,v26.4s,#7 |
+ add w13,w13,w19 |
+ sli v13.4s,v27.4s,#7 |
+ add w14,w14,w20 |
+ sli v17.4s,v28.4s,#7 |
+ eor w10,w10,w15 |
+ sli v21.4s,v29.4s,#7 |
+ eor w11,w11,w16 |
+ ext v2.16b,v2.16b,v2.16b,#8 |
+ eor w12,w12,w13 |
+ ext v6.16b,v6.16b,v6.16b,#8 |
+ eor w9,w9,w14 |
+ ext v10.16b,v10.16b,v10.16b,#8 |
+ ror w10,w10,#25 |
+ ext v14.16b,v14.16b,v14.16b,#8 |
+ ror w11,w11,#25 |
+ ext v18.16b,v18.16b,v18.16b,#8 |
+ ror w12,w12,#25 |
+ ext v22.16b,v22.16b,v22.16b,#8 |
+ ror w9,w9,#25 |
+ ext v3.16b,v3.16b,v3.16b,#12 |
+ ext v7.16b,v7.16b,v7.16b,#12 |
+ ext v11.16b,v11.16b,v11.16b,#12 |
+ ext v15.16b,v15.16b,v15.16b,#12 |
+ ext v19.16b,v19.16b,v19.16b,#12 |
+ ext v23.16b,v23.16b,v23.16b,#12 |
+ ext v1.16b,v1.16b,v1.16b,#4 |
+ ext v5.16b,v5.16b,v5.16b,#4 |
+ ext v9.16b,v9.16b,v9.16b,#4 |
+ ext v13.16b,v13.16b,v13.16b,#4 |
+ ext v17.16b,v17.16b,v17.16b,#4 |
+ ext v21.16b,v21.16b,v21.16b,#4 |
+ add v0.4s,v0.4s,v1.4s |
+ add w5,w5,w9 |
+ add v4.4s,v4.4s,v5.4s |
+ add w6,w6,w10 |
+ add v8.4s,v8.4s,v9.4s |
+ add w7,w7,w11 |
+ add v12.4s,v12.4s,v13.4s |
+ add w8,w8,w12 |
+ add v16.4s,v16.4s,v17.4s |
+ eor w17,w17,w5 |
+ add v20.4s,v20.4s,v21.4s |
+ eor w19,w19,w6 |
+ eor v3.16b,v3.16b,v0.16b |
+ eor w20,w20,w7 |
+ eor v7.16b,v7.16b,v4.16b |
+ eor w21,w21,w8 |
+ eor v11.16b,v11.16b,v8.16b |
+ ror w17,w17,#16 |
+ eor v15.16b,v15.16b,v12.16b |
+ ror w19,w19,#16 |
+ eor v19.16b,v19.16b,v16.16b |
+ ror w20,w20,#16 |
+ eor v23.16b,v23.16b,v20.16b |
+ ror w21,w21,#16 |
+ rev32 v3.8h,v3.8h |
+ add w13,w13,w17 |
+ rev32 v7.8h,v7.8h |
+ add w14,w14,w19 |
+ rev32 v11.8h,v11.8h |
+ add w15,w15,w20 |
+ rev32 v15.8h,v15.8h |
+ add w16,w16,w21 |
+ rev32 v19.8h,v19.8h |
+ eor w9,w9,w13 |
+ rev32 v23.8h,v23.8h |
+ eor w10,w10,w14 |
+ add v2.4s,v2.4s,v3.4s |
+ eor w11,w11,w15 |
+ add v6.4s,v6.4s,v7.4s |
+ eor w12,w12,w16 |
+ add v10.4s,v10.4s,v11.4s |
+ ror w9,w9,#20 |
+ add v14.4s,v14.4s,v15.4s |
+ ror w10,w10,#20 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w11,w11,#20 |
+ add v22.4s,v22.4s,v23.4s |
+ ror w12,w12,#20 |
+ eor v24.16b,v1.16b,v2.16b |
+ add w5,w5,w9 |
+ eor v25.16b,v5.16b,v6.16b |
+ add w6,w6,w10 |
+ eor v26.16b,v9.16b,v10.16b |
+ add w7,w7,w11 |
+ eor v27.16b,v13.16b,v14.16b |
+ add w8,w8,w12 |
+ eor v28.16b,v17.16b,v18.16b |
+ eor w17,w17,w5 |
+ eor v29.16b,v21.16b,v22.16b |
+ eor w19,w19,w6 |
+ ushr v1.4s,v24.4s,#20 |
+ eor w20,w20,w7 |
+ ushr v5.4s,v25.4s,#20 |
+ eor w21,w21,w8 |
+ ushr v9.4s,v26.4s,#20 |
+ ror w17,w17,#24 |
+ ushr v13.4s,v27.4s,#20 |
+ ror w19,w19,#24 |
+ ushr v17.4s,v28.4s,#20 |
+ ror w20,w20,#24 |
+ ushr v21.4s,v29.4s,#20 |
+ ror w21,w21,#24 |
+ sli v1.4s,v24.4s,#12 |
+ add w13,w13,w17 |
+ sli v5.4s,v25.4s,#12 |
+ add w14,w14,w19 |
+ sli v9.4s,v26.4s,#12 |
+ add w15,w15,w20 |
+ sli v13.4s,v27.4s,#12 |
+ add w16,w16,w21 |
+ sli v17.4s,v28.4s,#12 |
+ eor w9,w9,w13 |
+ sli v21.4s,v29.4s,#12 |
+ eor w10,w10,w14 |
+ add v0.4s,v0.4s,v1.4s |
+ eor w11,w11,w15 |
+ add v4.4s,v4.4s,v5.4s |
+ eor w12,w12,w16 |
+ add v8.4s,v8.4s,v9.4s |
+ ror w9,w9,#25 |
+ add v12.4s,v12.4s,v13.4s |
+ ror w10,w10,#25 |
+ add v16.4s,v16.4s,v17.4s |
+ ror w11,w11,#25 |
+ add v20.4s,v20.4s,v21.4s |
+ ror w12,w12,#25 |
+ eor v24.16b,v3.16b,v0.16b |
+ add w5,w5,w10 |
+ eor v25.16b,v7.16b,v4.16b |
+ add w6,w6,w11 |
+ eor v26.16b,v11.16b,v8.16b |
+ add w7,w7,w12 |
+ eor v27.16b,v15.16b,v12.16b |
+ add w8,w8,w9 |
+ eor v28.16b,v19.16b,v16.16b |
+ eor w21,w21,w5 |
+ eor v29.16b,v23.16b,v20.16b |
+ eor w17,w17,w6 |
+ ushr v3.4s,v24.4s,#24 |
+ eor w19,w19,w7 |
+ ushr v7.4s,v25.4s,#24 |
+ eor w20,w20,w8 |
+ ushr v11.4s,v26.4s,#24 |
+ ror w21,w21,#16 |
+ ushr v15.4s,v27.4s,#24 |
+ ror w17,w17,#16 |
+ ushr v19.4s,v28.4s,#24 |
+ ror w19,w19,#16 |
+ ushr v23.4s,v29.4s,#24 |
+ ror w20,w20,#16 |
+ sli v3.4s,v24.4s,#8 |
+ add w15,w15,w21 |
+ sli v7.4s,v25.4s,#8 |
+ add w16,w16,w17 |
+ sli v11.4s,v26.4s,#8 |
+ add w13,w13,w19 |
+ sli v15.4s,v27.4s,#8 |
+ add w14,w14,w20 |
+ sli v19.4s,v28.4s,#8 |
+ eor w10,w10,w15 |
+ sli v23.4s,v29.4s,#8 |
+ eor w11,w11,w16 |
+ add v2.4s,v2.4s,v3.4s |
+ eor w12,w12,w13 |
+ add v6.4s,v6.4s,v7.4s |
+ eor w9,w9,w14 |
+ add v10.4s,v10.4s,v11.4s |
+ ror w10,w10,#20 |
+ add v14.4s,v14.4s,v15.4s |
+ ror w11,w11,#20 |
+ add v18.4s,v18.4s,v19.4s |
+ ror w12,w12,#20 |
+ add v22.4s,v22.4s,v23.4s |
+ ror w9,w9,#20 |
+ eor v24.16b,v1.16b,v2.16b |
+ add w5,w5,w10 |
+ eor v25.16b,v5.16b,v6.16b |
+ add w6,w6,w11 |
+ eor v26.16b,v9.16b,v10.16b |
+ add w7,w7,w12 |
+ eor v27.16b,v13.16b,v14.16b |
+ add w8,w8,w9 |
+ eor v28.16b,v17.16b,v18.16b |
+ eor w21,w21,w5 |
+ eor v29.16b,v21.16b,v22.16b |
+ eor w17,w17,w6 |
+ ushr v1.4s,v24.4s,#25 |
+ eor w19,w19,w7 |
+ ushr v5.4s,v25.4s,#25 |
+ eor w20,w20,w8 |
+ ushr v9.4s,v26.4s,#25 |
+ ror w21,w21,#24 |
+ ushr v13.4s,v27.4s,#25 |
+ ror w17,w17,#24 |
+ ushr v17.4s,v28.4s,#25 |
+ ror w19,w19,#24 |
+ ushr v21.4s,v29.4s,#25 |
+ ror w20,w20,#24 |
+ sli v1.4s,v24.4s,#7 |
+ add w15,w15,w21 |
+ sli v5.4s,v25.4s,#7 |
+ add w16,w16,w17 |
+ sli v9.4s,v26.4s,#7 |
+ add w13,w13,w19 |
+ sli v13.4s,v27.4s,#7 |
+ add w14,w14,w20 |
+ sli v17.4s,v28.4s,#7 |
+ eor w10,w10,w15 |
+ sli v21.4s,v29.4s,#7 |
+ eor w11,w11,w16 |
+ ext v2.16b,v2.16b,v2.16b,#8 |
+ eor w12,w12,w13 |
+ ext v6.16b,v6.16b,v6.16b,#8 |
+ eor w9,w9,w14 |
+ ext v10.16b,v10.16b,v10.16b,#8 |
+ ror w10,w10,#25 |
+ ext v14.16b,v14.16b,v14.16b,#8 |
+ ror w11,w11,#25 |
+ ext v18.16b,v18.16b,v18.16b,#8 |
+ ror w12,w12,#25 |
+ ext v22.16b,v22.16b,v22.16b,#8 |
+ ror w9,w9,#25 |
+ ext v3.16b,v3.16b,v3.16b,#4 |
+ ext v7.16b,v7.16b,v7.16b,#4 |
+ ext v11.16b,v11.16b,v11.16b,#4 |
+ ext v15.16b,v15.16b,v15.16b,#4 |
+ ext v19.16b,v19.16b,v19.16b,#4 |
+ ext v23.16b,v23.16b,v23.16b,#4 |
+ ext v1.16b,v1.16b,v1.16b,#12 |
+ ext v5.16b,v5.16b,v5.16b,#12 |
+ ext v9.16b,v9.16b,v9.16b,#12 |
+ ext v13.16b,v13.16b,v13.16b,#12 |
+ ext v17.16b,v17.16b,v17.16b,#12 |
+ ext v21.16b,v21.16b,v21.16b,#12 |
+ cbnz x4,.Loop_lower_neon |
+ |
+ add w5,w5,w22 // accumulate key block |
+ ldp q24,q25,[sp,#0] |
+ add x6,x6,x22,lsr#32 |
+ ldp q26,q27,[sp,#32] |
+ add w7,w7,w23 |
+ ldp q28,q29,[sp,#64] |
+ add x8,x8,x23,lsr#32 |
+ add v0.4s,v0.4s,v24.4s |
+ add w9,w9,w24 |
+ add v4.4s,v4.4s,v24.4s |
+ add x10,x10,x24,lsr#32 |
+ add v8.4s,v8.4s,v24.4s |
+ add w11,w11,w25 |
+ add v12.4s,v12.4s,v24.4s |
+ add x12,x12,x25,lsr#32 |
+ add v16.4s,v16.4s,v24.4s |
+ add w13,w13,w26 |
+ add v20.4s,v20.4s,v24.4s |
+ add x14,x14,x26,lsr#32 |
+ add v2.4s,v2.4s,v26.4s |
+ add w15,w15,w27 |
+ add v6.4s,v6.4s,v26.4s |
+ add x16,x16,x27,lsr#32 |
+ add v10.4s,v10.4s,v26.4s |
+ add w17,w17,w28 |
+ add v14.4s,v14.4s,v26.4s |
+ add x19,x19,x28,lsr#32 |
+ add v18.4s,v18.4s,v26.4s |
+ add w20,w20,w30 |
+ add v22.4s,v22.4s,v26.4s |
+ add x21,x21,x30,lsr#32 |
+ add v19.4s,v19.4s,v31.4s // +4 |
+ add x5,x5,x6,lsl#32 // pack |
+ add v23.4s,v23.4s,v31.4s // +4 |
+ add x7,x7,x8,lsl#32 |
+ add v3.4s,v3.4s,v27.4s |
+ ldp x6,x8,[x1,#0] // load input |
+ add v7.4s,v7.4s,v28.4s |
+ add x9,x9,x10,lsl#32 |
+ add v11.4s,v11.4s,v29.4s |
+ add x11,x11,x12,lsl#32 |
+ add v15.4s,v15.4s,v30.4s |
+ ldp x10,x12,[x1,#16] |
+ add v19.4s,v19.4s,v27.4s |
+ add x13,x13,x14,lsl#32 |
+ add v23.4s,v23.4s,v28.4s |
+ add x15,x15,x16,lsl#32 |
+ add v1.4s,v1.4s,v25.4s |
+ ldp x14,x16,[x1,#32] |
+ add v5.4s,v5.4s,v25.4s |
+ add x17,x17,x19,lsl#32 |
+ add v9.4s,v9.4s,v25.4s |
+ add x20,x20,x21,lsl#32 |
+ add v13.4s,v13.4s,v25.4s |
+ ldp x19,x21,[x1,#48] |
+ add v17.4s,v17.4s,v25.4s |
+ add x1,x1,#64 |
+ add v21.4s,v21.4s,v25.4s |
+ |
+#ifdef __ARMEB__ |
+ rev x5,x5 |
+ rev x7,x7 |
+ rev x9,x9 |
+ rev x11,x11 |
+ rev x13,x13 |
+ rev x15,x15 |
+ rev x17,x17 |
+ rev x20,x20 |
+#endif |
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 |
+ eor x5,x5,x6 |
+ eor x7,x7,x8 |
+ eor x9,x9,x10 |
+ eor x11,x11,x12 |
+ eor x13,x13,x14 |
+ eor v0.16b,v0.16b,v24.16b |
+ eor x15,x15,x16 |
+ eor v1.16b,v1.16b,v25.16b |
+ eor x17,x17,x19 |
+ eor v2.16b,v2.16b,v26.16b |
+ eor x20,x20,x21 |
+ eor v3.16b,v3.16b,v27.16b |
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 |
+ |
+ stp x5,x7,[x0,#0] // store output |
+ add x28,x28,#7 // increment counter |
+ stp x9,x11,[x0,#16] |
+ stp x13,x15,[x0,#32] |
+ stp x17,x20,[x0,#48] |
+ add x0,x0,#64 |
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
+ |
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 |
+ eor v4.16b,v4.16b,v24.16b |
+ eor v5.16b,v5.16b,v25.16b |
+ eor v6.16b,v6.16b,v26.16b |
+ eor v7.16b,v7.16b,v27.16b |
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
+ |
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 |
+ eor v8.16b,v8.16b,v0.16b |
+ ldp q24,q25,[sp,#0] |
+ eor v9.16b,v9.16b,v1.16b |
+ ldp q26,q27,[sp,#32] |
+ eor v10.16b,v10.16b,v2.16b |
+ eor v11.16b,v11.16b,v3.16b |
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 |
+ |
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 |
+ eor v12.16b,v12.16b,v4.16b |
+ eor v13.16b,v13.16b,v5.16b |
+ eor v14.16b,v14.16b,v6.16b |
+ eor v15.16b,v15.16b,v7.16b |
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 |
+ |
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 |
+ eor v16.16b,v16.16b,v8.16b |
+ eor v17.16b,v17.16b,v9.16b |
+ eor v18.16b,v18.16b,v10.16b |
+ eor v19.16b,v19.16b,v11.16b |
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 |
+ |
+ shl v0.4s,v31.4s,#1 // 4 -> 8 |
+ eor v20.16b,v20.16b,v12.16b |
+ eor v21.16b,v21.16b,v13.16b |
+ eor v22.16b,v22.16b,v14.16b |
+ eor v23.16b,v23.16b,v15.16b |
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 |
+ |
+ add v27.4s,v27.4s,v0.4s // += 8 |
+ add v28.4s,v28.4s,v0.4s |
+ add v29.4s,v29.4s,v0.4s |
+ add v30.4s,v30.4s,v0.4s |
+ |
+ b.hs .Loop_outer_512_neon |
+ |
+ adds x2,x2,#512 |
+ ushr v0.4s,v31.4s,#2 // 4 -> 1 |
+ |
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements |
+ ldp d10,d11,[sp,#128+16] |
+ ldp d12,d13,[sp,#128+32] |
+ ldp d14,d15,[sp,#128+48] |
+ |
+ stp q24,q31,[sp,#0] // wipe off-load area |
+ stp q24,q31,[sp,#32] |
+ stp q24,q31,[sp,#64] |
+ |
+ b.eq .Ldone_512_neon |
+ |
+ cmp x2,#192 |
+ sub v27.4s,v27.4s,v0.4s // -= 1 |
+ sub v28.4s,v28.4s,v0.4s |
+ sub v29.4s,v29.4s,v0.4s |
+ add sp,sp,#128 |
+ b.hs .Loop_outer_neon |
+ |
+ eor v25.16b,v25.16b,v25.16b |
+ eor v26.16b,v26.16b,v26.16b |
+ eor v27.16b,v27.16b,v27.16b |
+ eor v28.16b,v28.16b,v28.16b |
+ eor v29.16b,v29.16b,v29.16b |
+ eor v30.16b,v30.16b,v30.16b |
+ b .Loop_outer |
+ |
+.Ldone_512_neon: |
+ ldp x19,x20,[x29,#16] |
+ add sp,sp,#128+64 |
+ ldp x21,x22,[x29,#32] |
+ ldp x23,x24,[x29,#48] |
+ ldp x25,x26,[x29,#64] |
+ ldp x27,x28,[x29,#80] |
+ ldp x29,x30,[sp],#96 |
+ ret |
+.size ChaCha20_512_neon,.-ChaCha20_512_neon |
+#endif |