| Index: third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S
|
| diff --git a/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S b/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S
|
| deleted file mode 100644
|
| index 6ff6bffb66bb2e2279def64813a9e09d2c432aa8..0000000000000000000000000000000000000000
|
| --- a/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S
|
| +++ /dev/null
|
| @@ -1,1971 +0,0 @@
|
| -#if defined(__aarch64__)
|
| -#include <openssl/arm_arch.h>
|
| -
|
| -.text
|
| -
|
| -
|
| -
|
| -.align 5
|
| -.Lsigma: // first state row; as 32-bit words: 0x61707865,0x3320646e,0x79622d32,0x6b206574 (ASCII "expand 32-byte k")
|
| -.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
|
| -.Lone: // {1,0,0,0}: the NEON code loads this into v31 and adds it to the counter row
|
| -.long 1,0,0,0
|
| -.LOPENSSL_armcap_P: // stores the distance from this location to OPENSSL_armcap_P (32-bit under __ILP32__, 64-bit otherwise)
|
| -#ifdef __ILP32__
|
| -.long OPENSSL_armcap_P-.
|
| -#else
|
| -.quad OPENSSL_armcap_P-.
|
| -#endif
|
| -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // ASCII "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
|
| -.align 2
|
| -
|
| -.globl ChaCha20_ctr32 // Entry point. Registers as used below: x0 = output, x1 = input, x2 = byte count,
|
| -.hidden ChaCha20_ctr32 // x3 = key (32 bytes are read), x4 = counter block (16 bytes are read).
|
| -.type ChaCha20_ctr32,%function // XORs the input with ChaCha20 keystream, 64 bytes per block, into the output.
|
| -.align 5
|
| -ChaCha20_ctr32:
|
| - cbz x2,.Labort // zero length: return without touching anything
|
| - adr x5,.LOPENSSL_armcap_P
|
| - cmp x2,#192
|
| - b.lo .Lshort // under 192 bytes: skip the capability check and use the scalar code
|
| -#ifdef __ILP32__
|
| - ldrsw x6,[x5]
|
| -#else
|
| - ldr x6,[x5]
|
| -#endif
|
| - ldr w17,[x6,x5] // x5 + stored offset = address of OPENSSL_armcap_P
|
| - tst w17,#ARMV7_NEON
|
| - b.ne ChaCha20_neon // NEON bit set: hand over (no frame has been pushed yet)
|
| -
|
| -.Lshort:
|
| - stp x29,x30,[sp,#-96]! // 96-byte frame: x29/x30 followed by x19-x28
|
| - add x29,sp,#0
|
| -
|
| - adr x5,.Lsigma
|
| - stp x19,x20,[sp,#16]
|
| - stp x21,x22,[sp,#32]
|
| - stp x23,x24,[sp,#48]
|
| - stp x25,x26,[sp,#64]
|
| - stp x27,x28,[sp,#80]
|
| - sub sp,sp,#64 // 64 bytes of scratch; only the partial-block tail writes to it
|
| -
|
| - ldp x22,x23,[x5] // load sigma
|
| - ldp x24,x25,[x3] // load key
|
| - ldp x26,x27,[x3,#16]
|
| - ldp x28,x30,[x4] // load counter (x30 is usable as data: the link register was saved in the frame)
|
| -#ifdef __ARMEB__
|
| - ror x24,x24,#32 // big-endian build: swap the two 32-bit halves of each loaded pair
|
| - ror x25,x25,#32
|
| - ror x26,x26,#32
|
| - ror x27,x27,#32
|
| - ror x28,x28,#32
|
| - ror x30,x30,#32
|
| -#endif
|
| -
|
| -.Loop_outer: // one 64-byte block per pass; x22-x28,x30 keep the input state, two words per register
|
| - mov w5,w22 // unpack key block
|
| - lsr x6,x22,#32
|
| - mov w7,w23
|
| - lsr x8,x23,#32
|
| - mov w9,w24
|
| - lsr x10,x24,#32
|
| - mov w11,w25
|
| - lsr x12,x25,#32
|
| - mov w13,w26
|
| - lsr x14,x26,#32
|
| - mov w15,w27
|
| - lsr x16,x27,#32
|
| - mov w17,w28
|
| - lsr x19,x28,#32
|
| - mov w20,w30
|
| - lsr x21,x30,#32
|
| -
|
| - mov x4,#10 // 10 passes of .Loop, each one column round plus one diagonal round
|
| - subs x2,x2,#64 // these flags stay live until b.lo .Ltail / b.hi .Loop_outer (nothing in between sets flags)
|
| -.Loop:
|
| - sub x4,x4,#1
|
| - add w5,w5,w9 // column round: quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19), (w7,w11,w15,w20), (w8,w12,w16,w21)
|
| - add w6,w6,w10
|
| - add w7,w7,w11
|
| - add w8,w8,w12
|
| - eor w17,w17,w5
|
| - eor w19,w19,w6
|
| - eor w20,w20,w7
|
| - eor w21,w21,w8
|
| - ror w17,w17,#16
|
| - ror w19,w19,#16
|
| - ror w20,w20,#16
|
| - ror w21,w21,#16
|
| - add w13,w13,w17
|
| - add w14,w14,w19
|
| - add w15,w15,w20
|
| - add w16,w16,w21
|
| - eor w9,w9,w13
|
| - eor w10,w10,w14
|
| - eor w11,w11,w15
|
| - eor w12,w12,w16
|
| - ror w9,w9,#20 // rotate right by 20 == rotate left by 12
|
| - ror w10,w10,#20
|
| - ror w11,w11,#20
|
| - ror w12,w12,#20
|
| - add w5,w5,w9
|
| - add w6,w6,w10
|
| - add w7,w7,w11
|
| - add w8,w8,w12
|
| - eor w17,w17,w5
|
| - eor w19,w19,w6
|
| - eor w20,w20,w7
|
| - eor w21,w21,w8
|
| - ror w17,w17,#24 // rotate right by 24 == rotate left by 8
|
| - ror w19,w19,#24
|
| - ror w20,w20,#24
|
| - ror w21,w21,#24
|
| - add w13,w13,w17
|
| - add w14,w14,w19
|
| - add w15,w15,w20
|
| - add w16,w16,w21
|
| - eor w9,w9,w13
|
| - eor w10,w10,w14
|
| - eor w11,w11,w15
|
| - eor w12,w12,w16
|
| - ror w9,w9,#25 // rotate right by 25 == rotate left by 7
|
| - ror w10,w10,#25
|
| - ror w11,w11,#25
|
| - ror w12,w12,#25
|
| - add w5,w5,w10 // diagonal round: quarter-rounds on (w5,w10,w15,w21), (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20)
|
| - add w6,w6,w11
|
| - add w7,w7,w12
|
| - add w8,w8,w9
|
| - eor w21,w21,w5
|
| - eor w17,w17,w6
|
| - eor w19,w19,w7
|
| - eor w20,w20,w8
|
| - ror w21,w21,#16
|
| - ror w17,w17,#16
|
| - ror w19,w19,#16
|
| - ror w20,w20,#16
|
| - add w15,w15,w21
|
| - add w16,w16,w17
|
| - add w13,w13,w19
|
| - add w14,w14,w20
|
| - eor w10,w10,w15
|
| - eor w11,w11,w16
|
| - eor w12,w12,w13
|
| - eor w9,w9,w14
|
| - ror w10,w10,#20
|
| - ror w11,w11,#20
|
| - ror w12,w12,#20
|
| - ror w9,w9,#20
|
| - add w5,w5,w10
|
| - add w6,w6,w11
|
| - add w7,w7,w12
|
| - add w8,w8,w9
|
| - eor w21,w21,w5
|
| - eor w17,w17,w6
|
| - eor w19,w19,w7
|
| - eor w20,w20,w8
|
| - ror w21,w21,#24
|
| - ror w17,w17,#24
|
| - ror w19,w19,#24
|
| - ror w20,w20,#24
|
| - add w15,w15,w21
|
| - add w16,w16,w17
|
| - add w13,w13,w19
|
| - add w14,w14,w20
|
| - eor w10,w10,w15
|
| - eor w11,w11,w16
|
| - eor w12,w12,w13
|
| - eor w9,w9,w14
|
| - ror w10,w10,#25
|
| - ror w11,w11,#25
|
| - ror w12,w12,#25
|
| - ror w9,w9,#25
|
| - cbnz x4,.Loop
|
| -
|
| - add w5,w5,w22 // accumulate key block
|
| - add x6,x6,x22,lsr#32 // 64-bit add for the odd words; any carry past bit 31 is shifted out by the lsl#32 when packing
|
| - add w7,w7,w23
|
| - add x8,x8,x23,lsr#32
|
| - add w9,w9,w24
|
| - add x10,x10,x24,lsr#32
|
| - add w11,w11,w25
|
| - add x12,x12,x25,lsr#32
|
| - add w13,w13,w26
|
| - add x14,x14,x26,lsr#32
|
| - add w15,w15,w27
|
| - add x16,x16,x27,lsr#32
|
| - add w17,w17,w28
|
| - add x19,x19,x28,lsr#32
|
| - add w20,w20,w30
|
| - add x21,x21,x30,lsr#32
|
| -
|
| - b.lo .Ltail // fewer than 64 bytes were left (flags from the subs at the top of .Loop_outer)
|
| -
|
| - add x5,x5,x6,lsl#32 // pack
|
| - add x7,x7,x8,lsl#32
|
| - ldp x6,x8,[x1,#0] // load input
|
| - add x9,x9,x10,lsl#32
|
| - add x11,x11,x12,lsl#32
|
| - ldp x10,x12,[x1,#16]
|
| - add x13,x13,x14,lsl#32
|
| - add x15,x15,x16,lsl#32
|
| - ldp x14,x16,[x1,#32]
|
| - add x17,x17,x19,lsl#32
|
| - add x20,x20,x21,lsl#32
|
| - ldp x19,x21,[x1,#48]
|
| - add x1,x1,#64
|
| -#ifdef __ARMEB__
|
| - rev x5,x5
|
| - rev x7,x7
|
| - rev x9,x9
|
| - rev x11,x11
|
| - rev x13,x13
|
| - rev x15,x15
|
| - rev x17,x17
|
| - rev x20,x20
|
| -#endif
|
| - eor x5,x5,x6
|
| - eor x7,x7,x8
|
| - eor x9,x9,x10
|
| - eor x11,x11,x12
|
| - eor x13,x13,x14
|
| - eor x15,x15,x16
|
| - eor x17,x17,x19
|
| - eor x20,x20,x21
|
| -
|
| - stp x5,x7,[x0,#0] // store output
|
| - add x28,x28,#1 // increment counter
|
| - stp x9,x11,[x0,#16]
|
| - stp x13,x15,[x0,#32]
|
| - stp x17,x20,[x0,#48]
|
| - add x0,x0,#64
|
| -
|
| - b.hi .Loop_outer // input remains (flags still from the subs at the top of .Loop_outer)
|
| -
|
| - ldp x19,x20,[x29,#16]
|
| - add sp,sp,#64 // drop the scratch area
|
| - ldp x21,x22,[x29,#32]
|
| - ldp x23,x24,[x29,#48]
|
| - ldp x25,x26,[x29,#64]
|
| - ldp x27,x28,[x29,#80]
|
| - ldp x29,x30,[sp],#96
|
| -.Labort:
|
| - ret
|
| -
|
| -.align 4
|
| -.Ltail:
|
| - add x2,x2,#64 // undo the subs: x2 = bytes still to process (1..63)
|
| -.Less_than_64: // ChaCha20_neon also branches here, with its scalar block in w5-w21 and the same frame layout
|
| - sub x0,x0,#1 // bias by -1: the store in .Loop_tail indexes with x2 after it has been incremented
|
| - add x1,x1,x2
|
| - add x0,x0,x2
|
| - add x4,sp,x2
|
| - neg x2,x2 // x2 now counts up from -length to 0
|
| -
|
| - add x5,x5,x6,lsl#32 // pack
|
| - add x7,x7,x8,lsl#32
|
| - add x9,x9,x10,lsl#32
|
| - add x11,x11,x12,lsl#32
|
| - add x13,x13,x14,lsl#32
|
| - add x15,x15,x16,lsl#32
|
| - add x17,x17,x19,lsl#32
|
| - add x20,x20,x21,lsl#32
|
| -#ifdef __ARMEB__
|
| - rev x5,x5
|
| - rev x7,x7
|
| - rev x9,x9
|
| - rev x11,x11
|
| - rev x13,x13
|
| - rev x15,x15
|
| - rev x17,x17
|
| - rev x20,x20
|
| -#endif
|
| - stp x5,x7,[sp,#0] // spill the 64-byte keystream block to the stack scratch
|
| - stp x9,x11,[sp,#16]
|
| - stp x13,x15,[sp,#32]
|
| - stp x17,x20,[sp,#48]
|
| -
|
| -.Loop_tail: // XOR one input byte with one keystream byte per pass
|
| - ldrb w10,[x1,x2]
|
| - ldrb w11,[x4,x2]
|
| - add x2,x2,#1
|
| - eor w10,w10,w11
|
| - strb w10,[x0,x2]
|
| - cbnz x2,.Loop_tail
|
| -
|
| - stp xzr,xzr,[sp,#0] // overwrite the keystream copy on the stack with zeros
|
| - stp xzr,xzr,[sp,#16]
|
| - stp xzr,xzr,[sp,#32]
|
| - stp xzr,xzr,[sp,#48]
|
| -
|
| - ldp x19,x20,[x29,#16]
|
| - add sp,sp,#64
|
| - ldp x21,x22,[x29,#32]
|
| - ldp x23,x24,[x29,#48]
|
| - ldp x25,x26,[x29,#64]
|
| - ldp x27,x28,[x29,#80]
|
| - ldp x29,x30,[sp],#96
|
| - ret
|
| -.size ChaCha20_ctr32,.-ChaCha20_ctr32
|
| -
|
| -.type ChaCha20_neon,%function // No .globl: reached by branch from ChaCha20_ctr32 with x0-x4 unchanged (output, input, length, key, counter).
|
| -.align 5 // Each outer pass produces 256 bytes: one block in scalar registers interleaved with three NEON blocks.
|
| -ChaCha20_neon:
|
| - stp x29,x30,[sp,#-96]! // same 96-byte frame layout as ChaCha20_ctr32
|
| - add x29,sp,#0
|
| -
|
| - adr x5,.Lsigma
|
| - stp x19,x20,[sp,#16]
|
| - stp x21,x22,[sp,#32]
|
| - stp x23,x24,[sp,#48]
|
| - stp x25,x26,[sp,#64]
|
| - stp x27,x28,[sp,#80]
|
| - cmp x2,#512
|
| - b.hs .L512_or_more_neon // 512 bytes or more: continue inside ChaCha20_512_neon, after its identical prologue
|
| -
|
| - sub sp,sp,#64 // 64 bytes of scratch for a final partial block
|
| -
|
| - ldp x22,x23,[x5] // load sigma
|
| - ld1 {v24.4s},[x5],#16 // same 16 bytes as a vector; x5 now points at .Lone
|
| - ldp x24,x25,[x3] // load key
|
| - ldp x26,x27,[x3,#16]
|
| - ld1 {v25.4s,v26.4s},[x3] // vector copy of the 32 key bytes
|
| - ldp x28,x30,[x4] // load counter
|
| - ld1 {v27.4s},[x4] // vector copy of the counter block
|
| - ld1 {v31.4s},[x5] // v31 = {1,0,0,0} from .Lone
|
| -#ifdef __ARMEB__
|
| - rev64 v24.4s,v24.4s
|
| - ror x24,x24,#32
|
| - ror x25,x25,#32
|
| - ror x26,x26,#32
|
| - ror x27,x27,#32
|
| - ror x28,x28,#32
|
| - ror x30,x30,#32
|
| -#endif
|
| - add v27.4s,v27.4s,v31.4s // += 1 (the scalar block keeps the unmodified counter in x28)
|
| - add v28.4s,v27.4s,v31.4s // counter + 2
|
| - add v29.4s,v28.4s,v31.4s // counter + 3
|
| - shl v31.4s,v31.4s,#2 // 1 -> 4
|
| -
|
| -.Loop_outer_neon: // v0-v3, v4-v7, v16-v19 = working state of the three NEON blocks; w5-w21 = scalar block
|
| - mov w5,w22 // unpack key block
|
| - lsr x6,x22,#32
|
| - mov v0.16b,v24.16b
|
| - mov w7,w23
|
| - lsr x8,x23,#32
|
| - mov v4.16b,v24.16b
|
| - mov w9,w24
|
| - lsr x10,x24,#32
|
| - mov v16.16b,v24.16b
|
| - mov w11,w25
|
| - mov v1.16b,v25.16b
|
| - lsr x12,x25,#32
|
| - mov v5.16b,v25.16b
|
| - mov w13,w26
|
| - mov v17.16b,v25.16b
|
| - lsr x14,x26,#32
|
| - mov v3.16b,v27.16b
|
| - mov w15,w27
|
| - mov v7.16b,v28.16b
|
| - lsr x16,x27,#32
|
| - mov v19.16b,v29.16b
|
| - mov w17,w28
|
| - mov v2.16b,v26.16b
|
| - lsr x19,x28,#32
|
| - mov v6.16b,v26.16b
|
| - mov w20,w30
|
| - mov v18.16b,v26.16b
|
| - lsr x21,x30,#32
|
| -
|
| - mov x4,#10 // 10 passes, each one column round plus one diagonal round on all four blocks
|
| - subs x2,x2,#256 // these flags stay live until b.lo .Ltail_neon / b.hi .Loop_outer_neon
|
| -.Loop_neon:
|
| - sub x4,x4,#1
|
| - add v0.4s,v0.4s,v1.4s
|
| - add w5,w5,w9
|
| - add v4.4s,v4.4s,v5.4s
|
| - add w6,w6,w10
|
| - add v16.4s,v16.4s,v17.4s
|
| - add w7,w7,w11
|
| - eor v3.16b,v3.16b,v0.16b
|
| - add w8,w8,w12
|
| - eor v7.16b,v7.16b,v4.16b
|
| - eor w17,w17,w5
|
| - eor v19.16b,v19.16b,v16.16b
|
| - eor w19,w19,w6
|
| - rev32 v3.8h,v3.8h // swapping the 16-bit halves of each word = rotate by 16
|
| - eor w20,w20,w7
|
| - rev32 v7.8h,v7.8h
|
| - eor w21,w21,w8
|
| - rev32 v19.8h,v19.8h
|
| - ror w17,w17,#16
|
| - add v2.4s,v2.4s,v3.4s
|
| - ror w19,w19,#16
|
| - add v6.4s,v6.4s,v7.4s
|
| - ror w20,w20,#16
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w21,w21,#16
|
| - eor v20.16b,v1.16b,v2.16b
|
| - add w13,w13,w17
|
| - eor v21.16b,v5.16b,v6.16b
|
| - add w14,w14,w19
|
| - eor v22.16b,v17.16b,v18.16b
|
| - add w15,w15,w20
|
| - ushr v1.4s,v20.4s,#20 // ushr #20 plus the sli #12 below = rotate left by 12
|
| - add w16,w16,w21
|
| - ushr v5.4s,v21.4s,#20
|
| - eor w9,w9,w13
|
| - ushr v17.4s,v22.4s,#20
|
| - eor w10,w10,w14
|
| - sli v1.4s,v20.4s,#12
|
| - eor w11,w11,w15
|
| - sli v5.4s,v21.4s,#12
|
| - eor w12,w12,w16
|
| - sli v17.4s,v22.4s,#12
|
| - ror w9,w9,#20
|
| - add v0.4s,v0.4s,v1.4s
|
| - ror w10,w10,#20
|
| - add v4.4s,v4.4s,v5.4s
|
| - ror w11,w11,#20
|
| - add v16.4s,v16.4s,v17.4s
|
| - ror w12,w12,#20
|
| - eor v20.16b,v3.16b,v0.16b
|
| - add w5,w5,w9
|
| - eor v21.16b,v7.16b,v4.16b
|
| - add w6,w6,w10
|
| - eor v22.16b,v19.16b,v16.16b
|
| - add w7,w7,w11
|
| - ushr v3.4s,v20.4s,#24 // ushr #24 plus the sli #8 below = rotate left by 8
|
| - add w8,w8,w12
|
| - ushr v7.4s,v21.4s,#24
|
| - eor w17,w17,w5
|
| - ushr v19.4s,v22.4s,#24
|
| - eor w19,w19,w6
|
| - sli v3.4s,v20.4s,#8
|
| - eor w20,w20,w7
|
| - sli v7.4s,v21.4s,#8
|
| - eor w21,w21,w8
|
| - sli v19.4s,v22.4s,#8
|
| - ror w17,w17,#24
|
| - add v2.4s,v2.4s,v3.4s
|
| - ror w19,w19,#24
|
| - add v6.4s,v6.4s,v7.4s
|
| - ror w20,w20,#24
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w21,w21,#24
|
| - eor v20.16b,v1.16b,v2.16b
|
| - add w13,w13,w17
|
| - eor v21.16b,v5.16b,v6.16b
|
| - add w14,w14,w19
|
| - eor v22.16b,v17.16b,v18.16b
|
| - add w15,w15,w20
|
| - ushr v1.4s,v20.4s,#25 // ushr #25 plus the sli #7 below = rotate left by 7
|
| - add w16,w16,w21
|
| - ushr v5.4s,v21.4s,#25
|
| - eor w9,w9,w13
|
| - ushr v17.4s,v22.4s,#25
|
| - eor w10,w10,w14
|
| - sli v1.4s,v20.4s,#7
|
| - eor w11,w11,w15
|
| - sli v5.4s,v21.4s,#7
|
| - eor w12,w12,w16
|
| - sli v17.4s,v22.4s,#7
|
| - ror w9,w9,#25
|
| - ext v2.16b,v2.16b,v2.16b,#8 // rotate rows 1-3 by 4/8/12 bytes so the next half works on diagonals
|
| - ror w10,w10,#25
|
| - ext v6.16b,v6.16b,v6.16b,#8
|
| - ror w11,w11,#25
|
| - ext v18.16b,v18.16b,v18.16b,#8
|
| - ror w12,w12,#25
|
| - ext v3.16b,v3.16b,v3.16b,#12
|
| - ext v7.16b,v7.16b,v7.16b,#12
|
| - ext v19.16b,v19.16b,v19.16b,#12
|
| - ext v1.16b,v1.16b,v1.16b,#4
|
| - ext v5.16b,v5.16b,v5.16b,#4
|
| - ext v17.16b,v17.16b,v17.16b,#4
|
| - add v0.4s,v0.4s,v1.4s
|
| - add w5,w5,w10
|
| - add v4.4s,v4.4s,v5.4s
|
| - add w6,w6,w11
|
| - add v16.4s,v16.4s,v17.4s
|
| - add w7,w7,w12
|
| - eor v3.16b,v3.16b,v0.16b
|
| - add w8,w8,w9
|
| - eor v7.16b,v7.16b,v4.16b
|
| - eor w21,w21,w5
|
| - eor v19.16b,v19.16b,v16.16b
|
| - eor w17,w17,w6
|
| - rev32 v3.8h,v3.8h
|
| - eor w19,w19,w7
|
| - rev32 v7.8h,v7.8h
|
| - eor w20,w20,w8
|
| - rev32 v19.8h,v19.8h
|
| - ror w21,w21,#16
|
| - add v2.4s,v2.4s,v3.4s
|
| - ror w17,w17,#16
|
| - add v6.4s,v6.4s,v7.4s
|
| - ror w19,w19,#16
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w20,w20,#16
|
| - eor v20.16b,v1.16b,v2.16b
|
| - add w15,w15,w21
|
| - eor v21.16b,v5.16b,v6.16b
|
| - add w16,w16,w17
|
| - eor v22.16b,v17.16b,v18.16b
|
| - add w13,w13,w19
|
| - ushr v1.4s,v20.4s,#20
|
| - add w14,w14,w20
|
| - ushr v5.4s,v21.4s,#20
|
| - eor w10,w10,w15
|
| - ushr v17.4s,v22.4s,#20
|
| - eor w11,w11,w16
|
| - sli v1.4s,v20.4s,#12
|
| - eor w12,w12,w13
|
| - sli v5.4s,v21.4s,#12
|
| - eor w9,w9,w14
|
| - sli v17.4s,v22.4s,#12
|
| - ror w10,w10,#20
|
| - add v0.4s,v0.4s,v1.4s
|
| - ror w11,w11,#20
|
| - add v4.4s,v4.4s,v5.4s
|
| - ror w12,w12,#20
|
| - add v16.4s,v16.4s,v17.4s
|
| - ror w9,w9,#20
|
| - eor v20.16b,v3.16b,v0.16b
|
| - add w5,w5,w10
|
| - eor v21.16b,v7.16b,v4.16b
|
| - add w6,w6,w11
|
| - eor v22.16b,v19.16b,v16.16b
|
| - add w7,w7,w12
|
| - ushr v3.4s,v20.4s,#24
|
| - add w8,w8,w9
|
| - ushr v7.4s,v21.4s,#24
|
| - eor w21,w21,w5
|
| - ushr v19.4s,v22.4s,#24
|
| - eor w17,w17,w6
|
| - sli v3.4s,v20.4s,#8
|
| - eor w19,w19,w7
|
| - sli v7.4s,v21.4s,#8
|
| - eor w20,w20,w8
|
| - sli v19.4s,v22.4s,#8
|
| - ror w21,w21,#24
|
| - add v2.4s,v2.4s,v3.4s
|
| - ror w17,w17,#24
|
| - add v6.4s,v6.4s,v7.4s
|
| - ror w19,w19,#24
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w20,w20,#24
|
| - eor v20.16b,v1.16b,v2.16b
|
| - add w15,w15,w21
|
| - eor v21.16b,v5.16b,v6.16b
|
| - add w16,w16,w17
|
| - eor v22.16b,v17.16b,v18.16b
|
| - add w13,w13,w19
|
| - ushr v1.4s,v20.4s,#25
|
| - add w14,w14,w20
|
| - ushr v5.4s,v21.4s,#25
|
| - eor w10,w10,w15
|
| - ushr v17.4s,v22.4s,#25
|
| - eor w11,w11,w16
|
| - sli v1.4s,v20.4s,#7
|
| - eor w12,w12,w13
|
| - sli v5.4s,v21.4s,#7
|
| - eor w9,w9,w14
|
| - sli v17.4s,v22.4s,#7
|
| - ror w10,w10,#25
|
| - ext v2.16b,v2.16b,v2.16b,#8 // rotate rows 1-3 back (12/8/4 bytes) to column order
|
| - ror w11,w11,#25
|
| - ext v6.16b,v6.16b,v6.16b,#8
|
| - ror w12,w12,#25
|
| - ext v18.16b,v18.16b,v18.16b,#8
|
| - ror w9,w9,#25
|
| - ext v3.16b,v3.16b,v3.16b,#4
|
| - ext v7.16b,v7.16b,v7.16b,#4
|
| - ext v19.16b,v19.16b,v19.16b,#4
|
| - ext v1.16b,v1.16b,v1.16b,#12
|
| - ext v5.16b,v5.16b,v5.16b,#12
|
| - ext v17.16b,v17.16b,v17.16b,#12
|
| - cbnz x4,.Loop_neon
|
| -
|
| - add w5,w5,w22 // accumulate key block
|
| - add v0.4s,v0.4s,v24.4s
|
| - add x6,x6,x22,lsr#32
|
| - add v4.4s,v4.4s,v24.4s
|
| - add w7,w7,w23
|
| - add v16.4s,v16.4s,v24.4s
|
| - add x8,x8,x23,lsr#32
|
| - add v2.4s,v2.4s,v26.4s
|
| - add w9,w9,w24
|
| - add v6.4s,v6.4s,v26.4s
|
| - add x10,x10,x24,lsr#32
|
| - add v18.4s,v18.4s,v26.4s
|
| - add w11,w11,w25
|
| - add v3.4s,v3.4s,v27.4s
|
| - add x12,x12,x25,lsr#32
|
| - add w13,w13,w26
|
| - add v7.4s,v7.4s,v28.4s
|
| - add x14,x14,x26,lsr#32
|
| - add w15,w15,w27
|
| - add v19.4s,v19.4s,v29.4s
|
| - add x16,x16,x27,lsr#32
|
| - add w17,w17,w28
|
| - add v1.4s,v1.4s,v25.4s
|
| - add x19,x19,x28,lsr#32
|
| - add w20,w20,w30
|
| - add v5.4s,v5.4s,v25.4s
|
| - add x21,x21,x30,lsr#32
|
| - add v17.4s,v17.4s,v25.4s
|
| -
|
| - b.lo .Ltail_neon // fewer than 256 bytes were left (flags from the subs at the top of .Loop_outer_neon)
|
| -
|
| - add x5,x5,x6,lsl#32 // pack
|
| - add x7,x7,x8,lsl#32
|
| - ldp x6,x8,[x1,#0] // load input
|
| - add x9,x9,x10,lsl#32
|
| - add x11,x11,x12,lsl#32
|
| - ldp x10,x12,[x1,#16]
|
| - add x13,x13,x14,lsl#32
|
| - add x15,x15,x16,lsl#32
|
| - ldp x14,x16,[x1,#32]
|
| - add x17,x17,x19,lsl#32
|
| - add x20,x20,x21,lsl#32
|
| - ldp x19,x21,[x1,#48]
|
| - add x1,x1,#64
|
| -#ifdef __ARMEB__
|
| - rev x5,x5
|
| - rev x7,x7
|
| - rev x9,x9
|
| - rev x11,x11
|
| - rev x13,x13
|
| - rev x15,x15
|
| - rev x17,x17
|
| - rev x20,x20
|
| -#endif
|
| - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // input for the first NEON block (bytes 64-127 of this pass)
|
| - eor x5,x5,x6
|
| - eor x7,x7,x8
|
| - eor x9,x9,x10
|
| - eor x11,x11,x12
|
| - eor x13,x13,x14
|
| - eor v0.16b,v0.16b,v20.16b
|
| - eor x15,x15,x16
|
| - eor v1.16b,v1.16b,v21.16b
|
| - eor x17,x17,x19
|
| - eor v2.16b,v2.16b,v22.16b
|
| - eor x20,x20,x21
|
| - eor v3.16b,v3.16b,v23.16b
|
| - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // input for the second NEON block
|
| -
|
| - stp x5,x7,[x0,#0] // store output
|
| - add x28,x28,#4 // increment counter
|
| - stp x9,x11,[x0,#16]
|
| - add v27.4s,v27.4s,v31.4s // += 4
|
| - stp x13,x15,[x0,#32]
|
| - add v28.4s,v28.4s,v31.4s
|
| - stp x17,x20,[x0,#48]
|
| - add v29.4s,v29.4s,v31.4s
|
| - add x0,x0,#64
|
| -
|
| - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
|
| - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // v0-v3 were just stored, so they are reused for the third block's input
|
| -
|
| - eor v4.16b,v4.16b,v20.16b
|
| - eor v5.16b,v5.16b,v21.16b
|
| - eor v6.16b,v6.16b,v22.16b
|
| - eor v7.16b,v7.16b,v23.16b
|
| - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
|
| -
|
| - eor v16.16b,v16.16b,v0.16b
|
| - eor v17.16b,v17.16b,v1.16b
|
| - eor v18.16b,v18.16b,v2.16b
|
| - eor v19.16b,v19.16b,v3.16b
|
| - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
|
| -
|
| - b.hi .Loop_outer_neon // input remains (flags still from the subs at the top of .Loop_outer_neon)
|
| -
|
| - ldp x19,x20,[x29,#16]
|
| - add sp,sp,#64
|
| - ldp x21,x22,[x29,#32]
|
| - ldp x23,x24,[x29,#48]
|
| - ldp x25,x26,[x29,#64]
|
| - ldp x27,x28,[x29,#80]
|
| - ldp x29,x30,[sp],#96
|
| - ret
|
| -
|
| -.Ltail_neon:
|
| - add x2,x2,#256 // undo the subs: x2 = bytes still to process (1..255)
|
| - cmp x2,#64
|
| - b.lo .Less_than_64 // under one block: shared tail in ChaCha20_ctr32, which uses the scalar block
|
| -
|
| - add x5,x5,x6,lsl#32 // pack
|
| - add x7,x7,x8,lsl#32
|
| - ldp x6,x8,[x1,#0] // load input
|
| - add x9,x9,x10,lsl#32
|
| - add x11,x11,x12,lsl#32
|
| - ldp x10,x12,[x1,#16]
|
| - add x13,x13,x14,lsl#32
|
| - add x15,x15,x16,lsl#32
|
| - ldp x14,x16,[x1,#32]
|
| - add x17,x17,x19,lsl#32
|
| - add x20,x20,x21,lsl#32
|
| - ldp x19,x21,[x1,#48]
|
| - add x1,x1,#64
|
| -#ifdef __ARMEB__
|
| - rev x5,x5
|
| - rev x7,x7
|
| - rev x9,x9
|
| - rev x11,x11
|
| - rev x13,x13
|
| - rev x15,x15
|
| - rev x17,x17
|
| - rev x20,x20
|
| -#endif
|
| - eor x5,x5,x6
|
| - eor x7,x7,x8
|
| - eor x9,x9,x10
|
| - eor x11,x11,x12
|
| - eor x13,x13,x14
|
| - eor x15,x15,x16
|
| - eor x17,x17,x19
|
| - eor x20,x20,x21
|
| -
|
| - stp x5,x7,[x0,#0] // store output
|
| - add x28,x28,#4 // increment counter
|
| - stp x9,x11,[x0,#16]
|
| - stp x13,x15,[x0,#32]
|
| - stp x17,x20,[x0,#48]
|
| - add x0,x0,#64
|
| - b.eq .Ldone_neon // exactly 64 bytes remained (flags from the cmp at the top of .Ltail_neon)
|
| - sub x2,x2,#64
|
| - cmp x2,#64
|
| - b.lo .Less_than_128
|
| -
|
| - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
|
| - eor v0.16b,v0.16b,v20.16b
|
| - eor v1.16b,v1.16b,v21.16b
|
| - eor v2.16b,v2.16b,v22.16b
|
| - eor v3.16b,v3.16b,v23.16b
|
| - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
|
| - b.eq .Ldone_neon // flags from the cmp just above: nothing left after this block
|
| - sub x2,x2,#64
|
| - cmp x2,#64
|
| - b.lo .Less_than_192
|
| -
|
| - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
|
| - eor v4.16b,v4.16b,v20.16b
|
| - eor v5.16b,v5.16b,v21.16b
|
| - eor v6.16b,v6.16b,v22.16b
|
| - eor v7.16b,v7.16b,v23.16b
|
| - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
|
| - b.eq .Ldone_neon
|
| - sub x2,x2,#64
|
| -
|
| - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] // third NEON block becomes the keystream for the final partial block
|
| - b .Last_neon
|
| -
|
| -.Less_than_128:
|
| - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] // first NEON block becomes the keystream for the final partial block
|
| - b .Last_neon
|
| -.Less_than_192:
|
| - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] // second NEON block becomes the keystream for the final partial block
|
| - b .Last_neon
|
| -
|
| -.align 4
|
| -.Last_neon: // byte-wise tail: x2 = bytes left, keystream block sits at [sp]
|
| - sub x0,x0,#1 // bias by -1: the store below indexes with x2 after it has been incremented
|
| - add x1,x1,x2
|
| - add x0,x0,x2
|
| - add x4,sp,x2
|
| - neg x2,x2 // x2 now counts up from -length to 0
|
| -
|
| -.Loop_tail_neon:
|
| - ldrb w10,[x1,x2]
|
| - ldrb w11,[x4,x2]
|
| - add x2,x2,#1
|
| - eor w10,w10,w11
|
| - strb w10,[x0,x2]
|
| - cbnz x2,.Loop_tail_neon
|
| -
|
| - stp xzr,xzr,[sp,#0] // overwrite the keystream copy on the stack with zeros
|
| - stp xzr,xzr,[sp,#16]
|
| - stp xzr,xzr,[sp,#32]
|
| - stp xzr,xzr,[sp,#48]
|
| -
|
| -.Ldone_neon:
|
| - ldp x19,x20,[x29,#16]
|
| - add sp,sp,#64 // drop the scratch area
|
| - ldp x21,x22,[x29,#32]
|
| - ldp x23,x24,[x29,#48]
|
| - ldp x25,x26,[x29,#64]
|
| - ldp x27,x28,[x29,#80]
|
| - ldp x29,x30,[sp],#96
|
| - ret
|
| -.size ChaCha20_neon,.-ChaCha20_neon
|
| -.type ChaCha20_512_neon,%function
|
| -.align 5
|
| -ChaCha20_512_neon:
|
| - stp x29,x30,[sp,#-96]!
|
| - add x29,sp,#0
|
| -
|
| - adr x5,.Lsigma
|
| - stp x19,x20,[sp,#16]
|
| - stp x21,x22,[sp,#32]
|
| - stp x23,x24,[sp,#48]
|
| - stp x25,x26,[sp,#64]
|
| - stp x27,x28,[sp,#80]
|
| -
|
| -.L512_or_more_neon:
|
| - sub sp,sp,#128+64
|
| -
|
| - ldp x22,x23,[x5] // load sigma
|
| - ld1 {v24.4s},[x5],#16
|
| - ldp x24,x25,[x3] // load key
|
| - ldp x26,x27,[x3,#16]
|
| - ld1 {v25.4s,v26.4s},[x3]
|
| - ldp x28,x30,[x4] // load counter
|
| - ld1 {v27.4s},[x4]
|
| - ld1 {v31.4s},[x5]
|
| -#ifdef __ARMEB__
|
| - rev64 v24.4s,v24.4s
|
| - ror x24,x24,#32
|
| - ror x25,x25,#32
|
| - ror x26,x26,#32
|
| - ror x27,x27,#32
|
| - ror x28,x28,#32
|
| - ror x30,x30,#32
|
| -#endif
|
| - add v27.4s,v27.4s,v31.4s // += 1
|
| - stp q24,q25,[sp,#0] // off-load key block, invariant part
|
| - add v27.4s,v27.4s,v31.4s // not typo
|
| - str q26,[sp,#32]
|
| - add v28.4s,v27.4s,v31.4s
|
| - add v29.4s,v28.4s,v31.4s
|
| - add v30.4s,v29.4s,v31.4s
|
| - shl v31.4s,v31.4s,#2 // 1 -> 4
|
| -
|
| - stp d8,d9,[sp,#128+0] // meet ABI requirements
|
| - stp d10,d11,[sp,#128+16]
|
| - stp d12,d13,[sp,#128+32]
|
| - stp d14,d15,[sp,#128+48]
|
| -
|
| - sub x2,x2,#512 // not typo
|
| -
|
| -.Loop_outer_512_neon:
|
| - mov v0.16b,v24.16b
|
| - mov v4.16b,v24.16b
|
| - mov v8.16b,v24.16b
|
| - mov v12.16b,v24.16b
|
| - mov v16.16b,v24.16b
|
| - mov v20.16b,v24.16b
|
| - mov v1.16b,v25.16b
|
| - mov w5,w22 // unpack key block
|
| - mov v5.16b,v25.16b
|
| - lsr x6,x22,#32
|
| - mov v9.16b,v25.16b
|
| - mov w7,w23
|
| - mov v13.16b,v25.16b
|
| - lsr x8,x23,#32
|
| - mov v17.16b,v25.16b
|
| - mov w9,w24
|
| - mov v21.16b,v25.16b
|
| - lsr x10,x24,#32
|
| - mov v3.16b,v27.16b
|
| - mov w11,w25
|
| - mov v7.16b,v28.16b
|
| - lsr x12,x25,#32
|
| - mov v11.16b,v29.16b
|
| - mov w13,w26
|
| - mov v15.16b,v30.16b
|
| - lsr x14,x26,#32
|
| - mov v2.16b,v26.16b
|
| - mov w15,w27
|
| - mov v6.16b,v26.16b
|
| - lsr x16,x27,#32
|
| - add v19.4s,v3.4s,v31.4s // +4
|
| - mov w17,w28
|
| - add v23.4s,v7.4s,v31.4s // +4
|
| - lsr x19,x28,#32
|
| - mov v10.16b,v26.16b
|
| - mov w20,w30
|
| - mov v14.16b,v26.16b
|
| - lsr x21,x30,#32
|
| - mov v18.16b,v26.16b
|
| - stp q27,q28,[sp,#48] // off-load key block, variable part
|
| - mov v22.16b,v26.16b
|
| - str q29,[sp,#80]
|
| -
|
| - mov x4,#5
|
| - subs x2,x2,#512
|
| -.Loop_upper_neon:
|
| - sub x4,x4,#1
|
| - add v0.4s,v0.4s,v1.4s
|
| - add w5,w5,w9
|
| - add v4.4s,v4.4s,v5.4s
|
| - add w6,w6,w10
|
| - add v8.4s,v8.4s,v9.4s
|
| - add w7,w7,w11
|
| - add v12.4s,v12.4s,v13.4s
|
| - add w8,w8,w12
|
| - add v16.4s,v16.4s,v17.4s
|
| - eor w17,w17,w5
|
| - add v20.4s,v20.4s,v21.4s
|
| - eor w19,w19,w6
|
| - eor v3.16b,v3.16b,v0.16b
|
| - eor w20,w20,w7
|
| - eor v7.16b,v7.16b,v4.16b
|
| - eor w21,w21,w8
|
| - eor v11.16b,v11.16b,v8.16b
|
| - ror w17,w17,#16
|
| - eor v15.16b,v15.16b,v12.16b
|
| - ror w19,w19,#16
|
| - eor v19.16b,v19.16b,v16.16b
|
| - ror w20,w20,#16
|
| - eor v23.16b,v23.16b,v20.16b
|
| - ror w21,w21,#16
|
| - rev32 v3.8h,v3.8h
|
| - add w13,w13,w17
|
| - rev32 v7.8h,v7.8h
|
| - add w14,w14,w19
|
| - rev32 v11.8h,v11.8h
|
| - add w15,w15,w20
|
| - rev32 v15.8h,v15.8h
|
| - add w16,w16,w21
|
| - rev32 v19.8h,v19.8h
|
| - eor w9,w9,w13
|
| - rev32 v23.8h,v23.8h
|
| - eor w10,w10,w14
|
| - add v2.4s,v2.4s,v3.4s
|
| - eor w11,w11,w15
|
| - add v6.4s,v6.4s,v7.4s
|
| - eor w12,w12,w16
|
| - add v10.4s,v10.4s,v11.4s
|
| - ror w9,w9,#20
|
| - add v14.4s,v14.4s,v15.4s
|
| - ror w10,w10,#20
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w11,w11,#20
|
| - add v22.4s,v22.4s,v23.4s
|
| - ror w12,w12,#20
|
| - eor v24.16b,v1.16b,v2.16b
|
| - add w5,w5,w9
|
| - eor v25.16b,v5.16b,v6.16b
|
| - add w6,w6,w10
|
| - eor v26.16b,v9.16b,v10.16b
|
| - add w7,w7,w11
|
| - eor v27.16b,v13.16b,v14.16b
|
| - add w8,w8,w12
|
| - eor v28.16b,v17.16b,v18.16b
|
| - eor w17,w17,w5
|
| - eor v29.16b,v21.16b,v22.16b
|
| - eor w19,w19,w6
|
| - ushr v1.4s,v24.4s,#20
|
| - eor w20,w20,w7
|
| - ushr v5.4s,v25.4s,#20
|
| - eor w21,w21,w8
|
| - ushr v9.4s,v26.4s,#20
|
| - ror w17,w17,#24
|
| - ushr v13.4s,v27.4s,#20
|
| - ror w19,w19,#24
|
| - ushr v17.4s,v28.4s,#20
|
| - ror w20,w20,#24
|
| - ushr v21.4s,v29.4s,#20
|
| - ror w21,w21,#24
|
| - sli v1.4s,v24.4s,#12
|
| - add w13,w13,w17
|
| - sli v5.4s,v25.4s,#12
|
| - add w14,w14,w19
|
| - sli v9.4s,v26.4s,#12
|
| - add w15,w15,w20
|
| - sli v13.4s,v27.4s,#12
|
| - add w16,w16,w21
|
| - sli v17.4s,v28.4s,#12
|
| - eor w9,w9,w13
|
| - sli v21.4s,v29.4s,#12
|
| - eor w10,w10,w14
|
| - add v0.4s,v0.4s,v1.4s
|
| - eor w11,w11,w15
|
| - add v4.4s,v4.4s,v5.4s
|
| - eor w12,w12,w16
|
| - add v8.4s,v8.4s,v9.4s
|
| - ror w9,w9,#25
|
| - add v12.4s,v12.4s,v13.4s
|
| - ror w10,w10,#25
|
| - add v16.4s,v16.4s,v17.4s
|
| - ror w11,w11,#25
|
| - add v20.4s,v20.4s,v21.4s
|
| - ror w12,w12,#25
|
| - eor v24.16b,v3.16b,v0.16b
|
| - add w5,w5,w10
|
| - eor v25.16b,v7.16b,v4.16b
|
| - add w6,w6,w11
|
| - eor v26.16b,v11.16b,v8.16b
|
| - add w7,w7,w12
|
| - eor v27.16b,v15.16b,v12.16b
|
| - add w8,w8,w9
|
| - eor v28.16b,v19.16b,v16.16b
|
| - eor w21,w21,w5
|
| - eor v29.16b,v23.16b,v20.16b
|
| - eor w17,w17,w6
|
| - ushr v3.4s,v24.4s,#24
|
| - eor w19,w19,w7
|
| - ushr v7.4s,v25.4s,#24
|
| - eor w20,w20,w8
|
| - ushr v11.4s,v26.4s,#24
|
| - ror w21,w21,#16
|
| - ushr v15.4s,v27.4s,#24
|
| - ror w17,w17,#16
|
| - ushr v19.4s,v28.4s,#24
|
| - ror w19,w19,#16
|
| - ushr v23.4s,v29.4s,#24
|
| - ror w20,w20,#16
|
| - sli v3.4s,v24.4s,#8
|
| - add w15,w15,w21
|
| - sli v7.4s,v25.4s,#8
|
| - add w16,w16,w17
|
| - sli v11.4s,v26.4s,#8
|
| - add w13,w13,w19
|
| - sli v15.4s,v27.4s,#8
|
| - add w14,w14,w20
|
| - sli v19.4s,v28.4s,#8
|
| - eor w10,w10,w15
|
| - sli v23.4s,v29.4s,#8
|
| - eor w11,w11,w16
|
| - add v2.4s,v2.4s,v3.4s
|
| - eor w12,w12,w13
|
| - add v6.4s,v6.4s,v7.4s
|
| - eor w9,w9,w14
|
| - add v10.4s,v10.4s,v11.4s
|
| - ror w10,w10,#20
|
| - add v14.4s,v14.4s,v15.4s
|
| - ror w11,w11,#20
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w12,w12,#20
|
| - add v22.4s,v22.4s,v23.4s
|
| - ror w9,w9,#20
|
| - eor v24.16b,v1.16b,v2.16b
|
| - add w5,w5,w10
|
| - eor v25.16b,v5.16b,v6.16b
|
| - add w6,w6,w11
|
| - eor v26.16b,v9.16b,v10.16b
|
| - add w7,w7,w12
|
| - eor v27.16b,v13.16b,v14.16b
|
| - add w8,w8,w9
|
| - eor v28.16b,v17.16b,v18.16b
|
| - eor w21,w21,w5
|
| - eor v29.16b,v21.16b,v22.16b
|
| - eor w17,w17,w6
|
| - ushr v1.4s,v24.4s,#25
|
| - eor w19,w19,w7
|
| - ushr v5.4s,v25.4s,#25
|
| - eor w20,w20,w8
|
| - ushr v9.4s,v26.4s,#25
|
| - ror w21,w21,#24
|
| - ushr v13.4s,v27.4s,#25
|
| - ror w17,w17,#24
|
| - ushr v17.4s,v28.4s,#25
|
| - ror w19,w19,#24
|
| - ushr v21.4s,v29.4s,#25
|
| - ror w20,w20,#24
|
| - sli v1.4s,v24.4s,#7
|
| - add w15,w15,w21
|
| - sli v5.4s,v25.4s,#7
|
| - add w16,w16,w17
|
| - sli v9.4s,v26.4s,#7
|
| - add w13,w13,w19
|
| - sli v13.4s,v27.4s,#7
|
| - add w14,w14,w20
|
| - sli v17.4s,v28.4s,#7
|
| - eor w10,w10,w15
|
| - sli v21.4s,v29.4s,#7
|
| - eor w11,w11,w16
|
| - ext v2.16b,v2.16b,v2.16b,#8
|
| - eor w12,w12,w13
|
| - ext v6.16b,v6.16b,v6.16b,#8
|
| - eor w9,w9,w14
|
| - ext v10.16b,v10.16b,v10.16b,#8
|
| - ror w10,w10,#25
|
| - ext v14.16b,v14.16b,v14.16b,#8
|
| - ror w11,w11,#25
|
| - ext v18.16b,v18.16b,v18.16b,#8
|
| - ror w12,w12,#25
|
| - ext v22.16b,v22.16b,v22.16b,#8
|
| - ror w9,w9,#25
|
| - ext v3.16b,v3.16b,v3.16b,#12
|
| - ext v7.16b,v7.16b,v7.16b,#12
|
| - ext v11.16b,v11.16b,v11.16b,#12
|
| - ext v15.16b,v15.16b,v15.16b,#12
|
| - ext v19.16b,v19.16b,v19.16b,#12
|
| - ext v23.16b,v23.16b,v23.16b,#12
|
| - ext v1.16b,v1.16b,v1.16b,#4
|
| - ext v5.16b,v5.16b,v5.16b,#4
|
| - ext v9.16b,v9.16b,v9.16b,#4
|
| - ext v13.16b,v13.16b,v13.16b,#4
|
| - ext v17.16b,v17.16b,v17.16b,#4
|
| - ext v21.16b,v21.16b,v21.16b,#4
|
| - add v0.4s,v0.4s,v1.4s
|
| - add w5,w5,w9
|
| - add v4.4s,v4.4s,v5.4s
|
| - add w6,w6,w10
|
| - add v8.4s,v8.4s,v9.4s
|
| - add w7,w7,w11
|
| - add v12.4s,v12.4s,v13.4s
|
| - add w8,w8,w12
|
| - add v16.4s,v16.4s,v17.4s
|
| - eor w17,w17,w5
|
| - add v20.4s,v20.4s,v21.4s
|
| - eor w19,w19,w6
|
| - eor v3.16b,v3.16b,v0.16b
|
| - eor w20,w20,w7
|
| - eor v7.16b,v7.16b,v4.16b
|
| - eor w21,w21,w8
|
| - eor v11.16b,v11.16b,v8.16b
|
| - ror w17,w17,#16
|
| - eor v15.16b,v15.16b,v12.16b
|
| - ror w19,w19,#16
|
| - eor v19.16b,v19.16b,v16.16b
|
| - ror w20,w20,#16
|
| - eor v23.16b,v23.16b,v20.16b
|
| - ror w21,w21,#16
|
| - rev32 v3.8h,v3.8h
|
| - add w13,w13,w17
|
| - rev32 v7.8h,v7.8h
|
| - add w14,w14,w19
|
| - rev32 v11.8h,v11.8h
|
| - add w15,w15,w20
|
| - rev32 v15.8h,v15.8h
|
| - add w16,w16,w21
|
| - rev32 v19.8h,v19.8h
|
| - eor w9,w9,w13
|
| - rev32 v23.8h,v23.8h
|
| - eor w10,w10,w14
|
| - add v2.4s,v2.4s,v3.4s
|
| - eor w11,w11,w15
|
| - add v6.4s,v6.4s,v7.4s
|
| - eor w12,w12,w16
|
| - add v10.4s,v10.4s,v11.4s
|
| - ror w9,w9,#20
|
| - add v14.4s,v14.4s,v15.4s
|
| - ror w10,w10,#20
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w11,w11,#20
|
| - add v22.4s,v22.4s,v23.4s
|
| - ror w12,w12,#20
|
| - eor v24.16b,v1.16b,v2.16b
|
| - add w5,w5,w9
|
| - eor v25.16b,v5.16b,v6.16b
|
| - add w6,w6,w10
|
| - eor v26.16b,v9.16b,v10.16b
|
| - add w7,w7,w11
|
| - eor v27.16b,v13.16b,v14.16b
|
| - add w8,w8,w12
|
| - eor v28.16b,v17.16b,v18.16b
|
| - eor w17,w17,w5
|
| - eor v29.16b,v21.16b,v22.16b
|
| - eor w19,w19,w6
|
| - ushr v1.4s,v24.4s,#20
|
| - eor w20,w20,w7
|
| - ushr v5.4s,v25.4s,#20
|
| - eor w21,w21,w8
|
| - ushr v9.4s,v26.4s,#20
|
| - ror w17,w17,#24
|
| - ushr v13.4s,v27.4s,#20
|
| - ror w19,w19,#24
|
| - ushr v17.4s,v28.4s,#20
|
| - ror w20,w20,#24
|
| - ushr v21.4s,v29.4s,#20
|
| - ror w21,w21,#24
|
| - sli v1.4s,v24.4s,#12
|
| - add w13,w13,w17
|
| - sli v5.4s,v25.4s,#12
|
| - add w14,w14,w19
|
| - sli v9.4s,v26.4s,#12
|
| - add w15,w15,w20
|
| - sli v13.4s,v27.4s,#12
|
| - add w16,w16,w21
|
| - sli v17.4s,v28.4s,#12
|
| - eor w9,w9,w13
|
| - sli v21.4s,v29.4s,#12
|
| - eor w10,w10,w14
|
| - add v0.4s,v0.4s,v1.4s
|
| - eor w11,w11,w15
|
| - add v4.4s,v4.4s,v5.4s
|
| - eor w12,w12,w16
|
| - add v8.4s,v8.4s,v9.4s
|
| - ror w9,w9,#25
|
| - add v12.4s,v12.4s,v13.4s
|
| - ror w10,w10,#25
|
| - add v16.4s,v16.4s,v17.4s
|
| - ror w11,w11,#25
|
| - add v20.4s,v20.4s,v21.4s
|
| - ror w12,w12,#25
|
| - eor v24.16b,v3.16b,v0.16b
|
| - add w5,w5,w10
|
| - eor v25.16b,v7.16b,v4.16b
|
| - add w6,w6,w11
|
| - eor v26.16b,v11.16b,v8.16b
|
| - add w7,w7,w12
|
| - eor v27.16b,v15.16b,v12.16b
|
| - add w8,w8,w9
|
| - eor v28.16b,v19.16b,v16.16b
|
| - eor w21,w21,w5
|
| - eor v29.16b,v23.16b,v20.16b
|
| - eor w17,w17,w6
|
| - ushr v3.4s,v24.4s,#24
|
| - eor w19,w19,w7
|
| - ushr v7.4s,v25.4s,#24
|
| - eor w20,w20,w8
|
| - ushr v11.4s,v26.4s,#24
|
| - ror w21,w21,#16
|
| - ushr v15.4s,v27.4s,#24
|
| - ror w17,w17,#16
|
| - ushr v19.4s,v28.4s,#24
|
| - ror w19,w19,#16
|
| - ushr v23.4s,v29.4s,#24
|
| - ror w20,w20,#16
|
| - sli v3.4s,v24.4s,#8
|
| - add w15,w15,w21
|
| - sli v7.4s,v25.4s,#8
|
| - add w16,w16,w17
|
| - sli v11.4s,v26.4s,#8
|
| - add w13,w13,w19
|
| - sli v15.4s,v27.4s,#8
|
| - add w14,w14,w20
|
| - sli v19.4s,v28.4s,#8
|
| - eor w10,w10,w15
|
| - sli v23.4s,v29.4s,#8
|
| - eor w11,w11,w16
|
| - add v2.4s,v2.4s,v3.4s
|
| - eor w12,w12,w13
|
| - add v6.4s,v6.4s,v7.4s
|
| - eor w9,w9,w14
|
| - add v10.4s,v10.4s,v11.4s
|
| - ror w10,w10,#20
|
| - add v14.4s,v14.4s,v15.4s
|
| - ror w11,w11,#20
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w12,w12,#20
|
| - add v22.4s,v22.4s,v23.4s
|
| - ror w9,w9,#20
|
| - eor v24.16b,v1.16b,v2.16b
|
| - add w5,w5,w10
|
| - eor v25.16b,v5.16b,v6.16b
|
| - add w6,w6,w11
|
| - eor v26.16b,v9.16b,v10.16b
|
| - add w7,w7,w12
|
| - eor v27.16b,v13.16b,v14.16b
|
| - add w8,w8,w9
|
| - eor v28.16b,v17.16b,v18.16b
|
| - eor w21,w21,w5
|
| - eor v29.16b,v21.16b,v22.16b
|
| - eor w17,w17,w6
|
| - ushr v1.4s,v24.4s,#25
|
| - eor w19,w19,w7
|
| - ushr v5.4s,v25.4s,#25
|
| - eor w20,w20,w8
|
| - ushr v9.4s,v26.4s,#25
|
| - ror w21,w21,#24
|
| - ushr v13.4s,v27.4s,#25
|
| - ror w17,w17,#24
|
| - ushr v17.4s,v28.4s,#25
|
| - ror w19,w19,#24
|
| - ushr v21.4s,v29.4s,#25
|
| - ror w20,w20,#24
|
| - sli v1.4s,v24.4s,#7
|
| - add w15,w15,w21
|
| - sli v5.4s,v25.4s,#7
|
| - add w16,w16,w17
|
| - sli v9.4s,v26.4s,#7
|
| - add w13,w13,w19
|
| - sli v13.4s,v27.4s,#7
|
| - add w14,w14,w20
|
| - sli v17.4s,v28.4s,#7
|
| - eor w10,w10,w15
|
| - sli v21.4s,v29.4s,#7
|
| - eor w11,w11,w16
|
| - ext v2.16b,v2.16b,v2.16b,#8
|
| - eor w12,w12,w13
|
| - ext v6.16b,v6.16b,v6.16b,#8
|
| - eor w9,w9,w14
|
| - ext v10.16b,v10.16b,v10.16b,#8
|
| - ror w10,w10,#25
|
| - ext v14.16b,v14.16b,v14.16b,#8
|
| - ror w11,w11,#25
|
| - ext v18.16b,v18.16b,v18.16b,#8
|
| - ror w12,w12,#25
|
| - ext v22.16b,v22.16b,v22.16b,#8
|
| - ror w9,w9,#25
|
| - ext v3.16b,v3.16b,v3.16b,#4
|
| - ext v7.16b,v7.16b,v7.16b,#4
|
| - ext v11.16b,v11.16b,v11.16b,#4
|
| - ext v15.16b,v15.16b,v15.16b,#4
|
| - ext v19.16b,v19.16b,v19.16b,#4
|
| - ext v23.16b,v23.16b,v23.16b,#4
|
| - ext v1.16b,v1.16b,v1.16b,#12
|
| - ext v5.16b,v5.16b,v5.16b,#12
|
| - ext v9.16b,v9.16b,v9.16b,#12
|
| - ext v13.16b,v13.16b,v13.16b,#12
|
| - ext v17.16b,v17.16b,v17.16b,#12
|
| - ext v21.16b,v21.16b,v21.16b,#12
|
| - cbnz x4,.Loop_upper_neon
|
| -
|
| - add w5,w5,w22 // accumulate key block
|
| - add x6,x6,x22,lsr#32
|
| - add w7,w7,w23
|
| - add x8,x8,x23,lsr#32
|
| - add w9,w9,w24
|
| - add x10,x10,x24,lsr#32
|
| - add w11,w11,w25
|
| - add x12,x12,x25,lsr#32
|
| - add w13,w13,w26
|
| - add x14,x14,x26,lsr#32
|
| - add w15,w15,w27
|
| - add x16,x16,x27,lsr#32
|
| - add w17,w17,w28
|
| - add x19,x19,x28,lsr#32
|
| - add w20,w20,w30
|
| - add x21,x21,x30,lsr#32
|
| -
|
| - add x5,x5,x6,lsl#32 // pack
|
| - add x7,x7,x8,lsl#32
|
| - ldp x6,x8,[x1,#0] // load input
|
| - add x9,x9,x10,lsl#32
|
| - add x11,x11,x12,lsl#32
|
| - ldp x10,x12,[x1,#16]
|
| - add x13,x13,x14,lsl#32
|
| - add x15,x15,x16,lsl#32
|
| - ldp x14,x16,[x1,#32]
|
| - add x17,x17,x19,lsl#32
|
| - add x20,x20,x21,lsl#32
|
| - ldp x19,x21,[x1,#48]
|
| - add x1,x1,#64
|
| -#ifdef __ARMEB__
|
| - rev x5,x5
|
| - rev x7,x7
|
| - rev x9,x9
|
| - rev x11,x11
|
| - rev x13,x13
|
| - rev x15,x15
|
| - rev x17,x17
|
| - rev x20,x20
|
| -#endif
|
| - eor x5,x5,x6
|
| - eor x7,x7,x8
|
| - eor x9,x9,x10
|
| - eor x11,x11,x12
|
| - eor x13,x13,x14
|
| - eor x15,x15,x16
|
| - eor x17,x17,x19
|
| - eor x20,x20,x21
|
| -
|
| - stp x5,x7,[x0,#0] // store output
|
| - add x28,x28,#1 // increment counter
|
| - mov w5,w22 // unpack key block
|
| - lsr x6,x22,#32
|
| - stp x9,x11,[x0,#16]
|
| - mov w7,w23
|
| - lsr x8,x23,#32
|
| - stp x13,x15,[x0,#32]
|
| - mov w9,w24
|
| - lsr x10,x24,#32
|
| - stp x17,x20,[x0,#48]
|
| - add x0,x0,#64
|
| - mov w11,w25
|
| - lsr x12,x25,#32
|
| - mov w13,w26
|
| - lsr x14,x26,#32
|
| - mov w15,w27
|
| - lsr x16,x27,#32
|
| - mov w17,w28
|
| - lsr x19,x28,#32
|
| - mov w20,w30
|
| - lsr x21,x30,#32
|
| -
|
| - mov x4,#5
|
| -.Loop_lower_neon:
|
| - sub x4,x4,#1
|
| - add v0.4s,v0.4s,v1.4s
|
| - add w5,w5,w9
|
| - add v4.4s,v4.4s,v5.4s
|
| - add w6,w6,w10
|
| - add v8.4s,v8.4s,v9.4s
|
| - add w7,w7,w11
|
| - add v12.4s,v12.4s,v13.4s
|
| - add w8,w8,w12
|
| - add v16.4s,v16.4s,v17.4s
|
| - eor w17,w17,w5
|
| - add v20.4s,v20.4s,v21.4s
|
| - eor w19,w19,w6
|
| - eor v3.16b,v3.16b,v0.16b
|
| - eor w20,w20,w7
|
| - eor v7.16b,v7.16b,v4.16b
|
| - eor w21,w21,w8
|
| - eor v11.16b,v11.16b,v8.16b
|
| - ror w17,w17,#16
|
| - eor v15.16b,v15.16b,v12.16b
|
| - ror w19,w19,#16
|
| - eor v19.16b,v19.16b,v16.16b
|
| - ror w20,w20,#16
|
| - eor v23.16b,v23.16b,v20.16b
|
| - ror w21,w21,#16
|
| - rev32 v3.8h,v3.8h
|
| - add w13,w13,w17
|
| - rev32 v7.8h,v7.8h
|
| - add w14,w14,w19
|
| - rev32 v11.8h,v11.8h
|
| - add w15,w15,w20
|
| - rev32 v15.8h,v15.8h
|
| - add w16,w16,w21
|
| - rev32 v19.8h,v19.8h
|
| - eor w9,w9,w13
|
| - rev32 v23.8h,v23.8h
|
| - eor w10,w10,w14
|
| - add v2.4s,v2.4s,v3.4s
|
| - eor w11,w11,w15
|
| - add v6.4s,v6.4s,v7.4s
|
| - eor w12,w12,w16
|
| - add v10.4s,v10.4s,v11.4s
|
| - ror w9,w9,#20
|
| - add v14.4s,v14.4s,v15.4s
|
| - ror w10,w10,#20
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w11,w11,#20
|
| - add v22.4s,v22.4s,v23.4s
|
| - ror w12,w12,#20
|
| - eor v24.16b,v1.16b,v2.16b
|
| - add w5,w5,w9
|
| - eor v25.16b,v5.16b,v6.16b
|
| - add w6,w6,w10
|
| - eor v26.16b,v9.16b,v10.16b
|
| - add w7,w7,w11
|
| - eor v27.16b,v13.16b,v14.16b
|
| - add w8,w8,w12
|
| - eor v28.16b,v17.16b,v18.16b
|
| - eor w17,w17,w5
|
| - eor v29.16b,v21.16b,v22.16b
|
| - eor w19,w19,w6
|
| - ushr v1.4s,v24.4s,#20
|
| - eor w20,w20,w7
|
| - ushr v5.4s,v25.4s,#20
|
| - eor w21,w21,w8
|
| - ushr v9.4s,v26.4s,#20
|
| - ror w17,w17,#24
|
| - ushr v13.4s,v27.4s,#20
|
| - ror w19,w19,#24
|
| - ushr v17.4s,v28.4s,#20
|
| - ror w20,w20,#24
|
| - ushr v21.4s,v29.4s,#20
|
| - ror w21,w21,#24
|
| - sli v1.4s,v24.4s,#12
|
| - add w13,w13,w17
|
| - sli v5.4s,v25.4s,#12
|
| - add w14,w14,w19
|
| - sli v9.4s,v26.4s,#12
|
| - add w15,w15,w20
|
| - sli v13.4s,v27.4s,#12
|
| - add w16,w16,w21
|
| - sli v17.4s,v28.4s,#12
|
| - eor w9,w9,w13
|
| - sli v21.4s,v29.4s,#12
|
| - eor w10,w10,w14
|
| - add v0.4s,v0.4s,v1.4s
|
| - eor w11,w11,w15
|
| - add v4.4s,v4.4s,v5.4s
|
| - eor w12,w12,w16
|
| - add v8.4s,v8.4s,v9.4s
|
| - ror w9,w9,#25
|
| - add v12.4s,v12.4s,v13.4s
|
| - ror w10,w10,#25
|
| - add v16.4s,v16.4s,v17.4s
|
| - ror w11,w11,#25
|
| - add v20.4s,v20.4s,v21.4s
|
| - ror w12,w12,#25
|
| - eor v24.16b,v3.16b,v0.16b
|
| - add w5,w5,w10
|
| - eor v25.16b,v7.16b,v4.16b
|
| - add w6,w6,w11
|
| - eor v26.16b,v11.16b,v8.16b
|
| - add w7,w7,w12
|
| - eor v27.16b,v15.16b,v12.16b
|
| - add w8,w8,w9
|
| - eor v28.16b,v19.16b,v16.16b
|
| - eor w21,w21,w5
|
| - eor v29.16b,v23.16b,v20.16b
|
| - eor w17,w17,w6
|
| - ushr v3.4s,v24.4s,#24
|
| - eor w19,w19,w7
|
| - ushr v7.4s,v25.4s,#24
|
| - eor w20,w20,w8
|
| - ushr v11.4s,v26.4s,#24
|
| - ror w21,w21,#16
|
| - ushr v15.4s,v27.4s,#24
|
| - ror w17,w17,#16
|
| - ushr v19.4s,v28.4s,#24
|
| - ror w19,w19,#16
|
| - ushr v23.4s,v29.4s,#24
|
| - ror w20,w20,#16
|
| - sli v3.4s,v24.4s,#8
|
| - add w15,w15,w21
|
| - sli v7.4s,v25.4s,#8
|
| - add w16,w16,w17
|
| - sli v11.4s,v26.4s,#8
|
| - add w13,w13,w19
|
| - sli v15.4s,v27.4s,#8
|
| - add w14,w14,w20
|
| - sli v19.4s,v28.4s,#8
|
| - eor w10,w10,w15
|
| - sli v23.4s,v29.4s,#8
|
| - eor w11,w11,w16
|
| - add v2.4s,v2.4s,v3.4s
|
| - eor w12,w12,w13
|
| - add v6.4s,v6.4s,v7.4s
|
| - eor w9,w9,w14
|
| - add v10.4s,v10.4s,v11.4s
|
| - ror w10,w10,#20
|
| - add v14.4s,v14.4s,v15.4s
|
| - ror w11,w11,#20
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w12,w12,#20
|
| - add v22.4s,v22.4s,v23.4s
|
| - ror w9,w9,#20
|
| - eor v24.16b,v1.16b,v2.16b
|
| - add w5,w5,w10
|
| - eor v25.16b,v5.16b,v6.16b
|
| - add w6,w6,w11
|
| - eor v26.16b,v9.16b,v10.16b
|
| - add w7,w7,w12
|
| - eor v27.16b,v13.16b,v14.16b
|
| - add w8,w8,w9
|
| - eor v28.16b,v17.16b,v18.16b
|
| - eor w21,w21,w5
|
| - eor v29.16b,v21.16b,v22.16b
|
| - eor w17,w17,w6
|
| - ushr v1.4s,v24.4s,#25
|
| - eor w19,w19,w7
|
| - ushr v5.4s,v25.4s,#25
|
| - eor w20,w20,w8
|
| - ushr v9.4s,v26.4s,#25
|
| - ror w21,w21,#24
|
| - ushr v13.4s,v27.4s,#25
|
| - ror w17,w17,#24
|
| - ushr v17.4s,v28.4s,#25
|
| - ror w19,w19,#24
|
| - ushr v21.4s,v29.4s,#25
|
| - ror w20,w20,#24
|
| - sli v1.4s,v24.4s,#7
|
| - add w15,w15,w21
|
| - sli v5.4s,v25.4s,#7
|
| - add w16,w16,w17
|
| - sli v9.4s,v26.4s,#7
|
| - add w13,w13,w19
|
| - sli v13.4s,v27.4s,#7
|
| - add w14,w14,w20
|
| - sli v17.4s,v28.4s,#7
|
| - eor w10,w10,w15
|
| - sli v21.4s,v29.4s,#7
|
| - eor w11,w11,w16
|
| - ext v2.16b,v2.16b,v2.16b,#8
|
| - eor w12,w12,w13
|
| - ext v6.16b,v6.16b,v6.16b,#8
|
| - eor w9,w9,w14
|
| - ext v10.16b,v10.16b,v10.16b,#8
|
| - ror w10,w10,#25
|
| - ext v14.16b,v14.16b,v14.16b,#8
|
| - ror w11,w11,#25
|
| - ext v18.16b,v18.16b,v18.16b,#8
|
| - ror w12,w12,#25
|
| - ext v22.16b,v22.16b,v22.16b,#8
|
| - ror w9,w9,#25
|
| - ext v3.16b,v3.16b,v3.16b,#12
|
| - ext v7.16b,v7.16b,v7.16b,#12
|
| - ext v11.16b,v11.16b,v11.16b,#12
|
| - ext v15.16b,v15.16b,v15.16b,#12
|
| - ext v19.16b,v19.16b,v19.16b,#12
|
| - ext v23.16b,v23.16b,v23.16b,#12
|
| - ext v1.16b,v1.16b,v1.16b,#4
|
| - ext v5.16b,v5.16b,v5.16b,#4
|
| - ext v9.16b,v9.16b,v9.16b,#4
|
| - ext v13.16b,v13.16b,v13.16b,#4
|
| - ext v17.16b,v17.16b,v17.16b,#4
|
| - ext v21.16b,v21.16b,v21.16b,#4
|
| - add v0.4s,v0.4s,v1.4s
|
| - add w5,w5,w9
|
| - add v4.4s,v4.4s,v5.4s
|
| - add w6,w6,w10
|
| - add v8.4s,v8.4s,v9.4s
|
| - add w7,w7,w11
|
| - add v12.4s,v12.4s,v13.4s
|
| - add w8,w8,w12
|
| - add v16.4s,v16.4s,v17.4s
|
| - eor w17,w17,w5
|
| - add v20.4s,v20.4s,v21.4s
|
| - eor w19,w19,w6
|
| - eor v3.16b,v3.16b,v0.16b
|
| - eor w20,w20,w7
|
| - eor v7.16b,v7.16b,v4.16b
|
| - eor w21,w21,w8
|
| - eor v11.16b,v11.16b,v8.16b
|
| - ror w17,w17,#16
|
| - eor v15.16b,v15.16b,v12.16b
|
| - ror w19,w19,#16
|
| - eor v19.16b,v19.16b,v16.16b
|
| - ror w20,w20,#16
|
| - eor v23.16b,v23.16b,v20.16b
|
| - ror w21,w21,#16
|
| - rev32 v3.8h,v3.8h
|
| - add w13,w13,w17
|
| - rev32 v7.8h,v7.8h
|
| - add w14,w14,w19
|
| - rev32 v11.8h,v11.8h
|
| - add w15,w15,w20
|
| - rev32 v15.8h,v15.8h
|
| - add w16,w16,w21
|
| - rev32 v19.8h,v19.8h
|
| - eor w9,w9,w13
|
| - rev32 v23.8h,v23.8h
|
| - eor w10,w10,w14
|
| - add v2.4s,v2.4s,v3.4s
|
| - eor w11,w11,w15
|
| - add v6.4s,v6.4s,v7.4s
|
| - eor w12,w12,w16
|
| - add v10.4s,v10.4s,v11.4s
|
| - ror w9,w9,#20
|
| - add v14.4s,v14.4s,v15.4s
|
| - ror w10,w10,#20
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w11,w11,#20
|
| - add v22.4s,v22.4s,v23.4s
|
| - ror w12,w12,#20
|
| - eor v24.16b,v1.16b,v2.16b
|
| - add w5,w5,w9
|
| - eor v25.16b,v5.16b,v6.16b
|
| - add w6,w6,w10
|
| - eor v26.16b,v9.16b,v10.16b
|
| - add w7,w7,w11
|
| - eor v27.16b,v13.16b,v14.16b
|
| - add w8,w8,w12
|
| - eor v28.16b,v17.16b,v18.16b
|
| - eor w17,w17,w5
|
| - eor v29.16b,v21.16b,v22.16b
|
| - eor w19,w19,w6
|
| - ushr v1.4s,v24.4s,#20
|
| - eor w20,w20,w7
|
| - ushr v5.4s,v25.4s,#20
|
| - eor w21,w21,w8
|
| - ushr v9.4s,v26.4s,#20
|
| - ror w17,w17,#24
|
| - ushr v13.4s,v27.4s,#20
|
| - ror w19,w19,#24
|
| - ushr v17.4s,v28.4s,#20
|
| - ror w20,w20,#24
|
| - ushr v21.4s,v29.4s,#20
|
| - ror w21,w21,#24
|
| - sli v1.4s,v24.4s,#12
|
| - add w13,w13,w17
|
| - sli v5.4s,v25.4s,#12
|
| - add w14,w14,w19
|
| - sli v9.4s,v26.4s,#12
|
| - add w15,w15,w20
|
| - sli v13.4s,v27.4s,#12
|
| - add w16,w16,w21
|
| - sli v17.4s,v28.4s,#12
|
| - eor w9,w9,w13
|
| - sli v21.4s,v29.4s,#12
|
| - eor w10,w10,w14
|
| - add v0.4s,v0.4s,v1.4s
|
| - eor w11,w11,w15
|
| - add v4.4s,v4.4s,v5.4s
|
| - eor w12,w12,w16
|
| - add v8.4s,v8.4s,v9.4s
|
| - ror w9,w9,#25
|
| - add v12.4s,v12.4s,v13.4s
|
| - ror w10,w10,#25
|
| - add v16.4s,v16.4s,v17.4s
|
| - ror w11,w11,#25
|
| - add v20.4s,v20.4s,v21.4s
|
| - ror w12,w12,#25
|
| - eor v24.16b,v3.16b,v0.16b
|
| - add w5,w5,w10
|
| - eor v25.16b,v7.16b,v4.16b
|
| - add w6,w6,w11
|
| - eor v26.16b,v11.16b,v8.16b
|
| - add w7,w7,w12
|
| - eor v27.16b,v15.16b,v12.16b
|
| - add w8,w8,w9
|
| - eor v28.16b,v19.16b,v16.16b
|
| - eor w21,w21,w5
|
| - eor v29.16b,v23.16b,v20.16b
|
| - eor w17,w17,w6
|
| - ushr v3.4s,v24.4s,#24
|
| - eor w19,w19,w7
|
| - ushr v7.4s,v25.4s,#24
|
| - eor w20,w20,w8
|
| - ushr v11.4s,v26.4s,#24
|
| - ror w21,w21,#16
|
| - ushr v15.4s,v27.4s,#24
|
| - ror w17,w17,#16
|
| - ushr v19.4s,v28.4s,#24
|
| - ror w19,w19,#16
|
| - ushr v23.4s,v29.4s,#24
|
| - ror w20,w20,#16
|
| - sli v3.4s,v24.4s,#8
|
| - add w15,w15,w21
|
| - sli v7.4s,v25.4s,#8
|
| - add w16,w16,w17
|
| - sli v11.4s,v26.4s,#8
|
| - add w13,w13,w19
|
| - sli v15.4s,v27.4s,#8
|
| - add w14,w14,w20
|
| - sli v19.4s,v28.4s,#8
|
| - eor w10,w10,w15
|
| - sli v23.4s,v29.4s,#8
|
| - eor w11,w11,w16
|
| - add v2.4s,v2.4s,v3.4s
|
| - eor w12,w12,w13
|
| - add v6.4s,v6.4s,v7.4s
|
| - eor w9,w9,w14
|
| - add v10.4s,v10.4s,v11.4s
|
| - ror w10,w10,#20
|
| - add v14.4s,v14.4s,v15.4s
|
| - ror w11,w11,#20
|
| - add v18.4s,v18.4s,v19.4s
|
| - ror w12,w12,#20
|
| - add v22.4s,v22.4s,v23.4s
|
| - ror w9,w9,#20
|
| - eor v24.16b,v1.16b,v2.16b
|
| - add w5,w5,w10
|
| - eor v25.16b,v5.16b,v6.16b
|
| - add w6,w6,w11
|
| - eor v26.16b,v9.16b,v10.16b
|
| - add w7,w7,w12
|
| - eor v27.16b,v13.16b,v14.16b
|
| - add w8,w8,w9
|
| - eor v28.16b,v17.16b,v18.16b
|
| - eor w21,w21,w5
|
| - eor v29.16b,v21.16b,v22.16b
|
| - eor w17,w17,w6
|
| - ushr v1.4s,v24.4s,#25
|
| - eor w19,w19,w7
|
| - ushr v5.4s,v25.4s,#25
|
| - eor w20,w20,w8
|
| - ushr v9.4s,v26.4s,#25
|
| - ror w21,w21,#24
|
| - ushr v13.4s,v27.4s,#25
|
| - ror w17,w17,#24
|
| - ushr v17.4s,v28.4s,#25
|
| - ror w19,w19,#24
|
| - ushr v21.4s,v29.4s,#25
|
| - ror w20,w20,#24
|
| - sli v1.4s,v24.4s,#7
|
| - add w15,w15,w21
|
| - sli v5.4s,v25.4s,#7
|
| - add w16,w16,w17
|
| - sli v9.4s,v26.4s,#7
|
| - add w13,w13,w19
|
| - sli v13.4s,v27.4s,#7
|
| - add w14,w14,w20
|
| - sli v17.4s,v28.4s,#7
|
| - eor w10,w10,w15
|
| - sli v21.4s,v29.4s,#7
|
| - eor w11,w11,w16
|
| - ext v2.16b,v2.16b,v2.16b,#8
|
| - eor w12,w12,w13
|
| - ext v6.16b,v6.16b,v6.16b,#8
|
| - eor w9,w9,w14
|
| - ext v10.16b,v10.16b,v10.16b,#8
|
| - ror w10,w10,#25
|
| - ext v14.16b,v14.16b,v14.16b,#8
|
| - ror w11,w11,#25
|
| - ext v18.16b,v18.16b,v18.16b,#8
|
| - ror w12,w12,#25
|
| - ext v22.16b,v22.16b,v22.16b,#8
|
| - ror w9,w9,#25
|
| - ext v3.16b,v3.16b,v3.16b,#4
|
| - ext v7.16b,v7.16b,v7.16b,#4
|
| - ext v11.16b,v11.16b,v11.16b,#4
|
| - ext v15.16b,v15.16b,v15.16b,#4
|
| - ext v19.16b,v19.16b,v19.16b,#4
|
| - ext v23.16b,v23.16b,v23.16b,#4
|
| - ext v1.16b,v1.16b,v1.16b,#12
|
| - ext v5.16b,v5.16b,v5.16b,#12
|
| - ext v9.16b,v9.16b,v9.16b,#12
|
| - ext v13.16b,v13.16b,v13.16b,#12
|
| - ext v17.16b,v17.16b,v17.16b,#12
|
| - ext v21.16b,v21.16b,v21.16b,#12
|
| - cbnz x4,.Loop_lower_neon
|
| -
|
| - add w5,w5,w22 // accumulate key block
|
| - ldp q24,q25,[sp,#0]
|
| - add x6,x6,x22,lsr#32
|
| - ldp q26,q27,[sp,#32]
|
| - add w7,w7,w23
|
| - ldp q28,q29,[sp,#64]
|
| - add x8,x8,x23,lsr#32
|
| - add v0.4s,v0.4s,v24.4s
|
| - add w9,w9,w24
|
| - add v4.4s,v4.4s,v24.4s
|
| - add x10,x10,x24,lsr#32
|
| - add v8.4s,v8.4s,v24.4s
|
| - add w11,w11,w25
|
| - add v12.4s,v12.4s,v24.4s
|
| - add x12,x12,x25,lsr#32
|
| - add v16.4s,v16.4s,v24.4s
|
| - add w13,w13,w26
|
| - add v20.4s,v20.4s,v24.4s
|
| - add x14,x14,x26,lsr#32
|
| - add v2.4s,v2.4s,v26.4s
|
| - add w15,w15,w27
|
| - add v6.4s,v6.4s,v26.4s
|
| - add x16,x16,x27,lsr#32
|
| - add v10.4s,v10.4s,v26.4s
|
| - add w17,w17,w28
|
| - add v14.4s,v14.4s,v26.4s
|
| - add x19,x19,x28,lsr#32
|
| - add v18.4s,v18.4s,v26.4s
|
| - add w20,w20,w30
|
| - add v22.4s,v22.4s,v26.4s
|
| - add x21,x21,x30,lsr#32
|
| - add v19.4s,v19.4s,v31.4s // +4
|
| - add x5,x5,x6,lsl#32 // pack
|
| - add v23.4s,v23.4s,v31.4s // +4
|
| - add x7,x7,x8,lsl#32
|
| - add v3.4s,v3.4s,v27.4s
|
| - ldp x6,x8,[x1,#0] // load input
|
| - add v7.4s,v7.4s,v28.4s
|
| - add x9,x9,x10,lsl#32
|
| - add v11.4s,v11.4s,v29.4s
|
| - add x11,x11,x12,lsl#32
|
| - add v15.4s,v15.4s,v30.4s
|
| - ldp x10,x12,[x1,#16]
|
| - add v19.4s,v19.4s,v27.4s
|
| - add x13,x13,x14,lsl#32
|
| - add v23.4s,v23.4s,v28.4s
|
| - add x15,x15,x16,lsl#32
|
| - add v1.4s,v1.4s,v25.4s
|
| - ldp x14,x16,[x1,#32]
|
| - add v5.4s,v5.4s,v25.4s
|
| - add x17,x17,x19,lsl#32
|
| - add v9.4s,v9.4s,v25.4s
|
| - add x20,x20,x21,lsl#32
|
| - add v13.4s,v13.4s,v25.4s
|
| - ldp x19,x21,[x1,#48]
|
| - add v17.4s,v17.4s,v25.4s
|
| - add x1,x1,#64
|
| - add v21.4s,v21.4s,v25.4s
|
| -
|
| -#ifdef __ARMEB__
|
| - rev x5,x5
|
| - rev x7,x7
|
| - rev x9,x9
|
| - rev x11,x11
|
| - rev x13,x13
|
| - rev x15,x15
|
| - rev x17,x17
|
| - rev x20,x20
|
| -#endif
|
| - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
|
| - eor x5,x5,x6
|
| - eor x7,x7,x8
|
| - eor x9,x9,x10
|
| - eor x11,x11,x12
|
| - eor x13,x13,x14
|
| - eor v0.16b,v0.16b,v24.16b
|
| - eor x15,x15,x16
|
| - eor v1.16b,v1.16b,v25.16b
|
| - eor x17,x17,x19
|
| - eor v2.16b,v2.16b,v26.16b
|
| - eor x20,x20,x21
|
| - eor v3.16b,v3.16b,v27.16b
|
| - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
|
| -
|
| - stp x5,x7,[x0,#0] // store output
|
| - add x28,x28,#7 // increment counter
|
| - stp x9,x11,[x0,#16]
|
| - stp x13,x15,[x0,#32]
|
| - stp x17,x20,[x0,#48]
|
| - add x0,x0,#64
|
| - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
|
| -
|
| - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
|
| - eor v4.16b,v4.16b,v24.16b
|
| - eor v5.16b,v5.16b,v25.16b
|
| - eor v6.16b,v6.16b,v26.16b
|
| - eor v7.16b,v7.16b,v27.16b
|
| - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
|
| -
|
| - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
|
| - eor v8.16b,v8.16b,v0.16b
|
| - ldp q24,q25,[sp,#0]
|
| - eor v9.16b,v9.16b,v1.16b
|
| - ldp q26,q27,[sp,#32]
|
| - eor v10.16b,v10.16b,v2.16b
|
| - eor v11.16b,v11.16b,v3.16b
|
| - st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
|
| -
|
| - ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
|
| - eor v12.16b,v12.16b,v4.16b
|
| - eor v13.16b,v13.16b,v5.16b
|
| - eor v14.16b,v14.16b,v6.16b
|
| - eor v15.16b,v15.16b,v7.16b
|
| - st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
|
| -
|
| - ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
|
| - eor v16.16b,v16.16b,v8.16b
|
| - eor v17.16b,v17.16b,v9.16b
|
| - eor v18.16b,v18.16b,v10.16b
|
| - eor v19.16b,v19.16b,v11.16b
|
| - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
|
| -
|
| - shl v0.4s,v31.4s,#1 // 4 -> 8
|
| - eor v20.16b,v20.16b,v12.16b
|
| - eor v21.16b,v21.16b,v13.16b
|
| - eor v22.16b,v22.16b,v14.16b
|
| - eor v23.16b,v23.16b,v15.16b
|
| - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
|
| -
|
| - add v27.4s,v27.4s,v0.4s // += 8
|
| - add v28.4s,v28.4s,v0.4s
|
| - add v29.4s,v29.4s,v0.4s
|
| - add v30.4s,v30.4s,v0.4s
|
| -
|
| - b.hs .Loop_outer_512_neon
|
| -
|
| - adds x2,x2,#512
|
| - ushr v0.4s,v31.4s,#2 // 4 -> 1
|
| -
|
| - ldp d8,d9,[sp,#128+0] // meet ABI requirements
|
| - ldp d10,d11,[sp,#128+16]
|
| - ldp d12,d13,[sp,#128+32]
|
| - ldp d14,d15,[sp,#128+48]
|
| -
|
| - stp q24,q31,[sp,#0] // wipe off-load area
|
| - stp q24,q31,[sp,#32]
|
| - stp q24,q31,[sp,#64]
|
| -
|
| - b.eq .Ldone_512_neon
|
| -
|
| - cmp x2,#192
|
| - sub v27.4s,v27.4s,v0.4s // -= 1
|
| - sub v28.4s,v28.4s,v0.4s
|
| - sub v29.4s,v29.4s,v0.4s
|
| - add sp,sp,#128
|
| - b.hs .Loop_outer_neon
|
| -
|
| - eor v25.16b,v25.16b,v25.16b
|
| - eor v26.16b,v26.16b,v26.16b
|
| - eor v27.16b,v27.16b,v27.16b
|
| - eor v28.16b,v28.16b,v28.16b
|
| - eor v29.16b,v29.16b,v29.16b
|
| - eor v30.16b,v30.16b,v30.16b
|
| - b .Loop_outer
|
| -
|
| -.Ldone_512_neon:
|
| - ldp x19,x20,[x29,#16]
|
| - add sp,sp,#128+64
|
| - ldp x21,x22,[x29,#32]
|
| - ldp x23,x24,[x29,#48]
|
| - ldp x25,x26,[x29,#64]
|
| - ldp x27,x28,[x29,#80]
|
| - ldp x29,x30,[sp],#96
|
| - ret
|
| -.size ChaCha20_512_neon,.-ChaCha20_512_neon
|
| -#endif
|
|
|