OLD | NEW |
(Empty) | |
| 1 #if defined(__aarch64__) |
| 2 #include <openssl/arm_arch.h> |
| 3 |
| 4 .text |
| 5 |
| 6 |
| 7 |
// Read-only constants used by the ChaCha20 routines in this file.
| 8 .align 5 |
// .Lsigma: the 16 ASCII bytes "expand 32-byte k", packed as two 64-bit values
// with the first character in the least-significant byte.
| 9 .Lsigma: |
| 10 .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral |
// .Lone: 32-bit lanes {1,0,0,0}. It sits directly after the 16 bytes of .Lsigma;
// the NEON code reaches it by post-incrementing its .Lsigma pointer by 16 and
// uses it to step the first lane of the counter row.
| 11 .Lone: |
| 12 .long 1,0,0,0 |
// .LOPENSSL_armcap_P: offset from this location to OPENSSL_armcap_P
// (32-bit under __ILP32__, 64-bit otherwise). ChaCha20_ctr32 adds the address of
// this slot to the stored offset to read the capability word.
| 13 .LOPENSSL_armcap_P: |
| 14 #ifdef __ILP32__ |
| 15 .long OPENSSL_armcap_P-. |
| 16 #else |
| 17 .quad OPENSSL_armcap_P-. |
| 18 #endif |
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by" followed by an e-mail address in angle brackets.
// NOTE(review): in this listing the .byte directive is hard-wrapped over three lines
// (the value 115 is split as "11" / "5"); it has to be one logical line to assemble.
| 19 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,
89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,11
5,115,108,46,111,114,103,62,0 |
| 20 .align 2 |
| 21 |
// ChaCha20_ctr32: XOR the input with ChaCha20 keystream, one 64-byte block at a
// time, using general-purpose registers only.
//
// Registers on entry, as consumed by the code below:
//   x0 = output pointer      x1 = input pointer      x2 = length in bytes
//   x3 = pointer to 32 bytes of key
//   x4 = pointer to 16 bytes of counter block
// NOTE(review): presumably called from C with (out, in, len, key, counter) in that
// order - confirm against the declaring header.
//
// Behaviour:
//   - x2 == 0: returns immediately, no frame is built.
//   - x2 >= 192 and ARMV7_NEON set in OPENSSL_armcap_P: branches to ChaCha20_neon.
//   - otherwise: scalar loop below.
// Frame: 96 bytes (x29/x30 and x19-x28) plus a 64-byte scratch area at sp that is
// only written by the partial-block tail and is zeroed again before returning.
| 22 .globl ChaCha20_ctr32 |
| 23 .hidden ChaCha20_ctr32 |
| 24 .type ChaCha20_ctr32,%function |
| 25 .align 5 |
| 26 ChaCha20_ctr32: |
// Zero length: nothing to do.
| 27 cbz x2,.Labort |
// Inputs shorter than 192 bytes always take the scalar path; longer ones go to
// the NEON routine when the capability word has ARMV7_NEON set.
| 28 adr x5,.LOPENSSL_armcap_P |
| 29 cmp x2,#192 |
| 30 b.lo .Lshort |
| 31 #ifdef __ILP32__ |
| 32 ldrsw x6,[x5] |
| 33 #else |
| 34 ldr x6,[x5] |
| 35 #endif |
// x5 = address of the offset slot, x6 = stored offset; x5+x6 = &OPENSSL_armcap_P.
| 36 ldr w17,[x6,x5] |
| 37 tst w17,#ARMV7_NEON |
// Plain branch, not a call: ChaCha20_neon builds its own frame and its ret goes
// straight back to our caller (x30 is still the caller's return address here).
| 38 b.ne ChaCha20_neon |
| 39 |
// Scalar path: push the frame record, save callee-saved x19-x28 and reserve the
// 64-byte scratch area below the frame.
| 40 .Lshort: |
| 41 stp x29,x30,[sp,#-96]! |
| 42 add x29,sp,#0 |
| 43 |
| 44 adr x5,.Lsigma |
| 45 stp x19,x20,[sp,#16] |
| 46 stp x21,x22,[sp,#32] |
| 47 stp x23,x24,[sp,#48] |
| 48 stp x25,x26,[sp,#64] |
| 49 stp x27,x28,[sp,#80] |
| 50 sub sp,sp,#64 |
| 51 |
// Keep the 16-word input state packed two words per register in x22-x28 and x30.
// x30 (the link register) is used as ordinary data from here on; the saved copy
// is reloaded from the frame in the epilogue.
| 52 ldp x22,x23,[x5] // load sigma |
| 53 ldp x24,x25,[x3] // load key |
| 54 ldp x26,x27,[x3,#16] |
| 55 ldp x28,x30,[x4] // load counter |
// Big-endian builds: swap the two 32-bit halves of every key/counter register
// (the sigma constants are not touched).
| 56 #ifdef __ARMEB__ |
| 57 ror x24,x24,#32 |
| 58 ror x25,x25,#32 |
| 59 ror x26,x26,#32 |
| 60 ror x27,x27,#32 |
| 61 ror x28,x28,#32 |
| 62 ror x30,x30,#32 |
| 63 #endif |
| 64 |
// One pass per 64-byte block. Split each packed register into two 32-bit working
// words: w5-w8 = constants, w9-w16 = key, w17/w19/w20/w21 = counter block
// (x18 is not used).
| 65 .Loop_outer: |
| 66 mov w5,w22 // unpack key block |
| 67 lsr x6,x22,#32 |
| 68 mov w7,w23 |
| 69 lsr x8,x23,#32 |
| 70 mov w9,w24 |
| 71 lsr x10,x24,#32 |
| 72 mov w11,w25 |
| 73 lsr x12,x25,#32 |
| 74 mov w13,w26 |
| 75 lsr x14,x26,#32 |
| 76 mov w15,w27 |
| 77 lsr x16,x27,#32 |
| 78 mov w17,w28 |
| 79 lsr x19,x28,#32 |
| 80 mov w20,w30 |
| 81 lsr x21,x30,#32 |
| 82 |
// x4 = 10 loop iterations, each a column round followed by a diagonal round.
// The flags produced by this subs are consumed much later by "b.lo .Ltail" and
// "b.hi .Loop_outer"; no instruction in between writes the condition flags.
| 83 mov x4,#10 |
| 84 subs x2,x2,#64 |
| 85 .Loop: |
| 86 sub x4,x4,#1 |
// Column round on (a,b,c,d) = (w5,w9,w13,w17) (w6,w10,w14,w19)
// (w7,w11,w15,w20) (w8,w12,w16,w21), four quarter-rounds in lock step.
// a += b; d ^= a; d = rotl(d,16)   (ror #16)
| 87 add w5,w5,w9 |
| 88 add w6,w6,w10 |
| 89 add w7,w7,w11 |
| 90 add w8,w8,w12 |
| 91 eor w17,w17,w5 |
| 92 eor w19,w19,w6 |
| 93 eor w20,w20,w7 |
| 94 eor w21,w21,w8 |
| 95 ror w17,w17,#16 |
| 96 ror w19,w19,#16 |
| 97 ror w20,w20,#16 |
| 98 ror w21,w21,#16 |
// c += d; b ^= c; b = rotl(b,12)   (ror #20)
| 99 add w13,w13,w17 |
| 100 add w14,w14,w19 |
| 101 add w15,w15,w20 |
| 102 add w16,w16,w21 |
| 103 eor w9,w9,w13 |
| 104 eor w10,w10,w14 |
| 105 eor w11,w11,w15 |
| 106 eor w12,w12,w16 |
| 107 ror w9,w9,#20 |
| 108 ror w10,w10,#20 |
| 109 ror w11,w11,#20 |
| 110 ror w12,w12,#20 |
// a += b; d ^= a; d = rotl(d,8)    (ror #24)
| 111 add w5,w5,w9 |
| 112 add w6,w6,w10 |
| 113 add w7,w7,w11 |
| 114 add w8,w8,w12 |
| 115 eor w17,w17,w5 |
| 116 eor w19,w19,w6 |
| 117 eor w20,w20,w7 |
| 118 eor w21,w21,w8 |
| 119 ror w17,w17,#24 |
| 120 ror w19,w19,#24 |
| 121 ror w20,w20,#24 |
| 122 ror w21,w21,#24 |
// c += d; b ^= c; b = rotl(b,7)    (ror #25)
| 123 add w13,w13,w17 |
| 124 add w14,w14,w19 |
| 125 add w15,w15,w20 |
| 126 add w16,w16,w21 |
| 127 eor w9,w9,w13 |
| 128 eor w10,w10,w14 |
| 129 eor w11,w11,w15 |
| 130 eor w12,w12,w16 |
| 131 ror w9,w9,#25 |
| 132 ror w10,w10,#25 |
| 133 ror w11,w11,#25 |
| 134 ror w12,w12,#25 |
// Diagonal round: the same four steps on (w5,w10,w15,w21) (w6,w11,w16,w17)
// (w7,w12,w13,w19) (w8,w9,w14,w20). The diagonal is selected purely by which
// registers are named; no data is moved.
| 135 add w5,w5,w10 |
| 136 add w6,w6,w11 |
| 137 add w7,w7,w12 |
| 138 add w8,w8,w9 |
| 139 eor w21,w21,w5 |
| 140 eor w17,w17,w6 |
| 141 eor w19,w19,w7 |
| 142 eor w20,w20,w8 |
| 143 ror w21,w21,#16 |
| 144 ror w17,w17,#16 |
| 145 ror w19,w19,#16 |
| 146 ror w20,w20,#16 |
| 147 add w15,w15,w21 |
| 148 add w16,w16,w17 |
| 149 add w13,w13,w19 |
| 150 add w14,w14,w20 |
| 151 eor w10,w10,w15 |
| 152 eor w11,w11,w16 |
| 153 eor w12,w12,w13 |
| 154 eor w9,w9,w14 |
| 155 ror w10,w10,#20 |
| 156 ror w11,w11,#20 |
| 157 ror w12,w12,#20 |
| 158 ror w9,w9,#20 |
| 159 add w5,w5,w10 |
| 160 add w6,w6,w11 |
| 161 add w7,w7,w12 |
| 162 add w8,w8,w9 |
| 163 eor w21,w21,w5 |
| 164 eor w17,w17,w6 |
| 165 eor w19,w19,w7 |
| 166 eor w20,w20,w8 |
| 167 ror w21,w21,#24 |
| 168 ror w17,w17,#24 |
| 169 ror w19,w19,#24 |
| 170 ror w20,w20,#24 |
| 171 add w15,w15,w21 |
| 172 add w16,w16,w17 |
| 173 add w13,w13,w19 |
| 174 add w14,w14,w20 |
| 175 eor w10,w10,w15 |
| 176 eor w11,w11,w16 |
| 177 eor w12,w12,w13 |
| 178 eor w9,w9,w14 |
| 179 ror w10,w10,#25 |
| 180 ror w11,w11,#25 |
| 181 ror w12,w12,#25 |
| 182 ror w9,w9,#25 |
| 183 cbnz x4,.Loop |
| 184 |
// Add the original input words back in. Words held in the upper halves of
// x22-x30 are added with a 64-bit add and a shifted operand; a carry into
// bit 32 is harmless because it is shifted out when the pair is packed (lsl#32).
| 185 add w5,w5,w22 // accumulate key block |
| 186 add x6,x6,x22,lsr#32 |
| 187 add w7,w7,w23 |
| 188 add x8,x8,x23,lsr#32 |
| 189 add w9,w9,w24 |
| 190 add x10,x10,x24,lsr#32 |
| 191 add w11,w11,w25 |
| 192 add x12,x12,x25,lsr#32 |
| 193 add w13,w13,w26 |
| 194 add x14,x14,x26,lsr#32 |
| 195 add w15,w15,w27 |
| 196 add x16,x16,x27,lsr#32 |
| 197 add w17,w17,w28 |
| 198 add x19,x19,x28,lsr#32 |
| 199 add w20,w20,w30 |
| 200 add x21,x21,x30,lsr#32 |
| 201 |
// Flags from "subs x2,x2,#64": fewer than 64 bytes were left for this block,
// so finish byte by byte.
| 202 b.lo .Ltail |
| 203 |
// Full block: re-pack word pairs into 64-bit registers while loading 64 input
// bytes into the even-numbered temporaries that packing has just freed.
| 204 add x5,x5,x6,lsl#32 // pack |
| 205 add x7,x7,x8,lsl#32 |
| 206 ldp x6,x8,[x1,#0] // load input |
| 207 add x9,x9,x10,lsl#32 |
| 208 add x11,x11,x12,lsl#32 |
| 209 ldp x10,x12,[x1,#16] |
| 210 add x13,x13,x14,lsl#32 |
| 211 add x15,x15,x16,lsl#32 |
| 212 ldp x14,x16,[x1,#32] |
| 213 add x17,x17,x19,lsl#32 |
| 214 add x20,x20,x21,lsl#32 |
| 215 ldp x19,x21,[x1,#48] |
| 216 add x1,x1,#64 |
// Big-endian builds: byte-reverse each packed keystream register before the XOR.
| 217 #ifdef __ARMEB__ |
| 218 rev x5,x5 |
| 219 rev x7,x7 |
| 220 rev x9,x9 |
| 221 rev x11,x11 |
| 222 rev x13,x13 |
| 223 rev x15,x15 |
| 224 rev x17,x17 |
| 225 rev x20,x20 |
| 226 #endif |
| 227 eor x5,x5,x6 |
| 228 eor x7,x7,x8 |
| 229 eor x9,x9,x10 |
| 230 eor x11,x11,x12 |
| 231 eor x13,x13,x14 |
| 232 eor x15,x15,x16 |
| 233 eor x17,x17,x19 |
| 234 eor x20,x20,x21 |
| 235 |
// The counter increment below is a 64-bit add on the packed pair in x28: a wrap of
// the low 32-bit word would carry into the upper word.
// NOTE(review): callers presumably guarantee the 32-bit counter cannot wrap - confirm.
| 236 stp x5,x7,[x0,#0] // store output |
| 237 add x28,x28,#1 // increment counter |
| 238 stp x9,x11,[x0,#16] |
| 239 stp x13,x15,[x0,#32] |
| 240 stp x17,x20,[x0,#48] |
| 241 add x0,x0,#64 |
| 242 |
// Flags still from "subs x2,x2,#64": loop while input remains.
| 243 b.hi .Loop_outer |
| 244 |
// Finished on a block boundary: restore callee-saved registers via x29, release
// the scratch area, pop the frame (this also reloads the real x30) and return.
| 245 ldp x19,x20,[x29,#16] |
| 246 add sp,sp,#64 |
| 247 ldp x21,x22,[x29,#32] |
| 248 ldp x23,x24,[x29,#48] |
| 249 ldp x25,x26,[x29,#64] |
| 250 ldp x27,x28,[x29,#80] |
| 251 ldp x29,x30,[sp],#96 |
// .Labort is also the target of the zero-length early exit at the top.
| 252 .Labort: |
| 253 ret |
| 254 |
| 255 .align 4 |
// Partial final block. x2 went negative in the subs; adding 64 restores the number
// of bytes still to process (1..63).
| 256 .Ltail: |
| 257 add x2,x2,#64 |
// Also entered from ChaCha20_neon's tail with x2 < 64 and the scalar block's words
// in w5-w21; that routine uses the same 96+64 byte stack layout.
// Set x1 and x4 to the end of the input / scratch keystream and x0 to one byte
// before the end of the output, then negate x2 so it counts up from -len to 0.
// The -1 bias on x0 is needed because the store in the loop indexes with x2 after
// it has been incremented.
| 258 .Less_than_64: |
| 259 sub x0,x0,#1 |
| 260 add x1,x1,x2 |
| 261 add x0,x0,x2 |
| 262 add x4,sp,x2 |
| 263 neg x2,x2 |
| 264 |
| 265 add x5,x5,x6,lsl#32 // pack |
| 266 add x7,x7,x8,lsl#32 |
| 267 add x9,x9,x10,lsl#32 |
| 268 add x11,x11,x12,lsl#32 |
| 269 add x13,x13,x14,lsl#32 |
| 270 add x15,x15,x16,lsl#32 |
| 271 add x17,x17,x19,lsl#32 |
| 272 add x20,x20,x21,lsl#32 |
| 273 #ifdef __ARMEB__ |
| 274 rev x5,x5 |
| 275 rev x7,x7 |
| 276 rev x9,x9 |
| 277 rev x11,x11 |
| 278 rev x13,x13 |
| 279 rev x15,x15 |
| 280 rev x17,x17 |
| 281 rev x20,x20 |
| 282 #endif |
// Spill the whole 64-byte keystream block to the scratch area so it can be read
// one byte at a time.
| 283 stp x5,x7,[sp,#0] |
| 284 stp x9,x11,[sp,#16] |
| 285 stp x13,x15,[sp,#32] |
| 286 stp x17,x20,[sp,#48] |
| 287 |
// out[i] = in[i] ^ keystream[i] for the remaining bytes.
| 288 .Loop_tail: |
| 289 ldrb w10,[x1,x2] |
| 290 ldrb w11,[x4,x2] |
| 291 add x2,x2,#1 |
| 292 eor w10,w10,w11 |
| 293 strb w10,[x0,x2] |
| 294 cbnz x2,.Loop_tail |
| 295 |
// Overwrite the keystream in the stack scratch area with zeros before returning.
| 296 stp xzr,xzr,[sp,#0] |
| 297 stp xzr,xzr,[sp,#16] |
| 298 stp xzr,xzr,[sp,#32] |
| 299 stp xzr,xzr,[sp,#48] |
| 300 |
// Same epilogue as the full-block exit above.
| 301 ldp x19,x20,[x29,#16] |
| 302 add sp,sp,#64 |
| 303 ldp x21,x22,[x29,#32] |
| 304 ldp x23,x24,[x29,#48] |
| 305 ldp x25,x26,[x29,#64] |
| 306 ldp x27,x28,[x29,#80] |
| 307 ldp x29,x30,[sp],#96 |
| 308 ret |
| 309 .size ChaCha20_ctr32,.-ChaCha20_ctr32 |
| 310 |
// ChaCha20_neon: processes 256 bytes per outer iteration - one 64-byte block in
// general-purpose registers interleaved with three 64-byte blocks in NEON
// registers. Not exported; reached by a branch from ChaCha20_ctr32 with the same
// register arguments (x0 = output, x1 = input, x2 = length, x3 = key pointer,
// x4 = counter-block pointer).
//
// Lengths of 512 bytes or more are forwarded, after the common prologue, to
// .L512_or_more_neon inside ChaCha20_512_neon. A remainder shorter than 64 bytes
// is finished by .Less_than_64 inside ChaCha20_ctr32, which relies on the
// identical 96-byte frame + 64-byte scratch layout built here.
//
// NEON registers touched: v0-v7 and v16-v31 only (v8-v15 are not used).
| 311 .type ChaCha20_neon,%function |
| 312 .align 5 |
| 313 ChaCha20_neon: |
// Same frame as the scalar routine: frame record plus callee-saved x19-x28.
| 314 stp x29,x30,[sp,#-96]! |
| 315 add x29,sp,#0 |
| 316 |
| 317 adr x5,.Lsigma |
| 318 stp x19,x20,[sp,#16] |
| 319 stp x21,x22,[sp,#32] |
| 320 stp x23,x24,[sp,#48] |
| 321 stp x25,x26,[sp,#64] |
| 322 stp x27,x28,[sp,#80] |
// 512 bytes or more: continue in the wider routine, which allocates its own
// (larger) scratch area after this shared prologue.
| 323 cmp x2,#512 |
| 324 b.hs .L512_or_more_neon |
| 325 |
| 326 sub sp,sp,#64 |
| 327 |
// Load the input state twice: packed in x22-x28/x30 for the scalar block and as
// rows in v24 (constants), v25/v26 (key) and v27 (counter block) for NEON.
// The post-increment on x5 moves it from .Lsigma to .Lone, so v31 = {1,0,0,0}.
// x30 is used as data; the saved copy is reloaded in the epilogue.
| 328 ldp x22,x23,[x5] // load sigma |
| 329 ld1 {v24.4s},[x5],#16 |
| 330 ldp x24,x25,[x3] // load key |
| 331 ldp x26,x27,[x3,#16] |
| 332 ld1 {v25.4s,v26.4s},[x3] |
| 333 ldp x28,x30,[x4] // load counter |
| 334 ld1 {v27.4s},[x4] |
| 335 ld1 {v31.4s},[x5] |
// Big-endian builds: swap 32-bit words within each 64-bit half of the constants
// row, and swap the halves of the packed key/counter registers.
| 336 #ifdef __ARMEB__ |
| 337 rev64 v24.4s,v24.4s |
| 338 ror x24,x24,#32 |
| 339 ror x25,x25,#32 |
| 340 ror x26,x26,#32 |
| 341 ror x27,x27,#32 |
| 342 ror x28,x28,#32 |
| 343 ror x30,x30,#32 |
| 344 #endif |
// Counter rows for the three NEON blocks: v27 = ctr+1, v28 = ctr+2, v29 = ctr+3.
// The scalar block uses the unmodified counter in x28. v31 then becomes
// {4,0,0,0}, the per-iteration counter stride.
| 345 add v27.4s,v27.4s,v31.4s // += 1 |
| 346 add v28.4s,v27.4s,v31.4s |
| 347 add v29.4s,v28.4s,v31.4s |
| 348 shl v31.4s,v31.4s,#2 // 1 -> 4 |
| 349 |
// Per 256-byte pass: rebuild the working state.
//   scalar block : w5-w8 constants, w9-w16 key, w17/w19/w20/w21 counter block
//   NEON block 1 : v0-v3      NEON block 2 : v4-v7      NEON block 3 : v16-v19
// (one state row per vector register; v20-v23 are temporaries).
| 350 .Loop_outer_neon: |
| 351 mov w5,w22 // unpack key block |
| 352 lsr x6,x22,#32 |
| 353 mov v0.16b,v24.16b |
| 354 mov w7,w23 |
| 355 lsr x8,x23,#32 |
| 356 mov v4.16b,v24.16b |
| 357 mov w9,w24 |
| 358 lsr x10,x24,#32 |
| 359 mov v16.16b,v24.16b |
| 360 mov w11,w25 |
| 361 mov v1.16b,v25.16b |
| 362 lsr x12,x25,#32 |
| 363 mov v5.16b,v25.16b |
| 364 mov w13,w26 |
| 365 mov v17.16b,v25.16b |
| 366 lsr x14,x26,#32 |
| 367 mov v3.16b,v27.16b |
| 368 mov w15,w27 |
| 369 mov v7.16b,v28.16b |
| 370 lsr x16,x27,#32 |
| 371 mov v19.16b,v29.16b |
| 372 mov w17,w28 |
| 373 mov v2.16b,v26.16b |
| 374 lsr x19,x28,#32 |
| 375 mov v6.16b,v26.16b |
| 376 mov w20,w30 |
| 377 mov v18.16b,v26.16b |
| 378 lsr x21,x30,#32 |
| 379 |
// 10 iterations of column round + diagonal round. The flags from this subs are
// consumed after the loop by "b.lo .Ltail_neon" and, on the full path, by
// "b.hi .Loop_outer_neon"; nothing in between writes the condition flags.
| 380 mov x4,#10 |
| 381 subs x2,x2,#256 |
// Scalar and NEON instructions alternate line by line; the two streams do not
// depend on each other. Rotations in the NEON stream:
//   rev32 on .8h lanes            = rotate each 32-bit lane by 16
//   ushr #(32-n) then sli #n      = rotate left by n (n = 12, 8, 7)
// Scalar stream: ror #16/#20/#24/#25 = rotate left by 16/12/8/7.
| 382 .Loop_neon: |
| 383 sub x4,x4,#1 |
// Column round.
| 384 add v0.4s,v0.4s,v1.4s |
| 385 add w5,w5,w9 |
| 386 add v4.4s,v4.4s,v5.4s |
| 387 add w6,w6,w10 |
| 388 add v16.4s,v16.4s,v17.4s |
| 389 add w7,w7,w11 |
| 390 eor v3.16b,v3.16b,v0.16b |
| 391 add w8,w8,w12 |
| 392 eor v7.16b,v7.16b,v4.16b |
| 393 eor w17,w17,w5 |
| 394 eor v19.16b,v19.16b,v16.16b |
| 395 eor w19,w19,w6 |
| 396 rev32 v3.8h,v3.8h |
| 397 eor w20,w20,w7 |
| 398 rev32 v7.8h,v7.8h |
| 399 eor w21,w21,w8 |
| 400 rev32 v19.8h,v19.8h |
| 401 ror w17,w17,#16 |
| 402 add v2.4s,v2.4s,v3.4s |
| 403 ror w19,w19,#16 |
| 404 add v6.4s,v6.4s,v7.4s |
| 405 ror w20,w20,#16 |
| 406 add v18.4s,v18.4s,v19.4s |
| 407 ror w21,w21,#16 |
| 408 eor v20.16b,v1.16b,v2.16b |
| 409 add w13,w13,w17 |
| 410 eor v21.16b,v5.16b,v6.16b |
| 411 add w14,w14,w19 |
| 412 eor v22.16b,v17.16b,v18.16b |
| 413 add w15,w15,w20 |
| 414 ushr v1.4s,v20.4s,#20 |
| 415 add w16,w16,w21 |
| 416 ushr v5.4s,v21.4s,#20 |
| 417 eor w9,w9,w13 |
| 418 ushr v17.4s,v22.4s,#20 |
| 419 eor w10,w10,w14 |
| 420 sli v1.4s,v20.4s,#12 |
| 421 eor w11,w11,w15 |
| 422 sli v5.4s,v21.4s,#12 |
| 423 eor w12,w12,w16 |
| 424 sli v17.4s,v22.4s,#12 |
| 425 ror w9,w9,#20 |
| 426 add v0.4s,v0.4s,v1.4s |
| 427 ror w10,w10,#20 |
| 428 add v4.4s,v4.4s,v5.4s |
| 429 ror w11,w11,#20 |
| 430 add v16.4s,v16.4s,v17.4s |
| 431 ror w12,w12,#20 |
| 432 eor v20.16b,v3.16b,v0.16b |
| 433 add w5,w5,w9 |
| 434 eor v21.16b,v7.16b,v4.16b |
| 435 add w6,w6,w10 |
| 436 eor v22.16b,v19.16b,v16.16b |
| 437 add w7,w7,w11 |
| 438 ushr v3.4s,v20.4s,#24 |
| 439 add w8,w8,w12 |
| 440 ushr v7.4s,v21.4s,#24 |
| 441 eor w17,w17,w5 |
| 442 ushr v19.4s,v22.4s,#24 |
| 443 eor w19,w19,w6 |
| 444 sli v3.4s,v20.4s,#8 |
| 445 eor w20,w20,w7 |
| 446 sli v7.4s,v21.4s,#8 |
| 447 eor w21,w21,w8 |
| 448 sli v19.4s,v22.4s,#8 |
| 449 ror w17,w17,#24 |
| 450 add v2.4s,v2.4s,v3.4s |
| 451 ror w19,w19,#24 |
| 452 add v6.4s,v6.4s,v7.4s |
| 453 ror w20,w20,#24 |
| 454 add v18.4s,v18.4s,v19.4s |
| 455 ror w21,w21,#24 |
| 456 eor v20.16b,v1.16b,v2.16b |
| 457 add w13,w13,w17 |
| 458 eor v21.16b,v5.16b,v6.16b |
| 459 add w14,w14,w19 |
| 460 eor v22.16b,v17.16b,v18.16b |
| 461 add w15,w15,w20 |
| 462 ushr v1.4s,v20.4s,#25 |
| 463 add w16,w16,w21 |
| 464 ushr v5.4s,v21.4s,#25 |
| 465 eor w9,w9,w13 |
| 466 ushr v17.4s,v22.4s,#25 |
| 467 eor w10,w10,w14 |
| 468 sli v1.4s,v20.4s,#7 |
| 469 eor w11,w11,w15 |
| 470 sli v5.4s,v21.4s,#7 |
| 471 eor w12,w12,w16 |
| 472 sli v17.4s,v22.4s,#7 |
| 473 ror w9,w9,#25 |
// NEON only: rotate rows 1, 2 and 3 by one, two and three lanes (ext #4/#8/#12)
// so that the next pass operates on diagonals. The scalar stream gets the same
// effect by naming different registers.
| 474 ext v2.16b,v2.16b,v2.16b,#8 |
| 475 ror w10,w10,#25 |
| 476 ext v6.16b,v6.16b,v6.16b,#8 |
| 477 ror w11,w11,#25 |
| 478 ext v18.16b,v18.16b,v18.16b,#8 |
| 479 ror w12,w12,#25 |
| 480 ext v3.16b,v3.16b,v3.16b,#12 |
| 481 ext v7.16b,v7.16b,v7.16b,#12 |
| 482 ext v19.16b,v19.16b,v19.16b,#12 |
| 483 ext v1.16b,v1.16b,v1.16b,#4 |
| 484 ext v5.16b,v5.16b,v5.16b,#4 |
| 485 ext v17.16b,v17.16b,v17.16b,#4 |
// Diagonal round.
| 486 add v0.4s,v0.4s,v1.4s |
| 487 add w5,w5,w10 |
| 488 add v4.4s,v4.4s,v5.4s |
| 489 add w6,w6,w11 |
| 490 add v16.4s,v16.4s,v17.4s |
| 491 add w7,w7,w12 |
| 492 eor v3.16b,v3.16b,v0.16b |
| 493 add w8,w8,w9 |
| 494 eor v7.16b,v7.16b,v4.16b |
| 495 eor w21,w21,w5 |
| 496 eor v19.16b,v19.16b,v16.16b |
| 497 eor w17,w17,w6 |
| 498 rev32 v3.8h,v3.8h |
| 499 eor w19,w19,w7 |
| 500 rev32 v7.8h,v7.8h |
| 501 eor w20,w20,w8 |
| 502 rev32 v19.8h,v19.8h |
| 503 ror w21,w21,#16 |
| 504 add v2.4s,v2.4s,v3.4s |
| 505 ror w17,w17,#16 |
| 506 add v6.4s,v6.4s,v7.4s |
| 507 ror w19,w19,#16 |
| 508 add v18.4s,v18.4s,v19.4s |
| 509 ror w20,w20,#16 |
| 510 eor v20.16b,v1.16b,v2.16b |
| 511 add w15,w15,w21 |
| 512 eor v21.16b,v5.16b,v6.16b |
| 513 add w16,w16,w17 |
| 514 eor v22.16b,v17.16b,v18.16b |
| 515 add w13,w13,w19 |
| 516 ushr v1.4s,v20.4s,#20 |
| 517 add w14,w14,w20 |
| 518 ushr v5.4s,v21.4s,#20 |
| 519 eor w10,w10,w15 |
| 520 ushr v17.4s,v22.4s,#20 |
| 521 eor w11,w11,w16 |
| 522 sli v1.4s,v20.4s,#12 |
| 523 eor w12,w12,w13 |
| 524 sli v5.4s,v21.4s,#12 |
| 525 eor w9,w9,w14 |
| 526 sli v17.4s,v22.4s,#12 |
| 527 ror w10,w10,#20 |
| 528 add v0.4s,v0.4s,v1.4s |
| 529 ror w11,w11,#20 |
| 530 add v4.4s,v4.4s,v5.4s |
| 531 ror w12,w12,#20 |
| 532 add v16.4s,v16.4s,v17.4s |
| 533 ror w9,w9,#20 |
| 534 eor v20.16b,v3.16b,v0.16b |
| 535 add w5,w5,w10 |
| 536 eor v21.16b,v7.16b,v4.16b |
| 537 add w6,w6,w11 |
| 538 eor v22.16b,v19.16b,v16.16b |
| 539 add w7,w7,w12 |
| 540 ushr v3.4s,v20.4s,#24 |
| 541 add w8,w8,w9 |
| 542 ushr v7.4s,v21.4s,#24 |
| 543 eor w21,w21,w5 |
| 544 ushr v19.4s,v22.4s,#24 |
| 545 eor w17,w17,w6 |
| 546 sli v3.4s,v20.4s,#8 |
| 547 eor w19,w19,w7 |
| 548 sli v7.4s,v21.4s,#8 |
| 549 eor w20,w20,w8 |
| 550 sli v19.4s,v22.4s,#8 |
| 551 ror w21,w21,#24 |
| 552 add v2.4s,v2.4s,v3.4s |
| 553 ror w17,w17,#24 |
| 554 add v6.4s,v6.4s,v7.4s |
| 555 ror w19,w19,#24 |
| 556 add v18.4s,v18.4s,v19.4s |
| 557 ror w20,w20,#24 |
| 558 eor v20.16b,v1.16b,v2.16b |
| 559 add w15,w15,w21 |
| 560 eor v21.16b,v5.16b,v6.16b |
| 561 add w16,w16,w17 |
| 562 eor v22.16b,v17.16b,v18.16b |
| 563 add w13,w13,w19 |
| 564 ushr v1.4s,v20.4s,#25 |
| 565 add w14,w14,w20 |
| 566 ushr v5.4s,v21.4s,#25 |
| 567 eor w10,w10,w15 |
| 568 ushr v17.4s,v22.4s,#25 |
| 569 eor w11,w11,w16 |
| 570 sli v1.4s,v20.4s,#7 |
| 571 eor w12,w12,w13 |
| 572 sli v5.4s,v21.4s,#7 |
| 573 eor w9,w9,w14 |
| 574 sli v17.4s,v22.4s,#7 |
| 575 ror w10,w10,#25 |
// NEON only: undo the lane rotation (rows 1, 2, 3 by ext #12/#8/#4) so the rows
// are back in column order for the next iteration.
| 576 ext v2.16b,v2.16b,v2.16b,#8 |
| 577 ror w11,w11,#25 |
| 578 ext v6.16b,v6.16b,v6.16b,#8 |
| 579 ror w12,w12,#25 |
| 580 ext v18.16b,v18.16b,v18.16b,#8 |
| 581 ror w9,w9,#25 |
| 582 ext v3.16b,v3.16b,v3.16b,#4 |
| 583 ext v7.16b,v7.16b,v7.16b,#4 |
| 584 ext v19.16b,v19.16b,v19.16b,#4 |
| 585 ext v1.16b,v1.16b,v1.16b,#12 |
| 586 ext v5.16b,v5.16b,v5.16b,#12 |
| 587 ext v17.16b,v17.16b,v17.16b,#12 |
| 588 cbnz x4,.Loop_neon |
| 589 |
// Add the input state back in: scalar words from the packed x22-x30, NEON rows
// from v24 (constants), v25/v26 (key) and v27/v28/v29 (per-block counter rows).
// A carry into bit 32 of the 64-bit scalar adds is shifted out when packing.
| 590 add w5,w5,w22 // accumulate key block |
| 591 add v0.4s,v0.4s,v24.4s |
| 592 add x6,x6,x22,lsr#32 |
| 593 add v4.4s,v4.4s,v24.4s |
| 594 add w7,w7,w23 |
| 595 add v16.4s,v16.4s,v24.4s |
| 596 add x8,x8,x23,lsr#32 |
| 597 add v2.4s,v2.4s,v26.4s |
| 598 add w9,w9,w24 |
| 599 add v6.4s,v6.4s,v26.4s |
| 600 add x10,x10,x24,lsr#32 |
| 601 add v18.4s,v18.4s,v26.4s |
| 602 add w11,w11,w25 |
| 603 add v3.4s,v3.4s,v27.4s |
| 604 add x12,x12,x25,lsr#32 |
| 605 add w13,w13,w26 |
| 606 add v7.4s,v7.4s,v28.4s |
| 607 add x14,x14,x26,lsr#32 |
| 608 add w15,w15,w27 |
| 609 add v19.4s,v19.4s,v29.4s |
| 610 add x16,x16,x27,lsr#32 |
| 611 add w17,w17,w28 |
| 612 add v1.4s,v1.4s,v25.4s |
| 613 add x19,x19,x28,lsr#32 |
| 614 add w20,w20,w30 |
| 615 add v5.4s,v5.4s,v25.4s |
| 616 add x21,x21,x30,lsr#32 |
| 617 add v17.4s,v17.4s,v25.4s |
| 618 |
// Flags from "subs x2,x2,#256": fewer than 256 bytes were left for this pass.
| 619 b.lo .Ltail_neon |
| 620 |
// Full 256 bytes. Output order is: scalar block, v0-v3, v4-v7, v16-v19.
// First pack the scalar block while loading its 64 input bytes.
| 621 add x5,x5,x6,lsl#32 // pack |
| 622 add x7,x7,x8,lsl#32 |
| 623 ldp x6,x8,[x1,#0] // load input |
| 624 add x9,x9,x10,lsl#32 |
| 625 add x11,x11,x12,lsl#32 |
| 626 ldp x10,x12,[x1,#16] |
| 627 add x13,x13,x14,lsl#32 |
| 628 add x15,x15,x16,lsl#32 |
| 629 ldp x14,x16,[x1,#32] |
| 630 add x17,x17,x19,lsl#32 |
| 631 add x20,x20,x21,lsl#32 |
| 632 ldp x19,x21,[x1,#48] |
| 633 add x1,x1,#64 |
| 634 #ifdef __ARMEB__ |
| 635 rev x5,x5 |
| 636 rev x7,x7 |
| 637 rev x9,x9 |
| 638 rev x11,x11 |
| 639 rev x13,x13 |
| 640 rev x15,x15 |
| 641 rev x17,x17 |
| 642 rev x20,x20 |
| 643 #endif |
// Input for NEON block 1 goes into the temporaries v20-v23.
| 644 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
| 645 eor x5,x5,x6 |
| 646 eor x7,x7,x8 |
| 647 eor x9,x9,x10 |
| 648 eor x11,x11,x12 |
| 649 eor x13,x13,x14 |
| 650 eor v0.16b,v0.16b,v20.16b |
| 651 eor x15,x15,x16 |
| 652 eor v1.16b,v1.16b,v21.16b |
| 653 eor x17,x17,x19 |
| 654 eor v2.16b,v2.16b,v22.16b |
| 655 eor x20,x20,x21 |
| 656 eor v3.16b,v3.16b,v23.16b |
// v20-v23 have been consumed: reload them with the input for NEON block 2.
| 657 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
| 658 |
// Store the scalar block and advance all four counters by 4 blocks
// (x28 for the scalar block, v27-v29 via v31 = {4,0,0,0}).
| 659 stp x5,x7,[x0,#0] // store output |
| 660 add x28,x28,#4 // increment counter |
| 661 stp x9,x11,[x0,#16] |
| 662 add v27.4s,v27.4s,v31.4s // += 4 |
| 663 stp x13,x15,[x0,#32] |
| 664 add v28.4s,v28.4s,v31.4s |
| 665 stp x17,x20,[x0,#48] |
| 666 add v29.4s,v29.4s,v31.4s |
| 667 add x0,x0,#64 |
| 668 |
// Once block 1 is stored, v0-v3 are free and are reused to hold the input for
// NEON block 3.
| 669 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
| 670 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 |
| 671 |
| 672 eor v4.16b,v4.16b,v20.16b |
| 673 eor v5.16b,v5.16b,v21.16b |
| 674 eor v6.16b,v6.16b,v22.16b |
| 675 eor v7.16b,v7.16b,v23.16b |
| 676 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
| 677 |
| 678 eor v16.16b,v16.16b,v0.16b |
| 679 eor v17.16b,v17.16b,v1.16b |
| 680 eor v18.16b,v18.16b,v2.16b |
| 681 eor v19.16b,v19.16b,v3.16b |
| 682 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 |
| 683 |
// Flags still from "subs x2,x2,#256": loop while input remains.
| 684 b.hi .Loop_outer_neon |
| 685 |
// Finished on a 256-byte boundary: restore registers, release scratch and frame.
| 686 ldp x19,x20,[x29,#16] |
| 687 add sp,sp,#64 |
| 688 ldp x21,x22,[x29,#32] |
| 689 ldp x23,x24,[x29,#48] |
| 690 ldp x25,x26,[x29,#64] |
| 691 ldp x27,x28,[x29,#80] |
| 692 ldp x29,x30,[sp],#96 |
| 693 ret |
| 694 |
// Final pass with 1..255 bytes left (x2 restored by adding 256 back).
// Whole 64-byte blocks are emitted in order - scalar, v0-v3, v4-v7 - and the block
// that covers the last partial chunk is spilled to the scratch area for a
// byte-wise XOR in .Last_neon.
| 695 .Ltail_neon: |
| 696 add x2,x2,#256 |
| 697 cmp x2,#64 |
// Under 64 bytes: only the scalar block is needed; reuse the tail (and epilogue)
// in ChaCha20_ctr32.
| 698 b.lo .Less_than_64 |
| 699 |
| 700 add x5,x5,x6,lsl#32 // pack |
| 701 add x7,x7,x8,lsl#32 |
| 702 ldp x6,x8,[x1,#0] // load input |
| 703 add x9,x9,x10,lsl#32 |
| 704 add x11,x11,x12,lsl#32 |
| 705 ldp x10,x12,[x1,#16] |
| 706 add x13,x13,x14,lsl#32 |
| 707 add x15,x15,x16,lsl#32 |
| 708 ldp x14,x16,[x1,#32] |
| 709 add x17,x17,x19,lsl#32 |
| 710 add x20,x20,x21,lsl#32 |
| 711 ldp x19,x21,[x1,#48] |
| 712 add x1,x1,#64 |
| 713 #ifdef __ARMEB__ |
| 714 rev x5,x5 |
| 715 rev x7,x7 |
| 716 rev x9,x9 |
| 717 rev x11,x11 |
| 718 rev x13,x13 |
| 719 rev x15,x15 |
| 720 rev x17,x17 |
| 721 rev x20,x20 |
| 722 #endif |
| 723 eor x5,x5,x6 |
| 724 eor x7,x7,x8 |
| 725 eor x9,x9,x10 |
| 726 eor x11,x11,x12 |
| 727 eor x13,x13,x14 |
| 728 eor x15,x15,x16 |
| 729 eor x17,x17,x19 |
| 730 eor x20,x20,x21 |
| 731 |
| 732 stp x5,x7,[x0,#0] // store output |
| 733 add x28,x28,#4 // increment counter |
| 734 stp x9,x11,[x0,#16] |
| 735 stp x13,x15,[x0,#32] |
| 736 stp x17,x20,[x0,#48] |
| 737 add x0,x0,#64 |
// Flags still from "cmp x2,#64" above: exactly 64 bytes were left, all done.
| 738 b.eq .Ldone_neon |
| 739 sub x2,x2,#64 |
| 740 cmp x2,#64 |
// 1..63 bytes left: they are covered by NEON block 1 (v0-v3).
| 741 b.lo .Less_than_128 |
| 742 |
| 743 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
| 744 eor v0.16b,v0.16b,v20.16b |
| 745 eor v1.16b,v1.16b,v21.16b |
| 746 eor v2.16b,v2.16b,v22.16b |
| 747 eor v3.16b,v3.16b,v23.16b |
| 748 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
// Flags from the most recent "cmp x2,#64" (the NEON ops do not touch them).
| 749 b.eq .Ldone_neon |
| 750 sub x2,x2,#64 |
| 751 cmp x2,#64 |
// 1..63 bytes left: they are covered by NEON block 2 (v4-v7).
| 752 b.lo .Less_than_192 |
| 753 |
| 754 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
| 755 eor v4.16b,v4.16b,v20.16b |
| 756 eor v5.16b,v5.16b,v21.16b |
| 757 eor v6.16b,v6.16b,v22.16b |
| 758 eor v7.16b,v7.16b,v23.16b |
| 759 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
| 760 b.eq .Ldone_neon |
| 761 sub x2,x2,#64 |
| 762 |
// 1..63 bytes left after three full blocks: spill NEON block 3 (v16-v19).
| 763 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] |
| 764 b .Last_neon |
| 765 |
| 766 .Less_than_128: |
| 767 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] |
| 768 b .Last_neon |
| 769 .Less_than_192: |
| 770 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] |
| 771 b .Last_neon |
| 772 |
| 773 .align 4 |
// Byte-wise tail: x2 = remaining bytes, keystream for them is at [sp].
// x1/x4 are moved to the end of the input/keystream and x0 to one byte before the
// end of the output; x2 is negated and counts up to zero. The store indexes with
// x2 after its increment, which is why x0 carries the -1 bias.
| 774 .Last_neon: |
| 775 sub x0,x0,#1 |
| 776 add x1,x1,x2 |
| 777 add x0,x0,x2 |
| 778 add x4,sp,x2 |
| 779 neg x2,x2 |
| 780 |
| 781 .Loop_tail_neon: |
| 782 ldrb w10,[x1,x2] |
| 783 ldrb w11,[x4,x2] |
| 784 add x2,x2,#1 |
| 785 eor w10,w10,w11 |
| 786 strb w10,[x0,x2] |
| 787 cbnz x2,.Loop_tail_neon |
| 788 |
// Overwrite the spilled keystream in the stack scratch area with zeros.
| 789 stp xzr,xzr,[sp,#0] |
| 790 stp xzr,xzr,[sp,#16] |
| 791 stp xzr,xzr,[sp,#32] |
| 792 stp xzr,xzr,[sp,#48] |
| 793 |
// Common exit for the tail paths (reached directly when the data ended exactly on
// a 64-byte boundary, in which case nothing was spilled to the scratch area).
| 794 .Ldone_neon: |
| 795 ldp x19,x20,[x29,#16] |
| 796 add sp,sp,#64 |
| 797 ldp x21,x22,[x29,#32] |
| 798 ldp x23,x24,[x29,#48] |
| 799 ldp x25,x26,[x29,#64] |
| 800 ldp x27,x28,[x29,#80] |
| 801 ldp x29,x30,[sp],#96 |
| 802 ret |
| 803 .size ChaCha20_neon,.-ChaCha20_neon |
| 804 .type ChaCha20_512_neon,%function |
| 805 .align 5 |
| 806 ChaCha20_512_neon: |
| 807 stp x29,x30,[sp,#-96]! |
| 808 add x29,sp,#0 |
| 809 |
| 810 adr x5,.Lsigma |
| 811 stp x19,x20,[sp,#16] |
| 812 stp x21,x22,[sp,#32] |
| 813 stp x23,x24,[sp,#48] |
| 814 stp x25,x26,[sp,#64] |
| 815 stp x27,x28,[sp,#80] |
| 816 |
| 817 .L512_or_more_neon: |
| 818 sub sp,sp,#128+64 |
| 819 |
| 820 ldp x22,x23,[x5] // load sigma |
| 821 ld1 {v24.4s},[x5],#16 |
| 822 ldp x24,x25,[x3] // load key |
| 823 ldp x26,x27,[x3,#16] |
| 824 ld1 {v25.4s,v26.4s},[x3] |
| 825 ldp x28,x30,[x4] // load counter |
| 826 ld1 {v27.4s},[x4] |
| 827 ld1 {v31.4s},[x5] |
| 828 #ifdef __ARMEB__ |
| 829 rev64 v24.4s,v24.4s |
| 830 ror x24,x24,#32 |
| 831 ror x25,x25,#32 |
| 832 ror x26,x26,#32 |
| 833 ror x27,x27,#32 |
| 834 ror x28,x28,#32 |
| 835 ror x30,x30,#32 |
| 836 #endif |
| 837 add v27.4s,v27.4s,v31.4s // += 1 |
| 838 stp q24,q25,[sp,#0] // off-load key block, invariant part |
| 839 add v27.4s,v27.4s,v31.4s // not typo |
| 840 str q26,[sp,#32] |
| 841 add v28.4s,v27.4s,v31.4s |
| 842 add v29.4s,v28.4s,v31.4s |
| 843 add v30.4s,v29.4s,v31.4s |
| 844 shl v31.4s,v31.4s,#2 // 1 -> 4 |
| 845 |
| 846 stp d8,d9,[sp,#128+0] // meet ABI requirements |
| 847 stp d10,d11,[sp,#128+16] |
| 848 stp d12,d13,[sp,#128+32] |
| 849 stp d14,d15,[sp,#128+48] |
| 850 |
| 851 sub x2,x2,#512 // not typo |
| 852 |
| 853 .Loop_outer_512_neon: |
| 854 mov v0.16b,v24.16b |
| 855 mov v4.16b,v24.16b |
| 856 mov v8.16b,v24.16b |
| 857 mov v12.16b,v24.16b |
| 858 mov v16.16b,v24.16b |
| 859 mov v20.16b,v24.16b |
| 860 mov v1.16b,v25.16b |
| 861 mov w5,w22 // unpack key block |
| 862 mov v5.16b,v25.16b |
| 863 lsr x6,x22,#32 |
| 864 mov v9.16b,v25.16b |
| 865 mov w7,w23 |
| 866 mov v13.16b,v25.16b |
| 867 lsr x8,x23,#32 |
| 868 mov v17.16b,v25.16b |
| 869 mov w9,w24 |
| 870 mov v21.16b,v25.16b |
| 871 lsr x10,x24,#32 |
| 872 mov v3.16b,v27.16b |
| 873 mov w11,w25 |
| 874 mov v7.16b,v28.16b |
| 875 lsr x12,x25,#32 |
| 876 mov v11.16b,v29.16b |
| 877 mov w13,w26 |
| 878 mov v15.16b,v30.16b |
| 879 lsr x14,x26,#32 |
| 880 mov v2.16b,v26.16b |
| 881 mov w15,w27 |
| 882 mov v6.16b,v26.16b |
| 883 lsr x16,x27,#32 |
| 884 add v19.4s,v3.4s,v31.4s // +4 |
| 885 mov w17,w28 |
| 886 add v23.4s,v7.4s,v31.4s // +4 |
| 887 lsr x19,x28,#32 |
| 888 mov v10.16b,v26.16b |
| 889 mov w20,w30 |
| 890 mov v14.16b,v26.16b |
| 891 lsr x21,x30,#32 |
| 892 mov v18.16b,v26.16b |
| 893 stp q27,q28,[sp,#48] // off-load key block, variable
part |
| 894 mov v22.16b,v26.16b |
| 895 str q29,[sp,#80] |
| 896 |
| 897 mov x4,#5 |
| 898 subs x2,x2,#512 |
| 899 .Loop_upper_neon: |
| 900 sub x4,x4,#1 |
| 901 add v0.4s,v0.4s,v1.4s |
| 902 add w5,w5,w9 |
| 903 add v4.4s,v4.4s,v5.4s |
| 904 add w6,w6,w10 |
| 905 add v8.4s,v8.4s,v9.4s |
| 906 add w7,w7,w11 |
| 907 add v12.4s,v12.4s,v13.4s |
| 908 add w8,w8,w12 |
| 909 add v16.4s,v16.4s,v17.4s |
| 910 eor w17,w17,w5 |
| 911 add v20.4s,v20.4s,v21.4s |
| 912 eor w19,w19,w6 |
| 913 eor v3.16b,v3.16b,v0.16b |
| 914 eor w20,w20,w7 |
| 915 eor v7.16b,v7.16b,v4.16b |
| 916 eor w21,w21,w8 |
| 917 eor v11.16b,v11.16b,v8.16b |
| 918 ror w17,w17,#16 |
| 919 eor v15.16b,v15.16b,v12.16b |
| 920 ror w19,w19,#16 |
| 921 eor v19.16b,v19.16b,v16.16b |
| 922 ror w20,w20,#16 |
| 923 eor v23.16b,v23.16b,v20.16b |
| 924 ror w21,w21,#16 |
| 925 rev32 v3.8h,v3.8h |
| 926 add w13,w13,w17 |
| 927 rev32 v7.8h,v7.8h |
| 928 add w14,w14,w19 |
| 929 rev32 v11.8h,v11.8h |
| 930 add w15,w15,w20 |
| 931 rev32 v15.8h,v15.8h |
| 932 add w16,w16,w21 |
| 933 rev32 v19.8h,v19.8h |
| 934 eor w9,w9,w13 |
| 935 rev32 v23.8h,v23.8h |
| 936 eor w10,w10,w14 |
| 937 add v2.4s,v2.4s,v3.4s |
| 938 eor w11,w11,w15 |
| 939 add v6.4s,v6.4s,v7.4s |
| 940 eor w12,w12,w16 |
| 941 add v10.4s,v10.4s,v11.4s |
| 942 ror w9,w9,#20 |
| 943 add v14.4s,v14.4s,v15.4s |
| 944 ror w10,w10,#20 |
| 945 add v18.4s,v18.4s,v19.4s |
| 946 ror w11,w11,#20 |
| 947 add v22.4s,v22.4s,v23.4s |
| 948 ror w12,w12,#20 |
| 949 eor v24.16b,v1.16b,v2.16b |
| 950 add w5,w5,w9 |
| 951 eor v25.16b,v5.16b,v6.16b |
| 952 add w6,w6,w10 |
| 953 eor v26.16b,v9.16b,v10.16b |
| 954 add w7,w7,w11 |
| 955 eor v27.16b,v13.16b,v14.16b |
| 956 add w8,w8,w12 |
| 957 eor v28.16b,v17.16b,v18.16b |
| 958 eor w17,w17,w5 |
| 959 eor v29.16b,v21.16b,v22.16b |
| 960 eor w19,w19,w6 |
| 961 ushr v1.4s,v24.4s,#20 |
| 962 eor w20,w20,w7 |
| 963 ushr v5.4s,v25.4s,#20 |
| 964 eor w21,w21,w8 |
| 965 ushr v9.4s,v26.4s,#20 |
| 966 ror w17,w17,#24 |
| 967 ushr v13.4s,v27.4s,#20 |
| 968 ror w19,w19,#24 |
| 969 ushr v17.4s,v28.4s,#20 |
| 970 ror w20,w20,#24 |
| 971 ushr v21.4s,v29.4s,#20 |
| 972 ror w21,w21,#24 |
| 973 sli v1.4s,v24.4s,#12 |
| 974 add w13,w13,w17 |
| 975 sli v5.4s,v25.4s,#12 |
| 976 add w14,w14,w19 |
| 977 sli v9.4s,v26.4s,#12 |
| 978 add w15,w15,w20 |
| 979 sli v13.4s,v27.4s,#12 |
| 980 add w16,w16,w21 |
| 981 sli v17.4s,v28.4s,#12 |
| 982 eor w9,w9,w13 |
| 983 sli v21.4s,v29.4s,#12 |
| 984 eor w10,w10,w14 |
| 985 add v0.4s,v0.4s,v1.4s |
| 986 eor w11,w11,w15 |
| 987 add v4.4s,v4.4s,v5.4s |
| 988 eor w12,w12,w16 |
| 989 add v8.4s,v8.4s,v9.4s |
| 990 ror w9,w9,#25 |
| 991 add v12.4s,v12.4s,v13.4s |
| 992 ror w10,w10,#25 |
| 993 add v16.4s,v16.4s,v17.4s |
| 994 ror w11,w11,#25 |
| 995 add v20.4s,v20.4s,v21.4s |
| 996 ror w12,w12,#25 |
| 997 eor v24.16b,v3.16b,v0.16b |
| 998 add w5,w5,w10 |
| 999 eor v25.16b,v7.16b,v4.16b |
| 1000 add w6,w6,w11 |
| 1001 eor v26.16b,v11.16b,v8.16b |
| 1002 add w7,w7,w12 |
| 1003 eor v27.16b,v15.16b,v12.16b |
| 1004 add w8,w8,w9 |
| 1005 eor v28.16b,v19.16b,v16.16b |
| 1006 eor w21,w21,w5 |
| 1007 eor v29.16b,v23.16b,v20.16b |
| 1008 eor w17,w17,w6 |
| 1009 ushr v3.4s,v24.4s,#24 |
| 1010 eor w19,w19,w7 |
| 1011 ushr v7.4s,v25.4s,#24 |
| 1012 eor w20,w20,w8 |
| 1013 ushr v11.4s,v26.4s,#24 |
| 1014 ror w21,w21,#16 |
| 1015 ushr v15.4s,v27.4s,#24 |
| 1016 ror w17,w17,#16 |
| 1017 ushr v19.4s,v28.4s,#24 |
| 1018 ror w19,w19,#16 |
| 1019 ushr v23.4s,v29.4s,#24 |
| 1020 ror w20,w20,#16 |
| 1021 sli v3.4s,v24.4s,#8 |
| 1022 add w15,w15,w21 |
| 1023 sli v7.4s,v25.4s,#8 |
| 1024 add w16,w16,w17 |
| 1025 sli v11.4s,v26.4s,#8 |
| 1026 add w13,w13,w19 |
| 1027 sli v15.4s,v27.4s,#8 |
| 1028 add w14,w14,w20 |
| 1029 sli v19.4s,v28.4s,#8 |
| 1030 eor w10,w10,w15 |
| 1031 sli v23.4s,v29.4s,#8 |
| 1032 eor w11,w11,w16 |
| 1033 add v2.4s,v2.4s,v3.4s |
| 1034 eor w12,w12,w13 |
| 1035 add v6.4s,v6.4s,v7.4s |
| 1036 eor w9,w9,w14 |
| 1037 add v10.4s,v10.4s,v11.4s |
| 1038 ror w10,w10,#20 |
| 1039 add v14.4s,v14.4s,v15.4s |
| 1040 ror w11,w11,#20 |
| 1041 add v18.4s,v18.4s,v19.4s |
| 1042 ror w12,w12,#20 |
| 1043 add v22.4s,v22.4s,v23.4s |
| 1044 ror w9,w9,#20 |
| 1045 eor v24.16b,v1.16b,v2.16b |
| 1046 add w5,w5,w10 |
| 1047 eor v25.16b,v5.16b,v6.16b |
| 1048 add w6,w6,w11 |
| 1049 eor v26.16b,v9.16b,v10.16b |
| 1050 add w7,w7,w12 |
| 1051 eor v27.16b,v13.16b,v14.16b |
| 1052 add w8,w8,w9 |
| 1053 eor v28.16b,v17.16b,v18.16b |
| 1054 eor w21,w21,w5 |
| 1055 eor v29.16b,v21.16b,v22.16b |
| 1056 eor w17,w17,w6 |
| 1057 ushr v1.4s,v24.4s,#25 |
| 1058 eor w19,w19,w7 |
| 1059 ushr v5.4s,v25.4s,#25 |
| 1060 eor w20,w20,w8 |
| 1061 ushr v9.4s,v26.4s,#25 |
| 1062 ror w21,w21,#24 |
| 1063 ushr v13.4s,v27.4s,#25 |
| 1064 ror w17,w17,#24 |
| 1065 ushr v17.4s,v28.4s,#25 |
| 1066 ror w19,w19,#24 |
| 1067 ushr v21.4s,v29.4s,#25 |
| 1068 ror w20,w20,#24 |
| 1069 sli v1.4s,v24.4s,#7 |
| 1070 add w15,w15,w21 |
| 1071 sli v5.4s,v25.4s,#7 |
| 1072 add w16,w16,w17 |
| 1073 sli v9.4s,v26.4s,#7 |
| 1074 add w13,w13,w19 |
| 1075 sli v13.4s,v27.4s,#7 |
| 1076 add w14,w14,w20 |
| 1077 sli v17.4s,v28.4s,#7 |
| 1078 eor w10,w10,w15 |
| 1079 sli v21.4s,v29.4s,#7 |
| 1080 eor w11,w11,w16 |
| 1081 ext v2.16b,v2.16b,v2.16b,#8 |
| 1082 eor w12,w12,w13 |
| 1083 ext v6.16b,v6.16b,v6.16b,#8 |
| 1084 eor w9,w9,w14 |
| 1085 ext v10.16b,v10.16b,v10.16b,#8 |
| 1086 ror w10,w10,#25 |
| 1087 ext v14.16b,v14.16b,v14.16b,#8 |
| 1088 ror w11,w11,#25 |
| 1089 ext v18.16b,v18.16b,v18.16b,#8 |
| 1090 ror w12,w12,#25 |
| 1091 ext v22.16b,v22.16b,v22.16b,#8 |
| 1092 ror w9,w9,#25 |
| 1093 ext v3.16b,v3.16b,v3.16b,#12 |
| 1094 ext v7.16b,v7.16b,v7.16b,#12 |
| 1095 ext v11.16b,v11.16b,v11.16b,#12 |
| 1096 ext v15.16b,v15.16b,v15.16b,#12 |
| 1097 ext v19.16b,v19.16b,v19.16b,#12 |
| 1098 ext v23.16b,v23.16b,v23.16b,#12 |
| 1099 ext v1.16b,v1.16b,v1.16b,#4 |
| 1100 ext v5.16b,v5.16b,v5.16b,#4 |
| 1101 ext v9.16b,v9.16b,v9.16b,#4 |
| 1102 ext v13.16b,v13.16b,v13.16b,#4 |
| 1103 ext v17.16b,v17.16b,v17.16b,#4 |
| 1104 ext v21.16b,v21.16b,v21.16b,#4 |
| 1105 add v0.4s,v0.4s,v1.4s |
| 1106 add w5,w5,w9 |
| 1107 add v4.4s,v4.4s,v5.4s |
| 1108 add w6,w6,w10 |
| 1109 add v8.4s,v8.4s,v9.4s |
| 1110 add w7,w7,w11 |
| 1111 add v12.4s,v12.4s,v13.4s |
| 1112 add w8,w8,w12 |
| 1113 add v16.4s,v16.4s,v17.4s |
| 1114 eor w17,w17,w5 |
| 1115 add v20.4s,v20.4s,v21.4s |
| 1116 eor w19,w19,w6 |
| 1117 eor v3.16b,v3.16b,v0.16b |
| 1118 eor w20,w20,w7 |
| 1119 eor v7.16b,v7.16b,v4.16b |
| 1120 eor w21,w21,w8 |
| 1121 eor v11.16b,v11.16b,v8.16b |
| 1122 ror w17,w17,#16 |
| 1123 eor v15.16b,v15.16b,v12.16b |
| 1124 ror w19,w19,#16 |
| 1125 eor v19.16b,v19.16b,v16.16b |
| 1126 ror w20,w20,#16 |
| 1127 eor v23.16b,v23.16b,v20.16b |
| 1128 ror w21,w21,#16 |
| 1129 rev32 v3.8h,v3.8h |
| 1130 add w13,w13,w17 |
| 1131 rev32 v7.8h,v7.8h |
| 1132 add w14,w14,w19 |
| 1133 rev32 v11.8h,v11.8h |
| 1134 add w15,w15,w20 |
| 1135 rev32 v15.8h,v15.8h |
| 1136 add w16,w16,w21 |
| 1137 rev32 v19.8h,v19.8h |
| 1138 eor w9,w9,w13 |
| 1139 rev32 v23.8h,v23.8h |
| 1140 eor w10,w10,w14 |
| 1141 add v2.4s,v2.4s,v3.4s |
| 1142 eor w11,w11,w15 |
| 1143 add v6.4s,v6.4s,v7.4s |
| 1144 eor w12,w12,w16 |
| 1145 add v10.4s,v10.4s,v11.4s |
| 1146 ror w9,w9,#20 |
| 1147 add v14.4s,v14.4s,v15.4s |
| 1148 ror w10,w10,#20 |
| 1149 add v18.4s,v18.4s,v19.4s |
| 1150 ror w11,w11,#20 |
| 1151 add v22.4s,v22.4s,v23.4s |
| 1152 ror w12,w12,#20 |
| 1153 eor v24.16b,v1.16b,v2.16b |
| 1154 add w5,w5,w9 |
| 1155 eor v25.16b,v5.16b,v6.16b |
| 1156 add w6,w6,w10 |
| 1157 eor v26.16b,v9.16b,v10.16b |
| 1158 add w7,w7,w11 |
| 1159 eor v27.16b,v13.16b,v14.16b |
| 1160 add w8,w8,w12 |
| 1161 eor v28.16b,v17.16b,v18.16b |
| 1162 eor w17,w17,w5 |
| 1163 eor v29.16b,v21.16b,v22.16b |
| 1164 eor w19,w19,w6 |
| 1165 ushr v1.4s,v24.4s,#20 |
| 1166 eor w20,w20,w7 |
| 1167 ushr v5.4s,v25.4s,#20 |
| 1168 eor w21,w21,w8 |
| 1169 ushr v9.4s,v26.4s,#20 |
| 1170 ror w17,w17,#24 |
| 1171 ushr v13.4s,v27.4s,#20 |
| 1172 ror w19,w19,#24 |
| 1173 ushr v17.4s,v28.4s,#20 |
| 1174 ror w20,w20,#24 |
| 1175 ushr v21.4s,v29.4s,#20 |
| 1176 ror w21,w21,#24 |
| 1177 sli v1.4s,v24.4s,#12 |
| 1178 add w13,w13,w17 |
| 1179 sli v5.4s,v25.4s,#12 |
| 1180 add w14,w14,w19 |
| 1181 sli v9.4s,v26.4s,#12 |
| 1182 add w15,w15,w20 |
| 1183 sli v13.4s,v27.4s,#12 |
| 1184 add w16,w16,w21 |
| 1185 sli v17.4s,v28.4s,#12 |
| 1186 eor w9,w9,w13 |
| 1187 sli v21.4s,v29.4s,#12 |
| 1188 eor w10,w10,w14 |
| 1189 add v0.4s,v0.4s,v1.4s |
| 1190 eor w11,w11,w15 |
| 1191 add v4.4s,v4.4s,v5.4s |
| 1192 eor w12,w12,w16 |
| 1193 add v8.4s,v8.4s,v9.4s |
| 1194 ror w9,w9,#25 |
| 1195 add v12.4s,v12.4s,v13.4s |
| 1196 ror w10,w10,#25 |
| 1197 add v16.4s,v16.4s,v17.4s |
| 1198 ror w11,w11,#25 |
| 1199 add v20.4s,v20.4s,v21.4s |
| 1200 ror w12,w12,#25 |
| 1201 eor v24.16b,v3.16b,v0.16b |
| 1202 add w5,w5,w10 |
| 1203 eor v25.16b,v7.16b,v4.16b |
| 1204 add w6,w6,w11 |
| 1205 eor v26.16b,v11.16b,v8.16b |
| 1206 add w7,w7,w12 |
| 1207 eor v27.16b,v15.16b,v12.16b |
| 1208 add w8,w8,w9 |
| 1209 eor v28.16b,v19.16b,v16.16b |
| 1210 eor w21,w21,w5 |
| 1211 eor v29.16b,v23.16b,v20.16b |
| 1212 eor w17,w17,w6 |
| 1213 ushr v3.4s,v24.4s,#24 |
| 1214 eor w19,w19,w7 |
| 1215 ushr v7.4s,v25.4s,#24 |
| 1216 eor w20,w20,w8 |
| 1217 ushr v11.4s,v26.4s,#24 |
| 1218 ror w21,w21,#16 |
| 1219 ushr v15.4s,v27.4s,#24 |
| 1220 ror w17,w17,#16 |
| 1221 ushr v19.4s,v28.4s,#24 |
| 1222 ror w19,w19,#16 |
| 1223 ushr v23.4s,v29.4s,#24 |
| 1224 ror w20,w20,#16 |
| 1225 sli v3.4s,v24.4s,#8 |
| 1226 add w15,w15,w21 |
| 1227 sli v7.4s,v25.4s,#8 |
| 1228 add w16,w16,w17 |
| 1229 sli v11.4s,v26.4s,#8 |
| 1230 add w13,w13,w19 |
| 1231 sli v15.4s,v27.4s,#8 |
| 1232 add w14,w14,w20 |
| 1233 sli v19.4s,v28.4s,#8 |
| 1234 eor w10,w10,w15 |
| 1235 sli v23.4s,v29.4s,#8 |
| 1236 eor w11,w11,w16 |
| 1237 add v2.4s,v2.4s,v3.4s |
| 1238 eor w12,w12,w13 |
| 1239 add v6.4s,v6.4s,v7.4s |
| 1240 eor w9,w9,w14 |
| 1241 add v10.4s,v10.4s,v11.4s |
| 1242 ror w10,w10,#20 |
| 1243 add v14.4s,v14.4s,v15.4s |
| 1244 ror w11,w11,#20 |
| 1245 add v18.4s,v18.4s,v19.4s |
| 1246 ror w12,w12,#20 |
| 1247 add v22.4s,v22.4s,v23.4s |
| 1248 ror w9,w9,#20 |
| 1249 eor v24.16b,v1.16b,v2.16b |
| 1250 add w5,w5,w10 |
| 1251 eor v25.16b,v5.16b,v6.16b |
| 1252 add w6,w6,w11 |
| 1253 eor v26.16b,v9.16b,v10.16b |
| 1254 add w7,w7,w12 |
| 1255 eor v27.16b,v13.16b,v14.16b |
| 1256 add w8,w8,w9 |
| 1257 eor v28.16b,v17.16b,v18.16b |
| 1258 eor w21,w21,w5 |
| 1259 eor v29.16b,v21.16b,v22.16b |
| 1260 eor w17,w17,w6 |
| 1261 ushr v1.4s,v24.4s,#25 |
| 1262 eor w19,w19,w7 |
| 1263 ushr v5.4s,v25.4s,#25 |
| 1264 eor w20,w20,w8 |
| 1265 ushr v9.4s,v26.4s,#25 |
| 1266 ror w21,w21,#24 |
| 1267 ushr v13.4s,v27.4s,#25 |
| 1268 ror w17,w17,#24 |
| 1269 ushr v17.4s,v28.4s,#25 |
| 1270 ror w19,w19,#24 |
| 1271 ushr v21.4s,v29.4s,#25 |
| 1272 ror w20,w20,#24 |
| 1273 sli v1.4s,v24.4s,#7 |
| 1274 add w15,w15,w21 |
| 1275 sli v5.4s,v25.4s,#7 |
| 1276 add w16,w16,w17 |
| 1277 sli v9.4s,v26.4s,#7 |
| 1278 add w13,w13,w19 |
| 1279 sli v13.4s,v27.4s,#7 |
| 1280 add w14,w14,w20 |
| 1281 sli v17.4s,v28.4s,#7 |
| 1282 eor w10,w10,w15 |
| 1283 sli v21.4s,v29.4s,#7 |
| 1284 eor w11,w11,w16 |
| 1285 ext v2.16b,v2.16b,v2.16b,#8 |
| 1286 eor w12,w12,w13 |
| 1287 ext v6.16b,v6.16b,v6.16b,#8 |
| 1288 eor w9,w9,w14 |
| 1289 ext v10.16b,v10.16b,v10.16b,#8 |
| 1290 ror w10,w10,#25 |
| 1291 ext v14.16b,v14.16b,v14.16b,#8 |
| 1292 ror w11,w11,#25 |
| 1293 ext v18.16b,v18.16b,v18.16b,#8 |
| 1294 ror w12,w12,#25 |
| 1295 ext v22.16b,v22.16b,v22.16b,#8 |
| 1296 ror w9,w9,#25 |
| 1297 ext v3.16b,v3.16b,v3.16b,#4 |
| 1298 ext v7.16b,v7.16b,v7.16b,#4 |
| 1299 ext v11.16b,v11.16b,v11.16b,#4 |
| 1300 ext v15.16b,v15.16b,v15.16b,#4 |
| 1301 ext v19.16b,v19.16b,v19.16b,#4 |
| 1302 ext v23.16b,v23.16b,v23.16b,#4 |
| 1303 ext v1.16b,v1.16b,v1.16b,#12 |
| 1304 ext v5.16b,v5.16b,v5.16b,#12 |
| 1305 ext v9.16b,v9.16b,v9.16b,#12 |
| 1306 ext v13.16b,v13.16b,v13.16b,#12 |
| 1307 ext v17.16b,v17.16b,v17.16b,#12 |
| 1308 ext v21.16b,v21.16b,v21.16b,#12 |
| 1309 cbnz x4,.Loop_upper_neon |
| 1310 |
| 1311 add w5,w5,w22 // accumulate key block |
| 1312 add x6,x6,x22,lsr#32 |
| 1313 add w7,w7,w23 |
| 1314 add x8,x8,x23,lsr#32 |
| 1315 add w9,w9,w24 |
| 1316 add x10,x10,x24,lsr#32 |
| 1317 add w11,w11,w25 |
| 1318 add x12,x12,x25,lsr#32 |
| 1319 add w13,w13,w26 |
| 1320 add x14,x14,x26,lsr#32 |
| 1321 add w15,w15,w27 |
| 1322 add x16,x16,x27,lsr#32 |
| 1323 add w17,w17,w28 |
| 1324 add x19,x19,x28,lsr#32 |
| 1325 add w20,w20,w30 |
| 1326 add x21,x21,x30,lsr#32 |
| 1327 |
| 1328 add x5,x5,x6,lsl#32 // pack |
| 1329 add x7,x7,x8,lsl#32 |
| 1330 ldp x6,x8,[x1,#0] // load input |
| 1331 add x9,x9,x10,lsl#32 |
| 1332 add x11,x11,x12,lsl#32 |
| 1333 ldp x10,x12,[x1,#16] |
| 1334 add x13,x13,x14,lsl#32 |
| 1335 add x15,x15,x16,lsl#32 |
| 1336 ldp x14,x16,[x1,#32] |
| 1337 add x17,x17,x19,lsl#32 |
| 1338 add x20,x20,x21,lsl#32 |
| 1339 ldp x19,x21,[x1,#48] |
| 1340 add x1,x1,#64 |
| 1341 #ifdef __ARMEB__ |
| 1342 rev x5,x5 |
| 1343 rev x7,x7 |
| 1344 rev x9,x9 |
| 1345 rev x11,x11 |
| 1346 rev x13,x13 |
| 1347 rev x15,x15 |
| 1348 rev x17,x17 |
| 1349 rev x20,x20 |
| 1350 #endif |
| 1351 eor x5,x5,x6 |
| 1352 eor x7,x7,x8 |
| 1353 eor x9,x9,x10 |
| 1354 eor x11,x11,x12 |
| 1355 eor x13,x13,x14 |
| 1356 eor x15,x15,x16 |
| 1357 eor x17,x17,x19 |
| 1358 eor x20,x20,x21 |
| 1359 |
| 1360 stp x5,x7,[x0,#0] // store output |
| 1361 add x28,x28,#1 // increment counter |
| 1362 mov w5,w22 // unpack key block |
| 1363 lsr x6,x22,#32 |
| 1364 stp x9,x11,[x0,#16] |
| 1365 mov w7,w23 |
| 1366 lsr x8,x23,#32 |
| 1367 stp x13,x15,[x0,#32] |
| 1368 mov w9,w24 |
| 1369 lsr x10,x24,#32 |
| 1370 stp x17,x20,[x0,#48] |
| 1371 add x0,x0,#64 |
| 1372 mov w11,w25 |
| 1373 lsr x12,x25,#32 |
| 1374 mov w13,w26 |
| 1375 lsr x14,x26,#32 |
| 1376 mov w15,w27 |
| 1377 lsr x16,x27,#32 |
| 1378 mov w17,w28 |
| 1379 lsr x19,x28,#32 |
| 1380 mov w20,w30 |
| 1381 lsr x21,x30,#32 |
| 1382 |
| 1383 mov x4,#5 |
| 1384 .Loop_lower_neon: |
| 1385 sub x4,x4,#1 |
| 1386 add v0.4s,v0.4s,v1.4s |
| 1387 add w5,w5,w9 |
| 1388 add v4.4s,v4.4s,v5.4s |
| 1389 add w6,w6,w10 |
| 1390 add v8.4s,v8.4s,v9.4s |
| 1391 add w7,w7,w11 |
| 1392 add v12.4s,v12.4s,v13.4s |
| 1393 add w8,w8,w12 |
| 1394 add v16.4s,v16.4s,v17.4s |
| 1395 eor w17,w17,w5 |
| 1396 add v20.4s,v20.4s,v21.4s |
| 1397 eor w19,w19,w6 |
| 1398 eor v3.16b,v3.16b,v0.16b |
| 1399 eor w20,w20,w7 |
| 1400 eor v7.16b,v7.16b,v4.16b |
| 1401 eor w21,w21,w8 |
| 1402 eor v11.16b,v11.16b,v8.16b |
| 1403 ror w17,w17,#16 |
| 1404 eor v15.16b,v15.16b,v12.16b |
| 1405 ror w19,w19,#16 |
| 1406 eor v19.16b,v19.16b,v16.16b |
| 1407 ror w20,w20,#16 |
| 1408 eor v23.16b,v23.16b,v20.16b |
| 1409 ror w21,w21,#16 |
| 1410 rev32 v3.8h,v3.8h |
| 1411 add w13,w13,w17 |
| 1412 rev32 v7.8h,v7.8h |
| 1413 add w14,w14,w19 |
| 1414 rev32 v11.8h,v11.8h |
| 1415 add w15,w15,w20 |
| 1416 rev32 v15.8h,v15.8h |
| 1417 add w16,w16,w21 |
| 1418 rev32 v19.8h,v19.8h |
| 1419 eor w9,w9,w13 |
| 1420 rev32 v23.8h,v23.8h |
| 1421 eor w10,w10,w14 |
| 1422 add v2.4s,v2.4s,v3.4s |
| 1423 eor w11,w11,w15 |
| 1424 add v6.4s,v6.4s,v7.4s |
| 1425 eor w12,w12,w16 |
| 1426 add v10.4s,v10.4s,v11.4s |
| 1427 ror w9,w9,#20 |
| 1428 add v14.4s,v14.4s,v15.4s |
| 1429 ror w10,w10,#20 |
| 1430 add v18.4s,v18.4s,v19.4s |
| 1431 ror w11,w11,#20 |
| 1432 add v22.4s,v22.4s,v23.4s |
| 1433 ror w12,w12,#20 |
| 1434 eor v24.16b,v1.16b,v2.16b |
| 1435 add w5,w5,w9 |
| 1436 eor v25.16b,v5.16b,v6.16b |
| 1437 add w6,w6,w10 |
| 1438 eor v26.16b,v9.16b,v10.16b |
| 1439 add w7,w7,w11 |
| 1440 eor v27.16b,v13.16b,v14.16b |
| 1441 add w8,w8,w12 |
| 1442 eor v28.16b,v17.16b,v18.16b |
| 1443 eor w17,w17,w5 |
| 1444 eor v29.16b,v21.16b,v22.16b |
| 1445 eor w19,w19,w6 |
| 1446 ushr v1.4s,v24.4s,#20 |
| 1447 eor w20,w20,w7 |
| 1448 ushr v5.4s,v25.4s,#20 |
| 1449 eor w21,w21,w8 |
| 1450 ushr v9.4s,v26.4s,#20 |
| 1451 ror w17,w17,#24 |
| 1452 ushr v13.4s,v27.4s,#20 |
| 1453 ror w19,w19,#24 |
| 1454 ushr v17.4s,v28.4s,#20 |
| 1455 ror w20,w20,#24 |
| 1456 ushr v21.4s,v29.4s,#20 |
| 1457 ror w21,w21,#24 |
| 1458 sli v1.4s,v24.4s,#12 |
| 1459 add w13,w13,w17 |
| 1460 sli v5.4s,v25.4s,#12 |
| 1461 add w14,w14,w19 |
| 1462 sli v9.4s,v26.4s,#12 |
| 1463 add w15,w15,w20 |
| 1464 sli v13.4s,v27.4s,#12 |
| 1465 add w16,w16,w21 |
| 1466 sli v17.4s,v28.4s,#12 |
| 1467 eor w9,w9,w13 |
| 1468 sli v21.4s,v29.4s,#12 |
| 1469 eor w10,w10,w14 |
| 1470 add v0.4s,v0.4s,v1.4s |
| 1471 eor w11,w11,w15 |
| 1472 add v4.4s,v4.4s,v5.4s |
| 1473 eor w12,w12,w16 |
| 1474 add v8.4s,v8.4s,v9.4s |
| 1475 ror w9,w9,#25 |
| 1476 add v12.4s,v12.4s,v13.4s |
| 1477 ror w10,w10,#25 |
| 1478 add v16.4s,v16.4s,v17.4s |
| 1479 ror w11,w11,#25 |
| 1480 add v20.4s,v20.4s,v21.4s |
| 1481 ror w12,w12,#25 |
| 1482 eor v24.16b,v3.16b,v0.16b |
| 1483 add w5,w5,w10 |
| 1484 eor v25.16b,v7.16b,v4.16b |
| 1485 add w6,w6,w11 |
| 1486 eor v26.16b,v11.16b,v8.16b |
| 1487 add w7,w7,w12 |
| 1488 eor v27.16b,v15.16b,v12.16b |
| 1489 add w8,w8,w9 |
| 1490 eor v28.16b,v19.16b,v16.16b |
| 1491 eor w21,w21,w5 |
| 1492 eor v29.16b,v23.16b,v20.16b |
| 1493 eor w17,w17,w6 |
| 1494 ushr v3.4s,v24.4s,#24 |
| 1495 eor w19,w19,w7 |
| 1496 ushr v7.4s,v25.4s,#24 |
| 1497 eor w20,w20,w8 |
| 1498 ushr v11.4s,v26.4s,#24 |
| 1499 ror w21,w21,#16 |
| 1500 ushr v15.4s,v27.4s,#24 |
| 1501 ror w17,w17,#16 |
| 1502 ushr v19.4s,v28.4s,#24 |
| 1503 ror w19,w19,#16 |
| 1504 ushr v23.4s,v29.4s,#24 |
| 1505 ror w20,w20,#16 |
| 1506 sli v3.4s,v24.4s,#8 |
| 1507 add w15,w15,w21 |
| 1508 sli v7.4s,v25.4s,#8 |
| 1509 add w16,w16,w17 |
| 1510 sli v11.4s,v26.4s,#8 |
| 1511 add w13,w13,w19 |
| 1512 sli v15.4s,v27.4s,#8 |
| 1513 add w14,w14,w20 |
| 1514 sli v19.4s,v28.4s,#8 |
| 1515 eor w10,w10,w15 |
| 1516 sli v23.4s,v29.4s,#8 |
| 1517 eor w11,w11,w16 |
| 1518 add v2.4s,v2.4s,v3.4s |
| 1519 eor w12,w12,w13 |
| 1520 add v6.4s,v6.4s,v7.4s |
| 1521 eor w9,w9,w14 |
| 1522 add v10.4s,v10.4s,v11.4s |
| 1523 ror w10,w10,#20 |
| 1524 add v14.4s,v14.4s,v15.4s |
| 1525 ror w11,w11,#20 |
| 1526 add v18.4s,v18.4s,v19.4s |
| 1527 ror w12,w12,#20 |
| 1528 add v22.4s,v22.4s,v23.4s |
| 1529 ror w9,w9,#20 |
| 1530 eor v24.16b,v1.16b,v2.16b |
| 1531 add w5,w5,w10 |
| 1532 eor v25.16b,v5.16b,v6.16b |
| 1533 add w6,w6,w11 |
| 1534 eor v26.16b,v9.16b,v10.16b |
| 1535 add w7,w7,w12 |
| 1536 eor v27.16b,v13.16b,v14.16b |
| 1537 add w8,w8,w9 |
| 1538 eor v28.16b,v17.16b,v18.16b |
| 1539 eor w21,w21,w5 |
| 1540 eor v29.16b,v21.16b,v22.16b |
| 1541 eor w17,w17,w6 |
| 1542 ushr v1.4s,v24.4s,#25 |
| 1543 eor w19,w19,w7 |
| 1544 ushr v5.4s,v25.4s,#25 |
| 1545 eor w20,w20,w8 |
| 1546 ushr v9.4s,v26.4s,#25 |
| 1547 ror w21,w21,#24 |
| 1548 ushr v13.4s,v27.4s,#25 |
| 1549 ror w17,w17,#24 |
| 1550 ushr v17.4s,v28.4s,#25 |
| 1551 ror w19,w19,#24 |
| 1552 ushr v21.4s,v29.4s,#25 |
| 1553 ror w20,w20,#24 |
| 1554 sli v1.4s,v24.4s,#7 |
| 1555 add w15,w15,w21 |
| 1556 sli v5.4s,v25.4s,#7 |
| 1557 add w16,w16,w17 |
| 1558 sli v9.4s,v26.4s,#7 |
| 1559 add w13,w13,w19 |
| 1560 sli v13.4s,v27.4s,#7 |
| 1561 add w14,w14,w20 |
| 1562 sli v17.4s,v28.4s,#7 |
| 1563 eor w10,w10,w15 |
| 1564 sli v21.4s,v29.4s,#7 |
| 1565 eor w11,w11,w16 |
| 1566 ext v2.16b,v2.16b,v2.16b,#8 |
| 1567 eor w12,w12,w13 |
| 1568 ext v6.16b,v6.16b,v6.16b,#8 |
| 1569 eor w9,w9,w14 |
| 1570 ext v10.16b,v10.16b,v10.16b,#8 |
| 1571 ror w10,w10,#25 |
| 1572 ext v14.16b,v14.16b,v14.16b,#8 |
| 1573 ror w11,w11,#25 |
| 1574 ext v18.16b,v18.16b,v18.16b,#8 |
| 1575 ror w12,w12,#25 |
| 1576 ext v22.16b,v22.16b,v22.16b,#8 |
| 1577 ror w9,w9,#25 |
| 1578 ext v3.16b,v3.16b,v3.16b,#12 |
| 1579 ext v7.16b,v7.16b,v7.16b,#12 |
| 1580 ext v11.16b,v11.16b,v11.16b,#12 |
| 1581 ext v15.16b,v15.16b,v15.16b,#12 |
| 1582 ext v19.16b,v19.16b,v19.16b,#12 |
| 1583 ext v23.16b,v23.16b,v23.16b,#12 |
| 1584 ext v1.16b,v1.16b,v1.16b,#4 |
| 1585 ext v5.16b,v5.16b,v5.16b,#4 |
| 1586 ext v9.16b,v9.16b,v9.16b,#4 |
| 1587 ext v13.16b,v13.16b,v13.16b,#4 |
| 1588 ext v17.16b,v17.16b,v17.16b,#4 |
| 1589 ext v21.16b,v21.16b,v21.16b,#4 |
| 1590 add v0.4s,v0.4s,v1.4s |
| 1591 add w5,w5,w9 |
| 1592 add v4.4s,v4.4s,v5.4s |
| 1593 add w6,w6,w10 |
| 1594 add v8.4s,v8.4s,v9.4s |
| 1595 add w7,w7,w11 |
| 1596 add v12.4s,v12.4s,v13.4s |
| 1597 add w8,w8,w12 |
| 1598 add v16.4s,v16.4s,v17.4s |
| 1599 eor w17,w17,w5 |
| 1600 add v20.4s,v20.4s,v21.4s |
| 1601 eor w19,w19,w6 |
| 1602 eor v3.16b,v3.16b,v0.16b |
| 1603 eor w20,w20,w7 |
| 1604 eor v7.16b,v7.16b,v4.16b |
| 1605 eor w21,w21,w8 |
| 1606 eor v11.16b,v11.16b,v8.16b |
| 1607 ror w17,w17,#16 |
| 1608 eor v15.16b,v15.16b,v12.16b |
| 1609 ror w19,w19,#16 |
| 1610 eor v19.16b,v19.16b,v16.16b |
| 1611 ror w20,w20,#16 |
| 1612 eor v23.16b,v23.16b,v20.16b |
| 1613 ror w21,w21,#16 |
| 1614 rev32 v3.8h,v3.8h |
| 1615 add w13,w13,w17 |
| 1616 rev32 v7.8h,v7.8h |
| 1617 add w14,w14,w19 |
| 1618 rev32 v11.8h,v11.8h |
| 1619 add w15,w15,w20 |
| 1620 rev32 v15.8h,v15.8h |
| 1621 add w16,w16,w21 |
| 1622 rev32 v19.8h,v19.8h |
| 1623 eor w9,w9,w13 |
| 1624 rev32 v23.8h,v23.8h |
| 1625 eor w10,w10,w14 |
| 1626 add v2.4s,v2.4s,v3.4s |
| 1627 eor w11,w11,w15 |
| 1628 add v6.4s,v6.4s,v7.4s |
| 1629 eor w12,w12,w16 |
| 1630 add v10.4s,v10.4s,v11.4s |
| 1631 ror w9,w9,#20 |
| 1632 add v14.4s,v14.4s,v15.4s |
| 1633 ror w10,w10,#20 |
| 1634 add v18.4s,v18.4s,v19.4s |
| 1635 ror w11,w11,#20 |
| 1636 add v22.4s,v22.4s,v23.4s |
| 1637 ror w12,w12,#20 |
| 1638 eor v24.16b,v1.16b,v2.16b |
| 1639 add w5,w5,w9 |
| 1640 eor v25.16b,v5.16b,v6.16b |
| 1641 add w6,w6,w10 |
| 1642 eor v26.16b,v9.16b,v10.16b |
| 1643 add w7,w7,w11 |
| 1644 eor v27.16b,v13.16b,v14.16b |
| 1645 add w8,w8,w12 |
| 1646 eor v28.16b,v17.16b,v18.16b |
| 1647 eor w17,w17,w5 |
| 1648 eor v29.16b,v21.16b,v22.16b |
| 1649 eor w19,w19,w6 |
| 1650 ushr v1.4s,v24.4s,#20 |
| 1651 eor w20,w20,w7 |
| 1652 ushr v5.4s,v25.4s,#20 |
| 1653 eor w21,w21,w8 |
| 1654 ushr v9.4s,v26.4s,#20 |
| 1655 ror w17,w17,#24 |
| 1656 ushr v13.4s,v27.4s,#20 |
| 1657 ror w19,w19,#24 |
| 1658 ushr v17.4s,v28.4s,#20 |
| 1659 ror w20,w20,#24 |
| 1660 ushr v21.4s,v29.4s,#20 |
| 1661 ror w21,w21,#24 |
| 1662 sli v1.4s,v24.4s,#12 |
| 1663 add w13,w13,w17 |
| 1664 sli v5.4s,v25.4s,#12 |
| 1665 add w14,w14,w19 |
| 1666 sli v9.4s,v26.4s,#12 |
| 1667 add w15,w15,w20 |
| 1668 sli v13.4s,v27.4s,#12 |
| 1669 add w16,w16,w21 |
| 1670 sli v17.4s,v28.4s,#12 |
| 1671 eor w9,w9,w13 |
| 1672 sli v21.4s,v29.4s,#12 |
| 1673 eor w10,w10,w14 |
| 1674 add v0.4s,v0.4s,v1.4s |
| 1675 eor w11,w11,w15 |
| 1676 add v4.4s,v4.4s,v5.4s |
| 1677 eor w12,w12,w16 |
| 1678 add v8.4s,v8.4s,v9.4s |
| 1679 ror w9,w9,#25 |
| 1680 add v12.4s,v12.4s,v13.4s |
| 1681 ror w10,w10,#25 |
| 1682 add v16.4s,v16.4s,v17.4s |
| 1683 ror w11,w11,#25 |
| 1684 add v20.4s,v20.4s,v21.4s |
| 1685 ror w12,w12,#25 |
| 1686 eor v24.16b,v3.16b,v0.16b |
| 1687 add w5,w5,w10 |
| 1688 eor v25.16b,v7.16b,v4.16b |
| 1689 add w6,w6,w11 |
| 1690 eor v26.16b,v11.16b,v8.16b |
| 1691 add w7,w7,w12 |
| 1692 eor v27.16b,v15.16b,v12.16b |
| 1693 add w8,w8,w9 |
| 1694 eor v28.16b,v19.16b,v16.16b |
| 1695 eor w21,w21,w5 |
| 1696 eor v29.16b,v23.16b,v20.16b |
| 1697 eor w17,w17,w6 |
| 1698 ushr v3.4s,v24.4s,#24 |
| 1699 eor w19,w19,w7 |
| 1700 ushr v7.4s,v25.4s,#24 |
| 1701 eor w20,w20,w8 |
| 1702 ushr v11.4s,v26.4s,#24 |
| 1703 ror w21,w21,#16 |
| 1704 ushr v15.4s,v27.4s,#24 |
| 1705 ror w17,w17,#16 |
| 1706 ushr v19.4s,v28.4s,#24 |
| 1707 ror w19,w19,#16 |
| 1708 ushr v23.4s,v29.4s,#24 |
| 1709 ror w20,w20,#16 |
| 1710 sli v3.4s,v24.4s,#8 |
| 1711 add w15,w15,w21 |
| 1712 sli v7.4s,v25.4s,#8 |
| 1713 add w16,w16,w17 |
| 1714 sli v11.4s,v26.4s,#8 |
| 1715 add w13,w13,w19 |
| 1716 sli v15.4s,v27.4s,#8 |
| 1717 add w14,w14,w20 |
| 1718 sli v19.4s,v28.4s,#8 |
| 1719 eor w10,w10,w15 |
| 1720 sli v23.4s,v29.4s,#8 |
| 1721 eor w11,w11,w16 |
| 1722 add v2.4s,v2.4s,v3.4s |
| 1723 eor w12,w12,w13 |
| 1724 add v6.4s,v6.4s,v7.4s |
| 1725 eor w9,w9,w14 |
| 1726 add v10.4s,v10.4s,v11.4s |
| 1727 ror w10,w10,#20 |
| 1728 add v14.4s,v14.4s,v15.4s |
| 1729 ror w11,w11,#20 |
| 1730 add v18.4s,v18.4s,v19.4s |
| 1731 ror w12,w12,#20 |
| 1732 add v22.4s,v22.4s,v23.4s |
| 1733 ror w9,w9,#20 |
| 1734 eor v24.16b,v1.16b,v2.16b |
| 1735 add w5,w5,w10 |
| 1736 eor v25.16b,v5.16b,v6.16b |
| 1737 add w6,w6,w11 |
| 1738 eor v26.16b,v9.16b,v10.16b |
| 1739 add w7,w7,w12 |
| 1740 eor v27.16b,v13.16b,v14.16b |
| 1741 add w8,w8,w9 |
| 1742 eor v28.16b,v17.16b,v18.16b |
| 1743 eor w21,w21,w5 |
| 1744 eor v29.16b,v21.16b,v22.16b |
| 1745 eor w17,w17,w6 |
| 1746 ushr v1.4s,v24.4s,#25 |
| 1747 eor w19,w19,w7 |
| 1748 ushr v5.4s,v25.4s,#25 |
| 1749 eor w20,w20,w8 |
| 1750 ushr v9.4s,v26.4s,#25 |
| 1751 ror w21,w21,#24 |
| 1752 ushr v13.4s,v27.4s,#25 |
| 1753 ror w17,w17,#24 |
| 1754 ushr v17.4s,v28.4s,#25 |
| 1755 ror w19,w19,#24 |
| 1756 ushr v21.4s,v29.4s,#25 |
| 1757 ror w20,w20,#24 |
| 1758 sli v1.4s,v24.4s,#7 |
| 1759 add w15,w15,w21 |
| 1760 sli v5.4s,v25.4s,#7 |
| 1761 add w16,w16,w17 |
| 1762 sli v9.4s,v26.4s,#7 |
| 1763 add w13,w13,w19 |
| 1764 sli v13.4s,v27.4s,#7 |
| 1765 add w14,w14,w20 |
| 1766 sli v17.4s,v28.4s,#7 |
| 1767 eor w10,w10,w15 |
| 1768 sli v21.4s,v29.4s,#7 |
| 1769 eor w11,w11,w16 |
| 1770 ext v2.16b,v2.16b,v2.16b,#8 |
| 1771 eor w12,w12,w13 |
| 1772 ext v6.16b,v6.16b,v6.16b,#8 |
| 1773 eor w9,w9,w14 |
| 1774 ext v10.16b,v10.16b,v10.16b,#8 |
| 1775 ror w10,w10,#25 |
| 1776 ext v14.16b,v14.16b,v14.16b,#8 |
| 1777 ror w11,w11,#25 |
| 1778 ext v18.16b,v18.16b,v18.16b,#8 |
| 1779 ror w12,w12,#25 |
| 1780 ext v22.16b,v22.16b,v22.16b,#8 |
| 1781 ror w9,w9,#25 |
| 1782 ext v3.16b,v3.16b,v3.16b,#4 |
| 1783 ext v7.16b,v7.16b,v7.16b,#4 |
| 1784 ext v11.16b,v11.16b,v11.16b,#4 |
| 1785 ext v15.16b,v15.16b,v15.16b,#4 |
| 1786 ext v19.16b,v19.16b,v19.16b,#4 |
| 1787 ext v23.16b,v23.16b,v23.16b,#4 |
| 1788 ext v1.16b,v1.16b,v1.16b,#12 |
| 1789 ext v5.16b,v5.16b,v5.16b,#12 |
| 1790 ext v9.16b,v9.16b,v9.16b,#12 |
| 1791 ext v13.16b,v13.16b,v13.16b,#12 |
| 1792 ext v17.16b,v17.16b,v17.16b,#12 |
| 1793 ext v21.16b,v21.16b,v21.16b,#12 |
| 1794 cbnz x4,.Loop_lower_neon |
| 1795 |
| 1796 add w5,w5,w22 // accumulate key block |
| 1797 ldp q24,q25,[sp,#0] |
| 1798 add x6,x6,x22,lsr#32 |
| 1799 ldp q26,q27,[sp,#32] |
| 1800 add w7,w7,w23 |
| 1801 ldp q28,q29,[sp,#64] |
| 1802 add x8,x8,x23,lsr#32 |
| 1803 add v0.4s,v0.4s,v24.4s |
| 1804 add w9,w9,w24 |
| 1805 add v4.4s,v4.4s,v24.4s |
| 1806 add x10,x10,x24,lsr#32 |
| 1807 add v8.4s,v8.4s,v24.4s |
| 1808 add w11,w11,w25 |
| 1809 add v12.4s,v12.4s,v24.4s |
| 1810 add x12,x12,x25,lsr#32 |
| 1811 add v16.4s,v16.4s,v24.4s |
| 1812 add w13,w13,w26 |
| 1813 add v20.4s,v20.4s,v24.4s |
| 1814 add x14,x14,x26,lsr#32 |
| 1815 add v2.4s,v2.4s,v26.4s |
| 1816 add w15,w15,w27 |
| 1817 add v6.4s,v6.4s,v26.4s |
| 1818 add x16,x16,x27,lsr#32 |
| 1819 add v10.4s,v10.4s,v26.4s |
| 1820 add w17,w17,w28 |
| 1821 add v14.4s,v14.4s,v26.4s |
| 1822 add x19,x19,x28,lsr#32 |
| 1823 add v18.4s,v18.4s,v26.4s |
| 1824 add w20,w20,w30 |
| 1825 add v22.4s,v22.4s,v26.4s |
| 1826 add x21,x21,x30,lsr#32 |
| 1827 add v19.4s,v19.4s,v31.4s // +4 |
| 1828 add x5,x5,x6,lsl#32 // pack |
| 1829 add v23.4s,v23.4s,v31.4s // +4 |
| 1830 add x7,x7,x8,lsl#32 |
| 1831 add v3.4s,v3.4s,v27.4s |
| 1832 ldp x6,x8,[x1,#0] // load input |
| 1833 add v7.4s,v7.4s,v28.4s |
| 1834 add x9,x9,x10,lsl#32 |
| 1835 add v11.4s,v11.4s,v29.4s |
| 1836 add x11,x11,x12,lsl#32 |
| 1837 add v15.4s,v15.4s,v30.4s |
| 1838 ldp x10,x12,[x1,#16] |
| 1839 add v19.4s,v19.4s,v27.4s |
| 1840 add x13,x13,x14,lsl#32 |
| 1841 add v23.4s,v23.4s,v28.4s |
| 1842 add x15,x15,x16,lsl#32 |
| 1843 add v1.4s,v1.4s,v25.4s |
| 1844 ldp x14,x16,[x1,#32] |
| 1845 add v5.4s,v5.4s,v25.4s |
| 1846 add x17,x17,x19,lsl#32 |
| 1847 add v9.4s,v9.4s,v25.4s |
| 1848 add x20,x20,x21,lsl#32 |
| 1849 add v13.4s,v13.4s,v25.4s |
| 1850 ldp x19,x21,[x1,#48] |
| 1851 add v17.4s,v17.4s,v25.4s |
| 1852 add x1,x1,#64 |
| 1853 add v21.4s,v21.4s,v25.4s |
| 1854 |
| 1855 #ifdef __ARMEB__ |
| 1856 rev x5,x5 |
| 1857 rev x7,x7 |
| 1858 rev x9,x9 |
| 1859 rev x11,x11 |
| 1860 rev x13,x13 |
| 1861 rev x15,x15 |
| 1862 rev x17,x17 |
| 1863 rev x20,x20 |
| 1864 #endif |
| 1865 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 |
| 1866 eor x5,x5,x6 |
| 1867 eor x7,x7,x8 |
| 1868 eor x9,x9,x10 |
| 1869 eor x11,x11,x12 |
| 1870 eor x13,x13,x14 |
| 1871 eor v0.16b,v0.16b,v24.16b |
| 1872 eor x15,x15,x16 |
| 1873 eor v1.16b,v1.16b,v25.16b |
| 1874 eor x17,x17,x19 |
| 1875 eor v2.16b,v2.16b,v26.16b |
| 1876 eor x20,x20,x21 |
| 1877 eor v3.16b,v3.16b,v27.16b |
| 1878 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 |
| 1879 |
| 1880 stp x5,x7,[x0,#0] // store output |
| 1881 add x28,x28,#7 // increment counter |
| 1882 stp x9,x11,[x0,#16] |
| 1883 stp x13,x15,[x0,#32] |
| 1884 stp x17,x20,[x0,#48] |
| 1885 add x0,x0,#64 |
| 1886 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
| 1887 |
| 1888 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 |
| 1889 eor v4.16b,v4.16b,v24.16b |
| 1890 eor v5.16b,v5.16b,v25.16b |
| 1891 eor v6.16b,v6.16b,v26.16b |
| 1892 eor v7.16b,v7.16b,v27.16b |
| 1893 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
| 1894 |
| 1895 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 |
| 1896 eor v8.16b,v8.16b,v0.16b |
| 1897 ldp q24,q25,[sp,#0] |
| 1898 eor v9.16b,v9.16b,v1.16b |
| 1899 ldp q26,q27,[sp,#32] |
| 1900 eor v10.16b,v10.16b,v2.16b |
| 1901 eor v11.16b,v11.16b,v3.16b |
| 1902 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 |
| 1903 |
| 1904 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 |
| 1905 eor v12.16b,v12.16b,v4.16b |
| 1906 eor v13.16b,v13.16b,v5.16b |
| 1907 eor v14.16b,v14.16b,v6.16b |
| 1908 eor v15.16b,v15.16b,v7.16b |
| 1909 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 |
| 1910 |
| 1911 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 |
| 1912 eor v16.16b,v16.16b,v8.16b |
| 1913 eor v17.16b,v17.16b,v9.16b |
| 1914 eor v18.16b,v18.16b,v10.16b |
| 1915 eor v19.16b,v19.16b,v11.16b |
| 1916 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 |
| 1917 |
| 1918 shl v0.4s,v31.4s,#1 // 4 -> 8 |
| 1919 eor v20.16b,v20.16b,v12.16b |
| 1920 eor v21.16b,v21.16b,v13.16b |
| 1921 eor v22.16b,v22.16b,v14.16b |
| 1922 eor v23.16b,v23.16b,v15.16b |
| 1923 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 |
| 1924 |
| 1925 add v27.4s,v27.4s,v0.4s // += 8 |
| 1926 add v28.4s,v28.4s,v0.4s |
| 1927 add v29.4s,v29.4s,v0.4s |
| 1928 add v30.4s,v30.4s,v0.4s |
| 1929 |
| 1930 b.hs .Loop_outer_512_neon |
| 1931 |
| 1932 adds x2,x2,#512 |
| 1933 ushr v0.4s,v31.4s,#2 // 4 -> 1 |
| 1934 |
| 1935 ldp d8,d9,[sp,#128+0] // meet ABI requirements |
| 1936 ldp d10,d11,[sp,#128+16] |
| 1937 ldp d12,d13,[sp,#128+32] |
| 1938 ldp d14,d15,[sp,#128+48] |
| 1939 |
| 1940 stp q24,q31,[sp,#0] // wipe off-load area |
| 1941 stp q24,q31,[sp,#32] |
| 1942 stp q24,q31,[sp,#64] |
| 1943 |
| 1944 b.eq .Ldone_512_neon |
| 1945 |
| 1946 cmp x2,#192 |
| 1947 sub v27.4s,v27.4s,v0.4s // -= 1 |
| 1948 sub v28.4s,v28.4s,v0.4s |
| 1949 sub v29.4s,v29.4s,v0.4s |
| 1950 add sp,sp,#128 |
| 1951 b.hs .Loop_outer_neon |
| 1952 |
| 1953 eor v25.16b,v25.16b,v25.16b |
| 1954 eor v26.16b,v26.16b,v26.16b |
| 1955 eor v27.16b,v27.16b,v27.16b |
| 1956 eor v28.16b,v28.16b,v28.16b |
| 1957 eor v29.16b,v29.16b,v29.16b |
| 1958 eor v30.16b,v30.16b,v30.16b |
| 1959 b .Loop_outer |
| 1960 |
| 1961 .Ldone_512_neon: |
| 1962 ldp x19,x20,[x29,#16] |
| 1963 add sp,sp,#128+64 |
| 1964 ldp x21,x22,[x29,#32] |
| 1965 ldp x23,x24,[x29,#48] |
| 1966 ldp x25,x26,[x29,#64] |
| 1967 ldp x27,x28,[x29,#80] |
| 1968 ldp x29,x30,[sp],#96 |
| 1969 ret |
| 1970 .size ChaCha20_512_neon,.-ChaCha20_512_neon |
| 1971 #endif |
OLD | NEW |