Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(378)

Side by Side Diff: third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S

Issue 2354623003: Pull boringssl generated source from boringssl_gen (Closed)
Patch Set: . Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #if defined(__aarch64__)
2 #include <openssl/arm_arch.h>
3
4 .text
5
6
7
8 .align 5
9 .Lsigma:
10 .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral; the words 0x61707865,0x3320646e,0x79622d32,0x6b206574 spell "expand 32-byte k"
11 .Lone:
12 .long 1,0,0,0 // loaded into v31 by the NEON code and added to the counter vector
13 .LOPENSSL_armcap_P:
14 #ifdef __ILP32__
15 .long OPENSSL_armcap_P-. // PC-relative offset, so no absolute relocation is needed
16 #else
17 .quad OPENSSL_armcap_P-. // PC-relative offset, so no absolute relocation is needed
18 #endif
19 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
20 .align 2
21
// ChaCha20_ctr32 -- integer-register ChaCha20: XORs the input with keystream, 64 bytes per outer pass.
// Registers as used below:
//   x0 = output pointer, x1 = input pointer, x2 = length in bytes,
//   x3 = pointer to 32 bytes of key, x4 = pointer to the 16-byte counter block.
// NOTE(review): the C prototype is not visible in this file -- confirm the argument order against the header.
// x19-x28 and x29/x30 are saved in a 96-byte frame and restored on both exits;
// x30 is used as a data register (second 8 bytes of the counter block) in between.
// Inputs of 192 bytes or more go to ChaCha20_neon when OPENSSL_armcap_P has ARMV7_NEON set.
22 .globl ChaCha20_ctr32
23 .hidden ChaCha20_ctr32
24 .type ChaCha20_ctr32,%function
25 .align 5
26 ChaCha20_ctr32:
27 cbz x2,.Labort // zero length: return without touching the stack
28 adr x5,.LOPENSSL_armcap_P
29 cmp x2,#192
30 b.lo .Lshort // fewer than 192 bytes: stay on the integer path
31 #ifdef __ILP32__
32 ldrsw x6,[x5]
33 #else
34 ldr x6,[x5]
35 #endif
36 ldr w17,[x6,x5] // w17 = OPENSSL_armcap_P (label address + stored offset)
37 tst w17,#ARMV7_NEON
38 b.ne ChaCha20_neon // branch, not a call: ChaCha20_neon returns to our caller
39
40 .Lshort:
41 stp x29,x30,[sp,#-96]! // 96-byte frame: x29/x30 plus x19-x28
42 add x29,sp,#0
43
44 adr x5,.Lsigma
45 stp x19,x20,[sp,#16]
46 stp x21,x22,[sp,#32]
47 stp x23,x24,[sp,#48]
48 stp x25,x26,[sp,#64]
49 stp x27,x28,[sp,#80]
50 sub sp,sp,#64 // 64-byte scratch area, used only for a partial final block
51
52 ldp x22,x23,[x5] // load sigma
53 ldp x24,x25,[x3] // load key
54 ldp x26,x27,[x3,#16]
55 ldp x28,x30,[x4] // load counter
56 #ifdef __ARMEB__
57 ror x24,x24,#32
58 ror x25,x25,#32
59 ror x26,x26,#32
60 ror x27,x27,#32
61 ror x28,x28,#32
62 ror x30,x30,#32
63 #endif
64
65 .Loop_outer:
66 mov w5,w22 // unpack key block
67 lsr x6,x22,#32
68 mov w7,w23
69 lsr x8,x23,#32
70 mov w9,w24
71 lsr x10,x24,#32
72 mov w11,w25
73 lsr x12,x25,#32
74 mov w13,w26
75 lsr x14,x26,#32
76 mov w15,w27
77 lsr x16,x27,#32
78 mov w17,w28
79 lsr x19,x28,#32
80 mov w20,w30
81 lsr x21,x30,#32
82
83 mov x4,#10 // loop counter: 10 passes through .Loop
84 subs x2,x2,#64 // take one block; these flags are read by b.lo .Ltail and b.hi .Loop_outer (nothing in between sets flags)
85 .Loop:
86 sub x4,x4,#1
87 add w5,w5,w9 // first half: register groups (5,9,13,17) (6,10,14,19) (7,11,15,20) (8,12,16,21)
88 add w6,w6,w10
89 add w7,w7,w11
90 add w8,w8,w12
91 eor w17,w17,w5
92 eor w19,w19,w6
93 eor w20,w20,w7
94 eor w21,w21,w8
95 ror w17,w17,#16
96 ror w19,w19,#16
97 ror w20,w20,#16
98 ror w21,w21,#16
99 add w13,w13,w17
100 add w14,w14,w19
101 add w15,w15,w20
102 add w16,w16,w21
103 eor w9,w9,w13
104 eor w10,w10,w14
105 eor w11,w11,w15
106 eor w12,w12,w16
107 ror w9,w9,#20 // rotate right 20 = rotate left 12
108 ror w10,w10,#20
109 ror w11,w11,#20
110 ror w12,w12,#20
111 add w5,w5,w9
112 add w6,w6,w10
113 add w7,w7,w11
114 add w8,w8,w12
115 eor w17,w17,w5
116 eor w19,w19,w6
117 eor w20,w20,w7
118 eor w21,w21,w8
119 ror w17,w17,#24 // rotate right 24 = rotate left 8
120 ror w19,w19,#24
121 ror w20,w20,#24
122 ror w21,w21,#24
123 add w13,w13,w17
124 add w14,w14,w19
125 add w15,w15,w20
126 add w16,w16,w21
127 eor w9,w9,w13
128 eor w10,w10,w14
129 eor w11,w11,w15
130 eor w12,w12,w16
131 ror w9,w9,#25 // rotate right 25 = rotate left 7
132 ror w10,w10,#25
133 ror w11,w11,#25
134 ror w12,w12,#25
135 add w5,w5,w10 // second half: register groups (5,10,15,21) (6,11,16,17) (7,12,13,19) (8,9,14,20)
136 add w6,w6,w11
137 add w7,w7,w12
138 add w8,w8,w9
139 eor w21,w21,w5
140 eor w17,w17,w6
141 eor w19,w19,w7
142 eor w20,w20,w8
143 ror w21,w21,#16
144 ror w17,w17,#16
145 ror w19,w19,#16
146 ror w20,w20,#16
147 add w15,w15,w21
148 add w16,w16,w17
149 add w13,w13,w19
150 add w14,w14,w20
151 eor w10,w10,w15
152 eor w11,w11,w16
153 eor w12,w12,w13
154 eor w9,w9,w14
155 ror w10,w10,#20
156 ror w11,w11,#20
157 ror w12,w12,#20
158 ror w9,w9,#20
159 add w5,w5,w10
160 add w6,w6,w11
161 add w7,w7,w12
162 add w8,w8,w9
163 eor w21,w21,w5
164 eor w17,w17,w6
165 eor w19,w19,w7
166 eor w20,w20,w8
167 ror w21,w21,#24
168 ror w17,w17,#24
169 ror w19,w19,#24
170 ror w20,w20,#24
171 add w15,w15,w21
172 add w16,w16,w17
173 add w13,w13,w19
174 add w14,w14,w20
175 eor w10,w10,w15
176 eor w11,w11,w16
177 eor w12,w12,w13
178 eor w9,w9,w14
179 ror w10,w10,#25
180 ror w11,w11,#25
181 ror w12,w12,#25
182 ror w9,w9,#25
183 cbnz x4,.Loop
184
185 add w5,w5,w22 // accumulate key block
186 add x6,x6,x22,lsr#32
187 add w7,w7,w23
188 add x8,x8,x23,lsr#32
189 add w9,w9,w24
190 add x10,x10,x24,lsr#32
191 add w11,w11,w25
192 add x12,x12,x25,lsr#32
193 add w13,w13,w26
194 add x14,x14,x26,lsr#32
195 add w15,w15,w27
196 add x16,x16,x27,lsr#32
197 add w17,w17,w28
198 add x19,x19,x28,lsr#32
199 add w20,w20,w30
200 add x21,x21,x30,lsr#32
201
202 b.lo .Ltail // flags from the subs above: fewer than 64 bytes were left
203
204 add x5,x5,x6,lsl#32 // pack
205 add x7,x7,x8,lsl#32
206 ldp x6,x8,[x1,#0] // load input
207 add x9,x9,x10,lsl#32
208 add x11,x11,x12,lsl#32
209 ldp x10,x12,[x1,#16]
210 add x13,x13,x14,lsl#32
211 add x15,x15,x16,lsl#32
212 ldp x14,x16,[x1,#32]
213 add x17,x17,x19,lsl#32
214 add x20,x20,x21,lsl#32
215 ldp x19,x21,[x1,#48]
216 add x1,x1,#64
217 #ifdef __ARMEB__
218 rev x5,x5
219 rev x7,x7
220 rev x9,x9
221 rev x11,x11
222 rev x13,x13
223 rev x15,x15
224 rev x17,x17
225 rev x20,x20
226 #endif
227 eor x5,x5,x6
228 eor x7,x7,x8
229 eor x9,x9,x10
230 eor x11,x11,x12
231 eor x13,x13,x14
232 eor x15,x15,x16
233 eor x17,x17,x19
234 eor x20,x20,x21
235
236 stp x5,x7,[x0,#0] // store output
237 add x28,x28,#1 // increment counter
238 stp x9,x11,[x0,#16]
239 stp x13,x15,[x0,#32]
240 stp x17,x20,[x0,#48]
241 add x0,x0,#64
242
243 b.hi .Loop_outer // flags still from the subs: input remains after this block
244
245 ldp x19,x20,[x29,#16]
246 add sp,sp,#64 // release the scratch area
247 ldp x21,x22,[x29,#32]
248 ldp x23,x24,[x29,#48]
249 ldp x25,x26,[x29,#64]
250 ldp x27,x28,[x29,#80]
251 ldp x29,x30,[sp],#96
252 .Labort:
253 ret
254
255 .align 4
256 .Ltail:
257 add x2,x2,#64 // undo the subs: x2 = bytes remaining (1..63)
258 .Less_than_64: // also entered from .Ltail_neon in ChaCha20_neon
259 sub x0,x0,#1 // output is biased by -1 because x2 is incremented before the strb in the loop
260 add x1,x1,x2 // x1, x0 and x4 now point past the end of their buffers
261 add x0,x0,x2
262 add x4,sp,x2
263 neg x2,x2 // negative index that counts up to zero
264
265 add x5,x5,x6,lsl#32 // pack
266 add x7,x7,x8,lsl#32
267 add x9,x9,x10,lsl#32
268 add x11,x11,x12,lsl#32
269 add x13,x13,x14,lsl#32
270 add x15,x15,x16,lsl#32
271 add x17,x17,x19,lsl#32
272 add x20,x20,x21,lsl#32
273 #ifdef __ARMEB__
274 rev x5,x5
275 rev x7,x7
276 rev x9,x9
277 rev x11,x11
278 rev x13,x13
279 rev x15,x15
280 rev x17,x17
281 rev x20,x20
282 #endif
283 stp x5,x7,[sp,#0] // spill the keystream block to the stack scratch area
284 stp x9,x11,[sp,#16]
285 stp x13,x15,[sp,#32]
286 stp x17,x20,[sp,#48]
287
288 .Loop_tail:
289 ldrb w10,[x1,x2] // input byte
290 ldrb w11,[x4,x2] // keystream byte from the stack copy
291 add x2,x2,#1
292 eor w10,w10,w11
293 strb w10,[x0,x2]
294 cbnz x2,.Loop_tail
295
296 stp xzr,xzr,[sp,#0] // zero the keystream copy on the stack
297 stp xzr,xzr,[sp,#16]
298 stp xzr,xzr,[sp,#32]
299 stp xzr,xzr,[sp,#48]
300
301 ldp x19,x20,[x29,#16]
302 add sp,sp,#64
303 ldp x21,x22,[x29,#32]
304 ldp x23,x24,[x29,#48]
305 ldp x25,x26,[x29,#64]
306 ldp x27,x28,[x29,#80]
307 ldp x29,x30,[sp],#96
308 ret
309 .size ChaCha20_ctr32,.-ChaCha20_ctr32
310
// ChaCha20_neon -- mixed integer/NEON path, entered by a branch from ChaCha20_ctr32 with the same
// x0-x4 contents (output, input, length, key pointer, counter pointer).
// Each .Loop_outer_neon pass produces 256 bytes of keystream: one 64-byte block in the integer
// registers (w5-w21) and three in NEON registers (v0-v3, v4-v7, v16-v19); v20-v23 are temporaries.
// v24 = sigma, v25/v26 = key, v27/v28/v29 = counter blocks of the three NEON blocks, v31 = counter step.
// Lengths of 512 bytes or more branch to .L512_or_more_neon inside ChaCha20_512_neon.
// The code in this function touches only v0-v7, v16-v29 and v31, and does not save d8-d15.
311 .type ChaCha20_neon,%function
312 .align 5
313 ChaCha20_neon:
314 stp x29,x30,[sp,#-96]! // 96-byte frame: x29/x30 plus x19-x28
315 add x29,sp,#0
316
317 adr x5,.Lsigma
318 stp x19,x20,[sp,#16]
319 stp x21,x22,[sp,#32]
320 stp x23,x24,[sp,#48]
321 stp x25,x26,[sp,#64]
322 stp x27,x28,[sp,#80]
323 cmp x2,#512
324 b.hs .L512_or_more_neon // 512 bytes or more: continue in ChaCha20_512_neon, reusing this frame
325
326 sub sp,sp,#64 // 64-byte scratch area, used only for a partial final block
327
328 ldp x22,x23,[x5] // load sigma
329 ld1 {v24.4s},[x5],#16
330 ldp x24,x25,[x3] // load key
331 ldp x26,x27,[x3,#16]
332 ld1 {v25.4s,v26.4s},[x3]
333 ldp x28,x30,[x4] // load counter
334 ld1 {v27.4s},[x4]
335 ld1 {v31.4s},[x5]
336 #ifdef __ARMEB__
337 rev64 v24.4s,v24.4s
338 ror x24,x24,#32
339 ror x25,x25,#32
340 ror x26,x26,#32
341 ror x27,x27,#32
342 ror x28,x28,#32
343 ror x30,x30,#32
344 #endif
345 add v27.4s,v27.4s,v31.4s // += 1
346 add v28.4s,v27.4s,v31.4s
347 add v29.4s,v28.4s,v31.4s
348 shl v31.4s,v31.4s,#2 // 1 -> 4
349
350 .Loop_outer_neon:
351 mov w5,w22 // unpack key block
352 lsr x6,x22,#32
353 mov v0.16b,v24.16b
354 mov w7,w23
355 lsr x8,x23,#32
356 mov v4.16b,v24.16b
357 mov w9,w24
358 lsr x10,x24,#32
359 mov v16.16b,v24.16b
360 mov w11,w25
361 mov v1.16b,v25.16b
362 lsr x12,x25,#32
363 mov v5.16b,v25.16b
364 mov w13,w26
365 mov v17.16b,v25.16b
366 lsr x14,x26,#32
367 mov v3.16b,v27.16b
368 mov w15,w27
369 mov v7.16b,v28.16b
370 lsr x16,x27,#32
371 mov v19.16b,v29.16b
372 mov w17,w28
373 mov v2.16b,v26.16b
374 lsr x19,x28,#32
375 mov v6.16b,v26.16b
376 mov w20,w30
377 mov v18.16b,v26.16b
378 lsr x21,x30,#32
379
380 mov x4,#10 // loop counter: 10 passes through .Loop_neon
381 subs x2,x2,#256 // take 256 bytes; these flags are read by b.lo .Ltail_neon and b.hi .Loop_outer_neon
382 .Loop_neon:
383 sub x4,x4,#1
384 add v0.4s,v0.4s,v1.4s
385 add w5,w5,w9
386 add v4.4s,v4.4s,v5.4s
387 add w6,w6,w10
388 add v16.4s,v16.4s,v17.4s
389 add w7,w7,w11
390 eor v3.16b,v3.16b,v0.16b
391 add w8,w8,w12
392 eor v7.16b,v7.16b,v4.16b
393 eor w17,w17,w5
394 eor v19.16b,v19.16b,v16.16b
395 eor w19,w19,w6
396 rev32 v3.8h,v3.8h // swap the halfwords of each word = rotate by 16
397 eor w20,w20,w7
398 rev32 v7.8h,v7.8h
399 eor w21,w21,w8
400 rev32 v19.8h,v19.8h
401 ror w17,w17,#16
402 add v2.4s,v2.4s,v3.4s
403 ror w19,w19,#16
404 add v6.4s,v6.4s,v7.4s
405 ror w20,w20,#16
406 add v18.4s,v18.4s,v19.4s
407 ror w21,w21,#16
408 eor v20.16b,v1.16b,v2.16b
409 add w13,w13,w17
410 eor v21.16b,v5.16b,v6.16b
411 add w14,w14,w19
412 eor v22.16b,v17.16b,v18.16b
413 add w15,w15,w20
414 ushr v1.4s,v20.4s,#20 // ushr 20 with the sli 12 below = rotate left by 12
415 add w16,w16,w21
416 ushr v5.4s,v21.4s,#20
417 eor w9,w9,w13
418 ushr v17.4s,v22.4s,#20
419 eor w10,w10,w14
420 sli v1.4s,v20.4s,#12
421 eor w11,w11,w15
422 sli v5.4s,v21.4s,#12
423 eor w12,w12,w16
424 sli v17.4s,v22.4s,#12
425 ror w9,w9,#20
426 add v0.4s,v0.4s,v1.4s
427 ror w10,w10,#20
428 add v4.4s,v4.4s,v5.4s
429 ror w11,w11,#20
430 add v16.4s,v16.4s,v17.4s
431 ror w12,w12,#20
432 eor v20.16b,v3.16b,v0.16b
433 add w5,w5,w9
434 eor v21.16b,v7.16b,v4.16b
435 add w6,w6,w10
436 eor v22.16b,v19.16b,v16.16b
437 add w7,w7,w11
438 ushr v3.4s,v20.4s,#24 // ushr 24 with the sli 8 below = rotate left by 8
439 add w8,w8,w12
440 ushr v7.4s,v21.4s,#24
441 eor w17,w17,w5
442 ushr v19.4s,v22.4s,#24
443 eor w19,w19,w6
444 sli v3.4s,v20.4s,#8
445 eor w20,w20,w7
446 sli v7.4s,v21.4s,#8
447 eor w21,w21,w8
448 sli v19.4s,v22.4s,#8
449 ror w17,w17,#24
450 add v2.4s,v2.4s,v3.4s
451 ror w19,w19,#24
452 add v6.4s,v6.4s,v7.4s
453 ror w20,w20,#24
454 add v18.4s,v18.4s,v19.4s
455 ror w21,w21,#24
456 eor v20.16b,v1.16b,v2.16b
457 add w13,w13,w17
458 eor v21.16b,v5.16b,v6.16b
459 add w14,w14,w19
460 eor v22.16b,v17.16b,v18.16b
461 add w15,w15,w20
462 ushr v1.4s,v20.4s,#25 // ushr 25 with the sli 7 below = rotate left by 7
463 add w16,w16,w21
464 ushr v5.4s,v21.4s,#25
465 eor w9,w9,w13
466 ushr v17.4s,v22.4s,#25
467 eor w10,w10,w14
468 sli v1.4s,v20.4s,#7
469 eor w11,w11,w15
470 sli v5.4s,v21.4s,#7
471 eor w12,w12,w16
472 sli v17.4s,v22.4s,#7
473 ror w9,w9,#25
474 ext v2.16b,v2.16b,v2.16b,#8 // rotate the vector by 8 bytes (two lanes)
475 ror w10,w10,#25
476 ext v6.16b,v6.16b,v6.16b,#8
477 ror w11,w11,#25
478 ext v18.16b,v18.16b,v18.16b,#8
479 ror w12,w12,#25
480 ext v3.16b,v3.16b,v3.16b,#12 // rotate by 12 bytes
481 ext v7.16b,v7.16b,v7.16b,#12
482 ext v19.16b,v19.16b,v19.16b,#12
483 ext v1.16b,v1.16b,v1.16b,#4 // rotate by 4 bytes
484 ext v5.16b,v5.16b,v5.16b,#4
485 ext v17.16b,v17.16b,v17.16b,#4
486 add v0.4s,v0.4s,v1.4s
487 add w5,w5,w10
488 add v4.4s,v4.4s,v5.4s
489 add w6,w6,w11
490 add v16.4s,v16.4s,v17.4s
491 add w7,w7,w12
492 eor v3.16b,v3.16b,v0.16b
493 add w8,w8,w9
494 eor v7.16b,v7.16b,v4.16b
495 eor w21,w21,w5
496 eor v19.16b,v19.16b,v16.16b
497 eor w17,w17,w6
498 rev32 v3.8h,v3.8h
499 eor w19,w19,w7
500 rev32 v7.8h,v7.8h
501 eor w20,w20,w8
502 rev32 v19.8h,v19.8h
503 ror w21,w21,#16
504 add v2.4s,v2.4s,v3.4s
505 ror w17,w17,#16
506 add v6.4s,v6.4s,v7.4s
507 ror w19,w19,#16
508 add v18.4s,v18.4s,v19.4s
509 ror w20,w20,#16
510 eor v20.16b,v1.16b,v2.16b
511 add w15,w15,w21
512 eor v21.16b,v5.16b,v6.16b
513 add w16,w16,w17
514 eor v22.16b,v17.16b,v18.16b
515 add w13,w13,w19
516 ushr v1.4s,v20.4s,#20
517 add w14,w14,w20
518 ushr v5.4s,v21.4s,#20
519 eor w10,w10,w15
520 ushr v17.4s,v22.4s,#20
521 eor w11,w11,w16
522 sli v1.4s,v20.4s,#12
523 eor w12,w12,w13
524 sli v5.4s,v21.4s,#12
525 eor w9,w9,w14
526 sli v17.4s,v22.4s,#12
527 ror w10,w10,#20
528 add v0.4s,v0.4s,v1.4s
529 ror w11,w11,#20
530 add v4.4s,v4.4s,v5.4s
531 ror w12,w12,#20
532 add v16.4s,v16.4s,v17.4s
533 ror w9,w9,#20
534 eor v20.16b,v3.16b,v0.16b
535 add w5,w5,w10
536 eor v21.16b,v7.16b,v4.16b
537 add w6,w6,w11
538 eor v22.16b,v19.16b,v16.16b
539 add w7,w7,w12
540 ushr v3.4s,v20.4s,#24
541 add w8,w8,w9
542 ushr v7.4s,v21.4s,#24
543 eor w21,w21,w5
544 ushr v19.4s,v22.4s,#24
545 eor w17,w17,w6
546 sli v3.4s,v20.4s,#8
547 eor w19,w19,w7
548 sli v7.4s,v21.4s,#8
549 eor w20,w20,w8
550 sli v19.4s,v22.4s,#8
551 ror w21,w21,#24
552 add v2.4s,v2.4s,v3.4s
553 ror w17,w17,#24
554 add v6.4s,v6.4s,v7.4s
555 ror w19,w19,#24
556 add v18.4s,v18.4s,v19.4s
557 ror w20,w20,#24
558 eor v20.16b,v1.16b,v2.16b
559 add w15,w15,w21
560 eor v21.16b,v5.16b,v6.16b
561 add w16,w16,w17
562 eor v22.16b,v17.16b,v18.16b
563 add w13,w13,w19
564 ushr v1.4s,v20.4s,#25
565 add w14,w14,w20
566 ushr v5.4s,v21.4s,#25
567 eor w10,w10,w15
568 ushr v17.4s,v22.4s,#25
569 eor w11,w11,w16
570 sli v1.4s,v20.4s,#7
571 eor w12,w12,w13
572 sli v5.4s,v21.4s,#7
573 eor w9,w9,w14
574 sli v17.4s,v22.4s,#7
575 ror w10,w10,#25
576 ext v2.16b,v2.16b,v2.16b,#8 // the ext group below undoes the lane rotation applied in the first half
577 ror w11,w11,#25
578 ext v6.16b,v6.16b,v6.16b,#8
579 ror w12,w12,#25
580 ext v18.16b,v18.16b,v18.16b,#8
581 ror w9,w9,#25
582 ext v3.16b,v3.16b,v3.16b,#4
583 ext v7.16b,v7.16b,v7.16b,#4
584 ext v19.16b,v19.16b,v19.16b,#4
585 ext v1.16b,v1.16b,v1.16b,#12
586 ext v5.16b,v5.16b,v5.16b,#12
587 ext v17.16b,v17.16b,v17.16b,#12
588 cbnz x4,.Loop_neon
589
590 add w5,w5,w22 // accumulate key block
591 add v0.4s,v0.4s,v24.4s
592 add x6,x6,x22,lsr#32
593 add v4.4s,v4.4s,v24.4s
594 add w7,w7,w23
595 add v16.4s,v16.4s,v24.4s
596 add x8,x8,x23,lsr#32
597 add v2.4s,v2.4s,v26.4s
598 add w9,w9,w24
599 add v6.4s,v6.4s,v26.4s
600 add x10,x10,x24,lsr#32
601 add v18.4s,v18.4s,v26.4s
602 add w11,w11,w25
603 add v3.4s,v3.4s,v27.4s
604 add x12,x12,x25,lsr#32
605 add w13,w13,w26
606 add v7.4s,v7.4s,v28.4s
607 add x14,x14,x26,lsr#32
608 add w15,w15,w27
609 add v19.4s,v19.4s,v29.4s
610 add x16,x16,x27,lsr#32
611 add w17,w17,w28
612 add v1.4s,v1.4s,v25.4s
613 add x19,x19,x28,lsr#32
614 add w20,w20,w30
615 add v5.4s,v5.4s,v25.4s
616 add x21,x21,x30,lsr#32
617 add v17.4s,v17.4s,v25.4s
618
619 b.lo .Ltail_neon // flags from the subs above: fewer than 256 bytes were left
620
621 add x5,x5,x6,lsl#32 // pack
622 add x7,x7,x8,lsl#32
623 ldp x6,x8,[x1,#0] // load input
624 add x9,x9,x10,lsl#32
625 add x11,x11,x12,lsl#32
626 ldp x10,x12,[x1,#16]
627 add x13,x13,x14,lsl#32
628 add x15,x15,x16,lsl#32
629 ldp x14,x16,[x1,#32]
630 add x17,x17,x19,lsl#32
631 add x20,x20,x21,lsl#32
632 ldp x19,x21,[x1,#48]
633 add x1,x1,#64
634 #ifdef __ARMEB__
635 rev x5,x5
636 rev x7,x7
637 rev x9,x9
638 rev x11,x11
639 rev x13,x13
640 rev x15,x15
641 rev x17,x17
642 rev x20,x20
643 #endif
644 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // next 64 input bytes, XORed with v0-v3 below
645 eor x5,x5,x6
646 eor x7,x7,x8
647 eor x9,x9,x10
648 eor x11,x11,x12
649 eor x13,x13,x14
650 eor v0.16b,v0.16b,v20.16b
651 eor x15,x15,x16
652 eor v1.16b,v1.16b,v21.16b
653 eor x17,x17,x19
654 eor v2.16b,v2.16b,v22.16b
655 eor x20,x20,x21
656 eor v3.16b,v3.16b,v23.16b
657 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // following 64 input bytes, XORed with v4-v7 below
658
659 stp x5,x7,[x0,#0] // store output
660 add x28,x28,#4 // increment counter
661 stp x9,x11,[x0,#16]
662 add v27.4s,v27.4s,v31.4s // += 4
663 stp x13,x15,[x0,#32]
664 add v28.4s,v28.4s,v31.4s
665 stp x17,x20,[x0,#48]
666 add v29.4s,v29.4s,v31.4s
667 add x0,x0,#64
668
669 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
670 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // v0-v3 are free after the store; reuse them for the last 64 input bytes
671
672 eor v4.16b,v4.16b,v20.16b
673 eor v5.16b,v5.16b,v21.16b
674 eor v6.16b,v6.16b,v22.16b
675 eor v7.16b,v7.16b,v23.16b
676 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
677
678 eor v16.16b,v16.16b,v0.16b
679 eor v17.16b,v17.16b,v1.16b
680 eor v18.16b,v18.16b,v2.16b
681 eor v19.16b,v19.16b,v3.16b
682 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
683
684 b.hi .Loop_outer_neon // flags still from the subs: input remains after these 256 bytes
685
686 ldp x19,x20,[x29,#16]
687 add sp,sp,#64 // release the scratch area
688 ldp x21,x22,[x29,#32]
689 ldp x23,x24,[x29,#48]
690 ldp x25,x26,[x29,#64]
691 ldp x27,x28,[x29,#80]
692 ldp x29,x30,[sp],#96
693 ret
694
695 .Ltail_neon:
696 add x2,x2,#256 // undo the subs: x2 = bytes remaining (1..255)
697 cmp x2,#64
698 b.lo .Less_than_64 // finish in ChaCha20_ctr32's tail, using the integer-register block
699
700 add x5,x5,x6,lsl#32 // pack
701 add x7,x7,x8,lsl#32
702 ldp x6,x8,[x1,#0] // load input
703 add x9,x9,x10,lsl#32
704 add x11,x11,x12,lsl#32
705 ldp x10,x12,[x1,#16]
706 add x13,x13,x14,lsl#32
707 add x15,x15,x16,lsl#32
708 ldp x14,x16,[x1,#32]
709 add x17,x17,x19,lsl#32
710 add x20,x20,x21,lsl#32
711 ldp x19,x21,[x1,#48]
712 add x1,x1,#64
713 #ifdef __ARMEB__
714 rev x5,x5
715 rev x7,x7
716 rev x9,x9
717 rev x11,x11
718 rev x13,x13
719 rev x15,x15
720 rev x17,x17
721 rev x20,x20
722 #endif
723 eor x5,x5,x6
724 eor x7,x7,x8
725 eor x9,x9,x10
726 eor x11,x11,x12
727 eor x13,x13,x14
728 eor x15,x15,x16
729 eor x17,x17,x19
730 eor x20,x20,x21
731
732 stp x5,x7,[x0,#0] // store output
733 add x28,x28,#4 // increment counter
734 stp x9,x11,[x0,#16]
735 stp x13,x15,[x0,#32]
736 stp x17,x20,[x0,#48]
737 add x0,x0,#64
738 b.eq .Ldone_neon // flags still from cmp x2,#64 above: exactly 64 bytes remained
739 sub x2,x2,#64
740 cmp x2,#64
741 b.lo .Less_than_128 // 1..63 bytes left: they use the first NEON block (v0-v3)
742
743 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
744 eor v0.16b,v0.16b,v20.16b
745 eor v1.16b,v1.16b,v21.16b
746 eor v2.16b,v2.16b,v22.16b
747 eor v3.16b,v3.16b,v23.16b
748 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
749 b.eq .Ldone_neon // flags from the cmp just above: nothing left after this block
750 sub x2,x2,#64
751 cmp x2,#64
752 b.lo .Less_than_192 // 1..63 bytes left: they use the second NEON block (v4-v7)
753
754 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
755 eor v4.16b,v4.16b,v20.16b
756 eor v5.16b,v5.16b,v21.16b
757 eor v6.16b,v6.16b,v22.16b
758 eor v7.16b,v7.16b,v23.16b
759 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
760 b.eq .Ldone_neon // flags from the cmp just above: nothing left after this block
761 sub x2,x2,#64
762
763 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] // spill the third NEON block for the byte loop
764 b .Last_neon
765
766 .Less_than_128:
767 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
768 b .Last_neon
769 .Less_than_192:
770 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
771 b .Last_neon
772
773 .align 4
774 .Last_neon:
775 sub x0,x0,#1 // output is biased by -1 because x2 is incremented before the strb in the loop
776 add x1,x1,x2 // x1, x0 and x4 now point past the end of their buffers
777 add x0,x0,x2
778 add x4,sp,x2
779 neg x2,x2 // negative index that counts up to zero
780
781 .Loop_tail_neon:
782 ldrb w10,[x1,x2] // input byte
783 ldrb w11,[x4,x2] // keystream byte from the stack copy
784 add x2,x2,#1
785 eor w10,w10,w11
786 strb w10,[x0,x2]
787 cbnz x2,.Loop_tail_neon
788
789 stp xzr,xzr,[sp,#0] // zero the keystream copy on the stack
790 stp xzr,xzr,[sp,#16]
791 stp xzr,xzr,[sp,#32]
792 stp xzr,xzr,[sp,#48]
793
794 .Ldone_neon:
795 ldp x19,x20,[x29,#16]
796 add sp,sp,#64
797 ldp x21,x22,[x29,#32]
798 ldp x23,x24,[x29,#48]
799 ldp x25,x26,[x29,#64]
800 ldp x27,x28,[x29,#80]
801 ldp x29,x30,[sp],#96
802 ret
803 .size ChaCha20_neon,.-ChaCha20_neon
804 .type ChaCha20_512_neon,%function
805 .align 5
806 ChaCha20_512_neon:
807 stp x29,x30,[sp,#-96]!
808 add x29,sp,#0
809
810 adr x5,.Lsigma
811 stp x19,x20,[sp,#16]
812 stp x21,x22,[sp,#32]
813 stp x23,x24,[sp,#48]
814 stp x25,x26,[sp,#64]
815 stp x27,x28,[sp,#80]
816
817 .L512_or_more_neon:
818 sub sp,sp,#128+64
819
820 ldp x22,x23,[x5] // load sigma
821 ld1 {v24.4s},[x5],#16
822 ldp x24,x25,[x3] // load key
823 ldp x26,x27,[x3,#16]
824 ld1 {v25.4s,v26.4s},[x3]
825 ldp x28,x30,[x4] // load counter
826 ld1 {v27.4s},[x4]
827 ld1 {v31.4s},[x5]
828 #ifdef __ARMEB__
829 rev64 v24.4s,v24.4s
830 ror x24,x24,#32
831 ror x25,x25,#32
832 ror x26,x26,#32
833 ror x27,x27,#32
834 ror x28,x28,#32
835 ror x30,x30,#32
836 #endif
837 add v27.4s,v27.4s,v31.4s // += 1
838 stp q24,q25,[sp,#0] // off-load key block, invariant part
839 add v27.4s,v27.4s,v31.4s // not typo
840 str q26,[sp,#32]
841 add v28.4s,v27.4s,v31.4s
842 add v29.4s,v28.4s,v31.4s
843 add v30.4s,v29.4s,v31.4s
844 shl v31.4s,v31.4s,#2 // 1 -> 4
845
846 stp d8,d9,[sp,#128+0] // meet ABI requirements
847 stp d10,d11,[sp,#128+16]
848 stp d12,d13,[sp,#128+32]
849 stp d14,d15,[sp,#128+48]
850
851 sub x2,x2,#512 // not typo
852
853 .Loop_outer_512_neon:
854 mov v0.16b,v24.16b
855 mov v4.16b,v24.16b
856 mov v8.16b,v24.16b
857 mov v12.16b,v24.16b
858 mov v16.16b,v24.16b
859 mov v20.16b,v24.16b
860 mov v1.16b,v25.16b
861 mov w5,w22 // unpack key block
862 mov v5.16b,v25.16b
863 lsr x6,x22,#32
864 mov v9.16b,v25.16b
865 mov w7,w23
866 mov v13.16b,v25.16b
867 lsr x8,x23,#32
868 mov v17.16b,v25.16b
869 mov w9,w24
870 mov v21.16b,v25.16b
871 lsr x10,x24,#32
872 mov v3.16b,v27.16b
873 mov w11,w25
874 mov v7.16b,v28.16b
875 lsr x12,x25,#32
876 mov v11.16b,v29.16b
877 mov w13,w26
878 mov v15.16b,v30.16b
879 lsr x14,x26,#32
880 mov v2.16b,v26.16b
881 mov w15,w27
882 mov v6.16b,v26.16b
883 lsr x16,x27,#32
884 add v19.4s,v3.4s,v31.4s // +4
885 mov w17,w28
886 add v23.4s,v7.4s,v31.4s // +4
887 lsr x19,x28,#32
888 mov v10.16b,v26.16b
889 mov w20,w30
890 mov v14.16b,v26.16b
891 lsr x21,x30,#32
892 mov v18.16b,v26.16b
893 stp q27,q28,[sp,#48] // off-load key block, variable part
894 mov v22.16b,v26.16b
895 str q29,[sp,#80]
896
897 mov x4,#5
898 subs x2,x2,#512
899 .Loop_upper_neon:
900 sub x4,x4,#1
901 add v0.4s,v0.4s,v1.4s
902 add w5,w5,w9
903 add v4.4s,v4.4s,v5.4s
904 add w6,w6,w10
905 add v8.4s,v8.4s,v9.4s
906 add w7,w7,w11
907 add v12.4s,v12.4s,v13.4s
908 add w8,w8,w12
909 add v16.4s,v16.4s,v17.4s
910 eor w17,w17,w5
911 add v20.4s,v20.4s,v21.4s
912 eor w19,w19,w6
913 eor v3.16b,v3.16b,v0.16b
914 eor w20,w20,w7
915 eor v7.16b,v7.16b,v4.16b
916 eor w21,w21,w8
917 eor v11.16b,v11.16b,v8.16b
918 ror w17,w17,#16
919 eor v15.16b,v15.16b,v12.16b
920 ror w19,w19,#16
921 eor v19.16b,v19.16b,v16.16b
922 ror w20,w20,#16
923 eor v23.16b,v23.16b,v20.16b
924 ror w21,w21,#16
925 rev32 v3.8h,v3.8h
926 add w13,w13,w17
927 rev32 v7.8h,v7.8h
928 add w14,w14,w19
929 rev32 v11.8h,v11.8h
930 add w15,w15,w20
931 rev32 v15.8h,v15.8h
932 add w16,w16,w21
933 rev32 v19.8h,v19.8h
934 eor w9,w9,w13
935 rev32 v23.8h,v23.8h
936 eor w10,w10,w14
937 add v2.4s,v2.4s,v3.4s
938 eor w11,w11,w15
939 add v6.4s,v6.4s,v7.4s
940 eor w12,w12,w16
941 add v10.4s,v10.4s,v11.4s
942 ror w9,w9,#20
943 add v14.4s,v14.4s,v15.4s
944 ror w10,w10,#20
945 add v18.4s,v18.4s,v19.4s
946 ror w11,w11,#20
947 add v22.4s,v22.4s,v23.4s
948 ror w12,w12,#20
949 eor v24.16b,v1.16b,v2.16b
950 add w5,w5,w9
951 eor v25.16b,v5.16b,v6.16b
952 add w6,w6,w10
953 eor v26.16b,v9.16b,v10.16b
954 add w7,w7,w11
955 eor v27.16b,v13.16b,v14.16b
956 add w8,w8,w12
957 eor v28.16b,v17.16b,v18.16b
958 eor w17,w17,w5
959 eor v29.16b,v21.16b,v22.16b
960 eor w19,w19,w6
961 ushr v1.4s,v24.4s,#20
962 eor w20,w20,w7
963 ushr v5.4s,v25.4s,#20
964 eor w21,w21,w8
965 ushr v9.4s,v26.4s,#20
966 ror w17,w17,#24
967 ushr v13.4s,v27.4s,#20
968 ror w19,w19,#24
969 ushr v17.4s,v28.4s,#20
970 ror w20,w20,#24
971 ushr v21.4s,v29.4s,#20
972 ror w21,w21,#24
973 sli v1.4s,v24.4s,#12
974 add w13,w13,w17
975 sli v5.4s,v25.4s,#12
976 add w14,w14,w19
977 sli v9.4s,v26.4s,#12
978 add w15,w15,w20
979 sli v13.4s,v27.4s,#12
980 add w16,w16,w21
981 sli v17.4s,v28.4s,#12
982 eor w9,w9,w13
983 sli v21.4s,v29.4s,#12
984 eor w10,w10,w14
985 add v0.4s,v0.4s,v1.4s
986 eor w11,w11,w15
987 add v4.4s,v4.4s,v5.4s
988 eor w12,w12,w16
989 add v8.4s,v8.4s,v9.4s
990 ror w9,w9,#25
991 add v12.4s,v12.4s,v13.4s
992 ror w10,w10,#25
993 add v16.4s,v16.4s,v17.4s
994 ror w11,w11,#25
995 add v20.4s,v20.4s,v21.4s
996 ror w12,w12,#25
997 eor v24.16b,v3.16b,v0.16b
998 add w5,w5,w10
999 eor v25.16b,v7.16b,v4.16b
1000 add w6,w6,w11
1001 eor v26.16b,v11.16b,v8.16b
1002 add w7,w7,w12
1003 eor v27.16b,v15.16b,v12.16b
1004 add w8,w8,w9
1005 eor v28.16b,v19.16b,v16.16b
1006 eor w21,w21,w5
1007 eor v29.16b,v23.16b,v20.16b
1008 eor w17,w17,w6
1009 ushr v3.4s,v24.4s,#24
1010 eor w19,w19,w7
1011 ushr v7.4s,v25.4s,#24
1012 eor w20,w20,w8
1013 ushr v11.4s,v26.4s,#24
1014 ror w21,w21,#16
1015 ushr v15.4s,v27.4s,#24
1016 ror w17,w17,#16
1017 ushr v19.4s,v28.4s,#24
1018 ror w19,w19,#16
1019 ushr v23.4s,v29.4s,#24
1020 ror w20,w20,#16
1021 sli v3.4s,v24.4s,#8
1022 add w15,w15,w21
1023 sli v7.4s,v25.4s,#8
1024 add w16,w16,w17
1025 sli v11.4s,v26.4s,#8
1026 add w13,w13,w19
1027 sli v15.4s,v27.4s,#8
1028 add w14,w14,w20
1029 sli v19.4s,v28.4s,#8
1030 eor w10,w10,w15
1031 sli v23.4s,v29.4s,#8
1032 eor w11,w11,w16
1033 add v2.4s,v2.4s,v3.4s
1034 eor w12,w12,w13
1035 add v6.4s,v6.4s,v7.4s
1036 eor w9,w9,w14
1037 add v10.4s,v10.4s,v11.4s
1038 ror w10,w10,#20
1039 add v14.4s,v14.4s,v15.4s
1040 ror w11,w11,#20
1041 add v18.4s,v18.4s,v19.4s
1042 ror w12,w12,#20
1043 add v22.4s,v22.4s,v23.4s
1044 ror w9,w9,#20
1045 eor v24.16b,v1.16b,v2.16b
1046 add w5,w5,w10
1047 eor v25.16b,v5.16b,v6.16b
1048 add w6,w6,w11
1049 eor v26.16b,v9.16b,v10.16b
1050 add w7,w7,w12
1051 eor v27.16b,v13.16b,v14.16b
1052 add w8,w8,w9
1053 eor v28.16b,v17.16b,v18.16b
1054 eor w21,w21,w5
1055 eor v29.16b,v21.16b,v22.16b
1056 eor w17,w17,w6
1057 ushr v1.4s,v24.4s,#25
1058 eor w19,w19,w7
1059 ushr v5.4s,v25.4s,#25
1060 eor w20,w20,w8
1061 ushr v9.4s,v26.4s,#25
1062 ror w21,w21,#24
1063 ushr v13.4s,v27.4s,#25
1064 ror w17,w17,#24
1065 ushr v17.4s,v28.4s,#25
1066 ror w19,w19,#24
1067 ushr v21.4s,v29.4s,#25
1068 ror w20,w20,#24
1069 sli v1.4s,v24.4s,#7
1070 add w15,w15,w21
1071 sli v5.4s,v25.4s,#7
1072 add w16,w16,w17
1073 sli v9.4s,v26.4s,#7
1074 add w13,w13,w19
1075 sli v13.4s,v27.4s,#7
1076 add w14,w14,w20
1077 sli v17.4s,v28.4s,#7
1078 eor w10,w10,w15
1079 sli v21.4s,v29.4s,#7
1080 eor w11,w11,w16
1081 ext v2.16b,v2.16b,v2.16b,#8
1082 eor w12,w12,w13
1083 ext v6.16b,v6.16b,v6.16b,#8
1084 eor w9,w9,w14
1085 ext v10.16b,v10.16b,v10.16b,#8
1086 ror w10,w10,#25
1087 ext v14.16b,v14.16b,v14.16b,#8
1088 ror w11,w11,#25
1089 ext v18.16b,v18.16b,v18.16b,#8
1090 ror w12,w12,#25
1091 ext v22.16b,v22.16b,v22.16b,#8
1092 ror w9,w9,#25
1093 ext v3.16b,v3.16b,v3.16b,#12
1094 ext v7.16b,v7.16b,v7.16b,#12
1095 ext v11.16b,v11.16b,v11.16b,#12
1096 ext v15.16b,v15.16b,v15.16b,#12
1097 ext v19.16b,v19.16b,v19.16b,#12
1098 ext v23.16b,v23.16b,v23.16b,#12
1099 ext v1.16b,v1.16b,v1.16b,#4
1100 ext v5.16b,v5.16b,v5.16b,#4
1101 ext v9.16b,v9.16b,v9.16b,#4
1102 ext v13.16b,v13.16b,v13.16b,#4
1103 ext v17.16b,v17.16b,v17.16b,#4
1104 ext v21.16b,v21.16b,v21.16b,#4
1105 add v0.4s,v0.4s,v1.4s
1106 add w5,w5,w9
1107 add v4.4s,v4.4s,v5.4s
1108 add w6,w6,w10
1109 add v8.4s,v8.4s,v9.4s
1110 add w7,w7,w11
1111 add v12.4s,v12.4s,v13.4s
1112 add w8,w8,w12
1113 add v16.4s,v16.4s,v17.4s
1114 eor w17,w17,w5
1115 add v20.4s,v20.4s,v21.4s
1116 eor w19,w19,w6
1117 eor v3.16b,v3.16b,v0.16b
1118 eor w20,w20,w7
1119 eor v7.16b,v7.16b,v4.16b
1120 eor w21,w21,w8
1121 eor v11.16b,v11.16b,v8.16b
1122 ror w17,w17,#16
1123 eor v15.16b,v15.16b,v12.16b
1124 ror w19,w19,#16
1125 eor v19.16b,v19.16b,v16.16b
1126 ror w20,w20,#16
1127 eor v23.16b,v23.16b,v20.16b
1128 ror w21,w21,#16
1129 rev32 v3.8h,v3.8h
1130 add w13,w13,w17
1131 rev32 v7.8h,v7.8h
1132 add w14,w14,w19
1133 rev32 v11.8h,v11.8h
1134 add w15,w15,w20
1135 rev32 v15.8h,v15.8h
1136 add w16,w16,w21
1137 rev32 v19.8h,v19.8h
1138 eor w9,w9,w13
1139 rev32 v23.8h,v23.8h
1140 eor w10,w10,w14
1141 add v2.4s,v2.4s,v3.4s
1142 eor w11,w11,w15
1143 add v6.4s,v6.4s,v7.4s
1144 eor w12,w12,w16
1145 add v10.4s,v10.4s,v11.4s
1146 ror w9,w9,#20
1147 add v14.4s,v14.4s,v15.4s
1148 ror w10,w10,#20
1149 add v18.4s,v18.4s,v19.4s
1150 ror w11,w11,#20
1151 add v22.4s,v22.4s,v23.4s
1152 ror w12,w12,#20
1153 eor v24.16b,v1.16b,v2.16b
1154 add w5,w5,w9
1155 eor v25.16b,v5.16b,v6.16b
1156 add w6,w6,w10
1157 eor v26.16b,v9.16b,v10.16b
1158 add w7,w7,w11
1159 eor v27.16b,v13.16b,v14.16b
1160 add w8,w8,w12
1161 eor v28.16b,v17.16b,v18.16b
1162 eor w17,w17,w5
1163 eor v29.16b,v21.16b,v22.16b
1164 eor w19,w19,w6
1165 ushr v1.4s,v24.4s,#20
1166 eor w20,w20,w7
1167 ushr v5.4s,v25.4s,#20
1168 eor w21,w21,w8
1169 ushr v9.4s,v26.4s,#20
1170 ror w17,w17,#24
1171 ushr v13.4s,v27.4s,#20
1172 ror w19,w19,#24
1173 ushr v17.4s,v28.4s,#20
1174 ror w20,w20,#24
1175 ushr v21.4s,v29.4s,#20
1176 ror w21,w21,#24
1177 sli v1.4s,v24.4s,#12
1178 add w13,w13,w17
1179 sli v5.4s,v25.4s,#12
1180 add w14,w14,w19
1181 sli v9.4s,v26.4s,#12
1182 add w15,w15,w20
1183 sli v13.4s,v27.4s,#12
1184 add w16,w16,w21
1185 sli v17.4s,v28.4s,#12
1186 eor w9,w9,w13
1187 sli v21.4s,v29.4s,#12
1188 eor w10,w10,w14
1189 add v0.4s,v0.4s,v1.4s
1190 eor w11,w11,w15
1191 add v4.4s,v4.4s,v5.4s
1192 eor w12,w12,w16
1193 add v8.4s,v8.4s,v9.4s
1194 ror w9,w9,#25
1195 add v12.4s,v12.4s,v13.4s
1196 ror w10,w10,#25
1197 add v16.4s,v16.4s,v17.4s
1198 ror w11,w11,#25
1199 add v20.4s,v20.4s,v21.4s
1200 ror w12,w12,#25
1201 eor v24.16b,v3.16b,v0.16b
1202 add w5,w5,w10
1203 eor v25.16b,v7.16b,v4.16b
1204 add w6,w6,w11
1205 eor v26.16b,v11.16b,v8.16b
1206 add w7,w7,w12
1207 eor v27.16b,v15.16b,v12.16b
1208 add w8,w8,w9
1209 eor v28.16b,v19.16b,v16.16b
1210 eor w21,w21,w5
1211 eor v29.16b,v23.16b,v20.16b
1212 eor w17,w17,w6
1213 ushr v3.4s,v24.4s,#24
1214 eor w19,w19,w7
1215 ushr v7.4s,v25.4s,#24
1216 eor w20,w20,w8
1217 ushr v11.4s,v26.4s,#24
1218 ror w21,w21,#16
1219 ushr v15.4s,v27.4s,#24
1220 ror w17,w17,#16
1221 ushr v19.4s,v28.4s,#24
1222 ror w19,w19,#16
1223 ushr v23.4s,v29.4s,#24
1224 ror w20,w20,#16
1225 sli v3.4s,v24.4s,#8
1226 add w15,w15,w21
1227 sli v7.4s,v25.4s,#8
1228 add w16,w16,w17
1229 sli v11.4s,v26.4s,#8
1230 add w13,w13,w19
1231 sli v15.4s,v27.4s,#8
1232 add w14,w14,w20
1233 sli v19.4s,v28.4s,#8
1234 eor w10,w10,w15
1235 sli v23.4s,v29.4s,#8
1236 eor w11,w11,w16
1237 add v2.4s,v2.4s,v3.4s
1238 eor w12,w12,w13
1239 add v6.4s,v6.4s,v7.4s
1240 eor w9,w9,w14
1241 add v10.4s,v10.4s,v11.4s
1242 ror w10,w10,#20
1243 add v14.4s,v14.4s,v15.4s
1244 ror w11,w11,#20
1245 add v18.4s,v18.4s,v19.4s
1246 ror w12,w12,#20
1247 add v22.4s,v22.4s,v23.4s
1248 ror w9,w9,#20
1249 eor v24.16b,v1.16b,v2.16b
1250 add w5,w5,w10
1251 eor v25.16b,v5.16b,v6.16b
1252 add w6,w6,w11
1253 eor v26.16b,v9.16b,v10.16b
1254 add w7,w7,w12
1255 eor v27.16b,v13.16b,v14.16b
1256 add w8,w8,w9
1257 eor v28.16b,v17.16b,v18.16b
1258 eor w21,w21,w5
1259 eor v29.16b,v21.16b,v22.16b
1260 eor w17,w17,w6
1261 ushr v1.4s,v24.4s,#25
1262 eor w19,w19,w7
1263 ushr v5.4s,v25.4s,#25
1264 eor w20,w20,w8
1265 ushr v9.4s,v26.4s,#25
1266 ror w21,w21,#24
1267 ushr v13.4s,v27.4s,#25
1268 ror w17,w17,#24
1269 ushr v17.4s,v28.4s,#25
1270 ror w19,w19,#24
1271 ushr v21.4s,v29.4s,#25
1272 ror w20,w20,#24
1273 sli v1.4s,v24.4s,#7
1274 add w15,w15,w21
1275 sli v5.4s,v25.4s,#7
1276 add w16,w16,w17
1277 sli v9.4s,v26.4s,#7
1278 add w13,w13,w19
1279 sli v13.4s,v27.4s,#7
1280 add w14,w14,w20
1281 sli v17.4s,v28.4s,#7
1282 eor w10,w10,w15
1283 sli v21.4s,v29.4s,#7
1284 eor w11,w11,w16
1285 ext v2.16b,v2.16b,v2.16b,#8
1286 eor w12,w12,w13
1287 ext v6.16b,v6.16b,v6.16b,#8
1288 eor w9,w9,w14
1289 ext v10.16b,v10.16b,v10.16b,#8
1290 ror w10,w10,#25
1291 ext v14.16b,v14.16b,v14.16b,#8
1292 ror w11,w11,#25
1293 ext v18.16b,v18.16b,v18.16b,#8
1294 ror w12,w12,#25
1295 ext v22.16b,v22.16b,v22.16b,#8
1296 ror w9,w9,#25
1297 ext v3.16b,v3.16b,v3.16b,#4
1298 ext v7.16b,v7.16b,v7.16b,#4
1299 ext v11.16b,v11.16b,v11.16b,#4
1300 ext v15.16b,v15.16b,v15.16b,#4
1301 ext v19.16b,v19.16b,v19.16b,#4
1302 ext v23.16b,v23.16b,v23.16b,#4
1303 ext v1.16b,v1.16b,v1.16b,#12
1304 ext v5.16b,v5.16b,v5.16b,#12
1305 ext v9.16b,v9.16b,v9.16b,#12
1306 ext v13.16b,v13.16b,v13.16b,#12
1307 ext v17.16b,v17.16b,v17.16b,#12
1308 ext v21.16b,v21.16b,v21.16b,#12
1309 cbnz x4,.Loop_upper_neon
1310
1311 add w5,w5,w22 // accumulate key block
1312 add x6,x6,x22,lsr#32
1313 add w7,w7,w23
1314 add x8,x8,x23,lsr#32
1315 add w9,w9,w24
1316 add x10,x10,x24,lsr#32
1317 add w11,w11,w25
1318 add x12,x12,x25,lsr#32
1319 add w13,w13,w26
1320 add x14,x14,x26,lsr#32
1321 add w15,w15,w27
1322 add x16,x16,x27,lsr#32
1323 add w17,w17,w28
1324 add x19,x19,x28,lsr#32
1325 add w20,w20,w30
1326 add x21,x21,x30,lsr#32
1327
1328 add x5,x5,x6,lsl#32 // pack
1329 add x7,x7,x8,lsl#32
1330 ldp x6,x8,[x1,#0] // load input
1331 add x9,x9,x10,lsl#32
1332 add x11,x11,x12,lsl#32
1333 ldp x10,x12,[x1,#16]
1334 add x13,x13,x14,lsl#32
1335 add x15,x15,x16,lsl#32
1336 ldp x14,x16,[x1,#32]
1337 add x17,x17,x19,lsl#32
1338 add x20,x20,x21,lsl#32
1339 ldp x19,x21,[x1,#48]
1340 add x1,x1,#64
1341 #ifdef __ARMEB__
1342 rev x5,x5
1343 rev x7,x7
1344 rev x9,x9
1345 rev x11,x11
1346 rev x13,x13
1347 rev x15,x15
1348 rev x17,x17
1349 rev x20,x20
1350 #endif
1351 eor x5,x5,x6
1352 eor x7,x7,x8
1353 eor x9,x9,x10
1354 eor x11,x11,x12
1355 eor x13,x13,x14
1356 eor x15,x15,x16
1357 eor x17,x17,x19
1358 eor x20,x20,x21
1359
1360 stp x5,x7,[x0,#0] // store output
1361 add x28,x28,#1 // increment counter
1362 mov w5,w22 // unpack key block
1363 lsr x6,x22,#32
1364 stp x9,x11,[x0,#16]
1365 mov w7,w23
1366 lsr x8,x23,#32
1367 stp x13,x15,[x0,#32]
1368 mov w9,w24
1369 lsr x10,x24,#32
1370 stp x17,x20,[x0,#48]
1371 add x0,x0,#64
1372 mov w11,w25
1373 lsr x12,x25,#32
1374 mov w13,w26
1375 lsr x14,x26,#32
1376 mov w15,w27
1377 lsr x16,x27,#32
1378 mov w17,w28
1379 lsr x19,x28,#32
1380 mov w20,w30
1381 lsr x21,x30,#32
1382
1383 mov x4,#5
1384 .Loop_lower_neon:
1385 sub x4,x4,#1
1386 add v0.4s,v0.4s,v1.4s
1387 add w5,w5,w9
1388 add v4.4s,v4.4s,v5.4s
1389 add w6,w6,w10
1390 add v8.4s,v8.4s,v9.4s
1391 add w7,w7,w11
1392 add v12.4s,v12.4s,v13.4s
1393 add w8,w8,w12
1394 add v16.4s,v16.4s,v17.4s
1395 eor w17,w17,w5
1396 add v20.4s,v20.4s,v21.4s
1397 eor w19,w19,w6
1398 eor v3.16b,v3.16b,v0.16b
1399 eor w20,w20,w7
1400 eor v7.16b,v7.16b,v4.16b
1401 eor w21,w21,w8
1402 eor v11.16b,v11.16b,v8.16b
1403 ror w17,w17,#16
1404 eor v15.16b,v15.16b,v12.16b
1405 ror w19,w19,#16
1406 eor v19.16b,v19.16b,v16.16b
1407 ror w20,w20,#16
1408 eor v23.16b,v23.16b,v20.16b
1409 ror w21,w21,#16
1410 rev32 v3.8h,v3.8h
1411 add w13,w13,w17
1412 rev32 v7.8h,v7.8h
1413 add w14,w14,w19
1414 rev32 v11.8h,v11.8h
1415 add w15,w15,w20
1416 rev32 v15.8h,v15.8h
1417 add w16,w16,w21
1418 rev32 v19.8h,v19.8h
1419 eor w9,w9,w13
1420 rev32 v23.8h,v23.8h
1421 eor w10,w10,w14
1422 add v2.4s,v2.4s,v3.4s
1423 eor w11,w11,w15
1424 add v6.4s,v6.4s,v7.4s
1425 eor w12,w12,w16
1426 add v10.4s,v10.4s,v11.4s
1427 ror w9,w9,#20
1428 add v14.4s,v14.4s,v15.4s
1429 ror w10,w10,#20
1430 add v18.4s,v18.4s,v19.4s
1431 ror w11,w11,#20
1432 add v22.4s,v22.4s,v23.4s
1433 ror w12,w12,#20
1434 eor v24.16b,v1.16b,v2.16b
1435 add w5,w5,w9
1436 eor v25.16b,v5.16b,v6.16b
1437 add w6,w6,w10
1438 eor v26.16b,v9.16b,v10.16b
1439 add w7,w7,w11
1440 eor v27.16b,v13.16b,v14.16b
1441 add w8,w8,w12
1442 eor v28.16b,v17.16b,v18.16b
1443 eor w17,w17,w5
1444 eor v29.16b,v21.16b,v22.16b
1445 eor w19,w19,w6
1446 ushr v1.4s,v24.4s,#20
1447 eor w20,w20,w7
1448 ushr v5.4s,v25.4s,#20
1449 eor w21,w21,w8
1450 ushr v9.4s,v26.4s,#20
1451 ror w17,w17,#24
1452 ushr v13.4s,v27.4s,#20
1453 ror w19,w19,#24
1454 ushr v17.4s,v28.4s,#20
1455 ror w20,w20,#24
1456 ushr v21.4s,v29.4s,#20
1457 ror w21,w21,#24
1458 sli v1.4s,v24.4s,#12
1459 add w13,w13,w17
1460 sli v5.4s,v25.4s,#12
1461 add w14,w14,w19
1462 sli v9.4s,v26.4s,#12
1463 add w15,w15,w20
1464 sli v13.4s,v27.4s,#12
1465 add w16,w16,w21
1466 sli v17.4s,v28.4s,#12
1467 eor w9,w9,w13
1468 sli v21.4s,v29.4s,#12
1469 eor w10,w10,w14
1470 add v0.4s,v0.4s,v1.4s
1471 eor w11,w11,w15
1472 add v4.4s,v4.4s,v5.4s
1473 eor w12,w12,w16
1474 add v8.4s,v8.4s,v9.4s
1475 ror w9,w9,#25
1476 add v12.4s,v12.4s,v13.4s
1477 ror w10,w10,#25
1478 add v16.4s,v16.4s,v17.4s
1479 ror w11,w11,#25
1480 add v20.4s,v20.4s,v21.4s
1481 ror w12,w12,#25
1482 eor v24.16b,v3.16b,v0.16b
1483 add w5,w5,w10
1484 eor v25.16b,v7.16b,v4.16b
1485 add w6,w6,w11
1486 eor v26.16b,v11.16b,v8.16b
1487 add w7,w7,w12
1488 eor v27.16b,v15.16b,v12.16b
1489 add w8,w8,w9
1490 eor v28.16b,v19.16b,v16.16b
1491 eor w21,w21,w5
1492 eor v29.16b,v23.16b,v20.16b
1493 eor w17,w17,w6
1494 ushr v3.4s,v24.4s,#24
1495 eor w19,w19,w7
1496 ushr v7.4s,v25.4s,#24
1497 eor w20,w20,w8
1498 ushr v11.4s,v26.4s,#24
1499 ror w21,w21,#16
1500 ushr v15.4s,v27.4s,#24
1501 ror w17,w17,#16
1502 ushr v19.4s,v28.4s,#24
1503 ror w19,w19,#16
1504 ushr v23.4s,v29.4s,#24
1505 ror w20,w20,#16
1506 sli v3.4s,v24.4s,#8
1507 add w15,w15,w21
1508 sli v7.4s,v25.4s,#8
1509 add w16,w16,w17
1510 sli v11.4s,v26.4s,#8
1511 add w13,w13,w19
1512 sli v15.4s,v27.4s,#8
1513 add w14,w14,w20
1514 sli v19.4s,v28.4s,#8
1515 eor w10,w10,w15
1516 sli v23.4s,v29.4s,#8
1517 eor w11,w11,w16
1518 add v2.4s,v2.4s,v3.4s
1519 eor w12,w12,w13
1520 add v6.4s,v6.4s,v7.4s
1521 eor w9,w9,w14
1522 add v10.4s,v10.4s,v11.4s
1523 ror w10,w10,#20
1524 add v14.4s,v14.4s,v15.4s
1525 ror w11,w11,#20
1526 add v18.4s,v18.4s,v19.4s
1527 ror w12,w12,#20
1528 add v22.4s,v22.4s,v23.4s
1529 ror w9,w9,#20
1530 eor v24.16b,v1.16b,v2.16b
1531 add w5,w5,w10
1532 eor v25.16b,v5.16b,v6.16b
1533 add w6,w6,w11
1534 eor v26.16b,v9.16b,v10.16b
1535 add w7,w7,w12
1536 eor v27.16b,v13.16b,v14.16b
1537 add w8,w8,w9
1538 eor v28.16b,v17.16b,v18.16b
1539 eor w21,w21,w5
1540 eor v29.16b,v21.16b,v22.16b
1541 eor w17,w17,w6
1542 ushr v1.4s,v24.4s,#25
1543 eor w19,w19,w7
1544 ushr v5.4s,v25.4s,#25
1545 eor w20,w20,w8
1546 ushr v9.4s,v26.4s,#25
1547 ror w21,w21,#24
1548 ushr v13.4s,v27.4s,#25
1549 ror w17,w17,#24
1550 ushr v17.4s,v28.4s,#25
1551 ror w19,w19,#24
1552 ushr v21.4s,v29.4s,#25
1553 ror w20,w20,#24
1554 sli v1.4s,v24.4s,#7
1555 add w15,w15,w21
1556 sli v5.4s,v25.4s,#7
1557 add w16,w16,w17
1558 sli v9.4s,v26.4s,#7
1559 add w13,w13,w19
1560 sli v13.4s,v27.4s,#7
1561 add w14,w14,w20
1562 sli v17.4s,v28.4s,#7
1563 eor w10,w10,w15
1564 sli v21.4s,v29.4s,#7
1565 eor w11,w11,w16
1566 ext v2.16b,v2.16b,v2.16b,#8
1567 eor w12,w12,w13
1568 ext v6.16b,v6.16b,v6.16b,#8
1569 eor w9,w9,w14
1570 ext v10.16b,v10.16b,v10.16b,#8
1571 ror w10,w10,#25
1572 ext v14.16b,v14.16b,v14.16b,#8
1573 ror w11,w11,#25
1574 ext v18.16b,v18.16b,v18.16b,#8
1575 ror w12,w12,#25
1576 ext v22.16b,v22.16b,v22.16b,#8
1577 ror w9,w9,#25
1578 ext v3.16b,v3.16b,v3.16b,#12
1579 ext v7.16b,v7.16b,v7.16b,#12
1580 ext v11.16b,v11.16b,v11.16b,#12
1581 ext v15.16b,v15.16b,v15.16b,#12
1582 ext v19.16b,v19.16b,v19.16b,#12
1583 ext v23.16b,v23.16b,v23.16b,#12
1584 ext v1.16b,v1.16b,v1.16b,#4
1585 ext v5.16b,v5.16b,v5.16b,#4
1586 ext v9.16b,v9.16b,v9.16b,#4
1587 ext v13.16b,v13.16b,v13.16b,#4
1588 ext v17.16b,v17.16b,v17.16b,#4
1589 ext v21.16b,v21.16b,v21.16b,#4
1590 add v0.4s,v0.4s,v1.4s
1591 add w5,w5,w9
1592 add v4.4s,v4.4s,v5.4s
1593 add w6,w6,w10
1594 add v8.4s,v8.4s,v9.4s
1595 add w7,w7,w11
1596 add v12.4s,v12.4s,v13.4s
1597 add w8,w8,w12
1598 add v16.4s,v16.4s,v17.4s
1599 eor w17,w17,w5
1600 add v20.4s,v20.4s,v21.4s
1601 eor w19,w19,w6
1602 eor v3.16b,v3.16b,v0.16b
1603 eor w20,w20,w7
1604 eor v7.16b,v7.16b,v4.16b
1605 eor w21,w21,w8
1606 eor v11.16b,v11.16b,v8.16b
1607 ror w17,w17,#16
1608 eor v15.16b,v15.16b,v12.16b
1609 ror w19,w19,#16
1610 eor v19.16b,v19.16b,v16.16b
1611 ror w20,w20,#16
1612 eor v23.16b,v23.16b,v20.16b
1613 ror w21,w21,#16
1614 rev32 v3.8h,v3.8h
1615 add w13,w13,w17
1616 rev32 v7.8h,v7.8h
1617 add w14,w14,w19
1618 rev32 v11.8h,v11.8h
1619 add w15,w15,w20
1620 rev32 v15.8h,v15.8h
1621 add w16,w16,w21
1622 rev32 v19.8h,v19.8h
1623 eor w9,w9,w13
1624 rev32 v23.8h,v23.8h
1625 eor w10,w10,w14
1626 add v2.4s,v2.4s,v3.4s
1627 eor w11,w11,w15
1628 add v6.4s,v6.4s,v7.4s
1629 eor w12,w12,w16
1630 add v10.4s,v10.4s,v11.4s
1631 ror w9,w9,#20
1632 add v14.4s,v14.4s,v15.4s
1633 ror w10,w10,#20
1634 add v18.4s,v18.4s,v19.4s
1635 ror w11,w11,#20
1636 add v22.4s,v22.4s,v23.4s
1637 ror w12,w12,#20
1638 eor v24.16b,v1.16b,v2.16b
1639 add w5,w5,w9
1640 eor v25.16b,v5.16b,v6.16b
1641 add w6,w6,w10
1642 eor v26.16b,v9.16b,v10.16b
1643 add w7,w7,w11
1644 eor v27.16b,v13.16b,v14.16b
1645 add w8,w8,w12
1646 eor v28.16b,v17.16b,v18.16b
1647 eor w17,w17,w5
1648 eor v29.16b,v21.16b,v22.16b
1649 eor w19,w19,w6
1650 ushr v1.4s,v24.4s,#20
1651 eor w20,w20,w7
1652 ushr v5.4s,v25.4s,#20
1653 eor w21,w21,w8
1654 ushr v9.4s,v26.4s,#20
1655 ror w17,w17,#24
1656 ushr v13.4s,v27.4s,#20
1657 ror w19,w19,#24
1658 ushr v17.4s,v28.4s,#20
1659 ror w20,w20,#24
1660 ushr v21.4s,v29.4s,#20
1661 ror w21,w21,#24
1662 sli v1.4s,v24.4s,#12
1663 add w13,w13,w17
1664 sli v5.4s,v25.4s,#12
1665 add w14,w14,w19
1666 sli v9.4s,v26.4s,#12
1667 add w15,w15,w20
1668 sli v13.4s,v27.4s,#12
1669 add w16,w16,w21
1670 sli v17.4s,v28.4s,#12
1671 eor w9,w9,w13
1672 sli v21.4s,v29.4s,#12
1673 eor w10,w10,w14
1674 add v0.4s,v0.4s,v1.4s
1675 eor w11,w11,w15
1676 add v4.4s,v4.4s,v5.4s
1677 eor w12,w12,w16
1678 add v8.4s,v8.4s,v9.4s
1679 ror w9,w9,#25
1680 add v12.4s,v12.4s,v13.4s
1681 ror w10,w10,#25
1682 add v16.4s,v16.4s,v17.4s
1683 ror w11,w11,#25
1684 add v20.4s,v20.4s,v21.4s
1685 ror w12,w12,#25
1686 eor v24.16b,v3.16b,v0.16b
1687 add w5,w5,w10
1688 eor v25.16b,v7.16b,v4.16b
1689 add w6,w6,w11
1690 eor v26.16b,v11.16b,v8.16b
1691 add w7,w7,w12
1692 eor v27.16b,v15.16b,v12.16b
1693 add w8,w8,w9
1694 eor v28.16b,v19.16b,v16.16b
1695 eor w21,w21,w5
1696 eor v29.16b,v23.16b,v20.16b
1697 eor w17,w17,w6
1698 ushr v3.4s,v24.4s,#24
1699 eor w19,w19,w7
1700 ushr v7.4s,v25.4s,#24
1701 eor w20,w20,w8
1702 ushr v11.4s,v26.4s,#24
1703 ror w21,w21,#16
1704 ushr v15.4s,v27.4s,#24
1705 ror w17,w17,#16
1706 ushr v19.4s,v28.4s,#24
1707 ror w19,w19,#16
1708 ushr v23.4s,v29.4s,#24
1709 ror w20,w20,#16
1710 sli v3.4s,v24.4s,#8
1711 add w15,w15,w21
1712 sli v7.4s,v25.4s,#8
1713 add w16,w16,w17
1714 sli v11.4s,v26.4s,#8
1715 add w13,w13,w19
1716 sli v15.4s,v27.4s,#8
1717 add w14,w14,w20
1718 sli v19.4s,v28.4s,#8
1719 eor w10,w10,w15
1720 sli v23.4s,v29.4s,#8
1721 eor w11,w11,w16
1722 add v2.4s,v2.4s,v3.4s
1723 eor w12,w12,w13
1724 add v6.4s,v6.4s,v7.4s
1725 eor w9,w9,w14
1726 add v10.4s,v10.4s,v11.4s
1727 ror w10,w10,#20
1728 add v14.4s,v14.4s,v15.4s
1729 ror w11,w11,#20
1730 add v18.4s,v18.4s,v19.4s
1731 ror w12,w12,#20
1732 add v22.4s,v22.4s,v23.4s
1733 ror w9,w9,#20
1734 eor v24.16b,v1.16b,v2.16b
1735 add w5,w5,w10
1736 eor v25.16b,v5.16b,v6.16b
1737 add w6,w6,w11
1738 eor v26.16b,v9.16b,v10.16b
1739 add w7,w7,w12
1740 eor v27.16b,v13.16b,v14.16b
1741 add w8,w8,w9
1742 eor v28.16b,v17.16b,v18.16b
1743 eor w21,w21,w5
1744 eor v29.16b,v21.16b,v22.16b
1745 eor w17,w17,w6
1746 ushr v1.4s,v24.4s,#25
1747 eor w19,w19,w7
1748 ushr v5.4s,v25.4s,#25
1749 eor w20,w20,w8
1750 ushr v9.4s,v26.4s,#25
1751 ror w21,w21,#24
1752 ushr v13.4s,v27.4s,#25
1753 ror w17,w17,#24
1754 ushr v17.4s,v28.4s,#25
1755 ror w19,w19,#24
1756 ushr v21.4s,v29.4s,#25
1757 ror w20,w20,#24
1758 sli v1.4s,v24.4s,#7
1759 add w15,w15,w21
1760 sli v5.4s,v25.4s,#7
1761 add w16,w16,w17
1762 sli v9.4s,v26.4s,#7
1763 add w13,w13,w19
1764 sli v13.4s,v27.4s,#7
1765 add w14,w14,w20
1766 sli v17.4s,v28.4s,#7
1767 eor w10,w10,w15
1768 sli v21.4s,v29.4s,#7
1769 eor w11,w11,w16
1770 ext v2.16b,v2.16b,v2.16b,#8
1771 eor w12,w12,w13
1772 ext v6.16b,v6.16b,v6.16b,#8
1773 eor w9,w9,w14
1774 ext v10.16b,v10.16b,v10.16b,#8
1775 ror w10,w10,#25
1776 ext v14.16b,v14.16b,v14.16b,#8
1777 ror w11,w11,#25
1778 ext v18.16b,v18.16b,v18.16b,#8
1779 ror w12,w12,#25
1780 ext v22.16b,v22.16b,v22.16b,#8
1781 ror w9,w9,#25
1782 ext v3.16b,v3.16b,v3.16b,#4
1783 ext v7.16b,v7.16b,v7.16b,#4
1784 ext v11.16b,v11.16b,v11.16b,#4
1785 ext v15.16b,v15.16b,v15.16b,#4
1786 ext v19.16b,v19.16b,v19.16b,#4
1787 ext v23.16b,v23.16b,v23.16b,#4
1788 ext v1.16b,v1.16b,v1.16b,#12
1789 ext v5.16b,v5.16b,v5.16b,#12
1790 ext v9.16b,v9.16b,v9.16b,#12
1791 ext v13.16b,v13.16b,v13.16b,#12
1792 ext v17.16b,v17.16b,v17.16b,#12
1793 ext v21.16b,v21.16b,v21.16b,#12
1794 cbnz x4,.Loop_lower_neon
1795
1796 add w5,w5,w22 // accumulate key block
1797 ldp q24,q25,[sp,#0]
1798 add x6,x6,x22,lsr#32
1799 ldp q26,q27,[sp,#32]
1800 add w7,w7,w23
1801 ldp q28,q29,[sp,#64]
1802 add x8,x8,x23,lsr#32
1803 add v0.4s,v0.4s,v24.4s
1804 add w9,w9,w24
1805 add v4.4s,v4.4s,v24.4s
1806 add x10,x10,x24,lsr#32
1807 add v8.4s,v8.4s,v24.4s
1808 add w11,w11,w25
1809 add v12.4s,v12.4s,v24.4s
1810 add x12,x12,x25,lsr#32
1811 add v16.4s,v16.4s,v24.4s
1812 add w13,w13,w26
1813 add v20.4s,v20.4s,v24.4s
1814 add x14,x14,x26,lsr#32
1815 add v2.4s,v2.4s,v26.4s
1816 add w15,w15,w27
1817 add v6.4s,v6.4s,v26.4s
1818 add x16,x16,x27,lsr#32
1819 add v10.4s,v10.4s,v26.4s
1820 add w17,w17,w28
1821 add v14.4s,v14.4s,v26.4s
1822 add x19,x19,x28,lsr#32
1823 add v18.4s,v18.4s,v26.4s
1824 add w20,w20,w30
1825 add v22.4s,v22.4s,v26.4s
1826 add x21,x21,x30,lsr#32
1827 add v19.4s,v19.4s,v31.4s // +4
1828 add x5,x5,x6,lsl#32 // pack
1829 add v23.4s,v23.4s,v31.4s // +4
1830 add x7,x7,x8,lsl#32
1831 add v3.4s,v3.4s,v27.4s
1832 ldp x6,x8,[x1,#0] // load input
1833 add v7.4s,v7.4s,v28.4s
1834 add x9,x9,x10,lsl#32
1835 add v11.4s,v11.4s,v29.4s
1836 add x11,x11,x12,lsl#32
1837 add v15.4s,v15.4s,v30.4s
1838 ldp x10,x12,[x1,#16]
1839 add v19.4s,v19.4s,v27.4s
1840 add x13,x13,x14,lsl#32
1841 add v23.4s,v23.4s,v28.4s
1842 add x15,x15,x16,lsl#32
1843 add v1.4s,v1.4s,v25.4s
1844 ldp x14,x16,[x1,#32]
1845 add v5.4s,v5.4s,v25.4s
1846 add x17,x17,x19,lsl#32
1847 add v9.4s,v9.4s,v25.4s
1848 add x20,x20,x21,lsl#32
1849 add v13.4s,v13.4s,v25.4s
1850 ldp x19,x21,[x1,#48]
1851 add v17.4s,v17.4s,v25.4s
1852 add x1,x1,#64
1853 add v21.4s,v21.4s,v25.4s
1854
1855 #ifdef __ARMEB__
1856 rev x5,x5
1857 rev x7,x7
1858 rev x9,x9
1859 rev x11,x11
1860 rev x13,x13
1861 rev x15,x15
1862 rev x17,x17
1863 rev x20,x20
1864 #endif
1865 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1866 eor x5,x5,x6
1867 eor x7,x7,x8
1868 eor x9,x9,x10
1869 eor x11,x11,x12
1870 eor x13,x13,x14
1871 eor v0.16b,v0.16b,v24.16b
1872 eor x15,x15,x16
1873 eor v1.16b,v1.16b,v25.16b
1874 eor x17,x17,x19
1875 eor v2.16b,v2.16b,v26.16b
1876 eor x20,x20,x21
1877 eor v3.16b,v3.16b,v27.16b
1878 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1879
1880 stp x5,x7,[x0,#0] // store output
1881 add x28,x28,#7 // increment counter
1882 stp x9,x11,[x0,#16]
1883 stp x13,x15,[x0,#32]
1884 stp x17,x20,[x0,#48]
1885 add x0,x0,#64
1886 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1887
1888 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1889 eor v4.16b,v4.16b,v24.16b
1890 eor v5.16b,v5.16b,v25.16b
1891 eor v6.16b,v6.16b,v26.16b
1892 eor v7.16b,v7.16b,v27.16b
1893 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1894
1895 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1896 eor v8.16b,v8.16b,v0.16b
1897 ldp q24,q25,[sp,#0]
1898 eor v9.16b,v9.16b,v1.16b
1899 ldp q26,q27,[sp,#32]
1900 eor v10.16b,v10.16b,v2.16b
1901 eor v11.16b,v11.16b,v3.16b
1902 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1903
1904 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1905 eor v12.16b,v12.16b,v4.16b
1906 eor v13.16b,v13.16b,v5.16b
1907 eor v14.16b,v14.16b,v6.16b
1908 eor v15.16b,v15.16b,v7.16b
1909 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1910
1911 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1912 eor v16.16b,v16.16b,v8.16b
1913 eor v17.16b,v17.16b,v9.16b
1914 eor v18.16b,v18.16b,v10.16b
1915 eor v19.16b,v19.16b,v11.16b
1916 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1917
1918 shl v0.4s,v31.4s,#1 // 4 -> 8
1919 eor v20.16b,v20.16b,v12.16b
1920 eor v21.16b,v21.16b,v13.16b
1921 eor v22.16b,v22.16b,v14.16b
1922 eor v23.16b,v23.16b,v15.16b
1923 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1924
1925 add v27.4s,v27.4s,v0.4s // += 8
1926 add v28.4s,v28.4s,v0.4s
1927 add v29.4s,v29.4s,v0.4s
1928 add v30.4s,v30.4s,v0.4s
1929
1930 b.hs .Loop_outer_512_neon
1931
1932 adds x2,x2,#512
1933 ushr v0.4s,v31.4s,#2 // 4 -> 1
1934
1935 ldp d8,d9,[sp,#128+0] // meet ABI requirements
1936 ldp d10,d11,[sp,#128+16]
1937 ldp d12,d13,[sp,#128+32]
1938 ldp d14,d15,[sp,#128+48]
1939
1940 stp q24,q31,[sp,#0] // wipe off-load area
1941 stp q24,q31,[sp,#32]
1942 stp q24,q31,[sp,#64]
1943
1944 b.eq .Ldone_512_neon
1945
1946 cmp x2,#192
1947 sub v27.4s,v27.4s,v0.4s // -= 1
1948 sub v28.4s,v28.4s,v0.4s
1949 sub v29.4s,v29.4s,v0.4s
1950 add sp,sp,#128
1951 b.hs .Loop_outer_neon
1952
1953 eor v25.16b,v25.16b,v25.16b
1954 eor v26.16b,v26.16b,v26.16b
1955 eor v27.16b,v27.16b,v27.16b
1956 eor v28.16b,v28.16b,v28.16b
1957 eor v29.16b,v29.16b,v29.16b
1958 eor v30.16b,v30.16b,v30.16b
1959 b .Loop_outer
1960
1961 .Ldone_512_neon:
1962 ldp x19,x20,[x29,#16]
1963 add sp,sp,#128+64
1964 ldp x21,x22,[x29,#32]
1965 ldp x23,x24,[x29,#48]
1966 ldp x25,x26,[x29,#64]
1967 ldp x27,x28,[x29,#80]
1968 ldp x29,x30,[sp],#96
1969 ret
1970 .size ChaCha20_512_neon,.-ChaCha20_512_neon
1971 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698