OLD | NEW |
| (Empty) |
1 #if defined(__aarch64__) | |
2 #include <openssl/arm_arch.h> | |
3 | |
4 .text | |
5 | |
6 | |
7 | |
// Constant table placed in the .text section; the routines that follow reach it with `adr`.
8 .align 5 | |
// .Lsigma: stored little-endian, these two quads are the ASCII bytes
// "expand 32-byte k" (state words 0..3).
9 .Lsigma: | |
10 .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral | |
// .Lone: 32-bit lanes {1,0,0,0}. It sits 16 bytes after .Lsigma, which is
// where the post-incremented pointer ends up in the NEON routines.
11 .Lone: | |
12 .long 1,0,0,0 | |
// .LOPENSSL_armcap_P: distance from this location to OPENSSL_armcap_P
// (32-bit under __ILP32__, 64-bit otherwise). ChaCha20_ctr32 adds it to the
// address of this label to read the capability word.
13 .LOPENSSL_armcap_P: | |
14 #ifdef __ILP32__ | |
15 .long OPENSSL_armcap_P-. | |
16 #else | |
17 .quad OPENSSL_armcap_P-. | |
18 #endif | |
// The .byte list is a single directive (wrapped over three physical lines in
// this listing). It encodes the NUL-terminated ASCII string
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
19 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,
89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,11
5,115,108,46,111,114,103,62,0 | |
20 .align 2 | |
21 | |
// ChaCha20_ctr32: scalar (general-purpose register) ChaCha20 keystream XOR.
// Register roles as used by this routine:
//   x0 = output pointer      x1 = input pointer      x2 = length in bytes
//   x3 = key (32 bytes read) x4 = counter block (16 bytes read)
// Returns at once when x2 == 0. When x2 >= 192 and the capability word reached
// through .LOPENSSL_armcap_P has ARMV7_NEON set, control moves to ChaCha20_neon.
// Otherwise one 64-byte block is produced per pass of .Loop_outer, and a final
// partial block is handled byte-by-byte in .Ltail.
22 .globl ChaCha20_ctr32 | |
23 .hidden ChaCha20_ctr32 | |
24 .type ChaCha20_ctr32,%function | |
25 .align 5 | |
26 ChaCha20_ctr32: | |
27 cbz x2,.Labort | |
28 adr x5,.LOPENSSL_armcap_P | |
29 cmp x2,#192 | |
30 b.lo .Lshort | |
// x6 = stored offset; [x6 + x5] is therefore the capability word itself.
31 #ifdef __ILP32__ | |
32 ldrsw x6,[x5] | |
33 #else | |
34 ldr x6,[x5] | |
35 #endif | |
36 ldr w17,[x6,x5] | |
37 tst w17,#ARMV7_NEON | |
38 b.ne ChaCha20_neon | |
39 | |
// Scalar path. The 96-byte frame holds x29/x30 and x19..x28; a further
// 64 bytes below it serve as scratch for the partial-block keystream.
40 .Lshort: | |
41 stp x29,x30,[sp,#-96]! | |
42 add x29,sp,#0 | |
43 | |
44 adr x5,.Lsigma | |
45 stp x19,x20,[sp,#16] | |
46 stp x21,x22,[sp,#32] | |
47 stp x23,x24,[sp,#48] | |
48 stp x25,x26,[sp,#64] | |
49 stp x27,x28,[sp,#80] | |
50 sub sp,sp,#64 | |
51 | |
// x22..x28 and x30 hold the 16 input state words, two 32-bit words per
// register. x30 (the link register) is reused as a data register here; its
// saved copy is reloaded in the epilogue before `ret`.
52 ldp x22,x23,[x5] // load sigma | |
53 ldp x24,x25,[x3] // load key | |
54 ldp x26,x27,[x3,#16] | |
55 ldp x28,x30,[x4] // load counter | |
// Big-endian builds: swap the two 32-bit halves of each key/counter pair.
56 #ifdef __ARMEB__ | |
57 ror x24,x24,#32 | |
58 ror x25,x25,#32 | |
59 ror x26,x26,#32 | |
60 ror x27,x27,#32 | |
61 ror x28,x28,#32 | |
62 ror x30,x30,#32 | |
63 #endif | |
64 | |
// Per-block setup: split the packed state into 16 working words
// w5..w17, w19..w21 (x18 is not used).
65 .Loop_outer: | |
66 mov w5,w22 // unpack key block | |
67 lsr x6,x22,#32 | |
68 mov w7,w23 | |
69 lsr x8,x23,#32 | |
70 mov w9,w24 | |
71 lsr x10,x24,#32 | |
72 mov w11,w25 | |
73 lsr x12,x25,#32 | |
74 mov w13,w26 | |
75 lsr x14,x26,#32 | |
76 mov w15,w27 | |
77 lsr x16,x27,#32 | |
78 mov w17,w28 | |
79 lsr x19,x28,#32 | |
80 mov w20,w30 | |
81 lsr x21,x30,#32 | |
82 | |
// x4 counts 10 passes of .Loop; each pass is one column step followed by one
// diagonal step. The flags written by `subs` stay live until `b.lo .Ltail`
// and `b.hi .Loop_outer`: no instruction in between writes NZCV.
83 mov x4,#10 | |
84 subs x2,x2,#64 | |
85 .Loop: | |
86 sub x4,x4,#1 | |
// Column step: the same add/eor/ror sequence applied four-way interleaved to
// (w5,w9,w13,w17), (w6,w10,w14,w19), (w7,w11,w15,w20), (w8,w12,w16,w21).
// ror by 16/20/24/25 equals a left rotation by 16/12/8/7.
87 add w5,w5,w9 | |
88 add w6,w6,w10 | |
89 add w7,w7,w11 | |
90 add w8,w8,w12 | |
91 eor w17,w17,w5 | |
92 eor w19,w19,w6 | |
93 eor w20,w20,w7 | |
94 eor w21,w21,w8 | |
95 ror w17,w17,#16 | |
96 ror w19,w19,#16 | |
97 ror w20,w20,#16 | |
98 ror w21,w21,#16 | |
99 add w13,w13,w17 | |
100 add w14,w14,w19 | |
101 add w15,w15,w20 | |
102 add w16,w16,w21 | |
103 eor w9,w9,w13 | |
104 eor w10,w10,w14 | |
105 eor w11,w11,w15 | |
106 eor w12,w12,w16 | |
107 ror w9,w9,#20 | |
108 ror w10,w10,#20 | |
109 ror w11,w11,#20 | |
110 ror w12,w12,#20 | |
111 add w5,w5,w9 | |
112 add w6,w6,w10 | |
113 add w7,w7,w11 | |
114 add w8,w8,w12 | |
115 eor w17,w17,w5 | |
116 eor w19,w19,w6 | |
117 eor w20,w20,w7 | |
118 eor w21,w21,w8 | |
119 ror w17,w17,#24 | |
120 ror w19,w19,#24 | |
121 ror w20,w20,#24 | |
122 ror w21,w21,#24 | |
123 add w13,w13,w17 | |
124 add w14,w14,w19 | |
125 add w15,w15,w20 | |
126 add w16,w16,w21 | |
127 eor w9,w9,w13 | |
128 eor w10,w10,w14 | |
129 eor w11,w11,w15 | |
130 eor w12,w12,w16 | |
131 ror w9,w9,#25 | |
132 ror w10,w10,#25 | |
133 ror w11,w11,#25 | |
134 ror w12,w12,#25 | |
// Diagonal step: same sequence on (w5,w10,w15,w21), (w6,w11,w16,w17),
// (w7,w12,w13,w19), (w8,w9,w14,w20).
135 add w5,w5,w10 | |
136 add w6,w6,w11 | |
137 add w7,w7,w12 | |
138 add w8,w8,w9 | |
139 eor w21,w21,w5 | |
140 eor w17,w17,w6 | |
141 eor w19,w19,w7 | |
142 eor w20,w20,w8 | |
143 ror w21,w21,#16 | |
144 ror w17,w17,#16 | |
145 ror w19,w19,#16 | |
146 ror w20,w20,#16 | |
147 add w15,w15,w21 | |
148 add w16,w16,w17 | |
149 add w13,w13,w19 | |
150 add w14,w14,w20 | |
151 eor w10,w10,w15 | |
152 eor w11,w11,w16 | |
153 eor w12,w12,w13 | |
154 eor w9,w9,w14 | |
155 ror w10,w10,#20 | |
156 ror w11,w11,#20 | |
157 ror w12,w12,#20 | |
158 ror w9,w9,#20 | |
159 add w5,w5,w10 | |
160 add w6,w6,w11 | |
161 add w7,w7,w12 | |
162 add w8,w8,w9 | |
163 eor w21,w21,w5 | |
164 eor w17,w17,w6 | |
165 eor w19,w19,w7 | |
166 eor w20,w20,w8 | |
167 ror w21,w21,#24 | |
168 ror w17,w17,#24 | |
169 ror w19,w19,#24 | |
170 ror w20,w20,#24 | |
171 add w15,w15,w21 | |
172 add w16,w16,w17 | |
173 add w13,w13,w19 | |
174 add w14,w14,w20 | |
175 eor w10,w10,w15 | |
176 eor w11,w11,w16 | |
177 eor w12,w12,w13 | |
178 eor w9,w9,w14 | |
179 ror w10,w10,#25 | |
180 ror w11,w11,#25 | |
181 ror w12,w12,#25 | |
182 ror w9,w9,#25 | |
183 cbnz x4,.Loop | |
184 | |
// Add the original input words back in. The odd-numbered words use 64-bit
// adds; any carry past bit 31 is shifted out by the lsl#32 in the pack step.
185 add w5,w5,w22 // accumulate key block | |
186 add x6,x6,x22,lsr#32 | |
187 add w7,w7,w23 | |
188 add x8,x8,x23,lsr#32 | |
189 add w9,w9,w24 | |
190 add x10,x10,x24,lsr#32 | |
191 add w11,w11,w25 | |
192 add x12,x12,x25,lsr#32 | |
193 add w13,w13,w26 | |
194 add x14,x14,x26,lsr#32 | |
195 add w15,w15,w27 | |
196 add x16,x16,x27,lsr#32 | |
197 add w17,w17,w28 | |
198 add x19,x19,x28,lsr#32 | |
199 add w20,w20,w30 | |
200 add x21,x21,x30,lsr#32 | |
201 | |
// Flags are still those of `subs x2,x2,#64`: fewer than 64 bytes were left.
202 b.lo .Ltail | |
203 | |
// Full block: pack word pairs into 64-bit registers while loading 64 input
// bytes into the registers that have just been consumed.
204 add x5,x5,x6,lsl#32 // pack | |
205 add x7,x7,x8,lsl#32 | |
206 ldp x6,x8,[x1,#0] // load input | |
207 add x9,x9,x10,lsl#32 | |
208 add x11,x11,x12,lsl#32 | |
209 ldp x10,x12,[x1,#16] | |
210 add x13,x13,x14,lsl#32 | |
211 add x15,x15,x16,lsl#32 | |
212 ldp x14,x16,[x1,#32] | |
213 add x17,x17,x19,lsl#32 | |
214 add x20,x20,x21,lsl#32 | |
215 ldp x19,x21,[x1,#48] | |
216 add x1,x1,#64 | |
217 #ifdef __ARMEB__ | |
218 rev x5,x5 | |
219 rev x7,x7 | |
220 rev x9,x9 | |
221 rev x11,x11 | |
222 rev x13,x13 | |
223 rev x15,x15 | |
224 rev x17,x17 | |
225 rev x20,x20 | |
226 #endif | |
227 eor x5,x5,x6 | |
228 eor x7,x7,x8 | |
229 eor x9,x9,x10 | |
230 eor x11,x11,x12 | |
231 eor x13,x13,x14 | |
232 eor x15,x15,x16 | |
233 eor x17,x17,x19 | |
234 eor x20,x20,x21 | |
235 | |
// NOTE(review): the counter increment is a 64-bit add on the packed pair in
// x28, so a wrap of the low 32-bit word would carry into the upper word.
// Presumably callers keep the low word from wrapping; confirm against callers.
236 stp x5,x7,[x0,#0] // store output | |
237 add x28,x28,#1 // increment counter | |
238 stp x9,x11,[x0,#16] | |
239 stp x13,x15,[x0,#32] | |
240 stp x17,x20,[x0,#48] | |
241 add x0,x0,#64 | |
242 | |
// Still the `subs` flags: loop while bytes remain, fall through at exactly 0.
243 b.hi .Loop_outer | |
244 | |
// Epilogue: drop the 64-byte scratch, reload callee-saved registers via x29.
245 ldp x19,x20,[x29,#16] | |
246 add sp,sp,#64 | |
247 ldp x21,x22,[x29,#32] | |
248 ldp x23,x24,[x29,#48] | |
249 ldp x25,x26,[x29,#64] | |
250 ldp x27,x28,[x29,#80] | |
251 ldp x29,x30,[sp],#96 | |
252 .Labort: | |
253 ret | |
254 | |
// Partial final block. .Ltail undoes the `subs`, so x2 is the remaining byte
// count (1..63). .Less_than_64 is also a branch target of ChaCha20_neon's
// .Ltail_neon, which arrives with the same unpacked words in w5..w21.
255 .align 4 | |
256 .Ltail: | |
257 add x2,x2,#64 | |
258 .Less_than_64: | |
// Point x1/x4 just past the last input/keystream byte and run x2 from -len up
// to 0. x0 is biased by -1 because the store uses x2 after it is incremented.
259 sub x0,x0,#1 | |
260 add x1,x1,x2 | |
261 add x0,x0,x2 | |
262 add x4,sp,x2 | |
263 neg x2,x2 | |
264 | |
265 add x5,x5,x6,lsl#32 // pack | |
266 add x7,x7,x8,lsl#32 | |
267 add x9,x9,x10,lsl#32 | |
268 add x11,x11,x12,lsl#32 | |
269 add x13,x13,x14,lsl#32 | |
270 add x15,x15,x16,lsl#32 | |
271 add x17,x17,x19,lsl#32 | |
272 add x20,x20,x21,lsl#32 | |
273 #ifdef __ARMEB__ | |
274 rev x5,x5 | |
275 rev x7,x7 | |
276 rev x9,x9 | |
277 rev x11,x11 | |
278 rev x13,x13 | |
279 rev x15,x15 | |
280 rev x17,x17 | |
281 rev x20,x20 | |
282 #endif | |
// Spill the 64-byte keystream block to the stack scratch for byte access.
283 stp x5,x7,[sp,#0] | |
284 stp x9,x11,[sp,#16] | |
285 stp x13,x15,[sp,#32] | |
286 stp x17,x20,[sp,#48] | |
287 | |
// out[i] = in[i] ^ keystream[i], one byte per iteration.
288 .Loop_tail: | |
289 ldrb w10,[x1,x2] | |
290 ldrb w11,[x4,x2] | |
291 add x2,x2,#1 | |
292 eor w10,w10,w11 | |
293 strb w10,[x0,x2] | |
294 cbnz x2,.Loop_tail | |
295 | |
// Overwrite the keystream copy on the stack with zeros before returning.
296 stp xzr,xzr,[sp,#0] | |
297 stp xzr,xzr,[sp,#16] | |
298 stp xzr,xzr,[sp,#32] | |
299 stp xzr,xzr,[sp,#48] | |
300 | |
301 ldp x19,x20,[x29,#16] | |
302 add sp,sp,#64 | |
303 ldp x21,x22,[x29,#32] | |
304 ldp x23,x24,[x29,#48] | |
305 ldp x25,x26,[x29,#64] | |
306 ldp x27,x28,[x29,#80] | |
307 ldp x29,x30,[sp],#96 | |
308 ret | |
309 .size ChaCha20_ctr32,.-ChaCha20_ctr32 | |
310 | |
// ChaCha20_neon: local routine reached from ChaCha20_ctr32 (`b.ne ChaCha20_neon`)
// with the same x0..x4 arguments (x0 = out, x1 = in, x2 = length in bytes,
// x3 = key, x4 = counter block).
// Each pass of .Loop_outer_neon produces 256 bytes: one block in scalar
// registers (w5..w21) and three blocks in NEON registers (v0-v3, v4-v7,
// v16-v19), with the two instruction streams interleaved.
// Lengths of 512 bytes or more are handed to .L512_or_more_neon, a label
// inside ChaCha20_512_neon, after the shared 96-byte frame has been set up.
// Only v0-v7 and v16-v31 are written here, so d8-d15 are not saved.
311 .type ChaCha20_neon,%function | |
312 .align 5 | |
313 ChaCha20_neon: | |
314 stp x29,x30,[sp,#-96]! | |
315 add x29,sp,#0 | |
316 | |
317 adr x5,.Lsigma | |
318 stp x19,x20,[sp,#16] | |
319 stp x21,x22,[sp,#32] | |
320 stp x23,x24,[sp,#48] | |
321 stp x25,x26,[sp,#64] | |
322 stp x27,x28,[sp,#80] | |
323 cmp x2,#512 | |
324 b.hs .L512_or_more_neon | |
325 | |
// 64 bytes of stack scratch for a partial final block.
326 sub sp,sp,#64 | |
327 | |
// The state is loaded twice: packed pairs in x22..x28,x30 for the scalar
// block, and v24 = sigma, v25/v26 = key, v27 = counter block for NEON.
// x5 is post-incremented past .Lsigma, so v31 is loaded from .Lone = {1,0,0,0}.
328 ldp x22,x23,[x5] // load sigma | |
329 ld1 {v24.4s},[x5],#16 | |
330 ldp x24,x25,[x3] // load key | |
331 ldp x26,x27,[x3,#16] | |
332 ld1 {v25.4s,v26.4s},[x3] | |
333 ldp x28,x30,[x4] // load counter | |
334 ld1 {v27.4s},[x4] | |
335 ld1 {v31.4s},[x5] | |
336 #ifdef __ARMEB__ | |
337 rev64 v24.4s,v24.4s | |
338 ror x24,x24,#32 | |
339 ror x25,x25,#32 | |
340 ror x26,x26,#32 | |
341 ror x27,x27,#32 | |
342 ror x28,x28,#32 | |
343 ror x30,x30,#32 | |
344 #endif | |
// Counter rows: the scalar block keeps the loaded counter (x28), while
// v27/v28/v29 get counter+1/+2/+3. v31 then becomes {4,0,0,0}, the
// per-iteration step.
345 add v27.4s,v27.4s,v31.4s // += 1 | |
346 add v28.4s,v27.4s,v31.4s | |
347 add v29.4s,v28.4s,v31.4s | |
348 shl v31.4s,v31.4s,#2 // 1 -> 4 | |
349 | |
// Per-iteration setup: unpack the scalar state and copy the four state rows
// into each of the three NEON blocks (rows 0..3 = v0..v3 / v4..v7 / v16..v19).
350 .Loop_outer_neon: | |
351 mov w5,w22 // unpack key block | |
352 lsr x6,x22,#32 | |
353 mov v0.16b,v24.16b | |
354 mov w7,w23 | |
355 lsr x8,x23,#32 | |
356 mov v4.16b,v24.16b | |
357 mov w9,w24 | |
358 lsr x10,x24,#32 | |
359 mov v16.16b,v24.16b | |
360 mov w11,w25 | |
361 mov v1.16b,v25.16b | |
362 lsr x12,x25,#32 | |
363 mov v5.16b,v25.16b | |
364 mov w13,w26 | |
365 mov v17.16b,v25.16b | |
366 lsr x14,x26,#32 | |
367 mov v3.16b,v27.16b | |
368 mov w15,w27 | |
369 mov v7.16b,v28.16b | |
370 lsr x16,x27,#32 | |
371 mov v19.16b,v29.16b | |
372 mov w17,w28 | |
373 mov v2.16b,v26.16b | |
374 lsr x19,x28,#32 | |
375 mov v6.16b,v26.16b | |
376 mov w20,w30 | |
377 mov v18.16b,v26.16b | |
378 lsr x21,x30,#32 | |
379 | |
// x4 counts 10 passes of .Loop_neon. The flags from `subs` stay live until
// `b.lo .Ltail_neon` and `b.hi .Loop_outer_neon`; nothing in between writes NZCV.
380 mov x4,#10 | |
381 subs x2,x2,#256 | |
382 .Loop_neon: | |
383 sub x4,x4,#1 | |
// First half of the pass (column step). On the NEON side, rev32 on .8h lanes
// rotates each 32-bit lane by 16, and each ushr #(32-n) / sli #n pair forms a
// left rotation by n (12, 8, 7), using v20-v22 as temporaries.
384 add v0.4s,v0.4s,v1.4s | |
385 add w5,w5,w9 | |
386 add v4.4s,v4.4s,v5.4s | |
387 add w6,w6,w10 | |
388 add v16.4s,v16.4s,v17.4s | |
389 add w7,w7,w11 | |
390 eor v3.16b,v3.16b,v0.16b | |
391 add w8,w8,w12 | |
392 eor v7.16b,v7.16b,v4.16b | |
393 eor w17,w17,w5 | |
394 eor v19.16b,v19.16b,v16.16b | |
395 eor w19,w19,w6 | |
396 rev32 v3.8h,v3.8h | |
397 eor w20,w20,w7 | |
398 rev32 v7.8h,v7.8h | |
399 eor w21,w21,w8 | |
400 rev32 v19.8h,v19.8h | |
401 ror w17,w17,#16 | |
402 add v2.4s,v2.4s,v3.4s | |
403 ror w19,w19,#16 | |
404 add v6.4s,v6.4s,v7.4s | |
405 ror w20,w20,#16 | |
406 add v18.4s,v18.4s,v19.4s | |
407 ror w21,w21,#16 | |
408 eor v20.16b,v1.16b,v2.16b | |
409 add w13,w13,w17 | |
410 eor v21.16b,v5.16b,v6.16b | |
411 add w14,w14,w19 | |
412 eor v22.16b,v17.16b,v18.16b | |
413 add w15,w15,w20 | |
414 ushr v1.4s,v20.4s,#20 | |
415 add w16,w16,w21 | |
416 ushr v5.4s,v21.4s,#20 | |
417 eor w9,w9,w13 | |
418 ushr v17.4s,v22.4s,#20 | |
419 eor w10,w10,w14 | |
420 sli v1.4s,v20.4s,#12 | |
421 eor w11,w11,w15 | |
422 sli v5.4s,v21.4s,#12 | |
423 eor w12,w12,w16 | |
424 sli v17.4s,v22.4s,#12 | |
425 ror w9,w9,#20 | |
426 add v0.4s,v0.4s,v1.4s | |
427 ror w10,w10,#20 | |
428 add v4.4s,v4.4s,v5.4s | |
429 ror w11,w11,#20 | |
430 add v16.4s,v16.4s,v17.4s | |
431 ror w12,w12,#20 | |
432 eor v20.16b,v3.16b,v0.16b | |
433 add w5,w5,w9 | |
434 eor v21.16b,v7.16b,v4.16b | |
435 add w6,w6,w10 | |
436 eor v22.16b,v19.16b,v16.16b | |
437 add w7,w7,w11 | |
438 ushr v3.4s,v20.4s,#24 | |
439 add w8,w8,w12 | |
440 ushr v7.4s,v21.4s,#24 | |
441 eor w17,w17,w5 | |
442 ushr v19.4s,v22.4s,#24 | |
443 eor w19,w19,w6 | |
444 sli v3.4s,v20.4s,#8 | |
445 eor w20,w20,w7 | |
446 sli v7.4s,v21.4s,#8 | |
447 eor w21,w21,w8 | |
448 sli v19.4s,v22.4s,#8 | |
449 ror w17,w17,#24 | |
450 add v2.4s,v2.4s,v3.4s | |
451 ror w19,w19,#24 | |
452 add v6.4s,v6.4s,v7.4s | |
453 ror w20,w20,#24 | |
454 add v18.4s,v18.4s,v19.4s | |
455 ror w21,w21,#24 | |
456 eor v20.16b,v1.16b,v2.16b | |
457 add w13,w13,w17 | |
458 eor v21.16b,v5.16b,v6.16b | |
459 add w14,w14,w19 | |
460 eor v22.16b,v17.16b,v18.16b | |
461 add w15,w15,w20 | |
462 ushr v1.4s,v20.4s,#25 | |
463 add w16,w16,w21 | |
464 ushr v5.4s,v21.4s,#25 | |
465 eor w9,w9,w13 | |
466 ushr v17.4s,v22.4s,#25 | |
467 eor w10,w10,w14 | |
468 sli v1.4s,v20.4s,#7 | |
469 eor w11,w11,w15 | |
470 sli v5.4s,v21.4s,#7 | |
471 eor w12,w12,w16 | |
472 sli v17.4s,v22.4s,#7 | |
473 ror w9,w9,#25 | |
// Rotate NEON rows 2, 3 and 1 by 8, 12 and 4 bytes so that the next
// lane-wise pass operates on the diagonals of each block.
474 ext v2.16b,v2.16b,v2.16b,#8 | |
475 ror w10,w10,#25 | |
476 ext v6.16b,v6.16b,v6.16b,#8 | |
477 ror w11,w11,#25 | |
478 ext v18.16b,v18.16b,v18.16b,#8 | |
479 ror w12,w12,#25 | |
480 ext v3.16b,v3.16b,v3.16b,#12 | |
481 ext v7.16b,v7.16b,v7.16b,#12 | |
482 ext v19.16b,v19.16b,v19.16b,#12 | |
483 ext v1.16b,v1.16b,v1.16b,#4 | |
484 ext v5.16b,v5.16b,v5.16b,#4 | |
485 ext v17.16b,v17.16b,v17.16b,#4 | |
// Second half of the pass (diagonal step); the scalar side switches to
// (w5,w10,w15,w21), (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
486 add v0.4s,v0.4s,v1.4s | |
487 add w5,w5,w10 | |
488 add v4.4s,v4.4s,v5.4s | |
489 add w6,w6,w11 | |
490 add v16.4s,v16.4s,v17.4s | |
491 add w7,w7,w12 | |
492 eor v3.16b,v3.16b,v0.16b | |
493 add w8,w8,w9 | |
494 eor v7.16b,v7.16b,v4.16b | |
495 eor w21,w21,w5 | |
496 eor v19.16b,v19.16b,v16.16b | |
497 eor w17,w17,w6 | |
498 rev32 v3.8h,v3.8h | |
499 eor w19,w19,w7 | |
500 rev32 v7.8h,v7.8h | |
501 eor w20,w20,w8 | |
502 rev32 v19.8h,v19.8h | |
503 ror w21,w21,#16 | |
504 add v2.4s,v2.4s,v3.4s | |
505 ror w17,w17,#16 | |
506 add v6.4s,v6.4s,v7.4s | |
507 ror w19,w19,#16 | |
508 add v18.4s,v18.4s,v19.4s | |
509 ror w20,w20,#16 | |
510 eor v20.16b,v1.16b,v2.16b | |
511 add w15,w15,w21 | |
512 eor v21.16b,v5.16b,v6.16b | |
513 add w16,w16,w17 | |
514 eor v22.16b,v17.16b,v18.16b | |
515 add w13,w13,w19 | |
516 ushr v1.4s,v20.4s,#20 | |
517 add w14,w14,w20 | |
518 ushr v5.4s,v21.4s,#20 | |
519 eor w10,w10,w15 | |
520 ushr v17.4s,v22.4s,#20 | |
521 eor w11,w11,w16 | |
522 sli v1.4s,v20.4s,#12 | |
523 eor w12,w12,w13 | |
524 sli v5.4s,v21.4s,#12 | |
525 eor w9,w9,w14 | |
526 sli v17.4s,v22.4s,#12 | |
527 ror w10,w10,#20 | |
528 add v0.4s,v0.4s,v1.4s | |
529 ror w11,w11,#20 | |
530 add v4.4s,v4.4s,v5.4s | |
531 ror w12,w12,#20 | |
532 add v16.4s,v16.4s,v17.4s | |
533 ror w9,w9,#20 | |
534 eor v20.16b,v3.16b,v0.16b | |
535 add w5,w5,w10 | |
536 eor v21.16b,v7.16b,v4.16b | |
537 add w6,w6,w11 | |
538 eor v22.16b,v19.16b,v16.16b | |
539 add w7,w7,w12 | |
540 ushr v3.4s,v20.4s,#24 | |
541 add w8,w8,w9 | |
542 ushr v7.4s,v21.4s,#24 | |
543 eor w21,w21,w5 | |
544 ushr v19.4s,v22.4s,#24 | |
545 eor w17,w17,w6 | |
546 sli v3.4s,v20.4s,#8 | |
547 eor w19,w19,w7 | |
548 sli v7.4s,v21.4s,#8 | |
549 eor w20,w20,w8 | |
550 sli v19.4s,v22.4s,#8 | |
551 ror w21,w21,#24 | |
552 add v2.4s,v2.4s,v3.4s | |
553 ror w17,w17,#24 | |
554 add v6.4s,v6.4s,v7.4s | |
555 ror w19,w19,#24 | |
556 add v18.4s,v18.4s,v19.4s | |
557 ror w20,w20,#24 | |
558 eor v20.16b,v1.16b,v2.16b | |
559 add w15,w15,w21 | |
560 eor v21.16b,v5.16b,v6.16b | |
561 add w16,w16,w17 | |
562 eor v22.16b,v17.16b,v18.16b | |
563 add w13,w13,w19 | |
564 ushr v1.4s,v20.4s,#25 | |
565 add w14,w14,w20 | |
566 ushr v5.4s,v21.4s,#25 | |
567 eor w10,w10,w15 | |
568 ushr v17.4s,v22.4s,#25 | |
569 eor w11,w11,w16 | |
570 sli v1.4s,v20.4s,#7 | |
571 eor w12,w12,w13 | |
572 sli v5.4s,v21.4s,#7 | |
573 eor w9,w9,w14 | |
574 sli v17.4s,v22.4s,#7 | |
575 ror w10,w10,#25 | |
// Undo the row rotation (8, 4 and 12 bytes) to restore column order.
576 ext v2.16b,v2.16b,v2.16b,#8 | |
577 ror w11,w11,#25 | |
578 ext v6.16b,v6.16b,v6.16b,#8 | |
579 ror w12,w12,#25 | |
580 ext v18.16b,v18.16b,v18.16b,#8 | |
581 ror w9,w9,#25 | |
582 ext v3.16b,v3.16b,v3.16b,#4 | |
583 ext v7.16b,v7.16b,v7.16b,#4 | |
584 ext v19.16b,v19.16b,v19.16b,#4 | |
585 ext v1.16b,v1.16b,v1.16b,#12 | |
586 ext v5.16b,v5.16b,v5.16b,#12 | |
587 ext v17.16b,v17.16b,v17.16b,#12 | |
588 cbnz x4,.Loop_neon | |
589 | |
// Add the input state back into all four blocks. Each NEON block's row 3
// gets its own counter row (v27, v28, v29).
590 add w5,w5,w22 // accumulate key block | |
591 add v0.4s,v0.4s,v24.4s | |
592 add x6,x6,x22,lsr#32 | |
593 add v4.4s,v4.4s,v24.4s | |
594 add w7,w7,w23 | |
595 add v16.4s,v16.4s,v24.4s | |
596 add x8,x8,x23,lsr#32 | |
597 add v2.4s,v2.4s,v26.4s | |
598 add w9,w9,w24 | |
599 add v6.4s,v6.4s,v26.4s | |
600 add x10,x10,x24,lsr#32 | |
601 add v18.4s,v18.4s,v26.4s | |
602 add w11,w11,w25 | |
603 add v3.4s,v3.4s,v27.4s | |
604 add x12,x12,x25,lsr#32 | |
605 add w13,w13,w26 | |
606 add v7.4s,v7.4s,v28.4s | |
607 add x14,x14,x26,lsr#32 | |
608 add w15,w15,w27 | |
609 add v19.4s,v19.4s,v29.4s | |
610 add x16,x16,x27,lsr#32 | |
611 add w17,w17,w28 | |
612 add v1.4s,v1.4s,v25.4s | |
613 add x19,x19,x28,lsr#32 | |
614 add w20,w20,w30 | |
615 add v5.4s,v5.4s,v25.4s | |
616 add x21,x21,x30,lsr#32 | |
617 add v17.4s,v17.4s,v25.4s | |
618 | |
// Flags from `subs x2,x2,#256`: fewer than 256 bytes were left.
619 b.lo .Ltail_neon | |
620 | |
// Full 256 bytes. Output order is the scalar block first, then v0-v3,
// v4-v7 and v16-v19.
621 add x5,x5,x6,lsl#32 // pack | |
622 add x7,x7,x8,lsl#32 | |
623 ldp x6,x8,[x1,#0] // load input | |
624 add x9,x9,x10,lsl#32 | |
625 add x11,x11,x12,lsl#32 | |
626 ldp x10,x12,[x1,#16] | |
627 add x13,x13,x14,lsl#32 | |
628 add x15,x15,x16,lsl#32 | |
629 ldp x14,x16,[x1,#32] | |
630 add x17,x17,x19,lsl#32 | |
631 add x20,x20,x21,lsl#32 | |
632 ldp x19,x21,[x1,#48] | |
633 add x1,x1,#64 | |
634 #ifdef __ARMEB__ | |
635 rev x5,x5 | |
636 rev x7,x7 | |
637 rev x9,x9 | |
638 rev x11,x11 | |
639 rev x13,x13 | |
640 rev x15,x15 | |
641 rev x17,x17 | |
642 rev x20,x20 | |
643 #endif | |
644 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 | |
645 eor x5,x5,x6 | |
646 eor x7,x7,x8 | |
647 eor x9,x9,x10 | |
648 eor x11,x11,x12 | |
649 eor x13,x13,x14 | |
650 eor v0.16b,v0.16b,v20.16b | |
651 eor x15,x15,x16 | |
652 eor v1.16b,v1.16b,v21.16b | |
653 eor x17,x17,x19 | |
654 eor v2.16b,v2.16b,v22.16b | |
655 eor x20,x20,x21 | |
656 eor v3.16b,v3.16b,v23.16b | |
657 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 | |
658 | |
// Advance every counter copy by 4 (x28 for the scalar block, v27-v29 via v31).
659 stp x5,x7,[x0,#0] // store output | |
660 add x28,x28,#4 // increment counter | |
661 stp x9,x11,[x0,#16] | |
662 add v27.4s,v27.4s,v31.4s // += 4 | |
663 stp x13,x15,[x0,#32] | |
664 add v28.4s,v28.4s,v31.4s | |
665 stp x17,x20,[x0,#48] | |
666 add v29.4s,v29.4s,v31.4s | |
667 add x0,x0,#64 | |
668 | |
// v0-v3 are stored and then immediately reused to hold the fourth input chunk.
669 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 | |
670 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 | |
671 | |
672 eor v4.16b,v4.16b,v20.16b | |
673 eor v5.16b,v5.16b,v21.16b | |
674 eor v6.16b,v6.16b,v22.16b | |
675 eor v7.16b,v7.16b,v23.16b | |
676 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 | |
677 | |
678 eor v16.16b,v16.16b,v0.16b | |
679 eor v17.16b,v17.16b,v1.16b | |
680 eor v18.16b,v18.16b,v2.16b | |
681 eor v19.16b,v19.16b,v3.16b | |
682 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 | |
683 | |
// Still the `subs` flags: loop while bytes remain, fall through at exactly 0.
684 b.hi .Loop_outer_neon | |
685 | |
686 ldp x19,x20,[x29,#16] | |
687 add sp,sp,#64 | |
688 ldp x21,x22,[x29,#32] | |
689 ldp x23,x24,[x29,#48] | |
690 ldp x25,x26,[x29,#64] | |
691 ldp x27,x28,[x29,#80] | |
692 ldp x29,x30,[sp],#96 | |
693 ret | |
694 | |
// Fewer than 256 bytes remain. x2 is restored to the remaining count. Under
// 64 bytes the scalar block alone is enough, so control jumps to
// .Less_than_64 inside ChaCha20_ctr32 with w5..w21 still unpacked.
695 .Ltail_neon: | |
696 add x2,x2,#256 | |
697 cmp x2,#64 | |
698 b.lo .Less_than_64 | |
699 | |
700 add x5,x5,x6,lsl#32 // pack | |
701 add x7,x7,x8,lsl#32 | |
702 ldp x6,x8,[x1,#0] // load input | |
703 add x9,x9,x10,lsl#32 | |
704 add x11,x11,x12,lsl#32 | |
705 ldp x10,x12,[x1,#16] | |
706 add x13,x13,x14,lsl#32 | |
707 add x15,x15,x16,lsl#32 | |
708 ldp x14,x16,[x1,#32] | |
709 add x17,x17,x19,lsl#32 | |
710 add x20,x20,x21,lsl#32 | |
711 ldp x19,x21,[x1,#48] | |
712 add x1,x1,#64 | |
713 #ifdef __ARMEB__ | |
714 rev x5,x5 | |
715 rev x7,x7 | |
716 rev x9,x9 | |
717 rev x11,x11 | |
718 rev x13,x13 | |
719 rev x15,x15 | |
720 rev x17,x17 | |
721 rev x20,x20 | |
722 #endif | |
723 eor x5,x5,x6 | |
724 eor x7,x7,x8 | |
725 eor x9,x9,x10 | |
726 eor x11,x11,x12 | |
727 eor x13,x13,x14 | |
728 eor x15,x15,x16 | |
729 eor x17,x17,x19 | |
730 eor x20,x20,x21 | |
731 | |
// Each `b.eq .Ldone_neon` below reuses the flags of the most recent
// `cmp x2,#64` (no instruction in between writes NZCV): equal means the block
// just written was the last one.
732 stp x5,x7,[x0,#0] // store output | |
733 add x28,x28,#4 // increment counter | |
734 stp x9,x11,[x0,#16] | |
735 stp x13,x15,[x0,#32] | |
736 stp x17,x20,[x0,#48] | |
737 add x0,x0,#64 | |
738 b.eq .Ldone_neon | |
739 sub x2,x2,#64 | |
740 cmp x2,#64 | |
741 b.lo .Less_than_128 | |
742 | |
743 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 | |
744 eor v0.16b,v0.16b,v20.16b | |
745 eor v1.16b,v1.16b,v21.16b | |
746 eor v2.16b,v2.16b,v22.16b | |
747 eor v3.16b,v3.16b,v23.16b | |
748 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 | |
749 b.eq .Ldone_neon | |
750 sub x2,x2,#64 | |
751 cmp x2,#64 | |
752 b.lo .Less_than_192 | |
753 | |
754 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 | |
755 eor v4.16b,v4.16b,v20.16b | |
756 eor v5.16b,v5.16b,v21.16b | |
757 eor v6.16b,v6.16b,v22.16b | |
758 eor v7.16b,v7.16b,v23.16b | |
759 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 | |
760 b.eq .Ldone_neon | |
761 sub x2,x2,#64 | |
762 | |
// The three spill sites below write the keystream of the first unused NEON
// block to the stack scratch; x2 then holds the 1..63 leftover bytes.
763 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] | |
764 b .Last_neon | |
765 | |
766 .Less_than_128: | |
767 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] | |
768 b .Last_neon | |
769 .Less_than_192: | |
770 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] | |
771 b .Last_neon | |
772 | |
// Byte-wise XOR of the leftover bytes against the spilled keystream. x2 runs
// from -len up to 0; x0 is biased by -1 because the store uses x2 after it is
// incremented.
773 .align 4 | |
774 .Last_neon: | |
775 sub x0,x0,#1 | |
776 add x1,x1,x2 | |
777 add x0,x0,x2 | |
778 add x4,sp,x2 | |
779 neg x2,x2 | |
780 | |
781 .Loop_tail_neon: | |
782 ldrb w10,[x1,x2] | |
783 ldrb w11,[x4,x2] | |
784 add x2,x2,#1 | |
785 eor w10,w10,w11 | |
786 strb w10,[x0,x2] | |
787 cbnz x2,.Loop_tail_neon | |
788 | |
// Overwrite the keystream copy on the stack with zeros before returning.
789 stp xzr,xzr,[sp,#0] | |
790 stp xzr,xzr,[sp,#16] | |
791 stp xzr,xzr,[sp,#32] | |
792 stp xzr,xzr,[sp,#48] | |
793 | |
// Common exit. Paths that branch here directly never wrote the stack scratch.
794 .Ldone_neon: | |
795 ldp x19,x20,[x29,#16] | |
796 add sp,sp,#64 | |
797 ldp x21,x22,[x29,#32] | |
798 ldp x23,x24,[x29,#48] | |
799 ldp x25,x26,[x29,#64] | |
800 ldp x27,x28,[x29,#80] | |
801 ldp x29,x30,[sp],#96 | |
802 ret | |
803 .size ChaCha20_neon,.-ChaCha20_neon | |
804 .type ChaCha20_512_neon,%function | |
805 .align 5 | |
806 ChaCha20_512_neon: | |
807 stp x29,x30,[sp,#-96]! | |
808 add x29,sp,#0 | |
809 | |
810 adr x5,.Lsigma | |
811 stp x19,x20,[sp,#16] | |
812 stp x21,x22,[sp,#32] | |
813 stp x23,x24,[sp,#48] | |
814 stp x25,x26,[sp,#64] | |
815 stp x27,x28,[sp,#80] | |
816 | |
817 .L512_or_more_neon: | |
818 sub sp,sp,#128+64 | |
819 | |
820 ldp x22,x23,[x5] // load sigma | |
821 ld1 {v24.4s},[x5],#16 | |
822 ldp x24,x25,[x3] // load key | |
823 ldp x26,x27,[x3,#16] | |
824 ld1 {v25.4s,v26.4s},[x3] | |
825 ldp x28,x30,[x4] // load counter | |
826 ld1 {v27.4s},[x4] | |
827 ld1 {v31.4s},[x5] | |
828 #ifdef __ARMEB__ | |
829 rev64 v24.4s,v24.4s | |
830 ror x24,x24,#32 | |
831 ror x25,x25,#32 | |
832 ror x26,x26,#32 | |
833 ror x27,x27,#32 | |
834 ror x28,x28,#32 | |
835 ror x30,x30,#32 | |
836 #endif | |
837 add v27.4s,v27.4s,v31.4s // += 1 | |
838 stp q24,q25,[sp,#0] // off-load key block, invariant part | |
839 add v27.4s,v27.4s,v31.4s // not typo | |
840 str q26,[sp,#32] | |
841 add v28.4s,v27.4s,v31.4s | |
842 add v29.4s,v28.4s,v31.4s | |
843 add v30.4s,v29.4s,v31.4s | |
844 shl v31.4s,v31.4s,#2 // 1 -> 4 | |
845 | |
846 stp d8,d9,[sp,#128+0] // meet ABI requirements | |
847 stp d10,d11,[sp,#128+16] | |
848 stp d12,d13,[sp,#128+32] | |
849 stp d14,d15,[sp,#128+48] | |
850 | |
851 sub x2,x2,#512 // not typo | |
852 | |
853 .Loop_outer_512_neon: | |
854 mov v0.16b,v24.16b | |
855 mov v4.16b,v24.16b | |
856 mov v8.16b,v24.16b | |
857 mov v12.16b,v24.16b | |
858 mov v16.16b,v24.16b | |
859 mov v20.16b,v24.16b | |
860 mov v1.16b,v25.16b | |
861 mov w5,w22 // unpack key block | |
862 mov v5.16b,v25.16b | |
863 lsr x6,x22,#32 | |
864 mov v9.16b,v25.16b | |
865 mov w7,w23 | |
866 mov v13.16b,v25.16b | |
867 lsr x8,x23,#32 | |
868 mov v17.16b,v25.16b | |
869 mov w9,w24 | |
870 mov v21.16b,v25.16b | |
871 lsr x10,x24,#32 | |
872 mov v3.16b,v27.16b | |
873 mov w11,w25 | |
874 mov v7.16b,v28.16b | |
875 lsr x12,x25,#32 | |
876 mov v11.16b,v29.16b | |
877 mov w13,w26 | |
878 mov v15.16b,v30.16b | |
879 lsr x14,x26,#32 | |
880 mov v2.16b,v26.16b | |
881 mov w15,w27 | |
882 mov v6.16b,v26.16b | |
883 lsr x16,x27,#32 | |
884 add v19.4s,v3.4s,v31.4s // +4 | |
885 mov w17,w28 | |
886 add v23.4s,v7.4s,v31.4s // +4 | |
887 lsr x19,x28,#32 | |
888 mov v10.16b,v26.16b | |
889 mov w20,w30 | |
890 mov v14.16b,v26.16b | |
891 lsr x21,x30,#32 | |
892 mov v18.16b,v26.16b | |
893 stp q27,q28,[sp,#48] // off-load key block, variable
part | |
894 mov v22.16b,v26.16b | |
895 str q29,[sp,#80] | |
896 | |
897 mov x4,#5 | |
898 subs x2,x2,#512 | |
899 .Loop_upper_neon: | |
900 sub x4,x4,#1 | |
901 add v0.4s,v0.4s,v1.4s | |
902 add w5,w5,w9 | |
903 add v4.4s,v4.4s,v5.4s | |
904 add w6,w6,w10 | |
905 add v8.4s,v8.4s,v9.4s | |
906 add w7,w7,w11 | |
907 add v12.4s,v12.4s,v13.4s | |
908 add w8,w8,w12 | |
909 add v16.4s,v16.4s,v17.4s | |
910 eor w17,w17,w5 | |
911 add v20.4s,v20.4s,v21.4s | |
912 eor w19,w19,w6 | |
913 eor v3.16b,v3.16b,v0.16b | |
914 eor w20,w20,w7 | |
915 eor v7.16b,v7.16b,v4.16b | |
916 eor w21,w21,w8 | |
917 eor v11.16b,v11.16b,v8.16b | |
918 ror w17,w17,#16 | |
919 eor v15.16b,v15.16b,v12.16b | |
920 ror w19,w19,#16 | |
921 eor v19.16b,v19.16b,v16.16b | |
922 ror w20,w20,#16 | |
923 eor v23.16b,v23.16b,v20.16b | |
924 ror w21,w21,#16 | |
925 rev32 v3.8h,v3.8h | |
926 add w13,w13,w17 | |
927 rev32 v7.8h,v7.8h | |
928 add w14,w14,w19 | |
929 rev32 v11.8h,v11.8h | |
930 add w15,w15,w20 | |
931 rev32 v15.8h,v15.8h | |
932 add w16,w16,w21 | |
933 rev32 v19.8h,v19.8h | |
934 eor w9,w9,w13 | |
935 rev32 v23.8h,v23.8h | |
936 eor w10,w10,w14 | |
937 add v2.4s,v2.4s,v3.4s | |
938 eor w11,w11,w15 | |
939 add v6.4s,v6.4s,v7.4s | |
940 eor w12,w12,w16 | |
941 add v10.4s,v10.4s,v11.4s | |
942 ror w9,w9,#20 | |
943 add v14.4s,v14.4s,v15.4s | |
944 ror w10,w10,#20 | |
945 add v18.4s,v18.4s,v19.4s | |
946 ror w11,w11,#20 | |
947 add v22.4s,v22.4s,v23.4s | |
948 ror w12,w12,#20 | |
949 eor v24.16b,v1.16b,v2.16b | |
950 add w5,w5,w9 | |
951 eor v25.16b,v5.16b,v6.16b | |
952 add w6,w6,w10 | |
953 eor v26.16b,v9.16b,v10.16b | |
954 add w7,w7,w11 | |
955 eor v27.16b,v13.16b,v14.16b | |
956 add w8,w8,w12 | |
957 eor v28.16b,v17.16b,v18.16b | |
958 eor w17,w17,w5 | |
959 eor v29.16b,v21.16b,v22.16b | |
960 eor w19,w19,w6 | |
961 ushr v1.4s,v24.4s,#20 | |
962 eor w20,w20,w7 | |
963 ushr v5.4s,v25.4s,#20 | |
964 eor w21,w21,w8 | |
965 ushr v9.4s,v26.4s,#20 | |
966 ror w17,w17,#24 | |
967 ushr v13.4s,v27.4s,#20 | |
968 ror w19,w19,#24 | |
969 ushr v17.4s,v28.4s,#20 | |
970 ror w20,w20,#24 | |
971 ushr v21.4s,v29.4s,#20 | |
972 ror w21,w21,#24 | |
973 sli v1.4s,v24.4s,#12 | |
974 add w13,w13,w17 | |
975 sli v5.4s,v25.4s,#12 | |
976 add w14,w14,w19 | |
977 sli v9.4s,v26.4s,#12 | |
978 add w15,w15,w20 | |
979 sli v13.4s,v27.4s,#12 | |
980 add w16,w16,w21 | |
981 sli v17.4s,v28.4s,#12 | |
982 eor w9,w9,w13 | |
983 sli v21.4s,v29.4s,#12 | |
984 eor w10,w10,w14 | |
985 add v0.4s,v0.4s,v1.4s | |
986 eor w11,w11,w15 | |
987 add v4.4s,v4.4s,v5.4s | |
988 eor w12,w12,w16 | |
989 add v8.4s,v8.4s,v9.4s | |
990 ror w9,w9,#25 | |
991 add v12.4s,v12.4s,v13.4s | |
992 ror w10,w10,#25 | |
993 add v16.4s,v16.4s,v17.4s | |
994 ror w11,w11,#25 | |
995 add v20.4s,v20.4s,v21.4s | |
996 ror w12,w12,#25 | |
997 eor v24.16b,v3.16b,v0.16b | |
998 add w5,w5,w10 | |
999 eor v25.16b,v7.16b,v4.16b | |
1000 add w6,w6,w11 | |
1001 eor v26.16b,v11.16b,v8.16b | |
1002 add w7,w7,w12 | |
1003 eor v27.16b,v15.16b,v12.16b | |
1004 add w8,w8,w9 | |
1005 eor v28.16b,v19.16b,v16.16b | |
1006 eor w21,w21,w5 | |
1007 eor v29.16b,v23.16b,v20.16b | |
1008 eor w17,w17,w6 | |
1009 ushr v3.4s,v24.4s,#24 | |
1010 eor w19,w19,w7 | |
1011 ushr v7.4s,v25.4s,#24 | |
1012 eor w20,w20,w8 | |
1013 ushr v11.4s,v26.4s,#24 | |
1014 ror w21,w21,#16 | |
1015 ushr v15.4s,v27.4s,#24 | |
1016 ror w17,w17,#16 | |
1017 ushr v19.4s,v28.4s,#24 | |
1018 ror w19,w19,#16 | |
1019 ushr v23.4s,v29.4s,#24 | |
1020 ror w20,w20,#16 | |
1021 sli v3.4s,v24.4s,#8 | |
1022 add w15,w15,w21 | |
1023 sli v7.4s,v25.4s,#8 | |
1024 add w16,w16,w17 | |
1025 sli v11.4s,v26.4s,#8 | |
1026 add w13,w13,w19 | |
1027 sli v15.4s,v27.4s,#8 | |
1028 add w14,w14,w20 | |
1029 sli v19.4s,v28.4s,#8 | |
1030 eor w10,w10,w15 | |
1031 sli v23.4s,v29.4s,#8 | |
1032 eor w11,w11,w16 | |
1033 add v2.4s,v2.4s,v3.4s | |
1034 eor w12,w12,w13 | |
1035 add v6.4s,v6.4s,v7.4s | |
1036 eor w9,w9,w14 | |
1037 add v10.4s,v10.4s,v11.4s | |
1038 ror w10,w10,#20 | |
1039 add v14.4s,v14.4s,v15.4s | |
1040 ror w11,w11,#20 | |
1041 add v18.4s,v18.4s,v19.4s | |
1042 ror w12,w12,#20 | |
1043 add v22.4s,v22.4s,v23.4s | |
1044 ror w9,w9,#20 | |
1045 eor v24.16b,v1.16b,v2.16b | |
1046 add w5,w5,w10 | |
1047 eor v25.16b,v5.16b,v6.16b | |
1048 add w6,w6,w11 | |
1049 eor v26.16b,v9.16b,v10.16b | |
1050 add w7,w7,w12 | |
1051 eor v27.16b,v13.16b,v14.16b | |
1052 add w8,w8,w9 | |
1053 eor v28.16b,v17.16b,v18.16b | |
1054 eor w21,w21,w5 | |
1055 eor v29.16b,v21.16b,v22.16b | |
1056 eor w17,w17,w6 | |
1057 ushr v1.4s,v24.4s,#25 | |
1058 eor w19,w19,w7 | |
1059 ushr v5.4s,v25.4s,#25 | |
1060 eor w20,w20,w8 | |
1061 ushr v9.4s,v26.4s,#25 | |
1062 ror w21,w21,#24 | |
1063 ushr v13.4s,v27.4s,#25 | |
1064 ror w17,w17,#24 | |
1065 ushr v17.4s,v28.4s,#25 | |
1066 ror w19,w19,#24 | |
1067 ushr v21.4s,v29.4s,#25 | |
1068 ror w20,w20,#24 | |
1069 sli v1.4s,v24.4s,#7 | |
1070 add w15,w15,w21 | |
1071 sli v5.4s,v25.4s,#7 | |
1072 add w16,w16,w17 | |
1073 sli v9.4s,v26.4s,#7 | |
1074 add w13,w13,w19 | |
1075 sli v13.4s,v27.4s,#7 | |
1076 add w14,w14,w20 | |
1077 sli v17.4s,v28.4s,#7 | |
1078 eor w10,w10,w15 | |
1079 sli v21.4s,v29.4s,#7 | |
1080 eor w11,w11,w16 | |
1081 ext v2.16b,v2.16b,v2.16b,#8 | |
1082 eor w12,w12,w13 | |
1083 ext v6.16b,v6.16b,v6.16b,#8 | |
1084 eor w9,w9,w14 | |
1085 ext v10.16b,v10.16b,v10.16b,#8 | |
1086 ror w10,w10,#25 | |
1087 ext v14.16b,v14.16b,v14.16b,#8 | |
1088 ror w11,w11,#25 | |
1089 ext v18.16b,v18.16b,v18.16b,#8 | |
1090 ror w12,w12,#25 | |
1091 ext v22.16b,v22.16b,v22.16b,#8 | |
1092 ror w9,w9,#25 | |
1093 ext v3.16b,v3.16b,v3.16b,#12 | |
1094 ext v7.16b,v7.16b,v7.16b,#12 | |
1095 ext v11.16b,v11.16b,v11.16b,#12 | |
1096 ext v15.16b,v15.16b,v15.16b,#12 | |
1097 ext v19.16b,v19.16b,v19.16b,#12 | |
1098 ext v23.16b,v23.16b,v23.16b,#12 | |
1099 ext v1.16b,v1.16b,v1.16b,#4 | |
1100 ext v5.16b,v5.16b,v5.16b,#4 | |
1101 ext v9.16b,v9.16b,v9.16b,#4 | |
1102 ext v13.16b,v13.16b,v13.16b,#4 | |
1103 ext v17.16b,v17.16b,v17.16b,#4 | |
1104 ext v21.16b,v21.16b,v21.16b,#4 | |
1105 add v0.4s,v0.4s,v1.4s | |
1106 add w5,w5,w9 | |
1107 add v4.4s,v4.4s,v5.4s | |
1108 add w6,w6,w10 | |
1109 add v8.4s,v8.4s,v9.4s | |
1110 add w7,w7,w11 | |
1111 add v12.4s,v12.4s,v13.4s | |
1112 add w8,w8,w12 | |
1113 add v16.4s,v16.4s,v17.4s | |
1114 eor w17,w17,w5 | |
1115 add v20.4s,v20.4s,v21.4s | |
1116 eor w19,w19,w6 | |
1117 eor v3.16b,v3.16b,v0.16b | |
1118 eor w20,w20,w7 | |
1119 eor v7.16b,v7.16b,v4.16b | |
1120 eor w21,w21,w8 | |
1121 eor v11.16b,v11.16b,v8.16b | |
1122 ror w17,w17,#16 | |
1123 eor v15.16b,v15.16b,v12.16b | |
1124 ror w19,w19,#16 | |
1125 eor v19.16b,v19.16b,v16.16b | |
1126 ror w20,w20,#16 | |
1127 eor v23.16b,v23.16b,v20.16b | |
1128 ror w21,w21,#16 | |
1129 rev32 v3.8h,v3.8h | |
1130 add w13,w13,w17 | |
1131 rev32 v7.8h,v7.8h | |
1132 add w14,w14,w19 | |
1133 rev32 v11.8h,v11.8h | |
1134 add w15,w15,w20 | |
1135 rev32 v15.8h,v15.8h | |
1136 add w16,w16,w21 | |
1137 rev32 v19.8h,v19.8h | |
1138 eor w9,w9,w13 | |
1139 rev32 v23.8h,v23.8h | |
1140 eor w10,w10,w14 | |
1141 add v2.4s,v2.4s,v3.4s | |
1142 eor w11,w11,w15 | |
1143 add v6.4s,v6.4s,v7.4s | |
1144 eor w12,w12,w16 | |
1145 add v10.4s,v10.4s,v11.4s | |
1146 ror w9,w9,#20 | |
1147 add v14.4s,v14.4s,v15.4s | |
1148 ror w10,w10,#20 | |
1149 add v18.4s,v18.4s,v19.4s | |
1150 ror w11,w11,#20 | |
1151 add v22.4s,v22.4s,v23.4s | |
1152 ror w12,w12,#20 | |
1153 eor v24.16b,v1.16b,v2.16b | |
1154 add w5,w5,w9 | |
1155 eor v25.16b,v5.16b,v6.16b | |
1156 add w6,w6,w10 | |
1157 eor v26.16b,v9.16b,v10.16b | |
1158 add w7,w7,w11 | |
1159 eor v27.16b,v13.16b,v14.16b | |
1160 add w8,w8,w12 | |
1161 eor v28.16b,v17.16b,v18.16b | |
1162 eor w17,w17,w5 | |
1163 eor v29.16b,v21.16b,v22.16b | |
1164 eor w19,w19,w6 | |
1165 ushr v1.4s,v24.4s,#20 | |
1166 eor w20,w20,w7 | |
1167 ushr v5.4s,v25.4s,#20 | |
1168 eor w21,w21,w8 | |
1169 ushr v9.4s,v26.4s,#20 | |
1170 ror w17,w17,#24 | |
1171 ushr v13.4s,v27.4s,#20 | |
1172 ror w19,w19,#24 | |
1173 ushr v17.4s,v28.4s,#20 | |
1174 ror w20,w20,#24 | |
1175 ushr v21.4s,v29.4s,#20 | |
1176 ror w21,w21,#24 | |
1177 sli v1.4s,v24.4s,#12 | |
1178 add w13,w13,w17 | |
1179 sli v5.4s,v25.4s,#12 | |
1180 add w14,w14,w19 | |
1181 sli v9.4s,v26.4s,#12 | |
1182 add w15,w15,w20 | |
1183 sli v13.4s,v27.4s,#12 | |
1184 add w16,w16,w21 | |
1185 sli v17.4s,v28.4s,#12 | |
1186 eor w9,w9,w13 | |
1187 sli v21.4s,v29.4s,#12 | |
1188 eor w10,w10,w14 | |
1189 add v0.4s,v0.4s,v1.4s | |
1190 eor w11,w11,w15 | |
1191 add v4.4s,v4.4s,v5.4s | |
1192 eor w12,w12,w16 | |
1193 add v8.4s,v8.4s,v9.4s | |
1194 ror w9,w9,#25 | |
1195 add v12.4s,v12.4s,v13.4s | |
1196 ror w10,w10,#25 | |
1197 add v16.4s,v16.4s,v17.4s | |
1198 ror w11,w11,#25 | |
1199 add v20.4s,v20.4s,v21.4s | |
1200 ror w12,w12,#25 | |
1201 eor v24.16b,v3.16b,v0.16b | |
1202 add w5,w5,w10 | |
1203 eor v25.16b,v7.16b,v4.16b | |
1204 add w6,w6,w11 | |
1205 eor v26.16b,v11.16b,v8.16b | |
1206 add w7,w7,w12 | |
1207 eor v27.16b,v15.16b,v12.16b | |
1208 add w8,w8,w9 | |
1209 eor v28.16b,v19.16b,v16.16b | |
1210 eor w21,w21,w5 | |
1211 eor v29.16b,v23.16b,v20.16b | |
1212 eor w17,w17,w6 | |
1213 ushr v3.4s,v24.4s,#24 | |
1214 eor w19,w19,w7 | |
1215 ushr v7.4s,v25.4s,#24 | |
1216 eor w20,w20,w8 | |
1217 ushr v11.4s,v26.4s,#24 | |
1218 ror w21,w21,#16 | |
1219 ushr v15.4s,v27.4s,#24 | |
1220 ror w17,w17,#16 | |
1221 ushr v19.4s,v28.4s,#24 | |
1222 ror w19,w19,#16 | |
1223 ushr v23.4s,v29.4s,#24 | |
1224 ror w20,w20,#16 | |
1225 sli v3.4s,v24.4s,#8 | |
1226 add w15,w15,w21 | |
1227 sli v7.4s,v25.4s,#8 | |
1228 add w16,w16,w17 | |
1229 sli v11.4s,v26.4s,#8 | |
1230 add w13,w13,w19 | |
1231 sli v15.4s,v27.4s,#8 | |
1232 add w14,w14,w20 | |
1233 sli v19.4s,v28.4s,#8 | |
1234 eor w10,w10,w15 | |
1235 sli v23.4s,v29.4s,#8 | |
1236 eor w11,w11,w16 | |
1237 add v2.4s,v2.4s,v3.4s | |
1238 eor w12,w12,w13 | |
1239 add v6.4s,v6.4s,v7.4s | |
1240 eor w9,w9,w14 | |
1241 add v10.4s,v10.4s,v11.4s | |
1242 ror w10,w10,#20 | |
1243 add v14.4s,v14.4s,v15.4s | |
1244 ror w11,w11,#20 | |
1245 add v18.4s,v18.4s,v19.4s | |
1246 ror w12,w12,#20 | |
1247 add v22.4s,v22.4s,v23.4s | |
1248 ror w9,w9,#20 | |
1249 eor v24.16b,v1.16b,v2.16b | |
1250 add w5,w5,w10 | |
1251 eor v25.16b,v5.16b,v6.16b | |
1252 add w6,w6,w11 | |
1253 eor v26.16b,v9.16b,v10.16b | |
1254 add w7,w7,w12 | |
1255 eor v27.16b,v13.16b,v14.16b | |
1256 add w8,w8,w9 | |
1257 eor v28.16b,v17.16b,v18.16b | |
1258 eor w21,w21,w5 | |
1259 eor v29.16b,v21.16b,v22.16b | |
1260 eor w17,w17,w6 | |
1261 ushr v1.4s,v24.4s,#25 | |
1262 eor w19,w19,w7 | |
1263 ushr v5.4s,v25.4s,#25 | |
1264 eor w20,w20,w8 | |
1265 ushr v9.4s,v26.4s,#25 | |
1266 ror w21,w21,#24 | |
1267 ushr v13.4s,v27.4s,#25 | |
1268 ror w17,w17,#24 | |
1269 ushr v17.4s,v28.4s,#25 | |
1270 ror w19,w19,#24 | |
1271 ushr v21.4s,v29.4s,#25 | |
1272 ror w20,w20,#24 | |
1273 sli v1.4s,v24.4s,#7 | |
1274 add w15,w15,w21 | |
1275 sli v5.4s,v25.4s,#7 | |
1276 add w16,w16,w17 | |
1277 sli v9.4s,v26.4s,#7 | |
1278 add w13,w13,w19 | |
1279 sli v13.4s,v27.4s,#7 | |
1280 add w14,w14,w20 | |
1281 sli v17.4s,v28.4s,#7 | |
1282 eor w10,w10,w15 | |
1283 sli v21.4s,v29.4s,#7 | |
1284 eor w11,w11,w16 | |
1285 ext v2.16b,v2.16b,v2.16b,#8 | |
1286 eor w12,w12,w13 | |
1287 ext v6.16b,v6.16b,v6.16b,#8 | |
1288 eor w9,w9,w14 | |
1289 ext v10.16b,v10.16b,v10.16b,#8 | |
1290 ror w10,w10,#25 | |
1291 ext v14.16b,v14.16b,v14.16b,#8 | |
1292 ror w11,w11,#25 | |
1293 ext v18.16b,v18.16b,v18.16b,#8 | |
1294 ror w12,w12,#25 | |
1295 ext v22.16b,v22.16b,v22.16b,#8 | |
1296 ror w9,w9,#25 | |
1297 ext v3.16b,v3.16b,v3.16b,#4 | |
1298 ext v7.16b,v7.16b,v7.16b,#4 | |
1299 ext v11.16b,v11.16b,v11.16b,#4 | |
1300 ext v15.16b,v15.16b,v15.16b,#4 | |
1301 ext v19.16b,v19.16b,v19.16b,#4 | |
1302 ext v23.16b,v23.16b,v23.16b,#4 | |
1303 ext v1.16b,v1.16b,v1.16b,#12 | |
1304 ext v5.16b,v5.16b,v5.16b,#12 | |
1305 ext v9.16b,v9.16b,v9.16b,#12 | |
1306 ext v13.16b,v13.16b,v13.16b,#12 | |
1307 ext v17.16b,v17.16b,v17.16b,#12 | |
1308 ext v21.16b,v21.16b,v21.16b,#12 | |
1309 cbnz x4,.Loop_upper_neon | |
1310 | |
1311 add w5,w5,w22 // accumulate key block | |
1312 add x6,x6,x22,lsr#32 | |
1313 add w7,w7,w23 | |
1314 add x8,x8,x23,lsr#32 | |
1315 add w9,w9,w24 | |
1316 add x10,x10,x24,lsr#32 | |
1317 add w11,w11,w25 | |
1318 add x12,x12,x25,lsr#32 | |
1319 add w13,w13,w26 | |
1320 add x14,x14,x26,lsr#32 | |
1321 add w15,w15,w27 | |
1322 add x16,x16,x27,lsr#32 | |
1323 add w17,w17,w28 | |
1324 add x19,x19,x28,lsr#32 | |
1325 add w20,w20,w30 | |
1326 add x21,x21,x30,lsr#32 | |
1327 | |
1328 add x5,x5,x6,lsl#32 // pack | |
1329 add x7,x7,x8,lsl#32 | |
1330 ldp x6,x8,[x1,#0] // load input | |
1331 add x9,x9,x10,lsl#32 | |
1332 add x11,x11,x12,lsl#32 | |
1333 ldp x10,x12,[x1,#16] | |
1334 add x13,x13,x14,lsl#32 | |
1335 add x15,x15,x16,lsl#32 | |
1336 ldp x14,x16,[x1,#32] | |
1337 add x17,x17,x19,lsl#32 | |
1338 add x20,x20,x21,lsl#32 | |
1339 ldp x19,x21,[x1,#48] | |
1340 add x1,x1,#64 | |
1341 #ifdef __ARMEB__ | |
1342 rev x5,x5 | |
1343 rev x7,x7 | |
1344 rev x9,x9 | |
1345 rev x11,x11 | |
1346 rev x13,x13 | |
1347 rev x15,x15 | |
1348 rev x17,x17 | |
1349 rev x20,x20 | |
1350 #endif | |
1351 eor x5,x5,x6 | |
1352 eor x7,x7,x8 | |
1353 eor x9,x9,x10 | |
1354 eor x11,x11,x12 | |
1355 eor x13,x13,x14 | |
1356 eor x15,x15,x16 | |
1357 eor x17,x17,x19 | |
1358 eor x20,x20,x21 | |
1359 | |
1360 stp x5,x7,[x0,#0] // store output | |
1361 add x28,x28,#1 // increment counter | |
1362 mov w5,w22 // unpack key block | |
1363 lsr x6,x22,#32 | |
1364 stp x9,x11,[x0,#16] | |
1365 mov w7,w23 | |
1366 lsr x8,x23,#32 | |
1367 stp x13,x15,[x0,#32] | |
1368 mov w9,w24 | |
1369 lsr x10,x24,#32 | |
1370 stp x17,x20,[x0,#48] | |
1371 add x0,x0,#64 | |
1372 mov w11,w25 | |
1373 lsr x12,x25,#32 | |
1374 mov w13,w26 | |
1375 lsr x14,x26,#32 | |
1376 mov w15,w27 | |
1377 lsr x16,x27,#32 | |
1378 mov w17,w28 | |
1379 lsr x19,x28,#32 | |
1380 mov w20,w30 | |
1381 lsr x21,x30,#32 | |
1382 | |
1383 mov x4,#5 | |
1384 .Loop_lower_neon: | |
1385 sub x4,x4,#1 | |
1386 add v0.4s,v0.4s,v1.4s | |
1387 add w5,w5,w9 | |
1388 add v4.4s,v4.4s,v5.4s | |
1389 add w6,w6,w10 | |
1390 add v8.4s,v8.4s,v9.4s | |
1391 add w7,w7,w11 | |
1392 add v12.4s,v12.4s,v13.4s | |
1393 add w8,w8,w12 | |
1394 add v16.4s,v16.4s,v17.4s | |
1395 eor w17,w17,w5 | |
1396 add v20.4s,v20.4s,v21.4s | |
1397 eor w19,w19,w6 | |
1398 eor v3.16b,v3.16b,v0.16b | |
1399 eor w20,w20,w7 | |
1400 eor v7.16b,v7.16b,v4.16b | |
1401 eor w21,w21,w8 | |
1402 eor v11.16b,v11.16b,v8.16b | |
1403 ror w17,w17,#16 | |
1404 eor v15.16b,v15.16b,v12.16b | |
1405 ror w19,w19,#16 | |
1406 eor v19.16b,v19.16b,v16.16b | |
1407 ror w20,w20,#16 | |
1408 eor v23.16b,v23.16b,v20.16b | |
1409 ror w21,w21,#16 | |
1410 rev32 v3.8h,v3.8h | |
1411 add w13,w13,w17 | |
1412 rev32 v7.8h,v7.8h | |
1413 add w14,w14,w19 | |
1414 rev32 v11.8h,v11.8h | |
1415 add w15,w15,w20 | |
1416 rev32 v15.8h,v15.8h | |
1417 add w16,w16,w21 | |
1418 rev32 v19.8h,v19.8h | |
1419 eor w9,w9,w13 | |
1420 rev32 v23.8h,v23.8h | |
1421 eor w10,w10,w14 | |
1422 add v2.4s,v2.4s,v3.4s | |
1423 eor w11,w11,w15 | |
1424 add v6.4s,v6.4s,v7.4s | |
1425 eor w12,w12,w16 | |
1426 add v10.4s,v10.4s,v11.4s | |
1427 ror w9,w9,#20 | |
1428 add v14.4s,v14.4s,v15.4s | |
1429 ror w10,w10,#20 | |
1430 add v18.4s,v18.4s,v19.4s | |
1431 ror w11,w11,#20 | |
1432 add v22.4s,v22.4s,v23.4s | |
1433 ror w12,w12,#20 | |
1434 eor v24.16b,v1.16b,v2.16b | |
1435 add w5,w5,w9 | |
1436 eor v25.16b,v5.16b,v6.16b | |
1437 add w6,w6,w10 | |
1438 eor v26.16b,v9.16b,v10.16b | |
1439 add w7,w7,w11 | |
1440 eor v27.16b,v13.16b,v14.16b | |
1441 add w8,w8,w12 | |
1442 eor v28.16b,v17.16b,v18.16b | |
1443 eor w17,w17,w5 | |
1444 eor v29.16b,v21.16b,v22.16b | |
1445 eor w19,w19,w6 | |
1446 ushr v1.4s,v24.4s,#20 | |
1447 eor w20,w20,w7 | |
1448 ushr v5.4s,v25.4s,#20 | |
1449 eor w21,w21,w8 | |
1450 ushr v9.4s,v26.4s,#20 | |
1451 ror w17,w17,#24 | |
1452 ushr v13.4s,v27.4s,#20 | |
1453 ror w19,w19,#24 | |
1454 ushr v17.4s,v28.4s,#20 | |
1455 ror w20,w20,#24 | |
1456 ushr v21.4s,v29.4s,#20 | |
1457 ror w21,w21,#24 | |
1458 sli v1.4s,v24.4s,#12 | |
1459 add w13,w13,w17 | |
1460 sli v5.4s,v25.4s,#12 | |
1461 add w14,w14,w19 | |
1462 sli v9.4s,v26.4s,#12 | |
1463 add w15,w15,w20 | |
1464 sli v13.4s,v27.4s,#12 | |
1465 add w16,w16,w21 | |
1466 sli v17.4s,v28.4s,#12 | |
1467 eor w9,w9,w13 | |
1468 sli v21.4s,v29.4s,#12 | |
1469 eor w10,w10,w14 | |
1470 add v0.4s,v0.4s,v1.4s | |
1471 eor w11,w11,w15 | |
1472 add v4.4s,v4.4s,v5.4s | |
1473 eor w12,w12,w16 | |
1474 add v8.4s,v8.4s,v9.4s | |
1475 ror w9,w9,#25 | |
1476 add v12.4s,v12.4s,v13.4s | |
1477 ror w10,w10,#25 | |
1478 add v16.4s,v16.4s,v17.4s | |
1479 ror w11,w11,#25 | |
1480 add v20.4s,v20.4s,v21.4s | |
1481 ror w12,w12,#25 | |
1482 eor v24.16b,v3.16b,v0.16b | |
1483 add w5,w5,w10 | |
1484 eor v25.16b,v7.16b,v4.16b | |
1485 add w6,w6,w11 | |
1486 eor v26.16b,v11.16b,v8.16b | |
1487 add w7,w7,w12 | |
1488 eor v27.16b,v15.16b,v12.16b | |
1489 add w8,w8,w9 | |
1490 eor v28.16b,v19.16b,v16.16b | |
1491 eor w21,w21,w5 | |
1492 eor v29.16b,v23.16b,v20.16b | |
1493 eor w17,w17,w6 | |
1494 ushr v3.4s,v24.4s,#24 | |
1495 eor w19,w19,w7 | |
1496 ushr v7.4s,v25.4s,#24 | |
1497 eor w20,w20,w8 | |
1498 ushr v11.4s,v26.4s,#24 | |
1499 ror w21,w21,#16 | |
1500 ushr v15.4s,v27.4s,#24 | |
1501 ror w17,w17,#16 | |
1502 ushr v19.4s,v28.4s,#24 | |
1503 ror w19,w19,#16 | |
1504 ushr v23.4s,v29.4s,#24 | |
1505 ror w20,w20,#16 | |
1506 sli v3.4s,v24.4s,#8 | |
1507 add w15,w15,w21 | |
1508 sli v7.4s,v25.4s,#8 | |
1509 add w16,w16,w17 | |
1510 sli v11.4s,v26.4s,#8 | |
1511 add w13,w13,w19 | |
1512 sli v15.4s,v27.4s,#8 | |
1513 add w14,w14,w20 | |
1514 sli v19.4s,v28.4s,#8 | |
1515 eor w10,w10,w15 | |
1516 sli v23.4s,v29.4s,#8 | |
1517 eor w11,w11,w16 | |
1518 add v2.4s,v2.4s,v3.4s | |
1519 eor w12,w12,w13 | |
1520 add v6.4s,v6.4s,v7.4s | |
1521 eor w9,w9,w14 | |
1522 add v10.4s,v10.4s,v11.4s | |
1523 ror w10,w10,#20 | |
1524 add v14.4s,v14.4s,v15.4s | |
1525 ror w11,w11,#20 | |
1526 add v18.4s,v18.4s,v19.4s | |
1527 ror w12,w12,#20 | |
1528 add v22.4s,v22.4s,v23.4s | |
1529 ror w9,w9,#20 | |
1530 eor v24.16b,v1.16b,v2.16b | |
1531 add w5,w5,w10 | |
1532 eor v25.16b,v5.16b,v6.16b | |
1533 add w6,w6,w11 | |
1534 eor v26.16b,v9.16b,v10.16b | |
1535 add w7,w7,w12 | |
1536 eor v27.16b,v13.16b,v14.16b | |
1537 add w8,w8,w9 | |
1538 eor v28.16b,v17.16b,v18.16b | |
1539 eor w21,w21,w5 | |
1540 eor v29.16b,v21.16b,v22.16b | |
1541 eor w17,w17,w6 | |
1542 ushr v1.4s,v24.4s,#25 | |
1543 eor w19,w19,w7 | |
1544 ushr v5.4s,v25.4s,#25 | |
1545 eor w20,w20,w8 | |
1546 ushr v9.4s,v26.4s,#25 | |
1547 ror w21,w21,#24 | |
1548 ushr v13.4s,v27.4s,#25 | |
1549 ror w17,w17,#24 | |
1550 ushr v17.4s,v28.4s,#25 | |
1551 ror w19,w19,#24 | |
1552 ushr v21.4s,v29.4s,#25 | |
1553 ror w20,w20,#24 | |
1554 sli v1.4s,v24.4s,#7 | |
1555 add w15,w15,w21 | |
1556 sli v5.4s,v25.4s,#7 | |
1557 add w16,w16,w17 | |
1558 sli v9.4s,v26.4s,#7 | |
1559 add w13,w13,w19 | |
1560 sli v13.4s,v27.4s,#7 | |
1561 add w14,w14,w20 | |
1562 sli v17.4s,v28.4s,#7 | |
1563 eor w10,w10,w15 | |
1564 sli v21.4s,v29.4s,#7 | |
1565 eor w11,w11,w16 | |
1566 ext v2.16b,v2.16b,v2.16b,#8 | |
1567 eor w12,w12,w13 | |
1568 ext v6.16b,v6.16b,v6.16b,#8 | |
1569 eor w9,w9,w14 | |
1570 ext v10.16b,v10.16b,v10.16b,#8 | |
1571 ror w10,w10,#25 | |
1572 ext v14.16b,v14.16b,v14.16b,#8 | |
1573 ror w11,w11,#25 | |
1574 ext v18.16b,v18.16b,v18.16b,#8 | |
1575 ror w12,w12,#25 | |
1576 ext v22.16b,v22.16b,v22.16b,#8 | |
1577 ror w9,w9,#25 | |
1578 ext v3.16b,v3.16b,v3.16b,#12 | |
1579 ext v7.16b,v7.16b,v7.16b,#12 | |
1580 ext v11.16b,v11.16b,v11.16b,#12 | |
1581 ext v15.16b,v15.16b,v15.16b,#12 | |
1582 ext v19.16b,v19.16b,v19.16b,#12 | |
1583 ext v23.16b,v23.16b,v23.16b,#12 | |
1584 ext v1.16b,v1.16b,v1.16b,#4 | |
1585 ext v5.16b,v5.16b,v5.16b,#4 | |
1586 ext v9.16b,v9.16b,v9.16b,#4 | |
1587 ext v13.16b,v13.16b,v13.16b,#4 | |
1588 ext v17.16b,v17.16b,v17.16b,#4 | |
1589 ext v21.16b,v21.16b,v21.16b,#4 | |
1590 add v0.4s,v0.4s,v1.4s | |
1591 add w5,w5,w9 | |
1592 add v4.4s,v4.4s,v5.4s | |
1593 add w6,w6,w10 | |
1594 add v8.4s,v8.4s,v9.4s | |
1595 add w7,w7,w11 | |
1596 add v12.4s,v12.4s,v13.4s | |
1597 add w8,w8,w12 | |
1598 add v16.4s,v16.4s,v17.4s | |
1599 eor w17,w17,w5 | |
1600 add v20.4s,v20.4s,v21.4s | |
1601 eor w19,w19,w6 | |
1602 eor v3.16b,v3.16b,v0.16b | |
1603 eor w20,w20,w7 | |
1604 eor v7.16b,v7.16b,v4.16b | |
1605 eor w21,w21,w8 | |
1606 eor v11.16b,v11.16b,v8.16b | |
1607 ror w17,w17,#16 | |
1608 eor v15.16b,v15.16b,v12.16b | |
1609 ror w19,w19,#16 | |
1610 eor v19.16b,v19.16b,v16.16b | |
1611 ror w20,w20,#16 | |
1612 eor v23.16b,v23.16b,v20.16b | |
1613 ror w21,w21,#16 | |
1614 rev32 v3.8h,v3.8h | |
1615 add w13,w13,w17 | |
1616 rev32 v7.8h,v7.8h | |
1617 add w14,w14,w19 | |
1618 rev32 v11.8h,v11.8h | |
1619 add w15,w15,w20 | |
1620 rev32 v15.8h,v15.8h | |
1621 add w16,w16,w21 | |
1622 rev32 v19.8h,v19.8h | |
1623 eor w9,w9,w13 | |
1624 rev32 v23.8h,v23.8h | |
1625 eor w10,w10,w14 | |
1626 add v2.4s,v2.4s,v3.4s | |
1627 eor w11,w11,w15 | |
1628 add v6.4s,v6.4s,v7.4s | |
1629 eor w12,w12,w16 | |
1630 add v10.4s,v10.4s,v11.4s | |
1631 ror w9,w9,#20 | |
1632 add v14.4s,v14.4s,v15.4s | |
1633 ror w10,w10,#20 | |
1634 add v18.4s,v18.4s,v19.4s | |
1635 ror w11,w11,#20 | |
1636 add v22.4s,v22.4s,v23.4s | |
1637 ror w12,w12,#20 | |
1638 eor v24.16b,v1.16b,v2.16b | |
1639 add w5,w5,w9 | |
1640 eor v25.16b,v5.16b,v6.16b | |
1641 add w6,w6,w10 | |
1642 eor v26.16b,v9.16b,v10.16b | |
1643 add w7,w7,w11 | |
1644 eor v27.16b,v13.16b,v14.16b | |
1645 add w8,w8,w12 | |
1646 eor v28.16b,v17.16b,v18.16b | |
1647 eor w17,w17,w5 | |
1648 eor v29.16b,v21.16b,v22.16b | |
1649 eor w19,w19,w6 | |
1650 ushr v1.4s,v24.4s,#20 | |
1651 eor w20,w20,w7 | |
1652 ushr v5.4s,v25.4s,#20 | |
1653 eor w21,w21,w8 | |
1654 ushr v9.4s,v26.4s,#20 | |
1655 ror w17,w17,#24 | |
1656 ushr v13.4s,v27.4s,#20 | |
1657 ror w19,w19,#24 | |
1658 ushr v17.4s,v28.4s,#20 | |
1659 ror w20,w20,#24 | |
1660 ushr v21.4s,v29.4s,#20 | |
1661 ror w21,w21,#24 | |
1662 sli v1.4s,v24.4s,#12 | |
1663 add w13,w13,w17 | |
1664 sli v5.4s,v25.4s,#12 | |
1665 add w14,w14,w19 | |
1666 sli v9.4s,v26.4s,#12 | |
1667 add w15,w15,w20 | |
1668 sli v13.4s,v27.4s,#12 | |
1669 add w16,w16,w21 | |
1670 sli v17.4s,v28.4s,#12 | |
1671 eor w9,w9,w13 | |
1672 sli v21.4s,v29.4s,#12 | |
1673 eor w10,w10,w14 | |
1674 add v0.4s,v0.4s,v1.4s | |
1675 eor w11,w11,w15 | |
1676 add v4.4s,v4.4s,v5.4s | |
1677 eor w12,w12,w16 | |
1678 add v8.4s,v8.4s,v9.4s | |
1679 ror w9,w9,#25 | |
1680 add v12.4s,v12.4s,v13.4s | |
1681 ror w10,w10,#25 | |
1682 add v16.4s,v16.4s,v17.4s | |
1683 ror w11,w11,#25 | |
1684 add v20.4s,v20.4s,v21.4s | |
1685 ror w12,w12,#25 | |
1686 eor v24.16b,v3.16b,v0.16b | |
1687 add w5,w5,w10 | |
1688 eor v25.16b,v7.16b,v4.16b | |
1689 add w6,w6,w11 | |
1690 eor v26.16b,v11.16b,v8.16b | |
1691 add w7,w7,w12 | |
1692 eor v27.16b,v15.16b,v12.16b | |
1693 add w8,w8,w9 | |
1694 eor v28.16b,v19.16b,v16.16b | |
1695 eor w21,w21,w5 | |
1696 eor v29.16b,v23.16b,v20.16b | |
1697 eor w17,w17,w6 | |
1698 ushr v3.4s,v24.4s,#24 | |
1699 eor w19,w19,w7 | |
1700 ushr v7.4s,v25.4s,#24 | |
1701 eor w20,w20,w8 | |
1702 ushr v11.4s,v26.4s,#24 | |
1703 ror w21,w21,#16 | |
1704 ushr v15.4s,v27.4s,#24 | |
1705 ror w17,w17,#16 | |
1706 ushr v19.4s,v28.4s,#24 | |
1707 ror w19,w19,#16 | |
1708 ushr v23.4s,v29.4s,#24 | |
1709 ror w20,w20,#16 | |
1710 sli v3.4s,v24.4s,#8 | |
1711 add w15,w15,w21 | |
1712 sli v7.4s,v25.4s,#8 | |
1713 add w16,w16,w17 | |
1714 sli v11.4s,v26.4s,#8 | |
1715 add w13,w13,w19 | |
1716 sli v15.4s,v27.4s,#8 | |
1717 add w14,w14,w20 | |
1718 sli v19.4s,v28.4s,#8 | |
1719 eor w10,w10,w15 | |
1720 sli v23.4s,v29.4s,#8 | |
1721 eor w11,w11,w16 | |
1722 add v2.4s,v2.4s,v3.4s | |
1723 eor w12,w12,w13 | |
1724 add v6.4s,v6.4s,v7.4s | |
1725 eor w9,w9,w14 | |
1726 add v10.4s,v10.4s,v11.4s | |
1727 ror w10,w10,#20 | |
1728 add v14.4s,v14.4s,v15.4s | |
1729 ror w11,w11,#20 | |
1730 add v18.4s,v18.4s,v19.4s | |
1731 ror w12,w12,#20 | |
1732 add v22.4s,v22.4s,v23.4s | |
1733 ror w9,w9,#20 | |
1734 eor v24.16b,v1.16b,v2.16b | |
1735 add w5,w5,w10 | |
1736 eor v25.16b,v5.16b,v6.16b | |
1737 add w6,w6,w11 | |
1738 eor v26.16b,v9.16b,v10.16b | |
1739 add w7,w7,w12 | |
1740 eor v27.16b,v13.16b,v14.16b | |
1741 add w8,w8,w9 | |
1742 eor v28.16b,v17.16b,v18.16b | |
1743 eor w21,w21,w5 | |
1744 eor v29.16b,v21.16b,v22.16b | |
1745 eor w17,w17,w6 | |
1746 ushr v1.4s,v24.4s,#25 | |
1747 eor w19,w19,w7 | |
1748 ushr v5.4s,v25.4s,#25 | |
1749 eor w20,w20,w8 | |
1750 ushr v9.4s,v26.4s,#25 | |
1751 ror w21,w21,#24 | |
1752 ushr v13.4s,v27.4s,#25 | |
1753 ror w17,w17,#24 | |
1754 ushr v17.4s,v28.4s,#25 | |
1755 ror w19,w19,#24 | |
1756 ushr v21.4s,v29.4s,#25 | |
1757 ror w20,w20,#24 | |
1758 sli v1.4s,v24.4s,#7 | |
1759 add w15,w15,w21 | |
1760 sli v5.4s,v25.4s,#7 | |
1761 add w16,w16,w17 | |
1762 sli v9.4s,v26.4s,#7 | |
1763 add w13,w13,w19 | |
1764 sli v13.4s,v27.4s,#7 | |
1765 add w14,w14,w20 | |
1766 sli v17.4s,v28.4s,#7 | |
1767 eor w10,w10,w15 | |
1768 sli v21.4s,v29.4s,#7 | |
1769 eor w11,w11,w16 | |
1770 ext v2.16b,v2.16b,v2.16b,#8 | |
1771 eor w12,w12,w13 | |
1772 ext v6.16b,v6.16b,v6.16b,#8 | |
1773 eor w9,w9,w14 | |
1774 ext v10.16b,v10.16b,v10.16b,#8 | |
1775 ror w10,w10,#25 | |
1776 ext v14.16b,v14.16b,v14.16b,#8 | |
1777 ror w11,w11,#25 | |
1778 ext v18.16b,v18.16b,v18.16b,#8 | |
1779 ror w12,w12,#25 | |
1780 ext v22.16b,v22.16b,v22.16b,#8 | |
1781 ror w9,w9,#25 | |
1782 ext v3.16b,v3.16b,v3.16b,#4 | |
1783 ext v7.16b,v7.16b,v7.16b,#4 | |
1784 ext v11.16b,v11.16b,v11.16b,#4 | |
1785 ext v15.16b,v15.16b,v15.16b,#4 | |
1786 ext v19.16b,v19.16b,v19.16b,#4 | |
1787 ext v23.16b,v23.16b,v23.16b,#4 | |
1788 ext v1.16b,v1.16b,v1.16b,#12 | |
1789 ext v5.16b,v5.16b,v5.16b,#12 | |
1790 ext v9.16b,v9.16b,v9.16b,#12 | |
1791 ext v13.16b,v13.16b,v13.16b,#12 | |
1792 ext v17.16b,v17.16b,v17.16b,#12 | |
1793 ext v21.16b,v21.16b,v21.16b,#12 | |
1794 cbnz x4,.Loop_lower_neon | |
1795 | |
1796 add w5,w5,w22 // accumulate key block | |
1797 ldp q24,q25,[sp,#0] | |
1798 add x6,x6,x22,lsr#32 | |
1799 ldp q26,q27,[sp,#32] | |
1800 add w7,w7,w23 | |
1801 ldp q28,q29,[sp,#64] | |
1802 add x8,x8,x23,lsr#32 | |
1803 add v0.4s,v0.4s,v24.4s | |
1804 add w9,w9,w24 | |
1805 add v4.4s,v4.4s,v24.4s | |
1806 add x10,x10,x24,lsr#32 | |
1807 add v8.4s,v8.4s,v24.4s | |
1808 add w11,w11,w25 | |
1809 add v12.4s,v12.4s,v24.4s | |
1810 add x12,x12,x25,lsr#32 | |
1811 add v16.4s,v16.4s,v24.4s | |
1812 add w13,w13,w26 | |
1813 add v20.4s,v20.4s,v24.4s | |
1814 add x14,x14,x26,lsr#32 | |
1815 add v2.4s,v2.4s,v26.4s | |
1816 add w15,w15,w27 | |
1817 add v6.4s,v6.4s,v26.4s | |
1818 add x16,x16,x27,lsr#32 | |
1819 add v10.4s,v10.4s,v26.4s | |
1820 add w17,w17,w28 | |
1821 add v14.4s,v14.4s,v26.4s | |
1822 add x19,x19,x28,lsr#32 | |
1823 add v18.4s,v18.4s,v26.4s | |
1824 add w20,w20,w30 | |
1825 add v22.4s,v22.4s,v26.4s | |
1826 add x21,x21,x30,lsr#32 | |
1827 add v19.4s,v19.4s,v31.4s // +4 | |
1828 add x5,x5,x6,lsl#32 // pack | |
1829 add v23.4s,v23.4s,v31.4s // +4 | |
1830 add x7,x7,x8,lsl#32 | |
1831 add v3.4s,v3.4s,v27.4s | |
1832 ldp x6,x8,[x1,#0] // load input | |
1833 add v7.4s,v7.4s,v28.4s | |
1834 add x9,x9,x10,lsl#32 | |
1835 add v11.4s,v11.4s,v29.4s | |
1836 add x11,x11,x12,lsl#32 | |
1837 add v15.4s,v15.4s,v30.4s | |
1838 ldp x10,x12,[x1,#16] | |
1839 add v19.4s,v19.4s,v27.4s | |
1840 add x13,x13,x14,lsl#32 | |
1841 add v23.4s,v23.4s,v28.4s | |
1842 add x15,x15,x16,lsl#32 | |
1843 add v1.4s,v1.4s,v25.4s | |
1844 ldp x14,x16,[x1,#32] | |
1845 add v5.4s,v5.4s,v25.4s | |
1846 add x17,x17,x19,lsl#32 | |
1847 add v9.4s,v9.4s,v25.4s | |
1848 add x20,x20,x21,lsl#32 | |
1849 add v13.4s,v13.4s,v25.4s | |
1850 ldp x19,x21,[x1,#48] | |
1851 add v17.4s,v17.4s,v25.4s | |
1852 add x1,x1,#64 | |
1853 add v21.4s,v21.4s,v25.4s | |
1854 | |
1855 #ifdef __ARMEB__ | |
1856 rev x5,x5 | |
1857 rev x7,x7 | |
1858 rev x9,x9 | |
1859 rev x11,x11 | |
1860 rev x13,x13 | |
1861 rev x15,x15 | |
1862 rev x17,x17 | |
1863 rev x20,x20 | |
1864 #endif | |
1865 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 | |
1866 eor x5,x5,x6 | |
1867 eor x7,x7,x8 | |
1868 eor x9,x9,x10 | |
1869 eor x11,x11,x12 | |
1870 eor x13,x13,x14 | |
1871 eor v0.16b,v0.16b,v24.16b | |
1872 eor x15,x15,x16 | |
1873 eor v1.16b,v1.16b,v25.16b | |
1874 eor x17,x17,x19 | |
1875 eor v2.16b,v2.16b,v26.16b | |
1876 eor x20,x20,x21 | |
1877 eor v3.16b,v3.16b,v27.16b | |
1878 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 | |
1879 | |
1880 stp x5,x7,[x0,#0] // store output | |
1881 add x28,x28,#7 // increment counter | |
1882 stp x9,x11,[x0,#16] | |
1883 stp x13,x15,[x0,#32] | |
1884 stp x17,x20,[x0,#48] | |
1885 add x0,x0,#64 | |
1886 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 | |
1887 | |
1888 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 | |
1889 eor v4.16b,v4.16b,v24.16b | |
1890 eor v5.16b,v5.16b,v25.16b | |
1891 eor v6.16b,v6.16b,v26.16b | |
1892 eor v7.16b,v7.16b,v27.16b | |
1893 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 | |
1894 | |
1895 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 | |
1896 eor v8.16b,v8.16b,v0.16b | |
1897 ldp q24,q25,[sp,#0] | |
1898 eor v9.16b,v9.16b,v1.16b | |
1899 ldp q26,q27,[sp,#32] | |
1900 eor v10.16b,v10.16b,v2.16b | |
1901 eor v11.16b,v11.16b,v3.16b | |
1902 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 | |
1903 | |
1904 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 | |
1905 eor v12.16b,v12.16b,v4.16b | |
1906 eor v13.16b,v13.16b,v5.16b | |
1907 eor v14.16b,v14.16b,v6.16b | |
1908 eor v15.16b,v15.16b,v7.16b | |
1909 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 | |
1910 | |
1911 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 | |
1912 eor v16.16b,v16.16b,v8.16b | |
1913 eor v17.16b,v17.16b,v9.16b | |
1914 eor v18.16b,v18.16b,v10.16b | |
1915 eor v19.16b,v19.16b,v11.16b | |
1916 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 | |
1917 | |
1918 shl v0.4s,v31.4s,#1 // 4 -> 8 | |
1919 eor v20.16b,v20.16b,v12.16b | |
1920 eor v21.16b,v21.16b,v13.16b | |
1921 eor v22.16b,v22.16b,v14.16b | |
1922 eor v23.16b,v23.16b,v15.16b | |
1923 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 | |
1924 | |
1925 add v27.4s,v27.4s,v0.4s // += 8 | |
1926 add v28.4s,v28.4s,v0.4s | |
1927 add v29.4s,v29.4s,v0.4s | |
1928 add v30.4s,v30.4s,v0.4s | |
1929 | |
1930 b.hs .Loop_outer_512_neon | |
1931 | |
1932 adds x2,x2,#512 | |
1933 ushr v0.4s,v31.4s,#2 // 4 -> 1 | |
1934 | |
1935 ldp d8,d9,[sp,#128+0] // meet ABI requirements | |
1936 ldp d10,d11,[sp,#128+16] | |
1937 ldp d12,d13,[sp,#128+32] | |
1938 ldp d14,d15,[sp,#128+48] | |
1939 | |
1940 stp q24,q31,[sp,#0] // wipe off-load area | |
1941 stp q24,q31,[sp,#32] | |
1942 stp q24,q31,[sp,#64] | |
1943 | |
1944 b.eq .Ldone_512_neon | |
1945 | |
1946 cmp x2,#192 | |
1947 sub v27.4s,v27.4s,v0.4s // -= 1 | |
1948 sub v28.4s,v28.4s,v0.4s | |
1949 sub v29.4s,v29.4s,v0.4s | |
1950 add sp,sp,#128 | |
1951 b.hs .Loop_outer_neon | |
1952 | |
1953 eor v25.16b,v25.16b,v25.16b | |
1954 eor v26.16b,v26.16b,v26.16b | |
1955 eor v27.16b,v27.16b,v27.16b | |
1956 eor v28.16b,v28.16b,v28.16b | |
1957 eor v29.16b,v29.16b,v29.16b | |
1958 eor v30.16b,v30.16b,v30.16b | |
1959 b .Loop_outer | |
1960 | |
1961 .Ldone_512_neon: | |
1962 ldp x19,x20,[x29,#16] | |
1963 add sp,sp,#128+64 | |
1964 ldp x21,x22,[x29,#32] | |
1965 ldp x23,x24,[x29,#48] | |
1966 ldp x25,x26,[x29,#64] | |
1967 ldp x27,x28,[x29,#80] | |
1968 ldp x29,x30,[sp],#96 | |
1969 ret | |
1970 .size ChaCha20_512_neon,.-ChaCha20_512_neon | |
1971 #endif | |
OLD | NEW |