#if defined(__arm__)
#include <openssl/arm_arch.h>

.syntax	unified

.text
.code	32

#ifdef	__clang__
#define	ldrplb	ldrbpl
#define	ldrneb	ldrbne
#endif

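@ rem_4bit[i] is the carry-less (GF(2)) product i*0x1C2, shifted left 4
@ bits: the reduction table for the 4-bit Shoup method. XOR-ing
@ rem_4bit[rem] into the high word folds the 4 bits shifted off the
@ low end of Xi back into the field (polynomial x^128+x^7+x^2+x+1).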
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

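@ rem_4bit_get computes &rem_4bit PC-relatively: in ARM mode pc reads
@ as the current instruction's address plus 8, so "sub r2,pc,#8" yields
@ the address of the sub itself, and the table sits 32 bytes
@ (16 shorts) before it.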
.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8
	sub	r2,r2,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

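@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len)
@ (the usual OpenSSL prototype: r0=Xi, r1=Htable, r2=inp, r3=len,
@ with len a multiple of 16)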
.globl	gcm_ghash_4bit
.hidden	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
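@ One 16-byte block per iteration: Xi ^= inp, then Xi = Xi*H via two
@ Htable lookups per byte (low and high nibble) plus a rem_4bit fold,
@ walking the block from byte 15 down to byte 0.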
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

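@ Bytes 14..0: each pass shifts the 128-bit accumulator right 4 bits,
@ XORs in Htbl[nlo] then Htbl[nhi], and folds the shifted-out nibble
@ back in through the rem_4bit copy placed on the stack above.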
.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrbpl	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	ldrbpl	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrbne	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

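@ void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
@ (the usual OpenSSL prototype: r0=Xi, r1=Htable)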
.globl	gcm_gmult_4bit
.hidden	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrbpl	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

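@ void gcm_init_neon(u128 Htable[16], const u64 H[2])
@ (the usual OpenSSL prototype). Stores a single "twisted" H (H<<1 with
@ the shifted-out bit folded back via the 0xc2...01 constant) rather
@ than a full table.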
.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26			@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	bx	lr			@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

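@ void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16])
@ (the usual OpenSSL prototype: r0=Xi, r1=twisted H from gcm_init_neon)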
.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}		@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

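@ void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len)
@ (the usual OpenSSL prototype, with len a multiple of 16)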
.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}		@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
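@ ARMv7 NEON has only an 8x8-bit polynomial multiply (vmull.p8), so
@ each 64x64 carry-less multiply below is synthesized from vmull.p8 on
@ byte-rotated copies of the operands (A1..A3, B1..B4), aligning the
@ partial products with vext and XOR-ing them together; the pattern
@ repeats three times for the lo/mid/hi Karatsuba limbs.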
.Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d0, d6, d6, #1		@ B1
	vmull.p8	q0, d26, d0	@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2		@ B2
	vmull.p8	q11, d26, d22	@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d0, d6, d6, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22	@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7		@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d2, d6, d6, #1		@ B1
	vmull.p8	q1, d28, d2	@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2		@ B2
	vmull.p8	q11, d28, d22	@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d2, d6, d6, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22	@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7	@ F = A1*B
	vext.8	d4, d7, d7, #1		@ B1
	vmull.p8	q2, d27, d4	@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7	@ H = A2*B
	vext.8	d22, d7, d7, #2		@ B2
	vmull.p8	q11, d27, d22	@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7	@ J = A3*B
	vext.8	d4, d7, d7, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22	@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3		@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
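	@ (reduce modulo the bit-reflected GHASH polynomial
	@ x^128+x^7+x^2+x+1: phase 1 multiplies the low half by
	@ x^57+x^62+x^63, phase 2 folds back with shifts of 1, 2 and 7)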
	vshl.i64	q9,q0,#57	@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20		@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1	@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1	@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	bx	lr			@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
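@ the string "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"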
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif