OLD | NEW |
| (Empty) |
1 #include "arm_arch.h" | |
2 | |
3 .text | |
4 .code 32 | |
5 | |
@ rem_4bit: 16 pre-computed 16-bit reduction constants for the 4-bit
@ (Shoup) GHASH method.  After shifting Z right by 4 bits, the 4 bits
@ shifted out select rem_4bit[rem], which is XOR-ed (shifted left 16)
@ into the top word to reduce modulo the GHASH polynomial (0xE1...).
@ NOTE(review): the .align 5 (32-byte) alignment and the table's size
@ are load-bearing — rem_4bit_get and gcm_ghash_4bit locate this table
@ via fixed pc-relative offsets (#32 and #48); do not move or resize it.
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit
14 | |
@ rem_4bit_get: materialize &rem_4bit into r2, position-independently,
@ then branch back into gcm_gmult_4bit at .Lrem_4bit_got.
@ In ARM mode, reading pc yields the address of the current instruction
@ plus 8, so (pc - 8) is this instruction's own address; subtracting a
@ further #32 reaches rem_4bit.
@ NOTE(review): the #32 offset encodes the exact distance between this
@ code and the table above — fragile if any code/data is inserted between.
.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8		@ r2 = address of this instruction
	sub	r2,r2,#32		@ &rem_4bit
	b	.Lrem_4bit_got		@ resume inside gcm_gmult_4bit
	nop				@ padding to keep layout/offsets stable
.size	rem_4bit_get,.-rem_4bit_get
22 | |
@-----------------------------------------------------------------------
@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len)
@ In:    r0 = Xi (16-byte hash value, big-endian byte order)
@        r1 = Htable (16 entries of 16 bytes, indexed by a 4-bit nibble)
@        r2 = inp, r3 = len (assumed a multiple of 16 — TODO confirm
@        against the caller; the loop only ever advances 16 at a time)
@ Does:  for each 16-byte block: Xi = (Xi ^ inp) * H in GF(2^128),
@        using the 4-bit table method; rem_4bit is copied to the stack
@        so the reduction constants can be fetched with ldrh [sp,reg].
@ Clobb: r12, r14, flags; r4-r11 saved/restored on the stack.
@ Note:  the instruction scheduling interleaves the nlo/nhi halves and
@        hoists loads to hide load-use latency — order is deliberate.
@-----------------------------------------------------------------------
.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8		@ r12 = this instruction's address
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit (fixed pc-relative offset)

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	r12,[r2,#15]		@ first input byte (processed 15..0)
	ldrb	r14,[r0,#15]		@ corresponding Xi byte
.Louter:				@ once per 16-byte input block
	eor	r12,r12,r14		@ byte of (Xi ^ inp)
	and	r14,r12,#0xf0		@ nhi (already scaled: nibble*16)
	and	r12,r12,#0x0f		@ nlo
	mov	r3,#14			@ remaining byte index, 14 down to 0

	add	r7,r1,r12,lsl#4		@ &Htbl[nlo] (entries are 16 bytes)
	ldmia	r7,{r4-r7}		@ load Htbl[nlo]
	add	r11,r1,r14		@ &Htbl[nhi] (nhi pre-scaled by 16)
	ldrb	r12,[r2,#14]		@ prefetch next input byte

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}		@ load Htbl[nhi]
	add	r14,r14,r14		@ rem*2 = halfword index into rem_4bit
	eor	r4,r8,r4,lsr#4		@ Z = Htbl[nhi] ^ (Z >> 4) ...
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28		@ ... carrying 4 bits between words
	ldrb	r14,[r0,#14]		@ prefetch next Xi byte
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14		@ next byte of (Xi ^ inp)
	and	r14,r12,#0xf0		@ next nhi
	and	r12,r12,#0x0f		@ next nlo
	eor	r7,r7,r8,lsl#16		@ fold rem_4bit[rem] into top word

.Linner:				@ 15 iterations (r3 = 14..0)
	add	r11,r1,r12,lsl#4	@ &Htbl[nlo]
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1		@ pl (>=0) means more bytes remain
	add	r12,r12,r12		@ rem*2
	ldmia	r11,{r8-r11}		@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4		@ Z = Htbl[nlo] ^ (Z >> 4)
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r2,r3]		@ next input byte (if any left)
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14		@ &Htbl[nhi]
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14		@ rem*2
	ldmia	r11,{r8-r11}		@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4		@ Z = Htbl[nhi] ^ (Z >> 4)
	ldrplb	r8,[r0,r3]		@ next Xi byte (if any left)
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]		@ rem_4bit[rem] (r8 is busy above)
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8		@ next byte of (Xi ^ inp)
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0		@ next nhi
	andpl	r12,r12,#0x0f		@ next nlo
	eor	r7,r7,r9,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end (past the 32-byte table)
	add	r2,r2,#16		@ advance inp to next block
	mov	r14,r4
	@ Store the 128-bit result back to Xi in big-endian byte order:
	@ rev+str on ARMv7 little-endian, plain str on big-endian,
	@ otherwise four byte stores per word.
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3			@ reached end of input?
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrneb	r12,[r2,#15]		@ prefetch first byte of next block
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36		@ drop rem_4bit copy (32) + saved r3 (4)
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}		@ ARMv5+: pop straight into pc
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-) (bx lr)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
175 | |
@-----------------------------------------------------------------------
@ void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
@ In:    r0 = Xi (16-byte hash value, big-endian byte order)
@        r1 = Htable
@ Does:  Xi = Xi * H in GF(2^128) by the 4-bit table method, consuming
@        Xi one byte at a time from byte 15 down to byte 0.
@ Note:  first branches to rem_4bit_get, which loads &rem_4bit into r2
@        and branches back to .Lrem_4bit_got below — reduction constants
@        are then fetched via ldrh [r2,reg] (no stack copy, unlike
@        gcm_ghash_4bit).
@ Clobb: r12, r14, flags; r4-r11 saved/restored on the stack.
@-----------------------------------------------------------------------
.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}		@ save callee-saved regs + lr
	ldrb	r12,[r0,#15]		@ first Xi byte processed
	b	rem_4bit_get		@ sets r2 = &rem_4bit, returns below
.Lrem_4bit_got:
	and	r14,r12,#0xf0		@ nhi (pre-scaled: nibble*16)
	and	r12,r12,#0x0f		@ nlo
	mov	r3,#14			@ remaining byte index, 14 down to 0

	add	r7,r1,r12,lsl#4		@ &Htbl[nlo] (16-byte entries)
	ldmia	r7,{r4-r7}		@ load Htbl[nlo]
	ldrb	r12,[r0,#14]		@ prefetch next Xi byte

	add	r11,r1,r14		@ &Htbl[nhi]
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}		@ load Htbl[nhi]
	add	r14,r14,r14		@ rem*2 = halfword index
	eor	r4,r8,r4,lsr#4		@ Z = Htbl[nhi] ^ (Z >> 4) ...
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28		@ ... carrying 4 bits between words
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0		@ next nhi
	eor	r7,r7,r8,lsl#16		@ fold rem_4bit[rem] into top word
	and	r12,r12,#0x0f		@ next nlo

.Loop:					@ 15 iterations (r3 = 14..0)
	add	r11,r1,r12,lsl#4	@ &Htbl[nlo]
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1		@ pl (>=0) means more bytes remain
	add	r12,r12,r12		@ rem*2
	ldmia	r11,{r8-r11}		@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4		@ Z = Htbl[nlo] ^ (Z >> 4)
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r0,r3]		@ next Xi byte (if any left)
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14		@ &Htbl[nhi]
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14		@ rem*2
	ldmia	r11,{r8-r11}		@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4		@ Z = Htbl[nhi] ^ (Z >> 4)
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0		@ next nhi
	andpl	r12,r12,#0x0f		@ next nlo
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Loop
	@ Store the 128-bit result back to Xi in big-endian byte order:
	@ rev+str on ARMv7 little-endian, plain str on big-endian,
	@ otherwise four byte stores per word.
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}		@ ARMv5+: pop straight into pc
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-) (bx lr)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_ARCH__>=7
.fpu	neon

@-----------------------------------------------------------------------
@ gcm_gmult_neon: Xi = Xi * H using NEON polynomial multiplies.
@ In:    r0 = Xi; r1 on entry points 16 bytes past H, hence the sub
@        (comment below says "H in GCM128_CTX" — presumably Htable
@        memory preceded by H; TODO confirm against the C caller).
@ Note:  this is only a prologue: it loads Xi into q14, zeroes the
@        accumulators, sets r1=16 (inner byte counter) and r3=16, then
@        branches into gcm_ghash_neon's .Linner_neon.  With r3=16 the
@        `subs r3,#16` there hits zero after one pass, so exactly one
@        block is multiplied and the shared tail writes Xi back.
@        The two functions are therefore layout-coupled; keep together.
@-----------------------------------------------------------------------
.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	sub	r1,#16			@ point at H in GCM128_CTX
	vld1.64	d29,[r0,:64]!		@ load Xi (high half first)
	vmov.i32	d5,#0xe1	@ our irreducible polynomial
	vld1.64	d28,[r0,:64]!
	vshr.u64	d5,#32		@ d5 = 0xe1 in the byte lane used below
	vldmia	r1,{d0-d1}		@ load H
	veor	q12,q12			@ zero helper (used by vext below)
#ifdef __ARMEL__
	vrev64.8	q14,q14		@ byte-swap Xi on little-endian
#endif
	veor	q13,q13			@ clear Qlo accumulator
	veor	q11,q11			@ clear carry product
	mov	r1,#16			@ inner loop: 16 bytes of Xi
	veor	q10,q10			@ clear Z accumulator
	mov	r3,#16			@ make shared tail run exactly once
	veor	d2,d2			@ clear carry byte
	vdup.8	d4,d28[0]		@ broadcast lowest byte
	b	.Linner_neon		@ fall into gcm_ghash_neon's loop
.size	gcm_gmult_neon,.-gcm_gmult_neon
335 | |
@-----------------------------------------------------------------------
@ gcm_ghash_neon: for each 16-byte block, Xi = (Xi ^ inp) * H.
@ In:    r0 = Xi, r2 = inp, r3 = len (decremented by 16 per block;
@        assumed a multiple of 16 — TODO confirm against caller).
@ Note:  H is loaded from [r0] AFTER r0 was post-incremented past Xi,
@        i.e. from Xi+16 — presumably the GCM128_CTX layout places H
@        right after Xi (matches gcm_gmult_neon's comment); r1 is NOT
@        used as an Htable pointer here, it is reused as the inner
@        byte counter.
@ Inner loop: byte-serial schoolbook multiply — vmull.p8 multiplies
@ all bytes of H by one broadcast byte of IN (d4), the product is
@ accumulated into Z while IN and Z shift right one byte per
@ iteration; the byte shifted out is reduced via "carry"*0xe1.
@ Several steps are modulo-scheduled across iterations (see tail).
@ .Linner_neon is also the entry point jumped to by gcm_gmult_neon.
@-----------------------------------------------------------------------
.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d21,[r0,:64]!		@ load Xi
	vmov.i32	d5,#0xe1	@ our irreducible polynomial
	vld1.64	d20,[r0,:64]!
	vshr.u64	d5,#32
	vldmia	r0,{d0-d1}		@ load H (from Xi+16, see header)
	veor	q12,q12			@ zero helper (used by vext below)
	nop
#ifdef __ARMEL__
	vrev64.8	q10,q10		@ byte-swap Xi on little-endian
#endif
.Louter_neon:				@ once per 16-byte input block
	vld1.64	d29,[r2]!		@ load inp
	veor	q13,q13			@ clear Qlo accumulator
	vld1.64	d28,[r2]!
	veor	q11,q11			@ clear carry product
	mov	r1,#16			@ inner loop: 16 bytes
#ifdef __ARMEL__
	vrev64.8	q14,q14		@ byte-swap inp on little-endian
#endif
	veor	d2,d2			@ clear carry byte
	veor	q14,q10			@ inp^=Xi
	veor	q10,q10			@ Z = 0
	vdup.8	d4,d28[0]		@ broadcast lowest byte
.Linner_neon:				@ shared with gcm_gmult_neon
	subs	r1,r1,#1		@ one byte of IN per iteration
	vmull.p8	q9,d1,d4	@ H.lo*Xi[i]
	vmull.p8	q8,d0,d4	@ H.hi*Xi[i]
	vext.8	q14,q12,#1		@ IN>>=8

	veor	q10,q13			@ modulo-scheduled part
	vshl.i64	d22,#48
	vdup.8	d4,d28[0]		@ broadcast lowest byte
	veor	d3,d18,d20

	veor	d21,d22
	vuzp.8	q9,q8			@ split products into lo/hi byte lanes
	vsli.8	d2,d3,#1		@ compose the "carry" byte
	vext.8	q10,q12,#1		@ Z>>=8

	vmull.p8	q11,d2,d5	@ "carry"*0xe1
	vshr.u8	d2,d3,#7		@ save Z's bottom bit
	vext.8	q13,q9,q12,#1		@ Qlo>>=8
	veor	q10,q8
	bne	.Linner_neon

	veor	q10,q13			@ modulo-scheduled artefact
	vshl.i64	d22,#48
	veor	d21,d22

	@ finalization, normalize Z:Zo
	vand	d2,d5			@ suffices to mask the bit
	vshr.u64	d3,d20,#63
	vshl.i64	q10,#1
	subs	r3,#16			@ more input? (gmult entered with 16)
	vorr	q10,q1			@ Z=Z:Zo<<1
	bne	.Louter_neon

#ifdef __ARMEL__
	vrev64.8	q10,q10		@ back to big-endian byte order
#endif
	sub	r0,#16			@ rewind r0 to &Xi
	vst1.64	d21,[r0,:64]!		@ write out Xi
	vst1.64	d20,[r0,:64]

	.word	0xe12fff1e		@ bx lr (assembled for pre-UAL tools)
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
407 .asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" | |
408 .align 2 | |
OLD | NEW |