OLD | NEW |
| (Empty) |
1 From aea47606333cfd3e7a09cab3e42e488c79a416af Mon Sep 17 00:00:00 2001 | |
2 From: Adam Langley <agl@chromium.org> | |
3 Date: Tue, 5 Nov 2013 13:10:11 -0500 | |
4 Subject: [PATCH 52/52] Optional NEON support on ARM. | |
5 | |
6 This patch causes ARM to build both the NEON and generic versions of | |
7 ChaCha20 and Poly1305. The NEON code can be enabled at run-time by | |
8 calling CRYPTO_set_NEON_capable(1). | |
9 --- | |
10 .gitignore | 1 + | |
11 Configure | 2 +- | |
12 apps/speed.c | 5 + | |
13 crypto/chacha/chacha_enc.c | 18 + | |
14 crypto/chacha/chacha_vec.c | 7 + | |
15 crypto/chacha/chacha_vec_arm.S | 846 +++++++++++++++++++++++++++++++++++++++++ | |
16 crypto/cryptlib.c | 14 + | |
17 crypto/crypto.h | 8 + | |
18 crypto/poly1305/poly1305.c | 35 ++ | |
19 crypto/poly1305/poly1305_arm.c | 9 +- | |
20 10 files changed, 941 insertions(+), 4 deletions(-) | |
21 create mode 100644 crypto/chacha/chacha_vec_arm.S | |
22 | |
23 diff --git a/Configure b/Configure | |
24 index 1b95384..18b7af0 100755 | |
25 --- a/Configure | |
26 +++ b/Configure | |
27 @@ -136,7 +136,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-a
lpha.o:::::::ghash-a | |
28 my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::
:::::::"; | |
29 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha2
56-mips.o sha512-mips.o::::::::::"; | |
30 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::ae
s-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-
s390x.o:::::::ghash-s390x.o:"; | |
31 -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cb
c.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-a
rmv4.o::chacha_vec.o:poly1305_arm.o poly1305_arm_asm.o:void"; | |
32 +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cb
c.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-a
rmv4.o::chacha_vec_arm.o chacha_enc.o:poly1305.o poly1305_arm.o poly1305_arm_asm
.o:void"; | |
33 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-p
arisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-
parisc.o::::32"; | |
34 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o ae
s-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::gha
sh-parisc.o::::64"; | |
35 my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o
aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::::"; | |
36 diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c | |
37 index 54d1ca3..e4b648f 100644 | |
38 --- a/crypto/chacha/chacha_enc.c | |
39 +++ b/crypto/chacha/chacha_enc.c | |
40 @@ -61,6 +61,7 @@ | |
41 | |
42 #if !defined(OPENSSL_NO_CHACHA) | |
43 | |
44 +#include <openssl/crypto.h> | |
45 #include <openssl/chacha.h> | |
46 | |
47 /* sigma contains the ChaCha constants, which happen to be an ASCII string. */ | |
48 @@ -87,6 +88,15 @@ static const char sigma[16] = "expand 32-byte k"; | |
49 | |
50 typedef unsigned int uint32_t; | |
51 | |
52 +#if __arm__ | |
53 +/* Defined in chacha_vec_arm.S (pre-compiled from chacha_vec.c) */ | |
54 +void CRYPTO_chacha_20_neon(unsigned char *out, | |
55 + const unsigned char *in, size_t in_len, | |
56 + const unsigned char key[32], | |
57 + const unsigned char nonce[8], | |
58 + size_t counter); | |
59 +#endif | |
60 + | |
61 /* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in | |
62 * |input| and writes the 64 output bytes to |output|. */ | |
63 static void chacha_core(unsigned char output[64], const uint32_t input[16], | |
64 @@ -124,6 +134,16 @@ void CRYPTO_chacha_20(unsigned char *out, | |
65 unsigned char buf[64]; | |
66 size_t todo, i; | |
67 | |
68 +#if __arm__ | |
69 + if (CRYPTO_is_NEON_capable() && | |
70 + ((intptr_t)in & 15) == 0 && | |
71 + ((intptr_t)out & 15) == 0) | |
72 + { | |
73 + CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter); | |
74 + return; | |
75 + } | |
76 +#endif | |
77 + | |
78 input[0] = U8TO32_LITTLE(sigma + 0); | |
79 input[1] = U8TO32_LITTLE(sigma + 4); | |
80 input[2] = U8TO32_LITTLE(sigma + 8); | |
81 diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c | |
82 index 33b2238..1226c39 100644 | |
83 --- a/crypto/chacha/chacha_vec.c | |
84 +++ b/crypto/chacha/chacha_vec.c | |
85 @@ -154,7 +154,14 @@ typedef unsigned vec __attribute__ ((vector_size (16))); | |
86 STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \ | |
87 STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3)); | |
88 | |
89 +#if __ARM_NEON__ | |
90 +/* For ARM, we can't depend on NEON support, so this function is compiled with | |
91 + * a different name, along with the generic code, and can be enabled at | |
92 + * run-time. */ | |
93 +void CRYPTO_chacha_20_neon( | |
94 +#else | |
95 void CRYPTO_chacha_20( | |
96 +#endif | |
97 unsigned char *out, | |
98 const unsigned char *in, | |
99 size_t inlen, | |
100 diff --git a/crypto/chacha/chacha_vec_arm.S b/crypto/chacha/chacha_vec_arm.S | |
101 new file mode 100644 | |
102 index 0000000..24a5050 | |
103 --- /dev/null | |
104 +++ b/crypto/chacha/chacha_vec_arm.S | |
105 @@ -0,0 +1,863 @@ | |
106 +# This file contains a pre-compiled version of chacha_vec.c for ARM. This is | |
107 +# needed to support switching on NEON code at runtime. If the whole of OpenSSL | |
108 +# were to be compiled with the needed flags to build chacha_vec.c, then it | |
109 +# wouldn't be possible to run on non-NEON systems. | |
110 +# | |
111 +# This file was generated by: | |
112 +# | |
113 +# /opt/gcc-linaro-arm-linux-gnueabihf-4.7-2012.10-20121022_linux/bin/arm-li
nux-gnueabihf-gcc -O3 -mcpu=cortex-a8 -mfpu=neon -S chacha_vec.c -I ../../includ
e -fpic -o chacha_vec_arm.S | |
114 +# | |
115 +# And then EABI attribute 28 was set to zero to allow linking with soft-float | |
116 +# code. | |
117 + | |
118 + .syntax unified | |
119 + .cpu cortex-a8 | |
120 + .eabi_attribute 27, 3 | |
121 + .eabi_attribute 28, 0 | |
122 + .fpu neon | |
123 + .eabi_attribute 20, 1 | |
124 + .eabi_attribute 21, 1 | |
125 + .eabi_attribute 23, 3 | |
126 + .eabi_attribute 24, 1 | |
127 + .eabi_attribute 25, 1 | |
128 + .eabi_attribute 26, 2 | |
129 + .eabi_attribute 30, 2 | |
130 + .eabi_attribute 34, 1 | |
131 + .eabi_attribute 18, 4 | |
132 + .thumb | |
133 + .file "chacha_vec.c" | |
134 + .text | |
135 + .align 2 | |
136 + .global CRYPTO_chacha_20_neon | |
137 + .thumb | |
138 + .thumb_func | |
139 + .type CRYPTO_chacha_20_neon, %function | |
140 +CRYPTO_chacha_20_neon: | |
141 + @ args = 8, pretend = 0, frame = 304 | |
142 + @ frame_needed = 1, uses_anonymous_args = 0 | |
143 + @ link register save eliminated. | |
144 + push {r4, r5, r6, r7, r8, r9, sl, fp} | |
145 + fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} | |
146 + sub sp, sp, #304 | |
147 + add r7, sp, #0 | |
148 + movw ip, #43691 | |
149 + movt ip, 43690 | |
150 + str r2, [r7, #196] | |
151 + sub sp, sp, #96 | |
152 + ldr r4, [r7, #196] | |
153 + ldr r6, [r7, #400] | |
154 + ldr r2, .L38+16 | |
155 + umull r4, ip, ip, r4 | |
156 + ldr r6, [r6, #0] | |
157 + ldr r8, [r7, #400] | |
158 +.LPIC24: | |
159 + add r2, pc | |
160 + add r4, sp, #15 | |
161 + str r3, [r7, #244] | |
162 + str r6, [r7, #176] | |
163 + bic r4, r4, #15 | |
164 + str r0, [r7, #188] | |
165 + str r4, [r7, #200] | |
166 + lsrs ip, ip, #7 | |
167 + str r1, [r7, #184] | |
168 + ldmia r2, {r0, r1, r2, r3} | |
169 + ldr r4, [r8, #4] | |
170 + ldr r5, [r7, #244] | |
171 + vld1.64 {d24-d25}, [r5:64] | |
172 + vldr d26, [r5, #16] | |
173 + vldr d27, [r5, #24] | |
174 + ldr r9, [r7, #200] | |
175 + ldr r8, [r7, #404] | |
176 + ldr r5, [r7, #176] | |
177 + add r6, r9, #64 | |
178 + str r4, [r7, #300] | |
179 + mov r4, #0 | |
180 + str r8, [r7, #288] | |
181 + str r5, [r7, #296] | |
182 + str r4, [r7, #292] | |
183 + stmia r6, {r0, r1, r2, r3} | |
184 + vldr d22, [r9, #64] | |
185 + vldr d23, [r9, #72] | |
186 + vldr d20, [r7, #288] | |
187 + vldr d21, [r7, #296] | |
188 + str ip, [r7, #192] | |
189 + beq .L20 | |
190 + lsl r6, ip, #1 | |
191 + ldr r1, [r9, #68] | |
192 + add r3, r6, ip | |
193 + str r6, [r7, #180] | |
194 + ldr r2, [r9, #72] | |
195 + add r8, r8, #2 | |
196 + ldr r5, [r9, #76] | |
197 + vldr d18, .L38 | |
198 + vldr d19, .L38+8 | |
199 + str r4, [r7, #240] | |
200 + ldr r6, [r7, #184] | |
201 + ldr r4, [r7, #188] | |
202 + str r0, [r7, #224] | |
203 + str r1, [r7, #220] | |
204 + str r8, [r7, #208] | |
205 + str r2, [r7, #216] | |
206 + str r3, [r7, #204] | |
207 + str r5, [r7, #212] | |
208 + str r6, [r7, #252] | |
209 + str r4, [r7, #248] | |
210 +.L4: | |
211 + ldr r2, [r7, #244] | |
212 + add r9, r7, #216 | |
213 + ldr r3, [r7, #244] | |
214 + vadd.i32 q8, q10, q9 | |
215 + ldr r6, [r7, #208] | |
216 + vmov q15, q13 @ v4si | |
217 + ldr r5, [r7, #240] | |
218 + vmov q3, q12 @ v4si | |
219 + ldr r4, [r7, #244] | |
220 + vmov q2, q11 @ v4si | |
221 + adds r5, r5, r6 | |
222 + ldr r2, [r2, #8] | |
223 + ldr r6, [r7, #400] | |
224 + vmov q5, q10 @ v4si | |
225 + ldr r3, [r3, #12] | |
226 + vmov q1, q13 @ v4si | |
227 + ldr r0, [r7, #244] | |
228 + vmov q0, q12 @ v4si | |
229 + ldr r1, [r7, #244] | |
230 + vmov q4, q11 @ v4si | |
231 + ldmia r9, {r9, sl, fp} | |
232 + str r5, [r7, #228] | |
233 + ldr r5, [r4, #24] | |
234 + ldr r0, [r0, #0] | |
235 + ldr r1, [r1, #4] | |
236 + str r2, [r7, #264] | |
237 + str r3, [r7, #236] | |
238 + ldr r2, [r6, #4] | |
239 + ldr r3, [r4, #28] | |
240 + str r5, [r7, #280] | |
241 + ldr r5, [r6, #0] | |
242 + movs r6, #0 | |
243 + ldr ip, [r7, #228] | |
244 + ldr r8, [r7, #212] | |
245 + str r0, [r7, #232] | |
246 + str r1, [r7, #268] | |
247 + ldr r0, [r4, #16] | |
248 + ldr r1, [r4, #20] | |
249 + movs r4, #10 | |
250 + str r2, [r7, #24] | |
251 + str r3, [r7, #284] | |
252 + str r4, [r7, #256] | |
253 + ldr r2, [r7, #264] | |
254 + str r9, [r7, #276] | |
255 + mov r9, r6 | |
256 + ldr r6, [r7, #280] | |
257 + str r8, [r7, #260] | |
258 + mov r8, sl | |
259 + str r1, [r7, #272] | |
260 + mov sl, ip | |
261 + str r6, [r7, #264] | |
262 + mov r6, r5 | |
263 + ldr r3, [r7, #236] | |
264 + mov r5, r0 | |
265 + ldr ip, [r7, #24] | |
266 + ldr r1, [r7, #268] | |
267 + ldr r0, [r7, #232] | |
268 + b .L39 | |
269 +.L40: | |
270 + .align 3 | |
271 +.L38: | |
272 + .word 1 | |
273 + .word 0 | |
274 + .word 0 | |
275 + .word 0 | |
276 + .word .LANCHOR0-(.LPIC24+4) | |
277 +.L39: | |
278 +.L3: | |
279 + vadd.i32 q4, q4, q0 | |
280 + add r8, r8, r1 | |
281 + vadd.i32 q2, q2, q3 | |
282 + str r8, [r7, #268] | |
283 + veor q5, q5, q4 | |
284 + ldr r8, [r7, #276] | |
285 + veor q8, q8, q2 | |
286 + add fp, fp, r0 | |
287 + str fp, [r7, #280] | |
288 + add r8, r8, r2 | |
289 + vrev32.16 q5, q5 | |
290 + str r8, [r7, #276] | |
291 + vrev32.16 q8, q8 | |
292 + vadd.i32 q1, q1, q5 | |
293 + vadd.i32 q15, q15, q8 | |
294 + ldr r8, [r7, #280] | |
295 + veor q0, q1, q0 | |
296 + ldr r4, [r7, #260] | |
297 + veor q3, q15, q3 | |
298 + eor sl, sl, r8 | |
299 + ldr r8, [r7, #276] | |
300 + add fp, r4, r3 | |
301 + vshl.i32 q7, q0, #12 | |
302 + ldr r4, [r7, #268] | |
303 + vshl.i32 q6, q3, #12 | |
304 + eor r6, r6, r8 | |
305 + eor r9, r9, r4 | |
306 + ldr r4, [r7, #272] | |
307 + vsri.32 q7, q0, #20 | |
308 + ror r8, r6, #16 | |
309 + ldr r6, [r7, #264] | |
310 + eor ip, ip, fp | |
311 + vsri.32 q6, q3, #20 | |
312 + ror sl, sl, #16 | |
313 + ror r9, r9, #16 | |
314 + add r5, r5, sl | |
315 + vadd.i32 q4, q4, q7 | |
316 + str r5, [r7, #236] | |
317 + vadd.i32 q2, q2, q6 | |
318 + add r5, r4, r9 | |
319 + add r4, r6, r8 | |
320 + ldr r6, [r7, #284] | |
321 + ror ip, ip, #16 | |
322 + veor q5, q4, q5 | |
323 + veor q8, q2, q8 | |
324 + add r6, r6, ip | |
325 + str r6, [r7, #264] | |
326 + eors r1, r1, r5 | |
327 + ldr r6, [r7, #236] | |
328 + vshl.i32 q3, q5, #8 | |
329 + vshl.i32 q14, q8, #8 | |
330 + eors r2, r2, r4 | |
331 + eors r0, r0, r6 | |
332 + ldr r6, [r7, #264] | |
333 + vsri.32 q3, q5, #24 | |
334 + ror r1, r1, #20 | |
335 + eors r3, r3, r6 | |
336 + ldr r6, [r7, #280] | |
337 + ror r0, r0, #20 | |
338 + vsri.32 q14, q8, #24 | |
339 + adds r6, r0, r6 | |
340 + str r6, [r7, #284] | |
341 + ldr r6, [r7, #268] | |
342 + vadd.i32 q1, q1, q3 | |
343 + vadd.i32 q15, q15, q14 | |
344 + ror r2, r2, #20 | |
345 + adds r6, r1, r6 | |
346 + str r6, [r7, #260] | |
347 + ldr r6, [r7, #276] | |
348 + veor q6, q15, q6 | |
349 + veor q7, q1, q7 | |
350 + ror r3, r3, #20 | |
351 + adds r6, r2, r6 | |
352 + str r6, [r7, #280] | |
353 + ldr r6, [r7, #284] | |
354 + vshl.i32 q0, q6, #7 | |
355 + vshl.i32 q5, q7, #7 | |
356 + add fp, r3, fp | |
357 + eor sl, r6, sl | |
358 + ldr r6, [r7, #260] | |
359 + eor ip, fp, ip | |
360 + vsri.32 q0, q6, #25 | |
361 + eor r9, r6, r9 | |
362 + ldr r6, [r7, #280] | |
363 + ror sl, sl, #24 | |
364 + vsri.32 q5, q7, #25 | |
365 + eor r8, r6, r8 | |
366 + ldr r6, [r7, #236] | |
367 + ror r9, r9, #24 | |
368 + ror ip, ip, #24 | |
369 + add r6, sl, r6 | |
370 + str r6, [r7, #276] | |
371 + ldr r6, [r7, #264] | |
372 + add r5, r9, r5 | |
373 + str r5, [r7, #272] | |
374 + vext.32 q5, q5, q5, #1 | |
375 + add r5, ip, r6 | |
376 + ldr r6, [r7, #276] | |
377 + vext.32 q0, q0, q0, #1 | |
378 + vadd.i32 q4, q4, q5 | |
379 + eors r0, r0, r6 | |
380 + ldr r6, [r7, #272] | |
381 + vadd.i32 q2, q2, q0 | |
382 + vext.32 q3, q3, q3, #3 | |
383 + ror r8, r8, #24 | |
384 + eors r1, r1, r6 | |
385 + vext.32 q14, q14, q14, #3 | |
386 + add r4, r8, r4 | |
387 + ldr r6, [r7, #284] | |
388 + veor q3, q4, q3 | |
389 + veor q14, q2, q14 | |
390 + eors r2, r2, r4 | |
391 + ror r1, r1, #25 | |
392 + vext.32 q1, q1, q1, #2 | |
393 + adds r6, r1, r6 | |
394 + str r6, [r7, #284] | |
395 + vext.32 q15, q15, q15, #2 | |
396 + ldr r6, [r7, #260] | |
397 + eors r3, r3, r5 | |
398 + ror r2, r2, #25 | |
399 + vrev32.16 q8, q14 | |
400 + adds r6, r2, r6 | |
401 + vrev32.16 q3, q3 | |
402 + str r6, [r7, #268] | |
403 + vadd.i32 q1, q1, q3 | |
404 + ldr r6, [r7, #280] | |
405 + vadd.i32 q15, q15, q8 | |
406 + ror r3, r3, #25 | |
407 + veor q5, q1, q5 | |
408 + adds r6, r3, r6 | |
409 + veor q0, q15, q0 | |
410 + str r6, [r7, #264] | |
411 + ldr r6, [r7, #268] | |
412 + ror r0, r0, #25 | |
413 + add fp, r0, fp | |
414 + vshl.i32 q6, q5, #12 | |
415 + eor sl, r6, sl | |
416 + ldr r6, [r7, #284] | |
417 + vshl.i32 q14, q0, #12 | |
418 + eor r8, fp, r8 | |
419 + eor ip, r6, ip | |
420 + ldr r6, [r7, #264] | |
421 + vsri.32 q6, q5, #20 | |
422 + ror sl, sl, #16 | |
423 + eor r9, r6, r9 | |
424 + ror r6, r8, #16 | |
425 + vsri.32 q14, q0, #20 | |
426 + ldr r8, [r7, #272] | |
427 + ror ip, ip, #16 | |
428 + add r5, sl, r5 | |
429 + add r8, r6, r8 | |
430 + add r4, ip, r4 | |
431 + str r4, [r7, #236] | |
432 + eor r0, r8, r0 | |
433 + str r5, [r7, #280] | |
434 + vadd.i32 q4, q4, q6 | |
435 + ldr r5, [r7, #236] | |
436 + vadd.i32 q2, q2, q14 | |
437 + ldr r4, [r7, #276] | |
438 + ror r0, r0, #20 | |
439 + veor q3, q4, q3 | |
440 + eors r1, r1, r5 | |
441 + veor q0, q2, q8 | |
442 + str r8, [r7, #272] | |
443 + str r0, [r7, #24] | |
444 + add fp, r0, fp | |
445 + ldr r8, [r7, #280] | |
446 + ror r9, r9, #16 | |
447 + ldr r0, [r7, #284] | |
448 + add r4, r9, r4 | |
449 + str fp, [r7, #260] | |
450 + ror r1, r1, #20 | |
451 + add fp, r1, r0 | |
452 + eor r2, r8, r2 | |
453 + ldr r0, [r7, #260] | |
454 + eors r3, r3, r4 | |
455 + vshl.i32 q5, q3, #8 | |
456 + str r4, [r7, #232] | |
457 + vshl.i32 q8, q0, #8 | |
458 + ldr r4, [r7, #268] | |
459 + ldr r5, [r7, #264] | |
460 + ror r2, r2, #20 | |
461 + ror r3, r3, #20 | |
462 + eors r6, r6, r0 | |
463 + adds r5, r3, r5 | |
464 + add r8, r2, r4 | |
465 + vsri.32 q5, q3, #24 | |
466 + ldr r4, [r7, #272] | |
467 + eor r9, r5, r9 | |
468 + eor ip, fp, ip | |
469 + vsri.32 q8, q0, #24 | |
470 + eor sl, r8, sl | |
471 + ror r6, r6, #24 | |
472 + ldr r0, [r7, #280] | |
473 + str r5, [r7, #276] | |
474 + adds r4, r6, r4 | |
475 + ldr r5, [r7, #236] | |
476 + vadd.i32 q1, q1, q5 | |
477 + str r4, [r7, #272] | |
478 + vadd.i32 q15, q15, q8 | |
479 + ldr r4, [r7, #232] | |
480 + ror ip, ip, #24 | |
481 + ror sl, sl, #24 | |
482 + ror r9, r9, #24 | |
483 + add r5, ip, r5 | |
484 + add r0, sl, r0 | |
485 + str r5, [r7, #264] | |
486 + add r5, r9, r4 | |
487 + str r0, [r7, #284] | |
488 + veor q6, q1, q6 | |
489 + ldr r4, [r7, #24] | |
490 + veor q14, q15, q14 | |
491 + ldr r0, [r7, #272] | |
492 + eors r3, r3, r5 | |
493 + vshl.i32 q0, q6, #7 | |
494 + vext.32 q1, q1, q1, #2 | |
495 + eors r0, r0, r4 | |
496 + ldr r4, [r7, #284] | |
497 + str r0, [r7, #280] | |
498 + vshl.i32 q3, q14, #7 | |
499 + eors r2, r2, r4 | |
500 + ldr r4, [r7, #280] | |
501 + ldr r0, [r7, #264] | |
502 + vsri.32 q0, q6, #25 | |
503 + ror r2, r2, #25 | |
504 + ror r3, r3, #25 | |
505 + eors r1, r1, r0 | |
506 + vsri.32 q3, q14, #25 | |
507 + ror r0, r4, #25 | |
508 + ldr r4, [r7, #256] | |
509 + ror r1, r1, #25 | |
510 + vext.32 q5, q5, q5, #1 | |
511 + subs r4, r4, #1 | |
512 + str r4, [r7, #256] | |
513 + vext.32 q15, q15, q15, #2 | |
514 + vext.32 q8, q8, q8, #1 | |
515 + vext.32 q0, q0, q0, #3 | |
516 + vext.32 q3, q3, q3, #3 | |
517 + bne .L3 | |
518 + ldr r4, [r7, #264] | |
519 + vadd.i32 q14, q10, q9 | |
520 + str r2, [r7, #264] | |
521 + vadd.i32 q10, q10, q5 | |
522 + ldr r2, [r7, #252] | |
523 + vld1.64 {d12-d13}, [r2:64] | |
524 + ldr r2, [r7, #220] | |
525 + vadd.i32 q4, q11, q4 | |
526 + str ip, [r7, #24] | |
527 + mov ip, sl | |
528 + mov sl, r8 | |
529 + ldr r8, [r7, #260] | |
530 + add sl, sl, r2 | |
531 + ldr r2, [r7, #212] | |
532 + str r4, [r7, #280] | |
533 + vadd.i32 q0, q12, q0 | |
534 + ldr r4, [r7, #224] | |
535 + add r8, r8, r2 | |
536 + ldr r2, [r7, #240] | |
537 + vadd.i32 q1, q13, q1 | |
538 + str r0, [r7, #232] | |
539 + add fp, fp, r4 | |
540 + mov r0, r5 | |
541 + ldr r4, [r7, #216] | |
542 + mov r5, r6 | |
543 + mov r6, r9 | |
544 + ldr r9, [r7, #276] | |
545 + adds r2, r2, #3 | |
546 + str r2, [r7, #240] | |
547 + vadd.i32 q2, q11, q2 | |
548 + ldr r2, [r7, #252] | |
549 + add r9, r9, r4 | |
550 + vadd.i32 q3, q12, q3 | |
551 + ldr r4, [r7, #228] | |
552 + vadd.i32 q15, q13, q15 | |
553 + str r1, [r7, #268] | |
554 + vadd.i32 q8, q14, q8 | |
555 + str r3, [r7, #236] | |
556 + veor q4, q4, q6 | |
557 + ldr r3, [r7, #284] | |
558 + ldr r1, [r7, #272] | |
559 + add ip, r4, ip | |
560 + ldr r4, [r7, #248] | |
561 + vst1.64 {d8-d9}, [r4:64] | |
562 + vldr d8, [r2, #16] | |
563 + vldr d9, [r2, #24] | |
564 + veor q0, q0, q4 | |
565 + vstr d0, [r4, #16] | |
566 + vstr d1, [r4, #24] | |
567 + vldr d0, [r2, #32] | |
568 + vldr d1, [r2, #40] | |
569 + veor q1, q1, q0 | |
570 + vstr d2, [r4, #32] | |
571 + vstr d3, [r4, #40] | |
572 + vldr d2, [r2, #48] | |
573 + vldr d3, [r2, #56] | |
574 + veor q10, q10, q1 | |
575 + vstr d20, [r4, #48] | |
576 + vstr d21, [r4, #56] | |
577 + vldr d8, [r2, #64] | |
578 + vldr d9, [r2, #72] | |
579 + veor q2, q2, q4 | |
580 + vstr d4, [r4, #64] | |
581 + vstr d5, [r4, #72] | |
582 + vldr d10, [r2, #80] | |
583 + vldr d11, [r2, #88] | |
584 + veor q3, q3, q5 | |
585 + vstr d6, [r4, #80] | |
586 + vstr d7, [r4, #88] | |
587 + vldr d12, [r2, #96] | |
588 + vldr d13, [r2, #104] | |
589 + veor q15, q15, q6 | |
590 + vstr d30, [r4, #96] | |
591 + vstr d31, [r4, #104] | |
592 + vldr d20, [r2, #112] | |
593 + vldr d21, [r2, #120] | |
594 + veor q8, q8, q10 | |
595 + vstr d16, [r4, #112] | |
596 + vstr d17, [r4, #120] | |
597 + ldr r4, [r2, #128] | |
598 + ldr r2, [r7, #248] | |
599 + vadd.i32 q10, q14, q9 | |
600 + eor r4, fp, r4 | |
601 + vadd.i32 q10, q10, q9 | |
602 + str r4, [r2, #128] | |
603 + ldr r4, [r7, #252] | |
604 + ldr r2, [r4, #132] | |
605 + eor r2, sl, r2 | |
606 + ldr sl, [r7, #248] | |
607 + str r2, [sl, #132] | |
608 + ldr r2, [r4, #136] | |
609 + eor r2, r9, r2 | |
610 + str r2, [sl, #136] | |
611 + ldr r2, [r4, #140] | |
612 + eor r2, r8, r2 | |
613 + str r2, [sl, #140] | |
614 + ldr r2, [r7, #244] | |
615 + ldr r4, [r4, #144] | |
616 + ldr r2, [r2, #0] | |
617 + str r4, [r7, #44] | |
618 + ldr r4, [r7, #232] | |
619 + add r8, r4, r2 | |
620 + ldr r2, [r7, #44] | |
621 + ldr r4, [r7, #244] | |
622 + eor r8, r8, r2 | |
623 + ldr r2, [r7, #252] | |
624 + str r8, [sl, #144] | |
625 + ldr r4, [r4, #4] | |
626 + ldr r2, [r2, #148] | |
627 + str r2, [r7, #40] | |
628 + ldr r2, [r7, #268] | |
629 + add r8, r2, r4 | |
630 + ldr r4, [r7, #40] | |
631 + ldr r2, [r7, #244] | |
632 + eor r8, r8, r4 | |
633 + ldr r4, [r7, #252] | |
634 + str r8, [sl, #148] | |
635 + ldr r2, [r2, #8] | |
636 + ldr r4, [r4, #152] | |
637 + str r4, [r7, #36] | |
638 + ldr r4, [r7, #264] | |
639 + add r8, r4, r2 | |
640 + ldr r2, [r7, #36] | |
641 + eor r8, r8, r2 | |
642 + str r8, [sl, #152] | |
643 + ldr r2, [r7, #252] | |
644 + ldr r4, [r7, #244] | |
645 + ldr r2, [r2, #156] | |
646 + ldr r4, [r4, #12] | |
647 + str r2, [r7, #32] | |
648 + ldr r2, [r7, #236] | |
649 + add r8, r2, r4 | |
650 + ldr r4, [r7, #32] | |
651 + ldr r2, [r7, #252] | |
652 + eor r8, r8, r4 | |
653 + str r8, [sl, #156] | |
654 + ldr r8, [r7, #244] | |
655 + ldr r2, [r2, #160] | |
656 + ldr r4, [r8, #16] | |
657 + adds r0, r0, r4 | |
658 + ldr r4, [r7, #252] | |
659 + eors r0, r0, r2 | |
660 + str r0, [sl, #160] | |
661 + ldr r0, [r8, #20] | |
662 + ldr r2, [r4, #164] | |
663 + adds r1, r1, r0 | |
664 + ldr r0, [r7, #280] | |
665 + eors r1, r1, r2 | |
666 + str r1, [sl, #164] | |
667 + ldr r2, [r8, #24] | |
668 + ldr r1, [r4, #168] | |
669 + adds r2, r0, r2 | |
670 + eors r2, r2, r1 | |
671 + str r2, [sl, #168] | |
672 + ldr r1, [r8, #28] | |
673 + ldr r2, [r4, #172] | |
674 + adds r3, r3, r1 | |
675 + eors r3, r3, r2 | |
676 + str r3, [sl, #172] | |
677 + ldr r3, [r4, #176] | |
678 + eor r3, ip, r3 | |
679 + str r3, [sl, #176] | |
680 + ldr r3, [r4, #180] | |
681 + ldr r4, [r7, #400] | |
682 + eors r6, r6, r3 | |
683 + str r6, [sl, #180] | |
684 + ldr r6, [r7, #252] | |
685 + ldr r2, [r4, #0] | |
686 + ldr r3, [r6, #184] | |
687 + adds r5, r5, r2 | |
688 + eors r5, r5, r3 | |
689 + str r5, [sl, #184] | |
690 + ldr r2, [r6, #188] | |
691 + adds r6, r6, #192 | |
692 + ldr r3, [r4, #4] | |
693 + str r6, [r7, #252] | |
694 + ldr r0, [r7, #24] | |
695 + ldr r1, [r7, #240] | |
696 + adds r4, r0, r3 | |
697 + eors r4, r4, r2 | |
698 + ldr r2, [r7, #204] | |
699 + str r4, [sl, #188] | |
700 + add sl, sl, #192 | |
701 + cmp r1, r2 | |
702 + str sl, [r7, #248] | |
703 + bne .L4 | |
704 + ldr r4, [r7, #192] | |
705 + ldr r3, [r7, #180] | |
706 + ldr r6, [r7, #188] | |
707 + adds r5, r3, r4 | |
708 + ldr r8, [r7, #184] | |
709 + lsls r5, r5, #6 | |
710 + adds r4, r6, r5 | |
711 + add r5, r8, r5 | |
712 +.L2: | |
713 + ldr r9, [r7, #196] | |
714 + movw r3, #43691 | |
715 + movt r3, 43690 | |
716 + ldr sl, [r7, #196] | |
717 + umull r9, r3, r3, r9 | |
718 + lsrs r3, r3, #7 | |
719 + add r3, r3, r3, lsl #1 | |
720 + sub r3, sl, r3, lsl #6 | |
721 + lsrs r6, r3, #6 | |
722 + beq .L5 | |
723 + add r1, r5, #16 | |
724 + add r2, r4, #16 | |
725 + mov r0, r6 | |
726 + vldr d30, .L41 | |
727 + vldr d31, .L41+8 | |
728 +.L6: | |
729 + vmov q8, q10 @ v4si | |
730 + movs r3, #10 | |
731 + vmov q1, q13 @ v4si | |
732 + vmov q14, q12 @ v4si | |
733 + vmov q3, q11 @ v4si | |
734 +.L7: | |
735 + vadd.i32 q3, q3, q14 | |
736 + subs r3, r3, #1 | |
737 + veor q2, q8, q3 | |
738 + vrev32.16 q2, q2 | |
739 + vadd.i32 q8, q1, q2 | |
740 + veor q9, q8, q14 | |
741 + vshl.i32 q14, q9, #12 | |
742 + vsri.32 q14, q9, #20 | |
743 + vadd.i32 q3, q3, q14 | |
744 + veor q2, q3, q2 | |
745 + vshl.i32 q9, q2, #8 | |
746 + vsri.32 q9, q2, #24 | |
747 + vadd.i32 q8, q8, q9 | |
748 + vext.32 q9, q9, q9, #3 | |
749 + veor q14, q8, q14 | |
750 + vext.32 q1, q8, q8, #2 | |
751 + vshl.i32 q8, q14, #7 | |
752 + vsri.32 q8, q14, #25 | |
753 + vext.32 q8, q8, q8, #1 | |
754 + vadd.i32 q3, q3, q8 | |
755 + veor q2, q3, q9 | |
756 + vrev32.16 q2, q2 | |
757 + vadd.i32 q9, q1, q2 | |
758 + veor q8, q9, q8 | |
759 + vshl.i32 q14, q8, #12 | |
760 + vsri.32 q14, q8, #20 | |
761 + vadd.i32 q3, q3, q14 | |
762 + veor q2, q3, q2 | |
763 + vshl.i32 q8, q2, #8 | |
764 + vsri.32 q8, q2, #24 | |
765 + vadd.i32 q9, q9, q8 | |
766 + vext.32 q8, q8, q8, #1 | |
767 + veor q14, q9, q14 | |
768 + vext.32 q1, q9, q9, #2 | |
769 + vshl.i32 q9, q14, #7 | |
770 + vsri.32 q9, q14, #25 | |
771 + vext.32 q14, q9, q9, #3 | |
772 + bne .L7 | |
773 + vadd.i32 q8, q10, q8 | |
774 + subs r0, r0, #1 | |
775 + vadd.i32 q3, q11, q3 | |
776 + vldr d0, [r1, #-16] | |
777 + vldr d1, [r1, #-8] | |
778 + vadd.i32 q14, q12, q14 | |
779 + vadd.i32 q1, q13, q1 | |
780 + veor q3, q3, q0 | |
781 + vstr d6, [r2, #-16] | |
782 + vstr d7, [r2, #-8] | |
783 + vadd.i32 q10, q10, q15 | |
784 + vld1.64 {d8-d9}, [r1:64] | |
785 + veor q14, q14, q4 | |
786 + vst1.64 {d28-d29}, [r2:64] | |
787 + vldr d10, [r1, #16] | |
788 + vldr d11, [r1, #24] | |
789 + veor q1, q1, q5 | |
790 + vstr d2, [r2, #16] | |
791 + vstr d3, [r2, #24] | |
792 + vldr d18, [r1, #32] | |
793 + vldr d19, [r1, #40] | |
794 + add r1, r1, #64 | |
795 + veor q8, q8, q9 | |
796 + vstr d16, [r2, #32] | |
797 + vstr d17, [r2, #40] | |
798 + add r2, r2, #64 | |
799 + bne .L6 | |
800 + lsls r6, r6, #6 | |
801 + adds r4, r4, r6 | |
802 + adds r5, r5, r6 | |
803 +.L5: | |
804 + ldr r6, [r7, #196] | |
805 + ands ip, r6, #63 | |
806 + beq .L1 | |
807 + vmov q8, q10 @ v4si | |
808 + movs r3, #10 | |
809 + vmov q14, q13 @ v4si | |
810 + vmov q9, q12 @ v4si | |
811 + vmov q15, q11 @ v4si | |
812 +.L10: | |
813 + vadd.i32 q15, q15, q9 | |
814 + subs r3, r3, #1 | |
815 + veor q8, q8, q15 | |
816 + vrev32.16 q8, q8 | |
817 + vadd.i32 q3, q14, q8 | |
818 + veor q9, q3, q9 | |
819 + vshl.i32 q14, q9, #12 | |
820 + vsri.32 q14, q9, #20 | |
821 + vadd.i32 q15, q15, q14 | |
822 + veor q9, q15, q8 | |
823 + vshl.i32 q8, q9, #8 | |
824 + vsri.32 q8, q9, #24 | |
825 + vadd.i32 q9, q3, q8 | |
826 + vext.32 q8, q8, q8, #3 | |
827 + veor q2, q9, q14 | |
828 + vext.32 q14, q9, q9, #2 | |
829 + vshl.i32 q9, q2, #7 | |
830 + vsri.32 q9, q2, #25 | |
831 + vext.32 q9, q9, q9, #1 | |
832 + vadd.i32 q15, q15, q9 | |
833 + veor q3, q15, q8 | |
834 + vrev32.16 q3, q3 | |
835 + vadd.i32 q14, q14, q3 | |
836 + veor q8, q14, q9 | |
837 + vshl.i32 q9, q8, #12 | |
838 + vsri.32 q9, q8, #20 | |
839 + vadd.i32 q15, q15, q9 | |
840 + veor q3, q15, q3 | |
841 + vshl.i32 q8, q3, #8 | |
842 + vsri.32 q8, q3, #24 | |
843 + vadd.i32 q14, q14, q8 | |
844 + vext.32 q8, q8, q8, #1 | |
845 + veor q3, q14, q9 | |
846 + vext.32 q14, q14, q14, #2 | |
847 + vshl.i32 q9, q3, #7 | |
848 + vsri.32 q9, q3, #25 | |
849 + vext.32 q9, q9, q9, #3 | |
850 + bne .L10 | |
851 + cmp ip, #15 | |
852 + vadd.i32 q11, q11, q15 | |
853 + bhi .L37 | |
854 + ldr r9, [r7, #200] | |
855 + vst1.64 {d22-d23}, [r9:128] | |
856 +.L14: | |
857 + ldr sl, [r7, #196] | |
858 + and r3, sl, #48 | |
859 + cmp ip, r3 | |
860 + bls .L1 | |
861 + adds r0, r5, r3 | |
862 + adds r1, r4, r3 | |
863 + add r2, r0, #16 | |
864 + add r6, r1, #16 | |
865 + cmp r1, r2 | |
866 + it cc | |
867 + cmpcc r0, r6 | |
868 + rsb r9, r3, ip | |
869 + ite cc | |
870 + movcc r2, #0 | |
871 + movcs r2, #1 | |
872 + cmp r9, #15 | |
873 + ite ls | |
874 + movls r2, #0 | |
875 + andhi r2, r2, #1 | |
876 + lsr r8, r9, #4 | |
877 + eor r2, r2, #1 | |
878 + cmp r8, #0 | |
879 + it eq | |
880 + orreq r2, r2, #1 | |
881 + lsl sl, r8, #4 | |
882 + cbnz r2, .L35 | |
883 + ldr fp, [r7, #200] | |
884 + add r6, fp, r3 | |
885 +.L17: | |
886 + vld1.8 {q8}, [r0]! | |
887 + adds r2, r2, #1 | |
888 + cmp r8, r2 | |
889 + vld1.8 {q9}, [r6]! | |
890 + veor q8, q9, q8 | |
891 + vst1.8 {q8}, [r1]! | |
892 + bhi .L17 | |
893 + cmp r9, sl | |
894 + add r3, r3, sl | |
895 + beq .L1 | |
896 +.L35: | |
897 + ldr r0, [r7, #200] | |
898 +.L25: | |
899 + ldrb r2, [r5, r3] @ zero_extendqisi2 | |
900 + ldrb r1, [r3, r0] @ zero_extendqisi2 | |
901 + eors r2, r2, r1 | |
902 + strb r2, [r4, r3] | |
903 + adds r3, r3, #1 | |
904 + cmp ip, r3 | |
905 + bhi .L25 | |
906 +.L1: | |
907 + add r7, r7, #304 | |
908 + mov sp, r7 | |
909 + fldmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} | |
910 + pop {r4, r5, r6, r7, r8, r9, sl, fp} | |
911 + bx lr | |
912 +.L37: | |
913 + cmp ip, #31 | |
914 + vld1.64 {d0-d1}, [r5:64] | |
915 + vadd.i32 q9, q12, q9 | |
916 + veor q11, q11, q0 | |
917 + vst1.64 {d22-d23}, [r4:64] | |
918 + bls .L12 | |
919 + cmp ip, #47 | |
920 + vldr d2, [r5, #16] | |
921 + vldr d3, [r5, #24] | |
922 + vadd.i32 q13, q13, q14 | |
923 + veor q9, q9, q1 | |
924 + vstr d18, [r4, #16] | |
925 + vstr d19, [r4, #24] | |
926 + bls .L13 | |
927 + vadd.i32 q8, q8, q10 | |
928 + vldr d0, [r5, #32] | |
929 + vldr d1, [r5, #40] | |
930 + ldr r6, [r7, #200] | |
931 + vstr d16, [r6, #48] | |
932 + vstr d17, [r6, #56] | |
933 + veor q8, q13, q0 | |
934 + vstr d16, [r4, #32] | |
935 + vstr d17, [r4, #40] | |
936 + b .L14 | |
937 +.L12: | |
938 + ldr r8, [r7, #200] | |
939 + vstr d18, [r8, #16] | |
940 + vstr d19, [r8, #24] | |
941 + b .L14 | |
942 +.L20: | |
943 + ldr r5, [r7, #184] | |
944 + ldr r4, [r7, #188] | |
945 + b .L2 | |
946 +.L13: | |
947 + ldr r6, [r7, #200] | |
948 + vstr d26, [r6, #32] | |
949 + vstr d27, [r6, #40] | |
950 + b .L14 | |
951 +.L42: | |
952 + .align 3 | |
953 +.L41: | |
954 + .word 1 | |
955 + .word 0 | |
956 + .word 0 | |
957 + .word 0 | |
958 + .size CRYPTO_chacha_20_neon, .-CRYPTO_chacha_20_neon | |
959 + .section .rodata | |
960 + .align 3 | |
961 +.LANCHOR0 = . + 0 | |
962 +.LC0: | |
963 + .word 1634760805 | |
964 + .word 857760878 | |
965 + .word 2036477234 | |
966 + .word 1797285236 | |
967 + .ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2012.10-20121022 - Linaro
GCC 2012.10) 4.7.3 20121001 (prerelease)" | |
968 + .section .note.GNU-stack,"",%progbits | |
969 diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c | |
970 index 7bef015..3b6ab1d 100644 | |
971 --- a/crypto/cryptlib.c | |
972 +++ b/crypto/cryptlib.c | |
973 @@ -661,6 +661,20 @@ const char *CRYPTO_get_lock_name(int type) | |
974 return(sk_OPENSSL_STRING_value(app_locks,type-CRYPTO_NUM_LOCKS))
; | |
975 } | |
976 | |
977 +#if __arm__ | |
978 +static int global_arm_neon_enabled = 0; | |
979 + | |
980 +void CRYPTO_set_NEON_capable(int on) | |
981 + { | |
982 + global_arm_neon_enabled = on != 0; | |
983 + } | |
984 + | |
985 +int CRYPTO_is_NEON_capable(void) | |
986 + { | |
987 + return global_arm_neon_enabled; | |
988 + } | |
989 +#endif | |
990 + | |
991 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ | |
992 defined(__INTEL__) || \ | |
993 defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined
(_M_X64) | |
994 diff --git a/crypto/crypto.h b/crypto/crypto.h | |
995 index e11ac73..db339c3 100644 | |
996 --- a/crypto/crypto.h | |
997 +++ b/crypto/crypto.h | |
998 @@ -414,6 +414,14 @@ void CRYPTO_cleanup_all_ex_data(void); | |
999 | |
1000 int CRYPTO_get_new_lockid(char *name); | |
1001 | |
1002 +/* CRYPTO_set_NEON_capable enables (or, if |on| is zero, disables) any NEON | |
1003 + * (ARM vector) dependent code. Call it before any non-init functions. */ | |
1004 +void CRYPTO_set_NEON_capable(int on); | |
1005 + | |
1006 +/* CRYPTO_is_NEON_capable returns one if CRYPTO_set_NEON_capable was last | |
1007 + * called with a non-zero value and zero otherwise (or if never called). */ | |
1008 +int CRYPTO_is_NEON_capable(void); | |
1009 + | |
1010 int CRYPTO_num_locks(void); /* return CRYPTO_NUM_LOCKS (shared libs!) */ | |
1011 void CRYPTO_lock(int mode, int type,const char *file,int line); | |
1012 void CRYPTO_set_locking_callback(void (*func)(int mode,int type, | |
1013 diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c | |
1014 index 2e5621d..00d53bf 100644 | |
1015 --- a/crypto/poly1305/poly1305.c | |
1016 +++ b/crypto/poly1305/poly1305.c | |
1017 @@ -90,6 +90,17 @@ static void U32TO8_LE(unsigned char *m, uint32_t v) | |
1018 } | |
1019 #endif | |
1020 | |
1021 +#if __arm__ | |
1022 +void CRYPTO_poly1305_init_neon(poly1305_state* state, | |
1023 + const unsigned char key[32]); | |
1024 + | |
1025 +void CRYPTO_poly1305_update_neon(poly1305_state* state, | |
1026 + const unsigned char *in, | |
1027 + size_t in_len); | |
1028 + | |
1029 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]); | |
1030 +#endif | |
1031 + | |
1032 static uint64_t | |
1033 mul32x32_64(uint32_t a, uint32_t b) | |
1034 { | |
1035 @@ -207,6 +218,14 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const uns
igned char key[32]) | |
1036 struct poly1305_state_st *state = (struct poly1305_state_st*) statep; | |
1037 uint32_t t0,t1,t2,t3; | |
1038 | |
1039 +#if __arm__ | |
1040 + if (CRYPTO_is_NEON_capable()) | |
1041 + { | |
1042 + CRYPTO_poly1305_init_neon(statep, key); | |
1043 + return; | |
1044 + } | |
1045 +#endif | |
1046 + | |
1047 t0 = U8TO32_LE(key+0); | |
1048 t1 = U8TO32_LE(key+4); | |
1049 t2 = U8TO32_LE(key+8); | |
1050 @@ -241,6 +260,14 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const u
nsigned char *in, | |
1051 unsigned int i; | |
1052 struct poly1305_state_st *state = (struct poly1305_state_st*) statep; | |
1053 | |
1054 +#if __arm__ | |
1055 + if (CRYPTO_is_NEON_capable()) | |
1056 + { | |
1057 + CRYPTO_poly1305_update_neon(statep, in, in_len); | |
1058 + return; | |
1059 + } | |
1060 +#endif | |
1061 + | |
1062 if (state->buf_used) | |
1063 { | |
1064 unsigned int todo = 16 - state->buf_used; | |
1065 @@ -282,6 +309,14 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, unsigne
d char mac[16]) | |
1066 uint32_t g0,g1,g2,g3,g4; | |
1067 uint32_t b, nb; | |
1068 | |
1069 +#if __arm__ | |
1070 + if (CRYPTO_is_NEON_capable()) | |
1071 + { | |
1072 + CRYPTO_poly1305_finish_neon(statep, mac); | |
1073 + return; | |
1074 + } | |
1075 +#endif | |
1076 + | |
1077 if (state->buf_used) | |
1078 poly1305_update(state, state->buf, state->buf_used); | |
1079 | |
1080 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c | |
1081 index adcef35..34e339d 100644 | |
1082 --- a/crypto/poly1305/poly1305_arm.c | |
1083 +++ b/crypto/poly1305/poly1305_arm.c | |
1084 @@ -51,6 +51,7 @@ | |
1085 * SUPERCOP by D. J. Bernstein and Peter Schwabe. */ | |
1086 | |
1087 #include <stdint.h> | |
1088 +#include <string.h> | |
1089 | |
1090 #include <openssl/poly1305.h> | |
1091 | |
1092 @@ -202,7 +203,8 @@ struct poly1305_state_st { | |
1093 unsigned char key[16]; | |
1094 }; | |
1095 | |
1096 -void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32]) | |
1097 +void CRYPTO_poly1305_init_neon(poly1305_state *state, | |
1098 + const unsigned char key[32]) | |
1099 { | |
1100 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); | |
1101 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); | |
1102 @@ -227,7 +229,8 @@ void CRYPTO_poly1305_init(poly1305_state *state, const unsig
ned char key[32]) | |
1103 st->buf_used = 0; | |
1104 } | |
1105 | |
1106 -void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, siz
e_t in_len) | |
1107 +void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in
, | |
1108 + size_t in_len) | |
1109 { | |
1110 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); | |
1111 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); | |
1112 @@ -285,7 +288,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uns
igned char *in, size | |
1113 } | |
1114 } | |
1115 | |
1116 -void CRYPTO_poly1305_finish(poly1305_state* state, unsigned char mac[16]) | |
1117 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]) | |
1118 { | |
1119 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); | |
1120 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); | |
1121 -- | |
1122 1.8.4.1 | |
1123 | |
OLD | NEW |