Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(177)

Side by Side Diff: openssl/patches/neon_runtime.patch

Issue 59083010: third_party/openssl: add ChaCha20+Poly1305 support. Base URL: https://chromium.googlesource.com/chromium/deps/openssl.git@master
Patch Set: Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 From aea47606333cfd3e7a09cab3e42e488c79a416af Mon Sep 17 00:00:00 2001
2 From: Adam Langley <agl@chromium.org>
3 Date: Tue, 5 Nov 2013 13:10:11 -0500
4 Subject: [PATCH 52/52] Optional NEON support on ARM.
5
6 This patch causes ARM to build both the NEON and generic versions of
7 ChaCha20 and Poly1305. The NEON code can be enabled at run-time by
8 calling CRYPTO_set_NEON_capable(1).
9 ---
10 .gitignore | 1 +
11 Configure | 2 +-
12 apps/speed.c | 5 +
13 crypto/chacha/chacha_enc.c | 18 +
14 crypto/chacha/chacha_vec.c | 7 +
15 crypto/chacha/chacha_vec_arm.s | 846 +++++++++++++++++++++++++++++++++++++++++
16 crypto/cryptlib.c | 14 +
17 crypto/crypto.h | 8 +
18 crypto/poly1305/poly1305.c | 35 ++
19 crypto/poly1305/poly1305_arm.c | 9 +-
20 10 files changed, 941 insertions(+), 4 deletions(-)
21 create mode 100644 crypto/chacha/chacha_vec_arm.s
22
23 diff --git a/Configure b/Configure
24 index 1b95384..18b7af0 100755
25 --- a/Configure
26 +++ b/Configure
27 @@ -136,7 +136,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a
28 my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::::";
29 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::::";
30 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::::ghash-s390x.o:";
31 -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec.o:poly1305_arm.o poly1305_arm_asm.o:void";
32 +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec_arm.o chacha_enc.o:poly1305.o poly1305_arm.o poly1305_arm_asm.o:void";
33 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::32";
34 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::64";
35 my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::::";
36 diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c
37 index 54d1ca3..e4b648f 100644
38 --- a/crypto/chacha/chacha_enc.c
39 +++ b/crypto/chacha/chacha_enc.c
40 @@ -61,6 +61,7 @@
41
42 #if !defined(OPENSSL_NO_CHACHA)
43
44 +#include <openssl/crypto.h>
45 #include <openssl/chacha.h>
46
47 /* sigma contains the ChaCha constants, which happen to be an ASCII string. */
48 @@ -87,6 +88,15 @@ static const char sigma[16] = "expand 32-byte k";
49
50 typedef unsigned int uint32_t;
51
52 +#if __arm__
53 +/* Defined in chacha_vec.c */
54 +void CRYPTO_chacha_20_neon(unsigned char *out,
55 + const unsigned char *in, size_t in_len,
56 + const unsigned char key[32],
57 + const unsigned char nonce[8],
58 + size_t counter);
59 +#endif
60 +
61 /* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in
62 * |input| and writes the 64 output bytes to |output|. */
63 static void chacha_core(unsigned char output[64], const uint32_t input[16],
64 @@ -124,6 +134,14 @@ void CRYPTO_chacha_20(unsigned char *out,
65 unsigned char buf[64];
66 size_t todo, i;
67
68 +#if __arm__
69 + if (CRYPTO_is_NEON_capable())
70 + {
71 + CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter);
72 + return;
73 + }
74 +#endif
75 +
76 input[0] = U8TO32_LITTLE(sigma + 0);
77 input[1] = U8TO32_LITTLE(sigma + 4);
78 input[2] = U8TO32_LITTLE(sigma + 8);
79 diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c
80 index 33b2238..1226c39 100644
81 --- a/crypto/chacha/chacha_vec.c
82 +++ b/crypto/chacha/chacha_vec.c
83 @@ -154,7 +154,14 @@ typedef unsigned vec __attribute__ ((vector_size (16)));
84 STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \
85 STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));
86
87 +#if __ARM_NEON__
88 +/* For ARM, we can't depend on NEON support, so this function is compiled with
89 + * a different name, along with the generic code, and can be enabled at
90 + * run-time. */
91 +void CRYPTO_chacha_20_neon(
92 +#else
93 void CRYPTO_chacha_20(
94 +#endif
95 unsigned char *out,
96 const unsigned char *in,
97 size_t inlen,
98 diff --git a/crypto/chacha/chacha_vec_arm.s b/crypto/chacha/chacha_vec_arm.s
99 new file mode 100644
100 index 0000000..24a5050
101 --- /dev/null
102 +++ b/crypto/chacha/chacha_vec_arm.s
103 @@ -0,0 +1,846 @@
104 + .syntax unified
105 + .cpu cortex-a8
106 + .eabi_attribute 27, 3
107 + .eabi_attribute 28, 1
108 + .fpu neon
109 + .eabi_attribute 20, 1
110 + .eabi_attribute 21, 1
111 + .eabi_attribute 23, 3
112 + .eabi_attribute 24, 1
113 + .eabi_attribute 25, 1
114 + .eabi_attribute 26, 2
115 + .eabi_attribute 30, 2
116 + .eabi_attribute 34, 1
117 + .eabi_attribute 18, 4
118 + .thumb
119 + .file "chacha_vec.c"
120 + .text
121 + .align 2
122 + .global CRYPTO_chacha_20_neon
123 + .thumb
124 + .thumb_func
125 + .type CRYPTO_chacha_20_neon, %function
126 +CRYPTO_chacha_20_neon:
127 + @ args = 8, pretend = 0, frame = 296
128 + @ frame_needed = 1, uses_anonymous_args = 0
129 + @ link register save eliminated.
130 + push {r4, r5, r6, r7, r8, r9, sl, fp}
131 + fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
132 + sub sp, sp, #296
133 + add r7, sp, #0
134 + movw ip, #43691
135 + movt ip, 43690
136 + str r2, [r7, #192]
137 + sub sp, sp, #96
138 + ldr r4, [r7, #192]
139 + ldr r6, [r7, #392]
140 + ldr r2, .L38+16
141 + umull r4, ip, ip, r4
142 + ldr r6, [r6, #0]
143 + ldr r8, [r7, #392]
144 + add r4, sp, #15
145 + str r3, [r7, #236]
146 + bic r4, r4, #15
147 + str r6, [r7, #172]
148 + str r4, [r7, #196]
149 + str r0, [r7, #184]
150 + lsrs ip, ip, #7
151 + str r1, [r7, #180]
152 + ldmia r2, {r0, r1, r2, r3}
153 + ldr r4, [r8, #4]
154 + ldr r5, [r7, #236]
155 + vld1.64 {d24-d25}, [r5:64]
156 + vldr d26, [r5, #16]
157 + vldr d27, [r5, #24]
158 + ldr fp, [r7, #196]
159 + ldr r8, [r7, #396]
160 + ldr r5, [r7, #172]
161 + add r6, fp, #64
162 + str r4, [r7, #292]
163 + mov r4, #0
164 + str r8, [r7, #280]
165 + str r5, [r7, #288]
166 + str r4, [r7, #284]
167 + stmia r6, {r0, r1, r2, r3}
168 + vldr d22, [fp, #64]
169 + vldr d23, [fp, #72]
170 + vldr d20, [r7, #280]
171 + vldr d21, [r7, #288]
172 + str ip, [r7, #188]
173 + beq .L20
174 + lsl r6, ip, #1
175 + ldr r1, [fp, #68]
176 + add r3, r6, ip
177 + str r6, [r7, #176]
178 + ldr r2, [fp, #72]
179 + add r8, r8, #2
180 + ldr r5, [fp, #76]
181 + vldr d18, .L38
182 + vldr d19, .L38+8
183 + str r4, [r7, #232]
184 + ldr r6, [r7, #180]
185 + ldr r4, [r7, #184]
186 + str r0, [r7, #220]
187 + str r1, [r7, #216]
188 + str r8, [r7, #200]
189 + str r2, [r7, #212]
190 + str r3, [r7, #204]
191 + str r5, [r7, #208]
192 + str r6, [r7, #244]
193 + str r4, [r7, #240]
194 +.L4:
195 + ldr r6, [r7, #236]
196 + vadd.i32 q8, q10, q9
197 + ldr r5, [r7, #236]
198 + vmov q15, q13 @ v4si
199 + ldr r8, [r7, #232]
200 + vmov q3, q12 @ v4si
201 + ldr r6, [r6, #4]
202 + vmov q2, q11 @ v4si
203 + ldr fp, [r7, #200]
204 + vmov q5, q10 @ v4si
205 + ldr r4, [r7, #236]
206 + vmov q1, q13 @ v4si
207 + add ip, r8, fp
208 + ldr r5, [r5, #0]
209 + ldr r0, [r7, #236]
210 + add r8, r7, #208
211 + ldr r1, [r7, #236]
212 + vmov q0, q12 @ v4si
213 + str r6, [r7, #260]
214 + vmov q4, q11 @ v4si
215 + ldr r6, [r7, #392]
216 + ldmia r8, {r8, r9, sl, fp}
217 + ldr r0, [r0, #8]
218 + ldr r1, [r1, #12]
219 + str r5, [r7, #224]
220 + ldr r5, [r4, #24]
221 + ldr r3, [r4, #28]
222 + ldr r2, [r6, #4]
223 + str r0, [r7, #256]
224 + str r1, [r7, #228]
225 + str r5, [r7, #272]
226 + ldr r5, [r6, #0]
227 + movs r6, #0
228 + ldr r0, [r4, #16]
229 + ldr r1, [r4, #20]
230 + movs r4, #10
231 + str r2, [r7, #20]
232 + str r3, [r7, #276]
233 + str r9, [r7, #268]
234 + mov r9, r6
235 + str r4, [r7, #248]
236 + ldr r2, [r7, #256]
237 + ldr r3, [r7, #228]
238 + str r8, [r7, #252]
239 + mov r8, sl
240 + ldr r6, [r7, #272]
241 + mov sl, ip
242 + str r1, [r7, #264]
243 + ldr ip, [r7, #20]
244 + str r6, [r7, #256]
245 + mov r6, r5
246 + ldr r1, [r7, #260]
247 + mov r5, r0
248 + ldr r0, [r7, #224]
249 + b .L39
250 +.L40:
251 + .align 3
252 +.L38:
253 + .word 1
254 + .word 0
255 + .word 0
256 + .word 0
257 + .word .LANCHOR0
258 +.L39:
259 +.L3:
260 + vadd.i32 q4, q4, q0
261 + add r8, r8, r1
262 + vadd.i32 q2, q2, q3
263 + str r8, [r7, #260]
264 + veor q5, q5, q4
265 + ldr r8, [r7, #268]
266 + veor q8, q8, q2
267 + add fp, fp, r0
268 + str fp, [r7, #272]
269 + add r8, r8, r2
270 + vrev32.16 q5, q5
271 + str r8, [r7, #268]
272 + vrev32.16 q8, q8
273 + vadd.i32 q1, q1, q5
274 + vadd.i32 q15, q15, q8
275 + ldr r8, [r7, #272]
276 + veor q0, q1, q0
277 + ldr r4, [r7, #252]
278 + veor q3, q15, q3
279 + eor sl, sl, r8
280 + ldr r8, [r7, #268]
281 + add fp, r4, r3
282 + vshl.i32 q7, q0, #12
283 + ldr r4, [r7, #260]
284 + vshl.i32 q6, q3, #12
285 + eor r6, r6, r8
286 + eor r9, r9, r4
287 + ldr r4, [r7, #264]
288 + vsri.32 q7, q0, #20
289 + ror r8, r6, #16
290 + ldr r6, [r7, #256]
291 + eor ip, ip, fp
292 + vsri.32 q6, q3, #20
293 + ror sl, sl, #16
294 + ror r9, r9, #16
295 + add r5, r5, sl
296 + vadd.i32 q4, q4, q7
297 + str r5, [r7, #228]
298 + vadd.i32 q2, q2, q6
299 + add r5, r4, r9
300 + add r4, r6, r8
301 + ldr r6, [r7, #276]
302 + ror ip, ip, #16
303 + veor q5, q4, q5
304 + veor q8, q2, q8
305 + add r6, r6, ip
306 + str r6, [r7, #256]
307 + eors r1, r1, r5
308 + ldr r6, [r7, #228]
309 + vshl.i32 q3, q5, #8
310 + vshl.i32 q14, q8, #8
311 + eors r2, r2, r4
312 + eors r0, r0, r6
313 + ldr r6, [r7, #256]
314 + vsri.32 q3, q5, #24
315 + ror r1, r1, #20
316 + eors r3, r3, r6
317 + ldr r6, [r7, #272]
318 + ror r0, r0, #20
319 + vsri.32 q14, q8, #24
320 + adds r6, r0, r6
321 + str r6, [r7, #276]
322 + ldr r6, [r7, #260]
323 + vadd.i32 q1, q1, q3
324 + vadd.i32 q15, q15, q14
325 + ror r2, r2, #20
326 + adds r6, r1, r6
327 + str r6, [r7, #252]
328 + ldr r6, [r7, #268]
329 + veor q6, q15, q6
330 + veor q7, q1, q7
331 + ror r3, r3, #20
332 + adds r6, r2, r6
333 + str r6, [r7, #272]
334 + ldr r6, [r7, #276]
335 + vshl.i32 q0, q6, #7
336 + vshl.i32 q5, q7, #7
337 + add fp, r3, fp
338 + eor sl, r6, sl
339 + ldr r6, [r7, #252]
340 + eor ip, fp, ip
341 + vsri.32 q0, q6, #25
342 + eor r9, r6, r9
343 + ldr r6, [r7, #272]
344 + ror sl, sl, #24
345 + vsri.32 q5, q7, #25
346 + eor r8, r6, r8
347 + ldr r6, [r7, #228]
348 + ror r9, r9, #24
349 + ror ip, ip, #24
350 + add r6, sl, r6
351 + str r6, [r7, #268]
352 + ldr r6, [r7, #256]
353 + add r5, r9, r5
354 + str r5, [r7, #264]
355 + vext.32 q5, q5, q5, #1
356 + add r5, ip, r6
357 + ldr r6, [r7, #268]
358 + vext.32 q0, q0, q0, #1
359 + vadd.i32 q4, q4, q5
360 + eors r0, r0, r6
361 + ldr r6, [r7, #264]
362 + vadd.i32 q2, q2, q0
363 + vext.32 q3, q3, q3, #3
364 + ror r8, r8, #24
365 + eors r1, r1, r6
366 + vext.32 q14, q14, q14, #3
367 + add r4, r8, r4
368 + ldr r6, [r7, #276]
369 + veor q3, q4, q3
370 + veor q14, q2, q14
371 + eors r2, r2, r4
372 + ror r1, r1, #25
373 + vext.32 q1, q1, q1, #2
374 + adds r6, r1, r6
375 + str r6, [r7, #276]
376 + vext.32 q15, q15, q15, #2
377 + ldr r6, [r7, #252]
378 + eors r3, r3, r5
379 + ror r2, r2, #25
380 + vrev32.16 q8, q14
381 + adds r6, r2, r6
382 + vrev32.16 q3, q3
383 + str r6, [r7, #260]
384 + vadd.i32 q1, q1, q3
385 + ldr r6, [r7, #272]
386 + vadd.i32 q15, q15, q8
387 + ror r3, r3, #25
388 + veor q5, q1, q5
389 + adds r6, r3, r6
390 + veor q0, q15, q0
391 + str r6, [r7, #256]
392 + ldr r6, [r7, #260]
393 + ror r0, r0, #25
394 + add fp, r0, fp
395 + vshl.i32 q6, q5, #12
396 + eor sl, r6, sl
397 + ldr r6, [r7, #276]
398 + vshl.i32 q14, q0, #12
399 + eor r8, fp, r8
400 + eor ip, r6, ip
401 + ldr r6, [r7, #256]
402 + vsri.32 q6, q5, #20
403 + ror sl, sl, #16
404 + eor r9, r6, r9
405 + ror r6, r8, #16
406 + vsri.32 q14, q0, #20
407 + ldr r8, [r7, #264]
408 + ror ip, ip, #16
409 + add r5, sl, r5
410 + add r8, r6, r8
411 + add r4, ip, r4
412 + str r4, [r7, #228]
413 + eor r0, r8, r0
414 + str r5, [r7, #272]
415 + vadd.i32 q4, q4, q6
416 + ldr r5, [r7, #228]
417 + vadd.i32 q2, q2, q14
418 + ldr r4, [r7, #268]
419 + ror r0, r0, #20
420 + veor q3, q4, q3
421 + eors r1, r1, r5
422 + veor q0, q2, q8
423 + str r8, [r7, #264]
424 + str r0, [r7, #20]
425 + add fp, r0, fp
426 + ldr r8, [r7, #272]
427 + ror r9, r9, #16
428 + ldr r0, [r7, #276]
429 + add r4, r9, r4
430 + str fp, [r7, #252]
431 + ror r1, r1, #20
432 + add fp, r1, r0
433 + eor r2, r8, r2
434 + ldr r0, [r7, #252]
435 + eors r3, r3, r4
436 + vshl.i32 q5, q3, #8
437 + str r4, [r7, #224]
438 + vshl.i32 q8, q0, #8
439 + ldr r4, [r7, #260]
440 + ldr r5, [r7, #256]
441 + ror r2, r2, #20
442 + ror r3, r3, #20
443 + eors r6, r6, r0
444 + adds r5, r3, r5
445 + add r8, r2, r4
446 + vsri.32 q5, q3, #24
447 + ldr r4, [r7, #264]
448 + eor r9, r5, r9
449 + eor ip, fp, ip
450 + vsri.32 q8, q0, #24
451 + eor sl, r8, sl
452 + ror r6, r6, #24
453 + ldr r0, [r7, #272]
454 + str r5, [r7, #268]
455 + adds r4, r6, r4
456 + ldr r5, [r7, #228]
457 + vadd.i32 q1, q1, q5
458 + str r4, [r7, #264]
459 + vadd.i32 q15, q15, q8
460 + ldr r4, [r7, #224]
461 + ror ip, ip, #24
462 + ror sl, sl, #24
463 + ror r9, r9, #24
464 + add r5, ip, r5
465 + add r0, sl, r0
466 + str r5, [r7, #256]
467 + add r5, r9, r4
468 + str r0, [r7, #276]
469 + veor q6, q1, q6
470 + ldr r4, [r7, #20]
471 + veor q14, q15, q14
472 + ldr r0, [r7, #264]
473 + eors r3, r3, r5
474 + vshl.i32 q0, q6, #7
475 + vext.32 q1, q1, q1, #2
476 + eors r0, r0, r4
477 + ldr r4, [r7, #276]
478 + str r0, [r7, #272]
479 + vshl.i32 q3, q14, #7
480 + eors r2, r2, r4
481 + ldr r4, [r7, #272]
482 + ldr r0, [r7, #256]
483 + vsri.32 q0, q6, #25
484 + ror r2, r2, #25
485 + ror r3, r3, #25
486 + eors r1, r1, r0
487 + vsri.32 q3, q14, #25
488 + ror r0, r4, #25
489 + ldr r4, [r7, #248]
490 + ror r1, r1, #25
491 + vext.32 q5, q5, q5, #1
492 + subs r4, r4, #1
493 + str r4, [r7, #248]
494 + vext.32 q15, q15, q15, #2
495 + vext.32 q8, q8, q8, #1
496 + vext.32 q0, q0, q0, #3
497 + vext.32 q3, q3, q3, #3
498 + bne .L3
499 + ldr r4, [r7, #256]
500 + vadd.i32 q4, q11, q4
501 + str r2, [r7, #256]
502 + vadd.i32 q14, q10, q9
503 + ldr r2, [r7, #244]
504 + vld1.64 {d12-d13}, [r2:64]
505 + str r4, [r7, #272]
506 + veor q4, q4, q6
507 + ldr r4, [r7, #220]
508 + vadd.i32 q10, q10, q5
509 + ldr r2, [r7, #216]
510 + vadd.i32 q0, q12, q0
511 + add fp, fp, r4
512 + str ip, [r7, #20]
513 + ldr r4, [r7, #212]
514 + mov ip, sl
515 + str r0, [r7, #224]
516 + mov sl, r8
517 + mov r0, r5
518 + ldr r8, [r7, #252]
519 + mov r5, r6
520 + add sl, sl, r2
521 + mov r6, r9
522 + ldr r2, [r7, #208]
523 + ldr r9, [r7, #268]
524 + vadd.i32 q1, q13, q1
525 + vadd.i32 q2, q11, q2
526 + str r1, [r7, #260]
527 + add r9, r9, r4
528 + add r4, r8, r2
529 + ldr r8, [r7, #232]
530 + vadd.i32 q3, q12, q3
531 + vadd.i32 q15, q13, q15
532 + str r3, [r7, #228]
533 + add r2, r8, #2
534 + vadd.i32 q8, q14, q8
535 + add ip, r2, ip
536 + ldr r2, [r7, #240]
537 + vst1.64 {d8-d9}, [r2:64]
538 + ldr r2, [r7, #244]
539 + ldr r3, [r7, #276]
540 + vldr d8, [r2, #16]
541 + vldr d9, [r2, #24]
542 + ldr r1, [r7, #264]
543 + veor q0, q0, q4
544 + add r8, r8, #3
545 + str r8, [r7, #232]
546 + ldr r8, [r7, #240]
547 + vstr d0, [r8, #16]
548 + vstr d1, [r8, #24]
549 + vldr d0, [r2, #32]
550 + vldr d1, [r2, #40]
551 + veor q1, q1, q0
552 + vstr d2, [r8, #32]
553 + vstr d3, [r8, #40]
554 + vldr d2, [r2, #48]
555 + vldr d3, [r2, #56]
556 + veor q10, q10, q1
557 + vstr d20, [r8, #48]
558 + vstr d21, [r8, #56]
559 + vldr d8, [r2, #64]
560 + vldr d9, [r2, #72]
561 + veor q2, q2, q4
562 + vstr d4, [r8, #64]
563 + vstr d5, [r8, #72]
564 + vldr d10, [r2, #80]
565 + vldr d11, [r2, #88]
566 + veor q3, q3, q5
567 + vstr d6, [r8, #80]
568 + vstr d7, [r8, #88]
569 + vldr d12, [r2, #96]
570 + vldr d13, [r2, #104]
571 + veor q15, q15, q6
572 + vstr d30, [r8, #96]
573 + vstr d31, [r8, #104]
574 + vldr d20, [r2, #112]
575 + vldr d21, [r2, #120]
576 + veor q8, q8, q10
577 + vstr d16, [r8, #112]
578 + vstr d17, [r8, #120]
579 + mov r8, r2
580 + ldr r2, [r2, #128]
581 + vadd.i32 q10, q14, q9
582 + eor r2, fp, r2
583 + ldr fp, [r7, #240]
584 + vadd.i32 q10, q10, q9
585 + str r2, [fp, #128]
586 + ldr r2, [r8, #132]
587 + eor r2, sl, r2
588 + str r2, [fp, #132]
589 + ldr r2, [r8, #136]
590 + eor r2, r9, r2
591 + str r2, [fp, #136]
592 + ldr r2, [r8, #140]
593 + eors r2, r2, r4
594 + str r2, [fp, #140]
595 + ldr r2, [r7, #236]
596 + ldr r4, [r8, #144]
597 + ldr r2, [r2, #0]
598 + str r4, [r7, #168]
599 + ldr r4, [r7, #224]
600 + add r8, r4, r2
601 + ldr r2, [r7, #168]
602 + ldr r4, [r7, #236]
603 + eor r8, r8, r2
604 + ldr r2, [r7, #244]
605 + str r8, [fp, #144]
606 + ldr r4, [r4, #4]
607 + ldr r2, [r2, #148]
608 + str r2, [r7, #36]
609 + ldr r2, [r7, #260]
610 + add r8, r2, r4
611 + ldr r4, [r7, #36]
612 + ldr r2, [r7, #236]
613 + eor r8, r8, r4
614 + ldr r4, [r7, #244]
615 + str r8, [fp, #148]
616 + ldr r2, [r2, #8]
617 + ldr r4, [r4, #152]
618 + str r4, [r7, #32]
619 + ldr r4, [r7, #256]
620 + add r8, r4, r2
621 + ldr r2, [r7, #32]
622 + eor r8, r8, r2
623 + str r8, [fp, #152]
624 + ldr r2, [r7, #244]
625 + ldr r4, [r7, #236]
626 + ldr r2, [r2, #156]
627 + ldr r4, [r4, #12]
628 + str r2, [r7, #28]
629 + ldr r2, [r7, #228]
630 + add r8, r2, r4
631 + ldr r4, [r7, #28]
632 + ldr r2, [r7, #244]
633 + eor r8, r8, r4
634 + str r8, [fp, #156]
635 + ldr r8, [r7, #236]
636 + ldr r2, [r2, #160]
637 + ldr r4, [r8, #16]
638 + adds r0, r0, r4
639 + ldr r4, [r7, #244]
640 + eors r0, r0, r2
641 + str r0, [fp, #160]
642 + ldr r0, [r8, #20]
643 + ldr r2, [r4, #164]
644 + adds r1, r1, r0
645 + ldr r0, [r7, #272]
646 + eors r1, r1, r2
647 + str r1, [fp, #164]
648 + ldr r2, [r8, #24]
649 + ldr r1, [r4, #168]
650 + adds r2, r0, r2
651 + eors r2, r2, r1
652 + str r2, [fp, #168]
653 + ldr r1, [r8, #28]
654 + ldr r2, [r4, #172]
655 + adds r3, r3, r1
656 + eors r3, r3, r2
657 + str r3, [fp, #172]
658 + ldr r3, [r4, #176]
659 + eor r3, ip, r3
660 + str r3, [fp, #176]
661 + ldr r3, [r4, #180]
662 + ldr r4, [r7, #392]
663 + eors r6, r6, r3
664 + str r6, [fp, #180]
665 + ldr r6, [r7, #244]
666 + ldr r2, [r4, #0]
667 + ldr r3, [r6, #184]
668 + adds r5, r5, r2
669 + eors r5, r5, r3
670 + str r5, [fp, #184]
671 + ldr r2, [r6, #188]
672 + adds r6, r6, #192
673 + ldr r3, [r4, #4]
674 + str r6, [r7, #244]
675 + ldr r0, [r7, #20]
676 + ldr r1, [r7, #232]
677 + adds r4, r0, r3
678 + eors r4, r4, r2
679 + ldr r2, [r7, #204]
680 + str r4, [fp, #188]
681 + add fp, fp, #192
682 + cmp r1, r2
683 + str fp, [r7, #240]
684 + bne .L4
685 + ldr r4, [r7, #188]
686 + ldr r3, [r7, #176]
687 + ldr r6, [r7, #184]
688 + adds r5, r3, r4
689 + ldr r8, [r7, #180]
690 + lsls r5, r5, #6
691 + adds r4, r6, r5
692 + add r5, r8, r5
693 +.L2:
694 + ldr fp, [r7, #192]
695 + movw r3, #43691
696 + movt r3, 43690
697 + ldr r6, [r7, #192]
698 + umull fp, r3, r3, fp
699 + lsrs r3, r3, #7
700 + add r3, r3, r3, lsl #1
701 + sub r3, r6, r3, lsl #6
702 + lsrs r6, r3, #6
703 + beq .L5
704 + add r1, r5, #16
705 + add r2, r4, #16
706 + mov r0, r6
707 + vldr d30, .L41
708 + vldr d31, .L41+8
709 +.L6:
710 + vmov q8, q10 @ v4si
711 + movs r3, #10
712 + vmov q1, q13 @ v4si
713 + vmov q14, q12 @ v4si
714 + vmov q3, q11 @ v4si
715 +.L7:
716 + vadd.i32 q3, q3, q14
717 + subs r3, r3, #1
718 + veor q2, q8, q3
719 + vrev32.16 q2, q2
720 + vadd.i32 q8, q1, q2
721 + veor q9, q8, q14
722 + vshl.i32 q14, q9, #12
723 + vsri.32 q14, q9, #20
724 + vadd.i32 q3, q3, q14
725 + veor q2, q3, q2
726 + vshl.i32 q9, q2, #8
727 + vsri.32 q9, q2, #24
728 + vadd.i32 q8, q8, q9
729 + vext.32 q9, q9, q9, #3
730 + veor q14, q8, q14
731 + vext.32 q1, q8, q8, #2
732 + vshl.i32 q8, q14, #7
733 + vsri.32 q8, q14, #25
734 + vext.32 q8, q8, q8, #1
735 + vadd.i32 q3, q3, q8
736 + veor q2, q3, q9
737 + vrev32.16 q2, q2
738 + vadd.i32 q9, q1, q2
739 + veor q8, q9, q8
740 + vshl.i32 q14, q8, #12
741 + vsri.32 q14, q8, #20
742 + vadd.i32 q3, q3, q14
743 + veor q2, q3, q2
744 + vshl.i32 q8, q2, #8
745 + vsri.32 q8, q2, #24
746 + vadd.i32 q9, q9, q8
747 + vext.32 q8, q8, q8, #1
748 + veor q14, q9, q14
749 + vext.32 q1, q9, q9, #2
750 + vshl.i32 q9, q14, #7
751 + vsri.32 q9, q14, #25
752 + vext.32 q14, q9, q9, #3
753 + bne .L7
754 + vadd.i32 q8, q10, q8
755 + subs r0, r0, #1
756 + vadd.i32 q3, q11, q3
757 + vldr d0, [r1, #-16]
758 + vldr d1, [r1, #-8]
759 + vadd.i32 q14, q12, q14
760 + vadd.i32 q1, q13, q1
761 + veor q3, q3, q0
762 + vstr d6, [r2, #-16]
763 + vstr d7, [r2, #-8]
764 + vadd.i32 q10, q10, q15
765 + vld1.64 {d8-d9}, [r1:64]
766 + veor q14, q14, q4
767 + vst1.64 {d28-d29}, [r2:64]
768 + vldr d10, [r1, #16]
769 + vldr d11, [r1, #24]
770 + veor q1, q1, q5
771 + vstr d2, [r2, #16]
772 + vstr d3, [r2, #24]
773 + vldr d18, [r1, #32]
774 + vldr d19, [r1, #40]
775 + add r1, r1, #64
776 + veor q8, q8, q9
777 + vstr d16, [r2, #32]
778 + vstr d17, [r2, #40]
779 + add r2, r2, #64
780 + bne .L6
781 + lsls r6, r6, #6
782 + adds r4, r4, r6
783 + adds r5, r5, r6
784 +.L5:
785 + ldr r6, [r7, #192]
786 + ands ip, r6, #63
787 + beq .L1
788 + vmov q8, q10 @ v4si
789 + movs r3, #10
790 + vmov q14, q13 @ v4si
791 + vmov q9, q12 @ v4si
792 + vmov q15, q11 @ v4si
793 +.L10:
794 + vadd.i32 q15, q15, q9
795 + subs r3, r3, #1
796 + veor q8, q8, q15
797 + vrev32.16 q8, q8
798 + vadd.i32 q3, q14, q8
799 + veor q9, q3, q9
800 + vshl.i32 q14, q9, #12
801 + vsri.32 q14, q9, #20
802 + vadd.i32 q15, q15, q14
803 + veor q9, q15, q8
804 + vshl.i32 q8, q9, #8
805 + vsri.32 q8, q9, #24
806 + vadd.i32 q9, q3, q8
807 + vext.32 q8, q8, q8, #3
808 + veor q2, q9, q14
809 + vext.32 q14, q9, q9, #2
810 + vshl.i32 q9, q2, #7
811 + vsri.32 q9, q2, #25
812 + vext.32 q9, q9, q9, #1
813 + vadd.i32 q15, q15, q9
814 + veor q3, q15, q8
815 + vrev32.16 q3, q3
816 + vadd.i32 q14, q14, q3
817 + veor q8, q14, q9
818 + vshl.i32 q9, q8, #12
819 + vsri.32 q9, q8, #20
820 + vadd.i32 q15, q15, q9
821 + veor q3, q15, q3
822 + vshl.i32 q8, q3, #8
823 + vsri.32 q8, q3, #24
824 + vadd.i32 q14, q14, q8
825 + vext.32 q8, q8, q8, #1
826 + veor q3, q14, q9
827 + vext.32 q14, q14, q14, #2
828 + vshl.i32 q9, q3, #7
829 + vsri.32 q9, q3, #25
830 + vext.32 q9, q9, q9, #3
831 + bne .L10
832 + cmp ip, #15
833 + vadd.i32 q11, q11, q15
834 + bhi .L37
835 + ldr fp, [r7, #196]
836 + vst1.64 {d22-d23}, [fp:128]
837 +.L14:
838 + ldr r6, [r7, #192]
839 + and r3, r6, #48
840 + cmp ip, r3
841 + bls .L1
842 + adds r0, r5, r3
843 + adds r1, r4, r3
844 + add r2, r0, #16
845 + add r6, r1, #16
846 + cmp r1, r2
847 + it cc
848 + cmpcc r0, r6
849 + rsb r9, r3, ip
850 + ite cc
851 + movcc r2, #0
852 + movcs r2, #1
853 + cmp r9, #15
854 + ite ls
855 + movls r2, #0
856 + andhi r2, r2, #1
857 + lsr r8, r9, #4
858 + eor r2, r2, #1
859 + cmp r8, #0
860 + it eq
861 + orreq r2, r2, #1
862 + lsl sl, r8, #4
863 + cbnz r2, .L35
864 + ldr fp, [r7, #196]
865 + add r6, fp, r3
866 +.L17:
867 + vld1.8 {q8}, [r0]!
868 + adds r2, r2, #1
869 + cmp r8, r2
870 + vld1.8 {q9}, [r6]!
871 + veor q8, q9, q8
872 + vst1.8 {q8}, [r1]!
873 + bhi .L17
874 + cmp r9, sl
875 + add r3, r3, sl
876 + beq .L1
877 +.L35:
878 + ldr r0, [r7, #196]
879 +.L25:
880 + ldrb r2, [r5, r3] @ zero_extendqisi2
881 + ldrb r1, [r3, r0] @ zero_extendqisi2
882 + eors r2, r2, r1
883 + strb r2, [r4, r3]
884 + adds r3, r3, #1
885 + cmp ip, r3
886 + bhi .L25
887 +.L1:
888 + add r7, r7, #296
889 + mov sp, r7
890 + fldmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
891 + pop {r4, r5, r6, r7, r8, r9, sl, fp}
892 + bx lr
893 +.L37:
894 + cmp ip, #31
895 + vld1.64 {d0-d1}, [r5:64]
896 + vadd.i32 q9, q12, q9
897 + veor q11, q11, q0
898 + vst1.64 {d22-d23}, [r4:64]
899 + bls .L12
900 + cmp ip, #47
901 + vldr d2, [r5, #16]
902 + vldr d3, [r5, #24]
903 + vadd.i32 q13, q13, q14
904 + veor q9, q9, q1
905 + vstr d18, [r4, #16]
906 + vstr d19, [r4, #24]
907 + bls .L13
908 + vadd.i32 q8, q8, q10
909 + vldr d0, [r5, #32]
910 + vldr d1, [r5, #40]
911 + ldr r6, [r7, #196]
912 + vstr d16, [r6, #48]
913 + vstr d17, [r6, #56]
914 + veor q8, q13, q0
915 + vstr d16, [r4, #32]
916 + vstr d17, [r4, #40]
917 + b .L14
918 +.L12:
919 + ldr r8, [r7, #196]
920 + vstr d18, [r8, #16]
921 + vstr d19, [r8, #24]
922 + b .L14
923 +.L20:
924 + ldr r5, [r7, #180]
925 + ldr r4, [r7, #184]
926 + b .L2
927 +.L13:
928 + ldr r6, [r7, #196]
929 + vstr d26, [r6, #32]
930 + vstr d27, [r6, #40]
931 + b .L14
932 +.L42:
933 + .align 3
934 +.L41:
935 + .word 1
936 + .word 0
937 + .word 0
938 + .word 0
939 + .size CRYPTO_chacha_20_neon, .-CRYPTO_chacha_20_neon
940 + .section .rodata
941 + .align 3
942 +.LANCHOR0 = . + 0
943 +.LC0:
944 + .word 1634760805
945 + .word 857760878
946 + .word 2036477234
947 + .word 1797285236
948 + .ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2012.10-20121022 - Linaro GCC 2012.10) 4.7.3 20121001 (prerelease)"
949 + .section .note.GNU-stack,"",%progbits
950 diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
951 index 7bef015..3b6ab1d 100644
952 --- a/crypto/cryptlib.c
953 +++ b/crypto/cryptlib.c
954 @@ -661,6 +661,20 @@ const char *CRYPTO_get_lock_name(int type)
955 return(sk_OPENSSL_STRING_value(app_locks,type-CRYPTO_NUM_LOCKS));
956 }
957
958 +#if __arm__
959 +static int global_arm_neon_enabled = 0;
960 +
961 +void CRYPTO_set_NEON_capable(int on)
962 + {
963 + global_arm_neon_enabled = on != 0;
964 + }
965 +
966 +int CRYPTO_is_NEON_capable()
967 + {
968 + return global_arm_neon_enabled;
969 + }
970 +#endif
971 +
972 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
973 defined(__INTEL__) || \
974 defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
975 diff --git a/crypto/crypto.h b/crypto/crypto.h
976 index e11ac73..db339c3 100644
977 --- a/crypto/crypto.h
978 +++ b/crypto/crypto.h
979 @@ -414,6 +414,14 @@ void CRYPTO_cleanup_all_ex_data(void);
980
981 int CRYPTO_get_new_lockid(char *name);
982
983 +/* CRYPTO_set_NEON_capable enables any NEON (ARM vector) dependent code. This
984 + * function should be called before any non-initialization functions. */
985 +void CRYPTO_set_NEON_capable(int on);
986 +
987 +/* CRYPTO_is_NEON_capable returns the last value given to
988 + * CRYPTO_set_NEON_capable, or else zero if it has never been called. */
989 +int CRYPTO_is_NEON_capable();
990 +
991 int CRYPTO_num_locks(void); /* return CRYPTO_NUM_LOCKS (shared libs!) */
992 void CRYPTO_lock(int mode, int type,const char *file,int line);
993 void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
994 diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c
995 index 2e5621d..00d53bf 100644
996 --- a/crypto/poly1305/poly1305.c
997 +++ b/crypto/poly1305/poly1305.c
998 @@ -90,6 +90,17 @@ static void U32TO8_LE(unsigned char *m, uint32_t v)
999 }
1000 #endif
1001
1002 +#if __arm__
1003 +void CRYPTO_poly1305_init_neon(poly1305_state* state,
1004 + const unsigned char key[32]);
1005 +
1006 +void CRYPTO_poly1305_update_neon(poly1305_state* state,
1007 + const unsigned char *in,
1008 + size_t in_len);
1009 +
1010 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]);
1011 +#endif
1012 +
1013 static uint64_t
1014 mul32x32_64(uint32_t a, uint32_t b)
1015 {
1016 @@ -207,6 +218,14 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const unsigned char key[32])
1017 struct poly1305_state_st *state = (struct poly1305_state_st*) statep;
1018 uint32_t t0,t1,t2,t3;
1019
1020 +#if __arm__
1021 + if (CRYPTO_is_NEON_capable())
1022 + {
1023 + CRYPTO_poly1305_init_neon(statep, key);
1024 + return;
1025 + }
1026 +#endif
1027 +
1028 t0 = U8TO32_LE(key+0);
1029 t1 = U8TO32_LE(key+4);
1030 t2 = U8TO32_LE(key+8);
1031 @@ -241,6 +260,14 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const unsigned char *in,
1032 unsigned int i;
1033 struct poly1305_state_st *state = (struct poly1305_state_st*) statep;
1034
1035 +#if __arm__
1036 + if (CRYPTO_is_NEON_capable())
1037 + {
1038 + CRYPTO_poly1305_update_neon(statep, in, in_len);
1039 + return;
1040 + }
1041 +#endif
1042 +
1043 if (state->buf_used)
1044 {
1045 unsigned int todo = 16 - state->buf_used;
1046 @@ -282,6 +309,14 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, unsigned char mac[16])
1047 uint32_t g0,g1,g2,g3,g4;
1048 uint32_t b, nb;
1049
1050 +#if __arm__
1051 + if (CRYPTO_is_NEON_capable())
1052 + {
1053 + CRYPTO_poly1305_finish_neon(statep, mac);
1054 + return;
1055 + }
1056 +#endif
1057 +
1058 if (state->buf_used)
1059 poly1305_update(state, state->buf, state->buf_used);
1060
1061 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c
1062 index adcef35..34e339d 100644
1063 --- a/crypto/poly1305/poly1305_arm.c
1064 +++ b/crypto/poly1305/poly1305_arm.c
1065 @@ -51,6 +51,7 @@
1066 * SUPERCOP by D. J. Bernstein and Peter Schwabe. */
1067
1068 #include <stdint.h>
1069 +#include <string.h>
1070
1071 #include <openssl/poly1305.h>
1072
1073 @@ -202,7 +203,8 @@ struct poly1305_state_st {
1074 unsigned char key[16];
1075 };
1076
1077 -void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32])
1078 +void CRYPTO_poly1305_init_neon(poly1305_state *state,
1079 + const unsigned char key[32])
1080 {
1081 struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
1082 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
1083 @@ -227,7 +229,8 @@ void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32])
1084 st->buf_used = 0;
1085 }
1086
1087 -void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size_t in_len)
1088 +void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in,
1089 + size_t in_len)
1090 {
1091 struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
1092 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
1093 @@ -285,7 +288,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size
1094 }
1095 }
1096
1097 -void CRYPTO_poly1305_finish(poly1305_state* state, unsigned char mac[16])
1098 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16])
1099 {
1100 struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
1101 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
1102 --
1103 1.8.4.1
1104
OLDNEW
« no previous file with comments | « openssl/patches/fix_lhash_iteration.patch ('k') | openssl/patches/tls1_change_cipher_state_rewrite.patch » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698