Chromium Code Reviews

Side by Side Diff: openssl/patches/neon_runtime.patch

Issue 2072073002: Delete bundled copy of OpenSSL and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/openssl@master
Patch Set: Delete bundled copy of OpenSSL and replace with README. Created 4 years, 6 months ago
1 From aea47606333cfd3e7a09cab3e42e488c79a416af Mon Sep 17 00:00:00 2001
2 From: Adam Langley <agl@chromium.org>
3 Date: Tue, 5 Nov 2013 13:10:11 -0500
4 Subject: [PATCH 52/52] Optional NEON support on ARM.
5
6 This patch causes ARM to build both the NEON and generic versions of
7 ChaCha20 and Poly1305. The NEON code can be enabled at run-time by
8 calling CRYPTO_set_NEON_capable(1).
9 ---
10 .gitignore | 1 +
11 Configure | 2 +-
12 apps/speed.c | 5 +
13 crypto/chacha/chacha_enc.c | 18 +
14 crypto/chacha/chacha_vec.c | 7 +
15 crypto/chacha/chacha_vec_arm.s | 846 +++++++++++++++++++++++++++++++++++++++++
16 crypto/cryptlib.c | 14 +
17 crypto/crypto.h | 8 +
18 crypto/poly1305/poly1305.c | 35 ++
19 crypto/poly1305/poly1305_arm.c | 9 +-
20 10 files changed, 941 insertions(+), 4 deletions(-)
21 create mode 100644 crypto/chacha/chacha_vec_arm.s
22
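Note: the NEON path stays dormant until the embedder calls CRYPTO_set_NEON_capable(1), so a one-time CPU probe is needed at startup, before libcrypto is used from multiple threads. A minimal sketch of such a probe; the getauxval()/HWCAP_NEON route is a Linux-specific assumption, not part of this patch (Android's cpufeatures library would serve the same purpose there):

    #include <openssl/crypto.h>
    #if defined(__linux__) && defined(__arm__)
    # include <sys/auxv.h>   /* getauxval(), glibc 2.16+ */
    # include <asm/hwcap.h>  /* HWCAP_NEON */
    #endif

    /* Probe the kernel's CPU feature bits once and tell libcrypto. */
    static void enable_neon_if_available(void)
        {
    #if defined(__linux__) && defined(__arm__)
        CRYPTO_set_NEON_capable((getauxval(AT_HWCAP) & HWCAP_NEON) != 0);
    #endif
        }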
23 diff --git a/Configure b/Configure
24 index 1b95384..18b7af0 100755
25 --- a/Configure
26 +++ b/Configure
27 @@ -136,7 +136,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a
28 my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::::";
29 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::::";
30 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::::ghash-s390x.o:";
31 -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec.o:poly1305_arm.o poly1305_arm_asm.o:void";
32 +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec_arm.o chacha_enc.o:poly1305.o poly1305_arm.o poly1305_arm_asm.o:void";
33 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::32";
34 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::64";
35 my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::::";
36 diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c
37 index 54d1ca3..e4b648f 100644
38 --- a/crypto/chacha/chacha_enc.c
39 +++ b/crypto/chacha/chacha_enc.c
40 @@ -61,6 +61,7 @@
41
42 #if !defined(OPENSSL_NO_CHACHA)
43
44 +#include <openssl/crypto.h>
45 #include <openssl/chacha.h>
46
47 /* sigma contains the ChaCha constants, which happen to be an ASCII string. */
48 @@ -87,6 +88,15 @@ static const char sigma[16] = "expand 32-byte k";
49
50 typedef unsigned int uint32_t;
51
52 +#if __arm__
53 +/* Defined in chacha_vec.c */
54 +void CRYPTO_chacha_20_neon(unsigned char *out,
55 + const unsigned char *in, size_t in_len,
56 + const unsigned char key[32],
57 + const unsigned char nonce[8],
58 + size_t counter);
59 +#endif
60 +
61 /* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in
62 * |input| and writes the 64 output bytes to |output|. */
63 static void chacha_core(unsigned char output[64], const uint32_t input[16],
64 @@ -124,6 +134,16 @@ void CRYPTO_chacha_20(unsigned char *out,
65 unsigned char buf[64];
66 size_t todo, i;
67
68 +#if __arm__
69 + if (CRYPTO_is_NEON_capable() &&
70 + ((intptr_t)in & 15) == 0 &&
71 + ((intptr_t)out & 15) == 0)
72 + {
73 + CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter);
74 + return;
75 + }
76 +#endif
77 +
78 input[0] = U8TO32_LITTLE(sigma + 0);
79 input[1] = U8TO32_LITTLE(sigma + 4);
80 input[2] = U8TO32_LITTLE(sigma + 8);
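Note: the dispatch above takes the NEON path only when CRYPTO_is_NEON_capable() is true and both |in| and |out| are 16-byte aligned ((intptr_t)p & 15 is 0); otherwise even a NEON-capable CPU falls through to the generic C code. A caller that wants the fast path therefore has to align its buffers. A sketch under that assumption (key, nonce, and lengths are placeholders, not values from this patch):

    #include <stdlib.h>
    #include <string.h>
    #include <openssl/chacha.h>

    void encrypt_aligned(size_t len)
        {
        static const unsigned char key[32] = {0};  /* placeholder key */
        static const unsigned char nonce[8] = {0}; /* placeholder nonce */
        unsigned char *in, *out;

        /* 16-byte alignment satisfies the (intptr_t)p & 15 check. */
        if (posix_memalign((void **)&in, 16, len) != 0)
            return;
        if (posix_memalign((void **)&out, 16, len) != 0)
            { free(in); return; }
        memset(in, 0, len);
        CRYPTO_chacha_20(out, in, len, key, nonce, 0);
        free(in);
        free(out);
        }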
81 diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c
82 index 33b2238..1226c39 100644
83 --- a/crypto/chacha/chacha_vec.c
84 +++ b/crypto/chacha/chacha_vec.c
85 @@ -154,7 +154,14 @@ typedef unsigned vec __attribute__ ((vector_size (16)));
86 STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \
87 STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));
88
89 +#if __ARM_NEON__
90 +/* For ARM, we can't depend on NEON support, so this function is compiled with
91 + * a different name, along with the generic code, and can be enabled at
92 + * run-time. */
93 +void CRYPTO_chacha_20_neon(
94 +#else
95 void CRYPTO_chacha_20(
96 +#endif
97 unsigned char *out,
98 const unsigned char *in,
99 size_t inlen,
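Note: in the generated assembly below, vrev32.16 implements the ChaCha rotation by 16, and the vshl.i32/vsri.32 pairs implement the rotations by 12, 8, and 7 that the vector ROUND macro expands to. For reference while reading that code, one scalar quarter-round (this is the standard ChaCha20 definition, not code from this patch):

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One quarter-round on four of the sixteen ChaCha state words. */
    static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
        {
        *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
        *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
        }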
100 diff --git a/crypto/chacha/chacha_vec_arm.S b/crypto/chacha/chacha_vec_arm.S
101 new file mode 100644
102 index 0000000..24a5050
103 --- /dev/null
104 +++ b/crypto/chacha/chacha_vec_arm.S
105 @@ -0,0 +1,863 @@
106 +# This file contains a pre-compiled version of chacha_vec.c for ARM. This is
107 +# needed to support switching on NEON code at runtime. If the whole of OpenSSL
108 +# were to be compiled with the needed flags to build chacha_vec.c, then it
109 +# wouldn't be possible to run on non-NEON systems.
110 +#
111 +# This file was generated by:
112 +#
113 +# /opt/gcc-linaro-arm-linux-gnueabihf-4.7-2012.10-20121022_linux/bin/arm-linux-gnueabihf-gcc -O3 -mcpu=cortex-a8 -mfpu=neon -S chacha_vec.c -I ../../include -fpic -o chacha_vec_arm.S
114 +#
115 +# And then EABI attribute 28 was set to zero to allow linking with soft-float
116 +# code.
117 +
118 + .syntax unified
119 + .cpu cortex-a8
120 + .eabi_attribute 27, 3
121 + .eabi_attribute 28, 0
122 + .fpu neon
123 + .eabi_attribute 20, 1
124 + .eabi_attribute 21, 1
125 + .eabi_attribute 23, 3
126 + .eabi_attribute 24, 1
127 + .eabi_attribute 25, 1
128 + .eabi_attribute 26, 2
129 + .eabi_attribute 30, 2
130 + .eabi_attribute 34, 1
131 + .eabi_attribute 18, 4
132 + .thumb
133 + .file "chacha_vec.c"
134 + .text
135 + .align 2
136 + .global CRYPTO_chacha_20_neon
137 + .thumb
138 + .thumb_func
139 + .type CRYPTO_chacha_20_neon, %function
140 +CRYPTO_chacha_20_neon:
141 + @ args = 8, pretend = 0, frame = 304
142 + @ frame_needed = 1, uses_anonymous_args = 0
143 + @ link register save eliminated.
144 + push {r4, r5, r6, r7, r8, r9, sl, fp}
145 + fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
146 + sub sp, sp, #304
147 + add r7, sp, #0
148 + movw ip, #43691
149 + movt ip, 43690
150 + str r2, [r7, #196]
151 + sub sp, sp, #96
152 + ldr r4, [r7, #196]
153 + ldr r6, [r7, #400]
154 + ldr r2, .L38+16
155 + umull r4, ip, ip, r4
156 + ldr r6, [r6, #0]
157 + ldr r8, [r7, #400]
158 +.LPIC24:
159 + add r2, pc
160 + add r4, sp, #15
161 + str r3, [r7, #244]
162 + str r6, [r7, #176]
163 + bic r4, r4, #15
164 + str r0, [r7, #188]
165 + str r4, [r7, #200]
166 + lsrs ip, ip, #7
167 + str r1, [r7, #184]
168 + ldmia r2, {r0, r1, r2, r3}
169 + ldr r4, [r8, #4]
170 + ldr r5, [r7, #244]
171 + vld1.64 {d24-d25}, [r5:64]
172 + vldr d26, [r5, #16]
173 + vldr d27, [r5, #24]
174 + ldr r9, [r7, #200]
175 + ldr r8, [r7, #404]
176 + ldr r5, [r7, #176]
177 + add r6, r9, #64
178 + str r4, [r7, #300]
179 + mov r4, #0
180 + str r8, [r7, #288]
181 + str r5, [r7, #296]
182 + str r4, [r7, #292]
183 + stmia r6, {r0, r1, r2, r3}
184 + vldr d22, [r9, #64]
185 + vldr d23, [r9, #72]
186 + vldr d20, [r7, #288]
187 + vldr d21, [r7, #296]
188 + str ip, [r7, #192]
189 + beq .L20
190 + lsl r6, ip, #1
191 + ldr r1, [r9, #68]
192 + add r3, r6, ip
193 + str r6, [r7, #180]
194 + ldr r2, [r9, #72]
195 + add r8, r8, #2
196 + ldr r5, [r9, #76]
197 + vldr d18, .L38
198 + vldr d19, .L38+8
199 + str r4, [r7, #240]
200 + ldr r6, [r7, #184]
201 + ldr r4, [r7, #188]
202 + str r0, [r7, #224]
203 + str r1, [r7, #220]
204 + str r8, [r7, #208]
205 + str r2, [r7, #216]
206 + str r3, [r7, #204]
207 + str r5, [r7, #212]
208 + str r6, [r7, #252]
209 + str r4, [r7, #248]
210 +.L4:
211 + ldr r2, [r7, #244]
212 + add r9, r7, #216
213 + ldr r3, [r7, #244]
214 + vadd.i32 q8, q10, q9
215 + ldr r6, [r7, #208]
216 + vmov q15, q13 @ v4si
217 + ldr r5, [r7, #240]
218 + vmov q3, q12 @ v4si
219 + ldr r4, [r7, #244]
220 + vmov q2, q11 @ v4si
221 + adds r5, r5, r6
222 + ldr r2, [r2, #8]
223 + ldr r6, [r7, #400]
224 + vmov q5, q10 @ v4si
225 + ldr r3, [r3, #12]
226 + vmov q1, q13 @ v4si
227 + ldr r0, [r7, #244]
228 + vmov q0, q12 @ v4si
229 + ldr r1, [r7, #244]
230 + vmov q4, q11 @ v4si
231 + ldmia r9, {r9, sl, fp}
232 + str r5, [r7, #228]
233 + ldr r5, [r4, #24]
234 + ldr r0, [r0, #0]
235 + ldr r1, [r1, #4]
236 + str r2, [r7, #264]
237 + str r3, [r7, #236]
238 + ldr r2, [r6, #4]
239 + ldr r3, [r4, #28]
240 + str r5, [r7, #280]
241 + ldr r5, [r6, #0]
242 + movs r6, #0
243 + ldr ip, [r7, #228]
244 + ldr r8, [r7, #212]
245 + str r0, [r7, #232]
246 + str r1, [r7, #268]
247 + ldr r0, [r4, #16]
248 + ldr r1, [r4, #20]
249 + movs r4, #10
250 + str r2, [r7, #24]
251 + str r3, [r7, #284]
252 + str r4, [r7, #256]
253 + ldr r2, [r7, #264]
254 + str r9, [r7, #276]
255 + mov r9, r6
256 + ldr r6, [r7, #280]
257 + str r8, [r7, #260]
258 + mov r8, sl
259 + str r1, [r7, #272]
260 + mov sl, ip
261 + str r6, [r7, #264]
262 + mov r6, r5
263 + ldr r3, [r7, #236]
264 + mov r5, r0
265 + ldr ip, [r7, #24]
266 + ldr r1, [r7, #268]
267 + ldr r0, [r7, #232]
268 + b .L39
269 +.L40:
270 + .align 3
271 +.L38:
272 + .word 1
273 + .word 0
274 + .word 0
275 + .word 0
276 + .word .LANCHOR0-(.LPIC24+4)
277 +.L39:
278 +.L3:
279 + vadd.i32 q4, q4, q0
280 + add r8, r8, r1
281 + vadd.i32 q2, q2, q3
282 + str r8, [r7, #268]
283 + veor q5, q5, q4
284 + ldr r8, [r7, #276]
285 + veor q8, q8, q2
286 + add fp, fp, r0
287 + str fp, [r7, #280]
288 + add r8, r8, r2
289 + vrev32.16 q5, q5
290 + str r8, [r7, #276]
291 + vrev32.16 q8, q8
292 + vadd.i32 q1, q1, q5
293 + vadd.i32 q15, q15, q8
294 + ldr r8, [r7, #280]
295 + veor q0, q1, q0
296 + ldr r4, [r7, #260]
297 + veor q3, q15, q3
298 + eor sl, sl, r8
299 + ldr r8, [r7, #276]
300 + add fp, r4, r3
301 + vshl.i32 q7, q0, #12
302 + ldr r4, [r7, #268]
303 + vshl.i32 q6, q3, #12
304 + eor r6, r6, r8
305 + eor r9, r9, r4
306 + ldr r4, [r7, #272]
307 + vsri.32 q7, q0, #20
308 + ror r8, r6, #16
309 + ldr r6, [r7, #264]
310 + eor ip, ip, fp
311 + vsri.32 q6, q3, #20
312 + ror sl, sl, #16
313 + ror r9, r9, #16
314 + add r5, r5, sl
315 + vadd.i32 q4, q4, q7
316 + str r5, [r7, #236]
317 + vadd.i32 q2, q2, q6
318 + add r5, r4, r9
319 + add r4, r6, r8
320 + ldr r6, [r7, #284]
321 + ror ip, ip, #16
322 + veor q5, q4, q5
323 + veor q8, q2, q8
324 + add r6, r6, ip
325 + str r6, [r7, #264]
326 + eors r1, r1, r5
327 + ldr r6, [r7, #236]
328 + vshl.i32 q3, q5, #8
329 + vshl.i32 q14, q8, #8
330 + eors r2, r2, r4
331 + eors r0, r0, r6
332 + ldr r6, [r7, #264]
333 + vsri.32 q3, q5, #24
334 + ror r1, r1, #20
335 + eors r3, r3, r6
336 + ldr r6, [r7, #280]
337 + ror r0, r0, #20
338 + vsri.32 q14, q8, #24
339 + adds r6, r0, r6
340 + str r6, [r7, #284]
341 + ldr r6, [r7, #268]
342 + vadd.i32 q1, q1, q3
343 + vadd.i32 q15, q15, q14
344 + ror r2, r2, #20
345 + adds r6, r1, r6
346 + str r6, [r7, #260]
347 + ldr r6, [r7, #276]
348 + veor q6, q15, q6
349 + veor q7, q1, q7
350 + ror r3, r3, #20
351 + adds r6, r2, r6
352 + str r6, [r7, #280]
353 + ldr r6, [r7, #284]
354 + vshl.i32 q0, q6, #7
355 + vshl.i32 q5, q7, #7
356 + add fp, r3, fp
357 + eor sl, r6, sl
358 + ldr r6, [r7, #260]
359 + eor ip, fp, ip
360 + vsri.32 q0, q6, #25
361 + eor r9, r6, r9
362 + ldr r6, [r7, #280]
363 + ror sl, sl, #24
364 + vsri.32 q5, q7, #25
365 + eor r8, r6, r8
366 + ldr r6, [r7, #236]
367 + ror r9, r9, #24
368 + ror ip, ip, #24
369 + add r6, sl, r6
370 + str r6, [r7, #276]
371 + ldr r6, [r7, #264]
372 + add r5, r9, r5
373 + str r5, [r7, #272]
374 + vext.32 q5, q5, q5, #1
375 + add r5, ip, r6
376 + ldr r6, [r7, #276]
377 + vext.32 q0, q0, q0, #1
378 + vadd.i32 q4, q4, q5
379 + eors r0, r0, r6
380 + ldr r6, [r7, #272]
381 + vadd.i32 q2, q2, q0
382 + vext.32 q3, q3, q3, #3
383 + ror r8, r8, #24
384 + eors r1, r1, r6
385 + vext.32 q14, q14, q14, #3
386 + add r4, r8, r4
387 + ldr r6, [r7, #284]
388 + veor q3, q4, q3
389 + veor q14, q2, q14
390 + eors r2, r2, r4
391 + ror r1, r1, #25
392 + vext.32 q1, q1, q1, #2
393 + adds r6, r1, r6
394 + str r6, [r7, #284]
395 + vext.32 q15, q15, q15, #2
396 + ldr r6, [r7, #260]
397 + eors r3, r3, r5
398 + ror r2, r2, #25
399 + vrev32.16 q8, q14
400 + adds r6, r2, r6
401 + vrev32.16 q3, q3
402 + str r6, [r7, #268]
403 + vadd.i32 q1, q1, q3
404 + ldr r6, [r7, #280]
405 + vadd.i32 q15, q15, q8
406 + ror r3, r3, #25
407 + veor q5, q1, q5
408 + adds r6, r3, r6
409 + veor q0, q15, q0
410 + str r6, [r7, #264]
411 + ldr r6, [r7, #268]
412 + ror r0, r0, #25
413 + add fp, r0, fp
414 + vshl.i32 q6, q5, #12
415 + eor sl, r6, sl
416 + ldr r6, [r7, #284]
417 + vshl.i32 q14, q0, #12
418 + eor r8, fp, r8
419 + eor ip, r6, ip
420 + ldr r6, [r7, #264]
421 + vsri.32 q6, q5, #20
422 + ror sl, sl, #16
423 + eor r9, r6, r9
424 + ror r6, r8, #16
425 + vsri.32 q14, q0, #20
426 + ldr r8, [r7, #272]
427 + ror ip, ip, #16
428 + add r5, sl, r5
429 + add r8, r6, r8
430 + add r4, ip, r4
431 + str r4, [r7, #236]
432 + eor r0, r8, r0
433 + str r5, [r7, #280]
434 + vadd.i32 q4, q4, q6
435 + ldr r5, [r7, #236]
436 + vadd.i32 q2, q2, q14
437 + ldr r4, [r7, #276]
438 + ror r0, r0, #20
439 + veor q3, q4, q3
440 + eors r1, r1, r5
441 + veor q0, q2, q8
442 + str r8, [r7, #272]
443 + str r0, [r7, #24]
444 + add fp, r0, fp
445 + ldr r8, [r7, #280]
446 + ror r9, r9, #16
447 + ldr r0, [r7, #284]
448 + add r4, r9, r4
449 + str fp, [r7, #260]
450 + ror r1, r1, #20
451 + add fp, r1, r0
452 + eor r2, r8, r2
453 + ldr r0, [r7, #260]
454 + eors r3, r3, r4
455 + vshl.i32 q5, q3, #8
456 + str r4, [r7, #232]
457 + vshl.i32 q8, q0, #8
458 + ldr r4, [r7, #268]
459 + ldr r5, [r7, #264]
460 + ror r2, r2, #20
461 + ror r3, r3, #20
462 + eors r6, r6, r0
463 + adds r5, r3, r5
464 + add r8, r2, r4
465 + vsri.32 q5, q3, #24
466 + ldr r4, [r7, #272]
467 + eor r9, r5, r9
468 + eor ip, fp, ip
469 + vsri.32 q8, q0, #24
470 + eor sl, r8, sl
471 + ror r6, r6, #24
472 + ldr r0, [r7, #280]
473 + str r5, [r7, #276]
474 + adds r4, r6, r4
475 + ldr r5, [r7, #236]
476 + vadd.i32 q1, q1, q5
477 + str r4, [r7, #272]
478 + vadd.i32 q15, q15, q8
479 + ldr r4, [r7, #232]
480 + ror ip, ip, #24
481 + ror sl, sl, #24
482 + ror r9, r9, #24
483 + add r5, ip, r5
484 + add r0, sl, r0
485 + str r5, [r7, #264]
486 + add r5, r9, r4
487 + str r0, [r7, #284]
488 + veor q6, q1, q6
489 + ldr r4, [r7, #24]
490 + veor q14, q15, q14
491 + ldr r0, [r7, #272]
492 + eors r3, r3, r5
493 + vshl.i32 q0, q6, #7
494 + vext.32 q1, q1, q1, #2
495 + eors r0, r0, r4
496 + ldr r4, [r7, #284]
497 + str r0, [r7, #280]
498 + vshl.i32 q3, q14, #7
499 + eors r2, r2, r4
500 + ldr r4, [r7, #280]
501 + ldr r0, [r7, #264]
502 + vsri.32 q0, q6, #25
503 + ror r2, r2, #25
504 + ror r3, r3, #25
505 + eors r1, r1, r0
506 + vsri.32 q3, q14, #25
507 + ror r0, r4, #25
508 + ldr r4, [r7, #256]
509 + ror r1, r1, #25
510 + vext.32 q5, q5, q5, #1
511 + subs r4, r4, #1
512 + str r4, [r7, #256]
513 + vext.32 q15, q15, q15, #2
514 + vext.32 q8, q8, q8, #1
515 + vext.32 q0, q0, q0, #3
516 + vext.32 q3, q3, q3, #3
517 + bne .L3
518 + ldr r4, [r7, #264]
519 + vadd.i32 q14, q10, q9
520 + str r2, [r7, #264]
521 + vadd.i32 q10, q10, q5
522 + ldr r2, [r7, #252]
523 + vld1.64 {d12-d13}, [r2:64]
524 + ldr r2, [r7, #220]
525 + vadd.i32 q4, q11, q4
526 + str ip, [r7, #24]
527 + mov ip, sl
528 + mov sl, r8
529 + ldr r8, [r7, #260]
530 + add sl, sl, r2
531 + ldr r2, [r7, #212]
532 + str r4, [r7, #280]
533 + vadd.i32 q0, q12, q0
534 + ldr r4, [r7, #224]
535 + add r8, r8, r2
536 + ldr r2, [r7, #240]
537 + vadd.i32 q1, q13, q1
538 + str r0, [r7, #232]
539 + add fp, fp, r4
540 + mov r0, r5
541 + ldr r4, [r7, #216]
542 + mov r5, r6
543 + mov r6, r9
544 + ldr r9, [r7, #276]
545 + adds r2, r2, #3
546 + str r2, [r7, #240]
547 + vadd.i32 q2, q11, q2
548 + ldr r2, [r7, #252]
549 + add r9, r9, r4
550 + vadd.i32 q3, q12, q3
551 + ldr r4, [r7, #228]
552 + vadd.i32 q15, q13, q15
553 + str r1, [r7, #268]
554 + vadd.i32 q8, q14, q8
555 + str r3, [r7, #236]
556 + veor q4, q4, q6
557 + ldr r3, [r7, #284]
558 + ldr r1, [r7, #272]
559 + add ip, r4, ip
560 + ldr r4, [r7, #248]
561 + vst1.64 {d8-d9}, [r4:64]
562 + vldr d8, [r2, #16]
563 + vldr d9, [r2, #24]
564 + veor q0, q0, q4
565 + vstr d0, [r4, #16]
566 + vstr d1, [r4, #24]
567 + vldr d0, [r2, #32]
568 + vldr d1, [r2, #40]
569 + veor q1, q1, q0
570 + vstr d2, [r4, #32]
571 + vstr d3, [r4, #40]
572 + vldr d2, [r2, #48]
573 + vldr d3, [r2, #56]
574 + veor q10, q10, q1
575 + vstr d20, [r4, #48]
576 + vstr d21, [r4, #56]
577 + vldr d8, [r2, #64]
578 + vldr d9, [r2, #72]
579 + veor q2, q2, q4
580 + vstr d4, [r4, #64]
581 + vstr d5, [r4, #72]
582 + vldr d10, [r2, #80]
583 + vldr d11, [r2, #88]
584 + veor q3, q3, q5
585 + vstr d6, [r4, #80]
586 + vstr d7, [r4, #88]
587 + vldr d12, [r2, #96]
588 + vldr d13, [r2, #104]
589 + veor q15, q15, q6
590 + vstr d30, [r4, #96]
591 + vstr d31, [r4, #104]
592 + vldr d20, [r2, #112]
593 + vldr d21, [r2, #120]
594 + veor q8, q8, q10
595 + vstr d16, [r4, #112]
596 + vstr d17, [r4, #120]
597 + ldr r4, [r2, #128]
598 + ldr r2, [r7, #248]
599 + vadd.i32 q10, q14, q9
600 + eor r4, fp, r4
601 + vadd.i32 q10, q10, q9
602 + str r4, [r2, #128]
603 + ldr r4, [r7, #252]
604 + ldr r2, [r4, #132]
605 + eor r2, sl, r2
606 + ldr sl, [r7, #248]
607 + str r2, [sl, #132]
608 + ldr r2, [r4, #136]
609 + eor r2, r9, r2
610 + str r2, [sl, #136]
611 + ldr r2, [r4, #140]
612 + eor r2, r8, r2
613 + str r2, [sl, #140]
614 + ldr r2, [r7, #244]
615 + ldr r4, [r4, #144]
616 + ldr r2, [r2, #0]
617 + str r4, [r7, #44]
618 + ldr r4, [r7, #232]
619 + add r8, r4, r2
620 + ldr r2, [r7, #44]
621 + ldr r4, [r7, #244]
622 + eor r8, r8, r2
623 + ldr r2, [r7, #252]
624 + str r8, [sl, #144]
625 + ldr r4, [r4, #4]
626 + ldr r2, [r2, #148]
627 + str r2, [r7, #40]
628 + ldr r2, [r7, #268]
629 + add r8, r2, r4
630 + ldr r4, [r7, #40]
631 + ldr r2, [r7, #244]
632 + eor r8, r8, r4
633 + ldr r4, [r7, #252]
634 + str r8, [sl, #148]
635 + ldr r2, [r2, #8]
636 + ldr r4, [r4, #152]
637 + str r4, [r7, #36]
638 + ldr r4, [r7, #264]
639 + add r8, r4, r2
640 + ldr r2, [r7, #36]
641 + eor r8, r8, r2
642 + str r8, [sl, #152]
643 + ldr r2, [r7, #252]
644 + ldr r4, [r7, #244]
645 + ldr r2, [r2, #156]
646 + ldr r4, [r4, #12]
647 + str r2, [r7, #32]
648 + ldr r2, [r7, #236]
649 + add r8, r2, r4
650 + ldr r4, [r7, #32]
651 + ldr r2, [r7, #252]
652 + eor r8, r8, r4
653 + str r8, [sl, #156]
654 + ldr r8, [r7, #244]
655 + ldr r2, [r2, #160]
656 + ldr r4, [r8, #16]
657 + adds r0, r0, r4
658 + ldr r4, [r7, #252]
659 + eors r0, r0, r2
660 + str r0, [sl, #160]
661 + ldr r0, [r8, #20]
662 + ldr r2, [r4, #164]
663 + adds r1, r1, r0
664 + ldr r0, [r7, #280]
665 + eors r1, r1, r2
666 + str r1, [sl, #164]
667 + ldr r2, [r8, #24]
668 + ldr r1, [r4, #168]
669 + adds r2, r0, r2
670 + eors r2, r2, r1
671 + str r2, [sl, #168]
672 + ldr r1, [r8, #28]
673 + ldr r2, [r4, #172]
674 + adds r3, r3, r1
675 + eors r3, r3, r2
676 + str r3, [sl, #172]
677 + ldr r3, [r4, #176]
678 + eor r3, ip, r3
679 + str r3, [sl, #176]
680 + ldr r3, [r4, #180]
681 + ldr r4, [r7, #400]
682 + eors r6, r6, r3
683 + str r6, [sl, #180]
684 + ldr r6, [r7, #252]
685 + ldr r2, [r4, #0]
686 + ldr r3, [r6, #184]
687 + adds r5, r5, r2
688 + eors r5, r5, r3
689 + str r5, [sl, #184]
690 + ldr r2, [r6, #188]
691 + adds r6, r6, #192
692 + ldr r3, [r4, #4]
693 + str r6, [r7, #252]
694 + ldr r0, [r7, #24]
695 + ldr r1, [r7, #240]
696 + adds r4, r0, r3
697 + eors r4, r4, r2
698 + ldr r2, [r7, #204]
699 + str r4, [sl, #188]
700 + add sl, sl, #192
701 + cmp r1, r2
702 + str sl, [r7, #248]
703 + bne .L4
704 + ldr r4, [r7, #192]
705 + ldr r3, [r7, #180]
706 + ldr r6, [r7, #188]
707 + adds r5, r3, r4
708 + ldr r8, [r7, #184]
709 + lsls r5, r5, #6
710 + adds r4, r6, r5
711 + add r5, r8, r5
712 +.L2:
713 + ldr r9, [r7, #196]
714 + movw r3, #43691
715 + movt r3, 43690
716 + ldr sl, [r7, #196]
717 + umull r9, r3, r3, r9
718 + lsrs r3, r3, #7
719 + add r3, r3, r3, lsl #1
720 + sub r3, sl, r3, lsl #6
721 + lsrs r6, r3, #6
722 + beq .L5
723 + add r1, r5, #16
724 + add r2, r4, #16
725 + mov r0, r6
726 + vldr d30, .L41
727 + vldr d31, .L41+8
728 +.L6:
729 + vmov q8, q10 @ v4si
730 + movs r3, #10
731 + vmov q1, q13 @ v4si
732 + vmov q14, q12 @ v4si
733 + vmov q3, q11 @ v4si
734 +.L7:
735 + vadd.i32 q3, q3, q14
736 + subs r3, r3, #1
737 + veor q2, q8, q3
738 + vrev32.16 q2, q2
739 + vadd.i32 q8, q1, q2
740 + veor q9, q8, q14
741 + vshl.i32 q14, q9, #12
742 + vsri.32 q14, q9, #20
743 + vadd.i32 q3, q3, q14
744 + veor q2, q3, q2
745 + vshl.i32 q9, q2, #8
746 + vsri.32 q9, q2, #24
747 + vadd.i32 q8, q8, q9
748 + vext.32 q9, q9, q9, #3
749 + veor q14, q8, q14
750 + vext.32 q1, q8, q8, #2
751 + vshl.i32 q8, q14, #7
752 + vsri.32 q8, q14, #25
753 + vext.32 q8, q8, q8, #1
754 + vadd.i32 q3, q3, q8
755 + veor q2, q3, q9
756 + vrev32.16 q2, q2
757 + vadd.i32 q9, q1, q2
758 + veor q8, q9, q8
759 + vshl.i32 q14, q8, #12
760 + vsri.32 q14, q8, #20
761 + vadd.i32 q3, q3, q14
762 + veor q2, q3, q2
763 + vshl.i32 q8, q2, #8
764 + vsri.32 q8, q2, #24
765 + vadd.i32 q9, q9, q8
766 + vext.32 q8, q8, q8, #1
767 + veor q14, q9, q14
768 + vext.32 q1, q9, q9, #2
769 + vshl.i32 q9, q14, #7
770 + vsri.32 q9, q14, #25
771 + vext.32 q14, q9, q9, #3
772 + bne .L7
773 + vadd.i32 q8, q10, q8
774 + subs r0, r0, #1
775 + vadd.i32 q3, q11, q3
776 + vldr d0, [r1, #-16]
777 + vldr d1, [r1, #-8]
778 + vadd.i32 q14, q12, q14
779 + vadd.i32 q1, q13, q1
780 + veor q3, q3, q0
781 + vstr d6, [r2, #-16]
782 + vstr d7, [r2, #-8]
783 + vadd.i32 q10, q10, q15
784 + vld1.64 {d8-d9}, [r1:64]
785 + veor q14, q14, q4
786 + vst1.64 {d28-d29}, [r2:64]
787 + vldr d10, [r1, #16]
788 + vldr d11, [r1, #24]
789 + veor q1, q1, q5
790 + vstr d2, [r2, #16]
791 + vstr d3, [r2, #24]
792 + vldr d18, [r1, #32]
793 + vldr d19, [r1, #40]
794 + add r1, r1, #64
795 + veor q8, q8, q9
796 + vstr d16, [r2, #32]
797 + vstr d17, [r2, #40]
798 + add r2, r2, #64
799 + bne .L6
800 + lsls r6, r6, #6
801 + adds r4, r4, r6
802 + adds r5, r5, r6
803 +.L5:
804 + ldr r6, [r7, #196]
805 + ands ip, r6, #63
806 + beq .L1
807 + vmov q8, q10 @ v4si
808 + movs r3, #10
809 + vmov q14, q13 @ v4si
810 + vmov q9, q12 @ v4si
811 + vmov q15, q11 @ v4si
812 +.L10:
813 + vadd.i32 q15, q15, q9
814 + subs r3, r3, #1
815 + veor q8, q8, q15
816 + vrev32.16 q8, q8
817 + vadd.i32 q3, q14, q8
818 + veor q9, q3, q9
819 + vshl.i32 q14, q9, #12
820 + vsri.32 q14, q9, #20
821 + vadd.i32 q15, q15, q14
822 + veor q9, q15, q8
823 + vshl.i32 q8, q9, #8
824 + vsri.32 q8, q9, #24
825 + vadd.i32 q9, q3, q8
826 + vext.32 q8, q8, q8, #3
827 + veor q2, q9, q14
828 + vext.32 q14, q9, q9, #2
829 + vshl.i32 q9, q2, #7
830 + vsri.32 q9, q2, #25
831 + vext.32 q9, q9, q9, #1
832 + vadd.i32 q15, q15, q9
833 + veor q3, q15, q8
834 + vrev32.16 q3, q3
835 + vadd.i32 q14, q14, q3
836 + veor q8, q14, q9
837 + vshl.i32 q9, q8, #12
838 + vsri.32 q9, q8, #20
839 + vadd.i32 q15, q15, q9
840 + veor q3, q15, q3
841 + vshl.i32 q8, q3, #8
842 + vsri.32 q8, q3, #24
843 + vadd.i32 q14, q14, q8
844 + vext.32 q8, q8, q8, #1
845 + veor q3, q14, q9
846 + vext.32 q14, q14, q14, #2
847 + vshl.i32 q9, q3, #7
848 + vsri.32 q9, q3, #25
849 + vext.32 q9, q9, q9, #3
850 + bne .L10
851 + cmp ip, #15
852 + vadd.i32 q11, q11, q15
853 + bhi .L37
854 + ldr r9, [r7, #200]
855 + vst1.64 {d22-d23}, [r9:128]
856 +.L14:
857 + ldr sl, [r7, #196]
858 + and r3, sl, #48
859 + cmp ip, r3
860 + bls .L1
861 + adds r0, r5, r3
862 + adds r1, r4, r3
863 + add r2, r0, #16
864 + add r6, r1, #16
865 + cmp r1, r2
866 + it cc
867 + cmpcc r0, r6
868 + rsb r9, r3, ip
869 + ite cc
870 + movcc r2, #0
871 + movcs r2, #1
872 + cmp r9, #15
873 + ite ls
874 + movls r2, #0
875 + andhi r2, r2, #1
876 + lsr r8, r9, #4
877 + eor r2, r2, #1
878 + cmp r8, #0
879 + it eq
880 + orreq r2, r2, #1
881 + lsl sl, r8, #4
882 + cbnz r2, .L35
883 + ldr fp, [r7, #200]
884 + add r6, fp, r3
885 +.L17:
886 + vld1.8 {q8}, [r0]!
887 + adds r2, r2, #1
888 + cmp r8, r2
889 + vld1.8 {q9}, [r6]!
890 + veor q8, q9, q8
891 + vst1.8 {q8}, [r1]!
892 + bhi .L17
893 + cmp r9, sl
894 + add r3, r3, sl
895 + beq .L1
896 +.L35:
897 + ldr r0, [r7, #200]
898 +.L25:
899 + ldrb r2, [r5, r3] @ zero_extendqisi2
900 + ldrb r1, [r3, r0] @ zero_extendqisi2
901 + eors r2, r2, r1
902 + strb r2, [r4, r3]
903 + adds r3, r3, #1
904 + cmp ip, r3
905 + bhi .L25
906 +.L1:
907 + add r7, r7, #304
908 + mov sp, r7
909 + fldmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
910 + pop {r4, r5, r6, r7, r8, r9, sl, fp}
911 + bx lr
912 +.L37:
913 + cmp ip, #31
914 + vld1.64 {d0-d1}, [r5:64]
915 + vadd.i32 q9, q12, q9
916 + veor q11, q11, q0
917 + vst1.64 {d22-d23}, [r4:64]
918 + bls .L12
919 + cmp ip, #47
920 + vldr d2, [r5, #16]
921 + vldr d3, [r5, #24]
922 + vadd.i32 q13, q13, q14
923 + veor q9, q9, q1
924 + vstr d18, [r4, #16]
925 + vstr d19, [r4, #24]
926 + bls .L13
927 + vadd.i32 q8, q8, q10
928 + vldr d0, [r5, #32]
929 + vldr d1, [r5, #40]
930 + ldr r6, [r7, #200]
931 + vstr d16, [r6, #48]
932 + vstr d17, [r6, #56]
933 + veor q8, q13, q0
934 + vstr d16, [r4, #32]
935 + vstr d17, [r4, #40]
936 + b .L14
937 +.L12:
938 + ldr r8, [r7, #200]
939 + vstr d18, [r8, #16]
940 + vstr d19, [r8, #24]
941 + b .L14
942 +.L20:
943 + ldr r5, [r7, #184]
944 + ldr r4, [r7, #188]
945 + b .L2
946 +.L13:
947 + ldr r6, [r7, #200]
948 + vstr d26, [r6, #32]
949 + vstr d27, [r6, #40]
950 + b .L14
951 +.L42:
952 + .align 3
953 +.L41:
954 + .word 1
955 + .word 0
956 + .word 0
957 + .word 0
958 + .size CRYPTO_chacha_20_neon, .-CRYPTO_chacha_20_neon
959 + .section .rodata
960 + .align 3
961 +.LANCHOR0 = . + 0
962 +.LC0:
963 + .word 1634760805
964 + .word 857760878
965 + .word 2036477234
966 + .word 1797285236
967 + .ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2012.10-20121022 - Linaro GCC 2012.10) 4.7.3 20121001 (prerelease)"
968 + .section .note.GNU-stack,"",%progbits
969 diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
970 index 7bef015..3b6ab1d 100644
971 --- a/crypto/cryptlib.c
972 +++ b/crypto/cryptlib.c
973 @@ -661,6 +661,20 @@ const char *CRYPTO_get_lock_name(int type)
974 	return(sk_OPENSSL_STRING_value(app_locks,type-CRYPTO_NUM_LOCKS));
975 }
976
977 +#if __arm__
978 +static int global_arm_neon_enabled = 0;
979 +
980 +void CRYPTO_set_NEON_capable(int on)
981 + {
982 + global_arm_neon_enabled = on != 0;
983 + }
984 +
985 +int CRYPTO_is_NEON_capable(void)
986 + {
987 + return global_arm_neon_enabled;
988 + }
989 +#endif
990 +
991 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
992 defined(__INTEL__) || \
993 	defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
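Note: the setter stores a normalized value (on != 0) and the flag defaults to zero, so the generic code runs unless an embedder explicitly opts in. A small self-check of those semantics, using only the two functions added above:

    #include <assert.h>
    #include <openssl/crypto.h>

    void neon_flag_selfcheck(void)
        {
        assert(!CRYPTO_is_NEON_capable());     /* off by default */
        CRYPTO_set_NEON_capable(2);            /* any non-zero input... */
        assert(CRYPTO_is_NEON_capable() == 1); /* ...is stored as 1 */
        CRYPTO_set_NEON_capable(0);
        assert(!CRYPTO_is_NEON_capable());
        }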
994 diff --git a/crypto/crypto.h b/crypto/crypto.h
995 index e11ac73..db339c3 100644
996 --- a/crypto/crypto.h
997 +++ b/crypto/crypto.h
998 @@ -414,6 +414,14 @@ void CRYPTO_cleanup_all_ex_data(void);
999
1000 int CRYPTO_get_new_lockid(char *name);
1001
1002 +/* CRYPTO_set_NEON_capable enables any NEON (ARM vector) dependent code. This
1003 + * code should be called before any non-init functions. */
1004 +void CRYPTO_set_NEON_capable(int on);
1005 +
1006 +/* CRYPTO_is_NEON_capable returns the last value given to
1007 + * CRYPTO_set_NEON_capable, or else zero if it has never been called. */
1008 +int CRYPTO_is_NEON_capable(void);
1009 +
1010 int CRYPTO_num_locks(void); /* return CRYPTO_NUM_LOCKS (shared libs!) */
1011 void CRYPTO_lock(int mode, int type,const char *file,int line);
1012 void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
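Note: "called before any non-init functions" matters because the flag is read without synchronization; it should be set once during single-threaded startup and then left alone. A sketch of the intended call order, where cpu_has_neon() is a hypothetical stand-in for a platform probe like the one sketched earlier:

    #include <openssl/crypto.h>

    extern int cpu_has_neon(void);  /* hypothetical platform probe */

    int main(void)
        {
        /* Single-threaded startup: decide once, before any
         * ChaCha20/Poly1305 call and before spawning threads. */
        CRYPTO_set_NEON_capable(cpu_has_neon());
        /* ... start threads, do crypto ... */
        return 0;
        }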
1013 diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c
1014 index 2e5621d..00d53bf 100644
1015 --- a/crypto/poly1305/poly1305.c
1016 +++ b/crypto/poly1305/poly1305.c
1017 @@ -90,6 +90,17 @@ static void U32TO8_LE(unsigned char *m, uint32_t v)
1018 }
1019 #endif
1020
1021 +#if __arm__
1022 +void CRYPTO_poly1305_init_neon(poly1305_state* state,
1023 + const unsigned char key[32]);
1024 +
1025 +void CRYPTO_poly1305_update_neon(poly1305_state* state,
1026 + const unsigned char *in,
1027 + size_t in_len);
1028 +
1029 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]);
1030 +#endif
1031 +
1032 static uint64_t
1033 mul32x32_64(uint32_t a, uint32_t b)
1034 {
1035 @@ -207,6 +218,14 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const unsigned char key[32])
1036 struct poly1305_state_st *state = (struct poly1305_state_st*) statep;
1037 uint32_t t0,t1,t2,t3;
1038
1039 +#if __arm__
1040 + if (CRYPTO_is_NEON_capable())
1041 + {
1042 + CRYPTO_poly1305_init_neon(statep, key);
1043 + return;
1044 + }
1045 +#endif
1046 +
1047 t0 = U8TO32_LE(key+0);
1048 t1 = U8TO32_LE(key+4);
1049 t2 = U8TO32_LE(key+8);
1050 @@ -241,6 +260,14 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const unsigned char *in,
1051 unsigned int i;
1052 struct poly1305_state_st *state = (struct poly1305_state_st*) statep;
1053
1054 +#if __arm__
1055 + if (CRYPTO_is_NEON_capable())
1056 + {
1057 + CRYPTO_poly1305_update_neon(statep, in, in_len);
1058 + return;
1059 + }
1060 +#endif
1061 +
1062 if (state->buf_used)
1063 {
1064 unsigned int todo = 16 - state->buf_used;
1065 @@ -282,6 +309,14 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, unsigned char mac[16])
1066 uint32_t g0,g1,g2,g3,g4;
1067 uint32_t b, nb;
1068
1069 +#if __arm__
1070 + if (CRYPTO_is_NEON_capable())
1071 + {
1072 + CRYPTO_poly1305_finish_neon(statep, mac);
1073 + return;
1074 + }
1075 +#endif
1076 +
1077 if (state->buf_used)
1078 poly1305_update(state, state->buf, state->buf_used);
1079
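Note: the NEON entry points mirror the generic init/update/finish triple, so the public API is unchanged and the dispatch stays invisible to callers. For reference, a one-shot MAC through the public interface (assumes the poly1305_state typedef from openssl/poly1305.h in this tree):

    #include <stddef.h>
    #include <openssl/poly1305.h>

    void poly1305_mac(unsigned char mac[16], const unsigned char *msg,
                      size_t msg_len, const unsigned char key[32])
        {
        poly1305_state st;
        CRYPTO_poly1305_init(&st, key);    /* may route to ..._init_neon */
        CRYPTO_poly1305_update(&st, msg, msg_len);
        CRYPTO_poly1305_finish(&st, mac);  /* writes the 16-byte tag */
        }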
1080 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c
1081 index adcef35..34e339d 100644
1082 --- a/crypto/poly1305/poly1305_arm.c
1083 +++ b/crypto/poly1305/poly1305_arm.c
1084 @@ -51,6 +51,7 @@
1085 * SUPERCOP by D. J. Bernstein and Peter Schwabe. */
1086
1087 #include <stdint.h>
1088 +#include <string.h>
1089
1090 #include <openssl/poly1305.h>
1091
1092 @@ -202,7 +203,8 @@ struct poly1305_state_st {
1093 unsigned char key[16];
1094 };
1095
1096 -void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32])
1097 +void CRYPTO_poly1305_init_neon(poly1305_state *state,
1098 + const unsigned char key[32])
1099 {
1100 struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
1101 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
1102 @@ -227,7 +229,8 @@ void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32])
1103 st->buf_used = 0;
1104 }
1105
1106 -void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size_t in_len)
1107 +void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in,
1108 + size_t in_len)
1109 {
1110 struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
1111 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
1112 @@ -285,7 +288,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size
1113 }
1114 }
1115
1116 -void CRYPTO_poly1305_finish(poly1305_state* state, unsigned char mac[16])
1117 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16])
1118 {
1119 struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
1120 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
1121 --
1122 1.8.4.1
1123