OLD | NEW |
(Empty) | |
| 1 From aea47606333cfd3e7a09cab3e42e488c79a416af Mon Sep 17 00:00:00 2001 |
| 2 From: Adam Langley <agl@chromium.org> |
| 3 Date: Tue, 5 Nov 2013 13:10:11 -0500 |
| 4 Subject: [PATCH 52/52] Optional NEON support on ARM. |
| 5 |
| 6 This patch causes ARM to build both the NEON and generic versions of |
| 7 ChaCha20 and Poly1305. The NEON code can be enabled at run-time by |
| 8 calling CRYPTO_set_NEON_capable(1). |
| 9 --- |
| 10 .gitignore | 1 + |
| 11 Configure | 2 +- |
| 12 apps/speed.c | 5 + |
| 13 crypto/chacha/chacha_enc.c | 18 + |
| 14 crypto/chacha/chacha_vec.c | 7 + |
| 15 crypto/chacha/chacha_vec_arm.s | 846 +++++++++++++++++++++++++++++++++++++++++ |
| 16 crypto/cryptlib.c | 14 + |
| 17 crypto/crypto.h | 8 + |
| 18 crypto/poly1305/poly1305.c | 35 ++ |
| 19 crypto/poly1305/poly1305_arm.c | 9 +- |
| 20 10 files changed, 941 insertions(+), 4 deletions(-) |
| 21 create mode 100644 crypto/chacha/chacha_vec_arm.s |
| 22 |
| 23 diff --git a/Configure b/Configure |
| 24 index 1b95384..18b7af0 100755 |
| 25 --- a/Configure |
| 26 +++ b/Configure |
| 27 @@ -136,7 +136,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-a
lpha.o:::::::ghash-a |
| 28 my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::
:::::::"; |
| 29 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha2
56-mips.o sha512-mips.o::::::::::"; |
| 30 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::ae
s-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-
s390x.o:::::::ghash-s390x.o:"; |
| 31 -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cb
c.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-a
rmv4.o::chacha_vec.o:poly1305_arm.o poly1305_arm_asm.o:void"; |
| 32 +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cb
c.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-a
rmv4.o::chacha_vec_arm.o chacha_enc.o:poly1305.o poly1305_arm.o poly1305_arm_asm
.o:void"; |
| 33 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-p
arisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-
parisc.o::::32"; |
| 34 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o ae
s-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::gha
sh-parisc.o::::64"; |
| 35 my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o
aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::::"; |
| 36 diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c |
| 37 index 54d1ca3..e4b648f 100644 |
| 38 --- a/crypto/chacha/chacha_enc.c |
| 39 +++ b/crypto/chacha/chacha_enc.c |
| 40 @@ -61,6 +61,7 @@ |
| 41 |
| 42 #if !defined(OPENSSL_NO_CHACHA) |
| 43 |
| 44 +#include <openssl/crypto.h> |
| 45 #include <openssl/chacha.h> |
| 46 |
| 47 /* sigma contains the ChaCha constants, which happen to be an ASCII string. */ |
| 48 @@ -87,6 +88,15 @@ static const char sigma[16] = "expand 32-byte k"; |
| 49 |
| 50 typedef unsigned int uint32_t; |
| 51 |
| 52 +#if __arm__ |
| 53 +/* Defined in chacha_vec_arm.s, which is generated from chacha_vec.c */ |
| 54 +void CRYPTO_chacha_20_neon(unsigned char *out, |
| 55 + const unsigned char *in, size_t in_len, |
| 56 + const unsigned char key[32], |
| 57 + const unsigned char nonce[8], |
| 58 + size_t counter); |
| 59 +#endif |
| 60 + |
| 61 /* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in |
| 62 * |input| and writes the 64 output bytes to |output|. */ |
| 63 static void chacha_core(unsigned char output[64], const uint32_t input[16], |
| 64 @@ -124,6 +134,14 @@ void CRYPTO_chacha_20(unsigned char *out, |
| 65 unsigned char buf[64]; |
| 66 size_t todo, i; |
| 67 |
| 68 +#if __arm__ |
| 69 + if (CRYPTO_is_NEON_capable()) |
| 70 + { |
| 71 + CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter); |
| 72 + return; |
| 73 + } |
| 74 +#endif |
| 75 + |
| 76 input[0] = U8TO32_LITTLE(sigma + 0); |
| 77 input[1] = U8TO32_LITTLE(sigma + 4); |
| 78 input[2] = U8TO32_LITTLE(sigma + 8); |
| 79 diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c |
| 80 index 33b2238..1226c39 100644 |
| 81 --- a/crypto/chacha/chacha_vec.c |
| 82 +++ b/crypto/chacha/chacha_vec.c |
| 83 @@ -154,7 +154,14 @@ typedef unsigned vec __attribute__ ((vector_size (16))); |
| 84 STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \ |
| 85 STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3)); |
| 86 |
| 87 +#if __ARM_NEON__ |
| 88 +/* For ARM, we can't depend on NEON support, so this function is compiled with |
| 89 + * a different name, along with the generic code, and can be enabled at |
| 90 + * run-time. */ |
| 91 +void CRYPTO_chacha_20_neon( |
| 92 +#else |
| 93 void CRYPTO_chacha_20( |
| 94 +#endif |
| 95 unsigned char *out, |
| 96 const unsigned char *in, |
| 97 size_t inlen, |
| 98 diff --git a/crypto/chacha/chacha_vec_arm.s b/crypto/chacha/chacha_vec_arm.s |
| 99 new file mode 100644 |
| 100 index 0000000..24a5050 |
| 101 --- /dev/null |
| 102 +++ b/crypto/chacha/chacha_vec_arm.s |
| 103 @@ -0,0 +1,846 @@ |
| 104 + .syntax unified |
| 105 + .cpu cortex-a8 |
| 106 + .eabi_attribute 27, 3 |
| 107 + .eabi_attribute 28, 1 |
| 108 + .fpu neon |
| 109 + .eabi_attribute 20, 1 |
| 110 + .eabi_attribute 21, 1 |
| 111 + .eabi_attribute 23, 3 |
| 112 + .eabi_attribute 24, 1 |
| 113 + .eabi_attribute 25, 1 |
| 114 + .eabi_attribute 26, 2 |
| 115 + .eabi_attribute 30, 2 |
| 116 + .eabi_attribute 34, 1 |
| 117 + .eabi_attribute 18, 4 |
| 118 + .thumb |
| 119 + .file "chacha_vec.c" |
| 120 + .text |
| 121 + .align 2 |
| 122 + .global CRYPTO_chacha_20_neon |
| 123 + .thumb |
| 124 + .thumb_func |
| 125 + .type CRYPTO_chacha_20_neon, %function |
| 126 +CRYPTO_chacha_20_neon: |
| 127 + @ args = 8, pretend = 0, frame = 296 |
| 128 + @ frame_needed = 1, uses_anonymous_args = 0 |
| 129 + @ link register save eliminated. |
| 130 + push {r4, r5, r6, r7, r8, r9, sl, fp} |
| 131 + fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} |
| 132 + sub sp, sp, #296 |
| 133 + add r7, sp, #0 |
| 134 + movw ip, #43691 |
| 135 + movt ip, 43690 |
| 136 + str r2, [r7, #192] |
| 137 + sub sp, sp, #96 |
| 138 + ldr r4, [r7, #192] |
| 139 + ldr r6, [r7, #392] |
| 140 + ldr r2, .L38+16 |
| 141 + umull r4, ip, ip, r4 |
| 142 + ldr r6, [r6, #0] |
| 143 + ldr r8, [r7, #392] |
| 144 + add r4, sp, #15 |
| 145 + str r3, [r7, #236] |
| 146 + bic r4, r4, #15 |
| 147 + str r6, [r7, #172] |
| 148 + str r4, [r7, #196] |
| 149 + str r0, [r7, #184] |
| 150 + lsrs ip, ip, #7 |
| 151 + str r1, [r7, #180] |
| 152 + ldmia r2, {r0, r1, r2, r3} |
| 153 + ldr r4, [r8, #4] |
| 154 + ldr r5, [r7, #236] |
| 155 + vld1.64 {d24-d25}, [r5:64] |
| 156 + vldr d26, [r5, #16] |
| 157 + vldr d27, [r5, #24] |
| 158 + ldr fp, [r7, #196] |
| 159 + ldr r8, [r7, #396] |
| 160 + ldr r5, [r7, #172] |
| 161 + add r6, fp, #64 |
| 162 + str r4, [r7, #292] |
| 163 + mov r4, #0 |
| 164 + str r8, [r7, #280] |
| 165 + str r5, [r7, #288] |
| 166 + str r4, [r7, #284] |
| 167 + stmia r6, {r0, r1, r2, r3} |
| 168 + vldr d22, [fp, #64] |
| 169 + vldr d23, [fp, #72] |
| 170 + vldr d20, [r7, #280] |
| 171 + vldr d21, [r7, #288] |
| 172 + str ip, [r7, #188] |
| 173 + beq .L20 |
| 174 + lsl r6, ip, #1 |
| 175 + ldr r1, [fp, #68] |
| 176 + add r3, r6, ip |
| 177 + str r6, [r7, #176] |
| 178 + ldr r2, [fp, #72] |
| 179 + add r8, r8, #2 |
| 180 + ldr r5, [fp, #76] |
| 181 + vldr d18, .L38 |
| 182 + vldr d19, .L38+8 |
| 183 + str r4, [r7, #232] |
| 184 + ldr r6, [r7, #180] |
| 185 + ldr r4, [r7, #184] |
| 186 + str r0, [r7, #220] |
| 187 + str r1, [r7, #216] |
| 188 + str r8, [r7, #200] |
| 189 + str r2, [r7, #212] |
| 190 + str r3, [r7, #204] |
| 191 + str r5, [r7, #208] |
| 192 + str r6, [r7, #244] |
| 193 + str r4, [r7, #240] |
| 194 +.L4: |
| 195 + ldr r6, [r7, #236] |
| 196 + vadd.i32 q8, q10, q9 |
| 197 + ldr r5, [r7, #236] |
| 198 + vmov q15, q13 @ v4si |
| 199 + ldr r8, [r7, #232] |
| 200 + vmov q3, q12 @ v4si |
| 201 + ldr r6, [r6, #4] |
| 202 + vmov q2, q11 @ v4si |
| 203 + ldr fp, [r7, #200] |
| 204 + vmov q5, q10 @ v4si |
| 205 + ldr r4, [r7, #236] |
| 206 + vmov q1, q13 @ v4si |
| 207 + add ip, r8, fp |
| 208 + ldr r5, [r5, #0] |
| 209 + ldr r0, [r7, #236] |
| 210 + add r8, r7, #208 |
| 211 + ldr r1, [r7, #236] |
| 212 + vmov q0, q12 @ v4si |
| 213 + str r6, [r7, #260] |
| 214 + vmov q4, q11 @ v4si |
| 215 + ldr r6, [r7, #392] |
| 216 + ldmia r8, {r8, r9, sl, fp} |
| 217 + ldr r0, [r0, #8] |
| 218 + ldr r1, [r1, #12] |
| 219 + str r5, [r7, #224] |
| 220 + ldr r5, [r4, #24] |
| 221 + ldr r3, [r4, #28] |
| 222 + ldr r2, [r6, #4] |
| 223 + str r0, [r7, #256] |
| 224 + str r1, [r7, #228] |
| 225 + str r5, [r7, #272] |
| 226 + ldr r5, [r6, #0] |
| 227 + movs r6, #0 |
| 228 + ldr r0, [r4, #16] |
| 229 + ldr r1, [r4, #20] |
| 230 + movs r4, #10 |
| 231 + str r2, [r7, #20] |
| 232 + str r3, [r7, #276] |
| 233 + str r9, [r7, #268] |
| 234 + mov r9, r6 |
| 235 + str r4, [r7, #248] |
| 236 + ldr r2, [r7, #256] |
| 237 + ldr r3, [r7, #228] |
| 238 + str r8, [r7, #252] |
| 239 + mov r8, sl |
| 240 + ldr r6, [r7, #272] |
| 241 + mov sl, ip |
| 242 + str r1, [r7, #264] |
| 243 + ldr ip, [r7, #20] |
| 244 + str r6, [r7, #256] |
| 245 + mov r6, r5 |
| 246 + ldr r1, [r7, #260] |
| 247 + mov r5, r0 |
| 248 + ldr r0, [r7, #224] |
| 249 + b .L39 |
| 250 +.L40: |
| 251 + .align 3 |
| 252 +.L38: |
| 253 + .word 1 |
| 254 + .word 0 |
| 255 + .word 0 |
| 256 + .word 0 |
| 257 + .word .LANCHOR0 |
| 258 +.L39: |
| 259 +.L3: |
| 260 + vadd.i32 q4, q4, q0 |
| 261 + add r8, r8, r1 |
| 262 + vadd.i32 q2, q2, q3 |
| 263 + str r8, [r7, #260] |
| 264 + veor q5, q5, q4 |
| 265 + ldr r8, [r7, #268] |
| 266 + veor q8, q8, q2 |
| 267 + add fp, fp, r0 |
| 268 + str fp, [r7, #272] |
| 269 + add r8, r8, r2 |
| 270 + vrev32.16 q5, q5 |
| 271 + str r8, [r7, #268] |
| 272 + vrev32.16 q8, q8 |
| 273 + vadd.i32 q1, q1, q5 |
| 274 + vadd.i32 q15, q15, q8 |
| 275 + ldr r8, [r7, #272] |
| 276 + veor q0, q1, q0 |
| 277 + ldr r4, [r7, #252] |
| 278 + veor q3, q15, q3 |
| 279 + eor sl, sl, r8 |
| 280 + ldr r8, [r7, #268] |
| 281 + add fp, r4, r3 |
| 282 + vshl.i32 q7, q0, #12 |
| 283 + ldr r4, [r7, #260] |
| 284 + vshl.i32 q6, q3, #12 |
| 285 + eor r6, r6, r8 |
| 286 + eor r9, r9, r4 |
| 287 + ldr r4, [r7, #264] |
| 288 + vsri.32 q7, q0, #20 |
| 289 + ror r8, r6, #16 |
| 290 + ldr r6, [r7, #256] |
| 291 + eor ip, ip, fp |
| 292 + vsri.32 q6, q3, #20 |
| 293 + ror sl, sl, #16 |
| 294 + ror r9, r9, #16 |
| 295 + add r5, r5, sl |
| 296 + vadd.i32 q4, q4, q7 |
| 297 + str r5, [r7, #228] |
| 298 + vadd.i32 q2, q2, q6 |
| 299 + add r5, r4, r9 |
| 300 + add r4, r6, r8 |
| 301 + ldr r6, [r7, #276] |
| 302 + ror ip, ip, #16 |
| 303 + veor q5, q4, q5 |
| 304 + veor q8, q2, q8 |
| 305 + add r6, r6, ip |
| 306 + str r6, [r7, #256] |
| 307 + eors r1, r1, r5 |
| 308 + ldr r6, [r7, #228] |
| 309 + vshl.i32 q3, q5, #8 |
| 310 + vshl.i32 q14, q8, #8 |
| 311 + eors r2, r2, r4 |
| 312 + eors r0, r0, r6 |
| 313 + ldr r6, [r7, #256] |
| 314 + vsri.32 q3, q5, #24 |
| 315 + ror r1, r1, #20 |
| 316 + eors r3, r3, r6 |
| 317 + ldr r6, [r7, #272] |
| 318 + ror r0, r0, #20 |
| 319 + vsri.32 q14, q8, #24 |
| 320 + adds r6, r0, r6 |
| 321 + str r6, [r7, #276] |
| 322 + ldr r6, [r7, #260] |
| 323 + vadd.i32 q1, q1, q3 |
| 324 + vadd.i32 q15, q15, q14 |
| 325 + ror r2, r2, #20 |
| 326 + adds r6, r1, r6 |
| 327 + str r6, [r7, #252] |
| 328 + ldr r6, [r7, #268] |
| 329 + veor q6, q15, q6 |
| 330 + veor q7, q1, q7 |
| 331 + ror r3, r3, #20 |
| 332 + adds r6, r2, r6 |
| 333 + str r6, [r7, #272] |
| 334 + ldr r6, [r7, #276] |
| 335 + vshl.i32 q0, q6, #7 |
| 336 + vshl.i32 q5, q7, #7 |
| 337 + add fp, r3, fp |
| 338 + eor sl, r6, sl |
| 339 + ldr r6, [r7, #252] |
| 340 + eor ip, fp, ip |
| 341 + vsri.32 q0, q6, #25 |
| 342 + eor r9, r6, r9 |
| 343 + ldr r6, [r7, #272] |
| 344 + ror sl, sl, #24 |
| 345 + vsri.32 q5, q7, #25 |
| 346 + eor r8, r6, r8 |
| 347 + ldr r6, [r7, #228] |
| 348 + ror r9, r9, #24 |
| 349 + ror ip, ip, #24 |
| 350 + add r6, sl, r6 |
| 351 + str r6, [r7, #268] |
| 352 + ldr r6, [r7, #256] |
| 353 + add r5, r9, r5 |
| 354 + str r5, [r7, #264] |
| 355 + vext.32 q5, q5, q5, #1 |
| 356 + add r5, ip, r6 |
| 357 + ldr r6, [r7, #268] |
| 358 + vext.32 q0, q0, q0, #1 |
| 359 + vadd.i32 q4, q4, q5 |
| 360 + eors r0, r0, r6 |
| 361 + ldr r6, [r7, #264] |
| 362 + vadd.i32 q2, q2, q0 |
| 363 + vext.32 q3, q3, q3, #3 |
| 364 + ror r8, r8, #24 |
| 365 + eors r1, r1, r6 |
| 366 + vext.32 q14, q14, q14, #3 |
| 367 + add r4, r8, r4 |
| 368 + ldr r6, [r7, #276] |
| 369 + veor q3, q4, q3 |
| 370 + veor q14, q2, q14 |
| 371 + eors r2, r2, r4 |
| 372 + ror r1, r1, #25 |
| 373 + vext.32 q1, q1, q1, #2 |
| 374 + adds r6, r1, r6 |
| 375 + str r6, [r7, #276] |
| 376 + vext.32 q15, q15, q15, #2 |
| 377 + ldr r6, [r7, #252] |
| 378 + eors r3, r3, r5 |
| 379 + ror r2, r2, #25 |
| 380 + vrev32.16 q8, q14 |
| 381 + adds r6, r2, r6 |
| 382 + vrev32.16 q3, q3 |
| 383 + str r6, [r7, #260] |
| 384 + vadd.i32 q1, q1, q3 |
| 385 + ldr r6, [r7, #272] |
| 386 + vadd.i32 q15, q15, q8 |
| 387 + ror r3, r3, #25 |
| 388 + veor q5, q1, q5 |
| 389 + adds r6, r3, r6 |
| 390 + veor q0, q15, q0 |
| 391 + str r6, [r7, #256] |
| 392 + ldr r6, [r7, #260] |
| 393 + ror r0, r0, #25 |
| 394 + add fp, r0, fp |
| 395 + vshl.i32 q6, q5, #12 |
| 396 + eor sl, r6, sl |
| 397 + ldr r6, [r7, #276] |
| 398 + vshl.i32 q14, q0, #12 |
| 399 + eor r8, fp, r8 |
| 400 + eor ip, r6, ip |
| 401 + ldr r6, [r7, #256] |
| 402 + vsri.32 q6, q5, #20 |
| 403 + ror sl, sl, #16 |
| 404 + eor r9, r6, r9 |
| 405 + ror r6, r8, #16 |
| 406 + vsri.32 q14, q0, #20 |
| 407 + ldr r8, [r7, #264] |
| 408 + ror ip, ip, #16 |
| 409 + add r5, sl, r5 |
| 410 + add r8, r6, r8 |
| 411 + add r4, ip, r4 |
| 412 + str r4, [r7, #228] |
| 413 + eor r0, r8, r0 |
| 414 + str r5, [r7, #272] |
| 415 + vadd.i32 q4, q4, q6 |
| 416 + ldr r5, [r7, #228] |
| 417 + vadd.i32 q2, q2, q14 |
| 418 + ldr r4, [r7, #268] |
| 419 + ror r0, r0, #20 |
| 420 + veor q3, q4, q3 |
| 421 + eors r1, r1, r5 |
| 422 + veor q0, q2, q8 |
| 423 + str r8, [r7, #264] |
| 424 + str r0, [r7, #20] |
| 425 + add fp, r0, fp |
| 426 + ldr r8, [r7, #272] |
| 427 + ror r9, r9, #16 |
| 428 + ldr r0, [r7, #276] |
| 429 + add r4, r9, r4 |
| 430 + str fp, [r7, #252] |
| 431 + ror r1, r1, #20 |
| 432 + add fp, r1, r0 |
| 433 + eor r2, r8, r2 |
| 434 + ldr r0, [r7, #252] |
| 435 + eors r3, r3, r4 |
| 436 + vshl.i32 q5, q3, #8 |
| 437 + str r4, [r7, #224] |
| 438 + vshl.i32 q8, q0, #8 |
| 439 + ldr r4, [r7, #260] |
| 440 + ldr r5, [r7, #256] |
| 441 + ror r2, r2, #20 |
| 442 + ror r3, r3, #20 |
| 443 + eors r6, r6, r0 |
| 444 + adds r5, r3, r5 |
| 445 + add r8, r2, r4 |
| 446 + vsri.32 q5, q3, #24 |
| 447 + ldr r4, [r7, #264] |
| 448 + eor r9, r5, r9 |
| 449 + eor ip, fp, ip |
| 450 + vsri.32 q8, q0, #24 |
| 451 + eor sl, r8, sl |
| 452 + ror r6, r6, #24 |
| 453 + ldr r0, [r7, #272] |
| 454 + str r5, [r7, #268] |
| 455 + adds r4, r6, r4 |
| 456 + ldr r5, [r7, #228] |
| 457 + vadd.i32 q1, q1, q5 |
| 458 + str r4, [r7, #264] |
| 459 + vadd.i32 q15, q15, q8 |
| 460 + ldr r4, [r7, #224] |
| 461 + ror ip, ip, #24 |
| 462 + ror sl, sl, #24 |
| 463 + ror r9, r9, #24 |
| 464 + add r5, ip, r5 |
| 465 + add r0, sl, r0 |
| 466 + str r5, [r7, #256] |
| 467 + add r5, r9, r4 |
| 468 + str r0, [r7, #276] |
| 469 + veor q6, q1, q6 |
| 470 + ldr r4, [r7, #20] |
| 471 + veor q14, q15, q14 |
| 472 + ldr r0, [r7, #264] |
| 473 + eors r3, r3, r5 |
| 474 + vshl.i32 q0, q6, #7 |
| 475 + vext.32 q1, q1, q1, #2 |
| 476 + eors r0, r0, r4 |
| 477 + ldr r4, [r7, #276] |
| 478 + str r0, [r7, #272] |
| 479 + vshl.i32 q3, q14, #7 |
| 480 + eors r2, r2, r4 |
| 481 + ldr r4, [r7, #272] |
| 482 + ldr r0, [r7, #256] |
| 483 + vsri.32 q0, q6, #25 |
| 484 + ror r2, r2, #25 |
| 485 + ror r3, r3, #25 |
| 486 + eors r1, r1, r0 |
| 487 + vsri.32 q3, q14, #25 |
| 488 + ror r0, r4, #25 |
| 489 + ldr r4, [r7, #248] |
| 490 + ror r1, r1, #25 |
| 491 + vext.32 q5, q5, q5, #1 |
| 492 + subs r4, r4, #1 |
| 493 + str r4, [r7, #248] |
| 494 + vext.32 q15, q15, q15, #2 |
| 495 + vext.32 q8, q8, q8, #1 |
| 496 + vext.32 q0, q0, q0, #3 |
| 497 + vext.32 q3, q3, q3, #3 |
| 498 + bne .L3 |
| 499 + ldr r4, [r7, #256] |
| 500 + vadd.i32 q4, q11, q4 |
| 501 + str r2, [r7, #256] |
| 502 + vadd.i32 q14, q10, q9 |
| 503 + ldr r2, [r7, #244] |
| 504 + vld1.64 {d12-d13}, [r2:64] |
| 505 + str r4, [r7, #272] |
| 506 + veor q4, q4, q6 |
| 507 + ldr r4, [r7, #220] |
| 508 + vadd.i32 q10, q10, q5 |
| 509 + ldr r2, [r7, #216] |
| 510 + vadd.i32 q0, q12, q0 |
| 511 + add fp, fp, r4 |
| 512 + str ip, [r7, #20] |
| 513 + ldr r4, [r7, #212] |
| 514 + mov ip, sl |
| 515 + str r0, [r7, #224] |
| 516 + mov sl, r8 |
| 517 + mov r0, r5 |
| 518 + ldr r8, [r7, #252] |
| 519 + mov r5, r6 |
| 520 + add sl, sl, r2 |
| 521 + mov r6, r9 |
| 522 + ldr r2, [r7, #208] |
| 523 + ldr r9, [r7, #268] |
| 524 + vadd.i32 q1, q13, q1 |
| 525 + vadd.i32 q2, q11, q2 |
| 526 + str r1, [r7, #260] |
| 527 + add r9, r9, r4 |
| 528 + add r4, r8, r2 |
| 529 + ldr r8, [r7, #232] |
| 530 + vadd.i32 q3, q12, q3 |
| 531 + vadd.i32 q15, q13, q15 |
| 532 + str r3, [r7, #228] |
| 533 + add r2, r8, #2 |
| 534 + vadd.i32 q8, q14, q8 |
| 535 + add ip, r2, ip |
| 536 + ldr r2, [r7, #240] |
| 537 + vst1.64 {d8-d9}, [r2:64] |
| 538 + ldr r2, [r7, #244] |
| 539 + ldr r3, [r7, #276] |
| 540 + vldr d8, [r2, #16] |
| 541 + vldr d9, [r2, #24] |
| 542 + ldr r1, [r7, #264] |
| 543 + veor q0, q0, q4 |
| 544 + add r8, r8, #3 |
| 545 + str r8, [r7, #232] |
| 546 + ldr r8, [r7, #240] |
| 547 + vstr d0, [r8, #16] |
| 548 + vstr d1, [r8, #24] |
| 549 + vldr d0, [r2, #32] |
| 550 + vldr d1, [r2, #40] |
| 551 + veor q1, q1, q0 |
| 552 + vstr d2, [r8, #32] |
| 553 + vstr d3, [r8, #40] |
| 554 + vldr d2, [r2, #48] |
| 555 + vldr d3, [r2, #56] |
| 556 + veor q10, q10, q1 |
| 557 + vstr d20, [r8, #48] |
| 558 + vstr d21, [r8, #56] |
| 559 + vldr d8, [r2, #64] |
| 560 + vldr d9, [r2, #72] |
| 561 + veor q2, q2, q4 |
| 562 + vstr d4, [r8, #64] |
| 563 + vstr d5, [r8, #72] |
| 564 + vldr d10, [r2, #80] |
| 565 + vldr d11, [r2, #88] |
| 566 + veor q3, q3, q5 |
| 567 + vstr d6, [r8, #80] |
| 568 + vstr d7, [r8, #88] |
| 569 + vldr d12, [r2, #96] |
| 570 + vldr d13, [r2, #104] |
| 571 + veor q15, q15, q6 |
| 572 + vstr d30, [r8, #96] |
| 573 + vstr d31, [r8, #104] |
| 574 + vldr d20, [r2, #112] |
| 575 + vldr d21, [r2, #120] |
| 576 + veor q8, q8, q10 |
| 577 + vstr d16, [r8, #112] |
| 578 + vstr d17, [r8, #120] |
| 579 + mov r8, r2 |
| 580 + ldr r2, [r2, #128] |
| 581 + vadd.i32 q10, q14, q9 |
| 582 + eor r2, fp, r2 |
| 583 + ldr fp, [r7, #240] |
| 584 + vadd.i32 q10, q10, q9 |
| 585 + str r2, [fp, #128] |
| 586 + ldr r2, [r8, #132] |
| 587 + eor r2, sl, r2 |
| 588 + str r2, [fp, #132] |
| 589 + ldr r2, [r8, #136] |
| 590 + eor r2, r9, r2 |
| 591 + str r2, [fp, #136] |
| 592 + ldr r2, [r8, #140] |
| 593 + eors r2, r2, r4 |
| 594 + str r2, [fp, #140] |
| 595 + ldr r2, [r7, #236] |
| 596 + ldr r4, [r8, #144] |
| 597 + ldr r2, [r2, #0] |
| 598 + str r4, [r7, #168] |
| 599 + ldr r4, [r7, #224] |
| 600 + add r8, r4, r2 |
| 601 + ldr r2, [r7, #168] |
| 602 + ldr r4, [r7, #236] |
| 603 + eor r8, r8, r2 |
| 604 + ldr r2, [r7, #244] |
| 605 + str r8, [fp, #144] |
| 606 + ldr r4, [r4, #4] |
| 607 + ldr r2, [r2, #148] |
| 608 + str r2, [r7, #36] |
| 609 + ldr r2, [r7, #260] |
| 610 + add r8, r2, r4 |
| 611 + ldr r4, [r7, #36] |
| 612 + ldr r2, [r7, #236] |
| 613 + eor r8, r8, r4 |
| 614 + ldr r4, [r7, #244] |
| 615 + str r8, [fp, #148] |
| 616 + ldr r2, [r2, #8] |
| 617 + ldr r4, [r4, #152] |
| 618 + str r4, [r7, #32] |
| 619 + ldr r4, [r7, #256] |
| 620 + add r8, r4, r2 |
| 621 + ldr r2, [r7, #32] |
| 622 + eor r8, r8, r2 |
| 623 + str r8, [fp, #152] |
| 624 + ldr r2, [r7, #244] |
| 625 + ldr r4, [r7, #236] |
| 626 + ldr r2, [r2, #156] |
| 627 + ldr r4, [r4, #12] |
| 628 + str r2, [r7, #28] |
| 629 + ldr r2, [r7, #228] |
| 630 + add r8, r2, r4 |
| 631 + ldr r4, [r7, #28] |
| 632 + ldr r2, [r7, #244] |
| 633 + eor r8, r8, r4 |
| 634 + str r8, [fp, #156] |
| 635 + ldr r8, [r7, #236] |
| 636 + ldr r2, [r2, #160] |
| 637 + ldr r4, [r8, #16] |
| 638 + adds r0, r0, r4 |
| 639 + ldr r4, [r7, #244] |
| 640 + eors r0, r0, r2 |
| 641 + str r0, [fp, #160] |
| 642 + ldr r0, [r8, #20] |
| 643 + ldr r2, [r4, #164] |
| 644 + adds r1, r1, r0 |
| 645 + ldr r0, [r7, #272] |
| 646 + eors r1, r1, r2 |
| 647 + str r1, [fp, #164] |
| 648 + ldr r2, [r8, #24] |
| 649 + ldr r1, [r4, #168] |
| 650 + adds r2, r0, r2 |
| 651 + eors r2, r2, r1 |
| 652 + str r2, [fp, #168] |
| 653 + ldr r1, [r8, #28] |
| 654 + ldr r2, [r4, #172] |
| 655 + adds r3, r3, r1 |
| 656 + eors r3, r3, r2 |
| 657 + str r3, [fp, #172] |
| 658 + ldr r3, [r4, #176] |
| 659 + eor r3, ip, r3 |
| 660 + str r3, [fp, #176] |
| 661 + ldr r3, [r4, #180] |
| 662 + ldr r4, [r7, #392] |
| 663 + eors r6, r6, r3 |
| 664 + str r6, [fp, #180] |
| 665 + ldr r6, [r7, #244] |
| 666 + ldr r2, [r4, #0] |
| 667 + ldr r3, [r6, #184] |
| 668 + adds r5, r5, r2 |
| 669 + eors r5, r5, r3 |
| 670 + str r5, [fp, #184] |
| 671 + ldr r2, [r6, #188] |
| 672 + adds r6, r6, #192 |
| 673 + ldr r3, [r4, #4] |
| 674 + str r6, [r7, #244] |
| 675 + ldr r0, [r7, #20] |
| 676 + ldr r1, [r7, #232] |
| 677 + adds r4, r0, r3 |
| 678 + eors r4, r4, r2 |
| 679 + ldr r2, [r7, #204] |
| 680 + str r4, [fp, #188] |
| 681 + add fp, fp, #192 |
| 682 + cmp r1, r2 |
| 683 + str fp, [r7, #240] |
| 684 + bne .L4 |
| 685 + ldr r4, [r7, #188] |
| 686 + ldr r3, [r7, #176] |
| 687 + ldr r6, [r7, #184] |
| 688 + adds r5, r3, r4 |
| 689 + ldr r8, [r7, #180] |
| 690 + lsls r5, r5, #6 |
| 691 + adds r4, r6, r5 |
| 692 + add r5, r8, r5 |
| 693 +.L2: |
| 694 + ldr fp, [r7, #192] |
| 695 + movw r3, #43691 |
| 696 + movt r3, 43690 |
| 697 + ldr r6, [r7, #192] |
| 698 + umull fp, r3, r3, fp |
| 699 + lsrs r3, r3, #7 |
| 700 + add r3, r3, r3, lsl #1 |
| 701 + sub r3, r6, r3, lsl #6 |
| 702 + lsrs r6, r3, #6 |
| 703 + beq .L5 |
| 704 + add r1, r5, #16 |
| 705 + add r2, r4, #16 |
| 706 + mov r0, r6 |
| 707 + vldr d30, .L41 |
| 708 + vldr d31, .L41+8 |
| 709 +.L6: |
| 710 + vmov q8, q10 @ v4si |
| 711 + movs r3, #10 |
| 712 + vmov q1, q13 @ v4si |
| 713 + vmov q14, q12 @ v4si |
| 714 + vmov q3, q11 @ v4si |
| 715 +.L7: |
| 716 + vadd.i32 q3, q3, q14 |
| 717 + subs r3, r3, #1 |
| 718 + veor q2, q8, q3 |
| 719 + vrev32.16 q2, q2 |
| 720 + vadd.i32 q8, q1, q2 |
| 721 + veor q9, q8, q14 |
| 722 + vshl.i32 q14, q9, #12 |
| 723 + vsri.32 q14, q9, #20 |
| 724 + vadd.i32 q3, q3, q14 |
| 725 + veor q2, q3, q2 |
| 726 + vshl.i32 q9, q2, #8 |
| 727 + vsri.32 q9, q2, #24 |
| 728 + vadd.i32 q8, q8, q9 |
| 729 + vext.32 q9, q9, q9, #3 |
| 730 + veor q14, q8, q14 |
| 731 + vext.32 q1, q8, q8, #2 |
| 732 + vshl.i32 q8, q14, #7 |
| 733 + vsri.32 q8, q14, #25 |
| 734 + vext.32 q8, q8, q8, #1 |
| 735 + vadd.i32 q3, q3, q8 |
| 736 + veor q2, q3, q9 |
| 737 + vrev32.16 q2, q2 |
| 738 + vadd.i32 q9, q1, q2 |
| 739 + veor q8, q9, q8 |
| 740 + vshl.i32 q14, q8, #12 |
| 741 + vsri.32 q14, q8, #20 |
| 742 + vadd.i32 q3, q3, q14 |
| 743 + veor q2, q3, q2 |
| 744 + vshl.i32 q8, q2, #8 |
| 745 + vsri.32 q8, q2, #24 |
| 746 + vadd.i32 q9, q9, q8 |
| 747 + vext.32 q8, q8, q8, #1 |
| 748 + veor q14, q9, q14 |
| 749 + vext.32 q1, q9, q9, #2 |
| 750 + vshl.i32 q9, q14, #7 |
| 751 + vsri.32 q9, q14, #25 |
| 752 + vext.32 q14, q9, q9, #3 |
| 753 + bne .L7 |
| 754 + vadd.i32 q8, q10, q8 |
| 755 + subs r0, r0, #1 |
| 756 + vadd.i32 q3, q11, q3 |
| 757 + vldr d0, [r1, #-16] |
| 758 + vldr d1, [r1, #-8] |
| 759 + vadd.i32 q14, q12, q14 |
| 760 + vadd.i32 q1, q13, q1 |
| 761 + veor q3, q3, q0 |
| 762 + vstr d6, [r2, #-16] |
| 763 + vstr d7, [r2, #-8] |
| 764 + vadd.i32 q10, q10, q15 |
| 765 + vld1.64 {d8-d9}, [r1:64] |
| 766 + veor q14, q14, q4 |
| 767 + vst1.64 {d28-d29}, [r2:64] |
| 768 + vldr d10, [r1, #16] |
| 769 + vldr d11, [r1, #24] |
| 770 + veor q1, q1, q5 |
| 771 + vstr d2, [r2, #16] |
| 772 + vstr d3, [r2, #24] |
| 773 + vldr d18, [r1, #32] |
| 774 + vldr d19, [r1, #40] |
| 775 + add r1, r1, #64 |
| 776 + veor q8, q8, q9 |
| 777 + vstr d16, [r2, #32] |
| 778 + vstr d17, [r2, #40] |
| 779 + add r2, r2, #64 |
| 780 + bne .L6 |
| 781 + lsls r6, r6, #6 |
| 782 + adds r4, r4, r6 |
| 783 + adds r5, r5, r6 |
| 784 +.L5: |
| 785 + ldr r6, [r7, #192] |
| 786 + ands ip, r6, #63 |
| 787 + beq .L1 |
| 788 + vmov q8, q10 @ v4si |
| 789 + movs r3, #10 |
| 790 + vmov q14, q13 @ v4si |
| 791 + vmov q9, q12 @ v4si |
| 792 + vmov q15, q11 @ v4si |
| 793 +.L10: |
| 794 + vadd.i32 q15, q15, q9 |
| 795 + subs r3, r3, #1 |
| 796 + veor q8, q8, q15 |
| 797 + vrev32.16 q8, q8 |
| 798 + vadd.i32 q3, q14, q8 |
| 799 + veor q9, q3, q9 |
| 800 + vshl.i32 q14, q9, #12 |
| 801 + vsri.32 q14, q9, #20 |
| 802 + vadd.i32 q15, q15, q14 |
| 803 + veor q9, q15, q8 |
| 804 + vshl.i32 q8, q9, #8 |
| 805 + vsri.32 q8, q9, #24 |
| 806 + vadd.i32 q9, q3, q8 |
| 807 + vext.32 q8, q8, q8, #3 |
| 808 + veor q2, q9, q14 |
| 809 + vext.32 q14, q9, q9, #2 |
| 810 + vshl.i32 q9, q2, #7 |
| 811 + vsri.32 q9, q2, #25 |
| 812 + vext.32 q9, q9, q9, #1 |
| 813 + vadd.i32 q15, q15, q9 |
| 814 + veor q3, q15, q8 |
| 815 + vrev32.16 q3, q3 |
| 816 + vadd.i32 q14, q14, q3 |
| 817 + veor q8, q14, q9 |
| 818 + vshl.i32 q9, q8, #12 |
| 819 + vsri.32 q9, q8, #20 |
| 820 + vadd.i32 q15, q15, q9 |
| 821 + veor q3, q15, q3 |
| 822 + vshl.i32 q8, q3, #8 |
| 823 + vsri.32 q8, q3, #24 |
| 824 + vadd.i32 q14, q14, q8 |
| 825 + vext.32 q8, q8, q8, #1 |
| 826 + veor q3, q14, q9 |
| 827 + vext.32 q14, q14, q14, #2 |
| 828 + vshl.i32 q9, q3, #7 |
| 829 + vsri.32 q9, q3, #25 |
| 830 + vext.32 q9, q9, q9, #3 |
| 831 + bne .L10 |
| 832 + cmp ip, #15 |
| 833 + vadd.i32 q11, q11, q15 |
| 834 + bhi .L37 |
| 835 + ldr fp, [r7, #196] |
| 836 + vst1.64 {d22-d23}, [fp:128] |
| 837 +.L14: |
| 838 + ldr r6, [r7, #192] |
| 839 + and r3, r6, #48 |
| 840 + cmp ip, r3 |
| 841 + bls .L1 |
| 842 + adds r0, r5, r3 |
| 843 + adds r1, r4, r3 |
| 844 + add r2, r0, #16 |
| 845 + add r6, r1, #16 |
| 846 + cmp r1, r2 |
| 847 + it cc |
| 848 + cmpcc r0, r6 |
| 849 + rsb r9, r3, ip |
| 850 + ite cc |
| 851 + movcc r2, #0 |
| 852 + movcs r2, #1 |
| 853 + cmp r9, #15 |
| 854 + ite ls |
| 855 + movls r2, #0 |
| 856 + andhi r2, r2, #1 |
| 857 + lsr r8, r9, #4 |
| 858 + eor r2, r2, #1 |
| 859 + cmp r8, #0 |
| 860 + it eq |
| 861 + orreq r2, r2, #1 |
| 862 + lsl sl, r8, #4 |
| 863 + cbnz r2, .L35 |
| 864 + ldr fp, [r7, #196] |
| 865 + add r6, fp, r3 |
| 866 +.L17: |
| 867 + vld1.8 {q8}, [r0]! |
| 868 + adds r2, r2, #1 |
| 869 + cmp r8, r2 |
| 870 + vld1.8 {q9}, [r6]! |
| 871 + veor q8, q9, q8 |
| 872 + vst1.8 {q8}, [r1]! |
| 873 + bhi .L17 |
| 874 + cmp r9, sl |
| 875 + add r3, r3, sl |
| 876 + beq .L1 |
| 877 +.L35: |
| 878 + ldr r0, [r7, #196] |
| 879 +.L25: |
| 880 + ldrb r2, [r5, r3] @ zero_extendqisi2 |
| 881 + ldrb r1, [r3, r0] @ zero_extendqisi2 |
| 882 + eors r2, r2, r1 |
| 883 + strb r2, [r4, r3] |
| 884 + adds r3, r3, #1 |
| 885 + cmp ip, r3 |
| 886 + bhi .L25 |
| 887 +.L1: |
| 888 + add r7, r7, #296 |
| 889 + mov sp, r7 |
| 890 + fldmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} |
| 891 + pop {r4, r5, r6, r7, r8, r9, sl, fp} |
| 892 + bx lr |
| 893 +.L37: |
| 894 + cmp ip, #31 |
| 895 + vld1.64 {d0-d1}, [r5:64] |
| 896 + vadd.i32 q9, q12, q9 |
| 897 + veor q11, q11, q0 |
| 898 + vst1.64 {d22-d23}, [r4:64] |
| 899 + bls .L12 |
| 900 + cmp ip, #47 |
| 901 + vldr d2, [r5, #16] |
| 902 + vldr d3, [r5, #24] |
| 903 + vadd.i32 q13, q13, q14 |
| 904 + veor q9, q9, q1 |
| 905 + vstr d18, [r4, #16] |
| 906 + vstr d19, [r4, #24] |
| 907 + bls .L13 |
| 908 + vadd.i32 q8, q8, q10 |
| 909 + vldr d0, [r5, #32] |
| 910 + vldr d1, [r5, #40] |
| 911 + ldr r6, [r7, #196] |
| 912 + vstr d16, [r6, #48] |
| 913 + vstr d17, [r6, #56] |
| 914 + veor q8, q13, q0 |
| 915 + vstr d16, [r4, #32] |
| 916 + vstr d17, [r4, #40] |
| 917 + b .L14 |
| 918 +.L12: |
| 919 + ldr r8, [r7, #196] |
| 920 + vstr d18, [r8, #16] |
| 921 + vstr d19, [r8, #24] |
| 922 + b .L14 |
| 923 +.L20: |
| 924 + ldr r5, [r7, #180] |
| 925 + ldr r4, [r7, #184] |
| 926 + b .L2 |
| 927 +.L13: |
| 928 + ldr r6, [r7, #196] |
| 929 + vstr d26, [r6, #32] |
| 930 + vstr d27, [r6, #40] |
| 931 + b .L14 |
| 932 +.L42: |
| 933 + .align 3 |
| 934 +.L41: |
| 935 + .word 1 |
| 936 + .word 0 |
| 937 + .word 0 |
| 938 + .word 0 |
| 939 + .size CRYPTO_chacha_20_neon, .-CRYPTO_chacha_20_neon |
| 940 + .section .rodata |
| 941 + .align 3 |
| 942 +.LANCHOR0 = . + 0 |
| 943 +.LC0: |
| 944 + .word 1634760805 |
| 945 + .word 857760878 |
| 946 + .word 2036477234 |
| 947 + .word 1797285236 |
| 948 + .ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2012.10-20121022 - Linaro
GCC 2012.10) 4.7.3 20121001 (prerelease)" |
| 949 + .section .note.GNU-stack,"",%progbits |
| 950 diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c |
| 951 index 7bef015..3b6ab1d 100644 |
| 952 --- a/crypto/cryptlib.c |
| 953 +++ b/crypto/cryptlib.c |
| 954 @@ -661,6 +661,20 @@ const char *CRYPTO_get_lock_name(int type) |
| 955 return(sk_OPENSSL_STRING_value(app_locks,type-CRYPTO_NUM_LOCKS))
; |
| 956 } |
| 957 |
| 958 +#if __arm__ |
| 959 +static int global_arm_neon_enabled = 0; |
| 960 + |
| 961 +void CRYPTO_set_NEON_capable(int on) |
| 962 + { |
| 963 + global_arm_neon_enabled = on != 0; |
| 964 + } |
| 965 + |
| 966 +int CRYPTO_is_NEON_capable() |
| 967 + { |
| 968 + return global_arm_neon_enabled; |
| 969 + } |
| 970 +#endif |
| 971 + |
| 972 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ |
| 973 defined(__INTEL__) || \ |
| 974 defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined
(_M_X64) |
| 975 diff --git a/crypto/crypto.h b/crypto/crypto.h |
| 976 index e11ac73..db339c3 100644 |
| 977 --- a/crypto/crypto.h |
| 978 +++ b/crypto/crypto.h |
| 979 @@ -414,6 +414,14 @@ void CRYPTO_cleanup_all_ex_data(void); |
| 980 |
| 981 int CRYPTO_get_new_lockid(char *name); |
| 982 |
| 983 +/* CRYPTO_set_NEON_capable enables any NEON (ARM vector) dependent code. This |
| 984 + * function should be called before any non-init functions. */ |
| 985 +void CRYPTO_set_NEON_capable(int on); |
| 986 + |
| 987 +/* CRYPTO_is_NEON_capable returns the last value given to |
| 988 + * CRYPTO_set_NEON_capable, or else zero if it has never been called. */ |
| 989 +int CRYPTO_is_NEON_capable(); |
| 990 + |
| 991 int CRYPTO_num_locks(void); /* return CRYPTO_NUM_LOCKS (shared libs!) */ |
| 992 void CRYPTO_lock(int mode, int type,const char *file,int line); |
| 993 void CRYPTO_set_locking_callback(void (*func)(int mode,int type, |
| 994 diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c |
| 995 index 2e5621d..00d53bf 100644 |
| 996 --- a/crypto/poly1305/poly1305.c |
| 997 +++ b/crypto/poly1305/poly1305.c |
| 998 @@ -90,6 +90,17 @@ static void U32TO8_LE(unsigned char *m, uint32_t v) |
| 999 } |
| 1000 #endif |
| 1001 |
| 1002 +#if __arm__ |
| 1003 +void CRYPTO_poly1305_init_neon(poly1305_state* state, |
| 1004 + const unsigned char key[32]); |
| 1005 + |
| 1006 +void CRYPTO_poly1305_update_neon(poly1305_state* state, |
| 1007 + const unsigned char *in, |
| 1008 + size_t in_len); |
| 1009 + |
| 1010 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]); |
| 1011 +#endif |
| 1012 + |
| 1013 static uint64_t |
| 1014 mul32x32_64(uint32_t a, uint32_t b) |
| 1015 { |
| 1016 @@ -207,6 +218,14 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const uns
igned char key[32]) |
| 1017 struct poly1305_state_st *state = (struct poly1305_state_st*) statep; |
| 1018 uint32_t t0,t1,t2,t3; |
| 1019 |
| 1020 +#if __arm__ |
| 1021 + if (CRYPTO_is_NEON_capable()) |
| 1022 + { |
| 1023 + CRYPTO_poly1305_init_neon(statep, key); |
| 1024 + return; |
| 1025 + } |
| 1026 +#endif |
| 1027 + |
| 1028 t0 = U8TO32_LE(key+0); |
| 1029 t1 = U8TO32_LE(key+4); |
| 1030 t2 = U8TO32_LE(key+8); |
| 1031 @@ -241,6 +260,14 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const u
nsigned char *in, |
| 1032 unsigned int i; |
| 1033 struct poly1305_state_st *state = (struct poly1305_state_st*) statep; |
| 1034 |
| 1035 +#if __arm__ |
| 1036 + if (CRYPTO_is_NEON_capable()) |
| 1037 + { |
| 1038 + CRYPTO_poly1305_update_neon(statep, in, in_len); |
| 1039 + return; |
| 1040 + } |
| 1041 +#endif |
| 1042 + |
| 1043 if (state->buf_used) |
| 1044 { |
| 1045 unsigned int todo = 16 - state->buf_used; |
| 1046 @@ -282,6 +309,14 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, unsigne
d char mac[16]) |
| 1047 uint32_t g0,g1,g2,g3,g4; |
| 1048 uint32_t b, nb; |
| 1049 |
| 1050 +#if __arm__ |
| 1051 + if (CRYPTO_is_NEON_capable()) |
| 1052 + { |
| 1053 + CRYPTO_poly1305_finish_neon(statep, mac); |
| 1054 + return; |
| 1055 + } |
| 1056 +#endif |
| 1057 + |
| 1058 if (state->buf_used) |
| 1059 poly1305_update(state, state->buf, state->buf_used); |
| 1060 |
| 1061 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c |
| 1062 index adcef35..34e339d 100644 |
| 1063 --- a/crypto/poly1305/poly1305_arm.c |
| 1064 +++ b/crypto/poly1305/poly1305_arm.c |
| 1065 @@ -51,6 +51,7 @@ |
| 1066 * SUPERCOP by D. J. Bernstein and Peter Schwabe. */ |
| 1067 |
| 1068 #include <stdint.h> |
| 1069 +#include <string.h> |
| 1070 |
| 1071 #include <openssl/poly1305.h> |
| 1072 |
| 1073 @@ -202,7 +203,8 @@ struct poly1305_state_st { |
| 1074 unsigned char key[16]; |
| 1075 }; |
| 1076 |
| 1077 -void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32]) |
| 1078 +void CRYPTO_poly1305_init_neon(poly1305_state *state, |
| 1079 + const unsigned char key[32]) |
| 1080 { |
| 1081 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); |
| 1082 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); |
| 1083 @@ -227,7 +229,8 @@ void CRYPTO_poly1305_init(poly1305_state *state, const unsig
ned char key[32]) |
| 1084 st->buf_used = 0; |
| 1085 } |
| 1086 |
| 1087 -void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, siz
e_t in_len) |
| 1088 +void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in
, |
| 1089 + size_t in_len) |
| 1090 { |
| 1091 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); |
| 1092 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); |
| 1093 @@ -285,7 +288,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uns
igned char *in, size |
| 1094 } |
| 1095 } |
| 1096 |
| 1097 -void CRYPTO_poly1305_finish(poly1305_state* state, unsigned char mac[16]) |
| 1098 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]) |
| 1099 { |
| 1100 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); |
| 1101 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); |
| 1102 -- |
| 1103 1.8.4.1 |
| 1104 |
OLD | NEW |