| OLD | NEW |
| (Empty) |
| 1 From aea47606333cfd3e7a09cab3e42e488c79a416af Mon Sep 17 00:00:00 2001 | |
| 2 From: Adam Langley <agl@chromium.org> | |
| 3 Date: Tue, 5 Nov 2013 13:10:11 -0500 | |
| 4 Subject: [PATCH 52/52] Optional NEON support on ARM. | |
| 5 | |
| 6 This patch causes ARM to build both the NEON and generic versions of | |
| 7 ChaCha20 and Poly1305. The NEON code can be enabled at run-time by | |
| 8 calling CRYPTO_set_NEON_capable(1). | |
| 9 --- | |
| 10 .gitignore | 1 + | |
| 11 Configure | 2 +- | |
| 12 apps/speed.c | 5 + | |
| 13 crypto/chacha/chacha_enc.c | 18 + | |
| 14 crypto/chacha/chacha_vec.c | 7 + | |
| 15 crypto/chacha/chacha_vec_arm.S | 846 +++++++++++++++++++++++++++++++++++++++++ | |
| 16 crypto/cryptlib.c | 14 + | |
| 17 crypto/crypto.h | 8 + | |
| 18 crypto/poly1305/poly1305.c | 35 ++ | |
| 19 crypto/poly1305/poly1305_arm.c | 9 +- | |
| 20 10 files changed, 941 insertions(+), 4 deletions(-) | |
| 21 create mode 100644 crypto/chacha/chacha_vec_arm.S | |
| 22 | |
| 23 diff --git a/Configure b/Configure | |
| 24 index 1b95384..18b7af0 100755 | |
| 25 --- a/Configure | |
| 26 +++ b/Configure | |
| 27 @@ -136,7 +136,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-a
lpha.o:::::::ghash-a | |
| 28 my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::
:::::::"; | |
| 29 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha2
56-mips.o sha512-mips.o::::::::::"; | |
| 30 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::ae
s-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-
s390x.o:::::::ghash-s390x.o:"; | |
| 31 -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cb
c.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-a
rmv4.o::chacha_vec.o:poly1305_arm.o poly1305_arm_asm.o:void"; | |
| 32 +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cb
c.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-a
rmv4.o::chacha_vec_arm.o chacha_enc.o:poly1305.o poly1305_arm.o poly1305_arm_asm
.o:void"; | |
| 33 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-p
arisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-
parisc.o::::32"; | |
| 34 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o ae
s-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::gha
sh-parisc.o::::64"; | |
| 35 my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o
aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::::"; | |
| 36 diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c | |
| 37 index 54d1ca3..e4b648f 100644 | |
| 38 --- a/crypto/chacha/chacha_enc.c | |
| 39 +++ b/crypto/chacha/chacha_enc.c | |
| 40 @@ -61,6 +61,7 @@ | |
| 41 | |
| 42 #if !defined(OPENSSL_NO_CHACHA) | |
| 43 | |
| 44 +#include <openssl/crypto.h> | |
| 45 #include <openssl/chacha.h> | |
| 46 | |
| 47 /* sigma contains the ChaCha constants, which happen to be an ASCII string. */ | |
| 48 @@ -87,6 +88,15 @@ static const char sigma[16] = "expand 32-byte k"; | |
| 49 | |
| 50 typedef unsigned int uint32_t; | |
| 51 | |
| 52 +#if __arm__ | |
| 53 +/* Defined in chacha_vec.c (pre-compiled for ARM as chacha_vec_arm.S) */ | |
| 54 +void CRYPTO_chacha_20_neon(unsigned char *out, | |
| 55 + const unsigned char *in, size_t in_len, | |
| 56 + const unsigned char key[32], | |
| 57 + const unsigned char nonce[8], | |
| 58 + size_t counter); | |
| 59 +#endif | |
| 60 + | |
| 61 /* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in | |
| 62 * |input| and writes the 64 output bytes to |output|. */ | |
| 63 static void chacha_core(unsigned char output[64], const uint32_t input[16], | |
| 64 @@ -124,6 +134,16 @@ void CRYPTO_chacha_20(unsigned char *out, | |
| 65 unsigned char buf[64]; | |
| 66 size_t todo, i; | |
| 67 | |
| 68 +#if __arm__ | |
| 69 + if (CRYPTO_is_NEON_capable() && | |
| 70 + ((intptr_t)in & 15) == 0 && | |
| 71 + ((intptr_t)out & 15) == 0) | |
| 72 + { | |
| 73 + CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter); | |
| 74 + return; | |
| 75 + } | |
| 76 +#endif | |
| 77 + | |
| 78 input[0] = U8TO32_LITTLE(sigma + 0); | |
| 79 input[1] = U8TO32_LITTLE(sigma + 4); | |
| 80 input[2] = U8TO32_LITTLE(sigma + 8); | |
| 81 diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c | |
| 82 index 33b2238..1226c39 100644 | |
| 83 --- a/crypto/chacha/chacha_vec.c | |
| 84 +++ b/crypto/chacha/chacha_vec.c | |
| 85 @@ -154,7 +154,14 @@ typedef unsigned vec __attribute__ ((vector_size (16))); | |
| 86 STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \ | |
| 87 STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3)); | |
| 88 | |
| 89 +#if __ARM_NEON__ | |
| 90 +/* For ARM, we can't depend on NEON support, so this function is compiled with | |
| 91 + * a different name, along with the generic code, and can be enabled at | |
| 92 + * run-time. */ | |
| 93 +void CRYPTO_chacha_20_neon( | |
| 94 +#else | |
| 95 void CRYPTO_chacha_20( | |
| 96 +#endif | |
| 97 unsigned char *out, | |
| 98 const unsigned char *in, | |
| 99 size_t inlen, | |
| 100 diff --git a/crypto/chacha/chacha_vec_arm.S b/crypto/chacha/chacha_vec_arm.S | |
| 101 new file mode 100644 | |
| 102 index 0000000..24a5050 | |
| 103 --- /dev/null | |
| 104 +++ b/crypto/chacha/chacha_vec_arm.S | |
| 105 @@ -0,0 +1,863 @@ | |
| 106 +# This file contains a pre-compiled version of chacha_vec.c for ARM. This is | |
| 107 +# needed to support switching on NEON code at runtime. If the whole of OpenSSL | |
| 108 +# were to be compiled with the needed flags to build chacha_vec.c, then it | |
| 109 +# wouldn't be possible to run on non-NEON systems. | |
| 110 +# | |
| 111 +# This file was generated by: | |
| 112 +# | |
| 113 +# /opt/gcc-linaro-arm-linux-gnueabihf-4.7-2012.10-20121022_linux/bin/arm-li
nux-gnueabihf-gcc -O3 -mcpu=cortex-a8 -mfpu=neon -S chacha_vec.c -I ../../includ
e -fpic -o chacha_vec_arm.S | |
| 114 +# | |
| 115 +# And then EABI attribute 28 was set to zero to allow linking with soft-float | |
| 116 +# code. | |
| 117 + | |
| 118 + .syntax unified | |
| 119 + .cpu cortex-a8 | |
| 120 + .eabi_attribute 27, 3 | |
| 121 + .eabi_attribute 28, 0 | |
| 122 + .fpu neon | |
| 123 + .eabi_attribute 20, 1 | |
| 124 + .eabi_attribute 21, 1 | |
| 125 + .eabi_attribute 23, 3 | |
| 126 + .eabi_attribute 24, 1 | |
| 127 + .eabi_attribute 25, 1 | |
| 128 + .eabi_attribute 26, 2 | |
| 129 + .eabi_attribute 30, 2 | |
| 130 + .eabi_attribute 34, 1 | |
| 131 + .eabi_attribute 18, 4 | |
| 132 + .thumb | |
| 133 + .file "chacha_vec.c" | |
| 134 + .text | |
| 135 + .align 2 | |
| 136 + .global CRYPTO_chacha_20_neon | |
| 137 + .thumb | |
| 138 + .thumb_func | |
| 139 + .type CRYPTO_chacha_20_neon, %function | |
| 140 +CRYPTO_chacha_20_neon: | |
| 141 + @ args = 8, pretend = 0, frame = 304 | |
| 142 + @ frame_needed = 1, uses_anonymous_args = 0 | |
| 143 + @ link register save eliminated. | |
| 144 + push {r4, r5, r6, r7, r8, r9, sl, fp} | |
| 145 + fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} | |
| 146 + sub sp, sp, #304 | |
| 147 + add r7, sp, #0 | |
| 148 + movw ip, #43691 | |
| 149 + movt ip, 43690 | |
| 150 + str r2, [r7, #196] | |
| 151 + sub sp, sp, #96 | |
| 152 + ldr r4, [r7, #196] | |
| 153 + ldr r6, [r7, #400] | |
| 154 + ldr r2, .L38+16 | |
| 155 + umull r4, ip, ip, r4 | |
| 156 + ldr r6, [r6, #0] | |
| 157 + ldr r8, [r7, #400] | |
| 158 +.LPIC24: | |
| 159 + add r2, pc | |
| 160 + add r4, sp, #15 | |
| 161 + str r3, [r7, #244] | |
| 162 + str r6, [r7, #176] | |
| 163 + bic r4, r4, #15 | |
| 164 + str r0, [r7, #188] | |
| 165 + str r4, [r7, #200] | |
| 166 + lsrs ip, ip, #7 | |
| 167 + str r1, [r7, #184] | |
| 168 + ldmia r2, {r0, r1, r2, r3} | |
| 169 + ldr r4, [r8, #4] | |
| 170 + ldr r5, [r7, #244] | |
| 171 + vld1.64 {d24-d25}, [r5:64] | |
| 172 + vldr d26, [r5, #16] | |
| 173 + vldr d27, [r5, #24] | |
| 174 + ldr r9, [r7, #200] | |
| 175 + ldr r8, [r7, #404] | |
| 176 + ldr r5, [r7, #176] | |
| 177 + add r6, r9, #64 | |
| 178 + str r4, [r7, #300] | |
| 179 + mov r4, #0 | |
| 180 + str r8, [r7, #288] | |
| 181 + str r5, [r7, #296] | |
| 182 + str r4, [r7, #292] | |
| 183 + stmia r6, {r0, r1, r2, r3} | |
| 184 + vldr d22, [r9, #64] | |
| 185 + vldr d23, [r9, #72] | |
| 186 + vldr d20, [r7, #288] | |
| 187 + vldr d21, [r7, #296] | |
| 188 + str ip, [r7, #192] | |
| 189 + beq .L20 | |
| 190 + lsl r6, ip, #1 | |
| 191 + ldr r1, [r9, #68] | |
| 192 + add r3, r6, ip | |
| 193 + str r6, [r7, #180] | |
| 194 + ldr r2, [r9, #72] | |
| 195 + add r8, r8, #2 | |
| 196 + ldr r5, [r9, #76] | |
| 197 + vldr d18, .L38 | |
| 198 + vldr d19, .L38+8 | |
| 199 + str r4, [r7, #240] | |
| 200 + ldr r6, [r7, #184] | |
| 201 + ldr r4, [r7, #188] | |
| 202 + str r0, [r7, #224] | |
| 203 + str r1, [r7, #220] | |
| 204 + str r8, [r7, #208] | |
| 205 + str r2, [r7, #216] | |
| 206 + str r3, [r7, #204] | |
| 207 + str r5, [r7, #212] | |
| 208 + str r6, [r7, #252] | |
| 209 + str r4, [r7, #248] | |
| 210 +.L4: | |
| 211 + ldr r2, [r7, #244] | |
| 212 + add r9, r7, #216 | |
| 213 + ldr r3, [r7, #244] | |
| 214 + vadd.i32 q8, q10, q9 | |
| 215 + ldr r6, [r7, #208] | |
| 216 + vmov q15, q13 @ v4si | |
| 217 + ldr r5, [r7, #240] | |
| 218 + vmov q3, q12 @ v4si | |
| 219 + ldr r4, [r7, #244] | |
| 220 + vmov q2, q11 @ v4si | |
| 221 + adds r5, r5, r6 | |
| 222 + ldr r2, [r2, #8] | |
| 223 + ldr r6, [r7, #400] | |
| 224 + vmov q5, q10 @ v4si | |
| 225 + ldr r3, [r3, #12] | |
| 226 + vmov q1, q13 @ v4si | |
| 227 + ldr r0, [r7, #244] | |
| 228 + vmov q0, q12 @ v4si | |
| 229 + ldr r1, [r7, #244] | |
| 230 + vmov q4, q11 @ v4si | |
| 231 + ldmia r9, {r9, sl, fp} | |
| 232 + str r5, [r7, #228] | |
| 233 + ldr r5, [r4, #24] | |
| 234 + ldr r0, [r0, #0] | |
| 235 + ldr r1, [r1, #4] | |
| 236 + str r2, [r7, #264] | |
| 237 + str r3, [r7, #236] | |
| 238 + ldr r2, [r6, #4] | |
| 239 + ldr r3, [r4, #28] | |
| 240 + str r5, [r7, #280] | |
| 241 + ldr r5, [r6, #0] | |
| 242 + movs r6, #0 | |
| 243 + ldr ip, [r7, #228] | |
| 244 + ldr r8, [r7, #212] | |
| 245 + str r0, [r7, #232] | |
| 246 + str r1, [r7, #268] | |
| 247 + ldr r0, [r4, #16] | |
| 248 + ldr r1, [r4, #20] | |
| 249 + movs r4, #10 | |
| 250 + str r2, [r7, #24] | |
| 251 + str r3, [r7, #284] | |
| 252 + str r4, [r7, #256] | |
| 253 + ldr r2, [r7, #264] | |
| 254 + str r9, [r7, #276] | |
| 255 + mov r9, r6 | |
| 256 + ldr r6, [r7, #280] | |
| 257 + str r8, [r7, #260] | |
| 258 + mov r8, sl | |
| 259 + str r1, [r7, #272] | |
| 260 + mov sl, ip | |
| 261 + str r6, [r7, #264] | |
| 262 + mov r6, r5 | |
| 263 + ldr r3, [r7, #236] | |
| 264 + mov r5, r0 | |
| 265 + ldr ip, [r7, #24] | |
| 266 + ldr r1, [r7, #268] | |
| 267 + ldr r0, [r7, #232] | |
| 268 + b .L39 | |
| 269 +.L40: | |
| 270 + .align 3 | |
| 271 +.L38: | |
| 272 + .word 1 | |
| 273 + .word 0 | |
| 274 + .word 0 | |
| 275 + .word 0 | |
| 276 + .word .LANCHOR0-(.LPIC24+4) | |
| 277 +.L39: | |
| 278 +.L3: | |
| 279 + vadd.i32 q4, q4, q0 | |
| 280 + add r8, r8, r1 | |
| 281 + vadd.i32 q2, q2, q3 | |
| 282 + str r8, [r7, #268] | |
| 283 + veor q5, q5, q4 | |
| 284 + ldr r8, [r7, #276] | |
| 285 + veor q8, q8, q2 | |
| 286 + add fp, fp, r0 | |
| 287 + str fp, [r7, #280] | |
| 288 + add r8, r8, r2 | |
| 289 + vrev32.16 q5, q5 | |
| 290 + str r8, [r7, #276] | |
| 291 + vrev32.16 q8, q8 | |
| 292 + vadd.i32 q1, q1, q5 | |
| 293 + vadd.i32 q15, q15, q8 | |
| 294 + ldr r8, [r7, #280] | |
| 295 + veor q0, q1, q0 | |
| 296 + ldr r4, [r7, #260] | |
| 297 + veor q3, q15, q3 | |
| 298 + eor sl, sl, r8 | |
| 299 + ldr r8, [r7, #276] | |
| 300 + add fp, r4, r3 | |
| 301 + vshl.i32 q7, q0, #12 | |
| 302 + ldr r4, [r7, #268] | |
| 303 + vshl.i32 q6, q3, #12 | |
| 304 + eor r6, r6, r8 | |
| 305 + eor r9, r9, r4 | |
| 306 + ldr r4, [r7, #272] | |
| 307 + vsri.32 q7, q0, #20 | |
| 308 + ror r8, r6, #16 | |
| 309 + ldr r6, [r7, #264] | |
| 310 + eor ip, ip, fp | |
| 311 + vsri.32 q6, q3, #20 | |
| 312 + ror sl, sl, #16 | |
| 313 + ror r9, r9, #16 | |
| 314 + add r5, r5, sl | |
| 315 + vadd.i32 q4, q4, q7 | |
| 316 + str r5, [r7, #236] | |
| 317 + vadd.i32 q2, q2, q6 | |
| 318 + add r5, r4, r9 | |
| 319 + add r4, r6, r8 | |
| 320 + ldr r6, [r7, #284] | |
| 321 + ror ip, ip, #16 | |
| 322 + veor q5, q4, q5 | |
| 323 + veor q8, q2, q8 | |
| 324 + add r6, r6, ip | |
| 325 + str r6, [r7, #264] | |
| 326 + eors r1, r1, r5 | |
| 327 + ldr r6, [r7, #236] | |
| 328 + vshl.i32 q3, q5, #8 | |
| 329 + vshl.i32 q14, q8, #8 | |
| 330 + eors r2, r2, r4 | |
| 331 + eors r0, r0, r6 | |
| 332 + ldr r6, [r7, #264] | |
| 333 + vsri.32 q3, q5, #24 | |
| 334 + ror r1, r1, #20 | |
| 335 + eors r3, r3, r6 | |
| 336 + ldr r6, [r7, #280] | |
| 337 + ror r0, r0, #20 | |
| 338 + vsri.32 q14, q8, #24 | |
| 339 + adds r6, r0, r6 | |
| 340 + str r6, [r7, #284] | |
| 341 + ldr r6, [r7, #268] | |
| 342 + vadd.i32 q1, q1, q3 | |
| 343 + vadd.i32 q15, q15, q14 | |
| 344 + ror r2, r2, #20 | |
| 345 + adds r6, r1, r6 | |
| 346 + str r6, [r7, #260] | |
| 347 + ldr r6, [r7, #276] | |
| 348 + veor q6, q15, q6 | |
| 349 + veor q7, q1, q7 | |
| 350 + ror r3, r3, #20 | |
| 351 + adds r6, r2, r6 | |
| 352 + str r6, [r7, #280] | |
| 353 + ldr r6, [r7, #284] | |
| 354 + vshl.i32 q0, q6, #7 | |
| 355 + vshl.i32 q5, q7, #7 | |
| 356 + add fp, r3, fp | |
| 357 + eor sl, r6, sl | |
| 358 + ldr r6, [r7, #260] | |
| 359 + eor ip, fp, ip | |
| 360 + vsri.32 q0, q6, #25 | |
| 361 + eor r9, r6, r9 | |
| 362 + ldr r6, [r7, #280] | |
| 363 + ror sl, sl, #24 | |
| 364 + vsri.32 q5, q7, #25 | |
| 365 + eor r8, r6, r8 | |
| 366 + ldr r6, [r7, #236] | |
| 367 + ror r9, r9, #24 | |
| 368 + ror ip, ip, #24 | |
| 369 + add r6, sl, r6 | |
| 370 + str r6, [r7, #276] | |
| 371 + ldr r6, [r7, #264] | |
| 372 + add r5, r9, r5 | |
| 373 + str r5, [r7, #272] | |
| 374 + vext.32 q5, q5, q5, #1 | |
| 375 + add r5, ip, r6 | |
| 376 + ldr r6, [r7, #276] | |
| 377 + vext.32 q0, q0, q0, #1 | |
| 378 + vadd.i32 q4, q4, q5 | |
| 379 + eors r0, r0, r6 | |
| 380 + ldr r6, [r7, #272] | |
| 381 + vadd.i32 q2, q2, q0 | |
| 382 + vext.32 q3, q3, q3, #3 | |
| 383 + ror r8, r8, #24 | |
| 384 + eors r1, r1, r6 | |
| 385 + vext.32 q14, q14, q14, #3 | |
| 386 + add r4, r8, r4 | |
| 387 + ldr r6, [r7, #284] | |
| 388 + veor q3, q4, q3 | |
| 389 + veor q14, q2, q14 | |
| 390 + eors r2, r2, r4 | |
| 391 + ror r1, r1, #25 | |
| 392 + vext.32 q1, q1, q1, #2 | |
| 393 + adds r6, r1, r6 | |
| 394 + str r6, [r7, #284] | |
| 395 + vext.32 q15, q15, q15, #2 | |
| 396 + ldr r6, [r7, #260] | |
| 397 + eors r3, r3, r5 | |
| 398 + ror r2, r2, #25 | |
| 399 + vrev32.16 q8, q14 | |
| 400 + adds r6, r2, r6 | |
| 401 + vrev32.16 q3, q3 | |
| 402 + str r6, [r7, #268] | |
| 403 + vadd.i32 q1, q1, q3 | |
| 404 + ldr r6, [r7, #280] | |
| 405 + vadd.i32 q15, q15, q8 | |
| 406 + ror r3, r3, #25 | |
| 407 + veor q5, q1, q5 | |
| 408 + adds r6, r3, r6 | |
| 409 + veor q0, q15, q0 | |
| 410 + str r6, [r7, #264] | |
| 411 + ldr r6, [r7, #268] | |
| 412 + ror r0, r0, #25 | |
| 413 + add fp, r0, fp | |
| 414 + vshl.i32 q6, q5, #12 | |
| 415 + eor sl, r6, sl | |
| 416 + ldr r6, [r7, #284] | |
| 417 + vshl.i32 q14, q0, #12 | |
| 418 + eor r8, fp, r8 | |
| 419 + eor ip, r6, ip | |
| 420 + ldr r6, [r7, #264] | |
| 421 + vsri.32 q6, q5, #20 | |
| 422 + ror sl, sl, #16 | |
| 423 + eor r9, r6, r9 | |
| 424 + ror r6, r8, #16 | |
| 425 + vsri.32 q14, q0, #20 | |
| 426 + ldr r8, [r7, #272] | |
| 427 + ror ip, ip, #16 | |
| 428 + add r5, sl, r5 | |
| 429 + add r8, r6, r8 | |
| 430 + add r4, ip, r4 | |
| 431 + str r4, [r7, #236] | |
| 432 + eor r0, r8, r0 | |
| 433 + str r5, [r7, #280] | |
| 434 + vadd.i32 q4, q4, q6 | |
| 435 + ldr r5, [r7, #236] | |
| 436 + vadd.i32 q2, q2, q14 | |
| 437 + ldr r4, [r7, #276] | |
| 438 + ror r0, r0, #20 | |
| 439 + veor q3, q4, q3 | |
| 440 + eors r1, r1, r5 | |
| 441 + veor q0, q2, q8 | |
| 442 + str r8, [r7, #272] | |
| 443 + str r0, [r7, #24] | |
| 444 + add fp, r0, fp | |
| 445 + ldr r8, [r7, #280] | |
| 446 + ror r9, r9, #16 | |
| 447 + ldr r0, [r7, #284] | |
| 448 + add r4, r9, r4 | |
| 449 + str fp, [r7, #260] | |
| 450 + ror r1, r1, #20 | |
| 451 + add fp, r1, r0 | |
| 452 + eor r2, r8, r2 | |
| 453 + ldr r0, [r7, #260] | |
| 454 + eors r3, r3, r4 | |
| 455 + vshl.i32 q5, q3, #8 | |
| 456 + str r4, [r7, #232] | |
| 457 + vshl.i32 q8, q0, #8 | |
| 458 + ldr r4, [r7, #268] | |
| 459 + ldr r5, [r7, #264] | |
| 460 + ror r2, r2, #20 | |
| 461 + ror r3, r3, #20 | |
| 462 + eors r6, r6, r0 | |
| 463 + adds r5, r3, r5 | |
| 464 + add r8, r2, r4 | |
| 465 + vsri.32 q5, q3, #24 | |
| 466 + ldr r4, [r7, #272] | |
| 467 + eor r9, r5, r9 | |
| 468 + eor ip, fp, ip | |
| 469 + vsri.32 q8, q0, #24 | |
| 470 + eor sl, r8, sl | |
| 471 + ror r6, r6, #24 | |
| 472 + ldr r0, [r7, #280] | |
| 473 + str r5, [r7, #276] | |
| 474 + adds r4, r6, r4 | |
| 475 + ldr r5, [r7, #236] | |
| 476 + vadd.i32 q1, q1, q5 | |
| 477 + str r4, [r7, #272] | |
| 478 + vadd.i32 q15, q15, q8 | |
| 479 + ldr r4, [r7, #232] | |
| 480 + ror ip, ip, #24 | |
| 481 + ror sl, sl, #24 | |
| 482 + ror r9, r9, #24 | |
| 483 + add r5, ip, r5 | |
| 484 + add r0, sl, r0 | |
| 485 + str r5, [r7, #264] | |
| 486 + add r5, r9, r4 | |
| 487 + str r0, [r7, #284] | |
| 488 + veor q6, q1, q6 | |
| 489 + ldr r4, [r7, #24] | |
| 490 + veor q14, q15, q14 | |
| 491 + ldr r0, [r7, #272] | |
| 492 + eors r3, r3, r5 | |
| 493 + vshl.i32 q0, q6, #7 | |
| 494 + vext.32 q1, q1, q1, #2 | |
| 495 + eors r0, r0, r4 | |
| 496 + ldr r4, [r7, #284] | |
| 497 + str r0, [r7, #280] | |
| 498 + vshl.i32 q3, q14, #7 | |
| 499 + eors r2, r2, r4 | |
| 500 + ldr r4, [r7, #280] | |
| 501 + ldr r0, [r7, #264] | |
| 502 + vsri.32 q0, q6, #25 | |
| 503 + ror r2, r2, #25 | |
| 504 + ror r3, r3, #25 | |
| 505 + eors r1, r1, r0 | |
| 506 + vsri.32 q3, q14, #25 | |
| 507 + ror r0, r4, #25 | |
| 508 + ldr r4, [r7, #256] | |
| 509 + ror r1, r1, #25 | |
| 510 + vext.32 q5, q5, q5, #1 | |
| 511 + subs r4, r4, #1 | |
| 512 + str r4, [r7, #256] | |
| 513 + vext.32 q15, q15, q15, #2 | |
| 514 + vext.32 q8, q8, q8, #1 | |
| 515 + vext.32 q0, q0, q0, #3 | |
| 516 + vext.32 q3, q3, q3, #3 | |
| 517 + bne .L3 | |
| 518 + ldr r4, [r7, #264] | |
| 519 + vadd.i32 q14, q10, q9 | |
| 520 + str r2, [r7, #264] | |
| 521 + vadd.i32 q10, q10, q5 | |
| 522 + ldr r2, [r7, #252] | |
| 523 + vld1.64 {d12-d13}, [r2:64] | |
| 524 + ldr r2, [r7, #220] | |
| 525 + vadd.i32 q4, q11, q4 | |
| 526 + str ip, [r7, #24] | |
| 527 + mov ip, sl | |
| 528 + mov sl, r8 | |
| 529 + ldr r8, [r7, #260] | |
| 530 + add sl, sl, r2 | |
| 531 + ldr r2, [r7, #212] | |
| 532 + str r4, [r7, #280] | |
| 533 + vadd.i32 q0, q12, q0 | |
| 534 + ldr r4, [r7, #224] | |
| 535 + add r8, r8, r2 | |
| 536 + ldr r2, [r7, #240] | |
| 537 + vadd.i32 q1, q13, q1 | |
| 538 + str r0, [r7, #232] | |
| 539 + add fp, fp, r4 | |
| 540 + mov r0, r5 | |
| 541 + ldr r4, [r7, #216] | |
| 542 + mov r5, r6 | |
| 543 + mov r6, r9 | |
| 544 + ldr r9, [r7, #276] | |
| 545 + adds r2, r2, #3 | |
| 546 + str r2, [r7, #240] | |
| 547 + vadd.i32 q2, q11, q2 | |
| 548 + ldr r2, [r7, #252] | |
| 549 + add r9, r9, r4 | |
| 550 + vadd.i32 q3, q12, q3 | |
| 551 + ldr r4, [r7, #228] | |
| 552 + vadd.i32 q15, q13, q15 | |
| 553 + str r1, [r7, #268] | |
| 554 + vadd.i32 q8, q14, q8 | |
| 555 + str r3, [r7, #236] | |
| 556 + veor q4, q4, q6 | |
| 557 + ldr r3, [r7, #284] | |
| 558 + ldr r1, [r7, #272] | |
| 559 + add ip, r4, ip | |
| 560 + ldr r4, [r7, #248] | |
| 561 + vst1.64 {d8-d9}, [r4:64] | |
| 562 + vldr d8, [r2, #16] | |
| 563 + vldr d9, [r2, #24] | |
| 564 + veor q0, q0, q4 | |
| 565 + vstr d0, [r4, #16] | |
| 566 + vstr d1, [r4, #24] | |
| 567 + vldr d0, [r2, #32] | |
| 568 + vldr d1, [r2, #40] | |
| 569 + veor q1, q1, q0 | |
| 570 + vstr d2, [r4, #32] | |
| 571 + vstr d3, [r4, #40] | |
| 572 + vldr d2, [r2, #48] | |
| 573 + vldr d3, [r2, #56] | |
| 574 + veor q10, q10, q1 | |
| 575 + vstr d20, [r4, #48] | |
| 576 + vstr d21, [r4, #56] | |
| 577 + vldr d8, [r2, #64] | |
| 578 + vldr d9, [r2, #72] | |
| 579 + veor q2, q2, q4 | |
| 580 + vstr d4, [r4, #64] | |
| 581 + vstr d5, [r4, #72] | |
| 582 + vldr d10, [r2, #80] | |
| 583 + vldr d11, [r2, #88] | |
| 584 + veor q3, q3, q5 | |
| 585 + vstr d6, [r4, #80] | |
| 586 + vstr d7, [r4, #88] | |
| 587 + vldr d12, [r2, #96] | |
| 588 + vldr d13, [r2, #104] | |
| 589 + veor q15, q15, q6 | |
| 590 + vstr d30, [r4, #96] | |
| 591 + vstr d31, [r4, #104] | |
| 592 + vldr d20, [r2, #112] | |
| 593 + vldr d21, [r2, #120] | |
| 594 + veor q8, q8, q10 | |
| 595 + vstr d16, [r4, #112] | |
| 596 + vstr d17, [r4, #120] | |
| 597 + ldr r4, [r2, #128] | |
| 598 + ldr r2, [r7, #248] | |
| 599 + vadd.i32 q10, q14, q9 | |
| 600 + eor r4, fp, r4 | |
| 601 + vadd.i32 q10, q10, q9 | |
| 602 + str r4, [r2, #128] | |
| 603 + ldr r4, [r7, #252] | |
| 604 + ldr r2, [r4, #132] | |
| 605 + eor r2, sl, r2 | |
| 606 + ldr sl, [r7, #248] | |
| 607 + str r2, [sl, #132] | |
| 608 + ldr r2, [r4, #136] | |
| 609 + eor r2, r9, r2 | |
| 610 + str r2, [sl, #136] | |
| 611 + ldr r2, [r4, #140] | |
| 612 + eor r2, r8, r2 | |
| 613 + str r2, [sl, #140] | |
| 614 + ldr r2, [r7, #244] | |
| 615 + ldr r4, [r4, #144] | |
| 616 + ldr r2, [r2, #0] | |
| 617 + str r4, [r7, #44] | |
| 618 + ldr r4, [r7, #232] | |
| 619 + add r8, r4, r2 | |
| 620 + ldr r2, [r7, #44] | |
| 621 + ldr r4, [r7, #244] | |
| 622 + eor r8, r8, r2 | |
| 623 + ldr r2, [r7, #252] | |
| 624 + str r8, [sl, #144] | |
| 625 + ldr r4, [r4, #4] | |
| 626 + ldr r2, [r2, #148] | |
| 627 + str r2, [r7, #40] | |
| 628 + ldr r2, [r7, #268] | |
| 629 + add r8, r2, r4 | |
| 630 + ldr r4, [r7, #40] | |
| 631 + ldr r2, [r7, #244] | |
| 632 + eor r8, r8, r4 | |
| 633 + ldr r4, [r7, #252] | |
| 634 + str r8, [sl, #148] | |
| 635 + ldr r2, [r2, #8] | |
| 636 + ldr r4, [r4, #152] | |
| 637 + str r4, [r7, #36] | |
| 638 + ldr r4, [r7, #264] | |
| 639 + add r8, r4, r2 | |
| 640 + ldr r2, [r7, #36] | |
| 641 + eor r8, r8, r2 | |
| 642 + str r8, [sl, #152] | |
| 643 + ldr r2, [r7, #252] | |
| 644 + ldr r4, [r7, #244] | |
| 645 + ldr r2, [r2, #156] | |
| 646 + ldr r4, [r4, #12] | |
| 647 + str r2, [r7, #32] | |
| 648 + ldr r2, [r7, #236] | |
| 649 + add r8, r2, r4 | |
| 650 + ldr r4, [r7, #32] | |
| 651 + ldr r2, [r7, #252] | |
| 652 + eor r8, r8, r4 | |
| 653 + str r8, [sl, #156] | |
| 654 + ldr r8, [r7, #244] | |
| 655 + ldr r2, [r2, #160] | |
| 656 + ldr r4, [r8, #16] | |
| 657 + adds r0, r0, r4 | |
| 658 + ldr r4, [r7, #252] | |
| 659 + eors r0, r0, r2 | |
| 660 + str r0, [sl, #160] | |
| 661 + ldr r0, [r8, #20] | |
| 662 + ldr r2, [r4, #164] | |
| 663 + adds r1, r1, r0 | |
| 664 + ldr r0, [r7, #280] | |
| 665 + eors r1, r1, r2 | |
| 666 + str r1, [sl, #164] | |
| 667 + ldr r2, [r8, #24] | |
| 668 + ldr r1, [r4, #168] | |
| 669 + adds r2, r0, r2 | |
| 670 + eors r2, r2, r1 | |
| 671 + str r2, [sl, #168] | |
| 672 + ldr r1, [r8, #28] | |
| 673 + ldr r2, [r4, #172] | |
| 674 + adds r3, r3, r1 | |
| 675 + eors r3, r3, r2 | |
| 676 + str r3, [sl, #172] | |
| 677 + ldr r3, [r4, #176] | |
| 678 + eor r3, ip, r3 | |
| 679 + str r3, [sl, #176] | |
| 680 + ldr r3, [r4, #180] | |
| 681 + ldr r4, [r7, #400] | |
| 682 + eors r6, r6, r3 | |
| 683 + str r6, [sl, #180] | |
| 684 + ldr r6, [r7, #252] | |
| 685 + ldr r2, [r4, #0] | |
| 686 + ldr r3, [r6, #184] | |
| 687 + adds r5, r5, r2 | |
| 688 + eors r5, r5, r3 | |
| 689 + str r5, [sl, #184] | |
| 690 + ldr r2, [r6, #188] | |
| 691 + adds r6, r6, #192 | |
| 692 + ldr r3, [r4, #4] | |
| 693 + str r6, [r7, #252] | |
| 694 + ldr r0, [r7, #24] | |
| 695 + ldr r1, [r7, #240] | |
| 696 + adds r4, r0, r3 | |
| 697 + eors r4, r4, r2 | |
| 698 + ldr r2, [r7, #204] | |
| 699 + str r4, [sl, #188] | |
| 700 + add sl, sl, #192 | |
| 701 + cmp r1, r2 | |
| 702 + str sl, [r7, #248] | |
| 703 + bne .L4 | |
| 704 + ldr r4, [r7, #192] | |
| 705 + ldr r3, [r7, #180] | |
| 706 + ldr r6, [r7, #188] | |
| 707 + adds r5, r3, r4 | |
| 708 + ldr r8, [r7, #184] | |
| 709 + lsls r5, r5, #6 | |
| 710 + adds r4, r6, r5 | |
| 711 + add r5, r8, r5 | |
| 712 +.L2: | |
| 713 + ldr r9, [r7, #196] | |
| 714 + movw r3, #43691 | |
| 715 + movt r3, 43690 | |
| 716 + ldr sl, [r7, #196] | |
| 717 + umull r9, r3, r3, r9 | |
| 718 + lsrs r3, r3, #7 | |
| 719 + add r3, r3, r3, lsl #1 | |
| 720 + sub r3, sl, r3, lsl #6 | |
| 721 + lsrs r6, r3, #6 | |
| 722 + beq .L5 | |
| 723 + add r1, r5, #16 | |
| 724 + add r2, r4, #16 | |
| 725 + mov r0, r6 | |
| 726 + vldr d30, .L41 | |
| 727 + vldr d31, .L41+8 | |
| 728 +.L6: | |
| 729 + vmov q8, q10 @ v4si | |
| 730 + movs r3, #10 | |
| 731 + vmov q1, q13 @ v4si | |
| 732 + vmov q14, q12 @ v4si | |
| 733 + vmov q3, q11 @ v4si | |
| 734 +.L7: | |
| 735 + vadd.i32 q3, q3, q14 | |
| 736 + subs r3, r3, #1 | |
| 737 + veor q2, q8, q3 | |
| 738 + vrev32.16 q2, q2 | |
| 739 + vadd.i32 q8, q1, q2 | |
| 740 + veor q9, q8, q14 | |
| 741 + vshl.i32 q14, q9, #12 | |
| 742 + vsri.32 q14, q9, #20 | |
| 743 + vadd.i32 q3, q3, q14 | |
| 744 + veor q2, q3, q2 | |
| 745 + vshl.i32 q9, q2, #8 | |
| 746 + vsri.32 q9, q2, #24 | |
| 747 + vadd.i32 q8, q8, q9 | |
| 748 + vext.32 q9, q9, q9, #3 | |
| 749 + veor q14, q8, q14 | |
| 750 + vext.32 q1, q8, q8, #2 | |
| 751 + vshl.i32 q8, q14, #7 | |
| 752 + vsri.32 q8, q14, #25 | |
| 753 + vext.32 q8, q8, q8, #1 | |
| 754 + vadd.i32 q3, q3, q8 | |
| 755 + veor q2, q3, q9 | |
| 756 + vrev32.16 q2, q2 | |
| 757 + vadd.i32 q9, q1, q2 | |
| 758 + veor q8, q9, q8 | |
| 759 + vshl.i32 q14, q8, #12 | |
| 760 + vsri.32 q14, q8, #20 | |
| 761 + vadd.i32 q3, q3, q14 | |
| 762 + veor q2, q3, q2 | |
| 763 + vshl.i32 q8, q2, #8 | |
| 764 + vsri.32 q8, q2, #24 | |
| 765 + vadd.i32 q9, q9, q8 | |
| 766 + vext.32 q8, q8, q8, #1 | |
| 767 + veor q14, q9, q14 | |
| 768 + vext.32 q1, q9, q9, #2 | |
| 769 + vshl.i32 q9, q14, #7 | |
| 770 + vsri.32 q9, q14, #25 | |
| 771 + vext.32 q14, q9, q9, #3 | |
| 772 + bne .L7 | |
| 773 + vadd.i32 q8, q10, q8 | |
| 774 + subs r0, r0, #1 | |
| 775 + vadd.i32 q3, q11, q3 | |
| 776 + vldr d0, [r1, #-16] | |
| 777 + vldr d1, [r1, #-8] | |
| 778 + vadd.i32 q14, q12, q14 | |
| 779 + vadd.i32 q1, q13, q1 | |
| 780 + veor q3, q3, q0 | |
| 781 + vstr d6, [r2, #-16] | |
| 782 + vstr d7, [r2, #-8] | |
| 783 + vadd.i32 q10, q10, q15 | |
| 784 + vld1.64 {d8-d9}, [r1:64] | |
| 785 + veor q14, q14, q4 | |
| 786 + vst1.64 {d28-d29}, [r2:64] | |
| 787 + vldr d10, [r1, #16] | |
| 788 + vldr d11, [r1, #24] | |
| 789 + veor q1, q1, q5 | |
| 790 + vstr d2, [r2, #16] | |
| 791 + vstr d3, [r2, #24] | |
| 792 + vldr d18, [r1, #32] | |
| 793 + vldr d19, [r1, #40] | |
| 794 + add r1, r1, #64 | |
| 795 + veor q8, q8, q9 | |
| 796 + vstr d16, [r2, #32] | |
| 797 + vstr d17, [r2, #40] | |
| 798 + add r2, r2, #64 | |
| 799 + bne .L6 | |
| 800 + lsls r6, r6, #6 | |
| 801 + adds r4, r4, r6 | |
| 802 + adds r5, r5, r6 | |
| 803 +.L5: | |
| 804 + ldr r6, [r7, #196] | |
| 805 + ands ip, r6, #63 | |
| 806 + beq .L1 | |
| 807 + vmov q8, q10 @ v4si | |
| 808 + movs r3, #10 | |
| 809 + vmov q14, q13 @ v4si | |
| 810 + vmov q9, q12 @ v4si | |
| 811 + vmov q15, q11 @ v4si | |
| 812 +.L10: | |
| 813 + vadd.i32 q15, q15, q9 | |
| 814 + subs r3, r3, #1 | |
| 815 + veor q8, q8, q15 | |
| 816 + vrev32.16 q8, q8 | |
| 817 + vadd.i32 q3, q14, q8 | |
| 818 + veor q9, q3, q9 | |
| 819 + vshl.i32 q14, q9, #12 | |
| 820 + vsri.32 q14, q9, #20 | |
| 821 + vadd.i32 q15, q15, q14 | |
| 822 + veor q9, q15, q8 | |
| 823 + vshl.i32 q8, q9, #8 | |
| 824 + vsri.32 q8, q9, #24 | |
| 825 + vadd.i32 q9, q3, q8 | |
| 826 + vext.32 q8, q8, q8, #3 | |
| 827 + veor q2, q9, q14 | |
| 828 + vext.32 q14, q9, q9, #2 | |
| 829 + vshl.i32 q9, q2, #7 | |
| 830 + vsri.32 q9, q2, #25 | |
| 831 + vext.32 q9, q9, q9, #1 | |
| 832 + vadd.i32 q15, q15, q9 | |
| 833 + veor q3, q15, q8 | |
| 834 + vrev32.16 q3, q3 | |
| 835 + vadd.i32 q14, q14, q3 | |
| 836 + veor q8, q14, q9 | |
| 837 + vshl.i32 q9, q8, #12 | |
| 838 + vsri.32 q9, q8, #20 | |
| 839 + vadd.i32 q15, q15, q9 | |
| 840 + veor q3, q15, q3 | |
| 841 + vshl.i32 q8, q3, #8 | |
| 842 + vsri.32 q8, q3, #24 | |
| 843 + vadd.i32 q14, q14, q8 | |
| 844 + vext.32 q8, q8, q8, #1 | |
| 845 + veor q3, q14, q9 | |
| 846 + vext.32 q14, q14, q14, #2 | |
| 847 + vshl.i32 q9, q3, #7 | |
| 848 + vsri.32 q9, q3, #25 | |
| 849 + vext.32 q9, q9, q9, #3 | |
| 850 + bne .L10 | |
| 851 + cmp ip, #15 | |
| 852 + vadd.i32 q11, q11, q15 | |
| 853 + bhi .L37 | |
| 854 + ldr r9, [r7, #200] | |
| 855 + vst1.64 {d22-d23}, [r9:128] | |
| 856 +.L14: | |
| 857 + ldr sl, [r7, #196] | |
| 858 + and r3, sl, #48 | |
| 859 + cmp ip, r3 | |
| 860 + bls .L1 | |
| 861 + adds r0, r5, r3 | |
| 862 + adds r1, r4, r3 | |
| 863 + add r2, r0, #16 | |
| 864 + add r6, r1, #16 | |
| 865 + cmp r1, r2 | |
| 866 + it cc | |
| 867 + cmpcc r0, r6 | |
| 868 + rsb r9, r3, ip | |
| 869 + ite cc | |
| 870 + movcc r2, #0 | |
| 871 + movcs r2, #1 | |
| 872 + cmp r9, #15 | |
| 873 + ite ls | |
| 874 + movls r2, #0 | |
| 875 + andhi r2, r2, #1 | |
| 876 + lsr r8, r9, #4 | |
| 877 + eor r2, r2, #1 | |
| 878 + cmp r8, #0 | |
| 879 + it eq | |
| 880 + orreq r2, r2, #1 | |
| 881 + lsl sl, r8, #4 | |
| 882 + cbnz r2, .L35 | |
| 883 + ldr fp, [r7, #200] | |
| 884 + add r6, fp, r3 | |
| 885 +.L17: | |
| 886 + vld1.8 {q8}, [r0]! | |
| 887 + adds r2, r2, #1 | |
| 888 + cmp r8, r2 | |
| 889 + vld1.8 {q9}, [r6]! | |
| 890 + veor q8, q9, q8 | |
| 891 + vst1.8 {q8}, [r1]! | |
| 892 + bhi .L17 | |
| 893 + cmp r9, sl | |
| 894 + add r3, r3, sl | |
| 895 + beq .L1 | |
| 896 +.L35: | |
| 897 + ldr r0, [r7, #200] | |
| 898 +.L25: | |
| 899 + ldrb r2, [r5, r3] @ zero_extendqisi2 | |
| 900 + ldrb r1, [r3, r0] @ zero_extendqisi2 | |
| 901 + eors r2, r2, r1 | |
| 902 + strb r2, [r4, r3] | |
| 903 + adds r3, r3, #1 | |
| 904 + cmp ip, r3 | |
| 905 + bhi .L25 | |
| 906 +.L1: | |
| 907 + add r7, r7, #304 | |
| 908 + mov sp, r7 | |
| 909 + fldmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} | |
| 910 + pop {r4, r5, r6, r7, r8, r9, sl, fp} | |
| 911 + bx lr | |
| 912 +.L37: | |
| 913 + cmp ip, #31 | |
| 914 + vld1.64 {d0-d1}, [r5:64] | |
| 915 + vadd.i32 q9, q12, q9 | |
| 916 + veor q11, q11, q0 | |
| 917 + vst1.64 {d22-d23}, [r4:64] | |
| 918 + bls .L12 | |
| 919 + cmp ip, #47 | |
| 920 + vldr d2, [r5, #16] | |
| 921 + vldr d3, [r5, #24] | |
| 922 + vadd.i32 q13, q13, q14 | |
| 923 + veor q9, q9, q1 | |
| 924 + vstr d18, [r4, #16] | |
| 925 + vstr d19, [r4, #24] | |
| 926 + bls .L13 | |
| 927 + vadd.i32 q8, q8, q10 | |
| 928 + vldr d0, [r5, #32] | |
| 929 + vldr d1, [r5, #40] | |
| 930 + ldr r6, [r7, #200] | |
| 931 + vstr d16, [r6, #48] | |
| 932 + vstr d17, [r6, #56] | |
| 933 + veor q8, q13, q0 | |
| 934 + vstr d16, [r4, #32] | |
| 935 + vstr d17, [r4, #40] | |
| 936 + b .L14 | |
| 937 +.L12: | |
| 938 + ldr r8, [r7, #200] | |
| 939 + vstr d18, [r8, #16] | |
| 940 + vstr d19, [r8, #24] | |
| 941 + b .L14 | |
| 942 +.L20: | |
| 943 + ldr r5, [r7, #184] | |
| 944 + ldr r4, [r7, #188] | |
| 945 + b .L2 | |
| 946 +.L13: | |
| 947 + ldr r6, [r7, #200] | |
| 948 + vstr d26, [r6, #32] | |
| 949 + vstr d27, [r6, #40] | |
| 950 + b .L14 | |
| 951 +.L42: | |
| 952 + .align 3 | |
| 953 +.L41: | |
| 954 + .word 1 | |
| 955 + .word 0 | |
| 956 + .word 0 | |
| 957 + .word 0 | |
| 958 + .size CRYPTO_chacha_20_neon, .-CRYPTO_chacha_20_neon | |
| 959 + .section .rodata | |
| 960 + .align 3 | |
| 961 +.LANCHOR0 = . + 0 | |
| 962 +.LC0: | |
| 963 + .word 1634760805 | |
| 964 + .word 857760878 | |
| 965 + .word 2036477234 | |
| 966 + .word 1797285236 | |
| 967 + .ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2012.10-20121022 - Linaro
GCC 2012.10) 4.7.3 20121001 (prerelease)" | |
| 968 + .section .note.GNU-stack,"",%progbits | |
| 969 diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c | |
| 970 index 7bef015..3b6ab1d 100644 | |
| 971 --- a/crypto/cryptlib.c | |
| 972 +++ b/crypto/cryptlib.c | |
| 973 @@ -661,6 +661,20 @@ const char *CRYPTO_get_lock_name(int type) | |
| 974 return(sk_OPENSSL_STRING_value(app_locks,type-CRYPTO_NUM_LOCKS))
; | |
| 975 } | |
| 976 | |
| 977 +#if __arm__ | |
| 978 +static int global_arm_neon_enabled = 0; | |
| 979 + | |
| 980 +void CRYPTO_set_NEON_capable(int on) | |
| 981 + { | |
| 982 + global_arm_neon_enabled = on != 0; | |
| 983 + } | |
| 984 + | |
| 985 +int CRYPTO_is_NEON_capable(void) | |
| 986 + { | |
| 987 + return global_arm_neon_enabled; | |
| 988 + } | |
| 989 +#endif | |
| 990 + | |
| 991 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ | |
| 992 defined(__INTEL__) || \ | |
| 993 defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined
(_M_X64) | |
| 994 diff --git a/crypto/crypto.h b/crypto/crypto.h | |
| 995 index e11ac73..db339c3 100644 | |
| 996 --- a/crypto/crypto.h | |
| 997 +++ b/crypto/crypto.h | |
| 998 @@ -414,6 +414,14 @@ void CRYPTO_cleanup_all_ex_data(void); | |
| 999 | |
| 1000 int CRYPTO_get_new_lockid(char *name); | |
| 1001 | |
| 1002 +/* CRYPTO_set_NEON_capable enables any NEON (ARM vector) dependent code. This | |
| 1003 + * function should be called before any non-init functions. */ | |
| 1004 +void CRYPTO_set_NEON_capable(int on); | |
| 1005 + | |
| 1006 +/* CRYPTO_is_NEON_capable returns 1 if CRYPTO_set_NEON_capable was last called | |
| 1007 + * with a non-zero value, or 0 if it was given zero or was never called. */ | |
| 1008 +int CRYPTO_is_NEON_capable(void); | |
| 1009 + | |
| 1010 int CRYPTO_num_locks(void); /* return CRYPTO_NUM_LOCKS (shared libs!) */ | |
| 1011 void CRYPTO_lock(int mode, int type,const char *file,int line); | |
| 1012 void CRYPTO_set_locking_callback(void (*func)(int mode,int type, | |
| 1013 diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c | |
| 1014 index 2e5621d..00d53bf 100644 | |
| 1015 --- a/crypto/poly1305/poly1305.c | |
| 1016 +++ b/crypto/poly1305/poly1305.c | |
| 1017 @@ -90,6 +90,17 @@ static void U32TO8_LE(unsigned char *m, uint32_t v) | |
| 1018 } | |
| 1019 #endif | |
| 1020 | |
| 1021 +#if __arm__ | |
| 1022 +void CRYPTO_poly1305_init_neon(poly1305_state* state, | |
| 1023 + const unsigned char key[32]); | |
| 1024 + | |
| 1025 +void CRYPTO_poly1305_update_neon(poly1305_state* state, | |
| 1026 + const unsigned char *in, | |
| 1027 + size_t in_len); | |
| 1028 + | |
| 1029 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]); | |
| 1030 +#endif | |
| 1031 + | |
| 1032 static uint64_t | |
| 1033 mul32x32_64(uint32_t a, uint32_t b) | |
| 1034 { | |
| 1035 @@ -207,6 +218,14 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const uns
igned char key[32]) | |
| 1036 struct poly1305_state_st *state = (struct poly1305_state_st*) statep; | |
| 1037 uint32_t t0,t1,t2,t3; | |
| 1038 | |
| 1039 +#if __arm__ | |
| 1040 + if (CRYPTO_is_NEON_capable()) | |
| 1041 + { | |
| 1042 + CRYPTO_poly1305_init_neon(statep, key); | |
| 1043 + return; | |
| 1044 + } | |
| 1045 +#endif | |
| 1046 + | |
| 1047 t0 = U8TO32_LE(key+0); | |
| 1048 t1 = U8TO32_LE(key+4); | |
| 1049 t2 = U8TO32_LE(key+8); | |
| 1050 @@ -241,6 +260,14 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const u
nsigned char *in, | |
| 1051 unsigned int i; | |
| 1052 struct poly1305_state_st *state = (struct poly1305_state_st*) statep; | |
| 1053 | |
| 1054 +#if __arm__ | |
| 1055 + if (CRYPTO_is_NEON_capable()) | |
| 1056 + { | |
| 1057 + CRYPTO_poly1305_update_neon(statep, in, in_len); | |
| 1058 + return; | |
| 1059 + } | |
| 1060 +#endif | |
| 1061 + | |
| 1062 if (state->buf_used) | |
| 1063 { | |
| 1064 unsigned int todo = 16 - state->buf_used; | |
| 1065 @@ -282,6 +309,14 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, unsigne
d char mac[16]) | |
| 1066 uint32_t g0,g1,g2,g3,g4; | |
| 1067 uint32_t b, nb; | |
| 1068 | |
| 1069 +#if __arm__ | |
| 1070 + if (CRYPTO_is_NEON_capable()) | |
| 1071 + { | |
| 1072 + CRYPTO_poly1305_finish_neon(statep, mac); | |
| 1073 + return; | |
| 1074 + } | |
| 1075 +#endif | |
| 1076 + | |
| 1077 if (state->buf_used) | |
| 1078 poly1305_update(state, state->buf, state->buf_used); | |
| 1079 | |
| 1080 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c | |
| 1081 index adcef35..34e339d 100644 | |
| 1082 --- a/crypto/poly1305/poly1305_arm.c | |
| 1083 +++ b/crypto/poly1305/poly1305_arm.c | |
| 1084 @@ -51,6 +51,7 @@ | |
| 1085 * SUPERCOP by D. J. Bernstein and Peter Schwabe. */ | |
| 1086 | |
| 1087 #include <stdint.h> | |
| 1088 +#include <string.h> | |
| 1089 | |
| 1090 #include <openssl/poly1305.h> | |
| 1091 | |
| 1092 @@ -202,7 +203,8 @@ struct poly1305_state_st { | |
| 1093 unsigned char key[16]; | |
| 1094 }; | |
| 1095 | |
| 1096 -void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32]) | |
| 1097 +void CRYPTO_poly1305_init_neon(poly1305_state *state, | |
| 1098 + const unsigned char key[32]) | |
| 1099 { | |
| 1100 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); | |
| 1101 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); | |
| 1102 @@ -227,7 +229,8 @@ void CRYPTO_poly1305_init(poly1305_state *state, const unsig
ned char key[32]) | |
| 1103 st->buf_used = 0; | |
| 1104 } | |
| 1105 | |
| 1106 -void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, siz
e_t in_len) | |
| 1107 +void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in
, | |
| 1108 + size_t in_len) | |
| 1109 { | |
| 1110 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); | |
| 1111 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); | |
| 1112 @@ -285,7 +288,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uns
igned char *in, size | |
| 1113 } | |
| 1114 } | |
| 1115 | |
| 1116 -void CRYPTO_poly1305_finish(poly1305_state* state, unsigned char mac[16]) | |
| 1117 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]) | |
| 1118 { | |
| 1119 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); | |
| 1120 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); | |
| 1121 -- | |
| 1122 1.8.4.1 | |
| 1123 | |
| OLD | NEW |