Index: openssl/patches/chacha20poly1305.patch |
diff --git a/openssl/patches/chacha20poly1305.patch b/openssl/patches/chacha20poly1305.patch |
new file mode 100644 |
index 0000000000000000000000000000000000000000..ab0c56cb18691f5ee774580552028502ee9115ff |
--- /dev/null |
+++ b/openssl/patches/chacha20poly1305.patch |
@@ -0,0 +1,5726 @@ |
+From 2688f00904e4ffd647afcff69bb8fe6df8c5902b Mon Sep 17 00:00:00 2001 |
+From: Adam Langley <agl@chromium.org> |
+Date: Mon, 9 Sep 2013 12:13:24 -0400 |
+Subject: [PATCH 43/52] chacha20poly1305 |
+ |
+Add support for Chacha20 + Poly1305. |
+--- |
+ .gitignore | 1 + |
+ Configure | 56 +- |
+ Makefile.org | 6 +- |
+ apps/speed.c | 64 +- |
+ crypto/chacha/Makefile | 80 ++ |
+ crypto/chacha/chacha.h | 85 ++ |
+ crypto/chacha/chacha_enc.c | 167 +++ |
+ crypto/chacha/chacha_vec.c | 345 +++++++ |
+ crypto/chacha/chachatest.c | 211 ++++ |
+ crypto/evp/Makefile | 35 +- |
+ crypto/evp/e_chacha20poly1305.c | 261 +++++ |
+ crypto/evp/evp.h | 8 + |
+ crypto/evp/evp_err.c | 3 + |
+ crypto/poly1305/Makefile | 81 ++ |
+ crypto/poly1305/poly1305.c | 320 ++++++ |
+ crypto/poly1305/poly1305.h | 88 ++ |
+ crypto/poly1305/poly1305_arm.c | 335 ++++++ |
+ crypto/poly1305/poly1305_arm_asm.s | 2009 ++++++++++++++++++++++++++++++++++++ |
+ crypto/poly1305/poly1305_vec.c | 733 +++++++++++++ |
+ crypto/poly1305/poly1305test.c | 166 +++ |
+ ssl/s3_lib.c | 75 +- |
+ ssl/s3_pkt.c | 5 +- |
+ ssl/ssl.h | 1 + |
+ ssl/ssl_ciph.c | 16 +- |
+ ssl/ssl_locl.h | 10 + |
+ ssl/t1_enc.c | 30 +- |
+ ssl/tls1.h | 8 + |
+ test/Makefile | 23 +- |
+ 28 files changed, 5166 insertions(+), 56 deletions(-) |
+ create mode 100644 crypto/chacha/Makefile |
+ create mode 100644 crypto/chacha/chacha.h |
+ create mode 100644 crypto/chacha/chacha_enc.c |
+ create mode 100644 crypto/chacha/chacha_vec.c |
+ create mode 100644 crypto/chacha/chachatest.c |
+ create mode 100644 crypto/evp/e_chacha20poly1305.c |
+ create mode 100644 crypto/poly1305/Makefile |
+ create mode 100644 crypto/poly1305/poly1305.c |
+ create mode 100644 crypto/poly1305/poly1305.h |
+ create mode 100644 crypto/poly1305/poly1305_arm.c |
+ create mode 100644 crypto/poly1305/poly1305_arm_asm.s |
+ create mode 100644 crypto/poly1305/poly1305_vec.c |
+ create mode 100644 crypto/poly1305/poly1305test.c |
+ |
+diff --git a/Configure b/Configure |
+index 9c803dc..1b95384 100755 |
+--- a/Configure |
++++ b/Configure |
+@@ -124,24 +124,24 @@ my $tlib="-lnsl -lsocket"; |
+ my $bits1="THIRTY_TWO_BIT "; |
+ my $bits2="SIXTY_FOUR_BIT "; |
+ |
+-my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:"; |
++my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:::"; |
+ |
+ my $x86_elf_asm="$x86_asm:elf"; |
+ |
+-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:"; |
+-my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; |
+-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; |
+-my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; |
+-my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; |
+-my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::"; |
+-my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; |
+-my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:"; |
+-my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void"; |
+-my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; |
+-my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; |
+-my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::"; |
+-my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::"; |
+-my $no_asm=":::::::::::::::void"; |
++my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o::chacha_vec.o:poly1305_vec.o"; |
++my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::::void"; |
++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::::void"; |
++my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::::void"; |
++my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::::void"; |
++my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::::"; |
++my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::::"; |
++my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::::ghash-s390x.o:"; |
++my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec.o:poly1305_arm.o poly1305_arm_asm.o:void"; |
++my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::32"; |
++my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::64"; |
++my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::::"; |
++my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::::"; |
++my $no_asm=":::::::::::::::::void"; |
+ |
+ # As for $BSDthreads. Idea is to maintain "collective" set of flags, |
+ # which would cover all BSD flavors. -pthread applies to them all, |
+@@ -152,7 +152,7 @@ my $no_asm=":::::::::::::::void"; |
+ # seems to be sufficient? |
+ my $BSDthreads="-pthread -D_THREAD_SAFE -D_REENTRANT"; |
+ |
+-#config-string $cc : $cflags : $unistd : $thread_cflag : $sys_id : $lflags : $bn_ops : $cpuid_obj : $bn_obj : $des_obj : $aes_obj : $bf_obj : $md5_obj : $sha1_obj : $cast_obj : $rc4_obj : $rmd160_obj : $rc5_obj : $wp_obj : $cmll_obj : $modes_obj : $engines_obj : $dso_scheme : $shared_target : $shared_cflag : $shared_ldflag : $shared_extension : $ranlib : $arflags : $multilib |
++#config-string $cc : $cflags : $unistd : $thread_cflag : $sys_id : $lflags : $bn_ops : $cpuid_obj : $bn_obj : $des_obj : $aes_obj : $bf_obj : $md5_obj : $sha1_obj : $cast_obj : $rc4_obj : $rmd160_obj : $rc5_obj : $wp_obj : $cmll_obj : $modes_obj : $engines_obj : $chacha_obj : $poly1305_obj : $dso_scheme : $shared_target : $shared_cflag : $shared_ldflag : $shared_extension : $ranlib : $arflags : $multilib : |
+ |
+ my %table=( |
+ # File 'TABLE' (created by 'make TABLE') contains the data from this list, |
+@@ -647,6 +647,8 @@ my $idx_wp_obj = $idx++; |
+ my $idx_cmll_obj = $idx++; |
+ my $idx_modes_obj = $idx++; |
+ my $idx_engines_obj = $idx++; |
++my $idx_chacha_obj = $idx++; |
++my $idx_poly1305_obj = $idx++; |
+ my $idx_perlasm_scheme = $idx++; |
+ my $idx_dso_scheme = $idx++; |
+ my $idx_shared_target = $idx++; |
+@@ -692,6 +694,8 @@ my $aes_enc="aes_core.o aes_cbc.o"; |
+ my $bf_enc ="bf_enc.o"; |
+ my $cast_enc="c_enc.o"; |
+ my $rc4_enc="rc4_enc.o rc4_skey.o"; |
++my $chacha_enc="chacha_enc.o"; |
++my $poly1305 ="poly1305.o"; |
+ my $rc5_enc="rc5_enc.o"; |
+ my $md5_obj=""; |
+ my $sha1_obj=""; |
+@@ -1144,7 +1148,7 @@ $openssldir=$prefix . "/" . $openssldir if $openssldir !~ /(^\/|^[a-zA-Z]:[\\\/] |
+ |
+ print "IsMK1MF=$IsMK1MF\n"; |
+ |
+-my @fields = split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); |
++my @fields = split(/\s*:\s*/,$table{$target} . ":" x 31 , -1); |
+ my $cc = $fields[$idx_cc]; |
+ # Allow environment CC to override compiler... |
+ if($ENV{CC}) { |
+@@ -1181,6 +1185,8 @@ my $ranlib = $ENV{'RANLIB'} || $fields[$idx_ranlib]; |
+ my $ar = $ENV{'AR'} || "ar"; |
+ my $arflags = $fields[$idx_arflags]; |
+ my $multilib = $fields[$idx_multilib]; |
++my $chacha_obj = $fields[$idx_chacha_obj]; |
++my $poly1305_obj = $fields[$idx_poly1305_obj]; |
+ |
+ # if $prefix/lib$multilib is not an existing directory, then |
+ # assume that it's not searched by linker automatically, in |
+@@ -1477,6 +1483,8 @@ $des_obj=$des_enc unless ($des_obj =~ /\.o$/); |
+ $bf_obj=$bf_enc unless ($bf_obj =~ /\.o$/); |
+ $cast_obj=$cast_enc unless ($cast_obj =~ /\.o$/); |
+ $rc4_obj=$rc4_enc unless ($rc4_obj =~ /\.o$/); |
++$chacha_obj=$chacha_enc unless ($chacha_obj =~ /\.o$/); |
++$poly1305_obj=$poly1305 unless ($poly1305_obj =~ /\.o$/); |
+ $rc5_obj=$rc5_enc unless ($rc5_obj =~ /\.o$/); |
+ if ($sha1_obj =~ /\.o$/) |
+ { |
+@@ -1637,6 +1645,8 @@ while (<IN>) |
+ s/^BF_ENC=.*$/BF_ENC= $bf_obj/; |
+ s/^CAST_ENC=.*$/CAST_ENC= $cast_obj/; |
+ s/^RC4_ENC=.*$/RC4_ENC= $rc4_obj/; |
++ s/^CHACHA_ENC=.*$/CHACHA_ENC= $chacha_obj/; |
++ s/^POLY1305=.*$/POLY1305= $poly1305_obj/; |
+ s/^RC5_ENC=.*$/RC5_ENC= $rc5_obj/; |
+ s/^MD5_ASM_OBJ=.*$/MD5_ASM_OBJ= $md5_obj/; |
+ s/^SHA1_ASM_OBJ=.*$/SHA1_ASM_OBJ= $sha1_obj/; |
+@@ -1698,6 +1708,8 @@ print "AES_ENC =$aes_obj\n"; |
+ print "BF_ENC =$bf_obj\n"; |
+ print "CAST_ENC =$cast_obj\n"; |
+ print "RC4_ENC =$rc4_obj\n"; |
++print "CHACHA_ENC =$chacha_obj\n"; |
++print "POLY1305 =$poly1305_obj\n"; |
+ print "RC5_ENC =$rc5_obj\n"; |
+ print "MD5_OBJ_ASM =$md5_obj\n"; |
+ print "SHA1_OBJ_ASM =$sha1_obj\n"; |
+@@ -2096,11 +2108,11 @@ sub print_table_entry |
+ |
+ (my $cc,my $cflags,my $unistd,my $thread_cflag,my $sys_id,my $lflags, |
+ my $bn_ops,my $cpuid_obj,my $bn_obj,my $des_obj,my $aes_obj, my $bf_obj, |
+- my $md5_obj,my $sha1_obj,my $cast_obj,my $rc4_obj,my $rmd160_obj, |
+- my $rc5_obj,my $wp_obj,my $cmll_obj,my $modes_obj, my $engines_obj, |
++ my $md5_obj,my $sha1_obj,my $cast_obj,my $rc4_obj,my $chacha_obj,my $poly1305_obj, |
++ my $rmd160_obj, my $rc5_obj,my $wp_obj,my $cmll_obj,my $modes_obj, my $engines_obj, |
+ my $perlasm_scheme,my $dso_scheme,my $shared_target,my $shared_cflag, |
+ my $shared_ldflag,my $shared_extension,my $ranlib,my $arflags,my $multilib)= |
+- split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); |
++ split(/\s*:\s*/,$table{$target} . ":" x 31 , -1); |
+ |
+ print <<EOF |
+ |
+@@ -2121,6 +2133,8 @@ sub print_table_entry |
+ \$sha1_obj = $sha1_obj |
+ \$cast_obj = $cast_obj |
+ \$rc4_obj = $rc4_obj |
++\$chacha_obj = $chacha_obj |
++\$poly1305_obj = $poly1305_obj |
+ \$rmd160_obj = $rmd160_obj |
+ \$rc5_obj = $rc5_obj |
+ \$wp_obj = $wp_obj |
+@@ -2150,7 +2164,7 @@ sub test_sanity |
+ |
+ foreach $target (sort keys %table) |
+ { |
+- @fields = split(/\s*:\s*/,$table{$target} . ":" x 30 , -1); |
++ @fields = split(/\s*:\s*/,$table{$target} . ":" x 31 , -1); |
+ |
+ if ($fields[$idx_dso_scheme-1] =~ /^(beos|dl|dlfcn|win32|vms)$/) |
+ { |
+diff --git a/Makefile.org b/Makefile.org |
+index 2db31ea..919466d 100644 |
+--- a/Makefile.org |
++++ b/Makefile.org |
+@@ -94,6 +94,8 @@ BF_ENC= bf_enc.o |
+ CAST_ENC= c_enc.o |
+ RC4_ENC= rc4_enc.o |
+ RC5_ENC= rc5_enc.o |
++CHACHA_ENC= chacha_enc.o |
++POLY1305= poly1305.o |
+ MD5_ASM_OBJ= |
+ SHA1_ASM_OBJ= |
+ RMD160_ASM_OBJ= |
+@@ -147,7 +149,7 @@ SDIRS= \ |
+ bn ec rsa dsa ecdsa dh ecdh dso engine \ |
+ buffer bio stack lhash rand err \ |
+ evp asn1 pem x509 x509v3 conf txt_db pkcs7 pkcs12 comp ocsp ui krb5 \ |
+- cms pqueue ts jpake srp store cmac |
++ cms pqueue ts jpake srp store cmac poly1305 chacha |
+ # keep in mind that the above list is adjusted by ./Configure |
+ # according to no-xxx arguments... |
+ |
+@@ -232,6 +234,8 @@ BUILDENV= PLATFORM='$(PLATFORM)' PROCESSOR='$(PROCESSOR)' \ |
+ WP_ASM_OBJ='$(WP_ASM_OBJ)' \ |
+ MODES_ASM_OBJ='$(MODES_ASM_OBJ)' \ |
+ ENGINES_ASM_OBJ='$(ENGINES_ASM_OBJ)' \ |
++ CHACHA_ENC='$(CHACHA_ENC)' \ |
++ POLY1305='$(POLY1305)' \ |
+ PERLASM_SCHEME='$(PERLASM_SCHEME)' \ |
+ FIPSLIBDIR='${FIPSLIBDIR}' \ |
+ FIPSDIR='${FIPSDIR}' \ |
+diff --git a/crypto/chacha/Makefile b/crypto/chacha/Makefile |
+new file mode 100644 |
+index 0000000..289933b |
+--- /dev/null |
++++ b/crypto/chacha/Makefile |
+@@ -0,0 +1,80 @@ |
++# |
++# OpenSSL/crypto/chacha/Makefile |
++# |
++ |
++DIR= chacha |
++TOP= ../.. |
++CC= cc |
++CPP= $(CC) -E |
++INCLUDES= |
++CFLAG=-g |
++AR= ar r |
++ |
++CFLAGS= $(INCLUDES) $(CFLAG) |
++ASFLAGS= $(INCLUDES) $(ASFLAG) |
++AFLAGS= $(ASFLAGS) |
++ |
++CHACHA_ENC=chacha_enc.o |
++ |
++GENERAL=Makefile |
++TEST=chachatest.o |
++APPS= |
++ |
++LIB=$(TOP)/libcrypto.a |
++LIBSRC= |
++LIBOBJ=$(CHACHA_ENC) |
++ |
++SRC= $(LIBSRC) |
++ |
++EXHEADER=chacha.h |
++HEADER= $(EXHEADER) |
++ |
++ALL= $(GENERAL) $(SRC) $(HEADER) |
++ |
++top: |
++ (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) |
++ |
++all: lib |
++ |
++lib: $(LIBOBJ) |
++ $(AR) $(LIB) $(LIBOBJ) |
++ $(RANLIB) $(LIB) || echo Never mind. |
++ @touch lib |
++ |
++files: |
++ $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO |
++ |
++links: |
++ @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) |
++ @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) |
++ @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) |
++ |
++install: |
++ @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... |
++ @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ |
++ do \ |
++ (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ |
++ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ |
++ done; |
++ |
++tags: |
++ ctags $(SRC) |
++ |
++tests: |
++ |
++lint: |
++ lint -DLINT $(INCLUDES) $(SRC)>fluff |
++ |
++depend: |
++ @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile... |
++ $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) |
++ |
++dclean: |
++ $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new |
++ mv -f Makefile.new $(MAKEFILE) |
++ |
++clean: |
++ rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff |
++ |
++# DO NOT DELETE THIS LINE -- make depend depends on it. |
++ |
+diff --git a/crypto/chacha/chacha.h b/crypto/chacha/chacha.h |
+new file mode 100644 |
+index 0000000..d56519d |
+--- /dev/null |
++++ b/crypto/chacha/chacha.h |
+@@ -0,0 +1,85 @@ |
++/* |
++ * Chacha stream algorithm. |
++ * |
++ * Created on: Jun, 2013 |
++ * Author: Elie Bursztein (elieb@google.com) |
++ * |
++ * Adapted from the estream code by D. Bernstein. |
++ */ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++#ifndef HEADER_CHACHA_H |
++#define HEADER_CHACHA_H |
++ |
++#include <openssl/opensslconf.h> |
++ |
++#if defined(OPENSSL_NO_CHACHA) |
++#error ChaCha support is disabled. |
++#endif |
++ |
++#include <stddef.h> |
++ |
++#ifdef __cplusplus |
++extern "C" { |
++#endif |
++ |
++/* CRYPTO_chacha_20 encrypts |in_len| bytes from |in| with the given key and |
++ * nonce and writes the result to |out|, which may be equal to |in|. The |
++ * initial block counter is specified by |counter|. */ |
++void CRYPTO_chacha_20(unsigned char *out, |
++ const unsigned char *in, size_t in_len, |
++ const unsigned char key[32], |
++ const unsigned char nonce[8], |
++ size_t counter); |
++ |
++#ifdef __cplusplus |
++} |
++#endif |
++ |
++#endif |
+diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c |
+new file mode 100644 |
+index 0000000..54d1ca3 |
+--- /dev/null |
++++ b/crypto/chacha/chacha_enc.c |
+@@ -0,0 +1,167 @@ |
++/* |
++ * Chacha stream algorithm. |
++ * |
++ * Created on: Jun, 2013 |
++ * Author: Elie Bursztein (elieb@google.com) |
++ * |
++ * Adapted from the estream code by D. Bernstein. |
++ */ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++ |
++#include <stdint.h> |
++#include <string.h> |
++#include <openssl/opensslconf.h> |
++ |
++#if !defined(OPENSSL_NO_CHACHA) |
++ |
++#include <openssl/chacha.h> |
++ |
++/* sigma contains the ChaCha constants, which happen to be an ASCII string. */ |
++static const char sigma[16] = "expand 32-byte k"; |
++ |
++#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) |
++#define XOR(v, w) ((v) ^ (w)) |
++#define PLUS(x, y) ((x) + (y)) |
++#define PLUSONE(v) (PLUS((v), 1)) |
++ |
++#define U32TO8_LITTLE(p, v) \ |
++ { (p)[0] = (v >> 0) & 0xff; (p)[1] = (v >> 8) & 0xff; \ |
++ (p)[2] = (v >> 16) & 0xff; (p)[3] = (v >> 24) & 0xff; } |
++#define U8TO32_LITTLE(p) \ |
++ (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ |
++ ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24) ) |
++ |
++/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */ |
++#define QUARTERROUND(a,b,c,d) \ |
++ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \ |
++ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \ |
++ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \ |
++ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7); |
++ |
++typedef unsigned int uint32_t; |
++ |
++/* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in |
++ * |input| and writes the 64 output bytes to |output|. */ |
++static void chacha_core(unsigned char output[64], const uint32_t input[16], |
++ int num_rounds) |
++ { |
++ uint32_t x[16]; |
++ int i; |
++ |
++ memcpy(x, input, sizeof(uint32_t) * 16); |
++ for (i = 20; i > 0; i -= 2) |
++ { |
++ QUARTERROUND( 0, 4, 8,12) |
++ QUARTERROUND( 1, 5, 9,13) |
++ QUARTERROUND( 2, 6,10,14) |
++ QUARTERROUND( 3, 7,11,15) |
++ QUARTERROUND( 0, 5,10,15) |
++ QUARTERROUND( 1, 6,11,12) |
++ QUARTERROUND( 2, 7, 8,13) |
++ QUARTERROUND( 3, 4, 9,14) |
++ } |
++ |
++ for (i = 0; i < 16; ++i) |
++ x[i] = PLUS(x[i], input[i]); |
++ for (i = 0; i < 16; ++i) |
++ U32TO8_LITTLE(output + 4 * i, x[i]); |
++ } |
++ |
++void CRYPTO_chacha_20(unsigned char *out, |
++ const unsigned char *in, size_t in_len, |
++ const unsigned char key[32], |
++ const unsigned char nonce[8], |
++ size_t counter) |
++ { |
++ uint32_t input[16]; |
++ unsigned char buf[64]; |
++ size_t todo, i; |
++ |
++ input[0] = U8TO32_LITTLE(sigma + 0); |
++ input[1] = U8TO32_LITTLE(sigma + 4); |
++ input[2] = U8TO32_LITTLE(sigma + 8); |
++ input[3] = U8TO32_LITTLE(sigma + 12); |
++ |
++ input[4] = U8TO32_LITTLE(key + 0); |
++ input[5] = U8TO32_LITTLE(key + 4); |
++ input[6] = U8TO32_LITTLE(key + 8); |
++ input[7] = U8TO32_LITTLE(key + 12); |
++ |
++ input[8] = U8TO32_LITTLE(key + 16); |
++ input[9] = U8TO32_LITTLE(key + 20); |
++ input[10] = U8TO32_LITTLE(key + 24); |
++ input[11] = U8TO32_LITTLE(key + 28); |
++ |
++ input[12] = counter; |
++ input[13] = ((uint64_t) counter) >> 32; |
++ input[14] = U8TO32_LITTLE(nonce + 0); |
++ input[15] = U8TO32_LITTLE(nonce + 4); |
++ |
++ while (in_len > 0) |
++ { |
++ todo = sizeof(buf); |
++ if (in_len < todo) |
++ todo = in_len; |
++ |
++ chacha_core(buf, input, 20); |
++ for (i = 0; i < todo; i++) |
++ out[i] = in[i] ^ buf[i]; |
++ |
++ out += todo; |
++ in += todo; |
++ in_len -= todo; |
++ |
++ input[12]++; |
++ if (input[12] == 0) |
++ input[13]++; |
++ } |
++ } |
++ |
++#endif /* !OPENSSL_NO_CHACHA */ |
+diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c |
+new file mode 100644 |
+index 0000000..33b2238 |
+--- /dev/null |
++++ b/crypto/chacha/chacha_vec.c |
+@@ -0,0 +1,345 @@ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++ |
++/* This implementation is by Ted Krovetz and was submitted to SUPERCOP and |
+ * marked as public domain. It has been altered to allow for non-aligned inputs |
++ * and to allow the block counter to be passed in specifically. */ |
++ |
++#include <string.h> |
++#include <stdint.h> |
++#include <openssl/opensslconf.h> |
++ |
++#if !defined(OPENSSL_NO_CHACHA) |
++ |
++#include <openssl/chacha.h> |
++ |
++#ifndef CHACHA_RNDS |
++#define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */ |
++#endif |
++ |
++/* Architecture-neutral way to specify 16-byte vector of ints */ |
++typedef unsigned vec __attribute__ ((vector_size (16))); |
++ |
++/* This implementation is designed for Neon, SSE and AltiVec machines. The |
++ * following specify how to do certain vector operations efficiently on |
++ * each architecture, using intrinsics. |
++ * This implementation supports parallel processing of multiple blocks, |
++ * including potentially using general-purpose registers. |
++ */ |
++#if __ARM_NEON__ |
++#include <arm_neon.h> |
++#define GPR_TOO 1 |
++#define VBPI 2 |
++#define ONE (vec)vsetq_lane_u32(1,vdupq_n_u32(0),0) |
++#define LOAD(m) (vec)(*((vec*)(m))) |
++#define STORE(m,r) (*((vec*)(m))) = (r) |
++#define ROTV1(x) (vec)vextq_u32((uint32x4_t)x,(uint32x4_t)x,1) |
++#define ROTV2(x) (vec)vextq_u32((uint32x4_t)x,(uint32x4_t)x,2) |
++#define ROTV3(x) (vec)vextq_u32((uint32x4_t)x,(uint32x4_t)x,3) |
++#define ROTW16(x) (vec)vrev32q_u16((uint16x8_t)x) |
++#if __clang__ |
++#define ROTW7(x) (x << ((vec){ 7, 7, 7, 7})) ^ (x >> ((vec){25,25,25,25})) |
++#define ROTW8(x) (x << ((vec){ 8, 8, 8, 8})) ^ (x >> ((vec){24,24,24,24})) |
++#define ROTW12(x) (x << ((vec){12,12,12,12})) ^ (x >> ((vec){20,20,20,20})) |
++#else |
++#define ROTW7(x) (vec)vsriq_n_u32(vshlq_n_u32((uint32x4_t)x,7),(uint32x4_t)x,25) |
++#define ROTW8(x) (vec)vsriq_n_u32(vshlq_n_u32((uint32x4_t)x,8),(uint32x4_t)x,24) |
++#define ROTW12(x) (vec)vsriq_n_u32(vshlq_n_u32((uint32x4_t)x,12),(uint32x4_t)x,20) |
++#endif |
++#elif __SSE2__ |
++#include <emmintrin.h> |
++#define GPR_TOO 0 |
++#if __clang__ |
++#define VBPI 4 |
++#else |
++#define VBPI 3 |
++#endif |
++#define ONE (vec)_mm_set_epi32(0,0,0,1) |
++#define LOAD(m) (vec)_mm_loadu_si128((__m128i*)(m)) |
++#define STORE(m,r) _mm_storeu_si128((__m128i*)(m), (__m128i) (r)) |
++#define ROTV1(x) (vec)_mm_shuffle_epi32((__m128i)x,_MM_SHUFFLE(0,3,2,1)) |
++#define ROTV2(x) (vec)_mm_shuffle_epi32((__m128i)x,_MM_SHUFFLE(1,0,3,2)) |
++#define ROTV3(x) (vec)_mm_shuffle_epi32((__m128i)x,_MM_SHUFFLE(2,1,0,3)) |
++#define ROTW7(x) (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x,25)) |
++#define ROTW12(x) (vec)(_mm_slli_epi32((__m128i)x,12) ^ _mm_srli_epi32((__m128i)x,20)) |
++#if __SSSE3__ |
++#include <tmmintrin.h> |
++#define ROTW8(x) (vec)_mm_shuffle_epi8((__m128i)x,_mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3)) |
++#define ROTW16(x) (vec)_mm_shuffle_epi8((__m128i)x,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2)) |
++#else |
++#define ROTW8(x) (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x,24)) |
++#define ROTW16(x) (vec)(_mm_slli_epi32((__m128i)x,16) ^ _mm_srli_epi32((__m128i)x,16)) |
++#endif |
++#else |
++#error -- Implementation supports only machines with neon or SSE2 |
++#endif |
++ |
++#ifndef REVV_BE |
++#define REVV_BE(x) (x) |
++#endif |
++ |
++#ifndef REVW_BE |
++#define REVW_BE(x) (x) |
++#endif |
++ |
++#define BPI (VBPI + GPR_TOO) /* Blocks computed per loop iteration */ |
++ |
++#define DQROUND_VECTORS(a,b,c,d) \ |
++ a += b; d ^= a; d = ROTW16(d); \ |
++ c += d; b ^= c; b = ROTW12(b); \ |
++ a += b; d ^= a; d = ROTW8(d); \ |
++ c += d; b ^= c; b = ROTW7(b); \ |
++ b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); \ |
++ a += b; d ^= a; d = ROTW16(d); \ |
++ c += d; b ^= c; b = ROTW12(b); \ |
++ a += b; d ^= a; d = ROTW8(d); \ |
++ c += d; b ^= c; b = ROTW7(b); \ |
++ b = ROTV3(b); c = ROTV2(c); d = ROTV1(d); |
++ |
++#define QROUND_WORDS(a,b,c,d) \ |
++ a = a+b; d ^= a; d = d<<16 | d>>16; \ |
++ c = c+d; b ^= c; b = b<<12 | b>>20; \ |
++ a = a+b; d ^= a; d = d<< 8 | d>>24; \ |
++ c = c+d; b ^= c; b = b<< 7 | b>>25; |
++ |
++#define WRITE_XOR(in, op, d, v0, v1, v2, v3) \ |
++ STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \ |
++ STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \ |
++ STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \ |
++ STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3)); |
++ |
++void CRYPTO_chacha_20( |
++ unsigned char *out, |
++ const unsigned char *in, |
++ size_t inlen, |
++ const unsigned char key[32], |
++ const unsigned char nonce[8], |
++ size_t counter) |
++ { |
++ unsigned iters, i, *op=(unsigned *)out, *ip=(unsigned *)in, *kp; |
++#if defined(__ARM_NEON__) |
++ unsigned *np; |
++#endif |
++ vec s0, s1, s2, s3; |
++#if !defined(__ARM_NEON__) && !defined(__SSE2__) |
++ __attribute__ ((aligned (16))) unsigned key[8], nonce[4]; |
++#endif |
++ __attribute__ ((aligned (16))) unsigned chacha_const[] = |
++ {0x61707865,0x3320646E,0x79622D32,0x6B206574}; |
++#if defined(__ARM_NEON__) || defined(__SSE2__) |
++ kp = (unsigned *)key; |
++#else |
++ ((vec *)key)[0] = REVV_BE(((vec *)key)[0]); |
++ ((vec *)key)[1] = REVV_BE(((vec *)key)[1]); |
++ nonce[0] = REVW_BE(((unsigned *)nonce)[0]); |
++ nonce[1] = REVW_BE(((unsigned *)nonce)[1]); |
++ nonce[2] = REVW_BE(((unsigned *)nonce)[2]); |
++ nonce[3] = REVW_BE(((unsigned *)nonce)[3]); |
++ kp = (unsigned *)key; |
++ np = (unsigned *)nonce; |
++#endif |
++#if defined(__ARM_NEON__) |
++ np = (unsigned*) nonce; |
++#endif |
++ s0 = LOAD(chacha_const); |
++ s1 = LOAD(&((vec*)kp)[0]); |
++ s2 = LOAD(&((vec*)kp)[1]); |
++ s3 = (vec){ |
++ counter & 0xffffffff, |
++#if __ARM_NEON__ |
++ 0, /* can't right-shift 32 bits on a 32-bit system. */ |
++#else |
++ counter >> 32, |
++#endif |
++ ((uint32_t*)nonce)[0], |
++ ((uint32_t*)nonce)[1] |
++ }; |
++ |
++ for (iters = 0; iters < inlen/(BPI*64); iters++) |
++ { |
++#if GPR_TOO |
++ register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8, |
++ x9, x10, x11, x12, x13, x14, x15; |
++#endif |
++#if VBPI > 2 |
++ vec v8,v9,v10,v11; |
++#endif |
++#if VBPI > 3 |
++ vec v12,v13,v14,v15; |
++#endif |
++ |
++ vec v0,v1,v2,v3,v4,v5,v6,v7; |
++ v4 = v0 = s0; v5 = v1 = s1; v6 = v2 = s2; v3 = s3; |
++ v7 = v3 + ONE; |
++#if VBPI > 2 |
++ v8 = v4; v9 = v5; v10 = v6; |
++ v11 = v7 + ONE; |
++#endif |
++#if VBPI > 3 |
++ v12 = v8; v13 = v9; v14 = v10; |
++ v15 = v11 + ONE; |
++#endif |
++#if GPR_TOO |
++ x0 = chacha_const[0]; x1 = chacha_const[1]; |
++ x2 = chacha_const[2]; x3 = chacha_const[3]; |
++ x4 = kp[0]; x5 = kp[1]; x6 = kp[2]; x7 = kp[3]; |
++ x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7]; |
++ x12 = counter+BPI*iters+(BPI-1); x13 = 0; |
++ x14 = np[0]; x15 = np[1]; |
++#endif |
++ for (i = CHACHA_RNDS/2; i; i--) |
++ { |
++ DQROUND_VECTORS(v0,v1,v2,v3) |
++ DQROUND_VECTORS(v4,v5,v6,v7) |
++#if VBPI > 2 |
++ DQROUND_VECTORS(v8,v9,v10,v11) |
++#endif |
++#if VBPI > 3 |
++ DQROUND_VECTORS(v12,v13,v14,v15) |
++#endif |
++#if GPR_TOO |
++ QROUND_WORDS( x0, x4, x8,x12) |
++ QROUND_WORDS( x1, x5, x9,x13) |
++ QROUND_WORDS( x2, x6,x10,x14) |
++ QROUND_WORDS( x3, x7,x11,x15) |
++ QROUND_WORDS( x0, x5,x10,x15) |
++ QROUND_WORDS( x1, x6,x11,x12) |
++ QROUND_WORDS( x2, x7, x8,x13) |
++ QROUND_WORDS( x3, x4, x9,x14) |
++#endif |
++ } |
++ |
++ WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3) |
++ s3 += ONE; |
++ WRITE_XOR(ip, op, 16, v4+s0, v5+s1, v6+s2, v7+s3) |
++ s3 += ONE; |
++#if VBPI > 2 |
++ WRITE_XOR(ip, op, 32, v8+s0, v9+s1, v10+s2, v11+s3) |
++ s3 += ONE; |
++#endif |
++#if VBPI > 3 |
++ WRITE_XOR(ip, op, 48, v12+s0, v13+s1, v14+s2, v15+s3) |
++ s3 += ONE; |
++#endif |
++ ip += VBPI*16; |
++ op += VBPI*16; |
++#if GPR_TOO |
++ op[0] = REVW_BE(REVW_BE(ip[0]) ^ (x0 + chacha_const[0])); |
++ op[1] = REVW_BE(REVW_BE(ip[1]) ^ (x1 + chacha_const[1])); |
++ op[2] = REVW_BE(REVW_BE(ip[2]) ^ (x2 + chacha_const[2])); |
++ op[3] = REVW_BE(REVW_BE(ip[3]) ^ (x3 + chacha_const[3])); |
++ op[4] = REVW_BE(REVW_BE(ip[4]) ^ (x4 + kp[0])); |
++ op[5] = REVW_BE(REVW_BE(ip[5]) ^ (x5 + kp[1])); |
++ op[6] = REVW_BE(REVW_BE(ip[6]) ^ (x6 + kp[2])); |
++ op[7] = REVW_BE(REVW_BE(ip[7]) ^ (x7 + kp[3])); |
++ op[8] = REVW_BE(REVW_BE(ip[8]) ^ (x8 + kp[4])); |
++ op[9] = REVW_BE(REVW_BE(ip[9]) ^ (x9 + kp[5])); |
++ op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6])); |
++ op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7])); |
++	op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter+BPI*iters+(BPI-1)));
++ op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13)); |
++ op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[0])); |
++ op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[1])); |
++ s3 += ONE; |
++ ip += 16; |
++ op += 16; |
++#endif |
++ } |
++ |
++ for (iters = inlen%(BPI*64)/64; iters != 0; iters--) |
++ { |
++ vec v0 = s0, v1 = s1, v2 = s2, v3 = s3; |
++ for (i = CHACHA_RNDS/2; i; i--) |
++ { |
++ DQROUND_VECTORS(v0,v1,v2,v3); |
++ } |
++ WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3) |
++ s3 += ONE; |
++ ip += 16; |
++ op += 16; |
++ } |
++ |
++ inlen = inlen % 64; |
++ if (inlen) |
++ { |
++ __attribute__ ((aligned (16))) vec buf[4]; |
++ vec v0,v1,v2,v3; |
++ v0 = s0; v1 = s1; v2 = s2; v3 = s3; |
++ for (i = CHACHA_RNDS/2; i; i--) |
++ { |
++ DQROUND_VECTORS(v0,v1,v2,v3); |
++ } |
++ |
++ if (inlen >= 16) |
++ { |
++ STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0)); |
++ if (inlen >= 32) |
++ { |
++ STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1)); |
++ if (inlen >= 48) |
++ { |
++ STORE(op + 8, LOAD(ip + 8) ^ |
++ REVV_BE(v2 + s2)); |
++ buf[3] = REVV_BE(v3 + s3); |
++ } |
++ else |
++ buf[2] = REVV_BE(v2 + s2); |
++ } |
++ else |
++ buf[1] = REVV_BE(v1 + s1); |
++ } |
++ else |
++ buf[0] = REVV_BE(v0 + s0); |
++ |
++ for (i=inlen & ~15; i<inlen; i++) |
++ ((char *)op)[i] = ((char *)ip)[i] ^ ((char *)buf)[i]; |
++ } |
++ } |
++ |
++#endif /* !OPENSSL_NO_CHACHA */ |
+diff --git a/crypto/chacha/chachatest.c b/crypto/chacha/chachatest.c |
+new file mode 100644 |
+index 0000000..b2a9389 |
+--- /dev/null |
++++ b/crypto/chacha/chachatest.c |
+@@ -0,0 +1,211 @@ |
++/* |
++ * Chacha stream algorithm. |
++ * |
++ * Created on: Jun, 2013 |
++ * Author: Elie Bursztein (elieb@google.com) |
++ * |
++ * Adapted from the estream code by D. Bernstein. |
++ */ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++ |
++#include <stdio.h> |
++#include <stdlib.h> |
++#include <string.h> |
++#include <stdint.h> |
++ |
++#include <openssl/chacha.h> |
++ |
++struct chacha_test { |
++ const char *keyhex; |
++ const char *noncehex; |
++ const char *outhex; |
++}; |
++ |
++static const struct chacha_test chacha_tests[] = { |
++ { |
++ "0000000000000000000000000000000000000000000000000000000000000000", |
++ "0000000000000000", |
++ "76b8e0ada0f13d90405d6ae55386bd28bdd219b8a08ded1aa836efcc8b770dc7da41597c5157488d7724e03fb8d84a376a43b8f41518a11cc387b669b2ee6586", |
++ }, |
++ { |
++ "0000000000000000000000000000000000000000000000000000000000000001", |
++ "0000000000000000", |
++ "4540f05a9f1fb296d7736e7b208e3c96eb4fe1834688d2604f450952ed432d41bbe2a0b6ea7566d2a5d1e7e20d42af2c53d792b1c43fea817e9ad275ae546963", |
++ }, |
++ { |
++ "0000000000000000000000000000000000000000000000000000000000000000", |
++ "0000000000000001", |
++ "de9cba7bf3d69ef5e786dc63973f653a0b49e015adbff7134fcb7df137821031e85a050278a7084527214f73efc7fa5b5277062eb7a0433e445f41e31afab757", |
++ }, |
++ { |
++ "0000000000000000000000000000000000000000000000000000000000000000", |
++ "0100000000000000", |
++ "ef3fdfd6c61578fbf5cf35bd3dd33b8009631634d21e42ac33960bd138e50d32111e4caf237ee53ca8ad6426194a88545ddc497a0b466e7d6bbdb0041b2f586b", |
++ }, |
++ { |
++ "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", |
++ "0001020304050607", |
++ "f798a189f195e66982105ffb640bb7757f579da31602fc93ec01ac56f85ac3c134a4547b733b46413042c9440049176905d3be59ea1c53f15916155c2be8241a38008b9a26bc35941e2444177c8ade6689de95264986d95889fb60e84629c9bd9a5acb1cc118be563eb9b3a4a472f82e09a7e778492b562ef7130e88dfe031c79db9d4f7c7a899151b9a475032b63fc385245fe054e3dd5a97a5f576fe064025d3ce042c566ab2c507b138db853e3d6959660996546cc9c4a6eafdc777c040d70eaf46f76dad3979e5c5360c3317166a1c894c94a371876a94df7628fe4eaaf2ccb27d5aaae0ad7ad0f9d4b6ad3b54098746d4524d38407a6deb", |
++ }, |
++}; |
++ |
++static unsigned char hex_digit(char h) |
++ { |
++ if (h >= '0' && h <= '9') |
++ return h - '0'; |
++ else if (h >= 'a' && h <= 'f') |
++ return h - 'a' + 10; |
++ else if (h >= 'A' && h <= 'F') |
++ return h - 'A' + 10; |
++ else |
++ abort(); |
++ } |
++ |
++static void hex_decode(unsigned char *out, const char* hex) |
++ { |
++ size_t j = 0; |
++ |
++ while (*hex != 0) |
++ { |
++ unsigned char v = hex_digit(*hex++); |
++ v <<= 4; |
++ v |= hex_digit(*hex++); |
++ out[j++] = v; |
++ } |
++ } |
++ |
++static void hexdump(unsigned char *a, size_t len) |
++ { |
++ size_t i; |
++ |
++ for (i = 0; i < len; i++) |
++ printf("%02x", a[i]); |
++ } |
++ |
++/* misalign returns a pointer that points 0 to 15 bytes into |in| such that the |
++ * returned pointer has alignment 1 mod 16. */ |
++static void* misalign(void* in) |
++ { |
++ intptr_t x = (intptr_t) in; |
++ x += (17 - (x % 16)) % 16; |
++ return (void*) x; |
++ } |
++ |
++int main() |
++ { |
++ static const unsigned num_tests = |
++ sizeof(chacha_tests) / sizeof(struct chacha_test); |
++ unsigned i; |
++ unsigned char key_bytes[32 + 16]; |
++ unsigned char nonce_bytes[8 + 16] = {0}; |
++ |
++ unsigned char *key = misalign(key_bytes); |
++ unsigned char *nonce = misalign(nonce_bytes); |
++ |
++ for (i = 0; i < num_tests; i++) |
++ { |
++ const struct chacha_test *test = &chacha_tests[i]; |
++ unsigned char *expected, *out_bytes, *zero_bytes, *out, *zeros; |
++ size_t len = strlen(test->outhex); |
++ |
++ if (strlen(test->keyhex) != 32*2 || |
++ strlen(test->noncehex) != 8*2 || |
++ (len & 1) == 1) |
++ return 1; |
++ |
++ len /= 2; |
++ |
++ hex_decode(key, test->keyhex); |
++ hex_decode(nonce, test->noncehex); |
++ |
++ expected = malloc(len); |
++ out_bytes = malloc(len+16); |
++ zero_bytes = malloc(len+16); |
++ /* Attempt to test unaligned inputs. */ |
++ out = misalign(out_bytes); |
++ zeros = misalign(zero_bytes); |
++ memset(zeros, 0, len); |
++ |
++ hex_decode(expected, test->outhex); |
++ CRYPTO_chacha_20(out, zeros, len, key, nonce, 0); |
++ |
++ if (memcmp(out, expected, len) != 0) |
++ { |
++			printf("ChaCha20 test #%u failed.\n", i);
++ printf("got: "); |
++ hexdump(out, len); |
++ printf("\nexpected: "); |
++ hexdump(expected, len); |
++ printf("\n"); |
++ return 1; |
++ } |
++ |
++ /* The last test has a large output. We test whether the |
++ * counter works as expected by skipping the first 64 bytes of |
++ * it. */ |
++ if (i == num_tests - 1) |
++ { |
++ CRYPTO_chacha_20(out, zeros, len - 64, key, nonce, 1); |
++ if (memcmp(out, expected + 64, len - 64) != 0) |
++ { |
++ printf("ChaCha20 skip test failed.\n"); |
++ return 1; |
++ } |
++ } |
++ |
++ free(expected); |
++ free(zero_bytes); |
++ free(out_bytes); |
++ } |
++ |
++ |
++ printf("PASS\n"); |
++ return 0; |
++ } |
+diff --git a/crypto/evp/Makefile b/crypto/evp/Makefile |
+index b73038d..86b0504 100644 |
+--- a/crypto/evp/Makefile |
++++ b/crypto/evp/Makefile |
+@@ -29,7 +29,8 @@ LIBSRC= encode.c digest.c evp_enc.c evp_key.c evp_acnf.c evp_cnf.c \ |
+ c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \ |
+ evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \ |
+ e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c evp_fips.c \ |
+- e_aes_cbc_hmac_sha1.c e_rc4_hmac_md5.c evp_aead.c |
++ e_aes_cbc_hmac_sha1.c e_rc4_hmac_md5.c evp_aead.c \ |
++ e_chacha20poly1305.c |
+ |
+ LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \ |
+ e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\ |
+@@ -42,7 +43,7 @@ LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \ |
+ c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \ |
+ evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \ |
+ e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o evp_fips.o \ |
+- e_aes_cbc_hmac_sha1.o e_rc4_hmac_md5.o evp_aead.o |
++ e_aes_cbc_hmac_sha1.o e_rc4_hmac_md5.o evp_aead.o e_chacha20poly1305.o |
+ |
+ SRC= $(LIBSRC) |
+ |
+@@ -239,6 +240,21 @@ e_cast.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h |
+ e_cast.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h |
+ e_cast.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h |
+ e_cast.o: ../../include/openssl/symhacks.h ../cryptlib.h e_cast.c evp_locl.h |
++e_chacha20poly1305.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h |
++e_chacha20poly1305.o: ../../include/openssl/chacha.h |
++e_chacha20poly1305.o: ../../include/openssl/crypto.h |
++e_chacha20poly1305.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h |
++e_chacha20poly1305.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h |
++e_chacha20poly1305.o: ../../include/openssl/obj_mac.h |
++e_chacha20poly1305.o: ../../include/openssl/objects.h |
++e_chacha20poly1305.o: ../../include/openssl/opensslconf.h |
++e_chacha20poly1305.o: ../../include/openssl/opensslv.h |
++e_chacha20poly1305.o: ../../include/openssl/ossl_typ.h |
++e_chacha20poly1305.o: ../../include/openssl/poly1305.h |
++e_chacha20poly1305.o: ../../include/openssl/safestack.h |
++e_chacha20poly1305.o: ../../include/openssl/stack.h |
++e_chacha20poly1305.o: ../../include/openssl/symhacks.h e_chacha20poly1305.c |
++e_chacha20poly1305.o: evp_locl.h |
+ e_des.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h |
+ e_des.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h |
+ e_des.o: ../../include/openssl/des.h ../../include/openssl/des_old.h |
+@@ -258,9 +274,10 @@ e_des3.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h |
+ e_des3.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h |
+ e_des3.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h |
+ e_des3.o: ../../include/openssl/ossl_typ.h ../../include/openssl/rand.h |
+-e_des3.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h |
+-e_des3.o: ../../include/openssl/symhacks.h ../../include/openssl/ui.h |
+-e_des3.o: ../../include/openssl/ui_compat.h ../cryptlib.h e_des3.c evp_locl.h |
++e_des3.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h |
++e_des3.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h |
++e_des3.o: ../../include/openssl/ui.h ../../include/openssl/ui_compat.h |
++e_des3.o: ../cryptlib.h e_des3.c evp_locl.h |
+ e_idea.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h |
+ e_idea.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h |
+ e_idea.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h |
+@@ -356,6 +373,14 @@ evp_acnf.o: ../../include/openssl/opensslconf.h |
+ evp_acnf.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h |
+ evp_acnf.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h |
+ evp_acnf.o: ../../include/openssl/symhacks.h ../cryptlib.h evp_acnf.c |
++evp_aead.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h |
++evp_aead.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h |
++evp_aead.o: ../../include/openssl/err.h ../../include/openssl/evp.h |
++evp_aead.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h |
++evp_aead.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h |
++evp_aead.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h |
++evp_aead.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h |
++evp_aead.o: ../../include/openssl/symhacks.h evp_aead.c |
+ evp_cnf.o: ../../e_os.h ../../include/openssl/asn1.h |
+ evp_cnf.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h |
+ evp_cnf.o: ../../include/openssl/conf.h ../../include/openssl/crypto.h |
+diff --git a/crypto/evp/e_chacha20poly1305.c b/crypto/evp/e_chacha20poly1305.c |
+new file mode 100644 |
+index 0000000..1c0c0fb |
+--- /dev/null |
++++ b/crypto/evp/e_chacha20poly1305.c |
+@@ -0,0 +1,261 @@ |
++/* ==================================================================== |
++ * Copyright (c) 2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * openssl-core@openssl.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.openssl.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ * |
++ */ |
++ |
++#include <stdint.h> |
++#include <string.h> |
++#include <openssl/opensslconf.h> |
++ |
++#if !defined(OPENSSL_NO_CHACHA) && !defined(OPENSSL_NO_POLY1305) |
++ |
++#include <openssl/chacha.h> |
++#include <openssl/poly1305.h> |
++#include <openssl/evp.h> |
++#include <openssl/err.h> |
++#include "evp_locl.h" |
++ |
++#define POLY1305_TAG_LEN 16 |
++#define CHACHA20_NONCE_LEN 8 |
++ |
++struct aead_chacha20_poly1305_ctx |
++ { |
++ unsigned char key[32]; |
++ unsigned char tag_len; |
++ }; |
++ |
++static int aead_chacha20_poly1305_init(EVP_AEAD_CTX *ctx, const unsigned char *key, size_t key_len, size_t tag_len) |
++ { |
++ struct aead_chacha20_poly1305_ctx *c20_ctx; |
++ |
++ if (tag_len == 0) |
++ tag_len = POLY1305_TAG_LEN; |
++ |
++ if (tag_len > POLY1305_TAG_LEN) |
++ { |
++ EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_INIT, EVP_R_TOO_LARGE); |
++ return 0; |
++ } |
++ |
++ if (key_len != sizeof(c20_ctx->key)) |
++ return 0; /* internal error - EVP_AEAD_CTX_init should catch this. */ |
++ |
++ c20_ctx = OPENSSL_malloc(sizeof(struct aead_chacha20_poly1305_ctx)); |
++ if (c20_ctx == NULL) |
++ return 0; |
++ |
++ memcpy(&c20_ctx->key[0], key, key_len); |
++ c20_ctx->tag_len = tag_len; |
++ ctx->aead_state = c20_ctx; |
++ |
++ return 1; |
++ } |
++ |
++static void aead_chacha20_poly1305_cleanup(EVP_AEAD_CTX *ctx) |
++ { |
++ struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state; |
++ OPENSSL_cleanse(c20_ctx->key, sizeof(c20_ctx->key)); |
++ OPENSSL_free(c20_ctx); |
++ } |
++ |
++static void poly1305_update_with_length(poly1305_state *poly1305, |
++ const unsigned char *data, size_t data_len) |
++ { |
++ size_t j = data_len; |
++ unsigned char length_bytes[8]; |
++ unsigned i; |
++ |
++ for (i = 0; i < sizeof(length_bytes); i++) |
++ { |
++ length_bytes[i] = j; |
++ j >>= 8; |
++ } |
++ |
++ CRYPTO_poly1305_update(poly1305, data, data_len); |
++ CRYPTO_poly1305_update(poly1305, length_bytes, sizeof(length_bytes)); |
++} |
++ |
++static ssize_t aead_chacha20_poly1305_seal(const EVP_AEAD_CTX *ctx, |
++ unsigned char *out, size_t max_out_len, |
++ const unsigned char *nonce, size_t nonce_len, |
++ const unsigned char *in, size_t in_len, |
++ const unsigned char *ad, size_t ad_len) |
++ { |
++ const struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state; |
++ unsigned char poly1305_key[32]; |
++ poly1305_state poly1305; |
++ const uint64_t in_len_64 = in_len; |
++ |
++ /* The underlying ChaCha implementation may not overflow the block |
++ * counter into the second counter word. Therefore we disallow |
++	 * individual operations that work on more than 256GB at a time.
++ * |in_len_64| is needed because, on 32-bit platforms, size_t is only |
++ * 32-bits and this produces a warning because it's always false. |
++ * Casting to uint64_t inside the conditional is not sufficient to stop |
++ * the warning. */ |
++ if (in_len_64 >= (1ull << 32)*64-64) |
++ { |
++ EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_SEAL, EVP_R_TOO_LARGE); |
++ return -1; |
++ } |
++ |
++ if (max_out_len < in_len + c20_ctx->tag_len) |
++ { |
++ EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_SEAL, EVP_R_BUFFER_TOO_SMALL); |
++ return -1; |
++ } |
++ |
++ if (nonce_len != CHACHA20_NONCE_LEN) |
++ { |
++ EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_SEAL, EVP_R_IV_TOO_LARGE); |
++ return -1; |
++ } |
++ |
++ memset(poly1305_key, 0, sizeof(poly1305_key)); |
++ CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key), c20_ctx->key, nonce, 0); |
++ |
++ CRYPTO_poly1305_init(&poly1305, poly1305_key); |
++ poly1305_update_with_length(&poly1305, ad, ad_len); |
++ CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce, 1); |
++ poly1305_update_with_length(&poly1305, out, in_len); |
++ |
++ if (c20_ctx->tag_len != POLY1305_TAG_LEN) |
++ { |
++ unsigned char tag[POLY1305_TAG_LEN]; |
++ CRYPTO_poly1305_finish(&poly1305, tag); |
++ memcpy(out + in_len, tag, c20_ctx->tag_len); |
++ return in_len + c20_ctx->tag_len; |
++ } |
++ |
++ CRYPTO_poly1305_finish(&poly1305, out + in_len); |
++ return in_len + POLY1305_TAG_LEN; |
++ } |
++ |
++static ssize_t aead_chacha20_poly1305_open(const EVP_AEAD_CTX *ctx, |
++ unsigned char *out, size_t max_out_len, |
++ const unsigned char *nonce, size_t nonce_len, |
++ const unsigned char *in, size_t in_len, |
++ const unsigned char *ad, size_t ad_len) |
++ { |
++ const struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state; |
++ unsigned char mac[POLY1305_TAG_LEN]; |
++ unsigned char poly1305_key[32]; |
++ size_t out_len; |
++ poly1305_state poly1305; |
++ const uint64_t in_len_64 = in_len; |
++ |
++ if (in_len < c20_ctx->tag_len) |
++ { |
++ EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_OPEN, EVP_R_BAD_DECRYPT); |
++ return -1; |
++ } |
++ |
++ /* The underlying ChaCha implementation may not overflow the block |
++ * counter into the second counter word. Therefore we disallow |
++	 * individual operations that work on more than 256GB at a time.
++ * |in_len_64| is needed because, on 32-bit platforms, size_t is only |
++ * 32-bits and this produces a warning because it's always false. |
++ * Casting to uint64_t inside the conditional is not sufficient to stop |
++ * the warning. */ |
++ if (in_len_64 >= (1ull << 32)*64-64) |
++ { |
++		EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_OPEN, EVP_R_TOO_LARGE);
++ return -1; |
++ } |
++ |
++ if (nonce_len != CHACHA20_NONCE_LEN) |
++ { |
++ EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_OPEN, EVP_R_IV_TOO_LARGE); |
++ return -1; |
++ } |
++ |
++ out_len = in_len - c20_ctx->tag_len; |
++ |
++ if (max_out_len < out_len) |
++ { |
++ EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_OPEN, EVP_R_BUFFER_TOO_SMALL); |
++ return -1; |
++ } |
++ |
++ memset(poly1305_key, 0, sizeof(poly1305_key)); |
++ CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key), c20_ctx->key, nonce, 0); |
++ |
++ CRYPTO_poly1305_init(&poly1305, poly1305_key); |
++ poly1305_update_with_length(&poly1305, ad, ad_len); |
++ poly1305_update_with_length(&poly1305, in, out_len); |
++ CRYPTO_poly1305_finish(&poly1305, mac); |
++ |
++ if (CRYPTO_memcmp(mac, in + out_len, c20_ctx->tag_len) != 0) |
++ { |
++ EVPerr(EVP_F_AEAD_CHACHA20_POLY1305_OPEN, EVP_R_BAD_DECRYPT); |
++ return -1; |
++ } |
++ |
++ CRYPTO_chacha_20(out, in, out_len, c20_ctx->key, nonce, 1); |
++ return out_len; |
++ } |
++ |
++static const EVP_AEAD aead_chacha20_poly1305 = |
++ { |
++ 32, /* key len */ |
++ CHACHA20_NONCE_LEN, /* nonce len */ |
++ POLY1305_TAG_LEN, /* overhead */ |
++ POLY1305_TAG_LEN, /* max tag length */ |
++ |
++ aead_chacha20_poly1305_init, |
++ aead_chacha20_poly1305_cleanup, |
++ aead_chacha20_poly1305_seal, |
++ aead_chacha20_poly1305_open, |
++ }; |
++ |
++const EVP_AEAD *EVP_aead_chacha20_poly1305() |
++ { |
++ return &aead_chacha20_poly1305; |
++ } |
++ |
++#endif /* !OPENSSL_NO_CHACHA && !OPENSSL_NO_POLY1305 */ |
+diff --git a/crypto/evp/evp.h b/crypto/evp/evp.h |
+index bd10642..7dc1656 100644 |
+--- a/crypto/evp/evp.h |
++++ b/crypto/evp/evp.h |
+@@ -1258,6 +1258,11 @@ typedef struct evp_aead_st EVP_AEAD; |
+ const EVP_AEAD *EVP_aead_aes_128_gcm(void); |
+ #endif |
+ |
++#if !defined(OPENSSL_NO_CHACHA) && !defined(OPENSSL_NO_POLY1305) |
++/* EVP_aead_chacha20_poly1305 is ChaCha20 with a Poly1305 authenticator. */ |
++const EVP_AEAD *EVP_aead_chacha20_poly1305(void); |
++#endif |
++ |
+ /* EVP_AEAD_key_length returns the length, in bytes, of the keys used by |
+ * |aead|. */ |
+ size_t EVP_AEAD_key_length(const EVP_AEAD *aead); |
+@@ -1360,6 +1365,9 @@ void ERR_load_EVP_strings(void); |
+ #define EVP_F_AEAD_AES_128_GCM_INIT 183 |
+ #define EVP_F_AEAD_AES_128_GCM_OPEN 181 |
+ #define EVP_F_AEAD_AES_128_GCM_SEAL 182 |
++#define EVP_F_AEAD_CHACHA20_POLY1305_INIT 187 |
++#define EVP_F_AEAD_CHACHA20_POLY1305_OPEN 184 |
++#define EVP_F_AEAD_CHACHA20_POLY1305_SEAL 188
+ #define EVP_F_AEAD_CTX_OPEN 185 |
+ #define EVP_F_AEAD_CTX_SEAL 186 |
+ #define EVP_F_AESNI_INIT_KEY 165 |
+diff --git a/crypto/evp/evp_err.c b/crypto/evp/evp_err.c |
+index c47969c..fb747e5 100644 |
+--- a/crypto/evp/evp_err.c |
++++ b/crypto/evp/evp_err.c |
+@@ -73,6 +73,9 @@ static ERR_STRING_DATA EVP_str_functs[]= |
+ {ERR_FUNC(EVP_F_AEAD_AES_128_GCM_INIT), "AEAD_AES_128_GCM_INIT"}, |
+ {ERR_FUNC(EVP_F_AEAD_AES_128_GCM_OPEN), "AEAD_AES_128_GCM_OPEN"}, |
+ {ERR_FUNC(EVP_F_AEAD_AES_128_GCM_SEAL), "AEAD_AES_128_GCM_SEAL"}, |
++{ERR_FUNC(EVP_F_AEAD_CHACHA20_POLY1305_INIT), "AEAD_CHACHA20_POLY1305_INIT"}, |
++{ERR_FUNC(EVP_F_AEAD_CHACHA20_POLY1305_OPEN), "AEAD_CHACHA20_POLY1305_OPEN"}, |
++{ERR_FUNC(EVP_F_AEAD_CHACHA20_POLY1305_SEAL), "AEAD_CHACHA20_POLY1305_SEAL"}, |
+ {ERR_FUNC(EVP_F_AEAD_CTX_OPEN), "AEAD_CTX_OPEN"}, |
+ {ERR_FUNC(EVP_F_AEAD_CTX_SEAL), "AEAD_CTX_SEAL"}, |
+ {ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"}, |
+diff --git a/crypto/poly1305/Makefile b/crypto/poly1305/Makefile |
+new file mode 100644 |
+index 0000000..397d7cd |
+--- /dev/null |
++++ b/crypto/poly1305/Makefile |
+@@ -0,0 +1,81 @@ |
++# |
++# OpenSSL/crypto/poly1305/Makefile |
++# |
++ |
++DIR= poly1305 |
++TOP= ../.. |
++CC= cc |
++CPP= $(CC) -E |
++INCLUDES= |
++CFLAG=-g |
++AR= ar r |
++ |
++POLY1305=poly1305_vec.o |
++ |
++CFLAGS= $(INCLUDES) $(CFLAG) |
++ASFLAGS= $(INCLUDES) $(ASFLAG) |
++AFLAGS= $(ASFLAGS) |
++ |
++GENERAL=Makefile |
++TEST= |
++APPS= |
++ |
++LIB=$(TOP)/libcrypto.a |
++LIBSRC=poly1305_vec.c |
++LIBOBJ=$(POLY1305) |
++ |
++SRC= $(LIBSRC) |
++ |
++EXHEADER=poly1305.h |
++HEADER= $(EXHEADER) |
++ |
++ALL= $(GENERAL) $(SRC) $(HEADER) |
++ |
++top: |
++ (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) |
++ |
++all: lib |
++ |
++lib: $(LIBOBJ) |
++ $(AR) $(LIB) $(LIBOBJ) |
++ $(RANLIB) $(LIB) || echo Never mind. |
++ @touch lib |
++ |
++files: |
++ $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO |
++ |
++links: |
++ @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) |
++ @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) |
++ @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) |
++ |
++install: |
++ @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... |
++ @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ |
++ do \ |
++ (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ |
++ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ |
++ done; |
++ |
++tags: |
++ ctags $(SRC) |
++ |
++tests: |
++ |
++lint: |
++ lint -DLINT $(INCLUDES) $(SRC)>fluff |
++ |
++depend: |
++ @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile... |
++ $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) |
++ |
++dclean: |
++ $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new |
++ mv -f Makefile.new $(MAKEFILE) |
++ |
++clean: |
++ rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff |
++ |
++# DO NOT DELETE THIS LINE -- make depend depends on it. |
++ |
++poly1305_vec.o: ../../include/openssl/poly1305.h poly1305_vec.c |
+diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c |
+new file mode 100644 |
+index 0000000..2e5621d |
+--- /dev/null |
++++ b/crypto/poly1305/poly1305.c |
+@@ -0,0 +1,320 @@ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++ |
++/* This implementation of poly1305 is by Andrew Moon |
++ * (https://github.com/floodyberry/poly1305-donna) and released as public |
++ * domain. */ |
++ |
++#include <string.h> |
++#include <stdint.h> |
++#include <openssl/opensslconf.h> |
++ |
++#if !defined(OPENSSL_NO_POLY1305) |
++ |
++#include <openssl/poly1305.h> |
++ |
++#if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) |
++/* We can assume little-endian. */ |
++static uint32_t U8TO32_LE(const unsigned char *m) |
++ { |
++ uint32_t r; |
++ memcpy(&r, m, sizeof(r)); |
++ return r; |
++ } |
++ |
++static void U32TO8_LE(unsigned char *m, uint32_t v) |
++ { |
++ memcpy(m, &v, sizeof(v)); |
++ } |
++#else |
++static uint32_t U8TO32_LE(const unsigned char *m) |
++ { |
++ return (uint32_t)m[0] | |
++ (uint32_t)m[1] << 8 | |
++ (uint32_t)m[2] << 16 | |
++ (uint32_t)m[3] << 24; |
++ } |
++ |
++static void U32TO8_LE(unsigned char *m, uint32_t v) |
++ { |
++ m[0] = v; |
++ m[1] = v >> 8; |
++ m[2] = v >> 16; |
++ m[3] = v >> 24; |
++ } |
++#endif |
++ |
++static uint64_t |
++mul32x32_64(uint32_t a, uint32_t b) |
++ { |
++ return (uint64_t)a * b; |
++ } |
++ |
++ |
++struct poly1305_state_st |
++ { |
++ uint32_t r0,r1,r2,r3,r4; |
++ uint32_t s1,s2,s3,s4; |
++ uint32_t h0,h1,h2,h3,h4; |
++ unsigned char buf[16]; |
++ unsigned int buf_used; |
++ unsigned char key[16]; |
++ }; |
++ |
++/* poly1305_blocks updates |state| given some amount of input data. This |
++ * function may only be called with a |len| that is not a multiple of 16 at the |
++ * end of the data. Otherwise the input must be buffered into 16 byte blocks. |
++ * */ |
++static void poly1305_update(struct poly1305_state_st *state, |
++ const unsigned char *in, size_t len) |
++ { |
++ uint32_t t0,t1,t2,t3; |
++ uint64_t t[5]; |
++ uint32_t b; |
++ uint64_t c; |
++ size_t j; |
++ unsigned char mp[16]; |
++ |
++ if (len < 16) |
++ goto poly1305_donna_atmost15bytes; |
++ |
++poly1305_donna_16bytes: |
++ t0 = U8TO32_LE(in); |
++ t1 = U8TO32_LE(in+4); |
++ t2 = U8TO32_LE(in+8); |
++ t3 = U8TO32_LE(in+12); |
++ |
++ in += 16; |
++ len -= 16; |
++ |
++ state->h0 += t0 & 0x3ffffff; |
++ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; |
++ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; |
++ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; |
++ state->h4 += (t3 >> 8) | (1 << 24); |
++ |
++poly1305_donna_mul: |
++ t[0] = mul32x32_64(state->h0,state->r0) + |
++ mul32x32_64(state->h1,state->s4) + |
++ mul32x32_64(state->h2,state->s3) + |
++ mul32x32_64(state->h3,state->s2) + |
++ mul32x32_64(state->h4,state->s1); |
++ t[1] = mul32x32_64(state->h0,state->r1) + |
++ mul32x32_64(state->h1,state->r0) + |
++ mul32x32_64(state->h2,state->s4) + |
++ mul32x32_64(state->h3,state->s3) + |
++ mul32x32_64(state->h4,state->s2); |
++ t[2] = mul32x32_64(state->h0,state->r2) + |
++ mul32x32_64(state->h1,state->r1) + |
++ mul32x32_64(state->h2,state->r0) + |
++ mul32x32_64(state->h3,state->s4) + |
++ mul32x32_64(state->h4,state->s3); |
++ t[3] = mul32x32_64(state->h0,state->r3) + |
++ mul32x32_64(state->h1,state->r2) + |
++ mul32x32_64(state->h2,state->r1) + |
++ mul32x32_64(state->h3,state->r0) + |
++ mul32x32_64(state->h4,state->s4); |
++ t[4] = mul32x32_64(state->h0,state->r4) + |
++ mul32x32_64(state->h1,state->r3) + |
++ mul32x32_64(state->h2,state->r2) + |
++ mul32x32_64(state->h3,state->r1) + |
++ mul32x32_64(state->h4,state->r0); |
++ |
++ state->h0 = (uint32_t)t[0] & 0x3ffffff; c = (t[0] >> 26); |
++ t[1] += c; state->h1 = (uint32_t)t[1] & 0x3ffffff; b = (uint32_t)(t[1] >> 26); |
++ t[2] += b; state->h2 = (uint32_t)t[2] & 0x3ffffff; b = (uint32_t)(t[2] >> 26); |
++ t[3] += b; state->h3 = (uint32_t)t[3] & 0x3ffffff; b = (uint32_t)(t[3] >> 26); |
++ t[4] += b; state->h4 = (uint32_t)t[4] & 0x3ffffff; b = (uint32_t)(t[4] >> 26); |
++ state->h0 += b * 5; |
++ |
++ if (len >= 16) |
++ goto poly1305_donna_16bytes; |
++ |
++ /* final bytes */ |
++poly1305_donna_atmost15bytes: |
++ if (!len) |
++ return; |
++ |
++ for (j = 0; j < len; j++) |
++ mp[j] = in[j]; |
++ mp[j++] = 1; |
++ for (; j < 16; j++) |
++ mp[j] = 0; |
++ len = 0; |
++ |
++ t0 = U8TO32_LE(mp+0); |
++ t1 = U8TO32_LE(mp+4); |
++ t2 = U8TO32_LE(mp+8); |
++ t3 = U8TO32_LE(mp+12); |
++ |
++ state->h0 += t0 & 0x3ffffff; |
++ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; |
++ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; |
++ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; |
++ state->h4 += (t3 >> 8); |
++ |
++ goto poly1305_donna_mul; |
++ } |
++ |
++void CRYPTO_poly1305_init(poly1305_state *statep, const unsigned char key[32]) |
++ { |
++ struct poly1305_state_st *state = (struct poly1305_state_st*) statep; |
++ uint32_t t0,t1,t2,t3; |
++ |
++ t0 = U8TO32_LE(key+0); |
++ t1 = U8TO32_LE(key+4); |
++ t2 = U8TO32_LE(key+8); |
++ t3 = U8TO32_LE(key+12); |
++ |
++ /* precompute multipliers */ |
++ state->r0 = t0 & 0x3ffffff; t0 >>= 26; t0 |= t1 << 6; |
++ state->r1 = t0 & 0x3ffff03; t1 >>= 20; t1 |= t2 << 12; |
++ state->r2 = t1 & 0x3ffc0ff; t2 >>= 14; t2 |= t3 << 18; |
++ state->r3 = t2 & 0x3f03fff; t3 >>= 8; |
++ state->r4 = t3 & 0x00fffff; |
++ |
++ state->s1 = state->r1 * 5; |
++ state->s2 = state->r2 * 5; |
++ state->s3 = state->r3 * 5; |
++ state->s4 = state->r4 * 5; |
++ |
++ /* init state */ |
++ state->h0 = 0; |
++ state->h1 = 0; |
++ state->h2 = 0; |
++ state->h3 = 0; |
++ state->h4 = 0; |
++ |
++ state->buf_used = 0; |
++ memcpy(state->key, key + 16, sizeof(state->key)); |
++ } |
++ |
++void CRYPTO_poly1305_update(poly1305_state *statep, const unsigned char *in, |
++ size_t in_len) |
++ { |
++ unsigned int i; |
++ struct poly1305_state_st *state = (struct poly1305_state_st*) statep; |
++ |
++ if (state->buf_used) |
++ { |
++ unsigned int todo = 16 - state->buf_used; |
++ if (todo > in_len) |
++ todo = in_len; |
++ for (i = 0; i < todo; i++) |
++ state->buf[state->buf_used + i] = in[i]; |
++ state->buf_used += todo; |
++ in_len -= todo; |
++ in += todo; |
++ |
++ if (state->buf_used == 16) |
++ { |
++ poly1305_update(state, state->buf, 16); |
++ state->buf_used = 0; |
++ } |
++ } |
++ |
++ if (in_len >= 16) |
++ { |
++ size_t todo = in_len & ~0xf; |
++ poly1305_update(state, in, todo); |
++ in += todo; |
++ in_len &= 0xf; |
++ } |
++ |
++ if (in_len) |
++ { |
++ for (i = 0; i < in_len; i++) |
++ state->buf[i] = in[i]; |
++ state->buf_used = in_len; |
++ } |
++ } |
++ |
++void CRYPTO_poly1305_finish(poly1305_state *statep, unsigned char mac[16]) |
++ { |
++ struct poly1305_state_st *state = (struct poly1305_state_st*) statep; |
++ uint64_t f0,f1,f2,f3; |
++ uint32_t g0,g1,g2,g3,g4; |
++ uint32_t b, nb; |
++ |
++ if (state->buf_used) |
++ poly1305_update(state, state->buf, state->buf_used); |
++ |
++ b = state->h0 >> 26; state->h0 = state->h0 & 0x3ffffff; |
++ state->h1 += b; b = state->h1 >> 26; state->h1 = state->h1 & 0x3ffffff; |
++ state->h2 += b; b = state->h2 >> 26; state->h2 = state->h2 & 0x3ffffff; |
++ state->h3 += b; b = state->h3 >> 26; state->h3 = state->h3 & 0x3ffffff; |
++ state->h4 += b; b = state->h4 >> 26; state->h4 = state->h4 & 0x3ffffff; |
++ state->h0 += b * 5; |
++ |
++ g0 = state->h0 + 5; b = g0 >> 26; g0 &= 0x3ffffff; |
++ g1 = state->h1 + b; b = g1 >> 26; g1 &= 0x3ffffff; |
++ g2 = state->h2 + b; b = g2 >> 26; g2 &= 0x3ffffff; |
++ g3 = state->h3 + b; b = g3 >> 26; g3 &= 0x3ffffff; |
++ g4 = state->h4 + b - (1 << 26); |
++ |
++ b = (g4 >> 31) - 1; |
++ nb = ~b; |
++ state->h0 = (state->h0 & nb) | (g0 & b); |
++ state->h1 = (state->h1 & nb) | (g1 & b); |
++ state->h2 = (state->h2 & nb) | (g2 & b); |
++ state->h3 = (state->h3 & nb) | (g3 & b); |
++ state->h4 = (state->h4 & nb) | (g4 & b); |
++ |
++ f0 = ((state->h0 ) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); |
++ f1 = ((state->h1 >> 6) | (state->h2 << 20)) + (uint64_t)U8TO32_LE(&state->key[4]); |
++ f2 = ((state->h2 >> 12) | (state->h3 << 14)) + (uint64_t)U8TO32_LE(&state->key[8]); |
++ f3 = ((state->h3 >> 18) | (state->h4 << 8)) + (uint64_t)U8TO32_LE(&state->key[12]); |
++ |
++ U32TO8_LE(&mac[ 0], f0); f1 += (f0 >> 32); |
++ U32TO8_LE(&mac[ 4], f1); f2 += (f1 >> 32); |
++ U32TO8_LE(&mac[ 8], f2); f3 += (f2 >> 32); |
++ U32TO8_LE(&mac[12], f3); |
++ } |
++ |
++#endif /* !OPENSSL_NO_POLY1305 */ |
+diff --git a/crypto/poly1305/poly1305.h b/crypto/poly1305/poly1305.h |
+new file mode 100644 |
+index 0000000..28f85ed |
+--- /dev/null |
++++ b/crypto/poly1305/poly1305.h |
+@@ -0,0 +1,88 @@ |
++/* |
++ * Poly1305 |
++ * |
++ * Created on: Jun, 2013 |
++ * Author: Elie Bursztein (elieb@google.com) |
++ * |
++ * Adapted from the estream code by D. Bernstein. |
++ */ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++ |
++#ifndef HEADER_POLY1305_H_ |
++#define HEADER_POLY1305_H_ |
++ |
++#include <stdint.h> |
++#include <openssl/opensslconf.h> |
++ |
++#if defined(OPENSSL_NO_POLY1305) |
++#error Poly1305 support is disabled. |
++#endif |
++ |
++typedef unsigned char poly1305_state[512]; |
++ |
++/* poly1305_init sets up |state| so that it can be used to calculate an |
++ * authentication tag with the one-time key |key|. Note that |key| is a |
++ * one-time key and therefore there is no `reset' method because that would |
++ * enable several messages to be authenticated with the same key. */ |
++extern void CRYPTO_poly1305_init(poly1305_state* state, |
++ const unsigned char key[32]); |
++ |
++/* poly1305_update processes |in_len| bytes from |in|. It can be called zero or |
++ * more times after poly1305_init. */ |
++extern void CRYPTO_poly1305_update(poly1305_state* state, |
++ const unsigned char *in, |
++ size_t in_len); |
++ |
++/* poly1305_finish completes the poly1305 calculation and writes a 16 byte |
++ * authentication tag to |mac|. */ |
++extern void CRYPTO_poly1305_finish(poly1305_state* state, |
++ unsigned char mac[16]); |
++ |
++#endif /* HEADER_POLY1305_H_ */ |
+diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c |
+new file mode 100644 |
+index 0000000..adcef35 |
+--- /dev/null |
++++ b/crypto/poly1305/poly1305_arm.c |
+@@ -0,0 +1,335 @@ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++ |
++/* This implementation was taken from the public domain, neon2 version in |
++ * SUPERCOP by D. J. Bernstein and Peter Schwabe. */ |
++ |
++#include <stdint.h> |
++ |
++#include <openssl/poly1305.h> |
++ |
++#if !defined(OPENSSL_NO_POLY1305) |
++ |
++typedef struct { |
++ uint32_t v[12]; /* for alignment; only using 10 */ |
++} fe1305x2; |
++ |
++#define addmulmod openssl_poly1305_neon2_addmulmod |
++#define blocks openssl_poly1305_neon2_blocks |
++ |
++extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, const fe1305x2 *c); |
++ |
++extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const unsigned char *in, unsigned int inlen); |
++ |
++static void freeze(fe1305x2 *r) |
++ { |
++ int i; |
++ |
++ uint32_t x0 = r->v[0]; |
++ uint32_t x1 = r->v[2]; |
++ uint32_t x2 = r->v[4]; |
++ uint32_t x3 = r->v[6]; |
++ uint32_t x4 = r->v[8]; |
++ uint32_t y0; |
++ uint32_t y1; |
++ uint32_t y2; |
++ uint32_t y3; |
++ uint32_t y4; |
++ uint32_t swap; |
++ |
++ for (i = 0;i < 3;++i) |
++ { |
++ x1 += x0 >> 26; x0 &= 0x3ffffff; |
++ x2 += x1 >> 26; x1 &= 0x3ffffff; |
++ x3 += x2 >> 26; x2 &= 0x3ffffff; |
++ x4 += x3 >> 26; x3 &= 0x3ffffff; |
++ x0 += 5*(x4 >> 26); x4 &= 0x3ffffff; |
++ } |
++ |
++ y0 = x0 + 5; |
++ y1 = x1 + (y0 >> 26); y0 &= 0x3ffffff; |
++ y2 = x2 + (y1 >> 26); y1 &= 0x3ffffff; |
++ y3 = x3 + (y2 >> 26); y2 &= 0x3ffffff; |
++ y4 = x4 + (y3 >> 26); y3 &= 0x3ffffff; |
++ swap = -(y4 >> 26); y4 &= 0x3ffffff; |
++ |
++ y0 ^= x0; |
++ y1 ^= x1; |
++ y2 ^= x2; |
++ y3 ^= x3; |
++ y4 ^= x4; |
++ |
++ y0 &= swap; |
++ y1 &= swap; |
++ y2 &= swap; |
++ y3 &= swap; |
++ y4 &= swap; |
++ |
++ y0 ^= x0; |
++ y1 ^= x1; |
++ y2 ^= x2; |
++ y3 ^= x3; |
++ y4 ^= x4; |
++ |
++ r->v[0] = y0; |
++ r->v[2] = y1; |
++ r->v[4] = y2; |
++ r->v[6] = y3; |
++ r->v[8] = y4; |
++ } |
++ |
++static void fe1305x2_tobytearray(unsigned char *r, fe1305x2 *x) |
++ { |
++ uint32_t x0 = x->v[0]; |
++ uint32_t x1 = x->v[2]; |
++ uint32_t x2 = x->v[4]; |
++ uint32_t x3 = x->v[6]; |
++ uint32_t x4 = x->v[8]; |
++ |
++ x1 += x0 >> 26; |
++ x0 &= 0x3ffffff; |
++ x2 += x1 >> 26; |
++ x1 &= 0x3ffffff; |
++ x3 += x2 >> 26; |
++ x2 &= 0x3ffffff; |
++ x4 += x3 >> 26; |
++ x3 &= 0x3ffffff; |
++ |
++ *(uint32_t *) r = x0 + (x1 << 26); |
++ *(uint32_t *) (r + 4) = (x1 >> 6) + (x2 << 20); |
++ *(uint32_t *) (r + 8) = (x2 >> 12) + (x3 << 14); |
++ *(uint32_t *) (r + 12) = (x3 >> 18) + (x4 << 8); |
++ } |
++ |
++/* load32 exists to avoid breaking strict aliasing rules in |
++ * fe1305x2_frombytearray. */ |
++static uint32_t load32(unsigned char *t) |
++ { |
++ uint32_t tmp; |
++ memcpy(&tmp, t, sizeof(tmp)); |
++ return tmp; |
++ } |
++ |
++static void fe1305x2_frombytearray(fe1305x2 *r, const unsigned char *x, unsigned long long xlen) |
++ { |
++ int i; |
++ unsigned char t[17]; |
++ |
++ for (i = 0; (i < 16) && (i < xlen); i++) |
++ t[i] = x[i]; |
++ xlen -= i; |
++ x += i; |
++ t[i++] = 1; |
++ for (; i<17; i++) |
++ t[i] = 0; |
++ |
++ r->v[0] = 0x3ffffff & load32(t); |
++ r->v[2] = 0x3ffffff & (load32(t + 3) >> 2); |
++ r->v[4] = 0x3ffffff & (load32(t + 6) >> 4); |
++ r->v[6] = 0x3ffffff & (load32(t + 9) >> 6); |
++ r->v[8] = load32(t + 13); |
++ |
++ if (xlen) |
++ { |
++ for (i = 0; (i < 16) && (i < xlen); i++) |
++ t[i] = x[i]; |
++ t[i++] = 1; |
++ for (; i<17; i++) |
++ t[i] = 0; |
++ |
++ r->v[1] = 0x3ffffff & load32(t); |
++ r->v[3] = 0x3ffffff & (load32(t + 3) >> 2); |
++ r->v[5] = 0x3ffffff & (load32(t + 6) >> 4); |
++ r->v[7] = 0x3ffffff & (load32(t + 9) >> 6); |
++ r->v[9] = load32(t + 13); |
++ } |
++ else |
++ r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; |
++ } |
++ |
++static const fe1305x2 zero __attribute__ ((aligned (16))); |
++ |
++struct poly1305_state_st { |
++ unsigned char data[sizeof(fe1305x2[5]) + 128]; |
++ unsigned char buf[32]; |
++ unsigned int buf_used; |
++ unsigned char key[16]; |
++}; |
++ |
++void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32]) |
++ { |
++ struct poly1305_state_st *st = (struct poly1305_state_st*) (state); |
++ fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); |
++ fe1305x2 *const h = r + 1; |
++ fe1305x2 *const c = h + 1; |
++ fe1305x2 *const precomp = c + 1; |
++ unsigned int j; |
++ |
++ r->v[1] = r->v[0] = 0x3ffffff & *(uint32_t *) key; |
++ r->v[3] = r->v[2] = 0x3ffff03 & ((*(uint32_t *) (key + 3)) >> 2); |
++ r->v[5] = r->v[4] = 0x3ffc0ff & ((*(uint32_t *) (key + 6)) >> 4); |
++ r->v[7] = r->v[6] = 0x3f03fff & ((*(uint32_t *) (key + 9)) >> 6); |
++ r->v[9] = r->v[8] = 0x00fffff & ((*(uint32_t *) (key + 12)) >> 8); |
++ |
++ for (j = 0; j < 10; j++) |
++ h->v[j] = 0; /* XXX: should fast-forward a bit */ |
++ |
++ addmulmod(precomp,r,r,&zero); /* precompute r^2 */ |
++ addmulmod(precomp + 1,precomp,precomp,&zero); /* precompute r^4 */ |
++ |
++ memcpy(st->key, key + 16, 16); |
++ st->buf_used = 0; |
++ } |
++ |
++void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size_t in_len) |
++ { |
++ struct poly1305_state_st *st = (struct poly1305_state_st*) (state); |
++ fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); |
++ fe1305x2 *const h = r + 1; |
++ fe1305x2 *const c = h + 1; |
++ fe1305x2 *const precomp = c + 1; |
++ unsigned int i; |
++ unsigned char data[sizeof(fe1305x2) + 16]; |
++ fe1305x2 *const r2r = (fe1305x2 *) (data + (15 & (-(int) data))); |
++ |
++ if (st->buf_used) |
++ { |
++ unsigned int todo = 32 - st->buf_used; |
++ if (todo > in_len) |
++ todo = in_len; |
++ for (i = 0; i < todo; i++) |
++ st->buf[st->buf_used + i] = in[i]; |
++ st->buf_used += todo; |
++ in_len -= todo; |
++ in += todo; |
++ |
++ if (st->buf_used == sizeof(st->buf)) |
++ { |
++ fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); |
++ r2r->v[0] = precomp->v[0]; |
++ r2r->v[2] = precomp->v[2]; |
++ r2r->v[4] = precomp->v[4]; |
++ r2r->v[6] = precomp->v[6]; |
++ r2r->v[8] = precomp->v[8]; |
++ r2r->v[1] = r->v[1]; |
++ r2r->v[3] = r->v[3]; |
++ r2r->v[5] = r->v[5]; |
++ r2r->v[7] = r->v[7]; |
++ r2r->v[9] = r->v[9]; |
++ addmulmod(h,h,r2r,c); |
++ st->buf_used = 0; |
++ } |
++ } |
++ |
++ while (in_len > 32) |
++ { |
++ unsigned int tlen = 1048576; |
++ if (in_len < 1048576) |
++ tlen = in_len; |
++ tlen -= blocks(h, precomp, in, tlen); |
++ in_len -= tlen; |
++ in += tlen; |
++ } |
++ |
++ if (in_len) |
++ { |
++ for (i = 0; i < in_len; i++) |
++ st->buf[i] = in[i]; |
++ st->buf_used = in_len; |
++ } |
++ } |
++ |
++void CRYPTO_poly1305_finish(poly1305_state* state, unsigned char mac[16]) |
++ { |
++ struct poly1305_state_st *st = (struct poly1305_state_st*) (state); |
++ fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); |
++ fe1305x2 *const h = r + 1; |
++ fe1305x2 *const c = h + 1; |
++ fe1305x2 *const precomp = c + 1; |
++ |
++ if (st->buf_used > 16) |
++ { |
++ fe1305x2_frombytearray(c, st->buf, st->buf_used); |
++ precomp->v[1] = r->v[1]; |
++ precomp->v[3] = r->v[3]; |
++ precomp->v[5] = r->v[5]; |
++ precomp->v[7] = r->v[7]; |
++ precomp->v[9] = r->v[9]; |
++ addmulmod(h,h,precomp,c); |
++ } |
++ else if (st->buf_used > 0) |
++ { |
++ fe1305x2_frombytearray(c, st->buf, st->buf_used); |
++ r->v[1] = 1; |
++ r->v[3] = 0; |
++ r->v[5] = 0; |
++ r->v[7] = 0; |
++ r->v[9] = 0; |
++ addmulmod(h,h,r,c); |
++ } |
++ |
++ h->v[0] += h->v[1]; |
++ h->v[2] += h->v[3]; |
++ h->v[4] += h->v[5]; |
++ h->v[6] += h->v[7]; |
++ h->v[8] += h->v[9]; |
++ freeze(h); |
++ |
++ fe1305x2_frombytearray(c, st->key, 16); |
++ c->v[8] ^= (1 << 24); |
++ |
++ h->v[0] += c->v[0]; |
++ h->v[2] += c->v[2]; |
++ h->v[4] += c->v[4]; |
++ h->v[6] += c->v[6]; |
++ h->v[8] += c->v[8]; |
++ fe1305x2_tobytearray(mac, h); |
++ } |
++ |
++#endif /* !OPENSSL_NO_POLY1305 */ |
+diff --git a/crypto/poly1305/poly1305_arm_asm.s b/crypto/poly1305/poly1305_arm_asm.s |
+new file mode 100644 |
+index 0000000..449d16f |
+--- /dev/null |
++++ b/crypto/poly1305/poly1305_arm_asm.s |
+@@ -0,0 +1,2009 @@ |
++# This implementation was taken from the public domain, neon2 version in |
++# SUPERCOP by D. J. Bernstein and Peter Schwabe. |
++ |
++# qhasm: int32 input_0 |
++ |
++# qhasm: int32 input_1 |
++ |
++# qhasm: int32 input_2 |
++ |
++# qhasm: int32 input_3 |
++ |
++# qhasm: stack32 input_4 |
++ |
++# qhasm: stack32 input_5 |
++ |
++# qhasm: stack32 input_6 |
++ |
++# qhasm: stack32 input_7 |
++ |
++# qhasm: int32 caller_r4 |
++ |
++# qhasm: int32 caller_r5 |
++ |
++# qhasm: int32 caller_r6 |
++ |
++# qhasm: int32 caller_r7 |
++ |
++# qhasm: int32 caller_r8 |
++ |
++# qhasm: int32 caller_r9 |
++ |
++# qhasm: int32 caller_r10 |
++ |
++# qhasm: int32 caller_r11 |
++ |
++# qhasm: int32 caller_r12 |
++ |
++# qhasm: int32 caller_r14 |
++ |
++# qhasm: reg128 caller_q4 |
++ |
++# qhasm: reg128 caller_q5 |
++ |
++# qhasm: reg128 caller_q6 |
++ |
++# qhasm: reg128 caller_q7 |
++ |
++# qhasm: startcode |
++.fpu neon |
++.text |
++ |
++# qhasm: reg128 r0 |
++ |
++# qhasm: reg128 r1 |
++ |
++# qhasm: reg128 r2 |
++ |
++# qhasm: reg128 r3 |
++ |
++# qhasm: reg128 r4 |
++ |
++# qhasm: reg128 x01 |
++ |
++# qhasm: reg128 x23 |
++ |
++# qhasm: reg128 x4 |
++ |
++# qhasm: reg128 y0 |
++ |
++# qhasm: reg128 y12 |
++ |
++# qhasm: reg128 y34 |
++ |
++# qhasm: reg128 5y12 |
++ |
++# qhasm: reg128 5y34 |
++ |
++# qhasm: stack128 y0_stack |
++ |
++# qhasm: stack128 y12_stack |
++ |
++# qhasm: stack128 y34_stack |
++ |
++# qhasm: stack128 5y12_stack |
++ |
++# qhasm: stack128 5y34_stack |
++ |
++# qhasm: reg128 z0 |
++ |
++# qhasm: reg128 z12 |
++ |
++# qhasm: reg128 z34 |
++ |
++# qhasm: reg128 5z12 |
++ |
++# qhasm: reg128 5z34 |
++ |
++# qhasm: stack128 z0_stack |
++ |
++# qhasm: stack128 z12_stack |
++ |
++# qhasm: stack128 z34_stack |
++ |
++# qhasm: stack128 5z12_stack |
++ |
++# qhasm: stack128 5z34_stack |
++ |
++# qhasm: stack128 two24 |
++ |
++# qhasm: int32 ptr |
++ |
++# qhasm: reg128 c01 |
++ |
++# qhasm: reg128 c23 |
++ |
++# qhasm: reg128 d01 |
++ |
++# qhasm: reg128 d23 |
++ |
++# qhasm: reg128 t0 |
++ |
++# qhasm: reg128 t1 |
++ |
++# qhasm: reg128 t2 |
++ |
++# qhasm: reg128 t3 |
++ |
++# qhasm: reg128 t4 |
++ |
++# qhasm: reg128 mask |
++ |
++# qhasm: reg128 u0 |
++ |
++# qhasm: reg128 u1 |
++ |
++# qhasm: reg128 u2 |
++ |
++# qhasm: reg128 u3 |
++ |
++# qhasm: reg128 u4 |
++ |
++# qhasm: reg128 v01 |
++ |
++# qhasm: reg128 mid |
++ |
++# qhasm: reg128 v23 |
++ |
++# qhasm: reg128 v4 |
++ |
++# qhasm: int32 len |
++ |
++# qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks |
++.align 4 |
++.global openssl_poly1305_neon2_blocks |
++.type openssl_poly1305_neon2_blocks STT_FUNC |
++openssl_poly1305_neon2_blocks: |
++vpush {q4,q5,q6,q7} |
++mov r12,sp |
++sub sp,sp,#192 |
++and sp,sp,#0xffffffe0 |
++ |
++# qhasm: len = input_3 |
++# asm 1: mov >len=int32#4,<input_3=int32#4 |
++# asm 2: mov >len=r3,<input_3=r3 |
++mov r3,r3 |
++ |
++# qhasm: new y0 |
++ |
++# qhasm: y0 = mem64[input_1]y0[1]; input_1 += 8 |
++# asm 1: vld1.8 {<y0=reg128#1%bot},[<input_1=int32#2]! |
++# asm 2: vld1.8 {<y0=d0},[<input_1=r1]! |
++vld1.8 {d0},[r1]! |
++ |
++# qhasm: y12 = mem128[input_1]; input_1 += 16 |
++# asm 1: vld1.8 {>y12=reg128#2%bot->y12=reg128#2%top},[<input_1=int32#2]! |
++# asm 2: vld1.8 {>y12=d2->y12=d3},[<input_1=r1]! |
++vld1.8 {d2-d3},[r1]! |
++ |
++# qhasm: y34 = mem128[input_1]; input_1 += 16 |
++# asm 1: vld1.8 {>y34=reg128#3%bot->y34=reg128#3%top},[<input_1=int32#2]! |
++# asm 2: vld1.8 {>y34=d4->y34=d5},[<input_1=r1]! |
++vld1.8 {d4-d5},[r1]! |
++ |
++# qhasm: input_1 += 8 |
++# asm 1: add >input_1=int32#2,<input_1=int32#2,#8 |
++# asm 2: add >input_1=r1,<input_1=r1,#8 |
++add r1,r1,#8 |
++ |
++# qhasm: new z0 |
++ |
++# qhasm: z0 = mem64[input_1]z0[1]; input_1 += 8 |
++# asm 1: vld1.8 {<z0=reg128#4%bot},[<input_1=int32#2]! |
++# asm 2: vld1.8 {<z0=d6},[<input_1=r1]! |
++vld1.8 {d6},[r1]! |
++ |
++# qhasm: z12 = mem128[input_1]; input_1 += 16 |
++# asm 1: vld1.8 {>z12=reg128#5%bot->z12=reg128#5%top},[<input_1=int32#2]! |
++# asm 2: vld1.8 {>z12=d8->z12=d9},[<input_1=r1]! |
++vld1.8 {d8-d9},[r1]! |
++ |
++# qhasm: z34 = mem128[input_1]; input_1 += 16 |
++# asm 1: vld1.8 {>z34=reg128#6%bot->z34=reg128#6%top},[<input_1=int32#2]! |
++# asm 2: vld1.8 {>z34=d10->z34=d11},[<input_1=r1]! |
++vld1.8 {d10-d11},[r1]! |
++ |
++# qhasm: 2x mask = 0xffffffff |
++# asm 1: vmov.i64 >mask=reg128#7,#0xffffffff |
++# asm 2: vmov.i64 >mask=q6,#0xffffffff |
++vmov.i64 q6,#0xffffffff |
++ |
++# qhasm: 2x u4 = 0xff |
++# asm 1: vmov.i64 >u4=reg128#8,#0xff |
++# asm 2: vmov.i64 >u4=q7,#0xff |
++vmov.i64 q7,#0xff |
++ |
++# qhasm: x01 aligned= mem128[input_0];input_0+=16 |
++# asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[<input_0=int32#1,: 128]! |
++# asm 2: vld1.8 {>x01=d16->x01=d17},[<input_0=r0,: 128]! |
++vld1.8 {d16-d17},[r0,: 128]! |
++ |
++# qhasm: x23 aligned= mem128[input_0];input_0+=16 |
++# asm 1: vld1.8 {>x23=reg128#10%bot->x23=reg128#10%top},[<input_0=int32#1,: 128]! |
++# asm 2: vld1.8 {>x23=d18->x23=d19},[<input_0=r0,: 128]! |
++vld1.8 {d18-d19},[r0,: 128]! |
++ |
++# qhasm: x4 aligned= mem64[input_0]x4[1] |
++# asm 1: vld1.8 {<x4=reg128#11%bot},[<input_0=int32#1,: 64] |
++# asm 2: vld1.8 {<x4=d20},[<input_0=r0,: 64] |
++vld1.8 {d20},[r0,: 64] |
++ |
++# qhasm: input_0 -= 32 |
++# asm 1: sub >input_0=int32#1,<input_0=int32#1,#32 |
++# asm 2: sub >input_0=r0,<input_0=r0,#32 |
++sub r0,r0,#32 |
++ |
++# qhasm: 2x mask unsigned>>=6 |
++# asm 1: vshr.u64 >mask=reg128#7,<mask=reg128#7,#6 |
++# asm 2: vshr.u64 >mask=q6,<mask=q6,#6 |
++vshr.u64 q6,q6,#6 |
++ |
++# qhasm: 2x u4 unsigned>>= 7 |
++# asm 1: vshr.u64 >u4=reg128#8,<u4=reg128#8,#7 |
++# asm 2: vshr.u64 >u4=q7,<u4=q7,#7 |
++vshr.u64 q7,q7,#7 |
++ |
++# qhasm: 4x 5y12 = y12 << 2 |
++# asm 1: vshl.i32 >5y12=reg128#12,<y12=reg128#2,#2 |
++# asm 2: vshl.i32 >5y12=q11,<y12=q1,#2 |
++vshl.i32 q11,q1,#2 |
++ |
++# qhasm: 4x 5y34 = y34 << 2 |
++# asm 1: vshl.i32 >5y34=reg128#13,<y34=reg128#3,#2 |
++# asm 2: vshl.i32 >5y34=q12,<y34=q2,#2 |
++vshl.i32 q12,q2,#2 |
++ |
++# qhasm: 4x 5y12 += y12 |
++# asm 1: vadd.i32 >5y12=reg128#12,<5y12=reg128#12,<y12=reg128#2 |
++# asm 2: vadd.i32 >5y12=q11,<5y12=q11,<y12=q1 |
++vadd.i32 q11,q11,q1 |
++ |
++# qhasm: 4x 5y34 += y34 |
++# asm 1: vadd.i32 >5y34=reg128#13,<5y34=reg128#13,<y34=reg128#3 |
++# asm 2: vadd.i32 >5y34=q12,<5y34=q12,<y34=q2 |
++vadd.i32 q12,q12,q2 |
++ |
++# qhasm: 2x u4 <<= 24 |
++# asm 1: vshl.i64 >u4=reg128#8,<u4=reg128#8,#24 |
++# asm 2: vshl.i64 >u4=q7,<u4=q7,#24 |
++vshl.i64 q7,q7,#24 |
++ |
++# qhasm: 4x 5z12 = z12 << 2 |
++# asm 1: vshl.i32 >5z12=reg128#14,<z12=reg128#5,#2 |
++# asm 2: vshl.i32 >5z12=q13,<z12=q4,#2 |
++vshl.i32 q13,q4,#2 |
++ |
++# qhasm: 4x 5z34 = z34 << 2 |
++# asm 1: vshl.i32 >5z34=reg128#15,<z34=reg128#6,#2 |
++# asm 2: vshl.i32 >5z34=q14,<z34=q5,#2 |
++vshl.i32 q14,q5,#2 |
++ |
++# qhasm: 4x 5z12 += z12 |
++# asm 1: vadd.i32 >5z12=reg128#14,<5z12=reg128#14,<z12=reg128#5 |
++# asm 2: vadd.i32 >5z12=q13,<5z12=q13,<z12=q4 |
++vadd.i32 q13,q13,q4 |
++ |
++# qhasm: 4x 5z34 += z34 |
++# asm 1: vadd.i32 >5z34=reg128#15,<5z34=reg128#15,<z34=reg128#6 |
++# asm 2: vadd.i32 >5z34=q14,<5z34=q14,<z34=q5 |
++vadd.i32 q14,q14,q5 |
++ |
++# qhasm: new two24 |
++ |
++# qhasm: new y0_stack |
++ |
++# qhasm: new y12_stack |
++ |
++# qhasm: new y34_stack |
++ |
++# qhasm: new 5y12_stack |
++ |
++# qhasm: new 5y34_stack |
++ |
++# qhasm: new z0_stack |
++ |
++# qhasm: new z12_stack |
++ |
++# qhasm: new z34_stack |
++ |
++# qhasm: new 5z12_stack |
++ |
++# qhasm: new 5z34_stack |
++ |
++# qhasm: ptr = &two24 |
++# asm 1: lea >ptr=int32#2,<two24=stack128#1 |
++# asm 2: lea >ptr=r1,<two24=[sp,#0] |
++add r1,sp,#0 |
++ |
++# qhasm: mem128[ptr] aligned= u4 |
++# asm 1: vst1.8 {<u4=reg128#8%bot-<u4=reg128#8%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<u4=d14-<u4=d15},[<ptr=r1,: 128] |
++vst1.8 {d14-d15},[r1,: 128] |
++ |
++# qhasm: r4 = u4 |
++# asm 1: vmov >r4=reg128#16,<u4=reg128#8 |
++# asm 2: vmov >r4=q15,<u4=q7 |
++vmov q15,q7 |
++ |
++# qhasm: r0 = u4 |
++# asm 1: vmov >r0=reg128#8,<u4=reg128#8 |
++# asm 2: vmov >r0=q7,<u4=q7 |
++vmov q7,q7 |
++ |
++# qhasm: ptr = &y0_stack |
++# asm 1: lea >ptr=int32#2,<y0_stack=stack128#2 |
++# asm 2: lea >ptr=r1,<y0_stack=[sp,#16] |
++add r1,sp,#16 |
++ |
++# qhasm: mem128[ptr] aligned= y0 |
++# asm 1: vst1.8 {<y0=reg128#1%bot-<y0=reg128#1%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<y0=d0-<y0=d1},[<ptr=r1,: 128] |
++vst1.8 {d0-d1},[r1,: 128] |
++ |
++# qhasm: ptr = &y12_stack |
++# asm 1: lea >ptr=int32#2,<y12_stack=stack128#3 |
++# asm 2: lea >ptr=r1,<y12_stack=[sp,#32] |
++add r1,sp,#32 |
++ |
++# qhasm: mem128[ptr] aligned= y12 |
++# asm 1: vst1.8 {<y12=reg128#2%bot-<y12=reg128#2%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<y12=d2-<y12=d3},[<ptr=r1,: 128] |
++vst1.8 {d2-d3},[r1,: 128] |
++ |
++# qhasm: ptr = &y34_stack |
++# asm 1: lea >ptr=int32#2,<y34_stack=stack128#4 |
++# asm 2: lea >ptr=r1,<y34_stack=[sp,#48] |
++add r1,sp,#48 |
++ |
++# qhasm: mem128[ptr] aligned= y34 |
++# asm 1: vst1.8 {<y34=reg128#3%bot-<y34=reg128#3%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<y34=d4-<y34=d5},[<ptr=r1,: 128] |
++vst1.8 {d4-d5},[r1,: 128] |
++ |
++# qhasm: ptr = &z0_stack |
++# asm 1: lea >ptr=int32#2,<z0_stack=stack128#7 |
++# asm 2: lea >ptr=r1,<z0_stack=[sp,#96] |
++add r1,sp,#96 |
++ |
++# qhasm: mem128[ptr] aligned= z0 |
++# asm 1: vst1.8 {<z0=reg128#4%bot-<z0=reg128#4%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<z0=d6-<z0=d7},[<ptr=r1,: 128] |
++vst1.8 {d6-d7},[r1,: 128] |
++ |
++# qhasm: ptr = &z12_stack |
++# asm 1: lea >ptr=int32#2,<z12_stack=stack128#8 |
++# asm 2: lea >ptr=r1,<z12_stack=[sp,#112] |
++add r1,sp,#112 |
++ |
++# qhasm: mem128[ptr] aligned= z12 |
++# asm 1: vst1.8 {<z12=reg128#5%bot-<z12=reg128#5%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<z12=d8-<z12=d9},[<ptr=r1,: 128] |
++vst1.8 {d8-d9},[r1,: 128] |
++ |
++# qhasm: ptr = &z34_stack |
++# asm 1: lea >ptr=int32#2,<z34_stack=stack128#9 |
++# asm 2: lea >ptr=r1,<z34_stack=[sp,#128] |
++add r1,sp,#128 |
++ |
++# qhasm: mem128[ptr] aligned= z34 |
++# asm 1: vst1.8 {<z34=reg128#6%bot-<z34=reg128#6%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<z34=d10-<z34=d11},[<ptr=r1,: 128] |
++vst1.8 {d10-d11},[r1,: 128] |
++ |
++# qhasm: ptr = &5y12_stack |
++# asm 1: lea >ptr=int32#2,<5y12_stack=stack128#5 |
++# asm 2: lea >ptr=r1,<5y12_stack=[sp,#64] |
++add r1,sp,#64 |
++ |
++# qhasm: mem128[ptr] aligned= 5y12 |
++# asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<5y12=d22-<5y12=d23},[<ptr=r1,: 128] |
++vst1.8 {d22-d23},[r1,: 128] |
++ |
++# qhasm: ptr = &5y34_stack |
++# asm 1: lea >ptr=int32#2,<5y34_stack=stack128#6 |
++# asm 2: lea >ptr=r1,<5y34_stack=[sp,#80] |
++add r1,sp,#80 |
++ |
++# qhasm: mem128[ptr] aligned= 5y34 |
++# asm 1: vst1.8 {<5y34=reg128#13%bot-<5y34=reg128#13%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<5y34=d24-<5y34=d25},[<ptr=r1,: 128] |
++vst1.8 {d24-d25},[r1,: 128] |
++ |
++# qhasm: ptr = &5z12_stack |
++# asm 1: lea >ptr=int32#2,<5z12_stack=stack128#10 |
++# asm 2: lea >ptr=r1,<5z12_stack=[sp,#144] |
++add r1,sp,#144 |
++ |
++# qhasm: mem128[ptr] aligned= 5z12 |
++# asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<5z12=d26-<5z12=d27},[<ptr=r1,: 128] |
++vst1.8 {d26-d27},[r1,: 128] |
++ |
++# qhasm: ptr = &5z34_stack |
++# asm 1: lea >ptr=int32#2,<5z34_stack=stack128#11 |
++# asm 2: lea >ptr=r1,<5z34_stack=[sp,#160] |
++add r1,sp,#160 |
++ |
++# qhasm: mem128[ptr] aligned= 5z34 |
++# asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[<ptr=int32#2,: 128] |
++# asm 2: vst1.8 {<5z34=d28-<5z34=d29},[<ptr=r1,: 128] |
++vst1.8 {d28-d29},[r1,: 128] |
++ |
++# qhasm: unsigned>? len - 64 |
++# asm 1: cmp <len=int32#4,#64 |
++# asm 2: cmp <len=r3,#64 |
++cmp r3,#64 |
++ |
++# qhasm: goto below64bytes if !unsigned> |
++bls ._below64bytes |
++ |
++# qhasm: input_2 += 32 |
++# asm 1: add >input_2=int32#2,<input_2=int32#3,#32 |
++# asm 2: add >input_2=r1,<input_2=r2,#32 |
++add r1,r2,#32 |
++ |
++# qhasm: mainloop2: |
++._mainloop2: |
++ |
++# qhasm: c01 = mem128[input_2];input_2+=16 |
++# asm 1: vld1.8 {>c01=reg128#1%bot->c01=reg128#1%top},[<input_2=int32#2]! |
++# asm 2: vld1.8 {>c01=d0->c01=d1},[<input_2=r1]! |
++vld1.8 {d0-d1},[r1]! |
++ |
++# qhasm: c23 = mem128[input_2];input_2+=16 |
++# asm 1: vld1.8 {>c23=reg128#2%bot->c23=reg128#2%top},[<input_2=int32#2]! |
++# asm 2: vld1.8 {>c23=d2->c23=d3},[<input_2=r1]! |
++vld1.8 {d2-d3},[r1]! |
++ |
++# qhasm: r4[0,1] += x01[0] unsigned* z34[2]; r4[2,3] += x01[1] unsigned* z34[3] |
++# asm 1: vmlal.u32 <r4=reg128#16,<x01=reg128#9%bot,<z34=reg128#6%top |
++# asm 2: vmlal.u32 <r4=q15,<x01=d16,<z34=d11 |
++vmlal.u32 q15,d16,d11 |
++ |
++# qhasm: ptr = &z12_stack |
++# asm 1: lea >ptr=int32#3,<z12_stack=stack128#8 |
++# asm 2: lea >ptr=r2,<z12_stack=[sp,#112] |
++add r2,sp,#112 |
++ |
++# qhasm: z12 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>z12=reg128#3%bot->z12=reg128#3%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>z12=d4->z12=d5},[<ptr=r2,: 128] |
++vld1.8 {d4-d5},[r2,: 128] |
++ |
++# qhasm: r4[0,1] += x01[2] unsigned* z34[0]; r4[2,3] += x01[3] unsigned* z34[1] |
++# asm 1: vmlal.u32 <r4=reg128#16,<x01=reg128#9%top,<z34=reg128#6%bot |
++# asm 2: vmlal.u32 <r4=q15,<x01=d17,<z34=d10 |
++vmlal.u32 q15,d17,d10 |
++ |
++# qhasm: ptr = &z0_stack |
++# asm 1: lea >ptr=int32#3,<z0_stack=stack128#7 |
++# asm 2: lea >ptr=r2,<z0_stack=[sp,#96] |
++add r2,sp,#96 |
++ |
++# qhasm: z0 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>z0=reg128#4%bot->z0=reg128#4%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>z0=d6->z0=d7},[<ptr=r2,: 128] |
++vld1.8 {d6-d7},[r2,: 128] |
++ |
++# qhasm: r4[0,1] += x23[0] unsigned* z12[2]; r4[2,3] += x23[1] unsigned* z12[3] |
++# asm 1: vmlal.u32 <r4=reg128#16,<x23=reg128#10%bot,<z12=reg128#3%top |
++# asm 2: vmlal.u32 <r4=q15,<x23=d18,<z12=d5 |
++vmlal.u32 q15,d18,d5 |
++ |
++# qhasm: c01 c23 = c01[0]c01[1]c01[2]c23[2]c23[0]c23[1]c01[3]c23[3] |
++# asm 1: vtrn.32 <c01=reg128#1%top,<c23=reg128#2%top |
++# asm 2: vtrn.32 <c01=d1,<c23=d3 |
++vtrn.32 d1,d3 |
++ |
++# qhasm: r4[0,1] += x23[2] unsigned* z12[0]; r4[2,3] += x23[3] unsigned* z12[1] |
++# asm 1: vmlal.u32 <r4=reg128#16,<x23=reg128#10%top,<z12=reg128#3%bot |
++# asm 2: vmlal.u32 <r4=q15,<x23=d19,<z12=d4 |
++vmlal.u32 q15,d19,d4 |
++ |
++# qhasm: r4[0,1] += x4[0] unsigned* z0[0]; r4[2,3] += x4[1] unsigned* z0[1] |
++# asm 1: vmlal.u32 <r4=reg128#16,<x4=reg128#11%bot,<z0=reg128#4%bot |
++# asm 2: vmlal.u32 <r4=q15,<x4=d20,<z0=d6 |
++vmlal.u32 q15,d20,d6 |
++ |
++# qhasm: r3[0,1] = c23[2]<<18; r3[2,3] = c23[3]<<18 |
++# asm 1: vshll.u32 >r3=reg128#5,<c23=reg128#2%top,#18 |
++# asm 2: vshll.u32 >r3=q4,<c23=d3,#18 |
++vshll.u32 q4,d3,#18 |
++ |
++# qhasm: c01 c23 = c01[0]c23[0]c01[2]c01[3]c01[1]c23[1]c23[2]c23[3] |
++# asm 1: vtrn.32 <c01=reg128#1%bot,<c23=reg128#2%bot |
++# asm 2: vtrn.32 <c01=d0,<c23=d2 |
++vtrn.32 d0,d2 |
++ |
++# qhasm: r3[0,1] += x01[0] unsigned* z34[0]; r3[2,3] += x01[1] unsigned* z34[1] |
++# asm 1: vmlal.u32 <r3=reg128#5,<x01=reg128#9%bot,<z34=reg128#6%bot |
++# asm 2: vmlal.u32 <r3=q4,<x01=d16,<z34=d10 |
++vmlal.u32 q4,d16,d10 |
++ |
++# qhasm: r3[0,1] += x01[2] unsigned* z12[2]; r3[2,3] += x01[3] unsigned* z12[3] |
++# asm 1: vmlal.u32 <r3=reg128#5,<x01=reg128#9%top,<z12=reg128#3%top |
++# asm 2: vmlal.u32 <r3=q4,<x01=d17,<z12=d5 |
++vmlal.u32 q4,d17,d5 |
++ |
++# qhasm: r0 = r0[1]c01[0]r0[2,3] |
++# asm 1: vext.32 <r0=reg128#8%bot,<r0=reg128#8%bot,<c01=reg128#1%bot,#1 |
++# asm 2: vext.32 <r0=d14,<r0=d14,<c01=d0,#1 |
++vext.32 d14,d14,d0,#1 |
++ |
++# qhasm: r3[0,1] += x23[0] unsigned* z12[0]; r3[2,3] += x23[1] unsigned* z12[1] |
++# asm 1: vmlal.u32 <r3=reg128#5,<x23=reg128#10%bot,<z12=reg128#3%bot |
++# asm 2: vmlal.u32 <r3=q4,<x23=d18,<z12=d4 |
++vmlal.u32 q4,d18,d4 |
++ |
++# qhasm: input_2 -= 64 |
++# asm 1: sub >input_2=int32#2,<input_2=int32#2,#64 |
++# asm 2: sub >input_2=r1,<input_2=r1,#64 |
++sub r1,r1,#64 |
++ |
++# qhasm: r3[0,1] += x23[2] unsigned* z0[0]; r3[2,3] += x23[3] unsigned* z0[1] |
++# asm 1: vmlal.u32 <r3=reg128#5,<x23=reg128#10%top,<z0=reg128#4%bot |
++# asm 2: vmlal.u32 <r3=q4,<x23=d19,<z0=d6 |
++vmlal.u32 q4,d19,d6 |
++ |
++# qhasm: ptr = &5z34_stack |
++# asm 1: lea >ptr=int32#3,<5z34_stack=stack128#11 |
++# asm 2: lea >ptr=r2,<5z34_stack=[sp,#160] |
++add r2,sp,#160 |
++ |
++# qhasm: 5z34 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>5z34=d10->5z34=d11},[<ptr=r2,: 128] |
++vld1.8 {d10-d11},[r2,: 128] |
++ |
++# qhasm: r3[0,1] += x4[0] unsigned* 5z34[2]; r3[2,3] += x4[1] unsigned* 5z34[3] |
++# asm 1: vmlal.u32 <r3=reg128#5,<x4=reg128#11%bot,<5z34=reg128#6%top |
++# asm 2: vmlal.u32 <r3=q4,<x4=d20,<5z34=d11 |
++vmlal.u32 q4,d20,d11 |
++ |
++# qhasm: r0 = r0[1]r0[0]r0[3]r0[2] |
++# asm 1: vrev64.i32 >r0=reg128#8,<r0=reg128#8 |
++# asm 2: vrev64.i32 >r0=q7,<r0=q7 |
++vrev64.i32 q7,q7 |
++ |
++# qhasm: r2[0,1] = c01[2]<<12; r2[2,3] = c01[3]<<12 |
++# asm 1: vshll.u32 >r2=reg128#14,<c01=reg128#1%top,#12 |
++# asm 2: vshll.u32 >r2=q13,<c01=d1,#12 |
++vshll.u32 q13,d1,#12 |
++ |
++# qhasm: d01 = mem128[input_2];input_2+=16 |
++# asm 1: vld1.8 {>d01=reg128#12%bot->d01=reg128#12%top},[<input_2=int32#2]! |
++# asm 2: vld1.8 {>d01=d22->d01=d23},[<input_2=r1]! |
++vld1.8 {d22-d23},[r1]! |
++ |
++# qhasm: r2[0,1] += x01[0] unsigned* z12[2]; r2[2,3] += x01[1] unsigned* z12[3] |
++# asm 1: vmlal.u32 <r2=reg128#14,<x01=reg128#9%bot,<z12=reg128#3%top |
++# asm 2: vmlal.u32 <r2=q13,<x01=d16,<z12=d5 |
++vmlal.u32 q13,d16,d5 |
++ |
++# qhasm: r2[0,1] += x01[2] unsigned* z12[0]; r2[2,3] += x01[3] unsigned* z12[1] |
++# asm 1: vmlal.u32 <r2=reg128#14,<x01=reg128#9%top,<z12=reg128#3%bot |
++# asm 2: vmlal.u32 <r2=q13,<x01=d17,<z12=d4 |
++vmlal.u32 q13,d17,d4 |
++ |
++# qhasm: r2[0,1] += x23[0] unsigned* z0[0]; r2[2,3] += x23[1] unsigned* z0[1] |
++# asm 1: vmlal.u32 <r2=reg128#14,<x23=reg128#10%bot,<z0=reg128#4%bot |
++# asm 2: vmlal.u32 <r2=q13,<x23=d18,<z0=d6 |
++vmlal.u32 q13,d18,d6 |
++ |
++# qhasm: r2[0,1] += x23[2] unsigned* 5z34[2]; r2[2,3] += x23[3] unsigned* 5z34[3] |
++# asm 1: vmlal.u32 <r2=reg128#14,<x23=reg128#10%top,<5z34=reg128#6%top |
++# asm 2: vmlal.u32 <r2=q13,<x23=d19,<5z34=d11 |
++vmlal.u32 q13,d19,d11 |
++ |
++# qhasm: r2[0,1] += x4[0] unsigned* 5z34[0]; r2[2,3] += x4[1] unsigned* 5z34[1] |
++# asm 1: vmlal.u32 <r2=reg128#14,<x4=reg128#11%bot,<5z34=reg128#6%bot |
++# asm 2: vmlal.u32 <r2=q13,<x4=d20,<5z34=d10 |
++vmlal.u32 q13,d20,d10 |
++ |
++# qhasm: r0 = r0[0,1]c01[1]r0[2] |
++# asm 1: vext.32 <r0=reg128#8%top,<c01=reg128#1%bot,<r0=reg128#8%top,#1 |
++# asm 2: vext.32 <r0=d15,<c01=d0,<r0=d15,#1 |
++vext.32 d15,d0,d15,#1 |
++ |
++# qhasm: r1[0,1] = c23[0]<<6; r1[2,3] = c23[1]<<6 |
++# asm 1: vshll.u32 >r1=reg128#15,<c23=reg128#2%bot,#6 |
++# asm 2: vshll.u32 >r1=q14,<c23=d2,#6 |
++vshll.u32 q14,d2,#6 |
++ |
++# qhasm: r1[0,1] += x01[0] unsigned* z12[0]; r1[2,3] += x01[1] unsigned* z12[1] |
++# asm 1: vmlal.u32 <r1=reg128#15,<x01=reg128#9%bot,<z12=reg128#3%bot |
++# asm 2: vmlal.u32 <r1=q14,<x01=d16,<z12=d4 |
++vmlal.u32 q14,d16,d4 |
++ |
++# qhasm: r1[0,1] += x01[2] unsigned* z0[0]; r1[2,3] += x01[3] unsigned* z0[1] |
++# asm 1: vmlal.u32 <r1=reg128#15,<x01=reg128#9%top,<z0=reg128#4%bot |
++# asm 2: vmlal.u32 <r1=q14,<x01=d17,<z0=d6 |
++vmlal.u32 q14,d17,d6 |
++ |
++# qhasm: r1[0,1] += x23[0] unsigned* 5z34[2]; r1[2,3] += x23[1] unsigned* 5z34[3] |
++# asm 1: vmlal.u32 <r1=reg128#15,<x23=reg128#10%bot,<5z34=reg128#6%top |
++# asm 2: vmlal.u32 <r1=q14,<x23=d18,<5z34=d11 |
++vmlal.u32 q14,d18,d11 |
++ |
++# qhasm: r1[0,1] += x23[2] unsigned* 5z34[0]; r1[2,3] += x23[3] unsigned* 5z34[1] |
++# asm 1: vmlal.u32 <r1=reg128#15,<x23=reg128#10%top,<5z34=reg128#6%bot |
++# asm 2: vmlal.u32 <r1=q14,<x23=d19,<5z34=d10 |
++vmlal.u32 q14,d19,d10 |
++ |
++# qhasm: ptr = &5z12_stack |
++# asm 1: lea >ptr=int32#3,<5z12_stack=stack128#10 |
++# asm 2: lea >ptr=r2,<5z12_stack=[sp,#144] |
++add r2,sp,#144 |
++ |
++# qhasm: 5z12 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>5z12=d0->5z12=d1},[<ptr=r2,: 128] |
++vld1.8 {d0-d1},[r2,: 128] |
++ |
++# qhasm: r1[0,1] += x4[0] unsigned* 5z12[2]; r1[2,3] += x4[1] unsigned* 5z12[3] |
++# asm 1: vmlal.u32 <r1=reg128#15,<x4=reg128#11%bot,<5z12=reg128#1%top |
++# asm 2: vmlal.u32 <r1=q14,<x4=d20,<5z12=d1 |
++vmlal.u32 q14,d20,d1 |
++ |
++# qhasm: d23 = mem128[input_2];input_2+=16 |
++# asm 1: vld1.8 {>d23=reg128#2%bot->d23=reg128#2%top},[<input_2=int32#2]! |
++# asm 2: vld1.8 {>d23=d2->d23=d3},[<input_2=r1]! |
++vld1.8 {d2-d3},[r1]! |
++ |
++# qhasm: input_2 += 32 |
++# asm 1: add >input_2=int32#2,<input_2=int32#2,#32 |
++# asm 2: add >input_2=r1,<input_2=r1,#32 |
++add r1,r1,#32 |
++ |
++# qhasm: r0[0,1] += x4[0] unsigned* 5z12[0]; r0[2,3] += x4[1] unsigned* 5z12[1] |
++# asm 1: vmlal.u32 <r0=reg128#8,<x4=reg128#11%bot,<5z12=reg128#1%bot |
++# asm 2: vmlal.u32 <r0=q7,<x4=d20,<5z12=d0 |
++vmlal.u32 q7,d20,d0 |
++ |
++# qhasm: r0[0,1] += x23[0] unsigned* 5z34[0]; r0[2,3] += x23[1] unsigned* 5z34[1] |
++# asm 1: vmlal.u32 <r0=reg128#8,<x23=reg128#10%bot,<5z34=reg128#6%bot |
++# asm 2: vmlal.u32 <r0=q7,<x23=d18,<5z34=d10 |
++vmlal.u32 q7,d18,d10 |
++ |
++# qhasm: d01 d23 = d01[0] d23[0] d01[1] d23[1] |
++# asm 1: vswp <d23=reg128#2%bot,<d01=reg128#12%top |
++# asm 2: vswp <d23=d2,<d01=d23 |
++vswp d2,d23 |
++ |
++# qhasm: r0[0,1] += x23[2] unsigned* 5z12[2]; r0[2,3] += x23[3] unsigned* 5z12[3] |
++# asm 1: vmlal.u32 <r0=reg128#8,<x23=reg128#10%top,<5z12=reg128#1%top |
++# asm 2: vmlal.u32 <r0=q7,<x23=d19,<5z12=d1 |
++vmlal.u32 q7,d19,d1 |
++ |
++# qhasm: r0[0,1] += x01[0] unsigned* z0[0]; r0[2,3] += x01[1] unsigned* z0[1] |
++# asm 1: vmlal.u32 <r0=reg128#8,<x01=reg128#9%bot,<z0=reg128#4%bot |
++# asm 2: vmlal.u32 <r0=q7,<x01=d16,<z0=d6 |
++vmlal.u32 q7,d16,d6 |
++ |
++# qhasm: new mid |
++ |
++# qhasm: 2x v4 = d23 unsigned>> 40 |
++# asm 1: vshr.u64 >v4=reg128#4,<d23=reg128#2,#40 |
++# asm 2: vshr.u64 >v4=q3,<d23=q1,#40 |
++vshr.u64 q3,q1,#40 |
++ |
++# qhasm: mid = d01[1]d23[0] mid[2,3] |
++# asm 1: vext.32 <mid=reg128#1%bot,<d01=reg128#12%bot,<d23=reg128#2%bot,#1 |
++# asm 2: vext.32 <mid=d0,<d01=d22,<d23=d2,#1 |
++vext.32 d0,d22,d2,#1 |
++ |
++# qhasm: new v23 |
++ |
++# qhasm: v23[2] = d23[0,1] unsigned>> 14; v23[3] = d23[2,3] unsigned>> 14 |
++# asm 1: vshrn.u64 <v23=reg128#10%top,<d23=reg128#2,#14 |
++# asm 2: vshrn.u64 <v23=d19,<d23=q1,#14 |
++vshrn.u64 d19,q1,#14 |
++ |
++# qhasm: mid = mid[0,1] d01[3]d23[2] |
++# asm 1: vext.32 <mid=reg128#1%top,<d01=reg128#12%top,<d23=reg128#2%top,#1 |
++# asm 2: vext.32 <mid=d1,<d01=d23,<d23=d3,#1 |
++vext.32 d1,d23,d3,#1 |
++ |
++# qhasm: new v01 |
++ |
++# qhasm: v01[2] = d01[0,1] unsigned>> 26; v01[3] = d01[2,3] unsigned>> 26 |
++# asm 1: vshrn.u64 <v01=reg128#11%top,<d01=reg128#12,#26 |
++# asm 2: vshrn.u64 <v01=d21,<d01=q11,#26 |
++vshrn.u64 d21,q11,#26 |
++ |
++# qhasm: v01 = d01[1]d01[0] v01[2,3] |
++# asm 1: vext.32 <v01=reg128#11%bot,<d01=reg128#12%bot,<d01=reg128#12%bot,#1 |
++# asm 2: vext.32 <v01=d20,<d01=d22,<d01=d22,#1 |
++vext.32 d20,d22,d22,#1 |
++ |
++# qhasm: r0[0,1] += x01[2] unsigned* 5z34[2]; r0[2,3] += x01[3] unsigned* 5z34[3] |
++# asm 1: vmlal.u32 <r0=reg128#8,<x01=reg128#9%top,<5z34=reg128#6%top |
++# asm 2: vmlal.u32 <r0=q7,<x01=d17,<5z34=d11 |
++vmlal.u32 q7,d17,d11 |
++ |
++# qhasm: v01 = v01[1]d01[2] v01[2,3] |
++# asm 1: vext.32 <v01=reg128#11%bot,<v01=reg128#11%bot,<d01=reg128#12%top,#1 |
++# asm 2: vext.32 <v01=d20,<v01=d20,<d01=d23,#1 |
++vext.32 d20,d20,d23,#1 |
++ |
++# qhasm: v23[0] = mid[0,1] unsigned>> 20; v23[1] = mid[2,3] unsigned>> 20 |
++# asm 1: vshrn.u64 <v23=reg128#10%bot,<mid=reg128#1,#20 |
++# asm 2: vshrn.u64 <v23=d18,<mid=q0,#20 |
++vshrn.u64 d18,q0,#20 |
++ |
++# qhasm: v4 = v4[0]v4[2]v4[1]v4[3] |
++# asm 1: vtrn.32 <v4=reg128#4%bot,<v4=reg128#4%top |
++# asm 2: vtrn.32 <v4=d6,<v4=d7 |
++vtrn.32 d6,d7 |
++ |
++# qhasm: 4x v01 &= 0x03ffffff |
++# asm 1: vand.i32 <v01=reg128#11,#0x03ffffff |
++# asm 2: vand.i32 <v01=q10,#0x03ffffff |
++vand.i32 q10,#0x03ffffff |
++ |
++# qhasm: ptr = &y34_stack |
++# asm 1: lea >ptr=int32#3,<y34_stack=stack128#4 |
++# asm 2: lea >ptr=r2,<y34_stack=[sp,#48] |
++add r2,sp,#48 |
++ |
++# qhasm: y34 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>y34=reg128#3%bot->y34=reg128#3%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>y34=d4->y34=d5},[<ptr=r2,: 128] |
++vld1.8 {d4-d5},[r2,: 128] |
++ |
++# qhasm: 4x v23 &= 0x03ffffff |
++# asm 1: vand.i32 <v23=reg128#10,#0x03ffffff |
++# asm 2: vand.i32 <v23=q9,#0x03ffffff |
++vand.i32 q9,#0x03ffffff |
++ |
++# qhasm: ptr = &y12_stack |
++# asm 1: lea >ptr=int32#3,<y12_stack=stack128#3 |
++# asm 2: lea >ptr=r2,<y12_stack=[sp,#32] |
++add r2,sp,#32 |
++ |
++# qhasm: y12 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>y12=reg128#2%bot->y12=reg128#2%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>y12=d2->y12=d3},[<ptr=r2,: 128] |
++vld1.8 {d2-d3},[r2,: 128] |
++ |
++# qhasm: 4x v4 |= 0x01000000 |
++# asm 1: vorr.i32 <v4=reg128#4,#0x01000000 |
++# asm 2: vorr.i32 <v4=q3,#0x01000000 |
++vorr.i32 q3,#0x01000000 |
++ |
++# qhasm: ptr = &y0_stack |
++# asm 1: lea >ptr=int32#3,<y0_stack=stack128#2 |
++# asm 2: lea >ptr=r2,<y0_stack=[sp,#16] |
++add r2,sp,#16 |
++ |
++# qhasm: y0 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>y0=reg128#1%bot->y0=reg128#1%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>y0=d0->y0=d1},[<ptr=r2,: 128] |
++vld1.8 {d0-d1},[r2,: 128] |
++ |
++# qhasm: r4[0,1] += v01[0] unsigned* y34[2]; r4[2,3] += v01[1] unsigned* y34[3] |
++# asm 1: vmlal.u32 <r4=reg128#16,<v01=reg128#11%bot,<y34=reg128#3%top |
++# asm 2: vmlal.u32 <r4=q15,<v01=d20,<y34=d5 |
++vmlal.u32 q15,d20,d5 |
++ |
++# qhasm: r4[0,1] += v01[2] unsigned* y34[0]; r4[2,3] += v01[3] unsigned* y34[1] |
++# asm 1: vmlal.u32 <r4=reg128#16,<v01=reg128#11%top,<y34=reg128#3%bot |
++# asm 2: vmlal.u32 <r4=q15,<v01=d21,<y34=d4 |
++vmlal.u32 q15,d21,d4 |
++ |
++# qhasm: r4[0,1] += v23[0] unsigned* y12[2]; r4[2,3] += v23[1] unsigned* y12[3] |
++# asm 1: vmlal.u32 <r4=reg128#16,<v23=reg128#10%bot,<y12=reg128#2%top |
++# asm 2: vmlal.u32 <r4=q15,<v23=d18,<y12=d3 |
++vmlal.u32 q15,d18,d3 |
++ |
++# qhasm: r4[0,1] += v23[2] unsigned* y12[0]; r4[2,3] += v23[3] unsigned* y12[1] |
++# asm 1: vmlal.u32 <r4=reg128#16,<v23=reg128#10%top,<y12=reg128#2%bot |
++# asm 2: vmlal.u32 <r4=q15,<v23=d19,<y12=d2 |
++vmlal.u32 q15,d19,d2 |
++ |
++# qhasm: r4[0,1] += v4[0] unsigned* y0[0]; r4[2,3] += v4[1] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r4=reg128#16,<v4=reg128#4%bot,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r4=q15,<v4=d6,<y0=d0 |
++vmlal.u32 q15,d6,d0 |
++ |
++# qhasm: ptr = &5y34_stack |
++# asm 1: lea >ptr=int32#3,<5y34_stack=stack128#6 |
++# asm 2: lea >ptr=r2,<5y34_stack=[sp,#80] |
++add r2,sp,#80 |
++ |
++# qhasm: 5y34 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>5y34=d24->5y34=d25},[<ptr=r2,: 128] |
++vld1.8 {d24-d25},[r2,: 128] |
++ |
++# qhasm: r3[0,1] += v01[0] unsigned* y34[0]; r3[2,3] += v01[1] unsigned* y34[1] |
++# asm 1: vmlal.u32 <r3=reg128#5,<v01=reg128#11%bot,<y34=reg128#3%bot |
++# asm 2: vmlal.u32 <r3=q4,<v01=d20,<y34=d4 |
++vmlal.u32 q4,d20,d4 |
++ |
++# qhasm: r3[0,1] += v01[2] unsigned* y12[2]; r3[2,3] += v01[3] unsigned* y12[3] |
++# asm 1: vmlal.u32 <r3=reg128#5,<v01=reg128#11%top,<y12=reg128#2%top |
++# asm 2: vmlal.u32 <r3=q4,<v01=d21,<y12=d3 |
++vmlal.u32 q4,d21,d3 |
++ |
++# qhasm: r3[0,1] += v23[0] unsigned* y12[0]; r3[2,3] += v23[1] unsigned* y12[1] |
++# asm 1: vmlal.u32 <r3=reg128#5,<v23=reg128#10%bot,<y12=reg128#2%bot |
++# asm 2: vmlal.u32 <r3=q4,<v23=d18,<y12=d2 |
++vmlal.u32 q4,d18,d2 |
++ |
++# qhasm: r3[0,1] += v23[2] unsigned* y0[0]; r3[2,3] += v23[3] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r3=reg128#5,<v23=reg128#10%top,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r3=q4,<v23=d19,<y0=d0 |
++vmlal.u32 q4,d19,d0 |
++ |
++# qhasm: r3[0,1] += v4[0] unsigned* 5y34[2]; r3[2,3] += v4[1] unsigned* 5y34[3] |
++# asm 1: vmlal.u32 <r3=reg128#5,<v4=reg128#4%bot,<5y34=reg128#13%top |
++# asm 2: vmlal.u32 <r3=q4,<v4=d6,<5y34=d25 |
++vmlal.u32 q4,d6,d25 |
++ |
++# qhasm: ptr = &5y12_stack |
++# asm 1: lea >ptr=int32#3,<5y12_stack=stack128#5 |
++# asm 2: lea >ptr=r2,<5y12_stack=[sp,#64] |
++add r2,sp,#64 |
++ |
++# qhasm: 5y12 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>5y12=d22->5y12=d23},[<ptr=r2,: 128] |
++vld1.8 {d22-d23},[r2,: 128] |
++ |
++# qhasm: r0[0,1] += v4[0] unsigned* 5y12[0]; r0[2,3] += v4[1] unsigned* 5y12[1] |
++# asm 1: vmlal.u32 <r0=reg128#8,<v4=reg128#4%bot,<5y12=reg128#12%bot |
++# asm 2: vmlal.u32 <r0=q7,<v4=d6,<5y12=d22 |
++vmlal.u32 q7,d6,d22 |
++ |
++# qhasm: r0[0,1] += v23[0] unsigned* 5y34[0]; r0[2,3] += v23[1] unsigned* 5y34[1] |
++# asm 1: vmlal.u32 <r0=reg128#8,<v23=reg128#10%bot,<5y34=reg128#13%bot |
++# asm 2: vmlal.u32 <r0=q7,<v23=d18,<5y34=d24 |
++vmlal.u32 q7,d18,d24 |
++ |
++# qhasm: r0[0,1] += v23[2] unsigned* 5y12[2]; r0[2,3] += v23[3] unsigned* 5y12[3] |
++# asm 1: vmlal.u32 <r0=reg128#8,<v23=reg128#10%top,<5y12=reg128#12%top |
++# asm 2: vmlal.u32 <r0=q7,<v23=d19,<5y12=d23 |
++vmlal.u32 q7,d19,d23 |
++ |
++# qhasm: r0[0,1] += v01[0] unsigned* y0[0]; r0[2,3] += v01[1] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r0=reg128#8,<v01=reg128#11%bot,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r0=q7,<v01=d20,<y0=d0 |
++vmlal.u32 q7,d20,d0 |
++ |
++# qhasm: r0[0,1] += v01[2] unsigned* 5y34[2]; r0[2,3] += v01[3] unsigned* 5y34[3] |
++# asm 1: vmlal.u32 <r0=reg128#8,<v01=reg128#11%top,<5y34=reg128#13%top |
++# asm 2: vmlal.u32 <r0=q7,<v01=d21,<5y34=d25 |
++vmlal.u32 q7,d21,d25 |
++ |
++# qhasm: r1[0,1] += v01[0] unsigned* y12[0]; r1[2,3] += v01[1] unsigned* y12[1] |
++# asm 1: vmlal.u32 <r1=reg128#15,<v01=reg128#11%bot,<y12=reg128#2%bot |
++# asm 2: vmlal.u32 <r1=q14,<v01=d20,<y12=d2 |
++vmlal.u32 q14,d20,d2 |
++ |
++# qhasm: r1[0,1] += v01[2] unsigned* y0[0]; r1[2,3] += v01[3] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r1=reg128#15,<v01=reg128#11%top,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r1=q14,<v01=d21,<y0=d0 |
++vmlal.u32 q14,d21,d0 |
++ |
++# qhasm: r1[0,1] += v23[0] unsigned* 5y34[2]; r1[2,3] += v23[1] unsigned* 5y34[3] |
++# asm 1: vmlal.u32 <r1=reg128#15,<v23=reg128#10%bot,<5y34=reg128#13%top |
++# asm 2: vmlal.u32 <r1=q14,<v23=d18,<5y34=d25 |
++vmlal.u32 q14,d18,d25 |
++ |
++# qhasm: r1[0,1] += v23[2] unsigned* 5y34[0]; r1[2,3] += v23[3] unsigned* 5y34[1] |
++# asm 1: vmlal.u32 <r1=reg128#15,<v23=reg128#10%top,<5y34=reg128#13%bot |
++# asm 2: vmlal.u32 <r1=q14,<v23=d19,<5y34=d24 |
++vmlal.u32 q14,d19,d24 |
++ |
++# qhasm: r1[0,1] += v4[0] unsigned* 5y12[2]; r1[2,3] += v4[1] unsigned* 5y12[3] |
++# asm 1: vmlal.u32 <r1=reg128#15,<v4=reg128#4%bot,<5y12=reg128#12%top |
++# asm 2: vmlal.u32 <r1=q14,<v4=d6,<5y12=d23 |
++vmlal.u32 q14,d6,d23 |
++ |
++# qhasm: r2[0,1] += v01[0] unsigned* y12[2]; r2[2,3] += v01[1] unsigned* y12[3] |
++# asm 1: vmlal.u32 <r2=reg128#14,<v01=reg128#11%bot,<y12=reg128#2%top |
++# asm 2: vmlal.u32 <r2=q13,<v01=d20,<y12=d3 |
++vmlal.u32 q13,d20,d3 |
++ |
++# qhasm: r2[0,1] += v01[2] unsigned* y12[0]; r2[2,3] += v01[3] unsigned* y12[1] |
++# asm 1: vmlal.u32 <r2=reg128#14,<v01=reg128#11%top,<y12=reg128#2%bot |
++# asm 2: vmlal.u32 <r2=q13,<v01=d21,<y12=d2 |
++vmlal.u32 q13,d21,d2 |
++ |
++# qhasm: r2[0,1] += v23[0] unsigned* y0[0]; r2[2,3] += v23[1] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r2=reg128#14,<v23=reg128#10%bot,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r2=q13,<v23=d18,<y0=d0 |
++vmlal.u32 q13,d18,d0 |
++ |
++# qhasm: r2[0,1] += v23[2] unsigned* 5y34[2]; r2[2,3] += v23[3] unsigned* 5y34[3] |
++# asm 1: vmlal.u32 <r2=reg128#14,<v23=reg128#10%top,<5y34=reg128#13%top |
++# asm 2: vmlal.u32 <r2=q13,<v23=d19,<5y34=d25 |
++vmlal.u32 q13,d19,d25 |
++ |
++# qhasm: r2[0,1] += v4[0] unsigned* 5y34[0]; r2[2,3] += v4[1] unsigned* 5y34[1] |
++# asm 1: vmlal.u32 <r2=reg128#14,<v4=reg128#4%bot,<5y34=reg128#13%bot |
++# asm 2: vmlal.u32 <r2=q13,<v4=d6,<5y34=d24 |
++vmlal.u32 q13,d6,d24 |
++ |
++# qhasm: ptr = &two24 |
++# asm 1: lea >ptr=int32#3,<two24=stack128#1 |
++# asm 2: lea >ptr=r2,<two24=[sp,#0] |
++add r2,sp,#0 |
++ |
++# qhasm: 2x t1 = r0 unsigned>> 26 |
++# asm 1: vshr.u64 >t1=reg128#4,<r0=reg128#8,#26 |
++# asm 2: vshr.u64 >t1=q3,<r0=q7,#26 |
++vshr.u64 q3,q7,#26 |
++ |
++# qhasm: len -= 64 |
++# asm 1: sub >len=int32#4,<len=int32#4,#64 |
++# asm 2: sub >len=r3,<len=r3,#64 |
++sub r3,r3,#64 |
++ |
++# qhasm: r0 &= mask |
++# asm 1: vand >r0=reg128#6,<r0=reg128#8,<mask=reg128#7 |
++# asm 2: vand >r0=q5,<r0=q7,<mask=q6 |
++vand q5,q7,q6 |
++ |
++# qhasm: 2x r1 += t1 |
++# asm 1: vadd.i64 >r1=reg128#4,<r1=reg128#15,<t1=reg128#4 |
++# asm 2: vadd.i64 >r1=q3,<r1=q14,<t1=q3 |
++vadd.i64 q3,q14,q3 |
++ |
++# qhasm: 2x t4 = r3 unsigned>> 26 |
++# asm 1: vshr.u64 >t4=reg128#8,<r3=reg128#5,#26 |
++# asm 2: vshr.u64 >t4=q7,<r3=q4,#26 |
++vshr.u64 q7,q4,#26 |
++ |
++# qhasm: r3 &= mask |
++# asm 1: vand >r3=reg128#5,<r3=reg128#5,<mask=reg128#7 |
++# asm 2: vand >r3=q4,<r3=q4,<mask=q6 |
++vand q4,q4,q6 |
++ |
++# qhasm: 2x x4 = r4 + t4 |
++# asm 1: vadd.i64 >x4=reg128#8,<r4=reg128#16,<t4=reg128#8 |
++# asm 2: vadd.i64 >x4=q7,<r4=q15,<t4=q7 |
++vadd.i64 q7,q15,q7 |
++ |
++# qhasm: r4 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>r4=reg128#16%bot->r4=reg128#16%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>r4=d30->r4=d31},[<ptr=r2,: 128] |
++vld1.8 {d30-d31},[r2,: 128] |
++ |
++# qhasm: 2x t2 = r1 unsigned>> 26 |
++# asm 1: vshr.u64 >t2=reg128#9,<r1=reg128#4,#26 |
++# asm 2: vshr.u64 >t2=q8,<r1=q3,#26 |
++vshr.u64 q8,q3,#26 |
++ |
++# qhasm: r1 &= mask |
++# asm 1: vand >r1=reg128#4,<r1=reg128#4,<mask=reg128#7 |
++# asm 2: vand >r1=q3,<r1=q3,<mask=q6 |
++vand q3,q3,q6 |
++ |
++# qhasm: 2x t0 = x4 unsigned>> 26 |
++# asm 1: vshr.u64 >t0=reg128#10,<x4=reg128#8,#26 |
++# asm 2: vshr.u64 >t0=q9,<x4=q7,#26 |
++vshr.u64 q9,q7,#26 |
++ |
++# qhasm: 2x r2 += t2 |
++# asm 1: vadd.i64 >r2=reg128#9,<r2=reg128#14,<t2=reg128#9 |
++# asm 2: vadd.i64 >r2=q8,<r2=q13,<t2=q8 |
++vadd.i64 q8,q13,q8 |
++ |
++# qhasm: x4 &= mask |
++# asm 1: vand >x4=reg128#11,<x4=reg128#8,<mask=reg128#7 |
++# asm 2: vand >x4=q10,<x4=q7,<mask=q6 |
++vand q10,q7,q6 |
++ |
++# qhasm: 2x x01 = r0 + t0 |
++# asm 1: vadd.i64 >x01=reg128#6,<r0=reg128#6,<t0=reg128#10 |
++# asm 2: vadd.i64 >x01=q5,<r0=q5,<t0=q9 |
++vadd.i64 q5,q5,q9 |
++ |
++# qhasm: r0 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>r0=reg128#8%bot->r0=reg128#8%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>r0=d14->r0=d15},[<ptr=r2,: 128] |
++vld1.8 {d14-d15},[r2,: 128] |
++ |
++# qhasm: ptr = &z34_stack |
++# asm 1: lea >ptr=int32#3,<z34_stack=stack128#9 |
++# asm 2: lea >ptr=r2,<z34_stack=[sp,#128] |
++add r2,sp,#128 |
++ |
++# qhasm: 2x t0 <<= 2 |
++# asm 1: vshl.i64 >t0=reg128#10,<t0=reg128#10,#2 |
++# asm 2: vshl.i64 >t0=q9,<t0=q9,#2 |
++vshl.i64 q9,q9,#2 |
++ |
++# qhasm: 2x t3 = r2 unsigned>> 26 |
++# asm 1: vshr.u64 >t3=reg128#14,<r2=reg128#9,#26 |
++# asm 2: vshr.u64 >t3=q13,<r2=q8,#26 |
++vshr.u64 q13,q8,#26 |
++ |
++# qhasm: 2x x01 += t0 |
++# asm 1: vadd.i64 >x01=reg128#15,<x01=reg128#6,<t0=reg128#10 |
++# asm 2: vadd.i64 >x01=q14,<x01=q5,<t0=q9 |
++vadd.i64 q14,q5,q9 |
++ |
++# qhasm: z34 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>z34=reg128#6%bot->z34=reg128#6%top},[<ptr=int32#3,: 128] |
++# asm 2: vld1.8 {>z34=d10->z34=d11},[<ptr=r2,: 128] |
++vld1.8 {d10-d11},[r2,: 128] |
++ |
++# qhasm: x23 = r2 & mask |
++# asm 1: vand >x23=reg128#10,<r2=reg128#9,<mask=reg128#7 |
++# asm 2: vand >x23=q9,<r2=q8,<mask=q6 |
++vand q9,q8,q6 |
++ |
++# qhasm: 2x r3 += t3 |
++# asm 1: vadd.i64 >r3=reg128#5,<r3=reg128#5,<t3=reg128#14 |
++# asm 2: vadd.i64 >r3=q4,<r3=q4,<t3=q13 |
++vadd.i64 q4,q4,q13 |
++ |
++# qhasm: input_2 += 32 |
++# asm 1: add >input_2=int32#2,<input_2=int32#2,#32 |
++# asm 2: add >input_2=r1,<input_2=r1,#32 |
++add r1,r1,#32 |
++ |
++# qhasm: 2x t1 = x01 unsigned>> 26 |
++# asm 1: vshr.u64 >t1=reg128#14,<x01=reg128#15,#26 |
++# asm 2: vshr.u64 >t1=q13,<x01=q14,#26 |
++vshr.u64 q13,q14,#26 |
++ |
++# qhasm: x23 = x23[0,2,1,3] |
++# asm 1: vtrn.32 <x23=reg128#10%bot,<x23=reg128#10%top |
++# asm 2: vtrn.32 <x23=d18,<x23=d19 |
++vtrn.32 d18,d19 |
++ |
++# qhasm: x01 = x01 & mask |
++# asm 1: vand >x01=reg128#9,<x01=reg128#15,<mask=reg128#7 |
++# asm 2: vand >x01=q8,<x01=q14,<mask=q6 |
++vand q8,q14,q6 |
++ |
++# qhasm: 2x r1 += t1 |
++# asm 1: vadd.i64 >r1=reg128#4,<r1=reg128#4,<t1=reg128#14 |
++# asm 2: vadd.i64 >r1=q3,<r1=q3,<t1=q13 |
++vadd.i64 q3,q3,q13 |
++ |
++# qhasm: 2x t4 = r3 unsigned>> 26 |
++# asm 1: vshr.u64 >t4=reg128#14,<r3=reg128#5,#26 |
++# asm 2: vshr.u64 >t4=q13,<r3=q4,#26 |
++vshr.u64 q13,q4,#26 |
++ |
++# qhasm: x01 = x01[0,2,1,3] |
++# asm 1: vtrn.32 <x01=reg128#9%bot,<x01=reg128#9%top |
++# asm 2: vtrn.32 <x01=d16,<x01=d17 |
++vtrn.32 d16,d17 |
++ |
++# qhasm: r3 &= mask |
++# asm 1: vand >r3=reg128#5,<r3=reg128#5,<mask=reg128#7 |
++# asm 2: vand >r3=q4,<r3=q4,<mask=q6 |
++vand q4,q4,q6 |
++ |
++# qhasm: r1 = r1[0,2,1,3] |
++# asm 1: vtrn.32 <r1=reg128#4%bot,<r1=reg128#4%top |
++# asm 2: vtrn.32 <r1=d6,<r1=d7 |
++vtrn.32 d6,d7 |
++ |
++# qhasm: 2x x4 += t4 |
++# asm 1: vadd.i64 >x4=reg128#11,<x4=reg128#11,<t4=reg128#14 |
++# asm 2: vadd.i64 >x4=q10,<x4=q10,<t4=q13 |
++vadd.i64 q10,q10,q13 |
++ |
++# qhasm: r3 = r3[0,2,1,3] |
++# asm 1: vtrn.32 <r3=reg128#5%bot,<r3=reg128#5%top |
++# asm 2: vtrn.32 <r3=d8,<r3=d9 |
++vtrn.32 d8,d9 |
++ |
++# qhasm: x01 = x01[0,1] r1[0,1] |
++# asm 1: vext.32 <x01=reg128#9%top,<r1=reg128#4%bot,<r1=reg128#4%bot,#0 |
++# asm 2: vext.32 <x01=d17,<r1=d6,<r1=d6,#0 |
++vext.32 d17,d6,d6,#0 |
++ |
++# qhasm: x23 = x23[0,1] r3[0,1] |
++# asm 1: vext.32 <x23=reg128#10%top,<r3=reg128#5%bot,<r3=reg128#5%bot,#0 |
++# asm 2: vext.32 <x23=d19,<r3=d8,<r3=d8,#0 |
++vext.32 d19,d8,d8,#0 |
++ |
++# qhasm: x4 = x4[0,2,1,3] |
++# asm 1: vtrn.32 <x4=reg128#11%bot,<x4=reg128#11%top |
++# asm 2: vtrn.32 <x4=d20,<x4=d21 |
++vtrn.32 d20,d21 |
++ |
++# qhasm: unsigned>? len - 64 |
++# asm 1: cmp <len=int32#4,#64 |
++# asm 2: cmp <len=r3,#64 |
++cmp r3,#64 |
++ |
++# qhasm: goto mainloop2 if unsigned> |
++bhi ._mainloop2 |
++ |
++# qhasm: input_2 -= 32 |
++# asm 1: sub >input_2=int32#3,<input_2=int32#2,#32 |
++# asm 2: sub >input_2=r2,<input_2=r1,#32 |
++sub r2,r1,#32 |
++ |
++# qhasm: below64bytes: |
++._below64bytes: |
++ |
++# qhasm: unsigned>? len - 32 |
++# asm 1: cmp <len=int32#4,#32 |
++# asm 2: cmp <len=r3,#32 |
++cmp r3,#32 |
++ |
++# qhasm: goto end if !unsigned> |
++bls ._end |
++ |
++# qhasm: mainloop: |
++._mainloop: |
++ |
++# qhasm: new r0 |
++ |
++# qhasm: ptr = &two24 |
++# asm 1: lea >ptr=int32#2,<two24=stack128#1 |
++# asm 2: lea >ptr=r1,<two24=[sp,#0] |
++add r1,sp,#0 |
++ |
++# qhasm: r4 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>r4=reg128#5%bot->r4=reg128#5%top},[<ptr=int32#2,: 128] |
++# asm 2: vld1.8 {>r4=d8->r4=d9},[<ptr=r1,: 128] |
++vld1.8 {d8-d9},[r1,: 128] |
++ |
++# qhasm: u4 aligned= mem128[ptr] |
++# asm 1: vld1.8 {>u4=reg128#6%bot->u4=reg128#6%top},[<ptr=int32#2,: 128] |
++# asm 2: vld1.8 {>u4=d10->u4=d11},[<ptr=r1,: 128] |
++vld1.8 {d10-d11},[r1,: 128] |
++ |
++# qhasm: c01 = mem128[input_2];input_2+=16 |
++# asm 1: vld1.8 {>c01=reg128#8%bot->c01=reg128#8%top},[<input_2=int32#3]! |
++# asm 2: vld1.8 {>c01=d14->c01=d15},[<input_2=r2]! |
++vld1.8 {d14-d15},[r2]! |
++ |
++# qhasm: r4[0,1] += x01[0] unsigned* y34[2]; r4[2,3] += x01[1] unsigned* y34[3] |
++# asm 1: vmlal.u32 <r4=reg128#5,<x01=reg128#9%bot,<y34=reg128#3%top |
++# asm 2: vmlal.u32 <r4=q4,<x01=d16,<y34=d5 |
++vmlal.u32 q4,d16,d5 |
++ |
++# qhasm: c23 = mem128[input_2];input_2+=16 |
++# asm 1: vld1.8 {>c23=reg128#14%bot->c23=reg128#14%top},[<input_2=int32#3]! |
++# asm 2: vld1.8 {>c23=d26->c23=d27},[<input_2=r2]! |
++vld1.8 {d26-d27},[r2]! |
++ |
++# qhasm: r4[0,1] += x01[2] unsigned* y34[0]; r4[2,3] += x01[3] unsigned* y34[1] |
++# asm 1: vmlal.u32 <r4=reg128#5,<x01=reg128#9%top,<y34=reg128#3%bot |
++# asm 2: vmlal.u32 <r4=q4,<x01=d17,<y34=d4 |
++vmlal.u32 q4,d17,d4 |
++ |
++# qhasm: r0 = u4[1]c01[0]r0[2,3] |
++# asm 1: vext.32 <r0=reg128#4%bot,<u4=reg128#6%bot,<c01=reg128#8%bot,#1 |
++# asm 2: vext.32 <r0=d6,<u4=d10,<c01=d14,#1 |
++vext.32 d6,d10,d14,#1 |
++ |
++# qhasm: r4[0,1] += x23[0] unsigned* y12[2]; r4[2,3] += x23[1] unsigned* y12[3] |
++# asm 1: vmlal.u32 <r4=reg128#5,<x23=reg128#10%bot,<y12=reg128#2%top |
++# asm 2: vmlal.u32 <r4=q4,<x23=d18,<y12=d3 |
++vmlal.u32 q4,d18,d3 |
++ |
++# qhasm: r0 = r0[0,1]u4[1]c23[0] |
++# asm 1: vext.32 <r0=reg128#4%top,<u4=reg128#6%bot,<c23=reg128#14%bot,#1 |
++# asm 2: vext.32 <r0=d7,<u4=d10,<c23=d26,#1 |
++vext.32 d7,d10,d26,#1 |
++ |
++# qhasm: r4[0,1] += x23[2] unsigned* y12[0]; r4[2,3] += x23[3] unsigned* y12[1] |
++# asm 1: vmlal.u32 <r4=reg128#5,<x23=reg128#10%top,<y12=reg128#2%bot |
++# asm 2: vmlal.u32 <r4=q4,<x23=d19,<y12=d2 |
++vmlal.u32 q4,d19,d2 |
++ |
++# qhasm: r0 = r0[1]r0[0]r0[3]r0[2] |
++# asm 1: vrev64.i32 >r0=reg128#4,<r0=reg128#4 |
++# asm 2: vrev64.i32 >r0=q3,<r0=q3 |
++vrev64.i32 q3,q3 |
++ |
++# qhasm: r4[0,1] += x4[0] unsigned* y0[0]; r4[2,3] += x4[1] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r4=reg128#5,<x4=reg128#11%bot,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r4=q4,<x4=d20,<y0=d0 |
++vmlal.u32 q4,d20,d0 |
++ |
++# qhasm: r0[0,1] += x4[0] unsigned* 5y12[0]; r0[2,3] += x4[1] unsigned* 5y12[1] |
++# asm 1: vmlal.u32 <r0=reg128#4,<x4=reg128#11%bot,<5y12=reg128#12%bot |
++# asm 2: vmlal.u32 <r0=q3,<x4=d20,<5y12=d22 |
++vmlal.u32 q3,d20,d22 |
++ |
++# qhasm: r0[0,1] += x23[0] unsigned* 5y34[0]; r0[2,3] += x23[1] unsigned* 5y34[1] |
++# asm 1: vmlal.u32 <r0=reg128#4,<x23=reg128#10%bot,<5y34=reg128#13%bot |
++# asm 2: vmlal.u32 <r0=q3,<x23=d18,<5y34=d24 |
++vmlal.u32 q3,d18,d24 |
++ |
++# qhasm: r0[0,1] += x23[2] unsigned* 5y12[2]; r0[2,3] += x23[3] unsigned* 5y12[3] |
++# asm 1: vmlal.u32 <r0=reg128#4,<x23=reg128#10%top,<5y12=reg128#12%top |
++# asm 2: vmlal.u32 <r0=q3,<x23=d19,<5y12=d23 |
++vmlal.u32 q3,d19,d23 |
++ |
++# qhasm: c01 c23 = c01[0]c23[0]c01[2]c23[2]c01[1]c23[1]c01[3]c23[3] |
++# asm 1: vtrn.32 <c01=reg128#8,<c23=reg128#14 |
++# asm 2: vtrn.32 <c01=q7,<c23=q13 |
++vtrn.32 q7,q13 |
++ |
++# qhasm: r0[0,1] += x01[0] unsigned* y0[0]; r0[2,3] += x01[1] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r0=reg128#4,<x01=reg128#9%bot,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r0=q3,<x01=d16,<y0=d0 |
++vmlal.u32 q3,d16,d0 |
++ |
++# qhasm: r3[0,1] = c23[2]<<18; r3[2,3] = c23[3]<<18 |
++# asm 1: vshll.u32 >r3=reg128#6,<c23=reg128#14%top,#18 |
++# asm 2: vshll.u32 >r3=q5,<c23=d27,#18 |
++vshll.u32 q5,d27,#18 |
++ |
++# qhasm: r0[0,1] += x01[2] unsigned* 5y34[2]; r0[2,3] += x01[3] unsigned* 5y34[3] |
++# asm 1: vmlal.u32 <r0=reg128#4,<x01=reg128#9%top,<5y34=reg128#13%top |
++# asm 2: vmlal.u32 <r0=q3,<x01=d17,<5y34=d25 |
++vmlal.u32 q3,d17,d25 |
++ |
++# qhasm: r3[0,1] += x01[0] unsigned* y34[0]; r3[2,3] += x01[1] unsigned* y34[1] |
++# asm 1: vmlal.u32 <r3=reg128#6,<x01=reg128#9%bot,<y34=reg128#3%bot |
++# asm 2: vmlal.u32 <r3=q5,<x01=d16,<y34=d4 |
++vmlal.u32 q5,d16,d4 |
++ |
++# qhasm: r3[0,1] += x01[2] unsigned* y12[2]; r3[2,3] += x01[3] unsigned* y12[3] |
++# asm 1: vmlal.u32 <r3=reg128#6,<x01=reg128#9%top,<y12=reg128#2%top |
++# asm 2: vmlal.u32 <r3=q5,<x01=d17,<y12=d3 |
++vmlal.u32 q5,d17,d3 |
++ |
++# qhasm: r3[0,1] += x23[0] unsigned* y12[0]; r3[2,3] += x23[1] unsigned* y12[1] |
++# asm 1: vmlal.u32 <r3=reg128#6,<x23=reg128#10%bot,<y12=reg128#2%bot |
++# asm 2: vmlal.u32 <r3=q5,<x23=d18,<y12=d2 |
++vmlal.u32 q5,d18,d2 |
++ |
++# qhasm: r3[0,1] += x23[2] unsigned* y0[0]; r3[2,3] += x23[3] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r3=reg128#6,<x23=reg128#10%top,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r3=q5,<x23=d19,<y0=d0 |
++vmlal.u32 q5,d19,d0 |
++ |
++# qhasm: r1[0,1] = c23[0]<<6; r1[2,3] = c23[1]<<6 |
++# asm 1: vshll.u32 >r1=reg128#14,<c23=reg128#14%bot,#6 |
++# asm 2: vshll.u32 >r1=q13,<c23=d26,#6 |
++vshll.u32 q13,d26,#6 |
++ |
++# qhasm: r3[0,1] += x4[0] unsigned* 5y34[2]; r3[2,3] += x4[1] unsigned* 5y34[3] |
++# asm 1: vmlal.u32 <r3=reg128#6,<x4=reg128#11%bot,<5y34=reg128#13%top |
++# asm 2: vmlal.u32 <r3=q5,<x4=d20,<5y34=d25 |
++vmlal.u32 q5,d20,d25 |
++ |
++# qhasm: r1[0,1] += x01[0] unsigned* y12[0]; r1[2,3] += x01[1] unsigned* y12[1] |
++# asm 1: vmlal.u32 <r1=reg128#14,<x01=reg128#9%bot,<y12=reg128#2%bot |
++# asm 2: vmlal.u32 <r1=q13,<x01=d16,<y12=d2 |
++vmlal.u32 q13,d16,d2 |
++ |
++# qhasm: r1[0,1] += x01[2] unsigned* y0[0]; r1[2,3] += x01[3] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r1=reg128#14,<x01=reg128#9%top,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r1=q13,<x01=d17,<y0=d0 |
++vmlal.u32 q13,d17,d0 |
++ |
++# qhasm: r1[0,1] += x23[0] unsigned* 5y34[2]; r1[2,3] += x23[1] unsigned* 5y34[3] |
++# asm 1: vmlal.u32 <r1=reg128#14,<x23=reg128#10%bot,<5y34=reg128#13%top |
++# asm 2: vmlal.u32 <r1=q13,<x23=d18,<5y34=d25 |
++vmlal.u32 q13,d18,d25 |
++ |
++# qhasm: r1[0,1] += x23[2] unsigned* 5y34[0]; r1[2,3] += x23[3] unsigned* 5y34[1] |
++# asm 1: vmlal.u32 <r1=reg128#14,<x23=reg128#10%top,<5y34=reg128#13%bot |
++# asm 2: vmlal.u32 <r1=q13,<x23=d19,<5y34=d24 |
++vmlal.u32 q13,d19,d24 |
++ |
++# qhasm: r2[0,1] = c01[2]<<12; r2[2,3] = c01[3]<<12 |
++# asm 1: vshll.u32 >r2=reg128#8,<c01=reg128#8%top,#12 |
++# asm 2: vshll.u32 >r2=q7,<c01=d15,#12 |
++vshll.u32 q7,d15,#12 |
++ |
++# qhasm: r1[0,1] += x4[0] unsigned* 5y12[2]; r1[2,3] += x4[1] unsigned* 5y12[3] |
++# asm 1: vmlal.u32 <r1=reg128#14,<x4=reg128#11%bot,<5y12=reg128#12%top |
++# asm 2: vmlal.u32 <r1=q13,<x4=d20,<5y12=d23 |
++vmlal.u32 q13,d20,d23 |
++ |
++# qhasm: r2[0,1] += x01[0] unsigned* y12[2]; r2[2,3] += x01[1] unsigned* y12[3] |
++# asm 1: vmlal.u32 <r2=reg128#8,<x01=reg128#9%bot,<y12=reg128#2%top |
++# asm 2: vmlal.u32 <r2=q7,<x01=d16,<y12=d3 |
++vmlal.u32 q7,d16,d3 |
++ |
++# qhasm: r2[0,1] += x01[2] unsigned* y12[0]; r2[2,3] += x01[3] unsigned* y12[1] |
++# asm 1: vmlal.u32 <r2=reg128#8,<x01=reg128#9%top,<y12=reg128#2%bot |
++# asm 2: vmlal.u32 <r2=q7,<x01=d17,<y12=d2 |
++vmlal.u32 q7,d17,d2 |
++ |
++# qhasm: r2[0,1] += x23[0] unsigned* y0[0]; r2[2,3] += x23[1] unsigned* y0[1] |
++# asm 1: vmlal.u32 <r2=reg128#8,<x23=reg128#10%bot,<y0=reg128#1%bot |
++# asm 2: vmlal.u32 <r2=q7,<x23=d18,<y0=d0 |
++vmlal.u32 q7,d18,d0 |
++ |
++# qhasm: r2[0,1] += x23[2] unsigned* 5y34[2]; r2[2,3] += x23[3] unsigned* 5y34[3] |
++# asm 1: vmlal.u32 <r2=reg128#8,<x23=reg128#10%top,<5y34=reg128#13%top |
++# asm 2: vmlal.u32 <r2=q7,<x23=d19,<5y34=d25 |
++vmlal.u32 q7,d19,d25 |
++ |
++# qhasm: r2[0,1] += x4[0] unsigned* 5y34[0]; r2[2,3] += x4[1] unsigned* 5y34[1] |
++# asm 1: vmlal.u32 <r2=reg128#8,<x4=reg128#11%bot,<5y34=reg128#13%bot |
++# asm 2: vmlal.u32 <r2=q7,<x4=d20,<5y34=d24 |
++vmlal.u32 q7,d20,d24 |
++ |
++# qhasm: 2x t1 = r0 unsigned>> 26 |
++# asm 1: vshr.u64 >t1=reg128#9,<r0=reg128#4,#26 |
++# asm 2: vshr.u64 >t1=q8,<r0=q3,#26 |
++vshr.u64 q8,q3,#26 |
++ |
++# qhasm: r0 &= mask |
++# asm 1: vand >r0=reg128#4,<r0=reg128#4,<mask=reg128#7 |
++# asm 2: vand >r0=q3,<r0=q3,<mask=q6 |
++vand q3,q3,q6 |
++ |
++# qhasm: 2x r1 += t1 |
++# asm 1: vadd.i64 >r1=reg128#9,<r1=reg128#14,<t1=reg128#9 |
++# asm 2: vadd.i64 >r1=q8,<r1=q13,<t1=q8 |
++vadd.i64 q8,q13,q8 |
++ |
++# qhasm: 2x t4 = r3 unsigned>> 26 |
++# asm 1: vshr.u64 >t4=reg128#10,<r3=reg128#6,#26 |
++# asm 2: vshr.u64 >t4=q9,<r3=q5,#26 |
++vshr.u64 q9,q5,#26 |
++ |
++# qhasm: r3 &= mask |
++# asm 1: vand >r3=reg128#6,<r3=reg128#6,<mask=reg128#7 |
++# asm 2: vand >r3=q5,<r3=q5,<mask=q6 |
++vand q5,q5,q6 |
++ |
++# qhasm: 2x r4 += t4 |
++# asm 1: vadd.i64 >r4=reg128#5,<r4=reg128#5,<t4=reg128#10 |
++# asm 2: vadd.i64 >r4=q4,<r4=q4,<t4=q9 |
++vadd.i64 q4,q4,q9 |
++ |
++# qhasm: 2x t2 = r1 unsigned>> 26 |
++# asm 1: vshr.u64 >t2=reg128#10,<r1=reg128#9,#26 |
++# asm 2: vshr.u64 >t2=q9,<r1=q8,#26 |
++vshr.u64 q9,q8,#26 |
++ |
++# qhasm: r1 &= mask |
++# asm 1: vand >r1=reg128#11,<r1=reg128#9,<mask=reg128#7 |
++# asm 2: vand >r1=q10,<r1=q8,<mask=q6 |
++vand q10,q8,q6 |
++ |
++# qhasm: 2x t0 = r4 unsigned>> 26 |
++# asm 1: vshr.u64 >t0=reg128#9,<r4=reg128#5,#26 |
++# asm 2: vshr.u64 >t0=q8,<r4=q4,#26 |
++vshr.u64 q8,q4,#26 |
++ |
++# qhasm: 2x r2 += t2 |
++# asm 1: vadd.i64 >r2=reg128#8,<r2=reg128#8,<t2=reg128#10 |
++# asm 2: vadd.i64 >r2=q7,<r2=q7,<t2=q9 |
++vadd.i64 q7,q7,q9 |
++ |
++# qhasm: r4 &= mask |
++# asm 1: vand >r4=reg128#5,<r4=reg128#5,<mask=reg128#7 |
++# asm 2: vand >r4=q4,<r4=q4,<mask=q6 |
++vand q4,q4,q6 |
++ |
++# qhasm: 2x r0 += t0 |
++# asm 1: vadd.i64 >r0=reg128#4,<r0=reg128#4,<t0=reg128#9 |
++# asm 2: vadd.i64 >r0=q3,<r0=q3,<t0=q8 |
++vadd.i64 q3,q3,q8 |
++ |
++# qhasm: 2x t0 <<= 2 |
++# asm 1: vshl.i64 >t0=reg128#9,<t0=reg128#9,#2 |
++# asm 2: vshl.i64 >t0=q8,<t0=q8,#2 |
++vshl.i64 q8,q8,#2 |
++ |
++# qhasm: 2x t3 = r2 unsigned>> 26 |
++# asm 1: vshr.u64 >t3=reg128#14,<r2=reg128#8,#26 |
++# asm 2: vshr.u64 >t3=q13,<r2=q7,#26 |
++vshr.u64 q13,q7,#26 |
++ |
++# qhasm: 2x r0 += t0 |
++# asm 1: vadd.i64 >r0=reg128#4,<r0=reg128#4,<t0=reg128#9 |
++# asm 2: vadd.i64 >r0=q3,<r0=q3,<t0=q8 |
++vadd.i64 q3,q3,q8 |
++ |
++# qhasm: x23 = r2 & mask |
++# asm 1: vand >x23=reg128#10,<r2=reg128#8,<mask=reg128#7 |
++# asm 2: vand >x23=q9,<r2=q7,<mask=q6 |
++vand q9,q7,q6 |
++ |
++# qhasm: 2x r3 += t3 |
++# asm 1: vadd.i64 >r3=reg128#6,<r3=reg128#6,<t3=reg128#14 |
++# asm 2: vadd.i64 >r3=q5,<r3=q5,<t3=q13 |
++vadd.i64 q5,q5,q13 |
++ |
++# qhasm: 2x t1 = r0 unsigned>> 26 |
++# asm 1: vshr.u64 >t1=reg128#8,<r0=reg128#4,#26 |
++# asm 2: vshr.u64 >t1=q7,<r0=q3,#26 |
++vshr.u64 q7,q3,#26 |
++ |
++# qhasm: x01 = r0 & mask |
++# asm 1: vand >x01=reg128#9,<r0=reg128#4,<mask=reg128#7 |
++# asm 2: vand >x01=q8,<r0=q3,<mask=q6 |
++vand q8,q3,q6 |
++ |
++# qhasm: 2x r1 += t1 |
++# asm 1: vadd.i64 >r1=reg128#4,<r1=reg128#11,<t1=reg128#8 |
++# asm 2: vadd.i64 >r1=q3,<r1=q10,<t1=q7 |
++vadd.i64 q3,q10,q7 |
++ |
++# qhasm: 2x t4 = r3 unsigned>> 26 |
++# asm 1: vshr.u64 >t4=reg128#8,<r3=reg128#6,#26 |
++# asm 2: vshr.u64 >t4=q7,<r3=q5,#26 |
++vshr.u64 q7,q5,#26 |
++ |
++# qhasm: r3 &= mask |
++# asm 1: vand >r3=reg128#6,<r3=reg128#6,<mask=reg128#7 |
++# asm 2: vand >r3=q5,<r3=q5,<mask=q6 |
++vand q5,q5,q6 |
++ |
++# qhasm: 2x x4 = r4 + t4 |
++# asm 1: vadd.i64 >x4=reg128#11,<r4=reg128#5,<t4=reg128#8 |
++# asm 2: vadd.i64 >x4=q10,<r4=q4,<t4=q7 |
++vadd.i64 q10,q4,q7 |
++ |
++# qhasm: len -= 32 |
++# asm 1: sub >len=int32#4,<len=int32#4,#32 |
++# asm 2: sub >len=r3,<len=r3,#32 |
++sub r3,r3,#32 |
++ |
++# qhasm: x01 = x01[0,2,1,3] |
++# asm 1: vtrn.32 <x01=reg128#9%bot,<x01=reg128#9%top |
++# asm 2: vtrn.32 <x01=d16,<x01=d17 |
++vtrn.32 d16,d17 |
++ |
++# qhasm: x23 = x23[0,2,1,3] |
++# asm 1: vtrn.32 <x23=reg128#10%bot,<x23=reg128#10%top |
++# asm 2: vtrn.32 <x23=d18,<x23=d19 |
++vtrn.32 d18,d19 |
++ |
++# qhasm: r1 = r1[0,2,1,3] |
++# asm 1: vtrn.32 <r1=reg128#4%bot,<r1=reg128#4%top |
++# asm 2: vtrn.32 <r1=d6,<r1=d7 |
++vtrn.32 d6,d7 |
++ |
++# qhasm: r3 = r3[0,2,1,3] |
++# asm 1: vtrn.32 <r3=reg128#6%bot,<r3=reg128#6%top |
++# asm 2: vtrn.32 <r3=d10,<r3=d11 |
++vtrn.32 d10,d11 |
++ |
++# qhasm: x4 = x4[0,2,1,3] |
++# asm 1: vtrn.32 <x4=reg128#11%bot,<x4=reg128#11%top |
++# asm 2: vtrn.32 <x4=d20,<x4=d21 |
++vtrn.32 d20,d21 |
++ |
++# qhasm: x01 = x01[0,1] r1[0,1] |
++# asm 1: vext.32 <x01=reg128#9%top,<r1=reg128#4%bot,<r1=reg128#4%bot,#0 |
++# asm 2: vext.32 <x01=d17,<r1=d6,<r1=d6,#0 |
++vext.32 d17,d6,d6,#0 |
++ |
++# qhasm: x23 = x23[0,1] r3[0,1] |
++# asm 1: vext.32 <x23=reg128#10%top,<r3=reg128#6%bot,<r3=reg128#6%bot,#0 |
++# asm 2: vext.32 <x23=d19,<r3=d10,<r3=d10,#0 |
++vext.32 d19,d10,d10,#0 |
++ |
++# qhasm: unsigned>? len - 32 |
++# asm 1: cmp <len=int32#4,#32 |
++# asm 2: cmp <len=r3,#32 |
++cmp r3,#32 |
++ |
++# qhasm: goto mainloop if unsigned> |
++bhi ._mainloop |
++ |
++# qhasm: end: |
++._end: |
++ |
++# qhasm: mem128[input_0] = x01;input_0+=16 |
++# asm 1: vst1.8 {<x01=reg128#9%bot-<x01=reg128#9%top},[<input_0=int32#1]! |
++# asm 2: vst1.8 {<x01=d16-<x01=d17},[<input_0=r0]! |
++vst1.8 {d16-d17},[r0]! |
++ |
++# qhasm: mem128[input_0] = x23;input_0+=16 |
++# asm 1: vst1.8 {<x23=reg128#10%bot-<x23=reg128#10%top},[<input_0=int32#1]! |
++# asm 2: vst1.8 {<x23=d18-<x23=d19},[<input_0=r0]! |
++vst1.8 {d18-d19},[r0]! |
++ |
++# qhasm: mem64[input_0] = x4[0] |
++# asm 1: vst1.8 <x4=reg128#11%bot,[<input_0=int32#1] |
++# asm 2: vst1.8 <x4=d20,[<input_0=r0] |
++vst1.8 d20,[r0] |
++ |
++# qhasm: len = len |
++# asm 1: mov >len=int32#1,<len=int32#4 |
++# asm 2: mov >len=r0,<len=r3 |
++mov r0,r3 |
++ |
++# qhasm: qpopreturn len |
++mov sp,r12 |
++vpop {q4,q5,q6,q7} |
++bx lr |
++ |
++# qhasm: int32 input_0 |
++ |
++# qhasm: int32 input_1 |
++ |
++# qhasm: int32 input_2 |
++ |
++# qhasm: int32 input_3 |
++ |
++# qhasm: stack32 input_4 |
++ |
++# qhasm: stack32 input_5 |
++ |
++# qhasm: stack32 input_6 |
++ |
++# qhasm: stack32 input_7 |
++ |
++# qhasm: int32 caller_r4 |
++ |
++# qhasm: int32 caller_r5 |
++ |
++# qhasm: int32 caller_r6 |
++ |
++# qhasm: int32 caller_r7 |
++ |
++# qhasm: int32 caller_r8 |
++ |
++# qhasm: int32 caller_r9 |
++ |
++# qhasm: int32 caller_r10 |
++ |
++# qhasm: int32 caller_r11 |
++ |
++# qhasm: int32 caller_r12 |
++ |
++# qhasm: int32 caller_r14 |
++ |
++# qhasm: reg128 caller_q4 |
++ |
++# qhasm: reg128 caller_q5 |
++ |
++# qhasm: reg128 caller_q6 |
++ |
++# qhasm: reg128 caller_q7 |
++ |
++# qhasm: reg128 r0 |
++ |
++# qhasm: reg128 r1 |
++ |
++# qhasm: reg128 r2 |
++ |
++# qhasm: reg128 r3 |
++ |
++# qhasm: reg128 r4 |
++ |
++# qhasm: reg128 x01 |
++ |
++# qhasm: reg128 x23 |
++ |
++# qhasm: reg128 x4 |
++ |
++# qhasm: reg128 y01 |
++ |
++# qhasm: reg128 y23 |
++ |
++# qhasm: reg128 y4 |
++ |
++# qhasm: reg128 _5y01 |
++ |
++# qhasm: reg128 _5y23 |
++ |
++# qhasm: reg128 _5y4 |
++ |
++# qhasm: reg128 c01 |
++ |
++# qhasm: reg128 c23 |
++ |
++# qhasm: reg128 c4 |
++ |
++# qhasm: reg128 t0 |
++ |
++# qhasm: reg128 t1 |
++ |
++# qhasm: reg128 t2 |
++ |
++# qhasm: reg128 t3 |
++ |
++# qhasm: reg128 t4 |
++ |
++# qhasm: reg128 mask |
++ |
++# qhasm: enter crypto_onetimeauth_poly1305_neon2_addmulmod |
++.align 2 |
++.global openssl_poly1305_neon2_addmulmod |
++.type openssl_poly1305_neon2_addmulmod STT_FUNC |
++openssl_poly1305_neon2_addmulmod: |
++sub sp,sp,#0 |
++ |
++# qhasm: 2x mask = 0xffffffff |
++# asm 1: vmov.i64 >mask=reg128#1,#0xffffffff |
++# asm 2: vmov.i64 >mask=q0,#0xffffffff |
++vmov.i64 q0,#0xffffffff |
++ |
++# qhasm: y01 aligned= mem128[input_2];input_2+=16 |
++# asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[<input_2=int32#3,: 128]! |
++# asm 2: vld1.8 {>y01=d2->y01=d3},[<input_2=r2,: 128]! |
++vld1.8 {d2-d3},[r2,: 128]! |
++ |
++# qhasm: 4x _5y01 = y01 << 2 |
++# asm 1: vshl.i32 >_5y01=reg128#3,<y01=reg128#2,#2 |
++# asm 2: vshl.i32 >_5y01=q2,<y01=q1,#2 |
++vshl.i32 q2,q1,#2 |
++ |
++# qhasm: y23 aligned= mem128[input_2];input_2+=16 |
++# asm 1: vld1.8 {>y23=reg128#4%bot->y23=reg128#4%top},[<input_2=int32#3,: 128]! |
++# asm 2: vld1.8 {>y23=d6->y23=d7},[<input_2=r2,: 128]! |
++vld1.8 {d6-d7},[r2,: 128]! |
++ |
++# qhasm: 4x _5y23 = y23 << 2 |
++# asm 1: vshl.i32 >_5y23=reg128#9,<y23=reg128#4,#2 |
++# asm 2: vshl.i32 >_5y23=q8,<y23=q3,#2 |
++vshl.i32 q8,q3,#2 |
++ |
++# qhasm: y4 aligned= mem64[input_2]y4[1] |
++# asm 1: vld1.8 {<y4=reg128#10%bot},[<input_2=int32#3,: 64] |
++# asm 2: vld1.8 {<y4=d18},[<input_2=r2,: 64] |
++vld1.8 {d18},[r2,: 64] |
++ |
++# qhasm: 4x _5y4 = y4 << 2 |
++# asm 1: vshl.i32 >_5y4=reg128#11,<y4=reg128#10,#2 |
++# asm 2: vshl.i32 >_5y4=q10,<y4=q9,#2 |
++vshl.i32 q10,q9,#2 |
++ |
++# qhasm: x01 aligned= mem128[input_1];input_1+=16 |
++# asm 1: vld1.8 {>x01=reg128#12%bot->x01=reg128#12%top},[<input_1=int32#2,: 128]! |
++# asm 2: vld1.8 {>x01=d22->x01=d23},[<input_1=r1,: 128]! |
++vld1.8 {d22-d23},[r1,: 128]! |
++ |
++# qhasm: 4x _5y01 += y01 |
++# asm 1: vadd.i32 >_5y01=reg128#3,<_5y01=reg128#3,<y01=reg128#2 |
++# asm 2: vadd.i32 >_5y01=q2,<_5y01=q2,<y01=q1 |
++vadd.i32 q2,q2,q1 |
++ |
++# qhasm: x23 aligned= mem128[input_1];input_1+=16 |
++# asm 1: vld1.8 {>x23=reg128#13%bot->x23=reg128#13%top},[<input_1=int32#2,: 128]! |
++# asm 2: vld1.8 {>x23=d24->x23=d25},[<input_1=r1,: 128]! |
++vld1.8 {d24-d25},[r1,: 128]! |
++ |
++# qhasm: 4x _5y23 += y23 |
++# asm 1: vadd.i32 >_5y23=reg128#9,<_5y23=reg128#9,<y23=reg128#4 |
++# asm 2: vadd.i32 >_5y23=q8,<_5y23=q8,<y23=q3 |
++vadd.i32 q8,q8,q3 |
++ |
++# qhasm: 4x _5y4 += y4 |
++# asm 1: vadd.i32 >_5y4=reg128#11,<_5y4=reg128#11,<y4=reg128#10 |
++# asm 2: vadd.i32 >_5y4=q10,<_5y4=q10,<y4=q9 |
++vadd.i32 q10,q10,q9 |
++ |
++# qhasm: c01 aligned= mem128[input_3];input_3+=16 |
++# asm 1: vld1.8 {>c01=reg128#14%bot->c01=reg128#14%top},[<input_3=int32#4,: 128]! |
++# asm 2: vld1.8 {>c01=d26->c01=d27},[<input_3=r3,: 128]! |
++vld1.8 {d26-d27},[r3,: 128]! |
++ |
++# qhasm: 4x x01 += c01 |
++# asm 1: vadd.i32 >x01=reg128#12,<x01=reg128#12,<c01=reg128#14 |
++# asm 2: vadd.i32 >x01=q11,<x01=q11,<c01=q13 |
++vadd.i32 q11,q11,q13 |
++ |
++# qhasm: c23 aligned= mem128[input_3];input_3+=16 |
++# asm 1: vld1.8 {>c23=reg128#14%bot->c23=reg128#14%top},[<input_3=int32#4,: 128]! |
++# asm 2: vld1.8 {>c23=d26->c23=d27},[<input_3=r3,: 128]! |
++vld1.8 {d26-d27},[r3,: 128]! |
++ |
++# qhasm: 4x x23 += c23 |
++# asm 1: vadd.i32 >x23=reg128#13,<x23=reg128#13,<c23=reg128#14 |
++# asm 2: vadd.i32 >x23=q12,<x23=q12,<c23=q13 |
++vadd.i32 q12,q12,q13 |
++ |
++# qhasm: x4 aligned= mem64[input_1]x4[1] |
++# asm 1: vld1.8 {<x4=reg128#14%bot},[<input_1=int32#2,: 64] |
++# asm 2: vld1.8 {<x4=d26},[<input_1=r1,: 64] |
++vld1.8 {d26},[r1,: 64] |
++ |
++# qhasm: 2x mask unsigned>>=6 |
++# asm 1: vshr.u64 >mask=reg128#1,<mask=reg128#1,#6 |
++# asm 2: vshr.u64 >mask=q0,<mask=q0,#6 |
++vshr.u64 q0,q0,#6 |
++ |
++# qhasm: c4 aligned= mem64[input_3]c4[1] |
++# asm 1: vld1.8 {<c4=reg128#15%bot},[<input_3=int32#4,: 64] |
++# asm 2: vld1.8 {<c4=d28},[<input_3=r3,: 64] |
++vld1.8 {d28},[r3,: 64] |
++ |
++# qhasm: 4x x4 += c4 |
++# asm 1: vadd.i32 >x4=reg128#14,<x4=reg128#14,<c4=reg128#15 |
++# asm 2: vadd.i32 >x4=q13,<x4=q13,<c4=q14 |
++vadd.i32 q13,q13,q14 |
++ |
++# qhasm: r0[0,1] = x01[0] unsigned* y01[0]; r0[2,3] = x01[1] unsigned* y01[1] |
++# asm 1: vmull.u32 >r0=reg128#15,<x01=reg128#12%bot,<y01=reg128#2%bot |
++# asm 2: vmull.u32 >r0=q14,<x01=d22,<y01=d2 |
++vmull.u32 q14,d22,d2 |
++ |
++# qhasm: r0[0,1] += x01[2] unsigned* _5y4[0]; r0[2,3] += x01[3] unsigned* _5y4[1] |
++# asm 1: vmlal.u32 <r0=reg128#15,<x01=reg128#12%top,<_5y4=reg128#11%bot |
++# asm 2: vmlal.u32 <r0=q14,<x01=d23,<_5y4=d20 |
++vmlal.u32 q14,d23,d20 |
++ |
++# qhasm: r0[0,1] += x23[0] unsigned* _5y23[2]; r0[2,3] += x23[1] unsigned* _5y23[3] |
++# asm 1: vmlal.u32 <r0=reg128#15,<x23=reg128#13%bot,<_5y23=reg128#9%top |
++# asm 2: vmlal.u32 <r0=q14,<x23=d24,<_5y23=d17 |
++vmlal.u32 q14,d24,d17 |
++ |
++# qhasm: r0[0,1] += x23[2] unsigned* _5y23[0]; r0[2,3] += x23[3] unsigned* _5y23[1] |
++# asm 1: vmlal.u32 <r0=reg128#15,<x23=reg128#13%top,<_5y23=reg128#9%bot |
++# asm 2: vmlal.u32 <r0=q14,<x23=d25,<_5y23=d16 |
++vmlal.u32 q14,d25,d16 |
++ |
++# qhasm: r0[0,1] += x4[0] unsigned* _5y01[2]; r0[2,3] += x4[1] unsigned* _5y01[3] |
++# asm 1: vmlal.u32 <r0=reg128#15,<x4=reg128#14%bot,<_5y01=reg128#3%top |
++# asm 2: vmlal.u32 <r0=q14,<x4=d26,<_5y01=d5 |
++vmlal.u32 q14,d26,d5 |
++ |
++# qhasm: r1[0,1] = x01[0] unsigned* y01[2]; r1[2,3] = x01[1] unsigned* y01[3] |
++# asm 1: vmull.u32 >r1=reg128#3,<x01=reg128#12%bot,<y01=reg128#2%top |
++# asm 2: vmull.u32 >r1=q2,<x01=d22,<y01=d3 |
++vmull.u32 q2,d22,d3 |
++ |
++# qhasm: r1[0,1] += x01[2] unsigned* y01[0]; r1[2,3] += x01[3] unsigned* y01[1] |
++# asm 1: vmlal.u32 <r1=reg128#3,<x01=reg128#12%top,<y01=reg128#2%bot |
++# asm 2: vmlal.u32 <r1=q2,<x01=d23,<y01=d2 |
++vmlal.u32 q2,d23,d2 |
++ |
++# qhasm: r1[0,1] += x23[0] unsigned* _5y4[0]; r1[2,3] += x23[1] unsigned* _5y4[1] |
++# asm 1: vmlal.u32 <r1=reg128#3,<x23=reg128#13%bot,<_5y4=reg128#11%bot |
++# asm 2: vmlal.u32 <r1=q2,<x23=d24,<_5y4=d20 |
++vmlal.u32 q2,d24,d20 |
++ |
++# qhasm: r1[0,1] += x23[2] unsigned* _5y23[2]; r1[2,3] += x23[3] unsigned* _5y23[3] |
++# asm 1: vmlal.u32 <r1=reg128#3,<x23=reg128#13%top,<_5y23=reg128#9%top |
++# asm 2: vmlal.u32 <r1=q2,<x23=d25,<_5y23=d17 |
++vmlal.u32 q2,d25,d17 |
++ |
++# qhasm: r1[0,1] += x4[0] unsigned* _5y23[0]; r1[2,3] += x4[1] unsigned* _5y23[1] |
++# asm 1: vmlal.u32 <r1=reg128#3,<x4=reg128#14%bot,<_5y23=reg128#9%bot |
++# asm 2: vmlal.u32 <r1=q2,<x4=d26,<_5y23=d16 |
++vmlal.u32 q2,d26,d16 |
++ |
++# qhasm: r2[0,1] = x01[0] unsigned* y23[0]; r2[2,3] = x01[1] unsigned* y23[1] |
++# asm 1: vmull.u32 >r2=reg128#16,<x01=reg128#12%bot,<y23=reg128#4%bot |
++# asm 2: vmull.u32 >r2=q15,<x01=d22,<y23=d6 |
++vmull.u32 q15,d22,d6 |
++ |
++# qhasm: r2[0,1] += x01[2] unsigned* y01[2]; r2[2,3] += x01[3] unsigned* y01[3] |
++# asm 1: vmlal.u32 <r2=reg128#16,<x01=reg128#12%top,<y01=reg128#2%top |
++# asm 2: vmlal.u32 <r2=q15,<x01=d23,<y01=d3 |
++vmlal.u32 q15,d23,d3 |
++ |
++# qhasm: r2[0,1] += x23[0] unsigned* y01[0]; r2[2,3] += x23[1] unsigned* y01[1] |
++# asm 1: vmlal.u32 <r2=reg128#16,<x23=reg128#13%bot,<y01=reg128#2%bot |
++# asm 2: vmlal.u32 <r2=q15,<x23=d24,<y01=d2 |
++vmlal.u32 q15,d24,d2 |
++ |
++# qhasm: r2[0,1] += x23[2] unsigned* _5y4[0]; r2[2,3] += x23[3] unsigned* _5y4[1] |
++# asm 1: vmlal.u32 <r2=reg128#16,<x23=reg128#13%top,<_5y4=reg128#11%bot |
++# asm 2: vmlal.u32 <r2=q15,<x23=d25,<_5y4=d20 |
++vmlal.u32 q15,d25,d20 |
++ |
++# qhasm: r2[0,1] += x4[0] unsigned* _5y23[2]; r2[2,3] += x4[1] unsigned* _5y23[3] |
++# asm 1: vmlal.u32 <r2=reg128#16,<x4=reg128#14%bot,<_5y23=reg128#9%top |
++# asm 2: vmlal.u32 <r2=q15,<x4=d26,<_5y23=d17 |
++vmlal.u32 q15,d26,d17 |
++ |
++# qhasm: r3[0,1] = x01[0] unsigned* y23[2]; r3[2,3] = x01[1] unsigned* y23[3] |
++# asm 1: vmull.u32 >r3=reg128#9,<x01=reg128#12%bot,<y23=reg128#4%top |
++# asm 2: vmull.u32 >r3=q8,<x01=d22,<y23=d7 |
++vmull.u32 q8,d22,d7 |
++ |
++# qhasm: r3[0,1] += x01[2] unsigned* y23[0]; r3[2,3] += x01[3] unsigned* y23[1] |
++# asm 1: vmlal.u32 <r3=reg128#9,<x01=reg128#12%top,<y23=reg128#4%bot |
++# asm 2: vmlal.u32 <r3=q8,<x01=d23,<y23=d6 |
++vmlal.u32 q8,d23,d6 |
++ |
++# qhasm: r3[0,1] += x23[0] unsigned* y01[2]; r3[2,3] += x23[1] unsigned* y01[3] |
++# asm 1: vmlal.u32 <r3=reg128#9,<x23=reg128#13%bot,<y01=reg128#2%top |
++# asm 2: vmlal.u32 <r3=q8,<x23=d24,<y01=d3 |
++vmlal.u32 q8,d24,d3 |
++ |
++# qhasm: r3[0,1] += x23[2] unsigned* y01[0]; r3[2,3] += x23[3] unsigned* y01[1] |
++# asm 1: vmlal.u32 <r3=reg128#9,<x23=reg128#13%top,<y01=reg128#2%bot |
++# asm 2: vmlal.u32 <r3=q8,<x23=d25,<y01=d2 |
++vmlal.u32 q8,d25,d2 |
++ |
++# qhasm: r3[0,1] += x4[0] unsigned* _5y4[0]; r3[2,3] += x4[1] unsigned* _5y4[1] |
++# asm 1: vmlal.u32 <r3=reg128#9,<x4=reg128#14%bot,<_5y4=reg128#11%bot |
++# asm 2: vmlal.u32 <r3=q8,<x4=d26,<_5y4=d20 |
++vmlal.u32 q8,d26,d20 |
++ |
++# qhasm: r4[0,1] = x01[0] unsigned* y4[0]; r4[2,3] = x01[1] unsigned* y4[1] |
++# asm 1: vmull.u32 >r4=reg128#10,<x01=reg128#12%bot,<y4=reg128#10%bot |
++# asm 2: vmull.u32 >r4=q9,<x01=d22,<y4=d18 |
++vmull.u32 q9,d22,d18 |
++ |
++# qhasm: r4[0,1] += x01[2] unsigned* y23[2]; r4[2,3] += x01[3] unsigned* y23[3] |
++# asm 1: vmlal.u32 <r4=reg128#10,<x01=reg128#12%top,<y23=reg128#4%top |
++# asm 2: vmlal.u32 <r4=q9,<x01=d23,<y23=d7 |
++vmlal.u32 q9,d23,d7 |
++ |
++# qhasm: r4[0,1] += x23[0] unsigned* y23[0]; r4[2,3] += x23[1] unsigned* y23[1] |
++# asm 1: vmlal.u32 <r4=reg128#10,<x23=reg128#13%bot,<y23=reg128#4%bot |
++# asm 2: vmlal.u32 <r4=q9,<x23=d24,<y23=d6 |
++vmlal.u32 q9,d24,d6 |
++ |
++# qhasm: r4[0,1] += x23[2] unsigned* y01[2]; r4[2,3] += x23[3] unsigned* y01[3] |
++# asm 1: vmlal.u32 <r4=reg128#10,<x23=reg128#13%top,<y01=reg128#2%top |
++# asm 2: vmlal.u32 <r4=q9,<x23=d25,<y01=d3 |
++vmlal.u32 q9,d25,d3 |
++ |
++# qhasm: r4[0,1] += x4[0] unsigned* y01[0]; r4[2,3] += x4[1] unsigned* y01[1] |
++# asm 1: vmlal.u32 <r4=reg128#10,<x4=reg128#14%bot,<y01=reg128#2%bot |
++# asm 2: vmlal.u32 <r4=q9,<x4=d26,<y01=d2 |
++vmlal.u32 q9,d26,d2 |
++ |
++# qhasm: 2x t1 = r0 unsigned>> 26 |
++# asm 1: vshr.u64 >t1=reg128#2,<r0=reg128#15,#26 |
++# asm 2: vshr.u64 >t1=q1,<r0=q14,#26 |
++vshr.u64 q1,q14,#26 |
++ |
++# qhasm: r0 &= mask |
++# asm 1: vand >r0=reg128#4,<r0=reg128#15,<mask=reg128#1 |
++# asm 2: vand >r0=q3,<r0=q14,<mask=q0 |
++vand q3,q14,q0 |
++ |
++# qhasm: 2x r1 += t1 |
++# asm 1: vadd.i64 >r1=reg128#2,<r1=reg128#3,<t1=reg128#2 |
++# asm 2: vadd.i64 >r1=q1,<r1=q2,<t1=q1 |
++vadd.i64 q1,q2,q1 |
++ |
++# qhasm: 2x t4 = r3 unsigned>> 26 |
++# asm 1: vshr.u64 >t4=reg128#3,<r3=reg128#9,#26 |
++# asm 2: vshr.u64 >t4=q2,<r3=q8,#26 |
++vshr.u64 q2,q8,#26 |
++ |
++# qhasm: r3 &= mask |
++# asm 1: vand >r3=reg128#9,<r3=reg128#9,<mask=reg128#1 |
++# asm 2: vand >r3=q8,<r3=q8,<mask=q0 |
++vand q8,q8,q0 |
++ |
++# qhasm: 2x r4 += t4 |
++# asm 1: vadd.i64 >r4=reg128#3,<r4=reg128#10,<t4=reg128#3 |
++# asm 2: vadd.i64 >r4=q2,<r4=q9,<t4=q2 |
++vadd.i64 q2,q9,q2 |
++ |
++# qhasm: 2x t2 = r1 unsigned>> 26 |
++# asm 1: vshr.u64 >t2=reg128#10,<r1=reg128#2,#26 |
++# asm 2: vshr.u64 >t2=q9,<r1=q1,#26 |
++vshr.u64 q9,q1,#26 |
++ |
++# qhasm: r1 &= mask |
++# asm 1: vand >r1=reg128#2,<r1=reg128#2,<mask=reg128#1 |
++# asm 2: vand >r1=q1,<r1=q1,<mask=q0 |
++vand q1,q1,q0 |
++ |
++# qhasm: 2x t0 = r4 unsigned>> 26 |
++# asm 1: vshr.u64 >t0=reg128#11,<r4=reg128#3,#26 |
++# asm 2: vshr.u64 >t0=q10,<r4=q2,#26 |
++vshr.u64 q10,q2,#26 |
++ |
++# qhasm: 2x r2 += t2 |
++# asm 1: vadd.i64 >r2=reg128#10,<r2=reg128#16,<t2=reg128#10 |
++# asm 2: vadd.i64 >r2=q9,<r2=q15,<t2=q9 |
++vadd.i64 q9,q15,q9 |
++ |
++# qhasm: r4 &= mask |
++# asm 1: vand >r4=reg128#3,<r4=reg128#3,<mask=reg128#1 |
++# asm 2: vand >r4=q2,<r4=q2,<mask=q0 |
++vand q2,q2,q0 |
++ |
++# qhasm: 2x r0 += t0 |
++# asm 1: vadd.i64 >r0=reg128#4,<r0=reg128#4,<t0=reg128#11 |
++# asm 2: vadd.i64 >r0=q3,<r0=q3,<t0=q10 |
++vadd.i64 q3,q3,q10 |
++ |
++# qhasm: 2x t0 <<= 2 |
++# asm 1: vshl.i64 >t0=reg128#11,<t0=reg128#11,#2 |
++# asm 2: vshl.i64 >t0=q10,<t0=q10,#2 |
++vshl.i64 q10,q10,#2 |
++ |
++# qhasm: 2x t3 = r2 unsigned>> 26 |
++# asm 1: vshr.u64 >t3=reg128#12,<r2=reg128#10,#26 |
++# asm 2: vshr.u64 >t3=q11,<r2=q9,#26 |
++vshr.u64 q11,q9,#26 |
++ |
++# qhasm: 2x r0 += t0 |
++# asm 1: vadd.i64 >r0=reg128#4,<r0=reg128#4,<t0=reg128#11 |
++# asm 2: vadd.i64 >r0=q3,<r0=q3,<t0=q10 |
++vadd.i64 q3,q3,q10 |
++ |
++# qhasm: x23 = r2 & mask |
++# asm 1: vand >x23=reg128#10,<r2=reg128#10,<mask=reg128#1 |
++# asm 2: vand >x23=q9,<r2=q9,<mask=q0 |
++vand q9,q9,q0 |
++ |
++# qhasm: 2x r3 += t3 |
++# asm 1: vadd.i64 >r3=reg128#9,<r3=reg128#9,<t3=reg128#12 |
++# asm 2: vadd.i64 >r3=q8,<r3=q8,<t3=q11 |
++vadd.i64 q8,q8,q11 |
++ |
++# qhasm: 2x t1 = r0 unsigned>> 26 |
++# asm 1: vshr.u64 >t1=reg128#11,<r0=reg128#4,#26 |
++# asm 2: vshr.u64 >t1=q10,<r0=q3,#26 |
++vshr.u64 q10,q3,#26 |
++ |
++# qhasm: x23 = x23[0,2,1,3] |
++# asm 1: vtrn.32 <x23=reg128#10%bot,<x23=reg128#10%top |
++# asm 2: vtrn.32 <x23=d18,<x23=d19 |
++vtrn.32 d18,d19 |
++ |
++# qhasm: x01 = r0 & mask |
++# asm 1: vand >x01=reg128#4,<r0=reg128#4,<mask=reg128#1 |
++# asm 2: vand >x01=q3,<r0=q3,<mask=q0 |
++vand q3,q3,q0 |
++ |
++# qhasm: 2x r1 += t1 |
++# asm 1: vadd.i64 >r1=reg128#2,<r1=reg128#2,<t1=reg128#11 |
++# asm 2: vadd.i64 >r1=q1,<r1=q1,<t1=q10 |
++vadd.i64 q1,q1,q10 |
++ |
++# qhasm: 2x t4 = r3 unsigned>> 26 |
++# asm 1: vshr.u64 >t4=reg128#11,<r3=reg128#9,#26 |
++# asm 2: vshr.u64 >t4=q10,<r3=q8,#26 |
++vshr.u64 q10,q8,#26 |
++ |
++# qhasm: x01 = x01[0,2,1,3] |
++# asm 1: vtrn.32 <x01=reg128#4%bot,<x01=reg128#4%top |
++# asm 2: vtrn.32 <x01=d6,<x01=d7 |
++vtrn.32 d6,d7 |
++ |
++# qhasm: r3 &= mask |
++# asm 1: vand >r3=reg128#1,<r3=reg128#9,<mask=reg128#1 |
++# asm 2: vand >r3=q0,<r3=q8,<mask=q0 |
++vand q0,q8,q0 |
++ |
++# qhasm: r1 = r1[0,2,1,3] |
++# asm 1: vtrn.32 <r1=reg128#2%bot,<r1=reg128#2%top |
++# asm 2: vtrn.32 <r1=d2,<r1=d3 |
++vtrn.32 d2,d3 |
++ |
++# qhasm: 2x x4 = r4 + t4 |
++# asm 1: vadd.i64 >x4=reg128#3,<r4=reg128#3,<t4=reg128#11 |
++# asm 2: vadd.i64 >x4=q2,<r4=q2,<t4=q10 |
++vadd.i64 q2,q2,q10 |
++ |
++# qhasm: r3 = r3[0,2,1,3] |
++# asm 1: vtrn.32 <r3=reg128#1%bot,<r3=reg128#1%top |
++# asm 2: vtrn.32 <r3=d0,<r3=d1 |
++vtrn.32 d0,d1 |
++ |
++# qhasm: x01 = x01[0,1] r1[0,1] |
++# asm 1: vext.32 <x01=reg128#4%top,<r1=reg128#2%bot,<r1=reg128#2%bot,#0 |
++# asm 2: vext.32 <x01=d7,<r1=d2,<r1=d2,#0 |
++vext.32 d7,d2,d2,#0 |
++ |
++# qhasm: x23 = x23[0,1] r3[0,1] |
++# asm 1: vext.32 <x23=reg128#10%top,<r3=reg128#1%bot,<r3=reg128#1%bot,#0 |
++# asm 2: vext.32 <x23=d19,<r3=d0,<r3=d0,#0 |
++vext.32 d19,d0,d0,#0 |
++ |
++# qhasm: x4 = x4[0,2,1,3] |
++# asm 1: vtrn.32 <x4=reg128#3%bot,<x4=reg128#3%top |
++# asm 2: vtrn.32 <x4=d4,<x4=d5 |
++vtrn.32 d4,d5 |
++ |
++# qhasm: mem128[input_0] aligned= x01;input_0+=16 |
++# asm 1: vst1.8 {<x01=reg128#4%bot-<x01=reg128#4%top},[<input_0=int32#1,: 128]! |
++# asm 2: vst1.8 {<x01=d6-<x01=d7},[<input_0=r0,: 128]! |
++vst1.8 {d6-d7},[r0,: 128]! |
++ |
++# qhasm: mem128[input_0] aligned= x23;input_0+=16 |
++# asm 1: vst1.8 {<x23=reg128#10%bot-<x23=reg128#10%top},[<input_0=int32#1,: 128]! |
++# asm 2: vst1.8 {<x23=d18-<x23=d19},[<input_0=r0,: 128]! |
++vst1.8 {d18-d19},[r0,: 128]! |
++ |
++# qhasm: mem64[input_0] aligned= x4[0] |
++# asm 1: vst1.8 <x4=reg128#3%bot,[<input_0=int32#1,: 64] |
++# asm 2: vst1.8 <x4=d4,[<input_0=r0,: 64] |
++vst1.8 d4,[r0,: 64] |
++ |
++# qhasm: return |
++add sp,sp,#0 |
++bx lr |
+diff --git a/crypto/poly1305/poly1305_vec.c b/crypto/poly1305/poly1305_vec.c |
+new file mode 100644 |
+index 0000000..c546200 |
+--- /dev/null |
++++ b/crypto/poly1305/poly1305_vec.c |
+@@ -0,0 +1,733 @@ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++ |
++/* This implementation of poly1305 is by Andrew Moon |
++ * (https://github.com/floodyberry/poly1305-donna) and released as public |
++ * domain. It implements SIMD vectorization based on the algorithm described in |
++ * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte |
++ * block size |
++*/ |
++ |
++#include <emmintrin.h> |
++#include <stdint.h> |
++#include <openssl/opensslconf.h> |
++ |
++#if !defined(OPENSSL_NO_POLY1305) |
++ |
++#include <openssl/poly1305.h> |
++ |
++#define ALIGN(x) __attribute__((aligned(x))) |
++#define INLINE inline |
++#define U8TO64_LE(m) (*(uint64_t*)(m)) |
++#define U8TO32_LE(m) (*(uint32_t*)(m)) |
++#define U64TO8_LE(m,v) (*(uint64_t*)(m)) = v |
++ |
++typedef __m128i xmmi; |
++typedef unsigned __int128 uint128_t; |
++ |
++static const uint32_t ALIGN(16) poly1305_x64_sse2_message_mask[4] = |
++ {(1 << 26) - 1, 0, (1 << 26) - 1, 0}; |
++static const uint32_t ALIGN(16) poly1305_x64_sse2_5[4] = {5, 0, 5, 0}; |
++static const uint32_t ALIGN(16) poly1305_x64_sse2_1shl128[4] = |
++ {(1 << 24), 0, (1 << 24), 0}; |
++ |
++static uint128_t INLINE |
++add128(uint128_t a, uint128_t b) |
++ { |
++ return a + b; |
++ } |
++ |
++static uint128_t INLINE |
++add128_64(uint128_t a, uint64_t b) |
++ { |
++ return a + b; |
++ } |
++ |
++static uint128_t INLINE |
++mul64x64_128(uint64_t a, uint64_t b) |
++ { |
++ return (uint128_t)a * b; |
++ } |
++ |
++static uint64_t INLINE |
++lo128(uint128_t a) |
++ { |
++ return (uint64_t)a; |
++ } |
++ |
++static uint64_t INLINE |
++shr128(uint128_t v, const int shift) |
++ { |
++ return (uint64_t)(v >> shift); |
++ } |
++ |
++static uint64_t INLINE |
++shr128_pair(uint64_t hi, uint64_t lo, const int shift) |
++ { |
++ return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift); |
++ } |
++ |
++typedef struct poly1305_power_t |
++ { |
++ union |
++ { |
++ xmmi v; |
++ uint64_t u[2]; |
++ uint32_t d[4]; |
++ } R20,R21,R22,R23,R24,S21,S22,S23,S24; |
++ } poly1305_power; |
++ |
++typedef struct poly1305_state_internal_t |
++ { |
++ poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 |
++ bytes of free storage */ |
++ union |
++ { |
++ xmmi H[5]; /* 80 bytes */ |
++ uint64_t HH[10]; |
++ }; |
++ /* uint64_t r0,r1,r2; [24 bytes] */ |
++ /* uint64_t pad0,pad1; [16 bytes] */ |
++ uint64_t started; /* 8 bytes */ |
++ uint64_t leftover; /* 8 bytes */ |
++ uint8_t buffer[64]; /* 64 bytes */ |
++ } poly1305_state_internal; /* 448 bytes total + 63 bytes for |
++ alignment = 511 bytes raw */ |
++ |
++static poly1305_state_internal INLINE |
++*poly1305_aligned_state(poly1305_state *state) |
++ { |
++ return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63); |
++ } |
++ |
++/* copy 0-63 bytes */ |
++static void INLINE |
++poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) |
++ { |
++ size_t offset = src - dst; |
++ if (bytes & 32) |
++ { |
++ _mm_storeu_si128((xmmi *)(dst + 0), _mm_loadu_si128((xmmi *)(dst + offset + 0))); |
++ _mm_storeu_si128((xmmi *)(dst + 16), _mm_loadu_si128((xmmi *)(dst + offset + 16))); |
++ dst += 32; |
++ } |
++ if (bytes & 16) |
++ { |
++ _mm_storeu_si128((xmmi *)dst, |
++ _mm_loadu_si128((xmmi *)(dst + offset))); |
++ dst += 16; |
++ } |
++ if (bytes & 8) |
++ { |
++ *(uint64_t *)dst = *(uint64_t *)(dst + offset); |
++ dst += 8; |
++ } |
++ if (bytes & 4) |
++ { |
++ *(uint32_t *)dst = *(uint32_t *)(dst + offset); |
++ dst += 4; |
++ } |
++ if (bytes & 2) |
++ { |
++ *(uint16_t *)dst = *(uint16_t *)(dst + offset); |
++ dst += 2; |
++ } |
++ if (bytes & 1) |
++ { |
++ *( uint8_t *)dst = *( uint8_t *)(dst + offset); |
++ } |
++ } |
++ |
++/* zero 0-15 bytes */ |
++static void INLINE |
++poly1305_block_zero(uint8_t *dst, size_t bytes) |
++ { |
++ if (bytes & 8) { *(uint64_t *)dst = 0; dst += 8; } |
++ if (bytes & 4) { *(uint32_t *)dst = 0; dst += 4; } |
++ if (bytes & 2) { *(uint16_t *)dst = 0; dst += 2; } |
++ if (bytes & 1) { *( uint8_t *)dst = 0; } |
++ } |
++ |
++static size_t INLINE |
++poly1305_min(size_t a, size_t b) |
++ { |
++ return (a < b) ? a : b; |
++ } |
++ |
++void |
++CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32]) |
++ { |
++ poly1305_state_internal *st = poly1305_aligned_state(state); |
++ poly1305_power *p; |
++ uint64_t r0,r1,r2; |
++ uint64_t t0,t1; |
++ |
++ /* clamp key */ |
++ t0 = U8TO64_LE(key + 0); |
++ t1 = U8TO64_LE(key + 8); |
++ r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20; |
++ r1 = t0 & 0xfffffc0ffff; t1 >>= 24; |
++ r2 = t1 & 0x00ffffffc0f; |
++ |
++ /* store r in un-used space of st->P[1] */ |
++ p = &st->P[1]; |
++ p->R20.d[1] = (uint32_t)(r0 ); |
++ p->R20.d[3] = (uint32_t)(r0 >> 32); |
++ p->R21.d[1] = (uint32_t)(r1 ); |
++ p->R21.d[3] = (uint32_t)(r1 >> 32); |
++ p->R22.d[1] = (uint32_t)(r2 ); |
++ p->R22.d[3] = (uint32_t)(r2 >> 32); |
++ |
++ /* store pad */ |
++ p->R23.d[1] = U8TO32_LE(key + 16); |
++ p->R23.d[3] = U8TO32_LE(key + 20); |
++ p->R24.d[1] = U8TO32_LE(key + 24); |
++ p->R24.d[3] = U8TO32_LE(key + 28); |
++ |
++ /* H = 0 */ |
++ st->H[0] = _mm_setzero_si128(); |
++ st->H[1] = _mm_setzero_si128(); |
++ st->H[2] = _mm_setzero_si128(); |
++ st->H[3] = _mm_setzero_si128(); |
++ st->H[4] = _mm_setzero_si128(); |
++ |
++ st->started = 0; |
++ st->leftover = 0; |
++ } |
++ |
++static void |
++poly1305_first_block(poly1305_state_internal *st, const uint8_t *m) |
++ { |
++ const xmmi MMASK = |
++ _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask); |
++ const xmmi FIVE = _mm_load_si128((xmmi*)poly1305_x64_sse2_5); |
++ const xmmi HIBIT = _mm_load_si128((xmmi*)poly1305_x64_sse2_1shl128); |
++ xmmi T5,T6; |
++ poly1305_power *p; |
++ uint128_t d[3]; |
++ uint64_t r0,r1,r2; |
++ uint64_t r20,r21,r22,s22; |
++ uint64_t pad0,pad1; |
++ uint64_t c; |
++ uint64_t i; |
++ |
++ /* pull out stored info */ |
++ p = &st->P[1]; |
++ |
++ r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; |
++ r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; |
++ r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; |
++ pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; |
++ pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; |
++ |
++ /* compute powers r^2,r^4 */ |
++ r20 = r0; |
++ r21 = r1; |
++ r22 = r2; |
++ for (i = 0; i < 2; i++) |
++ { |
++ s22 = r22 * (5 << 2); |
++ |
++ d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22)); |
++ d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21)); |
++ d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20)); |
++ |
++ r20 = lo128(d[0]) & 0xfffffffffff; c = shr128(d[0], 44); |
++ d[1] = add128_64(d[1], c); r21 = lo128(d[1]) & 0xfffffffffff; c = shr128(d[1], 44); |
++ d[2] = add128_64(d[2], c); r22 = lo128(d[2]) & 0x3ffffffffff; c = shr128(d[2], 42); |
++ r20 += c * 5; c = (r20 >> 44); r20 = r20 & 0xfffffffffff; |
++ r21 += c; |
++ |
++ p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)( r20 ) & 0x3ffffff), _MM_SHUFFLE(1,0,1,0)); |
++ p->R21.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), _MM_SHUFFLE(1,0,1,0)); |
++ p->R22.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8) ) & 0x3ffffff), _MM_SHUFFLE(1,0,1,0)); |
++ p->R23.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), _MM_SHUFFLE(1,0,1,0)); |
++ p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16) ) ), _MM_SHUFFLE(1,0,1,0)); |
++ p->S21.v = _mm_mul_epu32(p->R21.v, FIVE); |
++ p->S22.v = _mm_mul_epu32(p->R22.v, FIVE); |
++ p->S23.v = _mm_mul_epu32(p->R23.v, FIVE); |
++ p->S24.v = _mm_mul_epu32(p->R24.v, FIVE); |
++ p--; |
++ } |
++ |
++ /* put saved info back */ |
++ p = &st->P[1]; |
++ p->R20.d[1] = (uint32_t)(r0 ); |
++ p->R20.d[3] = (uint32_t)(r0 >> 32); |
++ p->R21.d[1] = (uint32_t)(r1 ); |
++ p->R21.d[3] = (uint32_t)(r1 >> 32); |
++ p->R22.d[1] = (uint32_t)(r2 ); |
++ p->R22.d[3] = (uint32_t)(r2 >> 32); |
++ p->R23.d[1] = (uint32_t)(pad0 ); |
++ p->R23.d[3] = (uint32_t)(pad0 >> 32); |
++ p->R24.d[1] = (uint32_t)(pad1 ); |
++ p->R24.d[3] = (uint32_t)(pad1 >> 32); |
++ |
++ /* H = [Mx,My] */ |
++ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16))); |
++ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24))); |
++ st->H[0] = _mm_and_si128(MMASK, T5); |
++ st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); |
++ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); |
++ st->H[2] = _mm_and_si128(MMASK, T5); |
++ st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); |
++ st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); |
++ } |
++ |
++static void |
++poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, size_t bytes) |
++ { |
++ const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask); |
++ const xmmi FIVE = _mm_load_si128((xmmi*)poly1305_x64_sse2_5); |
++ const xmmi HIBIT = _mm_load_si128((xmmi*)poly1305_x64_sse2_1shl128); |
++ |
++ poly1305_power *p; |
++ xmmi H0,H1,H2,H3,H4; |
++ xmmi T0,T1,T2,T3,T4,T5,T6; |
++ xmmi M0,M1,M2,M3,M4; |
++ xmmi C1,C2; |
++ |
++ H0 = st->H[0]; |
++ H1 = st->H[1]; |
++ H2 = st->H[2]; |
++ H3 = st->H[3]; |
++ H4 = st->H[4]; |
++ |
++ while (bytes >= 64) |
++ { |
++ /* H *= [r^4,r^4] */ |
++ p = &st->P[0]; |
++ T0 = _mm_mul_epu32(H0, p->R20.v); |
++ T1 = _mm_mul_epu32(H0, p->R21.v); |
++ T2 = _mm_mul_epu32(H0, p->R22.v); |
++ T3 = _mm_mul_epu32(H0, p->R23.v); |
++ T4 = _mm_mul_epu32(H0, p->R24.v); |
++ T5 = _mm_mul_epu32(H1, p->S24.v); T6 = _mm_mul_epu32(H1, p->R20.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H2, p->S23.v); T6 = _mm_mul_epu32(H2, p->S24.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H3, p->S22.v); T6 = _mm_mul_epu32(H3, p->S23.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H4, p->S21.v); T6 = _mm_mul_epu32(H4, p->S22.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H1, p->R21.v); T6 = _mm_mul_epu32(H1, p->R22.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H2, p->R20.v); T6 = _mm_mul_epu32(H2, p->R21.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H3, p->S24.v); T6 = _mm_mul_epu32(H3, p->R20.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H4, p->S23.v); T6 = _mm_mul_epu32(H4, p->S24.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H1, p->R23.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H2, p->R22.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H3, p->R21.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H4, p->R20.v); T4 = _mm_add_epi64(T4, T5); |
++ |
++ /* H += [Mx,My]*[r^2,r^2] */ |
++ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16))); |
++ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24))); |
++ M0 = _mm_and_si128(MMASK, T5); |
++ M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); |
++ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); |
++ M2 = _mm_and_si128(MMASK, T5); |
++ M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); |
++ M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); |
++ |
++ p = &st->P[1]; |
++ T5 = _mm_mul_epu32(M0, p->R20.v); T6 = _mm_mul_epu32(M0, p->R21.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(M1, p->S24.v); T6 = _mm_mul_epu32(M1, p->R20.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(M2, p->S23.v); T6 = _mm_mul_epu32(M2, p->S24.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(M3, p->S22.v); T6 = _mm_mul_epu32(M3, p->S23.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(M4, p->S21.v); T6 = _mm_mul_epu32(M4, p->S22.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(M0, p->R22.v); T6 = _mm_mul_epu32(M0, p->R23.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(M1, p->R21.v); T6 = _mm_mul_epu32(M1, p->R22.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(M2, p->R20.v); T6 = _mm_mul_epu32(M2, p->R21.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(M3, p->S24.v); T6 = _mm_mul_epu32(M3, p->R20.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(M4, p->S23.v); T6 = _mm_mul_epu32(M4, p->S24.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(M0, p->R24.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(M1, p->R23.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(M2, p->R22.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(M3, p->R21.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(M4, p->R20.v); T4 = _mm_add_epi64(T4, T5); |
++ |
++ /* H += [Mx,My] */ |
++ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 32)), _mm_loadl_epi64((xmmi *)(m + 48))); |
++ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 40)), _mm_loadl_epi64((xmmi *)(m + 56))); |
++ M0 = _mm_and_si128(MMASK, T5); |
++ M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); |
++ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); |
++ M2 = _mm_and_si128(MMASK, T5); |
++ M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); |
++ M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); |
++ |
++ T0 = _mm_add_epi64(T0, M0); |
++ T1 = _mm_add_epi64(T1, M1); |
++ T2 = _mm_add_epi64(T2, M2); |
++ T3 = _mm_add_epi64(T3, M3); |
++ T4 = _mm_add_epi64(T4, M4); |
++ |
++ /* reduce */ |
++ C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2); |
++ C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); |
++ C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2); |
++ C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1); |
++ |
++ /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */ |
++ H0 = T0; |
++ H1 = T1; |
++ H2 = T2; |
++ H3 = T3; |
++ H4 = T4; |
++ |
++ m += 64; |
++ bytes -= 64; |
++ } |
++ |
++ st->H[0] = H0; |
++ st->H[1] = H1; |
++ st->H[2] = H2; |
++ st->H[3] = H3; |
++ st->H[4] = H4; |
++ } |
++ |
++static size_t |
++poly1305_combine(poly1305_state_internal *st, const uint8_t *m, size_t bytes) |
++ { |
++ const xmmi MMASK = |
++ _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask); |
++ const xmmi HIBIT = _mm_load_si128((xmmi*)poly1305_x64_sse2_1shl128); |
++ const xmmi FIVE = _mm_load_si128((xmmi*)poly1305_x64_sse2_5); |
++ |
++ poly1305_power *p; |
++ xmmi H0,H1,H2,H3,H4; |
++ xmmi M0,M1,M2,M3,M4; |
++ xmmi T0,T1,T2,T3,T4,T5,T6; |
++ xmmi C1,C2; |
++ |
++ uint64_t r0,r1,r2; |
++ uint64_t t0,t1,t2,t3,t4; |
++ uint64_t c; |
++ size_t consumed = 0; |
++ |
++ H0 = st->H[0]; |
++ H1 = st->H[1]; |
++ H2 = st->H[2]; |
++ H3 = st->H[3]; |
++ H4 = st->H[4]; |
++ |
++ /* p = [r^2,r^2] */ |
++ p = &st->P[1]; |
++ |
++ if (bytes >= 32) |
++ { |
++ /* H *= [r^2,r^2] */ |
++ T0 = _mm_mul_epu32(H0, p->R20.v); |
++ T1 = _mm_mul_epu32(H0, p->R21.v); |
++ T2 = _mm_mul_epu32(H0, p->R22.v); |
++ T3 = _mm_mul_epu32(H0, p->R23.v); |
++ T4 = _mm_mul_epu32(H0, p->R24.v); |
++ T5 = _mm_mul_epu32(H1, p->S24.v); T6 = _mm_mul_epu32(H1, p->R20.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H2, p->S23.v); T6 = _mm_mul_epu32(H2, p->S24.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H3, p->S22.v); T6 = _mm_mul_epu32(H3, p->S23.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H4, p->S21.v); T6 = _mm_mul_epu32(H4, p->S22.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H1, p->R21.v); T6 = _mm_mul_epu32(H1, p->R22.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H2, p->R20.v); T6 = _mm_mul_epu32(H2, p->R21.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H3, p->S24.v); T6 = _mm_mul_epu32(H3, p->R20.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H4, p->S23.v); T6 = _mm_mul_epu32(H4, p->S24.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H1, p->R23.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H2, p->R22.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H3, p->R21.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H4, p->R20.v); T4 = _mm_add_epi64(T4, T5); |
++ |
++ /* H += [Mx,My] */ |
++ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16))); |
++ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24))); |
++ M0 = _mm_and_si128(MMASK, T5); |
++ M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); |
++ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); |
++ M2 = _mm_and_si128(MMASK, T5); |
++ M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); |
++ M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); |
++ |
++ T0 = _mm_add_epi64(T0, M0); |
++ T1 = _mm_add_epi64(T1, M1); |
++ T2 = _mm_add_epi64(T2, M2); |
++ T3 = _mm_add_epi64(T3, M3); |
++ T4 = _mm_add_epi64(T4, M4); |
++ |
++ /* reduce */ |
++ C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2); |
++ C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); |
++ C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2); |
++ C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1); |
++ |
++ /* H = (H*[r^2,r^2] + [Mx,My]) */ |
++ H0 = T0; |
++ H1 = T1; |
++ H2 = T2; |
++ H3 = T3; |
++ H4 = T4; |
++ |
++ consumed = 32; |
++ } |
++ |
++ /* finalize, H *= [r^2,r] */ |
++ r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; |
++ r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; |
++ r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; |
++ |
++ p->R20.d[2] = (uint32_t)( r0 ) & 0x3ffffff; |
++ p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; |
++ p->R22.d[2] = (uint32_t)((r1 >> 8) ) & 0x3ffffff; |
++ p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; |
++ p->R24.d[2] = (uint32_t)((r2 >> 16) ) ; |
++ p->S21.d[2] = p->R21.d[2] * 5; |
++ p->S22.d[2] = p->R22.d[2] * 5; |
++ p->S23.d[2] = p->R23.d[2] * 5; |
++ p->S24.d[2] = p->R24.d[2] * 5; |
++ |
++ /* H *= [r^2,r] */ |
++ T0 = _mm_mul_epu32(H0, p->R20.v); |
++ T1 = _mm_mul_epu32(H0, p->R21.v); |
++ T2 = _mm_mul_epu32(H0, p->R22.v); |
++ T3 = _mm_mul_epu32(H0, p->R23.v); |
++ T4 = _mm_mul_epu32(H0, p->R24.v); |
++ T5 = _mm_mul_epu32(H1, p->S24.v); T6 = _mm_mul_epu32(H1, p->R20.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H2, p->S23.v); T6 = _mm_mul_epu32(H2, p->S24.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H3, p->S22.v); T6 = _mm_mul_epu32(H3, p->S23.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H4, p->S21.v); T6 = _mm_mul_epu32(H4, p->S22.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6); |
++ T5 = _mm_mul_epu32(H1, p->R21.v); T6 = _mm_mul_epu32(H1, p->R22.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H2, p->R20.v); T6 = _mm_mul_epu32(H2, p->R21.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H3, p->S24.v); T6 = _mm_mul_epu32(H3, p->R20.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H4, p->S23.v); T6 = _mm_mul_epu32(H4, p->S24.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6); |
++ T5 = _mm_mul_epu32(H1, p->R23.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H2, p->R22.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H3, p->R21.v); T4 = _mm_add_epi64(T4, T5); |
++ T5 = _mm_mul_epu32(H4, p->R20.v); T4 = _mm_add_epi64(T4, T5); |
++ |
++ C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2); |
++ C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); |
++ C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2); |
++ C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1); |
++ |
++ /* H = H[0]+H[1] */ |
++ H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); |
++ H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); |
++ H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); |
++ H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8)); |
++ H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8)); |
++ |
++ t0 = _mm_cvtsi128_si32(H0) ; c = (t0 >> 26); t0 &= 0x3ffffff; |
++ t1 = _mm_cvtsi128_si32(H1) + c; c = (t1 >> 26); t1 &= 0x3ffffff; |
++ t2 = _mm_cvtsi128_si32(H2) + c; c = (t2 >> 26); t2 &= 0x3ffffff; |
++ t3 = _mm_cvtsi128_si32(H3) + c; c = (t3 >> 26); t3 &= 0x3ffffff; |
++ t4 = _mm_cvtsi128_si32(H4) + c; c = (t4 >> 26); t4 &= 0x3ffffff; |
++ t0 = t0 + (c * 5); c = (t0 >> 26); t0 &= 0x3ffffff; |
++ t1 = t1 + c; |
++ |
++ st->HH[0] = ((t0 ) | (t1 << 26) ) & 0xfffffffffffull; |
++ st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & 0xfffffffffffull; |
++ st->HH[2] = ((t3 >> 10) | (t4 << 16) ) & 0x3ffffffffffull; |
++ |
++ return consumed; |
++ } |
++ |
++void |
++CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *m, |
++ size_t bytes) |
++ { |
++ poly1305_state_internal *st = poly1305_aligned_state(state); |
++ size_t want; |
++ |
++ /* need at least 32 initial bytes to start the accelerated branch */ |
++ if (!st->started) |
++ { |
++ if ((st->leftover == 0) && (bytes > 32)) |
++ { |
++ poly1305_first_block(st, m); |
++ m += 32; |
++ bytes -= 32; |
++ } |
++ else |
++ { |
++ want = poly1305_min(32 - st->leftover, bytes); |
++ poly1305_block_copy(st->buffer + st->leftover, m, want); |
++ bytes -= want; |
++ m += want; |
++ st->leftover += want; |
++ if ((st->leftover < 32) || (bytes == 0)) |
++ return; |
++ poly1305_first_block(st, st->buffer); |
++ st->leftover = 0; |
++ } |
++ st->started = 1; |
++ } |
++ |
++ /* handle leftover */ |
++ if (st->leftover) |
++ { |
++ want = poly1305_min(64 - st->leftover, bytes); |
++ poly1305_block_copy(st->buffer + st->leftover, m, want); |
++ bytes -= want; |
++ m += want; |
++ st->leftover += want; |
++ if (st->leftover < 64) |
++ return; |
++ poly1305_blocks(st, st->buffer, 64); |
++ st->leftover = 0; |
++ } |
++ |
++ /* process 64 byte blocks */ |
++ if (bytes >= 64) |
++ { |
++ want = (bytes & ~63); |
++ poly1305_blocks(st, m, want); |
++ m += want; |
++ bytes -= want; |
++ } |
++ |
++ if (bytes) |
++ { |
++ poly1305_block_copy(st->buffer + st->leftover, m, bytes); |
++ st->leftover += bytes; |
++ } |
++ } |
++ |
++void |
++CRYPTO_poly1305_finish(poly1305_state *state, unsigned char mac[16]) |
++ { |
++ poly1305_state_internal *st = poly1305_aligned_state(state); |
++ size_t leftover = st->leftover; |
++ uint8_t *m = st->buffer; |
++ uint128_t d[3]; |
++ uint64_t h0,h1,h2; |
++ uint64_t t0,t1; |
++ uint64_t g0,g1,g2,c,nc; |
++ uint64_t r0,r1,r2,s1,s2; |
++ poly1305_power *p; |
++ |
++ if (st->started) |
++ { |
++ size_t consumed = poly1305_combine(st, m, leftover); |
++ leftover -= consumed; |
++ m += consumed; |
++ } |
++ |
++ /* st->HH will either be 0 or have the combined result */ |
++ h0 = st->HH[0]; |
++ h1 = st->HH[1]; |
++ h2 = st->HH[2]; |
++ |
++ p = &st->P[1]; |
++ r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; |
++ r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; |
++ r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; |
++ s1 = r1 * (5 << 2); |
++ s2 = r2 * (5 << 2); |
++ |
++ if (leftover < 16) |
++ goto poly1305_donna_atmost15bytes; |
++ |
++poly1305_donna_atleast16bytes: |
++ t0 = U8TO64_LE(m + 0); |
++ t1 = U8TO64_LE(m + 8); |
++ h0 += t0 & 0xfffffffffff; |
++ t0 = shr128_pair(t1, t0, 44); |
++ h1 += t0 & 0xfffffffffff; |
++ h2 += (t1 >> 24) | ((uint64_t)1 << 40); |
++ |
++poly1305_donna_mul: |
++ d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)), mul64x64_128(h2, s1)); |
++ d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)), mul64x64_128(h2, s2)); |
++ d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)), mul64x64_128(h2, r0)); |
++ h0 = lo128(d[0]) & 0xfffffffffff; c = shr128(d[0], 44); |
++ d[1] = add128_64(d[1], c); h1 = lo128(d[1]) & 0xfffffffffff; c = shr128(d[1], 44); |
++ d[2] = add128_64(d[2], c); h2 = lo128(d[2]) & 0x3ffffffffff; c = shr128(d[2], 42); |
++ h0 += c * 5; |
++ |
++ m += 16; |
++ leftover -= 16; |
++ if (leftover >= 16) goto poly1305_donna_atleast16bytes; |
++ |
++ /* final bytes */ |
++poly1305_donna_atmost15bytes: |
++ if (!leftover) goto poly1305_donna_finish; |
++ |
++ m[leftover++] = 1; |
++ poly1305_block_zero(m + leftover, 16 - leftover); |
++ leftover = 16; |
++ |
++ t0 = U8TO64_LE(m+0); |
++ t1 = U8TO64_LE(m+8); |
++ h0 += t0 & 0xfffffffffff; t0 = shr128_pair(t1, t0, 44); |
++ h1 += t0 & 0xfffffffffff; |
++ h2 += (t1 >> 24); |
++ |
++ goto poly1305_donna_mul; |
++ |
++poly1305_donna_finish: |
++ c = (h0 >> 44); h0 &= 0xfffffffffff; |
++ h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; |
++ h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; |
++ h0 += c * 5; |
++ |
++ g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; |
++ g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; |
++ g2 = h2 + c - ((uint64_t)1 << 42); |
++ |
++ c = (g2 >> 63) - 1; |
++ nc = ~c; |
++ h0 = (h0 & nc) | (g0 & c); |
++ h1 = (h1 & nc) | (g1 & c); |
++ h2 = (h2 & nc) | (g2 & c); |
++ |
++ /* pad */ |
++ t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; |
++ t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; |
++ h0 += (t0 & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; t0 = shr128_pair(t1, t0, 44); |
++ h1 += (t0 & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff; t1 = (t1 >> 24); |
++ h2 += (t1 ) + c; |
++ |
++ U64TO8_LE(mac + 0, ((h0 ) | (h1 << 44))); |
++ U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24))); |
++ } |
++ |
++#endif /* !OPENSSL_NO_POLY1305 */ |
+diff --git a/crypto/poly1305/poly1305test.c b/crypto/poly1305/poly1305test.c |
+new file mode 100644 |
+index 0000000..8dd26af |
+--- /dev/null |
++++ b/crypto/poly1305/poly1305test.c |
+@@ -0,0 +1,166 @@ |
++/* ==================================================================== |
++ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved. |
++ * |
++ * Redistribution and use in source and binary forms, with or without |
++ * modification, are permitted provided that the following conditions |
++ * are met: |
++ * |
++ * 1. Redistributions of source code must retain the above copyright |
++ * notice, this list of conditions and the following disclaimer. |
++ * |
++ * 2. Redistributions in binary form must reproduce the above copyright |
++ * notice, this list of conditions and the following disclaimer in |
++ * the documentation and/or other materials provided with the |
++ * distribution. |
++ * |
++ * 3. All advertising materials mentioning features or use of this |
++ * software must display the following acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" |
++ * |
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
++ * endorse or promote products derived from this software without |
++ * prior written permission. For written permission, please contact |
++ * licensing@OpenSSL.org. |
++ * |
++ * 5. Products derived from this software may not be called "OpenSSL" |
++ * nor may "OpenSSL" appear in their names without prior written |
++ * permission of the OpenSSL Project. |
++ * |
++ * 6. Redistributions of any form whatsoever must retain the following |
++ * acknowledgment: |
++ * "This product includes software developed by the OpenSSL Project |
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" |
++ * |
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
++ * OF THE POSSIBILITY OF SUCH DAMAGE. |
++ * ==================================================================== |
++ */ |
++ |
++#include <stdio.h> |
++#include <stdlib.h> |
++#include <string.h> |
++ |
++#include <openssl/poly1305.h> |
++ |
++struct poly1305_test |
++ { |
++ const char *inputhex; |
++ const char *keyhex; |
++ const char *outhex; |
++ }; |
++ |
++static const struct poly1305_test poly1305_tests[] = { |
++ { |
++ "", |
++ "c8afaac331ee372cd6082de134943b174710130e9f6fea8d72293850a667d86c", |
++ "4710130e9f6fea8d72293850a667d86c", |
++ }, |
++ { |
++ "48656c6c6f20776f726c6421", |
++ "746869732069732033322d62797465206b657920666f7220506f6c7931333035", |
++ "a6f745008f81c916a20dcc74eef2b2f0", |
++ }, |
++ { |
++ "0000000000000000000000000000000000000000000000000000000000000000", |
++ "746869732069732033322d62797465206b657920666f7220506f6c7931333035", |
++ "49ec78090e481ec6c26b33b91ccc0307", |
++ }, |
++ { |
++ "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000", |
++ "746869732069732033322d62797465206b657920666f7220506f6c7931333035", |
++ "da84bcab02676c38cdb015604274c2aa", |
++ }, |
++}; |
++ |
++static unsigned char hex_digit(char h) |
++ { |
++ if (h >= '0' && h <= '9') |
++ return h - '0'; |
++ else if (h >= 'a' && h <= 'f') |
++ return h - 'a' + 10; |
++ else if (h >= 'A' && h <= 'F') |
++ return h - 'A' + 10; |
++ else |
++ abort(); |
++ } |
++ |
++static void hex_decode(unsigned char *out, const char* hex) |
++ { |
++ size_t j = 0; |
++ |
++ while (*hex != 0) |
++ { |
++ unsigned char v = hex_digit(*hex++); |
++ v <<= 4; |
++ v |= hex_digit(*hex++); |
++ out[j++] = v; |
++ } |
++ } |
++ |
++static void hexdump(unsigned char *a, size_t len) |
++ { |
++ size_t i; |
++ |
++ for (i = 0; i < len; i++) |
++ printf("%02x", a[i]); |
++ } |
++ |
++int main() |
++ { |
++ static const unsigned num_tests = |
++ sizeof(poly1305_tests) / sizeof(struct poly1305_test); |
++ unsigned i; |
++ unsigned char key[32], out[16], expected[16]; |
++ poly1305_state poly1305; |
++ |
++ for (i = 0; i < num_tests; i++) |
++ { |
++ const struct poly1305_test *test = &poly1305_tests[i]; |
++ unsigned char *in; |
++ size_t inlen = strlen(test->inputhex); |
++ |
++ if (strlen(test->keyhex) != sizeof(key)*2 || |
++ strlen(test->outhex) != sizeof(out)*2 || |
++ (inlen & 1) == 1) |
++ return 1; |
++ |
++ inlen /= 2; |
++ |
++ hex_decode(key, test->keyhex); |
++ hex_decode(expected, test->outhex); |
++ |
++ in = malloc(inlen); |
++ |
++ hex_decode(in, test->inputhex); |
++ CRYPTO_poly1305_init(&poly1305, key); |
++ CRYPTO_poly1305_update(&poly1305, in, inlen); |
++ CRYPTO_poly1305_finish(&poly1305, out); |
++ |
++ if (memcmp(out, expected, sizeof(expected)) != 0) |
++ { |
++			printf("Poly1305 test #%u failed.\n", i); |
++ printf("got: "); |
++ hexdump(out, sizeof(out)); |
++ printf("\nexpected: "); |
++ hexdump(expected, sizeof(expected)); |
++ printf("\n"); |
++ return 1; |
++ } |
++ |
++ free(in); |
++ } |
++ |
++ printf("PASS\n"); |
++ return 0; |
++ } |
+diff --git a/ssl/s3_lib.c b/ssl/s3_lib.c |
+index 75b6560..a042b8d 100644 |
+--- a/ssl/s3_lib.c |
++++ b/ssl/s3_lib.c |
+@@ -1841,7 +1841,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -1873,7 +1874,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -1905,7 +1907,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -1937,7 +1940,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -1969,7 +1973,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -2001,7 +2006,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -2714,7 +2720,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -2746,7 +2753,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -2778,7 +2786,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -2810,7 +2819,8 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ SSL_AEAD, |
+ SSL_TLSV1_2, |
+ SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, |
+- SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4), |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(4)| |
++ SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD, |
+ 128, |
+ 128, |
+ }, |
+@@ -2894,6 +2904,51 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ |
+ }, |
+ #endif |
+ |
++ { |
++ 1, |
++ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305, |
++ TLS1_CK_ECDHE_RSA_CHACHA20_POLY1305, |
++ SSL_kEECDH, |
++ SSL_aRSA, |
++ SSL_CHACHA20POLY1305, |
++ SSL_AEAD, |
++ SSL_TLSV1_2, |
++ SSL_NOT_EXP|SSL_HIGH, |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(0), |
++ 256, |
++ 0, |
++ }, |
++ |
++ { |
++ 1, |
++ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305, |
++ TLS1_CK_ECDHE_ECDSA_CHACHA20_POLY1305, |
++ SSL_kEECDH, |
++ SSL_aECDSA, |
++ SSL_CHACHA20POLY1305, |
++ SSL_AEAD, |
++ SSL_TLSV1_2, |
++ SSL_NOT_EXP|SSL_HIGH, |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(0), |
++ 256, |
++ 0, |
++ }, |
++ |
++ { |
++ 1, |
++ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305, |
++ TLS1_CK_DHE_RSA_CHACHA20_POLY1305, |
++ SSL_kEDH, |
++ SSL_aRSA, |
++ SSL_CHACHA20POLY1305, |
++ SSL_AEAD, |
++ SSL_TLSV1_2, |
++ SSL_NOT_EXP|SSL_HIGH, |
++ SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256|SSL_CIPHER_ALGORITHM2_AEAD|FIXED_NONCE_LEN(0), |
++ 256, |
++ 0, |
++ }, |
++ |
+ /* end of list */ |
+ }; |
+ |
+diff --git a/ssl/s3_pkt.c b/ssl/s3_pkt.c |
+index 5038f6c..04b474d 100644 |
+--- a/ssl/s3_pkt.c |
++++ b/ssl/s3_pkt.c |
+@@ -790,8 +790,11 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf, |
+ else |
+ eivlen = 0; |
+ } |
+- else if (s->aead_write_ctx != NULL) |
++ else if (s->aead_write_ctx != NULL && |
++ s->aead_write_ctx->variable_nonce_included_in_record) |
++ { |
+ eivlen = s->aead_write_ctx->variable_nonce_len; |
++ } |
+ else |
+ eivlen = 0; |
+ |
+diff --git a/ssl/ssl.h b/ssl/ssl.h |
+index 0644cbf..d782a98 100644 |
+--- a/ssl/ssl.h |
++++ b/ssl/ssl.h |
+@@ -291,6 +291,7 @@ extern "C" { |
+ #define SSL_TXT_CAMELLIA128 "CAMELLIA128" |
+ #define SSL_TXT_CAMELLIA256 "CAMELLIA256" |
+ #define SSL_TXT_CAMELLIA "CAMELLIA" |
++#define SSL_TXT_CHACHA20 "CHACHA20" |
+ |
+ #define SSL_TXT_MD5 "MD5" |
+ #define SSL_TXT_SHA1 "SHA1" |
+diff --git a/ssl/ssl_ciph.c b/ssl/ssl_ciph.c |
+index 7e780cd..b6370bd 100644 |
+--- a/ssl/ssl_ciph.c |
++++ b/ssl/ssl_ciph.c |
+@@ -298,6 +298,7 @@ static const SSL_CIPHER cipher_aliases[]={ |
+ {0,SSL_TXT_CAMELLIA128,0,0,0,SSL_CAMELLIA128,0,0,0,0,0,0}, |
+ {0,SSL_TXT_CAMELLIA256,0,0,0,SSL_CAMELLIA256,0,0,0,0,0,0}, |
+ {0,SSL_TXT_CAMELLIA ,0,0,0,SSL_CAMELLIA128|SSL_CAMELLIA256,0,0,0,0,0,0}, |
++ {0,SSL_TXT_CHACHA20 ,0,0,0,SSL_CHACHA20POLY1305,0,0,0,0,0,0}, |
+ |
+ /* MAC aliases */ |
+ {0,SSL_TXT_MD5,0, 0,0,0,SSL_MD5, 0,0,0,0,0}, |
+@@ -523,9 +524,15 @@ int ssl_cipher_get_evp_aead(const SSL_SESSION *s, const EVP_AEAD **aead) |
+ return 0; |
+ |
+ #ifndef OPENSSL_NO_AES |
+- /* There is only one AEAD for now. */ |
+- *aead = EVP_aead_aes_128_gcm(); |
+- return 1; |
++ switch (c->algorithm_enc) |
++ { |
++ case SSL_AES128GCM: |
++ *aead = EVP_aead_aes_128_gcm(); |
++ return 1; |
++ case SSL_CHACHA20POLY1305: |
++ *aead = EVP_aead_chacha20_poly1305(); |
++ return 1; |
++ } |
+ #endif |
+ |
+ return 0; |
+@@ -1715,6 +1722,9 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len) |
+ case SSL_SEED: |
+ enc="SEED(128)"; |
+ break; |
++ case SSL_CHACHA20POLY1305: |
++ enc="ChaCha20-Poly1305"; |
++ break; |
+ default: |
+ enc="unknown"; |
+ break; |
+diff --git a/ssl/ssl_locl.h b/ssl/ssl_locl.h |
+index 63bc28b..b83d8cd 100644 |
+--- a/ssl/ssl_locl.h |
++++ b/ssl/ssl_locl.h |
+@@ -328,6 +328,7 @@ |
+ #define SSL_SEED 0x00000800L |
+ #define SSL_AES128GCM 0x00001000L |
+ #define SSL_AES256GCM 0x00002000L |
++#define SSL_CHACHA20POLY1305 0x00004000L |
+ |
+ #define SSL_AES (SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM) |
+ #define SSL_CAMELLIA (SSL_CAMELLIA128|SSL_CAMELLIA256) |
+@@ -389,6 +390,12 @@ |
+ #define SSL_CIPHER_AEAD_FIXED_NONCE_LEN(ssl_cipher) \ |
+ (((ssl_cipher->algorithm2 >> 24) & 0xf)*2) |
+ |
++/* SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD is a flag in |
++ * SSL_CIPHER.algorithm2 which indicates that the variable part of the nonce is |
++ * included as a prefix of the record. (AES-GCM, for example, does this with an |
++ * 8-byte variable nonce.) */ |
++#define SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD (1<<22) |
++ |
+ /* |
+ * Export and cipher strength information. For each cipher we have to decide |
+ * whether it is exportable or not. This information is likely to change |
+@@ -605,6 +612,9 @@ struct ssl_aead_ctx_st |
+ * records. */ |
+ unsigned char fixed_nonce[8]; |
+ unsigned char fixed_nonce_len, variable_nonce_len, tag_len; |
++ /* variable_nonce_included_in_record is non-zero if the variable nonce |
++ * for a record is included as a prefix before the ciphertext. */ |
++ char variable_nonce_included_in_record; |
+ }; |
+ |
+ #ifndef OPENSSL_NO_COMP |
+diff --git a/ssl/t1_enc.c b/ssl/t1_enc.c |
+index 7af1a32..15800af 100644 |
+--- a/ssl/t1_enc.c |
++++ b/ssl/t1_enc.c |
+@@ -366,6 +366,8 @@ static int tls1_change_cipher_state_aead(SSL *s, char is_read, |
+ memcpy(aead_ctx->fixed_nonce, iv, iv_len); |
+ aead_ctx->fixed_nonce_len = iv_len; |
+ aead_ctx->variable_nonce_len = 8; /* always the case, currently. */ |
++ aead_ctx->variable_nonce_included_in_record = |
++ (s->s3->tmp.new_cipher->algorithm2 & SSL_CIPHER_ALGORITHM2_VARIABLE_NONCE_INCLUDED_IN_RECORD) != 0; |
+ if (aead_ctx->variable_nonce_len + aead_ctx->fixed_nonce_len != EVP_AEAD_nonce_length(aead)) |
+ { |
+ SSLerr(SSL_F_TLS1_CHANGE_CIPHER_STATE_AEAD, ERR_R_INTERNAL_ERROR); |
+@@ -863,6 +865,7 @@ int tls1_enc(SSL *s, int send) |
+ if (send) |
+ { |
+ size_t len = rec->length; |
++ size_t eivlen = 0; |
+ in = rec->input; |
+ out = rec->data; |
+ |
+@@ -878,18 +881,22 @@ int tls1_enc(SSL *s, int send) |
+ * variable nonce. Thus we can copy the sequence number |
+ * bytes into place without overwriting any of the |
+ * plaintext. */ |
+- memcpy(out, ad, aead->variable_nonce_len); |
+- len -= aead->variable_nonce_len; |
++ if (aead->variable_nonce_included_in_record) |
++ { |
++ memcpy(out, ad, aead->variable_nonce_len); |
++ len -= aead->variable_nonce_len; |
++ eivlen = aead->variable_nonce_len; |
++ } |
+ |
+ ad[11] = len >> 8; |
+ ad[12] = len & 0xff; |
+ |
+ n = EVP_AEAD_CTX_seal(&aead->ctx, |
+- out + aead->variable_nonce_len, len + aead->tag_len, |
++ out + eivlen, len + aead->tag_len, |
+ nonce, nonce_used, |
+- in + aead->variable_nonce_len, len, |
++ in + eivlen, len, |
+ ad, sizeof(ad)); |
+- if (n >= 0) |
++ if (n >= 0 && aead->variable_nonce_included_in_record) |
+ n += aead->variable_nonce_len; |
+ } |
+ else |
+@@ -903,12 +910,17 @@ int tls1_enc(SSL *s, int send) |
+ |
+ if (len < aead->variable_nonce_len) |
+ return 0; |
+- memcpy(nonce + nonce_used, in, aead->variable_nonce_len); |
++ memcpy(nonce + nonce_used, |
++ aead->variable_nonce_included_in_record ? in : ad, |
++ aead->variable_nonce_len); |
+ nonce_used += aead->variable_nonce_len; |
+ |
+- in += aead->variable_nonce_len; |
+- len -= aead->variable_nonce_len; |
+- out += aead->variable_nonce_len; |
++ if (aead->variable_nonce_included_in_record) |
++ { |
++ in += aead->variable_nonce_len; |
++ len -= aead->variable_nonce_len; |
++ out += aead->variable_nonce_len; |
++ } |
+ |
+ if (len < aead->tag_len) |
+ return 0; |
+diff --git a/ssl/tls1.h b/ssl/tls1.h |
+index 8cac7df..3cbcb83 100644 |
+--- a/ssl/tls1.h |
++++ b/ssl/tls1.h |
+@@ -526,6 +526,10 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) |
+ #define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256 0x0300C031 |
+ #define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384 0x0300C032 |
+ |
++#define TLS1_CK_ECDHE_RSA_CHACHA20_POLY1305 0x0300CC13 |
++#define TLS1_CK_ECDHE_ECDSA_CHACHA20_POLY1305 0x0300CC14 |
++#define TLS1_CK_DHE_RSA_CHACHA20_POLY1305 0x0300CC15 |
++ |
+ /* XXX |
+ * Inconsistency alert: |
+ * The OpenSSL names of ciphers with ephemeral DH here include the string |
+@@ -677,6 +681,10 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) |
+ #define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256 "ECDH-RSA-AES128-GCM-SHA256" |
+ #define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384 "ECDH-RSA-AES256-GCM-SHA384" |
+ |
++#define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305 "ECDHE-RSA-CHACHA20-POLY1305" |
++#define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 "ECDHE-ECDSA-CHACHA20-POLY1305" |
++#define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305 "DHE-RSA-CHACHA20-POLY1305" |
++ |
+ #define TLS_CT_RSA_SIGN 1 |
+ #define TLS_CT_DSS_SIGN 2 |
+ #define TLS_CT_RSA_FIXED_DH 3 |
+diff --git a/test/Makefile b/test/Makefile |
+index 4c9eabc..4790aa8 100644 |
+--- a/test/Makefile |
++++ b/test/Makefile |
+@@ -86,7 +86,9 @@ OBJ= $(BNTEST).o $(ECTEST).o $(ECDSATEST).o $(ECDHTEST).o $(IDEATEST).o \ |
+ $(MDC2TEST).o $(RMDTEST).o \ |
+ $(RANDTEST).o $(DHTEST).o $(ENGINETEST).o $(CASTTEST).o \ |
+ $(BFTEST).o $(SSLTEST).o $(DSATEST).o $(EXPTEST).o $(RSATEST).o \ |
+- $(EVPTEST).o $(IGETEST).o $(JPAKETEST).o $(ASN1TEST).o |
++ $(EVPTEST).o $(IGETEST).o $(JPAKETEST).o $(ASN1TEST).o $(CHACHATEST).o \ |
++ $(POLY1305TEST).o |
++ |
+ SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \ |
+ $(MD2TEST).c $(MD4TEST).c $(MD5TEST).c \ |
+ $(HMACTEST).c $(WPTEST).c \ |
+@@ -94,7 +96,8 @@ SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \ |
+ $(DESTEST).c $(SHATEST).c $(SHA1TEST).c $(MDC2TEST).c $(RMDTEST).c \ |
+ $(RANDTEST).c $(DHTEST).c $(ENGINETEST).c $(CASTTEST).c \ |
+ $(BFTEST).c $(SSLTEST).c $(DSATEST).c $(EXPTEST).c $(RSATEST).c \ |
+- $(EVPTEST).c $(IGETEST).c $(JPAKETEST).c $(SRPTEST).c $(ASN1TEST).c |
++ $(EVPTEST).c $(IGETEST).c $(JPAKETEST).c $(SRPTEST).c $(ASN1TEST).c \ |
++ $(CHACHATEST).c $(POLY1305TEST).c |
+ |
+ EXHEADER= |
+ HEADER= $(EXHEADER) |
+@@ -137,7 +140,7 @@ alltests: \ |
+ test_enc test_x509 test_rsa test_crl test_sid \ |
+ test_gen test_req test_pkcs7 test_verify test_dh test_dsa \ |
+ test_ss test_ca test_engine test_evp test_ssl test_tsa test_ige \ |
+- test_jpake test_srp test_cms |
++ test_jpake test_srp test_cms test_chacha test_poly1305 |
+ |
+ test_evp: |
+ ../util/shlib_wrap.sh ./$(EVPTEST) evptests.txt |
+@@ -318,6 +321,14 @@ test_srp: $(SRPTEST)$(EXE_EXT) |
+ @echo "Test SRP" |
+ ../util/shlib_wrap.sh ./srptest |
+ |
++test_chacha: $(CHACHATEST)$(EXE_EXT) |
++ @echo "Test ChaCha" |
++ ../util/shlib_wrap.sh ./$(CHACHATEST) |
++ |
++test_poly1305: $(POLY1305TEST)$(EXE_EXT) |
++ @echo "Test Poly1305" |
++ ../util/shlib_wrap.sh ./$(POLY1305TEST) |
++ |
+ lint: |
+ lint -DLINT $(INCLUDES) $(SRC)>fluff |
+ |
+@@ -394,6 +405,12 @@ $(SHA256TEST)$(EXE_EXT): $(SHA256TEST).o $(DLIBCRYPTO) |
+ $(SHA512TEST)$(EXE_EXT): $(SHA512TEST).o $(DLIBCRYPTO) |
+ @target=$(SHA512TEST); $(BUILD_CMD) |
+ |
++$(CHACHATEST)$(EXE_EXT): $(CHACHATEST).o $(DLIBCRYPTO) |
++ @target=$(CHACHATEST); $(BUILD_CMD) |
++ |
++$(POLY1305TEST)$(EXE_EXT): $(POLY1305TEST).o $(DLIBCRYPTO) |
++	@target=$(POLY1305TEST); $(BUILD_CMD) |
++ |
+ $(RMDTEST)$(EXE_EXT): $(RMDTEST).o $(DLIBCRYPTO) |
+ @target=$(RMDTEST); $(BUILD_CMD) |
+ |
+-- |
+1.8.4.1 |
+ |