nss/lib/freebl/chacha20/chacha20_vec.c - Issue 27510015: Support ChaCha20+Poly1305 cipher suites.

Side by Side Diff: nss/lib/freebl/chacha20/chacha20_vec.c

Issue 27510015: Support ChaCha20+Poly1305 cipher suites. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/nss/

Patch Set: Make ChaCha20Poly1305 function conform to freebl API, undo changes to chacha20.c, use generic code … Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /* This Source Code Form is subject to the terms of the Mozilla Public

	2 * License, v. 2.0. If a copy of the MPL was not distributed with this

	3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

	4

	5 /* This implementation is by Ted Krovetz and was submitted to SUPERCOP and

	6 * marked as public domain. It has been altered to allow for non-aligned inputs

	7 * and to allow the block counter to be passed in specifically. */

	8

	9 #include <string.h>

	10 #include <stdint.h>

	11

	12 #include "chacha20.h"

	13

	14 #ifndef CHACHA_RNDS

	15 #define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */

	16 #endif

	17

	18 /* Architecture-neutral way to specify 16-byte vector of ints */

	19 typedef unsigned vec __attribute__ ((vector_size (16)));
	wtc 2013/10/19 01:15:39
	This file and poly1305-donna-x64-sse2-incremental-source.c don't compile on Windows when targeting x64.
	This file seems within reach. I suspect we just need to find the Visual C++ equivalent of "__attribute__ ((vector_size (16)))" and the __SSE2__ and __SSSE3__ macros.
	poly1305-donna-x64-sse2-incremental-source.c uses the __int128 type. I don't know if Visual C++ has an equivalent.

	agl 2013/10/21 21:53:39
	On 2013/10/19 01:15:39, wtc wrote:
	> This file and poly1305-donna-x64-sse2-incremental-source.c don't compile on
	> Windows when targeting x64.
	>
	> This file seems within reach. I suspect we just need to find the Visual C++
	> equivalent of "__attribute__ ((vector_size (16)))" and the __SSE2__ and
	> __SSSE3__ macros.
	Thanks. Let's land without building on Windows and I'll see what I can do.
	> poly1305-donna-x64-sse2-incremental-source.c uses the __int128 type. I don't
	> know if Visual C++ has an equivalent.
	It doesn't. That's a shame. However, the plan is to bump up AES-GCM on platforms where we have AES-NI and hopefully that includes most Windows machines. (And on machines without AES-NI, I bet the fallback Poly1305 implementation is still faster than AES-GCM.)

	wtc 2013/10/22 22:36:42
	On 2013/10/21 21:53:39, agl wrote:
	> Thanks. Let's land without building on Windows and I'll see what I can do.
	I looked into the "__attribute__ ((vector_size (16)))" problem. It is a GCC extension that allows us to use normal arithmetic and logical operators such as + and ^ on integer variables declared with this attribute, and the compiler will generate vector instructions that operate on the four integers in one shot. It is syntactic sugar, because there are equivalent compiler intrinsics.
	Visual C++ doesn't support this syntactic sugar, so to port this file to Visual C++ we will need to convert code such as: a += b; d ^= a; to compiler intrinsics that operate on variables of the type __m128i.
	20

	21 /* This implementation is designed for Neon, SSE and AltiVec machines. The

	22 * following specify how to do certain vector operations efficiently on

	23 * each architecture, using intrinsics.

	24 * This implementation supports parallel processing of multiple blocks,

	25 * including potentially using general-purpose registers.

	26 */

	27 #if __ARM_NEON__

	28 #include <arm_neon.h>

	29 #define GPR_TOO 1

	30 #define VBPI 2

	31 #define ONE (vec)vsetq_lane_u32(1,vdupq_n_u32(0),0)

	32 #define LOAD(m) (vec)(((vec)(m)))

	33 #define STORE(m,r) (((vec)(m))) = (r)

	34 #define ROTV1(x) (vec)vextq_u32((uint32x4_t)x,(uint32x4_t)x,1)

	35 #define ROTV2(x) (vec)vextq_u32((uint32x4_t)x,(uint32x4_t)x,2)

	36 #define ROTV3(x) (vec)vextq_u32((uint32x4_t)x,(uint32x4_t)x,3)

	37 #define ROTW16(x) (vec)vrev32q_u16((uint16x8_t)x)

	38 #if __clang__

	39 #define ROTW7(x) (x << ((vec){ 7, 7, 7, 7})) ^ (x >> ((vec){25,25,25,25}))

	40 #define ROTW8(x) (x << ((vec){ 8, 8, 8, 8})) ^ (x >> ((vec){24,24,24,24}))

	41 #define ROTW12(x) (x << ((vec){12,12,12,12})) ^ (x >> ((vec){20,20,20,20}))

	42 #else

	43 #define ROTW7(x) (vec)vsriq_n_u32(vshlq_n_u32((uint32x4_t)x,7),(uint32x4_t)x,25 )

	44 #define ROTW8(x) (vec)vsriq_n_u32(vshlq_n_u32((uint32x4_t)x,8),(uint32x4_t)x,24 )

	45 #define ROTW12(x) (vec)vsriq_n_u32(vshlq_n_u32((uint32x4_t)x,12),(uint32x4_t)x,2 0)

	46 #endif

	47 #elif __SSE2__

	48 #include <emmintrin.h>

	49 #define GPR_TOO 0

	50 #if __clang__

	51 #define VBPI 4

	52 #else

	53 #define VBPI 3

	54 #endif

	55 #define ONE (vec)_mm_set_epi32(0,0,0,1)

	56 #define LOAD(m) (vec)_mm_loadu_si128((__m128i*)(m))

	57 #define STORE(m,r) _mm_storeu_si128((__m128i*)(m), (__m128i) (r))

	58 #define ROTV1(x) (vec)_mm_shuffle_epi32((__m128i)x,_MM_SHUFFLE(0,3,2,1))

	59 #define ROTV2(x) (vec)_mm_shuffle_epi32((__m128i)x,_MM_SHUFFLE(1,0,3,2))

	60 #define ROTV3(x) (vec)_mm_shuffle_epi32((__m128i)x,_MM_SHUFFLE(2,1,0,3))

	61 #define ROTW7(x) (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i) x,25))

	62 #define ROTW12(x) (vec)(_mm_slli_epi32((__m128i)x,12) ^ _mm_srli_epi32((__m128i) x,20))

	63 #if __SSSE3__

	64 #include <tmmintrin.h>

	65 #define ROTW8(x) (vec)_mm_shuffle_epi8((__m128i)x,_mm_set_epi8(14,13,12,15,10,9 ,8,11,6,5,4,7,2,1,0,3))

	66 #define ROTW16(x) (vec)_mm_shuffle_epi8((__m128i)x,_mm_set_epi8(13,12,15,14,9,8, 11,10,5,4,7,6,1,0,3,2))

	67 #else

	68 #define ROTW8(x) (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i) x,24))

	69 #define ROTW16(x) (vec)(_mm_slli_epi32((__m128i)x,16) ^ _mm_srli_epi32((__m128i) x,16))

	70 #endif

	71 #else

	72 #error -- Implementation supports only machines with neon or SSE2

	73 #endif

	74

	75 #ifndef REVV_BE

	76 #define REVV_BE(x) (x)

	77 #endif

	78

	79 #ifndef REVW_BE

	80 #define REVW_BE(x) (x)

	81 #endif

	82

	83 #define BPI (VBPI + GPR_TOO) /* Blocks computed per loop iteration */

	84

	85 #define DQROUND_VECTORS(a,b,c,d) \

	86 a += b; d ^= a; d = ROTW16(d); \

	87 c += d; b ^= c; b = ROTW12(b); \

	88 a += b; d ^= a; d = ROTW8(d); \

	89 c += d; b ^= c; b = ROTW7(b); \

	90 b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); \

	91 a += b; d ^= a; d = ROTW16(d); \

	92 c += d; b ^= c; b = ROTW12(b); \

	93 a += b; d ^= a; d = ROTW8(d); \

	94 c += d; b ^= c; b = ROTW7(b); \

	95 b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);

	96

	97 #define QROUND_WORDS(a,b,c,d) \

	98 a = a+b; d ^= a; d = d<<16 \| d>>16; \

	99 c = c+d; b ^= c; b = b<<12 \| b>>20; \

	100 a = a+b; d ^= a; d = d<< 8 \| d>>24; \

	101 c = c+d; b ^= c; b = b<< 7 \| b>>25;

	102

	103 #define WRITE_XOR(in, op, d, v0, v1, v2, v3) \

	104 STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \

	105 STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \

	106 STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \

	107 STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));

	108

	109 void ChaCha20XOR(

	110 unsigned char *out,

	111 const unsigned char *in,

	112 unsigned int inlen,

	113 const unsigned char key[32],

	114 const unsigned char nonce[8],

	115 unsigned int counter)

	116 {

	117 unsigned iters, i, op=(unsigned )out, ip=(unsigned )in, *kp;

	118 #if defined(__ARM_NEON__)

	119 unsigned *np;

	120 #endif

	121 vec s0, s1, s2, s3;

	122 #if !defined(__ARM_NEON__) && !defined(__SSE2__)

	123 __attribute__ ((aligned (16))) unsigned key[8], nonce[4];

	124 #endif

	125 __attribute__ ((aligned (16))) unsigned chacha_const[] =

	126 {0x61707865,0x3320646E,0x79622D32,0x6B206574};

	127 #if defined(__ARM_NEON__) \|\| defined(__SSE2__)

	128 kp = (unsigned *)key;

	129 #else

	130 ((vec )key)[0] = REVV_BE(((vec )key)[0]);

	131 ((vec )key)[1] = REVV_BE(((vec )key)[1]);

	132 nonce[0] = REVW_BE(((unsigned *)nonce)[0]);

	133 nonce[1] = REVW_BE(((unsigned *)nonce)[1]);

	134 nonce[2] = REVW_BE(((unsigned *)nonce)[2]);

	135 nonce[3] = REVW_BE(((unsigned *)nonce)[3]);

	136 kp = (unsigned *)key;

	137 np = (unsigned *)nonce;

	138 #endif

	139 #if defined(__ARM_NEON__)

	140 np = (unsigned*) nonce;

	141 #endif

	142 s0 = LOAD(chacha_const);

	143 s1 = LOAD(&((vec*)kp)[0]);

	144 s2 = LOAD(&((vec*)kp)[1]);

	145 s3 = (vec) {

	146 counter,

	147 0,

	148 ((uint32_t*)nonce)[0],

	149 ((uint32_t*)nonce)[1]

	150 };

	151

	152 for (iters = 0; iters < inlen/(BPI*64); iters++) {

	153 #if GPR_TOO

	154 register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8,

	155 x9, x10, x11, x12, x13, x14, x15;

	156 #endif

	157 #if VBPI > 2

	158 vec v8,v9,v10,v11;

	159 #endif

	160 #if VBPI > 3

	161 vec v12,v13,v14,v15;

	162 #endif

	163

	164 vec v0,v1,v2,v3,v4,v5,v6,v7;

	165 v4 = v0 = s0; v5 = v1 = s1; v6 = v2 = s2; v3 = s3;

	166 v7 = v3 + ONE;

	167 #if VBPI > 2

	168 v8 = v4; v9 = v5; v10 = v6;

	169 v11 = v7 + ONE;

	170 #endif

	171 #if VBPI > 3

	172 v12 = v8; v13 = v9; v14 = v10;

	173 v15 = v11 + ONE;

	174 #endif

	175 #if GPR_TOO

	176 x0 = chacha_const[0]; x1 = chacha_const[1];

	177 x2 = chacha_const[2]; x3 = chacha_const[3];

	178 x4 = kp[0]; x5 = kp[1]; x6 = kp[2]; x7 = kp[3];

	179 x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7];

	180 x12 = counter+BPI*iters+(BPI-1); x13 = 0; x14 = np[0]; x15 = np[1];

	181 #endif

	182 for (i = CHACHA_RNDS/2; i; i--) {

	183 DQROUND_VECTORS(v0,v1,v2,v3)

	184 DQROUND_VECTORS(v4,v5,v6,v7)

	185 #if VBPI > 2

	186 DQROUND_VECTORS(v8,v9,v10,v11)

	187 #endif

	188 #if VBPI > 3

	189 DQROUND_VECTORS(v12,v13,v14,v15)

	190 #endif

	191 #if GPR_TOO

	192 QROUND_WORDS( x0, x4, x8,x12)

	193 QROUND_WORDS( x1, x5, x9,x13)

	194 QROUND_WORDS( x2, x6,x10,x14)

	195 QROUND_WORDS( x3, x7,x11,x15)

	196 QROUND_WORDS( x0, x5,x10,x15)

	197 QROUND_WORDS( x1, x6,x11,x12)

	198 QROUND_WORDS( x2, x7, x8,x13)

	199 QROUND_WORDS( x3, x4, x9,x14)

	200 #endif

	201 }

	202

	203 WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)

	204 s3 += ONE;

	205 WRITE_XOR(ip, op, 16, v4+s0, v5+s1, v6+s2, v7+s3)

	206 s3 += ONE;

	207 #if VBPI > 2

	208 WRITE_XOR(ip, op, 32, v8+s0, v9+s1, v10+s2, v11+s3)

	209 s3 += ONE;

	210 #endif

	211 #if VBPI > 3

	212 WRITE_XOR(ip, op, 48, v12+s0, v13+s1, v14+s2, v15+s3)

	213 s3 += ONE;

	214 #endif

	215 ip += VBPI*16;

	216 op += VBPI*16;

	217 #if GPR_TOO

	218 op[0] = REVW_BE(REVW_BE(ip[0]) ^ (x0 + chacha_const[0]));

	219 op[1] = REVW_BE(REVW_BE(ip[1]) ^ (x1 + chacha_const[1]));

	220 op[2] = REVW_BE(REVW_BE(ip[2]) ^ (x2 + chacha_const[2]));

	221 op[3] = REVW_BE(REVW_BE(ip[3]) ^ (x3 + chacha_const[3]));

	222 op[4] = REVW_BE(REVW_BE(ip[4]) ^ (x4 + kp[0]));

	223 op[5] = REVW_BE(REVW_BE(ip[5]) ^ (x5 + kp[1]));

	224 op[6] = REVW_BE(REVW_BE(ip[6]) ^ (x6 + kp[2]));

	225 op[7] = REVW_BE(REVW_BE(ip[7]) ^ (x7 + kp[3]));

	226 op[8] = REVW_BE(REVW_BE(ip[8]) ^ (x8 + kp[4]));

	227 op[9] = REVW_BE(REVW_BE(ip[9]) ^ (x9 + kp[5]));

	228 op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6]));

	229 op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7]));

	230 op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + BPI*iters+(BPI-1)));

	231 op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13));

	232 op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[0]));

	233 op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[1]));

	234 s3 += ONE;

	235 ip += 16;

	236 op += 16;

	237 #endif

	238 }

	239

	240 for (iters = inlen%(BPI*64)/64; iters != 0; iters--) {

	241 vec v0 = s0, v1 = s1, v2 = s2, v3 = s3;

	242 for (i = CHACHA_RNDS/2; i; i--) {

	243 DQROUND_VECTORS(v0,v1,v2,v3);

	244 }

	245 WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)

	246 s3 += ONE;

	247 ip += 16;

	248 op += 16;

	249 }

	250

	251 inlen = inlen % 64;

	252 if (inlen) {

	253 __attribute__ ((aligned (16))) vec buf[4];

	254 vec v0,v1,v2,v3;

	255 v0 = s0; v1 = s1; v2 = s2; v3 = s3;

	256 for (i = CHACHA_RNDS/2; i; i--) {

	257 DQROUND_VECTORS(v0,v1,v2,v3);

	258 }

	259

	260 if (inlen >= 16) {

	261 STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0));

	262 if (inlen >= 32) {

	263 STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1));

	264 if (inlen >= 48) {

	265 STORE(op + 8, LOAD(ip + 8) ^ REVV_BE(v2 + s2));

	266 buf[3] = REVV_BE(v3 + s3);

	267 } else {

	268 buf[2] = REVV_BE(v2 + s2);

	269 }

	270 } else {

	271 buf[1] = REVV_BE(v1 + s1);

	272 }

	273 } else {

	274 buf[0] = REVV_BE(v0 + s0);

	275 }

	276

	277 for (i=inlen & ~15; i<inlen; i++) {

	278 ((char )op)[i] = ((char )ip)[i] ^ ((char *)buf)[i];

	279 }

	280 }

	281 }

OLD	NEW

« nss/lib/freebl/chacha20/chacha20.c ('K') | « nss/lib/freebl/chacha20/chacha20.c ('k') | nss/lib/freebl/chacha20poly1305.c » ('j') | nss/lib/freebl/chacha20poly1305.c » ('J')