patches/nss-chacha20-poly1305.patch - Issue 27510015: Support ChaCha20+Poly1305 cipher suites.

Unified Diff: patches/nss-chacha20-poly1305.patch

Issue 27510015: Support ChaCha20+Poly1305 cipher suites. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/nss/

Patch Set: Fold long lines Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: patches/nss-chacha20-poly1305.patch

===================================================================

--- patches/nss-chacha20-poly1305.patch (revision 0)

+++ patches/nss-chacha20-poly1305.patch (revision 0)

@@ -0,0 +1,1932 @@

+Index: nss/lib/softoken/pkcs11.c

+===================================================================

+--- nss/lib/softoken/pkcs11.c (revision 228205)

++++ nss/lib/softoken/pkcs11.c (working copy)

+@@ -368,6 +368,9 @@

+ {CKM_SEED_MAC, {16, 16, CKF_SN_VR}, PR_TRUE},

+ {CKM_SEED_MAC_GENERAL, {16, 16, CKF_SN_VR}, PR_TRUE},

+ {CKM_SEED_CBC_PAD, {16, 16, CKF_EN_DE_WR_UN}, PR_TRUE},

++ /* ------------------------- ChaCha20 Operations ---------------------- */

++ {CKM_NSS_CHACHA20_KEY_GEN, {32, 32, CKF_GENERATE}, PR_TRUE},

++ {CKM_NSS_CHACHA20_POLY1305,{32, 32, CKF_EN_DE}, PR_TRUE},

+ /* ------------------------- Hashing Operations ----------------------- */

+ {CKM_MD2, {0, 0, CKF_DIGEST}, PR_FALSE},

+ {CKM_MD2_HMAC, {1, 128, CKF_SN_VR}, PR_TRUE},

+Index: nss/lib/softoken/pkcs11c.c

+===================================================================

+--- nss/lib/softoken/pkcs11c.c (revision 228205)

++++ nss/lib/softoken/pkcs11c.c (working copy)

+@@ -475,6 +475,97 @@

+ maxLen, input, inputLen);

+ }

++/* sftk_ChaCha20Poly1305_CreateContext builds the softoken-side state for one
++ * ChaCha20+Poly1305 AEAD operation.
++ *
++ * key, keyLen: raw key bytes, handed to ChaCha20Poly1305_InitContext together
++ *     with params->ulTagLen.
++ * params: AEAD parameters. params->ulIvLen must equal sizeof(ctx->nonce);
++ *     params->pIv is copied into the context, as are the params->ulAADLen
++ *     bytes at params->pAAD (inline when they fit in ctx->ad, otherwise in a
++ *     separately allocated overflow buffer).
++ *
++ * Returns the new context, or NULL on failure. SEC_ERROR_INPUT_LEN is set for
++ * a wrong nonce length; no error code is set here on the other failure paths.
++ * A successful result is released with
++ * sftk_ChaCha20Poly1305_DestroyContext(ctx, PR_TRUE). */
++static SFTKChaCha20Poly1305Info *
++sftk_ChaCha20Poly1305_CreateContext(const unsigned char *key,
++                                    unsigned int keyLen,
++                                    const CK_NSS_AEAD_PARAMS* params)
++{
++    SFTKChaCha20Poly1305Info *ctx;
++    if (params->ulIvLen != sizeof(ctx->nonce)) {
++        PORT_SetError(SEC_ERROR_INPUT_LEN);
++        return NULL;
++    }
++    ctx = PORT_New(SFTKChaCha20Poly1305Info);
++    if (ctx == NULL) {
++        return NULL;
++    }
++    if (ChaCha20Poly1305_InitContext(&ctx->freeblCtx, key, keyLen,
++                                     params->ulTagLen) != SECSuccess) {
++        PORT_Free(ctx);
++        return NULL;
++    }
++    memcpy(ctx->nonce, params->pIv, sizeof(ctx->nonce));
++    ctx->adOverflow = NULL;
++    ctx->adLen = params->ulAADLen;
++    if (params->ulAADLen > sizeof(ctx->ad)) {
++        /* Need to allocate an overflow buffer for the additional data. */
++        ctx->adOverflow = (unsigned char *)PORT_Alloc(params->ulAADLen);
++        if (!ctx->adOverflow) {
++            /* The freebl context is already initialized with the key, so
++             * tear it down through the same call that
++             * sftk_ChaCha20Poly1305_DestroyContext uses before releasing
++             * the enclosing struct. */
++            ChaCha20Poly1305_DestroyContext(&ctx->freeblCtx, PR_FALSE);
++            PORT_Free(ctx);
++            return NULL;
++        }
++        memcpy(ctx->adOverflow, params->pAAD, params->ulAADLen);
++    } else if (params->ulAADLen != 0) {
++        /* Skip the copy for empty additional data: pAAD may then be NULL,
++         * and memcpy must not be given a null pointer even for 0 bytes. */
++        memcpy(ctx->ad, params->pAAD, params->ulAADLen);
++    }
++    return ctx;
++}

++/* sftk_ChaCha20Poly1305_DestroyContext tears down |ctx|: the embedded freebl
++ * context is destroyed in place (it is a member, so it is never freed on its
++ * own), any additional-data overflow buffer is released, and the stored AD
++ * length is reset. The SFTKChaCha20Poly1305Info itself is freed only when
++ * |freeit| is true. */
++static void
++sftk_ChaCha20Poly1305_DestroyContext(SFTKChaCha20Poly1305Info *ctx,
++                                     PRBool freeit)
++{
++    unsigned char *overflow;
++    ChaCha20Poly1305_DestroyContext(&ctx->freeblCtx, PR_FALSE);
++    overflow = ctx->adOverflow;
++    if (overflow != NULL) {
++        PORT_Free(overflow);
++    }
++    ctx->adOverflow = NULL;
++    ctx->adLen = 0;
++    if (!freeit) {
++        return;
++    }
++    PORT_Free(ctx);
++}

++/* sftk_ChaCha20Poly1305_Encrypt seals |input| into |output| with
++ * ChaCha20Poly1305_Seal, supplying the nonce and additional data saved in
++ * |ctx|. The additional data comes from the overflow buffer when one was
++ * allocated and from the inline ctx->ad array otherwise. The status of
++ * ChaCha20Poly1305_Seal is returned unchanged. */
++static SECStatus
++sftk_ChaCha20Poly1305_Encrypt(const SFTKChaCha20Poly1305Info *ctx,
++                              unsigned char *output, unsigned int *outputLen,
++                              unsigned int maxOutputLen,
++                              const unsigned char *input, unsigned int inputLen)
++{
++    const unsigned char *aad =
++        (ctx->adOverflow != NULL) ? ctx->adOverflow : ctx->ad;
++    SECStatus rv = ChaCha20Poly1305_Seal(&ctx->freeblCtx, output, outputLen,
++                                         maxOutputLen, input, inputLen,
++                                         ctx->nonce, sizeof(ctx->nonce),
++                                         aad, ctx->adLen);
++    return rv;
++}

++/* sftk_ChaCha20Poly1305_Decrypt opens |input| into |output| with
++ * ChaCha20Poly1305_Open, supplying the nonce and additional data saved in
++ * |ctx|. The additional data comes from the overflow buffer when one was
++ * allocated and from the inline ctx->ad array otherwise. The status of
++ * ChaCha20Poly1305_Open is returned unchanged. */
++static SECStatus
++sftk_ChaCha20Poly1305_Decrypt(const SFTKChaCha20Poly1305Info *ctx,
++                              unsigned char *output, unsigned int *outputLen,
++                              unsigned int maxOutputLen,
++                              const unsigned char *input, unsigned int inputLen)
++{
++    const unsigned char *aad =
++        (ctx->adOverflow != NULL) ? ctx->adOverflow : ctx->ad;
++    SECStatus rv = ChaCha20Poly1305_Open(&ctx->freeblCtx, output, outputLen,
++                                         maxOutputLen, input, inputLen,
++                                         ctx->nonce, sizeof(ctx->nonce),
++                                         aad, ctx->adLen);
++    return rv;
++}

+ /** NSC_CryptInit initializes an encryption/Decryption operation.

+ *

+ * Always called by NSC_EncryptInit, NSC_DecryptInit, NSC_WrapKey,NSC_UnwrapKey.

+@@ -870,6 +961,35 @@

+ context->destroy = (SFTKDestroy) AES_DestroyContext;

+ break;

++ case CKM_NSS_CHACHA20_POLY1305:

++ if (pMechanism->ulParameterLen != sizeof(CK_NSS_AEAD_PARAMS)) {

++ crv = CKR_MECHANISM_PARAM_INVALID;

++ break;

++ }

++ context->multi = PR_FALSE;

++ if (key_type != CKK_NSS_CHACHA20) {

++ crv = CKR_KEY_TYPE_INCONSISTENT;

++ break;

++ }

++ att = sftk_FindAttribute(key,CKA_VALUE);

++ if (att == NULL) {

++ crv = CKR_KEY_HANDLE_INVALID;

++ break;

++ }

++ context->cipherInfo = sftk_ChaCha20Poly1305_CreateContext(

++ (unsigned char*) att->attrib.pValue, att->attrib.ulValueLen,

++ (CK_NSS_AEAD_PARAMS*) pMechanism->pParameter);

++ sftk_FreeAttribute(att);

++ if (context->cipherInfo == NULL) {

++ crv = sftk_MapCryptError(PORT_GetError());

++ break;

++ }

++ context->update = (SFTKCipher) (isEncrypt ?

++ sftk_ChaCha20Poly1305_Encrypt :

++ sftk_ChaCha20Poly1305_Decrypt);

++ context->destroy = (SFTKDestroy) sftk_ChaCha20Poly1305_DestroyContext;

++ break;

+ case CKM_NETSCAPE_AES_KEY_WRAP_PAD:

+ context->doPad = PR_TRUE;

+ /* fall thru */

+@@ -3272,6 +3392,10 @@

+ *key_type = CKK_AES;

+ if (*key_length == 0) crv = CKR_TEMPLATE_INCOMPLETE;

+ break;

++ case CKM_NSS_CHACHA20_KEY_GEN:

++ *key_type = CKK_NSS_CHACHA20;

++ if (*key_length == 0) crv = CKR_TEMPLATE_INCOMPLETE;

++ break;

+ default:

+ PORT_Assert(0);

+ crv = CKR_MECHANISM_INVALID;

+@@ -3516,6 +3640,7 @@

+ case CKM_SEED_KEY_GEN:

+ case CKM_CAMELLIA_KEY_GEN:

+ case CKM_AES_KEY_GEN:

++ case CKM_NSS_CHACHA20_KEY_GEN:

+ #if NSS_SOFTOKEN_DOES_RC5

+ case CKM_RC5_KEY_GEN:

+ #endif

+Index: nss/lib/softoken/pkcs11i.h

+===================================================================

+--- nss/lib/softoken/pkcs11i.h (revision 228205)

++++ nss/lib/softoken/pkcs11i.h (working copy)

+@@ -14,6 +14,7 @@

+ #include "pkcs11t.h"

+ #include "sftkdbt.h"

++#include "chacha20poly1305.h"

+ #include "hasht.h"

+ /*

+@@ -104,6 +105,7 @@

+ typedef struct SFTKOAEPEncryptInfoStr SFTKOAEPEncryptInfo;

+ typedef struct SFTKOAEPDecryptInfoStr SFTKOAEPDecryptInfo;

+ typedef struct SFTKSSLMACInfoStr SFTKSSLMACInfo;

++typedef struct SFTKChaCha20Poly1305InfoStr SFTKChaCha20Poly1305Info;

+ typedef struct SFTKItemTemplateStr SFTKItemTemplate;

+ /* define function pointer typdefs for pointer tables */

+@@ -399,6 +401,16 @@

+ unsigned int keySize;

+ };

++/* SFTKChaCha20Poly1305Info saves the key, tag length, nonce, and additional
++ * data for a ChaCha20+Poly1305 AEAD operation.
++ *
++ * The key and tag length live inside the embedded freebl context; the nonce
++ * and additional data are copied out of the caller's CK_NSS_AEAD_PARAMS when
++ * the context is created, so the parameter block need not outlive it. */
++struct SFTKChaCha20Poly1305InfoStr {
++ ChaCha20Poly1305Context freeblCtx; /* embedded by value, set up via ChaCha20Poly1305_InitContext */
++ unsigned char nonce[8]; /* copy of pIv; ulIvLen must equal this size */
++ unsigned char ad[16]; /* inline storage for additional data of up to 16 bytes */
++ unsigned char *adOverflow; /* heap copy of longer additional data, else NULL */
++ unsigned int adLen; /* length of the additional data in ad or adOverflow */
++};

+ /*

+ * Template based on SECItems, suitable for passing as arrays

+ */

+Index: nss/lib/freebl/blapit.h

+===================================================================

+--- nss/lib/freebl/blapit.h (revision 228205)

++++ nss/lib/freebl/blapit.h (working copy)

+@@ -222,6 +222,7 @@

+ struct SHA512ContextStr ;

+ struct AESKeyWrapContextStr ;

+ struct SEEDContextStr ;

++struct ChaCha20Poly1305ContextStr;

+ typedef struct DESContextStr DESContext;

+ typedef struct RC2ContextStr RC2Context;

+@@ -240,6 +241,7 @@

+ typedef struct SHA512ContextStr SHA384Context;

+ typedef struct AESKeyWrapContextStr AESKeyWrapContext;

+ typedef struct SEEDContextStr SEEDContext;

++typedef struct ChaCha20Poly1305ContextStr ChaCha20Poly1305Context;

+ /***************************************************************************

+ ** RSA Public and Private Key structures

+Index: nss/lib/freebl/blapi.h

+===================================================================

+--- nss/lib/freebl/blapi.h (revision 228205)

++++ nss/lib/freebl/blapi.h (working copy)

+@@ -818,7 +818,39 @@

+ unsigned int *outputLen, unsigned int maxOutputLen,

+ const unsigned char *input, unsigned int inputLen);

++/******************************************/
++/*
++** ChaCha20+Poly1305 AEAD
++*/
++/*
++** Initialize a caller-provided context with a key and tag length.
++** Callers compare the result against SECSuccess.
++*/
++extern SECStatus
++ChaCha20Poly1305_InitContext(ChaCha20Poly1305Context *ctx,
++ const unsigned char *key, unsigned int keyLen,
++ unsigned int tagLen);
++/*
++** Return a context for the given key and tag length.
++** NOTE(review): presumably heap-allocated and NULL on failure -- confirm
++** against the implementation.
++*/
++extern ChaCha20Poly1305Context *
++ChaCha20Poly1305_CreateContext(const unsigned char *key, unsigned int keyLen,
++ unsigned int tagLen);
++/*
++** Destroy a context. Softoken passes freeit == PR_FALSE for a context that
++** is embedded in a larger structure.
++** NOTE(review): presumably freeit == PR_TRUE also frees ctx -- confirm.
++*/
++extern void
++ChaCha20Poly1305_DestroyContext(ChaCha20Poly1305Context *ctx, PRBool freeit);
++/*
++** AEAD seal: softoken's encrypt path. Takes the plaintext in input, the
++** nonce and the additional data ad; writes at most maxOutputLen bytes to
++** output and reports the count through outputLen.
++** NOTE(review): output presumably carries ciphertext followed by the tag --
++** confirm against the implementation.
++*/
++extern SECStatus
++ChaCha20Poly1305_Seal(const ChaCha20Poly1305Context *ctx,
++ unsigned char *output, unsigned int *outputLen,
++ unsigned int maxOutputLen,
++ const unsigned char *input, unsigned int inputLen,
++ const unsigned char *nonce, unsigned int nonceLen,
++ const unsigned char *ad, unsigned int adLen);
++/*
++** AEAD open: softoken's decrypt path, with the same parameter layout as
++** ChaCha20Poly1305_Seal.
++** NOTE(review): presumably fails when the tag does not verify -- confirm.
++*/
++extern SECStatus
++ChaCha20Poly1305_Open(const ChaCha20Poly1305Context *ctx,
++ unsigned char *output, unsigned int *outputLen,
++ unsigned int maxOutputLen,
++ const unsigned char *input, unsigned int inputLen,
++ const unsigned char *nonce, unsigned int nonceLen,
++ const unsigned char *ad, unsigned int adLen);

+ /******************************************/

+ /*

+ ** MD5 secure hash function

+Index: nss/lib/freebl/poly1305/poly1305-donna-x64-sse2-incremental-source.c

+===================================================================

+--- nss/lib/freebl/poly1305/poly1305-donna-x64-sse2-incremental-source.c (revision 0)

++++ nss/lib/freebl/poly1305/poly1305-donna-x64-sse2-incremental-source.c (revision 0)

+@@ -0,0 +1,623 @@

++/* This Source Code Form is subject to the terms of the Mozilla Public

++ * License, v. 2.0. If a copy of the MPL was not distributed with this

++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

++/* This implementation of poly1305 is by Andrew Moon

++ * (https://github.com/floodyberry/poly1305-donna) and released as public

++ * domain. It implements SIMD vectorization based on the algorithm described in

++ * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte

++ * block size. */

++#include <emmintrin.h>

++#include <stdint.h>

++#include "poly1305.h"

++/* Compiler glue: alignment attribute and inline keyword. */
++#define ALIGN(x) __attribute__((aligned(x)))
++#define INLINE inline
++/* Load/store helpers. They access memory directly through a casted pointer,
++ * so they yield little-endian values only on a little-endian target that
++ * tolerates unaligned access (this file is x64-only). */
++#define U8TO64_LE(m) (*(uint64_t*)(m))
++#define U8TO32_LE(m) (*(uint32_t*)(m))
++/* Both the value argument and the whole expansion are parenthesized so the
++ * store still parses as one assignment inside a larger expression. */
++#define U64TO8_LE(m,v) (*(uint64_t*)(m) = (v))

++/* xmmi: one 128-bit SSE2 integer register. */
++typedef __m128i xmmi;
++/* uint128_t: compiler-provided 128-bit unsigned integer used for the 64x64
++ * limb products. */
++typedef unsigned __int128 uint128_t;
++/* 16-byte-aligned constant vectors, read with _mm_load_si128. Only the low
++ * 32 bits of each 64-bit half are populated. */
++/* keeps the low 26 bits of each 64-bit half */
++static const uint32_t ALIGN(16) poly1305_x64_sse2_message_mask[4] = {(1 << 26) - 1, 0, (1 << 26) - 1, 0};
++/* the constant 5 in each 64-bit half, used to fold carries and to build the S2x values */
++static const uint32_t ALIGN(16) poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
++/* bit 24 of the fifth 26-bit limb (4 * 26 + 24 = 128), OR-ed into each message block's top limb */
++static const uint32_t ALIGN(16) poly1305_x64_sse2_1shl128[4] = {(1 << 24), 0, (1 << 24), 0};

++/* 128-bit sum of two 128-bit values. */
++static uint128_t INLINE
++add128(uint128_t a, uint128_t b) {
++    a += b;
++    return a;
++}
++/* 128-bit sum of a 128-bit value and a 64-bit value. */
++static uint128_t INLINE
++add128_64(uint128_t a, uint64_t b) {
++    a += b;
++    return a;
++}
++/* Full 128-bit product of two 64-bit values. */
++static uint128_t INLINE
++mul64x64_128(uint64_t a, uint64_t b) {
++    uint128_t wide = a;
++    wide *= b;
++    return wide;
++}
++/* Low 64 bits of a 128-bit value. */
++static uint64_t INLINE
++lo128(uint128_t a) {
++    uint64_t low = (uint64_t)a;
++    return low;
++}
++/* Low 64 bits of (v >> shift). */
++static uint64_t INLINE
++shr128(uint128_t v, const int shift) {
++    v >>= shift;
++    return (uint64_t)v;
++}
++/* Low 64 bits of ((hi:lo) >> shift), where hi:lo is one 128-bit value. */
++static uint64_t INLINE
++shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
++    uint128_t wide = hi;
++    wide <<= 64;
++    wide |= lo;
++    wide >>= shift;
++    return (uint64_t)wide;
++}

++/* poly1305_power holds one power of r in the form the SSE2 code multiplies
++ * by. Each member is 16 bytes, viewable as one xmm register (v), two 64-bit
++ * lanes (u) or four 32-bit lanes (d).
++ *
++ * poly1305_first_block fills R20..R24 with the five 26-bit limbs of the
++ * power, each duplicated into d[0] and d[2], and S21..S24 with R21..R24
++ * multiplied by 5. */
++typedef struct poly1305_power_t {
++ union {
++ xmmi v;
++ uint64_t u[2];
++ uint32_t d[4];
++ } R20,R21,R22,R23,R24,S21,S22,S23,S24;
++} poly1305_power;

++/* poly1305_state_internal is the 64-byte-aligned working state that
++ * poly1305_aligned_state locates inside the caller's poly1305_state.
++ *
++ * Once poly1305_first_block has run, P[1] holds r^2 and P[0] holds r^4. The
++ * d[1]/d[3] lanes of P[1].R20..R24 double as storage for r (three 64-bit
++ * limbs) and the 16-byte pad, which is why r0..r2 and pad0/pad1 appear below
++ * only as commented-out fields. */
++typedef struct poly1305_state_internal_t {
++ poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 bytes of free storage */
++ union {
++ xmmi H[5]; /* 80 bytes */
++ uint64_t HH[10];
++ };
++ /* uint64_t r0,r1,r2; [24 bytes] */
++ /* uint64_t pad0,pad1; [16 bytes] */
++ uint64_t started; /* 8 bytes */
++ uint64_t leftover; /* 8 bytes */
++ uint8_t buffer[64]; /* 64 bytes */
++} poly1305_state_internal; /* 448 bytes total + 63 bytes for alignment = 511 bytes raw */

++/* Returns the first 64-byte-aligned address at or after |state|, viewed as
++ * the internal state structure. */
++static poly1305_state_internal INLINE
++*poly1305_aligned_state(poly1305_state *state) {
++    const uint64_t align_mask = 63;
++    uint64_t addr = (uint64_t)state;
++    addr = (addr + align_mask) & ~align_mask;
++    return (poly1305_state_internal *)addr;
++}

++/* copy 0-63 bytes
++ *
++ * Copies the low six bits' worth of |bytes| from |src| to |dst|, one
++ * power-of-two chunk per set bit (32, 16, 8, 4, 2, 1), using unaligned
++ * loads and stores. Both pointers advance together: this avoids forming
++ * src - dst, which is undefined when the two buffers are not part of the
++ * same array, and lets |src| stay const-qualified throughout. */
++static void INLINE
++poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
++    if (bytes & 32) {
++        _mm_storeu_si128((xmmi *)(dst + 0), _mm_loadu_si128((const xmmi *)(src + 0)));
++        _mm_storeu_si128((xmmi *)(dst + 16), _mm_loadu_si128((const xmmi *)(src + 16)));
++        dst += 32;
++        src += 32;
++    }
++    if (bytes & 16) {
++        _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((const xmmi *)src));
++        dst += 16;
++        src += 16;
++    }
++    if (bytes & 8) { *(uint64_t *)dst = *(const uint64_t *)src; dst += 8; src += 8; }
++    if (bytes & 4) { *(uint32_t *)dst = *(const uint32_t *)src; dst += 4; src += 4; }
++    if (bytes & 2) { *(uint16_t *)dst = *(const uint16_t *)src; dst += 2; src += 2; }
++    if (bytes & 1) { *dst = *src; }
++}

++/* zero 0-15 bytes
++ *
++ * Clears the low four bits' worth of |bytes| starting at |dst|, one
++ * power-of-two chunk per set bit (8, 4, 2, 1). */
++static void INLINE
++poly1305_block_zero(uint8_t *dst, size_t bytes) {
++    uint8_t *cursor = dst;
++    if ((bytes & 8) != 0) {
++        *(uint64_t *)cursor = 0;
++        cursor += 8;
++    }
++    if ((bytes & 4) != 0) {
++        *(uint32_t *)cursor = 0;
++        cursor += 4;
++    }
++    if ((bytes & 2) != 0) {
++        *(uint16_t *)cursor = 0;
++        cursor += 2;
++    }
++    if ((bytes & 1) != 0) {
++        *cursor = 0;
++    }
++}

++/* Returns the smaller of |a| and |b|. */
++static size_t INLINE
++poly1305_min(size_t a, size_t b) {
++    if (b <= a) {
++        return b;
++    }
++    return a;
++}

++/* Poly1305Init prepares |state| for a new MAC computation keyed by |key|.
++ *
++ * The first 16 key bytes are clamped and split into the three limbs
++ * r0, r1, r2 with the masks below; the last 16 key bytes are kept as the
++ * pad. Both are parked in the d[1]/d[3] lanes of st->P[1], the accumulator
++ * H is zeroed and the started/leftover fields are reset to 0. */
++void
++Poly1305Init(poly1305_state *state, const unsigned char key[32]) {
++ /* all work is done on the 64-byte-aligned view inside |state| */
++ poly1305_state_internal *st = poly1305_aligned_state(state);
++ poly1305_power *p;
++ uint64_t r0,r1,r2;
++ uint64_t t0,t1;
++ /* clamp key */
++ t0 = U8TO64_LE(key + 0);
++ t1 = U8TO64_LE(key + 8);
++ /* r0 = low 44 bits; t0 then becomes the next 64 bits of the key (bits 44 and up) */
++ r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20;
++ /* r1 = next 44 bits; r2 = what remains above bit 88 */
++ r1 = t0 & 0xfffffc0ffff; t1 >>= 24;
++ r2 = t1 & 0x00ffffffc0f;
++ /* store r in un-used space of st->P[1] */
++ p = &st->P[1];
++ p->R20.d[1] = (uint32_t)(r0 );
++ p->R20.d[3] = (uint32_t)(r0 >> 32);
++ p->R21.d[1] = (uint32_t)(r1 );
++ p->R21.d[3] = (uint32_t)(r1 >> 32);
++ p->R22.d[1] = (uint32_t)(r2 );
++ p->R22.d[3] = (uint32_t)(r2 >> 32);
++ /* store pad */
++ p->R23.d[1] = U8TO32_LE(key + 16);
++ p->R23.d[3] = U8TO32_LE(key + 20);
++ p->R24.d[1] = U8TO32_LE(key + 24);
++ p->R24.d[3] = U8TO32_LE(key + 28);
++ /* H = 0 */
++ st->H[0] = _mm_setzero_si128();
++ st->H[1] = _mm_setzero_si128();
++ st->H[2] = _mm_setzero_si128();
++ st->H[3] = _mm_setzero_si128();
++ st->H[4] = _mm_setzero_si128();
++ st->started = 0;
++ st->leftover = 0;
++}

++/* poly1305_first_block builds the power tables and seeds the accumulator.
++ *
++ * It reads r and the pad back out of the spare lanes of st->P[1], squares r
++ * twice (r^2 into P[1], then r^4 into P[0]), restores r and the pad, and
++ * loads the first 32 bytes at |m| as two 16-byte blocks held side by side
++ * in st->H, each with the 2^128 bit set. */
++static void
++poly1305_first_block(poly1305_state_internal *st, const uint8_t *m) {
++ const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
++ const xmmi FIVE = _mm_load_si128((xmmi*)poly1305_x64_sse2_5);
++ const xmmi HIBIT = _mm_load_si128((xmmi*)poly1305_x64_sse2_1shl128);
++ xmmi T5,T6;
++ poly1305_power *p;
++ uint128_t d[3];
++ uint64_t r0,r1,r2;
++ uint64_t r20,r21,r22,s22;
++ uint64_t pad0,pad1;
++ uint64_t c;
++ uint64_t i;
++ /* pull out stored info */
++ /* must happen before the loop: the .v stores below overwrite all four lanes of P[1].R20..R24 */
++ p = &st->P[1];
++ r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
++ r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
++ r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
++ pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
++ pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
++ /* compute powers r^2,r^4 */
++ r20 = r0;
++ r21 = r1;
++ r22 = r2;
++ /* each pass squares (r20,r21,r22) in place; pass 0 writes P[1], pass 1 writes P[0] */
++ for (i = 0; i < 2; i++) {
++ s22 = r22 * (5 << 2);
++ d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
++ d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
++ d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));
++ /* carry: limbs are 44, 44 and 42 bits; the top carry re-enters limb 0 times 5 */
++ r20 = lo128(d[0]) & 0xfffffffffff; c = shr128(d[0], 44);
++ d[1] = add128_64(d[1], c); r21 = lo128(d[1]) & 0xfffffffffff; c = shr128(d[1], 44);
++ d[2] = add128_64(d[2], c); r22 = lo128(d[2]) & 0x3ffffffffff; c = shr128(d[2], 42);
++ r20 += c * 5; c = (r20 >> 44); r20 = r20 & 0xfffffffffff;
++ r21 += c;
++ /* repack into five 26-bit limbs, each placed in 32-bit lanes 0 and 2 (lanes 1 and 3 become 0) */
++ p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)( r20 ) & 0x3ffffff), _MM_SHUFFLE(1,0,1,0));
++ p->R21.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), _MM_SHUFFLE(1,0,1,0));
++ p->R22.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8) ) & 0x3ffffff), _MM_SHUFFLE(1,0,1,0));
++ p->R23.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), _MM_SHUFFLE(1,0,1,0));
++ p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16) ) ), _MM_SHUFFLE(1,0,1,0));
++ /* S2x = 5 * R2x */
++ p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
++ p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
++ p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
++ p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
++ p--;
++ }
++ /* put saved info back */
++ p = &st->P[1];
++ p->R20.d[1] = (uint32_t)(r0 );
++ p->R20.d[3] = (uint32_t)(r0 >> 32);
++ p->R21.d[1] = (uint32_t)(r1 );
++ p->R21.d[3] = (uint32_t)(r1 >> 32);
++ p->R22.d[1] = (uint32_t)(r2 );
++ p->R22.d[3] = (uint32_t)(r2 >> 32);
++ p->R23.d[1] = (uint32_t)(pad0 );
++ p->R23.d[3] = (uint32_t)(pad0 >> 32);
++ p->R24.d[1] = (uint32_t)(pad1 );
++ p->R24.d[3] = (uint32_t)(pad1 >> 32);
++ /* H = [Mx,My] */
++ /* T5 = low 8 bytes of both blocks, T6 = high 8 bytes; then split into five 26-bit limbs */
++ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
++ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
++ st->H[0] = _mm_and_si128(MMASK, T5);
++ st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
++ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
++ st->H[2] = _mm_and_si128(MMASK, T5);
++ st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
++ st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
++}

++/* poly1305_blocks absorbs |bytes| of message at |m| in 64-byte steps.
++ *
++ * The accumulator st->H holds two partial sums side by side, as five 26-bit
++ * limbs per 64-bit lane. Each step computes
++ * H = H*[r^4,r^4] + [M0,M1]*[r^2,r^2] + [M2,M3]
++ * where M0..M3 are the four 16-byte blocks of the step, then carries the
++ * limbs back down to 26 bits. Any tail shorter than 64 bytes is left
++ * untouched; st->H is written back once after the loop. */
++static void
++poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, size_t bytes) {
++ const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
++ const xmmi FIVE = _mm_load_si128((xmmi*)poly1305_x64_sse2_5);
++ const xmmi HIBIT = _mm_load_si128((xmmi*)poly1305_x64_sse2_1shl128);
++ poly1305_power *p;
++ xmmi H0,H1,H2,H3,H4;
++ xmmi T0,T1,T2,T3,T4,T5,T6;
++ xmmi M0,M1,M2,M3,M4;
++ xmmi C1,C2;
++ H0 = st->H[0];
++ H1 = st->H[1];
++ H2 = st->H[2];
++ H3 = st->H[3];
++ H4 = st->H[4];
++ while (bytes >= 64) {
++ /* H *= [r^4,r^4] */
++ /* T0..T4 accumulate the five output limbs; S2x (= 5 * R2x) is used for the terms that wrap past the top limb */
++ p = &st->P[0];
++ T0 = _mm_mul_epu32(H0, p->R20.v);
++ T1 = _mm_mul_epu32(H0, p->R21.v);
++ T2 = _mm_mul_epu32(H0, p->R22.v);
++ T3 = _mm_mul_epu32(H0, p->R23.v);
++ T4 = _mm_mul_epu32(H0, p->R24.v);
++ T5 = _mm_mul_epu32(H1, p->S24.v); T6 = _mm_mul_epu32(H1, p->R20.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(H2, p->S23.v); T6 = _mm_mul_epu32(H2, p->S24.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(H3, p->S22.v); T6 = _mm_mul_epu32(H3, p->S23.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(H4, p->S21.v); T6 = _mm_mul_epu32(H4, p->S22.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(H1, p->R21.v); T6 = _mm_mul_epu32(H1, p->R22.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(H2, p->R20.v); T6 = _mm_mul_epu32(H2, p->R21.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(H3, p->S24.v); T6 = _mm_mul_epu32(H3, p->R20.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(H4, p->S23.v); T6 = _mm_mul_epu32(H4, p->S24.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(H1, p->R23.v); T4 = _mm_add_epi64(T4, T5);
++ T5 = _mm_mul_epu32(H2, p->R22.v); T4 = _mm_add_epi64(T4, T5);
++ T5 = _mm_mul_epu32(H3, p->R21.v); T4 = _mm_add_epi64(T4, T5);
++ T5 = _mm_mul_epu32(H4, p->R20.v); T4 = _mm_add_epi64(T4, T5);
++ /* H += [Mx,My]*[r^2,r^2] */
++ /* bytes 0..31 of this step: two blocks split into 26-bit limbs, top limb tagged with HIBIT */
++ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
++ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
++ M0 = _mm_and_si128(MMASK, T5);
++ M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
++ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
++ M2 = _mm_and_si128(MMASK, T5);
++ M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
++ M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
++ p = &st->P[1];
++ T5 = _mm_mul_epu32(M0, p->R20.v); T6 = _mm_mul_epu32(M0, p->R21.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(M1, p->S24.v); T6 = _mm_mul_epu32(M1, p->R20.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(M2, p->S23.v); T6 = _mm_mul_epu32(M2, p->S24.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(M3, p->S22.v); T6 = _mm_mul_epu32(M3, p->S23.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(M4, p->S21.v); T6 = _mm_mul_epu32(M4, p->S22.v); T0 = _mm_add_epi64(T0, T5); T1 = _mm_add_epi64(T1, T6);
++ T5 = _mm_mul_epu32(M0, p->R22.v); T6 = _mm_mul_epu32(M0, p->R23.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(M1, p->R21.v); T6 = _mm_mul_epu32(M1, p->R22.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(M2, p->R20.v); T6 = _mm_mul_epu32(M2, p->R21.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(M3, p->S24.v); T6 = _mm_mul_epu32(M3, p->R20.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(M4, p->S23.v); T6 = _mm_mul_epu32(M4, p->S24.v); T2 = _mm_add_epi64(T2, T5); T3 = _mm_add_epi64(T3, T6);
++ T5 = _mm_mul_epu32(M0, p->R24.v); T4 = _mm_add_epi64(T4, T5);
++ T5 = _mm_mul_epu32(M1, p->R23.v); T4 = _mm_add_epi64(T4, T5);
++ T5 = _mm_mul_epu32(M2, p->R22.v); T4 = _mm_add_epi64(T4, T5);
++ T5 = _mm_mul_epu32(M3, p->R21.v); T4 = _mm_add_epi64(T4, T5);
++ T5 = _mm_mul_epu32(M4, p->R20.v); T4 = _mm_add_epi64(T4, T5);
++ /* H += [Mx,My] */
++ /* bytes 32..63 of this step are added in without a multiplication */
++ T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 32)), _mm_loadl_epi64((xmmi *)(m + 48)));
++ T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 40)), _mm_loadl_epi64((xmmi *)(m + 56)));
++ M0 = _mm_and_si128(MMASK, T5);
++ M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
++ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
++ M2 = _mm_and_si128(MMASK, T5);
++ M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
++ M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
++ T0 = _mm_add_epi64(T0, M0);
++ T1 = _mm_add_epi64(T1, M1);
++ T2 = _mm_add_epi64(T2, M2);
++ T3 = _mm_add_epi64(T3, M3);
++ T4 = _mm_add_epi64(T4, M4);
++ /* reduce */
++ /* two interleaved carry chains over the 26-bit limbs; the carry out of limb 4 re-enters limb 0 multiplied by 5 */
++ C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2);
++ C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
++ C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2);
++ C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1);
++ /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
++ H0 = T0;
++ H1 = T1;
++ H2 = T2;
++ H3 = T3;
++ H4 = T4;
++ m += 64;
++ bytes -= 64;
++ }
++ st->H[0] = H0;
++ st->H[1] = H1;
++ st->H[2] = H2;
++ st->H[3] = H3;
++ st->H[4] = H4;
++}

++static size_t

++poly1305_combine(poly1305_state_internal *st, const uint8_t *m, size_t bytes) {

++ const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);

++ const xmmi HIBIT = _mm_load_si128((xmmi*)poly1305_x64_sse2_1shl128);

++ const xmmi FIVE = _mm_load_si128((xmmi*)poly1305_x64_sse2_5);

++ poly1305_power *p;

++ xmmi H0,H1,H2,H3,H4;

++ xmmi M0,M1,M2,M3,M4;

++ xmmi T0,T1,T2,T3,T4,T5,T6;

++ xmmi C1,C2;

++ uint64_t r0,r1,r2;

++ uint64_t t0,t1,t2,t3,t4;

++ uint64_t c;

++ size_t consumed = 0;

++ H0 = st->H[0];

++ H1 = st->H[1];

++ H2 = st->H[2];

++ H3 = st->H[3];

++ H4 = st->H[4];

++ /* p = [r^2,r^2] */

++ p = &st->P[1];

++ if (bytes >= 32) {

++ /* H *= [r^2,r^2] */