mozilla/security/nss/lib/freebl/ecl/ecp_256_32.c - Issue 12042100: Add an optimized 32-bit implementation of the NIST P-256 elliptic curve.

Side by Side Diff: mozilla/security/nss/lib/freebl/ecl/ecp_256_32.c

Issue 12042100: Add an optimized 32-bit implementation of the NIST P-256 elliptic curve. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/nss/

Patch Set: Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /* This Source Code Form is subject to the terms of the Mozilla Public

	2 * License, v. 2.0. If a copy of the MPL was not distributed with this

	3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

	4

	5 /* A 32-bit implementation of the NIST P-256 elliptic curve. */
	wtc 2013/01/25 02:32:49 Lines 1-5 used to be one comment block. I separate Lines 1-5 used to be one comment block. I separated it into a license header and a comment. I also removed <stdint.h>, <stdio.h>, and <stdlib.h> and kept only <string.h> because the only standard C library functions I saw in this file are memcpy, memcmp, memset.
	6

	7 #include <string.h>

	8

	9 #include "prtypes.h"

	10 #include "mpi.h"

	11 #include "mpi-priv.h"

	12 #include "ecp.h"

	13

	14 typedef PRUint8 u8;

	15 typedef PRUint32 u32;

	16 typedef PRInt32 s32;

	17 typedef PRUint64 u64;

	18

	19 /* Our field elements are represented as nine, unsigned 32-bit words. Freebl's

	20 * MPI library calls them digits, but here they are called limbs, which is

	21 * GMP's terminology.

	22 *

	23 * The value of an felem (field element) is:

	24 * x[0] + (x[1] * 2*29) + (x[2] 2*57) + ... + (x[8] 2**228)

	25 *

	26 * That is, each limb is alternately 29 or 28-bits wide in little-endian

	27 * order.

	28 *

	29 * This means that an felem hits 2257, rather than 2256 as we would like. A

	30 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems

	31 * when multiplying as terms end up one bit short of a limb which would require

	32 * much bit-shifting to correct.

	33 *

	34 * Finally, the values stored in an felem are in Montgomery form. So the value

	35 * \|y\| is stored as (yR) mod p, where p is the P-256 prime and R is 2*257.

	36 */

	37 typedef u32 limb;

	38 #define NLIMBS 9

	39 typedef limb felem[NLIMBS];

	40

	41 static const limb kBottom28Bits = 0xfffffff;

	42 static const limb kBottom29Bits = 0x1fffffff;

	43

	44 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and

	45 * 28-bit words.

	46 */

	47 static const felem kOne = {

	48 2, 0, 0, 0xffff800,

	49 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,

	50 0

	51 };

	52 static const felem kZero = {0};

	53 static const felem kP = {

	54 0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,

	55 0, 0, 0x200000, 0xf000000,

	56 0xfffffff

	57 };

	58 static const felem k2P = {

	59 0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,

	60 0, 0, 0x400000, 0xe000000,

	61 0x1fffffff

	62 };

	63

	64 /* kPrecomputed contains precomputed values to aid the calculation of scalar

	65 * multiples of the base point, G. It's actually two, equal length, tables

	66 * concatenated.

	67 *

	68 * The first table contains (x,y) felem pairs for 16 multiples of the base

	69 * point, G.

	70 *

	71 * Index \| Index (binary) \| Value

	72 * 0 \| 0000 \| 0G (all zeros, omitted)

	73 * 1 \| 0001 \| G

	74 * 2 \| 0010 \| 2**64G

	75 * 3 \| 0011 \| 2**64G + G

	76 * 4 \| 0100 \| 2**128G

	77 * 5 \| 0101 \| 2**128G + G

	78 * 6 \| 0110 \| 2128G + 264G

	79 * 7 \| 0111 \| 2128G + 264G + G

	80 * 8 \| 1000 \| 2**192G

	81 * 9 \| 1001 \| 2**192G + G

	82 * 10 \| 1010 \| 2192G + 264G

	83 * 11 \| 1011 \| 2192G + 264G + G

	84 * 12 \| 1100 \| 2192G + 2128G

	85 * 13 \| 1101 \| 2192G + 2128G + G

	86 * 14 \| 1110 \| 2192G + 2128G + 2**64G

	87 * 15 \| 1111 \| 2192G + 2128G + 2**64G + G

	88 *

	89 * The second table follows the same style, but the terms are 2**32G,

	90 * 296G, 2160G, 2**224G.

	91 *

	92 * This is ~2KB of data.

	93 */

	94 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {

	95 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7e dc, 0xd4a6eab, 0x3120bee,

	96 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba 21, 0x14b10bb, 0xae3fe3,

	97 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe4907 3, 0x3fa36cc, 0x5ebcd2c,

	98 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea1244 6, 0xe1ade1e, 0xec91f22,

	99 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109 , 0xa267a00, 0xb57c050,

	100 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0 x7d6dee7, 0x2976e4b,

	101 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a 5a9, 0x843a649, 0xc3ab0fa,

	102 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11 , 0x58c43df, 0xf423fc2,

	103 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db4 0f, 0x83e277d, 0xb0dd609,

	104 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5 , 0xe10c9e, 0x33ab581,

	105 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f , 0x48764cd, 0x76dbcca,

	106 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b2 0, 0x4ba3173, 0xc168c33,

	107 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0 , 0x65dd7ff, 0x3a1e4f6,

	108 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f07 7, 0xa6add89, 0x4894acd,

	109 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,

	110 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c , 0xda0cf5b, 0x812e881,

	111 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51 , 0xc22be3e, 0xe35e65a,

	112 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9 , 0x1c5a839, 0x47a1e26,

	113 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c5 02, 0x2f32042, 0xa17769b,

	114 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a 02, 0x3fc93, 0x5620023,

	115 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c , 0x407f75c, 0xbaab133,

	116 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea 7, 0x3293ac0, 0xcdc98aa,

	117 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,

	118 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72 , 0x73e1c35, 0xee70fbc,

	119 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85 , 0x27de188, 0x66f70b8,

	120 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae 914, 0x2f3ec51, 0x3826b59,

	121 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x 823d9d2, 0x8213f39,

	122 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4 a, 0xf5ddc3d, 0x3786689,

	123 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a72 9, 0x4be3499, 0x52b23aa,

	124 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb0480 35, 0xe31de66, 0xc6ecaa3,

	125 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a752 9, 0xcb7beb1, 0xb2a78a1,

	126 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff6 58, 0xe3d6511, 0xc7d76f,

	127 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,

	128 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d3241 1, 0xb04a838, 0xd760d2d,

	129 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11 e, 0x20bca9a, 0x66f496b,

	130 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,

	131 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56 ff, 0x65ef930, 0x21dc4a,

	132 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15 f, 0x624e62e, 0xa90ae2f,

	133 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522 b, 0xdc78583, 0x40eeabb,

	134 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef3 4, 0xae2a960, 0x91b8bdc,

	135 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0 x2413c8e, 0x5425bf9,

	136 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633 , 0x7c91952, 0xd806dce,

	137 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef7 3, 0x8956f34, 0xe4b5cf2,

	138 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7 , 0x627b614, 0x7371cca,

	139 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc 9, 0x9c19bf2, 0x5882229,

	140 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b 3, 0xe85ff25, 0x408ef57,

	141 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113 , 0xa4a1769, 0x11fbc6c,

	142 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b 7, 0x4acbad9, 0x5efc5fa,

	143 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc , 0x7bf0fa9, 0x957651,

	144 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,

	145 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c1 2d, 0xf20bd46, 0x1951fa7,

	146 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74 , 0x99bb618, 0x2db944c,

	147 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e7477 9, 0x576138, 0x9587927,

	148 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782 d, 0xfc72e0b, 0x701b298,

	149 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5 d8, 0xf858d3a, 0x942eea8,

	150 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a 1, 0x8395659, 0x52ed4e2,

	151 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c 0, 0x6bdf55a, 0x4e4457d,

	152 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747 b, 0x878558d, 0x7d29aa4,

	153 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8,

	154 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78,

	155 };

	156

	157 /* Field element operations:

	158 */

	159

	160 /* NON_ZERO_TO_ALL_ONES returns:

	161 * 0xffffffff for 0 < x <= 2**31

	162 * 0 for x == 0 or x > 2**31.

	163 *

	164 * This macro assumes that right-shifting a signed number shifts in the MSB on

	165 * the left. This is not ensured by the C standard, but is true on the CPUs

	166 * that we're targetting with this code (x86 and ARM).

	167 */

	168 #define NON_ZERO_TO_ALL_ONES(x) (~((u32) (((s32) ((x)-1)) >> 31)))
	Ryan Sleevi 2013/01/25 03:50:36 Just because this is slipping into more and more c Just because this is slipping into more and more code, does it make since to place this standard-but-undefined behaviour somewhere just as a gatekeeper for 'things might go wrong here'? Are we relying on the fact that a unittest should fail, were this to not be true? Should their be a bitsmear alternative 'just in case'? eg: x &= 0x80000000; x \|= x >> 16; x \|= x >> 8; x \|= x >> 4; x \|= x >> 2; x \|= x >> 1; agl 2013/01/25 16:04:03 I'd love to put it in nspr or somewhere similar. P Show quoted text On 2013/01/25 03:50:36, Ryan Sleevi wrote: > Just because this is slipping into more and more code, does it make since to > place this standard-but-undefined behaviour somewhere just as a gatekeeper for > 'things might go wrong here'? Are we relying on the fact that a unittest should > fail, were this to not be true? I'd love to put it in nspr or somewhere similar. Preferably with an #if defined(__x86_64__) \|\| defined(__whatever_arm_uses__) with the bitsmeared version as an alternative. However, in the interests of keeping the patches limited so far I've been copying it as it's only a one-liner and somewhat obscure. wtc 2013/01/31 16:59:02 After I worked out the detail (including plugging Show quoted text On 2013/01/25 03:50:36, Ryan Sleevi wrote: > > Should their be a bitsmear alternative 'just in case'? > > eg: > x &= 0x80000000; x \|= x >> 16; > x \|= x >> 8; x \|= x >> 4; > x \|= x >> 2; x \|= x >> 1; After I worked out the detail (including plugging in Ryan's constantTimeCondition code from the NSS RSA-OAEP patch), I found that the result is simply u32 NonZeroToAllOnes(u32 x) { x = (x - 1) >> 31; return x - 1; } or in macro form: #define NON_ZERO_TO_ALL_ONES(x) ((((x)-1) >> 31) - 1) This should be equally fast. I hope I didn't make a mistake.
	169

	170 /* felem_reduce_carry adds a multiple of p in order to cancel \|carry\|,

	171 * which is a term at 2**257.

	172 *

	173 * On entry: carry < 23, inout[0,2,...] < 229, inout[1,3,...] < 2**28.

	174 * On exit: inout[0,2,..] < 230, inout[1,3,...] < 229.

	175 */

	176 static void felem_reduce_carry(felem inout, limb carry)

	177 {

	178 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);

	179

	180 inout[0] += carry << 1;

	181 inout[3] += 0x10000000 & carry_mask;

	182 /* carry < 23 thus (carry << 11) < 214 and we added 2**28 in the

	183 * previous line therefore this doesn't underflow.

	184 */

	185 inout[3] -= carry << 11;

	186 inout[4] += (0x20000000 - 1) & carry_mask;

	187 inout[5] += (0x10000000 - 1) & carry_mask;

	188 inout[6] += (0x20000000 - 1) & carry_mask;

	189 inout[6] -= carry << 22;

	190 /* This may underflow if carry is non-zero but, if so, we'll fix it in the

	191 * next line.

	192 */

	193 inout[7] -= 1 & carry_mask;

	194 inout[7] += carry << 25;

	195 }

	196

	197 /* felem_sum sets out = in+in2.

	198 *

	199 * On entry, in[i]+in2[i] must not overflow a 32-bit word.

	200 * On exit: out[0,2,...] < 230, out[1,3,...] < 229

	201 */

	202 static void felem_sum(felem out, const felem in, const felem in2)

	203 {

	204 limb carry = 0;

	205 unsigned int i;

	206 for (i = 0;; i++) {

	207 out[i] = in[i] + in2[i];

	208 out[i] += carry;

	209 carry = out[i] >> 29;

	210 out[i] &= kBottom29Bits;

	211

	212 i++;

	213 if (i == NLIMBS)

	214 break;

	215

	216 out[i] = in[i] + in2[i];

	217 out[i] += carry;

	218 carry = out[i] >> 28;

	219 out[i] &= kBottom28Bits;

	220 }

	221

	222 felem_reduce_carry(out, carry);

	223 }

	224

	225 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)

	226 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)

	227 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)

	228 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)

	229 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)

	230 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)

	231

	232 /* zero31 is 0 mod p.

	233 */

	234 static const felem zero31 = {

	235 two31m3, two30m2, two31m2, two30p13m2,

	236 two31m2, two30m2, two31p24m2, two30m27m2,

	237 two31m2

	238 };

	239

	240 /* felem_diff sets out = in-in2.

	241 *

	242 * On entry: in[0,2,...] < 230, in[1,3,...] < 229 and

	243 * in2[0,2,...] < 230, in2[1,3,...] < 229.

	244 * On exit: out[0,2,...] < 230, out[1,3,...] < 229.

	245 */

	246 static void felem_diff(felem out, const felem in, const felem in2)

	247 {

	248 limb carry = 0;

	249 unsigned int i;

	250

	251 for (i = 0;; i++) {

	252 out[i] = in[i] - in2[i];

	253 out[i] += zero31[i];

	254 out[i] += carry;

	255 carry = out[i] >> 29;

	256 out[i] &= kBottom29Bits;

	257

	258 i++;

	259 if (i == NLIMBS)

	260 break;

	261

	262 out[i] = in[i] - in2[i];

	263 out[i] += zero31[i];

	264 out[i] += carry;

	265 carry = out[i] >> 28;

	266 out[i] &= kBottom28Bits;

	267 }

	268

	269 felem_reduce_carry(out, carry);

	270 }

	271

	272 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words

	273 * with the same 29,28,... bit positions as an felem.

	274 *

	275 * The values in felems are in Montgomery form: xR mod p where R = 2*257.

	276 * Since we just multiplied two Montgomery values together, the result is

	277 * xyR*R mod p. We wish to divide by R in order for the result also to be

	278 * in Montgomery form.

	279 *

	280 * On entry: tmp[i] < 2**64

	281 * On exit: out[0,2,...] < 230, out[1,3,...] < 229

	282 */

	283 static void felem_reduce_degree(felem out, u64 tmp[17])

	284 {

	285 /* The following table may be helpful when reading this code:

	286 *

	287 * Limb number: 0 \| 1 \| 2 \| 3 \| 4 \| 5 \| 6 \| 7 \| 8 \| 9 \| 10...

	288 * Width (bits): 29\| 28\| 29\| 28\| 29\| 28\| 29\| 28\| 29\| 28\| 29

	289 * Start bit: 0 \| 29\| 57\| 86\|114\|143\|171\|200\|228\|257\|285

	290 * (odd phase): 0 \| 28\| 57\| 85\|114\|142\|171\|199\|228\|256\|285

	291 */

	292 limb tmp2[18], carry, x, xMask;

	293 unsigned int i;

	294

	295 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an

	296 * felem. So the top of an element of tmp might overlap with another

	297 * element two positions down. The following loop eliminates this

	298 * overlap.

	299 */

	300 tmp2[0] = tmp[0] & kBottom29Bits;

	301

	302 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try

	303 * and hint to the compiler that it can do a single-word shift by selecting

	304 * the right register rather than doing a double-word shift and truncating

	305 * afterwards.

	306 */

	307 tmp2[1] = ((limb) tmp[0]) >> 29;

	308 tmp2[1] \|= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits;

	309 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;

	310 carry = tmp2[1] >> 28;

	311 tmp2[1] &= kBottom28Bits;

	312

	313 for (i = 2; i < 17; i++) {

	314 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;

	315 tmp2[i] += ((limb) (tmp[i - 1])) >> 28;

	316 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits;

	317 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;

	318 tmp2[i] += carry;

	319 carry = tmp2[i] >> 29;

	320 tmp2[i] &= kBottom29Bits;

	321

	322 i++;

	323 if (i == 17)

	324 break;

	325 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;

	326 tmp2[i] += ((limb) (tmp[i - 1])) >> 29;

	327 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits;

	328 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;

	329 tmp2[i] += carry;

	330 carry = tmp2[i] >> 28;

	331 tmp2[i] &= kBottom28Bits;

	332 }

	333

	334 tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25;

	335 tmp2[17] += ((limb) (tmp[16])) >> 29;

	336 tmp2[17] += (((limb) (tmp[16] >> 32)) << 3);

	337 tmp2[17] += carry;

	338

	339 /* Montgomery elimination of terms:

	340 *

	341 * Since R is 2**257, we can divide by R with a bitwise shift if we can

	342 * ensure that the right-most 257 bits are all zero. We can make that true

	343 * by adding multiplies of p without affecting the value.

	344 *

	345 * So we eliminate limbs from right to left. Since the bottom 29 bits of p

	346 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.

	347 * We can do that for 8 further limbs and then right shift to eliminate the

	348 * extra factor of R.

	349 */

	350 for (i = 0;; i += 2) {

	351 tmp2[i + 1] += tmp2[i] >> 29;

	352 x = tmp2[i] & kBottom29Bits;

	353 xMask = NON_ZERO_TO_ALL_ONES(x);

	354 tmp2[i] = 0;

	355

	356 /* The bounds calculations for this loop are tricky. Each iteration of

	357 * the loop eliminates two words by adding values to words to their

	358 * right.

	359 *

	360 * The following table contains the amounts added to each word (as an

	361 * offset from the value of i at the top of the loop). The amounts are

	362 * accounted for from the first and second half of the loop separately

	363 * and are written as, for example, 28 to mean a value <2**28.

	364 *

	365 * Word: 3 4 5 6 7 8 9 10

	366 * Added in top half: 28 11 29 21 29 28

	367 * 28 29

	368 * 29

	369 * Added in bottom half: 29 10 28 21 28 28

	370 * 29

	371 *

	372 * The value that is currently offset 7 will be offset 5 for the next

	373 * iteration and then offset 3 for the iteration after that. Therefore

	374 * the total value added will be the values added at 7, 5 and 3.

	375 *

	376 * The following table accumulates these values. The sums at the bottom

	377 * are written as, for example, 29+28, to mean a value < 229+228.

	378 *

	379 * Word: 3 4 5 6 7 8 9 10 11 12 13

	380 * 28 11 10 29 21 29 28 28 28 28 28

	381 * 29 28 11 28 29 28 29 28 29 28

	382 * 29 28 21 21 29 21 29 21

	383 * 10 29 28 21 28 21 28

	384 * 28 29 28 29 28 29 28

	385 * 11 10 29 10 29 10

	386 * 29 28 11 28 11

	387 * 29 29

	388 * --------------------------------------------

	389 * 30+ 31+ 30+ 31+ 30+

	390 * 28+ 29+ 28+ 29+ 21+

	391 * 21+ 28+ 21+ 28+ 10

	392 * 10 21+ 10 21+

	393 * 11 11

	394 *

	395 * So the greatest amount is added to tmp2[10] and tmp2[12]. If

	396 * tmp2[10/12] has an initial value of <2**29, then the maximum value

	397 * will be < 231 + 230 + 228 + 221 + 211, which is < 232,

	398 * as required.

	399 */

	400 tmp2[i + 3] += (x << 10) & kBottom28Bits;

	401 tmp2[i + 4] += (x >> 18);

	402

	403 tmp2[i + 6] += (x << 21) & kBottom29Bits;

	404 tmp2[i + 7] += x >> 8;

	405

	406 /* At position 200, which is the starting bit position for word 7, we

	407 * have a factor of 0xf000000 = 228 - 224.

	408 */

	409 tmp2[i + 7] += 0x10000000 & xMask;

	410 /* Word 7 is 28 bits wide, so the 2*28 term exactly hits word 8. /

	411 tmp2[i + 8] += (x - 1) & xMask;

	412 tmp2[i + 7] -= (x << 24) & kBottom28Bits;

	413 tmp2[i + 8] -= x >> 4;

	414

	415 tmp2[i + 8] += 0x20000000 & xMask;

	416 tmp2[i + 8] -= x;

	417 tmp2[i + 8] += (x << 28) & kBottom29Bits;

	418 tmp2[i + 9] += ((x >> 1) - 1) & xMask;

	419

	420 if (i+1 == NLIMBS)

	421 break;

	422 tmp2[i + 2] += tmp2[i + 1] >> 28;

	423 x = tmp2[i + 1] & kBottom28Bits;

	424 xMask = NON_ZERO_TO_ALL_ONES(x);

	425 tmp2[i + 1] = 0;

	426

	427 tmp2[i + 4] += (x << 11) & kBottom29Bits;

	428 tmp2[i + 5] += (x >> 18);

	429

	430 tmp2[i + 7] += (x << 21) & kBottom28Bits;

	431 tmp2[i + 8] += x >> 7;

	432

	433 /* At position 199, which is the starting bit of the 8th word when

	434 * dealing with a context starting on an odd word, we have a factor of

	435 * 0x1e000000 = 229 - 225. Since we have not updated i, the 8th

	436 * word from i+1 is i+8.

	437 */

	438 tmp2[i + 8] += 0x20000000 & xMask;

	439 tmp2[i + 9] += (x - 1) & xMask;

	440 tmp2[i + 8] -= (x << 25) & kBottom29Bits;

	441 tmp2[i + 9] -= x >> 4;

	442

	443 tmp2[i + 9] += 0x10000000 & xMask;

	444 tmp2[i + 9] -= x;

	445 tmp2[i + 10] += (x - 1) & xMask;

	446 }

	447

	448 /* We merge the right shift with a carry chain. The words above 2**257 have

	449 * widths of 28,29,... which we need to correct when copying them down.

	450 */

	451 carry = 0;

	452 for (i = 0; i < 8; i++) {

	453 /* The maximum value of tmp2[i + 9] occurs on the first iteration and

	454 * is < 230+229+228. Adding 229 (from tmp2[i + 10]) is

	455 * therefore safe.

	456 */

	457 out[i] = tmp2[i + 9];

	458 out[i] += carry;

	459 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;

	460 carry = out[i] >> 29;

	461 out[i] &= kBottom29Bits;

	462

	463 i++;

	464 out[i] = tmp2[i + 9] >> 1;

	465 out[i] += carry;

	466 carry = out[i] >> 28;

	467 out[i] &= kBottom28Bits;

	468 }

	469

	470 out[8] = tmp2[17];

	471 out[8] += carry;

	472 carry = out[8] >> 29;

	473 out[8] &= kBottom29Bits;

	474

	475 felem_reduce_carry(out, carry);

	476 }

	477

	478 /* felem_square sets out=in*in.

	479 *

	480 * On entry: in[0,2,...] < 230, in[1,3,...] < 229.

	481 * On exit: out[0,2,...] < 230, out[1,3,...] < 229.

	482 */

	483 static void felem_square(felem out, const felem in)

	484 {

	485 u64 tmp[17];

	486

	487 tmp[0] = ((u64) in[0]) * in[0];

	488 tmp[1] = ((u64) in[0]) * (in[1] << 1);

	489 tmp[2] = ((u64) in[0]) * (in[2] << 1) +

	490 ((u64) in[1]) * (in[1] << 1);

	491 tmp[3] = ((u64) in[0]) * (in[3] << 1) +

	492 ((u64) in[1]) * (in[2] << 1);

	493 tmp[4] = ((u64) in[0]) * (in[4] << 1) +

	494 ((u64) in[1]) * (in[3] << 2) +

	495 ((u64) in[2]) * in[2];

	496 tmp[5] = ((u64) in[0]) * (in[5] << 1) +

	497 ((u64) in[1]) * (in[4] << 1) +

	498 ((u64) in[2]) * (in[3] << 1);

	499 tmp[6] = ((u64) in[0]) * (in[6] << 1) +

	500 ((u64) in[1]) * (in[5] << 2) +

	501 ((u64) in[2]) * (in[4] << 1) +

	502 ((u64) in[3]) * (in[3] << 1);

	503 tmp[7] = ((u64) in[0]) * (in[7] << 1) +

	504 ((u64) in[1]) * (in[6] << 1) +

	505 ((u64) in[2]) * (in[5] << 1) +

	506 ((u64) in[3]) * (in[4] << 1);

	507 /* tmp[8] has the greatest value of 261 + 260 + 261 + 260 + 2**60,

	508 * which is < 2**64 as required.

	509 */

	510 tmp[8] = ((u64) in[0]) * (in[8] << 1) +

	511 ((u64) in[1]) * (in[7] << 2) +

	512 ((u64) in[2]) * (in[6] << 1) +

	513 ((u64) in[3]) * (in[5] << 2) +

	514 ((u64) in[4]) * in[4];

	515 tmp[9] = ((u64) in[1]) * (in[8] << 1) +

	516 ((u64) in[2]) * (in[7] << 1) +

	517 ((u64) in[3]) * (in[6] << 1) +

	518 ((u64) in[4]) * (in[5] << 1);

	519 tmp[10] = ((u64) in[2]) * (in[8] << 1) +

	520 ((u64) in[3]) * (in[7] << 2) +

	521 ((u64) in[4]) * (in[6] << 1) +

	522 ((u64) in[5]) * (in[5] << 1);

	523 tmp[11] = ((u64) in[3]) * (in[8] << 1) +

	524 ((u64) in[4]) * (in[7] << 1) +

	525 ((u64) in[5]) * (in[6] << 1);

	526 tmp[12] = ((u64) in[4]) * (in[8] << 1) +

	527 ((u64) in[5]) * (in[7] << 2) +

	528 ((u64) in[6]) * in[6];

	529 tmp[13] = ((u64) in[5]) * (in[8] << 1) +

	530 ((u64) in[6]) * (in[7] << 1);

	531 tmp[14] = ((u64) in[6]) * (in[8] << 1) +

	532 ((u64) in[7]) * (in[7] << 1);

	533 tmp[15] = ((u64) in[7]) * (in[8] << 1);

	534 tmp[16] = ((u64) in[8]) * in[8];

	535

	536 felem_reduce_degree(out, tmp);

	537 }

	538

	539 /* felem_mul sets out=in*in2.

	540 *

	541 * On entry: in[0,2,...] < 230, in[1,3,...] < 229 and

	542 * in2[0,2,...] < 230, in2[1,3,...] < 229.

	543 * On exit: out[0,2,...] < 230, out[1,3,...] < 229.

	544 */

	545 static void felem_mul(felem out, const felem in, const felem in2)

	546 {

	547 u64 tmp[17];

	548

	549 tmp[0] = ((u64) in[0]) * in2[0];

	550 tmp[1] = ((u64) in[0]) * (in2[1] << 0) +

	551 ((u64) in[1]) * (in2[0] << 0);

	552 tmp[2] = ((u64) in[0]) * (in2[2] << 0) +

	553 ((u64) in[1]) * (in2[1] << 1) +

	554 ((u64) in[2]) * (in2[0] << 0);

	555 tmp[3] = ((u64) in[0]) * (in2[3] << 0) +

	556 ((u64) in[1]) * (in2[2] << 0) +

	557 ((u64) in[2]) * (in2[1] << 0) +

	558 ((u64) in[3]) * (in2[0] << 0);

	559 tmp[4] = ((u64) in[0]) * (in2[4] << 0) +

	560 ((u64) in[1]) * (in2[3] << 1) +

	561 ((u64) in[2]) * (in2[2] << 0) +

	562 ((u64) in[3]) * (in2[1] << 1) +

	563 ((u64) in[4]) * (in2[0] << 0);

	564 tmp[5] = ((u64) in[0]) * (in2[5] << 0) +

	565 ((u64) in[1]) * (in2[4] << 0) +

	566 ((u64) in[2]) * (in2[3] << 0) +

	567 ((u64) in[3]) * (in2[2] << 0) +

	568 ((u64) in[4]) * (in2[1] << 0) +

	569 ((u64) in[5]) * (in2[0] << 0);

	570 tmp[6] = ((u64) in[0]) * (in2[6] << 0) +

	571 ((u64) in[1]) * (in2[5] << 1) +

	572 ((u64) in[2]) * (in2[4] << 0) +

	573 ((u64) in[3]) * (in2[3] << 1) +

	574 ((u64) in[4]) * (in2[2] << 0) +

	575 ((u64) in[5]) * (in2[1] << 1) +

	576 ((u64) in[6]) * (in2[0] << 0);

	577 tmp[7] = ((u64) in[0]) * (in2[7] << 0) +

	578 ((u64) in[1]) * (in2[6] << 0) +

	579 ((u64) in[2]) * (in2[5] << 0) +

	580 ((u64) in[3]) * (in2[4] << 0) +

	581 ((u64) in[4]) * (in2[3] << 0) +

	582 ((u64) in[5]) * (in2[2] << 0) +

	583 ((u64) in[6]) * (in2[1] << 0) +

	584 ((u64) in[7]) * (in2[0] << 0);

	585 /* tmp[8] has the greatest value but doesn't overflow. See logic in

	586 * felem_square.

	587 */

	588 tmp[8] = ((u64) in[0]) * (in2[8] << 0) +

	589 ((u64) in[1]) * (in2[7] << 1) +

	590 ((u64) in[2]) * (in2[6] << 0) +

	591 ((u64) in[3]) * (in2[5] << 1) +

	592 ((u64) in[4]) * (in2[4] << 0) +

	593 ((u64) in[5]) * (in2[3] << 1) +

	594 ((u64) in[6]) * (in2[2] << 0) +

	595 ((u64) in[7]) * (in2[1] << 1) +

	596 ((u64) in[8]) * (in2[0] << 0);

	597 tmp[9] = ((u64) in[1]) * (in2[8] << 0) +

	598 ((u64) in[2]) * (in2[7] << 0) +

	599 ((u64) in[3]) * (in2[6] << 0) +

	600 ((u64) in[4]) * (in2[5] << 0) +

	601 ((u64) in[5]) * (in2[4] << 0) +

	602 ((u64) in[6]) * (in2[3] << 0) +

	603 ((u64) in[7]) * (in2[2] << 0) +

	604 ((u64) in[8]) * (in2[1] << 0);

	605 tmp[10] = ((u64) in[2]) * (in2[8] << 0) +

	606 ((u64) in[3]) * (in2[7] << 1) +

	607 ((u64) in[4]) * (in2[6] << 0) +

	608 ((u64) in[5]) * (in2[5] << 1) +

	609 ((u64) in[6]) * (in2[4] << 0) +

	610 ((u64) in[7]) * (in2[3] << 1) +

	611 ((u64) in[8]) * (in2[2] << 0);

	612 tmp[11] = ((u64) in[3]) * (in2[8] << 0) +

	613 ((u64) in[4]) * (in2[7] << 0) +

	614 ((u64) in[5]) * (in2[6] << 0) +

	615 ((u64) in[6]) * (in2[5] << 0) +

	616 ((u64) in[7]) * (in2[4] << 0) +

	617 ((u64) in[8]) * (in2[3] << 0);

	618 tmp[12] = ((u64) in[4]) * (in2[8] << 0) +

	619 ((u64) in[5]) * (in2[7] << 1) +

	620 ((u64) in[6]) * (in2[6] << 0) +

	621 ((u64) in[7]) * (in2[5] << 1) +

	622 ((u64) in[8]) * (in2[4] << 0);

	623 tmp[13] = ((u64) in[5]) * (in2[8] << 0) +

	624 ((u64) in[6]) * (in2[7] << 0) +

	625 ((u64) in[7]) * (in2[6] << 0) +

	626 ((u64) in[8]) * (in2[5] << 0);

	627 tmp[14] = ((u64) in[6]) * (in2[8] << 0) +

	628 ((u64) in[7]) * (in2[7] << 1) +

	629 ((u64) in[8]) * (in2[6] << 0);

	630 tmp[15] = ((u64) in[7]) * (in2[8] << 0) +

	631 ((u64) in[8]) * (in2[7] << 0);

	632 tmp[16] = ((u64) in[8]) * (in2[8] << 0);

	633

	634 felem_reduce_degree(out, tmp);

	635 }

	636

	637 static void felem_assign(felem out, const felem in)

	638 {

	639 memcpy(out, in, sizeof(felem));

	640 }

	641

	642 /* felem_inv calculates \|out\| = \|in\|^{-1}

	643 *

	644 * Based on Fermat's Little Theorem:

	645 * a^p = a (mod p)

	646 * a^{p-1} = 1 (mod p)

	647 * a^{p-2} = a^{-1} (mod p)

	648 */

	649 static void felem_inv(felem out, const felem in)

	650 {

	651 felem ftmp, ftmp2;

	652 /* each e_I will hold \|in\|^{2^I - 1} */

	653 felem e2, e4, e8, e16, e32, e64;

	654 unsigned int i;

	655

	656 felem_square(ftmp, in); /* 2^1 */

	657 felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */

	658 felem_assign(e2, ftmp);

	659 felem_square(ftmp, ftmp); /* 2^3 - 2^1 */

	660 felem_square(ftmp, ftmp); /* 2^4 - 2^2 */

	661 felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */

	662 felem_assign(e4, ftmp);

	663 felem_square(ftmp, ftmp); /* 2^5 - 2^1 */

	664 felem_square(ftmp, ftmp); /* 2^6 - 2^2 */

	665 felem_square(ftmp, ftmp); /* 2^7 - 2^3 */

	666 felem_square(ftmp, ftmp); /* 2^8 - 2^4 */

	667 felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */

	668 felem_assign(e8, ftmp);

	669 for (i = 0; i < 8; i++) {

	670 felem_square(ftmp, ftmp);

	671 } /* 2^16 - 2^8 */

	672 felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */

	673 felem_assign(e16, ftmp);

	674 for (i = 0; i < 16; i++) {

	675 felem_square(ftmp, ftmp);

	676 } /* 2^32 - 2^16 */

	677 felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */

	678 felem_assign(e32, ftmp);

	679 for (i = 0; i < 32; i++) {

	680 felem_square(ftmp, ftmp);

	681 } /* 2^64 - 2^32 */

	682 felem_assign(e64, ftmp);

	683 felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */

	684 for (i = 0; i < 192; i++) {

	685 felem_square(ftmp, ftmp);

	686 } /* 2^256 - 2^224 + 2^192 */

	687

	688 felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */

	689 for (i = 0; i < 16; i++) {

	690 felem_square(ftmp2, ftmp2);

	691 } /* 2^80 - 2^16 */

	692 felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */

	693 for (i = 0; i < 8; i++) {

	694 felem_square(ftmp2, ftmp2);

	695 } /* 2^88 - 2^8 */

	696 felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */

	697 for (i = 0; i < 4; i++) {

	698 felem_square(ftmp2, ftmp2);

	699 } /* 2^92 - 2^4 */

	700 felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */

	701 felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */

	702 felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */

	703 felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */

	704 felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */

	705 felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */

	706 felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */

	707

	708 felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */

	709 }

	710

	711 /* felem_scalar_3 sets out=3*out.

	712 *

	713 * On entry: out[0,2,...] < 230, out[1,3,...] < 229.

	714 * On exit: out[0,2,...] < 230, out[1,3,...] < 229.

	715 */

	716 static void felem_scalar_3(felem out)

	717 {

	718 limb carry = 0;

	719 unsigned int i;

	720

	721 for (i = 0;; i++) {

	722 out[i] *= 3;

	723 out[i] += carry;

	724 carry = out[i] >> 29;

	725 out[i] &= kBottom29Bits;

	726

	727 i++;

	728 if (i == NLIMBS)

	729 break;

	730

	731 out[i] *= 3;

	732 out[i] += carry;

	733 carry = out[i] >> 28;

	734 out[i] &= kBottom28Bits;

	735 }

	736

	737 felem_reduce_carry(out, carry);

	738 }

	739

	740 /* felem_scalar_4 sets out=4*out.

	741 *

	742 * On entry: out[0,2,...] < 230, out[1,3,...] < 229.

	743 * On exit: out[0,2,...] < 230, out[1,3,...] < 229.

	744 */

	745 static void felem_scalar_4(felem out)

	746 {

	747 limb carry = 0, next_carry;

	748 unsigned int i;

	749

	750 for (i = 0;; i++) {

	751 next_carry = out[i] >> 27;

	752 out[i] <<= 2;

	753 out[i] &= kBottom29Bits;

	754 out[i] += carry;

	755 carry = next_carry + (out[i] >> 29);

	756 out[i] &= kBottom29Bits;

	757

	758 i++;

	759 if (i == NLIMBS)

	760 break;

	761 next_carry = out[i] >> 26;

	762 out[i] <<= 2;

	763 out[i] &= kBottom28Bits;

	764 out[i] += carry;

	765 carry = next_carry + (out[i] >> 28);

	766 out[i] &= kBottom28Bits;

	767 }

	768

	769 felem_reduce_carry(out, carry);

	770 }

	771

	772 /* felem_scalar_8 sets out=8*out.

	773 *

	774 * On entry: out[0,2,...] < 230, out[1,3,...] < 229.

	775 * On exit: out[0,2,...] < 230, out[1,3,...] < 229.

	776 */

	777 static void felem_scalar_8(felem out)

	778 {

	779 limb carry = 0, next_carry;

	780 unsigned int i;

	781

	782 for (i = 0;; i++) {

	783 next_carry = out[i] >> 26;

	784 out[i] <<= 3;

	785 out[i] &= kBottom29Bits;

	786 out[i] += carry;

	787 carry = next_carry + (out[i] >> 29);

	788 out[i] &= kBottom29Bits;

	789

	790 i++;

	791 if (i == NLIMBS)

	792 break;

	793 next_carry = out[i] >> 25;

	794 out[i] <<= 3;

	795 out[i] &= kBottom28Bits;

	796 out[i] += carry;

	797 carry = next_carry + (out[i] >> 28);

	798 out[i] &= kBottom28Bits;

	799 }

	800

	801 felem_reduce_carry(out, carry);

	802 }

	803

	804 /* felem_is_zero_vartime returns 1 iff \|in\| == 0. It takes a variable amount of

	805 * time depending on the value of \|in\|.

	806 */

	807 static char felem_is_zero_vartime(const felem in)

	808 {

	809 limb carry;

	810 int i;

	811 limb tmp[NLIMBS];

	812 felem_assign(tmp, in);

	813

	814 /* First, reduce tmp to a minimal form.

	815 */

	816 do {

	817 carry = 0;

	818 for (i = 0;; i++) {

	819 tmp[i] += carry;

	820 carry = tmp[i] >> 29;

	821 tmp[i] &= kBottom29Bits;

	822

	823 i++;

	824 if (i == NLIMBS)

	825 break;

	826

	827 tmp[i] += carry;

	828 carry = tmp[i] >> 28;

	829 tmp[i] &= kBottom28Bits;

	830 }

	831

	832 felem_reduce_carry(tmp, carry);

	833 } while (carry);

	834

	835 /* tmp < 2**257, so the only possible zero values are 0, p and 2p.

	836 */

	837 return memcmp(tmp, kZero, sizeof(tmp)) == 0 \|\|

	838 memcmp(tmp, kP, sizeof(tmp)) == 0 \|\|

	839 memcmp(tmp, k2P, sizeof(tmp)) == 0;

	840 }

	841

	842 /* Group operations:

	843 *

	844 * Elements of the elliptic curve group are represented in Jacobian

	845 * coordinates: (x, y, z). An affine point (x', y') is x'=x/z2, y'=y/z3 in

	846 * Jacobian form.

	847 */

	848

	849 /* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.

	850 *

	851 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling -dbl-2009-l

	852 */

	853 static void point_double(felem x_out, felem y_out, felem z_out,

	854 const felem x, const felem y, const felem z)

	855 {

	856 felem delta, gamma, alpha, beta, tmp, tmp2;

	857

	858 felem_square(delta, z);

	859 felem_square(gamma, y);

	860 felem_mul(beta, x, gamma);

	861

	862 felem_sum(tmp, x, delta);

	863 felem_diff(tmp2, x, delta);

	864 felem_mul(alpha, tmp, tmp2);

	865 felem_scalar_3(alpha);

	866

	867 felem_sum(tmp, y, z);

	868 felem_square(tmp, tmp);

	869 felem_diff(tmp, tmp, gamma);

	870 felem_diff(z_out, tmp, delta);

	871

	872 felem_scalar_4(beta);

	873 felem_square(x_out, alpha);

	874 felem_diff(x_out, x_out, beta);

	875 felem_diff(x_out, x_out, beta);

	876

	877 felem_diff(tmp, beta, x_out);

	878 felem_mul(tmp, alpha, tmp);

	879 felem_square(tmp2, gamma);

	880 felem_scalar_8(tmp2);

	881 felem_diff(y_out, tmp, tmp2);

	882 }

	883

	884 /* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.

	885 * (i.e. the second point is affine.)

	886 *

	887 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl

	888 *

	889 * Note that this function does not handle P+P, infinity+P nor P+infinity

	890 * correctly.

	891 */

	892 static void point_add_mixed(felem x_out, felem y_out, felem z_out,

	893 const felem x1, const felem y1, const felem z1,

	894 const felem x2, const felem y2)

	895 {

	896 felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

	897

	898 felem_square(z1z1, z1);

	899 felem_sum(tmp, z1, z1);

	900

	901 felem_mul(u2, x2, z1z1);

	902 felem_mul(z1z1z1, z1, z1z1);

	903 felem_mul(s2, y2, z1z1z1);

	904 felem_diff(h, u2, x1);

	905 felem_sum(i, h, h);

	906 felem_square(i, i);

	907 felem_mul(j, h, i);

	908 felem_diff(r, s2, y1);

	909 felem_sum(r, r, r);

	910 felem_mul(v, x1, i);

	911

	912 felem_mul(z_out, tmp, h);

	913 felem_square(rr, r);

	914 felem_diff(x_out, rr, j);

	915 felem_diff(x_out, x_out, v);

	916 felem_diff(x_out, x_out, v);

	917

	918 felem_diff(tmp, v, x_out);

	919 felem_mul(y_out, tmp, r);

	920 felem_mul(tmp, y1, j);

	921 felem_diff(y_out, y_out, tmp);

	922 felem_diff(y_out, y_out, tmp);

	923 }

	924

	925 /* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.

	926 *

	927 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl

	928 *

	929 * Note that this function does not handle P+P, infinity+P nor P+infinity

	930 * correctly.

	931 */

	932 static void point_add(felem x_out, felem y_out, felem z_out,

	933 const felem x1, const felem y1, const felem z1,

	934 const felem x2, const felem y2, const felem z2)

	935 {

	936 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

	937

	938 felem_square(z1z1, z1);

	939 felem_square(z2z2, z2);

	940 felem_mul(u1, x1, z2z2);

	941

	942 felem_sum(tmp, z1, z2);

	943 felem_square(tmp, tmp);

	944 felem_diff(tmp, tmp, z1z1);

	945 felem_diff(tmp, tmp, z2z2);

	946

	947 felem_mul(z2z2z2, z2, z2z2);

	948 felem_mul(s1, y1, z2z2z2);

	949

	950 felem_mul(u2, x2, z1z1);

	951 felem_mul(z1z1z1, z1, z1z1);

	952 felem_mul(s2, y2, z1z1z1);

	953 felem_diff(h, u2, u1);

	954 felem_sum(i, h, h);

	955 felem_square(i, i);

	956 felem_mul(j, h, i);

	957 felem_diff(r, s2, s1);

	958 felem_sum(r, r, r);

	959 felem_mul(v, u1, i);

	960

	961 felem_mul(z_out, tmp, h);

	962 felem_square(rr, r);

	963 felem_diff(x_out, rr, j);

	964 felem_diff(x_out, x_out, v);

	965 felem_diff(x_out, x_out, v);

	966

	967 felem_diff(tmp, v, x_out);

	968 felem_mul(y_out, tmp, r);

	969 felem_mul(tmp, s1, j);

	970 felem_diff(y_out, y_out, tmp);

	971 felem_diff(y_out, y_out, tmp);

	972 }

	973

	974 /* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +

	975 * {x2,y2,z2}.

	976 *

	977 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl

	978 *

	979 * This function handles the case where {x1,y1,z1}={x2,y2,z2}.

	980 */

	981 static void point_add_or_double_vartime(

	982 felem x_out, felem y_out, felem z_out,

	983 const felem x1, const felem y1, const felem z1,

	984 const felem x2, const felem y2, const felem z2)

	985 {

	986 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

	987 char x_equal, y_equal;

	988

	989 felem_square(z1z1, z1);

	990 felem_square(z2z2, z2);

	991 felem_mul(u1, x1, z2z2);

	992

	993 felem_sum(tmp, z1, z2);

	994 felem_square(tmp, tmp);

	995 felem_diff(tmp, tmp, z1z1);

	996 felem_diff(tmp, tmp, z2z2);

	997

	998 felem_mul(z2z2z2, z2, z2z2);

	999 felem_mul(s1, y1, z2z2z2);

	1000

	1001 felem_mul(u2, x2, z1z1);

	1002 felem_mul(z1z1z1, z1, z1z1);

	1003 felem_mul(s2, y2, z1z1z1);

	1004 felem_diff(h, u2, u1);

	1005 x_equal = felem_is_zero_vartime(h);

	1006 felem_sum(i, h, h);

	1007 felem_square(i, i);

	1008 felem_mul(j, h, i);

	1009 felem_diff(r, s2, s1);

	1010 y_equal = felem_is_zero_vartime(r);

	1011 if (x_equal && y_equal) {

	1012 point_double(x_out, y_out, z_out, x1, y1, z1);

	1013 return;

	1014 }

	1015 felem_sum(r, r, r);

	1016 felem_mul(v, u1, i);

	1017

	1018 felem_mul(z_out, tmp, h);

	1019 felem_square(rr, r);

	1020 felem_diff(x_out, rr, j);

	1021 felem_diff(x_out, x_out, v);

	1022 felem_diff(x_out, x_out, v);

	1023

	1024 felem_diff(tmp, v, x_out);

	1025 felem_mul(y_out, tmp, r);

	1026 felem_mul(tmp, s1, j);

	1027 felem_diff(y_out, y_out, tmp);

	1028 felem_diff(y_out, y_out, tmp);

	1029 }

	1030

	1031 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.

	1032 *

	1033 * On entry: mask is either 0 or 0xffffffff.

	1034 */

	1035 static void copy_conditional(felem out, const felem in, limb mask)

	1036 {

	1037 int i;

	1038

	1039 for (i = 0; i < NLIMBS; i++) {

	1040 const limb tmp = mask & (in[i] ^ out[i]);

	1041 out[i] ^= tmp;

	1042 }

	1043 }

	1044

	1045 /* select_affine_point sets {out_x,out_y} to the index'th entry of table.

	1046 * On entry: index < 16, table[0] must be zero.

	1047 */

	1048 static void select_affine_point(felem out_x, felem out_y,

	1049 const limb *table, limb index)

	1050 {

	1051 limb i, j;

	1052

	1053 memset(out_x, 0, sizeof(felem));

	1054 memset(out_y, 0, sizeof(felem));

	1055

	1056 for (i = 1; i < 16; i++) {

	1057 limb mask = i ^ index;

	1058 mask \|= mask >> 2;

	1059 mask \|= mask >> 1;

	1060 mask &= 1;

	1061 mask--;

	1062 for (j = 0; j < NLIMBS; j++, table++) {

	1063 out_x[j] \|= *table & mask;

	1064 }

	1065 for (j = 0; j < NLIMBS; j++, table++) {

	1066 out_y[j] \|= *table & mask;

	1067 }

	1068 }

	1069 }

	1070

	1071 /* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of

	1072 * table. On entry: index < 16, table[0] must be zero.

	1073 */

	1074 static void select_jacobian_point(felem out_x, felem out_y, felem out_z,

	1075 const limb *table, limb index)

	1076 {

	1077 limb i, j;

	1078

	1079 memset(out_x, 0, sizeof(felem));

	1080 memset(out_y, 0, sizeof(felem));

	1081 memset(out_z, 0, sizeof(felem));

	1082

	1083 /* The implicit value at index 0 is all zero. We don't need to perform that

	1084 * iteration of the loop because we already set out_* to zero.

	1085 */

	1086 table += 3*NLIMBS;

	1087

	1088 for (i = 1; i < 16; i++) {

	1089 limb mask = i ^ index;

	1090 mask \|= mask >> 2;

	1091 mask \|= mask >> 1;

	1092 mask &= 1;

	1093 mask--;

	1094 for (j = 0; j < NLIMBS; j++, table++) {

	1095 out_x[j] \|= *table & mask;

	1096 }

	1097 for (j = 0; j < NLIMBS; j++, table++) {

	1098 out_y[j] \|= *table & mask;

	1099 }

	1100 for (j = 0; j < NLIMBS; j++, table++) {

	1101 out_z[j] \|= *table & mask;

	1102 }

	1103 }

	1104 }

	1105

	1106 /* get_bit returns the bit'th bit of scalar. */

	1107 static char get_bit(const u8 scalar[32], int bit)

	1108 {

	1109 return ((scalar[bit >> 3]) >> (bit & 7)) & 1;

	1110 }

	1111

	1112 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian

	1113 * number. Note that the value of scalar must be less than the order of the

	1114 * group.

	1115 */

	1116 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32])

	1117 {

	1118 int i, j;

	1119 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;

	1120 u32 table_offset;

	1121

	1122 felem px, py;

	1123 felem tx, ty, tz;

	1124

	1125 memset(nx, 0, sizeof(felem));

	1126 memset(ny, 0, sizeof(felem));

	1127 memset(nz, 0, sizeof(felem));

	1128

	1129 /* The loop adds bits at positions 0, 64, 128 and 192, followed by

	1130 * positions 32,96,160 and 224 and does this 32 times.

	1131 */

	1132 for (i = 0; i < 32; i++) {

	1133 if (i) {

	1134 point_double(nx, ny, nz, nx, ny, nz);

	1135 }

	1136 for (j = 0; j <= 32; j += 32) {

	1137 char bit0 = get_bit(scalar, 31 - i + j);

	1138 char bit1 = get_bit(scalar, 95 - i + j);

	1139 char bit2 = get_bit(scalar, 159 - i + j);

	1140 char bit3 = get_bit(scalar, 223 - i + j);

	1141 limb index = bit0 \| (bit1 << 1) \| (bit2 << 2) \| (bit3 << 3);

	1142

	1143 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS);

	1144 select_affine_point(px, py, kPrecomputed + table_offset, index);

	1145

	1146 /* Since scalar is less than the order of the group, we know that

	1147 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle

	1148 * below.

	1149 */

	1150 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);

	1151 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero

	1152 * (a.k.a. the point at infinity). We handle that situation by

	1153 * copying the point from the table.

	1154 */

	1155 copy_conditional(nx, px, n_is_infinity_mask);

	1156 copy_conditional(ny, py, n_is_infinity_mask);

	1157 copy_conditional(nz, kOne, n_is_infinity_mask);

	1158

	1159 /* Equally, the result is also wrong if the point from the table is

	1160 * zero, which happens when the index is zero. We handle that by

	1161 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0.

	1162 */

	1163 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);

	1164 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;

	1165 copy_conditional(nx, tx, mask);

	1166 copy_conditional(ny, ty, mask);

	1167 copy_conditional(nz, tz, mask);

	1168 /* If p was not zero, then n is now non-zero. */

	1169 n_is_infinity_mask &= ~p_is_noninfinite_mask;

	1170 }

	1171 }

	1172 }

	1173

	1174 /* point_to_affine converts a Jacobian point to an affine point. If the input

	1175 * is the point at infinity then it returns (0, 0) in constant time.

	1176 */

	1177 static void point_to_affine(felem x_out, felem y_out,

	1178 const felem nx, const felem ny, const felem nz) {

	1179 felem z_inv, z_inv_sq;

	1180 felem_inv(z_inv, nz);

	1181 felem_square(z_inv_sq, z_inv);

	1182 felem_mul(x_out, nx, z_inv_sq);

	1183 felem_mul(z_inv, z_inv, z_inv_sq);

	1184 felem_mul(y_out, ny, z_inv);

	1185 }

	1186

	1187 /* scalar_mult sets {nx,ny,nz} = scalar{x,y}. /

	1188 static void scalar_mult(felem nx, felem ny, felem nz,

	1189 const felem x, const felem y, const u8 scalar[32])

	1190 {

	1191 int i;

	1192 felem px, py, pz, tx, ty, tz;

	1193 felem precomp[16][3];

	1194 limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;

	1195

	1196 /* We precompute 0,1,2,... times {x,y}. */

	1197 memset(precomp, 0, sizeof(felem) * 3);

	1198 memcpy(&precomp[1][0], x, sizeof(felem));

	1199 memcpy(&precomp[1][1], y, sizeof(felem));

	1200 memcpy(&precomp[1][2], kOne, sizeof(felem));

	1201

	1202 for (i = 2; i < 16; i += 2) {

	1203 point_double(precomp[i][0], precomp[i][1], precomp[i][2],

	1204 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);

	1205

	1206 point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],

	1207 precomp[i][0], precomp[i][1], precomp[i][2], x, y);

	1208 }

	1209

	1210 memset(nx, 0, sizeof(felem));

	1211 memset(ny, 0, sizeof(felem));

	1212 memset(nz, 0, sizeof(felem));

	1213 n_is_infinity_mask = -1;

	1214

	1215 /* We add in a window of four bits each iteration and do this 64 times. */

	1216 for (i = 0; i < 64; i++) {

	1217 if (i) {

	1218 point_double(nx, ny, nz, nx, ny, nz);

	1219 point_double(nx, ny, nz, nx, ny, nz);

	1220 point_double(nx, ny, nz, nx, ny, nz);

	1221 point_double(nx, ny, nz, nx, ny, nz);

	1222 }

	1223

	1224 index = scalar[31 - i / 2];

	1225 if ((i & 1) == 1) {

	1226 index &= 15;

	1227 } else {

	1228 index >>= 4;

	1229 }

	1230

	1231 /* See the comments in scalar_base_mult about handling infinities. */

	1232 select_jacobian_point(px, py, pz, (limb *) precomp, index);

	1233 point_add(tx, ty, tz, nx, ny, nz, px, py, pz);

	1234 copy_conditional(nx, px, n_is_infinity_mask);

	1235 copy_conditional(ny, py, n_is_infinity_mask);

	1236 copy_conditional(nz, pz, n_is_infinity_mask);

	1237

	1238 p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31;

	1239 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;

	1240 copy_conditional(nx, tx, mask);

	1241 copy_conditional(ny, ty, mask);

	1242 copy_conditional(nz, tz, mask);

	1243 n_is_infinity_mask &= ~p_is_noninfinite_mask;

	1244 }

	1245 }

	1246

	1247 /* Interface with Freebl: */

	1248

	1249 #ifdef IS_BIG_ENDIAN
	wtc 2013/01/25 02:32:49 I changed #if to #ifdef here. I changed #if to #ifdef here.
	1250 #error "This code needs a little-endian processor"

	1251 #endif

	1252

	1253 static const u32 kRInvDigits[8] = {

	1254 0x80000000, 1, 0xffffffff, 0,

	1255 0x80000001, 0xfffffffe, 1, 0x7fffffff

	1256 };

	1257 #define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit))

	1258 static const mp_int kRInv = {

	1259 MP_ZPOS,

	1260 MP_DIGITS_IN_256_BITS,

	1261 MP_DIGITS_IN_256_BITS,

	1262 /* Because we are running on a little-endian processor, this cast works for

	1263 * both 32 and 64-bit processors.

	1264 */

	1265 (mp_digit*) kRInvDigits

	1266 };

	1267

	1268 static const limb kTwo28 = 0x10000000;

	1269 static const limb kTwo29 = 0x20000000;

	1270

	1271 /* to_montgomery sets out = Rin. /

	1272 static mp_err to_montgomery(felem out, const mp_int in, const ECGroup group)

	1273 {

	1274 /* There are no MPI functions for bitshift operations and we wish to shift

	1275 * in 257 bits left so we move the digits 256-bits left and then multiply

	1276 * by two.

	1277 */

	1278 mp_int in_shifted;

	1279 int i;

	1280 mp_err res;

	1281

	1282 mp_init(&in_shifted);

	1283 s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS);

	1284 memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS),

	1285 MP_DIGITS(in),

	1286 MP_USED(in)*sizeof(mp_digit));

	1287 mp_mul_2(&in_shifted, &in_shifted);

	1288 MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth));

	1289

	1290 for (i = 0;; i++) {

	1291 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits;

	1292 mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL);

	1293

	1294 i++;

	1295 if (i == NLIMBS)

	1296 break;

	1297 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits;

	1298 mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL);

	1299 }

	1300

	1301 CLEANUP:

	1302 mp_clear(&in_shifted);

	1303 return res;

	1304 }

	1305

	1306 /* from_montgomery sets out=in/R. */

	1307 static mp_err from_montgomery(mp_int *out, const felem in,

	1308 const ECGroup *group)

	1309 {

	1310 mp_int result, tmp;

	1311 mp_err res;

	1312 int i;

	1313

	1314 mp_init(&result);

	1315 mp_init(&tmp);

	1316

	1317 MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result));

	1318 for (i = NLIMBS-2; i >= 0; i--) {

	1319 if ((i & 1) == 0) {

	1320 MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp));

	1321 } else {

	1322 MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp));

	1323 }

	1324 MP_CHECKOK(mp_add_d(&tmp, in[i], &result));

	1325 }

	1326

	1327 MP_CHECKOK(mp_mul(&result, &kRInv, out));

	1328 MP_CHECKOK(group->meth->field_mod(out, out, group->meth));

	1329

	1330 CLEANUP:

	1331 mp_clear(&result);

	1332 mp_clear(&tmp);

	1333 return res;

	1334 }

	1335

	1336 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */

	1337 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n)

	1338 {

	1339 /* We require that \|n\| is less than the order of the group and therefore it

	1340 * will fit into \|scalar\|. However, these is a timing side-channel here that

	1341 * we cannot avoid: if \|n\| is sufficiently small it may be one or more words

	1342 * too short and we'll copy less data.

	1343 */

	1344 memset(out_scalar, 0, 32);

	1345 memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit));

	1346 }

	1347

	1348 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the

	1349 * order of the group.

	1350 */

	1351 mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n,

	1352 mp_int out_x, mp_int out_y,

	1353 const ECGroup *group)

	1354 {

	1355 u8 scalar[32];

	1356 felem x, y, z, x_affine, y_affine;

	1357 mp_err res;

	1358

	1359 /* FIXME(agl): test that n < order. */

	1360

	1361 scalar_from_mp_int(scalar, n);

	1362 scalar_base_mult(x, y, z, scalar);

	1363 point_to_affine(x_affine, y_affine, x, y, z);

	1364 MP_CHECKOK(from_montgomery(out_x, x_affine, group));

	1365 MP_CHECKOK(from_montgomery(out_y, y_affine, group));

	1366

	1367 CLEANUP:

	1368 return res;

	1369 }

	1370

	1371 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <

	1372 * the order of the group.

	1373 */

	1374 mp_err ec_GFp_nistp256_point_mul(const mp_int *n,

	1375 const mp_int in_x, const mp_int in_y,

	1376 mp_int out_x, mp_int out_y,

	1377 const ECGroup *group)

	1378 {

	1379 u8 scalar[32];

	1380 felem x, y, z, x_affine, y_affine, px, py;

	1381 mp_err res;

	1382

	1383 scalar_from_mp_int(scalar, n);

	1384

	1385 MP_CHECKOK(to_montgomery(px, in_x, group));

	1386 MP_CHECKOK(to_montgomery(py, in_y, group));

	1387

	1388 scalar_mult(x, y, z, px, py, scalar);

	1389 point_to_affine(x_affine, y_affine, x, y, z);

	1390 MP_CHECKOK(from_montgomery(out_x, x_affine, group));

	1391 MP_CHECKOK(from_montgomery(out_y, y_affine, group));

	1392

	1393 CLEANUP:

	1394 return res;

	1395 }

	1396

	1397 /* ec_GFp_nistp256_point_mul_vartime sets {out_x,out_y} = n1*G +

	1398 * n2*{in_x,in_y}, where n1 and n2 are < the order of the group.

	1399 *

	1400 * As indicated by the name, this function operates in variable time. This

	1401 * is safe because it's used for signature validation which doesn't deal

	1402 * with secrets.

	1403 */

	1404 mp_err ec_GFp_nistp256_points_mul_vartime(

	1405 const mp_int n1, const mp_int n2,

	1406 const mp_int in_x, const mp_int in_y,

	1407 mp_int out_x, mp_int out_y,

	1408 const ECGroup *group)

	1409 {

	1410 u8 scalar1[32], scalar2[32];

	1411 felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py;

	1412 mp_err res = MP_OKAY;

	1413

	1414 /* If n2 == NULL, this is just a base-point multiplication. */

	1415 if (n2 == NULL) {

	1416 return ec_GFp_nistp256_base_point_mul(n1, out_x, out_y, group);

	1417 }

	1418

	1419 /* If n1 == nULL, this is just an arbitary-point multiplication. */

	1420 if (n1 == NULL) {

	1421 return ec_GFp_nistp256_point_mul(n2, in_x, in_y, out_x, out_y, group);

	1422 }

	1423

	1424 /* If both scalars are zero, then the result is the point at infinity. */

	1425 if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) {

	1426 mp_zero(out_x);

	1427 mp_zero(out_y);

	1428 return res;

	1429 }

	1430

	1431 scalar_from_mp_int(scalar1, n1);

	1432 scalar_from_mp_int(scalar2, n2);

	1433

	1434 MP_CHECKOK(to_montgomery(px, in_x, group));

	1435 MP_CHECKOK(to_montgomery(py, in_y, group));

	1436 scalar_base_mult(x1, y1, z1, scalar1);

	1437 scalar_mult(x2, y2, z2, px, py, scalar2);

	1438

	1439 if (mp_cmp_z(n2) == 0) {

	1440 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just

	1441 * {x1,y1,z1}. */

	1442 } else if (mp_cmp_z(n1) == 0) {

	1443 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just

	1444 * {x2,y2,z2}. */

	1445 memcpy(x1, x2, sizeof(x2));

	1446 memcpy(y1, y2, sizeof(y2));

	1447 memcpy(z1, z2, sizeof(z2));

	1448 } else {

	1449 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */

	1450 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);

	1451 }

	1452

	1453 point_to_affine(x_affine, y_affine, x1, y1, z1);

	1454 MP_CHECKOK(from_montgomery(out_x, x_affine, group));

	1455 MP_CHECKOK(from_montgomery(out_y, y_affine, group));

	1456

	1457 CLEANUP:

	1458 return res;

	1459 }

OLD	NEW

« mozilla/security/nss/lib/freebl/ecl/ecp_256.c ('K') | « mozilla/security/nss/lib/freebl/ecl/ecp_256.c ('k') | nss.gyp » ('j') | no next file with comments »