Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1091)

Side by Side Diff: ecp_256_32.c

Issue 10944017: 32-bit, P-256 implementation. Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Addressing wtc's comments. Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * A 32-bit implementation of the NIST P-256 elliptic curve.
3 *
4 * This Source Code Form is subject to the terms of the Mozilla Public
5 * License, v. 2.0. If a copy of the MPL was not distributed with this
6 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 */
8
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13
14 #include "prtypes.h"
15 #include "mpi.h"
16 #include "mpi-priv.h"
17 #include "ecp.h"
18
19 typedef PRUint8 u8;
20 typedef PRUint32 u32;
21 typedef PRInt32 s32;
22 typedef PRUint64 u64;
23
24 /* Our field elements are represented as nine, unsigned 32-bit words. Freebl's
25 * MPI library calls them digits, but here they are called limbs, which is
26 * GMP's terminology.
27 *
28 * The value of an felem (field element) is:
29 * x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
30 *
31 * That is, each limb is alternately 29 or 28-bits wide in little-endian
32 * order.
33 *
34 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
35 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
36 * when multiplying as terms end up one bit short of a limb which would require
37 * much bit-shifting to correct.
38 *
39 * Finally, the values stored in an felem are in Montgomery form. So the value
40 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
41 */
42 typedef u32 limb;
43 #define NLIMBS 9
44 typedef limb felem[NLIMBS];
45
46 static const limb kBottom28Bits = 0xfffffff;
47 static const limb kBottom29Bits = 0x1fffffff;
48
49 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
50 * 28-bit words.
51 */
52 static const felem kOne = {
53 2, 0, 0, 0xffff800,
54 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
55 0
56 };
57 static const felem kZero = {0};
58 static const felem kP = {
59 0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
60 0, 0, 0x200000, 0xf000000,
61 0xfffffff
62 };
63 static const felem k2P = {
64 0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
65 0, 0, 0x400000, 0xe000000,
66 0x1fffffff
67 };
68
69 /* kPrecomputed contains precomputed values to aid the calculation of scalar
70 * multiples of the base point, G. It's actually two, equal length, tables
71 * concatenated.
72 *
73 * The first table contains (x,y) felem pairs for 16 multiples of the base
74 * point, G.
75 *
76 * Index | Index (binary) | Value
77 * 0 | 0000 | 0G (all zeros, omitted)
78 * 1 | 0001 | G
79 * 2 | 0010 | 2**64G
80 * 3 | 0011 | 2**64G + G
81 * 4 | 0100 | 2**128G
82 * 5 | 0101 | 2**128G + G
83 * 6 | 0110 | 2**128G + 2**64G
84 * 7 | 0111 | 2**128G + 2**64G + G
85 * 8 | 1000 | 2**192G
86 * 9 | 1001 | 2**192G + G
87 * 10 | 1010 | 2**192G + 2**64G
88 * 11 | 1011 | 2**192G + 2**64G + G
89 * 12 | 1100 | 2**192G + 2**128G
90 * 13 | 1101 | 2**192G + 2**128G + G
91 * 14 | 1110 | 2**192G + 2**128G + 2**64G
92 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
93 *
94 * The second table follows the same style, but the terms are 2**32G,
95 * 2**96G, 2**160G, 2**224G.
96 *
97 * This is ~2KB of data.
98 */
99 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
100 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7e dc, 0xd4a6eab, 0x3120bee,
101 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba 21, 0x14b10bb, 0xae3fe3,
102 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe4907 3, 0x3fa36cc, 0x5ebcd2c,
103 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea1244 6, 0xe1ade1e, 0xec91f22,
104 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109 , 0xa267a00, 0xb57c050,
105 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0 x7d6dee7, 0x2976e4b,
106 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a 5a9, 0x843a649, 0xc3ab0fa,
107 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11 , 0x58c43df, 0xf423fc2,
108 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db4 0f, 0x83e277d, 0xb0dd609,
109 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5 , 0xe10c9e, 0x33ab581,
110 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f , 0x48764cd, 0x76dbcca,
111 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b2 0, 0x4ba3173, 0xc168c33,
112 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0 , 0x65dd7ff, 0x3a1e4f6,
113 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f07 7, 0xa6add89, 0x4894acd,
114 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
115 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c , 0xda0cf5b, 0x812e881,
116 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51 , 0xc22be3e, 0xe35e65a,
117 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9 , 0x1c5a839, 0x47a1e26,
118 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c5 02, 0x2f32042, 0xa17769b,
119 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a 02, 0x3fc93, 0x5620023,
120 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c , 0x407f75c, 0xbaab133,
121 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea 7, 0x3293ac0, 0xcdc98aa,
122 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
123 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72 , 0x73e1c35, 0xee70fbc,
124 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85 , 0x27de188, 0x66f70b8,
125 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae 914, 0x2f3ec51, 0x3826b59,
126 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x 823d9d2, 0x8213f39,
127 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4 a, 0xf5ddc3d, 0x3786689,
128 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a72 9, 0x4be3499, 0x52b23aa,
129 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb0480 35, 0xe31de66, 0xc6ecaa3,
130 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a752 9, 0xcb7beb1, 0xb2a78a1,
131 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff6 58, 0xe3d6511, 0xc7d76f,
132 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
133 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d3241 1, 0xb04a838, 0xd760d2d,
134 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11 e, 0x20bca9a, 0x66f496b,
135 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
136 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56 ff, 0x65ef930, 0x21dc4a,
137 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15 f, 0x624e62e, 0xa90ae2f,
138 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522 b, 0xdc78583, 0x40eeabb,
139 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef3 4, 0xae2a960, 0x91b8bdc,
140 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0 x2413c8e, 0x5425bf9,
141 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633 , 0x7c91952, 0xd806dce,
142 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef7 3, 0x8956f34, 0xe4b5cf2,
143 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7 , 0x627b614, 0x7371cca,
144 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc 9, 0x9c19bf2, 0x5882229,
145 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b 3, 0xe85ff25, 0x408ef57,
146 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113 , 0xa4a1769, 0x11fbc6c,
147 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b 7, 0x4acbad9, 0x5efc5fa,
148 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc , 0x7bf0fa9, 0x957651,
149 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
150 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c1 2d, 0xf20bd46, 0x1951fa7,
151 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74 , 0x99bb618, 0x2db944c,
152 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e7477 9, 0x576138, 0x9587927,
153 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782 d, 0xfc72e0b, 0x701b298,
154 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5 d8, 0xf858d3a, 0x942eea8,
155 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a 1, 0x8395659, 0x52ed4e2,
156 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c 0, 0x6bdf55a, 0x4e4457d,
157 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747 b, 0x878558d, 0x7d29aa4,
158 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8,
159 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78,
160 };
161
162 /* Field element operations:
163 */
164
165 /* NON_ZERO_TO_ALL_ONES returns:
166 * 0xffffffff for 0 < x <= 2**31
167 * 0 for x == 0 or x > 2**31.
168 *
169 * This macro assumes that right-shifting a signed number shifts in the MSB on
170 * the left. This is not ensured by the C standard, but is true on the CPUs
171 * that we're targetting with this code (x86 and ARM).
172 */
173 #define NON_ZERO_TO_ALL_ONES(x) (~((u32) (((s32) ((x)-1)) >> 31)))
174
175 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
176 * which is a term at 2**257.
177 *
178 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
179 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
180 */
181 static void felem_reduce_carry(felem inout, limb carry)
182 {
183 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);
184
185 inout[0] += carry << 1;
186 inout[3] += 0x10000000 & carry_mask;
187 /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
188 * previous line therefore this doesn't underflow.
189 */
190 inout[3] -= carry << 11;
191 inout[4] += (0x20000000 - 1) & carry_mask;
192 inout[5] += (0x10000000 - 1) & carry_mask;
193 inout[6] += (0x20000000 - 1) & carry_mask;
194 inout[6] -= carry << 22;
195 /* This may underflow if carry is non-zero but, if so, we'll fix it in the
196 * next line.
197 */
198 inout[7] -= 1 & carry_mask;
199 inout[7] += carry << 25;
200 }
201
202 /* felem_sum sets out = in+in2.
203 *
204 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
205 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
206 */
207 static void felem_sum(felem out, const felem in, const felem in2)
208 {
209 limb carry = 0;
210 unsigned int i;
211 for (i = 0;; i++) {
212 out[i] = in[i] + in2[i];
213 out[i] += carry;
214 carry = out[i] >> 29;
215 out[i] &= kBottom29Bits;
216
217 i++;
218 if (i == NLIMBS)
219 break;
220
221 out[i] = in[i] + in2[i];
222 out[i] += carry;
223 carry = out[i] >> 28;
224 out[i] &= kBottom28Bits;
225 }
226
227 felem_reduce_carry(out, carry);
228 }
229
230 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
231 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
232 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
233 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
234 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
235 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
236
237 /* zero31 is 0 mod p.
238 */
239 static const felem zero31 = {
240 two31m3, two30m2, two31m2, two30p13m2,
241 two31m2, two30m2, two31p24m2, two30m27m2,
242 two31m2
243 };
244
245 /* felem_diff sets out = in-in2.
246 *
247 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
248 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
249 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
250 */
251 static void felem_diff(felem out, const felem in, const felem in2)
252 {
253 limb carry = 0;
254 unsigned int i;
255
256 for (i = 0;; i++) {
257 out[i] = in[i] - in2[i];
258 out[i] += zero31[i];
259 out[i] += carry;
260 carry = out[i] >> 29;
261 out[i] &= kBottom29Bits;
262
263 i++;
264 if (i == NLIMBS)
265 break;
266
267 out[i] = in[i] - in2[i];
268 out[i] += zero31[i];
269 out[i] += carry;
270 carry = out[i] >> 28;
271 out[i] &= kBottom28Bits;
272 }
273
274 felem_reduce_carry(out, carry);
275 }
276
277 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
278 * with the same 29,28,... bit positions as an felem.
279 *
280 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
281 * Since we just multiplied two Montgomery values together, the result is
282 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
283 * in Montgomery form.
284 *
285 * On entry: tmp[i] < 2**64
286 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
287 */
288 static void felem_reduce_degree(felem out, u64 tmp[17])
289 {
290 /* The following table may be helpful when reading this code:
291 *
292 * Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
293 * Width (bits): 29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
294 * Start bit: 0 | 29| 57| 86|114|143|171|200|228|257|285
295 * (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
296 */
297 limb tmp2[18], carry, x, xMask;
298 unsigned int i;
299
300 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
301 * felem. So the top of an element of tmp might overlap with another
302 * element two positions down. The following loop eliminates this
303 * overlap.
304 */
305 tmp2[0] = tmp[0] & kBottom29Bits;
306
307 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
308 * and hint to the compiler that it can do a single-word shift by selecting
309 * the right register rather than doing a double-word shift and truncating
310 * afterwards.
311 */
312 tmp2[1] = ((limb) tmp[0]) >> 29;
313 tmp2[1] |= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits;
314 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
315 carry = tmp2[1] >> 28;
316 tmp2[1] &= kBottom28Bits;
317
318 for (i = 2; i < 17; i++) {
319 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
320 tmp2[i] += ((limb) (tmp[i - 1])) >> 28;
321 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
322 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
323 tmp2[i] += carry;
324 carry = tmp2[i] >> 29;
325 tmp2[i] &= kBottom29Bits;
326
327 i++;
328 if (i == 17)
329 break;
330 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
331 tmp2[i] += ((limb) (tmp[i - 1])) >> 29;
332 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
333 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
334 tmp2[i] += carry;
335 carry = tmp2[i] >> 28;
336 tmp2[i] &= kBottom28Bits;
337 }
338
339 tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25;
340 tmp2[17] += ((limb) (tmp[16])) >> 29;
341 tmp2[17] += (((limb) (tmp[16] >> 32)) << 3);
342 tmp2[17] += carry;
343
344 /* Montgomery elimination of terms:
345 *
346 * Since R is 2**257, we can divide by R with a bitwise shift if we can
347 * ensure that the right-most 257 bits are all zero. We can make that true
348 * by adding multiplies of p without affecting the value.
349 *
350 * So we eliminate limbs from right to left. Since the bottom 29 bits of p
351 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
352 * We can do that for 8 further limbs and then right shift to eliminate the
353 * extra factor of R.
354 */
355 for (i = 0;; i += 2) {
356 tmp2[i + 1] += tmp2[i] >> 29;
357 x = tmp2[i] & kBottom29Bits;
358 xMask = NON_ZERO_TO_ALL_ONES(x);
359 tmp2[i] = 0;
360
361 /* The bounds calculations for this loop are tricky. Each iteration of
362 * the loop eliminates two words by adding values to words to their
363 * right.
364 *
365 * The following table contains the amounts added to each word (as an
366 * offset from the value of i at the top of the loop). The amounts are
367 * accounted for from the first and second half of the loop separately
368 * and are written as, for example, 28 to mean a value <2**28.
369 *
370 * Word: 3 4 5 6 7 8 9 10
371 * Added in top half: 28 11 29 21 29 28
372 * 28 29
373 * 29
374 * Added in bottom half: 29 10 28 21 28 28
375 * 29
376 *
377 * The value that is currently offset 7 will be offset 5 for the next
378 * iteration and then offset 3 for the iteration after that. Therefore
379 * the total value added will be the values added at 7, 5 and 3.
380 *
381 * The following table accumulates these values. The sums at the bottom
382 * are written as, for example, 29+28, to mean a value < 2**29+2**28.
383 *
384 * Word: 3 4 5 6 7 8 9 10 11 12 13
385 * 28 11 10 29 21 29 28 28 28 28 28
386 * 29 28 11 28 29 28 29 28 29 28
387 * 29 28 21 21 29 21 29 21
388 * 10 29 28 21 28 21 28
389 * 28 29 28 29 28 29 28
390 * 11 10 29 10 29 10
391 * 29 28 11 28 11
392 * 29 29
393 * --------------------------------------------
394 * 30+ 31+ 30+ 31+ 30+
395 * 28+ 29+ 28+ 29+ 21+
396 * 21+ 28+ 21+ 28+ 10
397 * 10 21+ 10 21+
398 * 11 11
399 *
400 * So the greatest amount is added to tmp2[10] and tmp2[12]. If
401 * tmp2[10/12] has an initial value of <2**29, then the maximum value
402 * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
403 * as required.
404 */
405 tmp2[i + 3] += (x << 10) & kBottom28Bits;
406 tmp2[i + 4] += (x >> 18);
407
408 tmp2[i + 6] += (x << 21) & kBottom29Bits;
409 tmp2[i + 7] += x >> 8;
410
411 /* At position 200, which is the starting bit position for word 7, we
412 * have a factor of 0xf000000 = 2**28 - 2**24.
413 */
414 tmp2[i + 7] += 0x10000000 & xMask;
415 /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
416 tmp2[i + 8] += (x - 1) & xMask;
417 tmp2[i + 7] -= (x << 24) & kBottom28Bits;
418 tmp2[i + 8] -= x >> 4;
419
420 tmp2[i + 8] += 0x20000000 & xMask;
421 tmp2[i + 8] -= x;
422 tmp2[i + 8] += (x << 28) & kBottom29Bits;
423 tmp2[i + 9] += ((x >> 1) - 1) & xMask;
424
425 if (i+1 == NLIMBS)
426 break;
427 tmp2[i + 2] += tmp2[i + 1] >> 28;
428 x = tmp2[i + 1] & kBottom28Bits;
429 xMask = NON_ZERO_TO_ALL_ONES(x);
430 tmp2[i + 1] = 0;
431
432 tmp2[i + 4] += (x << 11) & kBottom29Bits;
433 tmp2[i + 5] += (x >> 18);
434
435 tmp2[i + 7] += (x << 21) & kBottom28Bits;
436 tmp2[i + 8] += x >> 7;
437
438 /* At position 199, which is the starting bit of the 8th word when
439 * dealing with a context starting on an odd word, we have a factor of
440 * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
441 * word from i+1 is i+8.
442 */
443 tmp2[i + 8] += 0x20000000 & xMask;
444 tmp2[i + 9] += (x - 1) & xMask;
445 tmp2[i + 8] -= (x << 25) & kBottom29Bits;
446 tmp2[i + 9] -= x >> 4;
447
448 tmp2[i + 9] += 0x10000000 & xMask;
449 tmp2[i + 9] -= x;
450 tmp2[i + 10] += (x - 1) & xMask;
451 }
452
453 /* We merge the right shift with a carry chain. The words above 2**257 have
454 * widths of 28,29,... which we need to correct when copying them down.
455 */
456 carry = 0;
457 for (i = 0; i < 8; i++) {
458 /* The maximum value of tmp2[i + 9] occurs on the first iteration and
459 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
460 * therefore safe.
461 */
462 out[i] = tmp2[i + 9];
463 out[i] += carry;
464 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
465 carry = out[i] >> 29;
466 out[i] &= kBottom29Bits;
467
468 i++;
469 out[i] = tmp2[i + 9] >> 1;
470 out[i] += carry;
471 carry = out[i] >> 28;
472 out[i] &= kBottom28Bits;
473 }
474
475 out[8] = tmp2[17];
476 out[8] += carry;
477 carry = out[8] >> 29;
478 out[8] &= kBottom29Bits;
479
480 felem_reduce_carry(out, carry);
481 }
482
483 /* felem_square sets out=in*in.
484 *
485 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
486 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
487 */
488 static void felem_square(felem out, const felem in)
489 {
490 u64 tmp[17];
491
492 tmp[0] = ((u64) in[0]) * in[0];
493 tmp[1] = ((u64) in[0]) * (in[1] << 1);
494 tmp[2] = ((u64) in[0]) * (in[2] << 1) +
495 ((u64) in[1]) * (in[1] << 1);
496 tmp[3] = ((u64) in[0]) * (in[3] << 1) +
497 ((u64) in[1]) * (in[2] << 1);
498 tmp[4] = ((u64) in[0]) * (in[4] << 1) +
499 ((u64) in[1]) * (in[3] << 2) +
500 ((u64) in[2]) * in[2];
501 tmp[5] = ((u64) in[0]) * (in[5] << 1) +
502 ((u64) in[1]) * (in[4] << 1) +
503 ((u64) in[2]) * (in[3] << 1);
504 tmp[6] = ((u64) in[0]) * (in[6] << 1) +
505 ((u64) in[1]) * (in[5] << 2) +
506 ((u64) in[2]) * (in[4] << 1) +
507 ((u64) in[3]) * (in[3] << 1);
508 tmp[7] = ((u64) in[0]) * (in[7] << 1) +
509 ((u64) in[1]) * (in[6] << 1) +
510 ((u64) in[2]) * (in[5] << 1) +
511 ((u64) in[3]) * (in[4] << 1);
512 /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
513 * which is < 2**64 as required.
514 */
515 tmp[8] = ((u64) in[0]) * (in[8] << 1) +
516 ((u64) in[1]) * (in[7] << 2) +
517 ((u64) in[2]) * (in[6] << 1) +
518 ((u64) in[3]) * (in[5] << 2) +
519 ((u64) in[4]) * in[4];
520 tmp[9] = ((u64) in[1]) * (in[8] << 1) +
521 ((u64) in[2]) * (in[7] << 1) +
522 ((u64) in[3]) * (in[6] << 1) +
523 ((u64) in[4]) * (in[5] << 1);
524 tmp[10] = ((u64) in[2]) * (in[8] << 1) +
525 ((u64) in[3]) * (in[7] << 2) +
526 ((u64) in[4]) * (in[6] << 1) +
527 ((u64) in[5]) * (in[5] << 1);
528 tmp[11] = ((u64) in[3]) * (in[8] << 1) +
529 ((u64) in[4]) * (in[7] << 1) +
530 ((u64) in[5]) * (in[6] << 1);
531 tmp[12] = ((u64) in[4]) * (in[8] << 1) +
532 ((u64) in[5]) * (in[7] << 2) +
533 ((u64) in[6]) * in[6];
534 tmp[13] = ((u64) in[5]) * (in[8] << 1) +
535 ((u64) in[6]) * (in[7] << 1);
536 tmp[14] = ((u64) in[6]) * (in[8] << 1) +
537 ((u64) in[7]) * (in[7] << 1);
538 tmp[15] = ((u64) in[7]) * (in[8] << 1);
539 tmp[16] = ((u64) in[8]) * in[8];
540
541 felem_reduce_degree(out, tmp);
542 }
543
544 /* felem_mul sets out=in*in2.
545 *
546 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
547 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
548 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
549 */
550 static void felem_mul(felem out, const felem in, const felem in2)
551 {
552 u64 tmp[17];
553
554 tmp[0] = ((u64) in[0]) * in2[0];
555 tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
556 ((u64) in[1]) * (in2[0] << 0);
557 tmp[2] = ((u64) in[0]) * (in2[2] << 0) +
558 ((u64) in[1]) * (in2[1] << 1) +
559 ((u64) in[2]) * (in2[0] << 0);
560 tmp[3] = ((u64) in[0]) * (in2[3] << 0) +
561 ((u64) in[1]) * (in2[2] << 0) +
562 ((u64) in[2]) * (in2[1] << 0) +
563 ((u64) in[3]) * (in2[0] << 0);
564 tmp[4] = ((u64) in[0]) * (in2[4] << 0) +
565 ((u64) in[1]) * (in2[3] << 1) +
566 ((u64) in[2]) * (in2[2] << 0) +
567 ((u64) in[3]) * (in2[1] << 1) +
568 ((u64) in[4]) * (in2[0] << 0);
569 tmp[5] = ((u64) in[0]) * (in2[5] << 0) +
570 ((u64) in[1]) * (in2[4] << 0) +
571 ((u64) in[2]) * (in2[3] << 0) +
572 ((u64) in[3]) * (in2[2] << 0) +
573 ((u64) in[4]) * (in2[1] << 0) +
574 ((u64) in[5]) * (in2[0] << 0);
575 tmp[6] = ((u64) in[0]) * (in2[6] << 0) +
576 ((u64) in[1]) * (in2[5] << 1) +
577 ((u64) in[2]) * (in2[4] << 0) +
578 ((u64) in[3]) * (in2[3] << 1) +
579 ((u64) in[4]) * (in2[2] << 0) +
580 ((u64) in[5]) * (in2[1] << 1) +
581 ((u64) in[6]) * (in2[0] << 0);
582 tmp[7] = ((u64) in[0]) * (in2[7] << 0) +
583 ((u64) in[1]) * (in2[6] << 0) +
584 ((u64) in[2]) * (in2[5] << 0) +
585 ((u64) in[3]) * (in2[4] << 0) +
586 ((u64) in[4]) * (in2[3] << 0) +
587 ((u64) in[5]) * (in2[2] << 0) +
588 ((u64) in[6]) * (in2[1] << 0) +
589 ((u64) in[7]) * (in2[0] << 0);
590 /* tmp[8] has the greatest value but doesn't overflow. See logic in
591 * felem_square.
592 */
593 tmp[8] = ((u64) in[0]) * (in2[8] << 0) +
594 ((u64) in[1]) * (in2[7] << 1) +
595 ((u64) in[2]) * (in2[6] << 0) +
596 ((u64) in[3]) * (in2[5] << 1) +
597 ((u64) in[4]) * (in2[4] << 0) +
598 ((u64) in[5]) * (in2[3] << 1) +
599 ((u64) in[6]) * (in2[2] << 0) +
600 ((u64) in[7]) * (in2[1] << 1) +
601 ((u64) in[8]) * (in2[0] << 0);
602 tmp[9] = ((u64) in[1]) * (in2[8] << 0) +
603 ((u64) in[2]) * (in2[7] << 0) +
604 ((u64) in[3]) * (in2[6] << 0) +
605 ((u64) in[4]) * (in2[5] << 0) +
606 ((u64) in[5]) * (in2[4] << 0) +
607 ((u64) in[6]) * (in2[3] << 0) +
608 ((u64) in[7]) * (in2[2] << 0) +
609 ((u64) in[8]) * (in2[1] << 0);
610 tmp[10] = ((u64) in[2]) * (in2[8] << 0) +
611 ((u64) in[3]) * (in2[7] << 1) +
612 ((u64) in[4]) * (in2[6] << 0) +
613 ((u64) in[5]) * (in2[5] << 1) +
614 ((u64) in[6]) * (in2[4] << 0) +
615 ((u64) in[7]) * (in2[3] << 1) +
616 ((u64) in[8]) * (in2[2] << 0);
617 tmp[11] = ((u64) in[3]) * (in2[8] << 0) +
618 ((u64) in[4]) * (in2[7] << 0) +
619 ((u64) in[5]) * (in2[6] << 0) +
620 ((u64) in[6]) * (in2[5] << 0) +
621 ((u64) in[7]) * (in2[4] << 0) +
622 ((u64) in[8]) * (in2[3] << 0);
623 tmp[12] = ((u64) in[4]) * (in2[8] << 0) +
624 ((u64) in[5]) * (in2[7] << 1) +
625 ((u64) in[6]) * (in2[6] << 0) +
626 ((u64) in[7]) * (in2[5] << 1) +
627 ((u64) in[8]) * (in2[4] << 0);
628 tmp[13] = ((u64) in[5]) * (in2[8] << 0) +
629 ((u64) in[6]) * (in2[7] << 0) +
630 ((u64) in[7]) * (in2[6] << 0) +
631 ((u64) in[8]) * (in2[5] << 0);
632 tmp[14] = ((u64) in[6]) * (in2[8] << 0) +
633 ((u64) in[7]) * (in2[7] << 1) +
634 ((u64) in[8]) * (in2[6] << 0);
635 tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
636 ((u64) in[8]) * (in2[7] << 0);
637 tmp[16] = ((u64) in[8]) * (in2[8] << 0);
638
639 felem_reduce_degree(out, tmp);
640 }
641
642 static void felem_assign(felem out, const felem in)
643 {
644 memcpy(out, in, sizeof(felem));
645 }
646
647 /* felem_inv calculates |out| = |in|^{-1}
648 *
649 * Based on Fermat's Little Theorem:
650 * a^p = a (mod p)
651 * a^{p-1} = 1 (mod p)
652 * a^{p-2} = a^{-1} (mod p)
653 */
654 static void felem_inv(felem out, const felem in)
655 {
656 felem ftmp, ftmp2;
657 /* each e_I will hold |in|^{2^I - 1} */
658 felem e2, e4, e8, e16, e32, e64;
659 unsigned int i;
660
661 felem_square(ftmp, in); /* 2^1 */
662 felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
663 felem_assign(e2, ftmp);
664 felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
665 felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
666 felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
667 felem_assign(e4, ftmp);
668 felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
669 felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
670 felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
671 felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
672 felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
673 felem_assign(e8, ftmp);
674 for (i = 0; i < 8; i++) {
675 felem_square(ftmp, ftmp);
676 } /* 2^16 - 2^8 */
677 felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
678 felem_assign(e16, ftmp);
679 for (i = 0; i < 16; i++) {
680 felem_square(ftmp, ftmp);
681 } /* 2^32 - 2^16 */
682 felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
683 felem_assign(e32, ftmp);
684 for (i = 0; i < 32; i++) {
685 felem_square(ftmp, ftmp);
686 } /* 2^64 - 2^32 */
687 felem_assign(e64, ftmp);
688 felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
689 for (i = 0; i < 192; i++) {
690 felem_square(ftmp, ftmp);
691 } /* 2^256 - 2^224 + 2^192 */
692
693 felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
694 for (i = 0; i < 16; i++) {
695 felem_square(ftmp2, ftmp2);
696 } /* 2^80 - 2^16 */
697 felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
698 for (i = 0; i < 8; i++) {
699 felem_square(ftmp2, ftmp2);
700 } /* 2^88 - 2^8 */
701 felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
702 for (i = 0; i < 4; i++) {
703 felem_square(ftmp2, ftmp2);
704 } /* 2^92 - 2^4 */
705 felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
706 felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
707 felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
708 felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
709 felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
710 felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
711 felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */
712
713 felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
714 }
715
716 /* felem_scalar_3 sets out=3*out.
717 *
718 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
719 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
720 */
721 static void felem_scalar_3(felem out)
722 {
723 limb carry = 0;
724 unsigned int i;
725
726 for (i = 0;; i++) {
727 out[i] *= 3;
728 out[i] += carry;
729 carry = out[i] >> 29;
730 out[i] &= kBottom29Bits;
731
732 i++;
733 if (i == NLIMBS)
734 break;
735
736 out[i] *= 3;
737 out[i] += carry;
738 carry = out[i] >> 28;
739 out[i] &= kBottom28Bits;
740 }
741
742 felem_reduce_carry(out, carry);
743 }
744
745 /* felem_scalar_4 sets out=4*out.
746 *
747 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
748 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
749 */
750 static void felem_scalar_4(felem out)
751 {
752 limb carry = 0, next_carry;
753 unsigned int i;
754
755 for (i = 0;; i++) {
756 next_carry = out[i] >> 27;
757 out[i] <<= 2;
758 out[i] &= kBottom29Bits;
759 out[i] += carry;
760 carry = next_carry + (out[i] >> 29);
761 out[i] &= kBottom29Bits;
762
763 i++;
764 if (i == NLIMBS)
765 break;
766 next_carry = out[i] >> 26;
767 out[i] <<= 2;
768 out[i] &= kBottom28Bits;
769 out[i] += carry;
770 carry = next_carry + (out[i] >> 28);
771 out[i] &= kBottom28Bits;
772 }
773
774 felem_reduce_carry(out, carry);
775 }
776
777 /* felem_scalar_8 sets out=8*out.
778 *
779 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
780 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
781 */
782 static void felem_scalar_8(felem out)
783 {
784 limb carry = 0, next_carry;
785 unsigned int i;
786
787 for (i = 0;; i++) {
788 next_carry = out[i] >> 26;
789 out[i] <<= 3;
790 out[i] &= kBottom29Bits;
791 out[i] += carry;
792 carry = next_carry + (out[i] >> 29);
793 out[i] &= kBottom29Bits;
794
795 i++;
796 if (i == NLIMBS)
797 break;
798 next_carry = out[i] >> 25;
799 out[i] <<= 3;
800 out[i] &= kBottom28Bits;
801 out[i] += carry;
802 carry = next_carry + (out[i] >> 28);
803 out[i] &= kBottom28Bits;
804 }
805
806 felem_reduce_carry(out, carry);
807 }
808
809 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
810 * time depending on the value of |in|.
811 */
812 static char felem_is_zero_vartime(const felem in)
813 {
814 limb carry;
815 int i;
816 limb tmp[NLIMBS];
817 felem_assign(tmp, in);
818
819 /* First, reduce tmp to a minimal form.
820 */
821 do {
822 carry = 0;
823 for (i = 0;; i++) {
824 tmp[i] += carry;
825 carry = tmp[i] >> 29;
826 tmp[i] &= kBottom29Bits;
827
828 i++;
829 if (i == NLIMBS)
830 break;
831
832 tmp[i] += carry;
833 carry = tmp[i] >> 28;
834 tmp[i] &= kBottom28Bits;
835 }
836
837 felem_reduce_carry(tmp, carry);
838 } while (carry);
839
840 /* tmp < 2**257, so the only possible zero values are 0, p and 2p.
841 */
842 return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
843 memcmp(tmp, kP, sizeof(tmp)) == 0 ||
844 memcmp(tmp, k2P, sizeof(tmp)) == 0;
845 }
846
847 /* Group operations:
848 *
849 * Elements of the elliptic curve group are represented in Jacobian
850 * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
851 * Jacobian form.
852 */
853
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * Uses the "dbl-2009-l" doubling formulas; see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
 */
static void point_double(felem x_out, felem y_out, felem z_out,
                         const felem x, const felem y, const felem z)
{
    felem delta, gamma, alpha, beta, tmp, tmp2;

    felem_square(delta, z);              /* delta = Z**2 */
    felem_square(gamma, y);              /* gamma = Y**2 */
    felem_mul(beta, x, gamma);           /* beta = X*gamma */

    /* alpha = 3*(X - delta)*(X + delta) */
    felem_sum(tmp, x, delta);
    felem_diff(tmp2, x, delta);
    felem_mul(alpha, tmp, tmp2);
    felem_scalar_3(alpha);

    /* Z' = (Y + Z)**2 - gamma - delta (= 2*Y*Z) */
    felem_sum(tmp, y, z);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, gamma);
    felem_diff(z_out, tmp, delta);

    /* X' = alpha**2 - 8*beta. From here on, beta holds 4*beta. */
    felem_scalar_4(beta);
    felem_square(x_out, alpha);
    felem_diff(x_out, x_out, beta);
    felem_diff(x_out, x_out, beta);

    /* Y' = alpha*(4*beta - X') - 8*gamma**2 */
    felem_diff(tmp, beta, x_out);
    felem_mul(tmp, alpha, tmp);
    felem_square(tmp2, gamma);
    felem_scalar_8(tmp2);
    felem_diff(y_out, tmp, tmp2);
}
888
/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
 * (i.e. the second point is affine.)
 *
 * Uses the "add-2007-bl" addition formulas; see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly.
 */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
                            const felem x1, const felem y1, const felem z1,
                            const felem x2, const felem y2)
{
    felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);              /* z1z1 = Z1**2 */
    felem_sum(tmp, z1, z1);              /* tmp = 2*Z1 */

    felem_mul(u2, x2, z1z1);             /* u2 = X2*Z1**2 */
    felem_mul(z1z1z1, z1, z1z1);         /* z1z1z1 = Z1**3 */
    felem_mul(s2, y2, z1z1z1);           /* s2 = Y2*Z1**3 */
    felem_diff(h, u2, x1);               /* h = u2 - X1 */
    felem_sum(i, h, h);
    felem_square(i, i);                  /* i = (2*h)**2 */
    felem_mul(j, h, i);                  /* j = h*i */
    felem_diff(r, s2, y1);
    felem_sum(r, r, r);                  /* r = 2*(s2 - Y1) */
    felem_mul(v, x1, i);                 /* v = X1*i */

    felem_mul(z_out, tmp, h);            /* Z3 = 2*Z1*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);         /* X3 = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, y1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);       /* Y3 = r*(v - X3) - 2*Y1*j */
}
929
/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * Uses the "add-2007-bl" addition formulas; see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly.
 */
static void point_add(felem x_out, felem y_out, felem z_out,
                      const felem x1, const felem y1, const felem z1,
                      const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);              /* z1z1 = Z1**2 */
    felem_square(z2z2, z2);              /* z2z2 = Z2**2 */
    felem_mul(u1, x1, z2z2);             /* u1 = X1*Z2**2 */

    /* tmp = (Z1 + Z2)**2 - Z1**2 - Z2**2 = 2*Z1*Z2 */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);         /* z2z2z2 = Z2**3 */
    felem_mul(s1, y1, z2z2z2);           /* s1 = Y1*Z2**3 */

    felem_mul(u2, x2, z1z1);             /* u2 = X2*Z1**2 */
    felem_mul(z1z1z1, z1, z1z1);         /* z1z1z1 = Z1**3 */
    felem_mul(s2, y2, z1z1z1);           /* s2 = Y2*Z1**3 */
    felem_diff(h, u2, u1);               /* h = u2 - u1 */
    felem_sum(i, h, h);
    felem_square(i, i);                  /* i = (2*h)**2 */
    felem_mul(j, h, i);                  /* j = h*i */
    felem_diff(r, s2, s1);
    felem_sum(r, r, r);                  /* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);                 /* v = u1*i */

    felem_mul(z_out, tmp, h);            /* Z3 = 2*Z1*Z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);         /* X3 = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);       /* Y3 = r*(v - X3) - 2*s1*j */
}
978
/* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
 * {x2,y2,z2}.
 *
 * Uses the "add-2007-bl" addition formulas; see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * This function handles the case where {x1,y1,z1}={x2,y2,z2} by falling back
 * to point_double. As the name says, it runs in variable time (the equality
 * tests below leak which path was taken), so it must only be used with
 * public inputs.
 */
static void point_add_or_double_vartime(
    felem x_out, felem y_out, felem z_out,
    const felem x1, const felem y1, const felem z1,
    const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
    char x_equal, y_equal;

    felem_square(z1z1, z1);
    felem_square(z2z2, z2);
    felem_mul(u1, x1, z2z2);             /* u1 = X1*Z2**2 */

    /* tmp = (Z1 + Z2)**2 - Z1**2 - Z2**2 = 2*Z1*Z2 */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);
    felem_mul(s1, y1, z2z2z2);           /* s1 = Y1*Z2**3 */

    felem_mul(u2, x2, z1z1);             /* u2 = X2*Z1**2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);           /* s2 = Y2*Z1**3 */
    felem_diff(h, u2, u1);               /* h == 0 iff the affine x's match */
    x_equal = felem_is_zero_vartime(h);
    felem_sum(i, h, h);
    felem_square(i, i);
    felem_mul(j, h, i);
    felem_diff(r, s2, s1);               /* r == 0 (pre-doubling) iff the
                                          * affine y's match */
    y_equal = felem_is_zero_vartime(r);
    if (x_equal && y_equal) {
        /* Same point: the addition formulas degenerate (h == r == 0), so use
         * the doubling formulas instead. */
        point_double(x_out, y_out, z_out, x1, y1, z1);
        return;
    }
    felem_sum(r, r, r);                  /* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);

    felem_mul(z_out, tmp, h);            /* Z3 = 2*Z1*Z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);         /* X3 = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);       /* Y3 = r*(v - X3) - 2*s1*j */
}
1035
1036 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.
1037 *
1038 * On entry: mask is either 0 or 0xffffffff.
1039 */
1040 static void copy_conditional(felem out, const felem in, limb mask)
1041 {
1042 int i;
1043
1044 for (i = 0; i < NLIMBS; i++) {
1045 const limb tmp = mask & (in[i] ^ out[i]);
1046 out[i] ^= tmp;
1047 }
1048 }
1049
/* select_affine_point sets {out_x,out_y} to the index'th entry of table.
 * On entry: index < 16, table[0] must be zero.
 *
 * The lookup is constant time: every entry is read and masked, so the memory
 * access pattern is independent of the (secret) |index|. |table| holds the
 * entries for indices 1..15 consecutively, two felems (x then y) each; entry
 * 0 (the point at infinity) is implicit in the initial zeroing of the
 * outputs.
 */
static void select_affine_point(felem out_x, felem out_y,
                                const limb *table, limb index)
{
    limb i, j;

    memset(out_x, 0, sizeof(felem));
    memset(out_y, 0, sizeof(felem));

    for (i = 1; i < 16; i++) {
        /* Compute mask = (i == index) ? 0xffffffff : 0 without branching:
         * OR the four bits of i ^ index down into bit zero, then subtract
         * one so that only a zero difference yields all-ones. */
        limb mask = i ^ index;
        mask |= mask >> 2;
        mask |= mask >> 1;
        mask &= 1;
        mask--;
        for (j = 0; j < NLIMBS; j++, table++) {
            out_x[j] |= *table & mask;
        }
        for (j = 0; j < NLIMBS; j++, table++) {
            out_y[j] |= *table & mask;
        }
    }
}
1075
/* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
 * table. On entry: index < 16, table[0] must be zero.
 *
 * Like select_affine_point, the lookup is constant time: every entry is read
 * and masked so the memory access pattern does not depend on the (secret)
 * |index|. Here each entry is three felems (x, y, z) and the table does
 * contain entry 0.
 */
static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
                                  const limb *table, limb index)
{
    limb i, j;

    memset(out_x, 0, sizeof(felem));
    memset(out_y, 0, sizeof(felem));
    memset(out_z, 0, sizeof(felem));

    /* The implicit value at index 0 is all zero. We don't need to perform that
     * iteration of the loop because we already set out_* to zero.
     */
    table += 3*NLIMBS;

    for (i = 1; i < 16; i++) {
        /* mask = (i == index) ? 0xffffffff : 0, computed without branching;
         * see select_affine_point. */
        limb mask = i ^ index;
        mask |= mask >> 2;
        mask |= mask >> 1;
        mask &= 1;
        mask--;
        for (j = 0; j < NLIMBS; j++, table++) {
            out_x[j] |= *table & mask;
        }
        for (j = 0; j < NLIMBS; j++, table++) {
            out_y[j] |= *table & mask;
        }
        for (j = 0; j < NLIMBS; j++, table++) {
            out_z[j] |= *table & mask;
        }
    }
}
1110
1111 /* get_bit returns the bit'th bit of scalar. */
1112 static char get_bit(const u8 scalar[32], int bit)
1113 {
1114 return ((scalar[bit >> 3]) >> (bit & 7)) & 1;
1115 }
1116
1117 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
1118 * number. Note that the value of scalar must be less than the order of the
1119 * group.
1120 */
1121 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32])
1122 {
1123 int i, j;
1124 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
1125 u32 table_offset;
1126
1127 felem px, py;
1128 felem tx, ty, tz;
1129
1130 memset(nx, 0, sizeof(felem));
1131 memset(ny, 0, sizeof(felem));
1132 memset(nz, 0, sizeof(felem));
1133
1134 /* The loop adds bits at positions 0, 64, 128 and 192, followed by
1135 * positions 32,96,160 and 224 and does this 32 times.
1136 */
1137 for (i = 0; i < 32; i++) {
1138 if (i) {
1139 point_double(nx, ny, nz, nx, ny, nz);
1140 }
1141 for (j = 0; j <= 32; j += 32) {
1142 char bit0 = get_bit(scalar, 31 - i + j);
1143 char bit1 = get_bit(scalar, 95 - i + j);
1144 char bit2 = get_bit(scalar, 159 - i + j);
1145 char bit3 = get_bit(scalar, 223 - i + j);
1146 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);
1147
1148 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS);
1149 select_affine_point(px, py, kPrecomputed + table_offset, index);
1150
1151 /* Since scalar is less than the order of the group, we know that
1152 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
1153 * below.
1154 */
1155 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
1156 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
1157 * (a.k.a. the point at infinity). We handle that situation by
1158 * copying the point from the table.
1159 */
1160 copy_conditional(nx, px, n_is_infinity_mask);
1161 copy_conditional(ny, py, n_is_infinity_mask);
1162 copy_conditional(nz, kOne, n_is_infinity_mask);
1163
1164 /* Equally, the result is also wrong if the point from the table is
1165 * zero, which happens when the index is zero. We handle that by
1166 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0.
1167 */
1168 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
1169 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1170 copy_conditional(nx, tx, mask);
1171 copy_conditional(ny, ty, mask);
1172 copy_conditional(nz, tz, mask);
1173 /* If p was not zero, then n is now non-zero. */
1174 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1175 }
1176 }
1177 }
1178
1179 /* point_to_affine converts a Jacobian point to an affine point. If the input
1180 * is the point at infinity then it returns (0, 0) in constant time.
1181 */
1182 static void point_to_affine(felem x_out, felem y_out,
1183 const felem nx, const felem ny, const felem nz) {
1184 felem z_inv, z_inv_sq;
1185 felem_inv(z_inv, nz);
1186 felem_square(z_inv_sq, z_inv);
1187 felem_mul(x_out, nx, z_inv_sq);
1188 felem_mul(z_inv, z_inv, z_inv_sq);
1189 felem_mul(y_out, ny, z_inv);
1190 }
1191
1192 /* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */
1193 static void scalar_mult(felem nx, felem ny, felem nz,
1194 const felem x, const felem y, const u8 scalar[32])
1195 {
1196 int i;
1197 felem px, py, pz, tx, ty, tz;
1198 felem precomp[16][3];
1199 limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;
1200
1201 /* We precompute 0,1,2,... times {x,y}. */
1202 memset(precomp, 0, sizeof(felem) * 3);
1203 memcpy(&precomp[1][0], x, sizeof(felem));
1204 memcpy(&precomp[1][1], y, sizeof(felem));
1205 memcpy(&precomp[1][2], kOne, sizeof(felem));
1206
1207 for (i = 2; i < 16; i += 2) {
1208 point_double(precomp[i][0], precomp[i][1], precomp[i][2],
1209 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);
1210
1211 point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
1212 precomp[i][0], precomp[i][1], precomp[i][2], x, y);
1213 }
1214
1215 memset(nx, 0, sizeof(felem));
1216 memset(ny, 0, sizeof(felem));
1217 memset(nz, 0, sizeof(felem));
1218 n_is_infinity_mask = -1;
1219
1220 /* We add in a window of four bits each iteration and do this 64 times. */
1221 for (i = 0; i < 64; i++) {
1222 if (i) {
1223 point_double(nx, ny, nz, nx, ny, nz);
1224 point_double(nx, ny, nz, nx, ny, nz);
1225 point_double(nx, ny, nz, nx, ny, nz);
1226 point_double(nx, ny, nz, nx, ny, nz);
1227 }
1228
1229 index = scalar[31 - i / 2];
1230 if ((i & 1) == 1) {
1231 index &= 15;
1232 } else {
1233 index >>= 4;
1234 }
1235
1236 /* See the comments in scalar_base_mult about handling infinities. */
1237 select_jacobian_point(px, py, pz, (limb *) precomp, index);
1238 point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
1239 copy_conditional(nx, px, n_is_infinity_mask);
1240 copy_conditional(ny, py, n_is_infinity_mask);
1241 copy_conditional(nz, pz, n_is_infinity_mask);
1242
1243 p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31;
1244 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1245 copy_conditional(nx, tx, mask);
1246 copy_conditional(ny, ty, mask);
1247 copy_conditional(nz, tz, mask);
1248 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1249 }
1250 }
1251
1252 /* Interface with Freebl: */
1253
#if IS_BIG_ENDIAN
#error "This code needs a little-endian processor"
#endif

/* kRInvDigits is R**-1 mod p, where R = 2**257 is the Montgomery factor used
 * by this code (see to_montgomery/from_montgomery), as eight little-endian
 * 32-bit digits. */
static const u32 kRInvDigits[8] = {
    0x80000000, 1, 0xffffffff, 0,
    0x80000001, 0xfffffffe, 1, 0x7fffffff
};
#define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit))
static const mp_int kRInv = {
    MP_ZPOS,
    MP_DIGITS_IN_256_BITS,
    MP_DIGITS_IN_256_BITS,
    /* Because we are running on a little-endian processor, this cast works for
     * both 32 and 64-bit processors.
     */
    (mp_digit*) kRInvDigits
};

/* Powers of two used as divisors/multipliers when converting between the MPI
 * representation and the alternating 29/28-bit limb representation. */
static const limb kTwo28 = 0x10000000;
static const limb kTwo29 = 0x20000000;
1275
1276 /* to_montgomery sets out = R*in. */
1277 static mp_err to_montgomery(felem out, const mp_int *in, const ECGroup *group)
1278 {
1279 /* There are no MPI functions for bitshift operations and we wish to shift
1280 * in 257 bits left so we move the digits 256-bits left and then multiply
1281 * by two.
1282 */
1283 mp_int in_shifted;
1284 int i;
1285 mp_err res;
1286
1287 mp_init(&in_shifted);
1288 s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS);
1289 memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS),
1290 MP_DIGITS(in),
1291 MP_USED(in)*sizeof(mp_digit));
1292 mp_mul_2(&in_shifted, &in_shifted);
1293 MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth));
1294
1295 for (i = 0;; i++) {
1296 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits;
1297 mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL);
1298
1299 i++;
1300 if (i == NLIMBS)
1301 break;
1302 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits;
1303 mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL);
1304 }
1305
1306 CLEANUP:
1307 mp_clear(&in_shifted);
1308 return res;
1309 }
1310
1311 /* from_montgomery sets out=in/R. */
1312 static mp_err from_montgomery(mp_int *out, const felem in,
1313 const ECGroup *group)
1314 {
1315 mp_int result, tmp;
1316 mp_err res;
1317 int i;
1318
1319 mp_init(&result);
1320 mp_init(&tmp);
1321
1322 MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result));
1323 for (i = NLIMBS-2; i >= 0; i--) {
1324 if ((i & 1) == 0) {
1325 MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp));
1326 } else {
1327 MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp));
1328 }
1329 MP_CHECKOK(mp_add_d(&tmp, in[i], &result));
1330 }
1331
1332 MP_CHECKOK(mp_mul(&result, &kRInv, out));
1333 MP_CHECKOK(group->meth->field_mod(out, out, group->meth));
1334
1335 CLEANUP:
1336 mp_clear(&result);
1337 mp_clear(&tmp);
1338 return res;
1339 }
1340
/* scalar_from_mp_int sets out_scalar=n, as a little-endian byte string, where
 * n < the group order. */
static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n)
{
    /* We require that |n| is less than the order of the group and therefore it
     * will fit into |out_scalar|. However, there is a timing side-channel here
     * that we cannot avoid: if |n| is sufficiently small it may be one or more
     * words too short and we'll copy less data.
     *
     * NOTE(review): copying MP_USED(n) whole mp_digits assumes a little-endian
     * digit layout (enforced by the IS_BIG_ENDIAN check above) and that
     * MP_USED(n)*sizeof(mp_digit) <= 32, which follows from n being less than
     * the 256-bit group order.
     */
    memset(out_scalar, 0, 32);
    memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit));
}
1352
1353 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1354 * order of the group.
1355 */
1356 mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n,
1357 mp_int *out_x, mp_int *out_y,
1358 const ECGroup *group)
1359 {
1360 u8 scalar[32];
1361 felem x, y, z, x_affine, y_affine;
1362 mp_err res;
1363
1364 /* FIXME(agl): test that n < order. */
1365
1366 scalar_from_mp_int(scalar, n);
1367 scalar_base_mult(x, y, z, scalar);
1368 point_to_affine(x_affine, y_affine, x, y, z);
1369 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1370 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1371
1372 CLEANUP:
1373 return res;
1374 }
1375
1376 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
1377 * the order of the group.
1378 */
1379 mp_err ec_GFp_nistp256_point_mul(const mp_int *n,
1380 const mp_int *in_x, const mp_int *in_y,
1381 mp_int *out_x, mp_int *out_y,
1382 const ECGroup *group)
1383 {
1384 u8 scalar[32];
1385 felem x, y, z, x_affine, y_affine, px, py;
1386 mp_err res;
1387
1388 scalar_from_mp_int(scalar, n);
1389
1390 MP_CHECKOK(to_montgomery(px, in_x, group));
1391 MP_CHECKOK(to_montgomery(py, in_y, group));
1392
1393 scalar_mult(x, y, z, px, py, scalar);
1394 point_to_affine(x_affine, y_affine, x, y, z);
1395 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1396 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1397
1398 CLEANUP:
1399 return res;
1400 }
1401
/* ec_GFp_nistp256_points_mul_vartime sets {out_x,out_y} = n1*G +
 * n2*{in_x,in_y}, where n1 and n2 are < the order of the group.
 *
 * As indicated by the name, this function operates in variable time. This
 * is safe because it's used for signature validation which doesn't deal
 * with secrets.
 */
mp_err ec_GFp_nistp256_points_mul_vartime(
    const mp_int *n1, const mp_int *n2,
    const mp_int *in_x, const mp_int *in_y,
    mp_int *out_x, mp_int *out_y,
    const ECGroup *group)
{
    u8 scalar1[32], scalar2[32];
    felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py;
    mp_err res = MP_OKAY;

    /* If n2 == NULL, this is just a base-point multiplication. */
    if (n2 == NULL) {
        return ec_GFp_nistp256_base_point_mul(n1, out_x, out_y, group);
    }

    /* If n1 == NULL, this is just an arbitrary-point multiplication. */
    if (n1 == NULL) {
        return ec_GFp_nistp256_point_mul(n2, in_x, in_y, out_x, out_y, group);
    }

    /* If both scalars are zero, then the result is the point at infinity. */
    if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) {
        mp_zero(out_x);
        mp_zero(out_y);
        return res;
    }

    scalar_from_mp_int(scalar1, n1);
    scalar_from_mp_int(scalar2, n2);

    /* Compute the two halves of the sum: x1 = n1*G, x2 = n2*{in_x,in_y}. */
    MP_CHECKOK(to_montgomery(px, in_x, group));
    MP_CHECKOK(to_montgomery(py, in_y, group));
    scalar_base_mult(x1, y1, z1, scalar1);
    scalar_mult(x2, y2, z2, px, py, scalar2);

    if (mp_cmp_z(n2) == 0) {
        /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
         * {x1,y1,z1}. */
    } else if (mp_cmp_z(n1) == 0) {
        /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
         * {x2,y2,z2}. */
        memcpy(x1, x2, sizeof(x2));
        memcpy(y1, y2, sizeof(y2));
        memcpy(z1, z2, sizeof(z2));
    } else {
        /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
        point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
    }

    point_to_affine(x_affine, y_affine, x1, y1, z1);
    MP_CHECKOK(from_montgomery(out_x, x_affine, group));
    MP_CHECKOK(from_montgomery(out_y, y_affine, group));

CLEANUP:
    return res;
}
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698