Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(219)

Side by Side Diff: ecp_256_32.c

Issue 10944017: 32-bit, P-256 implementation. Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #include <endian.h>
wtc 2012/10/09 21:04:47 <endian.h> is not portable. For example, it doesn'
agl 2012/10/11 14:55:04 In which case there doesn't appear to be any porta
wtc 2012/10/11 16:29:00 Yes, NSPR provides the endianness of the system us
agl 2012/10/11 18:51:01 Thanks, done.
2
3 #include <stdint.h>
4 #include <stdio.h>
5
6 #include <string.h>
7 #include <stdlib.h>
8 #include <unistd.h>
wtc 2012/10/09 21:04:47 Do you remember why you need <unistd.h>? This head
agl 2012/10/11 14:55:04 It's probably leftover from debugging calls. Will
9
10 #include "mpi.h"
11 #include "mpi-priv.h"
12 #include "ecp.h"
13
/* Short fixed-width integer aliases used throughout this file. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef int32_t s32;
typedef uint64_t u64;
18
19 /* Our field elements are represented as nine 32-bit words. Freebl's MPI
wtc 2012/10/09 21:04:47 Nit: I think the comma after "nine" is not necessa
agl 2012/10/11 14:55:04 Done.
20 * library calls them digits, but here they are called limbs, which is GMP's
21 * terminology.
22 *
23 * The value of an felem (field element) is:
24 * x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
25 *
26 * That is, each limb is alternately 29 or 28 bits wide in little-endian
27 * order.
28 *
29 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
30 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
31 * when multiplying as terms end up one bit short of a limb which would require
wtc 2012/10/09 21:04:47 Typo: multipling => multiplying
agl 2012/10/11 14:55:04 Done.
32 * much bit-shifting to correct.
33 *
34 * Finally, the values stored in an felem are in Montgomery form. So the value
35 * |y| is stored as (y * 2**257) mod p, where p is the P-256 prime.
36 */
/* limb is one digit of a field element; even-indexed limbs carry 29 bits
 * and odd-indexed limbs 28 bits (see the representation comment above). */
typedef uint32_t limb;
/* NLIMBS is the number of limbs in an felem: nine limbs cover 257 bits. */
#define NLIMBS 9
typedef limb felem[NLIMBS];
40
/* kBottom28Bits and kBottom29Bits mask a limb down to its low 28 or 29
 * bits respectively. */
static const limb kBottom28Bits = 0xfffffff;
static const limb kBottom29Bits = 0x1fffffff;

/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
 * 28-bit words. */
static const felem kOne = {
    2, 0, 0, 0xffff800,
    0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
    0
};
/* kZero is the number 0 as an felem. */
static const limb kZero[NLIMBS] = {0};
/* kP is the P-256 prime, p = 2**256 - 2**224 + 2**192 + 2**96 - 1, in the
 * 29/28-bit limb pattern described above. */
static const limb kP[NLIMBS] = {
    0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
    0, 0, 0x200000, 0xf000000,
    0xfffffff
};
/* k2P is 2*p in the same representation; kZero, kP and k2P are the three
 * possible minimal-form encodings of zero mod p (see
 * felem_is_zero_vartime). */
static const limb k2P[NLIMBS] = {
    0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
    0, 0, 0x400000, 0xe000000,
    0x1fffffff
};
62 /* kPrecomputed contains precomputed values to aid the calculation of scalar
63 * multiples of the base point, G. It's actually two, equal length, tables
64 * concatenated.
65 *
66 * The first table contains (x,y) felem pairs for 16 multiples of the base
67 * point, G.
68 *
69 * Index | Index (binary) | Value
70 * 0 | 0000 | 0G (all zeros, omitted)
71 * 1 | 0001 | G
72 * 2 | 0010 | 2**64G
73 * 3 | 0011 | 2**64G + G
74 * 4 | 0100 | 2**128G
75 * 5 | 0101 | 2**128G + G
76 * 6 | 0110 | 2**128G + 2**64G
77 * 7 | 0111 | 2**128G + 2**64G + G
78 * 8 | 1000 | 2**192G
79 * 9 | 1001 | 2**192G + G
80 * 10 | 1010 | 2**192G + 2**64G
81 * 11 | 1011 | 2**192G + 2**64G + G
82 * 12 | 1100 | 2**192G + 2**128G
83 * 13 | 1101 | 2**192G + 2**128G + G
84 * 14 | 1110 | 2**192G + 2**128G + 2**64G
85 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
86 *
87 * The second table follows the same style, but the multiples are 2**32G,
88 * 2**96G, 2**160G, 2**224G.
89 *
90 * This is ~2KB of data. */
91 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
92 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7e dc, 0xd4a6eab, 0x3120bee,
93 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba 21, 0x14b10bb, 0xae3fe3,
94 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe4907 3, 0x3fa36cc, 0x5ebcd2c,
95 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea1244 6, 0xe1ade1e, 0xec91f22,
96 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109 , 0xa267a00, 0xb57c050,
97 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0 x7d6dee7, 0x2976e4b,
98 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a 5a9, 0x843a649, 0xc3ab0fa,
99 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11 , 0x58c43df, 0xf423fc2,
100 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db4 0f, 0x83e277d, 0xb0dd609,
101 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5 , 0xe10c9e, 0x33ab581,
102 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f , 0x48764cd, 0x76dbcca,
103 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b2 0, 0x4ba3173, 0xc168c33,
104 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0 , 0x65dd7ff, 0x3a1e4f6,
105 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f07 7, 0xa6add89, 0x4894acd,
106 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
107 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c , 0xda0cf5b, 0x812e881,
108 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51 , 0xc22be3e, 0xe35e65a,
109 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9 , 0x1c5a839, 0x47a1e26,
110 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c5 02, 0x2f32042, 0xa17769b,
111 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a 02, 0x3fc93, 0x5620023,
112 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c , 0x407f75c, 0xbaab133,
113 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea 7, 0x3293ac0, 0xcdc98aa,
114 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
115 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72 , 0x73e1c35, 0xee70fbc,
116 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85 , 0x27de188, 0x66f70b8,
117 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae 914, 0x2f3ec51, 0x3826b59,
118 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x 823d9d2, 0x8213f39,
119 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4 a, 0xf5ddc3d, 0x3786689,
120 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a72 9, 0x4be3499, 0x52b23aa,
121 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb0480 35, 0xe31de66, 0xc6ecaa3,
122 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a752 9, 0xcb7beb1, 0xb2a78a1,
123 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff6 58, 0xe3d6511, 0xc7d76f,
124 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
125 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d3241 1, 0xb04a838, 0xd760d2d,
126 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11 e, 0x20bca9a, 0x66f496b,
127 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
128 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56 ff, 0x65ef930, 0x21dc4a,
129 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15 f, 0x624e62e, 0xa90ae2f,
130 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522 b, 0xdc78583, 0x40eeabb,
131 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef3 4, 0xae2a960, 0x91b8bdc,
132 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0 x2413c8e, 0x5425bf9,
133 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633 , 0x7c91952, 0xd806dce,
134 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef7 3, 0x8956f34, 0xe4b5cf2,
135 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7 , 0x627b614, 0x7371cca,
136 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc 9, 0x9c19bf2, 0x5882229,
137 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b 3, 0xe85ff25, 0x408ef57,
138 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113 , 0xa4a1769, 0x11fbc6c,
139 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b 7, 0x4acbad9, 0x5efc5fa,
140 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc , 0x7bf0fa9, 0x957651,
141 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
142 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c1 2d, 0xf20bd46, 0x1951fa7,
143 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74 , 0x99bb618, 0x2db944c,
144 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e7477 9, 0x576138, 0x9587927,
145 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782 d, 0xfc72e0b, 0x701b298,
146 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5 d8, 0xf858d3a, 0x942eea8,
147 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a 1, 0x8395659, 0x52ed4e2,
148 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c 0, 0x6bdf55a, 0x4e4457d,
149 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747 b, 0x878558d, 0x7d29aa4,
150 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8,
151 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78,
152 };
153
154
155 /* Field element operations: */
156
/* NON_ZERO_TO_ALL_ONES returns:
 *   0xffffffff for 0 < x <= 2**31
 *   0 for x == 0 or x > 2**31.
 *
 * x must be a u32 or an equivalent quantity such as a limb.
 *
 * This form uses only unsigned arithmetic: ((x) - 1) >> 31 is 1 exactly
 * when the subtraction wrapped (x == 0) or when x > 2**31; subtracting one
 * from that bit yields 0 or all ones respectively. The previous version,
 * (~((u32) (((s32) (x-1)) >> 31))), right-shifted a negative signed value,
 * whose result is implementation-defined in C. */
#define NON_ZERO_TO_ALL_ONES(x) ((((uint32_t)(x) - 1) >> 31) - 1)
161
/* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
 * which is a term at 2**257.
 *
 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
static void felem_reduce_carry(felem inout, limb carry)
{
    /* carry_mask is all ones iff carry != 0 (carry < 2**3, so the macro's
     * 0 < x <= 2**31 condition holds); it selects the constant additions
     * below in constant time, with no data-dependent branch. */
    const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);

    inout[0] += carry << 1;
    inout[3] += 0x10000000 & carry_mask;
    inout[3] -= carry << 11;
    inout[4] += (0x20000000 - 1) & carry_mask;
    inout[5] += (0x10000000 - 1) & carry_mask;
    inout[6] += (0x20000000 - 1) & carry_mask;
    inout[6] -= carry << 22;
    inout[7] -= 1 & carry_mask;
    inout[7] += carry << 25;
}
181
182 /* felem_sum sets out = in+in2.
183 *
184 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
185 * On exit: in[0,2,...] < 2**30, in2[1,3,...] < 2**29 */
186 static void felem_sum(felem out, const felem in, const felem in2)
187 {
188 limb carry = 0;
189 unsigned i;
190 for (i = 0; i < 9; i++) {
wtc 2012/10/09 21:04:47 Should "9" be changed to NLIMBS here and on line 1
agl 2012/10/11 14:55:04 I've changed it to NLIMBS here and in other places
191 out[i] = in[i] + in2[i];
192 out[i] += carry;
193 carry = out[i] >> 29;
194 out[i] &= kBottom29Bits;
195
196 i++;
197 if (i == 9)
198 break;
199
200 out[i] = in[i] + in2[i];
201 out[i] += carry;
202 carry = out[i] >> 28;
203 out[i] &= kBottom28Bits;
204 }
205
206 felem_reduce_carry(out, carry);
207 }
208
209 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
210 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
211 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
212 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
213 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
214 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
215
216 /* zero31 is 0 mod p. */
wtc 2012/10/09 21:04:47 zero31 is 0 mod p. Just curious: is it p, 2p, 3p,
agl 2012/10/11 14:55:04 It ends up as 8p.
wtc 2012/10/11 16:29:00 Just curious: why is 8p chosen? Is it more conveni
agl 2012/10/11 18:51:01 The important feature of the value is that the hig
217 static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, tw o30m2, two31p24m2, two30m27m2, two31m2 };
218
219 /* felem_diff sets out = in-in2.
220 *
221 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
222 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
223 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
224 static void felem_diff(felem out, const felem in, const felem in2)
225 {
226 limb carry = 0;
227 unsigned i;
228
229 for (i = 0; i < 9; i++) {
230 out[i] = in[i] - in2[i];
231 out[i] += zero31[i];
232 out[i] += carry;
233 carry = out[i] >> 29;
234 out[i] &= kBottom29Bits;
235
236 i++;
237 if (i == 9)
238 break;
239
240 out[i] = in[i] - in2[i];
241 out[i] += zero31[i];
242 out[i] += carry;
243 carry = out[i] >> 28;
244 out[i] &= kBottom28Bits;
245 }
246
247 felem_reduce_carry(out, carry);
248 }
249
250 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
251 * with the same 29,28,... bit pattern as an felem.
252 *
253 * On entry: tmp[i] < 2**64
254 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
255 static void felem_reduce_degree(felem out, u64 tmp[17])
wtc 2012/10/09 21:04:47 Why does the 'tmp' array have a size of 17?
agl 2012/10/11 14:55:04 The result of multiplying two arrays with indexes
256 {
257 limb tmp2[18], carry, x, xMask;
258 unsigned i;
259
260 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
261 * felem. So the top of an element of tmp might overlap with another
262 * element two positions down. The following loop eliminates this
263 * overlap. */
264 tmp2[0] = tmp[0] & kBottom29Bits;
265
266 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
267 * and hint to the compiler that it can do a single-word shift by selecting
268 * the right register rather than doing a double-word shift and truncating
269 * afterwards. */
270 tmp2[1] = ((limb) tmp[0]) >> 29;
271 tmp2[1] |= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits;
272 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
273 carry = tmp2[1] >> 28;
274 tmp2[1] &= kBottom28Bits;
275
276 for (i = 2; i < 17; i++) {
277 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
278 tmp2[i] += ((limb) (tmp[i - 1])) >> 28;
279 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
280 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
281 tmp2[i] += carry;
282 carry = tmp2[i] >> 29;
283 tmp2[i] &= kBottom29Bits;
284
285 i++;
286 if (i == 17)
287 break;
288 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
289 tmp2[i] += ((limb) (tmp[i - 1])) >> 29;
290 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
291 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
292 tmp2[i] += carry;
293 carry = tmp2[i] >> 28;
294 tmp2[i] &= kBottom28Bits;
295 }
296
297 tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25;
298 tmp2[17] += ((limb) (tmp[16])) >> 29;
299 tmp2[17] += (((limb) (tmp[16] >> 32)) << 3);
300 tmp2[17] += carry;
301
302 /* Montgomery elimination of terms.
303 *
304 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
305 * Since we just multiplied two Montgomery values together, the result is
306 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
307 * in Montgomery form.
308 *
309 * Since R is 2**257, we can do that with a bitwise shift if we can ensure
310 * that the right-most 257 bits are all zero. We can make that true by
311 * adding multiplies of p without affecting the value.
312 *
313 * So we eliminate limbs from right to left. Since the bottom 29 bits of p
314 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
315 * We can do that for 8 further limbs and then right shift to eliminate the
316 * extra factor of R. */
317 for (i = 0; i < 9; i++) {
318 tmp2[i + 1] += tmp2[i] >> 29;
319 x = tmp2[i] & kBottom29Bits;
320 tmp2[i] = 0;
321
322 /* The bounds calculations for this loop are tricky. Each iteration of
323 * the loop eliminates two words by adding values to words to their
324 * right.
325 *
326 * The following table contains the amounts added to each word (as an
327 * offset from the value of i at the top of the loop). The amounts are
328 * accounted for from the first and second half of the loop separately
329 * and are written as, for example, 28 to mean a value <2**28.
330 *
331 * Word: 3 4 5 6 7 8 9 10
332 * Added in top half: 28 14 29 26 29 28
333 * 28 29
334 * Added in bottom half: 29 14 28 25 28 28
335 * 29
336 *
337 * The value that is current offset 7 will be offset 5 for the next
338 * iteration and then offset 3 for the iteration after that. Therefore
339 * the total value added will be the values added at 7, 5 and 3.
340 *
341 * The following table accumulates these values. The sums at the bottom
342 * are written as, for example, 29+28, to mean a value < 2**29+2**28.
343 *
344 * Word: 3 4 5 6 7 8 9 10 11
345 * 28 14 14 29 26 29 28 28 28
346 * 29 28 29 28 29 28 29 28
347 * 14 28 25 26 29 26
348 * 28 29 28 28 28
349 * 29 28 29 28
350 * 29 14 29 14
351 * 14 28 14 28
352 * 29
353 * ----------------------------------
354 * 30+ 31+ 30+
355 * 28 30+ 28
356 * 14
357 *
358 * So the greatest amount is added to tmp2[10]. If tmp2[10] has an
359 * initial value of <2**29, then the maximum value will be < 2**31 +
360 * 2**30 + 2**29 + 2**14, which is < 2**32, as required. */
361 tmp2[i + 3] += (x << 10) & kBottom28Bits;
362 tmp2[i + 4] += (x >> 18);
363
364 tmp2[i + 6] += (x << 21) & kBottom29Bits;
365 tmp2[i + 7] += x >> 8;
366
367 /* At position 200 we have a factor of 0xf000000 = 2**28 - 2**24. */
368 tmp2[i + 7] += 0x10000000;
369 tmp2[i + 8] += x - 1;
370 tmp2[i + 7] -= (x << 24) & kBottom28Bits;
371 tmp2[i + 8] -= x >> 4;
372
373 tmp2[i + 8] += 0x20000000;
374 tmp2[i + 8] -= x;
375 tmp2[i + 8] += (x << 28) & kBottom29Bits;
376 tmp2[i + 9] += (x >> 1) - 1;
377
378 i++;
379 if (i == 9)
380 break;
381 tmp2[i + 1] += tmp2[i] >> 28;
382 x = tmp2[i] & kBottom28Bits;
383 tmp2[i] = 0;
384
385 tmp2[i + 3] += (x << 11) & kBottom29Bits;
386 tmp2[i + 4] += (x >> 18);
387
388 tmp2[i + 6] += (x << 21) & kBottom28Bits;
389 tmp2[i + 7] += x >> 7;
390
391 /* At position 199 we have a factor of 0x1e000000 = 2**29 - 2**25. */
392 tmp2[i + 7] += 0x20000000;
393 tmp2[i + 8] += x - 1;
394 tmp2[i + 7] -= (x << 25) & kBottom29Bits;
395 tmp2[i + 8] -= x >> 4;
396
397 /* If x is < 2**28, therefore is x > 0 iff x-1 has a zero MSB. Thus xMas k
398 is all ones iff x != 0. */
399 xMask = ((s32) ~ (x - 1)) >> 31;
400 tmp2[i + 8] += 0x10000000 & xMask;
401 tmp2[i + 8] -= x & xMask;
402 tmp2[i + 9] += (x - 1) & xMask;
403 }
404
405 /* We merge the right shift with a carry chain. The words above 2**257 have
406 * widths of 28,29,... which we need to correct when copying them down. */
407 carry = 0;
408 for (i = 0; i < 8; i++) {
409 /* The maximum value of tmp2[i + 9] occurs on the first iteration and
410 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
411 * therefore safe. */
412 out[i] = tmp2[i + 9];
413 out[i] += carry;
414 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
415 carry = out[i] >> 29;
416 out[i] &= kBottom29Bits;
417
418 i++;
419 out[i] = tmp2[i + 9] >> 1;
420 out[i] += carry;
421 carry = out[i] >> 28;
422 out[i] &= kBottom28Bits;
423 }
424
425 out[8] = tmp2[17];
426 out[8] += carry;
427 carry = out[8] >> 29;
428 out[8] &= kBottom29Bits;
429
430 felem_reduce_carry(out, carry);
431 }
432
/* felem_square sets out=in*in.
 *
 * Each tmp[k] accumulates the products in[i]*in[j] with i+j == k. Cross
 * terms (i != j) appear twice and so carry a factor of two; when both i and
 * j are odd the product lands one bit below tmp[k]'s limb position in the
 * 29,28,... pattern, so those terms are doubled again (hence the << 2).
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_square(felem out, const felem in)
{
    uint64_t tmp[17];

    tmp[0] = ((uint64_t) in[0]) * in[0];
    tmp[1] = ((uint64_t) in[0]) * (in[1] << 1);
    tmp[2] = ((uint64_t) in[0]) * (in[2] << 1) +
             ((uint64_t) in[1]) * (in[1] << 1);
    tmp[3] = ((uint64_t) in[0]) * (in[3] << 1) +
             ((uint64_t) in[1]) * (in[2] << 1);
    tmp[4] = ((uint64_t) in[0]) * (in[4] << 1) +
             ((uint64_t) in[1]) * (in[3] << 2) +
             ((uint64_t) in[2]) * in[2];
    tmp[5] = ((uint64_t) in[0]) * (in[5] << 1) +
             ((uint64_t) in[1]) * (in[4] << 1) +
             ((uint64_t) in[2]) * (in[3] << 1);
    tmp[6] = ((uint64_t) in[0]) * (in[6] << 1) +
             ((uint64_t) in[1]) * (in[5] << 2) +
             ((uint64_t) in[2]) * (in[4] << 1) +
             ((uint64_t) in[3]) * (in[3] << 1);
    tmp[7] = ((uint64_t) in[0]) * (in[7] << 1) +
             ((uint64_t) in[1]) * (in[6] << 1) +
             ((uint64_t) in[2]) * (in[5] << 1) +
             ((uint64_t) in[3]) * (in[4] << 1);
    /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
     * which is < 2**64 as required. */
    tmp[8] = ((uint64_t) in[0]) * (in[8] << 1) +
             ((uint64_t) in[1]) * (in[7] << 2) +
             ((uint64_t) in[2]) * (in[6] << 1) +
             ((uint64_t) in[3]) * (in[5] << 2) +
             ((uint64_t) in[4]) * in[4];
    tmp[9] = ((uint64_t) in[1]) * (in[8] << 1) +
             ((uint64_t) in[2]) * (in[7] << 1) +
             ((uint64_t) in[3]) * (in[6] << 1) +
             ((uint64_t) in[4]) * (in[5] << 1);
    tmp[10] = ((uint64_t) in[2]) * (in[8] << 1) +
              ((uint64_t) in[3]) * (in[7] << 2) +
              ((uint64_t) in[4]) * (in[6] << 1) +
              ((uint64_t) in[5]) * (in[5] << 1);
    tmp[11] = ((uint64_t) in[3]) * (in[8] << 1) +
              ((uint64_t) in[4]) * (in[7] << 1) +
              ((uint64_t) in[5]) * (in[6] << 1);
    tmp[12] = ((uint64_t) in[4]) * (in[8] << 1) +
              ((uint64_t) in[5]) * (in[7] << 2) +
              ((uint64_t) in[6]) * in[6];
    tmp[13] = ((uint64_t) in[5]) * (in[8] << 1) +
              ((uint64_t) in[6]) * (in[7] << 1);
    tmp[14] = ((uint64_t) in[6]) * (in[8] << 1) +
              ((uint64_t) in[7]) * (in[7] << 1);
    tmp[15] = ((uint64_t) in[7]) * (in[8] << 1);
    tmp[16] = ((uint64_t) in[8]) * in[8];

    felem_reduce_degree(out, tmp);
}
491
492 /* felem_mul sets out=in*in2.
493 *
494 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
495 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
496 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
497 static void felem_mul(felem out, const felem in, const felem in2)
498 {
499 u64 tmp[17];
500
501 tmp[0] = ((uint64_t) in[0]) * in2[0];
502 tmp[1] = ((uint64_t) in[0]) * (in2[1] << 0) +
503 ((uint64_t) in[1]) * (in2[0] << 0);
504 tmp[2] = ((uint64_t) in[0]) * (in2[2] << 0) +
505 ((uint64_t) in[1]) * (in2[1] << 1) +
506 ((uint64_t) in[2]) * (in2[0] << 0);
507 tmp[3] = ((uint64_t) in[0]) * (in2[3] << 0) +
508 ((uint64_t) in[1]) * (in2[2] << 0) +
509 ((uint64_t) in[2]) * (in2[1] << 0) +
510 ((uint64_t) in[3]) * (in2[0] << 0);
511 tmp[4] = ((uint64_t) in[0]) * (in2[4] << 0) +
512 ((uint64_t) in[1]) * (in2[3] << 1) +
513 ((uint64_t) in[2]) * (in2[2] << 0) +
514 ((uint64_t) in[3]) * (in2[1] << 1) +
515 ((uint64_t) in[4]) * (in2[0] << 0);
516 tmp[5] = ((uint64_t) in[0]) * (in2[5] << 0) +
517 ((uint64_t) in[1]) * (in2[4] << 0) +
518 ((uint64_t) in[2]) * (in2[3] << 0) +
519 ((uint64_t) in[3]) * (in2[2] << 0) +
520 ((uint64_t) in[4]) * (in2[1] << 0) +
521 ((uint64_t) in[5]) * (in2[0] << 0);
522 tmp[6] = ((uint64_t) in[0]) * (in2[6] << 0) +
523 ((uint64_t) in[1]) * (in2[5] << 1) +
524 ((uint64_t) in[2]) * (in2[4] << 0) +
525 ((uint64_t) in[3]) * (in2[3] << 1) +
526 ((uint64_t) in[4]) * (in2[2] << 0) +
527 ((uint64_t) in[5]) * (in2[1] << 1) +
528 ((uint64_t) in[6]) * (in2[0] << 0);
529 tmp[7] = ((uint64_t) in[0]) * (in2[7] << 0) +
530 ((uint64_t) in[1]) * (in2[6] << 0) +
531 ((uint64_t) in[2]) * (in2[5] << 0) +
532 ((uint64_t) in[3]) * (in2[4] << 0) +
533 ((uint64_t) in[4]) * (in2[3] << 0) +
534 ((uint64_t) in[5]) * (in2[2] << 0) +
535 ((uint64_t) in[6]) * (in2[1] << 0) +
536 ((uint64_t) in[7]) * (in2[0] << 0);
537 /* tmp[8] has the greatest value but doesn't overflow. See logic in
538 * felem_square. */
539 tmp[8] = ((uint64_t) in[0]) * (in2[8] << 0) +
540 ((uint64_t) in[1]) * (in2[7] << 1) +
541 ((uint64_t) in[2]) * (in2[6] << 0) +
542 ((uint64_t) in[3]) * (in2[5] << 1) +
543 ((uint64_t) in[4]) * (in2[4] << 0) +
544 ((uint64_t) in[5]) * (in2[3] << 1) +
545 ((uint64_t) in[6]) * (in2[2] << 0) +
546 ((uint64_t) in[7]) * (in2[1] << 1) +
547 ((uint64_t) in[8]) * (in2[0] << 0);
548 tmp[9] = ((uint64_t) in[1]) * (in2[8] << 0) +
549 ((uint64_t) in[2]) * (in2[7] << 0) +
550 ((uint64_t) in[3]) * (in2[6] << 0) +
551 ((uint64_t) in[4]) * (in2[5] << 0) +
552 ((uint64_t) in[5]) * (in2[4] << 0) +
553 ((uint64_t) in[6]) * (in2[3] << 0) +
554 ((uint64_t) in[7]) * (in2[2] << 0) +
555 ((uint64_t) in[8]) * (in2[1] << 0);
556 tmp[10] = ((uint64_t) in[2]) * (in2[8] << 0) +
557 ((uint64_t) in[3]) * (in2[7] << 1) +
558 ((uint64_t) in[4]) * (in2[6] << 0) +
559 ((uint64_t) in[5]) * (in2[5] << 1) +
560 ((uint64_t) in[6]) * (in2[4] << 0) +
561 ((uint64_t) in[7]) * (in2[3] << 1) +
562 ((uint64_t) in[8]) * (in2[2] << 0);
563 tmp[11] = ((uint64_t) in[3]) * (in2[8] << 0) +
564 ((uint64_t) in[4]) * (in2[7] << 0) +
565 ((uint64_t) in[5]) * (in2[6] << 0) +
566 ((uint64_t) in[6]) * (in2[5] << 0) +
567 ((uint64_t) in[7]) * (in2[4] << 0) +
568 ((uint64_t) in[8]) * (in2[3] << 0);
569 tmp[12] = ((uint64_t) in[4]) * (in2[8] << 0) +
570 ((uint64_t) in[5]) * (in2[7] << 1) +
571 ((uint64_t) in[6]) * (in2[6] << 0) +
572 ((uint64_t) in[7]) * (in2[5] << 1) +
573 ((uint64_t) in[8]) * (in2[4] << 0);
574 tmp[13] = ((uint64_t) in[5]) * (in2[8] << 0) +
575 ((uint64_t) in[6]) * (in2[7] << 0) +
576 ((uint64_t) in[7]) * (in2[6] << 0) +
577 ((uint64_t) in[8]) * (in2[5] << 0);
578 tmp[14] = ((uint64_t) in[6]) * (in2[8] << 0) +
579 ((uint64_t) in[7]) * (in2[7] << 1) +
580 ((uint64_t) in[8]) * (in2[6] << 0);
581 tmp[15] = ((uint64_t) in[7]) * (in2[8] << 0) +
582 ((uint64_t) in[8]) * (in2[7] << 0);
583 tmp[16] = ((uint64_t) in[8]) * (in2[8] << 0);
584
585 felem_reduce_degree(out, tmp);
586 }
587
588 static void felem_assign(felem out, const felem in)
589 {
590 memcpy(out, in, sizeof(felem));
591 }
592
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent used is p-2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 (see the
 * final comment below). The addition chain is fixed, so the same sequence
 * of squarings and multiplications runs regardless of the input value. */
static void felem_inv(felem out, const felem in)
{
    felem ftmp, ftmp2;
    /* each e_I will hold |in|^{2^I - 1} */
    felem e2, e4, e8, e16, e32, e64;
    unsigned i;

    felem_square(ftmp, in); /* 2^1 */
    felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
    felem_assign(e2, ftmp);
    felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
    felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
    felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
    felem_assign(e4, ftmp);
    felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
    felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
    felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
    felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
    felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
    felem_assign(e8, ftmp);
    for (i = 0; i < 8; i++) {
        felem_square(ftmp, ftmp);
    } /* 2^16 - 2^8 */
    felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
    felem_assign(e16, ftmp);
    for (i = 0; i < 16; i++) {
        felem_square(ftmp, ftmp);
    } /* 2^32 - 2^16 */
    felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
    felem_assign(e32, ftmp);
    for (i = 0; i < 32; i++) {
        felem_square(ftmp, ftmp);
    } /* 2^64 - 2^32 */
    felem_assign(e64, ftmp);
    felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
    for (i = 0; i < 192; i++) {
        felem_square(ftmp, ftmp);
    } /* 2^256 - 2^224 + 2^192 */

    /* ftmp2 builds the low 2^96 - 3 part of the exponent. */
    felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
    for (i = 0; i < 16; i++) {
        felem_square(ftmp2, ftmp2);
    } /* 2^80 - 2^16 */
    felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
    for (i = 0; i < 8; i++) {
        felem_square(ftmp2, ftmp2);
    } /* 2^88 - 2^8 */
    felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
    for (i = 0; i < 4; i++) {
        felem_square(ftmp2, ftmp2);
    } /* 2^92 - 2^4 */
    felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
    felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
    felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
    felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
    felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
    felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
    felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */

    felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
}
661
662 /* felem_scalar_3 sets out=3*out.
663 *
664 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
665 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
666 static void felem_scalar_3(felem out)
667 {
668 limb carry = 0;
669 unsigned i;
670
671 for (i = 0;; i++) {
672 out[i] *= 3;
673 out[i] += carry;
674 carry = out[i] >> 29;
675 out[i] &= kBottom29Bits;
676
677 i++;
678 if (i == 9)
679 break;
680
681 out[i] *= 3;
682 out[i] += carry;
683 carry = out[i] >> 28;
684 out[i] &= kBottom28Bits;
685 }
686
687 felem_reduce_carry(out, carry);
688 }
689
690 /* felem_scalar_4 sets out=4*out.
691 *
692 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
693 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
694 static void felem_scalar_4(felem out)
695 {
696 limb carry = 0, next_carry;
697 unsigned i;
698
699 for (i = 0;; i++) {
700 next_carry = out[i] >> 27;
701 out[i] <<= 2;
702 out[i] &= kBottom29Bits;
703 out[i] += carry;
704 carry = next_carry + (out[i] >> 29);
705 out[i] &= kBottom29Bits;
706
707 i++;
708 if (i == 9)
709 break;
710 next_carry = out[i] >> 26;
711 out[i] <<= 2;
712 out[i] &= kBottom28Bits;
713 out[i] += carry;
714 carry = next_carry + (out[i] >> 28);
715 out[i] &= kBottom28Bits;
716 }
717
718 felem_reduce_carry(out, carry);
719 }
720
721 /* felem_scalar_8 sets out=8*out.
722 *
723 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
724 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
725 static void felem_scalar_8(felem out)
726 {
727 limb carry = 0, next_carry;
728 unsigned i;
729
730 for (i = 0;; i++) {
731 next_carry = out[i] >> 26;
732 out[i] <<= 3;
733 out[i] &= kBottom29Bits;
734 out[i] += carry;
735 carry = next_carry + (out[i] >> 29);
736 out[i] &= kBottom29Bits;
737
738 i++;
739 if (i == 9)
740 break;
741 next_carry = out[i] >> 25;
742 out[i] <<= 3;
743 out[i] &= kBottom28Bits;
744 out[i] += carry;
745 carry = next_carry + (out[i] >> 28);
746 out[i] &= kBottom28Bits;
747 }
748
749 felem_reduce_carry(out, carry);
750 }
751
752 static char felem_is_zero_vartime(const felem in)
753 {
754 limb carry;
755 int i;
756 limb tmp[NLIMBS];
757 felem_assign(tmp, in);
758
759 /* First, reduce tmp to a minimal form. */
760 do {
761 carry = 0;
762 for (i = 0;; i++) {
763 tmp[i] += carry;
764 carry = tmp[i] >> 29;
765 tmp[i] &= kBottom29Bits;
766
767 i++;
768 if (i == 9)
769 break;
770
771 tmp[i] += carry;
772 carry = tmp[i] >> 28;
773 tmp[i] &= kBottom28Bits;
774 }
775
776 felem_reduce_carry(tmp, carry);
777 } while (carry);
778
779 /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
780 return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
781 memcmp(tmp, kP, sizeof(tmp)) == 0 ||
782 memcmp(tmp, k2P, sizeof(tmp)) == 0;
783 }
784
/* Group operations:
 *
 * Elements of the elliptic curve group are represented in Jacobian
 * coordinates: a Jacobian point (x, y, z) corresponds to the affine point
 * (x/z**2, y/z**3). */
789
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * Formulas: "dbl-2009-l", see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l */
static void point_double(felem x_out, felem y_out, felem z_out,
                         const felem x, const felem y, const felem z)
{
    felem delta, gamma, alpha, beta, tmp, tmp2;

    /* delta = z**2, gamma = y**2, beta = x*gamma. */
    felem_square(delta, z);
    felem_square(gamma, y);
    felem_mul(beta, x, gamma);

    /* alpha = 3*(x - delta)*(x + delta). */
    felem_sum(tmp, x, delta);
    felem_diff(tmp2, x, delta);
    felem_mul(alpha, tmp, tmp2);
    felem_scalar_3(alpha);

    /* z_out = (y + z)**2 - gamma - delta (= 2*y*z). */
    felem_sum(tmp, y, z);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, gamma);
    felem_diff(z_out, tmp, delta);

    /* x_out = alpha**2 - 8*beta (beta is scaled to 4*beta first). */
    felem_scalar_4(beta);
    felem_square(x_out, alpha);
    felem_diff(x_out, x_out, beta);
    felem_diff(x_out, x_out, beta);

    /* y_out = alpha*(4*beta - x_out) - 8*gamma**2. */
    felem_diff(tmp, beta, x_out);
    felem_mul(tmp, alpha, tmp);
    felem_square(tmp2, gamma);
    felem_scalar_8(tmp2);
    felem_diff(y_out, tmp, tmp2);
}
823
/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
 * (i.e. the second point is affine.)
 *
 * Formulas: "add-2007-bl" with z2 = 1, see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly. */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
                            const felem x1, const felem y1, const felem z1,
                            const felem x2, const felem y2)
{
    felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);
    felem_sum(tmp, z1, z1);                /* tmp = 2*z1 */

    felem_mul(u2, x2, z1z1);               /* u2 = x2*z1**2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);             /* s2 = y2*z1**3 */
    felem_diff(h, u2, x1);                 /* h = u2 - x1 (u1 = x1, z2 = 1) */
    felem_sum(i, h, h);
    felem_square(i, i);                    /* i = (2*h)**2 */
    felem_mul(j, h, i);                    /* j = h*i */
    felem_diff(r, s2, y1);
    felem_sum(r, r, r);                    /* r = 2*(s2 - y1) */
    felem_mul(v, x1, i);                   /* v = x1*i */

    felem_mul(z_out, tmp, h);              /* z_out = 2*z1*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);           /* x_out = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, y1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);         /* y_out = r*(v - x_out) - 2*y1*j */
}
863
/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * Formulas: "add-2007-bl", see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly. */
static void point_add(felem x_out, felem y_out, felem z_out,
                      const felem x1, const felem y1, const felem z1,
                      const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);
    felem_square(z2z2, z2);
    felem_mul(u1, x1, z2z2);               /* u1 = x1*z2**2 */

    /* tmp = (z1 + z2)**2 - z1**2 - z2**2 = 2*z1*z2 */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);
    felem_mul(s1, y1, z2z2z2);             /* s1 = y1*z2**3 */

    felem_mul(u2, x2, z1z1);               /* u2 = x2*z1**2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);             /* s2 = y2*z1**3 */
    felem_diff(h, u2, u1);                 /* h = u2 - u1 */
    felem_sum(i, h, h);
    felem_square(i, i);                    /* i = (2*h)**2 */
    felem_mul(j, h, i);                    /* j = h*i */
    felem_diff(r, s2, s1);
    felem_sum(r, r, r);                    /* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);                   /* v = u1*i */

    felem_mul(z_out, tmp, h);              /* z_out = 2*z1*z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);           /* x_out = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);         /* y_out = r*(v - x_out) - 2*s1*j */
}
911
/* point_add_or_double_vartime sets
 * {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * Formulas: "add-2007-bl", see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * This function handles the case where {x1,y1,z1}={x2,y2,z2} by falling back
 * to point_double. It runs in variable time (it branches on whether the two
 * points are equal) and so must only be used with public inputs. */
static void point_add_or_double_vartime(
    felem x_out, felem y_out, felem z_out,
    const felem x1, const felem y1, const felem z1,
    const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
    char x_equal, y_equal;

    felem_square(z1z1, z1);
    felem_square(z2z2, z2);
    felem_mul(u1, x1, z2z2);               /* u1 = x1*z2**2 */

    /* tmp = (z1 + z2)**2 - z1**2 - z2**2 = 2*z1*z2 */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);
    felem_mul(s1, y1, z2z2z2);             /* s1 = y1*z2**3 */

    felem_mul(u2, x2, z1z1);               /* u2 = x2*z1**2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);             /* s2 = y2*z1**3 */
    felem_diff(h, u2, u1);
    /* h == 0 iff both points have the same (affine) x coordinate. */
    x_equal = felem_is_zero_vartime(h);
    felem_sum(i, h, h);
    felem_square(i, i);
    felem_mul(j, h, i);
    felem_diff(r, s2, s1);
    /* At this point r == 0 iff both points have the same y coordinate. */
    y_equal = felem_is_zero_vartime(r);
    if (x_equal && y_equal) {
        /* Same point: the addition formulas degenerate (h == 0 would zero
         * z_out), so double instead. */
        point_double(x_out, y_out, z_out, x1, y1, z1);
        return;
    }
    felem_sum(r, r, r);                    /* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);                   /* v = u1*i */

    felem_mul(z_out, tmp, h);              /* z_out = 2*z1*z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);           /* x_out = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);         /* y_out = r*(v - x_out) - 2*s1*j */
}
966
967 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.
968 *
969 * On entry: mask is either 0 or 0xffffffff. */
970 static void copy_conditional(felem out, const felem in, limb mask)
971 {
972 int i;
973
974 for (i = 0; i < 9; i++) {
975 const limb tmp = mask & (in[i] ^ out[i]);
976 out[i] ^= tmp;
977 }
978 }
979
980 /* select_affine_point sets {out_x,out_y} to the index'th entry of table.
981 * On entry: index < 16, table[0] must be zero. */
982 static void select_affine_point(felem out_x, felem out_y,
983 const limb *table, limb index)
984 {
985 limb i, j;
986
987 memset(out_x, 0, sizeof(felem));
988 memset(out_y, 0, sizeof(felem));
989
990 for (i = 1; i < 16; i++) {
991 limb mask = i ^ index;
992 mask |= mask >> 2;
993 mask |= mask >> 1;
994 mask &= 1;
995 mask--;
996 for (j = 0; j < NLIMBS; j++) {
997 out_x[j] |= (*table++) & mask;
998 }
999 for (j = 0; j < NLIMBS; j++) {
1000 out_y[j] |= (*table++) & mask;
1001 }
1002 }
1003 }
1004
1005 /* select_affine_point sets {out_x,out_y,out_z} to the index'th entry of table.
1006 * On entry: index < 16, table[0] must be zero. */
1007 static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
1008 const limb *table, limb index)
1009 {
1010 limb i, j;
1011
1012 memset(out_x, 0, sizeof(felem));
1013 memset(out_y, 0, sizeof(felem));
1014 memset(out_z, 0, sizeof(felem));
1015
1016 table += 3*NLIMBS;
1017
1018 for (i = 1; i < 16; i++) {
1019 limb mask = i ^ index;
1020 mask |= mask >> 2;
1021 mask |= mask >> 1;
1022 mask &= 1;
1023 mask--;
1024 for (j = 0; j < NLIMBS; j++) {
1025 out_x[j] |= (*table++) & mask;
1026 }
1027 for (j = 0; j < NLIMBS; j++) {
1028 out_y[j] |= (*table++) & mask;
1029 }
1030 for (j = 0; j < NLIMBS; j++) {
1031 out_z[j] |= (*table++) & mask;
1032 }
1033 }
1034 }
1035
1036 /* get_bit returns the bit'th bit of scalar. */
1037 static char get_bit(const u8 scalar[32], int bit)
1038 {
1039 return ((scalar[bit >> 3]) >> (bit & 7)) & 1;
1040 }
1041
1042 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
1043 * number. Note that the value of scalar must be less than the order of the
1044 * group. */
1045 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32])
1046 {
1047 int i, j;
1048 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
1049 u32 table_offset;
1050
1051 felem px, py;
1052 felem tx, ty, tz;
1053
1054 memset(nx, 0, sizeof(felem));
1055 memset(ny, 0, sizeof(felem));
1056 memset(nz, 0, sizeof(felem));
1057
1058 /* The loop adds bits at positions 0, 64, 128 and 192, followed by
1059 * positions 32,96,160 and 224 and does this 32 times. */
1060 for (i = 0; i < 32; i++) {
1061 if (i) {
1062 point_double(nx, ny, nz, nx, ny, nz);
1063 }
1064 for (j = 0; j <= 32; j += 32) {
1065 char bit0 = get_bit(scalar, 31 - i + j);
1066 char bit1 = get_bit(scalar, 95 - i + j);
1067 char bit2 = get_bit(scalar, 159 - i + j);
1068 char bit3 = get_bit(scalar, 223 - i + j);
1069 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);
1070
1071 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS);
1072 select_affine_point(px, py, kPrecomputed + table_offset, index);
1073
1074 /* Since scalar is less than the order of the group, we know that
1075 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
1076 * below. */
1077 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
1078 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
1079 * (a.k.a. the point at infinity). We handle that situation by
1080 * copying the point from the table. */
1081 copy_conditional(nx, px, n_is_infinity_mask);
1082 copy_conditional(ny, py, n_is_infinity_mask);
1083 copy_conditional(nz, kOne, n_is_infinity_mask);
1084
1085 /* Equally, the result is also wrong if the point from the table is
1086 * zero, which happens when the index is zero. We handle that by
1087 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */
1088 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
1089 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1090 copy_conditional(nx, tx, mask);
1091 copy_conditional(ny, ty, mask);
1092 copy_conditional(nz, tz, mask);
1093 /* If p was not zero, then n is now non-zero. */
1094 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1095 }
1096 }
1097 }
1098
1099 /* point_to_affine converts a Jacobian point to an affine point. If the input
1100 * is the point at infinity then it returns (0, 0) in constant time. */
1101 static void point_to_affine(felem x_out, felem y_out,
1102 const felem nx, const felem ny, const felem nz) {
1103 felem z_inv, z_inv_sq;
1104 felem_inv(z_inv, nz);
1105 felem_square(z_inv_sq, z_inv);
1106 felem_mul(x_out, nx, z_inv_sq);
1107 felem_mul(z_inv, z_inv, z_inv_sq);
1108 felem_mul(y_out, ny, z_inv);
1109 }
1110
/* scalar_mult sets {nx,ny,nz} = scalar*{x,y}, where scalar is a little-endian
 * number. (The original comment named this function scalar_base_mult, which
 * is a different function above.) */
static void scalar_mult(felem nx, felem ny, felem nz,
                        const felem x, const felem y, const u8 scalar[32])
{
    int i;
    felem px, py, pz, tx, ty, tz;
    felem precomp[16][3];
    limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;

    /* We precompute 0,1,2,... times {x,y}. Only precomp[0] (one entry of
     * three felems) needs zeroing here: entries 2..15 are written by the
     * loop below. */
    memset(precomp, 0, sizeof(felem) * 3);
    memcpy(&precomp[1][0], x, sizeof(felem));
    memcpy(&precomp[1][1], y, sizeof(felem));
    memcpy(&precomp[1][2], kOne, sizeof(felem));

    for (i = 2; i < 16; i += 2) {
        /* precomp[i] = 2*precomp[i/2]; precomp[i+1] = precomp[i] + {x,y}. */
        point_double(precomp[i][0], precomp[i][1], precomp[i][2],
                     precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);

        point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
                        precomp[i][0], precomp[i][1], precomp[i][2], x, y);
    }

    memset(nx, 0, sizeof(felem));
    memset(ny, 0, sizeof(felem));
    memset(nz, 0, sizeof(felem));
    n_is_infinity_mask = -1;

    /* We add in a window of four bits each iteration and do this 64 times. */
    for (i = 0; i < 64; i++) {
        if (i) {
            point_double(nx, ny, nz, nx, ny, nz);
            point_double(nx, ny, nz, nx, ny, nz);
            point_double(nx, ny, nz, nx, ny, nz);
            point_double(nx, ny, nz, nx, ny, nz);
        }

        /* index is the next four-bit window of scalar, most significant
         * window first. */
        index = scalar[31 - i / 2];
        if ((i & 1) == 1) {
            index &= 15;
        } else {
            index >>= 4;
        }

        /* See the comments in scalar_base_mult about handling infinities. */
        select_jacobian_point(px, py, pz, (limb *) precomp, index);
        point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
        copy_conditional(nx, px, n_is_infinity_mask);
        copy_conditional(ny, py, n_is_infinity_mask);
        copy_conditional(nz, pz, n_is_infinity_mask);

        /* All ones iff index != 0. NOTE(review): relies on an arithmetic
         * right shift of a negative s32, which is implementation-defined;
         * consider NON_ZERO_TO_ALL_ONES (as used in scalar_base_mult) for
         * consistency. */
        p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31;
        mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
        copy_conditional(nx, tx, mask);
        copy_conditional(ny, ty, mask);
        copy_conditional(nz, tz, mask);
        n_is_infinity_mask &= ~p_is_noninfinite_mask;
    }
}
1170
1171
/* Interface with Freebl: */

/* The conversion routines below memcpy() raw bytes directly into and out of
 * mp_int digit arrays, which is only correct on a little-endian machine.
 * NOTE(review): <endian.h>/__BYTE_ORDER are glibc-specific; consider NSPR's
 * endianness macros for portability. */
#if __BYTE_ORDER != __LITTLE_ENDIAN
#error "This code needs a little-endian processor"
#endif

/* kRInv = R**-1 mod p, where R = 2**257 is the shift applied by
 * montgomery() below, stored as little-endian 32-bit words. */
static const u32 kRInvDigits[8] = {
    0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff};
#define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit))
static const mp_int kRInv = {
    MP_ZPOS,
    MP_DIGITS_IN_256_BITS,
    MP_DIGITS_IN_256_BITS,
    /* Because we are running on a little-endian processor, this cast works for
     * both 32 and 64-bit processors. */
    (mp_digit*) kRInvDigits
};

/* Powers of two used when converting between the alternating 29/28-bit limb
 * representation and mp_ints. */
static const limb kTwo28 = 0x10000000;
static const limb kTwo29 = 0x20000000;
1191
1192 /* montgomery sets out = R*in. */
1193 static mp_err montgomery(felem out, const mp_int *in, const ECGroup *group) {
1194 /* There are no MPI functions for bitshift operations and we wish to shift i n
1195 * 257 bits left so we multiply by two and then move the digits 256-bits
1196 * left. */
1197 mp_int in_shifted;
1198 int i;
1199 mp_err res;
1200
1201 mp_init(&in_shifted);
1202 s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS);
1203 memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS),
1204 MP_DIGITS(in),
1205 MP_USED(in)*sizeof(mp_digit));
1206 mp_mul_2(&in_shifted, &in_shifted);
1207 MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth));
1208
1209 for (i = 0; i < NLIMBS; i++) {
1210 if ((i & 1) == 0) {
1211 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits;
1212 mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL);
1213 } else {
1214 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits;
1215 mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL);
1216 }
1217 }
1218
1219 mp_clear(&in_shifted);
1220
1221 CLEANUP:
1222 return res;
1223 }
1224
1225 /* inverse_montgomery sets out=in/R. */
1226 static mp_err inverse_montgomery(mp_int *out, const felem in,
1227 const ECGroup *group) {
1228 mp_int result, tmp;
1229 mp_err res;
1230 int i;
1231
1232 mp_init(&result);
1233 mp_init(&tmp);
1234
1235 MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result));
1236 for (i = NLIMBS-2; i >= 0; i--) {
1237 if ((i & 1) == 0) {
1238 MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp));
1239 } else {
1240 MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp));
1241 }
1242 MP_CHECKOK(mp_add_d(&tmp, in[i], &result));
1243 }
1244
1245 MP_CHECKOK(mp_mul(&result, &kRInv, out));
1246 MP_CHECKOK(group->meth->field_mod(out, out, group->meth));
1247
1248 mp_clear(&result);
1249 mp_clear(&tmp);
1250
1251 CLEANUP:
1252 return res;
1253 }
1254
1255 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */
1256 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n) {
1257 /* We require that |n| is less than the order of the group and therefore it
1258 * will fit into |scalar|. However, these is a timing side-channel here that
1259 * we cannot avoid: if |n| is sufficiently small it may be one or more words
1260 * too short and we'll copy less data. */
1261 memset(out_scalar, 0, 32);
1262 memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit));
1263 }
1264
1265 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1266 * order of the group. */
1267 mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n,
1268 mp_int *out_x, mp_int *out_y,
1269 const ECGroup *group) {
1270 u8 scalar[32];
1271 felem x, y, z, x_affine, y_affine;
1272 mp_err res;
1273
1274 scalar_from_mp_int(scalar, n);
1275 scalar_base_mult(x, y, z, scalar);
1276 point_to_affine(x_affine, y_affine, x, y, z);
1277 MP_CHECKOK(inverse_montgomery(out_x, x_affine, group));
1278 MP_CHECKOK(inverse_montgomery(out_y, y_affine, group));
1279
1280 CLEANUP:
1281 return res;
1282 }
1283
1284 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
1285 * the order of the group. */
1286 mp_err ec_GFp_nistp256_point_mul(const mp_int *n,
1287 const mp_int *in_x, const mp_int *in_y,
1288 mp_int *out_x, mp_int *out_y,
1289 const ECGroup *group) {
1290 u8 scalar[32];
1291 felem x, y, z, x_affine, y_affine, px, py;
1292 mp_err res;
1293
1294 scalar_from_mp_int(scalar, n);
1295
1296 MP_CHECKOK(montgomery(px, in_x, group));
1297 MP_CHECKOK(montgomery(py, in_y, group));
1298
1299 scalar_mult(x, y, z, px, py, scalar);
1300 point_to_affine(x_affine, y_affine, x, y, z);
1301 MP_CHECKOK(inverse_montgomery(out_x, x_affine, group));
1302 MP_CHECKOK(inverse_montgomery(out_y, y_affine, group));
1303
1304 CLEANUP:
1305 return res;
1306 }
1307
1308 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where
1309 * n1 and n2 are < the order of the group.
1310 *
1311 * As indicated by the name, this function operates in variable time. This
1312 * is safe because it's used for signature validation which doesn't deal
1313 * with secrets. */
1314 mp_err ec_GFp_nistp256_points_mul_vartime
1315 (const mp_int *n1, const mp_int *n2,
1316 const mp_int *in_x, const mp_int *in_y,
1317 mp_int *out_x, mp_int *out_y,
1318 const ECGroup *group) {
1319 u8 scalar1[32], scalar2[32];
1320 felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py;
1321 mp_err res = MP_OKAY;
1322
1323 /* If both scalars are zero, then the result is the point at infinity. */
1324 if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) {
1325 mp_zero(out_x);
1326 mp_zero(out_y);
1327 return res;
1328 }
1329
1330 scalar_from_mp_int(scalar1, n1);
1331 scalar_from_mp_int(scalar2, n2);
1332
1333 MP_CHECKOK(montgomery(px, in_x, group));
1334 MP_CHECKOK(montgomery(py, in_y, group));
1335 scalar_base_mult(x1, y1, z1, scalar1);
1336 scalar_mult(x2, y2, z2, px, py, scalar2);
1337
1338 if (mp_cmp_z(n2) == 0) {
1339 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
1340 * {x1,y1,z1}. */
1341 } else if (mp_cmp_z(n1) == 0) {
1342 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
1343 * {x2,y2,z2}. */
1344 memcpy(x1, x2, sizeof(x2));
1345 memcpy(y1, y2, sizeof(y2));
1346 memcpy(z1, z2, sizeof(z2));
1347 } else {
1348 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
1349 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
1350 }
1351
1352 point_to_affine(x_affine, y_affine, x1, y1, z1);
1353 MP_CHECKOK(inverse_montgomery(out_x, x_affine, group));
1354 MP_CHECKOK(inverse_montgomery(out_y, y_affine, group));
1355
1356 CLEANUP:
1357 return res;
1358 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698