OLD | NEW |
---|---|
(Empty) | |
1 #include "prcpucfg.h" | |
wtc
2012/11/15 22:58:41
1. Please copy the MPL 2 header from another file.
agl
2012/11/16 19:17:46
Done.
| |
2 | |
3 #include <stdint.h> | |
4 #include <stdio.h> | |
5 | |
6 #include <string.h> | |
7 #include <stdlib.h> | |
8 | |
9 #include "mpi.h" | |
10 #include "mpi-priv.h" | |
11 #include "ecp.h" | |
12 | |
13 typedef uint8_t u8; | |
14 typedef uint32_t u32; | |
15 typedef int32_t s32; | |
16 typedef uint64_t u64; | |
17 | |
/* Our field elements are represented as nine, unsigned 32-bit words. Freebl's
 * MPI library calls them digits, but here they are called limbs, which is
 * GMP's terminology.
 *
 * The value of an felem (field element) is:
 *   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
 *
 * That is, each limb is alternately 29 or 28-bits wide in little-endian
 * order.
 *
 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
 * when multiplying as terms end up one bit short of a limb which would require
 * much bit-shifting to correct.
 *
 * Finally, the values stored in an felem are in Montgomery form. So the value
 * |y| is stored as (y * 2**257) mod p, where p is the P-256 prime.
 */
typedef uint32_t limb;
#define NLIMBS 9
typedef limb felem[NLIMBS];

static const limb kBottom28Bits = 0xfffffff;
static const limb kBottom29Bits = 0x1fffffff;

/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
 * 28-bit words. */
static const felem kOne = {
    2, 0, 0, 0xffff800,
    0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
    0
};
/* kZero is the number 0 as an felem. */
static const felem kZero = {0};
/* kP is the P-256 prime as an felem. */
static const felem kP = {
    0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
    0, 0, 0x200000, 0xf000000,
    0xfffffff
};
/* k2P is twice the P-256 prime as an felem. */
static const felem k2P = {
    0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
    0, 0, 0x400000, 0xe000000,
    0x1fffffff
};
61 /* kPrecomputed contains precomputed values to aid the calculation of scalar | |
62 * multiples of the base point, G. It's actually two, equal length, tables | |
63 * concatenated. | |
64 * | |
65 * The first table contains (x,y) felem pairs for 16 multiples of the base | |
66 * point, G. | |
67 * | |
68 * Index | Index (binary) | Value | |
69 * 0 | 0000 | 0G (all zeros, omitted) | |
70 * 1 | 0001 | G | |
71 * 2 | 0010 | 2**64G | |
72 * 3 | 0011 | 2**64G + G | |
73 * 4 | 0100 | 2**128G | |
74 * 5 | 0101 | 2**128G + G | |
75 * 6 | 0110 | 2**128G + 2**64G | |
76 * 7 | 0111 | 2**128G + 2**64G + G | |
77 * 8 | 1000 | 2**192G | |
78 * 9 | 1001 | 2**192G + G | |
79 * 10 | 1010 | 2**192G + 2**64G | |
80 * 11 | 1011 | 2**192G + 2**64G + G | |
81 * 12 | 1100 | 2**192G + 2**128G | |
82 * 13 | 1101 | 2**192G + 2**128G + G | |
83 * 14 | 1110 | 2**192G + 2**128G + 2**64G | |
84 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G | |
85 * | |
86 * The second table follows the same style, but the multiples are 2**32G, | |
wtc
2012/11/15 22:58:41
Nit: 2**32G, 2**96G, etc. probably should be calle
agl
2012/11/16 19:17:46
Done.
| |
87 * 2**96G, 2**160G, 2**224G. | |
88 * | |
89 * This is ~2KB of data. */ | |
wtc
2012/11/15 22:58:41
Nit: put the closing */ on its own line.
If you m
agl
2012/11/16 19:17:46
Done for all multi-line comments in the file.
| |
90 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = { | |
91 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7e dc, 0xd4a6eab, 0x3120bee, | |
92 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba 21, 0x14b10bb, 0xae3fe3, | |
93 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe4907 3, 0x3fa36cc, 0x5ebcd2c, | |
94 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea1244 6, 0xe1ade1e, 0xec91f22, | |
95 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109 , 0xa267a00, 0xb57c050, | |
96 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0 x7d6dee7, 0x2976e4b, | |
97 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a 5a9, 0x843a649, 0xc3ab0fa, | |
98 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11 , 0x58c43df, 0xf423fc2, | |
99 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db4 0f, 0x83e277d, 0xb0dd609, | |
100 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5 , 0xe10c9e, 0x33ab581, | |
101 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f , 0x48764cd, 0x76dbcca, | |
102 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b2 0, 0x4ba3173, 0xc168c33, | |
103 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0 , 0x65dd7ff, 0x3a1e4f6, | |
104 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f07 7, 0xa6add89, 0x4894acd, | |
105 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0, | |
106 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c , 0xda0cf5b, 0x812e881, | |
107 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51 , 0xc22be3e, 0xe35e65a, | |
108 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9 , 0x1c5a839, 0x47a1e26, | |
109 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c5 02, 0x2f32042, 0xa17769b, | |
110 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a 02, 0x3fc93, 0x5620023, | |
111 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c , 0x407f75c, 0xbaab133, | |
112 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea 7, 0x3293ac0, 0xcdc98aa, | |
113 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29, | |
114 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72 , 0x73e1c35, 0xee70fbc, | |
115 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85 , 0x27de188, 0x66f70b8, | |
116 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae 914, 0x2f3ec51, 0x3826b59, | |
117 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x 823d9d2, 0x8213f39, | |
118 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4 a, 0xf5ddc3d, 0x3786689, | |
119 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a72 9, 0x4be3499, 0x52b23aa, | |
120 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb0480 35, 0xe31de66, 0xc6ecaa3, | |
121 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a752 9, 0xcb7beb1, 0xb2a78a1, | |
122 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff6 58, 0xe3d6511, 0xc7d76f, | |
123 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72, | |
124 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d3241 1, 0xb04a838, 0xd760d2d, | |
125 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11 e, 0x20bca9a, 0x66f496b, | |
126 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a, | |
127 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56 ff, 0x65ef930, 0x21dc4a, | |
128 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15 f, 0x624e62e, 0xa90ae2f, | |
129 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522 b, 0xdc78583, 0x40eeabb, | |
130 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef3 4, 0xae2a960, 0x91b8bdc, | |
131 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0 x2413c8e, 0x5425bf9, | |
132 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633 , 0x7c91952, 0xd806dce, | |
133 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef7 3, 0x8956f34, 0xe4b5cf2, | |
134 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7 , 0x627b614, 0x7371cca, | |
135 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc 9, 0x9c19bf2, 0x5882229, | |
136 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b 3, 0xe85ff25, 0x408ef57, | |
137 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113 , 0xa4a1769, 0x11fbc6c, | |
138 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b 7, 0x4acbad9, 0x5efc5fa, | |
139 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc , 0x7bf0fa9, 0x957651, | |
140 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec, | |
141 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c1 2d, 0xf20bd46, 0x1951fa7, | |
142 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74 , 0x99bb618, 0x2db944c, | |
143 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e7477 9, 0x576138, 0x9587927, | |
144 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782 d, 0xfc72e0b, 0x701b298, | |
145 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5 d8, 0xf858d3a, 0x942eea8, | |
146 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a 1, 0x8395659, 0x52ed4e2, | |
147 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c 0, 0x6bdf55a, 0x4e4457d, | |
148 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747 b, 0x878558d, 0x7d29aa4, | |
149 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8, | |
150 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78, | |
151 }; | |
152 | |
wtc
2012/11/15 22:58:41
Nit: delete one blank line.
agl
2012/11/16 19:17:46
Done.
| |
153 | |
/* Field element operations: */

/* NON_ZERO_TO_ALL_ONES returns:
 *   0xffffffff for 0 < x <= 2**31
 *   0 for x == 0 or x > 2**31.
 *
 * x must be an unsigned 32-bit value. The argument is parenthesised in the
 * expansion so that compound expressions (e.g. a|b, or a ternary) are not
 * misparsed by the subtraction.
 *
 * This macro assumes that right-shifting a signed number shifts in the MSB on
 * the left. This is not ensured by the C standard, but is true on the CPUs
 * that we're targetting with this code (x86 and ARM). */
#define NON_ZERO_TO_ALL_ONES(x) (~((u32) (((s32) ((x)-1)) >> 31)))
164 | |
165 /* felem_reduce_degree adds a multiple of p in order to cancel |carry|, | |
wtc
2012/11/15 22:58:41
Typo: felem_reduce_degree => felem_reduce_carry
agl
2012/11/16 19:17:46
Done.
| |
166 * which is a term at 2**257. | |
167 * | |
168 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28. | |
wtc
2012/11/15 22:58:41
Question: The above says |carry| is a term at 2**2
agl
2012/11/16 19:17:46
|carry| is the coefficient of the term at 2**257,
| |
169 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */ | |
170 static void felem_reduce_carry(felem inout, limb carry) | |
171 { | |
172 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry); | |
173 | |
174 inout[0] += carry << 1; | |
175 inout[3] += 0x10000000 & carry_mask; | |
176 // carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the | |
177 // previous line therefore this doesn't underflow. | |
178 inout[3] -= carry << 11; | |
179 inout[4] += (0x20000000 - 1) & carry_mask; | |
180 inout[5] += (0x10000000 - 1) & carry_mask; | |
181 inout[6] += (0x20000000 - 1) & carry_mask; | |
182 inout[6] -= carry << 22; | |
183 // This may underflow if carry is non-zero but, if so, we'll fix it in the | |
184 // next line. | |
wtc
2012/11/15 22:58:41
Why don't we add and then subtract? Will that avoi
agl
2012/11/16 19:17:46
Since we're using two's complement, adding then su
| |
185 inout[7] -= 1 & carry_mask; | |
186 inout[7] += carry << 25; | |
187 } | |
188 | |
189 /* felem_sum sets out = in+in2. | |
190 * | |
191 * On entry, in[i]+in2[i] must not overflow a 32-bit word. | |
192 * On exit: in[0,2,...] < 2**30, in2[1,3,...] < 2**29 */ | |
wtc
2012/11/15 22:58:41
Are "in" and "in2" typos? I think you meant "out".
agl
2012/11/16 19:17:46
Done.
| |
193 static void felem_sum(felem out, const felem in, const felem in2) | |
194 { | |
195 limb carry = 0; | |
196 unsigned i; | |
wtc
2012/11/15 22:58:41
Nit: change "unsigned" to "unsigned int" throughou
agl
2012/11/16 19:17:46
Done.
| |
197 for (i = 0; i < NLIMBS; i++) { | |
198 out[i] = in[i] + in2[i]; | |
199 out[i] += carry; | |
200 carry = out[i] >> 29; | |
201 out[i] &= kBottom29Bits; | |
202 | |
203 i++; | |
204 if (i == NLIMBS) | |
205 break; | |
206 | |
207 out[i] = in[i] + in2[i]; | |
208 out[i] += carry; | |
209 carry = out[i] >> 28; | |
210 out[i] &= kBottom28Bits; | |
211 } | |
212 | |
213 felem_reduce_carry(out, carry); | |
214 } | |
215 | |
216 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3) | |
217 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2) | |
218 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2) | |
219 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2) | |
220 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2) | |
221 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2) | |
222 | |
223 /* zero31 is 0 mod p. */ | |
224 static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, tw o30m2, two31p24m2, two30m27m2, two31m2 }; | |
wtc
2012/11/15 22:58:41
Please fold this long line.
agl
2012/11/16 19:17:46
Done.
| |
225 | |
226 /* felem_diff sets out = in-in2. | |
227 * | |
228 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and | |
229 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29. | |
230 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ | |
231 static void felem_diff(felem out, const felem in, const felem in2) | |
232 { | |
233 limb carry = 0; | |
234 unsigned i; | |
235 | |
236 for (i = 0; i < NLIMBS; i++) { | |
237 out[i] = in[i] - in2[i]; | |
238 out[i] += zero31[i]; | |
239 out[i] += carry; | |
240 carry = out[i] >> 29; | |
241 out[i] &= kBottom29Bits; | |
242 | |
243 i++; | |
244 if (i == NLIMBS) | |
245 break; | |
246 | |
247 out[i] = in[i] - in2[i]; | |
248 out[i] += zero31[i]; | |
249 out[i] += carry; | |
250 carry = out[i] >> 28; | |
251 out[i] &= kBottom28Bits; | |
252 } | |
253 | |
254 felem_reduce_carry(out, carry); | |
255 } | |
256 | |
257 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words | |
wtc
2012/11/15 22:58:41
Please add "where R = 2**257", or define R earlier
agl
2012/11/16 19:17:46
Done.
| |
258 * with the same 29,28,... bit pattern as an felem. | |
259 * | |
260 * On entry: tmp[i] < 2**64 | |
wtc
2012/11/15 22:58:41
You said tmp contains the 29, 28 bit pattern. Did
agl
2012/11/16 19:17:46
I've changed it to say "bit positions". The values
| |
261 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */ | |
262 static void felem_reduce_degree(felem out, u64 tmp[17]) | |
263 { | |
264 /* The following table may be helpful when reading this code: | |
265 * | |
266 * Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10... | |
267 * Width (bits): 29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29 | |
268 * Start bit: 0 | 29| 57| 86|114|143|171|200|228|257|285 | |
269 * (odd word): 0 | 28| 57| 85|114|142|142|171|199|228|256 | |
wtc
2012/11/15 22:58:41
Nit: should "(odd word):" be left aligned?
agl
2012/11/16 19:17:46
It's right aligned because the row contains the st
| |
270 */ | |
271 limb tmp2[18], carry, x, xMask; | |
272 unsigned i; | |
273 | |
274 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an | |
275 * felem. So the top of an element of tmp might overlap with another | |
276 * element two positions down. The following loop eliminates this | |
277 * overlap. */ | |
278 tmp2[0] = tmp[0] & kBottom29Bits; | |
279 | |
280 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try | |
281 * and hint to the compiler that it can do a single-word shift by selecting | |
282 * the right register rather than doing a double-word shift and truncating | |
283 * afterwards. */ | |
284 tmp2[1] = ((limb) tmp[0]) >> 29; | |
285 tmp2[1] |= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits; | |
286 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits; | |
287 carry = tmp2[1] >> 28; | |
288 tmp2[1] &= kBottom28Bits; | |
289 | |
290 for (i = 2; i < 17; i++) { | |
291 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25; | |
292 tmp2[i] += ((limb) (tmp[i - 1])) >> 28; | |
293 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits; | |
294 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits; | |
295 tmp2[i] += carry; | |
296 carry = tmp2[i] >> 29; | |
297 tmp2[i] &= kBottom29Bits; | |
298 | |
299 i++; | |
300 if (i == 17) | |
301 break; | |
302 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25; | |
303 tmp2[i] += ((limb) (tmp[i - 1])) >> 29; | |
304 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits; | |
305 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits; | |
306 tmp2[i] += carry; | |
307 carry = tmp2[i] >> 28; | |
308 tmp2[i] &= kBottom28Bits; | |
309 } | |
310 | |
311 tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25; | |
312 tmp2[17] += ((limb) (tmp[16])) >> 29; | |
313 tmp2[17] += (((limb) (tmp[16] >> 32)) << 3); | |
314 tmp2[17] += carry; | |
315 | |
316 /* Montgomery elimination of terms. | |
317 * | |
318 * The values in felems are in Montgomery form: x*R mod p where R = 2**257. | |
319 * Since we just multiplied two Montgomery values together, the result is | |
320 * x*y*R*R mod p. We wish to divide by R in order for the result also to be | |
321 * in Montgomery form. | |
wtc
2012/11/15 22:58:41
This paragraph should be moved to the comment bloc
agl
2012/11/16 19:17:46
Done.
| |
322 * | |
323 * Since R is 2**257, we can do that with a bitwise shift if we can ensure | |
324 * that the right-most 257 bits are all zero. We can make that true by | |
325 * adding multiplies of p without affecting the value. | |
326 * | |
327 * So we eliminate limbs from right to left. Since the bottom 29 bits of p | |
328 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0. | |
329 * We can do that for 8 further limbs and then right shift to eliminate the | |
330 * extra factor of R. */ | |
331 for (i = 0; i < 9; i += 2) { | |
wtc
2012/11/16 05:28:53
9 => NLIMBS ?
agl
2012/11/16 19:17:46
Done.
| |
332 tmp2[i + 1] += tmp2[i] >> 29; | |
333 x = tmp2[i] & kBottom29Bits; | |
334 tmp2[i] = 0; | |
335 | |
336 /* The bounds calculations for this loop are tricky. Each iteration of | |
337 * the loop eliminates two words by adding values to words to their | |
338 * right. | |
339 * | |
340 * The following table contains the amounts added to each word (as an | |
341 * offset from the value of i at the top of the loop). The amounts are | |
342 * accounted for from the first and second half of the loop separately | |
343 * and are written as, for example, 28 to mean a value <2**28. | |
344 * | |
345 * Word: 3 4 5 6 7 8 9 10 | |
346 * Added in top half: 28 11 29 21 29 28 | |
347 * 28 29 | |
348 * 29 | |
349 * Added in bottom half: 29 10 28 21 28 28 | |
350 * 29 | |
351 * | |
352 * The value that is current offset 7 will be offset 5 for the next | |
353 * iteration and then offset 3 for the iteration after that. Therefore | |
354 * the total value added will be the values added at 7, 5 and 3. | |
355 * | |
356 * The following table accumulates these values. The sums at the bottom | |
357 * are written as, for example, 29+28, to mean a value < 2**29+2**28. | |
358 * | |
359 * Word: 3 4 5 6 7 8 9 10 11 12 13 | |
360 * 28 11 10 29 21 29 28 28 28 28 28 | |
361 * 29 28 11 28 29 28 29 28 29 28 | |
362 * 29 28 21 21 29 21 29 21 | |
363 * 10 29 28 21 28 21 28 | |
364 * 28 29 28 29 28 29 28 | |
365 * 11 10 29 10 29 10 | |
366 * 29 28 11 28 11 | |
367 * 29 29 | |
368 * -------------------------------------------- | |
369 * 30+ 31+ 30+ 31+ 30+ | |
370 * 28+ 29+ 28+ 29+ 21+ | |
371 * 21+ 28+ 21+ 28+ 10 | |
372 * 10 21+ 10 21+ | |
373 * 11 11 | |
374 * | |
375 * So the greatest amount is added to tmp2[10/12]. If tmp2[10/12] has an | |
wtc
2012/11/15 22:58:41
What does tmp2[10/12] mean? tmp2[10] and tmp2[12]?
agl
2012/11/16 19:17:46
Yes. Have expanded that in the comment.
| |
376 * initial value of <2**29, then the maximum value will be < 2**31 + | |
377 * 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32, as required. */ | |
378 tmp2[i + 3] += (x << 10) & kBottom28Bits; | |
379 tmp2[i + 4] += (x >> 18); | |
380 | |
381 tmp2[i + 6] += (x << 21) & kBottom29Bits; | |
382 tmp2[i + 7] += x >> 8; | |
383 | |
384 /* At position 200, which is the starting bit position for word 7, we | |
385 * have a factor of 0xf000000 = 2**28 - 2**24. */ | |
386 tmp2[i + 7] += 0x10000000; | |
387 tmp2[i + 8] += x - 1; /* Word 7 is 28 bits wide, so the 2**28 term | |
388 exactly hits word 8. */ | |
wtc
2012/11/15 22:58:41
Nit: add '*' to the left of this comment line.
agl
2012/11/16 19:17:46
Done.
| |
389 tmp2[i + 7] -= (x << 24) & kBottom28Bits; | |
390 tmp2[i + 8] -= x >> 4; | |
391 | |
392 tmp2[i + 8] += 0x20000000; | |
393 tmp2[i + 8] -= x; | |
394 tmp2[i + 8] += (x << 28) & kBottom29Bits; | |
395 tmp2[i + 9] += (x >> 1) - 1; | |
396 | |
397 if (i+1 == NLIMBS) | |
398 break; | |
wtc
2012/11/16 05:28:53
I discovered that this used to be
i++
if (
agl
2012/11/16 19:17:46
ekasper asked for it.
| |
399 tmp2[i + 2] += tmp2[i + 1] >> 28; | |
400 x = tmp2[i + 1] & kBottom28Bits; | |
401 tmp2[i + 1] = 0; | |
402 | |
403 tmp2[i + 4] += (x << 11) & kBottom29Bits; | |
404 tmp2[i + 5] += (x >> 18); | |
405 | |
406 tmp2[i + 7] += (x << 21) & kBottom28Bits; | |
407 tmp2[i + 8] += x >> 7; | |
408 | |
409 /* At position 199, which is the starting bit of the 9th word when | |
410 * dealing with a context starting on an odd word, we have a factor of | |
411 * 0x1e000000 = 2**29 - 2**25. Since we haved updated i, the 9th word | |
wtc
2012/11/15 22:58:41
Is "we have updated i" correct? This loop does not
wtc
2012/11/16 05:28:53
Should this comment say:
Since we haved not upd
agl
2012/11/16 19:17:46
Yes, thanks, there was a missing "not" in there.
| |
412 * from i+1 is i+9. */ | |
413 tmp2[i + 8] += 0x20000000; | |
414 tmp2[i + 9] += x - 1; | |
415 tmp2[i + 8] -= (x << 25) & kBottom29Bits; | |
416 tmp2[i + 9] -= x >> 4; | |
417 | |
418 xMask = NON_ZERO_TO_ALL_ONES(x); | |
419 tmp2[i + 9] += 0x10000000 & xMask; | |
420 tmp2[i + 9] -= x & xMask; | |
421 tmp2[i + 10] += (x - 1) & xMask; | |
422 } | |
423 | |
424 /* We merge the right shift with a carry chain. The words above 2**257 have | |
425 * widths of 28,29,... which we need to correct when copying them down. */ | |
426 carry = 0; | |
427 for (i = 0; i < 8; i++) { | |
428 /* The maximum value of tmp2[i + 9] occurs on the first iteration and | |
429 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is | |
430 * therefore safe. */ | |
431 out[i] = tmp2[i + 9]; | |
432 out[i] += carry; | |
433 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits; | |
434 carry = out[i] >> 29; | |
435 out[i] &= kBottom29Bits; | |
436 | |
437 i++; | |
438 out[i] = tmp2[i + 9] >> 1; | |
439 out[i] += carry; | |
440 carry = out[i] >> 28; | |
441 out[i] &= kBottom28Bits; | |
442 } | |
443 | |
444 out[8] = tmp2[17]; | |
445 out[8] += carry; | |
446 carry = out[8] >> 29; | |
447 out[8] &= kBottom29Bits; | |
448 | |
449 felem_reduce_carry(out, carry); | |
450 } | |
451 | |
452 /* felem_square sets out=in*in. | |
453 * | |
454 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29. | |
455 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ | |
456 static void felem_square(felem out, const felem in) | |
457 { | |
458 uint64_t tmp[17]; | |
wtc
2012/11/15 22:58:41
You can use u64 instead of uint64_t here and in th
agl
2012/11/16 19:17:46
Done.
| |
459 | |
460 tmp[0] = ((uint64_t) in[0]) * in[0]; | |
461 tmp[1] = ((uint64_t) in[0]) * (in[1] << 1); | |
462 tmp[2] = ((uint64_t) in[0]) * (in[2] << 1) + | |
463 ((uint64_t) in[1]) * (in[1] << 1); | |
wtc
2012/11/15 22:58:41
Just curious: where do these << 1 and << 2 come fr
agl
2012/11/16 19:17:46
Two sources:
When the indexes unequal, a factor o
| |
464 tmp[3] = ((uint64_t) in[0]) * (in[3] << 1) + | |
465 ((uint64_t) in[1]) * (in[2] << 1); | |
466 tmp[4] = ((uint64_t) in[0]) * (in[4] << 1) + | |
467 ((uint64_t) in[1]) * (in[3] << 2) + | |
468 ((uint64_t) in[2]) * in[2]; | |
469 tmp[5] = ((uint64_t) in[0]) * (in[5] << 1) + | |
470 ((uint64_t) in[1]) * (in[4] << 1) + | |
471 ((uint64_t) in[2]) * (in[3] << 1); | |
472 tmp[6] = ((uint64_t) in[0]) * (in[6] << 1) + | |
473 ((uint64_t) in[1]) * (in[5] << 2) + | |
474 ((uint64_t) in[2]) * (in[4] << 1) + | |
475 ((uint64_t) in[3]) * (in[3] << 1); | |
476 tmp[7] = ((uint64_t) in[0]) * (in[7] << 1) + | |
477 ((uint64_t) in[1]) * (in[6] << 1) + | |
478 ((uint64_t) in[2]) * (in[5] << 1) + | |
479 ((uint64_t) in[3]) * (in[4] << 1); | |
480 /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60, | |
481 * which is < 2**64 as required. */ | |
482 tmp[8] = ((uint64_t) in[0]) * (in[8] << 1) + | |
483 ((uint64_t) in[1]) * (in[7] << 2) + | |
484 ((uint64_t) in[2]) * (in[6] << 1) + | |
485 ((uint64_t) in[3]) * (in[5] << 2) + | |
486 ((uint64_t) in[4]) * in[4]; | |
487 tmp[9] = ((uint64_t) in[1]) * (in[8] << 1) + | |
488 ((uint64_t) in[2]) * (in[7] << 1) + | |
489 ((uint64_t) in[3]) * (in[6] << 1) + | |
490 ((uint64_t) in[4]) * (in[5] << 1); | |
491 tmp[10] = ((uint64_t) in[2]) * (in[8] << 1) + | |
492 ((uint64_t) in[3]) * (in[7] << 2) + | |
493 ((uint64_t) in[4]) * (in[6] << 1) + | |
494 ((uint64_t) in[5]) * (in[5] << 1); | |
495 tmp[11] = ((uint64_t) in[3]) * (in[8] << 1) + | |
496 ((uint64_t) in[4]) * (in[7] << 1) + | |
497 ((uint64_t) in[5]) * (in[6] << 1); | |
498 tmp[12] = ((uint64_t) in[4]) * (in[8] << 1) + | |
499 ((uint64_t) in[5]) * (in[7] << 2) + | |
500 ((uint64_t) in[6]) * in[6]; | |
501 tmp[13] = ((uint64_t) in[5]) * (in[8] << 1) + | |
502 ((uint64_t) in[6]) * (in[7] << 1); | |
503 tmp[14] = ((uint64_t) in[6]) * (in[8] << 1) + | |
504 ((uint64_t) in[7]) * (in[7] << 1); | |
505 tmp[15] = ((uint64_t) in[7]) * (in[8] << 1); | |
506 tmp[16] = ((uint64_t) in[8]) * in[8]; | |
507 | |
508 felem_reduce_degree(out, tmp); | |
509 } | |
510 | |
511 /* felem_mul sets out=in*in2. | |
512 * | |
513 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and | |
514 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29. | |
515 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ | |
516 static void felem_mul(felem out, const felem in, const felem in2) | |
517 { | |
518 u64 tmp[17]; | |
519 | |
520 tmp[0] = ((uint64_t) in[0]) * in2[0]; | |
521 tmp[1] = ((uint64_t) in[0]) * (in2[1] << 0) + | |
522 ((uint64_t) in[1]) * (in2[0] << 0); | |
523 tmp[2] = ((uint64_t) in[0]) * (in2[2] << 0) + | |
524 ((uint64_t) in[1]) * (in2[1] << 1) + | |
525 ((uint64_t) in[2]) * (in2[0] << 0); | |
526 tmp[3] = ((uint64_t) in[0]) * (in2[3] << 0) + | |
527 ((uint64_t) in[1]) * (in2[2] << 0) + | |
528 ((uint64_t) in[2]) * (in2[1] << 0) + | |
529 ((uint64_t) in[3]) * (in2[0] << 0); | |
530 tmp[4] = ((uint64_t) in[0]) * (in2[4] << 0) + | |
531 ((uint64_t) in[1]) * (in2[3] << 1) + | |
532 ((uint64_t) in[2]) * (in2[2] << 0) + | |
533 ((uint64_t) in[3]) * (in2[1] << 1) + | |
534 ((uint64_t) in[4]) * (in2[0] << 0); | |
535 tmp[5] = ((uint64_t) in[0]) * (in2[5] << 0) + | |
536 ((uint64_t) in[1]) * (in2[4] << 0) + | |
537 ((uint64_t) in[2]) * (in2[3] << 0) + | |
538 ((uint64_t) in[3]) * (in2[2] << 0) + | |
539 ((uint64_t) in[4]) * (in2[1] << 0) + | |
540 ((uint64_t) in[5]) * (in2[0] << 0); | |
541 tmp[6] = ((uint64_t) in[0]) * (in2[6] << 0) + | |
542 ((uint64_t) in[1]) * (in2[5] << 1) + | |
543 ((uint64_t) in[2]) * (in2[4] << 0) + | |
544 ((uint64_t) in[3]) * (in2[3] << 1) + | |
545 ((uint64_t) in[4]) * (in2[2] << 0) + | |
546 ((uint64_t) in[5]) * (in2[1] << 1) + | |
547 ((uint64_t) in[6]) * (in2[0] << 0); | |
548 tmp[7] = ((uint64_t) in[0]) * (in2[7] << 0) + | |
549 ((uint64_t) in[1]) * (in2[6] << 0) + | |
550 ((uint64_t) in[2]) * (in2[5] << 0) + | |
551 ((uint64_t) in[3]) * (in2[4] << 0) + | |
552 ((uint64_t) in[4]) * (in2[3] << 0) + | |
553 ((uint64_t) in[5]) * (in2[2] << 0) + | |
554 ((uint64_t) in[6]) * (in2[1] << 0) + | |
555 ((uint64_t) in[7]) * (in2[0] << 0); | |
556 /* tmp[8] has the greatest value but doesn't overflow. See logic in | |
557 * felem_square. */ | |
558 tmp[8] = ((uint64_t) in[0]) * (in2[8] << 0) + | |
559 ((uint64_t) in[1]) * (in2[7] << 1) + | |
560 ((uint64_t) in[2]) * (in2[6] << 0) + | |
561 ((uint64_t) in[3]) * (in2[5] << 1) + | |
562 ((uint64_t) in[4]) * (in2[4] << 0) + | |
563 ((uint64_t) in[5]) * (in2[3] << 1) + | |
564 ((uint64_t) in[6]) * (in2[2] << 0) + | |
565 ((uint64_t) in[7]) * (in2[1] << 1) + | |
566 ((uint64_t) in[8]) * (in2[0] << 0); | |
567 tmp[9] = ((uint64_t) in[1]) * (in2[8] << 0) + | |
568 ((uint64_t) in[2]) * (in2[7] << 0) + | |
569 ((uint64_t) in[3]) * (in2[6] << 0) + | |
570 ((uint64_t) in[4]) * (in2[5] << 0) + | |
571 ((uint64_t) in[5]) * (in2[4] << 0) + | |
572 ((uint64_t) in[6]) * (in2[3] << 0) + | |
573 ((uint64_t) in[7]) * (in2[2] << 0) + | |
574 ((uint64_t) in[8]) * (in2[1] << 0); | |
575 tmp[10] = ((uint64_t) in[2]) * (in2[8] << 0) + | |
576 ((uint64_t) in[3]) * (in2[7] << 1) + | |
577 ((uint64_t) in[4]) * (in2[6] << 0) + | |
578 ((uint64_t) in[5]) * (in2[5] << 1) + | |
579 ((uint64_t) in[6]) * (in2[4] << 0) + | |
580 ((uint64_t) in[7]) * (in2[3] << 1) + | |
581 ((uint64_t) in[8]) * (in2[2] << 0); | |
582 tmp[11] = ((uint64_t) in[3]) * (in2[8] << 0) + | |
583 ((uint64_t) in[4]) * (in2[7] << 0) + | |
584 ((uint64_t) in[5]) * (in2[6] << 0) + | |
585 ((uint64_t) in[6]) * (in2[5] << 0) + | |
586 ((uint64_t) in[7]) * (in2[4] << 0) + | |
587 ((uint64_t) in[8]) * (in2[3] << 0); | |
588 tmp[12] = ((uint64_t) in[4]) * (in2[8] << 0) + | |
589 ((uint64_t) in[5]) * (in2[7] << 1) + | |
590 ((uint64_t) in[6]) * (in2[6] << 0) + | |
591 ((uint64_t) in[7]) * (in2[5] << 1) + | |
592 ((uint64_t) in[8]) * (in2[4] << 0); | |
593 tmp[13] = ((uint64_t) in[5]) * (in2[8] << 0) + | |
594 ((uint64_t) in[6]) * (in2[7] << 0) + | |
595 ((uint64_t) in[7]) * (in2[6] << 0) + | |
596 ((uint64_t) in[8]) * (in2[5] << 0); | |
597 tmp[14] = ((uint64_t) in[6]) * (in2[8] << 0) + | |
598 ((uint64_t) in[7]) * (in2[7] << 1) + | |
599 ((uint64_t) in[8]) * (in2[6] << 0); | |
600 tmp[15] = ((uint64_t) in[7]) * (in2[8] << 0) + | |
601 ((uint64_t) in[8]) * (in2[7] << 0); | |
602 tmp[16] = ((uint64_t) in[8]) * (in2[8] << 0); | |
603 | |
604 felem_reduce_degree(out, tmp); | |
605 } | |
606 | |
607 static void felem_assign(felem out, const felem in) | |
608 { | |
609 memcpy(out, in, sizeof(felem)); | |
wtc
2012/11/15 22:58:41
Does C allow us to assign an array directly? I kno
agl
2012/11/16 19:17:46
I'm not sure. Since neither of us are sure, I'd st
| |
610 } | |
611 | |
612 /* felem_inv calculates |out| = |in|^{-1} | |
613 * | |
614 * Based on Fermat's Little Theorem: | |
615 * a^p = a (mod p) | |
616 * a^{p-1} = 1 (mod p) | |
617 * a^{p-2} = a^{-1} (mod p) | |
618 */ | |
619 static void felem_inv(felem out, const felem in) | |
620 { | |
621 felem ftmp, ftmp2; | |
622 /* each e_I will hold |in|^{2^I - 1} */ | |
623 felem e2, e4, e8, e16, e32, e64; | |
624 unsigned i; | |
625 | |
626 felem_square(ftmp, in); /* 2^1 */ | |
627 felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */ | |
628 felem_assign(e2, ftmp); | |
629 felem_square(ftmp, ftmp); /* 2^3 - 2^1 */ | |
630 felem_square(ftmp, ftmp); /* 2^4 - 2^2 */ | |
631 felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */ | |
632 felem_assign(e4, ftmp); | |
633 felem_square(ftmp, ftmp); /* 2^5 - 2^1 */ | |
634 felem_square(ftmp, ftmp); /* 2^6 - 2^2 */ | |
635 felem_square(ftmp, ftmp); /* 2^7 - 2^3 */ | |
636 felem_square(ftmp, ftmp); /* 2^8 - 2^4 */ | |
637 felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */ | |
638 felem_assign(e8, ftmp); | |
639 for (i = 0; i < 8; i++) { | |
640 felem_square(ftmp, ftmp); | |
641 } /* 2^16 - 2^8 */ | |
642 felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */ | |
643 felem_assign(e16, ftmp); | |
644 for (i = 0; i < 16; i++) { | |
645 felem_square(ftmp, ftmp); | |
646 } /* 2^32 - 2^16 */ | |
647 felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */ | |
648 felem_assign(e32, ftmp); | |
649 for (i = 0; i < 32; i++) { | |
650 felem_square(ftmp, ftmp); | |
651 } /* 2^64 - 2^32 */ | |
652 felem_assign(e64, ftmp); | |
653 felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */ | |
654 for (i = 0; i < 192; i++) { | |
655 felem_square(ftmp, ftmp); | |
656 } /* 2^256 - 2^224 + 2^192 */ | |
657 | |
658 felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */ | |
659 for (i = 0; i < 16; i++) { | |
660 felem_square(ftmp2, ftmp2); | |
661 } /* 2^80 - 2^16 */ | |
662 felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */ | |
663 for (i = 0; i < 8; i++) { | |
664 felem_square(ftmp2, ftmp2); | |
665 } /* 2^88 - 2^8 */ | |
666 felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */ | |
667 for (i = 0; i < 4; i++) { | |
668 felem_square(ftmp2, ftmp2); | |
669 } /* 2^92 - 2^4 */ | |
670 felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */ | |
671 felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */ | |
672 felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */ | |
673 felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */ | |
674 felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */ | |
675 felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */ | |
676 felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */ | |
677 | |
678 felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */ | |
wtc
2012/11/15 22:58:41
How did you figure out this sequence?! :-)
agl
2012/11/16 19:17:46
Pen and paper, if I recall.
In general, the probl
| |
679 } | |
680 | |
681 /* felem_scalar_3 sets out=3*out. | |
682 * | |
683 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. | |
684 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ | |
685 static void felem_scalar_3(felem out) | |
686 { | |
687 limb carry = 0; | |
688 unsigned i; | |
689 | |
690 for (i = 0;; i++) { | |
wtc
2012/11/15 22:58:41
In some other for loops you still put the i < NLIM
agl
2012/11/16 19:17:46
Done.
| |
691 out[i] *= 3; | |
692 out[i] += carry; | |
693 carry = out[i] >> 29; | |
694 out[i] &= kBottom29Bits; | |
695 | |
696 i++; | |
697 if (i == NLIMBS) | |
698 break; | |
699 | |
700 out[i] *= 3; | |
701 out[i] += carry; | |
702 carry = out[i] >> 28; | |
703 out[i] &= kBottom28Bits; | |
704 } | |
705 | |
706 felem_reduce_carry(out, carry); | |
707 } | |
708 | |
709 /* felem_scalar_4 sets out=4*out. | |
710 * | |
711 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. | |
712 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ | |
713 static void felem_scalar_4(felem out) | |
714 { | |
715 limb carry = 0, next_carry; | |
716 unsigned i; | |
717 | |
718 for (i = 0;; i++) { | |
719 next_carry = out[i] >> 27; | |
720 out[i] <<= 2; | |
721 out[i] &= kBottom29Bits; | |
722 out[i] += carry; | |
723 carry = next_carry + (out[i] >> 29); | |
724 out[i] &= kBottom29Bits; | |
725 | |
726 i++; | |
727 if (i == NLIMBS) | |
728 break; | |
729 next_carry = out[i] >> 26; | |
730 out[i] <<= 2; | |
731 out[i] &= kBottom28Bits; | |
732 out[i] += carry; | |
733 carry = next_carry + (out[i] >> 28); | |
734 out[i] &= kBottom28Bits; | |
735 } | |
736 | |
737 felem_reduce_carry(out, carry); | |
738 } | |
739 | |
740 /* felem_scalar_8 sets out=8*out. | |
741 * | |
742 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. | |
743 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */ | |
744 static void felem_scalar_8(felem out) | |
745 { | |
746 limb carry = 0, next_carry; | |
747 unsigned i; | |
748 | |
749 for (i = 0;; i++) { | |
750 next_carry = out[i] >> 26; | |
751 out[i] <<= 3; | |
752 out[i] &= kBottom29Bits; | |
753 out[i] += carry; | |
754 carry = next_carry + (out[i] >> 29); | |
755 out[i] &= kBottom29Bits; | |
756 | |
757 i++; | |
758 if (i == NLIMBS) | |
759 break; | |
760 next_carry = out[i] >> 25; | |
761 out[i] <<= 3; | |
762 out[i] &= kBottom28Bits; | |
763 out[i] += carry; | |
764 carry = next_carry + (out[i] >> 28); | |
765 out[i] &= kBottom28Bits; | |
766 } | |
767 | |
768 felem_reduce_carry(out, carry); | |
769 } | |
770 | |
771 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of | |
772 * time depending on the value of |in|. */ | |
773 static char felem_is_zero_vartime(const felem in) | |
774 { | |
775 limb carry; | |
776 int i; | |
777 limb tmp[NLIMBS]; | |
778 felem_assign(tmp, in); | |
779 | |
780 /* First, reduce tmp to a minimal form. */ | |
781 do { | |
782 carry = 0; | |
783 for (i = 0;; i++) { | |
784 tmp[i] += carry; | |
785 carry = tmp[i] >> 29; | |
786 tmp[i] &= kBottom29Bits; | |
787 | |
788 i++; | |
789 if (i == NLIMBS) | |
790 break; | |
791 | |
792 tmp[i] += carry; | |
793 carry = tmp[i] >> 28; | |
794 tmp[i] &= kBottom28Bits; | |
795 } | |
796 | |
797 felem_reduce_carry(tmp, carry); | |
798 } while (carry); | |
799 | |
800 /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */ | |
801 return memcmp(tmp, kZero, sizeof(tmp)) == 0 || | |
802 memcmp(tmp, kP, sizeof(tmp)) == 0 || | |
803 memcmp(tmp, k2P, sizeof(tmp)) == 0; | |
804 } | |
805 | |
/* Group operations:
 *
 * Elements of the elliptic curve group are represented in Jacobian
 * coordinates: (x, y, z) in Jacobian form corresponds to the affine point
 * (x/z**2, y/z**3). */
811 /* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}. | |
812 * | |
813 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling -dbl-2009-l */ | |
814 static void point_double(felem x_out, felem y_out, felem z_out, | |
815 const felem x, const felem y, const felem z) | |
816 { | |
817 felem delta, gamma, alpha, beta, tmp, tmp2; | |
818 | |
819 felem_square(delta, z); | |
820 felem_square(gamma, y); | |
821 felem_mul(beta, x, gamma); | |
822 | |
823 felem_sum(tmp, x, delta); | |
824 felem_diff(tmp2, x, delta); | |
825 felem_mul(alpha, tmp, tmp2); | |
826 felem_scalar_3(alpha); | |
827 | |
828 felem_sum(tmp, y, z); | |
829 felem_square(tmp, tmp); | |
830 felem_diff(tmp, tmp, gamma); | |
831 felem_diff(z_out, tmp, delta); | |
832 | |
833 felem_scalar_4(beta); | |
834 felem_square(x_out, alpha); | |
835 felem_diff(x_out, x_out, beta); | |
836 felem_diff(x_out, x_out, beta); | |
837 | |
838 felem_diff(tmp, beta, x_out); | |
839 felem_mul(tmp, alpha, tmp); | |
840 felem_square(tmp2, gamma); | |
841 felem_scalar_8(tmp2); | |
842 felem_diff(y_out, tmp, tmp2); | |
843 } | |
844 | |
845 /* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}. | |
846 * (i.e. the second point is affine.) | |
847 * | |
848 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl | |
849 * | |
850 * Note that this function does not handle P+P, infinity+P nor P+infinity | |
851 * correctly. */ | |
852 static void point_add_mixed(felem x_out, felem y_out, felem z_out, | |
853 const felem x1, const felem y1, const felem z1, | |
854 const felem x2, const felem y2) | |
855 { | |
856 felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp; | |
857 | |
858 felem_square(z1z1, z1); | |
859 felem_sum(tmp, z1, z1); | |
860 | |
861 felem_mul(u2, x2, z1z1); | |
862 felem_mul(z1z1z1, z1, z1z1); | |
863 felem_mul(s2, y2, z1z1z1); | |
864 felem_diff(h, u2, x1); | |
865 felem_sum(i, h, h); | |
866 felem_square(i, i); | |
867 felem_mul(j, h, i); | |
868 felem_diff(r, s2, y1); | |
869 felem_sum(r, r, r); | |
870 felem_mul(v, x1, i); | |
871 | |
872 felem_mul(z_out, tmp, h); | |
873 felem_square(rr, r); | |
874 felem_diff(x_out, rr, j); | |
875 felem_diff(x_out, x_out, v); | |
876 felem_diff(x_out, x_out, v); | |
877 | |
878 felem_diff(tmp, v, x_out); | |
879 felem_mul(y_out, tmp, r); | |
880 felem_mul(tmp, y1, j); | |
881 felem_diff(y_out, y_out, tmp); | |
882 felem_diff(y_out, y_out, tmp); | |
883 } | |
884 | |
885 /* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}. | |
886 * | |
887 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl | |
888 * | |
889 * Note that this function does not handle P+P, infinity+P nor P+infinity | |
890 * correctly. */ | |
891 static void point_add(felem x_out, felem y_out, felem z_out, | |
892 const felem x1, const felem y1, const felem z1, | |
893 const felem x2, const felem y2, const felem z2) | |
894 { | |
895 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp; | |
896 | |
897 felem_square(z1z1, z1); | |
898 felem_square(z2z2, z2); | |
899 felem_mul(u1, x1, z2z2); | |
900 | |
901 felem_sum(tmp, z1, z2); | |
902 felem_square(tmp, tmp); | |
903 felem_diff(tmp, tmp, z1z1); | |
904 felem_diff(tmp, tmp, z2z2); | |
905 | |
906 felem_mul(z2z2z2, z2, z2z2); | |
907 felem_mul(s1, y1, z2z2z2); | |
908 | |
909 felem_mul(u2, x2, z1z1); | |
910 felem_mul(z1z1z1, z1, z1z1); | |
911 felem_mul(s2, y2, z1z1z1); | |
912 felem_diff(h, u2, u1); | |
913 felem_sum(i, h, h); | |
914 felem_square(i, i); | |
915 felem_mul(j, h, i); | |
916 felem_diff(r, s2, s1); | |
917 felem_sum(r, r, r); | |
918 felem_mul(v, u1, i); | |
919 | |
920 felem_mul(z_out, tmp, h); | |
921 felem_square(rr, r); | |
922 felem_diff(x_out, rr, j); | |
923 felem_diff(x_out, x_out, v); | |
924 felem_diff(x_out, x_out, v); | |
925 | |
926 felem_diff(tmp, v, x_out); | |
927 felem_mul(y_out, tmp, r); | |
928 felem_mul(tmp, s1, j); | |
929 felem_diff(y_out, y_out, tmp); | |
930 felem_diff(y_out, y_out, tmp); | |
931 } | |
932 | |
933 /* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} + | |
934 * {x2,y2,z2}. | |
935 * | |
936 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl | |
937 * | |
938 * This function handles the case where {x1,y1,z1}={x2,y2,z2}. */ | |
939 static void point_add_or_double_vartime( | |
940 felem x_out, felem y_out, felem z_out, | |
941 const felem x1, const felem y1, const felem z1, | |
942 const felem x2, const felem y2, const felem z2) | |
943 { | |
944 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp; | |
945 char x_equal, y_equal; | |
946 | |
947 felem_square(z1z1, z1); | |
948 felem_square(z2z2, z2); | |
949 felem_mul(u1, x1, z2z2); | |
950 | |
951 felem_sum(tmp, z1, z2); | |
952 felem_square(tmp, tmp); | |
953 felem_diff(tmp, tmp, z1z1); | |
954 felem_diff(tmp, tmp, z2z2); | |
955 | |
956 felem_mul(z2z2z2, z2, z2z2); | |
957 felem_mul(s1, y1, z2z2z2); | |
958 | |
959 felem_mul(u2, x2, z1z1); | |
960 felem_mul(z1z1z1, z1, z1z1); | |
961 felem_mul(s2, y2, z1z1z1); | |
962 felem_diff(h, u2, u1); | |
963 x_equal = felem_is_zero_vartime(h); | |
964 felem_sum(i, h, h); | |
965 felem_square(i, i); | |
966 felem_mul(j, h, i); | |
967 felem_diff(r, s2, s1); | |
968 y_equal = felem_is_zero_vartime(r); | |
969 if (x_equal && y_equal) { | |
970 point_double(x_out, y_out, z_out, x1, y1, z1); | |
971 return; | |
972 } | |
973 felem_sum(r, r, r); | |
974 felem_mul(v, u1, i); | |
975 | |
976 felem_mul(z_out, tmp, h); | |
977 felem_square(rr, r); | |
978 felem_diff(x_out, rr, j); | |
979 felem_diff(x_out, x_out, v); | |
980 felem_diff(x_out, x_out, v); | |
981 | |
982 felem_diff(tmp, v, x_out); | |
983 felem_mul(y_out, tmp, r); | |
984 felem_mul(tmp, s1, j); | |
985 felem_diff(y_out, y_out, tmp); | |
986 felem_diff(y_out, y_out, tmp); | |
987 } | |
988 | |
989 /* copy_conditional sets out=in if mask = 0xffffffff in constant time. | |
990 * | |
991 * On entry: mask is either 0 or 0xffffffff. */ | |
992 static void copy_conditional(felem out, const felem in, limb mask) | |
993 { | |
994 int i; | |
995 | |
996 for (i = 0; i < NLIMBS; i++) { | |
997 const limb tmp = mask & (in[i] ^ out[i]); | |
998 out[i] ^= tmp; | |
999 } | |
1000 } | |
1001 | |
1002 /* select_affine_point sets {out_x,out_y} to the index'th entry of table. | |
1003 * On entry: index < 16, table[0] must be zero. */ | |
1004 static void select_affine_point(felem out_x, felem out_y, | |
1005 const limb *table, limb index) | |
1006 { | |
1007 limb i, j; | |
1008 | |
1009 memset(out_x, 0, sizeof(felem)); | |
1010 memset(out_y, 0, sizeof(felem)); | |
1011 | |
1012 for (i = 1; i < 16; i++) { | |
1013 limb mask = i ^ index; | |
1014 mask |= mask >> 2; | |
1015 mask |= mask >> 1; | |
1016 mask &= 1; | |
1017 mask--; | |
1018 for (j = 0; j < NLIMBS; j++) { | |
1019 out_x[j] |= (*table++) & mask; | |
1020 } | |
wtc
2012/11/15 22:58:41
Do you think this is a little clearer?
for (j
agl
2012/11/16 19:17:46
Done. (And 4x below.)
| |
1021 for (j = 0; j < NLIMBS; j++) { | |
1022 out_y[j] |= (*table++) & mask; | |
1023 } | |
1024 } | |
1025 } | |
1026 | |
1027 /* select_affine_point sets {out_x,out_y,out_z} to the index'th entry of table. | |
1028 * On entry: index < 16, table[0] must be zero. */ | |
1029 static void select_jacobian_point(felem out_x, felem out_y, felem out_z, | |
wtc
2012/11/15 22:58:41
Typo: select_affine_point => select_jacobian_point
agl
2012/11/16 19:17:46
Done.
| |
1030 const limb *table, limb index) | |
1031 { | |
1032 limb i, j; | |
1033 | |
1034 memset(out_x, 0, sizeof(felem)); | |
1035 memset(out_y, 0, sizeof(felem)); | |
1036 memset(out_z, 0, sizeof(felem)); | |
1037 | |
1038 table += 3*NLIMBS; | |
wtc
2012/11/15 22:58:41
Why does this function add an offset to table, but
agl
2012/11/16 19:17:46
Have added comment.
/* The implicit value at inde
| |
1039 | |
1040 for (i = 1; i < 16; i++) { | |
1041 limb mask = i ^ index; | |
1042 mask |= mask >> 2; | |
1043 mask |= mask >> 1; | |
1044 mask &= 1; | |
1045 mask--; | |
1046 for (j = 0; j < NLIMBS; j++) { | |
1047 out_x[j] |= (*table++) & mask; | |
1048 } | |
1049 for (j = 0; j < NLIMBS; j++) { | |
1050 out_y[j] |= (*table++) & mask; | |
1051 } | |
1052 for (j = 0; j < NLIMBS; j++) { | |
1053 out_z[j] |= (*table++) & mask; | |
1054 } | |
1055 } | |
1056 } | |
1057 | |
1058 /* get_bit returns the bit'th bit of scalar. */ | |
1059 static char get_bit(const u8 scalar[32], int bit) | |
1060 { | |
1061 return ((scalar[bit >> 3]) >> (bit & 7)) & 1; | |
1062 } | |
1063 | |
1064 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian | |
1065 * number. Note that the value of scalar must be less than the order of the | |
1066 * group. */ | |
1067 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32]) | |
1068 { | |
1069 int i, j; | |
1070 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask; | |
1071 u32 table_offset; | |
1072 | |
1073 felem px, py; | |
1074 felem tx, ty, tz; | |
1075 | |
1076 memset(nx, 0, sizeof(felem)); | |
1077 memset(ny, 0, sizeof(felem)); | |
1078 memset(nz, 0, sizeof(felem)); | |
1079 | |
1080 /* The loop adds bits at positions 0, 64, 128 and 192, followed by | |
1081 * positions 32,96,160 and 224 and does this 32 times. */ | |
1082 for (i = 0; i < 32; i++) { | |
1083 if (i) { | |
1084 point_double(nx, ny, nz, nx, ny, nz); | |
1085 } | |
1086 for (j = 0; j <= 32; j += 32) { | |
1087 char bit0 = get_bit(scalar, 31 - i + j); | |
1088 char bit1 = get_bit(scalar, 95 - i + j); | |
1089 char bit2 = get_bit(scalar, 159 - i + j); | |
1090 char bit3 = get_bit(scalar, 223 - i + j); | |
1091 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3); | |
1092 | |
1093 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS); | |
1094 select_affine_point(px, py, kPrecomputed + table_offset, index); | |
1095 | |
1096 /* Since scalar is less than the order of the group, we know that | |
1097 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle | |
1098 * below. */ | |
1099 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py); | |
1100 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero | |
1101 * (a.k.a. the point at infinity). We handle that situation by | |
1102 * copying the point from the table. */ | |
1103 copy_conditional(nx, px, n_is_infinity_mask); | |
1104 copy_conditional(ny, py, n_is_infinity_mask); | |
1105 copy_conditional(nz, kOne, n_is_infinity_mask); | |
1106 | |
1107 /* Equally, the result is also wrong if the point from the table is | |
1108 * zero, which happens when the index is zero. We handle that by | |
1109 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */ | |
1110 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index); | |
1111 mask = p_is_noninfinite_mask & ~n_is_infinity_mask; | |
1112 copy_conditional(nx, tx, mask); | |
1113 copy_conditional(ny, ty, mask); | |
1114 copy_conditional(nz, tz, mask); | |
1115 /* If p was not zero, then n is now non-zero. */ | |
1116 n_is_infinity_mask &= ~p_is_noninfinite_mask; | |
1117 } | |
1118 } | |
1119 } | |
1120 | |
1121 /* point_to_affine converts a Jacobian point to an affine point. If the input | |
1122 * is the point at infinity then it returns (0, 0) in constant time. */ | |
1123 static void point_to_affine(felem x_out, felem y_out, | |
1124 const felem nx, const felem ny, const felem nz) { | |
1125 felem z_inv, z_inv_sq; | |
1126 felem_inv(z_inv, nz); | |
1127 felem_square(z_inv_sq, z_inv); | |
wtc
2012/11/15 22:58:41
In your formula on line 809, I see x=x'/z, y=y'/z.
agl
2012/11/16 19:17:46
I had written down the formula for projective coor
| |
1128 felem_mul(x_out, nx, z_inv_sq); | |
1129 felem_mul(z_inv, z_inv, z_inv_sq); | |
wtc
2012/11/15 22:58:41
Isn't this z inverse cubed?
| |
1130 felem_mul(y_out, ny, z_inv); | |
1131 } | |
1132 | |
1133 /* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */ | |
1134 static void scalar_mult(felem nx, felem ny, felem nz, | |
1135 const felem x, const felem y, const u8 scalar[32]) | |
1136 { | |
1137 int i; | |
1138 felem px, py, pz, tx, ty, tz; | |
1139 felem precomp[16][3]; | |
1140 limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask; | |
1141 | |
1142 /* We precompute 0,1,2,... times {x,y}. */ | |
1143 memset(precomp, 0, sizeof(felem) * 3); | |
1144 memcpy(&precomp[1][0], x, sizeof(felem)); | |
1145 memcpy(&precomp[1][1], y, sizeof(felem)); | |
1146 memcpy(&precomp[1][2], kOne, sizeof(felem)); | |
1147 | |
1148 for (i = 2; i < 16; i += 2) { | |
1149 point_double(precomp[i][0], precomp[i][1], precomp[i][2], | |
1150 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]); | |
1151 | |
1152 point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2], | |
1153 precomp[i][0], precomp[i][1], precomp[i][2], x, y); | |
1154 } | |
1155 | |
1156 memset(nx, 0, sizeof(felem)); | |
1157 memset(ny, 0, sizeof(felem)); | |
1158 memset(nz, 0, sizeof(felem)); | |
1159 n_is_infinity_mask = -1; | |
1160 | |
1161 /* We add in a window of four bits each iteration and do this 64 times. */ | |
1162 for (i = 0; i < 64; i++) { | |
1163 if (i) { | |
1164 point_double(nx, ny, nz, nx, ny, nz); | |
1165 point_double(nx, ny, nz, nx, ny, nz); | |
1166 point_double(nx, ny, nz, nx, ny, nz); | |
1167 point_double(nx, ny, nz, nx, ny, nz); | |
1168 } | |
1169 | |
1170 index = scalar[31 - i / 2]; | |
1171 if ((i & 1) == 1) { | |
1172 index &= 15; | |
1173 } else { | |
1174 index >>= 4; | |
1175 } | |
1176 | |
1177 /* See the comments in scalar_base_mult about handling infinities. */ | |
1178 select_jacobian_point(px, py, pz, (limb *) precomp, index); | |
1179 point_add(tx, ty, tz, nx, ny, nz, px, py, pz); | |
1180 copy_conditional(nx, px, n_is_infinity_mask); | |
1181 copy_conditional(ny, py, n_is_infinity_mask); | |
1182 copy_conditional(nz, pz, n_is_infinity_mask); | |
1183 | |
1184 p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31; | |
1185 mask = p_is_noninfinite_mask & ~n_is_infinity_mask; | |
1186 copy_conditional(nx, tx, mask); | |
1187 copy_conditional(ny, ty, mask); | |
1188 copy_conditional(nz, tz, mask); | |
1189 n_is_infinity_mask &= ~p_is_noninfinite_mask; | |
1190 } | |
1191 } | |
1192 | |
1193 | |
wtc
2012/11/15 22:58:41
Delete a blank line?
agl
2012/11/16 19:17:46
Done.
| |
1194 /* Interface with Freebl: */ | |
1195 | |
1196 #if IS_BIG_ENDIAN | |
1197 #error "This code needs a little-endian processor" | |
1198 #endif | |
1199 | |
1200 static const u32 kRInvDigits[8] = {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xf ffffffe, 1, 0x7fffffff}; | |
wtc
2012/11/15 22:58:41
Fold this long line.
agl
2012/11/16 19:17:46
Done.
| |
1201 #define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit)) | |
1202 static const mp_int kRInv = { | |
1203 MP_ZPOS, | |
1204 MP_DIGITS_IN_256_BITS, | |
1205 MP_DIGITS_IN_256_BITS, | |
1206 /* Because we are running on a little-endian processor, this cast works for | |
1207 * both 32 and 64-bit processors. */ | |
1208 (mp_digit*) kRInvDigits | |
1209 }; | |
1210 | |
1211 static const limb kTwo28 = 0x10000000; | |
1212 static const limb kTwo29 = 0x20000000; | |
1213 | |
1214 /* to_montgomery sets out = R*in. */ | |
1215 static mp_err to_montgomery(felem out, const mp_int *in, const ECGroup *group) { | |
wtc
2012/11/15 22:58:41
Put the opening '{' for this function on its own l
agl
2012/11/16 19:17:46
Done.
| |
1216 /* There are no MPI functions for bitshift operations and we wish to shift | |
1217 * in 257 bits left so we move the digits 256-bits left and then multiply | |
1218 * by two. */ | |
1219 mp_int in_shifted; | |
1220 int i; | |
1221 mp_err res; | |
1222 | |
1223 mp_init(&in_shifted); | |
1224 s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS); | |
1225 memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS), | |
1226 MP_DIGITS(in), | |
1227 MP_USED(in)*sizeof(mp_digit)); | |
1228 mp_mul_2(&in_shifted, &in_shifted); | |
1229 MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth)); | |
1230 | |
1231 for (i = 0; i < NLIMBS; i++) { | |
1232 if ((i & 1) == 0) { | |
1233 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits; | |
1234 mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL); | |
1235 } else { | |
1236 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits; | |
1237 mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL); | |
1238 } | |
1239 } | |
wtc
2012/11/16 05:28:53
Should we write this for loop using the prevalent
agl
2012/11/16 19:17:46
Done.
| |
1240 | |
1241 mp_clear(&in_shifted); | |
wtc
2012/11/15 22:58:41
I believe we need to do this mp_clear call under t
agl
2012/11/16 19:17:46
Done.
| |
1242 | |
1243 CLEANUP: | |
1244 return res; | |
1245 } | |
1246 | |
1247 /* from_montgomery sets out=in/R. */ | |
1248 static mp_err from_montgomery(mp_int *out, const felem in, | |
1249 const ECGroup *group) { | |
wtc
2012/11/15 22:58:41
Adjust the indentation.
agl
2012/11/16 19:17:46
Done.
| |
1250 mp_int result, tmp; | |
1251 mp_err res; | |
1252 int i; | |
1253 | |
1254 mp_init(&result); | |
1255 mp_init(&tmp); | |
1256 | |
1257 MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result)); | |
1258 for (i = NLIMBS-2; i >= 0; i--) { | |
1259 if ((i & 1) == 0) { | |
1260 MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp)); | |
1261 } else { | |
1262 MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp)); | |
1263 } | |
1264 MP_CHECKOK(mp_add_d(&tmp, in[i], &result)); | |
1265 } | |
1266 | |
1267 MP_CHECKOK(mp_mul(&result, &kRInv, out)); | |
1268 MP_CHECKOK(group->meth->field_mod(out, out, group->meth)); | |
1269 | |
1270 mp_clear(&result); | |
1271 mp_clear(&tmp); | |
1272 | |
1273 CLEANUP: | |
1274 return res; | |
1275 } | |
1276 | |
1277 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */ | |
1278 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n) { | |
1279 /* We require that |n| is less than the order of the group and therefore it | |
1280 * will fit into |scalar|. However, these is a timing side-channel here that | |
1281 * we cannot avoid: if |n| is sufficiently small it may be one or more words | |
1282 * too short and we'll copy less data. */ | |
1283 memset(out_scalar, 0, 32); | |
1284 memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit)); | |
1285 } | |
1286 | |
1287 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the | |
1288 * order of the group. */ | |
1289 mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n, | |
1290 mp_int *out_x, mp_int *out_y, | |
wtc
2012/11/15 22:58:41
Does freebl follow the convention that output argu
agl
2012/11/16 19:17:46
Yes, this function's prototype matches base_point_
| |
1291 const ECGroup *group) { | |
1292 u8 scalar[32]; | |
1293 felem x, y, z, x_affine, y_affine; | |
1294 mp_err res; | |
1295 | |
1296 // FIXME(agl): test that n < order. | |
wtc
2012/11/15 22:58:41
Use C-style /* */ comment delimiters.
agl
2012/11/16 19:17:46
Done.
| |
1297 | |
1298 scalar_from_mp_int(scalar, n); | |
1299 scalar_base_mult(x, y, z, scalar); | |
1300 point_to_affine(x_affine, y_affine, x, y, z); | |
1301 MP_CHECKOK(from_montgomery(out_x, x_affine, group)); | |
1302 MP_CHECKOK(from_montgomery(out_y, y_affine, group)); | |
1303 | |
1304 CLEANUP: | |
1305 return res; | |
1306 } | |
1307 | |
1308 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is < | |
1309 * the order of the group. */ | |
1310 mp_err ec_GFp_nistp256_point_mul(const mp_int *n, | |
1311 const mp_int *in_x, const mp_int *in_y, | |
1312 mp_int *out_x, mp_int *out_y, | |
1313 const ECGroup *group) { | |
1314 u8 scalar[32]; | |
1315 felem x, y, z, x_affine, y_affine, px, py; | |
1316 mp_err res; | |
1317 | |
1318 scalar_from_mp_int(scalar, n); | |
1319 | |
1320 MP_CHECKOK(to_montgomery(px, in_x, group)); | |
1321 MP_CHECKOK(to_montgomery(py, in_y, group)); | |
1322 | |
1323 scalar_mult(x, y, z, px, py, scalar); | |
1324 point_to_affine(x_affine, y_affine, x, y, z); | |
1325 MP_CHECKOK(from_montgomery(out_x, x_affine, group)); | |
1326 MP_CHECKOK(from_montgomery(out_y, y_affine, group)); | |
1327 | |
1328 CLEANUP: | |
1329 return res; | |
1330 } | |
1331 | |
1332 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where | |
wtc
2012/11/15 22:58:41
Typo: ec_GFp_nistp256_point_mul => ec_GFp_nistp256
agl
2012/11/16 19:17:46
Done.
| |
1333 * n1 and n2 are < the order of the group. | |
1334 * | |
1335 * As indicated by the name, this function operates in variable time. This | |
1336 * is safe because it's used for signature validation which doesn't deal | |
1337 * with secrets. */ | |
1338 mp_err ec_GFp_nistp256_points_mul_vartime | |
1339 (const mp_int *n1, const mp_int *n2, | |
wtc
2012/11/15 22:58:41
Put the opening '(' on the previous line.
agl
2012/11/16 19:17:46
Done.
| |
1340 const mp_int *in_x, const mp_int *in_y, | |
1341 mp_int *out_x, mp_int *out_y, | |
1342 const ECGroup *group) { | |
1343 u8 scalar1[32], scalar2[32]; | |
1344 felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py; | |
1345 mp_err res = MP_OKAY; | |
1346 | |
1347 /* If both scalars are zero, then the result is the point at infinity. */ | |
1348 if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) { | |
1349 mp_zero(out_x); | |
1350 mp_zero(out_y); | |
1351 return res; | |
1352 } | |
1353 | |
1354 scalar_from_mp_int(scalar1, n1); | |
1355 scalar_from_mp_int(scalar2, n2); | |
1356 | |
1357 MP_CHECKOK(to_montgomery(px, in_x, group)); | |
1358 MP_CHECKOK(to_montgomery(py, in_y, group)); | |
1359 scalar_base_mult(x1, y1, z1, scalar1); | |
1360 scalar_mult(x2, y2, z2, px, py, scalar2); | |
1361 | |
1362 if (mp_cmp_z(n2) == 0) { | |
1363 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just | |
1364 * {x1,y1,z1}. */ | |
1365 } else if (mp_cmp_z(n1) == 0) { | |
1366 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just | |
1367 * {x2,y2,z2}. */ | |
1368 memcpy(x1, x2, sizeof(x2)); | |
1369 memcpy(y1, y2, sizeof(y2)); | |
1370 memcpy(z1, z2, sizeof(z2)); | |
1371 } else { | |
1372 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */ | |
1373 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2); | |
1374 } | |
1375 | |
1376 point_to_affine(x_affine, y_affine, x1, y1, z1); | |
1377 MP_CHECKOK(from_montgomery(out_x, x_affine, group)); | |
1378 MP_CHECKOK(from_montgomery(out_y, y_affine, group)); | |
1379 | |
1380 CLEANUP: | |
1381 return res; | |
1382 } | |
OLD | NEW |