Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(145)

Side by Side Diff: ecp_256_32.c

Issue 10944017: 32-bit, P-256 implementation. Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Including results of ekasper's review. Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #include "prcpucfg.h"
wtc 2012/11/15 22:58:41 1. Please copy the MPL 2 header from another file.
agl 2012/11/16 19:17:46 Done.
2
3 #include <stdint.h>
4 #include <stdio.h>
5
6 #include <string.h>
7 #include <stdlib.h>
8
9 #include "mpi.h"
10 #include "mpi-priv.h"
11 #include "ecp.h"
12
13 typedef uint8_t u8;
14 typedef uint32_t u32;
15 typedef int32_t s32;
16 typedef uint64_t u64;
17
18 /* Our field elements are represented as nine, unsigned 32-bit words. Freebl's
19 * MPI library calls them digits, but here they are called limbs, which is
20 * GMP's terminology.
21 *
22 * The value of an felem (field element) is:
23 * x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
24 *
25 * That is, each limb is alternately 29 or 28-bits wide in little-endian
26 * order.
27 *
28 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
29 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
30 * when multiplying as terms end up one bit short of a limb which would require
31 * much bit-shifting to correct.
32 *
33 * Finally, the values stored in an felem are in Montgomery form. So the value
34 * |y| is stored as (y * 2**257) mod p, where p is the P-256 prime.
35 */
36 typedef uint32_t limb;
37 #define NLIMBS 9
38 typedef limb felem[NLIMBS];
39
40 static const limb kBottom28Bits = 0xfffffff;
41 static const limb kBottom29Bits = 0x1fffffff;
42
43 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
44 * 28-bit words. */
45 static const felem kOne = {
46 2, 0, 0, 0xffff800,
47 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
48 0
49 };
50 static const limb kZero[NLIMBS] = {0};
51 static const limb kP[NLIMBS] = {
52 0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
53 0, 0, 0x200000, 0xf000000,
54 0xfffffff
55 };
56 static const limb k2P[NLIMBS] = {
wtc 2012/11/15 22:58:41 Use the felem typedef when you define kZero, kP, a
agl 2012/11/16 19:17:46 Done.
57 0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
58 0, 0, 0x400000, 0xe000000,
59 0x1fffffff
60 };
wtc 2012/11/15 22:58:41 Add a blank line here?
agl 2012/11/16 19:17:46 Done.
61 /* kPrecomputed contains precomputed values to aid the calculation of scalar
62 * multiples of the base point, G. It's actually two, equal length, tables
63 * concatenated.
64 *
65 * The first table contains (x,y) felem pairs for 16 multiples of the base
66 * point, G.
67 *
68 * Index | Index (binary) | Value
69 * 0 | 0000 | 0G (all zeros, omitted)
70 * 1 | 0001 | G
71 * 2 | 0010 | 2**64G
72 * 3 | 0011 | 2**64G + G
73 * 4 | 0100 | 2**128G
74 * 5 | 0101 | 2**128G + G
75 * 6 | 0110 | 2**128G + 2**64G
76 * 7 | 0111 | 2**128G + 2**64G + G
77 * 8 | 1000 | 2**192G
78 * 9 | 1001 | 2**192G + G
79 * 10 | 1010 | 2**192G + 2**64G
80 * 11 | 1011 | 2**192G + 2**64G + G
81 * 12 | 1100 | 2**192G + 2**128G
82 * 13 | 1101 | 2**192G + 2**128G + G
83 * 14 | 1110 | 2**192G + 2**128G + 2**64G
84 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
85 *
86 * The second table follows the same style, but the multiples are 2**32G,
wtc 2012/11/15 22:58:41 Nit: 2**32G, 2**96G, etc. probably should be calle
agl 2012/11/16 19:17:46 Done.
87 * 2**96G, 2**160G, 2**224G.
88 *
89 * This is ~2KB of data. */
wtc 2012/11/15 22:58:41 Nit: put the closing */ on its own line. If you m
agl 2012/11/16 19:17:46 Done for all multi-line comments in the file.
90 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
91 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7e dc, 0xd4a6eab, 0x3120bee,
92 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba 21, 0x14b10bb, 0xae3fe3,
93 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe4907 3, 0x3fa36cc, 0x5ebcd2c,
94 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea1244 6, 0xe1ade1e, 0xec91f22,
95 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109 , 0xa267a00, 0xb57c050,
96 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0 x7d6dee7, 0x2976e4b,
97 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a 5a9, 0x843a649, 0xc3ab0fa,
98 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11 , 0x58c43df, 0xf423fc2,
99 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db4 0f, 0x83e277d, 0xb0dd609,
100 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5 , 0xe10c9e, 0x33ab581,
101 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f , 0x48764cd, 0x76dbcca,
102 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b2 0, 0x4ba3173, 0xc168c33,
103 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0 , 0x65dd7ff, 0x3a1e4f6,
104 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f07 7, 0xa6add89, 0x4894acd,
105 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
106 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c , 0xda0cf5b, 0x812e881,
107 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51 , 0xc22be3e, 0xe35e65a,
108 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9 , 0x1c5a839, 0x47a1e26,
109 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c5 02, 0x2f32042, 0xa17769b,
110 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a 02, 0x3fc93, 0x5620023,
111 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c , 0x407f75c, 0xbaab133,
112 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea 7, 0x3293ac0, 0xcdc98aa,
113 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
114 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72 , 0x73e1c35, 0xee70fbc,
115 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85 , 0x27de188, 0x66f70b8,
116 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae 914, 0x2f3ec51, 0x3826b59,
117 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x 823d9d2, 0x8213f39,
118 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4 a, 0xf5ddc3d, 0x3786689,
119 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a72 9, 0x4be3499, 0x52b23aa,
120 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb0480 35, 0xe31de66, 0xc6ecaa3,
121 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a752 9, 0xcb7beb1, 0xb2a78a1,
122 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff6 58, 0xe3d6511, 0xc7d76f,
123 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
124 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d3241 1, 0xb04a838, 0xd760d2d,
125 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11 e, 0x20bca9a, 0x66f496b,
126 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
127 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56 ff, 0x65ef930, 0x21dc4a,
128 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15 f, 0x624e62e, 0xa90ae2f,
129 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522 b, 0xdc78583, 0x40eeabb,
130 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef3 4, 0xae2a960, 0x91b8bdc,
131 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0 x2413c8e, 0x5425bf9,
132 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633 , 0x7c91952, 0xd806dce,
133 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef7 3, 0x8956f34, 0xe4b5cf2,
134 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7 , 0x627b614, 0x7371cca,
135 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc 9, 0x9c19bf2, 0x5882229,
136 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b 3, 0xe85ff25, 0x408ef57,
137 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113 , 0xa4a1769, 0x11fbc6c,
138 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b 7, 0x4acbad9, 0x5efc5fa,
139 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc , 0x7bf0fa9, 0x957651,
140 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
141 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c1 2d, 0xf20bd46, 0x1951fa7,
142 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74 , 0x99bb618, 0x2db944c,
143 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e7477 9, 0x576138, 0x9587927,
144 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782 d, 0xfc72e0b, 0x701b298,
145 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5 d8, 0xf858d3a, 0x942eea8,
146 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a 1, 0x8395659, 0x52ed4e2,
147 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c 0, 0x6bdf55a, 0x4e4457d,
148 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747 b, 0x878558d, 0x7d29aa4,
149 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8,
150 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78,
151 };
152
wtc 2012/11/15 22:58:41 Nit: delete one blank line.
agl 2012/11/16 19:17:46 Done.
153
154 /* Field element operations: */
155
156 /* NON_ZERO_TO_ALL_ONES returns:
157 * 0xffffffff for 0 < x <= 2**31
158 * 0 for x == 0 or x > 2**31.
159 *
160 * This macro assumes that right-shifting a signed number shifts in the MSB on
161 * the left. This is not ensured by the C standard, but is true on the CPUs
162 * that we're targetting with this code (x86 and ARM). */
163 #define NON_ZERO_TO_ALL_ONES(x) (~((u32) (((s32) (x-1)) >> 31)))
wtc 2012/11/15 22:58:41 Should we add parentheses around x in the macro's
agl 2012/11/16 19:17:46 Despite the noted abundance of them already, I thi
164
165 /* felem_reduce_degree adds a multiple of p in order to cancel |carry|,
wtc 2012/11/15 22:58:41 Typo: felem_reduce_degree => felem_reduce_carry
agl 2012/11/16 19:17:46 Done.
166 * which is a term at 2**257.
167 *
168 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
wtc 2012/11/15 22:58:41 Question: The above says |carry| is a term at 2**2
agl 2012/11/16 19:17:46 |carry| is the coefficient of the term at 2**257,
169 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
170 static void felem_reduce_carry(felem inout, limb carry)
171 {
172 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);
173
174 inout[0] += carry << 1;
175 inout[3] += 0x10000000 & carry_mask;
176 // carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
177 // previous line therefore this doesn't underflow.
178 inout[3] -= carry << 11;
179 inout[4] += (0x20000000 - 1) & carry_mask;
180 inout[5] += (0x10000000 - 1) & carry_mask;
181 inout[6] += (0x20000000 - 1) & carry_mask;
182 inout[6] -= carry << 22;
183 // This may underflow if carry is non-zero but, if so, we'll fix it in the
184 // next line.
wtc 2012/11/15 22:58:41 Why don't we add and then subtract? Will that avoi
agl 2012/11/16 19:17:46 Since we're using two's complement, adding then su
185 inout[7] -= 1 & carry_mask;
186 inout[7] += carry << 25;
187 }
188
189 /* felem_sum sets out = in+in2.
190 *
191 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
192 * On exit: in[0,2,...] < 2**30, in2[1,3,...] < 2**29 */
wtc 2012/11/15 22:58:41 Are "in" and "in2" typos? I think you meant "out".
agl 2012/11/16 19:17:46 Done.
193 static void felem_sum(felem out, const felem in, const felem in2)
194 {
195 limb carry = 0;
196 unsigned i;
wtc 2012/11/15 22:58:41 Nit: change "unsigned" to "unsigned int" throughou
agl 2012/11/16 19:17:46 Done.
197 for (i = 0; i < NLIMBS; i++) {
198 out[i] = in[i] + in2[i];
199 out[i] += carry;
200 carry = out[i] >> 29;
201 out[i] &= kBottom29Bits;
202
203 i++;
204 if (i == NLIMBS)
205 break;
206
207 out[i] = in[i] + in2[i];
208 out[i] += carry;
209 carry = out[i] >> 28;
210 out[i] &= kBottom28Bits;
211 }
212
213 felem_reduce_carry(out, carry);
214 }
215
216 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
217 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
218 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
219 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
220 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
221 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
222
223 /* zero31 is 0 mod p. */
224 static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, tw o30m2, two31p24m2, two30m27m2, two31m2 };
wtc 2012/11/15 22:58:41 Please fold this long line.
agl 2012/11/16 19:17:46 Done.
225
226 /* felem_diff sets out = in-in2.
227 *
228 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
229 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
230 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
231 static void felem_diff(felem out, const felem in, const felem in2)
232 {
233 limb carry = 0;
234 unsigned i;
235
236 for (i = 0; i < NLIMBS; i++) {
237 out[i] = in[i] - in2[i];
238 out[i] += zero31[i];
239 out[i] += carry;
240 carry = out[i] >> 29;
241 out[i] &= kBottom29Bits;
242
243 i++;
244 if (i == NLIMBS)
245 break;
246
247 out[i] = in[i] - in2[i];
248 out[i] += zero31[i];
249 out[i] += carry;
250 carry = out[i] >> 28;
251 out[i] &= kBottom28Bits;
252 }
253
254 felem_reduce_carry(out, carry);
255 }
256
257 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
wtc 2012/11/15 22:58:41 Please add "where R = 2**257", or define R earlier
agl 2012/11/16 19:17:46 Done.
258 * with the same 29,28,... bit pattern as an felem.
259 *
260 * On entry: tmp[i] < 2**64
wtc 2012/11/15 22:58:41 You said tmp contains the 29, 28 bit pattern. Did
agl 2012/11/16 19:17:46 I've changed it to say "bit positions". The values
261 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
262 static void felem_reduce_degree(felem out, u64 tmp[17])
263 {
264 /* The following table may be helpful when reading this code:
265 *
266 * Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
267 * Width (bits): 29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
268 * Start bit: 0 | 29| 57| 86|114|143|171|200|228|257|285
269 * (odd word): 0 | 28| 57| 85|114|142|142|171|199|228|256
wtc 2012/11/15 22:58:41 Nit: should "(odd word):" be left aligned?
agl 2012/11/16 19:17:46 It's right aligned because the row contains the st
270 */
271 limb tmp2[18], carry, x, xMask;
272 unsigned i;
273
274 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
275 * felem. So the top of an element of tmp might overlap with another
276 * element two positions down. The following loop eliminates this
277 * overlap. */
278 tmp2[0] = tmp[0] & kBottom29Bits;
279
280 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
281 * and hint to the compiler that it can do a single-word shift by selecting
282 * the right register rather than doing a double-word shift and truncating
283 * afterwards. */
284 tmp2[1] = ((limb) tmp[0]) >> 29;
285 tmp2[1] |= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits;
286 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
287 carry = tmp2[1] >> 28;
288 tmp2[1] &= kBottom28Bits;
289
290 for (i = 2; i < 17; i++) {
291 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
292 tmp2[i] += ((limb) (tmp[i - 1])) >> 28;
293 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
294 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
295 tmp2[i] += carry;
296 carry = tmp2[i] >> 29;
297 tmp2[i] &= kBottom29Bits;
298
299 i++;
300 if (i == 17)
301 break;
302 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
303 tmp2[i] += ((limb) (tmp[i - 1])) >> 29;
304 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
305 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
306 tmp2[i] += carry;
307 carry = tmp2[i] >> 28;
308 tmp2[i] &= kBottom28Bits;
309 }
310
311 tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25;
312 tmp2[17] += ((limb) (tmp[16])) >> 29;
313 tmp2[17] += (((limb) (tmp[16] >> 32)) << 3);
314 tmp2[17] += carry;
315
316 /* Montgomery elimination of terms.
317 *
318 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
319 * Since we just multiplied two Montgomery values together, the result is
320 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
321 * in Montgomery form.
wtc 2012/11/15 22:58:41 This paragraph should be moved to the comment bloc
agl 2012/11/16 19:17:46 Done.
322 *
323 * Since R is 2**257, we can do that with a bitwise shift if we can ensure
324 * that the right-most 257 bits are all zero. We can make that true by
325 * adding multiplies of p without affecting the value.
326 *
327 * So we eliminate limbs from right to left. Since the bottom 29 bits of p
328 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
329 * We can do that for 8 further limbs and then right shift to eliminate the
330 * extra factor of R. */
331 for (i = 0; i < 9; i += 2) {
wtc 2012/11/16 05:28:53 9 => NLIMBS ?
agl 2012/11/16 19:17:46 Done.
332 tmp2[i + 1] += tmp2[i] >> 29;
333 x = tmp2[i] & kBottom29Bits;
334 tmp2[i] = 0;
335
336 /* The bounds calculations for this loop are tricky. Each iteration of
337 * the loop eliminates two words by adding values to words to their
338 * right.
339 *
340 * The following table contains the amounts added to each word (as an
341 * offset from the value of i at the top of the loop). The amounts are
342 * accounted for from the first and second half of the loop separately
343 * and are written as, for example, 28 to mean a value <2**28.
344 *
345 * Word: 3 4 5 6 7 8 9 10
346 * Added in top half: 28 11 29 21 29 28
347 * 28 29
348 * 29
349 * Added in bottom half: 29 10 28 21 28 28
350 * 29
351 *
352 * The value that is current offset 7 will be offset 5 for the next
353 * iteration and then offset 3 for the iteration after that. Therefore
354 * the total value added will be the values added at 7, 5 and 3.
355 *
356 * The following table accumulates these values. The sums at the bottom
357 * are written as, for example, 29+28, to mean a value < 2**29+2**28.
358 *
359 * Word: 3 4 5 6 7 8 9 10 11 12 13
360 * 28 11 10 29 21 29 28 28 28 28 28
361 * 29 28 11 28 29 28 29 28 29 28
362 * 29 28 21 21 29 21 29 21
363 * 10 29 28 21 28 21 28
364 * 28 29 28 29 28 29 28
365 * 11 10 29 10 29 10
366 * 29 28 11 28 11
367 * 29 29
368 * --------------------------------------------
369 * 30+ 31+ 30+ 31+ 30+
370 * 28+ 29+ 28+ 29+ 21+
371 * 21+ 28+ 21+ 28+ 10
372 * 10 21+ 10 21+
373 * 11 11
374 *
375 * So the greatest amount is added to tmp2[10/12]. If tmp2[10/12] has an
wtc 2012/11/15 22:58:41 What does tmp2[10/12] mean? tmp2[10] and tmp2[12]?
agl 2012/11/16 19:17:46 Yes. Have expanded that in the comment.
376 * initial value of <2**29, then the maximum value will be < 2**31 +
377 * 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32, as required. */
378 tmp2[i + 3] += (x << 10) & kBottom28Bits;
379 tmp2[i + 4] += (x >> 18);
380
381 tmp2[i + 6] += (x << 21) & kBottom29Bits;
382 tmp2[i + 7] += x >> 8;
383
384 /* At position 200, which is the starting bit position for word 7, we
385 * have a factor of 0xf000000 = 2**28 - 2**24. */
386 tmp2[i + 7] += 0x10000000;
387 tmp2[i + 8] += x - 1; /* Word 7 is 28 bits wide, so the 2**28 term
388 exactly hits word 8. */
wtc 2012/11/15 22:58:41 Nit: add '*' to the left of this comment line.
agl 2012/11/16 19:17:46 Done.
389 tmp2[i + 7] -= (x << 24) & kBottom28Bits;
390 tmp2[i + 8] -= x >> 4;
391
392 tmp2[i + 8] += 0x20000000;
393 tmp2[i + 8] -= x;
394 tmp2[i + 8] += (x << 28) & kBottom29Bits;
395 tmp2[i + 9] += (x >> 1) - 1;
396
397 if (i+1 == NLIMBS)
398 break;
wtc 2012/11/16 05:28:53 I discovered that this used to be i++ if (
agl 2012/11/16 19:17:46 ekasper asked for it.
399 tmp2[i + 2] += tmp2[i + 1] >> 28;
400 x = tmp2[i + 1] & kBottom28Bits;
401 tmp2[i + 1] = 0;
402
403 tmp2[i + 4] += (x << 11) & kBottom29Bits;
404 tmp2[i + 5] += (x >> 18);
405
406 tmp2[i + 7] += (x << 21) & kBottom28Bits;
407 tmp2[i + 8] += x >> 7;
408
409 /* At position 199, which is the starting bit of the 9th word when
410 * dealing with a context starting on an odd word, we have a factor of
411 * 0x1e000000 = 2**29 - 2**25. Since we haved updated i, the 9th word
wtc 2012/11/15 22:58:41 Is "we have updated i" correct? This loop does not
wtc 2012/11/16 05:28:53 Should this comment say: Since we haved not upd
agl 2012/11/16 19:17:46 Yes, thanks, there was a missing "not" in there.
412 * from i+1 is i+9. */
413 tmp2[i + 8] += 0x20000000;
414 tmp2[i + 9] += x - 1;
415 tmp2[i + 8] -= (x << 25) & kBottom29Bits;
416 tmp2[i + 9] -= x >> 4;
417
418 xMask = NON_ZERO_TO_ALL_ONES(x);
419 tmp2[i + 9] += 0x10000000 & xMask;
420 tmp2[i + 9] -= x & xMask;
421 tmp2[i + 10] += (x - 1) & xMask;
422 }
423
424 /* We merge the right shift with a carry chain. The words above 2**257 have
425 * widths of 28,29,... which we need to correct when copying them down. */
426 carry = 0;
427 for (i = 0; i < 8; i++) {
428 /* The maximum value of tmp2[i + 9] occurs on the first iteration and
429 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
430 * therefore safe. */
431 out[i] = tmp2[i + 9];
432 out[i] += carry;
433 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
434 carry = out[i] >> 29;
435 out[i] &= kBottom29Bits;
436
437 i++;
438 out[i] = tmp2[i + 9] >> 1;
439 out[i] += carry;
440 carry = out[i] >> 28;
441 out[i] &= kBottom28Bits;
442 }
443
444 out[8] = tmp2[17];
445 out[8] += carry;
446 carry = out[8] >> 29;
447 out[8] &= kBottom29Bits;
448
449 felem_reduce_carry(out, carry);
450 }
451
452 /* felem_square sets out=in*in.
453 *
454 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
455 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
456 static void felem_square(felem out, const felem in)
457 {
458 uint64_t tmp[17];
wtc 2012/11/15 22:58:41 You can use u64 instead of uint64_t here and in th
agl 2012/11/16 19:17:46 Done.
459
460 tmp[0] = ((uint64_t) in[0]) * in[0];
461 tmp[1] = ((uint64_t) in[0]) * (in[1] << 1);
462 tmp[2] = ((uint64_t) in[0]) * (in[2] << 1) +
463 ((uint64_t) in[1]) * (in[1] << 1);
wtc 2012/11/15 22:58:41 Just curious: where do these << 1 and << 2 come fr
agl 2012/11/16 19:17:46 Two sources: When the indexes unequal, a factor o
464 tmp[3] = ((uint64_t) in[0]) * (in[3] << 1) +
465 ((uint64_t) in[1]) * (in[2] << 1);
466 tmp[4] = ((uint64_t) in[0]) * (in[4] << 1) +
467 ((uint64_t) in[1]) * (in[3] << 2) +
468 ((uint64_t) in[2]) * in[2];
469 tmp[5] = ((uint64_t) in[0]) * (in[5] << 1) +
470 ((uint64_t) in[1]) * (in[4] << 1) +
471 ((uint64_t) in[2]) * (in[3] << 1);
472 tmp[6] = ((uint64_t) in[0]) * (in[6] << 1) +
473 ((uint64_t) in[1]) * (in[5] << 2) +
474 ((uint64_t) in[2]) * (in[4] << 1) +
475 ((uint64_t) in[3]) * (in[3] << 1);
476 tmp[7] = ((uint64_t) in[0]) * (in[7] << 1) +
477 ((uint64_t) in[1]) * (in[6] << 1) +
478 ((uint64_t) in[2]) * (in[5] << 1) +
479 ((uint64_t) in[3]) * (in[4] << 1);
480 /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
481 * which is < 2**64 as required. */
482 tmp[8] = ((uint64_t) in[0]) * (in[8] << 1) +
483 ((uint64_t) in[1]) * (in[7] << 2) +
484 ((uint64_t) in[2]) * (in[6] << 1) +
485 ((uint64_t) in[3]) * (in[5] << 2) +
486 ((uint64_t) in[4]) * in[4];
487 tmp[9] = ((uint64_t) in[1]) * (in[8] << 1) +
488 ((uint64_t) in[2]) * (in[7] << 1) +
489 ((uint64_t) in[3]) * (in[6] << 1) +
490 ((uint64_t) in[4]) * (in[5] << 1);
491 tmp[10] = ((uint64_t) in[2]) * (in[8] << 1) +
492 ((uint64_t) in[3]) * (in[7] << 2) +
493 ((uint64_t) in[4]) * (in[6] << 1) +
494 ((uint64_t) in[5]) * (in[5] << 1);
495 tmp[11] = ((uint64_t) in[3]) * (in[8] << 1) +
496 ((uint64_t) in[4]) * (in[7] << 1) +
497 ((uint64_t) in[5]) * (in[6] << 1);
498 tmp[12] = ((uint64_t) in[4]) * (in[8] << 1) +
499 ((uint64_t) in[5]) * (in[7] << 2) +
500 ((uint64_t) in[6]) * in[6];
501 tmp[13] = ((uint64_t) in[5]) * (in[8] << 1) +
502 ((uint64_t) in[6]) * (in[7] << 1);
503 tmp[14] = ((uint64_t) in[6]) * (in[8] << 1) +
504 ((uint64_t) in[7]) * (in[7] << 1);
505 tmp[15] = ((uint64_t) in[7]) * (in[8] << 1);
506 tmp[16] = ((uint64_t) in[8]) * in[8];
507
508 felem_reduce_degree(out, tmp);
509 }
510
511 /* felem_mul sets out=in*in2.
512 *
513 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
514 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
515 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
516 static void felem_mul(felem out, const felem in, const felem in2)
517 {
518 u64 tmp[17];
519
520 tmp[0] = ((uint64_t) in[0]) * in2[0];
521 tmp[1] = ((uint64_t) in[0]) * (in2[1] << 0) +
522 ((uint64_t) in[1]) * (in2[0] << 0);
523 tmp[2] = ((uint64_t) in[0]) * (in2[2] << 0) +
524 ((uint64_t) in[1]) * (in2[1] << 1) +
525 ((uint64_t) in[2]) * (in2[0] << 0);
526 tmp[3] = ((uint64_t) in[0]) * (in2[3] << 0) +
527 ((uint64_t) in[1]) * (in2[2] << 0) +
528 ((uint64_t) in[2]) * (in2[1] << 0) +
529 ((uint64_t) in[3]) * (in2[0] << 0);
530 tmp[4] = ((uint64_t) in[0]) * (in2[4] << 0) +
531 ((uint64_t) in[1]) * (in2[3] << 1) +
532 ((uint64_t) in[2]) * (in2[2] << 0) +
533 ((uint64_t) in[3]) * (in2[1] << 1) +
534 ((uint64_t) in[4]) * (in2[0] << 0);
535 tmp[5] = ((uint64_t) in[0]) * (in2[5] << 0) +
536 ((uint64_t) in[1]) * (in2[4] << 0) +
537 ((uint64_t) in[2]) * (in2[3] << 0) +
538 ((uint64_t) in[3]) * (in2[2] << 0) +
539 ((uint64_t) in[4]) * (in2[1] << 0) +
540 ((uint64_t) in[5]) * (in2[0] << 0);
541 tmp[6] = ((uint64_t) in[0]) * (in2[6] << 0) +
542 ((uint64_t) in[1]) * (in2[5] << 1) +
543 ((uint64_t) in[2]) * (in2[4] << 0) +
544 ((uint64_t) in[3]) * (in2[3] << 1) +
545 ((uint64_t) in[4]) * (in2[2] << 0) +
546 ((uint64_t) in[5]) * (in2[1] << 1) +
547 ((uint64_t) in[6]) * (in2[0] << 0);
548 tmp[7] = ((uint64_t) in[0]) * (in2[7] << 0) +
549 ((uint64_t) in[1]) * (in2[6] << 0) +
550 ((uint64_t) in[2]) * (in2[5] << 0) +
551 ((uint64_t) in[3]) * (in2[4] << 0) +
552 ((uint64_t) in[4]) * (in2[3] << 0) +
553 ((uint64_t) in[5]) * (in2[2] << 0) +
554 ((uint64_t) in[6]) * (in2[1] << 0) +
555 ((uint64_t) in[7]) * (in2[0] << 0);
556 /* tmp[8] has the greatest value but doesn't overflow. See logic in
557 * felem_square. */
558 tmp[8] = ((uint64_t) in[0]) * (in2[8] << 0) +
559 ((uint64_t) in[1]) * (in2[7] << 1) +
560 ((uint64_t) in[2]) * (in2[6] << 0) +
561 ((uint64_t) in[3]) * (in2[5] << 1) +
562 ((uint64_t) in[4]) * (in2[4] << 0) +
563 ((uint64_t) in[5]) * (in2[3] << 1) +
564 ((uint64_t) in[6]) * (in2[2] << 0) +
565 ((uint64_t) in[7]) * (in2[1] << 1) +
566 ((uint64_t) in[8]) * (in2[0] << 0);
567 tmp[9] = ((uint64_t) in[1]) * (in2[8] << 0) +
568 ((uint64_t) in[2]) * (in2[7] << 0) +
569 ((uint64_t) in[3]) * (in2[6] << 0) +
570 ((uint64_t) in[4]) * (in2[5] << 0) +
571 ((uint64_t) in[5]) * (in2[4] << 0) +
572 ((uint64_t) in[6]) * (in2[3] << 0) +
573 ((uint64_t) in[7]) * (in2[2] << 0) +
574 ((uint64_t) in[8]) * (in2[1] << 0);
575 tmp[10] = ((uint64_t) in[2]) * (in2[8] << 0) +
576 ((uint64_t) in[3]) * (in2[7] << 1) +
577 ((uint64_t) in[4]) * (in2[6] << 0) +
578 ((uint64_t) in[5]) * (in2[5] << 1) +
579 ((uint64_t) in[6]) * (in2[4] << 0) +
580 ((uint64_t) in[7]) * (in2[3] << 1) +
581 ((uint64_t) in[8]) * (in2[2] << 0);
582 tmp[11] = ((uint64_t) in[3]) * (in2[8] << 0) +
583 ((uint64_t) in[4]) * (in2[7] << 0) +
584 ((uint64_t) in[5]) * (in2[6] << 0) +
585 ((uint64_t) in[6]) * (in2[5] << 0) +
586 ((uint64_t) in[7]) * (in2[4] << 0) +
587 ((uint64_t) in[8]) * (in2[3] << 0);
588 tmp[12] = ((uint64_t) in[4]) * (in2[8] << 0) +
589 ((uint64_t) in[5]) * (in2[7] << 1) +
590 ((uint64_t) in[6]) * (in2[6] << 0) +
591 ((uint64_t) in[7]) * (in2[5] << 1) +
592 ((uint64_t) in[8]) * (in2[4] << 0);
593 tmp[13] = ((uint64_t) in[5]) * (in2[8] << 0) +
594 ((uint64_t) in[6]) * (in2[7] << 0) +
595 ((uint64_t) in[7]) * (in2[6] << 0) +
596 ((uint64_t) in[8]) * (in2[5] << 0);
597 tmp[14] = ((uint64_t) in[6]) * (in2[8] << 0) +
598 ((uint64_t) in[7]) * (in2[7] << 1) +
599 ((uint64_t) in[8]) * (in2[6] << 0);
600 tmp[15] = ((uint64_t) in[7]) * (in2[8] << 0) +
601 ((uint64_t) in[8]) * (in2[7] << 0);
602 tmp[16] = ((uint64_t) in[8]) * (in2[8] << 0);
603
604 felem_reduce_degree(out, tmp);
605 }
606
607 static void felem_assign(felem out, const felem in)
608 {
609 memcpy(out, in, sizeof(felem));
wtc 2012/11/15 22:58:41 Does C allow us to assign an array directly? I kno
agl 2012/11/16 19:17:46 I'm not sure. Since neither of us are sure, I'd st
610 }
611
612 /* felem_inv calculates |out| = |in|^{-1}
613 *
614 * Based on Fermat's Little Theorem:
615 * a^p = a (mod p)
616 * a^{p-1} = 1 (mod p)
617 * a^{p-2} = a^{-1} (mod p)
618 */
619 static void felem_inv(felem out, const felem in)
620 {
621 felem ftmp, ftmp2;
622 /* each e_I will hold |in|^{2^I - 1} */
623 felem e2, e4, e8, e16, e32, e64;
624 unsigned i;
625
626 felem_square(ftmp, in); /* 2^1 */
627 felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
628 felem_assign(e2, ftmp);
629 felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
630 felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
631 felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
632 felem_assign(e4, ftmp);
633 felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
634 felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
635 felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
636 felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
637 felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
638 felem_assign(e8, ftmp);
639 for (i = 0; i < 8; i++) {
640 felem_square(ftmp, ftmp);
641 } /* 2^16 - 2^8 */
642 felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
643 felem_assign(e16, ftmp);
644 for (i = 0; i < 16; i++) {
645 felem_square(ftmp, ftmp);
646 } /* 2^32 - 2^16 */
647 felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
648 felem_assign(e32, ftmp);
649 for (i = 0; i < 32; i++) {
650 felem_square(ftmp, ftmp);
651 } /* 2^64 - 2^32 */
652 felem_assign(e64, ftmp);
653 felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
654 for (i = 0; i < 192; i++) {
655 felem_square(ftmp, ftmp);
656 } /* 2^256 - 2^224 + 2^192 */
657
658 felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
659 for (i = 0; i < 16; i++) {
660 felem_square(ftmp2, ftmp2);
661 } /* 2^80 - 2^16 */
662 felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
663 for (i = 0; i < 8; i++) {
664 felem_square(ftmp2, ftmp2);
665 } /* 2^88 - 2^8 */
666 felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
667 for (i = 0; i < 4; i++) {
668 felem_square(ftmp2, ftmp2);
669 } /* 2^92 - 2^4 */
670 felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
671 felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
672 felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
673 felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
674 felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
675 felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
676 felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */
677
678 felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
wtc 2012/11/15 22:58:41 How did you figure out this sequence?! :-)
agl 2012/11/16 19:17:46 Pen and paper, if I recall. In general, the probl
679 }
680
681 /* felem_scalar_3 sets out=3*out.
682 *
683 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
684 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
685 static void felem_scalar_3(felem out)
686 {
687 limb carry = 0;
688 unsigned i;
689
690 for (i = 0;; i++) {
wtc 2012/11/15 22:58:41 In some other for loops you still put the i < NLIM
agl 2012/11/16 19:17:46 Done.
691 out[i] *= 3;
692 out[i] += carry;
693 carry = out[i] >> 29;
694 out[i] &= kBottom29Bits;
695
696 i++;
697 if (i == NLIMBS)
698 break;
699
700 out[i] *= 3;
701 out[i] += carry;
702 carry = out[i] >> 28;
703 out[i] &= kBottom28Bits;
704 }
705
706 felem_reduce_carry(out, carry);
707 }
708
709 /* felem_scalar_4 sets out=4*out.
710 *
711 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
712 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
713 static void felem_scalar_4(felem out)
714 {
715 limb carry = 0, next_carry;
716 unsigned i;
717
718 for (i = 0;; i++) {
719 next_carry = out[i] >> 27;
720 out[i] <<= 2;
721 out[i] &= kBottom29Bits;
722 out[i] += carry;
723 carry = next_carry + (out[i] >> 29);
724 out[i] &= kBottom29Bits;
725
726 i++;
727 if (i == NLIMBS)
728 break;
729 next_carry = out[i] >> 26;
730 out[i] <<= 2;
731 out[i] &= kBottom28Bits;
732 out[i] += carry;
733 carry = next_carry + (out[i] >> 28);
734 out[i] &= kBottom28Bits;
735 }
736
737 felem_reduce_carry(out, carry);
738 }
739
740 /* felem_scalar_8 sets out=8*out.
741 *
742 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
743 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
744 static void felem_scalar_8(felem out)
745 {
746 limb carry = 0, next_carry;
747 unsigned i;
748
749 for (i = 0;; i++) {
750 next_carry = out[i] >> 26;
751 out[i] <<= 3;
752 out[i] &= kBottom29Bits;
753 out[i] += carry;
754 carry = next_carry + (out[i] >> 29);
755 out[i] &= kBottom29Bits;
756
757 i++;
758 if (i == NLIMBS)
759 break;
760 next_carry = out[i] >> 25;
761 out[i] <<= 3;
762 out[i] &= kBottom28Bits;
763 out[i] += carry;
764 carry = next_carry + (out[i] >> 28);
765 out[i] &= kBottom28Bits;
766 }
767
768 felem_reduce_carry(out, carry);
769 }
770
771 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
772 * time depending on the value of |in|. */
773 static char felem_is_zero_vartime(const felem in)
774 {
775 limb carry;
776 int i;
777 limb tmp[NLIMBS];
778 felem_assign(tmp, in);
779
780 /* First, reduce tmp to a minimal form. */
781 do {
782 carry = 0;
783 for (i = 0;; i++) {
784 tmp[i] += carry;
785 carry = tmp[i] >> 29;
786 tmp[i] &= kBottom29Bits;
787
788 i++;
789 if (i == NLIMBS)
790 break;
791
792 tmp[i] += carry;
793 carry = tmp[i] >> 28;
794 tmp[i] &= kBottom28Bits;
795 }
796
797 felem_reduce_carry(tmp, carry);
798 } while (carry);
799
800 /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
801 return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
802 memcmp(tmp, kP, sizeof(tmp)) == 0 ||
803 memcmp(tmp, k2P, sizeof(tmp)) == 0;
804 }
805
806 /* Group operations:
807 *
808 * Elements of the elliptic curve group are represented in Jacobian
809 * coordinates. An affine point (x', y') is x=x'/z, y=y'/z in Jacobian form. */
wtc 2012/11/15 22:58:41 What is z? Should the Jacobian form include the z
agl 2012/11/16 19:17:46 Yes, thanks. I also messed up the definition of Ja
810
811 /* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
812 *
813 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling -dbl-2009-l */
814 static void point_double(felem x_out, felem y_out, felem z_out,
815 const felem x, const felem y, const felem z)
816 {
817 felem delta, gamma, alpha, beta, tmp, tmp2;
818
819 felem_square(delta, z);
820 felem_square(gamma, y);
821 felem_mul(beta, x, gamma);
822
823 felem_sum(tmp, x, delta);
824 felem_diff(tmp2, x, delta);
825 felem_mul(alpha, tmp, tmp2);
826 felem_scalar_3(alpha);
827
828 felem_sum(tmp, y, z);
829 felem_square(tmp, tmp);
830 felem_diff(tmp, tmp, gamma);
831 felem_diff(z_out, tmp, delta);
832
833 felem_scalar_4(beta);
834 felem_square(x_out, alpha);
835 felem_diff(x_out, x_out, beta);
836 felem_diff(x_out, x_out, beta);
837
838 felem_diff(tmp, beta, x_out);
839 felem_mul(tmp, alpha, tmp);
840 felem_square(tmp2, gamma);
841 felem_scalar_8(tmp2);
842 felem_diff(y_out, tmp, tmp2);
843 }
844
845 /* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
846 * (i.e. the second point is affine.)
847 *
848 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl
849 *
850 * Note that this function does not handle P+P, infinity+P nor P+infinity
851 * correctly. */
852 static void point_add_mixed(felem x_out, felem y_out, felem z_out,
853 const felem x1, const felem y1, const felem z1,
854 const felem x2, const felem y2)
855 {
856 felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;
857
858 felem_square(z1z1, z1);
859 felem_sum(tmp, z1, z1);
860
861 felem_mul(u2, x2, z1z1);
862 felem_mul(z1z1z1, z1, z1z1);
863 felem_mul(s2, y2, z1z1z1);
864 felem_diff(h, u2, x1);
865 felem_sum(i, h, h);
866 felem_square(i, i);
867 felem_mul(j, h, i);
868 felem_diff(r, s2, y1);
869 felem_sum(r, r, r);
870 felem_mul(v, x1, i);
871
872 felem_mul(z_out, tmp, h);
873 felem_square(rr, r);
874 felem_diff(x_out, rr, j);
875 felem_diff(x_out, x_out, v);
876 felem_diff(x_out, x_out, v);
877
878 felem_diff(tmp, v, x_out);
879 felem_mul(y_out, tmp, r);
880 felem_mul(tmp, y1, j);
881 felem_diff(y_out, y_out, tmp);
882 felem_diff(y_out, y_out, tmp);
883 }
884
885 /* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
886 *
887 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl
888 *
889 * Note that this function does not handle P+P, infinity+P nor P+infinity
890 * correctly. */
891 static void point_add(felem x_out, felem y_out, felem z_out,
892 const felem x1, const felem y1, const felem z1,
893 const felem x2, const felem y2, const felem z2)
894 {
895 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
896
897 felem_square(z1z1, z1);
898 felem_square(z2z2, z2);
899 felem_mul(u1, x1, z2z2);
900
901 felem_sum(tmp, z1, z2);
902 felem_square(tmp, tmp);
903 felem_diff(tmp, tmp, z1z1);
904 felem_diff(tmp, tmp, z2z2);
905
906 felem_mul(z2z2z2, z2, z2z2);
907 felem_mul(s1, y1, z2z2z2);
908
909 felem_mul(u2, x2, z1z1);
910 felem_mul(z1z1z1, z1, z1z1);
911 felem_mul(s2, y2, z1z1z1);
912 felem_diff(h, u2, u1);
913 felem_sum(i, h, h);
914 felem_square(i, i);
915 felem_mul(j, h, i);
916 felem_diff(r, s2, s1);
917 felem_sum(r, r, r);
918 felem_mul(v, u1, i);
919
920 felem_mul(z_out, tmp, h);
921 felem_square(rr, r);
922 felem_diff(x_out, rr, j);
923 felem_diff(x_out, x_out, v);
924 felem_diff(x_out, x_out, v);
925
926 felem_diff(tmp, v, x_out);
927 felem_mul(y_out, tmp, r);
928 felem_mul(tmp, s1, j);
929 felem_diff(y_out, y_out, tmp);
930 felem_diff(y_out, y_out, tmp);
931 }
932
933 /* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
934 * {x2,y2,z2}.
935 *
936 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl
937 *
938 * This function handles the case where {x1,y1,z1}={x2,y2,z2}. */
939 static void point_add_or_double_vartime(
940 felem x_out, felem y_out, felem z_out,
941 const felem x1, const felem y1, const felem z1,
942 const felem x2, const felem y2, const felem z2)
943 {
944 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
945 char x_equal, y_equal;
946
947 felem_square(z1z1, z1);
948 felem_square(z2z2, z2);
949 felem_mul(u1, x1, z2z2);
950
951 felem_sum(tmp, z1, z2);
952 felem_square(tmp, tmp);
953 felem_diff(tmp, tmp, z1z1);
954 felem_diff(tmp, tmp, z2z2);
955
956 felem_mul(z2z2z2, z2, z2z2);
957 felem_mul(s1, y1, z2z2z2);
958
959 felem_mul(u2, x2, z1z1);
960 felem_mul(z1z1z1, z1, z1z1);
961 felem_mul(s2, y2, z1z1z1);
962 felem_diff(h, u2, u1);
963 x_equal = felem_is_zero_vartime(h);
964 felem_sum(i, h, h);
965 felem_square(i, i);
966 felem_mul(j, h, i);
967 felem_diff(r, s2, s1);
968 y_equal = felem_is_zero_vartime(r);
969 if (x_equal && y_equal) {
970 point_double(x_out, y_out, z_out, x1, y1, z1);
971 return;
972 }
973 felem_sum(r, r, r);
974 felem_mul(v, u1, i);
975
976 felem_mul(z_out, tmp, h);
977 felem_square(rr, r);
978 felem_diff(x_out, rr, j);
979 felem_diff(x_out, x_out, v);
980 felem_diff(x_out, x_out, v);
981
982 felem_diff(tmp, v, x_out);
983 felem_mul(y_out, tmp, r);
984 felem_mul(tmp, s1, j);
985 felem_diff(y_out, y_out, tmp);
986 felem_diff(y_out, y_out, tmp);
987 }
988
989 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.
990 *
991 * On entry: mask is either 0 or 0xffffffff. */
992 static void copy_conditional(felem out, const felem in, limb mask)
993 {
994 int i;
995
996 for (i = 0; i < NLIMBS; i++) {
997 const limb tmp = mask & (in[i] ^ out[i]);
998 out[i] ^= tmp;
999 }
1000 }
1001
1002 /* select_affine_point sets {out_x,out_y} to the index'th entry of table.
1003 * On entry: index < 16, table[0] must be zero. */
1004 static void select_affine_point(felem out_x, felem out_y,
1005 const limb *table, limb index)
1006 {
1007 limb i, j;
1008
1009 memset(out_x, 0, sizeof(felem));
1010 memset(out_y, 0, sizeof(felem));
1011
1012 for (i = 1; i < 16; i++) {
1013 limb mask = i ^ index;
1014 mask |= mask >> 2;
1015 mask |= mask >> 1;
1016 mask &= 1;
1017 mask--;
1018 for (j = 0; j < NLIMBS; j++) {
1019 out_x[j] |= (*table++) & mask;
1020 }
wtc 2012/11/15 22:58:41 Do you think this is a little clearer? for (j
agl 2012/11/16 19:17:46 Done. (And 4x below.)
1021 for (j = 0; j < NLIMBS; j++) {
1022 out_y[j] |= (*table++) & mask;
1023 }
1024 }
1025 }
1026
1027 /* select_affine_point sets {out_x,out_y,out_z} to the index'th entry of table.
1028 * On entry: index < 16, table[0] must be zero. */
1029 static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
wtc 2012/11/15 22:58:41 Typo: select_affine_point => select_jacobian_point
agl 2012/11/16 19:17:46 Done.
1030 const limb *table, limb index)
1031 {
1032 limb i, j;
1033
1034 memset(out_x, 0, sizeof(felem));
1035 memset(out_y, 0, sizeof(felem));
1036 memset(out_z, 0, sizeof(felem));
1037
1038 table += 3*NLIMBS;
wtc 2012/11/15 22:58:41 Why does this function add an offset to table, but
agl 2012/11/16 19:17:46 Have added comment. /* The implicit value at inde
1039
1040 for (i = 1; i < 16; i++) {
1041 limb mask = i ^ index;
1042 mask |= mask >> 2;
1043 mask |= mask >> 1;
1044 mask &= 1;
1045 mask--;
1046 for (j = 0; j < NLIMBS; j++) {
1047 out_x[j] |= (*table++) & mask;
1048 }
1049 for (j = 0; j < NLIMBS; j++) {
1050 out_y[j] |= (*table++) & mask;
1051 }
1052 for (j = 0; j < NLIMBS; j++) {
1053 out_z[j] |= (*table++) & mask;
1054 }
1055 }
1056 }
1057
1058 /* get_bit returns the bit'th bit of scalar. */
1059 static char get_bit(const u8 scalar[32], int bit)
1060 {
1061 return ((scalar[bit >> 3]) >> (bit & 7)) & 1;
1062 }
1063
1064 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
1065 * number. Note that the value of scalar must be less than the order of the
1066 * group. */
1067 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32])
1068 {
1069 int i, j;
1070 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
1071 u32 table_offset;
1072
1073 felem px, py;
1074 felem tx, ty, tz;
1075
1076 memset(nx, 0, sizeof(felem));
1077 memset(ny, 0, sizeof(felem));
1078 memset(nz, 0, sizeof(felem));
1079
1080 /* The loop adds bits at positions 0, 64, 128 and 192, followed by
1081 * positions 32,96,160 and 224 and does this 32 times. */
1082 for (i = 0; i < 32; i++) {
1083 if (i) {
1084 point_double(nx, ny, nz, nx, ny, nz);
1085 }
1086 for (j = 0; j <= 32; j += 32) {
1087 char bit0 = get_bit(scalar, 31 - i + j);
1088 char bit1 = get_bit(scalar, 95 - i + j);
1089 char bit2 = get_bit(scalar, 159 - i + j);
1090 char bit3 = get_bit(scalar, 223 - i + j);
1091 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);
1092
1093 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS);
1094 select_affine_point(px, py, kPrecomputed + table_offset, index);
1095
1096 /* Since scalar is less than the order of the group, we know that
1097 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
1098 * below. */
1099 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
1100 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
1101 * (a.k.a. the point at infinity). We handle that situation by
1102 * copying the point from the table. */
1103 copy_conditional(nx, px, n_is_infinity_mask);
1104 copy_conditional(ny, py, n_is_infinity_mask);
1105 copy_conditional(nz, kOne, n_is_infinity_mask);
1106
1107 /* Equally, the result is also wrong if the point from the table is
1108 * zero, which happens when the index is zero. We handle that by
1109 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */
1110 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
1111 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1112 copy_conditional(nx, tx, mask);
1113 copy_conditional(ny, ty, mask);
1114 copy_conditional(nz, tz, mask);
1115 /* If p was not zero, then n is now non-zero. */
1116 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1117 }
1118 }
1119 }
1120
1121 /* point_to_affine converts a Jacobian point to an affine point. If the input
1122 * is the point at infinity then it returns (0, 0) in constant time. */
1123 static void point_to_affine(felem x_out, felem y_out,
1124 const felem nx, const felem ny, const felem nz) {
1125 felem z_inv, z_inv_sq;
1126 felem_inv(z_inv, nz);
1127 felem_square(z_inv_sq, z_inv);
wtc 2012/11/15 22:58:41 In your formula on line 809, I see x=x'/z, y=y'/z.
agl 2012/11/16 19:17:46 I had written down the formula for projective coor
1128 felem_mul(x_out, nx, z_inv_sq);
1129 felem_mul(z_inv, z_inv, z_inv_sq);
wtc 2012/11/15 22:58:41 Isn't this z inverse cubed?
1130 felem_mul(y_out, ny, z_inv);
1131 }
1132
1133 /* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */
1134 static void scalar_mult(felem nx, felem ny, felem nz,
1135 const felem x, const felem y, const u8 scalar[32])
1136 {
1137 int i;
1138 felem px, py, pz, tx, ty, tz;
1139 felem precomp[16][3];
1140 limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;
1141
1142 /* We precompute 0,1,2,... times {x,y}. */
1143 memset(precomp, 0, sizeof(felem) * 3);
1144 memcpy(&precomp[1][0], x, sizeof(felem));
1145 memcpy(&precomp[1][1], y, sizeof(felem));
1146 memcpy(&precomp[1][2], kOne, sizeof(felem));
1147
1148 for (i = 2; i < 16; i += 2) {
1149 point_double(precomp[i][0], precomp[i][1], precomp[i][2],
1150 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);
1151
1152 point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
1153 precomp[i][0], precomp[i][1], precomp[i][2], x, y);
1154 }
1155
1156 memset(nx, 0, sizeof(felem));
1157 memset(ny, 0, sizeof(felem));
1158 memset(nz, 0, sizeof(felem));
1159 n_is_infinity_mask = -1;
1160
1161 /* We add in a window of four bits each iteration and do this 64 times. */
1162 for (i = 0; i < 64; i++) {
1163 if (i) {
1164 point_double(nx, ny, nz, nx, ny, nz);
1165 point_double(nx, ny, nz, nx, ny, nz);
1166 point_double(nx, ny, nz, nx, ny, nz);
1167 point_double(nx, ny, nz, nx, ny, nz);
1168 }
1169
1170 index = scalar[31 - i / 2];
1171 if ((i & 1) == 1) {
1172 index &= 15;
1173 } else {
1174 index >>= 4;
1175 }
1176
1177 /* See the comments in scalar_base_mult about handling infinities. */
1178 select_jacobian_point(px, py, pz, (limb *) precomp, index);
1179 point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
1180 copy_conditional(nx, px, n_is_infinity_mask);
1181 copy_conditional(ny, py, n_is_infinity_mask);
1182 copy_conditional(nz, pz, n_is_infinity_mask);
1183
1184 p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31;
1185 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1186 copy_conditional(nx, tx, mask);
1187 copy_conditional(ny, ty, mask);
1188 copy_conditional(nz, tz, mask);
1189 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1190 }
1191 }
1192
1193
wtc 2012/11/15 22:58:41 Delete a blank line?
agl 2012/11/16 19:17:46 Done.
1194 /* Interface with Freebl: */
1195
1196 #if IS_BIG_ENDIAN
1197 #error "This code needs a little-endian processor"
1198 #endif
1199
1200 static const u32 kRInvDigits[8] = {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xf ffffffe, 1, 0x7fffffff};
wtc 2012/11/15 22:58:41 Fold this long line.
agl 2012/11/16 19:17:46 Done.
1201 #define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit))
1202 static const mp_int kRInv = {
1203 MP_ZPOS,
1204 MP_DIGITS_IN_256_BITS,
1205 MP_DIGITS_IN_256_BITS,
1206 /* Because we are running on a little-endian processor, this cast works for
1207 * both 32 and 64-bit processors. */
1208 (mp_digit*) kRInvDigits
1209 };
1210
1211 static const limb kTwo28 = 0x10000000;
1212 static const limb kTwo29 = 0x20000000;
1213
1214 /* to_montgomery sets out = R*in. */
1215 static mp_err to_montgomery(felem out, const mp_int *in, const ECGroup *group) {
wtc 2012/11/15 22:58:41 Put the opening '{' for this function on its own l
agl 2012/11/16 19:17:46 Done.
1216 /* There are no MPI functions for bitshift operations and we wish to shift
1217 * in 257 bits left so we move the digits 256-bits left and then multiply
1218 * by two. */
1219 mp_int in_shifted;
1220 int i;
1221 mp_err res;
1222
1223 mp_init(&in_shifted);
1224 s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS);
1225 memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS),
1226 MP_DIGITS(in),
1227 MP_USED(in)*sizeof(mp_digit));
1228 mp_mul_2(&in_shifted, &in_shifted);
1229 MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth));
1230
1231 for (i = 0; i < NLIMBS; i++) {
1232 if ((i & 1) == 0) {
1233 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits;
1234 mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL);
1235 } else {
1236 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits;
1237 mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL);
1238 }
1239 }
wtc 2012/11/16 05:28:53 Should we write this for loop using the prevalent
agl 2012/11/16 19:17:46 Done.
1240
1241 mp_clear(&in_shifted);
wtc 2012/11/15 22:58:41 I believe we need to do this mp_clear call under t
agl 2012/11/16 19:17:46 Done.
1242
1243 CLEANUP:
1244 return res;
1245 }
1246
1247 /* from_montgomery sets out=in/R. */
1248 static mp_err from_montgomery(mp_int *out, const felem in,
1249 const ECGroup *group) {
wtc 2012/11/15 22:58:41 Adjust the indentation.
agl 2012/11/16 19:17:46 Done.
1250 mp_int result, tmp;
1251 mp_err res;
1252 int i;
1253
1254 mp_init(&result);
1255 mp_init(&tmp);
1256
1257 MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result));
1258 for (i = NLIMBS-2; i >= 0; i--) {
1259 if ((i & 1) == 0) {
1260 MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp));
1261 } else {
1262 MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp));
1263 }
1264 MP_CHECKOK(mp_add_d(&tmp, in[i], &result));
1265 }
1266
1267 MP_CHECKOK(mp_mul(&result, &kRInv, out));
1268 MP_CHECKOK(group->meth->field_mod(out, out, group->meth));
1269
1270 mp_clear(&result);
1271 mp_clear(&tmp);
1272
1273 CLEANUP:
1274 return res;
1275 }
1276
1277 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */
1278 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n) {
1279 /* We require that |n| is less than the order of the group and therefore it
1280 * will fit into |scalar|. However, these is a timing side-channel here that
1281 * we cannot avoid: if |n| is sufficiently small it may be one or more words
1282 * too short and we'll copy less data. */
1283 memset(out_scalar, 0, 32);
1284 memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit));
1285 }
1286
1287 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1288 * order of the group. */
1289 mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n,
1290 mp_int *out_x, mp_int *out_y,
wtc 2012/11/15 22:58:41 Does freebl follow the convention that output argu
agl 2012/11/16 19:17:46 Yes, this function's prototype matches base_point_
1291 const ECGroup *group) {
1292 u8 scalar[32];
1293 felem x, y, z, x_affine, y_affine;
1294 mp_err res;
1295
1296 // FIXME(agl): test that n < order.
wtc 2012/11/15 22:58:41 Use C-style /* */ comment delimiters.
agl 2012/11/16 19:17:46 Done.
1297
1298 scalar_from_mp_int(scalar, n);
1299 scalar_base_mult(x, y, z, scalar);
1300 point_to_affine(x_affine, y_affine, x, y, z);
1301 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1302 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1303
1304 CLEANUP:
1305 return res;
1306 }
1307
1308 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
1309 * the order of the group. */
1310 mp_err ec_GFp_nistp256_point_mul(const mp_int *n,
1311 const mp_int *in_x, const mp_int *in_y,
1312 mp_int *out_x, mp_int *out_y,
1313 const ECGroup *group) {
1314 u8 scalar[32];
1315 felem x, y, z, x_affine, y_affine, px, py;
1316 mp_err res;
1317
1318 scalar_from_mp_int(scalar, n);
1319
1320 MP_CHECKOK(to_montgomery(px, in_x, group));
1321 MP_CHECKOK(to_montgomery(py, in_y, group));
1322
1323 scalar_mult(x, y, z, px, py, scalar);
1324 point_to_affine(x_affine, y_affine, x, y, z);
1325 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1326 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1327
1328 CLEANUP:
1329 return res;
1330 }
1331
1332 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where
wtc 2012/11/15 22:58:41 Typo: ec_GFp_nistp256_point_mul => ec_GFp_nistp256
agl 2012/11/16 19:17:46 Done.
1333 * n1 and n2 are < the order of the group.
1334 *
1335 * As indicated by the name, this function operates in variable time. This
1336 * is safe because it's used for signature validation which doesn't deal
1337 * with secrets. */
1338 mp_err ec_GFp_nistp256_points_mul_vartime
1339 (const mp_int *n1, const mp_int *n2,
wtc 2012/11/15 22:58:41 Put the opening '(' on the previous line.
agl 2012/11/16 19:17:46 Done.
1340 const mp_int *in_x, const mp_int *in_y,
1341 mp_int *out_x, mp_int *out_y,
1342 const ECGroup *group) {
1343 u8 scalar1[32], scalar2[32];
1344 felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py;
1345 mp_err res = MP_OKAY;
1346
1347 /* If both scalars are zero, then the result is the point at infinity. */
1348 if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) {
1349 mp_zero(out_x);
1350 mp_zero(out_y);
1351 return res;
1352 }
1353
1354 scalar_from_mp_int(scalar1, n1);
1355 scalar_from_mp_int(scalar2, n2);
1356
1357 MP_CHECKOK(to_montgomery(px, in_x, group));
1358 MP_CHECKOK(to_montgomery(py, in_y, group));
1359 scalar_base_mult(x1, y1, z1, scalar1);
1360 scalar_mult(x2, y2, z2, px, py, scalar2);
1361
1362 if (mp_cmp_z(n2) == 0) {
1363 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
1364 * {x1,y1,z1}. */
1365 } else if (mp_cmp_z(n1) == 0) {
1366 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
1367 * {x2,y2,z2}. */
1368 memcpy(x1, x2, sizeof(x2));
1369 memcpy(y1, y2, sizeof(y2));
1370 memcpy(z1, z2, sizeof(z2));
1371 } else {
1372 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
1373 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
1374 }
1375
1376 point_to_affine(x_affine, y_affine, x1, y1, z1);
1377 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1378 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1379
1380 CLEANUP:
1381 return res;
1382 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698