Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(219)

Side by Side Diff: ecp_256_32.c

Issue 10944017: 32-bit, P-256 implementation. Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #include <endian.h>
wtc 2012/10/09 21:04:47 <endian.h> is not portable. For example, it doesn'
agl 2012/10/11 14:55:04 In which case there doesn't appear to be any porta
wtc 2012/10/11 16:29:00 Yes, NSPR provides the endianness of the system us
agl 2012/10/11 18:51:01 Thanks, done.
2
3 #include <stdint.h>
4 #include <stdio.h>
5
6 #include <string.h>
7 #include <stdlib.h>
8 #include <unistd.h>
wtc 2012/10/09 21:04:47 Do you remember why you need <unistd.h>? This head
agl 2012/10/11 14:55:04 It's probably leftover from debugging calls. Will
9
10 #include "mpi.h"
11 #include "mpi-priv.h"
12 #include "ecp.h"
13
/* Short fixed-width integer aliases used throughout this file. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef int32_t s32;
typedef uint64_t u64;
18
19 /* Our field elements are represented as nine 32-bit words. Freebl's MPI
wtc 2012/10/09 21:04:47 Nit: I think the comma after "nine" is not necessa
agl 2012/10/11 14:55:04 Done.
20 * library calls them digits, but here they are called limbs, which is GMP's
21 * terminology.
22 *
23 * The value of an felem (field element) is:
24 * x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
25 *
26 * That is, each limb is alternately 29 or 28 bits wide in little-endian
27 * order.
28 *
29 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
30 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
31 * when multiplying as terms end up one bit short of a limb which would require
wtc 2012/10/09 21:04:47 Typo: multipling => multiplying
agl 2012/10/11 14:55:04 Done.
32 * much bit-shifting to correct.
33 *
34 * Finally, the values stored in an felem are in Montgomery form. So the value
35 * |y| is stored as (y * 2**257) mod p, where p is the P-256 prime.
36 */
/* limb is one digit of a field element; even-indexed limbs carry 29 bits
 * and odd-indexed limbs 28 bits (see the representation comment above). */
typedef uint32_t limb;
/* NLIMBS is the number of limbs in an felem: nine limbs cover 257 bits. */
#define NLIMBS 9
typedef limb felem[NLIMBS];
40
/* kBottom28Bits and kBottom29Bits mask a limb down to its low 28 or 29
 * bits respectively. */
static const limb kBottom28Bits = 0xfffffff;
static const limb kBottom29Bits = 0x1fffffff;

/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
 * 28-bit words. */
static const felem kOne = {
    2, 0, 0, 0xffff800,
    0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
    0
};
/* kZero is the number 0 as an felem. */
static const limb kZero[NLIMBS] = {0};
/* kP is the P-256 prime, p = 2**256 - 2**224 + 2**192 + 2**96 - 1, in the
 * 29/28-bit limb pattern described above. */
static const limb kP[NLIMBS] = {
    0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
    0, 0, 0x200000, 0xf000000,
    0xfffffff
};
/* k2P is 2*p in the same representation; kZero, kP and k2P are the three
 * possible minimal-form encodings of zero mod p (see
 * felem_is_zero_vartime). */
static const limb k2P[NLIMBS] = {
    0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
    0, 0, 0x400000, 0xe000000,
    0x1fffffff
};
62 /* kPrecomputed contains precomputed values to aid the calculation of scalar
63 * multiples of the base point, G. It's actually two, equal length, tables
64 * concatenated.
65 *
66 * The first table contains (x,y) felem pairs for 16 multiples of the base
67 * point, G.
68 *
69 * Index | Index (binary) | Value
70 * 0 | 0000 | 0G (all zeros, omitted)
71 * 1 | 0001 | G
72 * 2 | 0010 | 2**64G
73 * 3 | 0011 | 2**64G + G
74 * 4 | 0100 | 2**128G
75 * 5 | 0101 | 2**128G + G
76 * 6 | 0110 | 2**128G + 2**64G
77 * 7 | 0111 | 2**128G + 2**64G + G
78 * 8 | 1000 | 2**192G
79 * 9 | 1001 | 2**192G + G
80 * 10 | 1010 | 2**192G + 2**64G
81 * 11 | 1011 | 2**192G + 2**64G + G
82 * 12 | 1100 | 2**192G + 2**128G
83 * 13 | 1101 | 2**192G + 2**128G + G
84 * 14 | 1110 | 2**192G + 2**128G + 2**64G
85 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
86 *
87 * The second table follows the same style, but the multiples are 2**32G,
88 * 2**96G, 2**160G, 2**224G.
89 *
90 * This is ~2KB of data. */
91 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
92 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7e dc, 0xd4a6eab, 0x3120bee,
93 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba 21, 0x14b10bb, 0xae3fe3,
94 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe4907 3, 0x3fa36cc, 0x5ebcd2c,
95 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea1244 6, 0xe1ade1e, 0xec91f22,
96 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109 , 0xa267a00, 0xb57c050,
97 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0 x7d6dee7, 0x2976e4b,
98 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a 5a9, 0x843a649, 0xc3ab0fa,
99 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11 , 0x58c43df, 0xf423fc2,
100 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db4 0f, 0x83e277d, 0xb0dd609,
101 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5 , 0xe10c9e, 0x33ab581,
102 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f , 0x48764cd, 0x76dbcca,
103 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b2 0, 0x4ba3173, 0xc168c33,
104 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0 , 0x65dd7ff, 0x3a1e4f6,
105 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f07 7, 0xa6add89, 0x4894acd,
106 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
107 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c , 0xda0cf5b, 0x812e881,
108 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51 , 0xc22be3e, 0xe35e65a,
109 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9 , 0x1c5a839, 0x47a1e26,
110 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c5 02, 0x2f32042, 0xa17769b,
111 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a 02, 0x3fc93, 0x5620023,
112 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c , 0x407f75c, 0xbaab133,
113 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea 7, 0x3293ac0, 0xcdc98aa,
114 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
115 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72 , 0x73e1c35, 0xee70fbc,
116 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85 , 0x27de188, 0x66f70b8,
117 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae 914, 0x2f3ec51, 0x3826b59,
118 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x 823d9d2, 0x8213f39,
119 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4 a, 0xf5ddc3d, 0x3786689,
120 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a72 9, 0x4be3499, 0x52b23aa,
121 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb0480 35, 0xe31de66, 0xc6ecaa3,
122 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a752 9, 0xcb7beb1, 0xb2a78a1,
123 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff6 58, 0xe3d6511, 0xc7d76f,
124 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
125 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d3241 1, 0xb04a838, 0xd760d2d,
126 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11 e, 0x20bca9a, 0x66f496b,
127 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
128 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56 ff, 0x65ef930, 0x21dc4a,
129 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15 f, 0x624e62e, 0xa90ae2f,
130 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522 b, 0xdc78583, 0x40eeabb,
131 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef3 4, 0xae2a960, 0x91b8bdc,
132 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0 x2413c8e, 0x5425bf9,
133 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633 , 0x7c91952, 0xd806dce,
134 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef7 3, 0x8956f34, 0xe4b5cf2,
135 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7 , 0x627b614, 0x7371cca,
136 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc 9, 0x9c19bf2, 0x5882229,
137 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b 3, 0xe85ff25, 0x408ef57,
138 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113 , 0xa4a1769, 0x11fbc6c,
139 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b 7, 0x4acbad9, 0x5efc5fa,
140 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc , 0x7bf0fa9, 0x957651,
141 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
142 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c1 2d, 0xf20bd46, 0x1951fa7,
143 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74 , 0x99bb618, 0x2db944c,
144 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e7477 9, 0x576138, 0x9587927,
145 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782 d, 0xfc72e0b, 0x701b298,
146 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5 d8, 0xf858d3a, 0x942eea8,
147 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a 1, 0x8395659, 0x52ed4e2,
148 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c 0, 0x6bdf55a, 0x4e4457d,
149 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747 b, 0x878558d, 0x7d29aa4,
150 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8,
151 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78,
152 };
153
154
155 /* Field element operations: */
156
/* NON_ZERO_TO_ALL_ONES returns:
 *   0xffffffff for 0 < x <= 2**31
 *   0 for x == 0 or x > 2**31.
 *
 * x must be a u32 or an equivalent quantity such as a limb.
 *
 * This form uses only unsigned arithmetic: ((x) - 1) >> 31 is 1 exactly
 * when the subtraction wrapped (x == 0) or when x > 2**31; subtracting one
 * from that bit yields 0 or all ones respectively. The previous version,
 * (~((u32) (((s32) (x-1)) >> 31))), right-shifted a negative signed value,
 * whose result is implementation-defined in C. */
#define NON_ZERO_TO_ALL_ONES(x) ((((uint32_t)(x) - 1) >> 31) - 1)
161
/* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
 * which is a term at 2**257.
 *
 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
static void felem_reduce_carry(felem inout, limb carry)
{
    /* carry_mask is all ones iff carry != 0 (carry < 2**3, so the macro's
     * 0 < x <= 2**31 condition holds); it selects the constant additions
     * below in constant time, with no data-dependent branch. */
    const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);

    inout[0] += carry << 1;
    inout[3] += 0x10000000 & carry_mask;
    inout[3] -= carry << 11;
    inout[4] += (0x20000000 - 1) & carry_mask;
    inout[5] += (0x10000000 - 1) & carry_mask;
    inout[6] += (0x20000000 - 1) & carry_mask;
    inout[6] -= carry << 22;
    inout[7] -= 1 & carry_mask;
    inout[7] += carry << 25;
}
181
182 /* felem_sum sets out = in+in2.
183 *
184 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
185 * On exit: in[0,2,...] < 2**30, in2[1,3,...] < 2**29 */
186 static void felem_sum(felem out, const felem in, const felem in2)
187 {
188 limb carry = 0;
189 unsigned i;
190 for (i = 0; i < 9; i++) {
wtc 2012/10/09 21:04:47 Should "9" be changed to NLIMBS here and on line 1
agl 2012/10/11 14:55:04 I've changed it to NLIMBS here and in other places
191 out[i] = in[i] + in2[i];
192 out[i] += carry;
193 carry = out[i] >> 29;
194 out[i] &= kBottom29Bits;
195
196 i++;
197 if (i == 9)
198 break;
199
200 out[i] = in[i] + in2[i];
201 out[i] += carry;
202 carry = out[i] >> 28;
203 out[i] &= kBottom28Bits;
204 }
205
206 felem_reduce_carry(out, carry);
207 }
208
209 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
210 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
211 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
212 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
213 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
214 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
215
216 /* zero31 is 0 mod p. */
wtc 2012/10/09 21:04:47 zero31 is 0 mod p. Just curious: is it p, 2p, 3p,
agl 2012/10/11 14:55:04 It ends up as 8p.
wtc 2012/10/11 16:29:00 Just curious: why is 8p chosen? Is it more conveni
agl 2012/10/11 18:51:01 The important feature of the value is that the hig
217 static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, tw o30m2, two31p24m2, two30m27m2, two31m2 };
218
219 /* felem_diff sets out = in-in2.
220 *
221 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
222 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
223 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
224 static void felem_diff(felem out, const felem in, const felem in2)
225 {
226 limb carry = 0;
227 unsigned i;
228
229 for (i = 0; i < 9; i++) {
230 out[i] = in[i] - in2[i];
231 out[i] += zero31[i];
232 out[i] += carry;
233 carry = out[i] >> 29;
234 out[i] &= kBottom29Bits;
235
236 i++;
237 if (i == 9)
238 break;
239
240 out[i] = in[i] - in2[i];
241 out[i] += zero31[i];
242 out[i] += carry;
243 carry = out[i] >> 28;
244 out[i] &= kBottom28Bits;
245 }
246
247 felem_reduce_carry(out, carry);
248 }
249
250 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
251 * with the same 29,28,... bit pattern as an felem.
252 *
253 * On entry: tmp[i] < 2**64
254 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
255 static void felem_reduce_degree(felem out, u64 tmp[17])
wtc 2012/10/09 21:04:47 Why does the 'tmp' array have a size of 17?
agl 2012/10/11 14:55:04 The result of multiplying two arrays with indexes
256 {
257 limb tmp2[18], carry, x, xMask;
258 unsigned i;
259
260 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
261 * felem. So the top of an element of tmp might overlap with another
262 * element two positions down. The following loop eliminates this
263 * overlap. */
264 tmp2[0] = tmp[0] & kBottom29Bits;
265
266 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
267 * and hint to the compiler that it can do a single-word shift by selecting
268 * the right register rather than doing a double-word shift and truncating
269 * afterwards. */
270 tmp2[1] = ((limb) tmp[0]) >> 29;
271 tmp2[1] |= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits;
272 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
273 carry = tmp2[1] >> 28;
274 tmp2[1] &= kBottom28Bits;
275
276 for (i = 2; i < 17; i++) {
277 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
278 tmp2[i] += ((limb) (tmp[i - 1])) >> 28;
279 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
280 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
281 tmp2[i] += carry;
282 carry = tmp2[i] >> 29;
283 tmp2[i] &= kBottom29Bits;
284
285 i++;
286 if (i == 17)
287 break;
288 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
289 tmp2[i] += ((limb) (tmp[i - 1])) >> 29;
290 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
291 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
292 tmp2[i] += carry;
293 carry = tmp2[i] >> 28;
294 tmp2[i] &= kBottom28Bits;
295 }
296
297 tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25;
298 tmp2[17] += ((limb) (tmp[16])) >> 29;
299 tmp2[17] += (((limb) (tmp[16] >> 32)) << 3);
300 tmp2[17] += carry;
301
302 /* Montgomery elimination of terms.
303 *
304 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
305 * Since we just multiplied two Montgomery values together, the result is
306 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
307 * in Montgomery form.
308 *
309 * Since R is 2**257, we can do that with a bitwise shift if we can ensure
310 * that the right-most 257 bits are all zero. We can make that true by
311 * adding multiplies of p without affecting the value.
312 *
313 * So we eliminate limbs from right to left. Since the bottom 29 bits of p
314 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
315 * We can do that for 8 further limbs and then right shift to eliminate the
316 * extra factor of R. */
317 for (i = 0; i < 9; i++) {
318 tmp2[i + 1] += tmp2[i] >> 29;
319 x = tmp2[i] & kBottom29Bits;
320 tmp2[i] = 0;
321
322 /* The bounds calculations for this loop are tricky. Each iteration of
323 * the loop eliminates two words by adding values to words to their
324 * right.
325 *
326 * The following table contains the amounts added to each word (as an
327 * offset from the value of i at the top of the loop). The amounts are
328 * accounted for from the first and second half of the loop separately
329 * and are written as, for example, 28 to mean a value <2**28.
330 *
331 * Word: 3 4 5 6 7 8 9 10
332 * Added in top half: 28 14 29 26 29 28
333 * 28 29
334 * Added in bottom half: 29 14 28 25 28 28
335 * 29
336 *
337 * The value that is current offset 7 will be offset 5 for the next
338 * iteration and then offset 3 for the iteration after that. Therefore
339 * the total value added will be the values added at 7, 5 and 3.
340 *
341 * The following table accumulates these values. The sums at the bottom
342 * are written as, for example, 29+28, to mean a value < 2**29+2**28.
343 *
344 * Word: 3 4 5 6 7 8 9 10 11
345 * 28 14 14 29 26 29 28 28 28
346 * 29 28 29 28 29 28 29 28
347 * 14 28 25 26 29 26
348 * 28 29 28 28 28
349 * 29 28 29 28
350 * 29 14 29 14
351 * 14 28 14 28
352 * 29
353 * ----------------------------------
354 * 30+ 31+ 30+
355 * 28 30+ 28
356 * 14
357 *
358 * So the greatest amount is added to tmp2[10]. If tmp2[10] has an
359 * initial value of <2**29, then the maximum value will be < 2**31 +
360 * 2**30 + 2**29 + 2**14, which is < 2**32, as required. */
361 tmp2[i + 3] += (x << 10) & kBottom28Bits;
362 tmp2[i + 4] += (x >> 18);
363
364 tmp2[i + 6] += (x << 21) & kBottom29Bits;
365 tmp2[i + 7] += x >> 8;
366
367 /* At position 200 we have a factor of 0xf000000 = 2**28 - 2**24. */
368 tmp2[i + 7] += 0x10000000;
369 tmp2[i + 8] += x - 1;
370 tmp2[i + 7] -= (x << 24) & kBottom28Bits;
371 tmp2[i + 8] -= x >> 4;
372
373 tmp2[i + 8] += 0x20000000;
374 tmp2[i + 8] -= x;
375 tmp2[i + 8] += (x << 28) & kBottom29Bits;
376 tmp2[i + 9] += (x >> 1) - 1;
377
378 i++;
379 if (i == 9)
380 break;
381 tmp2[i + 1] += tmp2[i] >> 28;
382 x = tmp2[i] & kBottom28Bits;
383 tmp2[i] = 0;
384
385 tmp2[i + 3] += (x << 11) & kBottom29Bits;
386 tmp2[i + 4] += (x >> 18);
387
388 tmp2[i + 6] += (x << 21) & kBottom28Bits;
389 tmp2[i + 7] += x >> 7;
390
391 /* At position 199 we have a factor of 0x1e000000 = 2**29 - 2**25. */
392 tmp2[i + 7] += 0x20000000;
393 tmp2[i + 8] += x - 1;
394 tmp2[i + 7] -= (x << 25) & kBottom29Bits;
395 tmp2[i + 8] -= x >> 4;
396
397 /* If x is < 2**28, therefore is x > 0 iff x-1 has a zero MSB. Thus xMas k
398 is all ones iff x != 0. */
399 xMask = ((s32) ~ (x - 1)) >> 31;
400 tmp2[i + 8] += 0x10000000 & xMask;
401 tmp2[i + 8] -= x & xMask;
402 tmp2[i + 9] += (x - 1) & xMask;
403 }
404
405 /* We merge the right shift with a carry chain. The words above 2**257 have
406 * widths of 28,29,... which we need to correct when copying them down. */
407 carry = 0;
408 for (i = 0; i < 8; i++) {
409 /* The maximum value of tmp2[i + 9] occurs on the first iteration and
410 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
411 * therefore safe. */
412 out[i] = tmp2[i + 9];
413 out[i] += carry;
414 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
415 carry = out[i] >> 29;
416 out[i] &= kBottom29Bits;
417
418 i++;
419 out[i] = tmp2[i + 9] >> 1;
420 out[i] += carry;
421 carry = out[i] >> 28;
422 out[i] &= kBottom28Bits;
423 }
424
425 out[8] = tmp2[17];
426 out[8] += carry;
427 carry = out[8] >> 29;
428 out[8] &= kBottom29Bits;
429
430 felem_reduce_carry(out, carry);
431 }
432
/* felem_square sets out=in*in.
 *
 * Each tmp[k] accumulates the products in[i]*in[j] with i+j == k. Cross
 * terms (i != j) appear twice and so carry a factor of two; when both i and
 * j are odd the product lands one bit below tmp[k]'s limb position in the
 * 29,28,... pattern, so those terms are doubled again (hence the << 2).
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_square(felem out, const felem in)
{
    uint64_t tmp[17];

    tmp[0] = ((uint64_t) in[0]) * in[0];
    tmp[1] = ((uint64_t) in[0]) * (in[1] << 1);
    tmp[2] = ((uint64_t) in[0]) * (in[2] << 1) +
             ((uint64_t) in[1]) * (in[1] << 1);
    tmp[3] = ((uint64_t) in[0]) * (in[3] << 1) +
             ((uint64_t) in[1]) * (in[2] << 1);
    tmp[4] = ((uint64_t) in[0]) * (in[4] << 1) +
             ((uint64_t) in[1]) * (in[3] << 2) +
             ((uint64_t) in[2]) * in[2];
    tmp[5] = ((uint64_t) in[0]) * (in[5] << 1) +
             ((uint64_t) in[1]) * (in[4] << 1) +
             ((uint64_t) in[2]) * (in[3] << 1);
    tmp[6] = ((uint64_t) in[0]) * (in[6] << 1) +
             ((uint64_t) in[1]) * (in[5] << 2) +
             ((uint64_t) in[2]) * (in[4] << 1) +
             ((uint64_t) in[3]) * (in[3] << 1);
    tmp[7] = ((uint64_t) in[0]) * (in[7] << 1) +
             ((uint64_t) in[1]) * (in[6] << 1) +
             ((uint64_t) in[2]) * (in[5] << 1) +
             ((uint64_t) in[3]) * (in[4] << 1);
    /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
     * which is < 2**64 as required. */
    tmp[8] = ((uint64_t) in[0]) * (in[8] << 1) +
             ((uint64_t) in[1]) * (in[7] << 2) +
             ((uint64_t) in[2]) * (in[6] << 1) +
             ((uint64_t) in[3]) * (in[5] << 2) +
             ((uint64_t) in[4]) * in[4];
    tmp[9] = ((uint64_t) in[1]) * (in[8] << 1) +
             ((uint64_t) in[2]) * (in[7] << 1) +
             ((uint64_t) in[3]) * (in[6] << 1) +
             ((uint64_t) in[4]) * (in[5] << 1);
    tmp[10] = ((uint64_t) in[2]) * (in[8] << 1) +
              ((uint64_t) in[3]) * (in[7] << 2) +
              ((uint64_t) in[4]) * (in[6] << 1) +
              ((uint64_t) in[5]) * (in[5] << 1);
    tmp[11] = ((uint64_t) in[3]) * (in[8] << 1) +
              ((uint64_t) in[4]) * (in[7] << 1) +
              ((uint64_t) in[5]) * (in[6] << 1);
    tmp[12] = ((uint64_t) in[4]) * (in[8] << 1) +
              ((uint64_t) in[5]) * (in[7] << 2) +
              ((uint64_t) in[6]) * in[6];
    tmp[13] = ((uint64_t) in[5]) * (in[8] << 1) +
              ((uint64_t) in[6]) * (in[7] << 1);
    tmp[14] = ((uint64_t) in[6]) * (in[8] << 1) +
              ((uint64_t) in[7]) * (in[7] << 1);
    tmp[15] = ((uint64_t) in[7]) * (in[8] << 1);
    tmp[16] = ((uint64_t) in[8]) * in[8];

    felem_reduce_degree(out, tmp);
}
491
492 /* felem_mul sets out=in*in2.
493 *
494 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
495 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
496 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
497 static void felem_mul(felem out, const felem in, const felem in2)
498 {
499 u64 tmp[17];
500
501 tmp[0] = ((uint64_t) in[0]) * in2[0];
502 tmp[1] = ((uint64_t) in[0]) * (in2[1] << 0) +
503 ((uint64_t) in[1]) * (in2[0] << 0);
504 tmp[2] = ((uint64_t) in[0]) * (in2[2] << 0) +
505 ((uint64_t) in[1]) * (in2[1] << 1) +
506 ((uint64_t) in[2]) * (in2[0] << 0);
507 tmp[3] = ((uint64_t) in[0]) * (in2[3] << 0) +
508 ((uint64_t) in[1]) * (in2[2] << 0) +
509 ((uint64_t) in[2]) * (in2[1] << 0) +
510 ((uint64_t) in[3]) * (in2[0] << 0);
511 tmp[4] = ((uint64_t) in[0]) * (in2[4] << 0) +
512 ((uint64_t) in[1]) * (in2[3] << 1) +
513 ((uint64_t) in[2]) * (in2[2] << 0) +
514 ((uint64_t) in[3]) * (in2[1] << 1) +
515 ((uint64_t) in[4]) * (in2[0] << 0);
516 tmp[5] = ((uint64_t) in[0]) * (in2[5] << 0) +
517 ((uint64_t) in[1]) * (in2[4] << 0) +
518 ((uint64_t) in[2]) * (in2[3] << 0) +
519 ((uint64_t) in[3]) * (in2[2] << 0) +
520 ((uint64_t) in[4]) * (in2[1] << 0) +
521 ((uint64_t) in[5]) * (in2[0] << 0);
522 tmp[6] = ((uint64_t) in[0]) * (in2[6] << 0) +
523 ((uint64_t) in[1]) * (in2[5] << 1) +
524 ((uint64_t) in[2]) * (in2[4] << 0) +
525 ((uint64_t) in[3]) * (in2[3] << 1) +
526 ((uint64_t) in[4]) * (in2[2] << 0) +
527 ((uint64_t) in[5]) * (in2[1] << 1) +
528 ((uint64_t) in[6]) * (in2[0] << 0);
529 tmp[7] = ((uint64_t) in[0]) * (in2[7] << 0) +
530 ((uint64_t) in[1]) * (in2[6] << 0) +
531 ((uint64_t) in[2]) * (in2[5] << 0) +
532 ((uint64_t) in[3]) * (in2[4] << 0) +
533 ((uint64_t) in[4]) * (in2[3] << 0) +
534 ((uint64_t) in[5]) * (in2[2] << 0) +
535 ((uint64_t) in[6]) * (in2[1] << 0) +
536 ((uint64_t) in[7]) * (in2[0] << 0);
537 /* tmp[8] has the greatest value but doesn't overflow. See logic in
538 * felem_square. */
539 tmp[8] = ((uint64_t) in[0]) * (in2[8] << 0) +
540 ((uint64_t) in[1]) * (in2[7] << 1) +
541 ((uint64_t) in[2]) * (in2[6] << 0) +
542 ((uint64_t) in[3]) * (in2[5] << 1) +
543 ((uint64_t) in[4]) * (in2[4] << 0) +
544 ((uint64_t) in[5]) * (in2[3] << 1) +
545 ((uint64_t) in[6]) * (in2[2] << 0) +
546 ((uint64_t) in[7]) * (in2[1] << 1) +
547 ((uint64_t) in[8]) * (in2[0] << 0);
548 tmp[9] = ((uint64_t) in[1]) * (in2[8] << 0) +
549 ((uint64_t) in[2]) * (in2[7] << 0) +
550 ((uint64_t) in[3]) * (in2[6] << 0) +
551 ((uint64_t) in[4]) * (in2[5] << 0) +
552 ((uint64_t) in[5]) * (in2[4] << 0) +
553 ((uint64_t) in[6]) * (in2[3] << 0) +
554 ((uint64_t) in[7]) * (in2[2] << 0) +
555 ((uint64_t) in[8]) * (in2[1] << 0);
556 tmp[10] = ((uint64_t) in[2]) * (in2[8] << 0) +
557 ((uint64_t) in[3]) * (in2[7] << 1) +
558 ((uint64_t) in[4]) * (in2[6] << 0) +
559 ((uint64_t) in[5]) * (in2[5] << 1) +
560 ((uint64_t) in[6]) * (in2[4] << 0) +
561 ((uint64_t) in[7]) * (in2[3] << 1) +
562 ((uint64_t) in[8]) * (in2[2] << 0);
563 tmp[11] = ((uint64_t) in[3]) * (in2[8] << 0) +
564 ((uint64_t) in[4]) * (in2[7] << 0) +
565 ((uint64_t) in[5]) * (in2[6] << 0) +
566 ((uint64_t) in[6]) * (in2[5] << 0) +
567 ((uint64_t) in[7]) * (in2[4] << 0) +
568 ((uint64_t) in[8]) * (in2[3] << 0);
569 tmp[12] = ((uint64_t) in[4]) * (in2[8] << 0) +
570 ((uint64_t) in[5]) * (in2[7] << 1) +
571 ((uint64_t) in[6]) * (in2[6] << 0) +
572 ((uint64_t) in[7]) * (in2[5] << 1) +
573 ((uint64_t) in[8]) * (in2[4] << 0);
574 tmp[13] = ((uint64_t) in[5]) * (in2[8] << 0) +
575 ((uint64_t) in[6]) * (in2[7] << 0) +
576 ((uint64_t) in[7]) * (in2[6] << 0) +
577 ((uint64_t) in[8]) * (in2[5] << 0);
578 tmp[14] = ((uint64_t) in[6]) * (in2[8] << 0) +
579 ((uint64_t) in[7]) * (in2[7] << 1) +
580 ((uint64_t) in[8]) * (in2[6] << 0);
581 tmp[15] = ((uint64_t) in[7]) * (in2[8] << 0) +
582 ((uint64_t) in[8]) * (in2[7] << 0);
583 tmp[16] = ((uint64_t) in[8]) * (in2[8] << 0);
584
585 felem_reduce_degree(out, tmp);
586 }
587
588 static void felem_assign(felem out, const felem in)
589 {
590 memcpy(out, in, sizeof(felem));
591 }
592
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent used is p-2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 (see the
 * final comment below). The addition chain is fixed, so the same sequence
 * of squarings and multiplications runs regardless of the input value. */
static void felem_inv(felem out, const felem in)
{
    felem ftmp, ftmp2;
    /* each e_I will hold |in|^{2^I - 1} */
    felem e2, e4, e8, e16, e32, e64;
    unsigned i;

    felem_square(ftmp, in); /* 2^1 */
    felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
    felem_assign(e2, ftmp);
    felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
    felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
    felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
    felem_assign(e4, ftmp);
    felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
    felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
    felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
    felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
    felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
    felem_assign(e8, ftmp);
    for (i = 0; i < 8; i++) {
        felem_square(ftmp, ftmp);
    } /* 2^16 - 2^8 */
    felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
    felem_assign(e16, ftmp);
    for (i = 0; i < 16; i++) {
        felem_square(ftmp, ftmp);
    } /* 2^32 - 2^16 */
    felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
    felem_assign(e32, ftmp);
    for (i = 0; i < 32; i++) {
        felem_square(ftmp, ftmp);
    } /* 2^64 - 2^32 */
    felem_assign(e64, ftmp);
    felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
    for (i = 0; i < 192; i++) {
        felem_square(ftmp, ftmp);
    } /* 2^256 - 2^224 + 2^192 */

    /* ftmp2 builds the low 2^96 - 3 part of the exponent. */
    felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
    for (i = 0; i < 16; i++) {
        felem_square(ftmp2, ftmp2);
    } /* 2^80 - 2^16 */
    felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
    for (i = 0; i < 8; i++) {
        felem_square(ftmp2, ftmp2);
    } /* 2^88 - 2^8 */
    felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
    for (i = 0; i < 4; i++) {
        felem_square(ftmp2, ftmp2);
    } /* 2^92 - 2^4 */
    felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
    felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
    felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
    felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
    felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
    felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
    felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */

    felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
}
661
662 /* felem_scalar_3 sets out=3*out.
663 *
664 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
665 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
666 static void felem_scalar_3(felem out)
667 {
668 limb carry = 0;
669 unsigned i;
670
671 for (i = 0;; i++) {
672 out[i] *= 3;
673 out[i] += carry;
674 carry = out[i] >> 29;
675 out[i] &= kBottom29Bits;
676
677 i++;
678 if (i == 9)
679 break;
680
681 out[i] *= 3;
682 out[i] += carry;
683 carry = out[i] >> 28;
684 out[i] &= kBottom28Bits;
685 }
686
687 felem_reduce_carry(out, carry);
688 }
689
690 /* felem_scalar_4 sets out=4*out.
691 *
692 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
693 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
694 static void felem_scalar_4(felem out)
695 {
696 limb carry = 0, next_carry;
697 unsigned i;
698
699 for (i = 0;; i++) {
700 next_carry = out[i] >> 27;
701 out[i] <<= 2;
702 out[i] &= kBottom29Bits;
703 out[i] += carry;
704 carry = next_carry + (out[i] >> 29);
705 out[i] &= kBottom29Bits;
706
707 i++;
708 if (i == 9)
709 break;
710 next_carry = out[i] >> 26;
711 out[i] <<= 2;
712 out[i] &= kBottom28Bits;
713 out[i] += carry;
714 carry = next_carry + (out[i] >> 28);
715 out[i] &= kBottom28Bits;
716 }
717
718 felem_reduce_carry(out, carry);
719 }
720
721 /* felem_scalar_8 sets out=8*out.
722 *
723 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
724 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
725 static void felem_scalar_8(felem out)
726 {
727 limb carry = 0, next_carry;
728 unsigned i;
729
730 for (i = 0;; i++) {
731 next_carry = out[i] >> 26;
732 out[i] <<= 3;
733 out[i] &= kBottom29Bits;
734 out[i] += carry;
735 carry = next_carry + (out[i] >> 29);
736 out[i] &= kBottom29Bits;
737
738 i++;
739 if (i == 9)
740 break;
741 next_carry = out[i] >> 25;
742 out[i] <<= 3;
743 out[i] &= kBottom28Bits;
744 out[i] += carry;
745 carry = next_carry + (out[i] >> 28);
746 out[i] &= kBottom28Bits;
747 }
748
749 felem_reduce_carry(out, carry);
750 }
751
752 static char felem_is_zero_vartime(const felem in)
753 {
754 limb carry;
755 int i;
756 limb tmp[NLIMBS];
757 felem_assign(tmp, in);
758
759 /* First, reduce tmp to a minimal form. */
760 do {
761 carry = 0;
762 for (i = 0;; i++) {
763 tmp[i] += carry;
764 carry = tmp[i] >> 29;
765 tmp[i] &= kBottom29Bits;
766
767 i++;
768 if (i == 9)
769 break;
770
771 tmp[i] += carry;
772 carry = tmp[i] >> 28;
773 tmp[i] &= kBottom28Bits;
774 }
775
776 felem_reduce_carry(tmp, carry);
777 } while (carry);
778
779 /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
780 return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
781 memcmp(tmp, kP, sizeof(tmp)) == 0 ||
782 memcmp(tmp, k2P, sizeof(tmp)) == 0;
783 }
784
/* Group operations:
 *
 * Elements of the elliptic curve group are represented in Jacobian
 * coordinates: a Jacobian point (x, y, z) corresponds to the affine point
 * (x/z**2, y/z**3). */
789
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * Formulas: "dbl-2009-l", see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l */
static void point_double(felem x_out, felem y_out, felem z_out,
                         const felem x, const felem y, const felem z)
{
    felem delta, gamma, alpha, beta, tmp, tmp2;

    /* delta = z**2, gamma = y**2, beta = x*gamma. */
    felem_square(delta, z);
    felem_square(gamma, y);
    felem_mul(beta, x, gamma);

    /* alpha = 3*(x - delta)*(x + delta). */
    felem_sum(tmp, x, delta);
    felem_diff(tmp2, x, delta);
    felem_mul(alpha, tmp, tmp2);
    felem_scalar_3(alpha);

    /* z_out = (y + z)**2 - gamma - delta (= 2*y*z). */
    felem_sum(tmp, y, z);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, gamma);
    felem_diff(z_out, tmp, delta);

    /* x_out = alpha**2 - 8*beta (beta is scaled to 4*beta first). */
    felem_scalar_4(beta);
    felem_square(x_out, alpha);
    felem_diff(x_out, x_out, beta);
    felem_diff(x_out, x_out, beta);

    /* y_out = alpha*(4*beta - x_out) - 8*gamma**2. */
    felem_diff(tmp, beta, x_out);
    felem_mul(tmp, alpha, tmp);
    felem_square(tmp2, gamma);
    felem_scalar_8(tmp2);
    felem_diff(y_out, tmp, tmp2);
}
823
/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
 * (i.e. the second point is affine.)
 *
 * Formulas: "add-2007-bl" with z2 = 1, see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly. */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
                            const felem x1, const felem y1, const felem z1,
                            const felem x2, const felem y2)
{
    felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);
    felem_sum(tmp, z1, z1);                /* tmp = 2*z1 */

    felem_mul(u2, x2, z1z1);               /* u2 = x2*z1**2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);             /* s2 = y2*z1**3 */
    felem_diff(h, u2, x1);                 /* h = u2 - x1 (u1 = x1, z2 = 1) */
    felem_sum(i, h, h);
    felem_square(i, i);                    /* i = (2*h)**2 */
    felem_mul(j, h, i);                    /* j = h*i */
    felem_diff(r, s2, y1);
    felem_sum(r, r, r);                    /* r = 2*(s2 - y1) */
    felem_mul(v, x1, i);                   /* v = x1*i */

    felem_mul(z_out, tmp, h);              /* z_out = 2*z1*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);           /* x_out = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, y1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);         /* y_out = r*(v - x_out) - 2*y1*j */
}
863
/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * Formulas: "add-2007-bl", see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly. */
static void point_add(felem x_out, felem y_out, felem z_out,
                      const felem x1, const felem y1, const felem z1,
                      const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);
    felem_square(z2z2, z2);
    felem_mul(u1, x1, z2z2);               /* u1 = x1*z2**2 */

    /* tmp = (z1 + z2)**2 - z1**2 - z2**2 = 2*z1*z2 */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);
    felem_mul(s1, y1, z2z2z2);             /* s1 = y1*z2**3 */

    felem_mul(u2, x2, z1z1);               /* u2 = x2*z1**2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);             /* s2 = y2*z1**3 */
    felem_diff(h, u2, u1);                 /* h = u2 - u1 */
    felem_sum(i, h, h);
    felem_square(i, i);                    /* i = (2*h)**2 */
    felem_mul(j, h, i);                    /* j = h*i */
    felem_diff(r, s2, s1);
    felem_sum(r, r, r);                    /* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);                   /* v = u1*i */

    felem_mul(z_out, tmp, h);              /* z_out = 2*z1*z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);           /* x_out = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);         /* y_out = r*(v - x_out) - 2*s1*j */
}
911
/* point_add_or_double_vartime sets
 * {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * Formulas: "add-2007-bl", see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * This function handles the case where {x1,y1,z1}={x2,y2,z2} by falling back
 * to point_double. It runs in variable time (it branches on whether the two
 * points are equal) and so must only be used with public inputs. */
static void point_add_or_double_vartime(
    felem x_out, felem y_out, felem z_out,
    const felem x1, const felem y1, const felem z1,
    const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
    char x_equal, y_equal;

    felem_square(z1z1, z1);
    felem_square(z2z2, z2);
    felem_mul(u1, x1, z2z2);               /* u1 = x1*z2**2 */

    /* tmp = (z1 + z2)**2 - z1**2 - z2**2 = 2*z1*z2 */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);
    felem_mul(s1, y1, z2z2z2);             /* s1 = y1*z2**3 */

    felem_mul(u2, x2, z1z1);               /* u2 = x2*z1**2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);             /* s2 = y2*z1**3 */
    felem_diff(h, u2, u1);
    /* h == 0 iff both points have the same (affine) x coordinate. */
    x_equal = felem_is_zero_vartime(h);
    felem_sum(i, h, h);
    felem_square(i, i);
    felem_mul(j, h, i);
    felem_diff(r, s2, s1);
    /* At this point r == 0 iff both points have the same y coordinate. */
    y_equal = felem_is_zero_vartime(r);
    if (x_equal && y_equal) {
        /* Same point: the addition formulas degenerate (h == 0 would zero
         * z_out), so double instead. */
        point_double(x_out, y_out, z_out, x1, y1, z1);
        return;
    }
    felem_sum(r, r, r);                    /* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);                   /* v = u1*i */

    felem_mul(z_out, tmp, h);              /* z_out = 2*z1*z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);           /* x_out = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);         /* y_out = r*(v - x_out) - 2*s1*j */
}
966
967 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.
968 *
969 * On entry: mask is either 0 or 0xffffffff. */
970 static void copy_conditional(felem out, const felem in, limb mask)
971 {
972 int i;
973
974 for (i = 0; i < 9; i++) {
975 const limb tmp = mask & (in[i] ^ out[i]);
976 out[i] ^= tmp;
977 }
978 }
979
980 /* select_affine_point sets {out_x,out_y} to the index'th entry of table.
981 * On entry: index < 16, table[0] must be zero. */
982 static void select_affine_point(felem out_x, felem out_y,
983 const limb *table, limb index)
984 {
985 limb i, j;
986
987 memset(out_x, 0, sizeof(felem));
988 memset(out_y, 0, sizeof(felem));
989
990 for (i = 1; i < 16; i++) {
991 limb mask = i ^ index;
992 mask |= mask >> 2;
993 mask |= mask >> 1;
994 mask &= 1;
995 mask--;
996 for (j = 0; j < NLIMBS; j++) {
997 out_x[j] |= (*table++) & mask;
998 }
999 for (j = 0; j < NLIMBS; j++) {
1000 out_y[j] |= (*table++) & mask;
1001 }
1002 }
1003 }
1004
1005 /* select_affine_point sets {out_x,out_y,out_z} to the index'th entry of table.
1006 * On entry: index < 16, table[0] must be zero. */
1007 static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
1008 const limb *table, limb index)
1009 {
1010 limb i, j;
1011
1012 memset(out_x, 0, sizeof(felem));
1013 memset(out_y, 0, sizeof(felem));
1014 memset(out_z, 0, sizeof(felem));
1015
1016 table += 3*NLIMBS;
1017
1018 for (i = 1; i < 16; i++) {
1019 limb mask = i ^ index;
1020 mask |= mask >> 2;
1021 mask |= mask >> 1;
1022 mask &= 1;
1023 mask--;
1024 for (j = 0; j < NLIMBS; j++) {
1025 out_x[j] |= (*table++) & mask;
1026 }
1027 for (j = 0; j < NLIMBS; j++) {
1028 out_y[j] |= (*table++) & mask;
1029 }
1030 for (j = 0; j < NLIMBS; j++) {
1031 out_z[j] |= (*table++) & mask;
1032 }
1033 }
1034 }
1035
1036 /* get_bit returns the bit'th bit of scalar. */
1037 static char get_bit(const u8 scalar[32], int bit)
1038 {
1039 return ((scalar[bit >> 3]) >> (bit & 7)) & 1;
1040 }
1041
1042 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
1043 * number. Note that the value of scalar must be less than the order of the
1044 * group. */
1045 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32])
1046 {
1047 int i, j;
1048 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
1049 u32 table_offset;
1050
1051 felem px, py;
1052 felem tx, ty, tz;
1053
1054 memset(nx, 0, sizeof(felem));
1055 memset(ny, 0, sizeof(felem));
1056 memset(nz, 0, sizeof(felem));
1057
1058 /* The loop adds bits at positions 0, 64, 128 and 192, followed by
1059 * positions 32,96,160 and 224 and does this 32 times. */
1060 for (i = 0; i < 32; i++) {
1061 if (i) {
1062 point_double(nx, ny, nz, nx, ny, nz);
1063 }
1064 for (j = 0; j <= 32; j += 32) {
1065 char bit0 = get_bit(scalar, 31 - i + j);
1066 char bit1 = get_bit(scalar, 95 - i + j);
1067 char bit2 = get_bit(scalar, 159 - i + j);
1068 char bit3 = get_bit(scalar, 223 - i + j);
1069 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);
1070
1071 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS);
1072 select_affine_point(px, py, kPrecomputed + table_offset, index);
1073
1074 /* Since scalar is less than the order of the group, we know that
1075 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
1076 * below. */
1077 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
1078 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
1079 * (a.k.a. the point at infinity). We handle that situation by
1080 * copying the point from the table. */
1081 copy_conditional(nx, px, n_is_infinity_mask);
1082 copy_conditional(ny, py, n_is_infinity_mask);
1083 copy_conditional(nz, kOne, n_is_infinity_mask);
1084
1085 /* Equally, the result is also wrong if the point from the table is
1086 * zero, which happens when the index is zero. We handle that by
1087 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */
1088 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
1089 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1090 copy_conditional(nx, tx, mask);
1091 copy_conditional(ny, ty, mask);
1092 copy_conditional(nz, tz, mask);
1093 /* If p was not zero, then n is now non-zero. */
1094 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1095 }
1096 }
1097 }
1098
1099 /* point_to_affine converts a Jacobian point to an affine point. If the input
1100 * is the point at infinity then it returns (0, 0) in constant time. */
1101 static void point_to_affine(felem x_out, felem y_out,
1102 const felem nx, const felem ny, const felem nz) {
1103 felem z_inv, z_inv_sq;
1104 felem_inv(z_inv, nz);
1105 felem_square(z_inv_sq, z_inv);
1106 felem_mul(x_out, nx, z_inv_sq);
1107 felem_mul(z_inv, z_inv, z_inv_sq);
1108 felem_mul(y_out, ny, z_inv);
1109 }
1110
/* scalar_mult sets {nx,ny,nz} = scalar*{x,y}, where scalar is a little-endian
 * number. (The original comment named this function scalar_base_mult, which
 * is a different function above.) */
static void scalar_mult(felem nx, felem ny, felem nz,
                        const felem x, const felem y, const u8 scalar[32])
{
    int i;
    felem px, py, pz, tx, ty, tz;
    felem precomp[16][3];
    limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;

    /* We precompute 0,1,2,... times {x,y}. Only precomp[0] (one entry of
     * three felems) needs zeroing here: entries 2..15 are written by the
     * loop below. */
    memset(precomp, 0, sizeof(felem) * 3);
    memcpy(&precomp[1][0], x, sizeof(felem));
    memcpy(&precomp[1][1], y, sizeof(felem));
    memcpy(&precomp[1][2], kOne, sizeof(felem));

    for (i = 2; i < 16; i += 2) {
        /* precomp[i] = 2*precomp[i/2]; precomp[i+1] = precomp[i] + {x,y}. */
        point_double(precomp[i][0], precomp[i][1], precomp[i][2],
                     precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);

        point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
                        precomp[i][0], precomp[i][1], precomp[i][2], x, y);
    }

    memset(nx, 0, sizeof(felem));
    memset(ny, 0, sizeof(felem));
    memset(nz, 0, sizeof(felem));
    n_is_infinity_mask = -1;

    /* We add in a window of four bits each iteration and do this 64 times. */
    for (i = 0; i < 64; i++) {
        if (i) {
            point_double(nx, ny, nz, nx, ny, nz);
            point_double(nx, ny, nz, nx, ny, nz);
            point_double(nx, ny, nz, nx, ny, nz);
            point_double(nx, ny, nz, nx, ny, nz);
        }

        /* index is the next four-bit window of scalar, most significant
         * window first. */
        index = scalar[31 - i / 2];
        if ((i & 1) == 1) {
            index &= 15;
        } else {
            index >>= 4;
        }

        /* See the comments in scalar_base_mult about handling infinities. */
        select_jacobian_point(px, py, pz, (limb *) precomp, index);
        point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
        copy_conditional(nx, px, n_is_infinity_mask);
        copy_conditional(ny, py, n_is_infinity_mask);
        copy_conditional(nz, pz, n_is_infinity_mask);

        /* All ones iff index != 0. NOTE(review): relies on an arithmetic
         * right shift of a negative s32, which is implementation-defined;
         * consider NON_ZERO_TO_ALL_ONES (as used in scalar_base_mult) for
         * consistency. */
        p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31;
        mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
        copy_conditional(nx, tx, mask);
        copy_conditional(ny, ty, mask);
        copy_conditional(nz, tz, mask);
        n_is_infinity_mask &= ~p_is_noninfinite_mask;
    }
}
1170
1171
/* Interface with Freebl: */

/* The conversion routines below memcpy() raw bytes directly into and out of
 * mp_int digit arrays, which is only correct on a little-endian machine.
 * NOTE(review): <endian.h>/__BYTE_ORDER are glibc-specific; consider NSPR's
 * endianness macros for portability. */
#if __BYTE_ORDER != __LITTLE_ENDIAN
#error "This code needs a little-endian processor"
#endif

/* kRInv = R**-1 mod p, where R = 2**257 is the shift applied by
 * montgomery() below, stored as little-endian 32-bit words. */
static const u32 kRInvDigits[8] = {
    0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff};
#define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit))
static const mp_int kRInv = {
    MP_ZPOS,
    MP_DIGITS_IN_256_BITS,
    MP_DIGITS_IN_256_BITS,
    /* Because we are running on a little-endian processor, this cast works for
     * both 32 and 64-bit processors. */
    (mp_digit*) kRInvDigits
};

/* Powers of two used when converting between the alternating 29/28-bit limb
 * representation and mp_ints. */
static const limb kTwo28 = 0x10000000;
static const limb kTwo29 = 0x20000000;
1191
1192 /* montgomery sets out = R*in. */
1193 static mp_err montgomery(felem out, const mp_int *in, const ECGroup *group) {
1194 /* There are no MPI functions for bitshift operations and we wish to shift i n
1195 * 257 bits left so we multiply by two and then move the digits 256-bits
1196 * left. */
1197 mp_int in_shifted;
1198 int i;
1199 mp_err res;
1200
1201 mp_init(&in_shifted);
1202 s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS);
1203 memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS),
1204 MP_DIGITS(in),
1205 MP_USED(in)*sizeof(mp_digit));
1206 mp_mul_2(&in_shifted, &in_shifted);
1207 MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth));
1208
1209 for (i = 0; i < NLIMBS; i++) {
1210 if ((i & 1) == 0) {
1211 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits;
1212 mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL);
1213 } else {
1214 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits;
1215 mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL);
1216 }
1217 }
1218
1219 mp_clear(&in_shifted);
1220
1221 CLEANUP:
1222 return res;
1223 }
1224
1225 /* inverse_montgomery sets out=in/R. */
1226 static mp_err inverse_montgomery(mp_int *out, const felem in,
1227 const ECGroup *group) {
1228 mp_int result, tmp;
1229 mp_err res;
1230 int i;
1231
1232 mp_init(&result);
1233 mp_init(&tmp);
1234
1235 MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result));
1236 for (i = NLIMBS-2; i >= 0; i--) {
1237 if ((i & 1) == 0) {
1238 MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp));
1239 } else {
1240 MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp));
1241 }
1242 MP_CHECKOK(mp_add_d(&tmp, in[i], &result));
1243 }
1244
1245 MP_CHECKOK(mp_mul(&result, &kRInv, out));
1246 MP_CHECKOK(group->meth->field_mod(out, out, group->meth));
1247
1248 mp_clear(&result);
1249 mp_clear(&tmp);
1250
1251 CLEANUP:
1252 return res;
1253 }
1254
1255 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */
1256 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n) {
1257 /* We require that |n| is less than the order of the group and therefore it
1258 * will fit into |scalar|. However, these is a timing side-channel here that
1259 * we cannot avoid: if |n| is sufficiently small it may be one or more words
1260 * too short and we'll copy less data. */
1261 memset(out_scalar, 0, 32);
1262 memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit));
1263 }
1264
1265 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1266 * order of the group. */
1267 mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n,
1268 mp_int *out_x, mp_int *out_y,
1269 const ECGroup *group) {
1270 u8 scalar[32];
1271 felem x, y, z, x_affine, y_affine;
1272 mp_err res;
1273
1274 scalar_from_mp_int(scalar, n);
1275 scalar_base_mult(x, y, z, scalar);
1276 point_to_affine(x_affine, y_affine, x, y, z);
1277 MP_CHECKOK(inverse_montgomery(out_x, x_affine, group));
1278 MP_CHECKOK(inverse_montgomery(out_y, y_affine, group));
1279
1280 CLEANUP:
1281 return res;
1282 }
1283
1284 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
1285 * the order of the group. */
1286 mp_err ec_GFp_nistp256_point_mul(const mp_int *n,
1287 const mp_int *in_x, const mp_int *in_y,
1288 mp_int *out_x, mp_int *out_y,
1289 const ECGroup *group) {
1290 u8 scalar[32];
1291 felem x, y, z, x_affine, y_affine, px, py;
1292 mp_err res;
1293
1294 scalar_from_mp_int(scalar, n);
1295
1296 MP_CHECKOK(montgomery(px, in_x, group));
1297 MP_CHECKOK(montgomery(py, in_y, group));
1298
1299 scalar_mult(x, y, z, px, py, scalar);
1300 point_to_affine(x_affine, y_affine, x, y, z);
1301 MP_CHECKOK(inverse_montgomery(out_x, x_affine, group));
1302 MP_CHECKOK(inverse_montgomery(out_y, y_affine, group));
1303
1304 CLEANUP:
1305 return res;
1306 }
1307
1308 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where
1309 * n1 and n2 are < the order of the group.
1310 *
1311 * As indicated by the name, this function operates in variable time. This
1312 * is safe because it's used for signature validation which doesn't deal
1313 * with secrets. */
1314 mp_err ec_GFp_nistp256_points_mul_vartime
1315 (const mp_int *n1, const mp_int *n2,
1316 const mp_int *in_x, const mp_int *in_y,
1317 mp_int *out_x, mp_int *out_y,
1318 const ECGroup *group) {
1319 u8 scalar1[32], scalar2[32];
1320 felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py;
1321 mp_err res = MP_OKAY;
1322
1323 /* If both scalars are zero, then the result is the point at infinity. */
1324 if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) {
1325 mp_zero(out_x);
1326 mp_zero(out_y);
1327 return res;
1328 }
1329
1330 scalar_from_mp_int(scalar1, n1);
1331 scalar_from_mp_int(scalar2, n2);
1332
1333 MP_CHECKOK(montgomery(px, in_x, group));
1334 MP_CHECKOK(montgomery(py, in_y, group));
1335 scalar_base_mult(x1, y1, z1, scalar1);
1336 scalar_mult(x2, y2, z2, px, py, scalar2);
1337
1338 if (mp_cmp_z(n2) == 0) {
1339 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
1340 * {x1,y1,z1}. */
1341 } else if (mp_cmp_z(n1) == 0) {
1342 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
1343 * {x2,y2,z2}. */
1344 memcpy(x1, x2, sizeof(x2));
1345 memcpy(y1, y2, sizeof(y2));
1346 memcpy(z1, z2, sizeof(z2));
1347 } else {
1348 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
1349 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
1350 }
1351
1352 point_to_affine(x_affine, y_affine, x1, y1, z1);
1353 MP_CHECKOK(inverse_montgomery(out_x, x_affine, group));
1354 MP_CHECKOK(inverse_montgomery(out_y, y_affine, group));
1355
1356 CLEANUP:
1357 return res;
1358 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698