Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1091)

Side by Side Diff: ecp_256_32.c

Issue 10944017: 32-bit, P-256 implementation. Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Addressing wtc's comments. Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * A 32-bit implementation of the NIST P-256 elliptic curve.
3 *
4 * This Source Code Form is subject to the terms of the Mozilla Public
5 * License, v. 2.0. If a copy of the MPL was not distributed with this
6 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 */
8
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13
14 #include "prtypes.h"
15 #include "mpi.h"
16 #include "mpi-priv.h"
17 #include "ecp.h"
18
19 typedef PRUint8 u8;
20 typedef PRUint32 u32;
21 typedef PRInt32 s32;
22 typedef PRUint64 u64;
23
24 /* Our field elements are represented as nine, unsigned 32-bit words. Freebl's
25 * MPI library calls them digits, but here they are called limbs, which is
26 * GMP's terminology.
27 *
28 * The value of an felem (field element) is:
29 * x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
30 *
31 * That is, each limb is alternately 29 or 28-bits wide in little-endian
32 * order.
33 *
34 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
35 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
36 * when multiplying as terms end up one bit short of a limb which would require
37 * much bit-shifting to correct.
38 *
39 * Finally, the values stored in an felem are in Montgomery form. So the value
40 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
41 */
42 typedef u32 limb;
43 #define NLIMBS 9
44 typedef limb felem[NLIMBS];
45
46 static const limb kBottom28Bits = 0xfffffff;
47 static const limb kBottom29Bits = 0x1fffffff;
48
49 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
50 * 28-bit words.
51 */
52 static const felem kOne = {
53 2, 0, 0, 0xffff800,
54 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
55 0
56 };
57 static const felem kZero = {0};
58 static const felem kP = {
59 0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
60 0, 0, 0x200000, 0xf000000,
61 0xfffffff
62 };
63 static const felem k2P = {
64 0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
65 0, 0, 0x400000, 0xe000000,
66 0x1fffffff
67 };
68
69 /* kPrecomputed contains precomputed values to aid the calculation of scalar
70 * multiples of the base point, G. It's actually two, equal length, tables
71 * concatenated.
72 *
73 * The first table contains (x,y) felem pairs for 16 multiples of the base
74 * point, G.
75 *
76 * Index | Index (binary) | Value
77 * 0 | 0000 | 0G (all zeros, omitted)
78 * 1 | 0001 | G
79 * 2 | 0010 | 2**64G
80 * 3 | 0011 | 2**64G + G
81 * 4 | 0100 | 2**128G
82 * 5 | 0101 | 2**128G + G
83 * 6 | 0110 | 2**128G + 2**64G
84 * 7 | 0111 | 2**128G + 2**64G + G
85 * 8 | 1000 | 2**192G
86 * 9 | 1001 | 2**192G + G
87 * 10 | 1010 | 2**192G + 2**64G
88 * 11 | 1011 | 2**192G + 2**64G + G
89 * 12 | 1100 | 2**192G + 2**128G
90 * 13 | 1101 | 2**192G + 2**128G + G
91 * 14 | 1110 | 2**192G + 2**128G + 2**64G
92 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
93 *
94 * The second table follows the same style, but the terms are 2**32G,
95 * 2**96G, 2**160G, 2**224G.
96 *
97 * This is ~2KB of data.
98 */
99 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
100 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7e dc, 0xd4a6eab, 0x3120bee,
101 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba 21, 0x14b10bb, 0xae3fe3,
102 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe4907 3, 0x3fa36cc, 0x5ebcd2c,
103 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea1244 6, 0xe1ade1e, 0xec91f22,
104 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109 , 0xa267a00, 0xb57c050,
105 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0 x7d6dee7, 0x2976e4b,
106 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a 5a9, 0x843a649, 0xc3ab0fa,
107 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11 , 0x58c43df, 0xf423fc2,
108 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db4 0f, 0x83e277d, 0xb0dd609,
109 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5 , 0xe10c9e, 0x33ab581,
110 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f , 0x48764cd, 0x76dbcca,
111 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b2 0, 0x4ba3173, 0xc168c33,
112 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0 , 0x65dd7ff, 0x3a1e4f6,
113 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f07 7, 0xa6add89, 0x4894acd,
114 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
115 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c , 0xda0cf5b, 0x812e881,
116 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51 , 0xc22be3e, 0xe35e65a,
117 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9 , 0x1c5a839, 0x47a1e26,
118 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c5 02, 0x2f32042, 0xa17769b,
119 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a 02, 0x3fc93, 0x5620023,
120 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c , 0x407f75c, 0xbaab133,
121 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea 7, 0x3293ac0, 0xcdc98aa,
122 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
123 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72 , 0x73e1c35, 0xee70fbc,
124 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85 , 0x27de188, 0x66f70b8,
125 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae 914, 0x2f3ec51, 0x3826b59,
126 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x 823d9d2, 0x8213f39,
127 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4 a, 0xf5ddc3d, 0x3786689,
128 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a72 9, 0x4be3499, 0x52b23aa,
129 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb0480 35, 0xe31de66, 0xc6ecaa3,
130 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a752 9, 0xcb7beb1, 0xb2a78a1,
131 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff6 58, 0xe3d6511, 0xc7d76f,
132 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
133 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d3241 1, 0xb04a838, 0xd760d2d,
134 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11 e, 0x20bca9a, 0x66f496b,
135 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
136 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56 ff, 0x65ef930, 0x21dc4a,
137 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15 f, 0x624e62e, 0xa90ae2f,
138 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522 b, 0xdc78583, 0x40eeabb,
139 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef3 4, 0xae2a960, 0x91b8bdc,
140 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0 x2413c8e, 0x5425bf9,
141 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633 , 0x7c91952, 0xd806dce,
142 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef7 3, 0x8956f34, 0xe4b5cf2,
143 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7 , 0x627b614, 0x7371cca,
144 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc 9, 0x9c19bf2, 0x5882229,
145 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b 3, 0xe85ff25, 0x408ef57,
146 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113 , 0xa4a1769, 0x11fbc6c,
147 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b 7, 0x4acbad9, 0x5efc5fa,
148 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc , 0x7bf0fa9, 0x957651,
149 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
150 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c1 2d, 0xf20bd46, 0x1951fa7,
151 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74 , 0x99bb618, 0x2db944c,
152 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e7477 9, 0x576138, 0x9587927,
153 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782 d, 0xfc72e0b, 0x701b298,
154 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5 d8, 0xf858d3a, 0x942eea8,
155 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a 1, 0x8395659, 0x52ed4e2,
156 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c 0, 0x6bdf55a, 0x4e4457d,
157 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747 b, 0x878558d, 0x7d29aa4,
158 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8,
159 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78,
160 };
161
162 /* Field element operations:
163 */
164
165 /* NON_ZERO_TO_ALL_ONES returns:
166 * 0xffffffff for 0 < x <= 2**31
167 * 0 for x == 0 or x > 2**31.
168 *
169 * This macro assumes that right-shifting a signed number shifts in the MSB on
170 * the left. This is not ensured by the C standard, but is true on the CPUs
171 * that we're targetting with this code (x86 and ARM).
172 */
173 #define NON_ZERO_TO_ALL_ONES(x) (~((u32) (((s32) ((x)-1)) >> 31)))
174
175 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
176 * which is a term at 2**257.
177 *
178 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
179 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
180 */
181 static void felem_reduce_carry(felem inout, limb carry)
182 {
183 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);
184
185 inout[0] += carry << 1;
186 inout[3] += 0x10000000 & carry_mask;
187 /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
188 * previous line therefore this doesn't underflow.
189 */
190 inout[3] -= carry << 11;
191 inout[4] += (0x20000000 - 1) & carry_mask;
192 inout[5] += (0x10000000 - 1) & carry_mask;
193 inout[6] += (0x20000000 - 1) & carry_mask;
194 inout[6] -= carry << 22;
195 /* This may underflow if carry is non-zero but, if so, we'll fix it in the
196 * next line.
197 */
198 inout[7] -= 1 & carry_mask;
199 inout[7] += carry << 25;
200 }
201
202 /* felem_sum sets out = in+in2.
203 *
204 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
205 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
206 */
207 static void felem_sum(felem out, const felem in, const felem in2)
208 {
209 limb carry = 0;
210 unsigned int i;
211 for (i = 0;; i++) {
212 out[i] = in[i] + in2[i];
213 out[i] += carry;
214 carry = out[i] >> 29;
215 out[i] &= kBottom29Bits;
216
217 i++;
218 if (i == NLIMBS)
219 break;
220
221 out[i] = in[i] + in2[i];
222 out[i] += carry;
223 carry = out[i] >> 28;
224 out[i] &= kBottom28Bits;
225 }
226
227 felem_reduce_carry(out, carry);
228 }
229
230 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
231 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
232 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
233 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
234 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
235 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
236
237 /* zero31 is 0 mod p.
238 */
239 static const felem zero31 = {
240 two31m3, two30m2, two31m2, two30p13m2,
241 two31m2, two30m2, two31p24m2, two30m27m2,
242 two31m2
243 };
244
245 /* felem_diff sets out = in-in2.
246 *
247 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
248 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
249 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
250 */
251 static void felem_diff(felem out, const felem in, const felem in2)
252 {
253 limb carry = 0;
254 unsigned int i;
255
256 for (i = 0;; i++) {
257 out[i] = in[i] - in2[i];
258 out[i] += zero31[i];
259 out[i] += carry;
260 carry = out[i] >> 29;
261 out[i] &= kBottom29Bits;
262
263 i++;
264 if (i == NLIMBS)
265 break;
266
267 out[i] = in[i] - in2[i];
268 out[i] += zero31[i];
269 out[i] += carry;
270 carry = out[i] >> 28;
271 out[i] &= kBottom28Bits;
272 }
273
274 felem_reduce_carry(out, carry);
275 }
276
277 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
278 * with the same 29,28,... bit positions as an felem.
279 *
280 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
281 * Since we just multiplied two Montgomery values together, the result is
282 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
283 * in Montgomery form.
284 *
285 * On entry: tmp[i] < 2**64
286 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
287 */
288 static void felem_reduce_degree(felem out, u64 tmp[17])
289 {
290 /* The following table may be helpful when reading this code:
291 *
292 * Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
293 * Width (bits): 29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
294 * Start bit: 0 | 29| 57| 86|114|143|171|200|228|257|285
295 * (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
296 */
297 limb tmp2[18], carry, x, xMask;
298 unsigned int i;
299
300 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
301 * felem. So the top of an element of tmp might overlap with another
302 * element two positions down. The following loop eliminates this
303 * overlap.
304 */
305 tmp2[0] = tmp[0] & kBottom29Bits;
306
307 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
308 * and hint to the compiler that it can do a single-word shift by selecting
309 * the right register rather than doing a double-word shift and truncating
310 * afterwards.
311 */
312 tmp2[1] = ((limb) tmp[0]) >> 29;
313 tmp2[1] |= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits;
314 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
315 carry = tmp2[1] >> 28;
316 tmp2[1] &= kBottom28Bits;
317
318 for (i = 2; i < 17; i++) {
319 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
320 tmp2[i] += ((limb) (tmp[i - 1])) >> 28;
321 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
322 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
323 tmp2[i] += carry;
324 carry = tmp2[i] >> 29;
325 tmp2[i] &= kBottom29Bits;
326
327 i++;
328 if (i == 17)
329 break;
330 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
331 tmp2[i] += ((limb) (tmp[i - 1])) >> 29;
332 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
333 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
334 tmp2[i] += carry;
335 carry = tmp2[i] >> 28;
336 tmp2[i] &= kBottom28Bits;
337 }
338
339 tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25;
340 tmp2[17] += ((limb) (tmp[16])) >> 29;
341 tmp2[17] += (((limb) (tmp[16] >> 32)) << 3);
342 tmp2[17] += carry;
343
344 /* Montgomery elimination of terms:
345 *
346 * Since R is 2**257, we can divide by R with a bitwise shift if we can
347 * ensure that the right-most 257 bits are all zero. We can make that true
348 * by adding multiplies of p without affecting the value.
349 *
350 * So we eliminate limbs from right to left. Since the bottom 29 bits of p
351 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
352 * We can do that for 8 further limbs and then right shift to eliminate the
353 * extra factor of R.
354 */
355 for (i = 0;; i += 2) {
356 tmp2[i + 1] += tmp2[i] >> 29;
357 x = tmp2[i] & kBottom29Bits;
358 xMask = NON_ZERO_TO_ALL_ONES(x);
359 tmp2[i] = 0;
360
361 /* The bounds calculations for this loop are tricky. Each iteration of
362 * the loop eliminates two words by adding values to words to their
363 * right.
364 *
365 * The following table contains the amounts added to each word (as an
366 * offset from the value of i at the top of the loop). The amounts are
367 * accounted for from the first and second half of the loop separately
368 * and are written as, for example, 28 to mean a value <2**28.
369 *
370 * Word: 3 4 5 6 7 8 9 10
371 * Added in top half: 28 11 29 21 29 28
372 * 28 29
373 * 29
374 * Added in bottom half: 29 10 28 21 28 28
375 * 29
376 *
377 * The value that is currently offset 7 will be offset 5 for the next
378 * iteration and then offset 3 for the iteration after that. Therefore
379 * the total value added will be the values added at 7, 5 and 3.
380 *
381 * The following table accumulates these values. The sums at the bottom
382 * are written as, for example, 29+28, to mean a value < 2**29+2**28.
383 *
384 * Word: 3 4 5 6 7 8 9 10 11 12 13
385 * 28 11 10 29 21 29 28 28 28 28 28
386 * 29 28 11 28 29 28 29 28 29 28
387 * 29 28 21 21 29 21 29 21
388 * 10 29 28 21 28 21 28
389 * 28 29 28 29 28 29 28
390 * 11 10 29 10 29 10
391 * 29 28 11 28 11
392 * 29 29
393 * --------------------------------------------
394 * 30+ 31+ 30+ 31+ 30+
395 * 28+ 29+ 28+ 29+ 21+
396 * 21+ 28+ 21+ 28+ 10
397 * 10 21+ 10 21+
398 * 11 11
399 *
400 * So the greatest amount is added to tmp2[10] and tmp2[12]. If
401 * tmp2[10/12] has an initial value of <2**29, then the maximum value
402 * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
403 * as required.
404 */
405 tmp2[i + 3] += (x << 10) & kBottom28Bits;
406 tmp2[i + 4] += (x >> 18);
407
408 tmp2[i + 6] += (x << 21) & kBottom29Bits;
409 tmp2[i + 7] += x >> 8;
410
411 /* At position 200, which is the starting bit position for word 7, we
412 * have a factor of 0xf000000 = 2**28 - 2**24.
413 */
414 tmp2[i + 7] += 0x10000000 & xMask;
415 /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
416 tmp2[i + 8] += (x - 1) & xMask;
417 tmp2[i + 7] -= (x << 24) & kBottom28Bits;
418 tmp2[i + 8] -= x >> 4;
419
420 tmp2[i + 8] += 0x20000000 & xMask;
421 tmp2[i + 8] -= x;
422 tmp2[i + 8] += (x << 28) & kBottom29Bits;
423 tmp2[i + 9] += ((x >> 1) - 1) & xMask;
424
425 if (i+1 == NLIMBS)
426 break;
427 tmp2[i + 2] += tmp2[i + 1] >> 28;
428 x = tmp2[i + 1] & kBottom28Bits;
429 xMask = NON_ZERO_TO_ALL_ONES(x);
430 tmp2[i + 1] = 0;
431
432 tmp2[i + 4] += (x << 11) & kBottom29Bits;
433 tmp2[i + 5] += (x >> 18);
434
435 tmp2[i + 7] += (x << 21) & kBottom28Bits;
436 tmp2[i + 8] += x >> 7;
437
438 /* At position 199, which is the starting bit of the 8th word when
439 * dealing with a context starting on an odd word, we have a factor of
440 * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
441 * word from i+1 is i+8.
442 */
443 tmp2[i + 8] += 0x20000000 & xMask;
444 tmp2[i + 9] += (x - 1) & xMask;
445 tmp2[i + 8] -= (x << 25) & kBottom29Bits;
446 tmp2[i + 9] -= x >> 4;
447
448 tmp2[i + 9] += 0x10000000 & xMask;
449 tmp2[i + 9] -= x;
450 tmp2[i + 10] += (x - 1) & xMask;
451 }
452
453 /* We merge the right shift with a carry chain. The words above 2**257 have
454 * widths of 28,29,... which we need to correct when copying them down.
455 */
456 carry = 0;
457 for (i = 0; i < 8; i++) {
458 /* The maximum value of tmp2[i + 9] occurs on the first iteration and
459 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
460 * therefore safe.
461 */
462 out[i] = tmp2[i + 9];
463 out[i] += carry;
464 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
465 carry = out[i] >> 29;
466 out[i] &= kBottom29Bits;
467
468 i++;
469 out[i] = tmp2[i + 9] >> 1;
470 out[i] += carry;
471 carry = out[i] >> 28;
472 out[i] &= kBottom28Bits;
473 }
474
475 out[8] = tmp2[17];
476 out[8] += carry;
477 carry = out[8] >> 29;
478 out[8] &= kBottom29Bits;
479
480 felem_reduce_carry(out, carry);
481 }
482
483 /* felem_square sets out=in*in.
484 *
485 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
486 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
487 */
488 static void felem_square(felem out, const felem in)
489 {
490 u64 tmp[17];
491
492 tmp[0] = ((u64) in[0]) * in[0];
493 tmp[1] = ((u64) in[0]) * (in[1] << 1);
494 tmp[2] = ((u64) in[0]) * (in[2] << 1) +
495 ((u64) in[1]) * (in[1] << 1);
496 tmp[3] = ((u64) in[0]) * (in[3] << 1) +
497 ((u64) in[1]) * (in[2] << 1);
498 tmp[4] = ((u64) in[0]) * (in[4] << 1) +
499 ((u64) in[1]) * (in[3] << 2) +
500 ((u64) in[2]) * in[2];
501 tmp[5] = ((u64) in[0]) * (in[5] << 1) +
502 ((u64) in[1]) * (in[4] << 1) +
503 ((u64) in[2]) * (in[3] << 1);
504 tmp[6] = ((u64) in[0]) * (in[6] << 1) +
505 ((u64) in[1]) * (in[5] << 2) +
506 ((u64) in[2]) * (in[4] << 1) +
507 ((u64) in[3]) * (in[3] << 1);
508 tmp[7] = ((u64) in[0]) * (in[7] << 1) +
509 ((u64) in[1]) * (in[6] << 1) +
510 ((u64) in[2]) * (in[5] << 1) +
511 ((u64) in[3]) * (in[4] << 1);
512 /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
513 * which is < 2**64 as required.
514 */
515 tmp[8] = ((u64) in[0]) * (in[8] << 1) +
516 ((u64) in[1]) * (in[7] << 2) +
517 ((u64) in[2]) * (in[6] << 1) +
518 ((u64) in[3]) * (in[5] << 2) +
519 ((u64) in[4]) * in[4];
520 tmp[9] = ((u64) in[1]) * (in[8] << 1) +
521 ((u64) in[2]) * (in[7] << 1) +
522 ((u64) in[3]) * (in[6] << 1) +
523 ((u64) in[4]) * (in[5] << 1);
524 tmp[10] = ((u64) in[2]) * (in[8] << 1) +
525 ((u64) in[3]) * (in[7] << 2) +
526 ((u64) in[4]) * (in[6] << 1) +
527 ((u64) in[5]) * (in[5] << 1);
528 tmp[11] = ((u64) in[3]) * (in[8] << 1) +
529 ((u64) in[4]) * (in[7] << 1) +
530 ((u64) in[5]) * (in[6] << 1);
531 tmp[12] = ((u64) in[4]) * (in[8] << 1) +
532 ((u64) in[5]) * (in[7] << 2) +
533 ((u64) in[6]) * in[6];
534 tmp[13] = ((u64) in[5]) * (in[8] << 1) +
535 ((u64) in[6]) * (in[7] << 1);
536 tmp[14] = ((u64) in[6]) * (in[8] << 1) +
537 ((u64) in[7]) * (in[7] << 1);
538 tmp[15] = ((u64) in[7]) * (in[8] << 1);
539 tmp[16] = ((u64) in[8]) * in[8];
540
541 felem_reduce_degree(out, tmp);
542 }
543
544 /* felem_mul sets out=in*in2.
545 *
546 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
547 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
548 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
549 */
550 static void felem_mul(felem out, const felem in, const felem in2)
551 {
552 u64 tmp[17];
553
554 tmp[0] = ((u64) in[0]) * in2[0];
555 tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
556 ((u64) in[1]) * (in2[0] << 0);
557 tmp[2] = ((u64) in[0]) * (in2[2] << 0) +
558 ((u64) in[1]) * (in2[1] << 1) +
559 ((u64) in[2]) * (in2[0] << 0);
560 tmp[3] = ((u64) in[0]) * (in2[3] << 0) +
561 ((u64) in[1]) * (in2[2] << 0) +
562 ((u64) in[2]) * (in2[1] << 0) +
563 ((u64) in[3]) * (in2[0] << 0);
564 tmp[4] = ((u64) in[0]) * (in2[4] << 0) +
565 ((u64) in[1]) * (in2[3] << 1) +
566 ((u64) in[2]) * (in2[2] << 0) +
567 ((u64) in[3]) * (in2[1] << 1) +
568 ((u64) in[4]) * (in2[0] << 0);
569 tmp[5] = ((u64) in[0]) * (in2[5] << 0) +
570 ((u64) in[1]) * (in2[4] << 0) +
571 ((u64) in[2]) * (in2[3] << 0) +
572 ((u64) in[3]) * (in2[2] << 0) +
573 ((u64) in[4]) * (in2[1] << 0) +
574 ((u64) in[5]) * (in2[0] << 0);
575 tmp[6] = ((u64) in[0]) * (in2[6] << 0) +
576 ((u64) in[1]) * (in2[5] << 1) +
577 ((u64) in[2]) * (in2[4] << 0) +
578 ((u64) in[3]) * (in2[3] << 1) +
579 ((u64) in[4]) * (in2[2] << 0) +
580 ((u64) in[5]) * (in2[1] << 1) +
581 ((u64) in[6]) * (in2[0] << 0);
582 tmp[7] = ((u64) in[0]) * (in2[7] << 0) +
583 ((u64) in[1]) * (in2[6] << 0) +
584 ((u64) in[2]) * (in2[5] << 0) +
585 ((u64) in[3]) * (in2[4] << 0) +
586 ((u64) in[4]) * (in2[3] << 0) +
587 ((u64) in[5]) * (in2[2] << 0) +
588 ((u64) in[6]) * (in2[1] << 0) +
589 ((u64) in[7]) * (in2[0] << 0);
590 /* tmp[8] has the greatest value but doesn't overflow. See logic in
591 * felem_square.
592 */
593 tmp[8] = ((u64) in[0]) * (in2[8] << 0) +
594 ((u64) in[1]) * (in2[7] << 1) +
595 ((u64) in[2]) * (in2[6] << 0) +
596 ((u64) in[3]) * (in2[5] << 1) +
597 ((u64) in[4]) * (in2[4] << 0) +
598 ((u64) in[5]) * (in2[3] << 1) +
599 ((u64) in[6]) * (in2[2] << 0) +
600 ((u64) in[7]) * (in2[1] << 1) +
601 ((u64) in[8]) * (in2[0] << 0);
602 tmp[9] = ((u64) in[1]) * (in2[8] << 0) +
603 ((u64) in[2]) * (in2[7] << 0) +
604 ((u64) in[3]) * (in2[6] << 0) +
605 ((u64) in[4]) * (in2[5] << 0) +
606 ((u64) in[5]) * (in2[4] << 0) +
607 ((u64) in[6]) * (in2[3] << 0) +
608 ((u64) in[7]) * (in2[2] << 0) +
609 ((u64) in[8]) * (in2[1] << 0);
610 tmp[10] = ((u64) in[2]) * (in2[8] << 0) +
611 ((u64) in[3]) * (in2[7] << 1) +
612 ((u64) in[4]) * (in2[6] << 0) +
613 ((u64) in[5]) * (in2[5] << 1) +
614 ((u64) in[6]) * (in2[4] << 0) +
615 ((u64) in[7]) * (in2[3] << 1) +
616 ((u64) in[8]) * (in2[2] << 0);
617 tmp[11] = ((u64) in[3]) * (in2[8] << 0) +
618 ((u64) in[4]) * (in2[7] << 0) +
619 ((u64) in[5]) * (in2[6] << 0) +
620 ((u64) in[6]) * (in2[5] << 0) +
621 ((u64) in[7]) * (in2[4] << 0) +
622 ((u64) in[8]) * (in2[3] << 0);
623 tmp[12] = ((u64) in[4]) * (in2[8] << 0) +
624 ((u64) in[5]) * (in2[7] << 1) +
625 ((u64) in[6]) * (in2[6] << 0) +
626 ((u64) in[7]) * (in2[5] << 1) +
627 ((u64) in[8]) * (in2[4] << 0);
628 tmp[13] = ((u64) in[5]) * (in2[8] << 0) +
629 ((u64) in[6]) * (in2[7] << 0) +
630 ((u64) in[7]) * (in2[6] << 0) +
631 ((u64) in[8]) * (in2[5] << 0);
632 tmp[14] = ((u64) in[6]) * (in2[8] << 0) +
633 ((u64) in[7]) * (in2[7] << 1) +
634 ((u64) in[8]) * (in2[6] << 0);
635 tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
636 ((u64) in[8]) * (in2[7] << 0);
637 tmp[16] = ((u64) in[8]) * (in2[8] << 0);
638
639 felem_reduce_degree(out, tmp);
640 }
641
642 static void felem_assign(felem out, const felem in)
643 {
644 memcpy(out, in, sizeof(felem));
645 }
646
647 /* felem_inv calculates |out| = |in|^{-1}
648 *
649 * Based on Fermat's Little Theorem:
650 * a^p = a (mod p)
651 * a^{p-1} = 1 (mod p)
652 * a^{p-2} = a^{-1} (mod p)
653 */
654 static void felem_inv(felem out, const felem in)
655 {
656 felem ftmp, ftmp2;
657 /* each e_I will hold |in|^{2^I - 1} */
658 felem e2, e4, e8, e16, e32, e64;
659 unsigned int i;
660
661 felem_square(ftmp, in); /* 2^1 */
662 felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
663 felem_assign(e2, ftmp);
664 felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
665 felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
666 felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
667 felem_assign(e4, ftmp);
668 felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
669 felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
670 felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
671 felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
672 felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
673 felem_assign(e8, ftmp);
674 for (i = 0; i < 8; i++) {
675 felem_square(ftmp, ftmp);
676 } /* 2^16 - 2^8 */
677 felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
678 felem_assign(e16, ftmp);
679 for (i = 0; i < 16; i++) {
680 felem_square(ftmp, ftmp);
681 } /* 2^32 - 2^16 */
682 felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
683 felem_assign(e32, ftmp);
684 for (i = 0; i < 32; i++) {
685 felem_square(ftmp, ftmp);
686 } /* 2^64 - 2^32 */
687 felem_assign(e64, ftmp);
688 felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
689 for (i = 0; i < 192; i++) {
690 felem_square(ftmp, ftmp);
691 } /* 2^256 - 2^224 + 2^192 */
692
693 felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
694 for (i = 0; i < 16; i++) {
695 felem_square(ftmp2, ftmp2);
696 } /* 2^80 - 2^16 */
697 felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
698 for (i = 0; i < 8; i++) {
699 felem_square(ftmp2, ftmp2);
700 } /* 2^88 - 2^8 */
701 felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
702 for (i = 0; i < 4; i++) {
703 felem_square(ftmp2, ftmp2);
704 } /* 2^92 - 2^4 */
705 felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
706 felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
707 felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
708 felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
709 felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
710 felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
711 felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */
712
713 felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
714 }
715
716 /* felem_scalar_3 sets out=3*out.
717 *
718 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
719 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
720 */
721 static void felem_scalar_3(felem out)
722 {
723 limb carry = 0;
724 unsigned int i;
725
726 for (i = 0;; i++) {
727 out[i] *= 3;
728 out[i] += carry;
729 carry = out[i] >> 29;
730 out[i] &= kBottom29Bits;
731
732 i++;
733 if (i == NLIMBS)
734 break;
735
736 out[i] *= 3;
737 out[i] += carry;
738 carry = out[i] >> 28;
739 out[i] &= kBottom28Bits;
740 }
741
742 felem_reduce_carry(out, carry);
743 }
744
745 /* felem_scalar_4 sets out=4*out.
746 *
747 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
748 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
749 */
750 static void felem_scalar_4(felem out)
751 {
752 limb carry = 0, next_carry;
753 unsigned int i;
754
755 for (i = 0;; i++) {
756 next_carry = out[i] >> 27;
757 out[i] <<= 2;
758 out[i] &= kBottom29Bits;
759 out[i] += carry;
760 carry = next_carry + (out[i] >> 29);
761 out[i] &= kBottom29Bits;
762
763 i++;
764 if (i == NLIMBS)
765 break;
766 next_carry = out[i] >> 26;
767 out[i] <<= 2;
768 out[i] &= kBottom28Bits;
769 out[i] += carry;
770 carry = next_carry + (out[i] >> 28);
771 out[i] &= kBottom28Bits;
772 }
773
774 felem_reduce_carry(out, carry);
775 }
776
777 /* felem_scalar_8 sets out=8*out.
778 *
779 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
780 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
781 */
782 static void felem_scalar_8(felem out)
783 {
784 limb carry = 0, next_carry;
785 unsigned int i;
786
787 for (i = 0;; i++) {
788 next_carry = out[i] >> 26;
789 out[i] <<= 3;
790 out[i] &= kBottom29Bits;
791 out[i] += carry;
792 carry = next_carry + (out[i] >> 29);
793 out[i] &= kBottom29Bits;
794
795 i++;
796 if (i == NLIMBS)
797 break;
798 next_carry = out[i] >> 25;
799 out[i] <<= 3;
800 out[i] &= kBottom28Bits;
801 out[i] += carry;
802 carry = next_carry + (out[i] >> 28);
803 out[i] &= kBottom28Bits;
804 }
805
806 felem_reduce_carry(out, carry);
807 }
808
809 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
810 * time depending on the value of |in|.
811 */
812 static char felem_is_zero_vartime(const felem in)
813 {
814 limb carry;
815 int i;
816 limb tmp[NLIMBS];
817 felem_assign(tmp, in);
818
819 /* First, reduce tmp to a minimal form.
820 */
821 do {
822 carry = 0;
823 for (i = 0;; i++) {
824 tmp[i] += carry;
825 carry = tmp[i] >> 29;
826 tmp[i] &= kBottom29Bits;
827
828 i++;
829 if (i == NLIMBS)
830 break;
831
832 tmp[i] += carry;
833 carry = tmp[i] >> 28;
834 tmp[i] &= kBottom28Bits;
835 }
836
837 felem_reduce_carry(tmp, carry);
838 } while (carry);
839
840 /* tmp < 2**257, so the only possible zero values are 0, p and 2p.
841 */
842 return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
843 memcmp(tmp, kP, sizeof(tmp)) == 0 ||
844 memcmp(tmp, k2P, sizeof(tmp)) == 0;
845 }
846
847 /* Group operations:
848 *
849 * Elements of the elliptic curve group are represented in Jacobian
850 * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
851 * Jacobian form.
852 */
853
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * Uses the "dbl-2009-l" doubling formulas; see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
 */
static void point_double(felem x_out, felem y_out, felem z_out,
                         const felem x, const felem y, const felem z)
{
    felem delta, gamma, alpha, beta, tmp, tmp2;

    felem_square(delta, z);              /* delta = Z**2 */
    felem_square(gamma, y);              /* gamma = Y**2 */
    felem_mul(beta, x, gamma);           /* beta = X*gamma */

    /* alpha = 3*(X - delta)*(X + delta) */
    felem_sum(tmp, x, delta);
    felem_diff(tmp2, x, delta);
    felem_mul(alpha, tmp, tmp2);
    felem_scalar_3(alpha);

    /* Z' = (Y + Z)**2 - gamma - delta (= 2*Y*Z) */
    felem_sum(tmp, y, z);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, gamma);
    felem_diff(z_out, tmp, delta);

    /* X' = alpha**2 - 8*beta. From here on, beta holds 4*beta. */
    felem_scalar_4(beta);
    felem_square(x_out, alpha);
    felem_diff(x_out, x_out, beta);
    felem_diff(x_out, x_out, beta);

    /* Y' = alpha*(4*beta - X') - 8*gamma**2 */
    felem_diff(tmp, beta, x_out);
    felem_mul(tmp, alpha, tmp);
    felem_square(tmp2, gamma);
    felem_scalar_8(tmp2);
    felem_diff(y_out, tmp, tmp2);
}
888
/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
 * (i.e. the second point is affine.)
 *
 * Uses the "add-2007-bl" addition formulas; see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly.
 */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
                            const felem x1, const felem y1, const felem z1,
                            const felem x2, const felem y2)
{
    felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);              /* z1z1 = Z1**2 */
    felem_sum(tmp, z1, z1);              /* tmp = 2*Z1 */

    felem_mul(u2, x2, z1z1);             /* u2 = X2*Z1**2 */
    felem_mul(z1z1z1, z1, z1z1);         /* z1z1z1 = Z1**3 */
    felem_mul(s2, y2, z1z1z1);           /* s2 = Y2*Z1**3 */
    felem_diff(h, u2, x1);               /* h = u2 - X1 */
    felem_sum(i, h, h);
    felem_square(i, i);                  /* i = (2*h)**2 */
    felem_mul(j, h, i);                  /* j = h*i */
    felem_diff(r, s2, y1);
    felem_sum(r, r, r);                  /* r = 2*(s2 - Y1) */
    felem_mul(v, x1, i);                 /* v = X1*i */

    felem_mul(z_out, tmp, h);            /* Z3 = 2*Z1*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);         /* X3 = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, y1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);       /* Y3 = r*(v - X3) - 2*Y1*j */
}
929
/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * Uses the "add-2007-bl" addition formulas; see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly.
 */
static void point_add(felem x_out, felem y_out, felem z_out,
                      const felem x1, const felem y1, const felem z1,
                      const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);              /* z1z1 = Z1**2 */
    felem_square(z2z2, z2);              /* z2z2 = Z2**2 */
    felem_mul(u1, x1, z2z2);             /* u1 = X1*Z2**2 */

    /* tmp = (Z1 + Z2)**2 - Z1**2 - Z2**2 = 2*Z1*Z2 */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);         /* z2z2z2 = Z2**3 */
    felem_mul(s1, y1, z2z2z2);           /* s1 = Y1*Z2**3 */

    felem_mul(u2, x2, z1z1);             /* u2 = X2*Z1**2 */
    felem_mul(z1z1z1, z1, z1z1);         /* z1z1z1 = Z1**3 */
    felem_mul(s2, y2, z1z1z1);           /* s2 = Y2*Z1**3 */
    felem_diff(h, u2, u1);               /* h = u2 - u1 */
    felem_sum(i, h, h);
    felem_square(i, i);                  /* i = (2*h)**2 */
    felem_mul(j, h, i);                  /* j = h*i */
    felem_diff(r, s2, s1);
    felem_sum(r, r, r);                  /* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);                 /* v = u1*i */

    felem_mul(z_out, tmp, h);            /* Z3 = 2*Z1*Z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);         /* X3 = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);       /* Y3 = r*(v - X3) - 2*s1*j */
}
978
/* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
 * {x2,y2,z2}.
 *
 * Uses the "add-2007-bl" addition formulas; see
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * This function handles the case where {x1,y1,z1}={x2,y2,z2} by falling back
 * to point_double. As the name says, it runs in variable time (the equality
 * tests below leak which path was taken), so it must only be used with
 * public inputs.
 */
static void point_add_or_double_vartime(
    felem x_out, felem y_out, felem z_out,
    const felem x1, const felem y1, const felem z1,
    const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
    char x_equal, y_equal;

    felem_square(z1z1, z1);
    felem_square(z2z2, z2);
    felem_mul(u1, x1, z2z2);             /* u1 = X1*Z2**2 */

    /* tmp = (Z1 + Z2)**2 - Z1**2 - Z2**2 = 2*Z1*Z2 */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);
    felem_mul(s1, y1, z2z2z2);           /* s1 = Y1*Z2**3 */

    felem_mul(u2, x2, z1z1);             /* u2 = X2*Z1**2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);           /* s2 = Y2*Z1**3 */
    felem_diff(h, u2, u1);               /* h == 0 iff the affine x's match */
    x_equal = felem_is_zero_vartime(h);
    felem_sum(i, h, h);
    felem_square(i, i);
    felem_mul(j, h, i);
    felem_diff(r, s2, s1);               /* r == 0 (pre-doubling) iff the
                                          * affine y's match */
    y_equal = felem_is_zero_vartime(r);
    if (x_equal && y_equal) {
        /* Same point: the addition formulas degenerate (h == r == 0), so use
         * the doubling formulas instead. */
        point_double(x_out, y_out, z_out, x1, y1, z1);
        return;
    }
    felem_sum(r, r, r);                  /* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);

    felem_mul(z_out, tmp, h);            /* Z3 = 2*Z1*Z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);         /* X3 = r**2 - j - 2*v */

    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);       /* Y3 = r*(v - X3) - 2*s1*j */
}
1035
1036 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.
1037 *
1038 * On entry: mask is either 0 or 0xffffffff.
1039 */
1040 static void copy_conditional(felem out, const felem in, limb mask)
1041 {
1042 int i;
1043
1044 for (i = 0; i < NLIMBS; i++) {
1045 const limb tmp = mask & (in[i] ^ out[i]);
1046 out[i] ^= tmp;
1047 }
1048 }
1049
/* select_affine_point sets {out_x,out_y} to the index'th entry of table.
 * On entry: index < 16, table[0] must be zero.
 *
 * The lookup is constant time: every entry is read and masked, so the memory
 * access pattern is independent of the (secret) |index|. |table| holds the
 * entries for indices 1..15 consecutively, two felems (x then y) each; entry
 * 0 (the point at infinity) is implicit in the initial zeroing of the
 * outputs.
 */
static void select_affine_point(felem out_x, felem out_y,
                                const limb *table, limb index)
{
    limb i, j;

    memset(out_x, 0, sizeof(felem));
    memset(out_y, 0, sizeof(felem));

    for (i = 1; i < 16; i++) {
        /* Compute mask = (i == index) ? 0xffffffff : 0 without branching:
         * OR the four bits of i ^ index down into bit zero, then subtract
         * one so that only a zero difference yields all-ones. */
        limb mask = i ^ index;
        mask |= mask >> 2;
        mask |= mask >> 1;
        mask &= 1;
        mask--;
        for (j = 0; j < NLIMBS; j++, table++) {
            out_x[j] |= *table & mask;
        }
        for (j = 0; j < NLIMBS; j++, table++) {
            out_y[j] |= *table & mask;
        }
    }
}
1075
/* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
 * table. On entry: index < 16, table[0] must be zero.
 *
 * Like select_affine_point, the lookup is constant time: every entry is read
 * and masked so the memory access pattern does not depend on the (secret)
 * |index|. Here each entry is three felems (x, y, z) and the table does
 * contain entry 0.
 */
static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
                                  const limb *table, limb index)
{
    limb i, j;

    memset(out_x, 0, sizeof(felem));
    memset(out_y, 0, sizeof(felem));
    memset(out_z, 0, sizeof(felem));

    /* The implicit value at index 0 is all zero. We don't need to perform that
     * iteration of the loop because we already set out_* to zero.
     */
    table += 3*NLIMBS;

    for (i = 1; i < 16; i++) {
        /* mask = (i == index) ? 0xffffffff : 0, computed without branching;
         * see select_affine_point. */
        limb mask = i ^ index;
        mask |= mask >> 2;
        mask |= mask >> 1;
        mask &= 1;
        mask--;
        for (j = 0; j < NLIMBS; j++, table++) {
            out_x[j] |= *table & mask;
        }
        for (j = 0; j < NLIMBS; j++, table++) {
            out_y[j] |= *table & mask;
        }
        for (j = 0; j < NLIMBS; j++, table++) {
            out_z[j] |= *table & mask;
        }
    }
}
1110
1111 /* get_bit returns the bit'th bit of scalar. */
1112 static char get_bit(const u8 scalar[32], int bit)
1113 {
1114 return ((scalar[bit >> 3]) >> (bit & 7)) & 1;
1115 }
1116
1117 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
1118 * number. Note that the value of scalar must be less than the order of the
1119 * group.
1120 */
1121 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32])
1122 {
1123 int i, j;
1124 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
1125 u32 table_offset;
1126
1127 felem px, py;
1128 felem tx, ty, tz;
1129
1130 memset(nx, 0, sizeof(felem));
1131 memset(ny, 0, sizeof(felem));
1132 memset(nz, 0, sizeof(felem));
1133
1134 /* The loop adds bits at positions 0, 64, 128 and 192, followed by
1135 * positions 32,96,160 and 224 and does this 32 times.
1136 */
1137 for (i = 0; i < 32; i++) {
1138 if (i) {
1139 point_double(nx, ny, nz, nx, ny, nz);
1140 }
1141 for (j = 0; j <= 32; j += 32) {
1142 char bit0 = get_bit(scalar, 31 - i + j);
1143 char bit1 = get_bit(scalar, 95 - i + j);
1144 char bit2 = get_bit(scalar, 159 - i + j);
1145 char bit3 = get_bit(scalar, 223 - i + j);
1146 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);
1147
1148 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS);
1149 select_affine_point(px, py, kPrecomputed + table_offset, index);
1150
1151 /* Since scalar is less than the order of the group, we know that
1152 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
1153 * below.
1154 */
1155 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
1156 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
1157 * (a.k.a. the point at infinity). We handle that situation by
1158 * copying the point from the table.
1159 */
1160 copy_conditional(nx, px, n_is_infinity_mask);
1161 copy_conditional(ny, py, n_is_infinity_mask);
1162 copy_conditional(nz, kOne, n_is_infinity_mask);
1163
1164 /* Equally, the result is also wrong if the point from the table is
1165 * zero, which happens when the index is zero. We handle that by
1166 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0.
1167 */
1168 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
1169 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1170 copy_conditional(nx, tx, mask);
1171 copy_conditional(ny, ty, mask);
1172 copy_conditional(nz, tz, mask);
1173 /* If p was not zero, then n is now non-zero. */
1174 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1175 }
1176 }
1177 }
1178
1179 /* point_to_affine converts a Jacobian point to an affine point. If the input
1180 * is the point at infinity then it returns (0, 0) in constant time.
1181 */
1182 static void point_to_affine(felem x_out, felem y_out,
1183 const felem nx, const felem ny, const felem nz) {
1184 felem z_inv, z_inv_sq;
1185 felem_inv(z_inv, nz);
1186 felem_square(z_inv_sq, z_inv);
1187 felem_mul(x_out, nx, z_inv_sq);
1188 felem_mul(z_inv, z_inv, z_inv_sq);
1189 felem_mul(y_out, ny, z_inv);
1190 }
1191
1192 /* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */
1193 static void scalar_mult(felem nx, felem ny, felem nz,
1194 const felem x, const felem y, const u8 scalar[32])
1195 {
1196 int i;
1197 felem px, py, pz, tx, ty, tz;
1198 felem precomp[16][3];
1199 limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;
1200
1201 /* We precompute 0,1,2,... times {x,y}. */
1202 memset(precomp, 0, sizeof(felem) * 3);
1203 memcpy(&precomp[1][0], x, sizeof(felem));
1204 memcpy(&precomp[1][1], y, sizeof(felem));
1205 memcpy(&precomp[1][2], kOne, sizeof(felem));
1206
1207 for (i = 2; i < 16; i += 2) {
1208 point_double(precomp[i][0], precomp[i][1], precomp[i][2],
1209 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);
1210
1211 point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
1212 precomp[i][0], precomp[i][1], precomp[i][2], x, y);
1213 }
1214
1215 memset(nx, 0, sizeof(felem));
1216 memset(ny, 0, sizeof(felem));
1217 memset(nz, 0, sizeof(felem));
1218 n_is_infinity_mask = -1;
1219
1220 /* We add in a window of four bits each iteration and do this 64 times. */
1221 for (i = 0; i < 64; i++) {
1222 if (i) {
1223 point_double(nx, ny, nz, nx, ny, nz);
1224 point_double(nx, ny, nz, nx, ny, nz);
1225 point_double(nx, ny, nz, nx, ny, nz);
1226 point_double(nx, ny, nz, nx, ny, nz);
1227 }
1228
1229 index = scalar[31 - i / 2];
1230 if ((i & 1) == 1) {
1231 index &= 15;
1232 } else {
1233 index >>= 4;
1234 }
1235
1236 /* See the comments in scalar_base_mult about handling infinities. */
1237 select_jacobian_point(px, py, pz, (limb *) precomp, index);
1238 point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
1239 copy_conditional(nx, px, n_is_infinity_mask);
1240 copy_conditional(ny, py, n_is_infinity_mask);
1241 copy_conditional(nz, pz, n_is_infinity_mask);
1242
1243 p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31;
1244 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1245 copy_conditional(nx, tx, mask);
1246 copy_conditional(ny, ty, mask);
1247 copy_conditional(nz, tz, mask);
1248 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1249 }
1250 }
1251
1252 /* Interface with Freebl: */
1253
#if IS_BIG_ENDIAN
#error "This code needs a little-endian processor"
#endif

/* kRInvDigits is R**-1 mod p, where R = 2**257 is the Montgomery factor used
 * by this code (see to_montgomery/from_montgomery), as eight little-endian
 * 32-bit digits. */
static const u32 kRInvDigits[8] = {
    0x80000000, 1, 0xffffffff, 0,
    0x80000001, 0xfffffffe, 1, 0x7fffffff
};
#define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit))
static const mp_int kRInv = {
    MP_ZPOS,
    MP_DIGITS_IN_256_BITS,
    MP_DIGITS_IN_256_BITS,
    /* Because we are running on a little-endian processor, this cast works for
     * both 32 and 64-bit processors.
     */
    (mp_digit*) kRInvDigits
};

/* Powers of two used as divisors/multipliers when converting between the MPI
 * representation and the alternating 29/28-bit limb representation. */
static const limb kTwo28 = 0x10000000;
static const limb kTwo29 = 0x20000000;
1275
1276 /* to_montgomery sets out = R*in. */
1277 static mp_err to_montgomery(felem out, const mp_int *in, const ECGroup *group)
1278 {
1279 /* There are no MPI functions for bitshift operations and we wish to shift
1280 * in 257 bits left so we move the digits 256-bits left and then multiply
1281 * by two.
1282 */
1283 mp_int in_shifted;
1284 int i;
1285 mp_err res;
1286
1287 mp_init(&in_shifted);
1288 s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS);
1289 memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS),
1290 MP_DIGITS(in),
1291 MP_USED(in)*sizeof(mp_digit));
1292 mp_mul_2(&in_shifted, &in_shifted);
1293 MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth));
1294
1295 for (i = 0;; i++) {
1296 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits;
1297 mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL);
1298
1299 i++;
1300 if (i == NLIMBS)
1301 break;
1302 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits;
1303 mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL);
1304 }
1305
1306 CLEANUP:
1307 mp_clear(&in_shifted);
1308 return res;
1309 }
1310
1311 /* from_montgomery sets out=in/R. */
1312 static mp_err from_montgomery(mp_int *out, const felem in,
1313 const ECGroup *group)
1314 {
1315 mp_int result, tmp;
1316 mp_err res;
1317 int i;
1318
1319 mp_init(&result);
1320 mp_init(&tmp);
1321
1322 MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result));
1323 for (i = NLIMBS-2; i >= 0; i--) {
1324 if ((i & 1) == 0) {
1325 MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp));
1326 } else {
1327 MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp));
1328 }
1329 MP_CHECKOK(mp_add_d(&tmp, in[i], &result));
1330 }
1331
1332 MP_CHECKOK(mp_mul(&result, &kRInv, out));
1333 MP_CHECKOK(group->meth->field_mod(out, out, group->meth));
1334
1335 CLEANUP:
1336 mp_clear(&result);
1337 mp_clear(&tmp);
1338 return res;
1339 }
1340
/* scalar_from_mp_int sets out_scalar=n, as a little-endian byte string, where
 * n < the group order. */
static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n)
{
    /* We require that |n| is less than the order of the group and therefore it
     * will fit into |out_scalar|. However, there is a timing side-channel here
     * that we cannot avoid: if |n| is sufficiently small it may be one or more
     * words too short and we'll copy less data.
     *
     * NOTE(review): copying MP_USED(n) whole mp_digits assumes a little-endian
     * digit layout (enforced by the IS_BIG_ENDIAN check above) and that
     * MP_USED(n)*sizeof(mp_digit) <= 32, which follows from n being less than
     * the 256-bit group order.
     */
    memset(out_scalar, 0, 32);
    memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit));
}
1352
1353 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1354 * order of the group.
1355 */
1356 mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n,
1357 mp_int *out_x, mp_int *out_y,
1358 const ECGroup *group)
1359 {
1360 u8 scalar[32];
1361 felem x, y, z, x_affine, y_affine;
1362 mp_err res;
1363
1364 /* FIXME(agl): test that n < order. */
1365
1366 scalar_from_mp_int(scalar, n);
1367 scalar_base_mult(x, y, z, scalar);
1368 point_to_affine(x_affine, y_affine, x, y, z);
1369 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1370 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1371
1372 CLEANUP:
1373 return res;
1374 }
1375
1376 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
1377 * the order of the group.
1378 */
1379 mp_err ec_GFp_nistp256_point_mul(const mp_int *n,
1380 const mp_int *in_x, const mp_int *in_y,
1381 mp_int *out_x, mp_int *out_y,
1382 const ECGroup *group)
1383 {
1384 u8 scalar[32];
1385 felem x, y, z, x_affine, y_affine, px, py;
1386 mp_err res;
1387
1388 scalar_from_mp_int(scalar, n);
1389
1390 MP_CHECKOK(to_montgomery(px, in_x, group));
1391 MP_CHECKOK(to_montgomery(py, in_y, group));
1392
1393 scalar_mult(x, y, z, px, py, scalar);
1394 point_to_affine(x_affine, y_affine, x, y, z);
1395 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1396 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1397
1398 CLEANUP:
1399 return res;
1400 }
1401
/* ec_GFp_nistp256_points_mul_vartime sets {out_x,out_y} = n1*G +
 * n2*{in_x,in_y}, where n1 and n2 are < the order of the group.
 *
 * As indicated by the name, this function operates in variable time. This
 * is safe because it's used for signature validation which doesn't deal
 * with secrets.
 */
mp_err ec_GFp_nistp256_points_mul_vartime(
    const mp_int *n1, const mp_int *n2,
    const mp_int *in_x, const mp_int *in_y,
    mp_int *out_x, mp_int *out_y,
    const ECGroup *group)
{
    u8 scalar1[32], scalar2[32];
    felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py;
    mp_err res = MP_OKAY;

    /* If n2 == NULL, this is just a base-point multiplication. */
    if (n2 == NULL) {
        return ec_GFp_nistp256_base_point_mul(n1, out_x, out_y, group);
    }

    /* If n1 == NULL, this is just an arbitrary-point multiplication. */
    if (n1 == NULL) {
        return ec_GFp_nistp256_point_mul(n2, in_x, in_y, out_x, out_y, group);
    }

    /* If both scalars are zero, then the result is the point at infinity. */
    if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) {
        mp_zero(out_x);
        mp_zero(out_y);
        return res;
    }

    scalar_from_mp_int(scalar1, n1);
    scalar_from_mp_int(scalar2, n2);

    /* Compute the two halves of the sum: x1 = n1*G, x2 = n2*{in_x,in_y}. */
    MP_CHECKOK(to_montgomery(px, in_x, group));
    MP_CHECKOK(to_montgomery(py, in_y, group));
    scalar_base_mult(x1, y1, z1, scalar1);
    scalar_mult(x2, y2, z2, px, py, scalar2);

    if (mp_cmp_z(n2) == 0) {
        /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
         * {x1,y1,z1}. */
    } else if (mp_cmp_z(n1) == 0) {
        /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
         * {x2,y2,z2}. */
        memcpy(x1, x2, sizeof(x2));
        memcpy(y1, y2, sizeof(y2));
        memcpy(z1, z2, sizeof(z2));
    } else {
        /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
        point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
    }

    point_to_affine(x_affine, y_affine, x1, y1, z1);
    MP_CHECKOK(from_montgomery(out_x, x_affine, group));
    MP_CHECKOK(from_montgomery(out_y, y_affine, group));

CLEANUP:
    return res;
}
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698