Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(482)

Side by Side Diff: mozilla/security/nss/lib/freebl/ecl/ecp_256_32.c

Issue 12042100: Add an optimized 32-bit implementation of the NIST P-256 elliptic curve. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/nss/
Patch Set: Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5 /* A 32-bit implementation of the NIST P-256 elliptic curve. */
wtc 2013/01/25 02:32:49 Lines 1-5 used to be one comment block. I separate
6
7 #include <string.h>
8
9 #include "prtypes.h"
10 #include "mpi.h"
11 #include "mpi-priv.h"
12 #include "ecp.h"
13
14 typedef PRUint8 u8;
15 typedef PRUint32 u32;
16 typedef PRInt32 s32;
17 typedef PRUint64 u64;
18
19 /* Our field elements are represented as nine, unsigned 32-bit words. Freebl's
20 * MPI library calls them digits, but here they are called limbs, which is
21 * GMP's terminology.
22 *
23 * The value of an felem (field element) is:
24 * x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
25 *
26 * That is, each limb is alternately 29 or 28-bits wide in little-endian
27 * order.
28 *
29 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
30 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
31 * when multiplying as terms end up one bit short of a limb which would require
32 * much bit-shifting to correct.
33 *
34 * Finally, the values stored in an felem are in Montgomery form. So the value
35 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
36 */
37 typedef u32 limb;
38 #define NLIMBS 9
39 typedef limb felem[NLIMBS];
40
41 static const limb kBottom28Bits = 0xfffffff;
42 static const limb kBottom29Bits = 0x1fffffff;
43
44 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
45 * 28-bit words.
46 */
47 static const felem kOne = {
48 2, 0, 0, 0xffff800,
49 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
50 0
51 };
52 static const felem kZero = {0};
53 static const felem kP = {
54 0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
55 0, 0, 0x200000, 0xf000000,
56 0xfffffff
57 };
58 static const felem k2P = {
59 0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
60 0, 0, 0x400000, 0xe000000,
61 0x1fffffff
62 };
63
64 /* kPrecomputed contains precomputed values to aid the calculation of scalar
65 * multiples of the base point, G. It's actually two, equal length, tables
66 * concatenated.
67 *
68 * The first table contains (x,y) felem pairs for 16 multiples of the base
69 * point, G.
70 *
71 * Index | Index (binary) | Value
72 * 0 | 0000 | 0G (all zeros, omitted)
73 * 1 | 0001 | G
74 * 2 | 0010 | 2**64G
75 * 3 | 0011 | 2**64G + G
76 * 4 | 0100 | 2**128G
77 * 5 | 0101 | 2**128G + G
78 * 6 | 0110 | 2**128G + 2**64G
79 * 7 | 0111 | 2**128G + 2**64G + G
80 * 8 | 1000 | 2**192G
81 * 9 | 1001 | 2**192G + G
82 * 10 | 1010 | 2**192G + 2**64G
83 * 11 | 1011 | 2**192G + 2**64G + G
84 * 12 | 1100 | 2**192G + 2**128G
85 * 13 | 1101 | 2**192G + 2**128G + G
86 * 14 | 1110 | 2**192G + 2**128G + 2**64G
87 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
88 *
89 * The second table follows the same style, but the terms are 2**32G,
90 * 2**96G, 2**160G, 2**224G.
91 *
92 * This is ~2KB of data.
93 */
94 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
95 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7e dc, 0xd4a6eab, 0x3120bee,
96 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba 21, 0x14b10bb, 0xae3fe3,
97 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe4907 3, 0x3fa36cc, 0x5ebcd2c,
98 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea1244 6, 0xe1ade1e, 0xec91f22,
99 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109 , 0xa267a00, 0xb57c050,
100 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0 x7d6dee7, 0x2976e4b,
101 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a 5a9, 0x843a649, 0xc3ab0fa,
102 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11 , 0x58c43df, 0xf423fc2,
103 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db4 0f, 0x83e277d, 0xb0dd609,
104 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5 , 0xe10c9e, 0x33ab581,
105 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f , 0x48764cd, 0x76dbcca,
106 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b2 0, 0x4ba3173, 0xc168c33,
107 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0 , 0x65dd7ff, 0x3a1e4f6,
108 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f07 7, 0xa6add89, 0x4894acd,
109 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
110 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c , 0xda0cf5b, 0x812e881,
111 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51 , 0xc22be3e, 0xe35e65a,
112 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9 , 0x1c5a839, 0x47a1e26,
113 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c5 02, 0x2f32042, 0xa17769b,
114 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a 02, 0x3fc93, 0x5620023,
115 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c , 0x407f75c, 0xbaab133,
116 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea 7, 0x3293ac0, 0xcdc98aa,
117 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
118 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72 , 0x73e1c35, 0xee70fbc,
119 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85 , 0x27de188, 0x66f70b8,
120 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae 914, 0x2f3ec51, 0x3826b59,
121 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x 823d9d2, 0x8213f39,
122 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4 a, 0xf5ddc3d, 0x3786689,
123 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a72 9, 0x4be3499, 0x52b23aa,
124 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb0480 35, 0xe31de66, 0xc6ecaa3,
125 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a752 9, 0xcb7beb1, 0xb2a78a1,
126 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff6 58, 0xe3d6511, 0xc7d76f,
127 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
128 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d3241 1, 0xb04a838, 0xd760d2d,
129 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11 e, 0x20bca9a, 0x66f496b,
130 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
131 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56 ff, 0x65ef930, 0x21dc4a,
132 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15 f, 0x624e62e, 0xa90ae2f,
133 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522 b, 0xdc78583, 0x40eeabb,
134 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef3 4, 0xae2a960, 0x91b8bdc,
135 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0 x2413c8e, 0x5425bf9,
136 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633 , 0x7c91952, 0xd806dce,
137 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef7 3, 0x8956f34, 0xe4b5cf2,
138 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7 , 0x627b614, 0x7371cca,
139 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc 9, 0x9c19bf2, 0x5882229,
140 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b 3, 0xe85ff25, 0x408ef57,
141 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113 , 0xa4a1769, 0x11fbc6c,
142 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b 7, 0x4acbad9, 0x5efc5fa,
143 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc , 0x7bf0fa9, 0x957651,
144 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
145 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c1 2d, 0xf20bd46, 0x1951fa7,
146 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74 , 0x99bb618, 0x2db944c,
147 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e7477 9, 0x576138, 0x9587927,
148 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782 d, 0xfc72e0b, 0x701b298,
149 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5 d8, 0xf858d3a, 0x942eea8,
150 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a 1, 0x8395659, 0x52ed4e2,
151 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c 0, 0x6bdf55a, 0x4e4457d,
152 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747 b, 0x878558d, 0x7d29aa4,
153 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8,
154 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78,
155 };
156
157 /* Field element operations:
158 */
159
160 /* NON_ZERO_TO_ALL_ONES returns:
161 * 0xffffffff for 0 < x <= 2**31
162 * 0 for x == 0 or x > 2**31.
163 *
164 * This macro assumes that right-shifting a signed number shifts in the MSB on
165 * the left. This is not ensured by the C standard, but is true on the CPUs
166 * that we're targetting with this code (x86 and ARM).
167 */
168 #define NON_ZERO_TO_ALL_ONES(x) (~((u32) (((s32) ((x)-1)) >> 31)))
Ryan Sleevi 2013/01/25 03:50:36 Just because this is slipping into more and more c
agl 2013/01/25 16:04:03 I'd love to put it in nspr or somewhere similar. P
wtc 2013/01/31 16:59:02 After I worked out the detail (including plugging
169
170 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
171 * which is a term at 2**257.
172 *
173 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
174 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
175 */
176 static void felem_reduce_carry(felem inout, limb carry)
177 {
178 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);
179
180 inout[0] += carry << 1;
181 inout[3] += 0x10000000 & carry_mask;
182 /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
183 * previous line therefore this doesn't underflow.
184 */
185 inout[3] -= carry << 11;
186 inout[4] += (0x20000000 - 1) & carry_mask;
187 inout[5] += (0x10000000 - 1) & carry_mask;
188 inout[6] += (0x20000000 - 1) & carry_mask;
189 inout[6] -= carry << 22;
190 /* This may underflow if carry is non-zero but, if so, we'll fix it in the
191 * next line.
192 */
193 inout[7] -= 1 & carry_mask;
194 inout[7] += carry << 25;
195 }
196
197 /* felem_sum sets out = in+in2.
198 *
199 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
200 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
201 */
202 static void felem_sum(felem out, const felem in, const felem in2)
203 {
204 limb carry = 0;
205 unsigned int i;
206 for (i = 0;; i++) {
207 out[i] = in[i] + in2[i];
208 out[i] += carry;
209 carry = out[i] >> 29;
210 out[i] &= kBottom29Bits;
211
212 i++;
213 if (i == NLIMBS)
214 break;
215
216 out[i] = in[i] + in2[i];
217 out[i] += carry;
218 carry = out[i] >> 28;
219 out[i] &= kBottom28Bits;
220 }
221
222 felem_reduce_carry(out, carry);
223 }
224
225 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
226 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
227 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
228 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
229 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
230 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
231
232 /* zero31 is 0 mod p.
233 */
234 static const felem zero31 = {
235 two31m3, two30m2, two31m2, two30p13m2,
236 two31m2, two30m2, two31p24m2, two30m27m2,
237 two31m2
238 };
239
240 /* felem_diff sets out = in-in2.
241 *
242 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
243 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
244 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
245 */
246 static void felem_diff(felem out, const felem in, const felem in2)
247 {
248 limb carry = 0;
249 unsigned int i;
250
251 for (i = 0;; i++) {
252 out[i] = in[i] - in2[i];
253 out[i] += zero31[i];
254 out[i] += carry;
255 carry = out[i] >> 29;
256 out[i] &= kBottom29Bits;
257
258 i++;
259 if (i == NLIMBS)
260 break;
261
262 out[i] = in[i] - in2[i];
263 out[i] += zero31[i];
264 out[i] += carry;
265 carry = out[i] >> 28;
266 out[i] &= kBottom28Bits;
267 }
268
269 felem_reduce_carry(out, carry);
270 }
271
272 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
273 * with the same 29,28,... bit positions as an felem.
274 *
275 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
276 * Since we just multiplied two Montgomery values together, the result is
277 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
278 * in Montgomery form.
279 *
280 * On entry: tmp[i] < 2**64
281 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
282 */
283 static void felem_reduce_degree(felem out, u64 tmp[17])
284 {
285 /* The following table may be helpful when reading this code:
286 *
287 * Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
288 * Width (bits): 29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
289 * Start bit: 0 | 29| 57| 86|114|143|171|200|228|257|285
290 * (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
291 */
292 limb tmp2[18], carry, x, xMask;
293 unsigned int i;
294
295 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
296 * felem. So the top of an element of tmp might overlap with another
297 * element two positions down. The following loop eliminates this
298 * overlap.
299 */
300 tmp2[0] = tmp[0] & kBottom29Bits;
301
302 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
303 * and hint to the compiler that it can do a single-word shift by selecting
304 * the right register rather than doing a double-word shift and truncating
305 * afterwards.
306 */
307 tmp2[1] = ((limb) tmp[0]) >> 29;
308 tmp2[1] |= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits;
309 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
310 carry = tmp2[1] >> 28;
311 tmp2[1] &= kBottom28Bits;
312
313 for (i = 2; i < 17; i++) {
314 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
315 tmp2[i] += ((limb) (tmp[i - 1])) >> 28;
316 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
317 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
318 tmp2[i] += carry;
319 carry = tmp2[i] >> 29;
320 tmp2[i] &= kBottom29Bits;
321
322 i++;
323 if (i == 17)
324 break;
325 tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
326 tmp2[i] += ((limb) (tmp[i - 1])) >> 29;
327 tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
328 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
329 tmp2[i] += carry;
330 carry = tmp2[i] >> 28;
331 tmp2[i] &= kBottom28Bits;
332 }
333
334 tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25;
335 tmp2[17] += ((limb) (tmp[16])) >> 29;
336 tmp2[17] += (((limb) (tmp[16] >> 32)) << 3);
337 tmp2[17] += carry;
338
339 /* Montgomery elimination of terms:
340 *
341 * Since R is 2**257, we can divide by R with a bitwise shift if we can
342 * ensure that the right-most 257 bits are all zero. We can make that true
343 * by adding multiplies of p without affecting the value.
344 *
345 * So we eliminate limbs from right to left. Since the bottom 29 bits of p
346 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
347 * We can do that for 8 further limbs and then right shift to eliminate the
348 * extra factor of R.
349 */
350 for (i = 0;; i += 2) {
351 tmp2[i + 1] += tmp2[i] >> 29;
352 x = tmp2[i] & kBottom29Bits;
353 xMask = NON_ZERO_TO_ALL_ONES(x);
354 tmp2[i] = 0;
355
356 /* The bounds calculations for this loop are tricky. Each iteration of
357 * the loop eliminates two words by adding values to words to their
358 * right.
359 *
360 * The following table contains the amounts added to each word (as an
361 * offset from the value of i at the top of the loop). The amounts are
362 * accounted for from the first and second half of the loop separately
363 * and are written as, for example, 28 to mean a value <2**28.
364 *
365 * Word: 3 4 5 6 7 8 9 10
366 * Added in top half: 28 11 29 21 29 28
367 * 28 29
368 * 29
369 * Added in bottom half: 29 10 28 21 28 28
370 * 29
371 *
372 * The value that is currently offset 7 will be offset 5 for the next
373 * iteration and then offset 3 for the iteration after that. Therefore
374 * the total value added will be the values added at 7, 5 and 3.
375 *
376 * The following table accumulates these values. The sums at the bottom
377 * are written as, for example, 29+28, to mean a value < 2**29+2**28.
378 *
379 * Word: 3 4 5 6 7 8 9 10 11 12 13
380 * 28 11 10 29 21 29 28 28 28 28 28
381 * 29 28 11 28 29 28 29 28 29 28
382 * 29 28 21 21 29 21 29 21
383 * 10 29 28 21 28 21 28
384 * 28 29 28 29 28 29 28
385 * 11 10 29 10 29 10
386 * 29 28 11 28 11
387 * 29 29
388 * --------------------------------------------
389 * 30+ 31+ 30+ 31+ 30+
390 * 28+ 29+ 28+ 29+ 21+
391 * 21+ 28+ 21+ 28+ 10
392 * 10 21+ 10 21+
393 * 11 11
394 *
395 * So the greatest amount is added to tmp2[10] and tmp2[12]. If
396 * tmp2[10/12] has an initial value of <2**29, then the maximum value
397 * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
398 * as required.
399 */
400 tmp2[i + 3] += (x << 10) & kBottom28Bits;
401 tmp2[i + 4] += (x >> 18);
402
403 tmp2[i + 6] += (x << 21) & kBottom29Bits;
404 tmp2[i + 7] += x >> 8;
405
406 /* At position 200, which is the starting bit position for word 7, we
407 * have a factor of 0xf000000 = 2**28 - 2**24.
408 */
409 tmp2[i + 7] += 0x10000000 & xMask;
410 /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
411 tmp2[i + 8] += (x - 1) & xMask;
412 tmp2[i + 7] -= (x << 24) & kBottom28Bits;
413 tmp2[i + 8] -= x >> 4;
414
415 tmp2[i + 8] += 0x20000000 & xMask;
416 tmp2[i + 8] -= x;
417 tmp2[i + 8] += (x << 28) & kBottom29Bits;
418 tmp2[i + 9] += ((x >> 1) - 1) & xMask;
419
420 if (i+1 == NLIMBS)
421 break;
422 tmp2[i + 2] += tmp2[i + 1] >> 28;
423 x = tmp2[i + 1] & kBottom28Bits;
424 xMask = NON_ZERO_TO_ALL_ONES(x);
425 tmp2[i + 1] = 0;
426
427 tmp2[i + 4] += (x << 11) & kBottom29Bits;
428 tmp2[i + 5] += (x >> 18);
429
430 tmp2[i + 7] += (x << 21) & kBottom28Bits;
431 tmp2[i + 8] += x >> 7;
432
433 /* At position 199, which is the starting bit of the 8th word when
434 * dealing with a context starting on an odd word, we have a factor of
435 * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
436 * word from i+1 is i+8.
437 */
438 tmp2[i + 8] += 0x20000000 & xMask;
439 tmp2[i + 9] += (x - 1) & xMask;
440 tmp2[i + 8] -= (x << 25) & kBottom29Bits;
441 tmp2[i + 9] -= x >> 4;
442
443 tmp2[i + 9] += 0x10000000 & xMask;
444 tmp2[i + 9] -= x;
445 tmp2[i + 10] += (x - 1) & xMask;
446 }
447
448 /* We merge the right shift with a carry chain. The words above 2**257 have
449 * widths of 28,29,... which we need to correct when copying them down.
450 */
451 carry = 0;
452 for (i = 0; i < 8; i++) {
453 /* The maximum value of tmp2[i + 9] occurs on the first iteration and
454 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
455 * therefore safe.
456 */
457 out[i] = tmp2[i + 9];
458 out[i] += carry;
459 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
460 carry = out[i] >> 29;
461 out[i] &= kBottom29Bits;
462
463 i++;
464 out[i] = tmp2[i + 9] >> 1;
465 out[i] += carry;
466 carry = out[i] >> 28;
467 out[i] &= kBottom28Bits;
468 }
469
470 out[8] = tmp2[17];
471 out[8] += carry;
472 carry = out[8] >> 29;
473 out[8] &= kBottom29Bits;
474
475 felem_reduce_carry(out, carry);
476 }
477
478 /* felem_square sets out=in*in.
479 *
480 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
481 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
482 */
483 static void felem_square(felem out, const felem in)
484 {
485 u64 tmp[17];
486
487 tmp[0] = ((u64) in[0]) * in[0];
488 tmp[1] = ((u64) in[0]) * (in[1] << 1);
489 tmp[2] = ((u64) in[0]) * (in[2] << 1) +
490 ((u64) in[1]) * (in[1] << 1);
491 tmp[3] = ((u64) in[0]) * (in[3] << 1) +
492 ((u64) in[1]) * (in[2] << 1);
493 tmp[4] = ((u64) in[0]) * (in[4] << 1) +
494 ((u64) in[1]) * (in[3] << 2) +
495 ((u64) in[2]) * in[2];
496 tmp[5] = ((u64) in[0]) * (in[5] << 1) +
497 ((u64) in[1]) * (in[4] << 1) +
498 ((u64) in[2]) * (in[3] << 1);
499 tmp[6] = ((u64) in[0]) * (in[6] << 1) +
500 ((u64) in[1]) * (in[5] << 2) +
501 ((u64) in[2]) * (in[4] << 1) +
502 ((u64) in[3]) * (in[3] << 1);
503 tmp[7] = ((u64) in[0]) * (in[7] << 1) +
504 ((u64) in[1]) * (in[6] << 1) +
505 ((u64) in[2]) * (in[5] << 1) +
506 ((u64) in[3]) * (in[4] << 1);
507 /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
508 * which is < 2**64 as required.
509 */
510 tmp[8] = ((u64) in[0]) * (in[8] << 1) +
511 ((u64) in[1]) * (in[7] << 2) +
512 ((u64) in[2]) * (in[6] << 1) +
513 ((u64) in[3]) * (in[5] << 2) +
514 ((u64) in[4]) * in[4];
515 tmp[9] = ((u64) in[1]) * (in[8] << 1) +
516 ((u64) in[2]) * (in[7] << 1) +
517 ((u64) in[3]) * (in[6] << 1) +
518 ((u64) in[4]) * (in[5] << 1);
519 tmp[10] = ((u64) in[2]) * (in[8] << 1) +
520 ((u64) in[3]) * (in[7] << 2) +
521 ((u64) in[4]) * (in[6] << 1) +
522 ((u64) in[5]) * (in[5] << 1);
523 tmp[11] = ((u64) in[3]) * (in[8] << 1) +
524 ((u64) in[4]) * (in[7] << 1) +
525 ((u64) in[5]) * (in[6] << 1);
526 tmp[12] = ((u64) in[4]) * (in[8] << 1) +
527 ((u64) in[5]) * (in[7] << 2) +
528 ((u64) in[6]) * in[6];
529 tmp[13] = ((u64) in[5]) * (in[8] << 1) +
530 ((u64) in[6]) * (in[7] << 1);
531 tmp[14] = ((u64) in[6]) * (in[8] << 1) +
532 ((u64) in[7]) * (in[7] << 1);
533 tmp[15] = ((u64) in[7]) * (in[8] << 1);
534 tmp[16] = ((u64) in[8]) * in[8];
535
536 felem_reduce_degree(out, tmp);
537 }
538
539 /* felem_mul sets out=in*in2.
540 *
541 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
542 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
543 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
544 */
545 static void felem_mul(felem out, const felem in, const felem in2)
546 {
547 u64 tmp[17];
548
549 tmp[0] = ((u64) in[0]) * in2[0];
550 tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
551 ((u64) in[1]) * (in2[0] << 0);
552 tmp[2] = ((u64) in[0]) * (in2[2] << 0) +
553 ((u64) in[1]) * (in2[1] << 1) +
554 ((u64) in[2]) * (in2[0] << 0);
555 tmp[3] = ((u64) in[0]) * (in2[3] << 0) +
556 ((u64) in[1]) * (in2[2] << 0) +
557 ((u64) in[2]) * (in2[1] << 0) +
558 ((u64) in[3]) * (in2[0] << 0);
559 tmp[4] = ((u64) in[0]) * (in2[4] << 0) +
560 ((u64) in[1]) * (in2[3] << 1) +
561 ((u64) in[2]) * (in2[2] << 0) +
562 ((u64) in[3]) * (in2[1] << 1) +
563 ((u64) in[4]) * (in2[0] << 0);
564 tmp[5] = ((u64) in[0]) * (in2[5] << 0) +
565 ((u64) in[1]) * (in2[4] << 0) +
566 ((u64) in[2]) * (in2[3] << 0) +
567 ((u64) in[3]) * (in2[2] << 0) +
568 ((u64) in[4]) * (in2[1] << 0) +
569 ((u64) in[5]) * (in2[0] << 0);
570 tmp[6] = ((u64) in[0]) * (in2[6] << 0) +
571 ((u64) in[1]) * (in2[5] << 1) +
572 ((u64) in[2]) * (in2[4] << 0) +
573 ((u64) in[3]) * (in2[3] << 1) +
574 ((u64) in[4]) * (in2[2] << 0) +
575 ((u64) in[5]) * (in2[1] << 1) +
576 ((u64) in[6]) * (in2[0] << 0);
577 tmp[7] = ((u64) in[0]) * (in2[7] << 0) +
578 ((u64) in[1]) * (in2[6] << 0) +
579 ((u64) in[2]) * (in2[5] << 0) +
580 ((u64) in[3]) * (in2[4] << 0) +
581 ((u64) in[4]) * (in2[3] << 0) +
582 ((u64) in[5]) * (in2[2] << 0) +
583 ((u64) in[6]) * (in2[1] << 0) +
584 ((u64) in[7]) * (in2[0] << 0);
585 /* tmp[8] has the greatest value but doesn't overflow. See logic in
586 * felem_square.
587 */
588 tmp[8] = ((u64) in[0]) * (in2[8] << 0) +
589 ((u64) in[1]) * (in2[7] << 1) +
590 ((u64) in[2]) * (in2[6] << 0) +
591 ((u64) in[3]) * (in2[5] << 1) +
592 ((u64) in[4]) * (in2[4] << 0) +
593 ((u64) in[5]) * (in2[3] << 1) +
594 ((u64) in[6]) * (in2[2] << 0) +
595 ((u64) in[7]) * (in2[1] << 1) +
596 ((u64) in[8]) * (in2[0] << 0);
597 tmp[9] = ((u64) in[1]) * (in2[8] << 0) +
598 ((u64) in[2]) * (in2[7] << 0) +
599 ((u64) in[3]) * (in2[6] << 0) +
600 ((u64) in[4]) * (in2[5] << 0) +
601 ((u64) in[5]) * (in2[4] << 0) +
602 ((u64) in[6]) * (in2[3] << 0) +
603 ((u64) in[7]) * (in2[2] << 0) +
604 ((u64) in[8]) * (in2[1] << 0);
605 tmp[10] = ((u64) in[2]) * (in2[8] << 0) +
606 ((u64) in[3]) * (in2[7] << 1) +
607 ((u64) in[4]) * (in2[6] << 0) +
608 ((u64) in[5]) * (in2[5] << 1) +
609 ((u64) in[6]) * (in2[4] << 0) +
610 ((u64) in[7]) * (in2[3] << 1) +
611 ((u64) in[8]) * (in2[2] << 0);
612 tmp[11] = ((u64) in[3]) * (in2[8] << 0) +
613 ((u64) in[4]) * (in2[7] << 0) +
614 ((u64) in[5]) * (in2[6] << 0) +
615 ((u64) in[6]) * (in2[5] << 0) +
616 ((u64) in[7]) * (in2[4] << 0) +
617 ((u64) in[8]) * (in2[3] << 0);
618 tmp[12] = ((u64) in[4]) * (in2[8] << 0) +
619 ((u64) in[5]) * (in2[7] << 1) +
620 ((u64) in[6]) * (in2[6] << 0) +
621 ((u64) in[7]) * (in2[5] << 1) +
622 ((u64) in[8]) * (in2[4] << 0);
623 tmp[13] = ((u64) in[5]) * (in2[8] << 0) +
624 ((u64) in[6]) * (in2[7] << 0) +
625 ((u64) in[7]) * (in2[6] << 0) +
626 ((u64) in[8]) * (in2[5] << 0);
627 tmp[14] = ((u64) in[6]) * (in2[8] << 0) +
628 ((u64) in[7]) * (in2[7] << 1) +
629 ((u64) in[8]) * (in2[6] << 0);
630 tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
631 ((u64) in[8]) * (in2[7] << 0);
632 tmp[16] = ((u64) in[8]) * (in2[8] << 0);
633
634 felem_reduce_degree(out, tmp);
635 }
636
637 static void felem_assign(felem out, const felem in)
638 {
639 memcpy(out, in, sizeof(felem));
640 }
641
642 /* felem_inv calculates |out| = |in|^{-1}
643 *
644 * Based on Fermat's Little Theorem:
645 * a^p = a (mod p)
646 * a^{p-1} = 1 (mod p)
647 * a^{p-2} = a^{-1} (mod p)
648 */
649 static void felem_inv(felem out, const felem in)
650 {
651 felem ftmp, ftmp2;
652 /* each e_I will hold |in|^{2^I - 1} */
653 felem e2, e4, e8, e16, e32, e64;
654 unsigned int i;
655
656 felem_square(ftmp, in); /* 2^1 */
657 felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
658 felem_assign(e2, ftmp);
659 felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
660 felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
661 felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
662 felem_assign(e4, ftmp);
663 felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
664 felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
665 felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
666 felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
667 felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
668 felem_assign(e8, ftmp);
669 for (i = 0; i < 8; i++) {
670 felem_square(ftmp, ftmp);
671 } /* 2^16 - 2^8 */
672 felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
673 felem_assign(e16, ftmp);
674 for (i = 0; i < 16; i++) {
675 felem_square(ftmp, ftmp);
676 } /* 2^32 - 2^16 */
677 felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
678 felem_assign(e32, ftmp);
679 for (i = 0; i < 32; i++) {
680 felem_square(ftmp, ftmp);
681 } /* 2^64 - 2^32 */
682 felem_assign(e64, ftmp);
683 felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
684 for (i = 0; i < 192; i++) {
685 felem_square(ftmp, ftmp);
686 } /* 2^256 - 2^224 + 2^192 */
687
688 felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
689 for (i = 0; i < 16; i++) {
690 felem_square(ftmp2, ftmp2);
691 } /* 2^80 - 2^16 */
692 felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
693 for (i = 0; i < 8; i++) {
694 felem_square(ftmp2, ftmp2);
695 } /* 2^88 - 2^8 */
696 felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
697 for (i = 0; i < 4; i++) {
698 felem_square(ftmp2, ftmp2);
699 } /* 2^92 - 2^4 */
700 felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
701 felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
702 felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
703 felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
704 felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
705 felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
706 felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */
707
708 felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
709 }
710
711 /* felem_scalar_3 sets out=3*out.
712 *
713 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
714 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
715 */
716 static void felem_scalar_3(felem out)
717 {
718 limb carry = 0;
719 unsigned int i;
720
721 for (i = 0;; i++) {
722 out[i] *= 3;
723 out[i] += carry;
724 carry = out[i] >> 29;
725 out[i] &= kBottom29Bits;
726
727 i++;
728 if (i == NLIMBS)
729 break;
730
731 out[i] *= 3;
732 out[i] += carry;
733 carry = out[i] >> 28;
734 out[i] &= kBottom28Bits;
735 }
736
737 felem_reduce_carry(out, carry);
738 }
739
740 /* felem_scalar_4 sets out=4*out.
741 *
742 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
743 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
744 */
745 static void felem_scalar_4(felem out)
746 {
747 limb carry = 0, next_carry;
748 unsigned int i;
749
750 for (i = 0;; i++) {
751 next_carry = out[i] >> 27;
752 out[i] <<= 2;
753 out[i] &= kBottom29Bits;
754 out[i] += carry;
755 carry = next_carry + (out[i] >> 29);
756 out[i] &= kBottom29Bits;
757
758 i++;
759 if (i == NLIMBS)
760 break;
761 next_carry = out[i] >> 26;
762 out[i] <<= 2;
763 out[i] &= kBottom28Bits;
764 out[i] += carry;
765 carry = next_carry + (out[i] >> 28);
766 out[i] &= kBottom28Bits;
767 }
768
769 felem_reduce_carry(out, carry);
770 }
771
772 /* felem_scalar_8 sets out=8*out.
773 *
774 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
775 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
776 */
777 static void felem_scalar_8(felem out)
778 {
779 limb carry = 0, next_carry;
780 unsigned int i;
781
782 for (i = 0;; i++) {
783 next_carry = out[i] >> 26;
784 out[i] <<= 3;
785 out[i] &= kBottom29Bits;
786 out[i] += carry;
787 carry = next_carry + (out[i] >> 29);
788 out[i] &= kBottom29Bits;
789
790 i++;
791 if (i == NLIMBS)
792 break;
793 next_carry = out[i] >> 25;
794 out[i] <<= 3;
795 out[i] &= kBottom28Bits;
796 out[i] += carry;
797 carry = next_carry + (out[i] >> 28);
798 out[i] &= kBottom28Bits;
799 }
800
801 felem_reduce_carry(out, carry);
802 }
803
804 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
805 * time depending on the value of |in|.
806 */
807 static char felem_is_zero_vartime(const felem in)
808 {
809 limb carry;
810 int i;
811 limb tmp[NLIMBS];
812 felem_assign(tmp, in);
813
814 /* First, reduce tmp to a minimal form.
815 */
816 do {
817 carry = 0;
818 for (i = 0;; i++) {
819 tmp[i] += carry;
820 carry = tmp[i] >> 29;
821 tmp[i] &= kBottom29Bits;
822
823 i++;
824 if (i == NLIMBS)
825 break;
826
827 tmp[i] += carry;
828 carry = tmp[i] >> 28;
829 tmp[i] &= kBottom28Bits;
830 }
831
832 felem_reduce_carry(tmp, carry);
833 } while (carry);
834
835 /* tmp < 2**257, so the only possible zero values are 0, p and 2p.
836 */
837 return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
838 memcmp(tmp, kP, sizeof(tmp)) == 0 ||
839 memcmp(tmp, k2P, sizeof(tmp)) == 0;
840 }
841
842 /* Group operations:
843 *
844 * Elements of the elliptic curve group are represented in Jacobian
845 * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
846 * Jacobian form.
847 */
848
849 /* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
850 *
851 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling -dbl-2009-l
852 */
853 static void point_double(felem x_out, felem y_out, felem z_out,
854 const felem x, const felem y, const felem z)
855 {
856 felem delta, gamma, alpha, beta, tmp, tmp2;
857
858 felem_square(delta, z);
859 felem_square(gamma, y);
860 felem_mul(beta, x, gamma);
861
862 felem_sum(tmp, x, delta);
863 felem_diff(tmp2, x, delta);
864 felem_mul(alpha, tmp, tmp2);
865 felem_scalar_3(alpha);
866
867 felem_sum(tmp, y, z);
868 felem_square(tmp, tmp);
869 felem_diff(tmp, tmp, gamma);
870 felem_diff(z_out, tmp, delta);
871
872 felem_scalar_4(beta);
873 felem_square(x_out, alpha);
874 felem_diff(x_out, x_out, beta);
875 felem_diff(x_out, x_out, beta);
876
877 felem_diff(tmp, beta, x_out);
878 felem_mul(tmp, alpha, tmp);
879 felem_square(tmp2, gamma);
880 felem_scalar_8(tmp2);
881 felem_diff(y_out, tmp, tmp2);
882 }
883
884 /* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
885 * (i.e. the second point is affine.)
886 *
887 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl
888 *
889 * Note that this function does not handle P+P, infinity+P nor P+infinity
890 * correctly.
891 */
892 static void point_add_mixed(felem x_out, felem y_out, felem z_out,
893 const felem x1, const felem y1, const felem z1,
894 const felem x2, const felem y2)
895 {
896 felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;
897
898 felem_square(z1z1, z1);
899 felem_sum(tmp, z1, z1);
900
901 felem_mul(u2, x2, z1z1);
902 felem_mul(z1z1z1, z1, z1z1);
903 felem_mul(s2, y2, z1z1z1);
904 felem_diff(h, u2, x1);
905 felem_sum(i, h, h);
906 felem_square(i, i);
907 felem_mul(j, h, i);
908 felem_diff(r, s2, y1);
909 felem_sum(r, r, r);
910 felem_mul(v, x1, i);
911
912 felem_mul(z_out, tmp, h);
913 felem_square(rr, r);
914 felem_diff(x_out, rr, j);
915 felem_diff(x_out, x_out, v);
916 felem_diff(x_out, x_out, v);
917
918 felem_diff(tmp, v, x_out);
919 felem_mul(y_out, tmp, r);
920 felem_mul(tmp, y1, j);
921 felem_diff(y_out, y_out, tmp);
922 felem_diff(y_out, y_out, tmp);
923 }
924
925 /* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
926 *
927 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl
928 *
929 * Note that this function does not handle P+P, infinity+P nor P+infinity
930 * correctly.
931 */
932 static void point_add(felem x_out, felem y_out, felem z_out,
933 const felem x1, const felem y1, const felem z1,
934 const felem x2, const felem y2, const felem z2)
935 {
936 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
937
938 felem_square(z1z1, z1);
939 felem_square(z2z2, z2);
940 felem_mul(u1, x1, z2z2);
941
942 felem_sum(tmp, z1, z2);
943 felem_square(tmp, tmp);
944 felem_diff(tmp, tmp, z1z1);
945 felem_diff(tmp, tmp, z2z2);
946
947 felem_mul(z2z2z2, z2, z2z2);
948 felem_mul(s1, y1, z2z2z2);
949
950 felem_mul(u2, x2, z1z1);
951 felem_mul(z1z1z1, z1, z1z1);
952 felem_mul(s2, y2, z1z1z1);
953 felem_diff(h, u2, u1);
954 felem_sum(i, h, h);
955 felem_square(i, i);
956 felem_mul(j, h, i);
957 felem_diff(r, s2, s1);
958 felem_sum(r, r, r);
959 felem_mul(v, u1, i);
960
961 felem_mul(z_out, tmp, h);
962 felem_square(rr, r);
963 felem_diff(x_out, rr, j);
964 felem_diff(x_out, x_out, v);
965 felem_diff(x_out, x_out, v);
966
967 felem_diff(tmp, v, x_out);
968 felem_mul(y_out, tmp, r);
969 felem_mul(tmp, s1, j);
970 felem_diff(y_out, y_out, tmp);
971 felem_diff(y_out, y_out, tmp);
972 }
973
974 /* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
975 * {x2,y2,z2}.
976 *
977 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition -add-2007-bl
978 *
979 * This function handles the case where {x1,y1,z1}={x2,y2,z2}.
980 */
981 static void point_add_or_double_vartime(
982 felem x_out, felem y_out, felem z_out,
983 const felem x1, const felem y1, const felem z1,
984 const felem x2, const felem y2, const felem z2)
985 {
986 felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
987 char x_equal, y_equal;
988
989 felem_square(z1z1, z1);
990 felem_square(z2z2, z2);
991 felem_mul(u1, x1, z2z2);
992
993 felem_sum(tmp, z1, z2);
994 felem_square(tmp, tmp);
995 felem_diff(tmp, tmp, z1z1);
996 felem_diff(tmp, tmp, z2z2);
997
998 felem_mul(z2z2z2, z2, z2z2);
999 felem_mul(s1, y1, z2z2z2);
1000
1001 felem_mul(u2, x2, z1z1);
1002 felem_mul(z1z1z1, z1, z1z1);
1003 felem_mul(s2, y2, z1z1z1);
1004 felem_diff(h, u2, u1);
1005 x_equal = felem_is_zero_vartime(h);
1006 felem_sum(i, h, h);
1007 felem_square(i, i);
1008 felem_mul(j, h, i);
1009 felem_diff(r, s2, s1);
1010 y_equal = felem_is_zero_vartime(r);
1011 if (x_equal && y_equal) {
1012 point_double(x_out, y_out, z_out, x1, y1, z1);
1013 return;
1014 }
1015 felem_sum(r, r, r);
1016 felem_mul(v, u1, i);
1017
1018 felem_mul(z_out, tmp, h);
1019 felem_square(rr, r);
1020 felem_diff(x_out, rr, j);
1021 felem_diff(x_out, x_out, v);
1022 felem_diff(x_out, x_out, v);
1023
1024 felem_diff(tmp, v, x_out);
1025 felem_mul(y_out, tmp, r);
1026 felem_mul(tmp, s1, j);
1027 felem_diff(y_out, y_out, tmp);
1028 felem_diff(y_out, y_out, tmp);
1029 }
1030
1031 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.
1032 *
1033 * On entry: mask is either 0 or 0xffffffff.
1034 */
1035 static void copy_conditional(felem out, const felem in, limb mask)
1036 {
1037 int i;
1038
1039 for (i = 0; i < NLIMBS; i++) {
1040 const limb tmp = mask & (in[i] ^ out[i]);
1041 out[i] ^= tmp;
1042 }
1043 }
1044
1045 /* select_affine_point sets {out_x,out_y} to the index'th entry of table.
1046 * On entry: index < 16, table[0] must be zero.
1047 */
1048 static void select_affine_point(felem out_x, felem out_y,
1049 const limb *table, limb index)
1050 {
1051 limb i, j;
1052
1053 memset(out_x, 0, sizeof(felem));
1054 memset(out_y, 0, sizeof(felem));
1055
1056 for (i = 1; i < 16; i++) {
1057 limb mask = i ^ index;
1058 mask |= mask >> 2;
1059 mask |= mask >> 1;
1060 mask &= 1;
1061 mask--;
1062 for (j = 0; j < NLIMBS; j++, table++) {
1063 out_x[j] |= *table & mask;
1064 }
1065 for (j = 0; j < NLIMBS; j++, table++) {
1066 out_y[j] |= *table & mask;
1067 }
1068 }
1069 }
1070
1071 /* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
1072 * table. On entry: index < 16, table[0] must be zero.
1073 */
1074 static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
1075 const limb *table, limb index)
1076 {
1077 limb i, j;
1078
1079 memset(out_x, 0, sizeof(felem));
1080 memset(out_y, 0, sizeof(felem));
1081 memset(out_z, 0, sizeof(felem));
1082
1083 /* The implicit value at index 0 is all zero. We don't need to perform that
1084 * iteration of the loop because we already set out_* to zero.
1085 */
1086 table += 3*NLIMBS;
1087
1088 for (i = 1; i < 16; i++) {
1089 limb mask = i ^ index;
1090 mask |= mask >> 2;
1091 mask |= mask >> 1;
1092 mask &= 1;
1093 mask--;
1094 for (j = 0; j < NLIMBS; j++, table++) {
1095 out_x[j] |= *table & mask;
1096 }
1097 for (j = 0; j < NLIMBS; j++, table++) {
1098 out_y[j] |= *table & mask;
1099 }
1100 for (j = 0; j < NLIMBS; j++, table++) {
1101 out_z[j] |= *table & mask;
1102 }
1103 }
1104 }
1105
1106 /* get_bit returns the bit'th bit of scalar. */
1107 static char get_bit(const u8 scalar[32], int bit)
1108 {
1109 return ((scalar[bit >> 3]) >> (bit & 7)) & 1;
1110 }
1111
1112 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
1113 * number. Note that the value of scalar must be less than the order of the
1114 * group.
1115 */
1116 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32])
1117 {
1118 int i, j;
1119 limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
1120 u32 table_offset;
1121
1122 felem px, py;
1123 felem tx, ty, tz;
1124
1125 memset(nx, 0, sizeof(felem));
1126 memset(ny, 0, sizeof(felem));
1127 memset(nz, 0, sizeof(felem));
1128
1129 /* The loop adds bits at positions 0, 64, 128 and 192, followed by
1130 * positions 32,96,160 and 224 and does this 32 times.
1131 */
1132 for (i = 0; i < 32; i++) {
1133 if (i) {
1134 point_double(nx, ny, nz, nx, ny, nz);
1135 }
1136 for (j = 0; j <= 32; j += 32) {
1137 char bit0 = get_bit(scalar, 31 - i + j);
1138 char bit1 = get_bit(scalar, 95 - i + j);
1139 char bit2 = get_bit(scalar, 159 - i + j);
1140 char bit3 = get_bit(scalar, 223 - i + j);
1141 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);
1142
1143 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS);
1144 select_affine_point(px, py, kPrecomputed + table_offset, index);
1145
1146 /* Since scalar is less than the order of the group, we know that
1147 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
1148 * below.
1149 */
1150 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
1151 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
1152 * (a.k.a. the point at infinity). We handle that situation by
1153 * copying the point from the table.
1154 */
1155 copy_conditional(nx, px, n_is_infinity_mask);
1156 copy_conditional(ny, py, n_is_infinity_mask);
1157 copy_conditional(nz, kOne, n_is_infinity_mask);
1158
1159 /* Equally, the result is also wrong if the point from the table is
1160 * zero, which happens when the index is zero. We handle that by
1161 * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0.
1162 */
1163 p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
1164 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1165 copy_conditional(nx, tx, mask);
1166 copy_conditional(ny, ty, mask);
1167 copy_conditional(nz, tz, mask);
1168 /* If p was not zero, then n is now non-zero. */
1169 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1170 }
1171 }
1172 }
1173
1174 /* point_to_affine converts a Jacobian point to an affine point. If the input
1175 * is the point at infinity then it returns (0, 0) in constant time.
1176 */
1177 static void point_to_affine(felem x_out, felem y_out,
1178 const felem nx, const felem ny, const felem nz) {
1179 felem z_inv, z_inv_sq;
1180 felem_inv(z_inv, nz);
1181 felem_square(z_inv_sq, z_inv);
1182 felem_mul(x_out, nx, z_inv_sq);
1183 felem_mul(z_inv, z_inv, z_inv_sq);
1184 felem_mul(y_out, ny, z_inv);
1185 }
1186
1187 /* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */
1188 static void scalar_mult(felem nx, felem ny, felem nz,
1189 const felem x, const felem y, const u8 scalar[32])
1190 {
1191 int i;
1192 felem px, py, pz, tx, ty, tz;
1193 felem precomp[16][3];
1194 limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;
1195
1196 /* We precompute 0,1,2,... times {x,y}. */
1197 memset(precomp, 0, sizeof(felem) * 3);
1198 memcpy(&precomp[1][0], x, sizeof(felem));
1199 memcpy(&precomp[1][1], y, sizeof(felem));
1200 memcpy(&precomp[1][2], kOne, sizeof(felem));
1201
1202 for (i = 2; i < 16; i += 2) {
1203 point_double(precomp[i][0], precomp[i][1], precomp[i][2],
1204 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);
1205
1206 point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
1207 precomp[i][0], precomp[i][1], precomp[i][2], x, y);
1208 }
1209
1210 memset(nx, 0, sizeof(felem));
1211 memset(ny, 0, sizeof(felem));
1212 memset(nz, 0, sizeof(felem));
1213 n_is_infinity_mask = -1;
1214
1215 /* We add in a window of four bits each iteration and do this 64 times. */
1216 for (i = 0; i < 64; i++) {
1217 if (i) {
1218 point_double(nx, ny, nz, nx, ny, nz);
1219 point_double(nx, ny, nz, nx, ny, nz);
1220 point_double(nx, ny, nz, nx, ny, nz);
1221 point_double(nx, ny, nz, nx, ny, nz);
1222 }
1223
1224 index = scalar[31 - i / 2];
1225 if ((i & 1) == 1) {
1226 index &= 15;
1227 } else {
1228 index >>= 4;
1229 }
1230
1231 /* See the comments in scalar_base_mult about handling infinities. */
1232 select_jacobian_point(px, py, pz, (limb *) precomp, index);
1233 point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
1234 copy_conditional(nx, px, n_is_infinity_mask);
1235 copy_conditional(ny, py, n_is_infinity_mask);
1236 copy_conditional(nz, pz, n_is_infinity_mask);
1237
1238 p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31;
1239 mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
1240 copy_conditional(nx, tx, mask);
1241 copy_conditional(ny, ty, mask);
1242 copy_conditional(nz, tz, mask);
1243 n_is_infinity_mask &= ~p_is_noninfinite_mask;
1244 }
1245 }
1246
1247 /* Interface with Freebl: */
1248
1249 #ifdef IS_BIG_ENDIAN
wtc 2013/01/25 02:32:49 I changed #if to #ifdef here.
1250 #error "This code needs a little-endian processor"
1251 #endif
1252
1253 static const u32 kRInvDigits[8] = {
1254 0x80000000, 1, 0xffffffff, 0,
1255 0x80000001, 0xfffffffe, 1, 0x7fffffff
1256 };
1257 #define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit))
1258 static const mp_int kRInv = {
1259 MP_ZPOS,
1260 MP_DIGITS_IN_256_BITS,
1261 MP_DIGITS_IN_256_BITS,
1262 /* Because we are running on a little-endian processor, this cast works for
1263 * both 32 and 64-bit processors.
1264 */
1265 (mp_digit*) kRInvDigits
1266 };
1267
1268 static const limb kTwo28 = 0x10000000;
1269 static const limb kTwo29 = 0x20000000;
1270
1271 /* to_montgomery sets out = R*in. */
1272 static mp_err to_montgomery(felem out, const mp_int *in, const ECGroup *group)
1273 {
1274 /* There are no MPI functions for bitshift operations and we wish to shift
1275 * in 257 bits left so we move the digits 256-bits left and then multiply
1276 * by two.
1277 */
1278 mp_int in_shifted;
1279 int i;
1280 mp_err res;
1281
1282 mp_init(&in_shifted);
1283 s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS);
1284 memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS),
1285 MP_DIGITS(in),
1286 MP_USED(in)*sizeof(mp_digit));
1287 mp_mul_2(&in_shifted, &in_shifted);
1288 MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth));
1289
1290 for (i = 0;; i++) {
1291 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits;
1292 mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL);
1293
1294 i++;
1295 if (i == NLIMBS)
1296 break;
1297 out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits;
1298 mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL);
1299 }
1300
1301 CLEANUP:
1302 mp_clear(&in_shifted);
1303 return res;
1304 }
1305
1306 /* from_montgomery sets out=in/R. */
1307 static mp_err from_montgomery(mp_int *out, const felem in,
1308 const ECGroup *group)
1309 {
1310 mp_int result, tmp;
1311 mp_err res;
1312 int i;
1313
1314 mp_init(&result);
1315 mp_init(&tmp);
1316
1317 MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result));
1318 for (i = NLIMBS-2; i >= 0; i--) {
1319 if ((i & 1) == 0) {
1320 MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp));
1321 } else {
1322 MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp));
1323 }
1324 MP_CHECKOK(mp_add_d(&tmp, in[i], &result));
1325 }
1326
1327 MP_CHECKOK(mp_mul(&result, &kRInv, out));
1328 MP_CHECKOK(group->meth->field_mod(out, out, group->meth));
1329
1330 CLEANUP:
1331 mp_clear(&result);
1332 mp_clear(&tmp);
1333 return res;
1334 }
1335
1336 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */
1337 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n)
1338 {
1339 /* We require that |n| is less than the order of the group and therefore it
1340 * will fit into |scalar|. However, these is a timing side-channel here that
1341 * we cannot avoid: if |n| is sufficiently small it may be one or more words
1342 * too short and we'll copy less data.
1343 */
1344 memset(out_scalar, 0, 32);
1345 memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit));
1346 }
1347
1348 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1349 * order of the group.
1350 */
1351 mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n,
1352 mp_int *out_x, mp_int *out_y,
1353 const ECGroup *group)
1354 {
1355 u8 scalar[32];
1356 felem x, y, z, x_affine, y_affine;
1357 mp_err res;
1358
1359 /* FIXME(agl): test that n < order. */
1360
1361 scalar_from_mp_int(scalar, n);
1362 scalar_base_mult(x, y, z, scalar);
1363 point_to_affine(x_affine, y_affine, x, y, z);
1364 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1365 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1366
1367 CLEANUP:
1368 return res;
1369 }
1370
1371 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
1372 * the order of the group.
1373 */
1374 mp_err ec_GFp_nistp256_point_mul(const mp_int *n,
1375 const mp_int *in_x, const mp_int *in_y,
1376 mp_int *out_x, mp_int *out_y,
1377 const ECGroup *group)
1378 {
1379 u8 scalar[32];
1380 felem x, y, z, x_affine, y_affine, px, py;
1381 mp_err res;
1382
1383 scalar_from_mp_int(scalar, n);
1384
1385 MP_CHECKOK(to_montgomery(px, in_x, group));
1386 MP_CHECKOK(to_montgomery(py, in_y, group));
1387
1388 scalar_mult(x, y, z, px, py, scalar);
1389 point_to_affine(x_affine, y_affine, x, y, z);
1390 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1391 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1392
1393 CLEANUP:
1394 return res;
1395 }
1396
1397 /* ec_GFp_nistp256_point_mul_vartime sets {out_x,out_y} = n1*G +
1398 * n2*{in_x,in_y}, where n1 and n2 are < the order of the group.
1399 *
1400 * As indicated by the name, this function operates in variable time. This
1401 * is safe because it's used for signature validation which doesn't deal
1402 * with secrets.
1403 */
1404 mp_err ec_GFp_nistp256_points_mul_vartime(
1405 const mp_int *n1, const mp_int *n2,
1406 const mp_int *in_x, const mp_int *in_y,
1407 mp_int *out_x, mp_int *out_y,
1408 const ECGroup *group)
1409 {
1410 u8 scalar1[32], scalar2[32];
1411 felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py;
1412 mp_err res = MP_OKAY;
1413
1414 /* If n2 == NULL, this is just a base-point multiplication. */
1415 if (n2 == NULL) {
1416 return ec_GFp_nistp256_base_point_mul(n1, out_x, out_y, group);
1417 }
1418
1419 /* If n1 == nULL, this is just an arbitary-point multiplication. */
1420 if (n1 == NULL) {
1421 return ec_GFp_nistp256_point_mul(n2, in_x, in_y, out_x, out_y, group);
1422 }
1423
1424 /* If both scalars are zero, then the result is the point at infinity. */
1425 if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) {
1426 mp_zero(out_x);
1427 mp_zero(out_y);
1428 return res;
1429 }
1430
1431 scalar_from_mp_int(scalar1, n1);
1432 scalar_from_mp_int(scalar2, n2);
1433
1434 MP_CHECKOK(to_montgomery(px, in_x, group));
1435 MP_CHECKOK(to_montgomery(py, in_y, group));
1436 scalar_base_mult(x1, y1, z1, scalar1);
1437 scalar_mult(x2, y2, z2, px, py, scalar2);
1438
1439 if (mp_cmp_z(n2) == 0) {
1440 /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
1441 * {x1,y1,z1}. */
1442 } else if (mp_cmp_z(n1) == 0) {
1443 /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
1444 * {x2,y2,z2}. */
1445 memcpy(x1, x2, sizeof(x2));
1446 memcpy(y1, y2, sizeof(y2));
1447 memcpy(z1, z2, sizeof(z2));
1448 } else {
1449 /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
1450 point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
1451 }
1452
1453 point_to_affine(x_affine, y_affine, x1, y1, z1);
1454 MP_CHECKOK(from_montgomery(out_x, x_affine, group));
1455 MP_CHECKOK(from_montgomery(out_y, y_affine, group));
1456
1457 CLEANUP:
1458 return res;
1459 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698