OLD | NEW |
---|---|
1 /* This Source Code Form is subject to the terms of the Mozilla Public | 1 /* This Source Code Form is subject to the terms of the Mozilla Public |
2 * License, v. 2.0. If a copy of the MPL was not distributed with this | 2 * License, v. 2.0. If a copy of the MPL was not distributed with this |
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
4 | 4 |
5 /* A 32-bit implementation of the NIST P-256 elliptic curve. */ | 5 /* A 32-bit implementation of the NIST P-256 elliptic curve. */ |
6 | 6 |
7 #include <string.h> | 7 #include <string.h> |
8 | 8 |
9 #include "prtypes.h" | 9 #include "prtypes.h" |
10 #include "mpi.h" | 10 #include "mpi.h" |
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
153 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8, | 153 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d 7, 0xa5bef68, 0xb7b30d8, |
154 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78, | 154 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f5195 1, 0x9d0c177, 0x1c49a78, |
155 }; | 155 }; |
156 | 156 |
157 /* Field element operations: | 157 /* Field element operations: |
158 */ | 158 */ |
159 | 159 |
160 /* NON_ZERO_TO_ALL_ONES returns: | 160 /* NON_ZERO_TO_ALL_ONES returns: |
161 * 0xffffffff for 0 < x <= 2**31 | 161 * 0xffffffff for 0 < x <= 2**31 |
162 * 0 for x == 0 or x > 2**31. | 162 * 0 for x == 0 or x > 2**31. |
163 * | |
164 * This macro assumes that right-shifting a signed number shifts in the MSB on | |
165 * the left. This is not ensured by the C standard, but is true on the CPUs | |
166 * that we're targeting with this code (x86 and ARM). | |
167 */ | 163 */ |
168 #define NON_ZERO_TO_ALL_ONES(x) (~((u32) (((s32) ((x)-1)) >> 31))) | 164 #define NON_ZERO_TO_ALL_ONES(x) ((((x) - 1) >> 31) - 1) |
Ryan Sleevi
2013/01/31 22:53:38
Suggestion: Add u32 casts to make sure that the un
wtc
2013/02/01 05:28:04
Thank you for the suggestion. The argument |x| is
| |
169 | 165 |
170 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|, | 166 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|, |
171 * which is a term at 2**257. | 167 * which is a term at 2**257. |
172 * | 168 * |
173 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28. | 169 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28. |
174 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. | 170 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. |
175 */ | 171 */ |
176 static void felem_reduce_carry(felem inout, limb carry) | 172 static void felem_reduce_carry(felem inout, limb carry) |
177 { | 173 { |
178 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry); | 174 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry); |
(...skipping 954 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1133 if (i) { | 1129 if (i) { |
1134 point_double(nx, ny, nz, nx, ny, nz); | 1130 point_double(nx, ny, nz, nx, ny, nz); |
1135 } | 1131 } |
1136 for (j = 0; j <= 32; j += 32) { | 1132 for (j = 0; j <= 32; j += 32) { |
1137 char bit0 = get_bit(scalar, 31 - i + j); | 1133 char bit0 = get_bit(scalar, 31 - i + j); |
1138 char bit1 = get_bit(scalar, 95 - i + j); | 1134 char bit1 = get_bit(scalar, 95 - i + j); |
1139 char bit2 = get_bit(scalar, 159 - i + j); | 1135 char bit2 = get_bit(scalar, 159 - i + j); |
1140 char bit3 = get_bit(scalar, 223 - i + j); | 1136 char bit3 = get_bit(scalar, 223 - i + j); |
1141 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3); | 1137 limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3); |
1142 | 1138 |
1143 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS); | 1139 table_offset = ((((s32)j) << (32-6)) >> 31) & (30*NLIMBS); |
wtc
2013/02/01 05:28:04
Here j is 0 or 32 (1 << 5). So this can be simplif
wtc
2013/02/01 05:35:40
A better solution:
table_offset = 0;
for
agl
2013/02/01 18:45:17
I like this, thanks.
| |
1144 select_affine_point(px, py, kPrecomputed + table_offset, index); | 1140 select_affine_point(px, py, kPrecomputed + table_offset, index); |
1145 | 1141 |
1146 /* Since scalar is less than the order of the group, we know that | 1142 /* Since scalar is less than the order of the group, we know that |
1147 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle | 1143 * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle |
1148 * below. | 1144 * below. |
1149 */ | 1145 */ |
1150 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py); | 1146 point_add_mixed(tx, ty, tz, nx, ny, nz, px, py); |
1151 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero | 1147 /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero |
1152 * (a.k.a. the point at infinity). We handle that situation by | 1148 * (a.k.a. the point at infinity). We handle that situation by |
1153 * copying the point from the table. | 1149 * copying the point from the table. |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1222 } | 1218 } |
1223 | 1219 |
1224 index = scalar[31 - i / 2]; | 1220 index = scalar[31 - i / 2]; |
1225 if ((i & 1) == 1) { | 1221 if ((i & 1) == 1) { |
1226 index &= 15; | 1222 index &= 15; |
1227 } else { | 1223 } else { |
1228 index >>= 4; | 1224 index >>= 4; |
1229 } | 1225 } |
1230 | 1226 |
1231 /* See the comments in scalar_base_mult about handling infinities. */ | 1227 /* See the comments in scalar_base_mult about handling infinities. */ |
1232 select_jacobian_point(px, py, pz, (limb *) precomp, index); | 1228 select_jacobian_point(px, py, pz, (limb *) precomp, index); |
wtc
2013/01/31 22:46:05
Why is this (limb *) cast necessary?
Is there som
agl
2013/02/01 18:45:17
ecp_256_32.c:1233:2: warning: passing argument 4 o
wtc
2013/02/01 19:28:35
Thanks for the compiler warning message. I changed
| |
1233 point_add(tx, ty, tz, nx, ny, nz, px, py, pz); | 1229 point_add(tx, ty, tz, nx, ny, nz, px, py, pz); |
1234 copy_conditional(nx, px, n_is_infinity_mask); | 1230 copy_conditional(nx, px, n_is_infinity_mask); |
1235 copy_conditional(ny, py, n_is_infinity_mask); | 1231 copy_conditional(ny, py, n_is_infinity_mask); |
1236 copy_conditional(nz, pz, n_is_infinity_mask); | 1232 copy_conditional(nz, pz, n_is_infinity_mask); |
1237 | 1233 |
1238 p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31; | 1234 p_is_noninfinite_mask = ((s32) ~ (index - 1)) >> 31; |
wtc
2013/02/01 05:28:04
Here |index| is 0, ..., 15. So this can be rewritt
agl
2013/02/01 18:45:17
Agreed.
| |
1239 mask = p_is_noninfinite_mask & ~n_is_infinity_mask; | 1235 mask = p_is_noninfinite_mask & ~n_is_infinity_mask; |
1240 copy_conditional(nx, tx, mask); | 1236 copy_conditional(nx, tx, mask); |
1241 copy_conditional(ny, ty, mask); | 1237 copy_conditional(ny, ty, mask); |
1242 copy_conditional(nz, tz, mask); | 1238 copy_conditional(nz, tz, mask); |
1243 n_is_infinity_mask &= ~p_is_noninfinite_mask; | 1239 n_is_infinity_mask &= ~p_is_noninfinite_mask; |
1244 } | 1240 } |
1245 } | 1241 } |
1246 | 1242 |
1247 /* Interface with Freebl: */ | 1243 /* Interface with Freebl: */ |
1248 | 1244 |
1245 /* BYTESWAP_MP_DIGIT_TO_LE swaps the bytes of a mp_digit to | |
1246 * little-endian order. | |
1247 */ | |
1249 #ifdef IS_BIG_ENDIAN | 1248 #ifdef IS_BIG_ENDIAN |
1250 #error "This code needs a little-endian processor" | 1249 #ifdef __APPLE__ |
1250 #include <libkern/OSByteOrder.h> | |
1251 #define BYTESWAP32(x) OSSwapInt32(x) | |
1252 #define BYTESWAP64(x) OSSwapInt64(x) | |
1253 #else | |
1254 #define BYTESWAP32(x) \ | |
1255 ((x) >> 24 | (x) >> 8 & 0xff00 | ((x) & 0xff00) << 8 | (x) << 24) | |
1256 #define BYTESWAP64(x) \ | |
1257 ((x) >> 56 | (x) >> 40 & 0xff00 | \ | |
1258 (x) >> 24 & 0xff0000 | (x) >> 8 & 0xff000000 | \ | |
1259 ((x) & 0xff000000) << 8 | ((x) & 0xff0000) << 24 | \ | |
1260 ((x) & 0xff00) << 40 | (x) << 56) | |
1251 #endif | 1261 #endif |
1252 | 1262 |
1253 static const u32 kRInvDigits[8] = { | 1263 #ifdef MP_USE_UINT_DIGIT |
wtc
2013/01/31 22:46:05
If this macro is defined, mp_digit is 32-bit. Othe
| |
1264 #define BYTESWAP_MP_DIGIT_TO_LE(x) BYTESWAP32(BYTESWAP32(x)) | |
1265 #else | |
1266 #define BYTESWAP_MP_DIGIT_TO_LE(x) BYTESWAP64(BYTESWAP64(x)) | |
1267 #endif | |
1268 #endif /* IS_BIG_ENDIAN */ | |
1269 | |
1270 #ifdef MP_USE_UINT_DIGIT | |
1271 static const mp_digit kRInvDigits[8] = { | |
1254 0x80000000, 1, 0xffffffff, 0, | 1272 0x80000000, 1, 0xffffffff, 0, |
1255 0x80000001, 0xfffffffe, 1, 0x7fffffff | 1273 0x80000001, 0xfffffffe, 1, 0x7fffffff |
1256 }; | 1274 }; |
1275 #else | |
1276 static const mp_digit kRInvDigits[4] = { | |
1277 PR_UINT64(0x180000000), 0xffffffff, | |
1278 PR_UINT64(0xfffffffe80000001), PR_UINT64(0x7fffffff00000001) | |
1279 }; | |
1280 #endif | |
1257 #define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit)) | 1281 #define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit)) |
1258 static const mp_int kRInv = { | 1282 static const mp_int kRInv = { |
1259 MP_ZPOS, | 1283 MP_ZPOS, |
1260 MP_DIGITS_IN_256_BITS, | 1284 MP_DIGITS_IN_256_BITS, |
1261 MP_DIGITS_IN_256_BITS, | 1285 MP_DIGITS_IN_256_BITS, |
1262 /* Because we are running on a little-endian processor, this cast works for | |
1263 * both 32 and 64-bit processors. | |
1264 */ | |
1265 (mp_digit*) kRInvDigits | 1286 (mp_digit*) kRInvDigits |
1266 }; | 1287 }; |
1267 | 1288 |
1268 static const limb kTwo28 = 0x10000000; | 1289 static const limb kTwo28 = 0x10000000; |
1269 static const limb kTwo29 = 0x20000000; | 1290 static const limb kTwo29 = 0x20000000; |
1270 | 1291 |
1271 /* to_montgomery sets out = R*in. */ | 1292 /* to_montgomery sets out = R*in. */ |
1272 static mp_err to_montgomery(felem out, const mp_int *in, const ECGroup *group) | 1293 static mp_err to_montgomery(felem out, const mp_int *in, const ECGroup *group) |
1273 { | 1294 { |
1274 /* There are no MPI functions for bitshift operations and we wish to shift | 1295 /* There are no MPI functions for bitshift operations and we wish to shift |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1335 | 1356 |
1336 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */ | 1357 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */ |
1337 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n) | 1358 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n) |
1338 { | 1359 { |
1339 /* We require that |n| is less than the order of the group and therefore it | 1360 * will fit into |scalar|. However, there is a timing side-channel here that |
1340 * will fit into |scalar|. However, there is a timing side-channel here that | 1361 * we cannot avoid: if |n| is sufficiently small it may be one or more words |
1341 * we cannot avoid: if |n| is sufficiently small it may be one or more words | 1362 * we cannot avoid: if |n| is sufficiently small it may be one or more words |
1342 * too short and we'll copy less data. | 1363 * too short and we'll copy less data. |
1343 */ | 1364 */ |
1344 memset(out_scalar, 0, 32); | 1365 memset(out_scalar, 0, 32); |
1366 #ifdef IS_LITTLE_ENDIAN | |
1345 memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit)); | 1367 memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit)); |
wtc
2013/01/31 22:46:05
I found that this memcpy is done without checking
agl
2013/02/01 18:45:17
Yes. I had forgotten about that.
Is there a #defi
wtc
2013/02/01 19:28:35
NSS uses both DEBUG and NDEBUG. DEBUG is more comm
| |
1368 #else | |
1369 { | |
1370 mp_size i; | |
1371 mp_digit swapped[MP_DIGITS_IN_256_BITS]; | |
1372 for (i = 0; i < MP_USED(n); i++) { | |
1373 swapped[i] = BYTESWAP_MP_DIGIT_TO_LE(MP_DIGIT(n, i)); | |
1374 } | |
1375 memcpy(out_scalar, swapped, MP_USED(n) * sizeof(mp_digit)); | |
1376 } | |
1377 #endif | |
1346 } | 1378 } |
1347 | 1379 |
1348 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the | 1380 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the |
1349 * order of the group. | 1381 * order of the group. |
1350 */ | 1382 */ |
1351 static mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n, | 1383 static mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n, |
1352 mp_int *out_x, mp_int *out_y, | 1384 mp_int *out_x, mp_int *out_y, |
1353 const ECGroup *group) | 1385 const ECGroup *group) |
1354 { | 1386 { |
1355 u8 scalar[32]; | 1387 u8 scalar[32]; |
1356 felem x, y, z, x_affine, y_affine; | 1388 felem x, y, z, x_affine, y_affine; |
1357 mp_err res; | 1389 mp_err res; |
1358 | 1390 |
1359 /* FIXME(agl): test that n < order. */ | 1391 /* FIXME(agl): test that n < order. */ |
wtc
2013/02/01 19:28:35
I added an assertion to scalar_from_mp_int to ensu
| |
1360 | 1392 |
1361 scalar_from_mp_int(scalar, n); | 1393 scalar_from_mp_int(scalar, n); |
1362 scalar_base_mult(x, y, z, scalar); | 1394 scalar_base_mult(x, y, z, scalar); |
1363 point_to_affine(x_affine, y_affine, x, y, z); | 1395 point_to_affine(x_affine, y_affine, x, y, z); |
1364 MP_CHECKOK(from_montgomery(out_x, x_affine, group)); | 1396 MP_CHECKOK(from_montgomery(out_x, x_affine, group)); |
1365 MP_CHECKOK(from_montgomery(out_y, y_affine, group)); | 1397 MP_CHECKOK(from_montgomery(out_y, y_affine, group)); |
1366 | 1398 |
1367 CLEANUP: | 1399 CLEANUP: |
1368 return res; | 1400 return res; |
1369 } | 1401 } |
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1461 /* Wire in fast point multiplication for named curves. */ | 1493 /* Wire in fast point multiplication for named curves. */ |
1462 mp_err ec_group_set_gfp256_32(ECGroup *group, ECCurveName name) | 1494 mp_err ec_group_set_gfp256_32(ECGroup *group, ECCurveName name) |
1463 { | 1495 { |
1464 if (name == ECCurve_NIST_P256) { | 1496 if (name == ECCurve_NIST_P256) { |
1465 group->base_point_mul = &ec_GFp_nistp256_base_point_mul; | 1497 group->base_point_mul = &ec_GFp_nistp256_base_point_mul; |
1466 group->point_mul = &ec_GFp_nistp256_point_mul; | 1498 group->point_mul = &ec_GFp_nistp256_point_mul; |
1467 group->points_mul = &ec_GFp_nistp256_points_mul_vartime; | 1499 group->points_mul = &ec_GFp_nistp256_points_mul_vartime; |
1468 } | 1500 } |
1469 return MP_OKAY; | 1501 return MP_OKAY; |
1470 } | 1502 } |
OLD | NEW |