cgpt/cgpt_common.c - Issue 5025003: The right implementation of CGPT label conversion between UTF8 and UTF16.

Side by Side Diff: cgpt/cgpt_common.c

Issue 5025003: The right implementation of CGPT label conversion between UTF8 and UTF16. (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/vboot_reference.git

Patch Set: Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /* Copyright (c) 2010 The Chromium OS Authors. All rights reserved.	1 /* Copyright (c) 2010 The Chromium OS Authors. All rights reserved.

2 * Use of this source code is governed by a BSD-style license that can be	2 * Use of this source code is governed by a BSD-style license that can be

3 * found in the LICENSE file.	3 * found in the LICENSE file.

4 *	4 *

5 * Utility for ChromeOS-specific GPT partitions, Please see corresponding .c	5 * Utility for ChromeOS-specific GPT partitions, Please see corresponding .c

6 * files for more details.	6 * files for more details.

7 */	7 */

8	8

9 #include "cgpt.h"	9 #include "cgpt.h"

10	10

(...skipping 332 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
343 le16toh(guid->u.Uuid.time_high_and_version),	343 le16toh(guid->u.Uuid.time_high_and_version),

344 guid->u.Uuid.clock_seq_high_and_reserved,	344 guid->u.Uuid.clock_seq_high_and_reserved,

345 guid->u.Uuid.clock_seq_low,	345 guid->u.Uuid.clock_seq_low,

346 guid->u.Uuid.node[0], guid->u.Uuid.node[1],	346 guid->u.Uuid.node[0], guid->u.Uuid.node[1],

347 guid->u.Uuid.node[2], guid->u.Uuid.node[3],	347 guid->u.Uuid.node[2], guid->u.Uuid.node[3],

348 guid->u.Uuid.node[4], guid->u.Uuid.node[5]) == GUID_STRLEN-1);	348 guid->u.Uuid.node[4], guid->u.Uuid.node[5]) == GUID_STRLEN-1);

349 }	349 }

350	350

351 /* Convert possibly unterminated UTF16 string to UTF8.	351 /* Convert possibly unterminated UTF16 string to UTF8.

352 * Caller must prepare enough space for UTF8, which could be up to	352 * Caller must prepare enough space for UTF8, which could be up to

353 * twice the number of UTF16 chars plus the terminating '\0'.	353 * twice the number of UTF16 chars plus the terminating '\0'.
	Bill Richardson 2010/11/17 17:26:39: I think this size bound is wrong. It should be "three times", not "twice". For example, if the input is a single 0xee00, then the output will be 0xee, 0x80, 0x80. The same comment appears in the header file too.
	Louis 2010/11/18 05:35:21 (replying to Bill Richardson's comment of 2010/11/17 17:26:39): Hm... my initial idea was "UTF16 bytes" instead of "UTF16 chars"; my wording was misleading. I have fixed the comment. In the worst case (code point 0x0800~0xFFFF), a 2-byte UTF16 code unit could be converted to 4 UTF8 bytes [sic: the table below gives 3 UTF8 bytes for this range]. For code points 0x10000~0x10FFFF, the UTF16 encoding is 2 code units, which is 4 bytes, and the corresponding UTF8 encoding is also 4 bytes. See the following table for encoding lengths.
	    Code point          UTF16     UTF8
	    0x0000-0x007F       2 bytes   1 byte
	    0x0080-0x07FF       2 bytes   2 bytes
	    0x0800-0xFFFF       2 bytes   3 bytes
	    0x10000-0x10FFFF    4 bytes   4 bytes
354 * FIXME(wfrichar): The original implementation had security issues. As a	354 *

355 * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542	355 * This function uses a simple state machine to convert UTF-16 char(s) to

356 * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix	356 * a code point. Once a code point is parsed out, the state machine throws

357 * this.	357 * out sequential UTF-8 chars in one time.

	358 *

	359 * Return: CGPT_OK --- all characters are converted successfully.

	360 * CGPT_FAILED --- convert error, i.e. output buffer is too short.

358 */	361 */

359 void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,	362 int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,

360 uint8_t *utf8, unsigned int maxoutput)	363 uint8_t *utf8, unsigned int maxoutput)

361 {	364 {

362 size_t s16idx, s8idx;	365 size_t s16idx, s8idx;

363 uint32_t utfchar;	366 uint32_t code_point;

	367 int code_point_ready = 1; // code point is ready to output.

	368 int retval = CGPT_OK;

364	369

365 if (!utf16 \|\| !maxinput \|\| !utf8 \|\| !maxoutput)	370 if (!utf16 \|\| !maxinput \|\| !utf8 \|\| !maxoutput)

366 return;	371 return CGPT_FAILED;

367	372

368 maxoutput--; /* plan for termination now */	373 maxoutput--; /* plan for termination now */

369	374

370 for (s16idx = s8idx = 0;	375 for (s16idx = s8idx = 0;

371 s16idx < maxinput && utf16[s16idx] && maxoutput;	376 s16idx < maxinput && utf16[s16idx] && maxoutput;

372 s16idx++, maxoutput--) {	377 s16idx++) {

373 utfchar = le16toh(utf16[s16idx]);	378 unsigned short codeunit = le16toh(utf16[s16idx]);
	Bill Richardson 2010/11/17 17:26:39: Shouldn't codeunit be uint16_t instead of unsigned short, just for consistency?
	Louis 2010/11/18 05:35:21 (replying to Bill Richardson's comment of 2010/11/17 17:26:39): Done. Good catch!
374 utf8[s8idx++] = utfchar & 0x7F;	379

	380 if (code_point_ready) {

	381 if (codeunit >= 0xD800 && codeunit <= 0xDBFF) {

	382 /* high surrogate, need the low surrogate. */

	383 code_point_ready = 0;

	384 code_point = (codeunit & 0x03FF) + 0x0040;

	385 } else {

	386 /* BMP char, output it. */

	387 code_point = codeunit;

	388 }

	389 } else {

	390 /* expect the low surrogate */

	391 if (codeunit >= 0xDC00 && codeunit <= 0xDFFF) {

	392 code_point = (code_point << 10) \| (codeunit & 0x03FF);

	393 code_point_ready = 1;

	394 } else {

	395 /* the second code unit is NOT the low surrogate. Unexpected. */

	396 retval = CGPT_FAILED;

	397 break;

	398 }

	399 }

	400

	401 /* If UTF code point is ready, output it. */

	402 if (code_point_ready) {

	403 require(code_point <= 0x10FFFF);

	404 if (code_point <= 0x7F && maxoutput >= 1) {
	Bill Richardson 2010/11/17 17:26:39: All these "maxoutput >=" tests should be "maxoutput >" instead, to leave room for the terminating '\0'.
	Louis 2010/11/18 05:35:21 (replying to Bill Richardson's comment of 2010/11/17 17:26:39): They don't need to, because the space was already reserved in line 373: maxoutput--; /* plan for termination now */
	405 maxoutput -= 1;

	406 utf8[s8idx++] = code_point & 0x7F;

	407 } else if (code_point <= 0x7FF && maxoutput >= 2) {

	408 maxoutput -= 2;

	409 utf8[s8idx++] = 0xC0 \| (code_point >> 6);

	410 utf8[s8idx++] = 0x80 \| (code_point & 0x3F);

	411 } else if (code_point <= 0xFFFF && maxoutput >= 3) {

	412 maxoutput -= 3;

	413 utf8[s8idx++] = 0xE0 \| (code_point >> 12);

	414 utf8[s8idx++] = 0x80 \| ((code_point >> 6) & 0x3F);

	415 utf8[s8idx++] = 0x80 \| (code_point & 0x3F);

	416 } else if (code_point <= 0x10FFFF && maxoutput >= 4) {

	417 maxoutput -= 4;

	418 utf8[s8idx++] = 0xF0 \| (code_point >> 18);

	419 utf8[s8idx++] = 0x80 \| ((code_point >> 12) & 0x3F);

	420 utf8[s8idx++] = 0x80 \| ((code_point >> 6) & 0x3F);

	421 utf8[s8idx++] = 0x80 \| (code_point & 0x3F);

	422 } else {

	423 /* buffer underrun */

	424 retval = CGPT_FAILED;

	425 break;

	426 }

	427 }

375 }	428 }

376 utf8[s8idx++] = 0;	429 utf8[s8idx++] = 0;

	430 return retval;

377 }	431 }

378	432

379 /* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated.	433 /* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated.

380 * Caller must prepare enough space for UTF16, including a terminating 0x0000.	434 * Caller must prepare enough space for UTF16, including a terminating 0x0000.

381 * FIXME(wfrichar): The original implementation had security issues. As a	435 *

382 * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542	436 * This function converts UTF8 chars to a code point first. Then, converts it

383 * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix	437 * to UTF16 code unit(s).

384 * this.	438 *

	439 * Return: CGPT_OK --- all characters are converted successfully.

	440 * CGPT_FAILED --- convert error, i.e. output buffer is too short.

385 */	441 */

386 void UTF8ToUTF16(const uint8_t utf8, uint16_t utf16, unsigned int maxoutput)	442 int UTF8ToUTF16(const uint8_t utf8, uint16_t utf16, unsigned int maxoutput)

387 {	443 {

388 size_t s16idx, s8idx;	444 size_t s16idx, s8idx;

389 uint32_t utfchar;	445 uint32_t code_point = 0;

	446 unsigned int need_more_code_unit = 0;

	447 int retval = CGPT_OK;

390	448

391 if (!utf8 \|\| !utf16 \|\| !maxoutput)	449 if (!utf8 \|\| !utf16 \|\| !maxoutput)

392 return;	450 return CGPT_FAILED;

393	451

394 maxoutput--; /* plan for termination */	452 maxoutput--; /* plan for termination */

395	453

396 for (s8idx = s16idx = 0;	454 for (s8idx = s16idx = 0;

397 utf8[s8idx] && maxoutput;	455 utf8[s8idx] && maxoutput;

398 s8idx++, maxoutput--) {	456 s8idx++) {

399 utfchar = utf8[s8idx];	457 unsigned char code_unit;
	Bill Richardson 2010/11/17 17:26:39: uint8_t instead of unsigned char?
	Louis 2010/11/18 05:35:21 (replying to Bill Richardson's comment of 2010/11/17 17:26:39): Done. Thanks again. My stupidness.
400 utf16[s16idx++] = utfchar & 0x7F;	458 code_unit = utf8[s8idx];

	459

	460 if (need_more_code_unit) {

	461 /* Trailing bytes of multi-byte character */

	462 if ((code_unit & 0xC0) == 0x80) {

	463 code_point = (code_point << 6) \| (code_unit & 0x3F);

	464 need_more_code_unit--;

	465 } else {

	466 /* Unexpected code unit. */

	467 retval = CGPT_FAILED;

	468 break;

	469 }

	470 } else {

	471 /* parsing a new code point. */

	472 if (code_unit <= 0x7F) {

	473 code_point = code_unit;

	474 } else if (code_unit <= 0xBF) {

	475 /* 0x80-0xBF must NOT be the heading byte unit of a new code point. */

	476 retval = CGPT_FAILED;

	477 break;
	Bill Richardson 2010/11/17 17:26:39: I don't think this handles all the valid input. For example, 0xF4 0x91 0x81 0x81 is invalid but I think it will pass here. You'll probably need to implement a case statement based on the value of need_more_code_unit in order to handle it correctly. Look carefully at the Second Byte column of Table 3-7 on page 93 of chapter 3 of the Unicode Standard Version 5.2 (http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf). It's not as regular as you might expect.
	Louis 2010/11/18 05:35:21 (replying to Bill Richardson's comment of 2010/11/17 17:26:39): Done. You are right. I changed need_more_code_unit into expected_units and decoded_units. Also checked: 1. shortest possible UTF8 sequence; 2. illegal UTF8 sequence; 3. surrogate; 4. invalid code point.
	478 } else if (code_unit >= 0xC2 && code_unit <= 0xDF) {

	479 code_point = code_unit & 0x1F;

	480 need_more_code_unit = 1;

	481 } else if (code_unit >= 0xE0 && code_unit <= 0xEF) {

	482 code_point = code_unit & 0x0F;

	483 need_more_code_unit = 2;

	484 } else if (code_unit >= 0xF0 && code_unit <= 0xF4) {

	485 code_point = code_unit & 0x07;

	486 need_more_code_unit = 3;

	487 } else {

	488 /* illegal code unit: 0xC0-0xC1, 0xF5-0xFF */

	489 retval = CGPT_FAILED;

	490 break;

	491 }

	492 }

	493

	494 /* If no more unit is needed, output the UTF16 unit(s). */

	495 if (!need_more_code_unit) {

	496 require(code_point <= 0x10FFFF);

	497 if (code_point <= 0xFFFF) {

	498 utf16[s16idx++] = code_point;

	499 maxoutput -= 1;

	500 } else if (code_point <= 0x10FFFF && maxoutput >= 2) {
	Bill Richardson 2010/11/17 17:26:39: maxoutput > 2, to account for the trailing \0000.
	Louis 2010/11/18 05:35:21 (replying to Bill Richardson's comment of 2010/11/17 17:26:39): In line 452, the space has been reserved.
	501 utf16[s16idx++] = 0xD800 \| ((code_point >> 10) - 0x0040);

	502 utf16[s16idx++] = 0xDC00 \| (code_point & 0x03FF);

	503 maxoutput -= 2;

	504 } else {

	505 /* buffer underrun */

	506 retval = CGPT_FAILED;

	507 break;

	508 }

	509 }

401 }	510 }

402 utf16[s16idx++] = 0;	511 utf16[s16idx++] = 0;

	512 return retval;

403 }	513 }

404	514

405 struct {	515 struct {

406 Guid type;	516 Guid type;

407 char *name;	517 char *name;

408 char *description;	518 char *description;

409 } supported_types[] = {	519 } supported_types[] = {

410 {GPT_ENT_TYPE_CHROMEOS_KERNEL, "kernel", "ChromeOS kernel"},	520 {GPT_ENT_TYPE_CHROMEOS_KERNEL, "kernel", "ChromeOS kernel"},

411 {GPT_ENT_TYPE_CHROMEOS_ROOTFS, "rootfs", "ChromeOS rootfs"},	521 {GPT_ENT_TYPE_CHROMEOS_ROOTFS, "rootfs", "ChromeOS rootfs"},

412 {GPT_ENT_TYPE_LINUX_DATA, "data", "Linux data"},	522 {GPT_ENT_TYPE_LINUX_DATA, "data", "Linux data"},

(...skipping 273 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
686	796

687 void PMBRToStr(struct pmbr pmbr, char str, unsigned int buflen) {	797 void PMBRToStr(struct pmbr pmbr, char str, unsigned int buflen) {

688 char buf[GUID_STRLEN];	798 char buf[GUID_STRLEN];

689 if (IsZero(&pmbr->boot_guid)) {	799 if (IsZero(&pmbr->boot_guid)) {

690 require(snprintf(str, buflen, "PMBR") < buflen);	800 require(snprintf(str, buflen, "PMBR") < buflen);

691 } else {	801 } else {

692 GuidToStr(&pmbr->boot_guid, buf, sizeof(buf));	802 GuidToStr(&pmbr->boot_guid, buf, sizeof(buf));

693 require(snprintf(str, buflen, "PMBR (Boot GUID: %s)", buf) < buflen);	803 require(snprintf(str, buflen, "PMBR (Boot GUID: %s)", buf) < buflen);

694 }	804 }

695 }	805 }

696

OLD	NEW

« no previous file with comments | « cgpt/cgpt.h ('k') | cgpt/cmd_add.c » ('j') | cgpt/cmd_add.c » ('J')