cgpt/cgpt_common.c - Issue 5025003: The right implementation of CGPT label conversion between UTF8 and UTF16.

Unified Diff: cgpt/cgpt_common.c

Issue 5025003: The right implementation of CGPT label conversion between UTF8 and UTF16. (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/vboot_reference.git

Patch Set: Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: cgpt/cgpt_common.c

diff --git a/cgpt/cgpt_common.c b/cgpt/cgpt_common.c

index 4b9a5a141dce0b847ab7a4b58549a241c26f9fed..f5babe605b274d4a9e47ea5baa419181c65e064e 100644

--- a/cgpt/cgpt_common.c

+++ b/cgpt/cgpt_common.c

@@ -351,55 +351,165 @@ void GuidToStr(const Guid *guid, char *str, unsigned int buflen) {

/* Convert possibly unterminated UTF16 string to UTF8.

* Caller must prepare enough space for UTF8, which could be up to

* twice the number of UTF16 chars plus the terminating '\0'.

Bill Richardson 2010/11/17 17:26:39 I think this size bound is wrong. It should be "th

Louis 2010/11/18 05:35:21 Hm... my initial idea should be "UTF16 bytes", in

- * FIXME(wfrichar): The original implementation had security issues. As a

- * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542

- * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix

- * this.

+ *

+ * This function uses a simple state meachine to convert UTF-16 char(s) to

+ * a code point. Once a code point is parsed out, the state machine throws

+ * out sequencial UTF-8 chars in one time.

+ *

+ * Return: CGPT_OK --- all character are converted successfully.

+ * CGPT_FAILED --- convert error, i.e. output buffer is too short.

-void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,

- uint8_t *utf8, unsigned int maxoutput)

+int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,

+ uint8_t *utf8, unsigned int maxoutput)

{

size_t s16idx, s8idx;

- uint32_t utfchar;

+ uint32_t code_point;

+ int code_point_ready = 1; // code point is ready to output.

+ int retval = CGPT_OK;

if (!utf16 || !maxinput || !utf8 || !maxoutput)

- return;

+ return CGPT_FAILED;

maxoutput--; /* plan for termination now */

for (s16idx = s8idx = 0;

s16idx < maxinput && utf16[s16idx] && maxoutput;

- s16idx++, maxoutput--) {

- utfchar = le16toh(utf16[s16idx]);

- utf8[s8idx++] = utfchar & 0x7F;

+ s16idx++) {

+ unsigned short codeunit = le16toh(utf16[s16idx]);

Bill Richardson 2010/11/17 17:26:39 Shouldn't codeunit be uint16_t instead of unsigned

Louis 2010/11/18 05:35:21 Done. Good catch! On 2010/11/17 17:26:39, Bill Ric

+ if (code_point_ready) {

+ if (codeunit >= 0xD800 && codeunit <= 0xDBFF) {

+ /* high surrogate, need the low surrogate. */

+ code_point_ready = 0;

+ code_point = (codeunit & 0x03FF) + 0x0040;

+ } else {

+ /* BMP char, output it. */

+ code_point = codeunit;

+ }

+ } else {

+ /* expect the low surrogate */

+ if (codeunit >= 0xDC00 && codeunit <= 0xDFFF) {

+ code_point = (code_point << 10) | (codeunit & 0x03FF);

+ code_point_ready = 1;

+ } else {

+ /* the second code unit is NOT the low surrogate. Unexpected. */

+ retval = CGPT_FAILED;

+ break;

+ }

+ /* If UTF code point is ready, output it. */

+ if (code_point_ready) {

+ require(code_point <= 0x10FFFF);

+ if (code_point <= 0x7F && maxoutput >= 1) {

Bill Richardson 2010/11/17 17:26:39 All these "maxoutput >=" tests should be "maxoutpu

Louis 2010/11/18 05:35:21 They don't because in line 373, the space was rese

+ maxoutput -= 1;

+ utf8[s8idx++] = code_point & 0x7F;

+ } else if (code_point <= 0x7FF && maxoutput >= 2) {

+ maxoutput -= 2;

+ utf8[s8idx++] = 0xC0 | (code_point >> 6);

+ utf8[s8idx++] = 0x80 | (code_point & 0x3F);

+ } else if (code_point <= 0xFFFF && maxoutput >= 3) {

+ maxoutput -= 3;

+ utf8[s8idx++] = 0xE0 | (code_point >> 12);

+ utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F);

+ utf8[s8idx++] = 0x80 | (code_point & 0x3F);

+ } else if (code_point <= 0x10FFFF && maxoutput >= 4) {

+ maxoutput -= 4;

+ utf8[s8idx++] = 0xF0 | (code_point >> 18);

+ utf8[s8idx++] = 0x80 | ((code_point >> 12) & 0x3F);

+ utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F);

+ utf8[s8idx++] = 0x80 | (code_point & 0x3F);

+ } else {

+ /* buffer underrun */

+ retval = CGPT_FAILED;

+ break;

+ }

}

utf8[s8idx++] = 0;

+ return retval;

}

/* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated.

* Caller must prepare enough space for UTF16, including a terminating 0x0000.

- * FIXME(wfrichar): The original implementation had security issues. As a

- * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542

- * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix

- * this.

+ *

+ * This function converts UTF8 chars to a code point first. Then, convrts it

+ * to UTF16 code unit(s).

+ *

+ * Return: CGPT_OK --- all character are converted successfully.

+ * CGPT_FAILED --- convert error, i.e. output buffer is too short.

-void UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput)

+int UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput)

{

size_t s16idx, s8idx;

- uint32_t utfchar;

+ uint32_t code_point = 0;

+ unsigned int need_more_code_unit = 0;

+ int retval = CGPT_OK;

if (!utf8 || !utf16 || !maxoutput)

- return;

+ return CGPT_FAILED;

maxoutput--; /* plan for termination */

for (s8idx = s16idx = 0;

utf8[s8idx] && maxoutput;

- s8idx++, maxoutput--) {

- utfchar = utf8[s8idx];

- utf16[s16idx++] = utfchar & 0x7F;

+ s8idx++) {

+ unsigned char code_unit;

Bill Richardson 2010/11/17 17:26:39 uint8_t instead of unsigned char ?

Louis 2010/11/18 05:35:21 Done. Thanks again. My stupidness. On 2010/11/17

+ code_unit = utf8[s8idx];

+ if (need_more_code_unit) {

+ /* Trailing bytes of multi-byte character */

+ if ((code_unit & 0xC0) == 0x80) {

+ code_point = (code_point << 6) | (code_unit & 0x3F);

+ need_more_code_unit--;

+ } else {

+ /* Unexpected code unit. */

+ retval = CGPT_FAILED;

+ break;

+ }

+ } else {

+ /* parsing a new code point. */

+ if (code_unit <= 0x7F) {

+ code_point = code_unit;

+ } else if (code_unit <= 0xBF) {

+ /* 0x80-0xBF must NOT be the heading byte unit of a new code point. */

+ retval = CGPT_FAILED;

+ break;

Bill Richardson 2010/11/17 17:26:39 I don't think this handles all the valid input. Fo

Louis 2010/11/18 05:35:21 Done. You are right. I changed the need_more_code_

+ } else if (code_unit >= 0xC2 && code_unit <= 0xDF) {

+ code_point = code_unit & 0x1F;

+ need_more_code_unit = 1;

+ } else if (code_unit >= 0xE0 && code_unit <= 0xEF) {

+ code_point = code_unit & 0x0F;

+ need_more_code_unit = 2;

+ } else if (code_unit >= 0xF0 && code_unit <= 0xF4) {

+ code_point = code_unit & 0x07;

+ need_more_code_unit = 3;

+ } else {

+ /* illegal code unit: 0xC0-0xC1, 0xF5-0xFF */

+ retval = CGPT_FAILED;

+ break;

+ }

+ /* If no more unit is needed, output the UTF16 unit(s). */

+ if (!need_more_code_unit) {

+ require(code_point <= 0x10FFFF);

+ if (code_point <= 0xFFFF) {

+ utf16[s16idx++] = code_point;

+ maxoutput -= 1;

+ } else if (code_point <= 0x10FFFF && maxoutput >= 2) {

Bill Richardson 2010/11/17 17:26:39 maxoutput > 2, to account for the trailing \0000.

Louis 2010/11/18 05:35:21 In line 452, the space has been reserved. On 2010

+ utf16[s16idx++] = 0xD800 | ((code_point >> 10) - 0x0040);

+ utf16[s16idx++] = 0xDC00 | (code_point & 0x03FF);

+ maxoutput -= 2;

+ } else {

+ /* buffer underrun */

+ retval = CGPT_FAILED;

+ break;

+ }

}

utf16[s16idx++] = 0;

+ return retval;

}

struct {

@@ -693,4 +803,3 @@ void PMBRToStr(struct pmbr *pmbr, char *str, unsigned int buflen) {

require(snprintf(str, buflen, "PMBR (Boot GUID: %s)", buf) < buflen);

}

« no previous file with comments | « cgpt/cgpt.h ('k') | cgpt/cmd_add.c » ('j') | cgpt/cmd_add.c » ('J')