Chromium Code Reviews| Index: cgpt/cgpt_common.c |
| diff --git a/cgpt/cgpt_common.c b/cgpt/cgpt_common.c |
| index 4b9a5a141dce0b847ab7a4b58549a241c26f9fed..f5babe605b274d4a9e47ea5baa419181c65e064e 100644 |
| --- a/cgpt/cgpt_common.c |
| +++ b/cgpt/cgpt_common.c |
| @@ -351,55 +351,165 @@ void GuidToStr(const Guid *guid, char *str, unsigned int buflen) { |
| /* Convert possibly unterminated UTF16 string to UTF8. |
| * Caller must prepare enough space for UTF8, which could be up to |
| * three times the number of UTF16 code units plus the terminating '\0'. |
|
Bill Richardson
2010/11/17 17:26:39
I think this size bound is wrong. It should be "th
Louis
2010/11/18 05:35:21
Hm... my initial idea should be "UTF16 bytes", in
|
| - * FIXME(wfrichar): The original implementation had security issues. As a |
| - * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542 |
| - * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix |
| - * this. |
| + * |
| + * This function uses a simple state machine to convert UTF-16 code unit(s) to |
| + * a code point. Once a code point is parsed out, the state machine emits |
| + * the corresponding sequence of UTF-8 bytes all at once. |
| + * |
| + * Return: CGPT_OK --- all characters were converted successfully. |
| + * CGPT_FAILED --- conversion error, e.g. output buffer is too short. |
| */ |
| -void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput, |
| - uint8_t *utf8, unsigned int maxoutput) |
| +int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput, |
| + uint8_t *utf8, unsigned int maxoutput) |
| { |
| size_t s16idx, s8idx; |
| - uint32_t utfchar; |
| + uint32_t code_point; |
| + int code_point_ready = 1; // code point is ready to output. |
| + int retval = CGPT_OK; |
| if (!utf16 || !maxinput || !utf8 || !maxoutput) |
| - return; |
| + return CGPT_FAILED; |
| maxoutput--; /* plan for termination now */ |
| for (s16idx = s8idx = 0; |
| s16idx < maxinput && utf16[s16idx] && maxoutput; |
| - s16idx++, maxoutput--) { |
| - utfchar = le16toh(utf16[s16idx]); |
| - utf8[s8idx++] = utfchar & 0x7F; |
| + s16idx++) { |
| + unsigned short codeunit = le16toh(utf16[s16idx]); |
|
Bill Richardson
2010/11/17 17:26:39
Shouldn't codeunit be uint16_t instead of unsigned
Louis
2010/11/18 05:35:21
Done. Good catch!
On 2010/11/17 17:26:39, Bill Ric
|
| + |
| + if (code_point_ready) { |
| + if (codeunit >= 0xD800 && codeunit <= 0xDBFF) { |
| + /* high surrogate, need the low surrogate. */ |
| + code_point_ready = 0; |
| + code_point = (codeunit & 0x03FF) + 0x0040; |
| + } else { |
| + /* BMP char, output it. */ |
| + code_point = codeunit; |
| + } |
| + } else { |
| + /* expect the low surrogate */ |
| + if (codeunit >= 0xDC00 && codeunit <= 0xDFFF) { |
| + code_point = (code_point << 10) | (codeunit & 0x03FF); |
| + code_point_ready = 1; |
| + } else { |
| + /* the second code unit is NOT the low surrogate. Unexpected. */ |
| + retval = CGPT_FAILED; |
| + break; |
| + } |
| + } |
| + |
| + /* If UTF code point is ready, output it. */ |
| + if (code_point_ready) { |
| + require(code_point <= 0x10FFFF); |
| + if (code_point <= 0x7F && maxoutput >= 1) { |
|
Bill Richardson
2010/11/17 17:26:39
All these "maxoutput >=" tests should be "maxoutpu
Louis
2010/11/18 05:35:21
They don't because in line 373, the space was rese
|
| + maxoutput -= 1; |
| + utf8[s8idx++] = code_point & 0x7F; |
| + } else if (code_point <= 0x7FF && maxoutput >= 2) { |
| + maxoutput -= 2; |
| + utf8[s8idx++] = 0xC0 | (code_point >> 6); |
| + utf8[s8idx++] = 0x80 | (code_point & 0x3F); |
| + } else if (code_point <= 0xFFFF && maxoutput >= 3) { |
| + maxoutput -= 3; |
| + utf8[s8idx++] = 0xE0 | (code_point >> 12); |
| + utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F); |
| + utf8[s8idx++] = 0x80 | (code_point & 0x3F); |
| + } else if (code_point <= 0x10FFFF && maxoutput >= 4) { |
| + maxoutput -= 4; |
| + utf8[s8idx++] = 0xF0 | (code_point >> 18); |
| + utf8[s8idx++] = 0x80 | ((code_point >> 12) & 0x3F); |
| + utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F); |
| + utf8[s8idx++] = 0x80 | (code_point & 0x3F); |
| + } else { |
| + /* output buffer is too small */ |
| + retval = CGPT_FAILED; |
| + break; |
| + } |
| + } |
| } |
| utf8[s8idx++] = 0; |
| + return retval; |
| } |
| /* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated. |
| * Caller must prepare enough space for UTF16, including a terminating 0x0000. |
| - * FIXME(wfrichar): The original implementation had security issues. As a |
| - * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542 |
| - * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix |
| - * this. |
| + * |
| + * This function first converts UTF8 bytes to a code point, then converts it |
| + * to UTF16 code unit(s). |
| + * |
| + * Return: CGPT_OK --- all characters were converted successfully. |
| + * CGPT_FAILED --- conversion error, e.g. output buffer is too short. |
| */ |
| -void UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput) |
| +int UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput) |
| { |
| size_t s16idx, s8idx; |
| - uint32_t utfchar; |
| + uint32_t code_point = 0; |
| + unsigned int need_more_code_unit = 0; |
| + int retval = CGPT_OK; |
| if (!utf8 || !utf16 || !maxoutput) |
| - return; |
| + return CGPT_FAILED; |
| maxoutput--; /* plan for termination */ |
| for (s8idx = s16idx = 0; |
| utf8[s8idx] && maxoutput; |
| - s8idx++, maxoutput--) { |
| - utfchar = utf8[s8idx]; |
| - utf16[s16idx++] = utfchar & 0x7F; |
| + s8idx++) { |
| + unsigned char code_unit; |
|
Bill Richardson
2010/11/17 17:26:39
uint8_t instead of unsigned char ?
Louis
2010/11/18 05:35:21
Done. Thanks again. My stupidness.
On 2010/11/17
|
| + code_unit = utf8[s8idx]; |
| + |
| + if (need_more_code_unit) { |
| + /* Trailing bytes of multi-byte character */ |
| + if ((code_unit & 0xC0) == 0x80) { |
| + code_point = (code_point << 6) | (code_unit & 0x3F); |
| + need_more_code_unit--; |
| + } else { |
| + /* Unexpected code unit. */ |
| + retval = CGPT_FAILED; |
| + break; |
| + } |
| + } else { |
| + /* parsing a new code point. */ |
| + if (code_unit <= 0x7F) { |
| + code_point = code_unit; |
| + } else if (code_unit <= 0xBF) { |
| + /* 0x80-0xBF must NOT be the leading byte of a new code point. */ |
| + retval = CGPT_FAILED; |
| + break; |
|
Bill Richardson
2010/11/17 17:26:39
I don't think this handles all the valid input. Fo
Louis
2010/11/18 05:35:21
Done. You are right. I changed the need_more_code_
|
| + } else if (code_unit >= 0xC2 && code_unit <= 0xDF) { |
| + code_point = code_unit & 0x1F; |
| + need_more_code_unit = 1; |
| + } else if (code_unit >= 0xE0 && code_unit <= 0xEF) { |
| + code_point = code_unit & 0x0F; |
| + need_more_code_unit = 2; |
| + } else if (code_unit >= 0xF0 && code_unit <= 0xF4) { |
| + code_point = code_unit & 0x07; |
| + need_more_code_unit = 3; |
| + } else { |
| + /* illegal code unit: 0xC0-0xC1, 0xF5-0xFF */ |
| + retval = CGPT_FAILED; |
| + break; |
| + } |
| + } |
| + |
| + /* If no more unit is needed, output the UTF16 unit(s). */ |
| + if (!need_more_code_unit) { |
| + require(code_point <= 0x10FFFF); |
| + if (code_point <= 0xFFFF) { |
| + utf16[s16idx++] = code_point; |
| + maxoutput -= 1; |
| + } else if (code_point <= 0x10FFFF && maxoutput >= 2) { |
|
Bill Richardson
2010/11/17 17:26:39
maxoutput > 2, to account for the trailing \0000.
Louis
2010/11/18 05:35:21
In line 452, the space has been reserved.
On 2010
|
| + utf16[s16idx++] = 0xD800 | ((code_point >> 10) - 0x0040); |
| + utf16[s16idx++] = 0xDC00 | (code_point & 0x03FF); |
| + maxoutput -= 2; |
| + } else { |
| + /* output buffer is too small */ |
| + retval = CGPT_FAILED; |
| + break; |
| + } |
| + } |
| } |
| utf16[s16idx++] = 0; |
| + return retval; |
| } |
| struct { |
| @@ -693,4 +803,3 @@ void PMBRToStr(struct pmbr *pmbr, char *str, unsigned int buflen) { |
| require(snprintf(str, buflen, "PMBR (Boot GUID: %s)", buf) < buflen); |
| } |
| } |
| - |