| Index: cgpt/cgpt_common.c
|
| diff --git a/cgpt/cgpt_common.c b/cgpt/cgpt_common.c
|
| index 0e466fdc55085a113c81bf261d6447167595e734..52cbe70c9b445f52fca09405f084b10d6bca3d90 100644
|
| --- a/cgpt/cgpt_common.c
|
| +++ b/cgpt/cgpt_common.c
|
| @@ -350,56 +350,209 @@ void GuidToStr(const Guid *guid, char *str, unsigned int buflen) {
|
|
|
| /* Convert possibly unterminated UTF16 string to UTF8.
|
| * Caller must prepare enough space for UTF8, which could be up to
|
| - * twice the number of UTF16 chars plus the terminating '\0'.
|
| - * FIXME(wfrichar): The original implementation had security issues. As a
|
| - * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542
|
| - * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix
|
| - * this.
|
| + * twice the byte length of UTF16 string plus the terminating '\0'.
|
| + * See the following table for encoding lengths.
|
| + *
|
| + * Code point UTF16 UTF8
|
| + * 0x0000-0x007F 2 bytes 1 byte
|
| + * 0x0080-0x07FF 2 bytes 2 bytes
|
| + * 0x0800-0xFFFF 2 bytes 3 bytes
|
| + * 0x10000-0x10FFFF 4 bytes 4 bytes
|
| + *
|
| + * This function uses a simple state meachine to convert UTF-16 char(s) to
|
| + * a code point. Once a code point is parsed out, the state machine throws
|
| + * out sequencial UTF-8 chars in one time.
|
| + *
|
| + * Return: CGPT_OK --- all character are converted successfully.
|
| + * CGPT_FAILED --- convert error, i.e. output buffer is too short.
|
| */
|
| -void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,
|
| - uint8_t *utf8, unsigned int maxoutput)
|
| +int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,
|
| + uint8_t *utf8, unsigned int maxoutput)
|
| {
|
| size_t s16idx, s8idx;
|
| - uint32_t utfchar;
|
| + uint32_t code_point;
|
| + int code_point_ready = 1; // code point is ready to output.
|
| + int retval = CGPT_OK;
|
|
|
| if (!utf16 || !maxinput || !utf8 || !maxoutput)
|
| - return;
|
| + return CGPT_FAILED;
|
|
|
| maxoutput--; /* plan for termination now */
|
|
|
| for (s16idx = s8idx = 0;
|
| s16idx < maxinput && utf16[s16idx] && maxoutput;
|
| - s16idx++, maxoutput--) {
|
| - utfchar = le16toh(utf16[s16idx]);
|
| - utf8[s8idx++] = utfchar & 0x7F;
|
| + s16idx++) {
|
| + uint16_t codeunit = le16toh(utf16[s16idx]);
|
| +
|
| + if (code_point_ready) {
|
| + if (codeunit >= 0xD800 && codeunit <= 0xDBFF) {
|
| + /* high surrogate, need the low surrogate. */
|
| + code_point_ready = 0;
|
| + code_point = (codeunit & 0x03FF) + 0x0040;
|
| + } else {
|
| + /* BMP char, output it. */
|
| + code_point = codeunit;
|
| + }
|
| + } else {
|
| + /* expect the low surrogate */
|
| + if (codeunit >= 0xDC00 && codeunit <= 0xDFFF) {
|
| + code_point = (code_point << 10) | (codeunit & 0x03FF);
|
| + code_point_ready = 1;
|
| + } else {
|
| + /* the second code unit is NOT the low surrogate. Unexpected. */
|
| + retval = CGPT_FAILED;
|
| + break;
|
| + }
|
| + }
|
| +
|
| + /* If UTF code point is ready, output it. */
|
| + if (code_point_ready) {
|
| + require(code_point <= 0x10FFFF);
|
| + if (code_point <= 0x7F && maxoutput >= 1) {
|
| + maxoutput -= 1;
|
| + utf8[s8idx++] = code_point & 0x7F;
|
| + } else if (code_point <= 0x7FF && maxoutput >= 2) {
|
| + maxoutput -= 2;
|
| + utf8[s8idx++] = 0xC0 | (code_point >> 6);
|
| + utf8[s8idx++] = 0x80 | (code_point & 0x3F);
|
| + } else if (code_point <= 0xFFFF && maxoutput >= 3) {
|
| + maxoutput -= 3;
|
| + utf8[s8idx++] = 0xE0 | (code_point >> 12);
|
| + utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F);
|
| + utf8[s8idx++] = 0x80 | (code_point & 0x3F);
|
| + } else if (code_point <= 0x10FFFF && maxoutput >= 4) {
|
| + maxoutput -= 4;
|
| + utf8[s8idx++] = 0xF0 | (code_point >> 18);
|
| + utf8[s8idx++] = 0x80 | ((code_point >> 12) & 0x3F);
|
| + utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F);
|
| + utf8[s8idx++] = 0x80 | (code_point & 0x3F);
|
| + } else {
|
| + /* buffer underrun */
|
| + retval = CGPT_FAILED;
|
| + break;
|
| + }
|
| + }
|
| }
|
| utf8[s8idx++] = 0;
|
| + return retval;
|
| }
|
|
|
| /* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated.
|
| * Caller must prepare enough space for UTF16, including a terminating 0x0000.
|
| - * FIXME(wfrichar): The original implementation had security issues. As a
|
| - * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542
|
| - * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix
|
| - * this.
|
| + * See the following table for encoding lengths. In any case, the caller
|
| + * just needs to prepare the byte length of UTF8 plus the terminating 0x0000.
|
| + *
|
| + * Code point UTF16 UTF8
|
| + * 0x0000-0x007F 2 bytes 1 byte
|
| + * 0x0080-0x07FF 2 bytes 2 bytes
|
| + * 0x0800-0xFFFF 2 bytes 3 bytes
|
| + * 0x10000-0x10FFFF 4 bytes 4 bytes
|
| + *
|
| + * This function converts UTF8 chars to a code point first. Then, convrts it
|
| + * to UTF16 code unit(s).
|
| + *
|
| + * Return: CGPT_OK --- all character are converted successfully.
|
| + * CGPT_FAILED --- convert error, i.e. output buffer is too short.
|
| */
|
| -void UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput)
|
| +int UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput)
|
| {
|
| size_t s16idx, s8idx;
|
| - uint32_t utfchar;
|
| + uint32_t code_point = 0;
|
| + unsigned int expected_units = 1;
|
| + unsigned int decoded_units = 1;
|
| + int retval = CGPT_OK;
|
|
|
| if (!utf8 || !utf16 || !maxoutput)
|
| - return;
|
| + return CGPT_FAILED;
|
|
|
| maxoutput--; /* plan for termination */
|
|
|
| for (s8idx = s16idx = 0;
|
| utf8[s8idx] && maxoutput;
|
| - s8idx++, maxoutput--) {
|
| - utfchar = utf8[s8idx];
|
| - utf16[s16idx++] = utfchar & 0x7F;
|
| + s8idx++) {
|
| + uint8_t code_unit;
|
| + code_unit = utf8[s8idx];
|
| +
|
| + if (expected_units != decoded_units) {
|
| + /* Trailing bytes of multi-byte character */
|
| + if ((code_unit & 0xC0) == 0x80) {
|
| + code_point = (code_point << 6) | (code_unit & 0x3F);
|
| + ++decoded_units;
|
| + } else {
|
| + /* Unexpected code unit. */
|
| + retval = CGPT_FAILED;
|
| + break;
|
| + }
|
| + } else {
|
| + /* parsing a new code point. */
|
| + decoded_units = 1;
|
| + if (code_unit <= 0x7F) {
|
| + code_point = code_unit;
|
| + expected_units = 1;
|
| + } else if (code_unit <= 0xBF) {
|
| + /* 0x80-0xBF must NOT be the heading byte unit of a new code point. */
|
| + retval = CGPT_FAILED;
|
| + break;
|
| + } else if (code_unit >= 0xC2 && code_unit <= 0xDF) {
|
| + code_point = code_unit & 0x1F;
|
| + expected_units = 2;
|
| + } else if (code_unit >= 0xE0 && code_unit <= 0xEF) {
|
| + code_point = code_unit & 0x0F;
|
| + expected_units = 3;
|
| + } else if (code_unit >= 0xF0 && code_unit <= 0xF4) {
|
| + code_point = code_unit & 0x07;
|
| + expected_units = 4;
|
| + } else {
|
| + /* illegal code unit: 0xC0-0xC1, 0xF5-0xFF */
|
| + retval = CGPT_FAILED;
|
| + break;
|
| + }
|
| + }
|
| +
|
| + /* If no more unit is needed, output the UTF16 unit(s). */
|
| + if (expected_units == decoded_units) {
|
| + /* Check if the encoding is the shortest possible UTF-8 sequence. */
|
| + switch (expected_units) {
|
| + case 2:
|
| + if (code_point <= 0x7F) retval = CGPT_FAILED;
|
| + break;
|
| + case 3:
|
| + if (code_point <= 0x7FF) retval = CGPT_FAILED;
|
| + break;
|
| + case 4:
|
| + if (code_point <= 0xFFFF) retval = CGPT_FAILED;
|
| + break;
|
| + }
|
| + if (retval == CGPT_FAILED) break; /* leave immediately */
|
| +
|
| + if ((code_point <= 0xD7FF) ||
|
| + (code_point >= 0xE000 && code_point <= 0xFFFF)) {
|
| + utf16[s16idx++] = code_point;
|
| + maxoutput -= 1;
|
| + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF &&
|
| + maxoutput >= 2) {
|
| + utf16[s16idx++] = 0xD800 | ((code_point >> 10) - 0x0040);
|
| + utf16[s16idx++] = 0xDC00 | (code_point & 0x03FF);
|
| + maxoutput -= 2;
|
| + } else {
|
| + /* Three possibilities fall into here. Both are failure cases.
|
| + * a. surrogate pair (non-BMP characters; 0xD800~0xDFFF)
|
| + * b. invalid code point > 0x10FFFF
|
| + * c. buffer underrun
|
| + */
|
| + retval = CGPT_FAILED;
|
| + break;
|
| + }
|
| + }
|
| }
|
| +
|
| + /* A null-terminator shows up before the UTF8 sequence ends. */
|
| + if (expected_units != decoded_units) {
|
| + retval = CGPT_FAILED;
|
| + }
|
| +
|
| utf16[s16idx++] = 0;
|
| + return retval;
|
| }
|
|
|
| struct {
|
|
|