Chromium Code Reviews| Index: src/unicode.cc |
| diff --git a/src/unicode.cc b/src/unicode.cc |
| index 0d0d63d1775660018b57e4e3e107110371a55661..519750505df8563d8599f8e2ef3a9d3face36c3a 100644 |
| --- a/src/unicode.cc |
| +++ b/src/unicode.cc |
| @@ -190,71 +190,103 @@ static int LookupMapping(const int32_t* table, |
| } |
| -uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { |
| - // We only get called for non-ASCII characters. |
| - if (length == 1) { |
| - *cursor += 1; |
| - return kBadChar; |
| - } |
| - byte first = str[0]; |
| - byte second = str[1] ^ 0x80; |
| - if (second & 0xC0) { |
| +static inline size_t NonASCIISequenceLength(byte first) { | // Returns lengths[first], the table entry selected by the byte value first.
| + static const uint8_t lengths[256] = { | // The rows below hold 24 entries each, so row starts do not fall on multiples of 16.
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | // Indices 0x00..0x7F hold 0. The run of 2s begins at index 0x80, so it also covers 0x80..0xC1, which the two-byte branch of Utf8::CalculateValue rejects through its str[0] < 0xC2 check.
| + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | // The 2s end at index 0xDF; indices 0xE0..0xEF hold 3.
| + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // Indices 0xF0..0xF4 hold 4; indices 0xF5..0xFF hold 0.

vogelheim
2015/05/21 16:58:38
The table is difficult to read. It also leaves me
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
I updated the matrix to be 16 x 16, and added comm
|
| + return lengths[first]; |
| +} |
| + |
| + |
| +uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { // Decodes the multi-byte sequence starting at str[0], which is DCHECKed to have its high bit set. On success *cursor advances by the sequence length and the decoded value is returned. On any failure, including a sequence longer than max_length, *cursor advances by 1 and kBadChar is returned.

vogelheim
2015/05/21 16:58:38
This might also benefit from a unit test that will
vogelheim
2015/05/21 16:58:38
I believe this deserves some commentary, and if on
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
yeah, actually, it's supposed to be consistent wit
|
| + DCHECK((str[0] & 0x80) == 0x80); |
| + size_t length = NonASCIISequenceLength(str[0]); |
| + if (length == 0 || max_length < length) { |
| *cursor += 1; |
| return kBadChar; |
| } |
| - if (first < 0xE0) { |
| - if (first < 0xC0) { |
| + if (length == 2) {

vogelheim
2015/05/21 16:58:38
I was trying to figure out *why* these characters
vogelheim
2015/05/21 16:58:38
I find the code below to be somewhat confusing. If
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
right. It's just that UTF-8 cannot encode all of u
|
| + DCHECK(str[0] <= 0xDF); |
| + if (str[0] < 0xC2) { | // Rejects first bytes 0x80..0xC1. With 0xC0 or 0xC1 the formula below could only produce values up to 0x7F.
| *cursor += 1; |
| return kBadChar; |
| } |
| - uchar code_point = ((first << 6) | second) & kMaxTwoByteChar; |
| - if (code_point <= kMaxOneByteChar) { |
| + if (str[1] < 0x80 || str[1] > 0xBF) { | // The second byte must lie in 0x80..0xBF.
| *cursor += 1; |
| return kBadChar; |
| } |
| *cursor += 2; |
| - return code_point; |
| - } |
| - if (length == 2) { |
| - *cursor += 1; |
| - return kBadChar; |
| - } |
| - byte third = str[2] ^ 0x80; |
| - if (third & 0xC0) { |
| - *cursor += 1; |
| - return kBadChar; |
| + return ((str[0] << 6) + str[1]) - 0x00003080; | // 0x3080 == (0xC0 << 6) + 0x80, the combined contribution of the marker bits of both bytes.
| } |
| - if (first < 0xF0) { |
| - uchar code_point = ((((first << 6) | second) << 6) | third) |
| - & kMaxThreeByteChar; |
| - if (code_point <= kMaxTwoByteChar) { |
| + if (length == 3) { |
| + DCHECK(str[0] >= 0xE0 && str[0] <= 0xEF); |
| + switch (str[0]) { | // The allowed range of the second byte depends on the first byte.
| + case 0xE0: |
| + if (str[1] < 0xA0 || str[1] > 0xBF) { | // After 0xE0 only 0xA0..0xBF pass; a lower second byte would decode to a value below 0x800.
| + *cursor += 1; |
| + return kBadChar; |
| + } |
| + break; |
| + case 0xED: |
| + if (str[1] < 0x80 || str[1] > 0x9F) { | // After 0xED only 0x80..0x9F pass; a higher second byte would decode into 0xD800..0xDFFF, the UTF-16 surrogate range.
| + *cursor += 1; |
| + return kBadChar; |
| + } |
| + break; |
| + default: |
| + if (str[1] < 0x80 || str[1] > 0xBF) { |
| + *cursor += 1; |
| + return kBadChar; |
| + } |
| + } |
| + if (str[2] < 0x80 || str[2] > 0xBF) { |
| *cursor += 1; |
| return kBadChar; |
| } |
| *cursor += 3; |
| - return code_point; |
| + return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; | // 0xE2080 == (0xE0 << 12) + (0x80 << 6) + 0x80.
| } |
| - if (length == 3) { |
| + DCHECK(length == 4); |
| + DCHECK(str[0] >= 0xF0 && str[0] <= 0xF4); |
| + switch (str[0]) { | // Same pattern as the three-byte case: the second byte range depends on the first byte.
| + case 0xF0: |
| + if (str[1] < 0x90 || str[1] > 0xBF) { | // After 0xF0 only 0x90..0xBF pass; a lower second byte would decode to a value below 0x10000.
| + *cursor += 1; |
| + return kBadChar; |
| + } |
| + break; |
| + case 0xF4: |
| + if (str[1] < 0x80 || str[1] > 0x8F) { | // After 0xF4 only 0x80..0x8F pass; a higher second byte would decode to a value above 0x10FFFF.
| + *cursor += 1; |
| + return kBadChar; |
| + } |
| + break; |
| + default: |
| + if (str[1] < 0x80 || str[1] > 0xBF) { |
| + *cursor += 1; |
| + return kBadChar; |
| + } |
| + } |
| + if (str[2] < 0x80 || str[2] > 0xBF) { |
| *cursor += 1; |
| return kBadChar; |
| } |
| - byte fourth = str[3] ^ 0x80; |
| - if (fourth & 0xC0) { |
| + if (str[3] < 0x80 || str[3] > 0xBF) { |
| *cursor += 1; |
| return kBadChar; |
| } |
| - if (first < 0xF8) { |
| - uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) |
| - & kMaxFourByteChar; |
| - if (code_point <= kMaxThreeByteChar) { |
| - *cursor += 1; |
| - return kBadChar; |
| - } |
| - *cursor += 4; |
| - return code_point; |
| - } |
| - *cursor += 1; |
| - return kBadChar; |
| + *cursor += 4; |
| + return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
| + 0x03C82080; | // 0x03C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80.
| } |