Index: src/unicode.cc |
diff --git a/src/unicode.cc b/src/unicode.cc |
index 0d0d63d1775660018b57e4e3e107110371a55661..df45697bde2a32f2550ef5a99757dd11620fd56f 100644 |
--- a/src/unicode.cc |
+++ b/src/unicode.cc |
@@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table, |
} |
-uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { |
- // We only get called for non-ASCII characters. |
- if (length == 1) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- byte first = str[0]; |
- byte second = str[1] ^ 0x80; |
- if (second & 0xC0) { |
+static inline size_t NonASCIISequenceLength(byte first) { |
+ // clang-format off |
+ static const uint8_t lengths[256] = { |
+ // The first 128 entries correspond to ASCII characters. |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ // The following 64 entries correspond to continuation bytes. |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
+ // The next are two invalid overlong encodings and 30 two-byte sequences. |
+ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
+ // 16 three-byte sequences. |
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
+ // 5 four-byte sequences, followed by sequences that could only encode |
+ // code points outside of the unicode range. |
+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
+ // clang-format on |
+ return lengths[first]; |
+} |
+ |
+ |
+static inline bool IsContinuationCharacter(byte chr) { |
+ return chr >= 0x80 && chr <= 0xBF; |
+} |
+ |
+ |
+// This method decodes an UTF-8 value according to RFC 3629. |
+uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { |
+ size_t length = NonASCIISequenceLength(str[0]); |
+ if (length == 0 || max_length < length) { |
*cursor += 1; |
return kBadChar; |
} |
- if (first < 0xE0) { |
- if (first < 0xC0) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- uchar code_point = ((first << 6) | second) & kMaxTwoByteChar; |
- if (code_point <= kMaxOneByteChar) { |
+ if (length == 2) { |
+ if (!IsContinuationCharacter(str[1])) { |
*cursor += 1; |
return kBadChar; |
} |
*cursor += 2; |
- return code_point; |
+ return ((str[0] << 6) + str[1]) - 0x00003080; |
} |
- if (length == 2) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- byte third = str[2] ^ 0x80; |
- if (third & 0xC0) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- if (first < 0xF0) { |
- uchar code_point = ((((first << 6) | second) << 6) | third) |
- & kMaxThreeByteChar; |
- if (code_point <= kMaxTwoByteChar) { |
+ if (length == 3) { |
+ switch (str[0]) { |
+ case 0xE0: |
+ // Overlong three-byte sequence. |
+ if (str[1] < 0xA0 || str[1] > 0xBF) { |
+ *cursor += 1; |
+ return kBadChar; |
+ } |
+ break; |
+ case 0xED: |
+ // High and low surrogate halves. |
+ if (str[1] < 0x80 || str[1] > 0x9F) { |
+ *cursor += 1; |
+ return kBadChar; |
+ } |
+ break; |
+ default: |
+ if (!IsContinuationCharacter(str[1])) { |
+ *cursor += 1; |
+ return kBadChar; |
+ } |
+ } |
+ if (!IsContinuationCharacter(str[2])) { |
*cursor += 1; |
return kBadChar; |
} |
*cursor += 3; |
- return code_point; |
+ return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; |
} |
- if (length == 3) { |
+ DCHECK(length == 4); |
+ switch (str[0]) { |
+ case 0xF0: |
+ // Overlong four-byte sequence. |
+ if (str[1] < 0x90 || str[1] > 0xBF) { |
+ *cursor += 1; |
+ return kBadChar; |
+ } |
+ break; |
+ case 0xF4: |
+ // Code points outside of the unicode range. |
+ if (str[1] < 0x80 || str[1] > 0x8F) { |
+ *cursor += 1; |
+ return kBadChar; |
+ } |
+ break; |
+ default: |
+ if (!IsContinuationCharacter(str[1])) { |
+ *cursor += 1; |
+ return kBadChar; |
+ } |
+ } |
+ if (!IsContinuationCharacter(str[2])) { |
*cursor += 1; |
return kBadChar; |
} |
- byte fourth = str[3] ^ 0x80; |
- if (fourth & 0xC0) { |
+ if (!IsContinuationCharacter(str[3])) { |
*cursor += 1; |
return kBadChar; |
} |
- if (first < 0xF8) { |
- uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) |
- & kMaxFourByteChar; |
- if (code_point <= kMaxThreeByteChar) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- *cursor += 4; |
- return code_point; |
- } |
- *cursor += 1; |
- return kBadChar; |
+ *cursor += 4; |
+ return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
+ 0x03C82080; |
} |