Index: src/unicode.cc |
diff --git a/src/unicode.cc b/src/unicode.cc |
index fa4afc59965d68d3fc5cac56655ce0051e0aa0c3..73ad3e423b0450e1cd44872700b8bbf6f354e219 100644 |
--- a/src/unicode.cc |
+++ b/src/unicode.cc |
@@ -228,80 +228,56 @@ static inline bool IsContinuationCharacter(byte chr) { |
// This method decodes an UTF-8 value according to RFC 3629. |
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { |
size_t length = NonASCIISequenceLength(str[0]); |
- if (length == 0 || max_length < length) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- if (length == 2) { |
- if (!IsContinuationCharacter(str[1])) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- *cursor += 2; |
- return ((str[0] << 6) + str[1]) - 0x00003080; |
+ |
+ // Count the lead byte plus the continuation bytes that follow it. |
+ size_t max_count = std::min(length, max_length); |
+ size_t count = 1; |
+ while (count < max_count && IsContinuationCharacter(str[count])) { |
+ count++; |
}
+ |
+ // Reject overlong forms, surrogates and out-of-range values by zeroing |
+ // length. str[1] is read only when count > 1, i.e. within max_length. |
if (length == 3) { |
- switch (str[0]) { |
- case 0xE0: |
- // Overlong three-byte sequence. |
- if (str[1] < 0xA0 || str[1] > 0xBF) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- break; |
- case 0xED: |
- // High and low surrogate halves. |
- if (str[1] < 0x80 || str[1] > 0x9F) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- break; |
- default: |
- if (!IsContinuationCharacter(str[1])) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- } |
- if (!IsContinuationCharacter(str[2])) { |
- *cursor += 1; |
- return kBadChar; |
+ if (count > 1 && str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) { |
+ // Overlong three-byte sequence. |
+ length = 0; |
+ } else if (count > 1 && str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) { |
+ // High and low surrogate halves. |
+ length = 0; |
}
- *cursor += 3; |
- return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; |
- } |
- DCHECK(length == 4); |
- switch (str[0]) { |
- case 0xF0: |
+ } else if (length == 4) { |
+ if (count > 1 && str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) { |
// Overlong four-byte sequence. |
- if (str[1] < 0x90 || str[1] > 0xBF) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- break; |
- case 0xF4: |
+ length = 0; |
+ } else if (count > 1 && str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) { |
// Code points outside of the unicode range. |
- if (str[1] < 0x80 || str[1] > 0x8F) { |
- *cursor += 1; |
- return kBadChar; |
- } |
- break; |
- default: |
- if (!IsContinuationCharacter(str[1])) { |
- *cursor += 1; |
- return kBadChar; |
- } |
+ length = 0; |
+ } |
}
- if (!IsContinuationCharacter(str[2])) { |
- *cursor += 1; |
+ |
+ if (count != length) { |
+ // Every invalid encoding lands here; skip the count bytes examined. |
+ *cursor += count; |
return kBadChar; |
}
- if (!IsContinuationCharacter(str[3])) { |
- *cursor += 1; |
- return kBadChar; |
+ |
+ // All errors have been handled, so we only have to assemble the result. |
+ *cursor += length; |
+ switch (length) { |
+ case 1: |
+ return str[0]; |
+ case 2: |
+ return ((str[0] << 6) + str[1]) - 0x00003080; |
+ case 3: |
+ return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; |
+ case 4: |
+ return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
+ 0x03C82080; |
}
- *cursor += 4; |
- return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
- 0x03C82080; |
+ |
+ UNREACHABLE(); |
+ return kBadChar; |
}
uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { |
@@ -323,9 +299,10 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { |
// with one shift. |
uint8_t mask = 0x7f >> kind; |
- // Store the kind - 1 (i.e., remaining bytes) in the top byte, value |
- // in the bottom three. |
- *buffer = (kind - 1) << 24 | (next & mask); |
+ // Store the kind in the top nibble, kind - 1 (i.e., remaining bytes) in |
+ // the 2nd nibble, and the value in the bottom three bytes. The 2nd nibble |
+ // serves as a counter of how many bytes are still needed. |
+ *buffer = kind << 28 | (kind - 1) << 24 | (next & mask); |
return kIncomplete; |
} else { |
// No buffer, and not the start of a 1-byte char (handled at the |
@@ -354,15 +331,19 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { |
// We're inside of a character, as described by buffer. |
// How many bytes (excluding this one) do we still expect? |
- uint8_t count = (*buffer >> 24) - 1; |
+ uint8_t bytes_expected = *buffer >> 28; |
+ uint8_t bytes_left = (*buffer >> 24) & 0x0f; |
+ bytes_left--; |
// Update the value. |
uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F); |
- if (count) { |
- *buffer = count << 24 | value; |
+ if (bytes_left) { |
+ *buffer = (bytes_expected << 28 | bytes_left << 24 | value); |
return kIncomplete; |
} else { |
*buffer = 0; |
- return value; |
+ bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) || |
+ (bytes_expected == 3 && value < 0x800); |
+ return sequence_was_too_long ? kBadChar : value; |
} |
} else { |
// Within a character, but not a continuation character? Then the |