| Index: src/unicode.cc
|
| diff --git a/src/unicode.cc b/src/unicode.cc
|
| index fa4afc59965d68d3fc5cac56655ce0051e0aa0c3..73ad3e423b0450e1cd44872700b8bbf6f354e219 100644
|
| --- a/src/unicode.cc
|
| +++ b/src/unicode.cc
|
| @@ -228,80 +228,56 @@ static inline bool IsContinuationCharacter(byte chr) {
|
| // This method decodes an UTF-8 value according to RFC 3629.
|
| uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
|
| size_t length = NonASCIISequenceLength(str[0]);
|
| - if (length == 0 || max_length < length) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| - }
|
| - if (length == 2) {
|
| - if (!IsContinuationCharacter(str[1])) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| - }
|
| - *cursor += 2;
|
| - return ((str[0] << 6) + str[1]) - 0x00003080;
|
| +
|
| + // Check continuation characters.
|
| + size_t max_count = std::min(length, max_length);
|
| + size_t count = 1;
|
| + while (count < max_count && IsContinuationCharacter(str[count])) {
|
| + count++;
|
| }
|
| +
|
| + // Check overly long sequences & other conditions. Use length as error
|
| + // indicator.
|
| if (length == 3) {
|
| - switch (str[0]) {
|
| - case 0xE0:
|
| - // Overlong three-byte sequence.
|
| - if (str[1] < 0xA0 || str[1] > 0xBF) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| - }
|
| - break;
|
| - case 0xED:
|
| - // High and low surrogate halves.
|
| - if (str[1] < 0x80 || str[1] > 0x9F) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| - }
|
| - break;
|
| - default:
|
| - if (!IsContinuationCharacter(str[1])) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| - }
|
| - }
|
| - if (!IsContinuationCharacter(str[2])) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| + if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
|
| + // Overlong three-byte sequence?
|
| + length = 0;
|
| + } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
|
| + // High and low surrogate halves?
|
| + length = 0;
|
| }
|
| - *cursor += 3;
|
| - return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
|
| - }
|
| - DCHECK(length == 4);
|
| - switch (str[0]) {
|
| - case 0xF0:
|
| + } else if (length == 4) {
|
| + if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
|
| // Overlong four-byte sequence.
|
| - if (str[1] < 0x90 || str[1] > 0xBF) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| - }
|
| - break;
|
| - case 0xF4:
|
| + length = 0;
|
| + } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
|
| // Code points outside of the unicode range.
|
| - if (str[1] < 0x80 || str[1] > 0x8F) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| - }
|
| - break;
|
| - default:
|
| - if (!IsContinuationCharacter(str[1])) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| - }
|
| + length = 0;
|
| + }
|
| }
|
| - if (!IsContinuationCharacter(str[2])) {
|
| - *cursor += 1;
|
| +
|
| + if (count != length) {
|
| + // All invalid encodings should land here.
|
| + *cursor += count;
|
| return kBadChar;
|
| }
|
| - if (!IsContinuationCharacter(str[3])) {
|
| - *cursor += 1;
|
| - return kBadChar;
|
| +
|
| + // All errors have been handled, so we only have to assemble the result.
|
| + *cursor += length;
|
| + switch (length) {
|
| + case 1:
|
| + return str[0];
|
| + case 2:
|
| + return ((str[0] << 6) + str[1]) - 0x00003080;
|
| + case 3:
|
| + return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
|
| + case 4:
|
| + return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
|
| + 0x03C82080;
|
| }
|
| - *cursor += 4;
|
| - return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
|
| - 0x03C82080;
|
| +
|
| + UNREACHABLE();
|
| + return kBadChar;
|
| }
|
|
|
| uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
|
| @@ -323,9 +299,10 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
|
| // with one shift.
|
| uint8_t mask = 0x7f >> kind;
|
|
|
| - // Store the kind - 1 (i.e., remaining bytes) in the top byte, value
|
| - // in the bottom three.
|
| - *buffer = (kind - 1) << 24 | (next & mask);
|
| + // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
|
| + // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
|
| + // intended as a counter about how many bytes are still needed.
|
| + *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
|
| return kIncomplete;
|
| } else {
|
| // No buffer, and not the start of a 1-byte char (handled at the
|
| @@ -354,15 +331,19 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
|
| // We're inside of a character, as described by buffer.
|
|
|
| // How many bytes (excluding this one) do we still expect?
|
| - uint8_t count = (*buffer >> 24) - 1;
|
| + uint8_t bytes_expected = *buffer >> 28;
|
| + uint8_t bytes_left = (*buffer >> 24) & 0x0f;
|
| + bytes_left--;
|
| // Update the value.
|
| uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
|
| - if (count) {
|
| - *buffer = count << 24 | value;
|
| + if (bytes_left) {
|
| + *buffer = (bytes_expected << 28 | bytes_left << 24 | value);
|
| return kIncomplete;
|
| } else {
|
| *buffer = 0;
|
| - return value;
|
| + bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
|
| + (bytes_expected == 3 && value < 0x800);
|
| + return sequence_was_too_long ? kBadChar : value;
|
| }
|
| } else {
|
| // Within a character, but not a continuation character? Then the
|
|
|