src/unicode.cc - Issue 1148653007: Update UTF-8 decoder to detect more special cases.

Unified Diff: src/unicode.cc

Issue 1148653007: Update UTF-8 decoder to detect more special cases. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: updates Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/unicode.cc

diff --git a/src/unicode.cc b/src/unicode.cc

index 0d0d63d1775660018b57e4e3e107110371a55661..67829cb71dc13ebc4865641dd04347fee0846ff9 100644

--- a/src/unicode.cc

+++ b/src/unicode.cc

@@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table,

}

-uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) {

- // We only get called for non-ASCII characters.

- if (length == 1) {

- *cursor += 1;

- return kBadChar;

- }

- byte first = str[0];

- byte second = str[1] ^ 0x80;

- if (second & 0xC0) {

+static inline size_t NonASCIISequenceLength(byte first) {

+ // clang-format off

+ static const uint8_t lengths[256] = {

+ // The first 128 entries correspond to ASCII characters.

+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+ // The following 64 entries correspond to continuation bytes.

+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+ // The next 32 entries are two invalid overlong encodings followed by 30 two-byte sequences.

+ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

+ // 16 three-byte sequences.

+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

+ // 5 four-byte sequences, followed by sequences that could only encode

+ // code points outside of the unicode range.

+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

+ // clang-format on

+ return lengths[first];

+static inline bool IsContinuationCharacter(byte chr) {

+ return chr >= 0x80 && chr <= 0xBF;

+// This method decodes a UTF-8 value according to RFC 3629.

+uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {

vogelheim 2015/05/22 15:53:38 Not sure if worth the effort, but this might be mo

jochen (gone - plz use gerrit) 2015/05/22 18:12:39 then I pick this version :)

+ size_t length = NonASCIISequenceLength(str[0]);

+ if (length == 0 || max_length < length) {

*cursor += 1;

return kBadChar;

}

- if (first < 0xE0) {

- if (first < 0xC0) {

- *cursor += 1;

- return kBadChar;

- }

- uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;

- if (code_point <= kMaxOneByteChar) {

+ if (length == 2) {

+ if (!IsContinuationCharacter(str[1])) {

*cursor += 1;

return kBadChar;

}

*cursor += 2;

- return code_point;

+ return ((str[0] << 6) + str[1]) - 0x00003080;

}

- if (length == 2) {

- *cursor += 1;

- return kBadChar;

- }

- byte third = str[2] ^ 0x80;

- if (third & 0xC0) {

- *cursor += 1;

- return kBadChar;

- }

- if (first < 0xF0) {

- uchar code_point = ((((first << 6) | second) << 6) | third)

- & kMaxThreeByteChar;

- if (code_point <= kMaxTwoByteChar) {

+ if (length == 3) {

+ switch (str[0]) {

+ case 0xE0:

+ // Overlong three-byte sequence.

+ if (str[1] < 0xA0 || str[1] > 0xBF) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ break;

+ case 0xED:

+ // High and low surrogate halves.

+ if (str[1] < 0x80 || str[1] > 0x9F) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ break;

+ default:

+ if (!IsContinuationCharacter(str[1])) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ if (!IsContinuationCharacter(str[2])) {

*cursor += 1;

return kBadChar;

}

*cursor += 3;

- return code_point;

+ return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;

}

- if (length == 3) {

+ DCHECK(length == 4);

+ switch (str[0]) {

+ case 0xF0:

+ // Overlong four-byte sequence.

+ if (str[1] < 0x90 || str[1] > 0xBF) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ break;

+ case 0xF4:

+ // Code poits outside of the unicode range.

vogelheim 2015/05/22 15:53:38 poits -> points

+ if (str[1] < 0x80 || str[1] > 0x8F) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ break;

+ default:

+ if (!IsContinuationCharacter(str[1])) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ if (!IsContinuationCharacter(str[2])) {

*cursor += 1;

return kBadChar;

}

- byte fourth = str[3] ^ 0x80;

- if (fourth & 0xC0) {

+ if (!IsContinuationCharacter(str[3])) {

*cursor += 1;

return kBadChar;

}

- if (first < 0xF8) {

- uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth)

- & kMaxFourByteChar;

- if (code_point <= kMaxThreeByteChar) {

- *cursor += 1;

- return kBadChar;

- }

- *cursor += 4;

- return code_point;

- }

- *cursor += 1;

- return kBadChar;

+ *cursor += 4;

+ return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

+ 0x03C82080;

}

« no previous file with comments | « no previous file | test/cctest/test-api.cc » ('j') | test/cctest/test-api.cc » ('J')