Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(248)

Unified Diff: src/unicode.cc

Issue 1148653007: Update UTF-8 decoder to detect more special cases. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/unicode.cc
diff --git a/src/unicode.cc b/src/unicode.cc
index 0d0d63d1775660018b57e4e3e107110371a55661..519750505df8563d8599f8e2ef3a9d3face36c3a 100644
--- a/src/unicode.cc
+++ b/src/unicode.cc
@@ -190,71 +190,103 @@ static int LookupMapping(const int32_t* table,
}
-uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) {
- // We only get called for non-ASCII characters.
- if (length == 1) {
- *cursor += 1;
- return kBadChar;
- }
- byte first = str[0];
- byte second = str[1] ^ 0x80;
- if (second & 0xC0) {
+static inline size_t NonASCIISequenceLength(byte first) {
+ static const uint8_t lengths[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vogelheim (2015/05/21 16:58:38): The table is difficult to read. It also leaves me […]
jochen (gone - plz use gerrit) (2015/05/22 12:38:12): I updated the matrix to be 16 x 16, and added comm[…]
+ return lengths[first];
+}
+
+
+uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
vogelheim (2015/05/21 16:58:38): This might also benefit from a unit test that will […]
vogelheim (2015/05/21 16:58:38): I believe this deserves some commentary, and if on[…]
jochen (gone - plz use gerrit) (2015/05/22 12:38:12): Yeah, actually, it's supposed to be consistent wit[…]
+ DCHECK((str[0] & 0x80) == 0x80);
+ size_t length = NonASCIISequenceLength(str[0]);
+ if (length == 0 || max_length < length) {
*cursor += 1;
return kBadChar;
}
- if (first < 0xE0) {
- if (first < 0xC0) {
+ if (length == 2) {
vogelheim (2015/05/21 16:58:38): I was trying to figure out *why* these characters […]
vogelheim (2015/05/21 16:58:38): I find the code below to be somewhat confusing. If […]
jochen (gone - plz use gerrit) (2015/05/22 12:38:12): Right. It's just that UTF-8 cannot encode all of u[…]
+ DCHECK(str[0] <= 0xDF);
+ if (str[0] < 0xC2) {
*cursor += 1;
return kBadChar;
}
- uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;
- if (code_point <= kMaxOneByteChar) {
+ if (str[1] < 0x80 || str[1] > 0xBF) {
*cursor += 1;
return kBadChar;
}
*cursor += 2;
- return code_point;
- }
- if (length == 2) {
- *cursor += 1;
- return kBadChar;
- }
- byte third = str[2] ^ 0x80;
- if (third & 0xC0) {
- *cursor += 1;
- return kBadChar;
+ return ((str[0] << 6) + str[1]) - 0x00003080;
}
- if (first < 0xF0) {
- uchar code_point = ((((first << 6) | second) << 6) | third)
- & kMaxThreeByteChar;
- if (code_point <= kMaxTwoByteChar) {
+ if (length == 3) {
+ DCHECK(str[0] >= 0xE0 && str[0] <= 0xEF);
+ switch (str[0]) {
+ case 0xE0:
+ if (str[1] < 0xA0 || str[1] > 0xBF) {
+ *cursor += 1;
+ return kBadChar;
+ }
+ break;
+ case 0xED:
+ if (str[1] < 0x80 || str[1] > 0x9F) {
+ *cursor += 1;
+ return kBadChar;
+ }
+ break;
+ default:
+ if (str[1] < 0x80 || str[1] > 0xBF) {
+ *cursor += 1;
+ return kBadChar;
+ }
+ }
+ if (str[2] < 0x80 || str[2] > 0xBF) {
*cursor += 1;
return kBadChar;
}
*cursor += 3;
- return code_point;
+ return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
}
- if (length == 3) {
+ DCHECK(length == 4);
+ DCHECK(str[0] >= 0xF0 && str[0] <= 0xF4);
+ switch (str[0]) {
+ case 0xF0:
+ if (str[1] < 0x90 || str[1] > 0xBF) {
+ *cursor += 1;
+ return kBadChar;
+ }
+ break;
+ case 0xF4:
+ if (str[1] < 0x80 || str[1] > 0x8F) {
+ *cursor += 1;
+ return kBadChar;
+ }
+ break;
+ default:
+ if (str[1] < 0x80 || str[1] > 0xBF) {
+ *cursor += 1;
+ return kBadChar;
+ }
+ }
+ if (str[2] < 0x80 || str[2] > 0xBF) {
*cursor += 1;
return kBadChar;
}
- byte fourth = str[3] ^ 0x80;
- if (fourth & 0xC0) {
+ if (str[3] < 0x80 || str[3] > 0xBF) {
*cursor += 1;
return kBadChar;
}
- if (first < 0xF8) {
- uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth)
- & kMaxFourByteChar;
- if (code_point <= kMaxThreeByteChar) {
- *cursor += 1;
- return kBadChar;
- }
- *cursor += 4;
- return code_point;
- }
- *cursor += 1;
- return kBadChar;
+ *cursor += 4;
+ return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
+ 0x03C82080;
}
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698