Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(309)

Side by Side Diff: src/unicode.cc

Issue 1148653007: Update UTF-8 decoder to detect more special cases. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // This file was generated at 2014-10-08 15:25:47.940335 5 // This file was generated at 2014-10-08 15:25:47.940335
6 6
7 #include "src/unicode-inl.h" 7 #include "src/unicode-inl.h"
8 #include <stdio.h> 8 #include <stdio.h>
9 #include <stdlib.h> 9 #include <stdlib.h>
10 10
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after
183 return 0; 183 return 0;
184 } 184 }
185 return -1; 185 return -1;
186 } 186 }
187 } else { 187 } else {
188 return 0; 188 return 0;
189 } 189 }
190 } 190 }
191 191
192 192
193 uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { 193 static inline size_t NonASCIISequenceLength(byte first) {
194 // We only get called for non-ASCII characters. 194 static const uint8_t lengths[256] = {
195 if (length == 1) { 195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
201 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
202 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
203 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
204 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
205 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vogelheim 2015/05/21 16:58:38 The table is difficult to read. It also leaves me
jochen (gone - plz use gerrit) 2015/05/22 12:38:12 I updated the matrix to be 16 x 16, and added comm
206 return lengths[first];
207 }
208
209
210 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
vogelheim 2015/05/21 16:58:38 This might also benefit from a unit test that will
vogelheim 2015/05/21 16:58:38 I believe this deserves some commentary, and if on
jochen (gone - plz use gerrit) 2015/05/22 12:38:12 yeah, actually, it's supposed to be consistent wit
211 DCHECK((str[0] & 0x80) == 0x80);
212 size_t length = NonASCIISequenceLength(str[0]);
213 if (length == 0 || max_length < length) {
196 *cursor += 1; 214 *cursor += 1;
197 return kBadChar; 215 return kBadChar;
198 } 216 }
199 byte first = str[0]; 217 if (length == 2) {
vogelheim 2015/05/21 16:58:38 I was trying to figure out *why* these characters
vogelheim 2015/05/21 16:58:38 I find the code below to be somewhat confusing. If
jochen (gone - plz use gerrit) 2015/05/22 12:38:12 right. It's just that UTF-8 cannot encode all of u
200 byte second = str[1] ^ 0x80; 218 DCHECK(str[0] <= 0xDF);
201 if (second & 0xC0) { 219 if (str[0] < 0xC2) {
202 *cursor += 1;
203 return kBadChar;
204 }
205 if (first < 0xE0) {
206 if (first < 0xC0) {
207 *cursor += 1; 220 *cursor += 1;
208 return kBadChar; 221 return kBadChar;
209 } 222 }
210 uchar code_point = ((first << 6) | second) & kMaxTwoByteChar; 223 if (str[1] < 0x80 || str[1] > 0xBF) {
211 if (code_point <= kMaxOneByteChar) {
212 *cursor += 1; 224 *cursor += 1;
213 return kBadChar; 225 return kBadChar;
214 } 226 }
215 *cursor += 2; 227 *cursor += 2;
216 return code_point; 228 return ((str[0] << 6) + str[1]) - 0x00003080;
217 } 229 }
218 if (length == 2) { 230 if (length == 3) {
219 *cursor += 1; 231 DCHECK(str[0] >= 0xE0 && str[0] <= 0xEF);
220 return kBadChar; 232 switch (str[0]) {
221 } 233 case 0xE0:
222 byte third = str[2] ^ 0x80; 234 if (str[1] < 0xA0 || str[1] > 0xBF) {
223 if (third & 0xC0) { 235 *cursor += 1;
224 *cursor += 1; 236 return kBadChar;
225 return kBadChar; 237 }
226 } 238 break;
227 if (first < 0xF0) { 239 case 0xED:
228 uchar code_point = ((((first << 6) | second) << 6) | third) 240 if (str[1] < 0x80 || str[1] > 0x9F) {
229 & kMaxThreeByteChar; 241 *cursor += 1;
230 if (code_point <= kMaxTwoByteChar) { 242 return kBadChar;
243 }
244 break;
245 default:
246 if (str[1] < 0x80 || str[1] > 0xBF) {
247 *cursor += 1;
248 return kBadChar;
249 }
250 }
251 if (str[2] < 0x80 || str[2] > 0xBF) {
231 *cursor += 1; 252 *cursor += 1;
232 return kBadChar; 253 return kBadChar;
233 } 254 }
234 *cursor += 3; 255 *cursor += 3;
235 return code_point; 256 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
236 } 257 }
237 if (length == 3) { 258 DCHECK(length == 4);
259 DCHECK(str[0] >= 0xF0 && str[0] <= 0xF4);
260 switch (str[0]) {
261 case 0xF0:
262 if (str[1] < 0x90 || str[1] > 0xBF) {
263 *cursor += 1;
264 return kBadChar;
265 }
266 break;
267 case 0xF4:
268 if (str[1] < 0x80 || str[1] > 0x8F) {
269 *cursor += 1;
270 return kBadChar;
271 }
272 break;
273 default:
274 if (str[1] < 0x80 || str[1] > 0xBF) {
275 *cursor += 1;
276 return kBadChar;
277 }
278 }
279 if (str[2] < 0x80 || str[2] > 0xBF) {
238 *cursor += 1; 280 *cursor += 1;
239 return kBadChar; 281 return kBadChar;
240 } 282 }
241 byte fourth = str[3] ^ 0x80; 283 if (str[3] < 0x80 || str[3] > 0xBF) {
242 if (fourth & 0xC0) {
243 *cursor += 1; 284 *cursor += 1;
244 return kBadChar; 285 return kBadChar;
245 } 286 }
246 if (first < 0xF8) { 287 *cursor += 4;
247 uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) 288 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
248 & kMaxFourByteChar; 289 0x03C82080;
249 if (code_point <= kMaxThreeByteChar) {
250 *cursor += 1;
251 return kBadChar;
252 }
253 *cursor += 4;
254 return code_point;
255 }
256 *cursor += 1;
257 return kBadChar;
258 } 290 }
259 291
260 292
261 // Uppercase: point.category == 'Lu' 293 // Uppercase: point.category == 'Lu'
262 294
263 static const uint16_t kUppercaseTable0Size = 455; 295 static const uint16_t kUppercaseTable0Size = 455;
264 static const int32_t kUppercaseTable0[455] = { 296 static const int32_t kUppercaseTable0[455] = {
265 1073741889, 90, 1073742016, 214, 297 1073741889, 90, 1073742016, 214,
266 1073742040, 222, 256, 258, // NOLINT 298 1073742040, 222, 256, 258, // NOLINT
267 260, 262, 264, 266, 299 260, 262, 264, 266,
(...skipping 3117 matching lines...) Expand 10 before | Expand all | Expand 10 after
3385 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3417 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3386 + 3418 +
3387 kCanonicalizationRangeMultiStrings1Size * 3419 kCanonicalizationRangeMultiStrings1Size *
3388 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3420 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3389 + 3421 +
3390 kCanonicalizationRangeMultiStrings7Size * 3422 kCanonicalizationRangeMultiStrings7Size *
3391 sizeof(MultiCharacterSpecialCase<1>); // NOLINT 3423 sizeof(MultiCharacterSpecialCase<1>); // NOLINT
3392 } 3424 }
3393 3425
3394 } // namespace unibrow 3426 } // namespace unibrow
OLD | NEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698