Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(561)

Side by Side Diff: src/unicode.cc

Issue 2493143003: Return kBadChar for longest subpart of incomplete utf-8 character. (Closed)
Patch Set: Fix end of buffer handling. Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | test/cctest/test-parsing.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLD | NEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // This file was generated at 2014-10-08 15:25:47.940335 5 // This file was generated at 2014-10-08 15:25:47.940335
6 6
7 #include "src/unicode.h" 7 #include "src/unicode.h"
8 #include "src/unicode-inl.h" 8 #include "src/unicode-inl.h"
9 #include <stdio.h> 9 #include <stdio.h>
10 #include <stdlib.h> 10 #include <stdlib.h>
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after
221 221
222 222
223 static inline bool IsContinuationCharacter(byte chr) { 223 static inline bool IsContinuationCharacter(byte chr) {
224 return chr >= 0x80 && chr <= 0xBF; 224 return chr >= 0x80 && chr <= 0xBF;
225 } 225 }
226 226
227 227
228 // This method decodes an UTF-8 value according to RFC 3629. 228 // This method decodes an UTF-8 value according to RFC 3629.
229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { 229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
230 size_t length = NonASCIISequenceLength(str[0]); 230 size_t length = NonASCIISequenceLength(str[0]);
231 if (length == 0 || max_length < length) { 231
232 *cursor += 1; 232 // Check continuation characters.
233 size_t max_count = std::min(length, max_length);
234 size_t count = 1;
235 while (count < max_count && IsContinuationCharacter(str[count])) {
236 count++;
237 }
238
239 // Check overly long sequences & other conditions. Use length as error
240 // indicator.
241 if (length == 3) {
242 if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
243 // Overlong three-byte sequence?
244 length = 0;
245 } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
246 // High and low surrogate halves?
247 length = 0;
248 }
249 } else if (length == 4) {
250 if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
251 // Overlong four-byte sequence.
252 length = 0;
253 } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
254 // Code points outside of the unicode range.
255 length = 0;
256 }
257 }
258
259 if (count != length) {
260 // All invalid encodings should land here.
261 *cursor += count;
233 return kBadChar; 262 return kBadChar;
234 } 263 }
235 if (length == 2) { 264
236 if (!IsContinuationCharacter(str[1])) { 265 // All errors have been handled, so we only have to assemble the result.
237 *cursor += 1; 266 *cursor += length;
238 return kBadChar; 267 switch (length) {
239 } 268 case 1:
240 *cursor += 2; 269 return str[0];
241 return ((str[0] << 6) + str[1]) - 0x00003080; 270 case 2:
271 return ((str[0] << 6) + str[1]) - 0x00003080;
272 case 3:
273 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
274 case 4:
275 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
276 0x03C82080;
242 } 277 }
243 if (length == 3) { 278
244 switch (str[0]) { 279 UNREACHABLE();
245 case 0xE0: 280 return kBadChar;
246 // Overlong three-byte sequence.
247 if (str[1] < 0xA0 || str[1] > 0xBF) {
248 *cursor += 1;
249 return kBadChar;
250 }
251 break;
252 case 0xED:
253 // High and low surrogate halves.
254 if (str[1] < 0x80 || str[1] > 0x9F) {
255 *cursor += 1;
256 return kBadChar;
257 }
258 break;
259 default:
260 if (!IsContinuationCharacter(str[1])) {
261 *cursor += 1;
262 return kBadChar;
263 }
264 }
265 if (!IsContinuationCharacter(str[2])) {
266 *cursor += 1;
267 return kBadChar;
268 }
269 *cursor += 3;
270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
271 }
272 DCHECK(length == 4);
273 switch (str[0]) {
274 case 0xF0:
275 // Overlong four-byte sequence.
276 if (str[1] < 0x90 || str[1] > 0xBF) {
277 *cursor += 1;
278 return kBadChar;
279 }
280 break;
281 case 0xF4:
282 // Code points outside of the unicode range.
283 if (str[1] < 0x80 || str[1] > 0x8F) {
284 *cursor += 1;
285 return kBadChar;
286 }
287 break;
288 default:
289 if (!IsContinuationCharacter(str[1])) {
290 *cursor += 1;
291 return kBadChar;
292 }
293 }
294 if (!IsContinuationCharacter(str[2])) {
295 *cursor += 1;
296 return kBadChar;
297 }
298 if (!IsContinuationCharacter(str[3])) {
299 *cursor += 1;
300 return kBadChar;
301 }
302 *cursor += 4;
303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
304 0x03C82080;
305 } 281 }
306 282
307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { 283 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
308 DCHECK_NOT_NULL(buffer); 284 DCHECK_NOT_NULL(buffer);
309 285
310 // The common case: 1-byte Utf8 (and no incomplete char in the buffer) 286 // The common case: 1-byte Utf8 (and no incomplete char in the buffer)
311 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) { 287 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {
312 return static_cast<uchar>(next); 288 return static_cast<uchar>(next);
313 } 289 }
314 290
315 if (*buffer == 0) { 291 if (*buffer == 0) {
316 // We're at the start of a new character. 292 // We're at the start of a new character.
317 uint32_t kind = NonASCIISequenceLength(next); 293 uint32_t kind = NonASCIISequenceLength(next);
318 if (kind >= 2 && kind <= 4) { 294 if (kind >= 2 && kind <= 4) {
319 // Start of 2..4 byte character, and no buffer. 295 // Start of 2..4 byte character, and no buffer.
320 296
321 // The mask for the lower bits depends on the kind, and is 297 // The mask for the lower bits depends on the kind, and is
322 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that 298 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that
323 // with one shift. 299 // with one shift.
324 uint8_t mask = 0x7f >> kind; 300 uint8_t mask = 0x7f >> kind;
325 301
326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value 302 // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
327 // in the bottom three. 303 // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
328 *buffer = (kind - 1) << 24 | (next & mask); 304 // intended as a counter about how many bytes are still needed.
305 *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
329 return kIncomplete; 306 return kIncomplete;
330 } else { 307 } else {
331 // No buffer, and not the start of a 1-byte char (handled at the 308 // No buffer, and not the start of a 1-byte char (handled at the
332 // beginning), and not the start of a 2..4 byte char? Bad char. 309 // beginning), and not the start of a 2..4 byte char? Bad char.
333 *buffer = 0; 310 *buffer = 0;
334 return kBadChar; 311 return kBadChar;
335 } 312 }
336 } else if (*buffer <= 0xff) { 313 } else if (*buffer <= 0xff) {
337 // We have one unprocessed byte left (from the last else case in this if 314 // We have one unprocessed byte left (from the last else case in this if
338 // statement). 315 // statement).
339 uchar previous = *buffer; 316 uchar previous = *buffer;
340 *buffer = 0; 317 *buffer = 0;
341 uchar t = ValueOfIncremental(previous, buffer); 318 uchar t = ValueOfIncremental(previous, buffer);
342 if (t == kIncomplete) { 319 if (t == kIncomplete) {
343 // If we have an incomplete character, process both the previous and the 320 // If we have an incomplete character, process both the previous and the
344 // next byte at once. 321 // next byte at once.
345 return ValueOfIncremental(next, buffer); 322 return ValueOfIncremental(next, buffer);
346 } else { 323 } else {
347 // Otherwise, process the previous byte and save the next byte for next 324 // Otherwise, process the previous byte and save the next byte for next
348 // time. 325 // time.
349 DCHECK_EQ(0, *buffer); 326 DCHECK_EQ(0, *buffer);
350 *buffer = next; 327 *buffer = next;
351 return t; 328 return t;
352 } 329 }
353 } else if (IsContinuationCharacter(next)) { 330 } else if (IsContinuationCharacter(next)) {
354 // We're inside of a character, as described by buffer. 331 // We're inside of a character, as described by buffer.
355 332
356 // How many bytes (excluding this one) do we still expect? 333 // How many bytes (excluding this one) do we still expect?
357 uint8_t count = (*buffer >> 24) - 1; 334 uint8_t bytes_expected = *buffer >> 28;
335 uint8_t bytes_left = (*buffer >> 24) & 0x0f;
336 bytes_left--;
358 // Update the value. 337 // Update the value.
359 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F); 338 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
360 if (count) { 339 if (bytes_left) {
361 *buffer = count << 24 | value; 340 *buffer = (bytes_expected << 28 | bytes_left << 24 | value);
362 return kIncomplete; 341 return kIncomplete;
363 } else { 342 } else {
364 *buffer = 0; 343 *buffer = 0;
365 return value; 344 bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
345 (bytes_expected == 3 && value < 0x800);
346 return sequence_was_too_long ? kBadChar : value;
366 } 347 }
367 } else { 348 } else {
368 // Within a character, but not a continuation character? Then the 349 // Within a character, but not a continuation character? Then the
369 // previous char was a bad char. But we need to save the current 350 // previous char was a bad char. But we need to save the current
370 // one. 351 // one.
371 *buffer = next; 352 *buffer = next;
372 return kBadChar; 353 return kBadChar;
373 } 354 }
374 } 355 }
375 356
(...skipping 3150 matching lines...) Expand 10 before | Expand all | Expand 10 after
3526 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3507 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3527 + 3508 +
3528 kCanonicalizationRangeMultiStrings1Size * 3509 kCanonicalizationRangeMultiStrings1Size *
3529 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3510 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3530 + 3511 +
3531 kCanonicalizationRangeMultiStrings7Size * 3512 kCanonicalizationRangeMultiStrings7Size *
3532 sizeof(MultiCharacterSpecialCase<1>); // NOLINT 3513 sizeof(MultiCharacterSpecialCase<1>); // NOLINT
3533 } 3514 }
3534 3515
3535 } // namespace unibrow 3516 } // namespace unibrow
OLD | NEW
« no previous file with comments | « no previous file | test/cctest/test-parsing.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698