Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(91)

Side by Side Diff: src/scanner-character-streams.cc

Issue 708823002: Streaming API: detect UTF-8 BOM. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: . Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/scanner-character-streams.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/v8.h" 5 #include "src/v8.h"
6 6
7 #include "src/scanner-character-streams.h" 7 #include "src/scanner-character-streams.h"
8 8
9 #include "include/v8.h" 9 #include "include/v8.h"
10 #include "src/handles.h" 10 #include "src/handles.h"
(...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after
225 } 225 }
226 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, 226 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,
227 raw_data_length_); 227 raw_data_length_);
228 raw_character_position_ = char_position + i; 228 raw_character_position_ = char_position + i;
229 return i; 229 return i;
230 } 230 }
231 231
232 232
233 static const byte kUtf8MultiByteMask = 0xC0; 233 static const byte kUtf8MultiByteMask = 0xC0;
234 static const byte kUtf8MultiByteCharFollower = 0x80; 234 static const byte kUtf8MultiByteCharFollower = 0x80;
235 static const byte kUtf8MultiByteCharStart = 0xC0;
235 236
236 237
237 #ifdef DEBUG
238 static const byte kUtf8MultiByteCharStart = 0xC0;
239 static bool IsUtf8MultiCharacterStart(byte first_byte) { 238 static bool IsUtf8MultiCharacterStart(byte first_byte) {
240 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; 239 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
241 } 240 }
242 #endif
243 241
244 242
245 static bool IsUtf8MultiCharacterFollower(byte later_byte) { 243 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
246 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; 244 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
247 } 245 }
248 246
249 247
250 // Move the cursor back to point at the preceding UTF-8 character start 248 // Move the cursor back to point at the preceding UTF-8 character start
251 // in the buffer. 249 // in the buffer.
252 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) { 250 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
334 // here) and the internal parts which use unsigned. TODO(marja): make the 332 // here) and the internal parts which use unsigned. TODO(marja): make the
335 // internal parts use size_t too. 333 // internal parts use size_t too.
336 current_data_length_ = 334 current_data_length_ =
337 static_cast<unsigned>(source_stream_->GetMoreData(&current_data_)); 335 static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));
338 current_data_offset_ = 0; 336 current_data_offset_ = 0;
339 bool data_ends = current_data_length_ == 0; 337 bool data_ends = current_data_length_ == 0;
340 338
341 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 339 // A caveat: a data chunk might end with bytes from an incomplete UTF-8
342 // character (the rest of the bytes will be in the next chunk). 340 // character (the rest of the bytes will be in the next chunk).
343 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { 341 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
342 if (first_chunk_) {
343 // Get rid of the byte order mark (if any).
344 if (current_data_length_ >= 3 && current_data_[0] == 0xef &&
345 current_data_[1] == 0xbb && current_data_[2] == 0xbf) {
346 current_data_offset_ = 3;
347 }
348 }
349
344 HandleUtf8SplitCharacters(&data_in_buffer); 350 HandleUtf8SplitCharacters(&data_in_buffer);
345 if (!data_ends && current_data_offset_ == current_data_length_) { 351 if (!data_ends && current_data_offset_ == current_data_length_) {
346 // The data stream didn't end, but we used all the data in the 352 // The data stream didn't end, but we used all the data in the
347 // chunk. This will only happen when the chunk was really small. We 353 // chunk. This will only happen when the chunk was really small. We
348 // don't handle the case where a UTF-8 character is split over several 354 // don't handle the case where a UTF-8 character is split over several
349 // chunks; in that case V8 won't crash, but it will be a parse error. 355 // chunks; in that case V8 won't crash, but it will be a parse error.
350 delete[] current_data_; 356 delete[] current_data_;
351 current_data_ = NULL; 357 current_data_ = NULL;
352 current_data_length_ = 0; 358 current_data_length_ = 0;
353 current_data_offset_ = 0; 359 current_data_offset_ = 0;
354 continue; // Request a new chunk. 360 continue; // Request a new chunk.
355 } 361 }
356 } 362 }
357 363
358 // Did the data stream end? 364 // Did the data stream end?
359 if (data_ends) { 365 if (data_ends) {
360 DCHECK(utf8_split_char_buffer_length_ == 0); 366 DCHECK(utf8_split_char_buffer_length_ == 0);
361 return data_in_buffer; 367 return data_in_buffer;
362 } 368 }
369
370 first_chunk_ = false;
363 } 371 }
364 372
365 // Fill the buffer from current_data_. 373 // Fill the buffer from current_data_.
366 unsigned new_offset = 0; 374 unsigned new_offset = 0;
367 unsigned new_chars_in_buffer = 375 unsigned new_chars_in_buffer =
368 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, 376 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,
369 current_data_ + current_data_offset_, &new_offset, 377 current_data_ + current_data_offset_, &new_offset,
370 current_data_length_ - current_data_offset_, encoding_); 378 current_data_length_ - current_data_offset_, encoding_);
371 data_in_buffer += new_chars_in_buffer; 379 data_in_buffer += new_chars_in_buffer;
372 current_data_offset_ += new_offset; 380 current_data_offset_ += new_offset;
(...skipping 16 matching lines...) Expand all
389 // Given any byte, we can always read its local environment (in both 397 // Given any byte, we can always read its local environment (in both
390 // directions) to find out the (possibly multi-byte) character it belongs 398 // directions) to find out the (possibly multi-byte) character it belongs
391 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a 399 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a
392 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or 400 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or
393 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. 401 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.
394 402
395 // First check if we have leftover data from the last chunk. 403 // First check if we have leftover data from the last chunk.
396 unibrow::uchar c; 404 unibrow::uchar c;
397 if (utf8_split_char_buffer_length_ > 0) { 405 if (utf8_split_char_buffer_length_ > 0) {
398 // Move the bytes which are part of the split character (which started in 406 // Move the bytes which are part of the split character (which started in
399 // the previous chunk) into utf8_split_char_buffer_. Note that the 407 // the previous chunk) into utf8_split_char_buffer_.
400 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. 408 while (
401 while (current_data_offset_ < current_data_length_ && 409 current_data_offset_ < current_data_length_ &&
402 utf8_split_char_buffer_length_ < 4 && 410 utf8_split_char_buffer_length_ < 4 &&
403 (c = current_data_[current_data_offset_]) >> 6 == 2) { 411 IsUtf8MultiCharacterFollower(c = current_data_[current_data_offset_])) {
404 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; 412 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
405 ++utf8_split_char_buffer_length_; 413 ++utf8_split_char_buffer_length_;
406 ++current_data_offset_; 414 ++current_data_offset_;
407 } 415 }
408 416
409 // Convert the data in utf8_split_char_buffer_. 417 // Convert the data in utf8_split_char_buffer_.
410 unsigned new_offset = 0; 418 unsigned new_offset = 0;
411 unsigned new_chars_in_buffer = 419 unsigned new_chars_in_buffer =
412 CopyCharsHelper(buffer_ + *data_in_buffer, 420 CopyCharsHelper(buffer_ + *data_in_buffer,
413 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, 421 kBufferSize - *data_in_buffer, utf8_split_char_buffer_,
414 &new_offset, utf8_split_char_buffer_length_, encoding_); 422 &new_offset, utf8_split_char_buffer_length_, encoding_);
415 *data_in_buffer += new_chars_in_buffer; 423 *data_in_buffer += new_chars_in_buffer;
416 // Make sure we used all the data. 424 // Make sure we used all the data.
417 DCHECK(new_offset == utf8_split_char_buffer_length_); 425 DCHECK(new_offset == utf8_split_char_buffer_length_);
418 DCHECK(*data_in_buffer <= kBufferSize); 426 DCHECK(*data_in_buffer <= kBufferSize);
419 427
420 utf8_split_char_buffer_length_ = 0; 428 utf8_split_char_buffer_length_ = 0;
421 } 429 }
422 430
423 // Move bytes which are part of an incomplete character from the end of the 431 // Move bytes which are part of an incomplete character from the end of the
424 // current chunk to utf8_split_char_buffer_. They will be converted when the 432 // current chunk to utf8_split_char_buffer_. They will be converted when the
425 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4 433 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
426 // bytes long, but if the data is invalid, we can have character values bigger 434 // bytes long, but if the data is invalid, we can have character values bigger
427 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes. 435 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
428 while (current_data_length_ > current_data_offset_ && 436 while (current_data_length_ > current_data_offset_ &&
429 (c = current_data_[current_data_length_ - 1]) > 437 utf8_split_char_buffer_length_ < 4 &&
430 unibrow::Utf8::kMaxOneByteChar && 438 (IsUtf8MultiCharacterFollower(
431 utf8_split_char_buffer_length_ < 4) { 439 c = current_data_[current_data_length_ - 1]) ||
440 IsUtf8MultiCharacterStart(c))) {
432 --current_data_length_; 441 --current_data_length_;
433 ++utf8_split_char_buffer_length_; 442 ++utf8_split_char_buffer_length_;
434 if (c >= (3 << 6)) { 443 if (IsUtf8MultiCharacterStart(c)) {
435 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte 444 // This is the first byte of the multi-byte character. No need to copy the
436 // character. No need to copy the previous characters into the conversion 445 // previous characters into the conversion buffer (even if they're
437 // buffer (even if they're multi-byte). 446 // multi-byte).
438 break; 447 break;
439 } 448 }
440 } 449 }
441 CHECK(utf8_split_char_buffer_length_ <= 4); 450 CHECK(utf8_split_char_buffer_length_ <= 4);
442 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { 451 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {
443 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; 452 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];
444 } 453 }
445 } 454 }
446 455
447 456
(...skipping 11 matching lines...) Expand all
459 int end_position) 468 int end_position)
460 : Utf16CharacterStream(), 469 : Utf16CharacterStream(),
461 source_(data), 470 source_(data),
462 raw_data_(data->GetTwoByteData(start_position)) { 471 raw_data_(data->GetTwoByteData(start_position)) {
463 buffer_cursor_ = raw_data_, 472 buffer_cursor_ = raw_data_,
464 buffer_end_ = raw_data_ + (end_position - start_position); 473 buffer_end_ = raw_data_ + (end_position - start_position);
465 pos_ = start_position; 474 pos_ = start_position;
466 } 475 }
467 476
468 } } // namespace v8::internal 477 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « src/scanner-character-streams.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698