Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1146)

Unified Diff: src/scanner-character-streams.cc

Issue 708823002: Streaming API: detect UTF-8 BOM. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: . Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/scanner-character-streams.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/scanner-character-streams.cc
diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc
index 50c3955c1bdc888200f593cad0f45ec69900ae18..0dec5daed4aa321a458c69041fe82def9fa941f1 100644
--- a/src/scanner-character-streams.cc
+++ b/src/scanner-character-streams.cc
@@ -232,14 +232,12 @@ unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
static const byte kUtf8MultiByteMask = 0xC0;
static const byte kUtf8MultiByteCharFollower = 0x80;
+static const byte kUtf8MultiByteCharStart = 0xC0;
-#ifdef DEBUG
-static const byte kUtf8MultiByteCharStart = 0xC0;
static bool IsUtf8MultiCharacterStart(byte first_byte) {
return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
}
-#endif
static bool IsUtf8MultiCharacterFollower(byte later_byte) {
@@ -341,6 +339,14 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
// A caveat: a data chunk might end with bytes from an incomplete UTF-8
// character (the rest of the bytes will be in the next chunk).
if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
+ if (first_chunk_) {
+ // Get rid of the byte order mark (if any).
+ if (current_data_length_ >= 3 && current_data_[0] == 0xef &&
+ current_data_[1] == 0xbb && current_data_[2] == 0xbf) {
+ current_data_offset_ = 3;
+ }
+ }
+
HandleUtf8SplitCharacters(&data_in_buffer);
if (!data_ends && current_data_offset_ == current_data_length_) {
// The data stream didn't end, but we used all the data in the
@@ -360,6 +366,8 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
DCHECK(utf8_split_char_buffer_length_ == 0);
return data_in_buffer;
}
+
+ first_chunk_ = false;
}
// Fill the buffer from current_data_.
@@ -396,11 +404,11 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(
unibrow::uchar c;
if (utf8_split_char_buffer_length_ > 0) {
// Move the bytes which are part of the split character (which started in
- // the previous chunk) into utf8_split_char_buffer_. Note that the
- // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
- while (current_data_offset_ < current_data_length_ &&
- utf8_split_char_buffer_length_ < 4 &&
- (c = current_data_[current_data_offset_]) >> 6 == 2) {
+ // the previous chunk) into utf8_split_char_buffer_.
+ while (
+ current_data_offset_ < current_data_length_ &&
+ utf8_split_char_buffer_length_ < 4 &&
+ IsUtf8MultiCharacterFollower(c = current_data_[current_data_offset_])) {
utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
++utf8_split_char_buffer_length_;
++current_data_offset_;
@@ -426,15 +434,16 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(
// bytes long, but if the data is invalid, we can have character values bigger
// than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
while (current_data_length_ > current_data_offset_ &&
- (c = current_data_[current_data_length_ - 1]) >
- unibrow::Utf8::kMaxOneByteChar &&
- utf8_split_char_buffer_length_ < 4) {
+ utf8_split_char_buffer_length_ < 4 &&
+ (IsUtf8MultiCharacterFollower(
+ c = current_data_[current_data_length_ - 1]) ||
+ IsUtf8MultiCharacterStart(c))) {
--current_data_length_;
++utf8_split_char_buffer_length_;
- if (c >= (3 << 6)) {
- // 3 << 6 = 0b11000000; this is the first byte of the multi-byte
- // character. No need to copy the previous characters into the conversion
- // buffer (even if they're multi-byte).
+ if (IsUtf8MultiCharacterStart(c)) {
+ // This is the first byte of the multi-byte character. No need to copy the
+ // previous characters into the conversion buffer (even if they're
+ // multi-byte).
break;
}
}
« no previous file with comments | « src/scanner-character-streams.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698