Chromium Code Reviews| Index: src/scanner-character-streams.cc |
| =================================================================== |
| --- src/scanner-character-streams.cc (revision 10944) |
| +++ src/scanner-character-streams.cc (working copy) |
| @@ -36,19 +36,19 @@ |
| namespace internal { |
| // ---------------------------------------------------------------------------- |
| -// BufferedUC16CharacterStreams |
| +// BufferedUtf16CharacterStreams |
| -BufferedUC16CharacterStream::BufferedUC16CharacterStream() |
| - : UC16CharacterStream(), |
| +BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() |
| + : Utf16CharacterStream(), |
| pushback_limit_(NULL) { |
| // Initialize buffer as being empty. First read will fill the buffer. |
| buffer_cursor_ = buffer_; |
| buffer_end_ = buffer_; |
| } |
| -BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { } |
| +BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { } |
| -void BufferedUC16CharacterStream::PushBack(uc32 character) { |
| +void BufferedUtf16CharacterStream::PushBack(uc32 character) { |
| if (character == kEndOfInput) { |
| pos_--; |
| return; |
| @@ -63,7 +63,7 @@ |
| } |
| -void BufferedUC16CharacterStream::SlowPushBack(uc16 character) { |
| +void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) { |
| // In pushback mode, the end of the buffer contains pushback, |
| // and the start of the buffer (from buffer start to pushback_limit_) |
| // contains valid data that comes just after the pushback. |
| @@ -89,7 +89,7 @@ |
| } |
| -bool BufferedUC16CharacterStream::ReadBlock() { |
| +bool BufferedUtf16CharacterStream::ReadBlock() { |
| buffer_cursor_ = buffer_; |
| if (pushback_limit_ != NULL) { |
| // Leave pushback mode. |
| @@ -106,7 +106,7 @@ |
| } |
| -unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) { |
| +unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) { |
| // Leave pushback mode (i.e., ignore that there might be valid data |
| // in the buffer before the pushback_limit_ point). |
| pushback_limit_ = NULL; |
| @@ -114,10 +114,10 @@ |
| } |
| // ---------------------------------------------------------------------------- |
| -// GenericStringUC16CharacterStream |
| +// GenericStringUtf16CharacterStream |
| -GenericStringUC16CharacterStream::GenericStringUC16CharacterStream( |
| +GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( |
| Handle<String> data, |
| unsigned start_position, |
| unsigned end_position) |
| @@ -130,10 +130,10 @@ |
| } |
| -GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { } |
| +GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } |
| -unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) { |
| +unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) { |
| unsigned old_pos = pos_; |
| pos_ = Min(pos_ + delta, length_); |
| ReadBlock(); |
| @@ -141,7 +141,7 @@ |
| } |
| -unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos, |
| +unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos, |
| unsigned length) { |
| if (from_pos >= length_) return 0; |
| if (from_pos + length > length_) { |
| @@ -153,10 +153,10 @@ |
| // ---------------------------------------------------------------------------- |
| -// Utf8ToUC16CharacterStream |
| -Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data, |
| - unsigned length) |
| - : BufferedUC16CharacterStream(), |
| +// Utf8ToUtf16CharacterStream |
| +Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, |
| + unsigned length) |
| + : BufferedUtf16CharacterStream(), |
| raw_data_(data), |
| raw_data_length_(length), |
| raw_data_pos_(0), |
| @@ -165,10 +165,10 @@ |
| } |
| -Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { } |
| +Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } |
| -unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) { |
| +unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { |
| unsigned old_pos = pos_; |
| unsigned target_pos = pos_ + delta; |
| SetRawPosition(target_pos); |
| @@ -178,9 +178,9 @@ |
| } |
| -unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position, |
| - unsigned length) { |
| - static const unibrow::uchar kMaxUC16Character = 0xffff; |
| +unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position, |
| + unsigned length) { |
| + static const unibrow::uchar kMaxUtf16Character = 0xffff; |
| SetRawPosition(char_position); |
| if (raw_character_position_ != char_position) { |
| // char_position was not a valid position in the stream (hit the end |
| @@ -188,7 +188,7 @@ |
| return 0u; |
| } |
| unsigned i = 0; |
| - while (i < length) { |
| + while (i < length - 1) { |
|
rossberg
2012/03/07 13:32:47
When does this ever consume the last character in
Erik Corry
2012/03/11 19:29:22
This loop condition does not prevent it from consu
|
| if (raw_data_pos_ == raw_data_length_) break; |
| unibrow::uchar c = raw_data_[raw_data_pos_]; |
| if (c <= unibrow::Utf8::kMaxOneByteChar) { |
| @@ -197,12 +197,13 @@ |
| c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_, |
| raw_data_length_ - raw_data_pos_, |
| &raw_data_pos_); |
| - // Don't allow characters outside of the BMP. |
| - if (c > kMaxUC16Character) { |
| - c = unibrow::Utf8::kBadChar; |
| - } |
| } |
| - buffer_[i++] = static_cast<uc16>(c); |
| + if (c > kMaxUtf16Character) { |
| + buffer_[i++] = unibrow::Utf16::LeadSurrogate(c); |
| + buffer_[i++] = unibrow::Utf16::TrailSurrogate(c); |
| + } else { |
| + buffer_[i++] = static_cast<uc16>(c); |
| + } |
| } |
| raw_character_position_ = char_position + i; |
| return i; |
| @@ -266,37 +267,52 @@ |
| } |
| -void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) { |
| +// This can't set a raw position between the two halves of a surrogate pair, |
| +// since no position in the UTF-8 stream corresponds to that. This assumes |
| +// the surrogate pair is correctly coded as a 4-byte UTF-8 sequence. If it is |
| +// illegally coded as two 3-byte sequences then there is no problem here. |
| +void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) { |
| if (raw_character_position_ > target_position) { |
| // Spool backwards in utf8 buffer. |
| do { |
| + int old_pos = raw_data_pos_; |
| Utf8CharacterBack(raw_data_, &raw_data_pos_); |
| raw_character_position_--; |
| + ASSERT(old_pos - raw_data_pos_ <= 4); |
| + // Step back over both code units for surrogate pairs. |
| + if (old_pos - raw_data_pos_ == 4) raw_character_position_--; |
| } while (raw_character_position_ > target_position); |
| + // No surrogate pair splitting. |
| + ASSERT(raw_character_position_ == target_position); |
| return; |
| } |
| // Spool forwards in the utf8 buffer. |
| while (raw_character_position_ < target_position) { |
| if (raw_data_pos_ == raw_data_length_) return; |
| + int old_pos = raw_data_pos_; |
| Utf8CharacterForward(raw_data_, &raw_data_pos_); |
| raw_character_position_++; |
| + ASSERT(raw_data_pos_ - old_pos <= 4); |
| + if (raw_data_pos_ - old_pos == 4) raw_character_position_++; |
| } |
| + // No surrogate pair splitting. |
| + ASSERT(raw_character_position_ == target_position); |
| } |
| // ---------------------------------------------------------------------------- |
| -// ExternalTwoByteStringUC16CharacterStream |
| +// ExternalTwoByteStringUtf16CharacterStream |
| -ExternalTwoByteStringUC16CharacterStream:: |
| - ~ExternalTwoByteStringUC16CharacterStream() { } |
| +ExternalTwoByteStringUtf16CharacterStream:: |
| + ~ExternalTwoByteStringUtf16CharacterStream() { } |
| -ExternalTwoByteStringUC16CharacterStream |
| - ::ExternalTwoByteStringUC16CharacterStream( |
| +ExternalTwoByteStringUtf16CharacterStream |
| + ::ExternalTwoByteStringUtf16CharacterStream( |
| Handle<ExternalTwoByteString> data, |
| int start_position, |
| int end_position) |
| - : UC16CharacterStream(), |
| + : Utf16CharacterStream(), |
| source_(data), |
| raw_data_(data->GetTwoByteData(start_position)) { |
| buffer_cursor_ = raw_data_, |