| Index: src/scanner-character-streams.cc
|
| ===================================================================
|
| --- src/scanner-character-streams.cc (revision 10944)
|
| +++ src/scanner-character-streams.cc (working copy)
|
| @@ -36,19 +36,19 @@
|
| namespace internal {
|
|
|
| // ----------------------------------------------------------------------------
|
| -// BufferedUC16CharacterStreams
|
| +// BufferedUtf16CharacterStreams
|
|
|
| -BufferedUC16CharacterStream::BufferedUC16CharacterStream()
|
| - : UC16CharacterStream(),
|
| +BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
|
| + : Utf16CharacterStream(),
|
| pushback_limit_(NULL) {
|
| // Initialize buffer as being empty. First read will fill the buffer.
|
| buffer_cursor_ = buffer_;
|
| buffer_end_ = buffer_;
|
| }
|
|
|
| -BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
|
| +BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
|
|
|
| -void BufferedUC16CharacterStream::PushBack(uc32 character) {
|
| +void BufferedUtf16CharacterStream::PushBack(uc32 character) {
|
| if (character == kEndOfInput) {
|
| pos_--;
|
| return;
|
| @@ -63,7 +63,7 @@
|
| }
|
|
|
|
|
| -void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
|
| +void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
|
| // In pushback mode, the end of the buffer contains pushback,
|
| // and the start of the buffer (from buffer start to pushback_limit_)
|
| // contains valid data that comes just after the pushback.
|
| @@ -89,7 +89,7 @@
|
| }
|
|
|
|
|
| -bool BufferedUC16CharacterStream::ReadBlock() {
|
| +bool BufferedUtf16CharacterStream::ReadBlock() {
|
| buffer_cursor_ = buffer_;
|
| if (pushback_limit_ != NULL) {
|
| // Leave pushback mode.
|
| @@ -106,7 +106,7 @@
|
| }
|
|
|
|
|
| -unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
|
| +unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
|
| // Leave pushback mode (i.e., ignore that there might be valid data
|
| // in the buffer before the pushback_limit_ point).
|
| pushback_limit_ = NULL;
|
| @@ -114,10 +114,10 @@
|
| }
|
|
|
| // ----------------------------------------------------------------------------
|
| -// GenericStringUC16CharacterStream
|
| +// GenericStringUtf16CharacterStream
|
|
|
|
|
| -GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
|
| +GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
|
| Handle<String> data,
|
| unsigned start_position,
|
| unsigned end_position)
|
| @@ -130,10 +130,10 @@
|
| }
|
|
|
|
|
| -GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }
|
| +GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
|
|
|
|
|
| -unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
|
| +unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
|
| unsigned old_pos = pos_;
|
| pos_ = Min(pos_ + delta, length_);
|
| ReadBlock();
|
| @@ -141,7 +141,7 @@
|
| }
|
|
|
|
|
| -unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
|
| +unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
|
| unsigned length) {
|
| if (from_pos >= length_) return 0;
|
| if (from_pos + length > length_) {
|
| @@ -153,10 +153,10 @@
|
|
|
|
|
| // ----------------------------------------------------------------------------
|
| -// Utf8ToUC16CharacterStream
|
| -Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
|
| - unsigned length)
|
| - : BufferedUC16CharacterStream(),
|
| +// Utf8ToUtf16CharacterStream
|
| +Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
|
| + unsigned length)
|
| + : BufferedUtf16CharacterStream(),
|
| raw_data_(data),
|
| raw_data_length_(length),
|
| raw_data_pos_(0),
|
| @@ -165,10 +165,10 @@
|
| }
|
|
|
|
|
| -Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }
|
| +Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
|
|
|
|
|
| -unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
|
| +unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
|
| unsigned old_pos = pos_;
|
| unsigned target_pos = pos_ + delta;
|
| SetRawPosition(target_pos);
|
| @@ -178,9 +178,9 @@
|
| }
|
|
|
|
|
| -unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
|
| - unsigned length) {
|
| - static const unibrow::uchar kMaxUC16Character = 0xffff;
|
| +unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
|
| + unsigned length) {
|
| + static const unibrow::uchar kMaxUtf16Character = 0xffff;
|
| SetRawPosition(char_position);
|
| if (raw_character_position_ != char_position) {
|
| // char_position was not a valid position in the stream (hit the end
|
| @@ -188,7 +188,7 @@
|
| return 0u;
|
| }
|
| unsigned i = 0;
|
| - while (i < length) {
|
| + while (i < length - 1) {
|
| if (raw_data_pos_ == raw_data_length_) break;
|
| unibrow::uchar c = raw_data_[raw_data_pos_];
|
| if (c <= unibrow::Utf8::kMaxOneByteChar) {
|
| @@ -197,12 +197,13 @@
|
| c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
|
| raw_data_length_ - raw_data_pos_,
|
| &raw_data_pos_);
|
| - // Don't allow characters outside of the BMP.
|
| - if (c > kMaxUC16Character) {
|
| - c = unibrow::Utf8::kBadChar;
|
| - }
|
| }
|
| - buffer_[i++] = static_cast<uc16>(c);
|
| + if (c > kMaxUtf16Character) {
|
| + buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
|
| + buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
|
| + } else {
|
| + buffer_[i++] = static_cast<uc16>(c);
|
| + }
|
| }
|
| raw_character_position_ = char_position + i;
|
| return i;
|
| @@ -266,37 +267,52 @@
|
| }
|
|
|
|
|
| -void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
|
| +// This can't set a raw position inside a surrogate pair, since there is no
|
| +// position in the UTF-8 stream that corresponds to that. This assumes that
|
| +// the surrogate pair is correctly coded as a 4-byte UTF-8 sequence. If it is
|
| +// illegally coded as two 3-byte sequences then there is no problem here.
|
| +void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
|
| if (raw_character_position_ > target_position) {
|
| // Spool backwards in utf8 buffer.
|
| do {
|
| + int old_pos = raw_data_pos_;
|
| Utf8CharacterBack(raw_data_, &raw_data_pos_);
|
| raw_character_position_--;
|
| + ASSERT(old_pos - raw_data_pos_ <= 4);
|
| + // Step back over both code units for surrogate pairs.
|
| + if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
|
| } while (raw_character_position_ > target_position);
|
| + // No surrogate pair splitting.
|
| + ASSERT(raw_character_position_ == target_position);
|
| return;
|
| }
|
| // Spool forwards in the utf8 buffer.
|
| while (raw_character_position_ < target_position) {
|
| if (raw_data_pos_ == raw_data_length_) return;
|
| + int old_pos = raw_data_pos_;
|
| Utf8CharacterForward(raw_data_, &raw_data_pos_);
|
| raw_character_position_++;
|
| + ASSERT(raw_data_pos_ - old_pos <= 4);
|
| + if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
|
| }
|
| + // No surrogate pair splitting.
|
| + ASSERT(raw_character_position_ == target_position);
|
| }
|
|
|
|
|
| // ----------------------------------------------------------------------------
|
| -// ExternalTwoByteStringUC16CharacterStream
|
| +// ExternalTwoByteStringUtf16CharacterStream
|
|
|
| -ExternalTwoByteStringUC16CharacterStream::
|
| - ~ExternalTwoByteStringUC16CharacterStream() { }
|
| +ExternalTwoByteStringUtf16CharacterStream::
|
| + ~ExternalTwoByteStringUtf16CharacterStream() { }
|
|
|
|
|
| -ExternalTwoByteStringUC16CharacterStream
|
| - ::ExternalTwoByteStringUC16CharacterStream(
|
| +ExternalTwoByteStringUtf16CharacterStream
|
| + ::ExternalTwoByteStringUtf16CharacterStream(
|
| Handle<ExternalTwoByteString> data,
|
| int start_position,
|
| int end_position)
|
| - : UC16CharacterStream(),
|
| + : Utf16CharacterStream(),
|
| source_(data),
|
| raw_data_(data->GetTwoByteData(start_position)) {
|
| buffer_cursor_ = raw_data_,
|
|
|