| OLD | NEW |
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/parsing/scanner-character-streams.h" | 5 #include "src/parsing/scanner-character-streams.h" |
| 6 | 6 |
| 7 #include "include/v8.h" | 7 #include "include/v8.h" |
| 8 #include "src/globals.h" | 8 #include "src/globals.h" |
| 9 #include "src/handles.h" | 9 #include "src/handles.h" |
| 10 #include "src/list-inl.h" // TODO(mstarzinger): Temporary cycle breaker! | 10 #include "src/list-inl.h" // TODO(mstarzinger): Temporary cycle breaker! |
| 11 #include "src/objects-inl.h" | 11 #include "src/objects-inl.h" |
| 12 #include "src/unicode-inl.h" | 12 #include "src/unicode-inl.h" |
| 13 | 13 |
| 14 namespace v8 { | 14 namespace v8 { |
| 15 namespace internal { | 15 namespace internal { |
| 16 | 16 |
| 17 namespace { | 17 namespace { |
| 18 | 18 |
| 19 size_t CopyUtf8CharsToUtf16Chars(uint16_t* dest, size_t length, const byte* src, |
| 20 size_t* src_pos, size_t src_length) { |
| 21 static const unibrow::uchar kMaxUtf16Character = |
| 22 unibrow::Utf16::kMaxNonSurrogateCharCode; |
| 23 size_t i = 0; |
| 24 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer |
| 25 // one character early (in the normal case), because we need to have at least |
| 26 // two free spaces in the buffer to be sure that the next character will fit. |
| 27 while (i < length - 1) { |
| 28 if (*src_pos == src_length) break; |
| 29 unibrow::uchar c = src[*src_pos]; |
| 30 if (c <= unibrow::Utf8::kMaxOneByteChar) { |
| 31 *src_pos = *src_pos + 1; |
| 32 } else { |
| 33 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, |
| 34 src_pos); |
| 35 } |
| 36 if (c > kMaxUtf16Character) { |
| 37 dest[i++] = unibrow::Utf16::LeadSurrogate(c); |
| 38 dest[i++] = unibrow::Utf16::TrailSurrogate(c); |
| 39 } else { |
| 40 dest[i++] = static_cast<uc16>(c); |
| 41 } |
| 42 } |
| 43 return i; |
| 44 } |
| 45 |
| 19 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, | 46 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, |
| 20 size_t* src_pos, size_t src_length, | 47 size_t* src_pos, size_t src_length, |
| 21 ScriptCompiler::StreamedSource::Encoding encoding) { | 48 ScriptCompiler::StreamedSource::Encoding encoding) { |
| 22 // It's possible that this will be called with length 0, but don't assume that | 49 // It's possible that this will be called with length 0, but don't assume that |
| 23 // the functions this calls handle it gracefully. | 50 // the functions this calls handle it gracefully. |
| 24 if (length == 0) return 0; | 51 if (length == 0) return 0; |
| 25 | 52 |
| 26 if (encoding == ScriptCompiler::StreamedSource::UTF8) { | 53 if (encoding == ScriptCompiler::StreamedSource::UTF8) { |
| 27 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( | 54 return CopyUtf8CharsToUtf16Chars(dest, length, src, src_pos, src_length); |
| 28 dest, length, src, src_pos, src_length); | |
| 29 } | 55 } |
| 30 | 56 |
| 31 size_t to_fill = length; | 57 size_t to_fill = length; |
| 32 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; | 58 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; |
| 33 | 59 |
| 34 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { | 60 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { |
| 35 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); | 61 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); |
| 36 } else { | 62 } else { |
| 37 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); | 63 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); |
| 38 v8::internal::CopyChars<uint16_t, uint16_t>( | 64 v8::internal::CopyChars<uint16_t, uint16_t>( |
| (...skipping 129 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 168 if (from_pos + length > length_) { | 194 if (from_pos + length > length_) { |
| 169 length = length_ - from_pos; | 195 length = length_ - from_pos; |
| 170 } | 196 } |
| 171 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), | 197 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), |
| 172 static_cast<int>(from_pos + length)); | 198 static_cast<int>(from_pos + length)); |
| 173 return length; | 199 return length; |
| 174 } | 200 } |
| 175 | 201 |
| 176 | 202 |
| 177 // ---------------------------------------------------------------------------- | 203 // ---------------------------------------------------------------------------- |
| 178 // Utf8ToUtf16CharacterStream | 204 // ExternalStreamingStream |
| 179 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, | |
| 180 size_t length) | |
| 181 : BufferedUtf16CharacterStream(), | |
| 182 raw_data_(data), | |
| 183 raw_data_length_(length), | |
| 184 raw_data_pos_(0), | |
| 185 raw_character_position_(0) { | |
| 186 ReadBlock(); | |
| 187 } | |
| 188 | |
| 189 | |
| 190 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } | |
| 191 | |
| 192 | |
| 193 size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length, | |
| 194 const byte* src, size_t* src_pos, | |
| 195 size_t src_length) { | |
| 196 static const unibrow::uchar kMaxUtf16Character = | |
| 197 unibrow::Utf16::kMaxNonSurrogateCharCode; | |
| 198 size_t i = 0; | |
| 199 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer | |
| 200 // one character early (in the normal case), because we need to have at least | |
| 201 // two free spaces in the buffer to be sure that the next character will fit. | |
| 202 while (i < length - 1) { | |
| 203 if (*src_pos == src_length) break; | |
| 204 unibrow::uchar c = src[*src_pos]; | |
| 205 if (c <= unibrow::Utf8::kMaxOneByteChar) { | |
| 206 *src_pos = *src_pos + 1; | |
| 207 } else { | |
| 208 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, | |
| 209 src_pos); | |
| 210 } | |
| 211 if (c > kMaxUtf16Character) { | |
| 212 dest[i++] = unibrow::Utf16::LeadSurrogate(c); | |
| 213 dest[i++] = unibrow::Utf16::TrailSurrogate(c); | |
| 214 } else { | |
| 215 dest[i++] = static_cast<uc16>(c); | |
| 216 } | |
| 217 } | |
| 218 return i; | |
| 219 } | |
| 220 | |
| 221 | |
| 222 size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) { | |
| 223 size_t old_pos = pos_; | |
| 224 size_t target_pos = pos_ + delta; | |
| 225 SetRawPosition(target_pos); | |
| 226 pos_ = raw_character_position_; | |
| 227 ReadBlock(); | |
| 228 return pos_ - old_pos; | |
| 229 } | |
| 230 | |
| 231 | |
| 232 size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) { | |
| 233 SetRawPosition(char_position); | |
| 234 if (raw_character_position_ != char_position) { | |
| 235 // char_position was not a valid position in the stream (hit the end | |
| 236 // while spooling to it). | |
| 237 return 0u; | |
| 238 } | |
| 239 size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, | |
| 240 raw_data_length_); | |
| 241 raw_character_position_ = char_position + i; | |
| 242 return i; | |
| 243 } | |
| 244 | |
| 245 | |
| 246 static const byte kUtf8MultiByteMask = 0xC0; | |
| 247 static const byte kUtf8MultiByteCharFollower = 0x80; | |
| 248 | |
| 249 | |
| 250 #ifdef DEBUG | |
| 251 static const byte kUtf8MultiByteCharStart = 0xC0; | |
| 252 static bool IsUtf8MultiCharacterStart(byte first_byte) { | |
| 253 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | |
| 254 } | |
| 255 #endif | |
| 256 | |
| 257 | |
| 258 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | |
| 259 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | |
| 260 } | |
| 261 | |
| 262 | |
| 263 // Move the cursor back to point at the preceding UTF-8 character start | |
| 264 // in the buffer. | |
| 265 static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) { | |
| 266 byte character = buffer[--*cursor]; | |
| 267 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
| 268 DCHECK(IsUtf8MultiCharacterFollower(character)); | |
| 269 // Last byte of a multi-byte character encoding. Step backwards until | |
| 270 // pointing to the first byte of the encoding, recognized by having the | |
| 271 // top two bits set. | |
| 272 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } | |
| 273 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); | |
| 274 } | |
| 275 } | |
| 276 | |
| 277 | |
| 278 // Move the cursor forward to point at the next following UTF-8 character start | |
| 279 // in the buffer. | |
| 280 static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) { | |
| 281 byte character = buffer[(*cursor)++]; | |
| 282 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
| 283 // First character of a multi-byte character encoding. | |
| 284 // The number of most-significant one-bits determines the length of the | |
| 285 // encoding: | |
| 286 // 110..... - (0xCx, 0xDx) one additional byte (minimum). | |
| 287 // 1110.... - (0xEx) two additional bytes. | |
| 288 // 11110... - (0xFx) three additional bytes (maximum). | |
| 289 DCHECK(IsUtf8MultiCharacterStart(character)); | |
| 290 // Additional bytes is: | |
| 291 // 1 if value in range 0xC0 .. 0xDF. | |
| 292 // 2 if value in range 0xE0 .. 0xEF. | |
| 293 // 3 if value in range 0xF0 .. 0xF7. | |
| 294 // Encode that in a single value. | |
| 295 size_t additional_bytes = | |
| 296 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | |
| 297 *cursor += additional_bytes; | |
| 298 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | |
| 299 } | |
| 300 } | |
| 301 | |
| 302 | |
| 303 // This can't set a raw position between two surrogate pairs, since there | |
| 304 // is no position in the UTF8 stream that corresponds to that. This assumes | |
| 305 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If | |
| 306 // it is illegally coded as two 3 byte sequences then there is no problem here. | |
| 307 void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) { | |
| 308 if (raw_character_position_ > target_position) { | |
| 309 // Spool backwards in utf8 buffer. | |
| 310 do { | |
| 311 size_t old_pos = raw_data_pos_; | |
| 312 Utf8CharacterBack(raw_data_, &raw_data_pos_); | |
| 313 raw_character_position_--; | |
| 314 DCHECK(old_pos - raw_data_pos_ <= 4); | |
| 315 // Step back over both code units for surrogate pairs. | |
| 316 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; | |
| 317 } while (raw_character_position_ > target_position); | |
| 318 // No surrogate pair splitting. | |
| 319 DCHECK(raw_character_position_ == target_position); | |
| 320 return; | |
| 321 } | |
| 322 // Spool forwards in the utf8 buffer. | |
| 323 while (raw_character_position_ < target_position) { | |
| 324 if (raw_data_pos_ == raw_data_length_) return; | |
| 325 size_t old_pos = raw_data_pos_; | |
| 326 Utf8CharacterForward(raw_data_, &raw_data_pos_); | |
| 327 raw_character_position_++; | |
| 328 DCHECK(raw_data_pos_ - old_pos <= 4); | |
| 329 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | |
| 330 } | |
| 331 // No surrogate pair splitting. | |
| 332 DCHECK(raw_character_position_ == target_position); | |
| 333 } | |
| 334 | |
| 335 | 205 |
| 336 size_t ExternalStreamingStream::FillBuffer(size_t position) { | 206 size_t ExternalStreamingStream::FillBuffer(size_t position) { |
| 337 // Ignore "position" which is the position in the decoded data. Instead, | 207 // Ignore "position" which is the position in the decoded data. Instead, |
| 338 // ExternalStreamingStream keeps track of the position in the raw data. | 208 // ExternalStreamingStream keeps track of the position in the raw data. |
| 339 size_t data_in_buffer = 0; | 209 size_t data_in_buffer = 0; |
| 340 // Note that the UTF-8 decoder might not be able to fill the buffer | 210 // Note that the UTF-8 decoder might not be able to fill the buffer |
| 341 // completely; it will typically leave the last character empty (see | 211 // completely; it will typically leave the last character empty (see |
| 342 // Utf8ToUtf16CharacterStream::CopyChars). | 212 // CopyUtf8CharsToUtf16Chars). |
| 343 while (data_in_buffer < kBufferSize - 1) { | 213 while (data_in_buffer < kBufferSize - 1) { |
| 344 if (current_data_ == NULL) { | 214 if (current_data_ == NULL) { |
| (...skipping 247 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 592 ExternalOneByteStringUtf16CharacterStream( | 462 ExternalOneByteStringUtf16CharacterStream( |
| 593 Handle<ExternalOneByteString> data, int start_position, | 463 Handle<ExternalOneByteString> data, int start_position, |
| 594 int end_position) | 464 int end_position) |
| 595 : raw_data_(data->GetChars()), | 465 : raw_data_(data->GetChars()), |
| 596 length_(end_position), | 466 length_(end_position), |
| 597 bookmark_(kNoBookmark) { | 467 bookmark_(kNoBookmark) { |
| 598 DCHECK(end_position >= start_position); | 468 DCHECK(end_position >= start_position); |
| 599 pos_ = start_position; | 469 pos_ = start_position; |
| 600 } | 470 } |
| 601 | 471 |
// Builds a stream over a raw one-byte character array: |data| is stored
// reinterpreted as a uint8_t pointer, |length| becomes the stream length, and
// no bookmark is set initially.
// NOTE(review): only the pointer is kept, so |data| presumably has to outlive
// the stream - confirm against callers.
// NOTE(review): pos_ is not assigned here (the Handle-based constructor sets
// it to start_position); presumably the base class initializes it - confirm.
ExternalOneByteStringUtf16CharacterStream::
    ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length)
    : raw_data_(reinterpret_cast<const uint8_t*>(data)),
      length_(length),
      bookmark_(kNoBookmark) {}
| 477 |
// Convenience overload for a NUL-terminated string: delegates to the
// (data, length) constructor, using strlen(data) as the length. Because
// strlen is applied to it, |data| must be a non-null, NUL-terminated string.
ExternalOneByteStringUtf16CharacterStream::
    ExternalOneByteStringUtf16CharacterStream(const char* data)
    : ExternalOneByteStringUtf16CharacterStream(data, strlen(data)) {}
| 481 |
// Records the current stream position (pos_) as the bookmark, so that
// ResetToBookmark() can later restore pos_ from it. Always returns true.
bool ExternalOneByteStringUtf16CharacterStream::SetBookmark() {
  bookmark_ = pos_;
  return true;
}
| 606 | 486 |
| 607 void ExternalOneByteStringUtf16CharacterStream::ResetToBookmark() { | 487 void ExternalOneByteStringUtf16CharacterStream::ResetToBookmark() { |
| 608 DCHECK(bookmark_ != kNoBookmark); | 488 DCHECK(bookmark_ != kNoBookmark); |
| 609 pos_ = bookmark_; | 489 pos_ = bookmark_; |
| 610 buffer_cursor_ = buffer_; | 490 buffer_cursor_ = buffer_; |
| 611 buffer_end_ = buffer_ + FillBuffer(pos_); | 491 buffer_end_ = buffer_ + FillBuffer(pos_); |
| (...skipping 11 matching lines...) Expand all Loading... |
| 623 if (from_pos >= length_) return 0; | 503 if (from_pos >= length_) return 0; |
| 624 size_t length = Min(kBufferSize, length_ - from_pos); | 504 size_t length = Min(kBufferSize, length_ - from_pos); |
| 625 for (size_t i = 0; i < length; ++i) { | 505 for (size_t i = 0; i < length; ++i) { |
| 626 buffer_[i] = static_cast<uc16>(raw_data_[from_pos + i]); | 506 buffer_[i] = static_cast<uc16>(raw_data_[from_pos + i]); |
| 627 } | 507 } |
| 628 return length; | 508 return length; |
| 629 } | 509 } |
| 630 | 510 |
| 631 } // namespace internal | 511 } // namespace internal |
| 632 } // namespace v8 | 512 } // namespace v8 |
| OLD | NEW |