OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/v8.h" | 5 #include "src/v8.h" |
6 | 6 |
7 #include "src/scanner-character-streams.h" | 7 #include "src/scanner-character-streams.h" |
8 | 8 |
| 9 #include "include/v8.h" |
9 #include "src/handles.h" | 10 #include "src/handles.h" |
10 #include "src/unicode-inl.h" | 11 #include "src/unicode-inl.h" |
11 | 12 |
12 namespace v8 { | 13 namespace v8 { |
13 namespace internal { | 14 namespace internal { |
14 | 15 |
| 16 namespace { |
| 17 |
| 18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src, |
| 19 unsigned* src_pos, unsigned src_length, |
| 20 ScriptCompiler::StreamedSource::Encoding encoding) { |
| 21 if (encoding == ScriptCompiler::StreamedSource::UTF8) { |
| 22 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( |
| 23 dest, length, src, src_pos, src_length); |
| 24 } |
| 25 |
| 26 unsigned to_fill = length; |
| 27 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; |
| 28 |
| 29 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { |
| 30 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); |
| 31 } else { |
| 32 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); |
| 33 v8::internal::CopyChars<uint16_t, uint16_t>( |
| 34 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); |
| 35 } |
| 36 *src_pos += to_fill; |
| 37 return to_fill; |
| 38 } |
| 39 |
| 40 } // namespace |
| 41 |
| 42 |
15 // ---------------------------------------------------------------------------- | 43 // ---------------------------------------------------------------------------- |
16 // BufferedUtf16CharacterStreams | 44 // BufferedUtf16CharacterStreams |
17 | 45 |
18 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() | 46 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() |
19 : Utf16CharacterStream(), | 47 : Utf16CharacterStream(), |
20 pushback_limit_(NULL) { | 48 pushback_limit_(NULL) { |
21 // Initialize buffer as being empty. First read will fill the buffer. | 49 // Initialize buffer as being empty. First read will fill the buffer. |
22 buffer_cursor_ = buffer_; | 50 buffer_cursor_ = buffer_; |
23 buffer_end_ = buffer_; | 51 buffer_end_ = buffer_; |
24 } | 52 } |
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
138 raw_data_length_(length), | 166 raw_data_length_(length), |
139 raw_data_pos_(0), | 167 raw_data_pos_(0), |
140 raw_character_position_(0) { | 168 raw_character_position_(0) { |
141 ReadBlock(); | 169 ReadBlock(); |
142 } | 170 } |
143 | 171 |
144 | 172 |
// Destructor. The body is intentionally empty.
Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
146 | 174 |
147 | 175 |
| 176 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length, |
| 177 const byte* src, |
| 178 unsigned* src_pos, |
| 179 unsigned src_length) { |
| 180 static const unibrow::uchar kMaxUtf16Character = 0xffff; |
| 181 unsigned i = 0; |
| 182 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer |
| 183 // one character early (in the normal case), because we need to have at least |
| 184 // two free spaces in the buffer to be sure that the next character will fit. |
| 185 while (i < length - 1) { |
| 186 if (*src_pos == src_length) break; |
| 187 unibrow::uchar c = src[*src_pos]; |
| 188 if (c <= unibrow::Utf8::kMaxOneByteChar) { |
| 189 *src_pos = *src_pos + 1; |
| 190 } else { |
| 191 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, |
| 192 src_pos); |
| 193 } |
| 194 if (c > kMaxUtf16Character) { |
| 195 dest[i++] = unibrow::Utf16::LeadSurrogate(c); |
| 196 dest[i++] = unibrow::Utf16::TrailSurrogate(c); |
| 197 } else { |
| 198 dest[i++] = static_cast<uc16>(c); |
| 199 } |
| 200 } |
| 201 return i; |
| 202 } |
| 203 |
| 204 |
148 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { | 205 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { |
149 unsigned old_pos = pos_; | 206 unsigned old_pos = pos_; |
150 unsigned target_pos = pos_ + delta; | 207 unsigned target_pos = pos_ + delta; |
151 SetRawPosition(target_pos); | 208 SetRawPosition(target_pos); |
152 pos_ = raw_character_position_; | 209 pos_ = raw_character_position_; |
153 ReadBlock(); | 210 ReadBlock(); |
154 return pos_ - old_pos; | 211 return pos_ - old_pos; |
155 } | 212 } |
156 | 213 |
157 | 214 |
158 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) { | 215 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) { |
159 static const unibrow::uchar kMaxUtf16Character = 0xffff; | |
160 SetRawPosition(char_position); | 216 SetRawPosition(char_position); |
161 if (raw_character_position_ != char_position) { | 217 if (raw_character_position_ != char_position) { |
162 // char_position was not a valid position in the stream (hit the end | 218 // char_position was not a valid position in the stream (hit the end |
163 // while spooling to it). | 219 // while spooling to it). |
164 return 0u; | 220 return 0u; |
165 } | 221 } |
166 unsigned i = 0; | 222 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, |
167 while (i < kBufferSize - 1) { | 223 raw_data_length_); |
168 if (raw_data_pos_ == raw_data_length_) break; | |
169 unibrow::uchar c = raw_data_[raw_data_pos_]; | |
170 if (c <= unibrow::Utf8::kMaxOneByteChar) { | |
171 raw_data_pos_++; | |
172 } else { | |
173 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_, | |
174 raw_data_length_ - raw_data_pos_, | |
175 &raw_data_pos_); | |
176 } | |
177 if (c > kMaxUtf16Character) { | |
178 buffer_[i++] = unibrow::Utf16::LeadSurrogate(c); | |
179 buffer_[i++] = unibrow::Utf16::TrailSurrogate(c); | |
180 } else { | |
181 buffer_[i++] = static_cast<uc16>(c); | |
182 } | |
183 } | |
184 raw_character_position_ = char_position + i; | 224 raw_character_position_ = char_position + i; |
185 return i; | 225 return i; |
186 } | 226 } |
187 | 227 |
188 | 228 |
// Bit patterns for classifying UTF-8 bytes: 0xC0 selects the two most
// significant bits of a byte, and 0x80 is the value 0b10xxxxxx takes under
// that mask.
// NOTE(review): presumably (byte & kUtf8MultiByteMask) ==
// kUtf8MultiByteCharFollower identifies a continuation byte; the users of
// these constants are not visible in this excerpt.
static const byte kUtf8MultiByteMask = 0xC0;
static const byte kUtf8MultiByteCharFollower = 0x80;
191 | 231 |
192 | 232 |
193 #ifdef DEBUG | 233 #ifdef DEBUG |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
269 Utf8CharacterForward(raw_data_, &raw_data_pos_); | 309 Utf8CharacterForward(raw_data_, &raw_data_pos_); |
270 raw_character_position_++; | 310 raw_character_position_++; |
271 DCHECK(raw_data_pos_ - old_pos <= 4); | 311 DCHECK(raw_data_pos_ - old_pos <= 4); |
272 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | 312 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; |
273 } | 313 } |
274 // No surrogate pair splitting. | 314 // No surrogate pair splitting. |
275 DCHECK(raw_character_position_ == target_position); | 315 DCHECK(raw_character_position_ == target_position); |
276 } | 316 } |
277 | 317 |
278 | 318 |
| 319 unsigned ExternalStreamingStream::FillBuffer(unsigned position) { |
| 320 // Ignore "position" which is the position in the decoded data. Instead, |
| 321 // ExternalStreamingStream keeps track of the position in the raw data. |
| 322 unsigned data_in_buffer = 0; |
| 323 // Note that the UTF-8 decoder might not be able to fill the buffer |
| 324 // completely; it will typically leave the last character empty (see |
| 325 // Utf8ToUtf16CharacterStream::CopyChars). |
| 326 while (data_in_buffer < kBufferSize - 1) { |
| 327 if (current_data_ == NULL) { |
| 328 // GetSomeData will wait until the embedder has enough data. |
| 329 current_data_length_ = source_stream_->GetMoreData(¤t_data_); |
| 330 current_data_offset_ = 0; |
| 331 bool data_ends = current_data_length_ == 0; |
| 332 |
| 333 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 |
| 334 // character (the rest of the bytes will be in the next chunk). |
| 335 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { |
| 336 HandleUtf8SplitCharacters(&data_in_buffer); |
| 337 if (!data_ends && current_data_offset_ == current_data_length_) { |
| 338 // The data stream didn't end, but we used all the data in the |
| 339 // chunk. This will only happen when the chunk was really small. We |
| 340 // don't handle the case where a UTF-8 character is split over several |
| 341 // chunks; in that case V8 won't crash, but it will be a parse error. |
| 342 delete[] current_data_; |
| 343 current_data_ = NULL; |
| 344 current_data_length_ = 0; |
| 345 current_data_offset_ = 0; |
| 346 continue; // Request a new chunk. |
| 347 } |
| 348 } |
| 349 |
| 350 // Did the data stream end? |
| 351 if (data_ends) { |
| 352 DCHECK(utf8_split_char_buffer_length_ == 0); |
| 353 return data_in_buffer; |
| 354 } |
| 355 } |
| 356 |
| 357 // Fill the buffer from current_data_. |
| 358 unsigned new_offset = 0; |
| 359 unsigned new_chars_in_buffer = |
| 360 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, |
| 361 current_data_ + current_data_offset_, &new_offset, |
| 362 current_data_length_ - current_data_offset_, encoding_); |
| 363 data_in_buffer += new_chars_in_buffer; |
| 364 current_data_offset_ += new_offset; |
| 365 DCHECK(data_in_buffer <= kBufferSize); |
| 366 |
| 367 // Did we use all the data in the data chunk? |
| 368 if (current_data_offset_ == current_data_length_) { |
| 369 delete[] current_data_; |
| 370 current_data_ = NULL; |
| 371 current_data_length_ = 0; |
| 372 current_data_offset_ = 0; |
| 373 } |
| 374 } |
| 375 return data_in_buffer; |
| 376 } |
| 377 |
| 378 void ExternalStreamingStream::HandleUtf8SplitCharacters( |
| 379 unsigned* data_in_buffer) { |
| 380 // First check if we have leftover data from the last chunk. |
| 381 unibrow::uchar c; |
| 382 if (utf8_split_char_buffer_length_ > 0) { |
| 383 // Move the bytes which are part of the split character (which started in |
| 384 // the previous chunk) into utf8_split_char_buffer_. |
| 385 while (current_data_offset_ < current_data_length_ && |
| 386 utf8_split_char_buffer_length_ < 4 && |
| 387 (c = current_data_[current_data_offset_]) > |
| 388 unibrow::Utf8::kMaxOneByteChar) { |
| 389 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; |
| 390 ++utf8_split_char_buffer_length_; |
| 391 ++current_data_offset_; |
| 392 } |
| 393 |
| 394 // Convert the data in utf8_split_char_buffer_. |
| 395 unsigned new_offset = 0; |
| 396 unsigned new_chars_in_buffer = |
| 397 CopyCharsHelper(buffer_ + *data_in_buffer, |
| 398 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, |
| 399 &new_offset, utf8_split_char_buffer_length_, encoding_); |
| 400 *data_in_buffer += new_chars_in_buffer; |
| 401 // Make sure we used all the data. |
| 402 DCHECK(new_offset == utf8_split_char_buffer_length_); |
| 403 DCHECK(*data_in_buffer <= kBufferSize); |
| 404 |
| 405 utf8_split_char_buffer_length_ = 0; |
| 406 } |
| 407 |
| 408 // Move bytes which are part of an incomplete character from the end of the |
| 409 // current chunk to utf8_split_char_buffer_. They will be converted when the |
| 410 // next data chunk arrives. |
| 411 while (current_data_length_ > current_data_offset_ && |
| 412 (c = current_data_[current_data_length_ - 1]) > |
| 413 unibrow::Utf8::kMaxOneByteChar) { |
| 414 --current_data_length_; |
| 415 ++utf8_split_char_buffer_length_; |
| 416 } |
| 417 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { |
| 418 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; |
| 419 } |
| 420 } |
| 421 |
| 422 |
279 // ---------------------------------------------------------------------------- | 423 // ---------------------------------------------------------------------------- |
280 // ExternalTwoByteStringUtf16CharacterStream | 424 // ExternalTwoByteStringUtf16CharacterStream |
281 | 425 |
// Destructor. The body is intentionally empty.
ExternalTwoByteStringUtf16CharacterStream::
    ~ExternalTwoByteStringUtf16CharacterStream() { }
284 | 428 |
285 | 429 |
286 ExternalTwoByteStringUtf16CharacterStream | 430 ExternalTwoByteStringUtf16CharacterStream |
287 ::ExternalTwoByteStringUtf16CharacterStream( | 431 ::ExternalTwoByteStringUtf16CharacterStream( |
288 Handle<ExternalTwoByteString> data, | 432 Handle<ExternalTwoByteString> data, |
289 int start_position, | 433 int start_position, |
290 int end_position) | 434 int end_position) |
291 : Utf16CharacterStream(), | 435 : Utf16CharacterStream(), |
292 source_(data), | 436 source_(data), |
293 raw_data_(data->GetTwoByteData(start_position)) { | 437 raw_data_(data->GetTwoByteData(start_position)) { |
294 buffer_cursor_ = raw_data_, | 438 buffer_cursor_ = raw_data_, |
295 buffer_end_ = raw_data_ + (end_position - start_position); | 439 buffer_end_ = raw_data_ + (end_position - start_position); |
296 pos_ = start_position; | 440 pos_ = start_position; |
297 } | 441 } |
298 | 442 |
299 } } // namespace v8::internal | 443 } } // namespace v8::internal |
OLD | NEW |