OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/v8.h" | 5 #include "src/v8.h" |
6 | 6 |
7 #include "src/scanner-character-streams.h" | 7 #include "src/scanner-character-streams.h" |
8 | 8 |
| 9 #include "include/v8.h" |
9 #include "src/handles.h" | 10 #include "src/handles.h" |
10 #include "src/unicode-inl.h" | 11 #include "src/unicode-inl.h" |
11 | 12 |
12 namespace v8 { | 13 namespace v8 { |
13 namespace internal { | 14 namespace internal { |
14 | 15 |
| 16 namespace { |
| 17 |
| 18 unsigned CopyCharsHelper( |
| 19 uint16_t* dest, unsigned length, const uint8_t* src, unsigned* src_pos, |
| 20 unsigned src_length, |
| 21 ScriptCompiler::ExternalSourceStream::Encoding encoding) { |
| 22 if (encoding == ScriptCompiler::ExternalSourceStream::UTF8) { |
| 23 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( |
| 24 dest, length, src, src_pos, src_length); |
| 25 } |
| 26 |
| 27 unsigned to_fill = length; |
| 28 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; |
| 29 |
| 30 if (encoding == ScriptCompiler::ExternalSourceStream::ONE_BYTE) { |
| 31 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); |
| 32 } else { |
| 33 DCHECK(encoding == ScriptCompiler::ExternalSourceStream::TWO_BYTE); |
| 34 v8::internal::CopyChars<uint16_t, uint16_t>( |
| 35 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); |
| 36 } |
| 37 *src_pos += to_fill; |
| 38 return to_fill; |
| 39 } |
| 40 |
| 41 } // namespace |
| 42 |
| 43 |
15 // ---------------------------------------------------------------------------- | 44 // ---------------------------------------------------------------------------- |
16 // BufferedUtf16CharacterStreams | 45 // BufferedUtf16CharacterStreams |
17 | 46 |
18 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() | 47 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() |
19 : Utf16CharacterStream(), | 48 : Utf16CharacterStream(), |
20 pushback_limit_(NULL) { | 49 pushback_limit_(NULL) { |
21 // Initialize buffer as being empty. First read will fill the buffer. | 50 // Initialize buffer as being empty. First read will fill the buffer. |
22 buffer_cursor_ = buffer_; | 51 buffer_cursor_ = buffer_; |
23 buffer_end_ = buffer_; | 52 buffer_end_ = buffer_; |
24 } | 53 } |
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
138 raw_data_length_(length), | 167 raw_data_length_(length), |
139 raw_data_pos_(0), | 168 raw_data_pos_(0), |
140 raw_character_position_(0) { | 169 raw_character_position_(0) { |
141 ReadBlock(); | 170 ReadBlock(); |
142 } | 171 } |
143 | 172 |
144 | 173 |
145 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } | 174 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } |
146 | 175 |
147 | 176 |
| 177 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length, |
| 178 const byte* src, |
| 179 unsigned* src_pos, |
| 180 unsigned src_length) { |
| 181 static const unibrow::uchar kMaxUtf16Character = 0xffff; |
| 182 unsigned i = 0; |
| 183 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer |
| 184 // one character early (in the normal case), because we need to have at least |
| 185 // two free spaces in the buffer to be sure that the next character will fit. |
| 186 while (i < length - 1) { |
| 187 if (*src_pos == src_length) break; |
| 188 unibrow::uchar c = src[*src_pos]; |
| 189 if (c <= unibrow::Utf8::kMaxOneByteChar) { |
| 190 *src_pos = *src_pos + 1; |
| 191 } else { |
| 192 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, |
| 193 src_pos); |
| 194 } |
| 195 if (c > kMaxUtf16Character) { |
| 196 dest[i++] = unibrow::Utf16::LeadSurrogate(c); |
| 197 dest[i++] = unibrow::Utf16::TrailSurrogate(c); |
| 198 } else { |
| 199 dest[i++] = static_cast<uc16>(c); |
| 200 } |
| 201 } |
| 202 return i; |
| 203 } |
| 204 |
| 205 |
148 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { | 206 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { |
149 unsigned old_pos = pos_; | 207 unsigned old_pos = pos_; |
150 unsigned target_pos = pos_ + delta; | 208 unsigned target_pos = pos_ + delta; |
151 SetRawPosition(target_pos); | 209 SetRawPosition(target_pos); |
152 pos_ = raw_character_position_; | 210 pos_ = raw_character_position_; |
153 ReadBlock(); | 211 ReadBlock(); |
154 return pos_ - old_pos; | 212 return pos_ - old_pos; |
155 } | 213 } |
156 | 214 |
157 | 215 |
158 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) { | 216 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) { |
159 static const unibrow::uchar kMaxUtf16Character = 0xffff; | |
160 SetRawPosition(char_position); | 217 SetRawPosition(char_position); |
161 if (raw_character_position_ != char_position) { | 218 if (raw_character_position_ != char_position) { |
162 // char_position was not a valid position in the stream (hit the end | 219 // char_position was not a valid position in the stream (hit the end |
163 // while spooling to it). | 220 // while spooling to it). |
164 return 0u; | 221 return 0u; |
165 } | 222 } |
166 unsigned i = 0; | 223 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, |
167 while (i < kBufferSize - 1) { | 224 raw_data_length_); |
168 if (raw_data_pos_ == raw_data_length_) break; | |
169 unibrow::uchar c = raw_data_[raw_data_pos_]; | |
170 if (c <= unibrow::Utf8::kMaxOneByteChar) { | |
171 raw_data_pos_++; | |
172 } else { | |
173 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_, | |
174 raw_data_length_ - raw_data_pos_, | |
175 &raw_data_pos_); | |
176 } | |
177 if (c > kMaxUtf16Character) { | |
178 buffer_[i++] = unibrow::Utf16::LeadSurrogate(c); | |
179 buffer_[i++] = unibrow::Utf16::TrailSurrogate(c); | |
180 } else { | |
181 buffer_[i++] = static_cast<uc16>(c); | |
182 } | |
183 } | |
184 raw_character_position_ = char_position + i; | 225 raw_character_position_ = char_position + i; |
185 return i; | 226 return i; |
186 } | 227 } |
187 | 228 |
188 | 229 |
// Bit patterns for classifying UTF-8 bytes.
// NOTE(review): presumably a byte b is a continuation ("follower") byte when
// (b & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; the uses are outside
// this excerpt -- confirm.
189 static const byte kUtf8MultiByteMask = 0xC0; | 230 static const byte kUtf8MultiByteMask = 0xC0; |
190 static const byte kUtf8MultiByteCharFollower = 0x80; | 231 static const byte kUtf8MultiByteCharFollower = 0x80; |
191 | 232 |
192 | 233 |
193 #ifdef DEBUG | 234 #ifdef DEBUG |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
269 Utf8CharacterForward(raw_data_, &raw_data_pos_); | 310 Utf8CharacterForward(raw_data_, &raw_data_pos_); |
270 raw_character_position_++; | 311 raw_character_position_++; |
271 DCHECK(raw_data_pos_ - old_pos <= 4); | 312 DCHECK(raw_data_pos_ - old_pos <= 4); |
272 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | 313 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; |
273 } | 314 } |
274 // No surrogate pair splitting. | 315 // No surrogate pair splitting. |
275 DCHECK(raw_character_position_ == target_position); | 316 DCHECK(raw_character_position_ == target_position); |
276 } | 317 } |
277 | 318 |
278 | 319 |
| 320 unsigned ExternalStreamingStream::FillBuffer(unsigned position) { |
| 321 // Ignore "position" which is the position in the decoded data. Instead, |
| 322 // ExternalStreamingStream keeps track of the position in the raw data. |
| 323 unsigned data_in_buffer = 0; |
| 324 // Note that the UTF-8 decoder might not be able to fill the buffer |
| 325 // completely; it will typically leave the last character empty (see |
| 326 // Utf8ToUtf16CharacterStream::CopyChars). |
| 327 while (data_in_buffer < kBufferSize - 1) { |
| 328 if (current_data_ == NULL) { |
| 329 // GetSomeData will wait until the embedder has enough data. |
| 330 current_data_length_ = source_stream_->GetMoreData(¤t_data_); |
| 331 current_data_offset_ = 0; |
| 332 |
| 333 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 |
| 334 // character (the rest of the bytes will be in the next chunk). |
| 335 if (source_stream_->encoding == |
| 336 ScriptCompiler::ExternalSourceStream::UTF8) { |
| 337 bool data_ends = current_data_length_ == 0; |
| 338 HandleUtf8SplitCharacters(&data_in_buffer); |
| 339 // Did we use all the data in the data chunk? Note that this would mean |
| 340 // the chunk was really small. We don't handle the case where a UTF-8 |
| 341 // character is split over several chunks; in that case V8 won't crash, |
| 342 // but it will be a parse error. |
| 343 if (!data_ends && current_data_offset_ == current_data_length_) { |
| 344 delete[] current_data_; |
| 345 current_data_ = NULL; |
| 346 current_data_length_ = 0; |
| 347 current_data_offset_ = 0; |
| 348 continue; |
| 349 } |
| 350 } |
| 351 |
| 352 // Did the data stream end? |
| 353 if (current_data_length_ == 0 && utf8_split_char_buffer_length_ == 0) { |
| 354 return data_in_buffer; |
| 355 } |
| 356 } |
| 357 |
| 358 // Fill the buffer from current_data_. |
| 359 unsigned new_offset = 0; |
| 360 unsigned new_chars_in_buffer = CopyCharsHelper( |
| 361 buffer_ + data_in_buffer, kBufferSize - data_in_buffer, |
| 362 current_data_ + current_data_offset_, &new_offset, |
| 363 current_data_length_ - current_data_offset_, source_stream_->encoding); |
| 364 data_in_buffer += new_chars_in_buffer; |
| 365 current_data_offset_ += new_offset; |
| 366 DCHECK(data_in_buffer <= kBufferSize); |
| 367 |
| 368 // Did we use all the data in the data chunk? |
| 369 if (current_data_offset_ == current_data_length_) { |
| 370 delete[] current_data_; |
| 371 current_data_ = NULL; |
| 372 current_data_length_ = 0; |
| 373 current_data_offset_ = 0; |
| 374 } |
| 375 } |
| 376 return data_in_buffer; |
| 377 } |
| 378 |
| 379 void ExternalStreamingStream::HandleUtf8SplitCharacters( |
| 380 unsigned* data_in_buffer) { |
| 381 // First check if we have leftover data from the last chunk. |
| 382 unibrow::uchar c; |
| 383 if (utf8_split_char_buffer_length_ > 0) { |
| 384 // Move the bytes which are part of the split character (which started in |
| 385 // the previous chunk) into utf8_split_char_buffer_. |
| 386 while (current_data_offset_ < current_data_length_ && |
| 387 utf8_split_char_buffer_length_ < 4 && |
| 388 (c = current_data_[current_data_offset_]) > |
| 389 unibrow::Utf8::kMaxOneByteChar) { |
| 390 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; |
| 391 ++utf8_split_char_buffer_length_; |
| 392 ++current_data_offset_; |
| 393 } |
| 394 |
| 395 // Convert the data in utf8_split_char_buffer_. |
| 396 unsigned new_offset = 0; |
| 397 unsigned new_chars_in_buffer = CopyCharsHelper( |
| 398 buffer_ + *data_in_buffer, kBufferSize - *data_in_buffer, |
| 399 utf8_split_char_buffer_, &new_offset, utf8_split_char_buffer_length_, |
| 400 source_stream_->encoding); |
| 401 *data_in_buffer += new_chars_in_buffer; |
| 402 // Make sure we used all the data. |
| 403 DCHECK(new_offset == utf8_split_char_buffer_length_); |
| 404 DCHECK(*data_in_buffer <= kBufferSize); |
| 405 |
| 406 utf8_split_char_buffer_length_ = 0; |
| 407 } |
| 408 |
| 409 // Move bytes which are part of an incomplete character from the end of the |
| 410 // current chunk to utf8_split_char_buffer_. They will be converted when the |
| 411 // next data chunk arrives. |
| 412 while (current_data_length_ > current_data_offset_ && |
| 413 (c = current_data_[current_data_length_ - 1]) > |
| 414 unibrow::Utf8::kMaxOneByteChar) { |
| 415 --current_data_length_; |
| 416 ++utf8_split_char_buffer_length_; |
| 417 } |
| 418 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { |
| 419 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; |
| 420 } |
| 421 } |
| 422 |
| 423 |
279 // ---------------------------------------------------------------------------- | 424 // ---------------------------------------------------------------------------- |
280 // ExternalTwoByteStringUtf16CharacterStream | 425 // ExternalTwoByteStringUtf16CharacterStream |
281 | 426 |
282 ExternalTwoByteStringUtf16CharacterStream:: | 427 ExternalTwoByteStringUtf16CharacterStream:: |
283 ~ExternalTwoByteStringUtf16CharacterStream() { } | 428 ~ExternalTwoByteStringUtf16CharacterStream() { } |
284 | 429 |
285 | 430 |
286 ExternalTwoByteStringUtf16CharacterStream | 431 ExternalTwoByteStringUtf16CharacterStream |
287 ::ExternalTwoByteStringUtf16CharacterStream( | 432 ::ExternalTwoByteStringUtf16CharacterStream( |
288 Handle<ExternalTwoByteString> data, | 433 Handle<ExternalTwoByteString> data, |
289 int start_position, | 434 int start_position, |
290 int end_position) | 435 int end_position) |
291 : Utf16CharacterStream(), | 436 : Utf16CharacterStream(), |
292 source_(data), | 437 source_(data), |
293 raw_data_(data->GetTwoByteData(start_position)) { | 438 raw_data_(data->GetTwoByteData(start_position)) { |
294 buffer_cursor_ = raw_data_, | 439 buffer_cursor_ = raw_data_, |
295 buffer_end_ = raw_data_ + (end_position - start_position); | 440 buffer_end_ = raw_data_ + (end_position - start_position); |
296 pos_ = start_position; | 441 pos_ = start_position; |
297 } | 442 } |
298 | 443 |
299 } } // namespace v8::internal | 444 } } // namespace v8::internal |
OLD | NEW |