OLD | NEW |
| (Empty) |
1 // Copyright 2011 the V8 project authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "src/scanner-character-streams.h" | |
6 | |
7 #include "include/v8.h" | |
8 #include "src/globals.h" | |
9 #include "src/handles.h" | |
10 #include "src/list-inl.h" // TODO(mstarzinger): Temporary cycle breaker! | |
11 #include "src/objects.h" | |
12 #include "src/unicode-inl.h" | |
13 | |
14 namespace v8 { | |
15 namespace internal { | |
16 | |
17 namespace { | |
18 | |
19 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, | |
20 size_t* src_pos, size_t src_length, | |
21 ScriptCompiler::StreamedSource::Encoding encoding) { | |
22 // It's possible that this will be called with length 0, but don't assume that | |
23 // the functions this calls handle it gracefully. | |
24 if (length == 0) return 0; | |
25 | |
26 if (encoding == ScriptCompiler::StreamedSource::UTF8) { | |
27 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( | |
28 dest, length, src, src_pos, src_length); | |
29 } | |
30 | |
31 size_t to_fill = length; | |
32 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; | |
33 | |
34 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { | |
35 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); | |
36 } else { | |
37 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); | |
38 v8::internal::CopyChars<uint16_t, uint16_t>( | |
39 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); | |
40 } | |
41 *src_pos += to_fill; | |
42 return to_fill; | |
43 } | |
44 | |
45 } // namespace | |
46 | |
47 | |
48 // ---------------------------------------------------------------------------- | |
49 // BufferedUtf16CharacterStreams | |
50 | |
51 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() | |
52 : Utf16CharacterStream(), | |
53 pushback_limit_(NULL) { | |
54 // Initialize buffer as being empty. First read will fill the buffer. | |
55 buffer_cursor_ = buffer_; | |
56 buffer_end_ = buffer_; | |
57 } | |
58 | |
59 | |
60 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { } | |
61 | |
62 void BufferedUtf16CharacterStream::PushBack(uc32 character) { | |
63 if (character == kEndOfInput) { | |
64 pos_--; | |
65 return; | |
66 } | |
67 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { | |
68 // buffer_ is writable, buffer_cursor_ is const pointer. | |
69 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); | |
70 pos_--; | |
71 return; | |
72 } | |
73 SlowPushBack(static_cast<uc16>(character)); | |
74 } | |
75 | |
76 | |
77 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) { | |
78 // In pushback mode, the end of the buffer contains pushback, | |
79 // and the start of the buffer (from buffer start to pushback_limit_) | |
80 // contains valid data that comes just after the pushback. | |
81 // We NULL the pushback_limit_ if pushing all the way back to the | |
82 // start of the buffer. | |
83 | |
84 if (pushback_limit_ == NULL) { | |
85 // Enter pushback mode. | |
86 pushback_limit_ = buffer_end_; | |
87 buffer_end_ = buffer_ + kBufferSize; | |
88 buffer_cursor_ = buffer_end_; | |
89 } | |
90 // Ensure that there is room for at least one pushback. | |
91 DCHECK(buffer_cursor_ > buffer_); | |
92 DCHECK(pos_ > 0); | |
93 buffer_[--buffer_cursor_ - buffer_] = character; | |
94 if (buffer_cursor_ == buffer_) { | |
95 pushback_limit_ = NULL; | |
96 } else if (buffer_cursor_ < pushback_limit_) { | |
97 pushback_limit_ = buffer_cursor_; | |
98 } | |
99 pos_--; | |
100 } | |
101 | |
102 | |
103 bool BufferedUtf16CharacterStream::ReadBlock() { | |
104 buffer_cursor_ = buffer_; | |
105 if (pushback_limit_ != NULL) { | |
106 // Leave pushback mode. | |
107 buffer_end_ = pushback_limit_; | |
108 pushback_limit_ = NULL; | |
109 // If there were any valid characters left at the | |
110 // start of the buffer, use those. | |
111 if (buffer_cursor_ < buffer_end_) return true; | |
112 // Otherwise read a new block. | |
113 } | |
114 size_t length = FillBuffer(pos_); | |
115 buffer_end_ = buffer_ + length; | |
116 return length > 0; | |
117 } | |
118 | |
119 | |
120 size_t BufferedUtf16CharacterStream::SlowSeekForward(size_t delta) { | |
121 // Leave pushback mode (i.e., ignore that there might be valid data | |
122 // in the buffer before the pushback_limit_ point). | |
123 pushback_limit_ = NULL; | |
124 return BufferSeekForward(delta); | |
125 } | |
126 | |
127 | |
128 // ---------------------------------------------------------------------------- | |
129 // GenericStringUtf16CharacterStream | |
130 | |
131 | |
132 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( | |
133 Handle<String> data, size_t start_position, size_t end_position) | |
134 : string_(data), length_(end_position), bookmark_(kNoBookmark) { | |
135 DCHECK(end_position >= start_position); | |
136 pos_ = start_position; | |
137 } | |
138 | |
139 | |
140 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } | |
141 | |
142 | |
143 bool GenericStringUtf16CharacterStream::SetBookmark() { | |
144 bookmark_ = pos_; | |
145 return true; | |
146 } | |
147 | |
148 | |
149 void GenericStringUtf16CharacterStream::ResetToBookmark() { | |
150 DCHECK(bookmark_ != kNoBookmark); | |
151 pos_ = bookmark_; | |
152 buffer_cursor_ = buffer_; | |
153 buffer_end_ = buffer_ + FillBuffer(pos_); | |
154 } | |
155 | |
156 | |
157 size_t GenericStringUtf16CharacterStream::BufferSeekForward(size_t delta) { | |
158 size_t old_pos = pos_; | |
159 pos_ = Min(pos_ + delta, length_); | |
160 ReadBlock(); | |
161 return pos_ - old_pos; | |
162 } | |
163 | |
164 | |
165 size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) { | |
166 if (from_pos >= length_) return 0; | |
167 size_t length = kBufferSize; | |
168 if (from_pos + length > length_) { | |
169 length = length_ - from_pos; | |
170 } | |
171 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), | |
172 static_cast<int>(from_pos + length)); | |
173 return length; | |
174 } | |
175 | |
176 | |
177 // ---------------------------------------------------------------------------- | |
178 // Utf8ToUtf16CharacterStream | |
179 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, | |
180 size_t length) | |
181 : BufferedUtf16CharacterStream(), | |
182 raw_data_(data), | |
183 raw_data_length_(length), | |
184 raw_data_pos_(0), | |
185 raw_character_position_(0) { | |
186 ReadBlock(); | |
187 } | |
188 | |
189 | |
190 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } | |
191 | |
192 | |
193 size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length, | |
194 const byte* src, size_t* src_pos, | |
195 size_t src_length) { | |
196 static const unibrow::uchar kMaxUtf16Character = 0xffff; | |
197 size_t i = 0; | |
198 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer | |
199 // one character early (in the normal case), because we need to have at least | |
200 // two free spaces in the buffer to be sure that the next character will fit. | |
201 while (i < length - 1) { | |
202 if (*src_pos == src_length) break; | |
203 unibrow::uchar c = src[*src_pos]; | |
204 if (c <= unibrow::Utf8::kMaxOneByteChar) { | |
205 *src_pos = *src_pos + 1; | |
206 } else { | |
207 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, | |
208 src_pos); | |
209 } | |
210 if (c > kMaxUtf16Character) { | |
211 dest[i++] = unibrow::Utf16::LeadSurrogate(c); | |
212 dest[i++] = unibrow::Utf16::TrailSurrogate(c); | |
213 } else { | |
214 dest[i++] = static_cast<uc16>(c); | |
215 } | |
216 } | |
217 return i; | |
218 } | |
219 | |
220 | |
221 size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) { | |
222 size_t old_pos = pos_; | |
223 size_t target_pos = pos_ + delta; | |
224 SetRawPosition(target_pos); | |
225 pos_ = raw_character_position_; | |
226 ReadBlock(); | |
227 return pos_ - old_pos; | |
228 } | |
229 | |
230 | |
231 size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) { | |
232 SetRawPosition(char_position); | |
233 if (raw_character_position_ != char_position) { | |
234 // char_position was not a valid position in the stream (hit the end | |
235 // while spooling to it). | |
236 return 0u; | |
237 } | |
238 size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, | |
239 raw_data_length_); | |
240 raw_character_position_ = char_position + i; | |
241 return i; | |
242 } | |
243 | |
244 | |
245 static const byte kUtf8MultiByteMask = 0xC0; | |
246 static const byte kUtf8MultiByteCharFollower = 0x80; | |
247 | |
248 | |
249 #ifdef DEBUG | |
250 static const byte kUtf8MultiByteCharStart = 0xC0; | |
251 static bool IsUtf8MultiCharacterStart(byte first_byte) { | |
252 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | |
253 } | |
254 #endif | |
255 | |
256 | |
257 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | |
258 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | |
259 } | |
260 | |
261 | |
262 // Move the cursor back to point at the preceding UTF-8 character start | |
263 // in the buffer. | |
264 static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) { | |
265 byte character = buffer[--*cursor]; | |
266 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
267 DCHECK(IsUtf8MultiCharacterFollower(character)); | |
268 // Last byte of a multi-byte character encoding. Step backwards until | |
269 // pointing to the first byte of the encoding, recognized by having the | |
270 // top two bits set. | |
271 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } | |
272 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); | |
273 } | |
274 } | |
275 | |
276 | |
277 // Move the cursor forward to point at the next following UTF-8 character start | |
278 // in the buffer. | |
279 static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) { | |
280 byte character = buffer[(*cursor)++]; | |
281 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
282 // First character of a multi-byte character encoding. | |
283 // The number of most-significant one-bits determines the length of the | |
284 // encoding: | |
285 // 110..... - (0xCx, 0xDx) one additional byte (minimum). | |
286 // 1110.... - (0xEx) two additional bytes. | |
287 // 11110... - (0xFx) three additional bytes (maximum). | |
288 DCHECK(IsUtf8MultiCharacterStart(character)); | |
289 // Additional bytes is: | |
290 // 1 if value in range 0xC0 .. 0xDF. | |
291 // 2 if value in range 0xE0 .. 0xEF. | |
292 // 3 if value in range 0xF0 .. 0xF7. | |
293 // Encode that in a single value. | |
294 size_t additional_bytes = | |
295 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | |
296 *cursor += additional_bytes; | |
297 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | |
298 } | |
299 } | |
300 | |
301 | |
302 // This can't set a raw position between two surrogate pairs, since there | |
303 // is no position in the UTF8 stream that corresponds to that. This assumes | |
304 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If | |
305 // it is illegally coded as two 3 byte sequences then there is no problem here. | |
306 void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) { | |
307 if (raw_character_position_ > target_position) { | |
308 // Spool backwards in utf8 buffer. | |
309 do { | |
310 size_t old_pos = raw_data_pos_; | |
311 Utf8CharacterBack(raw_data_, &raw_data_pos_); | |
312 raw_character_position_--; | |
313 DCHECK(old_pos - raw_data_pos_ <= 4); | |
314 // Step back over both code units for surrogate pairs. | |
315 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; | |
316 } while (raw_character_position_ > target_position); | |
317 // No surrogate pair splitting. | |
318 DCHECK(raw_character_position_ == target_position); | |
319 return; | |
320 } | |
321 // Spool forwards in the utf8 buffer. | |
322 while (raw_character_position_ < target_position) { | |
323 if (raw_data_pos_ == raw_data_length_) return; | |
324 size_t old_pos = raw_data_pos_; | |
325 Utf8CharacterForward(raw_data_, &raw_data_pos_); | |
326 raw_character_position_++; | |
327 DCHECK(raw_data_pos_ - old_pos <= 4); | |
328 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | |
329 } | |
330 // No surrogate pair splitting. | |
331 DCHECK(raw_character_position_ == target_position); | |
332 } | |
333 | |
334 | |
335 size_t ExternalStreamingStream::FillBuffer(size_t position) { | |
336 // Ignore "position" which is the position in the decoded data. Instead, | |
337 // ExternalStreamingStream keeps track of the position in the raw data. | |
338 size_t data_in_buffer = 0; | |
339 // Note that the UTF-8 decoder might not be able to fill the buffer | |
340 // completely; it will typically leave the last character empty (see | |
341 // Utf8ToUtf16CharacterStream::CopyChars). | |
342 while (data_in_buffer < kBufferSize - 1) { | |
343 if (current_data_ == NULL) { | |
344 // GetSomeData will wait until the embedder has enough data. Here's an | |
345 // interface between the API which uses size_t (which is the correct type | |
346 // here) and the internal parts which use size_t. | |
347 current_data_length_ = source_stream_->GetMoreData(¤t_data_); | |
348 current_data_offset_ = 0; | |
349 bool data_ends = current_data_length_ == 0; | |
350 bookmark_data_is_from_current_data_ = false; | |
351 | |
352 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 | |
353 // character (the rest of the bytes will be in the next chunk). | |
354 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { | |
355 HandleUtf8SplitCharacters(&data_in_buffer); | |
356 if (!data_ends && current_data_offset_ == current_data_length_) { | |
357 // The data stream didn't end, but we used all the data in the | |
358 // chunk. This will only happen when the chunk was really small. We | |
359 // don't handle the case where a UTF-8 character is split over several | |
360 // chunks; in that case V8 won't crash, but it will be a parse error. | |
361 FlushCurrent(); | |
362 continue; // Request a new chunk. | |
363 } | |
364 } | |
365 | |
366 // Did the data stream end? | |
367 if (data_ends) { | |
368 DCHECK(utf8_split_char_buffer_length_ == 0); | |
369 return data_in_buffer; | |
370 } | |
371 } | |
372 | |
373 // Fill the buffer from current_data_. | |
374 size_t new_offset = 0; | |
375 size_t new_chars_in_buffer = | |
376 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, | |
377 current_data_ + current_data_offset_, &new_offset, | |
378 current_data_length_ - current_data_offset_, encoding_); | |
379 data_in_buffer += new_chars_in_buffer; | |
380 current_data_offset_ += new_offset; | |
381 DCHECK(data_in_buffer <= kBufferSize); | |
382 | |
383 // Did we use all the data in the data chunk? | |
384 if (current_data_offset_ == current_data_length_) { | |
385 FlushCurrent(); | |
386 } | |
387 } | |
388 return data_in_buffer; | |
389 } | |
390 | |
391 | |
392 bool ExternalStreamingStream::SetBookmark() { | |
393 // Bookmarking for this stream is a bit more complex than expected, since | |
394 // the stream state is distributed over several places: | |
395 // - pos_ (inherited from Utf16CharacterStream) | |
396 // - buffer_cursor_ and buffer_end_ (also from Utf16CharacterStream) | |
397 // - buffer_ (from BufferedUtf16CharacterStream) | |
398 // - current_data_ (+ .._offset_ and .._length) (this class) | |
399 // - utf8_split_char_buffer_* (a partial utf8 symbol at the block boundary) | |
400 // | |
401 // The underlying source_stream_ instance likely could re-construct this | |
402 // local data for us, but with the given interfaces we have no way of | |
403 // accomplishing this. Thus, we'll have to save all data locally. | |
404 // | |
405 // What gets saved where: | |
406 // - pos_ => bookmark_ | |
407 // - buffer_[buffer_cursor_ .. buffer_end_] => bookmark_buffer_ | |
408 // - current_data_[.._offset_ .. .._length_] => bookmark_data_ | |
409 // - utf8_split_char_buffer_* => bookmark_utf8_split... | |
410 // | |
411 // To make sure we don't unnecessarily copy data, we also maintain | |
412 // whether bookmark_data_ contains a copy of the current current_data_ | |
413 // block. This is done with: | |
414 // - bookmark_data_is_from_current_data_ | |
415 // - bookmark_data_offset_: offset into bookmark_data_ | |
416 // | |
417 // Note that bookmark_data_is_from_current_data_ must be maintained | |
418 // whenever current_data_ is updated. | |
419 | |
420 bookmark_ = pos_; | |
421 | |
422 size_t buffer_length = buffer_end_ - buffer_cursor_; | |
423 bookmark_buffer_.Dispose(); | |
424 bookmark_buffer_ = Vector<uint16_t>::New(static_cast<int>(buffer_length)); | |
425 CopyCharsUnsigned(bookmark_buffer_.start(), buffer_cursor_, buffer_length); | |
426 | |
427 size_t data_length = current_data_length_ - current_data_offset_; | |
428 size_t bookmark_data_length = static_cast<size_t>(bookmark_data_.length()); | |
429 if (bookmark_data_is_from_current_data_ && | |
430 data_length < bookmark_data_length) { | |
431 // Fast case: bookmark_data_ was previously copied from the current | |
432 // data block, and we have enough data for this bookmark. | |
433 bookmark_data_offset_ = bookmark_data_length - data_length; | |
434 } else { | |
435 // Slow case: We need to copy current_data_. | |
436 bookmark_data_.Dispose(); | |
437 bookmark_data_ = Vector<uint8_t>::New(static_cast<int>(data_length)); | |
438 CopyBytes(bookmark_data_.start(), current_data_ + current_data_offset_, | |
439 data_length); | |
440 bookmark_data_is_from_current_data_ = true; | |
441 bookmark_data_offset_ = 0; | |
442 } | |
443 | |
444 bookmark_utf8_split_char_buffer_length_ = utf8_split_char_buffer_length_; | |
445 for (size_t i = 0; i < utf8_split_char_buffer_length_; i++) { | |
446 bookmark_utf8_split_char_buffer_[i] = utf8_split_char_buffer_[i]; | |
447 } | |
448 | |
449 return source_stream_->SetBookmark(); | |
450 } | |
451 | |
452 | |
453 void ExternalStreamingStream::ResetToBookmark() { | |
454 source_stream_->ResetToBookmark(); | |
455 FlushCurrent(); | |
456 | |
457 pos_ = bookmark_; | |
458 | |
459 // bookmark_data_* => current_data_* | |
460 // (current_data_ assumes ownership of its memory.) | |
461 current_data_offset_ = 0; | |
462 current_data_length_ = bookmark_data_.length() - bookmark_data_offset_; | |
463 uint8_t* data = new uint8_t[current_data_length_]; | |
464 CopyCharsUnsigned(data, bookmark_data_.begin() + bookmark_data_offset_, | |
465 current_data_length_); | |
466 delete[] current_data_; | |
467 current_data_ = data; | |
468 bookmark_data_is_from_current_data_ = true; | |
469 | |
470 // bookmark_buffer_ needs to be copied to buffer_. | |
471 CopyCharsUnsigned(buffer_, bookmark_buffer_.begin(), | |
472 bookmark_buffer_.length()); | |
473 buffer_cursor_ = buffer_; | |
474 buffer_end_ = buffer_ + bookmark_buffer_.length(); | |
475 | |
476 // utf8 split char buffer | |
477 utf8_split_char_buffer_length_ = bookmark_utf8_split_char_buffer_length_; | |
478 for (size_t i = 0; i < bookmark_utf8_split_char_buffer_length_; i++) { | |
479 utf8_split_char_buffer_[i] = bookmark_utf8_split_char_buffer_[i]; | |
480 } | |
481 } | |
482 | |
483 | |
484 void ExternalStreamingStream::FlushCurrent() { | |
485 delete[] current_data_; | |
486 current_data_ = NULL; | |
487 current_data_length_ = 0; | |
488 current_data_offset_ = 0; | |
489 bookmark_data_is_from_current_data_ = false; | |
490 } | |
491 | |
492 | |
493 void ExternalStreamingStream::HandleUtf8SplitCharacters( | |
494 size_t* data_in_buffer) { | |
495 // Note the following property of UTF-8 which makes this function possible: | |
496 // Given any byte, we can always read its local environment (in both | |
497 // directions) to find out the (possibly multi-byte) character it belongs | |
498 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a | |
499 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or | |
500 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. | |
501 | |
502 // First check if we have leftover data from the last chunk. | |
503 unibrow::uchar c; | |
504 if (utf8_split_char_buffer_length_ > 0) { | |
505 // Move the bytes which are part of the split character (which started in | |
506 // the previous chunk) into utf8_split_char_buffer_. Note that the | |
507 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. | |
508 while (current_data_offset_ < current_data_length_ && | |
509 utf8_split_char_buffer_length_ < 4 && | |
510 (c = current_data_[current_data_offset_]) >> 6 == 2) { | |
511 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; | |
512 ++utf8_split_char_buffer_length_; | |
513 ++current_data_offset_; | |
514 } | |
515 | |
516 // Convert the data in utf8_split_char_buffer_. | |
517 size_t new_offset = 0; | |
518 size_t new_chars_in_buffer = | |
519 CopyCharsHelper(buffer_ + *data_in_buffer, | |
520 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, | |
521 &new_offset, utf8_split_char_buffer_length_, encoding_); | |
522 *data_in_buffer += new_chars_in_buffer; | |
523 // Make sure we used all the data. | |
524 DCHECK(new_offset == utf8_split_char_buffer_length_); | |
525 DCHECK(*data_in_buffer <= kBufferSize); | |
526 | |
527 utf8_split_char_buffer_length_ = 0; | |
528 } | |
529 | |
530 // Move bytes which are part of an incomplete character from the end of the | |
531 // current chunk to utf8_split_char_buffer_. They will be converted when the | |
532 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4 | |
533 // bytes long, but if the data is invalid, we can have character values bigger | |
534 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes. | |
535 while (current_data_length_ > current_data_offset_ && | |
536 (c = current_data_[current_data_length_ - 1]) > | |
537 unibrow::Utf8::kMaxOneByteChar && | |
538 utf8_split_char_buffer_length_ < 4) { | |
539 --current_data_length_; | |
540 ++utf8_split_char_buffer_length_; | |
541 if (c >= (3 << 6)) { | |
542 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte | |
543 // character. No need to copy the previous characters into the conversion | |
544 // buffer (even if they're multi-byte). | |
545 break; | |
546 } | |
547 } | |
548 CHECK(utf8_split_char_buffer_length_ <= 4); | |
549 for (size_t i = 0; i < utf8_split_char_buffer_length_; ++i) { | |
550 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; | |
551 } | |
552 } | |
553 | |
554 | |
555 // ---------------------------------------------------------------------------- | |
556 // ExternalTwoByteStringUtf16CharacterStream | |
557 | |
558 ExternalTwoByteStringUtf16CharacterStream:: | |
559 ~ExternalTwoByteStringUtf16CharacterStream() { } | |
560 | |
561 | |
562 ExternalTwoByteStringUtf16CharacterStream:: | |
563 ExternalTwoByteStringUtf16CharacterStream( | |
564 Handle<ExternalTwoByteString> data, int start_position, | |
565 int end_position) | |
566 : Utf16CharacterStream(), | |
567 source_(data), | |
568 raw_data_(data->GetTwoByteData(start_position)), | |
569 bookmark_(kNoBookmark) { | |
570 buffer_cursor_ = raw_data_, | |
571 buffer_end_ = raw_data_ + (end_position - start_position); | |
572 pos_ = start_position; | |
573 } | |
574 | |
575 | |
576 bool ExternalTwoByteStringUtf16CharacterStream::SetBookmark() { | |
577 bookmark_ = pos_; | |
578 return true; | |
579 } | |
580 | |
581 | |
582 void ExternalTwoByteStringUtf16CharacterStream::ResetToBookmark() { | |
583 DCHECK(bookmark_ != kNoBookmark); | |
584 pos_ = bookmark_; | |
585 buffer_cursor_ = raw_data_ + bookmark_; | |
586 } | |
587 } // namespace internal | |
588 } // namespace v8 | |
OLD | NEW |