OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/parsing/scanner-character-streams.h" | 5 #include "src/parsing/scanner-character-streams.h" |
6 | 6 |
7 #include "include/v8.h" | 7 #include "include/v8.h" |
8 #include "src/globals.h" | 8 #include "src/globals.h" |
9 #include "src/handles.h" | 9 #include "src/handles.h" |
10 #include "src/list-inl.h" // TODO(mstarzinger): Temporary cycle breaker! | 10 #include "src/list-inl.h" // TODO(mstarzinger): Temporary cycle breaker! |
11 #include "src/objects-inl.h" | 11 #include "src/objects-inl.h" |
12 #include "src/unicode-inl.h" | 12 #include "src/unicode-inl.h" |
13 | 13 |
14 namespace v8 { | 14 namespace v8 { |
15 namespace internal { | 15 namespace internal { |
16 | 16 |
17 namespace { | 17 namespace { |
18 | 18 |
| 19 size_t CopyUtf8CharsToUtf16Chars(uint16_t* dest, size_t length, const byte* src, |
| 20 size_t* src_pos, size_t src_length) { |
| 21 static const unibrow::uchar kMaxUtf16Character = |
| 22 unibrow::Utf16::kMaxNonSurrogateCharCode; |
| 23 size_t i = 0; |
| 24 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer |
| 25 // one character early (in the normal case), because we need to have at least |
| 26 // two free spaces in the buffer to be sure that the next character will fit. |
| 27 while (i < length - 1) { |
| 28 if (*src_pos == src_length) break; |
| 29 unibrow::uchar c = src[*src_pos]; |
| 30 if (c <= unibrow::Utf8::kMaxOneByteChar) { |
| 31 *src_pos = *src_pos + 1; |
| 32 } else { |
| 33 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, |
| 34 src_pos); |
| 35 } |
| 36 if (c > kMaxUtf16Character) { |
| 37 dest[i++] = unibrow::Utf16::LeadSurrogate(c); |
| 38 dest[i++] = unibrow::Utf16::TrailSurrogate(c); |
| 39 } else { |
| 40 dest[i++] = static_cast<uc16>(c); |
| 41 } |
| 42 } |
| 43 return i; |
| 44 } |
| 45 |
19 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, | 46 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, |
20 size_t* src_pos, size_t src_length, | 47 size_t* src_pos, size_t src_length, |
21 ScriptCompiler::StreamedSource::Encoding encoding) { | 48 ScriptCompiler::StreamedSource::Encoding encoding) { |
22 // It's possible that this will be called with length 0, but don't assume that | 49 // It's possible that this will be called with length 0, but don't assume that |
23 // the functions this calls handle it gracefully. | 50 // the functions this calls handle it gracefully. |
24 if (length == 0) return 0; | 51 if (length == 0) return 0; |
25 | 52 |
26 if (encoding == ScriptCompiler::StreamedSource::UTF8) { | 53 if (encoding == ScriptCompiler::StreamedSource::UTF8) { |
27 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( | 54 return CopyUtf8CharsToUtf16Chars(dest, length, src, src_pos, src_length); |
28 dest, length, src, src_pos, src_length); | |
29 } | 55 } |
30 | 56 |
31 size_t to_fill = length; | 57 size_t to_fill = length; |
32 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; | 58 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; |
33 | 59 |
34 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { | 60 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { |
35 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); | 61 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); |
36 } else { | 62 } else { |
37 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); | 63 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); |
38 v8::internal::CopyChars<uint16_t, uint16_t>( | 64 v8::internal::CopyChars<uint16_t, uint16_t>( |
(...skipping 129 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
168 if (from_pos + length > length_) { | 194 if (from_pos + length > length_) { |
169 length = length_ - from_pos; | 195 length = length_ - from_pos; |
170 } | 196 } |
171 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), | 197 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), |
172 static_cast<int>(from_pos + length)); | 198 static_cast<int>(from_pos + length)); |
173 return length; | 199 return length; |
174 } | 200 } |
175 | 201 |
176 | 202 |
177 // ---------------------------------------------------------------------------- | 203 // ---------------------------------------------------------------------------- |
178 // Utf8ToUtf16CharacterStream | 204 // ExternalStreamingStream |
179 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, | |
180 size_t length) | |
181 : BufferedUtf16CharacterStream(), | |
182 raw_data_(data), | |
183 raw_data_length_(length), | |
184 raw_data_pos_(0), | |
185 raw_character_position_(0) { | |
186 ReadBlock(); | |
187 } | |
188 | |
189 | |
190 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } | |
191 | |
192 | |
193 size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length, | |
194 const byte* src, size_t* src_pos, | |
195 size_t src_length) { | |
196 static const unibrow::uchar kMaxUtf16Character = | |
197 unibrow::Utf16::kMaxNonSurrogateCharCode; | |
198 size_t i = 0; | |
199 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer | |
200 // one character early (in the normal case), because we need to have at least | |
201 // two free spaces in the buffer to be sure that the next character will fit. | |
202 while (i < length - 1) { | |
203 if (*src_pos == src_length) break; | |
204 unibrow::uchar c = src[*src_pos]; | |
205 if (c <= unibrow::Utf8::kMaxOneByteChar) { | |
206 *src_pos = *src_pos + 1; | |
207 } else { | |
208 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, | |
209 src_pos); | |
210 } | |
211 if (c > kMaxUtf16Character) { | |
212 dest[i++] = unibrow::Utf16::LeadSurrogate(c); | |
213 dest[i++] = unibrow::Utf16::TrailSurrogate(c); | |
214 } else { | |
215 dest[i++] = static_cast<uc16>(c); | |
216 } | |
217 } | |
218 return i; | |
219 } | |
220 | |
221 | |
222 size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) { | |
223 size_t old_pos = pos_; | |
224 size_t target_pos = pos_ + delta; | |
225 SetRawPosition(target_pos); | |
226 pos_ = raw_character_position_; | |
227 ReadBlock(); | |
228 return pos_ - old_pos; | |
229 } | |
230 | |
231 | |
232 size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) { | |
233 SetRawPosition(char_position); | |
234 if (raw_character_position_ != char_position) { | |
235 // char_position was not a valid position in the stream (hit the end | |
236 // while spooling to it). | |
237 return 0u; | |
238 } | |
239 size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, | |
240 raw_data_length_); | |
241 raw_character_position_ = char_position + i; | |
242 return i; | |
243 } | |
244 | |
245 | |
246 static const byte kUtf8MultiByteMask = 0xC0; | |
247 static const byte kUtf8MultiByteCharFollower = 0x80; | |
248 | |
249 | |
250 #ifdef DEBUG | |
251 static const byte kUtf8MultiByteCharStart = 0xC0; | |
252 static bool IsUtf8MultiCharacterStart(byte first_byte) { | |
253 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | |
254 } | |
255 #endif | |
256 | |
257 | |
258 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | |
259 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | |
260 } | |
261 | |
262 | |
263 // Move the cursor back to point at the preceding UTF-8 character start | |
264 // in the buffer. | |
265 static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) { | |
266 byte character = buffer[--*cursor]; | |
267 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
268 DCHECK(IsUtf8MultiCharacterFollower(character)); | |
269 // Last byte of a multi-byte character encoding. Step backwards until | |
270 // pointing to the first byte of the encoding, recognized by having the | |
271 // top two bits set. | |
272 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } | |
273 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); | |
274 } | |
275 } | |
276 | |
277 | |
278 // Move the cursor forward to point at the next following UTF-8 character start | |
279 // in the buffer. | |
280 static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) { | |
281 byte character = buffer[(*cursor)++]; | |
282 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
283 // First character of a multi-byte character encoding. | |
284 // The number of most-significant one-bits determines the length of the | |
285 // encoding: | |
286 // 110..... - (0xCx, 0xDx) one additional byte (minimum). | |
287 // 1110.... - (0xEx) two additional bytes. | |
288 // 11110... - (0xFx) three additional bytes (maximum). | |
289 DCHECK(IsUtf8MultiCharacterStart(character)); | |
290 // Additional bytes is: | |
291 // 1 if value in range 0xC0 .. 0xDF. | |
292 // 2 if value in range 0xE0 .. 0xEF. | |
293 // 3 if value in range 0xF0 .. 0xF7. | |
294 // Encode that in a single value. | |
295 size_t additional_bytes = | |
296 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | |
297 *cursor += additional_bytes; | |
298 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | |
299 } | |
300 } | |
301 | |
302 | |
303 // This can't set a raw position between two surrogate pairs, since there | |
304 // is no position in the UTF8 stream that corresponds to that. This assumes | |
305 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If | |
306 // it is illegally coded as two 3 byte sequences then there is no problem here. | |
307 void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) { | |
308 if (raw_character_position_ > target_position) { | |
309 // Spool backwards in utf8 buffer. | |
310 do { | |
311 size_t old_pos = raw_data_pos_; | |
312 Utf8CharacterBack(raw_data_, &raw_data_pos_); | |
313 raw_character_position_--; | |
314 DCHECK(old_pos - raw_data_pos_ <= 4); | |
315 // Step back over both code units for surrogate pairs. | |
316 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; | |
317 } while (raw_character_position_ > target_position); | |
318 // No surrogate pair splitting. | |
319 DCHECK(raw_character_position_ == target_position); | |
320 return; | |
321 } | |
322 // Spool forwards in the utf8 buffer. | |
323 while (raw_character_position_ < target_position) { | |
324 if (raw_data_pos_ == raw_data_length_) return; | |
325 size_t old_pos = raw_data_pos_; | |
326 Utf8CharacterForward(raw_data_, &raw_data_pos_); | |
327 raw_character_position_++; | |
328 DCHECK(raw_data_pos_ - old_pos <= 4); | |
329 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | |
330 } | |
331 // No surrogate pair splitting. | |
332 DCHECK(raw_character_position_ == target_position); | |
333 } | |
334 | |
335 | 205 |
336 size_t ExternalStreamingStream::FillBuffer(size_t position) { | 206 size_t ExternalStreamingStream::FillBuffer(size_t position) { |
337 // Ignore "position" which is the position in the decoded data. Instead, | 207 // Ignore "position" which is the position in the decoded data. Instead, |
338 // ExternalStreamingStream keeps track of the position in the raw data. | 208 // ExternalStreamingStream keeps track of the position in the raw data. |
339 size_t data_in_buffer = 0; | 209 size_t data_in_buffer = 0; |
340 // Note that the UTF-8 decoder might not be able to fill the buffer | 210 // Note that the UTF-8 decoder might not be able to fill the buffer |
341 // completely; it will typically leave the last character empty (see | 211 // completely; it will typically leave the last character empty (see |
342 // Utf8ToUtf16CharacterStream::CopyChars). | 212 // CopyUtf8CharsToUtf16Chars). |
343 while (data_in_buffer < kBufferSize - 1) { | 213 while (data_in_buffer < kBufferSize - 1) { |
344 if (current_data_ == NULL) { | 214 if (current_data_ == NULL) { |
(...skipping 247 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
592 ExternalOneByteStringUtf16CharacterStream( | 462 ExternalOneByteStringUtf16CharacterStream( |
593 Handle<ExternalOneByteString> data, int start_position, | 463 Handle<ExternalOneByteString> data, int start_position, |
594 int end_position) | 464 int end_position) |
595 : raw_data_(data->GetChars()), | 465 : raw_data_(data->GetChars()), |
596 length_(end_position), | 466 length_(end_position), |
597 bookmark_(kNoBookmark) { | 467 bookmark_(kNoBookmark) { |
598 DCHECK(end_position >= start_position); | 468 DCHECK(end_position >= start_position); |
599 pos_ = start_position; | 469 pos_ = start_position; |
600 } | 470 } |
601 | 471 |
| 472 ExternalOneByteStringUtf16CharacterStream:: |
| 473 ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length) |
| 474 : raw_data_(reinterpret_cast<const uint8_t*>(data)), |
| 475 length_(length), |
| 476 bookmark_(kNoBookmark) {} |
| 477 |
| 478 ExternalOneByteStringUtf16CharacterStream:: |
| 479 ExternalOneByteStringUtf16CharacterStream(const char* data) |
| 480 : ExternalOneByteStringUtf16CharacterStream(data, strlen(data)) {} |
| 481 |
602 bool ExternalOneByteStringUtf16CharacterStream::SetBookmark() { | 482 bool ExternalOneByteStringUtf16CharacterStream::SetBookmark() { |
603 bookmark_ = pos_; | 483 bookmark_ = pos_; |
604 return true; | 484 return true; |
605 } | 485 } |
606 | 486 |
607 void ExternalOneByteStringUtf16CharacterStream::ResetToBookmark() { | 487 void ExternalOneByteStringUtf16CharacterStream::ResetToBookmark() { |
608 DCHECK(bookmark_ != kNoBookmark); | 488 DCHECK(bookmark_ != kNoBookmark); |
609 pos_ = bookmark_; | 489 pos_ = bookmark_; |
610 buffer_cursor_ = buffer_; | 490 buffer_cursor_ = buffer_; |
611 buffer_end_ = buffer_ + FillBuffer(pos_); | 491 buffer_end_ = buffer_ + FillBuffer(pos_); |
(...skipping 11 matching lines...) Expand all Loading... |
623 if (from_pos >= length_) return 0; | 503 if (from_pos >= length_) return 0; |
624 size_t length = Min(kBufferSize, length_ - from_pos); | 504 size_t length = Min(kBufferSize, length_ - from_pos); |
625 for (size_t i = 0; i < length; ++i) { | 505 for (size_t i = 0; i < length; ++i) { |
626 buffer_[i] = static_cast<uc16>(raw_data_[from_pos + i]); | 506 buffer_[i] = static_cast<uc16>(raw_data_[from_pos + i]); |
627 } | 507 } |
628 return length; | 508 return length; |
629 } | 509 } |
630 | 510 |
631 } // namespace internal | 511 } // namespace internal |
632 } // namespace v8 | 512 } // namespace v8 |
OLD | NEW |