OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/v8.h" | 5 #include "src/v8.h" |
6 | 6 |
7 #include "src/scanner-character-streams.h" | 7 #include "src/scanner-character-streams.h" |
8 | 8 |
9 #include "include/v8.h" | 9 #include "include/v8.h" |
10 #include "src/handles.h" | 10 #include "src/handles.h" |
11 #include "src/unicode-inl.h" | 11 #include "src/unicode-inl.h" |
12 | 12 |
13 namespace v8 { | 13 namespace v8 { |
14 namespace internal { | 14 namespace internal { |
15 | 15 |
16 namespace { | 16 namespace { |
17 | 17 |
18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src, | 18 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, |
19 unsigned* src_pos, unsigned src_length, | 19 size_t* src_pos, size_t src_length, |
20 ScriptCompiler::StreamedSource::Encoding encoding) { | 20 ScriptCompiler::StreamedSource::Encoding encoding) { |
21 // It's possible that this will be called with length 0, but don't assume that | 21 // It's possible that this will be called with length 0, but don't assume that |
22 // the functions this calls handle it gracefully. | 22 // the functions this calls handle it gracefully. |
23 if (length == 0) return 0; | 23 if (length == 0) return 0; |
24 | 24 |
25 if (encoding == ScriptCompiler::StreamedSource::UTF8) { | 25 if (encoding == ScriptCompiler::StreamedSource::UTF8) { |
26 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( | 26 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( |
27 dest, length, src, src_pos, src_length); | 27 dest, length, src, src_pos, src_length); |
28 } | 28 } |
29 | 29 |
30 unsigned to_fill = length; | 30 size_t to_fill = length; |
31 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; | 31 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; |
32 | 32 |
33 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { | 33 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { |
34 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); | 34 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); |
35 } else { | 35 } else { |
36 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); | 36 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); |
37 v8::internal::CopyChars<uint16_t, uint16_t>( | 37 v8::internal::CopyChars<uint16_t, uint16_t>( |
38 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); | 38 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); |
39 } | 39 } |
40 *src_pos += to_fill; | 40 *src_pos += to_fill; |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
103 buffer_cursor_ = buffer_; | 103 buffer_cursor_ = buffer_; |
104 if (pushback_limit_ != NULL) { | 104 if (pushback_limit_ != NULL) { |
105 // Leave pushback mode. | 105 // Leave pushback mode. |
106 buffer_end_ = pushback_limit_; | 106 buffer_end_ = pushback_limit_; |
107 pushback_limit_ = NULL; | 107 pushback_limit_ = NULL; |
108 // If there were any valid characters left at the | 108 // If there were any valid characters left at the |
109 // start of the buffer, use those. | 109 // start of the buffer, use those. |
110 if (buffer_cursor_ < buffer_end_) return true; | 110 if (buffer_cursor_ < buffer_end_) return true; |
111 // Otherwise read a new block. | 111 // Otherwise read a new block. |
112 } | 112 } |
113 unsigned length = FillBuffer(pos_); | 113 size_t length = FillBuffer(pos_); |
114 buffer_end_ = buffer_ + length; | 114 buffer_end_ = buffer_ + length; |
115 return length > 0; | 115 return length > 0; |
116 } | 116 } |
117 | 117 |
118 | 118 |
119 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) { | 119 size_t BufferedUtf16CharacterStream::SlowSeekForward(size_t delta) { |
120 // Leave pushback mode (i.e., ignore that there might be valid data | 120 // Leave pushback mode (i.e., ignore that there might be valid data |
121 // in the buffer before the pushback_limit_ point). | 121 // in the buffer before the pushback_limit_ point). |
122 pushback_limit_ = NULL; | 122 pushback_limit_ = NULL; |
123 return BufferSeekForward(delta); | 123 return BufferSeekForward(delta); |
124 } | 124 } |
125 | 125 |
126 | 126 |
127 // ---------------------------------------------------------------------------- | 127 // ---------------------------------------------------------------------------- |
128 // GenericStringUtf16CharacterStream | 128 // GenericStringUtf16CharacterStream |
129 | 129 |
130 | 130 |
131 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( | 131 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( |
132 Handle<String> data, | 132 Handle<String> data, size_t start_position, size_t end_position) |
133 unsigned start_position, | 133 : string_(data), length_(end_position) { |
134 unsigned end_position) | |
135 : string_(data), | |
136 length_(end_position) { | |
137 DCHECK(end_position >= start_position); | 134 DCHECK(end_position >= start_position); |
138 pos_ = start_position; | 135 pos_ = start_position; |
139 } | 136 } |
140 | 137 |
141 | 138 |
142 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } | 139 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } |
143 | 140 |
144 | 141 |
145 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) { | 142 size_t GenericStringUtf16CharacterStream::BufferSeekForward(size_t delta) { |
146 unsigned old_pos = pos_; | 143 size_t old_pos = pos_; |
147 pos_ = Min(pos_ + delta, length_); | 144 pos_ = Min(pos_ + delta, length_); |
148 ReadBlock(); | 145 ReadBlock(); |
149 return pos_ - old_pos; | 146 return pos_ - old_pos; |
150 } | 147 } |
151 | 148 |
152 | 149 |
153 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) { | 150 size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) { |
154 if (from_pos >= length_) return 0; | 151 if (from_pos >= length_) return 0; |
155 unsigned length = kBufferSize; | 152 size_t length = kBufferSize; |
156 if (from_pos + length > length_) { | 153 if (from_pos + length > length_) { |
157 length = length_ - from_pos; | 154 length = length_ - from_pos; |
158 } | 155 } |
159 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length); | 156 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), |
| 157 static_cast<int>(from_pos + length)); |
160 return length; | 158 return length; |
161 } | 159 } |
162 | 160 |
163 | 161 |
164 // ---------------------------------------------------------------------------- | 162 // ---------------------------------------------------------------------------- |
165 // Utf8ToUtf16CharacterStream | 163 // Utf8ToUtf16CharacterStream |
166 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, | 164 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, |
167 unsigned length) | 165 size_t length) |
168 : BufferedUtf16CharacterStream(), | 166 : BufferedUtf16CharacterStream(), |
169 raw_data_(data), | 167 raw_data_(data), |
170 raw_data_length_(length), | 168 raw_data_length_(length), |
171 raw_data_pos_(0), | 169 raw_data_pos_(0), |
172 raw_character_position_(0) { | 170 raw_character_position_(0) { |
173 ReadBlock(); | 171 ReadBlock(); |
174 } | 172 } |
175 | 173 |
176 | 174 |
177 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } | 175 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } |
178 | 176 |
179 | 177 |
180 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length, | 178 size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length, |
181 const byte* src, | 179 const byte* src, size_t* src_pos, |
182 unsigned* src_pos, | 180 size_t src_length) { |
183 unsigned src_length) { | |
184 static const unibrow::uchar kMaxUtf16Character = 0xffff; | 181 static const unibrow::uchar kMaxUtf16Character = 0xffff; |
185 unsigned i = 0; | 182 size_t i = 0; |
186 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer | 183 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer |
187 // one character early (in the normal case), because we need to have at least | 184 // one character early (in the normal case), because we need to have at least |
188 // two free spaces in the buffer to be sure that the next character will fit. | 185 // two free spaces in the buffer to be sure that the next character will fit. |
189 while (i < length - 1) { | 186 while (i < length - 1) { |
190 if (*src_pos == src_length) break; | 187 if (*src_pos == src_length) break; |
191 unibrow::uchar c = src[*src_pos]; | 188 unibrow::uchar c = src[*src_pos]; |
192 if (c <= unibrow::Utf8::kMaxOneByteChar) { | 189 if (c <= unibrow::Utf8::kMaxOneByteChar) { |
193 *src_pos = *src_pos + 1; | 190 *src_pos = *src_pos + 1; |
194 } else { | 191 } else { |
195 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, | 192 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, |
196 src_pos); | 193 src_pos); |
197 } | 194 } |
198 if (c > kMaxUtf16Character) { | 195 if (c > kMaxUtf16Character) { |
199 dest[i++] = unibrow::Utf16::LeadSurrogate(c); | 196 dest[i++] = unibrow::Utf16::LeadSurrogate(c); |
200 dest[i++] = unibrow::Utf16::TrailSurrogate(c); | 197 dest[i++] = unibrow::Utf16::TrailSurrogate(c); |
201 } else { | 198 } else { |
202 dest[i++] = static_cast<uc16>(c); | 199 dest[i++] = static_cast<uc16>(c); |
203 } | 200 } |
204 } | 201 } |
205 return i; | 202 return i; |
206 } | 203 } |
207 | 204 |
208 | 205 |
209 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { | 206 size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) { |
210 unsigned old_pos = pos_; | 207 size_t old_pos = pos_; |
211 unsigned target_pos = pos_ + delta; | 208 size_t target_pos = pos_ + delta; |
212 SetRawPosition(target_pos); | 209 SetRawPosition(target_pos); |
213 pos_ = raw_character_position_; | 210 pos_ = raw_character_position_; |
214 ReadBlock(); | 211 ReadBlock(); |
215 return pos_ - old_pos; | 212 return pos_ - old_pos; |
216 } | 213 } |
217 | 214 |
218 | 215 |
219 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) { | 216 size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) { |
220 SetRawPosition(char_position); | 217 SetRawPosition(char_position); |
221 if (raw_character_position_ != char_position) { | 218 if (raw_character_position_ != char_position) { |
222 // char_position was not a valid position in the stream (hit the end | 219 // char_position was not a valid position in the stream (hit the end |
223 // while spooling to it). | 220 // while spooling to it). |
224 return 0u; | 221 return 0u; |
225 } | 222 } |
226 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, | 223 size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, |
227 raw_data_length_); | 224 raw_data_length_); |
228 raw_character_position_ = char_position + i; | 225 raw_character_position_ = char_position + i; |
229 return i; | 226 return i; |
230 } | 227 } |
231 | 228 |
232 | 229 |
233 static const byte kUtf8MultiByteMask = 0xC0; | 230 static const byte kUtf8MultiByteMask = 0xC0; |
234 static const byte kUtf8MultiByteCharFollower = 0x80; | 231 static const byte kUtf8MultiByteCharFollower = 0x80; |
235 | 232 |
236 | 233 |
237 #ifdef DEBUG | 234 #ifdef DEBUG |
238 static const byte kUtf8MultiByteCharStart = 0xC0; | 235 static const byte kUtf8MultiByteCharStart = 0xC0; |
239 static bool IsUtf8MultiCharacterStart(byte first_byte) { | 236 static bool IsUtf8MultiCharacterStart(byte first_byte) { |
240 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | 237 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; |
241 } | 238 } |
242 #endif | 239 #endif |
243 | 240 |
244 | 241 |
245 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | 242 static bool IsUtf8MultiCharacterFollower(byte later_byte) { |
246 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | 243 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; |
247 } | 244 } |
248 | 245 |
249 | 246 |
250 // Move the cursor back to point at the preceding UTF-8 character start | 247 // Move the cursor back to point at the preceding UTF-8 character start |
251 // in the buffer. | 248 // in the buffer. |
252 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) { | 249 static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) { |
253 byte character = buffer[--*cursor]; | 250 byte character = buffer[--*cursor]; |
254 if (character > unibrow::Utf8::kMaxOneByteChar) { | 251 if (character > unibrow::Utf8::kMaxOneByteChar) { |
255 DCHECK(IsUtf8MultiCharacterFollower(character)); | 252 DCHECK(IsUtf8MultiCharacterFollower(character)); |
256 // Last byte of a multi-byte character encoding. Step backwards until | 253 // Last byte of a multi-byte character encoding. Step backwards until |
257 // pointing to the first byte of the encoding, recognized by having the | 254 // pointing to the first byte of the encoding, recognized by having the |
258 // top two bits set. | 255 // top two bits set. |
259 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } | 256 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } |
260 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); | 257 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); |
261 } | 258 } |
262 } | 259 } |
263 | 260 |
264 | 261 |
265 // Move the cursor forward to point at the next following UTF-8 character start | 262 // Move the cursor forward to point at the next following UTF-8 character start |
266 // in the buffer. | 263 // in the buffer. |
267 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) { | 264 static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) { |
268 byte character = buffer[(*cursor)++]; | 265 byte character = buffer[(*cursor)++]; |
269 if (character > unibrow::Utf8::kMaxOneByteChar) { | 266 if (character > unibrow::Utf8::kMaxOneByteChar) { |
270 // First character of a multi-byte character encoding. | 267 // First character of a multi-byte character encoding. |
271 // The number of most-significant one-bits determines the length of the | 268 // The number of most-significant one-bits determines the length of the |
272 // encoding: | 269 // encoding: |
273 // 110..... - (0xCx, 0xDx) one additional byte (minimum). | 270 // 110..... - (0xCx, 0xDx) one additional byte (minimum). |
274 // 1110.... - (0xEx) two additional bytes. | 271 // 1110.... - (0xEx) two additional bytes. |
275 // 11110... - (0xFx) three additional bytes (maximum). | 272 // 11110... - (0xFx) three additional bytes (maximum). |
276 DCHECK(IsUtf8MultiCharacterStart(character)); | 273 DCHECK(IsUtf8MultiCharacterStart(character)); |
277 // Additional bytes is: | 274 // Additional bytes is: |
278 // 1 if value in range 0xC0 .. 0xDF. | 275 // 1 if value in range 0xC0 .. 0xDF. |
279 // 2 if value in range 0xE0 .. 0xEF. | 276 // 2 if value in range 0xE0 .. 0xEF. |
280 // 3 if value in range 0xF0 .. 0xF7. | 277 // 3 if value in range 0xF0 .. 0xF7. |
281 // Encode that in a single value. | 278 // Encode that in a single value. |
282 unsigned additional_bytes = | 279 size_t additional_bytes = |
283 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | 280 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; |
284 *cursor += additional_bytes; | 281 *cursor += additional_bytes; |
285 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | 282 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); |
286 } | 283 } |
287 } | 284 } |
288 | 285 |
289 | 286 |
290 // This can't set a raw position between two surrogate pairs, since there | 287 // This can't set a raw position between two surrogate pairs, since there |
291 // is no position in the UTF8 stream that corresponds to that. This assumes | 288 // is no position in the UTF8 stream that corresponds to that. This assumes |
292 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If | 289 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If |
293 // it is illegally coded as two 3 byte sequences then there is no problem here. | 290 // it is illegally coded as two 3 byte sequences then there is no problem here. |
294 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) { | 291 void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) { |
295 if (raw_character_position_ > target_position) { | 292 if (raw_character_position_ > target_position) { |
296 // Spool backwards in utf8 buffer. | 293 // Spool backwards in utf8 buffer. |
297 do { | 294 do { |
298 int old_pos = raw_data_pos_; | 295 size_t old_pos = raw_data_pos_; |
299 Utf8CharacterBack(raw_data_, &raw_data_pos_); | 296 Utf8CharacterBack(raw_data_, &raw_data_pos_); |
300 raw_character_position_--; | 297 raw_character_position_--; |
301 DCHECK(old_pos - raw_data_pos_ <= 4); | 298 DCHECK(old_pos - raw_data_pos_ <= 4); |
302 // Step back over both code units for surrogate pairs. | 299 // Step back over both code units for surrogate pairs. |
303 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; | 300 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; |
304 } while (raw_character_position_ > target_position); | 301 } while (raw_character_position_ > target_position); |
305 // No surrogate pair splitting. | 302 // No surrogate pair splitting. |
306 DCHECK(raw_character_position_ == target_position); | 303 DCHECK(raw_character_position_ == target_position); |
307 return; | 304 return; |
308 } | 305 } |
309 // Spool forwards in the utf8 buffer. | 306 // Spool forwards in the utf8 buffer. |
310 while (raw_character_position_ < target_position) { | 307 while (raw_character_position_ < target_position) { |
311 if (raw_data_pos_ == raw_data_length_) return; | 308 if (raw_data_pos_ == raw_data_length_) return; |
312 int old_pos = raw_data_pos_; | 309 size_t old_pos = raw_data_pos_; |
313 Utf8CharacterForward(raw_data_, &raw_data_pos_); | 310 Utf8CharacterForward(raw_data_, &raw_data_pos_); |
314 raw_character_position_++; | 311 raw_character_position_++; |
315 DCHECK(raw_data_pos_ - old_pos <= 4); | 312 DCHECK(raw_data_pos_ - old_pos <= 4); |
316 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | 313 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; |
317 } | 314 } |
318 // No surrogate pair splitting. | 315 // No surrogate pair splitting. |
319 DCHECK(raw_character_position_ == target_position); | 316 DCHECK(raw_character_position_ == target_position); |
320 } | 317 } |
321 | 318 |
322 | 319 |
323 unsigned ExternalStreamingStream::FillBuffer(unsigned position) { | 320 size_t ExternalStreamingStream::FillBuffer(size_t position) { |
324 // Ignore "position" which is the position in the decoded data. Instead, | 321 // Ignore "position" which is the position in the decoded data. Instead, |
325 // ExternalStreamingStream keeps track of the position in the raw data. | 322 // ExternalStreamingStream keeps track of the position in the raw data. |
326 unsigned data_in_buffer = 0; | 323 size_t data_in_buffer = 0; |
327 // Note that the UTF-8 decoder might not be able to fill the buffer | 324 // Note that the UTF-8 decoder might not be able to fill the buffer |
328 // completely; it will typically leave the last character empty (see | 325 // completely; it will typically leave the last character empty (see |
329 // Utf8ToUtf16CharacterStream::CopyChars). | 326 // Utf8ToUtf16CharacterStream::CopyChars). |
330 while (data_in_buffer < kBufferSize - 1) { | 327 while (data_in_buffer < kBufferSize - 1) { |
331 if (current_data_ == NULL) { | 328 if (current_data_ == NULL) { |
332 // GetSomeData will wait until the embedder has enough data. Here's an | 329 // GetSomeData will wait until the embedder has enough data. Here's an |
333 // interface between the API which uses size_t (which is the correct type | 330 // interface between the API which uses size_t (which is the correct type |
334 // here) and the internal parts which use unsigned. TODO(marja): make the | 331 // here) and the internal parts which use size_t. |
335 // internal parts use size_t too. | 332 current_data_length_ = source_stream_->GetMoreData(¤t_data_); |
336 current_data_length_ = | |
337 static_cast<unsigned>(source_stream_->GetMoreData(¤t_data_)); | |
338 current_data_offset_ = 0; | 333 current_data_offset_ = 0; |
339 bool data_ends = current_data_length_ == 0; | 334 bool data_ends = current_data_length_ == 0; |
340 | 335 |
341 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 | 336 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 |
342 // character (the rest of the bytes will be in the next chunk). | 337 // character (the rest of the bytes will be in the next chunk). |
343 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { | 338 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { |
344 HandleUtf8SplitCharacters(&data_in_buffer); | 339 HandleUtf8SplitCharacters(&data_in_buffer); |
345 if (!data_ends && current_data_offset_ == current_data_length_) { | 340 if (!data_ends && current_data_offset_ == current_data_length_) { |
346 // The data stream didn't end, but we used all the data in the | 341 // The data stream didn't end, but we used all the data in the |
347 // chunk. This will only happen when the chunk was really small. We | 342 // chunk. This will only happen when the chunk was really small. We |
348 // don't handle the case where a UTF-8 character is split over several | 343 // don't handle the case where a UTF-8 character is split over several |
349 // chunks; in that case V8 won't crash, but it will be a parse error. | 344 // chunks; in that case V8 won't crash, but it will be a parse error. |
350 delete[] current_data_; | 345 delete[] current_data_; |
351 current_data_ = NULL; | 346 current_data_ = NULL; |
352 current_data_length_ = 0; | 347 current_data_length_ = 0; |
353 current_data_offset_ = 0; | 348 current_data_offset_ = 0; |
354 continue; // Request a new chunk. | 349 continue; // Request a new chunk. |
355 } | 350 } |
356 } | 351 } |
357 | 352 |
358 // Did the data stream end? | 353 // Did the data stream end? |
359 if (data_ends) { | 354 if (data_ends) { |
360 DCHECK(utf8_split_char_buffer_length_ == 0); | 355 DCHECK(utf8_split_char_buffer_length_ == 0); |
361 return data_in_buffer; | 356 return data_in_buffer; |
362 } | 357 } |
363 } | 358 } |
364 | 359 |
365 // Fill the buffer from current_data_. | 360 // Fill the buffer from current_data_. |
366 unsigned new_offset = 0; | 361 size_t new_offset = 0; |
367 unsigned new_chars_in_buffer = | 362 size_t new_chars_in_buffer = |
368 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, | 363 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, |
369 current_data_ + current_data_offset_, &new_offset, | 364 current_data_ + current_data_offset_, &new_offset, |
370 current_data_length_ - current_data_offset_, encoding_); | 365 current_data_length_ - current_data_offset_, encoding_); |
371 data_in_buffer += new_chars_in_buffer; | 366 data_in_buffer += new_chars_in_buffer; |
372 current_data_offset_ += new_offset; | 367 current_data_offset_ += new_offset; |
373 DCHECK(data_in_buffer <= kBufferSize); | 368 DCHECK(data_in_buffer <= kBufferSize); |
374 | 369 |
375 // Did we use all the data in the data chunk? | 370 // Did we use all the data in the data chunk? |
376 if (current_data_offset_ == current_data_length_) { | 371 if (current_data_offset_ == current_data_length_) { |
377 delete[] current_data_; | 372 delete[] current_data_; |
378 current_data_ = NULL; | 373 current_data_ = NULL; |
379 current_data_length_ = 0; | 374 current_data_length_ = 0; |
380 current_data_offset_ = 0; | 375 current_data_offset_ = 0; |
381 } | 376 } |
382 } | 377 } |
383 return data_in_buffer; | 378 return data_in_buffer; |
384 } | 379 } |
385 | 380 |
386 void ExternalStreamingStream::HandleUtf8SplitCharacters( | 381 void ExternalStreamingStream::HandleUtf8SplitCharacters( |
387 unsigned* data_in_buffer) { | 382 size_t* data_in_buffer) { |
388 // Note the following property of UTF-8 which makes this function possible: | 383 // Note the following property of UTF-8 which makes this function possible: |
389 // Given any byte, we can always read its local environment (in both | 384 // Given any byte, we can always read its local environment (in both |
390 // directions) to find out the (possibly multi-byte) character it belongs | 385 // directions) to find out the (possibly multi-byte) character it belongs |
391 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a | 386 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a |
392 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or | 387 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or |
393 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. | 388 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. |
394 | 389 |
395 // First check if we have leftover data from the last chunk. | 390 // First check if we have leftover data from the last chunk. |
396 unibrow::uchar c; | 391 unibrow::uchar c; |
397 if (utf8_split_char_buffer_length_ > 0) { | 392 if (utf8_split_char_buffer_length_ > 0) { |
398 // Move the bytes which are part of the split character (which started in | 393 // Move the bytes which are part of the split character (which started in |
399 // the previous chunk) into utf8_split_char_buffer_. Note that the | 394 // the previous chunk) into utf8_split_char_buffer_. Note that the |
400 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. | 395 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. |
401 while (current_data_offset_ < current_data_length_ && | 396 while (current_data_offset_ < current_data_length_ && |
402 utf8_split_char_buffer_length_ < 4 && | 397 utf8_split_char_buffer_length_ < 4 && |
403 (c = current_data_[current_data_offset_]) >> 6 == 2) { | 398 (c = current_data_[current_data_offset_]) >> 6 == 2) { |
404 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; | 399 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; |
405 ++utf8_split_char_buffer_length_; | 400 ++utf8_split_char_buffer_length_; |
406 ++current_data_offset_; | 401 ++current_data_offset_; |
407 } | 402 } |
408 | 403 |
409 // Convert the data in utf8_split_char_buffer_. | 404 // Convert the data in utf8_split_char_buffer_. |
410 unsigned new_offset = 0; | 405 size_t new_offset = 0; |
411 unsigned new_chars_in_buffer = | 406 size_t new_chars_in_buffer = |
412 CopyCharsHelper(buffer_ + *data_in_buffer, | 407 CopyCharsHelper(buffer_ + *data_in_buffer, |
413 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, | 408 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, |
414 &new_offset, utf8_split_char_buffer_length_, encoding_); | 409 &new_offset, utf8_split_char_buffer_length_, encoding_); |
415 *data_in_buffer += new_chars_in_buffer; | 410 *data_in_buffer += new_chars_in_buffer; |
416 // Make sure we used all the data. | 411 // Make sure we used all the data. |
417 DCHECK(new_offset == utf8_split_char_buffer_length_); | 412 DCHECK(new_offset == utf8_split_char_buffer_length_); |
418 DCHECK(*data_in_buffer <= kBufferSize); | 413 DCHECK(*data_in_buffer <= kBufferSize); |
419 | 414 |
420 utf8_split_char_buffer_length_ = 0; | 415 utf8_split_char_buffer_length_ = 0; |
421 } | 416 } |
(...skipping 10 matching lines...) Expand all Loading... |
432 --current_data_length_; | 427 --current_data_length_; |
433 ++utf8_split_char_buffer_length_; | 428 ++utf8_split_char_buffer_length_; |
434 if (c >= (3 << 6)) { | 429 if (c >= (3 << 6)) { |
435 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte | 430 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte |
436 // character. No need to copy the previous characters into the conversion | 431 // character. No need to copy the previous characters into the conversion |
437 // buffer (even if they're multi-byte). | 432 // buffer (even if they're multi-byte). |
438 break; | 433 break; |
439 } | 434 } |
440 } | 435 } |
441 CHECK(utf8_split_char_buffer_length_ <= 4); | 436 CHECK(utf8_split_char_buffer_length_ <= 4); |
442 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { | 437 for (size_t i = 0; i < utf8_split_char_buffer_length_; ++i) { |
443 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; | 438 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; |
444 } | 439 } |
445 } | 440 } |
446 | 441 |
447 | 442 |
448 // ---------------------------------------------------------------------------- | 443 // ---------------------------------------------------------------------------- |
449 // ExternalTwoByteStringUtf16CharacterStream | 444 // ExternalTwoByteStringUtf16CharacterStream |
450 | 445 |
451 ExternalTwoByteStringUtf16CharacterStream:: | 446 ExternalTwoByteStringUtf16CharacterStream:: |
452 ~ExternalTwoByteStringUtf16CharacterStream() { } | 447 ~ExternalTwoByteStringUtf16CharacterStream() { } |
453 | 448 |
454 | 449 |
455 ExternalTwoByteStringUtf16CharacterStream | 450 ExternalTwoByteStringUtf16CharacterStream |
456 ::ExternalTwoByteStringUtf16CharacterStream( | 451 ::ExternalTwoByteStringUtf16CharacterStream( |
457 Handle<ExternalTwoByteString> data, | 452 Handle<ExternalTwoByteString> data, |
458 int start_position, | 453 int start_position, |
459 int end_position) | 454 int end_position) |
460 : Utf16CharacterStream(), | 455 : Utf16CharacterStream(), |
461 source_(data), | 456 source_(data), |
462 raw_data_(data->GetTwoByteData(start_position)) { | 457 raw_data_(data->GetTwoByteData(start_position)) { |
463 buffer_cursor_ = raw_data_, | 458 buffer_cursor_ = raw_data_, |
464 buffer_end_ = raw_data_ + (end_position - start_position); | 459 buffer_end_ = raw_data_ + (end_position - start_position); |
465 pos_ = start_position; | 460 pos_ = start_position; |
466 } | 461 } |
467 | 462 |
468 } } // namespace v8::internal | 463 } } // namespace v8::internal |
OLD | NEW |