| OLD | NEW | 
|---|---|
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. | 
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be | 
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. | 
| 4 | 4 | 
| 5 #include "src/v8.h" | 5 #include "src/v8.h" | 
| 6 | 6 | 
| 7 #include "src/scanner-character-streams.h" | 7 #include "src/scanner-character-streams.h" | 
| 8 | 8 | 
| 9 #include "include/v8.h" | 9 #include "include/v8.h" | 
| 10 #include "src/handles.h" | 10 #include "src/handles.h" | 
| 11 #include "src/unicode-inl.h" | 11 #include "src/unicode-inl.h" | 
| 12 | 12 | 
| 13 namespace v8 { | 13 namespace v8 { | 
| 14 namespace internal { | 14 namespace internal { | 
| 15 | 15 | 
| 16 namespace { | 16 namespace { | 
| 17 | 17 | 
| 18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src, | 18 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, | 
| 19                          unsigned* src_pos, unsigned src_length, | 19                        size_t* src_pos, size_t src_length, | 
| 20                          ScriptCompiler::StreamedSource::Encoding encoding) { | 20                        ScriptCompiler::StreamedSource::Encoding encoding) { | 
| 21   // It's possible that this will be called with length 0, but don't assume that | 21   // It's possible that this will be called with length 0, but don't assume that | 
| 22   // the functions this calls handle it gracefully. | 22   // the functions this calls handle it gracefully. | 
| 23   if (length == 0) return 0; | 23   if (length == 0) return 0; | 
| 24 | 24 | 
| 25   if (encoding == ScriptCompiler::StreamedSource::UTF8) { | 25   if (encoding == ScriptCompiler::StreamedSource::UTF8) { | 
| 26     return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( | 26     return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( | 
| 27         dest, length, src, src_pos, src_length); | 27         dest, length, src, src_pos, src_length); | 
| 28   } | 28   } | 
| 29 | 29 | 
| 30   unsigned to_fill = length; | 30   size_t to_fill = length; | 
| 31   if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; | 31   if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; | 
| 32 | 32 | 
| 33   if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { | 33   if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { | 
| 34     v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); | 34     v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); | 
| 35   } else { | 35   } else { | 
| 36     DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); | 36     DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); | 
| 37     v8::internal::CopyChars<uint16_t, uint16_t>( | 37     v8::internal::CopyChars<uint16_t, uint16_t>( | 
| 38         dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); | 38         dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); | 
| 39   } | 39   } | 
| 40   *src_pos += to_fill; | 40   *src_pos += to_fill; | 
| (...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 103   buffer_cursor_ = buffer_; | 103   buffer_cursor_ = buffer_; | 
| 104   if (pushback_limit_ != NULL) { | 104   if (pushback_limit_ != NULL) { | 
| 105     // Leave pushback mode. | 105     // Leave pushback mode. | 
| 106     buffer_end_ = pushback_limit_; | 106     buffer_end_ = pushback_limit_; | 
| 107     pushback_limit_ = NULL; | 107     pushback_limit_ = NULL; | 
| 108     // If there were any valid characters left at the | 108     // If there were any valid characters left at the | 
| 109     // start of the buffer, use those. | 109     // start of the buffer, use those. | 
| 110     if (buffer_cursor_ < buffer_end_) return true; | 110     if (buffer_cursor_ < buffer_end_) return true; | 
| 111     // Otherwise read a new block. | 111     // Otherwise read a new block. | 
| 112   } | 112   } | 
| 113   unsigned length = FillBuffer(pos_); | 113   size_t length = FillBuffer(pos_); | 
| 114   buffer_end_ = buffer_ + length; | 114   buffer_end_ = buffer_ + length; | 
| 115   return length > 0; | 115   return length > 0; | 
| 116 } | 116 } | 
| 117 | 117 | 
| 118 | 118 | 
| 119 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) { | 119 size_t BufferedUtf16CharacterStream::SlowSeekForward(size_t delta) { | 
| 120   // Leave pushback mode (i.e., ignore that there might be valid data | 120   // Leave pushback mode (i.e., ignore that there might be valid data | 
| 121   // in the buffer before the pushback_limit_ point). | 121   // in the buffer before the pushback_limit_ point). | 
| 122   pushback_limit_ = NULL; | 122   pushback_limit_ = NULL; | 
| 123   return BufferSeekForward(delta); | 123   return BufferSeekForward(delta); | 
| 124 } | 124 } | 
| 125 | 125 | 
| 126 | 126 | 
| 127 // ---------------------------------------------------------------------------- | 127 // ---------------------------------------------------------------------------- | 
| 128 // GenericStringUtf16CharacterStream | 128 // GenericStringUtf16CharacterStream | 
| 129 | 129 | 
| 130 | 130 | 
| 131 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( | 131 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( | 
| 132     Handle<String> data, | 132     Handle<String> data, size_t start_position, size_t end_position) | 
| 133     unsigned start_position, | 133     : string_(data), length_(end_position) { | 
| 134     unsigned end_position) |  | 
| 135     : string_(data), |  | 
| 136       length_(end_position) { |  | 
| 137   DCHECK(end_position >= start_position); | 134   DCHECK(end_position >= start_position); | 
| 138   pos_ = start_position; | 135   pos_ = start_position; | 
| 139 } | 136 } | 
| 140 | 137 | 
| 141 | 138 | 
| 142 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } | 139 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } | 
| 143 | 140 | 
| 144 | 141 | 
| 145 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) { | 142 size_t GenericStringUtf16CharacterStream::BufferSeekForward(size_t delta) { | 
| 146   unsigned old_pos = pos_; | 143   size_t old_pos = pos_; | 
| 147   pos_ = Min(pos_ + delta, length_); | 144   pos_ = Min(pos_ + delta, length_); | 
| 148   ReadBlock(); | 145   ReadBlock(); | 
| 149   return pos_ - old_pos; | 146   return pos_ - old_pos; | 
| 150 } | 147 } | 
| 151 | 148 | 
| 152 | 149 | 
| 153 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) { | 150 size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) { | 
| 154   if (from_pos >= length_) return 0; | 151   if (from_pos >= length_) return 0; | 
| 155   unsigned length = kBufferSize; | 152   size_t length = kBufferSize; | 
| 156   if (from_pos + length > length_) { | 153   if (from_pos + length > length_) { | 
| 157     length = length_ - from_pos; | 154     length = length_ - from_pos; | 
| 158   } | 155   } | 
| 159   String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length); | 156   String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), | 
|  | 157                             static_cast<int>(from_pos + length)); | 
| 160   return length; | 158   return length; | 
| 161 } | 159 } | 
| 162 | 160 | 
| 163 | 161 | 
| 164 // ---------------------------------------------------------------------------- | 162 // ---------------------------------------------------------------------------- | 
| 165 // Utf8ToUtf16CharacterStream | 163 // Utf8ToUtf16CharacterStream | 
| 166 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, | 164 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, | 
| 167                                                        unsigned length) | 165                                                        size_t length) | 
| 168     : BufferedUtf16CharacterStream(), | 166     : BufferedUtf16CharacterStream(), | 
| 169       raw_data_(data), | 167       raw_data_(data), | 
| 170       raw_data_length_(length), | 168       raw_data_length_(length), | 
| 171       raw_data_pos_(0), | 169       raw_data_pos_(0), | 
| 172       raw_character_position_(0) { | 170       raw_character_position_(0) { | 
| 173   ReadBlock(); | 171   ReadBlock(); | 
| 174 } | 172 } | 
| 175 | 173 | 
| 176 | 174 | 
| 177 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } | 175 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } | 
| 178 | 176 | 
| 179 | 177 | 
| 180 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length, | 178 size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length, | 
| 181                                                const byte* src, | 179                                              const byte* src, size_t* src_pos, | 
| 182                                                unsigned* src_pos, | 180                                              size_t src_length) { | 
| 183                                                unsigned src_length) { |  | 
| 184   static const unibrow::uchar kMaxUtf16Character = 0xffff; | 181   static const unibrow::uchar kMaxUtf16Character = 0xffff; | 
| 185   unsigned i = 0; | 182   size_t i = 0; | 
| 186   // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer | 183   // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer | 
| 187   // one character early (in the normal case), because we need to have at least | 184   // one character early (in the normal case), because we need to have at least | 
| 188   // two free spaces in the buffer to be sure that the next character will fit. | 185   // two free spaces in the buffer to be sure that the next character will fit. | 
| 189   while (i < length - 1) { | 186   while (i < length - 1) { | 
| 190     if (*src_pos == src_length) break; | 187     if (*src_pos == src_length) break; | 
| 191     unibrow::uchar c = src[*src_pos]; | 188     unibrow::uchar c = src[*src_pos]; | 
| 192     if (c <= unibrow::Utf8::kMaxOneByteChar) { | 189     if (c <= unibrow::Utf8::kMaxOneByteChar) { | 
| 193       *src_pos = *src_pos + 1; | 190       *src_pos = *src_pos + 1; | 
| 194     } else { | 191     } else { | 
| 195       c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, | 192       c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, | 
| 196                                         src_pos); | 193                                         src_pos); | 
| 197     } | 194     } | 
| 198     if (c > kMaxUtf16Character) { | 195     if (c > kMaxUtf16Character) { | 
| 199       dest[i++] = unibrow::Utf16::LeadSurrogate(c); | 196       dest[i++] = unibrow::Utf16::LeadSurrogate(c); | 
| 200       dest[i++] = unibrow::Utf16::TrailSurrogate(c); | 197       dest[i++] = unibrow::Utf16::TrailSurrogate(c); | 
| 201     } else { | 198     } else { | 
| 202       dest[i++] = static_cast<uc16>(c); | 199       dest[i++] = static_cast<uc16>(c); | 
| 203     } | 200     } | 
| 204   } | 201   } | 
| 205   return i; | 202   return i; | 
| 206 } | 203 } | 
| 207 | 204 | 
| 208 | 205 | 
| 209 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { | 206 size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) { | 
| 210   unsigned old_pos = pos_; | 207   size_t old_pos = pos_; | 
| 211   unsigned target_pos = pos_ + delta; | 208   size_t target_pos = pos_ + delta; | 
| 212   SetRawPosition(target_pos); | 209   SetRawPosition(target_pos); | 
| 213   pos_ = raw_character_position_; | 210   pos_ = raw_character_position_; | 
| 214   ReadBlock(); | 211   ReadBlock(); | 
| 215   return pos_ - old_pos; | 212   return pos_ - old_pos; | 
| 216 } | 213 } | 
| 217 | 214 | 
| 218 | 215 | 
| 219 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) { | 216 size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) { | 
| 220   SetRawPosition(char_position); | 217   SetRawPosition(char_position); | 
| 221   if (raw_character_position_ != char_position) { | 218   if (raw_character_position_ != char_position) { | 
| 222     // char_position was not a valid position in the stream (hit the end | 219     // char_position was not a valid position in the stream (hit the end | 
| 223     // while spooling to it). | 220     // while spooling to it). | 
| 224     return 0u; | 221     return 0u; | 
| 225   } | 222   } | 
| 226   unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, | 223   size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, | 
| 227                          raw_data_length_); | 224                        raw_data_length_); | 
| 228   raw_character_position_ = char_position + i; | 225   raw_character_position_ = char_position + i; | 
| 229   return i; | 226   return i; | 
| 230 } | 227 } | 
| 231 | 228 | 
| 232 | 229 | 
| 233 static const byte kUtf8MultiByteMask = 0xC0; | 230 static const byte kUtf8MultiByteMask = 0xC0; | 
| 234 static const byte kUtf8MultiByteCharFollower = 0x80; | 231 static const byte kUtf8MultiByteCharFollower = 0x80; | 
| 235 | 232 | 
| 236 | 233 | 
| 237 #ifdef DEBUG | 234 #ifdef DEBUG | 
| 238 static const byte kUtf8MultiByteCharStart = 0xC0; | 235 static const byte kUtf8MultiByteCharStart = 0xC0; | 
| 239 static bool IsUtf8MultiCharacterStart(byte first_byte) { | 236 static bool IsUtf8MultiCharacterStart(byte first_byte) { | 
| 240   return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | 237   return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | 
| 241 } | 238 } | 
| 242 #endif | 239 #endif | 
| 243 | 240 | 
| 244 | 241 | 
| 245 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | 242 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | 
| 246   return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | 243   return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | 
| 247 } | 244 } | 
| 248 | 245 | 
| 249 | 246 | 
| 250 // Move the cursor back to point at the preceding UTF-8 character start | 247 // Move the cursor back to point at the preceding UTF-8 character start | 
| 251 // in the buffer. | 248 // in the buffer. | 
| 252 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) { | 249 static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) { | 
| 253   byte character = buffer[--*cursor]; | 250   byte character = buffer[--*cursor]; | 
| 254   if (character > unibrow::Utf8::kMaxOneByteChar) { | 251   if (character > unibrow::Utf8::kMaxOneByteChar) { | 
| 255     DCHECK(IsUtf8MultiCharacterFollower(character)); | 252     DCHECK(IsUtf8MultiCharacterFollower(character)); | 
| 256     // Last byte of a multi-byte character encoding. Step backwards until | 253     // Last byte of a multi-byte character encoding. Step backwards until | 
| 257     // pointing to the first byte of the encoding, recognized by having the | 254     // pointing to the first byte of the encoding, recognized by having the | 
| 258     // top two bits set. | 255     // top two bits set. | 
| 259     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } | 256     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } | 
| 260     DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); | 257     DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); | 
| 261   } | 258   } | 
| 262 } | 259 } | 
| 263 | 260 | 
| 264 | 261 | 
| 265 // Move the cursor forward to point at the next following UTF-8 character start | 262 // Move the cursor forward to point at the next following UTF-8 character start | 
| 266 // in the buffer. | 263 // in the buffer. | 
| 267 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) { | 264 static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) { | 
| 268   byte character = buffer[(*cursor)++]; | 265   byte character = buffer[(*cursor)++]; | 
| 269   if (character > unibrow::Utf8::kMaxOneByteChar) { | 266   if (character > unibrow::Utf8::kMaxOneByteChar) { | 
| 270     // First character of a multi-byte character encoding. | 267     // First character of a multi-byte character encoding. | 
| 271     // The number of most-significant one-bits determines the length of the | 268     // The number of most-significant one-bits determines the length of the | 
| 272     // encoding: | 269     // encoding: | 
| 273     //  110..... - (0xCx, 0xDx) one additional byte (minimum). | 270     //  110..... - (0xCx, 0xDx) one additional byte (minimum). | 
| 274     //  1110.... - (0xEx) two additional bytes. | 271     //  1110.... - (0xEx) two additional bytes. | 
| 275     //  11110... - (0xFx) three additional bytes (maximum). | 272     //  11110... - (0xFx) three additional bytes (maximum). | 
| 276     DCHECK(IsUtf8MultiCharacterStart(character)); | 273     DCHECK(IsUtf8MultiCharacterStart(character)); | 
| 277     // Additional bytes is: | 274     // Additional bytes is: | 
| 278     // 1 if value in range 0xC0 .. 0xDF. | 275     // 1 if value in range 0xC0 .. 0xDF. | 
| 279     // 2 if value in range 0xE0 .. 0xEF. | 276     // 2 if value in range 0xE0 .. 0xEF. | 
| 280     // 3 if value in range 0xF0 .. 0xF7. | 277     // 3 if value in range 0xF0 .. 0xF7. | 
| 281     // Encode that in a single value. | 278     // Encode that in a single value. | 
| 282     unsigned additional_bytes = | 279     size_t additional_bytes = | 
| 283         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | 280         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | 
| 284     *cursor += additional_bytes; | 281     *cursor += additional_bytes; | 
| 285     DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | 282     DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | 
| 286   } | 283   } | 
| 287 } | 284 } | 
| 288 | 285 | 
| 289 | 286 | 
| 290 // This can't set a raw position between two surrogate pairs, since there | 287 // This can't set a raw position between two surrogate pairs, since there | 
| 291 // is no position in the UTF8 stream that corresponds to that.  This assumes | 288 // is no position in the UTF8 stream that corresponds to that.  This assumes | 
| 292 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If | 289 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If | 
| 293 // it is illegally coded as two 3 byte sequences then there is no problem here. | 290 // it is illegally coded as two 3 byte sequences then there is no problem here. | 
| 294 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) { | 291 void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) { | 
| 295   if (raw_character_position_ > target_position) { | 292   if (raw_character_position_ > target_position) { | 
| 296     // Spool backwards in utf8 buffer. | 293     // Spool backwards in utf8 buffer. | 
| 297     do { | 294     do { | 
| 298       int old_pos = raw_data_pos_; | 295       size_t old_pos = raw_data_pos_; | 
| 299       Utf8CharacterBack(raw_data_, &raw_data_pos_); | 296       Utf8CharacterBack(raw_data_, &raw_data_pos_); | 
| 300       raw_character_position_--; | 297       raw_character_position_--; | 
| 301       DCHECK(old_pos - raw_data_pos_ <= 4); | 298       DCHECK(old_pos - raw_data_pos_ <= 4); | 
| 302       // Step back over both code units for surrogate pairs. | 299       // Step back over both code units for surrogate pairs. | 
| 303       if (old_pos - raw_data_pos_ == 4) raw_character_position_--; | 300       if (old_pos - raw_data_pos_ == 4) raw_character_position_--; | 
| 304     } while (raw_character_position_ > target_position); | 301     } while (raw_character_position_ > target_position); | 
| 305     // No surrogate pair splitting. | 302     // No surrogate pair splitting. | 
| 306     DCHECK(raw_character_position_ == target_position); | 303     DCHECK(raw_character_position_ == target_position); | 
| 307     return; | 304     return; | 
| 308   } | 305   } | 
| 309   // Spool forwards in the utf8 buffer. | 306   // Spool forwards in the utf8 buffer. | 
| 310   while (raw_character_position_ < target_position) { | 307   while (raw_character_position_ < target_position) { | 
| 311     if (raw_data_pos_ == raw_data_length_) return; | 308     if (raw_data_pos_ == raw_data_length_) return; | 
| 312     int old_pos = raw_data_pos_; | 309     size_t old_pos = raw_data_pos_; | 
| 313     Utf8CharacterForward(raw_data_, &raw_data_pos_); | 310     Utf8CharacterForward(raw_data_, &raw_data_pos_); | 
| 314     raw_character_position_++; | 311     raw_character_position_++; | 
| 315     DCHECK(raw_data_pos_ - old_pos <= 4); | 312     DCHECK(raw_data_pos_ - old_pos <= 4); | 
| 316     if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | 313     if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | 
| 317   } | 314   } | 
| 318   // No surrogate pair splitting. | 315   // No surrogate pair splitting. | 
| 319   DCHECK(raw_character_position_ == target_position); | 316   DCHECK(raw_character_position_ == target_position); | 
| 320 } | 317 } | 
| 321 | 318 | 
| 322 | 319 | 
| 323 unsigned ExternalStreamingStream::FillBuffer(unsigned position) { | 320 size_t ExternalStreamingStream::FillBuffer(size_t position) { | 
| 324   // Ignore "position" which is the position in the decoded data. Instead, | 321   // Ignore "position" which is the position in the decoded data. Instead, | 
| 325   // ExternalStreamingStream keeps track of the position in the raw data. | 322   // ExternalStreamingStream keeps track of the position in the raw data. | 
| 326   unsigned data_in_buffer = 0; | 323   size_t data_in_buffer = 0; | 
| 327   // Note that the UTF-8 decoder might not be able to fill the buffer | 324   // Note that the UTF-8 decoder might not be able to fill the buffer | 
| 328   // completely; it will typically leave the last character empty (see | 325   // completely; it will typically leave the last character empty (see | 
| 329   // Utf8ToUtf16CharacterStream::CopyChars). | 326   // Utf8ToUtf16CharacterStream::CopyChars). | 
| 330   while (data_in_buffer < kBufferSize - 1) { | 327   while (data_in_buffer < kBufferSize - 1) { | 
| 331     if (current_data_ == NULL) { | 328     if (current_data_ == NULL) { | 
| 332       // GetSomeData will wait until the embedder has enough data. Here's an | 329       // GetSomeData will wait until the embedder has enough data. Here's an | 
| 333       // interface between the API which uses size_t (which is the correct type | 330       // interface between the API which uses size_t (which is the correct type | 
| 334       // here) and the internal parts which use unsigned. TODO(marja): make the | 331       // here) and the internal parts which use size_t. | 
| 335       // internal parts use size_t too. | 332       current_data_length_ = source_stream_->GetMoreData(&current_data_); | 
| 336       current_data_length_ = |  | 
| 337           static_cast<unsigned>(source_stream_->GetMoreData(&current_data_)); |  | 
| 338       current_data_offset_ = 0; | 333       current_data_offset_ = 0; | 
| 339       bool data_ends = current_data_length_ == 0; | 334       bool data_ends = current_data_length_ == 0; | 
| 340 | 335 | 
| 341       // A caveat: a data chunk might end with bytes from an incomplete UTF-8 | 336       // A caveat: a data chunk might end with bytes from an incomplete UTF-8 | 
| 342       // character (the rest of the bytes will be in the next chunk). | 337       // character (the rest of the bytes will be in the next chunk). | 
| 343       if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { | 338       if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { | 
| 344         HandleUtf8SplitCharacters(&data_in_buffer); | 339         HandleUtf8SplitCharacters(&data_in_buffer); | 
| 345         if (!data_ends && current_data_offset_ == current_data_length_) { | 340         if (!data_ends && current_data_offset_ == current_data_length_) { | 
| 346           // The data stream didn't end, but we used all the data in the | 341           // The data stream didn't end, but we used all the data in the | 
| 347           // chunk. This will only happen when the chunk was really small. We | 342           // chunk. This will only happen when the chunk was really small. We | 
| 348           // don't handle the case where a UTF-8 character is split over several | 343           // don't handle the case where a UTF-8 character is split over several | 
| 349           // chunks; in that case V8 won't crash, but it will be a parse error. | 344           // chunks; in that case V8 won't crash, but it will be a parse error. | 
| 350           delete[] current_data_; | 345           delete[] current_data_; | 
| 351           current_data_ = NULL; | 346           current_data_ = NULL; | 
| 352           current_data_length_ = 0; | 347           current_data_length_ = 0; | 
| 353           current_data_offset_ = 0; | 348           current_data_offset_ = 0; | 
| 354           continue;  // Request a new chunk. | 349           continue;  // Request a new chunk. | 
| 355         } | 350         } | 
| 356       } | 351       } | 
| 357 | 352 | 
| 358       // Did the data stream end? | 353       // Did the data stream end? | 
| 359       if (data_ends) { | 354       if (data_ends) { | 
| 360         DCHECK(utf8_split_char_buffer_length_ == 0); | 355         DCHECK(utf8_split_char_buffer_length_ == 0); | 
| 361         return data_in_buffer; | 356         return data_in_buffer; | 
| 362       } | 357       } | 
| 363     } | 358     } | 
| 364 | 359 | 
| 365     // Fill the buffer from current_data_. | 360     // Fill the buffer from current_data_. | 
| 366     unsigned new_offset = 0; | 361     size_t new_offset = 0; | 
| 367     unsigned new_chars_in_buffer = | 362     size_t new_chars_in_buffer = | 
| 368         CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, | 363         CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, | 
| 369                         current_data_ + current_data_offset_, &new_offset, | 364                         current_data_ + current_data_offset_, &new_offset, | 
| 370                         current_data_length_ - current_data_offset_, encoding_); | 365                         current_data_length_ - current_data_offset_, encoding_); | 
| 371     data_in_buffer += new_chars_in_buffer; | 366     data_in_buffer += new_chars_in_buffer; | 
| 372     current_data_offset_ += new_offset; | 367     current_data_offset_ += new_offset; | 
| 373     DCHECK(data_in_buffer <= kBufferSize); | 368     DCHECK(data_in_buffer <= kBufferSize); | 
| 374 | 369 | 
| 375     // Did we use all the data in the data chunk? | 370     // Did we use all the data in the data chunk? | 
| 376     if (current_data_offset_ == current_data_length_) { | 371     if (current_data_offset_ == current_data_length_) { | 
| 377       delete[] current_data_; | 372       delete[] current_data_; | 
| 378       current_data_ = NULL; | 373       current_data_ = NULL; | 
| 379       current_data_length_ = 0; | 374       current_data_length_ = 0; | 
| 380       current_data_offset_ = 0; | 375       current_data_offset_ = 0; | 
| 381     } | 376     } | 
| 382   } | 377   } | 
| 383   return data_in_buffer; | 378   return data_in_buffer; | 
| 384 } | 379 } | 
| 385 | 380 | 
| 386 void ExternalStreamingStream::HandleUtf8SplitCharacters( | 381 void ExternalStreamingStream::HandleUtf8SplitCharacters( | 
| 387     unsigned* data_in_buffer) { | 382     size_t* data_in_buffer) { | 
| 388   // Note the following property of UTF-8 which makes this function possible: | 383   // Note the following property of UTF-8 which makes this function possible: | 
| 389   // Given any byte, we can always read its local environment (in both | 384   // Given any byte, we can always read its local environment (in both | 
| 390   // directions) to find out the (possibly multi-byte) character it belongs | 385   // directions) to find out the (possibly multi-byte) character it belongs | 
| 391   // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a | 386   // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a | 
| 392   // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or | 387   // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or | 
| 393   // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. | 388   // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. | 
| 394 | 389 | 
| 395   // First check if we have leftover data from the last chunk. | 390   // First check if we have leftover data from the last chunk. | 
| 396   unibrow::uchar c; | 391   unibrow::uchar c; | 
| 397   if (utf8_split_char_buffer_length_ > 0) { | 392   if (utf8_split_char_buffer_length_ > 0) { | 
| 398     // Move the bytes which are part of the split character (which started in | 393     // Move the bytes which are part of the split character (which started in | 
| 399     // the previous chunk) into utf8_split_char_buffer_. Note that the | 394     // the previous chunk) into utf8_split_char_buffer_. Note that the | 
| 400     // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. | 395     // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. | 
| 401     while (current_data_offset_ < current_data_length_ && | 396     while (current_data_offset_ < current_data_length_ && | 
| 402            utf8_split_char_buffer_length_ < 4 && | 397            utf8_split_char_buffer_length_ < 4 && | 
| 403            (c = current_data_[current_data_offset_]) >> 6 == 2) { | 398            (c = current_data_[current_data_offset_]) >> 6 == 2) { | 
| 404       utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; | 399       utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; | 
| 405       ++utf8_split_char_buffer_length_; | 400       ++utf8_split_char_buffer_length_; | 
| 406       ++current_data_offset_; | 401       ++current_data_offset_; | 
| 407     } | 402     } | 
| 408 | 403 | 
| 409     // Convert the data in utf8_split_char_buffer_. | 404     // Convert the data in utf8_split_char_buffer_. | 
| 410     unsigned new_offset = 0; | 405     size_t new_offset = 0; | 
| 411     unsigned new_chars_in_buffer = | 406     size_t new_chars_in_buffer = | 
| 412         CopyCharsHelper(buffer_ + *data_in_buffer, | 407         CopyCharsHelper(buffer_ + *data_in_buffer, | 
| 413                         kBufferSize - *data_in_buffer, utf8_split_char_buffer_, | 408                         kBufferSize - *data_in_buffer, utf8_split_char_buffer_, | 
| 414                         &new_offset, utf8_split_char_buffer_length_, encoding_); | 409                         &new_offset, utf8_split_char_buffer_length_, encoding_); | 
| 415     *data_in_buffer += new_chars_in_buffer; | 410     *data_in_buffer += new_chars_in_buffer; | 
| 416     // Make sure we used all the data. | 411     // Make sure we used all the data. | 
| 417     DCHECK(new_offset == utf8_split_char_buffer_length_); | 412     DCHECK(new_offset == utf8_split_char_buffer_length_); | 
| 418     DCHECK(*data_in_buffer <= kBufferSize); | 413     DCHECK(*data_in_buffer <= kBufferSize); | 
| 419 | 414 | 
| 420     utf8_split_char_buffer_length_ = 0; | 415     utf8_split_char_buffer_length_ = 0; | 
| 421   } | 416   } | 
| (...skipping 10 matching lines...) Expand all  Loading... | 
| 432     --current_data_length_; | 427     --current_data_length_; | 
| 433     ++utf8_split_char_buffer_length_; | 428     ++utf8_split_char_buffer_length_; | 
| 434     if (c >= (3 << 6)) { | 429     if (c >= (3 << 6)) { | 
| 435       // 3 << 6 = 0b11000000; this is the first byte of the multi-byte | 430       // 3 << 6 = 0b11000000; this is the first byte of the multi-byte | 
| 436       // character. No need to copy the previous characters into the conversion | 431       // character. No need to copy the previous characters into the conversion | 
| 437       // buffer (even if they're multi-byte). | 432       // buffer (even if they're multi-byte). | 
| 438       break; | 433       break; | 
| 439     } | 434     } | 
| 440   } | 435   } | 
| 441   CHECK(utf8_split_char_buffer_length_ <= 4); | 436   CHECK(utf8_split_char_buffer_length_ <= 4); | 
| 442   for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { | 437   for (size_t i = 0; i < utf8_split_char_buffer_length_; ++i) { | 
| 443     utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; | 438     utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; | 
| 444   } | 439   } | 
| 445 } | 440 } | 
| 446 | 441 | 
| 447 | 442 | 
| 448 // ---------------------------------------------------------------------------- | 443 // ---------------------------------------------------------------------------- | 
| 449 // ExternalTwoByteStringUtf16CharacterStream | 444 // ExternalTwoByteStringUtf16CharacterStream | 
| 450 | 445 | 
| 451 ExternalTwoByteStringUtf16CharacterStream:: | 446 ExternalTwoByteStringUtf16CharacterStream:: | 
| 452     ~ExternalTwoByteStringUtf16CharacterStream() { } | 447     ~ExternalTwoByteStringUtf16CharacterStream() { } | 
| 453 | 448 | 
| 454 | 449 | 
| 455 ExternalTwoByteStringUtf16CharacterStream | 450 ExternalTwoByteStringUtf16CharacterStream | 
| 456     ::ExternalTwoByteStringUtf16CharacterStream( | 451     ::ExternalTwoByteStringUtf16CharacterStream( | 
| 457         Handle<ExternalTwoByteString> data, | 452         Handle<ExternalTwoByteString> data, | 
| 458         int start_position, | 453         int start_position, | 
| 459         int end_position) | 454         int end_position) | 
| 460     : Utf16CharacterStream(), | 455     : Utf16CharacterStream(), | 
| 461       source_(data), | 456       source_(data), | 
| 462       raw_data_(data->GetTwoByteData(start_position)) { | 457       raw_data_(data->GetTwoByteData(start_position)) { | 
| 463   buffer_cursor_ = raw_data_, | 458   buffer_cursor_ = raw_data_, | 
| 464   buffer_end_ = raw_data_ + (end_position - start_position); | 459   buffer_end_ = raw_data_ + (end_position - start_position); | 
| 465   pos_ = start_position; | 460   pos_ = start_position; | 
| 466 } | 461 } | 
| 467 | 462 | 
| 468 } }  // namespace v8::internal | 463 } }  // namespace v8::internal | 
| OLD | NEW | 
|---|