src/scanner-character-streams.cc - Issue 864273005: Scanner / Unicode decoding: use size_t instead of unsigned.

Side by Side Diff: src/scanner-character-streams.cc

Issue 864273005: Scanner / Unicode decoding: use size_t instead of unsigned. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: tentative Created 5 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/v8.h"	5 #include "src/v8.h"

6	6

7 #include "src/scanner-character-streams.h"	7 #include "src/scanner-character-streams.h"

8	8

9 #include "include/v8.h"	9 #include "include/v8.h"

10 #include "src/handles.h"	10 #include "src/handles.h"

11 #include "src/unicode-inl.h"	11 #include "src/unicode-inl.h"

12	12

13 namespace v8 {	13 namespace v8 {

14 namespace internal {	14 namespace internal {

15	15

16 namespace {	16 namespace {

17	17

18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,	18 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src,

19 unsigned* src_pos, unsigned src_length,	19 size_t* src_pos, size_t src_length,

20 ScriptCompiler::StreamedSource::Encoding encoding) {	20 ScriptCompiler::StreamedSource::Encoding encoding) {

21 // It's possible that this will be called with length 0, but don't assume that	21 // It's possible that this will be called with length 0, but don't assume that

22 // the functions this calls handle it gracefully.	22 // the functions this calls handle it gracefully.

23 if (length == 0) return 0;	23 if (length == 0) return 0;

24	24

25 if (encoding == ScriptCompiler::StreamedSource::UTF8) {	25 if (encoding == ScriptCompiler::StreamedSource::UTF8) {

26 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(	26 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(

27 dest, length, src, src_pos, src_length);	27 dest, length, src, src_pos, src_length);

28 }	28 }

29	29

30 unsigned to_fill = length;	30 size_t to_fill = length;

31 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;	31 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;

32	32

33 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) {	33 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) {

34 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);	34 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);

35 } else {	35 } else {

36 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE);	36 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE);

37 v8::internal::CopyChars<uint16_t, uint16_t>(	37 v8::internal::CopyChars<uint16_t, uint16_t>(

38 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);	38 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);

39 }	39 }

40 *src_pos += to_fill;	40 *src_pos += to_fill;

(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
103 buffer_cursor_ = buffer_;	103 buffer_cursor_ = buffer_;

104 if (pushback_limit_ != NULL) {	104 if (pushback_limit_ != NULL) {

105 // Leave pushback mode.	105 // Leave pushback mode.

106 buffer_end_ = pushback_limit_;	106 buffer_end_ = pushback_limit_;

107 pushback_limit_ = NULL;	107 pushback_limit_ = NULL;

108 // If there were any valid characters left at the	108 // If there were any valid characters left at the

109 // start of the buffer, use those.	109 // start of the buffer, use those.

110 if (buffer_cursor_ < buffer_end_) return true;	110 if (buffer_cursor_ < buffer_end_) return true;

111 // Otherwise read a new block.	111 // Otherwise read a new block.

112 }	112 }

113 unsigned length = FillBuffer(pos_);	113 size_t length = FillBuffer(pos_);

114 buffer_end_ = buffer_ + length;	114 buffer_end_ = buffer_ + length;

115 return length > 0;	115 return length > 0;

116 }	116 }

117	117

118	118

119 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {	119 size_t BufferedUtf16CharacterStream::SlowSeekForward(size_t delta) {

120 // Leave pushback mode (i.e., ignore that there might be valid data	120 // Leave pushback mode (i.e., ignore that there might be valid data

121 // in the buffer before the pushback_limit_ point).	121 // in the buffer before the pushback_limit_ point).

122 pushback_limit_ = NULL;	122 pushback_limit_ = NULL;

123 return BufferSeekForward(delta);	123 return BufferSeekForward(delta);

124 }	124 }

125	125

126	126

127 // ----------------------------------------------------------------------------	127 // ----------------------------------------------------------------------------

128 // GenericStringUtf16CharacterStream	128 // GenericStringUtf16CharacterStream

129	129

130	130

131 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(	131 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(

132 Handle<String> data,	132 Handle<String> data, size_t start_position, size_t end_position)

133 unsigned start_position,	133 : string_(data), length_(end_position) {

134 unsigned end_position)

135 : string_(data),

136 length_(end_position) {

137 DCHECK(end_position >= start_position);	134 DCHECK(end_position >= start_position);

138 pos_ = start_position;	135 pos_ = start_position;

139 }	136 }

140	137

141	138

142 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }	139 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }

143	140

144	141

145 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {	142 size_t GenericStringUtf16CharacterStream::BufferSeekForward(size_t delta) {

146 unsigned old_pos = pos_;	143 size_t old_pos = pos_;

147 pos_ = Min(pos_ + delta, length_);	144 pos_ = Min(pos_ + delta, length_);

148 ReadBlock();	145 ReadBlock();

149 return pos_ - old_pos;	146 return pos_ - old_pos;

150 }	147 }

151	148

152	149

153 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) {	150 size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) {

154 if (from_pos >= length_) return 0;	151 if (from_pos >= length_) return 0;

155 unsigned length = kBufferSize;	152 size_t length = kBufferSize;

156 if (from_pos + length > length_) {	153 if (from_pos + length > length_) {

157 length = length_ - from_pos;	154 length = length_ - from_pos;

158 }	155 }

159 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);	156 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos),

	157 static_cast<int>(from_pos + length));

160 return length;	158 return length;

161 }	159 }

162	160

163	161

164 // ----------------------------------------------------------------------------	162 // ----------------------------------------------------------------------------

165 // Utf8ToUtf16CharacterStream	163 // Utf8ToUtf16CharacterStream

166 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,	164 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,

167 unsigned length)	165 size_t length)

168 : BufferedUtf16CharacterStream(),	166 : BufferedUtf16CharacterStream(),

169 raw_data_(data),	167 raw_data_(data),

170 raw_data_length_(length),	168 raw_data_length_(length),

171 raw_data_pos_(0),	169 raw_data_pos_(0),

172 raw_character_position_(0) {	170 raw_character_position_(0) {

173 ReadBlock();	171 ReadBlock();

174 }	172 }

175	173

176	174

177 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }	175 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }

178	176

179	177

180 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length,	178 size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length,

181 const byte* src,	179 const byte* src, size_t* src_pos,

182 unsigned* src_pos,	180 size_t src_length) {

183 unsigned src_length) {

184 static const unibrow::uchar kMaxUtf16Character = 0xffff;	181 static const unibrow::uchar kMaxUtf16Character = 0xffff;

185 unsigned i = 0;	182 size_t i = 0;

186 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer	183 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer

187 // one character early (in the normal case), because we need to have at least	184 // one character early (in the normal case), because we need to have at least

188 // two free spaces in the buffer to be sure that the next character will fit.	185 // two free spaces in the buffer to be sure that the next character will fit.

189 while (i < length - 1) {	186 while (i < length - 1) {

190 if (*src_pos == src_length) break;	187 if (*src_pos == src_length) break;

191 unibrow::uchar c = src[*src_pos];	188 unibrow::uchar c = src[*src_pos];

192 if (c <= unibrow::Utf8::kMaxOneByteChar) {	189 if (c <= unibrow::Utf8::kMaxOneByteChar) {

193 *src_pos = *src_pos + 1;	190 *src_pos = *src_pos + 1;

194 } else {	191 } else {

195 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,	192 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,

196 src_pos);	193 src_pos);

197 }	194 }

198 if (c > kMaxUtf16Character) {	195 if (c > kMaxUtf16Character) {

199 dest[i++] = unibrow::Utf16::LeadSurrogate(c);	196 dest[i++] = unibrow::Utf16::LeadSurrogate(c);

200 dest[i++] = unibrow::Utf16::TrailSurrogate(c);	197 dest[i++] = unibrow::Utf16::TrailSurrogate(c);

201 } else {	198 } else {

202 dest[i++] = static_cast<uc16>(c);	199 dest[i++] = static_cast<uc16>(c);

203 }	200 }

204 }	201 }

205 return i;	202 return i;

206 }	203 }

207	204

208	205

209 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {	206 size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) {

210 unsigned old_pos = pos_;	207 size_t old_pos = pos_;

211 unsigned target_pos = pos_ + delta;	208 size_t target_pos = pos_ + delta;

212 SetRawPosition(target_pos);	209 SetRawPosition(target_pos);

213 pos_ = raw_character_position_;	210 pos_ = raw_character_position_;

214 ReadBlock();	211 ReadBlock();

215 return pos_ - old_pos;	212 return pos_ - old_pos;

216 }	213 }

217	214

218	215

219 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {	216 size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) {

220 SetRawPosition(char_position);	217 SetRawPosition(char_position);

221 if (raw_character_position_ != char_position) {	218 if (raw_character_position_ != char_position) {

222 // char_position was not a valid position in the stream (hit the end	219 // char_position was not a valid position in the stream (hit the end

223 // while spooling to it).	220 // while spooling to it).

224 return 0u;	221 return 0u;

225 }	222 }

226 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,	223 size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,

227 raw_data_length_);	224 raw_data_length_);

228 raw_character_position_ = char_position + i;	225 raw_character_position_ = char_position + i;

229 return i;	226 return i;

230 }	227 }

231	228

232	229

233 static const byte kUtf8MultiByteMask = 0xC0;	230 static const byte kUtf8MultiByteMask = 0xC0;

234 static const byte kUtf8MultiByteCharFollower = 0x80;	231 static const byte kUtf8MultiByteCharFollower = 0x80;

235	232

236	233

237 #ifdef DEBUG	234 #ifdef DEBUG

238 static const byte kUtf8MultiByteCharStart = 0xC0;	235 static const byte kUtf8MultiByteCharStart = 0xC0;

239 static bool IsUtf8MultiCharacterStart(byte first_byte) {	236 static bool IsUtf8MultiCharacterStart(byte first_byte) {

240 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;	237 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;

241 }	238 }

242 #endif	239 #endif

243	240

244	241

245 static bool IsUtf8MultiCharacterFollower(byte later_byte) {	242 static bool IsUtf8MultiCharacterFollower(byte later_byte) {

246 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;	243 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;

247 }	244 }

248	245

249	246

250 // Move the cursor back to point at the preceding UTF-8 character start	247 // Move the cursor back to point at the preceding UTF-8 character start

251 // in the buffer.	248 // in the buffer.

252 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {	249 static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) {

253 byte character = buffer[--*cursor];	250 byte character = buffer[--*cursor];

254 if (character > unibrow::Utf8::kMaxOneByteChar) {	251 if (character > unibrow::Utf8::kMaxOneByteChar) {

255 DCHECK(IsUtf8MultiCharacterFollower(character));	252 DCHECK(IsUtf8MultiCharacterFollower(character));

256 // Last byte of a multi-byte character encoding. Step backwards until	253 // Last byte of a multi-byte character encoding. Step backwards until

257 // pointing to the first byte of the encoding, recognized by having the	254 // pointing to the first byte of the encoding, recognized by having the

258 // top two bits set.	255 // top two bits set.

259 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }	256 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }

260 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));	257 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));

261 }	258 }

262 }	259 }

263	260

264	261

265 // Move the cursor forward to point at the next following UTF-8 character start	262 // Move the cursor forward to point at the next following UTF-8 character start

266 // in the buffer.	263 // in the buffer.

267 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {	264 static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) {

268 byte character = buffer[(*cursor)++];	265 byte character = buffer[(*cursor)++];

269 if (character > unibrow::Utf8::kMaxOneByteChar) {	266 if (character > unibrow::Utf8::kMaxOneByteChar) {

270 // First character of a multi-byte character encoding.	267 // First character of a multi-byte character encoding.

271 // The number of most-significant one-bits determines the length of the	268 // The number of most-significant one-bits determines the length of the

272 // encoding:	269 // encoding:

273 // 110..... - (0xCx, 0xDx) one additional byte (minimum).	270 // 110..... - (0xCx, 0xDx) one additional byte (minimum).

274 // 1110.... - (0xEx) two additional bytes.	271 // 1110.... - (0xEx) two additional bytes.

275 // 11110... - (0xFx) three additional bytes (maximum).	272 // 11110... - (0xFx) three additional bytes (maximum).

276 DCHECK(IsUtf8MultiCharacterStart(character));	273 DCHECK(IsUtf8MultiCharacterStart(character));

277 // Additional bytes is:	274 // Additional bytes is:

278 // 1 if value in range 0xC0 .. 0xDF.	275 // 1 if value in range 0xC0 .. 0xDF.

279 // 2 if value in range 0xE0 .. 0xEF.	276 // 2 if value in range 0xE0 .. 0xEF.

280 // 3 if value in range 0xF0 .. 0xF7.	277 // 3 if value in range 0xF0 .. 0xF7.

281 // Encode that in a single value.	278 // Encode that in a single value.

282 unsigned additional_bytes =	279 size_t additional_bytes =

283 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;	280 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;

284 *cursor += additional_bytes;	281 *cursor += additional_bytes;

285 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));	282 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));

286 }	283 }

287 }	284 }

288	285

289	286

290 // This can't set a raw position between two surrogate pairs, since there	287 // This can't set a raw position between two surrogate pairs, since there

291 // is no position in the UTF8 stream that corresponds to that. This assumes	288 // is no position in the UTF8 stream that corresponds to that. This assumes

292 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If	289 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If

293 // it is illegally coded as two 3 byte sequences then there is no problem here.	290 // it is illegally coded as two 3 byte sequences then there is no problem here.

294 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {	291 void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) {

295 if (raw_character_position_ > target_position) {	292 if (raw_character_position_ > target_position) {

296 // Spool backwards in utf8 buffer.	293 // Spool backwards in utf8 buffer.

297 do {	294 do {

298 int old_pos = raw_data_pos_;	295 size_t old_pos = raw_data_pos_;

299 Utf8CharacterBack(raw_data_, &raw_data_pos_);	296 Utf8CharacterBack(raw_data_, &raw_data_pos_);

300 raw_character_position_--;	297 raw_character_position_--;

301 DCHECK(old_pos - raw_data_pos_ <= 4);	298 DCHECK(old_pos - raw_data_pos_ <= 4);

302 // Step back over both code units for surrogate pairs.	299 // Step back over both code units for surrogate pairs.

303 if (old_pos - raw_data_pos_ == 4) raw_character_position_--;	300 if (old_pos - raw_data_pos_ == 4) raw_character_position_--;

304 } while (raw_character_position_ > target_position);	301 } while (raw_character_position_ > target_position);

305 // No surrogate pair splitting.	302 // No surrogate pair splitting.

306 DCHECK(raw_character_position_ == target_position);	303 DCHECK(raw_character_position_ == target_position);

307 return;	304 return;

308 }	305 }

309 // Spool forwards in the utf8 buffer.	306 // Spool forwards in the utf8 buffer.

310 while (raw_character_position_ < target_position) {	307 while (raw_character_position_ < target_position) {

311 if (raw_data_pos_ == raw_data_length_) return;	308 if (raw_data_pos_ == raw_data_length_) return;

312 int old_pos = raw_data_pos_;	309 size_t old_pos = raw_data_pos_;

313 Utf8CharacterForward(raw_data_, &raw_data_pos_);	310 Utf8CharacterForward(raw_data_, &raw_data_pos_);

314 raw_character_position_++;	311 raw_character_position_++;

315 DCHECK(raw_data_pos_ - old_pos <= 4);	312 DCHECK(raw_data_pos_ - old_pos <= 4);

316 if (raw_data_pos_ - old_pos == 4) raw_character_position_++;	313 if (raw_data_pos_ - old_pos == 4) raw_character_position_++;

317 }	314 }

318 // No surrogate pair splitting.	315 // No surrogate pair splitting.

319 DCHECK(raw_character_position_ == target_position);	316 DCHECK(raw_character_position_ == target_position);

320 }	317 }

321	318

322	319

323 unsigned ExternalStreamingStream::FillBuffer(unsigned position) {	320 size_t ExternalStreamingStream::FillBuffer(size_t position) {

324 // Ignore "position" which is the position in the decoded data. Instead,	321 // Ignore "position" which is the position in the decoded data. Instead,

325 // ExternalStreamingStream keeps track of the position in the raw data.	322 // ExternalStreamingStream keeps track of the position in the raw data.

326 unsigned data_in_buffer = 0;	323 size_t data_in_buffer = 0;

327 // Note that the UTF-8 decoder might not be able to fill the buffer	324 // Note that the UTF-8 decoder might not be able to fill the buffer

328 // completely; it will typically leave the last character empty (see	325 // completely; it will typically leave the last character empty (see

329 // Utf8ToUtf16CharacterStream::CopyChars).	326 // Utf8ToUtf16CharacterStream::CopyChars).

330 while (data_in_buffer < kBufferSize - 1) {	327 while (data_in_buffer < kBufferSize - 1) {

331 if (current_data_ == NULL) {	328 if (current_data_ == NULL) {

332 // GetSomeData will wait until the embedder has enough data. Here's an	329 // GetSomeData will wait until the embedder has enough data. Here's an

333 // interface between the API which uses size_t (which is the correct type	330 // interface between the API which uses size_t (which is the correct type

334 // here) and the internal parts which use unsigned. TODO(marja): make the	331 // here) and the internal parts which use size_t.

335 // internal parts use size_t too.	332 current_data_length_ = source_stream_->GetMoreData(&current_data_);

336 current_data_length_ =

337 static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));

338 current_data_offset_ = 0;	333 current_data_offset_ = 0;

339 bool data_ends = current_data_length_ == 0;	334 bool data_ends = current_data_length_ == 0;

340	335

341 // A caveat: a data chunk might end with bytes from an incomplete UTF-8	336 // A caveat: a data chunk might end with bytes from an incomplete UTF-8

342 // character (the rest of the bytes will be in the next chunk).	337 // character (the rest of the bytes will be in the next chunk).

343 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {	338 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {

344 HandleUtf8SplitCharacters(&data_in_buffer);	339 HandleUtf8SplitCharacters(&data_in_buffer);

345 if (!data_ends && current_data_offset_ == current_data_length_) {	340 if (!data_ends && current_data_offset_ == current_data_length_) {

346 // The data stream didn't end, but we used all the data in the	341 // The data stream didn't end, but we used all the data in the

347 // chunk. This will only happen when the chunk was really small. We	342 // chunk. This will only happen when the chunk was really small. We

348 // don't handle the case where a UTF-8 character is split over several	343 // don't handle the case where a UTF-8 character is split over several

349 // chunks; in that case V8 won't crash, but it will be a parse error.	344 // chunks; in that case V8 won't crash, but it will be a parse error.

350 delete[] current_data_;	345 delete[] current_data_;

351 current_data_ = NULL;	346 current_data_ = NULL;

352 current_data_length_ = 0;	347 current_data_length_ = 0;

353 current_data_offset_ = 0;	348 current_data_offset_ = 0;

354 continue; // Request a new chunk.	349 continue; // Request a new chunk.

355 }	350 }

356 }	351 }

357	352

358 // Did the data stream end?	353 // Did the data stream end?

359 if (data_ends) {	354 if (data_ends) {

360 DCHECK(utf8_split_char_buffer_length_ == 0);	355 DCHECK(utf8_split_char_buffer_length_ == 0);

361 return data_in_buffer;	356 return data_in_buffer;

362 }	357 }

363 }	358 }

364	359

365 // Fill the buffer from current_data_.	360 // Fill the buffer from current_data_.

366 unsigned new_offset = 0;	361 size_t new_offset = 0;

367 unsigned new_chars_in_buffer =	362 size_t new_chars_in_buffer =

368 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,	363 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,

369 current_data_ + current_data_offset_, &new_offset,	364 current_data_ + current_data_offset_, &new_offset,

370 current_data_length_ - current_data_offset_, encoding_);	365 current_data_length_ - current_data_offset_, encoding_);

371 data_in_buffer += new_chars_in_buffer;	366 data_in_buffer += new_chars_in_buffer;

372 current_data_offset_ += new_offset;	367 current_data_offset_ += new_offset;

373 DCHECK(data_in_buffer <= kBufferSize);	368 DCHECK(data_in_buffer <= kBufferSize);

374	369

375 // Did we use all the data in the data chunk?	370 // Did we use all the data in the data chunk?

376 if (current_data_offset_ == current_data_length_) {	371 if (current_data_offset_ == current_data_length_) {

377 delete[] current_data_;	372 delete[] current_data_;

378 current_data_ = NULL;	373 current_data_ = NULL;

379 current_data_length_ = 0;	374 current_data_length_ = 0;

380 current_data_offset_ = 0;	375 current_data_offset_ = 0;

381 }	376 }

382 }	377 }

383 return data_in_buffer;	378 return data_in_buffer;

384 }	379 }

385	380

386 void ExternalStreamingStream::HandleUtf8SplitCharacters(	381 void ExternalStreamingStream::HandleUtf8SplitCharacters(

387 unsigned* data_in_buffer) {	382 size_t* data_in_buffer) {

388 // Note the following property of UTF-8 which makes this function possible:	383 // Note the following property of UTF-8 which makes this function possible:

389 // Given any byte, we can always read its local environment (in both	384 // Given any byte, we can always read its local environment (in both

390 // directions) to find out the (possibly multi-byte) character it belongs	385 // directions) to find out the (possibly multi-byte) character it belongs

391 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a	386 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a

392 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or	387 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or

393 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.	388 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.

394	389

395 // First check if we have leftover data from the last chunk.	390 // First check if we have leftover data from the last chunk.

396 unibrow::uchar c;	391 unibrow::uchar c;

397 if (utf8_split_char_buffer_length_ > 0) {	392 if (utf8_split_char_buffer_length_ > 0) {

398 // Move the bytes which are part of the split character (which started in	393 // Move the bytes which are part of the split character (which started in

399 // the previous chunk) into utf8_split_char_buffer_. Note that the	394 // the previous chunk) into utf8_split_char_buffer_. Note that the

400 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.	395 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.

401 while (current_data_offset_ < current_data_length_ &&	396 while (current_data_offset_ < current_data_length_ &&

402 utf8_split_char_buffer_length_ < 4 &&	397 utf8_split_char_buffer_length_ < 4 &&

403 (c = current_data_[current_data_offset_]) >> 6 == 2) {	398 (c = current_data_[current_data_offset_]) >> 6 == 2) {

404 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;	399 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;

405 ++utf8_split_char_buffer_length_;	400 ++utf8_split_char_buffer_length_;

406 ++current_data_offset_;	401 ++current_data_offset_;

407 }	402 }

408	403

409 // Convert the data in utf8_split_char_buffer_.	404 // Convert the data in utf8_split_char_buffer_.

410 unsigned new_offset = 0;	405 size_t new_offset = 0;

411 unsigned new_chars_in_buffer =	406 size_t new_chars_in_buffer =

412 CopyCharsHelper(buffer_ + *data_in_buffer,	407 CopyCharsHelper(buffer_ + *data_in_buffer,

413 kBufferSize - *data_in_buffer, utf8_split_char_buffer_,	408 kBufferSize - *data_in_buffer, utf8_split_char_buffer_,

414 &new_offset, utf8_split_char_buffer_length_, encoding_);	409 &new_offset, utf8_split_char_buffer_length_, encoding_);

415 *data_in_buffer += new_chars_in_buffer;	410 *data_in_buffer += new_chars_in_buffer;

416 // Make sure we used all the data.	411 // Make sure we used all the data.

417 DCHECK(new_offset == utf8_split_char_buffer_length_);	412 DCHECK(new_offset == utf8_split_char_buffer_length_);

418 DCHECK(*data_in_buffer <= kBufferSize);	413 DCHECK(*data_in_buffer <= kBufferSize);

419	414

420 utf8_split_char_buffer_length_ = 0;	415 utf8_split_char_buffer_length_ = 0;

421 }	416 }

(...skipping 10 matching lines...) Expand all Loading...
432 --current_data_length_;	427 --current_data_length_;

433 ++utf8_split_char_buffer_length_;	428 ++utf8_split_char_buffer_length_;

434 if (c >= (3 << 6)) {	429 if (c >= (3 << 6)) {

435 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte	430 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte

436 // character. No need to copy the previous characters into the conversion	431 // character. No need to copy the previous characters into the conversion

437 // buffer (even if they're multi-byte).	432 // buffer (even if they're multi-byte).

438 break;	433 break;

439 }	434 }

440 }	435 }

441 CHECK(utf8_split_char_buffer_length_ <= 4);	436 CHECK(utf8_split_char_buffer_length_ <= 4);

442 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {	437 for (size_t i = 0; i < utf8_split_char_buffer_length_; ++i) {

443 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];	438 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];

444 }	439 }

445 }	440 }

446	441

447	442

448 // ----------------------------------------------------------------------------	443 // ----------------------------------------------------------------------------

449 // ExternalTwoByteStringUtf16CharacterStream	444 // ExternalTwoByteStringUtf16CharacterStream

450	445

451 ExternalTwoByteStringUtf16CharacterStream::	446 ExternalTwoByteStringUtf16CharacterStream::

452 ~ExternalTwoByteStringUtf16CharacterStream() { }	447 ~ExternalTwoByteStringUtf16CharacterStream() { }

453	448

454	449

455 ExternalTwoByteStringUtf16CharacterStream	450 ExternalTwoByteStringUtf16CharacterStream

456 ::ExternalTwoByteStringUtf16CharacterStream(	451 ::ExternalTwoByteStringUtf16CharacterStream(

457 Handle<ExternalTwoByteString> data,	452 Handle<ExternalTwoByteString> data,

458 int start_position,	453 int start_position,

459 int end_position)	454 int end_position)

460 : Utf16CharacterStream(),	455 : Utf16CharacterStream(),

461 source_(data),	456 source_(data),

462 raw_data_(data->GetTwoByteData(start_position)) {	457 raw_data_(data->GetTwoByteData(start_position)) {

463 buffer_cursor_ = raw_data_,	458 buffer_cursor_ = raw_data_,

464 buffer_end_ = raw_data_ + (end_position - start_position);	459 buffer_end_ = raw_data_ + (end_position - start_position);

465 pos_ = start_position;	460 pos_ = start_position;

466 }	461 }

467	462

468 } } // namespace v8::internal	463 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner-character-streams.h ('k') | src/unicode.h » ('j') | no next file with comments »