src/scanner-character-streams.cc - Issue 708823002: Streaming API: detect UTF-8 BOM.

Side by Side Diff: src/scanner-character-streams.cc

Issue 708823002: Streaming API: detect UTF-8 BOM. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: . Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/v8.h"	5 #include "src/v8.h"

6	6

7 #include "src/scanner-character-streams.h"	7 #include "src/scanner-character-streams.h"

8	8

9 #include "include/v8.h"	9 #include "include/v8.h"

10 #include "src/handles.h"	10 #include "src/handles.h"

(...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
225 }	225 }

226 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,	226 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,

227 raw_data_length_);	227 raw_data_length_);

228 raw_character_position_ = char_position + i;	228 raw_character_position_ = char_position + i;

229 return i;	229 return i;

230 }	230 }

231	231

232	232

233 static const byte kUtf8MultiByteMask = 0xC0;	233 static const byte kUtf8MultiByteMask = 0xC0;

234 static const byte kUtf8MultiByteCharFollower = 0x80;	234 static const byte kUtf8MultiByteCharFollower = 0x80;

	235 static const byte kUtf8MultiByteCharStart = 0xC0;

235	236

236	237

237 #ifdef DEBUG

238 static const byte kUtf8MultiByteCharStart = 0xC0;

239 static bool IsUtf8MultiCharacterStart(byte first_byte) {	238 static bool IsUtf8MultiCharacterStart(byte first_byte) {

240 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;	239 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;

241 }	240 }

242 #endif

243	241

244	242

245 static bool IsUtf8MultiCharacterFollower(byte later_byte) {	243 static bool IsUtf8MultiCharacterFollower(byte later_byte) {

246 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;	244 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;

247 }	245 }

248	246

249	247

250 // Move the cursor back to point at the preceding UTF-8 character start	248 // Move the cursor back to point at the preceding UTF-8 character start

251 // in the buffer.	249 // in the buffer.

252 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {	250 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {

(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
334 // here) and the internal parts which use unsigned. TODO(marja): make the	332 // here) and the internal parts which use unsigned. TODO(marja): make the

335 // internal parts use size_t too.	333 // internal parts use size_t too.

336 current_data_length_ =	334 current_data_length_ =

337 static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));	335 static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));

338 current_data_offset_ = 0;	336 current_data_offset_ = 0;

339 bool data_ends = current_data_length_ == 0;	337 bool data_ends = current_data_length_ == 0;

340	338

341 // A caveat: a data chunk might end with bytes from an incomplete UTF-8	339 // A caveat: a data chunk might end with bytes from an incomplete UTF-8

342 // character (the rest of the bytes will be in the next chunk).	340 // character (the rest of the bytes will be in the next chunk).

343 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {	341 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {

	342 if (first_chunk_) {

	343 // Get rid of the byte order mark (if any).

	344 if (current_data_length_ >= 3 && current_data_[0] == 0xef &&

	345 current_data_[1] == 0xbb && current_data_[2] == 0xbf) {

	346 current_data_offset_ = 3;

	347 }

	348 }

	349

344 HandleUtf8SplitCharacters(&data_in_buffer);	350 HandleUtf8SplitCharacters(&data_in_buffer);

345 if (!data_ends && current_data_offset_ == current_data_length_) {	351 if (!data_ends && current_data_offset_ == current_data_length_) {

346 // The data stream didn't end, but we used all the data in the	352 // The data stream didn't end, but we used all the data in the

347 // chunk. This will only happen when the chunk was really small. We	353 // chunk. This will only happen when the chunk was really small. We

348 // don't handle the case where a UTF-8 character is split over several	354 // don't handle the case where a UTF-8 character is split over several

349 // chunks; in that case V8 won't crash, but it will be a parse error.	355 // chunks; in that case V8 won't crash, but it will be a parse error.

350 delete[] current_data_;	356 delete[] current_data_;

351 current_data_ = NULL;	357 current_data_ = NULL;

352 current_data_length_ = 0;	358 current_data_length_ = 0;

353 current_data_offset_ = 0;	359 current_data_offset_ = 0;

354 continue; // Request a new chunk.	360 continue; // Request a new chunk.

355 }	361 }

356 }	362 }

357	363

358 // Did the data stream end?	364 // Did the data stream end?

359 if (data_ends) {	365 if (data_ends) {

360 DCHECK(utf8_split_char_buffer_length_ == 0);	366 DCHECK(utf8_split_char_buffer_length_ == 0);

361 return data_in_buffer;	367 return data_in_buffer;

362 }	368 }

	369

	370 first_chunk_ = false;

363 }	371 }

364	372

365 // Fill the buffer from current_data_.	373 // Fill the buffer from current_data_.

366 unsigned new_offset = 0;	374 unsigned new_offset = 0;

367 unsigned new_chars_in_buffer =	375 unsigned new_chars_in_buffer =

368 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,	376 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,

369 current_data_ + current_data_offset_, &new_offset,	377 current_data_ + current_data_offset_, &new_offset,

370 current_data_length_ - current_data_offset_, encoding_);	378 current_data_length_ - current_data_offset_, encoding_);

371 data_in_buffer += new_chars_in_buffer;	379 data_in_buffer += new_chars_in_buffer;

372 current_data_offset_ += new_offset;	380 current_data_offset_ += new_offset;

(...skipping 16 matching lines...) Expand all Loading...
389 // Given any byte, we can always read its local environment (in both	397 // Given any byte, we can always read its local environment (in both

390 // directions) to find out the (possibly multi-byte) character it belongs	398 // directions) to find out the (possibly multi-byte) character it belongs

391 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a	399 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a

392 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or	400 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or

393 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.	401 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.

394	402

395 // First check if we have leftover data from the last chunk.	403 // First check if we have leftover data from the last chunk.

396 unibrow::uchar c;	404 unibrow::uchar c;

397 if (utf8_split_char_buffer_length_ > 0) {	405 if (utf8_split_char_buffer_length_ > 0) {

398 // Move the bytes which are part of the split character (which started in	406 // Move the bytes which are part of the split character (which started in

399 // the previous chunk) into utf8_split_char_buffer_. Note that the	407 // the previous chunk) into utf8_split_char_buffer_.

400 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.	408 while (

401 while (current_data_offset_ < current_data_length_ &&	409 current_data_offset_ < current_data_length_ &&

402 utf8_split_char_buffer_length_ < 4 &&	410 utf8_split_char_buffer_length_ < 4 &&

403 (c = current_data_[current_data_offset_]) >> 6 == 2) {	411 IsUtf8MultiCharacterFollower(c = current_data_[current_data_offset_])) {

404 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;	412 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;

405 ++utf8_split_char_buffer_length_;	413 ++utf8_split_char_buffer_length_;

406 ++current_data_offset_;	414 ++current_data_offset_;

407 }	415 }

408	416

409 // Convert the data in utf8_split_char_buffer_.	417 // Convert the data in utf8_split_char_buffer_.

410 unsigned new_offset = 0;	418 unsigned new_offset = 0;

411 unsigned new_chars_in_buffer =	419 unsigned new_chars_in_buffer =

412 CopyCharsHelper(buffer_ + *data_in_buffer,	420 CopyCharsHelper(buffer_ + *data_in_buffer,

413 kBufferSize - *data_in_buffer, utf8_split_char_buffer_,	421 kBufferSize - *data_in_buffer, utf8_split_char_buffer_,

414 &new_offset, utf8_split_char_buffer_length_, encoding_);	422 &new_offset, utf8_split_char_buffer_length_, encoding_);

415 *data_in_buffer += new_chars_in_buffer;	423 *data_in_buffer += new_chars_in_buffer;

416 // Make sure we used all the data.	424 // Make sure we used all the data.

417 DCHECK(new_offset == utf8_split_char_buffer_length_);	425 DCHECK(new_offset == utf8_split_char_buffer_length_);

418 DCHECK(*data_in_buffer <= kBufferSize);	426 DCHECK(*data_in_buffer <= kBufferSize);

419	427

420 utf8_split_char_buffer_length_ = 0;	428 utf8_split_char_buffer_length_ = 0;

421 }	429 }

422	430

423 // Move bytes which are part of an incomplete character from the end of the	431 // Move bytes which are part of an incomplete character from the end of the

424 // current chunk to utf8_split_char_buffer_. They will be converted when the	432 // current chunk to utf8_split_char_buffer_. They will be converted when the

425 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4	433 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4

426 // bytes long, but if the data is invalid, we can have character values bigger	434 // bytes long, but if the data is invalid, we can have character values bigger

427 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.	435 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.

428 while (current_data_length_ > current_data_offset_ &&	436 while (current_data_length_ > current_data_offset_ &&

429 (c = current_data_[current_data_length_ - 1]) >	437 utf8_split_char_buffer_length_ < 4 &&

430 unibrow::Utf8::kMaxOneByteChar &&	438 (IsUtf8MultiCharacterFollower(

431 utf8_split_char_buffer_length_ < 4) {	439 c = current_data_[current_data_length_ - 1]) ||

	440 IsUtf8MultiCharacterStart(c))) {

432 --current_data_length_;	441 --current_data_length_;

433 ++utf8_split_char_buffer_length_;	442 ++utf8_split_char_buffer_length_;

434 if (c >= (3 << 6)) {	443 if (IsUtf8MultiCharacterStart(c)) {

435 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte	444 // This is the first byte of the multi-byte character. No need to copy the

436 // character. No need to copy the previous characters into the conversion	445 // previous characters into the conversion buffer (even if they're

437 // buffer (even if they're multi-byte).	446 // multi-byte).

438 break;	447 break;

439 }	448 }

440 }	449 }

441 CHECK(utf8_split_char_buffer_length_ <= 4);	450 CHECK(utf8_split_char_buffer_length_ <= 4);

442 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {	451 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {

443 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];	452 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];

444 }	453 }

445 }	454 }

446	455

447	456

(...skipping 11 matching lines...) Expand all Loading...
459 int end_position)	468 int end_position)

460 : Utf16CharacterStream(),	469 : Utf16CharacterStream(),

461 source_(data),	470 source_(data),

462 raw_data_(data->GetTwoByteData(start_position)) {	471 raw_data_(data->GetTwoByteData(start_position)) {

463 buffer_cursor_ = raw_data_,	472 buffer_cursor_ = raw_data_,

464 buffer_end_ = raw_data_ + (end_position - start_position);	473 buffer_end_ = raw_data_ + (end_position - start_position);

465 pos_ = start_position;	474 pos_ = start_position;

466 }	475 }

467	476

468 } } // namespace v8::internal	477 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner-character-streams.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »