Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(558)

Side by Side Diff: src/api.cc

Issue 121173009: String::WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master
Patch Set: Rebase Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « include/v8.h ('k') | src/unicode.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 4436 matching lines...) Expand 10 before | Expand all | Expand 10 after
4447 int String::Utf8Length() const { 4447 int String::Utf8Length() const {
4448 i::Handle<i::String> str = Utils::OpenHandle(this); 4448 i::Handle<i::String> str = Utils::OpenHandle(this);
4449 i::Isolate* isolate = str->GetIsolate(); 4449 i::Isolate* isolate = str->GetIsolate();
4450 return v8::Utf8Length(*str, isolate); 4450 return v8::Utf8Length(*str, isolate);
4451 } 4451 }
4452 4452
4453 4453
4454 class Utf8WriterVisitor { 4454 class Utf8WriterVisitor {
4455 public: 4455 public:
4456 Utf8WriterVisitor( 4456 Utf8WriterVisitor(
4457 char* buffer, int capacity, bool skip_capacity_check) 4457 char* buffer,
4458 : early_termination_(false), 4458 int capacity,
4459 last_character_(unibrow::Utf16::kNoPreviousCharacter), 4459 bool skip_capacity_check,
4460 buffer_(buffer), 4460 bool replace_invalid_utf8)
4461 start_(buffer), 4461 : early_termination_(false),
4462 capacity_(capacity), 4462 last_character_(unibrow::Utf16::kNoPreviousCharacter),
4463 skip_capacity_check_(capacity == -1 || skip_capacity_check), 4463 buffer_(buffer),
4464 utf16_chars_read_(0) { 4464 start_(buffer),
4465 capacity_(capacity),
4466 skip_capacity_check_(capacity == -1 || skip_capacity_check),
4467 replace_invalid_utf8_(replace_invalid_utf8),
4468 utf16_chars_read_(0) {
4465 } 4469 }
4466 4470
4467 static int WriteEndCharacter(uint16_t character, 4471 static int WriteEndCharacter(uint16_t character,
4468 int last_character, 4472 int last_character,
4469 int remaining, 4473 int remaining,
4470 char* const buffer) { 4474 char* const buffer,
4475 bool replace_invalid_utf8) {
4471 using namespace unibrow; 4476 using namespace unibrow;
4472 ASSERT(remaining > 0); 4477 ASSERT(remaining > 0);
4473 // We can't use a local buffer here because Encode needs to modify 4478 // We can't use a local buffer here because Encode needs to modify
4474 // previous characters in the stream. We know, however, that 4479 // previous characters in the stream. We know, however, that
4475 // exactly one character will be advanced. 4480 // exactly one character will be advanced.
4476 if (Utf16::IsTrailSurrogate(character) && 4481 if (Utf16::IsSurrogatePair(last_character, character)) {
4477 Utf16::IsLeadSurrogate(last_character)) { 4482 int written = Utf8::Encode(buffer,
4478 int written = Utf8::Encode(buffer, character, last_character); 4483 character,
4484 last_character,
4485 replace_invalid_utf8);
4479 ASSERT(written == 1); 4486 ASSERT(written == 1);
4480 return written; 4487 return written;
4481 } 4488 }
4482 // Use a scratch buffer to check the required characters. 4489 // Use a scratch buffer to check the required characters.
4483 char temp_buffer[Utf8::kMaxEncodedSize]; 4490 char temp_buffer[Utf8::kMaxEncodedSize];
4484 // Can't encode using last_character as gcc has array bounds issues. 4491 // Can't encode using last_character as gcc has array bounds issues.
4485 int written = Utf8::Encode(temp_buffer, 4492 int written = Utf8::Encode(temp_buffer,
4486 character, 4493 character,
4487 Utf16::kNoPreviousCharacter); 4494 Utf16::kNoPreviousCharacter,
4495 replace_invalid_utf8);
4488 // Won't fit. 4496 // Won't fit.
4489 if (written > remaining) return 0; 4497 if (written > remaining) return 0;
4490 // Copy over the character from temp_buffer. 4498 // Copy over the character from temp_buffer.
4491 for (int j = 0; j < written; j++) { 4499 for (int j = 0; j < written; j++) {
4492 buffer[j] = temp_buffer[j]; 4500 buffer[j] = temp_buffer[j];
4493 } 4501 }
4494 return written; 4502 return written;
4495 } 4503 }
4496 4504
4505 // Visit writes out a group of code units (chars) of a v8::String to the
4506 // internal buffer_. This is done in two phases. The first phase calculates a
4507 // pessimistic estimate (writable_length) on how many code units can be safely
4508 // written without exceeding the buffer capacity and without writing the last
4509 // code unit (it could be a lead surrogate). The estimated number of code
4510 // units is then written out in one go, and the reported byte usage is used
4511 // to correct the estimate. This is repeated until the estimate becomes <= 0
4512 // or all code units have been written out. The second phase writes out code
4513 // units until the buffer capacity is reached, would be exceeded by the next
4514 // unit, or all units have been written out.
4497 template<typename Char> 4515 template<typename Char>
4498 void Visit(const Char* chars, const int length) { 4516 void Visit(const Char* chars, const int length) {
4499 using namespace unibrow; 4517 using namespace unibrow;
4500 ASSERT(!early_termination_); 4518 ASSERT(!early_termination_);
4501 if (length == 0) return; 4519 if (length == 0) return;
4502 // Copy state to stack. 4520 // Copy state to stack.
4503 char* buffer = buffer_; 4521 char* buffer = buffer_;
4504 int last_character = 4522 int last_character =
4505 sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_; 4523 sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;
4506 int i = 0; 4524 int i = 0;
(...skipping 17 matching lines...) Expand all
4524 // Write the characters to the stream. 4542 // Write the characters to the stream.
4525 if (sizeof(Char) == 1) { 4543 if (sizeof(Char) == 1) {
4526 for (; i < fast_length; i++) { 4544 for (; i < fast_length; i++) {
4527 buffer += 4545 buffer +=
4528 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++)); 4546 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++));
4529 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); 4547 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
4530 } 4548 }
4531 } else { 4549 } else {
4532 for (; i < fast_length; i++) { 4550 for (; i < fast_length; i++) {
4533 uint16_t character = *chars++; 4551 uint16_t character = *chars++;
4534 buffer += Utf8::Encode(buffer, character, last_character); 4552 buffer += Utf8::Encode(buffer,
4553 character,
4554 last_character,
4555 replace_invalid_utf8_);
4535 last_character = character; 4556 last_character = character;
4536 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); 4557 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
4537 } 4558 }
4538 } 4559 }
4539 // Array is fully written. Exit. 4560 // Array is fully written. Exit.
4540 if (fast_length == length) { 4561 if (fast_length == length) {
4541 // Write state back out to object. 4562 // Write state back out to object.
4542 last_character_ = last_character; 4563 last_character_ = last_character;
4543 buffer_ = buffer; 4564 buffer_ = buffer;
4544 utf16_chars_read_ += length; 4565 utf16_chars_read_ += length;
4545 return; 4566 return;
4546 } 4567 }
4547 } 4568 }
4548 ASSERT(!skip_capacity_check_); 4569 ASSERT(!skip_capacity_check_);
4549 // Slow loop. Must check capacity on each iteration. 4570 // Slow loop. Must check capacity on each iteration.
4550 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_); 4571 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
4551 ASSERT(remaining_capacity >= 0); 4572 ASSERT(remaining_capacity >= 0);
4552 for (; i < length && remaining_capacity > 0; i++) { 4573 for (; i < length && remaining_capacity > 0; i++) {
4553 uint16_t character = *chars++; 4574 uint16_t character = *chars++;
4575 // remaining_capacity is <= 3 bytes at this point, so we do not write out
4576 // an unmatched lead surrogate.
4577 if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) {
4578 early_termination_ = true;
4579 break;
4580 }
4554 int written = WriteEndCharacter(character, 4581 int written = WriteEndCharacter(character,
4555 last_character, 4582 last_character,
4556 remaining_capacity, 4583 remaining_capacity,
4557 buffer); 4584 buffer,
4585 replace_invalid_utf8_);
4558 if (written == 0) { 4586 if (written == 0) {
4559 early_termination_ = true; 4587 early_termination_ = true;
4560 break; 4588 break;
4561 } 4589 }
4562 buffer += written; 4590 buffer += written;
4563 remaining_capacity -= written; 4591 remaining_capacity -= written;
4564 last_character = character; 4592 last_character = character;
4565 } 4593 }
4566 // Write state back out to object. 4594 // Write state back out to object.
4567 last_character_ = last_character; 4595 last_character_ = last_character;
(...skipping 27 matching lines...) Expand all
4595 return static_cast<int>(buffer_ - start_); 4623 return static_cast<int>(buffer_ - start_);
4596 } 4624 }
4597 4625
4598 private: 4626 private:
4599 bool early_termination_; 4627 bool early_termination_;
4600 int last_character_; 4628 int last_character_;
4601 char* buffer_; 4629 char* buffer_;
4602 char* const start_; 4630 char* const start_;
4603 int capacity_; 4631 int capacity_;
4604 bool const skip_capacity_check_; 4632 bool const skip_capacity_check_;
4633 bool const replace_invalid_utf8_;
4605 int utf16_chars_read_; 4634 int utf16_chars_read_;
4606 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); 4635 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
4607 }; 4636 };
4608 4637
4609 4638
4610 static bool RecursivelySerializeToUtf8(i::String* current, 4639 static bool RecursivelySerializeToUtf8(i::String* current,
4611 Utf8WriterVisitor* writer, 4640 Utf8WriterVisitor* writer,
4612 int recursion_budget) { 4641 int recursion_budget) {
4613 while (!writer->IsDone()) { 4642 while (!writer->IsDone()) {
4614 i::ConsString* cons_string = i::String::VisitFlat(writer, current); 4643 i::ConsString* cons_string = i::String::VisitFlat(writer, current);
(...skipping 18 matching lines...) Expand all
4633 int options) const { 4662 int options) const {
4634 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate(); 4663 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();
4635 LOG_API(isolate, "String::WriteUtf8"); 4664 LOG_API(isolate, "String::WriteUtf8");
4636 ENTER_V8(isolate); 4665 ENTER_V8(isolate);
4637 i::Handle<i::String> str = Utils::OpenHandle(this); 4666 i::Handle<i::String> str = Utils::OpenHandle(this);
4638 if (options & HINT_MANY_WRITES_EXPECTED) { 4667 if (options & HINT_MANY_WRITES_EXPECTED) {
4639 FlattenString(str); // Flatten the string for efficiency. 4668 FlattenString(str); // Flatten the string for efficiency.
4640 } 4669 }
4641 const int string_length = str->length(); 4670 const int string_length = str->length();
4642 bool write_null = !(options & NO_NULL_TERMINATION); 4671 bool write_null = !(options & NO_NULL_TERMINATION);
4672 bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
4673 int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
4643 // First check if we can just write the string without checking capacity. 4674 // First check if we can just write the string without checking capacity.
4644 if (capacity == -1 || capacity / 3 >= string_length) { 4675 if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
4645 Utf8WriterVisitor writer(buffer, capacity, true); 4676 Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);
4646 const int kMaxRecursion = 100; 4677 const int kMaxRecursion = 100;
4647 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); 4678 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
4648 if (success) return writer.CompleteWrite(write_null, nchars_ref); 4679 if (success) return writer.CompleteWrite(write_null, nchars_ref);
4649 } else if (capacity >= string_length) { 4680 } else if (capacity >= string_length) {
4650 // First check that the buffer is large enough. 4681 // First check that the buffer is large enough.
4651 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate()); 4682 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());
4652 if (utf8_bytes <= capacity) { 4683 if (utf8_bytes <= capacity) {
4653 // ASCII fast path. 4684 // ASCII fast path.
4654 if (utf8_bytes == string_length) { 4685 if (utf8_bytes == string_length) {
4655 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options); 4686 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);
4656 if (nchars_ref != NULL) *nchars_ref = string_length; 4687 if (nchars_ref != NULL) *nchars_ref = string_length;
4657 if (write_null && (utf8_bytes+1 <= capacity)) { 4688 if (write_null && (utf8_bytes+1 <= capacity)) {
4658 return string_length + 1; 4689 return string_length + 1;
4659 } 4690 }
4660 return string_length; 4691 return string_length;
4661 } 4692 }
4662 if (write_null && (utf8_bytes+1 > capacity)) { 4693 if (write_null && (utf8_bytes+1 > capacity)) {
4663 options |= NO_NULL_TERMINATION; 4694 options |= NO_NULL_TERMINATION;
4664 } 4695 }
4665 // Recurse once without a capacity limit. 4696 // Recurse once without a capacity limit.
4666 // This will get into the first branch above. 4697 // This will get into the first branch above.
4667 // TODO(dcarney) Check max left rec. in Utf8Length and fall through. 4698 // TODO(dcarney) Check max left rec. in Utf8Length and fall through.
4668 return WriteUtf8(buffer, -1, nchars_ref, options); 4699 return WriteUtf8(buffer, -1, nchars_ref, options);
4669 } 4700 }
4670 } 4701 }
4671 // Recursive slow path can potentially be unreasonably slow. Flatten. 4702 // Recursive slow path can potentially be unreasonably slow. Flatten.
4672 str = FlattenGetString(str); 4703 str = FlattenGetString(str);
4673 Utf8WriterVisitor writer(buffer, capacity, false); 4704 Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
4674 i::String::VisitFlat(&writer, *str); 4705 i::String::VisitFlat(&writer, *str);
4675 return writer.CompleteWrite(write_null, nchars_ref); 4706 return writer.CompleteWrite(write_null, nchars_ref);
4676 } 4707 }
4677 4708
4678 4709
4679 template<typename CharType> 4710 template<typename CharType>
4680 static inline int WriteHelper(const String* string, 4711 static inline int WriteHelper(const String* string,
4681 CharType* buffer, 4712 CharType* buffer,
4682 int start, 4713 int start,
4683 int length, 4714 int length,
(...skipping 2739 matching lines...) Expand 10 before | Expand all | Expand 10 after
7423 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate()); 7454 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate());
7424 Address callback_address = 7455 Address callback_address =
7425 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback)); 7456 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback));
7426 VMState<EXTERNAL> state(isolate); 7457 VMState<EXTERNAL> state(isolate);
7427 ExternalCallbackScope call_scope(isolate, callback_address); 7458 ExternalCallbackScope call_scope(isolate, callback_address);
7428 callback(info); 7459 callback(info);
7429 } 7460 }
7430 7461
7431 7462
7432 } } // namespace v8::internal 7463 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « include/v8.h ('k') | src/unicode.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698