Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(450)

Side by Side Diff: src/api.cc

Issue 121173009: String::WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master
Patch Set: Abandon refactoring, get core behavior change done Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 4486 matching lines...) Expand 10 before | Expand all | Expand 10 after
4497 int String::Utf8Length() const { 4497 int String::Utf8Length() const {
4498 i::Handle<i::String> str = Utils::OpenHandle(this); 4498 i::Handle<i::String> str = Utils::OpenHandle(this);
4499 i::Isolate* isolate = str->GetIsolate(); 4499 i::Isolate* isolate = str->GetIsolate();
4500 return v8::Utf8Length(*str, isolate); 4500 return v8::Utf8Length(*str, isolate);
4501 } 4501 }
4502 4502
4503 4503
4504 class Utf8WriterVisitor { 4504 class Utf8WriterVisitor {
4505 public: 4505 public:
4506 Utf8WriterVisitor( 4506 Utf8WriterVisitor(
4507 char* buffer, int capacity, bool skip_capacity_check) 4507 char* buffer,
4508 int capacity,
4509 bool skip_capacity_check,
4510 bool allow_invalid_utf8)
4508 : early_termination_(false), 4511 : early_termination_(false),
4509 last_character_(unibrow::Utf16::kNoPreviousCharacter), 4512 last_character_(unibrow::Utf16::kNoPreviousCharacter),
4510 buffer_(buffer), 4513 buffer_(buffer),
4511 start_(buffer), 4514 start_(buffer),
4512 capacity_(capacity), 4515 capacity_(capacity),
4513 skip_capacity_check_(capacity == -1 || skip_capacity_check), 4516 skip_capacity_check_(capacity == -1 || skip_capacity_check),
4517 allow_invalid_utf8_(allow_invalid_utf8),
4514 utf16_chars_read_(0) { 4518 utf16_chars_read_(0) {
4515 } 4519 }
4516 4520
4517 static int WriteEndCharacter(uint16_t character, 4521 static int WriteEndCharacter(uint16_t character,
4518 int last_character, 4522 int last_character,
4519 int remaining, 4523 int remaining,
4520 char* const buffer) { 4524 char* const buffer,
4525 bool allow_invalid_utf8) {
4521 using namespace unibrow; 4526 using namespace unibrow;
4522 ASSERT(remaining > 0); 4527 ASSERT(remaining > 0);
4523 // We can't use a local buffer here because Encode needs to modify 4528 // We can't use a local buffer here because Encode needs to modify
4524 // previous characters in the stream. We know, however, that 4529 // previous characters in the stream. We know, however, that
4525 // exactly one character will be advanced. 4530 // exactly one character will be advanced.
4526 if (Utf16::IsTrailSurrogate(character) && 4531 if (Utf16::IsSurrogatePair(last_character, character)) {
4527 Utf16::IsLeadSurrogate(last_character)) { 4532 int written = Utf8::Encode(buffer,
4528 int written = Utf8::Encode(buffer, character, last_character); 4533 character,
4534 last_character,
4535 allow_invalid_utf8);
4529 ASSERT(written == 1); 4536 ASSERT(written == 1);
4530 return written; 4537 return written;
4531 } 4538 }
4532 // Use a scratch buffer to check the required characters. 4539 // Use a scratch buffer to check the required characters.
4533 char temp_buffer[Utf8::kMaxEncodedSize]; 4540 char temp_buffer[Utf8::kMaxEncodedSize];
4534 // Can't encode using last_character as gcc has array bounds issues. 4541 // Can't encode using last_character as gcc has array bounds issues.
4535 int written = Utf8::Encode(temp_buffer, 4542 int written = Utf8::Encode(temp_buffer,
4536 character, 4543 character,
4537 Utf16::kNoPreviousCharacter); 4544 Utf16::kNoPreviousCharacter,
4545 allow_invalid_utf8);
4538 // Won't fit. 4546 // Won't fit.
4539 if (written > remaining) return 0; 4547 if (written > remaining) return 0;
4540 // Copy over the character from temp_buffer. 4548 // Copy over the character from temp_buffer.
4541 for (int j = 0; j < written; j++) { 4549 for (int j = 0; j < written; j++) {
4542 buffer[j] = temp_buffer[j]; 4550 buffer[j] = temp_buffer[j];
4543 } 4551 }
4544 return written; 4552 return written;
4545 } 4553 }
4546 4554
4547 template<typename Char> 4555 template<typename Char>
(...skipping 26 matching lines...) Expand all
4574 // Write the characters to the stream. 4582 // Write the characters to the stream.
4575 if (sizeof(Char) == 1) { 4583 if (sizeof(Char) == 1) {
4576 for (; i < fast_length; i++) { 4584 for (; i < fast_length; i++) {
4577 buffer += 4585 buffer +=
4578 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++)); 4586 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++));
4579 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); 4587 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
4580 } 4588 }
4581 } else { 4589 } else {
4582 for (; i < fast_length; i++) { 4590 for (; i < fast_length; i++) {
4583 uint16_t character = *chars++; 4591 uint16_t character = *chars++;
4584 buffer += Utf8::Encode(buffer, character, last_character); 4592 buffer += Utf8::Encode(buffer,
4593 character,
4594 last_character,
4595 allow_invalid_utf8_);
4585 last_character = character; 4596 last_character = character;
4586 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); 4597 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
4587 } 4598 }
4588 } 4599 }
4589 // Array is fully written. Exit. 4600 // Array is fully written. Exit.
4590 if (fast_length == length) { 4601 if (fast_length == length) {
4591 // Write state back out to object. 4602 // Write state back out to object.
4592 last_character_ = last_character; 4603 last_character_ = last_character;
4593 buffer_ = buffer; 4604 buffer_ = buffer;
4594 utf16_chars_read_ += length; 4605 utf16_chars_read_ += length;
4595 return; 4606 return;
4596 } 4607 }
4597 } 4608 }
4598 ASSERT(!skip_capacity_check_); 4609 ASSERT(!skip_capacity_check_);
4599 // Slow loop. Must check capacity on each iteration. 4610 // Slow loop. Must check capacity on each iteration.
4600 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_); 4611 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
4601 ASSERT(remaining_capacity >= 0); 4612 ASSERT(remaining_capacity >= 0);
4602 for (; i < length && remaining_capacity > 0; i++) { 4613 for (; i < length && remaining_capacity > 0; i++) {
4603 uint16_t character = *chars++; 4614 uint16_t character = *chars++;
4604 int written = WriteEndCharacter(character, 4615 int written = WriteEndCharacter(character,
4605 last_character, 4616 last_character,
4606 remaining_capacity, 4617 remaining_capacity,
4607 buffer); 4618 buffer,
4619 allow_invalid_utf8_);
4608 if (written == 0) { 4620 if (written == 0) {
4609 early_termination_ = true; 4621 early_termination_ = true;
4610 break; 4622 break;
4611 } 4623 }
4612 buffer += written; 4624 buffer += written;
4613 remaining_capacity -= written; 4625 remaining_capacity -= written;
4614 last_character = character; 4626 last_character = character;
4615 } 4627 }
4616 // Write state back out to object. 4628 // Write state back out to object.
4617 last_character_ = last_character; 4629 last_character_ = last_character;
(...skipping 27 matching lines...) Expand all
4645 return static_cast<int>(buffer_ - start_); 4657 return static_cast<int>(buffer_ - start_);
4646 } 4658 }
4647 4659
4648 private: 4660 private:
4649 bool early_termination_; 4661 bool early_termination_;
4650 int last_character_; 4662 int last_character_;
4651 char* buffer_; 4663 char* buffer_;
4652 char* const start_; 4664 char* const start_;
4653 int capacity_; 4665 int capacity_;
4654 bool const skip_capacity_check_; 4666 bool const skip_capacity_check_;
4667 bool const allow_invalid_utf8_;
4655 int utf16_chars_read_; 4668 int utf16_chars_read_;
4656 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); 4669 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
4657 }; 4670 };
4658 4671
4659 4672
4660 static bool RecursivelySerializeToUtf8(i::String* current, 4673 static bool RecursivelySerializeToUtf8(i::String* current,
4661 Utf8WriterVisitor* writer, 4674 Utf8WriterVisitor* writer,
4662 int recursion_budget) { 4675 int recursion_budget) {
4663 while (!writer->IsDone()) { 4676 while (!writer->IsDone()) {
4664 i::ConsString* cons_string = i::String::VisitFlat(writer, current); 4677 i::ConsString* cons_string = i::String::VisitFlat(writer, current);
(...skipping 18 matching lines...) Expand all
4683 int options) const { 4696 int options) const {
4684 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate(); 4697 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();
4685 LOG_API(isolate, "String::WriteUtf8"); 4698 LOG_API(isolate, "String::WriteUtf8");
4686 ENTER_V8(isolate); 4699 ENTER_V8(isolate);
4687 i::Handle<i::String> str = Utils::OpenHandle(this); 4700 i::Handle<i::String> str = Utils::OpenHandle(this);
4688 if (options & HINT_MANY_WRITES_EXPECTED) { 4701 if (options & HINT_MANY_WRITES_EXPECTED) {
4689 FlattenString(str); // Flatten the string for efficiency. 4702 FlattenString(str); // Flatten the string for efficiency.
4690 } 4703 }
4691 const int string_length = str->length(); 4704 const int string_length = str->length();
4692 bool write_null = !(options & NO_NULL_TERMINATION); 4705 bool write_null = !(options & NO_NULL_TERMINATION);
4706 bool allow_invalid_utf8 = !(options & DISALLOW_INVALID_UTF8);
4693 // First check if we can just write the string without checking capacity. 4707 // First check if we can just write the string without checking capacity.
4708 // @TODO Replace magic number 3 with something more descriptive. E.g.
dcarney 2014/01/10 16:54:33 the syntax we use is: // TODO(username) Comment.
haimuiba 2014/01/13 07:48:21 Done.
4709 // Utf8::kMaxTwoByteSize (as in the maximum size an unsigned 2 byte code unit
4710 // value will take up when encoded to UTF-8)? When I first read this code I
4711 // thought there might be an overflow bug here since UTF-8 may take up to 4
4711 // thought there might be a overflow bug here since UTF-8 may take up to 4
4712 // bytes per code unit. Then I realized that a surrogate pair has a
4713 // str.length of 2, making the code correct.
4694 if (capacity == -1 || capacity / 3 >= string_length) { 4714 if (capacity == -1 || capacity / 3 >= string_length) {
4695 Utf8WriterVisitor writer(buffer, capacity, true); 4715 Utf8WriterVisitor writer(buffer, capacity, true, allow_invalid_utf8);
4696 const int kMaxRecursion = 100; 4716 const int kMaxRecursion = 100;
4697 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); 4717 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
4698 if (success) return writer.CompleteWrite(write_null, nchars_ref); 4718 if (success) return writer.CompleteWrite(write_null, nchars_ref);
4699 } else if (capacity >= string_length) { 4719 } else if (capacity >= string_length) {
4700 // First check that the buffer is large enough. 4720 // First check that the buffer is large enough.
4701 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate()); 4721 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());
4702 if (utf8_bytes <= capacity) { 4722 if (utf8_bytes <= capacity) {
4703 // ASCII fast path. 4723 // ASCII fast path.
4704 if (utf8_bytes == string_length) { 4724 if (utf8_bytes == string_length) {
4705 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options); 4725 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);
4706 if (nchars_ref != NULL) *nchars_ref = string_length; 4726 if (nchars_ref != NULL) *nchars_ref = string_length;
4707 if (write_null && (utf8_bytes+1 <= capacity)) { 4727 if (write_null && (utf8_bytes+1 <= capacity)) {
4708 return string_length + 1; 4728 return string_length + 1;
4709 } 4729 }
4710 return string_length; 4730 return string_length;
4711 } 4731 }
4712 if (write_null && (utf8_bytes+1 > capacity)) { 4732 if (write_null && (utf8_bytes+1 > capacity)) {
4713 options |= NO_NULL_TERMINATION; 4733 options |= NO_NULL_TERMINATION;
4714 } 4734 }
4715 // Recurse once without a capacity limit. 4735 // Recurse once without a capacity limit.
4716 // This will get into the first branch above. 4736 // This will get into the first branch above.
4717 // TODO(dcarney) Check max left rec. in Utf8Length and fall through. 4737 // TODO(dcarney) Check max left rec. in Utf8Length and fall through.
4718 return WriteUtf8(buffer, -1, nchars_ref, options); 4738 return WriteUtf8(buffer, -1, nchars_ref, options);
4719 } 4739 }
4720 } 4740 }
4721 // Recursive slow path can potentially be unreasonably slow. Flatten. 4741 // Recursive slow path can potentially be unreasonably slow. Flatten.
4722 str = FlattenGetString(str); 4742 str = FlattenGetString(str);
4723 Utf8WriterVisitor writer(buffer, capacity, false); 4743 Utf8WriterVisitor writer(buffer, capacity, false, allow_invalid_utf8);
4724 i::String::VisitFlat(&writer, *str); 4744 i::String::VisitFlat(&writer, *str);
4725 return writer.CompleteWrite(write_null, nchars_ref); 4745 return writer.CompleteWrite(write_null, nchars_ref);
dcarney 2014/01/10 16:49:55 think you need to ensure that all return points fr
haimuiba 2014/01/13 07:48:21 Not sure I understand. If allow_invalid_utf8=false
4726 } 4746 }
4727 4747
4728 4748
4729 template<typename CharType> 4749 template<typename CharType>
4730 static inline int WriteHelper(const String* string, 4750 static inline int WriteHelper(const String* string,
4731 CharType* buffer, 4751 CharType* buffer,
4732 int start, 4752 int start,
4733 int length, 4753 int length,
4734 int options) { 4754 int options) {
4735 i::Isolate* isolate = Utils::OpenHandle(string)->GetIsolate(); 4755 i::Isolate* isolate = Utils::OpenHandle(string)->GetIsolate();
(...skipping 2779 matching lines...) Expand 10 before | Expand all | Expand 10 after
7515 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate()); 7535 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate());
7516 Address callback_address = 7536 Address callback_address =
7517 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback)); 7537 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback));
7518 VMState<EXTERNAL> state(isolate); 7538 VMState<EXTERNAL> state(isolate);
7519 ExternalCallbackScope call_scope(isolate, callback_address); 7539 ExternalCallbackScope call_scope(isolate, callback_address);
7520 callback(info); 7540 callback(info);
7521 } 7541 }
7522 7542
7523 7543
7524 } } // namespace v8::internal 7544 } } // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698