src/unicode-inl.h - Issue 121173009: String::WriteUtf8: Add REPLACE_INVALID_UTF8 option

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Unified Diff: src/unicode-inl.h

Issue 121173009: String::WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: DISALLOW_INVALID_UTF8 flag and fixes Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/unicode-inl.h

diff --git a/src/unicode-inl.h b/src/unicode-inl.h

index f861f9f2d47449945d62a6fbc8044abbcd0b2a2b..3d1c503cbef783d1bd5cd8ae30bb02273a55973d 100644

--- a/src/unicode-inl.h

+++ b/src/unicode-inl.h

@@ -107,8 +107,17 @@ unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {

return 2;

}

+// Encode encodes the Unicode code point c into the given str buffer. Unless

+// allow_invalid is set to true, surrogate code points will be replaced with

+// kReplacementCharacter. The caller is required to combine surrogate pairs

+// into code points before calling Encode.

+unsigned Utf8::Encode(char* str, uchar c, bool allow_invalid) {

+ if (!allow_invalid &&

dcarney 2014/01/07 10:12:16 move this block down into the kMaxThreeByteChar cl…

+ (Utf16::IsLeadSurrogate(c) ||

+ Utf16::IsTrailSurrogate(c))) {

+ c = kReplacementCharacter;

+ }

-unsigned Utf8::Encode(char* str, uchar c, int previous) {

static const int kMask = ~(1 << 6);

if (c <= kMaxOneByteChar) {

str[0] = c;

@@ -118,13 +127,6 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) {

str[1] = 0x80 | (c & kMask);

return 2;

} else if (c <= kMaxThreeByteChar) {

- if (Utf16::IsTrailSurrogate(c) &&

- Utf16::IsLeadSurrogate(previous)) {

- const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;

- return Encode(str - kUnmatchedSize,

- Utf16::CombineSurrogatePair(previous, c),

- Utf16::kNoPreviousCharacter) - kUnmatchedSize;

- }

str[0] = 0xE0 | (c >> 12);

str[1] = 0x80 | ((c >> 6) & kMask);

str[2] = 0x80 | (c & kMask);

@@ -150,6 +152,7 @@ uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {

return CalculateValue(bytes, length, cursor);

}

+// @TODO give this the same semantics as Encode?

dcarney 2014/01/07 11:05:50 I don't see an easy way to do this. You'd have to…

unsigned Utf8::Length(uchar c, int previous) {

if (c <= kMaxOneByteChar) {

return 1;

« src/api.cc ('K') | « src/unicode.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »