Index: src/unicode-inl.h |
diff --git a/src/unicode-inl.h b/src/unicode-inl.h |
index f861f9f2d47449945d62a6fbc8044abbcd0b2a2b..99eca644b7030b9b75eaa1564718dc085c5b3adb 100644 |
--- a/src/unicode-inl.h |
+++ b/src/unicode-inl.h |
@@ -107,8 +107,14 @@ unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { |
return 2; |
} |
- |
-unsigned Utf8::Encode(char* str, uchar c, int previous) { |
+// Encode encodes the UTF-16 code units c and previous into the given str |
+// buffer, and combines surrogate code units into single code points. If |
+// replace_invalid is set to true, orphan surrogate code units will be replaced |
+// with kBadChar. |
+unsigned Utf8::Encode(char* str, |
+ uchar c, |
+ int previous, |
+ bool replace_invalid) { |
static const int kMask = ~(1 << 6); |
if (c <= kMaxOneByteChar) { |
str[0] = c; |
@@ -118,12 +124,16 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) { |
str[1] = 0x80 | (c & kMask); |
return 2; |
} else if (c <= kMaxThreeByteChar) { |
- if (Utf16::IsTrailSurrogate(c) && |
- Utf16::IsLeadSurrogate(previous)) { |
+ if (Utf16::IsSurrogatePair(previous, c)) { |
const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; |
return Encode(str - kUnmatchedSize, |
Utf16::CombineSurrogatePair(previous, c), |
- Utf16::kNoPreviousCharacter) - kUnmatchedSize; |
+ Utf16::kNoPreviousCharacter, |
+ replace_invalid) - kUnmatchedSize; |
+ } else if (replace_invalid && |
+ (Utf16::IsLeadSurrogate(c) || |
+ Utf16::IsTrailSurrogate(c))) { |
+ c = kBadChar; |
} |
str[0] = 0xE0 | (c >> 12); |
str[1] = 0x80 | ((c >> 6) & kMask); |