Index: src/unicode-inl.h |
diff --git a/src/unicode-inl.h b/src/unicode-inl.h |
index f861f9f2d47449945d62a6fbc8044abbcd0b2a2b..3d1c503cbef783d1bd5cd8ae30bb02273a55973d 100644 |
--- a/src/unicode-inl.h |
+++ b/src/unicode-inl.h |
@@ -107,8 +107,17 @@ unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { |
return 2; |
} |
+// Encodes the Unicode code point c as UTF-8 into the buffer str. Unless |
+// allow_invalid is true, surrogate code points are replaced with |
+// kReplacementCharacter. The caller must combine surrogate pairs into |
+// code points before calling Encode. |
+unsigned Utf8::Encode(char* str, uchar c, bool allow_invalid) { |
+ if (!allow_invalid && |
dcarney
2014/01/07 10:12:16
move this block down into the kMaxThreeByteChar cl
|
+ (Utf16::IsLeadSurrogate(c) || |
+ Utf16::IsTrailSurrogate(c))) { |
+ c = kReplacementCharacter; |
+ } |
-unsigned Utf8::Encode(char* str, uchar c, int previous) { |
static const int kMask = ~(1 << 6); |
if (c <= kMaxOneByteChar) { |
str[0] = c; |
@@ -118,13 +127,6 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) { |
str[1] = 0x80 | (c & kMask); |
return 2; |
} else if (c <= kMaxThreeByteChar) { |
- if (Utf16::IsTrailSurrogate(c) && |
- Utf16::IsLeadSurrogate(previous)) { |
- const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; |
- return Encode(str - kUnmatchedSize, |
- Utf16::CombineSurrogatePair(previous, c), |
- Utf16::kNoPreviousCharacter) - kUnmatchedSize; |
- } |
str[0] = 0xE0 | (c >> 12); |
str[1] = 0x80 | ((c >> 6) & kMask); |
str[2] = 0x80 | (c & kMask); |
@@ -150,6 +152,7 @@ uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { |
return CalculateValue(bytes, length, cursor); |
} |
+// TODO: give Length the same semantics as Encode? |
dcarney
2014/01/07 11:05:50
i don't see an easy way to do this. You'd have to
|
unsigned Utf8::Length(uchar c, int previous) { |
if (c <= kMaxOneByteChar) { |
return 1; |