Index: src/unicode.h |
diff --git a/src/unicode.h b/src/unicode.h |
index 6ba61d0e17b2a0b6ff7702c422c38ddf9276318a..bb5506d38e2531b0f60fd73f8676c3ad77277ab7 100644 |
--- a/src/unicode.h |
+++ b/src/unicode.h |
@@ -102,6 +102,9 @@ class UnicodeData { |
class Utf16 { |
public: |
+ static inline bool IsSurrogatePair(int lead, int trail) { /* True only when lead passes IsLeadSurrogate and trail passes IsTrailSurrogate. */ |
+ return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); |
+ } |
static inline bool IsLeadSurrogate(int code) { |
if (code == kNoPreviousCharacter) return false; |
return (code & 0xfc00) == 0xd800; |
@@ -146,11 +149,16 @@ class Utf8 { |
public: |
static inline uchar Length(uchar chr, int previous); |
static inline unsigned EncodeOneByte(char* out, uint8_t c); |
- static inline unsigned Encode( |
- char* out, uchar c, int previous); |
+ static inline unsigned Encode(char* out, |
+ uchar c, |
+ int previous, |
+ bool replace_invalid = false); |
static uchar CalculateValue(const byte* str, |
unsigned length, |
unsigned* cursor); |
+ |
+ // The Unicode replacement character (U+FFFD), used to signal invalid Unicode |
+ // sequences (e.g. an orphan surrogate) when converting to UTF-8. |
static const uchar kBadChar = 0xFFFD; |
static const unsigned kMaxEncodedSize = 4; |
static const unsigned kMaxOneByteChar = 0x7f; |
@@ -162,6 +170,9 @@ class Utf8 { |
// that match are coded as a 4 byte UTF-8 sequence. |
static const unsigned kBytesSavedByCombiningSurrogates = 2; |
static const unsigned kSizeOfUnmatchedSurrogate = 3; |
+ // The maximum number of bytes a single UTF-16 code unit may take up when |
+ // encoded as UTF-8. |
+ static const unsigned kMax16BitCodeUnitSize = 3; |
static inline uchar ValueOf(const byte* str, |
unsigned length, |
unsigned* cursor); |