Index: src/unicode.h |
=================================================================== |
--- src/unicode.h (revision 10944) |
+++ src/unicode.h (working copy) |
@@ -114,10 +114,33 @@ |
unsigned length_; |
}; |
+ |
+class Utf16 { |
rossberg
2012/03/07 13:32:47
Nit: this doesn't quite fit into the above Utf8 se
Erik Corry
2012/03/11 19:29:22
Done.
|
+ public: |
+ // True when the bits of code selected by 0xfc00 carry the 0xd800 tag. |
+ static inline bool IsLeadSurrogate(uchar code) { |
+ return 0xd800 == (code & 0xfc00); |
+ } |
+ // True when the bits of code selected by 0xfc00 carry the 0xdc00 tag. |
+ static inline bool IsTrailSurrogate(uchar code) { |
+ return 0xdc00 == (code & 0xfc00); |
+ } |
+ // Places the low 10 bits of lead above the low 10 bits of trail and adds |
+ // 0x10000 to the combined value. |
+ static inline int CombineSurrogatePair(uchar lead, uchar trail) { |
rossberg
2012/03/07 13:32:47
Isn't int32_t more accurate as result type?
Erik Corry
2012/03/11 19:29:22
Done.
|
+ const uchar upper_half = (lead & 0x3ff) << 10; |
+ const uchar lower_half = trail & 0x3ff; |
+ return 0x10000 + upper_half + lower_half; |
+ } |
+ // Largest character code named here as not needing a surrogate pair. |
+ static const uchar kMaxNonSurrogateCharCode = 0xffff; |
+ // Returns 0xd800 plus bits 10-19 of (char_code - 0x10000). |
+ static inline uchar LeadSurrogate(int char_code) { |
rossberg
2012/03/07 13:32:47
Similar here (and below), isn't char_code an int32
Erik Corry
2012/03/11 19:29:22
Done.
|
+ const int offset = char_code - 0x10000; |
+ const int top_ten_bits = (offset >> 10) & 0x3ff; |
+ return 0xd800 + top_ten_bits; |
+ } |
+ // Returns 0xdc00 plus the low 10 bits of char_code. |
+ static inline uchar TrailSurrogate(int char_code) { |
+ const int low_ten_bits = char_code & 0x3ff; |
+ return 0xdc00 + low_ten_bits; |
+ } |
+}; |
+ |
+ |
class Utf8 { |
public: |
- static inline uchar Length(uchar chr); |
- static inline unsigned Encode(char* out, uchar c); |
+ static inline uchar Length(uchar chr, int previous); |
+ static inline unsigned Encode( |
+ char* out, uchar c, int previous); |
static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, |
unsigned capacity, unsigned* chars_read, unsigned* offset); |
static uchar CalculateValue(const byte* str, |
@@ -130,6 +153,13 @@ |
static const unsigned kMaxThreeByteChar = 0xffff; |
static const unsigned kMaxFourByteChar = 0x1fffff; |
+ static const int kNoPreviousCharacter = -1; |
+ |
+ // An unpaired surrogate is encoded as a 3-byte UTF-8 sequence, whereas a |
+ // matching lead/trail pair is encoded together as one 4-byte sequence, so |
+ // combining a pair saves 2 bytes (2 * 3 - 4). |
+ static const unsigned kBytesSavedByCombiningSurrogates = 2; |
+ static const unsigned kSizeOfUnmatchedSurrogate = 3; |
+ |
private: |
template <unsigned s> friend class Utf8InputBuffer; |
friend class Test; |
@@ -147,6 +177,7 @@ |
// Note that the default implementation is not efficient. |
virtual void Seek(unsigned); |
unsigned Length(); |
+ unsigned Utf16Length(); |
virtual ~CharacterStream() { } |
static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, |
unsigned& offset); |
@@ -156,6 +187,7 @@ |
unsigned capacity, unsigned& offset); |
static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); |
virtual void Rewind() = 0; |
+ |
protected: |
virtual void FillBuffer() = 0; |
// The number of characters left in the current buffer |