Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(113)

Unified Diff: src/unicode.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: '' Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/unicode.h
===================================================================
--- src/unicode.h (revision 10944)
+++ src/unicode.h (working copy)
@@ -114,10 +114,33 @@
unsigned length_;
};
+
+class Utf16 {
rossberg 2012/03/07 13:32:47 Nit: this doesn't quite fit into the above Utf8 se
Erik Corry 2012/03/11 19:29:22 Done.
+ public:
+ static inline bool IsLeadSurrogate(uchar code) {
+ return (code & 0xfc00) == 0xd800;
+ }
+ static inline bool IsTrailSurrogate(uchar code) {
+ return (code & 0xfc00) == 0xdc00;
+ }
+ static inline int CombineSurrogatePair(uchar lead, uchar trail) {
rossberg 2012/03/07 13:32:47 Isn't int32_t more accurate as result type?
Erik Corry 2012/03/11 19:29:22 Done.
+ return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
+ }
+ static const uchar kMaxNonSurrogateCharCode = 0xffff;
+ static inline uchar LeadSurrogate(int char_code) {
rossberg 2012/03/07 13:32:47 Similar here (and below), isn't char_code an int32
Erik Corry 2012/03/11 19:29:22 Done.
+ return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
+ }
+ static inline uchar TrailSurrogate(int char_code) {
+ return 0xdc00 + (char_code & 0x3ff);
+ }
+};
+
+
class Utf8 {
public:
- static inline uchar Length(uchar chr);
- static inline unsigned Encode(char* out, uchar c);
+ static inline uchar Length(uchar chr, int previous);
+ static inline unsigned Encode(
+ char* out, uchar c, int previous);
static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
unsigned capacity, unsigned* chars_read, unsigned* offset);
static uchar CalculateValue(const byte* str,
@@ -130,6 +153,13 @@
static const unsigned kMaxThreeByteChar = 0xffff;
static const unsigned kMaxFourByteChar = 0x1fffff;
+ static const int kNoPreviousCharacter = -1;
+
+ // An unmatched surrogate is coded as a 3-byte UTF-8 sequence, but a matching
+ // lead/trail pair is coded as one 4-byte sequence, saving 2 bytes (6 - 4).
+ static const unsigned kBytesSavedByCombiningSurrogates = 2;
+ static const unsigned kSizeOfUnmatchedSurrogate = 3;
+
private:
template <unsigned s> friend class Utf8InputBuffer;
friend class Test;
@@ -147,6 +177,7 @@
// Note that default implementation is not efficient.
virtual void Seek(unsigned);
unsigned Length();
+ unsigned Utf16Length();
virtual ~CharacterStream() { }
static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
unsigned& offset);
@@ -156,6 +187,7 @@
unsigned capacity, unsigned& offset);
static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
virtual void Rewind() = 0;
+
protected:
virtual void FillBuffer() = 0;
// The number of characters left in the current buffer

Powered by Google App Engine
This is Rietveld 408576698