Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(21)

Side by Side Diff: src/unicode.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: '' Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after
107 public: 107 public:
108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
109 inline Buffer() : data_(0), length_(0) { } 109 inline Buffer() : data_(0), length_(0) { }
110 Data data() { return data_; } 110 Data data() { return data_; }
111 unsigned length() { return length_; } 111 unsigned length() { return length_; }
112 private: 112 private:
113 Data data_; 113 Data data_;
114 unsigned length_; 114 unsigned length_;
115 }; 115 };
116 116
117
118 class Utf16 {
rossberg 2012/03/07 13:32:47 Nit: this doesn't quite fit into the above Utf8 section [...]
Erik Corry 2012/03/11 19:29:22 Done.
119 public:
120 static inline bool IsLeadSurrogate(uchar code) {
121 return (code & 0xfc00) == 0xd800;
122 }
123 static inline bool IsTrailSurrogate(uchar code) {
124 return (code & 0xfc00) == 0xdc00;
125 }
126 static inline int CombineSurrogatePair(uchar lead, uchar trail) {
rossberg 2012/03/07 13:32:47 Isn't int32_t more accurate as result type?
Erik Corry 2012/03/11 19:29:22 Done.
127 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
128 }
129 static const uchar kMaxNonSurrogateCharCode = 0xffff;
130 static inline uchar LeadSurrogate(int char_code) {
rossberg 2012/03/07 13:32:47 Similar here (and below), isn't char_code an int32_t? [...]
Erik Corry 2012/03/11 19:29:22 Done.
131 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
132 }
133 static inline uchar TrailSurrogate(int char_code) {
134 return 0xdc00 + (char_code & 0x3ff);
135 }
136 };
137
138
117 class Utf8 { 139 class Utf8 {
118 public: 140 public:
119 static inline uchar Length(uchar chr); 141 static inline uchar Length(uchar chr, int previous);
120 static inline unsigned Encode(char* out, uchar c); 142 static inline unsigned Encode(
143 char* out, uchar c, int previous);
121 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, 144 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
122 unsigned capacity, unsigned* chars_read, unsigned* offset); 145 unsigned capacity, unsigned* chars_read, unsigned* offset);
123 static uchar CalculateValue(const byte* str, 146 static uchar CalculateValue(const byte* str,
124 unsigned length, 147 unsigned length,
125 unsigned* cursor); 148 unsigned* cursor);
126 static const uchar kBadChar = 0xFFFD; 149 static const uchar kBadChar = 0xFFFD;
127 static const unsigned kMaxEncodedSize = 4; 150 static const unsigned kMaxEncodedSize = 4;
128 static const unsigned kMaxOneByteChar = 0x7f; 151 static const unsigned kMaxOneByteChar = 0x7f;
129 static const unsigned kMaxTwoByteChar = 0x7ff; 152 static const unsigned kMaxTwoByteChar = 0x7ff;
130 static const unsigned kMaxThreeByteChar = 0xffff; 153 static const unsigned kMaxThreeByteChar = 0xffff;
131 static const unsigned kMaxFourByteChar = 0x1fffff; 154 static const unsigned kMaxFourByteChar = 0x1fffff;
132 155
156 static const int kNoPreviousCharacter = -1;
157
158 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
159 // that match are coded as a 4 byte UTF-8 sequence.
160 static const unsigned kBytesSavedByCombiningSurrogates = 2;
161 static const unsigned kSizeOfUnmatchedSurrogate = 3;
162
133 private: 163 private:
134 template <unsigned s> friend class Utf8InputBuffer; 164 template <unsigned s> friend class Utf8InputBuffer;
135 friend class Test; 165 friend class Test;
136 static inline uchar ValueOf(const byte* str, 166 static inline uchar ValueOf(const byte* str,
137 unsigned length, 167 unsigned length,
138 unsigned* cursor); 168 unsigned* cursor);
139 }; 169 };
140 170
141 // --- C h a r a c t e r S t r e a m --- 171 // --- C h a r a c t e r S t r e a m ---
142 172
143 class CharacterStream { 173 class CharacterStream {
144 public: 174 public:
145 inline uchar GetNext(); 175 inline uchar GetNext();
146 inline bool has_more() { return remaining_ != 0; } 176 inline bool has_more() { return remaining_ != 0; }
147 // Note that default implementation is not efficient. 177 // Note that default implementation is not efficient.
148 virtual void Seek(unsigned); 178 virtual void Seek(unsigned);
149 unsigned Length(); 179 unsigned Length();
180 unsigned Utf16Length();
150 virtual ~CharacterStream() { } 181 virtual ~CharacterStream() { }
151 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, 182 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
152 unsigned& offset); 183 unsigned& offset);
153 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, 184 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
154 unsigned capacity, unsigned& offset); 185 unsigned capacity, unsigned& offset);
155 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, 186 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
156 unsigned capacity, unsigned& offset); 187 unsigned capacity, unsigned& offset);
157 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); 188 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
158 virtual void Rewind() = 0; 189 virtual void Rewind() = 0;
190
159 protected: 191 protected:
160 virtual void FillBuffer() = 0; 192 virtual void FillBuffer() = 0;
161 // The number of characters left in the current buffer 193 // The number of characters left in the current buffer
162 unsigned remaining_; 194 unsigned remaining_;
163 // The current offset within the buffer 195 // The current offset within the buffer
164 unsigned cursor_; 196 unsigned cursor_;
165 // The buffer containing the decoded characters. 197 // The buffer containing the decoded characters.
166 const byte* buffer_; 198 const byte* buffer_;
167 }; 199 };
168 200
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
271 static const int kMaxWidth = 1; 303 static const int kMaxWidth = 1;
272 static int Convert(uchar c, 304 static int Convert(uchar c,
273 uchar n, 305 uchar n,
274 uchar* result, 306 uchar* result,
275 bool* allow_caching_ptr); 307 bool* allow_caching_ptr);
276 }; 308 };
277 309
278 } // namespace unibrow 310 } // namespace unibrow
279 311
280 #endif // V8_UNICODE_H_ 312 #endif // V8_UNICODE_H_
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698