OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef V8_UNICODE_H_ | 5 #ifndef V8_UNICODE_H_ |
6 #define V8_UNICODE_H_ | 6 #define V8_UNICODE_H_ |
7 | 7 |
8 #include <sys/types.h> | 8 #include <sys/types.h> |
9 #include "src/globals.h" | 9 #include "src/globals.h" |
10 /** | 10 /** |
(...skipping 26 matching lines...) Expand all Loading... |
37 : code_point_(code_point), | 37 : code_point_(code_point), |
38 value_(value) { } | 38 value_(value) { } |
39 uchar code_point_ : 21; | 39 uchar code_point_ : 21; |
40 bool value_ : 1; | 40 bool value_ : 1; |
41 }; | 41 }; |
42 static const int kSize = size; | 42 static const int kSize = size; |
43 static const int kMask = kSize - 1; | 43 static const int kMask = kSize - 1; |
44 CacheEntry entries_[kSize]; | 44 CacheEntry entries_[kSize]; |
45 }; | 45 }; |
46 | 46 |
| 47 |
47 // A cache used in case conversion. It caches the value for characters | 48 // A cache used in case conversion. It caches the value for characters |
48 // that either have no mapping or map to a single character independent | 49 // that either have no mapping or map to a single character independent |
49 // of context. Characters that map to more than one character or that | 50 // of context. Characters that map to more than one character or that |
50 // map differently depending on context are always looked up. | 51 // map differently depending on context are always looked up. |
51 template <class T, int size = 256> | 52 template <class T, int size = 256> |
52 class Mapping { | 53 class Mapping { |
53 public: | 54 public: |
54 inline Mapping() { } | 55 inline Mapping() { } |
55 inline int get(uchar c, uchar n, uchar* result); | 56 inline int get(uchar c, uchar n, uchar* result); |
56 private: | 57 private: |
57 friend class Test; | 58 friend class Test; |
58 int CalculateValue(uchar c, uchar n, uchar* result); | 59 int CalculateValue(uchar c, uchar n, uchar* result); |
59 struct CacheEntry { | 60 struct CacheEntry { |
60 inline CacheEntry() : code_point_(kNoChar), offset_(0) { } | 61 inline CacheEntry() : code_point_(kNoChar), offset_(0) { } |
61 inline CacheEntry(uchar code_point, signed offset) | 62 inline CacheEntry(uchar code_point, signed offset) |
62 : code_point_(code_point), | 63 : code_point_(code_point), |
63 offset_(offset) { } | 64 offset_(offset) { } |
64 uchar code_point_; | 65 uchar code_point_; |
65 signed offset_; | 66 signed offset_; |
66 static const int kNoChar = (1 << 21) - 1; | 67 static const int kNoChar = (1 << 21) - 1; |
67 }; | 68 }; |
68 static const int kSize = size; | 69 static const int kSize = size; |
69 static const int kMask = kSize - 1; | 70 static const int kMask = kSize - 1; |
70 CacheEntry entries_[kSize]; | 71 CacheEntry entries_[kSize]; |
71 }; | 72 }; |
72 | 73 |
| 74 |
73 class UnicodeData { | 75 class UnicodeData { |
74 private: | 76 private: |
75 friend class Test; | 77 friend class Test; |
76 static int GetByteCount(); | 78 static int GetByteCount(); |
77 static const uchar kMaxCodePoint; | 79 static const uchar kMaxCodePoint; |
78 }; | 80 }; |
79 | 81 |
| 82 |
80 class Utf16 { | 83 class Utf16 { |
81 public: | 84 public: |
82 static inline bool IsSurrogatePair(int lead, int trail) { | 85 static inline bool IsSurrogatePair(int lead, int trail) { |
83 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); | 86 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); |
84 } | 87 } |
85 static inline bool IsLeadSurrogate(int code) { | 88 static inline bool IsLeadSurrogate(int code) { |
86 if (code == kNoPreviousCharacter) return false; | 89 if (code == kNoPreviousCharacter) return false; |
87 return (code & 0xfc00) == 0xd800; | 90 return (code & 0xfc00) == 0xd800; |
88 } | 91 } |
89 static inline bool IsTrailSurrogate(int code) { | 92 static inline bool IsTrailSurrogate(int code) { |
(...skipping 16 matching lines...) Expand all Loading... |
106 // The illegality stems from the surrogate not being part of a pair. | 109 // The illegality stems from the surrogate not being part of a pair. |
107 static const int kUtf8BytesToCodeASurrogate = 3; | 110 static const int kUtf8BytesToCodeASurrogate = 3; |
108 static inline uint16_t LeadSurrogate(uint32_t char_code) { | 111 static inline uint16_t LeadSurrogate(uint32_t char_code) { |
109 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); | 112 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); |
110 } | 113 } |
111 static inline uint16_t TrailSurrogate(uint32_t char_code) { | 114 static inline uint16_t TrailSurrogate(uint32_t char_code) { |
112 return 0xdc00 + (char_code & 0x3ff); | 115 return 0xdc00 + (char_code & 0x3ff); |
113 } | 116 } |
114 }; | 117 }; |
115 | 118 |
116 class Latin1 { | |
117 public: | |
118 static const unsigned kMaxChar = 0xff; | |
119 // Returns 0 if the character does not convert to a single Latin-1 character | |
120 // or if the character does not convert back to Latin-1 via the inverse | |
121 // operation (upper to lower, etc.). | |
122 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); | |
123 }; | |
124 | 119 |
125 class Utf8 { | 120 class Utf8 { |
126 public: | 121 public: |
127 static inline uchar Length(uchar chr, int previous); | 122 static inline uchar Length(uchar chr, int previous); |
128 static inline unsigned EncodeOneByte(char* out, uint8_t c); | 123 static inline unsigned EncodeOneByte(char* out, uint8_t c); |
129 static inline unsigned Encode(char* out, | 124 static inline unsigned Encode(char* out, |
130 uchar c, | 125 uchar c, |
131 int previous, | 126 int previous, |
132 bool replace_invalid = false); | 127 bool replace_invalid = false); |
133 static uchar CalculateValue(const byte* str, | 128 static uchar CalculateValue(const byte* str, |
(...skipping 14 matching lines...) Expand all Loading... |
148 static const unsigned kBytesSavedByCombiningSurrogates = 2; | 143 static const unsigned kBytesSavedByCombiningSurrogates = 2; |
149 static const unsigned kSizeOfUnmatchedSurrogate = 3; | 144 static const unsigned kSizeOfUnmatchedSurrogate = 3; |
150 // The maximum size a single UTF-16 code unit may take up when encoded as | 145 // The maximum size a single UTF-16 code unit may take up when encoded as |
151 // UTF-8. | 146 // UTF-8. |
152 static const unsigned kMax16BitCodeUnitSize = 3; | 147 static const unsigned kMax16BitCodeUnitSize = 3; |
153 static inline uchar ValueOf(const byte* str, | 148 static inline uchar ValueOf(const byte* str, |
154 unsigned length, | 149 unsigned length, |
155 unsigned* cursor); | 150 unsigned* cursor); |
156 }; | 151 }; |
157 | 152 |
158 | |
159 class Utf8DecoderBase { | |
160 public: | |
161 // Initialization done in subclass. | |
162 inline Utf8DecoderBase(); | |
163 inline Utf8DecoderBase(uint16_t* buffer, | |
164 unsigned buffer_length, | |
165 const uint8_t* stream, | |
166 unsigned stream_length); | |
167 inline unsigned Utf16Length() const { return utf16_length_; } | |
168 protected: | |
169 // This reads all characters and sets the utf16_length_. | |
170 // The first buffer_length utf16 chars are cached in the buffer. | |
171 void Reset(uint16_t* buffer, | |
172 unsigned buffer_length, | |
173 const uint8_t* stream, | |
174 unsigned stream_length); | |
175 static void WriteUtf16Slow(const uint8_t* stream, | |
176 uint16_t* data, | |
177 unsigned length); | |
178 const uint8_t* unbuffered_start_; | |
179 unsigned utf16_length_; | |
180 bool last_byte_of_buffer_unused_; | |
181 private: | |
182 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); | |
183 }; | |
184 | |
185 template <unsigned kBufferSize> | |
186 class Utf8Decoder : public Utf8DecoderBase { | |
187 public: | |
188 inline Utf8Decoder() {} | |
189 inline Utf8Decoder(const char* stream, unsigned length); | |
190 inline void Reset(const char* stream, unsigned length); | |
191 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const; | |
192 private: | |
193 uint16_t buffer_[kBufferSize]; | |
194 }; | |
195 | |
196 | |
197 struct Uppercase { | 153 struct Uppercase { |
198 static bool Is(uchar c); | 154 static bool Is(uchar c); |
199 }; | 155 }; |
200 struct Lowercase { | 156 struct Lowercase { |
201 static bool Is(uchar c); | 157 static bool Is(uchar c); |
202 }; | 158 }; |
203 struct Letter { | 159 struct Letter { |
204 static bool Is(uchar c); | 160 static bool Is(uchar c); |
205 }; | 161 }; |
206 struct Number { | 162 struct ID_Start { |
| 163 static bool Is(uchar c); |
| 164 }; |
| 165 struct ID_Continue { |
207 static bool Is(uchar c); | 166 static bool Is(uchar c); |
208 }; | 167 }; |
209 struct WhiteSpace { | 168 struct WhiteSpace { |
210 static bool Is(uchar c); | 169 static bool Is(uchar c); |
211 }; | 170 }; |
212 struct LineTerminator { | 171 struct LineTerminator { |
213 static bool Is(uchar c); | 172 static bool Is(uchar c); |
214 }; | 173 }; |
215 struct CombiningMark { | |
216 static bool Is(uchar c); | |
217 }; | |
218 struct ConnectorPunctuation { | |
219 static bool Is(uchar c); | |
220 }; | |
221 struct ToLowercase { | 174 struct ToLowercase { |
222 static const int kMaxWidth = 3; | 175 static const int kMaxWidth = 3; |
223 static const bool kIsToLower = true; | 176 static const bool kIsToLower = true; |
224 static int Convert(uchar c, | 177 static int Convert(uchar c, |
225 uchar n, | 178 uchar n, |
226 uchar* result, | 179 uchar* result, |
227 bool* allow_caching_ptr); | 180 bool* allow_caching_ptr); |
228 }; | 181 }; |
229 struct ToUppercase { | 182 struct ToUppercase { |
230 static const int kMaxWidth = 3; | 183 static const int kMaxWidth = 3; |
(...skipping 21 matching lines...) Expand all Loading... |
252 static const int kMaxWidth = 1; | 205 static const int kMaxWidth = 1; |
253 static int Convert(uchar c, | 206 static int Convert(uchar c, |
254 uchar n, | 207 uchar n, |
255 uchar* result, | 208 uchar* result, |
256 bool* allow_caching_ptr); | 209 bool* allow_caching_ptr); |
257 }; | 210 }; |
258 | 211 |
259 } // namespace unibrow | 212 } // namespace unibrow |
260 | 213 |
261 #endif // V8_UNICODE_H_ | 214 #endif // V8_UNICODE_H_ |
OLD | NEW |