| OLD | NEW |
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef V8_UNICODE_H_ | 5 #ifndef V8_UNICODE_H_ |
| 6 #define V8_UNICODE_H_ | 6 #define V8_UNICODE_H_ |
| 7 | 7 |
| 8 #include <sys/types.h> | 8 #include <sys/types.h> |
| 9 #include "src/globals.h" | 9 #include "src/globals.h" |
| 10 /** | 10 /** |
| (...skipping 26 matching lines...) |
| 37 : code_point_(code_point), | 37 : code_point_(code_point), |
| 38 value_(value) { } | 38 value_(value) { } |
| 39 uchar code_point_ : 21; | 39 uchar code_point_ : 21; |
| 40 bool value_ : 1; | 40 bool value_ : 1; |
| 41 }; | 41 }; |
| 42 static const int kSize = size; | 42 static const int kSize = size; |
| 43 static const int kMask = kSize - 1; | 43 static const int kMask = kSize - 1; |
| 44 CacheEntry entries_[kSize]; | 44 CacheEntry entries_[kSize]; |
| 45 }; | 45 }; |
| 46 | 46 |
| 47 |
| 47 // A cache used in case conversion. It caches the value for characters | 48 // A cache used in case conversion. It caches the value for characters |
| 48 // that either have no mapping or map to a single character independent | 49 // that either have no mapping or map to a single character independent |
| 49 // of context. Characters that map to more than one character or that | 50 // of context. Characters that map to more than one character or that |
| 50 // map differently depending on context are always looked up. | 51 // map differently depending on context are always looked up. |
| 51 template <class T, int size = 256> | 52 template <class T, int size = 256> |
| 52 class Mapping { | 53 class Mapping { |
| 53 public: | 54 public: |
| 54 inline Mapping() { } | 55 inline Mapping() { } |
| 55 inline int get(uchar c, uchar n, uchar* result); | 56 inline int get(uchar c, uchar n, uchar* result); |
| 56 private: | 57 private: |
| 57 friend class Test; | 58 friend class Test; |
| 58 int CalculateValue(uchar c, uchar n, uchar* result); | 59 int CalculateValue(uchar c, uchar n, uchar* result); |
| 59 struct CacheEntry { | 60 struct CacheEntry { |
| 60 inline CacheEntry() : code_point_(kNoChar), offset_(0) { } | 61 inline CacheEntry() : code_point_(kNoChar), offset_(0) { } |
| 61 inline CacheEntry(uchar code_point, signed offset) | 62 inline CacheEntry(uchar code_point, signed offset) |
| 62 : code_point_(code_point), | 63 : code_point_(code_point), |
| 63 offset_(offset) { } | 64 offset_(offset) { } |
| 64 uchar code_point_; | 65 uchar code_point_; |
| 65 signed offset_; | 66 signed offset_; |
| 66 static const int kNoChar = (1 << 21) - 1; | 67 static const int kNoChar = (1 << 21) - 1; |
| 67 }; | 68 }; |
| 68 static const int kSize = size; | 69 static const int kSize = size; |
| 69 static const int kMask = kSize - 1; | 70 static const int kMask = kSize - 1; |
| 70 CacheEntry entries_[kSize]; | 71 CacheEntry entries_[kSize]; |
| 71 }; | 72 }; |
| 72 | 73 |
| 74 |
| 73 class UnicodeData { | 75 class UnicodeData { |
| 74 private: | 76 private: |
| 75 friend class Test; | 77 friend class Test; |
| 76 static int GetByteCount(); | 78 static int GetByteCount(); |
| 77 static const uchar kMaxCodePoint; | 79 static const uchar kMaxCodePoint; |
| 78 }; | 80 }; |
| 79 | 81 |
| 82 |
| 80 class Utf16 { | 83 class Utf16 { |
| 81 public: | 84 public: |
| 82 static inline bool IsSurrogatePair(int lead, int trail) { | 85 static inline bool IsSurrogatePair(int lead, int trail) { |
| 83 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); | 86 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); |
| 84 } | 87 } |
| 85 static inline bool IsLeadSurrogate(int code) { | 88 static inline bool IsLeadSurrogate(int code) { |
| 86 if (code == kNoPreviousCharacter) return false; | 89 if (code == kNoPreviousCharacter) return false; |
| 87 return (code & 0xfc00) == 0xd800; | 90 return (code & 0xfc00) == 0xd800; |
| 88 } | 91 } |
| 89 static inline bool IsTrailSurrogate(int code) { | 92 static inline bool IsTrailSurrogate(int code) { |
| (...skipping 16 matching lines...) |
| 106 // The illegality stems from the surrogate not being part of a pair. | 109 // The illegality stems from the surrogate not being part of a pair. |
| 107 static const int kUtf8BytesToCodeASurrogate = 3; | 110 static const int kUtf8BytesToCodeASurrogate = 3; |
| 108 static inline uint16_t LeadSurrogate(uint32_t char_code) { | 111 static inline uint16_t LeadSurrogate(uint32_t char_code) { |
| 109 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); | 112 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); |
| 110 } | 113 } |
| 111 static inline uint16_t TrailSurrogate(uint32_t char_code) { | 114 static inline uint16_t TrailSurrogate(uint32_t char_code) { |
| 112 return 0xdc00 + (char_code & 0x3ff); | 115 return 0xdc00 + (char_code & 0x3ff); |
| 113 } | 116 } |
| 114 }; | 117 }; |
| 115 | 118 |
| 116 class Latin1 { | |
| 117 public: | |
| 118 static const unsigned kMaxChar = 0xff; | |
| 119 // Returns 0 if character does not convert to single latin-1 character | |
| 120 // or if the character does not convert back to latin-1 via inverse | |
| 121 // operation (upper to lower, etc). | |
| 122 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); | |
| 123 }; | |
| 124 | 119 |
| 125 class Utf8 { | 120 class Utf8 { |
| 126 public: | 121 public: |
| 127 static inline uchar Length(uchar chr, int previous); | 122 static inline uchar Length(uchar chr, int previous); |
| 128 static inline unsigned EncodeOneByte(char* out, uint8_t c); | 123 static inline unsigned EncodeOneByte(char* out, uint8_t c); |
| 129 static inline unsigned Encode(char* out, | 124 static inline unsigned Encode(char* out, |
| 130 uchar c, | 125 uchar c, |
| 131 int previous, | 126 int previous, |
| 132 bool replace_invalid = false); | 127 bool replace_invalid = false); |
| 133 static uchar CalculateValue(const byte* str, | 128 static uchar CalculateValue(const byte* str, |
| (...skipping 14 matching lines...) |
| 148 static const unsigned kBytesSavedByCombiningSurrogates = 2; | 143 static const unsigned kBytesSavedByCombiningSurrogates = 2; |
| 149 static const unsigned kSizeOfUnmatchedSurrogate = 3; | 144 static const unsigned kSizeOfUnmatchedSurrogate = 3; |
| 150 // The maximum size a single UTF-16 code unit may take up when encoded as | 145 // The maximum size a single UTF-16 code unit may take up when encoded as |
| 151 // UTF-8. | 146 // UTF-8. |
| 152 static const unsigned kMax16BitCodeUnitSize = 3; | 147 static const unsigned kMax16BitCodeUnitSize = 3; |
| 153 static inline uchar ValueOf(const byte* str, | 148 static inline uchar ValueOf(const byte* str, |
| 154 unsigned length, | 149 unsigned length, |
| 155 unsigned* cursor); | 150 unsigned* cursor); |
| 156 }; | 151 }; |
| 157 | 152 |
| 158 | |
| 159 class Utf8DecoderBase { | |
| 160 public: | |
| 161 // Initialization done in subclass. | |
| 162 inline Utf8DecoderBase(); | |
| 163 inline Utf8DecoderBase(uint16_t* buffer, | |
| 164 unsigned buffer_length, | |
| 165 const uint8_t* stream, | |
| 166 unsigned stream_length); | |
| 167 inline unsigned Utf16Length() const { return utf16_length_; } | |
| 168 protected: | |
| 169 // This reads all characters and sets the utf16_length_. | |
| 170 // The first buffer_length utf16 chars are cached in the buffer. | |
| 171 void Reset(uint16_t* buffer, | |
| 172 unsigned buffer_length, | |
| 173 const uint8_t* stream, | |
| 174 unsigned stream_length); | |
| 175 static void WriteUtf16Slow(const uint8_t* stream, | |
| 176 uint16_t* data, | |
| 177 unsigned length); | |
| 178 const uint8_t* unbuffered_start_; | |
| 179 unsigned utf16_length_; | |
| 180 bool last_byte_of_buffer_unused_; | |
| 181 private: | |
| 182 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); | |
| 183 }; | |
| 184 | |
| 185 template <unsigned kBufferSize> | |
| 186 class Utf8Decoder : public Utf8DecoderBase { | |
| 187 public: | |
| 188 inline Utf8Decoder() {} | |
| 189 inline Utf8Decoder(const char* stream, unsigned length); | |
| 190 inline void Reset(const char* stream, unsigned length); | |
| 191 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const; | |
| 192 private: | |
| 193 uint16_t buffer_[kBufferSize]; | |
| 194 }; | |
| 195 | |
| 196 | |
| 197 struct Uppercase { | 153 struct Uppercase { |
| 198 static bool Is(uchar c); | 154 static bool Is(uchar c); |
| 199 }; | 155 }; |
| 200 struct Lowercase { | 156 struct Lowercase { |
| 201 static bool Is(uchar c); | 157 static bool Is(uchar c); |
| 202 }; | 158 }; |
| 203 struct Letter { | 159 struct Letter { |
| 204 static bool Is(uchar c); | 160 static bool Is(uchar c); |
| 205 }; | 161 }; |
| 206 struct Number { | 162 struct ID_Start { |
| 163 static bool Is(uchar c); |
| 164 }; |
| 165 struct ID_Continue { |
| 207 static bool Is(uchar c); | 166 static bool Is(uchar c); |
| 208 }; | 167 }; |
| 209 struct WhiteSpace { | 168 struct WhiteSpace { |
| 210 static bool Is(uchar c); | 169 static bool Is(uchar c); |
| 211 }; | 170 }; |
| 212 struct LineTerminator { | 171 struct LineTerminator { |
| 213 static bool Is(uchar c); | 172 static bool Is(uchar c); |
| 214 }; | 173 }; |
| 215 struct CombiningMark { | |
| 216 static bool Is(uchar c); | |
| 217 }; | |
| 218 struct ConnectorPunctuation { | |
| 219 static bool Is(uchar c); | |
| 220 }; | |
| 221 struct ToLowercase { | 174 struct ToLowercase { |
| 222 static const int kMaxWidth = 3; | 175 static const int kMaxWidth = 3; |
| 223 static const bool kIsToLower = true; | 176 static const bool kIsToLower = true; |
| 224 static int Convert(uchar c, | 177 static int Convert(uchar c, |
| 225 uchar n, | 178 uchar n, |
| 226 uchar* result, | 179 uchar* result, |
| 227 bool* allow_caching_ptr); | 180 bool* allow_caching_ptr); |
| 228 }; | 181 }; |
| 229 struct ToUppercase { | 182 struct ToUppercase { |
| 230 static const int kMaxWidth = 3; | 183 static const int kMaxWidth = 3; |
| (...skipping 21 matching lines...) |
| 252 static const int kMaxWidth = 1; | 205 static const int kMaxWidth = 1; |
| 253 static int Convert(uchar c, | 206 static int Convert(uchar c, |
| 254 uchar n, | 207 uchar n, |
| 255 uchar* result, | 208 uchar* result, |
| 256 bool* allow_caching_ptr); | 209 bool* allow_caching_ptr); |
| 257 }; | 210 }; |
| 258 | 211 |
| 259 } // namespace unibrow | 212 } // namespace unibrow |
| 260 | 213 |
| 261 #endif // V8_UNICODE_H_ | 214 #endif // V8_UNICODE_H_ |
| OLD | NEW |