Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 28 matching lines...) | |
| 39 | 39 |
| 40 typedef unsigned int uchar; | 40 typedef unsigned int uchar; |
| 41 typedef unsigned char byte; | 41 typedef unsigned char byte; |
| 42 | 42 |
| 43 /** | 43 /** |
| 44 * The max length of the result of converting the case of a single | 44 * The max length of the result of converting the case of a single |
| 45 * character. | 45 * character. |
| 46 */ | 46 */ |
| 47 const int kMaxMappingSize = 4; | 47 const int kMaxMappingSize = 4; |
| 48 | 48 |
| 49 /** | |
| 50 * The unicode replacement character, used to signal invalid unicode sequences | |
| 51 * (e.g. an orphan surrogate) when converting to a UTF encoding. | |
|
dcarney
2014/01/10 16:49:55
typo - UTF-8
haimuiba
2014/01/13 07:48:21
Done.
| |
| 52 */ | |
| 53 const int kReplacementCharacter = 0xFFFD; | |
|
dcarney
2014/01/10 16:49:55
this should be in Utf8, but see below
haimuiba
2014/01/13 07:48:21
Done.
| |
| 54 | |
| 49 template <class T, int size = 256> | 55 template <class T, int size = 256> |
| 50 class Predicate { | 56 class Predicate { |
| 51 public: | 57 public: |
| 52 inline Predicate() { } | 58 inline Predicate() { } |
| 53 inline bool get(uchar c); | 59 inline bool get(uchar c); |
| 54 private: | 60 private: |
| 55 friend class Test; | 61 friend class Test; |
| 56 bool CalculateValue(uchar c); | 62 bool CalculateValue(uchar c); |
| 57 struct CacheEntry { | 63 struct CacheEntry { |
| 58 inline CacheEntry() : code_point_(0), value_(0) { } | 64 inline CacheEntry() : code_point_(0), value_(0) { } |
| (...skipping 36 matching lines...) | |
| 95 | 101 |
| 96 class UnicodeData { | 102 class UnicodeData { |
| 97 private: | 103 private: |
| 98 friend class Test; | 104 friend class Test; |
| 99 static int GetByteCount(); | 105 static int GetByteCount(); |
| 100 static const uchar kMaxCodePoint; | 106 static const uchar kMaxCodePoint; |
| 101 }; | 107 }; |
| 102 | 108 |
| 103 class Utf16 { | 109 class Utf16 { |
| 104 public: | 110 public: |
| 111 static inline bool IsSurrogatePair(int lead, int trail) { | |
| 112 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); | |
| 113 } | |
| 105 static inline bool IsLeadSurrogate(int code) { | 114 static inline bool IsLeadSurrogate(int code) { |
| 106 if (code == kNoPreviousCharacter) return false; | 115 if (code == kNoPreviousCharacter) return false; |
| 107 return (code & 0xfc00) == 0xd800; | 116 return (code & 0xfc00) == 0xd800; |
| 108 } | 117 } |
| 109 static inline bool IsTrailSurrogate(int code) { | 118 static inline bool IsTrailSurrogate(int code) { |
| 110 if (code == kNoPreviousCharacter) return false; | 119 if (code == kNoPreviousCharacter) return false; |
| 111 return (code & 0xfc00) == 0xdc00; | 120 return (code & 0xfc00) == 0xdc00; |
| 112 } | 121 } |
| 113 | 122 |
| 114 static inline int CombineSurrogatePair(uchar lead, uchar trail) { | 123 static inline int CombineSurrogatePair(uchar lead, uchar trail) { |
| (...skipping 24 matching lines...) | |
| 139 // Returns 0 if character does not convert to single latin-1 character | 148 // Returns 0 if character does not convert to single latin-1 character |
| 140 // or if the character does not convert back to latin-1 via inverse | 149 // or if the character does not convert back to latin-1 via inverse |
| 141 // operation (upper to lower, etc). | 150 // operation (upper to lower, etc). |
| 142 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); | 151 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); |
| 143 }; | 152 }; |
| 144 | 153 |
| 145 class Utf8 { | 154 class Utf8 { |
| 146 public: | 155 public: |
| 147 static inline uchar Length(uchar chr, int previous); | 156 static inline uchar Length(uchar chr, int previous); |
| 148 static inline unsigned EncodeOneByte(char* out, uint8_t c); | 157 static inline unsigned EncodeOneByte(char* out, uint8_t c); |
| 149 static inline unsigned Encode( | 158 static inline unsigned Encode(char* out, |
| 150 char* out, uchar c, int previous); | 159 uchar c, |
| 160 int previous, | |
| 161 bool allow_invalid); | |
|
dcarney
2014/01/10 16:49:55
this either needs to be an enum to avoid passing t
haimuiba
2014/01/13 07:48:21
Done by defaulting to true. I don't understand the
| |
| 151 static uchar CalculateValue(const byte* str, | 162 static uchar CalculateValue(const byte* str, |
| 152 unsigned length, | 163 unsigned length, |
| 153 unsigned* cursor); | 164 unsigned* cursor); |
| 154 static const uchar kBadChar = 0xFFFD; | 165 static const uchar kBadChar = 0xFFFD; |
|
dcarney
2014/01/10 16:49:55
hmmm, maybe you should just rename this variable h
haimuiba
2014/01/13 07:48:21
Done. Decided to keep the kBadChar name for now as
| |
| 155 static const unsigned kMaxEncodedSize = 4; | 166 static const unsigned kMaxEncodedSize = 4; |
| 156 static const unsigned kMaxOneByteChar = 0x7f; | 167 static const unsigned kMaxOneByteChar = 0x7f; |
| 157 static const unsigned kMaxTwoByteChar = 0x7ff; | 168 static const unsigned kMaxTwoByteChar = 0x7ff; |
| 158 static const unsigned kMaxThreeByteChar = 0xffff; | 169 static const unsigned kMaxThreeByteChar = 0xffff; |
| 159 static const unsigned kMaxFourByteChar = 0x1fffff; | 170 static const unsigned kMaxFourByteChar = 0x1fffff; |
| 160 | 171 |
| 161 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together | 172 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together |
| 162 // that match are coded as a 4 byte UTF-8 sequence. | 173 // that match are coded as a 4 byte UTF-8 sequence. |
| 163 static const unsigned kBytesSavedByCombiningSurrogates = 2; | 174 static const unsigned kBytesSavedByCombiningSurrogates = 2; |
| 164 static const unsigned kSizeOfUnmatchedSurrogate = 3; | 175 static const unsigned kSizeOfUnmatchedSurrogate = 3; |
| (...skipping 102 matching lines...) | |
| 267 static const int kMaxWidth = 1; | 278 static const int kMaxWidth = 1; |
| 268 static int Convert(uchar c, | 279 static int Convert(uchar c, |
| 269 uchar n, | 280 uchar n, |
| 270 uchar* result, | 281 uchar* result, |
| 271 bool* allow_caching_ptr); | 282 bool* allow_caching_ptr); |
| 272 }; | 283 }; |
| 273 | 284 |
| 274 } // namespace unibrow | 285 } // namespace unibrow |
| 275 | 286 |
| 276 #endif // V8_UNICODE_H_ | 287 #endif // V8_UNICODE_H_ |
| OLD | NEW |