Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/unicode.h

Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master
Patch Set: Abandon refactoring, get core behavior change done Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 28 matching lines...) Expand all
39 39
40 typedef unsigned int uchar; 40 typedef unsigned int uchar;
41 typedef unsigned char byte; 41 typedef unsigned char byte;
42 42
43 /** 43 /**
44 * The max length of the result of converting the case of a single 44 * The max length of the result of converting the case of a single
45 * character. 45 * character.
46 */ 46 */
47 const int kMaxMappingSize = 4; 47 const int kMaxMappingSize = 4;
48 48
49 /**
50 * The unicode replacement character, used to signal invalid unicode sequences
51 * (e.g. an orphan surrogate) when converting to a UTF encoding.
dcarney 2014/01/10 16:49:55 typo - UTF-8
haimuiba 2014/01/13 07:48:21 Done.
52 */
53 const int kReplacementCharacter = 0xFFFD;
dcarney 2014/01/10 16:49:55 this should be in Utf8, but see below
haimuiba 2014/01/13 07:48:21 Done.
54
49 template <class T, int size = 256> 55 template <class T, int size = 256>
50 class Predicate { 56 class Predicate {
51 public: 57 public:
52 inline Predicate() { } 58 inline Predicate() { }
53 inline bool get(uchar c); 59 inline bool get(uchar c);
54 private: 60 private:
55 friend class Test; 61 friend class Test;
56 bool CalculateValue(uchar c); 62 bool CalculateValue(uchar c);
57 struct CacheEntry { 63 struct CacheEntry {
58 inline CacheEntry() : code_point_(0), value_(0) { } 64 inline CacheEntry() : code_point_(0), value_(0) { }
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
95 101
96 class UnicodeData { 102 class UnicodeData {
97 private: 103 private:
98 friend class Test; 104 friend class Test;
99 static int GetByteCount(); 105 static int GetByteCount();
100 static const uchar kMaxCodePoint; 106 static const uchar kMaxCodePoint;
101 }; 107 };
102 108
103 class Utf16 { 109 class Utf16 {
104 public: 110 public:
111 static inline bool IsSurrogatePair(int lead, int trail) {
112 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
113 }
105 static inline bool IsLeadSurrogate(int code) { 114 static inline bool IsLeadSurrogate(int code) {
106 if (code == kNoPreviousCharacter) return false; 115 if (code == kNoPreviousCharacter) return false;
107 return (code & 0xfc00) == 0xd800; 116 return (code & 0xfc00) == 0xd800;
108 } 117 }
109 static inline bool IsTrailSurrogate(int code) { 118 static inline bool IsTrailSurrogate(int code) {
110 if (code == kNoPreviousCharacter) return false; 119 if (code == kNoPreviousCharacter) return false;
111 return (code & 0xfc00) == 0xdc00; 120 return (code & 0xfc00) == 0xdc00;
112 } 121 }
113 122
114 static inline int CombineSurrogatePair(uchar lead, uchar trail) { 123 static inline int CombineSurrogatePair(uchar lead, uchar trail) {
(...skipping 24 matching lines...) Expand all
139 // Returns 0 if character does not convert to single latin-1 character 148 // Returns 0 if character does not convert to single latin-1 character
140 // or if the character does not convert back to latin-1 via inverse 149 // or if the character does not convert back to latin-1 via inverse
141 // operation (upper to lower, etc). 150 // operation (upper to lower, etc).
142 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); 151 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
143 }; 152 };
144 153
145 class Utf8 { 154 class Utf8 {
146 public: 155 public:
147 static inline uchar Length(uchar chr, int previous); 156 static inline uchar Length(uchar chr, int previous);
148 static inline unsigned EncodeOneByte(char* out, uint8_t c); 157 static inline unsigned EncodeOneByte(char* out, uint8_t c);
149 static inline unsigned Encode( 158 static inline unsigned Encode(char* out,
150 char* out, uchar c, int previous); 159 uchar c,
160 int previous,
161 bool allow_invalid);
dcarney 2014/01/10 16:49:55 this either needs to be an enum to avoid passing t
haimuiba 2014/01/13 07:48:21 Done by defaulting to true. I don't understand the
151 static uchar CalculateValue(const byte* str, 162 static uchar CalculateValue(const byte* str,
152 unsigned length, 163 unsigned length,
153 unsigned* cursor); 164 unsigned* cursor);
154 static const uchar kBadChar = 0xFFFD; 165 static const uchar kBadChar = 0xFFFD;
dcarney 2014/01/10 16:49:55 hmmm, maybe you should just rename this variable h
haimuiba 2014/01/13 07:48:21 Done. Decided to keep the kBadChar name for now as
155 static const unsigned kMaxEncodedSize = 4; 166 static const unsigned kMaxEncodedSize = 4;
156 static const unsigned kMaxOneByteChar = 0x7f; 167 static const unsigned kMaxOneByteChar = 0x7f;
157 static const unsigned kMaxTwoByteChar = 0x7ff; 168 static const unsigned kMaxTwoByteChar = 0x7ff;
158 static const unsigned kMaxThreeByteChar = 0xffff; 169 static const unsigned kMaxThreeByteChar = 0xffff;
159 static const unsigned kMaxFourByteChar = 0x1fffff; 170 static const unsigned kMaxFourByteChar = 0x1fffff;
160 171
161 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together 172 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
162 // that match are coded as a 4 byte UTF-8 sequence. 173 // that match are coded as a 4 byte UTF-8 sequence.
163 static const unsigned kBytesSavedByCombiningSurrogates = 2; 174 static const unsigned kBytesSavedByCombiningSurrogates = 2;
164 static const unsigned kSizeOfUnmatchedSurrogate = 3; 175 static const unsigned kSizeOfUnmatchedSurrogate = 3;
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
267 static const int kMaxWidth = 1; 278 static const int kMaxWidth = 1;
268 static int Convert(uchar c, 279 static int Convert(uchar c,
269 uchar n, 280 uchar n,
270 uchar* result, 281 uchar* result,
271 bool* allow_caching_ptr); 282 bool* allow_caching_ptr);
272 }; 283 };
273 284
274 } // namespace unibrow 285 } // namespace unibrow
275 286
276 #endif // V8_UNICODE_H_ 287 #endif // V8_UNICODE_H_
OLDNEW
« src/api.cc ('K') | « src/objects.cc ('k') | src/unicode-inl.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698