Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(32)

Side by Side Diff: src/unicode.h

Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master
Patch Set: Rebase Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/api.cc ('k') | src/unicode-inl.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD NEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
95 95
96 class UnicodeData { 96 class UnicodeData {
97 private: 97 private:
98 friend class Test; 98 friend class Test;
99 static int GetByteCount(); 99 static int GetByteCount();
100 static const uchar kMaxCodePoint; 100 static const uchar kMaxCodePoint;
101 }; 101 };
102 102
103 class Utf16 { 103 class Utf16 {
104 public: 104 public:
105 static inline bool IsSurrogatePair(int lead, int trail) {
106 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
107 }
105 static inline bool IsLeadSurrogate(int code) { 108 static inline bool IsLeadSurrogate(int code) {
106 if (code == kNoPreviousCharacter) return false; 109 if (code == kNoPreviousCharacter) return false;
107 return (code & 0xfc00) == 0xd800; 110 return (code & 0xfc00) == 0xd800;
108 } 111 }
109 static inline bool IsTrailSurrogate(int code) { 112 static inline bool IsTrailSurrogate(int code) {
110 if (code == kNoPreviousCharacter) return false; 113 if (code == kNoPreviousCharacter) return false;
111 return (code & 0xfc00) == 0xdc00; 114 return (code & 0xfc00) == 0xdc00;
112 } 115 }
113 116
114 static inline int CombineSurrogatePair(uchar lead, uchar trail) { 117 static inline int CombineSurrogatePair(uchar lead, uchar trail) {
(...skipping 24 matching lines...) Expand all
139 // Returns 0 if character does not convert to single latin-1 character 142 // Returns 0 if character does not convert to single latin-1 character
140 // or if the character does not convert back to latin-1 via inverse 143 // or if the character does not convert back to latin-1 via inverse
141 // operation (upper to lower, etc). 144 // operation (upper to lower, etc).
142 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); 145 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
143 }; 146 };
144 147
145 class Utf8 { 148 class Utf8 {
146 public: 149 public:
147 static inline uchar Length(uchar chr, int previous); 150 static inline uchar Length(uchar chr, int previous);
148 static inline unsigned EncodeOneByte(char* out, uint8_t c); 151 static inline unsigned EncodeOneByte(char* out, uint8_t c);
149 static inline unsigned Encode( 152 static inline unsigned Encode(char* out,
150 char* out, uchar c, int previous); 153 uchar c,
154 int previous,
155 bool replace_invalid = false);
151 static uchar CalculateValue(const byte* str, 156 static uchar CalculateValue(const byte* str,
152 unsigned length, 157 unsigned length,
153 unsigned* cursor); 158 unsigned* cursor);
159
160 // The unicode replacement character, used to signal invalid unicode
161 // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
154 static const uchar kBadChar = 0xFFFD; 162 static const uchar kBadChar = 0xFFFD;
155 static const unsigned kMaxEncodedSize = 4; 163 static const unsigned kMaxEncodedSize = 4;
156 static const unsigned kMaxOneByteChar = 0x7f; 164 static const unsigned kMaxOneByteChar = 0x7f;
157 static const unsigned kMaxTwoByteChar = 0x7ff; 165 static const unsigned kMaxTwoByteChar = 0x7ff;
158 static const unsigned kMaxThreeByteChar = 0xffff; 166 static const unsigned kMaxThreeByteChar = 0xffff;
159 static const unsigned kMaxFourByteChar = 0x1fffff; 167 static const unsigned kMaxFourByteChar = 0x1fffff;
160 168
161 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together 169 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
162 // that match are coded as a 4 byte UTF-8 sequence. 170 // that match are coded as a 4 byte UTF-8 sequence.
163 static const unsigned kBytesSavedByCombiningSurrogates = 2; 171 static const unsigned kBytesSavedByCombiningSurrogates = 2;
164 static const unsigned kSizeOfUnmatchedSurrogate = 3; 172 static const unsigned kSizeOfUnmatchedSurrogate = 3;
173 // The maximum size a single UTF-16 code unit may take up when encoded as
174 // UTF-8.
175 static const unsigned kMax16BitCodeUnitSize = 3;
165 static inline uchar ValueOf(const byte* str, 176 static inline uchar ValueOf(const byte* str,
166 unsigned length, 177 unsigned length,
167 unsigned* cursor); 178 unsigned* cursor);
168 }; 179 };
169 180
170 181
171 class Utf8DecoderBase { 182 class Utf8DecoderBase {
172 public: 183 public:
173 // Initialization done in subclass. 184 // Initialization done in subclass.
174 inline Utf8DecoderBase(); 185 inline Utf8DecoderBase();
(...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after
267 static const int kMaxWidth = 1; 278 static const int kMaxWidth = 1;
268 static int Convert(uchar c, 279 static int Convert(uchar c,
269 uchar n, 280 uchar n,
270 uchar* result, 281 uchar* result,
271 bool* allow_caching_ptr); 282 bool* allow_caching_ptr);
272 }; 283 };
273 284
274 } // namespace unibrow 285 } // namespace unibrow
275 286
276 #endif // V8_UNICODE_H_ 287 #endif // V8_UNICODE_H_
OLD NEW
« no previous file with comments | « src/api.cc ('k') | src/unicode-inl.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698