Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(302)

Side by Side Diff: third_party/WebKit/Source/platform/inspector_protocol/String16STL.cpp

Issue 2238423002: [DevTools] Generate all files in inspector_protocol. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@2240663003
Patch Set: Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "platform/inspector_protocol/String16.h"
6
7 #include <algorithm>
8 #include <cctype>
9 #include <cstdio>
10 #include <locale>
11
12 namespace blink {
13 namespace protocol {
14
15 const UChar replacementCharacter = 0xFFFD;
16 using UChar32 = uint32_t;
17
// Classifies a non-ASCII UTF-8 lead byte by its high bits and returns the
// total length of the sequence it introduces (2, 3 or 4). Returns 0 when
// |b0| does not have one of the 110xxxxx / 1110xxxx / 11110xxx patterns.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Work on the unsigned byte value so the shifts below are well defined.
    const unsigned lead = static_cast<unsigned char>(b0);

    if ((lead >> 6) != 0x3) // Top two bits must both be set.
        return 0;
    if ((lead >> 5) == 0x6) // 110xxxxx
        return 2;
    if ((lead >> 4) == 0xE) // 1110xxxx
        return 3;
    if ((lead >> 3) == 0x1E) // 11110xxx
        return 4;
    return 0;
}
30
31 inline int inlineUTF8SequenceLength(char b0)
32 {
33 return String16::isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
34 }
35
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. The table is
// indexed by the total sequence length in bytes (index 0 is unused), so
// there are as many entries as there are UTF-8 sequence types. Remember
// that sequences for *legal* UTF-8 will be 4 or fewer bytes total.
static const unsigned char firstByteMark[7] = {
    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};
42
// Outcome reported by the UTF-8 <-> UTF-16 converters in this file.
enum ConversionResult {
    // The whole source range was converted.
    conversionOK,
    // The source ends in the middle of a character.
    sourceExhausted,
    // The target buffer has no room for the next character.
    targetExhausted,
    // The source contains an illegal or malformed sequence.
    sourceIllegal
};
49
50 ConversionResult convertUTF16ToUTF8(
51 const UChar** sourceStart, const UChar* sourceEnd,
52 char** targetStart, char* targetEnd, bool strict)
53 {
54 ConversionResult result = conversionOK;
55 const UChar* source = *sourceStart;
56 char* target = *targetStart;
57 while (source < sourceEnd) {
58 UChar32 ch;
59 unsigned short bytesToWrite = 0;
60 const UChar32 byteMask = 0xBF;
61 const UChar32 byteMark = 0x80;
62 const UChar* oldSource = source; // In case we have to back up because o f target overflow.
63 ch = static_cast<unsigned short>(*source++);
64 // If we have a surrogate pair, convert to UChar32 first.
65 if (ch >= 0xD800 && ch <= 0xDBFF) {
66 // If the 16 bits following the high surrogate are in the source buf fer...
67 if (source < sourceEnd) {
68 UChar32 ch2 = static_cast<unsigned short>(*source);
69 // If it's a low surrogate, convert to UChar32.
70 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
71 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
72 ++source;
73 } else if (strict) { // it's an unpaired high surrogate
74 --source; // return to the illegal value itself
75 result = sourceIllegal;
76 break;
77 }
78 } else { // We don't have the 16 bits following the high surrogate.
79 --source; // return to the high surrogate
80 result = sourceExhausted;
81 break;
82 }
83 } else if (strict) {
84 // UTF-16 surrogate values are illegal in UTF-32
85 if (ch >= 0xDC00 && ch <= 0xDFFF) {
86 --source; // return to the illegal value itself
87 result = sourceIllegal;
88 break;
89 }
90 }
91 // Figure out how many bytes the result will require
92 if (ch < (UChar32)0x80) {
93 bytesToWrite = 1;
94 } else if (ch < (UChar32)0x800) {
95 bytesToWrite = 2;
96 } else if (ch < (UChar32)0x10000) {
97 bytesToWrite = 3;
98 } else if (ch < (UChar32)0x110000) {
99 bytesToWrite = 4;
100 } else {
101 bytesToWrite = 3;
102 ch = replacementCharacter;
103 }
104
105 target += bytesToWrite;
106 if (target > targetEnd) {
107 source = oldSource; // Back up source pointer!
108 target -= bytesToWrite;
109 result = targetExhausted;
110 break;
111 }
112 switch (bytesToWrite) { // note: everything falls through.
113 case 4:
114 *--target = (char)((ch | byteMark) & byteMask);
115 ch >>= 6;
116 case 3:
117 *--target = (char)((ch | byteMark) & byteMask);
118 ch >>= 6;
119 case 2:
120 *--target = (char)((ch | byteMark) & byteMask);
121 ch >>= 6;
122 case 1:
123 *--target = (char)(ch | firstByteMark[bytesToWrite]);
124 }
125 target += bytesToWrite;
126 }
127 *sourceStart = source;
128 *targetStart = target;
129 return result;
130 }
131
// Code point classification and UTF-16 surrogate helpers, modelled on the
// ICU macros of the same names. |c| / |supplementary| is a 32-bit code point.

// True if |c| is a BMP code point (U+0000..U+ffff).
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xffff)

// True if |c| is a supplementary code point (U+10000..U+10ffff).
#define U_IS_SUPPLEMENTARY(c) (static_cast<uint32_t>((c) - 0x10000) <= 0xfffff)

// True if |c| is a surrogate code point (U+d800..U+dfff).
#define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)

// Lead surrogate (0xd800..0xdbff) for a supplementary code point
// (0x10000..0x10ffff).
#define U16_LEAD(supplementary) static_cast<UChar>(((supplementary) >> 10) + 0xd7c0)

// Trail surrogate (0xdc00..0xdfff) for a supplementary code point
// (0x10000..0x10ffff).
#define U16_TRAIL(supplementary) static_cast<UChar>(((supplementary) & 0x3ff) | 0xdc00)
173
// Returns true if the |length| bytes at |source| form one well-formed UTF-8
// sequence.
//
// This must be called with the length pre-determined by the first byte.
// Any length outside 1..4 returns false without reading |source|; the
// Unicode definition of UTF-8 goes up to 4-byte sequences.
//
// Rejected: lead bytes 0x80..0xC1 and above 0xF4, trailing bytes outside
// 0x80..0xBF, and second bytes outside the narrower range required after
// lead bytes 0xE0, 0xED, 0xF0 and 0xF4.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Third and fourth bytes, when present, are plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    if (length >= 2) {
        // The second byte is a continuation byte whose allowed range is
        // narrowed for a few lead bytes. Both bounds are always enforced, so
        // an ASCII byte after 0xED or 0xF4 is rejected as well.
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;
        switch (lead) {
        case 0xE0:
            lowest = 0xA0;
            break;
        case 0xED:
            highest = 0x9F;
            break;
        case 0xF0:
            lowest = 0x90;
            break;
        case 0xF4:
            highest = 0x8F;
            break;
        default:
            break;
        }
        const unsigned char second = source[1];
        if (second < lowest || second > highest)
            return false;
    }

    // 0x80..0xBF are continuation bytes and 0xC0/0xC1 are never valid leads.
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    if (lead > 0xF4)
        return false;
    return true;
}
226
227 // Magic values subtracted from a buffer value during UTF8 conversion.
228 // This table contains as many values as there might be trailing bytes
229 // in a UTF-8 sequence.
230 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20 80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8 2082080UL) };
231
232 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
233 {
234 UChar32 character = 0;
235
236 // The cases all fall through.
237 switch (length) {
238 case 6:
239 character += static_cast<unsigned char>(*sequence++);
240 character <<= 6;
241 case 5:
242 character += static_cast<unsigned char>(*sequence++);
243 character <<= 6;
244 case 4:
245 character += static_cast<unsigned char>(*sequence++);
246 character <<= 6;
247 case 3:
248 character += static_cast<unsigned char>(*sequence++);
249 character <<= 6;
250 case 2:
251 character += static_cast<unsigned char>(*sequence++);
252 character <<= 6;
253 case 1:
254 character += static_cast<unsigned char>(*sequence++);
255 }
256
257 return character - offsetsFromUTF8[length - 1];
258 }
259
260 ConversionResult convertUTF8ToUTF16(
261 const char** sourceStart, const char* sourceEnd,
262 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
263 {
264 ConversionResult result = conversionOK;
265 const char* source = *sourceStart;
266 UChar* target = *targetStart;
267 UChar orAllData = 0;
268 while (source < sourceEnd) {
269 int utf8SequenceLength = inlineUTF8SequenceLength(*source);
270 if (sourceEnd - source < utf8SequenceLength) {
271 result = sourceExhausted;
272 break;
273 }
274 // Do this check whether lenient or strict
275 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq uenceLength)) {
276 result = sourceIllegal;
277 break;
278 }
279
280 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
281
282 if (target >= targetEnd) {
283 source -= utf8SequenceLength; // Back up source pointer!
284 result = targetExhausted;
285 break;
286 }
287
288 if (U_IS_BMP(character)) {
289 // UTF-16 surrogate values are illegal in UTF-32
290 if (U_IS_SURROGATE(character)) {
291 if (strict) {
292 source -= utf8SequenceLength; // return to the illegal value itself
293 result = sourceIllegal;
294 break;
295 }
296 *target++ = replacementCharacter;
297 orAllData |= replacementCharacter;
298 } else {
299 *target++ = static_cast<UChar>(character); // normal case
300 orAllData |= character;
301 }
302 } else if (U_IS_SUPPLEMENTARY(character)) {
303 // target is a character in range 0xFFFF - 0x10FFFF
304 if (target + 1 >= targetEnd) {
305 source -= utf8SequenceLength; // Back up source pointer!
306 result = targetExhausted;
307 break;
308 }
309 *target++ = U16_LEAD(character);
310 *target++ = U16_TRAIL(character);
311 orAllData = 0xffff;
312 } else {
313 if (strict) {
314 source -= utf8SequenceLength; // return to the start
315 result = sourceIllegal;
316 break; // Bail out; shouldn't continue
317 } else {
318 *target++ = replacementCharacter;
319 orAllData |= replacementCharacter;
320 }
321 }
322 }
323 *sourceStart = source;
324 *targetStart = target;
325
326 if (sourceAllASCII)
327 *sourceAllASCII = !(orAllData & ~0x7f);
328
329 return result;
330 }
331
332 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec k room is available.
333 static inline void putUTF8Triple(char*& buffer, UChar ch)
334 {
335 DCHECK_GE(ch, 0x0800);
336 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
337 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
338 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
339 }
340
341 String16 String16::fromUTF8(const char* stringStart, size_t length)
342 {
343 if (!stringStart || !length)
344 return String16();
345
346 std::vector<UChar> buffer(length);
347 UChar* bufferStart = buffer.data();
348
349 UChar* bufferCurrent = bufferStart;
350 const char* stringCurrent = stringStart;
351 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, bufferCurrent + buffer.size(), 0, true) != conversionOK)
352 return String16();
353
354 unsigned utf16Length = bufferCurrent - bufferStart;
355 return String16(bufferStart, utf16Length);
356 }
357
358 std::string String16::utf8() const
359 {
360 unsigned length = this->length();
361
362 if (!length)
363 return std::string("");
364
365 // Allocate a buffer big enough to hold all the characters
366 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
367 // Optimization ideas, if we find this function is hot:
368 // * We could speculatively create a CStringBuffer to contain 'length'
369 // characters, and resize if necessary (i.e. if the buffer contains
370 // non-ascii characters). (Alternatively, scan the buffer first for
371 // ascii characters, so we know this will be sufficient).
372 // * We could allocate a CStringBuffer with an appropriate size to
373 // have a good chance of being able to write the string into the
374 // buffer without reallocing (say, 1.5 x length).
375 if (length > std::numeric_limits<unsigned>::max() / 3)
376 return std::string();
377 std::vector<char> bufferVector(length * 3);
378 char* buffer = bufferVector.data();
379 const UChar* characters = m_impl.data();
380
381 ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt h, &buffer, buffer + bufferVector.size(), false);
382 DCHECK(result != targetExhausted); // (length * 3) should be sufficient for any conversion
383
384 // Only produced from strict conversion.
385 DCHECK(result != sourceIllegal);
386
387 // Check for an unconverted high surrogate.
388 if (result == sourceExhausted) {
389 // This should be one unpaired high surrogate. Treat it the same
390 // was as an unpaired high surrogate would have been handled in
391 // the middle of a string with non-strict conversion - which is
392 // to say, simply encode it to UTF-8.
393 DCHECK((characters + 1) == (m_impl.data() + length));
394 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF));
395 // There should be room left, since one UChar hasn't been
396 // converted.
397 DCHECK((buffer + 3) <= (buffer + bufferVector.size()));
398 putUTF8Triple(buffer, *characters);
399 }
400
401 return std::string(bufferVector.data(), buffer - bufferVector.data());
402 }
403
404 } // namespace protocol
405 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698