Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(189)

Side by Side Diff: third_party/WebKit/Source/platform/v8_inspector/String16.cpp

Issue 2295913003: [DevTools] Switch from platform/v8_inspector to v8/v8-inspector.h. (Closed)
Patch Set: rebase Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "platform/v8_inspector/String16.h"
6
7 #include "platform/v8_inspector/ProtocolPlatform.h"
8
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <locale>
#include <string>
#include <vector>
16
17 namespace v8_inspector {
18
19 namespace {
20
21 bool isASCII(UChar c)
22 {
23 return !(c & ~0x7F);
24 }
25
26 bool isSpaceOrNewLine(UChar c)
27 {
28 return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9));
29 }
30
31 int charactersToInteger(const UChar* characters, size_t length, bool* ok = nullp tr)
32 {
33 std::vector<char> buffer;
34 buffer.reserve(length + 1);
35 for (size_t i = 0; i < length; ++i) {
36 if (!isASCII(characters[i])) {
37 if (ok)
38 *ok = false;
39 return 0;
40 }
41 buffer.push_back(static_cast<char>(characters[i]));
42 }
43 buffer.push_back('\0');
44
45 char* endptr;
46 int result = std::strtol(buffer.data(), &endptr, 10);
47 if (ok)
48 *ok = !(*endptr);
49 return result;
50 }
51
52 const UChar replacementCharacter = 0xFFFD;
53 using UChar32 = uint32_t;
54
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by
// the lead byte |b0|, or 0 when |b0| is not a valid multi-byte lead byte
// (ASCII, a continuation byte, or 0xF8 and above).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Work on the unsigned byte value so the range comparisons are sign-safe.
    const unsigned lead = static_cast<unsigned char>(b0);
    if (lead < 0xC0)
        return 0; // 0xxxxxxx or 10xxxxxx: not a multi-byte lead.
    if (lead < 0xE0)
        return 2; // 110xxxxx
    if (lead < 0xF0)
        return 3; // 1110xxxx
    if (lead < 0xF8)
        return 4; // 11110xxx
    return 0;
}
67
68 inline int inlineUTF8SequenceLength(char b0)
69 {
70 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
71 }
72
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. There are
// as many entries in this table as there are UTF-8 sequence types.
// (I.e., one byte sequence, two byte... etc.). Remember that sequences
// for *legal* UTF-8 will be 4 or fewer bytes total.
// The table is indexed by the total sequence length; index 0 is unused.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
79
// Outcome reported by the UTF-8 <-> UTF-16 converters in this file.
enum ConversionResult {
    conversionOK, // conversion successful
    sourceExhausted, // partial character in source, but hit end
    targetExhausted, // insufficient room in target for conversion
    sourceIllegal // source sequence is illegal/malformed
};
86
87 ConversionResult convertUTF16ToUTF8(
88 const UChar** sourceStart, const UChar* sourceEnd,
89 char** targetStart, char* targetEnd, bool strict)
90 {
91 ConversionResult result = conversionOK;
92 const UChar* source = *sourceStart;
93 char* target = *targetStart;
94 while (source < sourceEnd) {
95 UChar32 ch;
96 unsigned short bytesToWrite = 0;
97 const UChar32 byteMask = 0xBF;
98 const UChar32 byteMark = 0x80;
99 const UChar* oldSource = source; // In case we have to back up because o f target overflow.
100 ch = static_cast<unsigned short>(*source++);
101 // If we have a surrogate pair, convert to UChar32 first.
102 if (ch >= 0xD800 && ch <= 0xDBFF) {
103 // If the 16 bits following the high surrogate are in the source buf fer...
104 if (source < sourceEnd) {
105 UChar32 ch2 = static_cast<unsigned short>(*source);
106 // If it's a low surrogate, convert to UChar32.
107 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
108 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
109 ++source;
110 } else if (strict) { // it's an unpaired high surrogate
111 --source; // return to the illegal value itself
112 result = sourceIllegal;
113 break;
114 }
115 } else { // We don't have the 16 bits following the high surrogate.
116 --source; // return to the high surrogate
117 result = sourceExhausted;
118 break;
119 }
120 } else if (strict) {
121 // UTF-16 surrogate values are illegal in UTF-32
122 if (ch >= 0xDC00 && ch <= 0xDFFF) {
123 --source; // return to the illegal value itself
124 result = sourceIllegal;
125 break;
126 }
127 }
128 // Figure out how many bytes the result will require
129 if (ch < (UChar32)0x80) {
130 bytesToWrite = 1;
131 } else if (ch < (UChar32)0x800) {
132 bytesToWrite = 2;
133 } else if (ch < (UChar32)0x10000) {
134 bytesToWrite = 3;
135 } else if (ch < (UChar32)0x110000) {
136 bytesToWrite = 4;
137 } else {
138 bytesToWrite = 3;
139 ch = replacementCharacter;
140 }
141
142 target += bytesToWrite;
143 if (target > targetEnd) {
144 source = oldSource; // Back up source pointer!
145 target -= bytesToWrite;
146 result = targetExhausted;
147 break;
148 }
149 switch (bytesToWrite) { // note: everything falls through.
150 case 4:
151 *--target = (char)((ch | byteMark) & byteMask);
152 ch >>= 6;
153 case 3:
154 *--target = (char)((ch | byteMark) & byteMask);
155 ch >>= 6;
156 case 2:
157 *--target = (char)((ch | byteMark) & byteMask);
158 ch >>= 6;
159 case 1:
160 *--target = (char)(ch | firstByteMark[bytesToWrite]);
161 }
162 target += bytesToWrite;
163 }
164 *sourceStart = source;
165 *targetStart = target;
166 return result;
167 }
168
/**
 * Is this code point a BMP code point (U+0000..U+ffff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xFFFF)

/**
 * Is this code point a supplementary code point (U+10000..U+10ffff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) (static_cast<uint32_t>((c) - 0x10000) <= 0xFFFFF)

/**
 * Is this code point a surrogate (U+d800..U+dfff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c) & 0xFFFFF800) == 0xD800)

/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (static_cast<UChar>(((supplementary) >> 10) + 0xD7C0))

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (static_cast<UChar>(((supplementary) & 0x3FF) | 0xDC00))
210
// Checks whether the |length| bytes at |source| form one legal UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte.
// If presented with a length > 4 (or < 1), this returns false. The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// Rejected inputs: trailing bytes outside 0x80..0xBF, overlong encodings
// (lead 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by < 0x90),
// encoded surrogates (0xED followed by > 0x9F), values above U+10FFFF
// (0xF4 followed by > 0x8F, or lead > 0xF4), and a lead byte that is itself
// a continuation byte.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Third and fourth bytes, when present, are plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    if (length >= 2) {
        const unsigned char second = source[1];
        // The second byte must always be a continuation byte. The lower
        // bound has to be checked here for every lead byte: the per-lead
        // cases below only tighten one side of the range.
        if (second < 0x80 || second > 0xBF)
            return false;

        switch (lead) {
        case 0xE0: // Exclude overlong 3-byte forms.
            if (second < 0xA0)
                return false;
            break;
        case 0xED: // Exclude UTF-16 surrogates U+D800..U+DFFF.
            if (second > 0x9F)
                return false;
            break;
        case 0xF0: // Exclude overlong 4-byte forms.
            if (second < 0x90)
                return false;
            break;
        case 0xF4: // Exclude values above U+10FFFF.
            if (second > 0x8F)
                return false;
            break;
        default:
            break;
        }
    }

    // A lead byte in 0x80..0xC1 is a continuation byte or an overlong
    // 2-byte lead; anything above 0xF4 would encode beyond U+10FFFF.
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    return lead <= 0xF4;
}
263
264 // Magic values subtracted from a buffer value during UTF8 conversion.
265 // This table contains as many values as there might be trailing bytes
266 // in a UTF-8 sequence.
267 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20 80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8 2082080UL) };
268
269 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
270 {
271 UChar32 character = 0;
272
273 // The cases all fall through.
274 switch (length) {
275 case 6:
276 character += static_cast<unsigned char>(*sequence++);
277 character <<= 6;
278 case 5:
279 character += static_cast<unsigned char>(*sequence++);
280 character <<= 6;
281 case 4:
282 character += static_cast<unsigned char>(*sequence++);
283 character <<= 6;
284 case 3:
285 character += static_cast<unsigned char>(*sequence++);
286 character <<= 6;
287 case 2:
288 character += static_cast<unsigned char>(*sequence++);
289 character <<= 6;
290 case 1:
291 character += static_cast<unsigned char>(*sequence++);
292 }
293
294 return character - offsetsFromUTF8[length - 1];
295 }
296
297 ConversionResult convertUTF8ToUTF16(
298 const char** sourceStart, const char* sourceEnd,
299 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
300 {
301 ConversionResult result = conversionOK;
302 const char* source = *sourceStart;
303 UChar* target = *targetStart;
304 UChar orAllData = 0;
305 while (source < sourceEnd) {
306 int utf8SequenceLength = inlineUTF8SequenceLength(*source);
307 if (sourceEnd - source < utf8SequenceLength) {
308 result = sourceExhausted;
309 break;
310 }
311 // Do this check whether lenient or strict
312 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq uenceLength)) {
313 result = sourceIllegal;
314 break;
315 }
316
317 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
318
319 if (target >= targetEnd) {
320 source -= utf8SequenceLength; // Back up source pointer!
321 result = targetExhausted;
322 break;
323 }
324
325 if (U_IS_BMP(character)) {
326 // UTF-16 surrogate values are illegal in UTF-32
327 if (U_IS_SURROGATE(character)) {
328 if (strict) {
329 source -= utf8SequenceLength; // return to the illegal value itself
330 result = sourceIllegal;
331 break;
332 }
333 *target++ = replacementCharacter;
334 orAllData |= replacementCharacter;
335 } else {
336 *target++ = static_cast<UChar>(character); // normal case
337 orAllData |= character;
338 }
339 } else if (U_IS_SUPPLEMENTARY(character)) {
340 // target is a character in range 0xFFFF - 0x10FFFF
341 if (target + 1 >= targetEnd) {
342 source -= utf8SequenceLength; // Back up source pointer!
343 result = targetExhausted;
344 break;
345 }
346 *target++ = U16_LEAD(character);
347 *target++ = U16_TRAIL(character);
348 orAllData = 0xffff;
349 } else {
350 if (strict) {
351 source -= utf8SequenceLength; // return to the start
352 result = sourceIllegal;
353 break; // Bail out; shouldn't continue
354 } else {
355 *target++ = replacementCharacter;
356 orAllData |= replacementCharacter;
357 }
358 }
359 }
360 *sourceStart = source;
361 *targetStart = target;
362
363 if (sourceAllASCII)
364 *sourceAllASCII = !(orAllData & ~0x7f);
365
366 return result;
367 }
368
369 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec k room is available.
370 static inline void putUTF8Triple(char*& buffer, UChar ch)
371 {
372 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
373 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
374 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
375 }
376
377 } // namespace
378
379 // static
380 String16 String16::fromInteger(int number)
381 {
382 const size_t kBufferSize = 50;
383 char buffer[kBufferSize];
384 std::snprintf(buffer, kBufferSize, "%d", number);
385 return String16(buffer);
386 }
387
388 // static
389 String16 String16::fromDouble(double number)
390 {
391 const size_t kBufferSize = 100;
392 char buffer[kBufferSize];
393 std::snprintf(buffer, kBufferSize, "%f", number);
394 return String16(buffer);
395 }
396
397 // static
398 String16 String16::fromDoublePrecision3(double number)
399 {
400 const size_t kBufferSize = 100;
401 char buffer[kBufferSize];
402 std::snprintf(buffer, kBufferSize, "%.3g", number);
403 return String16(buffer);
404 }
405
406 // static
407 String16 String16::fromDoublePrecision6(double number)
408 {
409 const size_t kBufferSize = 100;
410 char buffer[kBufferSize];
411 std::snprintf(buffer, kBufferSize, "%.6g", number);
412 return String16(buffer);
413 }
414
415 int String16::toInteger(bool* ok) const
416 {
417 return charactersToInteger(characters16(), length(), ok);
418 }
419
420 String16 String16::stripWhiteSpace() const
421 {
422 if (!length())
423 return String16();
424
425 unsigned start = 0;
426 unsigned end = length() - 1;
427
428 // skip white space from start
429 while (start <= end && isSpaceOrNewLine(characters16()[start]))
430 ++start;
431
432 // only white space
433 if (start > end)
434 return String16();
435
436 // skip white space from end
437 while (end && isSpaceOrNewLine(characters16()[end]))
438 --end;
439
440 if (!start && end == length() - 1)
441 return *this;
442 return String16(characters16() + start, end + 1 - start);
443 }
444
445 String16Builder::String16Builder()
446 {
447 }
448
449 void String16Builder::append(const String16& s)
450 {
451 m_buffer.insert(m_buffer.end(), s.characters16(), s.characters16() + s.lengt h());
452 }
453
454 void String16Builder::append(UChar c)
455 {
456 m_buffer.push_back(c);
457 }
458
459 void String16Builder::append(char c)
460 {
461 UChar u = c;
462 m_buffer.push_back(u);
463 }
464
465 void String16Builder::append(const UChar* characters, size_t length)
466 {
467 m_buffer.insert(m_buffer.end(), characters, characters + length);
468 }
469
470 void String16Builder::append(const char* characters, size_t length)
471 {
472 m_buffer.insert(m_buffer.end(), characters, characters + length);
473 }
474
475 String16 String16Builder::toString()
476 {
477 return String16(m_buffer.data(), m_buffer.size());
478 }
479
480 void String16Builder::reserveCapacity(size_t capacity)
481 {
482 m_buffer.reserve(capacity);
483 }
484
485 String16 String16::fromUTF8(const char* stringStart, size_t length)
486 {
487 if (!stringStart || !length)
488 return String16();
489
490 std::vector<UChar> buffer(length);
491 UChar* bufferStart = buffer.data();
492
493 UChar* bufferCurrent = bufferStart;
494 const char* stringCurrent = stringStart;
495 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, bufferCurrent + buffer.size(), 0, true) != conversionOK)
496 return String16();
497
498 unsigned utf16Length = bufferCurrent - bufferStart;
499 return String16(bufferStart, utf16Length);
500 }
501
502 std::string String16::utf8() const
503 {
504 unsigned length = this->length();
505
506 if (!length)
507 return std::string("");
508
509 // Allocate a buffer big enough to hold all the characters
510 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
511 // Optimization ideas, if we find this function is hot:
512 // * We could speculatively create a CStringBuffer to contain 'length'
513 // characters, and resize if necessary (i.e. if the buffer contains
514 // non-ascii characters). (Alternatively, scan the buffer first for
515 // ascii characters, so we know this will be sufficient).
516 // * We could allocate a CStringBuffer with an appropriate size to
517 // have a good chance of being able to write the string into the
518 // buffer without reallocing (say, 1.5 x length).
519 if (length > std::numeric_limits<unsigned>::max() / 3)
520 return std::string();
521 std::vector<char> bufferVector(length * 3);
522 char* buffer = bufferVector.data();
523 const UChar* characters = m_impl.data();
524
525 ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt h, &buffer, buffer + bufferVector.size(), false);
526 DCHECK(result != targetExhausted); // (length * 3) should be sufficient for any conversion
527
528 // Only produced from strict conversion.
529 DCHECK(result != sourceIllegal);
530
531 // Check for an unconverted high surrogate.
532 if (result == sourceExhausted) {
533 // This should be one unpaired high surrogate. Treat it the same
534 // was as an unpaired high surrogate would have been handled in
535 // the middle of a string with non-strict conversion - which is
536 // to say, simply encode it to UTF-8.
537 DCHECK((characters + 1) == (m_impl.data() + length));
538 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF));
539 // There should be room left, since one UChar hasn't been
540 // converted.
541 DCHECK((buffer + 3) <= (buffer + bufferVector.size()));
542 putUTF8Triple(buffer, *characters);
543 }
544
545 return std::string(bufferVector.data(), buffer - bufferVector.data());
546 }
547
548 } // namespace v8_inspector
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698