OLD | NEW |
| (Empty) |
1 // Copyright 2016 the V8 project authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "src/inspector/String16.h" | |
6 | |
7 #include "src/inspector/ProtocolPlatform.h" | |
8 | |
9 #include <algorithm> | |
10 #include <cctype> | |
11 #include <cstdio> | |
12 #include <cstdlib> | |
13 #include <cstring> | |
14 #include <locale> | |
15 #include <string> | |
16 | |
17 namespace v8_inspector { | |
18 | |
19 namespace { | |
20 | |
21 bool isASCII(UChar c) | |
22 { | |
23 return !(c & ~0x7F); | |
24 } | |
25 | |
26 bool isSpaceOrNewLine(UChar c) | |
27 { | |
28 return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9)); | |
29 } | |
30 | |
31 int charactersToInteger(const UChar* characters, size_t length, bool* ok = nullp
tr) | |
32 { | |
33 std::vector<char> buffer; | |
34 buffer.reserve(length + 1); | |
35 for (size_t i = 0; i < length; ++i) { | |
36 if (!isASCII(characters[i])) { | |
37 if (ok) | |
38 *ok = false; | |
39 return 0; | |
40 } | |
41 buffer.push_back(static_cast<char>(characters[i])); | |
42 } | |
43 buffer.push_back('\0'); | |
44 | |
45 char* endptr; | |
46 int result = std::strtol(buffer.data(), &endptr, 10); | |
47 if (ok) | |
48 *ok = !(*endptr); | |
49 return result; | |
50 } | |
51 | |
52 const UChar replacementCharacter = 0xFFFD; | |
53 using UChar32 = uint32_t; | |
54 | |
// Classifies the lead byte |b0| of a multi-byte UTF-8 sequence by its high
// bits and returns the total sequence length:
//   110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4.
// Returns 0 for any other pattern (ASCII, continuation bytes 10xxxxxx, and
// 11111xxx). No check is made here that the lead byte is otherwise legal.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    if ((b0 & 0xC0) != 0xC0)
        return 0;
    if ((b0 & 0xE0) == 0xC0)
        return 2;
    if ((b0 & 0xF0) == 0xE0)
        return 3;
    if ((b0 & 0xF8) == 0xF0)
        return 4;
    return 0;
}
67 | |
68 inline int inlineUTF8SequenceLength(char b0) | |
69 { | |
70 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); | |
71 } | |
72 | |
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. The table is
// indexed by the total sequence length in bytes (index 0 is unused, index 1
// is a plain one-byte sequence). Remember that sequences for *legal* UTF-8
// will be 4 or fewer bytes total.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
79 | |
// Outcome reported by the UTF-8 <-> UTF-16 conversion routines in this file.
enum ConversionResult {
    conversionOK, // conversion successful
    sourceExhausted, // partial character in source, but hit end
    targetExhausted, // insufficient room in target for conversion
    sourceIllegal // source sequence is illegal/malformed
};
86 | |
87 ConversionResult convertUTF16ToUTF8( | |
88 const UChar** sourceStart, const UChar* sourceEnd, | |
89 char** targetStart, char* targetEnd, bool strict) | |
90 { | |
91 ConversionResult result = conversionOK; | |
92 const UChar* source = *sourceStart; | |
93 char* target = *targetStart; | |
94 while (source < sourceEnd) { | |
95 UChar32 ch; | |
96 unsigned short bytesToWrite = 0; | |
97 const UChar32 byteMask = 0xBF; | |
98 const UChar32 byteMark = 0x80; | |
99 const UChar* oldSource = source; // In case we have to back up because o
f target overflow. | |
100 ch = static_cast<unsigned short>(*source++); | |
101 // If we have a surrogate pair, convert to UChar32 first. | |
102 if (ch >= 0xD800 && ch <= 0xDBFF) { | |
103 // If the 16 bits following the high surrogate are in the source buf
fer... | |
104 if (source < sourceEnd) { | |
105 UChar32 ch2 = static_cast<unsigned short>(*source); | |
106 // If it's a low surrogate, convert to UChar32. | |
107 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | |
108 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; | |
109 ++source; | |
110 } else if (strict) { // it's an unpaired high surrogate | |
111 --source; // return to the illegal value itself | |
112 result = sourceIllegal; | |
113 break; | |
114 } | |
115 } else { // We don't have the 16 bits following the high surrogate. | |
116 --source; // return to the high surrogate | |
117 result = sourceExhausted; | |
118 break; | |
119 } | |
120 } else if (strict) { | |
121 // UTF-16 surrogate values are illegal in UTF-32 | |
122 if (ch >= 0xDC00 && ch <= 0xDFFF) { | |
123 --source; // return to the illegal value itself | |
124 result = sourceIllegal; | |
125 break; | |
126 } | |
127 } | |
128 // Figure out how many bytes the result will require | |
129 if (ch < (UChar32)0x80) { | |
130 bytesToWrite = 1; | |
131 } else if (ch < (UChar32)0x800) { | |
132 bytesToWrite = 2; | |
133 } else if (ch < (UChar32)0x10000) { | |
134 bytesToWrite = 3; | |
135 } else if (ch < (UChar32)0x110000) { | |
136 bytesToWrite = 4; | |
137 } else { | |
138 bytesToWrite = 3; | |
139 ch = replacementCharacter; | |
140 } | |
141 | |
142 target += bytesToWrite; | |
143 if (target > targetEnd) { | |
144 source = oldSource; // Back up source pointer! | |
145 target -= bytesToWrite; | |
146 result = targetExhausted; | |
147 break; | |
148 } | |
149 switch (bytesToWrite) { // note: everything falls through. | |
150 case 4: | |
151 *--target = (char)((ch | byteMark) & byteMask); | |
152 ch >>= 6; | |
153 case 3: | |
154 *--target = (char)((ch | byteMark) & byteMask); | |
155 ch >>= 6; | |
156 case 2: | |
157 *--target = (char)((ch | byteMark) & byteMask); | |
158 ch >>= 6; | |
159 case 1: | |
160 *--target = (char)(ch | firstByteMark[bytesToWrite]); | |
161 } | |
162 target += bytesToWrite; | |
163 } | |
164 *sourceStart = source; | |
165 *targetStart = target; | |
166 return result; | |
167 } | |
168 | |
/**
 * Is this code point a BMP code point (U+0000..U+ffff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff)

/**
 * Is this code point a supplementary code point (U+10000..U+10ffff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c) - 0x10000) <= 0xfffff)

/**
 * Is this code point a surrogate (U+d800..U+dfff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)

/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary) >> 10) + 0xd7c0))

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary) & 0x3ff) | 0xdc00))
210 | |
// Validates one UTF-8 sequence of |length| bytes starting at |source|.
//
// This must be called with the length pre-determined by the first byte.
// If presented with a length outside 1..4, this returns false. The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// The trailing bytes are checked from the last one back to the second, then
// the second byte is range-checked against the lead byte, and finally the
// lead byte itself is checked.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    unsigned char a;
    const unsigned char* srcptr = source + length;
    switch (length) {
    default:
        return false;
    // Everything else falls through when "true"...
    case 4:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;
        // fall through
    case 3:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;
        // fall through
    case 2:
        if ((a = (*--srcptr)) > 0xBF)
            return false;

        // The permitted range of the second byte depends on the lead byte.
        // no fall-through in this inner switch
        switch (*source) {
        case 0xE0:
            if (a < 0xA0)
                return false;
            break;
        case 0xED:
            if (a > 0x9F)
                return false;
            break;
        case 0xF0:
            if (a < 0x90)
                return false;
            break;
        case 0xF4:
            if (a > 0x8F)
                return false;
            break;
        default:
            if (a < 0x80)
                return false;
        }
        // fall through

    case 1:
        // 0x80..0xC1 can never be the first byte of a sequence.
        if (*source >= 0x80 && *source < 0xC2)
            return false;
    }
    if (*source > 0xF4)
        return false;
    return true;
}
263 | |
264 // Magic values subtracted from a buffer value during UTF8 conversion. | |
265 // This table contains as many values as there might be trailing bytes | |
266 // in a UTF-8 sequence. | |
267 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20
80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8
2082080UL) }; | |
268 | |
269 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) | |
270 { | |
271 UChar32 character = 0; | |
272 | |
273 // The cases all fall through. | |
274 switch (length) { | |
275 case 6: | |
276 character += static_cast<unsigned char>(*sequence++); | |
277 character <<= 6; | |
278 case 5: | |
279 character += static_cast<unsigned char>(*sequence++); | |
280 character <<= 6; | |
281 case 4: | |
282 character += static_cast<unsigned char>(*sequence++); | |
283 character <<= 6; | |
284 case 3: | |
285 character += static_cast<unsigned char>(*sequence++); | |
286 character <<= 6; | |
287 case 2: | |
288 character += static_cast<unsigned char>(*sequence++); | |
289 character <<= 6; | |
290 case 1: | |
291 character += static_cast<unsigned char>(*sequence++); | |
292 } | |
293 | |
294 return character - offsetsFromUTF8[length - 1]; | |
295 } | |
296 | |
297 ConversionResult convertUTF8ToUTF16( | |
298 const char** sourceStart, const char* sourceEnd, | |
299 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) | |
300 { | |
301 ConversionResult result = conversionOK; | |
302 const char* source = *sourceStart; | |
303 UChar* target = *targetStart; | |
304 UChar orAllData = 0; | |
305 while (source < sourceEnd) { | |
306 int utf8SequenceLength = inlineUTF8SequenceLength(*source); | |
307 if (sourceEnd - source < utf8SequenceLength) { | |
308 result = sourceExhausted; | |
309 break; | |
310 } | |
311 // Do this check whether lenient or strict | |
312 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq
uenceLength)) { | |
313 result = sourceIllegal; | |
314 break; | |
315 } | |
316 | |
317 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); | |
318 | |
319 if (target >= targetEnd) { | |
320 source -= utf8SequenceLength; // Back up source pointer! | |
321 result = targetExhausted; | |
322 break; | |
323 } | |
324 | |
325 if (U_IS_BMP(character)) { | |
326 // UTF-16 surrogate values are illegal in UTF-32 | |
327 if (U_IS_SURROGATE(character)) { | |
328 if (strict) { | |
329 source -= utf8SequenceLength; // return to the illegal value
itself | |
330 result = sourceIllegal; | |
331 break; | |
332 } | |
333 *target++ = replacementCharacter; | |
334 orAllData |= replacementCharacter; | |
335 } else { | |
336 *target++ = static_cast<UChar>(character); // normal case | |
337 orAllData |= character; | |
338 } | |
339 } else if (U_IS_SUPPLEMENTARY(character)) { | |
340 // target is a character in range 0xFFFF - 0x10FFFF | |
341 if (target + 1 >= targetEnd) { | |
342 source -= utf8SequenceLength; // Back up source pointer! | |
343 result = targetExhausted; | |
344 break; | |
345 } | |
346 *target++ = U16_LEAD(character); | |
347 *target++ = U16_TRAIL(character); | |
348 orAllData = 0xffff; | |
349 } else { | |
350 if (strict) { | |
351 source -= utf8SequenceLength; // return to the start | |
352 result = sourceIllegal; | |
353 break; // Bail out; shouldn't continue | |
354 } else { | |
355 *target++ = replacementCharacter; | |
356 orAllData |= replacementCharacter; | |
357 } | |
358 } | |
359 } | |
360 *sourceStart = source; | |
361 *targetStart = target; | |
362 | |
363 if (sourceAllASCII) | |
364 *sourceAllASCII = !(orAllData & ~0x7f); | |
365 | |
366 return result; | |
367 } | |
368 | |
369 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec
k room is available. | |
370 static inline void putUTF8Triple(char*& buffer, UChar ch) | |
371 { | |
372 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | |
373 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | |
374 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | |
375 } | |
376 | |
377 } // namespace | |
378 | |
379 // static | |
380 String16 String16::fromInteger(int number) | |
381 { | |
382 const size_t kBufferSize = 50; | |
383 char buffer[kBufferSize]; | |
384 std::snprintf(buffer, kBufferSize, "%d", number); | |
385 return String16(buffer); | |
386 } | |
387 | |
388 // static | |
389 String16 String16::fromDouble(double number) | |
390 { | |
391 const size_t kBufferSize = 100; | |
392 char buffer[kBufferSize]; | |
393 std::snprintf(buffer, kBufferSize, "%f", number); | |
394 return String16(buffer); | |
395 } | |
396 | |
397 // static | |
398 String16 String16::fromDoublePrecision3(double number) | |
399 { | |
400 const size_t kBufferSize = 100; | |
401 char buffer[kBufferSize]; | |
402 std::snprintf(buffer, kBufferSize, "%.3g", number); | |
403 return String16(buffer); | |
404 } | |
405 | |
406 // static | |
407 String16 String16::fromDoublePrecision6(double number) | |
408 { | |
409 const size_t kBufferSize = 100; | |
410 char buffer[kBufferSize]; | |
411 std::snprintf(buffer, kBufferSize, "%.6g", number); | |
412 return String16(buffer); | |
413 } | |
414 | |
415 int String16::toInteger(bool* ok) const | |
416 { | |
417 return charactersToInteger(characters16(), length(), ok); | |
418 } | |
419 | |
420 String16 String16::stripWhiteSpace() const | |
421 { | |
422 if (!length()) | |
423 return String16(); | |
424 | |
425 unsigned start = 0; | |
426 unsigned end = length() - 1; | |
427 | |
428 // skip white space from start | |
429 while (start <= end && isSpaceOrNewLine(characters16()[start])) | |
430 ++start; | |
431 | |
432 // only white space | |
433 if (start > end) | |
434 return String16(); | |
435 | |
436 // skip white space from end | |
437 while (end && isSpaceOrNewLine(characters16()[end])) | |
438 --end; | |
439 | |
440 if (!start && end == length() - 1) | |
441 return *this; | |
442 return String16(characters16() + start, end + 1 - start); | |
443 } | |
444 | |
445 String16Builder::String16Builder() | |
446 { | |
447 } | |
448 | |
449 void String16Builder::append(const String16& s) | |
450 { | |
451 m_buffer.insert(m_buffer.end(), s.characters16(), s.characters16() + s.lengt
h()); | |
452 } | |
453 | |
454 void String16Builder::append(UChar c) | |
455 { | |
456 m_buffer.push_back(c); | |
457 } | |
458 | |
459 void String16Builder::append(char c) | |
460 { | |
461 UChar u = c; | |
462 m_buffer.push_back(u); | |
463 } | |
464 | |
465 void String16Builder::append(const UChar* characters, size_t length) | |
466 { | |
467 m_buffer.insert(m_buffer.end(), characters, characters + length); | |
468 } | |
469 | |
470 void String16Builder::append(const char* characters, size_t length) | |
471 { | |
472 m_buffer.insert(m_buffer.end(), characters, characters + length); | |
473 } | |
474 | |
475 String16 String16Builder::toString() | |
476 { | |
477 return String16(m_buffer.data(), m_buffer.size()); | |
478 } | |
479 | |
480 void String16Builder::reserveCapacity(size_t capacity) | |
481 { | |
482 m_buffer.reserve(capacity); | |
483 } | |
484 | |
485 String16 String16::fromUTF8(const char* stringStart, size_t length) | |
486 { | |
487 if (!stringStart || !length) | |
488 return String16(); | |
489 | |
490 std::vector<UChar> buffer(length); | |
491 UChar* bufferStart = buffer.data(); | |
492 | |
493 UChar* bufferCurrent = bufferStart; | |
494 const char* stringCurrent = stringStart; | |
495 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
bufferCurrent + buffer.size(), 0, true) != conversionOK) | |
496 return String16(); | |
497 | |
498 unsigned utf16Length = bufferCurrent - bufferStart; | |
499 return String16(bufferStart, utf16Length); | |
500 } | |
501 | |
502 std::string String16::utf8() const | |
503 { | |
504 unsigned length = this->length(); | |
505 | |
506 if (!length) | |
507 return std::string(""); | |
508 | |
509 // Allocate a buffer big enough to hold all the characters | |
510 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | |
511 // Optimization ideas, if we find this function is hot: | |
512 // * We could speculatively create a CStringBuffer to contain 'length' | |
513 // characters, and resize if necessary (i.e. if the buffer contains | |
514 // non-ascii characters). (Alternatively, scan the buffer first for | |
515 // ascii characters, so we know this will be sufficient). | |
516 // * We could allocate a CStringBuffer with an appropriate size to | |
517 // have a good chance of being able to write the string into the | |
518 // buffer without reallocing (say, 1.5 x length). | |
519 if (length > std::numeric_limits<unsigned>::max() / 3) | |
520 return std::string(); | |
521 std::vector<char> bufferVector(length * 3); | |
522 char* buffer = bufferVector.data(); | |
523 const UChar* characters = m_impl.data(); | |
524 | |
525 ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt
h, &buffer, buffer + bufferVector.size(), false); | |
526 DCHECK(result != targetExhausted); // (length * 3) should be sufficient for
any conversion | |
527 | |
528 // Only produced from strict conversion. | |
529 DCHECK(result != sourceIllegal); | |
530 | |
531 // Check for an unconverted high surrogate. | |
532 if (result == sourceExhausted) { | |
533 // This should be one unpaired high surrogate. Treat it the same | |
534 // was as an unpaired high surrogate would have been handled in | |
535 // the middle of a string with non-strict conversion - which is | |
536 // to say, simply encode it to UTF-8. | |
537 DCHECK((characters + 1) == (m_impl.data() + length)); | |
538 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); | |
539 // There should be room left, since one UChar hasn't been | |
540 // converted. | |
541 DCHECK((buffer + 3) <= (buffer + bufferVector.size())); | |
542 putUTF8Triple(buffer, *characters); | |
543 } | |
544 | |
545 return std::string(bufferVector.data(), buffer - bufferVector.data()); | |
546 } | |
547 | |
548 } // namespace v8_inspector | |
OLD | NEW |