OLD | NEW |
| (Empty) |
1 // Copyright 2016 the V8 project authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
#include "src/inspector/String16.h"

#include "src/base/platform/platform.h"
#include "src/inspector/ProtocolPlatform.h"

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <locale>
#include <string>
#include <vector>
16 | |
17 namespace v8_inspector { | |
18 | |
19 namespace { | |
20 | |
21 bool isASCII(UChar c) { return !(c & ~0x7F); } | |
22 | |
23 bool isSpaceOrNewLine(UChar c) { | |
24 return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9)); | |
25 } | |
26 | |
27 int charactersToInteger(const UChar* characters, size_t length, | |
28 bool* ok = nullptr) { | |
29 std::vector<char> buffer; | |
30 buffer.reserve(length + 1); | |
31 for (size_t i = 0; i < length; ++i) { | |
32 if (!isASCII(characters[i])) { | |
33 if (ok) *ok = false; | |
34 return 0; | |
35 } | |
36 buffer.push_back(static_cast<char>(characters[i])); | |
37 } | |
38 buffer.push_back('\0'); | |
39 | |
40 char* endptr; | |
41 int result = std::strtol(buffer.data(), &endptr, 10); | |
42 if (ok) *ok = !(*endptr); | |
43 return result; | |
44 } | |
45 | |
46 const UChar replacementCharacter = 0xFFFD; | |
47 using UChar32 = uint32_t; | |
48 | |
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by
// the lead byte |b0|, or 0 when |b0| is not a valid multi-byte lead byte
// (its top two bits are not both set, or it has five or more leading ones).
inline int inlineUTF8SequenceLengthNonASCII(char b0) {
  const unsigned lead = static_cast<unsigned char>(b0);
  if (lead < 0xC0) return 0;  // 0xxxxxxx or 10xxxxxx: not a lead byte.
  if (lead < 0xE0) return 2;  // 110xxxxx
  if (lead < 0xF0) return 3;  // 1110xxxx
  if (lead < 0xF8) return 4;  // 11110xxx
  return 0;                   // 11111xxx: never valid.
}
56 | |
57 inline int inlineUTF8SequenceLength(char b0) { | |
58 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); | |
59 } | |
60 | |
// Marker bits OR-ed into the first byte of an encoded UTF-8 sequence. The
// table is indexed by the total number of bytes in the sequence; index 0 is
// unused padding. Entries for 5- and 6-byte sequences exist for completeness
// only: *legal* UTF-8 never exceeds 4 bytes.
static const unsigned char firstByteMark[7] = {
    0x00,  // (unused)
    0x00,  // 1 byte:  0xxxxxxx
    0xC0,  // 2 bytes: 110xxxxx
    0xE0,  // 3 bytes: 1110xxxx
    0xF0,  // 4 bytes: 11110xxx
    0xF8,  // 5 bytes: 111110xx
    0xFC,  // 6 bytes: 1111110x
};
68 | |
// Outcome of a UTF-8 <-> UTF-16 conversion pass.
enum ConversionResult {
  // The whole source range was converted.
  conversionOK,
  // The source ended in the middle of a character.
  sourceExhausted,
  // The target buffer has insufficient room for the next character.
  targetExhausted,
  // The source contains an illegal or malformed sequence.
  sourceIllegal
};
75 | |
76 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, | |
77 const UChar* sourceEnd, char** targetStart, | |
78 char* targetEnd, bool strict) { | |
79 ConversionResult result = conversionOK; | |
80 const UChar* source = *sourceStart; | |
81 char* target = *targetStart; | |
82 while (source < sourceEnd) { | |
83 UChar32 ch; | |
84 unsigned short bytesToWrite = 0; | |
85 const UChar32 byteMask = 0xBF; | |
86 const UChar32 byteMark = 0x80; | |
87 const UChar* oldSource = | |
88 source; // In case we have to back up because of target overflow. | |
89 ch = static_cast<unsigned short>(*source++); | |
90 // If we have a surrogate pair, convert to UChar32 first. | |
91 if (ch >= 0xD800 && ch <= 0xDBFF) { | |
92 // If the 16 bits following the high surrogate are in the source buffer... | |
93 if (source < sourceEnd) { | |
94 UChar32 ch2 = static_cast<unsigned short>(*source); | |
95 // If it's a low surrogate, convert to UChar32. | |
96 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | |
97 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; | |
98 ++source; | |
99 } else if (strict) { // it's an unpaired high surrogate | |
100 --source; // return to the illegal value itself | |
101 result = sourceIllegal; | |
102 break; | |
103 } | |
104 } else { // We don't have the 16 bits following the high surrogate. | |
105 --source; // return to the high surrogate | |
106 result = sourceExhausted; | |
107 break; | |
108 } | |
109 } else if (strict) { | |
110 // UTF-16 surrogate values are illegal in UTF-32 | |
111 if (ch >= 0xDC00 && ch <= 0xDFFF) { | |
112 --source; // return to the illegal value itself | |
113 result = sourceIllegal; | |
114 break; | |
115 } | |
116 } | |
117 // Figure out how many bytes the result will require | |
118 if (ch < (UChar32)0x80) { | |
119 bytesToWrite = 1; | |
120 } else if (ch < (UChar32)0x800) { | |
121 bytesToWrite = 2; | |
122 } else if (ch < (UChar32)0x10000) { | |
123 bytesToWrite = 3; | |
124 } else if (ch < (UChar32)0x110000) { | |
125 bytesToWrite = 4; | |
126 } else { | |
127 bytesToWrite = 3; | |
128 ch = replacementCharacter; | |
129 } | |
130 | |
131 target += bytesToWrite; | |
132 if (target > targetEnd) { | |
133 source = oldSource; // Back up source pointer! | |
134 target -= bytesToWrite; | |
135 result = targetExhausted; | |
136 break; | |
137 } | |
138 switch (bytesToWrite) { // note: everything falls through. | |
139 case 4: | |
140 *--target = (char)((ch | byteMark) & byteMask); | |
141 ch >>= 6; | |
142 case 3: | |
143 *--target = (char)((ch | byteMark) & byteMask); | |
144 ch >>= 6; | |
145 case 2: | |
146 *--target = (char)((ch | byteMark) & byteMask); | |
147 ch >>= 6; | |
148 case 1: | |
149 *--target = (char)(ch | firstByteMark[bytesToWrite]); | |
150 } | |
151 target += bytesToWrite; | |
152 } | |
153 *sourceStart = source; | |
154 *targetStart = target; | |
155 return result; | |
156 } | |
157 | |
// Code point classification and surrogate helpers, modelled on the ICU
// macros of the same names.

// True when code point |c| lies in the Basic Multilingual Plane
// (U+0000..U+FFFF).
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xffff)

// True when code point |c| is a supplementary code point
// (U+10000..U+10FFFF).
#define U_IS_SUPPLEMENTARY(c) \
  (static_cast<uint32_t>((c)-0x10000) <= 0xfffff)

// True when code point |c| is a surrogate (U+D800..U+DFFF).
#define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)

// Lead surrogate (U+D800..U+DBFF) for a supplementary code point
// (U+10000..U+10FFFF).
#define U16_LEAD(supplementary) \
  (static_cast<UChar>(((supplementary) >> 10) + 0xd7c0))

// Trail surrogate (U+DC00..U+DFFF) for a supplementary code point
// (U+10000..U+10FFFF).
#define U16_TRAIL(supplementary) \
  (static_cast<UChar>(((supplementary)&0x3ff) | 0xdc00))
199 | |
// Checks whether the |length| bytes at |source| form one well-formed UTF-8
// sequence. This must be called with the length pre-determined by the first
// byte; it does not verify that the lead byte announces exactly |length|
// bytes. The Unicode definition of UTF-8 goes up to 4-byte sequences, so any
// length outside 1..4 yields false.
//
// Rejected in particular: overlong encodings (lead bytes 0xC0/0xC1, or
// 0xE0/0xF0 with a too-small second byte), encoded surrogates (0xED followed
// by a byte above 0x9F), and values above U+10FFFF (0xF4 followed by a byte
// above 0x8F, or a lead byte above 0xF4).
static bool isLegalUTF8(const unsigned char* source, int length) {
  if (length < 1 || length > 4) return false;

  const unsigned char lead = source[0];

  // Third and fourth bytes (when present) are plain continuation bytes.
  for (int i = length - 1; i >= 2; --i) {
    if (source[i] < 0x80 || source[i] > 0xBF) return false;
  }

  if (length >= 2) {
    // The second byte is a continuation byte whose valid range is narrowed
    // for some lead bytes. The lower bound of 0x80 applies to every lead
    // byte, including 0xED and 0xF4, which only tighten the upper bound.
    unsigned char lowest = 0x80;
    unsigned char highest = 0xBF;
    switch (lead) {
      case 0xE0:
        lowest = 0xA0;  // Below this is an overlong 3-byte form.
        break;
      case 0xED:
        highest = 0x9F;  // Above this encodes a surrogate.
        break;
      case 0xF0:
        lowest = 0x90;  // Below this is an overlong 4-byte form.
        break;
      case 0xF4:
        highest = 0x8F;  // Above this exceeds U+10FFFF.
        break;
      default:
        break;
    }
    if (source[1] < lowest || source[1] > highest) return false;
  }

  // A lead byte in 0x80..0xC1 is a stray continuation byte or the start of
  // an overlong 2-byte form.
  if (lead >= 0x80 && lead < 0xC2) return false;
  return lead <= 0xF4;
}
241 | |
242 // Magic values subtracted from a buffer value during UTF8 conversion. | |
243 // This table contains as many values as there might be trailing bytes | |
244 // in a UTF-8 sequence. | |
245 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL, | |
246 0x00003080UL, | |
247 0x000E2080UL, | |
248 0x03C82080UL, | |
249 static_cast<UChar32>(0xFA082080UL), | |
250 static_cast<UChar32>(0x82082080UL)}; | |
251 | |
252 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) { | |
253 UChar32 character = 0; | |
254 | |
255 // The cases all fall through. | |
256 switch (length) { | |
257 case 6: | |
258 character += static_cast<unsigned char>(*sequence++); | |
259 character <<= 6; | |
260 case 5: | |
261 character += static_cast<unsigned char>(*sequence++); | |
262 character <<= 6; | |
263 case 4: | |
264 character += static_cast<unsigned char>(*sequence++); | |
265 character <<= 6; | |
266 case 3: | |
267 character += static_cast<unsigned char>(*sequence++); | |
268 character <<= 6; | |
269 case 2: | |
270 character += static_cast<unsigned char>(*sequence++); | |
271 character <<= 6; | |
272 case 1: | |
273 character += static_cast<unsigned char>(*sequence++); | |
274 } | |
275 | |
276 return character - offsetsFromUTF8[length - 1]; | |
277 } | |
278 | |
279 ConversionResult convertUTF8ToUTF16(const char** sourceStart, | |
280 const char* sourceEnd, UChar** targetStart, | |
281 UChar* targetEnd, bool* sourceAllASCII, | |
282 bool strict) { | |
283 ConversionResult result = conversionOK; | |
284 const char* source = *sourceStart; | |
285 UChar* target = *targetStart; | |
286 UChar orAllData = 0; | |
287 while (source < sourceEnd) { | |
288 int utf8SequenceLength = inlineUTF8SequenceLength(*source); | |
289 if (sourceEnd - source < utf8SequenceLength) { | |
290 result = sourceExhausted; | |
291 break; | |
292 } | |
293 // Do this check whether lenient or strict | |
294 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), | |
295 utf8SequenceLength)) { | |
296 result = sourceIllegal; | |
297 break; | |
298 } | |
299 | |
300 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); | |
301 | |
302 if (target >= targetEnd) { | |
303 source -= utf8SequenceLength; // Back up source pointer! | |
304 result = targetExhausted; | |
305 break; | |
306 } | |
307 | |
308 if (U_IS_BMP(character)) { | |
309 // UTF-16 surrogate values are illegal in UTF-32 | |
310 if (U_IS_SURROGATE(character)) { | |
311 if (strict) { | |
312 source -= utf8SequenceLength; // return to the illegal value itself | |
313 result = sourceIllegal; | |
314 break; | |
315 } | |
316 *target++ = replacementCharacter; | |
317 orAllData |= replacementCharacter; | |
318 } else { | |
319 *target++ = static_cast<UChar>(character); // normal case | |
320 orAllData |= character; | |
321 } | |
322 } else if (U_IS_SUPPLEMENTARY(character)) { | |
323 // target is a character in range 0xFFFF - 0x10FFFF | |
324 if (target + 1 >= targetEnd) { | |
325 source -= utf8SequenceLength; // Back up source pointer! | |
326 result = targetExhausted; | |
327 break; | |
328 } | |
329 *target++ = U16_LEAD(character); | |
330 *target++ = U16_TRAIL(character); | |
331 orAllData = 0xffff; | |
332 } else { | |
333 if (strict) { | |
334 source -= utf8SequenceLength; // return to the start | |
335 result = sourceIllegal; | |
336 break; // Bail out; shouldn't continue | |
337 } else { | |
338 *target++ = replacementCharacter; | |
339 orAllData |= replacementCharacter; | |
340 } | |
341 } | |
342 } | |
343 *sourceStart = source; | |
344 *targetStart = target; | |
345 | |
346 if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f); | |
347 | |
348 return result; | |
349 } | |
350 | |
351 // Helper to write a three-byte UTF-8 code point to the buffer, caller must | |
352 // check room is available. | |
353 static inline void putUTF8Triple(char*& buffer, UChar ch) { | |
354 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | |
355 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | |
356 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | |
357 } | |
358 | |
359 } // namespace | |
360 | |
361 // static | |
362 String16 String16::fromInteger(int number) { | |
363 const size_t kBufferSize = 50; | |
364 char buffer[kBufferSize]; | |
365 v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number); | |
366 return String16(buffer); | |
367 } | |
368 | |
369 // static | |
370 String16 String16::fromDouble(double number) { | |
371 const size_t kBufferSize = 100; | |
372 char buffer[kBufferSize]; | |
373 v8::base::OS::SNPrintF(buffer, kBufferSize, "%f", number); | |
374 return String16(buffer); | |
375 } | |
376 | |
377 // static | |
378 String16 String16::fromDoublePrecision3(double number) { | |
379 const size_t kBufferSize = 100; | |
380 char buffer[kBufferSize]; | |
381 v8::base::OS::SNPrintF(buffer, kBufferSize, "%.3g", number); | |
382 return String16(buffer); | |
383 } | |
384 | |
385 // static | |
386 String16 String16::fromDoublePrecision6(double number) { | |
387 const size_t kBufferSize = 100; | |
388 char buffer[kBufferSize]; | |
389 v8::base::OS::SNPrintF(buffer, kBufferSize, "%.6g", number); | |
390 return String16(buffer); | |
391 } | |
392 | |
393 int String16::toInteger(bool* ok) const { | |
394 return charactersToInteger(characters16(), length(), ok); | |
395 } | |
396 | |
397 String16 String16::stripWhiteSpace() const { | |
398 if (!length()) return String16(); | |
399 | |
400 unsigned start = 0; | |
401 unsigned end = length() - 1; | |
402 | |
403 // skip white space from start | |
404 while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start; | |
405 | |
406 // only white space | |
407 if (start > end) return String16(); | |
408 | |
409 // skip white space from end | |
410 while (end && isSpaceOrNewLine(characters16()[end])) --end; | |
411 | |
412 if (!start && end == length() - 1) return *this; | |
413 return String16(characters16() + start, end + 1 - start); | |
414 } | |
415 | |
416 String16Builder::String16Builder() {} | |
417 | |
418 void String16Builder::append(const String16& s) { | |
419 m_buffer.insert(m_buffer.end(), s.characters16(), | |
420 s.characters16() + s.length()); | |
421 } | |
422 | |
423 void String16Builder::append(UChar c) { m_buffer.push_back(c); } | |
424 | |
425 void String16Builder::append(char c) { | |
426 UChar u = c; | |
427 m_buffer.push_back(u); | |
428 } | |
429 | |
430 void String16Builder::append(const UChar* characters, size_t length) { | |
431 m_buffer.insert(m_buffer.end(), characters, characters + length); | |
432 } | |
433 | |
434 void String16Builder::append(const char* characters, size_t length) { | |
435 m_buffer.insert(m_buffer.end(), characters, characters + length); | |
436 } | |
437 | |
438 String16 String16Builder::toString() { | |
439 return String16(m_buffer.data(), m_buffer.size()); | |
440 } | |
441 | |
442 void String16Builder::reserveCapacity(size_t capacity) { | |
443 m_buffer.reserve(capacity); | |
444 } | |
445 | |
446 String16 String16::fromUTF8(const char* stringStart, size_t length) { | |
447 if (!stringStart || !length) return String16(); | |
448 | |
449 std::vector<UChar> buffer(length); | |
450 UChar* bufferStart = buffer.data(); | |
451 | |
452 UChar* bufferCurrent = bufferStart; | |
453 const char* stringCurrent = stringStart; | |
454 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, | |
455 bufferCurrent + buffer.size(), 0, | |
456 true) != conversionOK) | |
457 return String16(); | |
458 | |
459 unsigned utf16Length = bufferCurrent - bufferStart; | |
460 return String16(bufferStart, utf16Length); | |
461 } | |
462 | |
463 std::string String16::utf8() const { | |
464 unsigned length = this->length(); | |
465 | |
466 if (!length) return std::string(""); | |
467 | |
468 // Allocate a buffer big enough to hold all the characters | |
469 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | |
470 // Optimization ideas, if we find this function is hot: | |
471 // * We could speculatively create a CStringBuffer to contain 'length' | |
472 // characters, and resize if necessary (i.e. if the buffer contains | |
473 // non-ascii characters). (Alternatively, scan the buffer first for | |
474 // ascii characters, so we know this will be sufficient). | |
475 // * We could allocate a CStringBuffer with an appropriate size to | |
476 // have a good chance of being able to write the string into the | |
477 // buffer without reallocing (say, 1.5 x length). | |
478 if (length > std::numeric_limits<unsigned>::max() / 3) return std::string(); | |
479 std::vector<char> bufferVector(length * 3); | |
480 char* buffer = bufferVector.data(); | |
481 const UChar* characters = m_impl.data(); | |
482 | |
483 ConversionResult result = | |
484 convertUTF16ToUTF8(&characters, characters + length, &buffer, | |
485 buffer + bufferVector.size(), false); | |
486 DCHECK( | |
487 result != | |
488 targetExhausted); // (length * 3) should be sufficient for any conversion | |
489 | |
490 // Only produced from strict conversion. | |
491 DCHECK(result != sourceIllegal); | |
492 | |
493 // Check for an unconverted high surrogate. | |
494 if (result == sourceExhausted) { | |
495 // This should be one unpaired high surrogate. Treat it the same | |
496 // was as an unpaired high surrogate would have been handled in | |
497 // the middle of a string with non-strict conversion - which is | |
498 // to say, simply encode it to UTF-8. | |
499 DCHECK((characters + 1) == (m_impl.data() + length)); | |
500 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); | |
501 // There should be room left, since one UChar hasn't been | |
502 // converted. | |
503 DCHECK((buffer + 3) <= (buffer + bufferVector.size())); | |
504 putUTF8Triple(buffer, *characters); | |
505 } | |
506 | |
507 return std::string(bufferVector.data(), buffer - bufferVector.data()); | |
508 } | |
509 | |
510 } // namespace v8_inspector | |
OLD | NEW |