OLD | NEW |
(Empty) | |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "src/inspector/String16.h" |
| 6 |
| 7 #include "src/inspector/ProtocolPlatform.h" |
| 8 |
| 9 #include <algorithm> |
| 10 #include <cctype> |
| 11 #include <cstdio> |
| 12 #include <cstdlib> |
| 13 #include <cstring> |
| 14 #include <locale> |
| 15 #include <string> |
| 16 |
| 17 namespace v8_inspector { |
| 18 |
| 19 namespace { |
| 20 |
| 21 bool isASCII(UChar c) { return !(c & ~0x7F); } |
| 22 |
| 23 bool isSpaceOrNewLine(UChar c) { |
| 24 return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9)); |
| 25 } |
| 26 |
| 27 int charactersToInteger(const UChar* characters, size_t length, |
| 28 bool* ok = nullptr) { |
| 29 std::vector<char> buffer; |
| 30 buffer.reserve(length + 1); |
| 31 for (size_t i = 0; i < length; ++i) { |
| 32 if (!isASCII(characters[i])) { |
| 33 if (ok) *ok = false; |
| 34 return 0; |
| 35 } |
| 36 buffer.push_back(static_cast<char>(characters[i])); |
| 37 } |
| 38 buffer.push_back('\0'); |
| 39 |
| 40 char* endptr; |
| 41 int result = std::strtol(buffer.data(), &endptr, 10); |
| 42 if (ok) *ok = !(*endptr); |
| 43 return result; |
| 44 } |
| 45 |
| 46 const UChar replacementCharacter = 0xFFFD; |
| 47 using UChar32 = uint32_t; |
| 48 |
// Maps a non-ASCII UTF-8 lead byte to the total length of its sequence
// (2, 3 or 4). Returns 0 when |b0| is not a valid lead byte for such a
// sequence, i.e. its top two bits are not both set or it has more than four
// leading one bits.
inline int inlineUTF8SequenceLengthNonASCII(char b0) {
  const unsigned char lead = static_cast<unsigned char>(b0);
  if (lead < 0xC0) return 0;  // 0xxxxxxx or 10xxxxxx
  if (lead < 0xE0) return 2;  // 110xxxxx
  if (lead < 0xF0) return 3;  // 1110xxxx
  if (lead < 0xF8) return 4;  // 11110xxx
  return 0;                   // 11111xxx
}
| 56 |
| 57 inline int inlineUTF8SequenceLength(char b0) { |
| 58 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); |
| 59 } |
| 60 |
// Marker bits OR-ed into the lead byte of an encoded UTF-8 sequence, indexed
// by the total number of bytes in that sequence. The table has one entry per
// UTF-8 sequence type (one byte, two bytes, ...); *legal* UTF-8 sequences are
// never longer than 4 bytes.
static const unsigned char firstByteMark[7] = {
    0x00,  // index 0
    0x00,  // 1-byte sequence
    0xC0,  // 2-byte sequence
    0xE0,  // 3-byte sequence
    0xF0,  // 4-byte sequence
    0xF8,  // 5-byte sequence
    0xFC,  // 6-byte sequence
};
| 68 |
// Outcome reported by the UTF-8 <-> UTF-16 conversion routines.
enum ConversionResult {
  // The conversion completed successfully.
  conversionOK,
  // The source ended in the middle of a character.
  sourceExhausted,
  // The target buffer has insufficient room for the conversion.
  targetExhausted,
  // The source contains an illegal or malformed sequence.
  sourceIllegal
};
| 75 |
| 76 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, |
| 77 const UChar* sourceEnd, char** targetStart, |
| 78 char* targetEnd, bool strict) { |
| 79 ConversionResult result = conversionOK; |
| 80 const UChar* source = *sourceStart; |
| 81 char* target = *targetStart; |
| 82 while (source < sourceEnd) { |
| 83 UChar32 ch; |
| 84 unsigned short bytesToWrite = 0; |
| 85 const UChar32 byteMask = 0xBF; |
| 86 const UChar32 byteMark = 0x80; |
| 87 const UChar* oldSource = |
| 88 source; // In case we have to back up because of target overflow. |
| 89 ch = static_cast<unsigned short>(*source++); |
| 90 // If we have a surrogate pair, convert to UChar32 first. |
| 91 if (ch >= 0xD800 && ch <= 0xDBFF) { |
| 92 // If the 16 bits following the high surrogate are in the source buffer... |
| 93 if (source < sourceEnd) { |
| 94 UChar32 ch2 = static_cast<unsigned short>(*source); |
| 95 // If it's a low surrogate, convert to UChar32. |
| 96 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
| 97 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; |
| 98 ++source; |
| 99 } else if (strict) { // it's an unpaired high surrogate |
| 100 --source; // return to the illegal value itself |
| 101 result = sourceIllegal; |
| 102 break; |
| 103 } |
| 104 } else { // We don't have the 16 bits following the high surrogate. |
| 105 --source; // return to the high surrogate |
| 106 result = sourceExhausted; |
| 107 break; |
| 108 } |
| 109 } else if (strict) { |
| 110 // UTF-16 surrogate values are illegal in UTF-32 |
| 111 if (ch >= 0xDC00 && ch <= 0xDFFF) { |
| 112 --source; // return to the illegal value itself |
| 113 result = sourceIllegal; |
| 114 break; |
| 115 } |
| 116 } |
| 117 // Figure out how many bytes the result will require |
| 118 if (ch < (UChar32)0x80) { |
| 119 bytesToWrite = 1; |
| 120 } else if (ch < (UChar32)0x800) { |
| 121 bytesToWrite = 2; |
| 122 } else if (ch < (UChar32)0x10000) { |
| 123 bytesToWrite = 3; |
| 124 } else if (ch < (UChar32)0x110000) { |
| 125 bytesToWrite = 4; |
| 126 } else { |
| 127 bytesToWrite = 3; |
| 128 ch = replacementCharacter; |
| 129 } |
| 130 |
| 131 target += bytesToWrite; |
| 132 if (target > targetEnd) { |
| 133 source = oldSource; // Back up source pointer! |
| 134 target -= bytesToWrite; |
| 135 result = targetExhausted; |
| 136 break; |
| 137 } |
| 138 switch (bytesToWrite) { // note: everything falls through. |
| 139 case 4: |
| 140 *--target = (char)((ch | byteMark) & byteMask); |
| 141 ch >>= 6; |
| 142 case 3: |
| 143 *--target = (char)((ch | byteMark) & byteMask); |
| 144 ch >>= 6; |
| 145 case 2: |
| 146 *--target = (char)((ch | byteMark) & byteMask); |
| 147 ch >>= 6; |
| 148 case 1: |
| 149 *--target = (char)(ch | firstByteMark[bytesToWrite]); |
| 150 } |
| 151 target += bytesToWrite; |
| 152 } |
| 153 *sourceStart = source; |
| 154 *targetStart = target; |
| 155 return result; |
| 156 } |
| 157 |
/**
 * True when the code point lies in the Basic Multilingual Plane
 * (U+0000..U+ffff).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff)

/**
 * True when the code point is supplementary (U+10000..U+10ffff).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000) <= 0xfffff)

/**
 * True when the code point is a surrogate (U+d800..U+dfff).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)

/**
 * Lead surrogate (0xd800..0xdbff) of a supplementary code point
 * (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0)

/**
 * Trail surrogate (0xdc00..0xdfff) of a supplementary code point
 * (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff) | 0xdc00)
| 199 |
// Checks whether the |length| bytes starting at |source| form one well-formed
// UTF-8 sequence.
//
// |length| must be the length implied by the lead byte. Any length outside
// 1..4 yields false without reading |source|: the Unicode definition of UTF-8
// goes up to 4-byte sequences only.
//
// Rejected: stray continuation bytes, overlong forms (lead bytes 0x80..0xC1,
// 0xE0 followed by a byte below 0xA0, 0xF0 followed by a byte below 0x90),
// surrogates (0xED followed by a byte above 0x9F) and values beyond U+10FFFF
// (0xF4 followed by a byte above 0x8F, or any lead byte above 0xF4).
static bool isLegalUTF8(const unsigned char* source, int length) {
  if (length < 1 || length > 4) return false;

  const unsigned char lead = source[0];

  // The third and fourth bytes, when present, are plain continuation bytes.
  for (int i = length - 1; i >= 2; --i) {
    if (source[i] < 0x80 || source[i] > 0xBF) return false;
  }

  if (length >= 2) {
    // The second byte is a continuation byte whose permitted range is
    // narrowed for four particular lead bytes. Both bounds are always
    // enforced, so 0xED and 0xF4 cannot be followed by a byte below 0x80.
    unsigned char lowest = 0x80;
    unsigned char highest = 0xBF;
    switch (lead) {
      case 0xE0:
        lowest = 0xA0;
        break;
      case 0xED:
        highest = 0x9F;
        break;
      case 0xF0:
        lowest = 0x90;
        break;
      case 0xF4:
        highest = 0x8F;
        break;
      default:
        break;
    }
    if (source[1] < lowest || source[1] > highest) return false;
  }

  // A continuation byte or an overlong two-byte lead can never start a
  // sequence.
  if (lead >= 0x80 && lead < 0xC2) return false;
  return lead <= 0xF4;
}
| 241 |
| 242 // Magic values subtracted from a buffer value during UTF8 conversion. |
| 243 // This table contains as many values as there might be trailing bytes |
| 244 // in a UTF-8 sequence. |
| 245 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL, |
| 246 0x00003080UL, |
| 247 0x000E2080UL, |
| 248 0x03C82080UL, |
| 249 static_cast<UChar32>(0xFA082080UL), |
| 250 static_cast<UChar32>(0x82082080UL)}; |
| 251 |
| 252 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) { |
| 253 UChar32 character = 0; |
| 254 |
| 255 // The cases all fall through. |
| 256 switch (length) { |
| 257 case 6: |
| 258 character += static_cast<unsigned char>(*sequence++); |
| 259 character <<= 6; |
| 260 case 5: |
| 261 character += static_cast<unsigned char>(*sequence++); |
| 262 character <<= 6; |
| 263 case 4: |
| 264 character += static_cast<unsigned char>(*sequence++); |
| 265 character <<= 6; |
| 266 case 3: |
| 267 character += static_cast<unsigned char>(*sequence++); |
| 268 character <<= 6; |
| 269 case 2: |
| 270 character += static_cast<unsigned char>(*sequence++); |
| 271 character <<= 6; |
| 272 case 1: |
| 273 character += static_cast<unsigned char>(*sequence++); |
| 274 } |
| 275 |
| 276 return character - offsetsFromUTF8[length - 1]; |
| 277 } |
| 278 |
| 279 ConversionResult convertUTF8ToUTF16(const char** sourceStart, |
| 280 const char* sourceEnd, UChar** targetStart, |
| 281 UChar* targetEnd, bool* sourceAllASCII, |
| 282 bool strict) { |
| 283 ConversionResult result = conversionOK; |
| 284 const char* source = *sourceStart; |
| 285 UChar* target = *targetStart; |
| 286 UChar orAllData = 0; |
| 287 while (source < sourceEnd) { |
| 288 int utf8SequenceLength = inlineUTF8SequenceLength(*source); |
| 289 if (sourceEnd - source < utf8SequenceLength) { |
| 290 result = sourceExhausted; |
| 291 break; |
| 292 } |
| 293 // Do this check whether lenient or strict |
| 294 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), |
| 295 utf8SequenceLength)) { |
| 296 result = sourceIllegal; |
| 297 break; |
| 298 } |
| 299 |
| 300 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); |
| 301 |
| 302 if (target >= targetEnd) { |
| 303 source -= utf8SequenceLength; // Back up source pointer! |
| 304 result = targetExhausted; |
| 305 break; |
| 306 } |
| 307 |
| 308 if (U_IS_BMP(character)) { |
| 309 // UTF-16 surrogate values are illegal in UTF-32 |
| 310 if (U_IS_SURROGATE(character)) { |
| 311 if (strict) { |
| 312 source -= utf8SequenceLength; // return to the illegal value itself |
| 313 result = sourceIllegal; |
| 314 break; |
| 315 } |
| 316 *target++ = replacementCharacter; |
| 317 orAllData |= replacementCharacter; |
| 318 } else { |
| 319 *target++ = static_cast<UChar>(character); // normal case |
| 320 orAllData |= character; |
| 321 } |
| 322 } else if (U_IS_SUPPLEMENTARY(character)) { |
| 323 // target is a character in range 0xFFFF - 0x10FFFF |
| 324 if (target + 1 >= targetEnd) { |
| 325 source -= utf8SequenceLength; // Back up source pointer! |
| 326 result = targetExhausted; |
| 327 break; |
| 328 } |
| 329 *target++ = U16_LEAD(character); |
| 330 *target++ = U16_TRAIL(character); |
| 331 orAllData = 0xffff; |
| 332 } else { |
| 333 if (strict) { |
| 334 source -= utf8SequenceLength; // return to the start |
| 335 result = sourceIllegal; |
| 336 break; // Bail out; shouldn't continue |
| 337 } else { |
| 338 *target++ = replacementCharacter; |
| 339 orAllData |= replacementCharacter; |
| 340 } |
| 341 } |
| 342 } |
| 343 *sourceStart = source; |
| 344 *targetStart = target; |
| 345 |
| 346 if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f); |
| 347 |
| 348 return result; |
| 349 } |
| 350 |
| 351 // Helper to write a three-byte UTF-8 code point to the buffer, caller must |
| 352 // check room is available. |
| 353 static inline void putUTF8Triple(char*& buffer, UChar ch) { |
| 354 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); |
| 355 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); |
| 356 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); |
| 357 } |
| 358 |
| 359 } // namespace |
| 360 |
| 361 // static |
| 362 String16 String16::fromInteger(int number) { |
| 363 const size_t kBufferSize = 50; |
| 364 char buffer[kBufferSize]; |
| 365 std::snprintf(buffer, kBufferSize, "%d", number); |
| 366 return String16(buffer); |
| 367 } |
| 368 |
| 369 // static |
| 370 String16 String16::fromDouble(double number) { |
| 371 const size_t kBufferSize = 100; |
| 372 char buffer[kBufferSize]; |
| 373 std::snprintf(buffer, kBufferSize, "%f", number); |
| 374 return String16(buffer); |
| 375 } |
| 376 |
| 377 // static |
| 378 String16 String16::fromDoublePrecision3(double number) { |
| 379 const size_t kBufferSize = 100; |
| 380 char buffer[kBufferSize]; |
| 381 std::snprintf(buffer, kBufferSize, "%.3g", number); |
| 382 return String16(buffer); |
| 383 } |
| 384 |
| 385 // static |
| 386 String16 String16::fromDoublePrecision6(double number) { |
| 387 const size_t kBufferSize = 100; |
| 388 char buffer[kBufferSize]; |
| 389 std::snprintf(buffer, kBufferSize, "%.6g", number); |
| 390 return String16(buffer); |
| 391 } |
| 392 |
| 393 int String16::toInteger(bool* ok) const { |
| 394 return charactersToInteger(characters16(), length(), ok); |
| 395 } |
| 396 |
| 397 String16 String16::stripWhiteSpace() const { |
| 398 if (!length()) return String16(); |
| 399 |
| 400 unsigned start = 0; |
| 401 unsigned end = length() - 1; |
| 402 |
| 403 // skip white space from start |
| 404 while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start; |
| 405 |
| 406 // only white space |
| 407 if (start > end) return String16(); |
| 408 |
| 409 // skip white space from end |
| 410 while (end && isSpaceOrNewLine(characters16()[end])) --end; |
| 411 |
| 412 if (!start && end == length() - 1) return *this; |
| 413 return String16(characters16() + start, end + 1 - start); |
| 414 } |
| 415 |
| 416 String16Builder::String16Builder() {} |
| 417 |
| 418 void String16Builder::append(const String16& s) { |
| 419 m_buffer.insert(m_buffer.end(), s.characters16(), |
| 420 s.characters16() + s.length()); |
| 421 } |
| 422 |
| 423 void String16Builder::append(UChar c) { m_buffer.push_back(c); } |
| 424 |
| 425 void String16Builder::append(char c) { |
| 426 UChar u = c; |
| 427 m_buffer.push_back(u); |
| 428 } |
| 429 |
| 430 void String16Builder::append(const UChar* characters, size_t length) { |
| 431 m_buffer.insert(m_buffer.end(), characters, characters + length); |
| 432 } |
| 433 |
| 434 void String16Builder::append(const char* characters, size_t length) { |
| 435 m_buffer.insert(m_buffer.end(), characters, characters + length); |
| 436 } |
| 437 |
| 438 String16 String16Builder::toString() { |
| 439 return String16(m_buffer.data(), m_buffer.size()); |
| 440 } |
| 441 |
| 442 void String16Builder::reserveCapacity(size_t capacity) { |
| 443 m_buffer.reserve(capacity); |
| 444 } |
| 445 |
| 446 String16 String16::fromUTF8(const char* stringStart, size_t length) { |
| 447 if (!stringStart || !length) return String16(); |
| 448 |
| 449 std::vector<UChar> buffer(length); |
| 450 UChar* bufferStart = buffer.data(); |
| 451 |
| 452 UChar* bufferCurrent = bufferStart; |
| 453 const char* stringCurrent = stringStart; |
| 454 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, |
| 455 bufferCurrent + buffer.size(), 0, |
| 456 true) != conversionOK) |
| 457 return String16(); |
| 458 |
| 459 unsigned utf16Length = bufferCurrent - bufferStart; |
| 460 return String16(bufferStart, utf16Length); |
| 461 } |
| 462 |
| 463 std::string String16::utf8() const { |
| 464 unsigned length = this->length(); |
| 465 |
| 466 if (!length) return std::string(""); |
| 467 |
| 468 // Allocate a buffer big enough to hold all the characters |
| 469 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). |
| 470 // Optimization ideas, if we find this function is hot: |
| 471 // * We could speculatively create a CStringBuffer to contain 'length' |
| 472 // characters, and resize if necessary (i.e. if the buffer contains |
| 473 // non-ascii characters). (Alternatively, scan the buffer first for |
| 474 // ascii characters, so we know this will be sufficient). |
| 475 // * We could allocate a CStringBuffer with an appropriate size to |
| 476 // have a good chance of being able to write the string into the |
| 477 // buffer without reallocing (say, 1.5 x length). |
| 478 if (length > std::numeric_limits<unsigned>::max() / 3) return std::string(); |
| 479 std::vector<char> bufferVector(length * 3); |
| 480 char* buffer = bufferVector.data(); |
| 481 const UChar* characters = m_impl.data(); |
| 482 |
| 483 ConversionResult result = |
| 484 convertUTF16ToUTF8(&characters, characters + length, &buffer, |
| 485 buffer + bufferVector.size(), false); |
| 486 DCHECK( |
| 487 result != |
| 488 targetExhausted); // (length * 3) should be sufficient for any conversion |
| 489 |
| 490 // Only produced from strict conversion. |
| 491 DCHECK(result != sourceIllegal); |
| 492 |
| 493 // Check for an unconverted high surrogate. |
| 494 if (result == sourceExhausted) { |
| 495 // This should be one unpaired high surrogate. Treat it the same |
| 496 // was as an unpaired high surrogate would have been handled in |
| 497 // the middle of a string with non-strict conversion - which is |
| 498 // to say, simply encode it to UTF-8. |
| 499 DCHECK((characters + 1) == (m_impl.data() + length)); |
| 500 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); |
| 501 // There should be room left, since one UChar hasn't been |
| 502 // converted. |
| 503 DCHECK((buffer + 3) <= (buffer + bufferVector.size())); |
| 504 putUTF8Triple(buffer, *characters); |
| 505 } |
| 506 |
| 507 return std::string(bufferVector.data(), buffer - bufferVector.data()); |
| 508 } |
| 509 |
| 510 } // namespace v8_inspector |
OLD | NEW |