| OLD | NEW |
| (Empty) |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
#include "src/inspector/String16.h"

#include "src/inspector/ProtocolPlatform.h"

#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <locale>
#include <string>
#include <vector>
| 16 | |
| 17 namespace v8_inspector { | |
| 18 | |
| 19 namespace { | |
| 20 | |
| 21 bool isASCII(UChar c) | |
| 22 { | |
| 23 return !(c & ~0x7F); | |
| 24 } | |
| 25 | |
| 26 bool isSpaceOrNewLine(UChar c) | |
| 27 { | |
| 28 return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9)); | |
| 29 } | |
| 30 | |
| 31 int charactersToInteger(const UChar* characters, size_t length, bool* ok = nullp
tr) | |
| 32 { | |
| 33 std::vector<char> buffer; | |
| 34 buffer.reserve(length + 1); | |
| 35 for (size_t i = 0; i < length; ++i) { | |
| 36 if (!isASCII(characters[i])) { | |
| 37 if (ok) | |
| 38 *ok = false; | |
| 39 return 0; | |
| 40 } | |
| 41 buffer.push_back(static_cast<char>(characters[i])); | |
| 42 } | |
| 43 buffer.push_back('\0'); | |
| 44 | |
| 45 char* endptr; | |
| 46 int result = std::strtol(buffer.data(), &endptr, 10); | |
| 47 if (ok) | |
| 48 *ok = !(*endptr); | |
| 49 return result; | |
| 50 } | |
| 51 | |
// U+FFFD REPLACEMENT CHARACTER; the converters in this file emit it in place
// of a code point they cannot represent.
const UChar replacementCharacter = 0xFFFD;
// 32-bit unsigned type used to hold a decoded code point.
using UChar32 = uint32_t;
| 54 | |
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by the
// lead byte |b0|, or 0 when |b0| is not a multi-byte lead byte (i.e. it is an
// ASCII byte, a continuation byte, or 0xF8 and above).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    const unsigned lead = static_cast<unsigned char>(b0);
    if (lead >= 0xC0 && lead <= 0xDF)
        return 2; // 110xxxxx
    if (lead >= 0xE0 && lead <= 0xEF)
        return 3; // 1110xxxx
    if (lead >= 0xF0 && lead <= 0xF7)
        return 4; // 11110xxx
    return 0;
}
| 67 | |
| 68 inline int inlineUTF8SequenceLength(char b0) | |
| 69 { | |
| 70 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); | |
| 71 } | |
| 72 | |
// Lead-byte marker bits for a UTF-8 sequence, indexed by the total number of
// bytes in the sequence. The value is OR-ed into the first byte after the
// code point's bits have been distributed over the bytes. Index 0 is unused
// and legal UTF-8 only ever uses indices 1 through 4.
static constexpr unsigned char firstByteMark[7] = {
    0x00, // unused
    0x00, // 1 byte
    0xC0, // 2 bytes
    0xE0, // 3 bytes
    0xF0, // 4 bytes
    0xF8, // 5 bytes (not legal UTF-8)
    0xFC, // 6 bytes (not legal UTF-8)
};
| 79 | |
// Outcome of a UTF-8 <-> UTF-16 conversion call.
enum ConversionResult {
    // The whole source range was converted.
    conversionOK,
    // The source ended in the middle of a character.
    sourceExhausted,
    // The target buffer has no room for the next character.
    targetExhausted,
    // The source contains an illegal or malformed sequence.
    sourceIllegal
};
| 86 | |
| 87 ConversionResult convertUTF16ToUTF8( | |
| 88 const UChar** sourceStart, const UChar* sourceEnd, | |
| 89 char** targetStart, char* targetEnd, bool strict) | |
| 90 { | |
| 91 ConversionResult result = conversionOK; | |
| 92 const UChar* source = *sourceStart; | |
| 93 char* target = *targetStart; | |
| 94 while (source < sourceEnd) { | |
| 95 UChar32 ch; | |
| 96 unsigned short bytesToWrite = 0; | |
| 97 const UChar32 byteMask = 0xBF; | |
| 98 const UChar32 byteMark = 0x80; | |
| 99 const UChar* oldSource = source; // In case we have to back up because o
f target overflow. | |
| 100 ch = static_cast<unsigned short>(*source++); | |
| 101 // If we have a surrogate pair, convert to UChar32 first. | |
| 102 if (ch >= 0xD800 && ch <= 0xDBFF) { | |
| 103 // If the 16 bits following the high surrogate are in the source buf
fer... | |
| 104 if (source < sourceEnd) { | |
| 105 UChar32 ch2 = static_cast<unsigned short>(*source); | |
| 106 // If it's a low surrogate, convert to UChar32. | |
| 107 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | |
| 108 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; | |
| 109 ++source; | |
| 110 } else if (strict) { // it's an unpaired high surrogate | |
| 111 --source; // return to the illegal value itself | |
| 112 result = sourceIllegal; | |
| 113 break; | |
| 114 } | |
| 115 } else { // We don't have the 16 bits following the high surrogate. | |
| 116 --source; // return to the high surrogate | |
| 117 result = sourceExhausted; | |
| 118 break; | |
| 119 } | |
| 120 } else if (strict) { | |
| 121 // UTF-16 surrogate values are illegal in UTF-32 | |
| 122 if (ch >= 0xDC00 && ch <= 0xDFFF) { | |
| 123 --source; // return to the illegal value itself | |
| 124 result = sourceIllegal; | |
| 125 break; | |
| 126 } | |
| 127 } | |
| 128 // Figure out how many bytes the result will require | |
| 129 if (ch < (UChar32)0x80) { | |
| 130 bytesToWrite = 1; | |
| 131 } else if (ch < (UChar32)0x800) { | |
| 132 bytesToWrite = 2; | |
| 133 } else if (ch < (UChar32)0x10000) { | |
| 134 bytesToWrite = 3; | |
| 135 } else if (ch < (UChar32)0x110000) { | |
| 136 bytesToWrite = 4; | |
| 137 } else { | |
| 138 bytesToWrite = 3; | |
| 139 ch = replacementCharacter; | |
| 140 } | |
| 141 | |
| 142 target += bytesToWrite; | |
| 143 if (target > targetEnd) { | |
| 144 source = oldSource; // Back up source pointer! | |
| 145 target -= bytesToWrite; | |
| 146 result = targetExhausted; | |
| 147 break; | |
| 148 } | |
| 149 switch (bytesToWrite) { // note: everything falls through. | |
| 150 case 4: | |
| 151 *--target = (char)((ch | byteMark) & byteMask); | |
| 152 ch >>= 6; | |
| 153 case 3: | |
| 154 *--target = (char)((ch | byteMark) & byteMask); | |
| 155 ch >>= 6; | |
| 156 case 2: | |
| 157 *--target = (char)((ch | byteMark) & byteMask); | |
| 158 ch >>= 6; | |
| 159 case 1: | |
| 160 *--target = (char)(ch | firstByteMark[bytesToWrite]); | |
| 161 } | |
| 162 target += bytesToWrite; | |
| 163 } | |
| 164 *sourceStart = source; | |
| 165 *targetStart = target; | |
| 166 return result; | |
| 167 } | |
| 168 | |
// Code point helpers. The original comments tag these as stable ICU macros
// (ICU 2.4 / 2.8); the names and semantics are kept unchanged.

/**
 * True when the 32-bit code point |c| is in the Basic Multilingual Plane
 * (U+0000..U+ffff).
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xffff)

/**
 * True when the 32-bit code point |c| is a supplementary code point
 * (U+10000..U+10ffff).
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) (static_cast<uint32_t>((c) - 0x10000) <= 0xfffff)

/**
 * True when the 32-bit code point |c| is a surrogate (U+d800..U+dfff).
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)

/**
 * Lead surrogate (0xd800..0xdbff) of a supplementary code point.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (static_cast<UChar>(((supplementary) >> 10) + 0xd7c0))

/**
 * Trail surrogate (0xdc00..0xdfff) of a supplementary code point.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (static_cast<UChar>(((supplementary) & 0x3ff) | 0xdc00))
| 210 | |
// Reports whether the |length| bytes at |source| form one well-formed UTF-8
// sequence. |length| must be the sequence length implied by the first byte;
// any length outside 1..4 is rejected, because the Unicode definition of
// UTF-8 stops at four-byte sequences.
//
// Rejected inputs include overlong encodings (lead bytes 0xC0/0xC1, 0xE0
// followed by < 0xA0, 0xF0 followed by < 0x90), encoded surrogates (0xED
// followed by > 0x9F), code points above U+10FFFF (0xF4 followed by > 0x8F,
// or a lead byte above 0xF4), and any trailing byte outside 0x80..0xBF.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Third and fourth bytes, when present, must be plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    if (length >= 2) {
        const unsigned char second = source[1];
        // The second byte must be a continuation byte as well. Checking only
        // the upper bound would let e.g. "ED 41 80" through, because the
        // lead-specific cases below do not all test the lower bound.
        if (second < 0x80 || second > 0xBF)
            return false;

        // Some lead bytes narrow the permitted range of the second byte.
        switch (lead) {
        case 0xE0: // below 0xA0 would be an overlong three-byte form
            if (second < 0xA0)
                return false;
            break;
        case 0xED: // above 0x9F would encode a surrogate
            if (second > 0x9F)
                return false;
            break;
        case 0xF0: // below 0x90 would be an overlong four-byte form
            if (second < 0x90)
                return false;
            break;
        case 0xF4: // above 0x8F would exceed U+10FFFF
            if (second > 0x8F)
                return false;
            break;
        default:
            break;
        }
    }

    // 0x80..0xBF are continuation bytes and 0xC0/0xC1 only start overlong
    // forms, so none of them may lead a sequence.
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    // Lead bytes above 0xF4 would encode values beyond U+10FFFF.
    return lead <= 0xF4;
}
| 263 | |
| 264 // Magic values subtracted from a buffer value during UTF8 conversion. | |
| 265 // This table contains as many values as there might be trailing bytes | |
| 266 // in a UTF-8 sequence. | |
| 267 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20
80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8
2082080UL) }; | |
| 268 | |
| 269 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) | |
| 270 { | |
| 271 UChar32 character = 0; | |
| 272 | |
| 273 // The cases all fall through. | |
| 274 switch (length) { | |
| 275 case 6: | |
| 276 character += static_cast<unsigned char>(*sequence++); | |
| 277 character <<= 6; | |
| 278 case 5: | |
| 279 character += static_cast<unsigned char>(*sequence++); | |
| 280 character <<= 6; | |
| 281 case 4: | |
| 282 character += static_cast<unsigned char>(*sequence++); | |
| 283 character <<= 6; | |
| 284 case 3: | |
| 285 character += static_cast<unsigned char>(*sequence++); | |
| 286 character <<= 6; | |
| 287 case 2: | |
| 288 character += static_cast<unsigned char>(*sequence++); | |
| 289 character <<= 6; | |
| 290 case 1: | |
| 291 character += static_cast<unsigned char>(*sequence++); | |
| 292 } | |
| 293 | |
| 294 return character - offsetsFromUTF8[length - 1]; | |
| 295 } | |
| 296 | |
// Converts UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code units
// written to [*targetStart, targetEnd).
//
// On return *sourceStart and *targetStart point just past the last byte
// consumed and the last code unit written. If |sourceAllASCII| is non-null it
// receives whether every code unit written so far was in the ASCII range; it
// is set on every return path, including errors.
//
// Returns:
//   conversionOK     - the whole source range was converted.
//   sourceExhausted  - the source ends inside a multi-byte sequence;
//                      *sourceStart points at the start of that sequence.
//   targetExhausted  - the next character does not fit; *sourceStart points
//                      at the start of that character.
//   sourceIllegal    - a malformed sequence was found (checked regardless of
//                      |strict|), or, when |strict| is set, a decoded value
//                      that is a surrogate or neither BMP nor supplementary;
//                      *sourceStart points at the offending sequence.
//
// With |strict| false those decoded values are replaced by
// replacementCharacter instead of being reported.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
{
    ConversionResult result = conversionOK;
    const char* source = *sourceStart;
    UChar* target = *targetStart;
    // Bitwise OR of every code unit written; only used to answer the
    // "was everything ASCII" question at the end.
    UChar orAllData = 0;
    while (source < sourceEnd) {
        // Length implied by the lead byte; 0 for a byte that cannot start a
        // sequence, which isLegalUTF8 rejects below.
        int utf8SequenceLength = inlineUTF8SequenceLength(*source);
        if (sourceEnd - source < utf8SequenceLength) {
            result = sourceExhausted;
            break;
        }
        // Do this check whether lenient or strict
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
            result = sourceIllegal;
            break;
        }

        // Advances |source| past the sequence; the error paths below rewind it.
        UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

        if (target >= targetEnd) {
            source -= utf8SequenceLength; // Back up source pointer!
            result = targetExhausted;
            break;
        }

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character)) {
                if (strict) {
                    source -= utf8SequenceLength; // return to the illegal value itself
                    result = sourceIllegal;
                    break;
                }
                *target++ = replacementCharacter;
                orAllData |= replacementCharacter;
            } else {
                *target++ = static_cast<UChar>(character); // normal case
                orAllData |= character;
            }
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // target is a character in range 0xFFFF - 0x10FFFF
            // A surrogate pair needs two code units of room.
            if (target + 1 >= targetEnd) {
                source -= utf8SequenceLength; // Back up source pointer!
                result = targetExhausted;
                break;
            }
            *target++ = U16_LEAD(character);
            *target++ = U16_TRAIL(character);
            // Surrogates are never ASCII, so force the "not ASCII" answer.
            orAllData = 0xffff;
        } else {
            if (strict) {
                source -= utf8SequenceLength; // return to the start
                result = sourceIllegal;
                break; // Bail out; shouldn't continue
            } else {
                *target++ = replacementCharacter;
                orAllData |= replacementCharacter;
            }
        }
    }
    *sourceStart = source;
    *targetStart = target;

    if (sourceAllASCII)
        *sourceAllASCII = !(orAllData & ~0x7f);

    return result;
}
| 368 | |
| 369 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec
k room is available. | |
| 370 static inline void putUTF8Triple(char*& buffer, UChar ch) | |
| 371 { | |
| 372 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | |
| 373 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | |
| 374 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | |
| 375 } | |
| 376 | |
| 377 } // namespace | |
| 378 | |
| 379 // static | |
| 380 String16 String16::fromInteger(int number) | |
| 381 { | |
| 382 const size_t kBufferSize = 50; | |
| 383 char buffer[kBufferSize]; | |
| 384 std::snprintf(buffer, kBufferSize, "%d", number); | |
| 385 return String16(buffer); | |
| 386 } | |
| 387 | |
| 388 // static | |
| 389 String16 String16::fromDouble(double number) | |
| 390 { | |
| 391 const size_t kBufferSize = 100; | |
| 392 char buffer[kBufferSize]; | |
| 393 std::snprintf(buffer, kBufferSize, "%f", number); | |
| 394 return String16(buffer); | |
| 395 } | |
| 396 | |
| 397 // static | |
| 398 String16 String16::fromDoublePrecision3(double number) | |
| 399 { | |
| 400 const size_t kBufferSize = 100; | |
| 401 char buffer[kBufferSize]; | |
| 402 std::snprintf(buffer, kBufferSize, "%.3g", number); | |
| 403 return String16(buffer); | |
| 404 } | |
| 405 | |
| 406 // static | |
| 407 String16 String16::fromDoublePrecision6(double number) | |
| 408 { | |
| 409 const size_t kBufferSize = 100; | |
| 410 char buffer[kBufferSize]; | |
| 411 std::snprintf(buffer, kBufferSize, "%.6g", number); | |
| 412 return String16(buffer); | |
| 413 } | |
| 414 | |
| 415 int String16::toInteger(bool* ok) const | |
| 416 { | |
| 417 return charactersToInteger(characters16(), length(), ok); | |
| 418 } | |
| 419 | |
| 420 String16 String16::stripWhiteSpace() const | |
| 421 { | |
| 422 if (!length()) | |
| 423 return String16(); | |
| 424 | |
| 425 unsigned start = 0; | |
| 426 unsigned end = length() - 1; | |
| 427 | |
| 428 // skip white space from start | |
| 429 while (start <= end && isSpaceOrNewLine(characters16()[start])) | |
| 430 ++start; | |
| 431 | |
| 432 // only white space | |
| 433 if (start > end) | |
| 434 return String16(); | |
| 435 | |
| 436 // skip white space from end | |
| 437 while (end && isSpaceOrNewLine(characters16()[end])) | |
| 438 --end; | |
| 439 | |
| 440 if (!start && end == length() - 1) | |
| 441 return *this; | |
| 442 return String16(characters16() + start, end + 1 - start); | |
| 443 } | |
| 444 | |
| 445 String16Builder::String16Builder() | |
| 446 { | |
| 447 } | |
| 448 | |
| 449 void String16Builder::append(const String16& s) | |
| 450 { | |
| 451 m_buffer.insert(m_buffer.end(), s.characters16(), s.characters16() + s.lengt
h()); | |
| 452 } | |
| 453 | |
| 454 void String16Builder::append(UChar c) | |
| 455 { | |
| 456 m_buffer.push_back(c); | |
| 457 } | |
| 458 | |
| 459 void String16Builder::append(char c) | |
| 460 { | |
| 461 UChar u = c; | |
| 462 m_buffer.push_back(u); | |
| 463 } | |
| 464 | |
| 465 void String16Builder::append(const UChar* characters, size_t length) | |
| 466 { | |
| 467 m_buffer.insert(m_buffer.end(), characters, characters + length); | |
| 468 } | |
| 469 | |
| 470 void String16Builder::append(const char* characters, size_t length) | |
| 471 { | |
| 472 m_buffer.insert(m_buffer.end(), characters, characters + length); | |
| 473 } | |
| 474 | |
| 475 String16 String16Builder::toString() | |
| 476 { | |
| 477 return String16(m_buffer.data(), m_buffer.size()); | |
| 478 } | |
| 479 | |
| 480 void String16Builder::reserveCapacity(size_t capacity) | |
| 481 { | |
| 482 m_buffer.reserve(capacity); | |
| 483 } | |
| 484 | |
| 485 String16 String16::fromUTF8(const char* stringStart, size_t length) | |
| 486 { | |
| 487 if (!stringStart || !length) | |
| 488 return String16(); | |
| 489 | |
| 490 std::vector<UChar> buffer(length); | |
| 491 UChar* bufferStart = buffer.data(); | |
| 492 | |
| 493 UChar* bufferCurrent = bufferStart; | |
| 494 const char* stringCurrent = stringStart; | |
| 495 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
bufferCurrent + buffer.size(), 0, true) != conversionOK) | |
| 496 return String16(); | |
| 497 | |
| 498 unsigned utf16Length = bufferCurrent - bufferStart; | |
| 499 return String16(bufferStart, utf16Length); | |
| 500 } | |
| 501 | |
| 502 std::string String16::utf8() const | |
| 503 { | |
| 504 unsigned length = this->length(); | |
| 505 | |
| 506 if (!length) | |
| 507 return std::string(""); | |
| 508 | |
| 509 // Allocate a buffer big enough to hold all the characters | |
| 510 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | |
| 511 // Optimization ideas, if we find this function is hot: | |
| 512 // * We could speculatively create a CStringBuffer to contain 'length' | |
| 513 // characters, and resize if necessary (i.e. if the buffer contains | |
| 514 // non-ascii characters). (Alternatively, scan the buffer first for | |
| 515 // ascii characters, so we know this will be sufficient). | |
| 516 // * We could allocate a CStringBuffer with an appropriate size to | |
| 517 // have a good chance of being able to write the string into the | |
| 518 // buffer without reallocing (say, 1.5 x length). | |
| 519 if (length > std::numeric_limits<unsigned>::max() / 3) | |
| 520 return std::string(); | |
| 521 std::vector<char> bufferVector(length * 3); | |
| 522 char* buffer = bufferVector.data(); | |
| 523 const UChar* characters = m_impl.data(); | |
| 524 | |
| 525 ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt
h, &buffer, buffer + bufferVector.size(), false); | |
| 526 DCHECK(result != targetExhausted); // (length * 3) should be sufficient for
any conversion | |
| 527 | |
| 528 // Only produced from strict conversion. | |
| 529 DCHECK(result != sourceIllegal); | |
| 530 | |
| 531 // Check for an unconverted high surrogate. | |
| 532 if (result == sourceExhausted) { | |
| 533 // This should be one unpaired high surrogate. Treat it the same | |
| 534 // was as an unpaired high surrogate would have been handled in | |
| 535 // the middle of a string with non-strict conversion - which is | |
| 536 // to say, simply encode it to UTF-8. | |
| 537 DCHECK((characters + 1) == (m_impl.data() + length)); | |
| 538 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); | |
| 539 // There should be room left, since one UChar hasn't been | |
| 540 // converted. | |
| 541 DCHECK((buffer + 3) <= (buffer + bufferVector.size())); | |
| 542 putUTF8Triple(buffer, *characters); | |
| 543 } | |
| 544 | |
| 545 return std::string(bufferVector.data(), buffer - bufferVector.data()); | |
| 546 } | |
| 547 | |
| 548 } // namespace v8_inspector | |
| OLD | NEW |