| OLD | NEW |
| (Empty) |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "platform/inspector_protocol/InspectorProtocol.h" | |
| 6 | |
| 7 #include <algorithm> | |
| 8 #include <cctype> | |
| 9 #include <cstdio> | |
| 10 #include <locale> | |
| 11 | |
| 12 namespace blink { | |
| 13 namespace protocol { | |
| 14 | |
| 15 const UChar replacementCharacter = 0xFFFD; | |
| 16 using UChar32 = uint32_t; | |
| 17 | |
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by the
// non-ASCII lead byte |b0|, or 0 when |b0| cannot start a sequence (it is a
// continuation byte, an ASCII byte, or has the 0xF8..0xFF bit pattern).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    const unsigned lead = static_cast<unsigned char>(b0);

    if ((lead >> 5) == 0x06) // 110xxxxx
        return 2;
    if ((lead >> 4) == 0x0E) // 1110xxxx
        return 3;
    if ((lead >> 3) == 0x1E) // 11110xxx
        return 4;
    return 0;
}
| 30 | |
| 31 inline int inlineUTF8SequenceLength(char b0) | |
| 32 { | |
| 33 return String16::isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); | |
| 34 } | |
| 35 | |
// Marker bits OR-ed into the first byte of an encoded UTF-8 sequence, indexed
// by the total number of bytes in that sequence (index 0 is unused padding).
// Legal UTF-8 never exceeds 4 bytes, so only entries 1..4 are used by
// convertUTF16ToUTF8(); entries 5 and 6 cover the historical longer forms.
static const unsigned char firstByteMark[7] = {
    0x00, // [0] unused
    0x00, // [1] 0xxxxxxx
    0xC0, // [2] 110xxxxx
    0xE0, // [3] 1110xxxx
    0xF0, // [4] 11110xxx
    0xF8, // [5] 111110xx
    0xFC  // [6] 1111110x
};
| 42 | |
// Outcome reported by the UTF-8 <-> UTF-16 converters in this file.
enum ConversionResult {
    // The whole source range was converted.
    conversionOK,
    // The source ended in the middle of a character.
    sourceExhausted,
    // The target buffer has no room for the next character.
    targetExhausted,
    // The source contains an illegal or malformed sequence.
    sourceIllegal
};
| 49 | |
| 50 ConversionResult convertUTF16ToUTF8( | |
| 51 const UChar** sourceStart, const UChar* sourceEnd, | |
| 52 char** targetStart, char* targetEnd, bool strict) | |
| 53 { | |
| 54 ConversionResult result = conversionOK; | |
| 55 const UChar* source = *sourceStart; | |
| 56 char* target = *targetStart; | |
| 57 while (source < sourceEnd) { | |
| 58 UChar32 ch; | |
| 59 unsigned short bytesToWrite = 0; | |
| 60 const UChar32 byteMask = 0xBF; | |
| 61 const UChar32 byteMark = 0x80; | |
| 62 const UChar* oldSource = source; // In case we have to back up because o
f target overflow. | |
| 63 ch = static_cast<unsigned short>(*source++); | |
| 64 // If we have a surrogate pair, convert to UChar32 first. | |
| 65 if (ch >= 0xD800 && ch <= 0xDBFF) { | |
| 66 // If the 16 bits following the high surrogate are in the source buf
fer... | |
| 67 if (source < sourceEnd) { | |
| 68 UChar32 ch2 = static_cast<unsigned short>(*source); | |
| 69 // If it's a low surrogate, convert to UChar32. | |
| 70 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | |
| 71 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; | |
| 72 ++source; | |
| 73 } else if (strict) { // it's an unpaired high surrogate | |
| 74 --source; // return to the illegal value itself | |
| 75 result = sourceIllegal; | |
| 76 break; | |
| 77 } | |
| 78 } else { // We don't have the 16 bits following the high surrogate. | |
| 79 --source; // return to the high surrogate | |
| 80 result = sourceExhausted; | |
| 81 break; | |
| 82 } | |
| 83 } else if (strict) { | |
| 84 // UTF-16 surrogate values are illegal in UTF-32 | |
| 85 if (ch >= 0xDC00 && ch <= 0xDFFF) { | |
| 86 --source; // return to the illegal value itself | |
| 87 result = sourceIllegal; | |
| 88 break; | |
| 89 } | |
| 90 } | |
| 91 // Figure out how many bytes the result will require | |
| 92 if (ch < (UChar32)0x80) { | |
| 93 bytesToWrite = 1; | |
| 94 } else if (ch < (UChar32)0x800) { | |
| 95 bytesToWrite = 2; | |
| 96 } else if (ch < (UChar32)0x10000) { | |
| 97 bytesToWrite = 3; | |
| 98 } else if (ch < (UChar32)0x110000) { | |
| 99 bytesToWrite = 4; | |
| 100 } else { | |
| 101 bytesToWrite = 3; | |
| 102 ch = replacementCharacter; | |
| 103 } | |
| 104 | |
| 105 target += bytesToWrite; | |
| 106 if (target > targetEnd) { | |
| 107 source = oldSource; // Back up source pointer! | |
| 108 target -= bytesToWrite; | |
| 109 result = targetExhausted; | |
| 110 break; | |
| 111 } | |
| 112 switch (bytesToWrite) { // note: everything falls through. | |
| 113 case 4: | |
| 114 *--target = (char)((ch | byteMark) & byteMask); | |
| 115 ch >>= 6; | |
| 116 case 3: | |
| 117 *--target = (char)((ch | byteMark) & byteMask); | |
| 118 ch >>= 6; | |
| 119 case 2: | |
| 120 *--target = (char)((ch | byteMark) & byteMask); | |
| 121 ch >>= 6; | |
| 122 case 1: | |
| 123 *--target = (char)(ch | firstByteMark[bytesToWrite]); | |
| 124 } | |
| 125 target += bytesToWrite; | |
| 126 } | |
| 127 *sourceStart = source; | |
| 128 *targetStart = target; | |
| 129 return result; | |
| 130 } | |
| 131 | |
/**
 * True when |c| lies in the Basic Multilingual Plane (U+0000..U+FFFF).
 * @param c 32-bit code point
 */
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xffff)

/**
 * True when |c| is a supplementary code point (U+10000..U+10FFFF).
 * The unsigned subtraction makes values below 0x10000 wrap out of range.
 * @param c 32-bit code point
 */
#define U_IS_SUPPLEMENTARY(c) (static_cast<uint32_t>((c) - 0x10000) <= 0xfffff)

/**
 * True when |c| is a surrogate code point (U+D800..U+DFFF).
 * @param c 32-bit code point
 */
#define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)

/**
 * Lead (high) surrogate, 0xD800..0xDBFF, for a supplementary code point.
 * @param supplementary 32-bit code point (U+10000..U+10FFFF)
 */
#define U16_LEAD(supplementary) static_cast<UChar>(((supplementary) >> 10) + 0xd7c0)

/**
 * Trail (low) surrogate, 0xDC00..0xDFFF, for a supplementary code point.
 * @param supplementary 32-bit code point (U+10000..U+10FFFF)
 */
#define U16_TRAIL(supplementary) static_cast<UChar>(((supplementary) & 0x3ff) | 0xdc00)
| 173 | |
// Checks whether the |length| bytes at |source| form a well-formed UTF-8
// sequence.
//
// |length| must be the sequence length implied by the first byte; this
// function does not cross-check the two. Any length outside 1..4 is rejected,
// because the Unicode definition of UTF-8 stops at 4-byte sequences.
//
// Rejected input includes: overlong encodings (lead bytes 0xC0/0xC1, and too
// small second bytes after 0xE0/0xF0), encoded surrogates (0xED followed by
// 0xA0..0xBF), values above U+10FFFF (0xF4 followed by 0x90..0xBF, or a lead
// byte above 0xF4), and any trailing byte outside 0x80..0xBF.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Third and fourth bytes, when present, are plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    if (length >= 2) {
        // The second byte is a continuation byte whose range is narrowed for
        // some lead bytes. Both bounds are always enforced: checking only the
        // narrowed bound would let e.g. ED 41 80 or F4 20 80 80 through.
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;
        switch (lead) {
        case 0xE0:
            lowest = 0xA0; // Exclude overlong 3-byte forms.
            break;
        case 0xED:
            highest = 0x9F; // Exclude U+D800..U+DFFF.
            break;
        case 0xF0:
            lowest = 0x90; // Exclude overlong 4-byte forms.
            break;
        case 0xF4:
            highest = 0x8F; // Exclude values above U+10FFFF.
            break;
        default:
            break;
        }
        if (source[1] < lowest || source[1] > highest)
            return false;
    }

    // 0x80..0xBF are continuation bytes and 0xC0/0xC1 only start overlong
    // forms, so none of them may begin a sequence.
    if (lead >= 0x80 && lead < 0xC2)
        return false;

    return lead <= 0xF4;
}
| 226 | |
| 227 // Magic values subtracted from a buffer value during UTF8 conversion. | |
| 228 // This table contains as many values as there might be trailing bytes | |
| 229 // in a UTF-8 sequence. | |
| 230 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20
80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8
2082080UL) }; | |
| 231 | |
| 232 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) | |
| 233 { | |
| 234 UChar32 character = 0; | |
| 235 | |
| 236 // The cases all fall through. | |
| 237 switch (length) { | |
| 238 case 6: | |
| 239 character += static_cast<unsigned char>(*sequence++); | |
| 240 character <<= 6; | |
| 241 case 5: | |
| 242 character += static_cast<unsigned char>(*sequence++); | |
| 243 character <<= 6; | |
| 244 case 4: | |
| 245 character += static_cast<unsigned char>(*sequence++); | |
| 246 character <<= 6; | |
| 247 case 3: | |
| 248 character += static_cast<unsigned char>(*sequence++); | |
| 249 character <<= 6; | |
| 250 case 2: | |
| 251 character += static_cast<unsigned char>(*sequence++); | |
| 252 character <<= 6; | |
| 253 case 1: | |
| 254 character += static_cast<unsigned char>(*sequence++); | |
| 255 } | |
| 256 | |
| 257 return character - offsetsFromUTF8[length - 1]; | |
| 258 } | |
| 259 | |
| 260 ConversionResult convertUTF8ToUTF16( | |
| 261 const char** sourceStart, const char* sourceEnd, | |
| 262 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) | |
| 263 { | |
| 264 ConversionResult result = conversionOK; | |
| 265 const char* source = *sourceStart; | |
| 266 UChar* target = *targetStart; | |
| 267 UChar orAllData = 0; | |
| 268 while (source < sourceEnd) { | |
| 269 int utf8SequenceLength = inlineUTF8SequenceLength(*source); | |
| 270 if (sourceEnd - source < utf8SequenceLength) { | |
| 271 result = sourceExhausted; | |
| 272 break; | |
| 273 } | |
| 274 // Do this check whether lenient or strict | |
| 275 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq
uenceLength)) { | |
| 276 result = sourceIllegal; | |
| 277 break; | |
| 278 } | |
| 279 | |
| 280 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); | |
| 281 | |
| 282 if (target >= targetEnd) { | |
| 283 source -= utf8SequenceLength; // Back up source pointer! | |
| 284 result = targetExhausted; | |
| 285 break; | |
| 286 } | |
| 287 | |
| 288 if (U_IS_BMP(character)) { | |
| 289 // UTF-16 surrogate values are illegal in UTF-32 | |
| 290 if (U_IS_SURROGATE(character)) { | |
| 291 if (strict) { | |
| 292 source -= utf8SequenceLength; // return to the illegal value
itself | |
| 293 result = sourceIllegal; | |
| 294 break; | |
| 295 } | |
| 296 *target++ = replacementCharacter; | |
| 297 orAllData |= replacementCharacter; | |
| 298 } else { | |
| 299 *target++ = static_cast<UChar>(character); // normal case | |
| 300 orAllData |= character; | |
| 301 } | |
| 302 } else if (U_IS_SUPPLEMENTARY(character)) { | |
| 303 // target is a character in range 0xFFFF - 0x10FFFF | |
| 304 if (target + 1 >= targetEnd) { | |
| 305 source -= utf8SequenceLength; // Back up source pointer! | |
| 306 result = targetExhausted; | |
| 307 break; | |
| 308 } | |
| 309 *target++ = U16_LEAD(character); | |
| 310 *target++ = U16_TRAIL(character); | |
| 311 orAllData = 0xffff; | |
| 312 } else { | |
| 313 if (strict) { | |
| 314 source -= utf8SequenceLength; // return to the start | |
| 315 result = sourceIllegal; | |
| 316 break; // Bail out; shouldn't continue | |
| 317 } else { | |
| 318 *target++ = replacementCharacter; | |
| 319 orAllData |= replacementCharacter; | |
| 320 } | |
| 321 } | |
| 322 } | |
| 323 *sourceStart = source; | |
| 324 *targetStart = target; | |
| 325 | |
| 326 if (sourceAllASCII) | |
| 327 *sourceAllASCII = !(orAllData & ~0x7f); | |
| 328 | |
| 329 return result; | |
| 330 } | |
| 331 | |
| 332 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec
k room is available. | |
| 333 static inline void putUTF8Triple(char*& buffer, UChar ch) | |
| 334 { | |
| 335 DCHECK_GE(ch, 0x0800); | |
| 336 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | |
| 337 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | |
| 338 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | |
| 339 } | |
| 340 | |
| 341 String16 String16::fromUTF8(const char* stringStart, size_t length) | |
| 342 { | |
| 343 if (!stringStart || !length) | |
| 344 return String16(); | |
| 345 | |
| 346 std::vector<UChar> buffer(length); | |
| 347 UChar* bufferStart = buffer.data(); | |
| 348 | |
| 349 UChar* bufferCurrent = bufferStart; | |
| 350 const char* stringCurrent = stringStart; | |
| 351 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
bufferCurrent + buffer.size(), 0, true) != conversionOK) | |
| 352 return String16(); | |
| 353 | |
| 354 unsigned utf16Length = bufferCurrent - bufferStart; | |
| 355 return String16(bufferStart, utf16Length); | |
| 356 } | |
| 357 | |
| 358 std::string String16::utf8() const | |
| 359 { | |
| 360 unsigned length = this->length(); | |
| 361 | |
| 362 if (!length) | |
| 363 return std::string(""); | |
| 364 | |
| 365 // Allocate a buffer big enough to hold all the characters | |
| 366 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | |
| 367 // Optimization ideas, if we find this function is hot: | |
| 368 // * We could speculatively create a CStringBuffer to contain 'length' | |
| 369 // characters, and resize if necessary (i.e. if the buffer contains | |
| 370 // non-ascii characters). (Alternatively, scan the buffer first for | |
| 371 // ascii characters, so we know this will be sufficient). | |
| 372 // * We could allocate a CStringBuffer with an appropriate size to | |
| 373 // have a good chance of being able to write the string into the | |
| 374 // buffer without reallocing (say, 1.5 x length). | |
| 375 if (length > std::numeric_limits<unsigned>::max() / 3) | |
| 376 return std::string(); | |
| 377 std::vector<char> bufferVector(length * 3); | |
| 378 char* buffer = bufferVector.data(); | |
| 379 const UChar* characters = m_impl.data(); | |
| 380 | |
| 381 ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt
h, &buffer, buffer + bufferVector.size(), false); | |
| 382 DCHECK(result != targetExhausted); // (length * 3) should be sufficient for
any conversion | |
| 383 | |
| 384 // Only produced from strict conversion. | |
| 385 DCHECK(result != sourceIllegal); | |
| 386 | |
| 387 // Check for an unconverted high surrogate. | |
| 388 if (result == sourceExhausted) { | |
| 389 // This should be one unpaired high surrogate. Treat it the same | |
| 390 // was as an unpaired high surrogate would have been handled in | |
| 391 // the middle of a string with non-strict conversion - which is | |
| 392 // to say, simply encode it to UTF-8. | |
| 393 DCHECK((characters + 1) == (m_impl.data() + length)); | |
| 394 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); | |
| 395 // There should be room left, since one UChar hasn't been | |
| 396 // converted. | |
| 397 DCHECK((buffer + 3) <= (buffer + bufferVector.size())); | |
| 398 putUTF8Triple(buffer, *characters); | |
| 399 } | |
| 400 | |
| 401 return std::string(bufferVector.data(), buffer - bufferVector.data()); | |
| 402 } | |
| 403 | |
| 404 } // namespace protocol | |
| 405 } // namespace blink | |
| OLD | NEW |