Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "platform/inspector_protocol/String16STL.h" | 5 #include "platform/inspector_protocol/String16STL.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <cctype> | 8 #include <cctype> |
| 9 #include <cstdio> | |
| 9 #include <functional> | 10 #include <functional> |
| 10 #include <locale> | 11 #include <locale> |
| 11 | 12 |
| 13 #define DCHECK(k) | |
|
dgozman
2016/06/08 15:56:56
Not needed, as you have it in Platform.h
| |
| 14 | |
| 12 namespace blink { | 15 namespace blink { |
| 13 namespace protocol { | 16 namespace protocol { |
| 14 | 17 |
| 15 const UChar replacementCharacter = 0xFFFD; | 18 const UChar replacementCharacter = 0xFFFD; |
| 16 | 19 |
| 17 template<typename CharType> inline bool isASCII(CharType c) | 20 template<typename CharType> inline bool isASCII(CharType c) |
| 18 { | 21 { |
| 19 return !(c & ~0x7F); | 22 return !(c & ~0x7F); |
| 20 } | 23 } |
| 21 | 24 |
| (...skipping 245 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 267 case 1: | 270 case 1: |
| 268 *--target = (char)(ch | firstByteMark[bytesToWrite]); | 271 *--target = (char)(ch | firstByteMark[bytesToWrite]); |
| 269 } | 272 } |
| 270 target += bytesToWrite; | 273 target += bytesToWrite; |
| 271 } | 274 } |
| 272 *sourceStart = source; | 275 *sourceStart = source; |
| 273 *targetStart = target; | 276 *targetStart = target; |
| 274 return result; | 277 return result; |
| 275 } | 278 } |
| 276 | 279 |
| 280 /** | |
| 281 * Is this code point a BMP code point (U+0000..U+ffff)? | |
| 282 * @param c 32-bit code point | |
| 283 * @return TRUE or FALSE | |
| 284 * @stable ICU 2.8 | |
| 285 */ | |
| 286 #define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff) | |
| 287 | |
| 288 /** | |
| 289 * Is this code point a supplementary code point (U+10000..U+10ffff)? | |
| 290 * @param c 32-bit code point | |
| 291 * @return TRUE or FALSE | |
| 292 * @stable ICU 2.8 | |
| 293 */ | |
| 294 #define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c) - 0x10000) <= 0xfffff) | |
| 295 | |
| 296 /** | |
| 297 * Is this code point a surrogate (U+d800..U+dfff)? | |
| 298 * @param c 32-bit code point | |
| 299 * @return TRUE or FALSE | |
| 300 * @stable ICU 2.4 | |
| 301 */ | |
| 302 #define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800) | |
| 303 | |
| 304 /** | |
| 305 * Get the lead surrogate (0xd800..0xdbff) for a | |
| 306 * supplementary code point (0x10000..0x10ffff). | |
| 307 * @param supplementary 32-bit code point (U+10000..U+10ffff) | |
| 308 * @return lead surrogate (U+d800..U+dbff) for supplementary | |
| 309 * @stable ICU 2.4 | |
| 310 */ | |
| 311 #define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0) | |
| 312 | |
| 313 /** | |
| 314 * Get the trail surrogate (0xdc00..0xdfff) for a | |
| 315 * supplementary code point (0x10000..0x10ffff). | |
| 316 * @param supplementary 32-bit code point (U+10000..U+10ffff) | |
| 317 * @return trail surrogate (U+dc00..U+dfff) for supplementary | |
| 318 * @stable ICU 2.4 | |
| 319 */ | |
| 320 #define U16_TRAIL(supplementary) (UChar)(((supplementary) & 0x3ff) | 0xdc00) | |
| 321 | |
| 322 // This must be called with the length pre-determined by the first byte. | |
| 323 // If presented with a length > 4, this returns false. The Unicode | |
| 324 // definition of UTF-8 goes up to 4-byte sequences. | |
| 325 static bool isLegalUTF8(const unsigned char* source, int length) | |
| 326 { | |
| 327 unsigned char a; | |
| 328 const unsigned char* srcptr = source + length; | |
| 329 switch (length) { | |
| 330 default: | |
| 331 return false; | |
| 332 // Everything else falls through when "true"... | |
| 333 case 4: | |
| 334 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) | |
| 335 return false; | |
| 336 case 3: | |
| 337 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) | |
| 338 return false; | |
| 339 case 2: | |
| 340 if ((a = (*--srcptr)) > 0xBF) | |
| 341 return false; | |
| 342 | |
| 343 // no fall-through in this inner switch | |
| 344 switch (*source) { | |
| 345 case 0xE0: | |
| 346 if (a < 0xA0) | |
| 347 return false; | |
| 348 break; | |
| 349 case 0xED: | |
| 350 if (a > 0x9F) | |
| 351 return false; | |
| 352 break; | |
| 353 case 0xF0: | |
| 354 if (a < 0x90) | |
| 355 return false; | |
| 356 break; | |
| 357 case 0xF4: | |
| 358 if (a > 0x8F) | |
| 359 return false; | |
| 360 break; | |
| 361 default: | |
| 362 if (a < 0x80) | |
| 363 return false; | |
| 364 } | |
| 365 | |
| 366 case 1: | |
| 367 if (*source >= 0x80 && *source < 0xC2) | |
| 368 return false; | |
| 369 } | |
| 370 if (*source > 0xF4) | |
| 371 return false; | |
| 372 return true; | |
| 373 } | |
| 374 | |
| 375 // Magic values subtracted from a buffer value during UTF8 conversion. | |
| 376 // This table contains as many values as there might be trailing bytes | |
| 377 // in a UTF-8 sequence. | |
| 378 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) }; | |
| 379 | |
| 380 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) | |
| 381 { | |
| 382 UChar32 character = 0; | |
| 383 | |
| 384 // The cases all fall through. | |
| 385 switch (length) { | |
| 386 case 6: | |
| 387 character += static_cast<unsigned char>(*sequence++); | |
| 388 character <<= 6; | |
| 389 case 5: | |
| 390 character += static_cast<unsigned char>(*sequence++); | |
| 391 character <<= 6; | |
| 392 case 4: | |
| 393 character += static_cast<unsigned char>(*sequence++); | |
| 394 character <<= 6; | |
| 395 case 3: | |
| 396 character += static_cast<unsigned char>(*sequence++); | |
| 397 character <<= 6; | |
| 398 case 2: | |
| 399 character += static_cast<unsigned char>(*sequence++); | |
| 400 character <<= 6; | |
| 401 case 1: | |
| 402 character += static_cast<unsigned char>(*sequence++); | |
| 403 } | |
| 404 | |
| 405 return character - offsetsFromUTF8[length - 1]; | |
| 406 } | |
| 407 | |
| 408 ConversionResult convertUTF8ToUTF16( | |
| 409 const char** sourceStart, const char* sourceEnd, | |
| 410 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) | |
| 411 { | |
| 412 ConversionResult result = conversionOK; | |
| 413 const char* source = *sourceStart; | |
| 414 UChar* target = *targetStart; | |
| 415 UChar orAllData = 0; | |
| 416 while (source < sourceEnd) { | |
| 417 int utf8SequenceLength = inlineUTF8SequenceLength(*source); | |
| 418 if (sourceEnd - source < utf8SequenceLength) { | |
| 419 result = sourceExhausted; | |
| 420 break; | |
| 421 } | |
| 422 // Do this check whether lenient or strict | |
| 423 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) { | |
| 424 result = sourceIllegal; | |
| 425 break; | |
| 426 } | |
| 427 | |
| 428 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); | |
| 429 | |
| 430 if (target >= targetEnd) { | |
| 431 source -= utf8SequenceLength; // Back up source pointer! | |
| 432 result = targetExhausted; | |
| 433 break; | |
| 434 } | |
| 435 | |
| 436 if (U_IS_BMP(character)) { | |
| 437 // UTF-16 surrogate values are illegal in UTF-32 | |
| 438 if (U_IS_SURROGATE(character)) { | |
| 439 if (strict) { | |
| 440 source -= utf8SequenceLength; // return to the illegal value itself | |
| 441 result = sourceIllegal; | |
| 442 break; | |
| 443 } | |
| 444 *target++ = replacementCharacter; | |
| 445 orAllData |= replacementCharacter; | |
| 446 } else { | |
| 447 *target++ = static_cast<UChar>(character); // normal case | |
| 448 orAllData |= character; | |
| 449 } | |
| 450 } else if (U_IS_SUPPLEMENTARY(character)) { | |
| 451 // target is a character in range 0xFFFF - 0x10FFFF | |
| 452 if (target + 1 >= targetEnd) { | |
| 453 source -= utf8SequenceLength; // Back up source pointer! | |
| 454 result = targetExhausted; | |
| 455 break; | |
| 456 } | |
| 457 *target++ = U16_LEAD(character); | |
| 458 *target++ = U16_TRAIL(character); | |
| 459 orAllData = 0xffff; | |
| 460 } else { | |
| 461 if (strict) { | |
| 462 source -= utf8SequenceLength; // return to the start | |
| 463 result = sourceIllegal; | |
| 464 break; // Bail out; shouldn't continue | |
| 465 } else { | |
| 466 *target++ = replacementCharacter; | |
| 467 orAllData |= replacementCharacter; | |
| 468 } | |
| 469 } | |
| 470 } | |
| 471 *sourceStart = source; | |
| 472 *targetStart = target; | |
| 473 | |
| 474 if (sourceAllASCII) | |
| 475 *sourceAllASCII = !(orAllData & ~0x7f); | |
| 476 | |
| 477 return result; | |
| 478 } | |
| 479 | |
| 277 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. | 480 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. |
| 278 static inline void putUTF8Triple(char*& buffer, UChar ch) | 481 static inline void putUTF8Triple(char*& buffer, UChar ch) |
| 279 { | 482 { |
| 280 DCHECK_GE(ch, 0x0800); | 483 DCHECK_GE(ch, 0x0800); |
| 281 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | 484 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); |
| 282 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | 485 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); |
| 283 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | 486 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); |
| 284 } | 487 } |
| 285 | 488 |
| 489 String16 String16::fromUTF8(const char* stringStart, size_t length) | |
| 490 { | |
| 491 if (!stringStart || !length) | |
| 492 return String16(); | |
| 493 | |
| 494 std::vector<UChar> buffer(length); | |
| 495 UChar* bufferStart = buffer.data(); | |
| 496 | |
| 497 UChar* bufferCurrent = bufferStart; | |
| 498 const char* stringCurrent = stringStart; | |
| 499 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, bufferCurrent + buffer.size(), 0, true) != conversionOK) | |
| 500 return String16(); | |
| 501 | |
| 502 unsigned utf16Length = bufferCurrent - bufferStart; | |
| 503 return String16(bufferStart, utf16Length); | |
| 504 } | |
| 505 | |
| 286 // trim from start | 506 // trim from start |
| 287 static inline wstring &ltrim(wstring &s) | 507 static inline wstring &ltrim(wstring &s) |
| 288 { | 508 { |
| 289 s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<int, int>(std::isspace)))); | 509 s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<int, int>(std::isspace)))); |
| 290 return s; | 510 return s; |
| 291 } | 511 } |
| 292 | 512 |
| 293 // trim from end | 513 // trim from end |
| 294 static inline wstring &rtrim(wstring &s) | 514 static inline wstring &rtrim(wstring &s) |
| 295 { | 515 { |
| 296 s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s.end()); | 516 s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s.end()); |
| 297 return s; | 517 return s; |
| 298 } | 518 } |
| 299 | 519 |
| 300 // trim from both ends | 520 // trim from both ends |
| 301 static inline wstring &trim(wstring &s) | 521 static inline wstring &trim(wstring &s) |
| 302 { | 522 { |
| 303 return ltrim(rtrim(s)); | 523 return ltrim(rtrim(s)); |
| 304 } | 524 } |
| 305 | 525 |
| 526 // static | |
| 527 std::string String16::intToString(int i) | |
| 528 { | |
| 529 char buffer[50]; | |
| 530 std::sprintf(buffer, "%d", i); | |
| 531 return std::string(buffer); | |
| 532 } | |
| 533 | |
| 534 // static | |
| 535 std::string String16::doubleToString(double d) | |
| 536 { | |
| 537 char buffer[100]; | |
| 538 std::sprintf(buffer, "%f", d); | |
| 539 return std::string(buffer); | |
| 540 } | |
| 541 | |
| 306 std::string String16::utf8() const | 542 std::string String16::utf8() const |
| 307 { | 543 { |
| 308 unsigned length = this->length(); | 544 unsigned length = this->length(); |
| 309 | 545 |
| 310 if (!length) | 546 if (!length) |
| 311 return std::string(""); | 547 return std::string(""); |
| 312 | 548 |
| 313 // Allocate a buffer big enough to hold all the characters | 549 // Allocate a buffer big enough to hold all the characters |
| 314 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | 550 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). |
| 315 // Optimization ideas, if we find this function is hot: | 551 // Optimization ideas, if we find this function is hot: |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 357 | 593 |
| 358 String16 String16::stripWhiteSpace() const | 594 String16 String16::stripWhiteSpace() const |
| 359 { | 595 { |
| 360 wstring result(m_impl); | 596 wstring result(m_impl); |
| 361 trim(result); | 597 trim(result); |
| 362 return result; | 598 return result; |
| 363 } | 599 } |
| 364 | 600 |
| 365 } // namespace protocol | 601 } // namespace protocol |
| 366 } // namespace blink | 602 } // namespace blink |
| OLD | NEW |