| OLD | NEW |
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "base/string_util.h" | 5 #include "base/string_util.h" |
| 6 | 6 |
| 7 #include <string.h> | 7 #include <string.h> |
| 8 #include <vector> | 8 #include <vector> |
| 9 | 9 |
| 10 #include "base/basictypes.h" | 10 #include "base/basictypes.h" |
| 11 #include "base/logging.h" | 11 #include "base/logging.h" |
| 12 #include "base/singleton.h" | 12 #include "base/singleton.h" |
| 13 #include "unicode/ucnv.h" | 13 #include "unicode/ucnv.h" |
| 14 #include "unicode/numfmt.h" | 14 #include "unicode/numfmt.h" |
| 15 #include "unicode/ustring.h" | 15 #include "unicode/ustring.h" |
| 16 | 16 |
| 17 namespace { | 17 namespace { |
| 18 | 18 |
| 19 inline bool IsValidCodepoint(uint32 code_point) { |
| 20 // Excludes the surrogate code points ([0xD800, 0xDFFF]) and |
| 21 // codepoints larger than 0x10FFFF (the highest codepoint allowed). |
| 22 // Non-characters and unassigned codepoints are allowed. |
| 23 return code_point < 0xD800u || |
| 24 (code_point >= 0xE000u && code_point <= 0x10FFFFu); |
| 25 } |
| 26 |
| 19 // ReadUnicodeCharacter -------------------------------------------------------- | 27 // ReadUnicodeCharacter -------------------------------------------------------- |
| 20 | 28 |
| 21 // Reads a UTF-8 stream, placing the next code point into the given output | 29 // Reads a UTF-8 stream, placing the next code point into the given output |
| 22 // |*code_point|. |src| represents the entire string to read, and |*char_index| | 30 // |*code_point|. |src| represents the entire string to read, and |*char_index| |
| 23 // is the character offset within the string to start reading at. |*char_index| | 31 // is the character offset within the string to start reading at. |*char_index| |
| 24 // will be updated to index the last character read, such that incrementing it | 32 // will be updated to index the last character read, such that incrementing it |
| 25 // (as in a for loop) will take the reader to the next character. | 33 // (as in a for loop) will take the reader to the next character. |
| 26 // | 34 // |
| 27 // Returns true on success. On false, |*code_point| will be invalid. | 35 // Returns true on success. On false, |*code_point| will be invalid. |
| 28 bool ReadUnicodeCharacter(const char* src, int32 src_len, | 36 bool ReadUnicodeCharacter(const char* src, int32 src_len, |
| 29 int32* char_index, uint32* code_point_out) { | 37 int32* char_index, uint32* code_point_out) { |
| 30 // U8_NEXT expects to be able to use -1 to signal an error, so we must | 38 // U8_NEXT expects to be able to use -1 to signal an error, so we must |
| 31 // use a signed type for code_point. But this function returns false | 39 // use a signed type for code_point. But this function returns false |
| 32 // on error anyway, so code_point_out is unsigned. | 40 // on error anyway, so code_point_out is unsigned. |
| 33 int32 code_point; | 41 int32 code_point; |
| 34 U8_NEXT(src, *char_index, src_len, code_point); | 42 U8_NEXT(src, *char_index, src_len, code_point); |
| 35 *code_point_out = static_cast<uint32>(code_point); | 43 *code_point_out = static_cast<uint32>(code_point); |
| 36 | 44 |
| 37 // The ICU macro above moves to the next char, we want to point to the last | 45 // The ICU macro above moves to the next char, we want to point to the last |
| 38 // char consumed. | 46 // char consumed. |
| 39 (*char_index)--; | 47 (*char_index)--; |
| 40 | 48 |
| 41 // Validate the decoded value. | 49 // Validate the decoded value. |
| 42 return U_IS_UNICODE_CHAR(code_point); | 50 return IsValidCodepoint(code_point); |
| 43 } | 51 } |
| 44 | 52 |
| 45 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. | 53 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. |
| 46 bool ReadUnicodeCharacter(const char16* src, int32 src_len, | 54 bool ReadUnicodeCharacter(const char16* src, int32 src_len, |
| 47 int32* char_index, uint32* code_point) { | 55 int32* char_index, uint32* code_point) { |
| 48 if (U16_IS_SURROGATE(src[*char_index])) { | 56 if (U16_IS_SURROGATE(src[*char_index])) { |
| 49 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || | 57 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || |
| 50 *char_index + 1 >= src_len || | 58 *char_index + 1 >= src_len || |
| 51 !U16_IS_TRAIL(src[*char_index + 1])) { | 59 !U16_IS_TRAIL(src[*char_index + 1])) { |
| 52 // Invalid surrogate pair. | 60 // Invalid surrogate pair. |
| 53 return false; | 61 return false; |
| 54 } | 62 } |
| 55 | 63 |
| 56 // Valid surrogate pair. | 64 // Valid surrogate pair. |
| 57 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], | 65 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], |
| 58 src[*char_index + 1]); | 66 src[*char_index + 1]); |
| 59 (*char_index)++; | 67 (*char_index)++; |
| 60 } else { | 68 } else { |
| 61 // Not a surrogate, just one 16-bit word. | 69 // Not a surrogate, just one 16-bit word. |
| 62 *code_point = src[*char_index]; | 70 *code_point = src[*char_index]; |
| 63 } | 71 } |
| 64 | 72 |
| 65 return U_IS_UNICODE_CHAR(*code_point); | 73 return IsValidCodepoint(*code_point); |
| 66 } | 74 } |
| 67 | 75 |
| 68 #if defined(WCHAR_T_IS_UTF32) | 76 #if defined(WCHAR_T_IS_UTF32) |
| 69 // Reads UTF-32 character. The usage is the same as the 8-bit version above. | 77 // Reads UTF-32 character. The usage is the same as the 8-bit version above. |
| 70 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, | 78 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, |
| 71 int32* char_index, uint32* code_point) { | 79 int32* char_index, uint32* code_point) { |
| 72 // Conversion is easy since the source is 32-bit. | 80 // Conversion is easy since the source is 32-bit. |
| 73 *code_point = src[*char_index]; | 81 *code_point = src[*char_index]; |
| 74 | 82 |
| 75 // Validate the value. | 83 // Validate the value. |
| 76 return U_IS_UNICODE_CHAR(*code_point); | 84 return IsValidCodepoint(*code_point); |
| 77 } | 85 } |
| 78 #endif // defined(WCHAR_T_IS_UTF32) | 86 #endif // defined(WCHAR_T_IS_UTF32) |
| 79 | 87 |
| 80 // WriteUnicodeCharacter ------------------------------------------------------- | 88 // WriteUnicodeCharacter ------------------------------------------------------- |
| 81 | 89 |
| 82 // Appends a UTF-8 character to the given 8-bit string. | 90 // Appends a UTF-8 character to the given 8-bit string. |
| 83 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { | 91 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { |
| 84 if (code_point <= 0x7f) { | 92 if (code_point <= 0x7f) { |
| 85 // Fast path the common case of one byte. | 93 // Fast path the common case of one byte. |
| 86 output->push_back(code_point); | 94 output->push_back(code_point); |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 127 // the result. | 135 // the result. |
| 128 template<typename SRC_CHAR, typename DEST_STRING> | 136 template<typename SRC_CHAR, typename DEST_STRING> |
| 129 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { | 137 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { |
| 130 output->clear(); | 138 output->clear(); |
| 131 | 139 |
| 132 // ICU requires 32-bit numbers. | 140 // ICU requires 32-bit numbers. |
| 133 bool success = true; | 141 bool success = true; |
| 134 int32 src_len32 = static_cast<int32>(src_len); | 142 int32 src_len32 = static_cast<int32>(src_len); |
| 135 for (int32 i = 0; i < src_len32; i++) { | 143 for (int32 i = 0; i < src_len32; i++) { |
| 136 uint32 code_point; | 144 uint32 code_point; |
| 137 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) | 145 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { |
| 138 WriteUnicodeCharacter(code_point, output); | 146 WriteUnicodeCharacter(code_point, output); |
| 139 else | 147 } else { |
| 148 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) |
| 149 // in place of an invalid codepoint. |
| 140 success = false; | 150 success = false; |
| 151 } |
| 141 } | 152 } |
| 142 return success; | 153 return success; |
| 143 } | 154 } |
| 144 | 155 |
| 145 | 156 |
| 146 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount | 157 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount |
| 147 // of space in the given string. We also assume that the input character types | 158 // of space in the given string. We also assume that the input character types |
| 148 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume | 159 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume |
| 149 // the string length is greater than zero. | 160 // the string length is greater than zero. |
| 150 template<typename CHAR> | 161 template<typename CHAR> |
| (...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 421 const char* codepage_name, | 432 const char* codepage_name, |
| 422 OnStringUtilConversionError::Type on_error, | 433 OnStringUtilConversionError::Type on_error, |
| 423 std::wstring* wide) { | 434 std::wstring* wide) { |
| 424 wide->clear(); | 435 wide->clear(); |
| 425 | 436 |
| 426 UErrorCode status = U_ZERO_ERROR; | 437 UErrorCode status = U_ZERO_ERROR; |
| 427 UConverter* converter = ucnv_open(codepage_name, &status); | 438 UConverter* converter = ucnv_open(codepage_name, &status); |
| 428 if (!U_SUCCESS(status)) | 439 if (!U_SUCCESS(status)) |
| 429 return false; | 440 return false; |
| 430 | 441 |
| 431 // The worst case is all the input characters are non-BMP (32-bit) ones. | 442 // Even in the worst case, the maximum length in 2-byte units of UTF-16 |
| 432 size_t uchar_max_length = encoded.length() * 2 + 1; | 443 // output would be at most the same as the number of bytes in input. There |
| 444 // is no single-byte encoding in which a character is mapped to a |
| 445 // non-BMP character requiring two 2-byte units. |
| 446 // |
| 447 // Moreover, non-BMP characters in legacy multibyte encodings |
| 448 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are |
| 449 // BOCU and SCSU, but we don't care about them. |
| 450 size_t uchar_max_length = encoded.length() + 1; |
| 433 | 451 |
| 434 UChar* uchar_dst; | 452 UChar* uchar_dst; |
| 435 #if defined(WCHAR_T_IS_UTF16) | 453 #if defined(WCHAR_T_IS_UTF16) |
| 436 uchar_dst = WriteInto(wide, uchar_max_length); | 454 uchar_dst = WriteInto(wide, uchar_max_length); |
| 437 #elif defined(WCHAR_T_IS_UTF32) | 455 #elif defined(WCHAR_T_IS_UTF32) |
| 438 // When wchar_t is wider than UChar (16 bits), convert into a temporary | 456 // When wchar_t is wider than UChar (16 bits), convert into a temporary |
| 439 // UChar* buffer. | 457 // UChar* buffer. |
| 440 std::vector<UChar> wide_uchar(uchar_max_length); | 458 std::vector<UChar> wide_uchar(uchar_max_length); |
| 441 uchar_dst = &wide_uchar[0]; | 459 uchar_dst = &wide_uchar[0]; |
| 442 #endif // defined(WCHAR_T_IS_UTF32) | 460 #endif // defined(WCHAR_T_IS_UTF32) |
| (...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 536 // This implementation is not so fast since it converts the text encoding | 554 // This implementation is not so fast since it converts the text encoding |
| 537 // twice. Please feel free to file a bug if this function hurts the | 555 // twice. Please feel free to file a bug if this function hurts the |
| 538 // performance of Chrome. | 556 // performance of Chrome. |
| 539 DCHECK(IsStringUTF8(input)); | 557 DCHECK(IsStringUTF8(input)); |
| 540 std::wstring input_wide = UTF8ToWide(input); | 558 std::wstring input_wide = UTF8ToWide(input); |
| 541 std::wstring output_wide; | 559 std::wstring output_wide; |
| 542 TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide); | 560 TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide); |
| 543 *output = WideToUTF8(output_wide); | 561 *output = WideToUTF8(output_wide); |
| 544 return result; | 562 return result; |
| 545 } | 563 } |
| OLD | NEW |