OLD | NEW |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/string_util.h" | 5 #include "base/string_util.h" |
6 | 6 |
7 #include <string.h> | 7 #include <string.h> |
8 #include <vector> | 8 #include <vector> |
9 | 9 |
10 #include "base/basictypes.h" | 10 #include "base/basictypes.h" |
11 #include "base/logging.h" | 11 #include "base/logging.h" |
12 #include "base/singleton.h" | 12 #include "base/singleton.h" |
13 #include "unicode/ucnv.h" | 13 #include "unicode/ucnv.h" |
14 #include "unicode/numfmt.h" | 14 #include "unicode/numfmt.h" |
15 #include "unicode/ustring.h" | 15 #include "unicode/ustring.h" |
16 | 16 |
17 namespace { | 17 namespace { |
18 | 18 |
| 19 inline bool IsValidCodepoint(uint32 code_point) { |
| 20 // Excludes the surrogate code points ([0xD800, 0xDFFF]) and |
| 21 // codepoints larger than 0x10FFFF (the highest codepoint allowed). |
| 22 // Non-characters and unassigned codepoints are allowed. |
| 23 return code_point < 0xD800u || |
| 24 (code_point >= 0xE000u && code_point <= 0x10FFFFu); |
| 25 } |
| 26 |
19 // ReadUnicodeCharacter -------------------------------------------------------- | 27 // ReadUnicodeCharacter -------------------------------------------------------- |
20 | 28 |
21 // Reads a UTF-8 stream, placing the next code point into the given output | 29 // Reads a UTF-8 stream, placing the next code point into the given output |
22 // |*code_point|. |src| represents the entire string to read, and |*char_index| | 30 // |*code_point|. |src| represents the entire string to read, and |*char_index| |
23 // is the character offset within the string to start reading at. |*char_index| | 31 // is the character offset within the string to start reading at. |*char_index| |
24 // will be updated to index the last character read, such that incrementing it | 32 // will be updated to index the last character read, such that incrementing it |
25 // (as in a for loop) will take the reader to the next character. | 33 // (as in a for loop) will take the reader to the next character. |
26 // | 34 // |
27 // Returns true on success. On false, |*code_point| will be invalid. | 35 // Returns true on success. On false, |*code_point| will be invalid. |
28 bool ReadUnicodeCharacter(const char* src, int32 src_len, | 36 bool ReadUnicodeCharacter(const char* src, int32 src_len, |
29 int32* char_index, uint32* code_point_out) { | 37 int32* char_index, uint32* code_point_out) { |
30 // U8_NEXT expects to be able to use -1 to signal an error, so we must | 38 // U8_NEXT expects to be able to use -1 to signal an error, so we must |
31 // use a signed type for code_point. But this function returns false | 39 // use a signed type for code_point. But this function returns false |
32 // on error anyway, so code_point_out is unsigned. | 40 // on error anyway, so code_point_out is unsigned. |
33 int32 code_point; | 41 int32 code_point; |
34 U8_NEXT(src, *char_index, src_len, code_point); | 42 U8_NEXT(src, *char_index, src_len, code_point); |
35 *code_point_out = static_cast<uint32>(code_point); | 43 *code_point_out = static_cast<uint32>(code_point); |
36 | 44 |
37 // The ICU macro above moves to the next char, we want to point to the last | 45 // The ICU macro above moves to the next char, we want to point to the last |
38 // char consumed. | 46 // char consumed. |
39 (*char_index)--; | 47 (*char_index)--; |
40 | 48 |
41 // Validate the decoded value. | 49 // Validate the decoded value. |
42 return U_IS_UNICODE_CHAR(code_point); | 50 return IsValidCodepoint(code_point); |
43 } | 51 } |
44 | 52 |
45 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. | 53 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. |
46 bool ReadUnicodeCharacter(const char16* src, int32 src_len, | 54 bool ReadUnicodeCharacter(const char16* src, int32 src_len, |
47 int32* char_index, uint32* code_point) { | 55 int32* char_index, uint32* code_point) { |
48 if (U16_IS_SURROGATE(src[*char_index])) { | 56 if (U16_IS_SURROGATE(src[*char_index])) { |
49 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || | 57 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || |
50 *char_index + 1 >= src_len || | 58 *char_index + 1 >= src_len || |
51 !U16_IS_TRAIL(src[*char_index + 1])) { | 59 !U16_IS_TRAIL(src[*char_index + 1])) { |
52 // Invalid surrogate pair. | 60 // Invalid surrogate pair. |
53 return false; | 61 return false; |
54 } | 62 } |
55 | 63 |
56 // Valid surrogate pair. | 64 // Valid surrogate pair. |
57 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], | 65 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], |
58 src[*char_index + 1]); | 66 src[*char_index + 1]); |
59 (*char_index)++; | 67 (*char_index)++; |
60 } else { | 68 } else { |
61 // Not a surrogate, just one 16-bit word. | 69 // Not a surrogate, just one 16-bit word. |
62 *code_point = src[*char_index]; | 70 *code_point = src[*char_index]; |
63 } | 71 } |
64 | 72 |
65 return U_IS_UNICODE_CHAR(*code_point); | 73 return IsValidCodepoint(*code_point); |
66 } | 74 } |
67 | 75 |
#if defined(WCHAR_T_IS_UTF32)
// Reads a UTF-32 character. The usage is the same as the 8-bit version above.
// Each wchar_t holds a whole code point, so |*char_index| is left unchanged
// and |src_len| is not consulted.
bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
                          int32* char_index, uint32* code_point) {
  // The source is already 32 bits wide; no decoding is required.
  const uint32 value = static_cast<uint32>(src[*char_index]);
  *code_point = value;

  // Only the range of the value needs to be checked.
  return IsValidCodepoint(value);
}
#endif  // defined(WCHAR_T_IS_UTF32)
79 | 87 |
80 // WriteUnicodeCharacter ------------------------------------------------------- | 88 // WriteUnicodeCharacter ------------------------------------------------------- |
81 | 89 |
82 // Appends a UTF-8 character to the given 8-bit string. | 90 // Appends a UTF-8 character to the given 8-bit string. |
83 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { | 91 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { |
84 if (code_point <= 0x7f) { | 92 if (code_point <= 0x7f) { |
85 // Fast path the common case of one byte. | 93 // Fast path the common case of one byte. |
86 output->push_back(code_point); | 94 output->push_back(code_point); |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
127 // the result. | 135 // the result. |
128 template<typename SRC_CHAR, typename DEST_STRING> | 136 template<typename SRC_CHAR, typename DEST_STRING> |
129 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { | 137 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { |
130 output->clear(); | 138 output->clear(); |
131 | 139 |
132 // ICU requires 32-bit numbers. | 140 // ICU requires 32-bit numbers. |
133 bool success = true; | 141 bool success = true; |
134 int32 src_len32 = static_cast<int32>(src_len); | 142 int32 src_len32 = static_cast<int32>(src_len); |
135 for (int32 i = 0; i < src_len32; i++) { | 143 for (int32 i = 0; i < src_len32; i++) { |
136 uint32 code_point; | 144 uint32 code_point; |
137 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) | 145 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { |
138 WriteUnicodeCharacter(code_point, output); | 146 WriteUnicodeCharacter(code_point, output); |
139 else | 147 } else { |
| 148 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) |
| 149 // in place of an invalid codepoint. |
140 success = false; | 150 success = false; |
| 151 } |
141 } | 152 } |
142 return success; | 153 return success; |
143 } | 154 } |
144 | 155 |
145 | 156 |
146 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount | 157 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount |
147 // of space in the given string. We also assume that the input character types | 158 // of space in the given string. We also assume that the input character types |
148 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume | 159 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume |
149 // the string length is greater than zero. | 160 // the string length is greater than zero. |
150 template<typename CHAR> | 161 template<typename CHAR> |
(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
421 const char* codepage_name, | 432 const char* codepage_name, |
422 OnStringUtilConversionError::Type on_error, | 433 OnStringUtilConversionError::Type on_error, |
423 std::wstring* wide) { | 434 std::wstring* wide) { |
424 wide->clear(); | 435 wide->clear(); |
425 | 436 |
426 UErrorCode status = U_ZERO_ERROR; | 437 UErrorCode status = U_ZERO_ERROR; |
427 UConverter* converter = ucnv_open(codepage_name, &status); | 438 UConverter* converter = ucnv_open(codepage_name, &status); |
428 if (!U_SUCCESS(status)) | 439 if (!U_SUCCESS(status)) |
429 return false; | 440 return false; |
430 | 441 |
431 // The worst case is all the input characters are non-BMP (32-bit) ones. | 442 // Even in the worst case, the maximum length in 2-byte units of UTF-16 |
432 size_t uchar_max_length = encoded.length() * 2 + 1; | 443 // output would be at most the same as the number of bytes in input. There |
| 444 // is no single-byte encoding in which a character is mapped to a |
| 445 // non-BMP character requiring two 2-byte units. |
| 446 // |
| 447 // Moreover, non-BMP characters in legacy multibyte encodings |
| 448 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are |
| 449 // BOCU and SCSU, but we don't care about them. |
| 450 size_t uchar_max_length = encoded.length() + 1; |
433 | 451 |
434 UChar* uchar_dst; | 452 UChar* uchar_dst; |
435 #if defined(WCHAR_T_IS_UTF16) | 453 #if defined(WCHAR_T_IS_UTF16) |
436 uchar_dst = WriteInto(wide, uchar_max_length); | 454 uchar_dst = WriteInto(wide, uchar_max_length); |
437 #elif defined(WCHAR_T_IS_UTF32) | 455 #elif defined(WCHAR_T_IS_UTF32) |
438 // When wchar_t is wider than UChar (16 bits), convert into a temporary | 456 // When wchar_t is wider than UChar (16 bits), convert into a temporary |
439 // UChar* buffer. | 457 // UChar* buffer. |
440 std::vector<UChar> wide_uchar(uchar_max_length); | 458 std::vector<UChar> wide_uchar(uchar_max_length); |
441 uchar_dst = &wide_uchar[0]; | 459 uchar_dst = &wide_uchar[0]; |
442 #endif // defined(WCHAR_T_IS_UTF32) | 460 #endif // defined(WCHAR_T_IS_UTF32) |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
536 // This implementation is not so fast since it converts the text encoding | 554 // This implementation is not so fast since it converts the text encoding |
537 // twice. Please feel free to file a bug if this function hurts the | 555 // twice. Please feel free to file a bug if this function hurts the |
538 // performance of Chrome. | 556 // performance of Chrome. |
539 DCHECK(IsStringUTF8(input)); | 557 DCHECK(IsStringUTF8(input)); |
540 std::wstring input_wide = UTF8ToWide(input); | 558 std::wstring input_wide = UTF8ToWide(input); |
541 std::wstring output_wide; | 559 std::wstring output_wide; |
542 TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide); | 560 TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide); |
543 *output = WideToUTF8(output_wide); | 561 *output = WideToUTF8(output_wide); |
544 return result; | 562 return result; |
545 } | 563 } |
OLD | NEW |