Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(711)

Side by Side Diff: base/string_util_icu.cc

Issue 147038: Pass through non-character codepoints in UTF-8,16,32 and Wide conversion func... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/string_util.h" 5 #include "base/string_util.h"
6 6
7 #include <string.h> 7 #include <string.h>
8 #include <vector> 8 #include <vector>
9 9
10 #include "base/basictypes.h" 10 #include "base/basictypes.h"
11 #include "base/logging.h" 11 #include "base/logging.h"
12 #include "base/singleton.h" 12 #include "base/singleton.h"
13 #include "unicode/ucnv.h" 13 #include "unicode/ucnv.h"
14 #include "unicode/numfmt.h" 14 #include "unicode/numfmt.h"
15 #include "unicode/ustring.h" 15 #include "unicode/ustring.h"
16 16
17 namespace { 17 namespace {
18 18
19 inline bool IsValidCodepoint(uint32 code_point) {
20 // Excludes the surrogate code points ([0xD800, 0xDFFF]) and
21 // codepoints larger than 0x10FFFF (the highest codepoint allowed).
22 // Non-characters and unassigned codepoints are allowed.
23 return code_point < 0xD800u ||
24 (code_point >= 0xE000u && code_point <= 0x10FFFFu);
25 }
26
19 // ReadUnicodeCharacter -------------------------------------------------------- 27 // ReadUnicodeCharacter --------------------------------------------------------
20 28
21 // Reads a UTF-8 stream, placing the next code point into the given output 29 // Reads a UTF-8 stream, placing the next code point into the given output
22 // |*code_point|. |src| represents the entire string to read, and |*char_index| 30 // |*code_point|. |src| represents the entire string to read, and |*char_index|
23 // is the character offset within the string to start reading at. |*char_index| 31 // is the character offset within the string to start reading at. |*char_index|
24 // will be updated to index the last character read, such that incrementing it 32 // will be updated to index the last character read, such that incrementing it
25 // (as in a for loop) will take the reader to the next character. 33 // (as in a for loop) will take the reader to the next character.
26 // 34 //
27 // Returns true on success. On failure, |*code_point| will be invalid. 35 // Returns true on success. On failure, |*code_point| will be invalid.
28 bool ReadUnicodeCharacter(const char* src, int32 src_len, 36 bool ReadUnicodeCharacter(const char* src, int32 src_len,
29 int32* char_index, uint32* code_point_out) { 37 int32* char_index, uint32* code_point_out) {
30 // U8_NEXT expects to be able to use -1 to signal an error, so we must 38 // U8_NEXT expects to be able to use -1 to signal an error, so we must
31 // use a signed type for code_point. But this function returns false 39 // use a signed type for code_point. But this function returns false
32 // on error anyway, so code_point_out is unsigned. 40 // on error anyway, so code_point_out is unsigned.
33 int32 code_point; 41 int32 code_point;
34 U8_NEXT(src, *char_index, src_len, code_point); 42 U8_NEXT(src, *char_index, src_len, code_point);
35 *code_point_out = static_cast<uint32>(code_point); 43 *code_point_out = static_cast<uint32>(code_point);
36 44
37 // The ICU macro above moves to the next char; we want to point to the last 45 // The ICU macro above moves to the next char; we want to point to the last
38 // char consumed. 46 // char consumed.
39 (*char_index)--; 47 (*char_index)--;
40 48
41 // Validate the decoded value. 49 // Validate the decoded value.
42 return U_IS_UNICODE_CHAR(code_point); 50 return IsValidCodepoint(code_point);
43 } 51 }
44 52
45 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. 53 // Reads a UTF-16 character. The usage is the same as the 8-bit version above.
46 bool ReadUnicodeCharacter(const char16* src, int32 src_len, 54 bool ReadUnicodeCharacter(const char16* src, int32 src_len,
47 int32* char_index, uint32* code_point) { 55 int32* char_index, uint32* code_point) {
48 if (U16_IS_SURROGATE(src[*char_index])) { 56 if (U16_IS_SURROGATE(src[*char_index])) {
49 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || 57 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) ||
50 *char_index + 1 >= src_len || 58 *char_index + 1 >= src_len ||
51 !U16_IS_TRAIL(src[*char_index + 1])) { 59 !U16_IS_TRAIL(src[*char_index + 1])) {
52 // Invalid surrogate pair. 60 // Invalid surrogate pair.
53 return false; 61 return false;
54 } 62 }
55 63
56 // Valid surrogate pair. 64 // Valid surrogate pair.
57 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], 65 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index],
58 src[*char_index + 1]); 66 src[*char_index + 1]);
59 (*char_index)++; 67 (*char_index)++;
60 } else { 68 } else {
61 // Not a surrogate, just one 16-bit word. 69 // Not a surrogate, just one 16-bit word.
62 *code_point = src[*char_index]; 70 *code_point = src[*char_index];
63 } 71 }
64 72
65 return U_IS_UNICODE_CHAR(*code_point); 73 return IsValidCodepoint(*code_point);
66 } 74 }
67 75
68 #if defined(WCHAR_T_IS_UTF32) 76 #if defined(WCHAR_T_IS_UTF32)
69 // Reads UTF-32 character. The usage is the same as the 8-bit version above. 77 // Reads UTF-32 character. The usage is the same as the 8-bit version above.
70 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, 78 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
71 int32* char_index, uint32* code_point) { 79 int32* char_index, uint32* code_point) {
72 // Conversion is easy since the source is 32-bit. 80 // Conversion is easy since the source is 32-bit.
73 *code_point = src[*char_index]; 81 *code_point = src[*char_index];
74 82
75 // Validate the value. 83 // Validate the value.
76 return U_IS_UNICODE_CHAR(*code_point); 84 return IsValidCodepoint(*code_point);
77 } 85 }
78 #endif // defined(WCHAR_T_IS_UTF32) 86 #endif // defined(WCHAR_T_IS_UTF32)
79 87
80 // WriteUnicodeCharacter ------------------------------------------------------- 88 // WriteUnicodeCharacter -------------------------------------------------------
81 89
82 // Appends a UTF-8 character to the given 8-bit string. 90 // Appends a UTF-8 character to the given 8-bit string.
83 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { 91 void WriteUnicodeCharacter(uint32 code_point, std::string* output) {
84 if (code_point <= 0x7f) { 92 if (code_point <= 0x7f) {
85 // Fast path the common case of one byte. 93 // Fast path the common case of one byte.
86 output->push_back(code_point); 94 output->push_back(code_point);
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
127 // the result. 135 // the result.
128 template<typename SRC_CHAR, typename DEST_STRING> 136 template<typename SRC_CHAR, typename DEST_STRING>
129 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { 137 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {
130 output->clear(); 138 output->clear();
131 139
132 // ICU requires 32-bit numbers. 140 // ICU requires 32-bit numbers.
133 bool success = true; 141 bool success = true;
134 int32 src_len32 = static_cast<int32>(src_len); 142 int32 src_len32 = static_cast<int32>(src_len);
135 for (int32 i = 0; i < src_len32; i++) { 143 for (int32 i = 0; i < src_len32; i++) {
136 uint32 code_point; 144 uint32 code_point;
137 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) 145 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
138 WriteUnicodeCharacter(code_point, output); 146 WriteUnicodeCharacter(code_point, output);
139 else 147 } else {
148 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
149 // in place of an invalid codepoint.
140 success = false; 150 success = false;
151 }
141 } 152 }
142 return success; 153 return success;
143 } 154 }
144 155
145 156
146 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount 157 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount
147 // of space in the given string. We also assume that the input character types 158 // of space in the given string. We also assume that the input character types
148 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume 159 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume
149 // the string length is greater than zero. 160 // the string length is greater than zero.
150 template<typename CHAR> 161 template<typename CHAR>
(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after
421 const char* codepage_name, 432 const char* codepage_name,
422 OnStringUtilConversionError::Type on_error, 433 OnStringUtilConversionError::Type on_error,
423 std::wstring* wide) { 434 std::wstring* wide) {
424 wide->clear(); 435 wide->clear();
425 436
426 UErrorCode status = U_ZERO_ERROR; 437 UErrorCode status = U_ZERO_ERROR;
427 UConverter* converter = ucnv_open(codepage_name, &status); 438 UConverter* converter = ucnv_open(codepage_name, &status);
428 if (!U_SUCCESS(status)) 439 if (!U_SUCCESS(status))
429 return false; 440 return false;
430 441
431 // The worst case is all the input characters are non-BMP (32-bit) ones. 442 // Even in the worst case, the maximum length in 2-byte units of UTF-16
432 size_t uchar_max_length = encoded.length() * 2 + 1; 443 // output would be at most the same as the number of bytes in input. There
444 // is no single-byte encoding in which a character is mapped to a
445 // non-BMP character requiring two 2-byte units.
446 //
447 // Moreover, non-BMP characters in legacy multibyte encodings
448 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
449 // BOCU and SCSU, but we don't care about them.
450 size_t uchar_max_length = encoded.length() + 1;
433 451
434 UChar* uchar_dst; 452 UChar* uchar_dst;
435 #if defined(WCHAR_T_IS_UTF16) 453 #if defined(WCHAR_T_IS_UTF16)
436 uchar_dst = WriteInto(wide, uchar_max_length); 454 uchar_dst = WriteInto(wide, uchar_max_length);
437 #elif defined(WCHAR_T_IS_UTF32) 455 #elif defined(WCHAR_T_IS_UTF32)
438 // When wchar_t is wider than UChar (16 bits), convert into a temporary 456 // When wchar_t is wider than UChar (16 bits), convert into a temporary
439 // UChar* buffer. 457 // UChar* buffer.
440 std::vector<UChar> wide_uchar(uchar_max_length); 458 std::vector<UChar> wide_uchar(uchar_max_length);
441 uchar_dst = &wide_uchar[0]; 459 uchar_dst = &wide_uchar[0];
442 #endif // defined(WCHAR_T_IS_UTF32) 460 #endif // defined(WCHAR_T_IS_UTF32)
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
536 // This implementation is not so fast since it converts the text encoding 554 // This implementation is not so fast since it converts the text encoding
537 // twice. Please feel free to file a bug if this function hurts the 555 // twice. Please feel free to file a bug if this function hurts the
538 // performance of Chrome. 556 // performance of Chrome.
539 DCHECK(IsStringUTF8(input)); 557 DCHECK(IsStringUTF8(input));
540 std::wstring input_wide = UTF8ToWide(input); 558 std::wstring input_wide = UTF8ToWide(input);
541 std::wstring output_wide; 559 std::wstring output_wide;
542 TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide); 560 TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide);
543 *output = WideToUTF8(output_wide); 561 *output = WideToUTF8(output_wide);
544 return result; 562 return result;
545 } 563 }
OLDNEW
« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698