base/string_util_icu.cc - Issue 147038: Pass through non-character codepoints in UTF-8,16,32 and Wide conversion func...

Side by Side Diff: base/string_util_icu.cc

Issue 147038: Pass through non-character codepoints in UTF-8,16,32 and Wide conversion func... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/string_util.h"	5 #include "base/string_util.h"

6	6

7 #include <string.h>	7 #include <string.h>

8 #include <vector>	8 #include <vector>

9	9

10 #include "base/basictypes.h"	10 #include "base/basictypes.h"

11 #include "base/logging.h"	11 #include "base/logging.h"

12 #include "base/singleton.h"	12 #include "base/singleton.h"

13 #include "unicode/ucnv.h"	13 #include "unicode/ucnv.h"

14 #include "unicode/numfmt.h"	14 #include "unicode/numfmt.h"

15 #include "unicode/ustring.h"	15 #include "unicode/ustring.h"

16	16

17 namespace {	17 namespace {

18	18

	19 inline bool IsValidCodepoint(uint32 code_point) {

	20 // Excludes the surrogate code points ([0xD800, 0xDFFF]) and

	21 // codepoints larger than 0x10FFFF (the highest codepoint allowed).

	22 // Non-characters and unassigned codepoints are allowed.

	23 return code_point < 0xD800u ||

	24 (code_point >= 0xE000u && code_point <= 0x10FFFFu);

	25 }

	26

19 // ReadUnicodeCharacter --------------------------------------------------------	27 // ReadUnicodeCharacter --------------------------------------------------------

20	28

21 // Reads a UTF-8 stream, placing the next code point into the given output	29 // Reads a UTF-8 stream, placing the next code point into the given output

22 // |code_point|. |src| represents the entire string to read, and |char_index|	30 // |code_point|. |src| represents the entire string to read, and |char_index|

23 // is the character offset within the string to start reading at. |*char_index|	31 // is the character offset within the string to start reading at. |*char_index|

24 // will be updated to index the last character read, such that incrementing it	32 // will be updated to index the last character read, such that incrementing it

25 // (as in a for loop) will take the reader to the next character.	33 // (as in a for loop) will take the reader to the next character.

26 //	34 //

27 // Returns true on success. On false, |*code_point| will be invalid.	35 // Returns true on success. On false, |*code_point| will be invalid.

28 bool ReadUnicodeCharacter(const char* src, int32 src_len,	36 bool ReadUnicodeCharacter(const char* src, int32 src_len,

29 int32* char_index, uint32* code_point_out) {	37 int32* char_index, uint32* code_point_out) {

30 // U8_NEXT expects to be able to use -1 to signal an error, so we must	38 // U8_NEXT expects to be able to use -1 to signal an error, so we must

31 // use a signed type for code_point. But this function returns false	39 // use a signed type for code_point. But this function returns false

32 // on error anyway, so code_point_out is unsigned.	40 // on error anyway, so code_point_out is unsigned.

33 int32 code_point;	41 int32 code_point;

34 U8_NEXT(src, *char_index, src_len, code_point);	42 U8_NEXT(src, *char_index, src_len, code_point);

35 *code_point_out = static_cast<uint32>(code_point);	43 *code_point_out = static_cast<uint32>(code_point);

36	44

37 // The ICU macro above moves to the next char, we want to point to the last	45 // The ICU macro above moves to the next char, we want to point to the last

38 // char consumed.	46 // char consumed.

39 (*char_index)--;	47 (*char_index)--;

40	48

41 // Validate the decoded value.	49 // Validate the decoded value.

42 return U_IS_UNICODE_CHAR(code_point);	50 return IsValidCodepoint(code_point);

43 }	51 }

44	52

45 // Reads a UTF-16 character. The usage is the same as the 8-bit version above.	53 // Reads a UTF-16 character. The usage is the same as the 8-bit version above.

46 bool ReadUnicodeCharacter(const char16* src, int32 src_len,	54 bool ReadUnicodeCharacter(const char16* src, int32 src_len,

47 int32* char_index, uint32* code_point) {	55 int32* char_index, uint32* code_point) {

48 if (U16_IS_SURROGATE(src[*char_index])) {	56 if (U16_IS_SURROGATE(src[*char_index])) {

49 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) ||	57 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) ||

50 *char_index + 1 >= src_len ||	58 *char_index + 1 >= src_len ||

51 !U16_IS_TRAIL(src[*char_index + 1])) {	59 !U16_IS_TRAIL(src[*char_index + 1])) {

52 // Invalid surrogate pair.	60 // Invalid surrogate pair.

53 return false;	61 return false;

54 }	62 }

55	63

56 // Valid surrogate pair.	64 // Valid surrogate pair.

57 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index],	65 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index],

58 src[*char_index + 1]);	66 src[*char_index + 1]);

59 (*char_index)++;	67 (*char_index)++;

60 } else {	68 } else {

61 // Not a surrogate, just one 16-bit word.	69 // Not a surrogate, just one 16-bit word.

62 *code_point = src[*char_index];	70 *code_point = src[*char_index];

63 }	71 }

64	72

65 return U_IS_UNICODE_CHAR(*code_point);	73 return IsValidCodepoint(*code_point);

66 }	74 }

67	75

68 #if defined(WCHAR_T_IS_UTF32)	76 #if defined(WCHAR_T_IS_UTF32)

69 // Reads UTF-32 character. The usage is the same as the 8-bit version above.	77 // Reads UTF-32 character. The usage is the same as the 8-bit version above.

70 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,	78 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,

71 int32* char_index, uint32* code_point) {	79 int32* char_index, uint32* code_point) {

72 // Conversion is easy since the source is 32-bit.	80 // Conversion is easy since the source is 32-bit.

73 *code_point = src[*char_index];	81 *code_point = src[*char_index];

74	82

75 // Validate the value.	83 // Validate the value.

76 return U_IS_UNICODE_CHAR(*code_point);	84 return IsValidCodepoint(*code_point);

77 }	85 }

78 #endif // defined(WCHAR_T_IS_UTF32)	86 #endif // defined(WCHAR_T_IS_UTF32)

79	87

80 // WriteUnicodeCharacter -------------------------------------------------------	88 // WriteUnicodeCharacter -------------------------------------------------------

81	89

82 // Appends a UTF-8 character to the given 8-bit string.	90 // Appends a UTF-8 character to the given 8-bit string.

83 void WriteUnicodeCharacter(uint32 code_point, std::string* output) {	91 void WriteUnicodeCharacter(uint32 code_point, std::string* output) {

84 if (code_point <= 0x7f) {	92 if (code_point <= 0x7f) {

85 // Fast path the common case of one byte.	93 // Fast path the common case of one byte.

86 output->push_back(code_point);	94 output->push_back(code_point);

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
127 // the result.	135 // the result.

128 template<typename SRC_CHAR, typename DEST_STRING>	136 template<typename SRC_CHAR, typename DEST_STRING>

129 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {	137 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {

130 output->clear();	138 output->clear();

131	139

132 // ICU requires 32-bit numbers.	140 // ICU requires 32-bit numbers.

133 bool success = true;	141 bool success = true;

134 int32 src_len32 = static_cast<int32>(src_len);	142 int32 src_len32 = static_cast<int32>(src_len);

135 for (int32 i = 0; i < src_len32; i++) {	143 for (int32 i = 0; i < src_len32; i++) {

136 uint32 code_point;	144 uint32 code_point;

137 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point))	145 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {

138 WriteUnicodeCharacter(code_point, output);	146 WriteUnicodeCharacter(code_point, output);

139 else	147 } else {

	148 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)

	149 // in place of an invalid codepoint.

140 success = false;	150 success = false;

	151 }

141 }	152 }

142 return success;	153 return success;

143 }	154 }

144	155

145	156

146 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount	157 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount

147 // of space in the given string. We also assume that the input character types	158 // of space in the given string. We also assume that the input character types

148 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume	159 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume

149 // the string length is greater than zero.	160 // the string length is greater than zero.

150 template<typename CHAR>	161 template<typename CHAR>

(...skipping 270 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
421 const char* codepage_name,	432 const char* codepage_name,

422 OnStringUtilConversionError::Type on_error,	433 OnStringUtilConversionError::Type on_error,

423 std::wstring* wide) {	434 std::wstring* wide) {

424 wide->clear();	435 wide->clear();

425	436

426 UErrorCode status = U_ZERO_ERROR;	437 UErrorCode status = U_ZERO_ERROR;

427 UConverter* converter = ucnv_open(codepage_name, &status);	438 UConverter* converter = ucnv_open(codepage_name, &status);

428 if (!U_SUCCESS(status))	439 if (!U_SUCCESS(status))

429 return false;	440 return false;

430	441

431 // The worst case is all the input characters are non-BMP (32-bit) ones.	442 // Even in the worst case, the maximum length in 2-byte units of UTF-16

432 size_t uchar_max_length = encoded.length() * 2 + 1;	443 // output would be at most the same as the number of bytes in input. There

	444 // is no single-byte encoding in which a character is mapped to a

	445 // non-BMP character requiring two 2-byte units.

	446 //

	447 // Moreover, non-BMP characters in legacy multibyte encodings

	448 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are

	449 // BOCU and SCSU, but we don't care about them.

	450 size_t uchar_max_length = encoded.length() + 1;

433	451

434 UChar* uchar_dst;	452 UChar* uchar_dst;

435 #if defined(WCHAR_T_IS_UTF16)	453 #if defined(WCHAR_T_IS_UTF16)

436 uchar_dst = WriteInto(wide, uchar_max_length);	454 uchar_dst = WriteInto(wide, uchar_max_length);

437 #elif defined(WCHAR_T_IS_UTF32)	455 #elif defined(WCHAR_T_IS_UTF32)

438 // When wchar_t is wider than UChar (16 bits), convert into a temporary	456 // When wchar_t is wider than UChar (16 bits), convert into a temporary

439 // UChar* buffer.	457 // UChar* buffer.

440 std::vector<UChar> wide_uchar(uchar_max_length);	458 std::vector<UChar> wide_uchar(uchar_max_length);

441 uchar_dst = &wide_uchar[0];	459 uchar_dst = &wide_uchar[0];

442 #endif // defined(WCHAR_T_IS_UTF32)	460 #endif // defined(WCHAR_T_IS_UTF32)

(...skipping 93 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
536 // This implementation is not so fast since it converts the text encoding	554 // This implementation is not so fast since it converts the text encoding

537 // twice. Please feel free to file a bug if this function hurts the	555 // twice. Please feel free to file a bug if this function hurts the

538 // performance of Chrome.	556 // performance of Chrome.

539 DCHECK(IsStringUTF8(input));	557 DCHECK(IsStringUTF8(input));

540 std::wstring input_wide = UTF8ToWide(input);	558 std::wstring input_wide = UTF8ToWide(input);

541 std::wstring output_wide;	559 std::wstring output_wide;

542 TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide);	560 TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide);

543 *output = WideToUTF8(output_wide);	561 *output = WideToUTF8(output_wide);

544 return result;	562 return result;

545 }	563 }

OLD	NEW

« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »