Index: base/i18n/icu_string_conversions.cc |
=================================================================== |
--- base/i18n/icu_string_conversions.cc (revision 31214) |
+++ base/i18n/icu_string_conversions.cc (working copy) |
@@ -157,6 +157,90 @@ |
// Codepage <-> Wide/UTF-16 --------------------------------------------------- |
+// Encodes utf16 in the codepage codepage_name into encoded (cleared first). |
+// Returns false if ucnv_open() fails, else the result of ConvertFromUTF16(). |
+bool UTF16ToCodepage(const string16& utf16, |
+ const char* codepage_name, |
+ OnStringConversionError::Type on_error, |
+ std::string* encoded) { |
+ encoded->clear(); |
+ |
+ UErrorCode status = U_ZERO_ERROR; |
+ UConverter* converter = ucnv_open(codepage_name, &status); |
+ if (!U_SUCCESS(status)) |
+ return false;  // Converter could not be opened; encoded stays empty. |
+ // NOTE(review): converter isn't closed here; verify ConvertFromUTF16() does. |
+ return ConvertFromUTF16(converter, utf16.c_str(), |
+ static_cast<int>(utf16.length()), on_error, encoded); |
+} |
+ |
+bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, |
+ const char* codepage_name, |
+ OnStringConversionError::Type on_error, |
+ string16* utf16, |
+ size_t* offset_for_adjustment) { |
+ utf16->clear(); |
+ // Decodes encoded into utf16, remapping *offset_for_adjustment if non-NULL. |
+ UErrorCode status = U_ZERO_ERROR; |
+ UConverter* converter = ucnv_open(codepage_name, &status); |
+ if (!U_SUCCESS(status)) |
+ return false;  // Converter could not be opened; utf16 stays empty. |
+ |
+ // Even in the worst case, the maximum length in 2-byte units of UTF-16 |
+ // output would be at most the same as the number of bytes in input. There |
+ // is no single-byte encoding in which a character is mapped to a |
+ // non-BMP character requiring two 2-byte units. |
+ // |
+ // Moreover, non-BMP characters in legacy multibyte encodings |
+ // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are |
+ // BOCU and SCSU, but we don't care about them. |
+ size_t uchar_max_length = encoded.length() + 1; |
+ // NB: byte_buffer and byte_buffer_length are in char16 units, not bytes. |
+ SetUpErrorHandlerForToUChars(on_error, converter, &status); |
+ char16* byte_buffer = WriteInto(utf16, uchar_max_length); |
+ int byte_buffer_length = static_cast<int>(uchar_max_length); |
+ const char* data = encoded.data(); |
+ int length = static_cast<int>(encoded.length()); |
+ int actual_size = 0; |
+ if (offset_for_adjustment) {  // NULL: caller wants no adjustment. |
+ if (*offset_for_adjustment >= encoded.length()) {  // Out of range. |
+ *offset_for_adjustment = string16::npos; |
+ } else if (*offset_for_adjustment != 0) {  // 0 is left as 0. |
+ // Try to adjust the offset by converting the string in two pieces and |
+ // using the length of the first piece as the adjusted offset. |
+ actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, |
+ data, static_cast<int>(*offset_for_adjustment), &status); |
+ if (U_SUCCESS(status)) { |
+ // Conversion succeeded, so update the offset and then fall through to |
+ // appending the second half of the string. |
+ data += *offset_for_adjustment; |
+ length -= *offset_for_adjustment; |
+ *offset_for_adjustment = actual_size;  // Now in UTF-16 units. |
+ byte_buffer += actual_size;  // Append after the first piece. |
+ byte_buffer_length -= actual_size; |
+ } else { |
+ // The offset may have been in the middle of an encoding sequence; mark |
+ // it as having failed to adjust and then try to convert the entire |
+ // string. |
+ *offset_for_adjustment = string16::npos; |
+ actual_size = 0; |
+ ucnv_reset(converter);  // Drop state left by the failed call. |
+ status = U_ZERO_ERROR;  // Clear the error so the retry runs. |
+ } |
+ } |
+ } |
+ actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data, |
+ length, &status); |
+ ucnv_close(converter); |
+ if (!U_SUCCESS(status)) { |
+ utf16->clear(); // Make sure the output is empty on error. |
+ return false; |
+ } |
+ |
+ utf16->resize(actual_size);  // Trim to the converted length. |
+ return true; |
+} |
+ |
// Convert a wstring into the specified codepage_name. If the codepage |
// isn't found, return false. |
bool WideToCodepage(const std::wstring& wide, |
@@ -188,31 +272,16 @@ |
#endif // defined(WCHAR_T_IS_UTF32) |
} |
-// Convert a UTF-16 string into the specified codepage_name. If the codepage |
-// isn't found, return false. |
-bool UTF16ToCodepage(const string16& utf16, |
- const char* codepage_name, |
- OnStringConversionError::Type on_error, |
- std::string* encoded) { |
- encoded->clear(); |
- |
- UErrorCode status = U_ZERO_ERROR; |
- UConverter* converter = ucnv_open(codepage_name, &status); |
- if (!U_SUCCESS(status)) |
- return false; |
- |
- return ConvertFromUTF16(converter, utf16.c_str(), |
- static_cast<int>(utf16.length()), on_error, encoded); |
-} |
- |
// Converts a string of the given codepage into wstring. |
// If the codepage isn't found, return false. |
-bool CodepageToWide(const std::string& encoded, |
- const char* codepage_name, |
- OnStringConversionError::Type on_error, |
- std::wstring* wide) { |
+bool CodepageToWideAndAdjustOffset(const std::string& encoded, |
+ const char* codepage_name, |
+ OnStringConversionError::Type on_error, |
+ std::wstring* wide, |
+ size_t* offset_for_adjustment) { |
#if defined(WCHAR_T_IS_UTF16) |
- return CodepageToUTF16(encoded, codepage_name, on_error, wide); |
+ return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide, |
+ offset_for_adjustment); |
#elif defined(WCHAR_T_IS_UTF32) |
wide->clear(); |
@@ -227,70 +296,53 @@ |
// this can be 4 times larger than actually needed. |
size_t wchar_max_length = encoded.length() + 1; |
- // The byte buffer and its length to pass to ucnv_toAlgorithimic. |
- char* byte_buffer = reinterpret_cast<char*>( |
- WriteInto(wide, wchar_max_length)); |
- int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; |
- |
SetUpErrorHandlerForToUChars(on_error, converter, &status); |
- int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), |
- converter, |
- byte_buffer, |
- byte_buffer_length, |
- encoded.data(), |
- static_cast<int>(encoded.length()), |
- &status); |
+ char* byte_buffer = |
+ reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)); |
+ int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t); |
+ const char* data = encoded.data(); |
+ int length = static_cast<int>(encoded.length()); |
+ int actual_size = 0; |
+ if (offset_for_adjustment) { |
+ if (*offset_for_adjustment >= encoded.length()) { |
+ *offset_for_adjustment = std::wstring::npos; |
+ } else if (*offset_for_adjustment != 0) { |
+ // Try to adjust the offset by converting the string in two pieces and |
+ // using the length of the first piece as the adjusted offset. |
+ actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, |
+ byte_buffer, byte_buffer_length, data, |
+ static_cast<int>(*offset_for_adjustment), &status); |
+ if (U_SUCCESS(status)) { |
+ // Conversion succeeded, so update the offset and then fall through to |
+ // appending the second half of the string. |
+ data += *offset_for_adjustment; |
+ length -= *offset_for_adjustment; |
+ *offset_for_adjustment = actual_size / sizeof(wchar_t); |
+ byte_buffer += actual_size; |
+ byte_buffer_length -= actual_size; |
+ } else { |
+ // The offset may have been in the middle of an encoding sequence; mark |
+ // it as having failed to adjust and then try to convert the entire |
+ // string. |
+ *offset_for_adjustment = std::wstring::npos; |
+ actual_size = 0; |
+ ucnv_reset(converter); |
+ status = U_ZERO_ERROR; |
+ } |
+ } |
+ } |
+ actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, |
+ byte_buffer, byte_buffer_length, data, length, &status); |
ucnv_close(converter); |
- |
if (!U_SUCCESS(status)) { |
wide->clear(); // Make sure the output is empty on error. |
return false; |
} |
// actual_size is # of bytes. |
- wide->resize(actual_size / 4); |
+ wide->resize(actual_size / sizeof(wchar_t)); |
return true; |
#endif // defined(WCHAR_T_IS_UTF32) |
} |
-// Converts a string of the given codepage into UTF-16. |
-// If the codepage isn't found, return false. |
-bool CodepageToUTF16(const std::string& encoded, |
- const char* codepage_name, |
- OnStringConversionError::Type on_error, |
- string16* utf16) { |
- utf16->clear(); |
- |
- UErrorCode status = U_ZERO_ERROR; |
- UConverter* converter = ucnv_open(codepage_name, &status); |
- if (!U_SUCCESS(status)) |
- return false; |
- |
- // Even in the worst case, the maximum length in 2-byte units of UTF-16 |
- // output would be at most the same as the number of bytes in input. There |
- // is no single-byte encoding in which a character is mapped to a |
- // non-BMP character requiring two 2-byte units. |
- // |
- // Moreover, non-BMP characters in legacy multibyte encodings |
- // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are |
- // BOCU and SCSU, but we don't care about them. |
- size_t uchar_max_length = encoded.length() + 1; |
- |
- SetUpErrorHandlerForToUChars(on_error, converter, &status); |
- int actual_size = ucnv_toUChars(converter, |
- WriteInto(utf16, uchar_max_length), |
- static_cast<int>(uchar_max_length), |
- encoded.data(), |
- static_cast<int>(encoded.length()), |
- &status); |
- ucnv_close(converter); |
- if (!U_SUCCESS(status)) { |
- utf16->clear(); // Make sure the output is empty on error. |
- return false; |
- } |
- |
- utf16->resize(actual_size); |
- return true; |
-} |
- |
} // namespace base |