| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "base/i18n/icu_string_conversions.h" | |
| 6 | |
| 7 #include <vector> | |
| 8 | |
| 9 #include "base/basictypes.h" | |
| 10 #include "base/logging.h" | |
| 11 #include "base/memory/scoped_ptr.h" | |
| 12 #include "base/strings/string_util.h" | |
| 13 #include "base/strings/utf_string_conversions.h" | |
| 14 #include "third_party/icu/source/common/unicode/ucnv.h" | |
| 15 #include "third_party/icu/source/common/unicode/ucnv_cb.h" | |
| 16 #include "third_party/icu/source/common/unicode/ucnv_err.h" | |
| 17 #include "third_party/icu/source/common/unicode/unorm.h" | |
| 18 #include "third_party/icu/source/common/unicode/ustring.h" | |
| 19 | |
| 20 namespace base { | |
| 21 | |
| 22 namespace { | |
| 23 // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE | |
| 24 // in source/common/ucnv_err.c. | |
| 25 | |
| 26 // Copyright (c) 1995-2006 International Business Machines Corporation | |
| 27 // and others | |
| 28 // | |
| 29 // All rights reserved. | |
| 30 // | |
| 31 | |
| 32 // Permission is hereby granted, free of charge, to any person obtaining a | |
| 33 // copy of this software and associated documentation files (the "Software"), | |
| 34 // to deal in the Software without restriction, including without limitation | |
| 35 // the rights to use, copy, modify, merge, publish, distribute, and/or | |
| 36 // sell copies of the Software, and to permit persons to whom the Software | |
| 37 // is furnished to do so, provided that the above copyright notice(s) and | |
| 38 // this permission notice appear in all copies of the Software and that | |
| 39 // both the above copyright notice(s) and this permission notice appear in | |
| 40 // supporting documentation. | |
| 41 // | |
| 42 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
| 43 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
| 44 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT | |
| 45 // OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS | |
| 46 // INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT | |
| 47 // OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS | |
| 48 // OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE | |
| 49 // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE | |
| 50 // OR PERFORMANCE OF THIS SOFTWARE. | |
| 51 // | |
| 52 // Except as contained in this notice, the name of a copyright holder | |
| 53 // shall not be used in advertising or otherwise to promote the sale, use | |
| 54 // or other dealings in this Software without prior written authorization | |
| 55 // of the copyright holder. | |
| 56 | |
| 57 // ___________________________________________________________________________ | |
| 58 // | |
| 59 // All trademarks and registered trademarks mentioned herein are the property | |
| 60 // of their respective owners. | |
| 61 | |
| 62 void ToUnicodeCallbackSubstitute(const void* context, | |
| 63 UConverterToUnicodeArgs *to_args, | |
| 64 const char* code_units, | |
| 65 int32_t length, | |
| 66 UConverterCallbackReason reason, | |
| 67 UErrorCode * err) { | |
| 68 static const UChar kReplacementChar = 0xFFFD; | |
| 69 if (reason <= UCNV_IRREGULAR) { | |
| 70 if (context == NULL || | |
| 71 (*(reinterpret_cast<const char*>(context)) == 'i' && | |
| 72 reason == UCNV_UNASSIGNED)) { | |
| 73 *err = U_ZERO_ERROR; | |
| 74 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); | |
| 75 } | |
| 76 // else the caller must have set the error code accordingly. | |
| 77 } | |
| 78 // else ignore the reset, close and clone calls. | |
| 79 } | |
| 80 | |
| 81 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, | |
| 82 int uchar_len, OnStringConversionError::Type on_error, | |
| 83 std::string* encoded) { | |
| 84 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, | |
| 85 ucnv_getMaxCharSize(converter)); | |
| 86 encoded->resize(encoded_max_length); | |
| 87 | |
| 88 UErrorCode status = U_ZERO_ERROR; | |
| 89 | |
| 90 // Setup our error handler. | |
| 91 switch (on_error) { | |
| 92 case OnStringConversionError::FAIL: | |
| 93 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, | |
| 94 NULL, NULL, &status); | |
| 95 break; | |
| 96 case OnStringConversionError::SKIP: | |
| 97 case OnStringConversionError::SUBSTITUTE: | |
| 98 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, | |
| 99 NULL, NULL, &status); | |
| 100 break; | |
| 101 default: | |
| 102 NOTREACHED(); | |
| 103 } | |
| 104 | |
| 105 // ucnv_fromUChars returns size not including terminating null | |
| 106 int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], | |
| 107 encoded_max_length, uchar_src, uchar_len, &status); | |
| 108 encoded->resize(actual_size); | |
| 109 ucnv_close(converter); | |
| 110 if (U_SUCCESS(status)) | |
| 111 return true; | |
| 112 encoded->clear(); // Make sure the output is empty on error. | |
| 113 return false; | |
| 114 } | |
| 115 | |
| 116 // Set up our error handler for ToUTF-16 converters | |
| 117 void SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error, | |
| 118 UConverter* converter, UErrorCode* status) { | |
| 119 switch (on_error) { | |
| 120 case OnStringConversionError::FAIL: | |
| 121 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, | |
| 122 NULL, NULL, status); | |
| 123 break; | |
| 124 case OnStringConversionError::SKIP: | |
| 125 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, | |
| 126 NULL, NULL, status); | |
| 127 break; | |
| 128 case OnStringConversionError::SUBSTITUTE: | |
| 129 ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0, | |
| 130 NULL, NULL, status); | |
| 131 break; | |
| 132 default: | |
| 133 NOTREACHED(); | |
| 134 } | |
| 135 } | |
| 136 | |
| 137 } // namespace | |
| 138 | |
| 139 // Codepage <-> Wide/UTF-16 --------------------------------------------------- | |
| 140 | |
| 141 bool UTF16ToCodepage(const string16& utf16, | |
| 142 const char* codepage_name, | |
| 143 OnStringConversionError::Type on_error, | |
| 144 std::string* encoded) { | |
| 145 encoded->clear(); | |
| 146 | |
| 147 UErrorCode status = U_ZERO_ERROR; | |
| 148 UConverter* converter = ucnv_open(codepage_name, &status); | |
| 149 if (!U_SUCCESS(status)) | |
| 150 return false; | |
| 151 | |
| 152 return ConvertFromUTF16(converter, utf16.c_str(), | |
| 153 static_cast<int>(utf16.length()), on_error, encoded); | |
| 154 } | |
| 155 | |
| 156 bool CodepageToUTF16(const std::string& encoded, | |
| 157 const char* codepage_name, | |
| 158 OnStringConversionError::Type on_error, | |
| 159 string16* utf16) { | |
| 160 utf16->clear(); | |
| 161 | |
| 162 UErrorCode status = U_ZERO_ERROR; | |
| 163 UConverter* converter = ucnv_open(codepage_name, &status); | |
| 164 if (!U_SUCCESS(status)) | |
| 165 return false; | |
| 166 | |
| 167 // Even in the worst case, the maximum length in 2-byte units of UTF-16 | |
| 168 // output would be at most the same as the number of bytes in input. There | |
| 169 // is no single-byte encoding in which a character is mapped to a | |
| 170 // non-BMP character requiring two 2-byte units. | |
| 171 // | |
| 172 // Moreover, non-BMP characters in legacy multibyte encodings | |
| 173 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are | |
| 174 // BOCU and SCSU, but we don't care about them. | |
| 175 size_t uchar_max_length = encoded.length() + 1; | |
| 176 | |
| 177 SetUpErrorHandlerForToUChars(on_error, converter, &status); | |
| 178 scoped_ptr<char16[]> buffer(new char16[uchar_max_length]); | |
| 179 int actual_size = ucnv_toUChars(converter, buffer.get(), | |
| 180 static_cast<int>(uchar_max_length), encoded.data(), | |
| 181 static_cast<int>(encoded.length()), &status); | |
| 182 ucnv_close(converter); | |
| 183 if (!U_SUCCESS(status)) { | |
| 184 utf16->clear(); // Make sure the output is empty on error. | |
| 185 return false; | |
| 186 } | |
| 187 | |
| 188 utf16->assign(buffer.get(), actual_size); | |
| 189 return true; | |
| 190 } | |
| 191 | |
| 192 bool ConvertToUtf8AndNormalize(const std::string& text, | |
| 193 const std::string& charset, | |
| 194 std::string* result) { | |
| 195 result->clear(); | |
| 196 string16 utf16; | |
| 197 if (!CodepageToUTF16( | |
| 198 text, charset.c_str(), OnStringConversionError::FAIL, &utf16)) | |
| 199 return false; | |
| 200 | |
| 201 UErrorCode status = U_ZERO_ERROR; | |
| 202 size_t max_length = utf16.length() + 1; | |
| 203 string16 normalized_utf16; | |
| 204 scoped_ptr<char16[]> buffer(new char16[max_length]); | |
| 205 int actual_length = unorm_normalize( | |
| 206 utf16.c_str(), utf16.length(), UNORM_NFC, 0, | |
| 207 buffer.get(), static_cast<int>(max_length), &status); | |
| 208 if (!U_SUCCESS(status)) | |
| 209 return false; | |
| 210 normalized_utf16.assign(buffer.get(), actual_length); | |
| 211 | |
| 212 return UTF16ToUTF8(normalized_utf16.data(), | |
| 213 normalized_utf16.length(), result); | |
| 214 } | |
| 215 | |
| 216 } // namespace base | |
| OLD | NEW |