base/i18n/icu_string_conversions.cc - Issue 380007: Clean up recent string conversion function changes, part 1: Remove unnecessar...

Side by Side Diff: base/i18n/icu_string_conversions.cc

Issue 380007: Clean up recent string conversion function changes, part 1: Remove unnecessar... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 11 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/i18n/icu_string_conversions.h"	5 #include "base/i18n/icu_string_conversions.h"

6	6

7 #include <vector>	7 #include <vector>

8	8

9 #include "base/basictypes.h"	9 #include "base/basictypes.h"

10 #include "base/logging.h"	10 #include "base/logging.h"

(...skipping 139 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
150	150

151 } // namespace	151 } // namespace

152	152

153 const char kCodepageLatin1[] = "ISO-8859-1";	153 const char kCodepageLatin1[] = "ISO-8859-1";

154 const char kCodepageUTF8[] = "UTF-8";	154 const char kCodepageUTF8[] = "UTF-8";

155 const char kCodepageUTF16BE[] = "UTF-16BE";	155 const char kCodepageUTF16BE[] = "UTF-16BE";

156 const char kCodepageUTF16LE[] = "UTF-16LE";	156 const char kCodepageUTF16LE[] = "UTF-16LE";

157	157

158 // Codepage <-> Wide/UTF-16 ---------------------------------------------------	158 // Codepage <-> Wide/UTF-16 ---------------------------------------------------

159	159

160 // Convert a UTF-16 string into the specified codepage_name. If the codepage

161 // isn't found, return false.

162 bool UTF16ToCodepage(const string16& utf16,	160 bool UTF16ToCodepage(const string16& utf16,

163 const char* codepage_name,	161 const char* codepage_name,

164 OnStringConversionError::Type on_error,	162 OnStringConversionError::Type on_error,

165 std::string* encoded) {	163 std::string* encoded) {

166 encoded->clear();	164 encoded->clear();

167	165

168 UErrorCode status = U_ZERO_ERROR;	166 UErrorCode status = U_ZERO_ERROR;

169 UConverter* converter = ucnv_open(codepage_name, &status);	167 UConverter* converter = ucnv_open(codepage_name, &status);

170 if (!U_SUCCESS(status))	168 if (!U_SUCCESS(status))

171 return false;	169 return false;

172	170

173 return ConvertFromUTF16(converter, utf16.c_str(),	171 return ConvertFromUTF16(converter, utf16.c_str(),

174 static_cast<int>(utf16.length()), on_error, encoded);	172 static_cast<int>(utf16.length()), on_error, encoded);

175 }	173 }

176	174

177 bool CodepageToUTF16AndAdjustOffset(const std::string& encoded,	175 bool CodepageToUTF16(const std::string& encoded,

178 const char* codepage_name,	176 const char* codepage_name,

179 OnStringConversionError::Type on_error,	177 OnStringConversionError::Type on_error,

180 string16* utf16,	178 string16* utf16) {

181 size_t* offset_for_adjustment) {

182 utf16->clear();	179 utf16->clear();

183	180

184 UErrorCode status = U_ZERO_ERROR;	181 UErrorCode status = U_ZERO_ERROR;

185 UConverter* converter = ucnv_open(codepage_name, &status);	182 UConverter* converter = ucnv_open(codepage_name, &status);

186 if (!U_SUCCESS(status))	183 if (!U_SUCCESS(status))

187 return false;	184 return false;

188	185

189 // Even in the worst case, the maximum length in 2-byte units of UTF-16	186 // Even in the worst case, the maximum length in 2-byte units of UTF-16

190 // output would be at most the same as the number of bytes in input. There	187 // output would be at most the same as the number of bytes in input. There

191 // is no single-byte encoding in which a character is mapped to a	188 // is no single-byte encoding in which a character is mapped to a

192 // non-BMP character requiring two 2-byte units.	189 // non-BMP character requiring two 2-byte units.

193 //	190 //

194 // Moreover, non-BMP characters in legacy multibyte encodings	191 // Moreover, non-BMP characters in legacy multibyte encodings

195 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are	192 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are

196 // BOCU and SCSU, but we don't care about them.	193 // BOCU and SCSU, but we don't care about them.

197 size_t uchar_max_length = encoded.length() + 1;	194 size_t uchar_max_length = encoded.length() + 1;

198	195

199 SetUpErrorHandlerForToUChars(on_error, converter, &status);	196 SetUpErrorHandlerForToUChars(on_error, converter, &status);

200 char16* byte_buffer = WriteInto(utf16, uchar_max_length);	197 int actual_size = ucnv_toUChars(converter, WriteInto(utf16, uchar_max_length),

201 int byte_buffer_length = static_cast<int>(uchar_max_length);	198 static_cast<int>(uchar_max_length), encoded.data(),

202 const char* data = encoded.data();	199 static_cast<int>(encoded.length()), &status);

203 int length = static_cast<int>(encoded.length());

204 int actual_size = 0;

205 if (offset_for_adjustment) {

206 if (*offset_for_adjustment >= encoded.length()) {

207 *offset_for_adjustment = string16::npos;

208 } else if (*offset_for_adjustment != 0) {

209 // Try to adjust the offset by converting the string in two pieces and

210 // using the length of the first piece as the adjusted offset.

211 actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length,

212 data, static_cast<int>(*offset_for_adjustment), &status);

213 if (U_SUCCESS(status)) {

214 // Conversion succeeded, so update the offset and then fall through to

215 // appending the second half of the string.

216 data += *offset_for_adjustment;

217 length -= *offset_for_adjustment;

218 *offset_for_adjustment = actual_size;

219 byte_buffer += actual_size;

220 byte_buffer_length -= actual_size;

221 } else {

222 // The offset may have been in the middle of an encoding sequence; mark

223 // it as having failed to adjust and then try to convert the entire

224 // string.

225 *offset_for_adjustment = string16::npos;

226 actual_size = 0;

227 ucnv_reset(converter);

228 status = U_ZERO_ERROR;

229 }

230 }

231 }

232 actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data,

233 length, &status);

234 ucnv_close(converter);	200 ucnv_close(converter);

235 if (!U_SUCCESS(status)) {	201 if (!U_SUCCESS(status)) {

236 utf16->clear(); // Make sure the output is empty on error.	202 utf16->clear(); // Make sure the output is empty on error.

237 return false;	203 return false;

238 }	204 }

239	205

240 utf16->resize(actual_size);	206 utf16->resize(actual_size);

241 return true;	207 return true;

242 }	208 }

243	209

244 // Convert a wstring into the specified codepage_name. If the codepage

245 // isn't found, return false.

246 bool WideToCodepage(const std::wstring& wide,	210 bool WideToCodepage(const std::wstring& wide,

247 const char* codepage_name,	211 const char* codepage_name,

248 OnStringConversionError::Type on_error,	212 OnStringConversionError::Type on_error,

249 std::string* encoded) {	213 std::string* encoded) {

250 #if defined(WCHAR_T_IS_UTF16)	214 #if defined(WCHAR_T_IS_UTF16)

251 return UTF16ToCodepage(wide, codepage_name, on_error, encoded);	215 return UTF16ToCodepage(wide, codepage_name, on_error, encoded);

252 #elif defined(WCHAR_T_IS_UTF32)	216 #elif defined(WCHAR_T_IS_UTF32)

253 encoded->clear();	217 encoded->clear();

254	218

255 UErrorCode status = U_ZERO_ERROR;	219 UErrorCode status = U_ZERO_ERROR;

256 UConverter* converter = ucnv_open(codepage_name, &status);	220 UConverter* converter = ucnv_open(codepage_name, &status);

257 if (!U_SUCCESS(status))	221 if (!U_SUCCESS(status))

258 return false;	222 return false;

259	223

260 int utf16_len;	224 int utf16_len;

261 // When wchar_t is wider than UChar (16 bits), transform |wide| into a	225 // When wchar_t is wider than UChar (16 bits), transform |wide| into a

262 // UChar* string. Size the UChar* buffer to be large enough to hold twice	226 // UChar* string. Size the UChar* buffer to be large enough to hold twice

263 // as many UTF-16 code units (UChar's) as there are Unicode code points,	227 // as many UTF-16 code units (UChar's) as there are Unicode code points,

264 // in case each code point translates to a UTF-16 surrogate pair,	228 // in case each code point translates to a UTF-16 surrogate pair,

265 // and leave room for a NUL terminator.	229 // and leave room for a NUL terminator.

266 std::vector<UChar> utf16(wide.length() * 2 + 1);	230 std::vector<UChar> utf16(wide.length() * 2 + 1);

267 u_strFromWCS(&utf16[0], utf16.size(), &utf16_len,	231 u_strFromWCS(&utf16[0], utf16.size(), &utf16_len,

268 wide.c_str(), wide.length(), &status);	232 wide.c_str(), wide.length(), &status);

269 DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";	233 DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";

270	234

271 return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded);	235 return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded);

272 #endif // defined(WCHAR_T_IS_UTF32)	236 #endif // defined(WCHAR_T_IS_UTF32)

273 }	237 }

274	238

275 // Converts a string of the given codepage into wstring.	239 bool CodepageToWide(const std::string& encoded,

276 // If the codepage isn't found, return false.	240 const char* codepage_name,

277 bool CodepageToWideAndAdjustOffset(const std::string& encoded,	241 OnStringConversionError::Type on_error,

278 const char* codepage_name,	242 std::wstring* wide) {

279 OnStringConversionError::Type on_error,

280 std::wstring* wide,

281 size_t* offset_for_adjustment) {

282 #if defined(WCHAR_T_IS_UTF16)	243 #if defined(WCHAR_T_IS_UTF16)

283 return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide,	244 return CodepageToUTF16(encoded, codepage_name, on_error, wide);

284 offset_for_adjustment);

285 #elif defined(WCHAR_T_IS_UTF32)	245 #elif defined(WCHAR_T_IS_UTF32)

286 wide->clear();	246 wide->clear();

287	247

288 UErrorCode status = U_ZERO_ERROR;	248 UErrorCode status = U_ZERO_ERROR;

289 UConverter* converter = ucnv_open(codepage_name, &status);	249 UConverter* converter = ucnv_open(codepage_name, &status);

290 if (!U_SUCCESS(status))	250 if (!U_SUCCESS(status))

291 return false;	251 return false;

292	252

293 // The maximum length in 4-byte units of UTF-32 output would be	253 // The maximum length in 4-byte units of UTF-32 output would be

294 // at most the same as the number of bytes in input. In the worst	254 // at most the same as the number of bytes in input. In the worst

295 // case of GB18030 (excluding escape-based encodings like ISO-2022-JP),	255 // case of GB18030 (excluding escape-based encodings like ISO-2022-JP),

296 // this can be 4 times larger than actually needed.	256 // this can be 4 times larger than actually needed.

297 size_t wchar_max_length = encoded.length() + 1;	257 size_t wchar_max_length = encoded.length() + 1;

298	258

299 SetUpErrorHandlerForToUChars(on_error, converter, &status);	259 SetUpErrorHandlerForToUChars(on_error, converter, &status);

300 char* byte_buffer =	260 int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), converter,

301 reinterpret_cast<char*>(WriteInto(wide, wchar_max_length));	261 reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)),

302 int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t);	262 static_cast<int>(wchar_max_length) * sizeof(wchar_t), encoded.data(),

303 const char* data = encoded.data();	263 static_cast<int>(encoded.length()), &status);

304 int length = static_cast<int>(encoded.length());

305 int actual_size = 0;

306 if (offset_for_adjustment) {

307 if (*offset_for_adjustment >= encoded.length()) {

308 *offset_for_adjustment = std::wstring::npos;

309 } else if (*offset_for_adjustment != 0) {

310 // Try to adjust the offset by converting the string in two pieces and

311 // using the length of the first piece as the adjusted offset.

312 actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter,

313 byte_buffer, byte_buffer_length, data,

314 static_cast<int>(*offset_for_adjustment), &status);

315 if (U_SUCCESS(status)) {

316 // Conversion succeeded, so update the offset and then fall through to

317 // appending the second half of the string.

318 data += *offset_for_adjustment;

319 length -= *offset_for_adjustment;

320 *offset_for_adjustment = actual_size / sizeof(wchar_t);

321 byte_buffer += actual_size;

322 byte_buffer_length -= actual_size;

323 } else {

324 // The offset may have been in the middle of an encoding sequence; mark

325 // it as having failed to adjust and then try to convert the entire

326 // string.

327 *offset_for_adjustment = std::wstring::npos;

328 actual_size = 0;

329 ucnv_reset(converter);

330 status = U_ZERO_ERROR;

331 }

332 }

333 }

334 actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter,

335 byte_buffer, byte_buffer_length, data, length, &status);

336 ucnv_close(converter);	264 ucnv_close(converter);

337 if (!U_SUCCESS(status)) {	265 if (!U_SUCCESS(status)) {

338 wide->clear(); // Make sure the output is empty on error.	266 wide->clear(); // Make sure the output is empty on error.

339 return false;	267 return false;

340 }	268 }

341	269

342 // actual_size is # of bytes.	270 // actual_size is # of bytes.

343 wide->resize(actual_size / sizeof(wchar_t));	271 wide->resize(actual_size / sizeof(wchar_t));

344 return true;	272 return true;

345 #endif // defined(WCHAR_T_IS_UTF32)	273 #endif // defined(WCHAR_T_IS_UTF32)

346 }	274 }

347	275

348 } // namespace base	276 } // namespace base

OLD	NEW

« no previous file with comments | « base/i18n/icu_string_conversions.h ('k') | base/i18n/icu_string_conversions_unittest.cc » ('j') | no next file with comments »