Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(163)

Side by Side Diff: base/utf_string_conversions.cc

Issue 372017: Fix various problems with inline autocomplete and URLs that change length dur... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/utf_string_conversions.h ('k') | base/utf_string_conversions_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/utf_string_conversions.h" 5 #include "base/utf_string_conversions.h"
6 6
7 #include <vector> 7 #include <vector>
8 8
9 #include "base/basictypes.h" 9 #include "base/basictypes.h"
10 #include "base/logging.h" 10 #include "base/logging.h"
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
77 // Conversion is easy since the source is 32-bit. 77 // Conversion is easy since the source is 32-bit.
78 *code_point = src[*char_index]; 78 *code_point = src[*char_index];
79 79
80 // Validate the value. 80 // Validate the value.
81 return IsValidCodepoint(*code_point); 81 return IsValidCodepoint(*code_point);
82 } 82 }
83 #endif // defined(WCHAR_T_IS_UTF32) 83 #endif // defined(WCHAR_T_IS_UTF32)
84 84
85 // WriteUnicodeCharacter ------------------------------------------------------- 85 // WriteUnicodeCharacter -------------------------------------------------------
86 86
87 // Appends a UTF-8 character to the given 8-bit string. 87 // Appends a UTF-8 character to the given 8-bit string. Returns the number of
88 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { 88 // bytes written.
89 size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) {
89 if (code_point <= 0x7f) { 90 if (code_point <= 0x7f) {
90 // Fast path the common case of one byte. 91 // Fast path the common case of one byte.
91 output->push_back(code_point); 92 output->push_back(code_point);
92 return; 93 return 1;
93 } 94 }
94 95
95 // U8_APPEND_UNSAFE can append up to 4 bytes. 96 // CBU8_APPEND_UNSAFE can append up to 4 bytes.
96 int32 char_offset = static_cast<int32>(output->length()); 97 size_t char_offset = output->length();
98 size_t original_char_offset = char_offset;
97 output->resize(char_offset + CBU8_MAX_LENGTH); 99 output->resize(char_offset + CBU8_MAX_LENGTH);
98 100
99 CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); 101 CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
100 102
101 // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so 103 // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
102 // it will represent the new length of the string. 104 // it will represent the new length of the string.
103 output->resize(char_offset); 105 output->resize(char_offset);
106 return char_offset - original_char_offset;
104 } 107 }
105 108
106 // Appends the given code point as a UTF-16 character to the STL string. 109 // Appends the given code point as a UTF-16 character to the given 16-bit
107 void WriteUnicodeCharacter(uint32 code_point, string16* output) { 110 // string. Returns the number of 16-bit values written.
111 size_t WriteUnicodeCharacter(uint32 code_point, string16* output) {
108 if (CBU16_LENGTH(code_point) == 1) { 112 if (CBU16_LENGTH(code_point) == 1) {
109 // This code point is in the Basic Multilingual Plane (BMP). 113 // This code point is in the Basic Multilingual Plane (BMP).
110 output->push_back(static_cast<char16>(code_point)); 114 output->push_back(static_cast<char16>(code_point));
111 } else { 115 return 1;
112 // Non-BMP characters use a double-character encoding.
113 int32 char_offset = static_cast<int32>(output->length());
114 output->resize(char_offset + CBU16_MAX_LENGTH);
115 CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
116 } 116 }
117 // Non-BMP characters use a double-character encoding.
118 size_t char_offset = output->length();
119 output->resize(char_offset + CBU16_MAX_LENGTH);
120 CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
121 return CBU16_MAX_LENGTH;
117 } 122 }
118 123
119 #if defined(WCHAR_T_IS_UTF32) 124 #if defined(WCHAR_T_IS_UTF32)
120 // Appends the given UTF-32 character to the given 32-bit string. 125 // Appends the given UTF-32 character to the given 32-bit string. Returns the
121 inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { 126 // number of 32-bit values written.
127 inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
122 // This is the easy case, just append the character. 128 // This is the easy case, just append the character.
123 output->push_back(code_point); 129 output->push_back(code_point);
130 return 1;
124 } 131 }
125 #endif // defined(WCHAR_T_IS_UTF32) 132 #endif // defined(WCHAR_T_IS_UTF32)
126 133
127 // Generalized Unicode converter ----------------------------------------------- 134 // Generalized Unicode converter -----------------------------------------------
128 135
129 // Converts the given source Unicode character type to the given destination 136 // Converts the given source Unicode character type to the given destination
130 // Unicode character type as a STL string. The given input buffer and size 137 // Unicode character type as a STL string. The given input buffer and size
131 // determine the source, and the given output STL string will be replaced by 138 // determine the source, and the given output STL string will be replaced by
132 // the result. 139 // the result.
133 template<typename SRC_CHAR, typename DEST_STRING> 140 template<typename SRC_CHAR, typename DEST_STRING>
134 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { 141 bool ConvertUnicode(const SRC_CHAR* src,
135 output->clear(); 142 size_t src_len,
143 DEST_STRING* output,
144 size_t* offset_for_adjustment) {
145 size_t output_offset =
146 (offset_for_adjustment && *offset_for_adjustment < src_len) ?
147 *offset_for_adjustment : DEST_STRING::npos;
136 148
137 // ICU requires 32-bit numbers. 149 // ICU requires 32-bit numbers.
138 bool success = true; 150 bool success = true;
139 int32 src_len32 = static_cast<int32>(src_len); 151 int32 src_len32 = static_cast<int32>(src_len);
140 for (int32 i = 0; i < src_len32; i++) { 152 for (int32 i = 0; i < src_len32; i++) {
141 uint32 code_point; 153 uint32 code_point;
154 size_t original_i = i;
155 size_t chars_written = 0;
142 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { 156 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
143 WriteUnicodeCharacter(code_point, output); 157 chars_written = WriteUnicodeCharacter(code_point, output);
144 } else { 158 } else {
145 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) 159 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
146 // in place of an invalid codepoint. 160 // in place of an invalid codepoint.
147 success = false; 161 success = false;
148 } 162 }
163 if ((output_offset != DEST_STRING::npos) &&
164 (*offset_for_adjustment > original_i)) {
165 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
166 // character read, not after it (so that the loop's own increment
167 // will place it at the right location), so we need to account
168 // for that in determining the amount that was read.
169 if (*offset_for_adjustment <= static_cast<size_t>(i))
170 output_offset = DEST_STRING::npos;
171 else
172 output_offset += chars_written - (i - original_i + 1);
173 }
149 } 174 }
175
176 if (offset_for_adjustment)
177 *offset_for_adjustment = output_offset;
150 return success; 178 return success;
151 } 179 }
152 180
153 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount 181 // Guesses the length of the output in UTF-8 in bytes, clears that output
154 // of space in the given string. We also assume that the input character types 182 // string, and reserves that amount of space. We assume that the input
155 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume 183 // character types are unsigned, which will be true for UTF-16 and -32 on our
156 // the string length is greater than zero. 184 // systems.
157 template<typename CHAR> 185 template<typename CHAR>
158 void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { 186 void PrepareForUTF8Output(const CHAR* src,
187 size_t src_len,
188 std::string* output) {
189 output->clear();
190 if (src_len == 0)
191 return;
159 if (src[0] < 0x80) { 192 if (src[0] < 0x80) {
160 // Assume that the entire input will be ASCII. 193 // Assume that the entire input will be ASCII.
161 output->reserve(src_len); 194 output->reserve(src_len);
162 } else { 195 } else {
163 // Assume that the entire input is non-ASCII and will have 3 bytes per char. 196 // Assume that the entire input is non-ASCII and will have 3 bytes per char.
164 output->reserve(src_len * 3); 197 output->reserve(src_len * 3);
165 } 198 }
166 } 199 }
167 200
168 // Guesses the size of the output buffer (containing either UTF-16 or -32 data) 201 // Prepares an output buffer (containing either UTF-16 or -32 data) given some
169 // given some UTF-8 input that will be converted to it. See ReserveUTF8Output. 202 // UTF-8 input that will be converted to it. See PrepareForUTF8Output().
170 // We assume the source length is > 0.
171 template<typename STRING> 203 template<typename STRING>
172 void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { 204 void PrepareForUTF16Or32Output(const char* src,
205 size_t src_len,
206 STRING* output) {
207 output->clear();
208 if (src_len == 0)
209 return;
173 if (static_cast<unsigned char>(src[0]) < 0x80) { 210 if (static_cast<unsigned char>(src[0]) < 0x80) {
174 // Assume the input is all ASCII, which means 1:1 correspondence. 211 // Assume the input is all ASCII, which means 1:1 correspondence.
175 output->reserve(src_len); 212 output->reserve(src_len);
176 } else { 213 } else {
177 // Otherwise assume that the UTF-8 sequences will have 2 bytes for each 214 // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
178 // character. 215 // character.
179 output->reserve(src_len / 2); 216 output->reserve(src_len / 2);
180 } 217 }
181 } 218 }
182 219
183 } // namespace 220 } // namespace
184 221
185 // UTF-8 <-> Wide -------------------------------------------------------------- 222 // UTF-8 <-> Wide --------------------------------------------------------------
186 223
187 std::string WideToUTF8(const std::wstring& wide) { 224 bool WideToUTF8AndAdjustOffset(const wchar_t* src,
225 size_t src_len,
226 std::string* output,
227 size_t* offset_for_adjustment) {
228 PrepareForUTF8Output(src, src_len, output);
229 return ConvertUnicode<wchar_t, std::string>(src, src_len, output,
230 offset_for_adjustment);
231 }
232
233 std::string WideToUTF8AndAdjustOffset(const std::wstring& wide,
234 size_t* offset_for_adjustment) {
188 std::string ret; 235 std::string ret;
189 if (wide.empty())
190 return ret;
191
192 // Ignore the success flag of this call, it will do the best it can for 236 // Ignore the success flag of this call, it will do the best it can for
193 // invalid input, which is what we want here. 237 // invalid input, which is what we want here.
194 WideToUTF8(wide.data(), wide.length(), &ret); 238 WideToUTF8AndAdjustOffset(wide.data(), wide.length(), &ret,
239 offset_for_adjustment);
195 return ret; 240 return ret;
196 } 241 }
197 242
198 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { 243 bool UTF8ToWideAndAdjustOffset(const char* src,
199 if (src_len == 0) { 244 size_t src_len,
200 output->clear(); 245 std::wstring* output,
201 return true; 246 size_t* offset_for_adjustment) {
202 } 247 PrepareForUTF16Or32Output(src, src_len, output);
203 248 return ConvertUnicode<char, std::wstring>(src, src_len, output,
204 ReserveUTF8Output(src, src_len, output); 249 offset_for_adjustment);
205 return ConvertUnicode<wchar_t, std::string>(src, src_len, output);
206 } 250 }
207 251
208 std::wstring UTF8ToWide(const base::StringPiece& utf8) { 252 std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8,
253 size_t* offset_for_adjustment) {
209 std::wstring ret; 254 std::wstring ret;
210 if (utf8.empty()) 255 UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret,
211 return ret; 256 offset_for_adjustment);
212
213 UTF8ToWide(utf8.data(), utf8.length(), &ret);
214 return ret; 257 return ret;
215 } 258 }
216 259
217 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
218 if (src_len == 0) {
219 output->clear();
220 return true;
221 }
222
223 ReserveUTF16Or32Output(src, src_len, output);
224 return ConvertUnicode<char, std::wstring>(src, src_len, output);
225 }
226
227 // UTF-16 <-> Wide ------------------------------------------------------------- 260 // UTF-16 <-> Wide -------------------------------------------------------------
228 261
229 #if defined(WCHAR_T_IS_UTF16) 262 #if defined(WCHAR_T_IS_UTF16)
230 263
231 // When wide == UTF-16, then conversions are a NOP. 264 // When wide == UTF-16, then conversions are a NOP.
232 string16 WideToUTF16(const std::wstring& wide) { 265 bool WideToUTF16AndAdjustOffset(const wchar_t* src,
266 size_t src_len,
267 string16* output,
268 size_t* offset_for_adjustment) {
269 output->assign(src, src_len);
270 if (offset_for_adjustment && (*offset_for_adjustment >= src_len))
271 *offset_for_adjustment = string16::npos;
272 return true;
273 }
274
275 string16 WideToUTF16AndAdjustOffset(const std::wstring& wide,
276 size_t* offset_for_adjustment) {
277 if (offset_for_adjustment && (*offset_for_adjustment >= wide.length()))
278 *offset_for_adjustment = string16::npos;
233 return wide; 279 return wide;
234 } 280 }
235 281
236 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { 282 bool UTF16ToWideAndAdjustOffset(const char16* src,
283 size_t src_len,
284 std::wstring* output,
285 size_t* offset_for_adjustment) {
237 output->assign(src, src_len); 286 output->assign(src, src_len);
287 if (offset_for_adjustment && (*offset_for_adjustment >= src_len))
288 *offset_for_adjustment = std::wstring::npos;
238 return true; 289 return true;
239 } 290 }
240 291
241 std::wstring UTF16ToWide(const string16& utf16) { 292 std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
293 size_t* offset_for_adjustment) {
294 if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length()))
295 *offset_for_adjustment = std::wstring::npos;
242 return utf16; 296 return utf16;
243 } 297 }
244 298
245 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
246 output->assign(src, src_len);
247 return true;
248 }
249
250 #elif defined(WCHAR_T_IS_UTF32) 299 #elif defined(WCHAR_T_IS_UTF32)
251 300
252 string16 WideToUTF16(const std::wstring& wide) { 301 bool WideToUTF16AndAdjustOffset(const wchar_t* src,
302 size_t src_len,
303 string16* output,
304 size_t* offset_for_adjustment) {
305 output->clear();
306 // Assume that normally we won't have any non-BMP characters so the counts
307 // will be the same.
308 output->reserve(src_len);
309 return ConvertUnicode<wchar_t, string16>(src, src_len, output,
310 offset_for_adjustment);
311 }
312
313 string16 WideToUTF16AndAdjustOffset(const std::wstring& wide,
314 size_t* offset_for_adjustment) {
253 string16 ret; 315 string16 ret;
254 if (wide.empty()) 316 WideToUTF16AndAdjustOffset(wide.data(), wide.length(), &ret,
255 return ret; 317 offset_for_adjustment);
256
257 WideToUTF16(wide.data(), wide.length(), &ret);
258 return ret; 318 return ret;
259 } 319 }
260 320
261 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { 321 bool UTF16ToWideAndAdjustOffset(const char16* src,
262 if (src_len == 0) { 322 size_t src_len,
263 output->clear(); 323 std::wstring* output,
264 return true; 324 size_t* offset_for_adjustment) {
265 } 325 output->clear();
266
267 // Assume that normally we won't have any non-BMP characters so the counts 326 // Assume that normally we won't have any non-BMP characters so the counts
268 // will be the same. 327 // will be the same.
269 output->reserve(src_len); 328 output->reserve(src_len);
270 return ConvertUnicode<wchar_t, string16>(src, src_len, output); 329 return ConvertUnicode<char16, std::wstring>(src, src_len, output,
330 offset_for_adjustment);
271 } 331 }
272 332
273 std::wstring UTF16ToWide(const string16& utf16) { 333 std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
334 size_t* offset_for_adjustment) {
274 std::wstring ret; 335 std::wstring ret;
275 if (utf16.empty()) 336 UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret,
276 return ret; 337 offset_for_adjustment);
277
278 UTF16ToWide(utf16.data(), utf16.length(), &ret);
279 return ret; 338 return ret;
280 } 339 }
281 340
282 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
283 if (src_len == 0) {
284 output->clear();
285 return true;
286 }
287
288 // Assume that normally we won't have any non-BMP characters so the counts
289 // will be the same.
290 output->reserve(src_len);
291 return ConvertUnicode<char16, std::wstring>(src, src_len, output);
292 }
293
294 #endif // defined(WCHAR_T_IS_UTF32) 341 #endif // defined(WCHAR_T_IS_UTF32)
295 342
296 // UTF16 <-> UTF8 -------------------------------------------------------------- 343 // UTF16 <-> UTF8 --------------------------------------------------------------
297 344
298 #if defined(WCHAR_T_IS_UTF32) 345 #if defined(WCHAR_T_IS_UTF32)
299 346
300 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { 347 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
301 if (src_len == 0) { 348 PrepareForUTF16Or32Output(src, src_len, output);
302 output->clear(); 349 return ConvertUnicode<char, string16>(src, src_len, output, NULL);
303 return true;
304 }
305
306 ReserveUTF16Or32Output(src, src_len, output);
307 return ConvertUnicode<char, string16>(src, src_len, output);
308 } 350 }
309 351
310 string16 UTF8ToUTF16(const std::string& utf8) { 352 string16 UTF8ToUTF16(const std::string& utf8) {
311 string16 ret; 353 string16 ret;
312 if (utf8.empty())
313 return ret;
314
315 // Ignore the success flag of this call, it will do the best it can for 354 // Ignore the success flag of this call, it will do the best it can for
316 // invalid input, which is what we want here. 355 // invalid input, which is what we want here.
317 UTF8ToUTF16(utf8.data(), utf8.length(), &ret); 356 UTF8ToUTF16(utf8.data(), utf8.length(), &ret);
318 return ret; 357 return ret;
319 } 358 }
320 359
321 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { 360 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
322 if (src_len == 0) { 361 PrepareForUTF8Output(src, src_len, output);
323 output->clear(); 362 return ConvertUnicode<char16, std::string>(src, src_len, output, NULL);
324 return true;
325 }
326
327 ReserveUTF8Output(src, src_len, output);
328 return ConvertUnicode<char16, std::string>(src, src_len, output);
329 } 363 }
330 364
331 std::string UTF16ToUTF8(const string16& utf16) { 365 std::string UTF16ToUTF8(const string16& utf16) {
332 std::string ret; 366 std::string ret;
333 if (utf16.empty())
334 return ret;
335
336 // Ignore the success flag of this call, it will do the best it can for 367 // Ignore the success flag of this call, it will do the best it can for
337 // invalid input, which is what we want here. 368 // invalid input, which is what we want here.
338 UTF16ToUTF8(utf16.data(), utf16.length(), &ret); 369 UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
339 return ret; 370 return ret;
340 } 371 }
341 372
342 #elif defined(WCHAR_T_IS_UTF16) 373 #elif defined(WCHAR_T_IS_UTF16)
343 // Easy case since we can use the "wide" versions we already wrote above. 374 // Easy case since we can use the "wide" versions we already wrote above.
344 375
345 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { 376 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
346 return UTF8ToWide(src, src_len, output); 377 return UTF8ToWide(src, src_len, output);
347 } 378 }
348 379
349 string16 UTF8ToUTF16(const std::string& utf8) { 380 string16 UTF8ToUTF16(const std::string& utf8) {
350 return UTF8ToWide(utf8); 381 return UTF8ToWide(utf8);
351 } 382 }
352 383
353 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { 384 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
354 return WideToUTF8(src, src_len, output); 385 return WideToUTF8(src, src_len, output);
355 } 386 }
356 387
357 std::string UTF16ToUTF8(const string16& utf16) { 388 std::string UTF16ToUTF8(const string16& utf16) {
358 return WideToUTF8(utf16); 389 return WideToUTF8(utf16);
359 } 390 }
360 391
361 #endif 392 #endif
OLDNEW
« no previous file with comments | « base/utf_string_conversions.h ('k') | base/utf_string_conversions_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698