OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/utf_offset_string_conversions.h" | 5 #include "base/utf_offset_string_conversions.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 | 8 |
9 #include "base/scoped_ptr.h" | 9 #include "base/scoped_ptr.h" |
10 #include "base/string_piece.h" | 10 #include "base/string_piece.h" |
11 #include "base/utf_string_conversion_utils.h" | 11 #include "base/utf_string_conversion_utils.h" |
12 | 12 |
13 using base::PrepareForUTF16Or32Output; | 13 using base::PrepareForUTF16Or32Output; |
14 using base::ReadUnicodeCharacter; | 14 using base::ReadUnicodeCharacter; |
15 using base::WriteUnicodeCharacter; | 15 using base::WriteUnicodeCharacter; |
16 | 16 |
17 // Generalized Unicode converter ----------------------------------------------- | |
18 | |
19 // Converts the given source Unicode character type to the given destination | 17 // Converts the given source Unicode character type to the given destination |
20 // Unicode character type as a STL string. The given input buffer and size | 18 // Unicode character type as a STL string. The given input buffer and size |
21 // determine the source, and the given output STL string will be replaced by | 19 // determine the source, and the given output STL string will be replaced by |
22 // the result. | 20 // the result. |
23 template<typename SRC_CHAR> | 21 bool ConvertUnicode(const char* src, |
24 bool ConvertUnicode(const SRC_CHAR* src, | |
25 size_t src_len, | 22 size_t src_len, |
26 std::wstring* output, | 23 string16* output, |
27 std::vector<size_t>* offsets_for_adjustment) { | 24 std::vector<size_t>* offsets_for_adjustment) { |
28 if (offsets_for_adjustment) { | 25 if (offsets_for_adjustment) { |
29 std::for_each(offsets_for_adjustment->begin(), | 26 std::for_each(offsets_for_adjustment->begin(), |
30 offsets_for_adjustment->end(), | 27 offsets_for_adjustment->end(), |
31 LimitOffset<std::wstring>(src_len)); | 28 LimitOffset<string16>(src_len)); |
32 } | 29 } |
33 | 30 |
34 // ICU requires 32-bit numbers. | 31 // ICU requires 32-bit numbers. |
35 bool success = true; | 32 bool success = true; |
36 AdjustOffset::Adjustments adjustments; | 33 OffsetAdjuster offset_adjuster(offsets_for_adjustment); |
37 int32 src_len32 = static_cast<int32>(src_len); | 34 int32 src_len32 = static_cast<int32>(src_len); |
38 for (int32 i = 0; i < src_len32; i++) { | 35 for (int32 i = 0; i < src_len32; i++) { |
39 uint32 code_point; | 36 uint32 code_point; |
40 size_t original_i = i; | 37 size_t original_i = i; |
41 size_t chars_written = 0; | 38 size_t chars_written = 0; |
42 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { | 39 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { |
43 chars_written = WriteUnicodeCharacter(code_point, output); | 40 chars_written = WriteUnicodeCharacter(code_point, output); |
44 } else { | 41 } else { |
45 chars_written = WriteUnicodeCharacter(0xFFFD, output); | 42 chars_written = WriteUnicodeCharacter(0xFFFD, output); |
46 success = false; | 43 success = false; |
47 } | 44 } |
48 if (offsets_for_adjustment) { | 45 if (offsets_for_adjustment) { |
49 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last | 46 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last |
50 // character read, not after it (so that incrementing it in the loop | 47 // character read, not after it (so that incrementing it in the loop |
51 // increment will place it at the right location), so we need to account | 48 // increment will place it at the right location), so we need to account |
52 // for that in determining the amount that was read. | 49 // for that in determining the amount that was read. |
53 adjustments.push_back(AdjustOffset::Adjustment( | 50 offset_adjuster.Add(OffsetAdjuster::Adjustment(original_i, |
54 original_i, i - original_i + 1, chars_written)); | 51 i - original_i + 1, chars_written)); |
55 } | 52 } |
56 } | 53 } |
57 | |
58 // Make offset adjustment. | |
59 if (offsets_for_adjustment && !adjustments.empty()) { | |
60 std::for_each(offsets_for_adjustment->begin(), | |
61 offsets_for_adjustment->end(), | |
62 AdjustOffset(adjustments)); | |
63 } | |
64 | |
65 return success; | 54 return success; |
66 } | 55 } |
67 | 56 |
68 // UTF-8 <-> Wide -------------------------------------------------------------- | 57 bool UTF8ToUTF16AndAdjustOffset(const char* src, |
69 | 58 size_t src_len, |
70 bool UTF8ToWideAndAdjustOffset(const char* src, | 59 string16* output, |
71 size_t src_len, | 60 size_t* offset_for_adjustment) { |
72 std::wstring* output, | |
73 size_t* offset_for_adjustment) { | |
74 std::vector<size_t> offsets; | 61 std::vector<size_t> offsets; |
75 if (offset_for_adjustment) | 62 if (offset_for_adjustment) |
76 offsets.push_back(*offset_for_adjustment); | 63 offsets.push_back(*offset_for_adjustment); |
77 PrepareForUTF16Or32Output(src, src_len, output); | 64 PrepareForUTF16Or32Output(src, src_len, output); |
78 bool ret = ConvertUnicode(src, src_len, output, &offsets); | 65 bool ret = ConvertUnicode(src, src_len, output, &offsets); |
79 if (offset_for_adjustment) | 66 if (offset_for_adjustment) |
80 *offset_for_adjustment = offsets[0]; | 67 *offset_for_adjustment = offsets[0]; |
81 return ret; | 68 return ret; |
82 } | 69 } |
83 | 70 |
84 bool UTF8ToWideAndAdjustOffsets(const char* src, | 71 bool UTF8ToUTF16AndAdjustOffsets(const char* src, |
85 size_t src_len, | 72 size_t src_len, |
86 std::wstring* output, | 73 string16* output, |
87 std::vector<size_t>* offsets_for_adjustment) { | 74 std::vector<size_t>* offsets_for_adjustment) { |
88 PrepareForUTF16Or32Output(src, src_len, output); | 75 PrepareForUTF16Or32Output(src, src_len, output); |
89 return ConvertUnicode(src, src_len, output, offsets_for_adjustment); | 76 return ConvertUnicode(src, src_len, output, offsets_for_adjustment); |
90 } | 77 } |
91 | 78 |
92 std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, | 79 string16 UTF8ToUTF16AndAdjustOffset(const base::StringPiece& utf8, |
93 size_t* offset_for_adjustment) { | |
94 std::vector<size_t> offsets; | |
95 if (offset_for_adjustment) | |
96 offsets.push_back(*offset_for_adjustment); | |
97 std::wstring result; | |
98 UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, | |
99 &offsets); | |
100 if (offset_for_adjustment) | |
101 *offset_for_adjustment = offsets[0]; | |
102 return result; | |
103 } | |
104 | |
105 std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8, | |
106 std::vector<size_t>* | |
107 offsets_for_adjustment) { | |
108 std::wstring result; | |
109 UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, | |
110 offsets_for_adjustment); | |
111 return result; | |
112 } | |
113 | |
114 // UTF-16 <-> Wide ------------------------------------------------------------- | |
115 | |
116 #if defined(WCHAR_T_IS_UTF16) | |
117 | |
118 // When wide == UTF-16, then conversions are a NOP. | |
119 bool UTF16ToWideAndAdjustOffset(const char16* src, | |
120 size_t src_len, | |
121 std::wstring* output, | |
122 size_t* offset_for_adjustment) { | |
123 output->assign(src, src_len); | |
124 if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) | |
125 *offset_for_adjustment = std::wstring::npos; | |
126 return true; | |
127 } | |
128 | |
129 bool UTF16ToWideAndAdjustOffsets(const char16* src, | |
130 size_t src_len, | |
131 std::wstring* output, | |
132 std::vector<size_t>* offsets_for_adjustment) { | |
133 output->assign(src, src_len); | |
134 if (offsets_for_adjustment) { | |
135 std::for_each(offsets_for_adjustment->begin(), | |
136 offsets_for_adjustment->end(), | |
137 LimitOffset<std::wstring>(src_len)); | |
138 } | |
139 return true; | |
140 } | |
141 | |
142 std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, | |
143 size_t* offset_for_adjustment) { | |
144 if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) | |
145 *offset_for_adjustment = std::wstring::npos; | |
146 return utf16; | |
147 } | |
148 | |
149 std::wstring UTF16ToWideAndAdjustOffsets( | |
150 const string16& utf16, | |
151 std::vector<size_t>* offsets_for_adjustment) { | |
152 if (offsets_for_adjustment) { | |
153 std::for_each(offsets_for_adjustment->begin(), | |
154 offsets_for_adjustment->end(), | |
155 LimitOffset<std::wstring>(utf16.length())); | |
156 } | |
157 return utf16; | |
158 } | |
159 | |
160 #elif defined(WCHAR_T_IS_UTF32) | |
161 | |
162 bool UTF16ToWideAndAdjustOffset(const char16* src, | |
163 size_t src_len, | |
164 std::wstring* output, | |
165 size_t* offset_for_adjustment) { | |
166 std::vector<size_t> offsets; | |
167 if (offset_for_adjustment) | |
168 offsets.push_back(*offset_for_adjustment); | |
169 output->clear(); | |
170 // Assume that normally we won't have any non-BMP characters so the counts | |
171 // will be the same. | |
172 output->reserve(src_len); | |
173 bool ret = ConvertUnicode(src, src_len, output, &offsets); | |
174 if (offset_for_adjustment) | |
175 *offset_for_adjustment = offsets[0]; | |
176 return ret; | |
177 } | |
178 | |
179 bool UTF16ToWideAndAdjustOffsets(const char16* src, | |
180 size_t src_len, | |
181 std::wstring* output, | |
182 std::vector<size_t>* offsets_for_adjustment) { | |
183 output->clear(); | |
184 // Assume that normally we won't have any non-BMP characters so the counts | |
185 // will be the same. | |
186 output->reserve(src_len); | |
187 return ConvertUnicode(src, src_len, output, offsets_for_adjustment); | |
188 } | |
189 | |
190 std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, | |
191 size_t* offset_for_adjustment) { | 80 size_t* offset_for_adjustment) { |
192 std::vector<size_t> offsets; | 81 std::vector<size_t> offsets; |
193 if (offset_for_adjustment) | 82 if (offset_for_adjustment) |
194 offsets.push_back(*offset_for_adjustment); | 83 offsets.push_back(*offset_for_adjustment); |
195 std::wstring result; | 84 string16 result; |
196 UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, | 85 UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result, |
197 &offsets); | 86 &offsets); |
198 if (offset_for_adjustment) | 87 if (offset_for_adjustment) |
199 *offset_for_adjustment = offsets[0]; | 88 *offset_for_adjustment = offsets[0]; |
200 return result; | 89 return result; |
201 } | 90 } |
202 | 91 |
203 std::wstring UTF16ToWideAndAdjustOffsets( | 92 string16 UTF8ToUTF16AndAdjustOffsets( |
204 const string16& utf16, | 93 const base::StringPiece& utf8, |
205 std::vector<size_t>* offsets_for_adjustment) { | 94 std::vector<size_t>* offsets_for_adjustment) { |
206 std::wstring result; | 95 string16 result; |
207 UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, | 96 UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result, |
208 offsets_for_adjustment); | 97 offsets_for_adjustment); |
209 return result; | 98 return result; |
210 } | 99 } |
211 | 100 |
212 #endif // defined(WCHAR_T_IS_UTF32) | 101 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, |
| 102 size_t original_length, |
| 103 size_t output_length) |
| 104 : original_offset(original_offset), |
| 105 original_length(original_length), |
| 106 output_length(output_length) { |
| 107 } |
213 | 108 |
214 AdjustOffset::Adjustment::Adjustment(size_t location, | 109 OffsetAdjuster::OffsetAdjuster(std::vector<size_t>* offsets_for_adjustment) |
215 size_t old_length, | 110 : offsets_for_adjustment_(offsets_for_adjustment) { |
216 size_t new_length) | 111 } |
217 : location(location), | |
218 old_length(old_length), | |
219 new_length(new_length) {} | |
220 | 112 |
221 AdjustOffset::AdjustOffset(const Adjustments& adjustments) | 113 OffsetAdjuster::~OffsetAdjuster() { |
222 : adjustments_(adjustments) {} | 114 if (!offsets_for_adjustment_ || adjustments_.empty()) |
| 115 return; |
| 116 for (std::vector<size_t>::iterator i(offsets_for_adjustment_->begin()); |
| 117 i != offsets_for_adjustment_->end(); ++i) |
| 118 AdjustOffset(i); |
| 119 } |
223 | 120 |
224 void AdjustOffset::operator()(size_t& offset) { | 121 void OffsetAdjuster::Add(const Adjustment& adjustment) { |
225 if (offset == std::wstring::npos) | 122 adjustments_.push_back(adjustment); |
| 123 } |
| 124 |
| 125 void OffsetAdjuster::AdjustOffset(std::vector<size_t>::iterator offset) { |
| 126 if (*offset == string16::npos) |
226 return; | 127 return; |
227 size_t adjustment = 0; | 128 size_t adjustment = 0; |
228 for (Adjustments::const_iterator i = adjustments_.begin(); | 129 for (std::vector<Adjustment>::const_iterator i = adjustments_.begin(); |
229 i != adjustments_.end(); ++i) { | 130 i != adjustments_.end(); ++i) { |
230 size_t location = i->location; | 131 if (*offset == i->original_offset && i->output_length == 0) { |
231 if (offset == location && i->new_length == 0) { | 132 *offset = string16::npos; |
232 offset = std::wstring::npos; | |
233 return; | 133 return; |
234 } | 134 } |
235 if (offset <= location) | 135 if (*offset <= i->original_offset) |
236 break; | 136 break; |
237 if (offset < (location + i->old_length)) { | 137 if (*offset < (i->original_offset + i->original_length)) { |
238 offset = std::wstring::npos; | 138 *offset = string16::npos; |
239 return; | 139 return; |
240 } | 140 } |
241 adjustment += (i->old_length - i->new_length); | 141 adjustment += (i->original_length - i->output_length); |
242 } | 142 } |
243 offset -= adjustment; | 143 *offset -= adjustment; |
244 } | 144 } |
OLD | NEW |