| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "base/utf_offset_string_conversions.h" | 5 #include "base/utf_offset_string_conversions.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 | 8 |
| 9 #include "base/scoped_ptr.h" | 9 #include "base/scoped_ptr.h" |
| 10 #include "base/string_piece.h" | 10 #include "base/string_piece.h" |
| 11 #include "base/utf_string_conversion_utils.h" | 11 #include "base/utf_string_conversion_utils.h" |
| 12 | 12 |
| 13 using base::PrepareForUTF16Or32Output; | 13 using base::PrepareForUTF16Or32Output; |
| 14 using base::ReadUnicodeCharacter; | 14 using base::ReadUnicodeCharacter; |
| 15 using base::WriteUnicodeCharacter; | 15 using base::WriteUnicodeCharacter; |
| 16 | 16 |
| 17 // Generalized Unicode converter ----------------------------------------------- | |
| 18 | |
| 19 // Converts the given source Unicode character type to the given destination | 17 // Converts the given source Unicode character type to the given destination |
| 20 // Unicode character type as an STL string. The given input buffer and size | 18 // Unicode character type as an STL string. The given input buffer and size |
| 21 // determine the source, and the given output STL string will be replaced by | 19 // determine the source, and the given output STL string will be replaced by |
| 22 // the result. | 20 // the result. |
| 23 template<typename SRC_CHAR> | 21 bool ConvertUnicode(const char* src, |
| 24 bool ConvertUnicode(const SRC_CHAR* src, | |
| 25 size_t src_len, | 22 size_t src_len, |
| 26 std::wstring* output, | 23 string16* output, |
| 27 std::vector<size_t>* offsets_for_adjustment) { | 24 std::vector<size_t>* offsets_for_adjustment) { |
| 28 if (offsets_for_adjustment) { | 25 if (offsets_for_adjustment) { |
| 29 std::for_each(offsets_for_adjustment->begin(), | 26 std::for_each(offsets_for_adjustment->begin(), |
| 30 offsets_for_adjustment->end(), | 27 offsets_for_adjustment->end(), |
| 31 LimitOffset<std::wstring>(src_len)); | 28 LimitOffset<string16>(src_len)); |
| 32 } | 29 } |
| 33 | 30 |
| 34 // ICU requires 32-bit numbers. | 31 // ICU requires 32-bit numbers. |
| 35 bool success = true; | 32 bool success = true; |
| 36 AdjustOffset::Adjustments adjustments; | 33 OffsetAdjuster offset_adjuster(offsets_for_adjustment); |
| 37 int32 src_len32 = static_cast<int32>(src_len); | 34 int32 src_len32 = static_cast<int32>(src_len); |
| 38 for (int32 i = 0; i < src_len32; i++) { | 35 for (int32 i = 0; i < src_len32; i++) { |
| 39 uint32 code_point; | 36 uint32 code_point; |
| 40 size_t original_i = i; | 37 size_t original_i = i; |
| 41 size_t chars_written = 0; | 38 size_t chars_written = 0; |
| 42 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { | 39 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { |
| 43 chars_written = WriteUnicodeCharacter(code_point, output); | 40 chars_written = WriteUnicodeCharacter(code_point, output); |
| 44 } else { | 41 } else { |
| 45 chars_written = WriteUnicodeCharacter(0xFFFD, output); | 42 chars_written = WriteUnicodeCharacter(0xFFFD, output); |
| 46 success = false; | 43 success = false; |
| 47 } | 44 } |
| 48 if (offsets_for_adjustment) { | 45 if (offsets_for_adjustment) { |
| 49 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last | 46 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last |
| 50 // character read, not after it (so that the loop's own increment then | 47 // character read, not after it (so that the loop's own increment then |
| 51 // advances it to the next unread character), so we need to account | 48 // advances it to the next unread character), so we need to account |
| 52 // for that in determining the amount that was read. | 49 // for that in determining the amount that was read. |
| 53 adjustments.push_back(AdjustOffset::Adjustment( | 50 offset_adjuster.Add(OffsetAdjuster::Adjustment(original_i, |
| 54 original_i, i - original_i + 1, chars_written)); | 51 i - original_i + 1, chars_written)); |
| 55 } | 52 } |
| 56 } | 53 } |
| 57 | |
| 58 // Make offset adjustment. | |
| 59 if (offsets_for_adjustment && !adjustments.empty()) { | |
| 60 std::for_each(offsets_for_adjustment->begin(), | |
| 61 offsets_for_adjustment->end(), | |
| 62 AdjustOffset(adjustments)); | |
| 63 } | |
| 64 | |
| 65 return success; | 54 return success; |
| 66 } | 55 } |
| 67 | 56 |
| 68 // UTF-8 <-> Wide -------------------------------------------------------------- | 57 bool UTF8ToUTF16AndAdjustOffset(const char* src, |
| 69 | 58 size_t src_len, |
| 70 bool UTF8ToWideAndAdjustOffset(const char* src, | 59 string16* output, |
| 71 size_t src_len, | 60 size_t* offset_for_adjustment) { |
| 72 std::wstring* output, | |
| 73 size_t* offset_for_adjustment) { | |
| 74 std::vector<size_t> offsets; | 61 std::vector<size_t> offsets; |
| 75 if (offset_for_adjustment) | 62 if (offset_for_adjustment) |
| 76 offsets.push_back(*offset_for_adjustment); | 63 offsets.push_back(*offset_for_adjustment); |
| 77 PrepareForUTF16Or32Output(src, src_len, output); | 64 PrepareForUTF16Or32Output(src, src_len, output); |
| 78 bool ret = ConvertUnicode(src, src_len, output, &offsets); | 65 bool ret = ConvertUnicode(src, src_len, output, &offsets); |
| 79 if (offset_for_adjustment) | 66 if (offset_for_adjustment) |
| 80 *offset_for_adjustment = offsets[0]; | 67 *offset_for_adjustment = offsets[0]; |
| 81 return ret; | 68 return ret; |
| 82 } | 69 } |
| 83 | 70 |
| 84 bool UTF8ToWideAndAdjustOffsets(const char* src, | 71 bool UTF8ToUTF16AndAdjustOffsets(const char* src, |
| 85 size_t src_len, | 72 size_t src_len, |
| 86 std::wstring* output, | 73 string16* output, |
| 87 std::vector<size_t>* offsets_for_adjustment) { | 74 std::vector<size_t>* offsets_for_adjustment) { |
| 88 PrepareForUTF16Or32Output(src, src_len, output); | 75 PrepareForUTF16Or32Output(src, src_len, output); |
| 89 return ConvertUnicode(src, src_len, output, offsets_for_adjustment); | 76 return ConvertUnicode(src, src_len, output, offsets_for_adjustment); |
| 90 } | 77 } |
| 91 | 78 |
| 92 std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, | 79 string16 UTF8ToUTF16AndAdjustOffset(const base::StringPiece& utf8, |
| 93 size_t* offset_for_adjustment) { | |
| 94 std::vector<size_t> offsets; | |
| 95 if (offset_for_adjustment) | |
| 96 offsets.push_back(*offset_for_adjustment); | |
| 97 std::wstring result; | |
| 98 UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, | |
| 99 &offsets); | |
| 100 if (offset_for_adjustment) | |
| 101 *offset_for_adjustment = offsets[0]; | |
| 102 return result; | |
| 103 } | |
| 104 | |
| 105 std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8, | |
| 106 std::vector<size_t>* | |
| 107 offsets_for_adjustment) { | |
| 108 std::wstring result; | |
| 109 UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, | |
| 110 offsets_for_adjustment); | |
| 111 return result; | |
| 112 } | |
| 113 | |
| 114 // UTF-16 <-> Wide ------------------------------------------------------------- | |
| 115 | |
| 116 #if defined(WCHAR_T_IS_UTF16) | |
| 117 | |
| 118 // When wide == UTF-16, then conversions are a NOP. | |
| 119 bool UTF16ToWideAndAdjustOffset(const char16* src, | |
| 120 size_t src_len, | |
| 121 std::wstring* output, | |
| 122 size_t* offset_for_adjustment) { | |
| 123 output->assign(src, src_len); | |
| 124 if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) | |
| 125 *offset_for_adjustment = std::wstring::npos; | |
| 126 return true; | |
| 127 } | |
| 128 | |
| 129 bool UTF16ToWideAndAdjustOffsets(const char16* src, | |
| 130 size_t src_len, | |
| 131 std::wstring* output, | |
| 132 std::vector<size_t>* offsets_for_adjustment) { | |
| 133 output->assign(src, src_len); | |
| 134 if (offsets_for_adjustment) { | |
| 135 std::for_each(offsets_for_adjustment->begin(), | |
| 136 offsets_for_adjustment->end(), | |
| 137 LimitOffset<std::wstring>(src_len)); | |
| 138 } | |
| 139 return true; | |
| 140 } | |
| 141 | |
| 142 std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, | |
| 143 size_t* offset_for_adjustment) { | |
| 144 if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) | |
| 145 *offset_for_adjustment = std::wstring::npos; | |
| 146 return utf16; | |
| 147 } | |
| 148 | |
| 149 std::wstring UTF16ToWideAndAdjustOffsets( | |
| 150 const string16& utf16, | |
| 151 std::vector<size_t>* offsets_for_adjustment) { | |
| 152 if (offsets_for_adjustment) { | |
| 153 std::for_each(offsets_for_adjustment->begin(), | |
| 154 offsets_for_adjustment->end(), | |
| 155 LimitOffset<std::wstring>(utf16.length())); | |
| 156 } | |
| 157 return utf16; | |
| 158 } | |
| 159 | |
| 160 #elif defined(WCHAR_T_IS_UTF32) | |
| 161 | |
| 162 bool UTF16ToWideAndAdjustOffset(const char16* src, | |
| 163 size_t src_len, | |
| 164 std::wstring* output, | |
| 165 size_t* offset_for_adjustment) { | |
| 166 std::vector<size_t> offsets; | |
| 167 if (offset_for_adjustment) | |
| 168 offsets.push_back(*offset_for_adjustment); | |
| 169 output->clear(); | |
| 170 // Assume that normally we won't have any non-BMP characters so the counts | |
| 171 // will be the same. | |
| 172 output->reserve(src_len); | |
| 173 bool ret = ConvertUnicode(src, src_len, output, &offsets); | |
| 174 if (offset_for_adjustment) | |
| 175 *offset_for_adjustment = offsets[0]; | |
| 176 return ret; | |
| 177 } | |
| 178 | |
| 179 bool UTF16ToWideAndAdjustOffsets(const char16* src, | |
| 180 size_t src_len, | |
| 181 std::wstring* output, | |
| 182 std::vector<size_t>* offsets_for_adjustment) { | |
| 183 output->clear(); | |
| 184 // Assume that normally we won't have any non-BMP characters so the counts | |
| 185 // will be the same. | |
| 186 output->reserve(src_len); | |
| 187 return ConvertUnicode(src, src_len, output, offsets_for_adjustment); | |
| 188 } | |
| 189 | |
| 190 std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, | |
| 191 size_t* offset_for_adjustment) { | 80 size_t* offset_for_adjustment) { |
| 192 std::vector<size_t> offsets; | 81 std::vector<size_t> offsets; |
| 193 if (offset_for_adjustment) | 82 if (offset_for_adjustment) |
| 194 offsets.push_back(*offset_for_adjustment); | 83 offsets.push_back(*offset_for_adjustment); |
| 195 std::wstring result; | 84 string16 result; |
| 196 UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, | 85 UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result, |
| 197 &offsets); | 86 &offsets); |
| 198 if (offset_for_adjustment) | 87 if (offset_for_adjustment) |
| 199 *offset_for_adjustment = offsets[0]; | 88 *offset_for_adjustment = offsets[0]; |
| 200 return result; | 89 return result; |
| 201 } | 90 } |
| 202 | 91 |
| 203 std::wstring UTF16ToWideAndAdjustOffsets( | 92 string16 UTF8ToUTF16AndAdjustOffsets( |
| 204 const string16& utf16, | 93 const base::StringPiece& utf8, |
| 205 std::vector<size_t>* offsets_for_adjustment) { | 94 std::vector<size_t>* offsets_for_adjustment) { |
| 206 std::wstring result; | 95 string16 result; |
| 207 UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, | 96 UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result, |
| 208 offsets_for_adjustment); | 97 offsets_for_adjustment); |
| 209 return result; | 98 return result; |
| 210 } | 99 } |
| 211 | 100 |
| 212 #endif // defined(WCHAR_T_IS_UTF32) | 101 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, |
| 102 size_t original_length, |
| 103 size_t output_length) |
| 104 : original_offset(original_offset), |
| 105 original_length(original_length), |
| 106 output_length(output_length) { |
| 107 } |
| 213 | 108 |
| 214 AdjustOffset::Adjustment::Adjustment(size_t location, | 109 OffsetAdjuster::OffsetAdjuster(std::vector<size_t>* offsets_for_adjustment) |
| 215 size_t old_length, | 110 : offsets_for_adjustment_(offsets_for_adjustment) { |
| 216 size_t new_length) | 111 } |
| 217 : location(location), | |
| 218 old_length(old_length), | |
| 219 new_length(new_length) {} | |
| 220 | 112 |
| 221 AdjustOffset::AdjustOffset(const Adjustments& adjustments) | 113 OffsetAdjuster::~OffsetAdjuster() { |
| 222 : adjustments_(adjustments) {} | 114 if (!offsets_for_adjustment_ || adjustments_.empty()) |
| 115 return; |
| 116 for (std::vector<size_t>::iterator i(offsets_for_adjustment_->begin()); |
| 117 i != offsets_for_adjustment_->end(); ++i) |
| 118 AdjustOffset(i); |
| 119 } |
| 223 | 120 |
| 224 void AdjustOffset::operator()(size_t& offset) { | 121 void OffsetAdjuster::Add(const Adjustment& adjustment) { |
| 225 if (offset == std::wstring::npos) | 122 adjustments_.push_back(adjustment); |
| 123 } |
| 124 |
| 125 void OffsetAdjuster::AdjustOffset(std::vector<size_t>::iterator offset) { |
| 126 if (*offset == string16::npos) |
| 226 return; | 127 return; |
| 227 size_t adjustment = 0; | 128 size_t adjustment = 0; |
| 228 for (Adjustments::const_iterator i = adjustments_.begin(); | 129 for (std::vector<Adjustment>::const_iterator i = adjustments_.begin(); |
| 229 i != adjustments_.end(); ++i) { | 130 i != adjustments_.end(); ++i) { |
| 230 size_t location = i->location; | 131 if (*offset == i->original_offset && i->output_length == 0) { |
| 231 if (offset == location && i->new_length == 0) { | 132 *offset = string16::npos; |
| 232 offset = std::wstring::npos; | |
| 233 return; | 133 return; |
| 234 } | 134 } |
| 235 if (offset <= location) | 135 if (*offset <= i->original_offset) |
| 236 break; | 136 break; |
| 237 if (offset < (location + i->old_length)) { | 137 if (*offset < (i->original_offset + i->original_length)) { |
| 238 offset = std::wstring::npos; | 138 *offset = string16::npos; |
| 239 return; | 139 return; |
| 240 } | 140 } |
| 241 adjustment += (i->old_length - i->new_length); | 141 adjustment += (i->original_length - i->output_length); |
| 242 } | 142 } |
| 243 offset -= adjustment; | 143 *offset -= adjustment; |
| 244 } | 144 } |
| OLD | NEW |