| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "base/strings/utf_offset_string_conversions.h" | |
| 6 | |
| 7 #include <algorithm> | |
| 8 | |
| 9 #include "base/logging.h" | |
| 10 #include "base/memory/scoped_ptr.h" | |
| 11 #include "base/strings/string_piece.h" | |
| 12 #include "base/strings/utf_string_conversion_utils.h" | |
| 13 | |
| 14 namespace base { | |
| 15 | |
| 16 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, | |
| 17 size_t original_length, | |
| 18 size_t output_length) | |
| 19 : original_offset(original_offset), | |
| 20 original_length(original_length), | |
| 21 output_length(output_length) { | |
| 22 } | |
| 23 | |
| 24 // static | |
| 25 void OffsetAdjuster::AdjustOffsets( | |
| 26 const Adjustments& adjustments, | |
| 27 std::vector<size_t>* offsets_for_adjustment) { | |
| 28 if (!offsets_for_adjustment || adjustments.empty()) | |
| 29 return; | |
| 30 for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin()); | |
| 31 i != offsets_for_adjustment->end(); ++i) | |
| 32 AdjustOffset(adjustments, &(*i)); | |
| 33 } | |
| 34 | |
| 35 // static | |
| 36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, | |
| 37 size_t* offset) { | |
| 38 if (*offset == string16::npos) | |
| 39 return; | |
| 40 int adjustment = 0; | |
| 41 for (Adjustments::const_iterator i = adjustments.begin(); | |
| 42 i != adjustments.end(); ++i) { | |
| 43 if (*offset <= i->original_offset) | |
| 44 break; | |
| 45 if (*offset < (i->original_offset + i->original_length)) { | |
| 46 *offset = string16::npos; | |
| 47 return; | |
| 48 } | |
| 49 adjustment += static_cast<int>(i->original_length - i->output_length); | |
| 50 } | |
| 51 *offset -= adjustment; | |
| 52 } | |
| 53 | |
| 54 // static | |
| 55 void OffsetAdjuster::UnadjustOffsets( | |
| 56 const Adjustments& adjustments, | |
| 57 std::vector<size_t>* offsets_for_unadjustment) { | |
| 58 if (!offsets_for_unadjustment || adjustments.empty()) | |
| 59 return; | |
| 60 for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin()); | |
| 61 i != offsets_for_unadjustment->end(); ++i) | |
| 62 UnadjustOffset(adjustments, &(*i)); | |
| 63 } | |
| 64 | |
| 65 // static | |
| 66 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, | |
| 67 size_t* offset) { | |
| 68 if (*offset == string16::npos) | |
| 69 return; | |
| 70 int adjustment = 0; | |
| 71 for (Adjustments::const_iterator i = adjustments.begin(); | |
| 72 i != adjustments.end(); ++i) { | |
| 73 if (*offset + adjustment <= i->original_offset) | |
| 74 break; | |
| 75 adjustment += static_cast<int>(i->original_length - i->output_length); | |
| 76 if ((*offset + adjustment) < | |
| 77 (i->original_offset + i->original_length)) { | |
| 78 *offset = string16::npos; | |
| 79 return; | |
| 80 } | |
| 81 } | |
| 82 *offset += adjustment; | |
| 83 } | |
| 84 | |
| 85 // static | |
| 86 void OffsetAdjuster::MergeSequentialAdjustments( | |
| 87 const Adjustments& first_adjustments, | |
| 88 Adjustments* adjustments_on_adjusted_string) { | |
| 89 Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin(); | |
| 90 Adjustments::const_iterator first_iter = first_adjustments.begin(); | |
| 91 // Simultaneously iterate over all |adjustments_on_adjusted_string| and | |
| 92 // |first_adjustments|, adding adjustments to or correcting the adjustments | |
| 93 // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the | |
| 94 // current number of characters collapsed by |first_adjustments| up to this | |
| 95 // point. |currently_collapsing| keeps track of the number of characters | |
| 96 // collapsed by |first_adjustments| into the current |adjusted_iter|'s | |
| 97 // length. These are characters that will change |shift| as soon as we're | |
| 98 // done processing the current |adjusted_iter|; they are not yet reflected in | |
| 99 // |shift|. | |
| 100 size_t shift = 0; | |
| 101 size_t currently_collapsing = 0; | |
| 102 while (adjusted_iter != adjustments_on_adjusted_string->end()) { | |
| 103 if ((first_iter == first_adjustments.end()) || | |
| 104 ((adjusted_iter->original_offset + shift + | |
| 105 adjusted_iter->original_length) <= first_iter->original_offset)) { | |
| 106 // Entire |adjusted_iter| (accounting for its shift and including its | |
| 107 // whole original length) comes before |first_iter|. | |
| 108 // | |
| 109 // Correct the offset at |adjusted_iter| and move onto the next | |
| 110 // adjustment that needs revising. | |
| 111 adjusted_iter->original_offset += shift; | |
| 112 shift += currently_collapsing; | |
| 113 currently_collapsing = 0; | |
| 114 ++adjusted_iter; | |
| 115 } else if ((adjusted_iter->original_offset + shift) > | |
| 116 first_iter->original_offset) { | |
| 117 // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). | |
| 118 | |
| 119 // It's not possible for the adjustments to overlap. (It shouldn't | |
| 120 // be possible that we have an |adjusted_iter->original_offset| that, | |
| 121 // when adjusted by the computed |shift|, is in the middle of | |
| 122 // |first_iter|'s output's length. After all, that would mean the | |
| 123 // current adjustment_on_adjusted_string somehow points to an offset | |
| 124 // that was supposed to have been eliminated by the first set of | |
| 125 // adjustments.) | |
| 126 DCHECK_LE(first_iter->original_offset + first_iter->output_length, | |
| 127 adjusted_iter->original_offset + shift); | |
| 128 | |
| 129 // Add the |first_adjustment_iter| to the full set of adjustments while | |
| 130 // making sure |adjusted_iter| continues pointing to the same element. | |
| 131 // We do this by inserting the |first_adjustment_iter| right before | |
| 132 // |adjusted_iter|, then incrementing |adjusted_iter| so it points to | |
| 133 // the following element. | |
| 134 shift += first_iter->original_length - first_iter->output_length; | |
| 135 adjusted_iter = adjustments_on_adjusted_string->insert( | |
| 136 adjusted_iter, *first_iter); | |
| 137 ++adjusted_iter; | |
| 138 ++first_iter; | |
| 139 } else { | |
| 140 // The first adjustment adjusted something that then got further adjusted | |
| 141 // by the second set of adjustments. In other words, |first_iter| points | |
| 142 // to something in the range covered by |adjusted_iter|'s length (after | |
| 143 // accounting for |shift|). Precisely, | |
| 144 // adjusted_iter->original_offset + shift | |
| 145 // <= | |
| 146 // first_iter->original_offset | |
| 147 // <= | |
| 148 // adjusted_iter->original_offset + shift + | |
| 149 // adjusted_iter->original_length | |
| 150 | |
| 151 // Modify the current |adjusted_iter| to include whatever collapsing | |
| 152 // happened in |first_iter|, then advance to the next |first_adjustments| | |
| 153 // because we dealt with the current one. | |
| 154 const int collapse = static_cast<int>(first_iter->original_length) - | |
| 155 static_cast<int>(first_iter->output_length); | |
| 156 // This function does not know how to deal with a string that expands and | |
| 157 // then gets modified, only strings that collapse and then get modified. | |
| 158 DCHECK_GT(collapse, 0); | |
| 159 adjusted_iter->original_length += collapse; | |
| 160 currently_collapsing += collapse; | |
| 161 ++first_iter; | |
| 162 } | |
| 163 } | |
| 164 DCHECK_EQ(0u, currently_collapsing); | |
| 165 if (first_iter != first_adjustments.end()) { | |
| 166 // Only first adjustments are left. These do not need to be modified. | |
| 167 // (Their offsets are already correct with respect to the original string.) | |
| 168 // Append them all. | |
| 169 DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); | |
| 170 adjustments_on_adjusted_string->insert( | |
| 171 adjustments_on_adjusted_string->end(), first_iter, | |
| 172 first_adjustments.end()); | |
| 173 } | |
| 174 } | |
| 175 | |
| 176 // Converts the given source Unicode character type to the given destination | |
| 177 // Unicode character type as a STL string. The given input buffer and size | |
| 178 // determine the source, and the given output STL string will be replaced by | |
| 179 // the result. If non-NULL, |adjustments| is set to reflect the all the | |
| 180 // alterations to the string that are not one-character-to-one-character. | |
| 181 // It will always be sorted by increasing offset. | |
| 182 template<typename SrcChar, typename DestStdString> | |
| 183 bool ConvertUnicode(const SrcChar* src, | |
| 184 size_t src_len, | |
| 185 DestStdString* output, | |
| 186 OffsetAdjuster::Adjustments* adjustments) { | |
| 187 if (adjustments) | |
| 188 adjustments->clear(); | |
| 189 // ICU requires 32-bit numbers. | |
| 190 bool success = true; | |
| 191 int32 src_len32 = static_cast<int32>(src_len); | |
| 192 for (int32 i = 0; i < src_len32; i++) { | |
| 193 uint32 code_point; | |
| 194 size_t original_i = i; | |
| 195 size_t chars_written = 0; | |
| 196 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { | |
| 197 chars_written = WriteUnicodeCharacter(code_point, output); | |
| 198 } else { | |
| 199 chars_written = WriteUnicodeCharacter(0xFFFD, output); | |
| 200 success = false; | |
| 201 } | |
| 202 | |
| 203 // Only bother writing an adjustment if this modification changed the | |
| 204 // length of this character. | |
| 205 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last | |
| 206 // character read, not after it (so that incrementing it in the loop | |
| 207 // increment will place it at the right location), so we need to account | |
| 208 // for that in determining the amount that was read. | |
| 209 if (adjustments && ((i - original_i + 1) != chars_written)) { | |
| 210 adjustments->push_back(OffsetAdjuster::Adjustment( | |
| 211 original_i, i - original_i + 1, chars_written)); | |
| 212 } | |
| 213 } | |
| 214 return success; | |
| 215 } | |
| 216 | |
| 217 bool UTF8ToUTF16WithAdjustments( | |
| 218 const char* src, | |
| 219 size_t src_len, | |
| 220 string16* output, | |
| 221 base::OffsetAdjuster::Adjustments* adjustments) { | |
| 222 PrepareForUTF16Or32Output(src, src_len, output); | |
| 223 return ConvertUnicode(src, src_len, output, adjustments); | |
| 224 } | |
| 225 | |
| 226 string16 UTF8ToUTF16WithAdjustments( | |
| 227 const base::StringPiece& utf8, | |
| 228 base::OffsetAdjuster::Adjustments* adjustments) { | |
| 229 string16 result; | |
| 230 UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); | |
| 231 return result; | |
| 232 } | |
| 233 | |
| 234 string16 UTF8ToUTF16AndAdjustOffsets( | |
| 235 const base::StringPiece& utf8, | |
| 236 std::vector<size_t>* offsets_for_adjustment) { | |
| 237 std::for_each(offsets_for_adjustment->begin(), | |
| 238 offsets_for_adjustment->end(), | |
| 239 LimitOffset<base::StringPiece>(utf8.length())); | |
| 240 OffsetAdjuster::Adjustments adjustments; | |
| 241 string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); | |
| 242 OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); | |
| 243 return result; | |
| 244 } | |
| 245 | |
| 246 std::string UTF16ToUTF8AndAdjustOffsets( | |
| 247 const base::StringPiece16& utf16, | |
| 248 std::vector<size_t>* offsets_for_adjustment) { | |
| 249 std::for_each(offsets_for_adjustment->begin(), | |
| 250 offsets_for_adjustment->end(), | |
| 251 LimitOffset<base::StringPiece16>(utf16.length())); | |
| 252 std::string result; | |
| 253 PrepareForUTF8Output(utf16.data(), utf16.length(), &result); | |
| 254 OffsetAdjuster::Adjustments adjustments; | |
| 255 ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); | |
| 256 OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); | |
| 257 return result; | |
| 258 } | |
| 259 | |
| 260 } // namespace base | |
| OLD | NEW |