OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/utf_offset_string_conversions.h" | 5 #include "base/utf_offset_string_conversions.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 | 8 |
| 9 #include "base/logging.h" |
9 #include "base/memory/scoped_ptr.h" | 10 #include "base/memory/scoped_ptr.h" |
10 #include "base/string_piece.h" | 11 #include "base/string_piece.h" |
| 12 #include "base/third_party/icu/icu_utf.h" |
11 #include "base/utf_string_conversion_utils.h" | 13 #include "base/utf_string_conversion_utils.h" |
12 | 14 |
13 using base::PrepareForUTF16Or32Output; | 15 using base::PrepareForUTF16Or32Output; |
14 using base::PrepareForUTF8Output; | 16 using base::PrepareForUTF8Output; |
15 using base::ReadUnicodeCharacter; | 17 using base::ReadUnicodeCharacter; |
16 using base::WriteUnicodeCharacter; | 18 using base::WriteUnicodeCharacter; |
17 | 19 |
| 20 namespace { |
| 21 |
18 // Converts the given source Unicode character type to the given destination | 22 // Converts the given source Unicode character type to the given destination |
19 // Unicode character type as a STL string. The given input buffer and size | 23 // Unicode character type as a STL string. The given input buffer and size |
20 // determine the source, and the given output STL string will be replaced by | 24 // determine the source, and the given output STL string will be replaced by |
21 // the result. | 25 // the result. |
22 template<typename SrcChar, typename DestStdString> | 26 template<typename SrcChar, typename DestStdString> |
23 bool ConvertUnicode(const SrcChar* src, | 27 bool ConvertUnicode(const SrcChar* src, |
24 size_t src_len, | 28 size_t src_len, |
25 DestStdString* output, | 29 DestStdString* output, |
26 std::vector<size_t>* offsets_for_adjustment) { | 30 std::vector<size_t>* offsets_for_adjustment) { |
27 if (offsets_for_adjustment) { | 31 if (offsets_for_adjustment) { |
(...skipping 21 matching lines...) Expand all Loading... |
49 // character read, not after it (so that incrementing it in the loop | 53 // character read, not after it (so that incrementing it in the loop |
50 // increment will place it at the right location), so we need to account | 54 // increment will place it at the right location), so we need to account |
51 // for that in determining the amount that was read. | 55 // for that in determining the amount that was read. |
52 offset_adjuster.Add(OffsetAdjuster::Adjustment(original_i, | 56 offset_adjuster.Add(OffsetAdjuster::Adjustment(original_i, |
53 i - original_i + 1, chars_written)); | 57 i - original_i + 1, chars_written)); |
54 } | 58 } |
55 } | 59 } |
56 return success; | 60 return success; |
57 } | 61 } |
58 | 62 |
| 63 } // namespace |
| 64 |
| 65 bool IsValidCodePointIndex(const string16& s, size_t index) { |
| 66 return index == 0 || index == s.length() || |
| 67 !(CBU16_IS_TRAIL(s[index]) && CBU16_IS_LEAD(s[index - 1])); |
| 68 } |
| 69 |
| 70 ptrdiff_t Utf16IndexToOffset(const string16& s, size_t base, size_t pos) { |
| 71 DCHECK_LE(base, s.length()); |
| 72 DCHECK_LE(pos, s.length()); |
| 73 ptrdiff_t delta = 0; |
| 74 while (base < pos) |
| 75 delta += IsValidCodePointIndex(s, base++) ? 1 : 0; |
| 76 while (pos < base) |
| 77 delta -= IsValidCodePointIndex(s, pos++) ? 1 : 0; |
| 78 return delta; |
| 79 } |
| 80 |
| 81 size_t Utf16OffsetToIndex(const string16& s, size_t pos, ptrdiff_t offset) { |
| 82 DCHECK_LE(pos, s.length()); |
| 83 while (offset > 0 && pos < s.length()) |
| 84 offset -= IsValidCodePointIndex(s, pos++) ? 1 : 0; |
| 85 while (offset < 0 && pos > 0) |
| 86 offset += IsValidCodePointIndex(s, --pos) ? 1 : 0; |
| 87 // If offset != 0 then we ran off the edge of the string, which shouldn't |
| 88 // happen but is handled anyway for safety. |
| 89 DCHECK_EQ(offset, 0); |
| 90 // Since the second half of a surrogate pair has "length" zero, there is an |
| 91 // ambiguity in the returned position. Resolve it by always returning a valid |
| 92 // index. |
| 93 if (!IsValidCodePointIndex(s, pos)) |
| 94 ++pos; |
| 95 return pos; |
| 96 } |
| 97 |
59 bool UTF8ToUTF16AndAdjustOffset(const char* src, | 98 bool UTF8ToUTF16AndAdjustOffset(const char* src, |
60 size_t src_len, | 99 size_t src_len, |
61 string16* output, | 100 string16* output, |
62 size_t* offset_for_adjustment) { | 101 size_t* offset_for_adjustment) { |
63 std::vector<size_t> offsets; | 102 std::vector<size_t> offsets; |
64 if (offset_for_adjustment) | 103 if (offset_for_adjustment) |
65 offsets.push_back(*offset_for_adjustment); | 104 offsets.push_back(*offset_for_adjustment); |
66 PrepareForUTF16Or32Output(src, src_len, output); | 105 PrepareForUTF16Or32Output(src, src_len, output); |
67 bool ret = ConvertUnicode(src, src_len, output, &offsets); | 106 bool ret = ConvertUnicode(src, src_len, output, &offsets); |
68 if (offset_for_adjustment) | 107 if (offset_for_adjustment) |
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
158 if (*offset <= i->original_offset) | 197 if (*offset <= i->original_offset) |
159 break; | 198 break; |
160 if (*offset < (i->original_offset + i->original_length)) { | 199 if (*offset < (i->original_offset + i->original_length)) { |
161 *offset = string16::npos; | 200 *offset = string16::npos; |
162 return; | 201 return; |
163 } | 202 } |
164 adjustment += (i->original_length - i->output_length); | 203 adjustment += (i->original_length - i->output_length); |
165 } | 204 } |
166 *offset -= adjustment; | 205 *offset -= adjustment; |
167 } | 206 } |
OLD | NEW |