OLD | NEW |
(Empty) | |
| 1 // Copyright (C) 2011 Google Inc. |
| 2 // |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 // you may not use this file except in compliance with the License. |
| 5 // You may obtain a copy of the License at |
| 6 // |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 // |
| 9 // Unless required by applicable law or agreed to in writing, software |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 // See the License for the specific language governing permissions and |
| 13 // limitations under the License. |
| 14 |
| 15 // Author: George Yakovlev |
| 16 // Philippe Liard |
| 17 |
| 18 #include "regexp_adapter.h" |
| 19 |
| 20 #include <string> |
| 21 |
| 22 #include <unicode/regex.h> |
| 23 #include <unicode/unistr.h> |
| 24 |
| 25 #include "base/basictypes.h" |
| 26 #include "base/logging.h" |
| 27 #include "base/memory/scoped_ptr.h" |
| 28 #include "default_logger.h" |
| 29 |
| 30 namespace i18n { |
| 31 namespace phonenumbers { |
| 32 |
| 33 using icu::RegexMatcher; |
| 34 using icu::RegexPattern; |
| 35 using icu::UnicodeString; |
| 36 |
| 37 namespace { |
| 38 |
| 39 // Converts UnicodeString 'source' to a UTF8-formatted std::string. |
| 40 string UnicodeStringToUtf8String(const UnicodeString& source) { |
| 41 string data; |
| 42 source.toUTF8String<string>(data); |
| 43 return data; |
| 44 } |
| 45 |
| 46 } // namespace |
| 47 |
| 48 // Implementation of the abstract classes RegExpInput and RegExp using ICU |
| 49 // regular expression capabilities. |
| 50 |
| 51 // ICU implementation of the RegExpInput abstract class. |
| 52 class IcuRegExpInput : public RegExpInput { |
| 53 public: |
| 54 explicit IcuRegExpInput(const string& utf8_input) |
| 55 : utf8_input_(UnicodeString::fromUTF8(utf8_input)), |
| 56 position_(0) {} |
| 57 |
| 58 virtual ~IcuRegExpInput() {} |
| 59 |
| 60 virtual string ToString() const { |
| 61 return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_)); |
| 62 } |
| 63 |
| 64 UnicodeString* Data() { |
| 65 return &utf8_input_; |
| 66 } |
| 67 |
| 68 // The current start position. For a newly created input, position is 0. Each |
| 69 // call to ConsumeRegExp() or RegExp::Consume() advances the position in the |
| 70 // case of the successful match to be after the match. |
| 71 int position() const { |
| 72 return position_; |
| 73 } |
| 74 |
| 75 void set_position(int position) { |
| 76 DCHECK(position >= 0 && position <= utf8_input_.length()); |
| 77 position_ = position; |
| 78 } |
| 79 |
| 80 private: |
| 81 UnicodeString utf8_input_; |
| 82 int position_; |
| 83 |
| 84 DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput); |
| 85 }; |
| 86 |
| 87 // ICU implementation of the RegExp abstract class. |
| 88 class IcuRegExp : public RegExp { |
| 89 public: |
| 90 explicit IcuRegExp(const string& utf8_regexp) { |
| 91 UParseError parse_error; |
| 92 UErrorCode status = U_ZERO_ERROR; |
| 93 utf8_regexp_.reset(RegexPattern::compile( |
| 94 UnicodeString::fromUTF8(utf8_regexp), 0, parse_error, status)); |
| 95 if (U_FAILURE(status)) { |
| 96 // The provided regular expressions should compile correctly. |
| 97 LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp; |
| 98 utf8_regexp_.reset(NULL); |
| 99 } |
| 100 } |
| 101 |
| 102 virtual ~IcuRegExp() {} |
| 103 |
| 104 virtual bool Consume(RegExpInput* input_string, |
| 105 bool anchor_at_start, |
| 106 string* matched_string1, |
| 107 string* matched_string2, |
| 108 string* matched_string3) const { |
| 109 DCHECK(input_string); |
| 110 if (!utf8_regexp_.get()) { |
| 111 return false; |
| 112 } |
| 113 IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string); |
| 114 UErrorCode status = U_ZERO_ERROR; |
| 115 const scoped_ptr<RegexMatcher> matcher( |
| 116 utf8_regexp_->matcher(*input->Data(), status)); |
| 117 bool match_succeeded = anchor_at_start |
| 118 ? matcher->lookingAt(input->position(), status) |
| 119 : matcher->find(input->position(), status); |
| 120 if (!match_succeeded || U_FAILURE(status)) { |
| 121 return false; |
| 122 } |
| 123 string* const matched_strings[] = { |
| 124 matched_string1, matched_string2, matched_string3 |
| 125 }; |
| 126 // If less matches than expected - fail. |
| 127 for (size_t i = 0; i < arraysize(matched_strings); ++i) { |
| 128 if (matched_strings[i]) { |
| 129 // Groups are counted from 1 rather than 0. |
| 130 const int group_index = i + 1; |
| 131 if (group_index > matcher->groupCount()) { |
| 132 return false; |
| 133 } |
| 134 *matched_strings[i] = |
| 135 UnicodeStringToUtf8String(matcher->group(group_index, status)); |
| 136 } |
| 137 } |
| 138 input->set_position(matcher->end(status)); |
| 139 return !U_FAILURE(status); |
| 140 } |
| 141 |
| 142 bool Match(const string& input_string, |
| 143 bool full_match, |
| 144 string* matched_string) const { |
| 145 if (!utf8_regexp_.get()) { |
| 146 return false; |
| 147 } |
| 148 IcuRegExpInput input(input_string); |
| 149 UErrorCode status = U_ZERO_ERROR; |
| 150 const scoped_ptr<RegexMatcher> matcher( |
| 151 utf8_regexp_->matcher(*input.Data(), status)); |
| 152 bool match_succeeded = full_match |
| 153 ? matcher->matches(input.position(), status) |
| 154 : matcher->find(input.position(), status); |
| 155 if (!match_succeeded || U_FAILURE(status)) { |
| 156 return false; |
| 157 } |
| 158 if (matcher->groupCount() > 0 && matched_string) { |
| 159 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status)); |
| 160 } |
| 161 return !U_FAILURE(status); |
| 162 } |
| 163 |
| 164 bool Replace(string* string_to_process, |
| 165 bool global, |
| 166 const string& replacement_string) const { |
| 167 DCHECK(string_to_process); |
| 168 if (!utf8_regexp_.get()) { |
| 169 return false; |
| 170 } |
| 171 IcuRegExpInput input(*string_to_process); |
| 172 UErrorCode status = U_ZERO_ERROR; |
| 173 const scoped_ptr<RegexMatcher> matcher( |
| 174 utf8_regexp_->matcher(*input.Data(), status)); |
| 175 if (U_FAILURE(status)) { |
| 176 return false; |
| 177 } |
| 178 UnicodeString result = global |
| 179 ? matcher->replaceAll( |
| 180 UnicodeString::fromUTF8(replacement_string), status) |
| 181 : matcher->replaceFirst( |
| 182 UnicodeString::fromUTF8(replacement_string), status); |
| 183 if (U_FAILURE(status)) { |
| 184 return false; |
| 185 } |
| 186 const string replaced_string = UnicodeStringToUtf8String(result); |
| 187 if (replaced_string == *string_to_process) { |
| 188 return false; |
| 189 } |
| 190 *string_to_process = replaced_string; |
| 191 return true; |
| 192 } |
| 193 |
| 194 private: |
| 195 scoped_ptr<RegexPattern> utf8_regexp_; |
| 196 |
| 197 DISALLOW_COPY_AND_ASSIGN(IcuRegExp); |
| 198 }; |
| 199 |
| 200 RegExpInput* RegExpInput::Create(const string& utf8_input) { |
| 201 return new IcuRegExpInput(utf8_input); |
| 202 } |
| 203 |
| 204 RegExp* RegExp::Create(const string& utf8_regexp) { |
| 205 return new IcuRegExp(utf8_regexp); |
| 206 } |
| 207 |
| 208 } // namespace phonenumbers |
| 209 } // namespace i18n |
OLD | NEW |