OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "third_party/libphonenumber/cpp/src/regexp_adapter.h" |
| 6 |
| 7 // Setup all of the Chromium and WebKit defines |
| 8 #include "base/logging.h" |
| 9 #include "base/scoped_ptr.h" |
| 10 #include "build/build_config.h" |
| 11 #include "unicode/regex.h" |
| 12 #include "unicode/stringpiece.h" |
| 13 #include "unicode/unistr.h" |
| 14 |
| 15 namespace { |
| 16 |
| 17 // Converts |source| to UTF-8 string, returns it starting at position |pos|. |
| 18 std::string UnicodeStringToUtf8String(icu::UnicodeString const& source, |
| 19 int pos) { |
| 20 std::string data; |
| 21 source.toUTF8String<std::string>(data); |
| 22 return data.substr(pos); |
| 23 } |
| 24 |
| 25 } // namespace |
| 26 |
| 27 // Implementation of the abstract classes RegularExpressionInput and |
| 28 // RegularExpression using ICU regular expression capabilities. |
| 29 |
| 30 // The Regular Expression input class. |
| 31 class IcuRegularExpressionInput : public reg_exp::RegularExpressionInput { |
| 32 public: |
| 33 explicit IcuRegularExpressionInput(const char* utf8_input); |
| 34 |
| 35 // RegularExpressionInput implementation: |
| 36 // Matches string to regular expression, returns true if expression was |
| 37 // matched, false otherwise, advances position in the match. |
| 38 // |reg_exp| - expression to be matched. |
| 39 // |beginning_only| - if true match would be successfull only if appears at |
| 40 // the beginning of the tested region of the string. |
| 41 // |matched_string1| - successfully matched first string. Can be NULL. |
| 42 // |matched_string2| - successfully matched second string. Can be NULL. |
| 43 virtual bool ConsumeRegExp(std::string const& reg_exp, |
| 44 bool beginning_only, |
| 45 std::string* matched_string1, |
| 46 std::string* matched_string2); |
| 47 |
| 48 // Convert unmatched input to a string. |
| 49 virtual std::string ToString() const; |
| 50 |
| 51 icu::UnicodeString* Data() { return &utf8_input_; } |
| 52 |
| 53 // Position in the input. For the newly created input position is 0, |
| 54 // each call to ConsumeRegExp() or RegularExpression::Consume() advances |
| 55 // position in the case of the successful match to be after the match. |
| 56 int pos() const { return pos_; } |
| 57 void set_pos(int pos) { pos_ = pos; } |
| 58 |
| 59 private: |
| 60 icu::UnicodeString utf8_input_; |
| 61 int pos_; |
| 62 |
| 63 DISALLOW_COPY_AND_ASSIGN(IcuRegularExpressionInput); |
| 64 }; |
| 65 |
| 66 // The regular expression class. |
| 67 class IcuRegularExpression : public reg_exp::RegularExpression { |
| 68 public: |
| 69 explicit IcuRegularExpression(const char* utf8_regexp); |
| 70 |
| 71 // RegularExpression implementation: |
| 72 // Matches string to regular expression, returns true if expression was |
| 73 // matched, false otherwise, advances position in the match. |
| 74 // |input_string| - string to be searched. |
| 75 // |beginning_only| - if true match would be successfull only if appears at |
| 76 // the beginning of the tested region of the string. |
| 77 // |matched_string1| - successfully matched first string. Can be NULL. |
| 78 // |matched_string2| - successfully matched second string. Can be NULL. |
| 79 // |matched_string3| - successfully matched third string. Can be NULL. |
| 80 virtual bool Consume(reg_exp::RegularExpressionInput* input_string, |
| 81 bool beginning_only, |
| 82 std::string* matched_string1, |
| 83 std::string* matched_string2, |
| 84 std::string* matched_string3) const; |
| 85 |
| 86 // Matches string to regular expression, returns true if expression was |
| 87 // matched, false otherwise. |
| 88 // |input_string| - string to be searched. |
| 89 // |full_match| - if true match would be successfull only if it matches the |
| 90 // complete string. |
| 91 // |matched_string| - successfully matched string. Can be NULL. |
| 92 virtual bool Match(const char* input_string, |
| 93 bool full_match, |
| 94 std::string* matched_string) const; |
| 95 |
| 96 // Replaces match(es) in the |string_to_process|. if |global| is true, |
| 97 // replaces all the matches, only the first match otherwise. |
| 98 // |replacement_string| - text the matches are replaced with. |
| 99 // Returns true if expression successfully processed through the string, |
| 100 // even if no actual replacements were made. Returns false in case of an |
| 101 // error. |
| 102 virtual bool Replace(std::string* string_to_process, |
| 103 bool global, |
| 104 const char* replacement_string) const; |
| 105 private: |
| 106 scoped_ptr<icu::RegexPattern> utf8_regexp_; |
| 107 |
| 108 DISALLOW_COPY_AND_ASSIGN(IcuRegularExpression); |
| 109 }; |
| 110 |
| 111 IcuRegularExpressionInput::IcuRegularExpressionInput(const char* utf8_input) |
| 112 : pos_(0) { |
| 113 DCHECK(utf8_input); |
| 114 utf8_input_ = icu::UnicodeString::fromUTF8(utf8_input); |
| 115 } |
| 116 |
| 117 bool IcuRegularExpressionInput::ConsumeRegExp(std::string const& reg_exp, |
| 118 bool beginning_only, |
| 119 std::string* matched_string1, |
| 120 std::string* matched_string2) { |
| 121 IcuRegularExpression re(reg_exp.c_str()); |
| 122 |
| 123 return re.Consume(this, beginning_only, matched_string1, matched_string2, |
| 124 NULL); |
| 125 } |
| 126 |
| 127 std::string IcuRegularExpressionInput::ToString() const { |
| 128 if (pos_ < 0 || pos_ > utf8_input_.length()) |
| 129 return std::string(); |
| 130 return UnicodeStringToUtf8String(utf8_input_, pos_); |
| 131 } |
| 132 |
| 133 IcuRegularExpression::IcuRegularExpression(const char* utf8_regexp) { |
| 134 DCHECK(utf8_regexp); |
| 135 UParseError pe; |
| 136 UErrorCode status = U_ZERO_ERROR; |
| 137 utf8_regexp_.reset(icu::RegexPattern::compile( |
| 138 icu::UnicodeString::fromUTF8(utf8_regexp), 0, pe, status)); |
| 139 if (U_FAILURE(status)) { |
| 140 // All of the passed regular expressions should compile correctly. |
| 141 utf8_regexp_.reset(NULL); |
| 142 NOTREACHED(); |
| 143 } |
| 144 } |
| 145 |
| 146 bool IcuRegularExpression::Consume( |
| 147 reg_exp::RegularExpressionInput* input_string, |
| 148 bool beginning_only, |
| 149 std::string* matched_string1, |
| 150 std::string* matched_string2, |
| 151 std::string* matched_string3) const { |
| 152 DCHECK(input_string); |
| 153 // matched_string1 may be NULL |
| 154 // matched_string2 may be NULL |
| 155 // matched_string3 may be NULL |
| 156 if (!utf8_regexp_.get()) |
| 157 return false; |
| 158 |
| 159 IcuRegularExpressionInput* input = |
| 160 reinterpret_cast<IcuRegularExpressionInput *>(input_string); |
| 161 UErrorCode status = U_ZERO_ERROR; |
| 162 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input->Data()), |
| 163 status)); |
| 164 |
| 165 if (U_FAILURE(status)) |
| 166 return false; |
| 167 |
| 168 if (beginning_only) { |
| 169 if (!matcher->lookingAt(input->pos(), status)) |
| 170 return false; |
| 171 } else { |
| 172 if (!matcher->find(input->pos(), status)) |
| 173 return false; |
| 174 } |
| 175 if (U_FAILURE(status)) |
| 176 return false; |
| 177 // If less matches than expected - fail. |
| 178 if ((matched_string3 && matcher->groupCount() < 3) || |
| 179 (matched_string2 && matcher->groupCount() < 2) || |
| 180 (matched_string1 && matcher->groupCount() < 1)) { |
| 181 return false; |
| 182 } |
| 183 if (matcher->groupCount() > 0 && matched_string1) { |
| 184 *matched_string1 = UnicodeStringToUtf8String(matcher->group(1, status), 0); |
| 185 } |
| 186 if (matcher->groupCount() > 1 && matched_string2) { |
| 187 *matched_string2 = UnicodeStringToUtf8String(matcher->group(2, status), 0); |
| 188 } |
| 189 if (matcher->groupCount() > 2 && matched_string3) { |
| 190 *matched_string3 = UnicodeStringToUtf8String(matcher->group(3, status), 0); |
| 191 } |
| 192 input->set_pos(matcher->end(status)); |
| 193 return true; |
| 194 } |
| 195 |
| 196 bool IcuRegularExpression::Match(const char* input_string, |
| 197 bool full_match, |
| 198 std::string* matched_string) const { |
| 199 DCHECK(input_string); |
| 200 // matched_string may be NULL |
| 201 if (!utf8_regexp_.get()) |
| 202 return false; |
| 203 |
| 204 IcuRegularExpressionInput input(input_string); |
| 205 UErrorCode status = U_ZERO_ERROR; |
| 206 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()), |
| 207 status)); |
| 208 |
| 209 if (U_FAILURE(status)) |
| 210 return false; |
| 211 |
| 212 if (full_match) { |
| 213 if (!matcher->matches(input.pos(), status)) |
| 214 return false; |
| 215 } else { |
| 216 if (!matcher->find(input.pos(), status)) |
| 217 return false; |
| 218 } |
| 219 if (U_FAILURE(status)) |
| 220 return false; |
| 221 if (matcher->groupCount() > 0 && matched_string) { |
| 222 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status), 0); |
| 223 } |
| 224 return true; |
| 225 } |
| 226 |
| 227 bool IcuRegularExpression::Replace(std::string* string_to_process, |
| 228 bool global, |
| 229 const char* replacement_string) const { |
| 230 DCHECK(string_to_process); |
| 231 DCHECK(replacement_string); |
| 232 |
| 233 std::string adapted_replacement(replacement_string); |
| 234 // Adapt replacement string from RE2 (\0-9 for matches) format to ICU format |
| 235 // ($0-9 for matches). All '$' should be prepended with '\' as well. |
| 236 size_t backslash_pos = adapted_replacement.find('\\'); |
| 237 size_t dollar_pos = adapted_replacement.find('$'); |
| 238 while (backslash_pos != std::string::npos || |
| 239 dollar_pos != std::string::npos) { |
| 240 bool process_dollar = false; |
| 241 if (backslash_pos == std::string::npos || |
| 242 (dollar_pos != std::string::npos && dollar_pos < backslash_pos)) { |
| 243 process_dollar = true; |
| 244 } |
| 245 if (process_dollar) { |
| 246 adapted_replacement.insert(dollar_pos, "\\"); |
| 247 dollar_pos = adapted_replacement.find('$', dollar_pos + 2); |
| 248 if (backslash_pos != std::string::npos) |
| 249 ++backslash_pos; |
| 250 } else { |
| 251 if (adapted_replacement.length() > backslash_pos + 1) { |
| 252 if (adapted_replacement[backslash_pos + 1] >= '0' && |
| 253 adapted_replacement[backslash_pos + 1] <= '9') { |
| 254 adapted_replacement[backslash_pos] = '$'; |
| 255 } |
| 256 if (adapted_replacement[backslash_pos + 1] == '\\') { |
| 257 // Skip two characters instead of one. |
| 258 ++backslash_pos; |
| 259 } |
| 260 } |
| 261 backslash_pos = adapted_replacement.find('\\', backslash_pos + 1); |
| 262 } |
| 263 } |
| 264 |
| 265 IcuRegularExpressionInput input(string_to_process->c_str()); |
| 266 UErrorCode status = U_ZERO_ERROR; |
| 267 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()), |
| 268 status)); |
| 269 if (U_FAILURE(status)) |
| 270 return false; |
| 271 |
| 272 icu::UnicodeString result; |
| 273 |
| 274 if (global) { |
| 275 result = matcher->replaceAll( |
| 276 icu::UnicodeString::fromUTF8(adapted_replacement), |
| 277 status); |
| 278 } else { |
| 279 result = matcher->replaceFirst( |
| 280 icu::UnicodeString::fromUTF8(adapted_replacement), |
| 281 status); |
| 282 } |
| 283 if (U_FAILURE(status)) |
| 284 return false; |
| 285 *string_to_process = UnicodeStringToUtf8String(result, 0); |
| 286 return true; |
| 287 } |
| 288 |
| 289 namespace reg_exp { |
| 290 |
| 291 RegularExpressionInput* CreateRegularExpressionInput(const char* utf8_input) { |
| 292 return new IcuRegularExpressionInput(utf8_input); |
| 293 } |
| 294 |
| 295 RegularExpression* CreateRegularExpression(const char* utf8_regexp) { |
| 296 return new IcuRegularExpression(utf8_regexp); |
| 297 } |
| 298 |
| 299 } // namespace reg_exp |
OLD | NEW |