Chromium Code Reviews| Index: third_party/libphonenumber/chrome/regexp_adapter_icuregexp.cc |
| =================================================================== |
| --- third_party/libphonenumber/chrome/regexp_adapter_icuregexp.cc (revision 0) |
| +++ third_party/libphonenumber/chrome/regexp_adapter_icuregexp.cc (revision 0) |
| @@ -0,0 +1,300 @@ |
| +// Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "third_party/libphonenumber/cpp/src/regexp_adapter.h" |
| + |
| +// Setup all of the Chromium and WebKit defines |
| +#include "base/logging.h" |
| +#include "base/scoped_ptr.h" |
| +#include "build/build_config.h" |
| +#include "unicode/regex.h" |
| +#include "unicode/stringpiece.h" |
| +#include "unicode/unistr.h" |
| + |
| +namespace { |
| + |
| +// Converts |source| to UTF-8 string, returns it starting at position |pos|. |
| +std::string UnicodeStringToUtf8String(icu::UnicodeString const& source, |
| + int pos) { |
| + std::string data; |
| + source.toUTF8String<std::string>(data); |
| + return data.substr(pos); |
| +} |
| + |
| +} // namespace |
| + |
| +// Implementation of the abstract classes RegularExpressionInput and |
| +// RegularExpression using ICU regular expression capabilities. |
| + |
| +// The Regular Expression input class. |
| +class IcuRegularExpressionInput : public reg_exp::RegularExpressionInput { |
| + public: |
| + explicit IcuRegularExpressionInput(const char* utf8_input); |
| + |
| + // RegularExpressionInput implementation: |
| + // Matches string to regular expression, returns true if expression was |
| + // matched, false otherwise, advances position in the match. |
| + // |reg_exp| - expression to be matched. |
| + // |beginning_only| - if true match would be successfull only if appears at |
| + // the beginning of the tested region of the string. |
| + // |matched_string1| - successfully matched first string. Can be NULL. |
| + // |matched_string2| - successfully matched second string. Can be NULL. |
| + virtual bool ConsumeRegExp(std::string const& reg_exp, |
| + bool beginning_only, |
| + std::string* matched_string1, |
| + std::string* matched_string2); |
| + |
| + // Convert unmatched input to a string. |
| + virtual std::string ToString() const; |
| + |
| + icu::UnicodeString* Data() { return &utf8_input_; } |
| + |
| + // Position in the input. For the newly created input position is 0, |
| + // each call to ConsumeRegExp() or RegularExpression::Consume() advances |
| + // position in the case of the successful match to be after the match. |
| + int pos() const { return pos_; } |
| + void set_pos(int pos) { pos_ = pos; } |
| + |
| + private: |
| + icu::UnicodeString utf8_input_; |
| + int pos_; |
| + |
| + DISALLOW_COPY_AND_ASSIGN(IcuRegularExpressionInput); |
| +}; |
| + |
| +// The regular expression class. |
| +class IcuRegularExpression : public reg_exp::RegularExpression { |
| + public: |
| + explicit IcuRegularExpression(const char* utf8_regexp); |
| + |
| + // RegularExpression implementation: |
| + // Matches string to regular expression, returns true if expression was |
| + // matched, false otherwise, advances position in the match. |
| + // |input_string| - string to be searched. |
| + // |beginning_only| - if true match would be successfull only if appears at |
| + // the beginning of the tested region of the string. |
| + // |matched_string1| - successfully matched first string. Can be NULL. |
| + // |matched_string2| - successfully matched second string. Can be NULL. |
| + // |matched_string3| - successfully matched third string. Can be NULL. |
| + virtual bool Consume(reg_exp::RegularExpressionInput* input_string, |
| + bool beginning_only, |
| + std::string* matched_string1, |
| + std::string* matched_string2, |
| + std::string* matched_string3) const; |
| + |
| + // Matches string to regular expression, returns true if expression was |
| + // matched, false otherwise. |
| + // |input_string| - string to be searched. |
| + // |full_match| - if true match would be successfull only if it matches the |
| + // complete string. |
| + // |matched_string| - successfully matched string. Can be NULL. |
| + virtual bool Match(const char* input_string, |
| + bool full_match, |
| + std::string* matched_string) const; |
| + |
| + // Replaces match(es) in the |string_to_process|. if |global| is true, |
| + // replaces all the matches, only the first match otherwise. |
| + // |replacement_string| - text the matches are replaced with. |
| + // Returns true if expression successfully processed through the string, |
| + // even if no actual replacements were made. Returns false in case of an |
| + // error. |
| + virtual bool Replace(std::string* string_to_process, |
| + bool global, |
| + const char* replacement_string) const; |
| + private: |
| + scoped_ptr<icu::RegexPattern> utf8_regexp_; |
| + |
| + DISALLOW_COPY_AND_ASSIGN(IcuRegularExpression); |
| +}; |
| + |
| +IcuRegularExpressionInput::IcuRegularExpressionInput(const char* utf8_input) |
| + : pos_(0) { |
| + DCHECK(utf8_input); |
| + utf8_input_ = icu::UnicodeString::fromUTF8(utf8_input); |
| +} |
| + |
| +bool IcuRegularExpressionInput::ConsumeRegExp(std::string const& reg_exp, |
| + bool beginning_only, |
| + std::string* matched_string1, |
| + std::string* matched_string2) { |
| + IcuRegularExpression re(reg_exp.c_str()); |
| + |
| + return re.Consume(this, beginning_only, matched_string1, matched_string2, |
| + NULL); |
| +} |
| + |
| +std::string IcuRegularExpressionInput::ToString() const { |
| + if (pos_ < 0 || pos_ > utf8_input_.length()) |
| + return std::string(); |
| + return UnicodeStringToUtf8String(utf8_input_, pos_); |
| +} |
| + |
| +IcuRegularExpression::IcuRegularExpression(const char* utf8_regexp) { |
| + DCHECK(utf8_regexp); |
| + UParseError pe; |
| + UErrorCode status = U_ZERO_ERROR; |
| + utf8_regexp_.reset(icu::RegexPattern::compile( |
| + icu::UnicodeString::fromUTF8(utf8_regexp), 0, pe, status)); |
| + if (U_FAILURE(status)) { |
| + // All of the passed regular expressions should compile correctly. |
| + utf8_regexp_.reset(NULL); |
| + NOTREACHED(); |
| + } |
| +} |
| + |
| +bool IcuRegularExpression::Consume( |
| + reg_exp::RegularExpressionInput* input_string, |
| + bool beginning_only, |
| + std::string* matched_string1, |
| + std::string* matched_string2, |
| + std::string* matched_string3) const { |
| + DCHECK(input_string); |
| + // matched_string1 may be NULL |
| + // matched_string2 may be NULL |
| + // matched_string3 may be NULL |
| + if (!utf8_regexp_.get()) |
| + return false; |
| + |
| + IcuRegularExpressionInput* input = |
| + reinterpret_cast<IcuRegularExpressionInput *>(input_string); |
| + UErrorCode status = U_ZERO_ERROR; |
| + scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input->Data()), |
| + status)); |
| + |
| + if (U_FAILURE(status)) |
| + return false; |
| + |
| + if (beginning_only) { |
| + if (!matcher->lookingAt(input->pos(), status)) |
| + return false; |
| + } else { |
| + if (!matcher->find(input->pos(), status)) |
| + return false; |
| + } |
| + if (U_FAILURE(status)) |
| + return false; |
| + // If less matches than expected - fail. |
| + if ((matched_string3 && matcher->groupCount() < 3) || |
| + (matched_string2 && matcher->groupCount() < 2) || |
| + (matched_string1 && matcher->groupCount() < 1)) { |
| + return false; |
| + } |
| + if (matcher->groupCount() > 0 && matched_string1) { |
| + *matched_string1 = UnicodeStringToUtf8String(matcher->group(1, status), 0); |
| + } |
| + if (matcher->groupCount() > 1 && matched_string2) { |
| + *matched_string2 = UnicodeStringToUtf8String(matcher->group(2, status), 0); |
| + } |
| + if (matcher->groupCount() > 2 && matched_string3) { |
| + *matched_string3 = UnicodeStringToUtf8String(matcher->group(3, status), 0); |
| + } |
| + input->set_pos(matcher->end(status)); |
| + return true; |
| +} |
| + |
| +bool IcuRegularExpression::Match(const char* input_string, |
| + bool full_match, |
| + std::string* matched_string) const { |
| + DCHECK(input_string); |
| + // matched_string may be NULL |
| + if (!utf8_regexp_.get()) |
| + return false; |
| + |
| + IcuRegularExpressionInput input(input_string); |
| + UErrorCode status = U_ZERO_ERROR; |
| + scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()), |
| + status)); |
| + |
| + if (U_FAILURE(status)) |
| + return false; |
| + |
| + if (full_match) { |
| + if (!matcher->matches(input.pos(), status)) |
| + return false; |
| + } else { |
| + if (!matcher->find(input.pos(), status)) |
| + return false; |
| + } |
| + if (U_FAILURE(status)) |
| + return false; |
| + if (matcher->groupCount() > 0 && matched_string) { |
| + *matched_string = UnicodeStringToUtf8String(matcher->group(1, status), 0); |
| + } |
| + return true; |
| +} |
| + |
| +bool IcuRegularExpression::Replace(std::string* string_to_process, |
| + bool global, |
| + const char* replacement_string) const { |
| + DCHECK(string_to_process); |
| + DCHECK(replacement_string); |
| + |
| + std::string adapted_replacement(replacement_string); |
| + // Adapt replacement string from RE2 (\0-9 for matches) format to ICU format |
| + // ($0-9 for matches). All '$' should be prepended with '\' as well. |
| + size_t backslash_pos = adapted_replacement.find('\\'); |
| + size_t dollar_pos = adapted_replacement.find('$'); |
| + while (backslash_pos != std::string::npos || |
| + dollar_pos != std::string::npos) { |
| + bool process_dollar = false; |
| + if (backslash_pos == std::string::npos || |
| + (dollar_pos != std::string::npos && dollar_pos < backslash_pos)) { |
| + process_dollar = true; |
| + } |
| + if (process_dollar) { |
| + adapted_replacement.insert(dollar_pos, "\\"); |
| + dollar_pos = adapted_replacement.find('$', dollar_pos + 2); |
| + if (backslash_pos != std::string::npos) |
| + ++backslash_pos; |
| + } else { |
| + if (adapted_replacement.length() > backslash_pos + 1) { |
| + if (adapted_replacement[backslash_pos + 1] >= '0' && |
| + adapted_replacement[backslash_pos + 1] <= '9') { |
| + adapted_replacement[backslash_pos] = '$'; |
| + } |
| + if (adapted_replacement[backslash_pos + 1] == '\\') { |
| + // Skip two characters instead of one. |
| + ++backslash_pos; |
| + } |
| + } |
| + backslash_pos = adapted_replacement.find('\\', backslash_pos + 1); |
| + } |
| + } |
| + |
| + IcuRegularExpressionInput input(string_to_process->c_str()); |
| + UErrorCode status = U_ZERO_ERROR; |
| + scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()), |
| + status)); |
| + if (U_FAILURE(status)) |
| + return false; |
| + |
| + icu::UnicodeString result; |
| + |
| + if (global) { |
| + result = matcher->replaceAll( |
| + icu::UnicodeString::fromUTF8(adapted_replacement), |
| + status); |
| + } else { |
| + result = matcher->replaceFirst( |
| + icu::UnicodeString::fromUTF8(adapted_replacement), |
| + status); |
| + } |
| + if (U_FAILURE(status)) |
| + return false; |
| + *string_to_process = UnicodeStringToUtf8String(result, 0); |
| + return true; |
| +} |
| + |
| +namespace reg_exp { |
| + |
| +RegularExpressionInput* CreateRegularExpressionInput(const char* utf8_input) { |
| + return new IcuRegularExpressionInput(utf8_input); |
| +} |
| + |
| +RegularExpression* CreateRegularExpression(const char* utf8_regexp) { |
| + return new IcuRegularExpression(utf8_regexp); |
| +} |
| + |
| +} // namespace reg_exp |
| + |
|
Mark Mentovai
2011/05/03 20:52:59
Blank line at the end of this guy too.
|
| Property changes on: third_party\libphonenumber\chrome\regexp_adapter_icuregexp.cc |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| + LF |