Index: third_party/libphonenumber/cpp/src/regexp_adapter_icu.cc |
diff --git a/third_party/libphonenumber/cpp/src/regexp_adapter_icu.cc b/third_party/libphonenumber/cpp/src/regexp_adapter_icu.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..cf07d639407ecb13626695442e7fa5b409c73f8a |
--- /dev/null |
+++ b/third_party/libphonenumber/cpp/src/regexp_adapter_icu.cc |
@@ -0,0 +1,209 @@ |
+// Copyright (C) 2011 Google Inc. |
+// |
+// Licensed under the Apache License, Version 2.0 (the "License"); |
+// you may not use this file except in compliance with the License. |
+// You may obtain a copy of the License at |
+// |
+// http://www.apache.org/licenses/LICENSE-2.0 |
+// |
+// Unless required by applicable law or agreed to in writing, software |
+// distributed under the License is distributed on an "AS IS" BASIS, |
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
+// See the License for the specific language governing permissions and |
+// limitations under the License. |
+ |
+// Author: George Yakovlev |
+// Philippe Liard |
+ |
+#include "regexp_adapter.h" |
+ |
+#include <string> |
+ |
+#include <unicode/regex.h> |
+#include <unicode/unistr.h> |
+ |
+#include "base/basictypes.h" |
+#include "base/logging.h" |
+#include "base/memory/scoped_ptr.h" |
+#include "default_logger.h" |
+ |
+namespace i18n { |
+namespace phonenumbers { |
+ |
+using icu::RegexMatcher; |
+using icu::RegexPattern; |
+using icu::UnicodeString; |
+ |
+namespace { |
+ |
+// Converts UnicodeString 'source' to a UTF8-formatted std::string. |
+string UnicodeStringToUtf8String(const UnicodeString& source) { |
+ string data; |
+ source.toUTF8String<string>(data); |
+ return data; |
+} |
+ |
+} // namespace |
+ |
+// Implementation of the abstract classes RegExpInput and RegExp using ICU |
+// regular expression capabilities. |
+ |
+// ICU implementation of the RegExpInput abstract class. |
+class IcuRegExpInput : public RegExpInput { |
+ public: |
+ explicit IcuRegExpInput(const string& utf8_input) |
+ : utf8_input_(UnicodeString::fromUTF8(utf8_input)), |
+ position_(0) {} |
+ |
+ virtual ~IcuRegExpInput() {} |
+ |
+ virtual string ToString() const { |
+ return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_)); |
+ } |
+ |
+ UnicodeString* Data() { |
+ return &utf8_input_; |
+ } |
+ |
+ // The current start position. For a newly created input, position is 0. Each |
+ // call to ConsumeRegExp() or RegExp::Consume() advances the position in the |
+ // case of the successful match to be after the match. |
+ int position() const { |
+ return position_; |
+ } |
+ |
+ void set_position(int position) { |
+ DCHECK(position >= 0 && position <= utf8_input_.length()); |
+ position_ = position; |
+ } |
+ |
+ private: |
+ UnicodeString utf8_input_; |
+ int position_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput); |
+}; |
+ |
+// ICU implementation of the RegExp abstract class. |
+class IcuRegExp : public RegExp { |
+ public: |
+ explicit IcuRegExp(const string& utf8_regexp) { |
+ UParseError parse_error; |
+ UErrorCode status = U_ZERO_ERROR; |
+ utf8_regexp_.reset(RegexPattern::compile( |
+ UnicodeString::fromUTF8(utf8_regexp), 0, parse_error, status)); |
+ if (U_FAILURE(status)) { |
+ // The provided regular expressions should compile correctly. |
+ LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp; |
+ utf8_regexp_.reset(NULL); |
+ } |
+ } |
+ |
+ virtual ~IcuRegExp() {} |
+ |
+ virtual bool Consume(RegExpInput* input_string, |
+ bool anchor_at_start, |
+ string* matched_string1, |
+ string* matched_string2, |
+ string* matched_string3) const { |
+ DCHECK(input_string); |
+ if (!utf8_regexp_.get()) { |
+ return false; |
+ } |
+ IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string); |
+ UErrorCode status = U_ZERO_ERROR; |
+ const scoped_ptr<RegexMatcher> matcher( |
+ utf8_regexp_->matcher(*input->Data(), status)); |
+ bool match_succeeded = anchor_at_start |
+ ? matcher->lookingAt(input->position(), status) |
+ : matcher->find(input->position(), status); |
+ if (!match_succeeded || U_FAILURE(status)) { |
+ return false; |
+ } |
+ string* const matched_strings[] = { |
+ matched_string1, matched_string2, matched_string3 |
+ }; |
+ // If less matches than expected - fail. |
+ for (size_t i = 0; i < arraysize(matched_strings); ++i) { |
+ if (matched_strings[i]) { |
+ // Groups are counted from 1 rather than 0. |
+ const int group_index = i + 1; |
+ if (group_index > matcher->groupCount()) { |
+ return false; |
+ } |
+ *matched_strings[i] = |
+ UnicodeStringToUtf8String(matcher->group(group_index, status)); |
+ } |
+ } |
+ input->set_position(matcher->end(status)); |
+ return !U_FAILURE(status); |
+ } |
+ |
+ bool Match(const string& input_string, |
+ bool full_match, |
+ string* matched_string) const { |
+ if (!utf8_regexp_.get()) { |
+ return false; |
+ } |
+ IcuRegExpInput input(input_string); |
+ UErrorCode status = U_ZERO_ERROR; |
+ const scoped_ptr<RegexMatcher> matcher( |
+ utf8_regexp_->matcher(*input.Data(), status)); |
+ bool match_succeeded = full_match |
+ ? matcher->matches(input.position(), status) |
+ : matcher->find(input.position(), status); |
+ if (!match_succeeded || U_FAILURE(status)) { |
+ return false; |
+ } |
+ if (matcher->groupCount() > 0 && matched_string) { |
+ *matched_string = UnicodeStringToUtf8String(matcher->group(1, status)); |
+ } |
+ return !U_FAILURE(status); |
+ } |
+ |
+ bool Replace(string* string_to_process, |
+ bool global, |
+ const string& replacement_string) const { |
+ DCHECK(string_to_process); |
+ if (!utf8_regexp_.get()) { |
+ return false; |
+ } |
+ IcuRegExpInput input(*string_to_process); |
+ UErrorCode status = U_ZERO_ERROR; |
+ const scoped_ptr<RegexMatcher> matcher( |
+ utf8_regexp_->matcher(*input.Data(), status)); |
+ if (U_FAILURE(status)) { |
+ return false; |
+ } |
+ UnicodeString result = global |
+ ? matcher->replaceAll( |
+ UnicodeString::fromUTF8(replacement_string), status) |
+ : matcher->replaceFirst( |
+ UnicodeString::fromUTF8(replacement_string), status); |
+ if (U_FAILURE(status)) { |
+ return false; |
+ } |
+ const string replaced_string = UnicodeStringToUtf8String(result); |
+ if (replaced_string == *string_to_process) { |
+ return false; |
+ } |
+ *string_to_process = replaced_string; |
+ return true; |
+ } |
+ |
+ private: |
+ scoped_ptr<RegexPattern> utf8_regexp_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(IcuRegExp); |
+}; |
+ |
+RegExpInput* RegExpInput::Create(const string& utf8_input) { |
+ return new IcuRegExpInput(utf8_input); |
+} |
+ |
+RegExp* RegExp::Create(const string& utf8_regexp) { |
+ return new IcuRegExp(utf8_regexp); |
+} |
+ |
+} // namespace phonenumbers |
+} // namespace i18n |