OLD | NEW |
| (Empty) |
1 // Copyright (C) 2011 Google Inc. | |
2 // | |
3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
4 // you may not use this file except in compliance with the License. | |
5 // You may obtain a copy of the License at | |
6 // | |
7 // http://www.apache.org/licenses/LICENSE-2.0 | |
8 // | |
9 // Unless required by applicable law or agreed to in writing, software | |
10 // distributed under the License is distributed on an "AS IS" BASIS, | |
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 // See the License for the specific language governing permissions and | |
13 // limitations under the License. | |
14 | |
15 // Author: George Yakovlev | |
16 // Philippe Liard | |
17 | |
18 #include "regexp_adapter.h" | |
19 | |
20 #include <string> | |
21 | |
22 #include <unicode/regex.h> | |
23 #include <unicode/unistr.h> | |
24 | |
25 #include "base/basictypes.h" | |
26 #include "base/logging.h" | |
27 #include "base/memory/scoped_ptr.h" | |
28 #include "default_logger.h" | |
29 | |
30 namespace i18n { | |
31 namespace phonenumbers { | |
32 | |
33 using icu::RegexMatcher; | |
34 using icu::RegexPattern; | |
35 using icu::UnicodeString; | |
36 | |
37 namespace { | |
38 | |
39 // Converts UnicodeString 'source' to a UTF8-formatted std::string. | |
40 string UnicodeStringToUtf8String(const UnicodeString& source) { | |
41 string data; | |
42 source.toUTF8String<string>(data); | |
43 return data; | |
44 } | |
45 | |
46 } // namespace | |
47 | |
48 // Implementation of the abstract classes RegExpInput and RegExp using ICU | |
49 // regular expression capabilities. | |
50 | |
51 // ICU implementation of the RegExpInput abstract class. | |
52 class IcuRegExpInput : public RegExpInput { | |
53 public: | |
54 explicit IcuRegExpInput(const string& utf8_input) | |
55 : utf8_input_(UnicodeString::fromUTF8(utf8_input)), | |
56 position_(0) {} | |
57 | |
58 virtual ~IcuRegExpInput() {} | |
59 | |
60 virtual string ToString() const { | |
61 return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_)); | |
62 } | |
63 | |
64 UnicodeString* Data() { | |
65 return &utf8_input_; | |
66 } | |
67 | |
68 // The current start position. For a newly created input, position is 0. Each | |
69 // call to ConsumeRegExp() or RegExp::Consume() advances the position in the | |
70 // case of the successful match to be after the match. | |
71 int position() const { | |
72 return position_; | |
73 } | |
74 | |
75 void set_position(int position) { | |
76 DCHECK(position >= 0 && position <= utf8_input_.length()); | |
77 position_ = position; | |
78 } | |
79 | |
80 private: | |
81 UnicodeString utf8_input_; | |
82 int position_; | |
83 | |
84 DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput); | |
85 }; | |
86 | |
87 // ICU implementation of the RegExp abstract class. | |
88 class IcuRegExp : public RegExp { | |
89 public: | |
90 explicit IcuRegExp(const string& utf8_regexp) { | |
91 UParseError parse_error; | |
92 UErrorCode status = U_ZERO_ERROR; | |
93 utf8_regexp_.reset(RegexPattern::compile( | |
94 UnicodeString::fromUTF8(utf8_regexp), 0, parse_error, status)); | |
95 if (U_FAILURE(status)) { | |
96 // The provided regular expressions should compile correctly. | |
97 LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp; | |
98 utf8_regexp_.reset(NULL); | |
99 } | |
100 } | |
101 | |
102 virtual ~IcuRegExp() {} | |
103 | |
104 virtual bool Consume(RegExpInput* input_string, | |
105 bool anchor_at_start, | |
106 string* matched_string1, | |
107 string* matched_string2, | |
108 string* matched_string3) const { | |
109 DCHECK(input_string); | |
110 if (!utf8_regexp_.get()) { | |
111 return false; | |
112 } | |
113 IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string); | |
114 UErrorCode status = U_ZERO_ERROR; | |
115 const scoped_ptr<RegexMatcher> matcher( | |
116 utf8_regexp_->matcher(*input->Data(), status)); | |
117 bool match_succeeded = anchor_at_start | |
118 ? matcher->lookingAt(input->position(), status) | |
119 : matcher->find(input->position(), status); | |
120 if (!match_succeeded || U_FAILURE(status)) { | |
121 return false; | |
122 } | |
123 string* const matched_strings[] = { | |
124 matched_string1, matched_string2, matched_string3 | |
125 }; | |
126 // If less matches than expected - fail. | |
127 for (size_t i = 0; i < arraysize(matched_strings); ++i) { | |
128 if (matched_strings[i]) { | |
129 // Groups are counted from 1 rather than 0. | |
130 const int group_index = i + 1; | |
131 if (group_index > matcher->groupCount()) { | |
132 return false; | |
133 } | |
134 *matched_strings[i] = | |
135 UnicodeStringToUtf8String(matcher->group(group_index, status)); | |
136 } | |
137 } | |
138 input->set_position(matcher->end(status)); | |
139 return !U_FAILURE(status); | |
140 } | |
141 | |
142 bool Match(const string& input_string, | |
143 bool full_match, | |
144 string* matched_string) const { | |
145 if (!utf8_regexp_.get()) { | |
146 return false; | |
147 } | |
148 IcuRegExpInput input(input_string); | |
149 UErrorCode status = U_ZERO_ERROR; | |
150 const scoped_ptr<RegexMatcher> matcher( | |
151 utf8_regexp_->matcher(*input.Data(), status)); | |
152 bool match_succeeded = full_match | |
153 ? matcher->matches(input.position(), status) | |
154 : matcher->find(input.position(), status); | |
155 if (!match_succeeded || U_FAILURE(status)) { | |
156 return false; | |
157 } | |
158 if (matcher->groupCount() > 0 && matched_string) { | |
159 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status)); | |
160 } | |
161 return !U_FAILURE(status); | |
162 } | |
163 | |
164 bool Replace(string* string_to_process, | |
165 bool global, | |
166 const string& replacement_string) const { | |
167 DCHECK(string_to_process); | |
168 if (!utf8_regexp_.get()) { | |
169 return false; | |
170 } | |
171 IcuRegExpInput input(*string_to_process); | |
172 UErrorCode status = U_ZERO_ERROR; | |
173 const scoped_ptr<RegexMatcher> matcher( | |
174 utf8_regexp_->matcher(*input.Data(), status)); | |
175 if (U_FAILURE(status)) { | |
176 return false; | |
177 } | |
178 UnicodeString result = global | |
179 ? matcher->replaceAll( | |
180 UnicodeString::fromUTF8(replacement_string), status) | |
181 : matcher->replaceFirst( | |
182 UnicodeString::fromUTF8(replacement_string), status); | |
183 if (U_FAILURE(status)) { | |
184 return false; | |
185 } | |
186 const string replaced_string = UnicodeStringToUtf8String(result); | |
187 if (replaced_string == *string_to_process) { | |
188 return false; | |
189 } | |
190 *string_to_process = replaced_string; | |
191 return true; | |
192 } | |
193 | |
194 private: | |
195 scoped_ptr<RegexPattern> utf8_regexp_; | |
196 | |
197 DISALLOW_COPY_AND_ASSIGN(IcuRegExp); | |
198 }; | |
199 | |
200 RegExpInput* RegExpInput::Create(const string& utf8_input) { | |
201 return new IcuRegExpInput(utf8_input); | |
202 } | |
203 | |
204 RegExp* RegExp::Create(const string& utf8_regexp) { | |
205 return new IcuRegExp(utf8_regexp); | |
206 } | |
207 | |
208 } // namespace phonenumbers | |
209 } // namespace i18n | |
OLD | NEW |