Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(176)

Side by Side Diff: third_party/libphonenumber/chrome/regexp_adapter_icuregexp.cc

Issue 6930013: Re-committing http://codereview.chromium.org/6803005/ after fixing multi-dll build: (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 9 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "third_party/libphonenumber/cpp/src/regexp_adapter.h"
6
7 // Setup all of the Chromium and WebKit defines
8 #include "base/logging.h"
9 #include "base/scoped_ptr.h"
10 #include "build/build_config.h"
11 #include "unicode/regex.h"
12 #include "unicode/stringpiece.h"
13 #include "unicode/unistr.h"
14
15 namespace {
16
17 // Converts |source| to UTF-8 string, returns it starting at position |pos|.
18 std::string UnicodeStringToUtf8String(icu::UnicodeString const& source,
19 int pos) {
20 std::string data;
21 source.toUTF8String<std::string>(data);
22 return data.substr(pos);
23 }
24
25 } // namespace
26
27 // Implementation of the abstract classes RegularExpressionInput and
28 // RegularExpression using ICU regular expression capabilities.
29
30 // The Regular Expression input class.
31 class IcuRegularExpressionInput : public reg_exp::RegularExpressionInput {
32 public:
33 explicit IcuRegularExpressionInput(const char* utf8_input);
34
35 // RegularExpressionInput implementation:
36 // Matches string to regular expression, returns true if expression was
37 // matched, false otherwise, advances position in the match.
38 // |reg_exp| - expression to be matched.
39 // |beginning_only| - if true match would be successfull only if appears at
40 // the beginning of the tested region of the string.
41 // |matched_string1| - successfully matched first string. Can be NULL.
42 // |matched_string2| - successfully matched second string. Can be NULL.
43 virtual bool ConsumeRegExp(std::string const& reg_exp,
44 bool beginning_only,
45 std::string* matched_string1,
46 std::string* matched_string2);
47
48 // Convert unmatched input to a string.
49 virtual std::string ToString() const;
50
51 icu::UnicodeString* Data() { return &utf8_input_; }
52
53 // Position in the input. For the newly created input position is 0,
54 // each call to ConsumeRegExp() or RegularExpression::Consume() advances
55 // position in the case of the successful match to be after the match.
56 int pos() const { return pos_; }
57 void set_pos(int pos) { pos_ = pos; }
58
59 private:
60 icu::UnicodeString utf8_input_;
61 int pos_;
62
63 DISALLOW_COPY_AND_ASSIGN(IcuRegularExpressionInput);
64 };
65
66 // The regular expression class.
67 class IcuRegularExpression : public reg_exp::RegularExpression {
68 public:
69 explicit IcuRegularExpression(const char* utf8_regexp);
70
71 // RegularExpression implementation:
72 // Matches string to regular expression, returns true if expression was
73 // matched, false otherwise, advances position in the match.
74 // |input_string| - string to be searched.
75 // |beginning_only| - if true match would be successfull only if appears at
76 // the beginning of the tested region of the string.
77 // |matched_string1| - successfully matched first string. Can be NULL.
78 // |matched_string2| - successfully matched second string. Can be NULL.
79 // |matched_string3| - successfully matched third string. Can be NULL.
80 virtual bool Consume(reg_exp::RegularExpressionInput* input_string,
81 bool beginning_only,
82 std::string* matched_string1,
83 std::string* matched_string2,
84 std::string* matched_string3) const;
85
86 // Matches string to regular expression, returns true if expression was
87 // matched, false otherwise.
88 // |input_string| - string to be searched.
89 // |full_match| - if true match would be successfull only if it matches the
90 // complete string.
91 // |matched_string| - successfully matched string. Can be NULL.
92 virtual bool Match(const char* input_string,
93 bool full_match,
94 std::string* matched_string) const;
95
96 // Replaces match(es) in the |string_to_process|. if |global| is true,
97 // replaces all the matches, only the first match otherwise.
98 // |replacement_string| - text the matches are replaced with.
99 // Returns true if expression successfully processed through the string,
100 // even if no actual replacements were made. Returns false in case of an
101 // error.
102 virtual bool Replace(std::string* string_to_process,
103 bool global,
104 const char* replacement_string) const;
105 private:
106 scoped_ptr<icu::RegexPattern> utf8_regexp_;
107
108 DISALLOW_COPY_AND_ASSIGN(IcuRegularExpression);
109 };
110
111 IcuRegularExpressionInput::IcuRegularExpressionInput(const char* utf8_input)
112 : pos_(0) {
113 DCHECK(utf8_input);
114 utf8_input_ = icu::UnicodeString::fromUTF8(utf8_input);
115 }
116
117 bool IcuRegularExpressionInput::ConsumeRegExp(std::string const& reg_exp,
118 bool beginning_only,
119 std::string* matched_string1,
120 std::string* matched_string2) {
121 IcuRegularExpression re(reg_exp.c_str());
122
123 return re.Consume(this, beginning_only, matched_string1, matched_string2,
124 NULL);
125 }
126
127 std::string IcuRegularExpressionInput::ToString() const {
128 if (pos_ < 0 || pos_ > utf8_input_.length())
129 return std::string();
130 return UnicodeStringToUtf8String(utf8_input_, pos_);
131 }
132
133 IcuRegularExpression::IcuRegularExpression(const char* utf8_regexp) {
134 DCHECK(utf8_regexp);
135 UParseError pe;
136 UErrorCode status = U_ZERO_ERROR;
137 utf8_regexp_.reset(icu::RegexPattern::compile(
138 icu::UnicodeString::fromUTF8(utf8_regexp), 0, pe, status));
139 if (U_FAILURE(status)) {
140 // All of the passed regular expressions should compile correctly.
141 utf8_regexp_.reset(NULL);
142 NOTREACHED();
143 }
144 }
145
146 bool IcuRegularExpression::Consume(
147 reg_exp::RegularExpressionInput* input_string,
148 bool beginning_only,
149 std::string* matched_string1,
150 std::string* matched_string2,
151 std::string* matched_string3) const {
152 DCHECK(input_string);
153 // matched_string1 may be NULL
154 // matched_string2 may be NULL
155 // matched_string3 may be NULL
156 if (!utf8_regexp_.get())
157 return false;
158
159 IcuRegularExpressionInput* input =
160 reinterpret_cast<IcuRegularExpressionInput *>(input_string);
161 UErrorCode status = U_ZERO_ERROR;
162 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input->Data()),
163 status));
164
165 if (U_FAILURE(status))
166 return false;
167
168 if (beginning_only) {
169 if (!matcher->lookingAt(input->pos(), status))
170 return false;
171 } else {
172 if (!matcher->find(input->pos(), status))
173 return false;
174 }
175 if (U_FAILURE(status))
176 return false;
177 // If less matches than expected - fail.
178 if ((matched_string3 && matcher->groupCount() < 3) ||
179 (matched_string2 && matcher->groupCount() < 2) ||
180 (matched_string1 && matcher->groupCount() < 1)) {
181 return false;
182 }
183 if (matcher->groupCount() > 0 && matched_string1) {
184 *matched_string1 = UnicodeStringToUtf8String(matcher->group(1, status), 0);
185 }
186 if (matcher->groupCount() > 1 && matched_string2) {
187 *matched_string2 = UnicodeStringToUtf8String(matcher->group(2, status), 0);
188 }
189 if (matcher->groupCount() > 2 && matched_string3) {
190 *matched_string3 = UnicodeStringToUtf8String(matcher->group(3, status), 0);
191 }
192 input->set_pos(matcher->end(status));
193 return true;
194 }
195
196 bool IcuRegularExpression::Match(const char* input_string,
197 bool full_match,
198 std::string* matched_string) const {
199 DCHECK(input_string);
200 // matched_string may be NULL
201 if (!utf8_regexp_.get())
202 return false;
203
204 IcuRegularExpressionInput input(input_string);
205 UErrorCode status = U_ZERO_ERROR;
206 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()),
207 status));
208
209 if (U_FAILURE(status))
210 return false;
211
212 if (full_match) {
213 if (!matcher->matches(input.pos(), status))
214 return false;
215 } else {
216 if (!matcher->find(input.pos(), status))
217 return false;
218 }
219 if (U_FAILURE(status))
220 return false;
221 if (matcher->groupCount() > 0 && matched_string) {
222 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status), 0);
223 }
224 return true;
225 }
226
227 bool IcuRegularExpression::Replace(std::string* string_to_process,
228 bool global,
229 const char* replacement_string) const {
230 DCHECK(string_to_process);
231 DCHECK(replacement_string);
232
233 std::string adapted_replacement(replacement_string);
234 // Adapt replacement string from RE2 (\0-9 for matches) format to ICU format
235 // ($0-9 for matches). All '$' should be prepended with '\' as well.
236 size_t backslash_pos = adapted_replacement.find('\\');
237 size_t dollar_pos = adapted_replacement.find('$');
238 while (backslash_pos != std::string::npos ||
239 dollar_pos != std::string::npos) {
240 bool process_dollar = false;
241 if (backslash_pos == std::string::npos ||
242 (dollar_pos != std::string::npos && dollar_pos < backslash_pos)) {
243 process_dollar = true;
244 }
245 if (process_dollar) {
246 adapted_replacement.insert(dollar_pos, "\\");
247 dollar_pos = adapted_replacement.find('$', dollar_pos + 2);
248 if (backslash_pos != std::string::npos)
249 ++backslash_pos;
250 } else {
251 if (adapted_replacement.length() > backslash_pos + 1) {
252 if (adapted_replacement[backslash_pos + 1] >= '0' &&
253 adapted_replacement[backslash_pos + 1] <= '9') {
254 adapted_replacement[backslash_pos] = '$';
255 }
256 if (adapted_replacement[backslash_pos + 1] == '\\') {
257 // Skip two characters instead of one.
258 ++backslash_pos;
259 }
260 }
261 backslash_pos = adapted_replacement.find('\\', backslash_pos + 1);
262 }
263 }
264
265 IcuRegularExpressionInput input(string_to_process->c_str());
266 UErrorCode status = U_ZERO_ERROR;
267 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()),
268 status));
269 if (U_FAILURE(status))
270 return false;
271
272 icu::UnicodeString result;
273
274 if (global) {
275 result = matcher->replaceAll(
276 icu::UnicodeString::fromUTF8(adapted_replacement),
277 status);
278 } else {
279 result = matcher->replaceFirst(
280 icu::UnicodeString::fromUTF8(adapted_replacement),
281 status);
282 }
283 if (U_FAILURE(status))
284 return false;
285 *string_to_process = UnicodeStringToUtf8String(result, 0);
286 return true;
287 }
288
289 namespace reg_exp {
290
291 RegularExpressionInput* CreateRegularExpressionInput(const char* utf8_input) {
292 return new IcuRegularExpressionInput(utf8_input);
293 }
294
295 RegularExpression* CreateRegularExpression(const char* utf8_regexp) {
296 return new IcuRegularExpression(utf8_regexp);
297 }
298
299 } // namespace reg_exp
OLDNEW
« no previous file with comments | « third_party/libphonenumber/README.chromium ('k') | third_party/libphonenumber/cpp/CMakeLists.txt » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698