Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(156)

Side by Side Diff: third_party/libphonenumber/chrome/regexp_adapter_icuregexp.cc

Issue 6803005: Autofill phone number enhancements and integration of Phone Number Util Library: part 1 (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 9 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "third_party/libphonenumber/cpp/src/regexp_adapter.h"
6
7 // Setup all of the Chromium and WebKit defines
Mark Mentovai 2011/04/27 16:53:01 “Set up” as a verb is two words.
GeorgeY 2011/04/28 07:21:18 Fixed
8 #include <build/build_config.h>
Mark Mentovai 2011/04/27 16:53:01 These should all be "…" includes, not <…> includes
GeorgeY 2011/04/28 07:21:18 Yes, I used their style while the file was in thei
9 #include <unicode/regex.h>
10 #include <unicode/stringpiece.h>
11 #include <unicode/unistr.h>
12
13 #include "base/logging.h"
14 #include "base/scoped_ptr.h"
15
16 namespace {
17
18 // Converts |source| to utf8 string, returns it.
Mark Mentovai 2011/04/27 16:53:01 “to a UTF-8 string.” This comment is silent on wh
GeorgeY 2011/04/28 07:21:18 Done.
19 std::string UnicodeStringToUtf8String(icu::UnicodeString const& source,
20 int pos) {
Mark Mentovai 2011/04/27 16:53:01 This looks like it would have fit on the preceding
GeorgeY 2011/04/28 07:21:18 Nope, short by two characters.
21 std::string data;
22 source.toUTF8String<std::string>(data);
23 return data.substr(pos);
24 }
25
26 } // namespace
27
28 // Implementation of the abstract classes RegularExpressionInput and
29 // RegularExpression using ICU regular expression capabilities.
30
31 // The reg exp input class.
Mark Mentovai 2011/04/27 16:53:01 Don’t abbreviate. Regular expression.
GeorgeY 2011/04/28 07:21:18 Done.
32 class IcuRegularExpressionInput : public reg_exp::RegularExpressionInput {
33 public:
34 explicit IcuRegularExpressionInput(const char* utf8_input);
35
36 // RegularExpressionInput implementation:
37 // Matches string to regular expression, returns true if expression was
38 // matched, false otherwise, advances position in the match.
39 // |reg_exp| - expression to be matched.
40 // |beginning_only| - if true match would be successfull only if appears at
41 // the beginning of the tested region of the string.
42 // |matched_string1| - successfully matched first string. Can be NULL.
43 // |matched_string2| - successfully matched second string. Can be NULL.
44 virtual bool ConsumeRegExp(std::string const& reg_exp,
45 bool beginning_only,
46 std::string* matched_string1,
47 std::string* matched_string2);
48
49 // Convert unmatched input to a string.
50 virtual std::string ToString() const;
51
52 icu::UnicodeString* Data() { return &utf8_input_; }
53
54 // Position in the input. For the newly created input position is 0,
55 // each call to ConsumeRegExp() or RegularExpression::Consume() advances
56 // position in the case of the successful match to be after the match.
57 int pos() const { return pos_; }
58 void set_pos(int pos) { pos_ = pos; }
59
60 private:
61 icu::UnicodeString utf8_input_;
62 int pos_;
63
64 DISALLOW_COPY_AND_ASSIGN(IcuRegularExpressionInput);
65 };
66
67 // The regular expression class.
68 class IcuRegularExpression : public reg_exp::RegularExpression {
69 public:
70 explicit IcuRegularExpression(const char* utf8_regexp);
71
72 // RegularExpression implementation:
73 // Matches string to regular expression, returns true if expression was
74 // matched, false otherwise, advances position in the match.
75 // |input_string| - string to be searched.
76 // |beginning_only| - if true match would be successfull only if appears at
77 // the beginning of the tested region of the string.
78 // |matched_string1| - successfully matched first string. Can be NULL.
79 // |matched_string2| - successfully matched second string. Can be NULL.
80 virtual bool Consume(reg_exp::RegularExpressionInput* input_string,
81 bool beginning_only,
82 std::string* matched_string1,
83 std::string* matched_string2) const;
84
85 // Matches string to regular expression, returns true if expression was
86 // matched, false otherwise.
87 // |input_string| - string to be searched.
88 // |full_match| - if true match would be successfull only if it matches the
89 // complete string.
90 // |matched_string| - successfully matched string. Can be NULL.
91 virtual bool Match(const char* input_string,
92 bool full_match,
93 std::string* matched_string) const;
94
95 // Replaces match(es) in the |string_to_process|. if |global| is true,
96 // replaces all the matches, only the first match otherwise.
97 // |replacement_string| - text the matches are replaced with.
98 // Returns true if expression successfully processed through the string,
99 // even if no actual replacements were made. Returns false in case of an
100 // error.
101 virtual bool Replace(std::string* string_to_process,
102 bool global,
103 const char* replacement_string) const;
104 private:
105 scoped_ptr<icu::RegexPattern> utf8_regexp_;
106
107 DISALLOW_COPY_AND_ASSIGN(IcuRegularExpression);
108 };
109
110 IcuRegularExpressionInput::IcuRegularExpressionInput(
111 const char* utf8_input)
Mark Mentovai 2011/04/27 16:53:01 This looks like it would fit on the preceding line
GeorgeY 2011/04/28 07:21:18 Done.
112 : pos_(0) {
113 DCHECK(utf8_input);
114 utf8_input_ = icu::UnicodeString::fromUTF8(utf8_input);
115 }
116
117 bool IcuRegularExpressionInput::ConsumeRegExp(std::string const& reg_exp,
118 bool beginning_only,
119 std::string* matched_string1,
120 std::string* matched_string2) {
121 IcuRegularExpression re(reg_exp.c_str());
122
123 return re.Consume(this, beginning_only, matched_string1, matched_string2);
124 }
125
126 std::string IcuRegularExpressionInput::ToString() const {
127 if (pos_ < 0 || pos_ > utf8_input_.length())
128 return std::string();
129 return UnicodeStringToUtf8String(utf8_input_, pos_);
130 }
131
132 IcuRegularExpression::IcuRegularExpression(const char* utf8_regexp) {
133 DCHECK(utf8_regexp);
134 UParseError pe;
135 UErrorCode status = U_ZERO_ERROR;
136 utf8_regexp_.reset(icu::RegexPattern::compile(
137 icu::UnicodeString::fromUTF8(utf8_regexp), 0, pe, status));
138 if (U_FAILURE(status)) {
139 // All of the passed Regular expressions should compile correctly.
Mark Mentovai 2011/04/27 16:53:01 Lowercase r.
GeorgeY 2011/04/28 07:21:18 Done.
140 utf8_regexp_.reset(NULL);
141 NOTREACHED();
142 }
143 }
144
145 bool IcuRegularExpression::Consume(
146 reg_exp::RegularExpressionInput* input_string,
147 bool beginning_only,
148 std::string* matched_string1,
149 std::string* matched_string2) const {
150 DCHECK(input_string);
151 // matched_string1 may be NULL
152 // matched_string2 may be NULL
153 if (!utf8_regexp_.get())
154 return false;
155
156 IcuRegularExpressionInput* input =
157 reinterpret_cast<IcuRegularExpressionInput *>(input_string);
158 UErrorCode status = U_ZERO_ERROR;
159 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input->Data()),
160 status));
161
162 if (U_FAILURE(status))
163 return false;
164
165 if (beginning_only) {
166 if (!matcher->lookingAt(input->pos(), status))
167 return false;
168 } else {
169 if (!matcher->find(input->pos(), status))
170 return false;
171 }
172 if (U_FAILURE(status))
173 return false;
174 // If less matches than expected - fail.
175 if ((matched_string2 && matcher->groupCount() < 2) ||
176 (matched_string1 && matcher->groupCount() < 1)) {
177 return false;
178 }
179 if (matcher->groupCount() > 0 && matched_string1) {
180 *matched_string1 = UnicodeStringToUtf8String(matcher->group(1, status), 0);
181 }
182 if (matcher->groupCount() > 1 && matched_string2) {
183 *matched_string2 = UnicodeStringToUtf8String(matcher->group(2, status), 0);
184 }
185 input->set_pos(matcher->end(status));
186 return true;
187 }
188
189 bool IcuRegularExpression::Match(const char* input_string,
190 bool full_match,
191 std::string* matched_string) const {
192 DCHECK(input_string);
193 // matched_string may be NULL
194 if (!utf8_regexp_.get())
195 return false;
196
197 IcuRegularExpressionInput input(input_string);
198 UErrorCode status = U_ZERO_ERROR;
199 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()),
200 status));
201
202 if (U_FAILURE(status))
203 return false;
204
205 if (full_match) {
206 if (!matcher->matches(input.pos(), status))
207 return false;
208 } else {
209 if (!matcher->find(input.pos(), status))
210 return false;
211 }
212 if (U_FAILURE(status))
213 return false;
214 if (matcher->groupCount() > 0 && matched_string) {
215 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status), 0);
216 }
217 return true;
218 }
219
220 bool IcuRegularExpression::Replace(std::string* string_to_process,
221 bool global,
222 const char* replacement_string) const {
223 DCHECK(string_to_process);
224 DCHECK(replacement_string);
225
226 std::string adapted_replacement(replacement_string);
227 // Adapt replacement string from RE2 (\0-9 for matches) format to ICU format
228 // ($0-9 for matches). All '$' should be pre-pended with '\' as well.
Mark Mentovai 2011/04/27 16:53:01 prepended (no hyphen).
GeorgeY 2011/04/28 07:21:18 Tell it to VS spell-checker :). Fixed.
229 size_t backslash_pos = adapted_replacement.find('\\');
230 size_t dollar_pos = adapted_replacement.find('$');
231 while (backslash_pos != std::string::npos ||
232 dollar_pos != std::string::npos) {
233 bool process_dollar = false;
234 if (backslash_pos == std::string::npos ||
235 (dollar_pos != std::string::npos && dollar_pos < backslash_pos)) {
236 process_dollar = true;
237 }
238 if (process_dollar) {
239 adapted_replacement.insert(dollar_pos, "\\");
240 dollar_pos = adapted_replacement.find('$', dollar_pos + 2);
241 if (backslash_pos != std::string::npos)
242 ++backslash_pos;
243 } else {
244 if (adapted_replacement.length() > backslash_pos + 1) {
245 if (adapted_replacement[backslash_pos + 1] >= '0' &&
246 adapted_replacement[backslash_pos + 1] <= '9') {
247 adapted_replacement[backslash_pos] = '$';
248 }
249 if (adapted_replacement[backslash_pos + 1] == '\\') {
250 // Skip two characters instead of one.
251 ++backslash_pos;
252 }
253 }
254 backslash_pos = adapted_replacement.find('\\', backslash_pos + 1);
255 }
256 }
257
258 IcuRegularExpressionInput input(string_to_process->c_str());
259 UErrorCode status = U_ZERO_ERROR;
260 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()),
261 status));
262 if (U_FAILURE(status))
263 return false;
264
265 icu::UnicodeString result;
266
267 if (global) {
268 result = matcher->replaceAll(
269 icu::UnicodeString::fromUTF8(adapted_replacement),
270 status);
271 } else {
272 result = matcher->replaceFirst(
273 icu::UnicodeString::fromUTF8(adapted_replacement),
274 status);
275 }
276 if (U_FAILURE(status))
277 return false;
278 *string_to_process = UnicodeStringToUtf8String(result, 0);
279 return true;
280 }
281
282
283 namespace reg_exp {
284
285 RegularExpressionInput* CreateRegularExpressionInput(const char* utf8_input) {
286 return new IcuRegularExpressionInput(utf8_input);
287 }
288
289 RegularExpression* CreateRegularExpression(const char* utf8_regexp) {
290 return new IcuRegularExpression(utf8_regexp);
291 }
292
293 } // namespace reg_exp
294
Mark Mentovai 2011/04/27 16:53:01 This blank line isn’t necessary.
GeorgeY 2011/04/28 07:21:18 It is, otherwise Lint would report the lack of the
Mark Mentovai 2011/04/28 14:27:03 GeorgeY wrote:
GeorgeY 2011/05/02 23:36:59 Apparently Lint does require empty line at the end
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698