|
OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "third_party/libphonenumber/cpp/src/regexp_adapter.h" | |
6 | |
7 // Setup all of the Chromium and WebKit defines | |
Mark Mentovai
2011/04/27 16:53:01
“Set up” as a verb is two words.
GeorgeY
2011/04/28 07:21:18
Fixed
| |
8 #include <build/build_config.h> | |
Mark Mentovai
2011/04/27 16:53:01
These should all be "…" includes, not <…> includes
GeorgeY
2011/04/28 07:21:18
Yes, I used their style while the file was in thei
| |
9 #include <unicode/regex.h> | |
10 #include <unicode/stringpiece.h> | |
11 #include <unicode/unistr.h> | |
12 | |
13 #include "base/logging.h" | |
14 #include "base/scoped_ptr.h" | |
15 | |
16 namespace { | |
17 | |
18 // Converts |source| to utf8 string, returns it. | |
Mark Mentovai
2011/04/27 16:53:01
“to a UTF-8 string.”
This comment is silent on wh
GeorgeY
2011/04/28 07:21:18
Done.
| |
19 std::string UnicodeStringToUtf8String(icu::UnicodeString const& source, | |
20 int pos) { | |
Mark Mentovai
2011/04/27 16:53:01
This looks like it would have fit on the preceding
GeorgeY
2011/04/28 07:21:18
Nope, short by two characters.
| |
21 std::string data; /* |pos| indexes the code units of |source|, not bytes of the UTF-8 result. */ | |
22 source.tempSubString(pos).toUTF8String<std::string>(data); /* Slice first, then convert, so non-ASCII text is cut at the right place. */ | |
23 return data; /* An out-of-range |pos| is pinned by tempSubString() rather than failing in substr(). */ | |
24 } | |
25 | |
26 } // namespace | |
27 | |
28 // Implementation of the abstract classes RegularExpressionInput and | |
29 // RegularExpression using ICU regular expression capabilities. | |
30 | |
31 // The reg exp input class. | |
Mark Mentovai
2011/04/27 16:53:01
Don’t abbreviate. Regular expression.
GeorgeY
2011/04/28 07:21:18
Done.
| |
32 class IcuRegularExpressionInput : public reg_exp::RegularExpressionInput { | |
33 public: | |
34 explicit IcuRegularExpressionInput(const char* utf8_input); | |
35 | |
36 // RegularExpressionInput implementation: | |
37 // Matches the input against a regular expression. Returns true if the | |
38 // expression matched, false otherwise; a match advances the position. | |
39 // |reg_exp| - expression to be matched. | |
40 // |beginning_only| - if true, the match succeeds only if it appears at | |
41 // the beginning of the tested region of the string. | |
42 // |matched_string1| - successfully matched first string. Can be NULL. | |
43 // |matched_string2| - successfully matched second string. Can be NULL. | |
44 virtual bool ConsumeRegExp(std::string const& reg_exp, | |
45 bool beginning_only, | |
46 std::string* matched_string1, | |
47 std::string* matched_string2); | |
48 | |
49 // Converts the not-yet-consumed input (from pos() onward) to a string. | |
50 virtual std::string ToString() const; | |
51 /* Exposes the stored input so that a matcher can be created over it. */ | |
52 icu::UnicodeString* Data() { return &utf8_input_; } | |
53 | |
54 // Position in the input. For a newly created input the position is 0; | |
55 // each successful call to ConsumeRegExp() or RegularExpression::Consume() | |
56 // moves the position to just after the match. | |
57 int pos() const { return pos_; } | |
58 void set_pos(int pos) { pos_ = pos; } | |
59 | |
60 private: | |
61 icu::UnicodeString utf8_input_; /* Built from the UTF-8 argument via fromUTF8() in the constructor. */ | |
62 int pos_; | |
63 | |
64 DISALLOW_COPY_AND_ASSIGN(IcuRegularExpressionInput); | |
65 }; | |
66 | |
67 // The regular expression class. | |
68 class IcuRegularExpression : public reg_exp::RegularExpression { | |
69 public: | |
70 explicit IcuRegularExpression(const char* utf8_regexp); | |
71 | |
72 // RegularExpression implementation: | |
73 // Matches the input against this regular expression. Returns true if the | |
74 // expression matched, false otherwise; a match advances the input position. | |
75 // |input_string| - string to be searched. | |
76 // |beginning_only| - if true, the match succeeds only if it appears at | |
77 // the beginning of the tested region of the string. | |
78 // |matched_string1| - successfully matched first string. Can be NULL. | |
79 // |matched_string2| - successfully matched second string. Can be NULL. | |
80 virtual bool Consume(reg_exp::RegularExpressionInput* input_string, | |
81 bool beginning_only, | |
82 std::string* matched_string1, | |
83 std::string* matched_string2) const; | |
84 | |
85 // Matches a string against this regular expression. Returns true if the | |
86 // expression matched, false otherwise. | |
87 // |input_string| - string to be searched. | |
88 // |full_match| - if true, the match succeeds only if it matches the | |
89 // complete string. | |
90 // |matched_string| - successfully matched string. Can be NULL. | |
91 virtual bool Match(const char* input_string, | |
92 bool full_match, | |
93 std::string* matched_string) const; | |
94 | |
95 // Replaces match(es) in the |string_to_process|. If |global| is true, | |
96 // replaces all the matches, only the first match otherwise. | |
97 // |replacement_string| - text the matches are replaced with. | |
98 // Returns true if the expression was successfully applied to the string, | |
99 // even if no actual replacements were made. Returns false in case of an | |
100 // error. | |
101 virtual bool Replace(std::string* string_to_process, | |
102 bool global, | |
103 const char* replacement_string) const; | |
104 private: | |
105 scoped_ptr<icu::RegexPattern> utf8_regexp_; /* NULL when the pattern failed to compile in the constructor. */ | |
106 | |
107 DISALLOW_COPY_AND_ASSIGN(IcuRegularExpression); | |
108 }; | |
109 | |
110 IcuRegularExpressionInput::IcuRegularExpressionInput( | |
111 const char* utf8_input) | |
Mark Mentovai
2011/04/27 16:53:01
This looks like it would fit on the preceding line
GeorgeY
2011/04/28 07:21:18
Done.
| |
112 : pos_(0) { /* Matching starts at the beginning of the input. */ | |
113 DCHECK(utf8_input); | |
114 utf8_input_ = icu::UnicodeString::fromUTF8(utf8_input); /* Converts the UTF-8 bytes into the stored icu::UnicodeString. */ | |
115 } | |
116 | |
117 bool IcuRegularExpressionInput::ConsumeRegExp(std::string const& reg_exp, | |
118 bool beginning_only, | |
119 std::string* matched_string1, | |
120 std::string* matched_string2) { | |
121 IcuRegularExpression re(reg_exp.c_str()); /* Compiles |reg_exp| anew on every call. */ | |
122 /* Delegates to Consume(), which advances this input's position on a match. */ | |
123 return re.Consume(this, beginning_only, matched_string1, matched_string2); | |
124 } | |
125 | |
126 std::string IcuRegularExpressionInput::ToString() const { /* Returns the input from |pos_| onward via UnicodeStringToUtf8String(). */ | |
127 if (pos_ < 0 || pos_ > utf8_input_.length()) | |
128 return std::string(); /* Out-of-range position: nothing left to report. */ | |
129 return UnicodeStringToUtf8String(utf8_input_, pos_); | |
130 } | |
131 | |
132 IcuRegularExpression::IcuRegularExpression(const char* utf8_regexp) { | |
133 DCHECK(utf8_regexp); | |
134 UParseError pe; /* Filled in by compile(); not inspected afterwards. */ | |
135 UErrorCode status = U_ZERO_ERROR; | |
136 utf8_regexp_.reset(icu::RegexPattern::compile( | |
137 icu::UnicodeString::fromUTF8(utf8_regexp), 0, pe, status)); | |
138 if (U_FAILURE(status)) { | |
139 // All of the passed Regular expressions should compile correctly. | |
Mark Mentovai
2011/04/27 16:53:01
Lowercase r.
GeorgeY
2011/04/28 07:21:18
Done.
| |
140 utf8_regexp_.reset(NULL); /* Leaves the pattern NULL; Consume() and Match() then return false. */ | |
141 NOTREACHED(); | |
142 } | |
143 } | |
144 | |
145 bool IcuRegularExpression::Consume( | |
146 reg_exp::RegularExpressionInput* input_string, | |
147 bool beginning_only, | |
148 std::string* matched_string1, | |
149 std::string* matched_string2) const { | |
150 DCHECK(input_string); | |
151 // |matched_string1| and |matched_string2| may be NULL. Returns false when | |
152 // the pattern failed to compile (|utf8_regexp_| is NULL). | |
153 if (!utf8_regexp_.get()) | |
154 return false; | |
155 /* Assumes |input_string| really is an IcuRegularExpressionInput - TODO confirm no other implementation exists. */ | |
156 IcuRegularExpressionInput* input = | |
157 static_cast<IcuRegularExpressionInput*>(input_string); | |
158 UErrorCode status = U_ZERO_ERROR; | |
159 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input->Data()), | |
160 status)); | |
161 | |
162 if (U_FAILURE(status)) | |
163 return false; | |
164 | |
165 if (beginning_only) { | |
166 if (!matcher->lookingAt(input->pos(), status)) | |
167 return false; | |
168 } else { | |
169 if (!matcher->find(input->pos(), status)) | |
170 return false; | |
171 } | |
172 if (U_FAILURE(status)) | |
173 return false; | |
174 // If there are fewer groups than requested output strings - fail. | |
175 if ((matched_string2 && matcher->groupCount() < 2) || | |
176 (matched_string1 && matcher->groupCount() < 1)) { | |
177 return false; | |
178 } | |
179 if (matcher->groupCount() > 0 && matched_string1) { | |
180 *matched_string1 = UnicodeStringToUtf8String(matcher->group(1, status), 0); | |
181 } | |
182 if (matcher->groupCount() > 1 && matched_string2) { | |
183 *matched_string2 = UnicodeStringToUtf8String(matcher->group(2, status), 0); | |
184 } | |
185 input->set_pos(matcher->end(status)); /* Advance the input to just after the match. */ | |
186 return true; | |
187 } | |
188 | |
189 bool IcuRegularExpression::Match(const char* input_string, | |
190 bool full_match, | |
191 std::string* matched_string) const { | |
192 DCHECK(input_string); | |
193 // |matched_string| may be NULL. | |
194 if (!utf8_regexp_.get()) | |
195 return false; | |
196 /* Wraps the text in a temporary input whose position starts at 0. */ | |
197 IcuRegularExpressionInput input(input_string); | |
198 UErrorCode status = U_ZERO_ERROR; | |
199 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()), | |
200 status)); | |
201 | |
202 if (U_FAILURE(status)) | |
203 return false; | |
204 | |
205 if (full_match) { | |
206 if (!matcher->matches(input.pos(), status)) | |
207 return false; | |
208 } else { | |
209 if (!matcher->find(input.pos(), status)) | |
210 return false; | |
211 } | |
212 if (U_FAILURE(status)) | |
213 return false; | |
214 if (matcher->groupCount() > 0 && matched_string) { /* Only group 1 is reported; with no groups |matched_string| is left untouched. */ | |
215 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status), 0); | |
216 } | |
217 return true; | |
218 } | |
219 | |
220 bool IcuRegularExpression::Replace(std::string* string_to_process, | |
221 bool global, | |
222 const char* replacement_string) const { | |
223 DCHECK(string_to_process); | |
224 DCHECK(replacement_string); | |
225 if (!utf8_regexp_.get()) return false; /* Pattern failed to compile; same guard as Consume() and Match(). */ | |
226 std::string adapted_replacement(replacement_string); | |
227 // Adapt replacement string from RE2 (\0-9 for matches) format to ICU format | |
228 // ($0-9 for matches). All '$' should be pre-pended with '\' as well. | |
Mark Mentovai
2011/04/27 16:53:01
prepended (no hyphen).
GeorgeY
2011/04/28 07:21:18
Tell it to VS spell-checker :). Fixed.
| |
229 size_t backslash_pos = adapted_replacement.find('\\'); | |
230 size_t dollar_pos = adapted_replacement.find('$'); | |
231 while (backslash_pos != std::string::npos || | |
232 dollar_pos != std::string::npos) { | |
233 bool process_dollar = false; /* Handle whichever of '$' and '\' comes first in the string. */ | |
234 if (backslash_pos == std::string::npos || | |
235 (dollar_pos != std::string::npos && dollar_pos < backslash_pos)) { | |
236 process_dollar = true; | |
237 } | |
238 if (process_dollar) { | |
239 adapted_replacement.insert(dollar_pos, "\\"); | |
240 dollar_pos = adapted_replacement.find('$', dollar_pos + 2); | |
241 if (backslash_pos != std::string::npos) | |
242 ++backslash_pos; /* The inserted '\' shifted the pending backslash one to the right. */ | |
243 } else { | |
244 if (adapted_replacement.length() > backslash_pos + 1) { | |
245 if (adapted_replacement[backslash_pos + 1] >= '0' && | |
246 adapted_replacement[backslash_pos + 1] <= '9') { | |
247 adapted_replacement[backslash_pos] = '$'; | |
248 } | |
249 if (adapted_replacement[backslash_pos + 1] == '\\') { | |
250 // Skip two characters instead of one. | |
251 ++backslash_pos; | |
252 } | |
253 } | |
254 backslash_pos = adapted_replacement.find('\\', backslash_pos + 1); | |
255 } | |
256 } | |
257 | |
258 IcuRegularExpressionInput input(string_to_process->c_str()); | |
259 UErrorCode status = U_ZERO_ERROR; | |
260 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()), | |
261 status)); | |
262 if (U_FAILURE(status)) | |
263 return false; | |
264 | |
265 icu::UnicodeString result; | |
266 | |
267 if (global) { | |
268 result = matcher->replaceAll( | |
269 icu::UnicodeString::fromUTF8(adapted_replacement), | |
270 status); | |
271 } else { | |
272 result = matcher->replaceFirst( | |
273 icu::UnicodeString::fromUTF8(adapted_replacement), | |
274 status); | |
275 } | |
276 if (U_FAILURE(status)) | |
277 return false; | |
278 *string_to_process = UnicodeStringToUtf8String(result, 0); /* Written back only after a successful replacement. */ | |
279 return true; | |
280 } | |
281 | |
282 | |
283 namespace reg_exp { | |
284 /* Factory: returns a new ICU-backed input built from |utf8_input|; presumably the caller takes ownership - TODO confirm. */ | |
285 RegularExpressionInput* CreateRegularExpressionInput(const char* utf8_input) { | |
286 return new IcuRegularExpressionInput(utf8_input); | |
287 } | |
288 /* Factory: returns a new ICU-backed expression compiled from |utf8_regexp|; presumably the caller takes ownership - TODO confirm. */ | |
289 RegularExpression* CreateRegularExpression(const char* utf8_regexp) { | |
290 return new IcuRegularExpression(utf8_regexp); | |
291 } | |
292 | |
293 } // namespace reg_exp | |
294 | |
Mark Mentovai
2011/04/27 16:53:01
This blank line isn’t necessary.
GeorgeY
2011/04/28 07:21:18
It is, otherwise Lint would report the lack of the
Mark Mentovai
2011/04/28 14:27:03
GeorgeY wrote:
GeorgeY
2011/05/02 23:36:59
Apparently Lint does require empty line at the end
| |
OLD | NEW |