|
OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "third_party/libphonenumber/cpp/src/regexp_adapter.h" | |
6 | |
7 // Set up all of the Chromium and WebKit defines. | |
dhollowa
2011/04/06 15:02:55
nit: s/chrome\/webkit defines/ Chromium and WebKit
GeorgeY
2011/04/07 00:00:39
Done.
| |
8 #include <build/build_config.h> | |
9 #include <unicode/regex.h> | |
10 #include <unicode/stringpiece.h> | |
11 #include <unicode/unistr.h> | |
12 | |
13 #include "base/logging.h" | |
14 #include "base/scoped_ptr.h" | |
15 | |
16 | |
dhollowa
2011/04/06 15:02:55
nit: remove extra space.
GeorgeY
2011/04/07 00:00:39
Done.
| |
17 namespace { | |
18 | |
19 std::string UnicodeStringToUtf8String(icu::UnicodeString const& source, | |
20 int pos) { | |
21 std::string data; | |
22 source.toUTF8String<std::string>(data); | |
23 return data.substr(pos); | |
24 } | |
25 | |
26 } // namespace | |
27 | |
28 class IcuRegularExpressionInput : public reg_exp::RegularExpressionInput { | |
dhollowa
2011/04/06 15:02:55
Please add comments for class.
GeorgeY
2011/04/07 00:00:39
Copied comments from the libphonenumber/cpp/src/re
| |
29 public: | |
30 explicit IcuRegularExpressionInput(const char* utf8_input); | |
31 | |
32 virtual bool ConsumeRegExp(std::string const& reg_exp, | |
dhollowa
2011/04/06 15:02:55
nit: add comment to designate interface. i.e. //
GeorgeY
2011/04/07 00:00:39
Done.
| |
33 bool beginning_only, | |
34 std::string* matched_string1, | |
35 std::string* matched_string2); | |
36 virtual std::string ToString() const; | |
37 | |
38 icu::UnicodeString* Data() { return &utf8_input_; } | |
39 | |
40 int pos() const { return pos_; } | |
dhollowa
2011/04/06 15:02:55
Please add comments. It is not clear, upon casual
GeorgeY
2011/04/07 00:00:39
Done.
| |
41 void set_pos(int pos) { pos_ = pos; } | |
42 | |
43 private: | |
dhollowa
2011/04/06 15:02:55
DISALLOW_COPY_AND_ASSIGN
GeorgeY
2011/04/07 00:00:39
Done.
| |
44 icu::UnicodeString utf8_input_; | |
45 int pos_; | |
46 }; | |
47 | |
48 | |
dhollowa
2011/04/06 15:02:55
nit: remove extra space.
GeorgeY
2011/04/07 00:00:39
Done.
| |
49 class IcuRegularExpression : public reg_exp::RegularExpression { | |
50 public: | |
51 explicit IcuRegularExpression(const char* utf8_regexp); | |
52 | |
53 virtual bool Consume(reg_exp::RegularExpressionInput* input_string, | |
54 bool beginning_only, | |
55 std::string* matched_string1, | |
56 std::string* matched_string2) const; | |
57 | |
58 virtual bool Match(const char* input_string, | |
59 bool full_match, | |
60 std::string* matched_string) const; | |
61 | |
62 virtual bool Replace(std::string* string_to_process, | |
63 bool global, | |
64 const char* replacement_string) const; | |
65 private: | |
dhollowa
2011/04/06 15:02:55
DISALLOW_COPY_AND_ASSIGN
GeorgeY
2011/04/07 00:00:39
Done.
| |
66 scoped_ptr<icu::RegexPattern> utf8_regexp_; | |
67 }; | |
68 | |
69 IcuRegularExpressionInput::IcuRegularExpressionInput( | |
70 const char* utf8_input) | |
71 : utf8_input_(icu::UnicodeString::fromUTF8(utf8_input)), | |
dhollowa
2011/04/06 15:02:55
clank may not like inlined ctor with non-trivial c
GeorgeY
2011/04/07 00:00:39
Moved to the body.
| |
72 pos_(0) { | |
73 DCHECK(utf8_input); | |
74 } | |
75 | |
76 bool IcuRegularExpressionInput::ConsumeRegExp(std::string const& reg_exp, | |
77 bool beginning_only, | |
78 std::string* matched_string1, | |
79 std::string* matched_string2) { | |
80 IcuRegularExpression re(reg_exp.c_str()); | |
81 | |
82 return re.Consume(this, beginning_only, matched_string1, matched_string2); | |
83 } | |
84 | |
85 std::string IcuRegularExpressionInput::ToString() const { | |
86 if (pos_ < 0 || pos_ > utf8_input_.length()) | |
87 return std::string(); | |
88 return UnicodeStringToUtf8String(utf8_input_, pos_); | |
89 } | |
90 | |
91 IcuRegularExpression::IcuRegularExpression(const char* utf8_regexp) { | |
92 DCHECK(utf8_regexp); | |
93 UParseError pe; | |
94 UErrorCode status = U_ZERO_ERROR; | |
95 utf8_regexp_.reset(icu::RegexPattern::compile( | |
96 icu::UnicodeString::fromUTF8(utf8_regexp), 0, pe, status)); | |
97 if (U_FAILURE(status)) { | |
98 // All of the passed Regular expressions should compile correctly. | |
99 DCHECK(false); | |
dhollowa
2011/04/06 15:02:55
NOTREACHED();
GeorgeY
2011/04/07 00:00:39
Done.
| |
100 utf8_regexp_.reset(NULL); | |
101 } | |
102 } | |
103 | |
104 bool IcuRegularExpression::Consume( | |
105 reg_exp::RegularExpressionInput* input_string, | |
106 bool beginning_only, | |
107 std::string* matched_string1, | |
108 std::string* matched_string2) const { | |
109 DCHECK(input_string); | |
110 // matched_string1 may be NULL | |
111 // matched_string2 may be NULL | |
112 if (!utf8_regexp_.get()) | |
113 return false; | |
114 | |
115 IcuRegularExpressionInput* input = | |
116 reinterpret_cast<IcuRegularExpressionInput *>(input_string); | |
117 UErrorCode status = U_ZERO_ERROR; | |
118 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input->Data()), | |
119 status)); | |
120 | |
121 if (U_FAILURE(status)) | |
122 return false; | |
123 | |
124 if (beginning_only) { | |
125 if (!matcher->lookingAt(input->pos(), status)) | |
126 return false; | |
127 } else { | |
128 if (!matcher->find(input->pos(), status)) | |
129 return false; | |
130 } | |
131 if (U_FAILURE(status)) | |
132 return false; | |
133 // If less matches than expected - fail. | |
134 if ((matched_string2 && matcher->groupCount() < 2) || | |
135 (matched_string1 && matcher->groupCount() < 1)) { | |
136 return false; | |
137 } | |
138 if (matcher->groupCount() > 0 && matched_string1) { | |
139 *matched_string1 = UnicodeStringToUtf8String(matcher->group(1, status), 0); | |
140 } | |
141 if (matcher->groupCount() > 1 && matched_string2) { | |
142 *matched_string2 = UnicodeStringToUtf8String(matcher->group(2, status), 0); | |
143 } | |
144 input->set_pos(matcher->end(status)); | |
145 return true; | |
146 } | |
147 | |
148 bool IcuRegularExpression::Match(const char* input_string, | |
149 bool full_match, | |
150 std::string* matched_string) const { | |
151 DCHECK(input_string); | |
152 // matched_string may be NULL | |
153 if (!utf8_regexp_.get()) | |
154 return false; | |
155 | |
156 IcuRegularExpressionInput input(input_string); | |
157 UErrorCode status = U_ZERO_ERROR; | |
158 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()), | |
159 status)); | |
160 | |
161 if (U_FAILURE(status)) | |
162 return false; | |
163 | |
164 if (full_match) { | |
165 if (!matcher->matches(input.pos(), status)) | |
166 return false; | |
167 } else { | |
168 if (!matcher->find(input.pos(), status)) | |
169 return false; | |
170 } | |
171 if (U_FAILURE(status)) | |
172 return false; | |
173 if (matcher->groupCount() > 0 && matched_string) { | |
174 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status), 0); | |
175 } | |
176 return true; | |
177 } | |
178 | |
179 bool IcuRegularExpression::Replace(std::string* string_to_process, | |
180 bool global, | |
181 const char* replacement_string) const { | |
182 DCHECK(string_to_process); | |
183 DCHECK(replacement_string); | |
184 | |
185 std::string adapted_replacement(replacement_string); | |
186 // Adapt replacement string from RE2 (\0-9 for matches) format to ICU format | |
187 // ($0-9 for matches). All '$' should be pre-pended with '\' as well. | |
188 size_t backslash_pos = adapted_replacement.find('\\'); | |
189 size_t dollar_pos = adapted_replacement.find('$'); | |
190 while (backslash_pos != std::string::npos || | |
191 dollar_pos != std::string::npos) { | |
192 bool process_dollar = false; | |
193 if (backslash_pos == std::string::npos || | |
194 (dollar_pos != std::string::npos && dollar_pos < backslash_pos)) { | |
195 process_dollar = true; | |
196 } | |
197 if (process_dollar) { | |
198 adapted_replacement.insert(dollar_pos, "\\"); | |
199 dollar_pos = adapted_replacement.find('$', dollar_pos + 2); | |
200 if (backslash_pos != std::string::npos) | |
201 ++backslash_pos; | |
202 } else { | |
203 if (adapted_replacement.length() > backslash_pos + 1) { | |
204 if (adapted_replacement[backslash_pos + 1] >= '0' && | |
205 adapted_replacement[backslash_pos + 1] <= '9') { | |
206 adapted_replacement[backslash_pos] = '$'; | |
207 } | |
208 if (adapted_replacement[backslash_pos + 1] == '\\') { | |
209 // Skip two characters instead of one. | |
210 ++backslash_pos; | |
211 } | |
212 } | |
213 backslash_pos = adapted_replacement.find('\\', backslash_pos + 1); | |
214 } | |
215 } | |
216 | |
217 IcuRegularExpressionInput input(string_to_process->c_str()); | |
218 UErrorCode status = U_ZERO_ERROR; | |
219 scoped_ptr<icu::RegexMatcher> matcher(utf8_regexp_->matcher(*(input.Data()), | |
220 status)); | |
221 | |
dhollowa
2011/04/06 15:02:55
if (U_FAILURE(status))...
GeorgeY
2011/04/07 00:00:39
Done.
| |
222 icu::UnicodeString result; | |
223 | |
224 if (global) { | |
225 result = matcher->replaceAll( | |
226 icu::UnicodeString::fromUTF8(adapted_replacement), | |
227 status); | |
228 } else { | |
229 result = matcher->replaceFirst( | |
230 icu::UnicodeString::fromUTF8(adapted_replacement), | |
231 status); | |
232 } | |
233 if (U_FAILURE(status)) | |
234 return false; | |
235 *string_to_process = UnicodeStringToUtf8String(result, 0); | |
236 return true; | |
237 } | |
238 | |
239 | |
240 namespace reg_exp { | |
241 | |
242 RegularExpressionInput* CreateRegularExpressionInput(const char* utf8_input) { | |
243 return new IcuRegularExpressionInput(utf8_input); | |
244 } | |
245 | |
246 RegularExpression* CreateRegularExpression(const char* utf8_regexp) { | |
247 return new IcuRegularExpression(utf8_regexp); | |
248 } | |
249 | |
250 } // namespace reg_exp | |
251 | |
OLD | NEW |