Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(430)

Side by Side Diff: components/autofill/core/browser/address_field.cc

Issue 1453193002: autofill: switch autofill_regexes to RE2 library (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address reviews Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « components/autofill.gypi ('k') | components/autofill/core/browser/autofill_regex_constants.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/autofill/core/browser/address_field.h" 5 #include "components/autofill/core/browser/address_field.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h" 10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string16.h"
12 #include "base/strings/string_util.h" 11 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "components/autofill/core/browser/autofill_field.h" 12 #include "components/autofill/core/browser/autofill_field.h"
15 #include "components/autofill/core/browser/autofill_regex_constants.h" 13 #include "components/autofill/core/browser/autofill_regex_constants.h"
16 #include "components/autofill/core/browser/autofill_scanner.h" 14 #include "components/autofill/core/browser/autofill_scanner.h"
17 #include "components/autofill/core/browser/field_types.h" 15 #include "components/autofill/core/browser/field_types.h"
18 16
19 using base::UTF8ToUTF16;
20
21 namespace autofill { 17 namespace autofill {
22 18
23 namespace { 19 namespace {
24 20
25 bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) { 21 bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) {
26 *field = scanner->Cursor(); 22 *field = scanner->Cursor();
27 scanner->Advance(); 23 scanner->Advance();
28 return true; 24 return true;
29 } 25 }
30 26
(...skipping 10 matching lines...) Expand all
41 const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT; 37 const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT;
42 38
43 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) { 39 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) {
44 if (scanner->IsEnd()) 40 if (scanner->IsEnd())
45 return NULL; 41 return NULL;
46 42
47 scoped_ptr<AddressField> address_field(new AddressField); 43 scoped_ptr<AddressField> address_field(new AddressField);
48 const AutofillField* const initial_field = scanner->Cursor(); 44 const AutofillField* const initial_field = scanner->Cursor();
49 size_t saved_cursor = scanner->SaveCursor(); 45 size_t saved_cursor = scanner->SaveCursor();
50 46
51 base::string16 attention_ignored = UTF8ToUTF16(kAttentionIgnoredRe);
52 base::string16 region_ignored = UTF8ToUTF16(kRegionIgnoredRe);
53
54 // Allow address fields to appear in any order. 47 // Allow address fields to appear in any order.
55 size_t begin_trailing_non_labeled_fields = 0; 48 size_t begin_trailing_non_labeled_fields = 0;
56 bool has_trailing_non_labeled_fields = false; 49 bool has_trailing_non_labeled_fields = false;
57 while (!scanner->IsEnd()) { 50 while (!scanner->IsEnd()) {
58 const size_t cursor = scanner->SaveCursor(); 51 const size_t cursor = scanner->SaveCursor();
59 if (address_field->ParseAddressLines(scanner) || 52 if (address_field->ParseAddressLines(scanner) ||
60 address_field->ParseCityStateZipCode(scanner) || 53 address_field->ParseCityStateZipCode(scanner) ||
61 address_field->ParseCountry(scanner) || 54 address_field->ParseCountry(scanner) ||
62 address_field->ParseCompany(scanner)) { 55 address_field->ParseCompany(scanner)) {
63 has_trailing_non_labeled_fields = false; 56 has_trailing_non_labeled_fields = false;
64 continue; 57 continue;
65 } else if (ParseField(scanner, attention_ignored, NULL) || 58 } else if (ParseField(scanner, kAttentionIgnoredRe, NULL) ||
66 ParseField(scanner, region_ignored, NULL)) { 59 ParseField(scanner, kRegionIgnoredRe, NULL)) {
67 // We ignore the following: 60 // We ignore the following:
68 // * Attention. 61 // * Attention.
69 // * Province/Region/Other. 62 // * Province/Region/Other.
70 continue; 63 continue;
71 } else if (scanner->Cursor() != initial_field && 64 } else if (scanner->Cursor() != initial_field &&
72 ParseEmptyLabel(scanner, NULL)) { 65 ParseEmptyLabel(scanner, NULL)) {
73 // Ignore non-labeled fields within an address; the page 66 // Ignore non-labeled fields within an address; the page
74 // MapQuest Driving Directions North America.html contains such a field. 67 // MapQuest Driving Directions North America.html contains such a field.
75 // We only ignore such fields after we've parsed at least one other field; 68 // We only ignore such fields after we've parsed at least one other field;
76 // otherwise we'd effectively parse address fields before other field 69 // otherwise we'd effectively parse address fields before other field
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
141 AddClassification(city_, ADDRESS_HOME_CITY, map) && 134 AddClassification(city_, ADDRESS_HOME_CITY, map) &&
142 AddClassification(state_, ADDRESS_HOME_STATE, map) && 135 AddClassification(state_, ADDRESS_HOME_STATE, map) &&
143 AddClassification(zip_, ADDRESS_HOME_ZIP, map) && 136 AddClassification(zip_, ADDRESS_HOME_ZIP, map) &&
144 AddClassification(country_, ADDRESS_HOME_COUNTRY, map); 137 AddClassification(country_, ADDRESS_HOME_COUNTRY, map);
145 } 138 }
146 139
147 bool AddressField::ParseCompany(AutofillScanner* scanner) { 140 bool AddressField::ParseCompany(AutofillScanner* scanner) {
148 if (company_ && !company_->IsEmpty()) 141 if (company_ && !company_->IsEmpty())
149 return false; 142 return false;
150 143
151 return ParseField(scanner, UTF8ToUTF16(kCompanyRe), &company_); 144 return ParseField(scanner, kCompanyRe, &company_);
152 } 145 }
153 146
154 bool AddressField::ParseAddressLines(AutofillScanner* scanner) { 147 bool AddressField::ParseAddressLines(AutofillScanner* scanner) {
155 // We only match the string "address" in page text, not in element names, 148 // We only match the string "address" in page text, not in element names,
156 // because sometimes every element in a group of address fields will have 149 // because sometimes every element in a group of address fields will have
157 // a name containing the string "address"; for example, on the page 150 // a name containing the string "address"; for example, on the page
158 // Kohl's - Register Billing Address.html the text element labeled "city" 151 // Kohl's - Register Billing Address.html the text element labeled "city"
159 // has the name "BILL_TO_ADDRESS<>city". We do match address labels 152 // has the name "BILL_TO_ADDRESS<>city". We do match address labels
160 // such as "address1", which appear as element names on various pages (eg 153 // such as "address1", which appear as element names on various pages (eg
161 // AmericanGirl-Registration.html, BloomingdalesBilling.html, 154 // AmericanGirl-Registration.html, BloomingdalesBilling.html,
162 // EBay Registration Enter Information.html). 155 // EBay Registration Enter Information.html).
163 if (address1_ || street_address_) 156 if (address1_ || street_address_)
164 return false; 157 return false;
165 158
166 // Ignore "Address Lookup" field. http://crbug.com/427622 159 // Ignore "Address Lookup" field. http://crbug.com/427622
167 if (ParseField(scanner, base::UTF8ToUTF16(kAddressLookupRe), NULL)) 160 if (ParseField(scanner, kAddressLookupRe, NULL))
168 return false; 161 return false;
169 162
170 base::string16 pattern = UTF8ToUTF16(kAddressLine1Re); 163 if (!ParseFieldSpecifics(scanner, kAddressLine1Re, MATCH_DEFAULT,
171 base::string16 label_pattern = UTF8ToUTF16(kAddressLine1LabelRe);
172 if (!ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT, &address1_) &&
173 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
174 &address1_) && 164 &address1_) &&
175 !ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_TEXT_AREA, 165 !ParseFieldSpecifics(scanner, kAddressLine1LabelRe,
176 &street_address_) && 166 MATCH_LABEL | MATCH_TEXT, &address1_) &&
177 !ParseFieldSpecifics(scanner, label_pattern, 167 !ParseFieldSpecifics(scanner, kAddressLine1Re,
178 MATCH_LABEL | MATCH_TEXT_AREA, 168 MATCH_DEFAULT | MATCH_TEXT_AREA, &street_address_) &&
179 &street_address_)) 169 !ParseFieldSpecifics(scanner, kAddressLine1LabelRe,
170 MATCH_LABEL | MATCH_TEXT_AREA, &street_address_))
180 return false; 171 return false;
181 172
182 if (street_address_) 173 if (street_address_)
183 return true; 174 return true;
184 175
185 // This code may not pick up pages that have an address field consisting of a 176 // This code may not pick up pages that have an address field consisting of a
186 // sequence of unlabeled address fields. If we need to add this, see 177 // sequence of unlabeled address fields. If we need to add this, see
187 // discussion on https://codereview.chromium.org/741493003/ 178 // discussion on https://codereview.chromium.org/741493003/
188 pattern = UTF8ToUTF16(kAddressLine2Re); 179 if (!ParseField(scanner, kAddressLine2Re, &address2_) &&
189 label_pattern = UTF8ToUTF16(kAddressLine2LabelRe); 180 !ParseFieldSpecifics(scanner, kAddressLine2LabelRe,
190 if (!ParseField(scanner, pattern, &address2_) && 181 MATCH_LABEL | MATCH_TEXT, &address2_))
191 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
192 &address2_))
193 return true; 182 return true;
194 183
195 // Optionally parse address line 3. This uses the same label regexp as 184 // Optionally parse address line 3. This uses the same label regexp as
196 // address 2 above. 185 // address 2 above.
197 pattern = UTF8ToUTF16(kAddressLinesExtraRe); 186 if (!ParseField(scanner, kAddressLinesExtraRe, &address3_) &&
198 if (!ParseField(scanner, pattern, &address3_) && 187 !ParseFieldSpecifics(scanner, kAddressLine2LabelRe,
199 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT, 188 MATCH_LABEL | MATCH_TEXT, &address3_))
200 &address3_))
201 return true; 189 return true;
202 190
203 // Try for surplus lines, which we will promptly discard. Some pages have 4 191 // Try for surplus lines, which we will promptly discard. Some pages have 4
204 // address lines (e.g. uk/ShoesDirect2.html)! 192 // address lines (e.g. uk/ShoesDirect2.html)!
205 // 193 //
206 // Since these are rare, don't bother considering unlabeled lines as extra 194 // Since these are rare, don't bother considering unlabeled lines as extra
207 // address lines. 195 // address lines.
208 pattern = UTF8ToUTF16(kAddressLinesExtraRe); 196 while (ParseField(scanner, kAddressLinesExtraRe, NULL)) {
209 while (ParseField(scanner, pattern, NULL)) {
210 // Consumed a surplus line, try for another. 197 // Consumed a surplus line, try for another.
211 } 198 }
212 return true; 199 return true;
213 } 200 }
214 201
215 bool AddressField::ParseCountry(AutofillScanner* scanner) { 202 bool AddressField::ParseCountry(AutofillScanner* scanner) {
216 if (country_ && !country_->IsEmpty()) 203 if (country_ && !country_->IsEmpty())
217 return false; 204 return false;
218 205
219 scanner->SaveCursor(); 206 scanner->SaveCursor();
220 if (ParseFieldSpecifics(scanner, 207 if (ParseFieldSpecifics(scanner, kCountryRe, MATCH_DEFAULT | MATCH_SELECT,
221 UTF8ToUTF16(kCountryRe),
222 MATCH_DEFAULT | MATCH_SELECT,
223 &country_)) { 208 &country_)) {
224 return true; 209 return true;
225 } 210 }
226 211
227 // The occasional page (e.g. google account registration page) calls this a 212 // The occasional page (e.g. google account registration page) calls this a
228 // "location". However, this only makes sense for select tags. 213 // "location". However, this only makes sense for select tags.
229 scanner->Rewind(); 214 scanner->Rewind();
230 return ParseFieldSpecifics(scanner, 215 return ParseFieldSpecifics(scanner, kCountryLocationRe,
231 UTF8ToUTF16(kCountryLocationRe),
232 MATCH_LABEL | MATCH_NAME | MATCH_SELECT, 216 MATCH_LABEL | MATCH_NAME | MATCH_SELECT,
233 &country_); 217 &country_);
234 } 218 }
235 219
236 bool AddressField::ParseZipCode(AutofillScanner* scanner) { 220 bool AddressField::ParseZipCode(AutofillScanner* scanner) {
237 if (zip_) 221 if (zip_)
238 return false; 222 return false;
239 223
240 if (!ParseFieldSpecifics(scanner, 224 if (!ParseFieldSpecifics(scanner, kZipCodeRe, kZipCodeMatchType, &zip_)) {
241 UTF8ToUTF16(kZipCodeRe),
242 kZipCodeMatchType,
243 &zip_)) {
244 return false; 225 return false;
245 } 226 }
246 227
247 // Look for a zip+4, whose field name will also often contain 228 // Look for a zip+4, whose field name will also often contain
248 // the substring "zip". 229 // the substring "zip".
249 ParseFieldSpecifics(scanner, UTF8ToUTF16(kZip4Re), kZipCodeMatchType, &zip4_); 230 ParseFieldSpecifics(scanner, kZip4Re, kZipCodeMatchType, &zip4_);
250 return true; 231 return true;
251 } 232 }
252 233
253 bool AddressField::ParseCity(AutofillScanner* scanner) { 234 bool AddressField::ParseCity(AutofillScanner* scanner) {
254 if (city_) 235 if (city_)
255 return false; 236 return false;
256 237
257 return ParseFieldSpecifics(scanner, 238 return ParseFieldSpecifics(scanner, kCityRe, kCityMatchType, &city_);
258 UTF8ToUTF16(kCityRe),
259 kCityMatchType,
260 &city_);
261 } 239 }
262 240
263 bool AddressField::ParseState(AutofillScanner* scanner) { 241 bool AddressField::ParseState(AutofillScanner* scanner) {
264 if (state_) 242 if (state_)
265 return false; 243 return false;
266 244
267 return ParseFieldSpecifics(scanner, 245 // Ignore spurious matches for "United States".
[Review comment] Evan Stade (2015/12/08 19:34:56): Where is this coming from? Is this a new addition?
[Review comment] Ilya Sherman (2015/12/08 20:02:28): It's coming from a change to one of the regexes, w… [comment truncated in collapsed view]
268 UTF8ToUTF16(kStateRe), 246 size_t saved_cursor = scanner->SaveCursor();
269 kStateMatchType, 247 if (ParseFieldSpecifics(scanner, "United States", kStateMatchType, nullptr)) {
270 &state_); 248 scanner->RewindTo(saved_cursor);
249 return false;
250 }
251
252 return ParseFieldSpecifics(scanner, kStateRe, kStateMatchType, &state_);
271 } 253 }
272 254
273 bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) { 255 bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) {
274 // Simple cases. 256 // Simple cases.
275 if (scanner->IsEnd()) 257 if (scanner->IsEnd())
276 return false; 258 return false;
277 if (city_ && state_ && zip_) 259 if (city_ && state_ && zip_)
278 return false; 260 return false;
279 if (state_ && zip_) 261 if (state_ && zip_)
280 return ParseCity(scanner); 262 return ParseCity(scanner);
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
322 304
323 return false; 305 return false;
324 } 306 }
325 307
326 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode( 308 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode(
327 AutofillScanner* scanner) { 309 AutofillScanner* scanner) {
328 if (zip_) 310 if (zip_)
329 return RESULT_MATCH_NONE; 311 return RESULT_MATCH_NONE;
330 312
331 ParseNameLabelResult result = ParseNameAndLabelSeparately( 313 ParseNameLabelResult result = ParseNameAndLabelSeparately(
332 scanner, UTF8ToUTF16(kZipCodeRe), kZipCodeMatchType, &zip_); 314 scanner, kZipCodeRe, kZipCodeMatchType, &zip_);
333 315
334 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd()) 316 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd())
335 return result; 317 return result;
336 318
337 size_t saved_cursor = scanner->SaveCursor(); 319 size_t saved_cursor = scanner->SaveCursor();
338 bool found_non_zip4 = ParseCity(scanner); 320 bool found_non_zip4 = ParseCity(scanner);
339 if (found_non_zip4) 321 if (found_non_zip4)
340 city_ = nullptr; 322 city_ = nullptr;
341 scanner->RewindTo(saved_cursor); 323 scanner->RewindTo(saved_cursor);
342 if (!found_non_zip4) { 324 if (!found_non_zip4) {
343 found_non_zip4 = ParseState(scanner); 325 found_non_zip4 = ParseState(scanner);
344 if (found_non_zip4) 326 if (found_non_zip4)
345 state_ = nullptr; 327 state_ = nullptr;
346 scanner->RewindTo(saved_cursor); 328 scanner->RewindTo(saved_cursor);
347 } 329 }
348 330
349 if (!found_non_zip4) { 331 if (!found_non_zip4) {
350 // Look for a zip+4, whose field name will also often contain 332 // Look for a zip+4, whose field name will also often contain
351 // the substring "zip". 333 // the substring "zip".
352 ParseFieldSpecifics(scanner, 334 ParseFieldSpecifics(scanner, kZip4Re, kZipCodeMatchType, &zip4_);
353 UTF8ToUTF16(kZip4Re),
354 kZipCodeMatchType,
355 &zip4_);
356 } 335 }
357 return result; 336 return result;
358 } 337 }
359 338
360 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity( 339 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity(
361 AutofillScanner* scanner) { 340 AutofillScanner* scanner) {
362 if (city_) 341 if (city_)
363 return RESULT_MATCH_NONE; 342 return RESULT_MATCH_NONE;
364 343
365 return ParseNameAndLabelSeparately( 344 return ParseNameAndLabelSeparately(scanner, kCityRe, kCityMatchType, &city_);
366 scanner, UTF8ToUTF16(kCityRe), kCityMatchType, &city_);
367 } 345 }
368 346
369 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState( 347 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState(
370 AutofillScanner* scanner) { 348 AutofillScanner* scanner) {
371 if (state_) 349 if (state_)
372 return RESULT_MATCH_NONE; 350 return RESULT_MATCH_NONE;
373 351
374 return ParseNameAndLabelSeparately( 352 size_t saved_cursor = scanner->SaveCursor();
375 scanner, UTF8ToUTF16(kStateRe), kStateMatchType, &state_); 353 if (ParseFieldSpecifics(scanner, "United States", kStateMatchType, nullptr)) {
354 scanner->RewindTo(saved_cursor);
355 return RESULT_MATCH_NONE;
356 }
357
358 return ParseNameAndLabelSeparately(scanner, kStateRe, kStateMatchType,
359 &state_);
376 } 360 }
377 361
378 } // namespace autofill 362 } // namespace autofill
OLD | NEW
« no previous file with comments | « components/autofill.gypi ('k') | components/autofill/core/browser/autofill_regex_constants.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698