Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(237)

Side by Side Diff: components/autofill/core/browser/address_field.cc

Issue 1453193002: autofill: switch autofill_regexes to RE2 library (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: 14 tests failing Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/autofill/core/browser/address_field.h" 5 #include "components/autofill/core/browser/address_field.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h" 10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string16.h" 11 #include "base/strings/string16.h"
12 #include "base/strings/string_util.h" 12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h" 13 #include "base/strings/utf_string_conversions.h"
14 #include "components/autofill/core/browser/autofill_field.h" 14 #include "components/autofill/core/browser/autofill_field.h"
15 #include "components/autofill/core/browser/autofill_regex_constants.h" 15 #include "components/autofill/core/browser/autofill_regex_constants.h"
16 #include "components/autofill/core/browser/autofill_scanner.h" 16 #include "components/autofill/core/browser/autofill_scanner.h"
17 #include "components/autofill/core/browser/field_types.h" 17 #include "components/autofill/core/browser/field_types.h"
18 18
19 using base::ASCIIToUTF16;
19 using base::UTF8ToUTF16; 20 using base::UTF8ToUTF16;
Ilya Sherman 2015/11/26 02:25:09 Are these needed?
tfarina 2015/11/26 14:22:27 Done.
20 21
21 namespace autofill { 22 namespace autofill {
22 23
23 namespace { 24 namespace {
24 25
25 bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) { 26 bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) {
26 *field = scanner->Cursor(); 27 *field = scanner->Cursor();
27 scanner->Advance(); 28 scanner->Advance();
28 return true; 29 return true;
29 } 30 }
(...skipping 11 matching lines...) Expand all
41 const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT; 42 const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT;
42 43
43 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) { 44 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) {
44 if (scanner->IsEnd()) 45 if (scanner->IsEnd())
45 return NULL; 46 return NULL;
46 47
47 scoped_ptr<AddressField> address_field(new AddressField); 48 scoped_ptr<AddressField> address_field(new AddressField);
48 const AutofillField* const initial_field = scanner->Cursor(); 49 const AutofillField* const initial_field = scanner->Cursor();
49 size_t saved_cursor = scanner->SaveCursor(); 50 size_t saved_cursor = scanner->SaveCursor();
50 51
51 base::string16 attention_ignored = UTF8ToUTF16(kAttentionIgnoredRe);
52 base::string16 region_ignored = UTF8ToUTF16(kRegionIgnoredRe);
53
54 // Allow address fields to appear in any order. 52 // Allow address fields to appear in any order.
55 size_t begin_trailing_non_labeled_fields = 0; 53 size_t begin_trailing_non_labeled_fields = 0;
56 bool has_trailing_non_labeled_fields = false; 54 bool has_trailing_non_labeled_fields = false;
57 while (!scanner->IsEnd()) { 55 while (!scanner->IsEnd()) {
58 const size_t cursor = scanner->SaveCursor(); 56 const size_t cursor = scanner->SaveCursor();
59 if (address_field->ParseAddressLines(scanner) || 57 if (address_field->ParseAddressLines(scanner) ||
60 address_field->ParseCityStateZipCode(scanner) || 58 address_field->ParseCityStateZipCode(scanner) ||
61 address_field->ParseCountry(scanner) || 59 address_field->ParseCountry(scanner) ||
62 address_field->ParseCompany(scanner)) { 60 address_field->ParseCompany(scanner)) {
63 has_trailing_non_labeled_fields = false; 61 has_trailing_non_labeled_fields = false;
64 continue; 62 continue;
65 } else if (ParseField(scanner, attention_ignored, NULL) || 63 } else if (ParseField(scanner, kAttentionIgnoredRe, NULL) ||
66 ParseField(scanner, region_ignored, NULL)) { 64 ParseField(scanner, kRegionIgnoredRe, NULL)) {
67 // We ignore the following: 65 // We ignore the following:
68 // * Attention. 66 // * Attention.
69 // * Province/Region/Other. 67 // * Province/Region/Other.
70 continue; 68 continue;
71 } else if (scanner->Cursor() != initial_field && 69 } else if (scanner->Cursor() != initial_field &&
72 ParseEmptyLabel(scanner, NULL)) { 70 ParseEmptyLabel(scanner, NULL)) {
73 // Ignore non-labeled fields within an address; the page 71 // Ignore non-labeled fields within an address; the page
74 // MapQuest Driving Directions North America.html contains such a field. 72 // MapQuest Driving Directions North America.html contains such a field.
75 // We only ignore such fields after we've parsed at least one other field; 73 // We only ignore such fields after we've parsed at least one other field;
76 // otherwise we'd effectively parse address fields before other field 74 // otherwise we'd effectively parse address fields before other field
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
141 AddClassification(city_, ADDRESS_HOME_CITY, map) && 139 AddClassification(city_, ADDRESS_HOME_CITY, map) &&
142 AddClassification(state_, ADDRESS_HOME_STATE, map) && 140 AddClassification(state_, ADDRESS_HOME_STATE, map) &&
143 AddClassification(zip_, ADDRESS_HOME_ZIP, map) && 141 AddClassification(zip_, ADDRESS_HOME_ZIP, map) &&
144 AddClassification(country_, ADDRESS_HOME_COUNTRY, map); 142 AddClassification(country_, ADDRESS_HOME_COUNTRY, map);
145 } 143 }
146 144
147 bool AddressField::ParseCompany(AutofillScanner* scanner) { 145 bool AddressField::ParseCompany(AutofillScanner* scanner) {
148 if (company_ && !company_->IsEmpty()) 146 if (company_ && !company_->IsEmpty())
149 return false; 147 return false;
150 148
151 return ParseField(scanner, UTF8ToUTF16(kCompanyRe), &company_); 149 return ParseField(scanner, kCompanyRe, &company_);
152 } 150 }
153 151
154 bool AddressField::ParseAddressLines(AutofillScanner* scanner) { 152 bool AddressField::ParseAddressLines(AutofillScanner* scanner) {
155 // We only match the string "address" in page text, not in element names, 153 // We only match the string "address" in page text, not in element names,
156 // because sometimes every element in a group of address fields will have 154 // because sometimes every element in a group of address fields will have
157 // a name containing the string "address"; for example, on the page 155 // a name containing the string "address"; for example, on the page
158 // Kohl's - Register Billing Address.html the text element labeled "city" 156 // Kohl's - Register Billing Address.html the text element labeled "city"
159 // has the name "BILL_TO_ADDRESS<>city". We do match address labels 157 // has the name "BILL_TO_ADDRESS<>city". We do match address labels
160 // such as "address1", which appear as element names on various pages (eg 158 // such as "address1", which appear as element names on various pages (eg
161 // AmericanGirl-Registration.html, BloomingdalesBilling.html, 159 // AmericanGirl-Registration.html, BloomingdalesBilling.html,
162 // EBay Registration Enter Information.html). 160 // EBay Registration Enter Information.html).
163 if (address1_ || street_address_) 161 if (address1_ || street_address_)
164 return false; 162 return false;
165 163
166 // Ignore "Address Lookup" field. http://crbug.com/427622 164 // Ignore "Address Lookup" field. http://crbug.com/427622
167 if (ParseField(scanner, base::UTF8ToUTF16(kAddressLookupRe), NULL)) 165 if (ParseField(scanner, kAddressLookupRe, NULL))
168 return false; 166 return false;
169 167
170 base::string16 pattern = UTF8ToUTF16(kAddressLine1Re); 168 if (!ParseFieldSpecifics(scanner, kAddressLine1Re, MATCH_DEFAULT,
171 base::string16 label_pattern = UTF8ToUTF16(kAddressLine1LabelRe);
172 if (!ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT, &address1_) &&
173 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
174 &address1_) && 169 &address1_) &&
175 !ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_TEXT_AREA, 170 !ParseFieldSpecifics(scanner, kAddressLine1LabelRe,
176 &street_address_) && 171 MATCH_LABEL | MATCH_TEXT, &address1_) &&
177 !ParseFieldSpecifics(scanner, label_pattern, 172 !ParseFieldSpecifics(scanner, kAddressLine1Re,
178 MATCH_LABEL | MATCH_TEXT_AREA, 173 MATCH_DEFAULT | MATCH_TEXT_AREA, &street_address_) &&
179 &street_address_)) 174 !ParseFieldSpecifics(scanner, kAddressLine1LabelRe,
175 MATCH_LABEL | MATCH_TEXT_AREA, &street_address_))
180 return false; 176 return false;
181 177
182 if (street_address_) 178 if (street_address_)
183 return true; 179 return true;
184 180
185 // This code may not pick up pages that have an address field consisting of a 181 // This code may not pick up pages that have an address field consisting of a
186 // sequence of unlabeled address fields. If we need to add this, see 182 // sequence of unlabeled address fields. If we need to add this, see
187 // discussion on https://codereview.chromium.org/741493003/ 183 // discussion on https://codereview.chromium.org/741493003/
188 pattern = UTF8ToUTF16(kAddressLine2Re); 184 if (!ParseField(scanner, kAddressLine2Re, &address2_) &&
189 label_pattern = UTF8ToUTF16(kAddressLine2LabelRe); 185 !ParseFieldSpecifics(scanner, kAddressLine2LabelRe,
190 if (!ParseField(scanner, pattern, &address2_) && 186 MATCH_LABEL | MATCH_TEXT, &address2_))
191 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
192 &address2_))
193 return true; 187 return true;
194 188
195 // Optionally parse address line 3. This uses the same label regexp as 189 // Optionally parse address line 3. This uses the same label regexp as
196 // address 2 above. 190 // address 2 above.
197 pattern = UTF8ToUTF16(kAddressLinesExtraRe); 191 if (!ParseField(scanner, kAddressLinesExtraRe, &address3_) &&
198 if (!ParseField(scanner, pattern, &address3_) && 192 !ParseFieldSpecifics(scanner, kAddressLinesExtraRe,
Ilya Sherman 2015/11/26 02:25:09 Please use kAddressLine2LabelRe here, as the code
tfarina 2015/11/26 14:22:27 ops, that is what happens when you try to make pat
199 !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT, 193 MATCH_LABEL | MATCH_TEXT, &address3_))
200 &address3_))
201 return true; 194 return true;
202 195
203 // Try for surplus lines, which we will promptly discard. Some pages have 4 196 // Try for surplus lines, which we will promptly discard. Some pages have 4
204 // address lines (e.g. uk/ShoesDirect2.html)! 197 // address lines (e.g. uk/ShoesDirect2.html)!
205 // 198 //
206 // Since these are rare, don't bother considering unlabeled lines as extra 199 // Since these are rare, don't bother considering unlabeled lines as extra
207 // address lines. 200 // address lines.
208 pattern = UTF8ToUTF16(kAddressLinesExtraRe); 201 while (ParseField(scanner, kAddressLinesExtraRe, NULL)) {
209 while (ParseField(scanner, pattern, NULL)) {
210 // Consumed a surplus line, try for another. 202 // Consumed a surplus line, try for another.
211 } 203 }
212 return true; 204 return true;
213 } 205 }
214 206
215 bool AddressField::ParseCountry(AutofillScanner* scanner) { 207 bool AddressField::ParseCountry(AutofillScanner* scanner) {
216 if (country_ && !country_->IsEmpty()) 208 if (country_ && !country_->IsEmpty())
217 return false; 209 return false;
218 210
219 scanner->SaveCursor(); 211 scanner->SaveCursor();
220 if (ParseFieldSpecifics(scanner, 212 if (ParseFieldSpecifics(scanner, kCountryRe, MATCH_DEFAULT | MATCH_SELECT,
221 UTF8ToUTF16(kCountryRe),
222 MATCH_DEFAULT | MATCH_SELECT,
223 &country_)) { 213 &country_)) {
224 return true; 214 return true;
225 } 215 }
226 216
227 // The occasional page (e.g. google account registration page) calls this a 217 // The occasional page (e.g. google account registration page) calls this a
228 // "location". However, this only makes sense for select tags. 218 // "location". However, this only makes sense for select tags.
229 scanner->Rewind(); 219 scanner->Rewind();
230 return ParseFieldSpecifics(scanner, 220 return ParseFieldSpecifics(scanner, kCountryLocationRe,
231 UTF8ToUTF16(kCountryLocationRe),
232 MATCH_LABEL | MATCH_NAME | MATCH_SELECT, 221 MATCH_LABEL | MATCH_NAME | MATCH_SELECT,
233 &country_); 222 &country_);
234 } 223 }
235 224
236 bool AddressField::ParseZipCode(AutofillScanner* scanner) { 225 bool AddressField::ParseZipCode(AutofillScanner* scanner) {
237 if (zip_) 226 if (zip_)
238 return false; 227 return false;
239 228
240 if (!ParseFieldSpecifics(scanner, 229 if (!ParseFieldSpecifics(scanner, kZipCodeRe, kZipCodeMatchType, &zip_)) {
241 UTF8ToUTF16(kZipCodeRe),
242 kZipCodeMatchType,
243 &zip_)) {
244 return false; 230 return false;
245 } 231 }
246 232
247 // Look for a zip+4, whose field name will also often contain 233 // Look for a zip+4, whose field name will also often contain
248 // the substring "zip". 234 // the substring "zip".
249 ParseFieldSpecifics(scanner, UTF8ToUTF16(kZip4Re), kZipCodeMatchType, &zip4_); 235 ParseFieldSpecifics(scanner, kZip4Re, kZipCodeMatchType, &zip4_);
250 return true; 236 return true;
251 } 237 }
252 238
253 bool AddressField::ParseCity(AutofillScanner* scanner) { 239 bool AddressField::ParseCity(AutofillScanner* scanner) {
254 if (city_) 240 if (city_)
255 return false; 241 return false;
256 242
257 return ParseFieldSpecifics(scanner, 243 return ParseFieldSpecifics(scanner, kCityRe, kCityMatchType, &city_);
258 UTF8ToUTF16(kCityRe),
259 kCityMatchType,
260 &city_);
261 } 244 }
262 245
263 bool AddressField::ParseState(AutofillScanner* scanner) { 246 bool AddressField::ParseState(AutofillScanner* scanner) {
264 if (state_) 247 if (state_)
265 return false; 248 return false;
266 249
267 return ParseFieldSpecifics(scanner, 250 size_t saved_cursor = scanner->SaveCursor();
Ilya Sherman 2015/11/26 02:25:09 Please add a comment above this block of code like
tfarina 2015/11/26 14:22:27 Done.
268 UTF8ToUTF16(kStateRe), 251 if (ParseFieldSpecifics(scanner, "United States", kStateMatchType, nullptr)) {
269 kStateMatchType, 252 scanner->RewindTo(saved_cursor);
270 &state_); 253 return false;
254 }
255
256 return ParseFieldSpecifics(scanner, kStateRe, kStateMatchType, &state_);
271 } 257 }
272 258
273 bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) { 259 bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) {
274 // Simple cases. 260 // Simple cases.
275 if (scanner->IsEnd()) 261 if (scanner->IsEnd())
276 return false; 262 return false;
277 if (city_ && state_ && zip_) 263 if (city_ && state_ && zip_)
278 return false; 264 return false;
279 if (state_ && zip_) 265 if (state_ && zip_)
280 return ParseCity(scanner); 266 return ParseCity(scanner);
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
322 308
323 return false; 309 return false;
324 } 310 }
325 311
326 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode( 312 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode(
327 AutofillScanner* scanner) { 313 AutofillScanner* scanner) {
328 if (zip_) 314 if (zip_)
329 return RESULT_MATCH_NONE; 315 return RESULT_MATCH_NONE;
330 316
331 ParseNameLabelResult result = ParseNameAndLabelSeparately( 317 ParseNameLabelResult result = ParseNameAndLabelSeparately(
332 scanner, UTF8ToUTF16(kZipCodeRe), kZipCodeMatchType, &zip_); 318 scanner, kZipCodeRe, kZipCodeMatchType, &zip_);
333 319
334 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd()) 320 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd())
335 return result; 321 return result;
336 322
337 size_t saved_cursor = scanner->SaveCursor(); 323 size_t saved_cursor = scanner->SaveCursor();
338 bool found_non_zip4 = ParseCity(scanner); 324 bool found_non_zip4 = ParseCity(scanner);
339 if (found_non_zip4) 325 if (found_non_zip4)
340 city_ = nullptr; 326 city_ = nullptr;
341 scanner->RewindTo(saved_cursor); 327 scanner->RewindTo(saved_cursor);
342 if (!found_non_zip4) { 328 if (!found_non_zip4) {
343 found_non_zip4 = ParseState(scanner); 329 found_non_zip4 = ParseState(scanner);
344 if (found_non_zip4) 330 if (found_non_zip4)
345 state_ = nullptr; 331 state_ = nullptr;
346 scanner->RewindTo(saved_cursor); 332 scanner->RewindTo(saved_cursor);
347 } 333 }
348 334
349 if (!found_non_zip4) { 335 if (!found_non_zip4) {
350 // Look for a zip+4, whose field name will also often contain 336 // Look for a zip+4, whose field name will also often contain
351 // the substring "zip". 337 // the substring "zip".
352 ParseFieldSpecifics(scanner, 338 ParseFieldSpecifics(scanner, kZip4Re, kZipCodeMatchType, &zip4_);
353 UTF8ToUTF16(kZip4Re),
354 kZipCodeMatchType,
355 &zip4_);
356 } 339 }
357 return result; 340 return result;
358 } 341 }
359 342
360 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity( 343 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity(
361 AutofillScanner* scanner) { 344 AutofillScanner* scanner) {
362 if (city_) 345 if (city_)
363 return RESULT_MATCH_NONE; 346 return RESULT_MATCH_NONE;
364 347
365 return ParseNameAndLabelSeparately( 348 return ParseNameAndLabelSeparately(scanner, kCityRe, kCityMatchType, &city_);
366 scanner, UTF8ToUTF16(kCityRe), kCityMatchType, &city_);
367 } 349 }
368 350
369 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState( 351 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState(
370 AutofillScanner* scanner) { 352 AutofillScanner* scanner) {
371 if (state_) 353 if (state_)
372 return RESULT_MATCH_NONE; 354 return RESULT_MATCH_NONE;
373 355
374 return ParseNameAndLabelSeparately( 356 size_t saved_cursor = scanner->SaveCursor();
375 scanner, UTF8ToUTF16(kStateRe), kStateMatchType, &state_); 357 ParseNameLabelResult result = ParseNameAndLabelSeparately(
358 scanner, "United States", kStateMatchType, nullptr);
359
360 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd())
361 return result;
362 scanner->RewindTo(saved_cursor);
Ilya Sherman 2015/11/26 02:25:09 This logic is not correct. I still think it shoul
tfarina 2015/11/26 14:22:27 Done.
363
364 return ParseNameAndLabelSeparately(scanner, kStateRe, kStateMatchType,
365 &state_);
376 } 366 }
377 367
378 } // namespace autofill 368 } // namespace autofill
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698