Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(443)

Side by Side Diff: components/autofill/core/browser/address_field.cc

Issue 1028633004: Autofill: Improve heuristics for city/state/zip fields. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/autofill/core/browser/address_field.h" 5 #include "components/autofill/core/browser/address_field.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h" 10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string16.h" 11 #include "base/strings/string16.h"
12 #include "base/strings/string_util.h" 12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h" 13 #include "base/strings/utf_string_conversions.h"
14 #include "components/autofill/core/browser/autofill_field.h" 14 #include "components/autofill/core/browser/autofill_field.h"
15 #include "components/autofill/core/browser/autofill_regex_constants.h" 15 #include "components/autofill/core/browser/autofill_regex_constants.h"
16 #include "components/autofill/core/browser/autofill_scanner.h" 16 #include "components/autofill/core/browser/autofill_scanner.h"
17 #include "components/autofill/core/browser/field_types.h" 17 #include "components/autofill/core/browser/field_types.h"
18 18
19 using base::UTF8ToUTF16; 19 using base::UTF8ToUTF16;
20 20
21 namespace autofill { 21 namespace autofill {
22 22
23 namespace {
24
25 bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) {
26 *field = scanner->Cursor();
27 scanner->Advance();
28 return true;
29 }
30
31 } // namespace
32
33 // Some sites use type="tel" for zip fields (to get a numerical input).
34 // http://crbug.com/426958
35 // static
Evan Stade 2015/03/24 00:04:32 don't think the // static notation is useful for c
Lei Zhang 2015/03/25 00:42:28 Done.
36 const int AddressField::kZipCodeMatchType =
37 MATCH_DEFAULT | MATCH_TELEPHONE | MATCH_NUMBER;
38
39 // Select fields are allowed here. This occurs on top-100 site rediff.com.
40 // static
41 const int AddressField::kCityMatchType = MATCH_DEFAULT | MATCH_SELECT;
42
43 // static
44 const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT;
45
23 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) { 46 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) {
24 if (scanner->IsEnd()) 47 if (scanner->IsEnd())
25 return NULL; 48 return NULL;
26 49
27 scoped_ptr<AddressField> address_field(new AddressField); 50 scoped_ptr<AddressField> address_field(new AddressField);
28 const AutofillField* const initial_field = scanner->Cursor(); 51 const AutofillField* const initial_field = scanner->Cursor();
29 size_t saved_cursor = scanner->SaveCursor(); 52 size_t saved_cursor = scanner->SaveCursor();
30 53
31 base::string16 attention_ignored = UTF8ToUTF16(kAttentionIgnoredRe); 54 base::string16 attention_ignored = UTF8ToUTF16(kAttentionIgnoredRe);
32 base::string16 region_ignored = UTF8ToUTF16(kRegionIgnoredRe); 55 base::string16 region_ignored = UTF8ToUTF16(kRegionIgnoredRe);
33 56
34 // Allow address fields to appear in any order. 57 // Allow address fields to appear in any order.
35 size_t begin_trailing_non_labeled_fields = 0; 58 size_t begin_trailing_non_labeled_fields = 0;
36 bool has_trailing_non_labeled_fields = false; 59 bool has_trailing_non_labeled_fields = false;
37 while (!scanner->IsEnd()) { 60 while (!scanner->IsEnd()) {
38 const size_t cursor = scanner->SaveCursor(); 61 const size_t cursor = scanner->SaveCursor();
39 if (address_field->ParseAddressLines(scanner) || 62 if (address_field->ParseAddressLines(scanner) ||
40 address_field->ParseCity(scanner) || 63 address_field->ParseCityStateZipCode(scanner) ||
41 address_field->ParseState(scanner) ||
42 address_field->ParseZipCode(scanner) ||
43 address_field->ParseCountry(scanner) || 64 address_field->ParseCountry(scanner) ||
44 address_field->ParseCompany(scanner)) { 65 address_field->ParseCompany(scanner)) {
45 has_trailing_non_labeled_fields = false; 66 has_trailing_non_labeled_fields = false;
46 continue; 67 continue;
47 } else if (ParseField(scanner, attention_ignored, NULL) || 68 } else if (ParseField(scanner, attention_ignored, NULL) ||
48 ParseField(scanner, region_ignored, NULL)) { 69 ParseField(scanner, region_ignored, NULL)) {
49 // We ignore the following: 70 // We ignore the following:
50 // * Attention. 71 // * Attention.
51 // * Province/Region/Other. 72 // * Province/Region/Other.
52 continue; 73 continue;
(...skipping 161 matching lines...) Expand 10 before | Expand all | Expand 10 after
214 MATCH_LABEL | MATCH_NAME | MATCH_SELECT, 235 MATCH_LABEL | MATCH_NAME | MATCH_SELECT,
215 &country_); 236 &country_);
216 } 237 }
217 238
218 bool AddressField::ParseZipCode(AutofillScanner* scanner) { 239 bool AddressField::ParseZipCode(AutofillScanner* scanner) {
219 // Parse a zip code. On some UK pages (e.g. The China Shop2.html) this 240 // Parse a zip code. On some UK pages (e.g. The China Shop2.html) this
220 // is called a "post code". 241 // is called a "post code".
221 if (zip_) 242 if (zip_)
222 return false; 243 return false;
223 244
224 // Some sites use type="tel" for zip fields (to get a numerical input).
225 // http://crbug.com/426958
226 if (!ParseFieldSpecifics(scanner, 245 if (!ParseFieldSpecifics(scanner,
227 UTF8ToUTF16(kZipCodeRe), 246 UTF8ToUTF16(kZipCodeRe),
228 MATCH_DEFAULT | MATCH_TELEPHONE, 247 kZipCodeMatchType,
229 &zip_)) { 248 &zip_)) {
230 return false; 249 return false;
231 } 250 }
232 251
233 // Look for a zip+4, whose field name will also often contain 252 // Look for a zip+4, whose field name will also often contain
234 // the substring "zip". 253 // the substring "zip".
235 ParseFieldSpecifics(scanner, 254 ParseFieldSpecifics(scanner, UTF8ToUTF16(kZip4Re), kZipCodeMatchType, &zip4_);
236 UTF8ToUTF16(kZip4Re),
237 MATCH_DEFAULT | MATCH_TELEPHONE,
238 &zip4_);
239 return true; 255 return true;
240 } 256 }
241 257
242 bool AddressField::ParseCity(AutofillScanner* scanner) { 258 bool AddressField::ParseCity(AutofillScanner* scanner) {
243 // Parse a city name. Some UK pages (e.g. The China Shop2.html) use 259 // Parse a city name. Some UK pages (e.g. The China Shop2.html) use
244 // the term "town". 260 // the term "town".
245 if (city_) 261 if (city_)
246 return false; 262 return false;
247 263
248 // Select fields are allowed here. This occurs on top-100 site rediff.com.
249 return ParseFieldSpecifics(scanner, 264 return ParseFieldSpecifics(scanner,
250 UTF8ToUTF16(kCityRe), 265 UTF8ToUTF16(kCityRe),
251 MATCH_DEFAULT | MATCH_SELECT, 266 kCityMatchType,
252 &city_); 267 &city_);
253 } 268 }
254 269
255 bool AddressField::ParseState(AutofillScanner* scanner) { 270 bool AddressField::ParseState(AutofillScanner* scanner) {
256 if (state_) 271 if (state_)
257 return false; 272 return false;
258 273
259 return ParseFieldSpecifics(scanner, 274 return ParseFieldSpecifics(scanner,
260 UTF8ToUTF16(kStateRe), 275 UTF8ToUTF16(kStateRe),
261 MATCH_DEFAULT | MATCH_SELECT, 276 kStateMatchType,
262 &state_); 277 &state_);
263 } 278 }
264 279
280 bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) {
281 // Simple cases.
282 if (scanner->IsEnd())
283 return false;
284 if (city_ && state_ && zip_)
285 return false;
286 if (state_ && zip_)
287 return ParseCity(scanner);
288 if (city_ && zip_)
289 return ParseState(scanner);
290 if (city_ && state_)
291 return ParseZipCode(scanner);
292
293 // Check for name + label matches.
Evan Stade 2015/03/24 00:04:32 nit: "Check for matches to both name and label."
Lei Zhang 2015/03/25 00:42:28 Done.
294 ParseNameLabelResult city_result = ParseNameAndLabelForCity(scanner);
295 if (city_result == RESULT_MATCH_NAME_LABEL)
296 return true;
297 ParseNameLabelResult state_result = ParseNameAndLabelForState(scanner);
298 if (state_result == RESULT_MATCH_NAME_LABEL)
299 return true;
300 ParseNameLabelResult zip_result = ParseNameAndLabelForZipCode(scanner);
301 if (zip_result == RESULT_MATCH_NAME_LABEL)
302 return true;
303
304 // Check if there is only one potential match.
305 bool maybe_city = (city_result != RESULT_MATCH_NONE);
Evan Stade 2015/03/24 00:04:32 nit: remove excess parens
Lei Zhang 2015/03/25 00:42:28 Done.
306 bool maybe_state = (state_result != RESULT_MATCH_NONE);
307 bool maybe_zip = (zip_result != RESULT_MATCH_NONE);
308 if (maybe_city && !maybe_state && !maybe_zip)
309 return SetFieldAndAdvanceCursor(scanner, &city_);
310 if (maybe_state && !maybe_city && !maybe_zip)
311 return SetFieldAndAdvanceCursor(scanner, &state_);
312 if (maybe_zip && !maybe_city && !maybe_state)
313 return ParseZipCode(scanner);
314
315 // Otherwise give name priority over label.
316 if (city_result == RESULT_MATCH_NAME)
317 return SetFieldAndAdvanceCursor(scanner, &city_);
318 if (state_result == RESULT_MATCH_NAME)
319 return SetFieldAndAdvanceCursor(scanner, &state_);
320 if (zip_result == RESULT_MATCH_NAME)
321 return ParseZipCode(scanner);
322
323 if (city_result == RESULT_MATCH_LABEL)
324 return SetFieldAndAdvanceCursor(scanner, &city_);
325 if (state_result == RESULT_MATCH_LABEL)
326 return SetFieldAndAdvanceCursor(scanner, &state_);
327 if (zip_result == RESULT_MATCH_LABEL)
328 return ParseZipCode(scanner);
329
330 return false;
331 }
332
333 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode(
334 AutofillScanner* scanner) {
335 // Parse a zip code. On some UK pages (e.g. The China Shop2.html) this
336 // is called a "post code".
337 if (zip_)
338 return RESULT_MATCH_NONE;
339
340 ParseNameLabelResult result = ParseNameAndLabelSeparately(
341 scanner, UTF8ToUTF16(kZipCodeRe), kZipCodeMatchType, &zip_);
342
343 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd())
344 return result;
345
346 size_t saved_cursor = scanner->SaveCursor();
347 bool found_non_zip4 = ParseCity(scanner);
348 if (found_non_zip4)
349 city_ = nullptr;
350 scanner->RewindTo(saved_cursor);
351 if (!found_non_zip4) {
352 found_non_zip4 = ParseState(scanner);
353 if (found_non_zip4)
354 state_ = nullptr;
355 scanner->RewindTo(saved_cursor);
356 }
357
358 if (!found_non_zip4) {
359 // Look for a zip+4, whose field name will also often contain
360 // the substring "zip".
361 ParseFieldSpecifics(scanner,
362 UTF8ToUTF16(kZip4Re),
363 kZipCodeMatchType,
364 &zip4_);
365 }
366 return result;
367 }
368
369 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity(
370 AutofillScanner* scanner) {
371 // Parse a city name. Some UK pages (e.g. The China Shop2.html) use
Evan Stade 2015/03/24 00:04:32 I dunno where this comment belongs, but not here
Lei Zhang 2015/03/25 00:42:28 That's because it started it out as a copy + paste
372 // the term "town".
373 if (city_)
374 return RESULT_MATCH_NONE;
375
376 // Select fields are allowed here. This occurs on top-100 site rediff.com.
Evan Stade 2015/03/24 00:04:32 repeated comment
Lei Zhang 2015/03/25 00:42:28 deleted
377 return ParseNameAndLabelSeparately(
378 scanner, UTF8ToUTF16(kCityRe), kCityMatchType, &city_);
379 }
380
381 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState(
382 AutofillScanner* scanner) {
383 if (state_)
384 return RESULT_MATCH_NONE;
385
386 return ParseNameAndLabelSeparately(
387 scanner, UTF8ToUTF16(kStateRe), kStateMatchType, &state_);
388 }
389
265 } // namespace autofill 390 } // namespace autofill
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698