Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(455)

Side by Side Diff: components/autofill/core/browser/address_field.cc

Issue 1028633004: Autofill: Improve heuristics for city/state/zip fields. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address comment Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/autofill/core/browser/address_field.h" 5 #include "components/autofill/core/browser/address_field.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h" 10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string16.h" 11 #include "base/strings/string16.h"
12 #include "base/strings/string_util.h" 12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h" 13 #include "base/strings/utf_string_conversions.h"
14 #include "components/autofill/core/browser/autofill_field.h" 14 #include "components/autofill/core/browser/autofill_field.h"
15 #include "components/autofill/core/browser/autofill_regex_constants.h" 15 #include "components/autofill/core/browser/autofill_regex_constants.h"
16 #include "components/autofill/core/browser/autofill_scanner.h" 16 #include "components/autofill/core/browser/autofill_scanner.h"
17 #include "components/autofill/core/browser/field_types.h" 17 #include "components/autofill/core/browser/field_types.h"
18 18
19 using base::UTF8ToUTF16; 19 using base::UTF8ToUTF16;
20 20
21 namespace autofill { 21 namespace autofill {
22 22
23 namespace {
24
// Stores the field returned by scanner->Cursor() in |*field|, then calls
// scanner->Advance(). Always returns true, so a caller can `return` the call
// directly to report that the field was consumed.
25 bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) {
26 *field = scanner->Cursor();
27 scanner->Advance();
28 return true;
29 }
30
31 } // namespace
32
// Match flags for zip and zip+4 fields; used by ParseZipCode() and
// ParseNameAndLabelForZipCode().
33 // Some sites use type="tel" for zip fields (to get a numerical input).
34 // http://crbug.com/426958
35 const int AddressField::kZipCodeMatchType =
36 MATCH_DEFAULT | MATCH_TELEPHONE | MATCH_NUMBER;
37
// Match flags for city fields; used by ParseCity() and
// ParseNameAndLabelForCity().
38 // Select fields are allowed here. This occurs on top-100 site rediff.com.
39 const int AddressField::kCityMatchType = MATCH_DEFAULT | MATCH_SELECT;
40
// Match flags for state fields; used by ParseState() and
// ParseNameAndLabelForState(). Same flag combination as kCityMatchType, so
// select fields are allowed here as well.
41 const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT;
42
23 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) { 43 scoped_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) {
24 if (scanner->IsEnd()) 44 if (scanner->IsEnd())
25 return NULL; 45 return NULL;
26 46
27 scoped_ptr<AddressField> address_field(new AddressField); 47 scoped_ptr<AddressField> address_field(new AddressField);
28 const AutofillField* const initial_field = scanner->Cursor(); 48 const AutofillField* const initial_field = scanner->Cursor();
29 size_t saved_cursor = scanner->SaveCursor(); 49 size_t saved_cursor = scanner->SaveCursor();
30 50
31 base::string16 attention_ignored = UTF8ToUTF16(kAttentionIgnoredRe); 51 base::string16 attention_ignored = UTF8ToUTF16(kAttentionIgnoredRe);
32 base::string16 region_ignored = UTF8ToUTF16(kRegionIgnoredRe); 52 base::string16 region_ignored = UTF8ToUTF16(kRegionIgnoredRe);
33 53
34 // Allow address fields to appear in any order. 54 // Allow address fields to appear in any order.
35 size_t begin_trailing_non_labeled_fields = 0; 55 size_t begin_trailing_non_labeled_fields = 0;
36 bool has_trailing_non_labeled_fields = false; 56 bool has_trailing_non_labeled_fields = false;
37 while (!scanner->IsEnd()) { 57 while (!scanner->IsEnd()) {
38 const size_t cursor = scanner->SaveCursor(); 58 const size_t cursor = scanner->SaveCursor();
39 if (address_field->ParseAddressLines(scanner) || 59 if (address_field->ParseAddressLines(scanner) ||
40 address_field->ParseCity(scanner) || 60 address_field->ParseCityStateZipCode(scanner) ||
41 address_field->ParseState(scanner) ||
42 address_field->ParseZipCode(scanner) ||
43 address_field->ParseCountry(scanner) || 61 address_field->ParseCountry(scanner) ||
44 address_field->ParseCompany(scanner)) { 62 address_field->ParseCompany(scanner)) {
45 has_trailing_non_labeled_fields = false; 63 has_trailing_non_labeled_fields = false;
46 continue; 64 continue;
47 } else if (ParseField(scanner, attention_ignored, NULL) || 65 } else if (ParseField(scanner, attention_ignored, NULL) ||
48 ParseField(scanner, region_ignored, NULL)) { 66 ParseField(scanner, region_ignored, NULL)) {
49 // We ignore the following: 67 // We ignore the following:
50 // * Attention. 68 // * Attention.
51 // * Province/Region/Other. 69 // * Province/Region/Other.
52 continue; 70 continue;
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after
209 // The occasional page (e.g. google account registration page) calls this a 227 // The occasional page (e.g. google account registration page) calls this a
210 // "location". However, this only makes sense for select tags. 228 // "location". However, this only makes sense for select tags.
211 scanner->Rewind(); 229 scanner->Rewind();
212 return ParseFieldSpecifics(scanner, 230 return ParseFieldSpecifics(scanner,
213 UTF8ToUTF16(kCountryLocationRe), 231 UTF8ToUTF16(kCountryLocationRe),
214 MATCH_LABEL | MATCH_NAME | MATCH_SELECT, 232 MATCH_LABEL | MATCH_NAME | MATCH_SELECT,
215 &country_); 233 &country_);
216 } 234 }
217 235
// Tries to record a zip code field in |zip_|.
//
// Returns false without calling the matcher when |zip_| is already set, and
// false when ParseFieldSpecifics() does not match kZipCodeRe. Otherwise
// returns true. After a successful zip match a second ParseFieldSpecifics()
// call looks for a zip+4 field (kZip4Re -> |zip4_|); its result is ignored,
// so the return value does not depend on whether a zip+4 field exists.
//
// In this diff the new column replaces the inline
// MATCH_DEFAULT | MATCH_TELEPHONE flags with kZipCodeMatchType.
218 bool AddressField::ParseZipCode(AutofillScanner* scanner) { 236 bool AddressField::ParseZipCode(AutofillScanner* scanner) {
219 // Parse a zip code. On some UK pages (e.g. The China Shop2.html) this
220 // is called a "post code".
221 if (zip_) 237 if (zip_)
222 return false; 238 return false;
223 239
224 // Some sites use type="tel" for zip fields (to get a numerical input).
225 // http://crbug.com/426958
226 if (!ParseFieldSpecifics(scanner, 240 if (!ParseFieldSpecifics(scanner,
227 UTF8ToUTF16(kZipCodeRe), 241 UTF8ToUTF16(kZipCodeRe),
228 MATCH_DEFAULT | MATCH_TELEPHONE, 242 kZipCodeMatchType,
229 &zip_)) { 243 &zip_)) {
230 return false; 244 return false;
231 } 245 }
232 246
233 // Look for a zip+4, whose field name will also often contain 247 // Look for a zip+4, whose field name will also often contain
234 // the substring "zip". 248 // the substring "zip".
235 ParseFieldSpecifics(scanner, 249 ParseFieldSpecifics(scanner, UTF8ToUTF16(kZip4Re), kZipCodeMatchType, &zip4_);
236 UTF8ToUTF16(kZip4Re),
237 MATCH_DEFAULT | MATCH_TELEPHONE,
238 &zip4_);
239 return true; 250 return true;
240 } 251 }
241 252
// Tries to record a city field in |city_|.
//
// Returns false without calling the matcher when |city_| is already set;
// otherwise returns the result of ParseFieldSpecifics() for kCityRe. In this
// diff the new column replaces the inline MATCH_DEFAULT | MATCH_SELECT flags
// with kCityMatchType.
242 bool AddressField::ParseCity(AutofillScanner* scanner) { 253 bool AddressField::ParseCity(AutofillScanner* scanner) {
243 // Parse a city name. Some UK pages (e.g. The China Shop2.html) use
244 // the term "town".
245 if (city_) 254 if (city_)
246 return false; 255 return false;
247 256
248 // Select fields are allowed here. This occurs on top-100 site rediff.com.
249 return ParseFieldSpecifics(scanner, 257 return ParseFieldSpecifics(scanner,
250 UTF8ToUTF16(kCityRe), 258 UTF8ToUTF16(kCityRe),
251 MATCH_DEFAULT | MATCH_SELECT, 259 kCityMatchType,
252 &city_); 260 &city_);
253 } 261 }
254 262
// Tries to record a state field in |state_|.
//
// Returns false without calling the matcher when |state_| is already set;
// otherwise returns the result of ParseFieldSpecifics() for kStateRe. In this
// diff the new column replaces the inline MATCH_DEFAULT | MATCH_SELECT flags
// with kStateMatchType.
255 bool AddressField::ParseState(AutofillScanner* scanner) { 263 bool AddressField::ParseState(AutofillScanner* scanner) {
256 if (state_) 264 if (state_)
257 return false; 265 return false;
258 266
259 return ParseFieldSpecifics(scanner, 267 return ParseFieldSpecifics(scanner,
260 UTF8ToUTF16(kStateRe), 268 UTF8ToUTF16(kStateRe),
261 MATCH_DEFAULT | MATCH_SELECT, 269 kStateMatchType,
262 &state_); 270 &state_);
263 } 271 }
264 272
// Decides whether the current field is a city, a state or a zip code, and
// records it in the matching member. Returns true if one of the three was
// assigned, false otherwise.
//
// Decision order, as implemented below:
//  1. Nothing to do if the scanner is at the end or all three members are
//     already set.
//  2. If exactly two of the three are set, only the remaining one is tried,
//     via its plain Parse*() method.
//  3. Otherwise the ParseNameAndLabelFor*() helpers are called in the order
//     city, state, zip. The first one that reports RESULT_MATCH_NAME_LABEL
//     wins, and the helpers after it are not called.
//  4. If exactly one helper reported anything other than RESULT_MATCH_NONE,
//     that type wins.
//  5. Otherwise a RESULT_MATCH_NAME result beats a RESULT_MATCH_LABEL
//     result, with ties resolved in city, state, zip order.
//
// NOTE(review): steps 4 and 5 assign the cursor field themselves, which
// presumes the helpers neither set the member nor advance the scanner unless
// they return RESULT_MATCH_NAME_LABEL -- confirm against
// ParseNameAndLabelSeparately().
273 bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) {
274 // Simple cases.
275 if (scanner->IsEnd())
276 return false;
277 if (city_ && state_ && zip_)
278 return false;
279 if (state_ && zip_)
280 return ParseCity(scanner);
281 if (city_ && zip_)
282 return ParseState(scanner);
283 if (city_ && state_)
284 return ParseZipCode(scanner);
285
286 // Check for matches to both name and label.
287 ParseNameLabelResult city_result = ParseNameAndLabelForCity(scanner);
288 if (city_result == RESULT_MATCH_NAME_LABEL)
289 return true;
290 ParseNameLabelResult state_result = ParseNameAndLabelForState(scanner);
291 if (state_result == RESULT_MATCH_NAME_LABEL)
292 return true;
293 ParseNameLabelResult zip_result = ParseNameAndLabelForZipCode(scanner);
294 if (zip_result == RESULT_MATCH_NAME_LABEL)
295 return true;
296
297 // Check if there is only one potential match.
298 bool maybe_city = city_result != RESULT_MATCH_NONE;
299 bool maybe_state = state_result != RESULT_MATCH_NONE;
300 bool maybe_zip = zip_result != RESULT_MATCH_NONE;
301 if (maybe_city && !maybe_state && !maybe_zip)
302 return SetFieldAndAdvanceCursor(scanner, &city_);
303 if (maybe_state && !maybe_city && !maybe_zip)
304 return SetFieldAndAdvanceCursor(scanner, &state_);
// Unlike city and state, a zip candidate is re-parsed through ParseZipCode(),
// which also looks for a following zip+4 field.
305 if (maybe_zip && !maybe_city && !maybe_state)
306 return ParseZipCode(scanner);
307
308 // Otherwise give name priority over label.
309 if (city_result == RESULT_MATCH_NAME)
310 return SetFieldAndAdvanceCursor(scanner, &city_);
311 if (state_result == RESULT_MATCH_NAME)
312 return SetFieldAndAdvanceCursor(scanner, &state_);
313 if (zip_result == RESULT_MATCH_NAME)
314 return ParseZipCode(scanner);
315
316 if (city_result == RESULT_MATCH_LABEL)
317 return SetFieldAndAdvanceCursor(scanner, &city_);
318 if (state_result == RESULT_MATCH_LABEL)
319 return SetFieldAndAdvanceCursor(scanner, &state_);
320 if (zip_result == RESULT_MATCH_LABEL)
321 return ParseZipCode(scanner);
322
323 return false;
324 }
325
// Zip-code variant of the name/label matchers.
//
// Returns RESULT_MATCH_NONE when |zip_| is already set. Otherwise returns
// whatever ParseNameAndLabelSeparately() reports for kZipCodeRe with
// kZipCodeMatchType and |zip_|.
//
// Only when that result is RESULT_MATCH_NAME_LABEL and the scanner is not at
// the end does the function go on to look for a zip+4 field. Before doing
// so it checks that the next field does not match as a city or a state.
// NOTE(review): judging by the name |found_non_zip4|, the intent is to keep a
// following city/state field from being consumed as zip+4 -- confirm.
326 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode(
327 AutofillScanner* scanner) {
328 if (zip_)
329 return RESULT_MATCH_NONE;
330
331 ParseNameLabelResult result = ParseNameAndLabelSeparately(
332 scanner, UTF8ToUTF16(kZipCodeRe), kZipCodeMatchType, &zip_);
333
334 if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd())
335 return result;
336
// ParseCity() and ParseState() are used purely as probes here: any member
// they set is reset to nullptr and the cursor is rewound to |saved_cursor|.
// Both return false immediately if their member is already set, in which
// case that probe cannot report a match.
337 size_t saved_cursor = scanner->SaveCursor();
338 bool found_non_zip4 = ParseCity(scanner);
339 if (found_non_zip4)
340 city_ = nullptr;
341 scanner->RewindTo(saved_cursor);
342 if (!found_non_zip4) {
343 found_non_zip4 = ParseState(scanner);
344 if (found_non_zip4)
345 state_ = nullptr;
346 scanner->RewindTo(saved_cursor);
347 }
348
349 if (!found_non_zip4) {
350 // Look for a zip+4, whose field name will also often contain
351 // the substring "zip".
352 ParseFieldSpecifics(scanner,
353 UTF8ToUTF16(kZip4Re),
354 kZipCodeMatchType,
355 &zip4_);
356 }
// The zip+4 lookup never changes the reported result.
357 return result;
358 }
359
// City variant of the name/label matchers. Returns RESULT_MATCH_NONE when
// |city_| is already set; otherwise forwards to ParseNameAndLabelSeparately()
// with kCityRe, kCityMatchType and |city_|, and returns its result unchanged.
360 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity(
361 AutofillScanner* scanner) {
362 if (city_)
363 return RESULT_MATCH_NONE;
364
365 return ParseNameAndLabelSeparately(
366 scanner, UTF8ToUTF16(kCityRe), kCityMatchType, &city_);
367 }
368
// State variant of the name/label matchers. Returns RESULT_MATCH_NONE when
// |state_| is already set; otherwise forwards to
// ParseNameAndLabelSeparately() with kStateRe, kStateMatchType and |state_|,
// and returns its result unchanged.
369 AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState(
370 AutofillScanner* scanner) {
371 if (state_)
372 return RESULT_MATCH_NONE;
373
374 return ParseNameAndLabelSeparately(
375 scanner, UTF8ToUTF16(kStateRe), kStateMatchType, &state_);
376 }
377
265 } // namespace autofill 378 } // namespace autofill
OLDNEW
« no previous file with comments | « components/autofill/core/browser/address_field.h ('k') | components/autofill/core/browser/form_field.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698