Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(62)

Side by Side Diff: chrome/browser/autofill/address_field.cc

Issue 7014011: Change heuristic regex and order to match grabber-continental. (Closed) Base URL: http://git.chromium.org/git/chromium.git@trunk
Patch Set: Use bit pattern version. Created 9 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/autofill/address_field.h" 5 #include "chrome/browser/autofill/address_field.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h" 10 #include "base/memory/scoped_ptr.h"
11 #include "base/string16.h" 11 #include "base/string16.h"
12 #include "base/string_util.h" 12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h" 13 #include "base/utf_string_conversions.h"
14 #include "chrome/browser/autofill/autofill_field.h" 14 #include "chrome/browser/autofill/autofill_field.h"
15 #include "chrome/browser/autofill/autofill_scanner.h" 15 #include "chrome/browser/autofill/autofill_scanner.h"
16 #include "chrome/browser/autofill/email_field.h"
17 #include "chrome/browser/autofill/phone_field.h"
16 #include "grit/autofill_resources.h" 18 #include "grit/autofill_resources.h"
17 #include "ui/base/l10n/l10n_util.h" 19 #include "ui/base/l10n/l10n_util.h"
18 20
19 bool AddressField::GetFieldInfo(FieldTypeMap* field_type_map) const { 21 bool AddressField::GetFieldInfo(FieldTypeMap* field_type_map) const {
20 AutofillFieldType address_company; 22 AutofillFieldType address_company;
21 AutofillFieldType address_line1; 23 AutofillFieldType address_line1;
22 AutofillFieldType address_line2; 24 AutofillFieldType address_line2;
23 AutofillFieldType address_city; 25 AutofillFieldType address_city;
24 AutofillFieldType address_state; 26 AutofillFieldType address_state;
25 AutofillFieldType address_zip; 27 AutofillFieldType address_zip;
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
71 const AutofillField* initial_field = scanner->Cursor(); 73 const AutofillField* initial_field = scanner->Cursor();
72 scanner->SaveCursor(); 74 scanner->SaveCursor();
73 75
74 string16 attention_ignored = 76 string16 attention_ignored =
75 l10n_util::GetStringUTF16(IDS_AUTOFILL_ATTENTION_IGNORED_RE); 77 l10n_util::GetStringUTF16(IDS_AUTOFILL_ATTENTION_IGNORED_RE);
76 string16 region_ignored = 78 string16 region_ignored =
77 l10n_util::GetStringUTF16(IDS_AUTOFILL_REGION_IGNORED_RE); 79 l10n_util::GetStringUTF16(IDS_AUTOFILL_REGION_IGNORED_RE);
78 80
79 // Allow address fields to appear in any order. 81 // Allow address fields to appear in any order.
80 while (!scanner->IsEnd()) { 82 while (!scanner->IsEnd()) {
83 // On every iteration, first try to parse an email or phone field. If one
84 // is found, break out of the loop even in the middle of an address, because
85 // email and phone take priority over address. The |scanner| position must
86 // be rewound after each email and phone parse attempt.
87 size_t cursor_position = scanner->SaveCursor();
88 bool is_email = EmailField::Parse(scanner, is_ecml);
89 scanner->RewindTo(cursor_position);
90 if (is_email)
91 break;
92
93 cursor_position = scanner->SaveCursor();
94 bool is_phone = PhoneField::Parse(scanner, is_ecml);
95 scanner->RewindTo(cursor_position);
96 if (is_phone)
97 break;
98
81 if (ParseCompany(scanner, is_ecml, address_field.get()) || 99 if (ParseCompany(scanner, is_ecml, address_field.get()) ||
82 ParseAddressLines(scanner, is_ecml, address_field.get()) || 100 ParseAddressLines(scanner, is_ecml, address_field.get()) ||
83 ParseCity(scanner, is_ecml, address_field.get()) || 101 ParseCity(scanner, is_ecml, address_field.get()) ||
84 ParseState(scanner, is_ecml, address_field.get()) || 102 ParseState(scanner, is_ecml, address_field.get()) ||
85 ParseZipCode(scanner, is_ecml, address_field.get()) || 103 ParseZipCode(scanner, is_ecml, address_field.get()) ||
86 ParseCountry(scanner, is_ecml, address_field.get())) { 104 ParseCountry(scanner, is_ecml, address_field.get())) {
87 continue; 105 continue;
88 } else if (ParseText(scanner, attention_ignored) || 106 } else if (ParseText(scanner, attention_ignored,
89 ParseText(scanner, region_ignored)) { 107 MATCH_NAME | MATCH_LABEL | MATCH_TEXT) ||
108 ParseText(scanner, region_ignored,
109 MATCH_NAME | MATCH_LABEL | MATCH_TEXT)) {
90 // We ignore the following: 110 // We ignore the following:
91 // * Attention. 111 // * Attention.
92 // * Province/Region/Other. 112 // * Province/Region/Other.
93 continue; 113 continue;
94 } else if (scanner->Cursor() != initial_field && ParseEmpty(scanner)) { 114 } else if (scanner->Cursor() != initial_field && ParseEmpty(scanner)) {
95 // Ignore non-labeled fields within an address; the page 115 // Ignore non-labeled fields within an address; the page
96 // MapQuest Driving Directions North America.html contains such a field. 116 // MapQuest Driving Directions North America.html contains such a field.
97 // We only ignore such fields after we've parsed at least one other field; 117 // We only ignore such fields after we've parsed at least one other field;
98 // otherwise we'd effectively parse address fields before other field 118 // otherwise we'd effectively parse address fields before other field
99 // types after any non-labeled fields, and we want email address fields to 119 // types after any non-labeled fields, and we want email address fields to
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
159 return false; 179 return false;
160 180
161 string16 pattern; 181 string16 pattern;
162 if (is_ecml) { 182 if (is_ecml) {
163 pattern = GetEcmlPattern(kEcmlShipToCompanyName, 183 pattern = GetEcmlPattern(kEcmlShipToCompanyName,
164 kEcmlBillToCompanyName, '|'); 184 kEcmlBillToCompanyName, '|');
165 } else { 185 } else {
166 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_COMPANY_RE); 186 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_COMPANY_RE);
167 } 187 }
168 188
169 return ParseText(scanner, pattern, &address_field->company_); 189 return ParseText(scanner, pattern, MATCH_NAME | MATCH_LABEL | MATCH_TEXT,
190 &address_field->company_);
170 } 191 }
171 192
172 // static 193 // static
173 bool AddressField::ParseAddressLines(AutofillScanner* scanner, 194 bool AddressField::ParseAddressLines(AutofillScanner* scanner,
174 bool is_ecml, 195 bool is_ecml,
175 AddressField* address_field) { 196 AddressField* address_field) {
176 // We only match the string "address" in page text, not in element names, 197 // We only match the string "address" in page text, not in element names,
177 // because sometimes every element in a group of address fields will have 198 // because sometimes every element in a group of address fields will have
178 // a name containing the string "address"; for example, on the page 199 // a name containing the string "address"; for example, on the page
179 // Kohl's - Register Billing Address.html the text element labeled "city" 200 // Kohl's - Register Billing Address.html the text element labeled "city"
180 // has the name "BILL_TO_ADDRESS<>city". We do match address labels 201 // has the name "BILL_TO_ADDRESS<>city". We do match address labels
181 // such as "address1", which appear as element names on various pages (eg 202 // such as "address1", which appear as element names on various pages (eg
182 // AmericanGirl-Registration.html, BloomingdalesBilling.html, 203 // AmericanGirl-Registration.html, BloomingdalesBilling.html,
183 // EBay Registration Enter Information.html). 204 // EBay Registration Enter Information.html).
184 if (address_field->address1_) 205 if (address_field->address1_)
185 return false; 206 return false;
186 207
187 string16 pattern; 208 string16 pattern;
188 if (is_ecml) { 209 if (is_ecml) {
189 pattern = GetEcmlPattern(kEcmlShipToAddress1, kEcmlBillToAddress1, '|'); 210 pattern = GetEcmlPattern(kEcmlShipToAddress1, kEcmlBillToAddress1, '|');
190 if (!ParseText(scanner, pattern, &address_field->address1_)) 211 if (!ParseText(scanner, pattern, MATCH_NAME | MATCH_LABEL | MATCH_TEXT,
212 &address_field->address1_))
191 return false; 213 return false;
192 } else { 214 } else {
193 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_RE); 215 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_RE);
194 string16 label_pattern = 216 string16 label_pattern =
195 l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_LABEL_RE); 217 l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_LABEL_RE);
196 218
197 if (!ParseText(scanner, pattern, &address_field->address1_) && 219 if (!ParseText(scanner, pattern, MATCH_NAME | MATCH_LABEL | MATCH_TEXT,
198 !ParseLabelText(scanner, label_pattern, &address_field->address1_)) 220 &address_field->address1_) &&
221 !ParseText(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
222 &address_field->address1_))
199 return false; 223 return false;
200 } 224 }
201 225
202 // Optionally parse more address lines, which may have empty labels. 226 // Optionally parse more address lines, which may have empty labels.
203 // Some pages have 3 address lines (eg SharperImageModifyAccount.html) 227 // Some pages have 3 address lines (eg SharperImageModifyAccount.html)
204 // Some pages even have 4 address lines (e.g. uk/ShoesDirect2.html)! 228 // Some pages even have 4 address lines (e.g. uk/ShoesDirect2.html)!
205 if (is_ecml) { 229 if (is_ecml) {
206 pattern = GetEcmlPattern(kEcmlShipToAddress2, kEcmlBillToAddress2, '|'); 230 pattern = GetEcmlPattern(kEcmlShipToAddress2, kEcmlBillToAddress2, '|');
207 if (!ParseEmptyText(scanner, &address_field->address2_)) 231 if (!ParseEmptyText(scanner, &address_field->address2_))
208 ParseText(scanner, pattern, &address_field->address2_); 232 ParseText(scanner, pattern, MATCH_NAME | MATCH_LABEL | MATCH_TEXT,
233 &address_field->address2_);
209 } else { 234 } else {
210 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_2_RE); 235 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_2_RE);
211 string16 label_pattern = 236 string16 label_pattern =
212 l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_LABEL_RE); 237 l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_LABEL_RE);
213 if (!ParseEmptyText(scanner, &address_field->address2_) && 238 if (!ParseEmptyText(scanner, &address_field->address2_) &&
214 !ParseText(scanner, pattern, &address_field->address2_)) 239 !ParseText(scanner, pattern, MATCH_NAME | MATCH_LABEL | MATCH_TEXT,
215 ParseLabelText(scanner, label_pattern, &address_field->address2_); 240 &address_field->address2_))
241 ParseText(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
242 &address_field->address2_);
216 } 243 }
217 244
218 // Try for a third line, which we will promptly discard. 245 // Try for a third line, which we will promptly discard.
219 if (address_field->address2_ != NULL) { 246 if (address_field->address2_ != NULL) {
220 if (is_ecml) { 247 if (is_ecml) {
221 pattern = GetEcmlPattern(kEcmlShipToAddress3, kEcmlBillToAddress3, '|'); 248 pattern = GetEcmlPattern(kEcmlShipToAddress3, kEcmlBillToAddress3, '|');
222 ParseText(scanner, pattern); 249 ParseText(scanner, pattern, MATCH_NAME | MATCH_LABEL | MATCH_TEXT);
223 } else { 250 } else {
224 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_3_RE); 251 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_3_RE);
225 if (!ParseEmptyText(scanner, NULL)) 252 if (!ParseEmptyText(scanner, NULL))
226 ParseText(scanner, pattern, NULL); 253 ParseText(scanner, pattern,
254 MATCH_NAME | MATCH_LABEL | MATCH_TEXT, NULL);
227 } 255 }
228 } 256 }
229 257
230 return true; 258 return true;
231 } 259 }
232 260
233 // static 261 // static
234 bool AddressField::ParseCountry(AutofillScanner* scanner, 262 bool AddressField::ParseCountry(AutofillScanner* scanner,
235 bool is_ecml, 263 bool is_ecml,
236 AddressField* address_field) { 264 AddressField* address_field) {
237 // Parse a country. The occasional page (e.g. 265 // Parse a country. The occasional page (e.g.
238 // Travelocity_New Member Information1.html) calls this a "location". 266 // Travelocity_New Member Information1.html) calls this a "location".
239 // Note: ECML standard uses 2 letter country code (ISO 3166) 267 // Note: ECML standard uses 2 letter country code (ISO 3166)
240 if (address_field->country_ && !address_field->country_->IsEmpty()) 268 if (address_field->country_ && !address_field->country_->IsEmpty())
241 return false; 269 return false;
242 270
243 string16 pattern; 271 string16 pattern;
244 if (is_ecml) 272 if (is_ecml)
245 pattern = GetEcmlPattern(kEcmlShipToCountry, kEcmlBillToCountry, '|'); 273 pattern = GetEcmlPattern(kEcmlShipToCountry, kEcmlBillToCountry, '|');
246 else 274 else
247 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_COUNTRY_RE); 275 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_COUNTRY_RE);
248 276
249 return ParseText(scanner, pattern, &address_field->country_); 277 return ParseText(scanner, pattern,
278 MATCH_NAME | MATCH_LABEL | MATCH_TEXT | MATCH_SELECT,
279 &address_field->country_);
250 } 280 }
251 281
252 // static 282 // static
253 bool AddressField::ParseZipCode(AutofillScanner* scanner, 283 bool AddressField::ParseZipCode(AutofillScanner* scanner,
254 bool is_ecml, 284 bool is_ecml,
255 AddressField* address_field) { 285 AddressField* address_field) {
256 // Parse a zip code. On some UK pages (e.g. The China Shop2.html) this 286 // Parse a zip code. On some UK pages (e.g. The China Shop2.html) this
257 // is called a "post code". 287 // is called a "post code".
258 // 288 //
259 // HACK: Just for the MapQuest driving directions page we match the 289 // HACK: Just for the MapQuest driving directions page we match the
(...skipping 18 matching lines...) Expand all
278 // more detail. 308 // more detail.
279 string16 bill_to_postal_code_field(ASCIIToUTF16(kEcmlBillToPostalCode)); 309 string16 bill_to_postal_code_field(ASCIIToUTF16(kEcmlBillToPostalCode));
280 if (StartsWith(name, bill_to_postal_code_field, false)) { 310 if (StartsWith(name, bill_to_postal_code_field, false)) {
281 tempType = kBillingAddress; 311 tempType = kBillingAddress;
282 } else if (StartsWith(name, bill_to_postal_code_field, false)) { 312 } else if (StartsWith(name, bill_to_postal_code_field, false)) {
283 tempType = kShippingAddress; 313 tempType = kShippingAddress;
284 } else { 314 } else {
285 tempType = kGenericAddress; 315 tempType = kGenericAddress;
286 } 316 }
287 317
288 if (!ParseText(scanner, pattern, &address_field->zip_)) 318 if (!ParseText(scanner, pattern, MATCH_NAME | MATCH_LABEL | MATCH_TEXT,
319 &address_field->zip_))
289 return false; 320 return false;
290 321
291 address_field->type_ = tempType; 322 address_field->type_ = tempType;
292 if (!is_ecml) { 323 if (!is_ecml) {
293 // Look for a zip+4, whose field name will also often contain 324 // Look for a zip+4, whose field name will also often contain
294 // the substring "zip". 325 // the substring "zip".
295 ParseText(scanner, 326 ParseText(scanner,
296 l10n_util::GetStringUTF16(IDS_AUTOFILL_ZIP_4_RE), 327 l10n_util::GetStringUTF16(IDS_AUTOFILL_ZIP_4_RE),
328 MATCH_NAME | MATCH_LABEL | MATCH_TEXT,
297 &address_field->zip4_); 329 &address_field->zip4_);
298 } 330 }
299 331
300 return true; 332 return true;
301 } 333 }
302 334
303 // static 335 // static
304 bool AddressField::ParseCity(AutofillScanner* scanner, 336 bool AddressField::ParseCity(AutofillScanner* scanner,
305 bool is_ecml, 337 bool is_ecml,
306 AddressField* address_field) { 338 AddressField* address_field) {
307 // Parse a city name. Some UK pages (e.g. The China Shop2.html) use 339 // Parse a city name. Some UK pages (e.g. The China Shop2.html) use
308 // the term "town". 340 // the term "town".
309 if (address_field->city_) 341 if (address_field->city_)
310 return false; 342 return false;
311 343
312 string16 pattern; 344 string16 pattern;
313 if (is_ecml) 345 if (is_ecml)
314 pattern = GetEcmlPattern(kEcmlShipToCity, kEcmlBillToCity, '|'); 346 pattern = GetEcmlPattern(kEcmlShipToCity, kEcmlBillToCity, '|');
315 else 347 else
316 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_CITY_RE); 348 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_CITY_RE);
317 349
318 return ParseText(scanner, pattern, &address_field->city_); 350 return ParseText(scanner, pattern,
351 MATCH_NAME | MATCH_LABEL | MATCH_TEXT | MATCH_SELECT,
352 &address_field->city_);
319 } 353 }
320 354
321 // static 355 // static
322 bool AddressField::ParseState(AutofillScanner* scanner, 356 bool AddressField::ParseState(AutofillScanner* scanner,
323 bool is_ecml, 357 bool is_ecml,
324 AddressField* address_field) { 358 AddressField* address_field) {
325 if (address_field->state_) 359 if (address_field->state_)
326 return false; 360 return false;
327 361
328 string16 pattern; 362 string16 pattern;
329 if (is_ecml) 363 if (is_ecml)
330 pattern = GetEcmlPattern(kEcmlShipToStateProv, kEcmlBillToStateProv, '|'); 364 pattern = GetEcmlPattern(kEcmlShipToStateProv, kEcmlBillToStateProv, '|');
331 else 365 else
332 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_STATE_RE); 366 pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_STATE_RE);
333 367
334 return ParseText(scanner, pattern, &address_field->state_); 368 return ParseText(scanner, pattern,
369 MATCH_NAME | MATCH_LABEL | MATCH_TEXT | MATCH_SELECT,
370 &address_field->state_);
335 } 371 }
336 372
337 AddressType AddressField::AddressTypeFromText(const string16 &text) { 373 AddressType AddressField::AddressTypeFromText(const string16 &text) {
338 if (text.find(l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_TYPE_SAME_AS_RE)) 374 if (text.find(l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_TYPE_SAME_AS_RE))
339 != string16::npos || 375 != string16::npos ||
340 text.find(l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_TYPE_USE_MY_RE)) 376 text.find(l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_TYPE_USE_MY_RE))
341 != string16::npos) 377 != string16::npos)
342 // This text could be a checkbox label such as "same as my billing 378 // This text could be a checkbox label such as "same as my billing
343 // address" or "use my shipping address". 379 // address" or "use my shipping address".
344 // ++ It would help if we generally skipped all text that appears 380 // ++ It would help if we generally skipped all text that appears
(...skipping 15 matching lines...) Expand all
360 return kBillingAddress; 396 return kBillingAddress;
361 397
362 if (bill == string16::npos && ship != string16::npos) 398 if (bill == string16::npos && ship != string16::npos)
363 return kShippingAddress; 399 return kShippingAddress;
364 400
365 if (bill > ship) 401 if (bill > ship)
366 return kBillingAddress; 402 return kBillingAddress;
367 403
368 return kShippingAddress; 404 return kShippingAddress;
369 } 405 }
OLDNEW
« no previous file with comments | « no previous file | chrome/browser/autofill/autofill_resources.grd » ('j') | chrome/browser/autofill/form_field.cc » ('J')

Powered by Google App Engine
This is Rietveld 408576698