Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(126)

Unified Diff: components/autofill/content/renderer/form_classifier.cc

Issue 1883183002: [Password Manager] HTML parsing based client-side form type classifier (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Changed the signature of ClassifyFormAndFindGenerationField Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/autofill/content/renderer/form_classifier.cc
diff --git a/components/autofill/content/renderer/form_classifier.cc b/components/autofill/content/renderer/form_classifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e3d23f8bac90766c378638d282dcca0a7b91fa7e
--- /dev/null
+++ b/components/autofill/content/renderer/form_classifier.cc
@@ -0,0 +1,287 @@
+// Copyright 2016 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/autofill/content/renderer/form_classifier.h"
+
+#include <algorithm>
+
+#include "base/strings/string_util.h"
vabr (Chromium) 2016/06/10 13:22:12 Please also #include "base/string16.h".
kolos1 2016/06/13 14:27:34 Done.
+#include "components/autofill/content/renderer/form_autofill_util.h"
+#include "third_party/WebKit/public/platform/WebString.h"
+#include "third_party/WebKit/public/platform/WebVector.h"
+#include "third_party/WebKit/public/web/WebFormControlElement.h"
+#include "third_party/WebKit/public/web/WebInputElement.h"
+
+using autofill::form_util::WebFormControlElementToFormField;
+using blink::WebFormControlElement;
+using blink::WebInputElement;
+using blink::WebString;
+using blink::WebVector;
+
+namespace autofill {
+
+namespace {
+
+// Substrings that frequently appear in attribute values of signin forms.
+// FindTextFeaturesForClass() lower-cases attribute values and strips '-' and
+// '_' before matching, so entries must be lower-case and separator-free.
+constexpr const char* const kSigninTextFeatures[] = {"signin", "login",
+                                                     "logon", "auth"};
+// Number of entries in |kSigninTextFeatures|. Stored as size_t, the type the
+// consumer FindTextFeaturesForClass() takes, instead of narrowing to int.
+constexpr size_t kNumberOfSigninFeatures = arraysize(kSigninTextFeatures);
vabr (Chromium) 2016/06/10 13:22:11 Please #include "base/macros.h" for arraysize.
vabr (Chromium) 2016/06/10 13:22:12 Please use constexpr here and with the constants b
vabr (Chromium) 2016/06/10 13:22:12 Please use size_t instead of int. Unlike int, size
kolos1 2016/06/13 14:27:34 Done.
kolos1 2016/06/13 14:27:34 Done.
kolos1 2016/06/13 14:27:34 Done.
+
+// Substrings that frequently appear in attribute values of signup forms.
+// Matching is by substring, so e.g. "regist" also hits "register" and
+// "registration".
+constexpr const char* const kSignupTextFeatures[] = {"signup", "regist",
+                                                     "creat"};
+// Number of entries in |kSignupTextFeatures|. Stored as size_t, the type the
+// consumer FindTextFeaturesForClass() takes, instead of narrowing to int.
+constexpr size_t kNumberOfSignupFeatures = arraysize(kSignupTextFeatures);
+
+// Substrings that frequently appear in attribute values of captcha elements.
+constexpr const char* const kCaptchaFeatures[] = {"captcha", "security",
+                                                  "code"};
+// Number of entries in |kCaptchaFeatures| (size_t for the same reason).
+constexpr size_t kNumberOfCaptchaFeatures = arraysize(kCaptchaFeatures);
+
+// The characters that are removed from attribute values before the values are
+// searched for text features.
+constexpr char kCharactersToBeRemoved[] = {'-', '_'};
+
+// Per-category thresholds used by ClassifyFormAndFindGenerationField(): a form
+// with at least one counted password field is treated as a signup/change
+// password form as soon as any one of its field counts reaches the matching
+// minimum below. For the text-field check a detected captcha is discounted
+// from the text-field count before the comparison.
vabr (Chromium) 2016/06/10 13:22:11 The comment does not make it clear how the minimal
kolos1 2016/06/13 14:27:34 Fixed the comment.
+const size_t MINIMAL_NUMBER_OF_TEXT_FIELDS = 2;
+const size_t MINIMAL_NUMBER_OF_PASSWORD_FIELDS = 2;
+const size_t MINIMAL_NUMBER_OF_CHECKBOX_FIELDS = 3;
+const size_t MINIMAL_NUMBER_OF_OTHER_FIELDS = 2;
+
+// Strips every occurrence of the character |c| out of |str| in place; the
+// relative order of the remaining characters is unchanged.
+void RemoveAllOccurrencesOfCharacter(std::string* str, char c) {
+  // std::remove compacts the kept characters to the front and returns the new
+  // logical end; everything from there on is leftover and gets erased.
+  const auto new_end = std::remove(str->begin(), str->end(), c);
+  str->erase(new_end, str->end());
+}
+
+// Searches every attribute value of |element| for any of the first
+// |number_of_features| strings in |features|. Each value is first converted to
+// lower case (ASCII) and stripped of the characters listed in
+// |kCharactersToBeRemoved|; a feature matches if it occurs anywhere in the
+// filtered value as a substring. Returns true on the first match and false if
+// no attribute value contains any feature.
+bool FindTextFeaturesForClass(const blink::WebElement& element,
+ const char* const features[],
+ size_t number_of_features) {
+ for (unsigned i = 0; i < element.attributeCount(); ++i) {
+ // Normalize the value so that e.g. "Sign-In" and "sign_in" both become
+ // "signin" before matching.
+ std::string filtered_value =
+ base::ToLowerASCII(element.attributeValue(i).utf8());
+ for (char d : kCharactersToBeRemoved)
dvadym 2016/06/09 12:40:59 Can we use regexp for removing symbols?
kolos1 2016/06/10 12:18:08 Done.
+ RemoveAllOccurrencesOfCharacter(&filtered_value, d);
+ // An empty value cannot contain any non-empty feature; go to the next
+ // attribute.
+ if (filtered_value.empty())
+ continue;
+ for (size_t j = 0; j < number_of_features; j++) {
vabr (Chromium) 2016/06/10 13:22:12 nit: ++j (Let's be consistent and use prefix incre
kolos1 2016/06/13 14:27:34 Done.
+ if (filtered_value.find(features[j]) != std::string::npos)
+ return true;
+ }
+ }
+ return false;
+}
+
+// Tells whether any attribute value of the input |element| contains one of the
+// captcha text features (|kCaptchaFeatures|).
+bool IsCaptchaInput(const blink::WebInputElement& element) {
+  const bool has_captcha_feature = FindTextFeaturesForClass(
+      element, kCaptchaFeatures, kNumberOfCaptchaFeatures);
+  return has_captcha_feature;
+}
+
+// Scans the <img> elements inside |form| and checks whether the attribute
+// values of any of them contain a captcha text feature. When
+// |ignore_invisible| is true, images for which form_util::IsWebNodeVisible()
+// returns false are skipped. Returns true if at least one matching image was
+// found, false otherwise.
+bool FindCaptchaInImgElements(const blink::WebElement& form,
+                              bool ignore_invisible) {
+  CR_DEFINE_STATIC_LOCAL(WebString, kImageTag, ("img"));
+
+  blink::WebElementCollection img_elements =
+      form.getElementsByHTMLTagName(kImageTag);
+  for (blink::WebElement element = img_elements.firstItem(); !element.isNull();
+       element = img_elements.nextItem()) {
+    // The visibility check only runs when invisible images are to be ignored.
+    if (ignore_invisible && !form_util::IsWebNodeVisible(element))
+      continue;
+    if (FindTextFeaturesForClass(element, kCaptchaFeatures,
+                                 kNumberOfCaptchaFeatures))
+      return true;
+  }
+  return false;
+}
+
+// Looks for signin and signup text features in |element|'s attribute values.
+// |found_signin_text_features| and |found_signup_text_features| are in-out
+// flags and must not be null. A flag that is already true is left untouched
+// (and the corresponding search is skipped), so results accumulate across
+// calls; a flag that is still false is set to the result of the search in
+// |element|.
+void FindTextFeaturesInElement(bool* found_signin_text_features,
dvadym 2016/06/09 12:40:59 Input arguments should be before output arguments
kolos1 2016/06/10 12:18:08 Done.
+                               bool* found_signup_text_features,
dvadym 2016/06/09 12:40:59 Could you please add DCHECK for found_*_text_featu
kolos1 2016/06/10 12:18:08 Done.
+                               const blink::WebElement& element) {
+  DCHECK(found_signin_text_features);
+  DCHECK(found_signup_text_features);
+
+  // Test the pointed-to flag, not the pointer itself: checking the pointer
+  // would skip the search for every valid (non-null) argument and would
+  // dereference null otherwise.
+  if (!*found_signin_text_features) {
+    *found_signin_text_features = FindTextFeaturesForClass(
+        element, kSigninTextFeatures, kNumberOfSigninFeatures);
+  }
+  if (!*found_signup_text_features) {
+    *found_signup_text_features = FindTextFeaturesForClass(
+        element, kSignupTextFeatures, kNumberOfSignupFeatures);
+  }
+}
+
+// Returns true if |element|'s form control type is "button" or "image".
+// ClassifyFormAndFindGenerationField() skips such controls because they might
+// be links to another form.
+bool IsButtonOrImageElement(const WebFormControlElement& element) {
+ CR_DEFINE_STATIC_LOCAL(WebString, kButton, ("button"));
dvadym 2016/06/09 12:40:59 All input type names const are in InputTypeNames.h
kolos1 2016/06/10 12:18:08 I'm not sure we could use this file. As I understa
dvadym 2016/06/10 14:12:09 Acknowledged
+ CR_DEFINE_STATIC_LOCAL(WebString, kImage, ("image"));
+
+ return element.formControlType() == kButton ||
+ element.formControlType() == kImage;
+}
+
+// Tells whether the form control type of |element| is "submit".
+bool IsSubmitElement(const WebFormControlElement& element) {
+  CR_DEFINE_STATIC_LOCAL(WebString, kSubmit, ("submit"));
+
+  const WebString control_type = element.formControlType();
+  return control_type == kSubmit;
+}
+
+// Tells whether the form control type of |element| is "hidden".
+bool IsHiddenElement(const WebFormControlElement& element) {
+  CR_DEFINE_STATIC_LOCAL(WebString, kHidden, ("hidden"));
+
+  const WebString control_type = element.formControlType();
+  return control_type == kHidden;
+}
+
+// Tells whether the form control type of |element| is one of the two <select>
+// flavours, "select-one" or "select-multiple".
+bool IsSelectElement(const WebFormControlElement& element) {
+  CR_DEFINE_STATIC_LOCAL(WebString, kSelectOne, ("select-one"));
+  CR_DEFINE_STATIC_LOCAL(WebString, kSelectMultiple, ("select-multiple"));
+
+  const WebString control_type = element.formControlType();
+  if (control_type == kSelectOne)
+    return true;
+  return control_type == kSelectMultiple;
+}
+
+// Returns true if |form| has at least one form control that is an input
+// element, is a password field and is reported visible by
+// form_util::IsWebNodeVisible(); returns false otherwise.
+bool FormContainsVisiblePasswordFields(const blink::WebFormElement& form) {
+  WebVector<WebFormControlElement> controls;
+  form.getFormControlElements(controls);
+
+  for (auto& control : controls) {
+    // Controls that are not <input> elements yield null and never qualify.
+    const WebInputElement* input = toWebInputElement(&control);
+    const bool is_visible_password = input && input->isPasswordField() &&
+                                     form_util::IsWebNodeVisible(*input);
+    if (is_visible_password)
+      return true;
+  }
+  return false;
+}
+
+// Looks for signin/signup text features in the attribute values of |form| and
+// of its ancestor nodes, walking upwards starting at |form| itself.
vabr (Chromium) 2016/06/10 13:22:12 nit: The comment sounds not completely correct. Wh
kolos1 2016/06/13 14:27:34 Fixed the comment.
+// The walk stops at the first ancestor that contains more <input> elements
+// than |form| does, at the first element node that has any feature, or at the
+// root. Sets |found_signin_text_features| or |found_signup_text_features| to
+// true if that first matching element has only the corresponding kind of
+// feature; if it has both kinds, neither flag is changed. The flags are never
+// set to false here.
+void FindTextFeaturesInFormAndItsAncestors(const blink::WebFormElement& form,
dvadym 2016/06/09 12:40:59 The same comment as to FindTextFeaturesInElement a
kolos1 2016/06/10 12:18:08 Done.
+ bool* found_signin_text_features,
+ bool* found_signup_text_features) {
+ CR_DEFINE_STATIC_LOCAL(WebString, kInput, ("input"));
+
+ // Number of <input> elements inside the form itself; used as the baseline
+ // for deciding how far up the tree the walk may go.
+ unsigned number_of_inputs = form.getElementsByHTMLTagName(kInput).length();
+ blink::WebNode parent = form;
+ for (; !parent.isNull();) {
dvadym 2016/06/09 12:40:59 I'm concerning a little bit in performance of this
kolos1 2016/06/10 12:18:08 I will ask to review Webkit specialist. Thanks.
+ // An ancestor holding more <input>s than the form also covers inputs
+ // outside the form, so its attributes are not inspected.
+ if (parent.getElementsByHTMLTagName(kInput).length() > number_of_inputs)
vabr (Chromium) 2016/06/10 13:22:11 +1 to Vadym's worry about performance. This line i
dvadym 2016/06/10 14:12:09 Yeah, perfomance impact is not clear. Independentl
kolos1 2016/06/13 14:27:34 I reduced the number of calls "getElementsByHTMLTa
+ break;
+ if (parent.isElementNode()) {
+ blink::WebElement element = parent.toConst<blink::WebElement>();
+ bool has_signin_feature = FindTextFeaturesForClass(
+ element, kSigninTextFeatures, kNumberOfSigninFeatures);
+ bool has_signup_feature = FindTextFeaturesForClass(
+ element, kSignupTextFeatures, kNumberOfSignupFeatures);
+ // Both kinds on the same element: stop without reporting either.
+ if (has_signin_feature && has_signup_feature)
+ break;
+ if (has_signin_feature) {
+ *found_signin_text_features = true;
+ break;
+ }
+ if (has_signup_feature) {
+ *found_signup_text_features = true;
+ break;
+ }
+ }
+ parent = parent.parentNode();
+ }
+}
+
+} // namespace
+
+// Classifies |form| with HTML-based heuristics and, for forms that look like
+// signup or change password forms, picks the password field to offer
+// generation on.
+// Returns false when |form| is null, when no password field was counted, or
+// when none of the signup criteria holds. Otherwise writes the result of
+// nameForAutofill() of the chosen password field to |generation_field| (which
+// must not be null) and returns true.
+bool ClassifyFormAndFindGenerationField(const blink::WebFormElement& form,
+                                        base::string16* generation_field) {
+  DCHECK(generation_field);
+
+  if (form.isNull())
+    return false;
+
+  // Invisible elements are ignored only if the form has at least one visible
+  // password field.
+  bool ignore_invisible_elements = FormContainsVisiblePasswordFields(form);
+
+  bool found_signin_text_features = false;
+  bool found_signup_text_features = false;
+  size_t number_of_text_input_fields = 0;
+  size_t number_of_password_input_fields = 0;
+  size_t number_of_checkbox_input_fields = 0;
+  size_t number_of_other_input_fields = 0;
+  bool found_captcha =
+      FindCaptchaInImgElements(form, ignore_invisible_elements);
+
+  FindTextFeaturesInFormAndItsAncestors(form, &found_signin_text_features,
+                                        &found_signup_text_features);
+
+  std::vector<WebInputElement> passwords;
+  WebVector<WebFormControlElement> control_elements;
+  form.getFormControlElements(control_elements);
+
+  for (size_t i = 0; i < control_elements.size(); ++i) {
vabr (Chromium) 2016/06/10 13:22:11 nit: Compress the first two line into: for (const
kolos1 2016/06/13 14:27:34 Done.
+    WebFormControlElement control_element = control_elements[i];
+    bool element_is_invisible = !form_util::IsWebNodeVisible(control_element);
vabr (Chromium) 2016/06/10 13:22:12 IsWebNodeVisible might be expensive, please only c
kolos1 2016/06/13 14:27:34 Done.
+    if ((element_is_invisible && ignore_invisible_elements) ||
+        IsHiddenElement(control_element))
vabr (Chromium) 2016/06/10 13:22:11 nit: "if" statements with more than 2 lines in tot
kolos1 2016/06/13 14:27:34 Done.
+      continue;
+
+    // If type="button" or "image", skip them, because it might be a link
+    // to another form.
+    if (IsButtonOrImageElement(control_element))
+      continue;
+
+    FindTextFeaturesInElement(&found_signin_text_features,
+                              &found_signup_text_features, control_element);
+
+    // Since <select> is not WebInputElement, but WebSelectElement, process
+    // them as a special case.
+    if (IsSelectElement(control_element)) {
+      number_of_other_input_fields++;
vabr (Chromium) 2016/06/10 13:22:12 nit: Here and below: please use prefix ++ unless y
kolos1 2016/06/13 14:27:34 Done.
+    } else {
+      WebInputElement* input_element = toWebInputElement(&control_element);
+      if (!input_element)
+        continue;
+
+      if (input_element->isTextField()) {
+        if (input_element->isPasswordField()) {
+          // Every counted password field is also recorded in |passwords|, so
+          // the two stay in sync.
+          number_of_password_input_fields++;
+          passwords.push_back(*input_element);
+        } else {
+          number_of_text_input_fields++;
+          found_captcha = found_captcha || IsCaptchaInput(*input_element);
+        }
+      } else {  // Non-text fields.
+        if (input_element->isCheckbox())
+          number_of_checkbox_input_fields++;
+        else if (!IsSubmitElement(*input_element))
+          number_of_other_input_fields++;
+      }
+    }
+  }
+
+  if (number_of_password_input_fields == 0)
+    return false;
+
+  // A detected captcha is discounted from the text fields, i.e. one more text
+  // field is required when a captcha was found. The adjustment is added to the
+  // threshold instead of being subtracted from the count: the counters are
+  // unsigned, so for a form with no text fields but a captcha <img> the
+  // subtraction 0 - 1 would wrap around to a huge value and wrongly satisfy
+  // the check.
+  const size_t required_text_fields =
+      MINIMAL_NUMBER_OF_TEXT_FIELDS + (found_captcha ? 1 : 0);
+
+  if ((number_of_text_input_fields >= required_text_fields ||
+       number_of_password_input_fields >= MINIMAL_NUMBER_OF_PASSWORD_FIELDS ||
+       number_of_checkbox_input_fields >= MINIMAL_NUMBER_OF_CHECKBOX_FIELDS ||
+       number_of_other_input_fields >= MINIMAL_NUMBER_OF_OTHER_FIELDS) ||
+      (found_signup_text_features && !found_signin_text_features)) {
+    WebInputElement password_creation_field;
+
+    // |passwords| is non-empty here: at least one password field was counted
+    // above and each counted field was pushed.
+    // TODO(crbug.com/618309): Improve local classifier to distinguish password
+    // creation and password usage fields on the change password forms.
+    // With exactly three password fields the second one is used (presumably
+    // an old/new/confirm layout -- to be confirmed); otherwise the first one.
+    if (passwords.size() == 3)
+      password_creation_field = passwords[1];
+    else
+      password_creation_field = passwords[0];
dvadym 2016/06/09 12:40:59 What's about case when passwords.size() > 3? It co
kolos1 2016/06/10 12:18:08 I saw one site where there were 4 password fields
+
+    *generation_field = password_creation_field.nameForAutofill();
+    return true;
+  }
+  return false;
+}
+}

Powered by Google App Engine
This is Rietveld 408576698