Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6267)

Unified Diff: chrome/renderer/autofill/form_manager.cc

Issue 7531023: Improve Autofill heuristics when detecting labels from previous elements. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Rebase Created 9 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/autofill/form_manager.cc
diff --git a/chrome/renderer/autofill/form_manager.cc b/chrome/renderer/autofill/form_manager.cc
index 53884db4f166cc3f861aebe2c6ea2c5063fb7235..3cb6ceac912c42f8971b1c1c88fc291dc3b78803 100644
--- a/chrome/renderer/autofill/form_manager.cc
+++ b/chrome/renderer/autofill/form_manager.cc
@@ -78,14 +78,6 @@ bool IsSelectElement(const WebFormControlElement& element) {
return element.formControlType() == ASCIIToUTF16("select-one");
}
-bool IsTextContainerElement(const WebElement& element) {
- return
- element.hasTagName("p") ||
- element.hasTagName("b") ||
- element.hasTagName("span") ||
- element.hasTagName("font");
-}
-
bool IsOptionElement(const WebElement& element) {
return element.hasTagName("option");
}
@@ -107,6 +99,31 @@ bool IsAutofillableElement(const WebFormControlElement& element) {
return IsTextInput(input_element) || IsSelectElement(element);
}
+// Appends |suffix| to |prefix| so that any intermediary whitespace is collapsed
+// to a single space. If |force_whitespace| is true, then the resulting string
+// is guaranteed to have a space between |prefix| and |suffix|. Otherwise, the
+// result includes a space only if |prefix| has trailing whitespace or |suffix|
+// has leading whitespace.
dhollowa 2011/08/01 22:16:54 Some examples in this comment would help the casua
Ilya Sherman 2011/08/01 23:59:36 Done.
+const string16 CombineAndCollapseWhitespace(const string16& prefix,
+ const string16& suffix,
+ bool force_whitespace) {
+ string16 prefix_trimmed;
+ TrimPositions prefix_trailing_whitespace =
+ TrimWhitespace(prefix, TRIM_TRAILING, &prefix_trimmed);
+
+ // Recursively compute the children's text.
+ string16 suffix_trimmed;
+ TrimPositions suffix_leading_whitespace =
+ TrimWhitespace(suffix, TRIM_LEADING, &suffix_trimmed);
+
+ if (prefix_trailing_whitespace || suffix_leading_whitespace ||
+ force_whitespace) {
+ return prefix_trimmed + ASCIIToUTF16(" ") + suffix_trimmed;
+ } else {
+ return prefix_trimmed + suffix_trimmed;
+ }
+}
+
// This is a helper function for the FindChildText() function (see below).
// Search depth is limited with the |depth| parameter.
string16 FindChildTextInner(const WebNode& node, int depth) {
@@ -126,39 +143,27 @@ string16 FindChildTextInner(const WebNode& node, int depth) {
const WebElement element = node.toConst<WebElement>();
if (IsOptionElement(element) ||
IsScriptElement(element) ||
- IsNoScriptElement(element)) {
+ IsNoScriptElement(element) ||
+ (element.isFormControlElement() &&
+ IsAutofillableElement(element.toConst<WebFormControlElement>()))) {
return string16();
}
}
// Extract the text exactly at this node.
string16 node_text = node.nodeValue();
- TrimPositions node_trailing_whitespace =
- TrimWhitespace(node_text, TRIM_TRAILING, &node_text);
// Recursively compute the children's text.
// Preserve inter-element whitespace separation.
string16 child_text = FindChildTextInner(node.firstChild(), depth - 1);
- TrimPositions child_leading_whitespace =
- TrimWhitespace(child_text, TRIM_LEADING, &child_text);
- if (node_trailing_whitespace || child_leading_whitespace ||
- (node.nodeType() == WebNode::TextNode && node_text.empty())) {
- node_text += ASCIIToUTF16(" ");
- }
- node_text += child_text;
- node_trailing_whitespace =
- TrimWhitespace(node_text, TRIM_TRAILING, &node_text);
+ bool add_space = node.nodeType() == WebNode::TextNode && node_text.empty();
+ node_text = CombineAndCollapseWhitespace(node_text, child_text, add_space);
// Recursively compute the siblings' text.
// Again, preserve inter-element whitespace separation.
string16 sibling_text = FindChildTextInner(node.nextSibling(), depth - 1);
- TrimPositions sibling_leading_whitespace =
- TrimWhitespace(sibling_text, TRIM_LEADING, &sibling_text);
- if (node_trailing_whitespace || sibling_leading_whitespace ||
- (node.nodeType() == WebNode::TextNode && node_text.empty())) {
- node_text += ASCIIToUTF16(" ");
- }
- node_text += sibling_text;
+ add_space = node.nodeType() == WebNode::TextNode && node_text.empty();
+ node_text = CombineAndCollapseWhitespace(node_text, sibling_text, add_space);
return node_text;
}
@@ -169,85 +174,72 @@ string16 FindChildTextInner(const WebNode& node, int depth) {
// used when the structure is not directly known. However, unlike with
// |innerText()|, the search depth and breadth are limited to a fixed threshold.
// Whitespace is trimmed from text accumulated at descendant nodes.
-string16 FindChildText(const WebElement& element) {
- WebNode child = element.firstChild();
+string16 FindChildText(const WebNode& node) {
+ if (node.isTextNode())
+ return node.nodeValue();
+
+ WebNode child = node.firstChild();
const int kChildSearchDepth = 10;
- string16 element_text = FindChildTextInner(child, kChildSearchDepth);
- TrimWhitespace(element_text, TRIM_ALL, &element_text);
- return element_text;
+ string16 node_text = FindChildTextInner(child, kChildSearchDepth);
+ TrimWhitespace(node_text, TRIM_ALL, &node_text);
+ return node_text;
}
// Helper for |InferLabelForElement()| that infers a label, if possible, from
-// a previous sibling of |element|.
+// a previous sibling of |element|,
+// e.g. Some Text <input ...>
+// or Some <span>Text</span> <input ...>
+// or <p>Some Text</p><input ...>
+// or <label>Some Text</label> <input ...>
+// or Some Text <img><input ...>
+// or <b>Some Text</b><br/> <input ...>.
string16 InferLabelFromPrevious(const WebFormControlElement& element) {
string16 inferred_label;
- WebNode previous = element.previousSibling();
- if (previous.isNull())
- return string16();
-
- // Check for text immediately before the |element|.
- if (previous.isTextNode()) {
- inferred_label = previous.nodeValue();
- TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
- }
-
- // If we didn't find text, check for an immediately preceding text container,
- // e.g. <p>Some Text</p><input ...>
- // Note the lack of whitespace between <p> and <input> elements.
- if (inferred_label.empty() && previous.isElementNode()) {
- WebElement previous_element = previous.to<WebElement>();
- if (IsTextContainerElement(previous_element))
- inferred_label = FindChildText(previous_element);
- }
-
- // If we didn't find one immediately preceding, check for a text container
- // separated from this node only by whitespace,
- // e.g. <p>Some Text</p> <input ...>
- // Note the whitespace between <p> and <input> elements.
- if (inferred_label.empty() && previous.isTextNode()) {
- WebNode sibling = previous.previousSibling();
- if (!sibling.isNull() && sibling.isElementNode()) {
- WebElement previous_element = sibling.to<WebElement>();
- if (IsTextContainerElement(previous_element))
- inferred_label = FindChildText(previous_element);
- }
- }
+ WebNode previous = element;
+ while (true) {
+ previous = previous.previousSibling();
+ if (previous.isNull())
+ break;
- // Look for a text node prior to <img> or <br> tags,
- // e.g. Some Text<img/><input ...> or Some Text<br/><input ...>
- while (inferred_label.empty() && !previous.isNull()) {
- if (previous.isTextNode()) {
- inferred_label = previous.nodeValue();
- TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
- } else if (previous.isElementNode()) {
- WebElement previous_element = previous.to<WebElement>();
- if (IsTextContainerElement(previous_element))
- inferred_label = FindChildText(previous_element);
- else if (!HasTagName(previous, "img") && !HasTagName(previous, "br"))
- break;
- } else {
+ WebNode::NodeType node_type = previous.nodeType();
+ if (node_type != WebNode::TextNode &&
+ node_type != WebNode::ElementNode &&
+ node_type != WebNode::CommentNode)
break;
- }
- previous = previous.previousSibling();
- }
+ // Coalesce any text contained in multiple consecutive plain text nodes or
+ // on of a few HTML elements that are essentially equivalent to text nodes.
+ if (previous.isTextNode() ||
+ HasTagName(previous, "b") || HasTagName(previous, "strong") ||
+ HasTagName(previous, "span") || HasTagName(previous, "font")) {
+ string16 value = FindChildText(previous);
+ // A text node's value will be empty if it is for a line break.
+ bool add_space = previous.isTextNode() && value.empty();
+ inferred_label =
+ CombineAndCollapseWhitespace(value, inferred_label, add_space);
+ } else if (previous.isElementNode()) {
dhollowa 2011/08/01 22:16:54 Instead of "else if", it would be more readable IM
Ilya Sherman 2011/08/01 23:59:36 Done.
+ // All other elements are only allowed if we have not yet found any
+ // candidate label text.
+ string16 trimmed_label;
+ TrimWhitespace(inferred_label, TRIM_ALL, &trimmed_label);
+ if (trimmed_label.empty()) {
+ // <img> and <br> tags often appear between the input element and its
+ // label text, so skip over them.
+ if (HasTagName(previous, "img") || HasTagName(previous, "br"))
+ continue;
+
+ // We expect <p> and <label> tags to contain the full label text, so
+ // only allow these if we have not yet found any candidate label text.
+ if (HasTagName(previous, "p") || HasTagName(previous, "label"))
+ inferred_label = FindChildText(previous);
+ }
- // Look for a label node prior to the <input> tag,
- // e.g. <label>Some Text</label><input ...>
- while (inferred_label.empty() && !previous.isNull()) {
- if (previous.isTextNode()) {
- inferred_label = previous.nodeValue();
- TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
- } else if (HasTagName(previous, "label")) {
- inferred_label = FindChildText(previous.to<WebElement>());
- } else {
break;
}
-
- previous = previous.previousSibling();
}
+ TrimWhitespace(inferred_label, TRIM_ALL, &inferred_label);
return inferred_label;
}
@@ -262,7 +254,7 @@ string16 InferLabelFromListItem(const WebFormControlElement& element) {
}
if (!parent.isNull() && HasTagName(parent, "li"))
- return FindChildText(parent.to<WebElement>());
+ return FindChildText(parent);
return string16();
}
@@ -289,7 +281,7 @@ string16 InferLabelFromTableColumn(const WebFormControlElement& element) {
WebNode previous = parent.previousSibling();
while (inferred_label.empty() && !previous.isNull()) {
if (HasTagName(previous, "td") || HasTagName(previous, "th"))
- inferred_label = FindChildText(previous.to<WebElement>());
+ inferred_label = FindChildText(previous);
previous = previous.previousSibling();
}
@@ -316,7 +308,7 @@ string16 InferLabelFromTableRow(const WebFormControlElement& element) {
WebNode previous = parent.previousSibling();
while (inferred_label.empty() && !previous.isNull()) {
if (HasTagName(previous, "tr"))
- inferred_label = FindChildText(previous.to<WebElement>());
+ inferred_label = FindChildText(previous);
previous = previous.previousSibling();
}
@@ -330,23 +322,29 @@ string16 InferLabelFromTableRow(const WebFormControlElement& element) {
// e.g. <div>Some Text</div><div><input ...></div>
string16 InferLabelFromDivTable(const WebFormControlElement& element) {
WebNode node = element.parentNode();
- while (!node.isNull() && node.isElementNode() &&
- !node.to<WebElement>().hasTagName("div") &&
- // If the element is in a table, its label most likely is too.
- !node.to<WebElement>().hasTagName("table")) {
- node = node.parentNode();
- }
+ bool looking_for_parent = true;
- if (node.isNull() || !HasTagName(node, "div"))
- return string16();
-
- // Search the siblings while we cannot find label.
+ // Search the sibling and parent <div>s until we find a candidate label.
string16 inferred_label;
while (inferred_label.empty() && !node.isNull()) {
- if (HasTagName(node, "div"))
- inferred_label = FindChildText(node.to<WebElement>());
+ if (HasTagName(node, "div")) {
+ looking_for_parent = false;
+ inferred_label = FindChildText(node);
+ } else if (looking_for_parent &&
+ (HasTagName(node, "table") || HasTagName(node, "fieldset"))) {
+ // If the element is in a table or fieldset, its label most likely is too.
+ break;
+ }
+
+ if (node.previousSibling().isNull()) {
+ // If there are no more siblings, continue walking up the tree.
+ looking_for_parent = true;
+ }
- node = node.previousSibling();
+ if (looking_for_parent)
+ node = node.parentNode();
+ else
+ node = node.previousSibling();
}
return inferred_label;
@@ -373,7 +371,7 @@ string16 InferLabelFromDefinitionList(const WebFormControlElement& element) {
if (previous.isNull() || !HasTagName(previous, "dt"))
return string16();
- return FindChildText(previous.to<WebElement>());
+ return FindChildText(previous);
}
// Infers corresponding label for |element| from surrounding context in the DOM,
@@ -645,9 +643,12 @@ string16 FormManager::LabelForElement(const WebFormControlElement& element) {
WebNodeList labels = element.document().getElementsByTagName("label");
for (unsigned i = 0; i < labels.length(); ++i) {
WebLabelElement label = labels.item(i).to<WebLabelElement>();
- DCHECK(label.hasTagName("label"));
- if (label.correspondingControl() == element)
+ WebElement corresponding_control = label.correspondingControl();
+ if (corresponding_control == element ||
+ (corresponding_control.isNull() &&
+ label.getAttribute("for") == element.nameForAutofill())) {
return FindChildText(label);
+ }
}
// Infer the label from context if not found in label element.
@@ -732,13 +733,21 @@ bool FormManager::WebFormElementToFormData(const WebFormElement& element,
WebLabelElement label = labels.item(i).to<WebLabelElement>();
WebFormControlElement field_element =
label.correspondingControl().to<WebFormControlElement>();
- if (field_element.isNull() ||
+
+ string16 element_name;
+ if (field_element.isNull()) {
+ // Sometimes site authors will incorrectly specify the corresponding
+ // field element's name rather than its id, so we compensate here.
+ element_name = label.getAttribute("for");
+ } else if (
!field_element.isFormControlElement() ||
- field_element.formControlType() == WebString::fromUTF8("hidden"))
+ field_element.formControlType() == WebString::fromUTF8("hidden")) {
continue;
+ } else {
+ element_name = field_element.nameForAutofill();
+ }
- std::map<string16, FormField*>::iterator iter =
- name_map.find(field_element.nameForAutofill());
+ std::map<string16, FormField*>::iterator iter = name_map.find(element_name);
// Concatenate labels because some sites might have multiple label
// candidates.
if (iter != name_map.end())
« no previous file with comments | « chrome/browser/autofill/form_structure_browsertest.cc ('k') | chrome/test/data/autofill/heuristics/input/16_crbug_87517.html » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698