OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "config.h" | |
6 #include "DocumentStatisticsCollector.h" | |
7 | |
8 #include "core/HTMLNames.h" | |
9 #include "core/InputTypeNames.h" | |
10 #include "core/css/CSSComputedStyleDeclaration.h" | |
11 #include "core/dom/ElementTraversal.h" | |
12 #include "core/dom/NodeComputedStyle.h" | |
13 #include "core/editing/iterators/TextIterator.h" | |
14 #include "core/html/HTMLHeadElement.h" | |
15 #include "core/html/HTMLInputElement.h" | |
16 #include "core/html/HTMLMetaElement.h" | |
17 #include "public/platform/WebDistillability.h" | |
18 #include "wtf/text/StringBuilder.h" | |
19 #include "wtf/text/StringImpl.h" | |
20 | |
21 namespace blink { | |
22 | |
23 using namespace HTMLNames; | |
24 | |
25 namespace { | |
26 | |
// Saturate the length of a paragraph to save time: text counting for a single
// paragraph stops once this many characters have been exceeded.
// Declared unsigned because it is compared against, and returned as, unsigned
// text lengths; this avoids a signed/unsigned mismatch at those sites.
const unsigned kTextContentLengthSaturation = 1000;
29 | |
30 unsigned textContentLengthSaturated(Element& root) | |
31 { | |
32 unsigned length = 0; | |
33 // This skips shadow dom intentionally. | |
34 for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) { | |
35 if (!node.isTextNode()) { | |
36 continue; | |
37 } | |
38 length += toText(node).length(); | |
39 if (length > kTextContentLengthSaturation) { | |
40 return kTextContentLengthSaturation; | |
41 } | |
42 } | |
43 return length; | |
44 } | |
45 | |
46 bool isVisible(Element& element) | |
47 { | |
48 const ComputedStyle* style = element.computedStyle(); | |
49 if (!style) { | |
50 return false; | |
51 } | |
52 ASSERT(style->display() != NONE); | |
53 return ( | |
54 style->visibility() != HIDDEN | |
55 && style->opacity() != 0 | |
56 ); | |
57 } | |
58 | |
59 bool matchAttributes(Element& element, const Vector<String>& words) | |
60 { | |
61 const String& classes = element.getClassAttribute(); | |
62 const String& id = element.getIdAttribute(); | |
63 for (const String& word : words) { | |
64 if (classes.findIgnoringCase(word) != WTF::kNotFound | |
65 || id.findIgnoringCase(word) != WTF::kNotFound) { | |
66 return true; | |
67 } | |
68 } | |
69 return false; | |
70 } | |
71 | |
72 // underListItem denotes that at least one of the ancesters is <li> element. | |
73 void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool un derListItem = false) | |
74 { | |
75 DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ()); | |
76 if (unlikelyCandidates.isEmpty()) { | |
77 for (auto word : {"banner", "combx", "comment", "community", "disqus", " extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox ", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pag er", "popup"}) { | |
mdjones
2015/11/03 02:54:46
Do webkit files not have the 80 char line limit?
esprehn
2015/11/03 04:12:10
They do not, you should wrap where you think it's
wychen
2015/11/03 07:56:39
Fixed here:
https://codereview.chromium.org/141903
| |
78 unlikelyCandidates.append(word); | |
79 } | |
80 } | |
81 DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ()); | |
82 if (highlyLikelyCandidates.isEmpty()) { | |
83 for (auto word : {"and", "article", "body", "column", "main", "shadow"}) { | |
84 highlyLikelyCandidates.append(word); | |
85 } | |
86 } | |
87 // Filter out short P elements. The threshold is set to around 2 English sen tences. | |
88 const unsigned kParagraphLengthThreshold = 140; | |
89 | |
90 // Saturate the scores to save time. The max is the score of 6 long paragrap hs. | |
91 const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - k ParagraphLengthThreshold); | |
92 const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturat ion); | |
93 const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation ; | |
94 | |
95 for (Node& node : NodeTraversal::childrenOf(root)) { | |
96 if (node.isTextNode()) { | |
97 features.textContentLength += toText(node).length(); | |
98 continue; | |
99 } | |
100 if (!node.isElementNode()) { | |
101 continue; | |
102 } | |
103 | |
104 features.elementCount++; | |
105 Element& element = toElement(node); | |
106 if (element.hasTagName(aTag)) { | |
107 features.anchorCount++; | |
108 } else if (element.hasTagName(formTag)) { | |
109 features.formCount++; | |
110 } else if (element.hasTagName(inputTag)) { | |
111 const HTMLInputElement& input = toHTMLInputElement(element); | |
112 if (equalIgnoringCase(input.type(), InputTypeNames::text)) { | |
113 features.textInputCount++; | |
114 } else if (equalIgnoringCase(input.type(), InputTypeNames::password) ) { | |
115 features.passwordInputCount++; | |
116 } | |
117 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { | |
118 if (element.hasTagName(pTag)) { | |
119 features.pCount++; | |
120 } else { | |
121 features.preCount++; | |
122 } | |
123 if (!underListItem | |
124 && (features.mozScore < kMozScoreSaturation | |
125 || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation | |
126 || features.mozScoreAllLinear < kMozScoreAllLinearSaturation ) | |
127 && isVisible(element) | |
128 && (!matchAttributes(element, unlikelyCandidates) || matchAttrib utes(element, highlyLikelyCandidates)) | |
129 ) { | |
130 unsigned length = textContentLengthSaturated(element); | |
131 if (length >= kParagraphLengthThreshold) { | |
132 features.mozScore += sqrt(length - kParagraphLengthThreshold ); | |
133 features.mozScore = std::min(features.mozScore, kMozScoreSat uration); | |
134 } | |
135 features.mozScoreAllSqrt += sqrt(length); | |
136 features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kM ozScoreAllSqrtSaturation); | |
137 | |
138 features.mozScoreAllLinear += length; | |
139 features.mozScoreAllLinear = std::min(features.mozScoreAllLinear , kMozScoreAllLinearSaturation); | |
140 } | |
141 } else if (element.hasTagName(liTag)) { | |
142 underListItem = true; | |
143 } | |
144 collectFeatures(element, features, underListItem); | |
145 } | |
146 } | |
147 | |
148 bool hasOpenGraphArticle(const Element& head) | |
149 { | |
150 DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type")); | |
151 DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property")); | |
152 for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) { | |
153 if (!isHTMLMetaElement(*child)) | |
154 continue; | |
155 const HTMLMetaElement& meta = toHTMLMetaElement(*child); | |
156 | |
157 if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) { | |
158 if (equalIgnoringCase(meta.content(), "article")) { | |
159 return true; | |
160 } | |
161 } | |
162 } | |
163 return false; | |
164 } | |
165 | |
166 } // namespace | |
167 | |
168 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document) | |
169 { | |
170 WebDistillabilityFeatures features = WebDistillabilityFeatures(); | |
171 | |
172 if (!document.frame() || !document.frame()->isMainFrame()) | |
173 return features; | |
174 | |
175 ASSERT(document.hasFinishedParsing()); | |
176 | |
177 if (!document.body() || !document.head()) | |
178 return features; | |
179 | |
180 // Traverse the DOM tree and collect statistics. | |
181 collectFeatures(*document.body(), features); | |
182 features.openGraph = hasOpenGraphArticle(*document.head()); | |
183 | |
184 return features; | |
185 } | |
186 | |
187 } | |
OLD | NEW |