Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(52)

Side by Side Diff: components/dom_distiller/core/page_features.cc

Issue 1248643004: Test distillability without JavaScript (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@early
Patch Set: fix browsertest, merge webkit CL, merge http://crrev.com/1403413004 Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/page_features.h" 5 #include "components/dom_distiller/core/page_features.h"
6 6
7 #include <string> 7 #include <string>
8 8
9 #include "base/json/json_reader.h" 9 #include "third_party/WebKit/public/platform/WebDistillability.h"
10 #include "third_party/re2/re2/re2.h" 10 #include "third_party/re2/re2/re2.h"
11 #include "url/gurl.h"
11 12
12 namespace dom_distiller { 13 namespace dom_distiller {
13 /* This code needs to derive features in the same way and order in which they 14 /* This code needs to derive features in the same way and order in which they
14 * are derived when training the model. Parts of that code are reproduced in the 15 * are derived when training the model. Parts of that code are reproduced in the
15 * comments below. 16 * comments below.
16 */ 17 */
17 18
18 namespace { 19 namespace {
19 20
20 std::string GetLastSegment(const std::string& path) { 21 std::string GetLastSegment(const std::string& path) {
21 // return re.search('[^/]*\/?$', path).group(0) 22 // return re.search('[^/]*\/?$', path).group(0)
22 if (path.size() == 0) 23 if (path.size() == 0)
23 return ""; 24 return "";
24 size_t start = path.rfind("/", path.size() - 1); 25 size_t start = path.rfind("/", path.size() - 1);
25 return start == std::string::npos ? "" : path.substr(start + 1); 26 return start == std::string::npos ? "" : path.substr(start + 1);
26 } 27 }
27 28
28 int CountMatches(const std::string& s, const std::string& p) { 29 int CountMatches(const std::string& s, const std::string& p) {
29 // return len(re.findall(p, s)) 30 // return len(re.findall(p, s))
30 re2::StringPiece sp(s); 31 re2::StringPiece sp(s);
31 re2::RE2 regexp(p); 32 re2::RE2 regexp(p);
32 int count = 0; 33 int count = 0;
33 while (re2::RE2::FindAndConsume(&sp, regexp)) 34 while (re2::RE2::FindAndConsume(&sp, regexp))
34 count++; 35 count++;
35 return count; 36 return count;
36 } 37 }
37 38
38 int GetWordCount(const std::string& s) {
39 return CountMatches(s, "\\w+");
40 }
41
42 bool Contains(const std::string& n, const std::string& h) { 39 bool Contains(const std::string& n, const std::string& h) {
43 return h.find(n) != std::string::npos; 40 return h.find(n) != std::string::npos;
44 } 41 }
45 42
46 bool EndsWith(const std::string& t, const std::string& s) { 43 bool EndsWith(const std::string& t, const std::string& s) {
47 return s.size() >= t.size() && 44 return s.size() >= t.size() &&
48 s.compare(s.size() - t.size(), std::string::npos, t) == 0; 45 s.compare(s.size() - t.size(), std::string::npos, t) == 0;
49 } 46 }
50 47
51 } // namespace 48 } // namespace
52 49
53 int kDerivedFeaturesCount = 29; 50 unsigned kDerivedFeaturesCount = 22;
54 51
55 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, 52 std::vector<double> CalculateDerivedFeatures(
56 const GURL& url, 53 const blink::WebDistillabilityFeatures& f,
57 double numElements, 54 const GURL& url) {
58 double numAnchors,
59 double numForms,
60 const std::string& innerText,
61 const std::string& textContent,
62 const std::string& innerHTML) {
63 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
64 // they are here).
65 const std::string& path = url.path(); 55 const std::string& path = url.path();
66 int innerTextWords = GetWordCount(innerText);
67 int textContentWords = GetWordCount(textContent);
68 int innerHTMLWords = GetWordCount(innerHTML);
69 std::vector<double> features; 56 std::vector<double> features;
70 // 'opengraph', opengraph, 57 // 'opengraph', opengraph,
71 features.push_back(isOGArticle); 58 features.push_back(f.openGraph);
72 // 'forum', 'forum' in path, 59 // 'forum', 'forum' in path,
73 features.push_back(Contains("forum", path)); 60 features.push_back(Contains("forum", path));
74 // 'index', 'index' in path, 61 // 'index', 'index' in path,
75 features.push_back(Contains("index", path)); 62 features.push_back(Contains("index", path));
63 // 'search', 'search' in path,
64 features.push_back(Contains("search", path));
76 // 'view', 'view' in path, 65 // 'view', 'view' in path,
77 features.push_back(Contains("view", path)); 66 features.push_back(Contains("view", path));
67 // 'archive', 'archive' in path,
68 features.push_back(Contains("archive", path));
78 // 'asp', '.asp' in path, 69 // 'asp', '.asp' in path,
79 features.push_back(Contains(".asp", path)); 70 features.push_back(Contains(".asp", path));
80 // 'phpbb', 'phpbb' in path, 71 // 'phpbb', 'phpbb' in path,
81 features.push_back(Contains("phpbb", path)); 72 features.push_back(Contains("phpbb", path));
82 // 'php', path.endswith('.php'), 73 // 'php', path.endswith('.php'),
83 features.push_back(EndsWith(".php", path)); 74 features.push_back(EndsWith(".php", path));
84 // 'pathlength', len(path), 75 // 'pathLength', len(path),
85 features.push_back(path.size()); 76 features.push_back(path.size());
86 // 'domain', len(path) < 2, 77 // 'domain', len(path) < 2,
87 features.push_back(path.size() < 2); 78 features.push_back(path.size() < 2);
88 // 'pathcomponents', CountMatches(path, r'\/.'), 79 // 'pathComponents', CountMatches(path, r'\/.'),
89 features.push_back(CountMatches(path, "\\/.")); 80 features.push_back(CountMatches(path, "\\/."));
90 // 'slugdetector', CountMatches(path, r'[^\w/]'), 81 // 'slugDetector', CountMatches(path, r'[^\w/]'),
91 features.push_back(CountMatches(path, "[^\\w/]")); 82 features.push_back(CountMatches(path, "[^\\w/]"));
92 // 'pathnumbers', CountMatches(path, r'\d+'), 83 // 'pathNumbers', CountMatches(path, r'\d+'),
93 features.push_back(CountMatches(path, "\\d+")); 84 features.push_back(CountMatches(path, "\\d+"));
94 // 'lastSegmentLength', len(GetLastSegment(path)), 85 // 'lastSegmentLength', len(GetLastSegment(path)),
95 features.push_back(GetLastSegment(path).size()); 86 features.push_back(GetLastSegment(path).size());
96 // 'formcount', numForms, 87 // 'formCount', numForms,
97 features.push_back(numForms); 88 features.push_back(f.formCount);
98 // 'anchorcount', numAnchors, 89 // 'anchorCount', numAnchors,
99 features.push_back(numAnchors); 90 features.push_back(f.anchorCount);
100 // 'elementcount', numElements, 91 // 'elementCount', numElements,
101 features.push_back(numElements); 92 features.push_back(f.elementCount);
102 // 'anchorratio', float(numAnchors) / max(1, numElements), 93 // 'anchorRatio', float(numAnchors) / max(1, numElements),
103 features.push_back(double(numAnchors) / std::max<double>(1, numElements)); 94 features.push_back(
104 // 'innertextlength', len(innerText), 95 double(f.anchorCount) / std::max<double>(1, f.elementCount));
105 features.push_back(innerText.size()); 96 // 'mozScore'
106 // 'textcontentlength', len(textContent), 97 features.push_back(f.mozScore);
107 features.push_back(textContent.size()); 98 // 'mozScoreAllSqrt'
108 // 'innerhtmllength', len(innerHTML), 99 features.push_back(f.mozScoreAllSqrt);
109 features.push_back(innerHTML.size()); 100 // 'mozScoreAllLinear'
110 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), 101 features.push_back(f.mozScoreAllLinear);
111 features.push_back(double(innerText.size()) / 102
112 std::max<double>(1.0, innerHTML.size()));
113 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
114 features.push_back(double(textContent.size()) /
115 std::max<double>(1.0, innerHTML.size()));
116 // 'innertexttextcontentlengthratio',
117 // float(len(innerText)) / max(1, len(textContent)),
118 features.push_back(double(innerText.size()) /
119 std::max<double>(1.0, textContent.size()));
120 // 'innertextwordcount', innerTextWords,
121 features.push_back(innerTextWords);
122 // 'textcontentwordcount', textContentWords,
123 features.push_back(textContentWords);
124 // 'innerhtmlwordcount', innerHTMLWords,
125 features.push_back(innerHTMLWords);
126 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
127 features.push_back(double(innerTextWords) /
128 std::max<int>(1.0, innerHTMLWords));
129 // 'textcontentwordcountratio',
130 // float(textContentWords) / max(1, innerHTMLWords),
131 features.push_back(double(textContentWords) /
132 std::max<int>(1.0, innerHTMLWords));
133 // 'innertexttextcontentwordcountratio',
134 // float(innerTextWords) / max(1, textContentWords),
135 features.push_back(double(innerTextWords) /
136 std::max<int>(1.0, textContentWords));
137 return features; 103 return features;
138 } 104 }
139 105
140 std::vector<double> CalculateDerivedFeaturesFromJSON(
141 const base::Value* stringified_json) {
142 std::string stringified;
143 if (!stringified_json->GetAsString(&stringified)) {
144 return std::vector<double>();
145 }
146
147 scoped_ptr<base::Value> json = base::JSONReader::Read(stringified);
148 if (!json) {
149 return std::vector<double>();
150 }
151
152 const base::DictionaryValue* dict;
153 if (!json->GetAsDictionary(&dict)) {
154 return std::vector<double>();
155 }
156
157 bool isOGArticle = false;
158 std::string url, innerText, textContent, innerHTML;
159 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
160
161 if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
162 dict->GetString("url", &url) &&
163 dict->GetDouble("numElements", &numElements) &&
164 dict->GetDouble("numAnchors", &numAnchors) &&
165 dict->GetDouble("numForms", &numForms) &&
166 dict->GetString("innerText", &innerText) &&
167 dict->GetString("textContent", &textContent) &&
168 dict->GetString("innerHTML", &innerHTML))) {
169 return std::vector<double>();
170 }
171
172 GURL parsed_url(url);
173 if (!parsed_url.is_valid()) {
174 return std::vector<double>();
175 }
176
177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
178 numAnchors, numForms, innerText, textContent,
179 innerHTML);
180 }
181
182 } // namespace dom_distiller 106 } // namespace dom_distiller
OLD | NEW
« no previous file with comments | « components/dom_distiller/core/page_features.h ('k') | components/dom_distiller/core/page_features_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698