Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(208)

Side by Side Diff: components/dom_distiller/core/page_features.cc

Issue 1248643004: Test distillability without JavaScript (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@early
Patch Set: draft (WIP) Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/page_features.h" 5 #include "components/dom_distiller/core/page_features.h"
6 6
7 #include <string> 7 #include <string>
8 8
9 #include "base/json/json_reader.h" 9 #include "base/logging.h"
10 #include "third_party/WebKit/public/platform/WebDistillability.h"
10 #include "third_party/re2/re2/re2.h" 11 #include "third_party/re2/re2/re2.h"
12 #include "url/gurl.h"
11 13
12 namespace dom_distiller { 14 namespace dom_distiller {
13 /* This code needs to derive features in the same way and order in which they 15 /* This code needs to derive features in the same way and order in which they
14 * are derived when training the model. Parts of that code are reproduced in the 16 * are derived when training the model. Parts of that code are reproduced in the
15 * comments below. 17 * comments below.
16 */ 18 */
17 19
18 namespace { 20 namespace {
19 21
20 std::string GetLastSegment(const std::string& path) { 22 std::string GetLastSegment(const std::string& path) {
21 // return re.search('[^/]*\/?$', path).group(0) 23 // return re.search('[^/]*\/?$', path).group(0)
22 if (path.size() == 0) 24 if (path.size() == 0)
23 return ""; 25 return "";
24 size_t start = path.rfind("/", path.size() - 1); 26 size_t start = path.rfind("/", path.size() - 1);
25 return start == std::string::npos ? "" : path.substr(start + 1); 27 return start == std::string::npos ? "" : path.substr(start + 1);
26 } 28 }
27 29
28 int CountMatches(const std::string& s, const std::string& p) { 30 int CountMatches(const std::string& s, const std::string& p) {
29 // return len(re.findall(p, s)) 31 // return len(re.findall(p, s))
30 re2::StringPiece sp(s); 32 re2::StringPiece sp(s);
31 re2::RE2 regexp(p); 33 re2::RE2 regexp(p);
32 int count = 0; 34 int count = 0;
33 while (re2::RE2::FindAndConsume(&sp, regexp)) 35 while (re2::RE2::FindAndConsume(&sp, regexp))
34 count++; 36 count++;
35 return count; 37 return count;
36 } 38 }
37 39
38 int GetWordCount(const std::string& s) {
39 return CountMatches(s, "\\w+");
40 }
41
42 bool Contains(const std::string& n, const std::string& h) { 40 bool Contains(const std::string& n, const std::string& h) {
43 return h.find(n) != std::string::npos; 41 return h.find(n) != std::string::npos;
44 } 42 }
45 43
46 bool EndsWith(const std::string& t, const std::string& s) { 44 bool EndsWith(const std::string& t, const std::string& s) {
47 return s.size() >= t.size() && 45 return s.size() >= t.size() &&
48 s.compare(s.size() - t.size(), std::string::npos, t) == 0; 46 s.compare(s.size() - t.size(), std::string::npos, t) == 0;
49 } 47 }
50 48
51 } // namespace 49 } // namespace
52 50
53 int kDerivedFeaturesCount = 29; 51 unsigned kDerivedFeaturesCount = 22;
54 52
55 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, 53 std::vector<double> CalculateDerivedFeatures(
56 const GURL& url, 54 const blink::WebDistillabilityFeatures& f,
57 double numElements, 55 const GURL& url) {
58 double numAnchors,
59 double numForms,
60 const std::string& innerText,
61 const std::string& textContent,
62 const std::string& innerHTML) {
63 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
64 // they are here).
65 const std::string& path = url.path(); 56 const std::string& path = url.path();
66 int innerTextWords = GetWordCount(innerText);
67 int textContentWords = GetWordCount(textContent);
68 int innerHTMLWords = GetWordCount(innerHTML);
69 std::vector<double> features; 57 std::vector<double> features;
70 // 'opengraph', opengraph, 58 // 'opengraph', opengraph,
71 features.push_back(isOGArticle); 59 features.push_back(f.openGraph);
72 // 'forum', 'forum' in path, 60 // 'forum', 'forum' in path,
73 features.push_back(Contains("forum", path)); 61 features.push_back(Contains("forum", path));
74 // 'index', 'index' in path, 62 // 'index', 'index' in path,
75 features.push_back(Contains("index", path)); 63 features.push_back(Contains("index", path));
64 // 'search', 'search' in path,
65 features.push_back(Contains("search", path));
76 // 'view', 'view' in path, 66 // 'view', 'view' in path,
77 features.push_back(Contains("view", path)); 67 features.push_back(Contains("view", path));
68 // 'archive', 'archive' in path,
69 features.push_back(Contains("archive", path));
78 // 'asp', '.asp' in path, 70 // 'asp', '.asp' in path,
79 features.push_back(Contains(".asp", path)); 71 features.push_back(Contains(".asp", path));
80 // 'phpbb', 'phpbb' in path, 72 // 'phpbb', 'phpbb' in path,
81 features.push_back(Contains("phpbb", path)); 73 features.push_back(Contains("phpbb", path));
82 // 'php', path.endswith('.php'), 74 // 'php', path.endswith('.php'),
83 features.push_back(EndsWith(".php", path)); 75 features.push_back(EndsWith(".php", path));
84 // 'pathlength', len(path), 76 // 'pathLength', len(path),
85 features.push_back(path.size()); 77 features.push_back(path.size());
86 // 'domain', len(path) < 2, 78 // 'domain', len(path) < 2,
87 features.push_back(path.size() < 2); 79 features.push_back(path.size() < 2);
88 // 'pathcomponents', CountMatches(path, r'\/.'), 80 // 'pathComponents', CountMatches(path, r'\/.'),
89 features.push_back(CountMatches(path, "\\/.")); 81 features.push_back(CountMatches(path, "\\/."));
90 // 'slugdetector', CountMatches(path, r'[^\w/]'), 82 // 'slugDetector', CountMatches(path, r'[^\w/]'),
91 features.push_back(CountMatches(path, "[^\\w/]")); 83 features.push_back(CountMatches(path, "[^\\w/]"));
92 // 'pathnumbers', CountMatches(path, r'\d+'), 84 // 'pathNumbers', CountMatches(path, r'\d+'),
93 features.push_back(CountMatches(path, "\\d+")); 85 features.push_back(CountMatches(path, "\\d+"));
94 // 'lastSegmentLength', len(GetLastSegment(path)), 86 // 'lastSegmentLength', len(GetLastSegment(path)),
95 features.push_back(GetLastSegment(path).size()); 87 features.push_back(GetLastSegment(path).size());
96 // 'formcount', numForms, 88 // 'formCount', numForms,
97 features.push_back(numForms); 89 features.push_back(f.formCount);
98 // 'anchorcount', numAnchors, 90 // 'anchorCount', numAnchors,
99 features.push_back(numAnchors); 91 features.push_back(f.anchorCount);
100 // 'elementcount', numElements, 92 // 'elementCount', numElements,
101 features.push_back(numElements); 93 features.push_back(f.elementCount);
102 // 'anchorratio', float(numAnchors) / max(1, numElements), 94 // 'anchorRatio', float(numAnchors) / max(1, numElements),
103 features.push_back(double(numAnchors) / std::max<double>(1, numElements)); 95 features.push_back(
104 // 'innertextlength', len(innerText), 96 double(f.anchorCount) / std::max<double>(1, f.elementCount));
105 features.push_back(innerText.size()); 97 // 'mozScore'
106 // 'textcontentlength', len(textContent), 98 features.push_back(f.mozScore);
107 features.push_back(textContent.size()); 99 // 'mozScoreAllSqrt'
108 // 'innerhtmllength', len(innerHTML), 100 features.push_back(f.mozScoreAllSqrt);
109 features.push_back(innerHTML.size()); 101 // 'mozScoreAllLinear'
110 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), 102 features.push_back(f.mozScoreAllLinear);
111 features.push_back(double(innerText.size()) / 103
112 std::max<double>(1.0, innerHTML.size())); 104 CHECK(features.size() == kDerivedFeaturesCount);
113 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
114 features.push_back(double(textContent.size()) /
115 std::max<double>(1.0, innerHTML.size()));
116 // 'innertexttextcontentlengthratio',
117 // float(len(innerText)) / max(1, len(textContent)),
118 features.push_back(double(innerText.size()) /
119 std::max<double>(1.0, textContent.size()));
120 // 'innertextwordcount', innerTextWords,
121 features.push_back(innerTextWords);
122 // 'textcontentwordcount', textContentWords,
123 features.push_back(textContentWords);
124 // 'innerhtmlwordcount', innerHTMLWords,
125 features.push_back(innerHTMLWords);
126 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
127 features.push_back(double(innerTextWords) /
128 std::max<int>(1.0, innerHTMLWords));
129 // 'textcontentwordcountratio',
130 // float(textContentWords) / max(1, innerHTMLWords),
131 features.push_back(double(textContentWords) /
132 std::max<int>(1.0, innerHTMLWords));
133 // 'innertexttextcontentwordcountratio',
134 // float(innerTextWords) / max(1, textContentWords),
135 features.push_back(double(innerTextWords) /
136 std::max<int>(1.0, textContentWords));
137 return features; 105 return features;
138 } 106 }
139 107
140 std::vector<double> CalculateDerivedFeaturesFromJSON(
141 const base::Value* stringified_json) {
142 std::string stringified;
143 if (!stringified_json->GetAsString(&stringified)) {
144 return std::vector<double>();
145 }
146
147 scoped_ptr<base::Value> json = base::JSONReader::Read(stringified);
148 if (!json) {
149 return std::vector<double>();
150 }
151
152 const base::DictionaryValue* dict;
153 if (!json->GetAsDictionary(&dict)) {
154 return std::vector<double>();
155 }
156
157 bool isOGArticle = false;
158 std::string url, innerText, textContent, innerHTML;
159 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
160
161 if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
162 dict->GetString("url", &url) &&
163 dict->GetDouble("numElements", &numElements) &&
164 dict->GetDouble("numAnchors", &numAnchors) &&
165 dict->GetDouble("numForms", &numForms) &&
166 dict->GetString("innerText", &innerText) &&
167 dict->GetString("textContent", &textContent) &&
168 dict->GetString("innerHTML", &innerHTML))) {
169 return std::vector<double>();
170 }
171
172 GURL parsed_url(url);
173 if (!parsed_url.is_valid()) {
174 return std::vector<double>();
175 }
176
177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
178 numAnchors, numForms, innerText, textContent,
179 innerHTML);
180 }
181
182 } // namespace dom_distiller 108 } // namespace dom_distiller
OLDNEW
« no previous file with comments | « components/dom_distiller/core/page_features.h ('k') | components/dom_distiller/core/page_features_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698