Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(43)

Side by Side Diff: components/dom_distiller/core/page_features.cc

Issue 1409133007: Add a new set of page features for distillability testing (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@webkit
Patch Set: Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/page_features.h" 5 #include "components/dom_distiller/core/page_features.h"
6 6
7 #include <string> 7 #include <string>
8 8
9 #include "base/json/json_reader.h" 9 #include "base/json/json_reader.h"
10 #include "third_party/WebKit/public/platform/WebDistillability.h"
10 #include "third_party/re2/re2/re2.h" 11 #include "third_party/re2/re2/re2.h"
12 #include "url/gurl.h"
11 13
12 namespace dom_distiller { 14 namespace dom_distiller {
13 /* This code needs to derive features in the same way and order in which they 15 /* This code needs to derive features in the same way and order in which they
14 * are derived when training the model. Parts of that code are reproduced in the 16 * are derived when training the model. Parts of that code are reproduced in the
15 * comments below. 17 * comments below.
16 */ 18 */
17 19
18 namespace { 20 namespace {
19 21
20 std::string GetLastSegment(const std::string& path) { 22 std::string GetLastSegment(const std::string& path) {
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
172 GURL parsed_url(url); 174 GURL parsed_url(url);
173 if (!parsed_url.is_valid()) { 175 if (!parsed_url.is_valid()) {
174 return std::vector<double>(); 176 return std::vector<double>();
175 } 177 }
176 178
177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, 179 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
178 numAnchors, numForms, innerText, textContent, 180 numAnchors, numForms, innerText, textContent,
179 innerHTML); 181 innerHTML);
180 } 182 }
181 183
// Builds the feature vector for a page from the renderer-supplied
// blink::WebDistillabilityFeatures |f| and the page |url|.
//
// Only url.path() is read from |url|; every other input comes from |f|.
// Returns 22 doubles, appended in exactly the order written below. Per the
// comment at the top of this namespace, features must be derived in the same
// way and order as when the model was trained, so do not reorder, insert, or
// drop entries without retraining. The quoted names/expressions in the inline
// comments reproduce the training-side feature definitions.
//
// NOTE(review): Contains(), EndsWith() and CountMatches() are helpers defined
// elsewhere in this file (not visible in this diff chunk); the notes below
// about them are inferred from the call sites and training-side comments --
// confirm against their definitions.
184 std::vector<double> CalculateDerivedFeatures(
185 const blink::WebDistillabilityFeatures& f,
186 const GURL& url) {
187 const std::string& path = url.path();
188 std::vector<double> features;
// --- Feature 1: taken directly from |f|. ---
189 // 'opengraph', opengraph,
190 features.push_back(f.openGraph);
// --- Features 2-9: keyword checks on the URL path. ---
// Presumably the first argument is the needle and the second the haystack,
// matching the "'x' in path" expressions quoted from the training code.
191 // 'forum', 'forum' in path,
192 features.push_back(Contains("forum", path));
193 // 'index', 'index' in path,
194 features.push_back(Contains("index", path));
195 // 'search', 'search' in path,
196 features.push_back(Contains("search", path));
197 // 'view', 'view' in path,
198 features.push_back(Contains("view", path));
199 // 'archive', 'archive' in path,
200 features.push_back(Contains("archive", path));
201 // 'asp', '.asp' in path,
202 features.push_back(Contains(".asp", path));
203 // 'phpbb', 'phpbb' in path,
204 features.push_back(Contains("phpbb", path));
205 // 'php', path.endswith('.php'),
206 features.push_back(EndsWith(".php", path));
// --- Features 10-15: shape of the URL path. ---
207 // 'pathLength', len(path),
208 features.push_back(path.size());
// The comparison result is stored as a double in |features|.
209 // 'domain', len(path) < 2,
210 features.push_back(path.size() < 2);
// The string literals below are regex patterns passed to CountMatches();
// backslashes are doubled only for C++ string escaping.
211 // 'pathComponents', CountMatches(path, r'\/.'),
212 features.push_back(CountMatches(path, "\\/."));
213 // 'slugDetector', CountMatches(path, r'[^\w/]'),
214 features.push_back(CountMatches(path, "[^\\w/]"));
215 // 'pathNumbers', CountMatches(path, r'\d+'),
216 features.push_back(CountMatches(path, "\\d+"));
217 // 'lastSegmentLength', len(GetLastSegment(path)),
218 features.push_back(GetLastSegment(path).size());
// --- Features 16-19: DOM counts read from |f|. ---
219 // 'formCount', numForms,
220 features.push_back(f.formCount);
221 // 'anchorCount', numAnchors,
222 features.push_back(f.anchorCount);
223 // 'elementCount', numElements,
224 features.push_back(f.elementCount);
// The denominator is clamped to at least 1, so a page reporting zero
// elements yields a ratio of anchorCount / 1 rather than dividing by zero.
225 // 'anchorRatio', float(numAnchors) / max(1, numElements),
226 features.push_back(
227 double(f.anchorCount) / std::max<double>(1, f.elementCount));
// --- Features 20-22: scores passed through unchanged from |f|. ---
228 // 'mozScore'
229 features.push_back(f.mozScore);
230 // 'mozScoreAllSqrt'
231 features.push_back(f.mozScoreAllSqrt);
232 // 'mozScoreAllLinear'
233 features.push_back(f.mozScoreAllLinear);
234
235 return features;
236 }
237
182 } // namespace dom_distiller 238 } // namespace dom_distiller
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698