Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(295)

Side by Side Diff: components/dom_distiller/core/page_features.cc

Issue 1409133007: Add a new set of page features for distillability testing (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@webkit
Patch Set: fix DEPS Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/page_features.h" 5 #include "components/dom_distiller/core/page_features.h"
6 6
7 #include <string> 7 #include <string>
8 8
9 #include "base/json/json_reader.h" 9 #include "base/json/json_reader.h"
10 #include "third_party/re2/re2/re2.h" 10 #include "third_party/re2/re2/re2.h"
11 #include "url/gurl.h"
11 12
12 namespace dom_distiller { 13 namespace dom_distiller {
13 /* This code needs to derive features in the same way and order in which they 14 /* This code needs to derive features in the same way and order in which they
14 * are derived when training the model. Parts of that code are reproduced in the 15 * are derived when training the model. Parts of that code are reproduced in the
15 * comments below. 16 * comments below.
16 */ 17 */
17 18
18 namespace { 19 namespace {
19 20
20 std::string GetLastSegment(const std::string& path) { 21 std::string GetLastSegment(const std::string& path) {
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
172 GURL parsed_url(url); 173 GURL parsed_url(url);
173 if (!parsed_url.is_valid()) { 174 if (!parsed_url.is_valid()) {
174 return std::vector<double>(); 175 return std::vector<double>();
175 } 176 }
176 177
177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, 178 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
178 numAnchors, numForms, innerText, textContent, 179 numAnchors, numForms, innerText, textContent,
179 innerHTML); 180 innerHTML);
180 } 181 }
181 182
183 std::vector<double> CalculateDerivedFeatures(
184 bool openGraph,
185 const GURL& url,
186 unsigned elementCount,
187 unsigned anchorCount,
188 unsigned formCount,
189 double mozScore,
190 double mozScoreAllSqrt,
191 double mozScoreAllLinear) {
192 const std::string& path = url.path();
193 std::vector<double> features;
194 // 'opengraph', opengraph,
195 features.push_back(openGraph);
196 // 'forum', 'forum' in path,
197 features.push_back(Contains("forum", path));
198 // 'index', 'index' in path,
199 features.push_back(Contains("index", path));
200 // 'search', 'search' in path,
201 features.push_back(Contains("search", path));
202 // 'view', 'view' in path,
203 features.push_back(Contains("view", path));
204 // 'archive', 'archive' in path,
205 features.push_back(Contains("archive", path));
206 // 'asp', '.asp' in path,
207 features.push_back(Contains(".asp", path));
208 // 'phpbb', 'phpbb' in path,
209 features.push_back(Contains("phpbb", path));
210 // 'php', path.endswith('.php'),
211 features.push_back(EndsWith(".php", path));
212 // 'pathLength', len(path),
213 features.push_back(path.size());
214 // 'domain', len(path) < 2,
215 features.push_back(path.size() < 2);
216 // 'pathComponents', CountMatches(path, r'\/.'),
217 features.push_back(CountMatches(path, "\\/."));
218 // 'slugDetector', CountMatches(path, r'[^\w/]'),
219 features.push_back(CountMatches(path, "[^\\w/]"));
220 // 'pathNumbers', CountMatches(path, r'\d+'),
221 features.push_back(CountMatches(path, "\\d+"));
222 // 'lastSegmentLength', len(GetLastSegment(path)),
223 features.push_back(GetLastSegment(path).size());
224 // 'formCount', numForms,
225 features.push_back(formCount);
226 // 'anchorCount', numAnchors,
227 features.push_back(anchorCount);
228 // 'elementCount', numElements,
229 features.push_back(elementCount);
230 // 'anchorRatio', float(numAnchors) / max(1, numElements),
231 features.push_back(
232 double(anchorCount) / std::max<double>(1, elementCount));
233 // 'mozScore'
234 features.push_back(mozScore);
235 // 'mozScoreAllSqrt'
236 features.push_back(mozScoreAllSqrt);
237 // 'mozScoreAllLinear'
238 features.push_back(mozScoreAllLinear);
239
240 return features;
241 }
242
182 } // namespace dom_distiller 243 } // namespace dom_distiller
OLDNEW
« no previous file with comments | « components/dom_distiller/core/page_features.h ('k') | components/dom_distiller/core/page_features_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698