OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
6 | 6 |
7 #include <string> | 7 #include <string> |
8 | 8 |
9 #include "base/json/json_reader.h" | 9 #include "base/json/json_reader.h" |
| 10 #include "third_party/WebKit/public/platform/WebDistillability.h" |
10 #include "third_party/re2/re2/re2.h" | 11 #include "third_party/re2/re2/re2.h" |
| 12 #include "url/gurl.h" |
11 | 13 |
12 namespace dom_distiller { | 14 namespace dom_distiller { |
13 /* This code needs to derive features in the same way and order in which they | 15 /* This code needs to derive features in the same way and order in which they |
14 * are derived when training the model. Parts of that code are reproduced in the | 16 * are derived when training the model. Parts of that code are reproduced in the |
15 * comments below. | 17 * comments below. |
16 */ | 18 */ |
17 | 19 |
18 namespace { | 20 namespace { |
19 | 21 |
20 std::string GetLastSegment(const std::string& path) { | 22 std::string GetLastSegment(const std::string& path) { |
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
172 GURL parsed_url(url); | 174 GURL parsed_url(url); |
173 if (!parsed_url.is_valid()) { | 175 if (!parsed_url.is_valid()) { |
174 return std::vector<double>(); | 176 return std::vector<double>(); |
175 } | 177 } |
176 | 178 |
177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, | 179 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, |
178 numAnchors, numForms, innerText, textContent, | 180 numAnchors, numForms, innerText, textContent, |
179 innerHTML); | 181 innerHTML); |
180 } | 182 } |
181 | 183 |
| 184 std::vector<double> CalculateDerivedFeatures( |
| 185 const blink::WebDistillabilityFeatures& f, |
| 186 const GURL& url) { |
| 187 const std::string& path = url.path(); |
| 188 std::vector<double> features; |
| 189 // 'opengraph', opengraph, |
| 190 features.push_back(f.openGraph); |
| 191 // 'forum', 'forum' in path, |
| 192 features.push_back(Contains("forum", path)); |
| 193 // 'index', 'index' in path, |
| 194 features.push_back(Contains("index", path)); |
| 195 // 'search', 'search' in path, |
| 196 features.push_back(Contains("search", path)); |
| 197 // 'view', 'view' in path, |
| 198 features.push_back(Contains("view", path)); |
| 199 // 'archive', 'archive' in path, |
| 200 features.push_back(Contains("archive", path)); |
| 201 // 'asp', '.asp' in path, |
| 202 features.push_back(Contains(".asp", path)); |
| 203 // 'phpbb', 'phpbb' in path, |
| 204 features.push_back(Contains("phpbb", path)); |
| 205 // 'php', path.endswith('.php'), |
| 206 features.push_back(EndsWith(".php", path)); |
| 207 // 'pathLength', len(path), |
| 208 features.push_back(path.size()); |
| 209 // 'domain', len(path) < 2, |
| 210 features.push_back(path.size() < 2); |
| 211 // 'pathComponents', CountMatches(path, r'\/.'), |
| 212 features.push_back(CountMatches(path, "\\/.")); |
| 213 // 'slugDetector', CountMatches(path, r'[^\w/]'), |
| 214 features.push_back(CountMatches(path, "[^\\w/]")); |
| 215 // 'pathNumbers', CountMatches(path, r'\d+'), |
| 216 features.push_back(CountMatches(path, "\\d+")); |
| 217 // 'lastSegmentLength', len(GetLastSegment(path)), |
| 218 features.push_back(GetLastSegment(path).size()); |
| 219 // 'formCount', numForms, |
| 220 features.push_back(f.formCount); |
| 221 // 'anchorCount', numAnchors, |
| 222 features.push_back(f.anchorCount); |
| 223 // 'elementCount', numElements, |
| 224 features.push_back(f.elementCount); |
| 225 // 'anchorRatio', float(numAnchors) / max(1, numElements), |
| 226 features.push_back( |
| 227 double(f.anchorCount) / std::max<double>(1, f.elementCount)); |
| 228 // 'mozScore' |
| 229 features.push_back(f.mozScore); |
| 230 // 'mozScoreAllSqrt' |
| 231 features.push_back(f.mozScoreAllSqrt); |
| 232 // 'mozScoreAllLinear' |
| 233 features.push_back(f.mozScoreAllLinear); |
| 234 |
| 235 return features; |
| 236 } |
| 237 |
182 } // namespace dom_distiller | 238 } // namespace dom_distiller |
OLD | NEW |