OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
6 | 6 |
7 #include <string> | 7 #include <string> |
8 | 8 |
9 #include "base/json/json_reader.h" | 9 #include "base/json/json_reader.h" |
10 #include "third_party/re2/re2/re2.h" | 10 #include "third_party/re2/re2/re2.h" |
| 11 #include "url/gurl.h" |
11 | 12 |
12 namespace dom_distiller { | 13 namespace dom_distiller { |
13 /* This code needs to derive features in the same way and order in which they | 14 /* This code needs to derive features in the same way and order in which they |
14 * are derived when training the model. Parts of that code are reproduced in the | 15 * are derived when training the model. Parts of that code are reproduced in the |
15 * comments below. | 16 * comments below. |
16 */ | 17 */ |
17 | 18 |
18 namespace { | 19 namespace { |
19 | 20 |
20 std::string GetLastSegment(const std::string& path) { | 21 std::string GetLastSegment(const std::string& path) { |
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
172 GURL parsed_url(url); | 173 GURL parsed_url(url); |
173 if (!parsed_url.is_valid()) { | 174 if (!parsed_url.is_valid()) { |
174 return std::vector<double>(); | 175 return std::vector<double>(); |
175 } | 176 } |
176 | 177 |
177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, | 178 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, |
178 numAnchors, numForms, innerText, textContent, | 179 numAnchors, numForms, innerText, textContent, |
179 innerHTML); | 180 innerHTML); |
180 } | 181 } |
181 | 182 |
| 183 std::vector<double> CalculateDerivedFeatures( |
| 184 bool openGraph, |
| 185 const GURL& url, |
| 186 unsigned elementCount, |
| 187 unsigned anchorCount, |
| 188 unsigned formCount, |
| 189 double mozScore, |
| 190 double mozScoreAllSqrt, |
| 191 double mozScoreAllLinear) { |
| 192 const std::string& path = url.path(); |
| 193 std::vector<double> features; |
| 194 // 'opengraph', opengraph, |
| 195 features.push_back(openGraph); |
| 196 // 'forum', 'forum' in path, |
| 197 features.push_back(Contains("forum", path)); |
| 198 // 'index', 'index' in path, |
| 199 features.push_back(Contains("index", path)); |
| 200 // 'search', 'search' in path, |
| 201 features.push_back(Contains("search", path)); |
| 202 // 'view', 'view' in path, |
| 203 features.push_back(Contains("view", path)); |
| 204 // 'archive', 'archive' in path, |
| 205 features.push_back(Contains("archive", path)); |
| 206 // 'asp', '.asp' in path, |
| 207 features.push_back(Contains(".asp", path)); |
| 208 // 'phpbb', 'phpbb' in path, |
| 209 features.push_back(Contains("phpbb", path)); |
| 210 // 'php', path.endswith('.php'), |
| 211 features.push_back(EndsWith(".php", path)); |
| 212 // 'pathLength', len(path), |
| 213 features.push_back(path.size()); |
| 214 // 'domain', len(path) < 2, |
| 215 features.push_back(path.size() < 2); |
| 216 // 'pathComponents', CountMatches(path, r'\/.'), |
| 217 features.push_back(CountMatches(path, "\\/.")); |
| 218 // 'slugDetector', CountMatches(path, r'[^\w/]'), |
| 219 features.push_back(CountMatches(path, "[^\\w/]")); |
| 220 // 'pathNumbers', CountMatches(path, r'\d+'), |
| 221 features.push_back(CountMatches(path, "\\d+")); |
| 222 // 'lastSegmentLength', len(GetLastSegment(path)), |
| 223 features.push_back(GetLastSegment(path).size()); |
| 224 // 'formCount', numForms, |
| 225 features.push_back(formCount); |
| 226 // 'anchorCount', numAnchors, |
| 227 features.push_back(anchorCount); |
| 228 // 'elementCount', numElements, |
| 229 features.push_back(elementCount); |
| 230 // 'anchorRatio', float(numAnchors) / max(1, numElements), |
| 231 features.push_back( |
| 232 double(anchorCount) / std::max<double>(1, elementCount)); |
| 233 // 'mozScore' |
| 234 features.push_back(mozScore); |
| 235 // 'mozScoreAllSqrt' |
| 236 features.push_back(mozScoreAllSqrt); |
| 237 // 'mozScoreAllLinear' |
| 238 features.push_back(mozScoreAllLinear); |
| 239 |
| 240 return features; |
| 241 } |
| 242 |
182 } // namespace dom_distiller | 243 } // namespace dom_distiller |
OLD | NEW |