Index: components/dom_distiller/core/page_features.cc |
diff --git a/components/dom_distiller/core/page_features.cc b/components/dom_distiller/core/page_features.cc |
index f931bbe5cb23cdc75513005d00cac20c84b92b93..b80c2659bf4a6ac877ec805a0f53212b81cfae42 100644 |
--- a/components/dom_distiller/core/page_features.cc |
+++ b/components/dom_distiller/core/page_features.cc |
@@ -7,7 +7,9 @@ |
#include <string> |
#include "base/json/json_reader.h" |
+#include "third_party/WebKit/public/platform/WebDistillability.h" |
#include "third_party/re2/re2/re2.h" |
+#include "url/gurl.h" |
namespace dom_distiller { |
/* This code needs to derive features in the same way and order in which they |
@@ -179,4 +181,58 @@ std::vector<double> CalculateDerivedFeaturesFromJSON( |
innerHTML); |
} |
+// Computes the derived feature vector from the page statistics in f and the |
+// path of url. Values are appended in a fixed sequence of 22 entries; the |
+// note at the top of this namespace says features must be derived in a |
+// specific way and order, so do not reorder the push_back calls below. |
+std::vector<double> CalculateDerivedFeatures( |
+    const blink::WebDistillabilityFeatures& f, |
+    const GURL& url) { |
+  const std::string& url_path = url.path(); |
+  const std::string::size_type path_length = url_path.size(); |
+  std::vector<double> derived; |
+ |
+  // Feature "opengraph": the openGraph field of f. |
+  derived.push_back(f.openGraph); |
+ |
+  // Keyword features: whether the given substring appears in the path. |
+  derived.push_back(Contains("forum", url_path));    // "forum" |
+  derived.push_back(Contains("index", url_path));    // "index" |
+  derived.push_back(Contains("search", url_path));   // "search" |
+  derived.push_back(Contains("view", url_path));     // "view" |
+  derived.push_back(Contains("archive", url_path));  // "archive" |
+  derived.push_back(Contains(".asp", url_path));     // "asp" |
+  derived.push_back(Contains("phpbb", url_path));    // "phpbb" |
+ |
+  // Feature "php": whether the path ends with ".php". |
+  derived.push_back(EndsWith(".php", url_path)); |
+ |
+  // Features "pathLength" and "domain": the path length, and whether that |
+  // length is below 2. |
+  derived.push_back(path_length); |
+  derived.push_back(path_length < 2); |
+ |
+  // Regex-based path features, each a match count over the path: |
+  //   "pathComponents" counts r'\/.', |
+  //   "slugDetector" counts r'[^\w/]', |
+  //   "pathNumbers" counts r'\d+'. |
+  derived.push_back(CountMatches(url_path, "\\/.")); |
+  derived.push_back(CountMatches(url_path, "[^\\w/]")); |
+  derived.push_back(CountMatches(url_path, "\\d+")); |
+ |
+  // Feature "lastSegmentLength": length of GetLastSegment(path). |
+  derived.push_back(GetLastSegment(url_path).size()); |
+ |
+  // Features "formCount", "anchorCount" and "elementCount", taken as-is. |
+  derived.push_back(f.formCount); |
+  derived.push_back(f.anchorCount); |
+  derived.push_back(f.elementCount); |
+ |
+  // Feature "anchorRatio": anchors per element, with the element count |
+  // floored at 1 so the division is always defined. |
+  const double element_floor = std::max<double>(1, f.elementCount); |
+  derived.push_back(static_cast<double>(f.anchorCount) / element_floor); |
+ |
+  // Features "mozScore", "mozScoreAllSqrt" and "mozScoreAllLinear". |
+  derived.push_back(f.mozScore); |
+  derived.push_back(f.mozScoreAllSqrt); |
+  derived.push_back(f.mozScoreAllLinear); |
+ |
+  return derived; |
+} |
+ |
} // namespace dom_distiller |