Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1015)

Unified Diff: components/dom_distiller/core/page_features.cc

Issue 1409133007: Add a new set of page features for distillability testing (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@webkit
Patch Set: fix DEPS Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/dom_distiller/core/page_features.cc
diff --git a/components/dom_distiller/core/page_features.cc b/components/dom_distiller/core/page_features.cc
index f931bbe5cb23cdc75513005d00cac20c84b92b93..8425dc78ecd3fc57834f70ed0af4f663424d5503 100644
--- a/components/dom_distiller/core/page_features.cc
+++ b/components/dom_distiller/core/page_features.cc
@@ -8,6 +8,7 @@
#include "base/json/json_reader.h"
#include "third_party/re2/re2/re2.h"
+#include "url/gurl.h"
namespace dom_distiller {
/* This code needs to derive features in the same way and order in which they
@@ -179,4 +180,64 @@ std::vector<double> CalculateDerivedFeaturesFromJSON(
innerHTML);
}
+std::vector<double> CalculateDerivedFeatures(  // Builds the feature vector from already-extracted page data; returns 22 values in the fixed order pushed below.
+ bool openGraph,  // Stored as the first feature.
+ const GURL& url,  // Only url.path() is read from this argument.
+ unsigned elementCount,  // Pushed as-is and also used as the anchorRatio denominator.
+ unsigned anchorCount,  // Pushed as-is and also used as the anchorRatio numerator.
+ unsigned formCount,  // Pushed as-is.
+ double mozScore,  // Forwarded into the vector unchanged.
+ double mozScoreAllSqrt,  // Forwarded into the vector unchanged.
+ double mozScoreAllLinear) {  // Forwarded into the vector unchanged.
+ const std::string& path = url.path();  // Every URL-derived feature below is computed from this path string.
+ std::vector<double> features;  // NOTE(review): the comment at the top of this namespace says features must keep a reference "way and order" -- presumably a trained model's; confirm before reordering any push.
+ // 'opengraph', opengraph,  (each note below is 'feature name', reference expression; here the bool converts to 1.0 / 0.0)
+ features.push_back(openGraph);
+ // 'forum', 'forum' in path,  (NOTE(review): Contains is defined outside this hunk; assumed to take (needle, haystack) and return bool -- confirm)
+ features.push_back(Contains("forum", path));
+ // 'index', 'index' in path,
+ features.push_back(Contains("index", path));
+ // 'search', 'search' in path,
+ features.push_back(Contains("search", path));
+ // 'view', 'view' in path,
+ features.push_back(Contains("view", path));
+ // 'archive', 'archive' in path,
+ features.push_back(Contains("archive", path));
+ // 'asp', '.asp' in path,  (the searched text includes the leading dot, unlike the feature name)
+ features.push_back(Contains(".asp", path));
+ // 'phpbb', 'phpbb' in path,
+ features.push_back(Contains("phpbb", path));
+ // 'php', path.endswith('.php'),  (a suffix test, not a substring test like 'asp' above; EndsWith is defined outside this hunk)
+ features.push_back(EndsWith(".php", path));
+ // 'pathLength', len(path),  (std::string::size() of the path, converted to double)
+ features.push_back(path.size());
+ // 'domain', len(path) < 2,  (true when the path is shorter than two characters)
+ features.push_back(path.size() < 2);
+ // 'pathComponents', CountMatches(path, r'\/.'),  (pattern: a '/' followed by one more character; NOTE(review): CountMatches is defined outside this hunk, assumed to count regex matches -- confirm)
+ features.push_back(CountMatches(path, "\\/."));
+ // 'slugDetector', CountMatches(path, r'[^\w/]'),  (pattern: one character that is neither a word character nor '/')
+ features.push_back(CountMatches(path, "[^\\w/]"));
+ // 'pathNumbers', CountMatches(path, r'\d+'),  (pattern: a run of one or more digits)
+ features.push_back(CountMatches(path, "\\d+"));
+ // 'lastSegmentLength', len(GetLastSegment(path)),  (GetLastSegment is defined outside this hunk -- presumably the text after the final '/'; confirm)
+ features.push_back(GetLastSegment(path).size());
+ // 'formCount', numForms,  (unsigned converted to double)
+ features.push_back(formCount);
+ // 'anchorCount', numAnchors,  (unsigned converted to double)
+ features.push_back(anchorCount);
+ // 'elementCount', numElements,  (unsigned converted to double)
+ features.push_back(elementCount);
+ // 'anchorRatio', float(numAnchors) / max(1, numElements),  (the max keeps the denominator at least 1, so elementCount == 0 cannot divide by zero)
+ features.push_back(
+ double(anchorCount) / std::max<double>(1, elementCount));
+ // 'mozScore'  (caller-supplied value, stored unchanged)
+ features.push_back(mozScore);
+ // 'mozScoreAllSqrt'  (caller-supplied value, stored unchanged)
+ features.push_back(mozScoreAllSqrt);
+ // 'mozScoreAllLinear'  (caller-supplied value, stored unchanged)
+ features.push_back(mozScoreAllLinear);
+
+ return features;
+}
+
} // namespace dom_distiller
« no previous file with comments | « components/dom_distiller/core/page_features.h ('k') | components/dom_distiller/core/page_features_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698