| Index: components/dom_distiller/core/page_features.cc
|
| diff --git a/components/dom_distiller/core/page_features.cc b/components/dom_distiller/core/page_features.cc
|
| index f931bbe5cb23cdc75513005d00cac20c84b92b93..b80c2659bf4a6ac877ec805a0f53212b81cfae42 100644
|
| --- a/components/dom_distiller/core/page_features.cc
|
| +++ b/components/dom_distiller/core/page_features.cc
|
| @@ -7,7 +7,9 @@
|
| #include <string>
|
|
|
| #include "base/json/json_reader.h"
|
| +#include "third_party/WebKit/public/platform/WebDistillability.h"
|
| #include "third_party/re2/re2/re2.h"
|
| +#include "url/gurl.h"
|
|
|
| namespace dom_distiller {
|
| /* This code needs to derive features in the same way and order in which they
|
| @@ -179,4 +181,58 @@ std::vector<double> CalculateDerivedFeaturesFromJSON(
|
| innerHTML);
|
| }
|
|
|
| +std::vector<double> CalculateDerivedFeatures(
|
| +    const blink::WebDistillabilityFeatures& f,
|
| +    const GURL& url) {  // Builds the feature vector from |f| and |url|.
|
| +  const std::string& path = url.path();  // Input to the path features.
|
| +  std::vector<double> features;  // One entry per feature, in this order.
|
| +  // 'opengraph': the openGraph field of |f|.
|
| +  features.push_back(f.openGraph);
|
| +  // 'forum': Contains("forum", path); intended as 'forum' in path.
|
| +  features.push_back(Contains("forum", path));
|
| +  // 'index': Contains("index", path); intended as 'index' in path.
|
| +  features.push_back(Contains("index", path));
|
| +  // 'search': Contains("search", path); intended as 'search' in path.
|
| +  features.push_back(Contains("search", path));
|
| +  // 'view': Contains("view", path); intended as 'view' in path.
|
| +  features.push_back(Contains("view", path));
|
| +  // 'archive': Contains("archive", path); intended as 'archive' in path.
|
| +  features.push_back(Contains("archive", path));
|
| +  // 'asp': Contains(".asp", path); intended as '.asp' in path.
|
| +  features.push_back(Contains(".asp", path));
|
| +  // 'phpbb': Contains("phpbb", path); intended as 'phpbb' in path.
|
| +  features.push_back(Contains("phpbb", path));
|
| +  // 'php': EndsWith(".php", path); intended as path.endswith('.php').
|
| +  features.push_back(EndsWith(".php", path));
|
| +  // 'pathLength': path.size(); intended as len(path).
|
| +  features.push_back(path.size());
|
| +  // 'domain': len(path) < 2; the bool is stored as 0.0 or 1.0.
|
| +  features.push_back(path.size() < 2);
|
| +  // 'pathComponents': CountMatches(path, r'\/.'); regex escaped for C++.
|
| +  features.push_back(CountMatches(path, "\\/."));
|
| +  // 'slugDetector': CountMatches(path, r'[^\w/]'); regex escaped for C++.
|
| +  features.push_back(CountMatches(path, "[^\\w/]"));
|
| +  // 'pathNumbers': CountMatches(path, r'\d+'); regex escaped for C++.
|
| +  features.push_back(CountMatches(path, "\\d+"));
|
| +  // 'lastSegmentLength': intended as len(GetLastSegment(path)).
|
| +  features.push_back(GetLastSegment(path).size());
|
| +  // 'formCount': f.formCount (numForms).
|
| +  features.push_back(f.formCount);
|
| +  // 'anchorCount': f.anchorCount (numAnchors).
|
| +  features.push_back(f.anchorCount);
|
| +  // 'elementCount': f.elementCount (numElements).
|
| +  features.push_back(f.elementCount);
|
| +  // 'anchorRatio': float(numAnchors) / max(1, numElements); divisor >= 1.
|
| +  features.push_back(
|
| +      double(f.anchorCount) / std::max<double>(1, f.elementCount));
|
| +  // 'mozScore': f.mozScore.
|
| +  features.push_back(f.mozScore);
|
| +  // 'mozScoreAllSqrt': f.mozScoreAllSqrt.
|
| +  features.push_back(f.mozScoreAllSqrt);
|
| +  // 'mozScoreAllLinear': f.mozScoreAllLinear.
|
| +  features.push_back(f.mozScoreAllLinear);
|
| +
|
| +  return features;
|
| +}
|
| +
|
| } // namespace dom_distiller
|
|
|