Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(368)

Unified Diff: components/dom_distiller/core/page_features.cc

Issue 1248643004: Test distillability without JavaScript (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@early
Patch Set: fix browsertest, merge webkit CL, merge http://crrev.com/1403413004 Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/dom_distiller/core/page_features.cc
diff --git a/components/dom_distiller/core/page_features.cc b/components/dom_distiller/core/page_features.cc
index f931bbe5cb23cdc75513005d00cac20c84b92b93..3cdd74d6726dc8d6a8e1665c3f42a7c17cef96db 100644
--- a/components/dom_distiller/core/page_features.cc
+++ b/components/dom_distiller/core/page_features.cc
@@ -6,8 +6,9 @@
#include <string>
-#include "base/json/json_reader.h"
+#include "third_party/WebKit/public/platform/WebDistillability.h"
#include "third_party/re2/re2/re2.h"
+#include "url/gurl.h"
namespace dom_distiller {
/* This code needs to derive features in the same way and order in which they
@@ -35,10 +36,6 @@ int CountMatches(const std::string& s, const std::string& p) {
return count;
}
-int GetWordCount(const std::string& s) {
- return CountMatches(s, "\\w+");
-}
-
bool Contains(const std::string& n, const std::string& h) {
return h.find(n) != std::string::npos;
}
@@ -50,133 +47,60 @@ bool EndsWith(const std::string& t, const std::string& s) {
} // namespace
-int kDerivedFeaturesCount = 29;
+unsigned kDerivedFeaturesCount = 22;
-std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
- const GURL& url,
- double numElements,
- double numAnchors,
- double numForms,
- const std::string& innerText,
- const std::string& textContent,
- const std::string& innerHTML) {
- // In the training pipeline, the strings are explicitly encoded in utf-8 (as
- // they are here).
+std::vector<double> CalculateDerivedFeatures(
+ const blink::WebDistillabilityFeatures& f,
+ const GURL& url) {
const std::string& path = url.path();
- int innerTextWords = GetWordCount(innerText);
- int textContentWords = GetWordCount(textContent);
- int innerHTMLWords = GetWordCount(innerHTML);
std::vector<double> features;
// 'opengraph', opengraph,
- features.push_back(isOGArticle);
+ features.push_back(f.openGraph);
// 'forum', 'forum' in path,
features.push_back(Contains("forum", path));
// 'index', 'index' in path,
features.push_back(Contains("index", path));
+ // 'search', 'search' in path,
+ features.push_back(Contains("search", path));
// 'view', 'view' in path,
features.push_back(Contains("view", path));
+ // 'archive', 'archive' in path,
+ features.push_back(Contains("archive", path));
// 'asp', '.asp' in path,
features.push_back(Contains(".asp", path));
// 'phpbb', 'phpbb' in path,
features.push_back(Contains("phpbb", path));
// 'php', path.endswith('.php'),
features.push_back(EndsWith(".php", path));
- // 'pathlength', len(path),
+ // 'pathLength', len(path),
features.push_back(path.size());
// 'domain', len(path) < 2,
features.push_back(path.size() < 2);
- // 'pathcomponents', CountMatches(path, r'\/.'),
+ // 'pathComponents', CountMatches(path, r'\/.'),
features.push_back(CountMatches(path, "\\/."));
- // 'slugdetector', CountMatches(path, r'[^\w/]'),
+ // 'slugDetector', CountMatches(path, r'[^\w/]'),
features.push_back(CountMatches(path, "[^\\w/]"));
- // 'pathnumbers', CountMatches(path, r'\d+'),
+ // 'pathNumbers', CountMatches(path, r'\d+'),
features.push_back(CountMatches(path, "\\d+"));
// 'lastSegmentLength', len(GetLastSegment(path)),
features.push_back(GetLastSegment(path).size());
- // 'formcount', numForms,
- features.push_back(numForms);
- // 'anchorcount', numAnchors,
- features.push_back(numAnchors);
- // 'elementcount', numElements,
- features.push_back(numElements);
- // 'anchorratio', float(numAnchors) / max(1, numElements),
- features.push_back(double(numAnchors) / std::max<double>(1, numElements));
- // 'innertextlength', len(innerText),
- features.push_back(innerText.size());
- // 'textcontentlength', len(textContent),
- features.push_back(textContent.size());
- // 'innerhtmllength', len(innerHTML),
- features.push_back(innerHTML.size());
- // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
- features.push_back(double(innerText.size()) /
- std::max<double>(1.0, innerHTML.size()));
- // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
- features.push_back(double(textContent.size()) /
- std::max<double>(1.0, innerHTML.size()));
- // 'innertexttextcontentlengthratio',
- // float(len(innerText)) / max(1, len(textContent)),
- features.push_back(double(innerText.size()) /
- std::max<double>(1.0, textContent.size()));
- // 'innertextwordcount', innerTextWords,
- features.push_back(innerTextWords);
- // 'textcontentwordcount', textContentWords,
- features.push_back(textContentWords);
- // 'innerhtmlwordcount', innerHTMLWords,
- features.push_back(innerHTMLWords);
- // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
- features.push_back(double(innerTextWords) /
- std::max<int>(1.0, innerHTMLWords));
- // 'textcontentwordcountratio',
- // float(textContentWords) / max(1, innerHTMLWords),
- features.push_back(double(textContentWords) /
- std::max<int>(1.0, innerHTMLWords));
- // 'innertexttextcontentwordcountratio',
- // float(innerTextWords) / max(1, textContentWords),
- features.push_back(double(innerTextWords) /
- std::max<int>(1.0, textContentWords));
- return features;
-}
-
-std::vector<double> CalculateDerivedFeaturesFromJSON(
- const base::Value* stringified_json) {
- std::string stringified;
- if (!stringified_json->GetAsString(&stringified)) {
- return std::vector<double>();
- }
-
- scoped_ptr<base::Value> json = base::JSONReader::Read(stringified);
- if (!json) {
- return std::vector<double>();
- }
+ // 'formCount', numForms,
+ features.push_back(f.formCount);
+ // 'anchorCount', numAnchors,
+ features.push_back(f.anchorCount);
+ // 'elementCount', numElements,
+ features.push_back(f.elementCount);
+ // 'anchorRatio', float(numAnchors) / max(1, numElements),
+ features.push_back(
+ double(f.anchorCount) / std::max<double>(1, f.elementCount));
+ // 'mozScore'
+ features.push_back(f.mozScore);
+ // 'mozScoreAllSqrt'
+ features.push_back(f.mozScoreAllSqrt);
+ // 'mozScoreAllLinear'
+ features.push_back(f.mozScoreAllLinear);
- const base::DictionaryValue* dict;
- if (!json->GetAsDictionary(&dict)) {
- return std::vector<double>();
- }
-
- bool isOGArticle = false;
- std::string url, innerText, textContent, innerHTML;
- double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
-
- if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
- dict->GetString("url", &url) &&
- dict->GetDouble("numElements", &numElements) &&
- dict->GetDouble("numAnchors", &numAnchors) &&
- dict->GetDouble("numForms", &numForms) &&
- dict->GetString("innerText", &innerText) &&
- dict->GetString("textContent", &textContent) &&
- dict->GetString("innerHTML", &innerHTML))) {
- return std::vector<double>();
- }
-
- GURL parsed_url(url);
- if (!parsed_url.is_valid()) {
- return std::vector<double>();
- }
-
- return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
- numAnchors, numForms, innerText, textContent,
- innerHTML);
+ return features;
}
} // namespace dom_distiller
« no previous file with comments | « components/dom_distiller/core/page_features.h ('k') | components/dom_distiller/core/page_features_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698