Index: components/dom_distiller/core/page_features.cc |
diff --git a/components/dom_distiller/core/page_features.cc b/components/dom_distiller/core/page_features.cc |
index f931bbe5cb23cdc75513005d00cac20c84b92b93..3cdd74d6726dc8d6a8e1665c3f42a7c17cef96db 100644 |
--- a/components/dom_distiller/core/page_features.cc |
+++ b/components/dom_distiller/core/page_features.cc |
@@ -6,8 +6,9 @@ |
#include <string> |
-#include "base/json/json_reader.h" |
+#include "third_party/WebKit/public/platform/WebDistillability.h" |
#include "third_party/re2/re2/re2.h" |
+#include "url/gurl.h" |
namespace dom_distiller { |
/* This code needs to derive features in the same way and order in which they |
@@ -35,10 +36,6 @@ int CountMatches(const std::string& s, const std::string& p) { |
return count; |
} |
-int GetWordCount(const std::string& s) { |
- return CountMatches(s, "\\w+"); |
-} |
- |
bool Contains(const std::string& n, const std::string& h) { |
return h.find(n) != std::string::npos; |
} |
@@ -50,133 +47,60 @@ bool EndsWith(const std::string& t, const std::string& s) { |
} // namespace |
-int kDerivedFeaturesCount = 29; |
+unsigned kDerivedFeaturesCount = 22; |
-std::vector<double> CalculateDerivedFeatures(bool isOGArticle, |
- const GURL& url, |
- double numElements, |
- double numAnchors, |
- double numForms, |
- const std::string& innerText, |
- const std::string& textContent, |
- const std::string& innerHTML) { |
- // In the training pipeline, the strings are explicitly encoded in utf-8 (as |
- // they are here). |
+std::vector<double> CalculateDerivedFeatures( |
+ const blink::WebDistillabilityFeatures& f, |
+ const GURL& url) { |
const std::string& path = url.path(); |
- int innerTextWords = GetWordCount(innerText); |
- int textContentWords = GetWordCount(textContent); |
- int innerHTMLWords = GetWordCount(innerHTML); |
std::vector<double> features; |
// 'opengraph', opengraph, |
- features.push_back(isOGArticle); |
+ features.push_back(f.openGraph); |
// 'forum', 'forum' in path, |
features.push_back(Contains("forum", path)); |
// 'index', 'index' in path, |
features.push_back(Contains("index", path)); |
+ // 'search', 'search' in path, |
+ features.push_back(Contains("search", path)); |
// 'view', 'view' in path, |
features.push_back(Contains("view", path)); |
+ // 'archive', 'archive' in path, |
+ features.push_back(Contains("archive", path)); |
// 'asp', '.asp' in path, |
features.push_back(Contains(".asp", path)); |
// 'phpbb', 'phpbb' in path, |
features.push_back(Contains("phpbb", path)); |
// 'php', path.endswith('.php'), |
features.push_back(EndsWith(".php", path)); |
- // 'pathlength', len(path), |
+ // 'pathLength', len(path), |
features.push_back(path.size()); |
// 'domain', len(path) < 2, |
features.push_back(path.size() < 2); |
- // 'pathcomponents', CountMatches(path, r'\/.'), |
+ // 'pathComponents', CountMatches(path, r'\/.'), |
features.push_back(CountMatches(path, "\\/.")); |
- // 'slugdetector', CountMatches(path, r'[^\w/]'), |
+ // 'slugDetector', CountMatches(path, r'[^\w/]'), |
features.push_back(CountMatches(path, "[^\\w/]")); |
- // 'pathnumbers', CountMatches(path, r'\d+'), |
+ // 'pathNumbers', CountMatches(path, r'\d+'), |
features.push_back(CountMatches(path, "\\d+")); |
// 'lastSegmentLength', len(GetLastSegment(path)), |
features.push_back(GetLastSegment(path).size()); |
- // 'formcount', numForms, |
- features.push_back(numForms); |
- // 'anchorcount', numAnchors, |
- features.push_back(numAnchors); |
- // 'elementcount', numElements, |
- features.push_back(numElements); |
- // 'anchorratio', float(numAnchors) / max(1, numElements), |
- features.push_back(double(numAnchors) / std::max<double>(1, numElements)); |
- // 'innertextlength', len(innerText), |
- features.push_back(innerText.size()); |
- // 'textcontentlength', len(textContent), |
- features.push_back(textContent.size()); |
- // 'innerhtmllength', len(innerHTML), |
- features.push_back(innerHTML.size()); |
- // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), |
- features.push_back(double(innerText.size()) / |
- std::max<double>(1.0, innerHTML.size())); |
- // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), |
- features.push_back(double(textContent.size()) / |
- std::max<double>(1.0, innerHTML.size())); |
- // 'innertexttextcontentlengthratio', |
- // float(len(innerText)) / max(1, len(textContent)), |
- features.push_back(double(innerText.size()) / |
- std::max<double>(1.0, textContent.size())); |
- // 'innertextwordcount', innerTextWords, |
- features.push_back(innerTextWords); |
- // 'textcontentwordcount', textContentWords, |
- features.push_back(textContentWords); |
- // 'innerhtmlwordcount', innerHTMLWords, |
- features.push_back(innerHTMLWords); |
- // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords), |
- features.push_back(double(innerTextWords) / |
- std::max<int>(1.0, innerHTMLWords)); |
- // 'textcontentwordcountratio', |
- // float(textContentWords) / max(1, innerHTMLWords), |
- features.push_back(double(textContentWords) / |
- std::max<int>(1.0, innerHTMLWords)); |
- // 'innertexttextcontentwordcountratio', |
- // float(innerTextWords) / max(1, textContentWords), |
- features.push_back(double(innerTextWords) / |
- std::max<int>(1.0, textContentWords)); |
- return features; |
-} |
- |
-std::vector<double> CalculateDerivedFeaturesFromJSON( |
- const base::Value* stringified_json) { |
- std::string stringified; |
- if (!stringified_json->GetAsString(&stringified)) { |
- return std::vector<double>(); |
- } |
- |
- scoped_ptr<base::Value> json = base::JSONReader::Read(stringified); |
- if (!json) { |
- return std::vector<double>(); |
- } |
+ // 'formCount', numForms, |
+ features.push_back(f.formCount); |
+ // 'anchorCount', numAnchors, |
+ features.push_back(f.anchorCount); |
+ // 'elementCount', numElements, |
+ features.push_back(f.elementCount); |
+ // 'anchorRatio', float(numAnchors) / max(1, numElements), |
+ features.push_back( |
+ double(f.anchorCount) / std::max<double>(1, f.elementCount)); |
+ // 'mozScore' |
+ features.push_back(f.mozScore); |
+ // 'mozScoreAllSqrt' |
+ features.push_back(f.mozScoreAllSqrt); |
+ // 'mozScoreAllLinear' |
+ features.push_back(f.mozScoreAllLinear); |
- const base::DictionaryValue* dict; |
- if (!json->GetAsDictionary(&dict)) { |
- return std::vector<double>(); |
- } |
- |
- bool isOGArticle = false; |
- std::string url, innerText, textContent, innerHTML; |
- double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; |
- |
- if (!(dict->GetBoolean("opengraph", &isOGArticle) && |
- dict->GetString("url", &url) && |
- dict->GetDouble("numElements", &numElements) && |
- dict->GetDouble("numAnchors", &numAnchors) && |
- dict->GetDouble("numForms", &numForms) && |
- dict->GetString("innerText", &innerText) && |
- dict->GetString("textContent", &textContent) && |
- dict->GetString("innerHTML", &innerHTML))) { |
- return std::vector<double>(); |
- } |
- |
- GURL parsed_url(url); |
- if (!parsed_url.is_valid()) { |
- return std::vector<double>(); |
- } |
- |
- return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, |
- numAnchors, numForms, innerText, textContent, |
- innerHTML); |
+ return features; |
} |
} // namespace dom_distiller |