components/dom_distiller/core/page_features.cc - Issue 1042053003: Add calculation of derived features for distillable page model

Side by Side Diff: components/dom_distiller/core/page_features.cc

Issue 1042053003: Add calculation of derived features for distillable page model (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Calculate paths correctly Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « components/dom_distiller/core/page_features.h ('k') | components/dom_distiller/core/page_features_unittest.cc » ('j') | components/dom_distiller/core/page_features_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "components/dom_distiller/core/page_features.h"

	6

	7 #include <string>

	8

	9 #include "third_party/re2/re2/re2.h"

	10

	11 namespace dom_distiller {

	12 /* This code needs to derive features in the same way and order in which they

	13 * are derived when training the model. Parts of that code are reproduced in the

	14 * comments below.

	15 */

	16

	17 namespace {

	// Returns the last segment of |path|, keeping a single trailing '/' if one is
	// present. This mirrors the training pipeline's
	//   return re.search('[^/]*\/?$', path).group(0)
	// Examples: "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/", "" -> "".
	std::string GetLastSegment(const std::string& path) {
	  // With fewer than two characters there is no earlier '/' to split on; the
	  // regex matches the whole string ("" or a single character such as "/").
	  if (path.size() < 2)
	    return path;

	  // Start the search one character before the end so that a trailing '/'
	  // stays part of the returned segment instead of being used as the separator.
	  const std::string::size_type separator = path.rfind('/', path.size() - 2);

	  // Without an earlier '/', the regex matches the entire string.
	  if (separator == std::string::npos)
	    return path;
	  return path.substr(separator + 1);
	}

	25

	26 int CountMatches(const std::string& s, const std::string& p) {

	27 // return len(re.findall(p, s))

	28 re2::StringPiece sp(s);

	29 re2::RE2 regexp(p);

	30 int count = 0;

	31 while (re2::RE2::FindAndConsume(&sp, regexp))

	32 count++;

	33 return count;

	34 }

	35

	36 int GetWordCount(const std::string& s) {

	37 return CountMatches(s, "\\w+");

	38 }

	39

	// Returns true if |n| (the needle) occurs anywhere inside |h| (the haystack).
	bool Contains(const std::string& n, const std::string& h) {
	  const std::string::size_type position = h.find(n);
	  return position != std::string::npos;
	}

	43

	// Returns true if |s| ends with the suffix |t|.
	bool EndsWith(const std::string& t, const std::string& s) {
	  // A suffix longer than the string itself can never match.
	  if (t.size() > s.size())
	    return false;

	  const std::string::size_type tail_start = s.size() - t.size();
	  return s.compare(tail_start, std::string::npos, t) == 0;
	}

	48 }

	49

	50 std::vector<double> CalculateDerivedFeatures(bool isOGArticle,

	51 const GURL& url,

	52 double numElements,

	53 double numAnchors,

	54 double numForms,

	55 const std::string& innerText,

	56 const std::string& textContent,

	57 const std::string& innerHTML) {

	58 // In the training pipeline, the strings are explicitly encoded in utf-8 (as

	59 // they are here).

	60 const std::string& path = url.path();

	61 int innerTextWords = GetWordCount(innerText);

	62 int textContentWords = GetWordCount(textContent);

	63 int innerHTMLWords = GetWordCount(innerHTML);

	64 std::vector<double> features;

	65 // 'opengraph', opengraph,

	66 features.push_back(isOGArticle);

	67 // 'forum', 'forum' in path,

	68 features.push_back(Contains("forum", path));

	69 // 'index', 'index' in path,

	70 features.push_back(Contains("index", path));

	71 // 'view', 'view' in path,

	72 features.push_back(Contains("view", path));

	73 // 'asp', '.asp' in path,

	74 features.push_back(Contains(".asp", path));

	75 // 'phpbb', 'phpbb' in path,

	76 features.push_back(Contains(".phpbb", path));
	nyquist 2015/04/01 01:19:14 The contains-statement here starts with '.', but t The contains-statement here starts with '.', but the python does not. cjhopman 2015/04/01 20:12:13 Done. Show quoted text On 2015/04/01 01:19:14, nyquist wrote: > The contains-statement here starts with '.', but the python does not. Done.
	77 // 'php', path.endswith('.php'),

	78 features.push_back(EndsWith(".php", path));

	79 // 'pathlength', len(path),

	80 features.push_back(path.size());

	81 // 'domain', len(path) < 2,

	82 features.push_back(path.size() < 2);

	83 // 'pathcomponents', CountMatches(path, r'\/.'),

	84 features.push_back(CountMatches(path, "\\/."));

	85 // 'slugdetector', CountMatches(path, r'[^\w/]'),

	86 features.push_back(CountMatches(path, "[^\\w/]"));

	87 // 'pathnumbers', CountMatches(path, r'\d+'),

	88 features.push_back(CountMatches(path, "\\d+"));

	89 // 'lastSegmentLength', len(GetLastSegment(path)),

	90 features.push_back(GetLastSegment(path).size());

	91 // 'formcount', numForms,

	92 features.push_back(numForms);

	93 // 'anchorcount', numAnchors,

	94 features.push_back(numAnchors);

	95 // 'elementcount', numElements,

	96 features.push_back(numElements);

	97 // 'anchorratio', float(numAnchors) / max(1, numElements),

	98 features.push_back(double(numAnchors) / std::max<double>(1, numElements));

	99 // 'innertextlength', len(innerText),

	100 features.push_back(innerText.size());

	101 // 'textcontentlength', len(textContent),

	102 features.push_back(textContent.size());

	103 // 'innerhtmllength', len(innerHTML),

	104 features.push_back(innerHTML.size());

	105 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),

	106 features.push_back(double(innerText.size()) /

	107 std::max<double>(1.0, innerHTML.size()));

	108 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),

	109 features.push_back(double(textContent.size()) /

	110 std::max<double>(1.0, innerHTML.size()));

	111 // 'innertexttextcontentlengthratio',

	112 // float(len(innerText)) / max(1, len(textContent)),

	113 features.push_back(double(innerText.size()) /

	114 std::max<double>(1.0, textContent.size()));

	115 // 'innertextwordcount', innerTextWords,

	116 features.push_back(innerTextWords);

	117 // 'textcontentwordcount', textContentWords,

	118 features.push_back(textContentWords);

	119 // 'innerhtmlwordcount', innerHTMLWords,

	120 features.push_back(innerHTMLWords);

	121 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),

	122 features.push_back(double(innerTextWords) /

	123 std::max<int>(1.0, innerHTMLWords));

	124 // 'textcontentwordcountratio',

	125 // float(textContentWords) / max(1, innerHTMLWords),

	126 features.push_back(double(textContentWords) /

	127 std::max<int>(1.0, innerHTMLWords));

	128 // 'innertexttextcontentwordcountratio',

	129 // float(innerTextWords) / max(1, textContentWords),

	130 features.push_back(double(innerTextWords) /

	131 std::max<int>(1.0, textContentWords));

	132 return features;

	133 }

	134

	135 std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) {

	136 const base::DictionaryValue* dict;

	137 if (!json->GetAsDictionary(&dict)) {

	138 return std::vector<double>();

	139 }

	140

	141 bool isOGArticle;

	142 std::string url, innerText, textContent, innerHTML;

	143 double numElements, numAnchors, numForms;

	144

	145 if (!(dict->GetBoolean("opengraph", &isOGArticle) &&

	146 dict->GetString("url", &url) &&

	147 dict->GetDouble("numElements", &numElements) &&

	148 dict->GetDouble("numAnchors", &numAnchors) &&

	149 dict->GetDouble("numForms", &numForms) &&

	150 dict->GetString("innerText", &innerText) &&

	151 dict->GetString("textContent", &textContent) &&

	152 dict->GetString("innerHTML", &innerHTML))) {

	153 return std::vector<double>();

	154 }

	155

	156 GURL parsed_url(url);

	157 if (!parsed_url.is_valid()) {

	158 return std::vector<double>();

	159 }

	160

	161 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,

	162 numAnchors, numForms, innerText, textContent,

	163 innerHTML);

	164 }

	165 }

OLD	NEW