Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(229)

Side by Side Diff: components/dom_distiller/core/page_features.cc

Issue 1042053003: Add calculation of derived features for distillable page model (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Calculate paths correctly Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/dom_distiller/core/page_features.h"
6
7 #include <string>
8
9 #include "third_party/re2/re2/re2.h"
10
11 namespace dom_distiller {
12 /* This code needs to derive features in the same way and order in which they
13 * are derived when training the model. Parts of that code are reproduced in the
14 * comments below.
15 */
16
17 namespace {
// Returns the last segment of |path|, keeping one trailing '/' when the path
// ends with one. This mirrors the training pipeline's
//   re.search('[^/]*\/?$', path).group(0)
// Examples: "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/", "" -> "".
std::string GetLastSegment(const std::string& path) {
  // With at most one character the whole path is the match ("" or "/"); this
  // guard also keeps |path.size() - 2| below from wrapping around.
  if (path.size() <= 1)
    return path;
  // Start the backwards search one character before the end, so that a
  // trailing '/' stays part of the segment instead of being taken as the
  // separator that precedes it.
  const std::string::size_type separator = path.rfind('/', path.size() - 2);
  // Without an earlier '/', the pattern matches the entire path.
  return separator == std::string::npos ? path : path.substr(separator + 1);
}
25
26 int CountMatches(const std::string& s, const std::string& p) {
27 // return len(re.findall(p, s))
28 re2::StringPiece sp(s);
29 re2::RE2 regexp(p);
30 int count = 0;
31 while (re2::RE2::FindAndConsume(&sp, regexp))
32 count++;
33 return count;
34 }
35
36 int GetWordCount(const std::string& s) {
37 return CountMatches(s, "\\w+");
38 }
39
// Reports whether the needle |n| occurs anywhere inside the haystack |h|.
// An empty needle is found in every haystack, including an empty one.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  if (position == std::string::npos)
    return false;
  return true;
}
43
// Reports whether the string |s| finishes with the suffix |t|.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match; checking this first also
  // keeps the offset computed below from wrapping around.
  if (t.size() > s.size())
    return false;
  const std::string::size_type offset = s.size() - t.size();
  return s.compare(offset, t.size(), t) == 0;
}
48 }
49
50 std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
51 const GURL& url,
52 double numElements,
53 double numAnchors,
54 double numForms,
55 const std::string& innerText,
56 const std::string& textContent,
57 const std::string& innerHTML) {
58 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
59 // they are here).
60 const std::string& path = url.path();
61 int innerTextWords = GetWordCount(innerText);
62 int textContentWords = GetWordCount(textContent);
63 int innerHTMLWords = GetWordCount(innerHTML);
64 std::vector<double> features;
65 // 'opengraph', opengraph,
66 features.push_back(isOGArticle);
67 // 'forum', 'forum' in path,
68 features.push_back(Contains("forum", path));
69 // 'index', 'index' in path,
70 features.push_back(Contains("index", path));
71 // 'view', 'view' in path,
72 features.push_back(Contains("view", path));
73 // 'asp', '.asp' in path,
74 features.push_back(Contains(".asp", path));
75 // 'phpbb', 'phpbb' in path,
76 features.push_back(Contains(".phpbb", path));
nyquist 2015/04/01 01:19:14 The contains-statement here starts with '.', but t
cjhopman 2015/04/01 20:12:13 Done.
77 // 'php', path.endswith('.php'),
78 features.push_back(EndsWith(".php", path));
79 // 'pathlength', len(path),
80 features.push_back(path.size());
81 // 'domain', len(path) < 2,
82 features.push_back(path.size() < 2);
83 // 'pathcomponents', CountMatches(path, r'\/.'),
84 features.push_back(CountMatches(path, "\\/."));
85 // 'slugdetector', CountMatches(path, r'[^\w/]'),
86 features.push_back(CountMatches(path, "[^\\w/]"));
87 // 'pathnumbers', CountMatches(path, r'\d+'),
88 features.push_back(CountMatches(path, "\\d+"));
89 // 'lastSegmentLength', len(GetLastSegment(path)),
90 features.push_back(GetLastSegment(path).size());
91 // 'formcount', numForms,
92 features.push_back(numForms);
93 // 'anchorcount', numAnchors,
94 features.push_back(numAnchors);
95 // 'elementcount', numElements,
96 features.push_back(numElements);
97 // 'anchorratio', float(numAnchors) / max(1, numElements),
98 features.push_back(double(numAnchors) / std::max<double>(1, numElements));
99 // 'innertextlength', len(innerText),
100 features.push_back(innerText.size());
101 // 'textcontentlength', len(textContent),
102 features.push_back(textContent.size());
103 // 'innerhtmllength', len(innerHTML),
104 features.push_back(innerHTML.size());
105 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
106 features.push_back(double(innerText.size()) /
107 std::max<double>(1.0, innerHTML.size()));
108 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
109 features.push_back(double(textContent.size()) /
110 std::max<double>(1.0, innerHTML.size()));
111 // 'innertexttextcontentlengthratio',
112 // float(len(innerText)) / max(1, len(textContent)),
113 features.push_back(double(innerText.size()) /
114 std::max<double>(1.0, textContent.size()));
115 // 'innertextwordcount', innerTextWords,
116 features.push_back(innerTextWords);
117 // 'textcontentwordcount', textContentWords,
118 features.push_back(textContentWords);
119 // 'innerhtmlwordcount', innerHTMLWords,
120 features.push_back(innerHTMLWords);
121 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
122 features.push_back(double(innerTextWords) /
123 std::max<int>(1.0, innerHTMLWords));
124 // 'textcontentwordcountratio',
125 // float(textContentWords) / max(1, innerHTMLWords),
126 features.push_back(double(textContentWords) /
127 std::max<int>(1.0, innerHTMLWords));
128 // 'innertexttextcontentwordcountratio',
129 // float(innerTextWords) / max(1, textContentWords),
130 features.push_back(double(innerTextWords) /
131 std::max<int>(1.0, textContentWords));
132 return features;
133 }
134
135 std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) {
136 const base::DictionaryValue* dict;
137 if (!json->GetAsDictionary(&dict)) {
138 return std::vector<double>();
139 }
140
141 bool isOGArticle;
142 std::string url, innerText, textContent, innerHTML;
143 double numElements, numAnchors, numForms;
144
145 if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
146 dict->GetString("url", &url) &&
147 dict->GetDouble("numElements", &numElements) &&
148 dict->GetDouble("numAnchors", &numAnchors) &&
149 dict->GetDouble("numForms", &numForms) &&
150 dict->GetString("innerText", &innerText) &&
151 dict->GetString("textContent", &textContent) &&
152 dict->GetString("innerHTML", &innerHTML))) {
153 return std::vector<double>();
154 }
155
156 GURL parsed_url(url);
157 if (!parsed_url.is_valid()) {
158 return std::vector<double>();
159 }
160
161 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
162 numAnchors, numForms, innerText, textContent,
163 innerHTML);
164 }
165 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698