OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
6 | 6 |
7 #include <string> | 7 #include <string> |
8 | 8 |
9 #include "base/json/json_reader.h" | 9 #include "third_party/WebKit/public/platform/WebDistillability.h" |
10 #include "third_party/re2/re2/re2.h" | 10 #include "third_party/re2/re2/re2.h" |
| 11 #include "url/gurl.h" |
11 | 12 |
12 namespace dom_distiller { | 13 namespace dom_distiller { |
13 /* This code needs to derive features in the same way and order in which they | 14 /* This code needs to derive features in the same way and order in which they |
14 * are derived when training the model. Parts of that code are reproduced in the | 15 * are derived when training the model. Parts of that code are reproduced in the |
15 * comments below. | 16 * comments below. |
16 */ | 17 */ |
17 | 18 |
18 namespace { | 19 namespace { |
19 | 20 |
// Returns the last segment of |path|, keeping a single trailing '/' when one
// is present. This mirrors the expression used by the training pipeline:
//   return re.search('[^/]*\/?$', path).group(0)
// Examples: "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/", "" -> "".
std::string GetLastSegment(const std::string& path) {
  if (path.empty())
    return std::string();
  // A one-character path is matched in full by the regex ("/" or a lone
  // non-slash character).
  if (path.size() == 1)
    return path;
  // Begin the backwards search one position before the final character, so a
  // trailing '/' is treated as part of the segment rather than as the
  // separator that precedes it.
  const std::string::size_type separator = path.rfind('/', path.size() - 2);
  // With no earlier separator the regex matches the entire path.
  return separator == std::string::npos ? path : path.substr(separator + 1);
}
27 | 28 |
28 int CountMatches(const std::string& s, const std::string& p) { | 29 int CountMatches(const std::string& s, const std::string& p) { |
29 // return len(re.findall(p, s)) | 30 // return len(re.findall(p, s)) |
30 re2::StringPiece sp(s); | 31 re2::StringPiece sp(s); |
31 re2::RE2 regexp(p); | 32 re2::RE2 regexp(p); |
32 int count = 0; | 33 int count = 0; |
33 while (re2::RE2::FindAndConsume(&sp, regexp)) | 34 while (re2::RE2::FindAndConsume(&sp, regexp)) |
34 count++; | 35 count++; |
35 return count; | 36 return count; |
36 } | 37 } |
37 | 38 |
38 int GetWordCount(const std::string& s) { | |
39 return CountMatches(s, "\\w+"); | |
40 } | |
41 | |
// Reports whether |n| (the needle) occurs anywhere inside |h| (the haystack).
// An empty |n| is found in every |h|.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  const bool found = position != std::string::npos;
  return found;
}
45 | 42 |
// Reports whether |s| ends with the suffix |t|. An empty |t| is a suffix of
// every |s|.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix can never be longer than the string it terminates.
  if (t.size() > s.size())
    return false;
  const std::string::size_type offset = s.size() - t.size();
  return s.compare(offset, t.size(), t) == 0;
}
50 | 47 |
51 } // namespace | 48 } // namespace |
52 | 49 |
53 int kDerivedFeaturesCount = 29; | 50 unsigned kDerivedFeaturesCount = 22; |
54 | 51 |
55 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, | 52 std::vector<double> CalculateDerivedFeatures( |
56 const GURL& url, | 53 const blink::WebDistillabilityFeatures& f, |
57 double numElements, | 54 const GURL& url) { |
58 double numAnchors, | |
59 double numForms, | |
60 const std::string& innerText, | |
61 const std::string& textContent, | |
62 const std::string& innerHTML) { | |
63 // In the training pipeline, the strings are explicitly encoded in utf-8 (as | |
64 // they are here). | |
65 const std::string& path = url.path(); | 55 const std::string& path = url.path(); |
66 int innerTextWords = GetWordCount(innerText); | |
67 int textContentWords = GetWordCount(textContent); | |
68 int innerHTMLWords = GetWordCount(innerHTML); | |
69 std::vector<double> features; | 56 std::vector<double> features; |
70 // 'opengraph', opengraph, | 57 // 'opengraph', opengraph, |
71 features.push_back(isOGArticle); | 58 features.push_back(f.openGraph); |
72 // 'forum', 'forum' in path, | 59 // 'forum', 'forum' in path, |
73 features.push_back(Contains("forum", path)); | 60 features.push_back(Contains("forum", path)); |
74 // 'index', 'index' in path, | 61 // 'index', 'index' in path, |
75 features.push_back(Contains("index", path)); | 62 features.push_back(Contains("index", path)); |
| 63 // 'search', 'search' in path, |
| 64 features.push_back(Contains("search", path)); |
76 // 'view', 'view' in path, | 65 // 'view', 'view' in path, |
77 features.push_back(Contains("view", path)); | 66 features.push_back(Contains("view", path)); |
| 67 // 'archive', 'archive' in path, |
| 68 features.push_back(Contains("archive", path)); |
78 // 'asp', '.asp' in path, | 69 // 'asp', '.asp' in path, |
79 features.push_back(Contains(".asp", path)); | 70 features.push_back(Contains(".asp", path)); |
80 // 'phpbb', 'phpbb' in path, | 71 // 'phpbb', 'phpbb' in path, |
81 features.push_back(Contains("phpbb", path)); | 72 features.push_back(Contains("phpbb", path)); |
82 // 'php', path.endswith('.php'), | 73 // 'php', path.endswith('.php'), |
83 features.push_back(EndsWith(".php", path)); | 74 features.push_back(EndsWith(".php", path)); |
84 // 'pathlength', len(path), | 75 // 'pathLength', len(path), |
85 features.push_back(path.size()); | 76 features.push_back(path.size()); |
86 // 'domain', len(path) < 2, | 77 // 'domain', len(path) < 2, |
87 features.push_back(path.size() < 2); | 78 features.push_back(path.size() < 2); |
88 // 'pathcomponents', CountMatches(path, r'\/.'), | 79 // 'pathComponents', CountMatches(path, r'\/.'), |
89 features.push_back(CountMatches(path, "\\/.")); | 80 features.push_back(CountMatches(path, "\\/.")); |
90 // 'slugdetector', CountMatches(path, r'[^\w/]'), | 81 // 'slugDetector', CountMatches(path, r'[^\w/]'), |
91 features.push_back(CountMatches(path, "[^\\w/]")); | 82 features.push_back(CountMatches(path, "[^\\w/]")); |
92 // 'pathnumbers', CountMatches(path, r'\d+'), | 83 // 'pathNumbers', CountMatches(path, r'\d+'), |
93 features.push_back(CountMatches(path, "\\d+")); | 84 features.push_back(CountMatches(path, "\\d+")); |
94 // 'lastSegmentLength', len(GetLastSegment(path)), | 85 // 'lastSegmentLength', len(GetLastSegment(path)), |
95 features.push_back(GetLastSegment(path).size()); | 86 features.push_back(GetLastSegment(path).size()); |
96 // 'formcount', numForms, | 87 // 'formCount', numForms, |
97 features.push_back(numForms); | 88 features.push_back(f.formCount); |
98 // 'anchorcount', numAnchors, | 89 // 'anchorCount', numAnchors, |
99 features.push_back(numAnchors); | 90 features.push_back(f.anchorCount); |
100 // 'elementcount', numElements, | 91 // 'elementCount', numElements, |
101 features.push_back(numElements); | 92 features.push_back(f.elementCount); |
102 // 'anchorratio', float(numAnchors) / max(1, numElements), | 93 // 'anchorRatio', float(numAnchors) / max(1, numElements), |
103 features.push_back(double(numAnchors) / std::max<double>(1, numElements)); | 94 features.push_back( |
104 // 'innertextlength', len(innerText), | 95 double(f.anchorCount) / std::max<double>(1, f.elementCount)); |
105 features.push_back(innerText.size()); | 96 // 'mozScore' |
106 // 'textcontentlength', len(textContent), | 97 features.push_back(f.mozScore); |
107 features.push_back(textContent.size()); | 98 // 'mozScoreAllSqrt' |
108 // 'innerhtmllength', len(innerHTML), | 99 features.push_back(f.mozScoreAllSqrt); |
109 features.push_back(innerHTML.size()); | 100 // 'mozScoreAllLinear' |
110 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), | 101 features.push_back(f.mozScoreAllLinear); |
111 features.push_back(double(innerText.size()) / | 102 |
112 std::max<double>(1.0, innerHTML.size())); | |
113 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), | |
114 features.push_back(double(textContent.size()) / | |
115 std::max<double>(1.0, innerHTML.size())); | |
116 // 'innertexttextcontentlengthratio', | |
117 // float(len(innerText)) / max(1, len(textContent)), | |
118 features.push_back(double(innerText.size()) / | |
119 std::max<double>(1.0, textContent.size())); | |
120 // 'innertextwordcount', innerTextWords, | |
121 features.push_back(innerTextWords); | |
122 // 'textcontentwordcount', textContentWords, | |
123 features.push_back(textContentWords); | |
124 // 'innerhtmlwordcount', innerHTMLWords, | |
125 features.push_back(innerHTMLWords); | |
126 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords), | |
127 features.push_back(double(innerTextWords) / | |
128 std::max<int>(1.0, innerHTMLWords)); | |
129 // 'textcontentwordcountratio', | |
130 // float(textContentWords) / max(1, innerHTMLWords), | |
131 features.push_back(double(textContentWords) / | |
132 std::max<int>(1.0, innerHTMLWords)); | |
133 // 'innertexttextcontentwordcountratio', | |
134 // float(innerTextWords) / max(1, textContentWords), | |
135 features.push_back(double(innerTextWords) / | |
136 std::max<int>(1.0, textContentWords)); | |
137 return features; | 103 return features; |
138 } | 104 } |
139 | 105 |
140 std::vector<double> CalculateDerivedFeaturesFromJSON( | |
141 const base::Value* stringified_json) { | |
142 std::string stringified; | |
143 if (!stringified_json->GetAsString(&stringified)) { | |
144 return std::vector<double>(); | |
145 } | |
146 | |
147 scoped_ptr<base::Value> json = base::JSONReader::Read(stringified); | |
148 if (!json) { | |
149 return std::vector<double>(); | |
150 } | |
151 | |
152 const base::DictionaryValue* dict; | |
153 if (!json->GetAsDictionary(&dict)) { | |
154 return std::vector<double>(); | |
155 } | |
156 | |
157 bool isOGArticle = false; | |
158 std::string url, innerText, textContent, innerHTML; | |
159 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; | |
160 | |
161 if (!(dict->GetBoolean("opengraph", &isOGArticle) && | |
162 dict->GetString("url", &url) && | |
163 dict->GetDouble("numElements", &numElements) && | |
164 dict->GetDouble("numAnchors", &numAnchors) && | |
165 dict->GetDouble("numForms", &numForms) && | |
166 dict->GetString("innerText", &innerText) && | |
167 dict->GetString("textContent", &textContent) && | |
168 dict->GetString("innerHTML", &innerHTML))) { | |
169 return std::vector<double>(); | |
170 } | |
171 | |
172 GURL parsed_url(url); | |
173 if (!parsed_url.is_valid()) { | |
174 return std::vector<double>(); | |
175 } | |
176 | |
177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, | |
178 numAnchors, numForms, innerText, textContent, | |
179 innerHTML); | |
180 } | |
181 | |
182 } // namespace dom_distiller | 106 } // namespace dom_distiller |
OLD | NEW |