OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "components/dom_distiller/core/page_features.h" | |
6 | |
7 #include <string> | |
8 | |
9 #include "third_party/re2/re2/re2.h" | |
10 | |
11 namespace dom_distiller { | |
12 /* This code needs to derive features in the same way and order in which they | |
13 * are derived when training the model. Parts of that code are reproduced in the | |
14 * comments below. | |
15 */ | |
16 | |
17 namespace { | |
// Returns the last segment of |path|, including a trailing slash if present.
// Mirrors the training pipeline's
//   re.search('[^/]*\/?$', path).group(0)
// Examples: "/foo/bar" -> "bar", "/foo/bar/" -> "bar/", "/" -> "/", "" -> "".
std::string GetLastSegment(const std::string& path) {
  // With fewer than two characters the whole path is the match ("", "/" or a
  // single non-slash character).
  if (path.size() < 2)
    return path;
  // Start the backwards search one character before the end so that a trailing
  // slash is kept as part of the segment rather than treated as its separator.
  const size_t separator = path.rfind('/', path.size() - 2);
  // No separator before the final character: the regex matches the whole path.
  if (separator == std::string::npos)
    return path;
  return path.substr(separator + 1);
}
25 | |
26 int CountMatches(const std::string& s, const std::string& p) { | |
27 // return len(re.findall(p, s)) | |
28 re2::StringPiece sp(s); | |
29 re2::RE2 regexp(p); | |
30 int count = 0; | |
31 while (re2::RE2::FindAndConsume(&sp, regexp)) | |
32 count++; | |
33 return count; | |
34 } | |
35 | |
36 int GetWordCount(const std::string& s) { | |
37 return CountMatches(s, "\\w+"); | |
38 } | |
39 | |
// Returns true if needle |n| occurs anywhere inside haystack |h|.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  return !(position == std::string::npos);
}
43 | |
// Returns true if string |s| ends with suffix |t|.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match.
  if (t.size() > s.size())
    return false;
  const std::string::size_type tail_start = s.size() - t.size();
  return s.compare(tail_start, t.size(), t) == 0;
}
48 } | |
49 | |
50 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, | |
51 const GURL& url, | |
52 double numElements, | |
53 double numAnchors, | |
54 double numForms, | |
55 const std::string& innerText, | |
56 const std::string& textContent, | |
57 const std::string& innerHTML) { | |
58 // In the training pipeline, the strings are explicitly encoded in utf-8 (as | |
59 // they are here). | |
60 const std::string& path = url.path(); | |
61 int innerTextWords = GetWordCount(innerText); | |
62 int textContentWords = GetWordCount(textContent); | |
63 int innerHTMLWords = GetWordCount(innerHTML); | |
64 std::vector<double> features; | |
65 // 'opengraph', opengraph, | |
66 features.push_back(isOGArticle); | |
67 // 'forum', 'forum' in path, | |
68 features.push_back(Contains("forum", path)); | |
69 // 'index', 'index' in path, | |
70 features.push_back(Contains("index", path)); | |
71 // 'view', 'view' in path, | |
72 features.push_back(Contains("view", path)); | |
73 // 'asp', '.asp' in path, | |
74 features.push_back(Contains(".asp", path)); | |
75 // 'phpbb', 'phpbb' in path, | |
76 features.push_back(Contains(".phpbb", path)); | |
nyquist
2015/04/01 01:19:14
The contains-statement here starts with '.', but t
cjhopman
2015/04/01 20:12:13
Done.
| |
77 // 'php', path.endswith('.php'), | |
78 features.push_back(EndsWith(".php", path)); | |
79 // 'pathlength', len(path), | |
80 features.push_back(path.size()); | |
81 // 'domain', len(path) < 2, | |
82 features.push_back(path.size() < 2); | |
83 // 'pathcomponents', CountMatches(path, r'\/.'), | |
84 features.push_back(CountMatches(path, "\\/.")); | |
85 // 'slugdetector', CountMatches(path, r'[^\w/]'), | |
86 features.push_back(CountMatches(path, "[^\\w/]")); | |
87 // 'pathnumbers', CountMatches(path, r'\d+'), | |
88 features.push_back(CountMatches(path, "\\d+")); | |
89 // 'lastSegmentLength', len(GetLastSegment(path)), | |
90 features.push_back(GetLastSegment(path).size()); | |
91 // 'formcount', numForms, | |
92 features.push_back(numForms); | |
93 // 'anchorcount', numAnchors, | |
94 features.push_back(numAnchors); | |
95 // 'elementcount', numElements, | |
96 features.push_back(numElements); | |
97 // 'anchorratio', float(numAnchors) / max(1, numElements), | |
98 features.push_back(double(numAnchors) / std::max<double>(1, numElements)); | |
99 // 'innertextlength', len(innerText), | |
100 features.push_back(innerText.size()); | |
101 // 'textcontentlength', len(textContent), | |
102 features.push_back(textContent.size()); | |
103 // 'innerhtmllength', len(innerHTML), | |
104 features.push_back(innerHTML.size()); | |
105 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), | |
106 features.push_back(double(innerText.size()) / | |
107 std::max<double>(1.0, innerHTML.size())); | |
108 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), | |
109 features.push_back(double(textContent.size()) / | |
110 std::max<double>(1.0, innerHTML.size())); | |
111 // 'innertexttextcontentlengthratio', | |
112 // float(len(innerText)) / max(1, len(textContent)), | |
113 features.push_back(double(innerText.size()) / | |
114 std::max<double>(1.0, textContent.size())); | |
115 // 'innertextwordcount', innerTextWords, | |
116 features.push_back(innerTextWords); | |
117 // 'textcontentwordcount', textContentWords, | |
118 features.push_back(textContentWords); | |
119 // 'innerhtmlwordcount', innerHTMLWords, | |
120 features.push_back(innerHTMLWords); | |
121 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords), | |
122 features.push_back(double(innerTextWords) / | |
123 std::max<int>(1.0, innerHTMLWords)); | |
124 // 'textcontentwordcountratio', | |
125 // float(textContentWords) / max(1, innerHTMLWords), | |
126 features.push_back(double(textContentWords) / | |
127 std::max<int>(1.0, innerHTMLWords)); | |
128 // 'innertexttextcontentwordcountratio', | |
129 // float(innerTextWords) / max(1, textContentWords), | |
130 features.push_back(double(innerTextWords) / | |
131 std::max<int>(1.0, textContentWords)); | |
132 return features; | |
133 } | |
134 | |
135 std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) { | |
136 const base::DictionaryValue* dict; | |
137 if (!json->GetAsDictionary(&dict)) { | |
138 return std::vector<double>(); | |
139 } | |
140 | |
141 bool isOGArticle; | |
142 std::string url, innerText, textContent, innerHTML; | |
143 double numElements, numAnchors, numForms; | |
144 | |
145 if (!(dict->GetBoolean("opengraph", &isOGArticle) && | |
146 dict->GetString("url", &url) && | |
147 dict->GetDouble("numElements", &numElements) && | |
148 dict->GetDouble("numAnchors", &numAnchors) && | |
149 dict->GetDouble("numForms", &numForms) && | |
150 dict->GetString("innerText", &innerText) && | |
151 dict->GetString("textContent", &textContent) && | |
152 dict->GetString("innerHTML", &innerHTML))) { | |
153 return std::vector<double>(); | |
154 } | |
155 | |
156 GURL parsed_url(url); | |
157 if (!parsed_url.is_valid()) { | |
158 return std::vector<double>(); | |
159 } | |
160 | |
161 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, | |
162 numAnchors, numForms, innerText, textContent, | |
163 innerHTML); | |
164 } | |
165 } | |
OLD | NEW |