OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
6 | 6 |
7 #include <string> | 7 #include <string> |
8 | 8 |
| 9 #include "base/json/json_reader.h" |
9 #include "third_party/re2/re2/re2.h" | 10 #include "third_party/re2/re2/re2.h" |
10 | 11 |
11 namespace dom_distiller { | 12 namespace dom_distiller { |
12 /* This code needs to derive features in the same way and order in which they | 13 /* This code needs to derive features in the same way and order in which they |
13 * are derived when training the model. Parts of that code are reproduced in the | 14 * are derived when training the model. Parts of that code are reproduced in the |
14 * comments below. | 15 * comments below. |
15 */ | 16 */ |
16 | 17 |
17 namespace { | 18 namespace { |
18 std::string GetLastSegment(const std::string& path) { | 19 std::string GetLastSegment(const std::string& path) { |
(...skipping 21 matching lines...) Expand all Loading... |
// Substring test with the arguments in (needle, haystack) order: returns true
// when |n| occurs anywhere inside |h|, i.e. when h.find(n) does not yield
// std::string::npos.
40 bool Contains(const std::string& n, const std::string& h) { | 41 bool Contains(const std::string& n, const std::string& h) { |
41 return h.find(n) != std::string::npos; | 42 return h.find(n) != std::string::npos; |
42 } | 43 } |
43 | 44 |
// Suffix test with the arguments in (suffix, string) order: returns true when
// |s| is at least as long as |t| and the trailing t.size() characters of |s|
// compare equal to |t|.
44 bool EndsWith(const std::string& t, const std::string& s) { | 45 bool EndsWith(const std::string& t, const std::string& s) { |
// The length check comes first so that s.size() - t.size() below cannot wrap
// around (both are unsigned).
45 return s.size() >= t.size() && | 46 return s.size() >= t.size() && |
46 s.compare(s.size() - t.size(), std::string::npos, t) == 0; | 47 s.compare(s.size() - t.size(), std::string::npos, t) == 0; |
47 } | 48 } |
48 } | 49 } |
49 | 50 |
// Present only in the NEW column of this diff.
// NOTE(review): presumably the number of entries CalculateDerivedFeatures()
// pushes into its result vector; most of the push_back calls are elided in
// this view, so confirm the value against the full function. The variable is
// defined as a non-const int here.
| 51 int kDerivedFeaturesCount = 29; |
| 52 |
50 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, | 53 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, |
51 const GURL& url, | 54 const GURL& url, |
52 double numElements, | 55 double numElements, |
53 double numAnchors, | 56 double numAnchors, |
54 double numForms, | 57 double numForms, |
55 const std::string& innerText, | 58 const std::string& innerText, |
56 const std::string& textContent, | 59 const std::string& textContent, |
57 const std::string& innerHTML) { | 60 const std::string& innerHTML) { |
58 // In the training pipeline, the strings are explicitly encoded in utf-8 (as | 61 // In the training pipeline, the strings are explicitly encoded in utf-8 (as |
59 // they are here). | 62 // they are here). |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
125 // float(textContentWords) / max(1, innerHTMLWords), | 128 // float(textContentWords) / max(1, innerHTMLWords), |
126 features.push_back(double(textContentWords) / | 129 features.push_back(double(textContentWords) / |
127 std::max<int>(1.0, innerHTMLWords)); | 130 std::max<int>(1.0, innerHTMLWords)); |
128 // 'innertexttextcontentwordcountratio', | 131 // 'innertexttextcontentwordcountratio', |
129 // float(innerTextWords) / max(1, textContentWords), | 132 // float(innerTextWords) / max(1, textContentWords), |
130 features.push_back(double(innerTextWords) / | 133 features.push_back(double(innerTextWords) / |
131 std::max<int>(1.0, textContentWords)); | 134 std::max<int>(1.0, textContentWords)); |
132 return features; | 135 return features; |
133 } | 136 } |
134 | 137 |
135 std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) { | 138 std::vector<double> CalculateDerivedFeaturesFromJSON( |
| 139 const base::Value* stringified_json) { |
| 140 std::string stringified; |
| 141 if (!stringified_json->GetAsString(&stringified)) { |
| 142 return std::vector<double>(); |
| 143 } |
| 144 |
| 145 scoped_ptr<base::Value> json(base::JSONReader::Read(stringified)); |
| 146 if (!json) { |
| 147 return std::vector<double>(); |
| 148 } |
| 149 |
136 const base::DictionaryValue* dict; | 150 const base::DictionaryValue* dict; |
137 if (!json->GetAsDictionary(&dict)) { | 151 if (!json->GetAsDictionary(&dict)) { |
138 return std::vector<double>(); | 152 return std::vector<double>(); |
139 } | 153 } |
140 | 154 |
141 bool isOGArticle = false; | 155 bool isOGArticle = false; |
142 std::string url, innerText, textContent, innerHTML; | 156 std::string url, innerText, textContent, innerHTML; |
143 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; | 157 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; |
144 | 158 |
145 if (!(dict->GetBoolean("opengraph", &isOGArticle) && | 159 if (!(dict->GetBoolean("opengraph", &isOGArticle) && |
(...skipping 10 matching lines...) Expand all Loading... |
156 GURL parsed_url(url); | 170 GURL parsed_url(url); |
157 if (!parsed_url.is_valid()) { | 171 if (!parsed_url.is_valid()) { |
158 return std::vector<double>(); | 172 return std::vector<double>(); |
159 } | 173 } |
160 | 174 |
161 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, | 175 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, |
162 numAnchors, numForms, innerText, textContent, | 176 numAnchors, numForms, innerText, textContent, |
163 innerHTML); | 177 innerHTML); |
164 } | 178 } |
165 } | 179 } |
OLD | NEW |