| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
| 6 | 6 |
| 7 #include <string> | 7 #include <string> |
| 8 | 8 |
| 9 #include "base/json/json_reader.h" |
| 9 #include "third_party/re2/re2/re2.h" | 10 #include "third_party/re2/re2/re2.h" |
| 10 | 11 |
| 11 namespace dom_distiller { | 12 namespace dom_distiller { |
| 12 /* This code needs to derive features in the same way and order in which they | 13 /* This code needs to derive features in the same way and order in which they |
| 13 * are derived when training the model. Parts of that code are reproduced in the | 14 * are derived when training the model. Parts of that code are reproduced in the |
| 14 * comments below. | 15 * comments below. |
| 15 */ | 16 */ |
| 16 | 17 |
| 17 namespace { | 18 namespace { |
| 18 std::string GetLastSegment(const std::string& path) { | 19 std::string GetLastSegment(const std::string& path) { |
| (...skipping 21 matching lines...) Expand all Loading... |
// Returns true when |n| (the needle) occurs anywhere inside |h| (the
// haystack). Note the argument order: needle first, haystack second.
// An empty |n| is found at position 0, so it matches every |h|.
| 40 bool Contains(const std::string& n, const std::string& h) { | 41 bool Contains(const std::string& n, const std::string& h) { |
| 41 return h.find(n) != std::string::npos; | 42 return h.find(n) != std::string::npos; |
| 42 } | 43 } |
| 43 | 44 |
// Returns true when |s| ends with the suffix |t| (suffix first, subject
// second). The size comparison is evaluated first so that the unsigned
// subtraction s.size() - t.size() can never underflow.
| 44 bool EndsWith(const std::string& t, const std::string& s) { | 45 bool EndsWith(const std::string& t, const std::string& s) { |
| 45 return s.size() >= t.size() && | 46 return s.size() >= t.size() && |
| 46 s.compare(s.size() - t.size(), std::string::npos, t) == 0; | 47 s.compare(s.size() - t.size(), std::string::npos, t) == 0; |
| 47 } | 48 } |
| 48 } | 49 } |
| 49 | 50 |
// Number of derived features (29). Presumably the expected length of the
// vector returned by CalculateDerivedFeatures() -- TODO confirm against the
// full function body, which is elided in this view.
// NOTE(review): defined as a mutable int at namespace scope although the
// k-prefix suggests a constant; consider const if the header declaration
// allows it.
| 51 int kDerivedFeaturesCount = 29; |
| 52 |
// Builds the vector of derived feature values for a single page from the
// supplied URL, element/anchor/form counts and the three text strings.
// As the file-level comment states, features have to be derived in the same
// way and in the same order as in the model-training code, so the sequence
// of push_back() calls must not be rearranged.
// NOTE(review): most of the body is elided in this diff view; only the final
// two features and the return statement are visible here.
| 50 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, | 53 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, |
| 51 const GURL& url, | 54 const GURL& url, |
| 52 double numElements, | 55 double numElements, |
| 53 double numAnchors, | 56 double numAnchors, |
| 54 double numForms, | 57 double numForms, |
| 55 const std::string& innerText, | 58 const std::string& innerText, |
| 56 const std::string& textContent, | 59 const std::string& textContent, |
| 57 const std::string& innerHTML) { | 60 const std::string& innerHTML) { |
| 58 // In the training pipeline, the strings are explicitly encoded in utf-8 (as | 61 // In the training pipeline, the strings are explicitly encoded in utf-8 (as |
| 59 // they are here). | 62 // they are here). |
| (...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 125 // float(textContentWords) / max(1, innerHTMLWords), | 128 // float(textContentWords) / max(1, innerHTMLWords), |
// std::max<int> yields an int of at least 1, so the divisor below is never
// zero (this mirrors the max(1, ...) in the training expression above).
| 126 features.push_back(double(textContentWords) / | 129 features.push_back(double(textContentWords) / |
| 127 std::max<int>(1.0, innerHTMLWords)); | 130 std::max<int>(1.0, innerHTMLWords)); |
| 128 // 'innertexttextcontentwordcountratio', | 131 // 'innertexttextcontentwordcountratio', |
| 129 // float(innerTextWords) / max(1, textContentWords), | 132 // float(innerTextWords) / max(1, textContentWords), |
// Same divisor clamp as above: the denominator is at least 1.
| 130 features.push_back(double(innerTextWords) / | 133 features.push_back(double(innerTextWords) / |
| 131 std::max<int>(1.0, textContentWords)); | 134 std::max<int>(1.0, textContentWords)); |
| 132 return features; | 135 return features; |
| 133 } | 136 } |
| 134 | 137 |
// Computes the derived features from a JSON description of a page and
// forwards the extracted fields to CalculateDerivedFeatures().
//   OLD column: |json| itself is interpreted as the dictionary.
//   NEW column: |stringified_json| must hold a string; that string is parsed
//   with base::JSONReader::Read() and the parsed value is interpreted as the
//   dictionary.
// Returns an empty vector when any visible step fails: the value is not a
// string, the parse result is null, the parsed value is not a dictionary, or
// the extracted |url| is not a valid GURL. (The remaining field-extraction
// checks are elided in this diff view.)
// NOTE(review): |stringified_json| is dereferenced unconditionally, so
// callers must pass a non-null pointer.
| 135 std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) { | 138 std::vector<double> CalculateDerivedFeaturesFromJSON( |
| 139 const base::Value* stringified_json) { |
// Unwrap the string payload; anything other than a string value is rejected.
| 140 std::string stringified; |
| 141 if (!stringified_json->GetAsString(&stringified)) { |
| 142 return std::vector<double>(); |
| 143 } |
| 144 |
// Parse the payload text; a null result is treated as failure.
| 145 scoped_ptr<base::Value> json(base::JSONReader::Read(stringified)); |
| 146 if (!json) { |
| 147 return std::vector<double>(); |
| 148 } |
| 149 |
// The top-level value has to be a dictionary.
| 136 const base::DictionaryValue* dict; | 150 const base::DictionaryValue* dict; |
| 137 if (!json->GetAsDictionary(&dict)) { | 151 if (!json->GetAsDictionary(&dict)) { |
| 138 return std::vector<double>(); | 152 return std::vector<double>(); |
| 139 } | 153 } |
| 140 | 154 |
// Output slots for the dictionary lookups that follow.
| 141 bool isOGArticle = false; | 155 bool isOGArticle = false; |
| 142 std::string url, innerText, textContent, innerHTML; | 156 std::string url, innerText, textContent, innerHTML; |
| 143 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; | 157 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; |
| 144 | 158 |
| 145 if (!(dict->GetBoolean("opengraph", &isOGArticle) && | 159 if (!(dict->GetBoolean("opengraph", &isOGArticle) && |
| (...skipping 10 matching lines...) Expand all Loading... |
// The URL string must parse into a valid GURL before features are computed.
| 156 GURL parsed_url(url); | 170 GURL parsed_url(url); |
| 157 if (!parsed_url.is_valid()) { | 171 if (!parsed_url.is_valid()) { |
| 158 return std::vector<double>(); | 172 return std::vector<double>(); |
| 159 } | 173 } |
| 160 | 174 |
| 161 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, | 175 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, |
| 162 numAnchors, numForms, innerText, textContent, | 176 numAnchors, numForms, innerText, textContent, |
| 163 innerHTML); | 177 innerHTML); |
| 164 } | 178 } |
| 165 } | 179 } |
| OLD | NEW |