| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
| 6 | 6 |
| 7 #include <string> | 7 #include <string> |
| 8 | 8 |
| 9 #include "base/json/json_reader.h" | 9 #include "base/json/json_reader.h" |
| 10 #include "third_party/re2/re2/re2.h" | 10 #include "third_party/re2/re2/re2.h" |
| 11 | 11 |
| 12 namespace dom_distiller { | 12 namespace dom_distiller { |
| 13 /* This code needs to derive features in the same way and order in which they | 13 /* This code needs to derive features in the same way and order in which they |
| 14 * are derived when training the model. Parts of that code are reproduced in the | 14 * are derived when training the model. Parts of that code are reproduced in the |
| 15 * comments below. | 15 * comments below. |
| 16 */ | 16 */ |
| 17 | 17 |
| 18 namespace { | 18 namespace { |
| 19 |
// Returns the last segment of |path|, keeping one optional trailing slash.
//
// This mirrors the Python expression used when the model was trained:
//   return re.search('[^/]*\/?$', path).group(0)
//
// Examples: "" -> "", "/" -> "/", "/a/b" -> "b", "/a/b/" -> "b/",
// "abc" -> "abc" (with no slash the regex matches the whole string).
std::string GetLastSegment(const std::string& path) {
  if (path.empty())
    return std::string();

  // A one-character path is its own last segment. Handling it here also keeps
  // the |size() - 2| below from wrapping around.
  if (path.size() == 1)
    return path;

  // Begin the backwards search one character before the end so that a
  // trailing '/' stays attached to the segment instead of terminating it.
  const std::string::size_type slash = path.rfind('/', path.size() - 2);
  if (slash == std::string::npos)
    return path;
  return path.substr(slash + 1);
}
| 26 | 27 |
| 27 int CountMatches(const std::string& s, const std::string& p) { | 28 int CountMatches(const std::string& s, const std::string& p) { |
| 28 // return len(re.findall(p, s)) | 29 // return len(re.findall(p, s)) |
| (...skipping 10 matching lines...) Expand all Loading... |
| 39 } | 40 } |
| 40 | 41 |
// Reports whether |n| (the needle) occurs anywhere within |h| (the haystack).
// Note the argument order: needle first, haystack second. An empty needle is
// always considered present.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  if (position == std::string::npos)
    return false;
  return true;
}
| 44 | 45 |
// Reports whether |s| ends with the suffix |t|.
// Note the argument order: suffix first, full string second. An empty suffix
// matches every string.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match; bailing out here also
  // keeps the offset computation below from underflowing.
  if (t.size() > s.size())
    return false;

  const std::string::size_type tail_start = s.size() - t.size();
  const int difference = s.compare(tail_start, t.size(), t);
  return difference == 0;
}
| 49 } | 50 |
| 51 } // namespace |
| 50 | 52 |
| 51 int kDerivedFeaturesCount = 29; | 53 int kDerivedFeaturesCount = 29; |
| 52 | 54 |
| 53 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, | 55 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, |
| 54 const GURL& url, | 56 const GURL& url, |
| 55 double numElements, | 57 double numElements, |
| 56 double numAnchors, | 58 double numAnchors, |
| 57 double numForms, | 59 double numForms, |
| 58 const std::string& innerText, | 60 const std::string& innerText, |
| 59 const std::string& textContent, | 61 const std::string& textContent, |
| (...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 169 | 171 |
| 170 GURL parsed_url(url); | 172 GURL parsed_url(url); |
| 171 if (!parsed_url.is_valid()) { | 173 if (!parsed_url.is_valid()) { |
| 172 return std::vector<double>(); | 174 return std::vector<double>(); |
| 173 } | 175 } |
| 174 | 176 |
| 175 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, | 177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, |
| 176 numAnchors, numForms, innerText, textContent, | 178 numAnchors, numForms, innerText, textContent, |
| 177 innerHTML); | 179 innerHTML); |
| 178 } | 180 } |
| 179 } | 181 |
| 182 } // namespace dom_distiller |
| OLD | NEW |