| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
| 6 | 6 |
| 7 #include <stddef.h> | 7 #include <stddef.h> |
| 8 | 8 |
| 9 #include <string> | 9 #include <string> |
| 10 | 10 |
| 11 #include "base/json/json_reader.h" | 11 #include "base/json/json_reader.h" |
| 12 #include "third_party/re2/src/re2/re2.h" | 12 #include "third_party/re2/src/re2/re2.h" |
| 13 #include "url/gurl.h" | 13 #include "url/gurl.h" |
| 14 | 14 |
| 15 namespace dom_distiller { | 15 namespace dom_distiller { |
| 16 /* This code needs to derive features in the same way and order in which they | 16 /* This code needs to derive features in the same way and order in which they |
| 17 * are derived when training the model. Parts of that code are reproduced in the | 17 * are derived when training the model. Parts of that code are reproduced in the |
| 18 * comments below. | 18 * comments below. |
| 19 */ | 19 */ |
| 20 | 20 |
| 21 namespace { | 21 namespace { |
| 22 | 22 |
| 23 std::string GetLastSegment(const std::string& path) { | 23 std::string GetLastSegment(const std::string& path) { |
| 24 // return re.search('[^/]*\/?$', path).group(0) | 24 // return re.search('[^/]*\/?$', path).group(0) |
| 25 if (path.size() == 0) | 25 if (path.size() == 0) |
| 26 return ""; | 26 return ""; |
| 27 size_t start = path.rfind("/", path.size() - 1); | 27 if (path.size() == 1) { |
| 28 DCHECK(path[0] == '/'); |
| 29 return path; |
| 30 } |
| 31 size_t start = path.rfind("/", path.size() - 2); |
| 28 return start == std::string::npos ? "" : path.substr(start + 1); | 32 return start == std::string::npos ? "" : path.substr(start + 1); |
| 29 } | 33 } |
| 30 | 34 |
| 31 int CountMatches(const std::string& s, const std::string& p) { | 35 int CountMatches(const std::string& s, const std::string& p) { |
| 32 // return len(re.findall(p, s)) | 36 // return len(re.findall(p, s)) |
| 33 re2::StringPiece sp(s); | 37 re2::StringPiece sp(s); |
| 34 re2::RE2 regexp(p); | 38 re2::RE2 regexp(p); |
| 35 int count = 0; | 39 int count = 0; |
| 36 while (re2::RE2::FindAndConsume(&sp, regexp)) | 40 while (re2::RE2::FindAndConsume(&sp, regexp)) |
| 37 count++; | 41 count++; |
| (...skipping 198 matching lines...) | |
| 236 features.push_back(mozScore); | 240 features.push_back(mozScore); |
| 237 // 'mozScoreAllSqrt' | 241 // 'mozScoreAllSqrt' |
| 238 features.push_back(mozScoreAllSqrt); | 242 features.push_back(mozScoreAllSqrt); |
| 239 // 'mozScoreAllLinear' | 243 // 'mozScoreAllLinear' |
| 240 features.push_back(mozScoreAllLinear); | 244 features.push_back(mozScoreAllLinear); |
| 241 | 245 |
| 242 return features; | 246 return features; |
| 243 } | 247 } |
| 244 | 248 |
| 245 } // namespace dom_distiller | 249 } // namespace dom_distiller |
| OLD | NEW |