| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
| 6 | 6 |
| 7 #include <string> | 7 #include <string> |
| 8 | 8 |
| 9 #include "base/json/json_reader.h" | 9 #include "third_party/WebKit/public/platform/WebDistillability.h" |
| 10 #include "third_party/re2/re2/re2.h" | 10 #include "third_party/re2/re2/re2.h" |
| 11 #include "url/gurl.h" |
| 11 | 12 |
| 12 namespace dom_distiller { | 13 namespace dom_distiller { |
| 13 /* This code needs to derive features in the same way and order in which they | 14 /* This code needs to derive features in the same way and order in which they |
| 14 * are derived when training the model. Parts of that code are reproduced in the | 15 * are derived when training the model. Parts of that code are reproduced in the |
| 15 * comments below. | 16 * comments below. |
| 16 */ | 17 */ |
| 17 | 18 |
| 18 namespace { | 19 namespace { |
| 19 | 20 |
// Returns the last segment of |path|, including a single trailing slash if
// one is present. This mirrors the expression used by the training pipeline:
//
//   return re.search('[^/]*\/?$', path).group(0)
//
// Examples: "/a/b.html" -> "b.html", "/a/b/" -> "b/", "/" -> "/", "" -> "".
std::string GetLastSegment(const std::string& path) {
  // With zero or one character the regex matches the entire string
  // ("" -> "", "/" -> "/", "a" -> "a").
  if (path.size() <= 1)
    return path;
  // Begin the search one character before the end so that a trailing slash is
  // kept as part of the segment instead of being treated as its separator.
  const std::string::size_type slash = path.rfind('/', path.size() - 2);
  // No separator ahead of the last character: the regex matches the whole
  // string.
  if (slash == std::string::npos)
    return path;
  return path.substr(slash + 1);
}
| 27 | 28 |
| 28 int CountMatches(const std::string& s, const std::string& p) { | 29 int CountMatches(const std::string& s, const std::string& p) { |
| 29 // return len(re.findall(p, s)) | 30 // return len(re.findall(p, s)) |
| 30 re2::StringPiece sp(s); | 31 re2::StringPiece sp(s); |
| 31 re2::RE2 regexp(p); | 32 re2::RE2 regexp(p); |
| 32 int count = 0; | 33 int count = 0; |
| 33 while (re2::RE2::FindAndConsume(&sp, regexp)) | 34 while (re2::RE2::FindAndConsume(&sp, regexp)) |
| 34 count++; | 35 count++; |
| 35 return count; | 36 return count; |
| 36 } | 37 } |
| 37 | 38 |
| 38 int GetWordCount(const std::string& s) { | |
| 39 return CountMatches(s, "\\w+"); | |
| 40 } | |
| 41 | |
// Returns true when |n| (the needle) occurs somewhere inside |h| (the
// haystack). Note the argument order: needle first, haystack second.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  return position != std::string::npos;
}
| 45 | 42 |
// Returns true when |s| ends with the suffix |t|. Note the argument order:
// suffix first, then the string being inspected.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match.
  if (t.size() > s.size())
    return false;
  const std::string::size_type offset = s.size() - t.size();
  return s.compare(offset, t.size(), t) == 0;
}
| 50 | 47 |
| 51 } // namespace | 48 } // namespace |
| 52 | 49 |
| 53 int kDerivedFeaturesCount = 29; | 50 unsigned kDerivedFeaturesCount = 22; |
| 54 | 51 |
| 55 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, | 52 std::vector<double> CalculateDerivedFeatures( |
| 56 const GURL& url, | 53 const blink::WebDistillabilityFeatures& f, |
| 57 double numElements, | 54 const GURL& url) { |
| 58 double numAnchors, | |
| 59 double numForms, | |
| 60 const std::string& innerText, | |
| 61 const std::string& textContent, | |
| 62 const std::string& innerHTML) { | |
| 63 // In the training pipeline, the strings are explicitly encoded in utf-8 (as | |
| 64 // they are here). | |
| 65 const std::string& path = url.path(); | 55 const std::string& path = url.path(); |
| 66 int innerTextWords = GetWordCount(innerText); | |
| 67 int textContentWords = GetWordCount(textContent); | |
| 68 int innerHTMLWords = GetWordCount(innerHTML); | |
| 69 std::vector<double> features; | 56 std::vector<double> features; |
| 70 // 'opengraph', opengraph, | 57 // 'opengraph', opengraph, |
| 71 features.push_back(isOGArticle); | 58 features.push_back(f.openGraph); |
| 72 // 'forum', 'forum' in path, | 59 // 'forum', 'forum' in path, |
| 73 features.push_back(Contains("forum", path)); | 60 features.push_back(Contains("forum", path)); |
| 74 // 'index', 'index' in path, | 61 // 'index', 'index' in path, |
| 75 features.push_back(Contains("index", path)); | 62 features.push_back(Contains("index", path)); |
| 63 // 'search', 'search' in path, |
| 64 features.push_back(Contains("search", path)); |
| 76 // 'view', 'view' in path, | 65 // 'view', 'view' in path, |
| 77 features.push_back(Contains("view", path)); | 66 features.push_back(Contains("view", path)); |
| 67 // 'archive', 'archive' in path, |
| 68 features.push_back(Contains("archive", path)); |
| 78 // 'asp', '.asp' in path, | 69 // 'asp', '.asp' in path, |
| 79 features.push_back(Contains(".asp", path)); | 70 features.push_back(Contains(".asp", path)); |
| 80 // 'phpbb', 'phpbb' in path, | 71 // 'phpbb', 'phpbb' in path, |
| 81 features.push_back(Contains("phpbb", path)); | 72 features.push_back(Contains("phpbb", path)); |
| 82 // 'php', path.endswith('.php'), | 73 // 'php', path.endswith('.php'), |
| 83 features.push_back(EndsWith(".php", path)); | 74 features.push_back(EndsWith(".php", path)); |
| 84 // 'pathlength', len(path), | 75 // 'pathLength', len(path), |
| 85 features.push_back(path.size()); | 76 features.push_back(path.size()); |
| 86 // 'domain', len(path) < 2, | 77 // 'domain', len(path) < 2, |
| 87 features.push_back(path.size() < 2); | 78 features.push_back(path.size() < 2); |
| 88 // 'pathcomponents', CountMatches(path, r'\/.'), | 79 // 'pathComponents', CountMatches(path, r'\/.'), |
| 89 features.push_back(CountMatches(path, "\\/.")); | 80 features.push_back(CountMatches(path, "\\/.")); |
| 90 // 'slugdetector', CountMatches(path, r'[^\w/]'), | 81 // 'slugDetector', CountMatches(path, r'[^\w/]'), |
| 91 features.push_back(CountMatches(path, "[^\\w/]")); | 82 features.push_back(CountMatches(path, "[^\\w/]")); |
| 92 // 'pathnumbers', CountMatches(path, r'\d+'), | 83 // 'pathNumbers', CountMatches(path, r'\d+'), |
| 93 features.push_back(CountMatches(path, "\\d+")); | 84 features.push_back(CountMatches(path, "\\d+")); |
| 94 // 'lastSegmentLength', len(GetLastSegment(path)), | 85 // 'lastSegmentLength', len(GetLastSegment(path)), |
| 95 features.push_back(GetLastSegment(path).size()); | 86 features.push_back(GetLastSegment(path).size()); |
| 96 // 'formcount', numForms, | 87 // 'formCount', numForms, |
| 97 features.push_back(numForms); | 88 features.push_back(f.formCount); |
| 98 // 'anchorcount', numAnchors, | 89 // 'anchorCount', numAnchors, |
| 99 features.push_back(numAnchors); | 90 features.push_back(f.anchorCount); |
| 100 // 'elementcount', numElements, | 91 // 'elementCount', numElements, |
| 101 features.push_back(numElements); | 92 features.push_back(f.elementCount); |
| 102 // 'anchorratio', float(numAnchors) / max(1, numElements), | 93 // 'anchorRatio', float(numAnchors) / max(1, numElements), |
| 103 features.push_back(double(numAnchors) / std::max<double>(1, numElements)); | 94 features.push_back( |
| 104 // 'innertextlength', len(innerText), | 95 double(f.anchorCount) / std::max<double>(1, f.elementCount)); |
| 105 features.push_back(innerText.size()); | 96 // 'mozScore' |
| 106 // 'textcontentlength', len(textContent), | 97 features.push_back(f.mozScore); |
| 107 features.push_back(textContent.size()); | 98 // 'mozScoreAllSqrt' |
| 108 // 'innerhtmllength', len(innerHTML), | 99 features.push_back(f.mozScoreAllSqrt); |
| 109 features.push_back(innerHTML.size()); | 100 // 'mozScoreAllLinear' |
| 110 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), | 101 features.push_back(f.mozScoreAllLinear); |
| 111 features.push_back(double(innerText.size()) / | 102 |
| 112 std::max<double>(1.0, innerHTML.size())); | |
| 113 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), | |
| 114 features.push_back(double(textContent.size()) / | |
| 115 std::max<double>(1.0, innerHTML.size())); | |
| 116 // 'innertexttextcontentlengthratio', | |
| 117 // float(len(innerText)) / max(1, len(textContent)), | |
| 118 features.push_back(double(innerText.size()) / | |
| 119 std::max<double>(1.0, textContent.size())); | |
| 120 // 'innertextwordcount', innerTextWords, | |
| 121 features.push_back(innerTextWords); | |
| 122 // 'textcontentwordcount', textContentWords, | |
| 123 features.push_back(textContentWords); | |
| 124 // 'innerhtmlwordcount', innerHTMLWords, | |
| 125 features.push_back(innerHTMLWords); | |
| 126 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords), | |
| 127 features.push_back(double(innerTextWords) / | |
| 128 std::max<int>(1.0, innerHTMLWords)); | |
| 129 // 'textcontentwordcountratio', | |
| 130 // float(textContentWords) / max(1, innerHTMLWords), | |
| 131 features.push_back(double(textContentWords) / | |
| 132 std::max<int>(1.0, innerHTMLWords)); | |
| 133 // 'innertexttextcontentwordcountratio', | |
| 134 // float(innerTextWords) / max(1, textContentWords), | |
| 135 features.push_back(double(innerTextWords) / | |
| 136 std::max<int>(1.0, textContentWords)); | |
| 137 return features; | 103 return features; |
| 138 } | 104 } |
| 139 | 105 |
| 140 std::vector<double> CalculateDerivedFeaturesFromJSON( | |
| 141 const base::Value* stringified_json) { | |
| 142 std::string stringified; | |
| 143 if (!stringified_json->GetAsString(&stringified)) { | |
| 144 return std::vector<double>(); | |
| 145 } | |
| 146 | |
| 147 scoped_ptr<base::Value> json = base::JSONReader::Read(stringified); | |
| 148 if (!json) { | |
| 149 return std::vector<double>(); | |
| 150 } | |
| 151 | |
| 152 const base::DictionaryValue* dict; | |
| 153 if (!json->GetAsDictionary(&dict)) { | |
| 154 return std::vector<double>(); | |
| 155 } | |
| 156 | |
| 157 bool isOGArticle = false; | |
| 158 std::string url, innerText, textContent, innerHTML; | |
| 159 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; | |
| 160 | |
| 161 if (!(dict->GetBoolean("opengraph", &isOGArticle) && | |
| 162 dict->GetString("url", &url) && | |
| 163 dict->GetDouble("numElements", &numElements) && | |
| 164 dict->GetDouble("numAnchors", &numAnchors) && | |
| 165 dict->GetDouble("numForms", &numForms) && | |
| 166 dict->GetString("innerText", &innerText) && | |
| 167 dict->GetString("textContent", &textContent) && | |
| 168 dict->GetString("innerHTML", &innerHTML))) { | |
| 169 return std::vector<double>(); | |
| 170 } | |
| 171 | |
| 172 GURL parsed_url(url); | |
| 173 if (!parsed_url.is_valid()) { | |
| 174 return std::vector<double>(); | |
| 175 } | |
| 176 | |
| 177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, | |
| 178 numAnchors, numForms, innerText, textContent, | |
| 179 innerHTML); | |
| 180 } | |
| 181 | |
| 182 } // namespace dom_distiller | 106 } // namespace dom_distiller |
| OLD | NEW |