Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(482)

Side by Side Diff: components/dom_distiller/core/page_features.cc

Issue 1047223003: Add integration of the new heuristics (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@dd-adaboost-model
Patch Set: whitelist resources for ios Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/page_features.h" 5 #include "components/dom_distiller/core/page_features.h"
6 6
7 #include <string> 7 #include <string>
8 8
9 #include "base/json/json_reader.h"
9 #include "third_party/re2/re2/re2.h" 10 #include "third_party/re2/re2/re2.h"
10 11
11 namespace dom_distiller { 12 namespace dom_distiller {
12 /* This code needs to derive features in the same way and order in which they 13 /* This code needs to derive features in the same way and order in which they
13 * are derived when training the model. Parts of that code are reproduced in the 14 * are derived when training the model. Parts of that code are reproduced in the
14 * comments below. 15 * comments below.
15 */ 16 */
16 17
17 namespace { 18 namespace {
18 std::string GetLastSegment(const std::string& path) { 19 std::string GetLastSegment(const std::string& path) {
(...skipping 21 matching lines...) Expand all
// Returns true when |n| (the needle) occurs anywhere inside |h| (the
// haystack). Note the argument order: needle first, haystack second.
// An empty needle is considered to be contained in every haystack.
bool Contains(const std::string& n, const std::string& h) {
  return h.find(n) != std::string::npos;
}
43 44
// Returns true when |s| ends with the suffix |t|. Note the argument order:
// suffix first, full string second. An empty suffix matches every string.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match; bailing out here also
  // keeps the start offset computed below from underflowing.
  if (t.size() > s.size())
    return false;
  // Compare exactly the trailing |t.size()| characters of |s| against |t|.
  return s.compare(s.size() - t.size(), t.size(), t) == 0;
}
48 } 49 }
49 50
51 int kDerivedFeaturesCount = 29;  // NOTE(review): presumably the number of entries CalculateDerivedFeatures() returns -- confirm against the elided feature list. Not const here, so it may be declared extern in page_features.h; verify before changing.
52
50 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, 53 std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
51 const GURL& url, 54 const GURL& url,
52 double numElements, 55 double numElements,
53 double numAnchors, 56 double numAnchors,
54 double numForms, 57 double numForms,
55 const std::string& innerText, 58 const std::string& innerText,
56 const std::string& textContent, 59 const std::string& textContent,
57 const std::string& innerHTML) { 60 const std::string& innerHTML) {
58 // In the training pipeline, the strings are explicitly encoded in utf-8 (as 61 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
59 // they are here). 62 // they are here).
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
125 // float(textContentWords) / max(1, innerHTMLWords), 128 // float(textContentWords) / max(1, innerHTMLWords),
126 features.push_back(double(textContentWords) / 129 features.push_back(double(textContentWords) /
127 std::max<int>(1.0, innerHTMLWords)); 130 std::max<int>(1.0, innerHTMLWords));
128 // 'innertexttextcontentwordcountratio', 131 // 'innertexttextcontentwordcountratio',
129 // float(innerTextWords) / max(1, textContentWords), 132 // float(innerTextWords) / max(1, textContentWords),
130 features.push_back(double(innerTextWords) / 133 features.push_back(double(innerTextWords) /
131 std::max<int>(1.0, textContentWords)); 134 std::max<int>(1.0, textContentWords));
132 return features; 135 return features;
133 } 136 }
134 137
135 std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) { 138 std::vector<double> CalculateDerivedFeaturesFromJSON(
139 const base::Value* stringified_json) {
140 std::string stringified;
141 if (!stringified_json->GetAsString(&stringified)) {
142 return std::vector<double>();
143 }
144
145 scoped_ptr<base::Value> json(base::JSONReader::Read(stringified));
146 if (!json) {
147 return std::vector<double>();
148 }
149
136 const base::DictionaryValue* dict; 150 const base::DictionaryValue* dict;
137 if (!json->GetAsDictionary(&dict)) { 151 if (!json->GetAsDictionary(&dict)) {
138 return std::vector<double>(); 152 return std::vector<double>();
139 } 153 }
140 154
141 bool isOGArticle = false; 155 bool isOGArticle = false;
142 std::string url, innerText, textContent, innerHTML; 156 std::string url, innerText, textContent, innerHTML;
143 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; 157 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
144 158
145 if (!(dict->GetBoolean("opengraph", &isOGArticle) && 159 if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
(...skipping 10 matching lines...) Expand all
156 GURL parsed_url(url); 170 GURL parsed_url(url);
157 if (!parsed_url.is_valid()) { 171 if (!parsed_url.is_valid()) {
158 return std::vector<double>(); 172 return std::vector<double>();
159 } 173 }
160 174
161 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, 175 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
162 numAnchors, numForms, innerText, textContent, 176 numAnchors, numForms, innerText, textContent,
163 innerHTML); 177 innerHTML);
164 } 178 }
165 } 179 }
OLD | NEW
« no previous file with comments | « components/dom_distiller/core/page_features.h ('k') | components/dom_distiller/core/page_features_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698