Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(56)

Side by Side Diff: components/dom_distiller/content/renderer/distillability_agent.cc

Issue 1972503002: Add flag "distillability-dev" for distillability development Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: fix tests Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/metrics/histogram.h" 5 #include "base/metrics/histogram.h"
6 #include "base/strings/string_number_conversions.h"
6 #include "base/strings/string_util.h" 7 #include "base/strings/string_util.h"
7 8
8 #include "components/dom_distiller/content/common/distillability_service.mojom.h" 9 #include "components/dom_distiller/content/common/distillability_service.mojom.h"
9 #include "components/dom_distiller/content/renderer/distillability_agent.h" 10 #include "components/dom_distiller/content/renderer/distillability_agent.h"
10 #include "components/dom_distiller/core/distillable_page_detector.h" 11 #include "components/dom_distiller/core/distillable_page_detector.h"
12 #include "components/dom_distiller/core/dom_distiller_features.h"
11 #include "components/dom_distiller/core/experiments.h" 13 #include "components/dom_distiller/core/experiments.h"
12 #include "components/dom_distiller/core/page_features.h" 14 #include "components/dom_distiller/core/page_features.h"
13 #include "components/dom_distiller/core/url_utils.h" 15 #include "components/dom_distiller/core/url_utils.h"
14 #include "content/public/common/service_registry.h" 16 #include "content/public/common/service_registry.h"
15 #include "content/public/renderer/render_frame.h" 17 #include "content/public/renderer/render_frame.h"
16 18
17 #include "third_party/WebKit/public/platform/WebDistillability.h" 19 #include "third_party/WebKit/public/platform/WebDistillability.h"
18 #include "third_party/WebKit/public/web/WebDocument.h" 20 #include "third_party/WebKit/public/web/WebDocument.h"
19 #include "third_party/WebKit/public/web/WebElement.h" 21 #include "third_party/WebKit/public/web/WebElement.h"
20 #include "third_party/WebKit/public/web/WebLocalFrame.h" 22 #include "third_party/WebKit/public/web/WebLocalFrame.h"
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
58 60
59 bool IsBlacklisted(const GURL& url) { 61 bool IsBlacklisted(const GURL& url) {
60 for (size_t i = 0; i < arraysize(kBlacklist); ++i) { 62 for (size_t i = 0; i < arraysize(kBlacklist); ++i) {
61 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) { 63 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) {
62 return true; 64 return true;
63 } 65 }
64 } 66 }
65 return false; 67 return false;
66 } 68 }
67 69
70 std::string DoubleToString(double v) {
71 // base::DoubleToString() returns format like ".1", which is invalid
72 // in python json parser.
73 std::stringstream ss;
esprehn 2016/05/23 20:19:42 this is terrible, can we get an argument added to
wychen 2016/05/23 21:40:24 JSON writer takes care of this.
74 ss << v;
75 return ss.str();
76 }
77
78 void dumpDistillability(content::RenderFrame* render_frame,
esprehn 2016/05/23 20:19:42 Dump I think? your name is wrong
wychen 2016/05/23 21:40:24 Done.
79 const std::vector<double>& derived,
80 double score,
81 bool distillable,
82 double long_score,
83 bool long_page,
84 bool blacklisted) {
85 std::string msg = "adaboost_classification = {\n";
86 msg += "\"derived_features\": [";
87 for (unsigned i = 0; i < derived.size(); i++) {
88 if (i != 0) {
89 msg += ", ";
90 }
91 msg += DoubleToString(derived[i]);
92 }
93 msg += "],\n";
94 msg += "\"score\": " + DoubleToString(score) + ",\n";
95 msg += "\"distillable\": " + base::IntToString(distillable) + ",\n";
96 msg += "\"long_score\": " + DoubleToString(long_score) + ",\n";
97 msg += "\"long_page\": " + base::IntToString(long_page) + ",\n";
esprehn 2016/05/23 20:19:42 Can you use base::Value and base::JSONWriter::Writ
wychen 2016/05/23 21:40:24 Awesome idea! Done. The key order is sorted though
98 msg += "\"blacklisted\": " + base::IntToString(blacklisted) + "\n";
99 msg += "}";
100 render_frame->AddMessageToConsole(content::CONSOLE_MESSAGE_LEVEL_DEBUG, msg);
101 }
102
68 bool IsDistillablePageAdaboost(WebDocument& doc, 103 bool IsDistillablePageAdaboost(WebDocument& doc,
69 const DistillablePageDetector* detector, 104 const DistillablePageDetector* detector,
70 const DistillablePageDetector* long_page, 105 const DistillablePageDetector* long_page,
71 bool is_last) { 106 bool is_last,
72 WebDistillabilityFeatures features = doc.distillabilityFeatures(); 107 content::RenderFrame* render_frame) {
108 bool isDevMode = IsDistillabilityDevSet();
109 WebDistillabilityFeatures features = doc.distillabilityFeatures(isDevMode);
73 GURL parsed_url(doc.url()); 110 GURL parsed_url(doc.url());
74 if (!parsed_url.is_valid()) { 111 if (!parsed_url.is_valid()) {
75 return false; 112 return false;
76 } 113 }
77 std::vector<double> derived = CalculateDerivedFeatures( 114 std::vector<double> derived = CalculateDerivedFeatures(
78 features.openGraph, 115 features.openGraph,
79 parsed_url, 116 parsed_url,
80 features.elementCount, 117 features.elementCount,
81 features.anchorCount, 118 features.anchorCount,
82 features.formCount, 119 features.formCount,
83 features.mozScore, 120 features.mozScore,
84 features.mozScoreAllSqrt, 121 features.mozScoreAllSqrt,
85 features.mozScoreAllLinear 122 features.mozScoreAllLinear
86 ); 123 );
87 bool distillable = detector->Classify(derived); 124 double score = detector->Score(derived) - detector->GetThreshold();
88 bool long_article = long_page->Classify(derived); 125 double long_score = long_page->Score(derived) - long_page->GetThreshold();
126 bool distillable = score > 0;
127 bool long_article = long_score > 0;
89 bool blacklisted = IsBlacklisted(parsed_url); 128 bool blacklisted = IsBlacklisted(parsed_url);
90 129
130 if (isDevMode) {
131 dumpDistillability(render_frame, derived, score, distillable, long_score,
132 long_article, blacklisted);
133 }
134
91 int bucket = static_cast<unsigned>(features.isMobileFriendly) | 135 int bucket = static_cast<unsigned>(features.isMobileFriendly) |
92 (static_cast<unsigned>(distillable) << 1); 136 (static_cast<unsigned>(distillable) << 1);
93 if (is_last) { 137 if (is_last) {
94 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterLoading", 138 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterLoading",
95 bucket, 4); 139 bucket, 4);
96 } else { 140 } else {
97 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterParsing", 141 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterParsing",
98 bucket, 4); 142 bucket, 4);
99 } 143 }
100 144
101 if (blacklisted) { 145 if (blacklisted) {
102 return false; 146 return false;
103 } 147 }
104 if (features.isMobileFriendly) { 148 if (features.isMobileFriendly) {
105 return false; 149 return false;
106 } 150 }
107 return distillable && long_article; 151 return distillable && long_article;
108 } 152 }
109 153
110 bool IsDistillablePage(WebDocument& doc, bool is_last) { 154 bool IsDistillablePage(WebDocument& doc, bool is_last,
155 content::RenderFrame* render_frame) {
111 switch (GetDistillerHeuristicsType()) { 156 switch (GetDistillerHeuristicsType()) {
112 case DistillerHeuristicsType::ALWAYS_TRUE: 157 case DistillerHeuristicsType::ALWAYS_TRUE:
113 return true; 158 return true;
114 case DistillerHeuristicsType::OG_ARTICLE: 159 case DistillerHeuristicsType::OG_ARTICLE:
115 return doc.distillabilityFeatures().openGraph; 160 return doc.distillabilityFeatures(false).openGraph;
116 case DistillerHeuristicsType::ADABOOST_MODEL: 161 case DistillerHeuristicsType::ADABOOST_MODEL:
117 return IsDistillablePageAdaboost(doc, 162 return IsDistillablePageAdaboost(doc,
118 DistillablePageDetector::GetNewModel(), 163 DistillablePageDetector::GetNewModel(),
119 DistillablePageDetector::GetLongPageModel(), is_last); 164 DistillablePageDetector::GetLongPageModel(), is_last, render_frame);
120 case DistillerHeuristicsType::NONE: 165 case DistillerHeuristicsType::NONE:
121 default: 166 default:
122 return false; 167 return false;
123 } 168 }
124 } 169 }
125 170
126 } // namespace 171 } // namespace
127 172
128 DistillabilityAgent::DistillabilityAgent( 173 DistillabilityAgent::DistillabilityAgent(
129 content::RenderFrame* render_frame) 174 content::RenderFrame* render_frame)
(...skipping 17 matching lines...) Expand all
147 bool is_loaded = layout_type == WebMeaningfulLayout::FinishedLoading; 192 bool is_loaded = layout_type == WebMeaningfulLayout::FinishedLoading;
148 if (!NeedToUpdate(is_loaded)) return; 193 if (!NeedToUpdate(is_loaded)) return;
149 194
150 bool is_last = IsLast(is_loaded); 195 bool is_last = IsLast(is_loaded);
151 // Connect to Mojo service on browser to notify page distillability. 196 // Connect to Mojo service on browser to notify page distillability.
152 DistillabilityServicePtr distillability_service; 197 DistillabilityServicePtr distillability_service;
153 render_frame()->GetServiceRegistry()->ConnectToRemoteService( 198 render_frame()->GetServiceRegistry()->ConnectToRemoteService(
154 mojo::GetProxy(&distillability_service)); 199 mojo::GetProxy(&distillability_service));
155 DCHECK(distillability_service); 200 DCHECK(distillability_service);
156 distillability_service->NotifyIsDistillable( 201 distillability_service->NotifyIsDistillable(
157 IsDistillablePage(doc, is_last), is_last); 202 IsDistillablePage(doc, is_last, render_frame()), is_last);
158 } 203 }
159 204
160 DistillabilityAgent::~DistillabilityAgent() {} 205 DistillabilityAgent::~DistillabilityAgent() {}
161 206
162 } // namespace dom_distiller 207 } // namespace dom_distiller
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698