Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "base/metrics/histogram.h" | 5 #include "base/metrics/histogram.h" |
| 6 #include "base/strings/string_number_conversions.h" | |
| 6 #include "base/strings/string_util.h" | 7 #include "base/strings/string_util.h" |
| 7 | 8 |
| 8 #include "components/dom_distiller/content/common/distillability_service.mojom.h " | 9 #include "components/dom_distiller/content/common/distillability_service.mojom.h " |
| 9 #include "components/dom_distiller/content/renderer/distillability_agent.h" | 10 #include "components/dom_distiller/content/renderer/distillability_agent.h" |
| 10 #include "components/dom_distiller/core/distillable_page_detector.h" | 11 #include "components/dom_distiller/core/distillable_page_detector.h" |
| 12 #include "components/dom_distiller/core/dom_distiller_features.h" | |
| 11 #include "components/dom_distiller/core/experiments.h" | 13 #include "components/dom_distiller/core/experiments.h" |
| 12 #include "components/dom_distiller/core/page_features.h" | 14 #include "components/dom_distiller/core/page_features.h" |
| 13 #include "components/dom_distiller/core/url_utils.h" | 15 #include "components/dom_distiller/core/url_utils.h" |
| 14 #include "content/public/common/service_registry.h" | 16 #include "content/public/common/service_registry.h" |
| 15 #include "content/public/renderer/render_frame.h" | 17 #include "content/public/renderer/render_frame.h" |
| 16 | 18 |
| 17 #include "third_party/WebKit/public/platform/WebDistillability.h" | 19 #include "third_party/WebKit/public/platform/WebDistillability.h" |
| 18 #include "third_party/WebKit/public/web/WebDocument.h" | 20 #include "third_party/WebKit/public/web/WebDocument.h" |
| 19 #include "third_party/WebKit/public/web/WebElement.h" | 21 #include "third_party/WebKit/public/web/WebElement.h" |
| 20 #include "third_party/WebKit/public/web/WebLocalFrame.h" | 22 #include "third_party/WebKit/public/web/WebLocalFrame.h" |
| (...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading... | |
| 58 | 60 |
| 59 bool IsBlacklisted(const GURL& url) { | 61 bool IsBlacklisted(const GURL& url) { |
| 60 for (size_t i = 0; i < arraysize(kBlacklist); ++i) { | 62 for (size_t i = 0; i < arraysize(kBlacklist); ++i) { |
| 61 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) { | 63 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) { |
| 62 return true; | 64 return true; |
| 63 } | 65 } |
| 64 } | 66 } |
| 65 return false; | 67 return false; |
| 66 } | 68 } |
| 67 | 69 |
| 70 std::string DoubleToString(double v) { | |
| 71 // base::DoubleToString() returns format like ".1", which is invalid | |
| 72 // in python json parser. | |
| 73 std::stringstream ss; | |
|
esprehn
2016/05/23 20:19:42
This is terrible; can we get an argument added to … [comment truncated in this view]
wychen
2016/05/23 21:40:24
JSON writer takes care of this.
| |
| 74 ss << v; | |
| 75 return ss.str(); | |
| 76 } | |
| 77 | |
| 78 void dumpDistillability(content::RenderFrame* render_frame, | |
|
esprehn
2016/05/23 20:19:42
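I think this should be `DumpDistillability`: the function name is wrong, since the other functions here (`IsBlacklisted`, `DoubleToString`, `IsDistillablePage`) start with an uppercase letter.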
wychen
2016/05/23 21:40:24
Done.
| |
| 79 const std::vector<double>& derived, | |
| 80 double score, | |
| 81 bool distillable, | |
| 82 double long_score, | |
| 83 bool long_page, | |
| 84 bool blacklisted) { | |
| 85 std::string msg = "adaboost_classification = {\n"; | |
| 86 msg += "\"derived_features\": ["; | |
| 87 for (unsigned i = 0; i < derived.size(); i++) { | |
| 88 if (i != 0) { | |
| 89 msg += ", "; | |
| 90 } | |
| 91 msg += DoubleToString(derived[i]); | |
| 92 } | |
| 93 msg += "],\n"; | |
| 94 msg += "\"score\": " + DoubleToString(score) + ",\n"; | |
| 95 msg += "\"distillable\": " + base::IntToString(distillable) + ",\n"; | |
| 96 msg += "\"long_score\": " + DoubleToString(long_score) + ",\n"; | |
| 97 msg += "\"long_page\": " + base::IntToString(long_page) + ",\n"; | |
|
esprehn
2016/05/23 20:19:42
Can you use base::Value and base::JSONWriter::Write … [comment truncated in this view]
wychen
2016/05/23 21:40:24
Awesome idea! Done. The key order is sorted, though … [comment truncated in this view]
| |
| 98 msg += "\"blacklisted\": " + base::IntToString(blacklisted) + "\n"; | |
| 99 msg += "}"; | |
| 100 render_frame->AddMessageToConsole(content::CONSOLE_MESSAGE_LEVEL_DEBUG, msg); | |
| 101 } | |
| 102 | |
| 68 bool IsDistillablePageAdaboost(WebDocument& doc, | 103 bool IsDistillablePageAdaboost(WebDocument& doc, |
| 69 const DistillablePageDetector* detector, | 104 const DistillablePageDetector* detector, |
| 70 const DistillablePageDetector* long_page, | 105 const DistillablePageDetector* long_page, |
| 71 bool is_last) { | 106 bool is_last, |
| 72 WebDistillabilityFeatures features = doc.distillabilityFeatures(); | 107 content::RenderFrame* render_frame) { |
| 108 bool isDevMode = IsDistillabilityDevSet(); | |
| 109 WebDistillabilityFeatures features = doc.distillabilityFeatures(isDevMode); | |
| 73 GURL parsed_url(doc.url()); | 110 GURL parsed_url(doc.url()); |
| 74 if (!parsed_url.is_valid()) { | 111 if (!parsed_url.is_valid()) { |
| 75 return false; | 112 return false; |
| 76 } | 113 } |
| 77 std::vector<double> derived = CalculateDerivedFeatures( | 114 std::vector<double> derived = CalculateDerivedFeatures( |
| 78 features.openGraph, | 115 features.openGraph, |
| 79 parsed_url, | 116 parsed_url, |
| 80 features.elementCount, | 117 features.elementCount, |
| 81 features.anchorCount, | 118 features.anchorCount, |
| 82 features.formCount, | 119 features.formCount, |
| 83 features.mozScore, | 120 features.mozScore, |
| 84 features.mozScoreAllSqrt, | 121 features.mozScoreAllSqrt, |
| 85 features.mozScoreAllLinear | 122 features.mozScoreAllLinear |
| 86 ); | 123 ); |
| 87 bool distillable = detector->Classify(derived); | 124 double score = detector->Score(derived) - detector->GetThreshold(); |
| 88 bool long_article = long_page->Classify(derived); | 125 double long_score = long_page->Score(derived) - long_page->GetThreshold(); |
| 126 bool distillable = score > 0; | |
| 127 bool long_article = long_score > 0; | |
| 89 bool blacklisted = IsBlacklisted(parsed_url); | 128 bool blacklisted = IsBlacklisted(parsed_url); |
| 90 | 129 |
| 130 if (isDevMode) { | |
| 131 dumpDistillability(render_frame, derived, score, distillable, long_score, | |
| 132 long_article, blacklisted); | |
| 133 } | |
| 134 | |
| 91 int bucket = static_cast<unsigned>(features.isMobileFriendly) | | 135 int bucket = static_cast<unsigned>(features.isMobileFriendly) | |
| 92 (static_cast<unsigned>(distillable) << 1); | 136 (static_cast<unsigned>(distillable) << 1); |
| 93 if (is_last) { | 137 if (is_last) { |
| 94 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterLoading", | 138 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterLoading", |
| 95 bucket, 4); | 139 bucket, 4); |
| 96 } else { | 140 } else { |
| 97 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterParsing", | 141 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterParsing", |
| 98 bucket, 4); | 142 bucket, 4); |
| 99 } | 143 } |
| 100 | 144 |
| 101 if (blacklisted) { | 145 if (blacklisted) { |
| 102 return false; | 146 return false; |
| 103 } | 147 } |
| 104 if (features.isMobileFriendly) { | 148 if (features.isMobileFriendly) { |
| 105 return false; | 149 return false; |
| 106 } | 150 } |
| 107 return distillable && long_article; | 151 return distillable && long_article; |
| 108 } | 152 } |
| 109 | 153 |
| 110 bool IsDistillablePage(WebDocument& doc, bool is_last) { | 154 bool IsDistillablePage(WebDocument& doc, bool is_last, |
| 155 content::RenderFrame* render_frame) { | |
| 111 switch (GetDistillerHeuristicsType()) { | 156 switch (GetDistillerHeuristicsType()) { |
| 112 case DistillerHeuristicsType::ALWAYS_TRUE: | 157 case DistillerHeuristicsType::ALWAYS_TRUE: |
| 113 return true; | 158 return true; |
| 114 case DistillerHeuristicsType::OG_ARTICLE: | 159 case DistillerHeuristicsType::OG_ARTICLE: |
| 115 return doc.distillabilityFeatures().openGraph; | 160 return doc.distillabilityFeatures(false).openGraph; |
| 116 case DistillerHeuristicsType::ADABOOST_MODEL: | 161 case DistillerHeuristicsType::ADABOOST_MODEL: |
| 117 return IsDistillablePageAdaboost(doc, | 162 return IsDistillablePageAdaboost(doc, |
| 118 DistillablePageDetector::GetNewModel(), | 163 DistillablePageDetector::GetNewModel(), |
| 119 DistillablePageDetector::GetLongPageModel(), is_last); | 164 DistillablePageDetector::GetLongPageModel(), is_last, render_frame); |
| 120 case DistillerHeuristicsType::NONE: | 165 case DistillerHeuristicsType::NONE: |
| 121 default: | 166 default: |
| 122 return false; | 167 return false; |
| 123 } | 168 } |
| 124 } | 169 } |
| 125 | 170 |
| 126 } // namespace | 171 } // namespace |
| 127 | 172 |
| 128 DistillabilityAgent::DistillabilityAgent( | 173 DistillabilityAgent::DistillabilityAgent( |
| 129 content::RenderFrame* render_frame) | 174 content::RenderFrame* render_frame) |
| (...skipping 17 matching lines...) Expand all Loading... | |
| 147 bool is_loaded = layout_type == WebMeaningfulLayout::FinishedLoading; | 192 bool is_loaded = layout_type == WebMeaningfulLayout::FinishedLoading; |
| 148 if (!NeedToUpdate(is_loaded)) return; | 193 if (!NeedToUpdate(is_loaded)) return; |
| 149 | 194 |
| 150 bool is_last = IsLast(is_loaded); | 195 bool is_last = IsLast(is_loaded); |
| 151 // Connect to Mojo service on browser to notify page distillability. | 196 // Connect to Mojo service on browser to notify page distillability. |
| 152 DistillabilityServicePtr distillability_service; | 197 DistillabilityServicePtr distillability_service; |
| 153 render_frame()->GetServiceRegistry()->ConnectToRemoteService( | 198 render_frame()->GetServiceRegistry()->ConnectToRemoteService( |
| 154 mojo::GetProxy(&distillability_service)); | 199 mojo::GetProxy(&distillability_service)); |
| 155 DCHECK(distillability_service); | 200 DCHECK(distillability_service); |
| 156 distillability_service->NotifyIsDistillable( | 201 distillability_service->NotifyIsDistillable( |
| 157 IsDistillablePage(doc, is_last), is_last); | 202 IsDistillablePage(doc, is_last, render_frame()), is_last); |
| 158 } | 203 } |
| 159 | 204 |
| 160 DistillabilityAgent::~DistillabilityAgent() {} | 205 DistillabilityAgent::~DistillabilityAgent() {} |
| 161 | 206 |
| 162 } // namespace dom_distiller | 207 } // namespace dom_distiller |
| OLD | NEW |