| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/dom_distiller/content/renderer/distillability_agent.h" | 5 #include "components/dom_distiller/content/renderer/distillability_agent.h" |
| 6 | 6 |
| 7 #include "base/metrics/histogram_macros.h" | 7 #include "base/metrics/histogram_macros.h" |
| 8 #include "base/strings/string_util.h" | 8 #include "base/strings/string_util.h" |
| 9 #include "components/dom_distiller/content/common/distillability_service.mojom.h" | 9 #include "components/dom_distiller/content/common/distillability_service.mojom.h" |
| 10 #include "components/dom_distiller/core/distillable_page_detector.h" | 10 #include "components/dom_distiller/core/distillable_page_detector.h" |
| (...skipping 23 matching lines...) |
| 34 BLACKLISTED, | 34 BLACKLISTED, |
| 35 TOO_SHORT, | 35 TOO_SHORT, |
| 36 NOT_REJECTED, | 36 NOT_REJECTED, |
| 37 REJECTION_BUCKET_BOUNDARY | 37 REJECTION_BUCKET_BOUNDARY |
| 38 }; | 38 }; |
| 39 | 39 |
| 40 // Returns whether it is necessary to send updates back to the browser. | 40 // Returns whether it is necessary to send updates back to the browser. |
| 41 // The number of updates can be from 0 to 2. See the tests in | 41 // The number of updates can be from 0 to 2. See the tests in |
| 42 // "distillable_page_utils_browsertest.cc". | 42 // "distillable_page_utils_browsertest.cc". |
| 43 // Most heuristics types only require one update after parsing. | 43 // Most heuristics types only require one update after parsing. |
| 44 // Adaboost is the only one doing the second update, which is after loading. | 44 // Adaboost-based heuristics are the only ones doing the second update, |
| | 45 // which is after loading. |
| 45 bool NeedToUpdate(bool is_loaded) { | 46 bool NeedToUpdate(bool is_loaded) { |
| 46 switch (GetDistillerHeuristicsType()) { | 47 switch (GetDistillerHeuristicsType()) { |
| 47 case DistillerHeuristicsType::ALWAYS_TRUE: | 48 case DistillerHeuristicsType::ALWAYS_TRUE: |
| 48 return !is_loaded; | 49 return !is_loaded; |
| 49 case DistillerHeuristicsType::OG_ARTICLE: | 50 case DistillerHeuristicsType::OG_ARTICLE: |
| 50 return !is_loaded; | 51 return !is_loaded; |
| 51 case DistillerHeuristicsType::ADABOOST_MODEL: | 52 case DistillerHeuristicsType::ADABOOST_MODEL: |
| | 53 case DistillerHeuristicsType::ALL_ARTICLES: |
| 52 return true; | 54 return true; |
| 53 case DistillerHeuristicsType::NONE: | 55 case DistillerHeuristicsType::NONE: |
| 54 default: | 56 default: |
| 55 return false; | 57 return false; |
| 56 } | 58 } |
| 57 } | 59 } |
| 58 | 60 |
| 59 // Returns whether this update is the last one for the page. | 61 // Returns whether this update is the last one for the page. |
| 60 bool IsLast(bool is_loaded) { | 62 bool IsLast(bool is_loaded) { |
| 61 if (GetDistillerHeuristicsType() == DistillerHeuristicsType::ADABOOST_MODEL) | 63 if (GetDistillerHeuristicsType() == DistillerHeuristicsType::ADABOOST_MODEL || |
| | 64 GetDistillerHeuristicsType() == DistillerHeuristicsType::ALL_ARTICLES) |
| 62 return is_loaded; | 65 return is_loaded; |
| 63 | 66 |
| 64 return true; | 67 return true; |
| 65 } | 68 } |
| 66 | 69 |
| 67 bool IsBlacklisted(const GURL& url) { | 70 bool IsBlacklisted(const GURL& url) { |
| 68 for (size_t i = 0; i < arraysize(kBlacklist); ++i) { | 71 for (size_t i = 0; i < arraysize(kBlacklist); ++i) { |
| 69 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) { | 72 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) { |
| 70 return true; | 73 return true; |
| 71 } | 74 } |
| 72 } | 75 } |
| 73 return false; | 76 return false; |
| 74 } | 77 } |
| 75 | 78 |
| 76 bool IsDistillablePageAdaboost(WebDocument& doc, | 79 bool IsDistillablePageAdaboost(WebDocument& doc, |
| 77 const DistillablePageDetector* detector, | 80 const DistillablePageDetector* detector, |
| 78 const DistillablePageDetector* long_page, | 81 const DistillablePageDetector* long_page, |
| 79 bool is_last) { | 82 bool is_last, |
| | 83 bool exclude_mobile) { |
| 80 WebDistillabilityFeatures features = doc.DistillabilityFeatures(); | 84 WebDistillabilityFeatures features = doc.DistillabilityFeatures(); |
| 81 GURL parsed_url(doc.Url()); | 85 GURL parsed_url(doc.Url()); |
| 82 if (!parsed_url.is_valid()) { | 86 if (!parsed_url.is_valid()) { |
| 83 return false; | 87 return false; |
| 84 } | 88 } |
| 85 std::vector<double> derived = CalculateDerivedFeatures( | 89 std::vector<double> derived = CalculateDerivedFeatures( |
| 86 features.open_graph, parsed_url, features.element_count, | 90 features.open_graph, parsed_url, features.element_count, |
| 87 features.anchor_count, features.form_count, features.moz_score, | 91 features.anchor_count, features.form_count, features.moz_score, |
| 88 features.moz_score_all_sqrt, features.moz_score_all_linear); | 92 features.moz_score_all_sqrt, features.moz_score_all_linear); |
| 89 double score = detector->Score(derived) - detector->GetThreshold(); | 93 double score = detector->Score(derived) - detector->GetThreshold(); |
| (...skipping 48 matching lines...) |
| 138 TOO_SHORT, REJECTION_BUCKET_BOUNDARY); | 142 TOO_SHORT, REJECTION_BUCKET_BOUNDARY); |
| 139 } else { | 143 } else { |
| 140 UMA_HISTOGRAM_ENUMERATION("DomDistiller.DistillabilityRejection", | 144 UMA_HISTOGRAM_ENUMERATION("DomDistiller.DistillabilityRejection", |
| 141 NOT_REJECTED, REJECTION_BUCKET_BOUNDARY); | 145 NOT_REJECTED, REJECTION_BUCKET_BOUNDARY); |
| 142 } | 146 } |
| 143 } | 147 } |
| 144 | 148 |
| 145 if (blacklisted) { | 149 if (blacklisted) { |
| 146 return false; | 150 return false; |
| 147 } | 151 } |
| 148 if (features.is_mobile_friendly) { | 152 if (exclude_mobile && features.is_mobile_friendly) { |
| 149 return false; | 153 return false; |
| 150 } | 154 } |
| 151 return distillable && long_article; | 155 return distillable && long_article; |
| 152 } | 156 } |
| 153 | 157 |
| 154 bool IsDistillablePage(WebDocument& doc, bool is_last) { | 158 bool IsDistillablePage(WebDocument& doc, bool is_last) { |
| 155 switch (GetDistillerHeuristicsType()) { | 159 switch (GetDistillerHeuristicsType()) { |
| 156 case DistillerHeuristicsType::ALWAYS_TRUE: | 160 case DistillerHeuristicsType::ALWAYS_TRUE: |
| 157 return true; | 161 return true; |
| 158 case DistillerHeuristicsType::OG_ARTICLE: | 162 case DistillerHeuristicsType::OG_ARTICLE: |
| 159 return doc.DistillabilityFeatures().open_graph; | 163 return doc.DistillabilityFeatures().open_graph; |
| 160 case DistillerHeuristicsType::ADABOOST_MODEL: | 164 case DistillerHeuristicsType::ADABOOST_MODEL: |
| 161 return IsDistillablePageAdaboost(doc, | 165 return IsDistillablePageAdaboost( |
| 162 DistillablePageDetector::GetNewModel(), | 166 doc, DistillablePageDetector::GetNewModel(), |
| 163 DistillablePageDetector::GetLongPageModel(), is_last); | 167 DistillablePageDetector::GetLongPageModel(), is_last, true); |
| | 168 case DistillerHeuristicsType::ALL_ARTICLES: |
| | 169 return IsDistillablePageAdaboost( |
| | 170 doc, DistillablePageDetector::GetNewModel(), |
| | 171 DistillablePageDetector::GetLongPageModel(), is_last, false); |
| 164 case DistillerHeuristicsType::NONE: | 172 case DistillerHeuristicsType::NONE: |
| 165 default: | 173 default: |
| 166 return false; | 174 return false; |
| 167 } | 175 } |
| 168 } | 176 } |
| 169 | 177 |
| 170 } // namespace | 178 } // namespace |
| 171 | 179 |
| 172 DistillabilityAgent::DistillabilityAgent( | 180 DistillabilityAgent::DistillabilityAgent( |
| 173 content::RenderFrame* render_frame) | 181 content::RenderFrame* render_frame) |
| (...skipping 30 matching lines...) |
| 204 IsDistillablePage(doc, is_last), is_last); | 212 IsDistillablePage(doc, is_last), is_last); |
| 205 } | 213 } |
| 206 | 214 |
| 207 DistillabilityAgent::~DistillabilityAgent() {} | 215 DistillabilityAgent::~DistillabilityAgent() {} |
| 208 | 216 |
| 209 void DistillabilityAgent::OnDestruct() { | 217 void DistillabilityAgent::OnDestruct() { |
| 210 delete this; | 218 delete this; |
| 211 } | 219 } |
| 212 | 220 |
| 213 } // namespace dom_distiller | 221 } // namespace dom_distiller |
| OLD | NEW |