OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/metrics/histogram.h" | 5 #include "base/metrics/histogram.h" |
6 | 6 |
7 #include "components/dom_distiller/content/common/distiller_messages.h" | 7 #include "components/dom_distiller/content/common/distiller_messages.h" |
8 #include "components/dom_distiller/content/renderer/distillability_agent.h" | 8 #include "components/dom_distiller/content/renderer/distillability_agent.h" |
9 #include "components/dom_distiller/core/distillable_page_detector.h" | 9 #include "components/dom_distiller/core/distillable_page_detector.h" |
10 #include "components/dom_distiller/core/experiments.h" | 10 #include "components/dom_distiller/core/experiments.h" |
11 #include "components/dom_distiller/core/page_features.h" | 11 #include "components/dom_distiller/core/page_features.h" |
12 #include "components/dom_distiller/core/url_utils.h" | 12 #include "components/dom_distiller/core/url_utils.h" |
13 #include "content/public/renderer/render_frame.h" | 13 #include "content/public/renderer/render_frame.h" |
14 | 14 |
15 #include "third_party/WebKit/public/platform/WebDistillability.h" | 15 #include "third_party/WebKit/public/platform/WebDistillability.h" |
16 #include "third_party/WebKit/public/web/WebDocument.h" | 16 #include "third_party/WebKit/public/web/WebDocument.h" |
17 #include "third_party/WebKit/public/web/WebElement.h" | 17 #include "third_party/WebKit/public/web/WebElement.h" |
18 #include "third_party/WebKit/public/web/WebLocalFrame.h" | 18 #include "third_party/WebKit/public/web/WebLocalFrame.h" |
19 | 19 |
20 namespace dom_distiller { | 20 namespace dom_distiller { |
21 | 21 |
22 using namespace blink; | 22 using namespace blink; |
23 | 23 |
24 namespace { | 24 namespace { |
25 | 25 |
// Hosts that are never reported as distillable by the Adaboost heuristic,
// even when the classifier accepts the page. IsBlacklisted() compares each
// entry against the page URL's host with base::LowerCaseEqualsASCII(), so
// entries should be written in lowercase ASCII.
const char* const kBlacklist[] = {
  "www.reddit.com"
};
| 29 |
26 // Returns whether it is necessary to send updates back to the browser. | 30 // Returns whether it is necessary to send updates back to the browser. |
27 // The number of updates can be from 0 to 2. See the tests in | 31 // The number of updates can be from 0 to 2. See the tests in |
28 // "distillable_page_utils_browsertest.cc". | 32 // "distillable_page_utils_browsertest.cc". |
29 // Most heuristics types only require one update after parsing. | 33 // Most heuristics types only require one update after parsing. |
30 // Adaboost is the only one doing the second update, which is after loading. | 34 // Adaboost is the only one doing the second update, which is after loading. |
31 bool NeedToUpdate(bool is_loaded) { | 35 bool NeedToUpdate(bool is_loaded) { |
32 switch (GetDistillerHeuristicsType()) { | 36 switch (GetDistillerHeuristicsType()) { |
33 case DistillerHeuristicsType::ALWAYS_TRUE: | 37 case DistillerHeuristicsType::ALWAYS_TRUE: |
34 return !is_loaded; | 38 return !is_loaded; |
35 case DistillerHeuristicsType::OG_ARTICLE: | 39 case DistillerHeuristicsType::OG_ARTICLE: |
36 return !is_loaded; | 40 return !is_loaded; |
37 case DistillerHeuristicsType::ADABOOST_MODEL: | 41 case DistillerHeuristicsType::ADABOOST_MODEL: |
38 return true; | 42 return true; |
39 case DistillerHeuristicsType::NONE: | 43 case DistillerHeuristicsType::NONE: |
40 default: | 44 default: |
41 return false; | 45 return false; |
42 } | 46 } |
43 } | 47 } |
44 | 48 |
45 // Returns whether this update is the last one for the page. | 49 // Returns whether this update is the last one for the page. |
46 bool IsLast(bool is_loaded) { | 50 bool IsLast(bool is_loaded) { |
47 if (GetDistillerHeuristicsType() == DistillerHeuristicsType::ADABOOST_MODEL) | 51 if (GetDistillerHeuristicsType() == DistillerHeuristicsType::ADABOOST_MODEL) |
48 return is_loaded; | 52 return is_loaded; |
49 | 53 |
50 return true; | 54 return true; |
51 } | 55 } |
52 | 56 |
| 57 bool IsBlacklisted(const GURL& url) { |
| 58 for (size_t i = 0; i < arraysize(kBlacklist); ++i) { |
| 59 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) { |
| 60 return true; |
| 61 } |
| 62 } |
| 63 return false; |
| 64 } |
| 65 |
53 bool IsDistillablePageAdaboost(WebDocument& doc, | 66 bool IsDistillablePageAdaboost(WebDocument& doc, |
54 const DistillablePageDetector* detector, | 67 const DistillablePageDetector* detector, |
55 bool is_last) { | 68 bool is_last) { |
56 WebDistillabilityFeatures features = doc.distillabilityFeatures(); | 69 WebDistillabilityFeatures features = doc.distillabilityFeatures(); |
57 GURL parsed_url(doc.url()); | 70 GURL parsed_url(doc.url()); |
58 if (!parsed_url.is_valid()) { | 71 if (!parsed_url.is_valid()) { |
59 return false; | 72 return false; |
60 } | 73 } |
61 bool distillable = detector->Classify(CalculateDerivedFeatures( | 74 bool distillable = detector->Classify(CalculateDerivedFeatures( |
62 features.openGraph, | 75 features.openGraph, |
63 parsed_url, | 76 parsed_url, |
64 features.elementCount, | 77 features.elementCount, |
65 features.anchorCount, | 78 features.anchorCount, |
66 features.formCount, | 79 features.formCount, |
67 features.mozScore, | 80 features.mozScore, |
68 features.mozScoreAllSqrt, | 81 features.mozScoreAllSqrt, |
69 features.mozScoreAllLinear | 82 features.mozScoreAllLinear |
70 )); | 83 )); |
| 84 bool blacklisted = IsBlacklisted(parsed_url); |
71 | 85 |
72 int bucket = static_cast<unsigned>(features.isMobileFriendly) | | 86 int bucket = static_cast<unsigned>(features.isMobileFriendly) | |
73 (static_cast<unsigned>(distillable) << 1); | 87 (static_cast<unsigned>(distillable) << 1); |
74 if (is_last) { | 88 if (is_last) { |
75 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterLoading", | 89 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterLoading", |
76 bucket, 4); | 90 bucket, 4); |
77 } else { | 91 } else { |
78 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterParsing", | 92 UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterParsing", |
79 bucket, 4); | 93 bucket, 4); |
80 } | 94 } |
81 return distillable && (!features.isMobileFriendly); | 95 |
| 96 return distillable && (!features.isMobileFriendly) && (!blacklisted); |
82 } | 97 } |
83 | 98 |
84 bool IsDistillablePage(WebDocument& doc, bool is_last) { | 99 bool IsDistillablePage(WebDocument& doc, bool is_last) { |
85 switch (GetDistillerHeuristicsType()) { | 100 switch (GetDistillerHeuristicsType()) { |
86 case DistillerHeuristicsType::ALWAYS_TRUE: | 101 case DistillerHeuristicsType::ALWAYS_TRUE: |
87 return true; | 102 return true; |
88 case DistillerHeuristicsType::OG_ARTICLE: | 103 case DistillerHeuristicsType::OG_ARTICLE: |
89 return doc.distillabilityFeatures().openGraph; | 104 return doc.distillabilityFeatures().openGraph; |
90 case DistillerHeuristicsType::ADABOOST_MODEL: | 105 case DistillerHeuristicsType::ADABOOST_MODEL: |
91 return IsDistillablePageAdaboost( | 106 return IsDistillablePageAdaboost( |
(...skipping 30 matching lines...) Expand all Loading... |
122 | 137 |
123 bool is_last = IsLast(is_loaded); | 138 bool is_last = IsLast(is_loaded); |
124 Send(new FrameHostMsg_Distillability(routing_id(), | 139 Send(new FrameHostMsg_Distillability(routing_id(), |
125 IsDistillablePage(doc, is_last), is_last)); | 140 IsDistillablePage(doc, is_last), is_last)); |
126 } | 141 } |
127 | 142 |
128 | 143 |
129 DistillabilityAgent::~DistillabilityAgent() {} | 144 DistillabilityAgent::~DistillabilityAgent() {} |
130 | 145 |
131 } // namespace dom_distiller | 146 } // namespace dom_distiller |
OLD | NEW |