Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(210)

Side by Side Diff: components/dom_distiller/core/distiller.cc

Issue 1903853002: Stop fetching the next page if the first page has no content (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@2704
Patch Set: Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | components/dom_distiller/core/distiller_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/distiller.h" 5 #include "components/dom_distiller/core/distiller.h"
6 6
7 #include <map> 7 #include <map>
8 #include <utility> 8 #include <utility>
9 #include <vector> 9 #include <vector>
10 10
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after
134 url)); 134 url));
135 } 135 }
136 } 136 }
137 137
138 void DistillerImpl::OnPageDistillationFinished( 138 void DistillerImpl::OnPageDistillationFinished(
139 int page_num, 139 int page_num,
140 const GURL& page_url, 140 const GURL& page_url,
141 scoped_ptr<proto::DomDistillerResult> distiller_result, 141 scoped_ptr<proto::DomDistillerResult> distiller_result,
142 bool distillation_successful) { 142 bool distillation_successful) {
143 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 143 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
144 if (distillation_successful) { 144 if (!distillation_successful) {
145 started_pages_index_.erase(page_num);
146 RunDistillerCallbackIfDone();
147 return;
148 }
145 149
146 if (distiller_result->has_statistics_info() && page_num == 0) { 150 if (distiller_result->has_statistics_info() && page_num == 0) {
147 if (distiller_result->statistics_info().has_word_count()) { 151 if (distiller_result->statistics_info().has_word_count()) {
148 UMA_HISTOGRAM_CUSTOM_COUNTS( 152 UMA_HISTOGRAM_CUSTOM_COUNTS(
149 "DomDistiller.Statistics.FirstPageWordCount", 153 "DomDistiller.Statistics.FirstPageWordCount",
150 distiller_result->statistics_info().word_count(), 154 distiller_result->statistics_info().word_count(),
151 1, 4000, 50); 155 1, 4000, 50);
156 }
157 }
158
159 DCHECK(distiller_result.get());
160 DistilledPageData* page_data =
161 GetPageAtIndex(started_pages_index_[page_num]);
162 page_data->distilled_page_proto =
163 new base::RefCountedData<DistilledPageProto>();
164 page_data->page_num = page_num;
165 if (distiller_result->has_title()) {
166 page_data->distilled_page_proto->data.set_title(
167 distiller_result->title());
168 }
169 page_data->distilled_page_proto->data.set_url(page_url.spec());
170 bool content_empty = true;
171 if (distiller_result->has_distilled_content() &&
172 distiller_result->distilled_content().has_html()) {
173 page_data->distilled_page_proto->data.set_html(
174 distiller_result->distilled_content().html());
175 if (!distiller_result->distilled_content().html().empty()) {
176 content_empty = false;
177 }
178 }
179
180 if (distiller_result->has_timing_info()) {
181 const proto::TimingInfo& distiller_timing_info =
182 distiller_result->timing_info();
183 DistilledPageProto::TimingInfo timing_info;
184 if (distiller_timing_info.has_markup_parsing_time()) {
185 timing_info.set_name("markup_parsing");
186 timing_info.set_time(distiller_timing_info.markup_parsing_time());
187 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
188 }
189
190 if (distiller_timing_info.has_document_construction_time()) {
191 timing_info.set_name("document_construction");
192 timing_info.set_time(
193 distiller_timing_info.document_construction_time());
194 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
195 }
196
197 if (distiller_timing_info.has_article_processing_time()) {
198 timing_info.set_name("article_processing");
199 timing_info.set_time(
200 distiller_timing_info.article_processing_time());
201 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
202 }
203
204 if (distiller_timing_info.has_formatting_time()) {
205 timing_info.set_name("formatting");
206 timing_info.set_time(
207 distiller_timing_info.formatting_time());
208 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
209 }
210
211 if (distiller_timing_info.has_total_time()) {
212 timing_info.set_name("total");
213 timing_info.set_time(
214 distiller_timing_info.total_time());
215 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
216 }
217
218 for (int i = 0; i < distiller_timing_info.other_times_size(); i++) {
219 timing_info.set_name(distiller_timing_info.other_times(i).name());
220 timing_info.set_time(distiller_timing_info.other_times(i).time());
221 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
222 }
223 }
224
225 if (distiller_result->has_debug_info() &&
226 distiller_result->debug_info().has_log()) {
227 page_data->distilled_page_proto->data.mutable_debug_info()->set_log(
228 distiller_result->debug_info().log());
229 }
230
231 if (distiller_result->has_text_direction()) {
232 page_data->distilled_page_proto->data.set_text_direction(
233 distiller_result->text_direction());
234 } else {
235 page_data->distilled_page_proto->data.set_text_direction("auto");
236 }
237
238 if (distiller_result->has_pagination_info()) {
239 const proto::PaginationInfo& pagination_info =
240 distiller_result->pagination_info();
241 // Skip the next page if the first page is empty.
242 if (pagination_info.has_next_page() &&
243 (page_num != 0 || !content_empty)) {
244 GURL next_page_url(pagination_info.next_page());
245 if (next_page_url.is_valid()) {
246 // The pages should be in same origin.
247 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
248 AddToDistillationQueue(page_num + 1, next_page_url);
249 page_data->distilled_page_proto->data.mutable_pagination_info()->
250 set_next_page(next_page_url.spec());
152 } 251 }
153 } 252 }
154 253
155 DCHECK(distiller_result.get()); 254 if (pagination_info.has_prev_page()) {
156 DistilledPageData* page_data = 255 GURL prev_page_url(pagination_info.prev_page());
157 GetPageAtIndex(started_pages_index_[page_num]); 256 if (prev_page_url.is_valid()) {
158 page_data->distilled_page_proto = 257 DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
159 new base::RefCountedData<DistilledPageProto>(); 258 AddToDistillationQueue(page_num - 1, prev_page_url);
160 page_data->page_num = page_num; 259 page_data->distilled_page_proto->data.mutable_pagination_info()->
161 if (distiller_result->has_title()) { 260 set_prev_page(prev_page_url.spec());
162 page_data->distilled_page_proto->data.set_title(
163 distiller_result->title());
164 }
165 page_data->distilled_page_proto->data.set_url(page_url.spec());
166 if (distiller_result->has_distilled_content() &&
167 distiller_result->distilled_content().has_html()) {
168 page_data->distilled_page_proto->data.set_html(
169 distiller_result->distilled_content().html());
170 }
171
172 if (distiller_result->has_timing_info()) {
173 const proto::TimingInfo& distiller_timing_info =
174 distiller_result->timing_info();
175 DistilledPageProto::TimingInfo timing_info;
176 if (distiller_timing_info.has_markup_parsing_time()) {
177 timing_info.set_name("markup_parsing");
178 timing_info.set_time(distiller_timing_info.markup_parsing_time());
179 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
180 }
181
182 if (distiller_timing_info.has_document_construction_time()) {
183 timing_info.set_name("document_construction");
184 timing_info.set_time(
185 distiller_timing_info.document_construction_time());
186 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
187 }
188
189 if (distiller_timing_info.has_article_processing_time()) {
190 timing_info.set_name("article_processing");
191 timing_info.set_time(
192 distiller_timing_info.article_processing_time());
193 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
194 }
195
196 if (distiller_timing_info.has_formatting_time()) {
197 timing_info.set_name("formatting");
198 timing_info.set_time(
199 distiller_timing_info.formatting_time());
200 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
201 }
202
203 if (distiller_timing_info.has_total_time()) {
204 timing_info.set_name("total");
205 timing_info.set_time(
206 distiller_timing_info.total_time());
207 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
208 }
209
210 for (int i = 0; i < distiller_timing_info.other_times_size(); i++) {
211 timing_info.set_name(distiller_timing_info.other_times(i).name());
212 timing_info.set_time(distiller_timing_info.other_times(i).time());
213 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
214 } 261 }
215 } 262 }
216 263
217 if (distiller_result->has_debug_info() && 264 if (pagination_info.has_canonical_page()) {
218 distiller_result->debug_info().has_log()) { 265 GURL canonical_page_url(pagination_info.canonical_page());
219 page_data->distilled_page_proto->data.mutable_debug_info()->set_log( 266 if (canonical_page_url.is_valid()) {
220 distiller_result->debug_info().log()); 267 page_data->distilled_page_proto->data.mutable_pagination_info()->
221 } 268 set_canonical_page(canonical_page_url.spec());
222
223 if (distiller_result->has_text_direction()) {
224 page_data->distilled_page_proto->data.set_text_direction(
225 distiller_result->text_direction());
226 } else {
227 page_data->distilled_page_proto->data.set_text_direction("auto");
228 }
229
230 if (distiller_result->has_pagination_info()) {
231 const proto::PaginationInfo& pagination_info =
232 distiller_result->pagination_info();
233 if (pagination_info.has_next_page()) {
234 GURL next_page_url(pagination_info.next_page());
235 if (next_page_url.is_valid()) {
236 // The pages should be in same origin.
237 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
238 AddToDistillationQueue(page_num + 1, next_page_url);
239 page_data->distilled_page_proto->data.mutable_pagination_info()->
240 set_next_page(next_page_url.spec());
241 }
242 }
243
244 if (pagination_info.has_prev_page()) {
245 GURL prev_page_url(pagination_info.prev_page());
246 if (prev_page_url.is_valid()) {
247 DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
248 AddToDistillationQueue(page_num - 1, prev_page_url);
249 page_data->distilled_page_proto->data.mutable_pagination_info()->
250 set_prev_page(prev_page_url.spec());
251 }
252 }
253
254 if (pagination_info.has_canonical_page()) {
255 GURL canonical_page_url(pagination_info.canonical_page());
256 if (canonical_page_url.is_valid()) {
257 page_data->distilled_page_proto->data.mutable_pagination_info()->
258 set_canonical_page(canonical_page_url.spec());
259 }
260 } 269 }
261 } 270 }
271 }
262 272
263 for (int img_num = 0; img_num < distiller_result->content_images_size(); 273 for (int img_num = 0; img_num < distiller_result->content_images_size();
264 ++img_num) { 274 ++img_num) {
265 std::string image_id = 275 std::string image_id =
266 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num); 276 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
267 FetchImage(page_num, image_id, 277 FetchImage(page_num, image_id,
268 distiller_result->content_images(img_num).url()); 278 distiller_result->content_images(img_num).url());
269 } 279 }
270 280
271 AddPageIfDone(page_num); 281 AddPageIfDone(page_num);
272 DistillNextPage(); 282 DistillNextPage();
273 } else {
274 started_pages_index_.erase(page_num);
275 RunDistillerCallbackIfDone();
276 }
277 } 283 }
278 284
279 void DistillerImpl::FetchImage(int page_num, 285 void DistillerImpl::FetchImage(int page_num,
280 const std::string& image_id, 286 const std::string& image_id,
281 const std::string& image_url) { 287 const std::string& image_url) {
282 if (!GURL(image_url).is_valid()) return; 288 if (!GURL(image_url).is_valid()) return;
283 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); 289 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
284 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); 290 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
285 DistillerURLFetcher* fetcher = 291 DistillerURLFetcher* fetcher =
286 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); 292 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after
388 DCHECK(finished_pages_index_.empty()); 394 DCHECK(finished_pages_index_.empty());
389 395
390 base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_, 396 base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
391 false); 397 false);
392 finished_cb_.Run(std::move(article_proto)); 398 finished_cb_.Run(std::move(article_proto));
393 finished_cb_.Reset(); 399 finished_cb_.Reset();
394 } 400 }
395 } 401 }
396 402
397 } // namespace dom_distiller 403 } // namespace dom_distiller
OLD | NEW
« no previous file with comments | « no previous file | components/dom_distiller/core/distiller_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698