Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1012)

Side by Side Diff: components/dom_distiller/core/distiller.cc

Issue 130543003: Store page no for distilled pages undergoing distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Address comments. Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/distiller.h" 5 #include "components/dom_distiller/core/distiller.h"
6 6
7 #include <map> 7 #include <map>
8 8
9 #include "base/bind.h" 9 #include "base/bind.h"
10 #include "base/callback.h" 10 #include "base/callback.h"
11 #include "base/location.h" 11 #include "base/location.h"
12 #include "base/message_loop/message_loop.h" 12 #include "base/message_loop/message_loop.h"
13 #include "base/strings/string_number_conversions.h" 13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/utf_string_conversions.h" 14 #include "base/strings/utf_string_conversions.h"
15 #include "base/values.h" 15 #include "base/values.h"
16 #include "components/dom_distiller/core/distiller_page.h" 16 #include "components/dom_distiller/core/distiller_page.h"
17 #include "components/dom_distiller/core/distiller_url_fetcher.h" 17 #include "components/dom_distiller/core/distiller_url_fetcher.h"
18 #include "components/dom_distiller/core/proto/distilled_article.pb.h" 18 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
19 #include "components/dom_distiller/core/proto/distilled_page.pb.h" 19 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
20 #include "net/url_request/url_request_context_getter.h" 20 #include "net/url_request/url_request_context_getter.h"
21 21
22 namespace { 22 namespace {
23 // Maximum number of distilled pages in an article. 23 // Maximum number of distilled pages in an article.
24 const int kMaxPagesInArticle = 32; 24 const size_t kMaxPagesInArticle = 32;
25 } 25 }
26 26
27 namespace dom_distiller { 27 namespace dom_distiller {
28 28
29 DistillerFactoryImpl::DistillerFactoryImpl( 29 DistillerFactoryImpl::DistillerFactoryImpl(
30 scoped_ptr<DistillerPageFactory> distiller_page_factory, 30 scoped_ptr<DistillerPageFactory> distiller_page_factory,
31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) 31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory)
32 : distiller_page_factory_(distiller_page_factory.Pass()), 32 : distiller_page_factory_(distiller_page_factory.Pass()),
33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} 33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {}
34 34
35 DistillerFactoryImpl::~DistillerFactoryImpl() {} 35 DistillerFactoryImpl::~DistillerFactoryImpl() {}
36 36
37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { 37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( 38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
39 *distiller_page_factory_, *distiller_url_fetcher_factory_)); 39 *distiller_page_factory_, *distiller_url_fetcher_factory_));
40 distiller->Init(); 40 distiller->Init();
41 return distiller.PassAs<Distiller>(); 41 return distiller.PassAs<Distiller>();
42 } 42 }
43 43
44 DistillerImpl::DistilledPageData::DistilledPageData() {}
45
46 DistillerImpl::DistilledPageData::~DistilledPageData() {}
47
44 DistillerImpl::DistillerImpl( 48 DistillerImpl::DistillerImpl(
45 const DistillerPageFactory& distiller_page_factory, 49 const DistillerPageFactory& distiller_page_factory,
46 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) 50 const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
47 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), 51 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
48 distillation_in_progress_(false) { 52 max_pages_in_article_(kMaxPagesInArticle) {
49 page_distiller_.reset(new PageDistiller(distiller_page_factory)); 53 page_distiller_.reset(new PageDistiller(distiller_page_factory));
50 } 54 }
51 55
52 DistillerImpl::~DistillerImpl() { 56 DistillerImpl::~DistillerImpl() { DCHECK(AreAllPagesFinished()); }
53 DCHECK(image_fetchers_.empty()); 57
54 DCHECK(!distillation_in_progress_); 58 void DistillerImpl::Init() {
59 DCHECK(AreAllPagesFinished());
60 page_distiller_->Init();
55 } 61 }
56 62
57 void DistillerImpl::Init() { 63 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
58 DCHECK(!distillation_in_progress_); 64 max_pages_in_article_ = max_num_pages;
59 page_distiller_->Init(); 65 }
60 article_proto_.reset(new DistilledArticleProto()); 66
67 bool DistillerImpl::AreAllPagesFinished() const {
68 return started_pages_index_.empty() && waiting_pages_.empty();
69 }
70
71 size_t DistillerImpl::TotalPageCount() const {
72 return waiting_pages_.size() + started_pages_index_.size() +
73 finished_pages_index_.size();
74 }
75
76 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
77 if (!IsPageNumberInUse(page_num) && url.is_valid() &&
78 TotalPageCount() < max_pages_in_article_ &&
79 seen_urls_.find(url.spec()) == seen_urls_.end()) {
80 waiting_pages_[page_num] = url;
81 }
82 }
83
84 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
85 return waiting_pages_.find(page_num) != waiting_pages_.end() ||
86 started_pages_index_.find(page_num) != started_pages_index_.end() ||
87 finished_pages_index_.find(page_num) != finished_pages_index_.end();
88 }
89
90 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
91 const {
92 DCHECK_LT(index, pages_.size());
93 DistilledPageData* page_data = pages_[index];
94 DCHECK(page_data);
95 return page_data;
61 } 96 }
62 97
63 void DistillerImpl::DistillPage(const GURL& url, 98 void DistillerImpl::DistillPage(const GURL& url,
64 const DistillerCallback& distillation_cb) { 99 const DistillerCallback& distillation_cb) {
65 DCHECK(!distillation_in_progress_); 100 DCHECK(AreAllPagesFinished());
66 distillation_cb_ = distillation_cb; 101 distillation_cb_ = distillation_cb;
67 DistillPage(url); 102
103 AddToDistillationQueue(0, url);
104 DistillNextPage();
68 } 105 }
69 106
70 void DistillerImpl::DistillPage(const GURL& url) { 107 void DistillerImpl::DistillNextPage() {
71 DCHECK(!distillation_in_progress_); 108 if (!waiting_pages_.empty()) {
72 if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && 109 std::map<int, GURL>::iterator front = waiting_pages_.begin();
73 processed_urls_.find(url.spec()) == processed_urls_.end()) { 110 int page_num = front->first;
74 distillation_in_progress_ = true; 111 const GURL url = front->second;
75 // Distill the next page. 112
113 waiting_pages_.erase(front);
76 DCHECK(url.is_valid()); 114 DCHECK(url.is_valid());
77 DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); 115 DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
116 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
117 seen_urls_.insert(url.spec());
118 pages_.push_back(new DistilledPageData());
119 started_pages_index_[page_num] = pages_.size() - 1;
78 page_distiller_->DistillPage( 120 page_distiller_->DistillPage(
79 url, 121 url,
80 base::Bind(&DistillerImpl::OnPageDistillationFinished, 122 base::Bind(&DistillerImpl::OnPageDistillationFinished,
81 base::Unretained(this), 123 base::Unretained(this),
124 page_num,
82 url)); 125 url));
83 } else {
84 RunDistillerCallbackIfDone();
85 } 126 }
86 } 127 }
87 128
88 void DistillerImpl::OnPageDistillationFinished( 129 void DistillerImpl::OnPageDistillationFinished(
130 int page_num,
89 const GURL& page_url, 131 const GURL& page_url,
90 scoped_ptr<DistilledPageInfo> distilled_page, 132 scoped_ptr<DistilledPageInfo> distilled_page,
91 bool distillation_successful) { 133 bool distillation_successful) {
92 DCHECK(distillation_in_progress_);
93 DCHECK(distilled_page.get()); 134 DCHECK(distilled_page.get());
94 if (!distillation_successful) { 135 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
95 RunDistillerCallbackIfDone(); 136 if (distillation_successful) {
96 } else { 137 DistilledPageData* page_data =
97 DistilledPageProto* current_page = article_proto_->add_pages(); 138 GetPageAtIndex(started_pages_index_[page_num]);
98 // Set the title of the article as the title of the first page. 139 DistilledPageProto* current_page = new DistilledPageProto();
99 if (article_proto_->pages_size() == 1) { 140 page_data->proto.reset(current_page);
100 article_proto_->set_title(distilled_page->title); 141 page_data->page_num = page_num;
101 } 142 page_data->title = distilled_page->title;
102 143
103 current_page->set_url(page_url.spec()); 144 current_page->set_url(page_url.spec());
104 current_page->set_html(distilled_page->html); 145 current_page->set_html(distilled_page->html);
105 146
106 GURL next_page_url(distilled_page->next_page_url); 147 GURL next_page_url(distilled_page->next_page_url);
107 if (next_page_url.is_valid()) { 148 if (next_page_url.is_valid()) {
108 // The pages should be in same origin. 149 // The pages should be in same origin.
109 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); 150 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
110 } 151 }
111 152
112 processed_urls_.insert(page_url.spec());
113 distillation_in_progress_ = false;
114 int page_number = article_proto_->pages_size();
115 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); 153 for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
116 ++img_num) { 154 ++img_num) {
117 std::string image_id = 155 std::string image_id =
118 base::IntToString(page_number) + "_" + base::IntToString(img_num); 156 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
119 FetchImage(current_page, image_id, distilled_page->image_urls[img_num]); 157 FetchImage(page_num, image_id, distilled_page->image_urls[img_num]);
120 } 158 }
121 DistillPage(next_page_url); 159
160 AddToDistillationQueue(page_num + 1, next_page_url);
cjhopman 2014/02/15 02:44:15 Nit: why not do this in the `if (next_page_url.is_valid())` … [comment truncated in this view]
shashi 2014/02/15 03:15:36 Done.
161 AddPageIfDone(page_num);
162 DistillNextPage();
163 } else {
164 started_pages_index_.erase(page_num);
165 RunDistillerCallbackIfDone();
122 } 166 }
123 } 167 }
124 168
125 void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, 169 void DistillerImpl::FetchImage(int page_num,
126 const std::string& image_id, 170 const std::string& image_id,
127 const std::string& item) { 171 const std::string& item) {
172 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
173 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
128 DistillerURLFetcher* fetcher = 174 DistillerURLFetcher* fetcher =
129 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); 175 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
130 image_fetchers_.push_back(fetcher); 176 page_data->image_fetchers_.push_back(fetcher);
177
131 fetcher->FetchURL(item, 178 fetcher->FetchURL(item,
132 base::Bind(&DistillerImpl::OnFetchImageDone, 179 base::Bind(&DistillerImpl::OnFetchImageDone,
133 base::Unretained(this), 180 base::Unretained(this),
134 base::Unretained(distilled_page_proto), 181 page_num,
135 base::Unretained(fetcher), 182 base::Unretained(fetcher),
136 image_id)); 183 image_id));
137 } 184 }
138 185
139 void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, 186 void DistillerImpl::OnFetchImageDone(int page_num,
140 DistillerURLFetcher* url_fetcher, 187 DistillerURLFetcher* url_fetcher,
141 const std::string& id, 188 const std::string& id,
142 const std::string& response) { 189 const std::string& response) {
143 DCHECK_GT(article_proto_->pages_size(), 0); 190 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
144 DCHECK(distilled_page_proto); 191 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
192 DCHECK(page_data->proto);
145 DCHECK(url_fetcher); 193 DCHECK(url_fetcher);
146 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = 194 ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
147 std::find(image_fetchers_.begin(), image_fetchers_.end(), url_fetcher); 195 std::find(page_data->image_fetchers_.begin(),
196 page_data->image_fetchers_.end(),
197 url_fetcher);
148 198
149 DCHECK(fetcher_it != image_fetchers_.end()); 199 DCHECK(fetcher_it != page_data->image_fetchers_.end());
150 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone 200 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
151 // callback is invoked by the |url_fetcher|. 201 // callback is invoked by the |url_fetcher|.
152 image_fetchers_.weak_erase(fetcher_it); 202 page_data->image_fetchers_.weak_erase(fetcher_it);
153 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); 203 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
154 DistilledPageProto_Image* image = distilled_page_proto->add_image(); 204
205 DistilledPageProto_Image* image = page_data->proto->add_image();
155 image->set_name(id); 206 image->set_name(id);
156 image->set_data(response); 207 image->set_data(response);
157 RunDistillerCallbackIfDone(); 208
209 AddPageIfDone(page_num);
210 }
211
212 void DistillerImpl::AddPageIfDone(int page_num) {
213 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
214 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
215 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
216 if (page_data->image_fetchers_.empty()) {
217 finished_pages_index_[page_num] = started_pages_index_[page_num];
218 started_pages_index_.erase(page_num);
219 RunDistillerCallbackIfDone();
220 }
158 } 221 }
159 222
160 void DistillerImpl::RunDistillerCallbackIfDone() { 223 void DistillerImpl::RunDistillerCallbackIfDone() {
161 if (image_fetchers_.empty() && !distillation_in_progress_) { 224 DCHECK(!distillation_cb_.is_null());
162 distillation_cb_.Run(article_proto_.Pass()); 225 if (AreAllPagesFinished()) {
226 bool first_page = true;
227 scoped_ptr<DistilledArticleProto> article_proto(
228 new DistilledArticleProto());
229 // Stitch the pages back into the article.
230 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
231 it != finished_pages_index_.end();) {
232 DistilledPageData* page_data = GetPageAtIndex(it->second);
233 *(article_proto->add_pages()) = *(page_data->proto);
234
235 if (first_page) {
236 article_proto->set_title(page_data->title);
237 first_page = false;
238 }
239
240 finished_pages_index_.erase(it++);
241 }
242
243 pages_.clear();
244 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
245 max_pages_in_article_);
246
247 DCHECK(pages_.empty());
248 DCHECK(finished_pages_index_.empty());
249 distillation_cb_.Run(article_proto.Pass());
250 distillation_cb_.Reset();
163 } 251 }
164 } 252 }
165 253
166 } // namespace dom_distiller 254 } // namespace dom_distiller
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698