Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: components/dom_distiller/core/distiller.cc

Issue 130543003: Store page no for distilled pages undergoing distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Partition internal states into 3 sets: started, pending, finished. Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/dom_distiller/core/distiller.h" 5 #include "components/dom_distiller/core/distiller.h"
6 6
7 #include <map> 7 #include <map>
8 8
9 #include "base/bind.h" 9 #include "base/bind.h"
10 #include "base/callback.h" 10 #include "base/callback.h"
11 #include "base/location.h" 11 #include "base/location.h"
12 #include "base/message_loop/message_loop.h" 12 #include "base/message_loop/message_loop.h"
13 #include "base/strings/string_number_conversions.h" 13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/utf_string_conversions.h" 14 #include "base/strings/utf_string_conversions.h"
15 #include "base/values.h" 15 #include "base/values.h"
16 #include "components/dom_distiller/core/distiller_page.h" 16 #include "components/dom_distiller/core/distiller_page.h"
17 #include "components/dom_distiller/core/distiller_url_fetcher.h" 17 #include "components/dom_distiller/core/distiller_url_fetcher.h"
18 #include "components/dom_distiller/core/proto/distilled_article.pb.h" 18 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
19 #include "components/dom_distiller/core/proto/distilled_page.pb.h" 19 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
20 #include "net/url_request/url_request_context_getter.h" 20 #include "net/url_request/url_request_context_getter.h"
21 21
22 namespace { 22 namespace {
23 // Maximum number of distilled pages in an article. 23 // Maximum number of distilled pages in an article.
24 const int kMaxPagesInArticle = 32; 24 const size_t kMaxPagesInArticle = 32;
25 } 25 }
26 26
27 namespace dom_distiller { 27 namespace dom_distiller {
28 28
29 DistillerFactoryImpl::DistillerFactoryImpl( 29 DistillerFactoryImpl::DistillerFactoryImpl(
30 scoped_ptr<DistillerPageFactory> distiller_page_factory, 30 scoped_ptr<DistillerPageFactory> distiller_page_factory,
31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) 31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory)
32 : distiller_page_factory_(distiller_page_factory.Pass()), 32 : distiller_page_factory_(distiller_page_factory.Pass()),
33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} 33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {}
34 34
35 DistillerFactoryImpl::~DistillerFactoryImpl() {} 35 DistillerFactoryImpl::~DistillerFactoryImpl() {}
36 36
37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { 37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( 38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
39 *distiller_page_factory_, *distiller_url_fetcher_factory_)); 39 *distiller_page_factory_, *distiller_url_fetcher_factory_));
40 distiller->Init(); 40 distiller->Init();
41 return distiller.PassAs<Distiller>(); 41 return distiller.PassAs<Distiller>();
42 } 42 }
43 43
44 DistillerImpl::DistilledPageData::DistilledPageData() {}
45
46 DistillerImpl::DistilledPageData::~DistilledPageData() {}
47
44 DistillerImpl::DistillerImpl( 48 DistillerImpl::DistillerImpl(
45 const DistillerPageFactory& distiller_page_factory, 49 const DistillerPageFactory& distiller_page_factory,
46 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) 50 const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
47 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), 51 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory) {
48 distillation_in_progress_(false) {
49 page_distiller_.reset(new PageDistiller(distiller_page_factory)); 52 page_distiller_.reset(new PageDistiller(distiller_page_factory));
50 } 53 }
51 54
52 DistillerImpl::~DistillerImpl() { 55 DistillerImpl::~DistillerImpl() { DCHECK(NoPendingPages()); }
53 DCHECK(image_fetchers_.empty()); 56
54 DCHECK(!distillation_in_progress_); 57 void DistillerImpl::Init() {
58 DCHECK(NoPendingPages());
59 page_distiller_->Init();
55 } 60 }
56 61
57 void DistillerImpl::Init() { 62 size_t DistillerImpl::GetMaxNumPagesInArticle() const {
58 DCHECK(!distillation_in_progress_); 63 return kMaxPagesInArticle;
59 page_distiller_->Init(); 64 }
60 article_proto_.reset(new DistilledArticleProto()); 65
66 bool DistillerImpl::NoPendingPages() const {
67 return started_pages_.empty() && waiting_pages_.empty();
68 }
69
70 size_t DistillerImpl::TotalPageCount() const {
71 return waiting_pages_.size() + started_pages_.size() + finished_pages_.size();
72 }
73
74 void DistillerImpl::AddToDistillationQueue(int page_no, const GURL& url) {
75 if (!IsPageNumberInUse(page_no) && url.is_valid() &&
76 TotalPageCount() < GetMaxNumPagesInArticle() &&
77 seen_urls_.find(url.spec()) == seen_urls_.end()) {
78 waiting_pages_[page_no] = url;
79 }
80 }
81
82 bool DistillerImpl::IsPageNumberInUse(int page_no) const {
83 return waiting_pages_.find(page_no) != waiting_pages_.end() ||
84 started_pages_.find(page_no) != started_pages_.end() ||
85 finished_pages_index_.find(page_no) != finished_pages_index_.end();
61 } 86 }
62 87
63 void DistillerImpl::DistillPage(const GURL& url, 88 void DistillerImpl::DistillPage(const GURL& url,
64 const DistillerCallback& distillation_cb) { 89 const DistillerCallback& distillation_cb) {
65 DCHECK(!distillation_in_progress_); 90 DCHECK(NoPendingPages());
66 distillation_cb_ = distillation_cb; 91 distillation_cb_ = distillation_cb;
67 DistillPage(url); 92
93 AddToDistillationQueue(0, url);
94 DistillNextPage();
68 } 95 }
69 96
70 void DistillerImpl::DistillPage(const GURL& url) { 97 void DistillerImpl::DistillNextPage() {
71 DCHECK(!distillation_in_progress_); 98 if (!waiting_pages_.empty()) {
72 if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && 99 std::map<int, GURL>::iterator front = waiting_pages_.begin();
73 processed_urls_.find(url.spec()) == processed_urls_.end()) { 100 int page_no = front->first;
74 distillation_in_progress_ = true; 101 const GURL url = front->second;
75 // Distill the next page. 102
103 waiting_pages_.erase(front);
76 DCHECK(url.is_valid()); 104 DCHECK(url.is_valid());
77 DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); 105 DCHECK(started_pages_.find(page_no) == started_pages_.end());
106 started_pages_.insert(page_no);
78 page_distiller_->DistillPage( 107 page_distiller_->DistillPage(
79 url, 108 url,
80 base::Bind(&DistillerImpl::OnPageDistillationFinished, 109 base::Bind(&DistillerImpl::OnPageDistillationFinished,
81 base::Unretained(this), 110 base::Unretained(this),
111 page_no,
82 url)); 112 url));
83 } else {
84 RunDistillerCallbackIfDone();
85 } 113 }
86 } 114 }
87 115
88 void DistillerImpl::OnPageDistillationFinished( 116 void DistillerImpl::OnPageDistillationFinished(
117 int page_no,
89 const GURL& page_url, 118 const GURL& page_url,
90 scoped_ptr<DistilledPageInfo> distilled_page, 119 scoped_ptr<DistilledPageInfo> distilled_page,
91 bool distillation_successful) { 120 bool distillation_successful) {
92 DCHECK(distillation_in_progress_);
93 DCHECK(distilled_page.get()); 121 DCHECK(distilled_page.get());
94 if (!distillation_successful) { 122 DCHECK(IsPageNumberInUse(page_no));
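cjhopman 2014/02/14 20:53:52 This could be more specific and check that page_no… [comment truncated in this capture; presumably "is in started_pages_", since a page is inserted there before its distillation starts]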
shashi 2014/02/14 23:25:29 Done.
95 RunDistillerCallbackIfDone(); 123 if (distillation_successful) {
96 } else { 124 DistilledPageData* page_data = new DistilledPageData();
97 DistilledPageProto* current_page = article_proto_->add_pages(); 125 DistilledPageProto* current_page = new DistilledPageProto();
98 // Set the title of the article as the title of the first page. 126 page_data->proto.reset(current_page);
99 if (article_proto_->pages_size() == 1) { 127 page_data->page_no = page_no;
100 article_proto_->set_title(distilled_page->title); 128 page_data->title = distilled_page->title;
101 }
102 129
103 current_page->set_url(page_url.spec()); 130 current_page->set_url(page_url.spec());
104 current_page->set_html(distilled_page->html); 131 current_page->set_html(distilled_page->html);
105 132
106 GURL next_page_url(distilled_page->next_page_url); 133 GURL next_page_url(distilled_page->next_page_url);
107 if (next_page_url.is_valid()) { 134 if (next_page_url.is_valid()) {
108 // The pages should be in same origin. 135 // The pages should be in same origin.
109 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); 136 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
110 } 137 }
111 138
112 processed_urls_.insert(page_url.spec()); 139 seen_urls_.insert(page_url.spec());
cjhopman 2014/02/14 20:53:52 I think this should be done in ::AddToDistillationQueue… [comment truncated in this capture]
shashi 2014/02/14 23:25:29 Done.
113 distillation_in_progress_ = false;
114 int page_number = article_proto_->pages_size();
115 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); 140 for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
116 ++img_num) { 141 ++img_num) {
117 std::string image_id = 142 std::string image_id =
118 base::IntToString(page_number) + "_" + base::IntToString(img_num); 143 base::IntToString(page_no + 1) + "_" + base::IntToString(img_num);
119 FetchImage(current_page, image_id, distilled_page->image_urls[img_num]); 144 FetchImage(page_data, image_id, distilled_page->image_urls[img_num]);
120 } 145 }
121 DistillPage(next_page_url); 146
147 AddToDistillationQueue(page_no + 1, next_page_url);
148 CheckAndAddPageIfDone(page_data);
149 DistillNextPage();
150 } else {
151 started_pages_.erase(page_no);
152 RunDistillerCallbackIfDone();
122 } 153 }
123 } 154 }
124 155
125 void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, 156 void DistillerImpl::FetchImage(DistilledPageData* distilled_page_data,
126 const std::string& image_id, 157 const std::string& image_id,
127 const std::string& item) { 158 const std::string& item) {
159 DCHECK(distilled_page_data);
128 DistillerURLFetcher* fetcher = 160 DistillerURLFetcher* fetcher =
129 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); 161 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
130 image_fetchers_.push_back(fetcher); 162 distilled_page_data->image_fetchers_.push_back(fetcher);
163
131 fetcher->FetchURL(item, 164 fetcher->FetchURL(item,
132 base::Bind(&DistillerImpl::OnFetchImageDone, 165 base::Bind(&DistillerImpl::OnFetchImageDone,
133 base::Unretained(this), 166 base::Unretained(this),
134 base::Unretained(distilled_page_proto), 167 base::Unretained(distilled_page_data),
135 base::Unretained(fetcher), 168 base::Unretained(fetcher),
136 image_id)); 169 image_id));
137 } 170 }
138 171
139 void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, 172 void DistillerImpl::OnFetchImageDone(DistilledPageData* distilled_page_data,
140 DistillerURLFetcher* url_fetcher, 173 DistillerURLFetcher* url_fetcher,
141 const std::string& id, 174 const std::string& id,
142 const std::string& response) { 175 const std::string& response) {
143 DCHECK_GT(article_proto_->pages_size(), 0); 176 DCHECK(distilled_page_data);
144 DCHECK(distilled_page_proto); 177 DCHECK(distilled_page_data->proto);
145 DCHECK(url_fetcher); 178 DCHECK(url_fetcher);
146 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = 179 ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
147 std::find(image_fetchers_.begin(), image_fetchers_.end(), url_fetcher); 180 std::find(distilled_page_data->image_fetchers_.begin(),
181 distilled_page_data->image_fetchers_.end(),
182 url_fetcher);
148 183
149 DCHECK(fetcher_it != image_fetchers_.end()); 184 DCHECK(fetcher_it != distilled_page_data->image_fetchers_.end());
150 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone 185 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
151 // callback is invoked by the |url_fetcher|. 186 // callback is invoked by the |url_fetcher|.
152 image_fetchers_.weak_erase(fetcher_it); 187 distilled_page_data->image_fetchers_.weak_erase(fetcher_it);
153 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); 188 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
154 DistilledPageProto_Image* image = distilled_page_proto->add_image(); 189
190 DistilledPageProto_Image* image = distilled_page_data->proto->add_image();
155 image->set_name(id); 191 image->set_name(id);
156 image->set_data(response); 192 image->set_data(response);
157 RunDistillerCallbackIfDone(); 193
194 CheckAndAddPageIfDone(distilled_page_data);
195 }
196
197 void DistillerImpl::CheckAndAddPageIfDone(
198 DistilledPageData* distilled_page_data) {
199 DCHECK(distilled_page_data);
200 int page_no = distilled_page_data->page_no;
201 DCHECK(started_pages_.find(page_no) != started_pages_.end());
202 DCHECK(finished_pages_index_.find(page_no) == finished_pages_index_.end());
203 if (distilled_page_data->image_fetchers_.empty()) {
204 started_pages_.erase(page_no);
205 finished_pages_.push_back(distilled_page_data);
206 finished_pages_index_[page_no] = finished_pages_.size() - 1;
207 RunDistillerCallbackIfDone();
208 }
158 } 209 }
159 210
160 void DistillerImpl::RunDistillerCallbackIfDone() { 211 void DistillerImpl::RunDistillerCallbackIfDone() {
161 if (image_fetchers_.empty() && !distillation_in_progress_) { 212 DCHECK(!distillation_cb_.is_null());
162 distillation_cb_.Run(article_proto_.Pass()); 213 if (NoPendingPages()) {
214 bool first_page = true;
215 scoped_ptr<DistilledArticleProto> article_proto(
216 new DistilledArticleProto());
217 // Stitch the pages back into the article.
218 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
219 it != finished_pages_index_.end();) {
220 const DistilledPageData* page_data = finished_pages_[it->second];
221 *(article_proto->add_pages()) = *(page_data->proto);
222
223 if (first_page) {
224 article_proto->set_title(page_data->title);
225 first_page = false;
226 }
227
228 finished_pages_index_.erase(it++);
229 }
230
231 finished_pages_.clear();
232 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
233 GetMaxNumPagesInArticle());
234
235 DCHECK(finished_pages_.empty());
236 DCHECK(finished_pages_index_.empty());
237 distillation_cb_.Run(article_proto.Pass());
238 distillation_cb_.Reset();
163 } 239 }
164 } 240 }
165 241
166 } // namespace dom_distiller 242 } // namespace dom_distiller
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698