OLD | NEW |
---|---|
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/dom_distiller/core/distiller.h" | 5 #include "components/dom_distiller/core/distiller.h" |
6 | 6 |
7 #include <map> | 7 #include <map> |
8 | 8 |
9 #include "base/bind.h" | 9 #include "base/bind.h" |
10 #include "base/callback.h" | 10 #include "base/callback.h" |
11 #include "base/location.h" | 11 #include "base/location.h" |
12 #include "base/message_loop/message_loop.h" | 12 #include "base/message_loop/message_loop.h" |
13 #include "base/strings/string_number_conversions.h" | 13 #include "base/strings/string_number_conversions.h" |
14 #include "base/strings/utf_string_conversions.h" | 14 #include "base/strings/utf_string_conversions.h" |
15 #include "base/values.h" | 15 #include "base/values.h" |
16 #include "components/dom_distiller/core/distiller_page.h" | 16 #include "components/dom_distiller/core/distiller_page.h" |
17 #include "components/dom_distiller/core/distiller_url_fetcher.h" | 17 #include "components/dom_distiller/core/distiller_url_fetcher.h" |
18 #include "components/dom_distiller/core/proto/distilled_article.pb.h" | 18 #include "components/dom_distiller/core/proto/distilled_article.pb.h" |
19 #include "components/dom_distiller/core/proto/distilled_page.pb.h" | 19 #include "components/dom_distiller/core/proto/distilled_page.pb.h" |
20 #include "net/url_request/url_request_context_getter.h" | 20 #include "net/url_request/url_request_context_getter.h" |
21 | 21 |
22 namespace { | 22 namespace { |
23 // Maximum number of distilled pages in an article. | 23 // Maximum number of distilled pages in an article. |
24 const int kMaxPagesInArticle = 32; | 24 const size_t kMaxPagesInArticle = 32; |
25 } | 25 } |
26 | 26 |
27 namespace dom_distiller { | 27 namespace dom_distiller { |
28 | 28 |
29 DistillerFactoryImpl::DistillerFactoryImpl( | 29 DistillerFactoryImpl::DistillerFactoryImpl( |
30 scoped_ptr<DistillerPageFactory> distiller_page_factory, | 30 scoped_ptr<DistillerPageFactory> distiller_page_factory, |
31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) | 31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) |
32 : distiller_page_factory_(distiller_page_factory.Pass()), | 32 : distiller_page_factory_(distiller_page_factory.Pass()), |
33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} | 33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} |
34 | 34 |
35 DistillerFactoryImpl::~DistillerFactoryImpl() {} | 35 DistillerFactoryImpl::~DistillerFactoryImpl() {} |
36 | 36 |
37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { | 37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { |
38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( | 38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( |
39 *distiller_page_factory_, *distiller_url_fetcher_factory_)); | 39 *distiller_page_factory_, *distiller_url_fetcher_factory_)); |
40 distiller->Init(); | 40 distiller->Init(); |
41 return distiller.PassAs<Distiller>(); | 41 return distiller.PassAs<Distiller>(); |
42 } | 42 } |
43 | 43 |
44 DistillerImpl::DistilledPageData::DistilledPageData() {} | |
45 | |
46 DistillerImpl::DistilledPageData::~DistilledPageData() {} | |
47 | |
44 DistillerImpl::DistillerImpl( | 48 DistillerImpl::DistillerImpl( |
45 const DistillerPageFactory& distiller_page_factory, | 49 const DistillerPageFactory& distiller_page_factory, |
46 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) | 50 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) |
47 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), | 51 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory) { |
48 distillation_in_progress_(false) { | |
49 page_distiller_.reset(new PageDistiller(distiller_page_factory)); | 52 page_distiller_.reset(new PageDistiller(distiller_page_factory)); |
50 } | 53 } |
51 | 54 |
52 DistillerImpl::~DistillerImpl() { | 55 DistillerImpl::~DistillerImpl() { DCHECK(NoPendingPages()); } |
53 DCHECK(image_fetchers_.empty()); | 56 |
54 DCHECK(!distillation_in_progress_); | 57 void DistillerImpl::Init() { |
58 DCHECK(NoPendingPages()); | |
59 page_distiller_->Init(); | |
55 } | 60 } |
56 | 61 |
57 void DistillerImpl::Init() { | 62 size_t DistillerImpl::GetMaxNumPagesInArticle() const { |
58 DCHECK(!distillation_in_progress_); | 63 return kMaxPagesInArticle; |
59 page_distiller_->Init(); | 64 } |
60 article_proto_.reset(new DistilledArticleProto()); | 65 |
66 bool DistillerImpl::NoPendingPages() const { | |
67 return started_pages_.empty() && waiting_pages_.empty(); | |
68 } | |
69 | |
70 size_t DistillerImpl::TotalPageCount() const { | |
71 return waiting_pages_.size() + started_pages_.size() + finished_pages_.size(); | |
72 } | |
73 | |
74 void DistillerImpl::AddToDistillationQueue(int page_no, const GURL& url) { | |
75 if (!IsPageNumberInUse(page_no) && url.is_valid() && | |
76 TotalPageCount() < GetMaxNumPagesInArticle() && | |
77 seen_urls_.find(url.spec()) == seen_urls_.end()) { | |
78 waiting_pages_[page_no] = url; | |
79 } | |
80 } | |
81 | |
82 bool DistillerImpl::IsPageNumberInUse(int page_no) const { | |
83 return waiting_pages_.find(page_no) != waiting_pages_.end() || | |
84 started_pages_.find(page_no) != started_pages_.end() || | |
85 finished_pages_index_.find(page_no) != finished_pages_index_.end(); | |
61 } | 86 } |
62 | 87 |
63 void DistillerImpl::DistillPage(const GURL& url, | 88 void DistillerImpl::DistillPage(const GURL& url, |
64 const DistillerCallback& distillation_cb) { | 89 const DistillerCallback& distillation_cb) { |
65 DCHECK(!distillation_in_progress_); | 90 DCHECK(NoPendingPages()); |
66 distillation_cb_ = distillation_cb; | 91 distillation_cb_ = distillation_cb; |
67 DistillPage(url); | 92 |
93 AddToDistillationQueue(0, url); | |
94 DistillNextPage(); | |
68 } | 95 } |
69 | 96 |
70 void DistillerImpl::DistillPage(const GURL& url) { | 97 void DistillerImpl::DistillNextPage() { |
71 DCHECK(!distillation_in_progress_); | 98 if (!waiting_pages_.empty()) { |
72 if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && | 99 std::map<int, GURL>::iterator front = waiting_pages_.begin(); |
73 processed_urls_.find(url.spec()) == processed_urls_.end()) { | 100 int page_no = front->first; |
74 distillation_in_progress_ = true; | 101 const GURL url = front->second; |
75 // Distill the next page. | 102 |
103 waiting_pages_.erase(front); | |
76 DCHECK(url.is_valid()); | 104 DCHECK(url.is_valid()); |
77 DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); | 105 DCHECK(started_pages_.find(page_no) == started_pages_.end()); |
106 started_pages_.insert(page_no); | |
78 page_distiller_->DistillPage( | 107 page_distiller_->DistillPage( |
79 url, | 108 url, |
80 base::Bind(&DistillerImpl::OnPageDistillationFinished, | 109 base::Bind(&DistillerImpl::OnPageDistillationFinished, |
81 base::Unretained(this), | 110 base::Unretained(this), |
111 page_no, | |
82 url)); | 112 url)); |
83 } else { | |
84 RunDistillerCallbackIfDone(); | |
85 } | 113 } |
86 } | 114 } |
87 | 115 |
88 void DistillerImpl::OnPageDistillationFinished( | 116 void DistillerImpl::OnPageDistillationFinished( |
117 int page_no, | |
89 const GURL& page_url, | 118 const GURL& page_url, |
90 scoped_ptr<DistilledPageInfo> distilled_page, | 119 scoped_ptr<DistilledPageInfo> distilled_page, |
91 bool distillation_successful) { | 120 bool distillation_successful) { |
92 DCHECK(distillation_in_progress_); | |
93 DCHECK(distilled_page.get()); | 121 DCHECK(distilled_page.get()); |
94 if (!distillation_successful) { | 122 DCHECK(IsPageNumberInUse(page_no)); |
cjhopman
2014/02/14 20:53:52
This could be more specific and check that page_no
shashi
2014/02/14 23:25:29
Done.
| |
95 RunDistillerCallbackIfDone(); | 123 if (distillation_successful) { |
96 } else { | 124 DistilledPageData* page_data = new DistilledPageData(); |
97 DistilledPageProto* current_page = article_proto_->add_pages(); | 125 DistilledPageProto* current_page = new DistilledPageProto(); |
98 // Set the title of the article as the title of the first page. | 126 page_data->proto.reset(current_page); |
99 if (article_proto_->pages_size() == 1) { | 127 page_data->page_no = page_no; |
100 article_proto_->set_title(distilled_page->title); | 128 page_data->title = distilled_page->title; |
101 } | |
102 | 129 |
103 current_page->set_url(page_url.spec()); | 130 current_page->set_url(page_url.spec()); |
104 current_page->set_html(distilled_page->html); | 131 current_page->set_html(distilled_page->html); |
105 | 132 |
106 GURL next_page_url(distilled_page->next_page_url); | 133 GURL next_page_url(distilled_page->next_page_url); |
107 if (next_page_url.is_valid()) { | 134 if (next_page_url.is_valid()) { |
108 // The pages should be in same origin. | 135 // The pages should be in same origin. |
109 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); | 136 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); |
110 } | 137 } |
111 | 138 |
112 processed_urls_.insert(page_url.spec()); | 139 seen_urls_.insert(page_url.spec()); |
cjhopman
2014/02/14 20:53:52
I think this should be done in ::AddToDistillation
shashi
2014/02/14 23:25:29
Done.
| |
113 distillation_in_progress_ = false; | |
114 int page_number = article_proto_->pages_size(); | |
115 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); | 140 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); |
116 ++img_num) { | 141 ++img_num) { |
117 std::string image_id = | 142 std::string image_id = |
118 base::IntToString(page_number) + "_" + base::IntToString(img_num); | 143 base::IntToString(page_no + 1) + "_" + base::IntToString(img_num); |
119 FetchImage(current_page, image_id, distilled_page->image_urls[img_num]); | 144 FetchImage(page_data, image_id, distilled_page->image_urls[img_num]); |
120 } | 145 } |
121 DistillPage(next_page_url); | 146 |
147 AddToDistillationQueue(page_no + 1, next_page_url); | |
148 CheckAndAddPageIfDone(page_data); | |
149 DistillNextPage(); | |
150 } else { | |
151 started_pages_.erase(page_no); | |
152 RunDistillerCallbackIfDone(); | |
122 } | 153 } |
123 } | 154 } |
124 | 155 |
125 void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, | 156 void DistillerImpl::FetchImage(DistilledPageData* distilled_page_data, |
126 const std::string& image_id, | 157 const std::string& image_id, |
127 const std::string& item) { | 158 const std::string& item) { |
159 DCHECK(distilled_page_data); | |
128 DistillerURLFetcher* fetcher = | 160 DistillerURLFetcher* fetcher = |
129 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); | 161 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); |
130 image_fetchers_.push_back(fetcher); | 162 distilled_page_data->image_fetchers_.push_back(fetcher); |
163 | |
131 fetcher->FetchURL(item, | 164 fetcher->FetchURL(item, |
132 base::Bind(&DistillerImpl::OnFetchImageDone, | 165 base::Bind(&DistillerImpl::OnFetchImageDone, |
133 base::Unretained(this), | 166 base::Unretained(this), |
134 base::Unretained(distilled_page_proto), | 167 base::Unretained(distilled_page_data), |
135 base::Unretained(fetcher), | 168 base::Unretained(fetcher), |
136 image_id)); | 169 image_id)); |
137 } | 170 } |
138 | 171 |
139 void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, | 172 void DistillerImpl::OnFetchImageDone(DistilledPageData* distilled_page_data, |
140 DistillerURLFetcher* url_fetcher, | 173 DistillerURLFetcher* url_fetcher, |
141 const std::string& id, | 174 const std::string& id, |
142 const std::string& response) { | 175 const std::string& response) { |
143 DCHECK_GT(article_proto_->pages_size(), 0); | 176 DCHECK(distilled_page_data); |
144 DCHECK(distilled_page_proto); | 177 DCHECK(distilled_page_data->proto); |
145 DCHECK(url_fetcher); | 178 DCHECK(url_fetcher); |
146 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = | 179 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = |
147 std::find(image_fetchers_.begin(), image_fetchers_.end(), url_fetcher); | 180 std::find(distilled_page_data->image_fetchers_.begin(), |
181 distilled_page_data->image_fetchers_.end(), | |
182 url_fetcher); | |
148 | 183 |
149 DCHECK(fetcher_it != image_fetchers_.end()); | 184 DCHECK(fetcher_it != distilled_page_data->image_fetchers_.end()); |
150 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone | 185 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone |
151 // callback is invoked by the |url_fetcher|. | 186 // callback is invoked by the |url_fetcher|. |
152 image_fetchers_.weak_erase(fetcher_it); | 187 distilled_page_data->image_fetchers_.weak_erase(fetcher_it); |
153 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); | 188 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); |
154 DistilledPageProto_Image* image = distilled_page_proto->add_image(); | 189 |
190 DistilledPageProto_Image* image = distilled_page_data->proto->add_image(); | |
155 image->set_name(id); | 191 image->set_name(id); |
156 image->set_data(response); | 192 image->set_data(response); |
157 RunDistillerCallbackIfDone(); | 193 |
194 CheckAndAddPageIfDone(distilled_page_data); | |
195 } | |
196 | |
197 void DistillerImpl::CheckAndAddPageIfDone( | |
198 DistilledPageData* distilled_page_data) { | |
199 DCHECK(distilled_page_data); | |
200 int page_no = distilled_page_data->page_no; | |
201 DCHECK(started_pages_.find(page_no) != started_pages_.end()); | |
202 DCHECK(finished_pages_index_.find(page_no) == finished_pages_index_.end()); | |
203 if (distilled_page_data->image_fetchers_.empty()) { | |
204 started_pages_.erase(page_no); | |
205 finished_pages_.push_back(distilled_page_data); | |
206 finished_pages_index_[page_no] = finished_pages_.size() - 1; | |
207 RunDistillerCallbackIfDone(); | |
208 } | |
158 } | 209 } |
159 | 210 |
160 void DistillerImpl::RunDistillerCallbackIfDone() { | 211 void DistillerImpl::RunDistillerCallbackIfDone() { |
161 if (image_fetchers_.empty() && !distillation_in_progress_) { | 212 DCHECK(!distillation_cb_.is_null()); |
162 distillation_cb_.Run(article_proto_.Pass()); | 213 if (NoPendingPages()) { |
214 bool first_page = true; | |
215 scoped_ptr<DistilledArticleProto> article_proto( | |
216 new DistilledArticleProto()); | |
217 // Stitch the pages back into the article. | |
218 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin(); | |
219 it != finished_pages_index_.end();) { | |
220 const DistilledPageData* page_data = finished_pages_[it->second]; | |
221 *(article_proto->add_pages()) = *(page_data->proto); | |
222 | |
223 if (first_page) { | |
224 article_proto->set_title(page_data->title); | |
225 first_page = false; | |
226 } | |
227 | |
228 finished_pages_index_.erase(it++); | |
229 } | |
230 | |
231 finished_pages_.clear(); | |
232 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()), | |
233 GetMaxNumPagesInArticle()); | |
234 | |
235 DCHECK(finished_pages_.empty()); | |
236 DCHECK(finished_pages_index_.empty()); | |
237 distillation_cb_.Run(article_proto.Pass()); | |
238 distillation_cb_.Reset(); | |
163 } | 239 } |
164 } | 240 } |
165 | 241 |
166 } // namespace dom_distiller | 242 } // namespace dom_distiller |
OLD | NEW |