OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/dom_distiller/core/distiller.h" | 5 #include "components/dom_distiller/core/distiller.h" |
6 | 6 |
7 #include <map> | 7 #include <map> |
8 | 8 |
9 #include "base/bind.h" | 9 #include "base/bind.h" |
10 #include "base/callback.h" | 10 #include "base/callback.h" |
11 #include "base/location.h" | 11 #include "base/location.h" |
12 #include "base/message_loop/message_loop.h" | 12 #include "base/message_loop/message_loop.h" |
13 #include "base/strings/string_number_conversions.h" | 13 #include "base/strings/string_number_conversions.h" |
14 #include "base/strings/utf_string_conversions.h" | 14 #include "base/strings/utf_string_conversions.h" |
15 #include "base/values.h" | 15 #include "base/values.h" |
16 #include "components/dom_distiller/core/distiller_page.h" | 16 #include "components/dom_distiller/core/distiller_page.h" |
17 #include "components/dom_distiller/core/distiller_url_fetcher.h" | 17 #include "components/dom_distiller/core/distiller_url_fetcher.h" |
18 #include "components/dom_distiller/core/proto/distilled_article.pb.h" | 18 #include "components/dom_distiller/core/proto/distilled_article.pb.h" |
19 #include "components/dom_distiller/core/proto/distilled_page.pb.h" | 19 #include "components/dom_distiller/core/proto/distilled_page.pb.h" |
20 #include "net/url_request/url_request_context_getter.h" | 20 #include "net/url_request/url_request_context_getter.h" |
21 | 21 |
22 namespace { | 22 namespace { |
23 // Maximum number of distilled pages in an article. | 23 // Maximum number of distilled pages in an article. |
24 const int kMaxPagesInArticle = 32; | 24 const size_t kMaxPagesInArticle = 32; |
25 } | 25 } |
26 | 26 |
27 namespace dom_distiller { | 27 namespace dom_distiller { |
28 | 28 |
29 DistillerFactoryImpl::DistillerFactoryImpl( | 29 DistillerFactoryImpl::DistillerFactoryImpl( |
30 scoped_ptr<DistillerPageFactory> distiller_page_factory, | 30 scoped_ptr<DistillerPageFactory> distiller_page_factory, |
31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) | 31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) |
32 : distiller_page_factory_(distiller_page_factory.Pass()), | 32 : distiller_page_factory_(distiller_page_factory.Pass()), |
33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} | 33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} |
34 | 34 |
35 DistillerFactoryImpl::~DistillerFactoryImpl() {} | 35 DistillerFactoryImpl::~DistillerFactoryImpl() {} |
36 | 36 |
37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { | 37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { |
38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( | 38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( |
39 *distiller_page_factory_, *distiller_url_fetcher_factory_)); | 39 *distiller_page_factory_, *distiller_url_fetcher_factory_)); |
40 distiller->Init(); | 40 distiller->Init(); |
41 return distiller.PassAs<Distiller>(); | 41 return distiller.PassAs<Distiller>(); |
42 } | 42 } |
43 | 43 |
| 44 DistillerImpl::DistilledPageData::DistilledPageData() {} |
| 45 |
| 46 DistillerImpl::DistilledPageData::~DistilledPageData() {} |
| 47 |
44 DistillerImpl::DistillerImpl( | 48 DistillerImpl::DistillerImpl( |
45 const DistillerPageFactory& distiller_page_factory, | 49 const DistillerPageFactory& distiller_page_factory, |
46 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) | 50 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) |
47 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), | 51 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), |
48 distillation_in_progress_(false) { | 52 max_pages_in_article_(kMaxPagesInArticle) { |
49 page_distiller_.reset(new PageDistiller(distiller_page_factory)); | 53 page_distiller_.reset(new PageDistiller(distiller_page_factory)); |
50 } | 54 } |
51 | 55 |
52 DistillerImpl::~DistillerImpl() { | 56 DistillerImpl::~DistillerImpl() { DCHECK(AreAllPagesFinished()); } |
53 DCHECK(image_fetchers_.empty()); | 57 |
54 DCHECK(!distillation_in_progress_); | 58 void DistillerImpl::Init() { |
| 59 DCHECK(AreAllPagesFinished()); |
| 60 page_distiller_->Init(); |
55 } | 61 } |
56 | 62 |
57 void DistillerImpl::Init() { | 63 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) { |
58 DCHECK(!distillation_in_progress_); | 64 max_pages_in_article_ = max_num_pages; |
59 page_distiller_->Init(); | 65 } |
60 article_proto_.reset(new DistilledArticleProto()); | 66 |
| 67 bool DistillerImpl::AreAllPagesFinished() const { |
| 68 return started_pages_index_.empty() && waiting_pages_.empty(); |
| 69 } |
| 70 |
| 71 size_t DistillerImpl::TotalPageCount() const { |
| 72 return waiting_pages_.size() + started_pages_index_.size() + |
| 73 finished_pages_index_.size(); |
| 74 } |
| 75 |
| 76 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) { |
| 77 if (!IsPageNumberInUse(page_num) && url.is_valid() && |
| 78 TotalPageCount() < max_pages_in_article_ && |
| 79 seen_urls_.find(url.spec()) == seen_urls_.end()) { |
| 80 waiting_pages_[page_num] = url; |
| 81 } |
| 82 } |
| 83 |
| 84 bool DistillerImpl::IsPageNumberInUse(int page_num) const { |
| 85 return waiting_pages_.find(page_num) != waiting_pages_.end() || |
| 86 started_pages_index_.find(page_num) != started_pages_index_.end() || |
| 87 finished_pages_index_.find(page_num) != finished_pages_index_.end(); |
| 88 } |
| 89 |
| 90 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index) |
| 91 const { |
| 92 DCHECK_LT(index, pages_.size()); |
| 93 DistilledPageData* page_data = pages_[index]; |
| 94 DCHECK(page_data); |
| 95 return page_data; |
61 } | 96 } |
62 | 97 |
63 void DistillerImpl::DistillPage(const GURL& url, | 98 void DistillerImpl::DistillPage(const GURL& url, |
64 const DistillerCallback& distillation_cb) { | 99 const DistillerCallback& distillation_cb) { |
65 DCHECK(!distillation_in_progress_); | 100 DCHECK(AreAllPagesFinished()); |
66 distillation_cb_ = distillation_cb; | 101 distillation_cb_ = distillation_cb; |
67 DistillPage(url); | 102 |
| 103 AddToDistillationQueue(0, url); |
| 104 DistillNextPage(); |
68 } | 105 } |
69 | 106 |
70 void DistillerImpl::DistillPage(const GURL& url) { | 107 void DistillerImpl::DistillNextPage() { |
71 DCHECK(!distillation_in_progress_); | 108 if (!waiting_pages_.empty()) { |
72 if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && | 109 std::map<int, GURL>::iterator front = waiting_pages_.begin(); |
73 processed_urls_.find(url.spec()) == processed_urls_.end()) { | 110 int page_num = front->first; |
74 distillation_in_progress_ = true; | 111 const GURL url = front->second; |
75 // Distill the next page. | 112 |
| 113 waiting_pages_.erase(front); |
76 DCHECK(url.is_valid()); | 114 DCHECK(url.is_valid()); |
77 DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); | 115 DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end()); |
| 116 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end()); |
| 117 seen_urls_.insert(url.spec()); |
| 118 pages_.push_back(new DistilledPageData()); |
| 119 started_pages_index_[page_num] = pages_.size() - 1; |
78 page_distiller_->DistillPage( | 120 page_distiller_->DistillPage( |
79 url, | 121 url, |
80 base::Bind(&DistillerImpl::OnPageDistillationFinished, | 122 base::Bind(&DistillerImpl::OnPageDistillationFinished, |
81 base::Unretained(this), | 123 base::Unretained(this), |
| 124 page_num, |
82 url)); | 125 url)); |
83 } else { | |
84 RunDistillerCallbackIfDone(); | |
85 } | 126 } |
86 } | 127 } |
87 | 128 |
88 void DistillerImpl::OnPageDistillationFinished( | 129 void DistillerImpl::OnPageDistillationFinished( |
| 130 int page_num, |
89 const GURL& page_url, | 131 const GURL& page_url, |
90 scoped_ptr<DistilledPageInfo> distilled_page, | 132 scoped_ptr<DistilledPageInfo> distilled_page, |
91 bool distillation_successful) { | 133 bool distillation_successful) { |
92 DCHECK(distillation_in_progress_); | |
93 DCHECK(distilled_page.get()); | 134 DCHECK(distilled_page.get()); |
94 if (!distillation_successful) { | 135 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); |
95 RunDistillerCallbackIfDone(); | 136 if (distillation_successful) { |
96 } else { | 137 DistilledPageData* page_data = |
97 DistilledPageProto* current_page = article_proto_->add_pages(); | 138 GetPageAtIndex(started_pages_index_[page_num]); |
98 // Set the title of the article as the title of the first page. | 139 DistilledPageProto* current_page = new DistilledPageProto(); |
99 if (article_proto_->pages_size() == 1) { | 140 page_data->proto.reset(current_page); |
100 article_proto_->set_title(distilled_page->title); | 141 page_data->page_num = page_num; |
101 } | 142 page_data->title = distilled_page->title; |
102 | 143 |
103 current_page->set_url(page_url.spec()); | 144 current_page->set_url(page_url.spec()); |
104 current_page->set_html(distilled_page->html); | 145 current_page->set_html(distilled_page->html); |
105 | 146 |
106 GURL next_page_url(distilled_page->next_page_url); | 147 GURL next_page_url(distilled_page->next_page_url); |
107 if (next_page_url.is_valid()) { | 148 if (next_page_url.is_valid()) { |
108 // The pages should be in same origin. | 149 // The pages should be in same origin. |
109 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); | 150 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); |
| 151 AddToDistillationQueue(page_num + 1, next_page_url); |
110 } | 152 } |
111 | 153 |
112 processed_urls_.insert(page_url.spec()); | |
113 distillation_in_progress_ = false; | |
114 int page_number = article_proto_->pages_size(); | |
115 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); | 154 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); |
116 ++img_num) { | 155 ++img_num) { |
117 std::string image_id = | 156 std::string image_id = |
118 base::IntToString(page_number) + "_" + base::IntToString(img_num); | 157 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num); |
119 FetchImage(current_page, image_id, distilled_page->image_urls[img_num]); | 158 FetchImage(page_num, image_id, distilled_page->image_urls[img_num]); |
120 } | 159 } |
121 DistillPage(next_page_url); | 160 |
| 161 AddPageIfDone(page_num); |
| 162 DistillNextPage(); |
| 163 } else { |
| 164 started_pages_index_.erase(page_num); |
| 165 RunDistillerCallbackIfDone(); |
122 } | 166 } |
123 } | 167 } |
124 | 168 |
125 void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, | 169 void DistillerImpl::FetchImage(int page_num, |
126 const std::string& image_id, | 170 const std::string& image_id, |
127 const std::string& item) { | 171 const std::string& item) { |
| 172 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); |
| 173 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); |
128 DistillerURLFetcher* fetcher = | 174 DistillerURLFetcher* fetcher = |
129 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); | 175 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); |
130 image_fetchers_.push_back(fetcher); | 176 page_data->image_fetchers_.push_back(fetcher); |
| 177 |
131 fetcher->FetchURL(item, | 178 fetcher->FetchURL(item, |
132 base::Bind(&DistillerImpl::OnFetchImageDone, | 179 base::Bind(&DistillerImpl::OnFetchImageDone, |
133 base::Unretained(this), | 180 base::Unretained(this), |
134 base::Unretained(distilled_page_proto), | 181 page_num, |
135 base::Unretained(fetcher), | 182 base::Unretained(fetcher), |
136 image_id)); | 183 image_id)); |
137 } | 184 } |
138 | 185 |
139 void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, | 186 void DistillerImpl::OnFetchImageDone(int page_num, |
140 DistillerURLFetcher* url_fetcher, | 187 DistillerURLFetcher* url_fetcher, |
141 const std::string& id, | 188 const std::string& id, |
142 const std::string& response) { | 189 const std::string& response) { |
143 DCHECK_GT(article_proto_->pages_size(), 0); | 190 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); |
144 DCHECK(distilled_page_proto); | 191 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); |
| 192 DCHECK(page_data->proto); |
145 DCHECK(url_fetcher); | 193 DCHECK(url_fetcher); |
146 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = | 194 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = |
147 std::find(image_fetchers_.begin(), image_fetchers_.end(), url_fetcher); | 195 std::find(page_data->image_fetchers_.begin(), |
| 196 page_data->image_fetchers_.end(), |
| 197 url_fetcher); |
148 | 198 |
149 DCHECK(fetcher_it != image_fetchers_.end()); | 199 DCHECK(fetcher_it != page_data->image_fetchers_.end()); |
150 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone | 200 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone |
151 // callback is invoked by the |url_fetcher|. | 201 // callback is invoked by the |url_fetcher|. |
152 image_fetchers_.weak_erase(fetcher_it); | 202 page_data->image_fetchers_.weak_erase(fetcher_it); |
153 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); | 203 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); |
154 DistilledPageProto_Image* image = distilled_page_proto->add_image(); | 204 |
| 205 DistilledPageProto_Image* image = page_data->proto->add_image(); |
155 image->set_name(id); | 206 image->set_name(id); |
156 image->set_data(response); | 207 image->set_data(response); |
157 RunDistillerCallbackIfDone(); | 208 |
| 209 AddPageIfDone(page_num); |
| 210 } |
| 211 |
| 212 void DistillerImpl::AddPageIfDone(int page_num) { |
| 213 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); |
| 214 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end()); |
| 215 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); |
| 216 if (page_data->image_fetchers_.empty()) { |
| 217 finished_pages_index_[page_num] = started_pages_index_[page_num]; |
| 218 started_pages_index_.erase(page_num); |
| 219 RunDistillerCallbackIfDone(); |
| 220 } |
158 } | 221 } |
159 | 222 |
160 void DistillerImpl::RunDistillerCallbackIfDone() { | 223 void DistillerImpl::RunDistillerCallbackIfDone() { |
161 if (image_fetchers_.empty() && !distillation_in_progress_) { | 224 DCHECK(!distillation_cb_.is_null()); |
162 distillation_cb_.Run(article_proto_.Pass()); | 225 if (AreAllPagesFinished()) { |
| 226 bool first_page = true; |
| 227 scoped_ptr<DistilledArticleProto> article_proto( |
| 228 new DistilledArticleProto()); |
| 229 // Stitch the pages back into the article. |
| 230 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin(); |
| 231 it != finished_pages_index_.end();) { |
| 232 DistilledPageData* page_data = GetPageAtIndex(it->second); |
| 233 *(article_proto->add_pages()) = *(page_data->proto); |
| 234 |
| 235 if (first_page) { |
| 236 article_proto->set_title(page_data->title); |
| 237 first_page = false; |
| 238 } |
| 239 |
| 240 finished_pages_index_.erase(it++); |
| 241 } |
| 242 |
| 243 pages_.clear(); |
| 244 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()), |
| 245 max_pages_in_article_); |
| 246 |
| 247 DCHECK(pages_.empty()); |
| 248 DCHECK(finished_pages_index_.empty()); |
| 249 distillation_cb_.Run(article_proto.Pass()); |
| 250 distillation_cb_.Reset(); |
163 } | 251 } |
164 } | 252 } |
165 | 253 |
166 } // namespace dom_distiller | 254 } // namespace dom_distiller |
OLD | NEW |