Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/dom_distiller/core/distiller.h" | 5 #include "components/dom_distiller/core/distiller.h" |
| 6 | 6 |
| 7 #include <map> | 7 #include <map> |
| 8 | 8 |
| 9 #include "base/bind.h" | 9 #include "base/bind.h" |
| 10 #include "base/callback.h" | 10 #include "base/callback.h" |
| 11 #include "base/location.h" | 11 #include "base/location.h" |
| 12 #include "base/message_loop/message_loop.h" | 12 #include "base/message_loop/message_loop.h" |
| 13 #include "base/strings/string_number_conversions.h" | 13 #include "base/strings/string_number_conversions.h" |
| 14 #include "base/strings/utf_string_conversions.h" | 14 #include "base/strings/utf_string_conversions.h" |
| 15 #include "base/values.h" | 15 #include "base/values.h" |
| 16 #include "components/dom_distiller/core/distiller_page.h" | 16 #include "components/dom_distiller/core/distiller_page.h" |
| 17 #include "components/dom_distiller/core/distiller_url_fetcher.h" | 17 #include "components/dom_distiller/core/distiller_url_fetcher.h" |
| 18 #include "components/dom_distiller/core/proto/distilled_article.pb.h" | 18 #include "components/dom_distiller/core/proto/distilled_article.pb.h" |
| 19 #include "components/dom_distiller/core/proto/distilled_page.pb.h" | 19 #include "components/dom_distiller/core/proto/distilled_page.pb.h" |
| 20 #include "net/url_request/url_request_context_getter.h" | 20 #include "net/url_request/url_request_context_getter.h" |
| 21 | 21 |
| 22 namespace { | 22 namespace { |
| 23 // Maximum number of distilled pages in an article. | 23 // Maximum number of distilled pages in an article. |
| 24 const int kMaxPagesInArticle = 32; | 24 const size_t kMaxPagesInArticle = 32; |
| 25 } | 25 } |
| 26 | 26 |
| 27 namespace dom_distiller { | 27 namespace dom_distiller { |
| 28 | 28 |
| 29 DistillerFactoryImpl::DistillerFactoryImpl( | 29 DistillerFactoryImpl::DistillerFactoryImpl( |
| 30 scoped_ptr<DistillerPageFactory> distiller_page_factory, | 30 scoped_ptr<DistillerPageFactory> distiller_page_factory, |
| 31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) | 31 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory) |
| 32 : distiller_page_factory_(distiller_page_factory.Pass()), | 32 : distiller_page_factory_(distiller_page_factory.Pass()), |
| 33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} | 33 distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {} |
| 34 | 34 |
| 35 DistillerFactoryImpl::~DistillerFactoryImpl() {} | 35 DistillerFactoryImpl::~DistillerFactoryImpl() {} |
| 36 | 36 |
| 37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { | 37 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { |
| 38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( | 38 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( |
| 39 *distiller_page_factory_, *distiller_url_fetcher_factory_)); | 39 *distiller_page_factory_, *distiller_url_fetcher_factory_)); |
| 40 distiller->Init(); | 40 distiller->Init(); |
| 41 return distiller.PassAs<Distiller>(); | 41 return distiller.PassAs<Distiller>(); |
| 42 } | 42 } |
| 43 | 43 |
| 44 DistillerImpl::DistilledPageData::DistilledPageData() {} | |
| 45 | |
| 46 DistillerImpl::DistilledPageData::~DistilledPageData() {} | |
| 47 | |
| 44 DistillerImpl::DistillerImpl( | 48 DistillerImpl::DistillerImpl( |
| 45 const DistillerPageFactory& distiller_page_factory, | 49 const DistillerPageFactory& distiller_page_factory, |
| 46 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) | 50 const DistillerURLFetcherFactory& distiller_url_fetcher_factory) |
| 47 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), | 51 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory) { |
| 48 distillation_in_progress_(false) { | |
| 49 page_distiller_.reset(new PageDistiller(distiller_page_factory)); | 52 page_distiller_.reset(new PageDistiller(distiller_page_factory)); |
| 50 } | 53 } |
| 51 | 54 |
| 52 DistillerImpl::~DistillerImpl() { | 55 DistillerImpl::~DistillerImpl() { DCHECK(NoPendingPages()); } |
| 53 DCHECK(image_fetchers_.empty()); | 56 |
| 54 DCHECK(!distillation_in_progress_); | 57 void DistillerImpl::Init() { |
| 58 DCHECK(NoPendingPages()); | |
| 59 page_distiller_->Init(); | |
| 55 } | 60 } |
| 56 | 61 |
| 57 void DistillerImpl::Init() { | 62 size_t DistillerImpl::GetMaxNumPagesInArticle() const { |
| 58 DCHECK(!distillation_in_progress_); | 63 return kMaxPagesInArticle; |
| 59 page_distiller_->Init(); | 64 } |
| 60 article_proto_.reset(new DistilledArticleProto()); | 65 |
| 66 bool DistillerImpl::NoPendingPages() const { | |
| 67 return started_pages_.empty() && waiting_pages_.empty(); | |
| 68 } | |
| 69 | |
| 70 size_t DistillerImpl::TotalPageCount() const { | |
| 71 return waiting_pages_.size() + started_pages_.size() + finished_pages_.size(); | |
| 72 } | |
| 73 | |
| 74 void DistillerImpl::AddToDistillationQueue(int page_no, const GURL& url) { | |
| 75 if (!IsPageNumberInUse(page_no) && url.is_valid() && | |
| 76 TotalPageCount() < GetMaxNumPagesInArticle() && | |
| 77 seen_urls_.find(url.spec()) == seen_urls_.end()) { | |
| 78 waiting_pages_[page_no] = url; | |
| 79 } | |
| 80 } | |
| 81 | |
| 82 bool DistillerImpl::IsPageNumberInUse(int page_no) const { | |
| 83 return waiting_pages_.find(page_no) != waiting_pages_.end() || | |
| 84 started_pages_.find(page_no) != started_pages_.end() || | |
| 85 finished_pages_index_.find(page_no) != finished_pages_index_.end(); | |
| 61 } | 86 } |
| 62 | 87 |
| 63 void DistillerImpl::DistillPage(const GURL& url, | 88 void DistillerImpl::DistillPage(const GURL& url, |
| 64 const DistillerCallback& distillation_cb) { | 89 const DistillerCallback& distillation_cb) { |
| 65 DCHECK(!distillation_in_progress_); | 90 DCHECK(NoPendingPages()); |
| 66 distillation_cb_ = distillation_cb; | 91 distillation_cb_ = distillation_cb; |
| 67 DistillPage(url); | 92 |
| 93 AddToDistillationQueue(0, url); | |
| 94 DistillNextPage(); | |
| 68 } | 95 } |
| 69 | 96 |
| 70 void DistillerImpl::DistillPage(const GURL& url) { | 97 void DistillerImpl::DistillNextPage() { |
| 71 DCHECK(!distillation_in_progress_); | 98 if (!waiting_pages_.empty()) { |
| 72 if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && | 99 std::map<int, GURL>::iterator front = waiting_pages_.begin(); |
| 73 processed_urls_.find(url.spec()) == processed_urls_.end()) { | 100 int page_no = front->first; |
| 74 distillation_in_progress_ = true; | 101 const GURL url = front->second; |
| 75 // Distill the next page. | 102 |
| 103 waiting_pages_.erase(front); | |
| 76 DCHECK(url.is_valid()); | 104 DCHECK(url.is_valid()); |
| 77 DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); | 105 DCHECK(started_pages_.find(page_no) == started_pages_.end()); |
| 106 started_pages_.insert(page_no); | |
| 78 page_distiller_->DistillPage( | 107 page_distiller_->DistillPage( |
| 79 url, | 108 url, |
| 80 base::Bind(&DistillerImpl::OnPageDistillationFinished, | 109 base::Bind(&DistillerImpl::OnPageDistillationFinished, |
| 81 base::Unretained(this), | 110 base::Unretained(this), |
| 111 page_no, | |
| 82 url)); | 112 url)); |
| 83 } else { | |
| 84 RunDistillerCallbackIfDone(); | |
| 85 } | 113 } |
| 86 } | 114 } |
| 87 | 115 |
| 88 void DistillerImpl::OnPageDistillationFinished( | 116 void DistillerImpl::OnPageDistillationFinished( |
| 117 int page_no, | |
| 89 const GURL& page_url, | 118 const GURL& page_url, |
| 90 scoped_ptr<DistilledPageInfo> distilled_page, | 119 scoped_ptr<DistilledPageInfo> distilled_page, |
| 91 bool distillation_successful) { | 120 bool distillation_successful) { |
| 92 DCHECK(distillation_in_progress_); | |
| 93 DCHECK(distilled_page.get()); | 121 DCHECK(distilled_page.get()); |
| 94 if (!distillation_successful) { | 122 DCHECK(IsPageNumberInUse(page_no)); |
|
cjhopman
2014/02/14 20:53:52
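This could be more specific and check that page_no … [comment truncated in this capture; presumably it asks to check that page_no is in started_pages_ — confirm against the original review]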
shashi
2014/02/14 23:25:29
Done.
| |
| 95 RunDistillerCallbackIfDone(); | 123 if (distillation_successful) { |
| 96 } else { | 124 DistilledPageData* page_data = new DistilledPageData(); |
| 97 DistilledPageProto* current_page = article_proto_->add_pages(); | 125 DistilledPageProto* current_page = new DistilledPageProto(); |
| 98 // Set the title of the article as the title of the first page. | 126 page_data->proto.reset(current_page); |
| 99 if (article_proto_->pages_size() == 1) { | 127 page_data->page_no = page_no; |
| 100 article_proto_->set_title(distilled_page->title); | 128 page_data->title = distilled_page->title; |
| 101 } | |
| 102 | 129 |
| 103 current_page->set_url(page_url.spec()); | 130 current_page->set_url(page_url.spec()); |
| 104 current_page->set_html(distilled_page->html); | 131 current_page->set_html(distilled_page->html); |
| 105 | 132 |
| 106 GURL next_page_url(distilled_page->next_page_url); | 133 GURL next_page_url(distilled_page->next_page_url); |
| 107 if (next_page_url.is_valid()) { | 134 if (next_page_url.is_valid()) { |
| 108 // The pages should be in same origin. | 135 // The pages should be in same origin. |
| 109 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); | 136 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); |
| 110 } | 137 } |
| 111 | 138 |
| 112 processed_urls_.insert(page_url.spec()); | 139 seen_urls_.insert(page_url.spec()); |
|
cjhopman
2014/02/14 20:53:52
I think this should be done in ::AddToDistillationQueue … [comment truncated in this capture]
shashi
2014/02/14 23:25:29
Done.
| |
| 113 distillation_in_progress_ = false; | |
| 114 int page_number = article_proto_->pages_size(); | |
| 115 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); | 140 for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); |
| 116 ++img_num) { | 141 ++img_num) { |
| 117 std::string image_id = | 142 std::string image_id = |
| 118 base::IntToString(page_number) + "_" + base::IntToString(img_num); | 143 base::IntToString(page_no + 1) + "_" + base::IntToString(img_num); |
| 119 FetchImage(current_page, image_id, distilled_page->image_urls[img_num]); | 144 FetchImage(page_data, image_id, distilled_page->image_urls[img_num]); |
| 120 } | 145 } |
| 121 DistillPage(next_page_url); | 146 |
| 147 AddToDistillationQueue(page_no + 1, next_page_url); | |
| 148 CheckAndAddPageIfDone(page_data); | |
| 149 DistillNextPage(); | |
| 150 } else { | |
| 151 started_pages_.erase(page_no); | |
| 152 RunDistillerCallbackIfDone(); | |
| 122 } | 153 } |
| 123 } | 154 } |
| 124 | 155 |
| 125 void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, | 156 void DistillerImpl::FetchImage(DistilledPageData* distilled_page_data, |
| 126 const std::string& image_id, | 157 const std::string& image_id, |
| 127 const std::string& item) { | 158 const std::string& item) { |
| 159 DCHECK(distilled_page_data); | |
| 128 DistillerURLFetcher* fetcher = | 160 DistillerURLFetcher* fetcher = |
| 129 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); | 161 distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); |
| 130 image_fetchers_.push_back(fetcher); | 162 distilled_page_data->image_fetchers_.push_back(fetcher); |
| 163 | |
| 131 fetcher->FetchURL(item, | 164 fetcher->FetchURL(item, |
| 132 base::Bind(&DistillerImpl::OnFetchImageDone, | 165 base::Bind(&DistillerImpl::OnFetchImageDone, |
| 133 base::Unretained(this), | 166 base::Unretained(this), |
| 134 base::Unretained(distilled_page_proto), | 167 base::Unretained(distilled_page_data), |
| 135 base::Unretained(fetcher), | 168 base::Unretained(fetcher), |
| 136 image_id)); | 169 image_id)); |
| 137 } | 170 } |
| 138 | 171 |
| 139 void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, | 172 void DistillerImpl::OnFetchImageDone(DistilledPageData* distilled_page_data, |
| 140 DistillerURLFetcher* url_fetcher, | 173 DistillerURLFetcher* url_fetcher, |
| 141 const std::string& id, | 174 const std::string& id, |
| 142 const std::string& response) { | 175 const std::string& response) { |
| 143 DCHECK_GT(article_proto_->pages_size(), 0); | 176 DCHECK(distilled_page_data); |
| 144 DCHECK(distilled_page_proto); | 177 DCHECK(distilled_page_data->proto); |
| 145 DCHECK(url_fetcher); | 178 DCHECK(url_fetcher); |
| 146 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = | 179 ScopedVector<DistillerURLFetcher>::iterator fetcher_it = |
| 147 std::find(image_fetchers_.begin(), image_fetchers_.end(), url_fetcher); | 180 std::find(distilled_page_data->image_fetchers_.begin(), |
| 181 distilled_page_data->image_fetchers_.end(), | |
| 182 url_fetcher); | |
| 148 | 183 |
| 149 DCHECK(fetcher_it != image_fetchers_.end()); | 184 DCHECK(fetcher_it != distilled_page_data->image_fetchers_.end()); |
| 150 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone | 185 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone |
| 151 // callback is invoked by the |url_fetcher|. | 186 // callback is invoked by the |url_fetcher|. |
| 152 image_fetchers_.weak_erase(fetcher_it); | 187 distilled_page_data->image_fetchers_.weak_erase(fetcher_it); |
| 153 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); | 188 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); |
| 154 DistilledPageProto_Image* image = distilled_page_proto->add_image(); | 189 |
| 190 DistilledPageProto_Image* image = distilled_page_data->proto->add_image(); | |
| 155 image->set_name(id); | 191 image->set_name(id); |
| 156 image->set_data(response); | 192 image->set_data(response); |
| 157 RunDistillerCallbackIfDone(); | 193 |
| 194 CheckAndAddPageIfDone(distilled_page_data); | |
| 195 } | |
| 196 | |
| 197 void DistillerImpl::CheckAndAddPageIfDone( | |
| 198 DistilledPageData* distilled_page_data) { | |
| 199 DCHECK(distilled_page_data); | |
| 200 int page_no = distilled_page_data->page_no; | |
| 201 DCHECK(started_pages_.find(page_no) != started_pages_.end()); | |
| 202 DCHECK(finished_pages_index_.find(page_no) == finished_pages_index_.end()); | |
| 203 if (distilled_page_data->image_fetchers_.empty()) { | |
| 204 started_pages_.erase(page_no); | |
| 205 finished_pages_.push_back(distilled_page_data); | |
| 206 finished_pages_index_[page_no] = finished_pages_.size() - 1; | |
| 207 RunDistillerCallbackIfDone(); | |
| 208 } | |
| 158 } | 209 } |
| 159 | 210 |
| 160 void DistillerImpl::RunDistillerCallbackIfDone() { | 211 void DistillerImpl::RunDistillerCallbackIfDone() { |
| 161 if (image_fetchers_.empty() && !distillation_in_progress_) { | 212 DCHECK(!distillation_cb_.is_null()); |
| 162 distillation_cb_.Run(article_proto_.Pass()); | 213 if (NoPendingPages()) { |
| 214 bool first_page = true; | |
| 215 scoped_ptr<DistilledArticleProto> article_proto( | |
| 216 new DistilledArticleProto()); | |
| 217 // Stitch the pages back into the article. | |
| 218 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin(); | |
| 219 it != finished_pages_index_.end();) { | |
| 220 const DistilledPageData* page_data = finished_pages_[it->second]; | |
| 221 *(article_proto->add_pages()) = *(page_data->proto); | |
| 222 | |
| 223 if (first_page) { | |
| 224 article_proto->set_title(page_data->title); | |
| 225 first_page = false; | |
| 226 } | |
| 227 | |
| 228 finished_pages_index_.erase(it++); | |
| 229 } | |
| 230 | |
| 231 finished_pages_.clear(); | |
| 232 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()), | |
| 233 GetMaxNumPagesInArticle()); | |
| 234 | |
| 235 DCHECK(finished_pages_.empty()); | |
| 236 DCHECK(finished_pages_index_.empty()); | |
| 237 distillation_cb_.Run(article_proto.Pass()); | |
| 238 distillation_cb_.Reset(); | |
| 163 } | 239 } |
| 164 } | 240 } |
| 165 | 241 |
| 166 } // namespace dom_distiller | 242 } // namespace dom_distiller |
| OLD | NEW |