OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/history/url_index_private_data.h" | |
6 | |
7 #include <functional> | |
8 #include <iterator> | |
9 #include <limits> | |
10 #include <numeric> | |
11 #include <string> | |
12 #include <vector> | |
13 | |
14 #include "base/basictypes.h" | |
15 #include "base/files/file_util.h" | |
16 #include "base/i18n/break_iterator.h" | |
17 #include "base/i18n/case_conversion.h" | |
18 #include "base/metrics/histogram.h" | |
19 #include "base/strings/string_util.h" | |
20 #include "base/strings/utf_string_conversions.h" | |
21 #include "base/time/time.h" | |
22 #include "chrome/browser/history/history_service.h" | |
23 #include "chrome/browser/history/in_memory_url_index.h" | |
24 #include "components/bookmarks/browser/bookmark_utils.h" | |
25 #include "components/history/core/browser/history_database.h" | |
26 #include "components/history/core/browser/history_db_task.h" | |
27 #include "net/base/net_util.h" | |
28 | |
29 #if defined(USE_SYSTEM_PROTOBUF) | |
30 #include <google/protobuf/repeated_field.h> | |
31 #else | |
32 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | |
33 #endif | |
34 | |
35 using google::protobuf::RepeatedField; | |
36 using google::protobuf::RepeatedPtrField; | |
37 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
38 | |
39 namespace { | |
40 static const size_t kMaxVisitsToStoreInCache = 10u; | |
41 } // anonymous namespace | |
42 | |
43 namespace history { | |
44 | |
45 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; | |
46 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; | |
47 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | |
48 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; | |
49 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry | |
50 CharWordMapEntry; | |
51 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem | |
52 WordIDHistoryMapItem; | |
53 typedef imui:: | |
54 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry | |
55 WordIDHistoryMapEntry; | |
56 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; | |
57 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry | |
58 HistoryInfoMapEntry; | |
59 typedef imui:: | |
60 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo | |
61 HistoryInfoMapEntry_VisitInfo; | |
62 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem; | |
63 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry | |
64 WordStartsMapEntry; | |
65 | |
66 | |
67 // Algorithm Functions --------------------------------------------------------- | |
68 | |
69 // Comparison function for sorting search terms by descending length. | |
70 bool LengthGreater(const base::string16& string_a, | |
71 const base::string16& string_b) { | |
72 return string_a.length() > string_b.length(); | |
73 } | |
74 | |
75 | |
76 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- | |
77 | |
// HistoryDBTask used to update the recent visit data for a particular
// row from the history database. The visits are fetched on the history
// DB thread (RunOnDBThread) and handed back to the private data on the
// main thread (DoneRunOnMainThread).
class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
 public:
  explicit UpdateRecentVisitsFromHistoryDBTask(
      URLIndexPrivateData* private_data,
      URLID url_id);

  bool RunOnDBThread(HistoryBackend* backend,
                     history::HistoryDatabase* db) override;
  void DoneRunOnMainThread() override;

 private:
  ~UpdateRecentVisitsFromHistoryDBTask() override;

  // The URLIndexPrivateData that gets updated after the historyDB
  // task returns. Not owned; presumably guaranteed by the caller to
  // outlive this task -- TODO confirm.
  URLIndexPrivateData* private_data_;
  // The ID of the URL to get visits for and then update.
  URLID url_id_;
  // Whether fetching the recent visits for the URL succeeded.
  bool succeeded_;
  // The awaited data that's shown to private_data_ for it to copy and
  // store.
  VisitVector recent_visits_;

  DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
};
106 | |
107 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask( | |
108 URLIndexPrivateData* private_data, | |
109 URLID url_id) | |
110 : private_data_(private_data), | |
111 url_id_(url_id), | |
112 succeeded_(false) { | |
113 } | |
114 | |
115 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread( | |
116 HistoryBackend* backend, | |
117 HistoryDatabase* db) { | |
118 // Make sure the private data is going to get as many recent visits as | |
119 // ScoredHistoryMatch::GetFrequency() hopes to use. | |
120 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); | |
121 succeeded_ = db->GetMostRecentVisitsForURL(url_id_, | |
122 kMaxVisitsToStoreInCache, | |
123 &recent_visits_); | |
124 if (!succeeded_) | |
125 recent_visits_.clear(); | |
126 return true; // Always claim to be done; do not retry failures. | |
127 } | |
128 | |
129 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { | |
130 if (succeeded_) | |
131 private_data_->UpdateRecentVisits(url_id_, recent_visits_); | |
132 } | |
133 | |
// Out-of-line so the vtable/dtor is emitted in this translation unit;
// all members clean themselves up.
UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
}
136 | |
137 | |
138 // URLIndexPrivateData --------------------------------------------------------- | |
139 | |
// Zeroes the diagnostic counters and cache-version bookkeeping; the
// index containers all start empty.
URLIndexPrivateData::URLIndexPrivateData()
    : restored_cache_version_(0),
      saved_cache_version_(kCurrentCacheFileVersion),
      pre_filter_item_count_(0),
      post_filter_item_count_(0),
      post_scoring_item_count_(0) {
}
147 | |
// Scores indexed history items against |search_string| and returns at most
// |max_matches| matches, best score first. |search_string| is taken by
// value because it may be modified below (a space is inserted at the
// cursor position). Also updates the pre/post filter/scoring counters and
// maintains the search-term cache via mark-and-sweep.
ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    base::string16 search_string,
    size_t cursor_position,
    size_t max_matches,
    const std::string& languages,
    const history::ScoredHistoryMatch::Builder& builder) {
  // If cursor position is set and useful (not at either end of the
  // string), allow the search string to be broken at cursor position.
  // We do this by pretending there's a space where the cursor is.
  if ((cursor_position != base::string16::npos) &&
      (cursor_position < search_string.length()) &&
      (cursor_position > 0)) {
    search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
  }
  pre_filter_item_count_ = 0;
  post_filter_item_count_ = 0;
  post_scoring_item_count_ = 0;
  // The search string we receive may contain escaped characters. For reducing
  // the index we need individual, lower-cased words, ignoring escapings. For
  // the final filtering we need whitespace separated substrings possibly
  // containing escaped characters.
  base::string16 lower_raw_string(base::i18n::ToLower(search_string));
  base::string16 lower_unescaped_string =
      net::UnescapeURLComponent(lower_raw_string,
          net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
  // Extract individual 'words' (as opposed to 'terms'; see below) from the
  // search string. When the user types "colspec=ID%20Mstone Release" we get
  // four 'words': "colspec", "id", "mstone" and "release".
  String16Vector lower_words(
      history::String16VectorFromString16(lower_unescaped_string, false, NULL));
  ScoredHistoryMatches scored_items;

  // Do nothing if we have indexed no words (probably because we've not been
  // initialized yet) or the search string has no words.
  if (word_list_.empty() || lower_words.empty()) {
    search_term_cache_.clear();  // Invalidate the term cache.
    return scored_items;
  }

  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
  // approach.
  ResetSearchTermCache();

  HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

  // Trim the candidate pool if it is large. Note that we do not filter out
  // items that do not contain the search terms as proper substrings -- doing
  // so is the performance-costly operation we are trying to avoid in order
  // to maintain omnibox responsiveness.
  const size_t kItemsToScoreLimit = 500;
  pre_filter_item_count_ = history_id_set.size();
  // If we trim the results set we do not want to cache the results for next
  // time as the user's ultimately desired result could easily be eliminated
  // in this early rough filter.
  bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
  if (was_trimmed) {
    HistoryIDVector history_ids;
    std::copy(history_id_set.begin(), history_id_set.end(),
              std::back_inserter(history_ids));
    // Trim down the set by sorting by typed-count, visit-count, and last
    // visit. Only the best kItemsToScoreLimit need to be ordered, hence
    // partial_sort.
    HistoryItemFactorGreater
        item_factor_functor(history_info_map_);
    std::partial_sort(history_ids.begin(),
                      history_ids.begin() + kItemsToScoreLimit,
                      history_ids.end(),
                      item_factor_functor);
    history_id_set.clear();
    std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
              std::inserter(history_id_set, history_id_set.end()));
    post_filter_item_count_ = history_id_set.size();
  }

  // Pass over all of the candidates filtering out any without a proper
  // substring match, inserting those which pass in order by score. Note that
  // in this step we are using the raw search string complete with escaped
  // URL elements. When the user has specifically typed something akin to
  // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
  // specific substring appears in the URL or page title.

  // We call these 'terms' (as opposed to 'words'; see above) as in this case
  // we only want to break up the search string on 'true' whitespace rather than
  // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
  // get two 'terms': "colspec=id%20mstone" and "release".
  history::String16Vector lower_raw_terms;
  if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
               &lower_raw_terms) == 0) {
    // Don't score matches when there are no terms to score against. (It's
    // possible that the word break iterater that extracts words to search
    // for in the database allows some whitespace "words" whereas Tokenize
    // excludes a long list of whitespace.) One could write a scoring
    // function that gives a reasonable order to matches when there
    // are no terms (i.e., all the words are some form of whitespace),
    // but this is such a rare edge case that it's not worth the time.
    return scored_items;
  }
  // AddHistoryMatch is a stateful functor; for_each returns the final copy,
  // from which the accumulated matches are extracted.
  scored_items =
      std::for_each(
          history_id_set.begin(), history_id_set.end(),
          AddHistoryMatch(*this, languages, lower_raw_string, lower_raw_terms,
                          base::Time::Now(), builder)).ScoredMatches();

  // Select and sort only the top |max_matches| results.
  if (scored_items.size() > max_matches) {
    std::partial_sort(scored_items.begin(),
                      scored_items.begin() +
                          max_matches,
                      scored_items.end(),
                      ScoredHistoryMatch::MatchScoreGreater);
    scored_items.resize(max_matches);
  } else {
    std::sort(scored_items.begin(), scored_items.end(),
              ScoredHistoryMatch::MatchScoreGreater);
  }
  post_scoring_item_count_ = scored_items.size();

  if (was_trimmed) {
    search_term_cache_.clear();  // Invalidate the term cache.
  } else {
    // Remove any stale SearchTermCacheItems (sweep phase: entries not
    // marked used_ during this search are dropped).
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ) {
      if (!cache_iter->second.used_)
        search_term_cache_.erase(cache_iter++);
      else
        ++cache_iter;
    }
  }

  return scored_items;
}
279 | |
// Adds, refreshes, or removes |row| in the index depending on whether it is
// already indexed and whether it (still) qualifies as significant. Returns
// true if the index actually changed (which also invalidates the search
// term cache).
bool URLIndexPrivateData::UpdateURL(
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  // The row may or may not already be in our index. If it is not already
  // indexed and it qualifies then it gets indexed. If it is already
  // indexed and still qualifies then it gets updated, otherwise it
  // is deleted from the index.
  bool row_was_updated = false;
  URLID row_id = row.id();
  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
  if (row_pos == history_info_map_.end()) {
    // This new row should be indexed if it qualifies.
    URLRow new_row(row);
    // NOTE(review): set_id looks redundant (the copy already carries
    // row.id()) -- presumably defensive; confirm before removing.
    new_row.set_id(row_id);
    row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
                      IndexRow(NULL,
                               history_service,
                               new_row,
                               languages,
                               scheme_whitelist,
                               tracker);
  } else if (RowQualifiesAsSignificant(row, base::Time())) {
    // This indexed row still qualifies and will be re-indexed.
    // The url won't have changed but the title, visit count, etc.
    // might have changed.
    URLRow& row_to_update = row_pos->second.url_row;
    bool title_updated = row_to_update.title() != row.title();
    if (row_to_update.visit_count() != row.visit_count() ||
        row_to_update.typed_count() != row.typed_count() ||
        row_to_update.last_visit() != row.last_visit() || title_updated) {
      row_to_update.set_visit_count(row.visit_count());
      row_to_update.set_typed_count(row.typed_count());
      row_to_update.set_last_visit(row.last_visit());
      // If something appears to have changed, update the recent visits
      // information.
      ScheduleUpdateRecentVisits(history_service, row_id, tracker);
      // While the URL is guaranteed to remain stable, the title may have
      // changed. If so, then update the index with the changed words.
      if (title_updated) {
        // Clear all words associated with this row and re-index both the
        // URL and title.
        RemoveRowWordsFromIndex(row_to_update);
        row_to_update.set_title(row.title());
        RowWordStarts word_starts;
        AddRowWordsToIndex(row_to_update, &word_starts, languages);
        word_starts_map_[row_id] = word_starts;
      }
      row_was_updated = true;
    }
  } else {
    // This indexed row no longer qualifies and will be de-indexed by
    // clearing all words associated with this row.
    RemoveRowFromIndex(row);
    row_was_updated = true;
  }
  if (row_was_updated)
    search_term_cache_.clear();  // This invalidates the cache.
  return row_was_updated;
}
342 | |
343 void URLIndexPrivateData::UpdateRecentVisits( | |
344 URLID url_id, | |
345 const VisitVector& recent_visits) { | |
346 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id); | |
347 if (row_pos != history_info_map_.end()) { | |
348 VisitInfoVector* visits = &row_pos->second.visits; | |
349 visits->clear(); | |
350 const size_t size = | |
351 std::min(recent_visits.size(), kMaxVisitsToStoreInCache); | |
352 visits->reserve(size); | |
353 for (size_t i = 0; i < size; i++) { | |
354 // Copy from the VisitVector the only fields visits needs. | |
355 visits->push_back(std::make_pair(recent_visits[i].visit_time, | |
356 recent_visits[i].transition)); | |
357 } | |
358 } | |
359 // Else: Oddly, the URL doesn't seem to exist in the private index. | |
360 // Ignore this update. This can happen if, for instance, the user | |
361 // removes the URL from URLIndexPrivateData before the historyDB call | |
362 // returns. | |
363 } | |
364 | |
// Posts an UpdateRecentVisitsFromHistoryDBTask to the history service;
// the task calls back into UpdateRecentVisits() with the fetched visits.
void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    HistoryService* history_service,
    URLID url_id,
    base::CancelableTaskTracker* tracker) {
  history_service->ScheduleDBTask(
      scoped_ptr<history::HistoryDBTask>(
          new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
}
373 | |
// Helper functor for DeleteURL.
class HistoryInfoMapItemHasURL {
 public:
  // NOTE(review): |url_| is a reference member, so the GURL passed in must
  // outlive this functor. Safe for the immediate std::find_if use in
  // DeleteURL; do not store instances.
  explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}

  // Returns true if the map entry's row holds exactly the sought URL.
  bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
    return item.second.url_row.url() == url_;
  }

 private:
  const GURL& url_;
};
386 | |
387 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
388 // Find the matching entry in the history_info_map_. | |
389 HistoryInfoMap::iterator pos = std::find_if( | |
390 history_info_map_.begin(), | |
391 history_info_map_.end(), | |
392 HistoryInfoMapItemHasURL(url)); | |
393 if (pos == history_info_map_.end()) | |
394 return false; | |
395 RemoveRowFromIndex(pos->second.url_row); | |
396 search_term_cache_.clear(); // This invalidates the cache. | |
397 return true; | |
398 } | |
399 | |
// static
// Reads and deserializes the protobuf cache at |file_path| into a new
// URLIndexPrivateData. Returns NULL when the file is absent/unreadable,
// when the deserialized data fails restoration, or when it is empty.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
    const base::FilePath& file_path,
    const std::string& languages) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  if (!base::PathExists(file_path))
    return NULL;
  std::string data;
  // If there is no cache file then simply give up. This will cause us to
  // attempt to rebuild from the history database.
  if (!base::ReadFileToString(file_path, &data))
    return NULL;

  scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
  InMemoryURLIndexCacheItem index_cache;
  if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
    LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
                 << file_path.value();
    // NOTE(review): a parse failure returns the fresh, EMPTY object rather
    // than NULL, unlike the other failure paths in this function --
    // presumably callers distinguish via Empty(); confirm before changing.
    return restored_data;
  }

  if (!restored_data->RestorePrivateData(index_cache, languages))
    return NULL;

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       restored_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             restored_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             restored_data->char_word_map_.size());
  if (restored_data->Empty())
    return NULL;  // 'No data' is the same as a failed reload.
  return restored_data;
}
437 | |
// static
// Builds a fresh URLIndexPrivateData by enumerating the significant URLs
// in |history_db| and indexing each row. Returns NULL when |history_db| is
// missing or the enumerator cannot be initialized.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    HistoryDatabase* history_db,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist) {
  if (!history_db)
    return NULL;

  base::TimeTicks beginning_time = base::TimeTicks::Now();

  scoped_refptr<URLIndexPrivateData>
      rebuilt_data(new URLIndexPrivateData);
  URLDatabase::URLEnumerator history_enum;
  if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    return NULL;
  rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
  // Index every row the enumerator yields. The history DB is passed so
  // IndexRow can fetch recent visits synchronously.
  for (URLRow row; history_enum.GetNextURL(&row); ) {
    rebuilt_data->IndexRow(
        history_db, NULL, row, languages, scheme_whitelist, NULL);
  }

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       rebuilt_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             rebuilt_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             rebuilt_data->char_word_map_.size());
  return rebuilt_data;
}
469 | |
// static
// Task body: validates its arguments and delegates to SaveToFile().
// Takes the ref-counted private data by value so the task itself keeps
// the object alive for the duration of the write.
bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
    scoped_refptr<URLIndexPrivateData> private_data,
    const base::FilePath& file_path) {
  DCHECK(private_data.get());
  DCHECK(!file_path.empty());
  return private_data->SaveToFile(file_path);
}
478 | |
// Returns a deep copy of the index proper. The search-term cache and the
// diagnostic counters are deliberately left at their freshly-constructed
// values in the copy (see the list at the bottom of the function).
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
  scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
  data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
  data_copy->word_list_ = word_list_;
  data_copy->available_words_ = available_words_;
  data_copy->word_map_ = word_map_;
  data_copy->char_word_map_ = char_word_map_;
  data_copy->word_id_history_map_ = word_id_history_map_;
  data_copy->history_id_word_map_ = history_id_word_map_;
  data_copy->history_info_map_ = history_info_map_;
  data_copy->word_starts_map_ = word_starts_map_;
  return data_copy;
  // Not copied:
  //  search_term_cache_
  //  pre_filter_item_count_
  //  post_filter_item_count_
  //  post_scoring_item_count_
}
497 | |
498 bool URLIndexPrivateData::Empty() const { | |
499 return history_info_map_.empty(); | |
500 } | |
501 | |
// Drops all indexed content, returning the object to an empty state.
// Note: search_term_cache_ and the filter/scoring counters are not
// touched here.
void URLIndexPrivateData::Clear() {
  last_time_rebuilt_from_history_ = base::Time();
  word_list_.clear();
  available_words_.clear();
  word_map_.clear();
  char_word_map_.clear();
  word_id_history_map_.clear();
  history_id_word_map_.clear();
  history_info_map_.clear();
  word_starts_map_.clear();
}
513 | |
// All members are self-cleaning containers/values; nothing to release.
URLIndexPrivateData::~URLIndexPrivateData() {}
515 | |
516 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | |
517 const String16Vector& unsorted_words) { | |
518 // Break the terms down into individual terms (words), get the candidate | |
519 // set for each term, and intersect each to get a final candidate list. | |
520 // Note that a single 'term' from the user's perspective might be | |
521 // a string like "http://www.somewebsite.com" which, from our perspective, | |
522 // is four words: 'http', 'www', 'somewebsite', and 'com'. | |
523 HistoryIDSet history_id_set; | |
524 String16Vector words(unsorted_words); | |
525 // Sort the words into the longest first as such are likely to narrow down | |
526 // the results quicker. Also, single character words are the most expensive | |
527 // to process so save them for last. | |
528 std::sort(words.begin(), words.end(), LengthGreater); | |
529 for (String16Vector::iterator iter = words.begin(); iter != words.end(); | |
530 ++iter) { | |
531 base::string16 uni_word = *iter; | |
532 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); | |
533 if (term_history_set.empty()) { | |
534 history_id_set.clear(); | |
535 break; | |
536 } | |
537 if (iter == words.begin()) { | |
538 history_id_set.swap(term_history_set); | |
539 } else { | |
540 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>( | |
541 history_id_set, term_history_set); | |
542 history_id_set.swap(new_history_id_set); | |
543 } | |
544 } | |
545 return history_id_set; | |
546 } | |
547 | |
// Returns the set of history IDs for URLs/titles containing |term|,
// consulting and maintaining the search-term cache for multi-character
// terms (a previously-searched prefix of |term| narrows the work to the
// leftover characters).
HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    const base::string16& term) {
  if (term.empty())
    return HistoryIDSet();

  // TODO(mrossetti): Consider optimizing for very common terms such as
  // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
  // occuring words in the user's searches.

  size_t term_length = term.length();
  WordIDSet word_id_set;
  if (term_length > 1) {
    // See if this term or a prefix thereof is present in the cache.
    // The longest cached prefix is preferred, as it narrows the most.
    // NOTE(review): the third StartsWith() argument is a case-sensitivity
    // flag; confirm 'false' means case-insensitive in this base API.
    SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ++cache_iter) {
      if (StartsWith(term, cache_iter->first, false) &&
          (best_prefix == search_term_cache_.end() ||
           cache_iter->first.length() > best_prefix->first.length()))
        best_prefix = cache_iter;
    }

    // If a prefix was found then determine the leftover characters to be used
    // for further refining the results from that prefix.
    Char16Set prefix_chars;
    base::string16 leftovers(term);
    if (best_prefix != search_term_cache_.end()) {
      // If the prefix is an exact match for the term then grab the cached
      // results and we're done.
      size_t prefix_length = best_prefix->first.length();
      if (prefix_length == term_length) {
        best_prefix->second.used_ = true;  // Mark for the sweep phase.
        return best_prefix->second.history_id_set_;
      }

      // Otherwise we have a handy starting point.
      // If there are no history results for this prefix then we can bail early
      // as there will be no history results for the full term.
      if (best_prefix->second.history_id_set_.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      word_id_set = best_prefix->second.word_id_set_;
      prefix_chars = Char16SetFromString16(best_prefix->first);
      leftovers = term.substr(prefix_length);
    }

    // Filter for each remaining, unique character in the term.
    Char16Set leftover_chars = Char16SetFromString16(leftovers);
    Char16Set unique_chars =
        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

    // Reduce the word set with any leftover, unprocessed characters.
    if (!unique_chars.empty()) {
      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
      // We might come up empty on the leftovers.
      if (leftover_set.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      // Or there may not have been a prefix from which to start.
      if (prefix_chars.empty()) {
        word_id_set.swap(leftover_set);
      } else {
        WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
            word_id_set, leftover_set);
        word_id_set.swap(new_word_id_set);
      }
    }

    // We must filter the word list because the resulting word set surely
    // contains words which do not have the search term as a proper subset.
    // (Erase-while-iterating: the post-increment keeps the iterator valid
    // across std::set::erase.)
    for (WordIDSet::iterator word_set_iter = word_id_set.begin();
         word_set_iter != word_id_set.end(); ) {
      if (word_list_[*word_set_iter].find(term) == base::string16::npos)
        word_id_set.erase(word_set_iter++);
      else
        ++word_set_iter;
    }
  } else {
    // Single-character terms go straight to the character index.
    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
  }

  // If any words resulted then we can compose a set of history IDs by unioning
  // the sets from each word.
  HistoryIDSet history_id_set;
  if (!word_id_set.empty()) {
    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
         word_id_iter != word_id_set.end(); ++word_id_iter) {
      WordID word_id = *word_id_iter;
      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
      if (word_iter != word_id_history_map_.end()) {
        HistoryIDSet& word_history_id_set(word_iter->second);
        history_id_set.insert(word_history_id_set.begin(),
                              word_history_id_set.end());
      }
    }
  }

  // Record a new cache entry for this word if the term is longer than
  // a single character.
  if (term_length > 1)
    search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

  return history_id_set;
}
654 | |
655 WordIDSet URLIndexPrivateData::WordIDSetForTermChars( | |
656 const Char16Set& term_chars) { | |
657 WordIDSet word_id_set; | |
658 for (Char16Set::const_iterator c_iter = term_chars.begin(); | |
659 c_iter != term_chars.end(); ++c_iter) { | |
660 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); | |
661 if (char_iter == char_word_map_.end()) { | |
662 // A character was not found so there are no matching results: bail. | |
663 word_id_set.clear(); | |
664 break; | |
665 } | |
666 WordIDSet& char_word_id_set(char_iter->second); | |
667 // It is possible for there to no longer be any words associated with | |
668 // a particular character. Give up in that case. | |
669 if (char_word_id_set.empty()) { | |
670 word_id_set.clear(); | |
671 break; | |
672 } | |
673 | |
674 if (c_iter == term_chars.begin()) { | |
675 // First character results becomes base set of results. | |
676 word_id_set = char_word_id_set; | |
677 } else { | |
678 // Subsequent character results get intersected in. | |
679 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>( | |
680 word_id_set, char_word_id_set); | |
681 word_id_set.swap(new_word_id_set); | |
682 } | |
683 } | |
684 return word_id_set; | |
685 } | |
686 | |
// Indexes one history row: stores a scrubbed copy in history_info_map_,
// indexes the words of its URL and title, and fetches (or schedules the
// fetch of) its recent visits. Returns false when the URL's scheme is not
// whitelisted. Exactly one of |history_db| (synchronous fetch) or
// |history_service|+|tracker| (scheduled fetch) must be provided.
bool URLIndexPrivateData::IndexRow(
    HistoryDatabase* history_db,
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  const GURL& gurl(row.url());

  // Index only URLs with a whitelisted scheme.
  if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
    return false;

  URLID row_id = row.id();
  // Strip out username and password before saving and indexing.
  base::string16 url(net::FormatUrl(gurl, languages,
      net::kFormatUrlOmitUsernamePassword,
      net::UnescapeRule::NONE,
      NULL, NULL, NULL));

  HistoryID history_id = static_cast<HistoryID>(row_id);
  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

  // Add the row for quick lookup in the history info store, copying over
  // the fields the index cares about.
  URLRow new_row(GURL(url), row_id);
  new_row.set_visit_count(row.visit_count());
  new_row.set_typed_count(row.typed_count());
  new_row.set_last_visit(row.last_visit());
  new_row.set_title(row.title());
  history_info_map_[history_id].url_row = new_row;

  // Index the words contained in the URL and title of the row.
  RowWordStarts word_starts;
  AddRowWordsToIndex(new_row, &word_starts, languages);
  word_starts_map_[history_id] = word_starts;

  // Update the recent visits information or schedule the update
  // as appropriate.
  if (history_db) {
    // We'd like to check that we're on the history DB thread.
    // However, unittest code actually calls this on the UI thread.
    // So we don't do any thread checks.
    VisitVector recent_visits;
    // Make sure the private data is going to get as many recent visits as
    // ScoredHistoryMatch::GetFrequency() hopes to use.
    DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    if (history_db->GetMostRecentVisitsForURL(row_id,
                                              kMaxVisitsToStoreInCache,
                                              &recent_visits))
      UpdateRecentVisits(row_id, recent_visits);
  } else {
    DCHECK(tracker);
    DCHECK(history_service);
    ScheduleUpdateRecentVisits(history_service, row_id, tracker);
  }

  return true;
}
745 | |
746 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, | |
747 RowWordStarts* word_starts, | |
748 const std::string& languages) { | |
749 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
750 // Split URL into individual, unique words then add in the title words. | |
751 const GURL& gurl(row.url()); | |
752 const base::string16& url = | |
753 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL); | |
754 String16Set url_words = String16SetFromString16(url, | |
755 word_starts ? &word_starts->url_word_starts_ : NULL); | |
756 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title()); | |
757 String16Set title_words = String16SetFromString16(title, | |
758 word_starts ? &word_starts->title_word_starts_ : NULL); | |
759 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words); | |
760 for (String16Set::iterator word_iter = words.begin(); | |
761 word_iter != words.end(); ++word_iter) | |
762 AddWordToIndex(*word_iter, history_id); | |
763 | |
764 search_term_cache_.clear(); // Invalidate the term cache. | |
765 } | |
766 | |
767 void URLIndexPrivateData::AddWordToIndex(const base::string16& term, | |
768 HistoryID history_id) { | |
769 WordMap::iterator word_pos = word_map_.find(term); | |
770 if (word_pos != word_map_.end()) | |
771 UpdateWordHistory(word_pos->second, history_id); | |
772 else | |
773 AddWordHistory(term, history_id); | |
774 } | |
775 | |
776 void URLIndexPrivateData::AddWordHistory(const base::string16& term, | |
777 HistoryID history_id) { | |
778 WordID word_id = word_list_.size(); | |
779 if (available_words_.empty()) { | |
780 word_list_.push_back(term); | |
781 } else { | |
782 word_id = *(available_words_.begin()); | |
783 word_list_[word_id] = term; | |
784 available_words_.erase(word_id); | |
785 } | |
786 word_map_[term] = word_id; | |
787 | |
788 HistoryIDSet history_id_set; | |
789 history_id_set.insert(history_id); | |
790 word_id_history_map_[word_id] = history_id_set; | |
791 AddToHistoryIDWordMap(history_id, word_id); | |
792 | |
793 // For each character in the newly added word (i.e. a word that is not | |
794 // already in the word index), add the word to the character index. | |
795 Char16Set characters = Char16SetFromString16(term); | |
796 for (Char16Set::iterator uni_char_iter = characters.begin(); | |
797 uni_char_iter != characters.end(); ++uni_char_iter) { | |
798 base::char16 uni_char = *uni_char_iter; | |
799 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); | |
800 if (char_iter != char_word_map_.end()) { | |
801 // Update existing entry in the char/word index. | |
802 WordIDSet& word_id_set(char_iter->second); | |
803 word_id_set.insert(word_id); | |
804 } else { | |
805 // Create a new entry in the char/word index. | |
806 WordIDSet word_id_set; | |
807 word_id_set.insert(word_id); | |
808 char_word_map_[uni_char] = word_id_set; | |
809 } | |
810 } | |
811 } | |
812 | |
813 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, | |
814 HistoryID history_id) { | |
815 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); | |
816 DCHECK(history_pos != word_id_history_map_.end()); | |
817 HistoryIDSet& history_id_set(history_pos->second); | |
818 history_id_set.insert(history_id); | |
819 AddToHistoryIDWordMap(history_id, word_id); | |
820 } | |
821 | |
822 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, | |
823 WordID word_id) { | |
824 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); | |
825 if (iter != history_id_word_map_.end()) { | |
826 WordIDSet& word_id_set(iter->second); | |
827 word_id_set.insert(word_id); | |
828 } else { | |
829 WordIDSet word_id_set; | |
830 word_id_set.insert(word_id); | |
831 history_id_word_map_[history_id] = word_id_set; | |
832 } | |
833 } | |
834 | |
835 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { | |
836 RemoveRowWordsFromIndex(row); | |
837 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
838 history_info_map_.erase(history_id); | |
839 word_starts_map_.erase(history_id); | |
840 } | |
841 | |
842 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { | |
843 // Remove the entries in history_id_word_map_ and word_id_history_map_ for | |
844 // this row. | |
845 HistoryID history_id = static_cast<HistoryID>(row.id()); | |
846 WordIDSet word_id_set = history_id_word_map_[history_id]; | |
847 history_id_word_map_.erase(history_id); | |
848 | |
849 // Reconcile any changes to word usage. | |
850 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | |
851 word_id_iter != word_id_set.end(); ++word_id_iter) { | |
852 WordID word_id = *word_id_iter; | |
853 word_id_history_map_[word_id].erase(history_id); | |
854 if (!word_id_history_map_[word_id].empty()) | |
855 continue; // The word is still in use. | |
856 | |
857 // The word is no longer in use. Reconcile any changes to character usage. | |
858 base::string16 word = word_list_[word_id]; | |
859 Char16Set characters = Char16SetFromString16(word); | |
860 for (Char16Set::iterator uni_char_iter = characters.begin(); | |
861 uni_char_iter != characters.end(); ++uni_char_iter) { | |
862 base::char16 uni_char = *uni_char_iter; | |
863 char_word_map_[uni_char].erase(word_id); | |
864 if (char_word_map_[uni_char].empty()) | |
865 char_word_map_.erase(uni_char); // No longer in use. | |
866 } | |
867 | |
868 // Complete the removal of references to the word. | |
869 word_id_history_map_.erase(word_id); | |
870 word_map_.erase(word); | |
871 word_list_[word_id] = base::string16(); | |
872 available_words_.insert(word_id); | |
873 } | |
874 } | |
875 | |
876 void URLIndexPrivateData::ResetSearchTermCache() { | |
877 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | |
878 iter != search_term_cache_.end(); ++iter) | |
879 iter->second.used_ = false; | |
880 } | |
881 | |
882 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) { | |
883 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
884 InMemoryURLIndexCacheItem index_cache; | |
885 SavePrivateData(&index_cache); | |
886 std::string data; | |
887 if (!index_cache.SerializeToString(&data)) { | |
888 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; | |
889 return false; | |
890 } | |
891 | |
892 int size = data.size(); | |
893 if (base::WriteFile(file_path, data.c_str(), size) != size) { | |
894 LOG(WARNING) << "Failed to write " << file_path.value(); | |
895 return false; | |
896 } | |
897 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", | |
898 base::TimeTicks::Now() - beginning_time); | |
899 return true; | |
900 } | |
901 | |
// Serializes the entire private data set into |cache| in preparation for
// writing it to disk; each Save*() helper below fills in one section of the
// protobuf.
void URLIndexPrivateData::SavePrivateData(
    InMemoryURLIndexCacheItem* cache) const {
  DCHECK(cache);
  cache->set_last_rebuild_timestamp(
      last_time_rebuilt_from_history_.ToInternalValue());
  cache->set_version(saved_cache_version_);
  // history_item_count_ is no longer used but rather than change the protobuf
  // definition use a placeholder. This will go away with the switch to SQLite.
  cache->set_history_item_count(0);
  SaveWordList(cache);
  SaveWordMap(cache);
  SaveCharWordMap(cache);
  SaveWordIDHistoryMap(cache);
  SaveHistoryInfoMap(cache);
  SaveWordStartsMap(cache);
}
918 | |
919 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { | |
920 if (word_list_.empty()) | |
921 return; | |
922 WordListItem* list_item = cache->mutable_word_list(); | |
923 list_item->set_word_count(word_list_.size()); | |
924 for (String16Vector::const_iterator iter = word_list_.begin(); | |
925 iter != word_list_.end(); ++iter) | |
926 list_item->add_word(base::UTF16ToUTF8(*iter)); | |
927 } | |
928 | |
929 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { | |
930 if (word_map_.empty()) | |
931 return; | |
932 WordMapItem* map_item = cache->mutable_word_map(); | |
933 map_item->set_item_count(word_map_.size()); | |
934 for (WordMap::const_iterator iter = word_map_.begin(); | |
935 iter != word_map_.end(); ++iter) { | |
936 WordMapEntry* map_entry = map_item->add_word_map_entry(); | |
937 map_entry->set_word(base::UTF16ToUTF8(iter->first)); | |
938 map_entry->set_word_id(iter->second); | |
939 } | |
940 } | |
941 | |
942 void URLIndexPrivateData::SaveCharWordMap( | |
943 InMemoryURLIndexCacheItem* cache) const { | |
944 if (char_word_map_.empty()) | |
945 return; | |
946 CharWordMapItem* map_item = cache->mutable_char_word_map(); | |
947 map_item->set_item_count(char_word_map_.size()); | |
948 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); | |
949 iter != char_word_map_.end(); ++iter) { | |
950 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); | |
951 map_entry->set_char_16(iter->first); | |
952 const WordIDSet& word_id_set(iter->second); | |
953 map_entry->set_item_count(word_id_set.size()); | |
954 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); | |
955 set_iter != word_id_set.end(); ++set_iter) | |
956 map_entry->add_word_id(*set_iter); | |
957 } | |
958 } | |
959 | |
960 void URLIndexPrivateData::SaveWordIDHistoryMap( | |
961 InMemoryURLIndexCacheItem* cache) const { | |
962 if (word_id_history_map_.empty()) | |
963 return; | |
964 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); | |
965 map_item->set_item_count(word_id_history_map_.size()); | |
966 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); | |
967 iter != word_id_history_map_.end(); ++iter) { | |
968 WordIDHistoryMapEntry* map_entry = | |
969 map_item->add_word_id_history_map_entry(); | |
970 map_entry->set_word_id(iter->first); | |
971 const HistoryIDSet& history_id_set(iter->second); | |
972 map_entry->set_item_count(history_id_set.size()); | |
973 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); | |
974 set_iter != history_id_set.end(); ++set_iter) | |
975 map_entry->add_history_id(*set_iter); | |
976 } | |
977 } | |
978 | |
979 void URLIndexPrivateData::SaveHistoryInfoMap( | |
980 InMemoryURLIndexCacheItem* cache) const { | |
981 if (history_info_map_.empty()) | |
982 return; | |
983 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); | |
984 map_item->set_item_count(history_info_map_.size()); | |
985 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); | |
986 iter != history_info_map_.end(); ++iter) { | |
987 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); | |
988 map_entry->set_history_id(iter->first); | |
989 const URLRow& url_row(iter->second.url_row); | |
990 // Note: We only save information that contributes to the index so there | |
991 // is no need to save search_term_cache_ (not persistent). | |
992 map_entry->set_visit_count(url_row.visit_count()); | |
993 map_entry->set_typed_count(url_row.typed_count()); | |
994 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); | |
995 map_entry->set_url(url_row.url().spec()); | |
996 map_entry->set_title(base::UTF16ToUTF8(url_row.title())); | |
997 const VisitInfoVector& visits(iter->second.visits); | |
998 for (VisitInfoVector::const_iterator visit_iter = visits.begin(); | |
999 visit_iter != visits.end(); ++visit_iter) { | |
1000 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits(); | |
1001 visit_info->set_visit_time(visit_iter->first.ToInternalValue()); | |
1002 visit_info->set_transition_type(visit_iter->second); | |
1003 } | |
1004 } | |
1005 } | |
1006 | |
// Serializes word_starts_map_ (HistoryID -> word-start offsets for the URL
// and title) into |cache|. Writes nothing when the map is empty or when an
// older cache version is being deliberately produced for testing.
void URLIndexPrivateData::SaveWordStartsMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (word_starts_map_.empty())
    return;
  // For unit testing: Enable saving of the cache as an earlier version to
  // allow testing of cache file upgrading in ReadFromFile().
  // TODO(mrossetti): Instead of intruding on production code with this kind of
  // test harness, save a copy of an older version cache with known results.
  // Implement this when switching the caching over to SQLite.
  if (saved_cache_version_ < 1)
    return;

  WordStartsMapItem* map_item = cache->mutable_word_starts_map();
  map_item->set_item_count(word_starts_map_.size());
  for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
       iter != word_starts_map_.end(); ++iter) {
    WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
    map_entry->set_history_id(iter->first);
    const RowWordStarts& word_starts(iter->second);
    for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
         i != word_starts.url_word_starts_.end(); ++i)
      map_entry->add_url_word_starts(*i);
    for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
         i != word_starts.title_word_starts_.end(); ++i)
      map_entry->add_title_word_starts(*i);
  }
}
1034 | |
// Restores all private data from |cache|. Returns false -- signalling the
// caller to rebuild the index from the history database -- when the cache is
// stale, was written by an older file format, or fails any section's
// internal consistency checks.
bool URLIndexPrivateData::RestorePrivateData(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  last_time_rebuilt_from_history_ =
      base::Time::FromInternalValue(cache.last_rebuild_timestamp());
  const base::TimeDelta rebuilt_ago =
      base::Time::Now() - last_time_rebuilt_from_history_;
  if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
      (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
    // Cache is more than a week old or, somehow, from some time in the future.
    // It's probably a good time to rebuild the index from history to
    // allow synced entries to now appear, expired entries to disappear, etc.
    // Allow one day in the future to make the cache not rebuild on simple
    // system clock changes such as time zone changes.
    return false;
  }
  if (cache.has_version()) {
    if (cache.version() < kCurrentCacheFileVersion) {
      // Don't try to restore an old format cache file. (This will cause
      // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
      // from history.)
      return false;
    }
    restored_cache_version_ = cache.version();
  }
  // All sections must restore successfully; short-circuits on the first
  // failure. Note that RestoreWordStartsMap() must run last -- it may need
  // the data restored by RestoreHistoryInfoMap().
  return RestoreWordList(cache) && RestoreWordMap(cache) &&
      RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
      RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
}
1064 | |
1065 bool URLIndexPrivateData::RestoreWordList( | |
1066 const InMemoryURLIndexCacheItem& cache) { | |
1067 if (!cache.has_word_list()) | |
1068 return false; | |
1069 const WordListItem& list_item(cache.word_list()); | |
1070 uint32 expected_item_count = list_item.word_count(); | |
1071 uint32 actual_item_count = list_item.word_size(); | |
1072 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1073 return false; | |
1074 const RepeatedPtrField<std::string>& words(list_item.word()); | |
1075 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); | |
1076 iter != words.end(); ++iter) | |
1077 word_list_.push_back(base::UTF8ToUTF16(*iter)); | |
1078 return true; | |
1079 } | |
1080 | |
1081 bool URLIndexPrivateData::RestoreWordMap( | |
1082 const InMemoryURLIndexCacheItem& cache) { | |
1083 if (!cache.has_word_map()) | |
1084 return false; | |
1085 const WordMapItem& list_item(cache.word_map()); | |
1086 uint32 expected_item_count = list_item.item_count(); | |
1087 uint32 actual_item_count = list_item.word_map_entry_size(); | |
1088 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1089 return false; | |
1090 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); | |
1091 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); | |
1092 iter != entries.end(); ++iter) | |
1093 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id(); | |
1094 return true; | |
1095 } | |
1096 | |
1097 bool URLIndexPrivateData::RestoreCharWordMap( | |
1098 const InMemoryURLIndexCacheItem& cache) { | |
1099 if (!cache.has_char_word_map()) | |
1100 return false; | |
1101 const CharWordMapItem& list_item(cache.char_word_map()); | |
1102 uint32 expected_item_count = list_item.item_count(); | |
1103 uint32 actual_item_count = list_item.char_word_map_entry_size(); | |
1104 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1105 return false; | |
1106 const RepeatedPtrField<CharWordMapEntry>& | |
1107 entries(list_item.char_word_map_entry()); | |
1108 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = | |
1109 entries.begin(); iter != entries.end(); ++iter) { | |
1110 expected_item_count = iter->item_count(); | |
1111 actual_item_count = iter->word_id_size(); | |
1112 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1113 return false; | |
1114 base::char16 uni_char = static_cast<base::char16>(iter->char_16()); | |
1115 WordIDSet word_id_set; | |
1116 const RepeatedField<int32>& word_ids(iter->word_id()); | |
1117 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); | |
1118 jiter != word_ids.end(); ++jiter) | |
1119 word_id_set.insert(*jiter); | |
1120 char_word_map_[uni_char] = word_id_set; | |
1121 } | |
1122 return true; | |
1123 } | |
1124 | |
1125 bool URLIndexPrivateData::RestoreWordIDHistoryMap( | |
1126 const InMemoryURLIndexCacheItem& cache) { | |
1127 if (!cache.has_word_id_history_map()) | |
1128 return false; | |
1129 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); | |
1130 uint32 expected_item_count = list_item.item_count(); | |
1131 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); | |
1132 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1133 return false; | |
1134 const RepeatedPtrField<WordIDHistoryMapEntry>& | |
1135 entries(list_item.word_id_history_map_entry()); | |
1136 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = | |
1137 entries.begin(); iter != entries.end(); ++iter) { | |
1138 expected_item_count = iter->item_count(); | |
1139 actual_item_count = iter->history_id_size(); | |
1140 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1141 return false; | |
1142 WordID word_id = iter->word_id(); | |
1143 HistoryIDSet history_id_set; | |
1144 const RepeatedField<int64>& history_ids(iter->history_id()); | |
1145 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); | |
1146 jiter != history_ids.end(); ++jiter) { | |
1147 history_id_set.insert(*jiter); | |
1148 AddToHistoryIDWordMap(*jiter, word_id); | |
1149 } | |
1150 word_id_history_map_[word_id] = history_id_set; | |
1151 } | |
1152 return true; | |
1153 } | |
1154 | |
// Restores history_info_map_ from |cache|, including each row's URL data and
// its saved list of recent visits. Returns false if the section is missing
// or its recorded count disagrees with the actual number of entries.
bool URLIndexPrivateData::RestoreHistoryInfoMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_history_info_map())
    return false;
  const HistoryInfoMapItem& list_item(cache.history_info_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.history_info_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<HistoryInfoMapEntry>&
      entries(list_item.history_info_map_entry());
  for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    HistoryID history_id = iter->history_id();
    GURL url(iter->url());
    // Reconstruct the URLRow from the individually persisted fields.
    URLRow url_row(url, history_id);
    url_row.set_visit_count(iter->visit_count());
    url_row.set_typed_count(iter->typed_count());
    url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
    if (iter->has_title()) {
      base::string16 title(base::UTF8ToUTF16(iter->title()));
      url_row.set_title(title);
    }
    history_info_map_[history_id].url_row = url_row;

    // Restore visits list.
    VisitInfoVector visits;
    visits.reserve(iter->visits_size());
    for (int i = 0; i < iter->visits_size(); ++i) {
      visits.push_back(std::make_pair(
          base::Time::FromInternalValue(iter->visits(i).visit_time()),
          ui::PageTransitionFromInt(iter->visits(i).transition_type())));
    }
    history_info_map_[history_id].visits = visits;
  }
  return true;
}
1192 | |
// Restores word_starts_map_ from |cache| if present; otherwise recomputes the
// word starts from the already-restored URLs and page titles (which is why
// this must run after RestoreHistoryInfoMap()).
bool URLIndexPrivateData::RestoreWordStartsMap(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  // Note that this function must be called after RestoreHistoryInfoMap() has
  // been run as the word starts may have to be recalculated from the urls and
  // page titles.
  if (cache.has_word_starts_map()) {
    const WordStartsMapItem& list_item(cache.word_starts_map());
    uint32 expected_item_count = list_item.item_count();
    uint32 actual_item_count = list_item.word_starts_map_entry_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    const RepeatedPtrField<WordStartsMapEntry>&
        entries(list_item.word_starts_map_entry());
    for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
         entries.begin(); iter != entries.end(); ++iter) {
      HistoryID history_id = iter->history_id();
      RowWordStarts word_starts;
      // Restore the URL word starts.
      const RepeatedField<int32>& url_starts(iter->url_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
           jiter != url_starts.end(); ++jiter)
        word_starts.url_word_starts_.push_back(*jiter);
      // Restore the page title word starts.
      const RepeatedField<int32>& title_starts(iter->title_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
           jiter != title_starts.end(); ++jiter)
        word_starts.title_word_starts_.push_back(*jiter);
      word_starts_map_[history_id] = word_starts;
    }
  } else {
    // Since the cache did not contain any word starts we must rebuild them
    // from the URL and page titles.
    for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
         iter != history_info_map_.end(); ++iter) {
      RowWordStarts word_starts;
      const URLRow& row(iter->second.url_row);
      const base::string16& url =
          bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
      String16VectorFromString16(url, false, &word_starts.url_word_starts_);
      const base::string16& title =
          bookmarks::CleanUpTitleForMatching(row.title());
      String16VectorFromString16(title, false, &word_starts.title_word_starts_);
      word_starts_map_[iter->first] = word_starts;
    }
  }
  return true;
}
1241 | |
1242 // static | |
1243 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | |
1244 const GURL& gurl, | |
1245 const std::set<std::string>& whitelist) { | |
1246 return whitelist.find(gurl.scheme()) != whitelist.end(); | |
1247 } | |
1248 | |
1249 | |
1250 // SearchTermCacheItem --------------------------------------------------------- | |
1251 | |
// Caches the word-id and history-id result sets computed for a search term
// so that later, refined searches can start from them. |used_| starts true;
// ResetSearchTermCache() clears it so stale entries can be identified.
URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
    const WordIDSet& word_id_set,
    const HistoryIDSet& history_id_set)
    : word_id_set_(word_id_set),
      history_id_set_(history_id_set),
      used_(true) {}

// Creates an empty cache item (no matching words or history entries).
URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
    : used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
1263 | |
1264 | |
1265 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- | |
1266 | |
// Functor that scores candidate history items against the user's search
// terms. The constructor precomputes, for each lower-cased term, the offset
// of its first word character so matches can later be aligned against the
// stored word starts.
URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
    const URLIndexPrivateData& private_data,
    const std::string& languages,
    const base::string16& lower_string,
    const String16Vector& lower_terms,
    const base::Time now,
    const ScoredHistoryMatch::Builder& builder)
  : private_data_(private_data),
    languages_(languages),
    builder_(builder),
    lower_string_(lower_string),
    lower_terms_(lower_terms),
    now_(now) {
  // Calculate offsets for each term.  For instance, the offset for
  // ".net" should be 1, indicating that the actual word-part of the term
  // starts at offset 1.
  lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
  for (size_t i = 0; i < lower_terms_.size(); ++i) {
    base::i18n::BreakIterator iter(lower_terms_[i],
                                   base::i18n::BreakIterator::BREAK_WORD);
    // If the iterator doesn't work, assume an offset of 0.
    if (!iter.Init())
      continue;
    // Find the first word start by advancing until a word break is reached.
    while (iter.Advance() && !iter.IsWord()) {}
    if (iter.IsWord())
      lower_terms_to_word_starts_offsets_[i] = iter.prev();
    // Else: the iterator didn't find a word break.  Assume an offset of 0.
  }
}
1297 | |
URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}

// Scores the history item identified by |history_id| against the search
// terms; items producing a positive raw score are collected into
// |scored_matches_|. Items missing from history_info_map_ are skipped.
void URLIndexPrivateData::AddHistoryMatch::operator()(
    const HistoryID history_id) {
  HistoryInfoMap::const_iterator hist_pos =
      private_data_.history_info_map_.find(history_id);
  if (hist_pos != private_data_.history_info_map_.end()) {
    const URLRow& hist_item = hist_pos->second.url_row;
    const VisitInfoVector& visits = hist_pos->second.visits;
    WordStartsMap::const_iterator starts_pos =
        private_data_.word_starts_map_.find(history_id);
    // Every indexed row is expected to have word starts recorded alongside.
    DCHECK(starts_pos != private_data_.word_starts_map_.end());
    ScoredHistoryMatch match = builder_.Build(
        hist_item, visits, languages_, lower_string_, lower_terms_,
        lower_terms_to_word_starts_offsets_, starts_pos->second, now_);
    if (match.raw_score > 0)
      scored_matches_.push_back(match);
  }
}
1317 | |
1318 | |
1319 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- | |
1320 | |
// Comparator functor used to order history items by decreasing importance.
URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
    const HistoryInfoMap& history_info_map)
    : history_info_map_(history_info_map) {
}

URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}

// Returns true if |h1| should sort before |h2|. Items missing from
// history_info_map_ always sort last; otherwise items are ranked by typed
// count, then visit count, then recency of last visit.
bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
    const HistoryID h1,
    const HistoryID h2) {
  HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
  if (entry1 == history_info_map_.end())
    return false;
  HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
  if (entry2 == history_info_map_.end())
    return true;
  const URLRow& r1(entry1->second.url_row);
  const URLRow& r2(entry2->second.url_row);
  // First cut: typed count, visit count, recency.
  // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
  // recently visited (within the last 12/24 hours) as highly important. Get
  // input from mpearson.
  if (r1.typed_count() != r2.typed_count())
    return (r1.typed_count() > r2.typed_count());
  if (r1.visit_count() != r2.visit_count())
    return (r1.visit_count() > r2.visit_count());
  return (r1.last_visit() > r2.last_visit());
}
1349 | |
1350 } // namespace history | |
OLD | NEW |