Chromium Code Reviews

Side by Side Diff: chrome/browser/history/url_index_private_data.cc

Issue 963823003: Move InMemoryURLIndex into chrome/browser/autocomplete (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@shortcut-database
Patch Set: Fixing win_chromium_x64_rel_ng build Created 5 years, 9 months ago
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/history/url_index_private_data.h"
6
7 #include <functional>
8 #include <iterator>
9 #include <limits>
10 #include <numeric>
11 #include <string>
12 #include <vector>
13
14 #include "base/basictypes.h"
15 #include "base/files/file_util.h"
16 #include "base/i18n/break_iterator.h"
17 #include "base/i18n/case_conversion.h"
18 #include "base/metrics/histogram.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/time/time.h"
22 #include "chrome/browser/history/history_service.h"
23 #include "chrome/browser/history/in_memory_url_index.h"
24 #include "components/bookmarks/browser/bookmark_utils.h"
25 #include "components/history/core/browser/history_database.h"
26 #include "components/history/core/browser/history_db_task.h"
27 #include "net/base/net_util.h"
28
29 #if defined(USE_SYSTEM_PROTOBUF)
30 #include <google/protobuf/repeated_field.h>
31 #else
32 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
33 #endif
34
35 using google::protobuf::RepeatedField;
36 using google::protobuf::RepeatedPtrField;
37 using in_memory_url_index::InMemoryURLIndexCacheItem;
38
39 namespace {
40 static const size_t kMaxVisitsToStoreInCache = 10u;
41 } // anonymous namespace
42
43 namespace history {
44
45 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
46 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
47 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
48 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
49 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
50 CharWordMapEntry;
51 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
52 WordIDHistoryMapItem;
53 typedef imui::
54 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
55 WordIDHistoryMapEntry;
56 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
57 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
58 HistoryInfoMapEntry;
59 typedef imui::
60 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
61 HistoryInfoMapEntry_VisitInfo;
62 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
63 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
64 WordStartsMapEntry;
65
66
67 // Algorithm Functions ---------------------------------------------------------
68
69 // Comparison function for sorting search terms by descending length.
70 bool LengthGreater(const base::string16& string_a,
71 const base::string16& string_b) {
72 return string_a.length() > string_b.length();
73 }
74
75
76 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
77
78 // HistoryDBTask used to update the recent visit data for a particular
79 // row from the history database.
80 class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
81 public:
82 explicit UpdateRecentVisitsFromHistoryDBTask(
83 URLIndexPrivateData* private_data,
84 URLID url_id);
85
86 bool RunOnDBThread(HistoryBackend* backend,
87 history::HistoryDatabase* db) override;
88 void DoneRunOnMainThread() override;
89
90 private:
91 ~UpdateRecentVisitsFromHistoryDBTask() override;
92
93 // The URLIndexPrivateData that gets updated after the historyDB
94 // task returns.
95 URLIndexPrivateData* private_data_;
96 // The ID of the URL to get visits for and then update.
97 URLID url_id_;
98 // Whether fetching the recent visits for the URL succeeded.
99 bool succeeded_;
100 // The most recent visits fetched from the history database, handed to
101 // private_data_ for it to copy and store.
102 VisitVector recent_visits_;
103
104 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
105 };
106
107 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
108 URLIndexPrivateData* private_data,
109 URLID url_id)
110 : private_data_(private_data),
111 url_id_(url_id),
112 succeeded_(false) {
113 }
114
115 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
116 HistoryBackend* backend,
117 HistoryDatabase* db) {
118 // Make sure the private data is going to get as many recent visits as
119 // ScoredHistoryMatch::GetFrequency() hopes to use.
120 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
121 succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
122 kMaxVisitsToStoreInCache,
123 &recent_visits_);
124 if (!succeeded_)
125 recent_visits_.clear();
126 return true; // Always claim to be done; do not retry failures.
127 }
128
129 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
130 if (succeeded_)
131 private_data_->UpdateRecentVisits(url_id_, recent_visits_);
132 }
133
134 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
135 }
136
137
138 // URLIndexPrivateData ---------------------------------------------------------
139
140 URLIndexPrivateData::URLIndexPrivateData()
141 : restored_cache_version_(0),
142 saved_cache_version_(kCurrentCacheFileVersion),
143 pre_filter_item_count_(0),
144 post_filter_item_count_(0),
145 post_scoring_item_count_(0) {
146 }
147
148 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
149 base::string16 search_string,
150 size_t cursor_position,
151 size_t max_matches,
152 const std::string& languages,
153 const history::ScoredHistoryMatch::Builder& builder) {
154 // If cursor position is set and useful (not at either end of the
155 // string), allow the search string to be broken at cursor position.
156 // We do this by pretending there's a space where the cursor is.
157 if ((cursor_position != base::string16::npos) &&
158 (cursor_position < search_string.length()) &&
159 (cursor_position > 0)) {
160 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
161 }
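// E.g. (illustrative): with "historysearch" and the cursor just after
// "history", the string above is scored as if the user had typed
// "history search".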
162 pre_filter_item_count_ = 0;
163 post_filter_item_count_ = 0;
164 post_scoring_item_count_ = 0;
165 // The search string we receive may contain escaped characters. To narrow
166 // candidates via the index we need individual, lower-cased words with the
167 // escaping removed. For the final filtering we need whitespace-separated
168 // substrings that may still contain escaped characters.
169 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
170 base::string16 lower_unescaped_string =
171 net::UnescapeURLComponent(lower_raw_string,
172 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
173 // Extract individual 'words' (as opposed to 'terms'; see below) from the
174 // search string. When the user types "colspec=ID%20Mstone Release" we get
175 // four 'words': "colspec", "id", "mstone" and "release".
176 String16Vector lower_words(
177 history::String16VectorFromString16(lower_unescaped_string, false, NULL));
178 ScoredHistoryMatches scored_items;
179
180 // Do nothing if we have indexed no words (probably because we've not been
181 // initialized yet) or the search string has no words.
182 if (word_list_.empty() || lower_words.empty()) {
183 search_term_cache_.clear(); // Invalidate the term cache.
184 return scored_items;
185 }
186
187 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
188 // approach.
189 ResetSearchTermCache();
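// (ResetSearchTermCache(), above, clears each cached entry's used_ flag;
// entries hit again in HistoryIDsForTerm() are re-marked, and entries left
// unmarked are swept near the end of this function.)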
190
191 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
192
193 // Trim the candidate pool if it is large. Note that we do not filter out
194 // items that do not contain the search terms as proper substrings -- doing
195 // so is the performance-costly operation we are trying to avoid in order
196 // to maintain omnibox responsiveness.
197 const size_t kItemsToScoreLimit = 500;
198 pre_filter_item_count_ = history_id_set.size();
199 // If we trim the results set we do not want to cache the results for next
200 // time as the user's ultimately desired result could easily be eliminated
201 // in this early rough filter.
202 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
203 if (was_trimmed) {
204 HistoryIDVector history_ids;
205 std::copy(history_id_set.begin(), history_id_set.end(),
206 std::back_inserter(history_ids));
207 // Trim down the set by sorting by typed-count, visit-count, and last
208 // visit.
209 HistoryItemFactorGreater
210 item_factor_functor(history_info_map_);
211 std::partial_sort(history_ids.begin(),
212 history_ids.begin() + kItemsToScoreLimit,
213 history_ids.end(),
214 item_factor_functor);
215 history_id_set.clear();
216 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
217 std::inserter(history_id_set, history_id_set.end()));
218 post_filter_item_count_ = history_id_set.size();
219 }
220
221 // Pass over all of the candidates filtering out any without a proper
222 // substring match, inserting those which pass in order by score. Note that
223 // in this step we are using the raw search string complete with escaped
224 // URL elements. When the user has specifically typed something akin to
225 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
226 // specific substring appears in the URL or page title.
227
228 // We call these 'terms' (as opposed to 'words'; see above) as in this case
229 // we only want to break up the search string on 'true' whitespace rather than
230 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
231 // get two 'terms': "colspec=id%20mstone" and "release".
232 history::String16Vector lower_raw_terms;
233 if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
234 &lower_raw_terms) == 0) {
235 // Don't score matches when there are no terms to score against. (It's
236 // possible that the word break iterator that extracts words to search
237 // for in the database allows some whitespace "words" whereas Tokenize
238 // excludes a long list of whitespace.) One could write a scoring
239 // function that gives a reasonable order to matches when there
240 // are no terms (i.e., all the words are some form of whitespace),
241 // but this is such a rare edge case that it's not worth the time.
242 return scored_items;
243 }
244 scored_items =
245 std::for_each(
246 history_id_set.begin(), history_id_set.end(),
247 AddHistoryMatch(*this, languages, lower_raw_string, lower_raw_terms,
248 base::Time::Now(), builder)).ScoredMatches();
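// (std::for_each returns its functor by value, so the AddHistoryMatch
// instance that accumulated the matches is the object whose ScoredMatches()
// result is assigned above.)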
249
250 // Select and sort only the top |max_matches| results.
251 if (scored_items.size() > max_matches) {
252 std::partial_sort(scored_items.begin(),
253 scored_items.begin() +
254 max_matches,
255 scored_items.end(),
256 ScoredHistoryMatch::MatchScoreGreater);
257 scored_items.resize(max_matches);
258 } else {
259 std::sort(scored_items.begin(), scored_items.end(),
260 ScoredHistoryMatch::MatchScoreGreater);
261 }
262 post_scoring_item_count_ = scored_items.size();
263
264 if (was_trimmed) {
265 search_term_cache_.clear(); // Invalidate the term cache.
266 } else {
267 // Remove any stale SearchTermCacheItems.
268 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
269 cache_iter != search_term_cache_.end(); ) {
270 if (!cache_iter->second.used_)
271 search_term_cache_.erase(cache_iter++);
272 else
273 ++cache_iter;
274 }
275 }
276
277 return scored_items;
278 }
279
280 bool URLIndexPrivateData::UpdateURL(
281 HistoryService* history_service,
282 const URLRow& row,
283 const std::string& languages,
284 const std::set<std::string>& scheme_whitelist,
285 base::CancelableTaskTracker* tracker) {
286 // The row may or may not already be in our index. If it is not already
287 // indexed and it qualifies then it gets indexed. If it is already
288 // indexed and still qualifies then it gets updated, otherwise it
289 // is deleted from the index.
290 bool row_was_updated = false;
291 URLID row_id = row.id();
292 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
293 if (row_pos == history_info_map_.end()) {
294 // This new row should be indexed if it qualifies.
295 URLRow new_row(row);
296 new_row.set_id(row_id);
297 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
298 IndexRow(NULL,
299 history_service,
300 new_row,
301 languages,
302 scheme_whitelist,
303 tracker);
304 } else if (RowQualifiesAsSignificant(row, base::Time())) {
305 // This indexed row still qualifies and will be re-indexed.
306 // The url won't have changed but the title, visit count, etc.
307 // might have changed.
308 URLRow& row_to_update = row_pos->second.url_row;
309 bool title_updated = row_to_update.title() != row.title();
310 if (row_to_update.visit_count() != row.visit_count() ||
311 row_to_update.typed_count() != row.typed_count() ||
312 row_to_update.last_visit() != row.last_visit() || title_updated) {
313 row_to_update.set_visit_count(row.visit_count());
314 row_to_update.set_typed_count(row.typed_count());
315 row_to_update.set_last_visit(row.last_visit());
316 // If something appears to have changed, update the recent visits
317 // information.
318 ScheduleUpdateRecentVisits(history_service, row_id, tracker);
319 // While the URL is guaranteed to remain stable, the title may have
320 // changed. If so, then update the index with the changed words.
321 if (title_updated) {
322 // Clear all words associated with this row and re-index both the
323 // URL and title.
324 RemoveRowWordsFromIndex(row_to_update);
325 row_to_update.set_title(row.title());
326 RowWordStarts word_starts;
327 AddRowWordsToIndex(row_to_update, &word_starts, languages);
328 word_starts_map_[row_id] = word_starts;
329 }
330 row_was_updated = true;
331 }
332 } else {
333 // This indexed row no longer qualifies and will be de-indexed by
334 // clearing all words associated with this row.
335 RemoveRowFromIndex(row);
336 row_was_updated = true;
337 }
338 if (row_was_updated)
339 search_term_cache_.clear(); // This invalidates the cache.
340 return row_was_updated;
341 }
342
343 void URLIndexPrivateData::UpdateRecentVisits(
344 URLID url_id,
345 const VisitVector& recent_visits) {
346 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
347 if (row_pos != history_info_map_.end()) {
348 VisitInfoVector* visits = &row_pos->second.visits;
349 visits->clear();
350 const size_t size =
351 std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
352 visits->reserve(size);
353 for (size_t i = 0; i < size; i++) {
354 // Copy from the VisitVector the only fields visits needs.
355 visits->push_back(std::make_pair(recent_visits[i].visit_time,
356 recent_visits[i].transition));
357 }
358 }
359 // Else: Oddly, the URL doesn't seem to exist in the private index.
360 // Ignore this update. This can happen if, for instance, the user
361 // removes the URL from URLIndexPrivateData before the historyDB call
362 // returns.
363 }
364
365 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
366 HistoryService* history_service,
367 URLID url_id,
368 base::CancelableTaskTracker* tracker) {
369 history_service->ScheduleDBTask(
370 scoped_ptr<history::HistoryDBTask>(
371 new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
372 }
373
374 // Helper functor for DeleteURL.
375 class HistoryInfoMapItemHasURL {
376 public:
377 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
378
379 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
380 return item.second.url_row.url() == url_;
381 }
382
383 private:
384 const GURL& url_;
385 };
386
387 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
388 // Find the matching entry in the history_info_map_.
389 HistoryInfoMap::iterator pos = std::find_if(
390 history_info_map_.begin(),
391 history_info_map_.end(),
392 HistoryInfoMapItemHasURL(url));
393 if (pos == history_info_map_.end())
394 return false;
395 RemoveRowFromIndex(pos->second.url_row);
396 search_term_cache_.clear(); // This invalidates the cache.
397 return true;
398 }
399
400 // static
401 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
402 const base::FilePath& file_path,
403 const std::string& languages) {
404 base::TimeTicks beginning_time = base::TimeTicks::Now();
405 if (!base::PathExists(file_path))
406 return NULL;
407 std::string data;
408 // If the cache file cannot be read, simply give up. This will cause us to
409 // attempt to rebuild from the history database.
410 if (!base::ReadFileToString(file_path, &data))
411 return NULL;
412
413 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
414 InMemoryURLIndexCacheItem index_cache;
415 if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
416 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
417 << file_path.value();
418 return restored_data;
419 }
420
421 if (!restored_data->RestorePrivateData(index_cache, languages))
422 return NULL;
423
424 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
425 base::TimeTicks::Now() - beginning_time);
426 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
427 restored_data->history_id_word_map_.size());
428 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
429 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
430 restored_data->word_map_.size());
431 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
432 restored_data->char_word_map_.size());
433 if (restored_data->Empty())
434 return NULL; // 'No data' is the same as a failed reload.
435 return restored_data;
436 }
437
438 // static
439 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
440 HistoryDatabase* history_db,
441 const std::string& languages,
442 const std::set<std::string>& scheme_whitelist) {
443 if (!history_db)
444 return NULL;
445
446 base::TimeTicks beginning_time = base::TimeTicks::Now();
447
448 scoped_refptr<URLIndexPrivateData>
449 rebuilt_data(new URLIndexPrivateData);
450 URLDatabase::URLEnumerator history_enum;
451 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
452 return NULL;
453 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
454 for (URLRow row; history_enum.GetNextURL(&row); ) {
455 rebuilt_data->IndexRow(
456 history_db, NULL, row, languages, scheme_whitelist, NULL);
457 }
458
459 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
460 base::TimeTicks::Now() - beginning_time);
461 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
462 rebuilt_data->history_id_word_map_.size());
463 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
464 rebuilt_data->word_map_.size());
465 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
466 rebuilt_data->char_word_map_.size());
467 return rebuilt_data;
468 }
469
470 // static
471 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
472 scoped_refptr<URLIndexPrivateData> private_data,
473 const base::FilePath& file_path) {
474 DCHECK(private_data.get());
475 DCHECK(!file_path.empty());
476 return private_data->SaveToFile(file_path);
477 }
478
479 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
480 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
481 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
482 data_copy->word_list_ = word_list_;
483 data_copy->available_words_ = available_words_;
484 data_copy->word_map_ = word_map_;
485 data_copy->char_word_map_ = char_word_map_;
486 data_copy->word_id_history_map_ = word_id_history_map_;
487 data_copy->history_id_word_map_ = history_id_word_map_;
488 data_copy->history_info_map_ = history_info_map_;
489 data_copy->word_starts_map_ = word_starts_map_;
490 return data_copy;
491 // Not copied:
492 // search_term_cache_
493 // pre_filter_item_count_
494 // post_filter_item_count_
495 // post_scoring_item_count_
496 }
497
498 bool URLIndexPrivateData::Empty() const {
499 return history_info_map_.empty();
500 }
501
502 void URLIndexPrivateData::Clear() {
503 last_time_rebuilt_from_history_ = base::Time();
504 word_list_.clear();
505 available_words_.clear();
506 word_map_.clear();
507 char_word_map_.clear();
508 word_id_history_map_.clear();
509 history_id_word_map_.clear();
510 history_info_map_.clear();
511 word_starts_map_.clear();
512 }
513
514 URLIndexPrivateData::~URLIndexPrivateData() {}
515
516 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
517 const String16Vector& unsorted_words) {
518 // Break the search terms down into individual words, get the candidate set
519 // for each word, and intersect those sets to get the final candidate list.
520 // Note that a single 'term' from the user's perspective might be
521 // a string like "http://www.somewebsite.com" which, from our perspective,
522 // is four words: 'http', 'www', 'somewebsite', and 'com'.
523 HistoryIDSet history_id_set;
524 String16Vector words(unsorted_words);
525 // Sort the words into the longest first as such are likely to narrow down
526 // the results quicker. Also, single character words are the most expensive
527 // to process so save them for last.
528 std::sort(words.begin(), words.end(), LengthGreater);
529 for (String16Vector::iterator iter = words.begin(); iter != words.end();
530 ++iter) {
531 base::string16 uni_word = *iter;
532 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
533 if (term_history_set.empty()) {
534 history_id_set.clear();
535 break;
536 }
537 if (iter == words.begin()) {
538 history_id_set.swap(term_history_set);
539 } else {
540 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
541 history_id_set, term_history_set);
542 history_id_set.swap(new_history_id_set);
543 }
544 }
545 return history_id_set;
546 }
547
548 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
549 const base::string16& term) {
550 if (term.empty())
551 return HistoryIDSet();
552
553 // TODO(mrossetti): Consider optimizing for very common terms such as
554 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
555 // occurring words in the user's searches.
556
557 size_t term_length = term.length();
558 WordIDSet word_id_set;
559 if (term_length > 1) {
560 // See if this term or a prefix thereof is present in the cache.
561 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
562 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
563 cache_iter != search_term_cache_.end(); ++cache_iter) {
564 if (StartsWith(term, cache_iter->first, false) &&
565 (best_prefix == search_term_cache_.end() ||
566 cache_iter->first.length() > best_prefix->first.length()))
567 best_prefix = cache_iter;
568 }
569
570 // If a prefix was found then determine the leftover characters to be used
571 // for further refining the results from that prefix.
572 Char16Set prefix_chars;
573 base::string16 leftovers(term);
574 if (best_prefix != search_term_cache_.end()) {
575 // If the prefix is an exact match for the term then grab the cached
576 // results and we're done.
577 size_t prefix_length = best_prefix->first.length();
578 if (prefix_length == term_length) {
579 best_prefix->second.used_ = true;
580 return best_prefix->second.history_id_set_;
581 }
582
583 // Otherwise we have a handy starting point.
584 // If there are no history results for this prefix then we can bail early
585 // as there will be no history results for the full term.
586 if (best_prefix->second.history_id_set_.empty()) {
587 search_term_cache_[term] = SearchTermCacheItem();
588 return HistoryIDSet();
589 }
590 word_id_set = best_prefix->second.word_id_set_;
591 prefix_chars = Char16SetFromString16(best_prefix->first);
592 leftovers = term.substr(prefix_length);
593 }
594
595 // Filter for each remaining, unique character in the term.
596 Char16Set leftover_chars = Char16SetFromString16(leftovers);
597 Char16Set unique_chars =
598 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
599
600 // Reduce the word set with any leftover, unprocessed characters.
601 if (!unique_chars.empty()) {
602 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
603 // We might come up empty on the leftovers.
604 if (leftover_set.empty()) {
605 search_term_cache_[term] = SearchTermCacheItem();
606 return HistoryIDSet();
607 }
608 // Or there may not have been a prefix from which to start.
609 if (prefix_chars.empty()) {
610 word_id_set.swap(leftover_set);
611 } else {
612 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
613 word_id_set, leftover_set);
614 word_id_set.swap(new_word_id_set);
615 }
616 }
617
618 // We must filter the word list because the resulting word set surely
619 // contains words which do not have the search term as a proper subset.
620 for (WordIDSet::iterator word_set_iter = word_id_set.begin();
621 word_set_iter != word_id_set.end(); ) {
622 if (word_list_[*word_set_iter].find(term) == base::string16::npos)
623 word_id_set.erase(word_set_iter++);
624 else
625 ++word_set_iter;
626 }
627 } else {
628 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
629 }
630
631 // If any words resulted then we can compose a set of history IDs by unioning
632 // the sets from each word.
633 HistoryIDSet history_id_set;
634 if (!word_id_set.empty()) {
635 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
636 word_id_iter != word_id_set.end(); ++word_id_iter) {
637 WordID word_id = *word_id_iter;
638 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
639 if (word_iter != word_id_history_map_.end()) {
640 HistoryIDSet& word_history_id_set(word_iter->second);
641 history_id_set.insert(word_history_id_set.begin(),
642 word_history_id_set.end());
643 }
644 }
645 }
646
647 // Record a new cache entry for this word if the term is longer than
648 // a single character.
649 if (term_length > 1)
650 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
651
652 return history_id_set;
653 }
654
655 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
656 const Char16Set& term_chars) {
657 WordIDSet word_id_set;
658 for (Char16Set::const_iterator c_iter = term_chars.begin();
659 c_iter != term_chars.end(); ++c_iter) {
660 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
661 if (char_iter == char_word_map_.end()) {
662 // A character was not found so there are no matching results: bail.
663 word_id_set.clear();
664 break;
665 }
666 WordIDSet& char_word_id_set(char_iter->second);
667 // It is possible for there to no longer be any words associated with
668 // a particular character. Give up in that case.
669 if (char_word_id_set.empty()) {
670 word_id_set.clear();
671 break;
672 }
673
674 if (c_iter == term_chars.begin()) {
675 // The first character's results become the base set of results.
676 word_id_set = char_word_id_set;
677 } else {
678 // Subsequent character results get intersected in.
679 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
680 word_id_set, char_word_id_set);
681 word_id_set.swap(new_word_id_set);
682 }
683 }
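// E.g. (illustrative): for term characters {'c', 'a', 'b'} the surviving
// word IDs are those whose words contain all three characters, such as
// "cab" or "backache".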
684 return word_id_set;
685 }
686
687 bool URLIndexPrivateData::IndexRow(
688 HistoryDatabase* history_db,
689 HistoryService* history_service,
690 const URLRow& row,
691 const std::string& languages,
692 const std::set<std::string>& scheme_whitelist,
693 base::CancelableTaskTracker* tracker) {
694 const GURL& gurl(row.url());
695
696 // Index only URLs with a whitelisted scheme.
697 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
698 return false;
699
700 URLID row_id = row.id();
701 // Strip out username and password before saving and indexing.
702 base::string16 url(net::FormatUrl(gurl, languages,
703 net::kFormatUrlOmitUsernamePassword,
704 net::UnescapeRule::NONE,
705 NULL, NULL, NULL));
706
707 HistoryID history_id = static_cast<HistoryID>(row_id);
708 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
709
710 // Add the row for quick lookup in the history info store.
711 URLRow new_row(GURL(url), row_id);
712 new_row.set_visit_count(row.visit_count());
713 new_row.set_typed_count(row.typed_count());
714 new_row.set_last_visit(row.last_visit());
715 new_row.set_title(row.title());
716 history_info_map_[history_id].url_row = new_row;
717
718 // Index the words contained in the URL and title of the row.
719 RowWordStarts word_starts;
720 AddRowWordsToIndex(new_row, &word_starts, languages);
721 word_starts_map_[history_id] = word_starts;
722
723 // Update the recent visits information or schedule the update
724 // as appropriate.
725 if (history_db) {
726 // We'd like to check that we're on the history DB thread.
727 // However, unittest code actually calls this on the UI thread.
728 // So we don't do any thread checks.
729 VisitVector recent_visits;
730 // Make sure the private data is going to get as many recent visits as
731 // ScoredHistoryMatch::GetFrequency() hopes to use.
732 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
733 if (history_db->GetMostRecentVisitsForURL(row_id,
734 kMaxVisitsToStoreInCache,
735 &recent_visits))
736 UpdateRecentVisits(row_id, recent_visits);
737 } else {
738 DCHECK(tracker);
739 DCHECK(history_service);
740 ScheduleUpdateRecentVisits(history_service, row_id, tracker);
741 }
742
743 return true;
744 }
745
746 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
747 RowWordStarts* word_starts,
748 const std::string& languages) {
749 HistoryID history_id = static_cast<HistoryID>(row.id());
750 // Split URL into individual, unique words then add in the title words.
751 const GURL& gurl(row.url());
752 const base::string16& url =
753 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
754 String16Set url_words = String16SetFromString16(url,
755 word_starts ? &word_starts->url_word_starts_ : NULL);
756 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
757 String16Set title_words = String16SetFromString16(title,
758 word_starts ? &word_starts->title_word_starts_ : NULL);
759 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
760 for (String16Set::iterator word_iter = words.begin();
761 word_iter != words.end(); ++word_iter)
762 AddWordToIndex(*word_iter, history_id);
763
764 search_term_cache_.clear(); // Invalidate the term cache.
765 }
766
767 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
768 HistoryID history_id) {
769 WordMap::iterator word_pos = word_map_.find(term);
770 if (word_pos != word_map_.end())
771 UpdateWordHistory(word_pos->second, history_id);
772 else
773 AddWordHistory(term, history_id);
774 }
775
776 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
777 HistoryID history_id) {
778 WordID word_id = word_list_.size();
779 if (available_words_.empty()) {
780 word_list_.push_back(term);
781 } else {
782 word_id = *(available_words_.begin());
783 word_list_[word_id] = term;
784 available_words_.erase(word_id);
785 }
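// At this point |word_id| refers either to the slot just appended to
// word_list_ or to a previously freed slot recycled from available_words_
// (see RemoveRowWordsFromIndex()).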
786 word_map_[term] = word_id;
787
788 HistoryIDSet history_id_set;
789 history_id_set.insert(history_id);
790 word_id_history_map_[word_id] = history_id_set;
791 AddToHistoryIDWordMap(history_id, word_id);
792
793 // For each character in the newly added word (i.e. a word that is not
794 // already in the word index), add the word to the character index.
795 Char16Set characters = Char16SetFromString16(term);
796 for (Char16Set::iterator uni_char_iter = characters.begin();
797 uni_char_iter != characters.end(); ++uni_char_iter) {
798 base::char16 uni_char = *uni_char_iter;
799 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
800 if (char_iter != char_word_map_.end()) {
801 // Update existing entry in the char/word index.
802 WordIDSet& word_id_set(char_iter->second);
803 word_id_set.insert(word_id);
804 } else {
805 // Create a new entry in the char/word index.
806 WordIDSet word_id_set;
807 word_id_set.insert(word_id);
808 char_word_map_[uni_char] = word_id_set;
809 }
810 }
811 }
812
813 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
814 HistoryID history_id) {
815 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
816 DCHECK(history_pos != word_id_history_map_.end());
817 HistoryIDSet& history_id_set(history_pos->second);
818 history_id_set.insert(history_id);
819 AddToHistoryIDWordMap(history_id, word_id);
820 }
821
822 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
823 WordID word_id) {
824 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
825 if (iter != history_id_word_map_.end()) {
826 WordIDSet& word_id_set(iter->second);
827 word_id_set.insert(word_id);
828 } else {
829 WordIDSet word_id_set;
830 word_id_set.insert(word_id);
831 history_id_word_map_[history_id] = word_id_set;
832 }
833 }
834
835 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
836 RemoveRowWordsFromIndex(row);
837 HistoryID history_id = static_cast<HistoryID>(row.id());
838 history_info_map_.erase(history_id);
839 word_starts_map_.erase(history_id);
840 }
841
842 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
843 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
844 // this row.
845 HistoryID history_id = static_cast<HistoryID>(row.id());
846 WordIDSet word_id_set = history_id_word_map_[history_id];
847 history_id_word_map_.erase(history_id);
848
849 // Reconcile any changes to word usage.
850 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
851 word_id_iter != word_id_set.end(); ++word_id_iter) {
852 WordID word_id = *word_id_iter;
853 word_id_history_map_[word_id].erase(history_id);
854 if (!word_id_history_map_[word_id].empty())
855 continue; // The word is still in use.
856
857 // The word is no longer in use. Reconcile any changes to character usage.
858 base::string16 word = word_list_[word_id];
859 Char16Set characters = Char16SetFromString16(word);
860 for (Char16Set::iterator uni_char_iter = characters.begin();
861 uni_char_iter != characters.end(); ++uni_char_iter) {
862 base::char16 uni_char = *uni_char_iter;
863 char_word_map_[uni_char].erase(word_id);
864 if (char_word_map_[uni_char].empty())
865 char_word_map_.erase(uni_char); // No longer in use.
866 }
867
868 // Complete the removal of references to the word.
869 word_id_history_map_.erase(word_id);
870 word_map_.erase(word);
871 word_list_[word_id] = base::string16();
872 available_words_.insert(word_id);
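// (The blanked slot will be recycled by AddWordHistory() the next time a
// new word is indexed.)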
873 }
874 }
875
876 void URLIndexPrivateData::ResetSearchTermCache() {
877 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
878 iter != search_term_cache_.end(); ++iter)
879 iter->second.used_ = false;
880 }
881
882 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
883 base::TimeTicks beginning_time = base::TimeTicks::Now();
884 InMemoryURLIndexCacheItem index_cache;
885 SavePrivateData(&index_cache);
886 std::string data;
887 if (!index_cache.SerializeToString(&data)) {
888 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
889 return false;
890 }
891
892 int size = data.size();
893 if (base::WriteFile(file_path, data.c_str(), size) != size) {
894 LOG(WARNING) << "Failed to write " << file_path.value();
895 return false;
896 }
897 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
898 base::TimeTicks::Now() - beginning_time);
899 return true;
900 }
901
902 void URLIndexPrivateData::SavePrivateData(
903 InMemoryURLIndexCacheItem* cache) const {
904 DCHECK(cache);
905 cache->set_last_rebuild_timestamp(
906 last_time_rebuilt_from_history_.ToInternalValue());
907 cache->set_version(saved_cache_version_);
908 // history_item_count_ is no longer used, but rather than change the protobuf
909 // definition we use a placeholder. This will go away with the switch to SQLite.
910 cache->set_history_item_count(0);
911 SaveWordList(cache);
912 SaveWordMap(cache);
913 SaveCharWordMap(cache);
914 SaveWordIDHistoryMap(cache);
915 SaveHistoryInfoMap(cache);
916 SaveWordStartsMap(cache);
917 }
918
919 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
920 if (word_list_.empty())
921 return;
922 WordListItem* list_item = cache->mutable_word_list();
923 list_item->set_word_count(word_list_.size());
924 for (String16Vector::const_iterator iter = word_list_.begin();
925 iter != word_list_.end(); ++iter)
926 list_item->add_word(base::UTF16ToUTF8(*iter));
927 }
928
929 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
930 if (word_map_.empty())
931 return;
932 WordMapItem* map_item = cache->mutable_word_map();
933 map_item->set_item_count(word_map_.size());
934 for (WordMap::const_iterator iter = word_map_.begin();
935 iter != word_map_.end(); ++iter) {
936 WordMapEntry* map_entry = map_item->add_word_map_entry();
937 map_entry->set_word(base::UTF16ToUTF8(iter->first));
938 map_entry->set_word_id(iter->second);
939 }
940 }
941
942 void URLIndexPrivateData::SaveCharWordMap(
943 InMemoryURLIndexCacheItem* cache) const {
944 if (char_word_map_.empty())
945 return;
946 CharWordMapItem* map_item = cache->mutable_char_word_map();
947 map_item->set_item_count(char_word_map_.size());
948 for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
949 iter != char_word_map_.end(); ++iter) {
950 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
951 map_entry->set_char_16(iter->first);
952 const WordIDSet& word_id_set(iter->second);
953 map_entry->set_item_count(word_id_set.size());
954 for (WordIDSet::const_iterator set_iter = word_id_set.begin();
955 set_iter != word_id_set.end(); ++set_iter)
956 map_entry->add_word_id(*set_iter);
957 }
958 }
959
960 void URLIndexPrivateData::SaveWordIDHistoryMap(
961 InMemoryURLIndexCacheItem* cache) const {
962 if (word_id_history_map_.empty())
963 return;
964 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
965 map_item->set_item_count(word_id_history_map_.size());
966 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
967 iter != word_id_history_map_.end(); ++iter) {
968 WordIDHistoryMapEntry* map_entry =
969 map_item->add_word_id_history_map_entry();
970 map_entry->set_word_id(iter->first);
971 const HistoryIDSet& history_id_set(iter->second);
972 map_entry->set_item_count(history_id_set.size());
973 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
974 set_iter != history_id_set.end(); ++set_iter)
975 map_entry->add_history_id(*set_iter);
976 }
977 }
978
979 void URLIndexPrivateData::SaveHistoryInfoMap(
980 InMemoryURLIndexCacheItem* cache) const {
981 if (history_info_map_.empty())
982 return;
983 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
984 map_item->set_item_count(history_info_map_.size());
985 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
986 iter != history_info_map_.end(); ++iter) {
987 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
988 map_entry->set_history_id(iter->first);
989 const URLRow& url_row(iter->second.url_row);
990 // Note: We only save information that contributes to the index so there
991 // is no need to save search_term_cache_ (not persistent).
992 map_entry->set_visit_count(url_row.visit_count());
993 map_entry->set_typed_count(url_row.typed_count());
994 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
995 map_entry->set_url(url_row.url().spec());
996 map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
997 const VisitInfoVector& visits(iter->second.visits);
998 for (VisitInfoVector::const_iterator visit_iter = visits.begin();
999 visit_iter != visits.end(); ++visit_iter) {
1000 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1001 visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1002 visit_info->set_transition_type(visit_iter->second);
1003 }
1004 }
1005 }
1006
1007 void URLIndexPrivateData::SaveWordStartsMap(
1008 InMemoryURLIndexCacheItem* cache) const {
1009 if (word_starts_map_.empty())
1010 return;
1011 // For unit testing: Enable saving of the cache as an earlier version to
1012 // allow testing of cache file upgrading in ReadFromFile().
1013 // TODO(mrossetti): Instead of intruding on production code with this kind of
1014 // test harness, save a copy of an older version cache with known results.
1015 // Implement this when switching the caching over to SQLite.
1016 if (saved_cache_version_ < 1)
1017 return;
1018
1019 WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1020 map_item->set_item_count(word_starts_map_.size());
1021 for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1022 iter != word_starts_map_.end(); ++iter) {
1023 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1024 map_entry->set_history_id(iter->first);
1025 const RowWordStarts& word_starts(iter->second);
1026 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1027 i != word_starts.url_word_starts_.end(); ++i)
1028 map_entry->add_url_word_starts(*i);
1029 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1030 i != word_starts.title_word_starts_.end(); ++i)
1031 map_entry->add_title_word_starts(*i);
1032 }
1033 }
1034
1035 bool URLIndexPrivateData::RestorePrivateData(
1036 const InMemoryURLIndexCacheItem& cache,
1037 const std::string& languages) {
1038 last_time_rebuilt_from_history_ =
1039 base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1040 const base::TimeDelta rebuilt_ago =
1041 base::Time::Now() - last_time_rebuilt_from_history_;
1042 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1043 (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1044 // Cache is more than a week old or, somehow, from some time in the future.
1045 // It's probably a good time to rebuild the index from history to
1046 // allow synced entries to now appear, expired entries to disappear, etc.
1047 // Allow one day of future skew so that simple system clock changes, such as
1048 // time zone changes, do not force a rebuild.
1049 return false;
1050 }
1051 if (cache.has_version()) {
1052 if (cache.version() < kCurrentCacheFileVersion) {
1053 // Don't try to restore an old format cache file. (This will cause
1054 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1055 // from history.)
1056 return false;
1057 }
1058 restored_cache_version_ = cache.version();
1059 }
1060 return RestoreWordList(cache) && RestoreWordMap(cache) &&
1061 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1062 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
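// (&& short-circuits, so the first restore step that fails aborts the whole
// restore; the caller then falls back to rebuilding from history.)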
1063 }
1064
1065 bool URLIndexPrivateData::RestoreWordList(
1066 const InMemoryURLIndexCacheItem& cache) {
1067 if (!cache.has_word_list())
1068 return false;
1069 const WordListItem& list_item(cache.word_list());
1070 uint32 expected_item_count = list_item.word_count();
1071 uint32 actual_item_count = list_item.word_size();
1072 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1073 return false;
1074 const RepeatedPtrField<std::string>& words(list_item.word());
1075 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1076 iter != words.end(); ++iter)
1077 word_list_.push_back(base::UTF8ToUTF16(*iter));
1078 return true;
1079 }
1080
1081 bool URLIndexPrivateData::RestoreWordMap(
1082 const InMemoryURLIndexCacheItem& cache) {
1083 if (!cache.has_word_map())
1084 return false;
1085 const WordMapItem& list_item(cache.word_map());
1086 uint32 expected_item_count = list_item.item_count();
1087 uint32 actual_item_count = list_item.word_map_entry_size();
1088 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1089 return false;
1090 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1091 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1092 iter != entries.end(); ++iter)
1093 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1094 return true;
1095 }
1096
1097 bool URLIndexPrivateData::RestoreCharWordMap(
1098 const InMemoryURLIndexCacheItem& cache) {
1099 if (!cache.has_char_word_map())
1100 return false;
1101 const CharWordMapItem& list_item(cache.char_word_map());
1102 uint32 expected_item_count = list_item.item_count();
1103 uint32 actual_item_count = list_item.char_word_map_entry_size();
1104 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1105 return false;
1106 const RepeatedPtrField<CharWordMapEntry>&
1107 entries(list_item.char_word_map_entry());
1108 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1109 entries.begin(); iter != entries.end(); ++iter) {
1110 expected_item_count = iter->item_count();
1111 actual_item_count = iter->word_id_size();
1112 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1113 return false;
1114 base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1115 WordIDSet word_id_set;
1116 const RepeatedField<int32>& word_ids(iter->word_id());
1117 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1118 jiter != word_ids.end(); ++jiter)
1119 word_id_set.insert(*jiter);
1120 char_word_map_[uni_char] = word_id_set;
1121 }
1122 return true;
1123 }
1124
1125 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1126 const InMemoryURLIndexCacheItem& cache) {
1127 if (!cache.has_word_id_history_map())
1128 return false;
1129 const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1130 uint32 expected_item_count = list_item.item_count();
1131 uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1132 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1133 return false;
1134 const RepeatedPtrField<WordIDHistoryMapEntry>&
1135 entries(list_item.word_id_history_map_entry());
1136 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1137 entries.begin(); iter != entries.end(); ++iter) {
1138 expected_item_count = iter->item_count();
1139 actual_item_count = iter->history_id_size();
1140 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1141 return false;
1142 WordID word_id = iter->word_id();
1143 HistoryIDSet history_id_set;
1144 const RepeatedField<int64>& history_ids(iter->history_id());
1145 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1146 jiter != history_ids.end(); ++jiter) {
1147 history_id_set.insert(*jiter);
1148 AddToHistoryIDWordMap(*jiter, word_id);
1149 }
1150 word_id_history_map_[word_id] = history_id_set;
1151 }
1152 return true;
1153 }
1154
1155 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1156 const InMemoryURLIndexCacheItem& cache) {
1157 if (!cache.has_history_info_map())
1158 return false;
1159 const HistoryInfoMapItem& list_item(cache.history_info_map());
1160 uint32 expected_item_count = list_item.item_count();
1161 uint32 actual_item_count = list_item.history_info_map_entry_size();
1162 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1163 return false;
1164 const RepeatedPtrField<HistoryInfoMapEntry>&
1165 entries(list_item.history_info_map_entry());
1166 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1167 entries.begin(); iter != entries.end(); ++iter) {
1168 HistoryID history_id = iter->history_id();
1169 GURL url(iter->url());
1170 URLRow url_row(url, history_id);
1171 url_row.set_visit_count(iter->visit_count());
1172 url_row.set_typed_count(iter->typed_count());
1173 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1174 if (iter->has_title()) {
1175 base::string16 title(base::UTF8ToUTF16(iter->title()));
1176 url_row.set_title(title);
1177 }
1178 history_info_map_[history_id].url_row = url_row;
1179
1180 // Restore visits list.
1181 VisitInfoVector visits;
1182 visits.reserve(iter->visits_size());
1183 for (int i = 0; i < iter->visits_size(); ++i) {
1184 visits.push_back(std::make_pair(
1185 base::Time::FromInternalValue(iter->visits(i).visit_time()),
1186 ui::PageTransitionFromInt(iter->visits(i).transition_type())));
1187 }
1188 history_info_map_[history_id].visits = visits;
1189 }
1190 return true;
1191 }
1192
1193 bool URLIndexPrivateData::RestoreWordStartsMap(
1194 const InMemoryURLIndexCacheItem& cache,
1195 const std::string& languages) {
1196 // Note that this function must be called after RestoreHistoryInfoMap() has
1197 // been run as the word starts may have to be recalculated from the urls and
1198 // page titles.
1199 if (cache.has_word_starts_map()) {
1200 const WordStartsMapItem& list_item(cache.word_starts_map());
1201 uint32 expected_item_count = list_item.item_count();
1202 uint32 actual_item_count = list_item.word_starts_map_entry_size();
1203 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1204 return false;
1205 const RepeatedPtrField<WordStartsMapEntry>&
1206 entries(list_item.word_starts_map_entry());
1207 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1208 entries.begin(); iter != entries.end(); ++iter) {
1209 HistoryID history_id = iter->history_id();
1210 RowWordStarts word_starts;
1211 // Restore the URL word starts.
1212 const RepeatedField<int32>& url_starts(iter->url_word_starts());
1213 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1214 jiter != url_starts.end(); ++jiter)
1215 word_starts.url_word_starts_.push_back(*jiter);
1216 // Restore the page title word starts.
1217 const RepeatedField<int32>& title_starts(iter->title_word_starts());
1218 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1219 jiter != title_starts.end(); ++jiter)
1220 word_starts.title_word_starts_.push_back(*jiter);
1221 word_starts_map_[history_id] = word_starts;
1222 }
1223 } else {
1224 // Since the cache did not contain any word starts we must rebuild them from
1225 // the URL and page titles.
1226 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1227 iter != history_info_map_.end(); ++iter) {
1228 RowWordStarts word_starts;
1229 const URLRow& row(iter->second.url_row);
1230 const base::string16& url =
1231 bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
1232 String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1233 const base::string16& title =
1234 bookmarks::CleanUpTitleForMatching(row.title());
1235 String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1236 word_starts_map_[iter->first] = word_starts;
1237 }
1238 }
1239 return true;
1240 }
1241
1242 // static
1243 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1244 const GURL& gurl,
1245 const std::set<std::string>& whitelist) {
1246 return whitelist.find(gurl.scheme()) != whitelist.end();
1247 }
1248
1249
1250 // SearchTermCacheItem ---------------------------------------------------------
1251
1252 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1253 const WordIDSet& word_id_set,
1254 const HistoryIDSet& history_id_set)
1255 : word_id_set_(word_id_set),
1256 history_id_set_(history_id_set),
1257 used_(true) {}
1258
1259 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
1260 : used_(true) {}
1261
1262 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
1263
1264
1265 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1266
1267 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1268 const URLIndexPrivateData& private_data,
1269 const std::string& languages,
1270 const base::string16& lower_string,
1271 const String16Vector& lower_terms,
1272 const base::Time now,
1273 const ScoredHistoryMatch::Builder& builder)
1274 : private_data_(private_data),
1275 languages_(languages),
1276 builder_(builder),
1277 lower_string_(lower_string),
1278 lower_terms_(lower_terms),
1279 now_(now) {
1280 // Calculate offsets for each term. For instance, the offset for
1281 // ".net" should be 1, indicating that the actual word-part of the term
1282 // starts at offset 1.
1283 lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
1284 for (size_t i = 0; i < lower_terms_.size(); ++i) {
1285 base::i18n::BreakIterator iter(lower_terms_[i],
1286 base::i18n::BreakIterator::BREAK_WORD);
1287 // If the iterator doesn't work, assume an offset of 0.
1288 if (!iter.Init())
1289 continue;
1290 // Find the first word start.
1291 while (iter.Advance() && !iter.IsWord()) {}
1292 if (iter.IsWord())
1293 lower_terms_to_word_starts_offsets_[i] = iter.prev();
1294 // Else: the iterator didn't find a word break. Assume an offset of 0.
1295 }
1296 }
1297
1298 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
1299
1300 void URLIndexPrivateData::AddHistoryMatch::operator()(
1301 const HistoryID history_id) {
1302 HistoryInfoMap::const_iterator hist_pos =
1303 private_data_.history_info_map_.find(history_id);
1304 if (hist_pos != private_data_.history_info_map_.end()) {
1305 const URLRow& hist_item = hist_pos->second.url_row;
1306 const VisitInfoVector& visits = hist_pos->second.visits;
1307 WordStartsMap::const_iterator starts_pos =
1308 private_data_.word_starts_map_.find(history_id);
1309 DCHECK(starts_pos != private_data_.word_starts_map_.end());
1310 ScoredHistoryMatch match = builder_.Build(
1311 hist_item, visits, languages_, lower_string_, lower_terms_,
1312 lower_terms_to_word_starts_offsets_, starts_pos->second, now_);
1313 if (match.raw_score > 0)
1314 scored_matches_.push_back(match);
1315 }
1316 }
1317
1318
1319 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1320
1321 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1322 const HistoryInfoMap& history_info_map)
1323 : history_info_map_(history_info_map) {
1324 }
1325
1326 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
1327
1328 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1329 const HistoryID h1,
1330 const HistoryID h2) {
1331 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1332 if (entry1 == history_info_map_.end())
1333 return false;
1334 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1335 if (entry2 == history_info_map_.end())
1336 return true;
1337 const URLRow& r1(entry1->second.url_row);
1338 const URLRow& r2(entry2->second.url_row);
1339 // First cut: typed count, visit count, recency.
1340 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1341 // recently visited (within the last 12/24 hours) as highly important. Get
1342 // input from mpearson.
1343 if (r1.typed_count() != r2.typed_count())
1344 return (r1.typed_count() > r2.typed_count());
1345 if (r1.visit_count() != r2.visit_count())
1346 return (r1.visit_count() > r2.visit_count());
1347 return (r1.last_visit() > r2.last_visit());
1348 }
1349
1350 } // namespace history