Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/history/in_memory_url_index.h" | 5 #include "chrome/browser/history/in_memory_url_index.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <iterator> | 8 #include <iterator> |
| 9 #include <limits> | 9 #include <limits> |
| 10 #include <numeric> | |
| 10 | 11 |
| 11 #include "base/file_util.h" | 12 #include "base/file_util.h" |
| 12 #include "base/i18n/break_iterator.h" | 13 #include "base/i18n/break_iterator.h" |
| 13 #include "base/metrics/histogram.h" | 14 #include "base/metrics/histogram.h" |
| 14 #include "base/string_util.h" | 15 #include "base/string_util.h" |
| 15 #include "base/time.h" | 16 #include "base/time.h" |
| 16 #include "base/utf_string_conversions.h" | 17 #include "base/utf_string_conversions.h" |
| 17 #include "chrome/browser/autocomplete/history_provider_util.h" | 18 #include "chrome/browser/autocomplete/history_provider_util.h" |
| 18 #include "chrome/browser/history/url_database.h" | 19 #include "chrome/browser/history/url_database.h" |
| 19 #include "chrome/browser/profiles/profile.h" | 20 #include "chrome/browser/profiles/profile.h" |
| 20 #include "net/base/escape.h" | 21 #include "net/base/escape.h" |
| 21 #include "net/base/net_util.h" | 22 #include "net/base/net_util.h" |
| 22 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | 23 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" |
| 23 #include "ui/base/l10n/l10n_util.h" | 24 #include "ui/base/l10n/l10n_util.h" |
| 24 | 25 |
| 25 using google::protobuf::RepeatedField; | 26 using google::protobuf::RepeatedField; |
| 26 using google::protobuf::RepeatedPtrField; | 27 using google::protobuf::RepeatedPtrField; |
| 28 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
| 27 | 29 |
| 28 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
| 29 namespace history { | 30 namespace history { |
| 30 | 31 |
| 31 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; | 32 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; |
| 32 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; | 33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; |
| 33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | 34 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; |
| 34 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; | 35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; |
| 35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry | 36 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry |
| 36 CharWordMapEntry; | 37 CharWordMapEntry; |
| 37 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem | 38 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem |
| 38 WordIDHistoryMapItem; | 39 WordIDHistoryMapItem; |
| 39 typedef imui:: | 40 typedef imui:: |
| 40 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry | 41 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry |
| 41 WordIDHistoryMapEntry; | 42 WordIDHistoryMapEntry; |
| 42 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; | 43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; |
| 43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry | 44 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry |
| 44 HistoryInfoMapEntry; | 45 HistoryInfoMapEntry; |
| 45 | 46 |
| 46 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; | 47 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; |
| 47 | 48 |
| 49 const float kOrderMaxValue = 50.0; | |
| 50 const float kStartMaxValue = 50.0; | |
| 51 const size_t kMaxSignificantStart = 20; | |
| 52 const float kCompleteMaxValue = 50.0; | |
| 53 const float kLastVisitMaxValue = 50.0; | |
| 54 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); | |
| 55 const float kVisitCountMaxValue = 50.0; | |
| 56 const int kMaxSignificantVisits = 20; | |
| 57 const float kTypedCountMaxValue = 50.0; | |
| 58 const int kMaxSignificantTyped = 20; | |
| 59 const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + | |
| 60 kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue; | |
| 61 const float kMaxNormalizedRawScore = 1000.0; | |
| 62 | |
| 48 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} | 63 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} |
| 49 | 64 |
| 50 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, | 65 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info) |
| 51 size_t input_location, | 66 : HistoryMatch(url_info, 0, false, false), |
| 52 bool match_in_scheme, | 67 raw_score(0) {} |
| 53 bool innermost_match, | |
| 54 int score) | |
| 55 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), | |
| 56 raw_score(score) { | |
| 57 } | |
| 58 | 68 |
| 59 struct InMemoryURLIndex::TermCharWordSet { | 69 struct InMemoryURLIndex::TermCharWordSet { |
| 60 TermCharWordSet() // Required for STL resize(). | 70 TermCharWordSet() // Required for STL resize(). |
| 61 : char_(0), | 71 : char_(0), |
| 62 word_id_set_(), | 72 word_id_set_(), |
| 63 used_(false) {} | 73 used_(false) {} |
| 64 TermCharWordSet(const char16& uni_char, | 74 TermCharWordSet(const char16& uni_char, |
| 65 const WordIDSet& word_id_set, | 75 const WordIDSet& word_id_set, |
| 66 bool used) | 76 bool used) |
| 67 : char_(uni_char), | 77 : char_(uni_char), |
| 68 word_id_set_(word_id_set), | 78 word_id_set_(word_id_set), |
| 69 used_(used) {} | 79 used_(used) {} |
| 70 | 80 |
| 71 // Predicate for STL algorithm use. | 81 // Predicate for STL algorithm use. |
| 72 bool is_not_used() const { return !used_; } | 82 bool is_not_used() const { return !used_; } |
| 73 | 83 |
| 74 char16 char_; | 84 char16 char_; |
| 75 WordIDSet word_id_set_; | 85 WordIDSet word_id_set_; |
| 76 bool used_; // true if this set has been used for the current term search. | 86 bool used_; // true if this set has been used for the current term search. |
| 77 }; | 87 }; |
| 78 | 88 |
| 89 // Comparison function for sorting TermMatches by their offsets. | |
| 90 bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) { | |
| 91 return m1.offset < m2.offset; | |
| 92 } | |
| 93 | |
| 94 // std::accumulate helper function to add up TermMatches' lengths. | |
| 95 int AddumulateMatchLength(int total, const TermMatch& match) { | |
|
Peter Kasting
2011/03/09 02:31:48
Nit: Addumulate -> Accumulate
mrossetti
2011/03/10 01:31:14
LOL
| |
| 96 return total + match.length; | |
| 97 } | |
| 98 | |
| 79 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) | 99 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) |
| 80 : history_dir_(history_dir), | 100 : history_dir_(history_dir), |
| 81 history_item_count_(0) { | 101 history_item_count_(0) { |
| 82 } | 102 } |
| 83 | 103 |
| 84 // Called only by unit tests. | 104 // Called only by unit tests. |
| 85 InMemoryURLIndex::InMemoryURLIndex() | 105 InMemoryURLIndex::InMemoryURLIndex() |
| 86 : history_item_count_(0) { | 106 : history_item_count_(0) { |
| 87 } | 107 } |
| 88 | 108 |
| (...skipping 13 matching lines...) Expand all Loading... | |
| 102 SaveToCacheFile(); | 122 SaveToCacheFile(); |
| 103 } | 123 } |
| 104 | 124 |
| 105 bool InMemoryURLIndex::IndexRow(const URLRow& row) { | 125 bool InMemoryURLIndex::IndexRow(const URLRow& row) { |
| 106 const GURL& gurl(row.url()); | 126 const GURL& gurl(row.url()); |
| 107 string16 url(net::FormatUrl(gurl, languages_, | 127 string16 url(net::FormatUrl(gurl, languages_, |
| 108 net::kFormatUrlOmitUsernamePassword, | 128 net::kFormatUrlOmitUsernamePassword, |
| 109 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, | 129 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, |
| 110 NULL, NULL, NULL)); | 130 NULL, NULL, NULL)); |
| 111 | 131 |
| 112 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). | |
| 113 HistoryID history_id = static_cast<HistoryID>(row.id()); | 132 HistoryID history_id = static_cast<HistoryID>(row.id()); |
| 133 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max()); | |
| 114 | 134 |
| 115 // Add the row for quick lookup in the history info store. | 135 // Add the row for quick lookup in the history info store. |
| 116 url = l10n_util::ToLower(url); | |
| 117 URLRow new_row(GURL(url), row.id()); | 136 URLRow new_row(GURL(url), row.id()); |
| 118 new_row.set_visit_count(row.visit_count()); | 137 new_row.set_visit_count(row.visit_count()); |
| 119 new_row.set_typed_count(row.typed_count()); | 138 new_row.set_typed_count(row.typed_count()); |
| 120 new_row.set_last_visit(row.last_visit()); | 139 new_row.set_last_visit(row.last_visit()); |
| 121 new_row.set_title(row.title()); | 140 new_row.set_title(row.title()); |
| 122 history_info_map_[history_id] = new_row; | 141 history_info_map_[history_id] = new_row; |
| 123 | 142 |
| 124 // Split into individual, unique words. | 143 // Split URL into individual, unique words then add in the title words. |
| 125 String16Set words = WordSetFromString16(url); | 144 url = l10n_util::ToLower(url); |
| 145 String16Set url_words = WordSetFromString16(url); | |
| 146 String16Set title_words = WordSetFromString16(row.title()); | |
| 147 String16Set words; | |
| 148 std::set_union(url_words.begin(), url_words.end(), | |
| 149 title_words.begin(), title_words.end(), | |
| 150 std::insert_iterator<String16Set>(words, words.begin())); | |
| 151 for (String16Set::iterator word_iter = words.begin(); | |
| 152 word_iter != words.end(); ++word_iter) | |
| 153 AddWordToIndex(*word_iter, history_id); | |
| 126 | 154 |
| 127 // For each word, add a new entry into the word index referring to the | |
| 128 // associated history item. | |
| 129 for (String16Set::iterator iter = words.begin(); | |
| 130 iter != words.end(); ++iter) { | |
| 131 String16Set::value_type uni_word = *iter; | |
| 132 AddWordToIndex(uni_word, history_id); | |
| 133 } | |
| 134 ++history_item_count_; | 155 ++history_item_count_; |
| 135 return true; | 156 return true; |
| 136 } | 157 } |
| 137 | 158 |
| 138 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db, | 159 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db, |
| 139 bool clear_cache) { | 160 bool clear_cache) { |
| 140 ClearPrivateData(); | 161 ClearPrivateData(); |
| 141 | 162 |
| 142 if (!history_db) | 163 if (!history_db) |
| 143 return false; | 164 return false; |
| (...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 285 ScoredHistoryMatches scored_items; | 306 ScoredHistoryMatches scored_items; |
| 286 if (!terms.empty()) { | 307 if (!terms.empty()) { |
| 287 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- | 308 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- |
| 288 // and-sweep approach. | 309 // and-sweep approach. |
| 289 ResetTermCharWordSetCache(); | 310 ResetTermCharWordSetCache(); |
| 290 | 311 |
| 291 // Lowercase the terms. | 312 // Lowercase the terms. |
| 292 // TODO(mrossetti): Another opportunity for a transform algorithm. | 313 // TODO(mrossetti): Another opportunity for a transform algorithm. |
| 293 String16Vector lower_terms; | 314 String16Vector lower_terms; |
| 294 for (String16Vector::const_iterator term_iter = terms.begin(); | 315 for (String16Vector::const_iterator term_iter = terms.begin(); |
| 295 term_iter != terms.end(); ++term_iter) { | 316 term_iter != terms.end(); ++term_iter) |
| 296 String16Vector::value_type lower_string(*term_iter); | 317 lower_terms.push_back(InMemoryURLIndex::ToLower(*term_iter)); |
| 297 std::transform(lower_string.begin(), | |
| 298 lower_string.end(), | |
| 299 lower_string.begin(), | |
| 300 tolower); | |
| 301 lower_terms.push_back(lower_string); | |
| 302 } | |
| 303 | 318 |
| 304 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); | 319 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); |
| 305 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); | 320 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); |
| 306 | 321 |
| 307 // Pass over all of the candidates filtering out any without a proper | 322 // Pass over all of the candidates filtering out any without a proper |
| 308 // substring match, inserting those which pass in order by score. | 323 // substring match, inserting those which pass in order by score. |
| 309 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | 324 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), |
| 310 AddHistoryMatch(*this, | 325 AddHistoryMatch(*this, |
| 311 lower_terms)).ScoredMatches(); | 326 lower_terms)).ScoredMatches(); |
| 312 } | 327 } |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 365 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( | 380 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( |
| 366 const string16& uni_word) { | 381 const string16& uni_word) { |
| 367 InMemoryURLIndex::HistoryIDSet history_id_set; | 382 InMemoryURLIndex::HistoryIDSet history_id_set; |
| 368 | 383 |
| 369 // For each unique character in the word, in order of first appearance, get | 384 // For each unique character in the word, in order of first appearance, get |
| 370 // the char/word_id map entry and intersect with the set in an incremental | 385 // the char/word_id map entry and intersect with the set in an incremental |
| 371 // manner. | 386 // manner. |
| 372 Char16Vector uni_chars = Char16VectorFromString16(uni_word); | 387 Char16Vector uni_chars = Char16VectorFromString16(uni_word); |
| 373 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); | 388 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); |
| 374 | 389 |
| 390 // TODO(mrossetti): At this point, as a possible optimization, we could | |
| 391 // scan through all candidate words and make sure the |uni_word| is a | |
| 392 // substring within the candidate words, eliminating those which aren't. | |
| 393 // I'm not sure it would be worth the effort. And remember, we've got to do | |
| 394 // a final substring match in order to get the highlighting ranges later | |
| 395 // in the process in any case. | |
| 396 | |
| 375 // If any words resulted then we can compose a set of history IDs by unioning | 397 // If any words resulted then we can compose a set of history IDs by unioning |
| 376 // the sets from each word. | 398 // the sets from each word. |
| 377 if (!word_id_set.empty()) { | 399 if (!word_id_set.empty()) { |
| 378 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | 400 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); |
| 379 word_id_iter != word_id_set.end(); ++word_id_iter) { | 401 word_id_iter != word_id_set.end(); ++word_id_iter) { |
| 380 WordID word_id = *word_id_iter; | 402 WordID word_id = *word_id_iter; |
| 381 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); | 403 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); |
| 382 if (word_iter != word_id_history_map_.end()) { | 404 if (word_iter != word_id_history_map_.end()) { |
| 383 HistoryIDSet& word_history_id_set(word_iter->second); | 405 HistoryIDSet& word_history_id_set(word_iter->second); |
| 384 history_id_set.insert(word_history_id_set.begin(), | 406 history_id_set.insert(word_history_id_set.begin(), |
| 385 word_history_id_set.end()); | 407 word_history_id_set.end()); |
| 386 } | 408 } |
| 387 } | 409 } |
| 388 } | 410 } |
| 389 | 411 |
| 390 return history_id_set; | 412 return history_id_set; |
| 391 } | 413 } |
| 392 | 414 |
| 393 // Utility Functions | 415 // Utility Functions |
| 394 | 416 |
| 395 // static | 417 // static |
| 418 string16 InMemoryURLIndex::ToLower(const string16& upper_string) { | |
| 419 string16 lower_string(upper_string); | |
|
Peter Kasting
2011/03/09 02:31:48
Nit: How about:
string16 lower_string;
lower_
mrossetti
2011/03/10 01:31:14
This function replaced with l10n_util::ToLower().
| |
| 420 std::transform(lower_string.begin(), | |
| 421 lower_string.end(), | |
| 422 lower_string.begin(), | |
| 423 tolower); | |
|
Peter Kasting
2011/03/09 02:31:48
Is this calling the C tolower() function? If so,
mrossetti
2011/03/10 01:31:14
Done. Replaced function.
| |
| 424 return lower_string; | |
| 425 } | |
| 426 | |
| 427 // static | |
| 396 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( | 428 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( |
| 397 const string16& uni_string) { | 429 const string16& uni_string) { |
| 398 String16Set words; | 430 String16Vector words = WordVectorFromString16(uni_string, false); |
| 399 base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD); | 431 String16Set word_set; |
| 400 if (iter.Init()) { | 432 for (String16Vector::const_iterator iter = words.begin(); iter != words.end(); |
| 401 while (iter.Advance()) { | 433 ++iter) |
| 434 word_set.insert(InMemoryURLIndex::ToLower(*iter)); | |
| 435 return word_set; | |
| 436 } | |
| 437 | |
| 438 // static | |
| 439 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16( | |
| 440 const string16& uni_string, | |
| 441 bool break_on_space) { | |
| 442 base::BreakIterator iter(&uni_string, break_on_space ? | |
| 443 base::BreakIterator::BREAK_SPACE : | |
|
Peter Kasting
2011/03/09 02:31:48
Nit: You can collapse lines 2 & 3 to a single line
mrossetti
2011/03/10 01:31:14
Done.
| |
| 444 base::BreakIterator::BREAK_WORD); | |
| 445 String16Vector words; | |
| 446 if (break_on_space) { | |
| 447 if (iter.Init()) { | |
| 448 while (iter.Advance()) { | |
| 449 string16 word = iter.GetString(); | |
| 450 TrimWhitespace(word, TRIM_ALL, &word); | |
| 451 if (word.size()) | |
|
Peter Kasting
2011/03/09 02:31:48
Nit: !word.empty()
mrossetti
2011/03/10 01:31:14
Done.
| |
| 452 words.push_back(word); | |
| 453 } | |
| 454 } | |
| 455 } else { | |
| 456 if (iter.Init()) { | |
| 402 if (iter.IsWord()) | 457 if (iter.IsWord()) |
| 403 words.insert(iter.GetString()); | 458 words.push_back(iter.GetString()); |
|
Peter Kasting
2011/03/09 02:31:48
I confess, I'm a little confused why one arm of th
mrossetti
2011/03/10 01:31:14
The iterator behaves differently depending on the
Peter Kasting
2011/03/10 02:13:43
If BreakIterator should be changed, can you file a
| |
| 459 while (iter.Advance()) { | |
| 460 if (iter.IsWord()) | |
| 461 words.push_back(iter.GetString()); | |
| 462 } | |
| 404 } | 463 } |
| 405 } | 464 } |
| 406 return words; | 465 return words; |
| 407 } | 466 } |
| 408 | 467 |
| 409 // static | 468 // static |
| 410 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16( | 469 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16( |
| 411 const string16& uni_word) { | 470 const string16& uni_word) { |
| 412 InMemoryURLIndex::Char16Vector characters; | 471 InMemoryURLIndex::Char16Vector characters; |
| 413 InMemoryURLIndex::Char16Set unique_characters; | 472 InMemoryURLIndex::Char16Set unique_characters; |
| (...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 539 for (Char16Vector::const_iterator c_iter = uni_chars.begin(); | 598 for (Char16Vector::const_iterator c_iter = uni_chars.begin(); |
| 540 (c_iter != uni_chars.end()) && | 599 (c_iter != uni_chars.end()) && |
| 541 (t_iter != term_char_word_set_cache_.end()) && | 600 (t_iter != term_char_word_set_cache_.end()) && |
| 542 (*c_iter == t_iter->char_); | 601 (*c_iter == t_iter->char_); |
| 543 ++c_iter, ++t_iter) | 602 ++c_iter, ++t_iter) |
| 544 t_iter->used_ = true; // Update the cache. | 603 t_iter->used_ = true; // Update the cache. |
| 545 return t_iter - term_char_word_set_cache_.begin() - 1; | 604 return t_iter - term_char_word_set_cache_.begin() - 1; |
| 546 } | 605 } |
| 547 | 606 |
| 548 // static | 607 // static |
| 549 int InMemoryURLIndex::RawScoreForURL(const URLRow& row, | 608 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term, |
| 550 const String16Vector& terms, | 609 const string16& string, |
| 551 size_t* first_term_location) { | 610 int term_num) { |
| 611 TermMatches matches; | |
| 612 for (size_t location = string.find(term); location != string16::npos; | |
| 613 location = string.find(term, location + 1)) | |
| 614 matches.push_back(TermMatch(term_num, location, term.size())); | |
| 615 return matches; | |
| 616 } | |
| 617 | |
| 618 // static | |
| 619 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) { | |
| 620 if (matches.empty()) | |
| 621 return matches; | |
| 622 TermMatches sorted_matches = matches; | |
| 623 std::sort(sorted_matches.begin(), sorted_matches.end(), | |
| 624 CompareMatchOffset); | |
| 625 TermMatches clean_matches; | |
| 626 TermMatch last_match = sorted_matches[0]; | |
| 627 clean_matches.push_back(last_match); | |
| 628 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1; | |
| 629 iter != sorted_matches.end(); ++iter) { | |
| 630 if (iter->offset >= last_match.offset + last_match.length) { | |
| 631 last_match = *iter; | |
| 632 clean_matches.push_back(last_match); | |
| 633 } | |
| 634 } | |
| 635 return clean_matches; | |
| 636 } | |
| 637 | |
| 638 // static | |
| 639 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL( | |
| 640 const URLRow& row, | |
| 641 const String16Vector& terms) { | |
| 642 ScoredHistoryMatch match(row); | |
| 552 GURL gurl = row.url(); | 643 GURL gurl = row.url(); |
| 553 if (!gurl.is_valid()) | 644 if (!gurl.is_valid()) |
| 554 return 0; | 645 return match; |
| 555 | 646 |
| 556 string16 url = UTF8ToUTF16(gurl.spec()); | 647 // Figure out where each search term appears in the URL and/or page title |
| 557 | 648 // so that we can score as well as provide autocomplete highlighting. |
| 558 // Collect all term start positions so we can see if they appear in order. | 649 string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec())); |
| 559 std::vector<size_t> term_locations; | 650 url = l10n_util::ToLower(url); |
| 560 int out_of_order = 0; // Count the terms which are out of order. | 651 string16 title = row.title(); |
| 561 size_t start_location_total = 0; | 652 int term_num = 0; |
| 562 size_t term_length_total = 0; | |
| 563 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); | 653 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); |
| 564 ++iter) { | 654 ++iter, ++term_num) { |
| 565 string16 term = *iter; | 655 string16 term = *iter; |
| 566 size_t term_location = url.find(term); | 656 TermMatches url_term_matches = MatchTermInString(term, url, term_num); |
| 567 if (term_location == string16::npos) | 657 TermMatches title_term_matches = MatchTermInString(term, title, term_num); |
| 568 return 0; // A term was not found. This should not happen. | 658 if (url_term_matches.empty() && title_term_matches.empty()) |
| 569 if (iter != terms.begin()) { | 659 return match; // A term was not found in either URL or title - reject. |
| 570 // See if this term is out-of-order. | 660 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(), |
| 571 for (std::vector<size_t>::const_iterator order_iter = | 661 url_term_matches.end()); |
| 572 term_locations.begin(); order_iter != term_locations.end(); | 662 match.title_matches.insert(match.title_matches.end(), |
| 573 ++order_iter) { | 663 title_term_matches.begin(), |
| 574 if (term_location <= *order_iter) | 664 title_term_matches.end()); |
| 575 ++out_of_order; | |
| 576 } | |
| 577 } else { | |
| 578 *first_term_location = term_location; | |
| 579 } | |
| 580 term_locations.push_back(term_location); | |
| 581 start_location_total += term_location; | |
| 582 term_length_total += term.size(); | |
| 583 } | 665 } |
| 584 | 666 |
| 585 // Calculate a raw score. | 667 // Sort matches by offset and eliminate any which overlap. |
| 586 // TODO(mrossetti): This is good enough for now but must be fine-tuned. | 668 match.url_matches = SortAndDeoverlap(match.url_matches); |
| 587 const float kOrderMaxValue = 10.0; | 669 match.title_matches = SortAndDeoverlap(match.title_matches); |
| 588 float order_value = 10.0; | |
| 589 if (terms.size() > 1) { | |
| 590 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; | |
| 591 order_value = | |
| 592 (static_cast<float>(max_possible_out_of_order - out_of_order) / | |
| 593 max_possible_out_of_order) * kOrderMaxValue; | |
| 594 } | |
| 595 | 670 |
| 596 const float kStartMaxValue = 10.0; | 671 // Get partial scores based on term matching. Note that the score for |
| 597 const size_t kMaxSignificantStart = 20; | 672 // each of the URL and title are adjusted by the fraction of the |
| 598 float start_value = | 673 // terms appearing in each. |
| 599 (static_cast<float>(kMaxSignificantStart - | 674 int url_score = RawScoreForMatches(match.url_matches, url.size()) * |
| 600 std::min(kMaxSignificantStart, start_location_total / terms.size()))) / | 675 match.url_matches.size() / terms.size(); |
| 601 static_cast<float>(kMaxSignificantStart) * kStartMaxValue; | 676 int title_score = RawScoreForMatches(match.title_matches, title.size()) * |
| 677 match.title_matches.size() / terms.size(); | |
| 678 // Arbitrarily pick the best. | |
| 679 // TODO(mrossetti): It might make sense that a term which appears in both the | |
| 680 // URL and the Title should boost the score a bit. | |
| 681 int term_score = std::max(url_score, title_score); | |
| 602 | 682 |
| 603 const float kCompleteMaxValue = 10.0; | 683 // Factor in attributes of the URLRow. |
| 604 float complete_value = | 684 // Items which have been visited recently score higher. |
| 605 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) * | |
| 606 kStartMaxValue; | |
| 607 | |
| 608 const float kLastVisitMaxValue = 10.0; | |
| 609 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); | |
| 610 int64 delta_time = (kMaxSignificantDay - | 685 int64 delta_time = (kMaxSignificantDay - |
| 611 std::min((base::Time::Now() - row.last_visit()), | 686 std::min((base::Time::Now() - row.last_visit()), |
| 612 kMaxSignificantDay)).ToInternalValue(); | 687 kMaxSignificantDay)).ToInternalValue(); |
| 613 float last_visit_value = | 688 float last_visit_value = |
| 614 (static_cast<float>(delta_time) / | 689 (static_cast<float>(delta_time) / |
| 615 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * | 690 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * |
| 616 kLastVisitMaxValue; | 691 kLastVisitMaxValue; |
| 617 | 692 |
| 618 const float kVisitCountMaxValue = 10.0; | |
| 619 const int kMaxSignificantVisits = 10; | |
| 620 float visit_count_value = | 693 float visit_count_value = |
| 621 (static_cast<float>(std::min(row.visit_count(), | 694 (static_cast<float>(std::min(row.visit_count(), |
| 622 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * | 695 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * |
| 623 kVisitCountMaxValue; | 696 kVisitCountMaxValue; |
| 624 | 697 |
| 625 const float kTypedCountMaxValue = 20.0; | |
| 626 const int kMaxSignificantTyped = 10; | |
| 627 float typed_count_value = | 698 float typed_count_value = |
| 628 (static_cast<float>(std::min(row.typed_count(), | 699 (static_cast<float>(std::min(row.typed_count(), |
| 629 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * | 700 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * |
| 630 kTypedCountMaxValue; | 701 kTypedCountMaxValue; |
| 631 | 702 |
| 632 float raw_score = order_value + start_value + complete_value + | 703 float raw_score = term_score + last_visit_value + visit_count_value + |
| 633 last_visit_value + visit_count_value + typed_count_value; | 704 typed_count_value; |
| 634 | 705 |
| 635 // Normalize the score. | 706 // Normalize the score. |
| 636 const float kMaxNormalizedRawScore = 1000.0; | 707 match.raw_score = static_cast<int>((raw_score / kMaxRawScore) * |
| 637 raw_score = | 708 kMaxNormalizedRawScore); |
| 638 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + | 709 |
| 639 kLastVisitMaxValue + kVisitCountMaxValue + | 710 return match; |
| 640 kTypedCountMaxValue)) * | |
| 641 kMaxNormalizedRawScore; | |
| 642 return static_cast<int>(raw_score); | |
| 643 } | 711 } |
| 644 | 712 |
| 645 // static | 713 int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches, |
| 646 base::Time InMemoryURLIndex::RecentThreshold() { | 714 size_t max_length) { |
| 647 return base::Time::Now() - | 715 // TODO(mrossetti): This is good enough for now but must be fine-tuned. |
| 648 base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); | 716 if (matches.empty()) |
| 717 return 0; | |
| 718 | |
| 719 // Search terms appearing in order score higher. | |
| 720 float order_value = 10.0; | |
| 721 if (matches.size() > 1) { | |
| 722 int max_possible_out_of_order = matches.size() - 1; | |
| 723 int out_of_order = 0; | |
| 724 for (size_t i = 1; i < matches.size(); ++i) { | |
| 725 if (matches[i-1].term_num > matches[i].term_num) | |
|
Peter Kasting
2011/03/09 02:31:48
Nit: Spaces around '-'
mrossetti
2011/03/10 01:31:14
Done.
| |
| 726 ++out_of_order; | |
|
Peter Kasting
2011/03/09 02:31:48
Nit: Indent 2
mrossetti
2011/03/10 01:31:14
Done.
| |
| 727 } | |
| 728 order_value = | |
| 729 (static_cast<float>(max_possible_out_of_order - out_of_order) / | |
| 730 max_possible_out_of_order) * kOrderMaxValue; | |
| 731 } | |
| 732 | |
| 733 // Search terms which appear earlier score higher. | |
| 734 // No points if the first search term does not appear in the first | |
| 735 // kMaxSignificantStart chars. | |
| 736 float start_value = | |
| 737 (static_cast<float>(kMaxSignificantStart - | |
| 738 std::min(kMaxSignificantStart, matches[0].offset))) / | |
| 739 static_cast<float>(kMaxSignificantStart) * kStartMaxValue; | |
|
Peter Kasting
2011/03/09 02:31:48
Nit: Is this second cast needed?
mrossetti
2011/03/10 01:31:14
Yes.
| |
| 740 | |
| 741 // Search terms which comprise a greater portion score higher. | |
| 742 size_t term_length_total = std::accumulate(matches.begin(), matches.end(), | |
| 743 0, AddumulateMatchLength); | |
| 744 float complete_value = | |
| 745 (static_cast<float>(term_length_total) / static_cast<float>(max_length)) * | |
| 746 kStartMaxValue; | |
| 747 | |
| 748 return static_cast<int>(order_value + start_value + complete_value); | |
| 649 } | 749 } |
| 650 | 750 |
| 651 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( | 751 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( |
| 652 const InMemoryURLIndex& index, | 752 const InMemoryURLIndex& index, |
| 653 const String16Vector& lower_terms) | 753 const String16Vector& lower_terms) |
| 654 : index_(index), | 754 : index_(index), |
| 655 lower_terms_(lower_terms) { | 755 lower_terms_(lower_terms) { |
| 656 } | 756 } |
| 657 | 757 |
| 658 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} | 758 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} |
| 659 | 759 |
| 660 void InMemoryURLIndex::AddHistoryMatch::operator()( | 760 void InMemoryURLIndex::AddHistoryMatch::operator()( |
| 661 const InMemoryURLIndex::HistoryID history_id) { | 761 const InMemoryURLIndex::HistoryID history_id) { |
| 662 HistoryInfoMap::const_iterator hist_pos = | 762 HistoryInfoMap::const_iterator hist_pos = |
| 663 index_.history_info_map_.find(history_id); | 763 index_.history_info_map_.find(history_id); |
| 664 // Note that a history_id may be present in the word_id_history_map_ yet not | 764 // Note that a history_id may be present in the word_id_history_map_ yet not |
| 665 // be found in the history_info_map_. This occurs when an item has been | 765 // be found in the history_info_map_. This occurs when an item has been |
| 666 // deleted by the user or the item no longer qualifies as a quick result. | 766 // deleted by the user or the item no longer qualifies as a quick result. |
| 667 if (hist_pos != index_.history_info_map_.end()) { | 767 if (hist_pos != index_.history_info_map_.end()) { |
| 668 const URLRow& hist_item = hist_pos->second; | 768 const URLRow& hist_item = hist_pos->second; |
| 669 // TODO(mrossetti): Accommodate multiple term highlighting. | 769 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_)); |
| 670 size_t input_location = 0; | 770 if (match.raw_score != 0) { |
| 671 int score = InMemoryURLIndex::RawScoreForURL( | |
| 672 hist_item, lower_terms_, &input_location); | |
| 673 if (score != 0) { | |
| 674 // We only retain the top 10 highest scoring results so | 771 // We only retain the top 10 highest scoring results so |
| 675 // see if this one fits into the top 10 and, if so, where. | 772 // see if this one fits into the top 10 and, if so, where. |
| 676 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); | 773 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); |
| 677 while (scored_iter != scored_matches_.end() && | 774 while (scored_iter != scored_matches_.end() && |
| 678 (*scored_iter).raw_score > score) | 775 (*scored_iter).raw_score > match.raw_score) |
| 679 ++scored_iter; | 776 ++scored_iter; |
| 680 if ((scored_matches_.size() < 10) || | 777 if ((scored_matches_.size() < 10) || |
| 681 (scored_iter != scored_matches_.end())) { | 778 (scored_iter != scored_matches_.end())) { |
| 682 // Create and insert the new item. | 779 // Insert the new item. |
| 683 // TODO(mrossetti): Properly set |match_in_scheme| and | |
| 684 // |innermost_match|. | |
| 685 bool match_in_scheme = false; | |
| 686 bool innermost_match = true; | |
| 687 ScoredHistoryMatch match(hist_item, input_location, | |
| 688 match_in_scheme, innermost_match, score); | |
| 689 if (!scored_matches_.empty()) | 780 if (!scored_matches_.empty()) |
| 690 scored_matches_.insert(scored_iter, match); | 781 scored_matches_.insert(scored_iter, match); |
| 691 else | 782 else |
| 692 scored_matches_.push_back(match); | 783 scored_matches_.push_back(match); |
| 693 // Trim any entries beyond 10. | 784 // Trim any entries beyond 10. |
| 694 if (scored_matches_.size() > 10) { | 785 if (scored_matches_.size() > 10) { |
| 695 scored_matches_.erase(scored_matches_.begin() + 10, | 786 scored_matches_.erase(scored_matches_.begin() + 10, |
| 696 scored_matches_.end()); | 787 scored_matches_.end()); |
| 697 } | 788 } |
| 698 } | 789 } |
| (...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 917 if (iter->has_title()) { | 1008 if (iter->has_title()) { |
| 918 string16 title(UTF8ToUTF16(iter->title())); | 1009 string16 title(UTF8ToUTF16(iter->title())); |
| 919 url_row.set_title(title); | 1010 url_row.set_title(title); |
| 920 } | 1011 } |
| 921 history_info_map_[history_id] = url_row; | 1012 history_info_map_[history_id] = url_row; |
| 922 } | 1013 } |
| 923 return true; | 1014 return true; |
| 924 } | 1015 } |
| 925 | 1016 |
| 926 } // namespace history | 1017 } // namespace history |
| OLD | NEW |