| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/history/in_memory_url_index.h" | 5 #include "chrome/browser/history/in_memory_url_index.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <iterator> | 8 #include <iterator> |
| 9 #include <limits> | 9 #include <limits> |
| 10 #include <numeric> |
| 10 | 11 |
| 11 #include "base/file_util.h" | 12 #include "base/file_util.h" |
| 12 #include "base/i18n/break_iterator.h" | 13 #include "base/i18n/break_iterator.h" |
| 13 #include "base/metrics/histogram.h" | 14 #include "base/metrics/histogram.h" |
| 14 #include "base/string_util.h" | 15 #include "base/string_util.h" |
| 15 #include "base/time.h" | 16 #include "base/time.h" |
| 16 #include "base/utf_string_conversions.h" | 17 #include "base/utf_string_conversions.h" |
| 17 #include "chrome/browser/autocomplete/history_provider_util.h" | 18 #include "chrome/browser/autocomplete/history_provider_util.h" |
| 18 #include "chrome/browser/history/url_database.h" | 19 #include "chrome/browser/history/url_database.h" |
| 19 #include "chrome/browser/profiles/profile.h" | 20 #include "chrome/browser/profiles/profile.h" |
| 21 #include "chrome/common/url_constants.h" |
| 22 #include "googleurl/src/url_util.h" |
| 20 #include "net/base/escape.h" | 23 #include "net/base/escape.h" |
| 21 #include "net/base/net_util.h" | 24 #include "net/base/net_util.h" |
| 22 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | 25 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" |
| 23 #include "ui/base/l10n/l10n_util.h" | 26 #include "ui/base/l10n/l10n_util.h" |
| 24 | 27 |
| 25 using google::protobuf::RepeatedField; | 28 using google::protobuf::RepeatedField; |
| 26 using google::protobuf::RepeatedPtrField; | 29 using google::protobuf::RepeatedPtrField; |
| 30 using in_memory_url_index::InMemoryURLIndexCacheItem; |
| 27 | 31 |
| 28 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
| 29 namespace history { | 32 namespace history { |
| 30 | 33 |
| 31 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; | 34 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; |
| 32 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; | 35 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; |
| 33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | 36 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; |
| 34 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; | 37 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; |
| 35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry | 38 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry |
| 36 CharWordMapEntry; | 39 CharWordMapEntry; |
| 37 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem | 40 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem |
| 38 WordIDHistoryMapItem; | 41 WordIDHistoryMapItem; |
| 39 typedef imui:: | 42 typedef imui:: |
| 40 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry | 43 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry |
| 41 WordIDHistoryMapEntry; | 44 WordIDHistoryMapEntry; |
| 42 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; | 45 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; |
| 43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry | 46 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry |
| 44 HistoryInfoMapEntry; | 47 HistoryInfoMapEntry; |
| 45 | 48 |
| 46 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; | 49 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; |
| 47 | 50 |
| 48 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} | 51 const float kOrderMaxValue = 50.0; |
| 52 const float kStartMaxValue = 50.0; |
| 53 const size_t kMaxSignificantStart = 20; |
| 54 const float kCompleteMaxValue = 50.0; |
| 55 const float kLastVisitMaxValue = 50.0; |
| 56 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); |
| 57 const float kVisitCountMaxValue = 50.0; |
| 58 const int kMaxSignificantVisits = 20; |
| 59 const float kTypedCountMaxValue = 50.0; |
| 60 const int kMaxSignificantTyped = 20; |
| 61 const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + |
| 62 kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue; |
| 63 const float kMaxNormalizedRawScore = 1000.0; |
| 49 | 64 |
| 50 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, | 65 ScoredHistoryMatch::ScoredHistoryMatch() |
| 51 size_t input_location, | 66 : raw_score(0), |
| 52 bool match_in_scheme, | 67 prefix_adjust(0) {} |
| 53 bool innermost_match, | 68 |
| 54 int score) | 69 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info) |
| 55 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), | 70 : HistoryMatch(url_info, 0, false, false), |
| 56 raw_score(score) { | 71 raw_score(0), |
| 57 } | 72 prefix_adjust(0) {} |
| 58 | 73 |
| 59 struct InMemoryURLIndex::TermCharWordSet { | 74 struct InMemoryURLIndex::TermCharWordSet { |
| 60 TermCharWordSet() // Required for STL resize(). | 75 TermCharWordSet() // Required for STL resize(). |
| 61 : char_(0), | 76 : char_(0), |
| 62 word_id_set_(), | 77 word_id_set_(), |
| 63 used_(false) {} | 78 used_(false) {} |
| 64 TermCharWordSet(const char16& uni_char, | 79 TermCharWordSet(const char16& uni_char, |
| 65 const WordIDSet& word_id_set, | 80 const WordIDSet& word_id_set, |
| 66 bool used) | 81 bool used) |
| 67 : char_(uni_char), | 82 : char_(uni_char), |
| 68 word_id_set_(word_id_set), | 83 word_id_set_(word_id_set), |
| 69 used_(used) {} | 84 used_(used) {} |
| 70 | 85 |
| 71 // Predicate for STL algorithm use. | 86 // Predicate for STL algorithm use. |
| 72 bool is_not_used() const { return !used_; } | 87 bool is_not_used() const { return !used_; } |
| 73 | 88 |
| 74 char16 char_; | 89 char16 char_; |
| 75 WordIDSet word_id_set_; | 90 WordIDSet word_id_set_; |
| 76 bool used_; // true if this set has been used for the current term search. | 91 bool used_; // true if this set has been used for the current term search. |
| 77 }; | 92 }; |
| 78 | 93 |
| 94 // Comparison function for sorting TermMatches by their offsets. |
| 95 bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) { |
| 96 return m1.offset < m2.offset; |
| 97 } |
| 98 |
| 99 // std::accumulate helper function to add up TermMatches' lengths. |
| 100 int AccumulateMatchLength(int total, const TermMatch& match) { |
| 101 return total + match.length; |
| 102 } |
| 103 |
| 79 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) | 104 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) |
| 80 : history_dir_(history_dir), | 105 : history_dir_(history_dir), |
| 81 history_item_count_(0) { | 106 history_item_count_(0) { |
| 82 } | 107 } |
| 83 | 108 |
| 84 // Called only by unit tests. | 109 // Called only by unit tests. |
| 85 InMemoryURLIndex::InMemoryURLIndex() | 110 InMemoryURLIndex::InMemoryURLIndex() |
| 86 : history_item_count_(0) { | 111 : history_item_count_(0) { |
| 87 } | 112 } |
| 88 | 113 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 102 SaveToCacheFile(); | 127 SaveToCacheFile(); |
| 103 } | 128 } |
| 104 | 129 |
| 105 bool InMemoryURLIndex::IndexRow(const URLRow& row) { | 130 bool InMemoryURLIndex::IndexRow(const URLRow& row) { |
| 106 const GURL& gurl(row.url()); | 131 const GURL& gurl(row.url()); |
| 107 string16 url(net::FormatUrl(gurl, languages_, | 132 string16 url(net::FormatUrl(gurl, languages_, |
| 108 net::kFormatUrlOmitUsernamePassword, | 133 net::kFormatUrlOmitUsernamePassword, |
| 109 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, | 134 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, |
| 110 NULL, NULL, NULL)); | 135 NULL, NULL, NULL)); |
| 111 | 136 |
| 112 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). | |
| 113 HistoryID history_id = static_cast<HistoryID>(row.id()); | 137 HistoryID history_id = static_cast<HistoryID>(row.id()); |
| 138 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max()); |
| 114 | 139 |
| 115 // Add the row for quick lookup in the history info store. | 140 // Add the row for quick lookup in the history info store. |
| 116 url = l10n_util::ToLower(url); | |
| 117 URLRow new_row(GURL(url), row.id()); | 141 URLRow new_row(GURL(url), row.id()); |
| 118 new_row.set_visit_count(row.visit_count()); | 142 new_row.set_visit_count(row.visit_count()); |
| 119 new_row.set_typed_count(row.typed_count()); | 143 new_row.set_typed_count(row.typed_count()); |
| 120 new_row.set_last_visit(row.last_visit()); | 144 new_row.set_last_visit(row.last_visit()); |
| 121 new_row.set_title(row.title()); | 145 new_row.set_title(row.title()); |
| 122 history_info_map_[history_id] = new_row; | 146 history_info_map_[history_id] = new_row; |
| 123 | 147 |
| 124 // Split into individual, unique words. | 148 // Split URL into individual, unique words then add in the title words. |
| 125 String16Set words = WordSetFromString16(url); | 149 url = l10n_util::ToLower(url); |
| 150 String16Set url_words = WordSetFromString16(url); |
| 151 String16Set title_words = WordSetFromString16(row.title()); |
| 152 String16Set words; |
| 153 std::set_union(url_words.begin(), url_words.end(), |
| 154 title_words.begin(), title_words.end(), |
| 155 std::insert_iterator<String16Set>(words, words.begin())); |
| 156 for (String16Set::iterator word_iter = words.begin(); |
| 157 word_iter != words.end(); ++word_iter) |
| 158 AddWordToIndex(*word_iter, history_id); |
| 126 | 159 |
| 127 // For each word, add a new entry into the word index referring to the | |
| 128 // associated history item. | |
| 129 for (String16Set::iterator iter = words.begin(); | |
| 130 iter != words.end(); ++iter) { | |
| 131 String16Set::value_type uni_word = *iter; | |
| 132 AddWordToIndex(uni_word, history_id); | |
| 133 } | |
| 134 ++history_item_count_; | 160 ++history_item_count_; |
| 135 return true; | 161 return true; |
| 136 } | 162 } |
| 137 | 163 |
| 138 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db, | 164 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db, |
| 139 bool clear_cache) { | 165 bool clear_cache) { |
| 140 ClearPrivateData(); | 166 ClearPrivateData(); |
| 141 | 167 |
| 142 if (!history_db) | 168 if (!history_db) |
| 143 return false; | 169 return false; |
| (...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 285 ScoredHistoryMatches scored_items; | 311 ScoredHistoryMatches scored_items; |
| 286 if (!terms.empty()) { | 312 if (!terms.empty()) { |
| 287 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- | 313 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- |
| 288 // and-sweep approach. | 314 // and-sweep approach. |
| 289 ResetTermCharWordSetCache(); | 315 ResetTermCharWordSetCache(); |
| 290 | 316 |
| 291 // Lowercase the terms. | 317 // Lowercase the terms. |
| 292 // TODO(mrossetti): Another opportunity for a transform algorithm. | 318 // TODO(mrossetti): Another opportunity for a transform algorithm. |
| 293 String16Vector lower_terms; | 319 String16Vector lower_terms; |
| 294 for (String16Vector::const_iterator term_iter = terms.begin(); | 320 for (String16Vector::const_iterator term_iter = terms.begin(); |
| 295 term_iter != terms.end(); ++term_iter) { | 321 term_iter != terms.end(); ++term_iter) |
| 296 String16Vector::value_type lower_string(*term_iter); | 322 lower_terms.push_back(l10n_util::ToLower(*term_iter)); |
| 297 std::transform(lower_string.begin(), | |
| 298 lower_string.end(), | |
| 299 lower_string.begin(), | |
| 300 tolower); | |
| 301 lower_terms.push_back(lower_string); | |
| 302 } | |
| 303 | 323 |
| 304 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); | 324 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); |
| 305 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); | 325 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); |
| 306 | 326 |
| 307 // Pass over all of the candidates filtering out any without a proper | 327 // Pass over all of the candidates filtering out any without a proper |
| 308 // substring match, inserting those which pass in order by score. | 328 // substring match, inserting those which pass in order by score. |
| 309 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | 329 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), |
| 310 AddHistoryMatch(*this, | 330 AddHistoryMatch(*this, |
| 311 lower_terms)).ScoredMatches(); | 331 lower_terms)).ScoredMatches(); |
| 312 } | 332 } |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 365 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( | 385 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( |
| 366 const string16& uni_word) { | 386 const string16& uni_word) { |
| 367 InMemoryURLIndex::HistoryIDSet history_id_set; | 387 InMemoryURLIndex::HistoryIDSet history_id_set; |
| 368 | 388 |
| 369 // For each unique character in the word, in order of first appearance, get | 389 // For each unique character in the word, in order of first appearance, get |
| 370 // the char/word_id map entry and intersect with the set in an incremental | 390 // the char/word_id map entry and intersect with the set in an incremental |
| 371 // manner. | 391 // manner. |
| 372 Char16Vector uni_chars = Char16VectorFromString16(uni_word); | 392 Char16Vector uni_chars = Char16VectorFromString16(uni_word); |
| 373 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); | 393 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); |
| 374 | 394 |
| 395 // TODO(mrossetti): At this point, as a possible optimization, we could |
| 396 // scan through all candidate words and make sure the |uni_word| is a |
| 397 // substring within the candidate words, eliminating those which aren't. |
| 398 // I'm not sure it would be worth the effort. And remember, we've got to do |
| 399 // a final substring match in order to get the highlighting ranges later |
| 400 // in the process in any case. |
| 401 |
| 375 // If any words resulted then we can compose a set of history IDs by unioning | 402 // If any words resulted then we can compose a set of history IDs by unioning |
| 376 // the sets from each word. | 403 // the sets from each word. |
| 377 if (!word_id_set.empty()) { | 404 if (!word_id_set.empty()) { |
| 378 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); | 405 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); |
| 379 word_id_iter != word_id_set.end(); ++word_id_iter) { | 406 word_id_iter != word_id_set.end(); ++word_id_iter) { |
| 380 WordID word_id = *word_id_iter; | 407 WordID word_id = *word_id_iter; |
| 381 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); | 408 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); |
| 382 if (word_iter != word_id_history_map_.end()) { | 409 if (word_iter != word_id_history_map_.end()) { |
| 383 HistoryIDSet& word_history_id_set(word_iter->second); | 410 HistoryIDSet& word_history_id_set(word_iter->second); |
| 384 history_id_set.insert(word_history_id_set.begin(), | 411 history_id_set.insert(word_history_id_set.begin(), |
| 385 word_history_id_set.end()); | 412 word_history_id_set.end()); |
| 386 } | 413 } |
| 387 } | 414 } |
| 388 } | 415 } |
| 389 | 416 |
| 390 return history_id_set; | 417 return history_id_set; |
| 391 } | 418 } |
| 392 | 419 |
| 393 // Utility Functions | 420 // Utility Functions |
| 394 | 421 |
| 395 // static | 422 // static |
| 396 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( | 423 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( |
| 397 const string16& uni_string) { | 424 const string16& uni_string) { |
| 398 String16Set words; | 425 String16Vector words = WordVectorFromString16(uni_string, false); |
| 399 base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD); | 426 String16Set word_set; |
| 400 if (iter.Init()) { | 427 for (String16Vector::const_iterator iter = words.begin(); iter != words.end(); |
| 401 while (iter.Advance()) { | 428 ++iter) |
| 429 word_set.insert(l10n_util::ToLower(*iter)); |
| 430 return word_set; |
| 431 } |
| 432 |
| 433 // static |
| 434 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16( |
| 435 const string16& uni_string, |
| 436 bool break_on_space) { |
| 437 // TODO(mrossetti): Come back and update the following code if the |
| 438 // BreakIterator is changed. Here are some comments: |
| 439 // The iterator behaves differently depending on the breaking strategy. Its |
| 440 // unit tests do not properly test this case as its basic word and space tests |
| 441 // always use a test string starting with a space. |
| 442 base::BreakIterator iter(&uni_string, break_on_space ? |
| 443 base::BreakIterator::BREAK_SPACE : base::BreakIterator::BREAK_WORD); |
| 444 String16Vector words; |
| 445 if (break_on_space) { |
| 446 if (iter.Init()) { |
| 447 while (iter.Advance()) { |
| 448 string16 word = iter.GetString(); |
| 449 TrimWhitespace(word, TRIM_ALL, &word); |
| 450 if (!word.empty()) |
| 451 words.push_back(word); |
| 452 } |
| 453 } |
| 454 } else { |
| 455 if (iter.Init()) { |
| 402 if (iter.IsWord()) | 456 if (iter.IsWord()) |
| 403 words.insert(iter.GetString()); | 457 words.push_back(iter.GetString()); |
| 458 while (iter.Advance()) { |
| 459 if (iter.IsWord()) |
| 460 words.push_back(iter.GetString()); |
| 461 } |
| 404 } | 462 } |
| 405 } | 463 } |
| 406 return words; | 464 return words; |
| 407 } | 465 } |
| 408 | 466 |
| 409 // static | 467 // static |
| 410 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16( | 468 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16( |
| 411 const string16& uni_word) { | 469 const string16& uni_word) { |
| 412 InMemoryURLIndex::Char16Vector characters; | 470 InMemoryURLIndex::Char16Vector characters; |
| 413 InMemoryURLIndex::Char16Set unique_characters; | 471 InMemoryURLIndex::Char16Set unique_characters; |
| (...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 539 for (Char16Vector::const_iterator c_iter = uni_chars.begin(); | 597 for (Char16Vector::const_iterator c_iter = uni_chars.begin(); |
| 540 (c_iter != uni_chars.end()) && | 598 (c_iter != uni_chars.end()) && |
| 541 (t_iter != term_char_word_set_cache_.end()) && | 599 (t_iter != term_char_word_set_cache_.end()) && |
| 542 (*c_iter == t_iter->char_); | 600 (*c_iter == t_iter->char_); |
| 543 ++c_iter, ++t_iter) | 601 ++c_iter, ++t_iter) |
| 544 t_iter->used_ = true; // Update the cache. | 602 t_iter->used_ = true; // Update the cache. |
| 545 return t_iter - term_char_word_set_cache_.begin() - 1; | 603 return t_iter - term_char_word_set_cache_.begin() - 1; |
| 546 } | 604 } |
| 547 | 605 |
| 548 // static | 606 // static |
| 549 int InMemoryURLIndex::RawScoreForURL(const URLRow& row, | 607 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term, |
| 550 const String16Vector& terms, | 608 const string16& string, |
| 551 size_t* first_term_location) { | 609 int term_num) { |
| 610 TermMatches matches; |
| 611 for (size_t location = string.find(term); location != string16::npos; |
| 612 location = string.find(term, location + 1)) |
| 613 matches.push_back(TermMatch(term_num, location, term.size())); |
| 614 return matches; |
| 615 } |
| 616 |
| 617 // static |
| 618 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) { |
| 619 if (matches.empty()) |
| 620 return matches; |
| 621 TermMatches sorted_matches = matches; |
| 622 std::sort(sorted_matches.begin(), sorted_matches.end(), |
| 623 CompareMatchOffset); |
| 624 TermMatches clean_matches; |
| 625 TermMatch last_match = sorted_matches[0]; |
| 626 clean_matches.push_back(last_match); |
| 627 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1; |
| 628 iter != sorted_matches.end(); ++iter) { |
| 629 if (iter->offset >= last_match.offset + last_match.length) { |
| 630 last_match = *iter; |
| 631 clean_matches.push_back(last_match); |
| 632 } |
| 633 } |
| 634 return clean_matches; |
| 635 } |
| 636 |
| 637 // static |
| 638 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL( |
| 639 const URLRow& row, |
| 640 const String16Vector& terms) { |
| 641 ScoredHistoryMatch match(row); |
| 552 GURL gurl = row.url(); | 642 GURL gurl = row.url(); |
| 553 if (!gurl.is_valid()) | 643 if (!gurl.is_valid()) |
| 554 return 0; | 644 return match; |
| 555 | 645 |
| 556 string16 url = UTF8ToUTF16(gurl.spec()); | 646 // Figure out where each search term appears in the URL and/or page title |
| 557 | 647 // so that we can score as well as provide autocomplete highlighting. |
| 558 // Collect all term start positions so we can see if they appear in order. | 648 string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec())); |
| 559 std::vector<size_t> term_locations; | 649 // Strip any 'http://' prefix before matching. |
| 560 int out_of_order = 0; // Count the terms which are out of order. | 650 if (url_util::FindAndCompareScheme(url, chrome::kHttpScheme, NULL)) { |
| 561 size_t start_location_total = 0; | 651 match.prefix_adjust = strlen(chrome::kHttpScheme) + 3; // Allow for '://'. |
| 562 size_t term_length_total = 0; | 652 url = url.substr(match.prefix_adjust); |
| 563 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); | |
| 564 ++iter) { | |
| 565 string16 term = *iter; | |
| 566 size_t term_location = url.find(term); | |
| 567 if (term_location == string16::npos) | |
| 568 return 0; // A term was not found. This should not happen. | |
| 569 if (iter != terms.begin()) { | |
| 570 // See if this term is out-of-order. | |
| 571 for (std::vector<size_t>::const_iterator order_iter = | |
| 572 term_locations.begin(); order_iter != term_locations.end(); | |
| 573 ++order_iter) { | |
| 574 if (term_location <= *order_iter) | |
| 575 ++out_of_order; | |
| 576 } | |
| 577 } else { | |
| 578 *first_term_location = term_location; | |
| 579 } | |
| 580 term_locations.push_back(term_location); | |
| 581 start_location_total += term_location; | |
| 582 term_length_total += term.size(); | |
| 583 } | 653 } |
| 584 | 654 |
| 585 // Calculate a raw score. | 655 string16 title = l10n_util::ToLower(row.title()); |
| 586 // TODO(mrossetti): This is good enough for now but must be fine-tuned. | 656 int term_num = 0; |
| 587 const float kOrderMaxValue = 10.0; | 657 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); |
| 588 float order_value = 10.0; | 658 ++iter, ++term_num) { |
| 589 if (terms.size() > 1) { | 659 string16 term = *iter; |
| 590 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; | 660 TermMatches url_term_matches = MatchTermInString(term, url, term_num); |
| 591 order_value = | 661 TermMatches title_term_matches = MatchTermInString(term, title, term_num); |
| 592 (static_cast<float>(max_possible_out_of_order - out_of_order) / | 662 if (url_term_matches.empty() && title_term_matches.empty()) |
| 593 max_possible_out_of_order) * kOrderMaxValue; | 663 return match; // A term was not found in either URL or title - reject. |
| 664 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(), |
| 665 url_term_matches.end()); |
| 666 match.title_matches.insert(match.title_matches.end(), |
| 667 title_term_matches.begin(), |
| 668 title_term_matches.end()); |
| 594 } | 669 } |
| 595 | 670 |
| 596 const float kStartMaxValue = 10.0; | 671 // Sort matches by offset and eliminate any which overlap. |
| 597 const size_t kMaxSignificantStart = 20; | 672 match.url_matches = SortAndDeoverlap(match.url_matches); |
| 598 float start_value = | 673 match.title_matches = SortAndDeoverlap(match.title_matches); |
| 599 (static_cast<float>(kMaxSignificantStart - | |
| 600 std::min(kMaxSignificantStart, start_location_total / terms.size()))) / | |
| 601 static_cast<float>(kMaxSignificantStart) * kStartMaxValue; | |
| 602 | 674 |
| 603 const float kCompleteMaxValue = 10.0; | 675 // Get partial scores based on term matching. Note that the score for |
| 604 float complete_value = | 676 // each of the URL and title are adjusted by the fraction of the |
| 605 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) * | 677 // terms appearing in each. |
| 606 kStartMaxValue; | 678 int url_score = RawScoreForMatches(match.url_matches, url.size()) * |
| 679 match.url_matches.size() / terms.size(); |
| 680 int title_score = RawScoreForMatches(match.title_matches, title.size()) * |
| 681 match.title_matches.size() / terms.size(); |
| 682 // Arbitrarily pick the best. |
| 683 // TODO(mrossetti): It might make sense that a term which appears in both the |
| 684 // URL and the Title should boost the score a bit. |
| 685 int term_score = std::max(url_score, title_score); |
| 607 | 686 |
| 608 const float kLastVisitMaxValue = 10.0; | 687 // Factor in attributes of the URLRow. |
| 609 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); | 688 // Items which have been visited recently score higher. |
| 610 int64 delta_time = (kMaxSignificantDay - | 689 int64 delta_time = (kMaxSignificantDay - |
| 611 std::min((base::Time::Now() - row.last_visit()), | 690 std::min((base::Time::Now() - row.last_visit()), |
| 612 kMaxSignificantDay)).ToInternalValue(); | 691 kMaxSignificantDay)).ToInternalValue(); |
| 613 float last_visit_value = | 692 float last_visit_value = |
| 614 (static_cast<float>(delta_time) / | 693 (static_cast<float>(delta_time) / |
| 615 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * | 694 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * |
| 616 kLastVisitMaxValue; | 695 kLastVisitMaxValue; |
| 617 | 696 |
| 618 const float kVisitCountMaxValue = 10.0; | |
| 619 const int kMaxSignificantVisits = 10; | |
| 620 float visit_count_value = | 697 float visit_count_value = |
| 621 (static_cast<float>(std::min(row.visit_count(), | 698 (static_cast<float>(std::min(row.visit_count(), |
| 622 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * | 699 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * |
| 623 kVisitCountMaxValue; | 700 kVisitCountMaxValue; |
| 624 | 701 |
| 625 const float kTypedCountMaxValue = 20.0; | |
| 626 const int kMaxSignificantTyped = 10; | |
| 627 float typed_count_value = | 702 float typed_count_value = |
| 628 (static_cast<float>(std::min(row.typed_count(), | 703 (static_cast<float>(std::min(row.typed_count(), |
| 629 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * | 704 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * |
| 630 kTypedCountMaxValue; | 705 kTypedCountMaxValue; |
| 631 | 706 |
| 632 float raw_score = order_value + start_value + complete_value + | 707 float raw_score = term_score + last_visit_value + visit_count_value + |
| 633 last_visit_value + visit_count_value + typed_count_value; | 708 typed_count_value; |
| 634 | 709 |
| 635 // Normalize the score. | 710 // Normalize the score. |
| 636 const float kMaxNormalizedRawScore = 1000.0; | 711 match.raw_score = static_cast<int>((raw_score / kMaxRawScore) * |
| 637 raw_score = | 712 kMaxNormalizedRawScore); |
| 638 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + | 713 return match; |
| 639 kLastVisitMaxValue + kVisitCountMaxValue + | |
| 640 kTypedCountMaxValue)) * | |
| 641 kMaxNormalizedRawScore; | |
| 642 return static_cast<int>(raw_score); | |
| 643 } | 714 } |
| 644 | 715 |
| 645 // static | 716 int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches, |
| 646 base::Time InMemoryURLIndex::RecentThreshold() { | 717 size_t max_length) { |
| 647 return base::Time::Now() - | 718 // TODO(mrossetti): This is good enough for now but must be fine-tuned. |
| 648 base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); | 719 if (matches.empty()) |
| 720 return 0; |
| 721 |
| 722 // Search terms appearing in order score higher. |
| 723 float order_value = 10.0; |
| 724 if (matches.size() > 1) { |
| 725 int max_possible_out_of_order = matches.size() - 1; |
| 726 int out_of_order = 0; |
| 727 for (size_t i = 1; i < matches.size(); ++i) { |
| 728 if (matches[i - 1].term_num > matches[i].term_num) |
| 729 ++out_of_order; |
| 730 } |
| 731 order_value = |
| 732 (static_cast<float>(max_possible_out_of_order - out_of_order) / |
| 733 max_possible_out_of_order) * kOrderMaxValue; |
| 734 } |
| 735 |
| 736 // Search terms which appear earlier score higher. |
| 737 // No points if the first search term does not appear in the first |
| 738 // kMaxSignificantStart chars. |
| 739 float start_value = |
| 740 (static_cast<float>(kMaxSignificantStart - |
| 741 std::min(kMaxSignificantStart, matches[0].offset))) / |
| 742 static_cast<float>(kMaxSignificantStart) * kStartMaxValue; |
| 743 |
| 744 // Search terms which comprise a greater portion score higher. |
| 745 size_t term_length_total = std::accumulate(matches.begin(), matches.end(), |
| 746 0, AccumulateMatchLength); |
| 747 float complete_value = |
| 748 (static_cast<float>(term_length_total) / static_cast<float>(max_length)) * |
| 749 kStartMaxValue; |
| 750 |
| 751 return static_cast<int>(order_value + start_value + complete_value); |
| 649 } | 752 } |
| 650 | 753 |
| 651 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( | 754 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( |
| 652 const InMemoryURLIndex& index, | 755 const InMemoryURLIndex& index, |
| 653 const String16Vector& lower_terms) | 756 const String16Vector& lower_terms) |
| 654 : index_(index), | 757 : index_(index), |
| 655 lower_terms_(lower_terms) { | 758 lower_terms_(lower_terms) { |
| 656 } | 759 } |
| 657 | 760 |
| 658 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} | 761 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} |
| 659 | 762 |
| 660 void InMemoryURLIndex::AddHistoryMatch::operator()( | 763 void InMemoryURLIndex::AddHistoryMatch::operator()( |
| 661 const InMemoryURLIndex::HistoryID history_id) { | 764 const InMemoryURLIndex::HistoryID history_id) { |
| 662 HistoryInfoMap::const_iterator hist_pos = | 765 HistoryInfoMap::const_iterator hist_pos = |
| 663 index_.history_info_map_.find(history_id); | 766 index_.history_info_map_.find(history_id); |
| 664 // Note that a history_id may be present in the word_id_history_map_ yet not | 767 // Note that a history_id may be present in the word_id_history_map_ yet not |
| 665 // be found in the history_info_map_. This occurs when an item has been | 768 // be found in the history_info_map_. This occurs when an item has been |
| 666 // deleted by the user or the item no longer qualifies as a quick result. | 769 // deleted by the user or the item no longer qualifies as a quick result. |
| 667 if (hist_pos != index_.history_info_map_.end()) { | 770 if (hist_pos != index_.history_info_map_.end()) { |
| 668 const URLRow& hist_item = hist_pos->second; | 771 const URLRow& hist_item = hist_pos->second; |
| 669 // TODO(mrossetti): Accommodate multiple term highlighting. | 772 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_)); |
| 670 size_t input_location = 0; | 773 if (match.raw_score != 0) { |
| 671 int score = InMemoryURLIndex::RawScoreForURL( | |
| 672 hist_item, lower_terms_, &input_location); | |
| 673 if (score != 0) { | |
| 674 // We only retain the top 10 highest scoring results so | 774 // We only retain the top 10 highest scoring results so |
| 675 // see if this one fits into the top 10 and, if so, where. | 775 // see if this one fits into the top 10 and, if so, where. |
| 676 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); | 776 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); |
| 677 while (scored_iter != scored_matches_.end() && | 777 while (scored_iter != scored_matches_.end() && |
| 678 (*scored_iter).raw_score > score) | 778 (*scored_iter).raw_score > match.raw_score) |
| 679 ++scored_iter; | 779 ++scored_iter; |
| 680 if ((scored_matches_.size() < 10) || | 780 if ((scored_matches_.size() < 10) || |
| 681 (scored_iter != scored_matches_.end())) { | 781 (scored_iter != scored_matches_.end())) { |
| 682 // Create and insert the new item. | 782 // Insert the new item. |
| 683 // TODO(mrossetti): Properly set |match_in_scheme| and | |
| 684 // |innermost_match|. | |
| 685 bool match_in_scheme = false; | |
| 686 bool innermost_match = true; | |
| 687 ScoredHistoryMatch match(hist_item, input_location, | |
| 688 match_in_scheme, innermost_match, score); | |
| 689 if (!scored_matches_.empty()) | 783 if (!scored_matches_.empty()) |
| 690 scored_matches_.insert(scored_iter, match); | 784 scored_matches_.insert(scored_iter, match); |
| 691 else | 785 else |
| 692 scored_matches_.push_back(match); | 786 scored_matches_.push_back(match); |
| 693 // Trim any entries beyond 10. | 787 // Trim any entries beyond 10. |
| 694 if (scored_matches_.size() > 10) { | 788 if (scored_matches_.size() > 10) { |
| 695 scored_matches_.erase(scored_matches_.begin() + 10, | 789 scored_matches_.erase(scored_matches_.begin() + 10, |
| 696 scored_matches_.end()); | 790 scored_matches_.end()); |
| 697 } | 791 } |
| 698 } | 792 } |
| (...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 917 if (iter->has_title()) { | 1011 if (iter->has_title()) { |
| 918 string16 title(UTF8ToUTF16(iter->title())); | 1012 string16 title(UTF8ToUTF16(iter->title())); |
| 919 url_row.set_title(title); | 1013 url_row.set_title(title); |
| 920 } | 1014 } |
| 921 history_info_map_[history_id] = url_row; | 1015 history_info_map_[history_id] = url_row; |
| 922 } | 1016 } |
| 923 return true; | 1017 return true; |
| 924 } | 1018 } |
| 925 | 1019 |
| 926 } // namespace history | 1020 } // namespace history |
| OLD | NEW |