Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(181)

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: '' Created 9 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/history/in_memory_url_index.h" 5 #include "chrome/browser/history/in_memory_url_index.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <iterator> 8 #include <iterator>
9 #include <limits> 9 #include <limits>
10 #include <numeric>
10 11
11 #include "base/file_util.h" 12 #include "base/file_util.h"
12 #include "base/i18n/break_iterator.h" 13 #include "base/i18n/break_iterator.h"
13 #include "base/metrics/histogram.h" 14 #include "base/metrics/histogram.h"
14 #include "base/string_util.h" 15 #include "base/string_util.h"
15 #include "base/time.h" 16 #include "base/time.h"
16 #include "base/utf_string_conversions.h" 17 #include "base/utf_string_conversions.h"
17 #include "chrome/browser/autocomplete/history_provider_util.h" 18 #include "chrome/browser/autocomplete/history_provider_util.h"
18 #include "chrome/browser/history/url_database.h" 19 #include "chrome/browser/history/url_database.h"
19 #include "chrome/browser/profiles/profile.h" 20 #include "chrome/browser/profiles/profile.h"
21 #include "chrome/common/url_constants.h"
22 #include "googleurl/src/url_util.h"
20 #include "net/base/escape.h" 23 #include "net/base/escape.h"
21 #include "net/base/net_util.h" 24 #include "net/base/net_util.h"
22 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" 25 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
23 #include "ui/base/l10n/l10n_util.h" 26 #include "ui/base/l10n/l10n_util.h"
24 27
25 using google::protobuf::RepeatedField; 28 using google::protobuf::RepeatedField;
26 using google::protobuf::RepeatedPtrField; 29 using google::protobuf::RepeatedPtrField;
30 using in_memory_url_index::InMemoryURLIndexCacheItem;
27 31
28 using in_memory_url_index::InMemoryURLIndexCacheItem;
29 namespace history { 32 namespace history {
30 33
31 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; 34 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
32 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; 35 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; 36 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
34 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; 37 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry 38 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
36 CharWordMapEntry; 39 CharWordMapEntry;
37 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem 40 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
38 WordIDHistoryMapItem; 41 WordIDHistoryMapItem;
39 typedef imui:: 42 typedef imui::
40 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry 43 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
41 WordIDHistoryMapEntry; 44 WordIDHistoryMapEntry;
42 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; 45 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry 46 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
44 HistoryInfoMapEntry; 47 HistoryInfoMapEntry;
45 48
46 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; 49 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;
47 50
48 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} 51 const float kOrderMaxValue = 50.0;
52 const float kStartMaxValue = 50.0;
53 const size_t kMaxSignificantStart = 20;
54 const float kCompleteMaxValue = 50.0;
55 const float kLastVisitMaxValue = 50.0;
56 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
57 const float kVisitCountMaxValue = 50.0;
58 const int kMaxSignificantVisits = 20;
59 const float kTypedCountMaxValue = 50.0;
60 const int kMaxSignificantTyped = 20;
61 const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
62 kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue;
63 const float kMaxNormalizedRawScore = 1000.0;
49 64
50 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, 65 ScoredHistoryMatch::ScoredHistoryMatch()
51 size_t input_location, 66 : raw_score(0),
52 bool match_in_scheme, 67 prefix_adjust(0) {}
53 bool innermost_match, 68
54 int score) 69 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)
55 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), 70 : HistoryMatch(url_info, 0, false, false),
56 raw_score(score) { 71 raw_score(0),
57 } 72 prefix_adjust(0) {}
58 73
59 struct InMemoryURLIndex::TermCharWordSet { 74 struct InMemoryURLIndex::TermCharWordSet {
60 TermCharWordSet() // Required for STL resize(). 75 TermCharWordSet() // Required for STL resize().
61 : char_(0), 76 : char_(0),
62 word_id_set_(), 77 word_id_set_(),
63 used_(false) {} 78 used_(false) {}
64 TermCharWordSet(const char16& uni_char, 79 TermCharWordSet(const char16& uni_char,
65 const WordIDSet& word_id_set, 80 const WordIDSet& word_id_set,
66 bool used) 81 bool used)
67 : char_(uni_char), 82 : char_(uni_char),
68 word_id_set_(word_id_set), 83 word_id_set_(word_id_set),
69 used_(used) {} 84 used_(used) {}
70 85
71 // Predicate for STL algorithm use. 86 // Predicate for STL algorithm use.
72 bool is_not_used() const { return !used_; } 87 bool is_not_used() const { return !used_; }
73 88
74 char16 char_; 89 char16 char_;
75 WordIDSet word_id_set_; 90 WordIDSet word_id_set_;
76 bool used_; // true if this set has been used for the current term search. 91 bool used_; // true if this set has been used for the current term search.
77 }; 92 };
78 93
94 // Comparison function for sorting TermMatches by their offsets.
95 bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) {
96 return m1.offset < m2.offset;
97 }
98
99 // std::accumulate helper function to add up TermMatches' lengths.
100 int AccumulateMatchLength(int total, const TermMatch& match) {
101 return total + match.length;
102 }
103
79 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) 104 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)
80 : history_dir_(history_dir), 105 : history_dir_(history_dir),
81 history_item_count_(0) { 106 history_item_count_(0) {
82 } 107 }
83 108
84 // Called only by unit tests. 109 // Called only by unit tests.
85 InMemoryURLIndex::InMemoryURLIndex() 110 InMemoryURLIndex::InMemoryURLIndex()
86 : history_item_count_(0) { 111 : history_item_count_(0) {
87 } 112 }
88 113
(...skipping 13 matching lines...) Expand all
102 SaveToCacheFile(); 127 SaveToCacheFile();
103 } 128 }
104 129
105 bool InMemoryURLIndex::IndexRow(const URLRow& row) { 130 bool InMemoryURLIndex::IndexRow(const URLRow& row) {
106 const GURL& gurl(row.url()); 131 const GURL& gurl(row.url());
107 string16 url(net::FormatUrl(gurl, languages_, 132 string16 url(net::FormatUrl(gurl, languages_,
108 net::kFormatUrlOmitUsernamePassword, 133 net::kFormatUrlOmitUsernamePassword,
109 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 134 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
110 NULL, NULL, NULL)); 135 NULL, NULL, NULL));
111 136
112 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
113 HistoryID history_id = static_cast<HistoryID>(row.id()); 137 HistoryID history_id = static_cast<HistoryID>(row.id());
138 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max());
114 139
115 // Add the row for quick lookup in the history info store. 140 // Add the row for quick lookup in the history info store.
116 url = l10n_util::ToLower(url);
117 URLRow new_row(GURL(url), row.id()); 141 URLRow new_row(GURL(url), row.id());
118 new_row.set_visit_count(row.visit_count()); 142 new_row.set_visit_count(row.visit_count());
119 new_row.set_typed_count(row.typed_count()); 143 new_row.set_typed_count(row.typed_count());
120 new_row.set_last_visit(row.last_visit()); 144 new_row.set_last_visit(row.last_visit());
121 new_row.set_title(row.title()); 145 new_row.set_title(row.title());
122 history_info_map_[history_id] = new_row; 146 history_info_map_[history_id] = new_row;
123 147
124 // Split into individual, unique words. 148 // Split URL into individual, unique words then add in the title words.
125 String16Set words = WordSetFromString16(url); 149 url = l10n_util::ToLower(url);
150 String16Set url_words = WordSetFromString16(url);
151 String16Set title_words = WordSetFromString16(row.title());
152 String16Set words;
153 std::set_union(url_words.begin(), url_words.end(),
154 title_words.begin(), title_words.end(),
155 std::insert_iterator<String16Set>(words, words.begin()));
156 for (String16Set::iterator word_iter = words.begin();
157 word_iter != words.end(); ++word_iter)
158 AddWordToIndex(*word_iter, history_id);
126 159
127 // For each word, add a new entry into the word index referring to the
128 // associated history item.
129 for (String16Set::iterator iter = words.begin();
130 iter != words.end(); ++iter) {
131 String16Set::value_type uni_word = *iter;
132 AddWordToIndex(uni_word, history_id);
133 }
134 ++history_item_count_; 160 ++history_item_count_;
135 return true; 161 return true;
136 } 162 }
137 163
138 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db, 164 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,
139 bool clear_cache) { 165 bool clear_cache) {
140 ClearPrivateData(); 166 ClearPrivateData();
141 167
142 if (!history_db) 168 if (!history_db)
143 return false; 169 return false;
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after
285 ScoredHistoryMatches scored_items; 311 ScoredHistoryMatches scored_items;
286 if (!terms.empty()) { 312 if (!terms.empty()) {
287 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- 313 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
288 // and-sweep approach. 314 // and-sweep approach.
289 ResetTermCharWordSetCache(); 315 ResetTermCharWordSetCache();
290 316
291 // Lowercase the terms. 317 // Lowercase the terms.
292 // TODO(mrossetti): Another opportunity for a transform algorithm. 318 // TODO(mrossetti): Another opportunity for a transform algorithm.
293 String16Vector lower_terms; 319 String16Vector lower_terms;
294 for (String16Vector::const_iterator term_iter = terms.begin(); 320 for (String16Vector::const_iterator term_iter = terms.begin();
295 term_iter != terms.end(); ++term_iter) { 321 term_iter != terms.end(); ++term_iter)
296 String16Vector::value_type lower_string(*term_iter); 322 lower_terms.push_back(l10n_util::ToLower(*term_iter));
297 std::transform(lower_string.begin(),
298 lower_string.end(),
299 lower_string.begin(),
300 tolower);
301 lower_terms.push_back(lower_string);
302 }
303 323
304 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); 324 String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
305 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); 325 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
306 326
307 // Pass over all of the candidates filtering out any without a proper 327 // Pass over all of the candidates filtering out any without a proper
308 // substring match, inserting those which pass in order by score. 328 // substring match, inserting those which pass in order by score.
309 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), 329 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
310 AddHistoryMatch(*this, 330 AddHistoryMatch(*this,
311 lower_terms)).ScoredMatches(); 331 lower_terms)).ScoredMatches();
312 } 332 }
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
365 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( 385 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
366 const string16& uni_word) { 386 const string16& uni_word) {
367 InMemoryURLIndex::HistoryIDSet history_id_set; 387 InMemoryURLIndex::HistoryIDSet history_id_set;
368 388
369 // For each unique character in the word, in order of first appearance, get 389 // For each unique character in the word, in order of first appearance, get
370 // the char/word_id map entry and intersect with the set in an incremental 390 // the char/word_id map entry and intersect with the set in an incremental
371 // manner. 391 // manner.
372 Char16Vector uni_chars = Char16VectorFromString16(uni_word); 392 Char16Vector uni_chars = Char16VectorFromString16(uni_word);
373 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); 393 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
374 394
395 // TODO(mrossetti): At this point, as a possible optimization, we could
396 // scan through all candidate words and make sure the |uni_word| is a
397 // substring within the candidate words, eliminating those which aren't.
398 // I'm not sure it would be worth the effort. And remember, we've got to do
399 // a final substring match in order to get the highlighting ranges later
400 // in the process in any case.
401
375 // If any words resulted then we can compose a set of history IDs by unioning 402 // If any words resulted then we can compose a set of history IDs by unioning
376 // the sets from each word. 403 // the sets from each word.
377 if (!word_id_set.empty()) { 404 if (!word_id_set.empty()) {
378 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 405 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
379 word_id_iter != word_id_set.end(); ++word_id_iter) { 406 word_id_iter != word_id_set.end(); ++word_id_iter) {
380 WordID word_id = *word_id_iter; 407 WordID word_id = *word_id_iter;
381 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 408 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
382 if (word_iter != word_id_history_map_.end()) { 409 if (word_iter != word_id_history_map_.end()) {
383 HistoryIDSet& word_history_id_set(word_iter->second); 410 HistoryIDSet& word_history_id_set(word_iter->second);
384 history_id_set.insert(word_history_id_set.begin(), 411 history_id_set.insert(word_history_id_set.begin(),
385 word_history_id_set.end()); 412 word_history_id_set.end());
386 } 413 }
387 } 414 }
388 } 415 }
389 416
390 return history_id_set; 417 return history_id_set;
391 } 418 }
392 419
393 // Utility Functions 420 // Utility Functions
394 421
395 // static 422 // static
396 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( 423 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
397 const string16& uni_string) { 424 const string16& uni_string) {
398 String16Set words; 425 String16Vector words = WordVectorFromString16(uni_string, false);
399 base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD); 426 String16Set word_set;
400 if (iter.Init()) { 427 for (String16Vector::const_iterator iter = words.begin(); iter != words.end();
401 while (iter.Advance()) { 428 ++iter)
429 word_set.insert(l10n_util::ToLower(*iter));
430 return word_set;
431 }
432
433 // static
434 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(
435 const string16& uni_string,
436 bool break_on_space) {
437 // TODO(mrossetti): Come back and update the following code if the
438 // BreakIterator is changed. Here are some comments:
439 // The iterator behaves differently depending on the breaking strategy. Its
440 // unit tests do not properly test this case as its basic word and space tests
441 // always use a test string starting with a space.
442 base::BreakIterator iter(&uni_string, break_on_space ?
443 base::BreakIterator::BREAK_SPACE : base::BreakIterator::BREAK_WORD);
444 String16Vector words;
445 if (break_on_space) {
446 if (iter.Init()) {
447 while (iter.Advance()) {
448 string16 word = iter.GetString();
449 TrimWhitespace(word, TRIM_ALL, &word);
450 if (!word.empty())
451 words.push_back(word);
452 }
453 }
454 } else {
455 if (iter.Init()) {
402 if (iter.IsWord()) 456 if (iter.IsWord())
403 words.insert(iter.GetString()); 457 words.push_back(iter.GetString());
458 while (iter.Advance()) {
459 if (iter.IsWord())
460 words.push_back(iter.GetString());
461 }
404 } 462 }
405 } 463 }
406 return words; 464 return words;
407 } 465 }
408 466
409 // static 467 // static
410 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16( 468 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16(
411 const string16& uni_word) { 469 const string16& uni_word) {
412 InMemoryURLIndex::Char16Vector characters; 470 InMemoryURLIndex::Char16Vector characters;
413 InMemoryURLIndex::Char16Set unique_characters; 471 InMemoryURLIndex::Char16Set unique_characters;
(...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after
539 for (Char16Vector::const_iterator c_iter = uni_chars.begin(); 597 for (Char16Vector::const_iterator c_iter = uni_chars.begin();
540 (c_iter != uni_chars.end()) && 598 (c_iter != uni_chars.end()) &&
541 (t_iter != term_char_word_set_cache_.end()) && 599 (t_iter != term_char_word_set_cache_.end()) &&
542 (*c_iter == t_iter->char_); 600 (*c_iter == t_iter->char_);
543 ++c_iter, ++t_iter) 601 ++c_iter, ++t_iter)
544 t_iter->used_ = true; // Update the cache. 602 t_iter->used_ = true; // Update the cache.
545 return t_iter - term_char_word_set_cache_.begin() - 1; 603 return t_iter - term_char_word_set_cache_.begin() - 1;
546 } 604 }
547 605
548 // static 606 // static
549 int InMemoryURLIndex::RawScoreForURL(const URLRow& row, 607 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term,
550 const String16Vector& terms, 608 const string16& string,
551 size_t* first_term_location) { 609 int term_num) {
610 TermMatches matches;
611 for (size_t location = string.find(term); location != string16::npos;
612 location = string.find(term, location + 1))
613 matches.push_back(TermMatch(term_num, location, term.size()));
614 return matches;
615 }
616
617 // static
618 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {
619 if (matches.empty())
620 return matches;
621 TermMatches sorted_matches = matches;
622 std::sort(sorted_matches.begin(), sorted_matches.end(),
623 CompareMatchOffset);
624 TermMatches clean_matches;
625 TermMatch last_match = sorted_matches[0];
626 clean_matches.push_back(last_match);
627 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;
628 iter != sorted_matches.end(); ++iter) {
629 if (iter->offset >= last_match.offset + last_match.length) {
630 last_match = *iter;
631 clean_matches.push_back(last_match);
632 }
633 }
634 return clean_matches;
635 }
636
637 // static
638 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(
639 const URLRow& row,
640 const String16Vector& terms) {
641 ScoredHistoryMatch match(row);
552 GURL gurl = row.url(); 642 GURL gurl = row.url();
553 if (!gurl.is_valid()) 643 if (!gurl.is_valid())
554 return 0; 644 return match;
555 645
556 string16 url = UTF8ToUTF16(gurl.spec()); 646 // Figure out where each search term appears in the URL and/or page title
557 647 // so that we can score as well as provide autocomplete highlighting.
558 // Collect all term start positions so we can see if they appear in order. 648 string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec()));
559 std::vector<size_t> term_locations; 649 // Strip any 'http://' prefix before matching.
560 int out_of_order = 0; // Count the terms which are out of order. 650 if (url_util::FindAndCompareScheme(url, chrome::kHttpScheme, NULL)) {
561 size_t start_location_total = 0; 651 match.prefix_adjust = strlen(chrome::kHttpScheme) + 3; // Allow for '://'.
562 size_t term_length_total = 0; 652 url = url.substr(match.prefix_adjust);
563 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
564 ++iter) {
565 string16 term = *iter;
566 size_t term_location = url.find(term);
567 if (term_location == string16::npos)
568 return 0; // A term was not found. This should not happen.
569 if (iter != terms.begin()) {
570 // See if this term is out-of-order.
571 for (std::vector<size_t>::const_iterator order_iter =
572 term_locations.begin(); order_iter != term_locations.end();
573 ++order_iter) {
574 if (term_location <= *order_iter)
575 ++out_of_order;
576 }
577 } else {
578 *first_term_location = term_location;
579 }
580 term_locations.push_back(term_location);
581 start_location_total += term_location;
582 term_length_total += term.size();
583 } 653 }
584 654
585 // Calculate a raw score. 655 string16 title = l10n_util::ToLower(row.title());
586 // TODO(mrossetti): This is good enough for now but must be fine-tuned. 656 int term_num = 0;
587 const float kOrderMaxValue = 10.0; 657 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
588 float order_value = 10.0; 658 ++iter, ++term_num) {
589 if (terms.size() > 1) { 659 string16 term = *iter;
590 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; 660 TermMatches url_term_matches = MatchTermInString(term, url, term_num);
591 order_value = 661 TermMatches title_term_matches = MatchTermInString(term, title, term_num);
592 (static_cast<float>(max_possible_out_of_order - out_of_order) / 662 if (url_term_matches.empty() && title_term_matches.empty())
593 max_possible_out_of_order) * kOrderMaxValue; 663 return match; // A term was not found in either URL or title - reject.
664 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),
665 url_term_matches.end());
666 match.title_matches.insert(match.title_matches.end(),
667 title_term_matches.begin(),
668 title_term_matches.end());
594 } 669 }
595 670
596 const float kStartMaxValue = 10.0; 671 // Sort matches by offset and eliminate any which overlap.
597 const size_t kMaxSignificantStart = 20; 672 match.url_matches = SortAndDeoverlap(match.url_matches);
598 float start_value = 673 match.title_matches = SortAndDeoverlap(match.title_matches);
599 (static_cast<float>(kMaxSignificantStart -
600 std::min(kMaxSignificantStart, start_location_total / terms.size()))) /
601 static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
602 674
603 const float kCompleteMaxValue = 10.0; 675 // Get partial scores based on term matching. Note that the score for
604 float complete_value = 676 // each of the URL and title are adjusted by the fraction of the
605 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) * 677 // terms appearing in each.
606 kStartMaxValue; 678 int url_score = RawScoreForMatches(match.url_matches, url.size()) *
679 match.url_matches.size() / terms.size();
680 int title_score = RawScoreForMatches(match.title_matches, title.size()) *
681 match.title_matches.size() / terms.size();
682 // Arbitrarily pick the best.
683 // TODO(mrossetti): It might make sense that a term which appears in both the
684 // URL and the Title should boost the score a bit.
685 int term_score = std::max(url_score, title_score);
607 686
608 const float kLastVisitMaxValue = 10.0; 687 // Factor in attributes of the URLRow.
609 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); 688 // Items which have been visited recently score higher.
610 int64 delta_time = (kMaxSignificantDay - 689 int64 delta_time = (kMaxSignificantDay -
611 std::min((base::Time::Now() - row.last_visit()), 690 std::min((base::Time::Now() - row.last_visit()),
612 kMaxSignificantDay)).ToInternalValue(); 691 kMaxSignificantDay)).ToInternalValue();
613 float last_visit_value = 692 float last_visit_value =
614 (static_cast<float>(delta_time) / 693 (static_cast<float>(delta_time) /
615 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * 694 static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
616 kLastVisitMaxValue; 695 kLastVisitMaxValue;
617 696
618 const float kVisitCountMaxValue = 10.0;
619 const int kMaxSignificantVisits = 10;
620 float visit_count_value = 697 float visit_count_value =
621 (static_cast<float>(std::min(row.visit_count(), 698 (static_cast<float>(std::min(row.visit_count(),
622 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * 699 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
623 kVisitCountMaxValue; 700 kVisitCountMaxValue;
624 701
625 const float kTypedCountMaxValue = 20.0;
626 const int kMaxSignificantTyped = 10;
627 float typed_count_value = 702 float typed_count_value =
628 (static_cast<float>(std::min(row.typed_count(), 703 (static_cast<float>(std::min(row.typed_count(),
629 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * 704 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *
630 kTypedCountMaxValue; 705 kTypedCountMaxValue;
631 706
632 float raw_score = order_value + start_value + complete_value + 707 float raw_score = term_score + last_visit_value + visit_count_value +
633 last_visit_value + visit_count_value + typed_count_value; 708 typed_count_value;
634 709
635 // Normalize the score. 710 // Normalize the score.
636 const float kMaxNormalizedRawScore = 1000.0; 711 match.raw_score = static_cast<int>((raw_score / kMaxRawScore) *
637 raw_score = 712 kMaxNormalizedRawScore);
638 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + 713 return match;
639 kLastVisitMaxValue + kVisitCountMaxValue +
640 kTypedCountMaxValue)) *
641 kMaxNormalizedRawScore;
642 return static_cast<int>(raw_score);
643 } 714 }
644 715
645 // static 716 int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches,
646 base::Time InMemoryURLIndex::RecentThreshold() { 717 size_t max_length) {
647 return base::Time::Now() - 718 // TODO(mrossetti): This is good enough for now but must be fine-tuned.
648 base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); 719 if (matches.empty())
720 return 0;
721
722 // Search terms appearing in order score higher.
723 float order_value = 10.0;
724 if (matches.size() > 1) {
725 int max_possible_out_of_order = matches.size() - 1;
726 int out_of_order = 0;
727 for (size_t i = 1; i < matches.size(); ++i) {
728 if (matches[i - 1].term_num > matches[i].term_num)
729 ++out_of_order;
730 }
731 order_value =
732 (static_cast<float>(max_possible_out_of_order - out_of_order) /
733 max_possible_out_of_order) * kOrderMaxValue;
734 }
735
736 // Search terms which appear earlier score higher.
737 // No points if the first search term does not appear in the first
738 // kMaxSignificantStart chars.
739 float start_value =
740 (static_cast<float>(kMaxSignificantStart -
741 std::min(kMaxSignificantStart, matches[0].offset))) /
742 static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
743
744 // Search terms which comprise a greater portion score higher.
745 size_t term_length_total = std::accumulate(matches.begin(), matches.end(),
746 0, AccumulateMatchLength);
747 float complete_value =
748 (static_cast<float>(term_length_total) / static_cast<float>(max_length)) *
749 kStartMaxValue;
750
751 return static_cast<int>(order_value + start_value + complete_value);
649 } 752 }
650 753
651 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( 754 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
652 const InMemoryURLIndex& index, 755 const InMemoryURLIndex& index,
653 const String16Vector& lower_terms) 756 const String16Vector& lower_terms)
654 : index_(index), 757 : index_(index),
655 lower_terms_(lower_terms) { 758 lower_terms_(lower_terms) {
656 } 759 }
657 760
658 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} 761 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}
659 762
660 void InMemoryURLIndex::AddHistoryMatch::operator()( 763 void InMemoryURLIndex::AddHistoryMatch::operator()(
661 const InMemoryURLIndex::HistoryID history_id) { 764 const InMemoryURLIndex::HistoryID history_id) {
662 HistoryInfoMap::const_iterator hist_pos = 765 HistoryInfoMap::const_iterator hist_pos =
663 index_.history_info_map_.find(history_id); 766 index_.history_info_map_.find(history_id);
664 // Note that a history_id may be present in the word_id_history_map_ yet not 767 // Note that a history_id may be present in the word_id_history_map_ yet not
665 // be found in the history_info_map_. This occurs when an item has been 768 // be found in the history_info_map_. This occurs when an item has been
666 // deleted by the user or the item no longer qualifies as a quick result. 769 // deleted by the user or the item no longer qualifies as a quick result.
667 if (hist_pos != index_.history_info_map_.end()) { 770 if (hist_pos != index_.history_info_map_.end()) {
668 const URLRow& hist_item = hist_pos->second; 771 const URLRow& hist_item = hist_pos->second;
669 // TODO(mrossetti): Accommodate multiple term highlighting. 772 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));
670 size_t input_location = 0; 773 if (match.raw_score != 0) {
671 int score = InMemoryURLIndex::RawScoreForURL(
672 hist_item, lower_terms_, &input_location);
673 if (score != 0) {
674 // We only retain the top 10 highest scoring results so 774 // We only retain the top 10 highest scoring results so
675 // see if this one fits into the top 10 and, if so, where. 775 // see if this one fits into the top 10 and, if so, where.
676 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); 776 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
677 while (scored_iter != scored_matches_.end() && 777 while (scored_iter != scored_matches_.end() &&
678 (*scored_iter).raw_score > score) 778 (*scored_iter).raw_score > match.raw_score)
679 ++scored_iter; 779 ++scored_iter;
680 if ((scored_matches_.size() < 10) || 780 if ((scored_matches_.size() < 10) ||
681 (scored_iter != scored_matches_.end())) { 781 (scored_iter != scored_matches_.end())) {
682 // Create and insert the new item. 782 // Insert the new item.
683 // TODO(mrossetti): Properly set |match_in_scheme| and
684 // |innermost_match|.
685 bool match_in_scheme = false;
686 bool innermost_match = true;
687 ScoredHistoryMatch match(hist_item, input_location,
688 match_in_scheme, innermost_match, score);
689 if (!scored_matches_.empty()) 783 if (!scored_matches_.empty())
690 scored_matches_.insert(scored_iter, match); 784 scored_matches_.insert(scored_iter, match);
691 else 785 else
692 scored_matches_.push_back(match); 786 scored_matches_.push_back(match);
693 // Trim any entries beyond 10. 787 // Trim any entries beyond 10.
694 if (scored_matches_.size() > 10) { 788 if (scored_matches_.size() > 10) {
695 scored_matches_.erase(scored_matches_.begin() + 10, 789 scored_matches_.erase(scored_matches_.begin() + 10,
696 scored_matches_.end()); 790 scored_matches_.end());
697 } 791 }
698 } 792 }
(...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after
917 if (iter->has_title()) { 1011 if (iter->has_title()) {
918 string16 title(UTF8ToUTF16(iter->title())); 1012 string16 title(UTF8ToUTF16(iter->title()));
919 url_row.set_title(title); 1013 url_row.set_title(title);
920 } 1014 }
921 history_info_map_[history_id] = url_row; 1015 history_info_map_[history_id] = url_row;
922 } 1016 }
923 return true; 1017 return true;
924 } 1018 }
925 1019
926 } // namespace history 1020 } // namespace history
OLDNEW
« no previous file with comments | « chrome/browser/history/in_memory_url_index.h ('k') | chrome/browser/history/in_memory_url_index_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698