Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(127)

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: '' Created 9 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/history/in_memory_url_index.h" 5 #include "chrome/browser/history/in_memory_url_index.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <iterator> 8 #include <iterator>
9 #include <limits> 9 #include <limits>
10 #include <numeric>
10 11
11 #include "base/file_util.h" 12 #include "base/file_util.h"
12 #include "base/i18n/break_iterator.h" 13 #include "base/i18n/break_iterator.h"
13 #include "base/metrics/histogram.h" 14 #include "base/metrics/histogram.h"
14 #include "base/string_util.h" 15 #include "base/string_util.h"
15 #include "base/time.h" 16 #include "base/time.h"
16 #include "base/utf_string_conversions.h" 17 #include "base/utf_string_conversions.h"
17 #include "chrome/browser/autocomplete/history_provider_util.h" 18 #include "chrome/browser/autocomplete/history_provider_util.h"
18 #include "chrome/browser/history/url_database.h" 19 #include "chrome/browser/history/url_database.h"
19 #include "chrome/browser/profiles/profile.h" 20 #include "chrome/browser/profiles/profile.h"
20 #include "net/base/escape.h" 21 #include "net/base/escape.h"
21 #include "net/base/net_util.h" 22 #include "net/base/net_util.h"
22 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" 23 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
23 #include "ui/base/l10n/l10n_util.h" 24 #include "ui/base/l10n/l10n_util.h"
24 25
25 using google::protobuf::RepeatedField; 26 using google::protobuf::RepeatedField;
26 using google::protobuf::RepeatedPtrField; 27 using google::protobuf::RepeatedPtrField;
28 using in_memory_url_index::InMemoryURLIndexCacheItem;
27 29
28 using in_memory_url_index::InMemoryURLIndexCacheItem;
29 namespace history { 30 namespace history {
30 31
31 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; 32 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
32 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; 33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; 34 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
34 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; 35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry 36 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
36 CharWordMapEntry; 37 CharWordMapEntry;
37 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem 38 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
38 WordIDHistoryMapItem; 39 WordIDHistoryMapItem;
39 typedef imui:: 40 typedef imui::
40 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry 41 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
41 WordIDHistoryMapEntry; 42 WordIDHistoryMapEntry;
42 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; 43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry 44 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
44 HistoryInfoMapEntry; 45 HistoryInfoMapEntry;
45 46
46 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; 47 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;
47 48
49 const float kOrderMaxValue = 50.0;
50 const float kStartMaxValue = 50.0;
51 const size_t kMaxSignificantStart = 20;
52 const float kCompleteMaxValue = 50.0;
53 const float kLastVisitMaxValue = 50.0;
54 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
55 const float kVisitCountMaxValue = 50.0;
56 const int kMaxSignificantVisits = 20;
57 const float kTypedCountMaxValue = 50.0;
58 const int kMaxSignificantTyped = 20;
59 const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
60 kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue;
61 const float kMaxNormalizedRawScore = 1000.0;
62
48 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} 63 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}
49 64
50 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, 65 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)
51 size_t input_location, 66 : HistoryMatch(url_info, 0, false, false),
52 bool match_in_scheme, 67 raw_score(0) {}
53 bool innermost_match,
54 int score)
55 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),
56 raw_score(score) {
57 }
58 68
59 struct InMemoryURLIndex::TermCharWordSet { 69 struct InMemoryURLIndex::TermCharWordSet {
60 TermCharWordSet() // Required for STL resize(). 70 TermCharWordSet() // Required for STL resize().
61 : char_(0), 71 : char_(0),
62 word_id_set_(), 72 word_id_set_(),
63 used_(false) {} 73 used_(false) {}
64 TermCharWordSet(const char16& uni_char, 74 TermCharWordSet(const char16& uni_char,
65 const WordIDSet& word_id_set, 75 const WordIDSet& word_id_set,
66 bool used) 76 bool used)
67 : char_(uni_char), 77 : char_(uni_char),
68 word_id_set_(word_id_set), 78 word_id_set_(word_id_set),
69 used_(used) {} 79 used_(used) {}
70 80
71 // Predicate for STL algorithm use. 81 // Predicate for STL algorithm use.
72 bool is_not_used() const { return !used_; } 82 bool is_not_used() const { return !used_; }
73 83
74 char16 char_; 84 char16 char_;
75 WordIDSet word_id_set_; 85 WordIDSet word_id_set_;
76 bool used_; // true if this set has been used for the current term search. 86 bool used_; // true if this set has been used for the current term search.
77 }; 87 };
78 88
89 // Comparison function for sorting TermMatches by their offsets.
90 bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) {
91 return m1.offset < m2.offset;
92 }
93
94 // std::accumulate helper function to add up TermMatches' lengths.
95 int AddumulateMatchLength(int total, const TermMatch& match) {
Peter Kasting 2011/03/09 02:31:48 Nit: Addumulate -> Accumulate
mrossetti 2011/03/10 01:31:14 LOL
96 return total + match.length;
97 }
98
79 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) 99 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)
80 : history_dir_(history_dir), 100 : history_dir_(history_dir),
81 history_item_count_(0) { 101 history_item_count_(0) {
82 } 102 }
83 103
84 // Called only by unit tests. 104 // Called only by unit tests.
85 InMemoryURLIndex::InMemoryURLIndex() 105 InMemoryURLIndex::InMemoryURLIndex()
86 : history_item_count_(0) { 106 : history_item_count_(0) {
87 } 107 }
88 108
(...skipping 13 matching lines...) Expand all
102 SaveToCacheFile(); 122 SaveToCacheFile();
103 } 123 }
104 124
105 bool InMemoryURLIndex::IndexRow(const URLRow& row) { 125 bool InMemoryURLIndex::IndexRow(const URLRow& row) {
106 const GURL& gurl(row.url()); 126 const GURL& gurl(row.url());
107 string16 url(net::FormatUrl(gurl, languages_, 127 string16 url(net::FormatUrl(gurl, languages_,
108 net::kFormatUrlOmitUsernamePassword, 128 net::kFormatUrlOmitUsernamePassword,
109 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 129 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
110 NULL, NULL, NULL)); 130 NULL, NULL, NULL));
111 131
112 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
113 HistoryID history_id = static_cast<HistoryID>(row.id()); 132 HistoryID history_id = static_cast<HistoryID>(row.id());
133 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max());
114 134
115 // Add the row for quick lookup in the history info store. 135 // Add the row for quick lookup in the history info store.
116 url = l10n_util::ToLower(url);
117 URLRow new_row(GURL(url), row.id()); 136 URLRow new_row(GURL(url), row.id());
118 new_row.set_visit_count(row.visit_count()); 137 new_row.set_visit_count(row.visit_count());
119 new_row.set_typed_count(row.typed_count()); 138 new_row.set_typed_count(row.typed_count());
120 new_row.set_last_visit(row.last_visit()); 139 new_row.set_last_visit(row.last_visit());
121 new_row.set_title(row.title()); 140 new_row.set_title(row.title());
122 history_info_map_[history_id] = new_row; 141 history_info_map_[history_id] = new_row;
123 142
124 // Split into individual, unique words. 143 // Split URL into individual, unique words then add in the title words.
125 String16Set words = WordSetFromString16(url); 144 url = l10n_util::ToLower(url);
145 String16Set url_words = WordSetFromString16(url);
146 String16Set title_words = WordSetFromString16(row.title());
147 String16Set words;
148 std::set_union(url_words.begin(), url_words.end(),
149 title_words.begin(), title_words.end(),
150 std::insert_iterator<String16Set>(words, words.begin()));
151 for (String16Set::iterator word_iter = words.begin();
152 word_iter != words.end(); ++word_iter)
153 AddWordToIndex(*word_iter, history_id);
126 154
127 // For each word, add a new entry into the word index referring to the
128 // associated history item.
129 for (String16Set::iterator iter = words.begin();
130 iter != words.end(); ++iter) {
131 String16Set::value_type uni_word = *iter;
132 AddWordToIndex(uni_word, history_id);
133 }
134 ++history_item_count_; 155 ++history_item_count_;
135 return true; 156 return true;
136 } 157 }
137 158
138 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db, 159 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,
139 bool clear_cache) { 160 bool clear_cache) {
140 ClearPrivateData(); 161 ClearPrivateData();
141 162
142 if (!history_db) 163 if (!history_db)
143 return false; 164 return false;
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after
285 ScoredHistoryMatches scored_items; 306 ScoredHistoryMatches scored_items;
286 if (!terms.empty()) { 307 if (!terms.empty()) {
287 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- 308 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
288 // and-sweep approach. 309 // and-sweep approach.
289 ResetTermCharWordSetCache(); 310 ResetTermCharWordSetCache();
290 311
291 // Lowercase the terms. 312 // Lowercase the terms.
292 // TODO(mrossetti): Another opportunity for a transform algorithm. 313 // TODO(mrossetti): Another opportunity for a transform algorithm.
293 String16Vector lower_terms; 314 String16Vector lower_terms;
294 for (String16Vector::const_iterator term_iter = terms.begin(); 315 for (String16Vector::const_iterator term_iter = terms.begin();
295 term_iter != terms.end(); ++term_iter) { 316 term_iter != terms.end(); ++term_iter)
296 String16Vector::value_type lower_string(*term_iter); 317 lower_terms.push_back(InMemoryURLIndex::ToLower(*term_iter));
297 std::transform(lower_string.begin(),
298 lower_string.end(),
299 lower_string.begin(),
300 tolower);
301 lower_terms.push_back(lower_string);
302 }
303 318
304 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); 319 String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
305 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); 320 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
306 321
307 // Pass over all of the candidates filtering out any without a proper 322 // Pass over all of the candidates filtering out any without a proper
308 // substring match, inserting those which pass in order by score. 323 // substring match, inserting those which pass in order by score.
309 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), 324 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
310 AddHistoryMatch(*this, 325 AddHistoryMatch(*this,
311 lower_terms)).ScoredMatches(); 326 lower_terms)).ScoredMatches();
312 } 327 }
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
365 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( 380 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
366 const string16& uni_word) { 381 const string16& uni_word) {
367 InMemoryURLIndex::HistoryIDSet history_id_set; 382 InMemoryURLIndex::HistoryIDSet history_id_set;
368 383
369 // For each unique character in the word, in order of first appearance, get 384 // For each unique character in the word, in order of first appearance, get
370 // the char/word_id map entry and intersect with the set in an incremental 385 // the char/word_id map entry and intersect with the set in an incremental
371 // manner. 386 // manner.
372 Char16Vector uni_chars = Char16VectorFromString16(uni_word); 387 Char16Vector uni_chars = Char16VectorFromString16(uni_word);
373 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); 388 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
374 389
390 // TODO(mrossetti): At this point, as a possible optimization, we could
391 // scan through all candidate words and make sure the |uni_word| is a
392 // substring within the candidate words, eliminating those which aren't.
393 // I'm not sure it would be worth the effort. And remember, we've got to do
394 // a final substring match in order to get the highlighting ranges later
395 // in the process in any case.
396
375 // If any words resulted then we can compose a set of history IDs by unioning 397 // If any words resulted then we can compose a set of history IDs by unioning
376 // the sets from each word. 398 // the sets from each word.
377 if (!word_id_set.empty()) { 399 if (!word_id_set.empty()) {
378 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 400 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
379 word_id_iter != word_id_set.end(); ++word_id_iter) { 401 word_id_iter != word_id_set.end(); ++word_id_iter) {
380 WordID word_id = *word_id_iter; 402 WordID word_id = *word_id_iter;
381 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 403 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
382 if (word_iter != word_id_history_map_.end()) { 404 if (word_iter != word_id_history_map_.end()) {
383 HistoryIDSet& word_history_id_set(word_iter->second); 405 HistoryIDSet& word_history_id_set(word_iter->second);
384 history_id_set.insert(word_history_id_set.begin(), 406 history_id_set.insert(word_history_id_set.begin(),
385 word_history_id_set.end()); 407 word_history_id_set.end());
386 } 408 }
387 } 409 }
388 } 410 }
389 411
390 return history_id_set; 412 return history_id_set;
391 } 413 }
392 414
393 // Utility Functions 415 // Utility Functions
394 416
395 // static 417 // static
418 string16 InMemoryURLIndex::ToLower(const string16& upper_string) {
419 string16 lower_string(upper_string);
Peter Kasting 2011/03/09 02:31:48 Nit: How about: string16 lower_string; lower_
mrossetti 2011/03/10 01:31:14 This function replaced with l10n_util::ToLower().
420 std::transform(lower_string.begin(),
421 lower_string.end(),
422 lower_string.begin(),
423 tolower);
Peter Kasting 2011/03/09 02:31:48 Is this calling the C tolower() function? If so,
mrossetti 2011/03/10 01:31:14 Done. Replaced function.
424 return lower_string;
425 }
426
427 // static
396 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( 428 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
397 const string16& uni_string) { 429 const string16& uni_string) {
398 String16Set words; 430 String16Vector words = WordVectorFromString16(uni_string, false);
399 base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD); 431 String16Set word_set;
400 if (iter.Init()) { 432 for (String16Vector::const_iterator iter = words.begin(); iter != words.end();
401 while (iter.Advance()) { 433 ++iter)
434 word_set.insert(InMemoryURLIndex::ToLower(*iter));
435 return word_set;
436 }
437
438 // static
439 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(
440 const string16& uni_string,
441 bool break_on_space) {
442 base::BreakIterator iter(&uni_string, break_on_space ?
443 base::BreakIterator::BREAK_SPACE :
Peter Kasting 2011/03/09 02:31:48 Nit: You can collapse lines 2 & 3 to a single line
mrossetti 2011/03/10 01:31:14 Done.
444 base::BreakIterator::BREAK_WORD);
445 String16Vector words;
446 if (break_on_space) {
447 if (iter.Init()) {
448 while (iter.Advance()) {
449 string16 word = iter.GetString();
450 TrimWhitespace(word, TRIM_ALL, &word);
451 if (word.size())
Peter Kasting 2011/03/09 02:31:48 Nit: !word.empty()
mrossetti 2011/03/10 01:31:14 Done.
452 words.push_back(word);
453 }
454 }
455 } else {
456 if (iter.Init()) {
402 if (iter.IsWord()) 457 if (iter.IsWord())
403 words.insert(iter.GetString()); 458 words.push_back(iter.GetString());
Peter Kasting 2011/03/09 02:31:48 I confess, I'm a little confused why one arm of th
mrossetti 2011/03/10 01:31:14 The iterator behaves differently depending on the
Peter Kasting 2011/03/10 02:13:43 If BreakIterator should be changed, can you file a
459 while (iter.Advance()) {
460 if (iter.IsWord())
461 words.push_back(iter.GetString());
462 }
404 } 463 }
405 } 464 }
406 return words; 465 return words;
407 } 466 }
408 467
409 // static 468 // static
410 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16( 469 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16(
411 const string16& uni_word) { 470 const string16& uni_word) {
412 InMemoryURLIndex::Char16Vector characters; 471 InMemoryURLIndex::Char16Vector characters;
413 InMemoryURLIndex::Char16Set unique_characters; 472 InMemoryURLIndex::Char16Set unique_characters;
(...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after
539 for (Char16Vector::const_iterator c_iter = uni_chars.begin(); 598 for (Char16Vector::const_iterator c_iter = uni_chars.begin();
540 (c_iter != uni_chars.end()) && 599 (c_iter != uni_chars.end()) &&
541 (t_iter != term_char_word_set_cache_.end()) && 600 (t_iter != term_char_word_set_cache_.end()) &&
542 (*c_iter == t_iter->char_); 601 (*c_iter == t_iter->char_);
543 ++c_iter, ++t_iter) 602 ++c_iter, ++t_iter)
544 t_iter->used_ = true; // Update the cache. 603 t_iter->used_ = true; // Update the cache.
545 return t_iter - term_char_word_set_cache_.begin() - 1; 604 return t_iter - term_char_word_set_cache_.begin() - 1;
546 } 605 }
547 606
548 // static 607 // static
549 int InMemoryURLIndex::RawScoreForURL(const URLRow& row, 608 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term,
550 const String16Vector& terms, 609 const string16& string,
551 size_t* first_term_location) { 610 int term_num) {
611 TermMatches matches;
612 for (size_t location = string.find(term); location != string16::npos;
613 location = string.find(term, location + 1))
614 matches.push_back(TermMatch(term_num, location, term.size()));
615 return matches;
616 }
617
618 // static
619 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {
620 if (matches.empty())
621 return matches;
622 TermMatches sorted_matches = matches;
623 std::sort(sorted_matches.begin(), sorted_matches.end(),
624 CompareMatchOffset);
625 TermMatches clean_matches;
626 TermMatch last_match = sorted_matches[0];
627 clean_matches.push_back(last_match);
628 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;
629 iter != sorted_matches.end(); ++iter) {
630 if (iter->offset >= last_match.offset + last_match.length) {
631 last_match = *iter;
632 clean_matches.push_back(last_match);
633 }
634 }
635 return clean_matches;
636 }
637
638 // static
639 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(
640 const URLRow& row,
641 const String16Vector& terms) {
642 ScoredHistoryMatch match(row);
552 GURL gurl = row.url(); 643 GURL gurl = row.url();
553 if (!gurl.is_valid()) 644 if (!gurl.is_valid())
554 return 0; 645 return match;
555 646
556 string16 url = UTF8ToUTF16(gurl.spec()); 647 // Figure out where each search term appears in the URL and/or page title
557 648 // so that we can score as well as provide autocomplete highlighting.
558 // Collect all term start positions so we can see if they appear in order. 649 string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec()));
559 std::vector<size_t> term_locations; 650 url = l10n_util::ToLower(url);
560 int out_of_order = 0; // Count the terms which are out of order. 651 string16 title = row.title();
561 size_t start_location_total = 0; 652 int term_num = 0;
562 size_t term_length_total = 0;
563 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); 653 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
564 ++iter) { 654 ++iter, ++term_num) {
565 string16 term = *iter; 655 string16 term = *iter;
566 size_t term_location = url.find(term); 656 TermMatches url_term_matches = MatchTermInString(term, url, term_num);
567 if (term_location == string16::npos) 657 TermMatches title_term_matches = MatchTermInString(term, title, term_num);
568 return 0; // A term was not found. This should not happen. 658 if (url_term_matches.empty() && title_term_matches.empty())
569 if (iter != terms.begin()) { 659 return match; // A term was not found in either URL or title - reject.
570 // See if this term is out-of-order. 660 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),
571 for (std::vector<size_t>::const_iterator order_iter = 661 url_term_matches.end());
572 term_locations.begin(); order_iter != term_locations.end(); 662 match.title_matches.insert(match.title_matches.end(),
573 ++order_iter) { 663 title_term_matches.begin(),
574 if (term_location <= *order_iter) 664 title_term_matches.end());
575 ++out_of_order;
576 }
577 } else {
578 *first_term_location = term_location;
579 }
580 term_locations.push_back(term_location);
581 start_location_total += term_location;
582 term_length_total += term.size();
583 } 665 }
584 666
585 // Calculate a raw score. 667 // Sort matches by offset and eliminate any which overlap.
586 // TODO(mrossetti): This is good enough for now but must be fine-tuned. 668 match.url_matches = SortAndDeoverlap(match.url_matches);
587 const float kOrderMaxValue = 10.0; 669 match.title_matches = SortAndDeoverlap(match.title_matches);
588 float order_value = 10.0;
589 if (terms.size() > 1) {
590 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;
591 order_value =
592 (static_cast<float>(max_possible_out_of_order - out_of_order) /
593 max_possible_out_of_order) * kOrderMaxValue;
594 }
595 670
596 const float kStartMaxValue = 10.0; 671 // Get partial scores based on term matching. Note that the score for
597 const size_t kMaxSignificantStart = 20; 672 // each of the URL and title are adjusted by the fraction of the
598 float start_value = 673 // terms appearing in each.
599 (static_cast<float>(kMaxSignificantStart - 674 int url_score = RawScoreForMatches(match.url_matches, url.size()) *
600 std::min(kMaxSignificantStart, start_location_total / terms.size()))) / 675 match.url_matches.size() / terms.size();
601 static_cast<float>(kMaxSignificantStart) * kStartMaxValue; 676 int title_score = RawScoreForMatches(match.title_matches, title.size()) *
677 match.title_matches.size() / terms.size();
678 // Arbitrarily pick the best.
679 // TODO(mrossetti): It might make sense that a term which appears in both the
680 // URL and the Title should boost the score a bit.
681 int term_score = std::max(url_score, title_score);
602 682
603 const float kCompleteMaxValue = 10.0; 683 // Factor in attributes of the URLRow.
604 float complete_value = 684 // Items which have been visited recently score higher.
605 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *
606 kStartMaxValue;
607
608 const float kLastVisitMaxValue = 10.0;
609 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
610 int64 delta_time = (kMaxSignificantDay - 685 int64 delta_time = (kMaxSignificantDay -
611 std::min((base::Time::Now() - row.last_visit()), 686 std::min((base::Time::Now() - row.last_visit()),
612 kMaxSignificantDay)).ToInternalValue(); 687 kMaxSignificantDay)).ToInternalValue();
613 float last_visit_value = 688 float last_visit_value =
614 (static_cast<float>(delta_time) / 689 (static_cast<float>(delta_time) /
615 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * 690 static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
616 kLastVisitMaxValue; 691 kLastVisitMaxValue;
617 692
618 const float kVisitCountMaxValue = 10.0;
619 const int kMaxSignificantVisits = 10;
620 float visit_count_value = 693 float visit_count_value =
621 (static_cast<float>(std::min(row.visit_count(), 694 (static_cast<float>(std::min(row.visit_count(),
622 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * 695 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
623 kVisitCountMaxValue; 696 kVisitCountMaxValue;
624 697
625 const float kTypedCountMaxValue = 20.0;
626 const int kMaxSignificantTyped = 10;
627 float typed_count_value = 698 float typed_count_value =
628 (static_cast<float>(std::min(row.typed_count(), 699 (static_cast<float>(std::min(row.typed_count(),
629 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * 700 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *
630 kTypedCountMaxValue; 701 kTypedCountMaxValue;
631 702
632 float raw_score = order_value + start_value + complete_value + 703 float raw_score = term_score + last_visit_value + visit_count_value +
633 last_visit_value + visit_count_value + typed_count_value; 704 typed_count_value;
634 705
635 // Normalize the score. 706 // Normalize the score.
636 const float kMaxNormalizedRawScore = 1000.0; 707 match.raw_score = static_cast<int>((raw_score / kMaxRawScore) *
637 raw_score = 708 kMaxNormalizedRawScore);
638 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + 709
639 kLastVisitMaxValue + kVisitCountMaxValue + 710 return match;
640 kTypedCountMaxValue)) *
641 kMaxNormalizedRawScore;
642 return static_cast<int>(raw_score);
643 } 711 }
644 712
645 // static 713 int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches,
646 base::Time InMemoryURLIndex::RecentThreshold() { 714 size_t max_length) {
647 return base::Time::Now() - 715 // TODO(mrossetti): This is good enough for now but must be fine-tuned.
648 base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); 716 if (matches.empty())
717 return 0;
718
719 // Search terms appearing in order score higher.
720 float order_value = 10.0;
721 if (matches.size() > 1) {
722 int max_possible_out_of_order = matches.size() - 1;
723 int out_of_order = 0;
724 for (size_t i = 1; i < matches.size(); ++i) {
725 if (matches[i-1].term_num > matches[i].term_num)
Peter Kasting 2011/03/09 02:31:48 Nit: Spaces around '-'
mrossetti 2011/03/10 01:31:14 Done.
726 ++out_of_order;
Peter Kasting 2011/03/09 02:31:48 Nit: Indent 2
mrossetti 2011/03/10 01:31:14 Done.
727 }
728 order_value =
729 (static_cast<float>(max_possible_out_of_order - out_of_order) /
730 max_possible_out_of_order) * kOrderMaxValue;
731 }
732
733 // Search terms which appear earlier score higher.
734 // No points if the first search term does not appear in the first
735 // kMaxSignificantStart chars.
736 float start_value =
737 (static_cast<float>(kMaxSignificantStart -
738 std::min(kMaxSignificantStart, matches[0].offset))) /
739 static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
Peter Kasting 2011/03/09 02:31:48 Nit: Is this second cast needed?
mrossetti 2011/03/10 01:31:14 Yes.
740
741 // Search terms which comprise a greater portion score higher.
742 size_t term_length_total = std::accumulate(matches.begin(), matches.end(),
743 0, AddumulateMatchLength);
744 float complete_value =
745 (static_cast<float>(term_length_total) / static_cast<float>(max_length)) *
746 kStartMaxValue;
747
748 return static_cast<int>(order_value + start_value + complete_value);
649 } 749 }
650 750
651 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( 751 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
652 const InMemoryURLIndex& index, 752 const InMemoryURLIndex& index,
653 const String16Vector& lower_terms) 753 const String16Vector& lower_terms)
654 : index_(index), 754 : index_(index),
655 lower_terms_(lower_terms) { 755 lower_terms_(lower_terms) {
656 } 756 }
657 757
658 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} 758 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}
659 759
660 void InMemoryURLIndex::AddHistoryMatch::operator()( 760 void InMemoryURLIndex::AddHistoryMatch::operator()(
661 const InMemoryURLIndex::HistoryID history_id) { 761 const InMemoryURLIndex::HistoryID history_id) {
662 HistoryInfoMap::const_iterator hist_pos = 762 HistoryInfoMap::const_iterator hist_pos =
663 index_.history_info_map_.find(history_id); 763 index_.history_info_map_.find(history_id);
664 // Note that a history_id may be present in the word_id_history_map_ yet not 764 // Note that a history_id may be present in the word_id_history_map_ yet not
665 // be found in the history_info_map_. This occurs when an item has been 765 // be found in the history_info_map_. This occurs when an item has been
666 // deleted by the user or the item no longer qualifies as a quick result. 766 // deleted by the user or the item no longer qualifies as a quick result.
667 if (hist_pos != index_.history_info_map_.end()) { 767 if (hist_pos != index_.history_info_map_.end()) {
668 const URLRow& hist_item = hist_pos->second; 768 const URLRow& hist_item = hist_pos->second;
669 // TODO(mrossetti): Accommodate multiple term highlighting. 769 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));
670 size_t input_location = 0; 770 if (match.raw_score != 0) {
671 int score = InMemoryURLIndex::RawScoreForURL(
672 hist_item, lower_terms_, &input_location);
673 if (score != 0) {
674 // We only retain the top 10 highest scoring results so 771 // We only retain the top 10 highest scoring results so
675 // see if this one fits into the top 10 and, if so, where. 772 // see if this one fits into the top 10 and, if so, where.
676 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); 773 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
677 while (scored_iter != scored_matches_.end() && 774 while (scored_iter != scored_matches_.end() &&
678 (*scored_iter).raw_score > score) 775 (*scored_iter).raw_score > match.raw_score)
679 ++scored_iter; 776 ++scored_iter;
680 if ((scored_matches_.size() < 10) || 777 if ((scored_matches_.size() < 10) ||
681 (scored_iter != scored_matches_.end())) { 778 (scored_iter != scored_matches_.end())) {
682 // Create and insert the new item. 779 // Insert the new item.
683 // TODO(mrossetti): Properly set |match_in_scheme| and
684 // |innermost_match|.
685 bool match_in_scheme = false;
686 bool innermost_match = true;
687 ScoredHistoryMatch match(hist_item, input_location,
688 match_in_scheme, innermost_match, score);
689 if (!scored_matches_.empty()) 780 if (!scored_matches_.empty())
690 scored_matches_.insert(scored_iter, match); 781 scored_matches_.insert(scored_iter, match);
691 else 782 else
692 scored_matches_.push_back(match); 783 scored_matches_.push_back(match);
693 // Trim any entries beyond 10. 784 // Trim any entries beyond 10.
694 if (scored_matches_.size() > 10) { 785 if (scored_matches_.size() > 10) {
695 scored_matches_.erase(scored_matches_.begin() + 10, 786 scored_matches_.erase(scored_matches_.begin() + 10,
696 scored_matches_.end()); 787 scored_matches_.end());
697 } 788 }
698 } 789 }
(...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after
917 if (iter->has_title()) { 1008 if (iter->has_title()) {
918 string16 title(UTF8ToUTF16(iter->title())); 1009 string16 title(UTF8ToUTF16(iter->title()));
919 url_row.set_title(title); 1010 url_row.set_title(title);
920 } 1011 }
921 history_info_map_[history_id] = url_row; 1012 history_info_map_[history_id] = url_row;
922 } 1013 }
923 return true; 1014 return true;
924 } 1015 }
925 1016
926 } // namespace history 1017 } // namespace history
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698