chrome/browser/history/in_memory_url_index.cc - Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be...

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: '' Created 9 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/history/in_memory_url_index.h"	5 #include "chrome/browser/history/in_memory_url_index.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <iterator>	8 #include <iterator>

9 #include <limits>	9 #include <limits>

	10 #include <numeric>

10	11

11 #include "base/file_util.h"	12 #include "base/file_util.h"

12 #include "base/i18n/break_iterator.h"	13 #include "base/i18n/break_iterator.h"

13 #include "base/metrics/histogram.h"	14 #include "base/metrics/histogram.h"

14 #include "base/string_util.h"	15 #include "base/string_util.h"

15 #include "base/time.h"	16 #include "base/time.h"

16 #include "base/utf_string_conversions.h"	17 #include "base/utf_string_conversions.h"

17 #include "chrome/browser/autocomplete/history_provider_util.h"	18 #include "chrome/browser/autocomplete/history_provider_util.h"

18 #include "chrome/browser/history/url_database.h"	19 #include "chrome/browser/history/url_database.h"

19 #include "chrome/browser/profiles/profile.h"	20 #include "chrome/browser/profiles/profile.h"

	21 #include "chrome/common/url_constants.h"

	22 #include "googleurl/src/url_util.h"

20 #include "net/base/escape.h"	23 #include "net/base/escape.h"

21 #include "net/base/net_util.h"	24 #include "net/base/net_util.h"

22 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"	25 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"

23 #include "ui/base/l10n/l10n_util.h"	26 #include "ui/base/l10n/l10n_util.h"

24	27

25 using google::protobuf::RepeatedField;	28 using google::protobuf::RepeatedField;

26 using google::protobuf::RepeatedPtrField;	29 using google::protobuf::RepeatedPtrField;

	30 using in_memory_url_index::InMemoryURLIndexCacheItem;

27	31

28 using in_memory_url_index::InMemoryURLIndexCacheItem;

29 namespace history {	32 namespace history {

30	33

31 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;	34 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;

32 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;	35 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;

33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;	36 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;

34 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;	37 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;

35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry	38 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry

36 CharWordMapEntry;	39 CharWordMapEntry;

37 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem	40 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem

38 WordIDHistoryMapItem;	41 WordIDHistoryMapItem;

39 typedef imui::	42 typedef imui::

40 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry	43 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry

41 WordIDHistoryMapEntry;	44 WordIDHistoryMapEntry;

42 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;	45 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;

43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry	46 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry

44 HistoryInfoMapEntry;	47 HistoryInfoMapEntry;

45	48

46 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;	49 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;

47	50

48 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}	51 const float kOrderMaxValue = 50.0;

	52 const float kStartMaxValue = 50.0;

	53 const size_t kMaxSignificantStart = 20;

	54 const float kCompleteMaxValue = 50.0;

	55 const float kLastVisitMaxValue = 50.0;

	56 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);

	57 const float kVisitCountMaxValue = 50.0;

	58 const int kMaxSignificantVisits = 20;

	59 const float kTypedCountMaxValue = 50.0;

	60 const int kMaxSignificantTyped = 20;

	61 const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +

	62 kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue;

	63 const float kMaxNormalizedRawScore = 1000.0;

49	64

50 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,	65 ScoredHistoryMatch::ScoredHistoryMatch()

51 size_t input_location,	66 : raw_score(0),

52 bool match_in_scheme,	67 prefix_adjust(0) {}

53 bool innermost_match,	68

54 int score)	69 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)

55 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),	70 : HistoryMatch(url_info, 0, false, false),

56 raw_score(score) {	71 raw_score(0),

57 }	72 prefix_adjust(0) {}

58	73

59 struct InMemoryURLIndex::TermCharWordSet {	74 struct InMemoryURLIndex::TermCharWordSet {

60 TermCharWordSet() // Required for STL resize().	75 TermCharWordSet() // Required for STL resize().

61 : char_(0),	76 : char_(0),

62 word_id_set_(),	77 word_id_set_(),

63 used_(false) {}	78 used_(false) {}

64 TermCharWordSet(const char16& uni_char,	79 TermCharWordSet(const char16& uni_char,

65 const WordIDSet& word_id_set,	80 const WordIDSet& word_id_set,

66 bool used)	81 bool used)

67 : char_(uni_char),	82 : char_(uni_char),

68 word_id_set_(word_id_set),	83 word_id_set_(word_id_set),

69 used_(used) {}	84 used_(used) {}

70	85

71 // Predicate for STL algorithm use.	86 // Predicate for STL algorithm use.

72 bool is_not_used() const { return !used_; }	87 bool is_not_used() const { return !used_; }

73	88

74 char16 char_;	89 char16 char_;

75 WordIDSet word_id_set_;	90 WordIDSet word_id_set_;

76 bool used_; // true if this set has been used for the current term search.	91 bool used_; // true if this set has been used for the current term search.

77 };	92 };

78	93

	94 // Comparison function for sorting TermMatches by their offsets.

	95 bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) {

	96 return m1.offset < m2.offset;

	97 }

	98

	99 // std::accumulate helper function to add up TermMatches' lengths.

	100 int AccumulateMatchLength(int total, const TermMatch& match) {

	101 return total + match.length;

	102 }

	103

79 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)	104 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)

80 : history_dir_(history_dir),	105 : history_dir_(history_dir),

81 history_item_count_(0) {	106 history_item_count_(0) {

82 }	107 }

83	108

84 // Called only by unit tests.	109 // Called only by unit tests.

85 InMemoryURLIndex::InMemoryURLIndex()	110 InMemoryURLIndex::InMemoryURLIndex()

86 : history_item_count_(0) {	111 : history_item_count_(0) {

87 }	112 }

88	113

(...skipping 13 matching lines...) Expand all Loading...
102 SaveToCacheFile();	127 SaveToCacheFile();

103 }	128 }

104	129

105 bool InMemoryURLIndex::IndexRow(const URLRow& row) {	130 bool InMemoryURLIndex::IndexRow(const URLRow& row) {

106 const GURL& gurl(row.url());	131 const GURL& gurl(row.url());

107 string16 url(net::FormatUrl(gurl, languages_,	132 string16 url(net::FormatUrl(gurl, languages_,

108 net::kFormatUrlOmitUsernamePassword,	133 net::kFormatUrlOmitUsernamePassword,

109 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS,	134 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS,

110 NULL, NULL, NULL));	135 NULL, NULL, NULL));

111	136

112 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().

113 HistoryID history_id = static_cast<HistoryID>(row.id());	137 HistoryID history_id = static_cast<HistoryID>(row.id());

	138 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max());

114	139

115 // Add the row for quick lookup in the history info store.	140 // Add the row for quick lookup in the history info store.

116 url = l10n_util::ToLower(url);

117 URLRow new_row(GURL(url), row.id());	141 URLRow new_row(GURL(url), row.id());

118 new_row.set_visit_count(row.visit_count());	142 new_row.set_visit_count(row.visit_count());

119 new_row.set_typed_count(row.typed_count());	143 new_row.set_typed_count(row.typed_count());

120 new_row.set_last_visit(row.last_visit());	144 new_row.set_last_visit(row.last_visit());

121 new_row.set_title(row.title());	145 new_row.set_title(row.title());

122 history_info_map_[history_id] = new_row;	146 history_info_map_[history_id] = new_row;

123	147

124 // Split into individual, unique words.	148 // Split URL into individual, unique words then add in the title words.

125 String16Set words = WordSetFromString16(url);	149 url = l10n_util::ToLower(url);

	150 String16Set url_words = WordSetFromString16(url);

	151 String16Set title_words = WordSetFromString16(row.title());

	152 String16Set words;

	153 std::set_union(url_words.begin(), url_words.end(),

	154 title_words.begin(), title_words.end(),

	155 std::insert_iterator<String16Set>(words, words.begin()));

	156 for (String16Set::iterator word_iter = words.begin();

	157 word_iter != words.end(); ++word_iter)

	158 AddWordToIndex(*word_iter, history_id);

126	159

127 // For each word, add a new entry into the word index referring to the

128 // associated history item.

129 for (String16Set::iterator iter = words.begin();

130 iter != words.end(); ++iter) {

131 String16Set::value_type uni_word = *iter;

132 AddWordToIndex(uni_word, history_id);

133 }

134 ++history_item_count_;	160 ++history_item_count_;

135 return true;	161 return true;

136 }	162 }

137	163

138 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,	164 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,

139 bool clear_cache) {	165 bool clear_cache) {

140 ClearPrivateData();	166 ClearPrivateData();

141	167

142 if (!history_db)	168 if (!history_db)

143 return false;	169 return false;

(...skipping 141 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
285 ScoredHistoryMatches scored_items;	311 ScoredHistoryMatches scored_items;

286 if (!terms.empty()) {	312 if (!terms.empty()) {

287 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-	313 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-

288 // and-sweep approach.	314 // and-sweep approach.

289 ResetTermCharWordSetCache();	315 ResetTermCharWordSetCache();

290	316

291 // Lowercase the terms.	317 // Lowercase the terms.

292 // TODO(mrossetti): Another opportunity for a transform algorithm.	318 // TODO(mrossetti): Another opportunity for a transform algorithm.

293 String16Vector lower_terms;	319 String16Vector lower_terms;

294 for (String16Vector::const_iterator term_iter = terms.begin();	320 for (String16Vector::const_iterator term_iter = terms.begin();

295 term_iter != terms.end(); ++term_iter) {	321 term_iter != terms.end(); ++term_iter)

296 String16Vector::value_type lower_string(*term_iter);	322 lower_terms.push_back(l10n_util::ToLower(*term_iter));

297 std::transform(lower_string.begin(),

298 lower_string.end(),

299 lower_string.begin(),

300 tolower);

301 lower_terms.push_back(lower_string);

302 }

303	323

304 String16Vector::value_type all_terms(JoinString(lower_terms, ' '));	324 String16Vector::value_type all_terms(JoinString(lower_terms, ' '));

305 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);	325 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);

306	326

307 // Pass over all of the candidates filtering out any without a proper	327 // Pass over all of the candidates filtering out any without a proper

308 // substring match, inserting those which pass in order by score.	328 // substring match, inserting those which pass in order by score.

309 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),	329 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),

310 AddHistoryMatch(*this,	330 AddHistoryMatch(*this,

311 lower_terms)).ScoredMatches();	331 lower_terms)).ScoredMatches();

312 }	332 }

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
365 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(	385 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(

366 const string16& uni_word) {	386 const string16& uni_word) {

367 InMemoryURLIndex::HistoryIDSet history_id_set;	387 InMemoryURLIndex::HistoryIDSet history_id_set;

368	388

369 // For each unique character in the word, in order of first appearance, get	389 // For each unique character in the word, in order of first appearance, get

370 // the char/word_id map entry and intersect with the set in an incremental	390 // the char/word_id map entry and intersect with the set in an incremental

371 // manner.	391 // manner.

372 Char16Vector uni_chars = Char16VectorFromString16(uni_word);	392 Char16Vector uni_chars = Char16VectorFromString16(uni_word);

373 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));	393 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));

374	394

	395 // TODO(mrossetti): At this point, as a possible optimization, we could

	396 // scan through all candidate words and make sure the \|uni_word\| is a

	397 // substring within the candidate words, eliminating those which aren't.

	398 // I'm not sure it would be worth the effort. And remember, we've got to do

	399 // a final substring match in order to get the highlighting ranges later

	400 // in the process in any case.

	401

375 // If any words resulted then we can compose a set of history IDs by unioning	402 // If any words resulted then we can compose a set of history IDs by unioning

376 // the sets from each word.	403 // the sets from each word.

377 if (!word_id_set.empty()) {	404 if (!word_id_set.empty()) {

378 for (WordIDSet::iterator word_id_iter = word_id_set.begin();	405 for (WordIDSet::iterator word_id_iter = word_id_set.begin();

379 word_id_iter != word_id_set.end(); ++word_id_iter) {	406 word_id_iter != word_id_set.end(); ++word_id_iter) {

380 WordID word_id = *word_id_iter;	407 WordID word_id = *word_id_iter;

381 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);	408 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);

382 if (word_iter != word_id_history_map_.end()) {	409 if (word_iter != word_id_history_map_.end()) {

383 HistoryIDSet& word_history_id_set(word_iter->second);	410 HistoryIDSet& word_history_id_set(word_iter->second);

384 history_id_set.insert(word_history_id_set.begin(),	411 history_id_set.insert(word_history_id_set.begin(),

385 word_history_id_set.end());	412 word_history_id_set.end());

386 }	413 }

387 }	414 }

388 }	415 }

389	416

390 return history_id_set;	417 return history_id_set;

391 }	418 }

392	419

393 // Utility Functions	420 // Utility Functions

394	421

395 // static	422 // static

396 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(	423 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(

397 const string16& uni_string) {	424 const string16& uni_string) {

398 String16Set words;	425 String16Vector words = WordVectorFromString16(uni_string, false);

399 base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD);	426 String16Set word_set;

400 if (iter.Init()) {	427 for (String16Vector::const_iterator iter = words.begin(); iter != words.end();

401 while (iter.Advance()) {	428 ++iter)

	429 word_set.insert(l10n_util::ToLower(*iter));

	430 return word_set;

	431 }

	432

	433 // static

	434 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(

	435 const string16& uni_string,

	436 bool break_on_space) {

	437 // TODO(mrossetti): Come back and update the following code if the

	438 // BreakIterator is changed. Here are some comments:

	439 // The iterator behaves differently depending on the breaking strategy. Its

	440 // unit tests do not properly test this case as its basic word and space tests

	441 // always use a test string starting with a space.

	442 base::BreakIterator iter(&uni_string, break_on_space ?

	443 base::BreakIterator::BREAK_SPACE : base::BreakIterator::BREAK_WORD);

	444 String16Vector words;

	445 if (break_on_space) {

	446 if (iter.Init()) {

	447 while (iter.Advance()) {

	448 string16 word = iter.GetString();

	449 TrimWhitespace(word, TRIM_ALL, &word);

	450 if (!word.empty())

	451 words.push_back(word);

	452 }

	453 }

	454 } else {

	455 if (iter.Init()) {

402 if (iter.IsWord())	456 if (iter.IsWord())

403 words.insert(iter.GetString());	457 words.push_back(iter.GetString());

	458 while (iter.Advance()) {

	459 if (iter.IsWord())

	460 words.push_back(iter.GetString());

	461 }

404 }	462 }

405 }	463 }

406 return words;	464 return words;

407 }	465 }

408	466

409 // static	467 // static

410 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16(	468 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16(

411 const string16& uni_word) {	469 const string16& uni_word) {

412 InMemoryURLIndex::Char16Vector characters;	470 InMemoryURLIndex::Char16Vector characters;

413 InMemoryURLIndex::Char16Set unique_characters;	471 InMemoryURLIndex::Char16Set unique_characters;

(...skipping 125 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
539 for (Char16Vector::const_iterator c_iter = uni_chars.begin();	597 for (Char16Vector::const_iterator c_iter = uni_chars.begin();

540 (c_iter != uni_chars.end()) &&	598 (c_iter != uni_chars.end()) &&

541 (t_iter != term_char_word_set_cache_.end()) &&	599 (t_iter != term_char_word_set_cache_.end()) &&

542 (*c_iter == t_iter->char_);	600 (*c_iter == t_iter->char_);

543 ++c_iter, ++t_iter)	601 ++c_iter, ++t_iter)

544 t_iter->used_ = true; // Update the cache.	602 t_iter->used_ = true; // Update the cache.

545 return t_iter - term_char_word_set_cache_.begin() - 1;	603 return t_iter - term_char_word_set_cache_.begin() - 1;

546 }	604 }

547	605

548 // static	606 // static

549 int InMemoryURLIndex::RawScoreForURL(const URLRow& row,	607 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term,

550 const String16Vector& terms,	608 const string16& string,

551 size_t* first_term_location) {	609 int term_num) {

	610 TermMatches matches;

	611 for (size_t location = string.find(term); location != string16::npos;

	612 location = string.find(term, location + 1))

	613 matches.push_back(TermMatch(term_num, location, term.size()));

	614 return matches;

	615 }

	616

	617 // static

	618 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {

	619 if (matches.empty())

	620 return matches;

	621 TermMatches sorted_matches = matches;

	622 std::sort(sorted_matches.begin(), sorted_matches.end(),

	623 CompareMatchOffset);

	624 TermMatches clean_matches;

	625 TermMatch last_match = sorted_matches[0];

	626 clean_matches.push_back(last_match);

	627 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;

	628 iter != sorted_matches.end(); ++iter) {

	629 if (iter->offset >= last_match.offset + last_match.length) {

	630 last_match = *iter;

	631 clean_matches.push_back(last_match);

	632 }

	633 }

	634 return clean_matches;

	635 }

	636

	637 // static

	638 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(

	639 const URLRow& row,

	640 const String16Vector& terms) {

	641 ScoredHistoryMatch match(row);

552 GURL gurl = row.url();	642 GURL gurl = row.url();

553 if (!gurl.is_valid())	643 if (!gurl.is_valid())

554 return 0;	644 return match;

555	645

556 string16 url = UTF8ToUTF16(gurl.spec());	646 // Figure out where each search term appears in the URL and/or page title

557	647 // so that we can score as well as provide autocomplete highlighting.

558 // Collect all term start positions so we can see if they appear in order.	648 string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec()));

559 std::vector<size_t> term_locations;	649 // Strip any 'http://' prefix before matching.

560 int out_of_order = 0; // Count the terms which are out of order.	650 if (url_util::FindAndCompareScheme(url, chrome::kHttpScheme, NULL)) {

561 size_t start_location_total = 0;	651 match.prefix_adjust = strlen(chrome::kHttpScheme) + 3; // Allow for '://'.

562 size_t term_length_total = 0;	652 url = url.substr(match.prefix_adjust);

563 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();

564 ++iter) {

565 string16 term = *iter;

566 size_t term_location = url.find(term);

567 if (term_location == string16::npos)

568 return 0; // A term was not found. This should not happen.

569 if (iter != terms.begin()) {

570 // See if this term is out-of-order.

571 for (std::vector<size_t>::const_iterator order_iter =

572 term_locations.begin(); order_iter != term_locations.end();

573 ++order_iter) {

574 if (term_location <= *order_iter)

575 ++out_of_order;

576 }

577 } else {

578 *first_term_location = term_location;

579 }

580 term_locations.push_back(term_location);

581 start_location_total += term_location;

582 term_length_total += term.size();

583 }	653 }

584	654

585 // Calculate a raw score.	655 string16 title = l10n_util::ToLower(row.title());

586 // TODO(mrossetti): This is good enough for now but must be fine-tuned.	656 int term_num = 0;

587 const float kOrderMaxValue = 10.0;	657 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();

588 float order_value = 10.0;	658 ++iter, ++term_num) {

589 if (terms.size() > 1) {	659 string16 term = *iter;

590 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;	660 TermMatches url_term_matches = MatchTermInString(term, url, term_num);

591 order_value =	661 TermMatches title_term_matches = MatchTermInString(term, title, term_num);

592 (static_cast<float>(max_possible_out_of_order - out_of_order) /	662 if (url_term_matches.empty() && title_term_matches.empty())

593 max_possible_out_of_order) * kOrderMaxValue;	663 return match; // A term was not found in either URL or title - reject.

	664 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),

	665 url_term_matches.end());

	666 match.title_matches.insert(match.title_matches.end(),

	667 title_term_matches.begin(),

	668 title_term_matches.end());

594 }	669 }

595	670

596 const float kStartMaxValue = 10.0;	671 // Sort matches by offset and eliminate any which overlap.

597 const size_t kMaxSignificantStart = 20;	672 match.url_matches = SortAndDeoverlap(match.url_matches);

598 float start_value =	673 match.title_matches = SortAndDeoverlap(match.title_matches);

599 (static_cast<float>(kMaxSignificantStart -

600 std::min(kMaxSignificantStart, start_location_total / terms.size()))) /

601 static_cast<float>(kMaxSignificantStart) * kStartMaxValue;

602	674

603 const float kCompleteMaxValue = 10.0;	675 // Get partial scores based on term matching. Note that the score for

604 float complete_value =	676 // each of the URL and title are adjusted by the fraction of the

605 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *	677 // terms appearing in each.

606 kStartMaxValue;	678 int url_score = RawScoreForMatches(match.url_matches, url.size()) *

	679 match.url_matches.size() / terms.size();

	680 int title_score = RawScoreForMatches(match.title_matches, title.size()) *

	681 match.title_matches.size() / terms.size();

	682 // Arbitrarily pick the best.

	683 // TODO(mrossetti): It might make sense that a term which appears in both the

	684 // URL and the Title should boost the score a bit.

	685 int term_score = std::max(url_score, title_score);

607	686

608 const float kLastVisitMaxValue = 10.0;	687 // Factor in attributes of the URLRow.

609 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);	688 // Items which have been visited recently score higher.

610 int64 delta_time = (kMaxSignificantDay -	689 int64 delta_time = (kMaxSignificantDay -

611 std::min((base::Time::Now() - row.last_visit()),	690 std::min((base::Time::Now() - row.last_visit()),

612 kMaxSignificantDay)).ToInternalValue();	691 kMaxSignificantDay)).ToInternalValue();

613 float last_visit_value =	692 float last_visit_value =

614 (static_cast<float>(delta_time) /	693 (static_cast<float>(delta_time) /

615 static_cast<float>(kMaxSignificantDay.ToInternalValue())) *	694 static_cast<float>(kMaxSignificantDay.ToInternalValue())) *

616 kLastVisitMaxValue;	695 kLastVisitMaxValue;

617	696

618 const float kVisitCountMaxValue = 10.0;

619 const int kMaxSignificantVisits = 10;

620 float visit_count_value =	697 float visit_count_value =

621 (static_cast<float>(std::min(row.visit_count(),	698 (static_cast<float>(std::min(row.visit_count(),

622 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *	699 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *

623 kVisitCountMaxValue;	700 kVisitCountMaxValue;

624	701

625 const float kTypedCountMaxValue = 20.0;

626 const int kMaxSignificantTyped = 10;

627 float typed_count_value =	702 float typed_count_value =

628 (static_cast<float>(std::min(row.typed_count(),	703 (static_cast<float>(std::min(row.typed_count(),

629 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *	704 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *

630 kTypedCountMaxValue;	705 kTypedCountMaxValue;

631	706

632 float raw_score = order_value + start_value + complete_value +	707 float raw_score = term_score + last_visit_value + visit_count_value +

633 last_visit_value + visit_count_value + typed_count_value;	708 typed_count_value;

634	709

635 // Normalize the score.	710 // Normalize the score.

636 const float kMaxNormalizedRawScore = 1000.0;	711 match.raw_score = static_cast<int>((raw_score / kMaxRawScore) *

637 raw_score =	712 kMaxNormalizedRawScore);

638 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +	713 return match;

639 kLastVisitMaxValue + kVisitCountMaxValue +

640 kTypedCountMaxValue)) *

641 kMaxNormalizedRawScore;

642 return static_cast<int>(raw_score);

643 }	714 }

644	715

645 // static	716 int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches,

646 base::Time InMemoryURLIndex::RecentThreshold() {	717 size_t max_length) {

647 return base::Time::Now() -	718 // TODO(mrossetti): This is good enough for now but must be fine-tuned.

648 base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);	719 if (matches.empty())

	720 return 0;

	721

	722 // Search terms appearing in order score higher.

	723 float order_value = 10.0;

	724 if (matches.size() > 1) {

	725 int max_possible_out_of_order = matches.size() - 1;

	726 int out_of_order = 0;

	727 for (size_t i = 1; i < matches.size(); ++i) {

	728 if (matches[i - 1].term_num > matches[i].term_num)

	729 ++out_of_order;

	730 }

	731 order_value =

	732 (static_cast<float>(max_possible_out_of_order - out_of_order) /

	733 max_possible_out_of_order) * kOrderMaxValue;

	734 }

	735

	736 // Search terms which appear earlier score higher.

	737 // No points if the first search term does not appear in the first

	738 // kMaxSignificantStart chars.

	739 float start_value =

	740 (static_cast<float>(kMaxSignificantStart -

	741 std::min(kMaxSignificantStart, matches[0].offset))) /

	742 static_cast<float>(kMaxSignificantStart) * kStartMaxValue;

	743

	744 // Search terms which comprise a greater portion score higher.

	745 size_t term_length_total = std::accumulate(matches.begin(), matches.end(),

	746 0, AccumulateMatchLength);

	747 float complete_value =

	748 (static_cast<float>(term_length_total) / static_cast<float>(max_length)) *

	749 kStartMaxValue;

	750

	751 return static_cast<int>(order_value + start_value + complete_value);

649 }	752 }

650	753

651 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(	754 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(

652 const InMemoryURLIndex& index,	755 const InMemoryURLIndex& index,

653 const String16Vector& lower_terms)	756 const String16Vector& lower_terms)

654 : index_(index),	757 : index_(index),

655 lower_terms_(lower_terms) {	758 lower_terms_(lower_terms) {

656 }	759 }

657	760

658 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}	761 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}

659	762

660 void InMemoryURLIndex::AddHistoryMatch::operator()(	763 void InMemoryURLIndex::AddHistoryMatch::operator()(

661 const InMemoryURLIndex::HistoryID history_id) {	764 const InMemoryURLIndex::HistoryID history_id) {

662 HistoryInfoMap::const_iterator hist_pos =	765 HistoryInfoMap::const_iterator hist_pos =

663 index_.history_info_map_.find(history_id);	766 index_.history_info_map_.find(history_id);

664 // Note that a history_id may be present in the word_id_history_map_ yet not	767 // Note that a history_id may be present in the word_id_history_map_ yet not

665 // be found in the history_info_map_. This occurs when an item has been	768 // be found in the history_info_map_. This occurs when an item has been

666 // deleted by the user or the item no longer qualifies as a quick result.	769 // deleted by the user or the item no longer qualifies as a quick result.

667 if (hist_pos != index_.history_info_map_.end()) {	770 if (hist_pos != index_.history_info_map_.end()) {

668 const URLRow& hist_item = hist_pos->second;	771 const URLRow& hist_item = hist_pos->second;

669 // TODO(mrossetti): Accommodate multiple term highlighting.	772 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));

670 size_t input_location = 0;	773 if (match.raw_score != 0) {

671 int score = InMemoryURLIndex::RawScoreForURL(

672 hist_item, lower_terms_, &input_location);

673 if (score != 0) {

674 // We only retain the top 10 highest scoring results so	774 // We only retain the top 10 highest scoring results so

675 // see if this one fits into the top 10 and, if so, where.	775 // see if this one fits into the top 10 and, if so, where.

676 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();	776 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();

677 while (scored_iter != scored_matches_.end() &&	777 while (scored_iter != scored_matches_.end() &&

678 (*scored_iter).raw_score > score)	778 (*scored_iter).raw_score > match.raw_score)

679 ++scored_iter;	779 ++scored_iter;

680 if ((scored_matches_.size() < 10) \|\|	780 if ((scored_matches_.size() < 10) \|\|

681 (scored_iter != scored_matches_.end())) {	781 (scored_iter != scored_matches_.end())) {

682 // Create and insert the new item.	782 // Insert the new item.

683 // TODO(mrossetti): Properly set \|match_in_scheme\| and

684 // \|innermost_match\|.

685 bool match_in_scheme = false;

686 bool innermost_match = true;

687 ScoredHistoryMatch match(hist_item, input_location,

688 match_in_scheme, innermost_match, score);

689 if (!scored_matches_.empty())	783 if (!scored_matches_.empty())

690 scored_matches_.insert(scored_iter, match);	784 scored_matches_.insert(scored_iter, match);

691 else	785 else

692 scored_matches_.push_back(match);	786 scored_matches_.push_back(match);

693 // Trim any entries beyond 10.	787 // Trim any entries beyond 10.

694 if (scored_matches_.size() > 10) {	788 if (scored_matches_.size() > 10) {

695 scored_matches_.erase(scored_matches_.begin() + 10,	789 scored_matches_.erase(scored_matches_.begin() + 10,

696 scored_matches_.end());	790 scored_matches_.end());

697 }	791 }

698 }	792 }

(...skipping 218 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
917 if (iter->has_title()) {	1011 if (iter->has_title()) {

918 string16 title(UTF8ToUTF16(iter->title()));	1012 string16 title(UTF8ToUTF16(iter->title()));

919 url_row.set_title(title);	1013 url_row.set_title(title);

920 }	1014 }

921 history_info_map_[history_id] = url_row;	1015 history_info_map_[history_id] = url_row;

922 }	1016 }

923 return true;	1017 return true;

924 }	1018 }

925	1019

926 } // namespace history	1020 } // namespace history

OLD	NEW

« no previous file with comments | « chrome/browser/history/in_memory_url_index.h ('k') | chrome/browser/history/in_memory_url_index_unittest.cc » ('j') | no next file with comments »