chrome/browser/history/in_memory_url_index.cc - Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be...

Side by Side Diff: chrome/browser/history/in_memory_url_index.cc

Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: '' Created 9 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« chrome/browser/history/in_memory_url_index.h ('K') | « chrome/browser/history/in_memory_url_index.h ('k') | chrome/browser/history/in_memory_url_index_unittest.cc » ('j') | chrome/browser/history/in_memory_url_index_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/history/in_memory_url_index.h"	5 #include "chrome/browser/history/in_memory_url_index.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <iterator>	8 #include <iterator>

9 #include <limits>	9 #include <limits>

	10 #include <numeric>

10	11

11 #include "base/file_util.h"	12 #include "base/file_util.h"

12 #include "base/i18n/break_iterator.h"	13 #include "base/i18n/break_iterator.h"

13 #include "base/metrics/histogram.h"	14 #include "base/metrics/histogram.h"

14 #include "base/string_util.h"	15 #include "base/string_util.h"

15 #include "base/time.h"	16 #include "base/time.h"

16 #include "base/utf_string_conversions.h"	17 #include "base/utf_string_conversions.h"

17 #include "chrome/browser/autocomplete/history_provider_util.h"	18 #include "chrome/browser/autocomplete/history_provider_util.h"

18 #include "chrome/browser/history/url_database.h"	19 #include "chrome/browser/history/url_database.h"

19 #include "chrome/browser/profiles/profile.h"	20 #include "chrome/browser/profiles/profile.h"

20 #include "net/base/escape.h"	21 #include "net/base/escape.h"

21 #include "net/base/net_util.h"	22 #include "net/base/net_util.h"

22 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"	23 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"

23 #include "ui/base/l10n/l10n_util.h"	24 #include "ui/base/l10n/l10n_util.h"

24	25

25 using google::protobuf::RepeatedField;	26 using google::protobuf::RepeatedField;

26 using google::protobuf::RepeatedPtrField;	27 using google::protobuf::RepeatedPtrField;

	28 using in_memory_url_index::InMemoryURLIndexCacheItem;

27	29

28 using in_memory_url_index::InMemoryURLIndexCacheItem;

29 namespace history {	30 namespace history {

30	31

31 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;	32 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;

32 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;	33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;

33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;	34 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;

34 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;	35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;

35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry	36 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry

36 CharWordMapEntry;	37 CharWordMapEntry;

37 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem	38 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem

38 WordIDHistoryMapItem;	39 WordIDHistoryMapItem;

39 typedef imui::	40 typedef imui::

40 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry	41 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry

41 WordIDHistoryMapEntry;	42 WordIDHistoryMapEntry;

42 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;	43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;

43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry	44 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry

44 HistoryInfoMapEntry;	45 HistoryInfoMapEntry;

45	46

46 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;	47 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;

47	48

	49 const float kOrderMaxValue = 50.0;

	50 const float kStartMaxValue = 50.0;

	51 const size_t kMaxSignificantStart = 20;

	52 const float kCompleteMaxValue = 50.0;

	53 const float kLastVisitMaxValue = 50.0;

	54 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);

	55 const float kVisitCountMaxValue = 50.0;

	56 const int kMaxSignificantVisits = 20;

	57 const float kTypedCountMaxValue = 50.0;

	58 const int kMaxSignificantTyped = 20;

	59 const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +

	60 kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue;

	61 const float kMaxNormalizedRawScore = 1000.0;

	62

48 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}	63 ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}

49	64

50 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,	65 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)

51 size_t input_location,	66 : HistoryMatch(url_info, 0, false, false),

52 bool match_in_scheme,	67 raw_score(0) {}

53 bool innermost_match,

54 int score)

55 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),

56 raw_score(score) {

57 }

58	68

59 struct InMemoryURLIndex::TermCharWordSet {	69 struct InMemoryURLIndex::TermCharWordSet {

60 TermCharWordSet() // Required for STL resize().	70 TermCharWordSet() // Required for STL resize().

61 : char_(0),	71 : char_(0),

62 word_id_set_(),	72 word_id_set_(),

63 used_(false) {}	73 used_(false) {}

64 TermCharWordSet(const char16& uni_char,	74 TermCharWordSet(const char16& uni_char,

65 const WordIDSet& word_id_set,	75 const WordIDSet& word_id_set,

66 bool used)	76 bool used)

67 : char_(uni_char),	77 : char_(uni_char),

68 word_id_set_(word_id_set),	78 word_id_set_(word_id_set),

69 used_(used) {}	79 used_(used) {}

70	80

71 // Predicate for STL algorithm use.	81 // Predicate for STL algorithm use.

72 bool is_not_used() const { return !used_; }	82 bool is_not_used() const { return !used_; }

73	83

74 char16 char_;	84 char16 char_;

75 WordIDSet word_id_set_;	85 WordIDSet word_id_set_;

76 bool used_; // true if this set has been used for the current term search.	86 bool used_; // true if this set has been used for the current term search.

77 };	87 };

78	88

	89 // Comparison function for sorting TermMatches by their offsets.

	90 bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) {

	91 return m1.offset < m2.offset;

	92 }

	93

	94 // std::accumulate helper function to add up TermMatches' lengths.

	95 int AddumulateMatchLength(int total, const TermMatch& match) {
	Peter Kasting 2011/03/09 02:31:48 Nit: Addumulate -> Accumulate Nit: Addumulate -> Accumulate mrossetti 2011/03/10 01:31:14 LOL Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > Nit: Addumulate -> Accumulate LOL
	96 return total + match.length;

	97 }

	98

79 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)	99 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)

80 : history_dir_(history_dir),	100 : history_dir_(history_dir),

81 history_item_count_(0) {	101 history_item_count_(0) {

82 }	102 }

83	103

84 // Called only by unit tests.	104 // Called only by unit tests.

85 InMemoryURLIndex::InMemoryURLIndex()	105 InMemoryURLIndex::InMemoryURLIndex()

86 : history_item_count_(0) {	106 : history_item_count_(0) {

87 }	107 }

88	108

(...skipping 13 matching lines...) Expand all Loading...
102 SaveToCacheFile();	122 SaveToCacheFile();

103 }	123 }

104	124

105 bool InMemoryURLIndex::IndexRow(const URLRow& row) {	125 bool InMemoryURLIndex::IndexRow(const URLRow& row) {

106 const GURL& gurl(row.url());	126 const GURL& gurl(row.url());

107 string16 url(net::FormatUrl(gurl, languages_,	127 string16 url(net::FormatUrl(gurl, languages_,

108 net::kFormatUrlOmitUsernamePassword,	128 net::kFormatUrlOmitUsernamePassword,

109 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS,	129 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS,

110 NULL, NULL, NULL));	130 NULL, NULL, NULL));

111	131

112 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().

113 HistoryID history_id = static_cast<HistoryID>(row.id());	132 HistoryID history_id = static_cast<HistoryID>(row.id());

	133 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max());

114	134

115 // Add the row for quick lookup in the history info store.	135 // Add the row for quick lookup in the history info store.

116 url = l10n_util::ToLower(url);

117 URLRow new_row(GURL(url), row.id());	136 URLRow new_row(GURL(url), row.id());

118 new_row.set_visit_count(row.visit_count());	137 new_row.set_visit_count(row.visit_count());

119 new_row.set_typed_count(row.typed_count());	138 new_row.set_typed_count(row.typed_count());

120 new_row.set_last_visit(row.last_visit());	139 new_row.set_last_visit(row.last_visit());

121 new_row.set_title(row.title());	140 new_row.set_title(row.title());

122 history_info_map_[history_id] = new_row;	141 history_info_map_[history_id] = new_row;

123	142

124 // Split into individual, unique words.	143 // Split URL into individual, unique words then add in the title words.

125 String16Set words = WordSetFromString16(url);	144 url = l10n_util::ToLower(url);

	145 String16Set url_words = WordSetFromString16(url);

	146 String16Set title_words = WordSetFromString16(row.title());

	147 String16Set words;

	148 std::set_union(url_words.begin(), url_words.end(),

	149 title_words.begin(), title_words.end(),

	150 std::insert_iterator<String16Set>(words, words.begin()));

	151 for (String16Set::iterator word_iter = words.begin();

	152 word_iter != words.end(); ++word_iter)

	153 AddWordToIndex(*word_iter, history_id);

126	154

127 // For each word, add a new entry into the word index referring to the

128 // associated history item.

129 for (String16Set::iterator iter = words.begin();

130 iter != words.end(); ++iter) {

131 String16Set::value_type uni_word = *iter;

132 AddWordToIndex(uni_word, history_id);

133 }

134 ++history_item_count_;	155 ++history_item_count_;

135 return true;	156 return true;

136 }	157 }

137	158

138 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,	159 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,

139 bool clear_cache) {	160 bool clear_cache) {

140 ClearPrivateData();	161 ClearPrivateData();

141	162

142 if (!history_db)	163 if (!history_db)

143 return false;	164 return false;

(...skipping 141 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
285 ScoredHistoryMatches scored_items;	306 ScoredHistoryMatches scored_items;

286 if (!terms.empty()) {	307 if (!terms.empty()) {

287 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-	308 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-

288 // and-sweep approach.	309 // and-sweep approach.

289 ResetTermCharWordSetCache();	310 ResetTermCharWordSetCache();

290	311

291 // Lowercase the terms.	312 // Lowercase the terms.

292 // TODO(mrossetti): Another opportunity for a transform algorithm.	313 // TODO(mrossetti): Another opportunity for a transform algorithm.

293 String16Vector lower_terms;	314 String16Vector lower_terms;

294 for (String16Vector::const_iterator term_iter = terms.begin();	315 for (String16Vector::const_iterator term_iter = terms.begin();

295 term_iter != terms.end(); ++term_iter) {	316 term_iter != terms.end(); ++term_iter)

296 String16Vector::value_type lower_string(*term_iter);	317 lower_terms.push_back(InMemoryURLIndex::ToLower(*term_iter));

297 std::transform(lower_string.begin(),

298 lower_string.end(),

299 lower_string.begin(),

300 tolower);

301 lower_terms.push_back(lower_string);

302 }

303	318

304 String16Vector::value_type all_terms(JoinString(lower_terms, ' '));	319 String16Vector::value_type all_terms(JoinString(lower_terms, ' '));

305 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);	320 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);

306	321

307 // Pass over all of the candidates filtering out any without a proper	322 // Pass over all of the candidates filtering out any without a proper

308 // substring match, inserting those which pass in order by score.	323 // substring match, inserting those which pass in order by score.

309 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),	324 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),

310 AddHistoryMatch(*this,	325 AddHistoryMatch(*this,

311 lower_terms)).ScoredMatches();	326 lower_terms)).ScoredMatches();

312 }	327 }

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
365 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(	380 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(

366 const string16& uni_word) {	381 const string16& uni_word) {

367 InMemoryURLIndex::HistoryIDSet history_id_set;	382 InMemoryURLIndex::HistoryIDSet history_id_set;

368	383

369 // For each unique character in the word, in order of first appearance, get	384 // For each unique character in the word, in order of first appearance, get

370 // the char/word_id map entry and intersect with the set in an incremental	385 // the char/word_id map entry and intersect with the set in an incremental

371 // manner.	386 // manner.

372 Char16Vector uni_chars = Char16VectorFromString16(uni_word);	387 Char16Vector uni_chars = Char16VectorFromString16(uni_word);

373 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));	388 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));

374	389

	390 // TODO(mrossetti): At this point, as a possible optimization, we could

	391 // scan through all candidate words and make sure the \|uni_word\| is a

	392 // substring within the candidate words, eliminating those which aren't.

	393 // I'm not sure it would be worth the effort. And remember, we've got to do

	394 // a final substring match in order to get the highlighting ranges later

	395 // in the process in any case.

	396

375 // If any words resulted then we can compose a set of history IDs by unioning	397 // If any words resulted then we can compose a set of history IDs by unioning

376 // the sets from each word.	398 // the sets from each word.

377 if (!word_id_set.empty()) {	399 if (!word_id_set.empty()) {

378 for (WordIDSet::iterator word_id_iter = word_id_set.begin();	400 for (WordIDSet::iterator word_id_iter = word_id_set.begin();

379 word_id_iter != word_id_set.end(); ++word_id_iter) {	401 word_id_iter != word_id_set.end(); ++word_id_iter) {

380 WordID word_id = *word_id_iter;	402 WordID word_id = *word_id_iter;

381 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);	403 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);

382 if (word_iter != word_id_history_map_.end()) {	404 if (word_iter != word_id_history_map_.end()) {

383 HistoryIDSet& word_history_id_set(word_iter->second);	405 HistoryIDSet& word_history_id_set(word_iter->second);

384 history_id_set.insert(word_history_id_set.begin(),	406 history_id_set.insert(word_history_id_set.begin(),

385 word_history_id_set.end());	407 word_history_id_set.end());

386 }	408 }

387 }	409 }

388 }	410 }

389	411

390 return history_id_set;	412 return history_id_set;

391 }	413 }

392	414

393 // Utility Functions	415 // Utility Functions

394	416

395 // static	417 // static

	418 string16 InMemoryURLIndex::ToLower(const string16& upper_string) {

	419 string16 lower_string(upper_string);
	Peter Kasting 2011/03/09 02:31:48 Nit: How about: string16 lower_string; lower_ Nit: How about: string16 lower_string; lower_string.reserve(upper_string.length()); std::transform(upper_string.begin(), upper_string.end(), std::back_inserter(lower_string), ...); (Avoids a copy) mrossetti 2011/03/10 01:31:14 This function replaced with l10n_util::ToLower(). Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > Nit: How about: > > string16 lower_string; > lower_string.reserve(upper_string.length()); > std::transform(upper_string.begin(), upper_string.end(), > std::back_inserter(lower_string), ...); > > (Avoids a copy) This function replaced with l10n_util::ToLower().
	420 std::transform(lower_string.begin(),

	421 lower_string.end(),

	422 lower_string.begin(),

	423 tolower);
	Peter Kasting 2011/03/09 02:31:48 Is this calling the C tolower() function? If so, Is this calling the C tolower() function? If so, don't we want to be using l10n_util or ICU instead? mrossetti 2011/03/10 01:31:14 Done. Replaced function. Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > Is this calling the C tolower() function? If so, don't we want to be using > l10n_util or ICU instead? Done. Replaced function.
	424 return lower_string;

	425 }

	426

	427 // static

396 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(	428 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(

397 const string16& uni_string) {	429 const string16& uni_string) {

398 String16Set words;	430 String16Vector words = WordVectorFromString16(uni_string, false);

399 base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD);	431 String16Set word_set;

400 if (iter.Init()) {	432 for (String16Vector::const_iterator iter = words.begin(); iter != words.end();

401 while (iter.Advance()) {	433 ++iter)

	434 word_set.insert(InMemoryURLIndex::ToLower(*iter));

	435 return word_set;

	436 }

	437

	438 // static

	439 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(

	440 const string16& uni_string,

	441 bool break_on_space) {

	442 base::BreakIterator iter(&uni_string, break_on_space ?

	443 base::BreakIterator::BREAK_SPACE :
	Peter Kasting 2011/03/09 02:31:48 Nit: You can collapse lines 2 & 3 to a single line Nit: You can collapse lines 2 & 3 to a single line by indenting 4: base::BreakIterator iter(&uni_string, break_on_space ? base::BreakIterator::BREAK_SPACE : base::BreakIterator::BREAK_WORD); mrossetti 2011/03/10 01:31:14 Done. Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > Nit: You can collapse lines 2 & 3 to a single line by indenting 4: > > base::BreakIterator iter(&uni_string, break_on_space ? > base::BreakIterator::BREAK_SPACE : base::BreakIterator::BREAK_WORD); Done.
	444 base::BreakIterator::BREAK_WORD);

	445 String16Vector words;

	446 if (break_on_space) {

	447 if (iter.Init()) {

	448 while (iter.Advance()) {

	449 string16 word = iter.GetString();

	450 TrimWhitespace(word, TRIM_ALL, &word);

	451 if (word.size())
	Peter Kasting 2011/03/09 02:31:48 Nit: !word.empty() Nit: !word.empty() mrossetti 2011/03/10 01:31:14 Done. Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > Nit: !word.empty() Done.
	452 words.push_back(word);

	453 }

	454 }

	455 } else {

	456 if (iter.Init()) {

402 if (iter.IsWord())	457 if (iter.IsWord())

403 words.insert(iter.GetString());	458 words.push_back(iter.GetString());
	Peter Kasting 2011/03/09 02:31:48 I confess, I'm a little confused why one arm of th I confess, I'm a little confused why one arm of the conditional does this but the other doesn't. mrossetti 2011/03/10 01:31:14 The iterator behaves differently depending on the Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > I confess, I'm a little confused why one arm of the conditional does this but > the other doesn't. The iterator behaves differently depending on the breaking strategy, which was a surprise to me, too. The BreakIterator unit tests do not properly test this case as its basic word and space tests always use a test string starting with a space; I verified this behavior through experimentation and implementing specific unit tests in in_memory_url_index_unittest.cc. And I was loathe to modify the behavior of BreakIterator as it would touch a lot of code. Peter Kasting 2011/03/10 02:13:43 If BreakIterator should be changed, can you file a Show quoted text On 2011/03/10 01:31:14, mrossetti wrote: > On 2011/03/09 02:31:48, Peter Kasting wrote: > > I confess, I'm a little confused why one arm of the conditional does this but > > the other doesn't. > > The iterator behaves differently depending on the breaking strategy, which was a > surprise to me, too. The BreakIterator unit tests do not properly test this case > as its basic word and space tests always use a test string starting with a > space; I verified this behavior through experimentation and implementing > specific unit tests in in_memory_url_index_unittest.cc. And I was loathe to > modify the behavior of BreakIterator as it would touch a lot of code. If BreakIterator should be changed, can you file a bug for that so we can at least do it separately? If not, maybe add some comments in here?
	459 while (iter.Advance()) {

	460 if (iter.IsWord())

	461 words.push_back(iter.GetString());

	462 }

404 }	463 }

405 }	464 }

406 return words;	465 return words;

407 }	466 }

408	467

409 // static	468 // static

410 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16(	469 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16(

411 const string16& uni_word) {	470 const string16& uni_word) {

412 InMemoryURLIndex::Char16Vector characters;	471 InMemoryURLIndex::Char16Vector characters;

413 InMemoryURLIndex::Char16Set unique_characters;	472 InMemoryURLIndex::Char16Set unique_characters;

(...skipping 125 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
539 for (Char16Vector::const_iterator c_iter = uni_chars.begin();	598 for (Char16Vector::const_iterator c_iter = uni_chars.begin();

540 (c_iter != uni_chars.end()) &&	599 (c_iter != uni_chars.end()) &&

541 (t_iter != term_char_word_set_cache_.end()) &&	600 (t_iter != term_char_word_set_cache_.end()) &&

542 (*c_iter == t_iter->char_);	601 (*c_iter == t_iter->char_);

543 ++c_iter, ++t_iter)	602 ++c_iter, ++t_iter)

544 t_iter->used_ = true; // Update the cache.	603 t_iter->used_ = true; // Update the cache.

545 return t_iter - term_char_word_set_cache_.begin() - 1;	604 return t_iter - term_char_word_set_cache_.begin() - 1;

546 }	605 }

547	606

548 // static	607 // static

549 int InMemoryURLIndex::RawScoreForURL(const URLRow& row,	608 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term,

550 const String16Vector& terms,	609 const string16& string,

551 size_t* first_term_location) {	610 int term_num) {

	611 TermMatches matches;

	612 for (size_t location = string.find(term); location != string16::npos;

	613 location = string.find(term, location + 1))

	614 matches.push_back(TermMatch(term_num, location, term.size()));

	615 return matches;

	616 }

	617

	618 // static

	619 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {

	620 if (matches.empty())

	621 return matches;

	622 TermMatches sorted_matches = matches;

	623 std::sort(sorted_matches.begin(), sorted_matches.end(),

	624 CompareMatchOffset);

	625 TermMatches clean_matches;

	626 TermMatch last_match = sorted_matches[0];

	627 clean_matches.push_back(last_match);

	628 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;

	629 iter != sorted_matches.end(); ++iter) {

	630 if (iter->offset >= last_match.offset + last_match.length) {

	631 last_match = *iter;

	632 clean_matches.push_back(last_match);

	633 }

	634 }

	635 return clean_matches;

	636 }

	637

	638 // static

	639 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(

	640 const URLRow& row,

	641 const String16Vector& terms) {

	642 ScoredHistoryMatch match(row);

552 GURL gurl = row.url();	643 GURL gurl = row.url();

553 if (!gurl.is_valid())	644 if (!gurl.is_valid())

554 return 0;	645 return match;

555	646

556 string16 url = UTF8ToUTF16(gurl.spec());	647 // Figure out where each search term appears in the URL and/or page title

557	648 // so that we can score as well as provide autocomplete highlighting.

558 // Collect all term start positions so we can see if they appear in order.	649 string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec()));

559 std::vector<size_t> term_locations;	650 url = l10n_util::ToLower(url);

560 int out_of_order = 0; // Count the terms which are out of order.	651 string16 title = row.title();

561 size_t start_location_total = 0;	652 int term_num = 0;

562 size_t term_length_total = 0;

563 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();	653 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();

564 ++iter) {	654 ++iter, ++term_num) {

565 string16 term = *iter;	655 string16 term = *iter;

566 size_t term_location = url.find(term);	656 TermMatches url_term_matches = MatchTermInString(term, url, term_num);

567 if (term_location == string16::npos)	657 TermMatches title_term_matches = MatchTermInString(term, title, term_num);

568 return 0; // A term was not found. This should not happen.	658 if (url_term_matches.empty() && title_term_matches.empty())

569 if (iter != terms.begin()) {	659 return match; // A term was not found in either URL or title - reject.

570 // See if this term is out-of-order.	660 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),

571 for (std::vector<size_t>::const_iterator order_iter =	661 url_term_matches.end());

572 term_locations.begin(); order_iter != term_locations.end();	662 match.title_matches.insert(match.title_matches.end(),

573 ++order_iter) {	663 title_term_matches.begin(),

574 if (term_location <= *order_iter)	664 title_term_matches.end());

575 ++out_of_order;

576 }

577 } else {

578 *first_term_location = term_location;

579 }

580 term_locations.push_back(term_location);

581 start_location_total += term_location;

582 term_length_total += term.size();

583 }	665 }

584	666

585 // Calculate a raw score.	667 // Sort matches by offset and eliminate any which overlap.

586 // TODO(mrossetti): This is good enough for now but must be fine-tuned.	668 match.url_matches = SortAndDeoverlap(match.url_matches);

587 const float kOrderMaxValue = 10.0;	669 match.title_matches = SortAndDeoverlap(match.title_matches);

588 float order_value = 10.0;

589 if (terms.size() > 1) {

590 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;

591 order_value =

592 (static_cast<float>(max_possible_out_of_order - out_of_order) /

593 max_possible_out_of_order) * kOrderMaxValue;

594 }

595	670

596 const float kStartMaxValue = 10.0;	671 // Get partial scores based on term matching. Note that the score for

597 const size_t kMaxSignificantStart = 20;	672 // each of the URL and title are adjusted by the fraction of the

598 float start_value =	673 // terms appearing in each.

599 (static_cast<float>(kMaxSignificantStart -	674 int url_score = RawScoreForMatches(match.url_matches, url.size()) *

600 std::min(kMaxSignificantStart, start_location_total / terms.size()))) /	675 match.url_matches.size() / terms.size();

601 static_cast<float>(kMaxSignificantStart) * kStartMaxValue;	676 int title_score = RawScoreForMatches(match.title_matches, title.size()) *

	677 match.title_matches.size() / terms.size();

	678 // Arbitrarily pick the best.

	679 // TODO(mrossetti): It might make sense that a term which appears in both the

	680 // URL and the Title should boost the score a bit.

	681 int term_score = std::max(url_score, title_score);

602	682

603 const float kCompleteMaxValue = 10.0;	683 // Factor in attributes of the URLRow.

604 float complete_value =	684 // Items which have been visited recently score higher.

605 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *

606 kStartMaxValue;

607

608 const float kLastVisitMaxValue = 10.0;

609 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);

610 int64 delta_time = (kMaxSignificantDay -	685 int64 delta_time = (kMaxSignificantDay -

611 std::min((base::Time::Now() - row.last_visit()),	686 std::min((base::Time::Now() - row.last_visit()),

612 kMaxSignificantDay)).ToInternalValue();	687 kMaxSignificantDay)).ToInternalValue();

613 float last_visit_value =	688 float last_visit_value =

614 (static_cast<float>(delta_time) /	689 (static_cast<float>(delta_time) /

615 static_cast<float>(kMaxSignificantDay.ToInternalValue())) *	690 static_cast<float>(kMaxSignificantDay.ToInternalValue())) *

616 kLastVisitMaxValue;	691 kLastVisitMaxValue;

617	692

618 const float kVisitCountMaxValue = 10.0;

619 const int kMaxSignificantVisits = 10;

620 float visit_count_value =	693 float visit_count_value =

621 (static_cast<float>(std::min(row.visit_count(),	694 (static_cast<float>(std::min(row.visit_count(),

622 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *	695 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *

623 kVisitCountMaxValue;	696 kVisitCountMaxValue;

624	697

625 const float kTypedCountMaxValue = 20.0;

626 const int kMaxSignificantTyped = 10;

627 float typed_count_value =	698 float typed_count_value =

628 (static_cast<float>(std::min(row.typed_count(),	699 (static_cast<float>(std::min(row.typed_count(),

629 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *	700 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *

630 kTypedCountMaxValue;	701 kTypedCountMaxValue;

631	702

632 float raw_score = order_value + start_value + complete_value +	703 float raw_score = term_score + last_visit_value + visit_count_value +

633 last_visit_value + visit_count_value + typed_count_value;	704 typed_count_value;

634	705

635 // Normalize the score.	706 // Normalize the score.

636 const float kMaxNormalizedRawScore = 1000.0;	707 match.raw_score = static_cast<int>((raw_score / kMaxRawScore) *

637 raw_score =	708 kMaxNormalizedRawScore);

638 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +	709

639 kLastVisitMaxValue + kVisitCountMaxValue +	710 return match;

640 kTypedCountMaxValue)) *

641 kMaxNormalizedRawScore;

642 return static_cast<int>(raw_score);

643 }	711 }

644	712

645 // static	713 int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches,

646 base::Time InMemoryURLIndex::RecentThreshold() {	714 size_t max_length) {

647 return base::Time::Now() -	715 // TODO(mrossetti): This is good enough for now but must be fine-tuned.

648 base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);	716 if (matches.empty())

	717 return 0;

	718

	719 // Search terms appearing in order score higher.

	720 float order_value = 10.0;

	721 if (matches.size() > 1) {

	722 int max_possible_out_of_order = matches.size() - 1;

	723 int out_of_order = 0;

	724 for (size_t i = 1; i < matches.size(); ++i) {

	725 if (matches[i-1].term_num > matches[i].term_num)
	Peter Kasting 2011/03/09 02:31:48 Nit: Spaces around '-' Nit: Spaces around '-' mrossetti 2011/03/10 01:31:14 Done. Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > Nit: Spaces around '-' Done.
	726 ++out_of_order;
	Peter Kasting 2011/03/09 02:31:48 Nit: Indent 2 Nit: Indent 2 mrossetti 2011/03/10 01:31:14 Done. Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > Nit: Indent 2 Done.
	727 }

	728 order_value =

	729 (static_cast<float>(max_possible_out_of_order - out_of_order) /

	730 max_possible_out_of_order) * kOrderMaxValue;

	731 }

	732

	733 // Search terms which appear earlier score higher.

	734 // No points if the first search term does not appear in the first

	735 // kMaxSignificantStart chars.

	736 float start_value =

	737 (static_cast<float>(kMaxSignificantStart -

	738 std::min(kMaxSignificantStart, matches[0].offset))) /

	739 static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
	Peter Kasting 2011/03/09 02:31:48 Nit: Is this second cast needed? Nit: Is this second cast needed? mrossetti 2011/03/10 01:31:14 Yes. Show quoted text On 2011/03/09 02:31:48, Peter Kasting wrote: > Nit: Is this second cast needed? Yes.
	740

	741 // Search terms which comprise a greater portion score higher.

	742 size_t term_length_total = std::accumulate(matches.begin(), matches.end(),

	743 0, AddumulateMatchLength);

	744 float complete_value =

	745 (static_cast<float>(term_length_total) / static_cast<float>(max_length)) *

	746 kStartMaxValue;

	747

	748 return static_cast<int>(order_value + start_value + complete_value);

649 }	749 }

650	750

651 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(	751 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(

652 const InMemoryURLIndex& index,	752 const InMemoryURLIndex& index,

653 const String16Vector& lower_terms)	753 const String16Vector& lower_terms)

654 : index_(index),	754 : index_(index),

655 lower_terms_(lower_terms) {	755 lower_terms_(lower_terms) {

656 }	756 }

657	757

658 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}	758 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}

659	759

660 void InMemoryURLIndex::AddHistoryMatch::operator()(	760 void InMemoryURLIndex::AddHistoryMatch::operator()(

661 const InMemoryURLIndex::HistoryID history_id) {	761 const InMemoryURLIndex::HistoryID history_id) {

662 HistoryInfoMap::const_iterator hist_pos =	762 HistoryInfoMap::const_iterator hist_pos =

663 index_.history_info_map_.find(history_id);	763 index_.history_info_map_.find(history_id);

664 // Note that a history_id may be present in the word_id_history_map_ yet not	764 // Note that a history_id may be present in the word_id_history_map_ yet not

665 // be found in the history_info_map_. This occurs when an item has been	765 // be found in the history_info_map_. This occurs when an item has been

666 // deleted by the user or the item no longer qualifies as a quick result.	766 // deleted by the user or the item no longer qualifies as a quick result.

667 if (hist_pos != index_.history_info_map_.end()) {	767 if (hist_pos != index_.history_info_map_.end()) {

668 const URLRow& hist_item = hist_pos->second;	768 const URLRow& hist_item = hist_pos->second;

669 // TODO(mrossetti): Accommodate multiple term highlighting.	769 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));

670 size_t input_location = 0;	770 if (match.raw_score != 0) {

671 int score = InMemoryURLIndex::RawScoreForURL(

672 hist_item, lower_terms_, &input_location);

673 if (score != 0) {

674 // We only retain the top 10 highest scoring results so	771 // We only retain the top 10 highest scoring results so

675 // see if this one fits into the top 10 and, if so, where.	772 // see if this one fits into the top 10 and, if so, where.

676 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();	773 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();

677 while (scored_iter != scored_matches_.end() &&	774 while (scored_iter != scored_matches_.end() &&

678 (*scored_iter).raw_score > score)	775 (*scored_iter).raw_score > match.raw_score)

679 ++scored_iter;	776 ++scored_iter;

680 if ((scored_matches_.size() < 10) \|\|	777 if ((scored_matches_.size() < 10) \|\|

681 (scored_iter != scored_matches_.end())) {	778 (scored_iter != scored_matches_.end())) {

682 // Create and insert the new item.	779 // Insert the new item.

683 // TODO(mrossetti): Properly set \|match_in_scheme\| and

684 // \|innermost_match\|.

685 bool match_in_scheme = false;

686 bool innermost_match = true;

687 ScoredHistoryMatch match(hist_item, input_location,

688 match_in_scheme, innermost_match, score);

689 if (!scored_matches_.empty())	780 if (!scored_matches_.empty())

690 scored_matches_.insert(scored_iter, match);	781 scored_matches_.insert(scored_iter, match);

691 else	782 else

692 scored_matches_.push_back(match);	783 scored_matches_.push_back(match);

693 // Trim any entries beyond 10.	784 // Trim any entries beyond 10.

694 if (scored_matches_.size() > 10) {	785 if (scored_matches_.size() > 10) {

695 scored_matches_.erase(scored_matches_.begin() + 10,	786 scored_matches_.erase(scored_matches_.begin() + 10,

696 scored_matches_.end());	787 scored_matches_.end());

697 }	788 }

698 }	789 }

(...skipping 218 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
917 if (iter->has_title()) {	1008 if (iter->has_title()) {

918 string16 title(UTF8ToUTF16(iter->title()));	1009 string16 title(UTF8ToUTF16(iter->title()));

919 url_row.set_title(title);	1010 url_row.set_title(title);

920 }	1011 }

921 history_info_map_[history_id] = url_row;	1012 history_info_map_[history_id] = url_row;

922 }	1013 }

923 return true;	1014 return true;

924 }	1015 }

925	1016

926 } // namespace history	1017 } // namespace history

OLD	NEW