chrome/browser/history/url_index_private_data.cc - Issue 9655003: Gather word-start Information to Aid in Scoring.

Unified Diff: chrome/browser/history/url_index_private_data.cc

Issue 9655003: Gather word-start Information to Aid in Scoring. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: chrome/browser/history/url_index_private_data.cc

===================================================================

--- chrome/browser/history/url_index_private_data.cc (revision 125621)

+++ chrome/browser/history/url_index_private_data.cc (working copy)

@@ -42,6 +42,9 @@

typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;

typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry

HistoryInfoMapEntry;

+typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;

+typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry

+ WordStartsMapEntry;

// The maximum score any candidate result can achieve.

const int kMaxTotalScore = 1425;

@@ -115,7 +118,9 @@

// InMemoryURLIndex's Private Data ---------------------------------------------

URLIndexPrivateData::URLIndexPrivateData()

- : pre_filter_item_count_(0),

+ : restored_cache_version_(0),

+ saved_cache_version_(kCurrentCacheFileVersion),

+ pre_filter_item_count_(0),

post_filter_item_count_(0),

post_scoring_item_count_(0) {

URLIndexPrivateData::InitializeSchemeWhitelist(&scheme_whitelist_);

@@ -131,6 +136,7 @@

word_id_history_map_.clear();

history_id_word_map_.clear();

history_info_map_.clear();

+ word_starts_map_.clear();

}

// Cache Updating --------------------------------------------------------------

@@ -161,11 +167,14 @@

history_info_map_[history_id] = new_row;

// Index the words contained in the URL and title of the row.

- AddRowWordsToIndex(new_row);

+ WordStarts word_starts;

+ AddRowWordsToIndex(new_row, &word_starts);

+ word_starts_map_[history_id] = word_starts;

return true;

}

-void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row) {

+void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,

+ WordStarts* word_starts) {

HistoryID history_id = static_cast<HistoryID>(row.id());

// Split URL into individual, unique words then add in the title words.

const GURL& gurl(row.url());

@@ -174,8 +183,10 @@

net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS,

NULL, NULL, NULL));

url = base::i18n::ToLower(url);

- String16Set url_words = String16SetFromString16(url);

- String16Set title_words = String16SetFromString16(row.title());

+ String16Set url_words = String16SetFromString16(url,

+ word_starts ? &word_starts->url_word_starts_ : NULL);

+ String16Set title_words = String16SetFromString16(row.title(),

+ word_starts ? &word_starts->title_word_starts_ : NULL);

String16Set words;

std::set_union(url_words.begin(), url_words.end(),

title_words.begin(), title_words.end(),

@@ -246,6 +257,7 @@

RemoveRowWordsFromIndex(row);

HistoryID history_id = static_cast<HistoryID>(row.id());

history_info_map_.erase(history_id);

+ word_starts_map_.erase(history_id);

}

void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {

@@ -328,7 +340,9 @@

// URL and title.

RemoveRowWordsFromIndex(row_to_update);

row_to_update.set_title(row.title());

- AddRowWordsToIndex(row_to_update);

+ WordStarts word_starts;

+ AddRowWordsToIndex(row_to_update, &word_starts);

+ word_starts_map_[row_id] = word_starts;

}

row_was_updated = true;

}

@@ -424,7 +438,7 @@

// search string. When the user types "colspec=ID%20Mstone Release" we get

// four 'words': "colspec", "id", "mstone" and "release".

String16Vector lower_words(

- history::String16VectorFromString16(lower_unescaped_string, false));

+ history::String16VectorFromString16(lower_unescaped_string, false, NULL));

ScoredHistoryMatches scored_items;

// Do nothing if we have indexed no words (probably because we've not been

@@ -536,8 +550,12 @@

// deleted by the user or the item no longer qualifies as a quick result.

if (hist_pos != private_data_.history_info_map_.end()) {

const URLRow& hist_item = hist_pos->second;

- ScoredHistoryMatch match(

- ScoredMatchForURL(hist_item, lower_string_, lower_terms_));

+ WordStartsMap::const_iterator starts_pos =

+ private_data_.word_starts_map_.find(history_id);

+ DCHECK(starts_pos != private_data_.word_starts_map_.end());

+ ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_string_,

+ lower_terms_,

+ starts_pos->second));

if (match.raw_score > 0)

scored_matches_.push_back(match);

}

@@ -548,7 +566,8 @@

ScoredHistoryMatch URLIndexPrivateData::ScoredMatchForURL(

const URLRow& row,

const string16& lower_string,

- const String16Vector& terms) {

+ const String16Vector& terms,

+ const WordStarts& word_starts) {

ScoredHistoryMatch match(row);

GURL gurl = row.url();

if (!gurl.is_valid())

@@ -664,14 +683,13 @@

// Score component for how early in the match string the first search term

// appears. Start with kStartMaxValue points and discount by

- // kStartMaxValue/kMaxSignificantStart points for each character later than

+ // kStartMaxValue/kMaxSignificantChars points for each character later than

// the first at which the term begins. No points are earned if the start of

- // the match occurs at or after kMaxSignificantStart.

- const size_t kMaxSignificantStart = 50;

+ // the match occurs at or after kMaxSignificantChars.

const int kStartMaxValue = 1000;

- int start_value = (kMaxSignificantStart -

- std::min(kMaxSignificantStart, matches[0].offset)) * kStartMaxValue /

- kMaxSignificantStart;

+ int start_value = (kMaxSignificantChars -

+ std::min(kMaxSignificantChars, matches[0].offset)) * kStartMaxValue /

+ kMaxSignificantChars;

// Score component for how much of the matched string the input terms cover.

// kCompleteMaxValue points times the fraction of the URL/page title string

@@ -928,6 +946,7 @@

InMemoryURLIndexCacheItem* cache) const {

DCHECK(cache);

cache->set_timestamp(base::Time::Now().ToInternalValue());

+ cache->set_version(saved_cache_version_);

// history_item_count_ is no longer used but rather than change the protobuf

// definition use a placeholder. This will go away with the switch to SQLite.

cache->set_history_item_count(0);

@@ -936,6 +955,7 @@

SaveCharWordMap(cache);

SaveWordIDHistoryMap(cache);

SaveHistoryInfoMap(cache);

+ SaveWordStartsMap(cache);

}

void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {

@@ -1020,6 +1040,33 @@

}

+void URLIndexPrivateData::SaveWordStartsMap(

+ InMemoryURLIndexCacheItem* cache) const {

+ if (word_starts_map_.empty())

+ return;

+ // For unit testing: Enable saving of the cache as an earlier version to

+ // allow testing of cache file upgrading in ReadFromFile().

Peter Kasting 2012/03/09 02:37:43 Nit: Is this the best way of doing this? For most

mrossetti 2012/03/14 23:23:49 I hear you, and understand. Since saving using pro

+ if (saved_cache_version_ < 1)

+ return;

+ WordStartsMapItem* map_item = cache->mutable_word_starts_map();

+ map_item->set_item_count(word_starts_map_.size());

+ for (WordStartsMap::const_iterator iter = word_starts_map_.begin();

+ iter != word_starts_map_.end(); ++iter) {

+ WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();

+ map_entry->set_history_id(iter->first);

+ const WordStarts& word_starts(iter->second);

+ for (std::vector<int>::const_iterator siter =

+ word_starts.url_word_starts_.begin();

+ siter != word_starts.url_word_starts_.end(); ++siter)

+ map_entry->add_url_word_starts(*siter);

+ for (std::vector<int>::const_iterator siter =

+ word_starts.title_word_starts_.begin();

+ siter != word_starts.title_word_starts_.end(); ++siter)

+ map_entry->add_title_word_starts(*siter);

+ }

// Cache Restoring -------------------------------------------------------------

bool URLIndexPrivateData::RestoreFromFile(const FilePath& file_path) {

@@ -1090,9 +1137,11 @@

bool URLIndexPrivateData::RestorePrivateData(

const InMemoryURLIndexCacheItem& cache) {

+ if (cache.has_version())

+ restored_cache_version_ = cache.version();

return RestoreWordList(cache) && RestoreWordMap(cache) &&

RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&

- RestoreHistoryInfoMap(cache);

+ RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache);

}

bool URLIndexPrivateData::RestoreWordList(

@@ -1213,4 +1262,54 @@

return true;

}

+bool URLIndexPrivateData::RestoreWordStartsMap(

+ const InMemoryURLIndexCacheItem& cache) {

+ // Note that this function must be called after RestoreHistoryInfoMap() has

+ // been run as the word starts may have to be recalculated from the URLs and

+ // page titles.

+ if (cache.has_word_starts_map()) {

+ const WordStartsMapItem& list_item(cache.word_starts_map());

+ uint32 expected_item_count = list_item.item_count();

+ uint32 actual_item_count = list_item.word_starts_map_entry_size();

+ if (actual_item_count == 0 || actual_item_count != expected_item_count)

+ return false;

+ const RepeatedPtrField<WordStartsMapEntry>&

+ entries(list_item.word_starts_map_entry());

+ for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =

+ entries.begin(); iter != entries.end(); ++iter) {

+ HistoryID history_id = iter->history_id();

+ WordStarts word_starts;

+ // Restore the URL word starts.

+ const RepeatedField<int32>& url_starts(iter->url_word_starts());

+ for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();

Peter Kasting 2012/03/09 02:37:43 Nit: Argh... use |i| and |j| rather than |iter| an

+ jiter != url_starts.end(); ++jiter)

+ word_starts.url_word_starts_.push_back(*jiter);

+ // Restore the page title word starts.

+ const RepeatedField<int32>& title_starts(iter->title_word_starts());

+ for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();

+ jiter != title_starts.end(); ++jiter)

+ word_starts.title_word_starts_.push_back(*jiter);

+ word_starts_map_[history_id] = word_starts;

+ }

+ } else {

+ // Since the cache did not contain any word starts we must rebuild them from

+ // the URL and page titles.

+ for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();

+ iter != history_info_map_.end(); ++iter) {

+ WordStarts word_starts;

+ const URLRow& row(iter->second);

+ string16 url(net::FormatUrl(row.url(), languages_,

+ net::kFormatUrlOmitUsernamePassword,

+ net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS,

+ NULL, NULL, NULL));

+ url = base::i18n::ToLower(url);

+ String16VectorFromString16(url, false, &word_starts.url_word_starts_);

+ String16VectorFromString16(

+ row.title(), false, &word_starts.title_word_starts_);

+ word_starts_map_[iter->first] = word_starts;

+ }

+ return true;

} // namespace history

« chrome/browser/history/in_memory_url_index_types_unittest.cc ('K') | « chrome/browser/history/url_index_private_data.h ('k') | no next file » | no next file with comments »