Chromium Code Reviews| Index: chrome/browser/history/in_memory_url_index.cc |
| =================================================================== |
| --- chrome/browser/history/in_memory_url_index.cc (revision 77314) |
| +++ chrome/browser/history/in_memory_url_index.cc (working copy) |
| @@ -7,6 +7,7 @@ |
| #include <algorithm> |
| #include <iterator> |
| #include <limits> |
| +#include <numeric> |
| #include "base/file_util.h" |
| #include "base/i18n/break_iterator.h" |
| @@ -24,8 +25,8 @@ |
| using google::protobuf::RepeatedField; |
| using google::protobuf::RepeatedPtrField; |
| +using in_memory_url_index::InMemoryURLIndexCacheItem; |
| -using in_memory_url_index::InMemoryURLIndexCacheItem; |
| namespace history { |
| typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; |
| @@ -45,16 +46,25 @@ |
| const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; |
| +const float kOrderMaxValue = 50.0; |
| +const float kStartMaxValue = 50.0; |
| +const size_t kMaxSignificantStart = 20; |
| +const float kCompleteMaxValue = 50.0; |
| +const float kLastVisitMaxValue = 50.0; |
| +const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); |
| +const float kVisitCountMaxValue = 50.0; |
| +const int kMaxSignificantVisits = 20; |
| +const float kTypedCountMaxValue = 50.0; |
| +const int kMaxSignificantTyped = 20; |
| +const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + |
| + kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue; |
| +const float kMaxNormalizedRawScore = 1000.0; |
| + |
| ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} |
| -ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, |
| - size_t input_location, |
| - bool match_in_scheme, |
| - bool innermost_match, |
| - int score) |
| - : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), |
| - raw_score(score) { |
| -} |
| +ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info) |
| + : HistoryMatch(url_info, 0, false, false), |
| + raw_score(0) {} |
| struct InMemoryURLIndex::TermCharWordSet { |
| TermCharWordSet() // Required for STL resize(). |
| @@ -76,6 +86,16 @@ |
| bool used_; // true if this set has been used for the current term search. |
| }; |
| +// Comparison function for sorting TermMatches by their offsets. |
| +bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) { |
| + return m1.offset < m2.offset; |
| +} |
| + |
| +// std::accumulate helper function to add up TermMatches' lengths. |
| +int AddumulateMatchLength(int total, const TermMatch& match) { |
|
Peter Kasting
2011/03/09 02:31:48
Nit: Addumulate -> Accumulate
mrossetti
2011/03/10 01:31:14
LOL
|
| + return total + match.length; |
| +} |
| + |
| InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) |
| : history_dir_(history_dir), |
| history_item_count_(0) { |
| @@ -109,11 +129,10 @@ |
| UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, |
| NULL, NULL, NULL)); |
| - // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). |
| HistoryID history_id = static_cast<HistoryID>(row.id()); |
| + DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max()); |
| // Add the row for quick lookup in the history info store. |
| - url = l10n_util::ToLower(url); |
| URLRow new_row(GURL(url), row.id()); |
| new_row.set_visit_count(row.visit_count()); |
| new_row.set_typed_count(row.typed_count()); |
| @@ -121,16 +140,18 @@ |
| new_row.set_title(row.title()); |
| history_info_map_[history_id] = new_row; |
| - // Split into individual, unique words. |
| - String16Set words = WordSetFromString16(url); |
| + // Split URL into individual, unique words then add in the title words. |
| + url = l10n_util::ToLower(url); |
| + String16Set url_words = WordSetFromString16(url); |
| + String16Set title_words = WordSetFromString16(row.title()); |
| + String16Set words; |
| + std::set_union(url_words.begin(), url_words.end(), |
| + title_words.begin(), title_words.end(), |
| + std::insert_iterator<String16Set>(words, words.begin())); |
| + for (String16Set::iterator word_iter = words.begin(); |
| + word_iter != words.end(); ++word_iter) |
| + AddWordToIndex(*word_iter, history_id); |
| - // For each word, add a new entry into the word index referring to the |
| - // associated history item. |
| - for (String16Set::iterator iter = words.begin(); |
| - iter != words.end(); ++iter) { |
| - String16Set::value_type uni_word = *iter; |
| - AddWordToIndex(uni_word, history_id); |
| - } |
| ++history_item_count_; |
| return true; |
| } |
| @@ -292,14 +313,8 @@ |
| // TODO(mrossetti): Another opportunity for a transform algorithm. |
| String16Vector lower_terms; |
| for (String16Vector::const_iterator term_iter = terms.begin(); |
| - term_iter != terms.end(); ++term_iter) { |
| - String16Vector::value_type lower_string(*term_iter); |
| - std::transform(lower_string.begin(), |
| - lower_string.end(), |
| - lower_string.begin(), |
| - tolower); |
| - lower_terms.push_back(lower_string); |
| - } |
| + term_iter != terms.end(); ++term_iter) |
| + lower_terms.push_back(InMemoryURLIndex::ToLower(*term_iter)); |
| String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); |
| HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); |
| @@ -372,6 +387,13 @@ |
| Char16Vector uni_chars = Char16VectorFromString16(uni_word); |
| WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); |
| + // TODO(mrossetti): At this point, as a possible optimization, we could |
| + // scan through all candidate words and make sure the |uni_word| is a |
| + // substring within the candidate words, eliminating those which aren't. |
| + // I'm not sure it would be worth the effort. And remember, we've got to do |
| + // a final substring match in order to get the highlighting ranges later |
| + // in the process in any case. |
| + |
| // If any words resulted then we can compose a set of history IDs by unioning |
| // the sets from each word. |
| if (!word_id_set.empty()) { |
| @@ -393,14 +415,51 @@ |
| // Utility Functions |
| // static |
| +string16 InMemoryURLIndex::ToLower(const string16& upper_string) { |
| + string16 lower_string(upper_string); |
|
Peter Kasting
2011/03/09 02:31:48
Nit: How about:
string16 lower_string;
lower_
mrossetti
2011/03/10 01:31:14
This function replaced with l10n_util::ToLower().
|
| + std::transform(lower_string.begin(), |
| + lower_string.end(), |
| + lower_string.begin(), |
| + tolower); |
|
Peter Kasting
2011/03/09 02:31:48
Is this calling the C tolower() function? If so,
mrossetti
2011/03/10 01:31:14
Done. Replaced function.
|
| + return lower_string; |
| +} |
| + |
| +// static |
| InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( |
| const string16& uni_string) { |
| - String16Set words; |
| - base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD); |
| - if (iter.Init()) { |
| - while (iter.Advance()) { |
| + String16Vector words = WordVectorFromString16(uni_string, false); |
| + String16Set word_set; |
| + for (String16Vector::const_iterator iter = words.begin(); iter != words.end(); |
| + ++iter) |
| + word_set.insert(InMemoryURLIndex::ToLower(*iter)); |
| + return word_set; |
| +} |
| + |
| +// static |
| +InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16( |
| + const string16& uni_string, |
| + bool break_on_space) { |
| + base::BreakIterator iter(&uni_string, break_on_space ? |
| + base::BreakIterator::BREAK_SPACE : |
|
Peter Kasting
2011/03/09 02:31:48
Nit: You can collapse lines 2 & 3 to a single line
mrossetti
2011/03/10 01:31:14
Done.
|
| + base::BreakIterator::BREAK_WORD); |
| + String16Vector words; |
| + if (break_on_space) { |
| + if (iter.Init()) { |
| + while (iter.Advance()) { |
| + string16 word = iter.GetString(); |
| + TrimWhitespace(word, TRIM_ALL, &word); |
| + if (word.size()) |
|
Peter Kasting
2011/03/09 02:31:48
Nit: !word.empty()
mrossetti
2011/03/10 01:31:14
Done.
|
| + words.push_back(word); |
| + } |
| + } |
| + } else { |
| + if (iter.Init()) { |
| if (iter.IsWord()) |
| - words.insert(iter.GetString()); |
| + words.push_back(iter.GetString()); |
|
Peter Kasting
2011/03/09 02:31:48
I confess, I'm a little confused why one arm of th
mrossetti
2011/03/10 01:31:14
The iterator behaves differently depending on the
Peter Kasting
2011/03/10 02:13:43
If BreakIterator should be changed, can you file a
|
| + while (iter.Advance()) { |
| + if (iter.IsWord()) |
| + words.push_back(iter.GetString()); |
| + } |
| } |
| } |
| return words; |
| @@ -546,67 +605,83 @@ |
| } |
| // static |
| -int InMemoryURLIndex::RawScoreForURL(const URLRow& row, |
| - const String16Vector& terms, |
| - size_t* first_term_location) { |
| +TermMatches InMemoryURLIndex::MatchTermInString(const string16& term, |
| + const string16& string, |
| + int term_num) { |
| + TermMatches matches; |
| + for (size_t location = string.find(term); location != string16::npos; |
| + location = string.find(term, location + 1)) |
| + matches.push_back(TermMatch(term_num, location, term.size())); |
| + return matches; |
| +} |
| + |
| +// static |
| +TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) { |
| + if (matches.empty()) |
| + return matches; |
| + TermMatches sorted_matches = matches; |
| + std::sort(sorted_matches.begin(), sorted_matches.end(), |
| + CompareMatchOffset); |
| + TermMatches clean_matches; |
| + TermMatch last_match = sorted_matches[0]; |
| + clean_matches.push_back(last_match); |
| + for (TermMatches::const_iterator iter = sorted_matches.begin() + 1; |
| + iter != sorted_matches.end(); ++iter) { |
| + if (iter->offset >= last_match.offset + last_match.length) { |
| + last_match = *iter; |
| + clean_matches.push_back(last_match); |
| + } |
| + } |
| + return clean_matches; |
| +} |
| + |
| +// static |
| +ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL( |
| + const URLRow& row, |
| + const String16Vector& terms) { |
| + ScoredHistoryMatch match(row); |
| GURL gurl = row.url(); |
| if (!gurl.is_valid()) |
| - return 0; |
| + return match; |
| - string16 url = UTF8ToUTF16(gurl.spec()); |
| - |
| - // Collect all term start positions so we can see if they appear in order. |
| - std::vector<size_t> term_locations; |
| - int out_of_order = 0; // Count the terms which are out of order. |
| - size_t start_location_total = 0; |
| - size_t term_length_total = 0; |
| + // Figure out where each search term appears in the URL and/or page title |
| + // so that we can score as well as provide autocomplete highlighting. |
| + string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec())); |
| + url = l10n_util::ToLower(url); |
| + string16 title = row.title(); |
| + int term_num = 0; |
| for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); |
| - ++iter) { |
| + ++iter, ++term_num) { |
| string16 term = *iter; |
| - size_t term_location = url.find(term); |
| - if (term_location == string16::npos) |
| - return 0; // A term was not found. This should not happen. |
| - if (iter != terms.begin()) { |
| - // See if this term is out-of-order. |
| - for (std::vector<size_t>::const_iterator order_iter = |
| - term_locations.begin(); order_iter != term_locations.end(); |
| - ++order_iter) { |
| - if (term_location <= *order_iter) |
| - ++out_of_order; |
| - } |
| - } else { |
| - *first_term_location = term_location; |
| - } |
| - term_locations.push_back(term_location); |
| - start_location_total += term_location; |
| - term_length_total += term.size(); |
| + TermMatches url_term_matches = MatchTermInString(term, url, term_num); |
| + TermMatches title_term_matches = MatchTermInString(term, title, term_num); |
| + if (url_term_matches.empty() && title_term_matches.empty()) |
| + return match; // A term was not found in either URL or title - reject. |
| + match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(), |
| + url_term_matches.end()); |
| + match.title_matches.insert(match.title_matches.end(), |
| + title_term_matches.begin(), |
| + title_term_matches.end()); |
| } |
| - // Calculate a raw score. |
| - // TODO(mrossetti): This is good enough for now but must be fine-tuned. |
| - const float kOrderMaxValue = 10.0; |
| - float order_value = 10.0; |
| - if (terms.size() > 1) { |
| - int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; |
| - order_value = |
| - (static_cast<float>(max_possible_out_of_order - out_of_order) / |
| - max_possible_out_of_order) * kOrderMaxValue; |
| - } |
| + // Sort matches by offset and eliminate any which overlap. |
| + match.url_matches = SortAndDeoverlap(match.url_matches); |
| + match.title_matches = SortAndDeoverlap(match.title_matches); |
| - const float kStartMaxValue = 10.0; |
| - const size_t kMaxSignificantStart = 20; |
| - float start_value = |
| - (static_cast<float>(kMaxSignificantStart - |
| - std::min(kMaxSignificantStart, start_location_total / terms.size()))) / |
| - static_cast<float>(kMaxSignificantStart) * kStartMaxValue; |
| + // Get partial scores based on term matching. Note that the score for |
| + // each of the URL and title are adjusted by the fraction of the |
| + // terms appearing in each. |
| + int url_score = RawScoreForMatches(match.url_matches, url.size()) * |
| + match.url_matches.size() / terms.size(); |
| + int title_score = RawScoreForMatches(match.title_matches, title.size()) * |
| + match.title_matches.size() / terms.size(); |
| + // Arbitrarily pick the best. |
| + // TODO(mrossetti): It might make sense that a term which appears in both the |
| + // URL and the Title should boost the score a bit. |
| + int term_score = std::max(url_score, title_score); |
| - const float kCompleteMaxValue = 10.0; |
| - float complete_value = |
| - (static_cast<float>(term_length_total) / static_cast<float>(url.size())) * |
| - kStartMaxValue; |
| - |
| - const float kLastVisitMaxValue = 10.0; |
| - const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); |
| + // Factor in attributes of the URLRow. |
| + // Items which have been visited recently score higher. |
| int64 delta_time = (kMaxSignificantDay - |
| std::min((base::Time::Now() - row.last_visit()), |
| kMaxSignificantDay)).ToInternalValue(); |
| @@ -615,37 +690,62 @@ |
| static_cast<float>(kMaxSignificantDay.ToInternalValue())) * |
| kLastVisitMaxValue; |
| - const float kVisitCountMaxValue = 10.0; |
| - const int kMaxSignificantVisits = 10; |
| float visit_count_value = |
| (static_cast<float>(std::min(row.visit_count(), |
| kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * |
| kVisitCountMaxValue; |
| - const float kTypedCountMaxValue = 20.0; |
| - const int kMaxSignificantTyped = 10; |
| float typed_count_value = |
| (static_cast<float>(std::min(row.typed_count(), |
| kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * |
| kTypedCountMaxValue; |
| - float raw_score = order_value + start_value + complete_value + |
| - last_visit_value + visit_count_value + typed_count_value; |
| + float raw_score = term_score + last_visit_value + visit_count_value + |
| + typed_count_value; |
| // Normalize the score. |
| - const float kMaxNormalizedRawScore = 1000.0; |
| - raw_score = |
| - (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + |
| - kLastVisitMaxValue + kVisitCountMaxValue + |
| - kTypedCountMaxValue)) * |
| - kMaxNormalizedRawScore; |
| - return static_cast<int>(raw_score); |
| + match.raw_score = static_cast<int>((raw_score / kMaxRawScore) * |
| + kMaxNormalizedRawScore); |
| + |
| + return match; |
| } |
| -// static |
| -base::Time InMemoryURLIndex::RecentThreshold() { |
| - return base::Time::Now() - |
| - base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); |
| +int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches, |
| + size_t max_length) { |
| + // TODO(mrossetti): This is good enough for now but must be fine-tuned. |
| + if (matches.empty()) |
| + return 0; |
| + |
| + // Search terms appearing in order score higher. |
| + float order_value = 10.0; |
| + if (matches.size() > 1) { |
| + int max_possible_out_of_order = matches.size() - 1; |
| + int out_of_order = 0; |
| + for (size_t i = 1; i < matches.size(); ++i) { |
| + if (matches[i-1].term_num > matches[i].term_num) |
|
Peter Kasting
2011/03/09 02:31:48
Nit: Spaces around '-'
mrossetti
2011/03/10 01:31:14
Done.
|
| + ++out_of_order; |
|
Peter Kasting
2011/03/09 02:31:48
Nit: Indent 2
mrossetti
2011/03/10 01:31:14
Done.
|
| + } |
| + order_value = |
| + (static_cast<float>(max_possible_out_of_order - out_of_order) / |
| + max_possible_out_of_order) * kOrderMaxValue; |
| + } |
| + |
| + // Search terms which appear earlier score higher. |
| + // No points if the first search term does not appear in the first |
| + // kMaxSignificantStart chars. |
| + float start_value = |
| + (static_cast<float>(kMaxSignificantStart - |
| + std::min(kMaxSignificantStart, matches[0].offset))) / |
| + static_cast<float>(kMaxSignificantStart) * kStartMaxValue; |
|
Peter Kasting
2011/03/09 02:31:48
Nit: Is this second cast needed?
mrossetti
2011/03/10 01:31:14
Yes.
|
| + |
| + // Search terms which comprise a greater portion score higher. |
| + size_t term_length_total = std::accumulate(matches.begin(), matches.end(), |
| + 0, AddumulateMatchLength); |
| + float complete_value = |
| + (static_cast<float>(term_length_total) / static_cast<float>(max_length)) * |
| + kStartMaxValue; |
| + |
| + return static_cast<int>(order_value + start_value + complete_value); |
| } |
| InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( |
| @@ -666,26 +766,17 @@ |
| // deleted by the user or the item no longer qualifies as a quick result. |
| if (hist_pos != index_.history_info_map_.end()) { |
| const URLRow& hist_item = hist_pos->second; |
| - // TODO(mrossetti): Accommodate multiple term highlighting. |
| - size_t input_location = 0; |
| - int score = InMemoryURLIndex::RawScoreForURL( |
| - hist_item, lower_terms_, &input_location); |
| - if (score != 0) { |
| + ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_)); |
| + if (match.raw_score != 0) { |
| // We only retain the top 10 highest scoring results so |
| // see if this one fits into the top 10 and, if so, where. |
| ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); |
| while (scored_iter != scored_matches_.end() && |
| - (*scored_iter).raw_score > score) |
| + (*scored_iter).raw_score > match.raw_score) |
| ++scored_iter; |
| if ((scored_matches_.size() < 10) || |
| (scored_iter != scored_matches_.end())) { |
| - // Create and insert the new item. |
| - // TODO(mrossetti): Properly set |match_in_scheme| and |
| - // |innermost_match|. |
| - bool match_in_scheme = false; |
| - bool innermost_match = true; |
| - ScoredHistoryMatch match(hist_item, input_location, |
| - match_in_scheme, innermost_match, score); |
| + // Insert the new item. |
| if (!scored_matches_.empty()) |
| scored_matches_.insert(scored_iter, match); |
| else |