Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(5549)

Unified Diff: chrome/browser/history/in_memory_url_index.cc

Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: '' Created 9 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/history/in_memory_url_index.cc
===================================================================
--- chrome/browser/history/in_memory_url_index.cc (revision 77314)
+++ chrome/browser/history/in_memory_url_index.cc (working copy)
@@ -7,6 +7,7 @@
#include <algorithm>
#include <iterator>
#include <limits>
+#include <numeric>
#include "base/file_util.h"
#include "base/i18n/break_iterator.h"
@@ -24,8 +25,8 @@
using google::protobuf::RepeatedField;
using google::protobuf::RepeatedPtrField;
+using in_memory_url_index::InMemoryURLIndexCacheItem;
-using in_memory_url_index::InMemoryURLIndexCacheItem;
namespace history {
typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
@@ -45,16 +46,25 @@
const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;
+const float kOrderMaxValue = 50.0;
+const float kStartMaxValue = 50.0;
+const size_t kMaxSignificantStart = 20;
+const float kCompleteMaxValue = 50.0;
+const float kLastVisitMaxValue = 50.0;
+const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
+const float kVisitCountMaxValue = 50.0;
+const int kMaxSignificantVisits = 20;
+const float kTypedCountMaxValue = 50.0;
+const int kMaxSignificantTyped = 20;
+const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
+ kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue;
+const float kMaxNormalizedRawScore = 1000.0;
+
ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}
-ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,
- size_t input_location,
- bool match_in_scheme,
- bool innermost_match,
- int score)
- : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),
- raw_score(score) {
-}
+ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)
+ : HistoryMatch(url_info, 0, false, false),
+ raw_score(0) {}
struct InMemoryURLIndex::TermCharWordSet {
TermCharWordSet() // Required for STL resize().
@@ -76,6 +86,16 @@
bool used_; // true if this set has been used for the current term search.
};
+// Comparison function for sorting TermMatches by their offsets.
+bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) {
+ return m1.offset < m2.offset;
+}
+
+// std::accumulate helper function to add up TermMatches' lengths.
+int AddumulateMatchLength(int total, const TermMatch& match) {
Peter Kasting 2011/03/09 02:31:48 Nit: Addumulate -> Accumulate
mrossetti 2011/03/10 01:31:14 LOL
+ return total + match.length;
+}
+
InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)
: history_dir_(history_dir),
history_item_count_(0) {
@@ -109,11 +129,10 @@
UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
NULL, NULL, NULL));
- // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
HistoryID history_id = static_cast<HistoryID>(row.id());
+ DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max());
// Add the row for quick lookup in the history info store.
- url = l10n_util::ToLower(url);
URLRow new_row(GURL(url), row.id());
new_row.set_visit_count(row.visit_count());
new_row.set_typed_count(row.typed_count());
@@ -121,16 +140,18 @@
new_row.set_title(row.title());
history_info_map_[history_id] = new_row;
- // Split into individual, unique words.
- String16Set words = WordSetFromString16(url);
+ // Split URL into individual, unique words then add in the title words.
+ url = l10n_util::ToLower(url);
+ String16Set url_words = WordSetFromString16(url);
+ String16Set title_words = WordSetFromString16(row.title());
+ String16Set words;
+ std::set_union(url_words.begin(), url_words.end(),
+ title_words.begin(), title_words.end(),
+ std::insert_iterator<String16Set>(words, words.begin()));
+ for (String16Set::iterator word_iter = words.begin();
+ word_iter != words.end(); ++word_iter)
+ AddWordToIndex(*word_iter, history_id);
- // For each word, add a new entry into the word index referring to the
- // associated history item.
- for (String16Set::iterator iter = words.begin();
- iter != words.end(); ++iter) {
- String16Set::value_type uni_word = *iter;
- AddWordToIndex(uni_word, history_id);
- }
++history_item_count_;
return true;
}
@@ -292,14 +313,8 @@
// TODO(mrossetti): Another opportunity for a transform algorithm.
String16Vector lower_terms;
for (String16Vector::const_iterator term_iter = terms.begin();
- term_iter != terms.end(); ++term_iter) {
- String16Vector::value_type lower_string(*term_iter);
- std::transform(lower_string.begin(),
- lower_string.end(),
- lower_string.begin(),
- tolower);
- lower_terms.push_back(lower_string);
- }
+ term_iter != terms.end(); ++term_iter)
+ lower_terms.push_back(InMemoryURLIndex::ToLower(*term_iter));
String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
@@ -372,6 +387,13 @@
Char16Vector uni_chars = Char16VectorFromString16(uni_word);
WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
+ // TODO(mrossetti): At this point, as a possible optimization, we could
+ // scan through all candidate words and make sure the |uni_word| is a
+ // substring within the candidate words, eliminating those which aren't.
+ // I'm not sure it would be worth the effort. And remember, we've got to do
+ // a final substring match in order to get the highlighting ranges later
+ // in the process in any case.
+
// If any words resulted then we can compose a set of history IDs by unioning
// the sets from each word.
if (!word_id_set.empty()) {
@@ -393,14 +415,51 @@
// Utility Functions
// static
+string16 InMemoryURLIndex::ToLower(const string16& upper_string) {
+ string16 lower_string(upper_string);
Peter Kasting 2011/03/09 02:31:48 Nit: How about: string16 lower_string; lower_
mrossetti 2011/03/10 01:31:14 This function replaced with l10n_util::ToLower().
+ std::transform(lower_string.begin(),
+ lower_string.end(),
+ lower_string.begin(),
+ tolower);
Peter Kasting 2011/03/09 02:31:48 Is this calling the C tolower() function? If so,
mrossetti 2011/03/10 01:31:14 Done. Replaced function.
+ return lower_string;
+}
+
+// static
InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
const string16& uni_string) {
- String16Set words;
- base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD);
- if (iter.Init()) {
- while (iter.Advance()) {
+ String16Vector words = WordVectorFromString16(uni_string, false);
+ String16Set word_set;
+ for (String16Vector::const_iterator iter = words.begin(); iter != words.end();
+ ++iter)
+ word_set.insert(InMemoryURLIndex::ToLower(*iter));
+ return word_set;
+}
+
+// static
+InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(
+ const string16& uni_string,
+ bool break_on_space) {
+ base::BreakIterator iter(&uni_string, break_on_space ?
+ base::BreakIterator::BREAK_SPACE :
Peter Kasting 2011/03/09 02:31:48 Nit: You can collapse lines 2 & 3 to a single line
mrossetti 2011/03/10 01:31:14 Done.
+ base::BreakIterator::BREAK_WORD);
+ String16Vector words;
+ if (break_on_space) {
+ if (iter.Init()) {
+ while (iter.Advance()) {
+ string16 word = iter.GetString();
+ TrimWhitespace(word, TRIM_ALL, &word);
+ if (word.size())
Peter Kasting 2011/03/09 02:31:48 Nit: !word.empty()
mrossetti 2011/03/10 01:31:14 Done.
+ words.push_back(word);
+ }
+ }
+ } else {
+ if (iter.Init()) {
if (iter.IsWord())
- words.insert(iter.GetString());
+ words.push_back(iter.GetString());
Peter Kasting 2011/03/09 02:31:48 I confess, I'm a little confused why one arm of th
mrossetti 2011/03/10 01:31:14 The iterator behaves differently depending on the
Peter Kasting 2011/03/10 02:13:43 If BreakIterator should be changed, can you file a
+ while (iter.Advance()) {
+ if (iter.IsWord())
+ words.push_back(iter.GetString());
+ }
}
}
return words;
@@ -546,67 +605,83 @@
}
// static
-int InMemoryURLIndex::RawScoreForURL(const URLRow& row,
- const String16Vector& terms,
- size_t* first_term_location) {
+TermMatches InMemoryURLIndex::MatchTermInString(const string16& term,
+ const string16& string,
+ int term_num) {
+ TermMatches matches;
+ for (size_t location = string.find(term); location != string16::npos;
+ location = string.find(term, location + 1))
+ matches.push_back(TermMatch(term_num, location, term.size()));
+ return matches;
+}
+
+// static
+TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {
+ if (matches.empty())
+ return matches;
+ TermMatches sorted_matches = matches;
+ std::sort(sorted_matches.begin(), sorted_matches.end(),
+ CompareMatchOffset);
+ TermMatches clean_matches;
+ TermMatch last_match = sorted_matches[0];
+ clean_matches.push_back(last_match);
+ for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;
+ iter != sorted_matches.end(); ++iter) {
+ if (iter->offset >= last_match.offset + last_match.length) {
+ last_match = *iter;
+ clean_matches.push_back(last_match);
+ }
+ }
+ return clean_matches;
+}
+
+// static
+ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(
+ const URLRow& row,
+ const String16Vector& terms) {
+ ScoredHistoryMatch match(row);
GURL gurl = row.url();
if (!gurl.is_valid())
- return 0;
+ return match;
- string16 url = UTF8ToUTF16(gurl.spec());
-
- // Collect all term start positions so we can see if they appear in order.
- std::vector<size_t> term_locations;
- int out_of_order = 0; // Count the terms which are out of order.
- size_t start_location_total = 0;
- size_t term_length_total = 0;
+ // Figure out where each search term appears in the URL and/or page title
+ // so that we can score as well as provide autocomplete highlighting.
+ string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec()));
+ url = l10n_util::ToLower(url);
+ string16 title = row.title();
+ int term_num = 0;
for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
- ++iter) {
+ ++iter, ++term_num) {
string16 term = *iter;
- size_t term_location = url.find(term);
- if (term_location == string16::npos)
- return 0; // A term was not found. This should not happen.
- if (iter != terms.begin()) {
- // See if this term is out-of-order.
- for (std::vector<size_t>::const_iterator order_iter =
- term_locations.begin(); order_iter != term_locations.end();
- ++order_iter) {
- if (term_location <= *order_iter)
- ++out_of_order;
- }
- } else {
- *first_term_location = term_location;
- }
- term_locations.push_back(term_location);
- start_location_total += term_location;
- term_length_total += term.size();
+ TermMatches url_term_matches = MatchTermInString(term, url, term_num);
+ TermMatches title_term_matches = MatchTermInString(term, title, term_num);
+ if (url_term_matches.empty() && title_term_matches.empty())
+ return match; // A term was not found in either URL or title - reject.
+ match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),
+ url_term_matches.end());
+ match.title_matches.insert(match.title_matches.end(),
+ title_term_matches.begin(),
+ title_term_matches.end());
}
- // Calculate a raw score.
- // TODO(mrossetti): This is good enough for now but must be fine-tuned.
- const float kOrderMaxValue = 10.0;
- float order_value = 10.0;
- if (terms.size() > 1) {
- int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;
- order_value =
- (static_cast<float>(max_possible_out_of_order - out_of_order) /
- max_possible_out_of_order) * kOrderMaxValue;
- }
+ // Sort matches by offset and eliminate any which overlap.
+ match.url_matches = SortAndDeoverlap(match.url_matches);
+ match.title_matches = SortAndDeoverlap(match.title_matches);
- const float kStartMaxValue = 10.0;
- const size_t kMaxSignificantStart = 20;
- float start_value =
- (static_cast<float>(kMaxSignificantStart -
- std::min(kMaxSignificantStart, start_location_total / terms.size()))) /
- static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
+ // Get partial scores based on term matching. Note that the score for
+ // each of the URL and title are adjusted by the fraction of the
+ // terms appearing in each.
+ int url_score = RawScoreForMatches(match.url_matches, url.size()) *
+ match.url_matches.size() / terms.size();
+ int title_score = RawScoreForMatches(match.title_matches, title.size()) *
+ match.title_matches.size() / terms.size();
+ // Arbitrarily pick the best.
+ // TODO(mrossetti): It might make sense that a term which appears in both the
+ // URL and the Title should boost the score a bit.
+ int term_score = std::max(url_score, title_score);
- const float kCompleteMaxValue = 10.0;
- float complete_value =
- (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *
- kStartMaxValue;
-
- const float kLastVisitMaxValue = 10.0;
- const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
+ // Factor in attributes of the URLRow.
+ // Items which have been visited recently score higher.
int64 delta_time = (kMaxSignificantDay -
std::min((base::Time::Now() - row.last_visit()),
kMaxSignificantDay)).ToInternalValue();
@@ -615,37 +690,62 @@
static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
kLastVisitMaxValue;
- const float kVisitCountMaxValue = 10.0;
- const int kMaxSignificantVisits = 10;
float visit_count_value =
(static_cast<float>(std::min(row.visit_count(),
kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
kVisitCountMaxValue;
- const float kTypedCountMaxValue = 20.0;
- const int kMaxSignificantTyped = 10;
float typed_count_value =
(static_cast<float>(std::min(row.typed_count(),
kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *
kTypedCountMaxValue;
- float raw_score = order_value + start_value + complete_value +
- last_visit_value + visit_count_value + typed_count_value;
+ float raw_score = term_score + last_visit_value + visit_count_value +
+ typed_count_value;
// Normalize the score.
- const float kMaxNormalizedRawScore = 1000.0;
- raw_score =
- (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
- kLastVisitMaxValue + kVisitCountMaxValue +
- kTypedCountMaxValue)) *
- kMaxNormalizedRawScore;
- return static_cast<int>(raw_score);
+ match.raw_score = static_cast<int>((raw_score / kMaxRawScore) *
+ kMaxNormalizedRawScore);
+
+ return match;
}
-// static
-base::Time InMemoryURLIndex::RecentThreshold() {
- return base::Time::Now() -
- base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);
+int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches,
+ size_t max_length) {
+ // TODO(mrossetti): This is good enough for now but must be fine-tuned.
+ if (matches.empty())
+ return 0;
+
+ // Search terms appearing in order score higher.
+ float order_value = 10.0;
+ if (matches.size() > 1) {
+ int max_possible_out_of_order = matches.size() - 1;
+ int out_of_order = 0;
+ for (size_t i = 1; i < matches.size(); ++i) {
+ if (matches[i-1].term_num > matches[i].term_num)
Peter Kasting 2011/03/09 02:31:48 Nit: Spaces around '-'
mrossetti 2011/03/10 01:31:14 Done.
+ ++out_of_order;
Peter Kasting 2011/03/09 02:31:48 Nit: Indent 2
mrossetti 2011/03/10 01:31:14 Done.
+ }
+ order_value =
+ (static_cast<float>(max_possible_out_of_order - out_of_order) /
+ max_possible_out_of_order) * kOrderMaxValue;
+ }
+
+ // Search terms which appear earlier score higher.
+ // No points if the first search term does not appear in the first
+ // kMaxSignificantStart chars.
+ float start_value =
+ (static_cast<float>(kMaxSignificantStart -
+ std::min(kMaxSignificantStart, matches[0].offset))) /
+ static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
Peter Kasting 2011/03/09 02:31:48 Nit: Is this second cast needed?
mrossetti 2011/03/10 01:31:14 Yes.
+
+ // Search terms which comprise a greater portion score higher.
+ size_t term_length_total = std::accumulate(matches.begin(), matches.end(),
+ 0, AddumulateMatchLength);
+ float complete_value =
+ (static_cast<float>(term_length_total) / static_cast<float>(max_length)) *
+ kStartMaxValue;
+
+ return static_cast<int>(order_value + start_value + complete_value);
}
InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
@@ -666,26 +766,17 @@
// deleted by the user or the item no longer qualifies as a quick result.
if (hist_pos != index_.history_info_map_.end()) {
const URLRow& hist_item = hist_pos->second;
- // TODO(mrossetti): Accommodate multiple term highlighting.
- size_t input_location = 0;
- int score = InMemoryURLIndex::RawScoreForURL(
- hist_item, lower_terms_, &input_location);
- if (score != 0) {
+ ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));
+ if (match.raw_score != 0) {
// We only retain the top 10 highest scoring results so
// see if this one fits into the top 10 and, if so, where.
ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
while (scored_iter != scored_matches_.end() &&
- (*scored_iter).raw_score > score)
+ (*scored_iter).raw_score > match.raw_score)
++scored_iter;
if ((scored_matches_.size() < 10) ||
(scored_iter != scored_matches_.end())) {
- // Create and insert the new item.
- // TODO(mrossetti): Properly set |match_in_scheme| and
- // |innermost_match|.
- bool match_in_scheme = false;
- bool innermost_match = true;
- ScoredHistoryMatch match(hist_item, input_location,
- match_in_scheme, innermost_match, score);
+ // Insert the new item.
if (!scored_matches_.empty())
scored_matches_.insert(scored_iter, match);
else

Powered by Google App Engine
This is Rietveld 408576698