Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1398)

Unified Diff: chrome/browser/history/in_memory_url_index.cc

Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: '' Created 9 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/history/in_memory_url_index.cc
===================================================================
--- chrome/browser/history/in_memory_url_index.cc (revision 77643)
+++ chrome/browser/history/in_memory_url_index.cc (working copy)
@@ -7,6 +7,7 @@
#include <algorithm>
#include <iterator>
#include <limits>
+#include <numeric>
#include "base/file_util.h"
#include "base/i18n/break_iterator.h"
@@ -17,6 +18,8 @@
#include "chrome/browser/autocomplete/history_provider_util.h"
#include "chrome/browser/history/url_database.h"
#include "chrome/browser/profiles/profile.h"
+#include "chrome/common/url_constants.h"
+#include "googleurl/src/url_util.h"
#include "net/base/escape.h"
#include "net/base/net_util.h"
#include "third_party/protobuf/src/google/protobuf/repeated_field.h"
@@ -24,8 +27,8 @@
using google::protobuf::RepeatedField;
using google::protobuf::RepeatedPtrField;
+using in_memory_url_index::InMemoryURLIndexCacheItem;
-using in_memory_url_index::InMemoryURLIndexCacheItem;
namespace history {
typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
@@ -45,17 +48,29 @@
const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;
-ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}
+const float kOrderMaxValue = 50.0;
+const float kStartMaxValue = 50.0;
+const size_t kMaxSignificantStart = 20;
+const float kCompleteMaxValue = 50.0;
+const float kLastVisitMaxValue = 50.0;
+const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
+const float kVisitCountMaxValue = 50.0;
+const int kMaxSignificantVisits = 20;
+const float kTypedCountMaxValue = 50.0;
+const int kMaxSignificantTyped = 20;
+const float kMaxRawScore = kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
+ kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue;
+const float kMaxNormalizedRawScore = 1000.0;
-ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,
- size_t input_location,
- bool match_in_scheme,
- bool innermost_match,
- int score)
- : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),
- raw_score(score) {
-}
+ScoredHistoryMatch::ScoredHistoryMatch()
+ : raw_score(0),
+ prefix_adjust(0) {}
+ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)
+ : HistoryMatch(url_info, 0, false, false),
+ raw_score(0),
+ prefix_adjust(0) {}
+
struct InMemoryURLIndex::TermCharWordSet {
TermCharWordSet() // Required for STL resize().
: char_(0),
@@ -76,6 +91,16 @@
bool used_; // true if this set has been used for the current term search.
};
+// Comparison function for sorting TermMatches by their offsets.
+bool CompareMatchOffset(const TermMatch& m1, const TermMatch& m2) {
+ return m1.offset < m2.offset;
+}
+
+// std::accumulate helper function to add up TermMatches' lengths.
+int AccumulateMatchLength(int total, const TermMatch& match) {
+ return total + match.length;
+}
+
InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)
: history_dir_(history_dir),
history_item_count_(0) {
@@ -109,11 +134,10 @@
UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
NULL, NULL, NULL));
- // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
HistoryID history_id = static_cast<HistoryID>(row.id());
+ DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max());
// Add the row for quick lookup in the history info store.
- url = l10n_util::ToLower(url);
URLRow new_row(GURL(url), row.id());
new_row.set_visit_count(row.visit_count());
new_row.set_typed_count(row.typed_count());
@@ -121,16 +145,18 @@
new_row.set_title(row.title());
history_info_map_[history_id] = new_row;
- // Split into individual, unique words.
- String16Set words = WordSetFromString16(url);
+ // Split URL into individual, unique words then add in the title words.
+ url = l10n_util::ToLower(url);
+ String16Set url_words = WordSetFromString16(url);
+ String16Set title_words = WordSetFromString16(row.title());
+ String16Set words;
+ std::set_union(url_words.begin(), url_words.end(),
+ title_words.begin(), title_words.end(),
+ std::insert_iterator<String16Set>(words, words.begin()));
+ for (String16Set::iterator word_iter = words.begin();
+ word_iter != words.end(); ++word_iter)
+ AddWordToIndex(*word_iter, history_id);
- // For each word, add a new entry into the word index referring to the
- // associated history item.
- for (String16Set::iterator iter = words.begin();
- iter != words.end(); ++iter) {
- String16Set::value_type uni_word = *iter;
- AddWordToIndex(uni_word, history_id);
- }
++history_item_count_;
return true;
}
@@ -292,14 +318,8 @@
// TODO(mrossetti): Another opportunity for a transform algorithm.
String16Vector lower_terms;
for (String16Vector::const_iterator term_iter = terms.begin();
- term_iter != terms.end(); ++term_iter) {
- String16Vector::value_type lower_string(*term_iter);
- std::transform(lower_string.begin(),
- lower_string.end(),
- lower_string.begin(),
- tolower);
- lower_terms.push_back(lower_string);
- }
+ term_iter != terms.end(); ++term_iter)
+ lower_terms.push_back(l10n_util::ToLower(*term_iter));
String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
@@ -372,6 +392,13 @@
Char16Vector uni_chars = Char16VectorFromString16(uni_word);
WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
+ // TODO(mrossetti): At this point, as a possible optimization, we could
+ // scan through all candidate words and make sure the |uni_word| is a
+ // substring within the candidate words, eliminating those which aren't.
+ // I'm not sure it would be worth the effort. And remember, we've got to do
+ // a final substring match in order to get the highlighting ranges later
+ // in the process in any case.
+
// If any words resulted then we can compose a set of history IDs by unioning
// the sets from each word.
if (!word_id_set.empty()) {
@@ -395,12 +422,43 @@
// static
InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
const string16& uni_string) {
- String16Set words;
- base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD);
- if (iter.Init()) {
- while (iter.Advance()) {
+ String16Vector words = WordVectorFromString16(uni_string, false);
+ String16Set word_set;
+ for (String16Vector::const_iterator iter = words.begin(); iter != words.end();
+ ++iter)
+ word_set.insert(l10n_util::ToLower(*iter));
+ return word_set;
+}
+
+// static
+InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(
+ const string16& uni_string,
+ bool break_on_space) {
+ // TODO(mrossetti): Come back and update the following code if the
+ // BreakIterator is changed. Here are some comments:
+ // The iterator behaves differently depending on the breaking strategy. Its
+ // unit tests do not properly test this case as its basic word and space tests
+ // always use a test string starting with a space.
+ base::BreakIterator iter(&uni_string, break_on_space ?
+ base::BreakIterator::BREAK_SPACE : base::BreakIterator::BREAK_WORD);
+ String16Vector words;
+ if (break_on_space) {
+ if (iter.Init()) {
+ while (iter.Advance()) {
+ string16 word = iter.GetString();
+ TrimWhitespace(word, TRIM_ALL, &word);
+ if (!word.empty())
+ words.push_back(word);
+ }
+ }
+ } else {
+ if (iter.Init()) {
if (iter.IsWord())
- words.insert(iter.GetString());
+ words.push_back(iter.GetString());
+ while (iter.Advance()) {
+ if (iter.IsWord())
+ words.push_back(iter.GetString());
+ }
}
}
return words;
@@ -546,67 +604,88 @@
}
// static
-int InMemoryURLIndex::RawScoreForURL(const URLRow& row,
- const String16Vector& terms,
- size_t* first_term_location) {
+TermMatches InMemoryURLIndex::MatchTermInString(const string16& term,
+ const string16& string,
+ int term_num) {
+ TermMatches matches;
+ for (size_t location = string.find(term); location != string16::npos;
+ location = string.find(term, location + 1))
+ matches.push_back(TermMatch(term_num, location, term.size()));
+ return matches;
+}
+
+// static
+TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {
+ if (matches.empty())
+ return matches;
+ TermMatches sorted_matches = matches;
+ std::sort(sorted_matches.begin(), sorted_matches.end(),
+ CompareMatchOffset);
+ TermMatches clean_matches;
+ TermMatch last_match = sorted_matches[0];
+ clean_matches.push_back(last_match);
+ for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;
+ iter != sorted_matches.end(); ++iter) {
+ if (iter->offset >= last_match.offset + last_match.length) {
+ last_match = *iter;
+ clean_matches.push_back(last_match);
+ }
+ }
+ return clean_matches;
+}
+
+// static
+ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(
+ const URLRow& row,
+ const String16Vector& terms) {
+ ScoredHistoryMatch match(row);
GURL gurl = row.url();
if (!gurl.is_valid())
- return 0;
+ return match;
- string16 url = UTF8ToUTF16(gurl.spec());
+ // Figure out where each search term appears in the URL and/or page title
+ // so that we can score as well as provide autocomplete highlighting.
+ string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec()));
+ // Strip any 'http://' prefix before matching.
+ if (url_util::FindAndCompareScheme(url, chrome::kHttpScheme, NULL)) {
+ match.prefix_adjust = strlen(chrome::kHttpScheme) + 3; // Allow for '://'.
+ url = url.substr(match.prefix_adjust);
+ }
- // Collect all term start positions so we can see if they appear in order.
- std::vector<size_t> term_locations;
- int out_of_order = 0; // Count the terms which are out of order.
- size_t start_location_total = 0;
- size_t term_length_total = 0;
+ string16 title = l10n_util::ToLower(row.title());
+ int term_num = 0;
for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
- ++iter) {
+ ++iter, ++term_num) {
string16 term = *iter;
- size_t term_location = url.find(term);
- if (term_location == string16::npos)
- return 0; // A term was not found. This should not happen.
- if (iter != terms.begin()) {
- // See if this term is out-of-order.
- for (std::vector<size_t>::const_iterator order_iter =
- term_locations.begin(); order_iter != term_locations.end();
- ++order_iter) {
- if (term_location <= *order_iter)
- ++out_of_order;
- }
- } else {
- *first_term_location = term_location;
- }
- term_locations.push_back(term_location);
- start_location_total += term_location;
- term_length_total += term.size();
+ TermMatches url_term_matches = MatchTermInString(term, url, term_num);
+ TermMatches title_term_matches = MatchTermInString(term, title, term_num);
+ if (url_term_matches.empty() && title_term_matches.empty())
+ return match; // A term was not found in either URL or title - reject.
+ match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),
+ url_term_matches.end());
+ match.title_matches.insert(match.title_matches.end(),
+ title_term_matches.begin(),
+ title_term_matches.end());
}
- // Calculate a raw score.
- // TODO(mrossetti): This is good enough for now but must be fine-tuned.
- const float kOrderMaxValue = 10.0;
- float order_value = 10.0;
- if (terms.size() > 1) {
- int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;
- order_value =
- (static_cast<float>(max_possible_out_of_order - out_of_order) /
- max_possible_out_of_order) * kOrderMaxValue;
- }
+ // Sort matches by offset and eliminate any which overlap.
+ match.url_matches = SortAndDeoverlap(match.url_matches);
+ match.title_matches = SortAndDeoverlap(match.title_matches);
- const float kStartMaxValue = 10.0;
- const size_t kMaxSignificantStart = 20;
- float start_value =
- (static_cast<float>(kMaxSignificantStart -
- std::min(kMaxSignificantStart, start_location_total / terms.size()))) /
- static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
+ // Get partial scores based on term matching. Note that the score for
+ // each of the URL and title are adjusted by the fraction of the
+ // terms appearing in each.
+ int url_score = RawScoreForMatches(match.url_matches, url.size()) *
+ match.url_matches.size() / terms.size();
+ int title_score = RawScoreForMatches(match.title_matches, title.size()) *
+ match.title_matches.size() / terms.size();
+ // Arbitrarily pick the best.
+ // TODO(mrossetti): It might make sense that a term which appears in both the
+ // URL and the Title should boost the score a bit.
+ int term_score = std::max(url_score, title_score);
- const float kCompleteMaxValue = 10.0;
- float complete_value =
- (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *
- kStartMaxValue;
-
- const float kLastVisitMaxValue = 10.0;
- const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
+ // Factor in attributes of the URLRow.
+ // Items which have been visited recently score higher.
int64 delta_time = (kMaxSignificantDay -
std::min((base::Time::Now() - row.last_visit()),
kMaxSignificantDay)).ToInternalValue();
@@ -615,37 +694,61 @@
static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
kLastVisitMaxValue;
- const float kVisitCountMaxValue = 10.0;
- const int kMaxSignificantVisits = 10;
float visit_count_value =
(static_cast<float>(std::min(row.visit_count(),
kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
kVisitCountMaxValue;
- const float kTypedCountMaxValue = 20.0;
- const int kMaxSignificantTyped = 10;
float typed_count_value =
(static_cast<float>(std::min(row.typed_count(),
kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *
kTypedCountMaxValue;
- float raw_score = order_value + start_value + complete_value +
- last_visit_value + visit_count_value + typed_count_value;
+ float raw_score = term_score + last_visit_value + visit_count_value +
+ typed_count_value;
// Normalize the score.
- const float kMaxNormalizedRawScore = 1000.0;
- raw_score =
- (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
- kLastVisitMaxValue + kVisitCountMaxValue +
- kTypedCountMaxValue)) *
- kMaxNormalizedRawScore;
- return static_cast<int>(raw_score);
+ match.raw_score = static_cast<int>((raw_score / kMaxRawScore) *
+ kMaxNormalizedRawScore);
+ return match;
}
-// static
-base::Time InMemoryURLIndex::RecentThreshold() {
- return base::Time::Now() -
- base::TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);
+int InMemoryURLIndex::RawScoreForMatches(const TermMatches& matches,
+ size_t max_length) {
+ // TODO(mrossetti): This is good enough for now but must be fine-tuned.
+ if (matches.empty())
+ return 0;
+
+ // Search terms appearing in order score higher.
+ float order_value = 10.0;
+ if (matches.size() > 1) {
+ int max_possible_out_of_order = matches.size() - 1;
+ int out_of_order = 0;
+ for (size_t i = 1; i < matches.size(); ++i) {
+ if (matches[i - 1].term_num > matches[i].term_num)
+ ++out_of_order;
+ }
+ order_value =
+ (static_cast<float>(max_possible_out_of_order - out_of_order) /
+ max_possible_out_of_order) * kOrderMaxValue;
+ }
+
+ // Search terms which appear earlier score higher.
+ // No points if the first search term does not appear in the first
+ // kMaxSignificantStart chars.
+ float start_value =
+ (static_cast<float>(kMaxSignificantStart -
+ std::min(kMaxSignificantStart, matches[0].offset))) /
+ static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
+
+ // Search terms which comprise a greater portion score higher.
+ size_t term_length_total = std::accumulate(matches.begin(), matches.end(),
+ 0, AccumulateMatchLength);
+ float complete_value =
+ (static_cast<float>(term_length_total) / static_cast<float>(max_length)) *
+ kStartMaxValue;
+
+ return static_cast<int>(order_value + start_value + complete_value);
}
InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
@@ -666,26 +769,17 @@
// deleted by the user or the item no longer qualifies as a quick result.
if (hist_pos != index_.history_info_map_.end()) {
const URLRow& hist_item = hist_pos->second;
- // TODO(mrossetti): Accommodate multiple term highlighting.
- size_t input_location = 0;
- int score = InMemoryURLIndex::RawScoreForURL(
- hist_item, lower_terms_, &input_location);
- if (score != 0) {
+ ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));
+ if (match.raw_score != 0) {
// We only retain the top 10 highest scoring results so
// see if this one fits into the top 10 and, if so, where.
ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
while (scored_iter != scored_matches_.end() &&
- (*scored_iter).raw_score > score)
+ (*scored_iter).raw_score > match.raw_score)
++scored_iter;
if ((scored_matches_.size() < 10) ||
(scored_iter != scored_matches_.end())) {
- // Create and insert the new item.
- // TODO(mrossetti): Properly set |match_in_scheme| and
- // |innermost_match|.
- bool match_in_scheme = false;
- bool innermost_match = true;
- ScoredHistoryMatch match(hist_item, input_location,
- match_in_scheme, innermost_match, score);
+ // Insert the new item.
if (!scored_matches_.empty())
scored_matches_.insert(scored_iter, match);
else
« no previous file with comments | « chrome/browser/history/in_memory_url_index.h ('k') | chrome/browser/history/in_memory_url_index_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698