Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(382)

Side by Side Diff: chrome/browser/history/in_memory_url_index.h

Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: '' Created 9 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
7 #pragma once 7 #pragma once
8 8
9 #include <functional> 9 #include <functional>
10 #include <map> 10 #include <map>
11 #include <set> 11 #include <set>
12 #include <string> 12 #include <string>
13 #include <vector> 13 #include <vector>
14 14
15 #include "app/sql/connection.h" 15 #include "app/sql/connection.h"
16 #include "base/basictypes.h" 16 #include "base/basictypes.h"
17 #include "base/file_path.h" 17 #include "base/file_path.h"
18 #include "base/gtest_prod_util.h" 18 #include "base/gtest_prod_util.h"
19 #include "base/linked_ptr.h" 19 #include "base/linked_ptr.h"
20 #include "base/scoped_ptr.h" 20 #include "base/scoped_ptr.h"
21 #include "base/string16.h" 21 #include "base/string16.h"
22 #include "chrome/browser/autocomplete/autocomplete_match.h"
22 #include "chrome/browser/autocomplete/history_provider_util.h" 23 #include "chrome/browser/autocomplete/history_provider_util.h"
23 #include "chrome/browser/history/history_types.h" 24 #include "chrome/browser/history/history_types.h"
24 #include "chrome/browser/history/in_memory_url_index_cache.pb.h" 25 #include "chrome/browser/history/in_memory_url_index_cache.pb.h"
25 #include "testing/gtest/include/gtest/gtest_prod.h" 26 #include "testing/gtest/include/gtest/gtest_prod.h"
26 27
27 class Profile; 28 class Profile;
28 29
29 namespace base { 30 namespace base {
30 class Time; 31 class Time;
31 } 32 }
32 33
33 namespace in_memory_url_index { 34 namespace in_memory_url_index {
34 class InMemoryURLIndexCacheItem; 35 class InMemoryURLIndexCacheItem;
35 } 36 }
36 37
37 namespace history { 38 namespace history {
38 39
39 namespace imui = in_memory_url_index; 40 namespace imui = in_memory_url_index;
40 41
41 class URLDatabase; 42 class URLDatabase;
42 43
44 // Specifies where an omnibox term occurs within a string. Used for specifying
45 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
46 // scoring a result.
47 struct TermMatch {
48 TermMatch(int term_num, size_t offset, size_t length)
49 : term_num(term_num), offset(offset), length(length) {}
Peter Kasting 2011/03/09 02:31:48 Nit: One initializer per line when they don't all
mrossetti 2011/03/10 01:31:14 Done.
50
51 int term_num; // The index of the term in the original search string.
52 size_t offset; // The starting offset of the substring match.
53 size_t length; // The length of the substring match.
54 };
55 typedef std::vector<TermMatch> TermMatches;
56
43 // Used for intermediate history result operations. 57 // Used for intermediate history result operations.
44 struct ScoredHistoryMatch : public HistoryMatch { 58 struct ScoredHistoryMatch : public HistoryMatch {
45 // Required for STL, we don't use this directly. 59 ScoredHistoryMatch(); // Required by STL.
46 ScoredHistoryMatch(); 60 explicit ScoredHistoryMatch(const URLRow& url_info);
47
48 ScoredHistoryMatch(const URLRow& url_info,
49 size_t input_location,
50 bool match_in_scheme,
51 bool innermost_match,
52 int score);
53 61
54 // An interim score taking into consideration location and completeness 62 // An interim score taking into consideration location and completeness
55 // of the match. 63 // of the match.
56 int raw_score; 64 int raw_score;
65 TermMatches url_matches; // Term matches within the URL.
66 TermMatches title_matches; // Term matches within the page title.
57 }; 67 };
58 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches; 68 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches;
59 69
60 // The URL history source. 70 // The URL history source.
61 // Holds portions of the URL database in memory in an indexed form. Used to 71 // Holds portions of the URL database in memory in an indexed form. Used to
62 // quickly look up matching URLs for a given query string. Used by 72 // quickly look up matching URLs for a given query string. Used by
63 // the HistoryURLProvider for inline autocomplete and to provide URL 73 // the HistoryURLProvider for inline autocomplete and to provide URL
64 // matches to the omnibox. 74 // matches to the omnibox.
65 // 75 //
66 // Note about multi-byte codepoints and the data structures in the 76 // Note about multi-byte codepoints and the data structures in the
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
107 bool RestoreFromCacheFile(); 117 bool RestoreFromCacheFile();
108 118
109 // Caches the index private data and writes the cache file to the profile 119 // Caches the index private data and writes the cache file to the profile
110 // directory. 120 // directory.
111 bool SaveToCacheFile(); 121 bool SaveToCacheFile();
112 122
113 // Given a vector containing one or more words as string16s, scans the 123 // Given a vector containing one or more words as string16s, scans the
114 // history index and return a vector with all scored, matching history items. 124 // history index and return a vector with all scored, matching history items.
115 // Each term must occur somewhere in the history item for the item to 125 // Each term must occur somewhere in the history item for the item to
116 // qualify; however, the terms do not necessarily have to be adjacent. 126 // qualify; however, the terms do not necessarily have to be adjacent.
117 // Results are sorted with higher scoring items first. 127 // Results are sorted with higher scoring items first. Each term from |terms|
128 // may contain punctuation but should not contain spaces.
118 ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms); 129 ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms);
119 130
120 // Returns the date threshold for considering an history item as significant.
121 static base::Time RecentThreshold();
122
123 // Updates or adds an history item to the index if it meets the minimum 131 // Updates or adds an history item to the index if it meets the minimum
124 // 'quick' criteria. 132 // 'quick' criteria.
125 void UpdateURL(URLID row_id, const URLRow& row); 133 void UpdateURL(URLID row_id, const URLRow& row);
126 134
127 // Deletes indexing data for an history item. The item may not have actually 135 // Deletes indexing data for an history item. The item may not have actually
128 // been indexed (which is the case if it did not previously meet minimum 136 // been indexed (which is the case if it did not previously meet minimum
129 // 'quick' criteria). 137 // 'quick' criteria).
130 void DeleteURL(URLID row_id); 138 void DeleteURL(URLID row_id);
131 139
140 // Breaks the |uni_string| string down into individual words and return
141 // a vector with the individual words in their original order. Break on
142 // whitespace if |break_on_space| also on special characters.
143 static String16Vector WordVectorFromString16(const string16& uni_string,
144 bool break_on_space);
145
132 private: 146 private:
133 friend class AddHistoryMatch; 147 friend class AddHistoryMatch;
134 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Initialization); 148 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);
135 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities);
136 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
137 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath); 149 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath);
138 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); 150 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
151 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities);
152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, StaticFunctions);
154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
139 156
140 // Signals that there are no previously cached results for the typed term. 157 // Signals that there are no previously cached results for the typed term.
141 static const size_t kNoCachedResultForTerm; 158 static const size_t kNoCachedResultForTerm;
142 159
143 // Creating one of me without a history path is not allowed (tests excepted). 160 // Creating one of me without a history path is not allowed (tests excepted).
144 InMemoryURLIndex(); 161 InMemoryURLIndex();
145 162
146 // Convenience types. 163 // Convenience types.
147 typedef std::set<string16> String16Set; 164 typedef std::set<string16> String16Set;
148 typedef std::set<char16> Char16Set; 165 typedef std::set<char16> Char16Set;
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
203 const String16Vector& lower_terms_; 220 const String16Vector& lower_terms_;
204 }; 221 };
205 222
206 // Initializes all index data members in preparation for restoring the index 223 // Initializes all index data members in preparation for restoring the index
207 // from the cache or a complete rebuild from the history database. 224 // from the cache or a complete rebuild from the history database.
208 void ClearPrivateData(); 225 void ClearPrivateData();
209 226
210 // Breaks a string down into individual words. 227 // Breaks a string down into individual words.
211 static String16Set WordSetFromString16(const string16& uni_string); 228 static String16Set WordSetFromString16(const string16& uni_string);
212 229
230 // Converts |upper_string| to lower case and returns the new string.
231 static string16 ToLower(const string16& upper_string);
232
213 // Given a vector of Char16s, representing the characters the user has typed 233 // Given a vector of Char16s, representing the characters the user has typed
214 // into the omnibox, finds words containing those characters. If any 234 // into the omnibox, finds words containing those characters. If any
215 // existing, cached set is a proper subset then starts with that cached 235 // existing, cached set is a proper subset then starts with that cached
216 // set. Updates the previously-typed-character cache. 236 // set. Updates the previously-typed-character cache.
217 WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars); 237 WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars);
218 238
219 // Given a vector of Char16s in |uni_chars|, compare those characters, in 239 // Given a vector of Char16s in |uni_chars|, compare those characters, in
220 // order, with the previously searched term, returning the index of the 240 // order, with the previously searched term, returning the index of the
221 // cached results in the term_char_word_set_cache_ of the set with best 241 // cached results in the term_char_word_set_cache_ of the set with best
222 // matching number of characters. Returns kNoCachedResultForTerm if there 242 // matching number of characters. Returns kNoCachedResultForTerm if there
223 // was no match at all (i.e. the first character of |uni-chars| is not the 243 // was no match at all (i.e. the first character of |uni-chars| is not the
224 // same as the character of the first entry in term_char_word_set_cache_). 244 // same as the character of the first entry in term_char_word_set_cache_).
225 size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars); 245 size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars);
226 246
247 // Creates a TermMatches which has an entry for each occurrence of the string
248 // |term| found in the string |string|. Mark each match with |term_num| so
249 // that the resulting TermMatches can be merged with other TermMatches for
250 // other terms.
251 static TermMatches MatchTermInString(const string16& term,
252 const string16& string,
253 int term_num);
254
227 // URL History indexing support functions. 255 // URL History indexing support functions.
228 256
229 // Indexes one URL history item. 257 // Indexes one URL history item.
230 bool IndexRow(const URLRow& row); 258 bool IndexRow(const URLRow& row);
231 259
232 // Breaks a string down into unique, individual characters in the order 260 // Breaks a string down into unique, individual characters in the order
233 // in which the characters are first encountered in the |uni_word| string. 261 // in which the characters are first encountered in the |uni_word| string.
234 static Char16Vector Char16VectorFromString16(const string16& uni_word); 262 static Char16Vector Char16VectorFromString16(const string16& uni_word);
235 263
236 // Breaks the |uni_word| string down into its individual characters. 264 // Breaks the |uni_word| string down into its individual characters.
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
269 HistoryIDSet HistoryIDsForTerm(const string16& uni_word); 297 HistoryIDSet HistoryIDsForTerm(const string16& uni_word);
270 298
271 // Calculates a raw score for this history item by first determining 299 // Calculates a raw score for this history item by first determining
272 // if all of the terms in |terms_vector| occur in |row| and, if so, 300 // if all of the terms in |terms_vector| occur in |row| and, if so,
273 // calculating a raw score based on 1) starting position of each term 301 // calculating a raw score based on 1) starting position of each term
274 // in the user input, 2) completeness of each term's match, 3) ordering 302 // in the user input, 2) completeness of each term's match, 3) ordering
275 // of the occurrence of each term (i.e. they appear in order), 4) last 303 // of the occurrence of each term (i.e. they appear in order), 4) last
276 // visit time, and 5) number of visits. 304 // visit time, and 5) number of visits.
277 // This raw score allows the results to be ordered and can be used 305 // This raw score allows the results to be ordered and can be used
278 // to influence the final score calculated by the client of this 306 // to influence the final score calculated by the client of this
279 // index. Return the starting location of the first term in 307 // index. Returns a ScoredHistoryMatch structure with the raw score and
280 // |first_term_location|. 308 // substring matching metrics.
281 static int RawScoreForURL(const URLRow& row, 309 static ScoredHistoryMatch ScoredMatchForURL(
282 const String16Vector& terms_vector, 310 const URLRow& row,
283 size_t* first_term_location); 311 const String16Vector& terms_vector);
312
313 // Calculates a partial raw score based on position, ordering and total
314 // substring match size using metrics recorded in |matches|. |max_length|
315 // is the length of the string against which the terms are being searched.
316 static int RawScoreForMatches(const TermMatches& matches,
317 size_t max_length);
318
319 // Sorts and removes overlapping substring matches from |matches| and
320 // returns the cleaned up matches.
321 static TermMatches SortAndDeoverlap(const TermMatches& matches);
284 322
285 // Utility functions supporting RestoreFromCache and SaveToCache. 323 // Utility functions supporting RestoreFromCache and SaveToCache.
286 324
287 // Construct a file path for the cache file within the same directory where 325 // Construct a file path for the cache file within the same directory where
288 // the history database is kept and saves that path to |file_path|. Returns 326 // the history database is kept and saves that path to |file_path|. Returns
289 // true if |file_path| can be successfully constructed. (This function 327 // true if |file_path| can be successfully constructed. (This function
290 // provided as a hook for unit testing.) 328 // provided as a hook for unit testing.)
291 bool GetCacheFilePath(FilePath* file_path); 329 bool GetCacheFilePath(FilePath* file_path);
292 330
293 // Encode a data structure into the protobuf |cache|. 331 // Encode a data structure into the protobuf |cache|.
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
335 TermCharWordSetVector term_char_word_set_cache_; 373 TermCharWordSetVector term_char_word_set_cache_;
336 HistoryInfoMap history_info_map_; 374 HistoryInfoMap history_info_map_;
337 std::string languages_; 375 std::string languages_;
338 376
339 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); 377 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex);
340 }; 378 };
341 379
342 } // namespace history 380 } // namespace history
343 381
344 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 382 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698