Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
| 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
| 7 #pragma once | 7 #pragma once |
| 8 | 8 |
| 9 #include <functional> | 9 #include <functional> |
| 10 #include <map> | 10 #include <map> |
| 11 #include <set> | 11 #include <set> |
| 12 #include <string> | 12 #include <string> |
| 13 #include <vector> | 13 #include <vector> |
| 14 | 14 |
| 15 #include "app/sql/connection.h" | 15 #include "app/sql/connection.h" |
| 16 #include "base/basictypes.h" | 16 #include "base/basictypes.h" |
| 17 #include "base/file_path.h" | 17 #include "base/file_path.h" |
| 18 #include "base/gtest_prod_util.h" | 18 #include "base/gtest_prod_util.h" |
| 19 #include "base/linked_ptr.h" | 19 #include "base/linked_ptr.h" |
| 20 #include "base/scoped_ptr.h" | 20 #include "base/scoped_ptr.h" |
| 21 #include "base/string16.h" | 21 #include "base/string16.h" |
| 22 #include "chrome/browser/autocomplete/autocomplete_match.h" | |
| 22 #include "chrome/browser/autocomplete/history_provider_util.h" | 23 #include "chrome/browser/autocomplete/history_provider_util.h" |
| 23 #include "chrome/browser/history/history_types.h" | 24 #include "chrome/browser/history/history_types.h" |
| 24 #include "chrome/browser/history/in_memory_url_index_cache.pb.h" | 25 #include "chrome/browser/history/in_memory_url_index_cache.pb.h" |
| 25 #include "testing/gtest/include/gtest/gtest_prod.h" | 26 #include "testing/gtest/include/gtest/gtest_prod.h" |
| 26 | 27 |
| 27 class Profile; | 28 class Profile; |
| 28 | 29 |
| 29 namespace base { | 30 namespace base { |
| 30 class Time; | 31 class Time; |
| 31 } | 32 } |
| 32 | 33 |
| 33 namespace in_memory_url_index { | 34 namespace in_memory_url_index { |
| 34 class InMemoryURLIndexCacheItem; | 35 class InMemoryURLIndexCacheItem; |
| 35 } | 36 } |
| 36 | 37 |
| 37 namespace history { | 38 namespace history { |
| 38 | 39 |
| 39 namespace imui = in_memory_url_index; | 40 namespace imui = in_memory_url_index; |
| 40 | 41 |
| 41 class URLDatabase; | 42 class URLDatabase; |
| 42 | 43 |
| 44 // Specifies where an omnibox term occurs within a string. Used for specifying | |
| 45 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in | |
| 46 // scoring a result. | |
| 47 struct TermMatch { | |
| 48 TermMatch(int term_num, size_t offset, size_t length) | |
| 49 : term_num(term_num), offset(offset), length(length) {} | |
|
Peter Kasting
2011/03/09 02:31:48
Nit: One initializer per line when they don't all
mrossetti
2011/03/10 01:31:14
Done.
| |
| 50 | |
| 51 int term_num; // The index of the term in the original search string. | |
| 52 size_t offset; // The starting offset of the substring match. | |
| 53 size_t length; // The length of the substring match. | |
| 54 }; | |
| 55 typedef std::vector<TermMatch> TermMatches; | |
| 56 | |
| 43 // Used for intermediate history result operations. | 57 // Used for intermediate history result operations. |
| 44 struct ScoredHistoryMatch : public HistoryMatch { | 58 struct ScoredHistoryMatch : public HistoryMatch { |
| 45 // Required for STL, we don't use this directly. | 59 ScoredHistoryMatch(); // Required by STL. |
| 46 ScoredHistoryMatch(); | 60 explicit ScoredHistoryMatch(const URLRow& url_info); |
| 47 | |
| 48 ScoredHistoryMatch(const URLRow& url_info, | |
| 49 size_t input_location, | |
| 50 bool match_in_scheme, | |
| 51 bool innermost_match, | |
| 52 int score); | |
| 53 | 61 |
| 54 // An interim score taking into consideration location and completeness | 62 // An interim score taking into consideration location and completeness |
| 55 // of the match. | 63 // of the match. |
| 56 int raw_score; | 64 int raw_score; |
| 65 TermMatches url_matches; // Term matches within the URL. | |
| 66 TermMatches title_matches; // Term matches within the page title. | |
| 57 }; | 67 }; |
| 58 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches; | 68 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches; |
| 59 | 69 |
| 60 // The URL history source. | 70 // The URL history source. |
| 61 // Holds portions of the URL database in memory in an indexed form. Used to | 71 // Holds portions of the URL database in memory in an indexed form. Used to |
| 62 // quickly look up matching URLs for a given query string. Used by | 72 // quickly look up matching URLs for a given query string. Used by |
| 63 // the HistoryURLProvider for inline autocomplete and to provide URL | 73 // the HistoryURLProvider for inline autocomplete and to provide URL |
| 64 // matches to the omnibox. | 74 // matches to the omnibox. |
| 65 // | 75 // |
| 66 // Note about multi-byte codepoints and the data structures in the | 76 // Note about multi-byte codepoints and the data structures in the |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 107 bool RestoreFromCacheFile(); | 117 bool RestoreFromCacheFile(); |
| 108 | 118 |
| 109 // Caches the index private data and writes the cache file to the profile | 119 // Caches the index private data and writes the cache file to the profile |
| 110 // directory. | 120 // directory. |
| 111 bool SaveToCacheFile(); | 121 bool SaveToCacheFile(); |
| 112 | 122 |
| 113 // Given a vector containing one or more words as string16s, scans the | 123 // Given a vector containing one or more words as string16s, scans the |
| 114 // history index and return a vector with all scored, matching history items. | 124 // history index and return a vector with all scored, matching history items. |
| 115 // Each term must occur somewhere in the history item for the item to | 125 // Each term must occur somewhere in the history item for the item to |
| 116 // qualify; however, the terms do not necessarily have to be adjacent. | 126 // qualify; however, the terms do not necessarily have to be adjacent. |
| 117 // Results are sorted with higher scoring items first. | 127 // Results are sorted with higher scoring items first. Each term from |terms| |
| 128 // may contain punctuation but should not contain spaces. | |
| 118 ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms); | 129 ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms); |
| 119 | 130 |
| 120 // Returns the date threshold for considering an history item as significant. | |
| 121 static base::Time RecentThreshold(); | |
| 122 | |
| 123 // Updates or adds an history item to the index if it meets the minimum | 131 // Updates or adds an history item to the index if it meets the minimum |
| 124 // 'quick' criteria. | 132 // 'quick' criteria. |
| 125 void UpdateURL(URLID row_id, const URLRow& row); | 133 void UpdateURL(URLID row_id, const URLRow& row); |
| 126 | 134 |
| 127 // Deletes indexing data for an history item. The item may not have actually | 135 // Deletes indexing data for an history item. The item may not have actually |
| 128 // been indexed (which is the case if it did not previously meet minimum | 136 // been indexed (which is the case if it did not previously meet minimum |
| 129 // 'quick' criteria). | 137 // 'quick' criteria). |
| 130 void DeleteURL(URLID row_id); | 138 void DeleteURL(URLID row_id); |
| 131 | 139 |
| 140 // Breaks the |uni_string| string down into individual words and return | |
| 141 // a vector with the individual words in their original order. Break on | |
| 142 // whitespace if |break_on_space| also on special characters. | |
| 143 static String16Vector WordVectorFromString16(const string16& uni_string, | |
| 144 bool break_on_space); | |
| 145 | |
| 132 private: | 146 private: |
| 133 friend class AddHistoryMatch; | 147 friend class AddHistoryMatch; |
| 134 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Initialization); | 148 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization); |
| 135 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities); | |
| 136 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching); | |
| 137 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath); | 149 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath); |
| 138 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); | 150 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); |
| 151 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities); | |
| 152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring); | |
| 153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, StaticFunctions); | |
| 154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch); | |
| 155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching); | |
| 139 | 156 |
| 140 // Signals that there are no previously cached results for the typed term. | 157 // Signals that there are no previously cached results for the typed term. |
| 141 static const size_t kNoCachedResultForTerm; | 158 static const size_t kNoCachedResultForTerm; |
| 142 | 159 |
| 143 // Creating one of me without a history path is not allowed (tests excepted). | 160 // Creating one of me without a history path is not allowed (tests excepted). |
| 144 InMemoryURLIndex(); | 161 InMemoryURLIndex(); |
| 145 | 162 |
| 146 // Convenience types. | 163 // Convenience types. |
| 147 typedef std::set<string16> String16Set; | 164 typedef std::set<string16> String16Set; |
| 148 typedef std::set<char16> Char16Set; | 165 typedef std::set<char16> Char16Set; |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 203 const String16Vector& lower_terms_; | 220 const String16Vector& lower_terms_; |
| 204 }; | 221 }; |
| 205 | 222 |
| 206 // Initializes all index data members in preparation for restoring the index | 223 // Initializes all index data members in preparation for restoring the index |
| 207 // from the cache or a complete rebuild from the history database. | 224 // from the cache or a complete rebuild from the history database. |
| 208 void ClearPrivateData(); | 225 void ClearPrivateData(); |
| 209 | 226 |
| 210 // Breaks a string down into individual words. | 227 // Breaks a string down into individual words. |
| 211 static String16Set WordSetFromString16(const string16& uni_string); | 228 static String16Set WordSetFromString16(const string16& uni_string); |
| 212 | 229 |
| 230 // Converts |upper_string| to lower case and returns the new string. | |
| 231 static string16 ToLower(const string16& upper_string); | |
| 232 | |
| 213 // Given a vector of Char16s, representing the characters the user has typed | 233 // Given a vector of Char16s, representing the characters the user has typed |
| 214 // into the omnibox, finds words containing those characters. If any | 234 // into the omnibox, finds words containing those characters. If any |
| 215 // existing, cached set is a proper subset then starts with that cached | 235 // existing, cached set is a proper subset then starts with that cached |
| 216 // set. Updates the previously-typed-character cache. | 236 // set. Updates the previously-typed-character cache. |
| 217 WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars); | 237 WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars); |
| 218 | 238 |
| 219 // Given a vector of Char16s in |uni_chars|, compare those characters, in | 239 // Given a vector of Char16s in |uni_chars|, compare those characters, in |
| 220 // order, with the previously searched term, returning the index of the | 240 // order, with the previously searched term, returning the index of the |
| 221 // cached results in the term_char_word_set_cache_ of the set with best | 241 // cached results in the term_char_word_set_cache_ of the set with best |
| 222 // matching number of characters. Returns kNoCachedResultForTerm if there | 242 // matching number of characters. Returns kNoCachedResultForTerm if there |
| 223 // was no match at all (i.e. the first character of |uni-chars| is not the | 243 // was no match at all (i.e. the first character of |uni-chars| is not the |
| 224 // same as the character of the first entry in term_char_word_set_cache_). | 244 // same as the character of the first entry in term_char_word_set_cache_). |
| 225 size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars); | 245 size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars); |
| 226 | 246 |
| 247 // Creates a TermMatches which has an entry for each occurrence of the string | |
| 248 // |term| found in the string |string|. Mark each match with |term_num| so | |
| 249 // that the resulting TermMatches can be merged with other TermMatches for | |
| 250 // other terms. | |
| 251 static TermMatches MatchTermInString(const string16& term, | |
| 252 const string16& string, | |
| 253 int term_num); | |
| 254 | |
| 227 // URL History indexing support functions. | 255 // URL History indexing support functions. |
| 228 | 256 |
| 229 // Indexes one URL history item. | 257 // Indexes one URL history item. |
| 230 bool IndexRow(const URLRow& row); | 258 bool IndexRow(const URLRow& row); |
| 231 | 259 |
| 232 // Breaks a string down into unique, individual characters in the order | 260 // Breaks a string down into unique, individual characters in the order |
| 233 // in which the characters are first encountered in the |uni_word| string. | 261 // in which the characters are first encountered in the |uni_word| string. |
| 234 static Char16Vector Char16VectorFromString16(const string16& uni_word); | 262 static Char16Vector Char16VectorFromString16(const string16& uni_word); |
| 235 | 263 |
| 236 // Breaks the |uni_word| string down into its individual characters. | 264 // Breaks the |uni_word| string down into its individual characters. |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 269 HistoryIDSet HistoryIDsForTerm(const string16& uni_word); | 297 HistoryIDSet HistoryIDsForTerm(const string16& uni_word); |
| 270 | 298 |
| 271 // Calculates a raw score for this history item by first determining | 299 // Calculates a raw score for this history item by first determining |
| 272 // if all of the terms in |terms_vector| occur in |row| and, if so, | 300 // if all of the terms in |terms_vector| occur in |row| and, if so, |
| 273 // calculating a raw score based on 1) starting position of each term | 301 // calculating a raw score based on 1) starting position of each term |
| 274 // in the user input, 2) completeness of each term's match, 3) ordering | 302 // in the user input, 2) completeness of each term's match, 3) ordering |
| 275 // of the occurrence of each term (i.e. they appear in order), 4) last | 303 // of the occurrence of each term (i.e. they appear in order), 4) last |
| 276 // visit time, and 5) number of visits. | 304 // visit time, and 5) number of visits. |
| 277 // This raw score allows the results to be ordered and can be used | 305 // This raw score allows the results to be ordered and can be used |
| 278 // to influence the final score calculated by the client of this | 306 // to influence the final score calculated by the client of this |
| 279 // index. Return the starting location of the first term in | 307 // index. Returns a ScoredHistoryMatch structure with the raw score and |
| 280 // |first_term_location|. | 308 // substring matching metrics. |
| 281 static int RawScoreForURL(const URLRow& row, | 309 static ScoredHistoryMatch ScoredMatchForURL( |
| 282 const String16Vector& terms_vector, | 310 const URLRow& row, |
| 283 size_t* first_term_location); | 311 const String16Vector& terms_vector); |
| 312 | |
| 313 // Calculates a partial raw score based on position, ordering and total | |
| 314 // substring match size using metrics recorded in |matches|. |max_length| | |
| 315 // is the length of the string against which the terms are being searched. | |
| 316 static int RawScoreForMatches(const TermMatches& matches, | |
| 317 size_t max_length); | |
| 318 | |
| 319 // Sorts and removes overlapping substring matches from |matches| and | |
| 320 // returns the cleaned up matches. | |
| 321 static TermMatches SortAndDeoverlap(const TermMatches& matches); | |
| 284 | 322 |
| 285 // Utility functions supporting RestoreFromCache and SaveToCache. | 323 // Utility functions supporting RestoreFromCache and SaveToCache. |
| 286 | 324 |
| 287 // Construct a file path for the cache file within the same directory where | 325 // Construct a file path for the cache file within the same directory where |
| 288 // the history database is kept and saves that path to |file_path|. Returns | 326 // the history database is kept and saves that path to |file_path|. Returns |
| 289 // true if |file_path| can be successfully constructed. (This function | 327 // true if |file_path| can be successfully constructed. (This function |
| 290 // provided as a hook for unit testing.) | 328 // provided as a hook for unit testing.) |
| 291 bool GetCacheFilePath(FilePath* file_path); | 329 bool GetCacheFilePath(FilePath* file_path); |
| 292 | 330 |
| 293 // Encode a data structure into the protobuf |cache|. | 331 // Encode a data structure into the protobuf |cache|. |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 335 TermCharWordSetVector term_char_word_set_cache_; | 373 TermCharWordSetVector term_char_word_set_cache_; |
| 336 HistoryInfoMap history_info_map_; | 374 HistoryInfoMap history_info_map_; |
| 337 std::string languages_; | 375 std::string languages_; |
| 338 | 376 |
| 339 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); | 377 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); |
| 340 }; | 378 }; |
| 341 | 379 |
| 342 } // namespace history | 380 } // namespace history |
| 343 | 381 |
| 344 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 382 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
| OLD | NEW |