Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(517)

Side by Side Diff: chrome/browser/history/in_memory_url_index.h

Issue 6652008: Implemented substring matching within page titles. Fixed bug where URL was be... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: '' Created 9 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
7 #pragma once 7 #pragma once
8 8
9 #include <functional> 9 #include <functional>
10 #include <map> 10 #include <map>
11 #include <set> 11 #include <set>
12 #include <string> 12 #include <string>
13 #include <vector> 13 #include <vector>
14 14
15 #include "app/sql/connection.h" 15 #include "app/sql/connection.h"
16 #include "base/basictypes.h" 16 #include "base/basictypes.h"
17 #include "base/file_path.h" 17 #include "base/file_path.h"
18 #include "base/gtest_prod_util.h" 18 #include "base/gtest_prod_util.h"
19 #include "base/linked_ptr.h" 19 #include "base/linked_ptr.h"
20 #include "base/scoped_ptr.h" 20 #include "base/scoped_ptr.h"
21 #include "base/string16.h" 21 #include "base/string16.h"
22 #include "chrome/browser/autocomplete/autocomplete_match.h"
22 #include "chrome/browser/autocomplete/history_provider_util.h" 23 #include "chrome/browser/autocomplete/history_provider_util.h"
23 #include "chrome/browser/history/history_types.h" 24 #include "chrome/browser/history/history_types.h"
24 #include "chrome/browser/history/in_memory_url_index_cache.pb.h" 25 #include "chrome/browser/history/in_memory_url_index_cache.pb.h"
25 #include "testing/gtest/include/gtest/gtest_prod.h" 26 #include "testing/gtest/include/gtest/gtest_prod.h"
26 27
27 class Profile; 28 class Profile;
28 29
29 namespace base { 30 namespace base {
30 class Time; 31 class Time;
31 } 32 }
32 33
33 namespace in_memory_url_index { 34 namespace in_memory_url_index {
34 class InMemoryURLIndexCacheItem; 35 class InMemoryURLIndexCacheItem;
35 } 36 }
36 37
37 namespace history { 38 namespace history {
38 39
39 namespace imui = in_memory_url_index; 40 namespace imui = in_memory_url_index;
40 41
41 class URLDatabase; 42 class URLDatabase;
42 43
44 // Specifies where an omnibox term occurs within a string. Used for specifying
45 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
46 // scoring a result.
47 struct TermMatch {
48 TermMatch(int term_num, size_t offset, size_t length)
49 : term_num(term_num),
50 offset(offset),
51 length(length) {}
52
53 int term_num; // The index of the term in the original search string.
54 size_t offset; // The starting offset of the substring match.
55 size_t length; // The length of the substring match.
56 };
57 typedef std::vector<TermMatch> TermMatches;
58
43 // Used for intermediate history result operations. 59 // Used for intermediate history result operations.
44 struct ScoredHistoryMatch : public HistoryMatch { 60 struct ScoredHistoryMatch : public HistoryMatch {
45 // Required for STL, we don't use this directly. 61 ScoredHistoryMatch(); // Required by STL.
46 ScoredHistoryMatch(); 62 explicit ScoredHistoryMatch(const URLRow& url_info);
47
48 ScoredHistoryMatch(const URLRow& url_info,
49 size_t input_location,
50 bool match_in_scheme,
51 bool innermost_match,
52 int score);
53 63
54 // An interim score taking into consideration location and completeness 64 // An interim score taking into consideration location and completeness
55 // of the match. 65 // of the match.
56 int raw_score; 66 int raw_score;
67 TermMatches url_matches; // Term matches within the URL.
68 TermMatches title_matches; // Term matches within the page title.
69 size_t prefix_adjust; // The length of a prefix which should be ignored.
57 }; 70 };
58 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches; 71 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches;
59 72
60 // The URL history source. 73 // The URL history source.
61 // Holds portions of the URL database in memory in an indexed form. Used to 74 // Holds portions of the URL database in memory in an indexed form. Used to
62 // quickly look up matching URLs for a given query string. Used by 75 // quickly look up matching URLs for a given query string. Used by
63 // the HistoryURLProvider for inline autocomplete and to provide URL 76 // the HistoryURLProvider for inline autocomplete and to provide URL
64 // matches to the omnibox. 77 // matches to the omnibox.
65 // 78 //
66 // Note about multi-byte codepoints and the data structures in the 79 // Note about multi-byte codepoints and the data structures in the
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
107 bool RestoreFromCacheFile(); 120 bool RestoreFromCacheFile();
108 121
109 // Caches the index private data and writes the cache file to the profile 122 // Caches the index private data and writes the cache file to the profile
110 // directory. 123 // directory.
111 bool SaveToCacheFile(); 124 bool SaveToCacheFile();
112 125
113 // Given a vector containing one or more words as string16s, scans the 126 // Given a vector containing one or more words as string16s, scans the
114 // history index and return a vector with all scored, matching history items. 127 // history index and return a vector with all scored, matching history items.
115 // Each term must occur somewhere in the history item for the item to 128 // Each term must occur somewhere in the history item for the item to
116 // qualify; however, the terms do not necessarily have to be adjacent. 129 // qualify; however, the terms do not necessarily have to be adjacent.
117 // Results are sorted with higher scoring items first. 130 // Results are sorted with higher scoring items first. Each term from |terms|
131 // may contain punctuation but should not contain spaces.
118 ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms); 132 ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms);
119 133
120 // Returns the date threshold for considering an history item as significant.
121 static base::Time RecentThreshold();
122
123 // Updates or adds an history item to the index if it meets the minimum 134 // Updates or adds an history item to the index if it meets the minimum
124 // 'quick' criteria. 135 // 'quick' criteria.
125 void UpdateURL(URLID row_id, const URLRow& row); 136 void UpdateURL(URLID row_id, const URLRow& row);
126 137
127 // Deletes indexing data for an history item. The item may not have actually 138 // Deletes indexing data for an history item. The item may not have actually
128 // been indexed (which is the case if it did not previously meet minimum 139 // been indexed (which is the case if it did not previously meet minimum
129 // 'quick' criteria). 140 // 'quick' criteria).
130 void DeleteURL(URLID row_id); 141 void DeleteURL(URLID row_id);
131 142
143 // Breaks the |uni_string| string down into individual words and return
144 // a vector with the individual words in their original order. Break on
145 // whitespace if |break_on_space| also on special characters.
146 static String16Vector WordVectorFromString16(const string16& uni_string,
147 bool break_on_space);
148
132 private: 149 private:
133 friend class AddHistoryMatch; 150 friend class AddHistoryMatch;
134 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Initialization); 151 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);
135 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities);
136 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
137 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath); 152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath);
138 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); 153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities);
155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
156 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, StaticFunctions);
157 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
158 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
139 159
140 // Signals that there are no previously cached results for the typed term. 160 // Signals that there are no previously cached results for the typed term.
141 static const size_t kNoCachedResultForTerm; 161 static const size_t kNoCachedResultForTerm;
142 162
143 // Creating one of me without a history path is not allowed (tests excepted). 163 // Creating one of me without a history path is not allowed (tests excepted).
144 InMemoryURLIndex(); 164 InMemoryURLIndex();
145 165
146 // Convenience types. 166 // Convenience types.
147 typedef std::set<string16> String16Set; 167 typedef std::set<string16> String16Set;
148 typedef std::set<char16> Char16Set; 168 typedef std::set<char16> Char16Set;
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after
217 WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars); 237 WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars);
218 238
219 // Given a vector of Char16s in |uni_chars|, compare those characters, in 239 // Given a vector of Char16s in |uni_chars|, compare those characters, in
220 // order, with the previously searched term, returning the index of the 240 // order, with the previously searched term, returning the index of the
221 // cached results in the term_char_word_set_cache_ of the set with best 241 // cached results in the term_char_word_set_cache_ of the set with best
222 // matching number of characters. Returns kNoCachedResultForTerm if there 242 // matching number of characters. Returns kNoCachedResultForTerm if there
223 // was no match at all (i.e. the first character of |uni-chars| is not the 243 // was no match at all (i.e. the first character of |uni-chars| is not the
224 // same as the character of the first entry in term_char_word_set_cache_). 244 // same as the character of the first entry in term_char_word_set_cache_).
225 size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars); 245 size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars);
226 246
247 // Creates a TermMatches which has an entry for each occurrence of the string
248 // |term| found in the string |string|. Mark each match with |term_num| so
249 // that the resulting TermMatches can be merged with other TermMatches for
250 // other terms.
251 static TermMatches MatchTermInString(const string16& term,
252 const string16& string,
253 int term_num);
254
227 // URL History indexing support functions. 255 // URL History indexing support functions.
228 256
229 // Indexes one URL history item. 257 // Indexes one URL history item.
230 bool IndexRow(const URLRow& row); 258 bool IndexRow(const URLRow& row);
231 259
232 // Breaks a string down into unique, individual characters in the order 260 // Breaks a string down into unique, individual characters in the order
233 // in which the characters are first encountered in the |uni_word| string. 261 // in which the characters are first encountered in the |uni_word| string.
234 static Char16Vector Char16VectorFromString16(const string16& uni_word); 262 static Char16Vector Char16VectorFromString16(const string16& uni_word);
235 263
236 // Breaks the |uni_word| string down into its individual characters. 264 // Breaks the |uni_word| string down into its individual characters.
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
269 HistoryIDSet HistoryIDsForTerm(const string16& uni_word); 297 HistoryIDSet HistoryIDsForTerm(const string16& uni_word);
270 298
271 // Calculates a raw score for this history item by first determining 299 // Calculates a raw score for this history item by first determining
272 // if all of the terms in |terms_vector| occur in |row| and, if so, 300 // if all of the terms in |terms_vector| occur in |row| and, if so,
273 // calculating a raw score based on 1) starting position of each term 301 // calculating a raw score based on 1) starting position of each term
274 // in the user input, 2) completeness of each term's match, 3) ordering 302 // in the user input, 2) completeness of each term's match, 3) ordering
275 // of the occurrence of each term (i.e. they appear in order), 4) last 303 // of the occurrence of each term (i.e. they appear in order), 4) last
276 // visit time, and 5) number of visits. 304 // visit time, and 5) number of visits.
277 // This raw score allows the results to be ordered and can be used 305 // This raw score allows the results to be ordered and can be used
278 // to influence the final score calculated by the client of this 306 // to influence the final score calculated by the client of this
279 // index. Return the starting location of the first term in 307 // index. Returns a ScoredHistoryMatch structure with the raw score and
280 // |first_term_location|. 308 // substring matching metrics.
281 static int RawScoreForURL(const URLRow& row, 309 static ScoredHistoryMatch ScoredMatchForURL(
282 const String16Vector& terms_vector, 310 const URLRow& row,
283 size_t* first_term_location); 311 const String16Vector& terms_vector);
312
313 // Calculates a partial raw score based on position, ordering and total
314 // substring match size using metrics recorded in |matches|. |max_length|
315 // is the length of the string against which the terms are being searched.
316 static int RawScoreForMatches(const TermMatches& matches,
317 size_t max_length);
318
319 // Sorts and removes overlapping substring matches from |matches| and
320 // returns the cleaned up matches.
321 static TermMatches SortAndDeoverlap(const TermMatches& matches);
284 322
285 // Utility functions supporting RestoreFromCache and SaveToCache. 323 // Utility functions supporting RestoreFromCache and SaveToCache.
286 324
287 // Construct a file path for the cache file within the same directory where 325 // Construct a file path for the cache file within the same directory where
288 // the history database is kept and saves that path to |file_path|. Returns 326 // the history database is kept and saves that path to |file_path|. Returns
289 // true if |file_path| can be successfully constructed. (This function 327 // true if |file_path| can be successfully constructed. (This function
290 // provided as a hook for unit testing.) 328 // provided as a hook for unit testing.)
291 bool GetCacheFilePath(FilePath* file_path); 329 bool GetCacheFilePath(FilePath* file_path);
292 330
293 // Encode a data structure into the protobuf |cache|. 331 // Encode a data structure into the protobuf |cache|.
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
335 TermCharWordSetVector term_char_word_set_cache_; 373 TermCharWordSetVector term_char_word_set_cache_;
336 HistoryInfoMap history_info_map_; 374 HistoryInfoMap history_info_map_;
337 std::string languages_; 375 std::string languages_;
338 376
339 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); 377 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex);
340 }; 378 };
341 379
342 } // namespace history 380 } // namespace history
343 381
344 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ 382 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
OLDNEW
« no previous file with comments | « chrome/browser/autocomplete/history_quick_provider_unittest.cc ('k') | chrome/browser/history/in_memory_url_index.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698