OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
7 #pragma once | 7 #pragma once |
8 | 8 |
9 #include <map> | |
10 #include <set> | |
11 #include <vector> | |
12 | |
13 #include "app/sql/connection.h" | |
14 #include "base/basictypes.h" | |
15 #include "base/linked_ptr.h" | |
16 #include "base/scoped_ptr.h" | |
17 #include "base/string16.h" | |
18 #include "chrome/browser/history/history_types.h" | |
19 #include "testing/gtest/include/gtest/gtest_prod.h" | |
20 | |
21 namespace base { | |
22 class Time; | |
23 } | |
24 | |
25 namespace history { | 9 namespace history { |
26 | 10 |
27 class URLDatabase; | 11 class URLDatabase; |
28 | 12 |
29 // The URL history source. | 13 // The URL history source. |
30 // Holds portions of the URL database in memory in an indexed form. Used to | 14 // Holds portions of the URL database in memory in an indexed form. Used to |
31 // quickly look up matching URLs for a given query string. Used by | 15 // quickly look up matching URLs for a given query string. Used by |
32 // the HistoryURLProvider for inline autocomplete and to provide URL | 16 // the HistoryURLProvider for inline autocomplete and to provide URL |
33 // matches to the omnibox. | 17 // matches to the omnibox. |
34 // | |
35 // Note about multi-byte codepoints and the data structures in the | |
36 // InMemoryURLIndex class: One will quickly notice that no effort is made to | |
37 // insure that multi-byte character boundaries are detected when indexing the | |
38 // words and characters in the URL history database except when converting | |
39 // URL strings to lowercase. Multi-byte-edness makes no difference when | |
40 // indexing or when searching the index as the final filtering of results | |
41 // is dependent on the comparison of a string of bytes, not individual | |
42 // characters. While the lookup of those bytes during a search in the | |
43 // |char_word_map_| could serve up words in which the individual char16 | |
44 // occurs as a portion of a composite character the next filtering step | |
45 // will eliminate such words except in the case where a single character | |
46 // is being searched on and which character occurs as the second char16 of a | |
47 // multi-char16 instance. | |
48 class InMemoryURLIndex { | 18 class InMemoryURLIndex { |
49 public: | 19 public: |
50 InMemoryURLIndex(); | 20 InMemoryURLIndex() {} |
51 ~InMemoryURLIndex(); | 21 ~InMemoryURLIndex() {} |
52 | |
53 // Convenience types | |
54 typedef std::vector<string16> String16Vector; | |
55 | 22 |
56 // Open and index the URL history database. | 23 // Open and index the URL history database. |
57 bool Init(URLDatabase* history_db, const string16& languages); | 24 bool Init(URLDatabase* history_db); |
58 | |
59 // Reset the history index. | |
60 void Reset(); | |
61 | |
62 // Given a vector containing one or more words as string16s, scan the | |
63 // history index and return a vector with all scored, matching history items. | |
64 // Each term must occur somewhere in the history item for the item to | |
65 // qualify; however, the terms do not necessarily have to be adjacent. | |
66 HistoryMatches HistoryItemsForTerms(const String16Vector& terms); | |
67 | |
68 // Returns the date threshold for considering an history item as significant. | |
69 static base::Time RecentThreshold(); | |
70 | |
71 private: | |
72 friend class InMemoryURLIndexTest; | |
73 FRIEND_TEST(InMemoryURLIndexTest, Initialization); | |
74 | |
75 // Convenience types | |
76 typedef std::set<string16> String16Set; | |
77 typedef std::set<char16> Char16Set; | |
78 | |
79 // An index into list of all of the words we have indexed. | |
80 typedef int16 WordID; | |
81 | |
82 // A map allowing a WordID to be determined given a word. | |
83 typedef std::map<string16, WordID> WordMap; | |
84 | |
85 // A map from character to word_ids. | |
86 typedef std::set<WordID> WordIDSet; // An index into the WordList. | |
87 typedef std::map<char16, WordIDSet> CharWordIDMap; | |
88 | |
89 // A map from word_id to history item. | |
90 // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit. | |
91 // Consider using a smaller type. | |
92 typedef URLID HistoryID; | |
93 typedef std::set<HistoryID> HistoryIDSet; | |
94 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; | |
95 | |
96 // Support caching of term character intersections so that we can optimize | |
97 // searches which build upon a previous search. | |
98 struct TermCharWordSet { | |
99 TermCharWordSet(Char16Set char_set, WordIDSet word_id_set, bool used) | |
100 : char_set_(char_set), | |
101 word_id_set_(word_id_set), | |
102 used_(used) {} | |
103 | |
104 Char16Set char_set_; | |
105 WordIDSet word_id_set_; | |
106 bool used_; // true if this set has been used for the current term search. | |
107 }; | |
108 typedef std::vector<TermCharWordSet> TermCharWordSetVector; | |
109 | |
110 // TODO(rohitrao): Probably replace this with QueryResults. | |
111 typedef std::vector<URLRow> URLRowVector; | |
112 | |
113 // A map from history_id to the history's URL and title. | |
114 typedef std::map<HistoryID, URLRow> HistoryInfoMap; | |
115 | |
116 // Break a string down into individual words. | |
117 String16Set WordsFromString16(const string16& uni_string); | |
118 | |
119 // URL History indexing support functions. | |
120 | |
121 // Index one URL history item. | |
122 bool IndexRow(URLRow row); | |
123 | |
124 // Break a string down into its individual characters. | |
125 // Note that this is temporarily intended to work on a single word, but | |
126 // _will_ work on a string of words, perhaps with unexpected results. | |
127 // TODO(mrossetti): Lots of optimizations possible here for not restarting | |
128 // a search if the user is just typing along. Also, change this to uniString | |
129 // and properly handle substring matches, scoring and sorting the results | |
130 // by score. Also, provide the metrics for where the matches occur so that | |
131 // the UI can highlight the matched sections. | |
132 Char16Set CharactersFromString16(const string16& uni_word); | |
133 | |
134 // Given a single word, add a reference to the containing history item | |
135 // to the index. | |
136 void AddWordToIndex(const string16& uni_word, HistoryID history_id); | |
137 | |
138 // Update an existing entry in the word/history index by adding the | |
139 // |history_id| to set for |word_id| in the word_id_history_map_. | |
140 void UpdateWordHistory(WordID word_id, HistoryID history_id); | |
141 | |
142 // Create a new entry in the word/history map for |word_id| and add | |
143 // |history_id| as the initial element of the word's set. | |
144 void AddWordHistory(const string16& uni_word, HistoryID history_id); | |
145 | |
146 // A list of all of indexed words. The index of a word in this list is the | |
147 // ID of the word in the word_map_. It reduces the memory overhead by | |
148 // replacing a potentially long and repeated string with a simple index. | |
149 // NOTE: A word will _never_ be removed from this vector thus insuring | |
150 // the immutability of the word_id throughout the session, reducing | |
151 // maintenance complexity. | |
152 String16Vector word_list_; | |
153 | |
154 uint64 history_item_count_; | |
155 WordMap word_map_; | |
156 CharWordIDMap char_word_map_; | |
157 WordIDHistoryMap word_id_history_map_; | |
158 TermCharWordSetVector term_char_word_set_cache_; | |
159 HistoryInfoMap history_info_map_; | |
160 string16 languages_; | |
161 | |
162 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); | |
163 }; | 25 }; |
164 | 26 |
165 } // namespace history | 27 } // namespace history |
166 | 28 |
167 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 29 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
OLD | NEW |