OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
7 #pragma once | 7 #pragma once |
8 | 8 |
| 9 #include <map> |
| 10 #include <set> |
| 11 #include <vector> |
| 12 |
| 13 #include "app/sql/connection.h" |
| 14 #include "base/basictypes.h" |
| 15 #include "base/linked_ptr.h" |
| 16 #include "base/scoped_ptr.h" |
| 17 #include "base/string16.h" |
| 18 #include "chrome/browser/history/history_types.h" |
| 19 #include "testing/gtest/include/gtest/gtest_prod.h" |
| 20 |
| 21 namespace base { |
| 22 class Time; |
| 23 } |
| 24 |
9 namespace history { | 25 namespace history { |
10 | 26 |
11 class URLDatabase; | 27 class URLDatabase; |
12 | 28 |
13 // The URL history source. | 29 // The URL history source. |
14 // Holds portions of the URL database in memory in an indexed form. Used to | 30 // Holds portions of the URL database in memory in an indexed form. Used to |
15 // quickly look up matching URLs for a given query string. Used by | 31 // quickly look up matching URLs for a given query string. Used by |
16 // the HistoryURLProvider for inline autocomplete and to provide URL | 32 // the HistoryURLProvider for inline autocomplete and to provide URL |
17 // matches to the omnibox. | 33 // matches to the omnibox. |
| 34 // |
| 35 // Note about multi-byte codepoints and the data structures in the |
| 36 // InMemoryURLIndex class: One will quickly notice that no effort is made to |
| 37 // insure that multi-byte character boundaries are detected when indexing the |
| 38 // words and characters in the URL history database except when converting |
| 39 // URL strings to lowercase. Multi-byte-edness makes no difference when |
| 40 // indexing or when searching the index as the final filtering of results |
| 41 // is dependent on the comparison of a string of bytes, not individual |
| 42 // characters. While the lookup of those bytes during a search in the |
| 43 // |char_word_map_| could serve up words in which the individual char16 |
| 44 // occurs as a portion of a composite character the next filtering step |
| 45 // will eliminate such words except in the case where a single character |
| 46 // is being searched on and which character occurs as the second char16 of a |
| 47 // multi-char16 instance. |
18 class InMemoryURLIndex { | 48 class InMemoryURLIndex { |
19 public: | 49 public: |
20 InMemoryURLIndex() {} | 50 InMemoryURLIndex(); |
21 ~InMemoryURLIndex() {} | 51 ~InMemoryURLIndex(); |
| 52 |
| 53 // Convenience types |
| 54 typedef std::vector<string16> String16Vector; |
22 | 55 |
23 // Open and index the URL history database. | 56 // Open and index the URL history database. |
24 bool Init(URLDatabase* history_db); | 57 bool Init(URLDatabase* history_db, const string16& languages); |
| 58 |
| 59 // Reset the history index. |
| 60 void Reset(); |
| 61 |
| 62 // Given a vector containing one or more words as string16s, scan the |
| 63 // history index and return a vector with all scored, matching history items. |
| 64 // Each term must occur somewhere in the history item for the item to |
| 65 // qualify; however, the terms do not necessarily have to be adjacent. |
| 66 HistoryMatches HistoryItemsForTerms(const String16Vector& terms); |
| 67 |
| 68 // Returns the date threshold for considering an history item as significant. |
| 69 static base::Time RecentThreshold(); |
| 70 |
| 71 private: |
| 72 friend class InMemoryURLIndexTest; |
| 73 FRIEND_TEST(InMemoryURLIndexTest, Initialization); |
| 74 |
| 75 // Convenience types |
| 76 typedef std::set<string16> String16Set; |
| 77 typedef std::set<char16> Char16Set; |
| 78 |
| 79 // An index into list of all of the words we have indexed. |
| 80 typedef int16 WordID; |
| 81 |
| 82 // A map allowing a WordID to be determined given a word. |
| 83 typedef std::map<string16, WordID> WordMap; |
| 84 |
| 85 // A map from character to word_ids. |
| 86 typedef std::set<WordID> WordIDSet; // An index into the WordList. |
| 87 typedef std::map<char16, WordIDSet> CharWordIDMap; |
| 88 |
| 89 // A map from word_id to history item. |
| 90 // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit. |
| 91 // Consider using a smaller type. |
| 92 typedef URLID HistoryID; |
| 93 typedef std::set<HistoryID> HistoryIDSet; |
| 94 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; |
| 95 |
| 96 // Support caching of term character intersections so that we can optimize |
| 97 // searches which build upon a previous search. |
| 98 struct TermCharWordSet { |
| 99 TermCharWordSet(Char16Set char_set, WordIDSet word_id_set, bool used) |
| 100 : char_set_(char_set), |
| 101 word_id_set_(word_id_set), |
| 102 used_(used) {} |
| 103 |
| 104 Char16Set char_set_; |
| 105 WordIDSet word_id_set_; |
| 106 bool used_; // true if this set has been used for the current term search. |
| 107 }; |
| 108 typedef std::vector<TermCharWordSet> TermCharWordSetVector; |
| 109 |
| 110 // TODO(rohitrao): Probably replace this with QueryResults. |
| 111 typedef std::vector<URLRow> URLRowVector; |
| 112 |
| 113 // A map from history_id to the history's URL and title. |
| 114 typedef std::map<HistoryID, URLRow> HistoryInfoMap; |
| 115 |
| 116 // Break a string down into individual words. |
| 117 String16Set WordsFromString16(const string16& uni_string); |
| 118 |
| 119 // URL History indexing support functions. |
| 120 |
| 121 // Index one URL history item. |
| 122 bool IndexRow(URLRow row); |
| 123 |
| 124 // Break a string down into its individual characters. |
| 125 // Note that this is temporarily intended to work on a single word, but |
| 126 // _will_ work on a string of words, perhaps with unexpected results. |
| 127 // TODO(mrossetti): Lots of optimizations possible here for not restarting |
| 128 // a search if the user is just typing along. Also, change this to uniString |
| 129 // and properly handle substring matches, scoring and sorting the results |
| 130 // by score. Also, provide the metrics for where the matches occur so that |
| 131 // the UI can highlight the matched sections. |
| 132 Char16Set CharactersFromString16(const string16& uni_word); |
| 133 |
| 134 // Given a single word, add a reference to the containing history item |
| 135 // to the index. |
| 136 void AddWordToIndex(const string16& uni_word, HistoryID history_id); |
| 137 |
| 138 // Update an existing entry in the word/history index by adding the |
| 139 // |history_id| to set for |word_id| in the word_id_history_map_. |
| 140 void UpdateWordHistory(WordID word_id, HistoryID history_id); |
| 141 |
| 142 // Create a new entry in the word/history map for |word_id| and add |
| 143 // |history_id| as the initial element of the word's set. |
| 144 void AddWordHistory(const string16& uni_word, HistoryID history_id); |
| 145 |
| 146 // A list of all of indexed words. The index of a word in this list is the |
| 147 // ID of the word in the word_map_. It reduces the memory overhead by |
| 148 // replacing a potentially long and repeated string with a simple index. |
| 149 // NOTE: A word will _never_ be removed from this vector thus insuring |
| 150 // the immutability of the word_id throughout the session, reducing |
| 151 // maintenance complexity. |
| 152 String16Vector word_list_; |
| 153 |
| 154 uint64 history_item_count_; |
| 155 WordMap word_map_; |
| 156 CharWordIDMap char_word_map_; |
| 157 WordIDHistoryMap word_id_history_map_; |
| 158 TermCharWordSetVector term_char_word_set_cache_; |
| 159 HistoryInfoMap history_info_map_; |
| 160 string16 languages_; |
| 161 |
| 162 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); |
25 }; | 163 }; |
26 | 164 |
27 } // namespace history | 165 } // namespace history |
28 | 166 |
29 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 167 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
OLD | NEW |