| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
| 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
| 7 #pragma once | 7 #pragma once |
| 8 | 8 |
| 9 #include <map> | |
| 10 #include <set> | |
| 11 #include <vector> | |
| 12 | |
| 13 #include "app/sql/connection.h" | |
| 14 #include "base/basictypes.h" | |
| 15 #include "base/linked_ptr.h" | |
| 16 #include "base/scoped_ptr.h" | |
| 17 #include "base/string16.h" | |
| 18 #include "chrome/browser/history/history_types.h" | |
| 19 #include "testing/gtest/include/gtest/gtest_prod.h" | |
| 20 | |
| 21 namespace base { | |
| 22 class Time; | |
| 23 } | |
| 24 | |
| 25 namespace history { | 9 namespace history { |
| 26 | 10 |
| 27 class URLDatabase; | 11 class URLDatabase; |
| 28 | 12 |
| 29 // The URL history source. | 13 // The URL history source. |
| 30 // Holds portions of the URL database in memory in an indexed form. Used to | 14 // Holds portions of the URL database in memory in an indexed form. Used to |
| 31 // quickly look up matching URLs for a given query string. Used by | 15 // quickly look up matching URLs for a given query string. Used by |
| 32 // the HistoryURLProvider for inline autocomplete and to provide URL | 16 // the HistoryURLProvider for inline autocomplete and to provide URL |
| 33 // matches to the omnibox. | 17 // matches to the omnibox. |
| 34 // | |
| 35 // Note about multi-byte codepoints and the data structures in the | |
| 36 // InMemoryURLIndex class: One will quickly notice that no effort is made to | |
| 37 // insure that multi-byte character boundaries are detected when indexing the | |
| 38 // words and characters in the URL history database except when converting | |
| 39 // URL strings to lowercase. Multi-byte-edness makes no difference when | |
| 40 // indexing or when searching the index as the final filtering of results | |
| 41 // is dependent on the comparison of a string of bytes, not individual | |
| 42 // characters. While the lookup of those bytes during a search in the | |
| 43 // |char_word_map_| could serve up words in which the individual char16 | |
| 44 // occurs as a portion of a composite character the next filtering step | |
| 45 // will eliminate such words except in the case where a single character | |
| 46 // is being searched on and which character occurs as the second char16 of a | |
| 47 // multi-char16 instance. | |
| 48 class InMemoryURLIndex { | 18 class InMemoryURLIndex { |
| 49 public: | 19 public: |
| 50 InMemoryURLIndex(); | 20 InMemoryURLIndex() {} |
| 51 ~InMemoryURLIndex(); | 21 ~InMemoryURLIndex() {} |
| 52 | |
| 53 // Convenience types | |
| 54 typedef std::vector<string16> String16Vector; | |
| 55 | 22 |
| 56 // Open and index the URL history database. | 23 // Open and index the URL history database. |
| 57 bool Init(URLDatabase* history_db, const string16& languages); | 24 bool Init(URLDatabase* history_db); |
| 58 | |
| 59 // Reset the history index. | |
| 60 void Reset(); | |
| 61 | |
| 62 // Given a vector containing one or more words as string16s, scan the | |
| 63 // history index and return a vector with all scored, matching history items. | |
| 64 // Each term must occur somewhere in the history item for the item to | |
| 65 // qualify; however, the terms do not necessarily have to be adjacent. | |
| 66 HistoryMatches HistoryItemsForTerms(const String16Vector& terms); | |
| 67 | |
| 68 // Returns the date threshold for considering an history item as significant. | |
| 69 static base::Time RecentThreshold(); | |
| 70 | |
| 71 private: | |
| 72 friend class InMemoryURLIndexTest; | |
| 73 FRIEND_TEST(InMemoryURLIndexTest, Initialization); | |
| 74 | |
| 75 // Convenience types | |
| 76 typedef std::set<string16> String16Set; | |
| 77 typedef std::set<char16> Char16Set; | |
| 78 | |
| 79 // An index into list of all of the words we have indexed. | |
| 80 typedef int16 WordID; | |
| 81 | |
| 82 // A map allowing a WordID to be determined given a word. | |
| 83 typedef std::map<string16, WordID> WordMap; | |
| 84 | |
| 85 // A map from character to word_ids. | |
| 86 typedef std::set<WordID> WordIDSet; // An index into the WordList. | |
| 87 typedef std::map<char16, WordIDSet> CharWordIDMap; | |
| 88 | |
| 89 // A map from word_id to history item. | |
| 90 // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit. | |
| 91 // Consider using a smaller type. | |
| 92 typedef URLID HistoryID; | |
| 93 typedef std::set<HistoryID> HistoryIDSet; | |
| 94 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; | |
| 95 | |
| 96 // Support caching of term character intersections so that we can optimize | |
| 97 // searches which build upon a previous search. | |
| 98 struct TermCharWordSet { | |
| 99 TermCharWordSet(Char16Set char_set, WordIDSet word_id_set, bool used) | |
| 100 : char_set_(char_set), | |
| 101 word_id_set_(word_id_set), | |
| 102 used_(used) {} | |
| 103 | |
| 104 Char16Set char_set_; | |
| 105 WordIDSet word_id_set_; | |
| 106 bool used_; // true if this set has been used for the current term search. | |
| 107 }; | |
| 108 typedef std::vector<TermCharWordSet> TermCharWordSetVector; | |
| 109 | |
| 110 // TODO(rohitrao): Probably replace this with QueryResults. | |
| 111 typedef std::vector<URLRow> URLRowVector; | |
| 112 | |
| 113 // A map from history_id to the history's URL and title. | |
| 114 typedef std::map<HistoryID, URLRow> HistoryInfoMap; | |
| 115 | |
| 116 // Break a string down into individual words. | |
| 117 String16Set WordsFromString16(const string16& uni_string); | |
| 118 | |
| 119 // URL History indexing support functions. | |
| 120 | |
| 121 // Index one URL history item. | |
| 122 bool IndexRow(URLRow row); | |
| 123 | |
| 124 // Break a string down into its individual characters. | |
| 125 // Note that this is temporarily intended to work on a single word, but | |
| 126 // _will_ work on a string of words, perhaps with unexpected results. | |
| 127 // TODO(mrossetti): Lots of optimizations possible here for not restarting | |
| 128 // a search if the user is just typing along. Also, change this to uniString | |
| 129 // and properly handle substring matches, scoring and sorting the results | |
| 130 // by score. Also, provide the metrics for where the matches occur so that | |
| 131 // the UI can highlight the matched sections. | |
| 132 Char16Set CharactersFromString16(const string16& uni_word); | |
| 133 | |
| 134 // Given a single word, add a reference to the containing history item | |
| 135 // to the index. | |
| 136 void AddWordToIndex(const string16& uni_word, HistoryID history_id); | |
| 137 | |
| 138 // Update an existing entry in the word/history index by adding the | |
| 139 // |history_id| to set for |word_id| in the word_id_history_map_. | |
| 140 void UpdateWordHistory(WordID word_id, HistoryID history_id); | |
| 141 | |
| 142 // Create a new entry in the word/history map for |word_id| and add | |
| 143 // |history_id| as the initial element of the word's set. | |
| 144 void AddWordHistory(const string16& uni_word, HistoryID history_id); | |
| 145 | |
| 146 // A list of all of indexed words. The index of a word in this list is the | |
| 147 // ID of the word in the word_map_. It reduces the memory overhead by | |
| 148 // replacing a potentially long and repeated string with a simple index. | |
| 149 // NOTE: A word will _never_ be removed from this vector thus insuring | |
| 150 // the immutability of the word_id throughout the session, reducing | |
| 151 // maintenance complexity. | |
| 152 String16Vector word_list_; | |
| 153 | |
| 154 uint64 history_item_count_; | |
| 155 WordMap word_map_; | |
| 156 CharWordIDMap char_word_map_; | |
| 157 WordIDHistoryMap word_id_history_map_; | |
| 158 TermCharWordSetVector term_char_word_set_cache_; | |
| 159 HistoryInfoMap history_info_map_; | |
| 160 string16 languages_; | |
| 161 | |
| 162 DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); | |
| 163 }; | 25 }; |
| 164 | 26 |
| 165 } // namespace history | 27 } // namespace history |
| 166 | 28 |
| 167 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ | 29 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
| OLD | NEW |