Index: chrome/browser/history/in_memory_url_index.h |
=================================================================== |
--- chrome/browser/history/in_memory_url_index.h (revision 56956) |
+++ chrome/browser/history/in_memory_url_index.h (working copy) |
@@ -6,6 +6,22 @@ |
#define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ |
#pragma once |
+#include <map> |
+#include <set> |
+#include <vector> |
+ |
+#include "app/sql/connection.h" |
+#include "base/basictypes.h" |
+#include "base/linked_ptr.h" |
+#include "base/scoped_ptr.h" |
+#include "base/string16.h" |
+#include "chrome/browser/history/history_types.h" |
+#include "testing/gtest/include/gtest/gtest_prod.h" |
+ |
+namespace base { |
+class Time; |
+} |
+ |
namespace history { |
class URLDatabase; |
@@ -15,13 +31,135 @@ |
// quickly look up matching URLs for a given query string. Used by |
// the HistoryURLProvider for inline autocomplete and to provide URL |
// matches to the omnibox. |
+// |
+// Note about multi-byte codepoints and the data structures in the |
+// InMemoryURLIndex class: One will quickly notice that no effort is made to |
+// insure that multi-byte character boundaries are detected when indexing the |
+// words and characters in the URL history database except when converting |
+// URL strings to lowercase. Multi-byte-edness makes no difference when |
+// indexing or when searching the index as the final filtering of results |
+// is dependent on the comparison of a string of bytes, not individual |
+// characters. While the lookup of those bytes during a search in the |
+// |char_word_map_| could serve up words in which the individual char16 |
+// occurs as a portion of a composite character the next filtering step |
+// will eliminate such words except in the case where a single character |
+// is being searched on and which character occurs as the second char16 of a |
+// multi-char16 instance. |
class InMemoryURLIndex { |
public: |
- InMemoryURLIndex() {} |
- ~InMemoryURLIndex() {} |
+ InMemoryURLIndex(); |
+ ~InMemoryURLIndex(); |
+ // Convenience types |
+ typedef std::vector<string16> String16Vector; |
+ |
// Open and index the URL history database. |
- bool Init(URLDatabase* history_db); |
+ bool Init(URLDatabase* history_db, const string16& languages); |
+ |
+ // Reset the history index. |
+ void Reset(); |
+ |
+ // Given a vector containing one or more words as string16s, scan the |
+ // history index and return a vector with all scored, matching history items. |
+ // Each term must occur somewhere in the history item for the item to |
+ // qualify; however, the terms do not necessarily have to be adjacent. |
+ HistoryMatches HistoryItemsForTerms(const String16Vector& terms); |
+ |
+ // Returns the date threshold for considering an history item as significant. |
+ static base::Time RecentThreshold(); |
+ |
+ private: |
+ friend class InMemoryURLIndexTest; |
+ FRIEND_TEST(InMemoryURLIndexTest, Initialization); |
+ |
+ // Convenience types |
+ typedef std::set<string16> String16Set; |
+ typedef std::set<char16> Char16Set; |
+ |
+ // An index into list of all of the words we have indexed. |
+ typedef int16 WordID; |
+ |
+ // A map allowing a WordID to be determined given a word. |
+ typedef std::map<string16, WordID> WordMap; |
+ |
+ // A map from character to word_ids. |
+ typedef std::set<WordID> WordIDSet; // An index into the WordList. |
+ typedef std::map<char16, WordIDSet> CharWordIDMap; |
+ |
+ // A map from word_id to history item. |
+ // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit. |
+ // Consider using a smaller type. |
+ typedef URLID HistoryID; |
+ typedef std::set<HistoryID> HistoryIDSet; |
+ typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; |
+ |
+ // Support caching of term character intersections so that we can optimize |
+ // searches which build upon a previous search. |
+ struct TermCharWordSet { |
+ TermCharWordSet(Char16Set char_set, WordIDSet word_id_set, bool used) |
+ : char_set_(char_set), |
+ word_id_set_(word_id_set), |
+ used_(used) {} |
+ |
+ Char16Set char_set_; |
+ WordIDSet word_id_set_; |
+ bool used_; // true if this set has been used for the current term search. |
+ }; |
+ typedef std::vector<TermCharWordSet> TermCharWordSetVector; |
+ |
+ // TODO(rohitrao): Probably replace this with QueryResults. |
+ typedef std::vector<URLRow> URLRowVector; |
+ |
+ // A map from history_id to the history's URL and title. |
+ typedef std::map<HistoryID, URLRow> HistoryInfoMap; |
+ |
+ // Break a string down into individual words. |
+ String16Set WordsFromString16(const string16& uni_string); |
+ |
+ // URL History indexing support functions. |
+ |
+ // Index one URL history item. |
+ bool IndexRow(URLRow row); |
+ |
+ // Break a string down into its individual characters. |
+ // Note that this is temporarily intended to work on a single word, but |
+ // _will_ work on a string of words, perhaps with unexpected results. |
+ // TODO(mrossetti): Lots of optimizations possible here for not restarting |
+ // a search if the user is just typing along. Also, change this to uniString |
+ // and properly handle substring matches, scoring and sorting the results |
+ // by score. Also, provide the metrics for where the matches occur so that |
+ // the UI can highlight the matched sections. |
+ Char16Set CharactersFromString16(const string16& uni_word); |
+ |
+ // Given a single word, add a reference to the containing history item |
+ // to the index. |
+ void AddWordToIndex(const string16& uni_word, HistoryID history_id); |
+ |
+ // Update an existing entry in the word/history index by adding the |
+ // |history_id| to set for |word_id| in the word_id_history_map_. |
+ void UpdateWordHistory(WordID word_id, HistoryID history_id); |
+ |
+ // Create a new entry in the word/history map for |word_id| and add |
+ // |history_id| as the initial element of the word's set. |
+ void AddWordHistory(const string16& uni_word, HistoryID history_id); |
+ |
+ // A list of all of indexed words. The index of a word in this list is the |
+ // ID of the word in the word_map_. It reduces the memory overhead by |
+ // replacing a potentially long and repeated string with a simple index. |
+ // NOTE: A word will _never_ be removed from this vector thus insuring |
+ // the immutability of the word_id throughout the session, reducing |
+ // maintenance complexity. |
+ String16Vector word_list_; |
+ |
+ uint64 history_item_count_; |
+ WordMap word_map_; |
+ CharWordIDMap char_word_map_; |
+ WordIDHistoryMap word_id_history_map_; |
+ TermCharWordSetVector term_char_word_set_cache_; |
+ HistoryInfoMap history_info_map_; |
+ string16 languages_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); |
}; |
} // namespace history |