| Index: chrome/browser/history/in_memory_url_index.h
|
| ===================================================================
|
| --- chrome/browser/history/in_memory_url_index.h (revision 56956)
|
| +++ chrome/browser/history/in_memory_url_index.h (working copy)
|
| @@ -6,6 +6,22 @@
|
| #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
|
| #pragma once
|
|
|
| +#include <map>
|
| +#include <set>
|
| +#include <vector>
|
| +
|
| +#include "app/sql/connection.h"
|
| +#include "base/basictypes.h"
|
| +#include "base/linked_ptr.h"
|
| +#include "base/scoped_ptr.h"
|
| +#include "base/string16.h"
|
| +#include "chrome/browser/history/history_types.h"
|
| +#include "testing/gtest/include/gtest/gtest_prod.h"
|
| +
|
| +namespace base {
|
| +class Time;
|
| +}
|
| +
|
| namespace history {
|
|
|
| class URLDatabase;
|
| @@ -15,13 +31,135 @@
|
| // quickly look up matching URLs for a given query string. Used by
|
| // the HistoryURLProvider for inline autocomplete and to provide URL
|
| // matches to the omnibox.
|
| +//
|
| +// Note about multi-byte codepoints and the data structures in the
|
| +// InMemoryURLIndex class: One will quickly notice that no effort is made to
|
| +// insure that multi-byte character boundaries are detected when indexing the
|
| +// words and characters in the URL history database except when converting
|
| +// URL strings to lowercase. Multi-byte-edness makes no difference when
|
| +// indexing or when searching the index as the final filtering of results
|
| +// is dependent on the comparison of a string of bytes, not individual
|
| +// characters. While the lookup of those bytes during a search in the
|
| +// |char_word_map_| could serve up words in which the individual char16
|
| +// occurs as a portion of a composite character the next filtering step
|
| +// will eliminate such words except in the case where a single character
|
| +// is being searched on and which character occurs as the second char16 of a
|
| +// multi-char16 instance.
|
| class InMemoryURLIndex {
|
| public:
|
| - InMemoryURLIndex() {}
|
| - ~InMemoryURLIndex() {}
|
| + InMemoryURLIndex();
|
| + ~InMemoryURLIndex();
|
|
|
| + // Convenience types
|
| + typedef std::vector<string16> String16Vector;
|
| +
|
| // Open and index the URL history database.
|
| - bool Init(URLDatabase* history_db);
|
| + bool Init(URLDatabase* history_db, const string16& languages);
|
| +
|
| + // Reset the history index.
|
| + void Reset();
|
| +
|
| + // Given a vector containing one or more words as string16s, scan the
|
| + // history index and return a vector with all scored, matching history items.
|
| + // Each term must occur somewhere in the history item for the item to
|
| + // qualify; however, the terms do not necessarily have to be adjacent.
|
| + HistoryMatches HistoryItemsForTerms(const String16Vector& terms);
|
| +
|
| + // Returns the date threshold for considering an history item as significant.
|
| + static base::Time RecentThreshold();
|
| +
|
| + private:
|
| + friend class InMemoryURLIndexTest;
|
| + FRIEND_TEST(InMemoryURLIndexTest, Initialization);
|
| +
|
| + // Convenience types
|
| + typedef std::set<string16> String16Set;
|
| + typedef std::set<char16> Char16Set;
|
| +
|
| + // An index into list of all of the words we have indexed.
|
| + typedef int16 WordID;
|
| +
|
| + // A map allowing a WordID to be determined given a word.
|
| + typedef std::map<string16, WordID> WordMap;
|
| +
|
| + // A map from character to word_ids.
|
| + typedef std::set<WordID> WordIDSet; // An index into the WordList.
|
| + typedef std::map<char16, WordIDSet> CharWordIDMap;
|
| +
|
| + // A map from word_id to history item.
|
| + // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit.
|
| + // Consider using a smaller type.
|
| + typedef URLID HistoryID;
|
| + typedef std::set<HistoryID> HistoryIDSet;
|
| + typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
|
| +
|
| + // Support caching of term character intersections so that we can optimize
|
| + // searches which build upon a previous search.
|
| + struct TermCharWordSet {
|
| + TermCharWordSet(Char16Set char_set, WordIDSet word_id_set, bool used)
|
| + : char_set_(char_set),
|
| + word_id_set_(word_id_set),
|
| + used_(used) {}
|
| +
|
| + Char16Set char_set_;
|
| + WordIDSet word_id_set_;
|
| + bool used_; // true if this set has been used for the current term search.
|
| + };
|
| + typedef std::vector<TermCharWordSet> TermCharWordSetVector;
|
| +
|
| + // TODO(rohitrao): Probably replace this with QueryResults.
|
| + typedef std::vector<URLRow> URLRowVector;
|
| +
|
| + // A map from history_id to the history's URL and title.
|
| + typedef std::map<HistoryID, URLRow> HistoryInfoMap;
|
| +
|
| + // Break a string down into individual words.
|
| + String16Set WordsFromString16(const string16& uni_string);
|
| +
|
| + // URL History indexing support functions.
|
| +
|
| + // Index one URL history item.
|
| + bool IndexRow(URLRow row);
|
| +
|
| + // Break a string down into its individual characters.
|
| + // Note that this is temporarily intended to work on a single word, but
|
| + // _will_ work on a string of words, perhaps with unexpected results.
|
| + // TODO(mrossetti): Lots of optimizations possible here for not restarting
|
| + // a search if the user is just typing along. Also, change this to uniString
|
| + // and properly handle substring matches, scoring and sorting the results
|
| + // by score. Also, provide the metrics for where the matches occur so that
|
| + // the UI can highlight the matched sections.
|
| + Char16Set CharactersFromString16(const string16& uni_word);
|
| +
|
| + // Given a single word, add a reference to the containing history item
|
| + // to the index.
|
| + void AddWordToIndex(const string16& uni_word, HistoryID history_id);
|
| +
|
| + // Update an existing entry in the word/history index by adding the
|
| + // |history_id| to set for |word_id| in the word_id_history_map_.
|
| + void UpdateWordHistory(WordID word_id, HistoryID history_id);
|
| +
|
| + // Create a new entry in the word/history map for |word_id| and add
|
| + // |history_id| as the initial element of the word's set.
|
| + void AddWordHistory(const string16& uni_word, HistoryID history_id);
|
| +
|
| + // A list of all of indexed words. The index of a word in this list is the
|
| + // ID of the word in the word_map_. It reduces the memory overhead by
|
| + // replacing a potentially long and repeated string with a simple index.
|
| + // NOTE: A word will _never_ be removed from this vector thus insuring
|
| + // the immutability of the word_id throughout the session, reducing
|
| + // maintenance complexity.
|
| + String16Vector word_list_;
|
| +
|
| + uint64 history_item_count_;
|
| + WordMap word_map_;
|
| + CharWordIDMap char_word_map_;
|
| + WordIDHistoryMap word_id_history_map_;
|
| + TermCharWordSetVector term_char_word_set_cache_;
|
| + HistoryInfoMap history_info_map_;
|
| + string16 languages_;
|
| +
|
| + DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex);
|
| };
|
|
|
| } // namespace history
|
|
|