Chromium Code Reviews| Index: chrome/browser/history/top_sites_cache.h |
| diff --git a/chrome/browser/history/top_sites_cache.h b/chrome/browser/history/top_sites_cache.h |
| index 822d18903dc00e05ceb538e3975f71b2a6a8e3cf..98f78764ae29e010557e0701f9e9fbd38d9537a3 100644 |
| --- a/chrome/browser/history/top_sites_cache.h |
| +++ b/chrome/browser/history/top_sites_cache.h |
| @@ -6,6 +6,7 @@ |
| #define CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ |
| #include <map> |
| +#include <string> |
| #include <utility> |
| #include "base/memory/ref_counted.h" |
| @@ -13,9 +14,98 @@ |
| namespace history { |
| +// TopSiteCache stores cached thumbnails for visited pages. The flow to retrieve |
| +// thumbnails from a given [input URL] is: |
| +// |
| +// [input URL] --(map 1)--> [canonical URL] --(map 2)--> image. |
| +// |
| +// (map 2) simply looks up |images_| directly. (map 1) uses |canonical_urls_|, |
| +// and is more complex. |
| +// |
| +// For GetPageThumbnail(), (map 1) applies GetCanonicalURL(): |
| +// - if [canonical URL] is a key in |canonical_urls_|, return the value. |
| +// - else return [input URL]. |
| +// |
| +// For GetPageThumbnailForPrefix(), (map 1) applies GetCanonicalURLForPrefix(): |
| +// - if [canonical URL] is a key in |canonical_urls_|, return the value. |
| +// - else if [canonical URL] is "URL prefix" of some key in |canonical_urls_|, |
|
beaudoin
2013/09/05 01:41:19
nit: a "URL prefix"
huangs
2013/09/05 15:24:46
Done.
|
| +// return the value corresponding to the key (more on "URL prefix" later). |
| +// - else return [input URL]. |
| +// |
| +// To specify "URL prefix", we consider a URL as a list of string tokens, with |
| +// query parameters ignored. For example, |
| +// "http://www.google.com/path/subpath/index.html?q=3" |
| +// can be split into tokens: |
| +// ["http://www.google.com", "path", "subpath", "index.html"] |
| +// Using tokens as units of comparison, we define "URL prefix" (implemented in |
| +// UrlIsPrefix()) and "URL compare" (CanonicalURLComparator). |
| +// |
| +// Algorithms and Implementation Details: |
|
beaudoin
2013/09/05 01:41:19
Algorithms and Implementation Details for prefix s
huangs
2013/09/05 15:24:46
Done.
|
| +// |
| +// For efficient URL prefix search, instead of using a trie of tokens, our |
| +// implementation applies two ideas (illustrated by examples): |
| +// |
| +// Idea 1: Let |L| = {"aa", "h", "hh", "his", "ss", "su"} be a sorted list of |
| +// strings. Test whether a given string is a prefix of some string in |L|: |
|
beaudoin
2013/09/05 01:41:19
// strings. We test whether a |query| string is a
huangs
2013/09/05 15:24:46
Done.
|
| +// "hi": by binary search we find "his", for which "hi" is a prefix. |
| +// "zz": by binary search "zz" > all strings in |L|, so it's never a prefix. |
| +// "st": by binary search we find "su", for which "st" is not a prefix, so it's |
| +// never a prefix; since if such a prefix |p| exists, we would have found |
| +// "st" < |p| < "su". |
| +// |
| +// We apply the same idea for URL prefix, but using URL compare instead of |
|
beaudoin
2013/09/05 01:41:19
but using CanonicalURLComparator instead of
huangs
2013/09/05 15:24:46
Done.
|
| +// string compare. The general strategy is the same: apply binary search follwed |
|
beaudoin
2013/09/05 01:41:19
nit: followed
huangs
2013/09/05 15:24:46
Done.
|
| +// by a single prefix test. |
| +// |
| +// Idea 2: Let us compare the following URLs: |
| +// |A| = "http://www.google.com/test/ab/cd" |
| +// |B| = "http://www.google.com/test-case/yz" |
| +// First we consider the inefficient way. Splitting these into tokens yields: |
| +// |A| => ["http://www.google.com", "test", "ab", "cd"] |
| +// |B| => ["http://www.google.com", "test-case", "yz"] |
| +// Under lexicographical compare, for token 0 we have equivalence, and for |
| +// token 1 we have "test" < "test-case". Thus |A| < |B| (URL comparison). Note |
| +// that naive string comparison would yield opposite, since '/' > '-' (ASCII). |
|
beaudoin
2013/09/05 01:41:19
Dont talk about naive string comparison here, it's
huangs
2013/09/05 15:24:46
Done.
|
| +// |
| +// To improve efficiency, we can perform the comparison in-place instead of |
| +// splitting URLs into tokens. To do this, we treat '/' as a special separator, |
| +// so '\0' < '/' < other characters. Using this as a comparator, direct string |
| +// string comparison between |A| and |B|, results in '/' < '-', so |A| < |B|. |
| +// See CanonicalURLComparator::CompareString() for details and special cases. |
|
beaudoin
2013/09/05 01:41:19
I would replace the above paragraph with:
To impr
huangs
2013/09/05 15:24:46
Done.
|
| +// |
| +// Combining the two ideas above, we arrive at the following implementation for |
| +// efficient URL prefix matching for URLs stored in |canonical_urls_|: |
| +// - |canonical_urls_| is implemented as std::map that uses the comparator |
| +// CanonicalURLComparator. |
| +// - To see if |url| is a prefix of any key in |canonical_urls_|, we perform |
| +// |canonical_urls_.lower_bound(url)| for effective binary search, and then |
| +// use UrlIsPrefix() for prefix testing. |
| +// |
| +// Caveats: |
| +// - In the case of multiple matches, sorting would biase towards URLs that |
| +// appear first under CanonicalURLComparator(). So between |
| +// "http://www.google.com/long/url/with/very/deep/directories" |
| +// "http://www.google.com/short" |
| +// the first one will be matched against "http://www.google.com". |
| +// |
| +// - CanonicalURLComparator handles query strings, but UrlIsPrefix() is |
| +// insensitive to them. |
| + |
| +// The entries in CanonicalURLs, see CanonicalURLs for details. The second |
| +// argument gives the index of the URL into MostVisitedURLs redirects. |
| +typedef std::pair<MostVisitedURL*, size_t> CanonicalURLEntry; |
| + |
| // TopSitesCache caches the top sites and thumbnails for TopSites. |
| class TopSitesCache { |
| public: |
| + // Comparator used for CanonicalURLs. Its main purpose |
| + class CanonicalURLComparator { |
| + public: |
| + bool operator()(const CanonicalURLEntry& e1, |
| + const CanonicalURLEntry& e2) const; |
| + static bool CompareString(const std::string& s1, const std::string& s2); |
| + }; |
| + |
| TopSitesCache(); |
| ~TopSitesCache(); |
| @@ -37,33 +127,42 @@ class TopSitesCache { |
| bool GetPageThumbnail(const GURL& url, |
| scoped_refptr<base::RefCountedMemory>* bytes); |
| + // Similar to GetPageThumbnail(), but rather than requiring an exact match, |
| + // only requires the specified URL to be the prefix of a URL in TopSites. |
| + // If multiple matches exist, returns the thumbnail for the first match in |
| + // lexicographical order. |
| + bool GetPageThumbnailForPrefix(const GURL& prefix_url, |
| + scoped_refptr<base::RefCountedMemory>* bytes); |
| + |
| // Fetches the thumbnail score for the specified url. Returns true if |
| // there is a thumbnail score for the specified url. |
| bool GetPageThumbnailScore(const GURL& url, ThumbnailScore* score); |
| - // Returns the canonical URL for |url|. |
| + // Returns the [canonical URL] for |url|. |
| const GURL& GetCanonicalURL(const GURL& url); |
| + // Returns the [canonical URL] for |url_prefix|. If |url_prefix| matches |
| + // multiple entries then returns the [canonical URL] for the first entry in |
| + // lexicographical order. |
| + const GURL& GetCanonicalURLForPrefix(const GURL& url_prefix); |
| + |
| // Returns true if |url| is known. |
| bool IsKnownURL(const GURL& url); |
| // Returns the index into |top_sites_| for |url|. |
| size_t GetURLIndex(const GURL& url); |
| - private: |
| - // The entries in CanonicalURLs, see CanonicalURLs for details. The second |
| - // argument gives the index of the URL into MostVisitedURLs redirects. |
| - typedef std::pair<MostVisitedURL*, size_t> CanonicalURLEntry; |
| - |
| - // Comparator used for CanonicalURLs. |
| - class CanonicalURLComparator { |
| - public: |
| - bool operator()(const CanonicalURLEntry& e1, |
| - const CanonicalURLEntry& e2) const { |
| - return e1.first->redirects[e1.second] < e2.first->redirects[e2.second]; |
| - } |
| - }; |
| + // Returns whether or not |url1| is a prefix of |url2|. Criteria: |
| + // - Scheme, host, port: exact match required. |
| + // - Path: considered as a list of path components (e.g., ["a", "bb"] in |
| + // "/a/bb"), and |url1|'s list must be a prefix of |url2|'s list. |
| + // - Query: ignored. |
| + // Note that "http://www.google.com/test" is NOT a prefix of |
| + // "http://www.google.com/testing", although "test" is a prefix of "testing". |
| + // Also, "http://www.google.com" will not match "http://www.google.com:80". |
| + static bool UrlIsPrefix(const GURL& url1, const GURL& url2); |
| + private: |
| // This is used to map from redirect url to the MostVisitedURL the redirect is |
| // from. Ideally this would be map<GURL, size_t> (second param indexing into |
| // top_sites_), but this results in duplicating all redirect urls. As some |
| @@ -80,8 +179,11 @@ class TopSitesCache { |
| // Stores a set of redirects. This is used by GenerateCanonicalURLs. |
| void StoreRedirectChain(const RedirectList& redirects, size_t destination); |
| - // Returns the iterator into canconical_urls_ for the specified url. |
| - CanonicalURLs::iterator GetCanonicalURLsIterator(const GURL& url); |
| + // Returns the iterator into canonical_urls_ for the |url|. If |match_prefix| |
| + // is true, then returns the first iterator into canonical_urls_ that has |
| + // |url| as a path-prefix. |
| + CanonicalURLs::iterator GetCanonicalURLsIterator(const GURL& url, |
| + bool match_prefix); |
| // The top sites. |
| MostVisitedURLList top_sites_; |