Index: chrome/browser/history/top_sites_cache.h |
diff --git a/chrome/browser/history/top_sites_cache.h b/chrome/browser/history/top_sites_cache.h |
index 822d18903dc00e05ceb538e3975f71b2a6a8e3cf..98f78764ae29e010557e0701f9e9fbd38d9537a3 100644 |
--- a/chrome/browser/history/top_sites_cache.h |
+++ b/chrome/browser/history/top_sites_cache.h |
@@ -6,6 +6,7 @@ |
#define CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ |
#include <map> |
+#include <string> |
#include <utility> |
#include "base/memory/ref_counted.h" |
@@ -13,9 +14,98 @@ |
namespace history { |
+// TopSiteCache stores cached thumbnails for visited pages. The flow to retrieve |
+// thumbnails from a given [input URL] is: |
+// |
+// [input URL] --(map 1)--> [canonical URL] --(map 2)--> image. |
+// |
+// (map 2) simply looks up |images_| directly. (map 1) uses |canonical_urls_|, |
+// and is more complex. |
+// |
+// For GetPageThumbnail(), (map 1) applies GetCanonicalURL(): |
+// - if [canonical URL] is a key in |canonical_urls_|, return the value. |
+// - else return [input URL]. |
+// |
+// For GetPageThumbnailForPrefix(), (map 1) applies GetCanonicalURLForPrefix(): |
+// - if [canonical URL] is a key in |canonical_urls_|, return the value. |
+// - else if [canonical URL] is "URL prefix" of some key in |canonical_urls_|, |
beaudoin
2013/09/05 01:41:19
nit: a "URL prefix"
huangs
2013/09/05 15:24:46
Done.
|
+// return the value corresponding to the key (more on "URL prefix" later). |
+// - else return [input URL]. |
+// |
+// To specify "URL prefix", we consider a URL as a list of string tokens, with |
+// query parameters ignored. For example, |
+// "http://www.google.com/path/subpath/index.html?q=3" |
+// can be split into tokens: |
+// ["http://www.google.com", "path", "subpath", "index.html"] |
+// Using tokens as units of comparison, we define "URL prefix" (implemented in |
+// UrlIsPrefix()) and "URL compare" (CanonicalURLComparator). |
+// |
+// Algorithms and Implementation Details: |
beaudoin
2013/09/05 01:41:19
Algorithms and Implementation Details for prefix s
huangs
2013/09/05 15:24:46
Done.
|
+// |
+// For efficient URL prefix search, instead of using a trie of tokens, our |
+// implementation applies two ideas (illustrated by examples): |
+// |
+// Idea 1: Let |L| = {"aa", "h", "hh", "his", "ss", "su"} be a sorted list of |
+// strings. Test whether a given string is a prefix of some string in |L|: |
beaudoin
2013/09/05 01:41:19
// strings. We test whether a |query| string is a
huangs
2013/09/05 15:24:46
Done.
|
+// "hi": by binary search we find "his", for which "hi" is a prefix. |
+// "zz": by binary search "zz" > all strings in |L|, so it's never a prefix. |
+// "st": by binary search we find "su", for which "st" is not a prefix, so it's |
+// never a prefix; since if such a prefix |p| exists, we would have found |
+// "st" < |p| < "su". |
+// |
+// We apply the same idea for URL prefix, but using URL compare instead of |
beaudoin
2013/09/05 01:41:19
but using CanonicalURLComparator instead of
huangs
2013/09/05 15:24:46
Done.
|
+// string compare. The general strategy is the same: apply binary search follwed |
beaudoin
2013/09/05 01:41:19
nit: followed
huangs
2013/09/05 15:24:46
Done.
|
+// by a single prefix test. |
+// |
+// Idea 2: Let us compare the following URLs: |
+// |A| = "http://www.google.com/test/ab/cd" |
+// |B| = "http://www.google.com/test-case/yz" |
+// First we consider the inefficient way. Splitting these into tokens yields: |
+// |A| => ["http://www.google.com", "test", "ab", "cd"] |
+// |B| => ["http://www.google.com", "test-case", "yz"] |
+// Under lexicographical compare, for token 0 we have equivalence, and for |
+// token 1 we have "test" < "test-case". Thus |A| < |B| (URL comparison). Note |
+// that naive string comparison would yield opposite, since '/' > '-' (ASCII). |
beaudoin
2013/09/05 01:41:19
Dont talk about naive string comparison here, it's
huangs
2013/09/05 15:24:46
Done.
|
+// |
+// To improve efficiency, we can perform the comparison in-place instead of |
+// splitting URLs into tokens. To do this, we treat '/' as a special separator, |
+// so '\0' < '/' < other characters. Using this as a comparator, direct string |
+// string comparison between |A| and |B|, results in '/' < '-', so |A| < |B|. |
+// See CanonicalURLComparator::CompareString() for details and special cases. |
beaudoin
2013/09/05 01:41:19
I would replace the above paragraph with:
To impr
huangs
2013/09/05 15:24:46
Done.
|
+// |
+// Combining the two ideas above, we arrive at the following implementation for |
+// efficient URL prefix matching for URLs stored in |canonical_urls_|: |
+// - |canonical_urls_| is implemented as std::map that uses the comparator |
+// CanonicalURLComparator. |
+// - To see if |url| is a prefix of any key in |canonical_urls_|, we perform |
+// |canonical_urls_.lower_bound(url)| for effective binary search, and then |
+// use UrlIsPrefix() for prefix testing. |
+// |
+// Caveats: |
+// - In the case of multiple matches, sorting would biase towards URLs that |
+// appear first under CanonicalURLComparator(). So between |
+// "http://www.google.com/long/url/with/very/deep/directories" |
+// "http://www.google.com/short" |
+// the first one will be matched against "http://www.google.com". |
+// |
+// - CanonicalURLComparator handles query strings, but UrlIsPrefix() is |
+// insensitive to them. |
+ |
+// The entries in CanonicalURLs, see CanonicalURLs for details. The second |
+// argument gives the index of the URL into MostVisitedURLs redirects. |
+typedef std::pair<MostVisitedURL*, size_t> CanonicalURLEntry; |
+ |
// TopSitesCache caches the top sites and thumbnails for TopSites. |
class TopSitesCache { |
public: |
+ // Comparator used for CanonicalURLs. Its main purpose |
+ class CanonicalURLComparator { |
+ public: |
+ bool operator()(const CanonicalURLEntry& e1, |
+ const CanonicalURLEntry& e2) const; |
+ static bool CompareString(const std::string& s1, const std::string& s2); |
+ }; |
+ |
TopSitesCache(); |
~TopSitesCache(); |
@@ -37,33 +127,42 @@ class TopSitesCache { |
bool GetPageThumbnail(const GURL& url, |
scoped_refptr<base::RefCountedMemory>* bytes); |
+ // Similar to GetPageThumbnail(), but rather than requiring an exact match, |
+ // only requires the specified URL to be the prefix of a URL in TopSites. |
+ // If multiple matches exist, returns the thumbnail for the first match in |
+ // lexicographical order. |
+ bool GetPageThumbnailForPrefix(const GURL& prefix_url, |
+ scoped_refptr<base::RefCountedMemory>* bytes); |
+ |
// Fetches the thumbnail score for the specified url. Returns true if |
// there is a thumbnail score for the specified url. |
bool GetPageThumbnailScore(const GURL& url, ThumbnailScore* score); |
- // Returns the canonical URL for |url|. |
+ // Returns the [canonical URL] for |url|. |
const GURL& GetCanonicalURL(const GURL& url); |
+ // Returns the [canonical URL] for |url_prefix|. If |url_prefix| matches |
+ // multiple entries then returns the [canonical URL] for the first entry in |
+ // lexicographical order. |
+ const GURL& GetCanonicalURLForPrefix(const GURL& url_prefix); |
+ |
// Returns true if |url| is known. |
bool IsKnownURL(const GURL& url); |
// Returns the index into |top_sites_| for |url|. |
size_t GetURLIndex(const GURL& url); |
- private: |
- // The entries in CanonicalURLs, see CanonicalURLs for details. The second |
- // argument gives the index of the URL into MostVisitedURLs redirects. |
- typedef std::pair<MostVisitedURL*, size_t> CanonicalURLEntry; |
- |
- // Comparator used for CanonicalURLs. |
- class CanonicalURLComparator { |
- public: |
- bool operator()(const CanonicalURLEntry& e1, |
- const CanonicalURLEntry& e2) const { |
- return e1.first->redirects[e1.second] < e2.first->redirects[e2.second]; |
- } |
- }; |
+ // Returns whether or not |url1| is a prefix of |url2|. Criteria: |
+ // - Scheme, host, port: exact match required. |
+ // - Path: considered as a list of path components (e.g., ["a", "bb"] in |
+ // "/a/bb"), and |url1|'s list must be a prefix of |url2|'s list. |
+ // - Query: ignored. |
+ // Note that "http://www.google.com/test" is NOT a prefix of |
+ // "http://www.google.com/testing", although "test" is a prefix of "testing". |
+ // Also, "http://www.google.com" will not match "http://www.google.com:80". |
+ static bool UrlIsPrefix(const GURL& url1, const GURL& url2); |
+ private: |
// This is used to map from redirect url to the MostVisitedURL the redirect is |
// from. Ideally this would be map<GURL, size_t> (second param indexing into |
// top_sites_), but this results in duplicating all redirect urls. As some |
@@ -80,8 +179,11 @@ class TopSitesCache { |
// Stores a set of redirects. This is used by GenerateCanonicalURLs. |
void StoreRedirectChain(const RedirectList& redirects, size_t destination); |
- // Returns the iterator into canconical_urls_ for the specified url. |
- CanonicalURLs::iterator GetCanonicalURLsIterator(const GURL& url); |
+ // Returns the iterator into canonical_urls_ for the |url|. If |match_prefix| |
+ // is true, then returns the first iterator into canonical_urls_ that has |
+ // |url| as a path-prefix. |
+ CanonicalURLs::iterator GetCanonicalURLsIterator(const GURL& url, |
+ bool match_prefix); |
// The top sites. |
MostVisitedURLList top_sites_; |