Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ | 5 #ifndef CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ |
| 6 #define CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ | 6 #define CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ |
| 7 | 7 |
| 8 #include <map> | 8 #include <map> |
| 9 #include <string> | |
| 9 #include <utility> | 10 #include <utility> |
| 10 | 11 |
| 11 #include "base/memory/ref_counted.h" | 12 #include "base/memory/ref_counted.h" |
| 12 #include "chrome/browser/history/history_types.h" | 13 #include "chrome/browser/history/history_types.h" |
| 13 | 14 |
| 14 namespace history { | 15 namespace history { |
| 15 | 16 |
| 17 // TopSiteCache stores cached thumbnails for visited pages. The flow to retrieve | |
| 18 // thumbnails from a given [input URL] is: | |
| 19 // | |
| 20 // [input URL] --(map 1)--> [canonical URL] --(map 2)--> image. | |
| 21 // | |
| 22 // (map 2) simply looks up |images_| directly. (map 1) uses |canonical_urls_|, | |
| 23 // and is more complex. | |
| 24 // | |
| 25 // For GetPageThumbnail(), (map 1) applies GetCanonicalURL(): | |
| 26 // - if [canonical URL] is a key in |canonical_urls_|, return the value. | |
| 27 // - else return [input URL]. | |
| 28 // | |
| 29 // For GetPageThumbnailForPrefix(), (map 1) applies GetCanonicalURLForPrefix(): | |
| 30 // - if [canonical URL] is a key in |canonical_urls_|, return the value. | |
| 31 // - else if [canonical URL] is "URL prefix" of some key in |canonical_urls_|, | |
|
beaudoin
2013/09/05 01:41:19
nit: a "URL prefix"
huangs
2013/09/05 15:24:46
Done.
| |
| 32 // return the value corresponding to the key (more on "URL prefix" later). | |
| 33 // - else return [input URL]. | |
| 34 // | |
| 35 // To specify "URL prefix", we consider a URL as a list of string tokens, with | |
| 36 // query parameters ignored. For example, | |
| 37 // "http://www.google.com/path/subpath/index.html?q=3" | |
| 38 // can be split into tokens: | |
| 39 // ["http://www.google.com", "path", "subpath", "index.html"] | |
| 40 // Using tokens as units of comparison, we define "URL prefix" (implemented in | |
| 41 // UrlIsPrefix()) and "URL compare" (CanonicalURLComparator). | |
| 42 // | |
| 43 // Algorithms and Implementation Details: | |
|
beaudoin
2013/09/05 01:41:19
Algorithms and Implementation Details for prefix s
huangs
2013/09/05 15:24:46
Done.
| |
| 44 // | |
| 45 // For efficient URL prefix search, instead of using a trie of tokens, our | |
| 46 // implementation applies two ideas (illustrated by examples): | |
| 47 // | |
| 48 // Idea 1: Let |L| = {"aa", "h", "hh", "his", "ss", "su"} be a sorted list of | |
| 49 // strings. Test whether a given string is a prefix of some string in |L|: | |
|
beaudoin
2013/09/05 01:41:19
// strings. We test whether a |query| string is a
huangs
2013/09/05 15:24:46
Done.
| |
| 50 // "hi": by binary search we find "his", for which "hi" is a prefix. | |
| 51 // "zz": by binary search "zz" > all strings in |L|, so it's never a prefix. | |
| 52 // "st": by binary search we find "su", for which "st" is not a prefix, so it's | |
| 53 // never a prefix; since if such a prefix |p| exists, we would have found | |
| 54 // "st" < |p| < "su". | |
| 55 // | |
| 56 // We apply the same idea for URL prefix, but using URL compare instead of | |
|
beaudoin
2013/09/05 01:41:19
but using CanonicalURLComparator instead of
huangs
2013/09/05 15:24:46
Done.
| |
| 57 // string compare. The general strategy is the same: apply binary search follwed | |
|
beaudoin
2013/09/05 01:41:19
nit: followed
huangs
2013/09/05 15:24:46
Done.
| |
| 58 // by a single prefix test. | |
| 59 // | |
| 60 // Idea 2: Let us compare the following URLs: | |
| 61 // |A| = "http://www.google.com/test/ab/cd" | |
| 62 // |B| = "http://www.google.com/test-case/yz" | |
| 63 // First we consider the inefficient way. Splitting these into tokens yields: | |
| 64 // |A| => ["http://www.google.com", "test", "ab", "cd"] | |
| 65 // |B| => ["http://www.google.com", "test-case", "yz"] | |
| 66 // Under lexicographical compare, for token 0 we have equivalence, and for | |
| 67 // token 1 we have "test" < "test-case". Thus |A| < |B| (URL comparison). Note | |
| 68 // that naive string comparison would yield opposite, since '/' > '-' (ASCII). | |
|
beaudoin
2013/09/05 01:41:19
Dont talk about naive string comparison here, it's
huangs
2013/09/05 15:24:46
Done.
| |
| 69 // | |
| 70 // To improve efficiency, we can perform the comparison in-place instead of | |
| 71 // splitting URLs into tokens. To do this, we treat '/' as a special separator, | |
| 72 // so '\0' < '/' < other characters. Using this as a comparator, direct string | |
| 73 // string comparison between |A| and |B|, results in '/' < '-', so |A| < |B|. | |
| 74 // See CanonicalURLComparator::CompareString() for details and special cases. | |
|
beaudoin
2013/09/05 01:41:19
I would replace the above paragraph with:
To impr
huangs
2013/09/05 15:24:46
Done.
| |
| 75 // | |
| 76 // Combining the two ideas above, we arrive at the following implementation for | |
| 77 // efficient URL prefix matching for URLs stored in |canonical_urls_|: | |
| 78 // - |canonical_urls_| is implemented as std::map that uses the comparator | |
| 79 // CanonicalURLComparator. | |
| 80 // - To see if |url| is a prefix of any key in |canonical_urls_|, we perform | |
| 81 // |canonical_urls_.lower_bound(url)| for effective binary search, and then | |
| 82 // use UrlIsPrefix() for prefix testing. | |
| 83 // | |
| 84 // Caveats: | |
| 85 // - In the case of multiple matches, sorting would biase towards URLs that | |
| 86 // appear first under CanonicalURLComparator(). So between | |
| 87 // "http://www.google.com/long/url/with/very/deep/directories" | |
| 88 // "http://www.google.com/short" | |
| 89 // the first one will be matched against "http://www.google.com". | |
| 90 // | |
| 91 // - CanonicalURLComparator handles query strings, but UrlIsPrefix() is | |
| 92 // insensitive to them. | |
| 93 | |
| 94 // The entries in CanonicalURLs, see CanonicalURLs for details. The second | |
| 95 // argument gives the index of the URL into MostVisitedURLs redirects. | |
| 96 typedef std::pair<MostVisitedURL*, size_t> CanonicalURLEntry; | |
| 97 | |
| 16 // TopSitesCache caches the top sites and thumbnails for TopSites. | 98 // TopSitesCache caches the top sites and thumbnails for TopSites. |
| 17 class TopSitesCache { | 99 class TopSitesCache { |
| 18 public: | 100 public: |
| 101 // Comparator used for CanonicalURLs. Its main purpose | |
| 102 class CanonicalURLComparator { | |
| 103 public: | |
| 104 bool operator()(const CanonicalURLEntry& e1, | |
| 105 const CanonicalURLEntry& e2) const; | |
| 106 static bool CompareString(const std::string& s1, const std::string& s2); | |
| 107 }; | |
| 108 | |
| 19 TopSitesCache(); | 109 TopSitesCache(); |
| 20 ~TopSitesCache(); | 110 ~TopSitesCache(); |
| 21 | 111 |
| 22 // The top sites. | 112 // The top sites. |
| 23 void SetTopSites(const MostVisitedURLList& top_sites); | 113 void SetTopSites(const MostVisitedURLList& top_sites); |
| 24 const MostVisitedURLList& top_sites() const { return top_sites_; } | 114 const MostVisitedURLList& top_sites() const { return top_sites_; } |
| 25 | 115 |
| 26 // The thumbnails. | 116 // The thumbnails. |
| 27 void SetThumbnails(const URLToImagesMap& images); | 117 void SetThumbnails(const URLToImagesMap& images); |
| 28 const URLToImagesMap& images() const { return images_; } | 118 const URLToImagesMap& images() const { return images_; } |
| 29 | 119 |
| 30 // Returns the thumbnail as an Image for the specified url. This adds an entry | 120 // Returns the thumbnail as an Image for the specified url. This adds an entry |
| 31 // for |url| if one has not yet been added. | 121 // for |url| if one has not yet been added. |
| 32 Images* GetImage(const GURL& url); | 122 Images* GetImage(const GURL& url); |
| 33 | 123 |
| 34 // Fetches the thumbnail for the specified url. Returns true if there is a | 124 // Fetches the thumbnail for the specified url. Returns true if there is a |
| 35 // thumbnail for the specified url. It is possible for a URL to be in TopSites | 125 // thumbnail for the specified url. It is possible for a URL to be in TopSites |
| 36 // but not have an thumbnail. | 126 // but not have an thumbnail. |
| 37 bool GetPageThumbnail(const GURL& url, | 127 bool GetPageThumbnail(const GURL& url, |
| 38 scoped_refptr<base::RefCountedMemory>* bytes); | 128 scoped_refptr<base::RefCountedMemory>* bytes); |
| 39 | 129 |
| 130 // Similar to GetPageThumbnail(), but rather than requiring an exact match, | |
| 131 // only requires the specified URL to be the prefix of a URL in TopSites. | |
| 132 // If multiple matches exist, returns the thumbnail for the first match in | |
| 133 // lexicographical order. | |
| 134 bool GetPageThumbnailForPrefix(const GURL& prefix_url, | |
| 135 scoped_refptr<base::RefCountedMemory>* bytes); | |
| 136 | |
| 40 // Fetches the thumbnail score for the specified url. Returns true if | 137 // Fetches the thumbnail score for the specified url. Returns true if |
| 41 // there is a thumbnail score for the specified url. | 138 // there is a thumbnail score for the specified url. |
| 42 bool GetPageThumbnailScore(const GURL& url, ThumbnailScore* score); | 139 bool GetPageThumbnailScore(const GURL& url, ThumbnailScore* score); |
| 43 | 140 |
| 44 // Returns the canonical URL for |url|. | 141 // Returns the [canonical URL] for |url|. |
| 45 const GURL& GetCanonicalURL(const GURL& url); | 142 const GURL& GetCanonicalURL(const GURL& url); |
| 46 | 143 |
| 144 // Returns the [canonical URL] for |url_prefix|. If |url_prefix| matches | |
| 145 // multiple entries then returns the [canonical URL] for the first entry in | |
| 146 // lexicographical order. | |
| 147 const GURL& GetCanonicalURLForPrefix(const GURL& url_prefix); | |
| 148 | |
| 47 // Returns true if |url| is known. | 149 // Returns true if |url| is known. |
| 48 bool IsKnownURL(const GURL& url); | 150 bool IsKnownURL(const GURL& url); |
| 49 | 151 |
| 50 // Returns the index into |top_sites_| for |url|. | 152 // Returns the index into |top_sites_| for |url|. |
| 51 size_t GetURLIndex(const GURL& url); | 153 size_t GetURLIndex(const GURL& url); |
| 52 | 154 |
| 155 // Returns whether or not |url1| is a prefix of |url2|. Criteria: | |
| 156 // - Scheme, host, port: exact match required. | |
| 157 // - Path: considered as a list of path components (e.g., ["a", "bb"] in | |
| 158 // "/a/bb"), and |url1|'s list must be a prefix of |url2|'s list. | |
| 159 // - Query: ignored. | |
| 160 // Note that "http://www.google.com/test" is NOT a prefix of | |
| 161 // "http://www.google.com/testing", although "test" is a prefix of "testing". | |
| 162 // Also, "http://www.google.com" will not match "http://www.google.com:80". | |
| 163 static bool UrlIsPrefix(const GURL& url1, const GURL& url2); | |
| 164 | |
| 53 private: | 165 private: |
| 54 // The entries in CanonicalURLs, see CanonicalURLs for details. The second | |
| 55 // argument gives the index of the URL into MostVisitedURLs redirects. | |
| 56 typedef std::pair<MostVisitedURL*, size_t> CanonicalURLEntry; | |
| 57 | |
| 58 // Comparator used for CanonicalURLs. | |
| 59 class CanonicalURLComparator { | |
| 60 public: | |
| 61 bool operator()(const CanonicalURLEntry& e1, | |
| 62 const CanonicalURLEntry& e2) const { | |
| 63 return e1.first->redirects[e1.second] < e2.first->redirects[e2.second]; | |
| 64 } | |
| 65 }; | |
| 66 | |
| 67 // This is used to map from redirect url to the MostVisitedURL the redirect is | 166 // This is used to map from redirect url to the MostVisitedURL the redirect is |
| 68 // from. Ideally this would be map<GURL, size_t> (second param indexing into | 167 // from. Ideally this would be map<GURL, size_t> (second param indexing into |
| 69 // top_sites_), but this results in duplicating all redirect urls. As some | 168 // top_sites_), but this results in duplicating all redirect urls. As some |
| 70 // sites have a lot of redirects, we instead use the MostVisitedURL* and the | 169 // sites have a lot of redirects, we instead use the MostVisitedURL* and the |
| 71 // index of the redirect as the key, and the index into top_sites_ as the | 170 // index of the redirect as the key, and the index into top_sites_ as the |
| 72 // value. This way we aren't duplicating GURLs. CanonicalURLComparator | 171 // value. This way we aren't duplicating GURLs. CanonicalURLComparator |
| 73 // enforces the ordering as if we were using GURLs. | 172 // enforces the ordering as if we were using GURLs. |
| 74 typedef std::map<CanonicalURLEntry, size_t, | 173 typedef std::map<CanonicalURLEntry, size_t, |
| 75 CanonicalURLComparator> CanonicalURLs; | 174 CanonicalURLComparator> CanonicalURLs; |
| 76 | 175 |
| 77 // Generates the set of canonical urls from |top_sites_|. | 176 // Generates the set of canonical urls from |top_sites_|. |
| 78 void GenerateCanonicalURLs(); | 177 void GenerateCanonicalURLs(); |
| 79 | 178 |
| 80 // Stores a set of redirects. This is used by GenerateCanonicalURLs. | 179 // Stores a set of redirects. This is used by GenerateCanonicalURLs. |
| 81 void StoreRedirectChain(const RedirectList& redirects, size_t destination); | 180 void StoreRedirectChain(const RedirectList& redirects, size_t destination); |
| 82 | 181 |
| 83 // Returns the iterator into canconical_urls_ for the specified url. | 182 // Returns the iterator into canonical_urls_ for the |url|. If |match_prefix| |
| 84 CanonicalURLs::iterator GetCanonicalURLsIterator(const GURL& url); | 183 // is true, then returns the first iterator into canonical_urls_ that has |
| 184 // |url| as a path-prefix. | |
| 185 CanonicalURLs::iterator GetCanonicalURLsIterator(const GURL& url, | |
| 186 bool match_prefix); | |
| 85 | 187 |
| 86 // The top sites. | 188 // The top sites. |
| 87 MostVisitedURLList top_sites_; | 189 MostVisitedURLList top_sites_; |
| 88 | 190 |
| 89 // The images. These map from canonical url to image. | 191 // The images. These map from canonical url to image. |
| 90 URLToImagesMap images_; | 192 URLToImagesMap images_; |
| 91 | 193 |
| 92 // Generated from the redirects to and from the most visited pages. See | 194 // Generated from the redirects to and from the most visited pages. See |
| 93 // description above typedef for details. | 195 // description above typedef for details. |
| 94 CanonicalURLs canonical_urls_; | 196 CanonicalURLs canonical_urls_; |
| 95 | 197 |
| 96 DISALLOW_COPY_AND_ASSIGN(TopSitesCache); | 198 DISALLOW_COPY_AND_ASSIGN(TopSitesCache); |
| 97 }; | 199 }; |
| 98 | 200 |
| 99 } // namespace history | 201 } // namespace history |
| 100 | 202 |
| 101 #endif // CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ | 203 #endif // CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ |
| OLD | NEW |