OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ | 5 #ifndef CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ |
6 #define CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ | 6 #define CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ |
7 | 7 |
8 #include <map> | 8 #include <map> |
9 #include <string> | |
9 #include <utility> | 10 #include <utility> |
10 | 11 |
11 #include "base/memory/ref_counted.h" | 12 #include "base/memory/ref_counted.h" |
12 #include "chrome/browser/history/history_types.h" | 13 #include "chrome/browser/history/history_types.h" |
13 | 14 |
14 namespace history { | 15 namespace history { |
15 | 16 |
17 // TopSiteCache stores cached thumbnails for visited pages. The flow to retrieve | |
18 // thumbnails from a given [input URL] is: | |
19 // | |
20 // [input URL] --(map 1)--> [canonical URL] --(map 2)--> image. | |
21 // | |
22 // (map 2) simply looks up |images_| directly. (map 1) uses |canonical_urls_|, | |
23 // and is more complex. | |
24 // | |
25 // For GetPageThumbnail(), (map 1) applies GetCanonicalURL(): | |
26 // - if [canonical URL] is a key in |canonical_urls_|, return the value. | |
27 // - else return [input URL]. | |
28 // | |
29 // For GetPageThumbnailForPrefix(), (map 1) applies GetCanonicalURLForPrefix(): | |
30 // - if [canonical URL] is a key in |canonical_urls_|, return the value. | |
31 // - else if [canonical URL] is "URL prefix" of some key in |canonical_urls_|, | |
beaudoin
2013/09/05 01:41:19
nit: a "URL prefix"
huangs
2013/09/05 15:24:46
Done.
| |
32 // return the value corresponding to the key (more on "URL prefix" later). | |
33 // - else return [input URL]. | |
34 // | |
35 // To specify "URL prefix", we consider a URL as a list of string tokens, with | |
36 // query parameters ignored. For example, | |
37 // "http://www.google.com/path/subpath/index.html?q=3" | |
38 // can be split into tokens: | |
39 // ["http://www.google.com", "path", "subpath", "index.html"] | |
40 // Using tokens as units of comparison, we define "URL prefix" (implemented in | |
41 // UrlIsPrefix()) and "URL compare" (CanonicalURLComparator). | |
42 // | |
43 // Algorithms and Implementation Details: | |
beaudoin
2013/09/05 01:41:19
Algorithms and Implementation Details for prefix s
huangs
2013/09/05 15:24:46
Done.
| |
44 // | |
45 // For efficient URL prefix search, instead of using a trie of tokens, our | |
46 // implementation applies two ideas (illustrated by examples): | |
47 // | |
48 // Idea 1: Let |L| = {"aa", "h", "hh", "his", "ss", "su"} be a sorted list of | |
49 // strings. Test whether a given string is a prefix of some string in |L|: | |
beaudoin
2013/09/05 01:41:19
// strings. We test whether a |query| string is a
huangs
2013/09/05 15:24:46
Done.
| |
50 // "hi": by binary search we find "his", for which "hi" is a prefix. | |
51 // "zz": by binary search "zz" > all strings in |L|, so it's never a prefix. | |
52 // "st": by binary search we find "su", for which "st" is not a prefix, so it's | |
53 // never a prefix; since if such a prefix |p| exists, we would have found | |
54 // "st" < |p| < "su". | |
55 // | |
56 // We apply the same idea for URL prefix, but using URL compare instead of | |
beaudoin
2013/09/05 01:41:19
but using CanonicalURLComparator instead of
huangs
2013/09/05 15:24:46
Done.
| |
57 // string compare. The general strategy is the same: apply binary search follwed | |
beaudoin
2013/09/05 01:41:19
nit: followed
huangs
2013/09/05 15:24:46
Done.
| |
58 // by a single prefix test. | |
59 // | |
60 // Idea 2: Let us compare the following URLs: | |
61 // |A| = "http://www.google.com/test/ab/cd" | |
62 // |B| = "http://www.google.com/test-case/yz" | |
63 // First we consider the inefficient way. Splitting these into tokens yields: | |
64 // |A| => ["http://www.google.com", "test", "ab", "cd"] | |
65 // |B| => ["http://www.google.com", "test-case", "yz"] | |
66 // Under lexicographical compare, for token 0 we have equivalence, and for | |
67 // token 1 we have "test" < "test-case". Thus |A| < |B| (URL comparison). Note | |
68 // that naive string comparison would yield opposite, since '/' > '-' (ASCII). | |
beaudoin
2013/09/05 01:41:19
Dont talk about naive string comparison here, it's
huangs
2013/09/05 15:24:46
Done.
| |
69 // | |
70 // To improve efficiency, we can perform the comparison in-place instead of | |
71 // splitting URLs into tokens. To do this, we treat '/' as a special separator, | |
72 // so '\0' < '/' < other characters. Using this as a comparator, direct string | |
73 // string comparison between |A| and |B|, results in '/' < '-', so |A| < |B|. | |
74 // See CanonicalURLComparator::CompareString() for details and special cases. | |
beaudoin
2013/09/05 01:41:19
I would replace the above paragraph with:
To impr
huangs
2013/09/05 15:24:46
Done.
| |
75 // | |
76 // Combining the two ideas above, we arrive at the following implementation for | |
77 // efficient URL prefix matching for URLs stored in |canonical_urls_|: | |
78 // - |canonical_urls_| is implemented as std::map that uses the comparator | |
79 // CanonicalURLComparator. | |
80 // - To see if |url| is a prefix of any key in |canonical_urls_|, we perform | |
81 // |canonical_urls_.lower_bound(url)| for effective binary search, and then | |
82 // use UrlIsPrefix() for prefix testing. | |
83 // | |
84 // Caveats: | |
85 // - In the case of multiple matches, sorting would biase towards URLs that | |
86 // appear first under CanonicalURLComparator(). So between | |
87 // "http://www.google.com/long/url/with/very/deep/directories" | |
88 // "http://www.google.com/short" | |
89 // the first one will be matched against "http://www.google.com". | |
90 // | |
91 // - CanonicalURLComparator handles query strings, but UrlIsPrefix() is | |
92 // insensitive to them. | |
93 | |
94 // The entries in CanonicalURLs, see CanonicalURLs for details. The second | |
95 // argument gives the index of the URL into MostVisitedURLs redirects. | |
96 typedef std::pair<MostVisitedURL*, size_t> CanonicalURLEntry; | |
97 | |
16 // TopSitesCache caches the top sites and thumbnails for TopSites. | 98 // TopSitesCache caches the top sites and thumbnails for TopSites. |
17 class TopSitesCache { | 99 class TopSitesCache { |
18 public: | 100 public: |
101 // Comparator used for CanonicalURLs. Its main purpose | |
102 class CanonicalURLComparator { | |
103 public: | |
104 bool operator()(const CanonicalURLEntry& e1, | |
105 const CanonicalURLEntry& e2) const; | |
106 static bool CompareString(const std::string& s1, const std::string& s2); | |
107 }; | |
108 | |
19 TopSitesCache(); | 109 TopSitesCache(); |
20 ~TopSitesCache(); | 110 ~TopSitesCache(); |
21 | 111 |
22 // The top sites. | 112 // The top sites. |
23 void SetTopSites(const MostVisitedURLList& top_sites); | 113 void SetTopSites(const MostVisitedURLList& top_sites); |
24 const MostVisitedURLList& top_sites() const { return top_sites_; } | 114 const MostVisitedURLList& top_sites() const { return top_sites_; } |
25 | 115 |
26 // The thumbnails. | 116 // The thumbnails. |
27 void SetThumbnails(const URLToImagesMap& images); | 117 void SetThumbnails(const URLToImagesMap& images); |
28 const URLToImagesMap& images() const { return images_; } | 118 const URLToImagesMap& images() const { return images_; } |
29 | 119 |
30 // Returns the thumbnail as an Image for the specified url. This adds an entry | 120 // Returns the thumbnail as an Image for the specified url. This adds an entry |
31 // for |url| if one has not yet been added. | 121 // for |url| if one has not yet been added. |
32 Images* GetImage(const GURL& url); | 122 Images* GetImage(const GURL& url); |
33 | 123 |
34 // Fetches the thumbnail for the specified url. Returns true if there is a | 124 // Fetches the thumbnail for the specified url. Returns true if there is a |
35 // thumbnail for the specified url. It is possible for a URL to be in TopSites | 125 // thumbnail for the specified url. It is possible for a URL to be in TopSites |
36 // but not have an thumbnail. | 126 // but not have an thumbnail. |
37 bool GetPageThumbnail(const GURL& url, | 127 bool GetPageThumbnail(const GURL& url, |
38 scoped_refptr<base::RefCountedMemory>* bytes); | 128 scoped_refptr<base::RefCountedMemory>* bytes); |
39 | 129 |
130 // Similar to GetPageThumbnail(), but rather than requiring an exact match, | |
131 // only requires the specified URL to be the prefix of a URL in TopSites. | |
132 // If multiple matches exist, returns the thumbnail for the first match in | |
133 // lexicographical order. | |
134 bool GetPageThumbnailForPrefix(const GURL& prefix_url, | |
135 scoped_refptr<base::RefCountedMemory>* bytes); | |
136 | |
40 // Fetches the thumbnail score for the specified url. Returns true if | 137 // Fetches the thumbnail score for the specified url. Returns true if |
41 // there is a thumbnail score for the specified url. | 138 // there is a thumbnail score for the specified url. |
42 bool GetPageThumbnailScore(const GURL& url, ThumbnailScore* score); | 139 bool GetPageThumbnailScore(const GURL& url, ThumbnailScore* score); |
43 | 140 |
44 // Returns the canonical URL for |url|. | 141 // Returns the [canonical URL] for |url|. |
45 const GURL& GetCanonicalURL(const GURL& url); | 142 const GURL& GetCanonicalURL(const GURL& url); |
46 | 143 |
144 // Returns the [canonical URL] for |url_prefix|. If |url_prefix| matches | |
145 // multiple entries then returns the [canonical URL] for the first entry in | |
146 // lexicographical order. | |
147 const GURL& GetCanonicalURLForPrefix(const GURL& url_prefix); | |
148 | |
47 // Returns true if |url| is known. | 149 // Returns true if |url| is known. |
48 bool IsKnownURL(const GURL& url); | 150 bool IsKnownURL(const GURL& url); |
49 | 151 |
50 // Returns the index into |top_sites_| for |url|. | 152 // Returns the index into |top_sites_| for |url|. |
51 size_t GetURLIndex(const GURL& url); | 153 size_t GetURLIndex(const GURL& url); |
52 | 154 |
155 // Returns whether or not |url1| is a prefix of |url2|. Criteria: | |
156 // - Scheme, host, port: exact match required. | |
157 // - Path: considered as a list of path components (e.g., ["a", "bb"] in | |
158 // "/a/bb"), and |url1|'s list must be a prefix of |url2|'s list. | |
159 // - Query: ignored. | |
160 // Note that "http://www.google.com/test" is NOT a prefix of | |
161 // "http://www.google.com/testing", although "test" is a prefix of "testing". | |
162 // Also, "http://www.google.com" will not match "http://www.google.com:80". | |
163 static bool UrlIsPrefix(const GURL& url1, const GURL& url2); | |
164 | |
53 private: | 165 private: |
54 // The entries in CanonicalURLs, see CanonicalURLs for details. The second | |
55 // argument gives the index of the URL into MostVisitedURLs redirects. | |
56 typedef std::pair<MostVisitedURL*, size_t> CanonicalURLEntry; | |
57 | |
58 // Comparator used for CanonicalURLs. | |
59 class CanonicalURLComparator { | |
60 public: | |
61 bool operator()(const CanonicalURLEntry& e1, | |
62 const CanonicalURLEntry& e2) const { | |
63 return e1.first->redirects[e1.second] < e2.first->redirects[e2.second]; | |
64 } | |
65 }; | |
66 | |
67 // This is used to map from redirect url to the MostVisitedURL the redirect is | 166 // This is used to map from redirect url to the MostVisitedURL the redirect is |
68 // from. Ideally this would be map<GURL, size_t> (second param indexing into | 167 // from. Ideally this would be map<GURL, size_t> (second param indexing into |
69 // top_sites_), but this results in duplicating all redirect urls. As some | 168 // top_sites_), but this results in duplicating all redirect urls. As some |
70 // sites have a lot of redirects, we instead use the MostVisitedURL* and the | 169 // sites have a lot of redirects, we instead use the MostVisitedURL* and the |
71 // index of the redirect as the key, and the index into top_sites_ as the | 170 // index of the redirect as the key, and the index into top_sites_ as the |
72 // value. This way we aren't duplicating GURLs. CanonicalURLComparator | 171 // value. This way we aren't duplicating GURLs. CanonicalURLComparator |
73 // enforces the ordering as if we were using GURLs. | 172 // enforces the ordering as if we were using GURLs. |
74 typedef std::map<CanonicalURLEntry, size_t, | 173 typedef std::map<CanonicalURLEntry, size_t, |
75 CanonicalURLComparator> CanonicalURLs; | 174 CanonicalURLComparator> CanonicalURLs; |
76 | 175 |
77 // Generates the set of canonical urls from |top_sites_|. | 176 // Generates the set of canonical urls from |top_sites_|. |
78 void GenerateCanonicalURLs(); | 177 void GenerateCanonicalURLs(); |
79 | 178 |
80 // Stores a set of redirects. This is used by GenerateCanonicalURLs. | 179 // Stores a set of redirects. This is used by GenerateCanonicalURLs. |
81 void StoreRedirectChain(const RedirectList& redirects, size_t destination); | 180 void StoreRedirectChain(const RedirectList& redirects, size_t destination); |
82 | 181 |
83 // Returns the iterator into canconical_urls_ for the specified url. | 182 // Returns the iterator into canonical_urls_ for the |url|. If |match_prefix| |
84 CanonicalURLs::iterator GetCanonicalURLsIterator(const GURL& url); | 183 // is true, then returns the first iterator into canonical_urls_ that has |
184 // |url| as a path-prefix. | |
185 CanonicalURLs::iterator GetCanonicalURLsIterator(const GURL& url, | |
186 bool match_prefix); | |
85 | 187 |
86 // The top sites. | 188 // The top sites. |
87 MostVisitedURLList top_sites_; | 189 MostVisitedURLList top_sites_; |
88 | 190 |
89 // The images. These map from canonical url to image. | 191 // The images. These map from canonical url to image. |
90 URLToImagesMap images_; | 192 URLToImagesMap images_; |
91 | 193 |
92 // Generated from the redirects to and from the most visited pages. See | 194 // Generated from the redirects to and from the most visited pages. See |
93 // description above typedef for details. | 195 // description above typedef for details. |
94 CanonicalURLs canonical_urls_; | 196 CanonicalURLs canonical_urls_; |
95 | 197 |
96 DISALLOW_COPY_AND_ASSIGN(TopSitesCache); | 198 DISALLOW_COPY_AND_ASSIGN(TopSitesCache); |
97 }; | 199 }; |
98 | 200 |
99 } // namespace history | 201 } // namespace history |
100 | 202 |
101 #endif // CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ | 203 #endif // CHROME_BROWSER_HISTORY_TOP_SITES_CACHE_H_ |
OLD | NEW |