Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/browser/extensions/api/declarative/url_component_patterns.h" | |
| 6 | |
| 7 #include "base/string_util.h" | |
| 8 #include "googleurl/src/gurl.h" | |
| 9 | |
| 10 // This class implements a mapping of URL Component Patterns, such as | |
| 11 // host_prefix, host_suffix, host_equals, ..., etc., to SubstringPatterns. | |
| 12 // | |
| 13 // The idea of this mapping is to reduce the problem of comparing many | |
| 14 // URL Component Patterns against one URL to the problem of searching many | |
| 15 // substrings in one string: | |
| 16 // | |
| 17 // ---------------------- -------------------- | |
| 18 // | URL Query operator | ----translate----> | SubstringPattern | | |
| 19 // ---------------------- -------------------- | |
| 20 // ^ | |
| 21 // | | |
| 22 // compare | |
| 23 // | | |
| 24 // v | |
| 25 // ---------------------- -------------------- | |
| 26 // | URL to compare | | | | |
| 27 // | to all URL Query | ----translate----> | String | | |
| 28 // | operators | | | | |
| 29 // ---------------------- -------------------- | |
| 30 // | |
| 31 // The reason for this problem reduction is that there are efficient algorithms | |
| 32 // for searching many substrings in one string (see Aho-Corasick algorithm). | |
| 33 // | |
| 34 // Case 1: {host,path,query}_{prefix,suffix,equals} searches. | |
| 35 // ========================================================== | |
| 36 // | |
| 37 // For searches in this class, we normalize URLs as follows: | |
| 38 // | |
| 39 // Step 1: | |
| 40 // Remove scheme, port and fragment from URL: | |
| 41 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes | |
| 42 // www.example.com/index.html?search=foo | |
| 43 // | |
| 44 // We remove the scheme and port number because they can be checked later | |
| 45 // in a secondary filter step. We remove the fragment (the #... part) because | |
| 46 // it is not guaranteed to be ASCII-7 encoded. | |
|
Matt Perry
2012/02/14 01:38:34
guaranteed*
Matt Perry
2012/02/14 01:38:34
Forgive my ignorance, but are you sure that URLs a
battre
2012/02/14 19:32:21
Done.
battre
2012/02/14 19:32:21
According to GURL::spec():
// Returns the raw s
| |
| 47 // | |
| 48 // Step 2: | |
| 49 // Translate URL to String and add the following position markers: | |
| 50 // - BU = Beginning of URL | |
| 51 // - ED = End of Domain | |
| 52 // - EP = End of Path | |
| 53 // - EU = End of URL | |
| 54 // Furthermore, the hostname is canonicalized to start with a ".". | |
| 55 // | |
| 56 // Position markers are represented as characters >127, which are therefore | |
| 57 // guaranteed not to be part of the ASCII-7 encoded URL character set. | |
| 58 // | |
| 59 // -> www.example.com/index.html?search=foo becomes | |
| 60 // BU .www.example.com ED /index.html EP ?search=foo EU | |
| 61 // | |
| 62 // -> www.example.com/index.html becomes | |
| 63 // BU .www.example.com ED /index.html EP EU | |
| 64 // | |
| 65 // Step 3: | |
| 66 // Translate URL Component Patterns as follows: | |
| 67 // | |
| 68 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix) | |
| 69 // -> host_prefix("www.example") = BU .www.example | |
| 70 // | |
| 71 // host_suffix(suffix) = suffix ED | |
| 72 // -> host_suffix("example.com") = example.com ED | |
| 73 // -> host_suffix(".example.com") = .example.com ED | |
| 74 // | |
| 75 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED | |
| 76 // -> host_equals("www.example.com") = BU .www.example.com ED | |
| 77 // | |
| 78 // | |
| 79 // path_prefix(prefix) = ED prefix | |
| 80 // -> path_prefix("/index.html") = ED /index.html | |
| 81 // | |
| 82 // path_suffix(suffix) = suffix EP | |
| 83 // -> path_suffix("index.html") = index.html EP | |
| 84 // | |
| 85 // path_equals(path) = ED path EP | |
| 86 // -> path_equals("/index.html") = ED /index.html EP | |
|
Matt Perry
2012/02/14 01:38:34
nit: good examples, but IMO less is more. 1 or 2 e
battre
2012/02/14 19:32:21
Done.
| |
| 87 // | |
| 88 // | |
| 89 // [Similarly for query parameters (query_{prefix, suffix, equals})] | |
| 90 // | |
| 91 // With this, we can search the SubstringPatterns in the normalized URL. | |
| 92 // | |
| 93 // | |
| 94 // Case 2: url_{prefix,suffix,equals,contains} searches. | |
| 95 // ===================================================== | |
| 96 // | |
| 97 // Step 1: as above | |
| 98 // | |
| 99 // Step 2: | |
| 100 // Translate URL to String and add the following position markers: | |
| 101 // - BU = Beginning of URL | |
| 102 // - EU = End of URL | |
| 103 // Furthermore, the hostname is canonicalized to start with a ".". | |
| 104 // | |
| 105 // -> www.example.com/index.html?search=foo becomes | |
| 106 // BU .www.example.com/index.html?search=foo EU | |
| 107 // | |
| 108 // url_prefix(prefix) = BU add_missing_dot_prefix(prefix) | |
| 109 // -> url_prefix("www.example") = BU .www.example | |
| 110 // | |
| 111 // url_suffix(suffix) = suffix EU | |
| 112 // -> url_suffix("index.html") = index.html EU | |
| 113 // | |
| 114 // url_contains(substring) = substring | |
| 115 // -> url_contains("index") = index | |
| 116 // | |
| 117 // url_equals(url) = BU add_missing_dot_prefix(url) EU | |
| 118 // -> url_equals("www.example.com/index.html") = | |
| 119 // BU .www.example.com/index.html EU | |
| 120 // | |
| 121 // | |
| 122 // Case 3: {host,path,query}_contains searches. | |
| 123 // ============================================ | |
| 124 // | |
| 125 // These kinds of searches are not supported directly but can be derived | |
| 126 // by a combination of a url_contains() query followed by an explicit test: | |
| 127 // | |
| 128 // host_contains(str) = url_contains(str) followed by test whether str occurs | |
| 129 // in host component of original URL. | |
| 130 // -> host_contains("example.co") = example.co | |
| 131 // followed by gurl.host().find("example.co"); | |
| 132 // | |
| 133 // [similarly for path_contains and query_contains]. | |
| 134 | |
| 135 namespace { | |
| 136 // These are symbols that are not contained in 7-bit ASCII used in GURLs. | |
| // BU marker (byte value 128). Spelled as a string literal because {128, 0} | |
| // narrows int to char, which list-initialization rejects where char is | |
| // signed; const because the marker is only ever read. | |
| 137 const char BEGINNING_OF_URL[] = "\x80"; | |
|
Matt Perry
2012/02/14 01:38:34
style: const char kBeginningOfURL[] = {128, 0};
an
battre
2012/02/14 19:32:21
Done.
| |
| // ED, EP and EU markers (byte values 129, 130 and 131). Spelled as string | |
| // literals because {129, 0} etc. narrow int to char, which | |
| // list-initialization rejects where char is signed; const because the | |
| // markers are only ever read. | |
| 138 const char END_OF_DOMAIN[] = "\x81"; | |
| 139 const char END_OF_PATH[] = "\x82"; | |
| 140 const char END_OF_URL[] = "\x83"; | |
| 141 } // namespace | |
| 142 | |
| 143 namespace extensions { | |
| 144 | |
| // Initializes the ID counter to 0, so the first SubstringPattern created by | |
| // CreateSingletonPattern() receives ID 0. | |
| 145 UrlComponentPatterns::UrlComponentPatterns() : id_counter_(0) {} | |
| 146 | |
| // Builds the string that host/path/query patterns are searched in: | |
| //   BU <host with leading "."> ED <path> EP [?<query>] EU | |
| // The "?" and the query are only emitted when the URL has a query. | |
| 147 std::string UrlComponentPatterns::CanonlicalizeURLForComponentSearches( | |
| 148 const GURL& url) { | |
| 149 const std::string query = url.has_query() ? "?" + url.query() : ""; | |
| 150 return BEGINNING_OF_URL + CanonicalizeHostname(url.host()) + END_OF_DOMAIN + | |
| 151 url.path() + END_OF_PATH + query + END_OF_URL; | |
| 152 } | |
| 153 | |
| // host_prefix: BU followed by the prefix, which gets a leading "." if it | |
| // does not already have one. | |
| 154 SubstringPattern UrlComponentPatterns::CreateHostPrefixPattern( | |
| 155 const std::string& prefix) { | |
| 156 const std::string anchored = BEGINNING_OF_URL + CanonicalizeHostname(prefix); | |
| 157 return CreateSingletonPattern(anchored); | |
| 158 } | |
| 159 | |
| // host_suffix: the suffix exactly as given, followed by ED. | |
| 160 SubstringPattern UrlComponentPatterns::CreateHostSuffixPattern( | |
| 161 const std::string& suffix) { | |
| 162 return CreateSingletonPattern(std::string(suffix).append(END_OF_DOMAIN)); | |
| 163 } | |
| 164 | |
| // host_equals: BU, the host with a leading "." ensured, then ED. | |
| 165 SubstringPattern UrlComponentPatterns::CreateHostEqualsPattern( | |
| 166 const std::string& str) { | |
| 167 std::string exact_host = BEGINNING_OF_URL + CanonicalizeHostname(str); | |
| 168 return CreateSingletonPattern(exact_host.append(END_OF_DOMAIN)); | |
| 169 } | |
| 170 | |
| // path_prefix: ED followed by the prefix exactly as given. | |
| 171 SubstringPattern UrlComponentPatterns::CreatePathPrefixPattern( | |
| 172 const std::string& prefix) { | |
| 173 return CreateSingletonPattern(std::string(END_OF_DOMAIN).append(prefix)); | |
| 174 } | |
| 175 | |
| // path_suffix: the suffix exactly as given, followed by EP. | |
| 176 SubstringPattern UrlComponentPatterns::CreatePathSuffixPattern( | |
| 177 const std::string& suffix) { | |
| 178 return CreateSingletonPattern(std::string(suffix).append(END_OF_PATH)); | |
| 179 } | |
| 180 | |
| // path_equals: the path exactly as given, enclosed between ED and EP. | |
| 181 SubstringPattern UrlComponentPatterns::CreatePathEqualsPattern( | |
| 182 const std::string& str) { | |
| 183 return CreateSingletonPattern( | |
| 184 std::string(END_OF_DOMAIN).append(str).append(END_OF_PATH)); | |
| 185 } | |
| 185 | |
| // query_prefix: EP followed by the prefix exactly as given. | |
| 186 SubstringPattern UrlComponentPatterns::CreateQueryPrefixPattern( | |
| 187 const std::string& prefix) { | |
| 188 return CreateSingletonPattern(std::string(END_OF_PATH).append(prefix)); | |
| 189 } | |
| 190 | |
| // query_suffix: the suffix exactly as given, followed by EU. | |
| 191 SubstringPattern UrlComponentPatterns::CreateQuerySuffixPattern( | |
| 192 const std::string& suffix) { | |
| 193 return CreateSingletonPattern(std::string(suffix).append(END_OF_URL)); | |
| 194 } | |
| 195 | |
| // query_equals: the query exactly as given, enclosed between EP and EU. | |
| 196 SubstringPattern UrlComponentPatterns::CreateQueryEqualsPattern( | |
| 197 const std::string& str) { | |
| 198 return CreateSingletonPattern( | |
| 199 std::string(END_OF_PATH).append(str).append(END_OF_URL)); | |
| 200 } | |
| 200 | |
| // Combined pattern: host suffix, then ED, then path prefix, so one substring | |
| // covers the end of the host and the start of the path. | |
| 201 SubstringPattern UrlComponentPatterns::CreateHostSuffixPathPrefixPattern( | |
| 202 const std::string& host_suffix, | |
| 203 const std::string& path_prefix) { | |
| 204 std::string spanning = host_suffix; | |
| 205 return CreateSingletonPattern(spanning.append(END_OF_DOMAIN).append(path_prefix)); | |
| 206 } | |
| 206 | |
| // Builds the string that url_{prefix,suffix,equals,contains} patterns are | |
| // searched in: BU <host with leading "."> <path> [?<query>] EU | |
| // The "?" and the query are only emitted when the URL has a query. | |
| 207 std::string UrlComponentPatterns::CanonlicalizeURLForFullSearches( | |
| 208 const GURL& url) { | |
| 209 const std::string query = url.has_query() ? "?" + url.query() : ""; | |
| 210 return BEGINNING_OF_URL + CanonicalizeHostname(url.host()) + url.path() + | |
| 211 query + END_OF_URL; | |
| 212 } | |
| 212 | |
| // url_prefix: BU followed by the prefix, which gets a leading "." if it | |
| // does not already have one. | |
| 213 SubstringPattern UrlComponentPatterns::CreateURLPrefixPattern( | |
| 214 const std::string& prefix) { | |
| 215 const std::string needle = BEGINNING_OF_URL + CanonicalizeHostname(prefix); | |
| 216 return CreateSingletonPattern(needle); | |
| 217 } | |
| 218 | |
| // url_suffix: the suffix exactly as given, followed by EU. | |
| 219 SubstringPattern UrlComponentPatterns::CreateURLSuffixPattern( | |
| 220 const std::string& suffix) { | |
| 221 return CreateSingletonPattern(std::string(suffix).append(END_OF_URL)); | |
| 222 } | |
| 223 | |
| // url_contains: the substring is registered as-is, with no position marker | |
| // added in front of or behind it. | |
| 224 SubstringPattern UrlComponentPatterns::CreateURLContainsPattern( | |
| 225 const std::string& str) { | |
| 226 return CreateSingletonPattern(str); | |
| 227 } | |
| 228 | |
| // url_equals: BU, the URL string with a leading "." ensured, then EU. | |
| 229 SubstringPattern UrlComponentPatterns::CreateURLEqualsPattern( | |
| 230 const std::string& str) { | |
| 231 std::string whole_url = BEGINNING_OF_URL + CanonicalizeHostname(str); | |
| 232 return CreateSingletonPattern(whole_url.append(END_OF_URL)); | |
| 233 } | |
| 234 | |
| // Returns the SubstringPattern registered for the given string. If none is | |
| // registered yet, one is created with the next value of id_counter_ and | |
| // stored, so repeated requests for an equal string return the same pattern. | |
| 235 SubstringPattern UrlComponentPatterns::CreateSingletonPattern( | |
| 236 const std::string& pattern) { | |
| // First request for this string: register it under the next ID. | |
| 237 if (pattern_singletons_.count(pattern) == 0) { | |
| 238 pattern_singletons_[pattern] = SubstringPattern(pattern, id_counter_++); | |
| 239 } | |
| 240 return pattern_singletons_[pattern]; | |
| 241 } | |
| 244 | |
| // Returns the hostname unchanged if it already starts with ".", otherwise | |
| // returns it with a "." prepended (an empty hostname therefore yields "."). | |
| 245 std::string UrlComponentPatterns::CanonicalizeHostname( | |
| 246 const std::string hostname) const { | |
| 247 const bool has_leading_dot = !hostname.empty() && hostname[0] == '.'; | |
|
Matt Perry
2012/02/14 01:38:34
nit: for this simple test, I'd just check hostname
battre
2012/02/14 19:32:21
Done.
| |
| 248 return has_leading_dot ? hostname : "." + hostname; | |
| 249 } | |
| 252 | |
| // Removes the singleton registered under this pattern's string, if there is | |
| // one. id_counter_ is not touched, so the pattern's ID is not handed out again. | |
| 253 void UrlComponentPatterns::DestroySingletonPattern( | |
| 254 const SubstringPattern& pattern) { | |
| 255 const std::string& key = pattern.pattern(); | |
| 256 pattern_singletons_.erase(key); | |
| 257 } | |
| 257 | |
| // Calls DestroySingletonPattern() on every element of the vector, in order. | |
| 258 void UrlComponentPatterns::DestroySingletonPatterns( | |
| 259 const std::vector<SubstringPattern>& patterns) { | |
| 260 for (size_t index = 0; index < patterns.size(); ++index) | |
| 261 DestroySingletonPattern(patterns[index]); | |
| 262 } | |
| 265 | |
| 266 } // namespace extensions | |
| OLD | NEW |