OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 #ifndef CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_ |
| 5 #define CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_ |
| 6 |
| 7 #include "googleurl/src/gurl.h" |
| 8 |
| 9 // A pattern that can be used to match URLs. A URLPattern is a very restricted |
| 10 // subset of URL syntax: |
| 11 // |
| 12 // <url-pattern> := <scheme>://<host><path> |
| 13 // <scheme> := 'http' | 'https' | 'file' | 'ftp' | 'chrome-ui' |
| 14 // <host> := '*' | '*.' <anychar except '/' and '*'>+ | <anychar except '/' and '*'>+ |
| 15 // <path> := '/' <any chars> |
| 16 // |
| 17 // * Host is not used when the scheme is 'file'. |
| 18 // * The path can have embedded '*' characters which act as glob wildcards. |
| 19 // |
| 20 // Examples of valid patterns: |
| 21 // - http://*/* |
| 22 // - http://*/foo* |
| 23 // - https://*.google.com/foo*bar |
| 24 // - chrome-ui://foo/bar |
| 25 // - file://monkey* |
| 26 // - http://127.0.0.1/* |
| 27 // |
| 28 // Examples of invalid patterns: |
| 29 // - http://* -- path not specified |
| 30 // - http://*foo/bar -- * not allowed as substring of host component |
| 31 // - http://foo.*.bar/baz -- * must be first component |
| 32 // - http:/bar -- scheme separator not found |
| 33 // - foo://* -- invalid scheme |
| 34 // |
| 35 // Design rationale: |
| 36 // * We need to be able to tell users what 'sites' a given URLPattern will |
| 37 // affect. For example "This extension will interact with the site |
| 38 // 'www.google.com'." |
| 39 // * We'd like to be able to convert as many existing Greasemonkey @include |
| 40 // patterns to URLPatterns as possible. Greasemonkey @include patterns are |
| 41 // simple globs, so this won't be perfect. |
| 42 // * Although we would like to support any scheme, it isn't clear what to tell |
| 43 // users about URLPatterns that affect data or javascript URLs, and saying |
| 44 // something useful about chrome-extension URLs is more work, so those are |
| 45 // left out for now. |
| 46 // |
| 47 // From a 2008-ish crawl of userscripts.org, the following patterns were found |
| 48 // in @include lines: |
| 49 // - total lines : 24271 |
| 50 // - @include * : 919 |
| 51 // - @include http://[^\*]+?/ : 11128 (no star in host) |
| 52 // - @include http://\*\.[^\*]+?/ : 2325 (host prefixed by *.) |
| 53 // - @include http://\*[^\.][^\*]+?/: 1524 (host prefixed by *, no dot -- many |
| 54 // appear to only need subdomain |
| 55 // matching, not real prefix matching) |
| 56 // - @include http://[^\*/]+\*/ : 320 (host suffixed by *) |
| 57 // - @include contains .tld : 297 (host suffixed by .tld -- a special |
| 58 // Greasemonkey domain component that |
| 59 // tries to match all valid registry- |
| 60 // controlled suffixes) |
| 61 // - @include http://\*/ : 228 (host is * exactly, but there is |
| 62 // more to the pattern) |
| 63 // |
| 64 // So, we can support at least half of current @include lines without supporting |
| 65 // subdomain matching. We can pick up at least another 10% by supporting |
| 66 // subdomain matching. It is probably possible to coerce more of the existing |
| 67 // patterns to URLPattern, but the resulting pattern will be more restrictive |
| 68 // than the original glob, which is probably better than nothing. |
class URLPattern {
 public:
  // Creates an empty pattern: every string component is empty and subdomain
  // matching is disabled. Call Parse() to populate the instance.
  URLPattern() : match_subdomains_(false) {}

  // Initializes this instance by parsing the provided string. On failure, the
  // instance will have some intermediate values and is in an invalid state, so
  // the accessors below should only be trusted after a successful parse.
  bool Parse(const std::string& pattern_str);

  // Returns true if this instance matches the specified URL.
  // NOTE: not const -- presumably because |path_escaped_| is filled in lazily
  // the first time a match is attempted.
  bool MatchesUrl(const GURL& url);

  // Gets the scheme the pattern matches. This is a valid scheme only once
  // Parse() has succeeded on this instance.
  //
  // Like host() and path(), this returns a reference to the stored member
  // rather than a copy. The reference stays valid for the lifetime of this
  // object; take a copy if the value must survive a later Parse() call.
  const std::string& scheme() const { return scheme_; }

  // Gets the host the pattern matches. This can be an empty string if the
  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
  const std::string& host() const { return host_; }

  // Gets whether to match subdomains of host().
  bool match_subdomains() const { return match_subdomains_; }

  // Gets the path the pattern matches with the leading slash. This can have
  // embedded asterisks which are interpreted using glob rules.
  const std::string& path() const { return path_; }

 private:
  // Returns true if |test| matches our host.
  bool MatchesHost(const GURL& test);

  // Returns true if |test| matches our path.
  bool MatchesPath(const GURL& test);

  // The scheme for the pattern.
  std::string scheme_;

  // The host without any leading "*" components.
  std::string host_;

  // Whether we should match subdomains of the host. This is true if the first
  // component of the pattern's host was "*".
  bool match_subdomains_;

  // The path to match. This is everything after the host of the URL, or
  // everything after the scheme in the case of file:// URLs.
  std::string path_;

  // The path with "?" and "\" characters escaped for use with the
  // MatchPattern() function. This is populated lazily, the first time it is
  // needed.
  std::string path_escaped_;
};
| 121 |
| 122 #endif  // CHROME_BROWSER_EXTENSIONS_MATCH_PATTERN_H_ |
OLD | NEW |