Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(977)

Side by Side Diff: chrome/browser/extensions/api/declarative/url_matcher.cc

Issue 9390018: Implementation of a Matching strategy for URLs in the Declarative WebRequest API. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Refactored for memory improvements Created 8 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/extensions/api/declarative/url_matcher.h"
6
7 #include <algorithm>
8
9 namespace extensions {
10
11 // This set of classes implement a mapping of URL Component Patterns, such as
12 // host_prefix, host_suffix, host_equals, ..., etc., to SubstringPatterns.
13 //
14 // The idea of this mapping is to reduce the problem of comparing many
15 // URL Component Patterns against one URL to the problem of searching many
16 // substrings in one string:
17 //
18 // ---------------------- --------------------
19 // | URL Query operator | ----translate----> | SubstringPattern |
20 // ---------------------- --------------------
21 // ^
22 // |
23 // compare
24 // |
25 // v
26 // ---------------------- --------------------
27 // | URL to compare | | |
28 // | to all URL Query | ----translate----> | String |
29 // | operators | | |
30 // ---------------------- --------------------
31 //
32 // The reason for this problem reduction is that there are efficient algorithms
33 // for searching many substrings in one string (see Aho-Corasick algorithm).
34 //
35 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
36 // ==========================================================
37 //
38 // For searches in this class, we normalize URLs as follows:
39 //
40 // Step 1:
41 // Remove scheme, port and segment from URL:
42 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
43 // www.example.com/index.html?search=foo
44 //
45 // We remove the scheme and port number because they can be checked later
46 // in a secondary filter step. We remove the segment (the #... part) because
47 // this is not guaranteed to be ASCII-7 encoded.
48 //
49 // Step 2:
50 // Translate URL to String and add the following position markers:
51 // - BU = Beginning of URL
52 // - ED = End of Domain
53 // - EP = End of Path
54 // - EU = End of URL
55 // Furthermore, the hostname is canonicalized to start with a ".".
56 //
57 // Position markers are represented as characters >127, which are therefore
58 // guaranteed not to be part of the ASCII-7 encoded URL character set.
59 //
60 // -> www.example.com/index.html?search=foo becomes
61 // BU .www.example.com ED /index.html EP ?search=foo EU
62 //
63 // -> www.example.com/index.html becomes
64 // BU .www.example.com ED /index.html EP EU
65 //
66 // Step 3:
67 // Translate URL Component Patterns as follows:
68 //
69 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
70 // -> host_prefix("www.example") = BU .www.example
71 //
72 // host_suffix(suffix) = suffix ED
73 // -> host_suffix("example.com") = example.com ED
74 // -> host_suffix(".example.com") = .example.com ED
75 //
76 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
77 // -> host_equals("www.example.com") = BU .www.example.com ED
78 //
79 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
80 //
81 // With this, we can search the SubstringPatterns in the normalized URL.
82 //
83 //
84 // Case 2: url_{prefix,suffix,equals,contains} searches.
85 // =====================================================
86 //
87 // Step 1: as above
88 //
89 // Step 2:
90 // Translate URL to String and add the following position markers:
91 // - BU = Beginning of URL
92 // - EU = End of URL
93 // Furthermore, the hostname is canonicalized to start with a ".".
94 //
95 // -> www.example.com/index.html?search=foo becomes
96 // BU .www.example.com/index.html?search=foo EU
97 //
98 // url_prefix(prefix) = BU add_missing_dot_prefix(prefix)
99 // -> url_prefix("www.example") = BU .www.example
100 //
101 // url_contains(substring) = substring
102 // -> url_contains("index") = index
103 //
104 //
105 // Case 3: {host,path,query}_contains searches.
106 // ============================================
107 //
108 // These kinds of searches are not supported directly but can be derived
109 // by a combination of a url_contains() query followed by an explicit test:
110 //
111 // host_contains(str) = url_contains(str) followed by test whether str occurs
112 // in host comonent of original URL.
113 // -> host_contains("example.co") = example.co
114 // followed by gurl.host().find("example.co");
115 //
116 // [similarly for path_contains and query_contains].
117
118
119 //
120 // URLMatcherCondition
121 //
122
123 URLMatcherCondition::URLMatcherCondition(
124 Criterion criterion,
125 const SubstringPattern* substring_pattern)
126 : criterion_(criterion),
127 substring_pattern_(substring_pattern) {
128 DCHECK(substring_pattern_);
129 }
130
131 bool URLMatcherCondition::IsFullUrlCondition() const {
132 switch (criterion_) {
133 case extensions::URLMatcherCondition::HOST_CONTAINS:
134 case extensions::URLMatcherCondition::PATH_CONTAINS:
135 case extensions::URLMatcherCondition::QUERY_CONTAINS:
136 case extensions::URLMatcherCondition::URL_PREFIX:
137 case extensions::URLMatcherCondition::URL_SUFFIX:
138 case extensions::URLMatcherCondition::URL_CONTAINS:
139 case extensions::URLMatcherCondition::URL_EQUALS:
140 return true;
141 default:
142 break;
143 }
144 return false;
145 }
146
147 bool URLMatcherCondition::IsMatch(
148 const std::set<SubstringPattern::ID>& matching_substring_patterns,
149 const GURL& url) const {
150 if (matching_substring_patterns.find(substring_pattern_->id()) ==
151 matching_substring_patterns.end())
152 return false;
153 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
154 // a substring match on the raw URL. In case of a match, we need to verify
155 // that the match was found in the correct component of the URL.
156 switch (criterion_) {
157 case HOST_CONTAINS:
158 return url.host().find(substring_pattern_->pattern()) !=
159 std::string::npos;
160 case PATH_CONTAINS:
161 return url.path().find(substring_pattern_->pattern()) !=
162 std::string::npos;
163 case QUERY_CONTAINS:
164 return url.query().find(substring_pattern_->pattern()) !=
165 std::string::npos;
166 default:
167 break;
168 }
169 return true;
170 }
171
172 //
173 // URLMatcherConditionFactory
174 //
175
176 namespace {
177 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
178 char kBeginningOfURL[] = {128, 0};
179 char kEndOfDomain[] = {129, 0};
180 char kEndOfPath[] = {130, 0};
181 char kEndOfURL[] = {131, 0};
182 } // namespace
183
184 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
185
186 std::string URLMatcherConditionFactory::CanonlicalizeURLForComponentSearches(
187 const GURL& url) {
188 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
189 url.path() + kEndOfPath + (url.has_query() ? "?" + url.query() : "") +
190 kEndOfURL;
191 }
192
193 scoped_ptr<URLMatcherCondition>
194 URLMatcherConditionFactory::CreateHostPrefixCondition(
195 const std::string& prefix) {
196 return CreateCondition(URLMatcherCondition::HOST_PREFIX,
197 kBeginningOfURL + CanonicalizeHostname(prefix));
198 }
199
200 scoped_ptr<URLMatcherCondition>
201 URLMatcherConditionFactory::CreateHostSuffixCondition(
202 const std::string& suffix) {
203 return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
204 suffix + kEndOfDomain);
205 }
206
207 scoped_ptr<URLMatcherCondition>
208 URLMatcherConditionFactory::CreateHostContainsCondition(
209 const std::string& str) {
210 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
211 }
212
213 scoped_ptr<URLMatcherCondition>
214 URLMatcherConditionFactory::CreateHostEqualsCondition(
215 const std::string& str) {
216 return CreateCondition(URLMatcherCondition::HOST_EQUALS,
217 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
218 }
219
220 scoped_ptr<URLMatcherCondition>
221 URLMatcherConditionFactory::CreatePathPrefixCondition(
222 const std::string& prefix) {
223 return CreateCondition(URLMatcherCondition::PATH_PREFIX,
224 kEndOfDomain + prefix);
225 }
226
227 scoped_ptr<URLMatcherCondition>
228 URLMatcherConditionFactory::CreatePathSuffixCondition(
229 const std::string& suffix) {
230 return CreateCondition(URLMatcherCondition::HOST_SUFFIX, suffix + kEndOfPath);
231 }
232
233 scoped_ptr<URLMatcherCondition>
234 URLMatcherConditionFactory::CreatePathContainsCondition(
235 const std::string& str) {
236 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
237 }
238
239 scoped_ptr<URLMatcherCondition>
240 URLMatcherConditionFactory::CreatePathEqualsCondition(
241 const std::string& str) {
242 return CreateCondition(URLMatcherCondition::PATH_EQUALS,
243 kEndOfDomain + str + kEndOfPath);
244 }
245
246 scoped_ptr<URLMatcherCondition>
247 URLMatcherConditionFactory::CreateQueryPrefixCondition(
248 const std::string& prefix) {
249 return CreateCondition(URLMatcherCondition::QUERY_PREFIX,
250 kEndOfPath + prefix);
251 }
252
253 scoped_ptr<URLMatcherCondition>
254 URLMatcherConditionFactory::CreateQuerySuffixCondition(
255 const std::string& suffix) {
256 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, suffix + kEndOfURL);
257 }
258
259 scoped_ptr<URLMatcherCondition>
260 URLMatcherConditionFactory::CreateQueryContainsCondition(
261 const std::string& str) {
262 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
263 }
264
265 scoped_ptr<URLMatcherCondition>
266 URLMatcherConditionFactory::CreateQueryEqualsCondition(
267 const std::string& str) {
268 return CreateCondition(URLMatcherCondition::QUERY_EQUALS,
269 kEndOfPath + str + kEndOfURL);
270 }
271
272 scoped_ptr<URLMatcherCondition>
273 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
274 const std::string& host_suffix,
275 const std::string& path_prefix) {
276 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
277 host_suffix + kEndOfDomain + path_prefix);
278 }
279
280 std::string URLMatcherConditionFactory::CanonlicalizeURLForFullSearches(
281 const GURL& url) {
282 return kBeginningOfURL + CanonicalizeHostname(url.host()) + url.path() +
283 (url.has_query() ? "?" + url.query() : "") + kEndOfURL;
284 }
285
286 scoped_ptr<URLMatcherCondition>
287 URLMatcherConditionFactory::CreateURLPrefixCondition(
288 const std::string& prefix) {
289 return CreateCondition(URLMatcherCondition::URL_PREFIX,
290 kBeginningOfURL + CanonicalizeHostname(prefix));
291 }
292
293 scoped_ptr<URLMatcherCondition>
294 URLMatcherConditionFactory::CreateURLSuffixCondition(
295 const std::string& suffix) {
296 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
297 }
298
299 scoped_ptr<URLMatcherCondition>
300 URLMatcherConditionFactory::CreateURLContainsCondition(
301 const std::string& str) {
302 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
303 }
304
305 scoped_ptr<URLMatcherCondition>
306 URLMatcherConditionFactory::CreateURLEqualsCondition(
307 const std::string& str) {
308 return CreateCondition(URLMatcherCondition::QUERY_EQUALS,
309 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfURL);
310 }
311
312 void URLMatcherConditionFactory::ForgetUnusedPatterns(
313 const std::set<SubstringPattern::ID>& used_patterns) {
314 PatternSingletons::iterator i = pattern_singletons_.begin();
315 while (i != pattern_singletons_.end()) {
316 if (used_patterns.find(i->second->id()) != used_patterns.end()) {
317 ++i;
318 } else {
319 pattern_singletons_.erase(i++);
320 }
321 }
322 }
323
324 scoped_ptr<URLMatcherCondition>
325 URLMatcherConditionFactory::CreateCondition(
326 URLMatcherCondition::Criterion criterion,
327 const std::string& pattern) {
328 PatternSingletons::const_iterator iter = pattern_singletons_.find(pattern);
329 if (iter == pattern_singletons_.end()) {
330 pattern_singletons_[pattern] =
331 make_linked_ptr(new SubstringPattern(pattern, id_counter_++));
332 }
333
334 return scoped_ptr<URLMatcherCondition>(
335 new URLMatcherCondition(criterion, pattern_singletons_[pattern].get()));
336 }
337
338 std::string URLMatcherConditionFactory::CanonicalizeHostname(
339 const std::string& hostname) const {
340 if (!hostname.empty() && hostname[0] == '.')
341 return hostname;
342 else
343 return "." + hostname;
344 }
345
346
347 //
348 // URLMatcherConditionSet
349 //
350
351 URLMatcherConditionSet::URLMatcherConditionSet(
352 ID id,
353 scoped_ptr<Conditions> conditions)
354 : id_(id) {
355 conditions_.swap(*conditions);
356 }
357
358 bool URLMatcherConditionSet::IsMatch(
359 const std::set<SubstringPattern::ID>& matching_substring_patterns,
360 const GURL& url) const {
361 for (Conditions::const_iterator i = conditions_.begin();
362 i != conditions_.end(); ++i) {
363 if (!(*i)->IsMatch(matching_substring_patterns, url))
364 return false;
365 }
366 return true;
367 }
368
369 //
370 // URLMatcher
371 //
372
373 URLMatcher::URLMatcher() {}
374
375 void URLMatcher::AddConditionSets(
376 scoped_ptr<ScopedVector<const URLMatcherConditionSet> > condition_sets) {
377 std::vector<const URLMatcherConditionSet*> released_condition_sets;
378 condition_sets->release(&released_condition_sets);
379 for (std::vector<const URLMatcherConditionSet*>::const_iterator i =
380 released_condition_sets.begin(); i != released_condition_sets.end();
381 ++i) {
382 DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
383 url_matcher_condition_sets_.end());
384 url_matcher_condition_sets_[(*i)->id()] = make_linked_ptr(*i);
385 }
386 UpdateInternalDatastructures();
387 }
388
389 void URLMatcher::RemoveConditionSets(
390 const std::vector<URLMatcherConditionSet::ID>& condition_ids) {
391 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
392 condition_ids.begin(); i != condition_ids.end(); ++i) {
393 DCHECK(url_matcher_condition_sets_.find(*i) !=
394 url_matcher_condition_sets_.end());
395 url_matcher_condition_sets_.erase(*i);
396 }
397 UpdateInternalDatastructures();
398 }
399
400
401 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchUrl(const GURL& url) {
402 // See URLMatcherConditionFactory for the canonicalization of URLs and the
403 // distinction between full url searches and url component searches.
404
405 // Perform all full url searches.
406 std::string full_url =
407 condition_factory_.CanonlicalizeURLForFullSearches(url);
408 std::set<SubstringPattern::ID> full_url_matches;
409 full_url_matcher_.Match(full_url, &full_url_matches);
410
411 // Perform all url component searches.
412 std::string component_url =
413 condition_factory_.CanonlicalizeURLForComponentSearches(url);
414 std::set<SubstringPattern::ID> component_url_matches;
415 url_component_matcher_.Match(component_url, &component_url_matches);
416
417 // Build the union of both result sets.
418 std::set<SubstringPattern::ID> matches;
419 matches.swap(full_url_matches);
420 matches.insert(component_url_matches.begin(), component_url_matches.end());
421
422 // Calculate all URLMatcherConditionSet for which all URLMatcherConditions
423 // were fulfilled.
424 std::set<URLMatcherConditionSet::ID> result;
425 for (std::set<SubstringPattern::ID>::const_iterator i = matches.begin();
426 i != matches.end(); ++i) {
427 // For each URLMatcherConditionSet there is exactly one condition
428 // registered in substring_match_triggers_. This means that the following
429 // logic tests each URLMatcherConditionSet exactly once if it can be
430 // completely fulfilled.
431 std::set<URLMatcherConditionSet::ID>& condition_sets =
432 substring_match_triggers_[*i];
433 for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
434 condition_sets.begin(); j != condition_sets.end(); ++j) {
435 if (url_matcher_condition_sets_[*j]->IsMatch(matches, url))
436 result.insert(url_matcher_condition_sets_[*j]->id());
437 }
438 }
439
440 return result;
441 }
442
443 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
444 // The purpose of |full_url_conditions| is just that we need to execute
445 // the same logic once for Full URL searches and once for URL Component
446 // searches (see URLMatcherConditionFactory).
447
448 // Determine which patterns need to be registered when this function
449 // terminates.
450 std::set<const SubstringPattern*> new_patterns;
451 for (URLMatcherConditionSets::const_iterator condition_set_iter =
452 url_matcher_condition_sets_.begin();
453 condition_set_iter != url_matcher_condition_sets_.end();
454 ++condition_set_iter) {
455 const URLMatcherConditionSet::Conditions& conditions =
456 condition_set_iter->second->conditions();
457 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
458 conditions.begin(); condition_iter != conditions.end();
459 ++condition_iter) {
460 // If we are called to process Full URL searches, ignore all others,
461 // and vice versa.
462 if (full_url_conditions == (*condition_iter)->IsFullUrlCondition())
463 new_patterns.insert((*condition_iter)->substring_pattern());
464 }
465 }
466
467 // This is the set of patterns that were registered before this function
468 // is called.
469 std::set<const SubstringPattern*>& registered_patterns =
470 full_url_conditions ? registered_full_url_patterns_
471 : registered_url_component_patterns_;
472
473 // Add all patterns that are in new_patterns but not in registered_patterns.
474 std::vector<const SubstringPattern*> patterns_to_register;
475 std::set_difference(
476 new_patterns.begin(), new_patterns.end(),
477 registered_patterns.begin(), registered_patterns.end(),
478 std::back_inserter(patterns_to_register));
479
480 // Remove all patterns that are in registered_patterns but not in
481 // new_patterns.
482 std::vector<const SubstringPattern*> patterns_to_unregister;
483 std::set_difference(
484 registered_patterns.begin(), registered_patterns.end(),
485 new_patterns.begin(), new_patterns.end(),
486 std::back_inserter(patterns_to_unregister));
487
488 // Update the SubstringSetMatcher.
489 SubstringSetMatcher& url_matcher =
490 full_url_conditions ? full_url_matcher_ : url_component_matcher_;
491 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
492 patterns_to_unregister);
493
494 // Update the set of registered_patterns for the next time this function
495 // is being called.
496 registered_patterns.swap(new_patterns);
497 }
498
499 void URLMatcher::UpdateTriggers() {
500 // Count substring pattern frequencies.
501 std::map<SubstringPattern::ID, size_t> substring_pattern_frequencies;
502 for (URLMatcherConditionSets::const_iterator condition_set_iter =
503 url_matcher_condition_sets_.begin();
504 condition_set_iter != url_matcher_condition_sets_.end();
505 ++condition_set_iter) {
506 const URLMatcherConditionSet::Conditions& conditions =
507 condition_set_iter->second->conditions();
508 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
509 conditions.begin(); condition_iter != conditions.end();
510 ++condition_iter) {
511 const SubstringPattern* pattern = (*condition_iter)->substring_pattern();
512 substring_pattern_frequencies[pattern->id()]++;
513 }
514 }
515
516 // Update trigger conditions: Determine for each URLMatcherConditionSet which
517 // URLMatcherCondition occurs least frequently in this URLMatcher. We assume
518 // that this condition is very specific and occurs rarely in URLs. If a match
519 // occurs for this URLMatcherCondition, we want to test all other
520 // URLMatcherCondition in the respective URLMatcherConditionSet as well to
521 // see whether the entire URLMatcherConditionSet is considered matching.
522 substring_match_triggers_.clear();
523 for (URLMatcherConditionSets::const_iterator condition_set_iter =
524 url_matcher_condition_sets_.begin();
525 condition_set_iter != url_matcher_condition_sets_.end();
526 ++condition_set_iter) {
527 const URLMatcherConditionSet::Conditions& conditions =
528 condition_set_iter->second->conditions();
529 if (conditions.empty())
530 continue;
531 SubstringPattern::ID trigger = conditions[0]->substring_pattern()->id();
532 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
533 conditions.begin() + 1; condition_iter != conditions.end();
534 ++condition_iter) {
535 SubstringPattern::ID current_id =
536 (*condition_iter)->substring_pattern()->id();
537 if (substring_pattern_frequencies[trigger] >
538 substring_pattern_frequencies[current_id]) {
539 trigger = current_id;
540 }
541 }
542 substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
543 }
544 }
545
546 void URLMatcher::UpdateConditionFactory() {
547 std::set<SubstringPattern::ID> used_patterns;
548 for (URLMatcherConditionSets::const_iterator condition_set_iter =
549 url_matcher_condition_sets_.begin();
550 condition_set_iter != url_matcher_condition_sets_.end();
551 ++condition_set_iter) {
552 const URLMatcherConditionSet::Conditions& conditions =
553 condition_set_iter->second->conditions();
554 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
555 conditions.begin(); condition_iter != conditions.end();
556 ++condition_iter) {
557 used_patterns.insert((*condition_iter)->substring_pattern()->id());
558 }
559 }
560 condition_factory_.ForgetUnusedPatterns(used_patterns);
561 }
562
563 void URLMatcher::UpdateInternalDatastructures() {
564 UpdateSubstringSetMatcher(false);
565 UpdateSubstringSetMatcher(true);
566 UpdateTriggers();
567 UpdateConditionFactory();
568 }
569
570 } // namespace extensions
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698