Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(5894)

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 7549003: Optimize phishing page term feature extraction. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Address Brian's comments Created 9 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « base/string_piece_unittest.cc ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
index 74c9b0b9acec0dbc939a69370caca08331c37bf6..d10b575435f9d11bced1bb5a24cffcf213d50f01 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
@@ -21,7 +21,9 @@
#include "base/basictypes.h"
#include "base/callback_old.h"
#include "base/hash_tables.h"
+#include "base/memory/mru_cache.h"
#include "base/memory/scoped_ptr.h"
+#include "base/string_piece.h"
#include "base/string16.h"
#include "base/task.h"
@@ -92,6 +94,10 @@ class PhishingTermFeatureExtractor {
// before giving up on the current page.
static const int kMaxTotalTimeMs;
+ // The size of the cache that we use to determine if we can avoid
+ // lowercasing, hashing, and UTF conversion.
+ static const int kMaxNegativeWordCacheSize;
+
// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
// until a predefined maximum amount of time has elapsed, then posts a task
// to the current MessageLoop to continue extraction. When extraction
@@ -99,7 +105,7 @@ class PhishingTermFeatureExtractor {
void ExtractFeaturesWithTimeout();
// Handles a single word in the page text.
- void HandleWord(const string16& word);
+ void HandleWord(const base::StringPiece16& word);
// Helper to verify that there is no pending feature extraction. Dies in
// debug builds if the state is not as expected. This is a no-op in release
@@ -125,6 +131,13 @@ class PhishingTermFeatureExtractor {
// The maximum number of words in an n-gram.
size_t max_words_per_term_;
+ // This cache is used to see if we need to check the word at all, as
+ // converting to UTF8, lowercasing, and hashing are all relatively expensive
+ // operations. Though this is called an MRU cache, it behaves like an LRU
+ // cache (i.e. it evicts the least recently used entries first).
+ typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
+ WordCache negative_word_cache_;
+
// Non-owned pointer to our clock.
FeatureExtractorClock* clock_;
« no previous file with comments | « base/string_piece_unittest.cc ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698