Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
index 74c9b0b9acec0dbc939a69370caca08331c37bf6..d10b575435f9d11bced1bb5a24cffcf213d50f01 100644 |
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
@@ -21,7 +21,9 @@ |
#include "base/basictypes.h" |
#include "base/callback_old.h" |
#include "base/hash_tables.h" |
+#include "base/memory/mru_cache.h" |
#include "base/memory/scoped_ptr.h" |
+#include "base/string_piece.h" |
#include "base/string16.h" |
#include "base/task.h" |
@@ -92,6 +94,10 @@ class PhishingTermFeatureExtractor { |
// before giving up on the current page. |
static const int kMaxTotalTimeMs; |
+ // The maximum size of the cache that we use to determine if we can skip |
+ // lowercasing, hashing, and UTF conversion of a word. |
+ static const int kMaxNegativeWordCacheSize; |
+ |
// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
// until a predefined maximum amount of time has elapsed, then posts a task |
// to the current MessageLoop to continue extraction. When extraction |
@@ -99,7 +105,7 @@ class PhishingTermFeatureExtractor { |
void ExtractFeaturesWithTimeout(); |
// Handles a single word in the page text. |
- void HandleWord(const string16& word); |
+ void HandleWord(const base::StringPiece16& word); |
// Helper to verify that there is no pending feature extraction. Dies in |
// debug builds if the state is not as expected. This is a no-op in release |
@@ -125,6 +131,13 @@ class PhishingTermFeatureExtractor { |
// The maximum number of words in an n-gram. |
size_t max_words_per_term_; |
+ // This cache is used to see if we need to check the word at all, as |
+ // converting to UTF8, lowercasing, and hashing are all relatively expensive |
+ // operations. Although the class is named an MRU cache, it evicts the least |
+ // recently used entries first, i.e. it behaves like an LRU cache. |
+ typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; |
+ WordCache negative_word_cache_; |
+ |
// Non-owned pointer to our clock. |
FeatureExtractorClock* clock_; |