Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
index 73db09bc9f7f08b20aefd7d2ca38f4d3e2b6fc2b..3b4ba85ea3f87bbd8ad20e4ef233de583d4d46a9 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -35,6 +35,9 @@ const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;
 // actual phishing page.
 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
 
+// The maximum size of the negative word cache.
+const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
+
 // All of the state pertaining to the current feature extraction.
 struct PhishingTermFeatureExtractor::ExtractionState {
   // Stores up to max_words_per_term_ previous words separated by spaces.
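
The Get()/Put()/Clear() calls added in the hunks below use negative_word_cache_ as a size-bounded, recency-ordered cache of words already known not to match. The member's actual type is declared in the corresponding header, which is not part of the excerpt shown here; the following standalone sketch only illustrates the interface shape the new code relies on. The class name NegativeWordCacheSketch and its internals are invented for this illustration and are not Chromium code.

    #include <cstddef>
    #include <list>
    #include <string>
    #include <unordered_map>

    class NegativeWordCacheSketch {
     public:
      using EntryList = std::list<std::u16string>;
      using iterator = EntryList::iterator;

      explicit NegativeWordCacheSketch(std::size_t max_entries)
          : max_entries_(max_entries) {}

      // Returns an iterator to the cached word (refreshing its recency), or
      // end() if the word has never been cached.
      iterator Get(const std::u16string& word) {
        auto found = index_.find(word);
        if (found == index_.end())
          return entries_.end();
        // Move the entry to the front so it is evicted last.
        entries_.splice(entries_.begin(), entries_, found->second);
        return entries_.begin();
      }

      // Records a word that failed the term-word lookup. The bool value is
      // ignored; only the key's presence matters, as in the patch.
      void Put(const std::u16string& word, bool /*value*/) {
        if (Get(word) != end())
          return;
        entries_.push_front(word);
        index_[word] = entries_.begin();
        if (entries_.size() > max_entries_) {
          // Evict the least recently used word to bound memory use.
          index_.erase(entries_.back());
          entries_.pop_back();
        }
      }

      void Clear() {
        entries_.clear();
        index_.clear();
      }

      iterator end() { return entries_.end(); }

     private:
      std::size_t max_entries_;
      EntryList entries_;
      std::unordered_map<std::u16string, iterator> index_;
    };

Keying by std::u16string keeps the sketch self-contained; the patch itself appears to key the cache by the StringPiece16 views handed to HandleWord(), which is why the new comment further down notes that the cache depends on page_text_ and must be cleared before that text goes away.
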
@@ -93,6 +96,7 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
     : page_term_hashes_(page_term_hashes),
       page_word_hashes_(page_word_hashes),
       max_words_per_term_(max_words_per_term),
+      negative_word_cache_(kMaxNegativeWordCacheSize),
       clock_(clock),
       ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
   Clear();
@@ -159,8 +163,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
        next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
     if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
       // next is now positioned at the end of a word.
-      HandleWord(string16(*page_text_, state_->position,
-                          next - state_->position));
+      HandleWord(base::StringPiece16(page_text_->data() + state_->position,
+                                     next - state_->position));
       ++num_words;
     }
     state_->position = next;
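
The replacement above stops copying each word out of the page text: the old code built a fresh string16 from a (position, length) range for every word, while base::StringPiece16 is a non-owning pointer-and-length view into the buffer page_text_ already holds. A rough standalone illustration of the difference, using std::u16string_view as a stand-in for base::StringPiece16 (the function and variable names are invented for this example):

    #include <cstddef>
    #include <string>
    #include <string_view>

    // page_text plays the role of *page_text_, which stays alive for the whole
    // extraction pass; position and length come from the word break iterator.
    void IllustrateWordSlicing(const std::u16string& page_text,
                               std::size_t position, std::size_t length) {
      // Old approach: allocates and copies `length` characters for every word.
      std::u16string copied_word(page_text, position, length);

      // New approach: a pointer + length view into page_text. No allocation or
      // copy, but it is only valid while page_text itself stays alive.
      std::u16string_view word_view(page_text.data() + position, length);

      (void)copied_word;
      (void)word_view;
    }
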
@@ -196,10 +200,25 @@
       // Otherwise, continue.
     }
   }
+  // We need to clear the cache because the data that it depends on
+  // (page_text_) is going away.
+  negative_word_cache_.Clear();
   RunCallback(true);
 }
 
-void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
+void PhishingTermFeatureExtractor::HandleWord(
+    const base::StringPiece16& word) {
+  // Quickest out if we have seen this word before and know that it's not
+  // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
+  // all of which are relatively expensive.
+  if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
+    // We know we're no longer in a possible n-gram, so clear the previous
+    // word state, just as we do below when the word misses page_word_hashes_.
+    state_->previous_words.clear();
+    state_->previous_word_sizes.clear();
+    return;
+  }
+
   std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
   std::string word_hash = crypto::SHA256HashString(word_lower);
 
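
Taken together, the fast path added above skips the lowercasing, UTF-8 conversion, and SHA-256 hashing for words that an earlier call already found missing from page_word_hashes_; the Put() in the next hunk is what records those misses, and the Clear() added earlier is needed because the cached keys are views into page_text_ and must not outlive it. A condensed standalone sketch of that control flow, with invented names (HandleWordSketch, WordHashMatchesSketch) and a plain set standing in for the size-bounded cache:

    #include <string>
    #include <string_view>
    #include <unordered_set>

    // Stand-in for the expensive part of HandleWord(): lowercase, convert to
    // UTF-8, SHA-256, and look the hash up in page_word_hashes_. Always misses
    // here so the sketch stays self-contained.
    bool WordHashMatchesSketch(std::u16string_view /*word*/) {
      return false;
    }

    void HandleWordSketch(std::u16string_view word,
                          std::unordered_set<std::u16string>* negative_words) {
      // Fast path: a word already known to miss page_word_hashes_ needs none
      // of the hashing work. (The real code also clears its n-gram state here,
      // exactly as it does on a regular miss below.)
      if (negative_words->count(std::u16string(word)))
        return;

      if (!WordHashMatchesSketch(word)) {
        // Negative result: remember it so later occurrences take the fast path.
        negative_words->insert(std::u16string(word));
        return;
      }

      // ...otherwise fall through to the n-gram / term matching...
    }
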
@@ -208,6 +227,8 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
     // Word doesn't exist in our terms so we can clear the n-gram state.
     state_->previous_words.clear();
     state_->previous_word_sizes.clear();
+    // Insert into negative cache so that we don't try this again.
+    negative_word_cache_.Put(word, true);
     return;
   }
 
@@ -221,16 +242,6 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
   // Note that we don't yet add the new word length to previous_word_sizes,
   // since we don't want to compute the hash for the word by itself again.
   //
-  // TODO(bryner): Use UMA stats to determine whether this is too slow.
-  // If it is, there are a couple of cases that we could optimize:
-  // - We could cache plaintext words that are not in page_word_hashes_, so
-  //   that we can avoid hashing these again.
-  // - We could include positional information about words in the n-grams,
-  //   rather than just a list of all of the words. For example, we could
-  //   change the term format so that each word is hashed separately, or
-  //   we could add extra data to the word list to indicate the position
-  //   at which the word appears in an n-gram, and skip checking the word if
-  //   it's not at that position.
   state_->previous_words.append(word_lower);
   std::string current_term = state_->previous_words;
   for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
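
For context on the loop whose header is the last visible line of this hunk: previous_words is a space-separated window of the most recent matching words, and previous_word_sizes records each word's length so that leading words can be stripped off one at a time, yielding every candidate n-gram that ends at the current word. A simplified standalone sketch of that bookkeeping; the names ending in "Sketch" are invented, and the window-trimming step at the end is an assumption about code below this hunk rather than something shown in the diff:

    #include <cstddef>
    #include <list>
    #include <string>

    // Stand-in for hashing a candidate term with SHA-256 and checking it
    // against page_term_hashes_ (and recording a feature on a match).
    void CheckTermSketch(const std::string& /*term*/) {}

    struct NgramStateSketch {
      // Space-separated words preceding the current word, mirroring
      // ExtractionState::previous_words.
      std::string previous_words;
      // Length of each of those words, mirroring previous_word_sizes.
      std::list<std::size_t> previous_word_sizes;
    };

    // Called only for a word whose hash was found in page_word_hashes_.
    void HandleMatchingWordSketch(NgramStateSketch* state,
                                  const std::string& word_lower,
                                  std::size_t max_words_per_term) {
      if (!state->previous_words.empty())
        state->previous_words.append(" ");
      state->previous_words.append(word_lower);

      // Check every n-gram ending at the current word, longest first, by
      // stripping one leading word per iteration. The single-word "term" is
      // skipped: its hash is the word hash that was already looked up.
      std::string current_term = state->previous_words;
      for (std::size_t leading_word_size : state->previous_word_sizes) {
        CheckTermSketch(current_term);
        current_term.erase(0, leading_word_size + 1);  // +1 for the space.
      }

      // Record the current word and trim the window so it never carries more
      // than max_words_per_term - 1 words into the next call.
      state->previous_word_sizes.push_back(word_lower.size());
      if (state->previous_word_sizes.size() >= max_words_per_term) {
        state->previous_words.erase(0, state->previous_word_sizes.front() + 1);
        state->previous_word_sizes.pop_front();
      }
    }
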