Chromium Code Reviews — Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| index 73db09bc9f7f08b20aefd7d2ca38f4d3e2b6fc2b..b82f40f918eb268d711f35ae9c6eae53bfc7757a 100644 |
| --- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| @@ -93,6 +93,7 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
| : page_term_hashes_(page_term_hashes), |
| page_word_hashes_(page_word_hashes), |
| max_words_per_term_(max_words_per_term), |
| + negative_word_cache_(1000 /* max_size */), |
|
Brian Ryner
2011/08/08 22:08:00
Maybe make this a class constant, like the other k-prefixed constants in this file.
Garrett Casto
2011/08/08 23:19:51
Done.
|
| clock_(clock), |
| ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
| Clear(); |
| @@ -159,8 +160,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
| next != UBRK_DONE; next = ubrk_next(state_->iterator)) { |
| if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { |
| // next is now positioned at the end of a word. |
| - HandleWord(string16(*page_text_, state_->position, |
| - next - state_->position)); |
| + HandleWord(base::StringPiece16(page_text_->data() + state_->position, |
| + next - state_->position)); |
| ++num_words; |
| } |
| state_->position = next; |
| @@ -196,10 +197,21 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
| // Otherwise, continue. |
| } |
| } |
| + // We need to clear the cache because the data that it depends on (page_text_) |
| + // is going away. |
| + negative_word_cache_.Clear(); |
|
Brian Ryner
2011/08/08 22:08:00
Hm... I could imagine there being some benefit to keeping the cache around across extractions, rather than clearing it here. [comment truncated in export]
Garrett Casto
2011/08/08 23:19:51
So I thought about this as well. It's rather hard to do safely, since the cached StringPieces point into page_text_, which is going away. [comment truncated in export]
|
| RunCallback(true); |
| } |
| -void PhishingTermFeatureExtractor::HandleWord(const string16& word) { |
| +void PhishingTermFeatureExtractor::HandleWord( |
| + const base::StringPiece16& word) { |
| + // Quickest out if we have seen this word before and know that it's not |
| + // part of any term. This avoids the SHA256, lowercasing, and UTF conversion, |
| + // all of which are relatively expensive. |
| + if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { |
| + return; |
| + } |
| + |
| std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); |
| std::string word_hash = crypto::SHA256HashString(word_lower); |
| @@ -208,6 +220,8 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) { |
| // Word doesn't exist in our terms so we can clear the n-gram state. |
| state_->previous_words.clear(); |
| state_->previous_word_sizes.clear(); |
| + // Insert into negative cache so that we don't try this again. |
| + negative_word_cache_.Put(word, true); |
| return; |
| } |
| @@ -221,16 +235,6 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) { |
| // Note that we don't yet add the new word length to previous_word_sizes, |
| // since we don't want to compute the hash for the word by itself again. |
| // |
| - // TODO(bryner): Use UMA stats to determine whether this is too slow. |
| - // If it is, there are a couple of cases that we could optimize: |
| - // - We could cache plaintext words that are not in page_word_hashes_, so |
| - // that we can avoid hashing these again. |
| - // - We could include positional information about words in the n-grams, |
| - // rather than just a list of all of the words. For example, we could |
| - // change the term format so that each word is hashed separately, or |
| - // we could add extra data to the word list to indicate the position |
| - // at which the word appears in an n-gram, and skip checking the word if |
| - // it's not at that position. |
| state_->previous_words.append(word_lower); |
| std::string current_term = state_->previous_words; |
| for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |