Chromium Code Reviews

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 7549003: Optimize phishing page term feature extraction. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Address Brian's comments (created 9 years, 4 months ago)
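
For readers skimming the diff below: the first change passes HandleWord() a base::StringPiece16 view into page_text_ instead of copying each word into a fresh string16. A minimal sketch of the same idea using only standard C++17 types (illustrative, not the patch's own code; it assumes the text outlives every use of the returned view):

#include <cstddef>
#include <string>
#include <string_view>

// |position| and |next| are the word-break offsets reported by the ICU break
// iterator; the returned view borrows page_text's buffer without copying.
std::u16string_view WordAt(const std::u16string& page_text,
                           size_t position,
                           size_t next) {
  return std::u16string_view(page_text.data() + position, next - position);
}

The trade-off, noted in the patch's own comments, is lifetime: anything that retains such a view, including the new negative word cache, has to be cleared before page_text_ goes away.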
Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
index 73db09bc9f7f08b20aefd7d2ca38f4d3e2b6fc2b..3b4ba85ea3f87bbd8ad20e4ef233de583d4d46a9 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -35,6 +35,9 @@ const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
+// The maximum size of the negative word cache.
+const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
+
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
// Stores up to max_words_per_ngram_ previous words separated by spaces.
@@ -93,6 +96,7 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
: page_term_hashes_(page_term_hashes),
page_word_hashes_(page_word_hashes),
max_words_per_term_(max_words_per_term),
+ negative_word_cache_(kMaxNegativeWordCacheSize),
clock_(clock),
ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
Clear();
@@ -159,8 +163,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
// next is now positioned at the end of a word.
- HandleWord(string16(*page_text_, state_->position,
- next - state_->position));
+ HandleWord(base::StringPiece16(page_text_->data() + state_->position,
+ next - state_->position));
++num_words;
}
state_->position = next;
@@ -196,10 +200,21 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
// Otherwise, continue.
}
}
+ // We need to clear the cache because the data that it depends on (page_text_)
+ // is going away.
+ negative_word_cache_.Clear();
RunCallback(true);
}
-void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
+void PhishingTermFeatureExtractor::HandleWord(
+ const base::StringPiece16& word) {
+ // Quickest out if we have seen this word before and know that it's not
+ // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
+ // all of which are relatively expensive.
+ if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
+ return;
+ }
+
std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
std::string word_hash = crypto::SHA256HashString(word_lower);
@@ -208,6 +223,8 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
// Word doesn't exist in our terms so we can clear the n-gram state.
state_->previous_words.clear();
state_->previous_word_sizes.clear();
+ // Insert into negative cache so that we don't try this again.
+ negative_word_cache_.Put(word, true);
return;
}
@@ -221,16 +238,6 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
// Note that we don't yet add the new word length to previous_word_sizes,
// since we don't want to compute the hash for the word by itself again.
//
- // TODO(bryner): Use UMA stats to determine whether this is too slow.
- // If it is, there are a couple of cases that we could optimize:
- // - We could cache plaintext words that are not in page_word_hashes_, so
- // that we can avoid hashing these again.
- // - We could include positional information about words in the n-grams,
- // rather than just a list of all of the words. For example, we could
- // change the term format so that each word is hashed separately, or
- // we could add extra data to the word list to indicate the position
- // at which the word appears in an n-gram, and skip checking the word if
- // it's not at that position.
state_->previous_words.append(word_lower);
std::string current_term = state_->previous_words;
for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
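
The second change is the bounded negative word cache checked at the top of HandleWord(), which skips the lowercasing, UTF-8 conversion, and SHA-256 work for words already known to be absent from page_word_hashes_. The sketch below is illustrative only: the class name and the simple FIFO eviction are assumptions, while the patch itself appears to use a size-bounded cache from Chromium's base library keyed on StringPiece16 views into page_text_ (hence the Clear() call before RunCallback(true)).

#include <cstddef>
#include <list>
#include <string>
#include <unordered_set>

// Bounded cache of words known NOT to match any term/word hash. This sketch
// stores copies of the words; the patch stores views into page_text_, which
// is why it must clear the cache before the underlying page text is released.
class BoundedNegativeWordCache {
 public:
  explicit BoundedNegativeWordCache(size_t max_size) : max_size_(max_size) {}

  // True if |word| was previously recorded as not being part of any term.
  bool Contains(const std::u16string& word) const {
    return words_.count(word) > 0;
  }

  // Records |word| as a known negative, evicting the oldest entry when full.
  void Insert(const std::u16string& word) {
    if (max_size_ == 0 || words_.count(word) > 0)
      return;
    if (insertion_order_.size() >= max_size_) {
      words_.erase(insertion_order_.back());
      insertion_order_.pop_back();
    }
    insertion_order_.push_front(word);
    words_.insert(word);
  }

  // Called when the text the cached words came from is discarded.
  void Clear() {
    insertion_order_.clear();
    words_.clear();
  }

 private:
  const size_t max_size_;
  std::list<std::u16string> insertion_order_;
  std::unordered_set<std::u16string> words_;
};

Usage mirrors the diff: HandleWord() returns immediately when Contains(word) is true; on a miss in page_word_hashes_ it calls Insert(word) so the same word is never hashed twice for a given page.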