Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2689)

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 7549003: Optimize phishing page term feature extraction. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 9 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
index 74c9b0b9acec0dbc939a69370caca08331c37bf6..384e093cdb9de1ab48725697c66e85f2ddef7416 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
@@ -21,9 +21,11 @@
#include "base/basictypes.h"
#include "base/callback_old.h"
#include "base/hash_tables.h"
+#include "base/memory/mru_cache.h"
#include "base/memory/scoped_ptr.h"
#include "base/string16.h"
#include "base/task.h"
+#include "base/wide_string_piece.h"
namespace safe_browsing {
class FeatureExtractorClock;
@@ -99,7 +101,7 @@ class PhishingTermFeatureExtractor {
void ExtractFeaturesWithTimeout();
// Handles a single word in the page text.
- void HandleWord(const string16& word);
+ void HandleWord(const base::WideStringPiece& word);
// Helper to verify that there is no pending feature extraction. Dies in
// debug builds if the state is not as expected. This is a no-op in release
@@ -125,6 +127,13 @@ class PhishingTermFeatureExtractor {
// The maximum number of words in an n-gram.
size_t max_words_per_term_;
+ // This cache lets us decide whether a word needs to be checked at all,
+ // because converting to UTF8, lowercasing, and hashing are all relatively
+ // expensive operations. Although the container is named an MRU cache, it
+ // behaves like an LRU cache: when it is full, the least recently accessed
+ // entries are evicted first.
+ typedef base::HashingMRUCache<base::WideStringPiece, bool> WordCache;
+ WordCache negative_word_cache_;
+
// Non-owned pointer to our clock.
FeatureExtractorClock* clock_;

Powered by Google App Engine
This is Rietveld 408576698