chrome/renderer/safe_browsing/phishing_term_feature_extractor.h - Issue 7549003: Optimize phishing page term feature extraction.

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 7549003: Optimize phishing page term feature extraction. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Address Brian's comments Created 9 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // PhishingTermFeatureExtractor handles computing term features from the text	5 // PhishingTermFeatureExtractor handles computing term features from the text

6 // of a web page for the client-side phishing detection model. To do this, it	6 // of a web page for the client-side phishing detection model. To do this, it

7 // takes a list of terms that appear in the model, and scans through the page	7 // takes a list of terms that appear in the model, and scans through the page

8 // text looking for them. Any terms that appear will cause a corresponding	8 // text looking for them. Any terms that appear will cause a corresponding

9 // features::kPageTerm feature to be added to the FeatureMap.	9 // features::kPageTerm feature to be added to the FeatureMap.

10 //	10 //

11 // To make it harder for a phisher to enumerate all of the relevant terms in	11 // To make it harder for a phisher to enumerate all of the relevant terms in

12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.	12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.

13 //	13 //

14 // There is one PhishingTermFeatureExtractor per RenderView.	14 // There is one PhishingTermFeatureExtractor per RenderView.

15	15

16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_	16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_

17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_	17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_

18	18

19 #include <string>	19 #include <string>

20	20

21 #include "base/basictypes.h"	21 #include "base/basictypes.h"

22 #include "base/callback_old.h"	22 #include "base/callback_old.h"

23 #include "base/hash_tables.h"	23 #include "base/hash_tables.h"

	24 #include "base/memory/mru_cache.h"

24 #include "base/memory/scoped_ptr.h"	25 #include "base/memory/scoped_ptr.h"

	26 #include "base/string_piece.h"

25 #include "base/string16.h"	27 #include "base/string16.h"

26 #include "base/task.h"	28 #include "base/task.h"

27	29

28 namespace safe_browsing {	30 namespace safe_browsing {

29 class FeatureExtractorClock;	31 class FeatureExtractorClock;

30 class FeatureMap;	32 class FeatureMap;

31	33

32 class PhishingTermFeatureExtractor {	34 class PhishingTermFeatureExtractor {

33 public:	35 public:

34 // Callback to be run when feature extraction finishes. The callback	36 // Callback to be run when feature extraction finishes. The callback

(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
85	87

86 // The number of words that we will process before checking to see whether	88 // The number of words that we will process before checking to see whether

87 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be	89 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be

88 // slow, we don't do this on every word processed.	90 // slow, we don't do this on every word processed.

89 static const int kClockCheckGranularity;	91 static const int kClockCheckGranularity;

90	92

91 // The maximum total amount of time that the feature extractor will run	93 // The maximum total amount of time that the feature extractor will run

92 // before giving up on the current page.	94 // before giving up on the current page.

93 static const int kMaxTotalTimeMs;	95 static const int kMaxTotalTimeMs;

94	96

	97 // The size of the cache that we use to determine if we can avoid lower

	98 // casing, hashing, and UTF conversion.

	99 static const int kMaxNegativeWordCacheSize;

	100

95 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs	101 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs

96 // until a predefined maximum amount of time has elapsed, then posts a task	102 // until a predefined maximum amount of time has elapsed, then posts a task

97 // to the current MessageLoop to continue extraction. When extraction	103 // to the current MessageLoop to continue extraction. When extraction

98 // finishes, calls RunCallback().	104 // finishes, calls RunCallback().

99 void ExtractFeaturesWithTimeout();	105 void ExtractFeaturesWithTimeout();

100	106

101 // Handles a single word in the page text.	107 // Handles a single word in the page text.

102 void HandleWord(const string16& word);	108 void HandleWord(const base::StringPiece16& word);

103	109

104 // Helper to verify that there is no pending feature extraction. Dies in	110 // Helper to verify that there is no pending feature extraction. Dies in

105 // debug builds if the state is not as expected. This is a no-op in release	111 // debug builds if the state is not as expected. This is a no-op in release

106 // builds.	112 // builds.

107 void CheckNoPendingExtraction();	113 void CheckNoPendingExtraction();

108	114

109 // Runs |done_callback_| and then clears all internal state.	115 // Runs |done_callback_| and then clears all internal state.

110 void RunCallback(bool success);	116 void RunCallback(bool success);

111	117

112 // Clears all internal feature extraction state.	118 // Clears all internal feature extraction state.

113 void Clear();	119 void Clear();

114	120

115 // All of the term hashes that we are looking for in the page.	121 // All of the term hashes that we are looking for in the page.

116 const base::hash_set<std::string>* page_term_hashes_;	122 const base::hash_set<std::string>* page_term_hashes_;

117	123

118 // Hashes of all the individual words in page_term_hashes_. If	124 // Hashes of all the individual words in page_term_hashes_. If

119 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_	125 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_

120 // would contain (hashed) "one" and "two". We do this so that we can have a	126 // would contain (hashed) "one" and "two". We do this so that we can have a

121 // quick out in the common case that the current word we are processing	127 // quick out in the common case that the current word we are processing

122 // doesn't contain any part of one of our terms.	128 // doesn't contain any part of one of our terms.

123 const base::hash_set<std::string>* page_word_hashes_;	129 const base::hash_set<std::string>* page_word_hashes_;

124	130

125 // The maximum number of words in an n-gram.	131 // The maximum number of words in an n-gram.

126 size_t max_words_per_term_;	132 size_t max_words_per_term_;

127	133

	134 // This cache is used to see if we need to check the word at all, as

	135 // converting to UTF8, lowercasing, and hashing are all relatively expensive

	136 // operations. Though this is called an MRU cache, it behaves like an LRU

	137 // cache (i.e. it evicts the least recently used entries first).

	138 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;

	139 WordCache negative_word_cache_;

	140

128 // Non-owned pointer to our clock.	141 // Non-owned pointer to our clock.

129 FeatureExtractorClock* clock_;	142 FeatureExtractorClock* clock_;

130	143

131 // The output parameters from the most recent call to ExtractFeatures().	144 // The output parameters from the most recent call to ExtractFeatures().

132 const string16* page_text_; // The caller keeps ownership of this.	145 const string16* page_text_; // The caller keeps ownership of this.

133 FeatureMap* features_; // The caller keeps ownership of this.	146 FeatureMap* features_; // The caller keeps ownership of this.

134 scoped_ptr<DoneCallback> done_callback_;	147 scoped_ptr<DoneCallback> done_callback_;

135	148

136 // Stores the current state of term extraction from |page_text_|.	149 // Stores the current state of term extraction from |page_text_|.

137 scoped_ptr<ExtractionState> state_;	150 scoped_ptr<ExtractionState> state_;

138	151

139 // Used to create ExtractFeaturesWithTimeout tasks.	152 // Used to create ExtractFeaturesWithTimeout tasks.

140 // These tasks are revoked if extraction is cancelled.	153 // These tasks are revoked if extraction is cancelled.

141 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_;	154 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_;

142	155

143 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);	156 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);

144 };	157 };

145	158

146 } // namespace safe_browsing	159 } // namespace safe_browsing

147	160

148 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_	161 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_

OLD	NEW

« no previous file with comments | « base/string_piece_unittest.cc ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc » ('j') | no next file with comments »