| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // PhishingTermFeatureExtractor handles computing term features from the text | 5 // PhishingTermFeatureExtractor handles computing term features from the text |
| 6 // of a web page for the client-side phishing detection model. To do this, it | 6 // of a web page for the client-side phishing detection model. To do this, it |
| 7 // takes a list of terms that appear in the model, and scans through the page | 7 // takes a list of terms that appear in the model, and scans through the page |
| 8 // text looking for them. Any terms that appear will cause a corresponding | 8 // text looking for them. Any terms that appear will cause a corresponding |
| 9 // features::kPageTerm feature to be added to the FeatureMap. | 9 // features::kPageTerm feature to be added to the FeatureMap. |
| 10 // | 10 // |
| 11 // To make it harder for a phisher to enumerate all of the relevant terms in | 11 // To make it harder for a phisher to enumerate all of the relevant terms in |
| 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. | 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. |
| 13 // | 13 // |
| 14 // There is one PhishingTermFeatureExtractor per RenderView. | 14 // There is one PhishingTermFeatureExtractor per RenderView. |
| 15 | 15 |
| 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| 18 | 18 |
| 19 #include <string> | 19 #include <string> |
| 20 | 20 |
| 21 #include "base/basictypes.h" | 21 #include "base/basictypes.h" |
| 22 #include "base/callback_old.h" | 22 #include "base/callback_old.h" |
| 23 #include "base/hash_tables.h" | 23 #include "base/hash_tables.h" |
| 24 #include "base/memory/mru_cache.h" |
| 24 #include "base/memory/scoped_ptr.h" | 25 #include "base/memory/scoped_ptr.h" |
| 26 #include "base/string_piece.h" |
| 25 #include "base/string16.h" | 27 #include "base/string16.h" |
| 26 #include "base/task.h" | 28 #include "base/task.h" |
| 27 | 29 |
| 28 namespace safe_browsing { | 30 namespace safe_browsing { |
| 29 class FeatureExtractorClock; | 31 class FeatureExtractorClock; |
| 30 class FeatureMap; | 32 class FeatureMap; |
| 31 | 33 |
| 32 class PhishingTermFeatureExtractor { | 34 class PhishingTermFeatureExtractor { |
| 33 public: | 35 public: |
| 34 // Callback to be run when feature extraction finishes. The callback | 36 // Callback to be run when feature extraction finishes. The callback |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 85 | 87 |
| 86 // The number of words that we will process before checking to see whether | 88 // The number of words that we will process before checking to see whether |
| 87 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be | 89 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be |
| 88 // slow, we don't do this on every word processed. | 90 // slow, we don't do this on every word processed. |
| 89 static const int kClockCheckGranularity; | 91 static const int kClockCheckGranularity; |
| 90 | 92 |
| 91 // The maximum total amount of time that the feature extractor will run | 93 // The maximum total amount of time that the feature extractor will run |
| 92 // before giving up on the current page. | 94 // before giving up on the current page. |
| 93 static const int kMaxTotalTimeMs; | 95 static const int kMaxTotalTimeMs; |
| 94 | 96 |
| 97 // The maximum size of the cache that we use to determine whether we can |
| 98 // skip lowercasing, hashing, and UTF conversion of a word. |
| 99 static const int kMaxNegativeWordCacheSize; |
| 100 |
| 95 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs | 101 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
| 96 // until a predefined maximum amount of time has elapsed, then posts a task | 102 // until a predefined maximum amount of time has elapsed, then posts a task |
| 97 // to the current MessageLoop to continue extraction. When extraction | 103 // to the current MessageLoop to continue extraction. When extraction |
| 98 // finishes, calls RunCallback(). | 104 // finishes, calls RunCallback(). |
| 99 void ExtractFeaturesWithTimeout(); | 105 void ExtractFeaturesWithTimeout(); |
| 100 | 106 |
| 101 // Handles a single word in the page text. | 107 // Handles a single word in the page text. |
| 102 void HandleWord(const string16& word); | 108 void HandleWord(const base::StringPiece16& word); |
| 103 | 109 |
| 104 // Helper to verify that there is no pending feature extraction. Dies in | 110 // Helper to verify that there is no pending feature extraction. Dies in |
| 105 // debug builds if the state is not as expected. This is a no-op in release | 111 // debug builds if the state is not as expected. This is a no-op in release |
| 106 // builds. | 112 // builds. |
| 107 void CheckNoPendingExtraction(); | 113 void CheckNoPendingExtraction(); |
| 108 | 114 |
| 109 // Runs |done_callback_| and then clears all internal state. | 115 // Runs |done_callback_| and then clears all internal state. |
| 110 void RunCallback(bool success); | 116 void RunCallback(bool success); |
| 111 | 117 |
| 112 // Clears all internal feature extraction state. | 118 // Clears all internal feature extraction state. |
| 113 void Clear(); | 119 void Clear(); |
| 114 | 120 |
| 115 // All of the term hashes that we are looking for in the page. | 121 // All of the term hashes that we are looking for in the page. |
| 116 const base::hash_set<std::string>* page_term_hashes_; | 122 const base::hash_set<std::string>* page_term_hashes_; |
| 117 | 123 |
| 118 // Hashes of all the individual words in page_term_hashes_. If | 124 // Hashes of all the individual words in page_term_hashes_. If |
| 119 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ | 125 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ |
| 120 // would contain (hashed) "one" and "two". We do this so that we can have a | 126 // would contain (hashed) "one" and "two". We do this so that we can have a |
| 121 // quick out in the common case that the current word we are processing | 127 // quick out in the common case that the current word we are processing |
| 122 // doesn't contain any part of one of our terms. | 128 // doesn't contain any part of one of our terms. |
| 123 const base::hash_set<std::string>* page_word_hashes_; | 129 const base::hash_set<std::string>* page_word_hashes_; |
| 124 | 130 |
| 125 // The maximum number of words in an n-gram. | 131 // The maximum number of words in an n-gram. |
| 126 size_t max_words_per_term_; | 132 size_t max_words_per_term_; |
| 127 | 133 |
| 134 // This cache is used to see if we need to check the word at all, as |
| 135 // converting to UTF8, lowercasing, and hashing are all relatively expensive |
| 136 // operations. Though this is called an MRU cache, it behaves like an LRU |
| 137 // cache (i.e. it evicts the least recently accessed entries first). |
| 138 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; |
| 139 WordCache negative_word_cache_; |
| 140 |
| 128 // Non-owned pointer to our clock. | 141 // Non-owned pointer to our clock. |
| 129 FeatureExtractorClock* clock_; | 142 FeatureExtractorClock* clock_; |
| 130 | 143 |
| 131 // The output parameters from the most recent call to ExtractFeatures(). | 144 // The output parameters from the most recent call to ExtractFeatures(). |
| 132 const string16* page_text_; // The caller keeps ownership of this. | 145 const string16* page_text_; // The caller keeps ownership of this. |
| 133 FeatureMap* features_; // The caller keeps ownership of this. | 146 FeatureMap* features_; // The caller keeps ownership of this. |
| 134 scoped_ptr<DoneCallback> done_callback_; | 147 scoped_ptr<DoneCallback> done_callback_; |
| 135 | 148 |
| 136 // Stores the current state of term extraction from |page_text_|. | 149 // Stores the current state of term extraction from |page_text_|. |
| 137 scoped_ptr<ExtractionState> state_; | 150 scoped_ptr<ExtractionState> state_; |
| 138 | 151 |
| 139 // Used to create ExtractFeaturesWithTimeout tasks. | 152 // Used to create ExtractFeaturesWithTimeout tasks. |
| 140 // These tasks are revoked if extraction is cancelled. | 153 // These tasks are revoked if extraction is cancelled. |
| 141 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; | 154 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; |
| 142 | 155 |
| 143 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | 156 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
| 144 }; | 157 }; |
| 145 | 158 |
| 146 } // namespace safe_browsing | 159 } // namespace safe_browsing |
| 147 | 160 |
| 148 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 161 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| OLD | NEW |