| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // PhishingTermFeatureExtractor handles computing term features from the text | 5 // PhishingTermFeatureExtractor handles computing term features from the text |
| 6 // of a web page for the client-side phishing detection model. To do this, it | 6 // of a web page for the client-side phishing detection model. To do this, it |
| 7 // takes a list of terms that appear in the model, and scans through the page | 7 // takes a list of terms that appear in the model, and scans through the page |
| 8 // text looking for them. Any terms that appear will cause a corresponding | 8 // text looking for them. Any terms that appear will cause a corresponding |
| 9 // features::kPageTerm feature to be added to the FeatureMap. | 9 // features::kPageTerm feature to be added to the FeatureMap. |
| 10 // | 10 // |
| (...skipping 52 matching lines...) | (...skipping 52 matching lines...) |
| 63 // | 63 // |
| 64 // To avoid blocking the render thread for too long, the feature extractor | 64 // To avoid blocking the render thread for too long, the feature extractor |
| 65 // may run in several chunks of work, posting a task to the current | 65 // may run in several chunks of work, posting a task to the current |
| 66 // MessageLoop to continue processing. Once feature extraction is complete, | 66 // MessageLoop to continue processing. Once feature extraction is complete, |
| 67 // |done_callback| is run on the current thread. | 67 // |done_callback| is run on the current thread. |
| 68 // PhishingTermFeatureExtractor takes ownership of the callback. | 68 // PhishingTermFeatureExtractor takes ownership of the callback. |
| 69 // | 69 // |
| 70 // |page_text| and |features| are owned by the caller, and must not be | 70 // |page_text| and |features| are owned by the caller, and must not be |
| 71 // destroyed until either |done_callback| is run or | 71 // destroyed until either |done_callback| is run or |
| 72 // CancelPendingExtraction() is called. | 72 // CancelPendingExtraction() is called. |
| 73 void ExtractFeatures(const string16* page_text, | 73 void ExtractFeatures(const base::string16* page_text, |
| 74 FeatureMap* features, | 74 FeatureMap* features, |
| 75 const DoneCallback& done_callback); | 75 const DoneCallback& done_callback); |
| 76 | 76 |
| 77 // Cancels any pending feature extraction. The DoneCallback will not be run. | 77 // Cancels any pending feature extraction. The DoneCallback will not be run. |
| 78 // Must be called if there is a feature extraction in progress when the page | 78 // Must be called if there is a feature extraction in progress when the page |
| 79 // is unloaded or the PhishingTermFeatureExtractor is destroyed. | 79 // is unloaded or the PhishingTermFeatureExtractor is destroyed. |
| 80 void CancelPendingExtraction(); | 80 void CancelPendingExtraction(); |
| 81 | 81 |
| 82 private: | 82 private: |
| 83 struct ExtractionState; | 83 struct ExtractionState; |
| (...skipping 55 matching lines...) | (...skipping 55 matching lines...) |
| 139 // converting to UTF8, lowercasing, and hashing are all relatively expensive | 139 // converting to UTF8, lowercasing, and hashing are all relatively expensive |
| 140 // operations. Though this is called an MRU cache, it seems to behave like | 140 // operations. Though this is called an MRU cache, it seems to behave like |
| 141 // an LRU cache (i.e. it evicts the oldest accesses first). | 141 // an LRU cache (i.e. it evicts the oldest accesses first). |
| 142 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; | 142 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; |
| 143 WordCache negative_word_cache_; | 143 WordCache negative_word_cache_; |
| 144 | 144 |
| 145 // Non-owned pointer to our clock. | 145 // Non-owned pointer to our clock. |
| 146 FeatureExtractorClock* clock_; | 146 FeatureExtractorClock* clock_; |
| 147 | 147 |
| 148 // The arguments from the most recent call to ExtractFeatures(). | 148 // The arguments from the most recent call to ExtractFeatures(). |
| 149 const string16* page_text_; // The caller keeps ownership of this. | 149 const base::string16* page_text_; // The caller keeps ownership of this. |
| 150 FeatureMap* features_; // The caller keeps ownership of this. | 150 FeatureMap* features_; // The caller keeps ownership of this. |
| 151 DoneCallback done_callback_; | 151 DoneCallback done_callback_; |
| 152 | 152 |
| 153 // Stores the current state of term extraction from |page_text_|. | 153 // Stores the current state of term extraction from |page_text_|. |
| 154 scoped_ptr<ExtractionState> state_; | 154 scoped_ptr<ExtractionState> state_; |
| 155 | 155 |
| 156 // Used in scheduling ExtractFeaturesWithTimeout tasks. | 156 // Used in scheduling ExtractFeaturesWithTimeout tasks. |
| 157 // These pointers are invalidated if extraction is cancelled. | 157 // These pointers are invalidated if extraction is cancelled. |
| 158 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; | 158 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; |
| 159 | 159 |
| 160 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | 160 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
| 161 }; | 161 }; |
| 162 | 162 |
| 163 } // namespace safe_browsing | 163 } // namespace safe_browsing |
| 164 | 164 |
| 165 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 165 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| OLD | NEW |