Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <list> | 7 #include <list> |
| 8 #include <map> | 8 #include <map> |
| 9 | 9 |
| 10 #include "base/bind.h" | 10 #include "base/bind.h" |
| 11 #include "base/compiler_specific.h" | 11 #include "base/compiler_specific.h" |
| | 12 #include "base/i18n/break_iterator.h" |
| 12 #include "base/i18n/case_conversion.h" | 13 #include "base/i18n/case_conversion.h" |
| 13 #include "base/logging.h" | 14 #include "base/logging.h" |
| | 15 #include "base/memory/scoped_ptr.h" |
| 14 #include "base/message_loop/message_loop.h" | 16 #include "base/message_loop/message_loop.h" |
| 15 #include "base/metrics/histogram.h" | 17 #include "base/metrics/histogram.h" |
| 16 #include "base/strings/utf_string_conversions.h" | 18 #include "base/strings/utf_string_conversions.h" |
| 17 #include "base/time/time.h" | 19 #include "base/time/time.h" |
| 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 19 #include "chrome/renderer/safe_browsing/features.h" | 21 #include "chrome/renderer/safe_browsing/features.h" |
| 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
| 21 #include "crypto/sha2.h" | 23 #include "crypto/sha2.h" |
| 22 #include "third_party/icu/source/common/unicode/ubrk.h" | |
| 23 #include "ui/base/l10n/l10n_util.h" | 24 #include "ui/base/l10n/l10n_util.h" |
| 24 | 25 |
| 25 namespace safe_browsing { | 26 namespace safe_browsing { |
| 26 | 27 |
| 27 // This time should be short enough that it doesn't noticeably disrupt the | 28 // This time should be short enough that it doesn't noticeably disrupt the |
| 28 // user's interaction with the page. | 29 // user's interaction with the page. |
| 29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; | 30 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; |
| 30 | 31 |
| 31 // Experimenting shows that we get a reasonable gain in performance by | 32 // Experimenting shows that we get a reasonable gain in performance by |
| 32 // increasing this up to around 10, but there's not much benefit in | 33 // increasing this up to around 10, but there's not much benefit in |
| (... 11 matching lines skipped ...) | |
| 44 struct PhishingTermFeatureExtractor::ExtractionState { | 45 struct PhishingTermFeatureExtractor::ExtractionState { |
| 45 // Stores up to max_words_per_term_ previous words separated by spaces. | 46 // Stores up to max_words_per_term_ previous words separated by spaces. |
| 46 std::string previous_words; | 47 std::string previous_words; |
| 47 | 48 |
| 48 // Stores the sizes of the words in previous_words. Note: the size includes | 49 // Stores the sizes of the words in previous_words. Note: the size includes |
| 49 // the space after each word. In other words, the sum of all sizes in this | 50 // the space after each word. In other words, the sum of all sizes in this |
| 50 // list is equal to the length of previous_words. | 51 // list is equal to the length of previous_words. |
| 51 std::list<size_t> previous_word_sizes; | 52 std::list<size_t> previous_word_sizes; |
| 52 | 53 |
| 53 // An iterator for word breaking. | 54 // An iterator for word breaking. |
| 54 UBreakIterator* iterator; | 55 scoped_ptr<base::i18n::BreakIterator> iterator; |
| 55 | |
| 56 // Our current position in the text that was passed to the ExtractionState | |
| 57 // constructor, specifically, the most recent break position returned by our | |
| 58 // iterator. | |
| 59 int position; | |
| 60 | |
| 61 // True if position has been initialized. | |
| 62 bool position_initialized; | |
| 63 | 56 |
| 64 // The time at which we started feature extraction for the current page. | 57 // The time at which we started feature extraction for the current page. |
| 65 base::TimeTicks start_time; | 58 base::TimeTicks start_time; |
| 66 | 59 |
| 67 // The number of iterations we've done for the current extraction. | 60 // The number of iterations we've done for the current extraction. |
| 68 int num_iterations; | 61 int num_iterations; |
| 69 | 62 |
| 70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) | 63 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) |
| 71 : position(-1), | 64 : start_time(start_time_ticks), |
| 72 position_initialized(false), | |
| 73 start_time(start_time_ticks), | |
| 74 num_iterations(0) { | 65 num_iterations(0) { |
| 75 UErrorCode status = U_ZERO_ERROR; | 66 |
| 76 // TODO(bryner): We should pass in the language for the document. | 67 scoped_ptr<base::i18n::BreakIterator> i( |
| 77 iterator = ubrk_open(UBRK_WORD, NULL, | 68 new base::i18n::BreakIterator( |
| 78 text.data(), text.size(), | 69 text, base::i18n::BreakIterator::BREAK_WORD)); |
| 79 &status); | 70 |
| 80 if (U_FAILURE(status)) { | 71 if (i->Init()) { |
| 81 DLOG(ERROR) << "ubrk_open failed: " << status; | 72 iterator = i.Pass(); |
| 82 iterator = NULL; | 73 } else { |
| | 74 DLOG(ERROR) << "failed to open iterator"; |
| 83 } | 75 } |
| 84 } | 76 } |
| 85 | 77 |
| 86 ~ExtractionState() { | 78 ~ExtractionState() { |
|
mattm
2014/05/09 23:07:12
destructor could be removed
Andrew Hayden (chromium.org)
2014/05/12 13:07:37
Done.
| |
| 87 if (iterator) { | |
| 88 ubrk_close(iterator); | |
| 89 } | |
| 90 } | 79 } |
| 91 }; | 80 }; |
| 92 | 81 |
| 93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 82 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
| 94 const base::hash_set<std::string>* page_term_hashes, | 83 const base::hash_set<std::string>* page_term_hashes, |
| 95 const base::hash_set<uint32>* page_word_hashes, | 84 const base::hash_set<uint32>* page_word_hashes, |
| 96 size_t max_words_per_term, | 85 size_t max_words_per_term, |
| 97 uint32 murmurhash3_seed, | 86 uint32 murmurhash3_seed, |
| 98 FeatureExtractorClock* clock) | 87 FeatureExtractorClock* clock) |
| 99 : page_term_hashes_(page_term_hashes), | 88 : page_term_hashes_(page_term_hashes), |
| (... 38 matching lines skipped ...) | |
| 138 // Cancel any pending callbacks, and clear our state. | 127 // Cancel any pending callbacks, and clear our state. |
| 139 weak_factory_.InvalidateWeakPtrs(); | 128 weak_factory_.InvalidateWeakPtrs(); |
| 140 Clear(); | 129 Clear(); |
| 141 } | 130 } |
| 142 | 131 |
| 143 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { | 132 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
| 144 DCHECK(state_.get()); | 133 DCHECK(state_.get()); |
| 145 ++state_->num_iterations; | 134 ++state_->num_iterations; |
| 146 base::TimeTicks current_chunk_start_time = clock_->Now(); | 135 base::TimeTicks current_chunk_start_time = clock_->Now(); |
| 147 | 136 |
| 148 if (!state_->iterator) { | 137 if (!state_->iterator.get()) { |
| 149 // We failed to initialize the break iterator, so stop now. | 138 // We failed to initialize the break iterator, so stop now. |
| 150 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); | 139 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); |
| 151 RunCallback(false); | 140 RunCallback(false); |
| 152 return; | 141 return; |
| 153 } | 142 } |
| 154 | 143 |
| 155 if (!state_->position_initialized) { | |
| 156 state_->position = ubrk_first(state_->iterator); | |
| 157 if (state_->position == UBRK_DONE) { | |
| 158 // No words present, so we're done. | |
| 159 RunCallback(true); | |
| 160 return; | |
| 161 } | |
| 162 state_->position_initialized = true; | |
| 163 } | |
| 164 | |
| 165 int num_words = 0; | 144 int num_words = 0; |
| 166 for (int next = ubrk_next(state_->iterator); | 145 while (state_->iterator->Advance()) { |
| 167 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { | 146 if (state_->iterator->IsWord()) { |
| 168 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { | 147 const size_t start = state_->iterator->prev(); |
| 169 // next is now positioned at the end of a word. | 148 const size_t length = state_->iterator->pos() - start; |
| 170 HandleWord(base::StringPiece16(page_text_->data() + state_->position, | 149 HandleWord(base::StringPiece16(page_text_->data() + start, length)); |
| 171 next - state_->position)); | |
| 172 ++num_words; | 150 ++num_words; |
| 173 } | 151 } |
| 174 state_->position = next; | |
| 175 | 152 |
| 176 if (num_words >= kClockCheckGranularity) { | 153 if (num_words >= kClockCheckGranularity) { |
| 177 num_words = 0; | 154 num_words = 0; |
| 178 base::TimeTicks now = clock_->Now(); | 155 base::TimeTicks now = clock_->Now(); |
| 179 if (now - state_->start_time >= | 156 if (now - state_->start_time >= |
| 180 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | 157 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
| 181 DLOG(ERROR) << "Feature extraction took too long, giving up"; | 158 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
| 182 // We expect this to happen infrequently, so record when it does. | 159 // We expect this to happen infrequently, so record when it does. |
| 183 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); | 160 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); |
| 184 RunCallback(false); | 161 RunCallback(false); |
| (... 116 matching lines skipped ...) | |
| 301 | 278 |
| 302 void PhishingTermFeatureExtractor::Clear() { | 279 void PhishingTermFeatureExtractor::Clear() { |
| 303 page_text_ = NULL; | 280 page_text_ = NULL; |
| 304 features_ = NULL; | 281 features_ = NULL; |
| 305 done_callback_.Reset(); | 282 done_callback_.Reset(); |
| 306 state_.reset(NULL); | 283 state_.reset(NULL); |
| 307 negative_word_cache_.Clear(); | 284 negative_word_cache_.Clear(); |
| 308 } | 285 } |
| 309 | 286 |
| 310 } // namespace safe_browsing | 287 } // namespace safe_browsing |
| OLD | NEW |