OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <list> | 7 #include <list> |
8 #include <map> | 8 #include <map> |
9 | 9 |
10 #include "base/bind.h" | 10 #include "base/bind.h" |
11 #include "base/compiler_specific.h" | 11 #include "base/compiler_specific.h" |
| 12 #include "base/i18n/break_iterator.h" |
12 #include "base/i18n/case_conversion.h" | 13 #include "base/i18n/case_conversion.h" |
13 #include "base/logging.h" | 14 #include "base/logging.h" |
| 15 #include "base/memory/scoped_ptr.h" |
14 #include "base/message_loop/message_loop.h" | 16 #include "base/message_loop/message_loop.h" |
15 #include "base/metrics/histogram.h" | 17 #include "base/metrics/histogram.h" |
16 #include "base/strings/utf_string_conversions.h" | 18 #include "base/strings/utf_string_conversions.h" |
17 #include "base/time/time.h" | 19 #include "base/time/time.h" |
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
19 #include "chrome/renderer/safe_browsing/features.h" | 21 #include "chrome/renderer/safe_browsing/features.h" |
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
21 #include "crypto/sha2.h" | 23 #include "crypto/sha2.h" |
22 #include "third_party/icu/source/common/unicode/ubrk.h" | |
23 #include "ui/base/l10n/l10n_util.h" | 24 #include "ui/base/l10n/l10n_util.h" |
24 | 25 |
25 namespace safe_browsing { | 26 namespace safe_browsing { |
26 | 27 |
27 // This time should be short enough that it doesn't noticeably disrupt the | 28 // This time should be short enough that it doesn't noticeably disrupt the |
28 // user's interaction with the page. | 29 // user's interaction with the page. |
29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; | 30 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; |
30 | 31 |
31 // Experimenting shows that we get a reasonable gain in performance by | 32 // Experimenting shows that we get a reasonable gain in performance by |
32 // increasing this up to around 10, but there's not much benefit in | 33 // increasing this up to around 10, but there's not much benefit in |
(...skipping 11 matching lines...) Expand all Loading... |
44 struct PhishingTermFeatureExtractor::ExtractionState { | 45 struct PhishingTermFeatureExtractor::ExtractionState { |
45 // Stores up to max_words_per_term_ previous words separated by spaces. | 46 // Stores up to max_words_per_term_ previous words separated by spaces. |
46 std::string previous_words; | 47 std::string previous_words; |
47 | 48 |
48 // Stores the sizes of the words in previous_words. Note: the size includes | 49 // Stores the sizes of the words in previous_words. Note: the size includes |
49 // the space after each word. In other words, the sum of all sizes in this | 50 // the space after each word. In other words, the sum of all sizes in this |
50 // list is equal to the length of previous_words. | 51 // list is equal to the length of previous_words. |
51 std::list<size_t> previous_word_sizes; | 52 std::list<size_t> previous_word_sizes; |
52 | 53 |
53 // An iterator for word breaking. | 54 // An iterator for word breaking. |
54 UBreakIterator* iterator; | 55 scoped_ptr<base::i18n::BreakIterator> iterator; |
55 | |
56 // Our current position in the text that was passed to the ExtractionState | |
57 // constructor, specifically, the most recent break position returned by our | |
58 // iterator. | |
59 int position; | |
60 | |
61 // True if position has been initialized. | |
62 bool position_initialized; | |
63 | 56 |
64 // The time at which we started feature extraction for the current page. | 57 // The time at which we started feature extraction for the current page. |
65 base::TimeTicks start_time; | 58 base::TimeTicks start_time; |
66 | 59 |
67 // The number of iterations we've done for the current extraction. | 60 // The number of iterations we've done for the current extraction. |
68 int num_iterations; | 61 int num_iterations; |
69 | 62 |
70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) | 63 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) |
71 : position(-1), | 64 : start_time(start_time_ticks), |
72 position_initialized(false), | |
73 start_time(start_time_ticks), | |
74 num_iterations(0) { | 65 num_iterations(0) { |
75 UErrorCode status = U_ZERO_ERROR; | |
76 // TODO(bryner): We should pass in the language for the document. | |
77 iterator = ubrk_open(UBRK_WORD, NULL, | |
78 text.data(), text.size(), | |
79 &status); | |
80 if (U_FAILURE(status)) { | |
81 DLOG(ERROR) << "ubrk_open failed: " << status; | |
82 iterator = NULL; | |
83 } | |
84 } | |
85 | 66 |
86 ~ExtractionState() { | 67 scoped_ptr<base::i18n::BreakIterator> i( |
87 if (iterator) { | 68 new base::i18n::BreakIterator( |
88 ubrk_close(iterator); | 69 text, base::i18n::BreakIterator::BREAK_WORD)); |
| 70 |
| 71 if (i->Init()) { |
| 72 iterator = i.Pass(); |
| 73 } else { |
| 74 DLOG(ERROR) << "failed to open iterator"; |
89 } | 75 } |
90 } | 76 } |
91 }; | 77 }; |
92 | 78 |
93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 79 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
94 const base::hash_set<std::string>* page_term_hashes, | 80 const base::hash_set<std::string>* page_term_hashes, |
95 const base::hash_set<uint32>* page_word_hashes, | 81 const base::hash_set<uint32>* page_word_hashes, |
96 size_t max_words_per_term, | 82 size_t max_words_per_term, |
97 uint32 murmurhash3_seed, | 83 uint32 murmurhash3_seed, |
98 FeatureExtractorClock* clock) | 84 FeatureExtractorClock* clock) |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
138 // Cancel any pending callbacks, and clear our state. | 124 // Cancel any pending callbacks, and clear our state. |
139 weak_factory_.InvalidateWeakPtrs(); | 125 weak_factory_.InvalidateWeakPtrs(); |
140 Clear(); | 126 Clear(); |
141 } | 127 } |
142 | 128 |
143 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { | 129 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
144 DCHECK(state_.get()); | 130 DCHECK(state_.get()); |
145 ++state_->num_iterations; | 131 ++state_->num_iterations; |
146 base::TimeTicks current_chunk_start_time = clock_->Now(); | 132 base::TimeTicks current_chunk_start_time = clock_->Now(); |
147 | 133 |
148 if (!state_->iterator) { | 134 if (!state_->iterator.get()) { |
149 // We failed to initialize the break iterator, so stop now. | 135 // We failed to initialize the break iterator, so stop now. |
150 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); | 136 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); |
151 RunCallback(false); | 137 RunCallback(false); |
152 return; | 138 return; |
153 } | 139 } |
154 | 140 |
155 if (!state_->position_initialized) { | |
156 state_->position = ubrk_first(state_->iterator); | |
157 if (state_->position == UBRK_DONE) { | |
158 // No words present, so we're done. | |
159 RunCallback(true); | |
160 return; | |
161 } | |
162 state_->position_initialized = true; | |
163 } | |
164 | |
165 int num_words = 0; | 141 int num_words = 0; |
166 for (int next = ubrk_next(state_->iterator); | 142 while (state_->iterator->Advance()) { |
167 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { | 143 if (state_->iterator->IsWord()) { |
168 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { | 144 const size_t start = state_->iterator->prev(); |
169 // next is now positioned at the end of a word. | 145 const size_t length = state_->iterator->pos() - start; |
170 HandleWord(base::StringPiece16(page_text_->data() + state_->position, | 146 HandleWord(base::StringPiece16(page_text_->data() + start, length)); |
171 next - state_->position)); | |
172 ++num_words; | 147 ++num_words; |
173 } | 148 } |
174 state_->position = next; | |
175 | 149 |
176 if (num_words >= kClockCheckGranularity) { | 150 if (num_words >= kClockCheckGranularity) { |
177 num_words = 0; | 151 num_words = 0; |
178 base::TimeTicks now = clock_->Now(); | 152 base::TimeTicks now = clock_->Now(); |
179 if (now - state_->start_time >= | 153 if (now - state_->start_time >= |
180 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | 154 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
181 DLOG(ERROR) << "Feature extraction took too long, giving up"; | 155 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
182 // We expect this to happen infrequently, so record when it does. | 156 // We expect this to happen infrequently, so record when it does. |
183 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); | 157 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); |
184 RunCallback(false); | 158 RunCallback(false); |
(...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
301 | 275 |
302 void PhishingTermFeatureExtractor::Clear() { | 276 void PhishingTermFeatureExtractor::Clear() { |
303 page_text_ = NULL; | 277 page_text_ = NULL; |
304 features_ = NULL; | 278 features_ = NULL; |
305 done_callback_.Reset(); | 279 done_callback_.Reset(); |
306 state_.reset(NULL); | 280 state_.reset(NULL); |
307 negative_word_cache_.Clear(); | 281 negative_word_cache_.Clear(); |
308 } | 282 } |
309 | 283 |
310 } // namespace safe_browsing | 284 } // namespace safe_browsing |
OLD | NEW |