OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <list> | 7 #include <list> |
8 #include <map> | 8 #include <map> |
9 | 9 |
10 #include "base/bind.h" | 10 #include "base/bind.h" |
11 #include "base/compiler_specific.h" | 11 #include "base/compiler_specific.h" |
12 #include "base/i18n/break_iterator.h" | |
12 #include "base/i18n/case_conversion.h" | 13 #include "base/i18n/case_conversion.h" |
13 #include "base/logging.h" | 14 #include "base/logging.h" |
14 #include "base/message_loop/message_loop.h" | 15 #include "base/message_loop/message_loop.h" |
15 #include "base/metrics/histogram.h" | 16 #include "base/metrics/histogram.h" |
16 #include "base/strings/utf_string_conversions.h" | 17 #include "base/strings/utf_string_conversions.h" |
17 #include "base/time/time.h" | 18 #include "base/time/time.h" |
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 19 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
19 #include "chrome/renderer/safe_browsing/features.h" | 20 #include "chrome/renderer/safe_browsing/features.h" |
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 21 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
21 #include "crypto/sha2.h" | 22 #include "crypto/sha2.h" |
22 #include "third_party/icu/source/common/unicode/ubrk.h" | |
23 #include "ui/base/l10n/l10n_util.h" | 23 #include "ui/base/l10n/l10n_util.h" |
24 | 24 |
25 namespace safe_browsing { | 25 namespace safe_browsing { |
26 | 26 |
27 // This time should be short enough that it doesn't noticeably disrupt the | 27 // This time should be short enough that it doesn't noticeably disrupt the |
28 // user's interaction with the page. | 28 // user's interaction with the page. |
29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; | 29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; |
30 | 30 |
31 // Experimenting shows that we get a reasonable gain in performance by | 31 // Experimenting shows that we get a reasonable gain in performance by |
32 // increasing this up to around 10, but there's not much benefit in | 32 // increasing this up to around 10, but there's not much benefit in |
(...skipping 11 matching lines...) Expand all Loading... | |
44 struct PhishingTermFeatureExtractor::ExtractionState { | 44 struct PhishingTermFeatureExtractor::ExtractionState { |
45 // Stores up to max_words_per_term_ previous words separated by spaces. | 45 // Stores up to max_words_per_term_ previous words separated by spaces. |
46 std::string previous_words; | 46 std::string previous_words; |
47 | 47 |
48 // Stores the sizes of the words in previous_words. Note: the size includes | 48 // Stores the sizes of the words in previous_words. Note: the size includes |
49 // the space after each word. In other words, the sum of all sizes in this | 49 // the space after each word. In other words, the sum of all sizes in this |
50 // list is equal to the length of previous_words. | 50 // list is equal to the length of previous_words. |
51 std::list<size_t> previous_word_sizes; | 51 std::list<size_t> previous_word_sizes; |
52 | 52 |
53 // An iterator for word breaking. | 53 // An iterator for word breaking. |
54 UBreakIterator* iterator; | 54 base::i18n::BreakIterator* iterator; |
mattm
2014/05/07 00:37:03
scoped_ptr
Andrew Hayden (chromium.org)
2014/05/07 15:31:58
Done.
| |
55 | 55 |
56 // Our current position in the text that was passed to the ExtractionState | 56 // True until we have advanced once. |
57 // constructor, speciailly, the most recent break position returned by our | 57 bool initial; |
58 // iterator. | |
59 int position; | |
60 | |
61 // True if position has been initialized. | |
62 bool position_initialized; | |
63 | 58 |
64 // The time at which we started feature extraction for the current page. | 59 // The time at which we started feature extraction for the current page. |
65 base::TimeTicks start_time; | 60 base::TimeTicks start_time; |
66 | 61 |
67 // The number of iterations we've done for the current extraction. | 62 // The number of iterations we've done for the current extraction. |
68 int num_iterations; | 63 int num_iterations; |
69 | 64 |
65 // Stored position from the last time we advanced the iterator. | |
66 size_t last_position; | |
67 | |
70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) | 68 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) |
71 : position(-1), | 69 : initial(true), |
72 position_initialized(false), | |
73 start_time(start_time_ticks), | 70 start_time(start_time_ticks), |
74 num_iterations(0) { | 71 num_iterations(0) { |
75 UErrorCode status = U_ZERO_ERROR; | 72 iterator = new base::i18n::BreakIterator( |
76 // TODO(bryner): We should pass in the language for the document. | 73 text, base::i18n::BreakIterator::BREAK_WORD); |
77 iterator = ubrk_open(UBRK_WORD, NULL, | 74 |
78 text.data(), text.size(), | 75 if (!iterator->Init()) { |
79 &status); | 76 DLOG(ERROR) << "failed to open iterator"; |
80 if (U_FAILURE(status)) { | 77 delete iterator; |
81 DLOG(ERROR) << "ubrk_open failed: " << status; | |
82 iterator = NULL; | 78 iterator = NULL; |
83 } | 79 } |
80 last_position = iterator->pos(); | |
84 } | 81 } |
85 | 82 |
86 ~ExtractionState() { | 83 ~ExtractionState() { |
87 if (iterator) { | 84 delete iterator; |
88 ubrk_close(iterator); | 85 iterator = NULL; |
89 } | |
90 } | 86 } |
91 }; | 87 }; |
92 | 88 |
93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 89 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
94 const base::hash_set<std::string>* page_term_hashes, | 90 const base::hash_set<std::string>* page_term_hashes, |
95 const base::hash_set<uint32>* page_word_hashes, | 91 const base::hash_set<uint32>* page_word_hashes, |
96 size_t max_words_per_term, | 92 size_t max_words_per_term, |
97 uint32 murmurhash3_seed, | 93 uint32 murmurhash3_seed, |
98 FeatureExtractorClock* clock) | 94 FeatureExtractorClock* clock) |
99 : page_term_hashes_(page_term_hashes), | 95 : page_term_hashes_(page_term_hashes), |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
145 ++state_->num_iterations; | 141 ++state_->num_iterations; |
146 base::TimeTicks current_chunk_start_time = clock_->Now(); | 142 base::TimeTicks current_chunk_start_time = clock_->Now(); |
147 | 143 |
148 if (!state_->iterator) { | 144 if (!state_->iterator) { |
149 // We failed to initialize the break iterator, so stop now. | 145 // We failed to initialize the break iterator, so stop now. |
150 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); | 146 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); |
151 RunCallback(false); | 147 RunCallback(false); |
152 return; | 148 return; |
153 } | 149 } |
154 | 150 |
155 if (!state_->position_initialized) { | 151 if (state_->initial) { |
156 state_->position = ubrk_first(state_->iterator); | 152 if (!state_->iterator->IsValid()) { |
mattm
2014/05/07 00:37:03
Looks like this doesn't work, since BreakIterator
Andrew Hayden (chromium.org)
2014/05/07 15:31:58
I had the same thought and went off to investigate
| |
157 if (state_->position == UBRK_DONE) { | |
158 // No words present, so we're done. | 153 // No words present, so we're done. |
159 RunCallback(true); | 154 RunCallback(true); |
160 return; | 155 return; |
161 } | 156 } |
162 state_->position_initialized = true; | 157 state_->initial = false; |
163 } | 158 } |
164 | 159 |
165 int num_words = 0; | 160 int num_words = 0; |
166 for (int next = ubrk_next(state_->iterator); | 161 while (state_->iterator->Advance()) { |
167 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { | 162 int next = state_->iterator->pos(); |
168 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { | 163 if (state_->iterator->IsEndOfWord(next)) { |
169 // next is now positioned at the end of a word. | 164 // next is now positioned at the end of a word. |
170 HandleWord(base::StringPiece16(page_text_->data() + state_->position, | 165 HandleWord(base::StringPiece16(page_text_->data() + state_->last_position, |
171 next - state_->position)); | 166 next - state_->last_position)); |
172 ++num_words; | 167 ++num_words; |
173 } | 168 } |
174 state_->position = next; | 169 state_->last_position = next; |
175 | 170 |
mattm
2014/05/07 00:37:03
I think all of this could be simplified by using t
Andrew Hayden (chromium.org)
2014/05/07 15:31:58
Seems reasonable, done.
| |
176 if (num_words >= kClockCheckGranularity) { | 171 if (num_words >= kClockCheckGranularity) { |
177 num_words = 0; | 172 num_words = 0; |
178 base::TimeTicks now = clock_->Now(); | 173 base::TimeTicks now = clock_->Now(); |
179 if (now - state_->start_time >= | 174 if (now - state_->start_time >= |
180 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | 175 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
181 DLOG(ERROR) << "Feature extraction took too long, giving up"; | 176 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
182 // We expect this to happen infrequently, so record when it does. | 177 // We expect this to happen infrequently, so record when it does. |
183 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); | 178 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); |
184 RunCallback(false); | 179 RunCallback(false); |
185 return; | 180 return; |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
301 | 296 |
302 void PhishingTermFeatureExtractor::Clear() { | 297 void PhishingTermFeatureExtractor::Clear() { |
303 page_text_ = NULL; | 298 page_text_ = NULL; |
304 features_ = NULL; | 299 features_ = NULL; |
305 done_callback_.Reset(); | 300 done_callback_.Reset(); |
306 state_.reset(NULL); | 301 state_.reset(NULL); |
307 negative_word_cache_.Clear(); | 302 negative_word_cache_.Clear(); |
308 } | 303 } |
309 | 304 |
310 } // namespace safe_browsing | 305 } // namespace safe_browsing |
OLD | NEW |