Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(19)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 266883010: Refactor code to avoid direct dependency upon ICU: phishing_term_feature_extractor (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « base/i18n/break_iterator.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6 6
7 #include <list> 7 #include <list>
8 #include <map> 8 #include <map>
9 9
10 #include "base/bind.h" 10 #include "base/bind.h"
11 #include "base/compiler_specific.h" 11 #include "base/compiler_specific.h"
12 #include "base/i18n/break_iterator.h"
12 #include "base/i18n/case_conversion.h" 13 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h" 14 #include "base/logging.h"
14 #include "base/message_loop/message_loop.h" 15 #include "base/message_loop/message_loop.h"
15 #include "base/metrics/histogram.h" 16 #include "base/metrics/histogram.h"
16 #include "base/strings/utf_string_conversions.h" 17 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h" 18 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 19 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h" 20 #include "chrome/renderer/safe_browsing/features.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" 21 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "crypto/sha2.h" 22 #include "crypto/sha2.h"
22 #include "third_party/icu/source/common/unicode/ubrk.h"
23 #include "ui/base/l10n/l10n_util.h" 23 #include "ui/base/l10n/l10n_util.h"
24 24
25 namespace safe_browsing { 25 namespace safe_browsing {
26 26
27 // This time should be short enough that it doesn't noticeably disrupt the 27 // This time should be short enough that it doesn't noticeably disrupt the
28 // user's interaction with the page. 28 // user's interaction with the page.
29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; 29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;
30 30
31 // Experimenting shows that we get a reasonable gain in performance by 31 // Experimenting shows that we get a reasonable gain in performance by
32 // increasing this up to around 10, but there's not much benefit in 32 // increasing this up to around 10, but there's not much benefit in
(...skipping 11 matching lines...) Expand all
44 struct PhishingTermFeatureExtractor::ExtractionState { 44 struct PhishingTermFeatureExtractor::ExtractionState {
45 // Stores up to max_words_per_term_ previous words separated by spaces. 45 // Stores up to max_words_per_term_ previous words separated by spaces.
46 std::string previous_words; 46 std::string previous_words;
47 47
48 // Stores the sizes of the words in previous_words. Note: the size includes 48 // Stores the sizes of the words in previous_words. Note: the size includes
49 // the space after each word. In other words, the sum of all sizes in this 49 // the space after each word. In other words, the sum of all sizes in this
50 // list is equal to the length of previous_words. 50 // list is equal to the length of previous_words.
51 std::list<size_t> previous_word_sizes; 51 std::list<size_t> previous_word_sizes;
52 52
53 // An iterator for word breaking. 53 // An iterator for word breaking.
54 UBreakIterator* iterator; 54 base::i18n::BreakIterator* iterator;
mattm 2014/05/07 00:37:03 scoped_ptr
Andrew Hayden (chromium.org) 2014/05/07 15:31:58 Done.
55 55
56 // Our current position in the text that was passed to the ExtractionState 56 // True until we have advanced once.
57 // constructor, speciailly, the most recent break position returned by our 57 bool initial;
58 // iterator.
59 int position;
60
61 // True if position has been initialized.
62 bool position_initialized;
63 58
64 // The time at which we started feature extraction for the current page. 59 // The time at which we started feature extraction for the current page.
65 base::TimeTicks start_time; 60 base::TimeTicks start_time;
66 61
67 // The number of iterations we've done for the current extraction. 62 // The number of iterations we've done for the current extraction.
68 int num_iterations; 63 int num_iterations;
69 64
65 // Stored position from the last time we advanced the iterator.
66 size_t last_position;
67
70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) 68 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
71 : position(-1), 69 : initial(true),
72 position_initialized(false),
73 start_time(start_time_ticks), 70 start_time(start_time_ticks),
74 num_iterations(0) { 71 num_iterations(0) {
75 UErrorCode status = U_ZERO_ERROR; 72 iterator = new base::i18n::BreakIterator(
76 // TODO(bryner): We should pass in the language for the document. 73 text, base::i18n::BreakIterator::BREAK_WORD);
77 iterator = ubrk_open(UBRK_WORD, NULL, 74
78 text.data(), text.size(), 75 if (!iterator->Init()) {
79 &status); 76 DLOG(ERROR) << "failed to open iterator";
80 if (U_FAILURE(status)) { 77 delete iterator;
81 DLOG(ERROR) << "ubrk_open failed: " << status;
82 iterator = NULL; 78 iterator = NULL;
83 } 79 }
80 last_position = iterator->pos();
84 } 81 }
85 82
86 ~ExtractionState() { 83 ~ExtractionState() {
87 if (iterator) { 84 delete iterator;
88 ubrk_close(iterator); 85 iterator = NULL;
89 }
90 } 86 }
91 }; 87 };
92 88
93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( 89 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
94 const base::hash_set<std::string>* page_term_hashes, 90 const base::hash_set<std::string>* page_term_hashes,
95 const base::hash_set<uint32>* page_word_hashes, 91 const base::hash_set<uint32>* page_word_hashes,
96 size_t max_words_per_term, 92 size_t max_words_per_term,
97 uint32 murmurhash3_seed, 93 uint32 murmurhash3_seed,
98 FeatureExtractorClock* clock) 94 FeatureExtractorClock* clock)
99 : page_term_hashes_(page_term_hashes), 95 : page_term_hashes_(page_term_hashes),
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
145 ++state_->num_iterations; 141 ++state_->num_iterations;
146 base::TimeTicks current_chunk_start_time = clock_->Now(); 142 base::TimeTicks current_chunk_start_time = clock_->Now();
147 143
148 if (!state_->iterator) { 144 if (!state_->iterator) {
149 // We failed to initialize the break iterator, so stop now. 145 // We failed to initialize the break iterator, so stop now.
150 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); 146 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
151 RunCallback(false); 147 RunCallback(false);
152 return; 148 return;
153 } 149 }
154 150
155 if (!state_->position_initialized) { 151 if (state_->initial) {
156 state_->position = ubrk_first(state_->iterator); 152 if (!state_->iterator->IsValid()) {
mattm 2014/05/07 00:37:03 Looks like this doesn't work, since BreakIterator
Andrew Hayden (chromium.org) 2014/05/07 15:31:58 I had the same thought and went off to investigate
157 if (state_->position == UBRK_DONE) {
158 // No words present, so we're done. 153 // No words present, so we're done.
159 RunCallback(true); 154 RunCallback(true);
160 return; 155 return;
161 } 156 }
162 state_->position_initialized = true; 157 state_->initial = false;
163 } 158 }
164 159
165 int num_words = 0; 160 int num_words = 0;
166 for (int next = ubrk_next(state_->iterator); 161 while (state_->iterator->Advance()) {
167 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { 162 int next = state_->iterator->pos();
168 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { 163 if (state_->iterator->IsEndOfWord(next)) {
169 // next is now positioned at the end of a word. 164 // next is now positioned at the end of a word.
170 HandleWord(base::StringPiece16(page_text_->data() + state_->position, 165 HandleWord(base::StringPiece16(page_text_->data() + state_->last_position,
171 next - state_->position)); 166 next - state_->last_position));
172 ++num_words; 167 ++num_words;
173 } 168 }
174 state_->position = next; 169 state_->last_position = next;
175 170
mattm 2014/05/07 00:37:03 I think all of this could be simplified by using t
Andrew Hayden (chromium.org) 2014/05/07 15:31:58 Seems reasonable, done.
176 if (num_words >= kClockCheckGranularity) { 171 if (num_words >= kClockCheckGranularity) {
177 num_words = 0; 172 num_words = 0;
178 base::TimeTicks now = clock_->Now(); 173 base::TimeTicks now = clock_->Now();
179 if (now - state_->start_time >= 174 if (now - state_->start_time >=
180 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { 175 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
181 DLOG(ERROR) << "Feature extraction took too long, giving up"; 176 DLOG(ERROR) << "Feature extraction took too long, giving up";
182 // We expect this to happen infrequently, so record when it does. 177 // We expect this to happen infrequently, so record when it does.
183 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); 178 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
184 RunCallback(false); 179 RunCallback(false);
185 return; 180 return;
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
301 296
302 void PhishingTermFeatureExtractor::Clear() { 297 void PhishingTermFeatureExtractor::Clear() {
303 page_text_ = NULL; 298 page_text_ = NULL;
304 features_ = NULL; 299 features_ = NULL;
305 done_callback_.Reset(); 300 done_callback_.Reset();
306 state_.reset(NULL); 301 state_.reset(NULL);
307 negative_word_cache_.Clear(); 302 negative_word_cache_.Clear();
308 } 303 }
309 304
310 } // namespace safe_browsing 305 } // namespace safe_browsing
OLDNEW
« no previous file with comments | « base/i18n/break_iterator.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698