Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1222)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 266883010: Refactor code to avoid direct dependency upon ICU: phishing_term_feature_extractor (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Remove unused destructor Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6 6
7 #include <list> 7 #include <list>
8 #include <map> 8 #include <map>
9 9
10 #include "base/bind.h" 10 #include "base/bind.h"
11 #include "base/compiler_specific.h" 11 #include "base/compiler_specific.h"
12 #include "base/i18n/break_iterator.h"
12 #include "base/i18n/case_conversion.h" 13 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h" 14 #include "base/logging.h"
15 #include "base/memory/scoped_ptr.h"
14 #include "base/message_loop/message_loop.h" 16 #include "base/message_loop/message_loop.h"
15 #include "base/metrics/histogram.h" 17 #include "base/metrics/histogram.h"
16 #include "base/strings/utf_string_conversions.h" 18 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h" 19 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h" 21 #include "chrome/renderer/safe_browsing/features.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" 22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "crypto/sha2.h" 23 #include "crypto/sha2.h"
22 #include "third_party/icu/source/common/unicode/ubrk.h"
23 #include "ui/base/l10n/l10n_util.h" 24 #include "ui/base/l10n/l10n_util.h"
24 25
25 namespace safe_browsing { 26 namespace safe_browsing {
26 27
27 // This time should be short enough that it doesn't noticeably disrupt the 28 // This time should be short enough that it doesn't noticeably disrupt the
28 // user's interaction with the page. 29 // user's interaction with the page.
29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; 30 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;
30 31
31 // Experimenting shows that we get a reasonable gain in performance by 32 // Experimenting shows that we get a reasonable gain in performance by
32 // increasing this up to around 10, but there's not much benefit in 33 // increasing this up to around 10, but there's not much benefit in
(...skipping 11 matching lines...) Expand all
44 struct PhishingTermFeatureExtractor::ExtractionState { 45 struct PhishingTermFeatureExtractor::ExtractionState {
45 // Stores up to max_words_per_term_ previous words separated by spaces. 46 // Stores up to max_words_per_term_ previous words separated by spaces.
46 std::string previous_words; 47 std::string previous_words;
47 48
48 // Stores the sizes of the words in previous_words. Note: the size includes 49 // Stores the sizes of the words in previous_words. Note: the size includes
49 // the space after each word. In other words, the sum of all sizes in this 50 // the space after each word. In other words, the sum of all sizes in this
50 // list is equal to the length of previous_words. 51 // list is equal to the length of previous_words.
51 std::list<size_t> previous_word_sizes; 52 std::list<size_t> previous_word_sizes;
52 53
53 // An iterator for word breaking. 54 // An iterator for word breaking.
54 UBreakIterator* iterator; 55 scoped_ptr<base::i18n::BreakIterator> iterator;
55
56 // Our current position in the text that was passed to the ExtractionState
57 // constructor, specifically, the most recent break position returned by our
58 // iterator.
59 int position;
60
61 // True if position has been initialized.
62 bool position_initialized;
63 56
64 // The time at which we started feature extraction for the current page. 57 // The time at which we started feature extraction for the current page.
65 base::TimeTicks start_time; 58 base::TimeTicks start_time;
66 59
67 // The number of iterations we've done for the current extraction. 60 // The number of iterations we've done for the current extraction.
68 int num_iterations; 61 int num_iterations;
69 62
70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) 63 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
71 : position(-1), 64 : start_time(start_time_ticks),
72 position_initialized(false),
73 start_time(start_time_ticks),
74 num_iterations(0) { 65 num_iterations(0) {
75 UErrorCode status = U_ZERO_ERROR;
76 // TODO(bryner): We should pass in the language for the document.
77 iterator = ubrk_open(UBRK_WORD, NULL,
78 text.data(), text.size(),
79 &status);
80 if (U_FAILURE(status)) {
81 DLOG(ERROR) << "ubrk_open failed: " << status;
82 iterator = NULL;
83 }
84 }
85 66
86 ~ExtractionState() { 67 scoped_ptr<base::i18n::BreakIterator> i(
87 if (iterator) { 68 new base::i18n::BreakIterator(
88 ubrk_close(iterator); 69 text, base::i18n::BreakIterator::BREAK_WORD));
70
71 if (i->Init()) {
72 iterator = i.Pass();
73 } else {
74 DLOG(ERROR) << "failed to open iterator";
89 } 75 }
90 } 76 }
91 }; 77 };
92 78
93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( 79 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
94 const base::hash_set<std::string>* page_term_hashes, 80 const base::hash_set<std::string>* page_term_hashes,
95 const base::hash_set<uint32>* page_word_hashes, 81 const base::hash_set<uint32>* page_word_hashes,
96 size_t max_words_per_term, 82 size_t max_words_per_term,
97 uint32 murmurhash3_seed, 83 uint32 murmurhash3_seed,
98 FeatureExtractorClock* clock) 84 FeatureExtractorClock* clock)
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
138 // Cancel any pending callbacks, and clear our state. 124 // Cancel any pending callbacks, and clear our state.
139 weak_factory_.InvalidateWeakPtrs(); 125 weak_factory_.InvalidateWeakPtrs();
140 Clear(); 126 Clear();
141 } 127 }
142 128
143 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { 129 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
144 DCHECK(state_.get()); 130 DCHECK(state_.get());
145 ++state_->num_iterations; 131 ++state_->num_iterations;
146 base::TimeTicks current_chunk_start_time = clock_->Now(); 132 base::TimeTicks current_chunk_start_time = clock_->Now();
147 133
148 if (!state_->iterator) { 134 if (!state_->iterator.get()) {
149 // We failed to initialize the break iterator, so stop now. 135 // We failed to initialize the break iterator, so stop now.
150 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); 136 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
151 RunCallback(false); 137 RunCallback(false);
152 return; 138 return;
153 } 139 }
154 140
155 if (!state_->position_initialized) {
156 state_->position = ubrk_first(state_->iterator);
157 if (state_->position == UBRK_DONE) {
158 // No words present, so we're done.
159 RunCallback(true);
160 return;
161 }
162 state_->position_initialized = true;
163 }
164
165 int num_words = 0; 141 int num_words = 0;
166 for (int next = ubrk_next(state_->iterator); 142 while (state_->iterator->Advance()) {
167 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { 143 if (state_->iterator->IsWord()) {
168 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { 144 const size_t start = state_->iterator->prev();
169 // next is now positioned at the end of a word. 145 const size_t length = state_->iterator->pos() - start;
170 HandleWord(base::StringPiece16(page_text_->data() + state_->position, 146 HandleWord(base::StringPiece16(page_text_->data() + start, length));
171 next - state_->position));
172 ++num_words; 147 ++num_words;
173 } 148 }
174 state_->position = next;
175 149
176 if (num_words >= kClockCheckGranularity) { 150 if (num_words >= kClockCheckGranularity) {
177 num_words = 0; 151 num_words = 0;
178 base::TimeTicks now = clock_->Now(); 152 base::TimeTicks now = clock_->Now();
179 if (now - state_->start_time >= 153 if (now - state_->start_time >=
180 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { 154 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
181 DLOG(ERROR) << "Feature extraction took too long, giving up"; 155 DLOG(ERROR) << "Feature extraction took too long, giving up";
182 // We expect this to happen infrequently, so record when it does. 156 // We expect this to happen infrequently, so record when it does.
183 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); 157 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
184 RunCallback(false); 158 RunCallback(false);
(...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after
301 275
302 void PhishingTermFeatureExtractor::Clear() { 276 void PhishingTermFeatureExtractor::Clear() {
303 page_text_ = NULL; 277 page_text_ = NULL;
304 features_ = NULL; 278 features_ = NULL;
305 done_callback_.Reset(); 279 done_callback_.Reset();
306 state_.reset(NULL); 280 state_.reset(NULL);
307 negative_word_cache_.Clear(); 281 negative_word_cache_.Clear();
308 } 282 }
309 283
310 } // namespace safe_browsing 284 } // namespace safe_browsing
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698