chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc - Issue 266883010: Refactor code to avoid direct dependency upon ICU: phishing_term_feature_extractor

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 266883010: Refactor code to avoid direct dependency upon ICU: phishing_term_feature_extractor (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Remove unused destructor Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"	5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

6	6

7 #include <list>	7 #include <list>

8 #include <map>	8 #include <map>

9	9

10 #include "base/bind.h"	10 #include "base/bind.h"

11 #include "base/compiler_specific.h"	11 #include "base/compiler_specific.h"

	12 #include "base/i18n/break_iterator.h"

12 #include "base/i18n/case_conversion.h"	13 #include "base/i18n/case_conversion.h"

13 #include "base/logging.h"	14 #include "base/logging.h"

	15 #include "base/memory/scoped_ptr.h"

14 #include "base/message_loop/message_loop.h"	16 #include "base/message_loop/message_loop.h"

15 #include "base/metrics/histogram.h"	17 #include "base/metrics/histogram.h"

16 #include "base/strings/utf_string_conversions.h"	18 #include "base/strings/utf_string_conversions.h"

17 #include "base/time/time.h"	19 #include "base/time/time.h"

18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"	20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"

19 #include "chrome/renderer/safe_browsing/features.h"	21 #include "chrome/renderer/safe_browsing/features.h"

20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"	22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"

21 #include "crypto/sha2.h"	23 #include "crypto/sha2.h"

22 #include "third_party/icu/source/common/unicode/ubrk.h"

23 #include "ui/base/l10n/l10n_util.h"	24 #include "ui/base/l10n/l10n_util.h"

24	25

25 namespace safe_browsing {	26 namespace safe_browsing {

26	27

27 // This time should be short enough that it doesn't noticeably disrupt the	28 // This time should be short enough that it doesn't noticeably disrupt the

28 // user's interaction with the page.	29 // user's interaction with the page.

29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;	30 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

30	31

31 // Experimenting shows that we get a reasonable gain in performance by	32 // Experimenting shows that we get a reasonable gain in performance by

32 // increasing this up to around 10, but there's not much benefit in	33 // increasing this up to around 10, but there's not much benefit in

(...skipping 11 matching lines...) Expand all Loading...
44 struct PhishingTermFeatureExtractor::ExtractionState {	45 struct PhishingTermFeatureExtractor::ExtractionState {

45 // Stores up to max_words_per_term_ previous words separated by spaces.	46 // Stores up to max_words_per_term_ previous words separated by spaces.

46 std::string previous_words;	47 std::string previous_words;

47	48

48 // Stores the sizes of the words in previous_words. Note: the size includes	49 // Stores the sizes of the words in previous_words. Note: the size includes

49 // the space after each word. In other words, the sum of all sizes in this	50 // the space after each word. In other words, the sum of all sizes in this

50 // list is equal to the length of previous_words.	51 // list is equal to the length of previous_words.

51 std::list<size_t> previous_word_sizes;	52 std::list<size_t> previous_word_sizes;

52	53

53 // An iterator for word breaking.	54 // An iterator for word breaking.

54 UBreakIterator* iterator;	55 scoped_ptr<base::i18n::BreakIterator> iterator;

55

56 // Our current position in the text that was passed to the ExtractionState

57 // constructor, specifically, the most recent break position returned by our

58 // iterator.

59 int position;

60

61 // True if position has been initialized.

62 bool position_initialized;

63	56

64 // The time at which we started feature extraction for the current page.	57 // The time at which we started feature extraction for the current page.

65 base::TimeTicks start_time;	58 base::TimeTicks start_time;

66	59

67 // The number of iterations we've done for the current extraction.	60 // The number of iterations we've done for the current extraction.

68 int num_iterations;	61 int num_iterations;

69	62

70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)	63 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)

71 : position(-1),	64 : start_time(start_time_ticks),

72 position_initialized(false),

73 start_time(start_time_ticks),

74 num_iterations(0) {	65 num_iterations(0) {

75 UErrorCode status = U_ZERO_ERROR;

76 // TODO(bryner): We should pass in the language for the document.

77 iterator = ubrk_open(UBRK_WORD, NULL,

78 text.data(), text.size(),

79 &status);

80 if (U_FAILURE(status)) {

81 DLOG(ERROR) << "ubrk_open failed: " << status;

82 iterator = NULL;

83 }

84 }

85	66

86 ~ExtractionState() {	67 scoped_ptr<base::i18n::BreakIterator> i(

87 if (iterator) {	68 new base::i18n::BreakIterator(

88 ubrk_close(iterator);	69 text, base::i18n::BreakIterator::BREAK_WORD));

	70

	71 if (i->Init()) {

	72 iterator = i.Pass();

	73 } else {

	74 DLOG(ERROR) << "failed to open iterator";

89 }	75 }

90 }	76 }

91 };	77 };

92	78

93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(	79 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(

94 const base::hash_set<std::string>* page_term_hashes,	80 const base::hash_set<std::string>* page_term_hashes,

95 const base::hash_set<uint32>* page_word_hashes,	81 const base::hash_set<uint32>* page_word_hashes,

96 size_t max_words_per_term,	82 size_t max_words_per_term,

97 uint32 murmurhash3_seed,	83 uint32 murmurhash3_seed,

98 FeatureExtractorClock* clock)	84 FeatureExtractorClock* clock)

(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
138 // Cancel any pending callbacks, and clear our state.	124 // Cancel any pending callbacks, and clear our state.

139 weak_factory_.InvalidateWeakPtrs();	125 weak_factory_.InvalidateWeakPtrs();

140 Clear();	126 Clear();

141 }	127 }

142	128

143 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {	129 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {

144 DCHECK(state_.get());	130 DCHECK(state_.get());

145 ++state_->num_iterations;	131 ++state_->num_iterations;

146 base::TimeTicks current_chunk_start_time = clock_->Now();	132 base::TimeTicks current_chunk_start_time = clock_->Now();

147	133

148 if (!state_->iterator) {	134 if (!state_->iterator.get()) {

149 // We failed to initialize the break iterator, so stop now.	135 // We failed to initialize the break iterator, so stop now.

150 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);	136 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);

151 RunCallback(false);	137 RunCallback(false);

152 return;	138 return;

153 }	139 }

154	140

155 if (!state_->position_initialized) {

156 state_->position = ubrk_first(state_->iterator);

157 if (state_->position == UBRK_DONE) {

158 // No words present, so we're done.

159 RunCallback(true);

160 return;

161 }

162 state_->position_initialized = true;

163 }

164

165 int num_words = 0;	141 int num_words = 0;

166 for (int next = ubrk_next(state_->iterator);	142 while (state_->iterator->Advance()) {

167 next != UBRK_DONE; next = ubrk_next(state_->iterator)) {	143 if (state_->iterator->IsWord()) {

168 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {	144 const size_t start = state_->iterator->prev();

169 // next is now positioned at the end of a word.	145 const size_t length = state_->iterator->pos() - start;

170 HandleWord(base::StringPiece16(page_text_->data() + state_->position,	146 HandleWord(base::StringPiece16(page_text_->data() + start, length));

171 next - state_->position));

172 ++num_words;	147 ++num_words;

173 }	148 }

174 state_->position = next;

175	149

176 if (num_words >= kClockCheckGranularity) {	150 if (num_words >= kClockCheckGranularity) {

177 num_words = 0;	151 num_words = 0;

178 base::TimeTicks now = clock_->Now();	152 base::TimeTicks now = clock_->Now();

179 if (now - state_->start_time >=	153 if (now - state_->start_time >=

180 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {	154 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {

181 DLOG(ERROR) << "Feature extraction took too long, giving up";	155 DLOG(ERROR) << "Feature extraction took too long, giving up";

182 // We expect this to happen infrequently, so record when it does.	156 // We expect this to happen infrequently, so record when it does.

183 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);	157 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);

184 RunCallback(false);	158 RunCallback(false);

(...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
301	275

302 void PhishingTermFeatureExtractor::Clear() {	276 void PhishingTermFeatureExtractor::Clear() {

303 page_text_ = NULL;	277 page_text_ = NULL;

304 features_ = NULL;	278 features_ = NULL;

305 done_callback_.Reset();	279 done_callback_.Reset();

306 state_.reset(NULL);	280 state_.reset(NULL);

307 negative_word_cache_.Clear();	281 negative_word_cache_.Clear();

308 }	282 }

309	283

310 } // namespace safe_browsing	284 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »