chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc - Issue 266883010: Refactor code to avoid direct dependency upon ICU: phishing_term_feature_extractor

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 266883010: Refactor code to avoid direct dependency upon ICU: phishing_term_feature_extractor (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"	5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

6	6

7 #include <list>	7 #include <list>

8 #include <map>	8 #include <map>

9	9

10 #include "base/bind.h"	10 #include "base/bind.h"

11 #include "base/compiler_specific.h"	11 #include "base/compiler_specific.h"

	12 #include "base/i18n/break_iterator.h"

12 #include "base/i18n/case_conversion.h"	13 #include "base/i18n/case_conversion.h"

13 #include "base/logging.h"	14 #include "base/logging.h"

14 #include "base/message_loop/message_loop.h"	15 #include "base/message_loop/message_loop.h"

15 #include "base/metrics/histogram.h"	16 #include "base/metrics/histogram.h"

16 #include "base/strings/utf_string_conversions.h"	17 #include "base/strings/utf_string_conversions.h"

17 #include "base/time/time.h"	18 #include "base/time/time.h"

18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"	19 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"

19 #include "chrome/renderer/safe_browsing/features.h"	20 #include "chrome/renderer/safe_browsing/features.h"

20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"	21 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"

21 #include "crypto/sha2.h"	22 #include "crypto/sha2.h"

22 #include "third_party/icu/source/common/unicode/ubrk.h"

23 #include "ui/base/l10n/l10n_util.h"	23 #include "ui/base/l10n/l10n_util.h"

24	24

25 namespace safe_browsing {	25 namespace safe_browsing {

26	26

27 // This time should be short enough that it doesn't noticeably disrupt the	27 // This time should be short enough that it doesn't noticeably disrupt the

28 // user's interaction with the page.	28 // user's interaction with the page.

29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;	29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

30	30

31 // Experimenting shows that we get a reasonable gain in performance by	31 // Experimenting shows that we get a reasonable gain in performance by

32 // increasing this up to around 10, but there's not much benefit in	32 // increasing this up to around 10, but there's not much benefit in

(...skipping 11 matching lines...) Expand all Loading...
44 struct PhishingTermFeatureExtractor::ExtractionState {	44 struct PhishingTermFeatureExtractor::ExtractionState {

45 // Stores up to max_words_per_term_ previous words separated by spaces.	45 // Stores up to max_words_per_term_ previous words separated by spaces.

46 std::string previous_words;	46 std::string previous_words;

47	47

48 // Stores the sizes of the words in previous_words. Note: the size includes	48 // Stores the sizes of the words in previous_words. Note: the size includes

49 // the space after each word. In other words, the sum of all sizes in this	49 // the space after each word. In other words, the sum of all sizes in this

50 // list is equal to the length of previous_words.	50 // list is equal to the length of previous_words.

51 std::list<size_t> previous_word_sizes;	51 std::list<size_t> previous_word_sizes;

52	52

53 // An iterator for word breaking.	53 // An iterator for word breaking.

54 UBreakIterator* iterator;	54 base::i18n::BreakIterator* iterator;
	mattm 2014/05/07 00:37:03 scoped_ptr scoped_ptr Andrew Hayden (chromium.org) 2014/05/07 15:31:58 Done. Show quoted text On 2014/05/07 00:37:03, mattm wrote: > scoped_ptr Done.
55	55

56 // Our current position in the text that was passed to the ExtractionState	56 // True until we have advanced once.

57 // constructor, speciailly, the most recent break position returned by our	57 bool initial;

58 // iterator.

59 int position;

60

61 // True if position has been initialized.

62 bool position_initialized;

63	58

64 // The time at which we started feature extraction for the current page.	59 // The time at which we started feature extraction for the current page.

65 base::TimeTicks start_time;	60 base::TimeTicks start_time;

66	61

67 // The number of iterations we've done for the current extraction.	62 // The number of iterations we've done for the current extraction.

68 int num_iterations;	63 int num_iterations;

69	64

	65 // Stored position from the last time we advanced the iterator.

	66 size_t last_position;

	67

70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)	68 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)

71 : position(-1),	69 : initial(true),

72 position_initialized(false),

73 start_time(start_time_ticks),	70 start_time(start_time_ticks),

74 num_iterations(0) {	71 num_iterations(0) {

75 UErrorCode status = U_ZERO_ERROR;	72 iterator = new base::i18n::BreakIterator(

76 // TODO(bryner): We should pass in the language for the document.	73 text, base::i18n::BreakIterator::BREAK_WORD);

77 iterator = ubrk_open(UBRK_WORD, NULL,	74

78 text.data(), text.size(),	75 if (!iterator->Init()) {

79 &status);	76 DLOG(ERROR) << "failed to open iterator";

80 if (U_FAILURE(status)) {	77 delete iterator;

81 DLOG(ERROR) << "ubrk_open failed: " << status;

82 iterator = NULL;	78 iterator = NULL;

83 }	79 }

	80 last_position = iterator->pos();

84 }	81 }

85	82

86 ~ExtractionState() {	83 ~ExtractionState() {

87 if (iterator) {	84 delete iterator;

88 ubrk_close(iterator);	85 iterator = NULL;

89 }

90 }	86 }

91 };	87 };

92	88

93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(	89 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(

94 const base::hash_set<std::string>* page_term_hashes,	90 const base::hash_set<std::string>* page_term_hashes,

95 const base::hash_set<uint32>* page_word_hashes,	91 const base::hash_set<uint32>* page_word_hashes,

96 size_t max_words_per_term,	92 size_t max_words_per_term,

97 uint32 murmurhash3_seed,	93 uint32 murmurhash3_seed,

98 FeatureExtractorClock* clock)	94 FeatureExtractorClock* clock)

99 : page_term_hashes_(page_term_hashes),	95 : page_term_hashes_(page_term_hashes),

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
145 ++state_->num_iterations;	141 ++state_->num_iterations;

146 base::TimeTicks current_chunk_start_time = clock_->Now();	142 base::TimeTicks current_chunk_start_time = clock_->Now();

147	143

148 if (!state_->iterator) {	144 if (!state_->iterator) {

149 // We failed to initialize the break iterator, so stop now.	145 // We failed to initialize the break iterator, so stop now.

150 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);	146 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);

151 RunCallback(false);	147 RunCallback(false);

152 return;	148 return;

153 }	149 }

154	150

155 if (!state_->position_initialized) {	151 if (state_->initial) {

156 state_->position = ubrk_first(state_->iterator);	152 if (!state_->iterator->IsValid()) {
	mattm 2014/05/07 00:37:03 Looks like this doesn't work, since BreakIterator Looks like this doesn't work, since BreakIterator doesn't initialize pos_ from ubrk_first: https://code.google.com/p/chromium/codesearch#chromium/src/base/i18n/break_it... Andrew Hayden (chromium.org) 2014/05/07 15:31:58 I had the same thought and went off to investigate Show quoted text On 2014/05/07 00:37:03, mattm wrote: > Looks like this doesn't work, since BreakIterator doesn't initialize pos_ from > ubrk_first: > https://code.google.com/p/chromium/codesearch#chromium/src/base/i18n/break_it... I had the same thought and went off to investigate. The results were surprising: https://code.google.com/p/chromium/issues/detail?id=370422 Anyhow, this whole thing is useless since we're using Advance, which will return false if it ends up being off the end of the string. So we can throw this entire chunk away and rely on the while loop.
157 if (state_->position == UBRK_DONE) {

158 // No words present, so we're done.	153 // No words present, so we're done.

159 RunCallback(true);	154 RunCallback(true);

160 return;	155 return;

161 }	156 }

162 state_->position_initialized = true;	157 state_->initial = false;

163 }	158 }

164	159

165 int num_words = 0;	160 int num_words = 0;

166 for (int next = ubrk_next(state_->iterator);	161 while (state_->iterator->Advance()) {

167 next != UBRK_DONE; next = ubrk_next(state_->iterator)) {	162 int next = state_->iterator->pos();

168 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {	163 if (state_->iterator->IsEndOfWord(next)) {

169 // next is now positioned at the end of a word.	164 // next is now positioned at the end of a word.

170 HandleWord(base::StringPiece16(page_text_->data() + state_->position,	165 HandleWord(base::StringPiece16(page_text_->data() + state_->last_position,

171 next - state_->position));	166 next - state_->last_position));

172 ++num_words;	167 ++num_words;

173 }	168 }

174 state_->position = next;	169 state_->last_position = next;

175	170
	mattm 2014/05/07 00:37:03 I think all of this could be simplified by using t I think all of this could be simplified by using the BreakIterator idiomatically instead of just mechanically converting the code. If you just use IsWord(), prev(), pos(), it seems like keeping track of next, initial and last_position is unnecessary. Andrew Hayden (chromium.org) 2014/05/07 15:31:58 Seems reasonable, done. Show quoted text On 2014/05/07 00:37:03, mattm wrote: > I think all of this could be simplified by using the BreakIterator idiomatically > instead of just mechanically converting the code. If you just use IsWord(), > prev(), pos(), it seems like keeping track of next, initial and last_position is > unnecessary. Seems reasonable, done.
176 if (num_words >= kClockCheckGranularity) {	171 if (num_words >= kClockCheckGranularity) {

177 num_words = 0;	172 num_words = 0;

178 base::TimeTicks now = clock_->Now();	173 base::TimeTicks now = clock_->Now();

179 if (now - state_->start_time >=	174 if (now - state_->start_time >=

180 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {	175 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {

181 DLOG(ERROR) << "Feature extraction took too long, giving up";	176 DLOG(ERROR) << "Feature extraction took too long, giving up";

182 // We expect this to happen infrequently, so record when it does.	177 // We expect this to happen infrequently, so record when it does.

183 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);	178 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);

184 RunCallback(false);	179 RunCallback(false);

185 return;	180 return;

(...skipping 115 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
301	296

302 void PhishingTermFeatureExtractor::Clear() {	297 void PhishingTermFeatureExtractor::Clear() {

303 page_text_ = NULL;	298 page_text_ = NULL;

304 features_ = NULL;	299 features_ = NULL;

305 done_callback_.Reset();	300 done_callback_.Reset();

306 state_.reset(NULL);	301 state_.reset(NULL);

307 negative_word_cache_.Clear();	302 negative_word_cache_.Clear();

308 }	303 }

309	304

310 } // namespace safe_browsing	305 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « base/i18n/break_iterator.h ('k') | no next file » | no next file with comments »