Chromium Code Reviews| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| index 89994dfd04cf4488d4f4a87689cff92bc8760bb8..ba297f66f1c43d4a8b7c2ca2261a023e3f1cf995 100644 |
| --- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| @@ -9,6 +9,7 @@ |
| #include "base/bind.h" |
| #include "base/compiler_specific.h" |
| +#include "base/i18n/break_iterator.h" |
| #include "base/i18n/case_conversion.h" |
| #include "base/logging.h" |
| #include "base/message_loop/message_loop.h" |
| @@ -19,7 +20,6 @@ |
| #include "chrome/renderer/safe_browsing/features.h" |
| #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
| #include "crypto/sha2.h" |
| -#include "third_party/icu/source/common/unicode/ubrk.h" |
| #include "ui/base/l10n/l10n_util.h" |
| namespace safe_browsing { |
| @@ -51,15 +51,10 @@ struct PhishingTermFeatureExtractor::ExtractionState { |
| std::list<size_t> previous_word_sizes; |
| // An iterator for word breaking. |
| - UBreakIterator* iterator; |
| + base::i18n::BreakIterator* iterator; |
|
mattm
2014/05/07 00:37:03
scoped_ptr
Andrew Hayden (chromium.org)
2014/05/07 15:31:58
Done.
|
| - // Our current position in the text that was passed to the ExtractionState |
| - // constructor, speciailly, the most recent break position returned by our |
| - // iterator. |
| - int position; |
| - |
| - // True if position has been initialized. |
| - bool position_initialized; |
| + // True until we have advanced once. |
| + bool initial; |
| // The time at which we started feature extraction for the current page. |
| base::TimeTicks start_time; |
| @@ -67,26 +62,27 @@ struct PhishingTermFeatureExtractor::ExtractionState { |
| // The number of iterations we've done for the current extraction. |
| int num_iterations; |
| + // Stored position from the last time we advanced the iterator. |
| + size_t last_position; |
| + |
| ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) |
| - : position(-1), |
| - position_initialized(false), |
| + : initial(true), |
| start_time(start_time_ticks), |
| num_iterations(0) { |
| - UErrorCode status = U_ZERO_ERROR; |
| - // TODO(bryner): We should pass in the language for the document. |
| - iterator = ubrk_open(UBRK_WORD, NULL, |
| - text.data(), text.size(), |
| - &status); |
| - if (U_FAILURE(status)) { |
| - DLOG(ERROR) << "ubrk_open failed: " << status; |
| + iterator = new base::i18n::BreakIterator( |
| + text, base::i18n::BreakIterator::BREAK_WORD); |
| + |
| + if (!iterator->Init()) { |
| + DLOG(ERROR) << "failed to open iterator"; |
| + delete iterator; |
| iterator = NULL; |
| } |
| + last_position = iterator->pos(); |
| } |
| ~ExtractionState() { |
| - if (iterator) { |
| - ubrk_close(iterator); |
| - } |
| + delete iterator; |
| + iterator = NULL; |
| } |
| }; |
| @@ -152,26 +148,25 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
| return; |
| } |
| - if (!state_->position_initialized) { |
| - state_->position = ubrk_first(state_->iterator); |
| - if (state_->position == UBRK_DONE) { |
| + if (state_->initial) { |
| + if (!state_->iterator->IsValid()) { |
|
mattm
2014/05/07 00:37:03
Looks like this doesn't work, since BreakIterator
Andrew Hayden (chromium.org)
2014/05/07 15:31:58
I had the same thought and went off to investigate
|
| // No words present, so we're done. |
| RunCallback(true); |
| return; |
| } |
| - state_->position_initialized = true; |
| + state_->initial = false; |
| } |
| int num_words = 0; |
| - for (int next = ubrk_next(state_->iterator); |
| - next != UBRK_DONE; next = ubrk_next(state_->iterator)) { |
| - if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { |
| + while (state_->iterator->Advance()) { |
| + int next = state_->iterator->pos(); |
| + if (state_->iterator->IsEndOfWord(next)) { |
| // next is now positioned at the end of a word. |
| - HandleWord(base::StringPiece16(page_text_->data() + state_->position, |
| - next - state_->position)); |
| + HandleWord(base::StringPiece16(page_text_->data() + state_->last_position, |
| + next - state_->last_position)); |
| ++num_words; |
| } |
| - state_->position = next; |
| + state_->last_position = next; |
|
mattm
2014/05/07 00:37:03
I think all of this could be simplified by using t
Andrew Hayden (chromium.org)
2014/05/07 15:31:58
Seems reasonable, done.
|
| if (num_words >= kClockCheckGranularity) { |
| num_words = 0; |