Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
index 89994dfd04cf4488d4f4a87689cff92bc8760bb8..6fe15d0d422d84b27086404e3e6fadd18a1a50d4 100644 |
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
@@ -9,8 +9,10 @@ |
#include "base/bind.h" |
#include "base/compiler_specific.h" |
+#include "base/i18n/break_iterator.h" |
#include "base/i18n/case_conversion.h" |
#include "base/logging.h" |
+#include "base/memory/scoped_ptr.h" |
#include "base/message_loop/message_loop.h" |
#include "base/metrics/histogram.h" |
#include "base/strings/utf_string_conversions.h" |
@@ -19,7 +21,6 @@ |
#include "chrome/renderer/safe_browsing/features.h" |
#include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
#include "crypto/sha2.h" |
-#include "third_party/icu/source/common/unicode/ubrk.h" |
#include "ui/base/l10n/l10n_util.h" |
namespace safe_browsing { |
@@ -51,15 +52,7 @@ struct PhishingTermFeatureExtractor::ExtractionState { |
std::list<size_t> previous_word_sizes; |
// An iterator for word breaking. |
- UBreakIterator* iterator; |
- |
- // Our current position in the text that was passed to the ExtractionState |
- // constructor, speciailly, the most recent break position returned by our |
- // iterator. |
- int position; |
- |
- // True if position has been initialized. |
- bool position_initialized; |
+ scoped_ptr<base::i18n::BreakIterator> iterator; |
// The time at which we started feature extraction for the current page. |
base::TimeTicks start_time; |
@@ -68,24 +61,17 @@ struct PhishingTermFeatureExtractor::ExtractionState { |
int num_iterations; |
ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) |
- : position(-1), |
- position_initialized(false), |
- start_time(start_time_ticks), |
+ : start_time(start_time_ticks), |
num_iterations(0) { |
- UErrorCode status = U_ZERO_ERROR; |
- // TODO(bryner): We should pass in the language for the document. |
- iterator = ubrk_open(UBRK_WORD, NULL, |
- text.data(), text.size(), |
- &status); |
- if (U_FAILURE(status)) { |
- DLOG(ERROR) << "ubrk_open failed: " << status; |
- iterator = NULL; |
- } |
- } |
- ~ExtractionState() { |
- if (iterator) { |
- ubrk_close(iterator); |
+ scoped_ptr<base::i18n::BreakIterator> i( |
+ new base::i18n::BreakIterator( |
+ text, base::i18n::BreakIterator::BREAK_WORD)); |
+ |
+ if (i->Init()) { |
+ iterator = i.Pass(); |
+ } else { |
+ DLOG(ERROR) << "failed to open iterator"; |
} |
} |
}; |
@@ -145,33 +131,21 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
++state_->num_iterations; |
base::TimeTicks current_chunk_start_time = clock_->Now(); |
- if (!state_->iterator) { |
+ if (!state_->iterator.get()) { |
// We failed to initialize the break iterator, so stop now. |
UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); |
RunCallback(false); |
return; |
} |
- if (!state_->position_initialized) { |
- state_->position = ubrk_first(state_->iterator); |
- if (state_->position == UBRK_DONE) { |
- // No words present, so we're done. |
- RunCallback(true); |
- return; |
- } |
- state_->position_initialized = true; |
- } |
- |
int num_words = 0; |
- for (int next = ubrk_next(state_->iterator); |
- next != UBRK_DONE; next = ubrk_next(state_->iterator)) { |
- if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { |
- // next is now positioned at the end of a word. |
- HandleWord(base::StringPiece16(page_text_->data() + state_->position, |
- next - state_->position)); |
+ while (state_->iterator->Advance()) { |
+ if (state_->iterator->IsWord()) { |
+ const size_t start = state_->iterator->prev(); |
+ const size_t length = state_->iterator->pos() - start; |
+ HandleWord(base::StringPiece16(page_text_->data() + start, length)); |
++num_words; |
} |
- state_->position = next; |
if (num_words >= kClockCheckGranularity) { |
num_words = 0; |