chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc - Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

deleted file mode 100644

index 6715a590ec7638ee196830873698ca9fcb5f86c1..0000000000000000000000000000000000000000

--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

+++ /dev/null

@@ -1,298 +0,0 @@

-// Use of this source code is governed by a BSD-style license that can be

-// found in the LICENSE file.

-#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

-#include <list>

-#include <map>

-#include <memory>

-#include <utility>

-#include "base/bind.h"

-#include "base/compiler_specific.h"

-#include "base/i18n/break_iterator.h"

-#include "base/i18n/case_conversion.h"

-#include "base/location.h"

-#include "base/logging.h"

-#include "base/metrics/histogram_macros.h"

-#include "base/single_thread_task_runner.h"

-#include "base/strings/utf_string_conversions.h"

-#include "base/threading/thread_task_runner_handle.h"

-#include "base/time/time.h"

-#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"

-#include "chrome/renderer/safe_browsing/features.h"

-#include "chrome/renderer/safe_browsing/murmurhash3_util.h"

-#include "crypto/sha2.h"

-namespace safe_browsing {

-// Time budget for one chunk of work, in milliseconds. It should be short

-// enough that it doesn't noticeably disrupt the user's interaction with the page.

-const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;  // When exceeded, the remaining work is re-posted as a new task.

-// Number of words handled between two clock checks. Experimenting shows a

-// reasonable performance gain from increasing this up to around 10, with not

-// much benefit past that. NOTE(review): the value set below is 5 -- confirm.

-const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

-// Time budget for a whole extraction, in milliseconds. This should be longer

-// than we expect feature extraction to take on any actual phishing page.

-const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;  // When exceeded, extraction stops and reports failure.

-// All of the state pertaining to the current feature extraction.

-struct PhishingTermFeatureExtractor::ExtractionState {

- // Stores up to max_words_per_term_ previous words separated by spaces.

- std::string previous_words;  // Cleared when a word that is not in page_word_hashes_ is seen.

- // Stores the current shingle after a new word is processed and added in.

- std::string current_shingle;

- // Stores the sizes of the words in current_shingle. Note: the size includes

- // the space after each word. In other words, the sum of all sizes in this

- // list is equal to the length of current_shingle.

- std::list<size_t> shingle_word_sizes;

- // Stores the sizes of the words in previous_words. Note: the size includes

- // the space after each word. In other words, the sum of all sizes in this

- // list is equal to the length of previous_words.

- std::list<size_t> previous_word_sizes;

- // An iterator for word breaking.

- std::unique_ptr<base::i18n::BreakIterator> iterator;  // Left null if Init() fails in the constructor.

- // The time at which we started feature extraction for the current page.

- base::TimeTicks start_time;

- // The number of iterations we've done for the current extraction.

- int num_iterations;  // Incremented once per ExtractFeaturesWithTimeout() call.

- ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)

- : start_time(start_time_ticks),

- num_iterations(0) {  // Also tries to set up |iterator| as a BREAK_WORD iterator over |text|.

- std::unique_ptr<base::i18n::BreakIterator> i(new base::i18n::BreakIterator(

- text, base::i18n::BreakIterator::BREAK_WORD));

- if (i->Init()) {

- iterator = std::move(i);

- } else {

- DLOG(ERROR) << "failed to open iterator";  // |iterator| stays null; the extractor checks for that.

- }

-};

-PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(

- const base::hash_set<std::string>* page_term_hashes,  // Looked up with SHA-256 hashes of candidate terms.

- const base::hash_set<uint32_t>* page_word_hashes,  // Looked up with MurmurHash3 hashes of single words.

- size_t max_words_per_term,

- uint32_t murmurhash3_seed,

- size_t max_shingles_per_page,

- size_t shingle_size,

- FeatureExtractorClock* clock)  // Source of every timestamp taken during extraction.

- : page_term_hashes_(page_term_hashes),

- page_word_hashes_(page_word_hashes),

- max_words_per_term_(max_words_per_term),

- murmurhash3_seed_(murmurhash3_seed),

- max_shingles_per_page_(max_shingles_per_page),

- shingle_size_(shingle_size),

- clock_(clock),

- weak_factory_(this) {  // Weak pointers from this factory are bound into the posted tasks.

- Clear();  // Start with no extraction in progress.

-PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {

- // The RenderView should have called CancelPendingExtraction() before

- // we are destroyed. This only checks and logs; it does not cancel anything.

- CheckNoPendingExtraction();

-// Starts an asynchronous extraction over |page_text|. Matched terms are added

-// to |features| and shingle hashes to |shingle_hashes|; |done_callback| is run

-// with the outcome once extraction finishes. The pointers are stored, not

-// copied, and are dereferenced later by the tasks posted from here.

-void PhishingTermFeatureExtractor::ExtractFeatures(

- const base::string16* page_text,

- FeatureMap* features,

- std::set<uint32_t>* shingle_hashes,

- const DoneCallback& done_callback) {

- // The RenderView should have called CancelPendingExtraction() before

- // starting a new extraction, so DCHECK this.

- CheckNoPendingExtraction();

- // However, in an opt build, we will go ahead and clean up the pending

- // extraction so that we can start in a known state.

- CancelPendingExtraction();

- page_text_ = page_text;

- features_ = features;

- // Each assignment is its own statement; this one used to end in ',' and was

- // chained to the next through the comma operator.

- shingle_hashes_ = shingle_hashes;

- done_callback_ = done_callback;

- // The start time recorded here is what the total-time limit is measured from.

- state_.reset(new ExtractionState(*page_text_, clock_->Now()));

- // Do the actual work in a posted task, bound to a weak pointer so that

- // CancelPendingExtraction() can invalidate it.

- base::ThreadTaskRunnerHandle::Get()->PostTask(

- FROM_HERE,

- base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,

- weak_factory_.GetWeakPtr()));

-void PhishingTermFeatureExtractor::CancelPendingExtraction() {

- // Cancel any pending callbacks (the tasks this class posts are bound to weak pointers from weak_factory_), and clear our state.

- weak_factory_.InvalidateWeakPtrs();

- Clear();

-void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {

- DCHECK(state_.get());

- ++state_->num_iterations;  // Counts how many times this method ran for the current extraction.

- base::TimeTicks current_chunk_start_time = clock_->Now();  // The per-chunk limit is measured from here.

- if (!state_->iterator.get()) {

- // We failed to initialize the break iterator, so stop now.

- UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);

- RunCallback(false);

- return;

- }

- int num_words = 0;  // Words handled since the last clock check.

- while (state_->iterator->Advance()) {

- if (state_->iterator->IsWord()) {

- const size_t start = state_->iterator->prev();

- const size_t length = state_->iterator->pos() - start;

- HandleWord(base::StringPiece16(page_text_->data() + start, length));  // The word is the text between prev() and pos().

- ++num_words;

- }

- if (num_words >= kClockCheckGranularity) {  // Only consult the clock every kClockCheckGranularity words.

- num_words = 0;

- base::TimeTicks now = clock_->Now();

- if (now - state_->start_time >=

- base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {

- DLOG(ERROR) << "Feature extraction took too long, giving up";

- // We expect this to happen infrequently, so record when it does.

- UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);

- RunCallback(false);

- return;

- }

- base::TimeDelta chunk_elapsed = now - current_chunk_start_time;

- if (chunk_elapsed >=

- base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {

- // The time limit for the current chunk is up, so post a task to

- // continue extraction.

- //

- // Record how much time we actually spent on the chunk. If this is

- // much higher than kMaxTimePerChunkMs, we may need to adjust the

- // clock granularity.

- UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",

- chunk_elapsed);

- base::ThreadTaskRunnerHandle::Get()->PostTask(

- FROM_HERE,

- base::Bind(

- &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,

- weak_factory_.GetWeakPtr()));

- return;  // The iterator position lives in state_, so the posted task picks up from here.

- }

- // Otherwise, continue.

- }

- RunCallback(true);  // Report success to the caller.

-void PhishingTermFeatureExtractor::HandleWord(

- const base::StringPiece16& word) {

- // First, extract shingle hashes: the lowercased UTF-8 word plus a space is appended to the running shingle.

- const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));

- state_->current_shingle.append(word_lower + " ");

- state_->shingle_word_sizes.push_back(word_lower.size() + 1);

- if (state_->shingle_word_sizes.size() == shingle_size_) {  // A full shingle is ready: hash it, then drop its oldest word.

- shingle_hashes_->insert(

- MurmurHash3String(state_->current_shingle, murmurhash3_seed_));

- state_->current_shingle.erase(0, state_->shingle_word_sizes.front());

- state_->shingle_word_sizes.pop_front();

- }

- // Check if the size of shingle hashes is over the limit.

- if (shingle_hashes_->size() > max_shingles_per_page_) {

- // Pop the largest one, i.e. the last element of the ordered std::set.

- std::set<uint32_t>::iterator it = shingle_hashes_->end();

- shingle_hashes_->erase(--it);

- }

- // Next, extract page terms.

- uint32_t word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

- // Quick out if the word is not part of any term, which is the common case.

- if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {

- // Word doesn't exist in our terms so we can clear the n-gram state.

- state_->previous_words.clear();

- state_->previous_word_sizes.clear();

- return;

- }

- // Find all of the n-grams that we need to check and compute their SHA-256

- // hashes.

- std::map<std::string /* hash */, std::string /* plaintext */>

- hashes_to_check;

- hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

- // Combine the new word with the previous words to find additional n-grams.

- // Note that we don't yet add the new word length to previous_word_sizes,

- // since we don't want to compute the hash for the word by itself again.

- //

- state_->previous_words.append(word_lower);

- std::string current_term = state_->previous_words;

- for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();

- it != state_->previous_word_sizes.end(); ++it) {

- hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;

- current_term.erase(0, *it);  // Drop the oldest remaining word, giving the next shorter n-gram.

- }

- // Add features for any hashes that match page_term_hashes_.

- for (std::map<std::string, std::string>::iterator it =

- hashes_to_check.begin();

- it != hashes_to_check.end(); ++it) {

- if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {

- features_->AddBooleanFeature(features::kPageTerm + it->second);  // The feature name is the kPageTerm prefix plus the plaintext term.

- }

- // Now that we have handled the current word, we have to add a space at the

- // end of it, and add the new word's size (including the space) to

- // previous_word_sizes. Note: it's possible that the document language

- // doesn't use ASCII spaces to separate words. That's fine though, we just

- // need to be consistent with how the model is generated.

- state_->previous_words.append(" ");

- state_->previous_word_sizes.push_back(word_lower.size() + 1);

- // Cap the number of previous words: once the count reaches max_words_per_term_, drop the oldest one.

- if (state_->previous_word_sizes.size() >= max_words_per_term_) {

- state_->previous_words.erase(0, state_->previous_word_sizes.front());

- state_->previous_word_sizes.pop_front();

- }

-void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {

- DCHECK(done_callback_.is_null());  // A pending extraction would have left a callback set...

- DCHECK(!state_.get());  // ...and an ExtractionState allocated.

- if (!done_callback_.is_null() || state_.get()) {  // Same condition as the DCHECKs above, reported through LOG(ERROR).

- LOG(ERROR) << "Extraction in progress, missing call to "

- << "CancelPendingExtraction";

- }

-void PhishingTermFeatureExtractor::RunCallback(bool success) {

- // Record some timing stats that we can use to evaluate feature extraction

- // performance. These include both successful and failed extractions.

- DCHECK(state_.get());

- UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",

- state_->num_iterations);

- UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",

- clock_->Now() - state_->start_time);

- DCHECK(!done_callback_.is_null());

- done_callback_.Run(success);  // The callback runs before any state is cleared.

- Clear();  // Resets done_callback_ and state_, so no extraction is pending afterwards.

-// Returns the extractor to its idle state: forgets the caller-provided

-// pointers (they are only nulled, never deleted here), drops the callback and

-// releases the per-extraction state.

-void PhishingTermFeatureExtractor::Clear() {

- page_text_ = nullptr;

- features_ = nullptr;

- shingle_hashes_ = nullptr;

- done_callback_.Reset();

- state_.reset();

-} // namespace safe_browsing