chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc - Issue 3214002: Add a term feature extractor for client-side phishing detection.

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

Issue 3214002: Add a term feature extractor for client-side phishing detection. (Closed) Base URL: http://src.chromium.org/git/chromium.git

Patch Set: Add an extra comment/TODO about performance. Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

new file mode 100644

index 0000000000000000000000000000000000000000..812fb935047c2e55db81103bf97d6342e6b576e2

--- /dev/null

+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

@@ -0,0 +1,252 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

+#include <string>

+#include "base/callback.h"

+#include "base/hash_tables.h"

+#include "base/message_loop.h"

+#include "base/scoped_ptr.h"

+#include "base/sha2.h"

+#include "base/string16.h"

+#include "base/stringprintf.h"

+#include "base/time.h"

+#include "base/utf_string_conversions.h"

+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"

+#include "chrome/renderer/safe_browsing/features.h"

+#include "testing/gmock/include/gmock/gmock.h"

+#include "testing/gtest/include/gtest/gtest.h"

+using ::testing::ContainerEq;

+using ::testing::Return;

+namespace safe_browsing {

+class PhishingTermFeatureExtractorTest : public ::testing::Test {  // Fixture: builds a PhishingTermFeatureExtractor over a small hashed vocabulary, driven by a mock clock.

+ protected:

+ class MockClock : public FeatureExtractorClock {  // gmock clock, so each test can script the values returned by Now().

+ public:

+ MOCK_METHOD0(Now, base::TimeTicks());

+ };

+ virtual void SetUp() {  // Fills term_hashes_ and word_hashes_, then creates clock_ and extractor_.

+ base::hash_set<std::string> terms;  // Plaintext terms; only their SHA-256 hashes are stored in term_hashes_.

+ terms.insert("one");

+ terms.insert("one one");

+ terms.insert("two");

+ terms.insert("multi word test");

+ terms.insert("capitalization");

+ terms.insert("space");

+ terms.insert("separator");

+ terms.insert("punctuation");

+ // Chinese (translation of "hello"), written as UTF-8 escape sequences.

+ terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");

+ // Chinese (translation of "goodbye"), written as UTF-8 escape sequences.

+ terms.insert("\xe5\x86\x8d\xe8\xa7\x81");

+ for (base::hash_set<std::string>::iterator it = terms.begin();

+ it != terms.end(); ++it) {

+ term_hashes_.insert(base::SHA256HashString(*it));

+ }

+ base::hash_set<std::string> words;  // Every individual word that occurs in the terms above; hashed into word_hashes_.

+ words.insert("one");

+ words.insert("two");

+ words.insert("multi");

+ words.insert("word");

+ words.insert("test");

+ words.insert("capitalization");

+ words.insert("space");

+ words.insert("separator");

+ words.insert("punctuation");

+ words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");

+ words.insert("\xe5\x86\x8d\xe8\xa7\x81");

+ for (base::hash_set<std::string>::iterator it = words.begin();

+ it != words.end(); ++it) {

+ word_hashes_.insert(base::SHA256HashString(*it));

+ }

+ clock_ = new MockClock();  // Raw pointer is kept so tests can set expectations on it; see the clock_ member for ownership.

+ extractor_.reset(new PhishingTermFeatureExtractor(

+ &term_hashes_,

+ &word_hashes_,

+ 3 /* max_words_per_term */,

+ clock_));

+ }

+ // Runs the extractor on |page_text|, writing results into |features|, and

+ // runs msg_loop_ until the completion callback fires. Returns the success boolean passed to that callback.

+ bool ExtractFeatures(const string16* page_text, FeatureMap* features) {

+ success_ = false;  // Reset before each run; overwritten by ExtractionDone().

+ extractor_->ExtractFeatures(

+ page_text,

+ features,

+ NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone));

+ msg_loop_.Run();  // ExtractionDone() calls Quit() on this same loop.

+ return success_;

+ }

+ // Completion callback for feature extraction: records |success| and quits msg_loop_.

+ void ExtractionDone(bool success) {

+ success_ = success;

+ msg_loop_.Quit();

+ }

+ MessageLoop msg_loop_;

+ scoped_ptr<PhishingTermFeatureExtractor> extractor_;

+ base::hash_set<std::string> term_hashes_;

+ base::hash_set<std::string> word_hashes_;

+ MockClock* clock_; // Owned by extractor_; kept here so tests can set expectations on Now().

+ bool success_; // Value most recently passed to ExtractionDone(); returned by ExtractFeatures().

+};

+TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {  // Checks the extracted term features for several page texts.

+ // This test doesn't exercise the extraction timing: the mock clock returns the same value on every call.

+ EXPECT_CALL(*clock_, Now())

+ .WillRepeatedly(Return(base::TimeTicks::Now()));

+ string16 page_text = ASCIIToUTF16("blah");  // Case 1: text with no known term, so no features are expected.

+ FeatureMap expected_features; // initially empty

+ FeatureMap features;

+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));

+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));

+ page_text = ASCIIToUTF16("one one");  // Case 2: expects both the term "one" and the two-word term "one one".

+ expected_features.Clear();

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("one"));

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("one one"));

+ features.Clear();

+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));

+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));

+ page_text = ASCIIToUTF16("bla bla multi word test bla");  // Case 3: a three-word term surrounded by unknown words.

+ expected_features.Clear();

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("multi word test"));

+ features.Clear();

+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));

+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));

+ // Case 4: this text has all of the words for one of the terms, but they are

+ // not in the correct order, so no features are expected.

+ page_text = ASCIIToUTF16("bla bla test word multi bla");

+ expected_features.Clear();

+ features.Clear();

+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));

+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));

+ page_text = ASCIIToUTF16("Capitalization plus non-space\n"  // Case 5: mixed case, a hyphen, a newline and punctuation around known terms.

+ "separator... punctuation!");

+ expected_features.Clear();

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("capitalization"));

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("space"));

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("separator"));

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("punctuation"));

+ features.Clear();

+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));

+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));

+ // Case 6: empty page text. Extraction must still succeed, with no features.

+ page_text = string16();

+ expected_features.Clear();

+ features.Clear();

+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));

+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));

+ // Case 7: Chinese translation of the phrase "hello goodbye". This tests that

+ // we can correctly separate terms in languages that don't use spaces.

+ page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");

+ expected_features.Clear();

+ expected_features.AddBooleanFeature(

+ features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));

+ expected_features.AddBooleanFeature(

+ features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));

+ features.Clear();

+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));

+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));  // NOTE(review): this test's closing brace does not appear in this listing; confirm against the original patch.

+TEST_F(PhishingTermFeatureExtractorTest, Continuation) {  // Scripts the mock clock: first a run that completes, then a run expected to fail.

+ // For this test, we'll cause the feature extraction to run multiple

+ // iterations by incrementing the clock.

+ // This page has a total of 30 words ("one", the numbers 0 through 27, and "two"). For the features to be computed

+ // correctly, the extractor has to process the entire string of text.

+ string16 page_text(ASCIIToUTF16("one "));

+ for (int i = 0; i < 28; ++i) {

+ page_text.append(ASCIIToUTF16(StringPrintf("%d ", i)));

+ }

+ page_text.append(ASCIIToUTF16("two"));

+ // Advance the clock 30 ms every 10 words processed, 10 ms between chunks.

+ // Note that this assumes kClockCheckGranularity = 10 and

+ // kMaxTimePerChunkMs = 50.

+ base::TimeTicks now = base::TimeTicks::Now();

+ EXPECT_CALL(*clock_, Now())

+ // Time check at the start of extraction.

+ .WillOnce(Return(now))

+ // Time check at the start of the first chunk of work.

+ .WillOnce(Return(now))

+ // Time check after the first 10 words.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))

+ // Time check after the next 10 words. This is over the chunk

+ // time limit, so a continuation task will be posted.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60)))

+ // Time check at the start of the second chunk of work.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70)))

+ // Time check after the next 10 words.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100)))

+ // A final check for the histograms.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(101)));

+ FeatureMap expected_features;  // "one" is the first word of the page and "two" is the last.

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("one"));

+ expected_features.AddBooleanFeature(features::kPageTerm +

+ std::string("two"));

+ FeatureMap features;

+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));

+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));

+ // Make sure none of the mock expectations carry over to the second run below.

+ ::testing::Mock::VerifyAndClearExpectations(clock_);

+ // Now repeat the test with the same text, but advance the clock faster so

+ // that the extraction time exceeds the maximum total time for the feature

+ // extractor. Extraction should fail. Note that this assumes

+ // kMaxTotalTimeMs = 500.

+ EXPECT_CALL(*clock_, Now())

+ // Time check at the start of extraction.

+ .WillOnce(Return(now))

+ // Time check at the start of the first chunk of work.

+ .WillOnce(Return(now))

+ // Time check after the first 10 words. With the assumed 50 ms chunk limit, this already ends the first chunk.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))

+ // Time check at the start of the second chunk of work.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))

+ // Time check after the next 10 words. This is over the assumed 500 ms total limit.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))

+ // A final time check for the histograms.

+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));

+ features.Clear();

+ EXPECT_FALSE(ExtractFeatures(&page_text, &features));  // NOTE(review): this test's closing brace does not appear in this listing; confirm against the original patch.

+} // namespace safe_browsing

« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc ('k') | no next file » | no next file with comments »