| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
|
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..812fb935047c2e55db81103bf97d6342e6b576e2
|
| --- /dev/null
|
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
|
| @@ -0,0 +1,252 @@
|
| +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
|
| +
|
| +#include <string>
|
| +
|
| +#include "base/callback.h"
|
| +#include "base/hash_tables.h"
|
| +#include "base/message_loop.h"
|
| +#include "base/scoped_ptr.h"
|
| +#include "base/sha2.h"
|
| +#include "base/string16.h"
|
| +#include "base/stringprintf.h"
|
| +#include "base/time.h"
|
| +#include "base/utf_string_conversions.h"
|
| +#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
|
| +#include "chrome/renderer/safe_browsing/features.h"
|
| +#include "testing/gmock/include/gmock/gmock.h"
|
| +#include "testing/gtest/include/gtest/gtest.h"
|
| +
|
| +using ::testing::ContainerEq;
|
| +using ::testing::Return;
|
| +
|
| +namespace safe_browsing {
|
| +
|
| +class PhishingTermFeatureExtractorTest : public ::testing::Test { // Fixture: hashed term/word sets plus an extractor driven by a mock clock.
|
| + protected:
|
| + class MockClock : public FeatureExtractorClock { // lets each test script the values Now() returns
|
| + public:
|
| + MOCK_METHOD0(Now, base::TimeTicks());
|
| + };
|
| +
|
| + virtual void SetUp() {
|
| + base::hash_set<std::string> terms; // plaintext terms; only their SHA-256 hashes are stored
|
| + terms.insert("one");
|
| + terms.insert("one one");
|
| + terms.insert("two");
|
| + terms.insert("multi word test");
|
| + terms.insert("capitalization");
|
| + terms.insert("space");
|
| + terms.insert("separator");
|
| + terms.insert("punctuation");
|
| + // Chinese (translation of "hello"), written as UTF-8 bytes
|
| + terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
|
| + // Chinese (translation of "goodbye"), written as UTF-8 bytes
|
| + terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
|
| +
|
| + for (base::hash_set<std::string>::iterator it = terms.begin();
|
| + it != terms.end(); ++it) {
|
| + term_hashes_.insert(base::SHA256HashString(*it));
|
| + }
|
| +
|
| + base::hash_set<std::string> words; // each individual word used by the terms above
|
| + words.insert("one");
|
| + words.insert("two");
|
| + words.insert("multi");
|
| + words.insert("word");
|
| + words.insert("test");
|
| + words.insert("capitalization");
|
| + words.insert("space");
|
| + words.insert("separator");
|
| + words.insert("punctuation");
|
| + words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
|
| + words.insert("\xe5\x86\x8d\xe8\xa7\x81");
|
| +
|
| + for (base::hash_set<std::string>::iterator it = words.begin();
|
| + it != words.end(); ++it) {
|
| + word_hashes_.insert(base::SHA256HashString(*it));
|
| + }
|
| +
|
| + clock_ = new MockClock(); // raw pointer kept so tests can set expectations on the clock
|
| + extractor_.reset(new PhishingTermFeatureExtractor(
|
| + &term_hashes_,
|
| + &word_hashes_,
|
| + 3 /* max_words_per_term */,
|
| + clock_));
|
| + }
|
| +
|
| + // Starts extraction of |page_text| into |features| and runs msg_loop_ until the
|
| + // completion callback quits it. Returns the success value the callback received.
|
| + bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
|
| + success_ = false;
|
| + extractor_->ExtractFeatures(
|
| + page_text,
|
| + features,
|
| + NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone));
|
| + msg_loop_.Run();
|
| + return success_;
|
| + }
|
| +
|
| + // Completion callback for feature extraction: records |success| and quits msg_loop_.
|
| + void ExtractionDone(bool success) {
|
| + success_ = success;
|
| + msg_loop_.Quit();
|
| + }
|
| +
|
| + MessageLoop msg_loop_; // run by ExtractFeatures(), quit by ExtractionDone()
|
| + scoped_ptr<PhishingTermFeatureExtractor> extractor_;
|
| + base::hash_set<std::string> term_hashes_; // SHA-256 hashes of the terms built in SetUp()
|
| + base::hash_set<std::string> word_hashes_; // SHA-256 hashes of the words built in SetUp()
|
| + MockClock* clock_; // owned by extractor_
|
| + bool success_; // value passed to ExtractionDone(); reset to false by each ExtractFeatures() call
|
| +};
|
| +
|
| +TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
|
| + // Checks term matching on several page texts; extraction timing is not exercised.
|
| + EXPECT_CALL(*clock_, Now())
|
| + .WillRepeatedly(Return(base::TimeTicks::Now())); // one fixed instant, captured here
|
| +
|
| + string16 page_text = ASCIIToUTF16("blah"); // not a known word, so no features are expected
|
| + FeatureMap expected_features; // initially empty
|
| +
|
| + FeatureMap features;
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
|
| +
|
| + page_text = ASCIIToUTF16("one one"); // expects both "one" and the two-word term "one one"
|
| + expected_features.Clear();
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("one"));
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("one one"));
|
| +
|
| + features.Clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
|
| +
|
| + page_text = ASCIIToUTF16("bla bla multi word test bla"); // three-word term amid unknown words
|
| + expected_features.Clear();
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("multi word test"));
|
| +
|
| + features.Clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
|
| +
|
| + // This text has all of the words for one of the terms, but they are
|
| + // not in the correct order, so no feature is expected.
|
| + page_text = ASCIIToUTF16("bla bla test word multi bla");
|
| + expected_features.Clear();
|
| +
|
| + features.Clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
|
| +
|
| + page_text = ASCIIToUTF16("Capitalization plus non-space\n"
|
| + "separator... punctuation!"); // upper case, '-', newline and punctuation around known words
|
| + expected_features.Clear();
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("capitalization"));
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("space"));
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("separator"));
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("punctuation"));
|
| +
|
| + features.Clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
|
| +
|
| + // Test with empty page text: extraction must succeed and yield no features.
|
| + page_text = string16();
|
| + expected_features.Clear();
|
| + features.Clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
|
| +
|
| + // Chinese translation of the phrase "hello goodbye". This tests that
|
| + // we can correctly separate terms in languages that don't use spaces.
|
| + page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
|
| + expected_features.Clear();
|
| + expected_features.AddBooleanFeature(
|
| + features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
|
| + expected_features.AddBooleanFeature(
|
| + features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
|
| +
|
| + features.Clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
|
| +}
|
| +
|
| +TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
|
| + // For this test, we'll cause the feature extraction to run multiple
|
| + // iterations by incrementing the clock; a second run then exceeds the total time limit.
|
| +
|
| + // This page has a total of 30 words. For the features to be computed
|
| + // correctly, the extractor has to process the entire string of text.
|
| + string16 page_text(ASCIIToUTF16("one "));
|
| + for (int i = 0; i < 28; ++i) { // 28 filler words, "0" through "27"
|
| + page_text.append(ASCIIToUTF16(StringPrintf("%d ", i)));
|
| + }
|
| + page_text.append(ASCIIToUTF16("two"));
|
| +
|
| + // Advance the clock 30 ms every 10 words processed, 10 ms between chunks.
|
| + // Note that this assumes kClockCheckGranularity = 10 and
|
| + // kMaxTimePerChunkMs = 50.
|
| + base::TimeTicks now = base::TimeTicks::Now();
|
| + EXPECT_CALL(*clock_, Now())
|
| + // Time check at the start of extraction.
|
| + .WillOnce(Return(now))
|
| + // Time check at the start of the first chunk of work.
|
| + .WillOnce(Return(now))
|
| + // Time check after the first 10 words.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))
|
| + // Time check after the next 10 words. This is over the chunk
|
| + // time limit, so a continuation task will be posted.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60)))
|
| + // Time check at the start of the second chunk of work.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70)))
|
| + // Time check after the next 10 words.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100)))
|
| + // A final time check for the histograms.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(101)));
|
| +
|
| + FeatureMap expected_features;
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("one"));
|
| + expected_features.AddBooleanFeature(features::kPageTerm +
|
| + std::string("two"));
|
| +
|
| + FeatureMap features;
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
|
| + // Make sure none of the mock expectations carry over to the second half of this test.
|
| + ::testing::Mock::VerifyAndClearExpectations(clock_);
|
| +
|
| + // Now repeat the test with the same text, but advance the clock faster so
|
| + // that the extraction time exceeds the maximum total time for the feature
|
| + // extractor. Extraction should fail. Note that this assumes
|
| + // kMaxTotalTimeMs = 500.
|
| + EXPECT_CALL(*clock_, Now())
|
| + // Time check at the start of extraction.
|
| + .WillOnce(Return(now))
|
| + // Time check at the start of the first chunk of work.
|
| + .WillOnce(Return(now))
|
| + // Time check after the first 10 words. This is over the chunk time limit.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
|
| + // Time check at the start of the second chunk of work.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
|
| + // Time check after the next 10 words. This is over the total time limit.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
|
| + // A final time check for the histograms.
|
| + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
|
| +
|
| + features.Clear();
|
| + EXPECT_FALSE(ExtractFeatures(&page_text, &features));
|
| +}
|
| +
|
| +} // namespace safe_browsing
|
|
|