Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(70)

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

Issue 3214002: Add a term feature extractor for client-side phishing detection. (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: Add an extra comment/TODO about performance. Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
new file mode 100644
index 0000000000000000000000000000000000000000..812fb935047c2e55db81103bf97d6342e6b576e2
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
@@ -0,0 +1,252 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
+
+#include <string>
+
+#include "base/callback.h"
+#include "base/hash_tables.h"
+#include "base/message_loop.h"
+#include "base/scoped_ptr.h"
+#include "base/sha2.h"
+#include "base/string16.h"
+#include "base/stringprintf.h"
+#include "base/time.h"
+#include "base/utf_string_conversions.h"
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using ::testing::ContainerEq;
+using ::testing::Return;
+
+namespace safe_browsing {
+
+class PhishingTermFeatureExtractorTest : public ::testing::Test {
+ protected:
+ class MockClock : public FeatureExtractorClock {  // gmock clock; each test scripts Now()
+ public:
+ MOCK_METHOD0(Now, base::TimeTicks());
+ };
+
+ virtual void SetUp() {  // builds the hash sets and a fresh extractor_ for each test
+ base::hash_set<std::string> terms;
+ terms.insert("one");
+ terms.insert("one one");
+ terms.insert("two");
+ terms.insert("multi word test");
+ terms.insert("capitalization");
+ terms.insert("space");
+ terms.insert("separator");
+ terms.insert("punctuation");
+ // Chinese (translation of "hello"), written as escaped UTF-8 bytes
+ terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
+ // Chinese (translation of "goodbye"), written as escaped UTF-8 bytes
+ terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
+
+ for (base::hash_set<std::string>::iterator it = terms.begin();
+ it != terms.end(); ++it) {
+ term_hashes_.insert(base::SHA256HashString(*it));  // keep the hash, not the plaintext
+ }
+
+ base::hash_set<std::string> words;  // each individual word that occurs in |terms|
+ words.insert("one");
+ words.insert("two");
+ words.insert("multi");
+ words.insert("word");
+ words.insert("test");
+ words.insert("capitalization");
+ words.insert("space");
+ words.insert("separator");
+ words.insert("punctuation");
+ words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
+ words.insert("\xe5\x86\x8d\xe8\xa7\x81");
+
+ for (base::hash_set<std::string>::iterator it = words.begin();
+ it != words.end(); ++it) {
+ word_hashes_.insert(base::SHA256HashString(*it));  // hashed the same way as the terms
+ }
+
+ clock_ = new MockClock();  // raw pointer kept so tests can set expectations on it
+ extractor_.reset(new PhishingTermFeatureExtractor(
+ &term_hashes_,
+ &word_hashes_,
+ 3 /* max_words_per_term */,
+ clock_));
+ }
+
+ // Runs extractor_ on |page_text| and blocks in msg_loop_.Run() until
+ // ExtractionDone() quits the loop. Returns the success value it recorded.
+ bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
+ success_ = false;  // reset before every run
+ extractor_->ExtractFeatures(
+ page_text,
+ features,
+ NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone));
+ msg_loop_.Run();
+ return success_;
+ }
+
+ // Completion callback: stores |success| in success_ and quits msg_loop_.
+ void ExtractionDone(bool success) {
+ success_ = success;
+ msg_loop_.Quit();
+ }
+
+ MessageLoop msg_loop_;  // run by ExtractFeatures(), quit by ExtractionDone()
+ scoped_ptr<PhishingTermFeatureExtractor> extractor_;
+ base::hash_set<std::string> term_hashes_;  // SHA256HashString() of each term from SetUp()
+ base::hash_set<std::string> word_hashes_;  // SHA256HashString() of each word from SetUp()
+ MockClock* clock_; // owned by extractor_
+ bool success_; // value passed to ExtractionDone(); reset by ExtractFeatures()
+};
+
+TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
+ // This test doesn't exercise the extraction timing; it only checks term matching.
+ EXPECT_CALL(*clock_, Now())
+ .WillRepeatedly(Return(base::TimeTicks::Now()));  // evaluated once, so every call sees the same time
+
+ string16 page_text = ASCIIToUTF16("blah");  // a word that is not in the term list
+ FeatureMap expected_features; // initially empty
+
+ FeatureMap features;
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ page_text = ASCIIToUTF16("one one");  // should yield both "one" and the two-word term "one one"
+ expected_features.Clear();
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("one"));
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("one one"));
+
+ features.Clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ page_text = ASCIIToUTF16("bla bla multi word test bla");  // three-word term amid unknown words
+ expected_features.Clear();
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("multi word test"));
+
+ features.Clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ // This text has all of the words for one of the terms, but they are
+ // not in the correct order, so no feature is expected.
+ page_text = ASCIIToUTF16("bla bla test word multi bla");
+ expected_features.Clear();
+
+ features.Clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ page_text = ASCIIToUTF16("Capitalization plus non-space\n"
+ "separator... punctuation!");  // upper case, hyphen, newline, dots, '!'
+ expected_features.Clear();
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("capitalization"));
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("space"));
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("separator"));
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("punctuation"));
+
+ features.Clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ // Test with empty page text: extraction should succeed with no features.
+ page_text = string16();
+ expected_features.Clear();
+ features.Clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ // Chinese translation of the phrase "hello goodbye". This tests that
+ // we can correctly separate terms in languages that don't use spaces.
+ page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
+ expected_features.Clear();
+ expected_features.AddBooleanFeature(
+ features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
+ expected_features.AddBooleanFeature(
+ features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
+
+ features.Clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+}
+
+TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
+ // For this test, we'll cause the feature extraction to run multiple
+ // iterations by incrementing the clock.
+
+ // This page has a total of 30 words. For the features to be computed
+ // correctly, the extractor has to process the entire string of text.
+ string16 page_text(ASCIIToUTF16("one "));
+ for (int i = 0; i < 28; ++i) {  // 28 filler words: "0 " through "27 "
+ page_text.append(ASCIIToUTF16(StringPrintf("%d ", i)));
+ }
+ page_text.append(ASCIIToUTF16("two"));
+
+ // Advance the clock 30 ms every 10 words processed, 10 ms between chunks.
+ // Note that this assumes kClockCheckGranularity = 10 and
+ // kMaxTimePerChunkMs = 50.
+ base::TimeTicks now = base::TimeTicks::Now();
+ EXPECT_CALL(*clock_, Now())
+ // Time check at the start of extraction.
+ .WillOnce(Return(now))
+ // Time check at the start of the first chunk of work.
+ .WillOnce(Return(now))
+ // Time check after the first 10 words.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))
+ // Time check after the next 10 words. This is over the chunk
+ // time limit, so a continuation task will be posted.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60)))
+ // Time check at the start of the second chunk of work.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70)))
+ // Time check after the next 10 words.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100)))
+ // A final check for the histograms.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(101)));
+
+ FeatureMap expected_features;
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("one"));
+ expected_features.AddBooleanFeature(features::kPageTerm +
+ std::string("two"));
+
+ FeatureMap features;
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+ // Verify the clock expectations and clear them before the second scenario.
+ ::testing::Mock::VerifyAndClearExpectations(clock_);
+
+ // Now repeat the test with the same text, but advance the clock faster so
+ // that the extraction time exceeds the maximum total time for the feature
+ // extractor. Extraction should fail. Note that this assumes
+ // kMaxTotalTimeMs = 500.
+ EXPECT_CALL(*clock_, Now())
+ // Time check at the start of extraction.
+ .WillOnce(Return(now))
+ // Time check at the start of the first chunk of work.
+ .WillOnce(Return(now))
+ // Time check after the first 10 words. Already over the chunk time limit.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
+ // Time check at the start of the second chunk of work.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
+ // Time check after the next 10 words. This is over the total time limit.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
+ // A final time check for the histograms.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
+
+ features.Clear();
+ EXPECT_FALSE(ExtractFeatures(&page_text, &features));  // the callback reports failure
+}
+
+} // namespace safe_browsing
« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698