Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..812fb935047c2e55db81103bf97d6342e6b576e2 |
--- /dev/null |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
@@ -0,0 +1,252 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
+ |
+#include <string> |
+ |
+#include "base/callback.h" |
+#include "base/hash_tables.h" |
+#include "base/message_loop.h" |
+#include "base/scoped_ptr.h" |
+#include "base/sha2.h" |
+#include "base/string16.h" |
+#include "base/stringprintf.h" |
+#include "base/time.h" |
+#include "base/utf_string_conversions.h" |
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
+#include "chrome/renderer/safe_browsing/features.h" |
+#include "testing/gmock/include/gmock/gmock.h" |
+#include "testing/gtest/include/gtest/gtest.h" |
+ |
+using ::testing::ContainerEq; |
+using ::testing::Return; |
+ |
+namespace safe_browsing { |
+ |
+class PhishingTermFeatureExtractorTest : public ::testing::Test { |
+ protected: |
+  // Clock handed to the extractor; each test scripts the values returned by |
+  // Now() through gmock expectations. |
+  class MockClock : public FeatureExtractorClock { |
+   public: |
+    MOCK_METHOD0(Now, base::TimeTicks()); |
+  }; |
+ |
+  // Adds the SHA-256 hash of each of the |count| strings in |items| to |
+  // |hashes|. |
+  static void AddHashes(const char* const* items, |
+                        size_t count, |
+                        base::hash_set<std::string>* hashes) { |
+    for (size_t i = 0; i < count; ++i) |
+      hashes->insert(base::SHA256HashString(std::string(items[i]))); |
+  } |
+ |
+  // Fills the term and word hash sets and builds the extractor under test, |
+  // wired to a fresh MockClock. |
+  virtual void SetUp() { |
+    static const char* const kTerms[] = { |
+      "one", |
+      "one one", |
+      "two", |
+      "multi word test", |
+      "capitalization", |
+      "space", |
+      "separator", |
+      "punctuation", |
+      "\xe4\xbd\xa0\xe5\xa5\xbd",  // Chinese (translation of "hello") |
+      "\xe5\x86\x8d\xe8\xa7\x81",  // Chinese (translation of "goodbye") |
+    }; |
+    AddHashes(kTerms, sizeof(kTerms) / sizeof(kTerms[0]), &term_hashes_); |
+ |
+    // The individual words that make up the terms listed above. |
+    static const char* const kWords[] = { |
+      "one", |
+      "two", |
+      "multi", |
+      "word", |
+      "test", |
+      "capitalization", |
+      "space", |
+      "separator", |
+      "punctuation", |
+      "\xe4\xbd\xa0\xe5\xa5\xbd", |
+      "\xe5\x86\x8d\xe8\xa7\x81", |
+    }; |
+    AddHashes(kWords, sizeof(kWords) / sizeof(kWords[0]), &word_hashes_); |
+ |
+    clock_ = new MockClock(); |
+    extractor_.reset(new PhishingTermFeatureExtractor( |
+        &term_hashes_, &word_hashes_, 3 /* max_words_per_term */, clock_)); |
+  } |
+ |
+  // Starts extraction of |page_text| into |features| and runs the message |
+  // loop until ExtractionDone() quits it.  Returns the success value that |
+  // the completion callback reported. |
+  bool ExtractFeatures(const string16* page_text, FeatureMap* features) { |
+    success_ = false; |
+    extractor_->ExtractFeatures( |
+        page_text, features, |
+        NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone)); |
+    msg_loop_.Run(); |
+    return success_; |
+  } |
+ |
+  // Callback passed to the extractor: records |success| and stops the |
+  // message loop so that ExtractFeatures() can return. |
+  void ExtractionDone(bool success) { |
+    success_ = success; |
+    msg_loop_.Quit(); |
+  } |
+ |
+  MessageLoop msg_loop_; |
+  scoped_ptr<PhishingTermFeatureExtractor> extractor_; |
+  base::hash_set<std::string> term_hashes_; |
+  base::hash_set<std::string> word_hashes_; |
+  MockClock* clock_;  // owned by extractor_ |
+  bool success_;  // holds the success value from ExtractFeatures |
+}; |
+ |
+TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
+  // Extraction timing is not exercised here: the mock clock keeps returning |
+  // the same instant. |
+  const base::TimeTicks frozen_now = base::TimeTicks::Now(); |
+  EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(frozen_now)); |
+ |
+  // Every expected feature name is this prefix followed by the term. |
+  const std::string term_prefix(features::kPageTerm); |
+  FeatureMap expected;  // initially empty |
+  FeatureMap actual; |
+ |
+  // Text that contains none of the known terms: no features expected. |
+  string16 page_text = ASCIIToUTF16("blah"); |
+  ASSERT_TRUE(ExtractFeatures(&page_text, &actual)); |
+  EXPECT_THAT(actual.features(), ContainerEq(expected.features())); |
+ |
+  // Both the one-word term and the two-word term are expected. |
+  page_text = ASCIIToUTF16("one one"); |
+  expected.Clear(); |
+  actual.Clear(); |
+  expected.AddBooleanFeature(term_prefix + "one"); |
+  expected.AddBooleanFeature(term_prefix + "one one"); |
+  ASSERT_TRUE(ExtractFeatures(&page_text, &actual)); |
+  EXPECT_THAT(actual.features(), ContainerEq(expected.features())); |
+ |
+  // A three-word term surrounded by unrelated words. |
+  page_text = ASCIIToUTF16("bla bla multi word test bla"); |
+  expected.Clear(); |
+  actual.Clear(); |
+  expected.AddBooleanFeature(term_prefix + "multi word test"); |
+  ASSERT_TRUE(ExtractFeatures(&page_text, &actual)); |
+  EXPECT_THAT(actual.features(), ContainerEq(expected.features())); |
+ |
+  // All of the words of one term are present, but not in the correct |
+  // order, so no features are expected. |
+  page_text = ASCIIToUTF16("bla bla test word multi bla"); |
+  expected.Clear(); |
+  actual.Clear(); |
+  ASSERT_TRUE(ExtractFeatures(&page_text, &actual)); |
+  EXPECT_THAT(actual.features(), ContainerEq(expected.features())); |
+ |
+  // Mixed case, a hyphen, a newline and punctuation around the words. |
+  page_text = ASCIIToUTF16("Capitalization plus non-space\n" |
+                           "separator... punctuation!"); |
+  expected.Clear(); |
+  actual.Clear(); |
+  expected.AddBooleanFeature(term_prefix + "capitalization"); |
+  expected.AddBooleanFeature(term_prefix + "space"); |
+  expected.AddBooleanFeature(term_prefix + "separator"); |
+  expected.AddBooleanFeature(term_prefix + "punctuation"); |
+  ASSERT_TRUE(ExtractFeatures(&page_text, &actual)); |
+  EXPECT_THAT(actual.features(), ContainerEq(expected.features())); |
+ |
+  // Empty page text: extraction still succeeds, with no features. |
+  page_text = string16(); |
+  expected.Clear(); |
+  actual.Clear(); |
+  ASSERT_TRUE(ExtractFeatures(&page_text, &actual)); |
+  EXPECT_THAT(actual.features(), ContainerEq(expected.features())); |
+ |
+  // Chinese translation of the phrase "hello goodbye".  This checks that |
+  // terms are separated correctly in a language that doesn't use spaces. |
+  page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
+  expected.Clear(); |
+  actual.Clear(); |
+  expected.AddBooleanFeature(term_prefix + "\xe4\xbd\xa0\xe5\xa5\xbd"); |
+  expected.AddBooleanFeature(term_prefix + "\xe5\x86\x8d\xe8\xa7\x81"); |
+  ASSERT_TRUE(ExtractFeatures(&page_text, &actual)); |
+  EXPECT_THAT(actual.features(), ContainerEq(expected.features())); |
+} |
+ |
+TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
+  // The mock clock is advanced between checks so that feature extraction |
+  // has to run over several iterations. |
+  using base::TimeDelta; |
+ |
+  // Build a 30-word page: "one", the numbers 0 through 27, then "two".  The |
+  // expected features can only be produced if the whole text is processed. |
+  std::string ascii_text("one "); |
+  for (int n = 0; n < 28; ++n) |
+    ascii_text += StringPrintf("%d ", n); |
+  ascii_text += "two"; |
+  const string16 page_text = ASCIIToUTF16(ascii_text); |
+ |
+  // First run: the clock moves 30 ms per 10 words processed and 10 ms |
+  // between chunks.  Note that this assumes kClockCheckGranularity = 10 and |
+  // kMaxTimePerChunkMs = 50. |
+  const base::TimeTicks start = base::TimeTicks::Now(); |
+  EXPECT_CALL(*clock_, Now()) |
+      // Start of extraction. |
+      .WillOnce(Return(start)) |
+      // Start of the first chunk of work. |
+      .WillOnce(Return(start)) |
+      // After the first 10 words. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(30))) |
+      // After the next 10 words.  This is over the chunk time limit, so a |
+      // continuation task will be posted. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(60))) |
+      // Start of the second chunk of work. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(70))) |
+      // After the next 10 words. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(100))) |
+      // Final check, used for the histograms. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(101))); |
+ |
+  const std::string term_prefix(features::kPageTerm); |
+  FeatureMap expected; |
+  expected.AddBooleanFeature(term_prefix + "one"); |
+  expected.AddBooleanFeature(term_prefix + "two"); |
+ |
+  FeatureMap actual; |
+  ASSERT_TRUE(ExtractFeatures(&page_text, &actual)); |
+  EXPECT_THAT(actual.features(), ContainerEq(expected.features())); |
+  // Make sure none of the mock expectations carry over to the second run. |
+  ::testing::Mock::VerifyAndClearExpectations(clock_); |
+ |
+  // Second run: same text, but the clock advances faster so that the |
+  // extraction time exceeds the maximum total time for the feature |
+  // extractor.  Extraction should fail.  Note that this assumes |
+  // kMaxTotalTimeMs = 500. |
+  EXPECT_CALL(*clock_, Now()) |
+      // Start of extraction. |
+      .WillOnce(Return(start)) |
+      // Start of the first chunk of work. |
+      .WillOnce(Return(start)) |
+      // After the first 10 words. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(300))) |
+      // Start of the second chunk of work. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(350))) |
+      // After the next 10 words.  This is over the total time limit. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(600))) |
+      // Final check, used for the histograms. |
+      .WillOnce(Return(start + TimeDelta::FromMilliseconds(620))); |
+ |
+  actual.Clear(); |
+  EXPECT_FALSE(ExtractFeatures(&page_text, &actual)); |
+} |
+ |
+} // namespace safe_browsing |