| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
|
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
|
| index b8627de34d3a756c2a9ac2707a3e7bb340d9aea6..774881a43b4ed07006b89bcfc64996c1384746a8 100644
|
| --- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
|
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
|
| @@ -26,6 +26,9 @@
|
| using base::ASCIIToUTF16;
|
| using ::testing::Return;
|
|
|
| +
|
| +static const uint32 kMurmurHash3Seed = 2777808611U;
|
| +
|
| namespace safe_browsing {
|
|
|
| class PhishingTermFeatureExtractorTest : public ::testing::Test {
|
| @@ -63,27 +66,35 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
|
| words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
|
| words.insert("\xe5\x86\x8d\xe8\xa7\x81");
|
|
|
| - static const uint32 kMurmurHash3Seed = 2777808611U;
|
| for (base::hash_set<std::string>::iterator it = words.begin();
|
| it != words.end(); ++it) {
|
| word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
|
| }
|
|
|
| + ResetExtractor(3 /* max shingles per page */);
|
| + }
|
| +
|
| + void ResetExtractor(size_t max_shingles_per_page) {
|
| extractor_.reset(new PhishingTermFeatureExtractor(
|
| &term_hashes_,
|
| &word_hashes_,
|
| 3 /* max_words_per_term */,
|
| kMurmurHash3Seed,
|
| + max_shingles_per_page,
|
| + 4 /* shingle_size */,
|
| &clock_));
|
| }
|
|
|
| // Runs the TermFeatureExtractor on |page_text|, waiting for the
|
| // completion callback. Returns the success boolean from the callback.
|
| - bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) {
|
| + bool ExtractFeatures(const base::string16* page_text,
|
| + FeatureMap* features,
|
| + std::set<uint32>* shingle_hashes) {
|
| success_ = false;
|
| extractor_->ExtractFeatures(
|
| page_text,
|
| features,
|
| + shingle_hashes,
|
| base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
|
| base::Unretained(this)));
|
| msg_loop_.Run();
|
| @@ -91,10 +102,12 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
|
| }
|
|
|
| void PartialExtractFeatures(const base::string16* page_text,
|
| - FeatureMap* features) {
|
| + FeatureMap* features,
|
| + std::set<uint32>* shingle_hashes) {
|
| extractor_->ExtractFeatures(
|
| page_text,
|
| features,
|
| + shingle_hashes,
|
| base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
|
| base::Unretained(this)));
|
| msg_loop_.PostTask(
|
| @@ -129,10 +142,13 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
|
|
|
| base::string16 page_text = ASCIIToUTF16("blah");
|
| FeatureMap expected_features; // initially empty
|
| + std::set<uint32> expected_shingle_hashes;
|
|
|
| FeatureMap features;
|
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + std::set<uint32> shingle_hashes;
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
|
|
| page_text = ASCIIToUTF16("one one");
|
| expected_features.Clear();
|
| @@ -140,29 +156,51 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
|
| std::string("one"));
|
| expected_features.AddBooleanFeature(features::kPageTerm +
|
| std::string("one one"));
|
| + expected_shingle_hashes.clear();
|
|
|
| features.Clear();
|
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + shingle_hashes.clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
|
|
| page_text = ASCIIToUTF16("bla bla multi word test bla");
|
| expected_features.Clear();
|
| expected_features.AddBooleanFeature(features::kPageTerm +
|
| std::string("multi word test"));
|
| + expected_shingle_hashes.clear();
|
| + expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
|
| + kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
|
| + kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
|
| + kMurmurHash3Seed));
|
|
|
| features.Clear();
|
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + shingle_hashes.clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
|
|
| // This text has all of the words for one of the terms, but they are
|
| // not in the correct order.
|
| page_text = ASCIIToUTF16("bla bla test word multi bla");
|
| expected_features.Clear();
|
| + expected_shingle_hashes.clear();
|
| + expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
|
| + kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
|
| + kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
|
| + kMurmurHash3Seed));
|
|
|
| features.Clear();
|
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + shingle_hashes.clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
|
|
| + // Test various separators.
|
| page_text = ASCIIToUTF16("Capitalization plus non-space\n"
|
| "separator... punctuation!");
|
| expected_features.Clear();
|
| @@ -174,36 +212,73 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
|
| std::string("separator"));
|
| expected_features.AddBooleanFeature(features::kPageTerm +
|
| std::string("punctuation"));
|
| + expected_shingle_hashes.clear();
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
|
| + kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
|
| +
|
| + features.Clear();
|
| + shingle_hashes.clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| + ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
| +
|
| +  // Test a page with too many words; we should only get 3 shingle hashes.
|
| + page_text = ASCIIToUTF16("This page has way too many words.");
|
| + expected_features.Clear();
|
| + expected_shingle_hashes.clear();
|
| + expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
|
| + kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
|
| + kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
|
| + kMurmurHash3Seed));
|
|
|
| features.Clear();
|
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + shingle_hashes.clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
|
|
| // Test with empty page text.
|
| page_text = base::string16();
|
| expected_features.Clear();
|
| + expected_shingle_hashes.clear();
|
| features.Clear();
|
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + shingle_hashes.clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
|
|
| - // Chinese translation of the phrase "hello goodbye". This tests that
|
| - // we can correctly separate terms in languages that don't use spaces.
|
| + // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
|
| + // that we can correctly separate terms in languages that don't use spaces.
|
| page_text =
|
| - base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
|
| + base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
|
| + "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
|
| expected_features.Clear();
|
| expected_features.AddBooleanFeature(
|
| features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
|
| expected_features.AddBooleanFeature(
|
| features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
|
| + expected_shingle_hashes.clear();
|
| + expected_shingle_hashes.insert(MurmurHash3String(
|
| + "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
|
| + "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
|
|
|
| features.Clear();
|
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + shingle_hashes.clear();
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
| }
|
|
|
| TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
|
| // For this test, we'll cause the feature extraction to run multiple
|
| // iterations by incrementing the clock.
|
| + ResetExtractor(200 /* max shingles per page */);
|
|
|
| // This page has a total of 30 words. For the features to be computed
|
| // correctly, the extractor has to process the entire string of text.
|
| @@ -245,10 +320,67 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
|
| std::string("one"));
|
| expected_features.AddBooleanFeature(features::kPageTerm +
|
| std::string("two"));
|
| + std::set<uint32> expected_shingle_hashes;
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
|
| + expected_shingle_hashes.insert(
|
| + MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
|
|
|
| FeatureMap features;
|
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features));
|
| + std::set<uint32> shingle_hashes;
|
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| ExpectFeatureMapsAreEqual(features, expected_features);
|
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
|
| // Make sure none of the mock expectations carry over to the next test.
|
| ::testing::Mock::VerifyAndClearExpectations(&clock_);
|
|
|
| @@ -271,7 +403,8 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
|
| .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
|
|
|
| features.Clear();
|
| - EXPECT_FALSE(ExtractFeatures(&page_text, &features));
|
| + shingle_hashes.clear();
|
| + EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
|
| }
|
|
|
| TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
|
| @@ -294,8 +427,9 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
|
| .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
|
|
|
| FeatureMap features;
|
| + std::set<uint32> shingle_hashes;
|
| // Extract first 10 words then stop.
|
| - PartialExtractFeatures(page_text.get(), &features);
|
| + PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
|
|
|
| page_text.reset(new base::string16());
|
| for (int i = 30; i < 58; ++i) {
|
| @@ -303,12 +437,13 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
|
| }
|
| page_text->append(ASCIIToUTF16("multi word test "));
|
| features.Clear();
|
| + shingle_hashes.clear();
|
|
|
| // This part doesn't exercise the extraction timing.
|
| EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
|
|
|
| // Now extract normally and make sure nothing breaks.
|
| - EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
|
| + EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
|
|
|
| FeatureMap expected_features;
|
| expected_features.AddBooleanFeature(features::kPageTerm +
|
|
|