Chromium Code Reviews| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
| index b8627de34d3a756c2a9ac2707a3e7bb340d9aea6..98316a52af4d33f77eb55c80a3d1a4871c22c25f 100644 |
| --- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
| @@ -26,6 +26,9 @@ |
| using base::ASCIIToUTF16; |
| using ::testing::Return; |
| + |
| +static const uint32 kMurmurHash3Seed = 2777808611U; |
| + |
| namespace safe_browsing { |
| class PhishingTermFeatureExtractorTest : public ::testing::Test { |
| @@ -63,7 +66,6 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test { |
| words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); |
| words.insert("\xe5\x86\x8d\xe8\xa7\x81"); |
| - static const uint32 kMurmurHash3Seed = 2777808611U; |
| for (base::hash_set<std::string>::iterator it = words.begin(); |
| it != words.end(); ++it) { |
| word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); |
| @@ -72,18 +74,23 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test { |
| extractor_.reset(new PhishingTermFeatureExtractor( |
| &term_hashes_, |
| &word_hashes_, |
| + 3 /* max_hashes_per_page */, |
| 3 /* max_words_per_term */, |
| kMurmurHash3Seed, |
| + 4 /* shingle_size */, |
| &clock_)); |
| } |
| // Runs the TermFeatureExtractor on |page_text|, waiting for the |
| // completion callback. Returns the success boolean from the callback. |
| - bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { |
| + bool ExtractFeatures(const base::string16* page_text, |
| + FeatureMap* features, |
| + std::set<uint32>* shingle_hashes) { |
| success_ = false; |
| extractor_->ExtractFeatures( |
| page_text, |
| features, |
| + shingle_hashes, |
| base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| base::Unretained(this))); |
| msg_loop_.Run(); |
| @@ -91,10 +98,12 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test { |
| } |
| void PartialExtractFeatures(const base::string16* page_text, |
| - FeatureMap* features) { |
| + FeatureMap* features, |
| + std::set<uint32>* shingle_hashes) { |
| extractor_->ExtractFeatures( |
| page_text, |
| features, |
| + shingle_hashes, |
| base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| base::Unretained(this))); |
| msg_loop_.PostTask( |
| @@ -129,10 +138,13 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| base::string16 page_text = ASCIIToUTF16("blah"); |
| FeatureMap expected_features; // initially empty |
| + std::set<uint32> expected_shingle_hashes; |
| FeatureMap features; |
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| + std::set<uint32> shingle_hashes; |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| ExpectFeatureMapsAreEqual(features, expected_features); |
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| page_text = ASCIIToUTF16("one one"); |
| expected_features.Clear(); |
| @@ -140,29 +152,51 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| std::string("one")); |
| expected_features.AddBooleanFeature(features::kPageTerm + |
| std::string("one one")); |
| + expected_shingle_hashes.clear(); |
| features.Clear(); |
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| + shingle_hashes.clear(); |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| ExpectFeatureMapsAreEqual(features, expected_features); |
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| page_text = ASCIIToUTF16("bla bla multi word test bla"); |
| expected_features.Clear(); |
| expected_features.AddBooleanFeature(features::kPageTerm + |
| std::string("multi word test")); |
| + expected_shingle_hashes.clear(); |
| + expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ", |
| + kMurmurHash3Seed)); |
| + expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ", |
| + kMurmurHash3Seed)); |
| + expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ", |
| + kMurmurHash3Seed)); |
| features.Clear(); |
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| + shingle_hashes.clear(); |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| ExpectFeatureMapsAreEqual(features, expected_features); |
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| // This text has all of the words for one of the terms, but they are |
| // not in the correct order. |
| page_text = ASCIIToUTF16("bla bla test word multi bla"); |
| expected_features.Clear(); |
| + expected_shingle_hashes.clear(); |
| + expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ", |
| + kMurmurHash3Seed)); |
| + expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ", |
| + kMurmurHash3Seed)); |
| + expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ", |
| + kMurmurHash3Seed)); |
| features.Clear(); |
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| + shingle_hashes.clear(); |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| ExpectFeatureMapsAreEqual(features, expected_features); |
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| + // Test various separators. |
| page_text = ASCIIToUTF16("Capitalization plus non-space\n" |
| "separator... punctuation!"); |
| expected_features.Clear(); |
| @@ -174,17 +208,46 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| std::string("separator")); |
| expected_features.AddBooleanFeature(features::kPageTerm + |
| std::string("punctuation")); |
| + expected_shingle_hashes.clear(); |
| + expected_shingle_hashes.insert( |
| + MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed)); |
| + expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ", |
| + kMurmurHash3Seed)); |
| + expected_shingle_hashes.insert( |
| + MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed)); |
| + |
| + features.Clear(); |
| + shingle_hashes.clear(); |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| + ExpectFeatureMapsAreEqual(features, expected_features); |
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| + |
| + // Test a page with too many words; we should only get 3 shingle hashes. |
| + page_text = ASCIIToUTF16("This page has way too many words."); |
| + expected_features.Clear(); |
| + expected_shingle_hashes.clear(); |
| + expected_shingle_hashes.insert(MurmurHash3String("this page has way ", |
| + kMurmurHash3Seed)); |
| + expected_shingle_hashes.insert(MurmurHash3String("page has way too ", |
| + kMurmurHash3Seed)); |
| + expected_shingle_hashes.insert(MurmurHash3String("has way too many ", |
| + kMurmurHash3Seed)); |
| features.Clear(); |
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| + shingle_hashes.clear(); |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| ExpectFeatureMapsAreEqual(features, expected_features); |
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| // Test with empty page text. |
| page_text = base::string16(); |
| expected_features.Clear(); |
| + expected_shingle_hashes.clear(); |
| features.Clear(); |
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| + shingle_hashes.clear(); |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| ExpectFeatureMapsAreEqual(features, expected_features); |
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| // Chinese translation of the phrase "hello goodbye". This tests that |
| // we can correctly separate terms in languages that don't use spaces. |
|
mattm
2014/05/06 01:00:14
Seems we should also have a similar test with enou
zysxqn
2014/05/06 20:56:57
Done.
|
| @@ -195,15 +258,19 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
| expected_features.AddBooleanFeature( |
| features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
| + expected_shingle_hashes.clear(); |
| features.Clear(); |
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| + shingle_hashes.clear(); |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| ExpectFeatureMapsAreEqual(features, expected_features); |
| + EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| } |
| TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
| // For this test, we'll cause the feature extraction to run multiple |
| - // iterations by incrementing the clock. |
| + // iterations by incrementing the clock. We don't check shingle hashes here |
| + // since its size is too large. |
|
mattm
2014/05/06 01:00:14
what do you mean by size is too large? That you do
zysxqn
2014/05/06 20:56:57
Done.
|
| // This page has a total of 30 words. For the features to be computed |
| // correctly, the extractor has to process the entire string of text. |
| @@ -247,7 +314,8 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
| std::string("two")); |
| FeatureMap features; |
| - ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| + std::set<uint32> shingle_hashes; |
| + ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| ExpectFeatureMapsAreEqual(features, expected_features); |
| // Make sure none of the mock expectations carry over to the next test. |
| ::testing::Mock::VerifyAndClearExpectations(&clock_); |
| @@ -271,7 +339,8 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
| .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
| features.Clear(); |
| - EXPECT_FALSE(ExtractFeatures(&page_text, &features)); |
| + shingle_hashes.clear(); |
| + EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| } |
| TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
| @@ -294,8 +363,9 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
| .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
| FeatureMap features; |
| + std::set<uint32> shingle_hashes; |
| // Extract first 10 words then stop. |
| - PartialExtractFeatures(page_text.get(), &features); |
| + PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); |
| page_text.reset(new base::string16()); |
| for (int i = 30; i < 58; ++i) { |
| @@ -303,12 +373,13 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
| } |
| page_text->append(ASCIIToUTF16("multi word test ")); |
| features.Clear(); |
| + shingle_hashes.clear(); |
| // This part doesn't exercise the extraction timing. |
| EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| // Now extract normally and make sure nothing breaks. |
| - EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); |
| + EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); |
| FeatureMap expected_features; |
| expected_features.AddBooleanFeature(features::kPageTerm + |