Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
index b8627de34d3a756c2a9ac2707a3e7bb340d9aea6..0c1a056f28a5482ad52fb709ecad1d6f2378b1f9 100644 |
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc |
@@ -26,6 +26,9 @@ |
using base::ASCIIToUTF16; |
using ::testing::Return; |
+ |
+static const uint32 kMurmurHash3Seed = 2777808611U; |
+ |
namespace safe_browsing { |
class PhishingTermFeatureExtractorTest : public ::testing::Test { |
@@ -63,27 +66,35 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test { |
words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); |
words.insert("\xe5\x86\x8d\xe8\xa7\x81"); |
- static const uint32 kMurmurHash3Seed = 2777808611U; |
for (base::hash_set<std::string>::iterator it = words.begin(); |
it != words.end(); ++it) { |
word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); |
} |
+ ResetExtractor(3 /* max shingles per page */); |
+ } |
+ |
+ void ResetExtractor(size_t max_shingles_per_page) { |
extractor_.reset(new PhishingTermFeatureExtractor( |
&term_hashes_, |
&word_hashes_, |
3 /* max_words_per_term */, |
kMurmurHash3Seed, |
+ max_shingles_per_page, |
+ 4 /* shingle_size */, |
&clock_)); |
} |
// Runs the TermFeatureExtractor on |page_text|, waiting for the |
// completion callback. Returns the success boolean from the callback. |
- bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { |
+ bool ExtractFeatures(const base::string16* page_text, |
+ FeatureMap* features, |
+ std::set<uint32>* shingle_hashes) { |
success_ = false; |
extractor_->ExtractFeatures( |
page_text, |
features, |
+ shingle_hashes, |
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
base::Unretained(this))); |
msg_loop_.Run(); |
@@ -91,10 +102,12 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test { |
} |
void PartialExtractFeatures(const base::string16* page_text, |
- FeatureMap* features) { |
+ FeatureMap* features, |
+ std::set<uint32>* shingle_hashes) { |
extractor_->ExtractFeatures( |
page_text, |
features, |
+ shingle_hashes, |
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
base::Unretained(this))); |
msg_loop_.PostTask( |
@@ -129,10 +142,13 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
base::string16 page_text = ASCIIToUTF16("blah"); |
FeatureMap expected_features; // initially empty |
+ std::set<uint32> expected_shingle_hashes; |
FeatureMap features; |
- ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
+ std::set<uint32> shingle_hashes; |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
page_text = ASCIIToUTF16("one one"); |
expected_features.Clear(); |
@@ -140,29 +156,51 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
std::string("one")); |
expected_features.AddBooleanFeature(features::kPageTerm + |
std::string("one one")); |
+ expected_shingle_hashes.clear(); |
features.Clear(); |
- ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
+ shingle_hashes.clear(); |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
page_text = ASCIIToUTF16("bla bla multi word test bla"); |
expected_features.Clear(); |
expected_features.AddBooleanFeature(features::kPageTerm + |
std::string("multi word test")); |
+ expected_shingle_hashes.clear(); |
+ expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ", |
+ kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ", |
+ kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ", |
+ kMurmurHash3Seed)); |
features.Clear(); |
- ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
+ shingle_hashes.clear(); |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
// This text has all of the words for one of the terms, but they are |
// not in the correct order. |
page_text = ASCIIToUTF16("bla bla test word multi bla"); |
expected_features.Clear(); |
+ expected_shingle_hashes.clear(); |
+ expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ", |
+ kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ", |
+ kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ", |
+ kMurmurHash3Seed)); |
features.Clear(); |
- ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
+ shingle_hashes.clear(); |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
+ // Test various separators. |
page_text = ASCIIToUTF16("Capitalization plus non-space\n" |
"separator... punctuation!"); |
expected_features.Clear(); |
@@ -174,36 +212,77 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
std::string("separator")); |
expected_features.AddBooleanFeature(features::kPageTerm + |
std::string("punctuation")); |
+ expected_shingle_hashes.clear(); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ", |
+ kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed)); |
+ |
+ features.Clear(); |
+ shingle_hashes.clear(); |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
+ ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
+ |
+ // Test a page with too many words; only the 3 minimum hashes are kept. |
+ page_text = ASCIIToUTF16("This page has way too many words."); |
+ expected_features.Clear(); |
+ expected_shingle_hashes.clear(); |
+ expected_shingle_hashes.insert(MurmurHash3String("this page has way ", |
+ kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert(MurmurHash3String("page has way too ", |
+ kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert(MurmurHash3String("has way too many ", |
+ kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert(MurmurHash3String("way too many words ", |
+ kMurmurHash3Seed)); |
+ std::set<uint32>::iterator it = expected_shingle_hashes.end(); |
+ expected_shingle_hashes.erase(--it); |
features.Clear(); |
- ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
+ shingle_hashes.clear(); |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
// Test with empty page text. |
page_text = base::string16(); |
expected_features.Clear(); |
+ expected_shingle_hashes.clear(); |
features.Clear(); |
- ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
+ shingle_hashes.clear(); |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
- // Chinese translation of the phrase "hello goodbye". This tests that |
- // we can correctly separate terms in languages that don't use spaces. |
+ // Chinese translation of the phrase "hello goodbye hello goodbye". This tests |
+ // that we can correctly separate terms in languages that don't use spaces. |
page_text = |
- base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
+ base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81" |
+ "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
expected_features.Clear(); |
expected_features.AddBooleanFeature( |
features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
expected_features.AddBooleanFeature( |
features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
+ expected_shingle_hashes.clear(); |
+ expected_shingle_hashes.insert(MurmurHash3String( |
+ "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 " |
+ "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed)); |
features.Clear(); |
- ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
+ shingle_hashes.clear(); |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
} |
TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
// For this test, we'll cause the feature extraction to run multiple |
// iterations by incrementing the clock. |
+ ResetExtractor(200 /* max shingles per page */); |
// This page has a total of 30 words. For the features to be computed |
// correctly, the extractor has to process the entire string of text. |
@@ -245,10 +324,67 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
std::string("one")); |
expected_features.AddBooleanFeature(features::kPageTerm + |
std::string("two")); |
+ std::set<uint32> expected_shingle_hashes; |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed)); |
+ expected_shingle_hashes.insert( |
+ MurmurHash3String("25 26 27 two ", kMurmurHash3Seed)); |
FeatureMap features; |
- ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
+ std::set<uint32> shingle_hashes; |
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
ExpectFeatureMapsAreEqual(features, expected_features); |
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
// Make sure none of the mock expectations carry over to the next test. |
::testing::Mock::VerifyAndClearExpectations(&clock_); |
@@ -271,7 +407,8 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
features.Clear(); |
- EXPECT_FALSE(ExtractFeatures(&page_text, &features)); |
+ shingle_hashes.clear(); |
+ EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
} |
TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
@@ -294,8 +431,9 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
FeatureMap features; |
+ std::set<uint32> shingle_hashes; |
// Extract first 10 words then stop. |
- PartialExtractFeatures(page_text.get(), &features); |
+ PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); |
page_text.reset(new base::string16()); |
for (int i = 30; i < 58; ++i) { |
@@ -303,12 +441,13 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
} |
page_text->append(ASCIIToUTF16("multi word test ")); |
features.Clear(); |
+ shingle_hashes.clear(); |
// This part doesn't exercise the extraction timing. |
EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
// Now extract normally and make sure nothing breaks. |
- EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); |
+ EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); |
FeatureMap expected_features; |
expected_features.AddBooleanFeature(features::kPageTerm + |