Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4117)

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix a reference problem. Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
index b8627de34d3a756c2a9ac2707a3e7bb340d9aea6..98316a52af4d33f77eb55c80a3d1a4871c22c25f 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
@@ -26,6 +26,9 @@
using base::ASCIIToUTF16;
using ::testing::Return;
+
+static const uint32 kMurmurHash3Seed = 2777808611U;
+
namespace safe_browsing {
class PhishingTermFeatureExtractorTest : public ::testing::Test {
@@ -63,7 +66,6 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
words.insert("\xe5\x86\x8d\xe8\xa7\x81");
- static const uint32 kMurmurHash3Seed = 2777808611U;
for (base::hash_set<std::string>::iterator it = words.begin();
it != words.end(); ++it) {
word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
@@ -72,18 +74,23 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
extractor_.reset(new PhishingTermFeatureExtractor(
&term_hashes_,
&word_hashes_,
+ 3 /* max_hashes_per_page */,
3 /* max_words_per_term */,
kMurmurHash3Seed,
+ 4 /* shingle_size */,
&clock_));
}
// Runs the TermFeatureExtractor on |page_text|, waiting for the
// completion callback. Returns the success boolean from the callback.
- bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) {
+ bool ExtractFeatures(const base::string16* page_text,
+ FeatureMap* features,
+ std::set<uint32>* shingle_hashes) {
success_ = false;
extractor_->ExtractFeatures(
page_text,
features,
+ shingle_hashes,
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
base::Unretained(this)));
msg_loop_.Run();
@@ -91,10 +98,12 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
}
void PartialExtractFeatures(const base::string16* page_text,
- FeatureMap* features) {
+ FeatureMap* features,
+ std::set<uint32>* shingle_hashes) {
extractor_->ExtractFeatures(
page_text,
features,
+ shingle_hashes,
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
base::Unretained(this)));
msg_loop_.PostTask(
@@ -129,10 +138,13 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
base::string16 page_text = ASCIIToUTF16("blah");
FeatureMap expected_features; // initially empty
+ std::set<uint32> expected_shingle_hashes;
FeatureMap features;
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ std::set<uint32> shingle_hashes;
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
page_text = ASCIIToUTF16("one one");
expected_features.Clear();
@@ -140,29 +152,51 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
std::string("one"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("one one"));
+ expected_shingle_hashes.clear();
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
page_text = ASCIIToUTF16("bla bla multi word test bla");
expected_features.Clear();
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("multi word test"));
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
+ kMurmurHash3Seed));
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// This text has all of the words for one of the terms, but they are
// not in the correct order.
page_text = ASCIIToUTF16("bla bla test word multi bla");
expected_features.Clear();
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
+ kMurmurHash3Seed));
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
+ // Test various separators.
page_text = ASCIIToUTF16("Capitalization plus non-space\n"
"separator... punctuation!");
expected_features.Clear();
@@ -174,17 +208,46 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
std::string("separator"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("punctuation"));
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(
+ MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
+
+ features.Clear();
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
+ ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
+
+ // Test a page with too many words; we should only get 3 shingle hashes.
+ page_text = ASCIIToUTF16("This page has way too many words.");
+ expected_features.Clear();
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
+ kMurmurHash3Seed));
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// Test with empty page text.
page_text = base::string16();
expected_features.Clear();
+ expected_shingle_hashes.clear();
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// Chinese translation of the phrase "hello goodbye". This tests that
// we can correctly separate terms in languages that don't use spaces.
mattm 2014/05/06 01:00:14 Seems we should also have a similar test with enou
zysxqn 2014/05/06 20:56:57 Done.
@@ -195,15 +258,19 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
expected_features.AddBooleanFeature(
features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
+ expected_shingle_hashes.clear();
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
}
TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
// For this test, we'll cause the feature extraction to run multiple
- // iterations by incrementing the clock.
+ // iterations by incrementing the clock. We don't check shingle hashes here
+ // since the set of hashes is too large.
mattm 2014/05/06 01:00:14 what do you mean by size is too large? That you do
zysxqn 2014/05/06 20:56:57 Done.
// This page has a total of 30 words. For the features to be computed
// correctly, the extractor has to process the entire string of text.
@@ -247,7 +314,8 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
std::string("two"));
FeatureMap features;
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ std::set<uint32> shingle_hashes;
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
// Make sure none of the mock expectations carry over to the next test.
::testing::Mock::VerifyAndClearExpectations(&clock_);
@@ -271,7 +339,8 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
features.Clear();
- EXPECT_FALSE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
}
TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
@@ -294,8 +363,9 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
FeatureMap features;
+ std::set<uint32> shingle_hashes;
// Extract first 10 words then stop.
- PartialExtractFeatures(page_text.get(), &features);
+ PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
page_text.reset(new base::string16());
for (int i = 30; i < 58; ++i) {
@@ -303,12 +373,13 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
}
page_text->append(ASCIIToUTF16("multi word test "));
features.Clear();
+ shingle_hashes.clear();
// This part doesn't exercise the extraction timing.
EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
// Now extract normally and make sure nothing breaks.
- EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
+ EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
FeatureMap expected_features;
expected_features.AddBooleanFeature(features::kPageTerm +

Powered by Google App Engine
This is Rietveld 408576698