| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <stddef.h> |
| 8 #include <stdint.h> |
| 9 |
| 7 #include <string> | 10 #include <string> |
| 8 | 11 |
| 9 #include "base/bind.h" | 12 #include "base/bind.h" |
| 10 #include "base/callback.h" | 13 #include "base/callback.h" |
| 11 #include "base/containers/hash_tables.h" | 14 #include "base/containers/hash_tables.h" |
| 12 #include "base/location.h" | 15 #include "base/location.h" |
| 13 #include "base/memory/scoped_ptr.h" | 16 #include "base/memory/scoped_ptr.h" |
| 14 #include "base/message_loop/message_loop.h" | 17 #include "base/message_loop/message_loop.h" |
| 15 #include "base/single_thread_task_runner.h" | 18 #include "base/single_thread_task_runner.h" |
| 16 #include "base/strings/string16.h" | 19 #include "base/strings/string16.h" |
| 17 #include "base/strings/stringprintf.h" | 20 #include "base/strings/stringprintf.h" |
| 18 #include "base/strings/utf_string_conversions.h" | 21 #include "base/strings/utf_string_conversions.h" |
| 19 #include "base/time/time.h" | 22 #include "base/time/time.h" |
| 23 #include "build/build_config.h" |
| 20 #include "chrome/renderer/safe_browsing/features.h" | 24 #include "chrome/renderer/safe_browsing/features.h" |
| 21 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" | 25 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" |
| 22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 26 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
| 23 #include "chrome/renderer/safe_browsing/test_utils.h" | 27 #include "chrome/renderer/safe_browsing/test_utils.h" |
| 24 #include "crypto/sha2.h" | 28 #include "crypto/sha2.h" |
| 25 #include "testing/gmock/include/gmock/gmock.h" | 29 #include "testing/gmock/include/gmock/gmock.h" |
| 26 #include "testing/gtest/include/gtest/gtest.h" | 30 #include "testing/gtest/include/gtest/gtest.h" |
| 27 | 31 |
| 28 using base::ASCIIToUTF16; | 32 using base::ASCIIToUTF16; |
| 29 using ::testing::Return; | 33 using ::testing::Return; |
| 30 | 34 |
| 31 | 35 static const uint32_t kMurmurHash3Seed = 2777808611U; |
| 32 static const uint32 kMurmurHash3Seed = 2777808611U; | |
| 33 | 36 |
| 34 namespace safe_browsing { | 37 namespace safe_browsing { |
| 35 | 38 |
| 36 class PhishingTermFeatureExtractorTest : public ::testing::Test { | 39 class PhishingTermFeatureExtractorTest : public ::testing::Test { |
| 37 protected: | 40 protected: |
| 38 void SetUp() override { | 41 void SetUp() override { |
| 39 base::hash_set<std::string> terms; | 42 base::hash_set<std::string> terms; |
| 40 terms.insert("one"); | 43 terms.insert("one"); |
| 41 terms.insert("one one"); | 44 terms.insert("one one"); |
| 42 terms.insert("two"); | 45 terms.insert("two"); |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 84 kMurmurHash3Seed, | 87 kMurmurHash3Seed, |
| 85 max_shingles_per_page, | 88 max_shingles_per_page, |
| 86 4 /* shingle_size */, | 89 4 /* shingle_size */, |
| 87 &clock_)); | 90 &clock_)); |
| 88 } | 91 } |
| 89 | 92 |
| 90 // Runs the TermFeatureExtractor on |page_text|, waiting for the | 93 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
| 91 // completion callback. Returns the success boolean from the callback. | 94 // completion callback. Returns the success boolean from the callback. |
| 92 bool ExtractFeatures(const base::string16* page_text, | 95 bool ExtractFeatures(const base::string16* page_text, |
| 93 FeatureMap* features, | 96 FeatureMap* features, |
| 94 std::set<uint32>* shingle_hashes) { | 97 std::set<uint32_t>* shingle_hashes) { |
| 95 success_ = false; | 98 success_ = false; |
| 96 extractor_->ExtractFeatures( | 99 extractor_->ExtractFeatures( |
| 97 page_text, | 100 page_text, |
| 98 features, | 101 features, |
| 99 shingle_hashes, | 102 shingle_hashes, |
| 100 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 103 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| 101 base::Unretained(this))); | 104 base::Unretained(this))); |
| 102 msg_loop_.Run(); | 105 msg_loop_.Run(); |
| 103 return success_; | 106 return success_; |
| 104 } | 107 } |
| 105 | 108 |
| 106 void PartialExtractFeatures(const base::string16* page_text, | 109 void PartialExtractFeatures(const base::string16* page_text, |
| 107 FeatureMap* features, | 110 FeatureMap* features, |
| 108 std::set<uint32>* shingle_hashes) { | 111 std::set<uint32_t>* shingle_hashes) { |
| 109 extractor_->ExtractFeatures( | 112 extractor_->ExtractFeatures( |
| 110 page_text, | 113 page_text, |
| 111 features, | 114 features, |
| 112 shingle_hashes, | 115 shingle_hashes, |
| 113 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 116 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| 114 base::Unretained(this))); | 117 base::Unretained(this))); |
| 115 msg_loop_.task_runner()->PostTask( | 118 msg_loop_.task_runner()->PostTask( |
| 116 FROM_HERE, base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, | 119 FROM_HERE, base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, |
| 117 base::Unretained(this))); | 120 base::Unretained(this))); |
| 118 msg_loop_.RunUntilIdle(); | 121 msg_loop_.RunUntilIdle(); |
| 119 } | 122 } |
| 120 | 123 |
| 121 // Completion callback for feature extraction. | 124 // Completion callback for feature extraction. |
| 122 void ExtractionDone(bool success) { | 125 void ExtractionDone(bool success) { |
| 123 success_ = success; | 126 success_ = success; |
| 124 msg_loop_.QuitWhenIdle(); | 127 msg_loop_.QuitWhenIdle(); |
| 125 } | 128 } |
| 126 | 129 |
| 127 void QuitExtraction() { | 130 void QuitExtraction() { |
| 128 extractor_->CancelPendingExtraction(); | 131 extractor_->CancelPendingExtraction(); |
| 129 msg_loop_.QuitWhenIdle(); | 132 msg_loop_.QuitWhenIdle(); |
| 130 } | 133 } |
| 131 | 134 |
| 132 base::MessageLoop msg_loop_; | 135 base::MessageLoop msg_loop_; |
| 133 MockFeatureExtractorClock clock_; | 136 MockFeatureExtractorClock clock_; |
| 134 scoped_ptr<PhishingTermFeatureExtractor> extractor_; | 137 scoped_ptr<PhishingTermFeatureExtractor> extractor_; |
| 135 base::hash_set<std::string> term_hashes_; | 138 base::hash_set<std::string> term_hashes_; |
| 136 base::hash_set<uint32> word_hashes_; | 139 base::hash_set<uint32_t> word_hashes_; |
| 137 bool success_; // holds the success value from ExtractFeatures | 140 bool success_; // holds the success value from ExtractFeatures |
| 138 }; | 141 }; |
| 139 | 142 |
| 140 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { | 143 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| 141 // This test doesn't exercise the extraction timing. | 144 // This test doesn't exercise the extraction timing. |
| 142 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 145 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| 143 | 146 |
| 144 base::string16 page_text = ASCIIToUTF16("blah"); | 147 base::string16 page_text = ASCIIToUTF16("blah"); |
| 145 FeatureMap expected_features; // initially empty | 148 FeatureMap expected_features; // initially empty |
| 146 std::set<uint32> expected_shingle_hashes; | 149 std::set<uint32_t> expected_shingle_hashes; |
| 147 | 150 |
| 148 FeatureMap features; | 151 FeatureMap features; |
| 149 std::set<uint32> shingle_hashes; | 152 std::set<uint32_t> shingle_hashes; |
| 150 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | 153 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 151 ExpectFeatureMapsAreEqual(features, expected_features); | 154 ExpectFeatureMapsAreEqual(features, expected_features); |
| 152 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | 155 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 153 | 156 |
| 154 page_text = ASCIIToUTF16("one one"); | 157 page_text = ASCIIToUTF16("one one"); |
| 155 expected_features.Clear(); | 158 expected_features.Clear(); |
| 156 expected_features.AddBooleanFeature(features::kPageTerm + | 159 expected_features.AddBooleanFeature(features::kPageTerm + |
| 157 std::string("one")); | 160 std::string("one")); |
| 158 expected_features.AddBooleanFeature(features::kPageTerm + | 161 expected_features.AddBooleanFeature(features::kPageTerm + |
| 159 std::string("one one")); | 162 std::string("one one")); |
| (...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 232 expected_features.Clear(); | 235 expected_features.Clear(); |
| 233 expected_shingle_hashes.clear(); | 236 expected_shingle_hashes.clear(); |
| 234 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", | 237 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", |
| 235 kMurmurHash3Seed)); | 238 kMurmurHash3Seed)); |
| 236 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", | 239 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", |
| 237 kMurmurHash3Seed)); | 240 kMurmurHash3Seed)); |
| 238 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", | 241 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", |
| 239 kMurmurHash3Seed)); | 242 kMurmurHash3Seed)); |
| 240 expected_shingle_hashes.insert(MurmurHash3String("way too many words ", | 243 expected_shingle_hashes.insert(MurmurHash3String("way too many words ", |
| 241 kMurmurHash3Seed)); | 244 kMurmurHash3Seed)); |
| 242 std::set<uint32>::iterator it = expected_shingle_hashes.end(); | 245 std::set<uint32_t>::iterator it = expected_shingle_hashes.end(); |
| 243 expected_shingle_hashes.erase(--it); | 246 expected_shingle_hashes.erase(--it); |
| 244 | 247 |
| 245 features.Clear(); | 248 features.Clear(); |
| 246 shingle_hashes.clear(); | 249 shingle_hashes.clear(); |
| 247 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | 250 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 248 ExpectFeatureMapsAreEqual(features, expected_features); | 251 ExpectFeatureMapsAreEqual(features, expected_features); |
| 249 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | 252 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 250 | 253 |
| 251 // Test with empty page text. | 254 // Test with empty page text. |
| 252 page_text = base::string16(); | 255 page_text = base::string16(); |
| (...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 324 // Time check after the next 5 words. | 327 // Time check after the next 5 words. |
| 325 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) | 328 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) |
| 326 // A final check for the histograms. | 329 // A final check for the histograms. |
| 327 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); | 330 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); |
| 328 | 331 |
| 329 FeatureMap expected_features; | 332 FeatureMap expected_features; |
| 330 expected_features.AddBooleanFeature(features::kPageTerm + | 333 expected_features.AddBooleanFeature(features::kPageTerm + |
| 331 std::string("one")); | 334 std::string("one")); |
| 332 expected_features.AddBooleanFeature(features::kPageTerm + | 335 expected_features.AddBooleanFeature(features::kPageTerm + |
| 333 std::string("two")); | 336 std::string("two")); |
| 334 std::set<uint32> expected_shingle_hashes; | 337 std::set<uint32_t> expected_shingle_hashes; |
| 335 expected_shingle_hashes.insert( | 338 expected_shingle_hashes.insert( |
| 336 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed)); | 339 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed)); |
| 337 expected_shingle_hashes.insert( | 340 expected_shingle_hashes.insert( |
| 338 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed)); | 341 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed)); |
| 339 expected_shingle_hashes.insert( | 342 expected_shingle_hashes.insert( |
| 340 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed)); | 343 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed)); |
| 341 expected_shingle_hashes.insert( | 344 expected_shingle_hashes.insert( |
| 342 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed)); | 345 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed)); |
| 343 expected_shingle_hashes.insert( | 346 expected_shingle_hashes.insert( |
| 344 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed)); | 347 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed)); |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 381 expected_shingle_hashes.insert( | 384 expected_shingle_hashes.insert( |
| 382 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed)); | 385 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed)); |
| 383 expected_shingle_hashes.insert( | 386 expected_shingle_hashes.insert( |
| 384 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed)); | 387 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed)); |
| 385 expected_shingle_hashes.insert( | 388 expected_shingle_hashes.insert( |
| 386 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed)); | 389 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed)); |
| 387 expected_shingle_hashes.insert( | 390 expected_shingle_hashes.insert( |
| 388 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed)); | 391 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed)); |
| 389 | 392 |
| 390 FeatureMap features; | 393 FeatureMap features; |
| 391 std::set<uint32> shingle_hashes; | 394 std::set<uint32_t> shingle_hashes; |
| 392 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | 395 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 393 ExpectFeatureMapsAreEqual(features, expected_features); | 396 ExpectFeatureMapsAreEqual(features, expected_features); |
| 394 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | 397 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 395 // Make sure none of the mock expectations carry over to the next test. | 398 // Make sure none of the mock expectations carry over to the next test. |
| 396 ::testing::Mock::VerifyAndClearExpectations(&clock_); | 399 ::testing::Mock::VerifyAndClearExpectations(&clock_); |
| 397 | 400 |
| 398 // Now repeat the test with the same text, but advance the clock faster so | 401 // Now repeat the test with the same text, but advance the clock faster so |
| 399 // that the extraction time exceeds the maximum total time for the feature | 402 // that the extraction time exceeds the maximum total time for the feature |
| 400 // extractor. Extraction should fail. Note that this assumes | 403 // extractor. Extraction should fail. Note that this assumes |
| 401 // kMaxTotalTimeMs = 500. | 404 // kMaxTotalTimeMs = 500. |
| (...skipping 29 matching lines...) Expand all Loading... |
| 431 .WillOnce(Return(now)) | 434 .WillOnce(Return(now)) |
| 432 // Time check at the start of the first chunk of work. | 435 // Time check at the start of the first chunk of work. |
| 433 .WillOnce(Return(now)) | 436 .WillOnce(Return(now)) |
| 434 // Time check after the first 5 words. | 437 // Time check after the first 5 words. |
| 435 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) | 438 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) |
| 436 // Time check after the next 5 words. This should be greater than | 439 // Time check after the next 5 words. This should be greater than |
| 437 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. | 440 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. |
| 438 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); | 441 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
| 439 | 442 |
| 440 FeatureMap features; | 443 FeatureMap features; |
| 441 std::set<uint32> shingle_hashes; | 444 std::set<uint32_t> shingle_hashes; |
| 442 // Extract first 10 words then stop. | 445 // Extract first 10 words then stop. |
| 443 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); | 446 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); |
| 444 | 447 |
| 445 page_text.reset(new base::string16()); | 448 page_text.reset(new base::string16()); |
| 446 for (int i = 30; i < 58; ++i) { | 449 for (int i = 30; i < 58; ++i) { |
| 447 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 450 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 448 } | 451 } |
| 449 page_text->append(ASCIIToUTF16("multi word test ")); | 452 page_text->append(ASCIIToUTF16("multi word test ")); |
| 450 features.Clear(); | 453 features.Clear(); |
| 451 shingle_hashes.clear(); | 454 shingle_hashes.clear(); |
| 452 | 455 |
| 453 // This part doesn't exercise the extraction timing. | 456 // This part doesn't exercise the extraction timing. |
| 454 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 457 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| 455 | 458 |
| 456 // Now extract normally and make sure nothing breaks. | 459 // Now extract normally and make sure nothing breaks. |
| 457 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); | 460 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); |
| 458 | 461 |
| 459 FeatureMap expected_features; | 462 FeatureMap expected_features; |
| 460 expected_features.AddBooleanFeature(features::kPageTerm + | 463 expected_features.AddBooleanFeature(features::kPageTerm + |
| 461 std::string("multi word test")); | 464 std::string("multi word test")); |
| 462 ExpectFeatureMapsAreEqual(features, expected_features); | 465 ExpectFeatureMapsAreEqual(features, expected_features); |
| 463 } | 466 } |
| 464 | 467 |
| 465 } // namespace safe_browsing | 468 } // namespace safe_browsing |
| OLD | NEW |