OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
| 7 #include <stddef.h> |
| 8 #include <stdint.h> |
| 9 |
7 #include <string> | 10 #include <string> |
8 | 11 |
9 #include "base/bind.h" | 12 #include "base/bind.h" |
10 #include "base/callback.h" | 13 #include "base/callback.h" |
11 #include "base/containers/hash_tables.h" | 14 #include "base/containers/hash_tables.h" |
12 #include "base/location.h" | 15 #include "base/location.h" |
13 #include "base/memory/scoped_ptr.h" | 16 #include "base/memory/scoped_ptr.h" |
14 #include "base/message_loop/message_loop.h" | 17 #include "base/message_loop/message_loop.h" |
15 #include "base/single_thread_task_runner.h" | 18 #include "base/single_thread_task_runner.h" |
16 #include "base/strings/string16.h" | 19 #include "base/strings/string16.h" |
17 #include "base/strings/stringprintf.h" | 20 #include "base/strings/stringprintf.h" |
18 #include "base/strings/utf_string_conversions.h" | 21 #include "base/strings/utf_string_conversions.h" |
19 #include "base/time/time.h" | 22 #include "base/time/time.h" |
| 23 #include "build/build_config.h" |
20 #include "chrome/renderer/safe_browsing/features.h" | 24 #include "chrome/renderer/safe_browsing/features.h" |
21 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" | 25 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" |
22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 26 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
23 #include "chrome/renderer/safe_browsing/test_utils.h" | 27 #include "chrome/renderer/safe_browsing/test_utils.h" |
24 #include "crypto/sha2.h" | 28 #include "crypto/sha2.h" |
25 #include "testing/gmock/include/gmock/gmock.h" | 29 #include "testing/gmock/include/gmock/gmock.h" |
26 #include "testing/gtest/include/gtest/gtest.h" | 30 #include "testing/gtest/include/gtest/gtest.h" |
27 | 31 |
28 using base::ASCIIToUTF16; | 32 using base::ASCIIToUTF16; |
29 using ::testing::Return; | 33 using ::testing::Return; |
30 | 34 |
31 | 35 static const uint32_t kMurmurHash3Seed = 2777808611U; |
32 static const uint32 kMurmurHash3Seed = 2777808611U; | |
33 | 36 |
34 namespace safe_browsing { | 37 namespace safe_browsing { |
35 | 38 |
36 class PhishingTermFeatureExtractorTest : public ::testing::Test { | 39 class PhishingTermFeatureExtractorTest : public ::testing::Test { |
37 protected: | 40 protected: |
38 void SetUp() override { | 41 void SetUp() override { |
39 base::hash_set<std::string> terms; | 42 base::hash_set<std::string> terms; |
40 terms.insert("one"); | 43 terms.insert("one"); |
41 terms.insert("one one"); | 44 terms.insert("one one"); |
42 terms.insert("two"); | 45 terms.insert("two"); |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
84 kMurmurHash3Seed, | 87 kMurmurHash3Seed, |
85 max_shingles_per_page, | 88 max_shingles_per_page, |
86 4 /* shingle_size */, | 89 4 /* shingle_size */, |
87 &clock_)); | 90 &clock_)); |
88 } | 91 } |
89 | 92 |
90 // Runs the TermFeatureExtractor on |page_text|, waiting for the | 93 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
91 // completion callback. Returns the success boolean from the callback. | 94 // completion callback. Returns the success boolean from the callback. |
92 bool ExtractFeatures(const base::string16* page_text, | 95 bool ExtractFeatures(const base::string16* page_text, |
93 FeatureMap* features, | 96 FeatureMap* features, |
94 std::set<uint32>* shingle_hashes) { | 97 std::set<uint32_t>* shingle_hashes) { |
95 success_ = false; | 98 success_ = false; |
96 extractor_->ExtractFeatures( | 99 extractor_->ExtractFeatures( |
97 page_text, | 100 page_text, |
98 features, | 101 features, |
99 shingle_hashes, | 102 shingle_hashes, |
100 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 103 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
101 base::Unretained(this))); | 104 base::Unretained(this))); |
102 msg_loop_.Run(); | 105 msg_loop_.Run(); |
103 return success_; | 106 return success_; |
104 } | 107 } |
105 | 108 |
106 void PartialExtractFeatures(const base::string16* page_text, | 109 void PartialExtractFeatures(const base::string16* page_text, |
107 FeatureMap* features, | 110 FeatureMap* features, |
108 std::set<uint32>* shingle_hashes) { | 111 std::set<uint32_t>* shingle_hashes) { |
109 extractor_->ExtractFeatures( | 112 extractor_->ExtractFeatures( |
110 page_text, | 113 page_text, |
111 features, | 114 features, |
112 shingle_hashes, | 115 shingle_hashes, |
113 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 116 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
114 base::Unretained(this))); | 117 base::Unretained(this))); |
115 msg_loop_.task_runner()->PostTask( | 118 msg_loop_.task_runner()->PostTask( |
116 FROM_HERE, base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, | 119 FROM_HERE, base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, |
117 base::Unretained(this))); | 120 base::Unretained(this))); |
118 msg_loop_.RunUntilIdle(); | 121 msg_loop_.RunUntilIdle(); |
119 } | 122 } |
120 | 123 |
121 // Completion callback for feature extraction. | 124 // Completion callback for feature extraction. |
122 void ExtractionDone(bool success) { | 125 void ExtractionDone(bool success) { |
123 success_ = success; | 126 success_ = success; |
124 msg_loop_.QuitWhenIdle(); | 127 msg_loop_.QuitWhenIdle(); |
125 } | 128 } |
126 | 129 |
127 void QuitExtraction() { | 130 void QuitExtraction() { |
128 extractor_->CancelPendingExtraction(); | 131 extractor_->CancelPendingExtraction(); |
129 msg_loop_.QuitWhenIdle(); | 132 msg_loop_.QuitWhenIdle(); |
130 } | 133 } |
131 | 134 |
132 base::MessageLoop msg_loop_; | 135 base::MessageLoop msg_loop_; |
133 MockFeatureExtractorClock clock_; | 136 MockFeatureExtractorClock clock_; |
134 scoped_ptr<PhishingTermFeatureExtractor> extractor_; | 137 scoped_ptr<PhishingTermFeatureExtractor> extractor_; |
135 base::hash_set<std::string> term_hashes_; | 138 base::hash_set<std::string> term_hashes_; |
136 base::hash_set<uint32> word_hashes_; | 139 base::hash_set<uint32_t> word_hashes_; |
137 bool success_; // holds the success value from ExtractFeatures | 140 bool success_; // holds the success value from ExtractFeatures |
138 }; | 141 }; |
139 | 142 |
140 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { | 143 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
141 // This test doesn't exercise the extraction timing. | 144 // This test doesn't exercise the extraction timing. |
142 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 145 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
143 | 146 |
144 base::string16 page_text = ASCIIToUTF16("blah"); | 147 base::string16 page_text = ASCIIToUTF16("blah"); |
145 FeatureMap expected_features; // initially empty | 148 FeatureMap expected_features; // initially empty |
146 std::set<uint32> expected_shingle_hashes; | 149 std::set<uint32_t> expected_shingle_hashes; |
147 | 150 |
148 FeatureMap features; | 151 FeatureMap features; |
149 std::set<uint32> shingle_hashes; | 152 std::set<uint32_t> shingle_hashes; |
150 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | 153 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
151 ExpectFeatureMapsAreEqual(features, expected_features); | 154 ExpectFeatureMapsAreEqual(features, expected_features); |
152 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | 155 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
153 | 156 |
154 page_text = ASCIIToUTF16("one one"); | 157 page_text = ASCIIToUTF16("one one"); |
155 expected_features.Clear(); | 158 expected_features.Clear(); |
156 expected_features.AddBooleanFeature(features::kPageTerm + | 159 expected_features.AddBooleanFeature(features::kPageTerm + |
157 std::string("one")); | 160 std::string("one")); |
158 expected_features.AddBooleanFeature(features::kPageTerm + | 161 expected_features.AddBooleanFeature(features::kPageTerm + |
159 std::string("one one")); | 162 std::string("one one")); |
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
232 expected_features.Clear(); | 235 expected_features.Clear(); |
233 expected_shingle_hashes.clear(); | 236 expected_shingle_hashes.clear(); |
234 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", | 237 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", |
235 kMurmurHash3Seed)); | 238 kMurmurHash3Seed)); |
236 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", | 239 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", |
237 kMurmurHash3Seed)); | 240 kMurmurHash3Seed)); |
238 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", | 241 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", |
239 kMurmurHash3Seed)); | 242 kMurmurHash3Seed)); |
240 expected_shingle_hashes.insert(MurmurHash3String("way too many words ", | 243 expected_shingle_hashes.insert(MurmurHash3String("way too many words ", |
241 kMurmurHash3Seed)); | 244 kMurmurHash3Seed)); |
242 std::set<uint32>::iterator it = expected_shingle_hashes.end(); | 245 std::set<uint32_t>::iterator it = expected_shingle_hashes.end(); |
243 expected_shingle_hashes.erase(--it); | 246 expected_shingle_hashes.erase(--it); |
244 | 247 |
245 features.Clear(); | 248 features.Clear(); |
246 shingle_hashes.clear(); | 249 shingle_hashes.clear(); |
247 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | 250 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
248 ExpectFeatureMapsAreEqual(features, expected_features); | 251 ExpectFeatureMapsAreEqual(features, expected_features); |
249 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | 252 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
250 | 253 |
251 // Test with empty page text. | 254 // Test with empty page text. |
252 page_text = base::string16(); | 255 page_text = base::string16(); |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
324 // Time check after the next 5 words. | 327 // Time check after the next 5 words. |
325 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) | 328 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) |
326 // A final check for the histograms. | 329 // A final check for the histograms. |
327 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); | 330 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); |
328 | 331 |
329 FeatureMap expected_features; | 332 FeatureMap expected_features; |
330 expected_features.AddBooleanFeature(features::kPageTerm + | 333 expected_features.AddBooleanFeature(features::kPageTerm + |
331 std::string("one")); | 334 std::string("one")); |
332 expected_features.AddBooleanFeature(features::kPageTerm + | 335 expected_features.AddBooleanFeature(features::kPageTerm + |
333 std::string("two")); | 336 std::string("two")); |
334 std::set<uint32> expected_shingle_hashes; | 337 std::set<uint32_t> expected_shingle_hashes; |
335 expected_shingle_hashes.insert( | 338 expected_shingle_hashes.insert( |
336 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed)); | 339 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed)); |
337 expected_shingle_hashes.insert( | 340 expected_shingle_hashes.insert( |
338 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed)); | 341 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed)); |
339 expected_shingle_hashes.insert( | 342 expected_shingle_hashes.insert( |
340 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed)); | 343 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed)); |
341 expected_shingle_hashes.insert( | 344 expected_shingle_hashes.insert( |
342 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed)); | 345 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed)); |
343 expected_shingle_hashes.insert( | 346 expected_shingle_hashes.insert( |
344 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed)); | 347 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed)); |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
381 expected_shingle_hashes.insert( | 384 expected_shingle_hashes.insert( |
382 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed)); | 385 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed)); |
383 expected_shingle_hashes.insert( | 386 expected_shingle_hashes.insert( |
384 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed)); | 387 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed)); |
385 expected_shingle_hashes.insert( | 388 expected_shingle_hashes.insert( |
386 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed)); | 389 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed)); |
387 expected_shingle_hashes.insert( | 390 expected_shingle_hashes.insert( |
388 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed)); | 391 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed)); |
389 | 392 |
390 FeatureMap features; | 393 FeatureMap features; |
391 std::set<uint32> shingle_hashes; | 394 std::set<uint32_t> shingle_hashes; |
392 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | 395 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
393 ExpectFeatureMapsAreEqual(features, expected_features); | 396 ExpectFeatureMapsAreEqual(features, expected_features); |
394 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | 397 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
395 // Make sure none of the mock expectations carry over to the next test. | 398 // Make sure none of the mock expectations carry over to the next test. |
396 ::testing::Mock::VerifyAndClearExpectations(&clock_); | 399 ::testing::Mock::VerifyAndClearExpectations(&clock_); |
397 | 400 |
398 // Now repeat the test with the same text, but advance the clock faster so | 401 // Now repeat the test with the same text, but advance the clock faster so |
399 // that the extraction time exceeds the maximum total time for the feature | 402 // that the extraction time exceeds the maximum total time for the feature |
400 // extractor. Extraction should fail. Note that this assumes | 403 // extractor. Extraction should fail. Note that this assumes |
401 // kMaxTotalTimeMs = 500. | 404 // kMaxTotalTimeMs = 500. |
(...skipping 29 matching lines...) Expand all Loading... |
431 .WillOnce(Return(now)) | 434 .WillOnce(Return(now)) |
432 // Time check at the start of the first chunk of work. | 435 // Time check at the start of the first chunk of work. |
433 .WillOnce(Return(now)) | 436 .WillOnce(Return(now)) |
434 // Time check after the first 5 words. | 437 // Time check after the first 5 words. |
435 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) | 438 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) |
436 // Time check after the next 5 words. This should be greater than | 439 // Time check after the next 5 words. This should be greater than |
437 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. | 440 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. |
438 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); | 441 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
439 | 442 |
440 FeatureMap features; | 443 FeatureMap features; |
441 std::set<uint32> shingle_hashes; | 444 std::set<uint32_t> shingle_hashes; |
442 // Extract first 10 words then stop. | 445 // Extract first 10 words then stop. |
443 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); | 446 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); |
444 | 447 |
445 page_text.reset(new base::string16()); | 448 page_text.reset(new base::string16()); |
446 for (int i = 30; i < 58; ++i) { | 449 for (int i = 30; i < 58; ++i) { |
447 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 450 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
448 } | 451 } |
449 page_text->append(ASCIIToUTF16("multi word test ")); | 452 page_text->append(ASCIIToUTF16("multi word test ")); |
450 features.Clear(); | 453 features.Clear(); |
451 shingle_hashes.clear(); | 454 shingle_hashes.clear(); |
452 | 455 |
453 // This part doesn't exercise the extraction timing. | 456 // This part doesn't exercise the extraction timing. |
454 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 457 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
455 | 458 |
456 // Now extract normally and make sure nothing breaks. | 459 // Now extract normally and make sure nothing breaks. |
457 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); | 460 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); |
458 | 461 |
459 FeatureMap expected_features; | 462 FeatureMap expected_features; |
460 expected_features.AddBooleanFeature(features::kPageTerm + | 463 expected_features.AddBooleanFeature(features::kPageTerm + |
461 std::string("multi word test")); | 464 std::string("multi word test")); |
462 ExpectFeatureMapsAreEqual(features, expected_features); | 465 ExpectFeatureMapsAreEqual(features, expected_features); |
463 } | 466 } |
464 | 467 |
465 } // namespace safe_browsing | 468 } // namespace safe_browsing |
OLD | NEW |