| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <string> | 7 #include <string> |
| 8 | 8 |
| 9 #include "base/bind.h" | 9 #include "base/bind.h" |
| 10 #include "base/callback.h" | 10 #include "base/callback.h" |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 71 extractor_.reset(new PhishingTermFeatureExtractor( | 71 extractor_.reset(new PhishingTermFeatureExtractor( |
| 72 &term_hashes_, | 72 &term_hashes_, |
| 73 &word_hashes_, | 73 &word_hashes_, |
| 74 3 /* max_words_per_term */, | 74 3 /* max_words_per_term */, |
| 75 kMurmurHash3Seed, | 75 kMurmurHash3Seed, |
| 76 &clock_)); | 76 &clock_)); |
| 77 } | 77 } |
| 78 | 78 |
| 79 // Runs the TermFeatureExtractor on |page_text|, waiting for the | 79 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
| 80 // completion callback. Returns the success boolean from the callback. | 80 // completion callback. Returns the success boolean from the callback. |
| 81 bool ExtractFeatures(const string16* page_text, FeatureMap* features) { | 81 bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { |
| 82 success_ = false; | 82 success_ = false; |
| 83 extractor_->ExtractFeatures( | 83 extractor_->ExtractFeatures( |
| 84 page_text, | 84 page_text, |
| 85 features, | 85 features, |
| 86 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 86 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| 87 base::Unretained(this))); | 87 base::Unretained(this))); |
| 88 msg_loop_.Run(); | 88 msg_loop_.Run(); |
| 89 return success_; | 89 return success_; |
| 90 } | 90 } |
| 91 | 91 |
| 92 void PartialExtractFeatures(const string16* page_text, FeatureMap* features) { | 92 void PartialExtractFeatures(const base::string16* page_text, |
| 93 FeatureMap* features) { |
| 93 extractor_->ExtractFeatures( | 94 extractor_->ExtractFeatures( |
| 94 page_text, | 95 page_text, |
| 95 features, | 96 features, |
| 96 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 97 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| 97 base::Unretained(this))); | 98 base::Unretained(this))); |
| 98 msg_loop_.PostTask( | 99 msg_loop_.PostTask( |
| 99 FROM_HERE, | 100 FROM_HERE, |
| 100 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, | 101 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, |
| 101 base::Unretained(this))); | 102 base::Unretained(this))); |
| 102 msg_loop_.RunUntilIdle(); | 103 msg_loop_.RunUntilIdle(); |
| (...skipping 15 matching lines...) Expand all Loading... |
| 118 scoped_ptr<PhishingTermFeatureExtractor> extractor_; | 119 scoped_ptr<PhishingTermFeatureExtractor> extractor_; |
| 119 base::hash_set<std::string> term_hashes_; | 120 base::hash_set<std::string> term_hashes_; |
| 120 base::hash_set<uint32> word_hashes_; | 121 base::hash_set<uint32> word_hashes_; |
| 121 bool success_; // holds the success value from ExtractFeatures | 122 bool success_; // holds the success value from ExtractFeatures |
| 122 }; | 123 }; |
| 123 | 124 |
| 124 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { | 125 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| 125 // This test doesn't exercise the extraction timing. | 126 // This test doesn't exercise the extraction timing. |
| 126 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 127 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| 127 | 128 |
| 128 string16 page_text = ASCIIToUTF16("blah"); | 129 base::string16 page_text = ASCIIToUTF16("blah"); |
| 129 FeatureMap expected_features; // initially empty | 130 FeatureMap expected_features; // initially empty |
| 130 | 131 |
| 131 FeatureMap features; | 132 FeatureMap features; |
| 132 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 133 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 133 ExpectFeatureMapsAreEqual(features, expected_features); | 134 ExpectFeatureMapsAreEqual(features, expected_features); |
| 134 | 135 |
| 135 page_text = ASCIIToUTF16("one one"); | 136 page_text = ASCIIToUTF16("one one"); |
| 136 expected_features.Clear(); | 137 expected_features.Clear(); |
| 137 expected_features.AddBooleanFeature(features::kPageTerm + | 138 expected_features.AddBooleanFeature(features::kPageTerm + |
| 138 std::string("one")); | 139 std::string("one")); |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 171 expected_features.AddBooleanFeature(features::kPageTerm + | 172 expected_features.AddBooleanFeature(features::kPageTerm + |
| 172 std::string("separator")); | 173 std::string("separator")); |
| 173 expected_features.AddBooleanFeature(features::kPageTerm + | 174 expected_features.AddBooleanFeature(features::kPageTerm + |
| 174 std::string("punctuation")); | 175 std::string("punctuation")); |
| 175 | 176 |
| 176 features.Clear(); | 177 features.Clear(); |
| 177 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 178 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 178 ExpectFeatureMapsAreEqual(features, expected_features); | 179 ExpectFeatureMapsAreEqual(features, expected_features); |
| 179 | 180 |
| 180 // Test with empty page text. | 181 // Test with empty page text. |
| 181 page_text = string16(); | 182 page_text = base::string16(); |
| 182 expected_features.Clear(); | 183 expected_features.Clear(); |
| 183 features.Clear(); | 184 features.Clear(); |
| 184 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 185 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 185 ExpectFeatureMapsAreEqual(features, expected_features); | 186 ExpectFeatureMapsAreEqual(features, expected_features); |
| 186 | 187 |
| 187 // Chinese translation of the phrase "hello goodbye". This tests that | 188 // Chinese translation of the phrase "hello goodbye". This tests that |
| 188 // we can correctly separate terms in languages that don't use spaces. | 189 // we can correctly separate terms in languages that don't use spaces. |
| 189 page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); | 190 page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
| 190 expected_features.Clear(); | 191 expected_features.Clear(); |
| 191 expected_features.AddBooleanFeature( | 192 expected_features.AddBooleanFeature( |
| 192 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); | 193 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
| 193 expected_features.AddBooleanFeature( | 194 expected_features.AddBooleanFeature( |
| 194 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); | 195 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
| 195 | 196 |
| 196 features.Clear(); | 197 features.Clear(); |
| 197 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 198 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 198 ExpectFeatureMapsAreEqual(features, expected_features); | 199 ExpectFeatureMapsAreEqual(features, expected_features); |
| 199 } | 200 } |
| 200 | 201 |
| 201 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { | 202 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
| 202 // For this test, we'll cause the feature extraction to run multiple | 203 // For this test, we'll cause the feature extraction to run multiple |
| 203 // iterations by incrementing the clock. | 204 // iterations by incrementing the clock. |
| 204 | 205 |
| 205 // This page has a total of 30 words. For the features to be computed | 206 // This page has a total of 30 words. For the features to be computed |
| 206 // correctly, the extractor has to process the entire string of text. | 207 // correctly, the extractor has to process the entire string of text. |
| 207 string16 page_text(ASCIIToUTF16("one ")); | 208 base::string16 page_text(ASCIIToUTF16("one ")); |
| 208 for (int i = 0; i < 28; ++i) { | 209 for (int i = 0; i < 28; ++i) { |
| 209 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 210 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 210 } | 211 } |
| 211 page_text.append(ASCIIToUTF16("two")); | 212 page_text.append(ASCIIToUTF16("two")); |
| 212 | 213 |
| 213 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. | 214 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. |
| 214 // Note that this assumes kClockCheckGranularity = 5 and | 215 // Note that this assumes kClockCheckGranularity = 5 and |
| 215 // kMaxTimePerChunkMs = 10. | 216 // kMaxTimePerChunkMs = 10. |
| 216 base::TimeTicks now = base::TimeTicks::Now(); | 217 base::TimeTicks now = base::TimeTicks::Now(); |
| 217 EXPECT_CALL(clock_, Now()) | 218 EXPECT_CALL(clock_, Now()) |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 265 // Time check after the next 5 words. This is over the limit. | 266 // Time check after the next 5 words. This is over the limit. |
| 266 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) | 267 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) |
| 267 // A final time check for the histograms. | 268 // A final time check for the histograms. |
| 268 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); | 269 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
| 269 | 270 |
| 270 features.Clear(); | 271 features.Clear(); |
| 271 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); | 272 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); |
| 272 } | 273 } |
| 273 | 274 |
| 274 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { | 275 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
| 275 scoped_ptr<string16> page_text(new string16(ASCIIToUTF16("one "))); | 276 scoped_ptr<base::string16> page_text( |
| 277 new base::string16(ASCIIToUTF16("one "))); |
| 276 for (int i = 0; i < 28; ++i) { | 278 for (int i = 0; i < 28; ++i) { |
| 277 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 279 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 278 } | 280 } |
| 279 | 281 |
| 280 base::TimeTicks now = base::TimeTicks::Now(); | 282 base::TimeTicks now = base::TimeTicks::Now(); |
| 281 EXPECT_CALL(clock_, Now()) | 283 EXPECT_CALL(clock_, Now()) |
| 282 // Time check at the start of extraction. | 284 // Time check at the start of extraction. |
| 283 .WillOnce(Return(now)) | 285 .WillOnce(Return(now)) |
| 284 // Time check at the start of the first chunk of work. | 286 // Time check at the start of the first chunk of work. |
| 285 .WillOnce(Return(now)) | 287 .WillOnce(Return(now)) |
| 286 // Time check after the first 5 words. | 288 // Time check after the first 5 words. |
| 287 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) | 289 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) |
| 288 // Time check after the next 5 words. This should be greater than | 290 // Time check after the next 5 words. This should be greater than |
| 289 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. | 291 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. |
| 290 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); | 292 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
| 291 | 293 |
| 292 FeatureMap features; | 294 FeatureMap features; |
| 293 // Extract first 10 words then stop. | 295 // Extract first 10 words then stop. |
| 294 PartialExtractFeatures(page_text.get(), &features); | 296 PartialExtractFeatures(page_text.get(), &features); |
| 295 | 297 |
| 296 page_text.reset(new string16()); | 298 page_text.reset(new base::string16()); |
| 297 for (int i = 30; i < 58; ++i) { | 299 for (int i = 30; i < 58; ++i) { |
| 298 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 300 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 299 } | 301 } |
| 300 page_text->append(ASCIIToUTF16("multi word test ")); | 302 page_text->append(ASCIIToUTF16("multi word test ")); |
| 301 features.Clear(); | 303 features.Clear(); |
| 302 | 304 |
| 303 // This part doesn't exercise the extraction timing. | 305 // This part doesn't exercise the extraction timing. |
| 304 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 306 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| 305 | 307 |
| 306 // Now extract normally and make sure nothing breaks. | 308 // Now extract normally and make sure nothing breaks. |
| 307 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); | 309 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); |
| 308 | 310 |
| 309 FeatureMap expected_features; | 311 FeatureMap expected_features; |
| 310 expected_features.AddBooleanFeature(features::kPageTerm + | 312 expected_features.AddBooleanFeature(features::kPageTerm + |
| 311 std::string("multi word test")); | 313 std::string("multi word test")); |
| 312 ExpectFeatureMapsAreEqual(features, expected_features); | 314 ExpectFeatureMapsAreEqual(features, expected_features); |
| 313 } | 315 } |
| 314 | 316 |
| 315 } // namespace safe_browsing | 317 } // namespace safe_browsing |
| OLD | NEW |