OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <string> | 7 #include <string> |
8 | 8 |
9 #include "base/bind.h" | 9 #include "base/bind.h" |
10 #include "base/callback.h" | 10 #include "base/callback.h" |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
71 extractor_.reset(new PhishingTermFeatureExtractor( | 71 extractor_.reset(new PhishingTermFeatureExtractor( |
72 &term_hashes_, | 72 &term_hashes_, |
73 &word_hashes_, | 73 &word_hashes_, |
74 3 /* max_words_per_term */, | 74 3 /* max_words_per_term */, |
75 kMurmurHash3Seed, | 75 kMurmurHash3Seed, |
76 &clock_)); | 76 &clock_)); |
77 } | 77 } |
78 | 78 |
79 // Runs the TermFeatureExtractor on |page_text|, waiting for the | 79 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
80 // completion callback. Returns the success boolean from the callback. | 80 // completion callback. Returns the success boolean from the callback. |
81 bool ExtractFeatures(const string16* page_text, FeatureMap* features) { | 81 bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { |
82 success_ = false; | 82 success_ = false; |
83 extractor_->ExtractFeatures( | 83 extractor_->ExtractFeatures( |
84 page_text, | 84 page_text, |
85 features, | 85 features, |
86 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 86 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
87 base::Unretained(this))); | 87 base::Unretained(this))); |
88 msg_loop_.Run(); | 88 msg_loop_.Run(); |
89 return success_; | 89 return success_; |
90 } | 90 } |
91 | 91 |
92 void PartialExtractFeatures(const string16* page_text, FeatureMap* features) { | 92 void PartialExtractFeatures(const base::string16* page_text, |
| 93 FeatureMap* features) { |
93 extractor_->ExtractFeatures( | 94 extractor_->ExtractFeatures( |
94 page_text, | 95 page_text, |
95 features, | 96 features, |
96 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 97 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
97 base::Unretained(this))); | 98 base::Unretained(this))); |
98 msg_loop_.PostTask( | 99 msg_loop_.PostTask( |
99 FROM_HERE, | 100 FROM_HERE, |
100 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, | 101 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, |
101 base::Unretained(this))); | 102 base::Unretained(this))); |
102 msg_loop_.RunUntilIdle(); | 103 msg_loop_.RunUntilIdle(); |
(...skipping 15 matching lines...) Expand all Loading... |
118 scoped_ptr<PhishingTermFeatureExtractor> extractor_; | 119 scoped_ptr<PhishingTermFeatureExtractor> extractor_; |
119 base::hash_set<std::string> term_hashes_; | 120 base::hash_set<std::string> term_hashes_; |
120 base::hash_set<uint32> word_hashes_; | 121 base::hash_set<uint32> word_hashes_; |
121 bool success_; // holds the success value from ExtractFeatures | 122 bool success_; // holds the success value from ExtractFeatures |
122 }; | 123 }; |
123 | 124 |
124 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { | 125 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
125 // This test doesn't exercise the extraction timing. | 126 // This test doesn't exercise the extraction timing. |
126 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 127 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
127 | 128 |
128 string16 page_text = ASCIIToUTF16("blah"); | 129 base::string16 page_text = ASCIIToUTF16("blah"); |
129 FeatureMap expected_features; // initially empty | 130 FeatureMap expected_features; // initially empty |
130 | 131 |
131 FeatureMap features; | 132 FeatureMap features; |
132 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 133 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
133 ExpectFeatureMapsAreEqual(features, expected_features); | 134 ExpectFeatureMapsAreEqual(features, expected_features); |
134 | 135 |
135 page_text = ASCIIToUTF16("one one"); | 136 page_text = ASCIIToUTF16("one one"); |
136 expected_features.Clear(); | 137 expected_features.Clear(); |
137 expected_features.AddBooleanFeature(features::kPageTerm + | 138 expected_features.AddBooleanFeature(features::kPageTerm + |
138 std::string("one")); | 139 std::string("one")); |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
171 expected_features.AddBooleanFeature(features::kPageTerm + | 172 expected_features.AddBooleanFeature(features::kPageTerm + |
172 std::string("separator")); | 173 std::string("separator")); |
173 expected_features.AddBooleanFeature(features::kPageTerm + | 174 expected_features.AddBooleanFeature(features::kPageTerm + |
174 std::string("punctuation")); | 175 std::string("punctuation")); |
175 | 176 |
176 features.Clear(); | 177 features.Clear(); |
177 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 178 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
178 ExpectFeatureMapsAreEqual(features, expected_features); | 179 ExpectFeatureMapsAreEqual(features, expected_features); |
179 | 180 |
180 // Test with empty page text. | 181 // Test with empty page text. |
181 page_text = string16(); | 182 page_text = base::string16(); |
182 expected_features.Clear(); | 183 expected_features.Clear(); |
183 features.Clear(); | 184 features.Clear(); |
184 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 185 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
185 ExpectFeatureMapsAreEqual(features, expected_features); | 186 ExpectFeatureMapsAreEqual(features, expected_features); |
186 | 187 |
187 // Chinese translation of the phrase "hello goodbye". This tests that | 188 // Chinese translation of the phrase "hello goodbye". This tests that |
188 // we can correctly separate terms in languages that don't use spaces. | 189 // we can correctly separate terms in languages that don't use spaces. |
189 page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); | 190 page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
190 expected_features.Clear(); | 191 expected_features.Clear(); |
191 expected_features.AddBooleanFeature( | 192 expected_features.AddBooleanFeature( |
192 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); | 193 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
193 expected_features.AddBooleanFeature( | 194 expected_features.AddBooleanFeature( |
194 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); | 195 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
195 | 196 |
196 features.Clear(); | 197 features.Clear(); |
197 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 198 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
198 ExpectFeatureMapsAreEqual(features, expected_features); | 199 ExpectFeatureMapsAreEqual(features, expected_features); |
199 } | 200 } |
200 | 201 |
201 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { | 202 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
202 // For this test, we'll cause the feature extraction to run multiple | 203 // For this test, we'll cause the feature extraction to run multiple |
203 // iterations by incrementing the clock. | 204 // iterations by incrementing the clock. |
204 | 205 |
205 // This page has a total of 30 words. For the features to be computed | 206 // This page has a total of 30 words. For the features to be computed |
206 // correctly, the extractor has to process the entire string of text. | 207 // correctly, the extractor has to process the entire string of text. |
207 string16 page_text(ASCIIToUTF16("one ")); | 208 base::string16 page_text(ASCIIToUTF16("one ")); |
208 for (int i = 0; i < 28; ++i) { | 209 for (int i = 0; i < 28; ++i) { |
209 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 210 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
210 } | 211 } |
211 page_text.append(ASCIIToUTF16("two")); | 212 page_text.append(ASCIIToUTF16("two")); |
212 | 213 |
213 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. | 214 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. |
214 // Note that this assumes kClockCheckGranularity = 5 and | 215 // Note that this assumes kClockCheckGranularity = 5 and |
215 // kMaxTimePerChunkMs = 10. | 216 // kMaxTimePerChunkMs = 10. |
216 base::TimeTicks now = base::TimeTicks::Now(); | 217 base::TimeTicks now = base::TimeTicks::Now(); |
217 EXPECT_CALL(clock_, Now()) | 218 EXPECT_CALL(clock_, Now()) |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
265 // Time check after the next 5 words. This is over the limit. | 266 // Time check after the next 5 words. This is over the limit. |
266 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) | 267 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) |
267 // A final time check for the histograms. | 268 // A final time check for the histograms. |
268 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); | 269 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
269 | 270 |
270 features.Clear(); | 271 features.Clear(); |
271 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); | 272 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); |
272 } | 273 } |
273 | 274 |
274 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { | 275 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
275 scoped_ptr<string16> page_text(new string16(ASCIIToUTF16("one "))); | 276 scoped_ptr<base::string16> page_text( |
| 277 new base::string16(ASCIIToUTF16("one "))); |
276 for (int i = 0; i < 28; ++i) { | 278 for (int i = 0; i < 28; ++i) { |
277 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 279 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
278 } | 280 } |
279 | 281 |
280 base::TimeTicks now = base::TimeTicks::Now(); | 282 base::TimeTicks now = base::TimeTicks::Now(); |
281 EXPECT_CALL(clock_, Now()) | 283 EXPECT_CALL(clock_, Now()) |
282 // Time check at the start of extraction. | 284 // Time check at the start of extraction. |
283 .WillOnce(Return(now)) | 285 .WillOnce(Return(now)) |
284 // Time check at the start of the first chunk of work. | 286 // Time check at the start of the first chunk of work. |
285 .WillOnce(Return(now)) | 287 .WillOnce(Return(now)) |
286 // Time check after the first 5 words. | 288 // Time check after the first 5 words. |
287 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) | 289 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) |
288 // Time check after the next 5 words. This should be greater than | 290 // Time check after the next 5 words. This should be greater than |
289 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. | 291 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. |
290 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); | 292 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
291 | 293 |
292 FeatureMap features; | 294 FeatureMap features; |
293 // Extract first 10 words then stop. | 295 // Extract first 10 words then stop. |
294 PartialExtractFeatures(page_text.get(), &features); | 296 PartialExtractFeatures(page_text.get(), &features); |
295 | 297 |
296 page_text.reset(new string16()); | 298 page_text.reset(new base::string16()); |
297 for (int i = 30; i < 58; ++i) { | 299 for (int i = 30; i < 58; ++i) { |
298 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 300 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
299 } | 301 } |
300 page_text->append(ASCIIToUTF16("multi word test ")); | 302 page_text->append(ASCIIToUTF16("multi word test ")); |
301 features.Clear(); | 303 features.Clear(); |
302 | 304 |
303 // This part doesn't exercise the extraction timing. | 305 // This part doesn't exercise the extraction timing. |
304 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 306 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
305 | 307 |
306 // Now extract normally and make sure nothing breaks. | 308 // Now extract normally and make sure nothing breaks. |
307 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); | 309 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); |
308 | 310 |
309 FeatureMap expected_features; | 311 FeatureMap expected_features; |
310 expected_features.AddBooleanFeature(features::kPageTerm + | 312 expected_features.AddBooleanFeature(features::kPageTerm + |
311 std::string("multi word test")); | 313 std::string("multi word test")); |
312 ExpectFeatureMapsAreEqual(features, expected_features); | 314 ExpectFeatureMapsAreEqual(features, expected_features); |
313 } | 315 } |
314 | 316 |
315 } // namespace safe_browsing | 317 } // namespace safe_browsing |
OLD | NEW |