Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(558)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6
7 #include <stddef.h>
8 #include <stdint.h>
9
10 #include <memory>
11 #include <string>
12
13 #include "base/bind.h"
14 #include "base/callback.h"
15 #include "base/containers/hash_tables.h"
16 #include "base/location.h"
17 #include "base/message_loop/message_loop.h"
18 #include "base/run_loop.h"
19 #include "base/single_thread_task_runner.h"
20 #include "base/strings/string16.h"
21 #include "base/strings/stringprintf.h"
22 #include "base/strings/utf_string_conversions.h"
23 #include "base/time/time.h"
24 #include "build/build_config.h"
25 #include "chrome/renderer/safe_browsing/features.h"
26 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
27 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
28 #include "chrome/renderer/safe_browsing/test_utils.h"
29 #include "crypto/sha2.h"
30 #include "testing/gmock/include/gmock/gmock.h"
31 #include "testing/gtest/include/gtest/gtest.h"
32
33 using base::ASCIIToUTF16;
34 using ::testing::Return;
35
// Seed for MurmurHash3. The tests hash their word list and expected shingles
// with this value and hand the same value to the extractor under test.
static constexpr uint32_t kMurmurHash3Seed = 2777808611U;
37
38 namespace safe_browsing {
39
40 class PhishingTermFeatureExtractorTest : public ::testing::Test {
41 protected:
42 void SetUp() override {
43 base::hash_set<std::string> terms;
44 terms.insert("one");
45 terms.insert("one one");
46 terms.insert("two");
47 terms.insert("multi word test");
48 terms.insert("capitalization");
49 terms.insert("space");
50 terms.insert("separator");
51 terms.insert("punctuation");
52 // Chinese (translation of "hello")
53 terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
54 // Chinese (translation of "goodbye")
55 terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
56
57 for (base::hash_set<std::string>::iterator it = terms.begin();
58 it != terms.end(); ++it) {
59 term_hashes_.insert(crypto::SHA256HashString(*it));
60 }
61
62 base::hash_set<std::string> words;
63 words.insert("one");
64 words.insert("two");
65 words.insert("multi");
66 words.insert("word");
67 words.insert("test");
68 words.insert("capitalization");
69 words.insert("space");
70 words.insert("separator");
71 words.insert("punctuation");
72 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
73 words.insert("\xe5\x86\x8d\xe8\xa7\x81");
74
75 for (base::hash_set<std::string>::iterator it = words.begin();
76 it != words.end(); ++it) {
77 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
78 }
79
80 ResetExtractor(3 /* max shingles per page */);
81 }
82
83 void ResetExtractor(size_t max_shingles_per_page) {
84 extractor_.reset(new PhishingTermFeatureExtractor(
85 &term_hashes_,
86 &word_hashes_,
87 3 /* max_words_per_term */,
88 kMurmurHash3Seed,
89 max_shingles_per_page,
90 4 /* shingle_size */,
91 &clock_));
92 }
93
94 // Runs the TermFeatureExtractor on |page_text|, waiting for the
95 // completion callback. Returns the success boolean from the callback.
96 bool ExtractFeatures(const base::string16* page_text,
97 FeatureMap* features,
98 std::set<uint32_t>* shingle_hashes) {
99 success_ = false;
100 extractor_->ExtractFeatures(
101 page_text,
102 features,
103 shingle_hashes,
104 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
105 base::Unretained(this)));
106 base::RunLoop().Run();
107 return success_;
108 }
109
110 void PartialExtractFeatures(const base::string16* page_text,
111 FeatureMap* features,
112 std::set<uint32_t>* shingle_hashes) {
113 extractor_->ExtractFeatures(
114 page_text,
115 features,
116 shingle_hashes,
117 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
118 base::Unretained(this)));
119 msg_loop_.task_runner()->PostTask(
120 FROM_HERE, base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
121 base::Unretained(this)));
122 base::RunLoop().RunUntilIdle();
123 }
124
125 // Completion callback for feature extraction.
126 void ExtractionDone(bool success) {
127 success_ = success;
128 msg_loop_.QuitWhenIdle();
129 }
130
131 void QuitExtraction() {
132 extractor_->CancelPendingExtraction();
133 msg_loop_.QuitWhenIdle();
134 }
135
136 base::MessageLoop msg_loop_;
137 MockFeatureExtractorClock clock_;
138 std::unique_ptr<PhishingTermFeatureExtractor> extractor_;
139 base::hash_set<std::string> term_hashes_;
140 base::hash_set<uint32_t> word_hashes_;
141 bool success_; // holds the success value from ExtractFeatures
142 };
143
144 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
145 // This test doesn't exercise the extraction timing.
146 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
147
148 base::string16 page_text = ASCIIToUTF16("blah");
149 FeatureMap expected_features; // initially empty
150 std::set<uint32_t> expected_shingle_hashes;
151
152 FeatureMap features;
153 std::set<uint32_t> shingle_hashes;
154 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
155 ExpectFeatureMapsAreEqual(features, expected_features);
156 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
157
158 page_text = ASCIIToUTF16("one one");
159 expected_features.Clear();
160 expected_features.AddBooleanFeature(features::kPageTerm +
161 std::string("one"));
162 expected_features.AddBooleanFeature(features::kPageTerm +
163 std::string("one one"));
164 expected_shingle_hashes.clear();
165
166 features.Clear();
167 shingle_hashes.clear();
168 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
169 ExpectFeatureMapsAreEqual(features, expected_features);
170 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
171
172 page_text = ASCIIToUTF16("bla bla multi word test bla");
173 expected_features.Clear();
174 expected_features.AddBooleanFeature(features::kPageTerm +
175 std::string("multi word test"));
176 expected_shingle_hashes.clear();
177 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
178 kMurmurHash3Seed));
179 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
180 kMurmurHash3Seed));
181 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
182 kMurmurHash3Seed));
183
184 features.Clear();
185 shingle_hashes.clear();
186 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
187 ExpectFeatureMapsAreEqual(features, expected_features);
188 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
189
190 // This text has all of the words for one of the terms, but they are
191 // not in the correct order.
192 page_text = ASCIIToUTF16("bla bla test word multi bla");
193 expected_features.Clear();
194 expected_shingle_hashes.clear();
195 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
196 kMurmurHash3Seed));
197 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
198 kMurmurHash3Seed));
199 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
200 kMurmurHash3Seed));
201
202 features.Clear();
203 shingle_hashes.clear();
204 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
205 ExpectFeatureMapsAreEqual(features, expected_features);
206 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
207
208 // Test various separators.
209 page_text = ASCIIToUTF16("Capitalization plus non-space\n"
210 "separator... punctuation!");
211 expected_features.Clear();
212 expected_features.AddBooleanFeature(features::kPageTerm +
213 std::string("capitalization"));
214 expected_features.AddBooleanFeature(features::kPageTerm +
215 std::string("space"));
216 expected_features.AddBooleanFeature(features::kPageTerm +
217 std::string("separator"));
218 expected_features.AddBooleanFeature(features::kPageTerm +
219 std::string("punctuation"));
220 expected_shingle_hashes.clear();
221 expected_shingle_hashes.insert(
222 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
223 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
224 kMurmurHash3Seed));
225 expected_shingle_hashes.insert(
226 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
227
228 features.Clear();
229 shingle_hashes.clear();
230 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
231 ExpectFeatureMapsAreEqual(features, expected_features);
232 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
233
234 // Test a page with too many words and we should only 3 minimum hashes.
235 page_text = ASCIIToUTF16("This page has way too many words.");
236 expected_features.Clear();
237 expected_shingle_hashes.clear();
238 expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
239 kMurmurHash3Seed));
240 expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
241 kMurmurHash3Seed));
242 expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
243 kMurmurHash3Seed));
244 expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
245 kMurmurHash3Seed));
246 std::set<uint32_t>::iterator it = expected_shingle_hashes.end();
247 expected_shingle_hashes.erase(--it);
248
249 features.Clear();
250 shingle_hashes.clear();
251 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
252 ExpectFeatureMapsAreEqual(features, expected_features);
253 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
254
255 // Test with empty page text.
256 page_text = base::string16();
257 expected_features.Clear();
258 expected_shingle_hashes.clear();
259 features.Clear();
260 shingle_hashes.clear();
261 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
262 ExpectFeatureMapsAreEqual(features, expected_features);
263 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
264
265 #if !defined(OS_ANDROID)
266 // The test code is disabled due to http://crbug.com/392234
267 // The client-side detection feature is not enabled on Android yet.
268 // If we decided to enable the feature, we need to fix the bug first.
269
270 // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
271 // that we can correctly separate terms in languages that don't use spaces.
272 page_text =
273 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
274 "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
275 expected_features.Clear();
276 expected_features.AddBooleanFeature(
277 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
278 expected_features.AddBooleanFeature(
279 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
280 expected_shingle_hashes.clear();
281 expected_shingle_hashes.insert(MurmurHash3String(
282 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
283 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
284
285 features.Clear();
286 shingle_hashes.clear();
287 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
288 ExpectFeatureMapsAreEqual(features, expected_features);
289 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
290 #endif
291 }
292
293 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
294 // For this test, we'll cause the feature extraction to run multiple
295 // iterations by incrementing the clock.
296 ResetExtractor(200 /* max shingles per page */);
297
298 // This page has a total of 30 words. For the features to be computed
299 // correctly, the extractor has to process the entire string of text.
300 base::string16 page_text(ASCIIToUTF16("one "));
301 for (int i = 0; i < 28; ++i) {
302 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
303 }
304 page_text.append(ASCIIToUTF16("two"));
305
306 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
307 // Note that this assumes kClockCheckGranularity = 5 and
308 // kMaxTimePerChunkMs = 10.
309 base::TimeTicks now = base::TimeTicks::Now();
310 EXPECT_CALL(clock_, Now())
311 // Time check at the start of extraction.
312 .WillOnce(Return(now))
313 // Time check at the start of the first chunk of work.
314 .WillOnce(Return(now))
315 // Time check after the first 5 words.
316 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
317 // Time check after the next 5 words.
318 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
319 // Time check after the next 5 words.
320 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
321 // Time check after the next 5 words. This is over the chunk
322 // time limit, so a continuation task will be posted.
323 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
324 // Time check at the start of the second chunk of work.
325 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
326 // Time check after the next 5 words.
327 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
328 // Time check after the next 5 words.
329 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
330 // A final check for the histograms.
331 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
332
333 FeatureMap expected_features;
334 expected_features.AddBooleanFeature(features::kPageTerm +
335 std::string("one"));
336 expected_features.AddBooleanFeature(features::kPageTerm +
337 std::string("two"));
338 std::set<uint32_t> expected_shingle_hashes;
339 expected_shingle_hashes.insert(
340 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
341 expected_shingle_hashes.insert(
342 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
343 expected_shingle_hashes.insert(
344 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
345 expected_shingle_hashes.insert(
346 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
347 expected_shingle_hashes.insert(
348 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
349 expected_shingle_hashes.insert(
350 MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
351 expected_shingle_hashes.insert(
352 MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
353 expected_shingle_hashes.insert(
354 MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
355 expected_shingle_hashes.insert(
356 MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
357 expected_shingle_hashes.insert(
358 MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
359 expected_shingle_hashes.insert(
360 MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
361 expected_shingle_hashes.insert(
362 MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
363 expected_shingle_hashes.insert(
364 MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
365 expected_shingle_hashes.insert(
366 MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
367 expected_shingle_hashes.insert(
368 MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
369 expected_shingle_hashes.insert(
370 MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
371 expected_shingle_hashes.insert(
372 MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
373 expected_shingle_hashes.insert(
374 MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
375 expected_shingle_hashes.insert(
376 MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
377 expected_shingle_hashes.insert(
378 MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
379 expected_shingle_hashes.insert(
380 MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
381 expected_shingle_hashes.insert(
382 MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
383 expected_shingle_hashes.insert(
384 MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
385 expected_shingle_hashes.insert(
386 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
387 expected_shingle_hashes.insert(
388 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
389 expected_shingle_hashes.insert(
390 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
391 expected_shingle_hashes.insert(
392 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
393
394 FeatureMap features;
395 std::set<uint32_t> shingle_hashes;
396 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
397 ExpectFeatureMapsAreEqual(features, expected_features);
398 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
399 // Make sure none of the mock expectations carry over to the next test.
400 ::testing::Mock::VerifyAndClearExpectations(&clock_);
401
402 // Now repeat the test with the same text, but advance the clock faster so
403 // that the extraction time exceeds the maximum total time for the feature
404 // extractor. Extraction should fail. Note that this assumes
405 // kMaxTotalTimeMs = 500.
406 EXPECT_CALL(clock_, Now())
407 // Time check at the start of extraction.
408 .WillOnce(Return(now))
409 // Time check at the start of the first chunk of work.
410 .WillOnce(Return(now))
411 // Time check after the first 5 words,
412 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
413 // Time check at the start of the second chunk of work.
414 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
415 // Time check after the next 5 words. This is over the limit.
416 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
417 // A final time check for the histograms.
418 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
419
420 features.Clear();
421 shingle_hashes.clear();
422 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
423 }
424
425 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
426 std::unique_ptr<base::string16> page_text(
427 new base::string16(ASCIIToUTF16("one ")));
428 for (int i = 0; i < 28; ++i) {
429 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
430 }
431
432 base::TimeTicks now = base::TimeTicks::Now();
433 EXPECT_CALL(clock_, Now())
434 // Time check at the start of extraction.
435 .WillOnce(Return(now))
436 // Time check at the start of the first chunk of work.
437 .WillOnce(Return(now))
438 // Time check after the first 5 words.
439 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
440 // Time check after the next 5 words. This should be greater than
441 // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
442 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
443
444 FeatureMap features;
445 std::set<uint32_t> shingle_hashes;
446 // Extract first 10 words then stop.
447 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
448
449 page_text.reset(new base::string16());
450 for (int i = 30; i < 58; ++i) {
451 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
452 }
453 page_text->append(ASCIIToUTF16("multi word test "));
454 features.Clear();
455 shingle_hashes.clear();
456
457 // This part doesn't exercise the extraction timing.
458 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
459
460 // Now extract normally and make sure nothing breaks.
461 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
462
463 FeatureMap expected_features;
464 expected_features.AddBooleanFeature(features::kPageTerm +
465 std::string("multi word test"));
466 ExpectFeatureMapsAreEqual(features, expected_features);
467 }
468
469 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698