Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(47)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

Issue 3214002: Add a term feature extractor for client-side phishing detection. (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: Add an extra comment/TODO about performance. Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6
7 #include <string>
8
9 #include "base/callback.h"
10 #include "base/hash_tables.h"
11 #include "base/message_loop.h"
12 #include "base/scoped_ptr.h"
13 #include "base/sha2.h"
14 #include "base/string16.h"
15 #include "base/stringprintf.h"
16 #include "base/time.h"
17 #include "base/utf_string_conversions.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h"
20 #include "testing/gmock/include/gmock/gmock.h"
21 #include "testing/gtest/include/gtest/gtest.h"
22
23 using ::testing::ContainerEq;
24 using ::testing::Return;
25
26 namespace safe_browsing {
27
28 class PhishingTermFeatureExtractorTest : public ::testing::Test {
29 protected:
30 class MockClock : public FeatureExtractorClock {
31 public:
32 MOCK_METHOD0(Now, base::TimeTicks());
33 };
34
35 virtual void SetUp() {
36 base::hash_set<std::string> terms;
37 terms.insert("one");
38 terms.insert("one one");
39 terms.insert("two");
40 terms.insert("multi word test");
41 terms.insert("capitalization");
42 terms.insert("space");
43 terms.insert("separator");
44 terms.insert("punctuation");
45 // Chinese (translation of "hello")
46 terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
47 // Chinese (translation of "goodbye")
48 terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
49
50 for (base::hash_set<std::string>::iterator it = terms.begin();
51 it != terms.end(); ++it) {
52 term_hashes_.insert(base::SHA256HashString(*it));
53 }
54
55 base::hash_set<std::string> words;
56 words.insert("one");
57 words.insert("two");
58 words.insert("multi");
59 words.insert("word");
60 words.insert("test");
61 words.insert("capitalization");
62 words.insert("space");
63 words.insert("separator");
64 words.insert("punctuation");
65 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
66 words.insert("\xe5\x86\x8d\xe8\xa7\x81");
67
68 for (base::hash_set<std::string>::iterator it = words.begin();
69 it != words.end(); ++it) {
70 word_hashes_.insert(base::SHA256HashString(*it));
71 }
72
73 clock_ = new MockClock();
74 extractor_.reset(new PhishingTermFeatureExtractor(
75 &term_hashes_,
76 &word_hashes_,
77 3 /* max_words_per_term */,
78 clock_));
79 }
80
81 // Runs the TermFeatureExtractor on |page_text|, waiting for the
82 // completion callback. Returns the success boolean from the callback.
83 bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
84 success_ = false;
85 extractor_->ExtractFeatures(
86 page_text,
87 features,
88 NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone));
89 msg_loop_.Run();
90 return success_;
91 }
92
93 // Completion callback for feature extraction.
94 void ExtractionDone(bool success) {
95 success_ = success;
96 msg_loop_.Quit();
97 }
98
99 MessageLoop msg_loop_;
100 scoped_ptr<PhishingTermFeatureExtractor> extractor_;
101 base::hash_set<std::string> term_hashes_;
102 base::hash_set<std::string> word_hashes_;
103 MockClock* clock_; // owned by extractor_
104 bool success_; // holds the success value from ExtractFeatures
105 };
106
107 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
108 // This test doesn't exercise the extraction timing.
109 EXPECT_CALL(*clock_, Now())
110 .WillRepeatedly(Return(base::TimeTicks::Now()));
111
112 string16 page_text = ASCIIToUTF16("blah");
113 FeatureMap expected_features; // initially empty
114
115 FeatureMap features;
116 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
117 EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
118
119 page_text = ASCIIToUTF16("one one");
120 expected_features.Clear();
121 expected_features.AddBooleanFeature(features::kPageTerm +
122 std::string("one"));
123 expected_features.AddBooleanFeature(features::kPageTerm +
124 std::string("one one"));
125
126 features.Clear();
127 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
128 EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
129
130 page_text = ASCIIToUTF16("bla bla multi word test bla");
131 expected_features.Clear();
132 expected_features.AddBooleanFeature(features::kPageTerm +
133 std::string("multi word test"));
134
135 features.Clear();
136 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
137 EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
138
139 // This text has all of the words for one of the terms, but they are
140 // not in the correct order.
141 page_text = ASCIIToUTF16("bla bla test word multi bla");
142 expected_features.Clear();
143
144 features.Clear();
145 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
146 EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
147
148 page_text = ASCIIToUTF16("Capitalization plus non-space\n"
149 "separator... punctuation!");
150 expected_features.Clear();
151 expected_features.AddBooleanFeature(features::kPageTerm +
152 std::string("capitalization"));
153 expected_features.AddBooleanFeature(features::kPageTerm +
154 std::string("space"));
155 expected_features.AddBooleanFeature(features::kPageTerm +
156 std::string("separator"));
157 expected_features.AddBooleanFeature(features::kPageTerm +
158 std::string("punctuation"));
159
160 features.Clear();
161 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
162 EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
163
164 // Test with empty page text.
165 page_text = string16();
166 expected_features.Clear();
167 features.Clear();
168 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
169 EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
170
171 // Chinese translation of the phrase "hello goodbye". This tests that
172 // we can correctly separate terms in languages that don't use spaces.
173 page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
174 expected_features.Clear();
175 expected_features.AddBooleanFeature(
176 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
177 expected_features.AddBooleanFeature(
178 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
179
180 features.Clear();
181 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
182 EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
183 }
184
185 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
186 // For this test, we'll cause the feature extraction to run multiple
187 // iterations by incrementing the clock.
188
189 // This page has a total of 30 words. For the features to be computed
190 // correctly, the extractor has to process the entire string of text.
191 string16 page_text(ASCIIToUTF16("one "));
192 for (int i = 0; i < 28; ++i) {
193 page_text.append(ASCIIToUTF16(StringPrintf("%d ", i)));
194 }
195 page_text.append(ASCIIToUTF16("two"));
196
197 // Advance the clock 30 ms every 10 words processed, 10 ms between chunks.
198 // Note that this assumes kClockCheckGranularity = 10 and
199 // kMaxTimePerChunkMs = 50.
200 base::TimeTicks now = base::TimeTicks::Now();
201 EXPECT_CALL(*clock_, Now())
202 // Time check at the start of extraction.
203 .WillOnce(Return(now))
204 // Time check at the start of the first chunk of work.
205 .WillOnce(Return(now))
206 // Time check after the first 10 words.
207 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))
208 // Time check after the next 10 words. This is over the chunk
209 // time limit, so a continuation task will be posted.
210 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60)))
211 // Time check at the start of the second chunk of work.
212 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70)))
213 // Time check after the next 10 words.
214 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100)))
215 // A final check for the histograms.
216 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(101)));
217
218 FeatureMap expected_features;
219 expected_features.AddBooleanFeature(features::kPageTerm +
220 std::string("one"));
221 expected_features.AddBooleanFeature(features::kPageTerm +
222 std::string("two"));
223
224 FeatureMap features;
225 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
226 EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
227 // Make sure none of the mock expectations carry over to the next test.
228 ::testing::Mock::VerifyAndClearExpectations(clock_);
229
230 // Now repeat the test with the same text, but advance the clock faster so
231 // that the extraction time exceeds the maximum total time for the feature
232 // extractor. Extraction should fail. Note that this assumes
233 // kMaxTotalTimeMs = 500.
234 EXPECT_CALL(*clock_, Now())
235 // Time check at the start of extraction.
236 .WillOnce(Return(now))
237 // Time check at the start of the first chunk of work.
238 .WillOnce(Return(now))
239 // Time check after the first 10 words,
240 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
241 // Time check at the start of the second chunk of work.
242 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
243 // Time check after the next 10 words. This is over the limit.
244 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
245 // A final time check for the histograms.
246 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
247
248 features.Clear();
249 EXPECT_FALSE(ExtractFeatures(&page_text, &features));
250 }
251
252 } // namespace safe_browsing
OLDNEW
« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698