OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <string> | 7 #include <string> |
8 | 8 |
9 #include "base/bind.h" | 9 #include "base/bind.h" |
10 #include "base/callback.h" | 10 #include "base/callback.h" |
11 #include "base/containers/hash_tables.h" | 11 #include "base/containers/hash_tables.h" |
12 #include "base/memory/scoped_ptr.h" | 12 #include "base/memory/scoped_ptr.h" |
13 #include "base/message_loop/message_loop.h" | 13 #include "base/message_loop/message_loop.h" |
14 #include "base/strings/string16.h" | 14 #include "base/strings/string16.h" |
15 #include "base/strings/stringprintf.h" | 15 #include "base/strings/stringprintf.h" |
16 #include "base/strings/utf_string_conversions.h" | 16 #include "base/strings/utf_string_conversions.h" |
17 #include "base/time/time.h" | 17 #include "base/time/time.h" |
18 #include "chrome/renderer/safe_browsing/features.h" | 18 #include "chrome/renderer/safe_browsing/features.h" |
19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" | 19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" |
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
21 #include "chrome/renderer/safe_browsing/test_utils.h" | 21 #include "chrome/renderer/safe_browsing/test_utils.h" |
22 #include "crypto/sha2.h" | 22 #include "crypto/sha2.h" |
23 #include "testing/gmock/include/gmock/gmock.h" | 23 #include "testing/gmock/include/gmock/gmock.h" |
24 #include "testing/gtest/include/gtest/gtest.h" | 24 #include "testing/gtest/include/gtest/gtest.h" |
25 | 25 |
26 using base::ASCIIToUTF16; | 26 using base::ASCIIToUTF16; |
27 using ::testing::Return; | 27 using ::testing::Return; |
28 | 28 |
29 | |
30 static const uint32 kMurmurHash3Seed = 2777808611U; | |
31 | |
29 namespace safe_browsing { | 32 namespace safe_browsing { |
30 | 33 |
31 class PhishingTermFeatureExtractorTest : public ::testing::Test { | 34 class PhishingTermFeatureExtractorTest : public ::testing::Test { |
32 protected: | 35 protected: |
33 virtual void SetUp() { | 36 virtual void SetUp() { |
34 base::hash_set<std::string> terms; | 37 base::hash_set<std::string> terms; |
35 terms.insert("one"); | 38 terms.insert("one"); |
36 terms.insert("one one"); | 39 terms.insert("one one"); |
37 terms.insert("two"); | 40 terms.insert("two"); |
38 terms.insert("multi word test"); | 41 terms.insert("multi word test"); |
(...skipping 17 matching lines...) Expand all Loading... | |
56 words.insert("multi"); | 59 words.insert("multi"); |
57 words.insert("word"); | 60 words.insert("word"); |
58 words.insert("test"); | 61 words.insert("test"); |
59 words.insert("capitalization"); | 62 words.insert("capitalization"); |
60 words.insert("space"); | 63 words.insert("space"); |
61 words.insert("separator"); | 64 words.insert("separator"); |
62 words.insert("punctuation"); | 65 words.insert("punctuation"); |
63 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); | 66 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); |
64 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); | 67 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); |
65 | 68 |
66 static const uint32 kMurmurHash3Seed = 2777808611U; | |
67 for (base::hash_set<std::string>::iterator it = words.begin(); | 69 for (base::hash_set<std::string>::iterator it = words.begin(); |
68 it != words.end(); ++it) { | 70 it != words.end(); ++it) { |
69 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); | 71 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); |
70 } | 72 } |
71 | 73 |
72 extractor_.reset(new PhishingTermFeatureExtractor( | 74 extractor_.reset(new PhishingTermFeatureExtractor( |
73 &term_hashes_, | 75 &term_hashes_, |
74 &word_hashes_, | 76 &word_hashes_, |
77 3 /* max_hashes_per_page */, | |
75 3 /* max_words_per_term */, | 78 3 /* max_words_per_term */, |
76 kMurmurHash3Seed, | 79 kMurmurHash3Seed, |
80 4 /* shingle_size */, | |
77 &clock_)); | 81 &clock_)); |
78 } | 82 } |
79 | 83 |
80 // Runs the TermFeatureExtractor on |page_text|, waiting for the | 84 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
81 // completion callback. Returns the success boolean from the callback. | 85 // completion callback. Returns the success boolean from the callback. |
82 bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { | 86 bool ExtractFeatures(const base::string16* page_text, |
87 FeatureMap* features, | |
88 std::set<uint32>* shingle_hashes) { | |
83 success_ = false; | 89 success_ = false; |
84 extractor_->ExtractFeatures( | 90 extractor_->ExtractFeatures( |
85 page_text, | 91 page_text, |
86 features, | 92 features, |
93 shingle_hashes, | |
87 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 94 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
88 base::Unretained(this))); | 95 base::Unretained(this))); |
89 msg_loop_.Run(); | 96 msg_loop_.Run(); |
90 return success_; | 97 return success_; |
91 } | 98 } |
92 | 99 |
93 void PartialExtractFeatures(const base::string16* page_text, | 100 void PartialExtractFeatures(const base::string16* page_text, |
94 FeatureMap* features) { | 101 FeatureMap* features, |
102 std::set<uint32>* shingle_hashes) { | |
95 extractor_->ExtractFeatures( | 103 extractor_->ExtractFeatures( |
96 page_text, | 104 page_text, |
97 features, | 105 features, |
106 shingle_hashes, | |
98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 107 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
99 base::Unretained(this))); | 108 base::Unretained(this))); |
100 msg_loop_.PostTask( | 109 msg_loop_.PostTask( |
101 FROM_HERE, | 110 FROM_HERE, |
102 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, | 111 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, |
103 base::Unretained(this))); | 112 base::Unretained(this))); |
104 msg_loop_.RunUntilIdle(); | 113 msg_loop_.RunUntilIdle(); |
105 } | 114 } |
106 | 115 |
107 // Completion callback for feature extraction. | 116 // Completion callback for feature extraction. |
(...skipping 14 matching lines...) Expand all Loading... | |
122 base::hash_set<uint32> word_hashes_; | 131 base::hash_set<uint32> word_hashes_; |
123 bool success_; // holds the success value from ExtractFeatures | 132 bool success_; // holds the success value from ExtractFeatures |
124 }; | 133 }; |
125 | 134 |
126 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { | 135 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
127 // This test doesn't exercise the extraction timing. | 136 // This test doesn't exercise the extraction timing. |
128 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 137 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
129 | 138 |
130 base::string16 page_text = ASCIIToUTF16("blah"); | 139 base::string16 page_text = ASCIIToUTF16("blah"); |
131 FeatureMap expected_features; // initially empty | 140 FeatureMap expected_features; // initially empty |
141 std::set<uint32> expected_shingle_hashes; | |
132 | 142 |
133 FeatureMap features; | 143 FeatureMap features; |
134 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 144 std::set<uint32> shingle_hashes; |
145 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
135 ExpectFeatureMapsAreEqual(features, expected_features); | 146 ExpectFeatureMapsAreEqual(features, expected_features); |
147 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
136 | 148 |
137 page_text = ASCIIToUTF16("one one"); | 149 page_text = ASCIIToUTF16("one one"); |
138 expected_features.Clear(); | 150 expected_features.Clear(); |
139 expected_features.AddBooleanFeature(features::kPageTerm + | 151 expected_features.AddBooleanFeature(features::kPageTerm + |
140 std::string("one")); | 152 std::string("one")); |
141 expected_features.AddBooleanFeature(features::kPageTerm + | 153 expected_features.AddBooleanFeature(features::kPageTerm + |
142 std::string("one one")); | 154 std::string("one one")); |
155 expected_shingle_hashes.clear(); | |
143 | 156 |
144 features.Clear(); | 157 features.Clear(); |
145 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 158 shingle_hashes.clear(); |
159 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
146 ExpectFeatureMapsAreEqual(features, expected_features); | 160 ExpectFeatureMapsAreEqual(features, expected_features); |
161 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
147 | 162 |
148 page_text = ASCIIToUTF16("bla bla multi word test bla"); | 163 page_text = ASCIIToUTF16("bla bla multi word test bla"); |
149 expected_features.Clear(); | 164 expected_features.Clear(); |
150 expected_features.AddBooleanFeature(features::kPageTerm + | 165 expected_features.AddBooleanFeature(features::kPageTerm + |
151 std::string("multi word test")); | 166 std::string("multi word test")); |
167 expected_shingle_hashes.clear(); | |
168 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ", | |
169 kMurmurHash3Seed)); | |
170 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ", | |
171 kMurmurHash3Seed)); | |
172 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ", | |
173 kMurmurHash3Seed)); | |
152 | 174 |
153 features.Clear(); | 175 features.Clear(); |
154 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 176 shingle_hashes.clear(); |
177 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
155 ExpectFeatureMapsAreEqual(features, expected_features); | 178 ExpectFeatureMapsAreEqual(features, expected_features); |
179 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
156 | 180 |
157 // This text has all of the words for one of the terms, but they are | 181 // This text has all of the words for one of the terms, but they are |
158 // not in the correct order. | 182 // not in the correct order. |
159 page_text = ASCIIToUTF16("bla bla test word multi bla"); | 183 page_text = ASCIIToUTF16("bla bla test word multi bla"); |
160 expected_features.Clear(); | 184 expected_features.Clear(); |
185 expected_shingle_hashes.clear(); | |
186 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ", | |
187 kMurmurHash3Seed)); | |
188 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ", | |
189 kMurmurHash3Seed)); | |
190 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ", | |
191 kMurmurHash3Seed)); | |
161 | 192 |
162 features.Clear(); | 193 features.Clear(); |
163 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 194 shingle_hashes.clear(); |
195 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
164 ExpectFeatureMapsAreEqual(features, expected_features); | 196 ExpectFeatureMapsAreEqual(features, expected_features); |
197 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
165 | 198 |
199 // Test various separators. | |
166 page_text = ASCIIToUTF16("Capitalization plus non-space\n" | 200 page_text = ASCIIToUTF16("Capitalization plus non-space\n" |
167 "separator... punctuation!"); | 201 "separator... punctuation!"); |
168 expected_features.Clear(); | 202 expected_features.Clear(); |
169 expected_features.AddBooleanFeature(features::kPageTerm + | 203 expected_features.AddBooleanFeature(features::kPageTerm + |
170 std::string("capitalization")); | 204 std::string("capitalization")); |
171 expected_features.AddBooleanFeature(features::kPageTerm + | 205 expected_features.AddBooleanFeature(features::kPageTerm + |
172 std::string("space")); | 206 std::string("space")); |
173 expected_features.AddBooleanFeature(features::kPageTerm + | 207 expected_features.AddBooleanFeature(features::kPageTerm + |
174 std::string("separator")); | 208 std::string("separator")); |
175 expected_features.AddBooleanFeature(features::kPageTerm + | 209 expected_features.AddBooleanFeature(features::kPageTerm + |
176 std::string("punctuation")); | 210 std::string("punctuation")); |
211 expected_shingle_hashes.clear(); | |
212 expected_shingle_hashes.insert( | |
213 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed)); | |
214 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ", | |
215 kMurmurHash3Seed)); | |
216 expected_shingle_hashes.insert( | |
217 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed)); | |
177 | 218 |
178 features.Clear(); | 219 features.Clear(); |
179 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 220 shingle_hashes.clear(); |
221 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
180 ExpectFeatureMapsAreEqual(features, expected_features); | 222 ExpectFeatureMapsAreEqual(features, expected_features); |
223 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
224 | |
225 // Test a page with too many words; we should only get 3 shingle hashes (max_hashes_per_page). | |
226 page_text = ASCIIToUTF16("This page has way too many words."); | |
227 expected_features.Clear(); | |
228 expected_shingle_hashes.clear(); | |
229 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", | |
230 kMurmurHash3Seed)); | |
231 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", | |
232 kMurmurHash3Seed)); | |
233 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", | |
234 kMurmurHash3Seed)); | |
235 | |
236 features.Clear(); | |
237 shingle_hashes.clear(); | |
238 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
239 ExpectFeatureMapsAreEqual(features, expected_features); | |
240 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
181 | 241 |
182 // Test with empty page text. | 242 // Test with empty page text. |
183 page_text = base::string16(); | 243 page_text = base::string16(); |
184 expected_features.Clear(); | 244 expected_features.Clear(); |
245 expected_shingle_hashes.clear(); | |
185 features.Clear(); | 246 features.Clear(); |
186 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 247 shingle_hashes.clear(); |
248 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
187 ExpectFeatureMapsAreEqual(features, expected_features); | 249 ExpectFeatureMapsAreEqual(features, expected_features); |
250 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
188 | 251 |
189 // Chinese translation of the phrase "hello goodbye". This tests that | 252 // Chinese translation of the phrase "hello goodbye". This tests that |
190 // we can correctly separate terms in languages that don't use spaces. | 253 // we can correctly separate terms in languages that don't use spaces. |
mattm
2014/05/06 01:00:14
Seems we should also have a similar test with enou
zysxqn
2014/05/06 20:56:57
Done.
| |
191 page_text = | 254 page_text = |
192 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); | 255 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
193 expected_features.Clear(); | 256 expected_features.Clear(); |
194 expected_features.AddBooleanFeature( | 257 expected_features.AddBooleanFeature( |
195 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); | 258 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
196 expected_features.AddBooleanFeature( | 259 expected_features.AddBooleanFeature( |
197 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); | 260 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
261 expected_shingle_hashes.clear(); | |
198 | 262 |
199 features.Clear(); | 263 features.Clear(); |
200 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 264 shingle_hashes.clear(); |
265 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
201 ExpectFeatureMapsAreEqual(features, expected_features); | 266 ExpectFeatureMapsAreEqual(features, expected_features); |
267 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
202 } | 268 } |
203 | 269 |
204 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { | 270 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
205 // For this test, we'll cause the feature extraction to run multiple | 271 // For this test, we'll cause the feature extraction to run multiple |
206 // iterations by incrementing the clock. | 272 // iterations by incrementing the clock. We don't check shingle hashes here |
| 273 // since there are too many of them to list. |
mattm
2014/05/06 01:00:14
what do you mean by size is too large? That you do
zysxqn
2014/05/06 20:56:57
Done.
| |
207 | 274 |
208 // This page has a total of 30 words. For the features to be computed | 275 // This page has a total of 30 words. For the features to be computed |
209 // correctly, the extractor has to process the entire string of text. | 276 // correctly, the extractor has to process the entire string of text. |
210 base::string16 page_text(ASCIIToUTF16("one ")); | 277 base::string16 page_text(ASCIIToUTF16("one ")); |
211 for (int i = 0; i < 28; ++i) { | 278 for (int i = 0; i < 28; ++i) { |
212 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 279 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
213 } | 280 } |
214 page_text.append(ASCIIToUTF16("two")); | 281 page_text.append(ASCIIToUTF16("two")); |
215 | 282 |
216 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. | 283 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. |
(...skipping 23 matching lines...) Expand all Loading... | |
240 // A final check for the histograms. | 307 // A final check for the histograms. |
241 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); | 308 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); |
242 | 309 |
243 FeatureMap expected_features; | 310 FeatureMap expected_features; |
244 expected_features.AddBooleanFeature(features::kPageTerm + | 311 expected_features.AddBooleanFeature(features::kPageTerm + |
245 std::string("one")); | 312 std::string("one")); |
246 expected_features.AddBooleanFeature(features::kPageTerm + | 313 expected_features.AddBooleanFeature(features::kPageTerm + |
247 std::string("two")); | 314 std::string("two")); |
248 | 315 |
249 FeatureMap features; | 316 FeatureMap features; |
250 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 317 std::set<uint32> shingle_hashes; |
318 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
251 ExpectFeatureMapsAreEqual(features, expected_features); | 319 ExpectFeatureMapsAreEqual(features, expected_features); |
252 // Make sure none of the mock expectations carry over to the next test. | 320 // Make sure none of the mock expectations carry over to the next test. |
253 ::testing::Mock::VerifyAndClearExpectations(&clock_); | 321 ::testing::Mock::VerifyAndClearExpectations(&clock_); |
254 | 322 |
255 // Now repeat the test with the same text, but advance the clock faster so | 323 // Now repeat the test with the same text, but advance the clock faster so |
256 // that the extraction time exceeds the maximum total time for the feature | 324 // that the extraction time exceeds the maximum total time for the feature |
257 // extractor. Extraction should fail. Note that this assumes | 325 // extractor. Extraction should fail. Note that this assumes |
258 // kMaxTotalTimeMs = 500. | 326 // kMaxTotalTimeMs = 500. |
259 EXPECT_CALL(clock_, Now()) | 327 EXPECT_CALL(clock_, Now()) |
260 // Time check at the start of extraction. | 328 // Time check at the start of extraction. |
261 .WillOnce(Return(now)) | 329 .WillOnce(Return(now)) |
262 // Time check at the start of the first chunk of work. | 330 // Time check at the start of the first chunk of work. |
263 .WillOnce(Return(now)) | 331 .WillOnce(Return(now)) |
264 // Time check after the first 5 words. | 332 // Time check after the first 5 words. |
265 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) | 333 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) |
266 // Time check at the start of the second chunk of work. | 334 // Time check at the start of the second chunk of work. |
267 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) | 335 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) |
268 // Time check after the next 5 words. This is over the limit. | 336 // Time check after the next 5 words. This is over the limit. |
269 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) | 337 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) |
270 // A final time check for the histograms. | 338 // A final time check for the histograms. |
271 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); | 339 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
272 | 340 |
273 features.Clear(); | 341 features.Clear(); |
274 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); | 342 shingle_hashes.clear(); |
343 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
275 } | 344 } |
276 | 345 |
277 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { | 346 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
278 scoped_ptr<base::string16> page_text( | 347 scoped_ptr<base::string16> page_text( |
279 new base::string16(ASCIIToUTF16("one "))); | 348 new base::string16(ASCIIToUTF16("one "))); |
280 for (int i = 0; i < 28; ++i) { | 349 for (int i = 0; i < 28; ++i) { |
281 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 350 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
282 } | 351 } |
283 | 352 |
284 base::TimeTicks now = base::TimeTicks::Now(); | 353 base::TimeTicks now = base::TimeTicks::Now(); |
285 EXPECT_CALL(clock_, Now()) | 354 EXPECT_CALL(clock_, Now()) |
286 // Time check at the start of extraction. | 355 // Time check at the start of extraction. |
287 .WillOnce(Return(now)) | 356 .WillOnce(Return(now)) |
288 // Time check at the start of the first chunk of work. | 357 // Time check at the start of the first chunk of work. |
289 .WillOnce(Return(now)) | 358 .WillOnce(Return(now)) |
290 // Time check after the first 5 words. | 359 // Time check after the first 5 words. |
291 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) | 360 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) |
292 // Time check after the next 5 words. This should be greater than | 361 // Time check after the next 5 words. This should be greater than |
293 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. | 362 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. |
294 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); | 363 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
295 | 364 |
296 FeatureMap features; | 365 FeatureMap features; |
366 std::set<uint32> shingle_hashes; | |
297 // Extract first 10 words then stop. | 367 // Extract first 10 words then stop. |
298 PartialExtractFeatures(page_text.get(), &features); | 368 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); |
299 | 369 |
300 page_text.reset(new base::string16()); | 370 page_text.reset(new base::string16()); |
301 for (int i = 30; i < 58; ++i) { | 371 for (int i = 30; i < 58; ++i) { |
302 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 372 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
303 } | 373 } |
304 page_text->append(ASCIIToUTF16("multi word test ")); | 374 page_text->append(ASCIIToUTF16("multi word test ")); |
305 features.Clear(); | 375 features.Clear(); |
376 shingle_hashes.clear(); | |
306 | 377 |
307 // This part doesn't exercise the extraction timing. | 378 // This part doesn't exercise the extraction timing. |
308 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 379 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
309 | 380 |
310 // Now extract normally and make sure nothing breaks. | 381 // Now extract normally and make sure nothing breaks. |
311 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); | 382 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); |
312 | 383 |
313 FeatureMap expected_features; | 384 FeatureMap expected_features; |
314 expected_features.AddBooleanFeature(features::kPageTerm + | 385 expected_features.AddBooleanFeature(features::kPageTerm + |
315 std::string("multi word test")); | 386 std::string("multi word test")); |
316 ExpectFeatureMapsAreEqual(features, expected_features); | 387 ExpectFeatureMapsAreEqual(features, expected_features); |
317 } | 388 } |
318 | 389 |
319 } // namespace safe_browsing | 390 } // namespace safe_browsing |
OLD | NEW |