Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(177)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix a reference problem. Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6 6
7 #include <string> 7 #include <string>
8 8
9 #include "base/bind.h" 9 #include "base/bind.h"
10 #include "base/callback.h" 10 #include "base/callback.h"
11 #include "base/containers/hash_tables.h" 11 #include "base/containers/hash_tables.h"
12 #include "base/memory/scoped_ptr.h" 12 #include "base/memory/scoped_ptr.h"
13 #include "base/message_loop/message_loop.h" 13 #include "base/message_loop/message_loop.h"
14 #include "base/strings/string16.h" 14 #include "base/strings/string16.h"
15 #include "base/strings/stringprintf.h" 15 #include "base/strings/stringprintf.h"
16 #include "base/strings/utf_string_conversions.h" 16 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h" 17 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/features.h" 18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" 19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "chrome/renderer/safe_browsing/test_utils.h" 21 #include "chrome/renderer/safe_browsing/test_utils.h"
22 #include "crypto/sha2.h" 22 #include "crypto/sha2.h"
23 #include "testing/gmock/include/gmock/gmock.h" 23 #include "testing/gmock/include/gmock/gmock.h"
24 #include "testing/gtest/include/gtest/gtest.h" 24 #include "testing/gtest/include/gtest/gtest.h"
25 25
26 using base::ASCIIToUTF16; 26 using base::ASCIIToUTF16;
27 using ::testing::Return; 27 using ::testing::Return;
28 28
29
30 static const uint32 kMurmurHash3Seed = 2777808611U;
31
29 namespace safe_browsing { 32 namespace safe_browsing {
30 33
31 class PhishingTermFeatureExtractorTest : public ::testing::Test { 34 class PhishingTermFeatureExtractorTest : public ::testing::Test {
32 protected: 35 protected:
33 virtual void SetUp() { 36 virtual void SetUp() {
34 base::hash_set<std::string> terms; 37 base::hash_set<std::string> terms;
35 terms.insert("one"); 38 terms.insert("one");
36 terms.insert("one one"); 39 terms.insert("one one");
37 terms.insert("two"); 40 terms.insert("two");
38 terms.insert("multi word test"); 41 terms.insert("multi word test");
(...skipping 17 matching lines...) Expand all
56 words.insert("multi"); 59 words.insert("multi");
57 words.insert("word"); 60 words.insert("word");
58 words.insert("test"); 61 words.insert("test");
59 words.insert("capitalization"); 62 words.insert("capitalization");
60 words.insert("space"); 63 words.insert("space");
61 words.insert("separator"); 64 words.insert("separator");
62 words.insert("punctuation"); 65 words.insert("punctuation");
63 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); 66 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
64 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); 67 words.insert("\xe5\x86\x8d\xe8\xa7\x81");
65 68
66 static const uint32 kMurmurHash3Seed = 2777808611U;
67 for (base::hash_set<std::string>::iterator it = words.begin(); 69 for (base::hash_set<std::string>::iterator it = words.begin();
68 it != words.end(); ++it) { 70 it != words.end(); ++it) {
69 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); 71 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
70 } 72 }
71 73
72 extractor_.reset(new PhishingTermFeatureExtractor( 74 extractor_.reset(new PhishingTermFeatureExtractor(
73 &term_hashes_, 75 &term_hashes_,
74 &word_hashes_, 76 &word_hashes_,
77 3 /* max_hashes_per_page */,
75 3 /* max_words_per_term */, 78 3 /* max_words_per_term */,
76 kMurmurHash3Seed, 79 kMurmurHash3Seed,
80 4 /* shingle_size */,
77 &clock_)); 81 &clock_));
78 } 82 }
79 83
80 // Runs the TermFeatureExtractor on |page_text|, waiting for the 84 // Runs the TermFeatureExtractor on |page_text|, waiting for the
81 // completion callback. Returns the success boolean from the callback. 85 // completion callback. Returns the success boolean from the callback.
82 bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { 86 bool ExtractFeatures(const base::string16* page_text,
87 FeatureMap* features,
88 std::set<uint32>* shingle_hashes) {
83 success_ = false; 89 success_ = false;
84 extractor_->ExtractFeatures( 90 extractor_->ExtractFeatures(
85 page_text, 91 page_text,
86 features, 92 features,
93 shingle_hashes,
87 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, 94 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
88 base::Unretained(this))); 95 base::Unretained(this)));
89 msg_loop_.Run(); 96 msg_loop_.Run();
90 return success_; 97 return success_;
91 } 98 }
92 99
93 void PartialExtractFeatures(const base::string16* page_text, 100 void PartialExtractFeatures(const base::string16* page_text,
94 FeatureMap* features) { 101 FeatureMap* features,
102 std::set<uint32>* shingle_hashes) {
95 extractor_->ExtractFeatures( 103 extractor_->ExtractFeatures(
96 page_text, 104 page_text,
97 features, 105 features,
106 shingle_hashes,
98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, 107 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
99 base::Unretained(this))); 108 base::Unretained(this)));
100 msg_loop_.PostTask( 109 msg_loop_.PostTask(
101 FROM_HERE, 110 FROM_HERE,
102 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, 111 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
103 base::Unretained(this))); 112 base::Unretained(this)));
104 msg_loop_.RunUntilIdle(); 113 msg_loop_.RunUntilIdle();
105 } 114 }
106 115
107 // Completion callback for feature extraction. 116 // Completion callback for feature extraction.
(...skipping 14 matching lines...) Expand all
122 base::hash_set<uint32> word_hashes_; 131 base::hash_set<uint32> word_hashes_;
123 bool success_; // holds the success value from ExtractFeatures 132 bool success_; // holds the success value from ExtractFeatures
124 }; 133 };
125 134
126 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { 135 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
127 // This test doesn't exercise the extraction timing. 136 // This test doesn't exercise the extraction timing.
128 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 137 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
129 138
130 base::string16 page_text = ASCIIToUTF16("blah"); 139 base::string16 page_text = ASCIIToUTF16("blah");
131 FeatureMap expected_features; // initially empty 140 FeatureMap expected_features; // initially empty
141 std::set<uint32> expected_shingle_hashes;
132 142
133 FeatureMap features; 143 FeatureMap features;
134 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 144 std::set<uint32> shingle_hashes;
145 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
135 ExpectFeatureMapsAreEqual(features, expected_features); 146 ExpectFeatureMapsAreEqual(features, expected_features);
147 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
136 148
137 page_text = ASCIIToUTF16("one one"); 149 page_text = ASCIIToUTF16("one one");
138 expected_features.Clear(); 150 expected_features.Clear();
139 expected_features.AddBooleanFeature(features::kPageTerm + 151 expected_features.AddBooleanFeature(features::kPageTerm +
140 std::string("one")); 152 std::string("one"));
141 expected_features.AddBooleanFeature(features::kPageTerm + 153 expected_features.AddBooleanFeature(features::kPageTerm +
142 std::string("one one")); 154 std::string("one one"));
155 expected_shingle_hashes.clear();
143 156
144 features.Clear(); 157 features.Clear();
145 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 158 shingle_hashes.clear();
159 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
146 ExpectFeatureMapsAreEqual(features, expected_features); 160 ExpectFeatureMapsAreEqual(features, expected_features);
161 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
147 162
148 page_text = ASCIIToUTF16("bla bla multi word test bla"); 163 page_text = ASCIIToUTF16("bla bla multi word test bla");
149 expected_features.Clear(); 164 expected_features.Clear();
150 expected_features.AddBooleanFeature(features::kPageTerm + 165 expected_features.AddBooleanFeature(features::kPageTerm +
151 std::string("multi word test")); 166 std::string("multi word test"));
167 expected_shingle_hashes.clear();
168 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
169 kMurmurHash3Seed));
170 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
171 kMurmurHash3Seed));
172 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
173 kMurmurHash3Seed));
152 174
153 features.Clear(); 175 features.Clear();
154 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 176 shingle_hashes.clear();
177 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
155 ExpectFeatureMapsAreEqual(features, expected_features); 178 ExpectFeatureMapsAreEqual(features, expected_features);
179 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
156 180
157 // This text has all of the words for one of the terms, but they are 181 // This text has all of the words for one of the terms, but they are
158 // not in the correct order. 182 // not in the correct order.
159 page_text = ASCIIToUTF16("bla bla test word multi bla"); 183 page_text = ASCIIToUTF16("bla bla test word multi bla");
160 expected_features.Clear(); 184 expected_features.Clear();
185 expected_shingle_hashes.clear();
186 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
187 kMurmurHash3Seed));
188 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
189 kMurmurHash3Seed));
190 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
191 kMurmurHash3Seed));
161 192
162 features.Clear(); 193 features.Clear();
163 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 194 shingle_hashes.clear();
195 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
164 ExpectFeatureMapsAreEqual(features, expected_features); 196 ExpectFeatureMapsAreEqual(features, expected_features);
197 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
165 198
199 // Test various separators.
166 page_text = ASCIIToUTF16("Capitalization plus non-space\n" 200 page_text = ASCIIToUTF16("Capitalization plus non-space\n"
167 "separator... punctuation!"); 201 "separator... punctuation!");
168 expected_features.Clear(); 202 expected_features.Clear();
169 expected_features.AddBooleanFeature(features::kPageTerm + 203 expected_features.AddBooleanFeature(features::kPageTerm +
170 std::string("capitalization")); 204 std::string("capitalization"));
171 expected_features.AddBooleanFeature(features::kPageTerm + 205 expected_features.AddBooleanFeature(features::kPageTerm +
172 std::string("space")); 206 std::string("space"));
173 expected_features.AddBooleanFeature(features::kPageTerm + 207 expected_features.AddBooleanFeature(features::kPageTerm +
174 std::string("separator")); 208 std::string("separator"));
175 expected_features.AddBooleanFeature(features::kPageTerm + 209 expected_features.AddBooleanFeature(features::kPageTerm +
176 std::string("punctuation")); 210 std::string("punctuation"));
211 expected_shingle_hashes.clear();
212 expected_shingle_hashes.insert(
213 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
214 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
215 kMurmurHash3Seed));
216 expected_shingle_hashes.insert(
217 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
177 218
178 features.Clear(); 219 features.Clear();
179 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 220 shingle_hashes.clear();
221 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
180 ExpectFeatureMapsAreEqual(features, expected_features); 222 ExpectFeatureMapsAreEqual(features, expected_features);
223 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
224
225 // Test a page with too many words; we should get only 3 shingle hashes.
226 page_text = ASCIIToUTF16("This page has way too many words.");
227 expected_features.Clear();
228 expected_shingle_hashes.clear();
229 expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
230 kMurmurHash3Seed));
231 expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
232 kMurmurHash3Seed));
233 expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
234 kMurmurHash3Seed));
235
236 features.Clear();
237 shingle_hashes.clear();
238 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
239 ExpectFeatureMapsAreEqual(features, expected_features);
240 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
181 241
182 // Test with empty page text. 242 // Test with empty page text.
183 page_text = base::string16(); 243 page_text = base::string16();
184 expected_features.Clear(); 244 expected_features.Clear();
245 expected_shingle_hashes.clear();
185 features.Clear(); 246 features.Clear();
186 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 247 shingle_hashes.clear();
248 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
187 ExpectFeatureMapsAreEqual(features, expected_features); 249 ExpectFeatureMapsAreEqual(features, expected_features);
250 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
188 251
189 // Chinese translation of the phrase "hello goodbye". This tests that 252 // Chinese translation of the phrase "hello goodbye". This tests that
190 // we can correctly separate terms in languages that don't use spaces. 253 // we can correctly separate terms in languages that don't use spaces.
mattm 2014/05/06 01:00:14 Seems we should also have a similar test with enou
zysxqn 2014/05/06 20:56:57 Done.
191 page_text = 254 page_text =
192 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); 255 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
193 expected_features.Clear(); 256 expected_features.Clear();
194 expected_features.AddBooleanFeature( 257 expected_features.AddBooleanFeature(
195 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); 258 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
196 expected_features.AddBooleanFeature( 259 expected_features.AddBooleanFeature(
197 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); 260 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
261 expected_shingle_hashes.clear();
198 262
199 features.Clear(); 263 features.Clear();
200 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 264 shingle_hashes.clear();
265 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
201 ExpectFeatureMapsAreEqual(features, expected_features); 266 ExpectFeatureMapsAreEqual(features, expected_features);
267 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
202 } 268 }
203 269
204 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { 270 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
205 // For this test, we'll cause the feature extraction to run multiple 271 // For this test, we'll cause the feature extraction to run multiple
206 // iterations by incrementing the clock. 272 // iterations by incrementing the clock. We don't check shingle hashes here
273 // since the resulting set is too large.
mattm 2014/05/06 01:00:14 what do you mean by size is too large? That you do
zysxqn 2014/05/06 20:56:57 Done.
207 274
208 // This page has a total of 30 words. For the features to be computed 275 // This page has a total of 30 words. For the features to be computed
209 // correctly, the extractor has to process the entire string of text. 276 // correctly, the extractor has to process the entire string of text.
210 base::string16 page_text(ASCIIToUTF16("one ")); 277 base::string16 page_text(ASCIIToUTF16("one "));
211 for (int i = 0; i < 28; ++i) { 278 for (int i = 0; i < 28; ++i) {
212 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 279 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
213 } 280 }
214 page_text.append(ASCIIToUTF16("two")); 281 page_text.append(ASCIIToUTF16("two"));
215 282
216 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. 283 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
(...skipping 23 matching lines...) Expand all
240 // A final check for the histograms. 307 // A final check for the histograms.
241 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); 308 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
242 309
243 FeatureMap expected_features; 310 FeatureMap expected_features;
244 expected_features.AddBooleanFeature(features::kPageTerm + 311 expected_features.AddBooleanFeature(features::kPageTerm +
245 std::string("one")); 312 std::string("one"));
246 expected_features.AddBooleanFeature(features::kPageTerm + 313 expected_features.AddBooleanFeature(features::kPageTerm +
247 std::string("two")); 314 std::string("two"));
248 315
249 FeatureMap features; 316 FeatureMap features;
250 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 317 std::set<uint32> shingle_hashes;
318 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
251 ExpectFeatureMapsAreEqual(features, expected_features); 319 ExpectFeatureMapsAreEqual(features, expected_features);
252 // Make sure none of the mock expectations carry over to the next test. 320 // Make sure none of the mock expectations carry over to the next test.
253 ::testing::Mock::VerifyAndClearExpectations(&clock_); 321 ::testing::Mock::VerifyAndClearExpectations(&clock_);
254 322
255 // Now repeat the test with the same text, but advance the clock faster so 323 // Now repeat the test with the same text, but advance the clock faster so
256 // that the extraction time exceeds the maximum total time for the feature 324 // that the extraction time exceeds the maximum total time for the feature
257 // extractor. Extraction should fail. Note that this assumes 325 // extractor. Extraction should fail. Note that this assumes
258 // kMaxTotalTimeMs = 500. 326 // kMaxTotalTimeMs = 500.
259 EXPECT_CALL(clock_, Now()) 327 EXPECT_CALL(clock_, Now())
260 // Time check at the start of extraction. 328 // Time check at the start of extraction.
261 .WillOnce(Return(now)) 329 .WillOnce(Return(now))
262 // Time check at the start of the first chunk of work. 330 // Time check at the start of the first chunk of work.
263 .WillOnce(Return(now)) 331 .WillOnce(Return(now))
264 // Time check after the first 5 words, 332 // Time check after the first 5 words,
265 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) 333 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
266 // Time check at the start of the second chunk of work. 334 // Time check at the start of the second chunk of work.
267 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) 335 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
268 // Time check after the next 5 words. This is over the limit. 336 // Time check after the next 5 words. This is over the limit.
269 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) 337 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
270 // A final time check for the histograms. 338 // A final time check for the histograms.
271 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); 339 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
272 340
273 features.Clear(); 341 features.Clear();
274 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); 342 shingle_hashes.clear();
343 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
275 } 344 }
276 345
277 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { 346 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
278 scoped_ptr<base::string16> page_text( 347 scoped_ptr<base::string16> page_text(
279 new base::string16(ASCIIToUTF16("one "))); 348 new base::string16(ASCIIToUTF16("one ")));
280 for (int i = 0; i < 28; ++i) { 349 for (int i = 0; i < 28; ++i) {
281 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 350 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
282 } 351 }
283 352
284 base::TimeTicks now = base::TimeTicks::Now(); 353 base::TimeTicks now = base::TimeTicks::Now();
285 EXPECT_CALL(clock_, Now()) 354 EXPECT_CALL(clock_, Now())
286 // Time check at the start of extraction. 355 // Time check at the start of extraction.
287 .WillOnce(Return(now)) 356 .WillOnce(Return(now))
288 // Time check at the start of the first chunk of work. 357 // Time check at the start of the first chunk of work.
289 .WillOnce(Return(now)) 358 .WillOnce(Return(now))
290 // Time check after the first 5 words. 359 // Time check after the first 5 words.
291 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) 360 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
292 // Time check after the next 5 words. This should be greater than 361 // Time check after the next 5 words. This should be greater than
293 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. 362 // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
294 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); 363 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
295 364
296 FeatureMap features; 365 FeatureMap features;
366 std::set<uint32> shingle_hashes;
297 // Extract first 10 words then stop. 367 // Extract first 10 words then stop.
298 PartialExtractFeatures(page_text.get(), &features); 368 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
299 369
300 page_text.reset(new base::string16()); 370 page_text.reset(new base::string16());
301 for (int i = 30; i < 58; ++i) { 371 for (int i = 30; i < 58; ++i) {
302 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 372 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
303 } 373 }
304 page_text->append(ASCIIToUTF16("multi word test ")); 374 page_text->append(ASCIIToUTF16("multi word test "));
305 features.Clear(); 375 features.Clear();
376 shingle_hashes.clear();
306 377
307 // This part doesn't exercise the extraction timing. 378 // This part doesn't exercise the extraction timing.
308 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 379 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
309 380
310 // Now extract normally and make sure nothing breaks. 381 // Now extract normally and make sure nothing breaks.
311 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); 382 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
312 383
313 FeatureMap expected_features; 384 FeatureMap expected_features;
314 expected_features.AddBooleanFeature(features::kPageTerm + 385 expected_features.AddBooleanFeature(features::kPageTerm +
315 std::string("multi word test")); 386 std::string("multi word test"));
316 ExpectFeatureMapsAreEqual(features, expected_features); 387 ExpectFeatureMapsAreEqual(features, expected_features);
317 } 388 }
318 389
319 } // namespace safe_browsing 390 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698