OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <string> | 7 #include <string> |
8 | 8 |
9 #include "base/bind.h" | 9 #include "base/bind.h" |
10 #include "base/callback.h" | 10 #include "base/callback.h" |
11 #include "base/containers/hash_tables.h" | 11 #include "base/containers/hash_tables.h" |
12 #include "base/memory/scoped_ptr.h" | 12 #include "base/memory/scoped_ptr.h" |
13 #include "base/message_loop/message_loop.h" | 13 #include "base/message_loop/message_loop.h" |
14 #include "base/strings/string16.h" | 14 #include "base/strings/string16.h" |
15 #include "base/strings/stringprintf.h" | 15 #include "base/strings/stringprintf.h" |
16 #include "base/strings/utf_string_conversions.h" | 16 #include "base/strings/utf_string_conversions.h" |
17 #include "base/time/time.h" | 17 #include "base/time/time.h" |
18 #include "chrome/renderer/safe_browsing/features.h" | 18 #include "chrome/renderer/safe_browsing/features.h" |
19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" | 19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" |
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
21 #include "chrome/renderer/safe_browsing/test_utils.h" | 21 #include "chrome/renderer/safe_browsing/test_utils.h" |
22 #include "crypto/sha2.h" | 22 #include "crypto/sha2.h" |
23 #include "testing/gmock/include/gmock/gmock.h" | 23 #include "testing/gmock/include/gmock/gmock.h" |
24 #include "testing/gtest/include/gtest/gtest.h" | 24 #include "testing/gtest/include/gtest/gtest.h" |
25 | 25 |
26 using base::ASCIIToUTF16; | 26 using base::ASCIIToUTF16; |
27 using ::testing::Return; | 27 using ::testing::Return; |
28 | 28 |
| 29 |
| 30 static const uint32 kMurmurHash3Seed = 2777808611U; |
| 31 |
29 namespace safe_browsing { | 32 namespace safe_browsing { |
30 | 33 |
31 class PhishingTermFeatureExtractorTest : public ::testing::Test { | 34 class PhishingTermFeatureExtractorTest : public ::testing::Test { |
32 protected: | 35 protected: |
33 virtual void SetUp() { | 36 virtual void SetUp() { |
34 base::hash_set<std::string> terms; | 37 base::hash_set<std::string> terms; |
35 terms.insert("one"); | 38 terms.insert("one"); |
36 terms.insert("one one"); | 39 terms.insert("one one"); |
37 terms.insert("two"); | 40 terms.insert("two"); |
38 terms.insert("multi word test"); | 41 terms.insert("multi word test"); |
(...skipping 17 matching lines...) Expand all Loading... |
56 words.insert("multi"); | 59 words.insert("multi"); |
57 words.insert("word"); | 60 words.insert("word"); |
58 words.insert("test"); | 61 words.insert("test"); |
59 words.insert("capitalization"); | 62 words.insert("capitalization"); |
60 words.insert("space"); | 63 words.insert("space"); |
61 words.insert("separator"); | 64 words.insert("separator"); |
62 words.insert("punctuation"); | 65 words.insert("punctuation"); |
63 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); | 66 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); |
64 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); | 67 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); |
65 | 68 |
66 static const uint32 kMurmurHash3Seed = 2777808611U; | |
67 for (base::hash_set<std::string>::iterator it = words.begin(); | 69 for (base::hash_set<std::string>::iterator it = words.begin(); |
68 it != words.end(); ++it) { | 70 it != words.end(); ++it) { |
69 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); | 71 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); |
70 } | 72 } |
71 | 73 |
| 74 ResetExtractor(3 /* max shingles per page */); |
| 75 } |
| 76 |
| 77 void ResetExtractor(size_t max_shingles_per_page) { |
72 extractor_.reset(new PhishingTermFeatureExtractor( | 78 extractor_.reset(new PhishingTermFeatureExtractor( |
73 &term_hashes_, | 79 &term_hashes_, |
74 &word_hashes_, | 80 &word_hashes_, |
75 3 /* max_words_per_term */, | 81 3 /* max_words_per_term */, |
76 kMurmurHash3Seed, | 82 kMurmurHash3Seed, |
| 83 max_shingles_per_page, |
| 84 4 /* shingle_size */, |
77 &clock_)); | 85 &clock_)); |
78 } | 86 } |
79 | 87 |
80 // Runs the TermFeatureExtractor on |page_text|, waiting for the | 88 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
81 // completion callback. Returns the success boolean from the callback. | 89 // completion callback. Returns the success boolean from the callback. |
82 bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { | 90 bool ExtractFeatures(const base::string16* page_text, |
| 91 FeatureMap* features, |
| 92 std::set<uint32>* shingle_hashes) { |
83 success_ = false; | 93 success_ = false; |
84 extractor_->ExtractFeatures( | 94 extractor_->ExtractFeatures( |
85 page_text, | 95 page_text, |
86 features, | 96 features, |
| 97 shingle_hashes, |
87 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
88 base::Unretained(this))); | 99 base::Unretained(this))); |
89 msg_loop_.Run(); | 100 msg_loop_.Run(); |
90 return success_; | 101 return success_; |
91 } | 102 } |
92 | 103 |
93 void PartialExtractFeatures(const base::string16* page_text, | 104 void PartialExtractFeatures(const base::string16* page_text, |
94 FeatureMap* features) { | 105 FeatureMap* features, |
| 106 std::set<uint32>* shingle_hashes) { |
95 extractor_->ExtractFeatures( | 107 extractor_->ExtractFeatures( |
96 page_text, | 108 page_text, |
97 features, | 109 features, |
| 110 shingle_hashes, |
98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 111 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
99 base::Unretained(this))); | 112 base::Unretained(this))); |
100 msg_loop_.PostTask( | 113 msg_loop_.PostTask( |
101 FROM_HERE, | 114 FROM_HERE, |
102 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, | 115 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, |
103 base::Unretained(this))); | 116 base::Unretained(this))); |
104 msg_loop_.RunUntilIdle(); | 117 msg_loop_.RunUntilIdle(); |
105 } | 118 } |
106 | 119 |
107 // Completion callback for feature extraction. | 120 // Completion callback for feature extraction. |
(...skipping 14 matching lines...) Expand all Loading... |
122 base::hash_set<uint32> word_hashes_; | 135 base::hash_set<uint32> word_hashes_; |
123 bool success_; // holds the success value from ExtractFeatures | 136 bool success_; // holds the success value from ExtractFeatures |
124 }; | 137 }; |
125 | 138 |
126 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { | 139 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
127 // This test doesn't exercise the extraction timing. | 140 // This test doesn't exercise the extraction timing. |
128 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 141 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
129 | 142 |
130 base::string16 page_text = ASCIIToUTF16("blah"); | 143 base::string16 page_text = ASCIIToUTF16("blah"); |
131 FeatureMap expected_features; // initially empty | 144 FeatureMap expected_features; // initially empty |
| 145 std::set<uint32> expected_shingle_hashes; |
132 | 146 |
133 FeatureMap features; | 147 FeatureMap features; |
134 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 148 std::set<uint32> shingle_hashes; |
| 149 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
135 ExpectFeatureMapsAreEqual(features, expected_features); | 150 ExpectFeatureMapsAreEqual(features, expected_features); |
| 151 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
136 | 152 |
137 page_text = ASCIIToUTF16("one one"); | 153 page_text = ASCIIToUTF16("one one"); |
138 expected_features.Clear(); | 154 expected_features.Clear(); |
139 expected_features.AddBooleanFeature(features::kPageTerm + | 155 expected_features.AddBooleanFeature(features::kPageTerm + |
140 std::string("one")); | 156 std::string("one")); |
141 expected_features.AddBooleanFeature(features::kPageTerm + | 157 expected_features.AddBooleanFeature(features::kPageTerm + |
142 std::string("one one")); | 158 std::string("one one")); |
| 159 expected_shingle_hashes.clear(); |
143 | 160 |
144 features.Clear(); | 161 features.Clear(); |
145 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 162 shingle_hashes.clear(); |
| 163 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
146 ExpectFeatureMapsAreEqual(features, expected_features); | 164 ExpectFeatureMapsAreEqual(features, expected_features); |
| 165 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
147 | 166 |
148 page_text = ASCIIToUTF16("bla bla multi word test bla"); | 167 page_text = ASCIIToUTF16("bla bla multi word test bla"); |
149 expected_features.Clear(); | 168 expected_features.Clear(); |
150 expected_features.AddBooleanFeature(features::kPageTerm + | 169 expected_features.AddBooleanFeature(features::kPageTerm + |
151 std::string("multi word test")); | 170 std::string("multi word test")); |
| 171 expected_shingle_hashes.clear(); |
| 172 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ", |
| 173 kMurmurHash3Seed)); |
| 174 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ", |
| 175 kMurmurHash3Seed)); |
| 176 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ", |
| 177 kMurmurHash3Seed)); |
152 | 178 |
153 features.Clear(); | 179 features.Clear(); |
154 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 180 shingle_hashes.clear(); |
| 181 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
155 ExpectFeatureMapsAreEqual(features, expected_features); | 182 ExpectFeatureMapsAreEqual(features, expected_features); |
| 183 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
156 | 184 |
157 // This text has all of the words for one of the terms, but they are | 185 // This text has all of the words for one of the terms, but they are |
158 // not in the correct order. | 186 // not in the correct order. |
159 page_text = ASCIIToUTF16("bla bla test word multi bla"); | 187 page_text = ASCIIToUTF16("bla bla test word multi bla"); |
160 expected_features.Clear(); | 188 expected_features.Clear(); |
| 189 expected_shingle_hashes.clear(); |
| 190 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ", |
| 191 kMurmurHash3Seed)); |
| 192 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ", |
| 193 kMurmurHash3Seed)); |
| 194 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ", |
| 195 kMurmurHash3Seed)); |
161 | 196 |
162 features.Clear(); | 197 features.Clear(); |
163 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 198 shingle_hashes.clear(); |
| 199 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
164 ExpectFeatureMapsAreEqual(features, expected_features); | 200 ExpectFeatureMapsAreEqual(features, expected_features); |
| 201 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
165 | 202 |
| 203 // Test various separators. |
166 page_text = ASCIIToUTF16("Capitalization plus non-space\n" | 204 page_text = ASCIIToUTF16("Capitalization plus non-space\n" |
167 "separator... punctuation!"); | 205 "separator... punctuation!"); |
168 expected_features.Clear(); | 206 expected_features.Clear(); |
169 expected_features.AddBooleanFeature(features::kPageTerm + | 207 expected_features.AddBooleanFeature(features::kPageTerm + |
170 std::string("capitalization")); | 208 std::string("capitalization")); |
171 expected_features.AddBooleanFeature(features::kPageTerm + | 209 expected_features.AddBooleanFeature(features::kPageTerm + |
172 std::string("space")); | 210 std::string("space")); |
173 expected_features.AddBooleanFeature(features::kPageTerm + | 211 expected_features.AddBooleanFeature(features::kPageTerm + |
174 std::string("separator")); | 212 std::string("separator")); |
175 expected_features.AddBooleanFeature(features::kPageTerm + | 213 expected_features.AddBooleanFeature(features::kPageTerm + |
176 std::string("punctuation")); | 214 std::string("punctuation")); |
| 215 expected_shingle_hashes.clear(); |
| 216 expected_shingle_hashes.insert( |
| 217 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed)); |
| 218 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ", |
| 219 kMurmurHash3Seed)); |
| 220 expected_shingle_hashes.insert( |
| 221 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed)); |
177 | 222 |
178 features.Clear(); | 223 features.Clear(); |
179 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 224 shingle_hashes.clear(); |
| 225 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
180 ExpectFeatureMapsAreEqual(features, expected_features); | 226 ExpectFeatureMapsAreEqual(features, expected_features); |
| 227 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 228 |
 | 229 // Test a page with too many words; we should only get the 3 minimum hashes. |
| 230 page_text = ASCIIToUTF16("This page has way too many words."); |
| 231 expected_features.Clear(); |
| 232 expected_shingle_hashes.clear(); |
| 233 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", |
| 234 kMurmurHash3Seed)); |
| 235 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", |
| 236 kMurmurHash3Seed)); |
| 237 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", |
| 238 kMurmurHash3Seed)); |
| 239 expected_shingle_hashes.insert(MurmurHash3String("way too many words ", |
| 240 kMurmurHash3Seed)); |
| 241 std::set<uint32>::iterator it = expected_shingle_hashes.end(); |
| 242 expected_shingle_hashes.erase(--it); |
| 243 |
| 244 features.Clear(); |
| 245 shingle_hashes.clear(); |
| 246 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 247 ExpectFeatureMapsAreEqual(features, expected_features); |
| 248 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
181 | 249 |
182 // Test with empty page text. | 250 // Test with empty page text. |
183 page_text = base::string16(); | 251 page_text = base::string16(); |
184 expected_features.Clear(); | 252 expected_features.Clear(); |
| 253 expected_shingle_hashes.clear(); |
185 features.Clear(); | 254 features.Clear(); |
186 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 255 shingle_hashes.clear(); |
| 256 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
187 ExpectFeatureMapsAreEqual(features, expected_features); | 257 ExpectFeatureMapsAreEqual(features, expected_features); |
| 258 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
188 | 259 |
189 // Chinese translation of the phrase "hello goodbye". This tests that | 260 // Chinese translation of the phrase "hello goodbye hello goodbye". This tests |
190 // we can correctly separate terms in languages that don't use spaces. | 261 // that we can correctly separate terms in languages that don't use spaces. |
191 page_text = | 262 page_text = |
192 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); | 263 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81" |
| 264 "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
193 expected_features.Clear(); | 265 expected_features.Clear(); |
194 expected_features.AddBooleanFeature( | 266 expected_features.AddBooleanFeature( |
195 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); | 267 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
196 expected_features.AddBooleanFeature( | 268 expected_features.AddBooleanFeature( |
197 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); | 269 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
| 270 expected_shingle_hashes.clear(); |
| 271 expected_shingle_hashes.insert(MurmurHash3String( |
| 272 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 " |
| 273 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed)); |
198 | 274 |
199 features.Clear(); | 275 features.Clear(); |
200 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 276 shingle_hashes.clear(); |
| 277 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
201 ExpectFeatureMapsAreEqual(features, expected_features); | 278 ExpectFeatureMapsAreEqual(features, expected_features); |
| 279 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
202 } | 280 } |
203 | 281 |
204 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { | 282 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
205 // For this test, we'll cause the feature extraction to run multiple | 283 // For this test, we'll cause the feature extraction to run multiple |
206 // iterations by incrementing the clock. | 284 // iterations by incrementing the clock. |
| 285 ResetExtractor(200 /* max shingles per page */); |
207 | 286 |
208 // This page has a total of 30 words. For the features to be computed | 287 // This page has a total of 30 words. For the features to be computed |
209 // correctly, the extractor has to process the entire string of text. | 288 // correctly, the extractor has to process the entire string of text. |
210 base::string16 page_text(ASCIIToUTF16("one ")); | 289 base::string16 page_text(ASCIIToUTF16("one ")); |
211 for (int i = 0; i < 28; ++i) { | 290 for (int i = 0; i < 28; ++i) { |
212 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 291 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
213 } | 292 } |
214 page_text.append(ASCIIToUTF16("two")); | 293 page_text.append(ASCIIToUTF16("two")); |
215 | 294 |
216 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. | 295 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. |
(...skipping 21 matching lines...) Expand all Loading... |
238 // Time check after the next 5 words. | 317 // Time check after the next 5 words. |
239 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) | 318 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) |
240 // A final check for the histograms. | 319 // A final check for the histograms. |
241 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); | 320 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); |
242 | 321 |
243 FeatureMap expected_features; | 322 FeatureMap expected_features; |
244 expected_features.AddBooleanFeature(features::kPageTerm + | 323 expected_features.AddBooleanFeature(features::kPageTerm + |
245 std::string("one")); | 324 std::string("one")); |
246 expected_features.AddBooleanFeature(features::kPageTerm + | 325 expected_features.AddBooleanFeature(features::kPageTerm + |
247 std::string("two")); | 326 std::string("two")); |
| 327 std::set<uint32> expected_shingle_hashes; |
| 328 expected_shingle_hashes.insert( |
| 329 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed)); |
| 330 expected_shingle_hashes.insert( |
| 331 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed)); |
| 332 expected_shingle_hashes.insert( |
| 333 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed)); |
| 334 expected_shingle_hashes.insert( |
| 335 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed)); |
| 336 expected_shingle_hashes.insert( |
| 337 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed)); |
| 338 expected_shingle_hashes.insert( |
| 339 MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed)); |
| 340 expected_shingle_hashes.insert( |
| 341 MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed)); |
| 342 expected_shingle_hashes.insert( |
| 343 MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed)); |
| 344 expected_shingle_hashes.insert( |
| 345 MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed)); |
| 346 expected_shingle_hashes.insert( |
| 347 MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed)); |
| 348 expected_shingle_hashes.insert( |
| 349 MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed)); |
| 350 expected_shingle_hashes.insert( |
| 351 MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed)); |
| 352 expected_shingle_hashes.insert( |
| 353 MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed)); |
| 354 expected_shingle_hashes.insert( |
| 355 MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed)); |
| 356 expected_shingle_hashes.insert( |
| 357 MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed)); |
| 358 expected_shingle_hashes.insert( |
| 359 MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed)); |
| 360 expected_shingle_hashes.insert( |
| 361 MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed)); |
| 362 expected_shingle_hashes.insert( |
| 363 MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed)); |
| 364 expected_shingle_hashes.insert( |
| 365 MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed)); |
| 366 expected_shingle_hashes.insert( |
| 367 MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed)); |
| 368 expected_shingle_hashes.insert( |
| 369 MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed)); |
| 370 expected_shingle_hashes.insert( |
| 371 MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed)); |
| 372 expected_shingle_hashes.insert( |
| 373 MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed)); |
| 374 expected_shingle_hashes.insert( |
| 375 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed)); |
| 376 expected_shingle_hashes.insert( |
| 377 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed)); |
| 378 expected_shingle_hashes.insert( |
| 379 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed)); |
| 380 expected_shingle_hashes.insert( |
| 381 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed)); |
248 | 382 |
249 FeatureMap features; | 383 FeatureMap features; |
250 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 384 std::set<uint32> shingle_hashes; |
| 385 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
251 ExpectFeatureMapsAreEqual(features, expected_features); | 386 ExpectFeatureMapsAreEqual(features, expected_features); |
| 387 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
252 // Make sure none of the mock expectations carry over to the next test. | 388 // Make sure none of the mock expectations carry over to the next test. |
253 ::testing::Mock::VerifyAndClearExpectations(&clock_); | 389 ::testing::Mock::VerifyAndClearExpectations(&clock_); |
254 | 390 |
255 // Now repeat the test with the same text, but advance the clock faster so | 391 // Now repeat the test with the same text, but advance the clock faster so |
256 // that the extraction time exceeds the maximum total time for the feature | 392 // that the extraction time exceeds the maximum total time for the feature |
257 // extractor. Extraction should fail. Note that this assumes | 393 // extractor. Extraction should fail. Note that this assumes |
258 // kMaxTotalTimeMs = 500. | 394 // kMaxTotalTimeMs = 500. |
259 EXPECT_CALL(clock_, Now()) | 395 EXPECT_CALL(clock_, Now()) |
260 // Time check at the start of extraction. | 396 // Time check at the start of extraction. |
261 .WillOnce(Return(now)) | 397 .WillOnce(Return(now)) |
262 // Time check at the start of the first chunk of work. | 398 // Time check at the start of the first chunk of work. |
263 .WillOnce(Return(now)) | 399 .WillOnce(Return(now)) |
264 // Time check after the first 5 words, | 400 // Time check after the first 5 words, |
265 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) | 401 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) |
266 // Time check at the start of the second chunk of work. | 402 // Time check at the start of the second chunk of work. |
267 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) | 403 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) |
268 // Time check after the next 5 words. This is over the limit. | 404 // Time check after the next 5 words. This is over the limit. |
269 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) | 405 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) |
270 // A final time check for the histograms. | 406 // A final time check for the histograms. |
271 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); | 407 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
272 | 408 |
273 features.Clear(); | 409 features.Clear(); |
274 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); | 410 shingle_hashes.clear(); |
| 411 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
275 } | 412 } |
276 | 413 |
277 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { | 414 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
278 scoped_ptr<base::string16> page_text( | 415 scoped_ptr<base::string16> page_text( |
279 new base::string16(ASCIIToUTF16("one "))); | 416 new base::string16(ASCIIToUTF16("one "))); |
280 for (int i = 0; i < 28; ++i) { | 417 for (int i = 0; i < 28; ++i) { |
281 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 418 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
282 } | 419 } |
283 | 420 |
284 base::TimeTicks now = base::TimeTicks::Now(); | 421 base::TimeTicks now = base::TimeTicks::Now(); |
285 EXPECT_CALL(clock_, Now()) | 422 EXPECT_CALL(clock_, Now()) |
286 // Time check at the start of extraction. | 423 // Time check at the start of extraction. |
287 .WillOnce(Return(now)) | 424 .WillOnce(Return(now)) |
288 // Time check at the start of the first chunk of work. | 425 // Time check at the start of the first chunk of work. |
289 .WillOnce(Return(now)) | 426 .WillOnce(Return(now)) |
290 // Time check after the first 5 words. | 427 // Time check after the first 5 words. |
291 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) | 428 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) |
292 // Time check after the next 5 words. This should be greater than | 429 // Time check after the next 5 words. This should be greater than |
293 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. | 430 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. |
294 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); | 431 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
295 | 432 |
296 FeatureMap features; | 433 FeatureMap features; |
| 434 std::set<uint32> shingle_hashes; |
297 // Extract first 10 words then stop. | 435 // Extract first 10 words then stop. |
298 PartialExtractFeatures(page_text.get(), &features); | 436 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); |
299 | 437 |
300 page_text.reset(new base::string16()); | 438 page_text.reset(new base::string16()); |
301 for (int i = 30; i < 58; ++i) { | 439 for (int i = 30; i < 58; ++i) { |
302 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 440 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
303 } | 441 } |
304 page_text->append(ASCIIToUTF16("multi word test ")); | 442 page_text->append(ASCIIToUTF16("multi word test ")); |
305 features.Clear(); | 443 features.Clear(); |
| 444 shingle_hashes.clear(); |
306 | 445 |
307 // This part doesn't exercise the extraction timing. | 446 // This part doesn't exercise the extraction timing. |
308 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 447 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
309 | 448 |
310 // Now extract normally and make sure nothing breaks. | 449 // Now extract normally and make sure nothing breaks. |
311 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); | 450 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); |
312 | 451 |
313 FeatureMap expected_features; | 452 FeatureMap expected_features; |
314 expected_features.AddBooleanFeature(features::kPageTerm + | 453 expected_features.AddBooleanFeature(features::kPageTerm + |
315 std::string("multi word test")); | 454 std::string("multi word test")); |
316 ExpectFeatureMapsAreEqual(features, expected_features); | 455 ExpectFeatureMapsAreEqual(features, expected_features); |
317 } | 456 } |
318 | 457 |
319 } // namespace safe_browsing | 458 } // namespace safe_browsing |
OLD | NEW |