| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <string> | 7 #include <string> |
| 8 | 8 |
| 9 #include "base/bind.h" | 9 #include "base/bind.h" |
| 10 #include "base/callback.h" | 10 #include "base/callback.h" |
| 11 #include "base/containers/hash_tables.h" | 11 #include "base/containers/hash_tables.h" |
| 12 #include "base/memory/scoped_ptr.h" | 12 #include "base/memory/scoped_ptr.h" |
| 13 #include "base/message_loop/message_loop.h" | 13 #include "base/message_loop/message_loop.h" |
| 14 #include "base/strings/string16.h" | 14 #include "base/strings/string16.h" |
| 15 #include "base/strings/stringprintf.h" | 15 #include "base/strings/stringprintf.h" |
| 16 #include "base/strings/utf_string_conversions.h" | 16 #include "base/strings/utf_string_conversions.h" |
| 17 #include "base/time/time.h" | 17 #include "base/time/time.h" |
| 18 #include "chrome/renderer/safe_browsing/features.h" | 18 #include "chrome/renderer/safe_browsing/features.h" |
| 19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" | 19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" |
| 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
| 21 #include "chrome/renderer/safe_browsing/test_utils.h" | 21 #include "chrome/renderer/safe_browsing/test_utils.h" |
| 22 #include "crypto/sha2.h" | 22 #include "crypto/sha2.h" |
| 23 #include "testing/gmock/include/gmock/gmock.h" | 23 #include "testing/gmock/include/gmock/gmock.h" |
| 24 #include "testing/gtest/include/gtest/gtest.h" | 24 #include "testing/gtest/include/gtest/gtest.h" |
| 25 | 25 |
| 26 using base::ASCIIToUTF16; | 26 using base::ASCIIToUTF16; |
| 27 using ::testing::Return; | 27 using ::testing::Return; |
| 28 | 28 |
| 29 |
| 30 static const uint32 kMurmurHash3Seed = 2777808611U; |
| 31 |
| 29 namespace safe_browsing { | 32 namespace safe_browsing { |
| 30 | 33 |
| 31 class PhishingTermFeatureExtractorTest : public ::testing::Test { | 34 class PhishingTermFeatureExtractorTest : public ::testing::Test { |
| 32 protected: | 35 protected: |
| 33 virtual void SetUp() { | 36 virtual void SetUp() { |
| 34 base::hash_set<std::string> terms; | 37 base::hash_set<std::string> terms; |
| 35 terms.insert("one"); | 38 terms.insert("one"); |
| 36 terms.insert("one one"); | 39 terms.insert("one one"); |
| 37 terms.insert("two"); | 40 terms.insert("two"); |
| 38 terms.insert("multi word test"); | 41 terms.insert("multi word test"); |
| (...skipping 17 matching lines...) Expand all Loading... |
| 56 words.insert("multi"); | 59 words.insert("multi"); |
| 57 words.insert("word"); | 60 words.insert("word"); |
| 58 words.insert("test"); | 61 words.insert("test"); |
| 59 words.insert("capitalization"); | 62 words.insert("capitalization"); |
| 60 words.insert("space"); | 63 words.insert("space"); |
| 61 words.insert("separator"); | 64 words.insert("separator"); |
| 62 words.insert("punctuation"); | 65 words.insert("punctuation"); |
| 63 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); | 66 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); |
| 64 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); | 67 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); |
| 65 | 68 |
| 66 static const uint32 kMurmurHash3Seed = 2777808611U; | |
| 67 for (base::hash_set<std::string>::iterator it = words.begin(); | 69 for (base::hash_set<std::string>::iterator it = words.begin(); |
| 68 it != words.end(); ++it) { | 70 it != words.end(); ++it) { |
| 69 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); | 71 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); |
| 70 } | 72 } |
| 71 | 73 |
| 74 ResetExtractor(3 /* max shingles per page */); |
| 75 } |
| 76 |
| 77 void ResetExtractor(size_t max_shingles_per_page) { |
| 72 extractor_.reset(new PhishingTermFeatureExtractor( | 78 extractor_.reset(new PhishingTermFeatureExtractor( |
| 73 &term_hashes_, | 79 &term_hashes_, |
| 74 &word_hashes_, | 80 &word_hashes_, |
| 75 3 /* max_words_per_term */, | 81 3 /* max_words_per_term */, |
| 76 kMurmurHash3Seed, | 82 kMurmurHash3Seed, |
| 83 max_shingles_per_page, |
| 84 4 /* shingle_size */, |
| 77 &clock_)); | 85 &clock_)); |
| 78 } | 86 } |
| 79 | 87 |
| 80 // Runs the TermFeatureExtractor on |page_text|, waiting for the | 88 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
| 81 // completion callback. Returns the success boolean from the callback. | 89 // completion callback. Returns the success boolean from the callback. |
| 82 bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { | 90 bool ExtractFeatures(const base::string16* page_text, |
| 91 FeatureMap* features, |
| 92 std::set<uint32>* shingle_hashes) { |
| 83 success_ = false; | 93 success_ = false; |
| 84 extractor_->ExtractFeatures( | 94 extractor_->ExtractFeatures( |
| 85 page_text, | 95 page_text, |
| 86 features, | 96 features, |
| 97 shingle_hashes, |
| 87 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| 88 base::Unretained(this))); | 99 base::Unretained(this))); |
| 89 msg_loop_.Run(); | 100 msg_loop_.Run(); |
| 90 return success_; | 101 return success_; |
| 91 } | 102 } |
| 92 | 103 |
| 93 void PartialExtractFeatures(const base::string16* page_text, | 104 void PartialExtractFeatures(const base::string16* page_text, |
| 94 FeatureMap* features) { | 105 FeatureMap* features, |
| 106 std::set<uint32>* shingle_hashes) { |
| 95 extractor_->ExtractFeatures( | 107 extractor_->ExtractFeatures( |
| 96 page_text, | 108 page_text, |
| 97 features, | 109 features, |
| 110 shingle_hashes, |
| 98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 111 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| 99 base::Unretained(this))); | 112 base::Unretained(this))); |
| 100 msg_loop_.PostTask( | 113 msg_loop_.PostTask( |
| 101 FROM_HERE, | 114 FROM_HERE, |
| 102 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, | 115 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, |
| 103 base::Unretained(this))); | 116 base::Unretained(this))); |
| 104 msg_loop_.RunUntilIdle(); | 117 msg_loop_.RunUntilIdle(); |
| 105 } | 118 } |
| 106 | 119 |
| 107 // Completion callback for feature extraction. | 120 // Completion callback for feature extraction. |
| (...skipping 14 matching lines...) Expand all Loading... |
| 122 base::hash_set<uint32> word_hashes_; | 135 base::hash_set<uint32> word_hashes_; |
| 123 bool success_; // holds the success value from ExtractFeatures | 136 bool success_; // holds the success value from ExtractFeatures |
| 124 }; | 137 }; |
| 125 | 138 |
| 126 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { | 139 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| 127 // This test doesn't exercise the extraction timing. | 140 // This test doesn't exercise the extraction timing. |
| 128 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 141 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| 129 | 142 |
| 130 base::string16 page_text = ASCIIToUTF16("blah"); | 143 base::string16 page_text = ASCIIToUTF16("blah"); |
| 131 FeatureMap expected_features; // initially empty | 144 FeatureMap expected_features; // initially empty |
| 145 std::set<uint32> expected_shingle_hashes; |
| 132 | 146 |
| 133 FeatureMap features; | 147 FeatureMap features; |
| 134 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 148 std::set<uint32> shingle_hashes; |
| 149 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 135 ExpectFeatureMapsAreEqual(features, expected_features); | 150 ExpectFeatureMapsAreEqual(features, expected_features); |
| 151 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 136 | 152 |
| 137 page_text = ASCIIToUTF16("one one"); | 153 page_text = ASCIIToUTF16("one one"); |
| 138 expected_features.Clear(); | 154 expected_features.Clear(); |
| 139 expected_features.AddBooleanFeature(features::kPageTerm + | 155 expected_features.AddBooleanFeature(features::kPageTerm + |
| 140 std::string("one")); | 156 std::string("one")); |
| 141 expected_features.AddBooleanFeature(features::kPageTerm + | 157 expected_features.AddBooleanFeature(features::kPageTerm + |
| 142 std::string("one one")); | 158 std::string("one one")); |
| 159 expected_shingle_hashes.clear(); |
| 143 | 160 |
| 144 features.Clear(); | 161 features.Clear(); |
| 145 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 162 shingle_hashes.clear(); |
| 163 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 146 ExpectFeatureMapsAreEqual(features, expected_features); | 164 ExpectFeatureMapsAreEqual(features, expected_features); |
| 165 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 147 | 166 |
| 148 page_text = ASCIIToUTF16("bla bla multi word test bla"); | 167 page_text = ASCIIToUTF16("bla bla multi word test bla"); |
| 149 expected_features.Clear(); | 168 expected_features.Clear(); |
| 150 expected_features.AddBooleanFeature(features::kPageTerm + | 169 expected_features.AddBooleanFeature(features::kPageTerm + |
| 151 std::string("multi word test")); | 170 std::string("multi word test")); |
| 171 expected_shingle_hashes.clear(); |
| 172 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ", |
| 173 kMurmurHash3Seed)); |
| 174 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ", |
| 175 kMurmurHash3Seed)); |
| 176 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ", |
| 177 kMurmurHash3Seed)); |
| 152 | 178 |
| 153 features.Clear(); | 179 features.Clear(); |
| 154 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 180 shingle_hashes.clear(); |
| 181 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 155 ExpectFeatureMapsAreEqual(features, expected_features); | 182 ExpectFeatureMapsAreEqual(features, expected_features); |
| 183 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 156 | 184 |
| 157 // This text has all of the words for one of the terms, but they are | 185 // This text has all of the words for one of the terms, but they are |
| 158 // not in the correct order. | 186 // not in the correct order. |
| 159 page_text = ASCIIToUTF16("bla bla test word multi bla"); | 187 page_text = ASCIIToUTF16("bla bla test word multi bla"); |
| 160 expected_features.Clear(); | 188 expected_features.Clear(); |
| 189 expected_shingle_hashes.clear(); |
| 190 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ", |
| 191 kMurmurHash3Seed)); |
| 192 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ", |
| 193 kMurmurHash3Seed)); |
| 194 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ", |
| 195 kMurmurHash3Seed)); |
| 161 | 196 |
| 162 features.Clear(); | 197 features.Clear(); |
| 163 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 198 shingle_hashes.clear(); |
| 199 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 164 ExpectFeatureMapsAreEqual(features, expected_features); | 200 ExpectFeatureMapsAreEqual(features, expected_features); |
| 201 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 165 | 202 |
| 203 // Test various separators. |
| 166 page_text = ASCIIToUTF16("Capitalization plus non-space\n" | 204 page_text = ASCIIToUTF16("Capitalization plus non-space\n" |
| 167 "separator... punctuation!"); | 205 "separator... punctuation!"); |
| 168 expected_features.Clear(); | 206 expected_features.Clear(); |
| 169 expected_features.AddBooleanFeature(features::kPageTerm + | 207 expected_features.AddBooleanFeature(features::kPageTerm + |
| 170 std::string("capitalization")); | 208 std::string("capitalization")); |
| 171 expected_features.AddBooleanFeature(features::kPageTerm + | 209 expected_features.AddBooleanFeature(features::kPageTerm + |
| 172 std::string("space")); | 210 std::string("space")); |
| 173 expected_features.AddBooleanFeature(features::kPageTerm + | 211 expected_features.AddBooleanFeature(features::kPageTerm + |
| 174 std::string("separator")); | 212 std::string("separator")); |
| 175 expected_features.AddBooleanFeature(features::kPageTerm + | 213 expected_features.AddBooleanFeature(features::kPageTerm + |
| 176 std::string("punctuation")); | 214 std::string("punctuation")); |
| 215 expected_shingle_hashes.clear(); |
| 216 expected_shingle_hashes.insert( |
| 217 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed)); |
| 218 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ", |
| 219 kMurmurHash3Seed)); |
| 220 expected_shingle_hashes.insert( |
| 221 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed)); |
| 177 | 222 |
| 178 features.Clear(); | 223 features.Clear(); |
| 179 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 224 shingle_hashes.clear(); |
| 225 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 180 ExpectFeatureMapsAreEqual(features, expected_features); | 226 ExpectFeatureMapsAreEqual(features, expected_features); |
| 227 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 228 |
| 229 // Test a page with too many words; we should only get 3 shingle hashes. |
| 230 page_text = ASCIIToUTF16("This page has way too many words."); |
| 231 expected_features.Clear(); |
| 232 expected_shingle_hashes.clear(); |
| 233 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", |
| 234 kMurmurHash3Seed)); |
| 235 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", |
| 236 kMurmurHash3Seed)); |
| 237 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", |
| 238 kMurmurHash3Seed)); |
| 239 |
| 240 features.Clear(); |
| 241 shingle_hashes.clear(); |
| 242 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 243 ExpectFeatureMapsAreEqual(features, expected_features); |
| 244 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 181 | 245 |
| 182 // Test with empty page text. | 246 // Test with empty page text. |
| 183 page_text = base::string16(); | 247 page_text = base::string16(); |
| 184 expected_features.Clear(); | 248 expected_features.Clear(); |
| 249 expected_shingle_hashes.clear(); |
| 185 features.Clear(); | 250 features.Clear(); |
| 186 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 251 shingle_hashes.clear(); |
| 252 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 187 ExpectFeatureMapsAreEqual(features, expected_features); | 253 ExpectFeatureMapsAreEqual(features, expected_features); |
| 254 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 188 | 255 |
| 189 // Chinese translation of the phrase "hello goodbye". This tests that | 256 // Chinese translation of the phrase "hello goodbye hello goodbye". This tests |
| 190 // we can correctly separate terms in languages that don't use spaces. | 257 // that we can correctly separate terms in languages that don't use spaces. |
| 191 page_text = | 258 page_text = |
| 192 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); | 259 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81" |
| 260 "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
| 193 expected_features.Clear(); | 261 expected_features.Clear(); |
| 194 expected_features.AddBooleanFeature( | 262 expected_features.AddBooleanFeature( |
| 195 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); | 263 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
| 196 expected_features.AddBooleanFeature( | 264 expected_features.AddBooleanFeature( |
| 197 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); | 265 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
| 266 expected_shingle_hashes.clear(); |
| 267 expected_shingle_hashes.insert(MurmurHash3String( |
| 268 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 " |
| 269 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed)); |
| 198 | 270 |
| 199 features.Clear(); | 271 features.Clear(); |
| 200 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 272 shingle_hashes.clear(); |
| 273 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 201 ExpectFeatureMapsAreEqual(features, expected_features); | 274 ExpectFeatureMapsAreEqual(features, expected_features); |
| 275 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 202 } | 276 } |
| 203 | 277 |
| 204 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { | 278 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
| 205 // For this test, we'll cause the feature extraction to run multiple | 279 // For this test, we'll cause the feature extraction to run multiple |
| 206 // iterations by incrementing the clock. | 280 // iterations by incrementing the clock. |
| 281 ResetExtractor(200 /* max shingles per page */); |
| 207 | 282 |
| 208 // This page has a total of 30 words. For the features to be computed | 283 // This page has a total of 30 words. For the features to be computed |
| 209 // correctly, the extractor has to process the entire string of text. | 284 // correctly, the extractor has to process the entire string of text. |
| 210 base::string16 page_text(ASCIIToUTF16("one ")); | 285 base::string16 page_text(ASCIIToUTF16("one ")); |
| 211 for (int i = 0; i < 28; ++i) { | 286 for (int i = 0; i < 28; ++i) { |
| 212 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 287 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 213 } | 288 } |
| 214 page_text.append(ASCIIToUTF16("two")); | 289 page_text.append(ASCIIToUTF16("two")); |
| 215 | 290 |
| 216 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. | 291 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. |
| (...skipping 21 matching lines...) Expand all Loading... |
| 238 // Time check after the next 5 words. | 313 // Time check after the next 5 words. |
| 239 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) | 314 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) |
| 240 // A final check for the histograms. | 315 // A final check for the histograms. |
| 241 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); | 316 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); |
| 242 | 317 |
| 243 FeatureMap expected_features; | 318 FeatureMap expected_features; |
| 244 expected_features.AddBooleanFeature(features::kPageTerm + | 319 expected_features.AddBooleanFeature(features::kPageTerm + |
| 245 std::string("one")); | 320 std::string("one")); |
| 246 expected_features.AddBooleanFeature(features::kPageTerm + | 321 expected_features.AddBooleanFeature(features::kPageTerm + |
| 247 std::string("two")); | 322 std::string("two")); |
| 323 std::set<uint32> expected_shingle_hashes; |
| 324 expected_shingle_hashes.insert( |
| 325 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed)); |
| 326 expected_shingle_hashes.insert( |
| 327 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed)); |
| 328 expected_shingle_hashes.insert( |
| 329 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed)); |
| 330 expected_shingle_hashes.insert( |
| 331 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed)); |
| 332 expected_shingle_hashes.insert( |
| 333 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed)); |
| 334 expected_shingle_hashes.insert( |
| 335 MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed)); |
| 336 expected_shingle_hashes.insert( |
| 337 MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed)); |
| 338 expected_shingle_hashes.insert( |
| 339 MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed)); |
| 340 expected_shingle_hashes.insert( |
| 341 MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed)); |
| 342 expected_shingle_hashes.insert( |
| 343 MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed)); |
| 344 expected_shingle_hashes.insert( |
| 345 MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed)); |
| 346 expected_shingle_hashes.insert( |
| 347 MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed)); |
| 348 expected_shingle_hashes.insert( |
| 349 MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed)); |
| 350 expected_shingle_hashes.insert( |
| 351 MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed)); |
| 352 expected_shingle_hashes.insert( |
| 353 MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed)); |
| 354 expected_shingle_hashes.insert( |
| 355 MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed)); |
| 356 expected_shingle_hashes.insert( |
| 357 MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed)); |
| 358 expected_shingle_hashes.insert( |
| 359 MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed)); |
| 360 expected_shingle_hashes.insert( |
| 361 MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed)); |
| 362 expected_shingle_hashes.insert( |
| 363 MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed)); |
| 364 expected_shingle_hashes.insert( |
| 365 MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed)); |
| 366 expected_shingle_hashes.insert( |
| 367 MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed)); |
| 368 expected_shingle_hashes.insert( |
| 369 MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed)); |
| 370 expected_shingle_hashes.insert( |
| 371 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed)); |
| 372 expected_shingle_hashes.insert( |
| 373 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed)); |
| 374 expected_shingle_hashes.insert( |
| 375 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed)); |
| 376 expected_shingle_hashes.insert( |
| 377 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed)); |
| 248 | 378 |
| 249 FeatureMap features; | 379 FeatureMap features; |
| 250 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 380 std::set<uint32> shingle_hashes; |
| 381 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 251 ExpectFeatureMapsAreEqual(features, expected_features); | 382 ExpectFeatureMapsAreEqual(features, expected_features); |
| 383 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); |
| 252 // Make sure none of the mock expectations carry over to the next test. | 384 // Make sure none of the mock expectations carry over to the next test. |
| 253 ::testing::Mock::VerifyAndClearExpectations(&clock_); | 385 ::testing::Mock::VerifyAndClearExpectations(&clock_); |
| 254 | 386 |
| 255 // Now repeat the test with the same text, but advance the clock faster so | 387 // Now repeat the test with the same text, but advance the clock faster so |
| 256 // that the extraction time exceeds the maximum total time for the feature | 388 // that the extraction time exceeds the maximum total time for the feature |
| 257 // extractor. Extraction should fail. Note that this assumes | 389 // extractor. Extraction should fail. Note that this assumes |
| 258 // kMaxTotalTimeMs = 500. | 390 // kMaxTotalTimeMs = 500. |
| 259 EXPECT_CALL(clock_, Now()) | 391 EXPECT_CALL(clock_, Now()) |
| 260 // Time check at the start of extraction. | 392 // Time check at the start of extraction. |
| 261 .WillOnce(Return(now)) | 393 .WillOnce(Return(now)) |
| 262 // Time check at the start of the first chunk of work. | 394 // Time check at the start of the first chunk of work. |
| 263 .WillOnce(Return(now)) | 395 .WillOnce(Return(now)) |
| 264 // Time check after the first 5 words, | 396 // Time check after the first 5 words, |
| 265 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) | 397 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) |
| 266 // Time check at the start of the second chunk of work. | 398 // Time check at the start of the second chunk of work. |
| 267 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) | 399 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) |
| 268 // Time check after the next 5 words. This is over the limit. | 400 // Time check after the next 5 words. This is over the limit. |
| 269 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) | 401 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) |
| 270 // A final time check for the histograms. | 402 // A final time check for the histograms. |
| 271 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); | 403 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
| 272 | 404 |
| 273 features.Clear(); | 405 features.Clear(); |
| 274 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); | 406 shingle_hashes.clear(); |
| 407 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes)); |
| 275 } | 408 } |
| 276 | 409 |
| 277 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { | 410 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
| 278 scoped_ptr<base::string16> page_text( | 411 scoped_ptr<base::string16> page_text( |
| 279 new base::string16(ASCIIToUTF16("one "))); | 412 new base::string16(ASCIIToUTF16("one "))); |
| 280 for (int i = 0; i < 28; ++i) { | 413 for (int i = 0; i < 28; ++i) { |
| 281 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 414 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 282 } | 415 } |
| 283 | 416 |
| 284 base::TimeTicks now = base::TimeTicks::Now(); | 417 base::TimeTicks now = base::TimeTicks::Now(); |
| 285 EXPECT_CALL(clock_, Now()) | 418 EXPECT_CALL(clock_, Now()) |
| 286 // Time check at the start of extraction. | 419 // Time check at the start of extraction. |
| 287 .WillOnce(Return(now)) | 420 .WillOnce(Return(now)) |
| 288 // Time check at the start of the first chunk of work. | 421 // Time check at the start of the first chunk of work. |
| 289 .WillOnce(Return(now)) | 422 .WillOnce(Return(now)) |
| 290 // Time check after the first 5 words. | 423 // Time check after the first 5 words. |
| 291 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) | 424 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) |
| 292 // Time check after the next 5 words. This should be greater than | 425 // Time check after the next 5 words. This should be greater than |
| 293 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. | 426 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. |
| 294 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); | 427 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
| 295 | 428 |
| 296 FeatureMap features; | 429 FeatureMap features; |
| 430 std::set<uint32> shingle_hashes; |
| 297 // Extract first 10 words then stop. | 431 // Extract first 10 words then stop. |
| 298 PartialExtractFeatures(page_text.get(), &features); | 432 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); |
| 299 | 433 |
| 300 page_text.reset(new base::string16()); | 434 page_text.reset(new base::string16()); |
| 301 for (int i = 30; i < 58; ++i) { | 435 for (int i = 30; i < 58; ++i) { |
| 302 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 436 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 303 } | 437 } |
| 304 page_text->append(ASCIIToUTF16("multi word test ")); | 438 page_text->append(ASCIIToUTF16("multi word test ")); |
| 305 features.Clear(); | 439 features.Clear(); |
| 440 shingle_hashes.clear(); |
| 306 | 441 |
| 307 // This part doesn't exercise the extraction timing. | 442 // This part doesn't exercise the extraction timing. |
| 308 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 443 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| 309 | 444 |
| 310 // Now extract normally and make sure nothing breaks. | 445 // Now extract normally and make sure nothing breaks. |
| 311 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); | 446 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); |
| 312 | 447 |
| 313 FeatureMap expected_features; | 448 FeatureMap expected_features; |
| 314 expected_features.AddBooleanFeature(features::kPageTerm + | 449 expected_features.AddBooleanFeature(features::kPageTerm + |
| 315 std::string("multi word test")); | 450 std::string("multi word test")); |
| 316 ExpectFeatureMapsAreEqual(features, expected_features); | 451 ExpectFeatureMapsAreEqual(features, expected_features); |
| 317 } | 452 } |
| 318 | 453 |
| 319 } // namespace safe_browsing | 454 } // namespace safe_browsing |
| OLD | NEW |