Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <string> | 7 #include <string> |
| 8 | 8 |
| 9 #include "base/bind.h" | 9 #include "base/bind.h" |
| 10 #include "base/callback.h" | 10 #include "base/callback.h" |
| 11 #include "base/containers/hash_tables.h" | 11 #include "base/containers/hash_tables.h" |
| 12 #include "base/memory/scoped_ptr.h" | 12 #include "base/memory/scoped_ptr.h" |
| 13 #include "base/message_loop/message_loop.h" | 13 #include "base/message_loop/message_loop.h" |
| 14 #include "base/strings/string16.h" | 14 #include "base/strings/string16.h" |
| 15 #include "base/strings/stringprintf.h" | 15 #include "base/strings/stringprintf.h" |
| 16 #include "base/strings/utf_string_conversions.h" | 16 #include "base/strings/utf_string_conversions.h" |
| 17 #include "base/time/time.h" | 17 #include "base/time/time.h" |
| 18 #include "chrome/renderer/safe_browsing/features.h" | 18 #include "chrome/renderer/safe_browsing/features.h" |
| 19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" | 19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" |
| 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
| 21 #include "chrome/renderer/safe_browsing/test_utils.h" | 21 #include "chrome/renderer/safe_browsing/test_utils.h" |
| 22 #include "crypto/sha2.h" | 22 #include "crypto/sha2.h" |
| 23 #include "testing/gmock/include/gmock/gmock.h" | 23 #include "testing/gmock/include/gmock/gmock.h" |
| 24 #include "testing/gtest/include/gtest/gtest.h" | 24 #include "testing/gtest/include/gtest/gtest.h" |
| 25 | 25 |
| 26 using base::ASCIIToUTF16; | 26 using base::ASCIIToUTF16; |
| 27 using ::testing::Return; | 27 using ::testing::Return; |
| 28 | 28 |
| 29 | |
| 30 static const uint32 kMurmurHash3Seed = 2777808611U; | |
| 31 | |
| 29 namespace safe_browsing { | 32 namespace safe_browsing { |
| 30 | 33 |
| 31 class PhishingTermFeatureExtractorTest : public ::testing::Test { | 34 class PhishingTermFeatureExtractorTest : public ::testing::Test { |
| 32 protected: | 35 protected: |
| 33 virtual void SetUp() { | 36 virtual void SetUp() { |
| 34 base::hash_set<std::string> terms; | 37 base::hash_set<std::string> terms; |
| 35 terms.insert("one"); | 38 terms.insert("one"); |
| 36 terms.insert("one one"); | 39 terms.insert("one one"); |
| 37 terms.insert("two"); | 40 terms.insert("two"); |
| 38 terms.insert("multi word test"); | 41 terms.insert("multi word test"); |
| (...skipping 17 matching lines...) Expand all Loading... | |
| 56 words.insert("multi"); | 59 words.insert("multi"); |
| 57 words.insert("word"); | 60 words.insert("word"); |
| 58 words.insert("test"); | 61 words.insert("test"); |
| 59 words.insert("capitalization"); | 62 words.insert("capitalization"); |
| 60 words.insert("space"); | 63 words.insert("space"); |
| 61 words.insert("separator"); | 64 words.insert("separator"); |
| 62 words.insert("punctuation"); | 65 words.insert("punctuation"); |
| 63 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); | 66 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); |
| 64 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); | 67 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); |
| 65 | 68 |
| 66 static const uint32 kMurmurHash3Seed = 2777808611U; | |
| 67 for (base::hash_set<std::string>::iterator it = words.begin(); | 69 for (base::hash_set<std::string>::iterator it = words.begin(); |
| 68 it != words.end(); ++it) { | 70 it != words.end(); ++it) { |
| 69 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); | 71 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); |
| 70 } | 72 } |
| 71 | 73 |
| 72 extractor_.reset(new PhishingTermFeatureExtractor( | 74 extractor_.reset(new PhishingTermFeatureExtractor( |
| 73 &term_hashes_, | 75 &term_hashes_, |
| 74 &word_hashes_, | 76 &word_hashes_, |
| 77 3 /* max_hashes_per_page */, | |
| 75 3 /* max_words_per_term */, | 78 3 /* max_words_per_term */, |
| 76 kMurmurHash3Seed, | 79 kMurmurHash3Seed, |
| 80 4 /* shingle_size */, | |
| 77 &clock_)); | 81 &clock_)); |
| 78 } | 82 } |
| 79 | 83 |
| 80 // Runs the TermFeatureExtractor on |page_text|, waiting for the | 84 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
| 81 // completion callback. Returns the success boolean from the callback. | 85 // completion callback. Returns the success boolean from the callback. |
| 82 bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) { | 86 bool ExtractFeatures(const base::string16* page_text, |
| 87 FeatureMap* features, | |
| 88 std::set<uint32>* shingle_hashes) { | |
| 83 success_ = false; | 89 success_ = false; |
| 84 extractor_->ExtractFeatures( | 90 extractor_->ExtractFeatures( |
| 85 page_text, | 91 page_text, |
| 86 features, | 92 features, |
| 93 shingle_hashes, | |
| 87 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 94 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| 88 base::Unretained(this))); | 95 base::Unretained(this))); |
| 89 msg_loop_.Run(); | 96 msg_loop_.Run(); |
| 90 return success_; | 97 return success_; |
| 91 } | 98 } |
| 92 | 99 |
| 93 void PartialExtractFeatures(const base::string16* page_text, | 100 void PartialExtractFeatures(const base::string16* page_text, |
| 94 FeatureMap* features) { | 101 FeatureMap* features, |
| 102 std::set<uint32>* shingle_hashes) { | |
| 95 extractor_->ExtractFeatures( | 103 extractor_->ExtractFeatures( |
| 96 page_text, | 104 page_text, |
| 97 features, | 105 features, |
| 106 shingle_hashes, | |
| 98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, | 107 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, |
| 99 base::Unretained(this))); | 108 base::Unretained(this))); |
| 100 msg_loop_.PostTask( | 109 msg_loop_.PostTask( |
| 101 FROM_HERE, | 110 FROM_HERE, |
| 102 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, | 111 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, |
| 103 base::Unretained(this))); | 112 base::Unretained(this))); |
| 104 msg_loop_.RunUntilIdle(); | 113 msg_loop_.RunUntilIdle(); |
| 105 } | 114 } |
| 106 | 115 |
| 107 // Completion callback for feature extraction. | 116 // Completion callback for feature extraction. |
| (...skipping 14 matching lines...) Expand all Loading... | |
| 122 base::hash_set<uint32> word_hashes_; | 131 base::hash_set<uint32> word_hashes_; |
| 123 bool success_; // holds the success value from ExtractFeatures | 132 bool success_; // holds the success value from ExtractFeatures |
| 124 }; | 133 }; |
| 125 | 134 |
| 126 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { | 135 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| 127 // This test doesn't exercise the extraction timing. | 136 // This test doesn't exercise the extraction timing. |
| 128 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 137 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| 129 | 138 |
| 130 base::string16 page_text = ASCIIToUTF16("blah"); | 139 base::string16 page_text = ASCIIToUTF16("blah"); |
| 131 FeatureMap expected_features; // initially empty | 140 FeatureMap expected_features; // initially empty |
| 141 std::set<uint32> expected_shingle_hashes; | |
| 132 | 142 |
| 133 FeatureMap features; | 143 FeatureMap features; |
| 134 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 144 std::set<uint32> shingle_hashes; |
| 145 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 135 ExpectFeatureMapsAreEqual(features, expected_features); | 146 ExpectFeatureMapsAreEqual(features, expected_features); |
| 147 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
| 136 | 148 |
| 137 page_text = ASCIIToUTF16("one one"); | 149 page_text = ASCIIToUTF16("one one"); |
| 138 expected_features.Clear(); | 150 expected_features.Clear(); |
| 139 expected_features.AddBooleanFeature(features::kPageTerm + | 151 expected_features.AddBooleanFeature(features::kPageTerm + |
| 140 std::string("one")); | 152 std::string("one")); |
| 141 expected_features.AddBooleanFeature(features::kPageTerm + | 153 expected_features.AddBooleanFeature(features::kPageTerm + |
| 142 std::string("one one")); | 154 std::string("one one")); |
| 155 expected_shingle_hashes.clear(); | |
| 143 | 156 |
| 144 features.Clear(); | 157 features.Clear(); |
| 145 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 158 shingle_hashes.clear(); |
| 159 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 146 ExpectFeatureMapsAreEqual(features, expected_features); | 160 ExpectFeatureMapsAreEqual(features, expected_features); |
| 161 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
| 147 | 162 |
| 148 page_text = ASCIIToUTF16("bla bla multi word test bla"); | 163 page_text = ASCIIToUTF16("bla bla multi word test bla"); |
| 149 expected_features.Clear(); | 164 expected_features.Clear(); |
| 150 expected_features.AddBooleanFeature(features::kPageTerm + | 165 expected_features.AddBooleanFeature(features::kPageTerm + |
| 151 std::string("multi word test")); | 166 std::string("multi word test")); |
| 167 expected_shingle_hashes.clear(); | |
| 168 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ", | |
| 169 kMurmurHash3Seed)); | |
| 170 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ", | |
| 171 kMurmurHash3Seed)); | |
| 172 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ", | |
| 173 kMurmurHash3Seed)); | |
| 152 | 174 |
| 153 features.Clear(); | 175 features.Clear(); |
| 154 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 176 shingle_hashes.clear(); |
| 177 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 155 ExpectFeatureMapsAreEqual(features, expected_features); | 178 ExpectFeatureMapsAreEqual(features, expected_features); |
| 179 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
| 156 | 180 |
| 157 // This text has all of the words for one of the terms, but they are | 181 // This text has all of the words for one of the terms, but they are |
| 158 // not in the correct order. | 182 // not in the correct order. |
| 159 page_text = ASCIIToUTF16("bla bla test word multi bla"); | 183 page_text = ASCIIToUTF16("bla bla test word multi bla"); |
| 160 expected_features.Clear(); | 184 expected_features.Clear(); |
| 185 expected_shingle_hashes.clear(); | |
| 186 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ", | |
| 187 kMurmurHash3Seed)); | |
| 188 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ", | |
| 189 kMurmurHash3Seed)); | |
| 190 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ", | |
| 191 kMurmurHash3Seed)); | |
| 161 | 192 |
| 162 features.Clear(); | 193 features.Clear(); |
| 163 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 194 shingle_hashes.clear(); |
| 195 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 164 ExpectFeatureMapsAreEqual(features, expected_features); | 196 ExpectFeatureMapsAreEqual(features, expected_features); |
| 197 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
| 165 | 198 |
| 199 // Test various separators. | |
| 166 page_text = ASCIIToUTF16("Capitalization plus non-space\n" | 200 page_text = ASCIIToUTF16("Capitalization plus non-space\n" |
| 167 "separator... punctuation!"); | 201 "separator... punctuation!"); |
| 168 expected_features.Clear(); | 202 expected_features.Clear(); |
| 169 expected_features.AddBooleanFeature(features::kPageTerm + | 203 expected_features.AddBooleanFeature(features::kPageTerm + |
| 170 std::string("capitalization")); | 204 std::string("capitalization")); |
| 171 expected_features.AddBooleanFeature(features::kPageTerm + | 205 expected_features.AddBooleanFeature(features::kPageTerm + |
| 172 std::string("space")); | 206 std::string("space")); |
| 173 expected_features.AddBooleanFeature(features::kPageTerm + | 207 expected_features.AddBooleanFeature(features::kPageTerm + |
| 174 std::string("separator")); | 208 std::string("separator")); |
| 175 expected_features.AddBooleanFeature(features::kPageTerm + | 209 expected_features.AddBooleanFeature(features::kPageTerm + |
| 176 std::string("punctuation")); | 210 std::string("punctuation")); |
| 211 expected_shingle_hashes.clear(); | |
| 212 expected_shingle_hashes.insert( | |
| 213 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed)); | |
| 214 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ", | |
| 215 kMurmurHash3Seed)); | |
| 216 expected_shingle_hashes.insert( | |
| 217 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed)); | |
| 177 | 218 |
| 178 features.Clear(); | 219 features.Clear(); |
| 179 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 220 shingle_hashes.clear(); |
| 221 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 180 ExpectFeatureMapsAreEqual(features, expected_features); | 222 ExpectFeatureMapsAreEqual(features, expected_features); |
| 223 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
| 224 | |
| 225 // Test a page with too many words and we should only get 3 shingle hashes. | |
| 226 page_text = ASCIIToUTF16("This page has way too many words."); | |
| 227 expected_features.Clear(); | |
| 228 expected_shingle_hashes.clear(); | |
| 229 expected_shingle_hashes.insert(MurmurHash3String("this page has way ", | |
| 230 kMurmurHash3Seed)); | |
| 231 expected_shingle_hashes.insert(MurmurHash3String("page has way too ", | |
| 232 kMurmurHash3Seed)); | |
| 233 expected_shingle_hashes.insert(MurmurHash3String("has way too many ", | |
| 234 kMurmurHash3Seed)); | |
| 235 | |
| 236 features.Clear(); | |
| 237 shingle_hashes.clear(); | |
| 238 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 239 ExpectFeatureMapsAreEqual(features, expected_features); | |
| 240 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
| 181 | 241 |
| 182 // Test with empty page text. | 242 // Test with empty page text. |
| 183 page_text = base::string16(); | 243 page_text = base::string16(); |
| 184 expected_features.Clear(); | 244 expected_features.Clear(); |
| 245 expected_shingle_hashes.clear(); | |
| 185 features.Clear(); | 246 features.Clear(); |
| 186 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 247 shingle_hashes.clear(); |
| 248 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 187 ExpectFeatureMapsAreEqual(features, expected_features); | 249 ExpectFeatureMapsAreEqual(features, expected_features); |
| 250 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
| 188 | 251 |
| 189 // Chinese translation of the phrase "hello goodbye". This tests that | 252 // Chinese translation of the phrase "hello goodbye". This tests that |
| 190 // we can correctly separate terms in languages that don't use spaces. | 253 // we can correctly separate terms in languages that don't use spaces. |
|
mattm
2014/05/06 01:00:14
Seems we should also have a similar test with enou
zysxqn
2014/05/06 20:56:57
Done.
| |
| 191 page_text = | 254 page_text = |
| 192 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); | 255 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
| 193 expected_features.Clear(); | 256 expected_features.Clear(); |
| 194 expected_features.AddBooleanFeature( | 257 expected_features.AddBooleanFeature( |
| 195 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); | 258 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
| 196 expected_features.AddBooleanFeature( | 259 expected_features.AddBooleanFeature( |
| 197 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); | 260 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
| 261 expected_shingle_hashes.clear(); | |
| 198 | 262 |
| 199 features.Clear(); | 263 features.Clear(); |
| 200 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 264 shingle_hashes.clear(); |
| 265 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 201 ExpectFeatureMapsAreEqual(features, expected_features); | 266 ExpectFeatureMapsAreEqual(features, expected_features); |
| 267 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes)); | |
| 202 } | 268 } |
| 203 | 269 |
| 204 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { | 270 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
| 205 // For this test, we'll cause the feature extraction to run multiple | 271 // For this test, we'll cause the feature extraction to run multiple |
| 206 // iterations by incrementing the clock. | 272 // iterations by incrementing the clock. We don't check shingle hashes here |
| 273 // since its size is too large. | |
|
mattm
2014/05/06 01:00:14
what do you mean by size is too large? That you do
zysxqn
2014/05/06 20:56:57
Done.
| |
| 207 | 274 |
| 208 // This page has a total of 30 words. For the features to be computed | 275 // This page has a total of 30 words. For the features to be computed |
| 209 // correctly, the extractor has to process the entire string of text. | 276 // correctly, the extractor has to process the entire string of text. |
| 210 base::string16 page_text(ASCIIToUTF16("one ")); | 277 base::string16 page_text(ASCIIToUTF16("one ")); |
| 211 for (int i = 0; i < 28; ++i) { | 278 for (int i = 0; i < 28; ++i) { |
| 212 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 279 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 213 } | 280 } |
| 214 page_text.append(ASCIIToUTF16("two")); | 281 page_text.append(ASCIIToUTF16("two")); |
| 215 | 282 |
| 216 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. | 283 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. |
| (...skipping 23 matching lines...) Expand all Loading... | |
| 240 // A final check for the histograms. | 307 // A final check for the histograms. |
| 241 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); | 308 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); |
| 242 | 309 |
| 243 FeatureMap expected_features; | 310 FeatureMap expected_features; |
| 244 expected_features.AddBooleanFeature(features::kPageTerm + | 311 expected_features.AddBooleanFeature(features::kPageTerm + |
| 245 std::string("one")); | 312 std::string("one")); |
| 246 expected_features.AddBooleanFeature(features::kPageTerm + | 313 expected_features.AddBooleanFeature(features::kPageTerm + |
| 247 std::string("two")); | 314 std::string("two")); |
| 248 | 315 |
| 249 FeatureMap features; | 316 FeatureMap features; |
| 250 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); | 317 std::set<uint32> shingle_hashes; |
| 318 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 251 ExpectFeatureMapsAreEqual(features, expected_features); | 319 ExpectFeatureMapsAreEqual(features, expected_features); |
| 252 // Make sure none of the mock expectations carry over to the next test. | 320 // Make sure none of the mock expectations carry over to the next test. |
| 253 ::testing::Mock::VerifyAndClearExpectations(&clock_); | 321 ::testing::Mock::VerifyAndClearExpectations(&clock_); |
| 254 | 322 |
| 255 // Now repeat the test with the same text, but advance the clock faster so | 323 // Now repeat the test with the same text, but advance the clock faster so |
| 256 // that the extraction time exceeds the maximum total time for the feature | 324 // that the extraction time exceeds the maximum total time for the feature |
| 257 // extractor. Extraction should fail. Note that this assumes | 325 // extractor. Extraction should fail. Note that this assumes |
| 258 // kMaxTotalTimeMs = 500. | 326 // kMaxTotalTimeMs = 500. |
| 259 EXPECT_CALL(clock_, Now()) | 327 EXPECT_CALL(clock_, Now()) |
| 260 // Time check at the start of extraction. | 328 // Time check at the start of extraction. |
| 261 .WillOnce(Return(now)) | 329 .WillOnce(Return(now)) |
| 262 // Time check at the start of the first chunk of work. | 330 // Time check at the start of the first chunk of work. |
| 263 .WillOnce(Return(now)) | 331 .WillOnce(Return(now)) |
| 264 // Time check after the first 5 words, | 332 // Time check after the first 5 words, |
| 265 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) | 333 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) |
| 266 // Time check at the start of the second chunk of work. | 334 // Time check at the start of the second chunk of work. |
| 267 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) | 335 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) |
| 268 // Time check after the next 5 words. This is over the limit. | 336 // Time check after the next 5 words. This is over the limit. |
| 269 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) | 337 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) |
| 270 // A final time check for the histograms. | 338 // A final time check for the histograms. |
| 271 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); | 339 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
| 272 | 340 |
| 273 features.Clear(); | 341 features.Clear(); |
| 274 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); | 342 shingle_hashes.clear(); |
| 343 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes)); | |
| 275 } | 344 } |
| 276 | 345 |
| 277 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { | 346 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { |
| 278 scoped_ptr<base::string16> page_text( | 347 scoped_ptr<base::string16> page_text( |
| 279 new base::string16(ASCIIToUTF16("one "))); | 348 new base::string16(ASCIIToUTF16("one "))); |
| 280 for (int i = 0; i < 28; ++i) { | 349 for (int i = 0; i < 28; ++i) { |
| 281 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 350 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 282 } | 351 } |
| 283 | 352 |
| 284 base::TimeTicks now = base::TimeTicks::Now(); | 353 base::TimeTicks now = base::TimeTicks::Now(); |
| 285 EXPECT_CALL(clock_, Now()) | 354 EXPECT_CALL(clock_, Now()) |
| 286 // Time check at the start of extraction. | 355 // Time check at the start of extraction. |
| 287 .WillOnce(Return(now)) | 356 .WillOnce(Return(now)) |
| 288 // Time check at the start of the first chunk of work. | 357 // Time check at the start of the first chunk of work. |
| 289 .WillOnce(Return(now)) | 358 .WillOnce(Return(now)) |
| 290 // Time check after the first 5 words. | 359 // Time check after the first 5 words. |
| 291 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) | 360 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) |
| 292 // Time check after the next 5 words. This should be greater than | 361 // Time check after the next 5 words. This should be greater than |
| 293 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. | 362 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. |
| 294 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); | 363 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); |
| 295 | 364 |
| 296 FeatureMap features; | 365 FeatureMap features; |
| 366 std::set<uint32> shingle_hashes; | |
| 297 // Extract first 10 words then stop. | 367 // Extract first 10 words then stop. |
| 298 PartialExtractFeatures(page_text.get(), &features); | 368 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes); |
| 299 | 369 |
| 300 page_text.reset(new base::string16()); | 370 page_text.reset(new base::string16()); |
| 301 for (int i = 30; i < 58; ++i) { | 371 for (int i = 30; i < 58; ++i) { |
| 302 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); | 372 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); |
| 303 } | 373 } |
| 304 page_text->append(ASCIIToUTF16("multi word test ")); | 374 page_text->append(ASCIIToUTF16("multi word test ")); |
| 305 features.Clear(); | 375 features.Clear(); |
| 376 shingle_hashes.clear(); | |
| 306 | 377 |
| 307 // This part doesn't exercise the extraction timing. | 378 // This part doesn't exercise the extraction timing. |
| 308 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); | 379 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); |
| 309 | 380 |
| 310 // Now extract normally and make sure nothing breaks. | 381 // Now extract normally and make sure nothing breaks. |
| 311 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); | 382 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes)); |
| 312 | 383 |
| 313 FeatureMap expected_features; | 384 FeatureMap expected_features; |
| 314 expected_features.AddBooleanFeature(features::kPageTerm + | 385 expected_features.AddBooleanFeature(features::kPageTerm + |
| 315 std::string("multi word test")); | 386 std::string("multi word test")); |
| 316 ExpectFeatureMapsAreEqual(features, expected_features); | 387 ExpectFeatureMapsAreEqual(features, expected_features); |
| 317 } | 388 } |
| 318 | 389 |
| 319 } // namespace safe_browsing | 390 } // namespace safe_browsing |
| OLD | NEW |