OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 |
| 7 #include <string> |
| 8 |
| 9 #include "base/callback.h" |
| 10 #include "base/hash_tables.h" |
| 11 #include "base/message_loop.h" |
| 12 #include "base/scoped_ptr.h" |
| 13 #include "base/sha2.h" |
| 14 #include "base/string16.h" |
| 15 #include "base/stringprintf.h" |
| 16 #include "base/time.h" |
| 17 #include "base/utf_string_conversions.h" |
| 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 19 #include "chrome/renderer/safe_browsing/features.h" |
| 20 #include "testing/gmock/include/gmock/gmock.h" |
| 21 #include "testing/gtest/include/gtest/gtest.h" |
| 22 |
| 23 using ::testing::ContainerEq; |
| 24 using ::testing::Return; |
| 25 |
| 26 namespace safe_browsing { |
| 27 |
| 28 class PhishingTermFeatureExtractorTest : public ::testing::Test { |
| 29 protected: |
| 30 class MockClock : public FeatureExtractorClock { |
| 31 public: |
| 32 MOCK_METHOD0(Now, base::TimeTicks()); |
| 33 }; |
| 34 |
| 35 virtual void SetUp() { |
| 36 base::hash_set<std::string> terms; |
| 37 terms.insert("one"); |
| 38 terms.insert("one one"); |
| 39 terms.insert("two"); |
| 40 terms.insert("multi word test"); |
| 41 terms.insert("capitalization"); |
| 42 terms.insert("space"); |
| 43 terms.insert("separator"); |
| 44 terms.insert("punctuation"); |
| 45 // Chinese (translation of "hello") |
| 46 terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); |
| 47 // Chinese (translation of "goodbye") |
| 48 terms.insert("\xe5\x86\x8d\xe8\xa7\x81"); |
| 49 |
| 50 for (base::hash_set<std::string>::iterator it = terms.begin(); |
| 51 it != terms.end(); ++it) { |
| 52 term_hashes_.insert(base::SHA256HashString(*it)); |
| 53 } |
| 54 |
| 55 base::hash_set<std::string> words; |
| 56 words.insert("one"); |
| 57 words.insert("two"); |
| 58 words.insert("multi"); |
| 59 words.insert("word"); |
| 60 words.insert("test"); |
| 61 words.insert("capitalization"); |
| 62 words.insert("space"); |
| 63 words.insert("separator"); |
| 64 words.insert("punctuation"); |
| 65 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); |
| 66 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); |
| 67 |
| 68 for (base::hash_set<std::string>::iterator it = words.begin(); |
| 69 it != words.end(); ++it) { |
| 70 word_hashes_.insert(base::SHA256HashString(*it)); |
| 71 } |
| 72 |
| 73 clock_ = new MockClock(); |
| 74 extractor_.reset(new PhishingTermFeatureExtractor( |
| 75 &term_hashes_, |
| 76 &word_hashes_, |
| 77 3 /* max_words_per_term */, |
| 78 clock_)); |
| 79 } |
| 80 |
| 81 // Runs the TermFeatureExtractor on |page_text|, waiting for the |
| 82 // completion callback. Returns the success boolean from the callback. |
| 83 bool ExtractFeatures(const string16* page_text, FeatureMap* features) { |
| 84 success_ = false; |
| 85 extractor_->ExtractFeatures( |
| 86 page_text, |
| 87 features, |
| 88 NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone)); |
| 89 msg_loop_.Run(); |
| 90 return success_; |
| 91 } |
| 92 |
| 93 // Completion callback for feature extraction. |
| 94 void ExtractionDone(bool success) { |
| 95 success_ = success; |
| 96 msg_loop_.Quit(); |
| 97 } |
| 98 |
| 99 MessageLoop msg_loop_; |
| 100 scoped_ptr<PhishingTermFeatureExtractor> extractor_; |
| 101 base::hash_set<std::string> term_hashes_; |
| 102 base::hash_set<std::string> word_hashes_; |
| 103 MockClock* clock_; // owned by extractor_ |
| 104 bool success_; // holds the success value from ExtractFeatures |
| 105 }; |
| 106 |
| 107 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { |
| 108 // This test doesn't exercise the extraction timing. |
| 109 EXPECT_CALL(*clock_, Now()) |
| 110 .WillRepeatedly(Return(base::TimeTicks::Now())); |
| 111 |
| 112 string16 page_text = ASCIIToUTF16("blah"); |
| 113 FeatureMap expected_features; // initially empty |
| 114 |
| 115 FeatureMap features; |
| 116 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 117 EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); |
| 118 |
| 119 page_text = ASCIIToUTF16("one one"); |
| 120 expected_features.Clear(); |
| 121 expected_features.AddBooleanFeature(features::kPageTerm + |
| 122 std::string("one")); |
| 123 expected_features.AddBooleanFeature(features::kPageTerm + |
| 124 std::string("one one")); |
| 125 |
| 126 features.Clear(); |
| 127 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 128 EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); |
| 129 |
| 130 page_text = ASCIIToUTF16("bla bla multi word test bla"); |
| 131 expected_features.Clear(); |
| 132 expected_features.AddBooleanFeature(features::kPageTerm + |
| 133 std::string("multi word test")); |
| 134 |
| 135 features.Clear(); |
| 136 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 137 EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); |
| 138 |
| 139 // This text has all of the words for one of the terms, but they are |
| 140 // not in the correct order. |
| 141 page_text = ASCIIToUTF16("bla bla test word multi bla"); |
| 142 expected_features.Clear(); |
| 143 |
| 144 features.Clear(); |
| 145 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 146 EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); |
| 147 |
| 148 page_text = ASCIIToUTF16("Capitalization plus non-space\n" |
| 149 "separator... punctuation!"); |
| 150 expected_features.Clear(); |
| 151 expected_features.AddBooleanFeature(features::kPageTerm + |
| 152 std::string("capitalization")); |
| 153 expected_features.AddBooleanFeature(features::kPageTerm + |
| 154 std::string("space")); |
| 155 expected_features.AddBooleanFeature(features::kPageTerm + |
| 156 std::string("separator")); |
| 157 expected_features.AddBooleanFeature(features::kPageTerm + |
| 158 std::string("punctuation")); |
| 159 |
| 160 features.Clear(); |
| 161 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 162 EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); |
| 163 |
| 164 // Test with empty page text. |
| 165 page_text = string16(); |
| 166 expected_features.Clear(); |
| 167 features.Clear(); |
| 168 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 169 EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); |
| 170 |
| 171 // Chinese translation of the phrase "hello goodbye". This tests that |
| 172 // we can correctly separate terms in languages that don't use spaces. |
| 173 page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); |
| 174 expected_features.Clear(); |
| 175 expected_features.AddBooleanFeature( |
| 176 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); |
| 177 expected_features.AddBooleanFeature( |
| 178 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); |
| 179 |
| 180 features.Clear(); |
| 181 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 182 EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); |
| 183 } |
| 184 |
| 185 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { |
| 186 // For this test, we'll cause the feature extraction to run multiple |
| 187 // iterations by incrementing the clock. |
| 188 |
| 189 // This page has a total of 30 words. For the features to be computed |
| 190 // correctly, the extractor has to process the entire string of text. |
| 191 string16 page_text(ASCIIToUTF16("one ")); |
| 192 for (int i = 0; i < 28; ++i) { |
| 193 page_text.append(ASCIIToUTF16(StringPrintf("%d ", i))); |
| 194 } |
| 195 page_text.append(ASCIIToUTF16("two")); |
| 196 |
| 197 // Advance the clock 30 ms every 10 words processed, 10 ms between chunks. |
| 198 // Note that this assumes kClockCheckGranularity = 10 and |
| 199 // kMaxTimePerChunkMs = 50. |
| 200 base::TimeTicks now = base::TimeTicks::Now(); |
| 201 EXPECT_CALL(*clock_, Now()) |
| 202 // Time check at the start of extraction. |
| 203 .WillOnce(Return(now)) |
| 204 // Time check at the start of the first chunk of work. |
| 205 .WillOnce(Return(now)) |
| 206 // Time check after the first 10 words. |
| 207 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))) |
| 208 // Time check after the next 10 words. This is over the chunk |
| 209 // time limit, so a continuation task will be posted. |
| 210 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60))) |
| 211 // Time check at the start of the second chunk of work. |
| 212 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70))) |
| 213 // Time check after the next 10 words. |
| 214 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100))) |
| 215 // A final check for the histograms. |
| 216 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(101))); |
| 217 |
| 218 FeatureMap expected_features; |
| 219 expected_features.AddBooleanFeature(features::kPageTerm + |
| 220 std::string("one")); |
| 221 expected_features.AddBooleanFeature(features::kPageTerm + |
| 222 std::string("two")); |
| 223 |
| 224 FeatureMap features; |
| 225 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); |
| 226 EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); |
| 227 // Make sure none of the mock expectations carry over to the next test. |
| 228 ::testing::Mock::VerifyAndClearExpectations(clock_); |
| 229 |
| 230 // Now repeat the test with the same text, but advance the clock faster so |
| 231 // that the extraction time exceeds the maximum total time for the feature |
| 232 // extractor. Extraction should fail. Note that this assumes |
| 233 // kMaxTotalTimeMs = 500. |
| 234 EXPECT_CALL(*clock_, Now()) |
| 235 // Time check at the start of extraction. |
| 236 .WillOnce(Return(now)) |
| 237 // Time check at the start of the first chunk of work. |
| 238 .WillOnce(Return(now)) |
| 239 // Time check after the first 10 words, |
| 240 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) |
| 241 // Time check at the start of the second chunk of work. |
| 242 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) |
| 243 // Time check after the next 10 words. This is over the limit. |
| 244 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) |
| 245 // A final time check for the histograms. |
| 246 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); |
| 247 |
| 248 features.Clear(); |
| 249 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); |
| 250 } |
| 251 |
| 252 } // namespace safe_browsing |
OLD | NEW |