| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
 | 
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
 | 
| new file mode 100644
 | 
| index 0000000000000000000000000000000000000000..812fb935047c2e55db81103bf97d6342e6b576e2
 | 
| --- /dev/null
 | 
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
 | 
| @@ -0,0 +1,252 @@
 | 
| +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 | 
| +// Use of this source code is governed by a BSD-style license that can be
 | 
| +// found in the LICENSE file.
 | 
| +
 | 
| +#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
 | 
| +
 | 
| +#include <string>
 | 
| +
 | 
| +#include "base/callback.h"
 | 
| +#include "base/hash_tables.h"
 | 
| +#include "base/message_loop.h"
 | 
| +#include "base/scoped_ptr.h"
 | 
| +#include "base/sha2.h"
 | 
| +#include "base/string16.h"
 | 
| +#include "base/stringprintf.h"
 | 
| +#include "base/time.h"
 | 
| +#include "base/utf_string_conversions.h"
 | 
| +#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
 | 
| +#include "chrome/renderer/safe_browsing/features.h"
 | 
| +#include "testing/gmock/include/gmock/gmock.h"
 | 
| +#include "testing/gtest/include/gtest/gtest.h"
 | 
| +
 | 
| +using ::testing::ContainerEq;
 | 
| +using ::testing::Return;
 | 
| +
 | 
| +namespace safe_browsing {
 | 
| +
 | 
| +class PhishingTermFeatureExtractorTest : public ::testing::Test {
 | 
| + protected:
 | 
| +  class MockClock : public FeatureExtractorClock {
 | 
| +   public:
 | 
| +    MOCK_METHOD0(Now, base::TimeTicks());  // scripted via EXPECT_CALL
 | 
| +  };
 | 
| +
 | 
| +  virtual void SetUp() {  // Fills the hash sets and creates extractor_.
 | 
| +    base::hash_set<std::string> terms;
 | 
| +    terms.insert("one");
 | 
| +    terms.insert("one one");
 | 
| +    terms.insert("two");
 | 
| +    terms.insert("multi word test");
 | 
| +    terms.insert("capitalization");
 | 
| +    terms.insert("space");
 | 
| +    terms.insert("separator");
 | 
| +    terms.insert("punctuation");
 | 
| +    // Chinese (translation of "hello"), UTF-8 encoded
 | 
| +    terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
 | 
| +    // Chinese (translation of "goodbye"), UTF-8 encoded
 | 
| +    terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
 | 
| +    // The extractor is given hashes of the terms, not the terms themselves.
 | 
| +    for (base::hash_set<std::string>::iterator it = terms.begin();
 | 
| +         it != terms.end(); ++it) {
 | 
| +      term_hashes_.insert(base::SHA256HashString(*it));
 | 
| +    }
 | 
| +    // Every individual word that occurs in one of the terms above.
 | 
| +    base::hash_set<std::string> words;
 | 
| +    words.insert("one");
 | 
| +    words.insert("two");
 | 
| +    words.insert("multi");
 | 
| +    words.insert("word");
 | 
| +    words.insert("test");
 | 
| +    words.insert("capitalization");
 | 
| +    words.insert("space");
 | 
| +    words.insert("separator");
 | 
| +    words.insert("punctuation");
 | 
| +    words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
 | 
| +    words.insert("\xe5\x86\x8d\xe8\xa7\x81");
 | 
| +
 | 
| +    for (base::hash_set<std::string>::iterator it = words.begin();
 | 
| +         it != words.end(); ++it) {
 | 
| +      word_hashes_.insert(base::SHA256HashString(*it));
 | 
| +    }
 | 
| +
 | 
| +    clock_ = new MockClock();
 | 
| +    extractor_.reset(new PhishingTermFeatureExtractor(
 | 
| +        &term_hashes_,
 | 
| +        &word_hashes_,
 | 
| +        3 /* max_words_per_term */,
 | 
| +        clock_));
 | 
| +  }
 | 
| +
 | 
| +  // Runs the TermFeatureExtractor on |page_text|, waiting for the
 | 
| +  // completion callback.  Returns the success boolean from the callback.
 | 
| +  bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
 | 
| +    success_ = false;  // overwritten by ExtractionDone()
 | 
| +    extractor_->ExtractFeatures(
 | 
| +        page_text,
 | 
| +        features,
 | 
| +        NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone));
 | 
| +    msg_loop_.Run();  // runs until ExtractionDone() calls Quit()
 | 
| +    return success_;
 | 
| +  }
 | 
| +
 | 
| +  // Completion callback: records |success| and quits the message loop.
 | 
| +  void ExtractionDone(bool success) {
 | 
| +    success_ = success;
 | 
| +    msg_loop_.Quit();
 | 
| +  }
 | 
| +
 | 
| +  MessageLoop msg_loop_;
 | 
| +  scoped_ptr<PhishingTermFeatureExtractor> extractor_;
 | 
| +  base::hash_set<std::string> term_hashes_;
 | 
| +  base::hash_set<std::string> word_hashes_;
 | 
| +  MockClock* clock_;  // owned by extractor_
 | 
| +  bool success_;  // holds the success value passed to ExtractionDone
 | 
| +};
 | 
| +
 | 
| +TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
 | 
| +  // This test doesn't exercise the extraction timing, so the clock is frozen.
 | 
| +  EXPECT_CALL(*clock_, Now())
 | 
| +      .WillRepeatedly(Return(base::TimeTicks::Now()));
 | 
| +
 | 
| +  string16 page_text = ASCIIToUTF16("blah");  // matches no term
 | 
| +  FeatureMap expected_features;  // initially empty
 | 
| +
 | 
| +  FeatureMap features;
 | 
| +  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
 | 
| +  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 | 
| +
 | 
| +  page_text = ASCIIToUTF16("one one");  // matches "one" and "one one"
 | 
| +  expected_features.Clear();
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("one"));
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("one one"));
 | 
| +
 | 
| +  features.Clear();
 | 
| +  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
 | 
| +  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 | 
| +
 | 
| +  page_text = ASCIIToUTF16("bla bla multi word test bla");  // 3-word term
 | 
| +  expected_features.Clear();
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("multi word test"));
 | 
| +
 | 
| +  features.Clear();
 | 
| +  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
 | 
| +  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 | 
| +
 | 
| +  // This text has all of the words for one of the terms, but they are
 | 
| +  // not in the correct order.
 | 
| +  page_text = ASCIIToUTF16("bla bla test word multi bla");
 | 
| +  expected_features.Clear();
 | 
| +
 | 
| +  features.Clear();
 | 
| +  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
 | 
| +  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 | 
| +  // Terms match despite capitals, hyphens, newlines and punctuation.
 | 
| +  page_text = ASCIIToUTF16("Capitalization plus non-space\n"
 | 
| +                           "separator... punctuation!");
 | 
| +  expected_features.Clear();
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("capitalization"));
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("space"));
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("separator"));
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("punctuation"));
 | 
| +
 | 
| +  features.Clear();
 | 
| +  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
 | 
| +  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 | 
| +
 | 
| +  // Test with empty page text.
 | 
| +  page_text = string16();
 | 
| +  expected_features.Clear();
 | 
| +  features.Clear();
 | 
| +  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
 | 
| +  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 | 
| +
 | 
| +  // Chinese translation of the phrase "hello goodbye". This tests that
 | 
| +  // we can correctly separate terms in languages that don't use spaces.
 | 
| +  page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
 | 
| +  expected_features.Clear();
 | 
| +  expected_features.AddBooleanFeature(
 | 
| +      features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
 | 
| +  expected_features.AddBooleanFeature(
 | 
| +      features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
 | 
| +
 | 
| +  features.Clear();
 | 
| +  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
 | 
| +  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 | 
| +}
 | 
| +
 | 
| +TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
 | 
| +  // For this test, we'll cause the feature extraction to run multiple
 | 
| +  // iterations by incrementing the clock.
 | 
| +
 | 
| +  // This page has a total of 30 words.  For the features to be computed
 | 
| +  // correctly, the extractor has to process the entire string of text.
 | 
| +  string16 page_text(ASCIIToUTF16("one "));
 | 
| +  for (int i = 0; i < 28; ++i) {
 | 
| +    page_text.append(ASCIIToUTF16(StringPrintf("%d ", i)));
 | 
| +  }
 | 
| +  page_text.append(ASCIIToUTF16("two"));
 | 
| +
 | 
| +  // Advance the clock 30 ms every 10 words processed, 10 ms between chunks.
 | 
| +  // Note that this assumes kClockCheckGranularity = 10 and
 | 
| +  // kMaxTimePerChunkMs = 50.
 | 
| +  base::TimeTicks now = base::TimeTicks::Now();
 | 
| +  EXPECT_CALL(*clock_, Now())
 | 
| +      // Time check at the start of extraction.
 | 
| +      .WillOnce(Return(now))
 | 
| +      // Time check at the start of the first chunk of work.
 | 
| +      .WillOnce(Return(now))
 | 
| +      // Time check after the first 10 words.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))
 | 
| +      // Time check after the next 10 words.  This is over the chunk
 | 
| +      // time limit, so a continuation task will be posted.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60)))
 | 
| +      // Time check at the start of the second chunk of work.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70)))
 | 
| +      // Time check after the next 10 words.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100)))
 | 
| +      // A final check for the histograms.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(101)));
 | 
| +
 | 
| +  FeatureMap expected_features;
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("one"));
 | 
| +  expected_features.AddBooleanFeature(features::kPageTerm +
 | 
| +                                      std::string("two"));
 | 
| +
 | 
| +  FeatureMap features;
 | 
| +  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
 | 
| +  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 | 
| +  // Make sure none of the mock expectations carry over to the second run.
 | 
| +  ::testing::Mock::VerifyAndClearExpectations(clock_);
 | 
| +
 | 
| +  // Now repeat the test with the same text, but advance the clock faster so
 | 
| +  // that the extraction time exceeds the maximum total time for the feature
 | 
| +  // extractor.  Extraction should fail.  Note that this assumes
 | 
| +  // kMaxTotalTimeMs = 500.
 | 
| +  EXPECT_CALL(*clock_, Now())
 | 
| +      // Time check at the start of extraction.
 | 
| +      .WillOnce(Return(now))
 | 
| +      // Time check at the start of the first chunk of work.
 | 
| +      .WillOnce(Return(now))
 | 
| +      // Time check after the first 10 words.  Over the chunk time limit.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
 | 
| +      // Time check at the start of the second chunk of work.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
 | 
| +      // Time check after the next 10 words.  This is over the total limit.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
 | 
| +      // A final time check for the histograms.
 | 
| +      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
 | 
| +
 | 
| +  features.Clear();
 | 
| +  EXPECT_FALSE(ExtractFeatures(&page_text, &features));
 | 
| +}
 | 
| +
 | 
| +}  // namespace safe_browsing
 | 
| 
 |