| Index: chrome/renderer/safe_browsing/features.h
|
| diff --git a/chrome/renderer/safe_browsing/features.h b/chrome/renderer/safe_browsing/features.h
|
| deleted file mode 100644
|
| index 29fa40cab8ed54419d0272668e7dc21ebaa425d7..0000000000000000000000000000000000000000
|
| --- a/chrome/renderer/safe_browsing/features.h
|
| +++ /dev/null
|
| @@ -1,181 +0,0 @@
|
| -// Copyright (c) 2011 The Chromium Authors. All rights reserved.
|
| -// Use of this source code is governed by a BSD-style license that can be
|
| -// found in the LICENSE file.
|
| -//
|
| -// Common types and constants for extracting and evaluating features in the
|
| -// client-side phishing detection model. A feature is simply a string and an
|
| -// associated floating-point value between 0 and 1. The phishing
|
| -// classification model contains rules which give an appropriate weight to each
|
| -// feature or combination of features. These values can then be summed to
|
| -// compute a final phishiness score.
|
| -//
|
| -// Some features are boolean features. If these features are set, they always
|
| -// have a value of 0.0 or 1.0. In practice, the features are only set if the
|
| -// value is true (1.0).
|
| -//
|
| -// We also use token features. These features have a unique name that is
|
| -// constructed from the URL or page contents that we are classifying, for
|
| -// example, "UrlDomain=chromium". These features are also always set to 1.0
|
| -// if they are present.
|
| -//
|
| -// The intermediate storage of the features for a URL is a FeatureMap, which is
|
| -// just a thin wrapper around a map of feature name to value. The entire set
|
| -// of features for a URL is extracted before we do any scoring.
|
| -
|
| -#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
|
| -#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
|
| -
|
| -#include <stddef.h>
|
| -#include <string>
|
| -
|
| -#include "base/containers/hash_tables.h"
|
| -#include "base/macros.h"
|
| -
|
| -namespace safe_browsing {
|
| -
|
| -// Container for a map of features to values, which enforces behavior
|
| -// such as a maximum number of features in the map.
|
| -class FeatureMap {
|
| - public:
|
| - FeatureMap();
|
| - ~FeatureMap();
|
| -
|
| - // Adds a boolean feature to a FeatureMap with a value of 1.0.
|
| - // Returns true on success, or false if the feature map exceeds
|
| - // kMaxFeatureMapSize.
|
| - bool AddBooleanFeature(const std::string& name);
|
| -
|
| - // Adds a real-valued feature to a FeatureMap with the given value.
|
| - // Values must always be in the range [0.0, 1.0]. Returns true on
|
| - // success, or false if the feature map exceeds kMaxFeatureMapSize
|
| - // or the value is outside of the allowed range.
|
| - bool AddRealFeature(const std::string& name, double value);
|
| -
|
| - // Provides read-only access to the current set of features.
|
| - const base::hash_map<std::string, double>& features() const {
|
| - return features_;
|
| - }
|
| -
|
| - // Clears the set of features in the map.
|
| - void Clear();
|
| -
|
| - // This is an upper bound on the number of features that will be extracted.
|
| - // We should never hit this cap; it is intended as a sanity check to prevent
|
| - // the FeatureMap from growing too large.
|
| - static const size_t kMaxFeatureMapSize;
|
| -
|
| - private:
|
| - base::hash_map<std::string, double> features_;
|
| -
|
| - DISALLOW_COPY_AND_ASSIGN(FeatureMap);
|
| -};
|
| -
|
| -namespace features {
|
| -// Constants for the various feature names that we use.
|
| -//
|
| -// IMPORTANT: when adding new features, you must update kAllowedFeatures in
|
| -// chrome/browser/safe_browsing/client_side_detection_service.cc if the feature
|
| -// should be sent in sanitized pingbacks.
|
| -
|
| -////////////////////////////////////////////////////
|
| -// URL host features
|
| -////////////////////////////////////////////////////
|
| -
|
| -// Set if the URL's hostname is an IP address.
|
| -extern const char kUrlHostIsIpAddress[];
|
| -// Token feature containing the portion of the hostname controlled by a
|
| -// registrar, for example "com" or "co.uk".
|
| -extern const char kUrlTldToken[];
|
| -// Token feature containing the first host component below the registrar.
|
| -// For example, in "www.google.com", the domain would be "google".
|
| -extern const char kUrlDomainToken[];
|
| -// Token feature containing each host component below the domain.
|
| -// For example, in "www.host.example.com", both "www" and "host" would be
|
| -// "other host tokens".
|
| -extern const char kUrlOtherHostToken[];
|
| -
|
| -////////////////////////////////////////////////////
|
| -// Aggregate features for URL host tokens
|
| -////////////////////////////////////////////////////
|
| -
|
| -// Set if the number of "other" host tokens for a URL is greater than one.
|
| -// Longer hostnames, regardless of the specific tokens, can be a signal that
|
| -// the URL is phishy.
|
| -extern const char kUrlNumOtherHostTokensGTOne[];
|
| -// Set if the number of "other" host tokens for a URL is greater than three.
|
| -extern const char kUrlNumOtherHostTokensGTThree[];
|
| -
|
| -////////////////////////////////////////////////////
|
| -// URL path token features
|
| -////////////////////////////////////////////////////
|
| -
|
| -// Token feature containing each alphanumeric string in the path that is at
|
| -// least 3 characters long. For example, "/abc/d/efg" would have 2 path
|
| -// token features, "abc" and "efg". Query parameters are not included.
|
| -extern const char kUrlPathToken[];
|
| -
|
| -////////////////////////////////////////////////////
|
| -// DOM HTML form features
|
| -////////////////////////////////////////////////////
|
| -
|
| -// Set if the page has any <form> elements.
|
| -extern const char kPageHasForms[];
|
| -// The fraction of form elements whose |action| attribute points to a
|
| -// URL on a different domain from the document URL.
|
| -extern const char kPageActionOtherDomainFreq[];
|
| -// Token feature containing each URL that an |action| attribute
|
| -// points to.
|
| -extern const char kPageActionURL[];
|
| -// Set if the page has any <input type="text"> elements
|
| -// (includes inputs with missing or unknown types).
|
| -extern const char kPageHasTextInputs[];
|
| -// Set if the page has any <input type="password"> elements.
|
| -extern const char kPageHasPswdInputs[];
|
| -// Set if the page has any <input type="radio"> elements.
|
| -extern const char kPageHasRadioInputs[];
|
| -// Set if the page has any <input type="checkbox"> elements.
|
| -extern const char kPageHasCheckInputs[];
|
| -
|
| -////////////////////////////////////////////////////
|
| -// DOM HTML link features
|
| -////////////////////////////////////////////////////
|
| -
|
| -// The fraction of links in the page which point to a domain other than the
|
| -// domain of the document. See "URL host features" above for a discussion
|
| -// of how the doamin is computed.
|
| -extern const char kPageExternalLinksFreq[];
|
| -// Token feature containing each external domain that is linked to.
|
| -extern const char kPageLinkDomain[];
|
| -// Fraction of links in the page that use https.
|
| -extern const char kPageSecureLinksFreq[];
|
| -
|
| -////////////////////////////////////////////////////
|
| -// DOM HTML script features
|
| -////////////////////////////////////////////////////
|
| -
|
| -// Set if the number of <script> elements in the page is greater than 1.
|
| -extern const char kPageNumScriptTagsGTOne[];
|
| -// Set if the number of <script> elements in the page is greater than 6.
|
| -extern const char kPageNumScriptTagsGTSix[];
|
| -
|
| -////////////////////////////////////////////////////
|
| -// Other DOM HTML features
|
| -////////////////////////////////////////////////////
|
| -
|
| -// The fraction of images whose src attribute points to an external domain.
|
| -extern const char kPageImgOtherDomainFreq[];
|
| -
|
| -////////////////////////////////////////////////////
|
| -// Page term features
|
| -////////////////////////////////////////////////////
|
| -
|
| -// Token feature for a term (whitespace-delimited) on a page. Terms can be
|
| -// single words or multi-word n-grams. Rather than adding this feature for
|
| -// every possible token on a page, only the terms that are mentioned in the
|
| -// classification model are added.
|
| -extern const char kPageTerm[];
|
| -
|
| -} // namespace features
|
| -} // namespace safe_browsing
|
| -
|
| -#endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
|
|
|