| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | |
| 6 | |
| 7 #include <list> | |
| 8 #include <map> | |
| 9 #include <memory> | |
| 10 #include <utility> | |
| 11 | |
| 12 #include "base/bind.h" | |
| 13 #include "base/compiler_specific.h" | |
| 14 #include "base/i18n/break_iterator.h" | |
| 15 #include "base/i18n/case_conversion.h" | |
| 16 #include "base/location.h" | |
| 17 #include "base/logging.h" | |
| 18 #include "base/metrics/histogram_macros.h" | |
| 19 #include "base/single_thread_task_runner.h" | |
| 20 #include "base/strings/utf_string_conversions.h" | |
| 21 #include "base/threading/thread_task_runner_handle.h" | |
| 22 #include "base/time/time.h" | |
| 23 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | |
| 24 #include "chrome/renderer/safe_browsing/features.h" | |
| 25 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | |
| 26 #include "crypto/sha2.h" | |
| 27 | |
| 28 namespace safe_browsing { | |
| 29 | |
// Maximum wall-clock time (ms) one chunk of extraction may run before
// yielding back to the message loop.  This time should be short enough that
// it doesn't noticeably disrupt the user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// Number of words processed between clock checks in the extraction loop.
// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// Hard cap on total extraction time (ms); extraction is abandoned (and the
// done callback run with failure) once this is exceeded.  This should be
// longer than we expect feature extraction to take on any actual phishing
// page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
| 42 | |
| 43 // All of the state pertaining to the current feature extraction. | |
| 44 struct PhishingTermFeatureExtractor::ExtractionState { | |
| 45 // Stores up to max_words_per_term_ previous words separated by spaces. | |
| 46 std::string previous_words; | |
| 47 | |
| 48 // Stores the current shingle after a new word is processed and added in. | |
| 49 std::string current_shingle; | |
| 50 | |
| 51 // Stores the sizes of the words in current_shingle. Note: the size includes | |
| 52 // the space after each word. In other words, the sum of all sizes in this | |
| 53 // list is equal to the length of current_shingle. | |
| 54 std::list<size_t> shingle_word_sizes; | |
| 55 | |
| 56 // Stores the sizes of the words in previous_words. Note: the size includes | |
| 57 // the space after each word. In other words, the sum of all sizes in this | |
| 58 // list is equal to the length of previous_words. | |
| 59 std::list<size_t> previous_word_sizes; | |
| 60 | |
| 61 // An iterator for word breaking. | |
| 62 std::unique_ptr<base::i18n::BreakIterator> iterator; | |
| 63 | |
| 64 // The time at which we started feature extraction for the current page. | |
| 65 base::TimeTicks start_time; | |
| 66 | |
| 67 // The number of iterations we've done for the current extraction. | |
| 68 int num_iterations; | |
| 69 | |
| 70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) | |
| 71 : start_time(start_time_ticks), | |
| 72 num_iterations(0) { | |
| 73 std::unique_ptr<base::i18n::BreakIterator> i(new base::i18n::BreakIterator( | |
| 74 text, base::i18n::BreakIterator::BREAK_WORD)); | |
| 75 | |
| 76 if (i->Init()) { | |
| 77 iterator = std::move(i); | |
| 78 } else { | |
| 79 DLOG(ERROR) << "failed to open iterator"; | |
| 80 } | |
| 81 } | |
| 82 }; | |
| 83 | |
// All pointer parameters are borrowed, not owned; callers must keep them
// alive for the lifetime of this extractor.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32_t>* page_word_hashes,
    size_t max_words_per_term,
    uint32_t murmurhash3_seed,
    size_t max_shingles_per_page,
    size_t shingle_size,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      max_shingles_per_page_(max_shingles_per_page),
      shingle_size_(shingle_size),
      clock_(clock),
      weak_factory_(this) {
  // Start from a known-empty state (no pending extraction).
  Clear();
}
| 102 | |
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.  This only DCHECKs/logs; it does not cancel.
  CheckNoPendingExtraction();
}
| 108 | |
| 109 void PhishingTermFeatureExtractor::ExtractFeatures( | |
| 110 const base::string16* page_text, | |
| 111 FeatureMap* features, | |
| 112 std::set<uint32_t>* shingle_hashes, | |
| 113 const DoneCallback& done_callback) { | |
| 114 // The RenderView should have called CancelPendingExtraction() before | |
| 115 // starting a new extraction, so DCHECK this. | |
| 116 CheckNoPendingExtraction(); | |
| 117 // However, in an opt build, we will go ahead and clean up the pending | |
| 118 // extraction so that we can start in a known state. | |
| 119 CancelPendingExtraction(); | |
| 120 | |
| 121 page_text_ = page_text; | |
| 122 features_ = features; | |
| 123 shingle_hashes_ = shingle_hashes, | |
| 124 done_callback_ = done_callback; | |
| 125 | |
| 126 state_.reset(new ExtractionState(*page_text_, clock_->Now())); | |
| 127 base::ThreadTaskRunnerHandle::Get()->PostTask( | |
| 128 FROM_HERE, | |
| 129 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, | |
| 130 weak_factory_.GetWeakPtr())); | |
| 131 } | |
| 132 | |
| 133 void PhishingTermFeatureExtractor::CancelPendingExtraction() { | |
| 134 // Cancel any pending callbacks, and clear our state. | |
| 135 weak_factory_.InvalidateWeakPtrs(); | |
| 136 Clear(); | |
| 137 } | |
| 138 | |
// Runs one time-sliced chunk of feature extraction.  Walks the break
// iterator word by word, checking the clock every kClockCheckGranularity
// words; re-posts itself when a chunk exceeds kMaxTimePerChunkMs, and fails
// the whole extraction once kMaxTotalTimeMs has elapsed.
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator.get()) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  // Count of words seen since the last clock check, not a running total.
  int num_words = 0;
  while (state_->iterator->Advance()) {
    if (state_->iterator->IsWord()) {
      const size_t start = state_->iterator->prev();
      const size_t length = state_->iterator->pos() - start;
      // Hand the word to the shingle/term machinery as a view into the
      // page text; no copy is made here.
      HandleWord(base::StringPiece16(page_text_->data() + start, length));
      ++num_words;
    }

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      // Total-time budget: give up entirely if the page is taking too long.
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::ThreadTaskRunnerHandle::Get()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Iterator exhausted: the whole page was processed successfully.
  RunCallback(true);
}
| 194 | |
// Processes a single word from the page: updates the sliding shingle window
// and extracts any matching page terms (n-grams ending at this word).
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // First, extract shingle hashes.  The shingle is a sliding window of
  // shingle_size_ lowercased words, each followed by a space.
  const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
  state_->current_shingle.append(word_lower + " ");
  state_->shingle_word_sizes.push_back(word_lower.size() + 1);
  if (state_->shingle_word_sizes.size() == shingle_size_) {
    shingle_hashes_->insert(
        MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
    // Slide the window: drop the oldest word (and its trailing space).
    state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
    state_->shingle_word_sizes.pop_front();
  }
  // Check if the size of shingle hashes is over the limit.
  if (shingle_hashes_->size() > max_shingles_per_page_) {
    // Pop the largest one.  std::set iterates in ascending order, so the
    // last element is the maximum hash value.
    std::set<uint32_t>::iterator it = shingle_hashes_->end();
    shingle_hashes_->erase(--it);
  }

  // Next, extract page terms.
  uint32_t word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.  Each candidate is a suffix of previous_words ending with the
  // current word.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  //
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    // Strip the oldest word to get the next-shorter suffix.
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes.  Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words.  That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.  The >= comparison after the push
  // above keeps at most max_words_per_term_ - 1 words of history, so the
  // next word can form terms of at most max_words_per_term_ words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
| 266 | |
| 267 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() { | |
| 268 DCHECK(done_callback_.is_null()); | |
| 269 DCHECK(!state_.get()); | |
| 270 if (!done_callback_.is_null() || state_.get()) { | |
| 271 LOG(ERROR) << "Extraction in progress, missing call to " | |
| 272 << "CancelPendingExtraction"; | |
| 273 } | |
| 274 } | |
| 275 | |
// Reports timing metrics, invokes the done callback with |success|, and
// resets all extraction state.  Must only be called while an extraction is
// in progress (state_ and done_callback_ set).
void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance.  These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  // Clear after running the callback; the callback may start a new
  // extraction, which expects a clean slate.
  Clear();
}
| 289 | |
| 290 void PhishingTermFeatureExtractor::Clear() { | |
| 291 page_text_ = NULL; | |
| 292 features_ = NULL; | |
| 293 shingle_hashes_ = NULL; | |
| 294 done_callback_.Reset(); | |
| 295 state_.reset(NULL); | |
| 296 } | |
| 297 | |
| 298 } // namespace safe_browsing | |
| OLD | NEW |