OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/history/url_index_private_data.h" | 5 #include "chrome/browser/history/url_index_private_data.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <functional> | 8 #include <functional> |
9 #include <iterator> | 9 #include <iterator> |
10 #include <limits> | 10 #include <limits> |
11 #include <numeric> | 11 #include <numeric> |
12 | 12 |
13 #include "base/bind.h" | |
13 #include "base/file_util.h" | 14 #include "base/file_util.h" |
15 #include "base/i18n/break_iterator.h" | |
14 #include "base/i18n/case_conversion.h" | 16 #include "base/i18n/case_conversion.h" |
17 #include "base/memory/scoped_ptr.h" | |
15 #include "base/metrics/histogram.h" | 18 #include "base/metrics/histogram.h" |
16 #include "base/string_util.h" | 19 #include "base/string_util.h" |
17 #include "base/threading/thread_restrictions.h" | 20 #include "base/time.h" |
18 #include "base/utf_string_conversions.h" | 21 #include "base/utf_string_conversions.h" |
19 #include "chrome/browser/autocomplete/autocomplete.h" | 22 #include "chrome/browser/autocomplete/autocomplete.h" |
20 #include "chrome/browser/history/url_database.h" | 23 #include "chrome/browser/history/url_database.h" |
21 #include "chrome/common/url_constants.h" | 24 #include "chrome/common/url_constants.h" |
22 #include "net/base/net_util.h" | 25 #include "net/base/net_util.h" |
23 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | 26 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" |
27 #include "ui/base/l10n/l10n_util.h" | |
24 | 28 |
25 using google::protobuf::RepeatedField; | 29 using google::protobuf::RepeatedField; |
26 using google::protobuf::RepeatedPtrField; | 30 using google::protobuf::RepeatedPtrField; |
27 using in_memory_url_index::InMemoryURLIndexCacheItem; | 31 using in_memory_url_index::InMemoryURLIndexCacheItem; |
28 | 32 |
29 namespace history { | 33 namespace history { |
30 | 34 |
31 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; | 35 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; |
32 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; | 36 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; |
33 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | 37 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
69 bool LengthGreater(const string16& string_a, const string16& string_b) { | 73 bool LengthGreater(const string16& string_a, const string16& string_b) { |
70 return string_a.length() > string_b.length(); | 74 return string_a.length() > string_b.length(); |
71 } | 75 } |
72 | 76 |
73 // std::accumulate helper function to add up TermMatches' lengths. | 77 // std::accumulate helper function to add up TermMatches' lengths. |
74 int AccumulateMatchLength(int total, const TermMatch& match) { | 78 int AccumulateMatchLength(int total, const TermMatch& match) { |
75 return total + match.length; | 79 return total + match.length; |
76 } | 80 } |
77 | 81 |
78 // Converts a raw value for some particular scoring factor into a score | 82 // Converts a raw value for some particular scoring factor into a score |
79 // component for that factor. The conversion function is piecewise linear, with | 83 // component for that factor. The conversion function is linear within the |
80 // input values provided in |value_ranks| and resulting output scores from | 84 // range of the |value_ranks| array. |value| is mapped to a range within |
81 // |kScoreRank| (mathematically, f(value_rank[i]) = kScoreRank[i]). A score | 85 // |value_ranks| with the resulting range projecting into the |kScoreRank| |
82 // cannot be higher than kScoreRank[0], and drops directly to 0 if lower than | 86 // array. A score cannot be higher than kScoreRank[0], and drops directly to 0 |
83 // kScoreRank[3]. | 87 // if lower than kScoreRank[3] (eliminating the item being scored from further |
88 // consideration since such a score is insignificant and unlikely to be | |
89 // presented to the user). | |
Peter Kasting
2012/01/14 00:12:49
Nit: Honestly, the old comment seemed a lot cleare
mrossetti
2012/03/03 05:05:56
Done.
| |
84 // | 90 // |
85 // For example, take |value| == 70 and |value_ranks| == { 100, 50, 30, 10 }. | 91 // For example, take |value| == 70 and |value_ranks| == { 100, 50, 30, 10 }. |
86 // Because 70 falls between ranks 0 (100) and 1 (50), the score is given by the | 92 // Because 70 falls between ranks 0 (100) and 1 (50), the score is given by the |
87 // linear function: | 93 // linear function: |
88 // score = m * value + b, where | 94 // score = m * value + b, where |
89 // m = (kScoreRank[0] - kScoreRank[1]) / (value_ranks[0] - value_ranks[1]) | 95 // m = (kScoreRank[0] - kScoreRank[1]) / (value_ranks[0] - value_ranks[1]) |
90 // b = value_ranks[1] | 96 // b = value_ranks[1] |
91 // Any value higher than 100 would be scored as if it were 100, and any value | 97 // Any value higher than 100 would be scored as if it were 100, and any value |
92 // lower than 10 scored 0. | 98 // lower than 10 scored 0. |
93 int ScoreForValue(int value, const int* value_ranks) { | 99 int ScoreForValue(int value, const int* value_ranks) { |
(...skipping 189 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
283 if (iter != history_id_word_map_.end()) { | 289 if (iter != history_id_word_map_.end()) { |
284 WordIDSet& word_id_set(iter->second); | 290 WordIDSet& word_id_set(iter->second); |
285 word_id_set.insert(word_id); | 291 word_id_set.insert(word_id); |
286 } else { | 292 } else { |
287 WordIDSet word_id_set; | 293 WordIDSet word_id_set; |
288 word_id_set.insert(word_id); | 294 word_id_set.insert(word_id); |
289 history_id_word_map_[history_id] = word_id_set; | 295 history_id_word_map_[history_id] = word_id_set; |
290 } | 296 } |
291 } | 297 } |
292 | 298 |
293 void URLIndexPrivateData::UpdateURL(URLID row_id, const URLRow& row) { | 299 void URLIndexPrivateData::UpdateURL(const URLRow& row) { |
294 // The row may or may not already be in our index. If it is not already | 300 // The row may or may not already be in our index. If it is not already |
295 // indexed and it qualifies then it gets indexed. If it is already | 301 // indexed and it qualifies then it gets indexed. If it is already |
296 // indexed and still qualifies then it gets updated, otherwise it | 302 // indexed and still qualifies then it gets updated, otherwise it |
297 // is deleted from the index. | 303 // is deleted from the index. |
304 URLID row_id = row.id(); | |
298 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); | 305 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); |
299 if (row_pos == history_info_map_.end()) { | 306 if (row_pos == history_info_map_.end()) { |
300 // This new row should be indexed if it qualifies. | 307 // This new row should be indexed if it qualifies. |
301 URLRow new_row(row); | 308 URLRow new_row(row); |
302 new_row.set_id(row_id); | 309 new_row.set_id(row_id); |
303 if (RowQualifiesAsSignificant(new_row, base::Time())) | 310 if (RowQualifiesAsSignificant(new_row, base::Time())) |
304 IndexRow(new_row); | 311 IndexRow(new_row); |
305 } else if (RowQualifiesAsSignificant(row, base::Time())) { | 312 } else if (RowQualifiesAsSignificant(row, base::Time())) { |
306 // This indexed row still qualifies and will be re-indexed. | 313 // This indexed row still qualifies and will be re-indexed. |
307 // The url won't have changed but the title, visit count, etc. | 314 // The url won't have changed but the title, visit count, etc. |
308 // might have changed. | 315 // might have changed. |
309 URLRow& updated_row = row_pos->second; | 316 URLRow& updated_row = row_pos->second; |
310 updated_row.set_visit_count(row.visit_count()); | 317 updated_row.set_visit_count(row.visit_count()); |
311 updated_row.set_typed_count(row.typed_count()); | 318 updated_row.set_typed_count(row.typed_count()); |
312 updated_row.set_last_visit(row.last_visit()); | 319 updated_row.set_last_visit(row.last_visit()); |
313 // While the URL is guaranteed to remain stable, the title may have changed. | 320 // While the URL is guaranteed to remain stable, the title may have changed. |
314 // If so, then we need to update the index with the changed words. | 321 // If so, then we need to update the index with the changed words. |
315 if (updated_row.title() != row.title()) { | 322 if (updated_row.title() != row.title()) { |
316 // Clear all words associated with this row and re-index both the | 323 // Clear all words associated with this row and re-index both the |
317 // URL and title. | 324 // URL and title. |
318 RemoveRowWordsFromIndex(updated_row); | 325 RemoveRowWordsFromIndex(updated_row); |
319 updated_row.set_title(row.title()); | 326 updated_row.set_title(row.title()); |
320 AddRowWordsToIndex(updated_row); | 327 AddRowWordsToIndex(updated_row); |
321 } | 328 } |
322 } else { | 329 } else { |
323 // This indexed row no longer qualifies and will be de-indexed by | 330 // This indexed row no longer qualifies and will be de-indexed by |
324 // clearing all words associated with this row. | 331 // clearing all words associated with this row. |
325 URLRow& removed_row = row_pos->second; | 332 RemoveRowFromIndex(row); |
326 RemoveRowFromIndex(removed_row); | |
327 } | 333 } |
328 // This invalidates the cache. | 334 search_term_cache_.clear(); // This invalidates the cache. |
329 search_term_cache_.clear(); | |
330 } | 335 } |
331 | 336 |
332 void URLIndexPrivateData::DeleteURL(URLID row_id) { | 337 // Helper functor for DeleteURL. |
333 // Note that this does not remove any reference to this row from the | 338 class HistoryInfoMapItemHasURL { |
334 // word_id_history_map_. That map will continue to contain (and return) | 339 public: |
335 // hits against this row until that map is rebuilt, but since the | 340 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} |
336 // history_info_map_ no longer references the row no erroneous results | 341 |
337 // will propagate to the user. | 342 bool operator()(const std::pair<const HistoryID, URLRow> item) { |
Peter Kasting
2012/01/14 00:12:49
Nit: Missing &?
mrossetti
2012/03/03 05:05:56
Done.
| |
338 history_info_map_.erase(row_id); | 343 return item.second.url() == url_; |
339 search_term_cache_.clear(); // This invalidates the word cache. | 344 } |
345 | |
346 private: | |
347 const GURL& url_; | |
348 }; | |
349 | |
350 void URLIndexPrivateData::DeleteURL(const GURL& url) { | |
351 // Find the matching entry in the history_info_map_. | |
352 HistoryInfoMap::iterator pos = std::find_if( | |
353 history_info_map_.begin(), | |
354 history_info_map_.end(), | |
355 HistoryInfoMapItemHasURL(url)); | |
356 if (pos != history_info_map_.end()) { | |
357 RemoveRowFromIndex(pos->second); | |
358 search_term_cache_.clear(); // This invalidates the cache. | |
359 } | |
340 } | 360 } |
341 | 361 |
342 bool URLIndexPrivateData::URLSchemeIsWhitelisted(const GURL& gurl) const { | 362 bool URLIndexPrivateData::URLSchemeIsWhitelisted(const GURL& gurl) const { |
343 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); | 363 return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); |
344 } | 364 } |
345 | 365 |
346 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- | 366 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- |
347 | 367 |
348 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( | 368 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( |
349 const HistoryInfoMap& history_info_map) | 369 const HistoryInfoMap& history_info_map) |
(...skipping 480 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
830 whitelist->insert(std::string(chrome::kFileScheme)); | 850 whitelist->insert(std::string(chrome::kFileScheme)); |
831 whitelist->insert(std::string(chrome::kFtpScheme)); | 851 whitelist->insert(std::string(chrome::kFtpScheme)); |
832 whitelist->insert(std::string(chrome::kHttpScheme)); | 852 whitelist->insert(std::string(chrome::kHttpScheme)); |
833 whitelist->insert(std::string(chrome::kHttpsScheme)); | 853 whitelist->insert(std::string(chrome::kHttpsScheme)); |
834 whitelist->insert(std::string(chrome::kMailToScheme)); | 854 whitelist->insert(std::string(chrome::kMailToScheme)); |
835 } | 855 } |
836 | 856 |
837 // Cache Saving ---------------------------------------------------------------- | 857 // Cache Saving ---------------------------------------------------------------- |
838 | 858 |
839 bool URLIndexPrivateData::SaveToFile(const FilePath& file_path) { | 859 bool URLIndexPrivateData::SaveToFile(const FilePath& file_path) { |
840 // TODO(mrossetti): Move File IO to another thread. | |
841 base::ThreadRestrictions::ScopedAllowIO allow_io; | |
842 base::TimeTicks beginning_time = base::TimeTicks::Now(); | 860 base::TimeTicks beginning_time = base::TimeTicks::Now(); |
843 InMemoryURLIndexCacheItem index_cache; | 861 InMemoryURLIndexCacheItem index_cache; |
844 SavePrivateData(&index_cache); | 862 SavePrivateData(&index_cache); |
845 std::string data; | 863 std::string data; |
846 if (!index_cache.SerializeToString(&data)) { | 864 if (!index_cache.SerializeToString(&data)) { |
847 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; | 865 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; |
848 return false; | 866 return false; |
849 } | 867 } |
850 | 868 |
851 int size = data.size(); | 869 int size = data.size(); |
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
949 map_entry->set_visit_count(url_row.visit_count()); | 967 map_entry->set_visit_count(url_row.visit_count()); |
950 map_entry->set_typed_count(url_row.typed_count()); | 968 map_entry->set_typed_count(url_row.typed_count()); |
951 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); | 969 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); |
952 map_entry->set_url(url_row.url().spec()); | 970 map_entry->set_url(url_row.url().spec()); |
953 map_entry->set_title(UTF16ToUTF8(url_row.title())); | 971 map_entry->set_title(UTF16ToUTF8(url_row.title())); |
954 } | 972 } |
955 } | 973 } |
956 | 974 |
957 // Cache Restoring ------------------------------------------------------------- | 975 // Cache Restoring ------------------------------------------------------------- |
958 | 976 |
959 bool URLIndexPrivateData::ReloadFromHistory(history::URLDatabase* history_db) { | 977 // static |
960 Clear(); | 978 URLIndexPrivateData* URLIndexPrivateData::RestoreFromFile( |
961 | 979 const FilePath& file_path) { |
962 if (!history_db) | |
963 return false; | |
964 | |
965 base::TimeTicks beginning_time = base::TimeTicks::Now(); | 980 base::TimeTicks beginning_time = base::TimeTicks::Now(); |
966 URLDatabase::URLEnumerator history_enum; | 981 if (!file_util::PathExists(file_path)) |
967 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) | 982 return NULL; |
968 return false; | |
969 URLRow row; | |
970 while (history_enum.GetNextURL(&row)) | |
971 IndexRow(row); | |
972 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", | |
973 base::TimeTicks::Now() - beginning_time); | |
974 return true; | |
975 } | |
976 | |
977 bool URLIndexPrivateData::RestoreFromFile(const FilePath& file_path) { | |
978 // TODO(mrossetti): Figure out how to determine if the cache is up-to-date. | |
979 // That is: ensure that the database has not been modified since the cache | |
980 // was last saved. DB file modification date is inadequate. There are no | |
981 // SQLite table checksums automatically stored. | |
982 // FIXME(mrossetti): Move File IO to another thread. | |
983 base::ThreadRestrictions::ScopedAllowIO allow_io; | |
984 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
985 std::string data; | 983 std::string data; |
986 // If there is no cache file then simply give up. This will cause us to | 984 // If there is no cache file then simply give up. This will cause us to |
987 // attempt to rebuild from the history database. | 985 // attempt to rebuild from the history database. |
988 if (!file_util::ReadFileToString(file_path, &data)) | 986 if (!file_util::ReadFileToString(file_path, &data)) |
989 return false; | 987 return NULL; |
990 | 988 |
989 scoped_ptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData); | |
991 InMemoryURLIndexCacheItem index_cache; | 990 InMemoryURLIndexCacheItem index_cache; |
992 if (!index_cache.ParseFromArray(data.c_str(), data.size())) { | 991 if (!index_cache.ParseFromArray(data.c_str(), data.size())) { |
993 LOG(WARNING) << "Failed to parse InMemoryURLIndex cache data read from " | 992 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from " |
994 << file_path.value(); | 993 << file_path.value(); |
995 return false; | 994 return restored_data.release(); |
996 } | 995 } |
997 | 996 |
998 if (!RestorePrivateData(index_cache)) { | 997 if (!restored_data->RestorePrivateData(index_cache)) { |
999 Clear(); // Back to square one -- must build from scratch. | 998 restored_data.reset(); // Back to square one -- must build from history DB. |
1000 return false; | 999 return NULL; |
1001 } | 1000 } |
1002 | 1001 |
1003 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", | 1002 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", |
1004 base::TimeTicks::Now() - beginning_time); | 1003 base::TimeTicks::Now() - beginning_time); |
1005 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", | 1004 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", |
1006 history_id_word_map_.size()); | 1005 restored_data->history_id_word_map_.size()); |
1007 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); | 1006 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); |
1008 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", word_map_.size()); | 1007 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", |
1009 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", char_word_map_.size()); | 1008 restored_data->word_map_.size()); |
1010 return true; | 1009 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", |
1010 restored_data->char_word_map_.size()); | |
1011 return restored_data.release(); | |
1012 } | |
1013 | |
1014 // static | |
1015 URLIndexPrivateData* URLIndexPrivateData::RebuildFromHistory( | |
1016 URLDatabase* history_db) { | |
1017 if (!history_db) | |
1018 return NULL; | |
1019 | |
1020 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
1021 | |
1022 scoped_ptr<URLIndexPrivateData> rebuilt_data(new URLIndexPrivateData); | |
1023 URLDatabase::URLEnumerator history_enum; | |
1024 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) | |
1025 return NULL; | |
1026 URLRow row; | |
Peter Kasting
2012/01/14 00:12:49
Nit: You can condense these three lines to two and
mrossetti
2012/03/03 05:05:56
w00t! My ancient mind just doesn't work that way!
| |
1027 while (history_enum.GetNextURL(&row)) | |
1028 rebuilt_data->IndexRow(row); | |
1029 | |
1030 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", | |
1031 base::TimeTicks::Now() - beginning_time); | |
1032 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", | |
1033 rebuilt_data->history_id_word_map_.size()); | |
1034 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", | |
1035 rebuilt_data->word_map_.size()); | |
1036 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", | |
1037 rebuilt_data->char_word_map_.size()); | |
1038 return rebuilt_data.release(); | |
1011 } | 1039 } |
1012 | 1040 |
1013 bool URLIndexPrivateData::RestorePrivateData( | 1041 bool URLIndexPrivateData::RestorePrivateData( |
1014 const InMemoryURLIndexCacheItem& cache) { | 1042 const InMemoryURLIndexCacheItem& cache) { |
1015 return RestoreWordList(cache) && RestoreWordMap(cache) && | 1043 return RestoreWordList(cache) && RestoreWordMap(cache) && |
1016 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && | 1044 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && |
1017 RestoreHistoryInfoMap(cache); | 1045 RestoreHistoryInfoMap(cache); |
1018 } | 1046 } |
1019 | 1047 |
1020 bool URLIndexPrivateData::RestoreWordList( | 1048 bool URLIndexPrivateData::RestoreWordList( |
(...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1129 if (iter->has_title()) { | 1157 if (iter->has_title()) { |
1130 string16 title(UTF8ToUTF16(iter->title())); | 1158 string16 title(UTF8ToUTF16(iter->title())); |
1131 url_row.set_title(title); | 1159 url_row.set_title(title); |
1132 } | 1160 } |
1133 history_info_map_[history_id] = url_row; | 1161 history_info_map_[history_id] = url_row; |
1134 } | 1162 } |
1135 return true; | 1163 return true; |
1136 } | 1164 } |
1137 | 1165 |
1138 } // namespace history | 1166 } // namespace history |
OLD | NEW |