Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(179)

Side by Side Diff: components/bookmarks/browser/bookmark_index.cc

Issue 2537223008: Add TitledUrlIndex for indexing arbitrary title/URL pairs (Closed)
Patch Set: add TypedCountSorter, typedef -> using Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/bookmarks/browser/bookmark_index.h" 5 #include "components/bookmarks/browser/bookmark_index.h"
6 6
7 #include <stdint.h> 7 #include <stdint.h>
8 8
9 #include <algorithm>
10 #include <functional>
11 #include <iterator>
12 #include <list>
13
14 #include "base/i18n/case_conversion.h" 9 #include "base/i18n/case_conversion.h"
15 #include "base/logging.h" 10 #include "base/logging.h"
16 #include "base/stl_util.h" 11 #include "base/stl_util.h"
17 #include "base/strings/utf_offset_string_conversions.h" 12 #include "base/strings/utf_offset_string_conversions.h"
18 #include "build/build_config.h" 13 #include "build/build_config.h"
19 #include "components/bookmarks/browser/bookmark_client.h"
20 #include "components/bookmarks/browser/bookmark_match.h" 14 #include "components/bookmarks/browser/bookmark_match.h"
21 #include "components/bookmarks/browser/bookmark_node.h"
22 #include "components/bookmarks/browser/bookmark_utils.h" 15 #include "components/bookmarks/browser/bookmark_utils.h"
16 #include "components/bookmarks/browser/titled_url_node.h"
17 #include "components/bookmarks/browser/titled_url_node_sorter.h"
23 #include "components/query_parser/snippet.h" 18 #include "components/query_parser/snippet.h"
24 #include "third_party/icu/source/common/unicode/normalizer2.h" 19 #include "third_party/icu/source/common/unicode/normalizer2.h"
25 #include "third_party/icu/source/common/unicode/utypes.h" 20 #include "third_party/icu/source/common/unicode/utypes.h"
26 21
27 namespace bookmarks { 22 namespace bookmarks {
28 23
29 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair;
30 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs;
31
32 namespace { 24 namespace {
33 25
34 // Returns a normalized version of the UTF16 string |text|. If it fails to 26 // Returns a normalized version of the UTF16 string |text|. If it fails to
35 // normalize the string, returns |text| itself as a best-effort. 27 // normalize the string, returns |text| itself as a best-effort.
36 base::string16 Normalize(const base::string16& text) { 28 base::string16 Normalize(const base::string16& text) {
37 UErrorCode status = U_ZERO_ERROR; 29 UErrorCode status = U_ZERO_ERROR;
38 const icu::Normalizer2* normalizer2 = 30 const icu::Normalizer2* normalizer2 =
39 icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status); 31 icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status);
40 if (U_FAILURE(status)) { 32 if (U_FAILURE(status)) {
41 // Log and crash right away to capture the error code in the crash report. 33 // Log and crash right away to capture the error code in the crash report.
42 LOG(FATAL) << "failed to create a normalizer: " << u_errorName(status); 34 LOG(FATAL) << "failed to create a normalizer: " << u_errorName(status);
43 } 35 }
44 icu::UnicodeString unicode_text( 36 icu::UnicodeString unicode_text(
45 text.data(), static_cast<int32_t>(text.length())); 37 text.data(), static_cast<int32_t>(text.length()));
46 icu::UnicodeString unicode_normalized_text; 38 icu::UnicodeString unicode_normalized_text;
47 normalizer2->normalize(unicode_text, unicode_normalized_text, status); 39 normalizer2->normalize(unicode_text, unicode_normalized_text, status);
48 if (U_FAILURE(status)) { 40 if (U_FAILURE(status)) {
49 // This should not happen. Log the error and fall back. 41 // This should not happen. Log the error and fall back.
50 LOG(ERROR) << "normalization failed: " << u_errorName(status); 42 LOG(ERROR) << "normalization failed: " << u_errorName(status);
51 return text; 43 return text;
52 } 44 }
53 return base::string16(unicode_normalized_text.getBuffer(), 45 return base::string16(unicode_normalized_text.getBuffer(),
54 unicode_normalized_text.length()); 46 unicode_normalized_text.length());
55 } 47 }
56 48
57 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed
58 // count so that the best matches will always be added to the results.
59 struct NodeTypedCountPairSortFunctor {
60 bool operator()(const NodeTypedCountPair& a,
61 const NodeTypedCountPair& b) const {
62 return a.second > b.second;
63 }
64 };
65
66 // Extract the const Node* stored in a BookmarkClient::NodeTypedCountPair.
67 struct NodeTypedCountPairExtractNodeFunctor {
68 const BookmarkNode* operator()(const NodeTypedCountPair& pair) const {
69 return pair.first;
70 }
71 };
72
73 } // namespace 49 } // namespace
74 50
75 BookmarkIndex::BookmarkIndex(BookmarkClient* client) 51 BookmarkIndex::BookmarkIndex(std::unique_ptr<TitledUrlNodeSorter> sorter)
76 : client_(client) { 52 : sorter_(std::move(sorter)) {
77 DCHECK(client_);
78 } 53 }
79 54
80 BookmarkIndex::~BookmarkIndex() { 55 BookmarkIndex::~BookmarkIndex() {
81 } 56 }
82 57
83 void BookmarkIndex::Add(const BookmarkNode* node) { 58 void BookmarkIndex::Add(const TitledUrlNode* node) {
84 if (!node->is_url()) 59 if (!node->GetTitledUrlNodeUrl().is_valid())
85 return; 60 return;
86 std::vector<base::string16> terms = 61 std::vector<base::string16> terms =
87 ExtractQueryWords(Normalize(node->GetTitle())); 62 ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()));
88 for (size_t i = 0; i < terms.size(); ++i) 63 for (size_t i = 0; i < terms.size(); ++i)
89 RegisterNode(terms[i], node); 64 RegisterNode(terms[i], node);
90 terms = 65 terms = ExtractQueryWords(
91 ExtractQueryWords(CleanUpUrlForMatching(node->url(), nullptr)); 66 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr));
92 for (size_t i = 0; i < terms.size(); ++i) 67 for (size_t i = 0; i < terms.size(); ++i)
93 RegisterNode(terms[i], node); 68 RegisterNode(terms[i], node);
94 } 69 }
95 70
96 void BookmarkIndex::Remove(const BookmarkNode* node) { 71 void BookmarkIndex::Remove(const TitledUrlNode* node) {
97 if (!node->is_url()) 72 if (!node->GetTitledUrlNodeUrl().is_valid())
98 return; 73 return;
99 74
100 std::vector<base::string16> terms = 75 std::vector<base::string16> terms =
101 ExtractQueryWords(Normalize(node->GetTitle())); 76 ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()));
102 for (size_t i = 0; i < terms.size(); ++i) 77 for (size_t i = 0; i < terms.size(); ++i)
103 UnregisterNode(terms[i], node); 78 UnregisterNode(terms[i], node);
104 terms = 79 terms = ExtractQueryWords(
105 ExtractQueryWords(CleanUpUrlForMatching(node->url(), nullptr)); 80 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr));
106 for (size_t i = 0; i < terms.size(); ++i) 81 for (size_t i = 0; i < terms.size(); ++i)
107 UnregisterNode(terms[i], node); 82 UnregisterNode(terms[i], node);
108 } 83 }
109 84
110 void BookmarkIndex::GetBookmarksMatching( 85 void BookmarkIndex::GetResultsMatching(
111 const base::string16& input_query, 86 const base::string16& input_query,
112 size_t max_count, 87 size_t max_count,
113 query_parser::MatchingAlgorithm matching_algorithm, 88 query_parser::MatchingAlgorithm matching_algorithm,
114 std::vector<BookmarkMatch>* results) { 89 std::vector<BookmarkMatch>* results) {
115 const base::string16 query = Normalize(input_query); 90 const base::string16 query = Normalize(input_query);
116 std::vector<base::string16> terms = ExtractQueryWords(query); 91 std::vector<base::string16> terms = ExtractQueryWords(query);
117 if (terms.empty()) 92 if (terms.empty())
118 return; 93 return;
119 94
120 NodeSet matches; 95 TitledUrlNodeSet matches;
121 for (size_t i = 0; i < terms.size(); ++i) { 96 for (size_t i = 0; i < terms.size(); ++i) {
122 if (!GetBookmarksMatchingTerm( 97 if (!GetResultsMatchingTerm(terms[i], i == 0, matching_algorithm,
123 terms[i], i == 0, matching_algorithm, &matches)) { 98 &matches)) {
124 return; 99 return;
125 } 100 }
126 } 101 }
127 102
128 Nodes sorted_nodes; 103 TitledUrlNodes sorted_nodes;
129 SortMatches(matches, &sorted_nodes); 104 SortMatches(matches, &sorted_nodes);
130 105
131 // We use a QueryParser to fill in match positions for us. It's not the most 106 // We use a QueryParser to fill in match positions for us. It's not the most
132 // efficient way to go about this, but by the time we get here we know what 107 // efficient way to go about this, but by the time we get here we know what
133 // matches and so this shouldn't be performance critical. 108 // matches and so this shouldn't be performance critical.
134 query_parser::QueryParser parser; 109 query_parser::QueryParser parser;
135 query_parser::QueryNodeVector query_nodes; 110 query_parser::QueryNodeVector query_nodes;
136 parser.ParseQueryNodes(query, matching_algorithm, &query_nodes); 111 parser.ParseQueryNodes(query, matching_algorithm, &query_nodes);
137 112
138 // The highest typed counts should be at the beginning of the results vector 113 // The highest typed counts should be at the beginning of the results vector
139 // so that the best matches will always be included in the results. The loop 114 // so that the best matches will always be included in the results. The loop
140 // that calculates result relevance in HistoryContentsProvider::ConvertResults 115 // that calculates result relevance in HistoryContentsProvider::ConvertResults
141 // will run backwards to assure higher relevance will be attributed to the 116 // will run backwards to assure higher relevance will be attributed to the
142 // best matches. 117 // best matches.
143 for (Nodes::const_iterator i = sorted_nodes.begin(); 118 for (TitledUrlNodes::const_iterator i = sorted_nodes.begin();
144 i != sorted_nodes.end() && results->size() < max_count; 119 i != sorted_nodes.end() && results->size() < max_count;
145 ++i) 120 ++i)
146 AddMatchToResults(*i, &parser, query_nodes, results); 121 AddMatchToResults(*i, &parser, query_nodes, results);
147 } 122 }
148 123
149 void BookmarkIndex::SortMatches(const NodeSet& matches, 124 void BookmarkIndex::SortMatches(const TitledUrlNodeSet& matches,
150 Nodes* sorted_nodes) const { 125 TitledUrlNodes* sorted_nodes) const {
151 sorted_nodes->reserve(matches.size()); 126 if (sorter_) {
152 if (client_->SupportsTypedCountForNodes()) { 127 sorter_->SortMatches(matches, sorted_nodes);
153 NodeTypedCountPairs node_typed_counts;
154 client_->GetTypedCountForNodes(matches, &node_typed_counts);
155 std::sort(node_typed_counts.begin(),
156 node_typed_counts.end(),
157 NodeTypedCountPairSortFunctor());
158 std::transform(node_typed_counts.begin(),
159 node_typed_counts.end(),
160 std::back_inserter(*sorted_nodes),
161 NodeTypedCountPairExtractNodeFunctor());
162 } else { 128 } else {
163 sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end()); 129 sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end());
164 } 130 }
165 } 131 }
166 132
167 void BookmarkIndex::AddMatchToResults( 133 void BookmarkIndex::AddMatchToResults(
168 const BookmarkNode* node, 134 const TitledUrlNode* node,
169 query_parser::QueryParser* parser, 135 query_parser::QueryParser* parser,
170 const query_parser::QueryNodeVector& query_nodes, 136 const query_parser::QueryNodeVector& query_nodes,
171 std::vector<BookmarkMatch>* results) { 137 std::vector<BookmarkMatch>* results) {
172 // Check that the result matches the query. The previous search 138 // Check that the result matches the query. The previous search
173 // was a simple per-word search, while the more complex matching 139 // was a simple per-word search, while the more complex matching
174 // of QueryParser may filter it out. For example, the query 140 // of QueryParser may filter it out. For example, the query
175 // ["thi"] will match the bookmark titled [Thinking], but since 141 // ["thi"] will match the title [Thinking], but since
176 // ["thi"] is quoted we don't want to do a prefix match. 142 // ["thi"] is quoted we don't want to do a prefix match.
177 query_parser::QueryWordVector title_words, url_words; 143 query_parser::QueryWordVector title_words, url_words;
178 const base::string16 lower_title = 144 const base::string16 lower_title =
179 base::i18n::ToLower(Normalize(node->GetTitle())); 145 base::i18n::ToLower(Normalize(node->GetTitledUrlNodeTitle()));
180 parser->ExtractQueryWords(lower_title, &title_words); 146 parser->ExtractQueryWords(lower_title, &title_words);
181 base::OffsetAdjuster::Adjustments adjustments; 147 base::OffsetAdjuster::Adjustments adjustments;
182 parser->ExtractQueryWords( 148 parser->ExtractQueryWords(
183 CleanUpUrlForMatching(node->url(), &adjustments), 149 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), &adjustments),
184 &url_words); 150 &url_words);
185 query_parser::Snippet::MatchPositions title_matches, url_matches; 151 query_parser::Snippet::MatchPositions title_matches, url_matches;
186 for (const auto& node : query_nodes) { 152 for (const auto& node : query_nodes) {
187 const bool has_title_matches = 153 const bool has_title_matches =
188 node->HasMatchIn(title_words, &title_matches); 154 node->HasMatchIn(title_words, &title_matches);
189 const bool has_url_matches = node->HasMatchIn(url_words, &url_matches); 155 const bool has_url_matches = node->HasMatchIn(url_words, &url_matches);
190 if (!has_title_matches && !has_url_matches) 156 if (!has_title_matches && !has_url_matches)
191 return; 157 return;
192 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches); 158 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches);
193 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches); 159 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches);
194 } 160 }
195 BookmarkMatch match; 161 BookmarkMatch match;
196 if (lower_title.length() == node->GetTitle().length()) { 162 if (lower_title.length() == node->GetTitledUrlNodeTitle().length()) {
197 // Only use title matches if the lowercase string is the same length 163 // Only use title matches if the lowercase string is the same length
198 // as the original string, otherwise the matches are meaningless. 164 // as the original string, otherwise the matches are meaningless.
199 // TODO(mpearson): revise match positions appropriately. 165 // TODO(mpearson): revise match positions appropriately.
200 match.title_match_positions.swap(title_matches); 166 match.title_match_positions.swap(title_matches);
201 } 167 }
202 // Now that we're done processing this entry, correct the offsets of the 168 // Now that we're done processing this entry, correct the offsets of the
203 // matches in |url_matches| so they point to offsets in the original URL 169 // matches in |url_matches| so they point to offsets in the original URL
204 // spec, not the cleaned-up URL string that we used for matching. 170 // spec, not the cleaned-up URL string that we used for matching.
205 std::vector<size_t> offsets = 171 std::vector<size_t> offsets =
206 BookmarkMatch::OffsetsFromMatchPositions(url_matches); 172 BookmarkMatch::OffsetsFromMatchPositions(url_matches);
207 base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets); 173 base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
208 url_matches = 174 url_matches =
209 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets); 175 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets);
210 match.url_match_positions.swap(url_matches); 176 match.url_match_positions.swap(url_matches);
211 match.node = node; 177 match.node = node;
212 results->push_back(match); 178 results->push_back(match);
213 } 179 }
214 180
215 bool BookmarkIndex::GetBookmarksMatchingTerm( 181 bool BookmarkIndex::GetResultsMatchingTerm(
216 const base::string16& term, 182 const base::string16& term,
217 bool first_term, 183 bool first_term,
218 query_parser::MatchingAlgorithm matching_algorithm, 184 query_parser::MatchingAlgorithm matching_algorithm,
219 NodeSet* matches) { 185 TitledUrlNodeSet* matches) {
220 Index::const_iterator i = index_.lower_bound(term); 186 Index::const_iterator i = index_.lower_bound(term);
221 if (i == index_.end()) 187 if (i == index_.end())
222 return false; 188 return false;
223 189
224 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch( 190 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(
225 term, matching_algorithm)) { 191 term, matching_algorithm)) {
226 // Term is too short for prefix match, compare using exact match. 192 // Term is too short for prefix match, compare using exact match.
227 if (i->first != term) 193 if (i->first != term)
228 return false; // No bookmarks with this term. 194 return false; // No title/URL pairs with this term.
229 195
230 if (first_term) { 196 if (first_term) {
231 (*matches) = i->second; 197 (*matches) = i->second;
232 return true; 198 return true;
233 } 199 }
234 *matches = base::STLSetIntersection<NodeSet>(i->second, *matches); 200 *matches = base::STLSetIntersection<TitledUrlNodeSet>(i->second, *matches);
235 } else { 201 } else {
236 // Loop through index adding all entries that start with term to 202 // Loop through index adding all entries that start with term to
237 // |prefix_matches|. 203 // |prefix_matches|.
238 NodeSet tmp_prefix_matches; 204 TitledUrlNodeSet tmp_prefix_matches;
239 // If this is the first term, then store the result directly in |matches| 205 // If this is the first term, then store the result directly in |matches|
240 // to avoid calling stl intersection (which requires a copy). 206 // to avoid calling stl intersection (which requires a copy).
241 NodeSet* prefix_matches = first_term ? matches : &tmp_prefix_matches; 207 TitledUrlNodeSet* prefix_matches =
208 first_term ? matches : &tmp_prefix_matches;
242 while (i != index_.end() && 209 while (i != index_.end() &&
243 i->first.size() >= term.size() && 210 i->first.size() >= term.size() &&
244 term.compare(0, term.size(), i->first, 0, term.size()) == 0) { 211 term.compare(0, term.size(), i->first, 0, term.size()) == 0) {
245 #if !defined(OS_ANDROID) 212 #if !defined(OS_ANDROID)
246 prefix_matches->insert(i->second.begin(), i->second.end()); 213 prefix_matches->insert(i->second.begin(), i->second.end());
247 #else 214 #else
248 // Work around a bug in the implementation of std::set::insert in the STL 215 // Work around a bug in the implementation of std::set::insert in the STL
249 // used on android (http://crbug.com/367050). 216 // used on android (http://crbug.com/367050).
250 for (NodeSet::const_iterator n = i->second.begin(); n != i->second.end(); 217 for (TitledUrlNodeSet::const_iterator n = i->second.begin();
218 n != i->second.end();
251 ++n) 219 ++n)
252 prefix_matches->insert(prefix_matches->end(), *n); 220 prefix_matches->insert(prefix_matches->end(), *n);
253 #endif 221 #endif
254 ++i; 222 ++i;
255 } 223 }
256 if (!first_term) 224 if (!first_term) {
257 *matches = base::STLSetIntersection<NodeSet>(*prefix_matches, *matches); 225 *matches =
226 base::STLSetIntersection<TitledUrlNodeSet>(*prefix_matches, *matches);
227 }
258 } 228 }
259 return !matches->empty(); 229 return !matches->empty();
260 } 230 }
261 231
262 std::vector<base::string16> BookmarkIndex::ExtractQueryWords( 232 std::vector<base::string16> BookmarkIndex::ExtractQueryWords(
263 const base::string16& query) { 233 const base::string16& query) {
264 std::vector<base::string16> terms; 234 std::vector<base::string16> terms;
265 if (query.empty()) 235 if (query.empty())
266 return std::vector<base::string16>(); 236 return std::vector<base::string16>();
267 query_parser::QueryParser parser; 237 query_parser::QueryParser parser;
268 parser.ParseQueryWords(base::i18n::ToLower(query), 238 parser.ParseQueryWords(base::i18n::ToLower(query),
269 query_parser::MatchingAlgorithm::DEFAULT, 239 query_parser::MatchingAlgorithm::DEFAULT,
270 &terms); 240 &terms);
271 return terms; 241 return terms;
272 } 242 }
273 243
274 void BookmarkIndex::RegisterNode(const base::string16& term, 244 void BookmarkIndex::RegisterNode(const base::string16& term,
275 const BookmarkNode* node) { 245 const TitledUrlNode* node) {
276 index_[term].insert(node); 246 index_[term].insert(node);
277 } 247 }
278 248
279 void BookmarkIndex::UnregisterNode(const base::string16& term, 249 void BookmarkIndex::UnregisterNode(const base::string16& term,
280 const BookmarkNode* node) { 250 const TitledUrlNode* node) {
281 Index::iterator i = index_.find(term); 251 Index::iterator i = index_.find(term);
282 if (i == index_.end()) { 252 if (i == index_.end()) {
283 // We can get here if the node has the same term more than once. For 253 // We can get here if the node has the same term more than once. For
284 // example, a bookmark with the title 'foo foo' would end up here. 254 // example, a node with the title 'foo foo' would end up here.
285 return; 255 return;
286 } 256 }
287 i->second.erase(node); 257 i->second.erase(node);
288 if (i->second.empty()) 258 if (i->second.empty())
289 index_.erase(i); 259 index_.erase(i);
290 } 260 }
291 261
292 } // namespace bookmarks 262 } // namespace bookmarks
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698