Chromium Code Reviews

Side by Side Diff: components/bookmarks/browser/titled_url_index.cc

Issue 2537223008: Add TitledUrlIndex for indexing arbitrary title/URL pairs (Closed)
Patch Set: Created 4 years ago
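The index maps each extracted word to the set of TitledUrlNodes containing it, and intersects those sets per query term when matching. A rough usage sketch follows, for orientation only: it assumes (from the calls made in this file) that TitledUrlNode is an interface exposing GetTitledUrlNodeTitle() and GetTitledUrlNodeUrl(); the SimpleTitledUrlNode type and Example() function below are hypothetical and are not part of this change.

// Hypothetical usage sketch -- not part of this patch. The TitledUrlNode
// accessor signatures are assumed from how the index calls them.
#include "base/strings/utf_string_conversions.h"
#include "components/bookmarks/browser/titled_url_index.h"
#include "components/bookmarks/browser/titled_url_match.h"
#include "components/bookmarks/browser/titled_url_node.h"
#include "url/gurl.h"

namespace bookmarks {

class SimpleTitledUrlNode : public TitledUrlNode {
 public:
  SimpleTitledUrlNode(const base::string16& title, const GURL& url)
      : title_(title), url_(url) {}

  // TitledUrlNode overrides (assumed signatures).
  const base::string16& GetTitledUrlNodeTitle() const override {
    return title_;
  }
  const GURL& GetTitledUrlNodeUrl() const override { return url_; }

 private:
  base::string16 title_;
  GURL url_;
};

void Example() {
  TitledUrlIndex index;
  SimpleTitledUrlNode node(base::ASCIIToUTF16("Chromium Project"),
                           GURL("https://www.chromium.org/"));
  index.Add(&node);

  // "chrom" prefix-matches "chromium"; up to 10 matches are returned with
  // title and URL match positions filled in.
  std::vector<TitledUrlMatch> matches;
  index.GetResultsMatching(base::ASCIIToUTF16("chrom"), 10,
                           query_parser::MatchingAlgorithm::DEFAULT, &matches);
}

}  // namespace bookmarks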
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/bookmarks/browser/titled_url_index.h"

#include <stdint.h>

#include "base/i18n/case_conversion.h"
#include "base/logging.h"
#include "base/stl_util.h"
#include "base/strings/utf_offset_string_conversions.h"
#include "build/build_config.h"
#include "components/bookmarks/browser/bookmark_utils.h"
#include "components/bookmarks/browser/titled_url_match.h"
#include "components/query_parser/snippet.h"
#include "third_party/icu/source/common/unicode/normalizer2.h"
#include "third_party/icu/source/common/unicode/utypes.h"

namespace bookmarks {

namespace {

// Returns a normalized version of the UTF16 string |text|. If it fails to
// normalize the string, returns |text| itself as a best-effort.
base::string16 Normalize(const base::string16& text) {
  UErrorCode status = U_ZERO_ERROR;
  const icu::Normalizer2* normalizer2 =
      icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status);
  if (U_FAILURE(status)) {
    // Log and crash right away to capture the error code in the crash report.
    LOG(FATAL) << "failed to create a normalizer: " << u_errorName(status);
  }
  icu::UnicodeString unicode_text(text.data(),
                                  static_cast<int32_t>(text.length()));
  icu::UnicodeString unicode_normalized_text;
  normalizer2->normalize(unicode_text, unicode_normalized_text, status);
  if (U_FAILURE(status)) {
    // This should not happen. Log the error and fall back.
    LOG(ERROR) << "normalization failed: " << u_errorName(status);
    return text;
  }
  return base::string16(unicode_normalized_text.getBuffer(),
                        unicode_normalized_text.length());
}

}  // namespace

TitledUrlIndex::TitledUrlIndex() {}

TitledUrlIndex::~TitledUrlIndex() {}

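// Indexes |node| under every word extracted from its normalized title and
// from its cleaned-up URL, so a query term can match either field.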
void TitledUrlIndex::Add(const TitledUrlNode* node) {
  if (!node->GetTitledUrlNodeUrl().is_valid())
    return;
  std::vector<base::string16> terms =
      ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()));
  for (size_t i = 0; i < terms.size(); ++i)
    RegisterNode(terms[i], node);
  terms = ExtractQueryWords(
      CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr));
  for (size_t i = 0; i < terms.size(); ++i)
    RegisterNode(terms[i], node);
}

void TitledUrlIndex::Remove(const TitledUrlNode* node) {
  if (!node->GetTitledUrlNodeUrl().is_valid())
    return;

  std::vector<base::string16> terms =
      ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()));
  for (size_t i = 0; i < terms.size(); ++i)
    UnregisterNode(terms[i], node);
  terms = ExtractQueryWords(
      CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr));
  for (size_t i = 0; i < terms.size(); ++i)
    UnregisterNode(terms[i], node);
}

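// Normalizes |input_query|, intersects the node sets registered for each of
// its terms, then runs QueryParser over the surviving nodes to record match
// positions, stopping once |max_count| results have been added.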
void TitledUrlIndex::GetResultsMatching(
    const base::string16& input_query,
    size_t max_count,
    query_parser::MatchingAlgorithm matching_algorithm,
    std::vector<TitledUrlMatch>* results) {
  const base::string16 query = Normalize(input_query);
  std::vector<base::string16> terms = ExtractQueryWords(query);
  if (terms.empty())
    return;

  NodeSet matches;
  for (size_t i = 0; i < terms.size(); ++i) {
    if (!GetResultsMatchingTerm(terms[i], i == 0, matching_algorithm,
                                &matches)) {
      return;
    }
  }

  Nodes sorted_nodes;
  SortMatches(matches, &sorted_nodes);

  // We use a QueryParser to fill in match positions for us. It's not the most
  // efficient way to go about this, but by the time we get here we know what
  // matches and so this shouldn't be performance critical.
  query_parser::QueryParser parser;
  query_parser::QueryNodeVector query_nodes;
  parser.ParseQueryNodes(query, matching_algorithm, &query_nodes);

  // The highest typed counts should be at the beginning of the results vector
  // so that the best matches will always be included in the results. The loop
  // that calculates result relevance in HistoryContentsProvider::ConvertResults
  // will run backwards to ensure higher relevance is attributed to the
  // best matches.
  for (Nodes::const_iterator i = sorted_nodes.begin();
       i != sorted_nodes.end() && results->size() < max_count; ++i)
    AddMatchToResults(*i, &parser, query_nodes, results);
}

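// Copies |matches| into |sorted_nodes| in NodeSet iteration order; no
// additional ordering is applied here.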
void TitledUrlIndex::SortMatches(const NodeSet& matches,
                                 Nodes* sorted_nodes) const {
  sorted_nodes->reserve(matches.size());
  sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end());
}

void TitledUrlIndex::AddMatchToResults(
    const TitledUrlNode* node,
    query_parser::QueryParser* parser,
    const query_parser::QueryNodeVector& query_nodes,
    std::vector<TitledUrlMatch>* results) {
  // Check that the result matches the query. The previous search
  // was a simple per-word search, while the more complex matching
  // of QueryParser may filter it out. For example, the query
  // ["thi"] will match the title [Thinking], but since
  // ["thi"] is quoted we don't want to do a prefix match.
  query_parser::QueryWordVector title_words, url_words;
  const base::string16 lower_title =
      base::i18n::ToLower(Normalize(node->GetTitledUrlNodeTitle()));
  parser->ExtractQueryWords(lower_title, &title_words);
  base::OffsetAdjuster::Adjustments adjustments;
  parser->ExtractQueryWords(
      CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), &adjustments),
      &url_words);
  query_parser::Snippet::MatchPositions title_matches, url_matches;
  for (const auto& query_node : query_nodes) {
    const bool has_title_matches =
        query_node->HasMatchIn(title_words, &title_matches);
    const bool has_url_matches =
        query_node->HasMatchIn(url_words, &url_matches);
    if (!has_title_matches && !has_url_matches)
      return;
    query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches);
    query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches);
  }
  TitledUrlMatch match;
  if (lower_title.length() == node->GetTitledUrlNodeTitle().length()) {
    // Only use title matches if the lowercase string is the same length
    // as the original string, otherwise the matches are meaningless.
    // TODO(mpearson): revise match positions appropriately.
    match.title_match_positions.swap(title_matches);
  }
  // Now that we're done processing this entry, correct the offsets of the
  // matches in |url_matches| so they point to offsets in the original URL
  // spec, not the cleaned-up URL string that we used for matching.
  std::vector<size_t> offsets =
      TitledUrlMatch::OffsetsFromMatchPositions(url_matches);
  base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
  url_matches =
      TitledUrlMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets);
  match.url_match_positions.swap(url_matches);
  match.node = node;
  results->push_back(match);
}

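// Collects the nodes indexed under |term|, using a prefix scan when the term
// is long enough for prefix search and an exact lookup otherwise. For the
// first term the result replaces |matches|; for later terms it is intersected
// with the existing set. Returns false when nothing matches.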
bool TitledUrlIndex::GetResultsMatchingTerm(
    const base::string16& term,
    bool first_term,
    query_parser::MatchingAlgorithm matching_algorithm,
    NodeSet* matches) {
  Index::const_iterator i = index_.lower_bound(term);
  if (i == index_.end())
    return false;

  if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(
          term, matching_algorithm)) {
    // Term is too short for prefix match, compare using exact match.
    if (i->first != term)
      return false;  // No title/URL pairs with this term.

    if (first_term) {
      (*matches) = i->second;
      return true;
    }
    *matches = base::STLSetIntersection<NodeSet>(i->second, *matches);
  } else {
    // Loop through index adding all entries that start with term to
    // |prefix_matches|.
    NodeSet tmp_prefix_matches;
    // If this is the first term, then store the result directly in |matches|
    // to avoid calling stl intersection (which requires a copy).
    NodeSet* prefix_matches = first_term ? matches : &tmp_prefix_matches;
    while (i != index_.end() && i->first.size() >= term.size() &&
           term.compare(0, term.size(), i->first, 0, term.size()) == 0) {
#if !defined(OS_ANDROID)
      prefix_matches->insert(i->second.begin(), i->second.end());
#else
      // Work around a bug in the implementation of std::set::insert in the STL
      // used on android (http://crbug.com/367050).
      for (NodeSet::const_iterator n = i->second.begin(); n != i->second.end();
           ++n)
        prefix_matches->insert(prefix_matches->end(), *n);
#endif
      ++i;
    }
    if (!first_term)
      *matches = base::STLSetIntersection<NodeSet>(*prefix_matches, *matches);
  }
  return !matches->empty();
}

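// Lower-cases |query| and splits it into individual words using QueryParser.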
std::vector<base::string16> TitledUrlIndex::ExtractQueryWords(
    const base::string16& query) {
  std::vector<base::string16> terms;
  if (query.empty())
    return std::vector<base::string16>();
  query_parser::QueryParser parser;
  parser.ParseQueryWords(base::i18n::ToLower(query),
                         query_parser::MatchingAlgorithm::DEFAULT, &terms);
  return terms;
}

void TitledUrlIndex::RegisterNode(const base::string16& term,
                                  const TitledUrlNode* node) {
  index_[term].insert(node);
}

void TitledUrlIndex::UnregisterNode(const base::string16& term,
                                    const TitledUrlNode* node) {
  Index::iterator i = index_.find(term);
  if (i == index_.end()) {
    // We can get here if the node has the same term more than once. For
    // example, a node with the title 'foo foo' would end up here.
    return;
  }
  i->second.erase(node);
  if (i->second.empty())
    index_.erase(i);
}

}  // namespace bookmarks