components/bookmarks/browser/bookmark_index.cc - Issue 2537223008: Add TitledUrlIndex for indexing arbitrary title/URL pairs

Side by Side Diff: components/bookmarks/browser/bookmark_index.cc

Issue 2537223008: Add TitledUrlIndex for indexing arbitrary title/URL pairs (Closed)

Patch Set: add TypedCountSorter, typedef -> using Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « components/bookmarks/browser/bookmark_index.h ('k') | components/bookmarks/browser/bookmark_match.h » ('j') | components/bookmarks/browser/typed_count_sorter.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/bookmarks/browser/bookmark_index.h"	5 #include "components/bookmarks/browser/bookmark_index.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <algorithm>

10 #include <functional>

11 #include <iterator>

12 #include <list>

13

14 #include "base/i18n/case_conversion.h"	9 #include "base/i18n/case_conversion.h"

15 #include "base/logging.h"	10 #include "base/logging.h"

16 #include "base/stl_util.h"	11 #include "base/stl_util.h"

17 #include "base/strings/utf_offset_string_conversions.h"	12 #include "base/strings/utf_offset_string_conversions.h"

18 #include "build/build_config.h"	13 #include "build/build_config.h"

19 #include "components/bookmarks/browser/bookmark_client.h"

20 #include "components/bookmarks/browser/bookmark_match.h"	14 #include "components/bookmarks/browser/bookmark_match.h"

21 #include "components/bookmarks/browser/bookmark_node.h"

22 #include "components/bookmarks/browser/bookmark_utils.h"	15 #include "components/bookmarks/browser/bookmark_utils.h"

	16 #include "components/bookmarks/browser/titled_url_node.h"

	17 #include "components/bookmarks/browser/titled_url_node_sorter.h"

23 #include "components/query_parser/snippet.h"	18 #include "components/query_parser/snippet.h"

24 #include "third_party/icu/source/common/unicode/normalizer2.h"	19 #include "third_party/icu/source/common/unicode/normalizer2.h"

25 #include "third_party/icu/source/common/unicode/utypes.h"	20 #include "third_party/icu/source/common/unicode/utypes.h"

26	21

27 namespace bookmarks {	22 namespace bookmarks {

28	23

29 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair;

30 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs;

31

32 namespace {	24 namespace {

33	25

34 // Returns a normalized version of the UTF16 string \|text\|. If it fails to	26 // Returns a normalized version of the UTF16 string \|text\|. If it fails to

35 // normalize the string, returns \|text\| itself as a best-effort.	27 // normalize the string, returns \|text\| itself as a best-effort.

36 base::string16 Normalize(const base::string16& text) {	28 base::string16 Normalize(const base::string16& text) {

37 UErrorCode status = U_ZERO_ERROR;	29 UErrorCode status = U_ZERO_ERROR;

38 const icu::Normalizer2* normalizer2 =	30 const icu::Normalizer2* normalizer2 =

39 icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status);	31 icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status);

40 if (U_FAILURE(status)) {	32 if (U_FAILURE(status)) {

41 // Log and crash right away to capture the error code in the crash report.	33 // Log and crash right away to capture the error code in the crash report.

42 LOG(FATAL) << "failed to create a normalizer: " << u_errorName(status);	34 LOG(FATAL) << "failed to create a normalizer: " << u_errorName(status);

43 }	35 }

44 icu::UnicodeString unicode_text(	36 icu::UnicodeString unicode_text(

45 text.data(), static_cast<int32_t>(text.length()));	37 text.data(), static_cast<int32_t>(text.length()));

46 icu::UnicodeString unicode_normalized_text;	38 icu::UnicodeString unicode_normalized_text;

47 normalizer2->normalize(unicode_text, unicode_normalized_text, status);	39 normalizer2->normalize(unicode_text, unicode_normalized_text, status);

48 if (U_FAILURE(status)) {	40 if (U_FAILURE(status)) {

49 // This should not happen. Log the error and fall back.	41 // This should not happen. Log the error and fall back.

50 LOG(ERROR) << "normalization failed: " << u_errorName(status);	42 LOG(ERROR) << "normalization failed: " << u_errorName(status);

51 return text;	43 return text;

52 }	44 }

53 return base::string16(unicode_normalized_text.getBuffer(),	45 return base::string16(unicode_normalized_text.getBuffer(),

54 unicode_normalized_text.length());	46 unicode_normalized_text.length());

55 }	47 }

56	48

57 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed

58 // count so that the best matches will always be added to the results.

59 struct NodeTypedCountPairSortFunctor {

60 bool operator()(const NodeTypedCountPair& a,

61 const NodeTypedCountPair& b) const {

62 return a.second > b.second;

63 }

64 };

65

66 // Extract the const Node* stored in a BookmarkClient::NodeTypedCountPair.

67 struct NodeTypedCountPairExtractNodeFunctor {

68 const BookmarkNode* operator()(const NodeTypedCountPair& pair) const {

69 return pair.first;

70 }

71 };

72

73 } // namespace	49 } // namespace

74	50

75 BookmarkIndex::BookmarkIndex(BookmarkClient* client)	51 BookmarkIndex::BookmarkIndex(std::unique_ptr<TitledUrlNodeSorter> sorter)

76 : client_(client) {	52 : sorter_(std::move(sorter)) {

77 DCHECK(client_);

78 }	53 }

79	54

80 BookmarkIndex::~BookmarkIndex() {	55 BookmarkIndex::~BookmarkIndex() {

81 }	56 }

82	57

83 void BookmarkIndex::Add(const BookmarkNode* node) {	58 void BookmarkIndex::Add(const TitledUrlNode* node) {

84 if (!node->is_url())	59 if (!node->GetTitledUrlNodeUrl().is_valid())

85 return;	60 return;

86 std::vector<base::string16> terms =	61 std::vector<base::string16> terms =

87 ExtractQueryWords(Normalize(node->GetTitle()));	62 ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()));

88 for (size_t i = 0; i < terms.size(); ++i)	63 for (size_t i = 0; i < terms.size(); ++i)

89 RegisterNode(terms[i], node);	64 RegisterNode(terms[i], node);

90 terms =	65 terms = ExtractQueryWords(

91 ExtractQueryWords(CleanUpUrlForMatching(node->url(), nullptr));	66 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr));

92 for (size_t i = 0; i < terms.size(); ++i)	67 for (size_t i = 0; i < terms.size(); ++i)

93 RegisterNode(terms[i], node);	68 RegisterNode(terms[i], node);

94 }	69 }

95	70

96 void BookmarkIndex::Remove(const BookmarkNode* node) {	71 void BookmarkIndex::Remove(const TitledUrlNode* node) {

97 if (!node->is_url())	72 if (!node->GetTitledUrlNodeUrl().is_valid())

98 return;	73 return;

99	74

100 std::vector<base::string16> terms =	75 std::vector<base::string16> terms =

101 ExtractQueryWords(Normalize(node->GetTitle()));	76 ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()));

102 for (size_t i = 0; i < terms.size(); ++i)	77 for (size_t i = 0; i < terms.size(); ++i)

103 UnregisterNode(terms[i], node);	78 UnregisterNode(terms[i], node);

104 terms =	79 terms = ExtractQueryWords(

105 ExtractQueryWords(CleanUpUrlForMatching(node->url(), nullptr));	80 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr));

106 for (size_t i = 0; i < terms.size(); ++i)	81 for (size_t i = 0; i < terms.size(); ++i)

107 UnregisterNode(terms[i], node);	82 UnregisterNode(terms[i], node);

108 }	83 }

109	84

110 void BookmarkIndex::GetBookmarksMatching(	85 void BookmarkIndex::GetResultsMatching(

111 const base::string16& input_query,	86 const base::string16& input_query,

112 size_t max_count,	87 size_t max_count,

113 query_parser::MatchingAlgorithm matching_algorithm,	88 query_parser::MatchingAlgorithm matching_algorithm,

114 std::vector<BookmarkMatch>* results) {	89 std::vector<BookmarkMatch>* results) {

115 const base::string16 query = Normalize(input_query);	90 const base::string16 query = Normalize(input_query);

116 std::vector<base::string16> terms = ExtractQueryWords(query);	91 std::vector<base::string16> terms = ExtractQueryWords(query);

117 if (terms.empty())	92 if (terms.empty())

118 return;	93 return;

119	94

120 NodeSet matches;	95 TitledUrlNodeSet matches;

121 for (size_t i = 0; i < terms.size(); ++i) {	96 for (size_t i = 0; i < terms.size(); ++i) {

122 if (!GetBookmarksMatchingTerm(	97 if (!GetResultsMatchingTerm(terms[i], i == 0, matching_algorithm,

123 terms[i], i == 0, matching_algorithm, &matches)) {	98 &matches)) {

124 return;	99 return;

125 }	100 }

126 }	101 }

127	102

128 Nodes sorted_nodes;	103 TitledUrlNodes sorted_nodes;

129 SortMatches(matches, &sorted_nodes);	104 SortMatches(matches, &sorted_nodes);

130	105

131 // We use a QueryParser to fill in match positions for us. It's not the most	106 // We use a QueryParser to fill in match positions for us. It's not the most

132 // efficient way to go about this, but by the time we get here we know what	107 // efficient way to go about this, but by the time we get here we know what

133 // matches and so this shouldn't be performance critical.	108 // matches and so this shouldn't be performance critical.

134 query_parser::QueryParser parser;	109 query_parser::QueryParser parser;

135 query_parser::QueryNodeVector query_nodes;	110 query_parser::QueryNodeVector query_nodes;

136 parser.ParseQueryNodes(query, matching_algorithm, &query_nodes);	111 parser.ParseQueryNodes(query, matching_algorithm, &query_nodes);

137	112

138 // The highest typed counts should be at the beginning of the results vector	113 // The highest typed counts should be at the beginning of the results vector

139 // so that the best matches will always be included in the results. The loop	114 // so that the best matches will always be included in the results. The loop

140 // that calculates result relevance in HistoryContentsProvider::ConvertResults	115 // that calculates result relevance in HistoryContentsProvider::ConvertResults

141 // will run backwards to assure higher relevance will be attributed to the	116 // will run backwards to assure higher relevance will be attributed to the

142 // best matches.	117 // best matches.

143 for (Nodes::const_iterator i = sorted_nodes.begin();	118 for (TitledUrlNodes::const_iterator i = sorted_nodes.begin();

144 i != sorted_nodes.end() && results->size() < max_count;	119 i != sorted_nodes.end() && results->size() < max_count;

145 ++i)	120 ++i)

146 AddMatchToResults(*i, &parser, query_nodes, results);	121 AddMatchToResults(*i, &parser, query_nodes, results);

147 }	122 }

148	123

149 void BookmarkIndex::SortMatches(const NodeSet& matches,	124 void BookmarkIndex::SortMatches(const TitledUrlNodeSet& matches,

150 Nodes* sorted_nodes) const {	125 TitledUrlNodes* sorted_nodes) const {

151 sorted_nodes->reserve(matches.size());	126 if (sorter_) {

152 if (client_->SupportsTypedCountForNodes()) {	127 sorter_->SortMatches(matches, sorted_nodes);

153 NodeTypedCountPairs node_typed_counts;

154 client_->GetTypedCountForNodes(matches, &node_typed_counts);

155 std::sort(node_typed_counts.begin(),

156 node_typed_counts.end(),

157 NodeTypedCountPairSortFunctor());

158 std::transform(node_typed_counts.begin(),

159 node_typed_counts.end(),

160 std::back_inserter(*sorted_nodes),

161 NodeTypedCountPairExtractNodeFunctor());

162 } else {	128 } else {

163 sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end());	129 sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end());

164 }	130 }

165 }	131 }

166	132

167 void BookmarkIndex::AddMatchToResults(	133 void BookmarkIndex::AddMatchToResults(

168 const BookmarkNode* node,	134 const TitledUrlNode* node,

169 query_parser::QueryParser* parser,	135 query_parser::QueryParser* parser,

170 const query_parser::QueryNodeVector& query_nodes,	136 const query_parser::QueryNodeVector& query_nodes,

171 std::vector<BookmarkMatch>* results) {	137 std::vector<BookmarkMatch>* results) {

172 // Check that the result matches the query. The previous search	138 // Check that the result matches the query. The previous search

173 // was a simple per-word search, while the more complex matching	139 // was a simple per-word search, while the more complex matching

174 // of QueryParser may filter it out. For example, the query	140 // of QueryParser may filter it out. For example, the query

175 // ["thi"] will match the bookmark titled [Thinking], but since	141 // ["thi"] will match the title [Thinking], but since

176 // ["thi"] is quoted we don't want to do a prefix match.	142 // ["thi"] is quoted we don't want to do a prefix match.

177 query_parser::QueryWordVector title_words, url_words;	143 query_parser::QueryWordVector title_words, url_words;

178 const base::string16 lower_title =	144 const base::string16 lower_title =

179 base::i18n::ToLower(Normalize(node->GetTitle()));	145 base::i18n::ToLower(Normalize(node->GetTitledUrlNodeTitle()));

180 parser->ExtractQueryWords(lower_title, &title_words);	146 parser->ExtractQueryWords(lower_title, &title_words);

181 base::OffsetAdjuster::Adjustments adjustments;	147 base::OffsetAdjuster::Adjustments adjustments;

182 parser->ExtractQueryWords(	148 parser->ExtractQueryWords(

183 CleanUpUrlForMatching(node->url(), &adjustments),	149 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), &adjustments),

184 &url_words);	150 &url_words);

185 query_parser::Snippet::MatchPositions title_matches, url_matches;	151 query_parser::Snippet::MatchPositions title_matches, url_matches;

186 for (const auto& node : query_nodes) {	152 for (const auto& node : query_nodes) {

187 const bool has_title_matches =	153 const bool has_title_matches =

188 node->HasMatchIn(title_words, &title_matches);	154 node->HasMatchIn(title_words, &title_matches);

189 const bool has_url_matches = node->HasMatchIn(url_words, &url_matches);	155 const bool has_url_matches = node->HasMatchIn(url_words, &url_matches);

190 if (!has_title_matches && !has_url_matches)	156 if (!has_title_matches && !has_url_matches)

191 return;	157 return;

192 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches);	158 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches);

193 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches);	159 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches);

194 }	160 }

195 BookmarkMatch match;	161 BookmarkMatch match;

196 if (lower_title.length() == node->GetTitle().length()) {	162 if (lower_title.length() == node->GetTitledUrlNodeTitle().length()) {

197 // Only use title matches if the lowercase string is the same length	163 // Only use title matches if the lowercase string is the same length

198 // as the original string, otherwise the matches are meaningless.	164 // as the original string, otherwise the matches are meaningless.

199 // TODO(mpearson): revise match positions appropriately.	165 // TODO(mpearson): revise match positions appropriately.

200 match.title_match_positions.swap(title_matches);	166 match.title_match_positions.swap(title_matches);

201 }	167 }

202 // Now that we're done processing this entry, correct the offsets of the	168 // Now that we're done processing this entry, correct the offsets of the

203 // matches in \|url_matches\| so they point to offsets in the original URL	169 // matches in \|url_matches\| so they point to offsets in the original URL

204 // spec, not the cleaned-up URL string that we used for matching.	170 // spec, not the cleaned-up URL string that we used for matching.

205 std::vector<size_t> offsets =	171 std::vector<size_t> offsets =

206 BookmarkMatch::OffsetsFromMatchPositions(url_matches);	172 BookmarkMatch::OffsetsFromMatchPositions(url_matches);

207 base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);	173 base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);

208 url_matches =	174 url_matches =

209 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets);	175 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets);

210 match.url_match_positions.swap(url_matches);	176 match.url_match_positions.swap(url_matches);

211 match.node = node;	177 match.node = node;

212 results->push_back(match);	178 results->push_back(match);

213 }	179 }

214	180

215 bool BookmarkIndex::GetBookmarksMatchingTerm(	181 bool BookmarkIndex::GetResultsMatchingTerm(

216 const base::string16& term,	182 const base::string16& term,

217 bool first_term,	183 bool first_term,

218 query_parser::MatchingAlgorithm matching_algorithm,	184 query_parser::MatchingAlgorithm matching_algorithm,

219 NodeSet* matches) {	185 TitledUrlNodeSet* matches) {

220 Index::const_iterator i = index_.lower_bound(term);	186 Index::const_iterator i = index_.lower_bound(term);

221 if (i == index_.end())	187 if (i == index_.end())

222 return false;	188 return false;

223	189

224 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(	190 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(

225 term, matching_algorithm)) {	191 term, matching_algorithm)) {

226 // Term is too short for prefix match, compare using exact match.	192 // Term is too short for prefix match, compare using exact match.

227 if (i->first != term)	193 if (i->first != term)

228 return false; // No bookmarks with this term.	194 return false; // No title/URL pairs with this term.

229	195

230 if (first_term) {	196 if (first_term) {

231 (*matches) = i->second;	197 (*matches) = i->second;

232 return true;	198 return true;

233 }	199 }

234 matches = base::STLSetIntersection<NodeSet>(i->second, matches);	200 matches = base::STLSetIntersection<TitledUrlNodeSet>(i->second, matches);

235 } else {	201 } else {

236 // Loop through index adding all entries that start with term to	202 // Loop through index adding all entries that start with term to

237 // \|prefix_matches\|.	203 // \|prefix_matches\|.

238 NodeSet tmp_prefix_matches;	204 TitledUrlNodeSet tmp_prefix_matches;

239 // If this is the first term, then store the result directly in \|matches\|	205 // If this is the first term, then store the result directly in \|matches\|

240 // to avoid calling stl intersection (which requires a copy).	206 // to avoid calling stl intersection (which requires a copy).

241 NodeSet* prefix_matches = first_term ? matches : &tmp_prefix_matches;	207 TitledUrlNodeSet* prefix_matches =

	208 first_term ? matches : &tmp_prefix_matches;

242 while (i != index_.end() &&	209 while (i != index_.end() &&

243 i->first.size() >= term.size() &&	210 i->first.size() >= term.size() &&

244 term.compare(0, term.size(), i->first, 0, term.size()) == 0) {	211 term.compare(0, term.size(), i->first, 0, term.size()) == 0) {

245 #if !defined(OS_ANDROID)	212 #if !defined(OS_ANDROID)

246 prefix_matches->insert(i->second.begin(), i->second.end());	213 prefix_matches->insert(i->second.begin(), i->second.end());

247 #else	214 #else

248 // Work around a bug in the implementation of std::set::insert in the STL	215 // Work around a bug in the implementation of std::set::insert in the STL

249 // used on android (http://crbug.com/367050).	216 // used on android (http://crbug.com/367050).

250 for (NodeSet::const_iterator n = i->second.begin(); n != i->second.end();	217 for (TitledUrlNodeSet::const_iterator n = i->second.begin();

	218 n != i->second.end();

251 ++n)	219 ++n)

252 prefix_matches->insert(prefix_matches->end(), *n);	220 prefix_matches->insert(prefix_matches->end(), *n);

253 #endif	221 #endif

254 ++i;	222 ++i;

255 }	223 }

256 if (!first_term)	224 if (!first_term) {

257 matches = base::STLSetIntersection<NodeSet>(prefix_matches, *matches);	225 *matches =

	226 base::STLSetIntersection<TitledUrlNodeSet>(prefix_matches, matches);

	227 }

258 }	228 }

259 return !matches->empty();	229 return !matches->empty();

260 }	230 }

261	231

262 std::vector<base::string16> BookmarkIndex::ExtractQueryWords(	232 std::vector<base::string16> BookmarkIndex::ExtractQueryWords(

263 const base::string16& query) {	233 const base::string16& query) {

264 std::vector<base::string16> terms;	234 std::vector<base::string16> terms;

265 if (query.empty())	235 if (query.empty())

266 return std::vector<base::string16>();	236 return std::vector<base::string16>();

267 query_parser::QueryParser parser;	237 query_parser::QueryParser parser;

268 parser.ParseQueryWords(base::i18n::ToLower(query),	238 parser.ParseQueryWords(base::i18n::ToLower(query),

269 query_parser::MatchingAlgorithm::DEFAULT,	239 query_parser::MatchingAlgorithm::DEFAULT,

270 &terms);	240 &terms);

271 return terms;	241 return terms;

272 }	242 }

273	243

274 void BookmarkIndex::RegisterNode(const base::string16& term,	244 void BookmarkIndex::RegisterNode(const base::string16& term,

275 const BookmarkNode* node) {	245 const TitledUrlNode* node) {

276 index_[term].insert(node);	246 index_[term].insert(node);

277 }	247 }

278	248

279 void BookmarkIndex::UnregisterNode(const base::string16& term,	249 void BookmarkIndex::UnregisterNode(const base::string16& term,

280 const BookmarkNode* node) {	250 const TitledUrlNode* node) {

281 Index::iterator i = index_.find(term);	251 Index::iterator i = index_.find(term);

282 if (i == index_.end()) {	252 if (i == index_.end()) {

283 // We can get here if the node has the same term more than once. For	253 // We can get here if the node has the same term more than once. For

284 // example, a bookmark with the title 'foo foo' would end up here.	254 // example, a node with the title 'foo foo' would end up here.

285 return;	255 return;

286 }	256 }

287 i->second.erase(node);	257 i->second.erase(node);

288 if (i->second.empty())	258 if (i->second.empty())

289 index_.erase(i);	259 index_.erase(i);

290 }	260 }

291	261

292 } // namespace bookmarks	262 } // namespace bookmarks

OLD	NEW