components/bookmarks/browser/bookmark_index.cc - Issue 2537223008: Add TitledUrlIndex for indexing arbitrary title/URL pairs

Side by Side Diff: components/bookmarks/browser/bookmark_index.cc

Issue 2537223008: Add TitledUrlIndex for indexing arbitrary title/URL pairs (Closed)

Patch Set: refactor in-place to preserve history Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« components/bookmarks/browser/bookmark_index.h ('K') | « components/bookmarks/browser/bookmark_index.h ('k') | components/bookmarks/browser/bookmark_index_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/bookmarks/browser/bookmark_index.h"	5 #include "components/bookmarks/browser/bookmark_index.h"

6	6

7 #include <stdint.h>	7 #include <stdint.h>

8	8

9 #include <algorithm>

10 #include <functional>

11 #include <iterator>

12 #include <list>

13

14 #include "base/i18n/case_conversion.h"	9 #include "base/i18n/case_conversion.h"

15 #include "base/logging.h"	10 #include "base/logging.h"

16 #include "base/stl_util.h"	11 #include "base/stl_util.h"

17 #include "base/strings/utf_offset_string_conversions.h"	12 #include "base/strings/utf_offset_string_conversions.h"

18 #include "build/build_config.h"	13 #include "build/build_config.h"

19 #include "components/bookmarks/browser/bookmark_client.h"	14 #include "components/bookmarks/browser/bookmark_client.h"

20 #include "components/bookmarks/browser/bookmark_match.h"	15 #include "components/bookmarks/browser/bookmark_match.h"

21 #include "components/bookmarks/browser/bookmark_node.h"

22 #include "components/bookmarks/browser/bookmark_utils.h"	16 #include "components/bookmarks/browser/bookmark_utils.h"

	17 #include "components/bookmarks/browser/titled_url_node.h"

23 #include "components/query_parser/snippet.h"	18 #include "components/query_parser/snippet.h"

24 #include "third_party/icu/source/common/unicode/normalizer2.h"	19 #include "third_party/icu/source/common/unicode/normalizer2.h"

25 #include "third_party/icu/source/common/unicode/utypes.h"	20 #include "third_party/icu/source/common/unicode/utypes.h"

26	21

27 namespace bookmarks {	22 namespace bookmarks {

28	23

29 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair;	24 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair;

30 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs;	25 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs;

31	26

32 namespace {	27 namespace {

(...skipping 23 matching lines...) Expand all Loading...
56	51

57 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed	52 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed

58 // count so that the best matches will always be added to the results.	53 // count so that the best matches will always be added to the results.

59 struct NodeTypedCountPairSortFunctor {	54 struct NodeTypedCountPairSortFunctor {

60 bool operator()(const NodeTypedCountPair& a,	55 bool operator()(const NodeTypedCountPair& a,

61 const NodeTypedCountPair& b) const {	56 const NodeTypedCountPair& b) const {

62 return a.second > b.second;	57 return a.second > b.second;

63 }	58 }

64 };	59 };

65	60

66 // Extract the const Node* stored in a BookmarkClient::NodeTypedCountPair.	61 // Extract the const TitledUrlNode* stored in a NodeTypedCountPair.

67 struct NodeTypedCountPairExtractNodeFunctor {	62 struct NodeTypedCountPairExtractNodeFunctor {

68 const BookmarkNode* operator()(const NodeTypedCountPair& pair) const {	63 const TitledUrlNode* operator()(const NodeTypedCountPair& pair) const {

69 return pair.first;	64 return pair.first;

70 }	65 }

71 };	66 };

72	67

73 } // namespace	68 } // namespace

74	69

75 BookmarkIndex::BookmarkIndex(BookmarkClient* client)	70 BookmarkIndex::BookmarkIndex(BookmarkClient* client)

76 : client_(client) {	71 : client_(client) {

77 DCHECK(client_);	72 DCHECK(client_);

78 }	73 }

79	74

80 BookmarkIndex::~BookmarkIndex() {	75 BookmarkIndex::~BookmarkIndex() {

81 }	76 }

82	77

83 void BookmarkIndex::Add(const BookmarkNode* node) {	78 void BookmarkIndex::Add(const TitledUrlNode* node) {

84 if (!node->is_url())	79 if (!node->GetTitledUrlNodeUrl().is_valid())

85 return;	80 return;

86 std::vector<base::string16> terms =	81 std::vector<base::string16> terms =

87 ExtractQueryWords(Normalize(node->GetTitle()));	82 ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()));

88 for (size_t i = 0; i < terms.size(); ++i)	83 for (size_t i = 0; i < terms.size(); ++i)

89 RegisterNode(terms[i], node);	84 RegisterNode(terms[i], node);

90 terms =	85 terms = ExtractQueryWords(

91 ExtractQueryWords(CleanUpUrlForMatching(node->url(), nullptr));	86 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr));

92 for (size_t i = 0; i < terms.size(); ++i)	87 for (size_t i = 0; i < terms.size(); ++i)

93 RegisterNode(terms[i], node);	88 RegisterNode(terms[i], node);

94 }	89 }

95	90

96 void BookmarkIndex::Remove(const BookmarkNode* node) {	91 void BookmarkIndex::Remove(const TitledUrlNode* node) {

97 if (!node->is_url())	92 if (!node->GetTitledUrlNodeUrl().is_valid())

98 return;	93 return;

99	94

100 std::vector<base::string16> terms =	95 std::vector<base::string16> terms =

101 ExtractQueryWords(Normalize(node->GetTitle()));	96 ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()));

102 for (size_t i = 0; i < terms.size(); ++i)	97 for (size_t i = 0; i < terms.size(); ++i)

103 UnregisterNode(terms[i], node);	98 UnregisterNode(terms[i], node);

104 terms =	99 terms = ExtractQueryWords(

105 ExtractQueryWords(CleanUpUrlForMatching(node->url(), nullptr));	100 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr));

106 for (size_t i = 0; i < terms.size(); ++i)	101 for (size_t i = 0; i < terms.size(); ++i)

107 UnregisterNode(terms[i], node);	102 UnregisterNode(terms[i], node);

108 }	103 }

109	104

110 void BookmarkIndex::GetBookmarksMatching(	105 void BookmarkIndex::GetResultsMatching(

111 const base::string16& input_query,	106 const base::string16& input_query,

112 size_t max_count,	107 size_t max_count,

113 query_parser::MatchingAlgorithm matching_algorithm,	108 query_parser::MatchingAlgorithm matching_algorithm,

114 std::vector<BookmarkMatch>* results) {	109 std::vector<BookmarkMatch>* results) {

115 const base::string16 query = Normalize(input_query);	110 const base::string16 query = Normalize(input_query);

116 std::vector<base::string16> terms = ExtractQueryWords(query);	111 std::vector<base::string16> terms = ExtractQueryWords(query);

117 if (terms.empty())	112 if (terms.empty())

118 return;	113 return;

119	114

120 NodeSet matches;	115 NodeSet matches;

121 for (size_t i = 0; i < terms.size(); ++i) {	116 for (size_t i = 0; i < terms.size(); ++i) {

122 if (!GetBookmarksMatchingTerm(	117 if (!GetResultsMatchingTerm(terms[i], i == 0, matching_algorithm,

123 terms[i], i == 0, matching_algorithm, &matches)) {	118 &matches)) {

124 return;	119 return;

125 }	120 }

126 }	121 }

127	122

128 Nodes sorted_nodes;	123 Nodes sorted_nodes;

129 SortMatches(matches, &sorted_nodes);	124 SortMatches(matches, &sorted_nodes);

130	125

131 // We use a QueryParser to fill in match positions for us. It's not the most	126 // We use a QueryParser to fill in match positions for us. It's not the most

132 // efficient way to go about this, but by the time we get here we know what	127 // efficient way to go about this, but by the time we get here we know what

133 // matches and so this shouldn't be performance critical.	128 // matches and so this shouldn't be performance critical.

(...skipping 24 matching lines...) Expand all Loading...
158 std::transform(node_typed_counts.begin(),	153 std::transform(node_typed_counts.begin(),

159 node_typed_counts.end(),	154 node_typed_counts.end(),

160 std::back_inserter(*sorted_nodes),	155 std::back_inserter(*sorted_nodes),

161 NodeTypedCountPairExtractNodeFunctor());	156 NodeTypedCountPairExtractNodeFunctor());

162 } else {	157 } else {

163 sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end());	158 sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end());

164 }	159 }

165 }	160 }

166	161

167 void BookmarkIndex::AddMatchToResults(	162 void BookmarkIndex::AddMatchToResults(

168 const BookmarkNode* node,	163 const TitledUrlNode* node,

169 query_parser::QueryParser* parser,	164 query_parser::QueryParser* parser,

170 const query_parser::QueryNodeVector& query_nodes,	165 const query_parser::QueryNodeVector& query_nodes,

171 std::vector<BookmarkMatch>* results) {	166 std::vector<BookmarkMatch>* results) {

172 // Check that the result matches the query. The previous search	167 // Check that the result matches the query. The previous search

173 // was a simple per-word search, while the more complex matching	168 // was a simple per-word search, while the more complex matching

174 // of QueryParser may filter it out. For example, the query	169 // of QueryParser may filter it out. For example, the query

175 // ["thi"] will match the bookmark titled [Thinking], but since	170 // ["thi"] will match the title [Thinking], but since

176 // ["thi"] is quoted we don't want to do a prefix match.	171 // ["thi"] is quoted we don't want to do a prefix match.

177 query_parser::QueryWordVector title_words, url_words;	172 query_parser::QueryWordVector title_words, url_words;

178 const base::string16 lower_title =	173 const base::string16 lower_title =

179 base::i18n::ToLower(Normalize(node->GetTitle()));	174 base::i18n::ToLower(Normalize(node->GetTitledUrlNodeTitle()));

180 parser->ExtractQueryWords(lower_title, &title_words);	175 parser->ExtractQueryWords(lower_title, &title_words);

181 base::OffsetAdjuster::Adjustments adjustments;	176 base::OffsetAdjuster::Adjustments adjustments;

182 parser->ExtractQueryWords(	177 parser->ExtractQueryWords(

183 CleanUpUrlForMatching(node->url(), &adjustments),	178 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), &adjustments),

184 &url_words);	179 &url_words);

185 query_parser::Snippet::MatchPositions title_matches, url_matches;	180 query_parser::Snippet::MatchPositions title_matches, url_matches;

186 for (const auto& node : query_nodes) {	181 for (const auto& node : query_nodes) {

187 const bool has_title_matches =	182 const bool has_title_matches =

188 node->HasMatchIn(title_words, &title_matches);	183 node->HasMatchIn(title_words, &title_matches);

189 const bool has_url_matches = node->HasMatchIn(url_words, &url_matches);	184 const bool has_url_matches = node->HasMatchIn(url_words, &url_matches);

190 if (!has_title_matches && !has_url_matches)	185 if (!has_title_matches && !has_url_matches)

191 return;	186 return;

192 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches);	187 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches);

193 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches);	188 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches);

194 }	189 }

195 BookmarkMatch match;	190 BookmarkMatch match;

196 if (lower_title.length() == node->GetTitle().length()) {	191 if (lower_title.length() == node->GetTitledUrlNodeTitle().length()) {

197 // Only use title matches if the lowercase string is the same length	192 // Only use title matches if the lowercase string is the same length

198 // as the original string, otherwise the matches are meaningless.	193 // as the original string, otherwise the matches are meaningless.

199 // TODO(mpearson): revise match positions appropriately.	194 // TODO(mpearson): revise match positions appropriately.

200 match.title_match_positions.swap(title_matches);	195 match.title_match_positions.swap(title_matches);

201 }	196 }

202 // Now that we're done processing this entry, correct the offsets of the	197 // Now that we're done processing this entry, correct the offsets of the

203 // matches in \|url_matches\| so they point to offsets in the original URL	198 // matches in \|url_matches\| so they point to offsets in the original URL

204 // spec, not the cleaned-up URL string that we used for matching.	199 // spec, not the cleaned-up URL string that we used for matching.

205 std::vector<size_t> offsets =	200 std::vector<size_t> offsets =

206 BookmarkMatch::OffsetsFromMatchPositions(url_matches);	201 BookmarkMatch::OffsetsFromMatchPositions(url_matches);

207 base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);	202 base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);

208 url_matches =	203 url_matches =

209 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets);	204 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets);

210 match.url_match_positions.swap(url_matches);	205 match.url_match_positions.swap(url_matches);

211 match.node = node;	206 match.node = node;

212 results->push_back(match);	207 results->push_back(match);

213 }	208 }

214	209

215 bool BookmarkIndex::GetBookmarksMatchingTerm(	210 bool BookmarkIndex::GetResultsMatchingTerm(

216 const base::string16& term,	211 const base::string16& term,

217 bool first_term,	212 bool first_term,

218 query_parser::MatchingAlgorithm matching_algorithm,	213 query_parser::MatchingAlgorithm matching_algorithm,

219 NodeSet* matches) {	214 NodeSet* matches) {

220 Index::const_iterator i = index_.lower_bound(term);	215 Index::const_iterator i = index_.lower_bound(term);

221 if (i == index_.end())	216 if (i == index_.end())

222 return false;	217 return false;

223	218

224 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(	219 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(

225 term, matching_algorithm)) {	220 term, matching_algorithm)) {

226 // Term is too short for prefix match, compare using exact match.	221 // Term is too short for prefix match, compare using exact match.

227 if (i->first != term)	222 if (i->first != term)

228 return false; // No bookmarks with this term.	223 return false; // No title/URL pairs with this term.

229	224

230 if (first_term) {	225 if (first_term) {

231 (*matches) = i->second;	226 (*matches) = i->second;

232 return true;	227 return true;

233 }	228 }

234 matches = base::STLSetIntersection<NodeSet>(i->second, matches);	229 matches = base::STLSetIntersection<NodeSet>(i->second, matches);

235 } else {	230 } else {

236 // Loop through index adding all entries that start with term to	231 // Loop through index adding all entries that start with term to

237 // \|prefix_matches\|.	232 // \|prefix_matches\|.

238 NodeSet tmp_prefix_matches;	233 NodeSet tmp_prefix_matches;

(...skipping 26 matching lines...) Expand all Loading...
265 if (query.empty())	260 if (query.empty())

266 return std::vector<base::string16>();	261 return std::vector<base::string16>();

267 query_parser::QueryParser parser;	262 query_parser::QueryParser parser;

268 parser.ParseQueryWords(base::i18n::ToLower(query),	263 parser.ParseQueryWords(base::i18n::ToLower(query),

269 query_parser::MatchingAlgorithm::DEFAULT,	264 query_parser::MatchingAlgorithm::DEFAULT,

270 &terms);	265 &terms);

271 return terms;	266 return terms;

272 }	267 }

273	268

274 void BookmarkIndex::RegisterNode(const base::string16& term,	269 void BookmarkIndex::RegisterNode(const base::string16& term,

275 const BookmarkNode* node) {	270 const TitledUrlNode* node) {

276 index_[term].insert(node);	271 index_[term].insert(node);

277 }	272 }

278	273

279 void BookmarkIndex::UnregisterNode(const base::string16& term,	274 void BookmarkIndex::UnregisterNode(const base::string16& term,

280 const BookmarkNode* node) {	275 const TitledUrlNode* node) {

281 Index::iterator i = index_.find(term);	276 Index::iterator i = index_.find(term);

282 if (i == index_.end()) {	277 if (i == index_.end()) {

283 // We can get here if the node has the same term more than once. For	278 // We can get here if the node has the same term more than once. For

284 // example, a bookmark with the title 'foo foo' would end up here.	279 // example, a node with the title 'foo foo' would end up here.

285 return;	280 return;

286 }	281 }

287 i->second.erase(node);	282 i->second.erase(node);

288 if (i->second.empty())	283 if (i->second.empty())

289 index_.erase(i);	284 index_.erase(i);

290 }	285 }

291	286

292 } // namespace bookmarks	287 } // namespace bookmarks

OLD	NEW