| OLD | NEW |
| (Empty) |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "components/bookmarks/browser/bookmark_index.h" | |
| 6 | |
| 7 #include <stdint.h> | |
| 8 | |
| 9 #include "base/i18n/case_conversion.h" | |
| 10 #include "base/logging.h" | |
| 11 #include "base/stl_util.h" | |
| 12 #include "base/strings/utf_offset_string_conversions.h" | |
| 13 #include "build/build_config.h" | |
| 14 #include "components/bookmarks/browser/bookmark_match.h" | |
| 15 #include "components/bookmarks/browser/bookmark_utils.h" | |
| 16 #include "components/bookmarks/browser/titled_url_node.h" | |
| 17 #include "components/bookmarks/browser/titled_url_node_sorter.h" | |
| 18 #include "components/query_parser/snippet.h" | |
| 19 #include "third_party/icu/source/common/unicode/normalizer2.h" | |
| 20 #include "third_party/icu/source/common/unicode/utypes.h" | |
| 21 | |
| 22 namespace bookmarks { | |
| 23 | |
| 24 namespace { | |
| 25 | |
| 26 // Returns a normalized version of the UTF16 string |text|. If it fails to | |
| 27 // normalize the string, returns |text| itself as a best-effort. | |
| 28 base::string16 Normalize(const base::string16& text) { | |
| 29 UErrorCode status = U_ZERO_ERROR; | |
| 30 const icu::Normalizer2* normalizer2 = | |
| 31 icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status); | |
| 32 if (U_FAILURE(status)) { | |
| 33 // Log and crash right away to capture the error code in the crash report. | |
| 34 LOG(FATAL) << "failed to create a normalizer: " << u_errorName(status); | |
| 35 } | |
| 36 icu::UnicodeString unicode_text( | |
| 37 text.data(), static_cast<int32_t>(text.length())); | |
| 38 icu::UnicodeString unicode_normalized_text; | |
| 39 normalizer2->normalize(unicode_text, unicode_normalized_text, status); | |
| 40 if (U_FAILURE(status)) { | |
| 41 // This should not happen. Log the error and fall back. | |
| 42 LOG(ERROR) << "normalization failed: " << u_errorName(status); | |
| 43 return text; | |
| 44 } | |
| 45 return base::string16(unicode_normalized_text.getBuffer(), | |
| 46 unicode_normalized_text.length()); | |
| 47 } | |
| 48 | |
| 49 } // namespace | |
| 50 | |
| 51 BookmarkIndex::BookmarkIndex(std::unique_ptr<TitledUrlNodeSorter> sorter) | |
| 52 : sorter_(std::move(sorter)) { | |
| 53 } | |
| 54 | |
| 55 BookmarkIndex::~BookmarkIndex() { | |
| 56 } | |
| 57 | |
| 58 void BookmarkIndex::Add(const TitledUrlNode* node) { | |
| 59 std::vector<base::string16> terms = | |
| 60 ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle())); | |
| 61 for (size_t i = 0; i < terms.size(); ++i) | |
| 62 RegisterNode(terms[i], node); | |
| 63 terms = ExtractQueryWords( | |
| 64 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr)); | |
| 65 for (size_t i = 0; i < terms.size(); ++i) | |
| 66 RegisterNode(terms[i], node); | |
| 67 } | |
| 68 | |
| 69 void BookmarkIndex::Remove(const TitledUrlNode* node) { | |
| 70 std::vector<base::string16> terms = | |
| 71 ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle())); | |
| 72 for (size_t i = 0; i < terms.size(); ++i) | |
| 73 UnregisterNode(terms[i], node); | |
| 74 terms = ExtractQueryWords( | |
| 75 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), nullptr)); | |
| 76 for (size_t i = 0; i < terms.size(); ++i) | |
| 77 UnregisterNode(terms[i], node); | |
| 78 } | |
| 79 | |
| 80 void BookmarkIndex::GetResultsMatching( | |
| 81 const base::string16& input_query, | |
| 82 size_t max_count, | |
| 83 query_parser::MatchingAlgorithm matching_algorithm, | |
| 84 std::vector<BookmarkMatch>* results) { | |
| 85 const base::string16 query = Normalize(input_query); | |
| 86 std::vector<base::string16> terms = ExtractQueryWords(query); | |
| 87 if (terms.empty()) | |
| 88 return; | |
| 89 | |
| 90 TitledUrlNodeSet matches; | |
| 91 for (size_t i = 0; i < terms.size(); ++i) { | |
| 92 if (!GetResultsMatchingTerm(terms[i], i == 0, matching_algorithm, | |
| 93 &matches)) { | |
| 94 return; | |
| 95 } | |
| 96 } | |
| 97 | |
| 98 TitledUrlNodes sorted_nodes; | |
| 99 SortMatches(matches, &sorted_nodes); | |
| 100 | |
| 101 // We use a QueryParser to fill in match positions for us. It's not the most | |
| 102 // efficient way to go about this, but by the time we get here we know what | |
| 103 // matches and so this shouldn't be performance critical. | |
| 104 query_parser::QueryParser parser; | |
| 105 query_parser::QueryNodeVector query_nodes; | |
| 106 parser.ParseQueryNodes(query, matching_algorithm, &query_nodes); | |
| 107 | |
| 108 // The highest typed counts should be at the beginning of the results vector | |
| 109 // so that the best matches will always be included in the results. The loop | |
| 110 // that calculates result relevance in HistoryContentsProvider::ConvertResults | |
| 111 // will run backwards to assure higher relevance will be attributed to the | |
| 112 // best matches. | |
| 113 for (TitledUrlNodes::const_iterator i = sorted_nodes.begin(); | |
| 114 i != sorted_nodes.end() && results->size() < max_count; | |
| 115 ++i) | |
| 116 AddMatchToResults(*i, &parser, query_nodes, results); | |
| 117 } | |
| 118 | |
| 119 void BookmarkIndex::SortMatches(const TitledUrlNodeSet& matches, | |
| 120 TitledUrlNodes* sorted_nodes) const { | |
| 121 if (sorter_) { | |
| 122 sorter_->SortMatches(matches, sorted_nodes); | |
| 123 } else { | |
| 124 sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end()); | |
| 125 } | |
| 126 } | |
| 127 | |
| 128 void BookmarkIndex::AddMatchToResults( | |
| 129 const TitledUrlNode* node, | |
| 130 query_parser::QueryParser* parser, | |
| 131 const query_parser::QueryNodeVector& query_nodes, | |
| 132 std::vector<BookmarkMatch>* results) { | |
| 133 if (!node) { | |
| 134 return; | |
| 135 } | |
| 136 // Check that the result matches the query. The previous search | |
| 137 // was a simple per-word search, while the more complex matching | |
| 138 // of QueryParser may filter it out. For example, the query | |
| 139 // ["thi"] will match the title [Thinking], but since | |
| 140 // ["thi"] is quoted we don't want to do a prefix match. | |
| 141 query_parser::QueryWordVector title_words, url_words; | |
| 142 const base::string16 lower_title = | |
| 143 base::i18n::ToLower(Normalize(node->GetTitledUrlNodeTitle())); | |
| 144 parser->ExtractQueryWords(lower_title, &title_words); | |
| 145 base::OffsetAdjuster::Adjustments adjustments; | |
| 146 parser->ExtractQueryWords( | |
| 147 CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), &adjustments), | |
| 148 &url_words); | |
| 149 query_parser::Snippet::MatchPositions title_matches, url_matches; | |
| 150 for (const auto& node : query_nodes) { | |
| 151 const bool has_title_matches = | |
| 152 node->HasMatchIn(title_words, &title_matches); | |
| 153 const bool has_url_matches = node->HasMatchIn(url_words, &url_matches); | |
| 154 if (!has_title_matches && !has_url_matches) | |
| 155 return; | |
| 156 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches); | |
| 157 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches); | |
| 158 } | |
| 159 BookmarkMatch match; | |
| 160 if (lower_title.length() == node->GetTitledUrlNodeTitle().length()) { | |
| 161 // Only use title matches if the lowercase string is the same length | |
| 162 // as the original string, otherwise the matches are meaningless. | |
| 163 // TODO(mpearson): revise match positions appropriately. | |
| 164 match.title_match_positions.swap(title_matches); | |
| 165 } | |
| 166 // Now that we're done processing this entry, correct the offsets of the | |
| 167 // matches in |url_matches| so they point to offsets in the original URL | |
| 168 // spec, not the cleaned-up URL string that we used for matching. | |
| 169 std::vector<size_t> offsets = | |
| 170 BookmarkMatch::OffsetsFromMatchPositions(url_matches); | |
| 171 base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets); | |
| 172 url_matches = | |
| 173 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets); | |
| 174 match.url_match_positions.swap(url_matches); | |
| 175 match.node = node; | |
| 176 results->push_back(match); | |
| 177 } | |
| 178 | |
| 179 bool BookmarkIndex::GetResultsMatchingTerm( | |
| 180 const base::string16& term, | |
| 181 bool first_term, | |
| 182 query_parser::MatchingAlgorithm matching_algorithm, | |
| 183 TitledUrlNodeSet* matches) { | |
| 184 Index::const_iterator i = index_.lower_bound(term); | |
| 185 if (i == index_.end()) | |
| 186 return false; | |
| 187 | |
| 188 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch( | |
| 189 term, matching_algorithm)) { | |
| 190 // Term is too short for prefix match, compare using exact match. | |
| 191 if (i->first != term) | |
| 192 return false; // No title/URL pairs with this term. | |
| 193 | |
| 194 if (first_term) { | |
| 195 (*matches) = i->second; | |
| 196 return true; | |
| 197 } | |
| 198 *matches = base::STLSetIntersection<TitledUrlNodeSet>(i->second, *matches); | |
| 199 } else { | |
| 200 // Loop through index adding all entries that start with term to | |
| 201 // |prefix_matches|. | |
| 202 TitledUrlNodeSet tmp_prefix_matches; | |
| 203 // If this is the first term, then store the result directly in |matches| | |
| 204 // to avoid calling stl intersection (which requires a copy). | |
| 205 TitledUrlNodeSet* prefix_matches = | |
| 206 first_term ? matches : &tmp_prefix_matches; | |
| 207 while (i != index_.end() && | |
| 208 i->first.size() >= term.size() && | |
| 209 term.compare(0, term.size(), i->first, 0, term.size()) == 0) { | |
| 210 #if !defined(OS_ANDROID) | |
| 211 prefix_matches->insert(i->second.begin(), i->second.end()); | |
| 212 #else | |
| 213 // Work around a bug in the implementation of std::set::insert in the STL | |
| 214 // used on android (http://crbug.com/367050). | |
| 215 for (TitledUrlNodeSet::const_iterator n = i->second.begin(); | |
| 216 n != i->second.end(); | |
| 217 ++n) | |
| 218 prefix_matches->insert(prefix_matches->end(), *n); | |
| 219 #endif | |
| 220 ++i; | |
| 221 } | |
| 222 if (!first_term) { | |
| 223 *matches = | |
| 224 base::STLSetIntersection<TitledUrlNodeSet>(*prefix_matches, *matches); | |
| 225 } | |
| 226 } | |
| 227 return !matches->empty(); | |
| 228 } | |
| 229 | |
| 230 std::vector<base::string16> BookmarkIndex::ExtractQueryWords( | |
| 231 const base::string16& query) { | |
| 232 std::vector<base::string16> terms; | |
| 233 if (query.empty()) | |
| 234 return std::vector<base::string16>(); | |
| 235 query_parser::QueryParser parser; | |
| 236 parser.ParseQueryWords(base::i18n::ToLower(query), | |
| 237 query_parser::MatchingAlgorithm::DEFAULT, | |
| 238 &terms); | |
| 239 return terms; | |
| 240 } | |
| 241 | |
| 242 void BookmarkIndex::RegisterNode(const base::string16& term, | |
| 243 const TitledUrlNode* node) { | |
| 244 index_[term].insert(node); | |
| 245 } | |
| 246 | |
| 247 void BookmarkIndex::UnregisterNode(const base::string16& term, | |
| 248 const TitledUrlNode* node) { | |
| 249 Index::iterator i = index_.find(term); | |
| 250 if (i == index_.end()) { | |
| 251 // We can get here if the node has the same term more than once. For | |
| 252 // example, a node with the title 'foo foo' would end up here. | |
| 253 return; | |
| 254 } | |
| 255 i->second.erase(node); | |
| 256 if (i->second.empty()) | |
| 257 index_.erase(i); | |
| 258 } | |
| 259 | |
| 260 } // namespace bookmarks | |
| OLD | NEW |