Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/ui/title_prefix_matcher.h" | 5 #include "chrome/browser/ui/title_prefix_matcher.h" |
| 6 | 6 |
| 7 #include "base/hash_tables.h" | 7 #include "base/hash_tables.h" |
| 8 #include "base/i18n/break_iterator.h" | 8 #include "base/i18n/break_iterator.h" |
| 9 #include "base/logging.h" | 9 #include "base/logging.h" |
| | 10 #include "base/utf_string_conversions.h" |
| 10 | 11 |
| 11 namespace { | 12 namespace { |
| 12 // We use this value to identify that we have already seen the title associated | 13 // We use this value to identify that we have already seen the title associated |
| 13 // to this value in the duplicate_titles hash_set, and marked it as a duplicate. | 14 // to this value in the duplicate_titles hash_set, and marked it as a duplicate. |
| 14 const size_t kPreviouslySeenIndex = 0xFFFFFFFF; | 15 const size_t kPreviouslySeenIndex = 0xFFFFFFFF; |
| 15 } | 16 } |
| 16 | 17 |
| 17 TitlePrefixMatcher::TitleInfo::TitleInfo(const string16* title, | 18 TitlePrefixMatcher::TitleInfo::TitleInfo( |
| 18 int caller_value) | 19 const string16* title, const GURL& url, int caller_value) |
| 19 : title(title), | 20 : title(title), |
| | 21 url(url), |
| 20 prefix_length(0), | 22 prefix_length(0), |
| 21 caller_value(caller_value) { | 23 caller_value(caller_value) { |
| 22 DCHECK(title != NULL); | 24 DCHECK(title != NULL); |
| 23 } | 25 } |
| 24 | 26 |
| 25 // static | 27 // static |
| 26 void TitlePrefixMatcher::CalculatePrefixLengths( | 28 void TitlePrefixMatcher::CalculatePrefixLengths( |
| 27 std::vector<TitleInfo>* title_infos) { | 29 std::vector<TitleInfo>* title_infos) { |
| 28 DCHECK(title_infos != NULL); | 30 DCHECK(title_infos != NULL); |
| 29 // This set will contain the indexes of the TitleInfo objects in the vector | 31 // This set will contain the indexes of the TitleInfo objects in the vector |
| 30 // that have a duplicate. | 32 // that have a duplicate. |
| 31 base::hash_set<size_t> duplicate_titles; | 33 base::hash_set<size_t> duplicate_titles; |
| 32 // This map is used to identify duplicates by remembering the vector indexes | 34 // This map is used to identify duplicates by remembering the vector indexes |
| 33 // we have seen with a given title string. The vector index is set to | 35 // we have seen with a given title string. The vector index is set to |
| 34 // kPreviouslySeenIndex once we identified duplicates and placed their | 36 // kPreviouslySeenIndex once we identified duplicates and placed their |
| 35 // indices in duplicate_titles. | 37 // indices in duplicate_titles. |
| 36 base::hash_map<string16, size_t> existing_title; | 38 base::hash_map<string16, size_t> existing_title; |
| 37 // We identify if there are identical titles upfront, | 39 // We identify if there are identical titles upfront, |
| 38 // because we don't want to remove prefixes for those at all. | 40 // because we don't want to remove prefixes for those at all. |
| 39 // We do it as a separate pass so that we don't need to remove | 41 // We do it as a separate pass so that we don't need to remove |
| 40 // previously parsed titles when we find a duplicate title later on. | 42 // previously parsed titles when we find a duplicate title later on. |
| 41 for (size_t i = 0; i < title_infos->size(); ++i) { | 43 for (size_t i = 0; i < title_infos->size(); ++i) { |
| 42 // We use pairs to test existence and insert in one shot. | 44 // We use pairs to test existence and insert in one shot. |
| 43 std::pair<base::hash_map<string16, size_t>::iterator, bool> insert_result = | 45 std::pair<base::hash_map<string16, size_t>::iterator, bool> insert_result = |
| 44 existing_title.insert(std::make_pair(*title_infos->at(i).title, i)); | 46 existing_title.insert(std::make_pair(*title_infos->at(i).title, i)); |
|
Peter Kasting
2011/04/01 17:28:31
Nit: While you're here, use (*title_infos)[i] to m
MAD
2011/04/01 19:47:37
Done.
| |
| 45 if (!insert_result.second) { | 47 if (!insert_result.second) { |
| 46 // insert_result.second is false when we insert a duplicate in the set. | 48 // insert_result.second is false when we insert a duplicate in the set. |
| 47 // insert_result.first is a map iterator and thus | 49 // insert_result.first is a map iterator and thus |
| 48 // insert_result.first->first is the string title key of the map. | 50 // insert_result.first->first is the string title key of the map. |
| 49 DCHECK(*title_infos->at(i).title == insert_result.first->first); | 51 DCHECK(*title_infos->at(i).title == insert_result.first->first); |
| 50 duplicate_titles.insert(i); | 52 duplicate_titles.insert(i); |
| 51 // insert_result.first->second is the value of the title index and if it's | 53 // insert_result.first->second is the value of the title index and if it's |
| 52 // not kPreviouslySeenIndex yet, we must remember it as a duplicate too. | 54 // not kPreviouslySeenIndex yet, we must remember it as a duplicate too. |
| 53 if (insert_result.first->second != kPreviouslySeenIndex) { | 55 if (insert_result.first->second != kPreviouslySeenIndex) { |
| 54 duplicate_titles.insert(insert_result.first->second); | 56 duplicate_titles.insert(insert_result.first->second); |
| 55 insert_result.first->second = kPreviouslySeenIndex; | 57 insert_result.first->second = kPreviouslySeenIndex; |
| 56 } | 58 } |
| 57 } | 59 } |
| 58 } | 60 } |
| 59 | 61 |
| 60 // This next loop accumulates all the potential prefixes, | 62 // This next loop accumulates all the potential prefixes, |
| 61 // and remember on which titles we saw them. | 63 // and remember on which titles we saw them. |
| 62 base::hash_map<string16, std::vector<size_t> > prefixes; | 64 base::hash_map<string16, std::vector<size_t> > prefixes; |
| 63 for (size_t i = 0; i < title_infos->size(); ++i) { | 65 for (size_t i = 0; i < title_infos->size(); ++i) { |
| 64 // Duplicate titles are not to be included in this process. | 66 // Duplicate titles are not to be included in this process. |
| 65 if (duplicate_titles.find(i) != duplicate_titles.end()) | 67 if (duplicate_titles.find(i) != duplicate_titles.end()) |
| 66 continue; | 68 continue; |
| 67 const string16* title = title_infos->at(i).title; | 69 const TitleInfo& title_info = title_infos->at(i); |
| | 70 const string16* title = title_info.title; |
| | 71 // We prefix the hostname at the beginning, so that we only group |
| | 72 // titles that are from the same hostname. |
| | 73 string16 hostname = ASCIIToUTF16(title_info.url.host()); |
| 68 // We only create prefixes at word boundaries. | 74 // We only create prefixes at word boundaries. |
| 69 base::BreakIterator iter(title, base::BreakIterator::BREAK_WORD); | 75 base::BreakIterator iter(title, base::BreakIterator::BREAK_WORD); |
| 70 // We ignore this title if we can't break it into words, or if it only | 76 // We ignore this title if we can't break it into words, or if it only |
| 71 // contains a single word. | 77 // contains a single word. |
| 72 if (!iter.Init() || !iter.Advance()) | 78 if (!iter.Init() || !iter.Advance()) |
| 73 continue; | 79 continue; |
| 74 // We continue advancing even though we already advanced to the first | 80 // We continue advancing even though we already advanced to the first |
| 75 // word above, so that we can use iter.prev() to identify the end of the | 81 // word above, so that we can use iter.prev() to identify the end of the |
| 76 // previous word and more easily ignore the last word while iterating. | 82 // previous word and more easily ignore the last word while iterating. |
| 77 while (iter.Advance()) { | 83 while (iter.Advance()) { |
| 78 if (iter.IsWord()) | 84 if (iter.IsWord()) |
| 79 prefixes[title->substr(0, iter.prev())].push_back(i); | 85 prefixes[hostname + title->substr(0, iter.prev())].push_back(i); |
| 80 } | 86 } |
| 81 } | 87 } |
| 82 | 88 |
| 83 // Now we parse the map to find common prefixes | 89 // Now we parse the map to find common prefixes |
| 84 // and keep the largest per title. | 90 // and keep the largest per title. |
| 85 for (base::hash_map<string16, std::vector<size_t> >::iterator iter = | 91 for (base::hash_map<string16, std::vector<size_t> >::iterator iter = |
| 86 prefixes.begin(); iter != prefixes.end(); ++iter) { | 92 prefixes.begin(); iter != prefixes.end(); ++iter) { |
| 87 // iter->first is the prefix string, iter->second is a vector of indices. | 93 // iter->first is the prefix string, iter->second is a vector of indices. |
| 88 if (iter->second.size() > 1) { | 94 if (iter->second.size() > 1) { |
| 89 size_t prefix_length = iter->first.size(); | 95 // We need to subtract the hostname size since we added it to the prefix. |
| | 96 size_t prefix_length = iter->first.size() - |
| | 97 title_infos->at(iter->second[0]).url.host().size(); |
| 90 for (size_t i = 0; i < iter->second.size(); ++i){ | 98 for (size_t i = 0; i < iter->second.size(); ++i){ |
| | 99 DCHECK(title_infos->at(iter->second[i]).url.host() == |
|
Peter Kasting
2011/04/01 17:28:31
Nit: DCHECK_EQ()
MAD
2011/04/01 19:47:37
Done.
| |
| | 100 title_infos->at(iter->second[0]).url.host()); |
| 91 if (title_infos->at(iter->second[i]).prefix_length < prefix_length) | 101 if (title_infos->at(iter->second[i]).prefix_length < prefix_length) |
| 92 title_infos->at(iter->second[i]).prefix_length = prefix_length; | 102 title_infos->at(iter->second[i]).prefix_length = prefix_length; |
| 93 } | 103 } |
| 94 } | 104 } |
| 95 } | 105 } |
| 96 } | 106 } |
| OLD | NEW |