OLD | NEW |
| (Empty) |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/ui/title_prefix_matcher.h" | |
6 | |
7 #include "base/hash_tables.h" | |
8 #include "base/i18n/break_iterator.h" | |
9 #include "base/logging.h" | |
10 #include "base/utf_string_conversions.h" | |
11 | |
12 namespace { | |
13 // We use this value to identify that we have already seen the title associated | |
14 // to this value in the duplicate_titles hash_set, ans marked it as a duplicate. | |
15 const size_t kPreviouslySeenIndex = 0xFFFFFFFF; | |
16 } | |
17 | |
18 // static | |
19 const int TitlePrefixMatcher::kCommonCharsToShow = 2; | |
20 const size_t TitlePrefixMatcher::kMinElidingLength = | |
21 TitlePrefixMatcher::kCommonCharsToShow + 3; | |
22 | |
23 TitlePrefixMatcher::TitleInfo::TitleInfo( | |
24 const string16* title, const GURL& url, int caller_value) | |
25 : title(title), | |
26 url(url), | |
27 prefix_length(0), | |
28 caller_value(caller_value) { | |
29 DCHECK(title != NULL); | |
30 } | |
31 | |
32 TitlePrefixMatcher::TitleInfo::~TitleInfo() { | |
33 } | |
34 | |
35 // static | |
36 void TitlePrefixMatcher::CalculatePrefixLengths( | |
37 std::vector<TitleInfo>* title_infos) { | |
38 DCHECK(title_infos != NULL); | |
39 // This set will contain the indexes of the TitleInfo objects in the vector | |
40 // that have a duplicate. | |
41 base::hash_set<size_t> duplicate_titles; | |
42 // This map is used to identify duplicates by remembering the vector indexes | |
43 // we have seen with a given title string. The vector index is set to | |
44 // kPreviouslySeenIndex once we identified duplicates and placed their | |
45 // indices in duplicate_titles. | |
46 base::hash_map<string16, size_t> existing_title; | |
47 // We identify if there are identical titles upfront, | |
48 // because we don't want to remove prefixes for those at all. | |
49 // We do it as a separate pass so that we don't need to remove | |
50 // previously parsed titles when we find a duplicate title later on. | |
51 for (size_t i = 0; i < title_infos->size(); ++i) { | |
52 // We use pairs to test existence and insert in one shot. | |
53 std::pair<base::hash_map<string16, size_t>::iterator, bool> insert_result = | |
54 existing_title.insert(std::make_pair(*(*title_infos)[i].title, i)); | |
55 if (!insert_result.second) { | |
56 // insert_result.second is false when we insert a duplicate in the set. | |
57 // insert_result.first is a map iterator and thus | |
58 // insert_result.first->first is the string title key of the map. | |
59 DCHECK_EQ(*(*title_infos)[i].title, insert_result.first->first); | |
60 duplicate_titles.insert(i); | |
61 // insert_result.first->second is the value of the title index and if it's | |
62 // not kPreviouslySeenIndex yet, we must remember it as a duplicate too. | |
63 if (insert_result.first->second != kPreviouslySeenIndex) { | |
64 duplicate_titles.insert(insert_result.first->second); | |
65 insert_result.first->second = kPreviouslySeenIndex; | |
66 } | |
67 } | |
68 } | |
69 | |
70 // This next loop accumulates all the potential prefixes, | |
71 // and remember on which titles we saw them. | |
72 base::hash_map<string16, std::vector<size_t> > prefixes; | |
73 for (size_t i = 0; i < title_infos->size(); ++i) { | |
74 // Duplicate titles are not to be included in this process. | |
75 if (duplicate_titles.find(i) != duplicate_titles.end()) | |
76 continue; | |
77 const TitleInfo& title_info = (*title_infos)[i]; | |
78 const string16* title = title_info.title; | |
79 // We prefix the hostname at the beginning, so that we only group | |
80 // titles that are from the same hostname. | |
81 string16 hostname = ASCIIToUTF16(title_info.url.host()); | |
82 // We only create prefixes at word boundaries. | |
83 base::i18n::BreakIterator iter(title, | |
84 base::i18n::BreakIterator::BREAK_WORD); | |
85 // We ignore this title if we can't break it into words, or if it only | |
86 // contains a single word. | |
87 if (!iter.Init() || !iter.Advance()) | |
88 continue; | |
89 // We continue advancing even though we already advanced to the first | |
90 // word above, so that we can use iter.prev() to identify the end of the | |
91 // previous word and more easily ignore the last word while iterating. | |
92 while (iter.Advance()) { | |
93 if (iter.IsWord()) | |
94 prefixes[hostname + title->substr(0, iter.prev())].push_back(i); | |
95 } | |
96 } | |
97 | |
98 // Now we parse the map to find common prefixes | |
99 // and keep the largest per title. | |
100 for (base::hash_map<string16, std::vector<size_t> >::iterator iter = | |
101 prefixes.begin(); iter != prefixes.end(); ++iter) { | |
102 // iter->first is the prefix string, iter->second is a vector of indices. | |
103 if (iter->second.size() > 1) { | |
104 // We need to subtract the hostname size since we added it to the prefix. | |
105 const TitleInfo& first_title_info = (*title_infos)[iter->second[0]]; | |
106 DCHECK_GE(iter->first.size(), first_title_info.url.host().size()); | |
107 size_t prefix_length = iter->first.size() - | |
108 first_title_info.url.host().size(); | |
109 for (size_t i = 0; i < iter->second.size(); ++i){ | |
110 TitleInfo& title_info = (*title_infos)[iter->second[i]]; | |
111 DCHECK_EQ(first_title_info.url.host(), title_info.url.host()); | |
112 if (title_info.prefix_length < prefix_length) | |
113 title_info.prefix_length = prefix_length; | |
114 } | |
115 } | |
116 } | |
117 } | |
OLD | NEW |