OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (C) 2005, 2006, 2008, 2010, 2013 Apple Inc. All rights reserved. | |
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Library General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Library General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Library General Public License | |
16 * along with this library; see the file COPYING.LIB. If not, write to | |
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | |
18 * Boston, MA 02110-1301, USA. | |
19 * | |
20 */ | |
21 | |
22 #ifndef WTF_StringHasher_h | |
23 #define WTF_StringHasher_h | |
24 | |
25 #include <wtf/unicode/Unicode.h> | |
26 | |
27 namespace WTF { | |
28 | |
29 // Paul Hsieh's SuperFastHash | |
30 // http://www.azillionmonkeys.com/qed/hash.html | |
31 | |
32 // LChar data is interpreted as Latin-1-encoded (zero extended to 16 bits). | |
33 | |
34 // NOTE: The hash computation here must stay in sync with the create_hash_table
script in | |
35 // JavaScriptCore and the CodeGeneratorJS.pm script in WebCore. | |
36 | |
37 // Golden ratio. Arbitrary start value to avoid mapping all zeros to a hash valu
e of zero. | |
38 static const unsigned stringHashingStartValue = 0x9E3779B9U; | |
39 | |
40 class StringHasher { | |
41 public: | |
42 static const unsigned flagCount = 8; // Save 8 bits for StringImpl to use as
flags. | |
43 | |
44 StringHasher() | |
45 : m_hash(stringHashingStartValue) | |
46 , m_hasPendingCharacter(false) | |
47 , m_pendingCharacter(0) | |
48 { | |
49 } | |
50 | |
51 // The hasher hashes two characters at a time, and thus an "aligned" hasher
is one | |
52 // where an even number of characters have been added. Callers that always a
dd | |
53 // characters two at a time can use the "assuming aligned" functions. | |
54 void addCharactersAssumingAligned(UChar a, UChar b) | |
55 { | |
56 ASSERT(!m_hasPendingCharacter); | |
57 m_hash += a; | |
58 m_hash = (m_hash << 16) ^ ((b << 11) ^ m_hash); | |
59 m_hash += m_hash >> 11; | |
60 } | |
61 | |
62 void addCharacter(UChar character) | |
63 { | |
64 if (m_hasPendingCharacter) { | |
65 m_hasPendingCharacter = false; | |
66 addCharactersAssumingAligned(m_pendingCharacter, character); | |
67 return; | |
68 } | |
69 | |
70 m_pendingCharacter = character; | |
71 m_hasPendingCharacter = true; | |
72 } | |
73 | |
74 void addCharacters(UChar a, UChar b) | |
75 { | |
76 if (m_hasPendingCharacter) { | |
77 #if !ASSERT_DISABLED | |
78 m_hasPendingCharacter = false; | |
79 #endif | |
80 addCharactersAssumingAligned(m_pendingCharacter, a); | |
81 m_pendingCharacter = b; | |
82 #if !ASSERT_DISABLED | |
83 m_hasPendingCharacter = true; | |
84 #endif | |
85 return; | |
86 } | |
87 | |
88 addCharactersAssumingAligned(a, b); | |
89 } | |
90 | |
91 template<typename T, UChar Converter(T)> void addCharactersAssumingAligned(c
onst T* data, unsigned length) | |
92 { | |
93 ASSERT(!m_hasPendingCharacter); | |
94 | |
95 bool remainder = length & 1; | |
96 length >>= 1; | |
97 | |
98 while (length--) { | |
99 addCharactersAssumingAligned(Converter(data[0]), Converter(data[1]))
; | |
100 data += 2; | |
101 } | |
102 | |
103 if (remainder) | |
104 addCharacter(Converter(*data)); | |
105 } | |
106 | |
107 template<typename T> void addCharactersAssumingAligned(const T* data, unsign
ed length) | |
108 { | |
109 addCharactersAssumingAligned<T, defaultConverter>(data, length); | |
110 } | |
111 | |
112 template<typename T, UChar Converter(T)> void addCharactersAssumingAligned(c
onst T* data) | |
113 { | |
114 ASSERT(!m_hasPendingCharacter); | |
115 | |
116 while (T a = *data++) { | |
117 T b = *data++; | |
118 if (!b) { | |
119 addCharacter(Converter(a)); | |
120 break; | |
121 } | |
122 addCharactersAssumingAligned(Converter(a), Converter(b)); | |
123 } | |
124 } | |
125 | |
126 template<typename T> void addCharactersAssumingAligned(const T* data) | |
127 { | |
128 addCharactersAssumingAligned<T, defaultConverter>(data); | |
129 } | |
130 | |
131 template<typename T, UChar Converter(T)> void addCharacters(const T* data, u
nsigned length) | |
132 { | |
133 if (m_hasPendingCharacter && length) { | |
134 m_hasPendingCharacter = false; | |
135 addCharactersAssumingAligned(m_pendingCharacter, Converter(*data++))
; | |
136 --length; | |
137 } | |
138 addCharactersAssumingAligned<T, Converter>(data, length); | |
139 } | |
140 | |
141 template<typename T> void addCharacters(const T* data, unsigned length) | |
142 { | |
143 addCharacters<T, defaultConverter>(data, length); | |
144 } | |
145 | |
146 template<typename T, UChar Converter(T)> void addCharacters(const T* data) | |
147 { | |
148 if (m_hasPendingCharacter && *data) { | |
149 m_hasPendingCharacter = false; | |
150 addCharactersAssumingAligned(m_pendingCharacter, Converter(*data++))
; | |
151 } | |
152 addCharactersAssumingAligned<T, Converter>(data); | |
153 } | |
154 | |
155 template<typename T> void addCharacters(const T* data) | |
156 { | |
157 addCharacters<T, defaultConverter>(data); | |
158 } | |
159 | |
160 unsigned hashWithTop8BitsMasked() const | |
161 { | |
162 unsigned result = avalancheBits(); | |
163 | |
164 // Reserving space from the high bits for flags preserves most of the ha
sh's | |
165 // value, since hash lookup typically masks out the high bits anyway. | |
166 result &= (1U << (sizeof(result) * 8 - flagCount)) - 1; | |
167 | |
168 // This avoids ever returning a hash code of 0, since that is used to | |
169 // signal "hash not computed yet". Setting the high bit maintains | |
170 // reasonable fidelity to a hash code of 0 because it is likely to yield | |
171 // exactly 0 when hash lookup masks out the high bits. | |
172 if (!result) | |
173 result = 0x80000000 >> flagCount; | |
174 | |
175 return result; | |
176 } | |
177 | |
178 unsigned hash() const | |
179 { | |
180 unsigned result = avalancheBits(); | |
181 | |
182 // This avoids ever returning a hash code of 0, since that is used to | |
183 // signal "hash not computed yet". Setting the high bit maintains | |
184 // reasonable fidelity to a hash code of 0 because it is likely to yield | |
185 // exactly 0 when hash lookup masks out the high bits. | |
186 if (!result) | |
187 result = 0x80000000; | |
188 | |
189 return result; | |
190 } | |
191 | |
192 template<typename T, UChar Converter(T)> static unsigned computeHashAndMaskT
op8Bits(const T* data, unsigned length) | |
193 { | |
194 StringHasher hasher; | |
195 hasher.addCharactersAssumingAligned<T, Converter>(data, length); | |
196 return hasher.hashWithTop8BitsMasked(); | |
197 } | |
198 | |
199 template<typename T, UChar Converter(T)> static unsigned computeHashAndMaskT
op8Bits(const T* data) | |
200 { | |
201 StringHasher hasher; | |
202 hasher.addCharactersAssumingAligned<T, Converter>(data); | |
203 return hasher.hashWithTop8BitsMasked(); | |
204 } | |
205 | |
206 template<typename T> static unsigned computeHashAndMaskTop8Bits(const T* dat
a, unsigned length) | |
207 { | |
208 return computeHashAndMaskTop8Bits<T, defaultConverter>(data, length); | |
209 } | |
210 | |
211 template<typename T> static unsigned computeHashAndMaskTop8Bits(const T* dat
a) | |
212 { | |
213 return computeHashAndMaskTop8Bits<T, defaultConverter>(data); | |
214 } | |
215 | |
216 template<typename T, UChar Converter(T)> static unsigned computeHash(const T
* data, unsigned length) | |
217 { | |
218 StringHasher hasher; | |
219 hasher.addCharactersAssumingAligned<T, Converter>(data, length); | |
220 return hasher.hash(); | |
221 } | |
222 | |
223 template<typename T, UChar Converter(T)> static unsigned computeHash(const T
* data) | |
224 { | |
225 StringHasher hasher; | |
226 hasher.addCharactersAssumingAligned<T, Converter>(data); | |
227 return hasher.hash(); | |
228 } | |
229 | |
230 template<typename T> static unsigned computeHash(const T* data, unsigned len
gth) | |
231 { | |
232 return computeHash<T, defaultConverter>(data, length); | |
233 } | |
234 | |
235 template<typename T> static unsigned computeHash(const T* data) | |
236 { | |
237 return computeHash<T, defaultConverter>(data); | |
238 } | |
239 | |
240 static unsigned hashMemory(const void* data, unsigned length) | |
241 { | |
242 // FIXME: Why does this function use the version of the hash that drops
the top 8 bits? | |
243 // We want that for all string hashing so we can use those bits in Strin
gImpl and hash | |
244 // strings consistently, but I don't see why we'd want that for general
memory hashing. | |
245 ASSERT(!(length % 2)); | |
246 return computeHashAndMaskTop8Bits<UChar>(static_cast<const UChar*>(data)
, length / sizeof(UChar)); | |
247 } | |
248 | |
249 template<size_t length> static unsigned hashMemory(const void* data) | |
250 { | |
251 COMPILE_ASSERT(!(length % 2), length_must_be_a_multiple_of_two); | |
252 return hashMemory(data, length); | |
253 } | |
254 | |
255 private: | |
256 static UChar defaultConverter(UChar character) | |
257 { | |
258 return character; | |
259 } | |
260 | |
261 static UChar defaultConverter(LChar character) | |
262 { | |
263 return character; | |
264 } | |
265 | |
266 unsigned avalancheBits() const | |
267 { | |
268 unsigned result = m_hash; | |
269 | |
270 // Handle end case. | |
271 if (m_hasPendingCharacter) { | |
272 result += m_pendingCharacter; | |
273 result ^= result << 11; | |
274 result += result >> 17; | |
275 } | |
276 | |
277 // Force "avalanching" of final 31 bits. | |
278 result ^= result << 3; | |
279 result += result >> 5; | |
280 result ^= result << 2; | |
281 result += result >> 15; | |
282 result ^= result << 10; | |
283 | |
284 return result; | |
285 } | |
286 | |
287 unsigned m_hash; | |
288 bool m_hasPendingCharacter; | |
289 UChar m_pendingCharacter; | |
290 }; | |
291 | |
292 } // namespace WTF | |
293 | |
294 using WTF::StringHasher; | |
295 | |
296 #endif // WTF_StringHasher_h | |
OLD | NEW |