OLD | NEW |
| (Empty) |
1 /* Copyright 2013 Google Inc. All Rights Reserved. | |
2 | |
3 Distributed under MIT license. | |
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT | |
5 */ | |
6 | |
7 #include "./static_dict.h" | |
8 | |
9 #include <algorithm> | |
10 | |
11 #include "./dictionary.h" | |
12 #include "./find_match_length.h" | |
13 #include "./static_dict_lut.h" | |
14 #include "./transform.h" | |
15 | |
16 namespace brotli { | |
17 | |
18 inline uint32_t Hash(const uint8_t *data) { | |
19 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kDictHashMul32; | |
20 // The higher bits contain more mixture from the multiplication, | |
21 // so we take our results from there. | |
22 return h >> (32 - kDictNumBits); | |
23 } | |
24 | |
25 inline void AddMatch(size_t distance, size_t len, size_t len_code, | |
26 uint32_t* matches) { | |
27 uint32_t match = static_cast<uint32_t>((distance << 5) + len_code); | |
28 matches[len] = std::min(matches[len], match); | |
29 } | |
30 | |
31 inline size_t DictMatchLength(const uint8_t* data, | |
32 size_t id, | |
33 size_t len, | |
34 size_t maxlen) { | |
35 const size_t offset = kBrotliDictionaryOffsetsByLength[len] + len * id; | |
36 return FindMatchLengthWithLimit(&kBrotliDictionary[offset], data, | |
37 std::min(len, maxlen)); | |
38 } | |
39 | |
40 inline bool IsMatch(DictWord w, const uint8_t* data, size_t max_length) { | |
41 if (w.len > max_length) return false; | |
42 const size_t offset = kBrotliDictionaryOffsetsByLength[w.len] + w.len * w.idx; | |
43 const uint8_t* dict = &kBrotliDictionary[offset]; | |
44 if (w.transform == 0) { | |
45 // Match against base dictionary word. | |
46 return FindMatchLengthWithLimit(dict, data, w.len) == w.len; | |
47 } else if (w.transform == 10) { | |
48 // Match against uppercase first transform. | |
49 // Note that there are only ASCII uppercase words in the lookup table. | |
50 return (dict[0] >= 'a' && dict[0] <= 'z' && | |
51 (dict[0] ^ 32) == data[0] && | |
52 FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) == | |
53 w.len - 1u); | |
54 } else { | |
55 // Match against uppercase all transform. | |
56 // Note that there are only ASCII uppercase words in the lookup table. | |
57 for (size_t i = 0; i < w.len; ++i) { | |
58 if (dict[i] >= 'a' && dict[i] <= 'z') { | |
59 if ((dict[i] ^ 32) != data[i]) return false; | |
60 } else { | |
61 if (dict[i] != data[i]) return false; | |
62 } | |
63 } | |
64 return true; | |
65 } | |
66 } | |
67 | |
68 bool FindAllStaticDictionaryMatches(const uint8_t* data, | |
69 size_t min_length, | |
70 size_t max_length, | |
71 uint32_t* matches) { | |
72 bool found_match = false; | |
73 size_t key = Hash(data); | |
74 size_t bucket = kStaticDictionaryBuckets[key]; | |
75 if (bucket != 0) { | |
76 size_t num = bucket & 0xff; | |
77 size_t offset = bucket >> 8; | |
78 for (size_t i = 0; i < num; ++i) { | |
79 const DictWord w = kStaticDictionaryWords[offset + i]; | |
80 const size_t l = w.len; | |
81 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l]; | |
82 const size_t id = w.idx; | |
83 if (w.transform == 0) { | |
84 const size_t matchlen = DictMatchLength(data, id, l, max_length); | |
85 // Transform "" + kIdentity + "" | |
86 if (matchlen == l) { | |
87 AddMatch(id, l, l, matches); | |
88 found_match = true; | |
89 } | |
90 // Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing " | |
91 if (matchlen >= l - 1) { | |
92 AddMatch(id + 12 * n, l - 1, l, matches); | |
93 if (l + 2 < max_length && | |
94 data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' && | |
95 data[l + 2] == ' ') { | |
96 AddMatch(id + 49 * n, l + 3, l, matches); | |
97 } | |
98 found_match = true; | |
99 } | |
100 // Transform "" + kOmitLastN + "" (N = 2 .. 9) | |
101 size_t minlen = min_length; | |
102 if (l > 9) minlen = std::max(minlen, l - 9); | |
103 size_t maxlen = std::min(matchlen, l - 2); | |
104 for (size_t len = minlen; len <= maxlen; ++len) { | |
105 AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches); | |
106 found_match = true; | |
107 } | |
108 if (matchlen < l || l + 6 >= max_length) { | |
109 continue; | |
110 } | |
111 const uint8_t* s = &data[l]; | |
112 // Transforms "" + kIdentity + <suffix> | |
113 if (s[0] == ' ') { | |
114 AddMatch(id + n, l + 1, l, matches); | |
115 if (s[1] == 'a') { | |
116 if (s[2] == ' ') { | |
117 AddMatch(id + 28 * n, l + 3, l, matches); | |
118 } else if (s[2] == 's') { | |
119 if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches); | |
120 } else if (s[2] == 't') { | |
121 if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches); | |
122 } else if (s[2] == 'n') { | |
123 if (s[3] == 'd' && s[4] == ' ') { | |
124 AddMatch(id + 10 * n, l + 5, l, matches); | |
125 } | |
126 } | |
127 } else if (s[1] == 'b') { | |
128 if (s[2] == 'y' && s[3] == ' ') { | |
129 AddMatch(id + 38 * n, l + 4, l, matches); | |
130 } | |
131 } else if (s[1] == 'i') { | |
132 if (s[2] == 'n') { | |
133 if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches); | |
134 } else if (s[2] == 's') { | |
135 if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches); | |
136 } | |
137 } else if (s[1] == 'f') { | |
138 if (s[2] == 'o') { | |
139 if (s[3] == 'r' && s[4] == ' ') { | |
140 AddMatch(id + 25 * n, l + 5, l, matches); | |
141 } | |
142 } else if (s[2] == 'r') { | |
143 if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') { | |
144 AddMatch(id + 37 * n, l + 6, l, matches); | |
145 } | |
146 } | |
147 } else if (s[1] == 'o') { | |
148 if (s[2] == 'f') { | |
149 if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches); | |
150 } else if (s[2] == 'n') { | |
151 if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches); | |
152 } | |
153 } else if (s[1] == 'n') { | |
154 if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') { | |
155 AddMatch(id + 80 * n, l + 5, l, matches); | |
156 } | |
157 } else if (s[1] == 't') { | |
158 if (s[2] == 'h') { | |
159 if (s[3] == 'e') { | |
160 if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches); | |
161 } else if (s[3] == 'a') { | |
162 if (s[4] == 't' && s[5] == ' ') { | |
163 AddMatch(id + 29 * n, l + 6, l, matches); | |
164 } | |
165 } | |
166 } else if (s[2] == 'o') { | |
167 if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches); | |
168 } | |
169 } else if (s[1] == 'w') { | |
170 if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') { | |
171 AddMatch(id + 35 * n, l + 6, l, matches); | |
172 } | |
173 } | |
174 } else if (s[0] == '"') { | |
175 AddMatch(id + 19 * n, l + 1, l, matches); | |
176 if (s[1] == '>') { | |
177 AddMatch(id + 21 * n, l + 2, l, matches); | |
178 } | |
179 } else if (s[0] == '.') { | |
180 AddMatch(id + 20 * n, l + 1, l, matches); | |
181 if (s[1] == ' ') { | |
182 AddMatch(id + 31 * n, l + 2, l, matches); | |
183 if (s[2] == 'T' && s[3] == 'h') { | |
184 if (s[4] == 'e') { | |
185 if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches); | |
186 } else if (s[4] == 'i') { | |
187 if (s[5] == 's' && s[6] == ' ') { | |
188 AddMatch(id + 75 * n, l + 7, l, matches); | |
189 } | |
190 } | |
191 } | |
192 } | |
193 } else if (s[0] == ',') { | |
194 AddMatch(id + 76 * n, l + 1, l, matches); | |
195 if (s[1] == ' ') { | |
196 AddMatch(id + 14 * n, l + 2, l, matches); | |
197 } | |
198 } else if (s[0] == '\n') { | |
199 AddMatch(id + 22 * n, l + 1, l, matches); | |
200 if (s[1] == '\t') { | |
201 AddMatch(id + 50 * n, l + 2, l, matches); | |
202 } | |
203 } else if (s[0] == ']') { | |
204 AddMatch(id + 24 * n, l + 1, l, matches); | |
205 } else if (s[0] == '\'') { | |
206 AddMatch(id + 36 * n, l + 1, l, matches); | |
207 } else if (s[0] == ':') { | |
208 AddMatch(id + 51 * n, l + 1, l, matches); | |
209 } else if (s[0] == '(') { | |
210 AddMatch(id + 57 * n, l + 1, l, matches); | |
211 } else if (s[0] == '=') { | |
212 if (s[1] == '"') { | |
213 AddMatch(id + 70 * n, l + 2, l, matches); | |
214 } else if (s[1] == '\'') { | |
215 AddMatch(id + 86 * n, l + 2, l, matches); | |
216 } | |
217 } else if (s[0] == 'a') { | |
218 if (s[1] == 'l' && s[2] == ' ') { | |
219 AddMatch(id + 84 * n, l + 3, l, matches); | |
220 } | |
221 } else if (s[0] == 'e') { | |
222 if (s[1] == 'd') { | |
223 if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches); | |
224 } else if (s[1] == 'r') { | |
225 if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches); | |
226 } else if (s[1] == 's') { | |
227 if (s[2] == 't' && s[3] == ' ') { | |
228 AddMatch(id + 95 * n, l + 4, l, matches); | |
229 } | |
230 } | |
231 } else if (s[0] == 'f') { | |
232 if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') { | |
233 AddMatch(id + 90 * n, l + 4, l, matches); | |
234 } | |
235 } else if (s[0] == 'i') { | |
236 if (s[1] == 'v') { | |
237 if (s[2] == 'e' && s[3] == ' ') { | |
238 AddMatch(id + 92 * n, l + 4, l, matches); | |
239 } | |
240 } else if (s[1] == 'z') { | |
241 if (s[2] == 'e' && s[3] == ' ') { | |
242 AddMatch(id + 100 * n, l + 4, l, matches); | |
243 } | |
244 } | |
245 } else if (s[0] == 'l') { | |
246 if (s[1] == 'e') { | |
247 if (s[2] == 's' && s[3] == 's' && s[4] == ' ') { | |
248 AddMatch(id + 93 * n, l + 5, l, matches); | |
249 } | |
250 } else if (s[1] == 'y') { | |
251 if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches); | |
252 } | |
253 } else if (s[0] == 'o') { | |
254 if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') { | |
255 AddMatch(id + 106 * n, l + 4, l, matches); | |
256 } | |
257 } | |
258 } else { | |
259 // Set t=false for kUppercaseFirst and | |
260 // t=true otherwise (kUppercaseAll) transform. | |
261 const bool t = w.transform != kUppercaseFirst; | |
262 if (!IsMatch(w, data, max_length)) { | |
263 continue; | |
264 } | |
265 // Transform "" + kUppercase{First,All} + "" | |
266 AddMatch(id + (t ? 44 : 9) * n, l, l, matches); | |
267 found_match = true; | |
268 if (l + 1 >= max_length) { | |
269 continue; | |
270 } | |
271 // Transforms "" + kUppercase{First,All} + <suffix> | |
272 const uint8_t* s = &data[l]; | |
273 if (s[0] == ' ') { | |
274 AddMatch(id + (t ? 68 : 4) * n, l + 1, l, matches); | |
275 } else if (s[0] == '"') { | |
276 AddMatch(id + (t ? 87 : 66) * n, l + 1, l, matches); | |
277 if (s[1] == '>') { | |
278 AddMatch(id + (t ? 97 : 69) * n, l + 2, l, matches); | |
279 } | |
280 } else if (s[0] == '.') { | |
281 AddMatch(id + (t ? 101 : 79) * n, l + 1, l, matches); | |
282 if (s[1] == ' ') { | |
283 AddMatch(id + (t ? 114 : 88) * n, l + 2, l, matches); | |
284 } | |
285 } else if (s[0] == ',') { | |
286 AddMatch(id + (t ? 112 : 99) * n, l + 1, l, matches); | |
287 if (s[1] == ' ') { | |
288 AddMatch(id + (t ? 107 : 58) * n, l + 2, l, matches); | |
289 } | |
290 } else if (s[0] == '\'') { | |
291 AddMatch(id + (t ? 94 : 74) * n, l + 1, l, matches); | |
292 } else if (s[0] == '(') { | |
293 AddMatch(id + (t ? 113 : 78) * n, l + 1, l, matches); | |
294 } else if (s[0] == '=') { | |
295 if (s[1] == '"') { | |
296 AddMatch(id + (t ? 105 : 104) * n, l + 2, l, matches); | |
297 } else if (s[1] == '\'') { | |
298 AddMatch(id + (t ? 116 : 108) * n, l + 2, l, matches); | |
299 } | |
300 } | |
301 } | |
302 } | |
303 } | |
304 // Transforms with prefixes " " and "." | |
305 if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) { | |
306 bool is_space = (data[0] == ' '); | |
307 key = Hash(&data[1]); | |
308 bucket = kStaticDictionaryBuckets[key]; | |
309 size_t num = bucket & 0xff; | |
310 size_t offset = bucket >> 8; | |
311 for (size_t i = 0; i < num; ++i) { | |
312 const DictWord w = kStaticDictionaryWords[offset + i]; | |
313 const size_t l = w.len; | |
314 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l]; | |
315 const size_t id = w.idx; | |
316 if (w.transform == 0) { | |
317 if (!IsMatch(w, &data[1], max_length - 1)) { | |
318 continue; | |
319 } | |
320 // Transforms " " + kIdentity + "" and "." + kIdentity + "" | |
321 AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches); | |
322 found_match = true; | |
323 if (l + 2 >= max_length) { | |
324 continue; | |
325 } | |
326 // Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix> | |
327 const uint8_t* s = &data[l + 1]; | |
328 if (s[0] == ' ') { | |
329 AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches); | |
330 } else if (s[0] == '(') { | |
331 AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches); | |
332 } else if (is_space) { | |
333 if (s[0] == ',') { | |
334 AddMatch(id + 103 * n, l + 2, l, matches); | |
335 if (s[1] == ' ') { | |
336 AddMatch(id + 33 * n, l + 3, l, matches); | |
337 } | |
338 } else if (s[0] == '.') { | |
339 AddMatch(id + 71 * n, l + 2, l, matches); | |
340 if (s[1] == ' ') { | |
341 AddMatch(id + 52 * n, l + 3, l, matches); | |
342 } | |
343 } else if (s[0] == '=') { | |
344 if (s[1] == '"') { | |
345 AddMatch(id + 81 * n, l + 3, l, matches); | |
346 } else if (s[1] == '\'') { | |
347 AddMatch(id + 98 * n, l + 3, l, matches); | |
348 } | |
349 } | |
350 } | |
351 } else if (is_space) { | |
352 // Set t=false for kUppercaseFirst and | |
353 // t=true otherwise (kUppercaseAll) transform. | |
354 const bool t = w.transform != kUppercaseFirst; | |
355 if (!IsMatch(w, &data[1], max_length - 1)) { | |
356 continue; | |
357 } | |
358 // Transforms " " + kUppercase{First,All} + "" | |
359 AddMatch(id + (t ? 85 : 30) * n, l + 1, l, matches); | |
360 found_match = true; | |
361 if (l + 2 >= max_length) { | |
362 continue; | |
363 } | |
364 // Transforms " " + kUppercase{First,All} + <suffix> | |
365 const uint8_t* s = &data[l + 1]; | |
366 if (s[0] == ' ') { | |
367 AddMatch(id + (t ? 83 : 15) * n, l + 2, l, matches); | |
368 } else if (s[0] == ',') { | |
369 if (!t) { | |
370 AddMatch(id + 109 * n, l + 2, l, matches); | |
371 } | |
372 if (s[1] == ' ') { | |
373 AddMatch(id + (t ? 111 : 65) * n, l + 3, l, matches); | |
374 } | |
375 } else if (s[0] == '.') { | |
376 AddMatch(id + (t ? 115 : 96) * n, l + 2, l, matches); | |
377 if (s[1] == ' ') { | |
378 AddMatch(id + (t ? 117 : 91) * n, l + 3, l, matches); | |
379 } | |
380 } else if (s[0] == '=') { | |
381 if (s[1] == '"') { | |
382 AddMatch(id + (t ? 110 : 118) * n, l + 3, l, matches); | |
383 } else if (s[1] == '\'') { | |
384 AddMatch(id + (t ? 119 : 120) * n, l + 3, l, matches); | |
385 } | |
386 } | |
387 } | |
388 } | |
389 } | |
390 if (max_length >= 6) { | |
391 // Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0" | |
392 if ((data[1] == ' ' && | |
393 (data[0] == 'e' || data[0] == 's' || data[0] == ',')) || | |
394 (data[0] == 0xc2 && data[1] == 0xa0)) { | |
395 key = Hash(&data[2]); | |
396 bucket = kStaticDictionaryBuckets[key]; | |
397 size_t num = bucket & 0xff; | |
398 size_t offset = bucket >> 8; | |
399 for (size_t i = 0; i < num; ++i) { | |
400 const DictWord w = kStaticDictionaryWords[offset + i]; | |
401 const size_t l = w.len; | |
402 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l]; | |
403 const size_t id = w.idx; | |
404 if (w.transform == 0 && IsMatch(w, &data[2], max_length - 2)) { | |
405 if (data[0] == 0xc2) { | |
406 AddMatch(id + 102 * n, l + 2, l, matches); | |
407 found_match = true; | |
408 } else if (l + 2 < max_length && data[l + 2] == ' ') { | |
409 size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13); | |
410 AddMatch(id + t * n, l + 3, l, matches); | |
411 found_match = true; | |
412 } | |
413 } | |
414 } | |
415 } | |
416 } | |
417 if (max_length >= 9) { | |
418 // Transforms with prefixes " the " and ".com/" | |
419 if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' && | |
420 data[3] == 'e' && data[4] == ' ') || | |
421 (data[0] == '.' && data[1] == 'c' && data[2] == 'o' && | |
422 data[3] == 'm' && data[4] == '/')) { | |
423 key = Hash(&data[5]); | |
424 bucket = kStaticDictionaryBuckets[key]; | |
425 size_t num = bucket & 0xff; | |
426 size_t offset = bucket >> 8; | |
427 for (size_t i = 0; i < num; ++i) { | |
428 const DictWord w = kStaticDictionaryWords[offset + i]; | |
429 const size_t l = w.len; | |
430 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l]; | |
431 const size_t id = w.idx; | |
432 if (w.transform == 0 && IsMatch(w, &data[5], max_length - 5)) { | |
433 AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches); | |
434 found_match = true; | |
435 if (l + 5 < max_length) { | |
436 const uint8_t* s = &data[l + 5]; | |
437 if (data[0] == ' ') { | |
438 if (l + 8 < max_length && | |
439 s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') { | |
440 AddMatch(id + 62 * n, l + 9, l, matches); | |
441 if (l + 12 < max_length && | |
442 s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') { | |
443 AddMatch(id + 73 * n, l + 13, l, matches); | |
444 } | |
445 } | |
446 } | |
447 } | |
448 } | |
449 } | |
450 } | |
451 } | |
452 return found_match; | |
453 } | |
454 | |
455 } // namespace brotli | |
OLD | NEW |