| OLD | NEW |
| (Empty) | |
| 1 /* Copyright 2013 Google Inc. All Rights Reserved. |
| 2 |
| 3 Distributed under MIT license. |
| 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
| 5 */ |
| 6 |
| 7 #include "./static_dict.h" |
| 8 |
| 9 #include <algorithm> |
| 10 |
| 11 #include "./dictionary.h" |
| 12 #include "./find_match_length.h" |
| 13 #include "./static_dict_lut.h" |
| 14 #include "./transform.h" |
| 15 |
| 16 namespace brotli { |
| 17 |
| 18 inline uint32_t Hash(const uint8_t *data) { |
| 19 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kDictHashMul32; |
| 20 // The higher bits contain more mixture from the multiplication, |
| 21 // so we take our results from there. |
| 22 return h >> (32 - kDictNumBits); |
| 23 } |
| 24 |
| 25 inline void AddMatch(size_t distance, size_t len, size_t len_code, |
| 26 uint32_t* matches) { |
| 27 uint32_t match = static_cast<uint32_t>((distance << 5) + len_code); |
| 28 matches[len] = std::min(matches[len], match); |
| 29 } |
| 30 |
| 31 inline size_t DictMatchLength(const uint8_t* data, |
| 32 size_t id, |
| 33 size_t len, |
| 34 size_t maxlen) { |
| 35 const size_t offset = kBrotliDictionaryOffsetsByLength[len] + len * id; |
| 36 return FindMatchLengthWithLimit(&kBrotliDictionary[offset], data, |
| 37 std::min(len, maxlen)); |
| 38 } |
| 39 |
| 40 inline bool IsMatch(DictWord w, const uint8_t* data, size_t max_length) { |
| 41 if (w.len > max_length) return false; |
| 42 const size_t offset = kBrotliDictionaryOffsetsByLength[w.len] + w.len * w.idx; |
| 43 const uint8_t* dict = &kBrotliDictionary[offset]; |
| 44 if (w.transform == 0) { |
| 45 // Match against base dictionary word. |
| 46 return FindMatchLengthWithLimit(dict, data, w.len) == w.len; |
| 47 } else if (w.transform == 10) { |
| 48 // Match against uppercase first transform. |
| 49 // Note that there are only ASCII uppercase words in the lookup table. |
| 50 return (dict[0] >= 'a' && dict[0] <= 'z' && |
| 51 (dict[0] ^ 32) == data[0] && |
| 52 FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) == |
| 53 w.len - 1u); |
| 54 } else { |
| 55 // Match against uppercase all transform. |
| 56 // Note that there are only ASCII uppercase words in the lookup table. |
| 57 for (size_t i = 0; i < w.len; ++i) { |
| 58 if (dict[i] >= 'a' && dict[i] <= 'z') { |
| 59 if ((dict[i] ^ 32) != data[i]) return false; |
| 60 } else { |
| 61 if (dict[i] != data[i]) return false; |
| 62 } |
| 63 } |
| 64 return true; |
| 65 } |
| 66 } |
| 67 |
| 68 bool FindAllStaticDictionaryMatches(const uint8_t* data, |
| 69 size_t min_length, |
| 70 size_t max_length, |
| 71 uint32_t* matches) { |
| 72 bool found_match = false; |
| 73 size_t key = Hash(data); |
| 74 size_t bucket = kStaticDictionaryBuckets[key]; |
| 75 if (bucket != 0) { |
| 76 size_t num = bucket & 0xff; |
| 77 size_t offset = bucket >> 8; |
| 78 for (size_t i = 0; i < num; ++i) { |
| 79 const DictWord w = kStaticDictionaryWords[offset + i]; |
| 80 const size_t l = w.len; |
| 81 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l]; |
| 82 const size_t id = w.idx; |
| 83 if (w.transform == 0) { |
| 84 const size_t matchlen = DictMatchLength(data, id, l, max_length); |
| 85 // Transform "" + kIdentity + "" |
| 86 if (matchlen == l) { |
| 87 AddMatch(id, l, l, matches); |
| 88 found_match = true; |
| 89 } |
| 90 // Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing " |
| 91 if (matchlen >= l - 1) { |
| 92 AddMatch(id + 12 * n, l - 1, l, matches); |
| 93 if (l + 2 < max_length && |
| 94 data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' && |
| 95 data[l + 2] == ' ') { |
| 96 AddMatch(id + 49 * n, l + 3, l, matches); |
| 97 } |
| 98 found_match = true; |
| 99 } |
| 100 // Transform "" + kOmitLastN + "" (N = 2 .. 9) |
| 101 size_t minlen = min_length; |
| 102 if (l > 9) minlen = std::max(minlen, l - 9); |
| 103 size_t maxlen = std::min(matchlen, l - 2); |
| 104 for (size_t len = minlen; len <= maxlen; ++len) { |
| 105 AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches); |
| 106 found_match = true; |
| 107 } |
| 108 if (matchlen < l || l + 6 >= max_length) { |
| 109 continue; |
| 110 } |
| 111 const uint8_t* s = &data[l]; |
| 112 // Transforms "" + kIdentity + <suffix> |
| 113 if (s[0] == ' ') { |
| 114 AddMatch(id + n, l + 1, l, matches); |
| 115 if (s[1] == 'a') { |
| 116 if (s[2] == ' ') { |
| 117 AddMatch(id + 28 * n, l + 3, l, matches); |
| 118 } else if (s[2] == 's') { |
| 119 if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches); |
| 120 } else if (s[2] == 't') { |
| 121 if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches); |
| 122 } else if (s[2] == 'n') { |
| 123 if (s[3] == 'd' && s[4] == ' ') { |
| 124 AddMatch(id + 10 * n, l + 5, l, matches); |
| 125 } |
| 126 } |
| 127 } else if (s[1] == 'b') { |
| 128 if (s[2] == 'y' && s[3] == ' ') { |
| 129 AddMatch(id + 38 * n, l + 4, l, matches); |
| 130 } |
| 131 } else if (s[1] == 'i') { |
| 132 if (s[2] == 'n') { |
| 133 if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches); |
| 134 } else if (s[2] == 's') { |
| 135 if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches); |
| 136 } |
| 137 } else if (s[1] == 'f') { |
| 138 if (s[2] == 'o') { |
| 139 if (s[3] == 'r' && s[4] == ' ') { |
| 140 AddMatch(id + 25 * n, l + 5, l, matches); |
| 141 } |
| 142 } else if (s[2] == 'r') { |
| 143 if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') { |
| 144 AddMatch(id + 37 * n, l + 6, l, matches); |
| 145 } |
| 146 } |
| 147 } else if (s[1] == 'o') { |
| 148 if (s[2] == 'f') { |
| 149 if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches); |
| 150 } else if (s[2] == 'n') { |
| 151 if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches); |
| 152 } |
| 153 } else if (s[1] == 'n') { |
| 154 if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') { |
| 155 AddMatch(id + 80 * n, l + 5, l, matches); |
| 156 } |
| 157 } else if (s[1] == 't') { |
| 158 if (s[2] == 'h') { |
| 159 if (s[3] == 'e') { |
| 160 if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches); |
| 161 } else if (s[3] == 'a') { |
| 162 if (s[4] == 't' && s[5] == ' ') { |
| 163 AddMatch(id + 29 * n, l + 6, l, matches); |
| 164 } |
| 165 } |
| 166 } else if (s[2] == 'o') { |
| 167 if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches); |
| 168 } |
| 169 } else if (s[1] == 'w') { |
| 170 if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') { |
| 171 AddMatch(id + 35 * n, l + 6, l, matches); |
| 172 } |
| 173 } |
| 174 } else if (s[0] == '"') { |
| 175 AddMatch(id + 19 * n, l + 1, l, matches); |
| 176 if (s[1] == '>') { |
| 177 AddMatch(id + 21 * n, l + 2, l, matches); |
| 178 } |
| 179 } else if (s[0] == '.') { |
| 180 AddMatch(id + 20 * n, l + 1, l, matches); |
| 181 if (s[1] == ' ') { |
| 182 AddMatch(id + 31 * n, l + 2, l, matches); |
| 183 if (s[2] == 'T' && s[3] == 'h') { |
| 184 if (s[4] == 'e') { |
| 185 if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches); |
| 186 } else if (s[4] == 'i') { |
| 187 if (s[5] == 's' && s[6] == ' ') { |
| 188 AddMatch(id + 75 * n, l + 7, l, matches); |
| 189 } |
| 190 } |
| 191 } |
| 192 } |
| 193 } else if (s[0] == ',') { |
| 194 AddMatch(id + 76 * n, l + 1, l, matches); |
| 195 if (s[1] == ' ') { |
| 196 AddMatch(id + 14 * n, l + 2, l, matches); |
| 197 } |
| 198 } else if (s[0] == '\n') { |
| 199 AddMatch(id + 22 * n, l + 1, l, matches); |
| 200 if (s[1] == '\t') { |
| 201 AddMatch(id + 50 * n, l + 2, l, matches); |
| 202 } |
| 203 } else if (s[0] == ']') { |
| 204 AddMatch(id + 24 * n, l + 1, l, matches); |
| 205 } else if (s[0] == '\'') { |
| 206 AddMatch(id + 36 * n, l + 1, l, matches); |
| 207 } else if (s[0] == ':') { |
| 208 AddMatch(id + 51 * n, l + 1, l, matches); |
| 209 } else if (s[0] == '(') { |
| 210 AddMatch(id + 57 * n, l + 1, l, matches); |
| 211 } else if (s[0] == '=') { |
| 212 if (s[1] == '"') { |
| 213 AddMatch(id + 70 * n, l + 2, l, matches); |
| 214 } else if (s[1] == '\'') { |
| 215 AddMatch(id + 86 * n, l + 2, l, matches); |
| 216 } |
| 217 } else if (s[0] == 'a') { |
| 218 if (s[1] == 'l' && s[2] == ' ') { |
| 219 AddMatch(id + 84 * n, l + 3, l, matches); |
| 220 } |
| 221 } else if (s[0] == 'e') { |
| 222 if (s[1] == 'd') { |
| 223 if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches); |
| 224 } else if (s[1] == 'r') { |
| 225 if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches); |
| 226 } else if (s[1] == 's') { |
| 227 if (s[2] == 't' && s[3] == ' ') { |
| 228 AddMatch(id + 95 * n, l + 4, l, matches); |
| 229 } |
| 230 } |
| 231 } else if (s[0] == 'f') { |
| 232 if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') { |
| 233 AddMatch(id + 90 * n, l + 4, l, matches); |
| 234 } |
| 235 } else if (s[0] == 'i') { |
| 236 if (s[1] == 'v') { |
| 237 if (s[2] == 'e' && s[3] == ' ') { |
| 238 AddMatch(id + 92 * n, l + 4, l, matches); |
| 239 } |
| 240 } else if (s[1] == 'z') { |
| 241 if (s[2] == 'e' && s[3] == ' ') { |
| 242 AddMatch(id + 100 * n, l + 4, l, matches); |
| 243 } |
| 244 } |
| 245 } else if (s[0] == 'l') { |
| 246 if (s[1] == 'e') { |
| 247 if (s[2] == 's' && s[3] == 's' && s[4] == ' ') { |
| 248 AddMatch(id + 93 * n, l + 5, l, matches); |
| 249 } |
| 250 } else if (s[1] == 'y') { |
| 251 if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches); |
| 252 } |
| 253 } else if (s[0] == 'o') { |
| 254 if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') { |
| 255 AddMatch(id + 106 * n, l + 4, l, matches); |
| 256 } |
| 257 } |
| 258 } else { |
| 259 // Set t=false for kUppercaseFirst and |
| 260 // t=true otherwise (kUppercaseAll) transform. |
| 261 const bool t = w.transform != kUppercaseFirst; |
| 262 if (!IsMatch(w, data, max_length)) { |
| 263 continue; |
| 264 } |
| 265 // Transform "" + kUppercase{First,All} + "" |
| 266 AddMatch(id + (t ? 44 : 9) * n, l, l, matches); |
| 267 found_match = true; |
| 268 if (l + 1 >= max_length) { |
| 269 continue; |
| 270 } |
| 271 // Transforms "" + kUppercase{First,All} + <suffix> |
| 272 const uint8_t* s = &data[l]; |
| 273 if (s[0] == ' ') { |
| 274 AddMatch(id + (t ? 68 : 4) * n, l + 1, l, matches); |
| 275 } else if (s[0] == '"') { |
| 276 AddMatch(id + (t ? 87 : 66) * n, l + 1, l, matches); |
| 277 if (s[1] == '>') { |
| 278 AddMatch(id + (t ? 97 : 69) * n, l + 2, l, matches); |
| 279 } |
| 280 } else if (s[0] == '.') { |
| 281 AddMatch(id + (t ? 101 : 79) * n, l + 1, l, matches); |
| 282 if (s[1] == ' ') { |
| 283 AddMatch(id + (t ? 114 : 88) * n, l + 2, l, matches); |
| 284 } |
| 285 } else if (s[0] == ',') { |
| 286 AddMatch(id + (t ? 112 : 99) * n, l + 1, l, matches); |
| 287 if (s[1] == ' ') { |
| 288 AddMatch(id + (t ? 107 : 58) * n, l + 2, l, matches); |
| 289 } |
| 290 } else if (s[0] == '\'') { |
| 291 AddMatch(id + (t ? 94 : 74) * n, l + 1, l, matches); |
| 292 } else if (s[0] == '(') { |
| 293 AddMatch(id + (t ? 113 : 78) * n, l + 1, l, matches); |
| 294 } else if (s[0] == '=') { |
| 295 if (s[1] == '"') { |
| 296 AddMatch(id + (t ? 105 : 104) * n, l + 2, l, matches); |
| 297 } else if (s[1] == '\'') { |
| 298 AddMatch(id + (t ? 116 : 108) * n, l + 2, l, matches); |
| 299 } |
| 300 } |
| 301 } |
| 302 } |
| 303 } |
| 304 // Transforms with prefixes " " and "." |
| 305 if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) { |
| 306 bool is_space = (data[0] == ' '); |
| 307 key = Hash(&data[1]); |
| 308 bucket = kStaticDictionaryBuckets[key]; |
| 309 size_t num = bucket & 0xff; |
| 310 size_t offset = bucket >> 8; |
| 311 for (size_t i = 0; i < num; ++i) { |
| 312 const DictWord w = kStaticDictionaryWords[offset + i]; |
| 313 const size_t l = w.len; |
| 314 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l]; |
| 315 const size_t id = w.idx; |
| 316 if (w.transform == 0) { |
| 317 if (!IsMatch(w, &data[1], max_length - 1)) { |
| 318 continue; |
| 319 } |
| 320 // Transforms " " + kIdentity + "" and "." + kIdentity + "" |
| 321 AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches); |
| 322 found_match = true; |
| 323 if (l + 2 >= max_length) { |
| 324 continue; |
| 325 } |
| 326 // Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix> |
| 327 const uint8_t* s = &data[l + 1]; |
| 328 if (s[0] == ' ') { |
| 329 AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches); |
| 330 } else if (s[0] == '(') { |
| 331 AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches); |
| 332 } else if (is_space) { |
| 333 if (s[0] == ',') { |
| 334 AddMatch(id + 103 * n, l + 2, l, matches); |
| 335 if (s[1] == ' ') { |
| 336 AddMatch(id + 33 * n, l + 3, l, matches); |
| 337 } |
| 338 } else if (s[0] == '.') { |
| 339 AddMatch(id + 71 * n, l + 2, l, matches); |
| 340 if (s[1] == ' ') { |
| 341 AddMatch(id + 52 * n, l + 3, l, matches); |
| 342 } |
| 343 } else if (s[0] == '=') { |
| 344 if (s[1] == '"') { |
| 345 AddMatch(id + 81 * n, l + 3, l, matches); |
| 346 } else if (s[1] == '\'') { |
| 347 AddMatch(id + 98 * n, l + 3, l, matches); |
| 348 } |
| 349 } |
| 350 } |
| 351 } else if (is_space) { |
| 352 // Set t=false for kUppercaseFirst and |
| 353 // t=true otherwise (kUppercaseAll) transform. |
| 354 const bool t = w.transform != kUppercaseFirst; |
| 355 if (!IsMatch(w, &data[1], max_length - 1)) { |
| 356 continue; |
| 357 } |
| 358 // Transforms " " + kUppercase{First,All} + "" |
| 359 AddMatch(id + (t ? 85 : 30) * n, l + 1, l, matches); |
| 360 found_match = true; |
| 361 if (l + 2 >= max_length) { |
| 362 continue; |
| 363 } |
| 364 // Transforms " " + kUppercase{First,All} + <suffix> |
| 365 const uint8_t* s = &data[l + 1]; |
| 366 if (s[0] == ' ') { |
| 367 AddMatch(id + (t ? 83 : 15) * n, l + 2, l, matches); |
| 368 } else if (s[0] == ',') { |
| 369 if (!t) { |
| 370 AddMatch(id + 109 * n, l + 2, l, matches); |
| 371 } |
| 372 if (s[1] == ' ') { |
| 373 AddMatch(id + (t ? 111 : 65) * n, l + 3, l, matches); |
| 374 } |
| 375 } else if (s[0] == '.') { |
| 376 AddMatch(id + (t ? 115 : 96) * n, l + 2, l, matches); |
| 377 if (s[1] == ' ') { |
| 378 AddMatch(id + (t ? 117 : 91) * n, l + 3, l, matches); |
| 379 } |
| 380 } else if (s[0] == '=') { |
| 381 if (s[1] == '"') { |
| 382 AddMatch(id + (t ? 110 : 118) * n, l + 3, l, matches); |
| 383 } else if (s[1] == '\'') { |
| 384 AddMatch(id + (t ? 119 : 120) * n, l + 3, l, matches); |
| 385 } |
| 386 } |
| 387 } |
| 388 } |
| 389 } |
| 390 if (max_length >= 6) { |
| 391 // Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0" |
| 392 if ((data[1] == ' ' && |
| 393 (data[0] == 'e' || data[0] == 's' || data[0] == ',')) || |
| 394 (data[0] == 0xc2 && data[1] == 0xa0)) { |
| 395 key = Hash(&data[2]); |
| 396 bucket = kStaticDictionaryBuckets[key]; |
| 397 size_t num = bucket & 0xff; |
| 398 size_t offset = bucket >> 8; |
| 399 for (size_t i = 0; i < num; ++i) { |
| 400 const DictWord w = kStaticDictionaryWords[offset + i]; |
| 401 const size_t l = w.len; |
| 402 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l]; |
| 403 const size_t id = w.idx; |
| 404 if (w.transform == 0 && IsMatch(w, &data[2], max_length - 2)) { |
| 405 if (data[0] == 0xc2) { |
| 406 AddMatch(id + 102 * n, l + 2, l, matches); |
| 407 found_match = true; |
| 408 } else if (l + 2 < max_length && data[l + 2] == ' ') { |
| 409 size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13); |
| 410 AddMatch(id + t * n, l + 3, l, matches); |
| 411 found_match = true; |
| 412 } |
| 413 } |
| 414 } |
| 415 } |
| 416 } |
| 417 if (max_length >= 9) { |
| 418 // Transforms with prefixes " the " and ".com/" |
| 419 if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' && |
| 420 data[3] == 'e' && data[4] == ' ') || |
| 421 (data[0] == '.' && data[1] == 'c' && data[2] == 'o' && |
| 422 data[3] == 'm' && data[4] == '/')) { |
| 423 key = Hash(&data[5]); |
| 424 bucket = kStaticDictionaryBuckets[key]; |
| 425 size_t num = bucket & 0xff; |
| 426 size_t offset = bucket >> 8; |
| 427 for (size_t i = 0; i < num; ++i) { |
| 428 const DictWord w = kStaticDictionaryWords[offset + i]; |
| 429 const size_t l = w.len; |
| 430 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l]; |
| 431 const size_t id = w.idx; |
| 432 if (w.transform == 0 && IsMatch(w, &data[5], max_length - 5)) { |
| 433 AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches); |
| 434 found_match = true; |
| 435 if (l + 5 < max_length) { |
| 436 const uint8_t* s = &data[l + 5]; |
| 437 if (data[0] == ' ') { |
| 438 if (l + 8 < max_length && |
| 439 s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') { |
| 440 AddMatch(id + 62 * n, l + 9, l, matches); |
| 441 if (l + 12 < max_length && |
| 442 s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') { |
| 443 AddMatch(id + 73 * n, l + 13, l, matches); |
| 444 } |
| 445 } |
| 446 } |
| 447 } |
| 448 } |
| 449 } |
| 450 } |
| 451 } |
| 452 return found_match; |
| 453 } |
| 454 |
| 455 } // namespace brotli |
| OLD | NEW |