third_party/brotli/enc/static_dict.cc - Issue 2537133002: Update brotli to v1.0.0-snapshot.

Side by Side Diff: third_party/brotli/enc/static_dict.cc

Issue 2537133002: Update brotli to v1.0.0-snapshot. (Closed)

Patch Set: Fixed typo Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /* Copyright 2013 Google Inc. All Rights Reserved.

2

3 Distributed under MIT license.

4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT

5 */

6

7 #include "./static_dict.h"

8

9 #include <algorithm>

10

11 #include "./dictionary.h"

12 #include "./find_match_length.h"

13 #include "./static_dict_lut.h"

14 #include "./transform.h"

15

16 namespace brotli {

17

18 inline uint32_t Hash(const uint8_t *data) {

19 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kDictHashMul32;

20 // The higher bits contain more mixture from the multiplication,

21 // so we take our results from there.

22 return h >> (32 - kDictNumBits);

23 }

24

25 inline void AddMatch(size_t distance, size_t len, size_t len_code,

26 uint32_t* matches) {

27 uint32_t match = static_cast<uint32_t>((distance << 5) + len_code);

28 matches[len] = std::min(matches[len], match);

29 }

30

31 inline size_t DictMatchLength(const uint8_t* data,

32 size_t id,

33 size_t len,

34 size_t maxlen) {

35 const size_t offset = kBrotliDictionaryOffsetsByLength[len] + len * id;

36 return FindMatchLengthWithLimit(&kBrotliDictionary[offset], data,

37 std::min(len, maxlen));

38 }

39

40 inline bool IsMatch(DictWord w, const uint8_t* data, size_t max_length) {

41 if (w.len > max_length) return false;

42 const size_t offset = kBrotliDictionaryOffsetsByLength[w.len] + w.len * w.idx;

43 const uint8_t* dict = &kBrotliDictionary[offset];

44 if (w.transform == 0) {

45 // Match against base dictionary word.

46 return FindMatchLengthWithLimit(dict, data, w.len) == w.len;

47 } else if (w.transform == 10) {

48 // Match against uppercase first transform.

49 // Note that there are only ASCII uppercase words in the lookup table.

50 return (dict[0] >= 'a' && dict[0] <= 'z' &&

51 (dict[0] ^ 32) == data[0] &&

52 FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) ==

53 w.len - 1u);

54 } else {

55 // Match against uppercase all transform.

56 // Note that there are only ASCII uppercase words in the lookup table.

57 for (size_t i = 0; i < w.len; ++i) {

58 if (dict[i] >= 'a' && dict[i] <= 'z') {

59 if ((dict[i] ^ 32) != data[i]) return false;

60 } else {

61 if (dict[i] != data[i]) return false;

62 }

63 }

64 return true;

65 }

66 }

67

68 bool FindAllStaticDictionaryMatches(const uint8_t* data,

69 size_t min_length,

70 size_t max_length,

71 uint32_t* matches) {

72 bool found_match = false;

73 size_t key = Hash(data);

74 size_t bucket = kStaticDictionaryBuckets[key];

75 if (bucket != 0) {

76 size_t num = bucket & 0xff;

77 size_t offset = bucket >> 8;

78 for (size_t i = 0; i < num; ++i) {

79 const DictWord w = kStaticDictionaryWords[offset + i];

80 const size_t l = w.len;

81 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l];

82 const size_t id = w.idx;

83 if (w.transform == 0) {

84 const size_t matchlen = DictMatchLength(data, id, l, max_length);

85 // Transform "" + kIdentity + ""

86 if (matchlen == l) {

87 AddMatch(id, l, l, matches);

88 found_match = true;

89 }

90 // Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing "

91 if (matchlen >= l - 1) {

92 AddMatch(id + 12 * n, l - 1, l, matches);

93 if (l + 2 < max_length &&

94 data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&

95 data[l + 2] == ' ') {

96 AddMatch(id + 49 * n, l + 3, l, matches);

97 }

98 found_match = true;

99 }

100 // Transform "" + kOmitLastN + "" (N = 2 .. 9)

101 size_t minlen = min_length;

102 if (l > 9) minlen = std::max(minlen, l - 9);

103 size_t maxlen = std::min(matchlen, l - 2);

104 for (size_t len = minlen; len <= maxlen; ++len) {

105 AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches);

106 found_match = true;

107 }

108 if (matchlen < l \|\| l + 6 >= max_length) {

109 continue;

110 }

111 const uint8_t* s = &data[l];

112 // Transforms "" + kIdentity + <suffix>

113 if (s[0] == ' ') {

114 AddMatch(id + n, l + 1, l, matches);

115 if (s[1] == 'a') {

116 if (s[2] == ' ') {

117 AddMatch(id + 28 * n, l + 3, l, matches);

118 } else if (s[2] == 's') {

119 if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);

120 } else if (s[2] == 't') {

121 if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);

122 } else if (s[2] == 'n') {

123 if (s[3] == 'd' && s[4] == ' ') {

124 AddMatch(id + 10 * n, l + 5, l, matches);

125 }

126 }

127 } else if (s[1] == 'b') {

128 if (s[2] == 'y' && s[3] == ' ') {

129 AddMatch(id + 38 * n, l + 4, l, matches);

130 }

131 } else if (s[1] == 'i') {

132 if (s[2] == 'n') {

133 if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);

134 } else if (s[2] == 's') {

135 if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);

136 }

137 } else if (s[1] == 'f') {

138 if (s[2] == 'o') {

139 if (s[3] == 'r' && s[4] == ' ') {

140 AddMatch(id + 25 * n, l + 5, l, matches);

141 }

142 } else if (s[2] == 'r') {

143 if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {

144 AddMatch(id + 37 * n, l + 6, l, matches);

145 }

146 }

147 } else if (s[1] == 'o') {

148 if (s[2] == 'f') {

149 if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);

150 } else if (s[2] == 'n') {

151 if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);

152 }

153 } else if (s[1] == 'n') {

154 if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {

155 AddMatch(id + 80 * n, l + 5, l, matches);

156 }

157 } else if (s[1] == 't') {

158 if (s[2] == 'h') {

159 if (s[3] == 'e') {

160 if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);

161 } else if (s[3] == 'a') {

162 if (s[4] == 't' && s[5] == ' ') {

163 AddMatch(id + 29 * n, l + 6, l, matches);

164 }

165 }

166 } else if (s[2] == 'o') {

167 if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);

168 }

169 } else if (s[1] == 'w') {

170 if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {

171 AddMatch(id + 35 * n, l + 6, l, matches);

172 }

173 }

174 } else if (s[0] == '"') {

175 AddMatch(id + 19 * n, l + 1, l, matches);

176 if (s[1] == '>') {

177 AddMatch(id + 21 * n, l + 2, l, matches);

178 }

179 } else if (s[0] == '.') {

180 AddMatch(id + 20 * n, l + 1, l, matches);

181 if (s[1] == ' ') {

182 AddMatch(id + 31 * n, l + 2, l, matches);

183 if (s[2] == 'T' && s[3] == 'h') {

184 if (s[4] == 'e') {

185 if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);

186 } else if (s[4] == 'i') {

187 if (s[5] == 's' && s[6] == ' ') {

188 AddMatch(id + 75 * n, l + 7, l, matches);

189 }

190 }

191 }

192 }

193 } else if (s[0] == ',') {

194 AddMatch(id + 76 * n, l + 1, l, matches);

195 if (s[1] == ' ') {

196 AddMatch(id + 14 * n, l + 2, l, matches);

197 }

198 } else if (s[0] == '\n') {

199 AddMatch(id + 22 * n, l + 1, l, matches);

200 if (s[1] == '\t') {

201 AddMatch(id + 50 * n, l + 2, l, matches);

202 }

203 } else if (s[0] == ']') {

204 AddMatch(id + 24 * n, l + 1, l, matches);

205 } else if (s[0] == '\'') {

206 AddMatch(id + 36 * n, l + 1, l, matches);

207 } else if (s[0] == ':') {

208 AddMatch(id + 51 * n, l + 1, l, matches);

209 } else if (s[0] == '(') {

210 AddMatch(id + 57 * n, l + 1, l, matches);

211 } else if (s[0] == '=') {

212 if (s[1] == '"') {

213 AddMatch(id + 70 * n, l + 2, l, matches);

214 } else if (s[1] == '\'') {

215 AddMatch(id + 86 * n, l + 2, l, matches);

216 }

217 } else if (s[0] == 'a') {

218 if (s[1] == 'l' && s[2] == ' ') {

219 AddMatch(id + 84 * n, l + 3, l, matches);

220 }

221 } else if (s[0] == 'e') {

222 if (s[1] == 'd') {

223 if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);

224 } else if (s[1] == 'r') {

225 if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);

226 } else if (s[1] == 's') {

227 if (s[2] == 't' && s[3] == ' ') {

228 AddMatch(id + 95 * n, l + 4, l, matches);

229 }

230 }

231 } else if (s[0] == 'f') {

232 if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {

233 AddMatch(id + 90 * n, l + 4, l, matches);

234 }

235 } else if (s[0] == 'i') {

236 if (s[1] == 'v') {

237 if (s[2] == 'e' && s[3] == ' ') {

238 AddMatch(id + 92 * n, l + 4, l, matches);

239 }

240 } else if (s[1] == 'z') {

241 if (s[2] == 'e' && s[3] == ' ') {

242 AddMatch(id + 100 * n, l + 4, l, matches);

243 }

244 }

245 } else if (s[0] == 'l') {

246 if (s[1] == 'e') {

247 if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {

248 AddMatch(id + 93 * n, l + 5, l, matches);

249 }

250 } else if (s[1] == 'y') {

251 if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);

252 }

253 } else if (s[0] == 'o') {

254 if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {

255 AddMatch(id + 106 * n, l + 4, l, matches);

256 }

257 }

258 } else {

259 // Set t=false for kUppercaseFirst and

260 // t=true otherwise (kUppercaseAll) transform.

261 const bool t = w.transform != kUppercaseFirst;

262 if (!IsMatch(w, data, max_length)) {

263 continue;

264 }

265 // Transform "" + kUppercase{First,All} + ""

266 AddMatch(id + (t ? 44 : 9) * n, l, l, matches);

267 found_match = true;

268 if (l + 1 >= max_length) {

269 continue;

270 }

271 // Transforms "" + kUppercase{First,All} + <suffix>

272 const uint8_t* s = &data[l];

273 if (s[0] == ' ') {

274 AddMatch(id + (t ? 68 : 4) * n, l + 1, l, matches);

275 } else if (s[0] == '"') {

276 AddMatch(id + (t ? 87 : 66) * n, l + 1, l, matches);

277 if (s[1] == '>') {

278 AddMatch(id + (t ? 97 : 69) * n, l + 2, l, matches);

279 }

280 } else if (s[0] == '.') {

281 AddMatch(id + (t ? 101 : 79) * n, l + 1, l, matches);

282 if (s[1] == ' ') {

283 AddMatch(id + (t ? 114 : 88) * n, l + 2, l, matches);

284 }

285 } else if (s[0] == ',') {

286 AddMatch(id + (t ? 112 : 99) * n, l + 1, l, matches);

287 if (s[1] == ' ') {

288 AddMatch(id + (t ? 107 : 58) * n, l + 2, l, matches);

289 }

290 } else if (s[0] == '\'') {

291 AddMatch(id + (t ? 94 : 74) * n, l + 1, l, matches);

292 } else if (s[0] == '(') {

293 AddMatch(id + (t ? 113 : 78) * n, l + 1, l, matches);

294 } else if (s[0] == '=') {

295 if (s[1] == '"') {

296 AddMatch(id + (t ? 105 : 104) * n, l + 2, l, matches);

297 } else if (s[1] == '\'') {

298 AddMatch(id + (t ? 116 : 108) * n, l + 2, l, matches);

299 }

300 }

301 }

302 }

303 }

304 // Transforms with prefixes " " and "."

305 if (max_length >= 5 && (data[0] == ' ' \|\| data[0] == '.')) {

306 bool is_space = (data[0] == ' ');

307 key = Hash(&data[1]);

308 bucket = kStaticDictionaryBuckets[key];

309 size_t num = bucket & 0xff;

310 size_t offset = bucket >> 8;

311 for (size_t i = 0; i < num; ++i) {

312 const DictWord w = kStaticDictionaryWords[offset + i];

313 const size_t l = w.len;

314 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l];

315 const size_t id = w.idx;

316 if (w.transform == 0) {

317 if (!IsMatch(w, &data[1], max_length - 1)) {

318 continue;

319 }

320 // Transforms " " + kIdentity + "" and "." + kIdentity + ""

321 AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);

322 found_match = true;

323 if (l + 2 >= max_length) {

324 continue;

325 }

326 // Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix>

327 const uint8_t* s = &data[l + 1];

328 if (s[0] == ' ') {

329 AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);

330 } else if (s[0] == '(') {

331 AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);

332 } else if (is_space) {

333 if (s[0] == ',') {

334 AddMatch(id + 103 * n, l + 2, l, matches);

335 if (s[1] == ' ') {

336 AddMatch(id + 33 * n, l + 3, l, matches);

337 }

338 } else if (s[0] == '.') {

339 AddMatch(id + 71 * n, l + 2, l, matches);

340 if (s[1] == ' ') {

341 AddMatch(id + 52 * n, l + 3, l, matches);

342 }

343 } else if (s[0] == '=') {

344 if (s[1] == '"') {

345 AddMatch(id + 81 * n, l + 3, l, matches);

346 } else if (s[1] == '\'') {

347 AddMatch(id + 98 * n, l + 3, l, matches);

348 }

349 }

350 }

351 } else if (is_space) {

352 // Set t=false for kUppercaseFirst and

353 // t=true otherwise (kUppercaseAll) transform.

354 const bool t = w.transform != kUppercaseFirst;

355 if (!IsMatch(w, &data[1], max_length - 1)) {

356 continue;

357 }

358 // Transforms " " + kUppercase{First,All} + ""

359 AddMatch(id + (t ? 85 : 30) * n, l + 1, l, matches);

360 found_match = true;

361 if (l + 2 >= max_length) {

362 continue;

363 }

364 // Transforms " " + kUppercase{First,All} + <suffix>

365 const uint8_t* s = &data[l + 1];

366 if (s[0] == ' ') {

367 AddMatch(id + (t ? 83 : 15) * n, l + 2, l, matches);

368 } else if (s[0] == ',') {

369 if (!t) {

370 AddMatch(id + 109 * n, l + 2, l, matches);

371 }

372 if (s[1] == ' ') {

373 AddMatch(id + (t ? 111 : 65) * n, l + 3, l, matches);

374 }

375 } else if (s[0] == '.') {

376 AddMatch(id + (t ? 115 : 96) * n, l + 2, l, matches);

377 if (s[1] == ' ') {

378 AddMatch(id + (t ? 117 : 91) * n, l + 3, l, matches);

379 }

380 } else if (s[0] == '=') {

381 if (s[1] == '"') {

382 AddMatch(id + (t ? 110 : 118) * n, l + 3, l, matches);

383 } else if (s[1] == '\'') {

384 AddMatch(id + (t ? 119 : 120) * n, l + 3, l, matches);

385 }

386 }

387 }

388 }

389 }

390 if (max_length >= 6) {

391 // Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0"

392 if ((data[1] == ' ' &&

393 (data[0] == 'e' \|\| data[0] == 's' \|\| data[0] == ',')) \|\|

394 (data[0] == 0xc2 && data[1] == 0xa0)) {

395 key = Hash(&data[2]);

396 bucket = kStaticDictionaryBuckets[key];

397 size_t num = bucket & 0xff;

398 size_t offset = bucket >> 8;

399 for (size_t i = 0; i < num; ++i) {

400 const DictWord w = kStaticDictionaryWords[offset + i];

401 const size_t l = w.len;

402 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l];

403 const size_t id = w.idx;

404 if (w.transform == 0 && IsMatch(w, &data[2], max_length - 2)) {

405 if (data[0] == 0xc2) {

406 AddMatch(id + 102 * n, l + 2, l, matches);

407 found_match = true;

408 } else if (l + 2 < max_length && data[l + 2] == ' ') {

409 size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);

410 AddMatch(id + t * n, l + 3, l, matches);

411 found_match = true;

412 }

413 }

414 }

415 }

416 }

417 if (max_length >= 9) {

418 // Transforms with prefixes " the " and ".com/"

419 if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&

420 data[3] == 'e' && data[4] == ' ') \|\|

421 (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&

422 data[3] == 'm' && data[4] == '/')) {

423 key = Hash(&data[5]);

424 bucket = kStaticDictionaryBuckets[key];

425 size_t num = bucket & 0xff;

426 size_t offset = bucket >> 8;

427 for (size_t i = 0; i < num; ++i) {

428 const DictWord w = kStaticDictionaryWords[offset + i];

429 const size_t l = w.len;

430 const size_t n = 1u << kBrotliDictionarySizeBitsByLength[l];

431 const size_t id = w.idx;

432 if (w.transform == 0 && IsMatch(w, &data[5], max_length - 5)) {

433 AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);

434 found_match = true;

435 if (l + 5 < max_length) {

436 const uint8_t* s = &data[l + 5];

437 if (data[0] == ' ') {

438 if (l + 8 < max_length &&

439 s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {

440 AddMatch(id + 62 * n, l + 9, l, matches);

441 if (l + 12 < max_length &&

442 s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {

443 AddMatch(id + 73 * n, l + 13, l, matches);

444 }

445 }

446 }

447 }

448 }

449 }

450 }

451 }

452 return found_match;

453 }

454

455 } // namespace brotli

OLD	NEW

« no previous file with comments | « third_party/brotli/enc/static_dict.c ('k') | third_party/brotli/enc/static_dict_lut.h » ('j') | no next file with comments »