// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/hunspell_new/google/bdict_writer.h"

#include "base/logging.h"
#include "base/md5.h"
#include "base/strings/stringprintf.h"
#include "third_party/hunspell_new/google/bdict.h"

namespace hunspell {

// Represents one node of the word trie in memory. This does not have to be
// very efficient since it is only used when building.
class DicNode {
 public:
  enum StorageType {
    UNDEFINED,  // Uninitialized storage type.
    LEAF,       // Word with no additional string data.
    LEAFMORE,   // Word with additional suffix following.
    LIST16,     // List of sub-nodes with 16-bit relative offsets.
    LIST8,      // List of sub-nodes with 8-bit relative offsets.
    LOOKUP32,   // Lookup table with 32-bit absolute offsets.
    LOOKUP16,   // Lookup table with 16-bit relative offsets.
  };

  DicNode() : addition(0), storage(UNDEFINED) {
  }

  ~DicNode() {
    for (size_t i = 0; i < children.size(); i++)
      delete children[i];
  }

  bool is_leaf() const { return children.empty(); }

  // When non-zero, this character is the additional level that this
  // node represents. This will be 0 for some leaf nodes when there is no
  // addition and for the root node.
  char addition;

  std::vector<DicNode*> children;

  // When there are no children, this is a leaf node and this "addition string"
  // is appended to the result. When there are children, this will be empty.
  std::string leaf_addition;

  // For leaf nodes, these are the indices into the affix table.
  std::vector<int> affix_indices;

  // Initially uninitialized; ComputeTrieStorage() will fill this in with the
  // desired serialization method.
  StorageType storage;
};

namespace {

void SerializeTrie(const DicNode* node, std::string* output);

// Returns true if the nth character in the given word is |ch|. Will return
// false when there is no nth character. Note that this will also match an
// implicit NULL at the end of the string.
bool NthCharacterIs(const std::string& word, size_t n, char ch) {
  if (word.length() < n)  // Want to allow n == length() to catch the NULL.
    return false;
  return word.c_str()[n] == ch;  // Use c_str() to get NULL terminator.
}

// Recursively build the trie data structure for the range in the |words| list
// in [begin, end). It is assumed that all words in that range will have the
// same |node_depth - 2| characters at the beginning. This node will key off of
// the |node_depth - 1| character, with a special case for the root.
//
// |node_depth| is how deep this node is in the trie (and corresponds to how
// many letters of the word we will skip). The root level will have
// |node_depth| of 0.
//
// The given |node| will be filled with the data. The return value is the
// index into the |words| vector of the next word to process. It will be
// equal to |end| when all words have been consumed.
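//
// For illustration, and assuming the caller supplies |words| sorted (the
// contiguous prefix grouping below relies on that): for the words "cat",
// "cats" and "cot", the root gets a single child for 'c'. That child has one
// sub-branch 'a' -> 't' (whose children are a 0-addition leaf for "cat"
// itself and an 's' leaf for "cats") and one leaf 'o' whose leaf_addition is
// "t", covering "cot".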
size_t BuildTrie(const BDictWriter::WordList& words,
                 size_t begin, size_t end,
                 size_t node_depth, DicNode* node) {
  // Find the prefix that this node represents.
  const std::string& begin_str = words[begin].first;
  if (begin_str.length() < node_depth) {
    // Singleton.
    node->addition = 0;
    node->affix_indices = words[begin].second;
    return begin + 1;
  }

  // Now find the range of words sharing this prefix.
  size_t match_count;
  if (node_depth == 0 && begin == 0) {
    // Special case the root node.
    match_count = end - begin;
    node->addition = 0;
  } else {
    match_count = 0;
    node->addition = begin_str[node_depth - 1];
    // We know the strings should have [0, node_depth-1) characters at the
    // beginning already matching, so we only need to check the new one.
    while (begin + match_count < end &&
           NthCharacterIs(words[begin + match_count].first,
                          node_depth - 1, node->addition))
      match_count++;
  }

  if (match_count == 1) {
    // Just found a leaf node with no other words sharing its prefix. Save any
    // remaining characters and we're done.
    node->affix_indices = words[begin].second;
    node->leaf_addition = begin_str.substr(node_depth);
    return begin + 1;
  }

  // We have a range of words, add them as children of this node.
  size_t i = begin;
  while (i < begin + match_count) {
    DicNode* cur = new DicNode;
    i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur);
    node->children.push_back(cur);
  }

  return begin + match_count;
}

// Lookup tables are complicated. They can have a magic 0th entry not counted
// in the table dimensions, and also have indices only for the used sub-range.
// This function will compute the starting point and size of a lookup table,
// in addition to whether it should have the magic 0th entry for the given
// list of child nodes.
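//
// For example, a child list whose additions are {0, 'a', 'c', 'f'} yields
// *has_0th_entry = true, *first_item = 'a', and
// *list_size = 'f' - 'a' + 1 = 6.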
void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children,
                                  bool* has_0th_entry,
                                  int* first_item,
                                  int* list_size) {
  *has_0th_entry = false;
  *first_item = 0;
  *list_size = 0;
  if (children.empty())
    return;

  size_t first_offset = 0;
  if (children[0]->addition == 0) {
    *has_0th_entry = true;
    first_offset++;
  }

  if (children.size() == first_offset)
    return;

  *first_item = static_cast<unsigned char>(children[first_offset]->addition);
  unsigned char last_item = children[children.size() - 1]->addition;
  *list_size = last_item - *first_item + 1;
}

// Recursively fills in the storage strategy for this node and each of its
// children. This must be done before actually serializing because the storage
// mode will depend on the size of the children.
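// Returns the number of bytes this node and its children are expected to
// occupy once serialized (GetBDict() uses this to reserve output space).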
size_t ComputeTrieStorage(DicNode* node) {
  if (node->is_leaf()) {
    // The additional affix list holds affixes when there is more than one.
    // Each entry is two bytes, plus an additional FFFF terminator.
    size_t supplementary_size = 0;
    if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
      // We cannot store the first affix ID of the affix list into a leaf node.
      // In this case, we have to store all the affix IDs and a terminator
      // into a supplementary list.
      supplementary_size = node->affix_indices.size() * 2 + 2;
    } else if (node->affix_indices.size() > 1) {
      // We can store the first affix ID of the affix list into a leaf node.
      // In this case, we need to store the remaining affix IDs and a
      // terminator into a supplementary list.
      supplementary_size = node->affix_indices.size() * 2;
    }

    if (node->leaf_addition.empty()) {
      node->storage = DicNode::LEAF;
      return 2 + supplementary_size;
    }
    node->storage = DicNode::LEAFMORE;
    // Signature & affix (2) + null for leaf_addition (1) = 3.
    return 3 + node->leaf_addition.size() + supplementary_size;
  }

  // Recursively compute the size of the children for non-leaf nodes.
  size_t child_size = 0;
  for (size_t i = 0; i < node->children.size(); i++)
    child_size += ComputeTrieStorage(node->children[i]);

  // Fixed size is only 1 byte, which is the ID byte and the count combined.
  static const int kListHeaderSize = 1;

  // Lists can only store up to 15 items.
  static const size_t kListThreshold = 16;
  if (node->children.size() < kListThreshold && child_size <= 0xFF) {
    node->storage = DicNode::LIST8;
    return kListHeaderSize + node->children.size() * 2 + child_size;
  }

  if (node->children.size() < kListThreshold && child_size <= 0xFFFF) {
    node->storage = DicNode::LIST16;
    // Fixed size is one byte plus 3 for each table entry.
    return kListHeaderSize + node->children.size() * 3 + child_size;
  }

  static const int kTableHeaderSize = 2;  // Type + table size.

  bool has_0th_item;
  int first_table_item, table_item_count;
  ComputeLookupStrategyDetails(node->children, &has_0th_item,
                               &first_table_item, &table_item_count);
  if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) +
      table_item_count * 2 < 0xFFFF) {
    // Use 16-bit addressing since the children will fit.
    node->storage = DicNode::LOOKUP16;
    return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 +
           child_size;
  }

  // Use 32-bit addressing as a last resort.
  node->storage = DicNode::LOOKUP32;
  return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 +
         child_size;
}

// Serializes the given node when it is DicNode::LEAF* to the output.
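//
// The emitted layout (as produced by the code below; see bdict.h for the
// authoritative bit masks) is: one ID byte holding the flags plus the high
// bits of the first affix ID, one byte with the low 8 bits of the first affix
// ID, an optional NULL-terminated addition string, and an optional list of
// 16-bit affix IDs terminated by 0xFFFF.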
void SerializeLeaf(const DicNode* node, std::string* output) {
  // The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
  int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0;

  // We may store the first value with the node or in the supplementary list.
  size_t first_affix_in_supplementary_list = 1;
  if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
    // There are not enough bits for this value, so move it to the
    // supplementary list where there are more bits per value.
    first_affix_in_supplementary_list = 0;
    first_affix = BDict::FIRST_AFFIX_IS_UNUSED;
  }

  unsigned char id_byte = (first_affix >> 8) &
                          BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK;

  // The next two bits indicate an additional string and more affixes.
  if (node->storage == DicNode::LEAFMORE)
    id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE;
  if (node->affix_indices.size() > 1 || first_affix_in_supplementary_list == 0)
    id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE;
  output->push_back(id_byte);

  // Following is the low 8 bits of the affix index.
  output->push_back(first_affix & 0xff);

  // Handle the optional addition with NULL terminator.
  if (node->storage == DicNode::LEAFMORE) {
    for (size_t i = 0; i < node->leaf_addition.size() + 1; i++)
      output->push_back(node->leaf_addition.c_str()[i]);
  }

  // Handle any following affixes. We already wrote the 0th one.
  if (node->affix_indices.size() > first_affix_in_supplementary_list) {
    for (size_t i = first_affix_in_supplementary_list;
         i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD;
         i++) {
      output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF));
      output->push_back(
          static_cast<char>((node->affix_indices[i] >> 8) & 0xFF));
    }

    // Terminator for affix list. We use 0xFFFF.
    output->push_back(static_cast<unsigned char>(0xFF));
    output->push_back(static_cast<unsigned char>(0xFF));
  }
}

// Serializes the given node when it is DicNode::LIST* to the output.
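//
// The emitted layout (as produced below) is: one ID byte combining the list
// type bits and the child count, then a fixed-size table with one entry per
// child (the child's character plus an 8- or 16-bit offset relative to the
// end of the table), then each child's serialized data.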
void SerializeList(const DicNode* node, std::string* output) {
  bool is_8_bit = node->storage == DicNode::LIST8;
  unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE |
                          (is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE);
  id_byte |= node->children.size();  // We assume the size fits in 4 bits.
  output->push_back(id_byte);

  // Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
  int bytes_per_entry = (is_8_bit ? 2 : 3);
  size_t table_begin = output->size();
  output->resize(output->size() + node->children.size() * bytes_per_entry);
  size_t children_begin = output->size();

  for (size_t i = 0; i < node->children.size(); i++) {
    // First is the character this entry represents.
    (*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition;

    // Next is the 8- or 16-bit offset.
    size_t offset = output->size() - children_begin;
    if (is_8_bit) {
      DCHECK(offset <= 0xFF);
      (*output)[table_begin + i * bytes_per_entry + 1] =
          static_cast<char>(offset & 0xFF);
    } else {
      unsigned short* output16 = reinterpret_cast<unsigned short*>(
          &(*output)[table_begin + i * bytes_per_entry + 1]);
      *output16 = static_cast<unsigned short>(offset);
    }

    // Now append the children's data.
    SerializeTrie(node->children[i], output);
  }
}

// Serializes the given node when it is DicNode::LOOKUP* to the output.
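//
// The emitted layout (as produced below) is: one ID byte, the first table
// character, the table entry count, an optional offset slot for the magic 0th
// item, a table of 16-bit relative or 32-bit absolute offsets, and then each
// child's serialized data.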
void SerializeLookup(const DicNode* node, std::string* output) {
  unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE;

  bool has_0th_item;
  int first_table_item, table_item_count;
  ComputeLookupStrategyDetails(node->children, &has_0th_item,
                               &first_table_item, &table_item_count);

  // Set the extra bits in the ID byte.
  bool is_32_bit = (node->storage == DicNode::LOOKUP32);
  if (is_32_bit)
    id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE;
  if (has_0th_item)
    id_byte |= BDict::LOOKUP_NODE_0TH_VALUE;

  size_t begin_offset = output->size();

  output->push_back(id_byte);
  output->push_back(static_cast<char>(first_table_item));
  output->push_back(static_cast<char>(table_item_count));

  // Save room for the lookup table and the optional 0th item.
  int bytes_per_entry = (is_32_bit ? 4 : 2);
  size_t zeroth_item_offset = output->size();
  if (has_0th_item)
    output->resize(output->size() + bytes_per_entry);
  size_t table_begin = output->size();
  output->resize(output->size() + table_item_count * bytes_per_entry);

  // Append the children.
  for (size_t i = 0; i < node->children.size(); i++) {
    size_t offset = output->size();

    // Compute the location at which we'll store the offset of the child data.
    // We may be writing the magic 0th item.
    size_t offset_offset;
    if (i == 0 && has_0th_item) {
      offset_offset = zeroth_item_offset;
    } else {
      int table_index =
          static_cast<unsigned char>(node->children[i]->addition) -
          first_table_item;
      offset_offset = table_begin + table_index * bytes_per_entry;
    }

    // Write the offset.
    if (is_32_bit) {
      // Use 32-bit absolute offsets.
      // FIXME(brettw) use bit cast.
      unsigned* offset32 =
          reinterpret_cast<unsigned*>(&(*output)[offset_offset]);
      *offset32 = static_cast<unsigned>(output->size());
    } else {
      // Use 16-bit relative offsets.
      unsigned short* offset16 =
          reinterpret_cast<unsigned short*>(&(*output)[offset_offset]);
      *offset16 = static_cast<unsigned short>(output->size() - begin_offset);
    }

    SerializeTrie(node->children[i], output);
  }
}

// Recursively serializes this node and all of its children to the output.
void SerializeTrie(const DicNode* node, std::string* output) {
  if (node->storage == DicNode::LEAF ||
      node->storage == DicNode::LEAFMORE) {
    SerializeLeaf(node, output);
  } else if (node->storage == DicNode::LIST8 ||
             node->storage == DicNode::LIST16) {
    SerializeList(node, output);
  } else if (node->storage == DicNode::LOOKUP16 ||
             node->storage == DicNode::LOOKUP32) {
    SerializeLookup(node, output);
  }
}
/*
void SerializeStringList(const std::vector<std::string>& list,
                         std::string* output) {
  for (size_t i = 0; i < list.size(); i++) {
    if (i != 0)
      output->push_back('\n');
    output->append(list[i]);
  }
  output->push_back(0);
}
*/

// Appends the given uint32 to the given string.
void AppendUint32(uint32 a, std::string* output) {
  size_t offset = output->size();
  output->resize(offset + 4);
  memcpy(&(*output)[offset], &a, sizeof(uint32));
}

// Serializes the given list of strings with 0 bytes separating them. The end
// will be marked by a double-0.
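// For example, {"ab", "cd"} is serialized as "ab\0cd\0\0".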
void SerializeStringListNullTerm(const std::vector<std::string>& strings,
                                 std::string* output) {
  for (size_t i = 0; i < strings.size(); i++) {
    // Can't tolerate empty strings since they'll mark the end.
    if (strings[i].empty())
      output->push_back(' ');
    else
      output->append(strings[i]);
    output->push_back(0);
  }
  output->push_back(0);
}

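// Serializes the replacement (REP) pairs. Each pair is written as the "from"
// string and the "to" string, each NULL-terminated, and the whole list ends
// with an extra 0 byte.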
void SerializeReplacements(
    const std::vector< std::pair<std::string, std::string> >& repl,
    std::string* output) {
  for (size_t i = 0; i < repl.size(); i++) {
    output->append(repl[i].first);
    output->push_back(0);
    output->append(repl[i].second);
    output->push_back(0);
  }
  output->push_back(0);
}

}  // namespace

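// Typical usage (a sketch; building and sorting the WordList and setting the
// affix data are done by the conversion tool and are not shown here):
//
//   BDictWriter writer;
//   writer.SetWords(sorted_words);
//   std::string serialized = writer.GetBDict();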
BDictWriter::BDictWriter() : trie_root_(NULL) {
}

BDictWriter::~BDictWriter() {
  delete trie_root_;
}

void BDictWriter::SetWords(const WordList& words) {
  trie_root_ = new DicNode;
  BuildTrie(words, 0, words.size(), 0, trie_root_);
}

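// Builds the complete BDict image: a fixed Header, followed by the affix
// data, followed by the serialized word trie. The offsets of the latter two
// are recorded in the header once they are known.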
std::string BDictWriter::GetBDict() const {
  std::string ret;

  // Save room for the header. This will be populated at the end.
  ret.resize(sizeof(hunspell::BDict::Header));

  // Serialize the affix portion.
  size_t aff_offset = ret.size();
  SerializeAff(&ret);

  // Serialize the dictionary words.
  size_t dic_offset = ret.size();
  ret.reserve(ret.size() + ComputeTrieStorage(trie_root_));
  SerializeTrie(trie_root_, &ret);

  // Fill the header last, now that we have the data.
  hunspell::BDict::Header* header =
      reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
  header->signature = hunspell::BDict::SIGNATURE;
  header->major_version = hunspell::BDict::MAJOR_VERSION;
  header->minor_version = hunspell::BDict::MINOR_VERSION;
  header->aff_offset = static_cast<uint32>(aff_offset);
  header->dic_offset = static_cast<uint32>(dic_offset);

  // Write the MD5 digest of the affix information and the dictionary words at
  // the end of the BDic header.
  if (header->major_version >= 2)
    base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest);

  return ret;
}

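// Writes the affix data: a placeholder AffHeader, the comment, the affix
// groups (preceded by a synthetic "AF <count>" line), the affix rules, the
// replacements, and the remaining commands, then fills in the offsets in the
// AffHeader.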
void BDictWriter::SerializeAff(std::string* output) const {
  // Reserve enough room for the header.
  size_t header_offset = output->size();
  output->resize(output->size() + sizeof(hunspell::BDict::AffHeader));

  // Write the comment.
  output->push_back('\n');
  output->append(comment_);
  output->push_back('\n');

  // We need a magic first AF line that lists the number of following ones.
  size_t affix_group_offset = output->size();
  output->append(base::StringPrintf("AF %d",
                                    static_cast<int>(affix_groups_.size())));
  output->push_back(0);
  SerializeStringListNullTerm(affix_groups_, output);

  size_t affix_rule_offset = output->size();
  SerializeStringListNullTerm(affix_rules_, output);

  size_t rep_offset = output->size();
  SerializeReplacements(replacements_, output);

  size_t other_offset = output->size();
  SerializeStringListNullTerm(other_commands_, output);

  // Add the header now that we know the offsets.
  hunspell::BDict::AffHeader* header =
      reinterpret_cast<hunspell::BDict::AffHeader*>(&(*output)[header_offset]);
  header->affix_group_offset = static_cast<uint32>(affix_group_offset);
  header->affix_rule_offset = static_cast<uint32>(affix_rule_offset);
  header->rep_offset = static_cast<uint32>(rep_offset);
  header->other_offset = static_cast<uint32>(other_offset);
}

}  // namespace hunspell