// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/hunspell_new/google/bdict_writer.h"

#include "base/logging.h"
#include "base/md5.h"
#include "base/strings/stringprintf.h"
#include "third_party/hunspell_new/google/bdict.h"

namespace hunspell {

// Represents one node of the word trie in memory. This does not have to be
// very efficient since it is only used when building.
class DicNode {
 public:
  enum StorageType {
    UNDEFINED,  // Uninitialized storage type.
    LEAF,       // Word with no additional string data.
    LEAFMORE,   // Word with additional suffix following.
    LIST16,     // List of sub-nodes with 16-bit relative offsets.
    LIST8,      // List of sub-nodes with 8-bit relative offsets.
    LOOKUP32,   // Lookup table with 32-bit absolute offsets.
    LOOKUP16,   // Lookup table with 16-bit relative offsets.
  };

  DicNode() : addition(0), storage(UNDEFINED) {
  }

  ~DicNode() {
    for (size_t i = 0; i < children.size(); i++)
      delete children[i];
  }

  bool is_leaf() const { return children.empty(); }

  // When non-zero, this character is the additional level that this
  // node represents. This will be 0 for some leaf nodes when there is no
  // addition and for the root node.
  char addition;

  std::vector<DicNode*> children;

  // When there are no children, this is a leaf node and this "addition string"
  // is appended to the result. When there are children, this will be empty.
  std::string leaf_addition;

  // For leaf nodes, these are the indices into the affix table.
  std::vector<int> affix_indices;

  // Initially uninitialized; ComputeTrieStorage() will fill this in with the
  // desired serialization method.
  StorageType storage;
};

namespace {

void SerializeTrie(const DicNode* node, std::string* output);

// Returns true if the nth character in the given word is |ch|. Will return
// false when there is no nth character. Note that this will also match an
// implicit NULL at the end of the string.
bool NthCharacterIs(const std::string& word, size_t n, char ch) {
  if (word.length() < n)  // Want to allow n == length() to catch the NULL.
    return false;
  return word.c_str()[n] == ch;  // Use c_str() to get NULL terminator.
}

// Recursively build the trie data structure for the range in the |words| list
// in [begin, end). It is assumed that all words in that range will have the
// same |node_depth - 2| characters at the beginning. This node will key off of
// the |node_depth - 1| character, with a special case for the root.
//
// |node_depth| is how deep this node is in the trie (and corresponds to how
// many letters of the word we will skip). The root level will have
// |node_depth| of 0.
//
// The given |node| will be filled with the data. The return value is the
// index into the |words| vector of the next word to process. It will be
// equal to |end| when all words have been consumed.
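//
// For illustration, and assuming the caller supplies |words| sorted (the
// contiguous prefix grouping below relies on that): for the words "cat",
// "cats" and "cot", the root gets a single child for 'c'. That child has one
// sub-branch 'a' -> 't' (whose children are a 0-addition leaf for "cat"
// itself and an 's' leaf for "cats") and one leaf 'o' whose leaf_addition is
// "t", covering "cot".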
size_t BuildTrie(const BDictWriter::WordList& words,
                 size_t begin, size_t end,
                 size_t node_depth, DicNode* node) {
  // Find the prefix that this node represents.
  const std::string& begin_str = words[begin].first;
  if (begin_str.length() < node_depth) {
    // Singleton.
    node->addition = 0;
    node->affix_indices = words[begin].second;
    return begin + 1;
  }

  // Now find the range of words sharing this prefix.
  size_t match_count;
  if (node_depth == 0 && begin == 0) {
    // Special case the root node.
    match_count = end - begin;
    node->addition = 0;
  } else {
    match_count = 0;
    node->addition = begin_str[node_depth - 1];
    // We know the strings should have [0, node_depth-1) characters at the
    // beginning already matching, so we only need to check the new one.
    while (begin + match_count < end &&
           NthCharacterIs(words[begin + match_count].first,
                          node_depth - 1, node->addition))
      match_count++;
  }

  if (match_count == 1) {
    // Just found a leaf node with no other words sharing its prefix. Save any
    // remaining characters and we're done.
    node->affix_indices = words[begin].second;
    node->leaf_addition = begin_str.substr(node_depth);
    return begin + 1;
  }

  // We have a range of words, add them as children of this node.
  size_t i = begin;
  while (i < begin + match_count) {
    DicNode* cur = new DicNode;
    i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur);
    node->children.push_back(cur);
  }

  return begin + match_count;
}

// Lookup tables are complicated. They can have a magic 0th entry not counted
// in the table dimensions, and also have indices only for the used sub-range.
// This function will compute the starting point and size of a lookup table,
// in addition to whether it should have the magic 0th entry for the given
// list of child nodes.
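//
// For example, a child list whose additions are {0, 'a', 'c', 'f'} yields
// *has_0th_entry = true, *first_item = 'a', and
// *list_size = 'f' - 'a' + 1 = 6.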
void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children,
                                  bool* has_0th_entry,
                                  int* first_item,
                                  int* list_size) {
  *has_0th_entry = false;
  *first_item = 0;
  *list_size = 0;
  if (children.empty())
    return;

  size_t first_offset = 0;
  if (children[0]->addition == 0) {
    *has_0th_entry = true;
    first_offset++;
  }

  if (children.size() == first_offset)
    return;

  *first_item = static_cast<unsigned char>(children[first_offset]->addition);
  unsigned char last_item = children[children.size() - 1]->addition;
  *list_size = last_item - *first_item + 1;
}

// Recursively fills in the storage strategy for this node and each of its
// children. This must be done before actually serializing because the storage
// mode will depend on the size of the children.
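// Returns the number of bytes this node and its children are expected to
// occupy once serialized (GetBDict() uses this to reserve output space).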
size_t ComputeTrieStorage(DicNode* node) {
  if (node->is_leaf()) {
    // The additional affix list holds affixes when there is more than one.
    // Each entry is two bytes, plus an additional FFFF terminator.
    size_t supplementary_size = 0;
    if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
      // We cannot store the first affix ID of the affix list into a leaf node.
      // In this case, we have to store all the affix IDs and a terminator
      // into a supplementary list.
      supplementary_size = node->affix_indices.size() * 2 + 2;
    } else if (node->affix_indices.size() > 1) {
      // We can store the first affix ID of the affix list into a leaf node.
      // In this case, we need to store the remaining affix IDs and a
      // terminator into a supplementary list.
      supplementary_size = node->affix_indices.size() * 2;
    }

    if (node->leaf_addition.empty()) {
      node->storage = DicNode::LEAF;
      return 2 + supplementary_size;
    }
    node->storage = DicNode::LEAFMORE;
    // Signature & affix (2) + null for leaf_addition (1) = 3.
    return 3 + node->leaf_addition.size() + supplementary_size;
  }

  // Recursively compute the size of the children for non-leaf nodes.
  size_t child_size = 0;
  for (size_t i = 0; i < node->children.size(); i++)
    child_size += ComputeTrieStorage(node->children[i]);

  // Fixed size is only 1 byte, which is the ID byte and the count combined.
  static const int kListHeaderSize = 1;

  // Lists can only store up to 15 items.
  static const size_t kListThreshold = 16;
  if (node->children.size() < kListThreshold && child_size <= 0xFF) {
    node->storage = DicNode::LIST8;
    return kListHeaderSize + node->children.size() * 2 + child_size;
  }

  if (node->children.size() < kListThreshold && child_size <= 0xFFFF) {
    node->storage = DicNode::LIST16;
    // Fixed size is one byte plus 3 for each table entry.
    return kListHeaderSize + node->children.size() * 3 + child_size;
  }

  static const int kTableHeaderSize = 2;  // Type + table size.

  bool has_0th_item;
  int first_table_item, table_item_count;
  ComputeLookupStrategyDetails(node->children, &has_0th_item,
                               &first_table_item, &table_item_count);
  if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) +
      table_item_count * 2 < 0xFFFF) {
    // Use 16-bit addressing since the children will fit.
    node->storage = DicNode::LOOKUP16;
    return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 +
           child_size;
  }

  // Use 32-bit addressing as a last resort.
  node->storage = DicNode::LOOKUP32;
  return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 +
         child_size;
}

// Serializes the given node when it is DicNode::LEAF* to the output.
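//
// The emitted layout (as produced by the code below; see bdict.h for the
// authoritative bit masks) is: one ID byte holding the flags plus the high
// bits of the first affix ID, one byte with the low 8 bits of the first affix
// ID, an optional NULL-terminated addition string, and an optional list of
// 16-bit affix IDs terminated by 0xFFFF.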
void SerializeLeaf(const DicNode* node, std::string* output) {
  // The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
  int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0;

  // We may store the first value with the node or in the supplementary list.
  size_t first_affix_in_supplementary_list = 1;
  if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
    // There are not enough bits for this value, so move it to the
    // supplementary list where there are more bits per value.
    first_affix_in_supplementary_list = 0;
    first_affix = BDict::FIRST_AFFIX_IS_UNUSED;
  }

  unsigned char id_byte = (first_affix >> 8) &
                          BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK;

  // The next two bits indicate an additional string and more affixes.
  if (node->storage == DicNode::LEAFMORE)
    id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE;
  if (node->affix_indices.size() > 1 || first_affix_in_supplementary_list == 0)
    id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE;
  output->push_back(id_byte);

  // Following is the low 8 bits of the affix index.
  output->push_back(first_affix & 0xff);

  // Handle the optional addition with NULL terminator.
  if (node->storage == DicNode::LEAFMORE) {
    for (size_t i = 0; i < node->leaf_addition.size() + 1; i++)
      output->push_back(node->leaf_addition.c_str()[i]);
  }

  // Handle any following affixes. We already wrote the 0th one.
  if (node->affix_indices.size() > first_affix_in_supplementary_list) {
    for (size_t i = first_affix_in_supplementary_list;
         i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD;
         i++) {
      output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF));
      output->push_back(
          static_cast<char>((node->affix_indices[i] >> 8) & 0xFF));
    }

    // Terminator for affix list. We use 0xFFFF.
    output->push_back(static_cast<unsigned char>(0xFF));
    output->push_back(static_cast<unsigned char>(0xFF));
  }
}

// Serializes the given node when it is DicNode::LIST* to the output.
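//
// The emitted layout (as produced below) is: one ID byte combining the list
// type bits and the child count, then a fixed-size table with one entry per
// child (the child's character plus an 8- or 16-bit offset relative to the
// end of the table), then each child's serialized data.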
void SerializeList(const DicNode* node, std::string* output) {
  bool is_8_bit = node->storage == DicNode::LIST8;
  unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE |
                          (is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE);
  id_byte |= node->children.size();  // We assume the size fits in 4 bits.
  output->push_back(id_byte);

  // Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
  int bytes_per_entry = (is_8_bit ? 2 : 3);
  size_t table_begin = output->size();
  output->resize(output->size() + node->children.size() * bytes_per_entry);
  size_t children_begin = output->size();

  for (size_t i = 0; i < node->children.size(); i++) {
    // First is the character this entry represents.
    (*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition;

    // Next is the 8- or 16-bit offset.
    size_t offset = output->size() - children_begin;
    if (is_8_bit) {
      DCHECK(offset <= 0xFF);
      (*output)[table_begin + i * bytes_per_entry + 1] =
          static_cast<char>(offset & 0xFF);
    } else {
      unsigned short* output16 = reinterpret_cast<unsigned short*>(
          &(*output)[table_begin + i * bytes_per_entry + 1]);
      *output16 = static_cast<unsigned short>(offset);
    }

    // Now append the children's data.
    SerializeTrie(node->children[i], output);
  }
}

// Serializes the given node when it is DicNode::LOOKUP* to the output.
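//
// The emitted layout (as produced below) is: one ID byte, the first table
// character, the table entry count, an optional offset slot for the magic 0th
// item, a table of 16-bit relative or 32-bit absolute offsets, and then each
// child's serialized data.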
void SerializeLookup(const DicNode* node, std::string* output) {
  unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE;

  bool has_0th_item;
  int first_table_item, table_item_count;
  ComputeLookupStrategyDetails(node->children, &has_0th_item,
                               &first_table_item, &table_item_count);

  // Set the extra bits in the ID byte.
  bool is_32_bit = (node->storage == DicNode::LOOKUP32);
  if (is_32_bit)
    id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE;
  if (has_0th_item)
    id_byte |= BDict::LOOKUP_NODE_0TH_VALUE;

  size_t begin_offset = output->size();

  output->push_back(id_byte);
  output->push_back(static_cast<char>(first_table_item));
  output->push_back(static_cast<char>(table_item_count));

  // Save room for the lookup table and the optional 0th item.
  int bytes_per_entry = (is_32_bit ? 4 : 2);
  size_t zeroth_item_offset = output->size();
  if (has_0th_item)
    output->resize(output->size() + bytes_per_entry);
  size_t table_begin = output->size();
  output->resize(output->size() + table_item_count * bytes_per_entry);

  // Append the children.
  for (size_t i = 0; i < node->children.size(); i++) {
    size_t offset = output->size();

    // Compute the location at which we'll store the offset of the child data.
    // We may be writing the magic 0th item.
    size_t offset_offset;
    if (i == 0 && has_0th_item) {
      offset_offset = zeroth_item_offset;
    } else {
      int table_index =
          static_cast<unsigned char>(node->children[i]->addition) -
          first_table_item;
      offset_offset = table_begin + table_index * bytes_per_entry;
    }

    // Write the offset.
    if (is_32_bit) {
      // Use 32-bit absolute offsets.
      // FIXME(brettw) use bit cast.
      unsigned* offset32 =
          reinterpret_cast<unsigned*>(&(*output)[offset_offset]);
      *offset32 = static_cast<unsigned>(output->size());
    } else {
      // Use 16-bit relative offsets.
      unsigned short* offset16 =
          reinterpret_cast<unsigned short*>(&(*output)[offset_offset]);
      *offset16 = static_cast<unsigned short>(output->size() - begin_offset);
    }

    SerializeTrie(node->children[i], output);
  }
}

// Recursively serializes this node and all of its children to the output.
void SerializeTrie(const DicNode* node, std::string* output) {
  if (node->storage == DicNode::LEAF ||
      node->storage == DicNode::LEAFMORE) {
    SerializeLeaf(node, output);
  } else if (node->storage == DicNode::LIST8 ||
             node->storage == DicNode::LIST16) {
    SerializeList(node, output);
  } else if (node->storage == DicNode::LOOKUP16 ||
             node->storage == DicNode::LOOKUP32) {
    SerializeLookup(node, output);
  }
}
/*
void SerializeStringList(const std::vector<std::string>& list,
                         std::string* output) {
  for (size_t i = 0; i < list.size(); i++) {
    if (i != 0)
      output->push_back('\n');
    output->append(list[i]);
  }
  output->push_back(0);
}
*/

// Appends the given uint32 to the given string.
void AppendUint32(uint32 a, std::string* output) {
  size_t offset = output->size();
  output->resize(offset + 4);
  memcpy(&(*output)[offset], &a, sizeof(uint32));
}

// Serializes the given list of strings with 0 bytes separating them. The end
// will be marked by a double-0.
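// For example, {"ab", "cd"} is serialized as "ab\0cd\0\0".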
void SerializeStringListNullTerm(const std::vector<std::string>& strings,
                                 std::string* output) {
  for (size_t i = 0; i < strings.size(); i++) {
    // Can't tolerate empty strings since they'll mark the end.
    if (strings[i].empty())
      output->push_back(' ');
    else
      output->append(strings[i]);
    output->push_back(0);
  }
  output->push_back(0);
}

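// Serializes the replacement (REP) pairs. Each pair is written as the "from"
// string and the "to" string, each NULL-terminated, and the whole list ends
// with an extra 0 byte.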
void SerializeReplacements(
    const std::vector< std::pair<std::string, std::string> >& repl,
    std::string* output) {
  for (size_t i = 0; i < repl.size(); i++) {
    output->append(repl[i].first);
    output->push_back(0);
    output->append(repl[i].second);
    output->push_back(0);
  }
  output->push_back(0);
}

}  // namespace

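// Typical usage (a sketch; building and sorting the WordList and setting the
// affix data are done by the conversion tool and are not shown here):
//
//   BDictWriter writer;
//   writer.SetWords(sorted_words);
//   std::string serialized = writer.GetBDict();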
BDictWriter::BDictWriter() : trie_root_(NULL) {
}

BDictWriter::~BDictWriter() {
  delete trie_root_;
}

void BDictWriter::SetWords(const WordList& words) {
  trie_root_ = new DicNode;
  BuildTrie(words, 0, words.size(), 0, trie_root_);
}

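// Builds the complete BDict image: a fixed Header, followed by the affix
// data, followed by the serialized word trie. The offsets of the latter two
// are recorded in the header once they are known.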
std::string BDictWriter::GetBDict() const {
  std::string ret;

  // Save room for the header. This will be populated at the end.
  ret.resize(sizeof(hunspell::BDict::Header));

  // Serialize the affix portion.
  size_t aff_offset = ret.size();
  SerializeAff(&ret);

  // Serialize the dictionary words.
  size_t dic_offset = ret.size();
  ret.reserve(ret.size() + ComputeTrieStorage(trie_root_));
  SerializeTrie(trie_root_, &ret);

  // Fill the header last, now that we have the data.
  hunspell::BDict::Header* header =
      reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
  header->signature = hunspell::BDict::SIGNATURE;
  header->major_version = hunspell::BDict::MAJOR_VERSION;
  header->minor_version = hunspell::BDict::MINOR_VERSION;
  header->aff_offset = static_cast<uint32>(aff_offset);
  header->dic_offset = static_cast<uint32>(dic_offset);

  // Write the MD5 digest of the affix information and the dictionary words at
  // the end of the BDic header.
  if (header->major_version >= 2)
    base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest);

  return ret;
}

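// Writes the affix data: a placeholder AffHeader, the comment, the affix
// groups (preceded by a synthetic "AF <count>" line), the affix rules, the
// replacements, and the remaining commands, then fills in the offsets in the
// AffHeader.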
void BDictWriter::SerializeAff(std::string* output) const {
  // Reserve enough room for the header.
  size_t header_offset = output->size();
  output->resize(output->size() + sizeof(hunspell::BDict::AffHeader));

  // Write the comment.
  output->push_back('\n');
  output->append(comment_);
  output->push_back('\n');

  // We need a magic first AF line that lists the number of following ones.
  size_t affix_group_offset = output->size();
  output->append(base::StringPrintf("AF %d",
                                    static_cast<int>(affix_groups_.size())));
  output->push_back(0);
  SerializeStringListNullTerm(affix_groups_, output);

  size_t affix_rule_offset = output->size();
  SerializeStringListNullTerm(affix_rules_, output);

  size_t rep_offset = output->size();
  SerializeReplacements(replacements_, output);

  size_t other_offset = output->size();
  SerializeStringListNullTerm(other_commands_, output);

  // Add the header now that we know the offsets.
  hunspell::BDict::AffHeader* header =
      reinterpret_cast<hunspell::BDict::AffHeader*>(&(*output)[header_offset]);
  header->affix_group_offset = static_cast<uint32>(affix_group_offset);
  header->affix_rule_offset = static_cast<uint32>(affix_rule_offset);
  header->rep_offset = static_cast<uint32>(rep_offset);
  header->other_offset = static_cast<uint32>(other_offset);
}

}  // namespace hunspell