// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/hunspell_new/google/bdict_reader.h"

#include "base/logging.h"

namespace hunspell {

// Like the "Visitor" design pattern, this lightweight object provides an
// interface around a serialized trie node at the given address in memory.
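//
// A rough sketch of the serialized node layouts, as reconstructed from the
// accessors below (the BDict:: constants are the authoritative definition):
//
//   Leaf:   [ID byte: type/flag bits, plus the top bits of the first affix
//            group ID in its low 6 bits]
//           [second byte: low 8 bits of that affix group ID]
//           [optional NULL-terminated "additional string" completing the word]
//           [optional uint16 array of extra affix group IDs, terminated by
//            LEAF_NODE_FOLLOWING_LIST_TERMINATOR, when the "following" bit
//            is set]
//   Lookup: [ID byte][first char][char count]
//           [optional magic 0th entry for the word that ends at this node]
//           [one entry per char: 16-bit offsets relative to the node, or
//            32-bit absolute offsets]
//   List:   [ID byte: type bits, plus the item count in the low 4 bits]
//           [count (char, child offset) pairs; offsets are relative to the
//            end of the list and are 8 bits, or 16 bits for 16-bit lists]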
class NodeReader {
 public:
  // Return values for GetChildAt.
  enum FindResult {
    // A node is found.
    FIND_NODE,

    // There are no more children for this node, no child node is returned.
    FIND_DONE,

    // There is no node at this location, but there are more if you continue
    // iterating. This happens when there is a lookup node with empty entries.
    FIND_NOTHING
  };

  // The default constructor makes an invalid reader.
  NodeReader();
  NodeReader(const unsigned char* bdict_data, size_t bdict_length,
             size_t node_offset, int node_depth);

  // Returns true if the reader is valid. False means you shouldn't use it.
  bool is_valid() const { return is_valid_; }

  // Recursively finds the given NULL terminated word.
  // See BDictReader::FindWord.
  int FindWord(const unsigned char* word,
               int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;

  // Allows iterating over the children of this node. When it returns
  // FIND_NODE, |*result| will be populated with the reader for the found node.
  // The first index is 0. The single character for this node will be placed
  // into |*found_char|.
  FindResult GetChildAt(int index, char* found_char, NodeReader* result) const;

  // Leaf ----------------------------------------------------------------------

  inline bool is_leaf() const {
    // If id_byte() sets is_valid_ to false, we need an extra check to avoid
    // returning true for this type.
    return (id_byte() & BDict::LEAF_NODE_TYPE_MASK) ==
        BDict::LEAF_NODE_TYPE_VALUE && is_valid_;
  }

  // If this is a leaf node with an additional string, this function will return
  // a pointer to the beginning of the additional string. It will be NULL
  // terminated. If it is not a leaf or has no additional string, it will return
  // NULL.
  inline const unsigned char* additional_string_for_leaf() const {
    // Leaf nodes with additional strings start with bits "01" in the ID byte.
    if ((id_byte() & BDict::LEAF_NODE_ADDITIONAL_MASK) ==
        BDict::LEAF_NODE_ADDITIONAL_VALUE) {
      if (node_offset_ < (bdict_length_ - 2))
        return &bdict_data_[node_offset_ + 2];  // Starts after the 2 byte ID.
      // Otherwise the dictionary is corrupt.
      is_valid_ = false;
    }
    return NULL;
  }

  // Returns the first affix ID corresponding to the given leaf node. The
  // current node must be a leaf or this will do the wrong thing. There may be
  // additional affix IDs following the node when leaf_has_following is set,
  // but this will not handle those.
  inline int affix_id_for_leaf() const {
    if (node_offset_ >= bdict_length_ - 2) {
      is_valid_ = false;
      return 0;
    }
    // Take the lowest 6 bits of the first byte, and all 8 bits of the second.
    return ((bdict_data_[node_offset_ + 0] &
             BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK) << 8) +
        bdict_data_[node_offset_ + 1];
  }

  // Returns true if there is a list of additional affix matches following this
  // leaf node.
  inline bool leaf_has_following() const {
    return ((id_byte() & BDict::LEAF_NODE_FOLLOWING_MASK) ==
            BDict::LEAF_NODE_FOLLOWING_VALUE);
  }

  // Fills the affix indices into the output array given a matching leaf node.
  // |additional_bytes| is the number of bytes of the additional string,
  // including the NULL terminator, following this leaf node. This will be 0 if
  // there is no additional string.
  int FillAffixesForLeafMatch(
      size_t additional_bytes,
      int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;

  // Lookup --------------------------------------------------------------------

  inline bool is_lookup() const {
    return (id_byte() & BDict::LOOKUP_NODE_TYPE_MASK) ==
        BDict::LOOKUP_NODE_TYPE_VALUE;
  }

  inline bool is_lookup_32() const {
    return (id_byte() & BDict::LOOKUP_NODE_32BIT_MASK) ==
        BDict::LOOKUP_NODE_32BIT_VALUE;
  }

  inline bool lookup_has_0th() const {
    return (id_byte() & BDict::LOOKUP_NODE_0TH_MASK) ==
        BDict::LOOKUP_NODE_0TH_VALUE;
  }

  // Returns the offset of the first entry after the lookup table header. When
  // there is a magic 0th entry, this is its offset.
  // The caller checks that the result is in-bounds.
  inline size_t zeroth_entry_offset() const {
    return node_offset_ + 3;
  }

  // Returns the offset of the first element in the lookup table. This skips
  // any magic 0th entry.
  // The caller checks that the result is in-bounds.
  size_t lookup_table_offset() const {
    size_t table_offset = zeroth_entry_offset();
    if (lookup_has_0th())
      return table_offset + (is_lookup_32() ? 4 : 2);
    return table_offset;
  }

  inline int lookup_first_char() const {
    if (node_offset_ >= bdict_length_ - 1) {
      is_valid_ = false;
      return 0;
    }
    return bdict_data_[node_offset_ + 1];
  }

  inline int lookup_num_chars() const {
    if (node_offset_ >= bdict_length_ - 2) {
      is_valid_ = false;
      return 0;
    }
    return bdict_data_[node_offset_ + 2];
  }

  // Computes a node reader for the magic 0th entry of the table. This assumes
  // it has a 0th entry. This will always return FIND_NODE (for compatibility
  // with GetChildAt).
  FindResult ReaderForLookup0th(NodeReader* result) const;

  // Gets a node reader for the |index|th element in the table, not counting
  // the magic 0th element, if any (so passing 0 here will give you the first
  // element in the regular lookup table). The index is assumed to be valid.
  //
  // The character value that the child node represents will be placed into
  // |*found_char|.
  FindResult ReaderForLookupAt(size_t index, char* found_char,
                               NodeReader* result) const;

  // List ----------------------------------------------------------------------

  inline bool is_list() const {
    return (id_byte() & BDict::LIST_NODE_TYPE_MASK) ==
        BDict::LIST_NODE_TYPE_VALUE;
  }

  inline int is_list_16() const {
    // 16-bit list nodes have the high 4 bits set to 1.
    return (id_byte() & BDict::LIST_NODE_16BIT_MASK) ==
        BDict::LIST_NODE_16BIT_VALUE;
  }

  inline size_t list_item_count() const {
    // The list count is stored in the low 4 bits of the ID.
    return id_byte() & BDict::LIST_NODE_COUNT_MASK;
  }

  // Returns a NodeReader for the list item with the given index. The single
  // character for this node will be placed into |*found_char|.
  FindResult ReaderForListAt(size_t index, char* found_char,
                             NodeReader* result) const;

 private:
  inline unsigned char id_byte() const {
    if (!is_valid_)
      return 0;  // Don't continue with a corrupt node.
    if (node_offset_ >= bdict_length_) {
      // Return zero if out of bounds; we'll check is_valid_ in the caller.
      is_valid_ = false;
      return 0;
    }
    return bdict_data_[node_offset_];
  }

  // Checks the given leaf node to see if it's a match for the given word.
  // The parameters and return values are the same as BDictReader::FindWord.
  int CompareLeafNode(const unsigned char* word,
                      int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;

  // Recursive calls used by FindWord to look up child nodes of different types.
  int FindInLookup(const unsigned char* word,
                   int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
  int FindInList(const unsigned char* word,
                 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;

  // The entire bdict file. This will be NULL if it is invalid.
  const unsigned char* bdict_data_;
  size_t bdict_length_;
  // Points to the end of the file (for length checking convenience).
  const unsigned char* bdict_end_;

  // Absolute offset within |bdict_data_| of the beginning of this node.
  size_t node_offset_;

  // The character index into the word that this node represents.
  int node_depth_;

  // Signals that dictionary corruption was found during node traversal.
  mutable bool is_valid_;
};

NodeReader::NodeReader()
    : bdict_data_(NULL),
      bdict_length_(0),
      bdict_end_(NULL),
      node_offset_(0),
      node_depth_(0),
      is_valid_(false) {
}

NodeReader::NodeReader(const unsigned char* bdict_data, size_t bdict_length,
                       size_t node_offset, int node_depth)
    : bdict_data_(bdict_data),
      bdict_length_(bdict_length),
      bdict_end_(bdict_data + bdict_length),
      node_offset_(node_offset),
      node_depth_(node_depth),
      is_valid_(bdict_data != NULL && node_offset < bdict_length) {
}

int NodeReader::FindWord(const unsigned char* word,
                         int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  // Return 0 if the dictionary is corrupt, as BDictReader::FindWord() does.
  if (!bdict_data_ || node_offset_ > bdict_length_)
    return 0;

  if (is_leaf())
    return CompareLeafNode(word, affix_indices);

  if (is_lookup())
    return FindInLookup(word, affix_indices);
  if (is_list())
    return FindInList(word, affix_indices);
  return 0;  // Corrupt file.
}

NodeReader::FindResult NodeReader::GetChildAt(int index, char* found_char,
                                              NodeReader* result) const {
  if (is_lookup()) {
    if (lookup_has_0th()) {
      if (index == 0) {
        *found_char = 0;
        return ReaderForLookup0th(result);
      }
      index--;  // Make index relative to the non-0th-element table.
    }
    return ReaderForLookupAt(index, found_char, result);
  }
  if (is_list()) {
    return ReaderForListAt(index, found_char, result);
  }
  return FIND_DONE;
}

int NodeReader::CompareLeafNode(
    const unsigned char* word,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  // See if there is an additional string.
  const unsigned char* additional = additional_string_for_leaf();
  if (!additional) {
    // No additional string. This means we should have reached the end of the
    // word to get a match.
    if (word[node_depth_] != 0)
      return 0;
    return FillAffixesForLeafMatch(0, affix_indices);
  }

  // Check the additional string.
  int cur = 0;
  while (&additional[cur] < bdict_end_ && additional[cur]) {
    if (word[node_depth_ + cur] != additional[cur])
      return 0;  // Not a match.
    cur++;
  }

  if (&additional[cur] == bdict_end_) {
    is_valid_ = false;
    return 0;
  }

  // Got to the end of the additional string, the word should also be over for
  // a match (the same as above).
  if (word[node_depth_ + cur] != 0)
    return 0;
  return FillAffixesForLeafMatch(cur + 1, affix_indices);
}

int NodeReader::FillAffixesForLeafMatch(
    size_t additional_bytes,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  // The first match is easy, it always comes from the affix_id included in the
  // leaf node.
  affix_indices[0] = affix_id_for_leaf();

  if (!leaf_has_following() && affix_indices[0] != BDict::FIRST_AFFIX_IS_UNUSED)
    return 1;  // Common case: no additional affix group IDs.

  // We may or may not need to ignore that first value we just read, since it
  // could be a dummy placeholder value. The |list_offset| is the starting
  // position in the output list to write the rest of the values, which may
  // overwrite the first value.
  int list_offset = 1;
  if (affix_indices[0] == BDict::FIRST_AFFIX_IS_UNUSED)
    list_offset = 0;

  // Compute the end of the following uint16 array, rounding down so we never
  // read a partial (odd) trailing byte.
  size_t array_start = node_offset_ + additional_bytes + 2;
  const uint16* const bdict_short_end = reinterpret_cast<const uint16*>(
      &bdict_data_[((bdict_length_ - array_start) & -2) + array_start]);
  // Process all remaining matches.
  const uint16* following_array = reinterpret_cast<const uint16*>(
      &bdict_data_[array_start]);
  for (int i = 0; i < BDict::MAX_AFFIXES_PER_WORD - list_offset; i++) {
    if (&following_array[i] >= bdict_short_end) {
      is_valid_ = false;
      return 0;
    }
    if (following_array[i] == BDict::LEAF_NODE_FOLLOWING_LIST_TERMINATOR)
      return i + list_offset;  // Found the end of the list.
    affix_indices[i + list_offset] = following_array[i];
  }
  return BDict::MAX_AFFIXES_PER_WORD;
}

int NodeReader::FindInLookup(
    const unsigned char* word,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  unsigned char next_char = word[node_depth_];

  NodeReader child_reader;
  if (next_char == 0 && lookup_has_0th()) {
    if (ReaderForLookup0th(&child_reader) != FIND_NODE)
      return 0;
  } else {
    // Look up in the regular part of the table.
    int offset_in_table = static_cast<int>(next_char) - lookup_first_char();
    if (offset_in_table < 0 || offset_in_table > lookup_num_chars())
      return 0;  // Table can not include this value.

    char dummy_char;
    if (ReaderForLookupAt(offset_in_table, &dummy_char, &child_reader) !=
        FIND_NODE)
      return 0;
    DCHECK(dummy_char == static_cast<char>(next_char));
  }

  if (!child_reader.is_valid())
    return 0;  // Something is messed up.

  // Now recurse into that child node.
  return child_reader.FindWord(word, affix_indices);
}

NodeReader::FindResult NodeReader::ReaderForLookup0th(
    NodeReader* result) const {
  size_t child_offset;
  if (is_lookup_32()) {
    child_offset = *reinterpret_cast<const unsigned int*>(
        &bdict_data_[zeroth_entry_offset()]);
  } else {
    child_offset = *reinterpret_cast<const unsigned short*>(
        &bdict_data_[zeroth_entry_offset()]);
    child_offset += node_offset_;
  }

  // Range check the offset.
  if (child_offset >= bdict_length_) {
    is_valid_ = false;
    return FIND_DONE;
  }

  // Now recurse into that child node. We don't advance to the next character
  // here since the 0th element will be a leaf (see ReaderForLookupAt).
  *result = NodeReader(bdict_data_, bdict_length_, child_offset, node_depth_);
  return FIND_NODE;
}

NodeReader::FindResult NodeReader::ReaderForLookupAt(
    size_t index,
    char* found_char,
    NodeReader* result) const {
  const unsigned char* table_begin = &bdict_data_[lookup_table_offset()];

  if (index >= static_cast<size_t>(lookup_num_chars()) || !is_valid_)
    return FIND_DONE;

  size_t child_offset;
  if (is_lookup_32()) {
    // Table contains 32-bit absolute offsets.
    child_offset =
        reinterpret_cast<const unsigned int*>(table_begin)[index];
    if (!child_offset)
      return FIND_NOTHING;  // This entry in the table is empty.
  } else {
    // Table contains 16-bit offsets relative to the current node.
    child_offset =
        reinterpret_cast<const unsigned short*>(table_begin)[index];
    if (!child_offset)
      return FIND_NOTHING;  // This entry in the table is empty.
    child_offset += node_offset_;
  }

  // Range check the offset.
  if (child_offset >= bdict_length_) {
    is_valid_ = false;
    return FIND_DONE;  // Error.
  }

  // This is a bit tricky. When we've just reached the end of a word, the word
  // itself will be stored in a leaf "node" off of this node. That node, of
  // course, will want to know that it's the end of the word and so we have to
  // have it use the same index into the word as we're using at this level.
  //
  // This happens when there is a word in the dictionary that is a strict
  // prefix of other words in the dictionary, and so we'll have a non-leaf
  // node representing the entire word before the ending leaf node.
  //
  // In all other cases, we want to advance to the next character. Even if the
  // child node is a leaf, it will have an additional character that it will
  // want to check.
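  //
  // For example, if the dictionary contains both "cat" and "cats", the node
  // reached after matching "cat" is a non-leaf node; its child for the
  // terminating 0 character (the magic 0th entry, or a table entry whose
  // character is 0) is the leaf that ends "cat" and must reuse the current
  // word index, while the entry for 's' advances toward "cats".
  // (Illustrative example only; the exact node kinds depend on how the
  // dictionary was serialized.)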
  *found_char = static_cast<char>(index + lookup_first_char());
  if (!is_valid_)
    return FIND_DONE;
  int char_advance = *found_char == 0 ? 0 : 1;

  *result = NodeReader(bdict_data_, bdict_length_,
                       child_offset, node_depth_ + char_advance);
  return FIND_NODE;
}

int NodeReader::FindInList(
    const unsigned char* word,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  unsigned char next_char = word[node_depth_];

  // TODO(brettw) replace with binary search.
  size_t list_count = list_item_count();
  const unsigned char* list_begin = &bdict_data_[node_offset_ + 1];

  int bytes_per_index = (is_list_16() ? 3 : 2);

  for (size_t i = 0; i < list_count; i++) {
    const unsigned char* list_current = &list_begin[i * bytes_per_index];
    if (list_current >= bdict_end_) {
      is_valid_ = false;
      return 0;
    }
    if (*list_current == next_char) {
      // Found a match.
      char dummy_char;
      NodeReader child_reader;
      if (ReaderForListAt(i, &dummy_char, &child_reader) != FIND_NODE)
        return 0;
      DCHECK(dummy_char == static_cast<char>(next_char));
      return child_reader.FindWord(word, affix_indices);
    }
  }
  return 0;
}

NodeReader::FindResult NodeReader::ReaderForListAt(
    size_t index,
    char* found_char,
    NodeReader* result) const {
  size_t list_begin = node_offset_ + 1;

  if (index >= list_item_count())
    return FIND_DONE;

  size_t offset;
  if (is_list_16()) {
    const unsigned char* list_item_begin = bdict_data_ + list_begin + index * 3;
    *found_char = static_cast<char>(list_item_begin[0]);

    // The children begin right after the list.
    size_t children_begin = list_begin + list_item_count() * 3;
    offset = children_begin + *reinterpret_cast<const unsigned short*>(
        &list_item_begin[1]);
  } else {
    const unsigned char* list_item_begin = bdict_data_ + list_begin + index * 2;
    *found_char = list_item_begin[0];

    size_t children_begin = list_begin + list_item_count() * 2;
    offset = children_begin + list_item_begin[1];
  }

  // Range check the computed child offset.
  if (offset == 0 || offset >= bdict_length_) {
    is_valid_ = false;
    return FIND_DONE;  // Error, should not happen except for corruption.
  }

  int char_advance = *found_char == 0 ? 0 : 1;  // See ReaderForLookupAt.
  *result = NodeReader(bdict_data_, bdict_length_,
                       offset, node_depth_ + char_advance);
  return FIND_NODE;
}

// WordIterator ----------------------------------------------------------------

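// Illustrative usage (hypothetical caller, not part of this file), given an
// initialized BDictReader |reader|: iterate every word in the dictionary.
//
//   WordIterator iter = reader.GetAllWordIterator();
//   char word[128];
//   int affix_ids[BDict::MAX_AFFIXES_PER_WORD];
//   while (iter.Advance(word, sizeof(word), affix_ids)) {
//     // |word| now holds the next word; |affix_ids| its affix group IDs.
//   }
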
struct WordIterator::NodeInfo {
  // The current offset is set to -1 so we start iterating at 0 when Advance
  // is called.
  NodeInfo(const NodeReader& rdr, char add)
      : reader(rdr),
        addition(add),
        cur_offset(-1) {
  }

  // The reader for this node level.
  NodeReader reader;

  // The character that this level represents. For the 0th level, this will
  // be 0 (since it is the root that represents no characters).
  char addition;

  // The current index into the reader that we're reading. Combined with the
  // |stack_|, this allows us to iterate over the tree in depth-first order.
  int cur_offset;
};

WordIterator::WordIterator(const NodeReader& reader) {
  NodeInfo info(reader, 0);
  stack_.push_back(info);
}

WordIterator::WordIterator(const WordIterator& other) {
  operator=(other);
}

WordIterator::~WordIterator() {
  // Can't be defined in the header because NodeReader (held by our NodeInfo
  // stack entries) is only defined in this file.
}

WordIterator& WordIterator::operator=(const WordIterator& other) {
  stack_ = other.stack_;
  return *this;
}

int WordIterator::Advance(char* output_buffer, size_t output_len,
                          int affix_ids[BDict::MAX_AFFIXES_PER_WORD]) {
  // In-order tree walker. This uses a loop for fake tail recursion.
  while (!stack_.empty()) {
    NodeInfo& cur = stack_.back();
    cur.cur_offset++;
    char cur_char;
    NodeReader child_reader;

    /*if (cur.reader.is_leaf()) {
      child_reader = cur.reader;
      cur_char = cur.addition;
      stack_.pop_back();
      return FoundLeaf(child_reader, cur_char, output_buffer, output_len,
                       affix_ids);
    }*/

    switch (cur.reader.GetChildAt(cur.cur_offset, &cur_char, &child_reader)) {
      case NodeReader::FIND_NODE:
        // Got a valid child node.
        if (child_reader.is_leaf()) {
          return FoundLeaf(child_reader, cur_char, output_buffer, output_len,
                           affix_ids);
        }

        // Not a leaf. Add the new node to our stack and try again.
        stack_.push_back(NodeInfo(child_reader, cur_char));
        break;

      case NodeReader::FIND_NOTHING:
        // This one is empty, but we're not done. Continue on.
        break;

      case NodeReader::FIND_DONE:
        // No more children at this level, pop the stack and go back one.
        stack_.pop_back();
    }
  }

  return 0;  // The stack is empty: no more words.
}

int WordIterator::FoundLeaf(const NodeReader& reader, char cur_char,
                            char* output_buffer, size_t output_len,
                            int affix_ids[BDict::MAX_AFFIXES_PER_WORD]) {
  // Remember that the first item in the stack is the root and so doesn't count.
  int i;
  for (i = 0; i < static_cast<int>(stack_.size()) - 1 &&
              i < static_cast<int>(output_len) - 1; i++)
    output_buffer[i] = stack_[i + 1].addition;
  output_buffer[i++] = cur_char;  // The one we just found.

  // Possibly add any extra parts.
  size_t additional_string_length = 0;
  const char* additional = reinterpret_cast<const char*>(
      reader.additional_string_for_leaf());
  for (; i < static_cast<int>(output_len) - 1 && additional &&
         additional[additional_string_length] != 0;
       i++, additional_string_length++)
    output_buffer[i] = additional[additional_string_length];
  if (additional_string_length)
    additional_string_length++;  // Account for the null terminator.
  output_buffer[i] = 0;

  return reader.FillAffixesForLeafMatch(additional_string_length,
                                        affix_ids);
}

// LineIterator ----------------------------------------------------------------

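// Illustrative usage (hypothetical caller, not part of this file), given an
// initialized BDictReader |reader|: walk the NULL-separated lines of one
// section of the affix data.
//
//   LineIterator line_iter = reader.GetAffixLineIterator();
//   while (!line_iter.IsDone()) {
//     const char* line = line_iter.Advance();  // NULL-terminated line.
//     // ... parse |line| ...
//   }
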
LineIterator::LineIterator(
    const unsigned char* bdict_data,
    size_t bdict_length,
    size_t first_offset)
    : bdict_data_(bdict_data),
      bdict_length_(bdict_length),
      cur_offset_(first_offset) {
}

// Returns true when all data has been read. We're done when we reach a
// double-NULL or the end of the input (which shouldn't happen).
bool LineIterator::IsDone() const {
  return cur_offset_ >= bdict_length_ || bdict_data_[cur_offset_] == 0;
}

const char* LineIterator::Advance() {
  if (IsDone())
    return NULL;

  const char* begin = reinterpret_cast<const char*>(&bdict_data_[cur_offset_]);

  // Advance over this word to find the end.
  while (cur_offset_ < bdict_length_ && bdict_data_[cur_offset_])
    cur_offset_++;
  cur_offset_++;  // Advance over the NULL terminator.

  return begin;
}

bool LineIterator::AdvanceAndCopy(char* buf, size_t buf_len) {
  if (IsDone())
    return false;

  // Copy this word into the output buffer, advancing over it to find its end.
  size_t i;
  for (i = 0;
       i < buf_len && cur_offset_ < bdict_length_ && bdict_data_[cur_offset_];
       i++, cur_offset_++) {
    buf[i] = bdict_data_[cur_offset_];
  }
  // Handle the NULL terminator.
  cur_offset_++;  // Consume it in the input.
  if (i < buf_len)
    buf[i] = 0;  // Save it in the output.
  else
    buf[buf_len - 1] = 0;  // Overflow, make sure it's terminated.

  return !!buf[0];
}

// ReplacementIterator ---------------------------------------------------------

// Fills pointers to NULL terminated strings into the given output params.
// Returns false if there are no more pairs and nothing was filled in.
bool ReplacementIterator::GetNext(const char** first, const char** second) {
  if (IsDone())
    return false;
  *first = Advance();
  *second = Advance();
  return *first && *second;
}

// BDictReader -----------------------------------------------------------------

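// Illustrative usage (hypothetical caller, not part of this file); |data| and
// |length| would typically describe a memory-mapped BDIC file.
//
//   BDictReader reader;
//   if (reader.Init(data, length)) {
//     int affix_ids[BDict::MAX_AFFIXES_PER_WORD];
//     if (reader.FindWord("hello", affix_ids) > 0) {
//       // The word was found; |affix_ids| holds its affix group IDs.
//     }
//
//     const char* bad;
//     const char* good;
//     ReplacementIterator rep = reader.GetReplacementIterator();
//     while (rep.GetNext(&bad, &good)) {
//       // |bad| -> |good| is one replacement pair from the affix data.
//     }
//   }
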
BDictReader::BDictReader()
    : bdict_data_(NULL),
      bdict_length_(0),
      header_(NULL) {
}

bool BDictReader::Init(const unsigned char* bdict_data, size_t bdict_length) {
  if (bdict_length < sizeof(BDict::Header))
    return false;

  // Check header.
  header_ = reinterpret_cast<const BDict::Header*>(bdict_data);
  if (header_->signature != BDict::SIGNATURE ||
      header_->major_version > BDict::MAJOR_VERSION ||
      header_->dic_offset > bdict_length)
    return false;

  // Get the affix header, make sure there is enough room for it.
  if (header_->aff_offset + sizeof(BDict::AffHeader) > bdict_length)
    return false;
  aff_header_ = reinterpret_cast<const BDict::AffHeader*>(
      &bdict_data[header_->aff_offset]);

  // Make sure there is enough room for the affix group count dword.
  if (aff_header_->affix_group_offset > bdict_length - sizeof(uint32))
    return false;

  // This function is called from SpellCheck::SpellCheckWord(), which blocks
  // WebKit. To avoid blocking WebKit for a long time, we do not check the MD5
  // digest here. Instead we check the MD5 digest when Chrome finishes
  // downloading a dictionary.

  // Don't set these until the end. This way, NULL bdict_data_ will indicate
  // failure.
  bdict_data_ = bdict_data;
  bdict_length_ = bdict_length;
  return true;
}

int BDictReader::FindWord(
    const char* word,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  if (!bdict_data_ ||
      header_->dic_offset >= bdict_length_) {
    // When the dictionary is corrupt, we return 0 which means the word is valid
    // and has no rules. This means when there is some problem, we'll default
    // to no spellchecking rather than marking everything as misspelled.
    return 0;
  }
  NodeReader reader(bdict_data_, bdict_length_, header_->dic_offset, 0);
  return reader.FindWord(reinterpret_cast<const unsigned char*>(word),
                         affix_indices);
}

LineIterator BDictReader::GetAfLineIterator() const {
  if (!bdict_data_ ||
      aff_header_->affix_group_offset == 0 ||
      aff_header_->affix_group_offset >= bdict_length_)
    return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
  return LineIterator(bdict_data_, bdict_length_,
                      aff_header_->affix_group_offset);
}

LineIterator BDictReader::GetAffixLineIterator() const {
  if (!bdict_data_ ||
      aff_header_->affix_rule_offset == 0 ||
      aff_header_->affix_rule_offset >= bdict_length_)
    return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
  return LineIterator(bdict_data_, bdict_length_,
                      aff_header_->affix_rule_offset);
}

LineIterator BDictReader::GetOtherLineIterator() const {
  if (!bdict_data_ ||
      aff_header_->other_offset == 0 ||
      aff_header_->other_offset >= bdict_length_)
    return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
  return LineIterator(bdict_data_, bdict_length_,
                      aff_header_->other_offset);
}

ReplacementIterator BDictReader::GetReplacementIterator() const {
  return ReplacementIterator(bdict_data_, bdict_length_,
                             aff_header_->rep_offset);
}

WordIterator BDictReader::GetAllWordIterator() const {
  NodeReader reader(bdict_data_, bdict_length_, header_->dic_offset, 0);
  return WordIterator(reader);
}

}  // namespace hunspell