// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/hunspell_new/google/bdict_reader.h"

#include "base/logging.h"

namespace hunspell {

// Like the "Visitor" design pattern, this lightweight object provides an
// interface around a serialized trie node at the given address in memory.
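//
// A rough sketch of the serialized node layouts, as reconstructed from the
// accessors below (the BDict:: constants are the authoritative definition):
//
//   Leaf:   [ID byte: type/flag bits, plus the top bits of the first affix
//            group ID in its low 6 bits]
//           [second byte: low 8 bits of that affix group ID]
//           [optional NULL-terminated "additional string" completing the word]
//           [optional uint16 array of extra affix group IDs, terminated by
//            LEAF_NODE_FOLLOWING_LIST_TERMINATOR, when the "following" bit
//            is set]
//   Lookup: [ID byte][first char][char count]
//           [optional magic 0th entry for the word that ends at this node]
//           [one entry per char: 16-bit offsets relative to the node, or
//            32-bit absolute offsets]
//   List:   [ID byte: type bits, plus the item count in the low 4 bits]
//           [count (char, child offset) pairs; offsets are relative to the
//            end of the list and are 8 bits, or 16 bits for 16-bit lists]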
class NodeReader {
 public:
  // Return values for GetChildAt.
  enum FindResult {
    // A node is found.
    FIND_NODE,

    // There are no more children for this node, no child node is returned.
    FIND_DONE,

    // There is no node at this location, but there are more if you continue
    // iterating. This happens when there is a lookup node with empty entries.
    FIND_NOTHING
  };

  // The default constructor makes an invalid reader.
  NodeReader();
  NodeReader(const unsigned char* bdict_data, size_t bdict_length,
             size_t node_offset, int node_depth);

  // Returns true if the reader is valid. False means you shouldn't use it.
  bool is_valid() const { return is_valid_; }

  // Recursively finds the given NULL terminated word.
  // See BDictReader::FindWord.
  int FindWord(const unsigned char* word,
               int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;

  // Allows iterating over the children of this node. When it returns
  // FIND_NODE, |*result| will be populated with the reader for the found node.
  // The first index is 0. The single character for this node will be placed
  // into |*found_char|.
  FindResult GetChildAt(int index, char* found_char, NodeReader* result) const;

  // Leaf ----------------------------------------------------------------------

  inline bool is_leaf() const {
    // If id_byte() sets is_valid_ to false, we need an extra check to avoid
    // returning true for this type.
    return (id_byte() & BDict::LEAF_NODE_TYPE_MASK) ==
        BDict::LEAF_NODE_TYPE_VALUE && is_valid_;
  }

  // If this is a leaf node with an additional string, this function will return
  // a pointer to the beginning of the additional string. It will be NULL
  // terminated. If it is not a leaf or has no additional string, it will return
  // NULL.
  inline const unsigned char* additional_string_for_leaf() const {
    // Leaf nodes with additional strings start with bits "01" in the ID byte.
    if ((id_byte() & BDict::LEAF_NODE_ADDITIONAL_MASK) ==
        BDict::LEAF_NODE_ADDITIONAL_VALUE) {
      if (node_offset_ < (bdict_length_ - 2))
        return &bdict_data_[node_offset_ + 2];  // Starts after the 2 byte ID.
      // Otherwise the dictionary is corrupt.
      is_valid_ = false;
    }
    return NULL;
  }

  // Returns the first affix ID corresponding to the given leaf node. The
  // current node must be a leaf or this will do the wrong thing. There may be
  // additional affix IDs following the node when leaf_has_following is set,
  // but this will not handle those.
  inline int affix_id_for_leaf() const {
    if (node_offset_ >= bdict_length_ - 2) {
      is_valid_ = false;
      return 0;
    }
    // Take the lowest 6 bits of the first byte, and all 8 bits of the second.
    return ((bdict_data_[node_offset_ + 0] &
             BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK) << 8) +
        bdict_data_[node_offset_ + 1];
  }

  // Returns true if there is a list of additional affix matches following this
  // leaf node.
  inline bool leaf_has_following() const {
    return ((id_byte() & BDict::LEAF_NODE_FOLLOWING_MASK) ==
            BDict::LEAF_NODE_FOLLOWING_VALUE);
  }

  // Fills the affix indices into the output array given a matching leaf node.
  // |additional_bytes| is the number of bytes of the additional string,
  // including the NULL terminator, following this leaf node. This will be 0 if
  // there is no additional string.
  int FillAffixesForLeafMatch(
      size_t additional_bytes,
      int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;

  // Lookup --------------------------------------------------------------------

  inline bool is_lookup() const {
    return (id_byte() & BDict::LOOKUP_NODE_TYPE_MASK) ==
        BDict::LOOKUP_NODE_TYPE_VALUE;
  }

  inline bool is_lookup_32() const {
    return (id_byte() & BDict::LOOKUP_NODE_32BIT_MASK) ==
        BDict::LOOKUP_NODE_32BIT_VALUE;
  }

  inline bool lookup_has_0th() const {
    return (id_byte() & BDict::LOOKUP_NODE_0TH_MASK) ==
        BDict::LOOKUP_NODE_0TH_VALUE;
  }

  // Returns the offset of the first entry after the lookup table header. When
  // there is a magic 0th entry, this is its offset.
  // The caller checks that the result is in-bounds.
  inline size_t zeroth_entry_offset() const {
    return node_offset_ + 3;
  }

  // Returns the offset of the first element in the lookup table. This skips
  // any magic 0th entry.
  // The caller checks that the result is in-bounds.
  size_t lookup_table_offset() const {
    size_t table_offset = zeroth_entry_offset();
    if (lookup_has_0th())
      return table_offset + (is_lookup_32() ? 4 : 2);
    return table_offset;
  }

  inline int lookup_first_char() const {
    if (node_offset_ >= bdict_length_ - 1) {
      is_valid_ = false;
      return 0;
    }
    return bdict_data_[node_offset_ + 1];
  }

  inline int lookup_num_chars() const {
    if (node_offset_ >= bdict_length_ - 2) {
      is_valid_ = false;
      return 0;
    }
    return bdict_data_[node_offset_ + 2];
  }

  // Computes a node reader for the magic 0th entry of the table. This assumes
  // it has a 0th entry. This will always return FIND_NODE (for compatibility
  // with GetChildAt).
  FindResult ReaderForLookup0th(NodeReader* result) const;

  // Gets a node reader for the |index|th element in the table, not counting
  // the magic 0th element, if any (so passing 0 here will give you the first
  // element in the regular lookup table). The index is assumed to be valid.
  //
  // The character value that the child node represents will be placed into
  // |*found_char|.
  FindResult ReaderForLookupAt(size_t index, char* found_char,
                               NodeReader* result) const;

  // List ----------------------------------------------------------------------

  inline bool is_list() const {
    return (id_byte() & BDict::LIST_NODE_TYPE_MASK) ==
        BDict::LIST_NODE_TYPE_VALUE;
  }

  inline int is_list_16() const {
    // 16-bit list nodes have the high 4 bits set to 1.
    return (id_byte() & BDict::LIST_NODE_16BIT_MASK) ==
        BDict::LIST_NODE_16BIT_VALUE;
  }

  inline size_t list_item_count() const {
    // The list count is stored in the low 4 bits of the ID.
    return id_byte() & BDict::LIST_NODE_COUNT_MASK;
  }

  // Returns a NodeReader for the list item with the given index. The single
  // character for this node will be placed into |*found_char|.
  FindResult ReaderForListAt(size_t index, char* found_char,
                             NodeReader* result) const;

 private:
  inline unsigned char id_byte() const {
    if (!is_valid_)
      return 0;  // Don't continue with a corrupt node.
    if (node_offset_ >= bdict_length_) {
      // Return zero if out of bounds; we'll check is_valid_ in the caller.
      is_valid_ = false;
      return 0;
    }
    return bdict_data_[node_offset_];
  }

  // Checks the given leaf node to see if it's a match for the given word.
  // The parameters and return values are the same as BDictReader::FindWord.
  int CompareLeafNode(const unsigned char* word,
                      int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;

  // Recursive calls used by FindWord to look up child nodes of different types.
  int FindInLookup(const unsigned char* word,
                   int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
  int FindInList(const unsigned char* word,
                 int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;

  // The entire bdict file. This will be NULL if it is invalid.
  const unsigned char* bdict_data_;
  size_t bdict_length_;
  // Points to the end of the file (for length checking convenience).
  const unsigned char* bdict_end_;

  // Absolute offset within |bdict_data_| of the beginning of this node.
  size_t node_offset_;

  // The character index into the word that this node represents.
  int node_depth_;

  // Signals that dictionary corruption was found during node traversal.
  mutable bool is_valid_;
};

NodeReader::NodeReader()
    : bdict_data_(NULL),
      bdict_length_(0),
      bdict_end_(NULL),
      node_offset_(0),
      node_depth_(0),
      is_valid_(false) {
}

NodeReader::NodeReader(const unsigned char* bdict_data, size_t bdict_length,
                       size_t node_offset, int node_depth)
    : bdict_data_(bdict_data),
      bdict_length_(bdict_length),
      bdict_end_(bdict_data + bdict_length),
      node_offset_(node_offset),
      node_depth_(node_depth),
      is_valid_(bdict_data != NULL && node_offset < bdict_length) {
}

int NodeReader::FindWord(const unsigned char* word,
                         int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  // Return 0 if the dictionary is corrupt, as BDictReader::FindWord() does.
  if (!bdict_data_ || node_offset_ > bdict_length_)
    return 0;

  if (is_leaf())
    return CompareLeafNode(word, affix_indices);

  if (is_lookup())
    return FindInLookup(word, affix_indices);
  if (is_list())
    return FindInList(word, affix_indices);
  return 0;  // Corrupt file.
}

NodeReader::FindResult NodeReader::GetChildAt(int index, char* found_char,
                                              NodeReader* result) const {
  if (is_lookup()) {
    if (lookup_has_0th()) {
      if (index == 0) {
        *found_char = 0;
        return ReaderForLookup0th(result);
      }
      index--;  // Make index relative to the non-0th-element table.
    }
    return ReaderForLookupAt(index, found_char, result);
  }
  if (is_list()) {
    return ReaderForListAt(index, found_char, result);
  }
  return FIND_DONE;
}

int NodeReader::CompareLeafNode(
    const unsigned char* word,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  // See if there is an additional string.
  const unsigned char* additional = additional_string_for_leaf();
  if (!additional) {
    // No additional string. This means we should have reached the end of the
    // word to get a match.
    if (word[node_depth_] != 0)
      return 0;
    return FillAffixesForLeafMatch(0, affix_indices);
  }

  // Check the additional string.
  int cur = 0;
  while (&additional[cur] < bdict_end_ && additional[cur]) {
    if (word[node_depth_ + cur] != additional[cur])
      return 0;  // Not a match.
    cur++;
  }

  if (&additional[cur] == bdict_end_) {
    is_valid_ = false;
    return 0;
  }

  // Got to the end of the additional string, the word should also be over for
  // a match (the same as above).
  if (word[node_depth_ + cur] != 0)
    return 0;
  return FillAffixesForLeafMatch(cur + 1, affix_indices);
}

int NodeReader::FillAffixesForLeafMatch(
    size_t additional_bytes,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  // The first match is easy, it always comes from the affix_id included in the
  // leaf node.
  affix_indices[0] = affix_id_for_leaf();

  if (!leaf_has_following() && affix_indices[0] != BDict::FIRST_AFFIX_IS_UNUSED)
    return 1;  // Common case: no additional affix group IDs.

  // We may or may not need to ignore that first value we just read, since it
  // could be a dummy placeholder value. The |list_offset| is the starting
  // position in the output list to write the rest of the values, which may
  // overwrite the first value.
  int list_offset = 1;
  if (affix_indices[0] == BDict::FIRST_AFFIX_IS_UNUSED)
    list_offset = 0;

  // Compute the end of the following uint16 array, rounding down so we never
  // read a partial (odd) trailing byte.
  size_t array_start = node_offset_ + additional_bytes + 2;
  const uint16* const bdict_short_end = reinterpret_cast<const uint16*>(
      &bdict_data_[((bdict_length_ - array_start) & -2) + array_start]);
  // Process all remaining matches.
  const uint16* following_array = reinterpret_cast<const uint16*>(
      &bdict_data_[array_start]);
  for (int i = 0; i < BDict::MAX_AFFIXES_PER_WORD - list_offset; i++) {
    if (&following_array[i] >= bdict_short_end) {
      is_valid_ = false;
      return 0;
    }
    if (following_array[i] == BDict::LEAF_NODE_FOLLOWING_LIST_TERMINATOR)
      return i + list_offset;  // Found the end of the list.
    affix_indices[i + list_offset] = following_array[i];
  }
  return BDict::MAX_AFFIXES_PER_WORD;
}

int NodeReader::FindInLookup(
    const unsigned char* word,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  unsigned char next_char = word[node_depth_];

  NodeReader child_reader;
  if (next_char == 0 && lookup_has_0th()) {
    if (ReaderForLookup0th(&child_reader) != FIND_NODE)
      return 0;
  } else {
    // Look up in the regular part of the table.
    int offset_in_table = static_cast<int>(next_char) - lookup_first_char();
    if (offset_in_table < 0 || offset_in_table > lookup_num_chars())
      return 0;  // Table can not include this value.

    char dummy_char;
    if (ReaderForLookupAt(offset_in_table, &dummy_char, &child_reader) !=
        FIND_NODE)
      return 0;
    DCHECK(dummy_char == static_cast<char>(next_char));
  }

  if (!child_reader.is_valid())
    return 0;  // Something is messed up.

  // Now recurse into that child node.
  return child_reader.FindWord(word, affix_indices);
}

NodeReader::FindResult NodeReader::ReaderForLookup0th(
    NodeReader* result) const {
  size_t child_offset;
  if (is_lookup_32()) {
    child_offset = *reinterpret_cast<const unsigned int*>(
        &bdict_data_[zeroth_entry_offset()]);
  } else {
    child_offset = *reinterpret_cast<const unsigned short*>(
        &bdict_data_[zeroth_entry_offset()]);
    child_offset += node_offset_;
  }

  // Range check the offset.
  if (child_offset >= bdict_length_) {
    is_valid_ = false;
    return FIND_DONE;
  }

  // Now recurse into that child node. We don't advance to the next character
  // here since the 0th element will be a leaf (see ReaderForLookupAt).
  *result = NodeReader(bdict_data_, bdict_length_, child_offset, node_depth_);
  return FIND_NODE;
}

NodeReader::FindResult NodeReader::ReaderForLookupAt(
    size_t index,
    char* found_char,
    NodeReader* result) const {
  const unsigned char* table_begin = &bdict_data_[lookup_table_offset()];

  if (index >= static_cast<size_t>(lookup_num_chars()) || !is_valid_)
    return FIND_DONE;

  size_t child_offset;
  if (is_lookup_32()) {
    // Table contains 32-bit absolute offsets.
    child_offset =
        reinterpret_cast<const unsigned int*>(table_begin)[index];
    if (!child_offset)
      return FIND_NOTHING;  // This entry in the table is empty.
  } else {
    // Table contains 16-bit offsets relative to the current node.
    child_offset =
        reinterpret_cast<const unsigned short*>(table_begin)[index];
    if (!child_offset)
      return FIND_NOTHING;  // This entry in the table is empty.
    child_offset += node_offset_;
  }

  // Range check the offset.
  if (child_offset >= bdict_length_) {
    is_valid_ = false;
    return FIND_DONE;  // Error.
  }

  // This is a bit tricky. When we've just reached the end of a word, the word
  // itself will be stored in a leaf "node" off of this node. That node, of
  // course, will want to know that it's the end of the word and so we have to
  // have it use the same index into the word as we're using at this level.
  //
  // This happens when there is a word in the dictionary that is a strict
  // prefix of other words in the dictionary, and so we'll have a non-leaf
  // node representing the entire word before the ending leaf node.
  //
  // In all other cases, we want to advance to the next character. Even if the
  // child node is a leaf, it will have an additional character that it will
  // want to check.
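  //
  // For example, if the dictionary contains both "cat" and "cats", the node
  // reached after matching "cat" is a non-leaf node; its child for the
  // terminating 0 character (the magic 0th entry, or a table entry whose
  // character is 0) is the leaf that ends "cat" and must reuse the current
  // word index, while the entry for 's' advances toward "cats".
  // (Illustrative example only; the exact node kinds depend on how the
  // dictionary was serialized.)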
  *found_char = static_cast<char>(index + lookup_first_char());
  if (!is_valid_)
    return FIND_DONE;
  int char_advance = *found_char == 0 ? 0 : 1;

  *result = NodeReader(bdict_data_, bdict_length_,
                       child_offset, node_depth_ + char_advance);
  return FIND_NODE;
}

int NodeReader::FindInList(
    const unsigned char* word,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  unsigned char next_char = word[node_depth_];

  // TODO(brettw) replace with binary search.
  size_t list_count = list_item_count();
  const unsigned char* list_begin = &bdict_data_[node_offset_ + 1];

  int bytes_per_index = (is_list_16() ? 3 : 2);

  for (size_t i = 0; i < list_count; i++) {
    const unsigned char* list_current = &list_begin[i * bytes_per_index];
    if (list_current >= bdict_end_) {
      is_valid_ = false;
      return 0;
    }
    if (*list_current == next_char) {
      // Found a match.
      char dummy_char;
      NodeReader child_reader;
      if (ReaderForListAt(i, &dummy_char, &child_reader) != FIND_NODE)
        return 0;
      DCHECK(dummy_char == static_cast<char>(next_char));
      return child_reader.FindWord(word, affix_indices);
    }
  }
  return 0;
}

NodeReader::FindResult NodeReader::ReaderForListAt(
    size_t index,
    char* found_char,
    NodeReader* result) const {
  size_t list_begin = node_offset_ + 1;

  if (index >= list_item_count())
    return FIND_DONE;

  size_t offset;
  if (is_list_16()) {
    const unsigned char* list_item_begin = bdict_data_ + list_begin + index * 3;
    *found_char = static_cast<char>(list_item_begin[0]);

    // The children begin right after the list.
    size_t children_begin = list_begin + list_item_count() * 3;
    offset = children_begin + *reinterpret_cast<const unsigned short*>(
        &list_item_begin[1]);
  } else {
    const unsigned char* list_item_begin = bdict_data_ + list_begin + index * 2;
    *found_char = list_item_begin[0];

    size_t children_begin = list_begin + list_item_count() * 2;
    offset = children_begin + list_item_begin[1];
  }

  // Range check the computed child offset.
  if (offset == 0 || offset >= bdict_length_) {
    is_valid_ = false;
    return FIND_DONE;  // Error, should not happen except for corruption.
  }

  int char_advance = *found_char == 0 ? 0 : 1;  // See ReaderForLookupAt.
  *result = NodeReader(bdict_data_, bdict_length_,
                       offset, node_depth_ + char_advance);
  return FIND_NODE;
}

// WordIterator ----------------------------------------------------------------

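// Illustrative usage (hypothetical caller, not part of this file), given an
// initialized BDictReader |reader|: iterate every word in the dictionary.
//
//   WordIterator iter = reader.GetAllWordIterator();
//   char word[128];
//   int affix_ids[BDict::MAX_AFFIXES_PER_WORD];
//   while (iter.Advance(word, sizeof(word), affix_ids)) {
//     // |word| now holds the next word; |affix_ids| its affix group IDs.
//   }
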
struct WordIterator::NodeInfo {
  // The current offset is set to -1 so we start iterating at 0 when Advance
  // is called.
  NodeInfo(const NodeReader& rdr, char add)
      : reader(rdr),
        addition(add),
        cur_offset(-1) {
  }

  // The reader for this node level.
  NodeReader reader;

  // The character that this level represents. For the 0th level, this will
  // be 0 (since it is the root that represents no characters).
  char addition;

  // The current index into the reader that we're reading. Combined with the
  // |stack_|, this allows us to iterate over the tree in depth-first order.
  int cur_offset;
};

WordIterator::WordIterator(const NodeReader& reader) {
  NodeInfo info(reader, 0);
  stack_.push_back(info);
}

WordIterator::WordIterator(const WordIterator& other) {
  operator=(other);
}

WordIterator::~WordIterator() {
  // Can't be defined in the header because NodeReader (held by our NodeInfo
  // stack entries) is only defined in this file.
}

WordIterator& WordIterator::operator=(const WordIterator& other) {
  stack_ = other.stack_;
  return *this;
}

int WordIterator::Advance(char* output_buffer, size_t output_len,
                          int affix_ids[BDict::MAX_AFFIXES_PER_WORD]) {
  // In-order tree walker. This uses a loop for fake tail recursion.
  while (!stack_.empty()) {
    NodeInfo& cur = stack_.back();
    cur.cur_offset++;
    char cur_char;
    NodeReader child_reader;

    /*if (cur.reader.is_leaf()) {
      child_reader = cur.reader;
      cur_char = cur.addition;
      stack_.pop_back();
      return FoundLeaf(child_reader, cur_char, output_buffer, output_len,
                       affix_ids);
    }*/

    switch (cur.reader.GetChildAt(cur.cur_offset, &cur_char, &child_reader)) {
      case NodeReader::FIND_NODE:
        // Got a valid child node.
        if (child_reader.is_leaf()) {
          return FoundLeaf(child_reader, cur_char, output_buffer, output_len,
                           affix_ids);
        }

        // Not a leaf. Add the new node to our stack and try again.
        stack_.push_back(NodeInfo(child_reader, cur_char));
        break;

      case NodeReader::FIND_NOTHING:
        // This one is empty, but we're not done. Continue on.
        break;

      case NodeReader::FIND_DONE:
        // No more children at this level, pop the stack and go back one.
        stack_.pop_back();
    }
  }

  return 0;  // The stack is empty: no more words.
}

int WordIterator::FoundLeaf(const NodeReader& reader, char cur_char,
                            char* output_buffer, size_t output_len,
                            int affix_ids[BDict::MAX_AFFIXES_PER_WORD]) {
  // Remember that the first item in the stack is the root and so doesn't count.
  int i;
  for (i = 0; i < static_cast<int>(stack_.size()) - 1 &&
              i < static_cast<int>(output_len) - 1; i++)
    output_buffer[i] = stack_[i + 1].addition;
  output_buffer[i++] = cur_char;  // The one we just found.

  // Possibly add any extra parts.
  size_t additional_string_length = 0;
  const char* additional = reinterpret_cast<const char*>(
      reader.additional_string_for_leaf());
  for (; i < static_cast<int>(output_len) - 1 && additional &&
         additional[additional_string_length] != 0;
       i++, additional_string_length++)
    output_buffer[i] = additional[additional_string_length];
  if (additional_string_length)
    additional_string_length++;  // Account for the null terminator.
  output_buffer[i] = 0;

  return reader.FillAffixesForLeafMatch(additional_string_length,
                                        affix_ids);
}

// LineIterator ----------------------------------------------------------------

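// Illustrative usage (hypothetical caller, not part of this file), given an
// initialized BDictReader |reader|: walk the NULL-separated lines of one
// section of the affix data.
//
//   LineIterator line_iter = reader.GetAffixLineIterator();
//   while (!line_iter.IsDone()) {
//     const char* line = line_iter.Advance();  // NULL-terminated line.
//     // ... parse |line| ...
//   }
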
LineIterator::LineIterator(
    const unsigned char* bdict_data,
    size_t bdict_length,
    size_t first_offset)
    : bdict_data_(bdict_data),
      bdict_length_(bdict_length),
      cur_offset_(first_offset) {
}

// Returns true when all data has been read. We're done when we reach a
// double-NULL or the end of the input (which shouldn't happen).
bool LineIterator::IsDone() const {
  return cur_offset_ >= bdict_length_ || bdict_data_[cur_offset_] == 0;
}

const char* LineIterator::Advance() {
  if (IsDone())
    return NULL;

  const char* begin = reinterpret_cast<const char*>(&bdict_data_[cur_offset_]);

  // Advance over this word to find the end.
  while (cur_offset_ < bdict_length_ && bdict_data_[cur_offset_])
    cur_offset_++;
  cur_offset_++;  // Advance over the NULL terminator.

  return begin;
}

bool LineIterator::AdvanceAndCopy(char* buf, size_t buf_len) {
  if (IsDone())
    return false;

  // Copy this word into the output buffer, advancing over it to find its end.
  size_t i;
  for (i = 0;
       i < buf_len && cur_offset_ < bdict_length_ && bdict_data_[cur_offset_];
       i++, cur_offset_++) {
    buf[i] = bdict_data_[cur_offset_];
  }
  // Handle the NULL terminator.
  cur_offset_++;  // Consume it in the input.
  if (i < buf_len)
    buf[i] = 0;  // Save it in the output.
  else
    buf[buf_len - 1] = 0;  // Overflow, make sure it's terminated.

  return !!buf[0];
}

// ReplacementIterator ---------------------------------------------------------

// Fills pointers to NULL terminated strings into the given output params.
// Returns false if there are no more pairs and nothing was filled in.
bool ReplacementIterator::GetNext(const char** first, const char** second) {
  if (IsDone())
    return false;
  *first = Advance();
  *second = Advance();
  return *first && *second;
}

// BDictReader -----------------------------------------------------------------

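// Illustrative usage (hypothetical caller, not part of this file); |data| and
// |length| would typically describe a memory-mapped BDIC file.
//
//   BDictReader reader;
//   if (reader.Init(data, length)) {
//     int affix_ids[BDict::MAX_AFFIXES_PER_WORD];
//     if (reader.FindWord("hello", affix_ids) > 0) {
//       // The word was found; |affix_ids| holds its affix group IDs.
//     }
//
//     const char* bad;
//     const char* good;
//     ReplacementIterator rep = reader.GetReplacementIterator();
//     while (rep.GetNext(&bad, &good)) {
//       // |bad| -> |good| is one replacement pair from the affix data.
//     }
//   }
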
BDictReader::BDictReader()
    : bdict_data_(NULL),
      bdict_length_(0),
      header_(NULL) {
}

bool BDictReader::Init(const unsigned char* bdict_data, size_t bdict_length) {
  if (bdict_length < sizeof(BDict::Header))
    return false;

  // Check header.
  header_ = reinterpret_cast<const BDict::Header*>(bdict_data);
  if (header_->signature != BDict::SIGNATURE ||
      header_->major_version > BDict::MAJOR_VERSION ||
      header_->dic_offset > bdict_length)
    return false;

  // Get the affix header, make sure there is enough room for it.
  if (header_->aff_offset + sizeof(BDict::AffHeader) > bdict_length)
    return false;
  aff_header_ = reinterpret_cast<const BDict::AffHeader*>(
      &bdict_data[header_->aff_offset]);

  // Make sure there is enough room for the affix group count dword.
  if (aff_header_->affix_group_offset > bdict_length - sizeof(uint32))
    return false;

  // This function is called from SpellCheck::SpellCheckWord(), which blocks
  // WebKit. To avoid blocking WebKit for a long time, we do not check the MD5
  // digest here. Instead we check the MD5 digest when Chrome finishes
  // downloading a dictionary.

  // Don't set these until the end. This way, NULL bdict_data_ will indicate
  // failure.
  bdict_data_ = bdict_data;
  bdict_length_ = bdict_length;
  return true;
}

int BDictReader::FindWord(
    const char* word,
    int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const {
  if (!bdict_data_ ||
      header_->dic_offset >= bdict_length_) {
    // When the dictionary is corrupt, we return 0 which means the word is valid
    // and has no rules. This means when there is some problem, we'll default
    // to no spellchecking rather than marking everything as misspelled.
    return 0;
  }
  NodeReader reader(bdict_data_, bdict_length_, header_->dic_offset, 0);
  return reader.FindWord(reinterpret_cast<const unsigned char*>(word),
                         affix_indices);
}

LineIterator BDictReader::GetAfLineIterator() const {
  if (!bdict_data_ ||
      aff_header_->affix_group_offset == 0 ||
      aff_header_->affix_group_offset >= bdict_length_)
    return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
  return LineIterator(bdict_data_, bdict_length_,
                      aff_header_->affix_group_offset);
}

LineIterator BDictReader::GetAffixLineIterator() const {
  if (!bdict_data_ ||
      aff_header_->affix_rule_offset == 0 ||
      aff_header_->affix_rule_offset >= bdict_length_)
    return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
  return LineIterator(bdict_data_, bdict_length_,
                      aff_header_->affix_rule_offset);
}

LineIterator BDictReader::GetOtherLineIterator() const {
  if (!bdict_data_ ||
      aff_header_->other_offset == 0 ||
      aff_header_->other_offset >= bdict_length_)
    return LineIterator(bdict_data_, 0, 0);  // Item is empty or invalid.
  return LineIterator(bdict_data_, bdict_length_,
                      aff_header_->other_offset);
}

ReplacementIterator BDictReader::GetReplacementIterator() const {
  return ReplacementIterator(bdict_data_, bdict_length_,
                             aff_header_->rep_offset);
}

WordIterator BDictReader::GetAllWordIterator() const {
  NodeReader reader(bdict_data_, bdict_length_, header_->dic_offset, 0);
  return WordIterator(reader);
}

}  // namespace hunspell