third_party/hunspell_new/src/hunspell/hashmgr.cxx - Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell.

Side by Side Diff: third_party/hunspell_new/src/hunspell/hashmgr.cxx

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #include "license.hunspell"

2 #include "license.myspell"

3

4 #include <stdlib.h>

5 #include <string.h>

6 #include <stdio.h>

7 #include <ctype.h>

8

9 #include "hashmgr.hxx"

10 #include "csutil.hxx"

11 #include "atypes.hxx"

12

13 // build a hash table from a munched word list

14

15 #ifdef HUNSPELL_CHROME_CLIENT

16 HashMgr::HashMgr(hunspell::BDictReader* reader)

17 {

18 bdict_reader = reader;

19 #else

20 HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)

21 {

22 #endif

23 tablesize = 0;

24 tableptr = NULL;

25 flag_mode = FLAG_CHAR;

26 complexprefixes = 0;

27 utf8 = 0;

28 langnum = 0;

29 lang = NULL;

30 enc = NULL;

31 csconv = 0;

32 ignorechars = NULL;

33 ignorechars_utf16 = NULL;

34 ignorechars_utf16_len = 0;

35 numaliasf = 0;

36 aliasf = NULL;

37 numaliasm = 0;

38 aliasm = NULL;

39 forbiddenword = FORBIDDENWORD; // forbidden word signing flag

40 #ifdef HUNSPELL_CHROME_CLIENT

41 // No tables to load, just the AF lines.

42 load_config(NULL, NULL);

43 int ec = LoadAFLines();

44 #else

45 load_config(apath, key);

46 int ec = load_tables(tpath, key);

47 #endif

48 if (ec) {

49 /* error condition - what should we do here */

50 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);

51 if (tableptr) {

52 free(tableptr);

53 tableptr = NULL;

54 }

55 tablesize = 0;

56 }

57 }

58

59

60 HashMgr::~HashMgr()

61 {

62 if (tableptr) {

63 // now pass through hash table freeing up everything

64 // go through column by column of the table

65 for (int i=0; i < tablesize; i++) {

66 struct hentry * pt = tableptr[i];

67 struct hentry * nt = NULL;

68 while(pt) {

69 nt = pt->next;

70 if (pt->astr && (!aliasf \|\| TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen)) ) free(pt->astr);

71 free(pt);

72 pt = nt;

73 }

74 }

75 free(tableptr);

76 }

77 tablesize = 0;

78

79 if (aliasf) {

80 for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);

81 free(aliasf);

82 aliasf = NULL;

83 if (aliasflen) {

84 free(aliasflen);

85 aliasflen = NULL;

86 }

87 }

88 if (aliasm) {

89 for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);

90 free(aliasm);

91 aliasm = NULL;

92 }

93

94 #ifndef OPENOFFICEORG

95 #ifndef MOZILLA_CLIENT

96 if (utf8) free_utf_tbl();

97 #endif

98 #endif

99

100 if (enc) free(enc);

101 if (lang) free(lang);

102

103 if (ignorechars) free(ignorechars);

104 if (ignorechars_utf16) free(ignorechars_utf16);

105

106 #ifdef HUNSPELL_CHROME_CLIENT

107 EmptyHentryCache();

108 for (std::vector<std::string*>::iterator it = pointer_to_strings_.begin();

109 it != pointer_to_strings_.end(); ++it) {

110 delete *it;

111 }

112 #endif

113 #ifdef MOZILLA_CLIENT

114 delete [] csconv;

115 #endif

116 }

117

118 #ifdef HUNSPELL_CHROME_CLIENT

119 void HashMgr::EmptyHentryCache() {

120 // We need to delete each cache entry, and each additional one in the linked

121 // list of homonyms.

122 for (HEntryCache::iterator i = hentry_cache.begin();

123 i != hentry_cache.end(); ++i) {

124 hentry* cur = i->second;

125 while (cur) {

126 hentry* next = cur->next_homonym;

127 DeleteHashEntry(cur);

128 cur = next;

129 }

130 }

131 hentry_cache.clear();

132 }

133 #endif

134

135 // lookup a root word in the hashtable

136

137 struct hentry * HashMgr::lookup(const char *word) const

138 {

139 #ifdef HUNSPELL_CHROME_CLIENT

140 int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];

141 int affix_count = bdict_reader->FindWord(word, affix_ids);

142 if (affix_count == 0) { // look for custom added word

143 std::map<base::StringPiece, int>::const_iterator iter =

144 custom_word_to_affix_id_map_.find(word);

145 if (iter != custom_word_to_affix_id_map_.end()) {

146 affix_count = 1;

147 affix_ids[0] = iter->second;

148 }

149 }

150

151 static const int kMaxWordLen = 128;

152 static char word_buf[kMaxWordLen];

153 // To take account of null-termination, we use upto 127.

154 strncpy(word_buf, word, kMaxWordLen - 1);

155

156 return AffixIDsToHentry(word_buf, affix_ids, affix_count);

157 #else

158 struct hentry * dp;

159 if (tableptr) {

160 dp = tableptr[hash(word)];

161 if (!dp) return NULL;

162 for ( ; dp != NULL; dp = dp->next) {

163 if (strcmp(word, dp->word) == 0) return dp;

164 }

165 }

166 return NULL;

167 #endif

168 }

169

170 // add a word to the hash table (private)

171 int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff,

172 int al, const char * desc, bool onlyupcase)

173 {

174 #ifndef HUNSPELL_CHROME_CLIENT

175 bool upcasehomonym = false;

176 int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0;

177 // variable-length hash record with word and optional fields

178 struct hentry* hp =

179 (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl);

180 if (!hp) return 1;

181 char * hpw = hp->word;

182 strcpy(hpw, word);

183 if (ignorechars != NULL) {

184 if (utf8) {

185 remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len);

186 } else {

187 remove_ignored_chars(hpw, ignorechars);

188 }

189 }

190 if (complexprefixes) {

191 if (utf8) reverseword_utf(hpw); else reverseword(hpw);

192 }

193

194 int i = hash(hpw);

195

196 hp->blen = (unsigned char) wbl;

197 hp->clen = (unsigned char) wcl;

198 hp->alen = (short) al;

199 hp->astr = aff;

200 hp->next = NULL;

201 hp->next_homonym = NULL;

202

203 // store the description string or its pointer

204 if (desc) {

205 hp->var = H_OPT;

206 if (aliasm) {

207 hp->var += H_OPT_ALIASM;

208 store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc)));

209 } else {

210 strcpy(hpw + wbl + 1, desc);

211 if (complexprefixes) {

212 if (utf8) reverseword_utf(HENTRY_DATA(hp));

213 else reverseword(HENTRY_DATA(hp));

214 }

215 }

216 if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON;

217 } else hp->var = 0;

218

219 struct hentry * dp = tableptr[i];

220 if (!dp) {

221 tableptr[i] = hp;

222 return 0;

223 }

224 while (dp->next != NULL) {

225 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {

226 // remove hidden onlyupcase homonym

227 if (!onlyupcase) {

228 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {

229 free(dp->astr);

230 dp->astr = hp->astr;

231 dp->alen = hp->alen;

232 free(hp);

233 return 0;

234 } else {

235 dp->next_homonym = hp;

236 }

237 } else {

238 upcasehomonym = true;

239 }

240 }

241 dp=dp->next;

242 }

243 if (strcmp(hp->word, dp->word) == 0) {

244 // remove hidden onlyupcase homonym

245 if (!onlyupcase) {

246 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {

247 free(dp->astr);

248 dp->astr = hp->astr;

249 dp->alen = hp->alen;

250 free(hp);

251 return 0;

252 } else {

253 dp->next_homonym = hp;

254 }

255 } else {

256 upcasehomonym = true;

257 }

258 }

259 if (!upcasehomonym) {

260 dp->next = hp;

261 } else {

262 // remove hidden onlyupcase homonym

263 if (hp->astr) free(hp->astr);

264 free(hp);

265 }

266 #else

267 std::map<base::StringPiece, int>::iterator iter =

268 custom_word_to_affix_id_map_.find(word);

269 if(iter == custom_word_to_affix_id_map_.end()) { // word needs to be added

270 std::string* new_string_word = new std::string(word);

271 pointer_to_strings_.push_back(new_string_word);

272 base::StringPiece sp(*(new_string_word));

273 custom_word_to_affix_id_map_[sp] = 0; // no affixes for custom words

274 return 1;

275 }

276 #endif

277 return 0;

278 }

279

280 int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl,

281 unsigned short * flags, int al, char * dp, int captype)

282 {

283 // add inner capitalized forms to handle the following allcap forms:

284 // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG

285 // Allcaps with suffixes: CIA's -> CIA'S

286 if (((captype == HUHCAP) \|\| (captype == HUHINITCAP) \|\|

287 ((captype == ALLCAP) && (flags != NULL))) &&

288 !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) {

289 unsigned short * flags2 = (unsigned short ) malloc (sizeof(unsigned s hort) (al+1));

290 if (!flags2) return 1;

291 if (al) memcpy(flags2, flags, al * sizeof(unsigned short));

292 flags2[al] = ONLYUPCASEFLAG;

293 if (utf8) {

294 char st[BUFSIZE];

295 w_char w[BUFSIZE];

296 int wlen = u8_u16(w, BUFSIZE, word);

297 mkallsmall_utf(w, wlen, langnum);

298 mkallcap_utf(w, 1, langnum);

299 u16_u8(st, BUFSIZE, w, wlen);

300 return add_word(st,wbl,wcl,flags2,al+1,dp, true);

301 } else {

302 mkallsmall(word, csconv);

303 mkinitcap(word, csconv);

304 return add_word(word,wbl,wcl,flags2,al+1,dp, true);

305 }

306 }

307 return 0;

308 }

309

310 // detect captype and modify word length for UTF-8 encoding

311 int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) {

312 int len;

313 if (utf8) {

314 w_char dest_utf[BUFSIZE];

315 len = u8_u16(dest_utf, BUFSIZE, word);

316 *captype = get_captype_utf8(dest_utf, len, langnum);

317 } else {

318 len = wbl;

319 captype = get_captype((char ) word, len, csconv);

320 }

321 return len;

322 }

323

324 // remove word (personal dictionary function for standalone applications)

325 int HashMgr::remove(const char * word)

326 {

327 #ifdef HUNSPELL_CHROME_CLIENT

328 std::map<base::StringPiece, int>::iterator iter =

329 custom_word_to_affix_id_map_.find(word);

330 if (iter != custom_word_to_affix_id_map_.end())

331 custom_word_to_affix_id_map_.erase(iter);

332 #else

333 struct hentry * dp = lookup(word);

334 while (dp) {

335 if (dp->alen == 0 \|\| !TESTAFF(dp->astr, forbiddenword, dp->alen)) {

336 unsigned short * flags =

337 (unsigned short ) malloc(sizeof(short) (dp->alen + 1));

338 if (!flags) return 1;

339 for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i];

340 flags[dp->alen] = forbiddenword;

341 dp->astr = flags;

342 dp->alen++;

343 flag_qsort(flags, 0, dp->alen);

344 }

345 dp = dp->next_homonym;

346 }

347 #endif

348 return 0;

349 }

350

351 /* remove forbidden flag to add a personal word to the hash */

352 int HashMgr::remove_forbidden_flag(const char * word) {

353 struct hentry * dp = lookup(word);

354 if (!dp) return 1;

355 while (dp) {

356 if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {

357 if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic.

358 else {

359 unsigned short * flags2 =

360 (unsigned short ) malloc(sizeof(short) (dp->alen - 1));

361 if (!flags2) return 1;

362 int i, j = 0;

363 for (i = 0; i < dp->alen; i++) {

364 if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i];

365 }

366 dp->alen--;

367 dp->astr = flags2; // XXX allowed forbidden words

368 }

369 }

370 dp = dp->next_homonym;

371 }

372 return 0;

373 }

374

375 // add a custom dic. word to the hash table (public)

376 int HashMgr::add(const char * word)

377 {

378 unsigned short * flags = NULL;

379 int al = 0;

380 if (remove_forbidden_flag(word)) {

381 int captype;

382 int wbl = strlen(word);

383 int wcl = get_clen_and_captype(word, wbl, &captype);

384 add_word(word, wbl, wcl, flags, al, NULL, false);

385 return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, N ULL, captype);

386 }

387 return 0;

388 }

389

390 int HashMgr::add_with_affix(const char * word, const char * example)

391 {

392 // detect captype and modify word length for UTF-8 encoding

393 struct hentry * dp = lookup(example);

394 remove_forbidden_flag(word);

395 if (dp && dp->astr) {

396 int captype;

397 int wbl = strlen(word);

398 int wcl = get_clen_and_captype(word, wbl, &captype);

399 if (aliasf) {

400 add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false);

401 } else {

402 unsigned short * flags = (unsigned short ) malloc (dp->alen sizeo f(short));

403 if (flags) {

404 memcpy((void ) flags, (void ) dp->astr, dp->alen * sizeof(shor t));

405 add_word(word, wbl, wcl, flags, dp->alen, NULL, false);

406 } else return 1;

407 }

408 return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp ->alen, NULL, captype);

409 }

410 return 1;

411 }

412

413 // walk the hash table entry by entry - null at end

414 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);

415 struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const

416 {

417 #ifdef HUNSPELL_CHROME_CLIENT

418 // Return NULL if dictionary is not valid.

419 if (!bdict_reader->IsValid())

420 return NULL;

421

422 // This function is only ever called by one place and not nested. We can

423 // therefore keep static state between calls and use \|col\| as a "reset" flag

424 // to avoid changing the API. It is set to -1 for the first call.

425 // Allocate the iterator on the heap to prevent an exit time destructor.

426 static hunspell::WordIterator& word_iterator =

427 *new hunspell::WordIterator(bdict_reader->GetAllWordIterator());

428 if (col < 0) {

429 col = 1;

430 word_iterator = bdict_reader->GetAllWordIterator();

431 }

432

433 int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];

434 static const int kMaxWordLen = 128;

435 static char word[kMaxWordLen];

436 int affix_count = word_iterator.Advance(word, kMaxWordLen, affix_ids);

437 if (affix_count == 0)

438 return NULL;

439 short word_len = static_cast<short>(strlen(word));

440

441 // Since hunspell 1.2.8, an hentry struct becomes a variable-length struct,

442 // i.e. a struct which uses its array 'word[1]' as a variable-length array.

443 // As noted above, this function is not nested. So, we just use a static

444 // struct which consists of an hentry and a char[kMaxWordLen], and initialize

445 // the static struct and return it for now.

446 // No need to create linked lists for the extra affixes.

447 static struct {

448 hentry entry;

449 char word[kMaxWordLen];

450 } hash_entry;

451

452 return InitHashEntry(&hash_entry.entry, sizeof(hash_entry),

453 &word[0], word_len, affix_ids[0]);

454 #else

455 if (hp && hp->next != NULL) return hp->next;

456 for (col++; col < tablesize; col++) {

457 if (tableptr[col]) return tableptr[col];

458 }

459 // null at end and reset to start

460 col = -1;

461 return NULL;

462 #endif

463 }

464

465 // load a munched word list and build a hash table on the fly

466 int HashMgr::load_tables(const char * tpath, const char * key)

467 {

468 #ifndef HUNSPELL_CHROME_CLIENT

469 int al;

470 char * ap;

471 char * dp;

472 char * dp2;

473 unsigned short * flags;

474 char * ts;

475

476 // open dictionary file

477 FileMgr * dict = new FileMgr(tpath, key);

478 if (dict == NULL) return 1;

479

480 // first read the first line of file to get hash table size */

481 if ((ts = dict->getline()) == NULL) {

482 HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath);

483 delete dict;

484 return 2;

485 }

486 mychomp(ts);

487

488 /* remove byte order mark */

489 if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) {

490 memmove(ts, ts+3, strlen(ts+3)+1);

491 // warning: dic file begins with byte order mark: possible incompatibility w ith old Hunspell versions

492 }

493

494 tablesize = atoi(ts);

495 if (tablesize == 0) {

496 HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the di c file\n");

497 delete dict;

498 return 4;

499 }

500 tablesize = tablesize + 5 + USERWORD;

501 if ((tablesize %2) == 0) tablesize++;

502

503 // allocate the hash table

504 tableptr = (struct hentry *) malloc(tablesize sizeof(struct hentry *));

505 if (! tableptr) {

506 delete dict;

507 return 3;

508 }

509 for (int i=0; i<tablesize; i++) tableptr[i] = NULL;

510

511 // loop through all words on much list and add to hash

512 // table and create word and affix strings

513

514 while ((ts = dict->getline()) != NULL) {

515 mychomp(ts);

516 // split each line into word and morphological description

517 dp = ts;

518 while ((dp = strchr(dp, ':')) != NULL) {

519 if ((dp > ts + 3) && ((dp - 3) == ' ' \|\| (dp - 3) == '\t')) {

520 for (dp -= 4; dp >= ts && (dp == ' ' \|\| dp == '\t'); dp--);

521 if (dp < ts) { // missing word

522 dp = NULL;

523 } else {

524 *(dp + 1) = '\0';

525 dp = dp + 2;

526 }

527 break;

528 }

529 dp++;

530 }

531

532 // tabulator is the old morphological field separator

533 dp2 = strchr(ts, '\t');

534 if (dp2 && (!dp \|\| dp2 < dp)) {

535 *dp2 = '\0';

536 dp = dp2 + 1;

537 }

538

539 // split each line into word and affix char strings

540 // "\/" signs slash in words (not affix separator)

541 // "/" at beginning of the line is word character (not affix separator)

542 ap = strchr(ts,'/');

543 while (ap) {

544 if (ap == ts) {

545 ap++;

546 continue;

547 } else if (*(ap - 1) != '\\') break;

548 // replace "\/" with "/"

549 for (char * sp = ap - 1; sp; sp = *(sp + 1), sp++);

550 ap = strchr(ap,'/');

551 }

552

553 if (ap) {

554 *ap = '\0';

555 if (aliasf) {

556 int index = atoi(ap + 1);

557 al = get_aliasf(index, &flags, dict);

558 if (!al) {

559 HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum());

560 *ap = '\0';

561 }

562 } else {

563 al = decode_flags(&flags, ap + 1, dict);

564 if (al == -1) {

565 HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");

566 delete dict;

567 return 6;

568 }

569 flag_qsort(flags, 0, al);

570 }

571 } else {

572 al = 0;

573 ap = NULL;

574 flags = NULL;

575 }

576

577 int captype;

578 int wbl = strlen(ts);

579 int wcl = get_clen_and_captype(ts, wbl, &captype);

580 // add the word and its index plus its capitalized form optionally

581 if (add_word(ts,wbl,wcl,flags,al,dp, false) \|\|

582 add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) {

583 delete dict;

584 return 5;

585 }

586 }

587

588 delete dict;

589 #endif

590 return 0;

591 }

592

593 // the hash function is a simple load and rotate

594 // algorithm borrowed

595

596 int HashMgr::hash(const char * word) const

597 {

598 #ifdef HUNSPELL_CHROME_CLIENT

599 return 0;

600 #else

601 long hv = 0;

602 for (int i=0; i < 4 && *word != 0; i++)

603 hv = (hv << 8) \| (*word++);

604 while (*word != 0) {

605 ROTATE(hv,ROTATE_LEN);

606 hv ^= (*word++);

607 }

608 return (unsigned long) hv % tablesize;

609 #endif

610 }

611

612 int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) {

613 int len;

614 if (*flags == '\0') {

615 *result = NULL;

616 return 0;

617 }

618 switch (flag_mode) {

619 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)

620 len = strlen(flags);

621 if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector \n", af->getlinenum());

622 len /= 2;

623 result = (unsigned short ) malloc(len * sizeof(short));

624 if (!*result) return -1;

625 for (int i = 0; i < len; i++) {

626 (result)[i] = (((unsigned short) flags[i 2]) << 8) + (unsigned sh ort) flags[i * 2 + 1];

627 }

628 break;

629 }

630 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 452 1 23 233)

631 int i;

632 len = 1;

633 char * src = flags;

634 unsigned short * dest;

635 char * p;

636 for (p = flags; *p; p++) {

637 if (*p == ',') len++;

638 }

639 result = (unsigned short ) malloc(len * sizeof(short));

640 if (!*result) return -1;

641 dest = *result;

642 for (p = flags; *p; p++) {

643 if (*p == ',') {

644 i = atoi(src);

645 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: fla g id %d is too large (max: %d)\n",

646 af->getlinenum(), i, DEFAULTFLAGS - 1);

647 *dest = (unsigned short) i;

648 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());

649 src = p + 1;

650 dest++;

651 }

652 }

653 i = atoi(src);

654 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",

655 af->getlinenum(), i, DEFAULTFLAGS - 1);

656 *dest = (unsigned short) i;

657 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong fla g id\n", af->getlinenum());

658 break;

659 }

660 case FLAG_UNI: { // UTF-8 characters

661 w_char w[BUFSIZE/2];

662 len = u8_u16(w, BUFSIZE/2, flags);

663 result = (unsigned short ) malloc(len * sizeof(short));

664 if (!*result) return -1;

665 memcpy(result, w, len sizeof(short));

666 break;

667 }

668 default: { // Ispell's one-character flags (erfg -> e r f g)

669 unsigned short * dest;

670 len = strlen(flags);

671 result = (unsigned short ) malloc(len * sizeof(short));

672 if (!*result) return -1;

673 dest = *result;

674 for (unsigned char * p = (unsigned char ) flags; p; p++) {

675 dest = (unsigned short) p;

676 dest++;

677 }

678 }

679 }

680 return len;

681 }

682

683 unsigned short HashMgr::decode_flag(const char * f) {

684 unsigned short s = 0;

685 int i;

686 switch (flag_mode) {

687 case FLAG_LONG:

688 s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];

689 break;

690 case FLAG_NUM:

691 i = atoi(f);

692 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is to o large (max: %d)\n", i, DEFAULTFLAGS - 1);

693 s = (unsigned short) i;

694 break;

695 case FLAG_UNI:

696 u8_u16((w_char *) &s, 1, f);

697 break;

698 default:

699 s = (unsigned short) ((unsigned char )f);

700 }

701 if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");

702 return s;

703 }

704

705 char * HashMgr::encode_flag(unsigned short f) {

706 unsigned char ch[10];

707 if (f==0) return mystrdup("(NULL)");

708 if (flag_mode == FLAG_LONG) {

709 ch[0] = (unsigned char) (f >> 8);

710 ch[1] = (unsigned char) (f - ((f >> 8) << 8));

711 ch[2] = '\0';

712 } else if (flag_mode == FLAG_NUM) {

713 sprintf((char *) ch, "%d", f);

714 } else if (flag_mode == FLAG_UNI) {

715 u16_u8((char ) &ch, 10, (w_char ) &f, 1);

716 } else {

717 ch[0] = (unsigned char) (f);

718 ch[1] = '\0';

719 }

720 return mystrdup((char *) ch);

721 }

722

723 // read in aff file and set flag mode

724 int HashMgr::load_config(const char * affpath, const char * key)

725 {

726 char * line; // io buffers

727 int firstline = 1;

728

729 // open the affix file

730 #ifdef HUNSPELL_CHROME_CLIENT

731 hunspell::LineIterator iterator = bdict_reader->GetOtherLineIterator();

732 FileMgr * afflst = new FileMgr(&iterator);

733 #else

734 FileMgr * afflst = new FileMgr(affpath, key);

735 #endif

736 if (!afflst) {

737 HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n ",affpath);

738 return 1;

739 }

740

741 // read in each line ignoring any that do not

742 // start with a known line type indicator

743

744 while ((line = afflst->getline()) != NULL) {

745 mychomp(line);

746

747 /* remove byte order mark */

748 if (firstline) {

749 firstline = 0;

750 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(l ine+3)+1);

751 }

752

753 /* parse in the try string */

754 if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {

755 if (flag_mode != FLAG_CHAR) {

756 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions o f the FLAG affix file parameter\n", afflst->getlinenum());

757 }

758 if (strstr(line, "long")) flag_mode = FLAG_LONG;

759 if (strstr(line, "num")) flag_mode = FLAG_NUM;

760 if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;

761 if (flag_mode == FLAG_CHAR) {

762 HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `lon g' or `UTF-8' parameter\n", afflst->getlinenum());

763 }

764 }

765 if (strncmp(line,"FORBIDDENWORD",13) == 0) {

766 char * st = NULL;

767 if (parse_string(line, &st, afflst->getlinenum())) {

768 delete afflst;

769 return 1;

770 }

771 forbiddenword = decode_flag(st);

772 free(st);

773 }

774 if (strncmp(line, "SET", 3) == 0) {

775 if (parse_string(line, &enc, afflst->getlinenum())) {

776 delete afflst;

777 return 1;

778 }

779 if (strcmp(enc, "UTF-8") == 0) {

780 utf8 = 1;

781 #ifndef OPENOFFICEORG

782 #ifndef MOZILLA_CLIENT

783 initialize_utf_tbl();

784 #endif

785 #endif

786 } else csconv = get_current_cs(enc);

787 }

788 if (strncmp(line, "LANG", 4) == 0) {

789 if (parse_string(line, &lang, afflst->getlinenum())) {

790 delete afflst;

791 return 1;

792 }

793 langnum = get_lang_num(lang);

794 }

795

796 /* parse in the ignored characters (for example, Arabic optional diacriti cs characters */

797 if (strncmp(line,"IGNORE",6) == 0) {

798 if (parse_array(line, &ignorechars, &ignorechars_utf16,

799 &ignorechars_utf16_len, utf8, afflst->getlinenum())) {

800 delete afflst;

801 return 1;

802 }

803 }

804

805 if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {

806 if (parse_aliasf(line, afflst)) {

807 delete afflst;

808 return 1;

809 }

810 }

811

812 if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {

813 if (parse_aliasm(line, afflst)) {

814 delete afflst;

815 return 1;

816 }

817 }

818

819 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;

820 if (((strncmp(line,"SFX",3) == 0) \|\| (strncmp(line,"PFX",3) == 0)) && iss pace(line[3])) break;

821 }

822 if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING);

823 delete afflst;

824 return 0;

825 }

826

827 /* parse in the ALIAS table */

828 int HashMgr::parse_aliasf(char * line, FileMgr * af)

829 {

830 if (numaliasf != 0) {

831 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a f->getlinenum());

832 return 1;

833 }

834 char * tp = line;

835 char * piece;

836 int i = 0;

837 int np = 0;

838 piece = mystrsep(&tp, 0);

839 while (piece) {

840 if (*piece != '\0') {

841 switch(i) {

842 case 0: { np++; break; }

843 case 1: {

844 numaliasf = atoi(piece);

845 if (numaliasf < 1) {

846 numaliasf = 0;

847 aliasf = NULL;

848 aliasflen = NULL;

849 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu mber\n", af->getlinenum());

850 return 1;

851 }

852 aliasf = (unsigned short *) malloc(numaliasf sizeof(un signed short *));

853 aliasflen = (unsigned short ) malloc(numaliasf sizeof( short));

854 if (!aliasf \|\| !aliasflen) {

855 numaliasf = 0;

856 if (aliasf) free(aliasf);

857 if (aliasflen) free(aliasflen);

858 aliasf = NULL;

859 aliasflen = NULL;

860 return 1;

861 }

862 np++;

863 break;

864 }

865 default: break;

866 }

867 i++;

868 }

869 piece = mystrsep(&tp, 0);

870 }

871 if (np != 2) {

872 numaliasf = 0;

873 free(aliasf);

874 free(aliasflen);

875 aliasf = NULL;

876 aliasflen = NULL;

877 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum( ));

878 return 1;

879 }

880

881 /* now parse the numaliasf lines to read in the remainder of the table */

882 char * nl;

883 for (int j=0; j < numaliasf; j++) {

884 if ((nl = af->getline()) == NULL) return 1;

885 mychomp(nl);

886 tp = nl;

887 i = 0;

888 aliasf[j] = NULL;

889 aliasflen[j] = 0;

890 piece = mystrsep(&tp, 0);

891 while (piece) {

892 if (*piece != '\0') {

893 switch(i) {

894 case 0: {

895 if (strncmp(piece,"AF",2) != 0) {

896 numaliasf = 0;

897 free(aliasf);

898 free(aliasflen);

899 aliasf = NULL;

900 aliasflen = NULL;

901 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

902 return 1;

903 }

904 break;

905 }

906 case 1: {

907 aliasflen[j] = (unsigned short) decode_flags(&(alias f[j]), piece, af);

908 flag_qsort(aliasf[j], 0, aliasflen[j]);

909 break;

910 }

911 default: break;

912 }

913 i++;

914 }

915 piece = mystrsep(&tp, 0);

916 }

917 if (!aliasf[j]) {

918 free(aliasf);

919 free(aliasflen);

920 aliasf = NULL;

921 aliasflen = NULL;

922 numaliasf = 0;

923 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af-> getlinenum());

924 return 1;

925 }

926 }

927 return 0;

928 }

929

930 #ifdef HUNSPELL_CHROME_CLIENT

931 int HashMgr::LoadAFLines()

932 {

933 utf8 = 1; // We always use UTF-8.

934

935 // Read in all the AF lines which tell us the rules for each affix group ID.

936 hunspell::LineIterator iterator = bdict_reader->GetAfLineIterator();

937 FileMgr afflst(&iterator);

938 while (char* line = afflst.getline()) {

939 int rv = parse_aliasf(line, &afflst);

940 if (rv)

941 return rv;

942 }

943

944 return 0;

945 }

946

947 hentry* HashMgr::InitHashEntry(hentry* entry,

948 size_t item_size,

949 const char* word,

950 int word_length,

951 int affix_index) const {

952 // Return if the given buffer doesn't have enough space for a hentry struct

953 // or the given word is too long.

954 // Our BDICT cannot handle words longer than (128 - 1) bytes. So, it is

955 // better to return an error if the given word is too long and prevent

956 // an unexpected result caused by a long word.

957 const int kMaxWordLen = 128;

958 if (item_size < sizeof(hentry) + word_length + 1 \|\|

959 word_length >= kMaxWordLen)

960 return NULL;

961

962 // Initialize a hentry struct with the given parameters, and

963 // append the given string at the end of this hentry struct.

964 memset(entry, 0, item_size);

965 FileMgr af(NULL);

966 entry->alen = static_cast<short>(

967 const_cast<HashMgr*>(this)->get_aliasf(affix_index, &entry->astr, &af));

968 entry->blen = static_cast<unsigned char>(word_length);

969 memcpy(&entry->word, word, word_length);

970

971 return entry;

972 }

973

974 hentry* HashMgr::CreateHashEntry(const char* word,

975 int word_length,

976 int affix_index) const {

977 // Return if the given word is too long.

978 // (See the comment in HashMgr::InitHashEntry().)

979 const int kMaxWordLen = 128;

980 if (word_length >= kMaxWordLen)

981 return NULL;

982

983 const size_t kEntrySize = sizeof(hentry) + word_length + 1;

984 struct hentry* entry = reinterpret_cast<hentry*>(malloc(kEntrySize));

985 if (entry)

986 InitHashEntry(entry, kEntrySize, word, word_length, affix_index);

987

988 return entry;

989 }

990

991 void HashMgr::DeleteHashEntry(hentry* entry) const {

992 free(entry);

993 }

994

995 hentry* HashMgr::AffixIDsToHentry(char* word,

996 int* affix_ids,

997 int affix_count) const

998 {

999 if (affix_count == 0)

1000 return NULL;

1001

1002 HEntryCache& cache = const_cast<HashMgr*>(this)->hentry_cache;

1003 std::string std_word(word);

1004 HEntryCache::iterator found = cache.find(std_word);

1005 if (found != cache.end()) {

1006 // We must return an existing hentry for the same word if we've previously

1007 // handed one out. Hunspell will compare pointers in some cases to see if

1008 // two words it has found are the same.

1009 return found->second;

1010 }

1011

1012 short word_len = static_cast<short>(strlen(word));

1013

1014 // We can get a number of prefixes per word. There will normally be only one,

1015 // but if not, there will be a linked list of "hentry"s for the "homonym"s

1016 // for the word.

1017 struct hentry* first_he = NULL;

1018 struct hentry* prev_he = NULL; // For making linked list.

1019 for (int i = 0; i < affix_count; i++) {

1020 struct hentry* he = CreateHashEntry(word, word_len, affix_ids[i]);

1021 if (!he)

1022 break;

1023 if (i == 0)

1024 first_he = he;

1025 if (prev_he)

1026 prev_he->next_homonym = he;

1027 prev_he = he;

1028 }

1029

1030 cache[std_word] = first_he; // Save this word in the cache for later.

1031 return first_he;

1032 }

1033

1034 hentry* HashMgr::GetHentryFromHEntryCache(char* word) {

1035 HEntryCache& cache = const_cast<HashMgr*>(this)->hentry_cache;

1036 std::string std_word(word);

1037 HEntryCache::iterator found = cache.find(std_word);

1038 if (found != cache.end())

1039 return found->second;

1040 else

1041 return NULL;

1042 }

1043 #endif

1044

1045 int HashMgr::is_aliasf() {

1046 return (aliasf != NULL);

1047 }

1048

1049 int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) {

1050 if ((index > 0) && (index <= numaliasf)) {

1051 *fvec = aliasf[index - 1];

1052 return aliasflen[index - 1];

1053 }

1054 HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->g etlinenum(), index);

1055 *fvec = NULL;

1056 return 0;

1057 }

1058

1059 /* parse morph alias definitions */

1060 int HashMgr::parse_aliasm(char * line, FileMgr * af)

1061 {

1062 if (numaliasm != 0) {

1063 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a f->getlinenum());

1064 return 1;

1065 }

1066 char * tp = line;

1067 char * piece;

1068 int i = 0;

1069 int np = 0;

1070 piece = mystrsep(&tp, 0);

1071 while (piece) {

1072 if (*piece != '\0') {

1073 switch(i) {

1074 case 0: { np++; break; }

1075 case 1: {

1076 numaliasm = atoi(piece);

1077 if (numaliasm < 1) {

1078 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu mber\n", af->getlinenum());

1079 return 1;

1080 }

1081 aliasm = (char *) malloc(numaliasm sizeof(char *));

1082 if (!aliasm) {

1083 numaliasm = 0;

1084 return 1;

1085 }

1086 np++;

1087 break;

1088 }

1089 default: break;

1090 }

1091 i++;

1092 }

1093 piece = mystrsep(&tp, 0);

1094 }

1095 if (np != 2) {

1096 numaliasm = 0;

1097 free(aliasm);

1098 aliasm = NULL;

1099 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum( ));

1100 return 1;

1101 }

1102

1103 /* now parse the numaliasm lines to read in the remainder of the table */

1104 char * nl = line;

1105 for (int j=0; j < numaliasm; j++) {

1106 if ((nl = af->getline()) == NULL) return 1;

1107 mychomp(nl);

1108 tp = nl;

1109 i = 0;

1110 aliasm[j] = NULL;

1111 piece = mystrsep(&tp, ' ');

1112 while (piece) {

1113 if (*piece != '\0') {

1114 switch(i) {

1115 case 0: {

1116 if (strncmp(piece,"AM",2) != 0) {

1117 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

1118 numaliasm = 0;

1119 free(aliasm);

1120 aliasm = NULL;

1121 return 1;

1122 }

1123 break;

1124 }

1125 case 1: {

1126 // add the remaining of the line

1127 if (*tp) {

1128 *(tp - 1) = ' ';

1129 tp = tp + strlen(tp);

1130 }

1131 if (complexprefixes) {

1132 if (utf8) reverseword_utf(piece);

1133 else reverseword(piece);

1134 }

1135 aliasm[j] = mystrdup(piece);

1136 if (!aliasm[j]) {

1137 numaliasm = 0;

1138 free(aliasm);

1139 aliasm = NULL;

1140 return 1;

1141 }

1142 break; }

1143 default: break;

1144 }

1145 i++;

1146 }

1147 piece = mystrsep(&tp, ' ');

1148 }

1149 if (!aliasm[j]) {

1150 numaliasm = 0;

1151 free(aliasm);

1152 aliasm = NULL;

1153 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af-> getlinenum());

1154 return 1;

1155 }

1156 }

1157 return 0;

1158 }

1159

1160 int HashMgr::is_aliasm() {

1161 return (aliasm != NULL);

1162 }

1163

1164 char * HashMgr::get_aliasm(int index) {

1165 if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];

1166 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);

1167 return NULL;

1168 }

OLD	NEW

« no previous file with comments | « third_party/hunspell_new/src/hunspell/hashmgr.hxx ('k') | third_party/hunspell_new/src/hunspell/htypes.hxx » ('j') | no next file with comments »