third_party/hunspell_new/src/hunspell/affentry.cxx - Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell.

Side by Side Diff: third_party/hunspell_new/src/hunspell/affentry.cxx

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #include "license.hunspell"

2 #include "license.myspell"

3

4 #include <stdlib.h>

5 #include <string.h>

6 #include <stdio.h>

7 #include <ctype.h>

8

9 #include "affentry.hxx"

10 #include "csutil.hxx"

11

12 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)

13 {

14 // register affix manager

15 pmyMgr = pmgr;

16

17 // set up its initial values

18

19 aflag = dp->aflag; // flag

20 strip = dp->strip; // string to strip

21 appnd = dp->appnd; // string to append

22 stripl = dp->stripl; // length of strip string

23 appndl = dp->appndl; // length of append string

24 numconds = dp->numconds; // length of the condition

25 opts = dp->opts; // cross product flag

26 // then copy over all of the conditions

27 if (opts & aeLONGCOND) {

28 memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);

29 c.l.conds2 = dp->c.l.conds2;

30 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);

31 next = NULL;

32 nextne = NULL;

33 nexteq = NULL;

34 morphcode = dp->morphcode;

35 contclass = dp->contclass;

36 contclasslen = dp->contclasslen;

37 }

38

39

40 PfxEntry::~PfxEntry()

41 {

42 aflag = 0;

43 if (appnd) free(appnd);

44 if (strip) free(strip);

45 pmyMgr = NULL;

46 appnd = NULL;

47 strip = NULL;

48 if (opts & aeLONGCOND) free(c.l.conds2);

49 if (morphcode && !(opts & aeALIASM)) free(morphcode);

50 if (contclass && !(opts & aeALIASF)) free(contclass);

51 }

52

53 // add prefix to this word assuming conditions hold

54 char * PfxEntry::add(const char * word, int len)

55 {

56 char tword[MAXWORDUTF8LEN + 4];

57

58 if ((len > stripl \|\| (len == 0 && pmyMgr->get_fullstrip())) &&

59 (len >= numconds) && test_condition(word) &&

60 (!stripl \|\| (strncmp(word, strip, stripl) == 0)) &&

61 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {

62 /* we have a match so add prefix */

63 char * pp = tword;

64 if (appndl) {

65 strcpy(tword,appnd);

66 pp += appndl;

67 }

68 strcpy(pp, (word + stripl));

69 return mystrdup(tword);

70 }

71 return NULL;

72 }

73

74 inline char * PfxEntry::nextchar(char * p) {

75 if (p) {

76 p++;

77 if (opts & aeLONGCOND) {

78 // jump to the 2nd part of the condition

79 if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;

80 // end of the MAXCONDLEN length condition

81 } else if (p == c.conds + MAXCONDLEN) return NULL;

82 return *p ? p : NULL;

83 }

84 return NULL;

85 }

86

87 inline int PfxEntry::test_condition(const char * st)

88 {

89 const char * pos = NULL; // group with pos input position

90 bool neg = false; // complementer

91 bool ingroup = false; // character in the group

92 if (numconds == 0) return 1;

93 char * p = c.conds;

94 while (1) {

95 switch (*p) {

96 case '\0': return 1;

97 case '[': {

98 neg = false;

99 ingroup = false;

100 p = nextchar(p);

101 pos = st; break;

102 }

103 case '^': { p = nextchar(p); neg = true; break; }

104 case ']': {

105 if ((neg && ingroup) \|\| (!neg && !ingroup)) return 0;

106 pos = NULL;

107 p = nextchar(p);

108 // skip the next character

109 if (!ingroup && st) for (st++; (opts & aeUTF8) && (st & 0xc0) == 0x80; st++);

110 if (*st == '\0' && p) return 0; // word <= condition

111 break;

112 }

113 case '.': if (!pos) { // dots are not metacharacters in groups: [.]

114 p = nextchar(p);

115 // skip the next character

116 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);

117 if (*st == '\0' && p) return 0; // word <= condition

118 break;

119 }

120 default: {

121 if (st == p) {

122 st++;

123 p = nextchar(p);

124 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte

125 while (p && (*p & 0xc0) == 0x80) { // character

126 if (p != st) {

127 if (!pos) return 0;

128 st = pos;

129 break;

130 }

131 p = nextchar(p);

132 st++;

133 }

134 if (pos && st != pos) {

135 ingroup = true;

136 while (p && *p != ']' && ((p = nextchar(p)) != NULL) );

137 }

138 } else if (pos) {

139 ingroup = true;

140 while (p && *p != ']' && ((p = nextchar(p)) != NULL));

141 }

142 } else if (pos) { // group

143 p = nextchar(p);

144 } else return 0;

145 }

146 }

147 if (!p) return 1;

148 }

149 }

150

151 // check if this prefix entry matches

152 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound , const FLAG needflag)

153 {

154 int tmpl; // length of tmpword

155 struct hentry * he; // hash entry of root word or NULL

156 char tmpword[MAXWORDUTF8LEN + 4];

157

158 // on entry prefix is 0 length or already matches the beginning of the word.

159 // So if the remaining root word has positive length

160 // and if there are enough chars in root word and added back strip chars

161 // to meet the number of characters conditions, then test it

162

163 tmpl = len - appndl;

164

165 if (tmpl > 0 \|\| (tmpl == 0 && pmyMgr->get_fullstrip())) {

166

167 // generate new root word by removing prefix and adding

168 // back any characters that would have been stripped

169

170 if (stripl) strcpy (tmpword, strip);

171 strcpy ((tmpword + stripl), (word + appndl));

172

173 // now make sure all of the conditions on characters

174 // are met. Please see the appendix at the end of

175 // this file for more info on exactly what is being

176 // tested

177

178 // if all conditions are met then check if resulting

179 // root word in the dictionary

180

181 if (test_condition(tmpword)) {

182 tmpl += stripl;

183 if ((he = pmyMgr->lookup(tmpword)) != NULL) {

184 do {

185 if (TESTAFF(he->astr, aflag, he->alen) &&

186 // forbid single prefixes with needaffix flag

187 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclassl en) &&

188 // needflag

189 ((!needflag) \|\| TESTAFF(he->astr, needflag, he->alen) \|\|

190 (contclass && TESTAFF(contclass, needflag, contclasslen ))))

191 return he;

192 he = he->next_homonym; // check homonyms

193 } while (he);

194 }

195

196 // prefix matched but no root word was found

197 // if aeXPRODUCT is allowed, try again but now

198 // ross checked combined with a suffix

199

200 //if ((opts & aeXPRODUCT) && in_compound) {

201 if ((opts & aeXPRODUCT)) {

202 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NU LL,

203 0, NULL, FLAG_NULL, needflag, in_compound);

204 if (he) return he;

205 }

206 }

207 }

208 return NULL;

209 }

210

211 // check if this prefix entry matches

212 struct hentry * PfxEntry::check_twosfx(const char * word, int len,

213 char in_compound, const FLAG needflag)

214 {

215 int tmpl; // length of tmpword

216 struct hentry * he; // hash entry of root word or NULL

217 char tmpword[MAXWORDUTF8LEN + 4];

218

219 // on entry prefix is 0 length or already matches the beginning of the word.

220 // So if the remaining root word has positive length

221 // and if there are enough chars in root word and added back strip chars

222 // to meet the number of characters conditions, then test it

223

224 tmpl = len - appndl;

225

226 if ((tmpl > 0 \|\| (tmpl == 0 && pmyMgr->get_fullstrip())) &&

227 (tmpl + stripl >= numconds)) {

228

229 // generate new root word by removing prefix and adding

230 // back any characters that would have been stripped

231

232 if (stripl) strcpy (tmpword, strip);

233 strcpy ((tmpword + stripl), (word + appndl));

234

235 // now make sure all of the conditions on characters

236 // are met. Please see the appendix at the end of

237 // this file for more info on exactly what is being

238 // tested

239

240 // if all conditions are met then check if resulting

241 // root word in the dictionary

242

243 if (test_condition(tmpword)) {

244 tmpl += stripl;

245

246 // prefix matched but no root word was found

247 // if aeXPRODUCT is allowed, try again but now

248 // cross checked combined with a suffix

249

250 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

251 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, t his, needflag);

252 if (he) return he;

253 }

254 }

255 }

256 return NULL;

257 }

258

259 // check if this prefix entry matches

260 char * PfxEntry::check_twosfx_morph(const char * word, int len,

261 char in_compound, const FLAG needflag)

262 {

263 int tmpl; // length of tmpword

264 char tmpword[MAXWORDUTF8LEN + 4];

265

266 // on entry prefix is 0 length or already matches the beginning of the word.

267 // So if the remaining root word has positive length

268 // and if there are enough chars in root word and added back strip chars

269 // to meet the number of characters conditions, then test it

270

271 tmpl = len - appndl;

272

273 if ((tmpl > 0 \|\| (tmpl == 0 && pmyMgr->get_fullstrip())) &&

274 (tmpl + stripl >= numconds)) {

275

276 // generate new root word by removing prefix and adding

277 // back any characters that would have been stripped

278

279 if (stripl) strcpy (tmpword, strip);

280 strcpy ((tmpword + stripl), (word + appndl));

281

282 // now make sure all of the conditions on characters

283 // are met. Please see the appendix at the end of

284 // this file for more info on exactly what is being

285 // tested

286

287 // if all conditions are met then check if resulting

288 // root word in the dictionary

289

290 if (test_condition(tmpword)) {

291 tmpl += stripl;

292

293 // prefix matched but no root word was found

294 // if aeXPRODUCT is allowed, try again but now

295 // ross checked combined with a suffix

296

297 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

298 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,

299 aeXPRODUCT, this, needflag);

300 }

301 }

302 }

303 return NULL;

304 }

305

306 // check if this prefix entry matches

307 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)

308 {

309 int tmpl; // length of tmpword

310 struct hentry * he; // hash entry of root word or NULL

311 char tmpword[MAXWORDUTF8LEN + 4];

312 char result[MAXLNLEN];

313 char * st;

314

315 *result = '\0';

316

317 // on entry prefix is 0 length or already matches the beginning of the word.

318 // So if the remaining root word has positive length

319 // and if there are enough chars in root word and added back strip chars

320 // to meet the number of characters conditions, then test it

321

322 tmpl = len - appndl;

323

324 if ((tmpl > 0 \|\| (tmpl == 0 && pmyMgr->get_fullstrip())) &&

325 (tmpl + stripl >= numconds)) {

326

327 // generate new root word by removing prefix and adding

328 // back any characters that would have been stripped

329

330 if (stripl) strcpy (tmpword, strip);

331 strcpy ((tmpword + stripl), (word + appndl));

332

333 // now make sure all of the conditions on characters

334 // are met. Please see the appendix at the end of

335 // this file for more info on exactly what is being

336 // tested

337

338 // if all conditions are met then check if resulting

339 // root word in the dictionary

340

341 if (test_condition(tmpword)) {

342 tmpl += stripl;

343 if ((he = pmyMgr->lookup(tmpword)) != NULL) {

344 do {

345 if (TESTAFF(he->astr, aflag, he->alen) &&

346 // forbid single prefixes with needaffix flag

347 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclassl en) &&

348 // needflag

349 ((!needflag) \|\| TESTAFF(he->astr, needflag, he->alen) \|\|

350 (contclass && TESTAFF(contclass, needflag, contclasslen )))) {

351 if (morphcode) {

352 mystrcat(result, " ", MAXLNLEN);

353 mystrcat(result, morphcode, MAXLNLEN);

354 } else mystrcat(result,getKey(), MAXLNLEN);

355 if (!HENTRY_FIND(he, MORPH_STEM)) {

356 mystrcat(result, " ", MAXLNLEN);

357 mystrcat(result, MORPH_STEM, MAXLNLEN);

358 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);

359 }

360 // store the pointer of the hash entry

361 if (HENTRY_DATA(he)) {

362 mystrcat(result, " ", MAXLNLEN);

363 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);

364 } else {

365 // return with debug information

366 char * flag = pmyMgr->encode_flag(getFlag());

367 mystrcat(result, " ", MAXLNLEN);

368 mystrcat(result, MORPH_FLAG, MAXLNLEN);

369 mystrcat(result, flag, MAXLNLEN);

370 free(flag);

371 }

372 mystrcat(result, "\n", MAXLNLEN);

373 }

374 he = he->next_homonym;

375 } while (he);

376 }

377

378 // prefix matched but no root word was found

379 // if aeXPRODUCT is allowed, try again but now

380 // ross checked combined with a suffix

381

382 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

383 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, th is,

384 FLAG_NULL, needflag);

385 if (st) {

386 mystrcat(result, st, MAXLNLEN);

387 free(st);

388 }

389 }

390 }

391 }

392

393 if (*result) return mystrdup(result);

394 return NULL;

395 }

396

397 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)

398 {

399 // register affix manager

400 pmyMgr = pmgr;

401

402 // set up its initial values

403 aflag = dp->aflag; // char flag

404 strip = dp->strip; // string to strip

405 appnd = dp->appnd; // string to append

406 stripl = dp->stripl; // length of strip string

407 appndl = dp->appndl; // length of append string

408 numconds = dp->numconds; // length of the condition

409 opts = dp->opts; // cross product flag

410

411 // then copy over all of the conditions

412 if (opts & aeLONGCOND) {

413 memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);

414 c.l.conds2 = dp->c.l.conds2;

415 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);

416 next = NULL;

417 nextne = NULL;

418 nexteq = NULL;

419 rappnd = myrevstrdup(appnd);

420 morphcode = dp->morphcode;

421 contclass = dp->contclass;

422 contclasslen = dp->contclasslen;

423 }

424

425

426 SfxEntry::~SfxEntry()

427 {

428 aflag = 0;

429 if (appnd) free(appnd);

430 if (rappnd) free(rappnd);

431 if (strip) free(strip);

432 pmyMgr = NULL;

433 appnd = NULL;

434 strip = NULL;

435 if (opts & aeLONGCOND) free(c.l.conds2);

436 if (morphcode && !(opts & aeALIASM)) free(morphcode);

437 if (contclass && !(opts & aeALIASF)) free(contclass);

438 }

439

440 // add suffix to this word assuming conditions hold

441 char * SfxEntry::add(const char * word, int len)

442 {

443 char tword[MAXWORDUTF8LEN + 4];

444

445 /* make sure all conditions match */

446 if ((len > stripl \|\| (len == 0 && pmyMgr->get_fullstrip())) &&

447 (len >= numconds) && test_condition(word + len, word) &&

448 (!stripl \|\| (strcmp(word + len - stripl, strip) == 0)) &&

449 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {

450 /* we have a match so add suffix */

451 strcpy(tword,word);

452 if (appndl) {

453 strcpy(tword + len - stripl, appnd);

454 } else {

455 *(tword + len - stripl) = '\0';

456 }

457 return mystrdup(tword);

458 }

459 return NULL;

460 }

461

462 inline char * SfxEntry::nextchar(char * p) {

463 if (p) {

464 p++;

465 if (opts & aeLONGCOND) {

466 // jump to the 2nd part of the condition

467 if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;

468 // end of the MAXCONDLEN length condition

469 } else if (p == c.conds + MAXCONDLEN) return NULL;

470 return *p ? p : NULL;

471 }

472 return NULL;

473 }

474

475 inline int SfxEntry::test_condition(const char * st, const char * beg)

476 {

477 const char * pos = NULL; // group with pos input position

478 bool neg = false; // complementer

479 bool ingroup = false; // character in the group

480 if (numconds == 0) return 1;

481 char * p = c.conds;

482 st--;

483 int i = 1;

484 while (1) {

485 switch (*p) {

486 case '\0': return 1;

487 case '[': { p = nextchar(p); pos = st; break; }

488 case '^': { p = nextchar(p); neg = true; break; }

489 case ']': { if (!neg && !ingroup) return 0;

490 i++;

491 // skip the next character

492 if (!ingroup) {

493 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x8 0; st--);

494 st--;

495 }

496 pos = NULL;

497 neg = false;

498 ingroup = false;

499 p = nextchar(p);

500 if (st < beg && p) return 0; // word <= condition

501 break;

502 }

503 case '.': if (!pos) { // dots are not metacharacters in groups: [.]

504 p = nextchar(p);

505 // skip the next character

506 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x8 0; st--);

507 if (st < beg) { // word <= condition

508 if (p) return 0; else return 1;

509 }

510 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 char acter

511 st--;

512 if (st < beg) { // word <= condition

513 if (p) return 0; else return 1;

514 }

515 }

516 break;

517 }

518 default: {

519 if (st == p) {

520 p = nextchar(p);

521 if ((opts & aeUTF8) && (*st & 0x80)) {

522 st--;

523 while (p && (st >= beg)) {

524 if (p != st) {

525 if (!pos) return 0;

526 st = pos;

527 break;

528 }

529 // first byte of the UTF-8 multibyte character

530 if ((*p & 0xc0) != 0x80) break;

531 p = nextchar(p);

532 st--;

533 }

534 if (pos && st != pos) {

535 if (neg) return 0;

536 else if (i == numconds) return 1;

537 ingroup = true;

538 while (p && *p != ']' && ((p = nextchar(p)) != NULL) );

539 st--;

540 }

541 if (p && *p != ']') p = nextchar(p);

542 } else if (pos) {

543 if (neg) return 0;

544 else if (i == numconds) return 1;

545 ingroup = true;

546 while (p && *p != ']' && ((p = nextchar(p)) != NULL));

547 // if (p && *p != ']') p = nextchar(p);

548 st--;

549 }

550 if (!pos) {

551 i++;

552 st--;

553 }

554 if (st < beg && p && *p != ']') return 0; // word <= conditi on

555 } else if (pos) { // group

556 p = nextchar(p);

557 } else return 0;

558 }

559 }

560 if (!p) return 1;

561 }

562 }

563

564 // see if this suffix is present in the word

565 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,

566 PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,

567 const FLAG badflag)

568 {

569 int tmpl; // length of tmpword

570 struct hentry * he; // hash entry pointer

571 unsigned char * cp;

572 char tmpword[MAXWORDUTF8LEN + 4];

573 PfxEntry* ep = ppfx;

574

575 // if this suffix is being cross checked with a prefix

576 // but it does not support cross products skip it

577

578 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))

579 return NULL;

580

581 // upon entry suffix is 0 length or already matches the end of the word.

582 // So if the remaining root word has positive length

583 // and if there are enough chars in root word and added back strip chars

584 // to meet the number of characters conditions, then test it

585

586 tmpl = len - appndl;

587 // the second condition is not enough for UTF-8 strings

588 // it checked in test_condition()

589

590 if ((tmpl > 0 \|\| (tmpl == 0 && pmyMgr->get_fullstrip())) &&

591 (tmpl + stripl >= numconds)) {

592

593 // generate new root word by removing suffix and adding

594 // back any characters that would have been stripped or

595 // or null terminating the shorter string

596

597 strcpy (tmpword, word);

598 cp = (unsigned char *)(tmpword + tmpl);

599 if (stripl) {

600 strcpy ((char *)cp, strip);

601 tmpl += stripl;

602 cp = (unsigned char *)(tmpword + tmpl);

603 } else *cp = '\0';

604

605 // now make sure all of the conditions on characters

606 // are met. Please see the appendix at the end of

607 // this file for more info on exactly what is being

608 // tested

609

610 // if all conditions are met then check if resulting

611 // root word in the dictionary

612

613 if (test_condition((char ) cp, (char ) tmpword)) {

614

615 #ifdef SZOSZABLYA_POSSIBLE_ROOTS

616 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);

617 #endif

618 if ((he = pmyMgr->lookup(tmpword)) != NULL) {

619 do {

620 // check conditional suffix (enabled by prefix)

621 if ((TESTAFF(he->astr, aflag, he->alen) \|\| (ep && ep->ge tCont() &&

622 TESTAFF(ep->getCont(), aflag, ep->getContLen ()))) &&

623 (((optflags & aeXPRODUCT) == 0) \|\|

624 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) \| \|

625 // enabled by prefix

626 ((contclass) && (ep && TESTAFF(contclass, ep->getFla g(), contclasslen)))

627 ) &&

628 // handle cont. class

629 ((!cclass) \|\|

630 ((contclass) && TESTAFF(contclass, cclass, contc lasslen))

631 ) &&

632 // check only in compound homonyms (bad flags)

633 (!badflag \|\| !TESTAFF(he->astr, badflag, he->alen)

634 ) &&

635 // handle required flag

636 ((!needflag) \|\|

637 (TESTAFF(he->astr, needflag, he->alen) \|\|

638 ((contclass) && TESTAFF(contclass, needflag, contc lasslen)))

639 )

640 ) return he;

641 he = he->next_homonym; // check homonyms

642 } while (he);

643

644 // obsolote stemming code (used only by the

645 // experimental SuffixMgr:suggest_pos_stems)

646 // store resulting root in wlst

647 } else if (wlst && (*ns < maxSug)) {

648 int cwrd = 1;

649 for (int k=0; k < *ns; k++)

650 if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;

651 if (cwrd) {

652 wlst[*ns] = mystrdup(tmpword);

653 if (wlst[*ns] == NULL) {

654 for (int j=0; j<*ns; j++) free(wlst[j]);

655 *ns = -1;

656 return NULL;

657 }

658 (*ns)++;

659 }

660 }

661 }

662 }

663 return NULL;

664 }

665

666 // see if two-level suffix is present in the word

667 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,

668 PfxEntry* ppfx, const FLAG needflag)

669 {

670 int tmpl; // length of tmpword

671 struct hentry * he; // hash entry pointer

672 unsigned char * cp;

673 char tmpword[MAXWORDUTF8LEN + 4];

674 PfxEntry* ep = ppfx;

675

676

677 // if this suffix is being cross checked with a prefix

678 // but it does not support cross products skip it

679

680 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)

681 return NULL;

682

683 // upon entry suffix is 0 length or already matches the end of the word.

684 // So if the remaining root word has positive length

685 // and if there are enough chars in root word and added back strip chars

686 // to meet the number of characters conditions, then test it

687

688 tmpl = len - appndl;

689

690 if ((tmpl > 0 \|\| (tmpl == 0 && pmyMgr->get_fullstrip())) &&

691 (tmpl + stripl >= numconds)) {

692

693 // generate new root word by removing suffix and adding

694 // back any characters that would have been stripped or

695 // or null terminating the shorter string

696

697 strcpy (tmpword, word);

698 cp = (unsigned char *)(tmpword + tmpl);

699 if (stripl) {

700 strcpy ((char *)cp, strip);

701 tmpl += stripl;

702 cp = (unsigned char *)(tmpword + tmpl);

703 } else *cp = '\0';

704

705 // now make sure all of the conditions on characters

706 // are met. Please see the appendix at the end of

707 // this file for more info on exactly what is being

708 // tested

709

710 // if all conditions are met then recall suffix_check

711

712 if (test_condition((char ) cp, (char ) tmpword)) {

713 if (ppfx) {

714 // handle conditional suffix

715 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contcla sslen))

716 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);

717 else

718 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);

719 } else {

720 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, N ULL, (FLAG) aflag, needflag);

721 }

722 if (he) return he;

723 }

724 }

725 return NULL;

726 }

727

728 // see if two-level suffix is present in the word

729 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,

730 PfxEntry* ppfx, const FLAG needflag)

731 {

732 int tmpl; // length of tmpword

733 unsigned char * cp;

734 char tmpword[MAXWORDUTF8LEN + 4];

735 PfxEntry* ep = ppfx;

736 char * st;

737

738 char result[MAXLNLEN];

739

740 *result = '\0';

741

742 // if this suffix is being cross checked with a prefix

743 // but it does not support cross products skip it

744

745 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)

746 return NULL;

747

748 // upon entry suffix is 0 length or already matches the end of the word.

749 // So if the remaining root word has positive length

750 // and if there are enough chars in root word and added back strip chars

751 // to meet the number of characters conditions, then test it

752

753 tmpl = len - appndl;

754

755 if ((tmpl > 0 \|\| (tmpl == 0 && pmyMgr->get_fullstrip())) &&

756 (tmpl + stripl >= numconds)) {

757

758 // generate new root word by removing suffix and adding

759 // back any characters that would have been stripped or

760 // or null terminating the shorter string

761

762 strcpy (tmpword, word);

763 cp = (unsigned char *)(tmpword + tmpl);

764 if (stripl) {

765 strcpy ((char *)cp, strip);

766 tmpl += stripl;

767 cp = (unsigned char *)(tmpword + tmpl);

768 } else *cp = '\0';

769

770 // now make sure all of the conditions on characters

771 // are met. Please see the appendix at the end of

772 // this file for more info on exactly what is being

773 // tested

774

775 // if all conditions are met then recall suffix_check

776

777 if (test_condition((char ) cp, (char ) tmpword)) {

778 if (ppfx) {

779 // handle conditional suffix

780 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contcla sslen)) {

781 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);

782 if (st) {

783 if (ppfx->getMorph()) {

784 mystrcat(result, ppfx->getMorph(), MAXLNLEN);

785 mystrcat(result, " ", MAXLNLEN);

786 }

787 mystrcat(result,st, MAXLNLEN);

788 free(st);

789 mychomp(result);

790 }

791 } else {

792 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);

793 if (st) {

794 mystrcat(result, st, MAXLNLEN);

795 free(st);

796 mychomp(result);

797 }

798 }

799 } else {

800 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);

801 if (st) {

802 mystrcat(result, st, MAXLNLEN);

803 free(st);

804 mychomp(result);

805 }

806 }

807 if (*result) return mystrdup(result);

808 }

809 }

810 return NULL;

811 }

812

813 // get next homonym with same affix

814 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, Pfx Entry* ppfx,

815 const FLAG cclass, const FLAG needflag)

816 {

817 PfxEntry* ep = ppfx;

818 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;

819

820 while (he->next_homonym) {

821 he = he->next_homonym;

822 if ((TESTAFF(he->astr, aflag, he->alen) \|\| (ep && ep->getCont() && TESTA FF(ep->getCont(), aflag, ep->getContLen()))) &&

823 ((optflags & aeXPRODUCT) == 0 \|\|

824 TESTAFF(he->astr, eFlag, he->alen) \|\|

825 // handle conditional suffix

826 ((contclass) && TESTAFF(contclass, eFlag, contclassl en))

827 ) &&

828 // handle cont. class

829 ((!cclass) \|\|

830 ((contclass) && TESTAFF(contclass, cclass, contc lasslen))

831 ) &&

832 // handle required flag

833 ((!needflag) \|\|

834 (TESTAFF(he->astr, needflag, he->alen) \|\|

835 ((contclass) && TESTAFF(contclass, needflag, contc lasslen)))

836 )

837 ) return he;

838 }

839 return NULL;

840 }

841

842

843 #if 0

844

845 Appendix: Understanding Affix Code

846

847

848 An affix is either a prefix or a suffix attached to root words to make

849 other words.

850

851 Basically a Prefix or a Suffix is set of AffEntry objects

852 which store information about the prefix or suffix along

853 with supporting routines to check if a word has a particular

854 prefix or suffix or a combination.

855

856 The structure affentry is defined as follows:

857

858 struct affentry

859 {

860 unsigned short aflag; // ID used to represent the affix

861 char * strip; // string to strip before adding affix

862 char * appnd; // the affix string to add

863 unsigned char stripl; // length of the strip string

864 unsigned char appndl; // length of the affix string

865 char numconds; // the number of conditions that must be met

866 char opts; // flag: aeXPRODUCT- combine both prefix and suffix

867 char conds[SETSIZE]; // array which encodes the conditions to be met

868 };

869

870

871 Here is a suffix borrowed from the en_US.aff file. This file

872 is whitespace delimited.

873

874 SFX D Y 4

875 SFX D 0 d e

876 SFX D y ied [^aeiou]y

877 SFX D 0 ed [^ey]

878 SFX D 0 ed [aeiou]y

879

880 This information can be interpreted as follows:

881

882 The first line has 4 fields

883

884 Field

885 -----

886 1 SFX - indicates this is a suffix

887 2 D - is the name of the character flag which represents this suffix

888 3 Y - indicates it can be combined with prefixes (cross product)

889 4 4 - indicates that sequence of 4 affentry structures are needed to

890 properly store the affix information

891

892 The remaining lines describe the unique information for the 4 SfxEntry

893 objects that make up this affix. Each line can be interpreted

894 as follows: (note fields 1 and 2 are as a check against line 1 info)

895

896 Field

897 -----

898 1 SFX - indicates this is a suffix

899 2 D - is the name of the character flag for this affix

900 3 y - the string of chars to strip off before adding affix

901 (a 0 here indicates the NULL string)

902 4 ied - the string of affix characters to add

903 5 [^aeiou]y - the conditions which must be met before the affix

904 can be applied

905

906 Field 5 is interesting. Since this is a suffix, field 5 tells us that

907 there are 2 conditions that must be met. The first condition is that

908 the next to the last character in the word must NOT be any of the

909 following "a", "e", "i", "o" or "u". The second condition is that

910 the last character of the word must end in "y".

911

912 So how can we encode this information concisely and be able to

913 test for both conditions in a fast manner? The answer is found

914 by studying the wonderful ispell code of Geoff Kuenning, et al.

915 (now available under a normal BSD license).

916

917 If we set up a conds array of 256 bytes indexed (0 to 255) and access it

918 using a character (cast to an unsigned char) of a string, we have 8 bits

919 of information we can store about that character. Specifically we

920 could use each bit to say if that character is allowed in any of the

921 last (or first for prefixes) 8 characters of the word.

922

923 Basically, each character at one end of the word (up to the number

924 of conditions) is used to index into the conds array and the resulting

925 value found there says whether that character is valid for a

926 specific character position in the word.

927

928 For prefixes, it does this by setting bit 0 if that char is valid

929 in the first position, bit 1 if valid in the second position, and so on.

930

931 If a bit is not set, then that char is not valid for that position in the

932 word.

933

934 If working with suffixes bit 0 is used for the character closest

935 to the front, bit 1 for the next character towards the end, ...,

936 with bit numconds-1 representing the last char at the end of the string.

937

938 Note: since entries in the conds[] are 8 bits, only 8 conditions

939 (read that only 8 character positions) can be examined at one

940 end of a word (the beginning for prefixes and the end for suffixes).

941

942 So to make this clearer, let's encode the conds array values for the

943 first two affentries for the suffix D described earlier.

944

945

946 For the first affentry:

947 numconds = 1 (only examine the last character)

948

949 conds['e'] = (1 << 0) (the word must end in an E)

950 all others are all 0

951

952 For the second affentry:

953 numconds = 2 (only examine the last two characters)

954

955 conds[X] = conds[X] \| (1 << 0) (aeiou are not allowed)

956 where X is all characters but a, e, i, o, or u

957

958

959 conds['y'] = (1 << 1) (the last char must be a y)

960 all other bits for all other entries in the conds array are zero

961

962

963 #endif

964

OLD	NEW

« no previous file with comments | « third_party/hunspell_new/src/hunspell/affentry.hxx ('k') | third_party/hunspell_new/src/hunspell/affixmgr.hxx » ('j') | no next file with comments »