third_party/hunspell_new/src/hunspell/suggestmgr.cxx - Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell.

Side by Side Diff: third_party/hunspell_new/src/hunspell/suggestmgr.cxx

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #include "license.hunspell"

2 #include "license.myspell"

3

4 #include <stdlib.h>

5 #include <string.h>

6 #include <stdio.h>

7 #include <ctype.h>

8

9 #include "suggestmgr.hxx"

10 #include "htypes.hxx"

11 #include "csutil.hxx"

12

13 const w_char W_VLINE = { '\0', '\|' };

14

15 #ifdef HUNSPELL_CHROME_CLIENT

16 namespace {

17 // A simple class which creates temporary hentry objects which are available

18 // only in a scope. To conceal memory operations from SuggestMgr functions,

19 // this object automatically deletes all hentry objects created through

20 // CreateScopedHashEntry() calls in its destructor. So, the following snippet

21 // raises a memory error.

22 //

23 // hentry* bad_copy = NULL;

24 // {

25 // ScopedHashEntryFactory factory;

26 // hentry* scoped_copy = factory.CreateScopedHashEntry(0, source);

27 // ...

28 // bad_copy = scoped_copy;

29 // }

30 // if (bad_copy->word[0]) // memory for scoped_copy has been deleted!

31 //

32 // As listed in the above snippet, it is simple to use this class.

33 // 1. Declare an instance of this ScopedHashEntryFactory, and;

34 // 2. Call its CreateHashEntry() member instead of using 'new hentry' or

35 // 'operator='.

36 //

37 class ScopedHashEntryFactory {

38 public:

39 ScopedHashEntryFactory();

40 ~ScopedHashEntryFactory();

41

42 // Creates a temporary copy of the given hentry struct.

43 // The returned copy is available only while this object is available.

44 // NOTE: this function just calls memcpy() in creating a copy of the given

45 // hentry struct, i.e. it does NOT copy objects referred by pointers of the

46 // given hentry struct.

47 hentry* CreateScopedHashEntry(int index, const hentry* source);

48

49 private:

50 // A struct which encapsulates the new hentry struct introduced in hunspell

51 // 1.2.8. For a pointer to an hentry struct 'h', hunspell 1.2.8 stores a word

52 // (including a NUL character) into 'h->word[0]',...,'h->word[h->blen]' even

53 // though arraysize(h->word[]) is 1. Also, it changed 'astr' to a pointer so

54 // it can store affix flags into 'h->astr[0]',...,'h->astr[alen-1]'. To handle

55 // this new hentry struct, we define a struct which combines three values: an

56 // hentry struct 'hentry'; a char array 'word[kMaxWordLen]', and; an unsigned

57 // short array 'astr' so a hentry struct 'h' returned from

58 // CreateScopedHashEntry() satisfies the following equations:

59 // hentry* h = factory.CreateScopedHashEntry(0, source);

60 // h->word[0] == ((HashEntryItem*)h)->entry.word[0].

61 // h->word[1] == ((HashEntryItem*)h)->word[0].

62 // ...

63 // h->word[h->blen] == ((HashEntryItem*)h)->word[h->blen-1].

64 // h->astr[0] == ((HashEntryItem*)h)->astr[0].

65 // h->astr[1] == ((HashEntryItem*)h)->astr[1].

66 // ...

67 // h->astr[h->alen-1] == ((HashEntryItem*)h)->astr[h->alen-1].

68 enum {

69 kMaxWordLen = 128,

70 kMaxAffixLen = 8,

71 };

72 struct HashEntryItem {

73 hentry entry;

74 char word[kMaxWordLen];

75 unsigned short astr[kMaxAffixLen];

76 };

77

78 HashEntryItem hash_items_[MAX_ROOTS];

79 };

80

81 ScopedHashEntryFactory::ScopedHashEntryFactory() {

82 memset(&hash_items_[0], 0, sizeof(hash_items_));

83 }

84

85 ScopedHashEntryFactory::~ScopedHashEntryFactory() {

86 }

87

88 hentry* ScopedHashEntryFactory::CreateScopedHashEntry(int index,

89 const hentry* source) {

90 if (index >= MAX_ROOTS \|\| source->blen >= kMaxWordLen)

91 return NULL;

92

93 // Retrieve a HashEntryItem struct from our spool, initialize it, and

94 // returns the address of its 'hentry' member.

95 size_t source_size = sizeof(hentry) + source->blen + 1;

96 HashEntryItem* hash_item = &hash_items_[index];

97 memcpy(&hash_item->entry, source, source_size);

98 if (source->astr) {

99 hash_item->entry.alen = source->alen;

100 if (hash_item->entry.alen > kMaxAffixLen)

101 hash_item->entry.alen = kMaxAffixLen;

102 memcpy(hash_item->astr, source->astr, hash_item->entry.alen * sizeof(hash_it em->astr[0]));

103 hash_item->entry.astr = &hash_item->astr[0];

104 }

105 return &hash_item->entry;

106 }

107

108 } // namespace

109 #endif

110

111

112 #ifdef HUNSPELL_CHROME_CLIENT

113 SuggestMgr::SuggestMgr(hunspell::BDictReader* reader,

114 const char * tryme, int maxn,

115 AffixMgr * aptr)

116 {

117 bdict_reader = reader;

118 #else

119 SuggestMgr::SuggestMgr(const char * tryme, int maxn,

120 AffixMgr * aptr)

121 {

122 #endif

123

124 // register affix manager and check in string of chars to

125 // try when building candidate suggestions

126 pAMgr = aptr;

127

128 csconv = NULL;

129

130 ckeyl = 0;

131 ckey = NULL;

132 ckey_utf = NULL;

133

134 ctryl = 0;

135 ctry = NULL;

136 ctry_utf = NULL;

137

138 utf8 = 0;

139 langnum = 0;

140 complexprefixes = 0;

141

142 maxSug = maxn;

143 nosplitsugs = 0;

144 maxngramsugs = MAXNGRAMSUGS;

145 maxcpdsugs = MAXCOMPOUNDSUGS;

146

147 if (pAMgr) {

148 langnum = pAMgr->get_langnum();

149 ckey = pAMgr->get_key_string();

150 nosplitsugs = pAMgr->get_nosplitsugs();

151 if (pAMgr->get_maxngramsugs() >= 0)

152 maxngramsugs = pAMgr->get_maxngramsugs();

153 utf8 = pAMgr->get_utf8();

154 if (pAMgr->get_maxcpdsugs() >= 0)

155 maxcpdsugs = pAMgr->get_maxcpdsugs();

156 if (!utf8)

157 {

158 char * enc = pAMgr->get_encoding();

159 csconv = get_current_cs(enc);

160 free(enc);

161 }

162 complexprefixes = pAMgr->get_complexprefixes();

163 }

164

165 if (ckey) {

166 if (utf8) {

167 w_char t[MAXSWL];

168 ckeyl = u8_u16(t, MAXSWL, ckey);

169 ckey_utf = (w_char ) malloc(ckeyl sizeof(w_char));

170 if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char));

171 else ckeyl = 0;

172 } else {

173 ckeyl = strlen(ckey);

174 }

175 }

176

177 if (tryme) {

178 ctry = mystrdup(tryme);

179 if (ctry) ctryl = strlen(ctry);

180 if (ctry && utf8) {

181 w_char t[MAXSWL];

182 ctryl = u8_u16(t, MAXSWL, tryme);

183 ctry_utf = (w_char ) malloc(ctryl sizeof(w_char));

184 if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char));

185 else ctryl = 0;

186 }

187 }

188 }

189

190

191 SuggestMgr::~SuggestMgr()

192 {

193 pAMgr = NULL;

194 if (ckey) free(ckey);

195 ckey = NULL;

196 if (ckey_utf) free(ckey_utf);

197 ckey_utf = NULL;

198 ckeyl = 0;

199 if (ctry) free(ctry);

200 ctry = NULL;

201 if (ctry_utf) free(ctry_utf);

202 ctry_utf = NULL;

203 ctryl = 0;

204 maxSug = 0;

205 #ifdef MOZILLA_CLIENT

206 delete [] csconv;

207 #endif

208 }

209

210 int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,

211 int * timer, clock_t * timelimit) {

212 int cwrd = 1;

213 if (ns == maxSug) return maxSug;

214 for (int k=0; k < ns; k++) {

215 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;

216 }

217 if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {

218 wlst[ns] = mystrdup(candidate);

219 if (wlst[ns] == NULL) {

220 for (int j=0; j<ns; j++) free(wlst[j]);

221 return -1;

222 }

223 ns++;

224 }

225 return ns;

226 }

227

228 // generate suggestions for a misspelled word

229 // pass in address of array of char * pointers

230 // onlycompoundsug: probably bad suggestions (need for ngram sugs, too)

231

232 int SuggestMgr::suggest(char*** slst, const char * w, int nsug,

233 int * onlycompoundsug)

234 {

235 int nocompoundtwowords = 0;

236 char ** wlst;

237 w_char word_utf[MAXSWL];

238 int wl = 0;

239 int nsugorig = nsug;

240 char w2[MAXWORDUTF8LEN];

241 const char * word = w;

242 int oldSug = 0;

243

244 // word reversing wrapper for complex prefixes

245 if (complexprefixes) {

246 strcpy(w2, w);

247 if (utf8) reverseword_utf(w2); else reverseword(w2);

248 word = w2;

249 }

250

251 if (*slst) {

252 wlst = *slst;

253 } else {

254 wlst = (char *) malloc(maxSug sizeof(char *));

255 if (wlst == NULL) return -1;

256 for (int i = 0; i < maxSug; i++) {

257 wlst[i] = NULL;

258 }

259 }

260

261 if (utf8) {

262 wl = u8_u16(word_utf, MAXSWL, word);

263 if (wl == -1) {

264 *slst = wlst;

265 return nsug;

266 }

267 }

268

269 for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest ++) {

270

271 // limit compound suggestion

272 if (cpdsuggest > 0) oldSug = nsug;

273

274 // suggestions for an uppercase word (html -> HTML)

275 if ((nsug < maxSug) && (nsug > -1)) {

276 nsug = (utf8) ? capchars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

277 capchars(wlst, word, nsug, cpdsuggest);

278 }

279

280 // perhaps we made a typical fault of spelling

281 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

282 nsug = replchars(wlst, word, nsug, cpdsuggest);

283 }

284

285 // perhaps we made chose the wrong char from a related set

286 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

287 nsug = mapchars(wlst, word, nsug, cpdsuggest);

288 }

289

290 // only suggest compound words when no other suggestion

291 if ((cpdsuggest == 0) && (nsug > nsugorig)) nocompoundtwowords=1;

292

293 // did we swap the order of chars by mistake

294 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

295 nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

296 swapchar(wlst, word, nsug, cpdsuggest);

297 }

298

299 // did we swap the order of non adjacent chars by mistake

300 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

301 nsug = (utf8) ? longswapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

302 longswapchar(wlst, word, nsug, cpdsuggest);

303 }

304

305 // did we just hit the wrong key in place of a good char (case and keyboard)

306 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

307 nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

308 badcharkey(wlst, word, nsug, cpdsuggest);

309 }

310

311 // did we add a char that should not be there

312 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

313 nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

314 extrachar(wlst, word, nsug, cpdsuggest);

315 }

316

317

318 // did we forgot a char

319 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

320 nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

321 forgotchar(wlst, word, nsug, cpdsuggest);

322 }

323

324 // did we move a char

325 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

326 nsug = (utf8) ? movechar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

327 movechar(wlst, word, nsug, cpdsuggest);

328 }

329

330 // did we just hit the wrong key in place of a good char

331 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

332 nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

333 badchar(wlst, word, nsug, cpdsuggest);

334 }

335

336 // did we double two characters

337 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcp dsugs))) {

338 nsug = (utf8) ? doubletwochars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

339 doubletwochars(wlst, word, nsug, cpdsuggest);

340 }

341

342 // perhaps we forgot to hit space and two words ran together

343 if (!nosplitsugs && (nsug < maxSug) && (nsug > -1) && (!cpdsuggest \|\| (nsug < oldSug + maxcpdsugs))) {

344 nsug = twowords(wlst, word, nsug, cpdsuggest);

345 }

346

347 } // repeating ``for'' statement compounding support

348

349 if (nsug < 0) {

350 // we ran out of memory - we should free up as much as possible

351 for (int i = 0; i < maxSug; i++)

352 if (wlst[i] != NULL) free(wlst[i]);

353 free(wlst);

354 wlst = NULL;

355 }

356

357 if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1;

358

359 *slst = wlst;

360 return nsug;

361 }

362

// generate suggestions for a word with typical mistake
// pass in address of array of char * pointers
//
//   slst - in/out: *slst is reused when non-NULL, otherwise a list with maxSug
//          NULL slots is allocated; set to NULL when the call fails
//   w    - the misspelled word
//   nsug - number of suggestions already present in the list
// Returns the number of suggestions in the list, or -1 when memory ran out.
#ifdef HUNSPELL_EXPERIMENTAL
int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
{
  int nocompoundtwowords = 0;
  char ** wlst;
  int oldSug = 0;

  char w2[MAXWORDUTF8LEN];
  const char * word = w;

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    strcpy(w2, w);
    if (utf8) reverseword_utf(w2); else reverseword(w2);
    word = w2;
  }

  if (*slst) {
    wlst = *slst;
  } else {
    wlst = (char **) malloc(maxSug * sizeof(char *));
    if (wlst == NULL) return -1;
    // the failure path below frees every non-NULL slot, so unused slots must
    // start out as NULL
    for (int i = 0; i < maxSug; i++) {
      wlst[i] = NULL;
    }
  }

  for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0); cpdsuggest++) {

    // limit compound suggestion
    if (cpdsuggest > 0) oldSug = nsug;

    // perhaps we made a typical fault of spelling
    if ((nsug < maxSug) && (nsug > -1))
      nsug = replchars(wlst, word, nsug, cpdsuggest);

    // perhaps we made chose the wrong char from a related set
    if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)))
      nsug = mapchars(wlst, word, nsug, cpdsuggest);

    if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords = 1;

    // perhaps we forgot to hit space and two words ran together

    if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)) && check_forbidden(word, strlen(word))) {
      nsug = twowords(wlst, word, nsug, cpdsuggest);
    }

  } // repeating ``for'' statement compounding support

  if (nsug < 0) {
    for (int i = 0; i < maxSug; i++)
      if (wlst[i] != NULL) free(wlst[i]);
    free(wlst);
    // do not leave the caller with a pointer to the list just released
    *slst = NULL;
    return -1;
  }

  *slst = wlst;
  return nsug;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE

423

424 // suggestions for an uppercase word (html -> HTML)

425 int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)

426 {

427 char candidate[MAXSWUTF8L];

428 w_char candidate_utf[MAXSWL];

429 memcpy(candidate_utf, word, wl * sizeof(w_char));

430 mkallcap_utf(candidate_utf, wl, langnum);

431 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

432 return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL) ;

433 }

434

435 // suggestions for an uppercase word (html -> HTML)

436 int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest)

437 {

438 char candidate[MAXSWUTF8L];

439 strcpy(candidate, word);

440 mkallcap(candidate, csconv);

441 return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL) ;

442 }

443

444 // suggestions for when chose the wrong char out of a related set

445 int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)

446 {

447 char candidate[MAXSWUTF8L];

448 clock_t timelimit;

449 int timer;

450 candidate[0] = '\0';

451

452 int wl = strlen(word);

453 if (wl < 2 \|\| ! pAMgr) return ns;

454

455 int nummap = pAMgr->get_nummap();

456 struct mapentry* maptable = pAMgr->get_maptable();

457 if (maptable==NULL) return ns;

458

459 timelimit = clock();

460 timer = MINTIMER;

461 return map_related(word, (char *) &candidate, 0, 0, wlst, cpdsuggest, ns, mapt able, nummap, &timer, &timelimit);

462 }

463

464 int SuggestMgr::map_related(const char * word, char * candidate, int wn, int cn,

465 char** wlst, int cpdsuggest, int ns,

466 const mapentry* maptable, int nummap, int * timer, clock_t * timelimit)

467 {

468 if (*(word + wn) == '\0') {

469 int cwrd = 1;

470 *(candidate + cn) = '\0';

471 int wl = strlen(candidate);

472 for (int m=0; m < ns; m++)

473 if (strcmp(candidate, wlst[m]) == 0) cwrd = 0;

474 if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {

475 if (ns < maxSug) {

476 wlst[ns] = mystrdup(candidate);

477 if (wlst[ns] == NULL) return -1;

478 ns++;

479 }

480 }

481 return ns;

482 }

483 int in_map = 0;

484 for (int j = 0; j < nummap; j++) {

485 for (int k = 0; k < maptable[j].len; k++) {

486 int len = strlen(maptable[j].set[k]);

487 if (strncmp(maptable[j].set[k], word + wn, len) == 0) {

488 in_map = 1;

489 for (int l = 0; l < maptable[j].len; l++) {

490 strcpy(candidate + cn, maptable[j].set[l]);

491 ns = map_related(word, candidate, wn + len, strlen(candidate), wlst,

492 cpdsuggest, ns, maptable, nummap, timer, timelimit);

493 if (!(*timer)) return ns;

494 }

495 }

496 }

497 }

498 if (!in_map) {

499 (candidate + cn) = (word + wn);

500 ns = map_related(word, candidate, wn + 1, cn + 1, wlst, cpdsuggest,

501 ns, maptable, nummap, timer, timelimit);

502 }

503 return ns;

504 }

505

506 // suggestions for a typical fault of spelling, that

507 // differs with more, than 1 letter from the right form.

508 int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest )

509 {

510 char candidate[MAXSWUTF8L];

511 const char * r;

512 int lenr, lenp;

513 int wl = strlen(word);

514 if (wl < 2 \|\| ! pAMgr) return ns;

515

516 #ifdef HUNSPELL_CHROME_CLIENT

517 const char pattern, pattern2;

518 hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator( );

519 while (iterator.GetNext(&pattern, &pattern2)) {

520 r = word;

521 lenr = strlen(pattern2);

522 lenp = strlen(pattern);

523

524 // search every occurence of the pattern in the word

525 while ((r=strstr(r, pattern)) != NULL) {

526 strcpy(candidate, word);

527 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;

528 strcpy(candidate+(r-word), pattern2);

529 strcpy(candidate+(r-word)+lenr, r+lenp);

530 ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL );

531 if (ns == -1) return -1;

532 // check REP suggestions with space

533 char * sp = strchr(candidate, ' ');

534 if (sp) {

535 char * prev = candidate;

536 while (sp) {

537 *sp = '\0';

538 if (checkword(prev, strlen(prev), 0, NULL, NULL)) {

539 int oldns = ns;

540 *sp = ' ';

541 ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);

542 if (ns == -1) return -1;

543 if (oldns < ns) {

544 free(wlst[ns - 1]);

545 wlst[ns - 1] = mystrdup(candidate);

546 if (!wlst[ns - 1]) return -1;

547 }

548 }

549 *sp = ' ';

550 prev = sp + 1;

551 sp = strchr(prev, ' ');

552 }

553 }

554 r++; // search for the next letter

555 }

556 }

557 #else

558 int numrep = pAMgr->get_numrep();

559 struct replentry* reptable = pAMgr->get_reptable();

560 if (reptable==NULL) return ns;

561 for (int i=0; i < numrep; i++ ) {

562 r = word;

563 lenr = strlen(reptable[i].pattern2);

564 lenp = strlen(reptable[i].pattern);

565 // search every occurence of the pattern in the word

566 while ((r=strstr(r, reptable[i].pattern)) != NULL && (!reptable[i].end \|\| strlen(r) == strlen(reptable[i].pattern)) &&

567 (!reptable[i].start \|\| r == word)) {

568 strcpy(candidate, word);

569 if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;

570 strcpy(candidate+(r-word),reptable[i].pattern2);

571 strcpy(candidate+(r-word)+lenr, r+lenp);

572 ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL );

573 if (ns == -1) return -1;

574 // check REP suggestions with space

575 char * sp = strchr(candidate, ' ');

576 if (sp) {

577 char * prev = candidate;

578 while (sp) {

579 *sp = '\0';

580 if (checkword(prev, strlen(prev), 0, NULL, NULL)) {

581 int oldns = ns;

582 *sp = ' ';

583 ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);

584 if (ns == -1) return -1;

585 if (oldns < ns) {

586 free(wlst[ns - 1]);

587 wlst[ns - 1] = mystrdup(candidate);

588 if (!wlst[ns - 1]) return -1;

589 }

590 }

591 *sp = ' ';

592 prev = sp + 1;

593 sp = strchr(prev, ' ');

594 }

595 }

596 r++; // search for the next letter

597 }

598 }

599 #endif

600 return ns;

601 }

602

603 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)

604 int SuggestMgr::doubletwochars(char** wlst, const char * word, int ns, int cpdsu ggest)

605 {

606 char candidate[MAXSWUTF8L];

607 int state=0;

608 int wl = strlen(word);

609 if (wl < 5 \|\| ! pAMgr) return ns;

610 for (int i=2; i < wl; i++ ) {

611 if (word[i]==word[i-2]) {

612 state++;

613 if (state==3) {

614 strcpy(candidate,word);

615 strcpy(candidate+i-1,word+i+1);

616 ns = testsug(wlst, candidate, wl-2, ns, cpdsuggest, NULL, NULL);

617 if (ns == -1) return -1;

618 state=0;

619 }

620 } else {

621 state=0;

622 }

623 }

624 return ns;

625 }

626

627 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)

628 int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, in t ns, int cpdsuggest)

629 {

630 w_char candidate_utf[MAXSWL];

631 char candidate[MAXSWUTF8L];

632 int state=0;

633 if (wl < 5 \|\| ! pAMgr) return ns;

634 for (int i=2; i < wl; i++) {

635 if (w_char_eq(word[i], word[i-2])) {

636 state++;

637 if (state==3) {

638 memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));

639 memcpy(candidate_utf+i-1, word+i+1, (wl-i-1) * sizeof(w_char));

640 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl-2);

641 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NUL L, NULL);

642 if (ns == -1) return -1;

643 state=0;

644 }

645 } else {

646 state=0;

647 }

648 }

649 return ns;

650 }

651

652 // error is wrong char in place of correct one (case and keyboard related versio n)

653 int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsugge st)

654 {

655 char tmpc;

656 char candidate[MAXSWUTF8L];

657 int wl = strlen(word);

658 strcpy(candidate, word);

659 // swap out each char one by one and try uppercase and neighbor

660 // keyboard chars in its place to see if that makes a good word

661

662 for (int i=0; i < wl; i++) {

663 tmpc = candidate[i];

664 // check with uppercase letters

665 candidate[i] = csconv[((unsigned char)tmpc)].cupper;

666 if (tmpc != candidate[i]) {

667 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

668 if (ns == -1) return -1;

669 candidate[i] = tmpc;

670 }

671 // check neighbor characters in keyboard string

672 if (!ckey) continue;

673 char * loc = strchr(ckey, tmpc);

674 while (loc) {

675 if ((loc > ckey) && (*(loc - 1) != '\|')) {

676 candidate[i] = *(loc - 1);

677 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

678 if (ns == -1) return -1;

679 }

680 if (((loc + 1) != '\|') && ((loc + 1) != '\0')) {

681 candidate[i] = *(loc + 1);

682 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

683 if (ns == -1) return -1;

684 }

685 loc = strchr(loc + 1, tmpc);

686 }

687 candidate[i] = tmpc;

688 }

689 return ns;

690 }

691

692 // error is wrong char in place of correct one (case and keyboard related versio n)

693 int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns , int cpdsuggest)

694 {

695 w_char tmpc;

696 w_char candidate_utf[MAXSWL];

697 char candidate[MAXSWUTF8L];

698 memcpy(candidate_utf, word, wl * sizeof(w_char));

699 // swap out each char one by one and try all the tryme

700 // chars in its place to see if that makes a good word

701 for (int i=0; i < wl; i++) {

702 tmpc = candidate_utf[i];

703 // check with uppercase letters

704 mkallcap_utf(candidate_utf + i, 1, langnum);

705 if (!w_char_eq(tmpc, candidate_utf[i])) {

706 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

707 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NU LL);

708 if (ns == -1) return -1;

709 candidate_utf[i] = tmpc;

710 }

711 // check neighbor characters in keyboard string

712 if (!ckey) continue;

713 w_char * loc = ckey_utf;

714 while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++;

715 while (loc < (ckey_utf + ckeyl)) {

716 if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {

717 candidate_utf[i] = *(loc - 1);

718 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

719 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

720 if (ns == -1) return -1;

721 }

722 if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {

723 candidate_utf[i] = *(loc + 1);

724 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

725 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

726 if (ns == -1) return -1;

727 }

728 do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc) );

729 }

730 candidate_utf[i] = tmpc;

731 }

732 return ns;

733 }

734

735 // error is wrong char in place of correct one

736 int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)

737 {

738 char tmpc;

739 char candidate[MAXSWUTF8L];

740 clock_t timelimit = clock();

741 int timer = MINTIMER;

742 int wl = strlen(word);

743 strcpy(candidate, word);

744 // swap out each char one by one and try all the tryme

745 // chars in its place to see if that makes a good word

746 for (int j=0; j < ctryl; j++) {

747 for (int i=wl-1; i >= 0; i--) {

748 tmpc = candidate[i];

749 if (ctry[j] == tmpc) continue;

750 candidate[i] = ctry[j];

751 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);

752 if (ns == -1) return -1;

753 if (!timer) return ns;

754 candidate[i] = tmpc;

755 }

756 }

757 return ns;

758 }

759

760 // error is wrong char in place of correct one

761 int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, i nt cpdsuggest)

762 {

763 w_char tmpc;

764 w_char candidate_utf[MAXSWL];

765 char candidate[MAXSWUTF8L];

766 clock_t timelimit = clock();

767 int timer = MINTIMER;

768 memcpy(candidate_utf, word, wl * sizeof(w_char));

769 // swap out each char one by one and try all the tryme

770 // chars in its place to see if that makes a good word

771 for (int j=0; j < ctryl; j++) {

772 for (int i=wl-1; i >= 0; i--) {

773 tmpc = candidate_utf[i];

774 if (w_char_eq(tmpc, ctry_utf[j])) continue;

775 candidate_utf[i] = ctry_utf[j];

776 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

777 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);

778 if (ns == -1) return -1;

779 if (!timer) return ns;

780 candidate_utf[i] = tmpc;

781 }

782 }

783 return ns;

784 }

785

786 // error is word has an extra letter it does not need

787 int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)

788 {

789 char candidate[MAXSWUTF8L];

790 w_char candidate_utf[MAXSWL];

791 w_char * p;

792 w_char tmpc = W_VLINE; // not used value, only for VCC warning message

793 if (wl < 2) return ns;

794 // try omitting one char of word at a time

795 memcpy(candidate_utf, word, wl * sizeof(w_char));

796 for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {

797 w_char tmpc2 = *p;

798 if (p < candidate_utf + wl - 1) *p = tmpc;

799 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);

800 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NU LL);

801 if (ns == -1) return -1;

802 tmpc = tmpc2;

803 }

804 return ns;

805 }

806

807 // error is word has an extra letter it does not need

808 int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest )

809 {

810 char tmpc = '\0';

811 char candidate[MAXSWUTF8L];

812 char * p;

813 int wl = strlen(word);

814 if (wl < 2) return ns;

815 // try omitting one char of word at a time

816 strcpy (candidate, word);

817 for (p = candidate + wl - 1; p >=candidate; p--) {

818 char tmpc2 = *p;

819 *p = tmpc;

820 ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL);

821 if (ns == -1) return -1;

822 tmpc = tmpc2;

823 }

824 return ns;

825 }

826

827 // error is missing a letter it needs

828 int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsugge st)

829 {

830 // TODO(rouslan): Remove the interim change below when this patch lands:

831 // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395

832 char candidate[MAXSWUTF8L + 4];

833 char * p;

834 clock_t timelimit = clock();

835 int timer = MINTIMER;

836 int wl = strlen(word);

837 // try inserting a tryme character before every letter (and the null terminat or)

838 for (int i = 0; i < ctryl; i++) {

839 strcpy(candidate, word);

840 for (p = candidate + wl; p >= candidate; p--) {

841 (p+1) = p;

842 *p = ctry[i];

843 ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit) ;

844 if (ns == -1) return -1;

845 if (!timer) return ns;

846 }

847 }

848 return ns;

849 }

850

851 // error is missing a letter it needs

852 int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns , int cpdsuggest)

853 {

854 // TODO(rouslan): Remove the interim change below when this patch lands:

855 // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395

856 w_char candidate_utf[MAXSWL + 1];

857 char candidate[MAXSWUTF8L + 4];

858 w_char * p;

859 clock_t timelimit = clock();

860 int timer = MINTIMER;

861 // try inserting a tryme character at the end of the word and before every le tter

862 for (int i = 0; i < ctryl; i++) {

863 memcpy (candidate_utf, word, wl * sizeof(w_char));

864 for (p = candidate_utf + wl; p >= candidate_utf; p--) {

865 (p + 1) = p;

866 *p = ctry_utf[i];

867 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);

868 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer , &timelimit);

869 if (ns == -1) return -1;

870 if (!timer) return ns;

871 }

872 }

873 return ns;

874 }

875

876

877 /* error is should have been two words */

878 int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest )

879 {

880 char candidate[MAXSWUTF8L];

881 char * p;

882 int c1, c2;

883 int forbidden = 0;

884 int cwrd;

885

886 int wl=strlen(word);

887 if (wl < 3) return ns;

888

889 if (langnum == LANG_hu) forbidden = check_forbidden(word, wl);

890

891 strcpy(candidate + 1, word);

892 // split the string into two pieces after every char

893 // if both pieces are good words make them a suggestion

894 for (p = candidate + 1; p[1] != '\0'; p++) {

895 p[-1] = *p;

896 // go to end of the UTF-8 character

897 while (utf8 && ((p[1] & 0xc0) == 0x80)) {

898 *p = p[1];

899 p++;

900 }

901 if (utf8 && p[1] == '\0') break; // last UTF-8 character

902 *p = '\0';

903 c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL);

904 if (c1) {

905 c2 = checkword((p+1),strlen(p+1), cpdsuggest, NULL, NULL);

906 if (c2) {

907 *p = ' ';

908

909 // spec. Hungarian code (need a better compound word support)

910 if ((langnum == LANG_hu) && !forbidden &&

911 // if 3 repeating letter, use - instead of space

912 (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) \|\| ( p[-1] == p[2]))) \|\|

913 // or multiple compounding, with more, than 6 syllables

914 ((c1 == 3) && (c2 >= 2)))) *p = '-';

915

916 cwrd = 1;

917 for (int k=0; k < ns; k++)

918 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;

919 if (ns < maxSug) {

920 if (cwrd) {

921 wlst[ns] = mystrdup(candidate);

922 if (wlst[ns] == NULL) return -1;

923 ns++;

924 }

925 } else return ns;

926 // add two word suggestion with dash, if TRY string contains

927 // "a" or "-"

928 // NOTE: cwrd doesn't modified for REP twoword sugg.

929 if (ctry && (strchr(ctry, 'a') \|\| strchr(ctry, '-')) &&

930 mystrlen(p + 1) > 1 &&

931 mystrlen(candidate) - mystrlen(p) > 1) {

932 *p = '-';

933 for (int k=0; k < ns; k++)

934 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;

935 if (ns < maxSug) {

936 if (cwrd) {

937 wlst[ns] = mystrdup(candidate);

938 if (wlst[ns] == NULL) return -1;

939 ns++;

940 }

941 } else return ns;

942 }

943 }

944 }

945 }

946 return ns;

947 }

948

949

950 // error is adjacent letter were swapped

951 int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest )

952 {

953 char candidate[MAXSWUTF8L];

954 char * p;

955 char tmpc;

956 int wl=strlen(word);

957 // try swapping adjacent chars one by one

958 strcpy(candidate, word);

959 for (p = candidate; p[1] != 0; p++) {

960 tmpc = *p;

961 *p = p[1];

962 p[1] = tmpc;

963 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

964 if (ns == -1) return -1;

965 p[1] = *p;

966 *p = tmpc;

967 }

968 // try double swaps for short words

969 // ahev -> have, owudl -> would

970 if (wl == 4 \|\| wl == 5) {

971 candidate[0] = word[1];

972 candidate[1] = word[0];

973 candidate[2] = word[2];

974 candidate[wl - 2] = word[wl - 1];

975 candidate[wl - 1] = word[wl - 2];

976 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

977 if (ns == -1) return -1;

978 if (wl == 5) {

979 candidate[0] = word[0];

980 candidate[1] = word[2];

981 candidate[2] = word[1];

982 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

983 if (ns == -1) return -1;

984 }

985 }

986 return ns;

987 }

988

989 // error is adjacent letter were swapped

990 int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)

991 {

992 w_char candidate_utf[MAXSWL];

993 char candidate[MAXSWUTF8L];

994 w_char * p;

995 w_char tmpc;

996 int len = 0;

997 // try swapping adjacent chars one by one

998 memcpy (candidate_utf, word, wl * sizeof(w_char));

999 for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {

1000 tmpc = *p;

1001 *p = p[1];

1002 p[1] = tmpc;

1003 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

1004 if (len == 0) len = strlen(candidate);

1005 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);

1006 if (ns == -1) return -1;

1007 p[1] = *p;

1008 *p = tmpc;

1009 }

1010 // try double swaps for short words

1011 // ahev -> have, owudl -> would, suodn -> sound

1012 if (wl == 4 \|\| wl == 5) {

1013 candidate_utf[0] = word[1];

1014 candidate_utf[1] = word[0];

1015 candidate_utf[2] = word[2];

1016 candidate_utf[wl - 2] = word[wl - 1];

1017 candidate_utf[wl - 1] = word[wl - 2];

1018 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

1019 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);

1020 if (ns == -1) return -1;

1021 if (wl == 5) {

1022 candidate_utf[0] = word[0];

1023 candidate_utf[1] = word[2];

1024 candidate_utf[2] = word[1];

1025 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

1026 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);

1027 if (ns == -1) return -1;

1028 }

1029 }

1030 return ns;

1031 }

1032

1033 // error is not adjacent letter were swapped

1034 int SuggestMgr::longswapchar(char ** wlst, const char * word, int ns, int cpdsug gest)

1035 {

1036 char candidate[MAXSWUTF8L];

1037 char * p;

1038 char * q;

1039 char tmpc;

1040 int wl=strlen(word);

1041 // try swapping not adjacent chars one by one

1042 strcpy(candidate, word);

1043 for (p = candidate; *p != 0; p++) {

1044 for (q = candidate; *q != 0; q++) {

1045 if (abs((int)(p-q)) > 1) {

1046 tmpc = *p;

1047 p = q;

1048 *q = tmpc;

1049 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

1050 if (ns == -1) return -1;

1051 q = p;

1052 *p = tmpc;

1053 }

1054 }

1055 }

1056 return ns;

1057 }

1058

1059

1060 // error is adjacent letter were swapped

1061 int SuggestMgr::longswapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)

1062 {

1063 w_char candidate_utf[MAXSWL];

1064 char candidate[MAXSWUTF8L];

1065 w_char * p;

1066 w_char * q;

1067 w_char tmpc;

1068 // try swapping not adjacent chars

1069 memcpy (candidate_utf, word, wl * sizeof(w_char));

1070 for (p = candidate_utf; p < (candidate_utf + wl); p++) {

1071 for (q = candidate_utf; q < (candidate_utf + wl); q++) {

1072 if (abs((int)(p-q)) > 1) {

1073 tmpc = *p;

1074 p = q;

1075 *q = tmpc;

1076 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

1077 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

1078 if (ns == -1) return -1;

1079 q = p;

1080 *p = tmpc;

1081 }

1082 }

1083 }

1084 return ns;

1085 }

1086

1087 // error is a letter was moved

1088 int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest )

1089 {

1090 char candidate[MAXSWUTF8L];

1091 char * p;

1092 char * q;

1093 char tmpc;

1094

1095 int wl=strlen(word);

1096 // try moving a char

1097 strcpy(candidate, word);

1098 for (p = candidate; *p != 0; p++) {

1099 for (q = p + 1; (*q != 0) && ((q - p) < 10); q++) {

1100 tmpc = *(q-1);

1101 (q-1) = q;

1102 *q = tmpc;

1103 if ((q-p) < 2) continue; // omit swap char

1104 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

1105 if (ns == -1) return -1;

1106 }

1107 strcpy(candidate, word);

1108 }

1109 for (p = candidate + wl - 1; p > candidate; p--) {

1110 for (q = p - 1; (q >= candidate) && ((p - q) < 10); q--) {

1111 tmpc = *(q+1);

1112 (q+1) = q;

1113 *q = tmpc;

1114 if ((p-q) < 2) continue; // omit swap char

1115 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

1116 if (ns == -1) return -1;

1117 }

1118 strcpy(candidate, word);

1119 }

1120 return ns;

1121 }

1122

1123 // error is a letter was moved

1124 int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)

1125 {

1126 w_char candidate_utf[MAXSWL];

1127 char candidate[MAXSWUTF8L];

1128 w_char * p;

1129 w_char * q;

1130 w_char tmpc;

1131 // try moving a char

1132 memcpy (candidate_utf, word, wl * sizeof(w_char));

1133 for (p = candidate_utf; p < (candidate_utf + wl); p++) {

1134 for (q = p + 1; (q < (candidate_utf + wl)) && ((q - p) < 10); q++) {

1135 tmpc = *(q-1);

1136 (q-1) = q;

1137 *q = tmpc;

1138 if ((q-p) < 2) continue; // omit swap char

1139 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

1140 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

1141 if (ns == -1) return -1;

1142 }

1143 memcpy (candidate_utf, word, wl * sizeof(w_char));

1144 }

1145 for (p = candidate_utf + wl - 1; p > candidate_utf; p--) {

1146 for (q = p - 1; (q >= candidate_utf) && ((p - q) < 10); q--) {

1147 tmpc = *(q+1);

1148 (q+1) = q;

1149 *q = tmpc;

1150 if ((p-q) < 2) continue; // omit swap char

1151 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

1152 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

1153 if (ns == -1) return -1;

1154 }

1155 memcpy (candidate_utf, word, wl * sizeof(w_char));

1156 }

1157 return ns;

1158 }

1159

1160 // generate a set of suggestions for very poorly spelled words

1161 int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md )

1162 {

1163

1164 int i, j;

1165 int lval;

1166 int sc, scphon;

1167 int lp, lpphon;

1168 int nonbmp = 0;

1169

1170 // exhaustively search through all root words

1171 // keeping track of the MAX_ROOTS most similar root words

1172 struct hentry * roots[MAX_ROOTS];

1173 char * rootsphon[MAX_ROOTS];

1174 int scores[MAX_ROOTS];

1175 int scoresphon[MAX_ROOTS];

1176 for (i = 0; i < MAX_ROOTS; i++) {

1177 roots[i] = NULL;

1178 scores[i] = -100 * i;

1179 rootsphon[i] = NULL;

1180 scoresphon[i] = -100 * i;

1181 }

1182 lp = MAX_ROOTS - 1;

1183 lpphon = MAX_ROOTS - 1;

1184 scphon = -20000;

1185 int low = NGRAM_LOWERING;

1186

1187 char w2[MAXWORDUTF8LEN];

1188 char f[MAXSWUTF8L];

1189 char * word = w;

1190

1191 // word reversing wrapper for complex prefixes

1192 if (complexprefixes) {

1193 strcpy(w2, w);

1194 if (utf8) reverseword_utf(w2); else reverseword(w2);

1195 word = w2;

1196 }

1197

1198 char mw[MAXSWUTF8L];

1199 w_char u8[MAXSWL];

1200 int nc = strlen(word);

1201 int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;

1202

1203 // set character based ngram suggestion for words with non-BMP Unicode charact ers

1204 if (n == -1) {

1205 utf8 = 0; // XXX not state-free

1206 n = nc;

1207 nonbmp = 1;

1208 low = 0;

1209 }

1210

1211 struct hentry* hp = NULL;

1212 int col = -1;

1213 #ifdef HUNSPELL_CHROME_CLIENT

1214 ScopedHashEntryFactory hash_entry_factory;

1215 #endif

1216 phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;

1217 char target[MAXSWUTF8L];

1218 char candidate[MAXSWUTF8L];

1219 if (ph) {

1220 if (utf8) {

1221 w_char _w[MAXSWL];

1222 int _wl = u8_u16(_w, MAXSWL, word);

1223 mkallcap_utf(_w, _wl, langnum);

1224 u16_u8(candidate, MAXSWUTF8L, _w, _wl);

1225 } else {

1226 strcpy(candidate, word);

1227 if (!nonbmp) mkallcap(candidate, csconv);

1228 }

1229 phonet(candidate, target, nc, *ph); // XXX phonet() is 8-bit (nc, not n)

1230 }

1231

1232 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;

1233 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL;

1234 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;

1235 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;

1236

1237 for (i = 0; i < md; i++) {

1238 while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) {

1239 if ((hp->astr) && (pAMgr) &&

1240 (TESTAFF(hp->astr, forbiddenword, hp->alen) \|\|

1241 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) \|\|

1242 TESTAFF(hp->astr, nosuggest, hp->alen) \|\|

1243 TESTAFF(hp->astr, nongramsuggest, hp->alen) \|\|

1244 TESTAFF(hp->astr, onlyincompound, hp->alen))) continue;

1245

1246 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +

1247 leftcommonsubstring(word, HENTRY_WORD(hp));

1248

1249 // check special pronounciation

1250 if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {

1251 int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +

1252 + leftcommonsubstring(word, f);

1253 if (sc2 > sc) sc = sc2;

1254 }

1255

1256 scphon = -20000;

1257 if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) {

1258 char target2[MAXSWUTF8L];

1259 if (utf8) {

1260 w_char _w[MAXSWL];

1261 int _wl = u8_u16(_w, MAXSWL, HENTRY_WORD(hp));

1262 mkallcap_utf(_w, _wl, langnum);

1263 u16_u8(candidate, MAXSWUTF8L, _w, _wl);

1264 } else {

1265 strcpy(candidate, HENTRY_WORD(hp));

1266 mkallcap(candidate, csconv);

1267 }

1268 phonet(candidate, target2, -1, *ph);

1269 scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);

1270 }

1271

1272 if (sc > scores[lp]) {

1273 scores[lp] = sc;

1274 #ifdef HUNSPELL_CHROME_CLIENT

1275 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp);

1276 #else

1277 roots[lp] = hp;

1278 #endif

1279 lval = sc;

1280 for (j=0; j < MAX_ROOTS; j++)

1281 if (scores[j] < lval) {

1282 lp = j;

1283 lval = scores[j];

1284 }

1285 }

1286

1287

1288 if (scphon > scoresphon[lpphon]) {

1289 scoresphon[lpphon] = scphon;

1290 rootsphon[lpphon] = HENTRY_WORD(hp);

1291 lval = scphon;

1292 for (j=0; j < MAX_ROOTS; j++)

1293 if (scoresphon[j] < lval) {

1294 lpphon = j;

1295 lval = scoresphon[j];

1296 }

1297 }

1298 }}

1299

1300 // find minimum threshold for a passable suggestion

1301 // mangle original word three differnt ways

1302 // and score them to generate a minimum acceptable score

1303 int thresh = 0;

1304 for (int sp = 1; sp < 4; sp++) {

1305 if (utf8) {

1306 for (int k=sp; k < n; k+=4) ((unsigned short ) u8 + k) = '*';

1307 u16_u8(mw, MAXSWUTF8L, u8, n);

1308 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);

1309 } else {

1310 strcpy(mw, word);

1311 for (int k=sp; k < n; k+=4) (mw + k) = '';

1312 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);

1313 }

1314 }

1315 thresh = thresh / 3;

1316 thresh--;

1317

1318 // now expand affixes on each of these root words and

1319 // and use length adjusted ngram scores to select

1320 // possible suggestions

1321 char * guess[MAX_GUESS];

1322 char * guessorig[MAX_GUESS];

1323 int gscore[MAX_GUESS];

1324 for(i=0;i<MAX_GUESS;i++) {

1325 guess[i] = NULL;

1326 guessorig[i] = NULL;

1327 gscore[i] = -100 * i;

1328 }

1329

1330 lp = MAX_GUESS - 1;

1331

1332 struct guessword * glst;

1333 glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));

1334 if (! glst) {

1335 if (nonbmp) utf8 = 1;

1336 return ns;

1337 }

1338

1339 for (i = 0; i < MAX_ROOTS; i++) {

1340 if (roots[i]) {

1341 struct hentry * rp = roots[i];

1342 int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->bl en,

1343 rp->astr, rp->alen, word, nc,

1344 ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MOR PH_PHON) : NULL));

1345

1346 for (int k = 0; k < nw ; k++) {

1347 sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +

1348 leftcommonsubstring(word, glst[k].word);

1349

1350 if (sc > thresh) {

1351 if (sc > gscore[lp]) {

1352 if (guess[lp]) {

1353 free (guess[lp]);

1354 if (guessorig[lp]) {

1355 free(guessorig[lp]);

1356 guessorig[lp] = NULL;

1357 }

1358 }

1359 gscore[lp] = sc;

1360 guess[lp] = glst[k].word;

1361 guessorig[lp] = glst[k].orig;

1362 lval = sc;

1363 for (j=0; j < MAX_GUESS; j++)

1364 if (gscore[j] < lval) {

1365 lp = j;

1366 lval = gscore[j];

1367 }

1368 } else {

1369 free(glst[k].word);

1370 if (glst[k].orig) free(glst[k].orig);

1371 }

1372 } else {

1373 free(glst[k].word);

1374 if (glst[k].orig) free(glst[k].orig);

1375 }

1376 }

1377 }

1378 }

1379 free(glst);

1380

1381 // now we are done generating guesses

1382 // sort in order of decreasing score

1383

1384

1385 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);

1386 if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);

1387

1388 // weight suggestions with a similarity index, based on

1389 // the longest common subsequent algorithm and resort

1390

1391 int is_swap = 0;

1392 int re = 0;

1393 double fact = 1.0;

1394 if (pAMgr) {

1395 int maxd = pAMgr->get_maxdiff();

1396 if (maxd >= 0) fact = (10.0 - maxd)/5.0;

1397 }

1398

1399 for (i=0; i < MAX_GUESS; i++) {

1400 if (guess[i]) {

1401 // lowering guess[i]

1402 char gl[MAXSWUTF8L];

1403 int len;

1404 if (utf8) {

1405 w_char _w[MAXSWL];

1406 len = u8_u16(_w, MAXSWL, guess[i]);

1407 mkallsmall_utf(_w, len, langnum);

1408 u16_u8(gl, MAXSWUTF8L, _w, len);

1409 } else {

1410 strcpy(gl, guess[i]);

1411 if (!nonbmp) mkallsmall(gl, csconv);

1412 len = strlen(guess[i]);

1413 }

1414

1415 int _lcs = lcslen(word, gl);

1416

1417 // same characters with different casing

1418 if ((n == len) && (n == _lcs)) {

1419 gscore[i] += 2000;

1420 break;

1421 }

1422 // using 2-gram instead of 3, and other weightening

1423

1424 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +

1425 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);

1426

1427 gscore[i] =

1428 // length of longest common subsequent minus length difference

1429 2 * _lcs - abs((int) (n - len)) +

1430 // weight length of the left common substring

1431 leftcommonsubstring(word, gl) +

1432 // weight equal character positions

1433 (!nonbmp && commoncharacterpositions(word, gl, &is_swap) ? 1: 0) +

1434 // swap character (not neighboring)

1435 ((is_swap) ? 10 : 0) +

1436 // ngram

1437 ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +

1438 // weighted ngrams

1439 re +

1440 // different limit for dictionaries with PHONE rules

1441 (ph ? (re < len * fact ? -1000 : 0) : (re < (n + len)*fact? -1000 : 0) );

1442 }

1443 }

1444

1445 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);

1446

1447 // phonetic version

1448 if (ph) for (i=0; i < MAX_ROOTS; i++) {

1449 if (rootsphon[i]) {

1450 // lowering rootphon[i]

1451 char gl[MAXSWUTF8L];

1452 int len;

1453 if (utf8) {

1454 w_char _w[MAXSWL];

1455 len = u8_u16(_w, MAXSWL, rootsphon[i]);

1456 mkallsmall_utf(_w, len, langnum);

1457 u16_u8(gl, MAXSWUTF8L, _w, len);

1458 } else {

1459 strcpy(gl, rootsphon[i]);

1460 if (!nonbmp) mkallsmall(gl, csconv);

1461 len = strlen(rootsphon[i]);

1462 }

1463

1464 // heuristic weigthing of ngram scores

1465 scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) +

1466 // weight length of the left common substring

1467 leftcommonsubstring(word, gl);

1468 }

1469 }

1470

1471 if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);

1472

1473 // copy over

1474 int oldns = ns;

1475

1476 int same = 0;

1477 for (i=0; i < MAX_GUESS; i++) {

1478 if (guess[i]) {

1479 if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same \|\| (gscore[i] > 1000))) {

1480 int unique = 1;

1481 // leave only excellent suggestions, if exists

1482 if (gscore[i] > 1000) same = 1; else if (gscore[i] < -100) {

1483 same = 1;

1484 // keep the best ngram suggestions, unless in ONLYMAXDIFF mode

1485 if (ns > oldns \|\| (pAMgr && pAMgr->get_onlymaxdiff())) {

1486 free(guess[i]);

1487 if (guessorig[i]) free(guessorig[i]);

1488 continue;

1489 }

1490 }

1491 for (j = 0; j < ns; j++) {

1492 // don't suggest previous suggestions or a previous suggestion with pr efixes or affixes

1493 if ((!guessorig[i] && strstr(guess[i], wlst[j])) \|\|

1494 (guessorig[i] && strstr(guessorig[i], wlst[j])) \|\|

1495 // check forbidden words

1496 !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;

1497 }

1498 if (unique) {

1499 wlst[ns++] = guess[i];

1500 if (guessorig[i]) {

1501 free(guess[i]);

1502 wlst[ns-1] = guessorig[i];

1503 }

1504 } else {

1505 free(guess[i]);

1506 if (guessorig[i]) free(guessorig[i]);

1507 }

1508 } else {

1509 free(guess[i]);

1510 if (guessorig[i]) free(guessorig[i]);

1511 }

1512 }

1513 }

1514

1515 oldns = ns;

1516 if (ph) for (i=0; i < MAX_ROOTS; i++) {

1517 if (rootsphon[i]) {

1518 if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) {

1519 int unique = 1;

1520 for (j = 0; j < ns; j++) {

1521 // don't suggest previous suggestions or a previous suggestion with pr efixes or affixes

1522 if (strstr(rootsphon[i], wlst[j]) \|\|

1523 // check forbidden words

1524 !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) uniqu e = 0;

1525 }

1526 if (unique) {

1527 wlst[ns++] = mystrdup(rootsphon[i]);

1528 if (!wlst[ns - 1]) return ns - 1;

1529 }

1530 }

1531 }

1532 }

1533

1534 if (nonbmp) utf8 = 1;

1535 return ns;

1536 }

1537

1538

1539 // see if a candidate suggestion is spelled correctly

1540 // needs to check both root words and words with affixes

1541

1542 // obsolote MySpell-HU modifications:

1543 // return value 2 and 3 marks compounding with hyphen (-)

1544 // `3' marks roots without suffix

1545 int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time r, clock_t * timelimit)

1546 {

1547 struct hentry * rv=NULL;

1548 struct hentry * rv2=NULL;

1549 int nosuffix = 0;

1550

1551 // check time limit

1552 if (timer) {

1553 (*timer)--;

1554 if (!(*timer) && timelimit) {

1555 if ((clock() - *timelimit) > TIMELIMIT) return 0;

1556 *timer = MAXPLUSTIMER;

1557 }

1558 }

1559

1560 if (pAMgr) {

1561 if (cpdsuggest==1) {

1562 if (pAMgr->get_compound()) {

1563 rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1, 0); //EX T

1564 if (rv && (!(rv2 = pAMgr->lookup(word)) \|\| !rv2->astr \|\|

1565 !(TESTAFF(rv2->astr,pAMgr->get_forbiddenword(),rv2->alen) \|\|

1566 TESTAFF(rv2->astr,pAMgr->get_nosuggest(),rv2->alen)))) return 3; // XXX obsolote categorisation + only ICONV needs affix flag check?

1567 }

1568 return 0;

1569 }

1570

1571 rv = pAMgr->lookup(word);

1572

1573 if (rv) {

1574 if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)

1575 \|\| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;

1576 while (rv) {

1577 if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) \| \|

1578 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) \|\|

1579 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {

1580 rv = rv->next_homonym;

1581 } else break;

1582 }

1583 } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX

1584

1585 if (rv) {

1586 nosuffix=1;

1587 } else {

1588 rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suf fix

1589 }

1590

1591 if (!rv && pAMgr->have_contclass()) {

1592 rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);

1593 if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);

1594 }

1595

1596 // check forbidden words

1597 if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->a len) \|\|

1598 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) \|\|

1599 TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) \|\|

1600 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;

1601

1602 if (rv) { // XXX obsolote

1603 if ((pAMgr->get_compoundflag()) &&

1604 TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nos uffix;

1605 return 1;

1606 }

1607 }

1608 return 0;

1609 }

1610

1611 int SuggestMgr::check_forbidden(const char * word, int len)

1612 {

1613 struct hentry * rv = NULL;

1614

1615 if (pAMgr) {

1616 rv = pAMgr->lookup(word);

1617 if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) \|\|

1618 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;

1619 if (!(pAMgr->prefix_check(word,len,1)))

1620 rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+su ffix, suffix

1621 // check forbidden words

1622 if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->al en)) return 1;

1623 }

1624 return 0;

1625 }

1626

#ifdef HUNSPELL_EXPERIMENTAL
// Suggest possible stems.
//
//   slst - in/out: suggestion list; when *slst is NULL a list of maxSug
//          pointers is allocated with calloc() and stored back into *slst
//   w    - word to analyse (reversed internally when complexprefixes is set)
//   nsug - number of suggestions already in the list; updated by
//          pAMgr->suffix_check()
//
// Returns the number of suggestions, or -1 if the list cannot be allocated.
// NOTE(review): pAMgr is dereferenced without a NULL check - confirm callers
// guarantee it is set.
int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
{
  char ** wlst;

  struct hentry * rv = NULL;

  char w2[MAXSWUTF8L];
  const char * word = w;

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    strcpy(w2, w);
    if (utf8) reverseword_utf(w2); else reverseword(w2);
    word = w2;
  }

  int wl = strlen(word);


  if (*slst) {
    wlst = *slst;
  } else {
    wlst = (char **) calloc(maxSug, sizeof(char *));
    if (wlst == NULL) return -1;
  }

  rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);

  // delete dash from end of word
  if (nsug > 0) {
    for (int j = 0; j < nsug; j++) {
      if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0';
    }
  }

  *slst = wlst;
  return nsug;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE

1668

1669

1670 char * SuggestMgr::suggest_morph(const char * w)

1671 {

1672 char result[MAXLNLEN];

1673 char * r = (char *) result;

1674 char * st;

1675

1676 struct hentry * rv = NULL;

1677

1678 *result = '\0';

1679

1680 if (! pAMgr) return NULL;

1681

1682 char w2[MAXSWUTF8L];

1683 const char * word = w;

1684

1685 // word reversing wrapper for complex prefixes

1686 if (complexprefixes) {

1687 strcpy(w2, w);

1688 if (utf8) reverseword_utf(w2); else reverseword(w2);

1689 word = w2;

1690 }

1691

1692 rv = pAMgr->lookup(word);

1693

1694 while (rv) {

1695 if ((!rv->astr) \|\| !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->a len) \|\|

1696 TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) \|\|

1697 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {

1698 if (!HENTRY_FIND(rv, MORPH_STEM)) {

1699 mystrcat(result, " ", MAXLNLEN);

1700 mystrcat(result, MORPH_STEM, MAXLNLEN);

1701 mystrcat(result, word, MAXLNLEN);

1702 }

1703 if (HENTRY_DATA(rv)) {

1704 mystrcat(result, " ", MAXLNLEN);

1705 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);

1706 }

1707 mystrcat(result, "\n", MAXLNLEN);

1708 }

1709 rv = rv->next_homonym;

1710 }

1711

1712 st = pAMgr->affix_check_morph(word,strlen(word));

1713 if (st) {

1714 mystrcat(result, st, MAXLNLEN);

1715 free(st);

1716 }

1717

1718 if (pAMgr->get_compound() && (*result == '\0'))

1719 pAMgr->compound_check_morph(word, strlen(word),

1720 0, 0, 100, 0,NULL, 0, &r, NULL);

1721

1722 return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;

1723 }

1724

#ifdef HUNSPELL_EXPERIMENTAL
// Morphological analysis of the first suggestion for a misspelled word.
//
// The list is pre-filled with maxSug - 1 placeholder entries so that
// suggest() has room for exactly one new suggestion, which lands in the
// last slot.
//
//   word - misspelled word
//
// Returns the result of suggest_morph() for that suggestion, or NULL when
// the list cannot be allocated or suggest() did not fill the last slot.
char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
{
  char * p = NULL;
  char ** wlst = (char **) calloc(maxSug, sizeof(char *));
  // Test the allocation itself: the array is zero-filled, so dereferencing
  // its first entry (as the code used to do) is a NULL dereference.
  if (!wlst) return NULL;
  // we will use only the first suggestion
  for (int i = 0; i < maxSug - 1; i++) wlst[i] = (char *) "";
  int ns = suggest(&wlst, word, maxSug - 1, NULL);
  if (ns == maxSug) {
    p = suggest_morph(wlst[maxSug - 1]);
    free(wlst[maxSug - 1]);
  }
  if (wlst) free(wlst);
  return p;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE

1742

1743 /* affixation */

1744 char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern)

1745 {

1746 char result[MAXLNLEN];

1747 *result = '\0';

1748 int sfxcount = get_sfxcount(pattern);

1749

1750 if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL;

1751

1752 if (HENTRY_DATA(rv)) {

1753 char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->al en,

1754 HENTRY_DATA(rv), pattern, 0);

1755 if (aff) {

1756 mystrcat(result, aff, MAXLNLEN);

1757 mystrcat(result, "\n", MAXLNLEN);

1758 free(aff);

1759 }

1760 }

1761

1762 // check all allomorphs

1763 char allomorph[MAXLNLEN];

1764 char * p = NULL;

1765 if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);

1766 while (p) {

1767 struct hentry * rv2 = NULL;

1768 p += MORPH_TAG_LEN;

1769 int plen = fieldlen(p);

1770 strncpy(allomorph, p, plen);

1771 allomorph[plen] = '\0';

1772 rv2 = pAMgr->lookup(allomorph);

1773 while (rv2) {

1774 // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount ) {

1775 if (HENTRY_DATA(rv2)) {

1776 char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM);

1777 if (st && (strncmp(st + MORPH_TAG_LEN,

1778 HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) {

1779 char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv 2->astr, rv2->alen,

1780 HENTRY_DATA(rv2), pattern, 0);

1781 if (aff) {

1782 mystrcat(result, aff, MAXLNLEN);

1783 mystrcat(result, "\n", MAXLNLEN);

1784 free(aff);

1785 }

1786 }

1787 }

1788 rv2 = rv2->next_homonym;

1789 }

1790 p = strstr(p + plen, MORPH_ALLOMORPH);

1791 }

1792

1793 return (*result) ? mystrdup(result) : NULL;

1794 }

1795

1796 char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) {

1797 char result[MAXLNLEN];

1798 char result2[MAXLNLEN];

1799 char newpattern[MAXLNLEN];

1800 *newpattern = '\0';

1801 if (n == 0) return 0;

1802 *result2 = '\0';

1803 struct hentry * rv = NULL;

1804 if (!pAMgr) return NULL;

1805

1806 // search affixed forms with and without derivational suffixes

1807 while(1) {

1808

1809 for (int k = 0; k < n; k++) {

1810 *result = '\0';

1811 // add compound word parts (except the last one)

1812 char * s = (char *) desc[k];

1813 char * part = strstr(s, MORPH_PART);

1814 if (part) {

1815 char * nextpart = strstr(part + 1, MORPH_PART);

1816 while (nextpart) {

1817 copy_field(result + strlen(result), part, MORPH_PART);

1818 part = nextpart;

1819 nextpart = strstr(part + 1, MORPH_PART);

1820 }

1821 s = part;

1822 }

1823

1824 char **pl;

1825 char tok[MAXLNLEN];

1826 strcpy(tok, s);

1827 char * alt = strstr(tok, " \| ");

1828 while (alt) {

1829 alt[1] = MSEP_ALT;

1830 alt = strstr(alt, " \| ");

1831 }

1832 int pln = line_tok(tok, &pl, MSEP_ALT);

1833 for (int i = 0; i < pln; i++) {

1834 // remove inflectional and terminal suffixes

1835 char * is = strstr(pl[i], MORPH_INFL_SFX);

1836 if (is) *is = '\0';

1837 char * ts = strstr(pl[i], MORPH_TERM_SFX);

1838 while (ts) {

1839 *ts = '_';

1840 ts = strstr(pl[i], MORPH_TERM_SFX);

1841 }

1842 char * st = strstr(s, MORPH_STEM);

1843 if (st) {

1844 copy_field(tok, st, MORPH_STEM);

1845 rv = pAMgr->lookup(tok);

1846 while (rv) {

1847 char newpat[MAXLNLEN];

1848 strcpy(newpat, pl[i]);

1849 strcat(newpat, pattern);

1850 char * sg = suggest_hentry_gen(rv, newpat);

1851 if (!sg) sg = suggest_hentry_gen(rv, pattern);

1852 if (sg) {

1853 char ** gen;

1854 int genl = line_tok(sg, &gen, MSEP_REC);

1855 free(sg);

1856 sg = NULL;

1857 for (int j = 0; j < genl; j++) {

1858 if (strstr(pl[i], MORPH_SURF_PFX)) {

1859 int r2l = strlen(result2);

1860 result2[r2l] = MSEP_REC;

1861 strcpy(result2 + r2l + 1, result);

1862 copy_field(result2 + strlen(result2), pl[i], MOR PH_SURF_PFX);

1863 mystrcat(result2, gen[j], MAXLNLEN);

1864 } else {

1865 sprintf(result2 + strlen(result2), "%c%s%s",

1866 MSEP_REC, result, gen[j]);

1867 }

1868 }

1869 freelist(&gen, genl);

1870 }

1871 rv = rv->next_homonym;

1872 }

1873 }

1874 }

1875 freelist(&pl, pln);

1876 }

1877

1878 if (*result2 \|\| !strstr(pattern, MORPH_DERI_SFX)) break;

1879 strcpy(newpattern, pattern);

1880 pattern = newpattern;

1881 char * ds = strstr(pattern, MORPH_DERI_SFX);

1882 while (ds) {

1883 strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN);

1884 ds = strstr(pattern, MORPH_DERI_SFX);

1885 }

1886 }

1887 return (*result2 ? mystrdup(result2) : NULL);

1888 }

1889

1890

1891 // generate an n-gram score comparing s1 and s2

1892 int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt)

1893 {

1894 int nscore = 0;

1895 int ns;

1896 int l1;

1897 int l2;

1898 int test = 0;

1899

1900 if (utf8) {

1901 w_char su1[MAXSWL];

1902 w_char su2[MAXSWL];

1903 l1 = u8_u16(su1, MAXSWL, s1);

1904 l2 = u8_u16(su2, MAXSWL, s2);

1905 if ((l2 <= 0) \|\| (l1 == -1)) return 0;

1906 // lowering dictionary word

1907 if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum);

1908 for (int j = 1; j <= n; j++) {

1909 ns = 0;

1910 for (int i = 0; i <= (l1-j); i++) {

1911 int k = 0;

1912 for (int l = 0; l <= (l2-j); l++) {

1913 for (k = 0; k < j; k++) {

1914 w_char * c1 = su1 + i + k;

1915 w_char * c2 = su2 + l + k;

1916 if ((c1->l != c2->l) \|\| (c1->h != c2->h)) break;

1917 }

1918 if (k == j) {

1919 ns++;

1920 break;

1921 }

1922 }

1923 if (k != j && opt & NGRAM_WEIGHTED) {

1924 ns--;

1925 test++;

1926 if (i == 0 \|\| i == l1-j) ns--; // side weight

1927 }

1928 }

1929 nscore = nscore + ns;

1930 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break;

1931 }

1932 } else {

1933 l2 = strlen(s2);

1934 if (l2 == 0) return 0;

1935 l1 = strlen(s1);

1936 char *t = mystrdup(s2);

1937 if (opt & NGRAM_LOWERING) mkallsmall(t, csconv);

1938 for (int j = 1; j <= n; j++) {

1939 ns = 0;

1940 for (int i = 0; i <= (l1-j); i++) {

1941 char c = *(s1 + i + j);

1942 *(s1 + i + j) = '\0';

1943 if (strstr(t,(s1+i))) {

1944 ns++;

1945 } else if (opt & NGRAM_WEIGHTED) {

1946 ns--;

1947 test++;

1948 if (i == 0 \|\| i == l1-j) ns--; // side weight

1949 }

1950 *(s1 + i + j ) = c;

1951 }

1952 nscore = nscore + ns;

1953 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break;

1954 }

1955 free(t);

1956 }

1957

1958 ns = 0;

1959 if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2;

1960 if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;

1961 ns = (nscore - ((ns > 0) ? ns : 0));

1962 return ns;

1963 }

1964

1965 // length of the left common substring of s1 and (decapitalised) s2

1966 int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) {

1967 if (utf8) {

1968 w_char su1[MAXSWL];

1969 w_char su2[MAXSWL];

1970 su1[0].l = su2[0].l = su1[0].h = su2[0].h = 0;

1971 // decapitalize dictionary word

1972 if (complexprefixes) {

1973 int l1 = u8_u16(su1, MAXSWL, s1);

1974 int l2 = u8_u16(su2, MAXSWL, s2);

1975 if (((short )su1+l1-1) == ((short )su2+l2-1)) return 1;

1976 } else {

1977 int i;

1978 u8_u16(su1, 1, s1);

1979 u8_u16(su2, 1, s2);

1980 unsigned short idx = (su2->h << 8) + su2->l;

1981 unsigned short otheridx = (su1->h << 8) + su1->l;

1982 if (otheridx != idx &&

1983 (otheridx != unicodetolower(idx, langnum))) return 0;

1984 int l1 = u8_u16(su1, MAXSWL, s1);

1985 int l2 = u8_u16(su2, MAXSWL, s2);

1986 for(i = 1; (i < l1) && (i < l2) &&

1987 (su1[i].l == su2[i].l) && (su1[i].h == su2[i].h); i++);

1988 return i;

1989 }

1990 } else {

1991 if (complexprefixes) {

1992 int l1 = strlen(s1);

1993 int l2 = strlen(s2);

1994 if ((s2+l1-1) == (s2+l2-1)) return 1;

1995 } else {

1996 char * olds = s1;

1997 // decapitalise dictionary word

1998 if ((s1 != s2) && (s1 != csconv[((unsigned char)s2)].clower)) return 0 ;

1999 do {

2000 s1++; s2++;

2001 } while ((s1 == s2) && (*s1 != '\0'));

2002 return (int)(s1 - olds);

2003 }

2004 }

2005 return 0;

2006 }

2007

2008 int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_sw ap) {

2009 int num = 0;

2010 int diff = 0;

2011 int diffpos[2];

2012 *is_swap = 0;

2013 if (utf8) {

2014 w_char su1[MAXSWL];

2015 w_char su2[MAXSWL];

2016 int l1 = u8_u16(su1, MAXSWL, s1);

2017 int l2 = u8_u16(su2, MAXSWL, s2);

2018 // decapitalize dictionary word

2019 if (complexprefixes) {

2020 mkallsmall_utf(su2+l2-1, 1, langnum);

2021 } else {

2022 mkallsmall_utf(su2, 1, langnum);

2023 }

2024 for (int i = 0; (i < l1) && (i < l2); i++) {

2025 if (((short ) su1)[i] == ((short ) su2)[i]) {

2026 num++;

2027 } else {

2028 if (diff < 2) diffpos[diff] = i;

2029 diff++;

2030 }

2031 }

2032 if ((diff == 2) && (l1 == l2) &&

2033 (((short ) su1)[diffpos[0]] == ((short ) su2)[diffpos[1]]) &&

2034 (((short ) su1)[diffpos[1]] == ((short ) su2)[diffpos[0]])) *is_swap = 1;

2035 } else {

2036 int i;

2037 char t[MAXSWUTF8L];

2038 strcpy(t, s2);

2039 // decapitalize dictionary word

2040 if (complexprefixes) {

2041 int l2 = strlen(t);

2042 (t+l2-1) = csconv[((unsigned char)(t+l2-1))].clower;

2043 } else {

2044 mkallsmall(t, csconv);

2045 }

2046 for (i = 0; ((s1+i) != 0) && ((t+i) != 0); i++) {

2047 if ((s1+i) == (t+i)) {

2048 num++;

2049 } else {

2050 if (diff < 2) diffpos[diff] = i;

2051 diff++;

2052 }

2053 }

2054 if ((diff == 2) && ((s1+i) == 0) && ((t+i) == 0) &&

2055 ((s1+diffpos[0]) == (t+diffpos[1])) &&

2056 ((s1+diffpos[1]) == (t+diffpos[0]))) *is_swap = 1;

2057 }

2058 return num;

2059 }

2060

2061 int SuggestMgr::mystrlen(const char * word) {

2062 if (utf8) {

2063 w_char w[MAXSWL];

2064 return u8_u16(w, MAXSWL, word);

2065 } else return strlen(word);

2066 }

2067

2068 // sort in decreasing order of score

2069 void SuggestMgr::bubblesort(char rword, char rword2, int* rsc, int n )

2070 {

2071 int m = 1;

2072 while (m < n) {

2073 int j = m;

2074 while (j > 0) {

2075 if (rsc[j-1] < rsc[j]) {

2076 int sctmp = rsc[j-1];

2077 char * wdtmp = rword[j-1];

2078 rsc[j-1] = rsc[j];

2079 rword[j-1] = rword[j];

2080 rsc[j] = sctmp;

2081 rword[j] = wdtmp;

2082 if (rword2) {

2083 wdtmp = rword2[j-1];

2084 rword2[j-1] = rword2[j];

2085 rword2[j] = wdtmp;

2086 }

2087 j--;

2088 } else break;

2089 }

2090 m++;

2091 }

2092 return;

2093 }

2094

2095 // longest common subsequence

2096 void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char * * result) {

2097 int n, m;

2098 w_char su[MAXSWL];

2099 w_char su2[MAXSWL];

2100 char * b;

2101 char * c;

2102 int i;

2103 int j;

2104 if (utf8) {

2105 m = u8_u16(su, MAXSWL, s);

2106 n = u8_u16(su2, MAXSWL, s2);

2107 } else {

2108 m = strlen(s);

2109 n = strlen(s2);

2110 }

2111 c = (char *) calloc(m + 1, n + 1);

2112 b = (char *) calloc(m + 1, n + 1);

2113 if (!c \|\| !b) {

2114 if (c) free(c);

2115 if (b) free(b);

2116 *result = NULL;

2117 return;

2118 }

2119 for (i = 1; i <= m; i++) {

2120 for (j = 1; j <= n; j++) {

2121 if ( ((utf8) && (((short ) su+i-1) == ((short )su2+j-1)))

2122 \|\| ((!utf8) && (((s+i-1)) == ((s2+j-1))))) {

2123 c[i(n+1) + j] = c[(i-1)(n+1) + j-1]+1;

2124 b[i*(n+1) + j] = LCS_UPLEFT;

2125 } else if (c[(i-1)(n+1) + j] >= c[i(n+1) + j-1]) {

2126 c[i(n+1) + j] = c[(i-1)(n+1) + j];

2127 b[i*(n+1) + j] = LCS_UP;

2128 } else {

2129 c[i(n+1) + j] = c[i(n+1) + j-1];

2130 b[i*(n+1) + j] = LCS_LEFT;

2131 }

2132 }

2133 }

2134 *result = b;

2135 free(c);

2136 *l1 = m;

2137 *l2 = n;

2138 }

2139

2140 int SuggestMgr::lcslen(const char * s, const char* s2) {

2141 int m;

2142 int n;

2143 int i;

2144 int j;

2145 char * result;

2146 int len = 0;

2147 lcs(s, s2, &m, &n, &result);

2148 if (!result) return 0;

2149 i = m;

2150 j = n;

2151 while ((i != 0) && (j != 0)) {

2152 if (result[i*(n+1) + j] == LCS_UPLEFT) {

2153 len++;

2154 i--;

2155 j--;

2156 } else if (result[i*(n+1) + j] == LCS_UP) {

2157 i--;

2158 } else j--;

2159 }

2160 free(result);

2161 return len;

2162 }

OLD	NEW

« no previous file with comments | « third_party/hunspell_new/src/hunspell/suggestmgr.hxx ('k') | third_party/hunspell_new/src/hunspell/utf_info.hxx » ('j') | no next file with comments »