| OLD | NEW |
| (Empty) |
| 1 #include "license.hunspell" | |
| 2 #include "license.myspell" | |
| 3 | |
| 4 #include <stdlib.h> | |
| 5 #include <string.h> | |
| 6 #include <stdio.h> | |
| 7 #include <ctype.h> | |
| 8 | |
| 9 #include "suggestmgr.hxx" | |
| 10 #include "htypes.hxx" | |
| 11 #include "csutil.hxx" | |
| 12 | |
| 13 const w_char W_VLINE = { '\0', '|' }; | |
| 14 | |
| 15 #ifdef HUNSPELL_CHROME_CLIENT | |
| 16 namespace { | |
| 17 // A simple class which creates temporary hentry objects which are available | |
| 18 // only in a scope. To conceal memory operations from SuggestMgr functions, | |
| 19 // this object automatically deletes all hentry objects created through | |
| 20 // CreateScopedHashEntry() calls in its destructor. So, the following snippet | |
| 21 // raises a memory error. | |
| 22 // | |
| 23 // hentry* bad_copy = NULL; | |
| 24 // { | |
| 25 // ScopedHashEntryFactory factory; | |
| 26 // hentry* scoped_copy = factory.CreateScopedHashEntry(0, source); | |
| 27 // ... | |
| 28 // bad_copy = scoped_copy; | |
| 29 // } | |
| 30 // if (bad_copy->word[0]) // memory for scoped_copy has been deleted! | |
| 31 // | |
| 32 // As listed in the above snippet, it is simple to use this class. | |
| 33 // 1. Declare an instance of this ScopedHashEntryFactory, and; | |
| 34 // 2. Call its CreateHashEntry() member instead of using 'new hentry' or | |
| 35 // 'operator='. | |
| 36 // | |
| 37 class ScopedHashEntryFactory { | |
| 38 public: | |
| 39 ScopedHashEntryFactory(); | |
| 40 ~ScopedHashEntryFactory(); | |
| 41 | |
| 42 // Creates a temporary copy of the given hentry struct. | |
| 43 // The returned copy is available only while this object is available. | |
| 44 // NOTE: this function just calls memcpy() in creating a copy of the given | |
| 45 // hentry struct, i.e. it does NOT copy objects referred by pointers of the | |
| 46 // given hentry struct. | |
| 47 hentry* CreateScopedHashEntry(int index, const hentry* source); | |
| 48 | |
| 49 private: | |
| 50 // A struct which encapsulates the new hentry struct introduced in hunspell | |
| 51 // 1.2.8. For a pointer to an hentry struct 'h', hunspell 1.2.8 stores a word | |
| 52 // (including a NUL character) into 'h->word[0]',...,'h->word[h->blen]' even | |
| 53 // though arraysize(h->word[]) is 1. Also, it changed 'astr' to a pointer so | |
| 54 // it can store affix flags into 'h->astr[0]',...,'h->astr[alen-1]'. To handle | |
| 55 // this new hentry struct, we define a struct which combines three values: an | |
| 56 // hentry struct 'hentry'; a char array 'word[kMaxWordLen]', and; an unsigned | |
| 57 // short array 'astr' so a hentry struct 'h' returned from | |
| 58 // CreateScopedHashEntry() satisfies the following equations: | |
| 59 // hentry* h = factory.CreateScopedHashEntry(0, source); | |
| 60 // h->word[0] == ((HashEntryItem*)h)->entry.word[0]. | |
| 61 // h->word[1] == ((HashEntryItem*)h)->word[0]. | |
| 62 // ... | |
| 63 // h->word[h->blen] == ((HashEntryItem*)h)->word[h->blen-1]. | |
| 64 // h->astr[0] == ((HashEntryItem*)h)->astr[0]. | |
| 65 // h->astr[1] == ((HashEntryItem*)h)->astr[1]. | |
| 66 // ... | |
| 67 // h->astr[h->alen-1] == ((HashEntryItem*)h)->astr[h->alen-1]. | |
| 68 enum { | |
| 69 kMaxWordLen = 128, | |
| 70 kMaxAffixLen = 8, | |
| 71 }; | |
| 72 struct HashEntryItem { | |
| 73 hentry entry; | |
| 74 char word[kMaxWordLen]; | |
| 75 unsigned short astr[kMaxAffixLen]; | |
| 76 }; | |
| 77 | |
| 78 HashEntryItem hash_items_[MAX_ROOTS]; | |
| 79 }; | |
| 80 | |
| 81 ScopedHashEntryFactory::ScopedHashEntryFactory() { | |
| 82 memset(&hash_items_[0], 0, sizeof(hash_items_)); | |
| 83 } | |
| 84 | |
| 85 ScopedHashEntryFactory::~ScopedHashEntryFactory() { | |
| 86 } | |
| 87 | |
| 88 hentry* ScopedHashEntryFactory::CreateScopedHashEntry(int index, | |
| 89 const hentry* source) { | |
| 90 if (index >= MAX_ROOTS || source->blen >= kMaxWordLen) | |
| 91 return NULL; | |
| 92 | |
| 93 // Retrieve a HashEntryItem struct from our spool, initialize it, and | |
| 94 // returns the address of its 'hentry' member. | |
| 95 size_t source_size = sizeof(hentry) + source->blen + 1; | |
| 96 HashEntryItem* hash_item = &hash_items_[index]; | |
| 97 memcpy(&hash_item->entry, source, source_size); | |
| 98 if (source->astr) { | |
| 99 hash_item->entry.alen = source->alen; | |
| 100 if (hash_item->entry.alen > kMaxAffixLen) | |
| 101 hash_item->entry.alen = kMaxAffixLen; | |
| 102 memcpy(hash_item->astr, source->astr, hash_item->entry.alen * sizeof(hash_it
em->astr[0])); | |
| 103 hash_item->entry.astr = &hash_item->astr[0]; | |
| 104 } | |
| 105 return &hash_item->entry; | |
| 106 } | |
| 107 | |
| 108 } // namespace | |
| 109 #endif | |
| 110 | |
| 111 | |
| 112 #ifdef HUNSPELL_CHROME_CLIENT | |
| 113 SuggestMgr::SuggestMgr(hunspell::BDictReader* reader, | |
| 114 const char * tryme, int maxn, | |
| 115 AffixMgr * aptr) | |
| 116 { | |
| 117 bdict_reader = reader; | |
| 118 #else | |
| 119 SuggestMgr::SuggestMgr(const char * tryme, int maxn, | |
| 120 AffixMgr * aptr) | |
| 121 { | |
| 122 #endif | |
| 123 | |
| 124 // register affix manager and check in string of chars to | |
| 125 // try when building candidate suggestions | |
| 126 pAMgr = aptr; | |
| 127 | |
| 128 csconv = NULL; | |
| 129 | |
| 130 ckeyl = 0; | |
| 131 ckey = NULL; | |
| 132 ckey_utf = NULL; | |
| 133 | |
| 134 ctryl = 0; | |
| 135 ctry = NULL; | |
| 136 ctry_utf = NULL; | |
| 137 | |
| 138 utf8 = 0; | |
| 139 langnum = 0; | |
| 140 complexprefixes = 0; | |
| 141 | |
| 142 maxSug = maxn; | |
| 143 nosplitsugs = 0; | |
| 144 maxngramsugs = MAXNGRAMSUGS; | |
| 145 maxcpdsugs = MAXCOMPOUNDSUGS; | |
| 146 | |
| 147 if (pAMgr) { | |
| 148 langnum = pAMgr->get_langnum(); | |
| 149 ckey = pAMgr->get_key_string(); | |
| 150 nosplitsugs = pAMgr->get_nosplitsugs(); | |
| 151 if (pAMgr->get_maxngramsugs() >= 0) | |
| 152 maxngramsugs = pAMgr->get_maxngramsugs(); | |
| 153 utf8 = pAMgr->get_utf8(); | |
| 154 if (pAMgr->get_maxcpdsugs() >= 0) | |
| 155 maxcpdsugs = pAMgr->get_maxcpdsugs(); | |
| 156 if (!utf8) | |
| 157 { | |
| 158 char * enc = pAMgr->get_encoding(); | |
| 159 csconv = get_current_cs(enc); | |
| 160 free(enc); | |
| 161 } | |
| 162 complexprefixes = pAMgr->get_complexprefixes(); | |
| 163 } | |
| 164 | |
| 165 if (ckey) { | |
| 166 if (utf8) { | |
| 167 w_char t[MAXSWL]; | |
| 168 ckeyl = u8_u16(t, MAXSWL, ckey); | |
| 169 ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char)); | |
| 170 if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char)); | |
| 171 else ckeyl = 0; | |
| 172 } else { | |
| 173 ckeyl = strlen(ckey); | |
| 174 } | |
| 175 } | |
| 176 | |
| 177 if (tryme) { | |
| 178 ctry = mystrdup(tryme); | |
| 179 if (ctry) ctryl = strlen(ctry); | |
| 180 if (ctry && utf8) { | |
| 181 w_char t[MAXSWL]; | |
| 182 ctryl = u8_u16(t, MAXSWL, tryme); | |
| 183 ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char)); | |
| 184 if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char)); | |
| 185 else ctryl = 0; | |
| 186 } | |
| 187 } | |
| 188 } | |
| 189 | |
| 190 | |
| 191 SuggestMgr::~SuggestMgr() | |
| 192 { | |
| 193 pAMgr = NULL; | |
| 194 if (ckey) free(ckey); | |
| 195 ckey = NULL; | |
| 196 if (ckey_utf) free(ckey_utf); | |
| 197 ckey_utf = NULL; | |
| 198 ckeyl = 0; | |
| 199 if (ctry) free(ctry); | |
| 200 ctry = NULL; | |
| 201 if (ctry_utf) free(ctry_utf); | |
| 202 ctry_utf = NULL; | |
| 203 ctryl = 0; | |
| 204 maxSug = 0; | |
| 205 #ifdef MOZILLA_CLIENT | |
| 206 delete [] csconv; | |
| 207 #endif | |
| 208 } | |
| 209 | |
| 210 int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int
cpdsuggest, | |
| 211 int * timer, clock_t * timelimit) { | |
| 212 int cwrd = 1; | |
| 213 if (ns == maxSug) return maxSug; | |
| 214 for (int k=0; k < ns; k++) { | |
| 215 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; | |
| 216 } | |
| 217 if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) { | |
| 218 wlst[ns] = mystrdup(candidate); | |
| 219 if (wlst[ns] == NULL) { | |
| 220 for (int j=0; j<ns; j++) free(wlst[j]); | |
| 221 return -1; | |
| 222 } | |
| 223 ns++; | |
| 224 } | |
| 225 return ns; | |
| 226 } | |
| 227 | |
| 228 // generate suggestions for a misspelled word | |
| 229 // pass in address of array of char * pointers | |
| 230 // onlycompoundsug: probably bad suggestions (need for ngram sugs, too) | |
| 231 | |
| 232 int SuggestMgr::suggest(char*** slst, const char * w, int nsug, | |
| 233 int * onlycompoundsug) | |
| 234 { | |
| 235 int nocompoundtwowords = 0; | |
| 236 char ** wlst; | |
| 237 w_char word_utf[MAXSWL]; | |
| 238 int wl = 0; | |
| 239 int nsugorig = nsug; | |
| 240 char w2[MAXWORDUTF8LEN]; | |
| 241 const char * word = w; | |
| 242 int oldSug = 0; | |
| 243 | |
| 244 // word reversing wrapper for complex prefixes | |
| 245 if (complexprefixes) { | |
| 246 strcpy(w2, w); | |
| 247 if (utf8) reverseword_utf(w2); else reverseword(w2); | |
| 248 word = w2; | |
| 249 } | |
| 250 | |
| 251 if (*slst) { | |
| 252 wlst = *slst; | |
| 253 } else { | |
| 254 wlst = (char **) malloc(maxSug * sizeof(char *)); | |
| 255 if (wlst == NULL) return -1; | |
| 256 for (int i = 0; i < maxSug; i++) { | |
| 257 wlst[i] = NULL; | |
| 258 } | |
| 259 } | |
| 260 | |
| 261 if (utf8) { | |
| 262 wl = u8_u16(word_utf, MAXSWL, word); | |
| 263 if (wl == -1) { | |
| 264 *slst = wlst; | |
| 265 return nsug; | |
| 266 } | |
| 267 } | |
| 268 | |
| 269 for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest
++) { | |
| 270 | |
| 271 // limit compound suggestion | |
| 272 if (cpdsuggest > 0) oldSug = nsug; | |
| 273 | |
| 274 // suggestions for an uppercase word (html -> HTML) | |
| 275 if ((nsug < maxSug) && (nsug > -1)) { | |
| 276 nsug = (utf8) ? capchars_utf(wlst, word_utf, wl, nsug, cpdsuggest) : | |
| 277 capchars(wlst, word, nsug, cpdsuggest); | |
| 278 } | |
| 279 | |
| 280 // perhaps we made a typical fault of spelling | |
| 281 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 282 nsug = replchars(wlst, word, nsug, cpdsuggest); | |
| 283 } | |
| 284 | |
| 285 // perhaps we made chose the wrong char from a related set | |
| 286 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 287 nsug = mapchars(wlst, word, nsug, cpdsuggest); | |
| 288 } | |
| 289 | |
| 290 // only suggest compound words when no other suggestion | |
| 291 if ((cpdsuggest == 0) && (nsug > nsugorig)) nocompoundtwowords=1; | |
| 292 | |
| 293 // did we swap the order of chars by mistake | |
| 294 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 295 nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : | |
| 296 swapchar(wlst, word, nsug, cpdsuggest); | |
| 297 } | |
| 298 | |
| 299 // did we swap the order of non adjacent chars by mistake | |
| 300 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 301 nsug = (utf8) ? longswapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : | |
| 302 longswapchar(wlst, word, nsug, cpdsuggest); | |
| 303 } | |
| 304 | |
| 305 // did we just hit the wrong key in place of a good char (case and keyboard) | |
| 306 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 307 nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) : | |
| 308 badcharkey(wlst, word, nsug, cpdsuggest); | |
| 309 } | |
| 310 | |
| 311 // did we add a char that should not be there | |
| 312 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 313 nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : | |
| 314 extrachar(wlst, word, nsug, cpdsuggest); | |
| 315 } | |
| 316 | |
| 317 | |
| 318 // did we forgot a char | |
| 319 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 320 nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : | |
| 321 forgotchar(wlst, word, nsug, cpdsuggest); | |
| 322 } | |
| 323 | |
| 324 // did we move a char | |
| 325 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 326 nsug = (utf8) ? movechar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : | |
| 327 movechar(wlst, word, nsug, cpdsuggest); | |
| 328 } | |
| 329 | |
| 330 // did we just hit the wrong key in place of a good char | |
| 331 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 332 nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : | |
| 333 badchar(wlst, word, nsug, cpdsuggest); | |
| 334 } | |
| 335 | |
| 336 // did we double two characters | |
| 337 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) { | |
| 338 nsug = (utf8) ? doubletwochars_utf(wlst, word_utf, wl, nsug, cpdsuggest)
: | |
| 339 doubletwochars(wlst, word, nsug, cpdsuggest); | |
| 340 } | |
| 341 | |
| 342 // perhaps we forgot to hit space and two words ran together | |
| 343 if (!nosplitsugs && (nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug
< oldSug + maxcpdsugs))) { | |
| 344 nsug = twowords(wlst, word, nsug, cpdsuggest); | |
| 345 } | |
| 346 | |
| 347 } // repeating ``for'' statement compounding support | |
| 348 | |
| 349 if (nsug < 0) { | |
| 350 // we ran out of memory - we should free up as much as possible | |
| 351 for (int i = 0; i < maxSug; i++) | |
| 352 if (wlst[i] != NULL) free(wlst[i]); | |
| 353 free(wlst); | |
| 354 wlst = NULL; | |
| 355 } | |
| 356 | |
| 357 if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug =
1; | |
| 358 | |
| 359 *slst = wlst; | |
| 360 return nsug; | |
| 361 } | |
| 362 | |
| 363 // generate suggestions for a word with typical mistake | |
| 364 // pass in address of array of char * pointers | |
| 365 #ifdef HUNSPELL_EXPERIMENTAL | |
| 366 int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug) | |
| 367 { | |
| 368 int nocompoundtwowords = 0; | |
| 369 char ** wlst; | |
| 370 int oldSug; | |
| 371 | |
| 372 char w2[MAXWORDUTF8LEN]; | |
| 373 const char * word = w; | |
| 374 | |
| 375 // word reversing wrapper for complex prefixes | |
| 376 if (complexprefixes) { | |
| 377 strcpy(w2, w); | |
| 378 if (utf8) reverseword_utf(w2); else reverseword(w2); | |
| 379 word = w2; | |
| 380 } | |
| 381 | |
| 382 if (*slst) { | |
| 383 wlst = *slst; | |
| 384 } else { | |
| 385 wlst = (char **) malloc(maxSug * sizeof(char *)); | |
| 386 if (wlst == NULL) return -1; | |
| 387 } | |
| 388 | |
| 389 for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest
++) { | |
| 390 | |
| 391 // limit compound suggestion | |
| 392 if (cpdsuggest > 0) oldSug = nsug; | |
| 393 | |
| 394 // perhaps we made a typical fault of spelling | |
| 395 if ((nsug < maxSug) && (nsug > -1)) | |
| 396 nsug = replchars(wlst, word, nsug, cpdsuggest); | |
| 397 | |
| 398 // perhaps we made chose the wrong char from a related set | |
| 399 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs))) | |
| 400 nsug = mapchars(wlst, word, nsug, cpdsuggest); | |
| 401 | |
| 402 if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; | |
| 403 | |
| 404 // perhaps we forgot to hit space and two words ran together | |
| 405 | |
| 406 if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcp
dsugs)) && check_forbidden(word, strlen(word))) { | |
| 407 nsug = twowords(wlst, word, nsug, cpdsuggest); | |
| 408 } | |
| 409 | |
| 410 } // repeating ``for'' statement compounding support | |
| 411 | |
| 412 if (nsug < 0) { | |
| 413 for (int i=0;i<maxSug; i++) | |
| 414 if (wlst[i] != NULL) free(wlst[i]); | |
| 415 free(wlst); | |
| 416 return -1; | |
| 417 } | |
| 418 | |
| 419 *slst = wlst; | |
| 420 return nsug; | |
| 421 } | |
| 422 #endif // END OF HUNSPELL_EXPERIMENTAL CODE | |
| 423 | |
| 424 // suggestions for an uppercase word (html -> HTML) | |
| 425 int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns,
int cpdsuggest) | |
| 426 { | |
| 427 char candidate[MAXSWUTF8L]; | |
| 428 w_char candidate_utf[MAXSWL]; | |
| 429 memcpy(candidate_utf, word, wl * sizeof(w_char)); | |
| 430 mkallcap_utf(candidate_utf, wl, langnum); | |
| 431 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 432 return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL)
; | |
| 433 } | |
| 434 | |
| 435 // suggestions for an uppercase word (html -> HTML) | |
| 436 int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest) | |
| 437 { | |
| 438 char candidate[MAXSWUTF8L]; | |
| 439 strcpy(candidate, word); | |
| 440 mkallcap(candidate, csconv); | |
| 441 return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL)
; | |
| 442 } | |
| 443 | |
| 444 // suggestions for when chose the wrong char out of a related set | |
| 445 int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest) | |
| 446 { | |
| 447 char candidate[MAXSWUTF8L]; | |
| 448 clock_t timelimit; | |
| 449 int timer; | |
| 450 candidate[0] = '\0'; | |
| 451 | |
| 452 int wl = strlen(word); | |
| 453 if (wl < 2 || ! pAMgr) return ns; | |
| 454 | |
| 455 int nummap = pAMgr->get_nummap(); | |
| 456 struct mapentry* maptable = pAMgr->get_maptable(); | |
| 457 if (maptable==NULL) return ns; | |
| 458 | |
| 459 timelimit = clock(); | |
| 460 timer = MINTIMER; | |
| 461 return map_related(word, (char *) &candidate, 0, 0, wlst, cpdsuggest, ns, mapt
able, nummap, &timer, &timelimit); | |
| 462 } | |
| 463 | |
| 464 int SuggestMgr::map_related(const char * word, char * candidate, int wn, int cn, | |
| 465 char** wlst, int cpdsuggest, int ns, | |
| 466 const mapentry* maptable, int nummap, int * timer, clock_t * timelimit) | |
| 467 { | |
| 468 if (*(word + wn) == '\0') { | |
| 469 int cwrd = 1; | |
| 470 *(candidate + cn) = '\0'; | |
| 471 int wl = strlen(candidate); | |
| 472 for (int m=0; m < ns; m++) | |
| 473 if (strcmp(candidate, wlst[m]) == 0) cwrd = 0; | |
| 474 if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) { | |
| 475 if (ns < maxSug) { | |
| 476 wlst[ns] = mystrdup(candidate); | |
| 477 if (wlst[ns] == NULL) return -1; | |
| 478 ns++; | |
| 479 } | |
| 480 } | |
| 481 return ns; | |
| 482 } | |
| 483 int in_map = 0; | |
| 484 for (int j = 0; j < nummap; j++) { | |
| 485 for (int k = 0; k < maptable[j].len; k++) { | |
| 486 int len = strlen(maptable[j].set[k]); | |
| 487 if (strncmp(maptable[j].set[k], word + wn, len) == 0) { | |
| 488 in_map = 1; | |
| 489 for (int l = 0; l < maptable[j].len; l++) { | |
| 490 strcpy(candidate + cn, maptable[j].set[l]); | |
| 491 ns = map_related(word, candidate, wn + len, strlen(candidate), wlst, | |
| 492 cpdsuggest, ns, maptable, nummap, timer, timelimit); | |
| 493 if (!(*timer)) return ns; | |
| 494 } | |
| 495 } | |
| 496 } | |
| 497 } | |
| 498 if (!in_map) { | |
| 499 *(candidate + cn) = *(word + wn); | |
| 500 ns = map_related(word, candidate, wn + 1, cn + 1, wlst, cpdsuggest, | |
| 501 ns, maptable, nummap, timer, timelimit); | |
| 502 } | |
| 503 return ns; | |
| 504 } | |
| 505 | |
| 506 // suggestions for a typical fault of spelling, that | |
| 507 // differs with more, than 1 letter from the right form. | |
| 508 int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest
) | |
| 509 { | |
| 510 char candidate[MAXSWUTF8L]; | |
| 511 const char * r; | |
| 512 int lenr, lenp; | |
| 513 int wl = strlen(word); | |
| 514 if (wl < 2 || ! pAMgr) return ns; | |
| 515 | |
| 516 #ifdef HUNSPELL_CHROME_CLIENT | |
| 517 const char *pattern, *pattern2; | |
| 518 hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator(
); | |
| 519 while (iterator.GetNext(&pattern, &pattern2)) { | |
| 520 r = word; | |
| 521 lenr = strlen(pattern2); | |
| 522 lenp = strlen(pattern); | |
| 523 | |
| 524 // search every occurence of the pattern in the word | |
| 525 while ((r=strstr(r, pattern)) != NULL) { | |
| 526 strcpy(candidate, word); | |
| 527 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break; | |
| 528 strcpy(candidate+(r-word), pattern2); | |
| 529 strcpy(candidate+(r-word)+lenr, r+lenp); | |
| 530 ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL
); | |
| 531 if (ns == -1) return -1; | |
| 532 // check REP suggestions with space | |
| 533 char * sp = strchr(candidate, ' '); | |
| 534 if (sp) { | |
| 535 char * prev = candidate; | |
| 536 while (sp) { | |
| 537 *sp = '\0'; | |
| 538 if (checkword(prev, strlen(prev), 0, NULL, NULL)) { | |
| 539 int oldns = ns; | |
| 540 *sp = ' '; | |
| 541 ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL,
NULL); | |
| 542 if (ns == -1) return -1; | |
| 543 if (oldns < ns) { | |
| 544 free(wlst[ns - 1]); | |
| 545 wlst[ns - 1] = mystrdup(candidate); | |
| 546 if (!wlst[ns - 1]) return -1; | |
| 547 } | |
| 548 } | |
| 549 *sp = ' '; | |
| 550 prev = sp + 1; | |
| 551 sp = strchr(prev, ' '); | |
| 552 } | |
| 553 } | |
| 554 r++; // search for the next letter | |
| 555 } | |
| 556 } | |
| 557 #else | |
| 558 int numrep = pAMgr->get_numrep(); | |
| 559 struct replentry* reptable = pAMgr->get_reptable(); | |
| 560 if (reptable==NULL) return ns; | |
| 561 for (int i=0; i < numrep; i++ ) { | |
| 562 r = word; | |
| 563 lenr = strlen(reptable[i].pattern2); | |
| 564 lenp = strlen(reptable[i].pattern); | |
| 565 // search every occurence of the pattern in the word | |
| 566 while ((r=strstr(r, reptable[i].pattern)) != NULL && (!reptable[i].end ||
strlen(r) == strlen(reptable[i].pattern)) && | |
| 567 (!reptable[i].start || r == word)) { | |
| 568 strcpy(candidate, word); | |
| 569 if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break; | |
| 570 strcpy(candidate+(r-word),reptable[i].pattern2); | |
| 571 strcpy(candidate+(r-word)+lenr, r+lenp); | |
| 572 ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL
); | |
| 573 if (ns == -1) return -1; | |
| 574 // check REP suggestions with space | |
| 575 char * sp = strchr(candidate, ' '); | |
| 576 if (sp) { | |
| 577 char * prev = candidate; | |
| 578 while (sp) { | |
| 579 *sp = '\0'; | |
| 580 if (checkword(prev, strlen(prev), 0, NULL, NULL)) { | |
| 581 int oldns = ns; | |
| 582 *sp = ' '; | |
| 583 ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL,
NULL); | |
| 584 if (ns == -1) return -1; | |
| 585 if (oldns < ns) { | |
| 586 free(wlst[ns - 1]); | |
| 587 wlst[ns - 1] = mystrdup(candidate); | |
| 588 if (!wlst[ns - 1]) return -1; | |
| 589 } | |
| 590 } | |
| 591 *sp = ' '; | |
| 592 prev = sp + 1; | |
| 593 sp = strchr(prev, ' '); | |
| 594 } | |
| 595 } | |
| 596 r++; // search for the next letter | |
| 597 } | |
| 598 } | |
| 599 #endif | |
| 600 return ns; | |
| 601 } | |
| 602 | |
| 603 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation
-> vacacation) | |
| 604 int SuggestMgr::doubletwochars(char** wlst, const char * word, int ns, int cpdsu
ggest) | |
| 605 { | |
| 606 char candidate[MAXSWUTF8L]; | |
| 607 int state=0; | |
| 608 int wl = strlen(word); | |
| 609 if (wl < 5 || ! pAMgr) return ns; | |
| 610 for (int i=2; i < wl; i++ ) { | |
| 611 if (word[i]==word[i-2]) { | |
| 612 state++; | |
| 613 if (state==3) { | |
| 614 strcpy(candidate,word); | |
| 615 strcpy(candidate+i-1,word+i+1); | |
| 616 ns = testsug(wlst, candidate, wl-2, ns, cpdsuggest, NULL, NULL); | |
| 617 if (ns == -1) return -1; | |
| 618 state=0; | |
| 619 } | |
| 620 } else { | |
| 621 state=0; | |
| 622 } | |
| 623 } | |
| 624 return ns; | |
| 625 } | |
| 626 | |
| 627 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation
-> vacacation) | |
| 628 int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, in
t ns, int cpdsuggest) | |
| 629 { | |
| 630 w_char candidate_utf[MAXSWL]; | |
| 631 char candidate[MAXSWUTF8L]; | |
| 632 int state=0; | |
| 633 if (wl < 5 || ! pAMgr) return ns; | |
| 634 for (int i=2; i < wl; i++) { | |
| 635 if (w_char_eq(word[i], word[i-2])) { | |
| 636 state++; | |
| 637 if (state==3) { | |
| 638 memcpy(candidate_utf, word, (i - 1) * sizeof(w_char)); | |
| 639 memcpy(candidate_utf+i-1, word+i+1, (wl-i-1) * sizeof(w_char)); | |
| 640 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl-2); | |
| 641 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NUL
L, NULL); | |
| 642 if (ns == -1) return -1; | |
| 643 state=0; | |
| 644 } | |
| 645 } else { | |
| 646 state=0; | |
| 647 } | |
| 648 } | |
| 649 return ns; | |
| 650 } | |
| 651 | |
| 652 // error is wrong char in place of correct one (case and keyboard related versio
n) | |
| 653 int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsugge
st) | |
| 654 { | |
| 655 char tmpc; | |
| 656 char candidate[MAXSWUTF8L]; | |
| 657 int wl = strlen(word); | |
| 658 strcpy(candidate, word); | |
| 659 // swap out each char one by one and try uppercase and neighbor | |
| 660 // keyboard chars in its place to see if that makes a good word | |
| 661 | |
| 662 for (int i=0; i < wl; i++) { | |
| 663 tmpc = candidate[i]; | |
| 664 // check with uppercase letters | |
| 665 candidate[i] = csconv[((unsigned char)tmpc)].cupper; | |
| 666 if (tmpc != candidate[i]) { | |
| 667 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 668 if (ns == -1) return -1; | |
| 669 candidate[i] = tmpc; | |
| 670 } | |
| 671 // check neighbor characters in keyboard string | |
| 672 if (!ckey) continue; | |
| 673 char * loc = strchr(ckey, tmpc); | |
| 674 while (loc) { | |
| 675 if ((loc > ckey) && (*(loc - 1) != '|')) { | |
| 676 candidate[i] = *(loc - 1); | |
| 677 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 678 if (ns == -1) return -1; | |
| 679 } | |
| 680 if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) { | |
| 681 candidate[i] = *(loc + 1); | |
| 682 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 683 if (ns == -1) return -1; | |
| 684 } | |
| 685 loc = strchr(loc + 1, tmpc); | |
| 686 } | |
| 687 candidate[i] = tmpc; | |
| 688 } | |
| 689 return ns; | |
| 690 } | |
| 691 | |
| 692 // error is wrong char in place of correct one (case and keyboard related versio
n) | |
| 693 int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns
, int cpdsuggest) | |
| 694 { | |
| 695 w_char tmpc; | |
| 696 w_char candidate_utf[MAXSWL]; | |
| 697 char candidate[MAXSWUTF8L]; | |
| 698 memcpy(candidate_utf, word, wl * sizeof(w_char)); | |
| 699 // swap out each char one by one and try all the tryme | |
| 700 // chars in its place to see if that makes a good word | |
| 701 for (int i=0; i < wl; i++) { | |
| 702 tmpc = candidate_utf[i]; | |
| 703 // check with uppercase letters | |
| 704 mkallcap_utf(candidate_utf + i, 1, langnum); | |
| 705 if (!w_char_eq(tmpc, candidate_utf[i])) { | |
| 706 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 707 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NU
LL); | |
| 708 if (ns == -1) return -1; | |
| 709 candidate_utf[i] = tmpc; | |
| 710 } | |
| 711 // check neighbor characters in keyboard string | |
| 712 if (!ckey) continue; | |
| 713 w_char * loc = ckey_utf; | |
| 714 while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++; | |
| 715 while (loc < (ckey_utf + ckeyl)) { | |
| 716 if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) { | |
| 717 candidate_utf[i] = *(loc - 1); | |
| 718 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 719 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
NULL); | |
| 720 if (ns == -1) return -1; | |
| 721 } | |
| 722 if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE))
{ | |
| 723 candidate_utf[i] = *(loc + 1); | |
| 724 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 725 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
NULL); | |
| 726 if (ns == -1) return -1; | |
| 727 } | |
| 728 do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)
); | |
| 729 } | |
| 730 candidate_utf[i] = tmpc; | |
| 731 } | |
| 732 return ns; | |
| 733 } | |
| 734 | |
| 735 // error is wrong char in place of correct one | |
| 736 int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest) | |
| 737 { | |
| 738 char tmpc; | |
| 739 char candidate[MAXSWUTF8L]; | |
| 740 clock_t timelimit = clock(); | |
| 741 int timer = MINTIMER; | |
| 742 int wl = strlen(word); | |
| 743 strcpy(candidate, word); | |
| 744 // swap out each char one by one and try all the tryme | |
| 745 // chars in its place to see if that makes a good word | |
| 746 for (int j=0; j < ctryl; j++) { | |
| 747 for (int i=wl-1; i >= 0; i--) { | |
| 748 tmpc = candidate[i]; | |
| 749 if (ctry[j] == tmpc) continue; | |
| 750 candidate[i] = ctry[j]; | |
| 751 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit); | |
| 752 if (ns == -1) return -1; | |
| 753 if (!timer) return ns; | |
| 754 candidate[i] = tmpc; | |
| 755 } | |
| 756 } | |
| 757 return ns; | |
| 758 } | |
| 759 | |
| 760 // error is wrong char in place of correct one | |
| 761 int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, i
nt cpdsuggest) | |
| 762 { | |
| 763 w_char tmpc; | |
| 764 w_char candidate_utf[MAXSWL]; | |
| 765 char candidate[MAXSWUTF8L]; | |
| 766 clock_t timelimit = clock(); | |
| 767 int timer = MINTIMER; | |
| 768 memcpy(candidate_utf, word, wl * sizeof(w_char)); | |
| 769 // swap out each char one by one and try all the tryme | |
| 770 // chars in its place to see if that makes a good word | |
| 771 for (int j=0; j < ctryl; j++) { | |
| 772 for (int i=wl-1; i >= 0; i--) { | |
| 773 tmpc = candidate_utf[i]; | |
| 774 if (w_char_eq(tmpc, ctry_utf[j])) continue; | |
| 775 candidate_utf[i] = ctry_utf[j]; | |
| 776 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 777 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer,
&timelimit); | |
| 778 if (ns == -1) return -1; | |
| 779 if (!timer) return ns; | |
| 780 candidate_utf[i] = tmpc; | |
| 781 } | |
| 782 } | |
| 783 return ns; | |
| 784 } | |
| 785 | |
| 786 // error is word has an extra letter it does not need | |
| 787 int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns,
int cpdsuggest) | |
| 788 { | |
| 789 char candidate[MAXSWUTF8L]; | |
| 790 w_char candidate_utf[MAXSWL]; | |
| 791 w_char * p; | |
| 792 w_char tmpc = W_VLINE; // not used value, only for VCC warning message | |
| 793 if (wl < 2) return ns; | |
| 794 // try omitting one char of word at a time | |
| 795 memcpy(candidate_utf, word, wl * sizeof(w_char)); | |
| 796 for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) { | |
| 797 w_char tmpc2 = *p; | |
| 798 if (p < candidate_utf + wl - 1) *p = tmpc; | |
| 799 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); | |
| 800 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NU
LL); | |
| 801 if (ns == -1) return -1; | |
| 802 tmpc = tmpc2; | |
| 803 } | |
| 804 return ns; | |
| 805 } | |
| 806 | |
| 807 // error is word has an extra letter it does not need | |
| 808 int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest
) | |
| 809 { | |
| 810 char tmpc = '\0'; | |
| 811 char candidate[MAXSWUTF8L]; | |
| 812 char * p; | |
| 813 int wl = strlen(word); | |
| 814 if (wl < 2) return ns; | |
| 815 // try omitting one char of word at a time | |
| 816 strcpy (candidate, word); | |
| 817 for (p = candidate + wl - 1; p >=candidate; p--) { | |
| 818 char tmpc2 = *p; | |
| 819 *p = tmpc; | |
| 820 ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL); | |
| 821 if (ns == -1) return -1; | |
| 822 tmpc = tmpc2; | |
| 823 } | |
| 824 return ns; | |
| 825 } | |
| 826 | |
| 827 // error is missing a letter it needs | |
| 828 int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsugge
st) | |
| 829 { | |
| 830 // TODO(rouslan): Remove the interim change below when this patch lands: | |
| 831 // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395 | |
| 832 char candidate[MAXSWUTF8L + 4]; | |
| 833 char * p; | |
| 834 clock_t timelimit = clock(); | |
| 835 int timer = MINTIMER; | |
| 836 int wl = strlen(word); | |
| 837 // try inserting a tryme character before every letter (and the null terminat
or) | |
| 838 for (int i = 0; i < ctryl; i++) { | |
| 839 strcpy(candidate, word); | |
| 840 for (p = candidate + wl; p >= candidate; p--) { | |
| 841 *(p+1) = *p; | |
| 842 *p = ctry[i]; | |
| 843 ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit)
; | |
| 844 if (ns == -1) return -1; | |
| 845 if (!timer) return ns; | |
| 846 } | |
| 847 } | |
| 848 return ns; | |
| 849 } | |
| 850 | |
| 851 // error is missing a letter it needs | |
| 852 int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns
, int cpdsuggest) | |
| 853 { | |
| 854 // TODO(rouslan): Remove the interim change below when this patch lands: | |
| 855 // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395 | |
| 856 w_char candidate_utf[MAXSWL + 1]; | |
| 857 char candidate[MAXSWUTF8L + 4]; | |
| 858 w_char * p; | |
| 859 clock_t timelimit = clock(); | |
| 860 int timer = MINTIMER; | |
| 861 // try inserting a tryme character at the end of the word and before every le
tter | |
| 862 for (int i = 0; i < ctryl; i++) { | |
| 863 memcpy (candidate_utf, word, wl * sizeof(w_char)); | |
| 864 for (p = candidate_utf + wl; p >= candidate_utf; p--) { | |
| 865 *(p + 1) = *p; | |
| 866 *p = ctry_utf[i]; | |
| 867 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); | |
| 868 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer
, &timelimit); | |
| 869 if (ns == -1) return -1; | |
| 870 if (!timer) return ns; | |
| 871 } | |
| 872 } | |
| 873 return ns; | |
| 874 } | |
| 875 | |
| 876 | |
| 877 /* error is should have been two words */ | |
| 878 int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest
) | |
| 879 { | |
| 880 char candidate[MAXSWUTF8L]; | |
| 881 char * p; | |
| 882 int c1, c2; | |
| 883 int forbidden = 0; | |
| 884 int cwrd; | |
| 885 | |
| 886 int wl=strlen(word); | |
| 887 if (wl < 3) return ns; | |
| 888 | |
| 889 if (langnum == LANG_hu) forbidden = check_forbidden(word, wl); | |
| 890 | |
| 891 strcpy(candidate + 1, word); | |
| 892 // split the string into two pieces after every char | |
| 893 // if both pieces are good words make them a suggestion | |
| 894 for (p = candidate + 1; p[1] != '\0'; p++) { | |
| 895 p[-1] = *p; | |
| 896 // go to end of the UTF-8 character | |
| 897 while (utf8 && ((p[1] & 0xc0) == 0x80)) { | |
| 898 *p = p[1]; | |
| 899 p++; | |
| 900 } | |
| 901 if (utf8 && p[1] == '\0') break; // last UTF-8 character | |
| 902 *p = '\0'; | |
| 903 c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL); | |
| 904 if (c1) { | |
| 905 c2 = checkword((p+1),strlen(p+1), cpdsuggest, NULL, NULL); | |
| 906 if (c2) { | |
| 907 *p = ' '; | |
| 908 | |
| 909 // spec. Hungarian code (need a better compound word support) | |
| 910 if ((langnum == LANG_hu) && !forbidden && | |
| 911 // if 3 repeating letter, use - instead of space | |
| 912 (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (
p[-1] == p[2]))) || | |
| 913 // or multiple compounding, with more, than 6 syllables | |
| 914 ((c1 == 3) && (c2 >= 2)))) *p = '-'; | |
| 915 | |
| 916 cwrd = 1; | |
| 917 for (int k=0; k < ns; k++) | |
| 918 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; | |
| 919 if (ns < maxSug) { | |
| 920 if (cwrd) { | |
| 921 wlst[ns] = mystrdup(candidate); | |
| 922 if (wlst[ns] == NULL) return -1; | |
| 923 ns++; | |
| 924 } | |
| 925 } else return ns; | |
| 926 // add two word suggestion with dash, if TRY string contains | |
| 927 // "a" or "-" | |
| 928 // NOTE: cwrd doesn't modified for REP twoword sugg. | |
| 929 if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) && | |
| 930 mystrlen(p + 1) > 1 && | |
| 931 mystrlen(candidate) - mystrlen(p) > 1) { | |
| 932 *p = '-'; | |
| 933 for (int k=0; k < ns; k++) | |
| 934 if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; | |
| 935 if (ns < maxSug) { | |
| 936 if (cwrd) { | |
| 937 wlst[ns] = mystrdup(candidate); | |
| 938 if (wlst[ns] == NULL) return -1; | |
| 939 ns++; | |
| 940 } | |
| 941 } else return ns; | |
| 942 } | |
| 943 } | |
| 944 } | |
| 945 } | |
| 946 return ns; | |
| 947 } | |
| 948 | |
| 949 | |
| 950 // error is adjacent letter were swapped | |
| 951 int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest
) | |
| 952 { | |
| 953 char candidate[MAXSWUTF8L]; | |
| 954 char * p; | |
| 955 char tmpc; | |
| 956 int wl=strlen(word); | |
| 957 // try swapping adjacent chars one by one | |
| 958 strcpy(candidate, word); | |
| 959 for (p = candidate; p[1] != 0; p++) { | |
| 960 tmpc = *p; | |
| 961 *p = p[1]; | |
| 962 p[1] = tmpc; | |
| 963 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 964 if (ns == -1) return -1; | |
| 965 p[1] = *p; | |
| 966 *p = tmpc; | |
| 967 } | |
| 968 // try double swaps for short words | |
| 969 // ahev -> have, owudl -> would | |
| 970 if (wl == 4 || wl == 5) { | |
| 971 candidate[0] = word[1]; | |
| 972 candidate[1] = word[0]; | |
| 973 candidate[2] = word[2]; | |
| 974 candidate[wl - 2] = word[wl - 1]; | |
| 975 candidate[wl - 1] = word[wl - 2]; | |
| 976 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 977 if (ns == -1) return -1; | |
| 978 if (wl == 5) { | |
| 979 candidate[0] = word[0]; | |
| 980 candidate[1] = word[2]; | |
| 981 candidate[2] = word[1]; | |
| 982 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 983 if (ns == -1) return -1; | |
| 984 } | |
| 985 } | |
| 986 return ns; | |
| 987 } | |
| 988 | |
| 989 // error is adjacent letter were swapped | |
| 990 int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns,
int cpdsuggest) | |
| 991 { | |
| 992 w_char candidate_utf[MAXSWL]; | |
| 993 char candidate[MAXSWUTF8L]; | |
| 994 w_char * p; | |
| 995 w_char tmpc; | |
| 996 int len = 0; | |
| 997 // try swapping adjacent chars one by one | |
| 998 memcpy (candidate_utf, word, wl * sizeof(w_char)); | |
| 999 for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) { | |
| 1000 tmpc = *p; | |
| 1001 *p = p[1]; | |
| 1002 p[1] = tmpc; | |
| 1003 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 1004 if (len == 0) len = strlen(candidate); | |
| 1005 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); | |
| 1006 if (ns == -1) return -1; | |
| 1007 p[1] = *p; | |
| 1008 *p = tmpc; | |
| 1009 } | |
| 1010 // try double swaps for short words | |
| 1011 // ahev -> have, owudl -> would, suodn -> sound | |
| 1012 if (wl == 4 || wl == 5) { | |
| 1013 candidate_utf[0] = word[1]; | |
| 1014 candidate_utf[1] = word[0]; | |
| 1015 candidate_utf[2] = word[2]; | |
| 1016 candidate_utf[wl - 2] = word[wl - 1]; | |
| 1017 candidate_utf[wl - 1] = word[wl - 2]; | |
| 1018 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 1019 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); | |
| 1020 if (ns == -1) return -1; | |
| 1021 if (wl == 5) { | |
| 1022 candidate_utf[0] = word[0]; | |
| 1023 candidate_utf[1] = word[2]; | |
| 1024 candidate_utf[2] = word[1]; | |
| 1025 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 1026 ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); | |
| 1027 if (ns == -1) return -1; | |
| 1028 } | |
| 1029 } | |
| 1030 return ns; | |
| 1031 } | |
| 1032 | |
| 1033 // error is not adjacent letter were swapped | |
| 1034 int SuggestMgr::longswapchar(char ** wlst, const char * word, int ns, int cpdsug
gest) | |
| 1035 { | |
| 1036 char candidate[MAXSWUTF8L]; | |
| 1037 char * p; | |
| 1038 char * q; | |
| 1039 char tmpc; | |
| 1040 int wl=strlen(word); | |
| 1041 // try swapping not adjacent chars one by one | |
| 1042 strcpy(candidate, word); | |
| 1043 for (p = candidate; *p != 0; p++) { | |
| 1044 for (q = candidate; *q != 0; q++) { | |
| 1045 if (abs((int)(p-q)) > 1) { | |
| 1046 tmpc = *p; | |
| 1047 *p = *q; | |
| 1048 *q = tmpc; | |
| 1049 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 1050 if (ns == -1) return -1; | |
| 1051 *q = *p; | |
| 1052 *p = tmpc; | |
| 1053 } | |
| 1054 } | |
| 1055 } | |
| 1056 return ns; | |
| 1057 } | |
| 1058 | |
| 1059 | |
| 1060 // error is adjacent letter were swapped | |
| 1061 int SuggestMgr::longswapchar_utf(char ** wlst, const w_char * word, int wl, int
ns, int cpdsuggest) | |
| 1062 { | |
| 1063 w_char candidate_utf[MAXSWL]; | |
| 1064 char candidate[MAXSWUTF8L]; | |
| 1065 w_char * p; | |
| 1066 w_char * q; | |
| 1067 w_char tmpc; | |
| 1068 // try swapping not adjacent chars | |
| 1069 memcpy (candidate_utf, word, wl * sizeof(w_char)); | |
| 1070 for (p = candidate_utf; p < (candidate_utf + wl); p++) { | |
| 1071 for (q = candidate_utf; q < (candidate_utf + wl); q++) { | |
| 1072 if (abs((int)(p-q)) > 1) { | |
| 1073 tmpc = *p; | |
| 1074 *p = *q; | |
| 1075 *q = tmpc; | |
| 1076 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 1077 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
NULL); | |
| 1078 if (ns == -1) return -1; | |
| 1079 *q = *p; | |
| 1080 *p = tmpc; | |
| 1081 } | |
| 1082 } | |
| 1083 } | |
| 1084 return ns; | |
| 1085 } | |
| 1086 | |
| 1087 // error is a letter was moved | |
| 1088 int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest
) | |
| 1089 { | |
| 1090 char candidate[MAXSWUTF8L]; | |
| 1091 char * p; | |
| 1092 char * q; | |
| 1093 char tmpc; | |
| 1094 | |
| 1095 int wl=strlen(word); | |
| 1096 // try moving a char | |
| 1097 strcpy(candidate, word); | |
| 1098 for (p = candidate; *p != 0; p++) { | |
| 1099 for (q = p + 1; (*q != 0) && ((q - p) < 10); q++) { | |
| 1100 tmpc = *(q-1); | |
| 1101 *(q-1) = *q; | |
| 1102 *q = tmpc; | |
| 1103 if ((q-p) < 2) continue; // omit swap char | |
| 1104 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 1105 if (ns == -1) return -1; | |
| 1106 } | |
| 1107 strcpy(candidate, word); | |
| 1108 } | |
| 1109 for (p = candidate + wl - 1; p > candidate; p--) { | |
| 1110 for (q = p - 1; (q >= candidate) && ((p - q) < 10); q--) { | |
| 1111 tmpc = *(q+1); | |
| 1112 *(q+1) = *q; | |
| 1113 *q = tmpc; | |
| 1114 if ((p-q) < 2) continue; // omit swap char | |
| 1115 ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); | |
| 1116 if (ns == -1) return -1; | |
| 1117 } | |
| 1118 strcpy(candidate, word); | |
| 1119 } | |
| 1120 return ns; | |
| 1121 } | |
| 1122 | |
| 1123 // error is a letter was moved | |
| 1124 int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns,
int cpdsuggest) | |
| 1125 { | |
| 1126 w_char candidate_utf[MAXSWL]; | |
| 1127 char candidate[MAXSWUTF8L]; | |
| 1128 w_char * p; | |
| 1129 w_char * q; | |
| 1130 w_char tmpc; | |
| 1131 // try moving a char | |
| 1132 memcpy (candidate_utf, word, wl * sizeof(w_char)); | |
| 1133 for (p = candidate_utf; p < (candidate_utf + wl); p++) { | |
| 1134 for (q = p + 1; (q < (candidate_utf + wl)) && ((q - p) < 10); q++) { | |
| 1135 tmpc = *(q-1); | |
| 1136 *(q-1) = *q; | |
| 1137 *q = tmpc; | |
| 1138 if ((q-p) < 2) continue; // omit swap char | |
| 1139 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 1140 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
NULL); | |
| 1141 if (ns == -1) return -1; | |
| 1142 } | |
| 1143 memcpy (candidate_utf, word, wl * sizeof(w_char)); | |
| 1144 } | |
| 1145 for (p = candidate_utf + wl - 1; p > candidate_utf; p--) { | |
| 1146 for (q = p - 1; (q >= candidate_utf) && ((p - q) < 10); q--) { | |
| 1147 tmpc = *(q+1); | |
| 1148 *(q+1) = *q; | |
| 1149 *q = tmpc; | |
| 1150 if ((p-q) < 2) continue; // omit swap char | |
| 1151 u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); | |
| 1152 ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
NULL); | |
| 1153 if (ns == -1) return -1; | |
| 1154 } | |
| 1155 memcpy (candidate_utf, word, wl * sizeof(w_char)); | |
| 1156 } | |
| 1157 return ns; | |
| 1158 } | |
| 1159 | |
| 1160 // generate a set of suggestions for very poorly spelled words | |
| 1161 int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md
) | |
| 1162 { | |
| 1163 | |
| 1164 int i, j; | |
| 1165 int lval; | |
| 1166 int sc, scphon; | |
| 1167 int lp, lpphon; | |
| 1168 int nonbmp = 0; | |
| 1169 | |
| 1170 // exhaustively search through all root words | |
| 1171 // keeping track of the MAX_ROOTS most similar root words | |
| 1172 struct hentry * roots[MAX_ROOTS]; | |
| 1173 char * rootsphon[MAX_ROOTS]; | |
| 1174 int scores[MAX_ROOTS]; | |
| 1175 int scoresphon[MAX_ROOTS]; | |
| 1176 for (i = 0; i < MAX_ROOTS; i++) { | |
| 1177 roots[i] = NULL; | |
| 1178 scores[i] = -100 * i; | |
| 1179 rootsphon[i] = NULL; | |
| 1180 scoresphon[i] = -100 * i; | |
| 1181 } | |
| 1182 lp = MAX_ROOTS - 1; | |
| 1183 lpphon = MAX_ROOTS - 1; | |
| 1184 scphon = -20000; | |
| 1185 int low = NGRAM_LOWERING; | |
| 1186 | |
| 1187 char w2[MAXWORDUTF8LEN]; | |
| 1188 char f[MAXSWUTF8L]; | |
| 1189 char * word = w; | |
| 1190 | |
| 1191 // word reversing wrapper for complex prefixes | |
| 1192 if (complexprefixes) { | |
| 1193 strcpy(w2, w); | |
| 1194 if (utf8) reverseword_utf(w2); else reverseword(w2); | |
| 1195 word = w2; | |
| 1196 } | |
| 1197 | |
| 1198 char mw[MAXSWUTF8L]; | |
| 1199 w_char u8[MAXSWL]; | |
| 1200 int nc = strlen(word); | |
| 1201 int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc; | |
| 1202 | |
| 1203 // set character based ngram suggestion for words with non-BMP Unicode charact
ers | |
| 1204 if (n == -1) { | |
| 1205 utf8 = 0; // XXX not state-free | |
| 1206 n = nc; | |
| 1207 nonbmp = 1; | |
| 1208 low = 0; | |
| 1209 } | |
| 1210 | |
| 1211 struct hentry* hp = NULL; | |
| 1212 int col = -1; | |
| 1213 #ifdef HUNSPELL_CHROME_CLIENT | |
| 1214 ScopedHashEntryFactory hash_entry_factory; | |
| 1215 #endif | |
| 1216 phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; | |
| 1217 char target[MAXSWUTF8L]; | |
| 1218 char candidate[MAXSWUTF8L]; | |
| 1219 if (ph) { | |
| 1220 if (utf8) { | |
| 1221 w_char _w[MAXSWL]; | |
| 1222 int _wl = u8_u16(_w, MAXSWL, word); | |
| 1223 mkallcap_utf(_w, _wl, langnum); | |
| 1224 u16_u8(candidate, MAXSWUTF8L, _w, _wl); | |
| 1225 } else { | |
| 1226 strcpy(candidate, word); | |
| 1227 if (!nonbmp) mkallcap(candidate, csconv); | |
| 1228 } | |
| 1229 phonet(candidate, target, nc, *ph); // XXX phonet() is 8-bit (nc, not n) | |
| 1230 } | |
| 1231 | |
| 1232 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL; | |
| 1233 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL; | |
| 1234 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; | |
| 1235 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL; | |
| 1236 | |
| 1237 for (i = 0; i < md; i++) { | |
| 1238 while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) { | |
| 1239 if ((hp->astr) && (pAMgr) && | |
| 1240 (TESTAFF(hp->astr, forbiddenword, hp->alen) || | |
| 1241 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || | |
| 1242 TESTAFF(hp->astr, nosuggest, hp->alen) || | |
| 1243 TESTAFF(hp->astr, nongramsuggest, hp->alen) || | |
| 1244 TESTAFF(hp->astr, onlyincompound, hp->alen))) continue; | |
| 1245 | |
| 1246 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + | |
| 1247 leftcommonsubstring(word, HENTRY_WORD(hp)); | |
| 1248 | |
| 1249 // check special pronounciation | |
| 1250 if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { | |
| 1251 int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + | |
| 1252 + leftcommonsubstring(word, f); | |
| 1253 if (sc2 > sc) sc = sc2; | |
| 1254 } | |
| 1255 | |
| 1256 scphon = -20000; | |
| 1257 if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) { | |
| 1258 char target2[MAXSWUTF8L]; | |
| 1259 if (utf8) { | |
| 1260 w_char _w[MAXSWL]; | |
| 1261 int _wl = u8_u16(_w, MAXSWL, HENTRY_WORD(hp)); | |
| 1262 mkallcap_utf(_w, _wl, langnum); | |
| 1263 u16_u8(candidate, MAXSWUTF8L, _w, _wl); | |
| 1264 } else { | |
| 1265 strcpy(candidate, HENTRY_WORD(hp)); | |
| 1266 mkallcap(candidate, csconv); | |
| 1267 } | |
| 1268 phonet(candidate, target2, -1, *ph); | |
| 1269 scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); | |
| 1270 } | |
| 1271 | |
| 1272 if (sc > scores[lp]) { | |
| 1273 scores[lp] = sc; | |
| 1274 #ifdef HUNSPELL_CHROME_CLIENT | |
| 1275 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp); | |
| 1276 #else | |
| 1277 roots[lp] = hp; | |
| 1278 #endif | |
| 1279 lval = sc; | |
| 1280 for (j=0; j < MAX_ROOTS; j++) | |
| 1281 if (scores[j] < lval) { | |
| 1282 lp = j; | |
| 1283 lval = scores[j]; | |
| 1284 } | |
| 1285 } | |
| 1286 | |
| 1287 | |
| 1288 if (scphon > scoresphon[lpphon]) { | |
| 1289 scoresphon[lpphon] = scphon; | |
| 1290 rootsphon[lpphon] = HENTRY_WORD(hp); | |
| 1291 lval = scphon; | |
| 1292 for (j=0; j < MAX_ROOTS; j++) | |
| 1293 if (scoresphon[j] < lval) { | |
| 1294 lpphon = j; | |
| 1295 lval = scoresphon[j]; | |
| 1296 } | |
| 1297 } | |
| 1298 }} | |
| 1299 | |
| 1300 // find minimum threshold for a passable suggestion | |
| 1301 // mangle original word three differnt ways | |
| 1302 // and score them to generate a minimum acceptable score | |
| 1303 int thresh = 0; | |
| 1304 for (int sp = 1; sp < 4; sp++) { | |
| 1305 if (utf8) { | |
| 1306 for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*'; | |
| 1307 u16_u8(mw, MAXSWUTF8L, u8, n); | |
| 1308 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); | |
| 1309 } else { | |
| 1310 strcpy(mw, word); | |
| 1311 for (int k=sp; k < n; k+=4) *(mw + k) = '*'; | |
| 1312 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); | |
| 1313 } | |
| 1314 } | |
| 1315 thresh = thresh / 3; | |
| 1316 thresh--; | |
| 1317 | |
| 1318 // now expand affixes on each of these root words and | |
| 1319 // and use length adjusted ngram scores to select | |
| 1320 // possible suggestions | |
| 1321 char * guess[MAX_GUESS]; | |
| 1322 char * guessorig[MAX_GUESS]; | |
| 1323 int gscore[MAX_GUESS]; | |
| 1324 for(i=0;i<MAX_GUESS;i++) { | |
| 1325 guess[i] = NULL; | |
| 1326 guessorig[i] = NULL; | |
| 1327 gscore[i] = -100 * i; | |
| 1328 } | |
| 1329 | |
| 1330 lp = MAX_GUESS - 1; | |
| 1331 | |
| 1332 struct guessword * glst; | |
| 1333 glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword)); | |
| 1334 if (! glst) { | |
| 1335 if (nonbmp) utf8 = 1; | |
| 1336 return ns; | |
| 1337 } | |
| 1338 | |
| 1339 for (i = 0; i < MAX_ROOTS; i++) { | |
| 1340 if (roots[i]) { | |
| 1341 struct hentry * rp = roots[i]; | |
| 1342 int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->bl
en, | |
| 1343 rp->astr, rp->alen, word, nc, | |
| 1344 ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MOR
PH_PHON) : NULL)); | |
| 1345 | |
| 1346 for (int k = 0; k < nw ; k++) { | |
| 1347 sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) + | |
| 1348 leftcommonsubstring(word, glst[k].word); | |
| 1349 | |
| 1350 if (sc > thresh) { | |
| 1351 if (sc > gscore[lp]) { | |
| 1352 if (guess[lp]) { | |
| 1353 free (guess[lp]); | |
| 1354 if (guessorig[lp]) { | |
| 1355 free(guessorig[lp]); | |
| 1356 guessorig[lp] = NULL; | |
| 1357 } | |
| 1358 } | |
| 1359 gscore[lp] = sc; | |
| 1360 guess[lp] = glst[k].word; | |
| 1361 guessorig[lp] = glst[k].orig; | |
| 1362 lval = sc; | |
| 1363 for (j=0; j < MAX_GUESS; j++) | |
| 1364 if (gscore[j] < lval) { | |
| 1365 lp = j; | |
| 1366 lval = gscore[j]; | |
| 1367 } | |
| 1368 } else { | |
| 1369 free(glst[k].word); | |
| 1370 if (glst[k].orig) free(glst[k].orig); | |
| 1371 } | |
| 1372 } else { | |
| 1373 free(glst[k].word); | |
| 1374 if (glst[k].orig) free(glst[k].orig); | |
| 1375 } | |
| 1376 } | |
| 1377 } | |
| 1378 } | |
| 1379 free(glst); | |
| 1380 | |
| 1381 // now we are done generating guesses | |
| 1382 // sort in order of decreasing score | |
| 1383 | |
| 1384 | |
| 1385 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); | |
| 1386 if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); | |
| 1387 | |
| 1388 // weight suggestions with a similarity index, based on | |
| 1389 // the longest common subsequent algorithm and resort | |
| 1390 | |
| 1391 int is_swap = 0; | |
| 1392 int re = 0; | |
| 1393 double fact = 1.0; | |
| 1394 if (pAMgr) { | |
| 1395 int maxd = pAMgr->get_maxdiff(); | |
| 1396 if (maxd >= 0) fact = (10.0 - maxd)/5.0; | |
| 1397 } | |
| 1398 | |
| 1399 for (i=0; i < MAX_GUESS; i++) { | |
| 1400 if (guess[i]) { | |
| 1401 // lowering guess[i] | |
| 1402 char gl[MAXSWUTF8L]; | |
| 1403 int len; | |
| 1404 if (utf8) { | |
| 1405 w_char _w[MAXSWL]; | |
| 1406 len = u8_u16(_w, MAXSWL, guess[i]); | |
| 1407 mkallsmall_utf(_w, len, langnum); | |
| 1408 u16_u8(gl, MAXSWUTF8L, _w, len); | |
| 1409 } else { | |
| 1410 strcpy(gl, guess[i]); | |
| 1411 if (!nonbmp) mkallsmall(gl, csconv); | |
| 1412 len = strlen(guess[i]); | |
| 1413 } | |
| 1414 | |
| 1415 int _lcs = lcslen(word, gl); | |
| 1416 | |
| 1417 // same characters with different casing | |
| 1418 if ((n == len) && (n == _lcs)) { | |
| 1419 gscore[i] += 2000; | |
| 1420 break; | |
| 1421 } | |
| 1422 // using 2-gram instead of 3, and other weightening | |
| 1423 | |
| 1424 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + | |
| 1425 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); | |
| 1426 | |
| 1427 gscore[i] = | |
| 1428 // length of longest common subsequent minus length difference | |
| 1429 2 * _lcs - abs((int) (n - len)) + | |
| 1430 // weight length of the left common substring | |
| 1431 leftcommonsubstring(word, gl) + | |
| 1432 // weight equal character positions | |
| 1433 (!nonbmp && commoncharacterpositions(word, gl, &is_swap) ? 1: 0) + | |
| 1434 // swap character (not neighboring) | |
| 1435 ((is_swap) ? 10 : 0) + | |
| 1436 // ngram | |
| 1437 ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) + | |
| 1438 // weighted ngrams | |
| 1439 re + | |
| 1440 // different limit for dictionaries with PHONE rules | |
| 1441 (ph ? (re < len * fact ? -1000 : 0) : (re < (n + len)*fact? -1000 : 0)
); | |
| 1442 } | |
| 1443 } | |
| 1444 | |
| 1445 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); | |
| 1446 | |
| 1447 // phonetic version | |
| 1448 if (ph) for (i=0; i < MAX_ROOTS; i++) { | |
| 1449 if (rootsphon[i]) { | |
| 1450 // lowering rootphon[i] | |
| 1451 char gl[MAXSWUTF8L]; | |
| 1452 int len; | |
| 1453 if (utf8) { | |
| 1454 w_char _w[MAXSWL]; | |
| 1455 len = u8_u16(_w, MAXSWL, rootsphon[i]); | |
| 1456 mkallsmall_utf(_w, len, langnum); | |
| 1457 u16_u8(gl, MAXSWUTF8L, _w, len); | |
| 1458 } else { | |
| 1459 strcpy(gl, rootsphon[i]); | |
| 1460 if (!nonbmp) mkallsmall(gl, csconv); | |
| 1461 len = strlen(rootsphon[i]); | |
| 1462 } | |
| 1463 | |
| 1464 // heuristic weigthing of ngram scores | |
| 1465 scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) + | |
| 1466 // weight length of the left common substring | |
| 1467 leftcommonsubstring(word, gl); | |
| 1468 } | |
| 1469 } | |
| 1470 | |
| 1471 if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); | |
| 1472 | |
| 1473 // copy over | |
| 1474 int oldns = ns; | |
| 1475 | |
| 1476 int same = 0; | |
| 1477 for (i=0; i < MAX_GUESS; i++) { | |
| 1478 if (guess[i]) { | |
| 1479 if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] >
1000))) { | |
| 1480 int unique = 1; | |
| 1481 // leave only excellent suggestions, if exists | |
| 1482 if (gscore[i] > 1000) same = 1; else if (gscore[i] < -100) { | |
| 1483 same = 1; | |
| 1484 // keep the best ngram suggestions, unless in ONLYMAXDIFF mode | |
| 1485 if (ns > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) { | |
| 1486 free(guess[i]); | |
| 1487 if (guessorig[i]) free(guessorig[i]); | |
| 1488 continue; | |
| 1489 } | |
| 1490 } | |
| 1491 for (j = 0; j < ns; j++) { | |
| 1492 // don't suggest previous suggestions or a previous suggestion with pr
efixes or affixes | |
| 1493 if ((!guessorig[i] && strstr(guess[i], wlst[j])) || | |
| 1494 (guessorig[i] && strstr(guessorig[i], wlst[j])) || | |
| 1495 // check forbidden words | |
| 1496 !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0; | |
| 1497 } | |
| 1498 if (unique) { | |
| 1499 wlst[ns++] = guess[i]; | |
| 1500 if (guessorig[i]) { | |
| 1501 free(guess[i]); | |
| 1502 wlst[ns-1] = guessorig[i]; | |
| 1503 } | |
| 1504 } else { | |
| 1505 free(guess[i]); | |
| 1506 if (guessorig[i]) free(guessorig[i]); | |
| 1507 } | |
| 1508 } else { | |
| 1509 free(guess[i]); | |
| 1510 if (guessorig[i]) free(guessorig[i]); | |
| 1511 } | |
| 1512 } | |
| 1513 } | |
| 1514 | |
| 1515 oldns = ns; | |
| 1516 if (ph) for (i=0; i < MAX_ROOTS; i++) { | |
| 1517 if (rootsphon[i]) { | |
| 1518 if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) { | |
| 1519 int unique = 1; | |
| 1520 for (j = 0; j < ns; j++) { | |
| 1521 // don't suggest previous suggestions or a previous suggestion with pr
efixes or affixes | |
| 1522 if (strstr(rootsphon[i], wlst[j]) || | |
| 1523 // check forbidden words | |
| 1524 !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) uniqu
e = 0; | |
| 1525 } | |
| 1526 if (unique) { | |
| 1527 wlst[ns++] = mystrdup(rootsphon[i]); | |
| 1528 if (!wlst[ns - 1]) return ns - 1; | |
| 1529 } | |
| 1530 } | |
| 1531 } | |
| 1532 } | |
| 1533 | |
| 1534 if (nonbmp) utf8 = 1; | |
| 1535 return ns; | |
| 1536 } | |
| 1537 | |
| 1538 | |
| 1539 // see if a candidate suggestion is spelled correctly | |
| 1540 // needs to check both root words and words with affixes | |
| 1541 | |
| 1542 // obsolote MySpell-HU modifications: | |
| 1543 // return value 2 and 3 marks compounding with hyphen (-) | |
| 1544 // `3' marks roots without suffix | |
| 1545 int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time
r, clock_t * timelimit) | |
| 1546 { | |
| 1547 struct hentry * rv=NULL; | |
| 1548 struct hentry * rv2=NULL; | |
| 1549 int nosuffix = 0; | |
| 1550 | |
| 1551 // check time limit | |
| 1552 if (timer) { | |
| 1553 (*timer)--; | |
| 1554 if (!(*timer) && timelimit) { | |
| 1555 if ((clock() - *timelimit) > TIMELIMIT) return 0; | |
| 1556 *timer = MAXPLUSTIMER; | |
| 1557 } | |
| 1558 } | |
| 1559 | |
| 1560 if (pAMgr) { | |
| 1561 if (cpdsuggest==1) { | |
| 1562 if (pAMgr->get_compound()) { | |
| 1563 rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1, 0); //EX
T | |
| 1564 if (rv && (!(rv2 = pAMgr->lookup(word)) || !rv2->astr || | |
| 1565 !(TESTAFF(rv2->astr,pAMgr->get_forbiddenword(),rv2->alen) || | |
| 1566 TESTAFF(rv2->astr,pAMgr->get_nosuggest(),rv2->alen)))) return 3; //
XXX obsolote categorisation + only ICONV needs affix flag check? | |
| 1567 } | |
| 1568 return 0; | |
| 1569 } | |
| 1570 | |
| 1571 rv = pAMgr->lookup(word); | |
| 1572 | |
| 1573 if (rv) { | |
| 1574 if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) | |
| 1575 || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0; | |
| 1576 while (rv) { | |
| 1577 if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) |
| | |
| 1578 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || | |
| 1579 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { | |
| 1580 rv = rv->next_homonym; | |
| 1581 } else break; | |
| 1582 } | |
| 1583 } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix +
suffix XXX | |
| 1584 | |
| 1585 if (rv) { | |
| 1586 nosuffix=1; | |
| 1587 } else { | |
| 1588 rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suf
fix | |
| 1589 } | |
| 1590 | |
| 1591 if (!rv && pAMgr->have_contclass()) { | |
| 1592 rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL); | |
| 1593 if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL); | |
| 1594 } | |
| 1595 | |
| 1596 // check forbidden words | |
| 1597 if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->a
len) || | |
| 1598 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || | |
| 1599 TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || | |
| 1600 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0; | |
| 1601 | |
| 1602 if (rv) { // XXX obsolote | |
| 1603 if ((pAMgr->get_compoundflag()) && | |
| 1604 TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nos
uffix; | |
| 1605 return 1; | |
| 1606 } | |
| 1607 } | |
| 1608 return 0; | |
| 1609 } | |
| 1610 | |
| 1611 int SuggestMgr::check_forbidden(const char * word, int len) | |
| 1612 { | |
| 1613 struct hentry * rv = NULL; | |
| 1614 | |
| 1615 if (pAMgr) { | |
| 1616 rv = pAMgr->lookup(word); | |
| 1617 if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || | |
| 1618 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; | |
| 1619 if (!(pAMgr->prefix_check(word,len,1))) | |
| 1620 rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+su
ffix, suffix | |
| 1621 // check forbidden words | |
| 1622 if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->al
en)) return 1; | |
| 1623 } | |
| 1624 return 0; | |
| 1625 } | |
| 1626 | |
| 1627 #ifdef HUNSPELL_EXPERIMENTAL | |
| 1628 // suggest possible stems | |
| 1629 int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) | |
| 1630 { | |
| 1631 char ** wlst; | |
| 1632 | |
| 1633 struct hentry * rv = NULL; | |
| 1634 | |
| 1635 char w2[MAXSWUTF8L]; | |
| 1636 const char * word = w; | |
| 1637 | |
| 1638 // word reversing wrapper for complex prefixes | |
| 1639 if (complexprefixes) { | |
| 1640 strcpy(w2, w); | |
| 1641 if (utf8) reverseword_utf(w2); else reverseword(w2); | |
| 1642 word = w2; | |
| 1643 } | |
| 1644 | |
| 1645 int wl = strlen(word); | |
| 1646 | |
| 1647 | |
| 1648 if (*slst) { | |
| 1649 wlst = *slst; | |
| 1650 } else { | |
| 1651 wlst = (char **) calloc(maxSug, sizeof(char *)); | |
| 1652 if (wlst == NULL) return -1; | |
| 1653 } | |
| 1654 | |
| 1655 rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug); | |
| 1656 | |
| 1657 // delete dash from end of word | |
| 1658 if (nsug > 0) { | |
| 1659 for (int j=0; j < nsug; j++) { | |
| 1660 if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1
] = '\0'; | |
| 1661 } | |
| 1662 } | |
| 1663 | |
| 1664 *slst = wlst; | |
| 1665 return nsug; | |
| 1666 } | |
| 1667 #endif // END OF HUNSPELL_EXPERIMENTAL CODE | |
| 1668 | |
| 1669 | |
| 1670 char * SuggestMgr::suggest_morph(const char * w) | |
| 1671 { | |
| 1672 char result[MAXLNLEN]; | |
| 1673 char * r = (char *) result; | |
| 1674 char * st; | |
| 1675 | |
| 1676 struct hentry * rv = NULL; | |
| 1677 | |
| 1678 *result = '\0'; | |
| 1679 | |
| 1680 if (! pAMgr) return NULL; | |
| 1681 | |
| 1682 char w2[MAXSWUTF8L]; | |
| 1683 const char * word = w; | |
| 1684 | |
| 1685 // word reversing wrapper for complex prefixes | |
| 1686 if (complexprefixes) { | |
| 1687 strcpy(w2, w); | |
| 1688 if (utf8) reverseword_utf(w2); else reverseword(w2); | |
| 1689 word = w2; | |
| 1690 } | |
| 1691 | |
| 1692 rv = pAMgr->lookup(word); | |
| 1693 | |
| 1694 while (rv) { | |
| 1695 if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->a
len) || | |
| 1696 TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || | |
| 1697 TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { | |
| 1698 if (!HENTRY_FIND(rv, MORPH_STEM)) { | |
| 1699 mystrcat(result, " ", MAXLNLEN);
| |
| 1700 mystrcat(result, MORPH_STEM, MAXLNLEN); | |
| 1701 mystrcat(result, word, MAXLNLEN); | |
| 1702 } | |
| 1703 if (HENTRY_DATA(rv)) { | |
| 1704 mystrcat(result, " ", MAXLNLEN);
| |
| 1705 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); | |
| 1706 } | |
| 1707 mystrcat(result, "\n", MAXLNLEN); | |
| 1708 } | |
| 1709 rv = rv->next_homonym; | |
| 1710 } | |
| 1711 | |
| 1712 st = pAMgr->affix_check_morph(word,strlen(word)); | |
| 1713 if (st) { | |
| 1714 mystrcat(result, st, MAXLNLEN); | |
| 1715 free(st); | |
| 1716 } | |
| 1717 | |
| 1718 if (pAMgr->get_compound() && (*result == '\0')) | |
| 1719 pAMgr->compound_check_morph(word, strlen(word), | |
| 1720 0, 0, 100, 0,NULL, 0, &r, NULL); | |
| 1721 | |
| 1722 return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL; | |
| 1723 } | |
| 1724 | |
| 1725 #ifdef HUNSPELL_EXPERIMENTAL | |
| 1726 char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) | |
| 1727 { | |
| 1728 char * p = NULL; | |
| 1729 char ** wlst = (char **) calloc(maxSug, sizeof(char *)); | |
| 1730 if (!**wlst) return NULL; | |
| 1731 // we will use only the first suggestion | |
| 1732 for (int i = 0; i < maxSug - 1; i++) wlst[i] = ""; | |
| 1733 int ns = suggest(&wlst, word, maxSug - 1, NULL); | |
| 1734 if (ns == maxSug) { | |
| 1735 p = suggest_morph(wlst[maxSug - 1]); | |
| 1736 free(wlst[maxSug - 1]); | |
| 1737 } | |
| 1738 if (wlst) free(wlst); | |
| 1739 return p; | |
| 1740 } | |
| 1741 #endif // END OF HUNSPELL_EXPERIMENTAL CODE | |
| 1742 | |
| 1743 /* affixation */ | |
| 1744 char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern) | |
| 1745 { | |
| 1746 char result[MAXLNLEN]; | |
| 1747 *result = '\0'; | |
| 1748 int sfxcount = get_sfxcount(pattern); | |
| 1749 | |
| 1750 if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; | |
| 1751 | |
| 1752 if (HENTRY_DATA(rv)) { | |
| 1753 char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->al
en, | |
| 1754 HENTRY_DATA(rv), pattern, 0); | |
| 1755 if (aff) { | |
| 1756 mystrcat(result, aff, MAXLNLEN); | |
| 1757 mystrcat(result, "\n", MAXLNLEN); | |
| 1758 free(aff); | |
| 1759 } | |
| 1760 } | |
| 1761 | |
| 1762 // check all allomorphs | |
| 1763 char allomorph[MAXLNLEN]; | |
| 1764 char * p = NULL; | |
| 1765 if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH); | |
| 1766 while (p) { | |
| 1767 struct hentry * rv2 = NULL; | |
| 1768 p += MORPH_TAG_LEN; | |
| 1769 int plen = fieldlen(p); | |
| 1770 strncpy(allomorph, p, plen); | |
| 1771 allomorph[plen] = '\0'; | |
| 1772 rv2 = pAMgr->lookup(allomorph); | |
| 1773 while (rv2) { | |
| 1774 // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount
) { | |
| 1775 if (HENTRY_DATA(rv2)) { | |
| 1776 char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM); | |
| 1777 if (st && (strncmp(st + MORPH_TAG_LEN, | |
| 1778 HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) { | |
| 1779 char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv
2->astr, rv2->alen, | |
| 1780 HENTRY_DATA(rv2), pattern, 0); | |
| 1781 if (aff) { | |
| 1782 mystrcat(result, aff, MAXLNLEN); | |
| 1783 mystrcat(result, "\n", MAXLNLEN); | |
| 1784 free(aff); | |
| 1785 } | |
| 1786 } | |
| 1787 } | |
| 1788 rv2 = rv2->next_homonym; | |
| 1789 } | |
| 1790 p = strstr(p + plen, MORPH_ALLOMORPH); | |
| 1791 } | |
| 1792 | |
| 1793 return (*result) ? mystrdup(result) : NULL; | |
| 1794 } | |
| 1795 | |
| 1796 char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) { | |
| 1797 char result[MAXLNLEN]; | |
| 1798 char result2[MAXLNLEN]; | |
| 1799 char newpattern[MAXLNLEN]; | |
| 1800 *newpattern = '\0'; | |
| 1801 if (n == 0) return 0; | |
| 1802 *result2 = '\0'; | |
| 1803 struct hentry * rv = NULL; | |
| 1804 if (!pAMgr) return NULL; | |
| 1805 | |
| 1806 // search affixed forms with and without derivational suffixes | |
| 1807 while(1) { | |
| 1808 | |
| 1809 for (int k = 0; k < n; k++) { | |
| 1810 *result = '\0'; | |
| 1811 // add compound word parts (except the last one) | |
| 1812 char * s = (char *) desc[k]; | |
| 1813 char * part = strstr(s, MORPH_PART); | |
| 1814 if (part) { | |
| 1815 char * nextpart = strstr(part + 1, MORPH_PART); | |
| 1816 while (nextpart) { | |
| 1817 copy_field(result + strlen(result), part, MORPH_PART); | |
| 1818 part = nextpart; | |
| 1819 nextpart = strstr(part + 1, MORPH_PART); | |
| 1820 } | |
| 1821 s = part; | |
| 1822 } | |
| 1823 | |
| 1824 char **pl; | |
| 1825 char tok[MAXLNLEN]; | |
| 1826 strcpy(tok, s); | |
| 1827 char * alt = strstr(tok, " | "); | |
| 1828 while (alt) { | |
| 1829 alt[1] = MSEP_ALT; | |
| 1830 alt = strstr(alt, " | "); | |
| 1831 } | |
| 1832 int pln = line_tok(tok, &pl, MSEP_ALT); | |
| 1833 for (int i = 0; i < pln; i++) { | |
| 1834 // remove inflectional and terminal suffixes | |
| 1835 char * is = strstr(pl[i], MORPH_INFL_SFX); | |
| 1836 if (is) *is = '\0'; | |
| 1837 char * ts = strstr(pl[i], MORPH_TERM_SFX); | |
| 1838 while (ts) { | |
| 1839 *ts = '_'; | |
| 1840 ts = strstr(pl[i], MORPH_TERM_SFX); | |
| 1841 } | |
| 1842 char * st = strstr(s, MORPH_STEM); | |
| 1843 if (st) { | |
| 1844 copy_field(tok, st, MORPH_STEM); | |
| 1845 rv = pAMgr->lookup(tok); | |
| 1846 while (rv) { | |
| 1847 char newpat[MAXLNLEN]; | |
| 1848 strcpy(newpat, pl[i]); | |
| 1849 strcat(newpat, pattern); | |
| 1850 char * sg = suggest_hentry_gen(rv, newpat); | |
| 1851 if (!sg) sg = suggest_hentry_gen(rv, pattern); | |
| 1852 if (sg) { | |
| 1853 char ** gen; | |
| 1854 int genl = line_tok(sg, &gen, MSEP_REC); | |
| 1855 free(sg); | |
| 1856 sg = NULL; | |
| 1857 for (int j = 0; j < genl; j++) { | |
| 1858 if (strstr(pl[i], MORPH_SURF_PFX)) { | |
| 1859 int r2l = strlen(result2); | |
| 1860 result2[r2l] = MSEP_REC; | |
| 1861 strcpy(result2 + r2l + 1, result); | |
| 1862 copy_field(result2 + strlen(result2), pl[i], MOR
PH_SURF_PFX); | |
| 1863 mystrcat(result2, gen[j], MAXLNLEN); | |
| 1864 } else { | |
| 1865 sprintf(result2 + strlen(result2), "%c%s%s", | |
| 1866 MSEP_REC, result, gen[j]); | |
| 1867 } | |
| 1868 } | |
| 1869 freelist(&gen, genl); | |
| 1870 } | |
| 1871 rv = rv->next_homonym; | |
| 1872 } | |
| 1873 } | |
| 1874 } | |
| 1875 freelist(&pl, pln); | |
| 1876 } | |
| 1877 | |
| 1878 if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break; | |
| 1879 strcpy(newpattern, pattern); | |
| 1880 pattern = newpattern; | |
| 1881 char * ds = strstr(pattern, MORPH_DERI_SFX); | |
| 1882 while (ds) { | |
| 1883 strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN); | |
| 1884 ds = strstr(pattern, MORPH_DERI_SFX); | |
| 1885 } | |
| 1886 } | |
| 1887 return (*result2 ? mystrdup(result2) : NULL); | |
| 1888 } | |
| 1889 | |
| 1890 | |
| 1891 // generate an n-gram score comparing s1 and s2 | |
| 1892 int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) | |
| 1893 { | |
| 1894 int nscore = 0; | |
| 1895 int ns; | |
| 1896 int l1; | |
| 1897 int l2; | |
| 1898 int test = 0; | |
| 1899 | |
| 1900 if (utf8) { | |
| 1901 w_char su1[MAXSWL]; | |
| 1902 w_char su2[MAXSWL]; | |
| 1903 l1 = u8_u16(su1, MAXSWL, s1); | |
| 1904 l2 = u8_u16(su2, MAXSWL, s2); | |
| 1905 if ((l2 <= 0) || (l1 == -1)) return 0; | |
| 1906 // lowering dictionary word | |
| 1907 if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum); | |
| 1908 for (int j = 1; j <= n; j++) { | |
| 1909 ns = 0; | |
| 1910 for (int i = 0; i <= (l1-j); i++) { | |
| 1911 int k = 0; | |
| 1912 for (int l = 0; l <= (l2-j); l++) { | |
| 1913 for (k = 0; k < j; k++) { | |
| 1914 w_char * c1 = su1 + i + k; | |
| 1915 w_char * c2 = su2 + l + k; | |
| 1916 if ((c1->l != c2->l) || (c1->h != c2->h)) break; | |
| 1917 } | |
| 1918 if (k == j) { | |
| 1919 ns++; | |
| 1920 break; | |
| 1921 } | |
| 1922 } | |
| 1923 if (k != j && opt & NGRAM_WEIGHTED) { | |
| 1924 ns--; | |
| 1925 test++; | |
| 1926 if (i == 0 || i == l1-j) ns--; // side weight | |
| 1927 } | |
| 1928 } | |
| 1929 nscore = nscore + ns; | |
| 1930 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break; | |
| 1931 } | |
| 1932 } else { | |
| 1933 l2 = strlen(s2); | |
| 1934 if (l2 == 0) return 0; | |
| 1935 l1 = strlen(s1); | |
| 1936 char *t = mystrdup(s2); | |
| 1937 if (opt & NGRAM_LOWERING) mkallsmall(t, csconv); | |
| 1938 for (int j = 1; j <= n; j++) { | |
| 1939 ns = 0; | |
| 1940 for (int i = 0; i <= (l1-j); i++) { | |
| 1941 char c = *(s1 + i + j); | |
| 1942 *(s1 + i + j) = '\0'; | |
| 1943 if (strstr(t,(s1+i))) { | |
| 1944 ns++; | |
| 1945 } else if (opt & NGRAM_WEIGHTED) { | |
| 1946 ns--; | |
| 1947 test++; | |
| 1948 if (i == 0 || i == l1-j) ns--; // side weight | |
| 1949 } | |
| 1950 *(s1 + i + j ) = c; | |
| 1951 } | |
| 1952 nscore = nscore + ns; | |
| 1953 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break; | |
| 1954 } | |
| 1955 free(t); | |
| 1956 } | |
| 1957 | |
| 1958 ns = 0; | |
| 1959 if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2; | |
| 1960 if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; | |
| 1961 ns = (nscore - ((ns > 0) ? ns : 0)); | |
| 1962 return ns; | |
| 1963 } | |
| 1964 | |
| 1965 // length of the left common substring of s1 and (decapitalised) s2 | |
| 1966 int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) { | |
| 1967 if (utf8) { | |
| 1968 w_char su1[MAXSWL]; | |
| 1969 w_char su2[MAXSWL]; | |
| 1970 su1[0].l = su2[0].l = su1[0].h = su2[0].h = 0; | |
| 1971 // decapitalize dictionary word | |
| 1972 if (complexprefixes) { | |
| 1973 int l1 = u8_u16(su1, MAXSWL, s1); | |
| 1974 int l2 = u8_u16(su2, MAXSWL, s2); | |
| 1975 if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1; | |
| 1976 } else { | |
| 1977 int i; | |
| 1978 u8_u16(su1, 1, s1); | |
| 1979 u8_u16(su2, 1, s2); | |
| 1980 unsigned short idx = (su2->h << 8) + su2->l; | |
| 1981 unsigned short otheridx = (su1->h << 8) + su1->l; | |
| 1982 if (otheridx != idx && | |
| 1983 (otheridx != unicodetolower(idx, langnum))) return 0; | |
| 1984 int l1 = u8_u16(su1, MAXSWL, s1); | |
| 1985 int l2 = u8_u16(su2, MAXSWL, s2); | |
| 1986 for(i = 1; (i < l1) && (i < l2) && | |
| 1987 (su1[i].l == su2[i].l) && (su1[i].h == su2[i].h); i++); | |
| 1988 return i; | |
| 1989 } | |
| 1990 } else { | |
| 1991 if (complexprefixes) { | |
| 1992 int l1 = strlen(s1); | |
| 1993 int l2 = strlen(s2); | |
| 1994 if (*(s2+l1-1) == *(s2+l2-1)) return 1; | |
| 1995 } else { | |
| 1996 char * olds = s1; | |
| 1997 // decapitalise dictionary word | |
| 1998 if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0
; | |
| 1999 do { | |
| 2000 s1++; s2++; | |
| 2001 } while ((*s1 == *s2) && (*s1 != '\0')); | |
| 2002 return (int)(s1 - olds); | |
| 2003 } | |
| 2004 } | |
| 2005 return 0; | |
| 2006 } | |
| 2007 | |
| 2008 int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_sw
ap) { | |
| 2009 int num = 0; | |
| 2010 int diff = 0; | |
| 2011 int diffpos[2]; | |
| 2012 *is_swap = 0; | |
| 2013 if (utf8) { | |
| 2014 w_char su1[MAXSWL]; | |
| 2015 w_char su2[MAXSWL]; | |
| 2016 int l1 = u8_u16(su1, MAXSWL, s1); | |
| 2017 int l2 = u8_u16(su2, MAXSWL, s2); | |
| 2018 // decapitalize dictionary word | |
| 2019 if (complexprefixes) { | |
| 2020 mkallsmall_utf(su2+l2-1, 1, langnum); | |
| 2021 } else { | |
| 2022 mkallsmall_utf(su2, 1, langnum); | |
| 2023 } | |
| 2024 for (int i = 0; (i < l1) && (i < l2); i++) { | |
| 2025 if (((short *) su1)[i] == ((short *) su2)[i]) { | |
| 2026 num++; | |
| 2027 } else { | |
| 2028 if (diff < 2) diffpos[diff] = i; | |
| 2029 diff++; | |
| 2030 } | |
| 2031 } | |
| 2032 if ((diff == 2) && (l1 == l2) && | |
| 2033 (((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) && | |
| 2034 (((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap =
1; | |
| 2035 } else { | |
| 2036 int i; | |
| 2037 char t[MAXSWUTF8L]; | |
| 2038 strcpy(t, s2); | |
| 2039 // decapitalize dictionary word | |
| 2040 if (complexprefixes) { | |
| 2041 int l2 = strlen(t); | |
| 2042 *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower; | |
| 2043 } else { | |
| 2044 mkallsmall(t, csconv); | |
| 2045 } | |
| 2046 for (i = 0; (*(s1+i) != 0) && (*(t+i) != 0); i++) { | |
| 2047 if (*(s1+i) == *(t+i)) { | |
| 2048 num++; | |
| 2049 } else { | |
| 2050 if (diff < 2) diffpos[diff] = i; | |
| 2051 diff++; | |
| 2052 } | |
| 2053 } | |
| 2054 if ((diff == 2) && (*(s1+i) == 0) && (*(t+i) == 0) && | |
| 2055 (*(s1+diffpos[0]) == *(t+diffpos[1])) && | |
| 2056 (*(s1+diffpos[1]) == *(t+diffpos[0]))) *is_swap = 1; | |
| 2057 } | |
| 2058 return num; | |
| 2059 } | |
| 2060 | |
| 2061 int SuggestMgr::mystrlen(const char * word) { | |
| 2062 if (utf8) { | |
| 2063 w_char w[MAXSWL]; | |
| 2064 return u8_u16(w, MAXSWL, word); | |
| 2065 } else return strlen(word); | |
| 2066 } | |
| 2067 | |
| 2068 // sort in decreasing order of score | |
| 2069 void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n ) | |
| 2070 { | |
| 2071 int m = 1; | |
| 2072 while (m < n) { | |
| 2073 int j = m; | |
| 2074 while (j > 0) { | |
| 2075 if (rsc[j-1] < rsc[j]) { | |
| 2076 int sctmp = rsc[j-1]; | |
| 2077 char * wdtmp = rword[j-1]; | |
| 2078 rsc[j-1] = rsc[j]; | |
| 2079 rword[j-1] = rword[j]; | |
| 2080 rsc[j] = sctmp; | |
| 2081 rword[j] = wdtmp; | |
| 2082 if (rword2) { | |
| 2083 wdtmp = rword2[j-1]; | |
| 2084 rword2[j-1] = rword2[j]; | |
| 2085 rword2[j] = wdtmp; | |
| 2086 } | |
| 2087 j--; | |
| 2088 } else break; | |
| 2089 } | |
| 2090 m++; | |
| 2091 } | |
| 2092 return; | |
| 2093 } | |
| 2094 | |
| 2095 // longest common subsequence | |
| 2096 void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char *
* result) { | |
| 2097 int n, m; | |
| 2098 w_char su[MAXSWL]; | |
| 2099 w_char su2[MAXSWL]; | |
| 2100 char * b; | |
| 2101 char * c; | |
| 2102 int i; | |
| 2103 int j; | |
| 2104 if (utf8) { | |
| 2105 m = u8_u16(su, MAXSWL, s); | |
| 2106 n = u8_u16(su2, MAXSWL, s2); | |
| 2107 } else { | |
| 2108 m = strlen(s); | |
| 2109 n = strlen(s2); | |
| 2110 } | |
| 2111 c = (char *) calloc(m + 1, n + 1); | |
| 2112 b = (char *) calloc(m + 1, n + 1); | |
| 2113 if (!c || !b) { | |
| 2114 if (c) free(c); | |
| 2115 if (b) free(b); | |
| 2116 *result = NULL; | |
| 2117 return; | |
| 2118 } | |
| 2119 for (i = 1; i <= m; i++) { | |
| 2120 for (j = 1; j <= n; j++) { | |
| 2121 if ( ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1))) | |
| 2122 || ((!utf8) && ((*(s+i-1)) == (*(s2+j-1))))) { | |
| 2123 c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1; | |
| 2124 b[i*(n+1) + j] = LCS_UPLEFT; | |
| 2125 } else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) { | |
| 2126 c[i*(n+1) + j] = c[(i-1)*(n+1) + j]; | |
| 2127 b[i*(n+1) + j] = LCS_UP; | |
| 2128 } else { | |
| 2129 c[i*(n+1) + j] = c[i*(n+1) + j-1]; | |
| 2130 b[i*(n+1) + j] = LCS_LEFT; | |
| 2131 } | |
| 2132 } | |
| 2133 } | |
| 2134 *result = b; | |
| 2135 free(c); | |
| 2136 *l1 = m; | |
| 2137 *l2 = n; | |
| 2138 } | |
| 2139 | |
| 2140 int SuggestMgr::lcslen(const char * s, const char* s2) { | |
| 2141 int m; | |
| 2142 int n; | |
| 2143 int i; | |
| 2144 int j; | |
| 2145 char * result; | |
| 2146 int len = 0; | |
| 2147 lcs(s, s2, &m, &n, &result); | |
| 2148 if (!result) return 0; | |
| 2149 i = m; | |
| 2150 j = n; | |
| 2151 while ((i != 0) && (j != 0)) { | |
| 2152 if (result[i*(n+1) + j] == LCS_UPLEFT) { | |
| 2153 len++; | |
| 2154 i--; | |
| 2155 j--; | |
| 2156 } else if (result[i*(n+1) + j] == LCS_UP) { | |
| 2157 i--; | |
| 2158 } else j--; | |
| 2159 } | |
| 2160 free(result); | |
| 2161 return len; | |
| 2162 } | |
| OLD | NEW |