| Index: chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx
|
| ===================================================================
|
| --- chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx (revision 21721)
|
| +++ chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx (working copy)
|
| @@ -14,13 +14,16 @@
|
| #endif
|
|
|
| #include "suggestmgr.hxx"
|
| +#include "htypes.hxx"
|
| +#include "csutil.hxx"
|
|
|
| #ifndef MOZILLA_CLIENT
|
| -#ifndef W32
|
| +#ifndef WIN32
|
| using namespace std;
|
| #endif
|
| #endif
|
|
|
| +const w_char W_VLINE = { '\0', '|' };
|
|
|
| SuggestMgr::SuggestMgr(const char * tryme, int maxn,
|
| AffixMgr * aptr)
|
| @@ -30,36 +33,54 @@
|
| // try when building candidate suggestions
|
| pAMgr = aptr;
|
|
|
| + ckeyl = 0;
|
| + ckey = NULL;
|
| + ckey_utf = NULL;
|
| +
|
| ctryl = 0;
|
| ctry = NULL;
|
| ctry_utf = NULL;
|
|
|
| + utf8 = 0;
|
| + langnum = 0;
|
| + complexprefixes = 0;
|
| +
|
| maxSug = maxn;
|
| nosplitsugs = 0;
|
| maxngramsugs = MAXNGRAMSUGS;
|
|
|
| - utf8 = 0;
|
| - complexprefixes = 0;
|
| -
|
| if (pAMgr) {
|
| char * enc = pAMgr->get_encoding();
|
| csconv = get_current_cs(enc);
|
| free(enc);
|
| + langnum = pAMgr->get_langnum();
|
| + ckey = pAMgr->get_key_string();
|
| nosplitsugs = pAMgr->get_nosplitsugs();
|
| if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs();
|
| utf8 = pAMgr->get_utf8();
|
| complexprefixes = pAMgr->get_complexprefixes();
|
| }
|
|
|
| - if (tryme) {
|
| + if (ckey) {
|
| if (utf8) {
|
| w_char t[MAXSWL];
|
| + ckeyl = u8_u16(t, MAXSWL, ckey);
|
| + ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char));
|
| + if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char));
|
| + } else {
|
| + ckeyl = strlen(ckey);
|
| + }
|
| + }
|
| +
|
| + if (tryme) {
|
| + ctry = mystrdup(tryme);
|
| + if (ctry) ctryl = strlen(ctry);
|
| + if (ctry && utf8) {
|
| + w_char t[MAXSWL];
|
| ctryl = u8_u16(t, MAXSWL, tryme);
|
| ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char));
|
| - memcpy(ctry_utf, t, ctryl * sizeof(w_char));
|
| - } else {
|
| - ctry = mystrdup(tryme);
|
| - ctryl = strlen(ctry);
|
| + if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char));
|
| + else ctryl = 0;
|
| }
|
| }
|
| }
|
| @@ -68,6 +89,11 @@
|
| SuggestMgr::~SuggestMgr()
|
| {
|
| pAMgr = NULL;
|
| + if (ckey) free(ckey);
|
| + ckey = NULL;
|
| + if (ckey_utf) free(ckey_utf);
|
| + ckey_utf = NULL;
|
| + ckeyl = 0;
|
| if (ctry) free(ctry);
|
| ctry = NULL;
|
| if (ctry_utf) free(ctry_utf);
|
| @@ -77,7 +103,7 @@
|
| }
|
|
|
| int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,
|
| - int * timer, time_t * timelimit) {
|
| + int * timer, clock_t * timelimit) {
|
| int cwrd = 1;
|
| if (ns == maxSug) return maxSug;
|
| for (int k=0; k < ns; k++) {
|
| @@ -96,13 +122,15 @@
|
|
|
| // generate suggestions for a mispelled word
|
| // pass in address of array of char * pointers
|
| +// onlycompoundsug: set to 1 if only compound word suggestions were found (probably bad suggestions; ngram suggestions are needed, too)
|
|
|
| -int SuggestMgr::suggest(char*** slst, const char * w, int nsug)
|
| +int SuggestMgr::suggest(char*** slst, const char * w, int nsug,
|
| + int * onlycompoundsug)
|
| {
|
| - int nocompoundtwowords = 0;
|
| - char ** wlst;
|
| - w_char word_utf[MAXSWL];
|
| - int wl = 0;
|
| + int nocompoundtwowords = 0;
|
| + char ** wlst;
|
| + w_char word_utf[MAXSWL];
|
| + int wl = 0;
|
|
|
| char w2[MAXWORDUTF8LEN];
|
| const char * word = w;
|
| @@ -141,8 +169,8 @@
|
| nsug = replchars(wlst, word, nsug, cpdsuggest);
|
|
|
| // perhaps we made chose the wrong char from a related set
|
| - if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) {
|
| - nsug = mapchars(wlst, word, nsug);
|
| + if ((nsug < maxSug) && (nsug > -1)) {
|
| + nsug = mapchars(wlst, word, nsug, cpdsuggest);
|
| }
|
|
|
| // did we swap the order of chars by mistake
|
| @@ -157,6 +185,22 @@
|
| longswapchar(wlst, word, nsug, cpdsuggest);
|
| }
|
|
|
| + // did we just hit the wrong key in place of a good char (case and keyboard)
|
| + if ((nsug < maxSug) && (nsug > -1)) {
|
| + nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
| + badcharkey(wlst, word, nsug, cpdsuggest);
|
| + }
|
| +
|
| + // only suggest compound words when no other suggestion
|
| + if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords=1;
|
| +
|
| + // did we add a char that should not be there
|
| + if ((nsug < maxSug) && (nsug > -1)) {
|
| + nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
| + extrachar(wlst, word, nsug, cpdsuggest);
|
| + }
|
| +
|
| +
|
| // did we forgot a char
|
| if ((nsug < maxSug) && (nsug > -1)) {
|
| nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
| @@ -169,12 +213,6 @@
|
| movechar(wlst, word, nsug, cpdsuggest);
|
| }
|
|
|
| - // did we add a char that should not be there
|
| - if ((nsug < maxSug) && (nsug > -1)) {
|
| - nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
| - extrachar(wlst, word, nsug, cpdsuggest);
|
| - }
|
| -
|
| // did we just hit the wrong key in place of a good char
|
| if ((nsug < maxSug) && (nsug > -1)) {
|
| nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
| @@ -187,10 +225,6 @@
|
| doubletwochars(wlst, word, nsug, cpdsuggest);
|
| }
|
|
|
| -
|
| - // only suggest compound words when no other suggestion
|
| - if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
|
| -
|
| // perhaps we forgot to hit space and two words ran together
|
| if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) {
|
| nsug = twowords(wlst, word, nsug, cpdsuggest);
|
| @@ -205,6 +239,8 @@
|
| free(wlst);
|
| wlst = NULL;
|
| }
|
| +
|
| + if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1;
|
|
|
| *slst = wlst;
|
| return nsug;
|
| @@ -242,8 +278,8 @@
|
| nsug = replchars(wlst, word, nsug, cpdsuggest);
|
|
|
| // perhaps we made chose the wrong char from a related set
|
| - if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0))
|
| - nsug = mapchars(wlst, word, nsug);
|
| + if ((nsug < maxSug) && (nsug > -1))
|
| + nsug = mapchars(wlst, word, nsug, cpdsuggest);
|
|
|
| if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
|
|
|
| @@ -273,7 +309,7 @@
|
| char candidate[MAXSWUTF8L];
|
| w_char candidate_utf[MAXSWL];
|
| memcpy(candidate_utf, word, wl * sizeof(w_char));
|
| - mkallcap_utf(candidate_utf, wl, pAMgr->get_langnum());
|
| + mkallcap_utf(candidate_utf, wl, langnum);
|
| u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| }
|
| @@ -288,9 +324,9 @@
|
| }
|
|
|
| // suggestions for when chose the wrong char out of a related set
|
| -int SuggestMgr::mapchars(char** wlst, const char * word, int ns)
|
| +int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
|
| {
|
| - time_t timelimit;
|
| + clock_t timelimit;
|
| int timer;
|
|
|
| int wl = strlen(word);
|
| @@ -300,18 +336,19 @@
|
| struct mapentry* maptable = pAMgr->get_maptable();
|
| if (maptable==NULL) return ns;
|
|
|
| - timelimit = time(NULL);
|
| + timelimit = clock();
|
| timer = MINTIMER;
|
| if (utf8) {
|
| w_char w[MAXSWL];
|
| int len = u8_u16(w, MAXSWL, word);
|
| - ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
|
| - } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
|
| + ns = map_related_utf(w, len, 0, cpdsuggest, wlst, ns, maptable, nummap, &timer, &timelimit);
|
| + } else ns = map_related(word, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit);
|
| return ns;
|
| }
|
|
|
| -int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns,
|
| - const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
|
| +int SuggestMgr::map_related(const char * word, int i, char** wlst,
|
| + int cpdsuggest, int ns,
|
| + const mapentry* maptable, int nummap, int * timer, clock_t * timelimit)
|
| {
|
| char c = *(word + i);
|
| if (c == 0) {
|
| @@ -319,8 +356,7 @@
|
| int wl = strlen(word);
|
| for (int m=0; m < ns; m++)
|
| if (strcmp(word,wlst[m]) == 0) cwrd = 0;
|
| - if ((cwrd) && (checkword(word, wl, 0, timer, timelimit) ||
|
| - checkword(word, wl, 1, timer, timelimit))) {
|
| + if ((cwrd) && checkword(word, wl, cpdsuggest, timer, timelimit)) {
|
| if (ns < maxSug) {
|
| wlst[ns] = mystrdup(word);
|
| if (wlst[ns] == NULL) return -1;
|
| @@ -334,23 +370,27 @@
|
| if (strchr(maptable[j].set,c) != 0) {
|
| in_map = 1;
|
| char * newword = mystrdup(word);
|
| + if (!newword) return -1;
|
| for (int k = 0; k < maptable[j].len; k++) {
|
| *(newword + i) = *(maptable[j].set + k);
|
| - ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit);
|
| - if (!(*timelimit)) return ns;
|
| + ns = map_related(newword, (i+1), wlst, cpdsuggest,
|
| + ns, maptable, nummap, timer, timelimit);
|
| + if (!(*timer)) return ns;
|
| }
|
| free(newword);
|
| }
|
| }
|
| if (!in_map) {
|
| i++;
|
| - ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit);
|
| + ns = map_related(word, i, wlst, cpdsuggest,
|
| + ns, maptable, nummap, timer, timelimit);
|
| }
|
| return ns;
|
| }
|
|
|
| -int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns,
|
| - const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
|
| +int SuggestMgr::map_related_utf(w_char * word, int len, int i, int cpdsuggest,
|
| + char** wlst, int ns, const mapentry* maptable, int nummap,
|
| + int * timer, clock_t * timelimit)
|
| {
|
| if (i == len) {
|
| int cwrd = 1;
|
| @@ -360,8 +400,7 @@
|
| wl = strlen(s);
|
| for (int m=0; m < ns; m++)
|
| if (strcmp(s,wlst[m]) == 0) cwrd = 0;
|
| - if ((cwrd) && (checkword(s, wl, 0, timer, timelimit) ||
|
| - checkword(s, wl, 1, timer, timelimit))) {
|
| + if ((cwrd) && checkword(s, wl, cpdsuggest, timer, timelimit)) {
|
| if (ns < maxSug) {
|
| wlst[ns] = mystrdup(s);
|
| if (wlst[ns] == NULL) return -1;
|
| @@ -377,15 +416,17 @@
|
| in_map = 1;
|
| for (int k = 0; k < maptable[j].len; k++) {
|
| *(word + i) = *(maptable[j].set_utf16 + k);
|
| - ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit);
|
| - if (!(*timelimit)) return ns;
|
| + ns = map_related_utf(word, len, i + 1, cpdsuggest,
|
| + wlst, ns, maptable, nummap, timer, timelimit);
|
| + if (!(*timer)) return ns;
|
| }
|
| *((unsigned short *) word + i) = c;
|
| }
|
| }
|
| if (!in_map) {
|
| i++;
|
| - ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit);
|
| + ns = map_related_utf(word, len, i, cpdsuggest,
|
| + wlst, ns, maptable, nummap, timer, timelimit);
|
| }
|
| return ns;
|
| }
|
| @@ -416,6 +457,23 @@
|
| strcpy(candidate+(r-word)+lenr, r+lenp);
|
| ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);
|
| if (ns == -1) return -1;
|
| + // check REP suggestions with space
|
| + char * sp = strchr(candidate, ' ');
|
| + if (sp) {
|
| + *sp = '\0';
|
| + if (checkword(candidate, strlen(candidate), 0, NULL, NULL)) {
|
| + int oldns = ns;
|
| + *sp = ' ';
|
| + ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + if (oldns < ns) {
|
| + free(wlst[ns - 1]);
|
| + wlst[ns - 1] = mystrdup(candidate);
|
| + if (!wlst[ns - 1]) return -1;
|
| + }
|
| + }
|
| + *sp = ' ';
|
| + }
|
| r++; // search for the next letter
|
| }
|
| }
|
| @@ -454,7 +512,7 @@
|
| int state=0;
|
| if (wl < 5 || ! pAMgr) return ns;
|
| for (int i=2; i < wl; i++) {
|
| - if ((word[i].l==word[i-2].l) && (word[i].h==word[i-2].h)) {
|
| + if (w_char_eq(word[i], word[i-2])) {
|
| state++;
|
| if (state==3) {
|
| memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));
|
| @@ -471,25 +529,108 @@
|
| return ns;
|
| }
|
|
|
| +// error is wrong char in place of correct one (case and keyboard related version)
|
| +int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest)
|
| +{
|
| + char tmpc;
|
| + char candidate[MAXSWUTF8L];
|
| + int wl = strlen(word);
|
| + strcpy(candidate, word);
|
| + // swap out each char one by one and try uppercase and neighbor
|
| + // keyboard chars in its place to see if that makes a good word
|
| +
|
| + for (int i=0; i < wl; i++) {
|
| + tmpc = candidate[i];
|
| + // check with uppercase letters
|
| + candidate[i] = csconv[((unsigned char)tmpc)].cupper;
|
| + if (tmpc != candidate[i]) {
|
| + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + candidate[i] = tmpc;
|
| + }
|
| + // check neighbor characters in keyboard string
|
| + if (!ckey) continue;
|
| + char * loc = strchr(ckey, tmpc);
|
| + while (loc) {
|
| + if ((loc > ckey) && (*(loc - 1) != '|')) {
|
| + candidate[i] = *(loc - 1);
|
| + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + }
|
| + if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {
|
| + candidate[i] = *(loc + 1);
|
| + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + }
|
| + loc = strchr(loc + 1, tmpc);
|
| + }
|
| + candidate[i] = tmpc;
|
| + }
|
| + return ns;
|
| +}
|
| +
|
| +// error is wrong char in place of correct one (case and keyboard related version)
|
| +int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
|
| +{
|
| + w_char tmpc;
|
| + w_char candidate_utf[MAXSWL];
|
| + char candidate[MAXSWUTF8L];
|
| + memcpy(candidate_utf, word, wl * sizeof(w_char));
|
| + // swap out each char one by one and try uppercase and neighbor
|
| + // keyboard chars in its place to see if that makes a good word
|
| + for (int i=0; i < wl; i++) {
|
| + tmpc = candidate_utf[i];
|
| + // check with uppercase letters
|
| + mkallcap_utf(candidate_utf + i, 1, langnum);
|
| + if (!w_char_eq(tmpc, candidate_utf[i])) {
|
| + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + candidate_utf[i] = tmpc;
|
| + }
|
| + // check neighbor characters in keyboard string
|
| + if (!ckey) continue;
|
| + w_char * loc = ckey_utf;
|
| + while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++;
|
| + while (loc < (ckey_utf + ckeyl)) {
|
| + if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {
|
| + candidate_utf[i] = *(loc - 1);
|
| + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + }
|
| + if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {
|
| + candidate_utf[i] = *(loc + 1);
|
| + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + }
|
| + do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc));
|
| + }
|
| + candidate_utf[i] = tmpc;
|
| + }
|
| + return ns;
|
| +}
|
| +
|
| // error is wrong char in place of correct one
|
| int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
|
| {
|
| char tmpc;
|
| char candidate[MAXSWUTF8L];
|
| - time_t timelimit = time(NULL);
|
| + clock_t timelimit = clock();
|
| int timer = MINTIMER;
|
| int wl = strlen(word);
|
| strcpy(candidate, word);
|
| // swap out each char one by one and try all the tryme
|
| // chars in its place to see if that makes a good word
|
| - for (int i=0; i < wl; i++) {
|
| - tmpc = candidate[i];
|
| - for (int j=0; j < ctryl; j++) {
|
| + for (int j=0; j < ctryl; j++) {
|
| + for (int i=wl-1; i >= 0; i--) {
|
| + tmpc = candidate[i];
|
| if (ctry[j] == tmpc) continue;
|
| candidate[i] = ctry[j];
|
| ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);
|
| if (ns == -1) return -1;
|
| - if (!timelimit) return ns;
|
| + if (!timer) return ns;
|
| candidate[i] = tmpc;
|
| }
|
| }
|
| @@ -502,20 +643,20 @@
|
| w_char tmpc;
|
| w_char candidate_utf[MAXSWL];
|
| char candidate[MAXSWUTF8L];
|
| - time_t timelimit = time(NULL);
|
| + clock_t timelimit = clock();
|
| int timer = MINTIMER;
|
| memcpy(candidate_utf, word, wl * sizeof(w_char));
|
| // swap out each char one by one and try all the tryme
|
| // chars in its place to see if that makes a good word
|
| - for (int i=0; i < wl; i++) {
|
| - tmpc = candidate_utf[i];
|
| - for (int j=0; j < ctryl; j++) {
|
| - if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue;
|
| + for (int j=0; j < ctryl; j++) {
|
| + for (int i=wl-1; i >= 0; i--) {
|
| + tmpc = candidate_utf[i];
|
| + if (w_char_eq(tmpc, ctry_utf[j])) continue;
|
| candidate_utf[i] = ctry_utf[j];
|
| u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
|
| if (ns == -1) return -1;
|
| - if (!timelimit) return ns;
|
| + if (!timer) return ns;
|
| candidate_utf[i] = tmpc;
|
| }
|
| }
|
| @@ -525,18 +666,20 @@
|
| // error is word has an extra letter it does not need
|
| int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
|
| {
|
| - char candidate[MAXSWUTF8L];
|
| + char candidate[MAXSWUTF8L];
|
| w_char candidate_utf[MAXSWL];
|
| - const w_char * p;
|
| - w_char * r;
|
| + w_char * p;
|
| + w_char tmpc = W_VLINE; // not used value, only for VCC warning message
|
| if (wl < 2) return ns;
|
| // try omitting one char of word at a time
|
| - memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char));
|
| - for (p = word, r = candidate_utf; p < word + wl; ) {
|
| - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
|
| + memcpy(candidate_utf, word, wl * sizeof(w_char));
|
| + for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {
|
| + w_char tmpc2 = *p;
|
| + if (p < candidate_utf + wl - 1) *p = tmpc;
|
| + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
|
| ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| if (ns == -1) return -1;
|
| - *r++ = *p++;
|
| + tmpc = tmpc2;
|
| }
|
| return ns;
|
| }
|
| @@ -544,48 +687,42 @@
|
| // error is word has an extra letter it does not need
|
| int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
|
| {
|
| + char tmpc = '\0';
|
| char candidate[MAXSWUTF8L];
|
| - const char * p;
|
| - char * r;
|
| + char * p;
|
| int wl = strlen(word);
|
| if (wl < 2) return ns;
|
| // try omitting one char of word at a time
|
| - strcpy (candidate, word + 1);
|
| - for (p = word, r = candidate; *p != 0; ) {
|
| + strcpy (candidate, word);
|
| + for (p = candidate + wl - 1; p >=candidate; p--) {
|
| + char tmpc2 = *p;
|
| + *p = tmpc;
|
| ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL);
|
| if (ns == -1) return -1;
|
| - *r++ = *p++;
|
| + tmpc = tmpc2;
|
| }
|
| return ns;
|
| }
|
|
|
| -
|
| // error is missing a letter it needs
|
| int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
|
| {
|
| char candidate[MAXSWUTF8L];
|
| - const char * p;
|
| - char * q;
|
| - time_t timelimit = time(NULL);
|
| + char * p;
|
| + clock_t timelimit = clock();
|
| int timer = MINTIMER;
|
| int wl = strlen(word);
|
| - // try inserting a tryme character before every letter
|
| - strcpy(candidate + 1, word);
|
| - for (p = word, q = candidate; *p != 0; ) {
|
| - for (int i = 0; i < ctryl; i++) {
|
| - *q = ctry[i];
|
| + // try inserting a tryme character before every letter (and the null terminator)
|
| + for (int i = 0; i < ctryl; i++) {
|
| + strcpy(candidate, word);
|
| + for (p = candidate + wl; p >= candidate; p--) {
|
| + *(p+1) = *p;
|
| + *p = ctry[i];
|
| ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit);
|
| if (ns == -1) return -1;
|
| - if (!timelimit) return ns;
|
| + if (!timer) return ns;
|
| }
|
| - *q++ = *p++;
|
| }
|
| - // now try adding one to end */
|
| - for (int i = 0; i < ctryl; i++) {
|
| - *q = ctry[i];
|
| - ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, NULL, NULL);
|
| - if (ns == -1) return -1;
|
| - }
|
| return ns;
|
| }
|
|
|
| @@ -594,32 +731,21 @@
|
| {
|
| w_char candidate_utf[MAXSWL];
|
| char candidate[MAXSWUTF8L];
|
| - const w_char * p;
|
| - w_char * q;
|
| - int cwrd;
|
| - time_t timelimit = time(NULL);
|
| + w_char * p;
|
| + clock_t timelimit = clock();
|
| int timer = MINTIMER;
|
| - // try inserting a tryme character before every letter
|
| - memcpy (candidate_utf + 1, word, wl * sizeof(w_char));
|
| - for (p = word, q = candidate_utf; p < (word + wl); ) {
|
| - for (int i = 0; i < ctryl; i++) {
|
| - *q = ctry_utf[i];
|
| - cwrd = 1;
|
| + // try inserting a tryme character at the end of the word and before every letter
|
| + for (int i = 0; i < ctryl; i++) {
|
| + memcpy (candidate_utf, word, wl * sizeof(w_char));
|
| + for (p = candidate_utf + wl; p >= candidate_utf; p--) {
|
| + *(p + 1) = *p;
|
| + *p = ctry_utf[i];
|
| u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
|
| ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
|
| if (ns == -1) return -1;
|
| - if (!timelimit) return ns;
|
| - }
|
| - *q++ = *p++;
|
| + if (!timer) return ns;
|
| + }
|
| }
|
| - // now try adding one to end */
|
| - for (int i = 0; i < ctryl; i++) {
|
| - *q = ctry_utf[i];
|
| - cwrd = 1;
|
| - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
|
| - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| - if (ns == -1) return -1;
|
| - }
|
| return ns;
|
| }
|
|
|
| @@ -636,19 +762,19 @@
|
| int wl=strlen(word);
|
| if (wl < 3) return ns;
|
|
|
| - if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl);
|
| + if (langnum == LANG_hu) forbidden = check_forbidden(word, wl);
|
|
|
| strcpy(candidate + 1, word);
|
| -
|
| // split the string into two pieces after every char
|
| // if both pieces are good words make them a suggestion
|
| for (p = candidate + 1; p[1] != '\0'; p++) {
|
| p[-1] = *p;
|
| // go to end of the UTF-8 character
|
| while (utf8 && ((p[1] & 0xc0) == 0x80)) {
|
| + *p = p[1];
|
| p++;
|
| - p[-1] = *p;
|
| }
|
| + if (utf8 && p[1] == '\0') break; // last UTF-8 character
|
| *p = '\0';
|
| c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL);
|
| if (c1) {
|
| @@ -657,7 +783,7 @@
|
| *p = ' ';
|
|
|
| // spec. Hungarian code (need a better compound word support)
|
| - if ((pAMgr->get_langnum() == LANG_hu) && !forbidden &&
|
| + if ((langnum == LANG_hu) && !forbidden &&
|
| // if 3 repeating letter, use - instead of space
|
| (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
|
| // or multiple compounding, with more, than 6 syllables
|
| @@ -673,6 +799,23 @@
|
| ns++;
|
| }
|
| } else return ns;
|
| + // add two word suggestion with dash, if TRY string contains
|
| + // "a" or "-"
|
| + // NOTE: cwrd isn't modified for REP twoword sugg.
|
| + if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
|
| + mystrlen(p + 1) > 1 &&
|
| + mystrlen(candidate) - mystrlen(p) > 1) {
|
| + *p = '-';
|
| + for (int k=0; k < ns; k++)
|
| + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
| + if (ns < maxSug) {
|
| + if (cwrd) {
|
| + wlst[ns] = mystrdup(candidate);
|
| + if (wlst[ns] == NULL) return -1;
|
| + ns++;
|
| + }
|
| + } else return ns;
|
| + }
|
| }
|
| }
|
| }
|
| @@ -698,6 +841,24 @@
|
| p[1] = *p;
|
| *p = tmpc;
|
| }
|
| + // try double swaps for short words
|
| + // ahev -> have, owudl -> would
|
| + if (wl == 4 || wl == 5) {
|
| + candidate[0] = word[1];
|
| + candidate[1] = word[0];
|
| + candidate[2] = word[2];
|
| + candidate[wl - 2] = word[wl - 1];
|
| + candidate[wl - 1] = word[wl - 2];
|
| + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + if (wl == 5) {
|
| + candidate[0] = word[0];
|
| + candidate[1] = word[2];
|
| + candidate[2] = word[1];
|
| + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + }
|
| + }
|
| return ns;
|
| }
|
|
|
| @@ -708,6 +869,7 @@
|
| char candidate[MAXSWUTF8L];
|
| w_char * p;
|
| w_char tmpc;
|
| + int len = 0;
|
| // try swapping adjacent chars one by one
|
| memcpy (candidate_utf, word, wl * sizeof(w_char));
|
| for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
|
| @@ -715,11 +877,32 @@
|
| *p = p[1];
|
| p[1] = tmpc;
|
| u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| + if (len == 0) len = strlen(candidate);
|
| + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
|
| if (ns == -1) return -1;
|
| p[1] = *p;
|
| *p = tmpc;
|
| }
|
| + // try double swaps for short words
|
| + // ahev -> have, owudl -> would, suodn -> sound
|
| + if (wl == 4 || wl == 5) {
|
| + candidate_utf[0] = word[1];
|
| + candidate_utf[1] = word[0];
|
| + candidate_utf[2] = word[2];
|
| + candidate_utf[wl - 2] = word[wl - 1];
|
| + candidate_utf[wl - 1] = word[wl - 2];
|
| + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + if (wl == 5) {
|
| + candidate_utf[0] = word[0];
|
| + candidate_utf[1] = word[2];
|
| + candidate_utf[2] = word[1];
|
| + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
|
| + if (ns == -1) return -1;
|
| + }
|
| + }
|
| return ns;
|
| }
|
|
|
| @@ -794,7 +977,7 @@
|
| *(q-1) = *q;
|
| *q = tmpc;
|
| if ((q-p) < 2) continue; // omit swap char
|
| - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| if (ns == -1) return -1;
|
| }
|
| strcpy(candidate, word);
|
| @@ -805,7 +988,7 @@
|
| *(q+1) = *q;
|
| *q = tmpc;
|
| if ((p-q) < 2) continue; // omit swap char
|
| - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| if (ns == -1) return -1;
|
| }
|
| strcpy(candidate, word);
|
| @@ -830,7 +1013,7 @@
|
| *q = tmpc;
|
| if ((q-p) < 2) continue; // omit swap char
|
| u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| if (ns == -1) return -1;
|
| }
|
| memcpy (candidate_utf, word, wl * sizeof(w_char));
|
| @@ -842,7 +1025,7 @@
|
| *q = tmpc;
|
| if ((p-q) < 2) continue; // omit swap char
|
| u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
| - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
|
| + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
|
| if (ns == -1) return -1;
|
| }
|
| memcpy (candidate_utf, word, wl * sizeof(w_char));
|
| @@ -851,28 +1034,33 @@
|
| }
|
|
|
| // generate a set of suggestions for very poorly spelled words
|
| -int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr)
|
| +int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md)
|
| {
|
|
|
| int i, j;
|
| int lval;
|
| - int sc;
|
| - int lp;
|
| + int sc, scphon;
|
| + int lp, lpphon;
|
| int nonbmp = 0;
|
|
|
| - if (!pHMgr) return 0;
|
| -
|
| // exhaustively search through all root words
|
| // keeping track of the MAX_ROOTS most similar root words
|
| struct hentry * roots[MAX_ROOTS];
|
| + char * rootsphon[MAX_ROOTS];
|
| int scores[MAX_ROOTS];
|
| + int scoresphon[MAX_ROOTS];
|
| for (i = 0; i < MAX_ROOTS; i++) {
|
| roots[i] = NULL;
|
| scores[i] = -100 * i;
|
| + rootsphon[i] = NULL;
|
| + scoresphon[i] = -100 * i;
|
| }
|
| lp = MAX_ROOTS - 1;
|
| -
|
| + lpphon = MAX_ROOTS - 1;
|
| + scphon = scoresphon[MAX_ROOTS-1];
|
| +
|
| char w2[MAXWORDUTF8LEN];
|
| + char f[MAXSWUTF8L];
|
| char * word = w;
|
|
|
| // word reversing wrapper for complex prefixes
|
| @@ -896,8 +1084,8 @@
|
|
|
| struct hentry* hp = NULL;
|
| int col = -1;
|
| -
|
| - #ifdef HUNSPELL_CHROME_CLIENT
|
| +
|
| +#ifdef HUNSPELL_CHROME_CLIENT
|
| // A static array of hentries required for walking the hash table.
|
| struct hentry static_hentry[MAX_ROOTS];
|
|
|
| @@ -906,31 +1094,61 @@
|
| static const int kMaxWordLen = 128;
|
| char hentry_word[MAX_ROOTS][kMaxWordLen];
|
| unsigned short hentry_astr[MAX_ROOTS];
|
| - #endif
|
| +#endif
|
|
|
| - while ((hp = pHMgr->walk_hashtable(col, hp))) {
|
| + phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
|
| + char target[MAXSWUTF8L];
|
| + char candidate[MAXSWUTF8L];
|
| + if (ph) {
|
| + strcpy(candidate, word);
|
| + mkallcap(candidate, csconv);
|
| + phonet(candidate, target, n, *ph);
|
| + }
|
| +
|
| + for (i = 0; i < md; i++) {
|
| + while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) {
|
| if ((hp->astr) && (pAMgr) &&
|
| (TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) ||
|
| + TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
|
| TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) ||
|
| TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue;
|
| - sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
|
| +
|
| + sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + NGRAM_LOWERING) +
|
| + leftcommonsubstring(word, HENTRY_WORD(hp));
|
| +
|
| + // check special pronunciation
|
| + if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
|
| + int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + NGRAM_LOWERING) +
|
| + leftcommonsubstring(word, f);
|
| + if (sc2 > sc) sc = sc2;
|
| + }
|
| +
|
| + if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) {
|
| + char target2[MAXSWUTF8L];
|
| + strcpy(candidate, HENTRY_WORD(hp));
|
| + mkallcap(candidate, csconv);
|
| + phonet(candidate, target2, -1, *ph);
|
| + scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
|
| + }
|
| +
|
| if (sc > scores[lp]) {
|
| scores[lp] = sc;
|
| - #ifdef HUNSPELL_CHROME_CLIENT
|
| +#ifdef HUNSPELL_CHROME_CLIENT
|
| roots[lp] = &static_hentry[lp];
|
| roots[lp]->alen = hp->alen;
|
| if (hp->astr)
|
| hentry_astr[lp] = *hp->astr;
|
| roots[lp]->astr = &hentry_astr[lp];
|
| - roots[lp]->wlen = hp->wlen;
|
| - strcpy(&hentry_word[lp][0], hp->word);
|
| - roots[lp]->word = &hentry_word[lp][0];
|
| + roots[lp]->blen = hp->blen;
|
| + strcpy(&hentry_word[lp][0], &hp->word);
|
| + roots[lp]->word = hentry_word[lp][0];
|
| roots[lp]->next = NULL;
|
| roots[lp]->next_homonym = NULL;
|
| - #else
|
| + roots[lp]->var = 0;
|
| + roots[lp]->clen = 0;
|
| +#else
|
| roots[lp] = hp;
|
| - #endif
|
| -
|
| +#endif
|
| lval = sc;
|
| for (j=0; j < MAX_ROOTS; j++)
|
| if (scores[j] < lval) {
|
| @@ -938,8 +1156,19 @@
|
| lval = scores[j];
|
| }
|
| }
|
| - }
|
|
|
| + if (scphon > scoresphon[lpphon]) {
|
| + scoresphon[lpphon] = scphon;
|
| + rootsphon[lpphon] = HENTRY_WORD(hp);
|
| + lval = scphon;
|
| + for (j=0; j < MAX_ROOTS; j++)
|
| + if (scoresphon[j] < lval) {
|
| + lpphon = j;
|
| + lval = scoresphon[j];
|
| + }
|
| + }
|
| + }}
|
| +
|
| // find minimum threshhold for a passable suggestion
|
| // mangle original word three differnt ways
|
| // and score them to generate a minimum acceptable score
|
| @@ -948,11 +1177,11 @@
|
| if (utf8) {
|
| for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
|
| u16_u8(mw, MAXSWUTF8L, u8, n);
|
| - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
|
| + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING);
|
| } else {
|
| strcpy(mw, word);
|
| for (int k=sp; k < n; k+=4) *(mw + k) = '*';
|
| - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
|
| + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING);
|
| }
|
| }
|
| thresh = thresh / 3;
|
| @@ -962,9 +1191,11 @@
|
| // and use length adjusted ngram scores to select
|
| // possible suggestions
|
| char * guess[MAX_GUESS];
|
| + char * guessorig[MAX_GUESS];
|
| int gscore[MAX_GUESS];
|
| for(i=0;i<MAX_GUESS;i++) {
|
| guess[i] = NULL;
|
| + guessorig[i] = NULL;
|
| gscore[i] = -100 * i;
|
| }
|
|
|
| @@ -974,31 +1205,46 @@
|
| glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
|
| if (! glst) {
|
| if (nonbmp) utf8 = 1;
|
| - return 0;
|
| + return ns;
|
| }
|
|
|
| for (i = 0; i < MAX_ROOTS; i++) {
|
| -
|
| if (roots[i]) {
|
| struct hentry * rp = roots[i];
|
| - int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
|
| - rp->astr, rp->alen, word, nc);
|
| + int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen,
|
| + rp->astr, rp->alen, word, nc,
|
| + ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL));
|
|
|
| for (int k = 0; k < nw ; k++) {
|
| - sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
|
| + sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + NGRAM_LOWERING) +
|
| + leftcommonsubstring(word, glst[k].word);
|
| +
|
| if ((sc > thresh)) {
|
| if (sc > gscore[lp]) {
|
| - if (guess[lp]) free (guess[lp]);
|
| + if (guess[lp]) {
|
| + free (guess[lp]);
|
| + if (guessorig[lp]) {
|
| + free(guessorig[lp]);
|
| + guessorig[lp] = NULL;
|
| + }
|
| + }
|
| gscore[lp] = sc;
|
| guess[lp] = glst[k].word;
|
| + guessorig[lp] = glst[k].orig;
|
| lval = sc;
|
| for (j=0; j < MAX_GUESS; j++)
|
| if (gscore[j] < lval) {
|
| lp = j;
|
| lval = gscore[j];
|
| }
|
| - } else free (glst[k].word);
|
| - } else free(glst[k].word);
|
| + } else {
|
| + free(glst[k].word);
|
| + if (glst[k].orig) free(glst[k].orig);
|
| + }
|
| + } else {
|
| + free(glst[k].word);
|
| + if (glst[k].orig) free(glst[k].orig);
|
| + }
|
| }
|
| }
|
| }
|
| @@ -1007,7 +1253,9 @@
|
| // now we are done generating guesses
|
| // sort in order of decreasing score
|
|
|
| - bubblesort(&guess[0], &gscore[0], MAX_GUESS);
|
| +
|
| + bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
|
| + if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
|
|
|
| // weight suggestions with a similarity index, based on
|
| // the longest common subsequent algorithm and resort
|
| @@ -1021,7 +1269,7 @@
|
| if (utf8) {
|
| w_char _w[MAXSWL];
|
| len = u8_u16(_w, MAXSWL, guess[i]);
|
| - mkallsmall_utf(_w, len, pAMgr->get_langnum());
|
| + mkallsmall_utf(_w, len, langnum);
|
| u16_u8(gl, MAXSWUTF8L, _w, len);
|
| } else {
|
| strcpy(gl, guess[i]);
|
| @@ -1039,10 +1287,10 @@
|
|
|
| // heuristic weigthing of ngram scores
|
| gscore[i] +=
|
| - // length of longest common subsequent minus lenght difference
|
| + // length of longest common subsequent minus length difference
|
| 2 * _lcs - abs((int) (n - len)) +
|
| - // weight equal first letter
|
| - equalfirstletter(word, gl) +
|
| + // weight length of the left common substring
|
| + leftcommonsubstring(word, gl) +
|
| // weight equal character positions
|
| ((_lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) +
|
| // swap character (not neighboring)
|
| @@ -1050,28 +1298,87 @@
|
| }
|
| }
|
|
|
| - bubblesort(&guess[0], &gscore[0], MAX_GUESS);
|
| + bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
|
|
|
| +// phonetic version
|
| + if (ph) for (i=0; i < MAX_ROOTS; i++) {
|
| + if (rootsphon[i]) {
|
| + // lowering rootsphon[i]
|
| + char gl[MAXSWUTF8L];
|
| + int len;
|
| + if (utf8) {
|
| + w_char _w[MAXSWL];
|
| + len = u8_u16(_w, MAXSWL, rootsphon[i]);
|
| + mkallsmall_utf(_w, len, langnum);
|
| + u16_u8(gl, MAXSWUTF8L, _w, len);
|
| + } else {
|
| + strcpy(gl, rootsphon[i]);
|
| + mkallsmall(gl, csconv);
|
| + len = strlen(rootsphon[i]);
|
| + }
|
| +
|
| + // heuristic weighting of ngram scores
|
| + scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) +
|
| + // weight length of the left common substring
|
| + leftcommonsubstring(word, gl);
|
| + }
|
| + }
|
| +
|
| + if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
|
| +
|
| // copy over
|
| + int oldns = ns;
|
|
|
| - int ns = 0;
|
| int same = 0;
|
| for (i=0; i < MAX_GUESS; i++) {
|
| if (guess[i]) {
|
| - if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
|
| + if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
|
| int unique = 1;
|
| - // we have excellent suggestion(s)
|
| + // leave only excellent suggestions, if any exist
|
| if (gscore[i] > 1000) same = 1;
|
| - for (j=0; j < ns; j++)
|
| + for (j = 0; j < ns; j++) {
|
| // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
|
| - if (strstr(guess[i], wlst[j]) ||
|
| + if ((!guessorig[i] && strstr(guess[i], wlst[j])) ||
|
| + (guessorig[i] && strstr(guessorig[i], wlst[j])) ||
|
| // check forbidden words
|
| !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;
|
| - if (unique) wlst[ns++] = guess[i]; else free(guess[i]);
|
| - } else free(guess[i]);
|
| + }
|
| + if (unique) {
|
| + wlst[ns++] = guess[i];
|
| + if (guessorig[i]) {
|
| + free(guess[i]);
|
| + wlst[ns-1] = guessorig[i];
|
| + }
|
| + } else {
|
| + free(guess[i]);
|
| + if (guessorig[i]) free(guessorig[i]);
|
| + }
|
| + } else {
|
| + free(guess[i]);
|
| + if (guessorig[i]) free(guessorig[i]);
|
| + }
|
| }
|
| }
|
|
|
| + oldns = ns;
|
| + if (ph) for (i=0; i < MAX_ROOTS; i++) {
|
| + if (rootsphon[i]) {
|
| + if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) {
|
| + int unique = 1;
|
| + for (j = 0; j < ns; j++) {
|
| + // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
|
| + if (strstr(rootsphon[i], wlst[j]) ||
|
| + // check forbidden words
|
| + !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0;
|
| + }
|
| + if (unique) {
|
| + wlst[ns++] = mystrdup(rootsphon[i]);
|
| + if (!wlst[ns - 1]) return ns - 1;
|
| + }
|
| + }
|
| + }
|
| + }
|
| +
|
| if (nonbmp) utf8 = 1;
|
| return ns;
|
| }
|
| @@ -1083,19 +1390,16 @@
|
| // obsolote MySpell-HU modifications:
|
| // return value 2 and 3 marks compounding with hyphen (-)
|
| // `3' marks roots without suffix
|
| -int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit)
|
| +int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit)
|
| {
|
| struct hentry * rv=NULL;
|
| int nosuffix = 0;
|
| -
|
| +
|
| // check time limit
|
| if (timer) {
|
| (*timer)--;
|
| if (!(*timer) && timelimit) {
|
| - if (time(NULL) > *timelimit) {
|
| - *timelimit = 0;
|
| - return 0;
|
| - }
|
| + if ((clock() - *timelimit) > TIMELIMIT) return 0;
|
| *timer = MAXPLUSTIMER;
|
| }
|
| }
|
| @@ -1103,7 +1407,7 @@
|
| if (pAMgr) {
|
| if (cpdsuggest==1) {
|
| if (pAMgr->get_compound()) {
|
| - rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1);
|
| + rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1); //EXT
|
| if (rv) return 3; // XXX obsolote categorisation
|
| }
|
| return 0;
|
| @@ -1114,10 +1418,15 @@
|
| if (rv) {
|
| if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
|
| || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
|
| - if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
|
| - TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
|
| + while (rv) {
|
| + if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
|
| + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
|
| + TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
|
| + rv = rv->next_homonym;
|
| + } else break;
|
| + }
|
| } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
|
| -
|
| +
|
| if (rv) {
|
| nosuffix=1;
|
| } else {
|
| @@ -1130,8 +1439,9 @@
|
| }
|
|
|
| // check forbidden words
|
| - if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
|
| - || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
|
| + if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) ||
|
| + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
|
| + TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
|
| TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
|
|
|
| if (rv) { // XXX obsolote
|
| @@ -1149,7 +1459,7 @@
|
|
|
| if (pAMgr) {
|
| rv = pAMgr->lookup(word);
|
| - if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
|
| + if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
|
| TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
|
| if (!(pAMgr->prefix_check(word,len,1)))
|
| rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
|
| @@ -1160,184 +1470,6 @@
|
| }
|
|
|
| #ifdef HUNSPELL_EXPERIMENTAL
|
| -// suggest stems, XXX experimental code
|
| -int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug)
|
| -{
|
| - char buf[MAXSWUTF8L];
|
| - char ** wlst;
|
| - int prevnsug = nsug;
|
| -
|
| - char w2[MAXWORDUTF8LEN];
|
| - const char * word = w;
|
| -
|
| - // word reversing wrapper for complex prefixes
|
| - if (complexprefixes) {
|
| - strcpy(w2, w);
|
| - if (utf8) reverseword_utf(w2); else reverseword(w2);
|
| - word = w2;
|
| - }
|
| -
|
| - if (*slst) {
|
| - wlst = *slst;
|
| - } else {
|
| - wlst = (char **) calloc(maxSug, sizeof(char *));
|
| - if (wlst == NULL) return -1;
|
| - }
|
| - // perhaps there are a fix stem in the dictionary
|
| - if ((nsug < maxSug) && (nsug > -1)) {
|
| -
|
| - nsug = fixstems(wlst, word, nsug);
|
| - if (nsug == prevnsug) {
|
| - char * s = mystrdup(word);
|
| - char * p = s + strlen(s);
|
| - while ((*p != '-') && (p != s)) p--;
|
| - if (*p == '-') {
|
| - *p = '\0';
|
| - nsug = fixstems(wlst, s, nsug);
|
| - if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) {
|
| - char * t;
|
| - buf[0] = '\0';
|
| - for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number?
|
| - if (*t != '\0') strcpy(buf, "# ");
|
| - strcat(buf, s);
|
| - wlst[nsug] = mystrdup(buf);
|
| - if (wlst[nsug] == NULL) return -1;
|
| - nsug++;
|
| - }
|
| - p++;
|
| - nsug = fixstems(wlst, p, nsug);
|
| - }
|
| -
|
| - free(s);
|
| - }
|
| - }
|
| -
|
| - if (nsug < 0) {
|
| - for (int i=0;i<maxSug; i++)
|
| - if (wlst[i] != NULL) free(wlst[i]);
|
| - free(wlst);
|
| - return -1;
|
| - }
|
| -
|
| - *slst = wlst;
|
| - return nsug;
|
| -}
|
| -
|
| -
|
| -// there are fix stems in dictionary
|
| -int SuggestMgr::fixstems(char ** wlst, const char * word, int ns)
|
| -{
|
| - char buf[MAXSWUTF8L];
|
| - char prefix[MAXSWUTF8L] = "";
|
| -
|
| - int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound
|
| - int cpdindex = 0;
|
| - struct hentry * rv = NULL;
|
| -
|
| - int wl = strlen(word);
|
| - int cmpdstemnum;
|
| - int cmpdstem[MAXCOMPOUND];
|
| -
|
| - if (pAMgr) {
|
| - rv = pAMgr->lookup(word);
|
| - if (rv) {
|
| - dicstem = 0;
|
| - } else {
|
| - // try stripping off affixes
|
| - rv = pAMgr->affix_check(word, wl);
|
| -
|
| - // else try check compound word
|
| - if (!rv && pAMgr->get_compound()) {
|
| - rv = pAMgr->compound_check(word, wl,
|
| - 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1);
|
| -
|
| - if (rv) {
|
| - dicstem = 2;
|
| - for (int j = 0; j < cmpdstemnum; j++) {
|
| - cpdindex += cmpdstem[j];
|
| - }
|
| - if(! (pAMgr->lookup(word + cpdindex)))
|
| - pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix
|
| - }
|
| - }
|
| -
|
| -
|
| - if (pAMgr->get_prefix()) {
|
| - strcpy(prefix, pAMgr->get_prefix());
|
| - }
|
| -
|
| - // XXX obsolete, will be a general solution for stemming
|
| - if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU)
|
| - }
|
| -
|
| - }
|
| -
|
| -
|
| -
|
| - if ((rv) && (ns < maxSug)) {
|
| -
|
| - // check fixstem flag and not_valid_stem flag
|
| - // first word
|
| - if ((ns < maxSug) && (dicstem < 2)) {
|
| - strcpy(buf, prefix);
|
| - if ((dicstem > 0) && pAMgr->get_derived()) {
|
| - // XXX obsolote
|
| - if (strlen(prefix) == 1) {
|
| - strcat(buf, (pAMgr->get_derived()) + 1);
|
| - } else {
|
| - strcat(buf, pAMgr->get_derived());
|
| - }
|
| - } else {
|
| - // special stem in affix description
|
| - const char * wordchars = pAMgr->get_wordchars();
|
| - if (rv->description &&
|
| - (strchr(wordchars, *(rv->description)))) {
|
| - char * desc = (rv->description) + 1;
|
| - while (strchr(wordchars, *desc)) desc++;
|
| - strncat(buf, rv->description, desc - (rv->description));
|
| - } else {
|
| - strcat(buf, rv->word);
|
| - }
|
| - }
|
| - wlst[ns] = mystrdup(buf);
|
| - if (wlst[ns] == NULL) return -1;
|
| - ns++;
|
| - }
|
| -
|
| - if (dicstem == 2) {
|
| -
|
| - // compound stem
|
| -
|
| -// if (rv->astr && (strchr(rv->astr, '0') == NULL)) {
|
| - if (rv->astr) {
|
| - strcpy(buf, word);
|
| - buf[cpdindex] = '\0';
|
| - if (prefix) strcat(buf, prefix);
|
| - if (pAMgr->get_derived()) {
|
| - strcat(buf, pAMgr->get_derived());
|
| - } else {
|
| - // special stem in affix description
|
| - const char * wordchars = pAMgr->get_wordchars();
|
| - if (rv->description &&
|
| - (strchr(wordchars, *(rv->description)))) {
|
| - char * desc = (rv->description) + 1;
|
| - while (strchr(wordchars, *desc)) desc++;
|
| - strncat(buf, rv->description, desc - (rv->description));
|
| - } else {
|
| - strcat(buf, rv->word);
|
| - }
|
| - }
|
| - if (ns < maxSug) {
|
| - wlst[ns] = mystrdup(buf);
|
| - if (wlst[ns] == NULL) return -1;
|
| - ns++;
|
| - }
|
| - }
|
| - }
|
| - }
|
| - return ns;
|
| -}
|
| -
|
| // suggest possible stems
|
| int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
|
| {
|
| @@ -1377,6 +1509,7 @@
|
| *slst = wlst;
|
| return nsug;
|
| }
|
| +#endif // END OF HUNSPELL_EXPERIMENTAL CODE
|
|
|
|
|
| char * SuggestMgr::suggest_morph(const char * w)
|
| @@ -1405,20 +1538,25 @@
|
|
|
| while (rv) {
|
| if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
|
| - TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) ||
|
| + TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
|
| TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
|
| - if (rv->description && ((!rv->astr) ||
|
| - !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen)))
|
| - strcat(result, word);
|
| - if (rv->description) strcat(result, rv->description);
|
| - strcat(result, "\n");
|
| + if (!HENTRY_FIND(rv, MORPH_STEM)) {
|
| + mystrcat(result, " ", MAXLNLEN);
|
| + mystrcat(result, MORPH_STEM, MAXLNLEN);
|
| + mystrcat(result, word, MAXLNLEN);
|
| + }
|
| + if (HENTRY_DATA(rv)) {
|
| + mystrcat(result, " ", MAXLNLEN);
|
| + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
|
| + }
|
| + mystrcat(result, "\n", MAXLNLEN);
|
| }
|
| rv = rv->next_homonym;
|
| }
|
|
|
| st = pAMgr->affix_check_morph(word,strlen(word));
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
|
|
| @@ -1426,28 +1564,177 @@
|
| pAMgr->compound_check_morph(word, strlen(word),
|
| 0, 0, 100, 0,NULL, 0, &r, NULL);
|
|
|
| - return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL;
|
| + return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;
|
| }
|
|
|
| +#ifdef HUNSPELL_EXPERIMENTAL
|
| char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
|
| {
|
| char * p = NULL;
|
| char ** wlst = (char **) calloc(maxSug, sizeof(char *));
|
| + if (!wlst) return NULL; // calloc failed; test the array pointer itself (every slot is still NULL here, so it must not be dereferenced)
|
| // we will use only the first suggestion
|
| for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";
|
| - int ns = suggest(&wlst, word, maxSug - 1);
|
| + int ns = suggest(&wlst, word, maxSug - 1, NULL);
|
| if (ns == maxSug) {
|
| p = suggest_morph(wlst[maxSug - 1]);
|
| free(wlst[maxSug - 1]);
|
| }
|
| if (wlst) free(wlst);
|
| - return p;
|
| + return p;
|
| }
|
| #endif // END OF HUNSPELL_EXPERIMENTAL CODE
|
|
|
| +/* affixation: collect the forms pAMgr->morphgen() produces for entry rv, and for its allomorph entries, with the given pattern; returns a mystrdup() copy (suggest_gen releases it with free()) or NULL when nothing was generated */
|
| +char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern)
|
| +{
|
| + char result[MAXLNLEN]; // output buffer: each generated form is followed by "\n"
|
| + *result = '\0';
|
| + int sfxcount = get_sfxcount(pattern);
|
|
|
| + if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; // entry data yields a larger get_sfxcount() than the pattern: nothing to generate
|
| +
|
| + if (HENTRY_DATA(rv)) {
|
| + char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen,
|
| + HENTRY_DATA(rv), pattern, 0);
|
| + if (aff) { // append the generated form, then release it with free()
|
| + mystrcat(result, aff, MAXLNLEN);
|
| + mystrcat(result, "\n", MAXLNLEN);
|
| + free(aff);
|
| + }
|
| + }
|
| +
|
| + // check all allomorphs: every MORPH_ALLOMORPH field in the entry data is looked up as a dictionary word
|
| + char allomorph[MAXLNLEN];
|
| + char * p = NULL;
|
| + if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);
|
| + while (p) {
|
| + struct hentry * rv2 = NULL;
|
| + p += MORPH_TAG_LEN; // step over the tag; p now points at the field value
|
| + int plen = fieldlen(p);
|
| + strncpy(allomorph, p, plen); // NOTE(review): plen is not checked against MAXLNLEN -- confirm field lengths are bounded
|
| + allomorph[plen] = '\0';
|
| + rv2 = pAMgr->lookup(allomorph);
|
| + while (rv2) { // visit every homonym of the allomorph entry
|
| +// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) {
|
| + if (HENTRY_DATA(rv2)) {
|
| + char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM);
|
| + if (st && (strncmp(st + MORPH_TAG_LEN, // accept the allomorph only if its stem field matches the beginning of rv's word
|
| + HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) {
|
| + char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen,
|
| + HENTRY_DATA(rv2), pattern, 0);
|
| + if (aff) {
|
| + mystrcat(result, aff, MAXLNLEN);
|
| + mystrcat(result, "\n", MAXLNLEN);
|
| + free(aff);
|
| + }
|
| + }
|
| + }
|
| + rv2 = rv2->next_homonym;
|
| + }
|
| + p = strstr(p + plen, MORPH_ALLOMORPH); // continue with the next allomorph tag, if any
|
| + }
|
| +
|
| + return (*result) ? mystrdup(result) : NULL; // NULL when no form was generated
|
| +}
|
| +
|
| +char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) { // generates forms for the n descriptions in desc using pattern; returns a mystrdup() copy of the MSEP_REC-introduced records, or NULL if none
|
| + char result[MAXLNLEN]; // per-description prefix: the leading compound parts
|
| + char result2[MAXLNLEN]; // accumulated output; each record starts with MSEP_REC
|
| + char newpattern[MAXLNLEN]; // writable copy of pattern, used when the loop repeats
|
| + *newpattern = '\0';
|
| + if (n == 0) return 0; // no descriptions: nothing to generate
|
| + *result2 = '\0';
|
| + struct hentry * rv = NULL;
|
| + if (!pAMgr) return NULL;
|
| +
|
| +// search affixed forms with and without derivational suffixes
|
| + while(1) { // repeats with a rewritten pattern when a pass generates nothing (see the loop tail)
|
| +
|
| + for (int k = 0; k < n; k++) {
|
| + *result = '\0';
|
| + // add compound word parts (except the last one)
|
| + char * s = (char *) desc[k];
|
| + char * part = strstr(s, MORPH_PART);
|
| + if (part) {
|
| + char * nextpart = strstr(part + 1, MORPH_PART);
|
| + while (nextpart) {
|
| + copy_field(result + strlen(result), part, MORPH_PART);
|
| + part = nextpart;
|
| + nextpart = strstr(part + 1, MORPH_PART);
|
| + }
|
| + s = part; // s now starts at the last MORPH_PART tag
|
| + }
|
| +
|
| + char **pl;
|
| + char tok[MAXLNLEN];
|
| + strcpy(tok, s); // NOTE(review): unbounded copy into tok[MAXLNLEN] -- assumes descriptions are shorter than MAXLNLEN
|
| + char * alt = strstr(tok, " | ");
|
| + while (alt) {
|
| + alt[1] = MSEP_ALT; // replace the '|' of each " | " separator so line_tok() can split on MSEP_ALT
|
| + alt = strstr(alt, " | ");
|
| + }
|
| + int pln = line_tok(tok, &pl, MSEP_ALT); // pl/pln are released with freelist() after the loop
|
| + for (int i = 0; i < pln; i++) {
|
| + // remove inflectional and terminal suffixes
|
| + char * is = strstr(pl[i], MORPH_INFL_SFX);
|
| + if (is) *is = '\0'; // cut the alternative at its first inflectional-suffix tag
|
| + char * ts = strstr(pl[i], MORPH_TERM_SFX);
|
| + while (ts) {
|
| + *ts = '_'; // overwrite the tag's first character so strstr() stops matching it
|
| + ts = strstr(pl[i], MORPH_TERM_SFX);
|
| + }
|
| + char * st = strstr(s, MORPH_STEM); // the stem is searched in the description s, not in the edited pl[i]
|
| + if (st) {
|
| + copy_field(tok, st, MORPH_STEM); // tok is reused as scratch for the lookup key (copy_field presumably extracts the field value -- confirm)
|
| + rv = pAMgr->lookup(tok);
|
| + while (rv) {
|
| + char newpat[MAXLNLEN];
|
| + strcpy(newpat, pl[i]); // first try the alternative's own tags followed by pattern
|
| + strcat(newpat, pattern);
|
| + char * sg = suggest_hentry_gen(rv, newpat);
|
| + if (!sg) sg = suggest_hentry_gen(rv, pattern); // fall back to the bare pattern
|
| + if (sg) {
|
| + char ** gen;
|
| + int genl = line_tok(sg, &gen, MSEP_REC);
|
| + free(sg); // gen is still read below, so line_tok() presumably copies the tokens -- confirm
|
| + sg = NULL;
|
| + for (int j = 0; j < genl; j++) {
|
| + if (strstr(pl[i], MORPH_SURF_PFX)) {
|
| + int r2l = strlen(result2);
|
| + result2[r2l] = MSEP_REC; // start a new record by hand: separator, compound prefix, surface prefix, generated form
|
| + strcpy(result2 + r2l + 1, result);
|
| + copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX);
|
| + mystrcat(result2, gen[j], MAXLNLEN);
|
| + } else {
|
| + sprintf(result2 + strlen(result2), "%c%s%s", // NOTE(review): unbounded write into result2[MAXLNLEN]
|
| + MSEP_REC, result, gen[j]);
|
| + }
|
| + }
|
| + freelist(&gen, genl);
|
| + }
|
| + rv = rv->next_homonym;
|
| + }
|
| + }
|
| + }
|
| + freelist(&pl, pln);
|
| + }
|
| +
|
| + if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break; // stop once something was generated or no derivational-suffix tag is left to rewrite
|
| + strcpy(newpattern, pattern);
|
| + pattern = newpattern; // from here on the local copy is edited, not the caller's string
|
| + char * ds = strstr(pattern, MORPH_DERI_SFX);
|
| + while (ds) {
|
| + strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN); // overwrite each derivational-suffix tag with the terminal-suffix tag
|
| + ds = strstr(pattern, MORPH_DERI_SFX);
|
| + }
|
| + }
|
| + return (*result2 ? mystrdup(result2) : NULL);
|
| +}
|
| +
|
| +
|
| // generate an n-gram score comparing s1 and s2
|
| -int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
|
| +int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt)
|
| {
|
| int nscore = 0;
|
| int ns;
|
| @@ -1459,13 +1746,9 @@
|
| w_char su2[MAXSWL];
|
| l1 = u8_u16(su1, MAXSWL, s1);
|
| l2 = u8_u16(su2, MAXSWL, s2);
|
| - if (!l2 || (l1==-1) || (l2==-1)) return 0;
|
| - // decapitalize dictionary word
|
| - if (complexprefixes) {
|
| - mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum());
|
| - } else {
|
| - mkallsmall_utf(su2, 1, pAMgr->get_langnum());
|
| - }
|
| + if ((l2 <= 0) || (l1 == -1)) return 0;
|
| + // lowering dictionary word
|
| + if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum);
|
| for (int j = 1; j <= n; j++) {
|
| ns = 0;
|
| for (int i = 0; i <= (l1-j); i++) {
|
| @@ -1489,13 +1772,9 @@
|
| char t[MAXSWUTF8L];
|
| l1 = strlen(s1);
|
| l2 = strlen(s2);
|
| - if (!l2) return 0;
|
| + if (l2 == 0) return 0;
|
| strcpy(t, s2);
|
| - if (complexprefixes) {
|
| - *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;
|
| - } else {
|
| - mkallsmall(t, csconv);
|
| - }
|
| + if (opt & NGRAM_LOWERING) mkallsmall(t, csconv);
|
| for (int j = 1; j <= n; j++) {
|
| ns = 0;
|
| for (int i = 0; i <= (l1-j); i++) {
|
| @@ -1510,13 +1789,14 @@
|
| }
|
|
|
| ns = 0;
|
| - if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
|
| - if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
|
| + if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
|
| + if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
|
| ns = (nscore - ((ns > 0) ? ns : 0));
|
| return ns;
|
| }
|
|
|
| -int SuggestMgr::equalfirstletter(char * s1, const char * s2) {
|
| +// length of the left common substring of s1 and (decapitalised) s2
|
| +int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) {
|
| if (utf8) {
|
| w_char su1[MAXSWL];
|
| w_char su2[MAXSWL];
|
| @@ -1526,9 +1806,17 @@
|
| int l2 = u8_u16(su2, MAXSWL, s2);
|
| if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;
|
| } else {
|
| + int i;
|
| u8_u16(su1, 1, s1);
|
| u8_u16(su2, 1, s2);
|
| - if (*((short *)su1) == *((short *)su2)) return 1;
|
| + unsigned short idx = (su2->h << 8) + su2->l;
|
| + if (*((short *)su1) != *((short *)su2) &&
|
| + (*((unsigned short *)su1) != unicodetolower(idx, langnum))) return 0;
|
| + int l1 = u8_u16(su1, MAXSWL, s1);
|
| + int l2 = u8_u16(su2, MAXSWL, s2);
|
| + for(i = 1; (i < l1) && (i < l2) &&
|
| + (*((short *)(su1 + i)) == *((short *)(su2 + i))); i++);
|
| + return i;
|
| }
|
| } else {
|
| if (complexprefixes) {
|
| @@ -1536,7 +1824,13 @@
|
| int l2 = strlen(s2);
|
| if (*(s2+l1-1) == *(s2+l2-1)) return 1;
|
| } else {
|
| - if (*s1 == *s2) return 1;
|
| + char * olds = s1;
|
| + // decapitalise dictionary word
|
| + if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0;
|
| + do {
|
| + s1++; s2++;
|
| + } while ((*s1 == *s2) && (*s1 != '\0'));
|
| + return s1 - olds;
|
| }
|
| }
|
| return 0;
|
| @@ -1554,9 +1848,9 @@
|
| int l2 = u8_u16(su2, MAXSWL, s2);
|
| // decapitalize dictionary word
|
| if (complexprefixes) {
|
| - mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum());
|
| + mkallsmall_utf(su2+l2-1, 1, langnum);
|
| } else {
|
| - mkallsmall_utf(su2, 1, pAMgr->get_langnum());
|
| + mkallsmall_utf(su2, 1, langnum);
|
| }
|
| for (int i = 0; (i < l1) && (i < l2); i++) {
|
| if (((short *) su1)[i] == ((short *) su2)[i]) {
|
| @@ -1603,7 +1897,7 @@
|
| }
|
|
|
| // sort in decreasing order of score
|
| -void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
|
| +void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n )
|
| {
|
| int m = 1;
|
| while (m < n) {
|
| @@ -1616,6 +1910,11 @@
|
| rword[j-1] = rword[j];
|
| rsc[j] = sctmp;
|
| rword[j] = wdtmp;
|
| + if (rword2) {
|
| + wdtmp = rword2[j-1];
|
| + rword2[j-1] = rword2[j];
|
| + rword2[j] = wdtmp;
|
| + }
|
| j--;
|
| } else break;
|
| }
|
| @@ -1642,6 +1941,12 @@
|
| }
|
| c = (char *) malloc((m + 1) * (n + 1));
|
| b = (char *) malloc((m + 1) * (n + 1));
|
| + if (!c || !b) {
|
| + if (c) free(c);
|
| + if (b) free(b);
|
| + *result = NULL;
|
| + return;
|
| + }
|
| for (i = 1; i <= m; i++) c[i*(n+1)] = 0;
|
| for (j = 0; j <= n; j++) c[j] = 0;
|
| for (i = 1; i <= m; i++) {
|
| @@ -1673,6 +1978,7 @@
|
| char * result;
|
| int len = 0;
|
| lcs(s, s2, &m, &n, &result);
|
| + if (!result) return 0;
|
| i = m;
|
| j = n;
|
| while ((i != 0) && (j != 0)) {
|
| @@ -1684,6 +1990,6 @@
|
| i--;
|
| } else j--;
|
| }
|
| - if (result) free(result);
|
| + free(result);
|
| return len;
|
| }
|
|
|