chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx - Issue 155841: Update Hunspell to the latest stable version to use the latest dictionary for...

Unified Diff: chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx

Issue 155841: Update Hunspell to the latest stable version to use the latest dictionary for... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: '' Created 11 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx ('k') | chrome/third_party/hunspell/src/hunspell/w_char.hxx » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx

===================================================================

--- chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx (revision 21721)

+++ chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx (working copy)

@@ -14,13 +14,16 @@

#endif

#include "suggestmgr.hxx"

+#include "htypes.hxx"

+#include "csutil.hxx"

#ifndef MOZILLA_CLIENT

-#ifndef W32

+#ifndef WIN32

using namespace std;

#endif

+const w_char W_VLINE = { '\0', '|' };

SuggestMgr::SuggestMgr(const char * tryme, int maxn,

AffixMgr * aptr)

@@ -30,36 +33,54 @@

// try when building candidate suggestions

pAMgr = aptr;

+ ckeyl = 0;

+ ckey = NULL;

+ ckey_utf = NULL;

ctryl = 0;

ctry = NULL;

ctry_utf = NULL;

+ utf8 = 0;

+ langnum = 0;

+ complexprefixes = 0;

maxSug = maxn;

nosplitsugs = 0;

maxngramsugs = MAXNGRAMSUGS;

- utf8 = 0;

- complexprefixes = 0;

if (pAMgr) {

char * enc = pAMgr->get_encoding();

csconv = get_current_cs(enc);

free(enc);

+ langnum = pAMgr->get_langnum();

+ ckey = pAMgr->get_key_string();

nosplitsugs = pAMgr->get_nosplitsugs();

if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs();

utf8 = pAMgr->get_utf8();

complexprefixes = pAMgr->get_complexprefixes();

}

- if (tryme) {

+ if (ckey) {

if (utf8) {

w_char t[MAXSWL];

+ ckeyl = u8_u16(t, MAXSWL, ckey);

+ ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char));

+ if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char));

+ } else {

+ ckeyl = strlen(ckey);

+ }

+ if (tryme) {

+ ctry = mystrdup(tryme);

+ if (ctry) ctryl = strlen(ctry);

+ if (ctry && utf8) {

+ w_char t[MAXSWL];

ctryl = u8_u16(t, MAXSWL, tryme);

ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char));

- memcpy(ctry_utf, t, ctryl * sizeof(w_char));

- } else {

- ctry = mystrdup(tryme);

- ctryl = strlen(ctry);

+ if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char));

+ else ctryl = 0;

}

@@ -68,6 +89,11 @@

SuggestMgr::~SuggestMgr()

{

pAMgr = NULL;

+ if (ckey) free(ckey);

+ ckey = NULL;

+ if (ckey_utf) free(ckey_utf);

+ ckey_utf = NULL;

+ ckeyl = 0;

if (ctry) free(ctry);

ctry = NULL;

if (ctry_utf) free(ctry_utf);

@@ -77,7 +103,7 @@

}

int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,

- int * timer, time_t * timelimit) {

+ int * timer, clock_t * timelimit) {

int cwrd = 1;

if (ns == maxSug) return maxSug;

for (int k=0; k < ns; k++) {

@@ -96,13 +122,15 @@

// generate suggestions for a mispelled word

// pass in address of array of char * pointers

+// onlycompoundsug: probably bad suggestions (need for ngram sugs, too)

-int SuggestMgr::suggest(char*** slst, const char * w, int nsug)

+int SuggestMgr::suggest(char*** slst, const char * w, int nsug,

+ int * onlycompoundsug)

{

- int nocompoundtwowords = 0;

- char ** wlst;

- w_char word_utf[MAXSWL];

- int wl = 0;

+ int nocompoundtwowords = 0;

+ char ** wlst;

+ w_char word_utf[MAXSWL];

+ int wl = 0;

char w2[MAXWORDUTF8LEN];

const char * word = w;

@@ -141,8 +169,8 @@

nsug = replchars(wlst, word, nsug, cpdsuggest);

// perhaps we made chose the wrong char from a related set

- if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) {

- nsug = mapchars(wlst, word, nsug);

+ if ((nsug < maxSug) && (nsug > -1)) {

+ nsug = mapchars(wlst, word, nsug, cpdsuggest);

}

// did we swap the order of chars by mistake

@@ -157,6 +185,22 @@

longswapchar(wlst, word, nsug, cpdsuggest);

}

+ // did we just hit the wrong key in place of a good char (case and keyboard)

+ if ((nsug < maxSug) && (nsug > -1)) {

+ nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

+ badcharkey(wlst, word, nsug, cpdsuggest);

+ }

+ // only suggest compound words when no other suggestion

+ if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords=1;

+ // did we add a char that should not be there

+ if ((nsug < maxSug) && (nsug > -1)) {

+ nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

+ extrachar(wlst, word, nsug, cpdsuggest);

+ }

// did we forgot a char

if ((nsug < maxSug) && (nsug > -1)) {

nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

@@ -169,12 +213,6 @@

movechar(wlst, word, nsug, cpdsuggest);

}

- // did we add a char that should not be there

- if ((nsug < maxSug) && (nsug > -1)) {

- nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

- extrachar(wlst, word, nsug, cpdsuggest);

- }

// did we just hit the wrong key in place of a good char

if ((nsug < maxSug) && (nsug > -1)) {

nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :

@@ -187,10 +225,6 @@

doubletwochars(wlst, word, nsug, cpdsuggest);

}

- // only suggest compound words when no other suggestion

- if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;

// perhaps we forgot to hit space and two words ran together

if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) {

nsug = twowords(wlst, word, nsug, cpdsuggest);

@@ -205,6 +239,8 @@

free(wlst);

wlst = NULL;

}

+ if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1;

*slst = wlst;

return nsug;

@@ -242,8 +278,8 @@

nsug = replchars(wlst, word, nsug, cpdsuggest);

// perhaps we made chose the wrong char from a related set

- if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0))

- nsug = mapchars(wlst, word, nsug);

+ if ((nsug < maxSug) && (nsug > -1))

+ nsug = mapchars(wlst, word, nsug, cpdsuggest);

if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;

@@ -273,7 +309,7 @@

char candidate[MAXSWUTF8L];

w_char candidate_utf[MAXSWL];

memcpy(candidate_utf, word, wl * sizeof(w_char));

- mkallcap_utf(candidate_utf, wl, pAMgr->get_langnum());

+ mkallcap_utf(candidate_utf, wl, langnum);

u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

}

@@ -288,9 +324,9 @@

}

// suggestions for when chose the wrong char out of a related set

-int SuggestMgr::mapchars(char** wlst, const char * word, int ns)

+int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)

{

- time_t timelimit;

+ clock_t timelimit;

int timer;

int wl = strlen(word);

@@ -300,18 +336,19 @@

struct mapentry* maptable = pAMgr->get_maptable();

if (maptable==NULL) return ns;

- timelimit = time(NULL);

+ timelimit = clock();

timer = MINTIMER;

if (utf8) {

w_char w[MAXSWL];

int len = u8_u16(w, MAXSWL, word);

- ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit);

- } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit);

+ ns = map_related_utf(w, len, 0, cpdsuggest, wlst, ns, maptable, nummap, &timer, &timelimit);

+ } else ns = map_related(word, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit);

return ns;

}

-int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns,

- const mapentry* maptable, int nummap, int * timer, time_t * timelimit)

+int SuggestMgr::map_related(const char * word, int i, char** wlst,

+ int cpdsuggest, int ns,

+ const mapentry* maptable, int nummap, int * timer, clock_t * timelimit)

{

char c = *(word + i);

if (c == 0) {

@@ -319,8 +356,7 @@

int wl = strlen(word);

for (int m=0; m < ns; m++)

if (strcmp(word,wlst[m]) == 0) cwrd = 0;

- if ((cwrd) && (checkword(word, wl, 0, timer, timelimit) ||

- checkword(word, wl, 1, timer, timelimit))) {

+ if ((cwrd) && checkword(word, wl, cpdsuggest, timer, timelimit)) {

if (ns < maxSug) {

wlst[ns] = mystrdup(word);

if (wlst[ns] == NULL) return -1;

@@ -334,23 +370,27 @@

if (strchr(maptable[j].set,c) != 0) {

in_map = 1;

char * newword = mystrdup(word);

+ if (!newword) return -1;

for (int k = 0; k < maptable[j].len; k++) {

*(newword + i) = *(maptable[j].set + k);

- ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit);

- if (!(*timelimit)) return ns;

+ ns = map_related(newword, (i+1), wlst, cpdsuggest,

+ ns, maptable, nummap, timer, timelimit);

+ if (!(*timer)) return ns;

}

free(newword);

}

if (!in_map) {

i++;

- ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit);

+ ns = map_related(word, i, wlst, cpdsuggest,

+ ns, maptable, nummap, timer, timelimit);

}

return ns;

}

-int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns,

- const mapentry* maptable, int nummap, int * timer, time_t * timelimit)

+int SuggestMgr::map_related_utf(w_char * word, int len, int i, int cpdsuggest,

+ char** wlst, int ns, const mapentry* maptable, int nummap,

+ int * timer, clock_t * timelimit)

{

if (i == len) {

int cwrd = 1;

@@ -360,8 +400,7 @@

wl = strlen(s);

for (int m=0; m < ns; m++)

if (strcmp(s,wlst[m]) == 0) cwrd = 0;

- if ((cwrd) && (checkword(s, wl, 0, timer, timelimit) ||

- checkword(s, wl, 1, timer, timelimit))) {

+ if ((cwrd) && checkword(s, wl, cpdsuggest, timer, timelimit)) {

if (ns < maxSug) {

wlst[ns] = mystrdup(s);

if (wlst[ns] == NULL) return -1;

@@ -377,15 +416,17 @@

in_map = 1;

for (int k = 0; k < maptable[j].len; k++) {

*(word + i) = *(maptable[j].set_utf16 + k);

- ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit);

- if (!(*timelimit)) return ns;

+ ns = map_related_utf(word, len, i + 1, cpdsuggest,

+ wlst, ns, maptable, nummap, timer, timelimit);

+ if (!(*timer)) return ns;

}

*((unsigned short *) word + i) = c;

}

if (!in_map) {

i++;

- ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit);

+ ns = map_related_utf(word, len, i, cpdsuggest,

+ wlst, ns, maptable, nummap, timer, timelimit);

}

return ns;

}

@@ -416,6 +457,23 @@

strcpy(candidate+(r-word)+lenr, r+lenp);

ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);

if (ns == -1) return -1;

+ // check REP suggestions with space

+ char * sp = strchr(candidate, ' ');

+ if (sp) {

+ *sp = '\0';

+ if (checkword(candidate, strlen(candidate), 0, NULL, NULL)) {

+ int oldns = ns;

+ *sp = ' ';

+ ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ if (oldns < ns) {

+ free(wlst[ns - 1]);

+ wlst[ns - 1] = mystrdup(candidate);

+ if (!wlst[ns - 1]) return -1;

+ }

+ *sp = ' ';

+ }

r++; // search for the next letter

}

@@ -454,7 +512,7 @@

int state=0;

if (wl < 5 || ! pAMgr) return ns;

for (int i=2; i < wl; i++) {

- if ((word[i].l==word[i-2].l) && (word[i].h==word[i-2].h)) {

+ if (w_char_eq(word[i], word[i-2])) {

state++;

if (state==3) {

memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));

@@ -471,25 +529,108 @@

return ns;

}

+// error is wrong char in place of correct one (case and keyboard related version)

+int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest)

+ char tmpc;

+ char candidate[MAXSWUTF8L];

+ int wl = strlen(word);

+ strcpy(candidate, word);

+ // swap out each char one by one and try uppercase and neighbor

+ // keyboard chars in its place to see if that makes a good word

+ for (int i=0; i < wl; i++) {

+ tmpc = candidate[i];

+ // check with uppercase letters

+ candidate[i] = csconv[((unsigned char)tmpc)].cupper;

+ if (tmpc != candidate[i]) {

+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ candidate[i] = tmpc;

+ }

+ // check neighbor characters in keyboard string

+ if (!ckey) continue;

+ char * loc = strchr(ckey, tmpc);

+ while (loc) {

+ if ((loc > ckey) && (*(loc - 1) != '|')) {

+ candidate[i] = *(loc - 1);

+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ }

+ if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {

+ candidate[i] = *(loc + 1);

+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ }

+ loc = strchr(loc + 1, tmpc);

+ }

+ candidate[i] = tmpc;

+ }

+ return ns;

+// error is wrong char in place of correct one (case and keyboard related version)

+int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)

+ w_char tmpc;

+ w_char candidate_utf[MAXSWL];

+ char candidate[MAXSWUTF8L];

+ memcpy(candidate_utf, word, wl * sizeof(w_char));

+ // swap out each char one by one and try uppercase and neighbor

+ // keyboard chars in its place to see if that makes a good word

+ for (int i=0; i < wl; i++) {

+ tmpc = candidate_utf[i];

+ // check with uppercase letters

+ mkallcap_utf(candidate_utf + i, 1, langnum);

+ if (!w_char_eq(tmpc, candidate_utf[i])) {

+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ candidate_utf[i] = tmpc;

+ }

+ // check neighbor characters in keyboard string

+ if (!ckey) continue;

+ w_char * loc = ckey_utf;

+ while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++;

+ while (loc < (ckey_utf + ckeyl)) {

+ if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {

+ candidate_utf[i] = *(loc - 1);

+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ }

+ if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {

+ candidate_utf[i] = *(loc + 1);

+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ }

+ do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc));

+ }

+ candidate_utf[i] = tmpc;

+ }

+ return ns;

// error is wrong char in place of correct one

int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)

{

char tmpc;

char candidate[MAXSWUTF8L];

- time_t timelimit = time(NULL);

+ clock_t timelimit = clock();

int timer = MINTIMER;

int wl = strlen(word);

strcpy(candidate, word);

// swap out each char one by one and try all the tryme

// chars in its place to see if that makes a good word

- for (int i=0; i < wl; i++) {

- tmpc = candidate[i];

- for (int j=0; j < ctryl; j++) {

+ for (int j=0; j < ctryl; j++) {

+ for (int i=wl-1; i >= 0; i--) {

+ tmpc = candidate[i];

if (ctry[j] == tmpc) continue;

candidate[i] = ctry[j];

ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);

if (ns == -1) return -1;

- if (!timelimit) return ns;

+ if (!timer) return ns;

candidate[i] = tmpc;

}

@@ -502,20 +643,20 @@

w_char tmpc;

w_char candidate_utf[MAXSWL];

char candidate[MAXSWUTF8L];

- time_t timelimit = time(NULL);

+ clock_t timelimit = clock();

int timer = MINTIMER;

memcpy(candidate_utf, word, wl * sizeof(w_char));

// swap out each char one by one and try all the tryme

// chars in its place to see if that makes a good word

- for (int i=0; i < wl; i++) {

- tmpc = candidate_utf[i];

- for (int j=0; j < ctryl; j++) {

- if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue;

+ for (int j=0; j < ctryl; j++) {

+ for (int i=wl-1; i >= 0; i--) {

+ tmpc = candidate_utf[i];

+ if (w_char_eq(tmpc, ctry_utf[j])) continue;

candidate_utf[i] = ctry_utf[j];

u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);

if (ns == -1) return -1;

- if (!timelimit) return ns;

+ if (!timer) return ns;

candidate_utf[i] = tmpc;

}

@@ -525,18 +666,20 @@

// error is word has an extra letter it does not need

int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)

{

- char candidate[MAXSWUTF8L];

+ char candidate[MAXSWUTF8L];

w_char candidate_utf[MAXSWL];

- const w_char * p;

- w_char * r;

+ w_char * p;

+ w_char tmpc = W_VLINE; // not used value, only for VCC warning message

if (wl < 2) return ns;

// try omitting one char of word at a time

- memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char));

- for (p = word, r = candidate_utf; p < word + wl; ) {

- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);

+ memcpy(candidate_utf, word, wl * sizeof(w_char));

+ for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {

+ w_char tmpc2 = *p;

+ if (p < candidate_utf + wl - 1) *p = tmpc;

+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);

ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

if (ns == -1) return -1;

- *r++ = *p++;

+ tmpc = tmpc2;

}

return ns;

}

@@ -544,48 +687,42 @@

// error is word has an extra letter it does not need

int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)

{

+ char tmpc = '\0';

char candidate[MAXSWUTF8L];

- const char * p;

- char * r;

+ char * p;

int wl = strlen(word);

if (wl < 2) return ns;

// try omitting one char of word at a time

- strcpy (candidate, word + 1);

- for (p = word, r = candidate; *p != 0; ) {

+ strcpy (candidate, word);

+ for (p = candidate + wl - 1; p >=candidate; p--) {

+ char tmpc2 = *p;

+ *p = tmpc;

ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL);

if (ns == -1) return -1;

- *r++ = *p++;

+ tmpc = tmpc2;

}

return ns;

}

// error is missing a letter it needs

int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)

{

char candidate[MAXSWUTF8L];

- const char * p;

- char * q;

- time_t timelimit = time(NULL);

+ char * p;

+ clock_t timelimit = clock();

int timer = MINTIMER;

int wl = strlen(word);

- // try inserting a tryme character before every letter

- strcpy(candidate + 1, word);

- for (p = word, q = candidate; *p != 0; ) {

- for (int i = 0; i < ctryl; i++) {

- *q = ctry[i];

+ // try inserting a tryme character before every letter (and the null terminator)

+ for (int i = 0; i < ctryl; i++) {

+ strcpy(candidate, word);

+ for (p = candidate + wl; p >= candidate; p--) {

+ *(p+1) = *p;

+ *p = ctry[i];

ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit);

if (ns == -1) return -1;

- if (!timelimit) return ns;

+ if (!timer) return ns;

}

- *q++ = *p++;

}

- // now try adding one to end */

- for (int i = 0; i < ctryl; i++) {

- *q = ctry[i];

- ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, NULL, NULL);

- if (ns == -1) return -1;

- }

return ns;

}

@@ -594,32 +731,21 @@

{

w_char candidate_utf[MAXSWL];

char candidate[MAXSWUTF8L];

- const w_char * p;

- w_char * q;

- int cwrd;

- time_t timelimit = time(NULL);

+ w_char * p;

+ clock_t timelimit = clock();

int timer = MINTIMER;

- // try inserting a tryme character before every letter

- memcpy (candidate_utf + 1, word, wl * sizeof(w_char));

- for (p = word, q = candidate_utf; p < (word + wl); ) {

- for (int i = 0; i < ctryl; i++) {

- *q = ctry_utf[i];

- cwrd = 1;

+ // try inserting a tryme character at the end of the word and before every letter

+ for (int i = 0; i < ctryl; i++) {

+ memcpy (candidate_utf, word, wl * sizeof(w_char));

+ for (p = candidate_utf + wl; p >= candidate_utf; p--) {

+ *(p + 1) = *p;

+ *p = ctry_utf[i];

u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);

ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);

if (ns == -1) return -1;

- if (!timelimit) return ns;

- }

- *q++ = *p++;

+ if (!timer) return ns;

+ }

}

- // now try adding one to end */

- for (int i = 0; i < ctryl; i++) {

- *q = ctry_utf[i];

- cwrd = 1;

- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);

- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

- if (ns == -1) return -1;

- }

return ns;

}

@@ -636,19 +762,19 @@

int wl=strlen(word);

if (wl < 3) return ns;

- if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl);

+ if (langnum == LANG_hu) forbidden = check_forbidden(word, wl);

strcpy(candidate + 1, word);

// split the string into two pieces after every char

// if both pieces are good words make them a suggestion

for (p = candidate + 1; p[1] != '\0'; p++) {

p[-1] = *p;

// go to end of the UTF-8 character

while (utf8 && ((p[1] & 0xc0) == 0x80)) {

+ *p = p[1];

p++;

- p[-1] = *p;

}

+ if (utf8 && p[1] == '\0') break; // last UTF-8 character

*p = '\0';

c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL);

if (c1) {

@@ -657,7 +783,7 @@

*p = ' ';

// spec. Hungarian code (need a better compound word support)

- if ((pAMgr->get_langnum() == LANG_hu) && !forbidden &&

+ if ((langnum == LANG_hu) && !forbidden &&

// if 3 repeating letter, use - instead of space

(((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||

// or multiple compounding, with more, than 6 syllables

@@ -673,6 +799,23 @@

ns++;

}

} else return ns;

+ // add two word suggestion with dash, if TRY string contains

+ // "a" or "-"

+ // NOTE: cwrd isn't modified for REP twoword sugg.

+ if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&

+ mystrlen(p + 1) > 1 &&

+ mystrlen(candidate) - mystrlen(p) > 1) {

+ *p = '-';

+ for (int k=0; k < ns; k++)

+ if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;

+ if (ns < maxSug) {

+ if (cwrd) {

+ wlst[ns] = mystrdup(candidate);

+ if (wlst[ns] == NULL) return -1;

+ ns++;

+ }

+ } else return ns;

+ }

}

@@ -698,6 +841,24 @@

p[1] = *p;

*p = tmpc;

}

+ // try double swaps for short words

+ // ahev -> have, owudl -> would

+ if (wl == 4 || wl == 5) {

+ candidate[0] = word[1];

+ candidate[1] = word[0];

+ candidate[2] = word[2];

+ candidate[wl - 2] = word[wl - 1];

+ candidate[wl - 1] = word[wl - 2];

+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ if (wl == 5) {

+ candidate[0] = word[0];

+ candidate[1] = word[2];

+ candidate[2] = word[1];

+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ }

return ns;

}

@@ -708,6 +869,7 @@

char candidate[MAXSWUTF8L];

w_char * p;

w_char tmpc;

+ int len = 0;

// try swapping adjacent chars one by one

memcpy (candidate_utf, word, wl * sizeof(w_char));

for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {

@@ -715,11 +877,32 @@

*p = p[1];

p[1] = tmpc;

u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

+ if (len == 0) len = strlen(candidate);

+ ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);

if (ns == -1) return -1;

p[1] = *p;

*p = tmpc;

}

+ // try double swaps for short words

+ // ahev -> have, owudl -> would, suodn -> sound

+ if (wl == 4 || wl == 5) {

+ candidate_utf[0] = word[1];

+ candidate_utf[1] = word[0];

+ candidate_utf[2] = word[2];

+ candidate_utf[wl - 2] = word[wl - 1];

+ candidate_utf[wl - 1] = word[wl - 2];

+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

+ ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ if (wl == 5) {

+ candidate_utf[0] = word[0];

+ candidate_utf[1] = word[2];

+ candidate_utf[2] = word[1];

+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

+ ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);

+ if (ns == -1) return -1;

+ }

return ns;

}

@@ -794,7 +977,7 @@

*(q-1) = *q;

*q = tmpc;

if ((q-p) < 2) continue; // omit swap char

- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

if (ns == -1) return -1;

}

strcpy(candidate, word);

@@ -805,7 +988,7 @@

*(q+1) = *q;

*q = tmpc;

if ((p-q) < 2) continue; // omit swap char

- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

if (ns == -1) return -1;

}

strcpy(candidate, word);

@@ -830,7 +1013,7 @@

*q = tmpc;

if ((q-p) < 2) continue; // omit swap char

u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

if (ns == -1) return -1;

}

memcpy (candidate_utf, word, wl * sizeof(w_char));

@@ -842,7 +1025,7 @@

*q = tmpc;

if ((p-q) < 2) continue; // omit swap char

u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);

- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);

+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);

if (ns == -1) return -1;

}

memcpy (candidate_utf, word, wl * sizeof(w_char));

@@ -851,28 +1034,33 @@

}

// generate a set of suggestions for very poorly spelled words

-int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr)

+int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md)

{

int i, j;

int lval;

- int sc;

- int lp;

+ int sc, scphon;

+ int lp, lpphon;

int nonbmp = 0;

- if (!pHMgr) return 0;

// exhaustively search through all root words

// keeping track of the MAX_ROOTS most similar root words

struct hentry * roots[MAX_ROOTS];

+ char * rootsphon[MAX_ROOTS];

int scores[MAX_ROOTS];

+ int scoresphon[MAX_ROOTS];

for (i = 0; i < MAX_ROOTS; i++) {

roots[i] = NULL;

scores[i] = -100 * i;

+ rootsphon[i] = NULL;

+ scoresphon[i] = -100 * i;

}

lp = MAX_ROOTS - 1;

+ lpphon = MAX_ROOTS - 1;

+ scphon = scoresphon[MAX_ROOTS-1];

char w2[MAXWORDUTF8LEN];

+ char f[MAXSWUTF8L];

char * word = w;

// word reversing wrapper for complex prefixes

@@ -896,8 +1084,8 @@

struct hentry* hp = NULL;

int col = -1;

- #ifdef HUNSPELL_CHROME_CLIENT

+#ifdef HUNSPELL_CHROME_CLIENT

// A static array of hentries required for walking the hash table.

struct hentry static_hentry[MAX_ROOTS];

@@ -906,31 +1094,61 @@

static const int kMaxWordLen = 128;

char hentry_word[MAX_ROOTS][kMaxWordLen];

unsigned short hentry_astr[MAX_ROOTS];

- #endif

+#endif

- while ((hp = pHMgr->walk_hashtable(col, hp))) {

+ phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;

+ char target[MAXSWUTF8L];

+ char candidate[MAXSWUTF8L];

+ if (ph) {

+ strcpy(candidate, word);

+ mkallcap(candidate, csconv);

+ phonet(candidate, target, n, *ph);

+ }

+ for (i = 0; i < md; i++) {

+ while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) {

if ((hp->astr) && (pAMgr) &&

(TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) ||

+ TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||

TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) ||

TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue;

- sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);

+ sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + NGRAM_LOWERING) +

+ leftcommonsubstring(word, HENTRY_WORD(hp));

+ // check special pronunciation

+ if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {

+ int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + NGRAM_LOWERING) +

+ leftcommonsubstring(word, f);

+ if (sc2 > sc) sc = sc2;

+ }

+ if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) {

+ char target2[MAXSWUTF8L];

+ strcpy(candidate, HENTRY_WORD(hp));

+ mkallcap(candidate, csconv);

+ phonet(candidate, target2, -1, *ph);

+ scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);

+ }

if (sc > scores[lp]) {

scores[lp] = sc;

- #ifdef HUNSPELL_CHROME_CLIENT

+#ifdef HUNSPELL_CHROME_CLIENT

roots[lp] = &static_hentry[lp];

roots[lp]->alen = hp->alen;

if (hp->astr)

hentry_astr[lp] = *hp->astr;

roots[lp]->astr = &hentry_astr[lp];

- roots[lp]->wlen = hp->wlen;

- strcpy(&hentry_word[lp][0], hp->word);

- roots[lp]->word = &hentry_word[lp][0];

+ roots[lp]->blen = hp->blen;

+ strcpy(&hentry_word[lp][0], &hp->word);

+ roots[lp]->word = hentry_word[lp][0];

roots[lp]->next = NULL;

roots[lp]->next_homonym = NULL;

- #else

+ roots[lp]->var = 0;

+ roots[lp]->clen = 0;

+#else

roots[lp] = hp;

- #endif

+#endif

lval = sc;

for (j=0; j < MAX_ROOTS; j++)

if (scores[j] < lval) {

@@ -938,8 +1156,19 @@

lval = scores[j];

}

- }

+ if (scphon > scoresphon[lpphon]) {

+ scoresphon[lpphon] = scphon;

+ rootsphon[lpphon] = HENTRY_WORD(hp);

+ lval = scphon;

+ for (j=0; j < MAX_ROOTS; j++)

+ if (scoresphon[j] < lval) {

+ lpphon = j;

+ lval = scoresphon[j];

+ }

+ }}

// find minimum threshhold for a passable suggestion

// mangle original word three differnt ways

// and score them to generate a minimum acceptable score

@@ -948,11 +1177,11 @@

if (utf8) {

for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';

u16_u8(mw, MAXSWUTF8L, u8, n);

- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);

+ thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING);

} else {

strcpy(mw, word);

for (int k=sp; k < n; k+=4) *(mw + k) = '*';

- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);

+ thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING);

}

thresh = thresh / 3;

@@ -962,9 +1191,11 @@

// and use length adjusted ngram scores to select

// possible suggestions

char * guess[MAX_GUESS];

+ char * guessorig[MAX_GUESS];

int gscore[MAX_GUESS];

for(i=0;i<MAX_GUESS;i++) {

guess[i] = NULL;

+ guessorig[i] = NULL;

gscore[i] = -100 * i;

}

@@ -974,31 +1205,46 @@

glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));

if (! glst) {

if (nonbmp) utf8 = 1;

- return 0;

+ return ns;

}

for (i = 0; i < MAX_ROOTS; i++) {

if (roots[i]) {

struct hentry * rp = roots[i];

- int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,

- rp->astr, rp->alen, word, nc);

+ int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen,

+ rp->astr, rp->alen, word, nc,

+ ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL));

for (int k = 0; k < nw ; k++) {

- sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);

+ sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + NGRAM_LOWERING) +

+ leftcommonsubstring(word, glst[k].word);

if ((sc > thresh)) {

if (sc > gscore[lp]) {

- if (guess[lp]) free (guess[lp]);

+ if (guess[lp]) {

+ free (guess[lp]);

+ if (guessorig[lp]) {

+ free(guessorig[lp]);

+ guessorig[lp] = NULL;

+ }

gscore[lp] = sc;

guess[lp] = glst[k].word;

+ guessorig[lp] = glst[k].orig;

lval = sc;

for (j=0; j < MAX_GUESS; j++)

if (gscore[j] < lval) {

lp = j;

lval = gscore[j];

}

- } else free (glst[k].word);

- } else free(glst[k].word);

+ } else {

+ free(glst[k].word);

+ if (glst[k].orig) free(glst[k].orig);

+ }

+ } else {

+ free(glst[k].word);

+ if (glst[k].orig) free(glst[k].orig);

+ }

}

@@ -1007,7 +1253,9 @@

// now we are done generating guesses

// sort in order of decreasing score

- bubblesort(&guess[0], &gscore[0], MAX_GUESS);

+ bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);

+ if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);

// weight suggestions with a similarity index, based on

// the longest common subsequent algorithm and resort

@@ -1021,7 +1269,7 @@

if (utf8) {

w_char _w[MAXSWL];

len = u8_u16(_w, MAXSWL, guess[i]);

- mkallsmall_utf(_w, len, pAMgr->get_langnum());

+ mkallsmall_utf(_w, len, langnum);

u16_u8(gl, MAXSWUTF8L, _w, len);

} else {

strcpy(gl, guess[i]);

@@ -1039,10 +1287,10 @@

// heuristic weigthing of ngram scores

gscore[i] +=

- // length of longest common subsequent minus lenght difference

+ // length of longest common subsequence minus length difference

2 * _lcs - abs((int) (n - len)) +

- // weight equal first letter

- equalfirstletter(word, gl) +

+ // weight length of the left common substring

+ leftcommonsubstring(word, gl) +

// weight equal character positions

((_lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) +

// swap character (not neighboring)

@@ -1050,28 +1298,87 @@

}

- bubblesort(&guess[0], &gscore[0], MAX_GUESS);

+ bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);

+// phonetic version

+ if (ph) for (i=0; i < MAX_ROOTS; i++) {

+ if (rootsphon[i]) {

+ // lowering rootsphon[i]

+ char gl[MAXSWUTF8L];

+ int len;

+ if (utf8) {

+ w_char _w[MAXSWL];

+ len = u8_u16(_w, MAXSWL, rootsphon[i]);

+ mkallsmall_utf(_w, len, langnum);

+ u16_u8(gl, MAXSWUTF8L, _w, len);

+ } else {

+ strcpy(gl, rootsphon[i]);

+ mkallsmall(gl, csconv);

+ len = strlen(rootsphon[i]);

+ }

+ // heuristic weighting of ngram scores

+ scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) +

+ // weight length of the left common substring

+ leftcommonsubstring(word, gl);

+ }

+ if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);

// copy over

+ int oldns = ns;

- int ns = 0;

int same = 0;

for (i=0; i < MAX_GUESS; i++) {

if (guess[i]) {

- if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {

+ if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {

int unique = 1;

- // we have excellent suggestion(s)

+ // leave only excellent suggestions, if any exist

if (gscore[i] > 1000) same = 1;

- for (j=0; j < ns; j++)

+ for (j = 0; j < ns; j++) {

// don't suggest previous suggestions or a previous suggestion with prefixes or affixes

- if (strstr(guess[i], wlst[j]) ||

+ if ((!guessorig[i] && strstr(guess[i], wlst[j])) ||

+ (guessorig[i] && strstr(guessorig[i], wlst[j])) ||

// check forbidden words

!checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;

- if (unique) wlst[ns++] = guess[i]; else free(guess[i]);

- } else free(guess[i]);

+ }

+ if (unique) {

+ wlst[ns++] = guess[i];

+ if (guessorig[i]) {

+ free(guess[i]);

+ wlst[ns-1] = guessorig[i];

+ }

+ } else {

+ free(guess[i]);

+ if (guessorig[i]) free(guessorig[i]);

+ }

+ } else {

+ free(guess[i]);

+ if (guessorig[i]) free(guessorig[i]);

+ }

}

+ oldns = ns;

+ if (ph) for (i=0; i < MAX_ROOTS; i++) {

+ if (rootsphon[i]) {

+ if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) {

+ int unique = 1;

+ for (j = 0; j < ns; j++) {

+ // don't suggest previous suggestions or a previous suggestion with prefixes or affixes

+ if (strstr(rootsphon[i], wlst[j]) ||

+ // check forbidden words

+ !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0;

+ }

+ if (unique) {

+ wlst[ns++] = mystrdup(rootsphon[i]);

+ if (!wlst[ns - 1]) return ns - 1;

+ }

if (nonbmp) utf8 = 1;

return ns;

}

@@ -1083,19 +1390,16 @@

// obsolote MySpell-HU modifications:

// return value 2 and 3 marks compounding with hyphen (-)

// `3' marks roots without suffix

-int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit)

+int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit)

{

struct hentry * rv=NULL;

int nosuffix = 0;

// check time limit

if (timer) {

(*timer)--;

if (!(*timer) && timelimit) {

- if (time(NULL) > *timelimit) {

- *timelimit = 0;

- return 0;

- }

+ if ((clock() - *timelimit) > TIMELIMIT) return 0;

*timer = MAXPLUSTIMER;

}

@@ -1103,7 +1407,7 @@

if (pAMgr) {

if (cpdsuggest==1) {

if (pAMgr->get_compound()) {

- rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1);

+ rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1); //EXT

if (rv) return 3; // XXX obsolote categorisation

}

return 0;

@@ -1114,10 +1418,15 @@

if (rv) {

if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)

|| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;

- if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||

- TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;

+ while (rv) {

+ if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||

+ TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||

+ TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {

+ rv = rv->next_homonym;

+ } else break;

+ }

} else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX

if (rv) {

nosuffix=1;

} else {

@@ -1130,8 +1439,9 @@

}

// check forbidden words

- if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)

- || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||

+ if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) ||

+ TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||

+ TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||

TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;

if (rv) { // XXX obsolote

@@ -1149,7 +1459,7 @@

if (pAMgr) {

rv = pAMgr->lookup(word);

- if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||

+ if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||

TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;

if (!(pAMgr->prefix_check(word,len,1)))

rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix

@@ -1160,184 +1470,6 @@

}

#ifdef HUNSPELL_EXPERIMENTAL

-// suggest stems, XXX experimental code

-int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug)

- char buf[MAXSWUTF8L];

- char ** wlst;

- int prevnsug = nsug;

- char w2[MAXWORDUTF8LEN];

- const char * word = w;

- // word reversing wrapper for complex prefixes

- if (complexprefixes) {

- strcpy(w2, w);

- if (utf8) reverseword_utf(w2); else reverseword(w2);

- word = w2;

- }

- if (*slst) {

- wlst = *slst;

- } else {

- wlst = (char **) calloc(maxSug, sizeof(char *));

- if (wlst == NULL) return -1;

- }

- // perhaps there are a fix stem in the dictionary

- if ((nsug < maxSug) && (nsug > -1)) {

- nsug = fixstems(wlst, word, nsug);

- if (nsug == prevnsug) {

- char * s = mystrdup(word);

- char * p = s + strlen(s);

- while ((*p != '-') && (p != s)) p--;

- if (*p == '-') {

- *p = '\0';

- nsug = fixstems(wlst, s, nsug);

- if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) {

- char * t;

- buf[0] = '\0';

- for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number?

- if (*t != '\0') strcpy(buf, "# ");

- strcat(buf, s);

- wlst[nsug] = mystrdup(buf);

- if (wlst[nsug] == NULL) return -1;

- nsug++;

- }

- p++;

- nsug = fixstems(wlst, p, nsug);

- }

- free(s);

- }

- if (nsug < 0) {

- for (int i=0;i<maxSug; i++)

- if (wlst[i] != NULL) free(wlst[i]);

- free(wlst);

- return -1;

- }

- *slst = wlst;

- return nsug;

-// there are fix stems in dictionary

-int SuggestMgr::fixstems(char ** wlst, const char * word, int ns)

- char buf[MAXSWUTF8L];

- char prefix[MAXSWUTF8L] = "";

- int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound

- int cpdindex = 0;

- struct hentry * rv = NULL;

- int wl = strlen(word);

- int cmpdstemnum;

- int cmpdstem[MAXCOMPOUND];

- if (pAMgr) {

- rv = pAMgr->lookup(word);

- if (rv) {

- dicstem = 0;

- } else {

- // try stripping off affixes

- rv = pAMgr->affix_check(word, wl);

- // else try check compound word

- if (!rv && pAMgr->get_compound()) {

- rv = pAMgr->compound_check(word, wl,

- 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1);

- if (rv) {

- dicstem = 2;

- for (int j = 0; j < cmpdstemnum; j++) {

- cpdindex += cmpdstem[j];

- }

- if(! (pAMgr->lookup(word + cpdindex)))

- pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix

- }

- if (pAMgr->get_prefix()) {

- strcpy(prefix, pAMgr->get_prefix());

- }

- // XXX obsolete, will be a general solution for stemming

- if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU)

- }

- if ((rv) && (ns < maxSug)) {

- // check fixstem flag and not_valid_stem flag

- // first word

- if ((ns < maxSug) && (dicstem < 2)) {

- strcpy(buf, prefix);

- if ((dicstem > 0) && pAMgr->get_derived()) {

- // XXX obsolote

- if (strlen(prefix) == 1) {

- strcat(buf, (pAMgr->get_derived()) + 1);

- } else {

- strcat(buf, pAMgr->get_derived());

- }

- } else {

- // special stem in affix description

- const char * wordchars = pAMgr->get_wordchars();

- if (rv->description &&

- (strchr(wordchars, *(rv->description)))) {

- char * desc = (rv->description) + 1;

- while (strchr(wordchars, *desc)) desc++;

- strncat(buf, rv->description, desc - (rv->description));

- } else {

- strcat(buf, rv->word);

- }

- wlst[ns] = mystrdup(buf);

- if (wlst[ns] == NULL) return -1;

- ns++;

- }

- if (dicstem == 2) {

- // compound stem

-// if (rv->astr && (strchr(rv->astr, '0') == NULL)) {

- if (rv->astr) {

- strcpy(buf, word);

- buf[cpdindex] = '\0';

- if (prefix) strcat(buf, prefix);

- if (pAMgr->get_derived()) {

- strcat(buf, pAMgr->get_derived());

- } else {

- // special stem in affix description

- const char * wordchars = pAMgr->get_wordchars();

- if (rv->description &&

- (strchr(wordchars, *(rv->description)))) {

- char * desc = (rv->description) + 1;

- while (strchr(wordchars, *desc)) desc++;

- strncat(buf, rv->description, desc - (rv->description));

- } else {

- strcat(buf, rv->word);

- }

- if (ns < maxSug) {

- wlst[ns] = mystrdup(buf);

- if (wlst[ns] == NULL) return -1;

- ns++;

- }

- return ns;

// suggest possible stems

int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)

{

@@ -1377,6 +1509,7 @@

*slst = wlst;

return nsug;

}

+#endif // END OF HUNSPELL_EXPERIMENTAL CODE

char * SuggestMgr::suggest_morph(const char * w)

@@ -1405,20 +1538,25 @@

while (rv) {

if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||

- TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) ||

+ TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||

TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {

- if (rv->description && ((!rv->astr) ||

- !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen)))

- strcat(result, word);

- if (rv->description) strcat(result, rv->description);

- strcat(result, "\n");

+ if (!HENTRY_FIND(rv, MORPH_STEM)) {

+ mystrcat(result, " ", MAXLNLEN);

+ mystrcat(result, MORPH_STEM, MAXLNLEN);

+ mystrcat(result, word, MAXLNLEN);

+ }

+ if (HENTRY_DATA(rv)) {

+ mystrcat(result, " ", MAXLNLEN);

+ mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);

+ }

+ mystrcat(result, "\n", MAXLNLEN);

}

rv = rv->next_homonym;

}

st = pAMgr->affix_check_morph(word,strlen(word));

if (st) {

- strcat(result, st);

+ mystrcat(result, st, MAXLNLEN);

free(st);

}

@@ -1426,28 +1564,177 @@

pAMgr->compound_check_morph(word, strlen(word),

0, 0, 100, 0,NULL, 0, &r, NULL);

- return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL;

+ return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;

}

+#ifdef HUNSPELL_EXPERIMENTAL

char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)

{

char * p = NULL;

char ** wlst = (char **) calloc(maxSug, sizeof(char *));

+ if (!wlst) return NULL; // allocation failed; test the pointer itself, since **wlst would read through the zero-filled (NULL) first slot

// we will use only the first suggestion

for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";

- int ns = suggest(&wlst, word, maxSug - 1);

+ int ns = suggest(&wlst, word, maxSug - 1, NULL);

if (ns == maxSug) {

p = suggest_morph(wlst[maxSug - 1]);

free(wlst[maxSug - 1]);

}

if (wlst) free(wlst);

- return p;

+ return p;

}

#endif // END OF HUNSPELL_EXPERIMENTAL CODE

+/* affixation */

+char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern)

+ char result[MAXLNLEN];

+ *result = '\0';

+ int sfxcount = get_sfxcount(pattern);

+ if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL;

+ if (HENTRY_DATA(rv)) {

+ char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen,

+ HENTRY_DATA(rv), pattern, 0);

+ if (aff) {

+ mystrcat(result, aff, MAXLNLEN);

+ mystrcat(result, "\n", MAXLNLEN);

+ free(aff);

+ }

+ // check all allomorphs

+ char allomorph[MAXLNLEN];

+ char * p = NULL;

+ if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);

+ while (p) {

+ struct hentry * rv2 = NULL;

+ p += MORPH_TAG_LEN;

+ int plen = fieldlen(p);

+ strncpy(allomorph, p, plen);

+ allomorph[plen] = '\0';

+ rv2 = pAMgr->lookup(allomorph);

+ while (rv2) {

+// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) {

+ if (HENTRY_DATA(rv2)) {

+ char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM);

+ if (st && (strncmp(st + MORPH_TAG_LEN,

+ HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) {

+ char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen,

+ HENTRY_DATA(rv2), pattern, 0);

+ if (aff) {

+ mystrcat(result, aff, MAXLNLEN);

+ mystrcat(result, "\n", MAXLNLEN);

+ free(aff);

+ }

+ rv2 = rv2->next_homonym;

+ }

+ p = strstr(p + plen, MORPH_ALLOMORPH);

+ }

+ return (*result) ? mystrdup(result) : NULL;

+char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) {

+ char result[MAXLNLEN];

+ char result2[MAXLNLEN];

+ char newpattern[MAXLNLEN];

+ *newpattern = '\0';

+ if (n == 0) return 0;

+ *result2 = '\0';

+ struct hentry * rv = NULL;

+ if (!pAMgr) return NULL;

+// search affixed forms with and without derivational suffixes

+ while(1) {

+ for (int k = 0; k < n; k++) {

+ *result = '\0';

+ // add compound word parts (except the last one)

+ char * s = (char *) desc[k];

+ char * part = strstr(s, MORPH_PART);

+ if (part) {

+ char * nextpart = strstr(part + 1, MORPH_PART);

+ while (nextpart) {

+ copy_field(result + strlen(result), part, MORPH_PART);

+ part = nextpart;

+ nextpart = strstr(part + 1, MORPH_PART);

+ }

+ s = part;

+ }

+ char **pl;

+ char tok[MAXLNLEN];

+ strcpy(tok, s);

+ char * alt = strstr(tok, " | ");

+ while (alt) {

+ alt[1] = MSEP_ALT;

+ alt = strstr(alt, " | ");

+ }

+ int pln = line_tok(tok, &pl, MSEP_ALT);

+ for (int i = 0; i < pln; i++) {

+ // remove inflectional and terminal suffixes

+ char * is = strstr(pl[i], MORPH_INFL_SFX);

+ if (is) *is = '\0';

+ char * ts = strstr(pl[i], MORPH_TERM_SFX);

+ while (ts) {

+ *ts = '_';

+ ts = strstr(pl[i], MORPH_TERM_SFX);

+ }

+ char * st = strstr(s, MORPH_STEM);

+ if (st) {

+ copy_field(tok, st, MORPH_STEM);

+ rv = pAMgr->lookup(tok);

+ while (rv) {

+ char newpat[MAXLNLEN];

+ strcpy(newpat, pl[i]);

+ strcat(newpat, pattern);

+ char * sg = suggest_hentry_gen(rv, newpat);

+ if (!sg) sg = suggest_hentry_gen(rv, pattern);

+ if (sg) {

+ char ** gen;

+ int genl = line_tok(sg, &gen, MSEP_REC);

+ free(sg);

+ sg = NULL;

+ for (int j = 0; j < genl; j++) {

+ if (strstr(pl[i], MORPH_SURF_PFX)) {

+ int r2l = strlen(result2);

+ result2[r2l] = MSEP_REC;

+ strcpy(result2 + r2l + 1, result);

+ copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX);

+ mystrcat(result2, gen[j], MAXLNLEN);

+ } else {

+ sprintf(result2 + strlen(result2), "%c%s%s",

+ MSEP_REC, result, gen[j]);

+ }

+ freelist(&gen, genl);

+ }

+ rv = rv->next_homonym;

+ }

+ freelist(&pl, pln);

+ }

+ if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break;

+ strcpy(newpattern, pattern);

+ pattern = newpattern;

+ char * ds = strstr(pattern, MORPH_DERI_SFX);

+ while (ds) {

+ strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN);

+ ds = strstr(pattern, MORPH_DERI_SFX);

+ }

+ return (*result2 ? mystrdup(result2) : NULL);

// generate an n-gram score comparing s1 and s2

-int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)

+int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt)

{

int nscore = 0;

int ns;

@@ -1459,13 +1746,9 @@

w_char su2[MAXSWL];

l1 = u8_u16(su1, MAXSWL, s1);

l2 = u8_u16(su2, MAXSWL, s2);

- if (!l2 || (l1==-1) || (l2==-1)) return 0;

- // decapitalize dictionary word

- if (complexprefixes) {

- mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum());

- } else {

- mkallsmall_utf(su2, 1, pAMgr->get_langnum());

- }

+ if ((l2 <= 0) || (l1 == -1)) return 0;

+ // lowering dictionary word

+ if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum);

for (int j = 1; j <= n; j++) {

ns = 0;

for (int i = 0; i <= (l1-j); i++) {

@@ -1489,13 +1772,9 @@

char t[MAXSWUTF8L];

l1 = strlen(s1);

l2 = strlen(s2);

- if (!l2) return 0;

+ if (l2 == 0) return 0;

strcpy(t, s2);

- if (complexprefixes) {

- *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;

- } else {

- mkallsmall(t, csconv);

- }

+ if (opt & NGRAM_LOWERING) mkallsmall(t, csconv);

for (int j = 1; j <= n; j++) {

ns = 0;

for (int i = 0; i <= (l1-j); i++) {

@@ -1510,13 +1789,14 @@

}

ns = 0;

- if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;

- if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;

+ if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2;

+ if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;

ns = (nscore - ((ns > 0) ? ns : 0));

return ns;

}

-int SuggestMgr::equalfirstletter(char * s1, const char * s2) {

+// length of the left common substring of s1 and (decapitalised) s2

+int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) {

if (utf8) {

w_char su1[MAXSWL];

w_char su2[MAXSWL];

@@ -1526,9 +1806,17 @@

int l2 = u8_u16(su2, MAXSWL, s2);

if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;

} else {

+ int i;

u8_u16(su1, 1, s1);

u8_u16(su2, 1, s2);

- if (*((short *)su1) == *((short *)su2)) return 1;

+ unsigned short idx = (su2->h << 8) + su2->l;

+ if (*((short *)su1) != *((short *)su2) &&

+ (*((unsigned short *)su1) != unicodetolower(idx, langnum))) return 0;

+ int l1 = u8_u16(su1, MAXSWL, s1);

+ int l2 = u8_u16(su2, MAXSWL, s2);

+ for(i = 1; (i < l1) && (i < l2) &&

+ (*((short *)(su1 + i)) == *((short *)(su2 + i))); i++);

+ return i;

}

} else {

if (complexprefixes) {

@@ -1536,7 +1824,13 @@

int l2 = strlen(s2);

if (*(s2+l1-1) == *(s2+l2-1)) return 1;

} else {

- if (*s1 == *s2) return 1;

+ char * olds = s1;

+ // decapitalise dictionary word

+ if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0;

+ do {

+ s1++; s2++;

+ } while ((*s1 == *s2) && (*s1 != '\0'));

+ return s1 - olds;

}

return 0;

@@ -1554,9 +1848,9 @@

int l2 = u8_u16(su2, MAXSWL, s2);

// decapitalize dictionary word

if (complexprefixes) {

- mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum());

+ mkallsmall_utf(su2+l2-1, 1, langnum);

} else {

- mkallsmall_utf(su2, 1, pAMgr->get_langnum());

+ mkallsmall_utf(su2, 1, langnum);

}

for (int i = 0; (i < l1) && (i < l2); i++) {

if (((short *) su1)[i] == ((short *) su2)[i]) {

@@ -1603,7 +1897,7 @@

}

// sort in decreasing order of score

-void SuggestMgr::bubblesort(char** rword, int* rsc, int n )

+void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n )

{

int m = 1;

while (m < n) {

@@ -1616,6 +1910,11 @@

rword[j-1] = rword[j];

rsc[j] = sctmp;

rword[j] = wdtmp;

+ if (rword2) {

+ wdtmp = rword2[j-1];

+ rword2[j-1] = rword2[j];

+ rword2[j] = wdtmp;

+ }

j--;

} else break;

}

@@ -1642,6 +1941,12 @@

}

c = (char *) malloc((m + 1) * (n + 1));

b = (char *) malloc((m + 1) * (n + 1));

+ if (!c || !b) {

+ if (c) free(c);

+ if (b) free(b);

+ *result = NULL;

+ return;

+ }

for (i = 1; i <= m; i++) c[i*(n+1)] = 0;

for (j = 0; j <= n; j++) c[j] = 0;

for (i = 1; i <= m; i++) {

@@ -1673,6 +1978,7 @@

char * result;

int len = 0;

lcs(s, s2, &m, &n, &result);

+ if (!result) return 0;

i = m;

j = n;

while ((i != 0) && (j != 0)) {

@@ -1684,6 +1990,6 @@

i--;

} else j--;

}

- if (result) free(result);

+ free(result);

return len;

}

« no previous file with comments | « chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx ('k') | chrome/third_party/hunspell/src/hunspell/w_char.hxx » ('j') | no next file with comments »