Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2125)

Unified Diff: chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx

Issue 155841: Update Hunspell to the latest stable version to use the latest dictionary for... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: '' Created 11 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx
===================================================================
--- chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx (revision 21721)
+++ chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx (working copy)
@@ -14,13 +14,16 @@
#endif
#include "suggestmgr.hxx"
+#include "htypes.hxx"
+#include "csutil.hxx"
#ifndef MOZILLA_CLIENT
-#ifndef W32
+#ifndef WIN32
using namespace std;
#endif
#endif
+const w_char W_VLINE = { '\0', '|' };
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
AffixMgr * aptr)
@@ -30,36 +33,54 @@
// try when building candidate suggestions
pAMgr = aptr;
+ ckeyl = 0;
+ ckey = NULL;
+ ckey_utf = NULL;
+
ctryl = 0;
ctry = NULL;
ctry_utf = NULL;
+ utf8 = 0;
+ langnum = 0;
+ complexprefixes = 0;
+
maxSug = maxn;
nosplitsugs = 0;
maxngramsugs = MAXNGRAMSUGS;
- utf8 = 0;
- complexprefixes = 0;
-
if (pAMgr) {
char * enc = pAMgr->get_encoding();
csconv = get_current_cs(enc);
free(enc);
+ langnum = pAMgr->get_langnum();
+ ckey = pAMgr->get_key_string();
nosplitsugs = pAMgr->get_nosplitsugs();
if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs();
utf8 = pAMgr->get_utf8();
complexprefixes = pAMgr->get_complexprefixes();
}
- if (tryme) {
+ if (ckey) {
if (utf8) {
w_char t[MAXSWL];
+ ckeyl = u8_u16(t, MAXSWL, ckey);
+ ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char));
+ if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char));
+ } else {
+ ckeyl = strlen(ckey);
+ }
+ }
+
+ if (tryme) {
+ ctry = mystrdup(tryme);
+ if (ctry) ctryl = strlen(ctry);
+ if (ctry && utf8) {
+ w_char t[MAXSWL];
ctryl = u8_u16(t, MAXSWL, tryme);
ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char));
- memcpy(ctry_utf, t, ctryl * sizeof(w_char));
- } else {
- ctry = mystrdup(tryme);
- ctryl = strlen(ctry);
+ if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char));
+ else ctryl = 0;
}
}
}
@@ -68,6 +89,11 @@
SuggestMgr::~SuggestMgr()
{
pAMgr = NULL;
+ if (ckey) free(ckey);
+ ckey = NULL;
+ if (ckey_utf) free(ckey_utf);
+ ckey_utf = NULL;
+ ckeyl = 0;
if (ctry) free(ctry);
ctry = NULL;
if (ctry_utf) free(ctry_utf);
@@ -77,7 +103,7 @@
}
int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,
- int * timer, time_t * timelimit) {
+ int * timer, clock_t * timelimit) {
int cwrd = 1;
if (ns == maxSug) return maxSug;
for (int k=0; k < ns; k++) {
@@ -96,13 +122,15 @@
// generate suggestions for a mispelled word
// pass in address of array of char * pointers
+// onlycompoundsug: set to 1 when only compound suggestions were found (probably bad suggestions; needed for ngram suggestions, too)
-int SuggestMgr::suggest(char*** slst, const char * w, int nsug)
+int SuggestMgr::suggest(char*** slst, const char * w, int nsug,
+ int * onlycompoundsug)
{
- int nocompoundtwowords = 0;
- char ** wlst;
- w_char word_utf[MAXSWL];
- int wl = 0;
+ int nocompoundtwowords = 0;
+ char ** wlst;
+ w_char word_utf[MAXSWL];
+ int wl = 0;
char w2[MAXWORDUTF8LEN];
const char * word = w;
@@ -141,8 +169,8 @@
nsug = replchars(wlst, word, nsug, cpdsuggest);
// perhaps we made chose the wrong char from a related set
- if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) {
- nsug = mapchars(wlst, word, nsug);
+ if ((nsug < maxSug) && (nsug > -1)) {
+ nsug = mapchars(wlst, word, nsug, cpdsuggest);
}
// did we swap the order of chars by mistake
@@ -157,6 +185,22 @@
longswapchar(wlst, word, nsug, cpdsuggest);
}
+ // did we just hit the wrong key in place of a good char (case and keyboard)
+ if ((nsug < maxSug) && (nsug > -1)) {
+ nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
+ badcharkey(wlst, word, nsug, cpdsuggest);
+ }
+
+ // only suggest compound words when no other suggestion
+ if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords=1;
+
+ // did we add a char that should not be there
+ if ((nsug < maxSug) && (nsug > -1)) {
+ nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
+ extrachar(wlst, word, nsug, cpdsuggest);
+ }
+
+
// did we forgot a char
if ((nsug < maxSug) && (nsug > -1)) {
nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
@@ -169,12 +213,6 @@
movechar(wlst, word, nsug, cpdsuggest);
}
- // did we add a char that should not be there
- if ((nsug < maxSug) && (nsug > -1)) {
- nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
- extrachar(wlst, word, nsug, cpdsuggest);
- }
-
// did we just hit the wrong key in place of a good char
if ((nsug < maxSug) && (nsug > -1)) {
nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
@@ -187,10 +225,6 @@
doubletwochars(wlst, word, nsug, cpdsuggest);
}
-
- // only suggest compound words when no other suggestion
- if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
-
// perhaps we forgot to hit space and two words ran together
if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) {
nsug = twowords(wlst, word, nsug, cpdsuggest);
@@ -205,6 +239,8 @@
free(wlst);
wlst = NULL;
}
+
+ if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1;
*slst = wlst;
return nsug;
@@ -242,8 +278,8 @@
nsug = replchars(wlst, word, nsug, cpdsuggest);
// perhaps we made chose the wrong char from a related set
- if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0))
- nsug = mapchars(wlst, word, nsug);
+ if ((nsug < maxSug) && (nsug > -1))
+ nsug = mapchars(wlst, word, nsug, cpdsuggest);
if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
@@ -273,7 +309,7 @@
char candidate[MAXSWUTF8L];
w_char candidate_utf[MAXSWL];
memcpy(candidate_utf, word, wl * sizeof(w_char));
- mkallcap_utf(candidate_utf, wl, pAMgr->get_langnum());
+ mkallcap_utf(candidate_utf, wl, langnum);
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
}
@@ -288,9 +324,9 @@
}
// suggestions for when chose the wrong char out of a related set
-int SuggestMgr::mapchars(char** wlst, const char * word, int ns)
+int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
{
- time_t timelimit;
+ clock_t timelimit;
int timer;
int wl = strlen(word);
@@ -300,18 +336,19 @@
struct mapentry* maptable = pAMgr->get_maptable();
if (maptable==NULL) return ns;
- timelimit = time(NULL);
+ timelimit = clock();
timer = MINTIMER;
if (utf8) {
w_char w[MAXSWL];
int len = u8_u16(w, MAXSWL, word);
- ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
- } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
+ ns = map_related_utf(w, len, 0, cpdsuggest, wlst, ns, maptable, nummap, &timer, &timelimit);
+ } else ns = map_related(word, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit);
return ns;
}
-int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns,
- const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
+int SuggestMgr::map_related(const char * word, int i, char** wlst,
+ int cpdsuggest, int ns,
+ const mapentry* maptable, int nummap, int * timer, clock_t * timelimit)
{
char c = *(word + i);
if (c == 0) {
@@ -319,8 +356,7 @@
int wl = strlen(word);
for (int m=0; m < ns; m++)
if (strcmp(word,wlst[m]) == 0) cwrd = 0;
- if ((cwrd) && (checkword(word, wl, 0, timer, timelimit) ||
- checkword(word, wl, 1, timer, timelimit))) {
+ if ((cwrd) && checkword(word, wl, cpdsuggest, timer, timelimit)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(word);
if (wlst[ns] == NULL) return -1;
@@ -334,23 +370,27 @@
if (strchr(maptable[j].set,c) != 0) {
in_map = 1;
char * newword = mystrdup(word);
+ if (!newword) return -1;
for (int k = 0; k < maptable[j].len; k++) {
*(newword + i) = *(maptable[j].set + k);
- ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit);
- if (!(*timelimit)) return ns;
+ ns = map_related(newword, (i+1), wlst, cpdsuggest,
+ ns, maptable, nummap, timer, timelimit);
+ if (!(*timer)) return ns;
}
free(newword);
}
}
if (!in_map) {
i++;
- ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit);
+ ns = map_related(word, i, wlst, cpdsuggest,
+ ns, maptable, nummap, timer, timelimit);
}
return ns;
}
-int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns,
- const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
+int SuggestMgr::map_related_utf(w_char * word, int len, int i, int cpdsuggest,
+ char** wlst, int ns, const mapentry* maptable, int nummap,
+ int * timer, clock_t * timelimit)
{
if (i == len) {
int cwrd = 1;
@@ -360,8 +400,7 @@
wl = strlen(s);
for (int m=0; m < ns; m++)
if (strcmp(s,wlst[m]) == 0) cwrd = 0;
- if ((cwrd) && (checkword(s, wl, 0, timer, timelimit) ||
- checkword(s, wl, 1, timer, timelimit))) {
+ if ((cwrd) && checkword(s, wl, cpdsuggest, timer, timelimit)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(s);
if (wlst[ns] == NULL) return -1;
@@ -377,15 +416,17 @@
in_map = 1;
for (int k = 0; k < maptable[j].len; k++) {
*(word + i) = *(maptable[j].set_utf16 + k);
- ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit);
- if (!(*timelimit)) return ns;
+ ns = map_related_utf(word, len, i + 1, cpdsuggest,
+ wlst, ns, maptable, nummap, timer, timelimit);
+ if (!(*timer)) return ns;
}
*((unsigned short *) word + i) = c;
}
}
if (!in_map) {
i++;
- ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit);
+ ns = map_related_utf(word, len, i, cpdsuggest,
+ wlst, ns, maptable, nummap, timer, timelimit);
}
return ns;
}
@@ -416,6 +457,23 @@
strcpy(candidate+(r-word)+lenr, r+lenp);
ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);
if (ns == -1) return -1;
+ // check REP suggestions with space
+ char * sp = strchr(candidate, ' ');
+ if (sp) {
+ *sp = '\0';
+ if (checkword(candidate, strlen(candidate), 0, NULL, NULL)) {
+ int oldns = ns;
+ *sp = ' ';
+ ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ if (oldns < ns) {
+ free(wlst[ns - 1]);
+ wlst[ns - 1] = mystrdup(candidate);
+ if (!wlst[ns - 1]) return -1;
+ }
+ }
+ *sp = ' ';
+ }
r++; // search for the next letter
}
}
@@ -454,7 +512,7 @@
int state=0;
if (wl < 5 || ! pAMgr) return ns;
for (int i=2; i < wl; i++) {
- if ((word[i].l==word[i-2].l) && (word[i].h==word[i-2].h)) {
+ if (w_char_eq(word[i], word[i-2])) {
state++;
if (state==3) {
memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));
@@ -471,25 +529,108 @@
return ns;
}
+// Wrong-key error (8-bit version): for each char of word, try its uppercase form and its neighbors in the keyboard string ckey; returns the updated ns, or -1 as soon as testsug() returns -1
+int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest)
+{
+ char tmpc;
+ char candidate[MAXSWUTF8L];
+ int wl = strlen(word);
+ strcpy(candidate, word);
+ // replace each char in turn with its uppercase form and with its neighbor
+ // chars in the keyboard string, to see if that makes a good word
+
+ for (int i=0; i < wl; i++) {
+ tmpc = candidate[i];
+ // try the uppercase form from the csconv table; skipped when it equals the original char
+ candidate[i] = csconv[((unsigned char)tmpc)].cupper;
+ if (tmpc != candidate[i]) {
+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ candidate[i] = tmpc;
+ }
+ // try the chars adjacent to every occurrence of tmpc in ckey; a '|' neighbor is never substituted
+ if (!ckey) continue;
+ char * loc = strchr(ckey, tmpc);
+ while (loc) {
+ if ((loc > ckey) && (*(loc - 1) != '|')) {
+ candidate[i] = *(loc - 1);
+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ }
+ if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {
+ candidate[i] = *(loc + 1);
+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ }
+ loc = strchr(loc + 1, tmpc);
+ }
+ candidate[i] = tmpc;
+ }
+ return ns;
+}
+
+// error is wrong char in place of correct one (case and keyboard related version)
+int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
+{
+ w_char tmpc;
+ w_char candidate_utf[MAXSWL];
+ char candidate[MAXSWUTF8L];
+ memcpy(candidate_utf, word, wl * sizeof(w_char));
+ // swap out each char one by one and try all the tryme
+ // chars in its place to see if that makes a good word
+ for (int i=0; i < wl; i++) {
+ tmpc = candidate_utf[i];
+ // check with uppercase letters
+ mkallcap_utf(candidate_utf + i, 1, langnum);
+ if (!w_char_eq(tmpc, candidate_utf[i])) {
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ candidate_utf[i] = tmpc;
+ }
+ // check neighbor characters in keyboard string
+ if (!ckey) continue;
+ w_char * loc = ckey_utf;
+ while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++;
+ while (loc < (ckey_utf + ckeyl)) {
+ if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {
+ candidate_utf[i] = *(loc - 1);
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ }
+ if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {
+ candidate_utf[i] = *(loc + 1);
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ }
+ do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc));
+ }
+ candidate_utf[i] = tmpc;
+ }
+ return ns;
+}
+
// error is wrong char in place of correct one
int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
char tmpc;
char candidate[MAXSWUTF8L];
- time_t timelimit = time(NULL);
+ clock_t timelimit = clock();
int timer = MINTIMER;
int wl = strlen(word);
strcpy(candidate, word);
// swap out each char one by one and try all the tryme
// chars in its place to see if that makes a good word
- for (int i=0; i < wl; i++) {
- tmpc = candidate[i];
- for (int j=0; j < ctryl; j++) {
+ for (int j=0; j < ctryl; j++) {
+ for (int i=wl-1; i >= 0; i--) {
+ tmpc = candidate[i];
if (ctry[j] == tmpc) continue;
candidate[i] = ctry[j];
ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);
if (ns == -1) return -1;
- if (!timelimit) return ns;
+ if (!timer) return ns;
candidate[i] = tmpc;
}
}
@@ -502,20 +643,20 @@
w_char tmpc;
w_char candidate_utf[MAXSWL];
char candidate[MAXSWUTF8L];
- time_t timelimit = time(NULL);
+ clock_t timelimit = clock();
int timer = MINTIMER;
memcpy(candidate_utf, word, wl * sizeof(w_char));
// swap out each char one by one and try all the tryme
// chars in its place to see if that makes a good word
- for (int i=0; i < wl; i++) {
- tmpc = candidate_utf[i];
- for (int j=0; j < ctryl; j++) {
- if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue;
+ for (int j=0; j < ctryl; j++) {
+ for (int i=wl-1; i >= 0; i--) {
+ tmpc = candidate_utf[i];
+ if (w_char_eq(tmpc, ctry_utf[j])) continue;
candidate_utf[i] = ctry_utf[j];
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
if (ns == -1) return -1;
- if (!timelimit) return ns;
+ if (!timer) return ns;
candidate_utf[i] = tmpc;
}
}
@@ -525,18 +666,20 @@
// error is word has an extra letter it does not need
int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
{
- char candidate[MAXSWUTF8L];
+ char candidate[MAXSWUTF8L];
w_char candidate_utf[MAXSWL];
- const w_char * p;
- w_char * r;
+ w_char * p;
+ w_char tmpc = W_VLINE; // not used value, only for VCC warning message
if (wl < 2) return ns;
// try omitting one char of word at a time
- memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char));
- for (p = word, r = candidate_utf; p < word + wl; ) {
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
+ memcpy(candidate_utf, word, wl * sizeof(w_char));
+ for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {
+ w_char tmpc2 = *p;
+ if (p < candidate_utf + wl - 1) *p = tmpc;
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
if (ns == -1) return -1;
- *r++ = *p++;
+ tmpc = tmpc2;
}
return ns;
}
@@ -544,48 +687,42 @@
// error is word has an extra letter it does not need
int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
{
+ char tmpc = '\0';
char candidate[MAXSWUTF8L];
- const char * p;
- char * r;
+ char * p;
int wl = strlen(word);
if (wl < 2) return ns;
// try omitting one char of word at a time
- strcpy (candidate, word + 1);
- for (p = word, r = candidate; *p != 0; ) {
+ strcpy (candidate, word);
+ for (p = candidate + wl - 1; p >=candidate; p--) {
+ char tmpc2 = *p;
+ *p = tmpc;
ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL);
if (ns == -1) return -1;
- *r++ = *p++;
+ tmpc = tmpc2;
}
return ns;
}
-
// error is missing a letter it needs
int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
char candidate[MAXSWUTF8L];
- const char * p;
- char * q;
- time_t timelimit = time(NULL);
+ char * p;
+ clock_t timelimit = clock();
int timer = MINTIMER;
int wl = strlen(word);
- // try inserting a tryme character before every letter
- strcpy(candidate + 1, word);
- for (p = word, q = candidate; *p != 0; ) {
- for (int i = 0; i < ctryl; i++) {
- *q = ctry[i];
+ // try inserting a tryme character before every letter (and the null terminator)
+ for (int i = 0; i < ctryl; i++) {
+ strcpy(candidate, word);
+ for (p = candidate + wl; p >= candidate; p--) {
+ *(p+1) = *p;
+ *p = ctry[i];
ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit);
if (ns == -1) return -1;
- if (!timelimit) return ns;
+ if (!timer) return ns;
}
- *q++ = *p++;
}
- // now try adding one to end */
- for (int i = 0; i < ctryl; i++) {
- *q = ctry[i];
- ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, NULL, NULL);
- if (ns == -1) return -1;
- }
return ns;
}
@@ -594,32 +731,21 @@
{
w_char candidate_utf[MAXSWL];
char candidate[MAXSWUTF8L];
- const w_char * p;
- w_char * q;
- int cwrd;
- time_t timelimit = time(NULL);
+ w_char * p;
+ clock_t timelimit = clock();
int timer = MINTIMER;
- // try inserting a tryme character before every letter
- memcpy (candidate_utf + 1, word, wl * sizeof(w_char));
- for (p = word, q = candidate_utf; p < (word + wl); ) {
- for (int i = 0; i < ctryl; i++) {
- *q = ctry_utf[i];
- cwrd = 1;
+ // try inserting a tryme character at the end of the word and before every letter
+ for (int i = 0; i < ctryl; i++) {
+ memcpy (candidate_utf, word, wl * sizeof(w_char));
+ for (p = candidate_utf + wl; p >= candidate_utf; p--) {
+ *(p + 1) = *p;
+ *p = ctry_utf[i];
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
if (ns == -1) return -1;
- if (!timelimit) return ns;
- }
- *q++ = *p++;
+ if (!timer) return ns;
+ }
}
- // now try adding one to end */
- for (int i = 0; i < ctryl; i++) {
- *q = ctry_utf[i];
- cwrd = 1;
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
- if (ns == -1) return -1;
- }
return ns;
}
@@ -636,19 +762,19 @@
int wl=strlen(word);
if (wl < 3) return ns;
- if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl);
+ if (langnum == LANG_hu) forbidden = check_forbidden(word, wl);
strcpy(candidate + 1, word);
-
// split the string into two pieces after every char
// if both pieces are good words make them a suggestion
for (p = candidate + 1; p[1] != '\0'; p++) {
p[-1] = *p;
// go to end of the UTF-8 character
while (utf8 && ((p[1] & 0xc0) == 0x80)) {
+ *p = p[1];
p++;
- p[-1] = *p;
}
+ if (utf8 && p[1] == '\0') break; // last UTF-8 character
*p = '\0';
c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL);
if (c1) {
@@ -657,7 +783,7 @@
*p = ' ';
// spec. Hungarian code (need a better compound word support)
- if ((pAMgr->get_langnum() == LANG_hu) && !forbidden &&
+ if ((langnum == LANG_hu) && !forbidden &&
// if 3 repeating letter, use - instead of space
(((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
// or multiple compounding, with more, than 6 syllables
@@ -673,6 +799,23 @@
ns++;
}
} else return ns;
+ // add two word suggestion with dash, if TRY string contains
+ // "a" or "-"
+ // NOTE: cwrd is not modified for REP two-word suggestions.
+ if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
+ mystrlen(p + 1) > 1 &&
+ mystrlen(candidate) - mystrlen(p) > 1) {
+ *p = '-';
+ for (int k=0; k < ns; k++)
+ if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
+ if (ns < maxSug) {
+ if (cwrd) {
+ wlst[ns] = mystrdup(candidate);
+ if (wlst[ns] == NULL) return -1;
+ ns++;
+ }
+ } else return ns;
+ }
}
}
}
@@ -698,6 +841,24 @@
p[1] = *p;
*p = tmpc;
}
+ // try double swaps for short words
+ // ahev -> have, owudl -> would
+ if (wl == 4 || wl == 5) {
+ candidate[0] = word[1];
+ candidate[1] = word[0];
+ candidate[2] = word[2];
+ candidate[wl - 2] = word[wl - 1];
+ candidate[wl - 1] = word[wl - 2];
+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ if (wl == 5) {
+ candidate[0] = word[0];
+ candidate[1] = word[2];
+ candidate[2] = word[1];
+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ }
+ }
return ns;
}
@@ -708,6 +869,7 @@
char candidate[MAXSWUTF8L];
w_char * p;
w_char tmpc;
+ int len = 0;
// try swapping adjacent chars one by one
memcpy (candidate_utf, word, wl * sizeof(w_char));
for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
@@ -715,11 +877,32 @@
*p = p[1];
p[1] = tmpc;
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ if (len == 0) len = strlen(candidate);
+ ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
if (ns == -1) return -1;
p[1] = *p;
*p = tmpc;
}
+ // try double swaps for short words
+ // ahev -> have, owudl -> would, suodn -> sound
+ if (wl == 4 || wl == 5) {
+ candidate_utf[0] = word[1];
+ candidate_utf[1] = word[0];
+ candidate_utf[2] = word[2];
+ candidate_utf[wl - 2] = word[wl - 1];
+ candidate_utf[wl - 1] = word[wl - 2];
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
+ ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ if (wl == 5) {
+ candidate_utf[0] = word[0];
+ candidate_utf[1] = word[2];
+ candidate_utf[2] = word[1];
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
+ ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ }
+ }
return ns;
}
@@ -794,7 +977,7 @@
*(q-1) = *q;
*q = tmpc;
if ((q-p) < 2) continue; // omit swap char
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
if (ns == -1) return -1;
}
strcpy(candidate, word);
@@ -805,7 +988,7 @@
*(q+1) = *q;
*q = tmpc;
if ((p-q) < 2) continue; // omit swap char
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
if (ns == -1) return -1;
}
strcpy(candidate, word);
@@ -830,7 +1013,7 @@
*q = tmpc;
if ((q-p) < 2) continue; // omit swap char
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
if (ns == -1) return -1;
}
memcpy (candidate_utf, word, wl * sizeof(w_char));
@@ -842,7 +1025,7 @@
*q = tmpc;
if ((p-q) < 2) continue; // omit swap char
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
if (ns == -1) return -1;
}
memcpy (candidate_utf, word, wl * sizeof(w_char));
@@ -851,28 +1034,33 @@
}
// generate a set of suggestions for very poorly spelled words
-int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr)
+int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md)
{
int i, j;
int lval;
- int sc;
- int lp;
+ int sc, scphon;
+ int lp, lpphon;
int nonbmp = 0;
- if (!pHMgr) return 0;
-
// exhaustively search through all root words
// keeping track of the MAX_ROOTS most similar root words
struct hentry * roots[MAX_ROOTS];
+ char * rootsphon[MAX_ROOTS];
int scores[MAX_ROOTS];
+ int scoresphon[MAX_ROOTS];
for (i = 0; i < MAX_ROOTS; i++) {
roots[i] = NULL;
scores[i] = -100 * i;
+ rootsphon[i] = NULL;
+ scoresphon[i] = -100 * i;
}
lp = MAX_ROOTS - 1;
-
+ lpphon = MAX_ROOTS - 1;
+ scphon = scoresphon[MAX_ROOTS-1];
+
char w2[MAXWORDUTF8LEN];
+ char f[MAXSWUTF8L];
char * word = w;
// word reversing wrapper for complex prefixes
@@ -896,8 +1084,8 @@
struct hentry* hp = NULL;
int col = -1;
-
- #ifdef HUNSPELL_CHROME_CLIENT
+
+#ifdef HUNSPELL_CHROME_CLIENT
// A static array of hentries required for walking the hash table.
struct hentry static_hentry[MAX_ROOTS];
@@ -906,31 +1094,61 @@
static const int kMaxWordLen = 128;
char hentry_word[MAX_ROOTS][kMaxWordLen];
unsigned short hentry_astr[MAX_ROOTS];
- #endif
+#endif
- while ((hp = pHMgr->walk_hashtable(col, hp))) {
+ phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
+ char target[MAXSWUTF8L];
+ char candidate[MAXSWUTF8L];
+ if (ph) {
+ strcpy(candidate, word);
+ mkallcap(candidate, csconv);
+ phonet(candidate, target, n, *ph);
+ }
+
+ for (i = 0; i < md; i++) {
+ while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) {
if ((hp->astr) && (pAMgr) &&
(TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) ||
+ TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) ||
TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue;
- sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
+
+ sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + NGRAM_LOWERING) +
+ leftcommonsubstring(word, HENTRY_WORD(hp));
+
+ // check special pronunciation
+ if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
+ int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + NGRAM_LOWERING) +
+ leftcommonsubstring(word, f);
+ if (sc2 > sc) sc = sc2;
+ }
+
+ if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) {
+ char target2[MAXSWUTF8L];
+ strcpy(candidate, HENTRY_WORD(hp));
+ mkallcap(candidate, csconv);
+ phonet(candidate, target2, -1, *ph);
+ scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
+ }
+
if (sc > scores[lp]) {
scores[lp] = sc;
- #ifdef HUNSPELL_CHROME_CLIENT
+#ifdef HUNSPELL_CHROME_CLIENT
roots[lp] = &static_hentry[lp];
roots[lp]->alen = hp->alen;
if (hp->astr)
hentry_astr[lp] = *hp->astr;
roots[lp]->astr = &hentry_astr[lp];
- roots[lp]->wlen = hp->wlen;
- strcpy(&hentry_word[lp][0], hp->word);
- roots[lp]->word = &hentry_word[lp][0];
+ roots[lp]->blen = hp->blen;
+ strcpy(&hentry_word[lp][0], &hp->word);
+ roots[lp]->word = hentry_word[lp][0];
roots[lp]->next = NULL;
roots[lp]->next_homonym = NULL;
- #else
+ roots[lp]->var = 0;
+ roots[lp]->clen = 0;
+#else
roots[lp] = hp;
- #endif
-
+#endif
lval = sc;
for (j=0; j < MAX_ROOTS; j++)
if (scores[j] < lval) {
@@ -938,8 +1156,19 @@
lval = scores[j];
}
}
- }
+ if (scphon > scoresphon[lpphon]) {
+ scoresphon[lpphon] = scphon;
+ rootsphon[lpphon] = HENTRY_WORD(hp);
+ lval = scphon;
+ for (j=0; j < MAX_ROOTS; j++)
+ if (scoresphon[j] < lval) {
+ lpphon = j;
+ lval = scoresphon[j];
+ }
+ }
+ }}
+
// find minimum threshhold for a passable suggestion
// mangle original word three differnt ways
// and score them to generate a minimum acceptable score
@@ -948,11 +1177,11 @@
if (utf8) {
for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
u16_u8(mw, MAXSWUTF8L, u8, n);
- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
+ thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING);
} else {
strcpy(mw, word);
for (int k=sp; k < n; k+=4) *(mw + k) = '*';
- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
+ thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING);
}
}
thresh = thresh / 3;
@@ -962,9 +1191,11 @@
// and use length adjusted ngram scores to select
// possible suggestions
char * guess[MAX_GUESS];
+ char * guessorig[MAX_GUESS];
int gscore[MAX_GUESS];
for(i=0;i<MAX_GUESS;i++) {
guess[i] = NULL;
+ guessorig[i] = NULL;
gscore[i] = -100 * i;
}
@@ -974,31 +1205,46 @@
glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
if (! glst) {
if (nonbmp) utf8 = 1;
- return 0;
+ return ns;
}
for (i = 0; i < MAX_ROOTS; i++) {
-
if (roots[i]) {
struct hentry * rp = roots[i];
- int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
- rp->astr, rp->alen, word, nc);
+ int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen,
+ rp->astr, rp->alen, word, nc,
+ ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL));
for (int k = 0; k < nw ; k++) {
- sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
+ sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + NGRAM_LOWERING) +
+ leftcommonsubstring(word, glst[k].word);
+
if ((sc > thresh)) {
if (sc > gscore[lp]) {
- if (guess[lp]) free (guess[lp]);
+ if (guess[lp]) {
+ free (guess[lp]);
+ if (guessorig[lp]) {
+ free(guessorig[lp]);
+ guessorig[lp] = NULL;
+ }
+ }
gscore[lp] = sc;
guess[lp] = glst[k].word;
+ guessorig[lp] = glst[k].orig;
lval = sc;
for (j=0; j < MAX_GUESS; j++)
if (gscore[j] < lval) {
lp = j;
lval = gscore[j];
}
- } else free (glst[k].word);
- } else free(glst[k].word);
+ } else {
+ free(glst[k].word);
+ if (glst[k].orig) free(glst[k].orig);
+ }
+ } else {
+ free(glst[k].word);
+ if (glst[k].orig) free(glst[k].orig);
+ }
}
}
}
@@ -1007,7 +1253,9 @@
// now we are done generating guesses
// sort in order of decreasing score
- bubblesort(&guess[0], &gscore[0], MAX_GUESS);
+
+ bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
+ if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
// weight suggestions with a similarity index, based on
// the longest common subsequent algorithm and resort
@@ -1021,7 +1269,7 @@
if (utf8) {
w_char _w[MAXSWL];
len = u8_u16(_w, MAXSWL, guess[i]);
- mkallsmall_utf(_w, len, pAMgr->get_langnum());
+ mkallsmall_utf(_w, len, langnum);
u16_u8(gl, MAXSWUTF8L, _w, len);
} else {
strcpy(gl, guess[i]);
@@ -1039,10 +1287,10 @@
// heuristic weigthing of ngram scores
gscore[i] +=
- // length of longest common subsequent minus lenght difference
+ // length of longest common subsequent minus length difference
2 * _lcs - abs((int) (n - len)) +
- // weight equal first letter
- equalfirstletter(word, gl) +
+ // weight length of the left common substring
+ leftcommonsubstring(word, gl) +
// weight equal character positions
((_lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) +
// swap character (not neighboring)
@@ -1050,28 +1298,87 @@
}
}
- bubblesort(&guess[0], &gscore[0], MAX_GUESS);
+ bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
+// phonetic version
+ if (ph) for (i=0; i < MAX_ROOTS; i++) {
+ if (rootsphon[i]) {
+ // lowercase rootsphon[i]
+ char gl[MAXSWUTF8L];
+ int len;
+ if (utf8) {
+ w_char _w[MAXSWL];
+ len = u8_u16(_w, MAXSWL, rootsphon[i]);
+ mkallsmall_utf(_w, len, langnum);
+ u16_u8(gl, MAXSWUTF8L, _w, len);
+ } else {
+ strcpy(gl, rootsphon[i]);
+ mkallsmall(gl, csconv);
+ len = strlen(rootsphon[i]);
+ }
+
+ // heuristic weighting of ngram scores
+ scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) +
+ // weight length of the left common substring
+ leftcommonsubstring(word, gl);
+ }
+ }
+
+ if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
+
// copy over
+ int oldns = ns;
- int ns = 0;
int same = 0;
for (i=0; i < MAX_GUESS; i++) {
if (guess[i]) {
- if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
+ if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
int unique = 1;
- // we have excellent suggestion(s)
+ // leave only excellent suggestions, if exists
if (gscore[i] > 1000) same = 1;
- for (j=0; j < ns; j++)
+ for (j = 0; j < ns; j++) {
// don't suggest previous suggestions or a previous suggestion with prefixes or affixes
- if (strstr(guess[i], wlst[j]) ||
+ if ((!guessorig[i] && strstr(guess[i], wlst[j])) ||
+ (guessorig[i] && strstr(guessorig[i], wlst[j])) ||
// check forbidden words
!checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;
- if (unique) wlst[ns++] = guess[i]; else free(guess[i]);
- } else free(guess[i]);
+ }
+ if (unique) {
+ wlst[ns++] = guess[i];
+ if (guessorig[i]) {
+ free(guess[i]);
+ wlst[ns-1] = guessorig[i];
+ }
+ } else {
+ free(guess[i]);
+ if (guessorig[i]) free(guessorig[i]);
+ }
+ } else {
+ free(guess[i]);
+ if (guessorig[i]) free(guessorig[i]);
+ }
}
}
+ oldns = ns;
+ if (ph) for (i=0; i < MAX_ROOTS; i++) {
+ if (rootsphon[i]) {
+ if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) {
+ int unique = 1;
+ for (j = 0; j < ns; j++) {
+ // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
+ if (strstr(rootsphon[i], wlst[j]) ||
+ // check forbidden words
+ !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0;
+ }
+ if (unique) {
+ wlst[ns++] = mystrdup(rootsphon[i]);
+ if (!wlst[ns - 1]) return ns - 1;
+ }
+ }
+ }
+ }
+
if (nonbmp) utf8 = 1;
return ns;
}
@@ -1083,19 +1390,16 @@
// obsolote MySpell-HU modifications:
// return value 2 and 3 marks compounding with hyphen (-)
// `3' marks roots without suffix
-int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit)
+int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit)
{
struct hentry * rv=NULL;
int nosuffix = 0;
-
+
// check time limit
if (timer) {
(*timer)--;
if (!(*timer) && timelimit) {
- if (time(NULL) > *timelimit) {
- *timelimit = 0;
- return 0;
- }
+ if ((clock() - *timelimit) > TIMELIMIT) return 0;
*timer = MAXPLUSTIMER;
}
}
@@ -1103,7 +1407,7 @@
if (pAMgr) {
if (cpdsuggest==1) {
if (pAMgr->get_compound()) {
- rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1);
+ rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1); //EXT
if (rv) return 3; // XXX obsolote categorisation
}
return 0;
@@ -1114,10 +1418,15 @@
if (rv) {
if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
|| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
- if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
- TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
+ while (rv) {
+ if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
+ TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
+ TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
+ rv = rv->next_homonym;
+ } else break;
+ }
} else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
-
+
if (rv) {
nosuffix=1;
} else {
@@ -1130,8 +1439,9 @@
}
// check forbidden words
- if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
- || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
+ if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) ||
+ TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
+ TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
if (rv) { // XXX obsolote
@@ -1149,7 +1459,7 @@
if (pAMgr) {
rv = pAMgr->lookup(word);
- if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
+ if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
if (!(pAMgr->prefix_check(word,len,1)))
rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
@@ -1160,184 +1470,6 @@
}
#ifdef HUNSPELL_EXPERIMENTAL
-// suggest stems, XXX experimental code
-int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug)
-{
- char buf[MAXSWUTF8L];
- char ** wlst;
- int prevnsug = nsug;
-
- char w2[MAXWORDUTF8LEN];
- const char * word = w;
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- strcpy(w2, w);
- if (utf8) reverseword_utf(w2); else reverseword(w2);
- word = w2;
- }
-
- if (*slst) {
- wlst = *slst;
- } else {
- wlst = (char **) calloc(maxSug, sizeof(char *));
- if (wlst == NULL) return -1;
- }
- // perhaps there are a fix stem in the dictionary
- if ((nsug < maxSug) && (nsug > -1)) {
-
- nsug = fixstems(wlst, word, nsug);
- if (nsug == prevnsug) {
- char * s = mystrdup(word);
- char * p = s + strlen(s);
- while ((*p != '-') && (p != s)) p--;
- if (*p == '-') {
- *p = '\0';
- nsug = fixstems(wlst, s, nsug);
- if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) {
- char * t;
- buf[0] = '\0';
- for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number?
- if (*t != '\0') strcpy(buf, "# ");
- strcat(buf, s);
- wlst[nsug] = mystrdup(buf);
- if (wlst[nsug] == NULL) return -1;
- nsug++;
- }
- p++;
- nsug = fixstems(wlst, p, nsug);
- }
-
- free(s);
- }
- }
-
- if (nsug < 0) {
- for (int i=0;i<maxSug; i++)
- if (wlst[i] != NULL) free(wlst[i]);
- free(wlst);
- return -1;
- }
-
- *slst = wlst;
- return nsug;
-}
-
-
-// there are fix stems in dictionary
-int SuggestMgr::fixstems(char ** wlst, const char * word, int ns)
-{
- char buf[MAXSWUTF8L];
- char prefix[MAXSWUTF8L] = "";
-
- int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound
- int cpdindex = 0;
- struct hentry * rv = NULL;
-
- int wl = strlen(word);
- int cmpdstemnum;
- int cmpdstem[MAXCOMPOUND];
-
- if (pAMgr) {
- rv = pAMgr->lookup(word);
- if (rv) {
- dicstem = 0;
- } else {
- // try stripping off affixes
- rv = pAMgr->affix_check(word, wl);
-
- // else try check compound word
- if (!rv && pAMgr->get_compound()) {
- rv = pAMgr->compound_check(word, wl,
- 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1);
-
- if (rv) {
- dicstem = 2;
- for (int j = 0; j < cmpdstemnum; j++) {
- cpdindex += cmpdstem[j];
- }
- if(! (pAMgr->lookup(word + cpdindex)))
- pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix
- }
- }
-
-
- if (pAMgr->get_prefix()) {
- strcpy(prefix, pAMgr->get_prefix());
- }
-
- // XXX obsolete, will be a general solution for stemming
- if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU)
- }
-
- }
-
-
-
- if ((rv) && (ns < maxSug)) {
-
- // check fixstem flag and not_valid_stem flag
- // first word
- if ((ns < maxSug) && (dicstem < 2)) {
- strcpy(buf, prefix);
- if ((dicstem > 0) && pAMgr->get_derived()) {
- // XXX obsolote
- if (strlen(prefix) == 1) {
- strcat(buf, (pAMgr->get_derived()) + 1);
- } else {
- strcat(buf, pAMgr->get_derived());
- }
- } else {
- // special stem in affix description
- const char * wordchars = pAMgr->get_wordchars();
- if (rv->description &&
- (strchr(wordchars, *(rv->description)))) {
- char * desc = (rv->description) + 1;
- while (strchr(wordchars, *desc)) desc++;
- strncat(buf, rv->description, desc - (rv->description));
- } else {
- strcat(buf, rv->word);
- }
- }
- wlst[ns] = mystrdup(buf);
- if (wlst[ns] == NULL) return -1;
- ns++;
- }
-
- if (dicstem == 2) {
-
- // compound stem
-
-// if (rv->astr && (strchr(rv->astr, '0') == NULL)) {
- if (rv->astr) {
- strcpy(buf, word);
- buf[cpdindex] = '\0';
- if (prefix) strcat(buf, prefix);
- if (pAMgr->get_derived()) {
- strcat(buf, pAMgr->get_derived());
- } else {
- // special stem in affix description
- const char * wordchars = pAMgr->get_wordchars();
- if (rv->description &&
- (strchr(wordchars, *(rv->description)))) {
- char * desc = (rv->description) + 1;
- while (strchr(wordchars, *desc)) desc++;
- strncat(buf, rv->description, desc - (rv->description));
- } else {
- strcat(buf, rv->word);
- }
- }
- if (ns < maxSug) {
- wlst[ns] = mystrdup(buf);
- if (wlst[ns] == NULL) return -1;
- ns++;
- }
- }
- }
- }
- return ns;
-}
-
// suggest possible stems
int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
{
@@ -1377,6 +1509,7 @@
*slst = wlst;
return nsug;
}
+#endif // END OF HUNSPELL_EXPERIMENTAL CODE
char * SuggestMgr::suggest_morph(const char * w)
@@ -1405,20 +1538,25 @@
while (rv) {
if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
- TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) ||
+ TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
- if (rv->description && ((!rv->astr) ||
- !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen)))
- strcat(result, word);
- if (rv->description) strcat(result, rv->description);
- strcat(result, "\n");
+ if (!HENTRY_FIND(rv, MORPH_STEM)) {
+ mystrcat(result, " ", MAXLNLEN);
+ mystrcat(result, MORPH_STEM, MAXLNLEN);
+ mystrcat(result, word, MAXLNLEN);
+ }
+ if (HENTRY_DATA(rv)) {
+ mystrcat(result, " ", MAXLNLEN);
+ mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
+ }
+ mystrcat(result, "\n", MAXLNLEN);
}
rv = rv->next_homonym;
}
st = pAMgr->affix_check_morph(word,strlen(word));
if (st) {
- strcat(result, st);
+ mystrcat(result, st, MAXLNLEN);
free(st);
}
@@ -1426,28 +1564,177 @@
pAMgr->compound_check_morph(word, strlen(word),
0, 0, 100, 0,NULL, 0, &r, NULL);
- return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL;
+ return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;
}
+#ifdef HUNSPELL_EXPERIMENTAL
char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
{
char * p = NULL;
char ** wlst = (char **) calloc(maxSug, sizeof(char *));
+ if (!**wlst) return NULL;
// we will use only the first suggestion
for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";
- int ns = suggest(&wlst, word, maxSug - 1);
+ int ns = suggest(&wlst, word, maxSug - 1, NULL);
if (ns == maxSug) {
p = suggest_morph(wlst[maxSug - 1]);
free(wlst[maxSug - 1]);
}
if (wlst) free(wlst);
- return p;
+ return p;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
+/* affixation
+ *
+ * Generates word forms for the dictionary entry `rv` according to the
+ * morphological description `pattern` by calling pAMgr->morphgen().
+ * Forms are generated for `rv` itself and for every entry found by looking
+ * up the MORPH_ALLOMORPH fields of its morphological data, provided the
+ * MORPH_STEM field of that entry compares equal to the word of `rv` over
+ * the length of the stem field.
+ *
+ * Returns a mystrdup() copy of the collected forms, each terminated by
+ * "\n", or NULL when nothing was generated or when get_sfxcount() of the
+ * entry's data exceeds get_sfxcount(pattern).
+ */
+char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern)
+{
+ char result[MAXLNLEN];
+ *result = '\0';
+ int sfxcount = get_sfxcount(pattern);
+ if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; // NOTE(review): called before the HENTRY_DATA(rv) NULL check below - confirm get_sfxcount() accepts NULL
+
+ if (HENTRY_DATA(rv)) {
+ char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen,
+ HENTRY_DATA(rv), pattern, 0);
+ if (aff) {
+ mystrcat(result, aff, MAXLNLEN);
+ mystrcat(result, "\n", MAXLNLEN);
+ free(aff); // morphgen() result is owned by this function
+ }
+ }
+
+ // check all allomorphs: every MORPH_ALLOMORPH field of rv names another entry to try
+ char allomorph[MAXLNLEN];
+ char * p = NULL;
+ if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);
+ while (p) {
+ struct hentry * rv2 = NULL;
+ p += MORPH_TAG_LEN; // step over the tag; presumably p now points at the field value
+ int plen = fieldlen(p);
+ strncpy(allomorph, p, plen); // NOTE(review): plen is not checked against MAXLNLEN - confirm field length is bounded
+ allomorph[plen] = '\0';
+ rv2 = pAMgr->lookup(allomorph);
+ while (rv2) { // walk all homonyms of the allomorph
+// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) {
+ if (HENTRY_DATA(rv2)) {
+ char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM);
+ if (st && (strncmp(st + MORPH_TAG_LEN,
+ HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) { // stem field of the allomorph must match the word of rv
+ char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen,
+ HENTRY_DATA(rv2), pattern, 0);
+ if (aff) {
+ mystrcat(result, aff, MAXLNLEN);
+ mystrcat(result, "\n", MAXLNLEN);
+ free(aff);
+ }
+ }
+ }
+ rv2 = rv2->next_homonym;
+ }
+ p = strstr(p + plen, MORPH_ALLOMORPH); // continue after this field with the next allomorph, if any
+ }
+
+ return (*result) ? mystrdup(result) : NULL;
+}
+
+char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) { /*
+ * Generates word forms from the `n` morphological descriptions in `desc`
+ * so that they match the morphological description `pattern`.
+ *
+ * For each description, the compound parts before the last MORPH_PART are
+ * copied into a prefix, the rest is split at " | " into alternatives, and
+ * the MORPH_STEM field is looked up in the dictionary. For every homonym
+ * of the stem, suggest_hentry_gen() is tried first with the alternative
+ * prepended to `pattern`, then with `pattern` alone. Each generated form is
+ * appended to the output, preceded by MSEP_REC and the compound prefix.
+ *
+ * If nothing was generated and `pattern` contains MORPH_DERI_SFX, the
+ * search is repeated with a copy of `pattern` in which every
+ * MORPH_DERI_SFX tag is overwritten by MORPH_TERM_SFX.
+ *
+ * Returns a mystrdup() copy of the collected forms, or NULL when there is
+ * no result or no affix manager; returns 0 when n == 0.
+ */
+ char result[MAXLNLEN]; // compound prefix of the current description
+ char result2[MAXLNLEN]; // all generated forms, each preceded by MSEP_REC
+ char newpattern[MAXLNLEN]; // writable copy of pattern for the second pass
+ *newpattern = '\0';
+ if (n == 0) return 0;
+ *result2 = '\0';
+ struct hentry * rv = NULL;
+ if (!pAMgr) return NULL;
+
+// search affixed forms with and without derivational suffixes
+ while(1) {
+
+ for (int k = 0; k < n; k++) {
+ *result = '\0';
+ // add compound word parts (except the last one)
+ char * s = (char *) desc[k];
+ char * part = strstr(s, MORPH_PART);
+ if (part) {
+ char * nextpart = strstr(part + 1, MORPH_PART);
+ while (nextpart) {
+ copy_field(result + strlen(result), part, MORPH_PART);
+ part = nextpart;
+ nextpart = strstr(part + 1, MORPH_PART);
+ }
+ s = part; // continue with the last compound part only
+ }
+
+ char **pl;
+ char tok[MAXLNLEN];
+ strcpy(tok, s); // NOTE(review): unbounded copy - confirm descriptions are shorter than MAXLNLEN
+ char * alt = strstr(tok, " | ");
+ while (alt) {
+ alt[1] = MSEP_ALT; // turn " | " into a single-character separator for line_tok()
+ alt = strstr(alt, " | ");
+ }
+ int pln = line_tok(tok, &pl, MSEP_ALT);
+ for (int i = 0; i < pln; i++) {
+ // remove inflectional and terminal suffixes
+ char * is = strstr(pl[i], MORPH_INFL_SFX);
+ if (is) *is = '\0'; // cut the alternative at the first inflectional suffix tag
+ char * ts = strstr(pl[i], MORPH_TERM_SFX);
+ while (ts) {
+ *ts = '_'; // overwrite the first tag character so the tag no longer matches
+ ts = strstr(pl[i], MORPH_TERM_SFX);
+ }
+ char * st = strstr(s, MORPH_STEM);
+ if (st) {
+ copy_field(tok, st, MORPH_STEM); // tok is reused as the lookup key
+ rv = pAMgr->lookup(tok);
+ while (rv) { // try every homonym of the stem
+ char newpat[MAXLNLEN];
+ strcpy(newpat, pl[i]); // NOTE(review): strcpy/strcat below are unbounded - confirm the combined length fits MAXLNLEN
+ strcat(newpat, pattern);
+ char * sg = suggest_hentry_gen(rv, newpat);
+ if (!sg) sg = suggest_hentry_gen(rv, pattern); // fall back to the bare pattern
+ if (sg) {
+ char ** gen;
+ int genl = line_tok(sg, &gen, MSEP_REC);
+ free(sg);
+ sg = NULL;
+ for (int j = 0; j < genl; j++) {
+ if (strstr(pl[i], MORPH_SURF_PFX)) {
+ int r2l = strlen(result2);
+ result2[r2l] = MSEP_REC; // replaces the terminator; the strcpy below terminates the string again
+ strcpy(result2 + r2l + 1, result);
+ copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX);
+ mystrcat(result2, gen[j], MAXLNLEN);
+ } else {
+ sprintf(result2 + strlen(result2), "%c%s%s",
+ MSEP_REC, result, gen[j]); // NOTE(review): no bound on result2 here - confirm total output fits MAXLNLEN
+ }
+ }
+ freelist(&gen, genl);
+ }
+ rv = rv->next_homonym;
+ }
+ }
+ }
+ freelist(&pl, pln);
+ }
+
+ if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break; // done: something was generated, or there is no derivational tag to relax
+ strcpy(newpattern, pattern);
+ pattern = newpattern; // work on the local copy from here on
+ char * ds = strstr(pattern, MORPH_DERI_SFX);
+ while (ds) {
+ strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN); // overwrite the tag in place, no terminator is written
+ ds = strstr(pattern, MORPH_DERI_SFX);
+ }
+ }
+ return (*result2 ? mystrdup(result2) : NULL);
+}
+
+
// generate an n-gram score comparing s1 and s2
-int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
+int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt)
{
int nscore = 0;
int ns;
@@ -1459,13 +1746,9 @@
w_char su2[MAXSWL];
l1 = u8_u16(su1, MAXSWL, s1);
l2 = u8_u16(su2, MAXSWL, s2);
- if (!l2 || (l1==-1) || (l2==-1)) return 0;
- // decapitalize dictionary word
- if (complexprefixes) {
- mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum());
- } else {
- mkallsmall_utf(su2, 1, pAMgr->get_langnum());
- }
+ if ((l2 <= 0) || (l1 == -1)) return 0;
+ // lowering dictionary word
+ if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum);
for (int j = 1; j <= n; j++) {
ns = 0;
for (int i = 0; i <= (l1-j); i++) {
@@ -1489,13 +1772,9 @@
char t[MAXSWUTF8L];
l1 = strlen(s1);
l2 = strlen(s2);
- if (!l2) return 0;
+ if (l2 == 0) return 0;
strcpy(t, s2);
- if (complexprefixes) {
- *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;
- } else {
- mkallsmall(t, csconv);
- }
+ if (opt & NGRAM_LOWERING) mkallsmall(t, csconv);
for (int j = 1; j <= n; j++) {
ns = 0;
for (int i = 0; i <= (l1-j); i++) {
@@ -1510,13 +1789,14 @@
}
ns = 0;
- if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
- if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
+ if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
+ if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
ns = (nscore - ((ns > 0) ? ns : 0));
return ns;
}
-int SuggestMgr::equalfirstletter(char * s1, const char * s2) {
+// length of the left common substring of s1 and (decapitalised) s2
+int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) {
if (utf8) {
w_char su1[MAXSWL];
w_char su2[MAXSWL];
@@ -1526,9 +1806,17 @@
int l2 = u8_u16(su2, MAXSWL, s2);
if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;
} else {
+ int i;
u8_u16(su1, 1, s1);
u8_u16(su2, 1, s2);
- if (*((short *)su1) == *((short *)su2)) return 1;
+ unsigned short idx = (su2->h << 8) + su2->l;
+ if (*((short *)su1) != *((short *)su2) &&
+ (*((unsigned short *)su1) != unicodetolower(idx, langnum))) return 0;
+ int l1 = u8_u16(su1, MAXSWL, s1);
+ int l2 = u8_u16(su2, MAXSWL, s2);
+ for(i = 1; (i < l1) && (i < l2) &&
+ (*((short *)(su1 + i)) == *((short *)(su2 + i))); i++);
+ return i;
}
} else {
if (complexprefixes) {
@@ -1536,7 +1824,13 @@
int l2 = strlen(s2);
if (*(s2+l1-1) == *(s2+l2-1)) return 1;
} else {
- if (*s1 == *s2) return 1;
+ char * olds = s1;
+ // decapitalise dictionary word
+ if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0;
+ do {
+ s1++; s2++;
+ } while ((*s1 == *s2) && (*s1 != '\0'));
+ return s1 - olds;
}
}
return 0;
@@ -1554,9 +1848,9 @@
int l2 = u8_u16(su2, MAXSWL, s2);
// decapitalize dictionary word
if (complexprefixes) {
- mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum());
+ mkallsmall_utf(su2+l2-1, 1, langnum);
} else {
- mkallsmall_utf(su2, 1, pAMgr->get_langnum());
+ mkallsmall_utf(su2, 1, langnum);
}
for (int i = 0; (i < l1) && (i < l2); i++) {
if (((short *) su1)[i] == ((short *) su2)[i]) {
@@ -1603,7 +1897,7 @@
}
// sort in decreasing order of score
-void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
+void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n )
{
int m = 1;
while (m < n) {
@@ -1616,6 +1910,11 @@
rword[j-1] = rword[j];
rsc[j] = sctmp;
rword[j] = wdtmp;
+ if (rword2) {
+ wdtmp = rword2[j-1];
+ rword2[j-1] = rword2[j];
+ rword2[j] = wdtmp;
+ }
j--;
} else break;
}
@@ -1642,6 +1941,12 @@
}
c = (char *) malloc((m + 1) * (n + 1));
b = (char *) malloc((m + 1) * (n + 1));
+ if (!c || !b) {
+ if (c) free(c);
+ if (b) free(b);
+ *result = NULL;
+ return;
+ }
for (i = 1; i <= m; i++) c[i*(n+1)] = 0;
for (j = 0; j <= n; j++) c[j] = 0;
for (i = 1; i <= m; i++) {
@@ -1673,6 +1978,7 @@
char * result;
int len = 0;
lcs(s, s2, &m, &n, &result);
+ if (!result) return 0;
i = m;
j = n;
while ((i != 0) && (j != 0)) {
@@ -1684,6 +1990,6 @@
i--;
} else j--;
}
- if (result) free(result);
+ free(result);
return len;
}
« no previous file with comments | « chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx ('k') | chrome/third_party/hunspell/src/hunspell/w_char.hxx » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698