| Index: third_party/hunspell/src/hunspell/suggestmgr.cxx
|
| diff --git a/third_party/hunspell/src/hunspell/suggestmgr.cxx b/third_party/hunspell/src/hunspell/suggestmgr.cxx
|
| index 4391bb9a2072da67ad209852e1b7ad307853ada7..4e122da3d6b0aa42eb836f0e207de3bed0be9869 100644
|
| --- a/third_party/hunspell/src/hunspell/suggestmgr.cxx
|
| +++ b/third_party/hunspell/src/hunspell/suggestmgr.cxx
|
| @@ -1174,12 +1174,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
|
| std::string target;
|
| std::string candidate;
|
| + std::vector<w_char> w_candidate;
|
| if (ph) {
|
| if (utf8) {
|
| - std::vector<w_char> _w;
|
| - u8_u16(_w, word);
|
| - mkallcap_utf(_w, langnum);
|
| - u16_u8(candidate, _w);
|
| + u8_u16(w_candidate, word);
|
| + mkallcap_utf(w_candidate, langnum);
|
| + u16_u8(candidate, w_candidate);
|
| } else {
|
| candidate.assign(word);
|
| if (!nonbmp)
|
| @@ -1193,6 +1193,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;
|
| FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;
|
|
|
| + std::vector<w_char> w_word, w_target;
|
| + if (utf8) {
|
| + u8_u16(w_word, word);
|
| + u8_u16(w_target, target);
|
| + }
|
| +
|
| + std::vector<w_char> w_entry;
|
| + std::string f;
|
| + std::vector<w_char> w_f;
|
| + std::vector<w_char> w_target2;
|
| +
|
| for (size_t i = 0; i < rHMgr.size(); ++i) {
|
| while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
|
| if ((hp->astr) && (pAMgr) &&
|
| @@ -1203,15 +1214,30 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| TESTAFF(hp->astr, onlyincompound, hp->alen)))
|
| continue;
|
|
|
| - sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
|
| - leftcommonsubstring(word, HENTRY_WORD(hp));
|
| + if (utf8) {
|
| + w_entry.clear();
|
| + u8_u16(w_entry, HENTRY_WORD(hp));
|
| + sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +
|
| + leftcommonsubstring(w_word, w_entry);
|
| + } else {
|
| + sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
|
| + leftcommonsubstring(word, HENTRY_WORD(hp));
|
| + }
|
|
|
| // check special pronounciation
|
| - std::string f;
|
| + f.clear();
|
| if ((hp->var & H_OPT_PHON) &&
|
| copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
|
| - int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
|
| - +leftcommonsubstring(word, f.c_str());
|
| + int sc2;
|
| + if (utf8) {
|
| + w_f.clear();
|
| + u8_u16(w_f, f.c_str());
|
| + sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
|
| + leftcommonsubstring(w_word, w_f);
|
| + } else {
|
| + sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
|
| + leftcommonsubstring(word, f.c_str());
|
| + }
|
| if (sc2 > sc)
|
| sc = sc2;
|
| }
|
| @@ -1219,16 +1245,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| int scphon = -20000;
|
| if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {
|
| if (utf8) {
|
| - std::vector<w_char> _w;
|
| - u8_u16(_w, HENTRY_WORD(hp));
|
| - mkallcap_utf(_w, langnum);
|
| - u16_u8(candidate, _w);
|
| + w_candidate.clear();
|
| + u8_u16(w_candidate, HENTRY_WORD(hp));
|
| + mkallcap_utf(w_candidate, langnum);
|
| + u16_u8(candidate, w_candidate);
|
| } else {
|
| - candidate.assign(HENTRY_WORD(hp));
|
| + candidate = HENTRY_WORD(hp);
|
| mkallcap(candidate, csconv);
|
| }
|
| std::string target2 = phonet(candidate, *ph);
|
| - scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
|
| + w_target2.clear();
|
| + if (utf8) {
|
| + u8_u16(w_target2, target2.c_str());
|
| + scphon = 2 * ngram(3, w_target, w_target2,
|
| + NGRAM_LONGER_WORSE);
|
| + } else {
|
| + scphon = 2 * ngram(3, target, target2,
|
| + NGRAM_LONGER_WORSE);
|
| + }
|
| }
|
|
|
| if (sc > scores[lp]) {
|
| @@ -1262,22 +1296,21 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| // find minimum threshold for a passable suggestion
|
| // mangle original word three differnt ways
|
| // and score them to generate a minimum acceptable score
|
| + std::vector<w_char> w_mw;
|
| int thresh = 0;
|
| for (int sp = 1; sp < 4; sp++) {
|
| if (utf8) {
|
| - u8_u16(u8, word);
|
| + w_mw = w_word;
|
| for (int k = sp; k < n; k += 4) {
|
| - u8[k].l = '*';
|
| - u8[k].h = 0;
|
| + w_mw[k].l = '*';
|
| + w_mw[k].h = 0;
|
| }
|
| - std::string mw;
|
| - u16_u8(mw, u8);
|
| - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
|
| + thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);
|
| } else {
|
| - std::string mw(word);
|
| + std::string mw = word;
|
| for (int k = sp; k < n; k += 4)
|
| mw[k] = '*';
|
| - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
|
| + thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
|
| }
|
| }
|
| thresh = thresh / 3;
|
| @@ -1305,11 +1338,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| return;
|
| }
|
|
|
| + std::vector<w_char> w_glst_word;
|
| for (int i = 0; i < MAX_ROOTS; i++) {
|
| if (roots[i]) {
|
| struct hentry* rp = roots[i];
|
|
|
| - std::string f;
|
| + f.clear();
|
| const char *field = NULL;
|
| if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))
|
| field = f.c_str();
|
| @@ -1318,8 +1352,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| nc, field);
|
|
|
| for (int k = 0; k < nw; k++) {
|
| - sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +
|
| - leftcommonsubstring(word, glst[k].word);
|
| + if (utf8) {
|
| + w_glst_word.clear();
|
| + u8_u16(w_glst_word, glst[k].word);
|
| + sc = ngram(n, w_word, w_glst_word,
|
| + NGRAM_ANY_MISMATCH + low) +
|
| + leftcommonsubstring(w_word, w_glst_word);
|
| + } else {
|
| + sc = ngram(n, word, glst[k].word,
|
| + NGRAM_ANY_MISMATCH + low) +
|
| + leftcommonsubstring(word, glst[k].word);
|
| + }
|
|
|
| if (sc > thresh) {
|
| if (sc > gscore[lp]) {
|
| @@ -1373,16 +1416,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| fact = (10.0 - maxd) / 5.0;
|
| }
|
|
|
| + std::vector<w_char> w_gl;
|
| for (int i = 0; i < MAX_GUESS; i++) {
|
| if (guess[i]) {
|
| // lowering guess[i]
|
| std::string gl;
|
| int len;
|
| if (utf8) {
|
| - std::vector<w_char> _w;
|
| - len = u8_u16(_w, guess[i]);
|
| - mkallsmall_utf(_w, langnum);
|
| - u16_u8(gl, _w);
|
| + w_gl.clear();
|
| + len = u8_u16(w_gl, guess[i]);
|
| + mkallsmall_utf(w_gl, langnum);
|
| + u16_u8(gl, w_gl);
|
| } else {
|
| gl.assign(guess[i]);
|
| if (!nonbmp)
|
| @@ -1399,14 +1443,29 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| }
|
| // using 2-gram instead of 3, and other weightening
|
|
|
| - re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
|
| - ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
|
| + w_gl.clear();
|
| + if (utf8) {
|
| + u8_u16(w_gl, gl);
|
| + re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
|
| + ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
|
| + } else {
|
| + re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
|
| + ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
|
| + }
|
|
|
| + int ngram_score, leftcommon_score;
|
| + if (utf8) {
|
| + ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);
|
| + leftcommon_score = leftcommonsubstring(w_word, w_gl);
|
| + } else {
|
| + ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);
|
| + leftcommon_score = leftcommonsubstring(word, gl.c_str());
|
| + }
|
| gscore[i] =
|
| // length of longest common subsequent minus length difference
|
| 2 * _lcs - abs((int)(n - len)) +
|
| // weight length of the left common substring
|
| - leftcommonsubstring(word, gl.c_str()) +
|
| + leftcommon_score +
|
| // weight equal character positions
|
| (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap)
|
| ? 1
|
| @@ -1414,7 +1473,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| // swap character (not neighboring)
|
| ((is_swap) ? 10 : 0) +
|
| // ngram
|
| - ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +
|
| + ngram_score +
|
| // weighted ngrams
|
| re +
|
| // different limit for dictionaries with PHONE rules
|
| @@ -1432,11 +1491,11 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| // lowering rootphon[i]
|
| std::string gl;
|
| int len;
|
| + w_gl.clear();
|
| if (utf8) {
|
| - std::vector<w_char> _w;
|
| - len = u8_u16(_w, rootsphon[i]);
|
| - mkallsmall_utf(_w, langnum);
|
| - u16_u8(gl, _w);
|
| + len = u8_u16(w_gl, rootsphon[i]);
|
| + mkallsmall_utf(w_gl, langnum);
|
| + u16_u8(gl, w_gl);
|
| } else {
|
| gl.assign(rootsphon[i]);
|
| if (!nonbmp)
|
| @@ -1444,10 +1503,15 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
| len = strlen(rootsphon[i]);
|
| }
|
|
|
| + // weight length of the left common substring
|
| + int leftcommon_score;
|
| + if (utf8)
|
| + leftcommon_score = leftcommonsubstring(w_word, w_gl);
|
| + else
|
| + leftcommon_score = leftcommonsubstring(word, gl.c_str());
|
| // heuristic weigthing of ngram scores
|
| scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) +
|
| - // weight length of the left common substring
|
| - leftcommonsubstring(word, gl.c_str());
|
| + leftcommon_score;
|
| }
|
| }
|
|
|
| @@ -1852,10 +1916,10 @@ std::string SuggestMgr::suggest_gen(const std::vector<std::string>& desc, const
|
| return result2;
|
| }
|
|
|
| -// generate an n-gram score comparing s1 and s2
|
| +// generate an n-gram score comparing su1 and su2, UTF-16 version
|
| int SuggestMgr::ngram(int n,
|
| - const std::string& s1,
|
| - const std::string& s2,
|
| + const std::vector<w_char>& su1,
|
| + const std::vector<w_char>& su2,
|
| int opt) {
|
| int nscore = 0;
|
| int ns;
|
| @@ -1863,68 +1927,44 @@ int SuggestMgr::ngram(int n,
|
| int l2;
|
| int test = 0;
|
|
|
| - if (utf8) {
|
| - std::vector<w_char> su1;
|
| - std::vector<w_char> su2;
|
| - l1 = u8_u16(su1, s1);
|
| - l2 = u8_u16(su2, s2);
|
| - if ((l2 <= 0) || (l1 == -1))
|
| - return 0;
|
| - // lowering dictionary word
|
| - if (opt & NGRAM_LOWERING)
|
| - mkallsmall_utf(su2, langnum);
|
| - for (int j = 1; j <= n; j++) {
|
| - ns = 0;
|
| - for (int i = 0; i <= (l1 - j); i++) {
|
| - int k = 0;
|
| - for (int l = 0; l <= (l2 - j); l++) {
|
| - for (k = 0; k < j; k++) {
|
| - w_char& c1 = su1[i + k];
|
| - w_char& c2 = su2[l + k];
|
| - if ((c1.l != c2.l) || (c1.h != c2.h))
|
| - break;
|
| - }
|
| - if (k == j) {
|
| - ns++;
|
| + l1 = su1.size();
|
| + l2 = su2.size();
|
| + if (l2 == 0)
|
| + return 0;
|
| + // lowering dictionary word
|
| + const std::vector<w_char>* p_su2 = &su2;
|
| + std::vector<w_char> su2_copy;
|
| + if (opt & NGRAM_LOWERING) {
|
| + su2_copy = su2;
|
| + mkallsmall_utf(su2_copy, langnum);
|
| + p_su2 = &su2_copy;
|
| + }
|
| + for (int j = 1; j <= n; j++) {
|
| + ns = 0;
|
| + for (int i = 0; i <= (l1 - j); i++) {
|
| + int k = 0;
|
| + for (int l = 0; l <= (l2 - j); l++) {
|
| + for (k = 0; k < j; k++) {
|
| + const w_char& c1 = su1[i + k];
|
| + const w_char& c2 = (*p_su2)[l + k];
|
| + if ((c1.l != c2.l) || (c1.h != c2.h))
|
| break;
|
| - }
|
| }
|
| - if (k != j && opt & NGRAM_WEIGHTED) {
|
| - ns--;
|
| - test++;
|
| - if (i == 0 || i == l1 - j)
|
| - ns--; // side weight
|
| - }
|
| - }
|
| - nscore = nscore + ns;
|
| - if (ns < 2 && !(opt & NGRAM_WEIGHTED))
|
| - break;
|
| - }
|
| - } else {
|
| - l2 = s2.size();
|
| - if (l2 == 0)
|
| - return 0;
|
| - l1 = s1.size();
|
| - std::string t(s2);
|
| - if (opt & NGRAM_LOWERING)
|
| - mkallsmall(t, csconv);
|
| - for (int j = 1; j <= n; j++) {
|
| - ns = 0;
|
| - for (int i = 0; i <= (l1 - j); i++) {
|
| - //t is haystack, s1[i..i+j) is needle
|
| - if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
|
| + if (k == j) {
|
| ns++;
|
| - } else if (opt & NGRAM_WEIGHTED) {
|
| - ns--;
|
| - test++;
|
| - if (i == 0 || i == l1 - j)
|
| - ns--; // side weight
|
| + break;
|
| }
|
| }
|
| - nscore = nscore + ns;
|
| - if (ns < 2 && !(opt & NGRAM_WEIGHTED))
|
| - break;
|
| + if (k != j && opt & NGRAM_WEIGHTED) {
|
| + ns--;
|
| + test++;
|
| + if (i == 0 || i == l1 - j)
|
| + ns--; // side weight
|
| + }
|
| }
|
| + nscore = nscore + ns;
|
| + if (ns < 2 && !(opt & NGRAM_WEIGHTED))
|
| + break;
|
| }
|
|
|
| ns = 0;
|
| @@ -1936,46 +1976,95 @@ int SuggestMgr::ngram(int n,
|
| return ns;
|
| }
|
|
|
| -// length of the left common substring of s1 and (decapitalised) s2
|
| -int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) {
|
| - if (utf8) {
|
| - std::vector<w_char> su1;
|
| - std::vector<w_char> su2;
|
| - int l1 = u8_u16(su1, s1);
|
| - int l2 = u8_u16(su2, s2);
|
| - // decapitalize dictionary word
|
| - if (complexprefixes) {
|
| - if (su1[l1 - 1] == su2[l2 - 1])
|
| - return 1;
|
| - } else {
|
| - unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
|
| - unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
|
| - if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
|
| - return 0;
|
| - int i;
|
| - for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
|
| - (su1[i].h == su2[i].h);
|
| - i++)
|
| - ;
|
| - return i;
|
| +// generate an n-gram score comparing s1 and s2, 8-bit (non-UTF-8) version
|
| +int SuggestMgr::ngram(int n,
|
| + const std::string& s1,
|
| + const std::string& s2,
|
| + int opt) {
|
| + int nscore = 0;
|
| + int ns;
|
| + int l1;
|
| + int l2;
|
| + int test = 0;
|
| +
|
| + l2 = s2.size();
|
| + if (l2 == 0)
|
| + return 0;
|
| + l1 = s1.size();
|
| + std::string t(s2);
|
| + if (opt & NGRAM_LOWERING)
|
| + mkallsmall(t, csconv);
|
| + for (int j = 1; j <= n; j++) {
|
| + ns = 0;
|
| + for (int i = 0; i <= (l1 - j); i++) {
|
| + //t is haystack, s1[i..i+j) is needle
|
| + if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
|
| + ns++;
|
| + } else if (opt & NGRAM_WEIGHTED) {
|
| + ns--;
|
| + test++;
|
| + if (i == 0 || i == l1 - j)
|
| + ns--; // side weight
|
| + }
|
| }
|
| + nscore = nscore + ns;
|
| + if (ns < 2 && !(opt & NGRAM_WEIGHTED))
|
| + break;
|
| + }
|
| +
|
| + ns = 0;
|
| + if (opt & NGRAM_LONGER_WORSE)
|
| + ns = (l2 - l1) - 2;
|
| + if (opt & NGRAM_ANY_MISMATCH)
|
| + ns = abs(l2 - l1) - 2;
|
| + ns = (nscore - ((ns > 0) ? ns : 0));
|
| + return ns;
|
| +}
|
| +
|
| +// length of the left common substring of su1 and (decapitalised) su2, UTF-16 version
|
| +int SuggestMgr::leftcommonsubstring(
|
| + const std::vector<w_char>& su1,
|
| + const std::vector<w_char>& su2) {
|
| + int l1 = su1.size();
|
| + int l2 = su2.size();
|
| + // decapitalize dictionary word
|
| + if (complexprefixes) {
|
| + if (su1[l1 - 1] == su2[l2 - 1])
|
| + return 1;
|
| } else {
|
| - if (complexprefixes) {
|
| - int l1 = strlen(s1);
|
| - int l2 = strlen(s2);
|
| - if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
|
| - return 1;
|
| - } else if (csconv) {
|
| - const char* olds = s1;
|
| - // decapitalise dictionary word
|
| - if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))
|
| - return 0;
|
| - do {
|
| - s1++;
|
| - s2++;
|
| - } while ((*s1 == *s2) && (*s1 != '\0'));
|
| - return (int)(s1 - olds);
|
| - }
|
| + unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
|
| + unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
|
| + if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
|
| + return 0;
|
| + int i;
|
| + for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
|
| + (su1[i].h == su2[i].h);
|
| + i++)
|
| + ;
|
| + return i;
|
| + }
|
| + return 0;
|
| +}
|
| +
|
| +// length of the left common substring of s1 and (decapitalised) s2, 8-bit (non-UTF-8) version
|
| +int SuggestMgr::leftcommonsubstring(
|
| + const char* s1,
|
| + const char* s2) {
|
| + if (complexprefixes) {
|
| + int l1 = strlen(s1);
|
| + int l2 = strlen(s2);
|
| + if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
|
| + return 1;
|
| + } else if (csconv) {
|
| + const char* olds = s1;
|
| + // decapitalise dictionary word
|
| + if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))
|
| + return 0;
|
| + do {
|
| + s1++;
|
| + s2++;
|
| + } while ((*s1 == *s2) && (*s1 != '\0'));
|
| + return (int)(s1 - olds);
|
| }
|
| return 0;
|
| }
|
|
|