third_party/hunspell/src/hunspell/suggestmgr.cxx - Issue 2587363003: [spellcheck] Updated Hunspell to 1.6.0

Unified Diff: third_party/hunspell/src/hunspell/suggestmgr.cxx

Issue 2587363003: [spellcheck] Updated Hunspell to 1.6.0 (Closed)

Patch Set: Handled presubmit warnings Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/hunspell/src/hunspell/suggestmgr.cxx

diff --git a/third_party/hunspell/src/hunspell/suggestmgr.cxx b/third_party/hunspell/src/hunspell/suggestmgr.cxx

index 4391bb9a2072da67ad209852e1b7ad307853ada7..4e122da3d6b0aa42eb836f0e207de3bed0be9869 100644

--- a/third_party/hunspell/src/hunspell/suggestmgr.cxx

+++ b/third_party/hunspell/src/hunspell/suggestmgr.cxx

@@ -1174,12 +1174,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;

std::string target;

std::string candidate;

+ std::vector<w_char> w_candidate;

if (ph) {

if (utf8) {

- std::vector<w_char> _w;

- u8_u16(_w, word);

- mkallcap_utf(_w, langnum);

- u16_u8(candidate, _w);

+ u8_u16(w_candidate, word);

+ mkallcap_utf(w_candidate, langnum);

+ u16_u8(candidate, w_candidate);

} else {

candidate.assign(word);

if (!nonbmp)

@@ -1193,6 +1193,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;

FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;

+ std::vector<w_char> w_word, w_target;

+ if (utf8) {

+ u8_u16(w_word, word);

+ u8_u16(w_target, target);

+ }

+ std::vector<w_char> w_entry;

+ std::string f;

+ std::vector<w_char> w_f;

+ std::vector<w_char> w_target2;

for (size_t i = 0; i < rHMgr.size(); ++i) {

while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {

if ((hp->astr) && (pAMgr) &&

@@ -1203,15 +1214,30 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

TESTAFF(hp->astr, onlyincompound, hp->alen)))

continue;

- sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +

- leftcommonsubstring(word, HENTRY_WORD(hp));

+ if (utf8) {

+ w_entry.clear();

+ u8_u16(w_entry, HENTRY_WORD(hp));

+ sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +

+ leftcommonsubstring(w_word, w_entry);

+ } else {

+ sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +

+ leftcommonsubstring(word, HENTRY_WORD(hp));

+ }

// check special pronunciation

- std::string f;

+ f.clear();

if ((hp->var & H_OPT_PHON) &&

copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {

- int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +

- +leftcommonsubstring(word, f.c_str());

+ int sc2;

+ if (utf8) {

+ w_f.clear();

+ u8_u16(w_f, f.c_str());

+ sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +

+ leftcommonsubstring(w_word, w_f);

+ } else {

+ sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +

+ leftcommonsubstring(word, f.c_str());

+ }

if (sc2 > sc)

sc = sc2;

}

@@ -1219,16 +1245,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

int scphon = -20000;

if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {

if (utf8) {

- std::vector<w_char> _w;

- u8_u16(_w, HENTRY_WORD(hp));

- mkallcap_utf(_w, langnum);

- u16_u8(candidate, _w);

+ w_candidate.clear();

+ u8_u16(w_candidate, HENTRY_WORD(hp));

+ mkallcap_utf(w_candidate, langnum);

+ u16_u8(candidate, w_candidate);

} else {

- candidate.assign(HENTRY_WORD(hp));

+ candidate = HENTRY_WORD(hp);

mkallcap(candidate, csconv);

}

std::string target2 = phonet(candidate, *ph);

- scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);

+ w_target2.clear();

+ if (utf8) {

+ u8_u16(w_target2, target2.c_str());

+ scphon = 2 * ngram(3, w_target, w_target2,

+ NGRAM_LONGER_WORSE);

+ } else {

+ scphon = 2 * ngram(3, target, target2,

+ NGRAM_LONGER_WORSE);

+ }

}

if (sc > scores[lp]) {

@@ -1262,22 +1296,21 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

// find minimum threshold for a passable suggestion

// mangle original word three different ways

// and score them to generate a minimum acceptable score

+ std::vector<w_char> w_mw;

int thresh = 0;

for (int sp = 1; sp < 4; sp++) {

if (utf8) {

- u8_u16(u8, word);

+ w_mw = w_word;

for (int k = sp; k < n; k += 4) {

- u8[k].l = '*';

- u8[k].h = 0;

+ w_mw[k].l = '*';

+ w_mw[k].h = 0;

}

- std::string mw;

- u16_u8(mw, u8);

- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);

+ thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);

} else {

- std::string mw(word);

+ std::string mw = word;

for (int k = sp; k < n; k += 4)

mw[k] = '*';

- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);

+ thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);

}

thresh = thresh / 3;

@@ -1305,11 +1338,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

return;

}

+ std::vector<w_char> w_glst_word;

for (int i = 0; i < MAX_ROOTS; i++) {

if (roots[i]) {

struct hentry* rp = roots[i];

- std::string f;

+ f.clear();

const char *field = NULL;

if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))

field = f.c_str();

@@ -1318,8 +1352,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

nc, field);

for (int k = 0; k < nw; k++) {

- sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +

- leftcommonsubstring(word, glst[k].word);

+ if (utf8) {

+ w_glst_word.clear();

+ u8_u16(w_glst_word, glst[k].word);

+ sc = ngram(n, w_word, w_glst_word,

+ NGRAM_ANY_MISMATCH + low) +

+ leftcommonsubstring(w_word, w_glst_word);

+ } else {

+ sc = ngram(n, word, glst[k].word,

+ NGRAM_ANY_MISMATCH + low) +

+ leftcommonsubstring(word, glst[k].word);

+ }

if (sc > thresh) {

if (sc > gscore[lp]) {

@@ -1373,16 +1416,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

fact = (10.0 - maxd) / 5.0;

}

+ std::vector<w_char> w_gl;

for (int i = 0; i < MAX_GUESS; i++) {

if (guess[i]) {

// lowering guess[i]

std::string gl;

int len;

if (utf8) {

- std::vector<w_char> _w;

- len = u8_u16(_w, guess[i]);

- mkallsmall_utf(_w, langnum);

- u16_u8(gl, _w);

+ w_gl.clear();

+ len = u8_u16(w_gl, guess[i]);

+ mkallsmall_utf(w_gl, langnum);

+ u16_u8(gl, w_gl);

} else {

gl.assign(guess[i]);

if (!nonbmp)

@@ -1399,14 +1443,29 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

}

// using 2-gram instead of 3, and other weighting

- re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +

- ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);

+ w_gl.clear();

+ if (utf8) {

+ u8_u16(w_gl, gl);

+ re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +

+ ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);

+ } else {

+ re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +

+ ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);

+ }

+ int ngram_score, leftcommon_score;

+ if (utf8) {

+ ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);

+ leftcommon_score = leftcommonsubstring(w_word, w_gl);

+ } else {

+ ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);

+ leftcommon_score = leftcommonsubstring(word, gl.c_str());

+ }

gscore[i] =

// length of longest common subsequence minus length difference

2 * _lcs - abs((int)(n - len)) +

// weight length of the left common substring

- leftcommonsubstring(word, gl.c_str()) +

+ leftcommon_score +

// weight equal character positions

(!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap)

? 1

@@ -1414,7 +1473,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

// swap character (not neighboring)

((is_swap) ? 10 : 0) +

// ngram

- ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +

+ ngram_score +

// weighted ngrams

re +

// different limit for dictionaries with PHONE rules

@@ -1432,11 +1491,11 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

// lowering rootphon[i]

std::string gl;

int len;

+ w_gl.clear();

if (utf8) {

- std::vector<w_char> _w;

- len = u8_u16(_w, rootsphon[i]);

- mkallsmall_utf(_w, langnum);

- u16_u8(gl, _w);

+ len = u8_u16(w_gl, rootsphon[i]);

+ mkallsmall_utf(w_gl, langnum);

+ u16_u8(gl, w_gl);

} else {

gl.assign(rootsphon[i]);

if (!nonbmp)

@@ -1444,10 +1503,15 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,

len = strlen(rootsphon[i]);

}

+ // weight length of the left common substring

+ int leftcommon_score;

+ if (utf8)

+ leftcommon_score = leftcommonsubstring(w_word, w_gl);

+ else

+ leftcommon_score = leftcommonsubstring(word, gl.c_str());

// heuristic weighting of ngram scores

scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) +

- // weight length of the left common substring

- leftcommonsubstring(word, gl.c_str());

+ leftcommon_score;

}

@@ -1852,10 +1916,10 @@ std::string SuggestMgr::suggest_gen(const std::vector<std::string>& desc, const

return result2;

}

-// generate an n-gram score comparing s1 and s2

+// generate an n-gram score comparing s1 and s2, UTF16 version

int SuggestMgr::ngram(int n,

- const std::string& s1,

- const std::string& s2,

+ const std::vector<w_char>& su1,

+ const std::vector<w_char>& su2,

int opt) {

int nscore = 0;

int ns;

@@ -1863,68 +1927,44 @@ int SuggestMgr::ngram(int n,

int l2;

int test = 0;

- if (utf8) {

- std::vector<w_char> su1;

- std::vector<w_char> su2;

- l1 = u8_u16(su1, s1);

- l2 = u8_u16(su2, s2);

- if ((l2 <= 0) || (l1 == -1))

- return 0;

- // lowering dictionary word

- if (opt & NGRAM_LOWERING)

- mkallsmall_utf(su2, langnum);

- for (int j = 1; j <= n; j++) {

- ns = 0;

- for (int i = 0; i <= (l1 - j); i++) {

- int k = 0;

- for (int l = 0; l <= (l2 - j); l++) {

- for (k = 0; k < j; k++) {

- w_char& c1 = su1[i + k];

- w_char& c2 = su2[l + k];

- if ((c1.l != c2.l) || (c1.h != c2.h))

- break;

- }

- if (k == j) {

- ns++;

+ l1 = su1.size();

+ l2 = su2.size();

+ if (l2 == 0)

+ return 0;

+ // lowering dictionary word

+ const std::vector<w_char>* p_su2 = &su2;

+ std::vector<w_char> su2_copy;

+ if (opt & NGRAM_LOWERING) {

+ su2_copy = su2;

+ mkallsmall_utf(su2_copy, langnum);

+ p_su2 = &su2_copy;

+ }

+ for (int j = 1; j <= n; j++) {

+ ns = 0;

+ for (int i = 0; i <= (l1 - j); i++) {

+ int k = 0;

+ for (int l = 0; l <= (l2 - j); l++) {

+ for (k = 0; k < j; k++) {

+ const w_char& c1 = su1[i + k];

+ const w_char& c2 = (*p_su2)[l + k];

+ if ((c1.l != c2.l) || (c1.h != c2.h))

break;

- }

}

- if (k != j && opt & NGRAM_WEIGHTED) {

- ns--;

- test++;

- if (i == 0 || i == l1 - j)

- ns--; // side weight

- }

- nscore = nscore + ns;

- if (ns < 2 && !(opt & NGRAM_WEIGHTED))

- break;

- }

- } else {

- l2 = s2.size();

- if (l2 == 0)

- return 0;

- l1 = s1.size();

- std::string t(s2);

- if (opt & NGRAM_LOWERING)

- mkallsmall(t, csconv);

- for (int j = 1; j <= n; j++) {

- ns = 0;

- for (int i = 0; i <= (l1 - j); i++) {

- //t is haystack, s1[i..i+j) is needle

- if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {

+ if (k == j) {

ns++;

- } else if (opt & NGRAM_WEIGHTED) {

- ns--;

- test++;

- if (i == 0 || i == l1 - j)

- ns--; // side weight

+ break;

}

- nscore = nscore + ns;

- if (ns < 2 && !(opt & NGRAM_WEIGHTED))

- break;

+ if (k != j && opt & NGRAM_WEIGHTED) {

+ ns--;

+ test++;

+ if (i == 0 || i == l1 - j)

+ ns--; // side weight

+ }

}

+ nscore = nscore + ns;

+ if (ns < 2 && !(opt & NGRAM_WEIGHTED))

+ break;

}

ns = 0;

@@ -1936,46 +1976,95 @@ int SuggestMgr::ngram(int n,

return ns;

}

-// length of the left common substring of s1 and (decapitalised) s2

-int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) {

- if (utf8) {

- std::vector<w_char> su1;

- std::vector<w_char> su2;

- int l1 = u8_u16(su1, s1);

- int l2 = u8_u16(su2, s2);

- // decapitalize dictionary word

- if (complexprefixes) {

- if (su1[l1 - 1] == su2[l2 - 1])

- return 1;

- } else {

- unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;

- unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;

- if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))

- return 0;

- int i;

- for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&

- (su1[i].h == su2[i].h);

- i++)

- ;

- return i;

+// generate an n-gram score comparing s1 and s2, non-UTF16 version

+int SuggestMgr::ngram(int n,

+ const std::string& s1,

+ const std::string& s2,

+ int opt) {

+ int nscore = 0;

+ int ns;

+ int l1;

+ int l2;

+ int test = 0;

+ l2 = s2.size();

+ if (l2 == 0)

+ return 0;

+ l1 = s1.size();

+ std::string t(s2);

+ if (opt & NGRAM_LOWERING)

+ mkallsmall(t, csconv);

+ for (int j = 1; j <= n; j++) {

+ ns = 0;

+ for (int i = 0; i <= (l1 - j); i++) {

+ //t is haystack, s1[i..i+j) is needle

+ if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {

+ ns++;

+ } else if (opt & NGRAM_WEIGHTED) {

+ ns--;

+ test++;

+ if (i == 0 || i == l1 - j)

+ ns--; // side weight

+ }

}

+ nscore = nscore + ns;

+ if (ns < 2 && !(opt & NGRAM_WEIGHTED))

+ break;

+ }

+ ns = 0;

+ if (opt & NGRAM_LONGER_WORSE)

+ ns = (l2 - l1) - 2;

+ if (opt & NGRAM_ANY_MISMATCH)

+ ns = abs(l2 - l1) - 2;

+ ns = (nscore - ((ns > 0) ? ns : 0));

+ return ns;

+// length of the left common substring of s1 and (decapitalised) s2, UTF version

+int SuggestMgr::leftcommonsubstring(

+ const std::vector<w_char>& su1,

+ const std::vector<w_char>& su2) {

+ int l1 = su1.size();

+ int l2 = su2.size();

+ // decapitalize dictionary word

+ if (complexprefixes) {

+ if (su1[l1 - 1] == su2[l2 - 1])

+ return 1;

} else {

- if (complexprefixes) {

- int l1 = strlen(s1);

- int l2 = strlen(s2);

- if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])

- return 1;

- } else if (csconv) {

- const char* olds = s1;

- // decapitalise dictionary word

- if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))

- return 0;

- do {

- s1++;

- s2++;

- } while ((*s1 == *s2) && (*s1 != '\0'));

- return (int)(s1 - olds);

- }

+ unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;

+ unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;

+ if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))

+ return 0;

+ int i;

+ for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&

+ (su1[i].h == su2[i].h);

+ i++)

+ ;

+ return i;

+ }

+ return 0;

+// length of the left common substring of s1 and (decapitalised) s2, non-UTF

+int SuggestMgr::leftcommonsubstring(

+ const char* s1,

+ const char* s2) {

+ if (complexprefixes) {

+ int l1 = strlen(s1);

+ int l2 = strlen(s2);

+ if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])

+ return 1;

+ } else if (csconv) {

+ const char* olds = s1;

+ // decapitalise dictionary word

+ if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))

+ return 0;

+ do {

+ s1++;

+ s2++;

+ } while ((*s1 == *s2) && (*s1 != '\0'));

+ return (int)(s1 - olds);

}

return 0;

}

« no previous file with comments | « third_party/hunspell/src/hunspell/suggestmgr.hxx ('k') | no next file » | no next file with comments »