Index: third_party/hunspell/src/hunspell/suggestmgr.cxx |
diff --git a/third_party/hunspell/src/hunspell/suggestmgr.cxx b/third_party/hunspell/src/hunspell/suggestmgr.cxx |
index 4391bb9a2072da67ad209852e1b7ad307853ada7..4e122da3d6b0aa42eb836f0e207de3bed0be9869 100644 |
--- a/third_party/hunspell/src/hunspell/suggestmgr.cxx |
+++ b/third_party/hunspell/src/hunspell/suggestmgr.cxx |
@@ -1174,12 +1174,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; |
std::string target; |
std::string candidate; |
+ std::vector<w_char> w_candidate; |
if (ph) { |
if (utf8) { |
- std::vector<w_char> _w; |
- u8_u16(_w, word); |
- mkallcap_utf(_w, langnum); |
- u16_u8(candidate, _w); |
+ u8_u16(w_candidate, word); |
+ mkallcap_utf(w_candidate, langnum); |
+ u16_u8(candidate, w_candidate); |
} else { |
candidate.assign(word); |
if (!nonbmp) |
@@ -1193,6 +1193,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; |
FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL; |
+ std::vector<w_char> w_word, w_target; |
+ if (utf8) { |
+ u8_u16(w_word, word); |
+ u8_u16(w_target, target); |
+ } |
+ |
+ std::vector<w_char> w_entry; |
+ std::string f; |
+ std::vector<w_char> w_f; |
+ std::vector<w_char> w_target2; |
+ |
for (size_t i = 0; i < rHMgr.size(); ++i) { |
while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { |
if ((hp->astr) && (pAMgr) && |
@@ -1203,15 +1214,30 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
TESTAFF(hp->astr, onlyincompound, hp->alen))) |
continue; |
- sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + |
- leftcommonsubstring(word, HENTRY_WORD(hp)); |
+ if (utf8) { |
+ w_entry.clear(); |
+ u8_u16(w_entry, HENTRY_WORD(hp)); |
+ sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) + |
+ leftcommonsubstring(w_word, w_entry); |
+ } else { |
+ sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + |
+ leftcommonsubstring(word, HENTRY_WORD(hp)); |
+ } |
// check special pronounciation |
- std::string f; |
+ f.clear(); |
if ((hp->var & H_OPT_PHON) && |
copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { |
- int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + |
- +leftcommonsubstring(word, f.c_str()); |
+ int sc2; |
+ if (utf8) { |
+ w_f.clear(); |
+ u8_u16(w_f, f.c_str()); |
+ sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) + |
+ leftcommonsubstring(w_word, w_f); |
+ } else { |
+ sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + |
+ leftcommonsubstring(word, f.c_str()); |
+ } |
if (sc2 > sc) |
sc = sc2; |
} |
@@ -1219,16 +1245,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
int scphon = -20000; |
if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) { |
if (utf8) { |
- std::vector<w_char> _w; |
- u8_u16(_w, HENTRY_WORD(hp)); |
- mkallcap_utf(_w, langnum); |
- u16_u8(candidate, _w); |
+ w_candidate.clear(); |
+ u8_u16(w_candidate, HENTRY_WORD(hp)); |
+ mkallcap_utf(w_candidate, langnum); |
+ u16_u8(candidate, w_candidate); |
} else { |
- candidate.assign(HENTRY_WORD(hp)); |
+ candidate = HENTRY_WORD(hp); |
mkallcap(candidate, csconv); |
} |
std::string target2 = phonet(candidate, *ph); |
- scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); |
+ w_target2.clear(); |
+ if (utf8) { |
+ u8_u16(w_target2, target2.c_str()); |
+ scphon = 2 * ngram(3, w_target, w_target2, |
+ NGRAM_LONGER_WORSE); |
+ } else { |
+ scphon = 2 * ngram(3, target, target2, |
+ NGRAM_LONGER_WORSE); |
+ } |
} |
if (sc > scores[lp]) { |
@@ -1262,22 +1296,21 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
// find minimum threshold for a passable suggestion |
// mangle original word three differnt ways |
// and score them to generate a minimum acceptable score |
+ std::vector<w_char> w_mw; |
int thresh = 0; |
for (int sp = 1; sp < 4; sp++) { |
if (utf8) { |
- u8_u16(u8, word); |
+ w_mw = w_word; |
for (int k = sp; k < n; k += 4) { |
- u8[k].l = '*'; |
- u8[k].h = 0; |
+ w_mw[k].l = '*'; |
+ w_mw[k].h = 0; |
} |
- std::string mw; |
- u16_u8(mw, u8); |
- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); |
+ thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low); |
} else { |
- std::string mw(word); |
+ std::string mw = word; |
for (int k = sp; k < n; k += 4) |
mw[k] = '*'; |
- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); |
+ thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); |
} |
} |
thresh = thresh / 3; |
@@ -1305,11 +1338,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
return; |
} |
+ std::vector<w_char> w_glst_word; |
for (int i = 0; i < MAX_ROOTS; i++) { |
if (roots[i]) { |
struct hentry* rp = roots[i]; |
- std::string f; |
+ f.clear(); |
const char *field = NULL; |
if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON)) |
field = f.c_str(); |
@@ -1318,8 +1352,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
nc, field); |
for (int k = 0; k < nw; k++) { |
- sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) + |
- leftcommonsubstring(word, glst[k].word); |
+ if (utf8) { |
+ w_glst_word.clear(); |
+ u8_u16(w_glst_word, glst[k].word); |
+ sc = ngram(n, w_word, w_glst_word, |
+ NGRAM_ANY_MISMATCH + low) + |
+ leftcommonsubstring(w_word, w_glst_word); |
+ } else { |
+ sc = ngram(n, word, glst[k].word, |
+ NGRAM_ANY_MISMATCH + low) + |
+ leftcommonsubstring(word, glst[k].word); |
+ } |
if (sc > thresh) { |
if (sc > gscore[lp]) { |
@@ -1373,16 +1416,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
fact = (10.0 - maxd) / 5.0; |
} |
+ std::vector<w_char> w_gl; |
for (int i = 0; i < MAX_GUESS; i++) { |
if (guess[i]) { |
// lowering guess[i] |
std::string gl; |
int len; |
if (utf8) { |
- std::vector<w_char> _w; |
- len = u8_u16(_w, guess[i]); |
- mkallsmall_utf(_w, langnum); |
- u16_u8(gl, _w); |
+ w_gl.clear(); |
+ len = u8_u16(w_gl, guess[i]); |
+ mkallsmall_utf(w_gl, langnum); |
+ u16_u8(gl, w_gl); |
} else { |
gl.assign(guess[i]); |
if (!nonbmp) |
@@ -1399,14 +1443,29 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
} |
// using 2-gram instead of 3, and other weightening |
- re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + |
- ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); |
+ w_gl.clear(); |
+ if (utf8) { |
+ u8_u16(w_gl, gl); |
+ re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + |
+ ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); |
+ } else { |
+ re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + |
+ ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); |
+ } |
+ int ngram_score, leftcommon_score; |
+ if (utf8) { |
+ ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low); |
+ leftcommon_score = leftcommonsubstring(w_word, w_gl); |
+ } else { |
+ ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low); |
+ leftcommon_score = leftcommonsubstring(word, gl.c_str()); |
+ } |
gscore[i] = |
// length of longest common subsequent minus length difference |
2 * _lcs - abs((int)(n - len)) + |
// weight length of the left common substring |
- leftcommonsubstring(word, gl.c_str()) + |
+ leftcommon_score + |
// weight equal character positions |
(!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap) |
? 1 |
@@ -1414,7 +1473,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
// swap character (not neighboring) |
((is_swap) ? 10 : 0) + |
// ngram |
- ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) + |
+ ngram_score + |
// weighted ngrams |
re + |
// different limit for dictionaries with PHONE rules |
@@ -1432,11 +1491,11 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
// lowering rootphon[i] |
std::string gl; |
int len; |
+ w_gl.clear(); |
if (utf8) { |
- std::vector<w_char> _w; |
- len = u8_u16(_w, rootsphon[i]); |
- mkallsmall_utf(_w, langnum); |
- u16_u8(gl, _w); |
+ len = u8_u16(w_gl, rootsphon[i]); |
+ mkallsmall_utf(w_gl, langnum); |
+ u16_u8(gl, w_gl); |
} else { |
gl.assign(rootsphon[i]); |
if (!nonbmp) |
@@ -1444,10 +1503,15 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, |
len = strlen(rootsphon[i]); |
} |
+ // weight length of the left common substring |
+ int leftcommon_score; |
+ if (utf8) |
+ leftcommon_score = leftcommonsubstring(w_word, w_gl); |
+ else |
+ leftcommon_score = leftcommonsubstring(word, gl.c_str()); |
// heuristic weigthing of ngram scores |
scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) + |
- // weight length of the left common substring |
- leftcommonsubstring(word, gl.c_str()); |
+ leftcommon_score; |
} |
} |
@@ -1852,10 +1916,10 @@ std::string SuggestMgr::suggest_gen(const std::vector<std::string>& desc, const |
return result2; |
} |
-// generate an n-gram score comparing s1 and s2 |
+// generate an n-gram score comparing s1 and s2, UTF16 version |
int SuggestMgr::ngram(int n, |
- const std::string& s1, |
- const std::string& s2, |
+ const std::vector<w_char>& su1, |
+ const std::vector<w_char>& su2, |
int opt) { |
int nscore = 0; |
int ns; |
@@ -1863,68 +1927,44 @@ int SuggestMgr::ngram(int n, |
int l2; |
int test = 0; |
- if (utf8) { |
- std::vector<w_char> su1; |
- std::vector<w_char> su2; |
- l1 = u8_u16(su1, s1); |
- l2 = u8_u16(su2, s2); |
- if ((l2 <= 0) || (l1 == -1)) |
- return 0; |
- // lowering dictionary word |
- if (opt & NGRAM_LOWERING) |
- mkallsmall_utf(su2, langnum); |
- for (int j = 1; j <= n; j++) { |
- ns = 0; |
- for (int i = 0; i <= (l1 - j); i++) { |
- int k = 0; |
- for (int l = 0; l <= (l2 - j); l++) { |
- for (k = 0; k < j; k++) { |
- w_char& c1 = su1[i + k]; |
- w_char& c2 = su2[l + k]; |
- if ((c1.l != c2.l) || (c1.h != c2.h)) |
- break; |
- } |
- if (k == j) { |
- ns++; |
+ l1 = su1.size(); |
+ l2 = su2.size(); |
+ if (l2 == 0) |
+ return 0; |
+ // lowering dictionary word |
+ const std::vector<w_char>* p_su2 = &su2; |
+ std::vector<w_char> su2_copy; |
+ if (opt & NGRAM_LOWERING) { |
+ su2_copy = su2; |
+ mkallsmall_utf(su2_copy, langnum); |
+ p_su2 = &su2_copy; |
+ } |
+ for (int j = 1; j <= n; j++) { |
+ ns = 0; |
+ for (int i = 0; i <= (l1 - j); i++) { |
+ int k = 0; |
+ for (int l = 0; l <= (l2 - j); l++) { |
+ for (k = 0; k < j; k++) { |
+ const w_char& c1 = su1[i + k]; |
+ const w_char& c2 = (*p_su2)[l + k]; |
+ if ((c1.l != c2.l) || (c1.h != c2.h)) |
break; |
- } |
} |
- if (k != j && opt & NGRAM_WEIGHTED) { |
- ns--; |
- test++; |
- if (i == 0 || i == l1 - j) |
- ns--; // side weight |
- } |
- } |
- nscore = nscore + ns; |
- if (ns < 2 && !(opt & NGRAM_WEIGHTED)) |
- break; |
- } |
- } else { |
- l2 = s2.size(); |
- if (l2 == 0) |
- return 0; |
- l1 = s1.size(); |
- std::string t(s2); |
- if (opt & NGRAM_LOWERING) |
- mkallsmall(t, csconv); |
- for (int j = 1; j <= n; j++) { |
- ns = 0; |
- for (int i = 0; i <= (l1 - j); i++) { |
- //t is haystack, s1[i..i+j) is needle |
- if (t.find(s1.c_str()+i, 0, j) != std::string::npos) { |
+ if (k == j) { |
ns++; |
- } else if (opt & NGRAM_WEIGHTED) { |
- ns--; |
- test++; |
- if (i == 0 || i == l1 - j) |
- ns--; // side weight |
+ break; |
} |
} |
- nscore = nscore + ns; |
- if (ns < 2 && !(opt & NGRAM_WEIGHTED)) |
- break; |
+ if (k != j && opt & NGRAM_WEIGHTED) { |
+ ns--; |
+ test++; |
+ if (i == 0 || i == l1 - j) |
+ ns--; // side weight |
+ } |
} |
+ nscore = nscore + ns; |
+ if (ns < 2 && !(opt & NGRAM_WEIGHTED)) |
+ break; |
} |
ns = 0; |
@@ -1936,46 +1976,95 @@ int SuggestMgr::ngram(int n, |
return ns; |
} |
-// length of the left common substring of s1 and (decapitalised) s2 |
-int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) { |
- if (utf8) { |
- std::vector<w_char> su1; |
- std::vector<w_char> su2; |
- int l1 = u8_u16(su1, s1); |
- int l2 = u8_u16(su2, s2); |
- // decapitalize dictionary word |
- if (complexprefixes) { |
- if (su1[l1 - 1] == su2[l2 - 1]) |
- return 1; |
- } else { |
- unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; |
- unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; |
- if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) |
- return 0; |
- int i; |
- for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && |
- (su1[i].h == su2[i].h); |
- i++) |
- ; |
- return i; |
+// generate an n-gram score comparing s1 and s2, non-UTF16 version |
+int SuggestMgr::ngram(int n, |
+ const std::string& s1, |
+ const std::string& s2, |
+ int opt) { |
+ int nscore = 0; |
+ int ns; |
+ int l1; |
+ int l2; |
+ int test = 0; |
+ |
+ l2 = s2.size(); |
+ if (l2 == 0) |
+ return 0; |
+ l1 = s1.size(); |
+ std::string t(s2); |
+ if (opt & NGRAM_LOWERING) |
+ mkallsmall(t, csconv); |
+ for (int j = 1; j <= n; j++) { |
+ ns = 0; |
+ for (int i = 0; i <= (l1 - j); i++) { |
+ //t is haystack, s1[i..i+j) is needle |
+ if (t.find(s1.c_str()+i, 0, j) != std::string::npos) { |
+ ns++; |
+ } else if (opt & NGRAM_WEIGHTED) { |
+ ns--; |
+ test++; |
+ if (i == 0 || i == l1 - j) |
+ ns--; // side weight |
+ } |
} |
+ nscore = nscore + ns; |
+ if (ns < 2 && !(opt & NGRAM_WEIGHTED)) |
+ break; |
+ } |
+ |
+ ns = 0; |
+ if (opt & NGRAM_LONGER_WORSE) |
+ ns = (l2 - l1) - 2; |
+ if (opt & NGRAM_ANY_MISMATCH) |
+ ns = abs(l2 - l1) - 2; |
+ ns = (nscore - ((ns > 0) ? ns : 0)); |
+ return ns; |
+} |
+ |
+// length of the left common substring of s1 and (decapitalised) s2, UTF version |
+int SuggestMgr::leftcommonsubstring( |
+ const std::vector<w_char>& su1, |
+ const std::vector<w_char>& su2) { |
+ int l1 = su1.size(); |
+ int l2 = su2.size(); |
+ // decapitalize dictionary word |
+ if (complexprefixes) { |
+ if (su1[l1 - 1] == su2[l2 - 1]) |
+ return 1; |
} else { |
- if (complexprefixes) { |
- int l1 = strlen(s1); |
- int l2 = strlen(s2); |
- if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) |
- return 1; |
- } else if (csconv) { |
- const char* olds = s1; |
- // decapitalise dictionary word |
- if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) |
- return 0; |
- do { |
- s1++; |
- s2++; |
- } while ((*s1 == *s2) && (*s1 != '\0')); |
- return (int)(s1 - olds); |
- } |
+ unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; |
+ unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; |
+ if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) |
+ return 0; |
+ int i; |
+ for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && |
+ (su1[i].h == su2[i].h); |
+ i++) |
+ ; |
+ return i; |
+ } |
+ return 0; |
+} |
+ |
+// length of the left common substring of s1 and (decapitalised) s2, non-UTF |
+int SuggestMgr::leftcommonsubstring( |
+ const char* s1, |
+ const char* s2) { |
+ if (complexprefixes) { |
+ int l1 = strlen(s1); |
+ int l2 = strlen(s2); |
+ if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) |
+ return 1; |
+ } else if (csconv) { |
+ const char* olds = s1; |
+ // decapitalise dictionary word |
+ if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) |
+ return 0; |
+ do { |
+ s1++; |
+ s2++; |
+ } while ((*s1 == *s2) && (*s1 != '\0')); |
+ return (int)(s1 - olds); |
} |
return 0; |
} |