OLD | NEW |
1 /* ***** BEGIN LICENSE BLOCK ***** | 1 /* ***** BEGIN LICENSE BLOCK ***** |
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 | 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
3 * | 3 * |
4 * The contents of this file are subject to the Mozilla Public License Version | 4 * The contents of this file are subject to the Mozilla Public License Version |
5 * 1.1 (the "License"); you may not use this file except in compliance with | 5 * 1.1 (the "License"); you may not use this file except in compliance with |
6 * the License. You may obtain a copy of the License at | 6 * the License. You may obtain a copy of the License at |
7 * http://www.mozilla.org/MPL/ | 7 * http://www.mozilla.org/MPL/ |
8 * | 8 * |
9 * Software distributed under the License is distributed on an "AS IS" basis, | 9 * Software distributed under the License is distributed on an "AS IS" basis, |
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | 10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
(...skipping 1156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1167 } | 1167 } |
1168 | 1168 |
1169 struct hentry* hp = NULL; | 1169 struct hentry* hp = NULL; |
1170 int col = -1; | 1170 int col = -1; |
1171 #ifdef HUNSPELL_CHROME_CLIENT | 1171 #ifdef HUNSPELL_CHROME_CLIENT |
1172 ScopedHashEntryFactory hash_entry_factory; | 1172 ScopedHashEntryFactory hash_entry_factory; |
1173 #endif | 1173 #endif |
1174 phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; | 1174 phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; |
1175 std::string target; | 1175 std::string target; |
1176 std::string candidate; | 1176 std::string candidate; |
| 1177 std::vector<w_char> w_candidate; |
1177 if (ph) { | 1178 if (ph) { |
1178 if (utf8) { | 1179 if (utf8) { |
1179 std::vector<w_char> _w; | 1180 u8_u16(w_candidate, word); |
1180 u8_u16(_w, word); | 1181 mkallcap_utf(w_candidate, langnum); |
1181 mkallcap_utf(_w, langnum); | 1182 u16_u8(candidate, w_candidate); |
1182 u16_u8(candidate, _w); | |
1183 } else { | 1183 } else { |
1184 candidate.assign(word); | 1184 candidate.assign(word); |
1185 if (!nonbmp) | 1185 if (!nonbmp) |
1186 mkallcap(candidate, csconv); | 1186 mkallcap(candidate, csconv); |
1187 } | 1187 } |
1188 target = phonet(candidate, *ph); // XXX phonet() is 8-bit (nc, not n) | 1188 target = phonet(candidate, *ph); // XXX phonet() is 8-bit (nc, not n) |
1189 } | 1189 } |
1190 | 1190 |
1191 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL; | 1191 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL; |
1192 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL; | 1192 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL; |
1193 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; | 1193 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; |
1194 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL; | 1194 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL; |
1195 | 1195 |
| 1196 std::vector<w_char> w_word, w_target; |
| 1197 if (utf8) { |
| 1198 u8_u16(w_word, word); |
| 1199 u8_u16(w_target, target); |
| 1200 } |
| 1201 |
| 1202 std::vector<w_char> w_entry; |
| 1203 std::string f; |
| 1204 std::vector<w_char> w_f; |
| 1205 std::vector<w_char> w_target2; |
| 1206 |
1196 for (size_t i = 0; i < rHMgr.size(); ++i) { | 1207 for (size_t i = 0; i < rHMgr.size(); ++i) { |
1197 while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { | 1208 while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { |
1198 if ((hp->astr) && (pAMgr) && | 1209 if ((hp->astr) && (pAMgr) && |
1199 (TESTAFF(hp->astr, forbiddenword, hp->alen) || | 1210 (TESTAFF(hp->astr, forbiddenword, hp->alen) || |
1200 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || | 1211 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || |
1201 TESTAFF(hp->astr, nosuggest, hp->alen) || | 1212 TESTAFF(hp->astr, nosuggest, hp->alen) || |
1202 TESTAFF(hp->astr, nongramsuggest, hp->alen) || | 1213 TESTAFF(hp->astr, nongramsuggest, hp->alen) || |
1203 TESTAFF(hp->astr, onlyincompound, hp->alen))) | 1214 TESTAFF(hp->astr, onlyincompound, hp->alen))) |
1204 continue; | 1215 continue; |
1205 | 1216 |
1206 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + | 1217 if (utf8) { |
1207 leftcommonsubstring(word, HENTRY_WORD(hp)); | 1218 w_entry.clear(); |
| 1219 u8_u16(w_entry, HENTRY_WORD(hp)); |
| 1220 sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) + |
| 1221 leftcommonsubstring(w_word, w_entry); |
| 1222 } else { |
| 1223 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + |
| 1224 leftcommonsubstring(word, HENTRY_WORD(hp)); |
| 1225 } |
1208 | 1226 |
1209 // check special pronounciation | 1227 // check special pronounciation |
1210 std::string f; | 1228 f.clear(); |
1211 if ((hp->var & H_OPT_PHON) && | 1229 if ((hp->var & H_OPT_PHON) && |
1212 copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { | 1230 copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { |
1213 int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + | 1231 int sc2; |
1214 +leftcommonsubstring(word, f.c_str()); | 1232 if (utf8) { |
| 1233 w_f.clear(); |
| 1234 u8_u16(w_f, f.c_str()); |
| 1235 sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) + |
| 1236 leftcommonsubstring(w_word, w_f); |
| 1237 } else { |
| 1238 sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + |
| 1239 leftcommonsubstring(word, f.c_str()); |
| 1240 } |
1215 if (sc2 > sc) | 1241 if (sc2 > sc) |
1216 sc = sc2; | 1242 sc = sc2; |
1217 } | 1243 } |
1218 | 1244 |
1219 int scphon = -20000; | 1245 int scphon = -20000; |
1220 if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) { | 1246 if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) { |
1221 if (utf8) { | 1247 if (utf8) { |
1222 std::vector<w_char> _w; | 1248 w_candidate.clear(); |
1223 u8_u16(_w, HENTRY_WORD(hp)); | 1249 u8_u16(w_candidate, HENTRY_WORD(hp)); |
1224 mkallcap_utf(_w, langnum); | 1250 mkallcap_utf(w_candidate, langnum); |
1225 u16_u8(candidate, _w); | 1251 u16_u8(candidate, w_candidate); |
1226 } else { | 1252 } else { |
1227 candidate.assign(HENTRY_WORD(hp)); | 1253 candidate = HENTRY_WORD(hp); |
1228 mkallcap(candidate, csconv); | 1254 mkallcap(candidate, csconv); |
1229 } | 1255 } |
1230 std::string target2 = phonet(candidate, *ph); | 1256 std::string target2 = phonet(candidate, *ph); |
1231 scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); | 1257 w_target2.clear(); |
| 1258 if (utf8) { |
| 1259 u8_u16(w_target2, target2.c_str()); |
| 1260 scphon = 2 * ngram(3, w_target, w_target2, |
| 1261 NGRAM_LONGER_WORSE); |
| 1262 } else { |
| 1263 scphon = 2 * ngram(3, target, target2, |
| 1264 NGRAM_LONGER_WORSE); |
| 1265 } |
1232 } | 1266 } |
1233 | 1267 |
1234 if (sc > scores[lp]) { | 1268 if (sc > scores[lp]) { |
1235 scores[lp] = sc; | 1269 scores[lp] = sc; |
1236 #ifdef HUNSPELL_CHROME_CLIENT | 1270 #ifdef HUNSPELL_CHROME_CLIENT |
1237 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp); | 1271 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp); |
1238 #else | 1272 #else |
1239 roots[lp] = hp; | 1273 roots[lp] = hp; |
1240 #endif | 1274 #endif |
1241 lval = sc; | 1275 lval = sc; |
(...skipping 13 matching lines...) Expand all Loading... |
1255 lpphon = j; | 1289 lpphon = j; |
1256 lval = scoresphon[j]; | 1290 lval = scoresphon[j]; |
1257 } | 1291 } |
1258 } | 1292 } |
1259 } | 1293 } |
1260 } | 1294 } |
1261 | 1295 |
1262 // find minimum threshold for a passable suggestion | 1296 // find minimum threshold for a passable suggestion |
1263 // mangle original word three differnt ways | 1297 // mangle original word three differnt ways |
1264 // and score them to generate a minimum acceptable score | 1298 // and score them to generate a minimum acceptable score |
| 1299 std::vector<w_char> w_mw; |
1265 int thresh = 0; | 1300 int thresh = 0; |
1266 for (int sp = 1; sp < 4; sp++) { | 1301 for (int sp = 1; sp < 4; sp++) { |
1267 if (utf8) { | 1302 if (utf8) { |
1268 u8_u16(u8, word); | 1303 w_mw = w_word; |
1269 for (int k = sp; k < n; k += 4) { | 1304 for (int k = sp; k < n; k += 4) { |
1270 u8[k].l = '*'; | 1305 w_mw[k].l = '*'; |
1271 u8[k].h = 0; | 1306 w_mw[k].h = 0; |
1272 } | 1307 } |
1273 std::string mw; | 1308 thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low); |
1274 u16_u8(mw, u8); | |
1275 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); | |
1276 } else { | 1309 } else { |
1277 std::string mw(word); | 1310 std::string mw = word; |
1278 for (int k = sp; k < n; k += 4) | 1311 for (int k = sp; k < n; k += 4) |
1279 mw[k] = '*'; | 1312 mw[k] = '*'; |
1280 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); | 1313 thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); |
1281 } | 1314 } |
1282 } | 1315 } |
1283 thresh = thresh / 3; | 1316 thresh = thresh / 3; |
1284 thresh--; | 1317 thresh--; |
1285 | 1318 |
1286 // now expand affixes on each of these root words and | 1319 // now expand affixes on each of these root words and |
1287 // and use length adjusted ngram scores to select | 1320 // and use length adjusted ngram scores to select |
1288 // possible suggestions | 1321 // possible suggestions |
1289 char* guess[MAX_GUESS]; | 1322 char* guess[MAX_GUESS]; |
1290 char* guessorig[MAX_GUESS]; | 1323 char* guessorig[MAX_GUESS]; |
1291 int gscore[MAX_GUESS]; | 1324 int gscore[MAX_GUESS]; |
1292 for (int i = 0; i < MAX_GUESS; i++) { | 1325 for (int i = 0; i < MAX_GUESS; i++) { |
1293 guess[i] = NULL; | 1326 guess[i] = NULL; |
1294 guessorig[i] = NULL; | 1327 guessorig[i] = NULL; |
1295 gscore[i] = -100 * i; | 1328 gscore[i] = -100 * i; |
1296 } | 1329 } |
1297 | 1330 |
1298 lp = MAX_GUESS - 1; | 1331 lp = MAX_GUESS - 1; |
1299 | 1332 |
1300 struct guessword* glst; | 1333 struct guessword* glst; |
1301 glst = (struct guessword*)calloc(MAX_WORDS, sizeof(struct guessword)); | 1334 glst = (struct guessword*)calloc(MAX_WORDS, sizeof(struct guessword)); |
1302 if (!glst) { | 1335 if (!glst) { |
1303 if (nonbmp) | 1336 if (nonbmp) |
1304 utf8 = 1; | 1337 utf8 = 1; |
1305 return; | 1338 return; |
1306 } | 1339 } |
1307 | 1340 |
| 1341 std::vector<w_char> w_glst_word; |
1308 for (int i = 0; i < MAX_ROOTS; i++) { | 1342 for (int i = 0; i < MAX_ROOTS; i++) { |
1309 if (roots[i]) { | 1343 if (roots[i]) { |
1310 struct hentry* rp = roots[i]; | 1344 struct hentry* rp = roots[i]; |
1311 | 1345 |
1312 std::string f; | 1346 f.clear(); |
1313 const char *field = NULL; | 1347 const char *field = NULL; |
1314 if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON)) | 1348 if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON)) |
1315 field = f.c_str(); | 1349 field = f.c_str(); |
1316 int nw = pAMgr->expand_rootword( | 1350 int nw = pAMgr->expand_rootword( |
1317 glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word, | 1351 glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word, |
1318 nc, field); | 1352 nc, field); |
1319 | 1353 |
1320 for (int k = 0; k < nw; k++) { | 1354 for (int k = 0; k < nw; k++) { |
1321 sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) + | 1355 if (utf8) { |
1322 leftcommonsubstring(word, glst[k].word); | 1356 w_glst_word.clear(); |
| 1357 u8_u16(w_glst_word, glst[k].word); |
| 1358 sc = ngram(n, w_word, w_glst_word, |
| 1359 NGRAM_ANY_MISMATCH + low) + |
| 1360 leftcommonsubstring(w_word, w_glst_word); |
| 1361 } else { |
| 1362 sc = ngram(n, word, glst[k].word, |
| 1363 NGRAM_ANY_MISMATCH + low) + |
| 1364 leftcommonsubstring(word, glst[k].word); |
| 1365 } |
1323 | 1366 |
1324 if (sc > thresh) { | 1367 if (sc > thresh) { |
1325 if (sc > gscore[lp]) { | 1368 if (sc > gscore[lp]) { |
1326 if (guess[lp]) { | 1369 if (guess[lp]) { |
1327 free(guess[lp]); | 1370 free(guess[lp]); |
1328 if (guessorig[lp]) { | 1371 if (guessorig[lp]) { |
1329 free(guessorig[lp]); | 1372 free(guessorig[lp]); |
1330 guessorig[lp] = NULL; | 1373 guessorig[lp] = NULL; |
1331 } | 1374 } |
1332 } | 1375 } |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1366 | 1409 |
1367 int is_swap = 0; | 1410 int is_swap = 0; |
1368 int re = 0; | 1411 int re = 0; |
1369 double fact = 1.0; | 1412 double fact = 1.0; |
1370 if (pAMgr) { | 1413 if (pAMgr) { |
1371 int maxd = pAMgr->get_maxdiff(); | 1414 int maxd = pAMgr->get_maxdiff(); |
1372 if (maxd >= 0) | 1415 if (maxd >= 0) |
1373 fact = (10.0 - maxd) / 5.0; | 1416 fact = (10.0 - maxd) / 5.0; |
1374 } | 1417 } |
1375 | 1418 |
| 1419 std::vector<w_char> w_gl; |
1376 for (int i = 0; i < MAX_GUESS; i++) { | 1420 for (int i = 0; i < MAX_GUESS; i++) { |
1377 if (guess[i]) { | 1421 if (guess[i]) { |
1378 // lowering guess[i] | 1422 // lowering guess[i] |
1379 std::string gl; | 1423 std::string gl; |
1380 int len; | 1424 int len; |
1381 if (utf8) { | 1425 if (utf8) { |
1382 std::vector<w_char> _w; | 1426 w_gl.clear(); |
1383 len = u8_u16(_w, guess[i]); | 1427 len = u8_u16(w_gl, guess[i]); |
1384 mkallsmall_utf(_w, langnum); | 1428 mkallsmall_utf(w_gl, langnum); |
1385 u16_u8(gl, _w); | 1429 u16_u8(gl, w_gl); |
1386 } else { | 1430 } else { |
1387 gl.assign(guess[i]); | 1431 gl.assign(guess[i]); |
1388 if (!nonbmp) | 1432 if (!nonbmp) |
1389 mkallsmall(gl, csconv); | 1433 mkallsmall(gl, csconv); |
1390 len = strlen(guess[i]); | 1434 len = strlen(guess[i]); |
1391 } | 1435 } |
1392 | 1436 |
1393 int _lcs = lcslen(word, gl.c_str()); | 1437 int _lcs = lcslen(word, gl.c_str()); |
1394 | 1438 |
1395 // same characters with different casing | 1439 // same characters with different casing |
1396 if ((n == len) && (n == _lcs)) { | 1440 if ((n == len) && (n == _lcs)) { |
1397 gscore[i] += 2000; | 1441 gscore[i] += 2000; |
1398 break; | 1442 break; |
1399 } | 1443 } |
1400 // using 2-gram instead of 3, and other weightening | 1444 // using 2-gram instead of 3, and other weightening |
1401 | 1445 |
1402 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + | 1446 w_gl.clear(); |
1403 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); | 1447 if (utf8) { |
| 1448 u8_u16(w_gl, gl); |
| 1449 re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + |
| 1450 ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); |
| 1451 } else { |
| 1452 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + |
| 1453 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); |
| 1454 } |
1404 | 1455 |
| 1456 int ngram_score, leftcommon_score; |
| 1457 if (utf8) { |
| 1458 ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low); |
| 1459 leftcommon_score = leftcommonsubstring(w_word, w_gl); |
| 1460 } else { |
| 1461 ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low); |
| 1462 leftcommon_score = leftcommonsubstring(word, gl.c_str()); |
| 1463 } |
1405 gscore[i] = | 1464 gscore[i] = |
1406 // length of longest common subsequent minus length difference | 1465 // length of longest common subsequent minus length difference |
1407 2 * _lcs - abs((int)(n - len)) + | 1466 2 * _lcs - abs((int)(n - len)) + |
1408 // weight length of the left common substring | 1467 // weight length of the left common substring |
1409 leftcommonsubstring(word, gl.c_str()) + | 1468 leftcommon_score + |
1410 // weight equal character positions | 1469 // weight equal character positions |
1411 (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap) | 1470 (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap) |
1412 ? 1 | 1471 ? 1 |
1413 : 0) + | 1472 : 0) + |
1414 // swap character (not neighboring) | 1473 // swap character (not neighboring) |
1415 ((is_swap) ? 10 : 0) + | 1474 ((is_swap) ? 10 : 0) + |
1416 // ngram | 1475 // ngram |
1417 ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) + | 1476 ngram_score + |
1418 // weighted ngrams | 1477 // weighted ngrams |
1419 re + | 1478 re + |
1420 // different limit for dictionaries with PHONE rules | 1479 // different limit for dictionaries with PHONE rules |
1421 (ph ? (re < len * fact ? -1000 : 0) | 1480 (ph ? (re < len * fact ? -1000 : 0) |
1422 : (re < (n + len) * fact ? -1000 : 0)); | 1481 : (re < (n + len) * fact ? -1000 : 0)); |
1423 } | 1482 } |
1424 } | 1483 } |
1425 | 1484 |
1426 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); | 1485 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); |
1427 | 1486 |
1428 // phonetic version | 1487 // phonetic version |
1429 if (ph) | 1488 if (ph) |
1430 for (int i = 0; i < MAX_ROOTS; i++) { | 1489 for (int i = 0; i < MAX_ROOTS; i++) { |
1431 if (rootsphon[i]) { | 1490 if (rootsphon[i]) { |
1432 // lowering rootphon[i] | 1491 // lowering rootphon[i] |
1433 std::string gl; | 1492 std::string gl; |
1434 int len; | 1493 int len; |
| 1494 w_gl.clear(); |
1435 if (utf8) { | 1495 if (utf8) { |
1436 std::vector<w_char> _w; | 1496 len = u8_u16(w_gl, rootsphon[i]); |
1437 len = u8_u16(_w, rootsphon[i]); | 1497 mkallsmall_utf(w_gl, langnum); |
1438 mkallsmall_utf(_w, langnum); | 1498 u16_u8(gl, w_gl); |
1439 u16_u8(gl, _w); | |
1440 } else { | 1499 } else { |
1441 gl.assign(rootsphon[i]); | 1500 gl.assign(rootsphon[i]); |
1442 if (!nonbmp) | 1501 if (!nonbmp) |
1443 mkallsmall(gl, csconv); | 1502 mkallsmall(gl, csconv); |
1444 len = strlen(rootsphon[i]); | 1503 len = strlen(rootsphon[i]); |
1445 } | 1504 } |
1446 | 1505 |
| 1506 // weight length of the left common substring |
| 1507 int leftcommon_score; |
| 1508 if (utf8) |
| 1509 leftcommon_score = leftcommonsubstring(w_word, w_gl); |
| 1510 else |
| 1511 leftcommon_score = leftcommonsubstring(word, gl.c_str()); |
1447 // heuristic weigthing of ngram scores | 1512 // heuristic weigthing of ngram scores |
1448 scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) + | 1513 scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) + |
1449 // weight length of the left common substring | 1514 leftcommon_score; |
1450 leftcommonsubstring(word, gl.c_str()); | |
1451 } | 1515 } |
1452 } | 1516 } |
1453 | 1517 |
1454 if (ph) | 1518 if (ph) |
1455 bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); | 1519 bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); |
1456 | 1520 |
1457 // copy over | 1521 // copy over |
1458 size_t oldns = wlst.size(); | 1522 size_t oldns = wlst.size(); |
1459 | 1523 |
1460 int same = 0; | 1524 int same = 0; |
(...skipping 384 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1845 if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX)) | 1909 if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX)) |
1846 break; | 1910 break; |
1847 | 1911 |
1848 newpattern.assign(pattern); | 1912 newpattern.assign(pattern); |
1849 mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX); | 1913 mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX); |
1850 pattern = newpattern.c_str(); | 1914 pattern = newpattern.c_str(); |
1851 } | 1915 } |
1852 return result2; | 1916 return result2; |
1853 } | 1917 } |
1854 | 1918 |
1855 // generate an n-gram score comparing s1 and s2 | 1919 // generate an n-gram score comparing s1 and s2, UTF16 version |
1856 int SuggestMgr::ngram(int n, | 1920 int SuggestMgr::ngram(int n, |
1857 const std::string& s1, | 1921 const std::vector<w_char>& su1, |
1858 const std::string& s2, | 1922 const std::vector<w_char>& su2, |
1859 int opt) { | 1923 int opt) { |
1860 int nscore = 0; | 1924 int nscore = 0; |
1861 int ns; | 1925 int ns; |
1862 int l1; | 1926 int l1; |
1863 int l2; | 1927 int l2; |
1864 int test = 0; | 1928 int test = 0; |
1865 | 1929 |
1866 if (utf8) { | 1930 l1 = su1.size(); |
1867 std::vector<w_char> su1; | 1931 l2 = su2.size(); |
1868 std::vector<w_char> su2; | 1932 if (l2 == 0) |
1869 l1 = u8_u16(su1, s1); | 1933 return 0; |
1870 l2 = u8_u16(su2, s2); | 1934 // lowering dictionary word |
1871 if ((l2 <= 0) || (l1 == -1)) | 1935 const std::vector<w_char>* p_su2 = &su2; |
1872 return 0; | 1936 std::vector<w_char> su2_copy; |
1873 // lowering dictionary word | 1937 if (opt & NGRAM_LOWERING) { |
1874 if (opt & NGRAM_LOWERING) | 1938 su2_copy = su2; |
1875 mkallsmall_utf(su2, langnum); | 1939 mkallsmall_utf(su2_copy, langnum); |
1876 for (int j = 1; j <= n; j++) { | 1940 p_su2 = &su2_copy; |
1877 ns = 0; | 1941 } |
1878 for (int i = 0; i <= (l1 - j); i++) { | 1942 for (int j = 1; j <= n; j++) { |
1879 int k = 0; | 1943 ns = 0; |
1880 for (int l = 0; l <= (l2 - j); l++) { | 1944 for (int i = 0; i <= (l1 - j); i++) { |
1881 for (k = 0; k < j; k++) { | 1945 int k = 0; |
1882 w_char& c1 = su1[i + k]; | 1946 for (int l = 0; l <= (l2 - j); l++) { |
1883 w_char& c2 = su2[l + k]; | 1947 for (k = 0; k < j; k++) { |
1884 if ((c1.l != c2.l) || (c1.h != c2.h)) | 1948 const w_char& c1 = su1[i + k]; |
1885 break; | 1949 const w_char& c2 = (*p_su2)[l + k]; |
1886 } | 1950 if ((c1.l != c2.l) || (c1.h != c2.h)) |
1887 if (k == j) { | |
1888 ns++; | |
1889 break; | 1951 break; |
1890 } | |
1891 } | 1952 } |
1892 if (k != j && opt & NGRAM_WEIGHTED) { | 1953 if (k == j) { |
1893 ns--; | 1954 ns++; |
1894 test++; | 1955 break; |
1895 if (i == 0 || i == l1 - j) | |
1896 ns--; // side weight | |
1897 } | 1956 } |
1898 } | 1957 } |
1899 nscore = nscore + ns; | 1958 if (k != j && opt & NGRAM_WEIGHTED) { |
1900 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) | 1959 ns--; |
1901 break; | 1960 test++; |
| 1961 if (i == 0 || i == l1 - j) |
| 1962 ns--; // side weight |
| 1963 } |
1902 } | 1964 } |
1903 } else { | 1965 nscore = nscore + ns; |
1904 l2 = s2.size(); | 1966 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) |
1905 if (l2 == 0) | 1967 break; |
1906 return 0; | |
1907 l1 = s1.size(); | |
1908 std::string t(s2); | |
1909 if (opt & NGRAM_LOWERING) | |
1910 mkallsmall(t, csconv); | |
1911 for (int j = 1; j <= n; j++) { | |
1912 ns = 0; | |
1913 for (int i = 0; i <= (l1 - j); i++) { | |
1914 //t is haystack, s1[i..i+j) is needle | |
1915 if (t.find(s1.c_str()+i, 0, j) != std::string::npos) { | |
1916 ns++; | |
1917 } else if (opt & NGRAM_WEIGHTED) { | |
1918 ns--; | |
1919 test++; | |
1920 if (i == 0 || i == l1 - j) | |
1921 ns--; // side weight | |
1922 } | |
1923 } | |
1924 nscore = nscore + ns; | |
1925 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) | |
1926 break; | |
1927 } | |
1928 } | 1968 } |
1929 | 1969 |
1930 ns = 0; | 1970 ns = 0; |
1931 if (opt & NGRAM_LONGER_WORSE) | 1971 if (opt & NGRAM_LONGER_WORSE) |
1932 ns = (l2 - l1) - 2; | 1972 ns = (l2 - l1) - 2; |
1933 if (opt & NGRAM_ANY_MISMATCH) | 1973 if (opt & NGRAM_ANY_MISMATCH) |
1934 ns = abs(l2 - l1) - 2; | 1974 ns = abs(l2 - l1) - 2; |
1935 ns = (nscore - ((ns > 0) ? ns : 0)); | 1975 ns = (nscore - ((ns > 0) ? ns : 0)); |
1936 return ns; | 1976 return ns; |
1937 } | 1977 } |
1938 | 1978 |
1939 // length of the left common substring of s1 and (decapitalised) s2 | 1979 // generate an n-gram score comparing s1 and s2, non-UTF16 version |
1940 int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) { | 1980 int SuggestMgr::ngram(int n, |
1941 if (utf8) { | 1981 const std::string& s1, |
1942 std::vector<w_char> su1; | 1982 const std::string& s2, |
1943 std::vector<w_char> su2; | 1983 int opt) { |
1944 int l1 = u8_u16(su1, s1); | 1984 int nscore = 0; |
1945 int l2 = u8_u16(su2, s2); | 1985 int ns; |
1946 // decapitalize dictionary word | 1986 int l1; |
1947 if (complexprefixes) { | 1987 int l2; |
1948 if (su1[l1 - 1] == su2[l2 - 1]) | 1988 int test = 0; |
1949 return 1; | 1989 |
1950 } else { | 1990 l2 = s2.size(); |
1951 unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; | 1991 if (l2 == 0) |
1952 unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; | 1992 return 0; |
1953 if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) | 1993 l1 = s1.size(); |
1954 return 0; | 1994 std::string t(s2); |
1955 int i; | 1995 if (opt & NGRAM_LOWERING) |
1956 for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && | 1996 mkallsmall(t, csconv); |
1957 (su1[i].h == su2[i].h); | 1997 for (int j = 1; j <= n; j++) { |
1958 i++) | 1998 ns = 0; |
1959 ; | 1999 for (int i = 0; i <= (l1 - j); i++) { |
1960 return i; | 2000 //t is haystack, s1[i..i+j) is needle |
| 2001 if (t.find(s1.c_str()+i, 0, j) != std::string::npos) { |
| 2002 ns++; |
| 2003 } else if (opt & NGRAM_WEIGHTED) { |
| 2004 ns--; |
| 2005 test++; |
| 2006 if (i == 0 || i == l1 - j) |
| 2007 ns--; // side weight |
| 2008 } |
1961 } | 2009 } |
| 2010 nscore = nscore + ns; |
| 2011 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) |
| 2012 break; |
| 2013 } |
| 2014 |
| 2015 ns = 0; |
| 2016 if (opt & NGRAM_LONGER_WORSE) |
| 2017 ns = (l2 - l1) - 2; |
| 2018 if (opt & NGRAM_ANY_MISMATCH) |
| 2019 ns = abs(l2 - l1) - 2; |
| 2020 ns = (nscore - ((ns > 0) ? ns : 0)); |
| 2021 return ns; |
| 2022 } |
| 2023 |
| 2024 // length of the left common substring of s1 and (decapitalised) s2, UTF version |
| 2025 int SuggestMgr::leftcommonsubstring( |
| 2026 const std::vector<w_char>& su1, |
| 2027 const std::vector<w_char>& su2) { |
| 2028 int l1 = su1.size(); |
| 2029 int l2 = su2.size(); |
| 2030 // decapitalize dictionary word |
| 2031 if (complexprefixes) { |
| 2032 if (su1[l1 - 1] == su2[l2 - 1]) |
| 2033 return 1; |
1962 } else { | 2034 } else { |
1963 if (complexprefixes) { | 2035 unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; |
1964 int l1 = strlen(s1); | 2036 unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; |
1965 int l2 = strlen(s2); | 2037 if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) |
1966 if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) | 2038 return 0; |
1967 return 1; | 2039 int i; |
1968 } else if (csconv) { | 2040 for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && |
1969 const char* olds = s1; | 2041 (su1[i].h == su2[i].h); |
1970 // decapitalise dictionary word | 2042 i++) |
1971 if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) | 2043 ; |
1972 return 0; | 2044 return i; |
1973 do { | |
1974 s1++; | |
1975 s2++; | |
1976 } while ((*s1 == *s2) && (*s1 != '\0')); | |
1977 return (int)(s1 - olds); | |
1978 } | |
1979 } | 2045 } |
1980 return 0; | 2046 return 0; |
1981 } | 2047 } |
| 2048 |
| 2049 // length of the left common substring of s1 and (decapitalised) s2, non-UTF |
| 2050 int SuggestMgr::leftcommonsubstring( |
| 2051 const char* s1, |
| 2052 const char* s2) { |
| 2053 if (complexprefixes) { |
| 2054 int l1 = strlen(s1); |
| 2055 int l2 = strlen(s2); |
| 2056 if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) |
| 2057 return 1; |
| 2058 } else if (csconv) { |
| 2059 const char* olds = s1; |
| 2060 // decapitalise dictionary word |
| 2061 if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) |
| 2062 return 0; |
| 2063 do { |
| 2064 s1++; |
| 2065 s2++; |
| 2066 } while ((*s1 == *s2) && (*s1 != '\0')); |
| 2067 return (int)(s1 - olds); |
| 2068 } |
| 2069 return 0; |
| 2070 } |
1982 | 2071 |
1983 int SuggestMgr::commoncharacterpositions(const char* s1, | 2072 int SuggestMgr::commoncharacterpositions(const char* s1, |
1984 const char* s2, | 2073 const char* s2, |
1985 int* is_swap) { | 2074 int* is_swap) { |
1986 int num = 0; | 2075 int num = 0; |
1987 int diff = 0; | 2076 int diff = 0; |
1988 int diffpos[2]; | 2077 int diffpos[2]; |
1989 *is_swap = 0; | 2078 *is_swap = 0; |
1990 if (utf8) { | 2079 if (utf8) { |
1991 std::vector<w_char> su1; | 2080 std::vector<w_char> su1; |
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2150 } else | 2239 } else |
2151 j--; | 2240 j--; |
2152 } | 2241 } |
2153 free(result); | 2242 free(result); |
2154 return len; | 2243 return len; |
2155 } | 2244 } |
2156 | 2245 |
2157 int SuggestMgr::lcslen(const std::string& s, const std::string& s2) { | 2246 int SuggestMgr::lcslen(const std::string& s, const std::string& s2) { |
2158 return lcslen(s.c_str(), s2.c_str()); | 2247 return lcslen(s.c_str(), s2.c_str()); |
2159 } | 2248 } |
OLD | NEW |