Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(314)

Side by Side Diff: third_party/hunspell/src/hunspell/suggestmgr.cxx

Issue 2587363003: [spellcheck] Updated Hunspell to 1.6.0 (Closed)
Patch Set: Fix README symlink Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* ***** BEGIN LICENSE BLOCK ***** 1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 * 3 *
4 * The contents of this file are subject to the Mozilla Public License Version 4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with 5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at 6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/ 7 * http://www.mozilla.org/MPL/
8 * 8 *
9 * Software distributed under the License is distributed on an "AS IS" basis, 9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
(...skipping 1156 matching lines...) Expand 10 before | Expand all | Expand 10 after
1167 } 1167 }
1168 1168
1169 struct hentry* hp = NULL; 1169 struct hentry* hp = NULL;
1170 int col = -1; 1170 int col = -1;
1171 #ifdef HUNSPELL_CHROME_CLIENT 1171 #ifdef HUNSPELL_CHROME_CLIENT
1172 ScopedHashEntryFactory hash_entry_factory; 1172 ScopedHashEntryFactory hash_entry_factory;
1173 #endif 1173 #endif
1174 phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; 1174 phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
1175 std::string target; 1175 std::string target;
1176 std::string candidate; 1176 std::string candidate;
1177 std::vector<w_char> w_candidate;
1177 if (ph) { 1178 if (ph) {
1178 if (utf8) { 1179 if (utf8) {
1179 std::vector<w_char> _w; 1180 u8_u16(w_candidate, word);
1180 u8_u16(_w, word); 1181 mkallcap_utf(w_candidate, langnum);
1181 mkallcap_utf(_w, langnum); 1182 u16_u8(candidate, w_candidate);
1182 u16_u8(candidate, _w);
1183 } else { 1183 } else {
1184 candidate.assign(word); 1184 candidate.assign(word);
1185 if (!nonbmp) 1185 if (!nonbmp)
1186 mkallcap(candidate, csconv); 1186 mkallcap(candidate, csconv);
1187 } 1187 }
1188 target = phonet(candidate, *ph); // XXX phonet() is 8-bit (nc, not n) 1188 target = phonet(candidate, *ph); // XXX phonet() is 8-bit (nc, not n)
1189 } 1189 }
1190 1190
1191 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL; 1191 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;
1192 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL; 1192 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL;
1193 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; 1193 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;
1194 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL; 1194 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;
1195 1195
1196 std::vector<w_char> w_word, w_target;
1197 if (utf8) {
1198 u8_u16(w_word, word);
1199 u8_u16(w_target, target);
1200 }
1201
1202 std::vector<w_char> w_entry;
1203 std::string f;
1204 std::vector<w_char> w_f;
1205 std::vector<w_char> w_target2;
1206
1196 for (size_t i = 0; i < rHMgr.size(); ++i) { 1207 for (size_t i = 0; i < rHMgr.size(); ++i) {
1197 while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { 1208 while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
1198 if ((hp->astr) && (pAMgr) && 1209 if ((hp->astr) && (pAMgr) &&
1199 (TESTAFF(hp->astr, forbiddenword, hp->alen) || 1210 (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
1200 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || 1211 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
1201 TESTAFF(hp->astr, nosuggest, hp->alen) || 1212 TESTAFF(hp->astr, nosuggest, hp->alen) ||
1202 TESTAFF(hp->astr, nongramsuggest, hp->alen) || 1213 TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
1203 TESTAFF(hp->astr, onlyincompound, hp->alen))) 1214 TESTAFF(hp->astr, onlyincompound, hp->alen)))
1204 continue; 1215 continue;
1205 1216
1206 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + 1217 if (utf8) {
1207 leftcommonsubstring(word, HENTRY_WORD(hp)); 1218 w_entry.clear();
1219 u8_u16(w_entry, HENTRY_WORD(hp));
1220 sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +
1221 leftcommonsubstring(w_word, w_entry);
1222 } else {
1223 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
1224 leftcommonsubstring(word, HENTRY_WORD(hp));
1225 }
1208 1226
1209 // check special pronounciation 1227 // check special pronounciation
1210 std::string f; 1228 f.clear();
1211 if ((hp->var & H_OPT_PHON) && 1229 if ((hp->var & H_OPT_PHON) &&
1212 copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { 1230 copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
1213 int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + 1231 int sc2;
1214 +leftcommonsubstring(word, f.c_str()); 1232 if (utf8) {
1233 w_f.clear();
1234 u8_u16(w_f, f.c_str());
1235 sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
1236 leftcommonsubstring(w_word, w_f);
1237 } else {
1238 sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
1239 leftcommonsubstring(word, f.c_str());
1240 }
1215 if (sc2 > sc) 1241 if (sc2 > sc)
1216 sc = sc2; 1242 sc = sc2;
1217 } 1243 }
1218 1244
1219 int scphon = -20000; 1245 int scphon = -20000;
1220 if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) { 1246 if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {
1221 if (utf8) { 1247 if (utf8) {
1222 std::vector<w_char> _w; 1248 w_candidate.clear();
1223 u8_u16(_w, HENTRY_WORD(hp)); 1249 u8_u16(w_candidate, HENTRY_WORD(hp));
1224 mkallcap_utf(_w, langnum); 1250 mkallcap_utf(w_candidate, langnum);
1225 u16_u8(candidate, _w); 1251 u16_u8(candidate, w_candidate);
1226 } else { 1252 } else {
1227 candidate.assign(HENTRY_WORD(hp)); 1253 candidate = HENTRY_WORD(hp);
1228 mkallcap(candidate, csconv); 1254 mkallcap(candidate, csconv);
1229 } 1255 }
1230 std::string target2 = phonet(candidate, *ph); 1256 std::string target2 = phonet(candidate, *ph);
1231 scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); 1257 w_target2.clear();
1258 if (utf8) {
1259 u8_u16(w_target2, target2.c_str());
1260 scphon = 2 * ngram(3, w_target, w_target2,
1261 NGRAM_LONGER_WORSE);
1262 } else {
1263 scphon = 2 * ngram(3, target, target2,
1264 NGRAM_LONGER_WORSE);
1265 }
1232 } 1266 }
1233 1267
1234 if (sc > scores[lp]) { 1268 if (sc > scores[lp]) {
1235 scores[lp] = sc; 1269 scores[lp] = sc;
1236 #ifdef HUNSPELL_CHROME_CLIENT 1270 #ifdef HUNSPELL_CHROME_CLIENT
1237 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp); 1271 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp);
1238 #else 1272 #else
1239 roots[lp] = hp; 1273 roots[lp] = hp;
1240 #endif 1274 #endif
1241 lval = sc; 1275 lval = sc;
(...skipping 13 matching lines...) Expand all
1255 lpphon = j; 1289 lpphon = j;
1256 lval = scoresphon[j]; 1290 lval = scoresphon[j];
1257 } 1291 }
1258 } 1292 }
1259 } 1293 }
1260 } 1294 }
1261 1295
1262 // find minimum threshold for a passable suggestion 1296 // find minimum threshold for a passable suggestion
1263 // mangle original word three differnt ways 1297 // mangle original word three differnt ways
1264 // and score them to generate a minimum acceptable score 1298 // and score them to generate a minimum acceptable score
1299 std::vector<w_char> w_mw;
1265 int thresh = 0; 1300 int thresh = 0;
1266 for (int sp = 1; sp < 4; sp++) { 1301 for (int sp = 1; sp < 4; sp++) {
1267 if (utf8) { 1302 if (utf8) {
1268 u8_u16(u8, word); 1303 w_mw = w_word;
1269 for (int k = sp; k < n; k += 4) { 1304 for (int k = sp; k < n; k += 4) {
1270 u8[k].l = '*'; 1305 w_mw[k].l = '*';
1271 u8[k].h = 0; 1306 w_mw[k].h = 0;
1272 } 1307 }
1273 std::string mw; 1308 thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);
1274 u16_u8(mw, u8);
1275 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
1276 } else { 1309 } else {
1277 std::string mw(word); 1310 std::string mw = word;
1278 for (int k = sp; k < n; k += 4) 1311 for (int k = sp; k < n; k += 4)
1279 mw[k] = '*'; 1312 mw[k] = '*';
1280 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); 1313 thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
1281 } 1314 }
1282 } 1315 }
1283 thresh = thresh / 3; 1316 thresh = thresh / 3;
1284 thresh--; 1317 thresh--;
1285 1318
1286 // now expand affixes on each of these root words and 1319 // now expand affixes on each of these root words and
1287 // and use length adjusted ngram scores to select 1320 // and use length adjusted ngram scores to select
1288 // possible suggestions 1321 // possible suggestions
1289 char* guess[MAX_GUESS]; 1322 char* guess[MAX_GUESS];
1290 char* guessorig[MAX_GUESS]; 1323 char* guessorig[MAX_GUESS];
1291 int gscore[MAX_GUESS]; 1324 int gscore[MAX_GUESS];
1292 for (int i = 0; i < MAX_GUESS; i++) { 1325 for (int i = 0; i < MAX_GUESS; i++) {
1293 guess[i] = NULL; 1326 guess[i] = NULL;
1294 guessorig[i] = NULL; 1327 guessorig[i] = NULL;
1295 gscore[i] = -100 * i; 1328 gscore[i] = -100 * i;
1296 } 1329 }
1297 1330
1298 lp = MAX_GUESS - 1; 1331 lp = MAX_GUESS - 1;
1299 1332
1300 struct guessword* glst; 1333 struct guessword* glst;
1301 glst = (struct guessword*)calloc(MAX_WORDS, sizeof(struct guessword)); 1334 glst = (struct guessword*)calloc(MAX_WORDS, sizeof(struct guessword));
1302 if (!glst) { 1335 if (!glst) {
1303 if (nonbmp) 1336 if (nonbmp)
1304 utf8 = 1; 1337 utf8 = 1;
1305 return; 1338 return;
1306 } 1339 }
1307 1340
1341 std::vector<w_char> w_glst_word;
1308 for (int i = 0; i < MAX_ROOTS; i++) { 1342 for (int i = 0; i < MAX_ROOTS; i++) {
1309 if (roots[i]) { 1343 if (roots[i]) {
1310 struct hentry* rp = roots[i]; 1344 struct hentry* rp = roots[i];
1311 1345
1312 std::string f; 1346 f.clear();
1313 const char *field = NULL; 1347 const char *field = NULL;
1314 if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON)) 1348 if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))
1315 field = f.c_str(); 1349 field = f.c_str();
1316 int nw = pAMgr->expand_rootword( 1350 int nw = pAMgr->expand_rootword(
1317 glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word, 1351 glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word,
1318 nc, field); 1352 nc, field);
1319 1353
1320 for (int k = 0; k < nw; k++) { 1354 for (int k = 0; k < nw; k++) {
1321 sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) + 1355 if (utf8) {
1322 leftcommonsubstring(word, glst[k].word); 1356 w_glst_word.clear();
1357 u8_u16(w_glst_word, glst[k].word);
1358 sc = ngram(n, w_word, w_glst_word,
1359 NGRAM_ANY_MISMATCH + low) +
1360 leftcommonsubstring(w_word, w_glst_word);
1361 } else {
1362 sc = ngram(n, word, glst[k].word,
1363 NGRAM_ANY_MISMATCH + low) +
1364 leftcommonsubstring(word, glst[k].word);
1365 }
1323 1366
1324 if (sc > thresh) { 1367 if (sc > thresh) {
1325 if (sc > gscore[lp]) { 1368 if (sc > gscore[lp]) {
1326 if (guess[lp]) { 1369 if (guess[lp]) {
1327 free(guess[lp]); 1370 free(guess[lp]);
1328 if (guessorig[lp]) { 1371 if (guessorig[lp]) {
1329 free(guessorig[lp]); 1372 free(guessorig[lp]);
1330 guessorig[lp] = NULL; 1373 guessorig[lp] = NULL;
1331 } 1374 }
1332 } 1375 }
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
1366 1409
1367 int is_swap = 0; 1410 int is_swap = 0;
1368 int re = 0; 1411 int re = 0;
1369 double fact = 1.0; 1412 double fact = 1.0;
1370 if (pAMgr) { 1413 if (pAMgr) {
1371 int maxd = pAMgr->get_maxdiff(); 1414 int maxd = pAMgr->get_maxdiff();
1372 if (maxd >= 0) 1415 if (maxd >= 0)
1373 fact = (10.0 - maxd) / 5.0; 1416 fact = (10.0 - maxd) / 5.0;
1374 } 1417 }
1375 1418
1419 std::vector<w_char> w_gl;
1376 for (int i = 0; i < MAX_GUESS; i++) { 1420 for (int i = 0; i < MAX_GUESS; i++) {
1377 if (guess[i]) { 1421 if (guess[i]) {
1378 // lowering guess[i] 1422 // lowering guess[i]
1379 std::string gl; 1423 std::string gl;
1380 int len; 1424 int len;
1381 if (utf8) { 1425 if (utf8) {
1382 std::vector<w_char> _w; 1426 w_gl.clear();
1383 len = u8_u16(_w, guess[i]); 1427 len = u8_u16(w_gl, guess[i]);
1384 mkallsmall_utf(_w, langnum); 1428 mkallsmall_utf(w_gl, langnum);
1385 u16_u8(gl, _w); 1429 u16_u8(gl, w_gl);
1386 } else { 1430 } else {
1387 gl.assign(guess[i]); 1431 gl.assign(guess[i]);
1388 if (!nonbmp) 1432 if (!nonbmp)
1389 mkallsmall(gl, csconv); 1433 mkallsmall(gl, csconv);
1390 len = strlen(guess[i]); 1434 len = strlen(guess[i]);
1391 } 1435 }
1392 1436
1393 int _lcs = lcslen(word, gl.c_str()); 1437 int _lcs = lcslen(word, gl.c_str());
1394 1438
1395 // same characters with different casing 1439 // same characters with different casing
1396 if ((n == len) && (n == _lcs)) { 1440 if ((n == len) && (n == _lcs)) {
1397 gscore[i] += 2000; 1441 gscore[i] += 2000;
1398 break; 1442 break;
1399 } 1443 }
1400 // using 2-gram instead of 3, and other weightening 1444 // using 2-gram instead of 3, and other weightening
1401 1445
1402 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + 1446 w_gl.clear();
1403 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); 1447 if (utf8) {
1448 u8_u16(w_gl, gl);
1449 re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
1450 ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
1451 } else {
1452 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
1453 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
1454 }
1404 1455
1456 int ngram_score, leftcommon_score;
1457 if (utf8) {
1458 ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);
1459 leftcommon_score = leftcommonsubstring(w_word, w_gl);
1460 } else {
1461 ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);
1462 leftcommon_score = leftcommonsubstring(word, gl.c_str());
1463 }
1405 gscore[i] = 1464 gscore[i] =
1406 // length of longest common subsequent minus length difference 1465 // length of longest common subsequent minus length difference
1407 2 * _lcs - abs((int)(n - len)) + 1466 2 * _lcs - abs((int)(n - len)) +
1408 // weight length of the left common substring 1467 // weight length of the left common substring
1409 leftcommonsubstring(word, gl.c_str()) + 1468 leftcommon_score +
1410 // weight equal character positions 1469 // weight equal character positions
1411 (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap) 1470 (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap)
1412 ? 1 1471 ? 1
1413 : 0) + 1472 : 0) +
1414 // swap character (not neighboring) 1473 // swap character (not neighboring)
1415 ((is_swap) ? 10 : 0) + 1474 ((is_swap) ? 10 : 0) +
1416 // ngram 1475 // ngram
1417 ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) + 1476 ngram_score +
1418 // weighted ngrams 1477 // weighted ngrams
1419 re + 1478 re +
1420 // different limit for dictionaries with PHONE rules 1479 // different limit for dictionaries with PHONE rules
1421 (ph ? (re < len * fact ? -1000 : 0) 1480 (ph ? (re < len * fact ? -1000 : 0)
1422 : (re < (n + len) * fact ? -1000 : 0)); 1481 : (re < (n + len) * fact ? -1000 : 0));
1423 } 1482 }
1424 } 1483 }
1425 1484
1426 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); 1485 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1427 1486
1428 // phonetic version 1487 // phonetic version
1429 if (ph) 1488 if (ph)
1430 for (int i = 0; i < MAX_ROOTS; i++) { 1489 for (int i = 0; i < MAX_ROOTS; i++) {
1431 if (rootsphon[i]) { 1490 if (rootsphon[i]) {
1432 // lowering rootphon[i] 1491 // lowering rootphon[i]
1433 std::string gl; 1492 std::string gl;
1434 int len; 1493 int len;
1494 w_gl.clear();
1435 if (utf8) { 1495 if (utf8) {
1436 std::vector<w_char> _w; 1496 len = u8_u16(w_gl, rootsphon[i]);
1437 len = u8_u16(_w, rootsphon[i]); 1497 mkallsmall_utf(w_gl, langnum);
1438 mkallsmall_utf(_w, langnum); 1498 u16_u8(gl, w_gl);
1439 u16_u8(gl, _w);
1440 } else { 1499 } else {
1441 gl.assign(rootsphon[i]); 1500 gl.assign(rootsphon[i]);
1442 if (!nonbmp) 1501 if (!nonbmp)
1443 mkallsmall(gl, csconv); 1502 mkallsmall(gl, csconv);
1444 len = strlen(rootsphon[i]); 1503 len = strlen(rootsphon[i]);
1445 } 1504 }
1446 1505
1506 // weight length of the left common substring
1507 int leftcommon_score;
1508 if (utf8)
1509 leftcommon_score = leftcommonsubstring(w_word, w_gl);
1510 else
1511 leftcommon_score = leftcommonsubstring(word, gl.c_str());
1447 // heuristic weigthing of ngram scores 1512 // heuristic weigthing of ngram scores
1448 scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) + 1513 scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) +
1449 // weight length of the left common substring 1514 leftcommon_score;
1450 leftcommonsubstring(word, gl.c_str());
1451 } 1515 }
1452 } 1516 }
1453 1517
1454 if (ph) 1518 if (ph)
1455 bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); 1519 bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1456 1520
1457 // copy over 1521 // copy over
1458 size_t oldns = wlst.size(); 1522 size_t oldns = wlst.size();
1459 1523
1460 int same = 0; 1524 int same = 0;
(...skipping 384 matching lines...) Expand 10 before | Expand all | Expand 10 after
1845 if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX)) 1909 if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX))
1846 break; 1910 break;
1847 1911
1848 newpattern.assign(pattern); 1912 newpattern.assign(pattern);
1849 mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX); 1913 mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX);
1850 pattern = newpattern.c_str(); 1914 pattern = newpattern.c_str();
1851 } 1915 }
1852 return result2; 1916 return result2;
1853 } 1917 }
1854 1918
1855 // generate an n-gram score comparing s1 and s2 1919 // generate an n-gram score comparing s1 and s2, UTF16 version
1856 int SuggestMgr::ngram(int n, 1920 int SuggestMgr::ngram(int n,
1857 const std::string& s1, 1921 const std::vector<w_char>& su1,
1858 const std::string& s2, 1922 const std::vector<w_char>& su2,
1859 int opt) { 1923 int opt) {
1860 int nscore = 0; 1924 int nscore = 0;
1861 int ns; 1925 int ns;
1862 int l1; 1926 int l1;
1863 int l2; 1927 int l2;
1864 int test = 0; 1928 int test = 0;
1865 1929
1866 if (utf8) { 1930 l1 = su1.size();
1867 std::vector<w_char> su1; 1931 l2 = su2.size();
1868 std::vector<w_char> su2; 1932 if (l2 == 0)
1869 l1 = u8_u16(su1, s1); 1933 return 0;
1870 l2 = u8_u16(su2, s2); 1934 // lowering dictionary word
1871 if ((l2 <= 0) || (l1 == -1)) 1935 const std::vector<w_char>* p_su2 = &su2;
1872 return 0; 1936 std::vector<w_char> su2_copy;
1873 // lowering dictionary word 1937 if (opt & NGRAM_LOWERING) {
1874 if (opt & NGRAM_LOWERING) 1938 su2_copy = su2;
1875 mkallsmall_utf(su2, langnum); 1939 mkallsmall_utf(su2_copy, langnum);
1876 for (int j = 1; j <= n; j++) { 1940 p_su2 = &su2_copy;
1877 ns = 0; 1941 }
1878 for (int i = 0; i <= (l1 - j); i++) { 1942 for (int j = 1; j <= n; j++) {
1879 int k = 0; 1943 ns = 0;
1880 for (int l = 0; l <= (l2 - j); l++) { 1944 for (int i = 0; i <= (l1 - j); i++) {
1881 for (k = 0; k < j; k++) { 1945 int k = 0;
1882 w_char& c1 = su1[i + k]; 1946 for (int l = 0; l <= (l2 - j); l++) {
1883 w_char& c2 = su2[l + k]; 1947 for (k = 0; k < j; k++) {
1884 if ((c1.l != c2.l) || (c1.h != c2.h)) 1948 const w_char& c1 = su1[i + k];
1885 break; 1949 const w_char& c2 = (*p_su2)[l + k];
1886 } 1950 if ((c1.l != c2.l) || (c1.h != c2.h))
1887 if (k == j) {
1888 ns++;
1889 break; 1951 break;
1890 }
1891 } 1952 }
1892 if (k != j && opt & NGRAM_WEIGHTED) { 1953 if (k == j) {
1893 ns--; 1954 ns++;
1894 test++; 1955 break;
1895 if (i == 0 || i == l1 - j)
1896 ns--; // side weight
1897 } 1956 }
1898 } 1957 }
1899 nscore = nscore + ns; 1958 if (k != j && opt & NGRAM_WEIGHTED) {
1900 if (ns < 2 && !(opt & NGRAM_WEIGHTED)) 1959 ns--;
1901 break; 1960 test++;
1961 if (i == 0 || i == l1 - j)
1962 ns--; // side weight
1963 }
1902 } 1964 }
1903 } else { 1965 nscore = nscore + ns;
1904 l2 = s2.size(); 1966 if (ns < 2 && !(opt & NGRAM_WEIGHTED))
1905 if (l2 == 0) 1967 break;
1906 return 0;
1907 l1 = s1.size();
1908 std::string t(s2);
1909 if (opt & NGRAM_LOWERING)
1910 mkallsmall(t, csconv);
1911 for (int j = 1; j <= n; j++) {
1912 ns = 0;
1913 for (int i = 0; i <= (l1 - j); i++) {
1914 //t is haystack, s1[i..i+j) is needle
1915 if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
1916 ns++;
1917 } else if (opt & NGRAM_WEIGHTED) {
1918 ns--;
1919 test++;
1920 if (i == 0 || i == l1 - j)
1921 ns--; // side weight
1922 }
1923 }
1924 nscore = nscore + ns;
1925 if (ns < 2 && !(opt & NGRAM_WEIGHTED))
1926 break;
1927 }
1928 } 1968 }
1929 1969
1930 ns = 0; 1970 ns = 0;
1931 if (opt & NGRAM_LONGER_WORSE) 1971 if (opt & NGRAM_LONGER_WORSE)
1932 ns = (l2 - l1) - 2; 1972 ns = (l2 - l1) - 2;
1933 if (opt & NGRAM_ANY_MISMATCH) 1973 if (opt & NGRAM_ANY_MISMATCH)
1934 ns = abs(l2 - l1) - 2; 1974 ns = abs(l2 - l1) - 2;
1935 ns = (nscore - ((ns > 0) ? ns : 0)); 1975 ns = (nscore - ((ns > 0) ? ns : 0));
1936 return ns; 1976 return ns;
1937 } 1977 }
1938 1978
1939 // length of the left common substring of s1 and (decapitalised) s2 1979 // generate an n-gram score comparing s1 and s2, non-UTF16 version
1940 int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) { 1980 int SuggestMgr::ngram(int n,
1941 if (utf8) { 1981 const std::string& s1,
1942 std::vector<w_char> su1; 1982 const std::string& s2,
1943 std::vector<w_char> su2; 1983 int opt) {
1944 int l1 = u8_u16(su1, s1); 1984 int nscore = 0;
1945 int l2 = u8_u16(su2, s2); 1985 int ns;
1946 // decapitalize dictionary word 1986 int l1;
1947 if (complexprefixes) { 1987 int l2;
1948 if (su1[l1 - 1] == su2[l2 - 1]) 1988 int test = 0;
1949 return 1; 1989
1950 } else { 1990 l2 = s2.size();
1951 unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; 1991 if (l2 == 0)
1952 unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; 1992 return 0;
1953 if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) 1993 l1 = s1.size();
1954 return 0; 1994 std::string t(s2);
1955 int i; 1995 if (opt & NGRAM_LOWERING)
1956 for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && 1996 mkallsmall(t, csconv);
1957 (su1[i].h == su2[i].h); 1997 for (int j = 1; j <= n; j++) {
1958 i++) 1998 ns = 0;
1959 ; 1999 for (int i = 0; i <= (l1 - j); i++) {
1960 return i; 2000 //t is haystack, s1[i..i+j) is needle
2001 if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
2002 ns++;
2003 } else if (opt & NGRAM_WEIGHTED) {
2004 ns--;
2005 test++;
2006 if (i == 0 || i == l1 - j)
2007 ns--; // side weight
2008 }
1961 } 2009 }
2010 nscore = nscore + ns;
2011 if (ns < 2 && !(opt & NGRAM_WEIGHTED))
2012 break;
2013 }
2014
2015 ns = 0;
2016 if (opt & NGRAM_LONGER_WORSE)
2017 ns = (l2 - l1) - 2;
2018 if (opt & NGRAM_ANY_MISMATCH)
2019 ns = abs(l2 - l1) - 2;
2020 ns = (nscore - ((ns > 0) ? ns : 0));
2021 return ns;
2022 }
2023
2024 // length of the left common substring of s1 and (decapitalised) s2, UTF version
2025 int SuggestMgr::leftcommonsubstring(
2026 const std::vector<w_char>& su1,
2027 const std::vector<w_char>& su2) {
2028 int l1 = su1.size();
2029 int l2 = su2.size();
2030 // decapitalize dictionary word
2031 if (complexprefixes) {
2032 if (su1[l1 - 1] == su2[l2 - 1])
2033 return 1;
1962 } else { 2034 } else {
1963 if (complexprefixes) { 2035 unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
1964 int l1 = strlen(s1); 2036 unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
1965 int l2 = strlen(s2); 2037 if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
1966 if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) 2038 return 0;
1967 return 1; 2039 int i;
1968 } else if (csconv) { 2040 for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
1969 const char* olds = s1; 2041 (su1[i].h == su2[i].h);
1970 // decapitalise dictionary word 2042 i++)
1971 if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) 2043 ;
1972 return 0; 2044 return i;
1973 do {
1974 s1++;
1975 s2++;
1976 } while ((*s1 == *s2) && (*s1 != '\0'));
1977 return (int)(s1 - olds);
1978 }
1979 } 2045 }
1980 return 0; 2046 return 0;
1981 } 2047 }
2048
2049 // length of the left common substring of s1 and (decapitalised) s2, non-UTF
2050 int SuggestMgr::leftcommonsubstring(
2051 const char* s1,
2052 const char* s2) {
2053 if (complexprefixes) {
2054 int l1 = strlen(s1);
2055 int l2 = strlen(s2);
2056 if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
2057 return 1;
2058 } else if (csconv) {
2059 const char* olds = s1;
2060 // decapitalise dictionary word
2061 if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))
2062 return 0;
2063 do {
2064 s1++;
2065 s2++;
2066 } while ((*s1 == *s2) && (*s1 != '\0'));
2067 return (int)(s1 - olds);
2068 }
2069 return 0;
2070 }
1982 2071
1983 int SuggestMgr::commoncharacterpositions(const char* s1, 2072 int SuggestMgr::commoncharacterpositions(const char* s1,
1984 const char* s2, 2073 const char* s2,
1985 int* is_swap) { 2074 int* is_swap) {
1986 int num = 0; 2075 int num = 0;
1987 int diff = 0; 2076 int diff = 0;
1988 int diffpos[2]; 2077 int diffpos[2];
1989 *is_swap = 0; 2078 *is_swap = 0;
1990 if (utf8) { 2079 if (utf8) {
1991 std::vector<w_char> su1; 2080 std::vector<w_char> su1;
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after
2150 } else 2239 } else
2151 j--; 2240 j--;
2152 } 2241 }
2153 free(result); 2242 free(result);
2154 return len; 2243 return len;
2155 } 2244 }
2156 2245
2157 int SuggestMgr::lcslen(const std::string& s, const std::string& s2) { 2246 int SuggestMgr::lcslen(const std::string& s, const std::string& s2) {
2158 return lcslen(s.c_str(), s2.c_str()); 2247 return lcslen(s.c_str(), s2.c_str());
2159 } 2248 }
OLDNEW
« third_party/hunspell/README.chromium ('K') | « third_party/hunspell/src/hunspell/suggestmgr.hxx ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698