third_party/hunspell/src/hunspell/suggestmgr.cxx - Issue 2587363003: [spellcheck] Updated Hunspell to 1.6.0

Side by Side Diff: third_party/hunspell/src/hunspell/suggestmgr.cxx

Issue 2587363003: [spellcheck] Updated Hunspell to 1.6.0 (Closed)

Patch Set: Fix README symlink Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /* *** BEGIN LICENSE BLOCK ***	1 /* *** BEGIN LICENSE BLOCK ***

2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1	2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1

3 *	3 *

4 * The contents of this file are subject to the Mozilla Public License Version	4 * The contents of this file are subject to the Mozilla Public License Version

5 * 1.1 (the "License"); you may not use this file except in compliance with	5 * 1.1 (the "License"); you may not use this file except in compliance with

6 * the License. You may obtain a copy of the License at	6 * the License. You may obtain a copy of the License at

7 * http://www.mozilla.org/MPL/	7 * http://www.mozilla.org/MPL/

8 *	8 *

9 * Software distributed under the License is distributed on an "AS IS" basis,	9 * Software distributed under the License is distributed on an "AS IS" basis,

10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License	10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

(...skipping 1156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1167 }	1167 }

1168	1168

1169 struct hentry* hp = NULL;	1169 struct hentry* hp = NULL;

1170 int col = -1;	1170 int col = -1;

1171 #ifdef HUNSPELL_CHROME_CLIENT	1171 #ifdef HUNSPELL_CHROME_CLIENT

1172 ScopedHashEntryFactory hash_entry_factory;	1172 ScopedHashEntryFactory hash_entry_factory;

1173 #endif	1173 #endif

1174 phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;	1174 phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;

1175 std::string target;	1175 std::string target;

1176 std::string candidate;	1176 std::string candidate;

	1177 std::vector<w_char> w_candidate;

1177 if (ph) {	1178 if (ph) {

1178 if (utf8) {	1179 if (utf8) {

1179 std::vector<w_char> _w;	1180 u8_u16(w_candidate, word);

1180 u8_u16(_w, word);	1181 mkallcap_utf(w_candidate, langnum);

1181 mkallcap_utf(_w, langnum);	1182 u16_u8(candidate, w_candidate);

1182 u16_u8(candidate, _w);

1183 } else {	1183 } else {

1184 candidate.assign(word);	1184 candidate.assign(word);

1185 if (!nonbmp)	1185 if (!nonbmp)

1186 mkallcap(candidate, csconv);	1186 mkallcap(candidate, csconv);

1187 }	1187 }

1188 target = phonet(candidate, *ph); // XXX phonet() is 8-bit (nc, not n)	1188 target = phonet(candidate, *ph); // XXX phonet() is 8-bit (nc, not n)

1189 }	1189 }

1190	1190

1191 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;	1191 FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;

1192 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL;	1192 FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL;

1193 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;	1193 FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;

1194 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;	1194 FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;

1195	1195

	1196 std::vector<w_char> w_word, w_target;

	1197 if (utf8) {

	1198 u8_u16(w_word, word);

	1199 u8_u16(w_target, target);

	1200 }

	1201

	1202 std::vector<w_char> w_entry;

	1203 std::string f;

	1204 std::vector<w_char> w_f;

	1205 std::vector<w_char> w_target2;

	1206

1196 for (size_t i = 0; i < rHMgr.size(); ++i) {	1207 for (size_t i = 0; i < rHMgr.size(); ++i) {

1197 while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {	1208 while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {

1198 if ((hp->astr) && (pAMgr) &&	1209 if ((hp->astr) && (pAMgr) &&

1199 (TESTAFF(hp->astr, forbiddenword, hp->alen) ||	1210 (TESTAFF(hp->astr, forbiddenword, hp->alen) ||

1200 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||	1211 TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||

1201 TESTAFF(hp->astr, nosuggest, hp->alen) ||	1212 TESTAFF(hp->astr, nosuggest, hp->alen) ||

1202 TESTAFF(hp->astr, nongramsuggest, hp->alen) ||	1213 TESTAFF(hp->astr, nongramsuggest, hp->alen) ||

1203 TESTAFF(hp->astr, onlyincompound, hp->alen)))	1214 TESTAFF(hp->astr, onlyincompound, hp->alen)))

1204 continue;	1215 continue;

1205	1216

1206 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +	1217 if (utf8) {

1207 leftcommonsubstring(word, HENTRY_WORD(hp));	1218 w_entry.clear();

	1219 u8_u16(w_entry, HENTRY_WORD(hp));

	1220 sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +

	1221 leftcommonsubstring(w_word, w_entry);

	1222 } else {

	1223 sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +

	1224 leftcommonsubstring(word, HENTRY_WORD(hp));

	1225 }

1208	1226

1209 // check special pronounciation	1227 // check special pronounciation

1210 std::string f;	1228 f.clear();

1211 if ((hp->var & H_OPT_PHON) &&	1229 if ((hp->var & H_OPT_PHON) &&

1212 copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {	1230 copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {

1213 int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +	1231 int sc2;

1214 +leftcommonsubstring(word, f.c_str());	1232 if (utf8) {

	1233 w_f.clear();

	1234 u8_u16(w_f, f.c_str());

	1235 sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +

	1236 leftcommonsubstring(w_word, w_f);

	1237 } else {

	1238 sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +

	1239 leftcommonsubstring(word, f.c_str());

	1240 }

1215 if (sc2 > sc)	1241 if (sc2 > sc)

1216 sc = sc2;	1242 sc = sc2;

1217 }	1243 }

1218	1244

1219 int scphon = -20000;	1245 int scphon = -20000;

1220 if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {	1246 if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {

1221 if (utf8) {	1247 if (utf8) {

1222 std::vector<w_char> _w;	1248 w_candidate.clear();

1223 u8_u16(_w, HENTRY_WORD(hp));	1249 u8_u16(w_candidate, HENTRY_WORD(hp));

1224 mkallcap_utf(_w, langnum);	1250 mkallcap_utf(w_candidate, langnum);

1225 u16_u8(candidate, _w);	1251 u16_u8(candidate, w_candidate);

1226 } else {	1252 } else {

1227 candidate.assign(HENTRY_WORD(hp));	1253 candidate = HENTRY_WORD(hp);

1228 mkallcap(candidate, csconv);	1254 mkallcap(candidate, csconv);

1229 }	1255 }

1230 std::string target2 = phonet(candidate, *ph);	1256 std::string target2 = phonet(candidate, *ph);

1231 scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);	1257 w_target2.clear();

	1258 if (utf8) {

	1259 u8_u16(w_target2, target2.c_str());

	1260 scphon = 2 * ngram(3, w_target, w_target2,

	1261 NGRAM_LONGER_WORSE);

	1262 } else {

	1263 scphon = 2 * ngram(3, target, target2,

	1264 NGRAM_LONGER_WORSE);

	1265 }

1232 }	1266 }

1233	1267

1234 if (sc > scores[lp]) {	1268 if (sc > scores[lp]) {

1235 scores[lp] = sc;	1269 scores[lp] = sc;

1236 #ifdef HUNSPELL_CHROME_CLIENT	1270 #ifdef HUNSPELL_CHROME_CLIENT

1237 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp);	1271 roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp);

1238 #else	1272 #else

1239 roots[lp] = hp;	1273 roots[lp] = hp;

1240 #endif	1274 #endif

1241 lval = sc;	1275 lval = sc;

(...skipping 13 matching lines...) Expand all Loading...
1255 lpphon = j;	1289 lpphon = j;

1256 lval = scoresphon[j];	1290 lval = scoresphon[j];

1257 }	1291 }

1258 }	1292 }

1259 }	1293 }

1260 }	1294 }

1261	1295

1262 // find minimum threshold for a passable suggestion	1296 // find minimum threshold for a passable suggestion

1263 // mangle original word three differnt ways	1297 // mangle original word three differnt ways

1264 // and score them to generate a minimum acceptable score	1298 // and score them to generate a minimum acceptable score

	1299 std::vector<w_char> w_mw;

1265 int thresh = 0;	1300 int thresh = 0;

1266 for (int sp = 1; sp < 4; sp++) {	1301 for (int sp = 1; sp < 4; sp++) {

1267 if (utf8) {	1302 if (utf8) {

1268 u8_u16(u8, word);	1303 w_mw = w_word;

1269 for (int k = sp; k < n; k += 4) {	1304 for (int k = sp; k < n; k += 4) {

1270 u8[k].l = '*';	1305 w_mw[k].l = '*';

1271 u8[k].h = 0;	1306 w_mw[k].h = 0;

1272 }	1307 }

1273 std::string mw;	1308 thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);

1274 u16_u8(mw, u8);

1275 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);

1276 } else {	1309 } else {

1277 std::string mw(word);	1310 std::string mw = word;

1278 for (int k = sp; k < n; k += 4)	1311 for (int k = sp; k < n; k += 4)

1279 mw[k] = '*';	1312 mw[k] = '*';

1280 thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);	1313 thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);

1281 }	1314 }

1282 }	1315 }

1283 thresh = thresh / 3;	1316 thresh = thresh / 3;

1284 thresh--;	1317 thresh--;

1285	1318

1286 // now expand affixes on each of these root words and	1319 // now expand affixes on each of these root words and

1287 // and use length adjusted ngram scores to select	1320 // and use length adjusted ngram scores to select

1288 // possible suggestions	1321 // possible suggestions

1289 char* guess[MAX_GUESS];	1322 char* guess[MAX_GUESS];

1290 char* guessorig[MAX_GUESS];	1323 char* guessorig[MAX_GUESS];

1291 int gscore[MAX_GUESS];	1324 int gscore[MAX_GUESS];

1292 for (int i = 0; i < MAX_GUESS; i++) {	1325 for (int i = 0; i < MAX_GUESS; i++) {

1293 guess[i] = NULL;	1326 guess[i] = NULL;

1294 guessorig[i] = NULL;	1327 guessorig[i] = NULL;

1295 gscore[i] = -100 * i;	1328 gscore[i] = -100 * i;

1296 }	1329 }

1297	1330

1298 lp = MAX_GUESS - 1;	1331 lp = MAX_GUESS - 1;

1299	1332

1300 struct guessword* glst;	1333 struct guessword* glst;

1301 glst = (struct guessword*)calloc(MAX_WORDS, sizeof(struct guessword));	1334 glst = (struct guessword*)calloc(MAX_WORDS, sizeof(struct guessword));

1302 if (!glst) {	1335 if (!glst) {

1303 if (nonbmp)	1336 if (nonbmp)

1304 utf8 = 1;	1337 utf8 = 1;

1305 return;	1338 return;

1306 }	1339 }

1307	1340

	1341 std::vector<w_char> w_glst_word;

1308 for (int i = 0; i < MAX_ROOTS; i++) {	1342 for (int i = 0; i < MAX_ROOTS; i++) {

1309 if (roots[i]) {	1343 if (roots[i]) {

1310 struct hentry* rp = roots[i];	1344 struct hentry* rp = roots[i];

1311	1345

1312 std::string f;	1346 f.clear();

1313 const char *field = NULL;	1347 const char *field = NULL;

1314 if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))	1348 if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))

1315 field = f.c_str();	1349 field = f.c_str();

1316 int nw = pAMgr->expand_rootword(	1350 int nw = pAMgr->expand_rootword(

1317 glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word,	1351 glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word,

1318 nc, field);	1352 nc, field);

1319	1353

1320 for (int k = 0; k < nw; k++) {	1354 for (int k = 0; k < nw; k++) {

1321 sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +	1355 if (utf8) {

1322 leftcommonsubstring(word, glst[k].word);	1356 w_glst_word.clear();

	1357 u8_u16(w_glst_word, glst[k].word);

	1358 sc = ngram(n, w_word, w_glst_word,

	1359 NGRAM_ANY_MISMATCH + low) +

	1360 leftcommonsubstring(w_word, w_glst_word);

	1361 } else {

	1362 sc = ngram(n, word, glst[k].word,

	1363 NGRAM_ANY_MISMATCH + low) +

	1364 leftcommonsubstring(word, glst[k].word);

	1365 }

1323	1366

1324 if (sc > thresh) {	1367 if (sc > thresh) {

1325 if (sc > gscore[lp]) {	1368 if (sc > gscore[lp]) {

1326 if (guess[lp]) {	1369 if (guess[lp]) {

1327 free(guess[lp]);	1370 free(guess[lp]);

1328 if (guessorig[lp]) {	1371 if (guessorig[lp]) {

1329 free(guessorig[lp]);	1372 free(guessorig[lp]);

1330 guessorig[lp] = NULL;	1373 guessorig[lp] = NULL;

1331 }	1374 }

1332 }	1375 }

(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1366	1409

1367 int is_swap = 0;	1410 int is_swap = 0;

1368 int re = 0;	1411 int re = 0;

1369 double fact = 1.0;	1412 double fact = 1.0;

1370 if (pAMgr) {	1413 if (pAMgr) {

1371 int maxd = pAMgr->get_maxdiff();	1414 int maxd = pAMgr->get_maxdiff();

1372 if (maxd >= 0)	1415 if (maxd >= 0)

1373 fact = (10.0 - maxd) / 5.0;	1416 fact = (10.0 - maxd) / 5.0;

1374 }	1417 }

1375	1418

	1419 std::vector<w_char> w_gl;

1376 for (int i = 0; i < MAX_GUESS; i++) {	1420 for (int i = 0; i < MAX_GUESS; i++) {

1377 if (guess[i]) {	1421 if (guess[i]) {

1378 // lowering guess[i]	1422 // lowering guess[i]

1379 std::string gl;	1423 std::string gl;

1380 int len;	1424 int len;

1381 if (utf8) {	1425 if (utf8) {

1382 std::vector<w_char> _w;	1426 w_gl.clear();

1383 len = u8_u16(_w, guess[i]);	1427 len = u8_u16(w_gl, guess[i]);

1384 mkallsmall_utf(_w, langnum);	1428 mkallsmall_utf(w_gl, langnum);

1385 u16_u8(gl, _w);	1429 u16_u8(gl, w_gl);

1386 } else {	1430 } else {

1387 gl.assign(guess[i]);	1431 gl.assign(guess[i]);

1388 if (!nonbmp)	1432 if (!nonbmp)

1389 mkallsmall(gl, csconv);	1433 mkallsmall(gl, csconv);

1390 len = strlen(guess[i]);	1434 len = strlen(guess[i]);

1391 }	1435 }

1392	1436

1393 int _lcs = lcslen(word, gl.c_str());	1437 int _lcs = lcslen(word, gl.c_str());

1394	1438

1395 // same characters with different casing	1439 // same characters with different casing

1396 if ((n == len) && (n == _lcs)) {	1440 if ((n == len) && (n == _lcs)) {

1397 gscore[i] += 2000;	1441 gscore[i] += 2000;

1398 break;	1442 break;

1399 }	1443 }

1400 // using 2-gram instead of 3, and other weightening	1444 // using 2-gram instead of 3, and other weightening

1401	1445

1402 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +	1446 w_gl.clear();

1403 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);	1447 if (utf8) {

	1448 u8_u16(w_gl, gl);

	1449 re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +

	1450 ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);

	1451 } else {

	1452 re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +

	1453 ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);

	1454 }

1404	1455

	1456 int ngram_score, leftcommon_score;

	1457 if (utf8) {

	1458 ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);

	1459 leftcommon_score = leftcommonsubstring(w_word, w_gl);

	1460 } else {

	1461 ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);

	1462 leftcommon_score = leftcommonsubstring(word, gl.c_str());

	1463 }

1405 gscore[i] =	1464 gscore[i] =

1406 // length of longest common subsequent minus length difference	1465 // length of longest common subsequent minus length difference

1407 2 * _lcs - abs((int)(n - len)) +	1466 2 * _lcs - abs((int)(n - len)) +

1408 // weight length of the left common substring	1467 // weight length of the left common substring

1409 leftcommonsubstring(word, gl.c_str()) +	1468 leftcommon_score +

1410 // weight equal character positions	1469 // weight equal character positions

1411 (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap)	1470 (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap)

1412 ? 1	1471 ? 1

1413 : 0) +	1472 : 0) +

1414 // swap character (not neighboring)	1473 // swap character (not neighboring)

1415 ((is_swap) ? 10 : 0) +	1474 ((is_swap) ? 10 : 0) +

1416 // ngram	1475 // ngram

1417 ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +	1476 ngram_score +

1418 // weighted ngrams	1477 // weighted ngrams

1419 re +	1478 re +

1420 // different limit for dictionaries with PHONE rules	1479 // different limit for dictionaries with PHONE rules

1421 (ph ? (re < len * fact ? -1000 : 0)	1480 (ph ? (re < len * fact ? -1000 : 0)

1422 : (re < (n + len) * fact ? -1000 : 0));	1481 : (re < (n + len) * fact ? -1000 : 0));

1423 }	1482 }

1424 }	1483 }

1425	1484

1426 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);	1485 bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);

1427	1486

1428 // phonetic version	1487 // phonetic version

1429 if (ph)	1488 if (ph)

1430 for (int i = 0; i < MAX_ROOTS; i++) {	1489 for (int i = 0; i < MAX_ROOTS; i++) {

1431 if (rootsphon[i]) {	1490 if (rootsphon[i]) {

1432 // lowering rootphon[i]	1491 // lowering rootphon[i]

1433 std::string gl;	1492 std::string gl;

1434 int len;	1493 int len;

	1494 w_gl.clear();

1435 if (utf8) {	1495 if (utf8) {

1436 std::vector<w_char> _w;	1496 len = u8_u16(w_gl, rootsphon[i]);

1437 len = u8_u16(_w, rootsphon[i]);	1497 mkallsmall_utf(w_gl, langnum);

1438 mkallsmall_utf(_w, langnum);	1498 u16_u8(gl, w_gl);

1439 u16_u8(gl, _w);

1440 } else {	1499 } else {

1441 gl.assign(rootsphon[i]);	1500 gl.assign(rootsphon[i]);

1442 if (!nonbmp)	1501 if (!nonbmp)

1443 mkallsmall(gl, csconv);	1502 mkallsmall(gl, csconv);

1444 len = strlen(rootsphon[i]);	1503 len = strlen(rootsphon[i]);

1445 }	1504 }

1446	1505

	1506 // weight length of the left common substring

	1507 int leftcommon_score;

	1508 if (utf8)

	1509 leftcommon_score = leftcommonsubstring(w_word, w_gl);

	1510 else

	1511 leftcommon_score = leftcommonsubstring(word, gl.c_str());

1447 // heuristic weigthing of ngram scores	1512 // heuristic weigthing of ngram scores

1448 scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) +	1513 scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) +

1449 // weight length of the left common substring	1514 leftcommon_score;

1450 leftcommonsubstring(word, gl.c_str());

1451 }	1515 }

1452 }	1516 }

1453	1517

1454 if (ph)	1518 if (ph)

1455 bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);	1519 bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);

1456	1520

1457 // copy over	1521 // copy over

1458 size_t oldns = wlst.size();	1522 size_t oldns = wlst.size();

1459	1523

1460 int same = 0;	1524 int same = 0;

(...skipping 384 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1845 if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX))	1909 if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX))

1846 break;	1910 break;

1847	1911

1848 newpattern.assign(pattern);	1912 newpattern.assign(pattern);

1849 mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX);	1913 mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX);

1850 pattern = newpattern.c_str();	1914 pattern = newpattern.c_str();

1851 }	1915 }

1852 return result2;	1916 return result2;

1853 }	1917 }

1854	1918

1855 // generate an n-gram score comparing s1 and s2	1919 // generate an n-gram score comparing s1 and s2, UTF16 version

1856 int SuggestMgr::ngram(int n,	1920 int SuggestMgr::ngram(int n,

1857 const std::string& s1,	1921 const std::vector<w_char>& su1,

1858 const std::string& s2,	1922 const std::vector<w_char>& su2,

1859 int opt) {	1923 int opt) {

1860 int nscore = 0;	1924 int nscore = 0;

1861 int ns;	1925 int ns;

1862 int l1;	1926 int l1;

1863 int l2;	1927 int l2;

1864 int test = 0;	1928 int test = 0;

1865	1929

1866 if (utf8) {	1930 l1 = su1.size();

1867 std::vector<w_char> su1;	1931 l2 = su2.size();

1868 std::vector<w_char> su2;	1932 if (l2 == 0)

1869 l1 = u8_u16(su1, s1);	1933 return 0;

1870 l2 = u8_u16(su2, s2);	1934 // lowering dictionary word

1871 if ((l2 <= 0) || (l1 == -1))	1935 const std::vector<w_char>* p_su2 = &su2;

1872 return 0;	1936 std::vector<w_char> su2_copy;

1873 // lowering dictionary word	1937 if (opt & NGRAM_LOWERING) {

1874 if (opt & NGRAM_LOWERING)	1938 su2_copy = su2;

1875 mkallsmall_utf(su2, langnum);	1939 mkallsmall_utf(su2_copy, langnum);

1876 for (int j = 1; j <= n; j++) {	1940 p_su2 = &su2_copy;

1877 ns = 0;	1941 }

1878 for (int i = 0; i <= (l1 - j); i++) {	1942 for (int j = 1; j <= n; j++) {

1879 int k = 0;	1943 ns = 0;

1880 for (int l = 0; l <= (l2 - j); l++) {	1944 for (int i = 0; i <= (l1 - j); i++) {

1881 for (k = 0; k < j; k++) {	1945 int k = 0;

1882 w_char& c1 = su1[i + k];	1946 for (int l = 0; l <= (l2 - j); l++) {

1883 w_char& c2 = su2[l + k];	1947 for (k = 0; k < j; k++) {

1884 if ((c1.l != c2.l) \|\| (c1.h != c2.h))	1948 const w_char& c1 = su1[i + k];

1885 break;	1949 const w_char& c2 = (*p_su2)[l + k];

1884 if ((c1.l != c2.l) || (c1.h != c2.h))	1950 if ((c1.l != c2.l) || (c1.h != c2.h))

1887 if (k == j) {

1888 ns++;

1889 break;	1951 break;

1890 }

1891 }	1952 }

1892 if (k != j && opt & NGRAM_WEIGHTED) {	1953 if (k == j) {

1893 ns--;	1954 ns++;

1894 test++;	1955 break;

1895 if (i == 0 || i == l1 - j)

1896 ns--; // side weight

1897 }	1956 }

1898 }	1957 }

1899 nscore = nscore + ns;	1958 if (k != j && opt & NGRAM_WEIGHTED) {

1900 if (ns < 2 && !(opt & NGRAM_WEIGHTED))	1959 ns--;

1901 break;	1960 test++;

	1961 if (i == 0 || i == l1 - j)

	1962 ns--; // side weight

	1963 }

1902 }	1964 }

1903 } else {	1965 nscore = nscore + ns;

1904 l2 = s2.size();	1966 if (ns < 2 && !(opt & NGRAM_WEIGHTED))

1905 if (l2 == 0)	1967 break;

1906 return 0;

1907 l1 = s1.size();

1908 std::string t(s2);

1909 if (opt & NGRAM_LOWERING)

1910 mkallsmall(t, csconv);

1911 for (int j = 1; j <= n; j++) {

1912 ns = 0;

1913 for (int i = 0; i <= (l1 - j); i++) {

1914 //t is haystack, s1[i..i+j) is needle

1915 if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {

1916 ns++;

1917 } else if (opt & NGRAM_WEIGHTED) {

1918 ns--;

1919 test++;

1920 if (i == 0 || i == l1 - j)

1921 ns--; // side weight

1922 }

1923 }

1924 nscore = nscore + ns;

1925 if (ns < 2 && !(opt & NGRAM_WEIGHTED))

1926 break;

1927 }

1928 }	1968 }

1929	1969

1930 ns = 0;	1970 ns = 0;

1931 if (opt & NGRAM_LONGER_WORSE)	1971 if (opt & NGRAM_LONGER_WORSE)

1932 ns = (l2 - l1) - 2;	1972 ns = (l2 - l1) - 2;

1933 if (opt & NGRAM_ANY_MISMATCH)	1973 if (opt & NGRAM_ANY_MISMATCH)

1934 ns = abs(l2 - l1) - 2;	1974 ns = abs(l2 - l1) - 2;

1935 ns = (nscore - ((ns > 0) ? ns : 0));	1975 ns = (nscore - ((ns > 0) ? ns : 0));

1936 return ns;	1976 return ns;

1937 }	1977 }

1938	1978

1939 // length of the left common substring of s1 and (decapitalised) s2	1979 // generate an n-gram score comparing s1 and s2, non-UTF16 version

1940 int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) {	1980 int SuggestMgr::ngram(int n,

1941 if (utf8) {	1981 const std::string& s1,

1942 std::vector<w_char> su1;	1982 const std::string& s2,

1943 std::vector<w_char> su2;	1983 int opt) {

1944 int l1 = u8_u16(su1, s1);	1984 int nscore = 0;

1945 int l2 = u8_u16(su2, s2);	1985 int ns;

1946 // decapitalize dictionary word	1986 int l1;

1947 if (complexprefixes) {	1987 int l2;

1948 if (su1[l1 - 1] == su2[l2 - 1])	1988 int test = 0;

1949 return 1;	1989

1950 } else {	1990 l2 = s2.size();

1951 unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;	1991 if (l2 == 0)

1952 unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;	1992 return 0;

1953 if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))	1993 l1 = s1.size();

1954 return 0;	1994 std::string t(s2);

1955 int i;	1995 if (opt & NGRAM_LOWERING)

1956 for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&	1996 mkallsmall(t, csconv);

1957 (su1[i].h == su2[i].h);	1997 for (int j = 1; j <= n; j++) {

1958 i++)	1998 ns = 0;

1959 ;	1999 for (int i = 0; i <= (l1 - j); i++) {

1960 return i;	2000 //t is haystack, s1[i..i+j) is needle

	2001 if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {

	2002 ns++;

	2003 } else if (opt & NGRAM_WEIGHTED) {

	2004 ns--;

	2005 test++;

	2006 if (i == 0 || i == l1 - j)

	2007 ns--; // side weight

	2008 }

1961 }	2009 }

	2010 nscore = nscore + ns;

	2011 if (ns < 2 && !(opt & NGRAM_WEIGHTED))

	2012 break;

	2013 }

	2014

	2015 ns = 0;

	2016 if (opt & NGRAM_LONGER_WORSE)

	2017 ns = (l2 - l1) - 2;

	2018 if (opt & NGRAM_ANY_MISMATCH)

	2019 ns = abs(l2 - l1) - 2;

	2020 ns = (nscore - ((ns > 0) ? ns : 0));

	2021 return ns;

	2022 }

	2023

	2024 // length of the left common substring of s1 and (decapitalised) s2, UTF version

	2025 int SuggestMgr::leftcommonsubstring(

	2026 const std::vector<w_char>& su1,

	2027 const std::vector<w_char>& su2) {

	2028 int l1 = su1.size();

	2029 int l2 = su2.size();

	2030 // decapitalize dictionary word

	2031 if (complexprefixes) {

	2032 if (su1[l1 - 1] == su2[l2 - 1])

	2033 return 1;

1962 } else {	2034 } else {

1963 if (complexprefixes) {	2035 unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;

1964 int l1 = strlen(s1);	2036 unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;

1965 int l2 = strlen(s2);	2037 if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))

1966 if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])	2038 return 0;

1967 return 1;	2039 int i;

1968 } else if (csconv) {	2040 for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&

1969 const char* olds = s1;	2041 (su1[i].h == su2[i].h);

1970 // decapitalise dictionary word	2042 i++)

1971 if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))	2043 return 0;

1972 return 0;	2044 return i;

1973 do {

1974 s1++;

1975 s2++;

1976 } while ((*s1 == *s2) && (*s1 != '\0'));

1977 return (int)(s1 - olds);

1978 }

1979 }	2045 }

1980 return 0;	2046 return 0;

1981 }	2047 }

	2048

	2049 // length of the left common substring of s1 and (decapitalised) s2, non-UTF

	2050 int SuggestMgr::leftcommonsubstring(

	2051 const char* s1,

	2052 const char* s2) {

	2053 if (complexprefixes) {

	2054 int l1 = strlen(s1);

	2055 int l2 = strlen(s2);

	2056 if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])

	2057 return 1;

	2058 } else if (csconv) {

	2059 const char* olds = s1;

	2060 // decapitalise dictionary word

	2061 if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))

	2062 return 0;

	2063 do {

	2064 s1++;

	2065 s2++;

	2066 } while ((*s1 == *s2) && (*s1 != '\0'));

	2067 return (int)(s1 - olds);

	2068 }

	2069 return 0;

	2070 }

1982	2071

1983 int SuggestMgr::commoncharacterpositions(const char* s1,	2072 int SuggestMgr::commoncharacterpositions(const char* s1,

1984 const char* s2,	2073 const char* s2,

1985 int* is_swap) {	2074 int* is_swap) {

1986 int num = 0;	2075 int num = 0;

1987 int diff = 0;	2076 int diff = 0;

1988 int diffpos[2];	2077 int diffpos[2];

1989 *is_swap = 0;	2078 *is_swap = 0;

1990 if (utf8) {	2079 if (utf8) {

1991 std::vector<w_char> su1;	2080 std::vector<w_char> su1;

(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2150 } else	2239 } else

2151 j--;	2240 j--;

2152 }	2241 }

2153 free(result);	2242 free(result);

2154 return len;	2243 return len;

2155 }	2244 }

2156	2245

2157 int SuggestMgr::lcslen(const std::string& s, const std::string& s2) {	2246 int SuggestMgr::lcslen(const std::string& s, const std::string& s2) {

2158 return lcslen(s.c_str(), s2.c_str());	2247 return lcslen(s.c_str(), s2.c_str());

2159 }	2248 }

OLD	NEW

« third_party/hunspell/README.chromium ('K') | « third_party/hunspell/src/hunspell/suggestmgr.hxx ('k') | no next file » | no next file with comments »