Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Side by Side Diff: third_party/hunspell/src/hunspell/affixmgr.cxx

Issue 2239005: Merges our hunspell change to hunspell 1.2.10.... (Closed) Base URL: svn://chrome-svn.corp.google.com/chrome/trunk/deps/
Patch Set: '' Created 10 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 #include "license.hunspell" 1 #include "license.hunspell"
2 #include "license.myspell" 2 #include "license.myspell"
3 3
4 #include <stdlib.h> 4 #include <stdlib.h>
5 #include <string.h> 5 #include <string.h>
6 #include <stdio.h> 6 #include <stdio.h>
7 #include <ctype.h> 7 #include <ctype.h>
8 8
9 #include <vector> 9 #include <vector>
10 10
11 #include "affixmgr.hxx" 11 #include "affixmgr.hxx"
12 #include "affentry.hxx" 12 #include "affentry.hxx"
13 #include "langnum.hxx" 13 #include "langnum.hxx"
14 14
15 #include "csutil.hxx" 15 #include "csutil.hxx"
16 16
17 #ifdef HUNSPELL_CHROME_CLIENT
18 AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md)
19 {
20 bdict_reader = reader;
21 #else
17 AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key) 22 AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
18 { 23 {
24 #endif
19 // register hash manager and load affix data from aff file 25 // register hash manager and load affix data from aff file
20 pHMgr = ptr[0]; 26 pHMgr = ptr[0];
21 alldic = ptr; 27 alldic = ptr;
22 maxdic = md; 28 maxdic = md;
23 keystring = NULL; 29 keystring = NULL;
24 trystring = NULL; 30 trystring = NULL;
25 encoding=NULL; 31 encoding=NULL;
26 csconv=NULL; 32 csconv=NULL;
27 utf8 = 0; 33 utf8 = 0;
28 complexprefixes = 0; 34 complexprefixes = 0;
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
92 sfx = NULL; 98 sfx = NULL;
93 pfx = NULL; 99 pfx = NULL;
94 100
95 for (int i=0; i < SETSIZE; i++) { 101 for (int i=0; i < SETSIZE; i++) {
96 pStart[i] = NULL; 102 pStart[i] = NULL;
97 sStart[i] = NULL; 103 sStart[i] = NULL;
98 pFlag[i] = NULL; 104 pFlag[i] = NULL;
99 sFlag[i] = NULL; 105 sFlag[i] = NULL;
100 } 106 }
101 107
108 #ifdef HUNSPELL_CHROME_CLIENT
109 // Define dummy parameters for parse_file() to avoid changing the parameters
110 // of parse_file(). This may make it easier to merge the changes of the
111 // original hunspell.
112 const char* affpath = NULL;
113 const char* key = NULL;
114 #else
102 for (int j=0; j < CONTSIZE; j++) { 115 for (int j=0; j < CONTSIZE; j++) {
103 contclasses[j] = 0; 116 contclasses[j] = 0;
104 } 117 }
118 #endif
105 119
106 if (parse_file(affpath, key)) { 120 if (parse_file(affpath, key)) {
107 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath); 121 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
108 } 122 }
109 123
110 if (cpdmin == -1) cpdmin = MINCPDLEN; 124 if (cpdmin == -1) cpdmin = MINCPDLEN;
111 125
112 } 126 }
113 127
114 128
(...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after
245 #endif 259 #endif
246 } 260 }
247 261
248 262
249 // read in aff file and build up prefix and suffix entry objects 263 // read in aff file and build up prefix and suffix entry objects
250 int AffixMgr::parse_file(const char * affpath, const char * key) 264 int AffixMgr::parse_file(const char * affpath, const char * key)
251 { 265 {
252 char * line; // io buffers 266 char * line; // io buffers
253 char ft; // affix type 267 char ft; // affix type
254 268
269 #ifdef HUNSPELL_CHROME_CLIENT
270 // open the affix file
271 // We're always UTF-8
272 utf8 = 1;
273
274 // A BDICT file stores PFX and SFX lines in a special section and it provides
275 // a special line iterator for reading PFX and SFX lines.
276 // We create a FileMgr object from this iterator and parse PFX and SFX lines
277 // before parsing other lines.
278 hunspell::LineIterator affix_iterator = bdict_reader->GetAffixLineIterator();
279 FileMgr* iterator = new FileMgr(&affix_iterator);
280 if (!iterator) {
281 HUNSPELL_WARNING(stderr,
282 "error: could not create a FileMgr from an affix line iterator.\n");
283 return 1;
284 }
285
286 while (line = iterator->getline()) {
287 ft = ' ';
288 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
289 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
290 if (ft != ' ')
291 parse_affix(line, ft, iterator, NULL);
292 }
293 delete iterator;
294
295 // Create a FileMgr object for reading lines except PFX and SFX lines.
296 // We don't need to change the loop below since our FileMgr emulates the
297 // original one.
298 hunspell::LineIterator other_iterator = bdict_reader->GetOtherLineIterator();
299 FileMgr * afflst = new FileMgr(&other_iterator);
300 if (!afflst) {
301 HUNSPELL_WARNING(stderr,
302 "error: could not create a FileMgr from an other line iterator.\n");
303 return 1;
304 }
305 #else
255 // checking flag duplication 306 // checking flag duplication
256 char dupflags[CONTSIZE]; 307 char dupflags[CONTSIZE];
257 char dupflags_ini = 1; 308 char dupflags_ini = 1;
258 309
259 // first line indicator for removing byte order mark 310 // first line indicator for removing byte order mark
260 int firstline = 1; 311 int firstline = 1;
261 312
262 // open the affix file 313 // open the affix file
263 FileMgr * afflst = new FileMgr(affpath, key); 314 FileMgr * afflst = new FileMgr(affpath, key);
264 if (!afflst) { 315 if (!afflst) {
265 HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); 316 HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
266 return 1; 317 return 1;
267 } 318 }
319 #endif
268 320
269 // step one is to parse the affix file building up the internal 321 // step one is to parse the affix file building up the internal
270 // affix data structures 322 // affix data structures
271 323
272 // read in each line ignoring any that do not 324 // read in each line ignoring any that do not
273 // start with a known line type indicator 325 // start with a known line type indicator
274 while ((line = afflst->getline())) { 326 while ((line = afflst->getline())) {
275 mychomp(line); 327 mychomp(line);
276 328
329 #ifndef HUNSPELL_CHROME_CLIENT
277 /* remove byte order mark */ 330 /* remove byte order mark */
278 if (firstline) { 331 if (firstline) {
279 firstline = 0; 332 firstline = 0;
280 // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions 333 // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
281 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) { 334 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
282 memmove(line, line+3, strlen(line+3)+1); 335 memmove(line, line+3, strlen(line+3)+1);
283 } 336 }
284 } 337 }
338 #endif
285 339
286 /* parse in the keyboard string */ 340 /* parse in the keyboard string */
287 if (strncmp(line,"KEY",3) == 0) { 341 if (strncmp(line,"KEY",3) == 0) {
288 if (parse_string(line, &keystring, afflst->getlinenum())) { 342 if (parse_string(line, &keystring, afflst->getlinenum())) {
289 delete afflst; 343 delete afflst;
290 return 1; 344 return 1;
291 } 345 }
292 } 346 }
293 347
294 /* parse in the try string */ 348 /* parse in the try string */
(...skipping 215 matching lines...) Expand 10 before | Expand all | Expand 10 after
510 } 564 }
511 565
512 /* parse in the ignored characters (for example, Arabic optional diacretics charachters */ 566 /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
513 if (strncmp(line,"IGNORE",6) == 0) { 567 if (strncmp(line,"IGNORE",6) == 0) {
514 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) { 568 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
515 delete afflst; 569 delete afflst;
516 return 1; 570 return 1;
517 } 571 }
518 } 572 }
519 573
574 #ifndef HUNSPELL_CHROME_CLIENT
520 /* parse in the typical fault correcting table */ 575 /* parse in the typical fault correcting table */
521 if (strncmp(line,"REP",3) == 0) { 576 if (strncmp(line,"REP",3) == 0) {
522 if (parse_reptable(line, afflst)) { 577 if (parse_reptable(line, afflst)) {
523 delete afflst; 578 delete afflst;
524 return 1; 579 return 1;
525 } 580 }
526 } 581 }
582 #endif
527 583
528 /* parse in the input conversion table */ 584 /* parse in the input conversion table */
529 if (strncmp(line,"ICONV",5) == 0) { 585 if (strncmp(line,"ICONV",5) == 0) {
530 if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { 586 if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
531 delete afflst; 587 delete afflst;
532 return 1; 588 return 1;
533 } 589 }
534 } 590 }
535 591
536 /* parse in the input conversion table */ 592 /* parse in the input conversion table */
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
627 if (parse_flag(line, &substandard, afflst)) { 683 if (parse_flag(line, &substandard, afflst)) {
628 delete afflst; 684 delete afflst;
629 return 1; 685 return 1;
630 } 686 }
631 } 687 }
632 688
633 if (strncmp(line,"CHECKSHARPS",11) == 0) { 689 if (strncmp(line,"CHECKSHARPS",11) == 0) {
634 checksharps=1; 690 checksharps=1;
635 } 691 }
636 692
693 #ifndef HUNSPELL_CHROME_CLIENT // Chrome handled affixes above.
637 /* parse this affix: P - prefix, S - suffix */ 694 /* parse this affix: P - prefix, S - suffix */
638 ft = ' '; 695 ft = ' ';
639 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P'; 696 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
640 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S'; 697 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
641 if (ft != ' ') { 698 if (ft != ' ') {
642 if (dupflags_ini) { 699 if (dupflags_ini) {
643 memset(dupflags, 0, sizeof(dupflags)); 700 memset(dupflags, 0, sizeof(dupflags));
644 dupflags_ini = 0; 701 dupflags_ini = 0;
645 } 702 }
646 if (parse_affix(line, ft, afflst, dupflags)) { 703 if (parse_affix(line, ft, afflst, dupflags)) {
647 delete afflst; 704 delete afflst;
648 process_pfx_tree_to_list(); 705 process_pfx_tree_to_list();
649 process_sfx_tree_to_list(); 706 process_sfx_tree_to_list();
650 return 1; 707 return 1;
651 } 708 }
652 } 709 }
710 #endif
653 711
654 } 712 }
655 delete afflst; 713 delete afflst;
656 714
657 // convert affix trees to sorted list 715 // convert affix trees to sorted list
658 process_pfx_tree_to_list(); 716 process_pfx_tree_to_list();
659 process_sfx_tree_to_list(); 717 process_sfx_tree_to_list();
660 718
661 // now we can speed up performance greatly taking advantage of the 719 // now we can speed up performance greatly taking advantage of the
662 // relationship between the affixes and the idea of "subsets". 720 // relationship between the affixes and the idea of "subsets".
(...skipping 577 matching lines...) Expand 10 before | Expand all | Expand 10 after
1240 return NULL; 1298 return NULL;
1241 } 1299 }
1242 1300
1243 // Is word a non compound with a REP substitution (see checkcompoundrep)? 1301 // Is word a non compound with a REP substitution (see checkcompoundrep)?
1244 int AffixMgr::cpdrep_check(const char * word, int wl) 1302 int AffixMgr::cpdrep_check(const char * word, int wl)
1245 { 1303 {
1246 char candidate[MAXLNLEN]; 1304 char candidate[MAXLNLEN];
1247 const char * r; 1305 const char * r;
1248 int lenr, lenp; 1306 int lenr, lenp;
1249 1307
1308 #ifdef HUNSPELL_CHROME_CLIENT
1309 const char *pattern, *pattern2;
1310 hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator();
1311 while (iterator.GetNext(&pattern, &pattern2)) {
1312 r = word;
1313 lenr = strlen(pattern2);
1314 lenp = strlen(pattern);
1315
1316 // search every occurence of the pattern in the word
1317 while ((r=strstr(r, pattern)) != NULL) {
1318 strcpy(candidate, word);
1319 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
1320 strcpy(candidate+(r-word), pattern2);
1321 strcpy(candidate+(r-word)+lenr, r+lenp);
1322 if (candidate_check(candidate,strlen(candidate))) return 1;
1323 r++; // search for the next letter
1324 }
1325 }
1326
1327 #else
1250 if ((wl < 2) || !numrep) return 0; 1328 if ((wl < 2) || !numrep) return 0;
1251 1329
1252 for (int i=0; i < numrep; i++ ) { 1330 for (int i=0; i < numrep; i++ ) {
1253 r = word; 1331 r = word;
1254 lenr = strlen(reptable[i].pattern2); 1332 lenr = strlen(reptable[i].pattern2);
1255 lenp = strlen(reptable[i].pattern); 1333 lenp = strlen(reptable[i].pattern);
1256 // search every occurence of the pattern in the word 1334 // search every occurence of the pattern in the word
1257 while ((r=strstr(r, reptable[i].pattern)) != NULL) { 1335 while ((r=strstr(r, reptable[i].pattern)) != NULL) {
1258 strcpy(candidate, word); 1336 strcpy(candidate, word);
1259 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break; 1337 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
1260 strcpy(candidate+(r-word),reptable[i].pattern2); 1338 strcpy(candidate+(r-word),reptable[i].pattern2);
1261 strcpy(candidate+(r-word)+lenr, r+lenp); 1339 strcpy(candidate+(r-word)+lenr, r+lenp);
1262 if (candidate_check(candidate,strlen(candidate))) return 1; 1340 if (candidate_check(candidate,strlen(candidate))) return 1;
1263 r++; // search for the next letter 1341 r++; // search for the next letter
1264 } 1342 }
1265 } 1343 }
1344 #endif
1266 return 0; 1345 return 0;
1267 } 1346 }
1268 1347
1269 // forbid compoundings when there are special patterns at word bound 1348 // forbid compoundings when there are special patterns at word bound
1270 int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2) 1349 int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2)
1271 { 1350 {
1272 int len; 1351 int len;
1273 for (int i = 0; i < numcheckcpd; i++) { 1352 for (int i = 0; i < numcheckcpd; i++) {
1274 if (isSubset(checkcpdtable[i].pattern2, word + pos) && 1353 if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
1275 (!r1 || !checkcpdtable[i].cond || 1354 (!r1 || !checkcpdtable[i].cond ||
(...skipping 2049 matching lines...) Expand 10 before | Expand all | Expand 10 after
3325 piece = mystrsep(&tp, 0); 3404 piece = mystrsep(&tp, 0);
3326 } 3405 }
3327 if (np < 2) { 3406 if (np < 2) {
3328 HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum()); 3407 HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum());
3329 return 1; 3408 return 1;
3330 } 3409 }
3331 if (np == 2) cpdvowels = mystrdup("aeiouAEIOU"); 3410 if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
3332 return 0; 3411 return 0;
3333 } 3412 }
3334 3413
3414 #ifndef HUNSPELL_CHROME_CLIENT
3335 /* parse in the typical fault correcting table */ 3415 /* parse in the typical fault correcting table */
3336 int AffixMgr::parse_reptable(char * line, FileMgr * af) 3416 int AffixMgr::parse_reptable(char * line, FileMgr * af)
3337 { 3417 {
3338 if (numrep != 0) { 3418 if (numrep != 0) {
3339 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 3419 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3340 return 1; 3420 return 1;
3341 } 3421 }
3342 char * tp = line; 3422 char * tp = line;
3343 char * piece; 3423 char * piece;
3344 int i = 0; 3424 int i = 0;
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
3400 piece = mystrsep(&tp, 0); 3480 piece = mystrsep(&tp, 0);
3401 } 3481 }
3402 if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { 3482 if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
3403 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 3483 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3404 numrep = 0; 3484 numrep = 0;
3405 return 1; 3485 return 1;
3406 } 3486 }
3407 } 3487 }
3408 return 0; 3488 return 0;
3409 } 3489 }
3490 #endif
3410 3491
3411 /* parse in the typical fault correcting table */ 3492 /* parse in the typical fault correcting table */
3412 int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword) 3493 int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword)
3413 { 3494 {
3414 if (*rl) { 3495 if (*rl) {
3415 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 3496 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3416 return 1; 3497 return 1;
3417 } 3498 }
3418 char * tp = line; 3499 char * tp = line;
3419 char * piece; 3500 char * piece;
(...skipping 583 matching lines...) Expand 10 before | Expand all | Expand 10 after
4003 while (piece) { 4084 while (piece) {
4004 if (*piece != '\0') { 4085 if (*piece != '\0') {
4005 switch(i) { 4086 switch(i) {
4006 // piece 1 - is type of affix 4087 // piece 1 - is type of affix
4007 case 0: { np++; break; } 4088 case 0: { np++; break; }
4008 4089
4009 // piece 2 - is affix char 4090 // piece 2 - is affix char
4010 case 1: { 4091 case 1: {
4011 np++; 4092 np++;
4012 aflag = pHMgr->decode_flag(piece); 4093 aflag = pHMgr->decode_flag(piece);
4094 #ifndef HUNSPELL_CHROME_CLIENT // We don't check for duplicates.
4013 if (((at == 'S') && (dupflags[aflag] & dupSFX)) || 4095 if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4014 ((at == 'P') && (dupflags[aflag] & dupPFX))) { 4096 ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4015 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n", 4097 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n",
4016 af->getlinenum()); 4098 af->getlinenum());
4017 // return 1; XXX permissive mode for bad dictionaries 4099 // return 1; XXX permissive mode for bad dictionaries
4018 } 4100 }
4019 dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX); 4101 dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);
4102 #endif
4020 break; 4103 break;
4021 } 4104 }
4022 // piece 3 - is cross product indicator 4105 // piece 3 - is cross product indicator
4023 case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; } 4106 case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
4024 4107
4025 // piece 4 - is number of affentries 4108 // piece 4 - is number of affentries
4026 case 3: { 4109 case 3: {
4027 np++; 4110 np++;
4028 numents = atoi(piece); 4111 numents = atoi(piece);
4029 if (numents == 0) { 4112 if (numents == 0) {
(...skipping 287 matching lines...) Expand 10 before | Expand all | Expand 10 after
4317 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); 4400 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4318 return 0; 4401 return 0;
4319 } 4402 }
4320 } 4403 }
4321 } 4404 }
4322 if (j < 0) return 1; 4405 if (j < 0) return 1;
4323 } 4406 }
4324 } 4407 }
4325 return 0; 4408 return 0;
4326 } 4409 }
OLDNEW
« no previous file with comments | « third_party/hunspell/src/hunspell/affixmgr.hxx ('k') | third_party/hunspell/src/hunspell/filemgr.hxx » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698