OLD | NEW |
| (Empty) |
1 #include "license.hunspell" | |
2 #include "license.myspell" | |
3 | |
4 #include <stdlib.h> | |
5 #include <string.h> | |
6 #include <stdio.h> | |
7 #include <ctype.h> | |
8 | |
9 #include <vector> | |
10 | |
11 #include "affixmgr.hxx" | |
12 #include "affentry.hxx" | |
13 #include "langnum.hxx" | |
14 | |
15 #include "csutil.hxx" | |
16 | |
17 #ifdef HUNSPELL_CHROME_CLIENT | |
18 AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md) | |
19 { | |
20 bdict_reader = reader; | |
21 #else | |
22 AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * k
ey) | |
23 { | |
24 #endif | |
25 // register hash manager and load affix data from aff file | |
26 pHMgr = ptr[0]; | |
27 alldic = ptr; | |
28 maxdic = md; | |
29 keystring = NULL; | |
30 trystring = NULL; | |
31 encoding=NULL; | |
32 csconv=NULL; | |
33 utf8 = 0; | |
34 complexprefixes = 0; | |
35 maptable = NULL; | |
36 nummap = 0; | |
37 breaktable = NULL; | |
38 numbreak = -1; | |
39 reptable = NULL; | |
40 numrep = 0; | |
41 iconvtable = NULL; | |
42 oconvtable = NULL; | |
43 checkcpdtable = NULL; | |
44 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) | |
45 simplifiedcpd = 0; | |
46 numcheckcpd = 0; | |
47 defcpdtable = NULL; | |
48 numdefcpd = 0; | |
49 phone = NULL; | |
50 compoundflag = FLAG_NULL; // permits word in compound forms | |
51 compoundbegin = FLAG_NULL; // may be first word in compound forms | |
52 compoundmiddle = FLAG_NULL; // may be middle word in compound forms | |
53 compoundend = FLAG_NULL; // may be last word in compound forms | |
54 compoundroot = FLAG_NULL; // compound word signing flag | |
55 compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word | |
56 compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word | |
57 compoundmoresuffixes = 0; // allow more suffixes within compound words | |
58 checkcompounddup = 0; // forbid double words in compounds | |
59 checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a
REP substitution) | |
60 checkcompoundcase = 0; // forbid upper and lowercase combinations at word boun
ds | |
61 checkcompoundtriple = 0; // forbid compounds with triple letters | |
62 simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+
fahrt -> Schiffahrt) | |
63 forbiddenword = FORBIDDENWORD; // forbidden word signing flag | |
64 nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag | |
65 nongramsuggest = FLAG_NULL; | |
66 lang = NULL; // language | |
67 langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) | |
68 needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes | |
69 cpdwordmax = -1; // default: unlimited wordcount in compound words | |
70 cpdmin = -1; // undefined | |
71 cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words | |
72 cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n
) search! XXX) | |
73 cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) se
arch) | |
74 cpdvowels_utf16_len=0; // vowels | |
75 pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG | |
76 sfxappnd=NULL; // previous suffix for counting a special syllables BUG | |
77 cpdsyllablenum=NULL; // syllable count incrementing flag | |
78 checknum=0; // checking numbers, and word with numbers | |
79 wordchars=NULL; // letters + spec. word characters | |
80 wordchars_utf16=NULL; // letters + spec. word characters | |
81 wordchars_utf16_len=0; // letters + spec. word characters | |
82 ignorechars=NULL; // letters + spec. word characters | |
83 ignorechars_utf16=NULL; // letters + spec. word characters | |
84 ignorechars_utf16_len=0; // letters + spec. word characters | |
85 version=NULL; // affix and dictionary file version string | |
86 havecontclass=0; // flags of possible continuing classes (double affix) | |
87 // LEMMA_PRESENT: not put root into the morphological output. Lemma presents | |
88 // in morhological description in dictionary file. It's often combined with PS
EUDOROOT. | |
89 lemma_present = FLAG_NULL; | |
90 circumfix = FLAG_NULL; | |
91 onlyincompound = FLAG_NULL; | |
92 maxngramsugs = -1; // undefined | |
93 maxdiff = -1; // undefined | |
94 onlymaxdiff = 0; | |
95 maxcpdsugs = -1; // undefined | |
96 nosplitsugs = 0; | |
97 sugswithdots = 0; | |
98 keepcase = 0; | |
99 forceucase = 0; | |
100 warn = 0; | |
101 forbidwarn = 0; | |
102 checksharps = 0; | |
103 substandard = FLAG_NULL; | |
104 fullstrip = 0; | |
105 | |
106 sfx = NULL; | |
107 pfx = NULL; | |
108 | |
109 for (int i=0; i < SETSIZE; i++) { | |
110 pStart[i] = NULL; | |
111 sStart[i] = NULL; | |
112 pFlag[i] = NULL; | |
113 sFlag[i] = NULL; | |
114 } | |
115 | |
116 #ifdef HUNSPELL_CHROME_CLIENT | |
117 // Define dummy parameters for parse_file() to avoid changing the parameters | |
118 // of parse_file(). This may make it easier to merge the changes of the | |
119 // original hunspell. | |
120 const char* affpath = NULL; | |
121 const char* key = NULL; | |
122 #else | |
123 for (int j=0; j < CONTSIZE; j++) { | |
124 contclasses[j] = 0; | |
125 } | |
126 #endif | |
127 | |
128 if (parse_file(affpath, key)) { | |
129 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath); | |
130 } | |
131 | |
132 if (cpdmin == -1) cpdmin = MINCPDLEN; | |
133 | |
134 } | |
135 | |
136 | |
137 AffixMgr::~AffixMgr() | |
138 { | |
139 // pass through linked prefix entries and clean up | |
140 for (int i=0; i < SETSIZE ;i++) { | |
141 pFlag[i] = NULL; | |
142 PfxEntry * ptr = pStart[i]; | |
143 PfxEntry * nptr = NULL; | |
144 while (ptr) { | |
145 nptr = ptr->getNext(); | |
146 delete(ptr); | |
147 ptr = nptr; | |
148 nptr = NULL; | |
149 } | |
150 } | |
151 | |
152 // pass through linked suffix entries and clean up | |
153 for (int j=0; j < SETSIZE ; j++) { | |
154 sFlag[j] = NULL; | |
155 SfxEntry * ptr = sStart[j]; | |
156 SfxEntry * nptr = NULL; | |
157 while (ptr) { | |
158 nptr = ptr->getNext(); | |
159 delete(ptr); | |
160 ptr = nptr; | |
161 nptr = NULL; | |
162 } | |
163 sStart[j] = NULL; | |
164 } | |
165 | |
166 if (keystring) free(keystring); | |
167 keystring=NULL; | |
168 if (trystring) free(trystring); | |
169 trystring=NULL; | |
170 if (encoding) free(encoding); | |
171 encoding=NULL; | |
172 if (maptable) { | |
173 for (int j=0; j < nummap; j++) { | |
174 for (int k=0; k < maptable[j].len; k++) { | |
175 if (maptable[j].set[k]) free(maptable[j].set[k]); | |
176 } | |
177 free(maptable[j].set); | |
178 maptable[j].set = NULL; | |
179 maptable[j].len = 0; | |
180 } | |
181 free(maptable); | |
182 maptable = NULL; | |
183 } | |
184 nummap = 0; | |
185 if (breaktable) { | |
186 for (int j=0; j < numbreak; j++) { | |
187 if (breaktable[j]) free(breaktable[j]); | |
188 breaktable[j] = NULL; | |
189 } | |
190 free(breaktable); | |
191 breaktable = NULL; | |
192 } | |
193 numbreak = 0; | |
194 if (reptable) { | |
195 for (int j=0; j < numrep; j++) { | |
196 free(reptable[j].pattern); | |
197 free(reptable[j].pattern2); | |
198 } | |
199 free(reptable); | |
200 reptable = NULL; | |
201 } | |
202 if (iconvtable) delete iconvtable; | |
203 if (oconvtable) delete oconvtable; | |
204 if (phone && phone->rules) { | |
205 for (int j=0; j < phone->num + 1; j++) { | |
206 free(phone->rules[j * 2]); | |
207 free(phone->rules[j * 2 + 1]); | |
208 } | |
209 free(phone->rules); | |
210 free(phone); | |
211 phone = NULL; | |
212 } | |
213 | |
214 if (defcpdtable) { | |
215 for (int j=0; j < numdefcpd; j++) { | |
216 free(defcpdtable[j].def); | |
217 defcpdtable[j].def = NULL; | |
218 } | |
219 free(defcpdtable); | |
220 defcpdtable = NULL; | |
221 } | |
222 numrep = 0; | |
223 if (checkcpdtable) { | |
224 for (int j=0; j < numcheckcpd; j++) { | |
225 free(checkcpdtable[j].pattern); | |
226 free(checkcpdtable[j].pattern2); | |
227 free(checkcpdtable[j].pattern3); | |
228 checkcpdtable[j].pattern = NULL; | |
229 checkcpdtable[j].pattern2 = NULL; | |
230 checkcpdtable[j].pattern3 = NULL; | |
231 } | |
232 free(checkcpdtable); | |
233 checkcpdtable = NULL; | |
234 } | |
235 numcheckcpd = 0; | |
236 FREE_FLAG(compoundflag); | |
237 FREE_FLAG(compoundbegin); | |
238 FREE_FLAG(compoundmiddle); | |
239 FREE_FLAG(compoundend); | |
240 FREE_FLAG(compoundpermitflag); | |
241 FREE_FLAG(compoundforbidflag); | |
242 FREE_FLAG(compoundroot); | |
243 FREE_FLAG(forbiddenword); | |
244 FREE_FLAG(nosuggest); | |
245 FREE_FLAG(nongramsuggest); | |
246 FREE_FLAG(needaffix); | |
247 FREE_FLAG(lemma_present); | |
248 FREE_FLAG(circumfix); | |
249 FREE_FLAG(onlyincompound); | |
250 | |
251 cpdwordmax = 0; | |
252 pHMgr = NULL; | |
253 cpdmin = 0; | |
254 cpdmaxsyllable = 0; | |
255 if (cpdvowels) free(cpdvowels); | |
256 if (cpdvowels_utf16) free(cpdvowels_utf16); | |
257 if (cpdsyllablenum) free(cpdsyllablenum); | |
258 free_utf_tbl(); | |
259 if (lang) free(lang); | |
260 if (wordchars) free(wordchars); | |
261 if (wordchars_utf16) free(wordchars_utf16); | |
262 if (ignorechars) free(ignorechars); | |
263 if (ignorechars_utf16) free(ignorechars_utf16); | |
264 if (version) free(version); | |
265 checknum=0; | |
266 #ifdef MOZILLA_CLIENT | |
267 delete [] csconv; | |
268 #endif | |
269 } | |
270 | |
271 void AffixMgr::finishFileMgr(FileMgr *afflst) | |
272 { | |
273 delete afflst; | |
274 | |
275 // convert affix trees to sorted list | |
276 process_pfx_tree_to_list(); | |
277 process_sfx_tree_to_list(); | |
278 } | |
279 | |
280 // read in aff file and build up prefix and suffix entry objects | |
281 int AffixMgr::parse_file(const char * affpath, const char * key) | |
282 { | |
283 char * line; // io buffers | |
284 char ft; // affix type | |
285 | |
286 #ifdef HUNSPELL_CHROME_CLIENT | |
287 // open the affix file | |
288 // We're always UTF-8 | |
289 utf8 = 1; | |
290 | |
291 // A BDICT file stores PFX and SFX lines in a special section and it provides | |
292 // a special line iterator for reading PFX and SFX lines. | |
293 // We create a FileMgr object from this iterator and parse PFX and SFX lines | |
294 // before parsing other lines. | |
295 hunspell::LineIterator affix_iterator = bdict_reader->GetAffixLineIterator(); | |
296 FileMgr* iterator = new FileMgr(&affix_iterator); | |
297 if (!iterator) { | |
298 HUNSPELL_WARNING(stderr, | |
299 "error: could not create a FileMgr from an affix line iterator.\n"); | |
300 return 1; | |
301 } | |
302 | |
303 while ((line = iterator->getline())) { | |
304 ft = ' '; | |
305 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P'; | |
306 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S'; | |
307 if (ft != ' ') | |
308 parse_affix(line, ft, iterator, NULL); | |
309 } | |
310 delete iterator; | |
311 | |
312 // Create a FileMgr object for reading lines except PFX and SFX lines. | |
313 // We don't need to change the loop below since our FileMgr emulates the | |
314 // original one. | |
315 hunspell::LineIterator other_iterator = bdict_reader->GetOtherLineIterator(); | |
316 FileMgr * afflst = new FileMgr(&other_iterator); | |
317 if (!afflst) { | |
318 HUNSPELL_WARNING(stderr, | |
319 "error: could not create a FileMgr from an other line iterator.\n"); | |
320 return 1; | |
321 } | |
322 #else | |
323 // checking flag duplication | |
324 char dupflags[CONTSIZE]; | |
325 char dupflags_ini = 1; | |
326 | |
327 // first line indicator for removing byte order mark | |
328 int firstline = 1; | |
329 | |
330 // open the affix file | |
331 FileMgr * afflst = new FileMgr(affpath, key); | |
332 if (!afflst) { | |
333 HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n"
,affpath); | |
334 return 1; | |
335 } | |
336 #endif | |
337 | |
338 // step one is to parse the affix file building up the internal | |
339 // affix data structures | |
340 | |
341 // read in each line ignoring any that do not | |
342 // start with a known line type indicator | |
343 while ((line = afflst->getline()) != NULL) { | |
344 mychomp(line); | |
345 | |
346 #ifndef HUNSPELL_CHROME_CLIENT | |
347 /* remove byte order mark */ | |
348 if (firstline) { | |
349 firstline = 0; | |
350 // Affix file begins with byte order mark: possible incompatibility wit
h old Hunspell versions | |
351 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) { | |
352 memmove(line, line+3, strlen(line+3)+1); | |
353 } | |
354 } | |
355 #endif | |
356 | |
357 /* parse in the keyboard string */ | |
358 if (strncmp(line,"KEY",3) == 0) { | |
359 if (parse_string(line, &keystring, afflst->getlinenum())) { | |
360 finishFileMgr(afflst); | |
361 return 1; | |
362 } | |
363 } | |
364 | |
365 /* parse in the try string */ | |
366 if (strncmp(line,"TRY",3) == 0) { | |
367 if (parse_string(line, &trystring, afflst->getlinenum())) { | |
368 finishFileMgr(afflst); | |
369 return 1; | |
370 } | |
371 } | |
372 | |
373 /* parse in the name of the character set used by the .dict and .aff */ | |
374 if (strncmp(line,"SET",3) == 0) { | |
375 if (parse_string(line, &encoding, afflst->getlinenum())) { | |
376 finishFileMgr(afflst); | |
377 return 1; | |
378 } | |
379 if (strcmp(encoding, "UTF-8") == 0) { | |
380 utf8 = 1; | |
381 #ifndef OPENOFFICEORG | |
382 #ifndef MOZILLA_CLIENT | |
383 if (initialize_utf_tbl()) return 1; | |
384 #endif | |
385 #endif | |
386 } | |
387 } | |
388 | |
389 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left w
riting system */ | |
390 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) | |
391 complexprefixes = 1; | |
392 | |
393 /* parse in the flag used by the controlled compound words */ | |
394 if (strncmp(line,"COMPOUNDFLAG",12) == 0) { | |
395 if (parse_flag(line, &compoundflag, afflst)) { | |
396 finishFileMgr(afflst); | |
397 return 1; | |
398 } | |
399 } | |
400 | |
401 /* parse in the flag used by compound words */ | |
402 if (strncmp(line,"COMPOUNDBEGIN",13) == 0) { | |
403 if (complexprefixes) { | |
404 if (parse_flag(line, &compoundend, afflst)) { | |
405 finishFileMgr(afflst); | |
406 return 1; | |
407 } | |
408 } else { | |
409 if (parse_flag(line, &compoundbegin, afflst)) { | |
410 finishFileMgr(afflst); | |
411 return 1; | |
412 } | |
413 } | |
414 } | |
415 | |
416 /* parse in the flag used by compound words */ | |
417 if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) { | |
418 if (parse_flag(line, &compoundmiddle, afflst)) { | |
419 finishFileMgr(afflst); | |
420 return 1; | |
421 } | |
422 } | |
423 /* parse in the flag used by compound words */ | |
424 if (strncmp(line,"COMPOUNDEND",11) == 0) { | |
425 if (complexprefixes) { | |
426 if (parse_flag(line, &compoundbegin, afflst)) { | |
427 finishFileMgr(afflst); | |
428 return 1; | |
429 } | |
430 } else { | |
431 if (parse_flag(line, &compoundend, afflst)) { | |
432 finishFileMgr(afflst); | |
433 return 1; | |
434 } | |
435 } | |
436 } | |
437 | |
438 /* parse in the data used by compound_check() method */ | |
439 if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) { | |
440 if (parse_num(line, &cpdwordmax, afflst)) { | |
441 finishFileMgr(afflst); | |
442 return 1; | |
443 } | |
444 } | |
445 | |
446 /* parse in the flag sign compounds in dictionary */ | |
447 if (strncmp(line,"COMPOUNDROOT",12) == 0) { | |
448 if (parse_flag(line, &compoundroot, afflst)) { | |
449 finishFileMgr(afflst); | |
450 return 1; | |
451 } | |
452 } | |
453 | |
454 /* parse in the flag used by compound_check() method */ | |
455 if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) { | |
456 if (parse_flag(line, &compoundpermitflag, afflst)) { | |
457 finishFileMgr(afflst); | |
458 return 1; | |
459 } | |
460 } | |
461 | |
462 /* parse in the flag used by compound_check() method */ | |
463 if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) { | |
464 if (parse_flag(line, &compoundforbidflag, afflst)) { | |
465 finishFileMgr(afflst); | |
466 return 1; | |
467 } | |
468 } | |
469 | |
470 if (strncmp(line,"COMPOUNDMORESUFFIXES",20) == 0) { | |
471 compoundmoresuffixes = 1; | |
472 } | |
473 | |
474 if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) { | |
475 checkcompounddup = 1; | |
476 } | |
477 | |
478 if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) { | |
479 checkcompoundrep = 1; | |
480 } | |
481 | |
482 if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) { | |
483 checkcompoundtriple = 1; | |
484 } | |
485 | |
486 if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) { | |
487 simplifiedtriple = 1; | |
488 } | |
489 | |
490 if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) { | |
491 checkcompoundcase = 1; | |
492 } | |
493 | |
494 if (strncmp(line,"NOSUGGEST",9) == 0) { | |
495 if (parse_flag(line, &nosuggest, afflst)) { | |
496 finishFileMgr(afflst); | |
497 return 1; | |
498 } | |
499 } | |
500 | |
501 if (strncmp(line,"NONGRAMSUGGEST",14) == 0) { | |
502 if (parse_flag(line, &nongramsuggest, afflst)) { | |
503 finishFileMgr(afflst); | |
504 return 1; | |
505 } | |
506 } | |
507 | |
508 /* parse in the flag used by forbidden words */ | |
509 if (strncmp(line,"FORBIDDENWORD",13) == 0) { | |
510 if (parse_flag(line, &forbiddenword, afflst)) { | |
511 finishFileMgr(afflst); | |
512 return 1; | |
513 } | |
514 } | |
515 | |
516 /* parse in the flag used by forbidden words */ | |
517 if (strncmp(line,"LEMMA_PRESENT",13) == 0) { | |
518 if (parse_flag(line, &lemma_present, afflst)) { | |
519 finishFileMgr(afflst); | |
520 return 1; | |
521 } | |
522 } | |
523 | |
524 /* parse in the flag used by circumfixes */ | |
525 if (strncmp(line,"CIRCUMFIX",9) == 0) { | |
526 if (parse_flag(line, &circumfix, afflst)) { | |
527 finishFileMgr(afflst); | |
528 return 1; | |
529 } | |
530 } | |
531 | |
532 /* parse in the flag used by fogemorphemes */ | |
533 if (strncmp(line,"ONLYINCOMPOUND",14) == 0) { | |
534 if (parse_flag(line, &onlyincompound, afflst)) { | |
535 finishFileMgr(afflst); | |
536 return 1; | |
537 } | |
538 } | |
539 | |
540 /* parse in the flag used by `needaffixs' */ | |
541 if (strncmp(line,"PSEUDOROOT",10) == 0) { | |
542 if (parse_flag(line, &needaffix, afflst)) { | |
543 finishFileMgr(afflst); | |
544 return 1; | |
545 } | |
546 } | |
547 | |
548 /* parse in the flag used by `needaffixs' */ | |
549 if (strncmp(line,"NEEDAFFIX",9) == 0) { | |
550 if (parse_flag(line, &needaffix, afflst)) { | |
551 finishFileMgr(afflst); | |
552 return 1; | |
553 } | |
554 } | |
555 | |
556 /* parse in the minimal length for words in compounds */ | |
557 if (strncmp(line,"COMPOUNDMIN",11) == 0) { | |
558 if (parse_num(line, &cpdmin, afflst)) { | |
559 finishFileMgr(afflst); | |
560 return 1; | |
561 } | |
562 if (cpdmin < 1) cpdmin = 1; | |
563 } | |
564 | |
565 /* parse in the max. words and syllables in compounds */ | |
566 if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) { | |
567 if (parse_cpdsyllable(line, afflst)) { | |
568 finishFileMgr(afflst); | |
569 return 1; | |
570 } | |
571 } | |
572 | |
573 /* parse in the flag used by compound_check() method */ | |
574 if (strncmp(line,"SYLLABLENUM",11) == 0) { | |
575 if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) { | |
576 finishFileMgr(afflst); | |
577 return 1; | |
578 } | |
579 } | |
580 | |
581 /* parse in the flag used by the controlled compound words */ | |
582 if (strncmp(line,"CHECKNUM",8) == 0) { | |
583 checknum=1; | |
584 } | |
585 | |
586 /* parse in the extra word characters */ | |
587 if (strncmp(line,"WORDCHARS",9) == 0) { | |
588 if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_l
en, utf8, afflst->getlinenum())) { | |
589 finishFileMgr(afflst); | |
590 return 1; | |
591 } | |
592 } | |
593 | |
594 /* parse in the ignored characters (for example, Arabic optional diacreti
cs charachters */ | |
595 if (strncmp(line,"IGNORE",6) == 0) { | |
596 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_u
tf16_len, utf8, afflst->getlinenum())) { | |
597 finishFileMgr(afflst); | |
598 return 1; | |
599 } | |
600 } | |
601 | |
602 #ifndef HUNSPELL_CHROME_CLIENT | |
603 /* parse in the typical fault correcting table */ | |
604 if (strncmp(line,"REP",3) == 0) { | |
605 if (parse_reptable(line, afflst)) { | |
606 finishFileMgr(afflst); | |
607 return 1; | |
608 } | |
609 } | |
610 #endif | |
611 | |
612 /* parse in the input conversion table */ | |
613 if (strncmp(line,"ICONV",5) == 0) { | |
614 if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { | |
615 finishFileMgr(afflst); | |
616 return 1; | |
617 } | |
618 } | |
619 | |
620 /* parse in the input conversion table */ | |
621 if (strncmp(line,"OCONV",5) == 0) { | |
622 if (parse_convtable(line, afflst, &oconvtable, "OCONV")) { | |
623 finishFileMgr(afflst); | |
624 return 1; | |
625 } | |
626 } | |
627 | |
628 /* parse in the phonetic translation table */ | |
629 if (strncmp(line,"PHONE",5) == 0) { | |
630 if (parse_phonetable(line, afflst)) { | |
631 finishFileMgr(afflst); | |
632 return 1; | |
633 } | |
634 } | |
635 | |
636 /* parse in the checkcompoundpattern table */ | |
637 if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) { | |
638 if (parse_checkcpdtable(line, afflst)) { | |
639 finishFileMgr(afflst); | |
640 return 1; | |
641 } | |
642 } | |
643 | |
644 /* parse in the defcompound table */ | |
645 if (strncmp(line,"COMPOUNDRULE",12) == 0) { | |
646 if (parse_defcpdtable(line, afflst)) { | |
647 finishFileMgr(afflst); | |
648 return 1; | |
649 } | |
650 } | |
651 | |
652 /* parse in the related character map table */ | |
653 if (strncmp(line,"MAP",3) == 0) { | |
654 if (parse_maptable(line, afflst)) { | |
655 finishFileMgr(afflst); | |
656 return 1; | |
657 } | |
658 } | |
659 | |
660 /* parse in the word breakpoints table */ | |
661 if (strncmp(line,"BREAK",5) == 0) { | |
662 if (parse_breaktable(line, afflst)) { | |
663 finishFileMgr(afflst); | |
664 return 1; | |
665 } | |
666 } | |
667 | |
668 /* parse in the language for language specific codes */ | |
669 if (strncmp(line,"LANG",4) == 0) { | |
670 if (parse_string(line, &lang, afflst->getlinenum())) { | |
671 finishFileMgr(afflst); | |
672 return 1; | |
673 } | |
674 langnum = get_lang_num(lang); | |
675 } | |
676 | |
677 if (strncmp(line,"VERSION",7) == 0) { | |
678 for(line = line + 7; *line == ' ' || *line == '\t'; line++); | |
679 version = mystrdup(line); | |
680 } | |
681 | |
682 if (strncmp(line,"MAXNGRAMSUGS",12) == 0) { | |
683 if (parse_num(line, &maxngramsugs, afflst)) { | |
684 finishFileMgr(afflst); | |
685 return 1; | |
686 } | |
687 } | |
688 | |
689 if (strncmp(line,"ONLYMAXDIFF", 11) == 0) | |
690 onlymaxdiff = 1; | |
691 | |
692 if (strncmp(line,"MAXDIFF",7) == 0) { | |
693 if (parse_num(line, &maxdiff, afflst)) { | |
694 finishFileMgr(afflst); | |
695 return 1; | |
696 } | |
697 } | |
698 | |
699 if (strncmp(line,"MAXCPDSUGS",10) == 0) { | |
700 if (parse_num(line, &maxcpdsugs, afflst)) { | |
701 finishFileMgr(afflst); | |
702 return 1; | |
703 } | |
704 } | |
705 | |
706 if (strncmp(line,"NOSPLITSUGS",11) == 0) { | |
707 nosplitsugs=1; | |
708 } | |
709 | |
710 if (strncmp(line,"FULLSTRIP",9) == 0) { | |
711 fullstrip=1; | |
712 } | |
713 | |
714 if (strncmp(line,"SUGSWITHDOTS",12) == 0) { | |
715 sugswithdots=1; | |
716 } | |
717 | |
718 /* parse in the flag used by forbidden words */ | |
719 if (strncmp(line,"KEEPCASE",8) == 0) { | |
720 if (parse_flag(line, &keepcase, afflst)) { | |
721 finishFileMgr(afflst); | |
722 return 1; | |
723 } | |
724 } | |
725 | |
726 /* parse in the flag used by `forceucase' */ | |
727 if (strncmp(line,"FORCEUCASE",10) == 0) { | |
728 if (parse_flag(line, &forceucase, afflst)) { | |
729 finishFileMgr(afflst); | |
730 return 1; | |
731 } | |
732 } | |
733 | |
734 /* parse in the flag used by `warn' */ | |
735 if (strncmp(line,"WARN",4) == 0) { | |
736 if (parse_flag(line, &warn, afflst)) { | |
737 finishFileMgr(afflst); | |
738 return 1; | |
739 } | |
740 } | |
741 | |
742 if (strncmp(line,"FORBIDWARN",10) == 0) { | |
743 forbidwarn=1; | |
744 } | |
745 | |
746 /* parse in the flag used by the affix generator */ | |
747 if (strncmp(line,"SUBSTANDARD",11) == 0) { | |
748 if (parse_flag(line, &substandard, afflst)) { | |
749 finishFileMgr(afflst); | |
750 return 1; | |
751 } | |
752 } | |
753 | |
754 if (strncmp(line,"CHECKSHARPS",11) == 0) { | |
755 checksharps=1; | |
756 } | |
757 | |
758 #ifndef HUNSPELL_CHROME_CLIENT | |
759 /* parse this affix: P - prefix, S - suffix */ | |
760 ft = ' '; | |
761 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P'; | |
762 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S'; | |
763 if (ft != ' ') { | |
764 if (dupflags_ini) { | |
765 memset(dupflags, 0, sizeof(dupflags)); | |
766 dupflags_ini = 0; | |
767 } | |
768 if (parse_affix(line, ft, afflst, dupflags)) { | |
769 finishFileMgr(afflst); | |
770 return 1; | |
771 } | |
772 } | |
773 #endif | |
774 } | |
775 | |
776 finishFileMgr(afflst); | |
777 // affix trees are sorted now | |
778 | |
779 // now we can speed up performance greatly taking advantage of the | |
780 // relationship between the affixes and the idea of "subsets". | |
781 | |
782 // View each prefix as a potential leading subset of another and view | |
783 // each suffix (reversed) as a potential trailing subset of another. | |
784 | |
785 // To illustrate this relationship if we know the prefix "ab" is found in th
e | |
786 // word to examine, only prefixes that "ab" is a leading subset of need be e
xamined. | |
787 // Furthermore is "ab" is not present then none of the prefixes that "ab" is | |
788 // is a subset need be examined. | |
789 // The same argument goes for suffix string that are reversed. | |
790 | |
791 // Then to top this off why not examine the first char of the word to quickl
y | |
792 // limit the set of prefixes to examine (i.e. the prefixes to examine must | |
793 // be leading supersets of the first character of the word (if they exist) | |
794 | |
795 // To take advantage of this "subset" relationship, we need to add two links | |
796 // from entry. One to take next if the current prefix is found (call it nex
teq) | |
797 // and one to take next if the current prefix is not found (call it nextne). | |
798 | |
799 // Since we have built ordered lists, all that remains is to properly initia
lize | |
800 // the nextne and nexteq pointers that relate them | |
801 | |
802 process_pfx_order(); | |
803 process_sfx_order(); | |
804 | |
805 /* get encoding for CHECKCOMPOUNDCASE */ | |
806 if (!utf8) { | |
807 char * enc = get_encoding(); | |
808 csconv = get_current_cs(enc); | |
809 free(enc); | |
810 enc = NULL; | |
811 | |
812 char expw[MAXLNLEN]; | |
813 if (wordchars) { | |
814 strcpy(expw, wordchars); | |
815 free(wordchars); | |
816 } else *expw = '\0'; | |
817 | |
818 for (int i = 0; i <= 255; i++) { | |
819 if ( (csconv[i].cupper != csconv[i].clower) && | |
820 (! strchr(expw, (char) i))) { | |
821 *(expw + strlen(expw) + 1) = '\0'; | |
822 *(expw + strlen(expw)) = (char) i; | |
823 } | |
824 } | |
825 | |
826 wordchars = mystrdup(expw); | |
827 } | |
828 | |
829 // default BREAK definition | |
830 if (numbreak == -1) { | |
831 breaktable = (char **) malloc(sizeof(char *) * 3); | |
832 if (!breaktable) return 1; | |
833 breaktable[0] = mystrdup("-"); | |
834 breaktable[1] = mystrdup("^-"); | |
835 breaktable[2] = mystrdup("-$"); | |
836 if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3; | |
837 } | |
838 return 0; | |
839 } | |
840 | |
841 | |
842 // we want to be able to quickly access prefix information | |
843 // both by prefix flag, and sorted by prefix string itself | |
844 // so we need to set up two indexes | |
845 | |
846 int AffixMgr::build_pfxtree(PfxEntry* pfxptr) | |
847 { | |
848 PfxEntry * ptr; | |
849 PfxEntry * pptr; | |
850 PfxEntry * ep = pfxptr; | |
851 | |
852 // get the right starting points | |
853 const char * key = ep->getKey(); | |
854 const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF); | |
855 | |
856 // first index by flag which must exist | |
857 ptr = pFlag[flg]; | |
858 ep->setFlgNxt(ptr); | |
859 pFlag[flg] = ep; | |
860 | |
861 | |
862 // handle the special case of null affix string | |
863 if (strlen(key) == 0) { | |
864 // always inset them at head of list at element 0 | |
865 ptr = pStart[0]; | |
866 ep->setNext(ptr); | |
867 pStart[0] = ep; | |
868 return 0; | |
869 } | |
870 | |
871 // now handle the normal case | |
872 ep->setNextEQ(NULL); | |
873 ep->setNextNE(NULL); | |
874 | |
875 unsigned char sp = *((const unsigned char *)key); | |
876 ptr = pStart[sp]; | |
877 | |
878 // handle the first insert | |
879 if (!ptr) { | |
880 pStart[sp] = ep; | |
881 return 0; | |
882 } | |
883 | |
884 | |
885 // otherwise use binary tree insertion so that a sorted | |
886 // list can easily be generated later | |
887 pptr = NULL; | |
888 for (;;) { | |
889 pptr = ptr; | |
890 if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) { | |
891 ptr = ptr->getNextEQ(); | |
892 if (!ptr) { | |
893 pptr->setNextEQ(ep); | |
894 break; | |
895 } | |
896 } else { | |
897 ptr = ptr->getNextNE(); | |
898 if (!ptr) { | |
899 pptr->setNextNE(ep); | |
900 break; | |
901 } | |
902 } | |
903 } | |
904 return 0; | |
905 } | |
906 | |
907 // we want to be able to quickly access suffix information | |
908 // both by suffix flag, and sorted by the reverse of the | |
909 // suffix string itself; so we need to set up two indexes | |
910 int AffixMgr::build_sfxtree(SfxEntry* sfxptr) | |
911 { | |
912 SfxEntry * ptr; | |
913 SfxEntry * pptr; | |
914 SfxEntry * ep = sfxptr; | |
915 | |
916 /* get the right starting point */ | |
917 const char * key = ep->getKey(); | |
918 const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF); | |
919 | |
920 // first index by flag which must exist | |
921 ptr = sFlag[flg]; | |
922 ep->setFlgNxt(ptr); | |
923 sFlag[flg] = ep; | |
924 | |
925 // next index by affix string | |
926 | |
927 // handle the special case of null affix string | |
928 if (strlen(key) == 0) { | |
929 // always inset them at head of list at element 0 | |
930 ptr = sStart[0]; | |
931 ep->setNext(ptr); | |
932 sStart[0] = ep; | |
933 return 0; | |
934 } | |
935 | |
936 // now handle the normal case | |
937 ep->setNextEQ(NULL); | |
938 ep->setNextNE(NULL); | |
939 | |
940 unsigned char sp = *((const unsigned char *)key); | |
941 ptr = sStart[sp]; | |
942 | |
943 // handle the first insert | |
944 if (!ptr) { | |
945 sStart[sp] = ep; | |
946 return 0; | |
947 } | |
948 | |
949 // otherwise use binary tree insertion so that a sorted | |
950 // list can easily be generated later | |
951 pptr = NULL; | |
952 for (;;) { | |
953 pptr = ptr; | |
954 if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) { | |
955 ptr = ptr->getNextEQ(); | |
956 if (!ptr) { | |
957 pptr->setNextEQ(ep); | |
958 break; | |
959 } | |
960 } else { | |
961 ptr = ptr->getNextNE(); | |
962 if (!ptr) { | |
963 pptr->setNextNE(ep); | |
964 break; | |
965 } | |
966 } | |
967 } | |
968 return 0; | |
969 } | |
970 | |
971 // convert from binary tree to sorted list | |
972 int AffixMgr::process_pfx_tree_to_list() | |
973 { | |
974 for (int i=1; i< SETSIZE; i++) { | |
975 pStart[i] = process_pfx_in_order(pStart[i],NULL); | |
976 } | |
977 return 0; | |
978 } | |
979 | |
980 | |
981 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) | |
982 { | |
983 if (ptr) { | |
984 nptr = process_pfx_in_order(ptr->getNextNE(), nptr); | |
985 ptr->setNext(nptr); | |
986 nptr = process_pfx_in_order(ptr->getNextEQ(), ptr); | |
987 } | |
988 return nptr; | |
989 } | |
990 | |
991 | |
992 // convert from binary tree to sorted list | |
993 int AffixMgr:: process_sfx_tree_to_list() | |
994 { | |
995 for (int i=1; i< SETSIZE; i++) { | |
996 sStart[i] = process_sfx_in_order(sStart[i],NULL); | |
997 } | |
998 return 0; | |
999 } | |
1000 | |
1001 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) | |
1002 { | |
1003 if (ptr) { | |
1004 nptr = process_sfx_in_order(ptr->getNextNE(), nptr); | |
1005 ptr->setNext(nptr); | |
1006 nptr = process_sfx_in_order(ptr->getNextEQ(), ptr); | |
1007 } | |
1008 return nptr; | |
1009 } | |
1010 | |
1011 | |
1012 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching | |
1013 // using the idea of leading subsets this time | |
1014 int AffixMgr::process_pfx_order() | |
1015 { | |
1016 PfxEntry* ptr; | |
1017 | |
1018 // loop through each prefix list starting point | |
1019 for (int i=1; i < SETSIZE; i++) { | |
1020 | |
1021 ptr = pStart[i]; | |
1022 | |
1023 // look through the remainder of the list | |
1024 // and find next entry with affix that | |
1025 // the current one is not a subset of | |
1026 // mark that as destination for NextNE | |
1027 // use next in list that you are a subset | |
1028 // of as NextEQ | |
1029 | |
1030 for (; ptr != NULL; ptr = ptr->getNext()) { | |
1031 | |
1032 PfxEntry * nptr = ptr->getNext(); | |
1033 for (; nptr != NULL; nptr = nptr->getNext()) { | |
1034 if (! isSubset( ptr->getKey() , nptr->getKey() )) break; | |
1035 } | |
1036 ptr->setNextNE(nptr); | |
1037 ptr->setNextEQ(NULL); | |
1038 if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->
getKey())) | |
1039 ptr->setNextEQ(ptr->getNext()); | |
1040 } | |
1041 | |
1042 // now clean up by adding smart search termination strings: | |
1043 // if you are already a superset of the previous prefix | |
1044 // but not a subset of the next, search can end here | |
1045 // so set NextNE properly | |
1046 | |
1047 ptr = pStart[i]; | |
1048 for (; ptr != NULL; ptr = ptr->getNext()) { | |
1049 PfxEntry * nptr = ptr->getNext(); | |
1050 PfxEntry * mptr = NULL; | |
1051 for (; nptr != NULL; nptr = nptr->getNext()) { | |
1052 if (! isSubset(ptr->getKey(),nptr->getKey())) break; | |
1053 mptr = nptr; | |
1054 } | |
1055 if (mptr) mptr->setNextNE(NULL); | |
1056 } | |
1057 } | |
1058 return 0; | |
1059 } | |
1060 | |
1061 // initialize the SfxEntry links NextEQ and NextNE to speed searching | |
1062 // using the idea of leading subsets this time | |
1063 int AffixMgr::process_sfx_order() | |
1064 { | |
1065 SfxEntry* ptr; | |
1066 | |
1067 // loop through each prefix list starting point | |
1068 for (int i=1; i < SETSIZE; i++) { | |
1069 | |
1070 ptr = sStart[i]; | |
1071 | |
1072 // look through the remainder of the list | |
1073 // and find next entry with affix that | |
1074 // the current one is not a subset of | |
1075 // mark that as destination for NextNE | |
1076 // use next in list that you are a subset | |
1077 // of as NextEQ | |
1078 | |
1079 for (; ptr != NULL; ptr = ptr->getNext()) { | |
1080 SfxEntry * nptr = ptr->getNext(); | |
1081 for (; nptr != NULL; nptr = nptr->getNext()) { | |
1082 if (! isSubset(ptr->getKey(),nptr->getKey())) break; | |
1083 } | |
1084 ptr->setNextNE(nptr); | |
1085 ptr->setNextEQ(NULL); | |
1086 if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->ge
tKey())) | |
1087 ptr->setNextEQ(ptr->getNext()); | |
1088 } | |
1089 | |
1090 | |
1091 // now clean up by adding smart search termination strings: | |
1092 // if you are already a superset of the previous suffix | |
1093 // but not a subset of the next, search can end here | |
1094 // so set NextNE properly | |
1095 | |
1096 ptr = sStart[i]; | |
1097 for (; ptr != NULL; ptr = ptr->getNext()) { | |
1098 SfxEntry * nptr = ptr->getNext(); | |
1099 SfxEntry * mptr = NULL; | |
1100 for (; nptr != NULL; nptr = nptr->getNext()) { | |
1101 if (! isSubset(ptr->getKey(),nptr->getKey())) break; | |
1102 mptr = nptr; | |
1103 } | |
1104 if (mptr) mptr->setNextNE(NULL); | |
1105 } | |
1106 } | |
1107 return 0; | |
1108 } | |
1109 | |
1110 // add flags to the result for dictionary debugging | |
1111 void AffixMgr::debugflag(char * result, unsigned short flag) { | |
1112 char * st = encode_flag(flag); | |
1113 mystrcat(result, " ", MAXLNLEN); | |
1114 mystrcat(result, MORPH_FLAG, MAXLNLEN); | |
1115 if (st) { | |
1116 mystrcat(result, st, MAXLNLEN); | |
1117 free(st); | |
1118 } | |
1119 } | |
1120 | |
1121 // calculate the character length of the condition | |
1122 int AffixMgr::condlen(char * st) | |
1123 { | |
1124 int l = 0; | |
1125 bool group = false; | |
1126 for(; *st; st++) { | |
1127 if (*st == '[') { | |
1128 group = true; | |
1129 l++; | |
1130 } else if (*st == ']') group = false; | |
1131 else if (!group && (!utf8 || | |
1132 (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++; | |
1133 } | |
1134 return l; | |
1135 } | |
1136 | |
1137 int AffixMgr::encodeit(affentry &entry, char * cs) | |
1138 { | |
1139 if (strcmp(cs,".") != 0) { | |
1140 entry.numconds = (char) condlen(cs); | |
1141 strncpy(entry.c.conds, cs, MAXCONDLEN); | |
1142 // long condition (end of conds padded by strncpy) | |
1143 if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { | |
1144 entry.opts += aeLONGCOND; | |
1145 entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); | |
1146 if (!entry.c.l.conds2) return 1; | |
1147 } | |
1148 } else { | |
1149 entry.numconds = 0; | |
1150 entry.c.conds[0] = '\0'; | |
1151 } | |
1152 return 0; | |
1153 } | |
1154 | |
1155 // return 1 if s1 is a leading subset of s2 (dots are for infixes) | |
1156 inline int AffixMgr::isSubset(const char * s1, const char * s2) | |
1157 { | |
1158 while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { | |
1159 s1++; | |
1160 s2++; | |
1161 } | |
1162 return (*s1 == '\0'); | |
1163 } | |
1164 | |
1165 | |
1166 // check word for prefixes | |
1167 struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compo
und, | |
1168 const FLAG needflag) | |
1169 { | |
1170 struct hentry * rv= NULL; | |
1171 | |
1172 pfx = NULL; | |
1173 pfxappnd = NULL; | |
1174 sfxappnd = NULL; | |
1175 | |
1176 // first handle the special case of 0 length prefixes | |
1177 PfxEntry * pe = pStart[0]; | |
1178 while (pe) { | |
1179 if ( | |
1180 // fogemorpheme | |
1181 ((in_compound != IN_CPD_NOT) || !(pe->getCont() && | |
1182 (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &
& | |
1183 // permit prefixes in compounds | |
1184 ((in_compound != IN_CPD_END) || (pe->getCont() && | |
1185 (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))
)) | |
1186 ) { | |
1187 // check prefix | |
1188 rv = pe->checkword(word, len, in_compound, needflag); | |
1189 if (rv) { | |
1190 pfx=pe; // BUG: pfx not stateless | |
1191 return rv; | |
1192 } | |
1193 } | |
1194 pe = pe->getNext(); | |
1195 } | |
1196 | |
1197 // now handle the general case | |
1198 unsigned char sp = *((const unsigned char *)word); | |
1199 PfxEntry * pptr = pStart[sp]; | |
1200 | |
1201 while (pptr) { | |
1202 if (isSubset(pptr->getKey(),word)) { | |
1203 if ( | |
1204 // fogemorpheme | |
1205 ((in_compound != IN_CPD_NOT) || !(pptr->getCont() && | |
1206 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))
)) && | |
1207 // permit prefixes in compounds | |
1208 ((in_compound != IN_CPD_END) || (pptr->getCont() && | |
1209 (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen
())))) | |
1210 ) { | |
1211 // check prefix | |
1212 rv = pptr->checkword(word, len, in_compound, needflag); | |
1213 if (rv) { | |
1214 pfx=pptr; // BUG: pfx not stateless | |
1215 return rv; | |
1216 } | |
1217 } | |
1218 pptr = pptr->getNextEQ(); | |
1219 } else { | |
1220 pptr = pptr->getNextNE(); | |
1221 } | |
1222 } | |
1223 | |
1224 return NULL; | |
1225 } | |
1226 | |
1227 // check word for prefixes | |
1228 struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len, | |
1229 char in_compound, const FLAG needflag) | |
1230 { | |
1231 struct hentry * rv= NULL; | |
1232 | |
1233 pfx = NULL; | |
1234 sfxappnd = NULL; | |
1235 | |
1236 // first handle the special case of 0 length prefixes | |
1237 PfxEntry * pe = pStart[0]; | |
1238 | |
1239 while (pe) { | |
1240 rv = pe->check_twosfx(word, len, in_compound, needflag); | |
1241 if (rv) return rv; | |
1242 pe = pe->getNext(); | |
1243 } | |
1244 | |
1245 // now handle the general case | |
1246 unsigned char sp = *((const unsigned char *)word); | |
1247 PfxEntry * pptr = pStart[sp]; | |
1248 | |
1249 while (pptr) { | |
1250 if (isSubset(pptr->getKey(),word)) { | |
1251 rv = pptr->check_twosfx(word, len, in_compound, needflag); | |
1252 if (rv) { | |
1253 pfx = pptr; | |
1254 return rv; | |
1255 } | |
1256 pptr = pptr->getNextEQ(); | |
1257 } else { | |
1258 pptr = pptr->getNextNE(); | |
1259 } | |
1260 } | |
1261 | |
1262 return NULL; | |
1263 } | |
1264 | |
1265 // check word for prefixes | |
1266 char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound
, | |
1267 const FLAG needflag) | |
1268 { | |
1269 char * st; | |
1270 | |
1271 char result[MAXLNLEN]; | |
1272 result[0] = '\0'; | |
1273 | |
1274 pfx = NULL; | |
1275 sfxappnd = NULL; | |
1276 | |
1277 // first handle the special case of 0 length prefixes | |
1278 PfxEntry * pe = pStart[0]; | |
1279 while (pe) { | |
1280 st = pe->check_morph(word,len,in_compound, needflag); | |
1281 if (st) { | |
1282 mystrcat(result, st, MAXLNLEN); | |
1283 free(st); | |
1284 } | |
1285 // if (rv) return rv; | |
1286 pe = pe->getNext(); | |
1287 } | |
1288 | |
1289 // now handle the general case | |
1290 unsigned char sp = *((const unsigned char *)word); | |
1291 PfxEntry * pptr = pStart[sp]; | |
1292 | |
1293 while (pptr) { | |
1294 if (isSubset(pptr->getKey(),word)) { | |
1295 st = pptr->check_morph(word,len,in_compound, needflag); | |
1296 if (st) { | |
1297 // fogemorpheme | |
1298 if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && | |
1299 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContL
en()))))) { | |
1300 mystrcat(result, st, MAXLNLEN); | |
1301 pfx = pptr; | |
1302 } | |
1303 free(st); | |
1304 } | |
1305 pptr = pptr->getNextEQ(); | |
1306 } else { | |
1307 pptr = pptr->getNextNE(); | |
1308 } | |
1309 } | |
1310 | |
1311 if (*result) return mystrdup(result); | |
1312 return NULL; | |
1313 } | |
1314 | |
1315 | |
1316 // check word for prefixes | |
1317 char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, | |
1318 char in_compound, const FLAG needflag) | |
1319 { | |
1320 char * st; | |
1321 | |
1322 char result[MAXLNLEN]; | |
1323 result[0] = '\0'; | |
1324 | |
1325 pfx = NULL; | |
1326 sfxappnd = NULL; | |
1327 | |
1328 // first handle the special case of 0 length prefixes | |
1329 PfxEntry * pe = pStart[0]; | |
1330 while (pe) { | |
1331 st = pe->check_twosfx_morph(word,len,in_compound, needflag); | |
1332 if (st) { | |
1333 mystrcat(result, st, MAXLNLEN); | |
1334 free(st); | |
1335 } | |
1336 pe = pe->getNext(); | |
1337 } | |
1338 | |
1339 // now handle the general case | |
1340 unsigned char sp = *((const unsigned char *)word); | |
1341 PfxEntry * pptr = pStart[sp]; | |
1342 | |
1343 while (pptr) { | |
1344 if (isSubset(pptr->getKey(),word)) { | |
1345 st = pptr->check_twosfx_morph(word, len, in_compound, needflag); | |
1346 if (st) { | |
1347 mystrcat(result, st, MAXLNLEN); | |
1348 free(st); | |
1349 pfx = pptr; | |
1350 } | |
1351 pptr = pptr->getNextEQ(); | |
1352 } else { | |
1353 pptr = pptr->getNextNE(); | |
1354 } | |
1355 } | |
1356 | |
1357 if (*result) return mystrdup(result); | |
1358 return NULL; | |
1359 } | |
1360 | |
1361 // Is word a non compound with a REP substitution (see checkcompoundrep)? | |
1362 int AffixMgr::cpdrep_check(const char * word, int wl) | |
1363 { | |
1364 char candidate[MAXLNLEN]; | |
1365 const char * r; | |
1366 int lenr, lenp; | |
1367 | |
1368 #ifdef HUNSPELL_CHROME_CLIENT | |
1369 const char *pattern, *pattern2; | |
1370 hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator(
); | |
1371 while (iterator.GetNext(&pattern, &pattern2)) { | |
1372 r = word; | |
1373 lenr = strlen(pattern2); | |
1374 lenp = strlen(pattern); | |
1375 | |
1376 // search every occurence of the pattern in the word | |
1377 while ((r=strstr(r, pattern)) != NULL) { | |
1378 strcpy(candidate, word); | |
1379 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break; | |
1380 strcpy(candidate+(r-word), pattern2); | |
1381 strcpy(candidate+(r-word)+lenr, r+lenp); | |
1382 if (candidate_check(candidate,strlen(candidate))) return 1; | |
1383 r++; // search for the next letter | |
1384 } | |
1385 } | |
1386 | |
1387 #else | |
1388 if ((wl < 2) || !numrep) return 0; | |
1389 | |
1390 for (int i=0; i < numrep; i++ ) { | |
1391 r = word; | |
1392 lenr = strlen(reptable[i].pattern2); | |
1393 lenp = strlen(reptable[i].pattern); | |
1394 // search every occurence of the pattern in the word | |
1395 while ((r=strstr(r, reptable[i].pattern)) != NULL) { | |
1396 strcpy(candidate, word); | |
1397 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break; | |
1398 strcpy(candidate+(r-word),reptable[i].pattern2); | |
1399 strcpy(candidate+(r-word)+lenr, r+lenp); | |
1400 if (candidate_check(candidate,strlen(candidate))) return 1; | |
1401 r++; // search for the next letter | |
1402 } | |
1403 } | |
1404 #endif | |
1405 return 0; | |
1406 } | |
1407 | |
1408 // forbid compoundings when there are special patterns at word bound | |
1409 int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2,
const char /*affixed*/) | |
1410 { | |
1411 int len; | |
1412 for (int i = 0; i < numcheckcpd; i++) { | |
1413 if (isSubset(checkcpdtable[i].pattern2, word + pos) && | |
1414 (!r1 || !checkcpdtable[i].cond || | |
1415 (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && | |
1416 (!r2 || !checkcpdtable[i].cond2 || | |
1417 (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && | |
1418 // zero length pattern => only TESTAFF | |
1419 // zero pattern (0/flag) => unmodified stem (zero affixes allowed) | |
1420 (!*(checkcpdtable[i].pattern) || ( | |
1421 (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word
+ pos - r1->blen, r1->word, r1->blen) == 0) || | |
1422 (*(checkcpdtable[i].pattern)!='0' && ((len = strlen(checkcpdtable[i]
.pattern)) != 0) && | |
1423 strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))
) { | |
1424 return 1; | |
1425 } | |
1426 } | |
1427 return 0; | |
1428 } | |
1429 | |
1430 // forbid compounding with neighbouring upper and lower case characters at word
bounds | |
1431 int AffixMgr::cpdcase_check(const char * word, int pos) | |
1432 { | |
1433 if (utf8) { | |
1434 w_char u, w; | |
1435 const char * p; | |
1436 u8_u16(&u, 1, word + pos); | |
1437 for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--); | |
1438 u8_u16(&w, 1, p); | |
1439 unsigned short a = (u.h << 8) + u.l; | |
1440 unsigned short b = (w.h << 8) + w.l; | |
1441 if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b
)) && | |
1442 (a != '-') && (b != '-')) return 1; | |
1443 } else { | |
1444 unsigned char a = *(word + pos - 1); | |
1445 unsigned char b = *(word + pos); | |
1446 if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) retu
rn 1; | |
1447 } | |
1448 return 0; | |
1449 } | |
1450 | |
1451 // check compound patterns | |
// Tests the word sequence (*words)[0..wnum], with rv stored as element wnum,
// against the numdefcpd rules held in defcpdtable.
//   words - in/out pointer to the array of entries collected so far; when
//           *words is NULL the array def is used instead
//   wnum  - index at which rv is stored
//   rv    - entry of the newest word
//   def   - fallback array, used only when *words is NULL (may be NULL)
//   all   - when zero, a rule that matched all words so far is accepted even
//           if the rule has entries left; when non-zero the rule must be
//           used up (trailing "x*" / "x?" pairs may be skipped)
// Returns 1 when a rule accepts the sequence; rv then stays in the array.
// Returns 0 otherwise; (*words)[wnum] is then reset to NULL and, if def had
// been borrowed, *words is reset to NULL as well.
1452 int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry **
def, char all) | |
1453 { | |
1454 signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtrack
ing | |
1455 signed short btwp[MAXWORDLEN]; // word positions for metacharacters | |
1456 int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positi
ons | |
1457 short bt = 0; | |
1458 int i, j; | |
1459 int ok; | |
1460 int w = 0; | |
1461 | |
// no array supplied by the caller: borrow def and remember that in w
1462 if (!*words) { | |
1463 w = 1; | |
1464 *words = def; | |
1465 } | |
1466 | |
1467 if (!*words) { | |
1468 return 0; | |
1469 } | |
1470 | |
1471 (*words)[wnum] = rv; | |
1472 | |
1473 // has the last word COMPOUNDRULE flag? | |
1474 if (rv->alen == 0) { | |
1475 (*words)[wnum] = NULL; | |
1476 if (w) *words = NULL; | |
1477 return 0; | |
1478 } | |
// rv must carry at least one flag that occurs in some rule as a literal
// entry (an entry other than '*' or '?')
1479 ok = 0; | |
1480 for (i = 0; i < numdefcpd; i++) { | |
1481 for (j = 0; j < defcpdtable[i].len; j++) { | |
1482 if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' && | |
1483 TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1; | |
1484 } | |
1485 } | |
1486 if (ok == 0) { | |
1487 (*words)[wnum] = NULL; | |
1488 if (w) *words = NULL; | |
1489 return 0; | |
1490 } | |
1491 | |
// match the words against each rule in turn; pp walks the rule, wp walks the
// words, and the bt* arrays record where a "flag*" / "flag?" pair was
// expanded so that a shorter expansion can be retried
1492 for (i = 0; i < numdefcpd; i++) { | |
1493 signed short pp = 0; // pattern position | |
1494 signed short wp = 0; // "words" position | |
1495 int ok2; | |
1496 ok = 1; | |
1497 ok2 = 1; | |
1498 do { | |
1499 while ((pp < defcpdtable[i].len) && (wp <= wnum)) { | |
// rule entry followed by '*' or '?': consume matching words greedily, up to
// one word for '?' (wend = wp) and up to the last word for '*' (wend = wnum)
1500 if (((pp+1) < defcpdtable[i].len) && | |
1501 ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'
))) { | |
1502 int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum; | |
1503 ok2 = 1; | |
1504 pp+=2; | |
1505 btpp[bt] = pp; | |
1506 btwp[bt] = wp; | |
1507 while (wp <= wend) { | |
1508 if (!(*words)[wp]->alen || | |
1509 !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words
)[wp]->alen)) { | |
1510 ok2 = 0; | |
1511 break; | |
1512 } | |
1513 wp++; | |
1514 } | |
1515 if (wp <= wnum) ok2 = 0; | |
1516 btnum[bt] = wp - btwp[bt]; | |
1517 if (btnum[bt] > 0) bt++; | |
1518 if (ok2) break; | |
1519 } else { | |
// plain rule entry: the current word must carry exactly this flag
1520 ok2 = 1; | |
1521 if (!(*words)[wp] || !(*words)[wp]->alen || | |
1522 !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]-
>alen)) { | |
1523 ok = 0; | |
1524 break; | |
1525 } | |
1526 pp++; | |
1527 wp++; | |
1528 if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0; | |
1529 } | |
1530 } | |
// all words matched: accept if the rest of the rule consists only of
// "flag*" / "flag?" pairs
1531 if (ok && ok2) { | |
1532 int r = pp; | |
1533 while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) && | |
1534 ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'
))) r+=2; | |
1535 if (defcpdtable[i].len <= r) return 1; | |
1536 } | |
// retry from the most recent recorded expansion with one word fewer; an
// expansion whose count drops below zero is popped and the one before it
// is tried
1537 // backtrack | |
1538 if (bt) do { | |
1539 ok = 1; | |
1540 btnum[bt - 1]--; | |
1541 pp = btpp[bt - 1]; | |
1542 wp = btwp[bt - 1] + (signed short) btnum[bt - 1]; | |
1543 } while ((btnum[bt - 1] < 0) && --bt); | |
1544 } while (bt); | |
1545 | |
1546 if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; | |
1547 | |
1548 // check zero ending | |
1549 while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len)
&& | |
1550 ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp
+=2; | |
1551 if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1; | |
1552 } | |
// no rule accepted the sequence: take rv out of the array again
1553 (*words)[wnum] = NULL; | |
1554 if (w) *words = NULL; | |
1555 return 0; | |
1556 } | |
1557 | |
1558 inline int AffixMgr::candidate_check(const char * word, int len) | |
1559 { | |
1560 struct hentry * rv=NULL; | |
1561 | |
1562 rv = lookup(word); | |
1563 if (rv) return 1; | |
1564 | |
1565 // rv = prefix_check(word,len,1); | |
1566 // if (rv) return 1; | |
1567 | |
1568 rv = affix_check(word,len); | |
1569 if (rv) return 1; | |
1570 return 0; | |
1571 } | |
1572 | |
1573 // calculate number of syllable for compound-checking | |
1574 short AffixMgr::get_syllable(const char * word, int wlen) | |
1575 { | |
1576 if (cpdmaxsyllable==0) return 0; | |
1577 | |
1578 short num=0; | |
1579 | |
1580 if (!utf8) { | |
1581 for (int i=0; i<wlen; i++) { | |
1582 if (strchr(cpdvowels, word[i])) num++; | |
1583 } | |
1584 } else if (cpdvowels_utf16) { | |
1585 w_char w[MAXWORDUTF8LEN]; | |
1586 int i = u8_u16(w, MAXWORDUTF8LEN, word); | |
1587 for (; i > 0; i--) { | |
1588 if (flag_bsearch((unsigned short *) cpdvowels_utf16, | |
1589 ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++; | |
1590 } | |
1591 } | |
1592 return num; | |
1593 } | |
1594 | |
1595 void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) { | |
1596 if (utf8) { | |
1597 int i; | |
1598 for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) { | |
1599 for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++); | |
1600 } | |
1601 for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) { | |
1602 for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--); | |
1603 } | |
1604 } else { | |
1605 *cmin = cpdmin; | |
1606 *cmax = len - cpdmin + 1; | |
1607 } | |
1608 } | |
1609 | |
1610 | |
1611 // check if compound word is correctly spelled | |
1612 // hu_mov_rule = spec. Hungarian rule (XXX) | |
1613 struct hentry * AffixMgr::compound_check(const char * word, int len, | |
1614 short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** wo
rds = NULL, | |
1615 char hu_mov_rule = 0, char is_sug = 0, int * info = NULL) | |
1616 { | |
1617 int i; | |
1618 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; | |
1619 struct hentry * rv = NULL; | |
1620 struct hentry * rv_first; | |
1621 struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking | |
1622 char st [MAXWORDUTF8LEN + 4]; | |
1623 char ch = '\0'; | |
1624 int cmin; | |
1625 int cmax; | |
1626 int striple = 0; | |
1627 int scpd = 0; | |
1628 int soldi = 0; | |
1629 int oldcmin = 0; | |
1630 int oldcmax = 0; | |
1631 int oldlen = 0; | |
1632 int checkedstriple = 0; | |
1633 int onlycpdrule; | |
1634 char affixed = 0; | |
1635 hentry ** oldwords = words; | |
1636 | |
1637 int checked_prefix; | |
1638 | |
1639 setcminmax(&cmin, &cmax, word, len); | |
1640 | |
1641 strcpy(st, word); | |
1642 | |
1643 for (i = cmin; i < cmax; i++) { | |
1644 // go to end of the UTF-8 character | |
1645 if (utf8) { | |
1646 for (; (st[i] & 0xc0) == 0x80; i++); | |
1647 if (i >= cmax) return NULL; | |
1648 } | |
1649 | |
1650 words = oldwords; | |
1651 onlycpdrule = (words) ? 1 : 0; | |
1652 | |
1653 do { // onlycpdrule loop | |
1654 | |
1655 oldnumsyllable = numsyllable; | |
1656 oldwordnum = wordnum; | |
1657 checked_prefix = 0; | |
1658 | |
1659 | |
1660 do { // simplified checkcompoundpattern loop | |
1661 | |
1662 if (scpd > 0) { | |
1663 for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 || | |
1664 strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtab
le[scpd-1].pattern3)) != 0); scpd++); | |
1665 | |
1666 if (scpd > numcheckcpd) break; // break simplified checkcompoundpatter
n loop | |
1667 strcpy(st + i, checkcpdtable[scpd-1].pattern); | |
1668 soldi = i; | |
1669 i += strlen(checkcpdtable[scpd-1].pattern); | |
1670 strcpy(st + i, checkcpdtable[scpd-1].pattern2); | |
1671 strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi +
strlen(checkcpdtable[scpd-1].pattern3)); | |
1672 | |
1673 oldlen = len; | |
1674 len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[sc
pd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3); | |
1675 oldcmin = cmin; | |
1676 oldcmax = cmax; | |
1677 setcminmax(&cmin, &cmax, st, len); | |
1678 | |
1679 cmax = len - cpdmin + 1; | |
1680 } | |
1681 | |
1682 ch = st[i]; | |
1683 st[i] = '\0'; | |
1684 | |
1685 sfx = NULL; | |
1686 pfx = NULL; | |
1687 | |
1688 // FIRST WORD | |
1689 | |
1690 affixed = 1; | |
1691 rv = lookup(st); // perhaps without prefix | |
1692 | |
1693 // search homonym with compound flag | |
1694 while ((rv) && !hu_mov_rule && | |
1695 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || | |
1696 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, c
ompoundflag, rv->alen)) || | |
1697 (compoundbegin && !wordnum && !onlycpdrule && | |
1698 TESTAFF(rv->astr, compoundbegin, rv->alen)) || | |
1699 (compoundmiddle && wordnum && !words && !onlycpdrule && | |
1700 TESTAFF(rv->astr, compoundmiddle, rv->alen)) || | |
1701 (numdefcpd && onlycpdrule && | |
1702 ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hent
ry **) &rwords, 0)) || | |
1703 (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords
, 0))))) || | |
1704 (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL && | |
1705 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen))) | |
1706 ) { | |
1707 rv = rv->next_homonym; | |
1708 } | |
1709 | |
1710 if (rv) affixed = 0; | |
1711 | |
1712 if (!rv) { | |
1713 if (onlycpdrule) break; | |
1714 if (compoundflag && | |
1715 !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGI
N, compoundflag))) { | |
1716 if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, | |
1717 FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN
_CPD_BEGIN)) || | |
1718 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i
, 0, NULL, compoundflag)))) && !hu_mov_rule && | |
1719 sfx->getCont() && | |
1720 ((compoundforbidflag && TESTAFF(sfx->getCont(), compound
forbidflag, | |
1721 sfx->getContLen())) || (compoundend && | |
1722 TESTAFF(sfx->getCont(), compoundend, | |
1723 sfx->getContLen())))) { | |
1724 rv = NULL; | |
1725 } | |
1726 } | |
1727 | |
1728 if (rv || | |
1729 (((wordnum == 0) && compoundbegin && | |
1730 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, co
mpoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || | |
1731 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NUL
L, compoundbegin))) || // twofold suffixes + compound | |
1732 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BE
GIN, compoundbegin)))) || | |
1733 ((wordnum > 0) && compoundmiddle && | |
1734 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, co
mpoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || | |
1735 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NUL
L, compoundmiddle))) || // twofold suffixes + compound | |
1736 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BE
GIN, compoundmiddle))))) | |
1737 ) checked_prefix = 1; | |
1738 // else check forbiddenwords and needaffix | |
1739 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || | |
1740 TESTAFF(rv->astr, needaffix, rv->alen) || | |
1741 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || | |
1742 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)) | |
1743 )) { | |
1744 st[i] = ch; | |
1745 //continue; | |
1746 break; | |
1747 } | |
1748 | |
1749 // check non_compound flag in suffix and prefix | |
1750 if ((rv) && !hu_mov_rule && | |
1751 ((pfx && pfx->getCont() && | |
1752 TESTAFF(pfx->getCont(), compoundforbidflag, | |
1753 pfx->getContLen())) || | |
1754 (sfx && sfx->getCont() && | |
1755 TESTAFF(sfx->getCont(), compoundforbidflag, | |
1756 sfx->getContLen())))) { | |
1757 rv = NULL; | |
1758 } | |
1759 | |
1760 // check compoundend flag in suffix and prefix | |
1761 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && | |
1762 ((pfx && pfx->getCont() && | |
1763 TESTAFF(pfx->getCont(), compoundend, | |
1764 pfx->getContLen())) || | |
1765 (sfx && sfx->getCont() && | |
1766 TESTAFF(sfx->getCont(), compoundend, | |
1767 sfx->getContLen())))) { | |
1768 rv = NULL; | |
1769 } | |
1770 | |
1771 // check compoundmiddle flag in suffix and prefix | |
1772 if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu
_mov_rule && | |
1773 ((pfx && pfx->getCont() && | |
1774 TESTAFF(pfx->getCont(), compoundmiddle, | |
1775 pfx->getContLen())) || | |
1776 (sfx && sfx->getCont() && | |
1777 TESTAFF(sfx->getCont(), compoundmiddle, | |
1778 sfx->getContLen())))) { | |
1779 rv = NULL; | |
1780 } | |
1781 | |
1782 // check forbiddenwords | |
1783 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || | |
1784 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || | |
1785 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) { | |
1786 return NULL; | |
1787 } | |
1788 | |
1789 // increment word number, if the second root has a compoundroot flag | |
1790 if ((rv) && compoundroot && | |
1791 (TESTAFF(rv->astr, compoundroot, rv->alen))) { | |
1792 wordnum++; | |
1793 } | |
1794 | |
1795 // first word is acceptable in compound words? | |
1796 if (((rv) && | |
1797 ( checked_prefix || (words && words[wnum]) || | |
1798 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || | |
1799 ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbeg
in, rv->alen)) || | |
1800 ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmid
dle, rv->alen))// || | |
1801 // (numdefcpd && ) | |
1802 | |
1803 // LANG_hu section: spec. Hungarian rule | |
1804 || ((langnum == LANG_hu) && hu_mov_rule && ( | |
1805 TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungari
an dictionary codes | |
1806 TESTAFF(rv->astr, 'G', rv->alen) || | |
1807 TESTAFF(rv->astr, 'H', rv->alen) | |
1808 ) | |
1809 ) | |
1810 // END of LANG_hu section | |
1811 ) && | |
1812 ( | |
1813 // test CHECKCOMPOUNDPATTERN conditions | |
1814 scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL || | |
1815 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen) | |
1816 ) | |
1817 && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple l
etters | |
1818 (word[i-1]==word[i]) && ( | |
1819 ((i>1) && (word[i-1]==word[i-2])) || | |
1820 ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' | |
1821 ) | |
1822 ) || | |
1823 ( | |
1824 checkcompoundcase && scpd == 0 && !words && cpdcase_check(word,
i) | |
1825 )) | |
1826 ) | |
1827 // LANG_hu section: spec. Hungarian rule | |
1828 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(s
t,i)) && | |
1829 (sfx && sfx->getCont() && ( // XXX hardwired Hungarian dic. codes | |
1830 TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getCo
ntLen()) || | |
1831 TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getCo
ntLen()) | |
1832 ) | |
1833 ) | |
1834 ) | |
1835 ) { // first word is ok condition | |
1836 | |
1837 // LANG_hu section: spec. Hungarian rule | |
1838 if (langnum == LANG_hu) { | |
1839 // calculate syllable number of the word | |
1840 numsyllable += get_syllable(st, i); | |
1841 // + 1 word, if syllable number of the prefix > 1 (hungarian con
vention) | |
1842 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) >
1)) wordnum++; | |
1843 } | |
1844 // END of LANG_hu section | |
1845 | |
1846 // NEXT WORD(S) | |
1847 rv_first = rv; | |
1848 st[i] = ch; | |
1849 | |
1850 do { // striple loop | |
1851 | |
1852 // check simplifiedtriple | |
1853 if (simplifiedtriple) { | |
1854 if (striple) { | |
1855 checkedstriple = 1; | |
1856 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" | |
1857 } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1; | |
1858 } | |
1859 | |
1860 rv = lookup((st+i)); // perhaps without prefix | |
1861 | |
1862 // search homonym with compound flag | |
1863 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || | |
1864 !((compoundflag && !words && TESTAFF(rv->astr, compoundf
lag, rv->alen)) || | |
1865 (compoundend && !words && TESTAFF(rv->astr, compounden
d, rv->alen)) || | |
1866 (numdefcpd && words && defcpd_check(&words, wnum + 1,
rv, NULL,1))) || | |
1867 (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_N
ULL && | |
1868 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2,
rv->alen)) | |
1869 )) { | |
1870 rv = rv->next_homonym; | |
1871 } | |
1872 | |
1873 // check FORCEUCASE | |
1874 if (rv && forceucase && (rv) && | |
1875 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & S
PELL_ORIGCAP)) rv = NULL; | |
1876 | |
1877 if (rv && words && words[wnum + 1]) return rv_first; | |
1878 | |
1879 oldnumsyllable2 = numsyllable; | |
1880 oldwordnum2 = wordnum; | |
1881 | |
1882 | |
1883 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code | |
1884 if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen
)) && !(TESTAFF(rv->astr, 'J', rv->alen))) { | |
1885 numsyllable--; | |
1886 } | |
1887 // END of LANG_hu section | |
1888 | |
1889 // increment word number, if the second root has a compoundroot flag | |
1890 if ((rv) && (compoundroot) && | |
1891 (TESTAFF(rv->astr, compoundroot, rv->alen))) { | |
1892 wordnum++; | |
1893 } | |
1894 | |
1895 // check forbiddenwords | |
1896 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen
) || | |
1897 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || | |
1898 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))))
return NULL; | |
1899 | |
1900 // second word is acceptable, as a root? | |
1901 // hungarian conventions: compounding is acceptable, | |
1902 // when compound forms consist of 2 words, or if more, | |
1903 // then the syllable number of root words must be 6, or lesser. | |
1904 | |
1905 if ((rv) && ( | |
1906 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)
) || | |
1907 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)) | |
1908 ) | |
1909 && ( | |
1910 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || | |
1911 ((cpdmaxsyllable!=0) && | |
1912 (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)
<=cpdmaxsyllable)) | |
1913 ) && | |
1914 ( | |
1915 // test CHECKCOMPOUNDPATTERN | |
1916 !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, r
v, 0) | |
1917 ) && | |
1918 ( | |
1919 (!checkcompounddup || (rv != rv_first)) | |
1920 ) | |
1921 // test CHECKCOMPOUNDPATTERN conditions | |
1922 && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || | |
1923 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) | |
1924 ) | |
1925 { | |
1926 // forbid compound word, if it is a non compound word with
typical fault | |
1927 if (checkcompoundrep && cpdrep_check(word,len)) return NUL
L; | |
1928 return rv_first; | |
1929 } | |
1930 | |
1931 numsyllable = oldnumsyllable2; | |
1932 wordnum = oldwordnum2; | |
1933 | |
1934 // perhaps second word has prefix or/and suffix | |
1935 sfx = NULL; | |
1936 sfxflag = FLAG_NULL; | |
1937 rv = (compoundflag && !onlycpdrule) ? affix_check((word+i),strlen(wo
rd+i), compoundflag, IN_CPD_END) : NULL; | |
1938 if (!rv && compoundend && !onlycpdrule) { | |
1939 sfx = NULL; | |
1940 pfx = NULL; | |
1941 rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_EN
D); | |
1942 } | |
1943 | |
1944 if (!rv && numdefcpd && words) { | |
1945 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); | |
1946 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv
_first; | |
1947 rv = NULL; | |
1948 } | |
1949 | |
1950 // test CHECKCOMPOUNDPATTERN conditions (allowed forms) | |
1951 if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
| |
1952 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv =
NULL; | |
1953 | |
1954 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) | |
1955 if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first
, rv, affixed)) rv = NULL; | |
1956 | |
1957 // check non_compound flag in suffix and prefix | |
1958 if ((rv) && | |
1959 ((pfx && pfx->getCont() && | |
1960 TESTAFF(pfx->getCont(), compoundforbidflag, | |
1961 pfx->getContLen())) || | |
1962 (sfx && sfx->getCont() && | |
1963 TESTAFF(sfx->getCont(), compoundforbidflag, | |
1964 sfx->getContLen())))) { | |
1965 rv = NULL; | |
1966 } | |
1967 | |
1968 // check FORCEUCASE | |
1969 if (rv && forceucase && (rv) && | |
1970 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & S
PELL_ORIGCAP)) rv = NULL; | |
1971 | |
1972 // check forbiddenwords | |
1973 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen
) || | |
1974 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || | |
1975 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))))
return NULL; | |
1976 | |
1977 // pfxappnd = prefix of word+i, or NULL | |
1978 // calculate syllable number of prefix. | |
1979 // hungarian convention: when syllable number of prefix is more, | |
1980 // than 1, the prefix+word counts as two words. | |
1981 | |
1982 if (langnum == LANG_hu) { | |
1983 // calculate syllable number of the word | |
1984 numsyllable += get_syllable(word + i, strlen(word + i)); | |
1985 | |
1986 // - affix syllable num. | |
1987 // XXX only second suffix (inflections, not derivations) | |
1988 if (sfxappnd) { | |
1989 char * tmp = myrevstrdup(sfxappnd); | |
1990 numsyllable -= get_syllable(tmp, strlen(tmp)); | |
1991 free(tmp); | |
1992 } | |
1993 | |
1994 // + 1 word, if syllable number of the prefix > 1 (hungarian con
vention) | |
1995 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) >
1)) wordnum++; | |
1996 | |
1997 // increment syllable num, if last word has a SYLLABLENUM flag | |
1998 // and the suffix is beginning `s' | |
1999 | |
2000 if (cpdsyllablenum) { | |
2001 switch (sfxflag) { | |
2002 case 'c': { numsyllable+=2; break; } | |
2003 case 'J': { numsyllable += 1; break; } | |
2004 case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen))
numsyllable += 1; break; } | |
2005 } | |
2006 } | |
2007 } | |
2008 | |
2009 // increment word number, if the second word has a compoundroot flag | |
2010 if ((rv) && (compoundroot) && | |
2011 (TESTAFF(rv->astr, compoundroot, rv->alen))) { | |
2012 wordnum++; | |
2013 } | |
2014 | |
2015 // second word is acceptable, as a word with prefix or/and suffix? | |
2016 // hungarian conventions: compounding is acceptable, | |
2017 // when compound forms consist 2 word, otherwise | |
2018 // the syllable number of root words is 6, or lesser. | |
2019 if ((rv) && | |
2020 ( | |
2021 ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || | |
2022 ((cpdmaxsyllable != 0) && | |
2023 (numsyllable <= cpdmaxsyllable)) | |
2024 ) | |
2025 && ( | |
2026 (!checkcompounddup || (rv != rv_first)) | |
2027 )) { | |
2028 // forbid compound word, if it is a non compound word with t
ypical fault | |
2029 if (checkcompoundrep && cpdrep_check(word, len)) return NULL
; | |
2030 return rv_first; | |
2031 } | |
2032 | |
2033 numsyllable = oldnumsyllable2; | |
2034 wordnum = oldwordnum2; | |
2035 | |
2036 // perhaps second word is a compound word (recursive call) | |
2037 if (wordnum < maxwordnum) { | |
2038 rv = compound_check((st+i),strlen(st+i), wordnum+1, | |
2039 numsyllable, maxwordnum, wnum + 1, words, 0, is_sug, info); | |
2040 | |
2041 if (rv && numcheckcpd && ((scpd == 0 && cpdpat_check(word, i, rv
_first, rv, affixed)) || | |
2042 (scpd != 0 && !cpdpat_check(word, i, rv_first, rv, affixed)))
) rv = NULL; | |
2043 } else { | |
2044 rv=NULL; | |
2045 } | |
2046 if (rv) { | |
2047 // forbid compound word, if it is a non compound word with typic
al fault | |
2048 if (checkcompoundrep || forbiddenword) { | |
2049 struct hentry * rv2 = NULL; | |
2050 | |
2051 if (checkcompoundrep && cpdrep_check(word, len)) return NULL
; | |
2052 | |
2053 // check first part | |
2054 if (strncmp(rv->word, word + i, rv->blen) == 0) { | |
2055 char r = *(st + i + rv->blen); | |
2056 *(st + i + rv->blen) = '\0'; | |
2057 | |
2058 if (checkcompoundrep && cpdrep_check(st, i + rv->blen))
{ | |
2059 *(st + i + rv->blen) = r; | |
2060 continue; | |
2061 } | |
2062 | |
2063 if (forbiddenword) { | |
2064 rv2 = lookup(word); | |
2065 if (!rv2) rv2 = affix_check(word, len); | |
2066 if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbidden
word, rv2->alen) && | |
2067 (strncmp(rv2->word, st, i + rv->blen) == 0)) { | |
2068 return NULL; | |
2069 } | |
2070 } | |
2071 *(st + i + rv->blen) = r; | |
2072 } | |
2073 } | |
2074 return rv_first; | |
2075 } | |
2076 } while (striple && !checkedstriple); // end of striple loop | |
2077 | |
2078 if (checkedstriple) { | |
2079 i++; | |
2080 checkedstriple = 0; | |
2081 striple = 0; | |
2082 } | |
2083 | |
2084 } // first word is ok condition | |
2085 | |
2086 if (soldi != 0) { | |
2087 i = soldi; | |
2088 soldi = 0; | |
2089 len = oldlen; | |
2090 cmin = oldcmin; | |
2091 cmax = oldcmax; | |
2092 } | |
2093 scpd++; | |
2094 | |
2095 | |
2096 } while (!onlycpdrule && simplifiedcpd && scpd <= numcheckcpd); // end o
f simplifiedcpd loop | |
2097 | |
2098 scpd = 0; | |
2099 wordnum = oldwordnum; | |
2100 numsyllable = oldnumsyllable; | |
2101 | |
2102 if (soldi != 0) { | |
2103 i = soldi; | |
2104 strcpy(st, word); // XXX add more optim. | |
2105 soldi = 0; | |
2106 } else st[i] = ch; | |
2107 | |
2108 } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule =
1)); // end of onlycpd loop | |
2109 | |
2110 } | |
2111 | |
2112 return NULL; | |
2113 } | |
2114 | |
2115 // check if compound word is correctly spelled | |
2116 // hu_mov_rule = spec. Hungarian rule (XXX) | |
2117 int AffixMgr::compound_check_morph(const char * word, int len, | |
2118 short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** wo
rds, | |
2119 char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL) | |
2120 { | |
2121 int i; | |
2122 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; | |
2123 int ok = 0; | |
2124 | |
2125 struct hentry * rv = NULL; | |
2126 struct hentry * rv_first; | |
2127 struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking | |
2128 char st [MAXWORDUTF8LEN + 4]; | |
2129 char ch; | |
2130 | |
2131 int checked_prefix; | |
2132 char presult[MAXLNLEN]; | |
2133 | |
2134 int cmin; | |
2135 int cmax; | |
2136 | |
2137 int onlycpdrule; | |
2138 char affixed = 0; | |
2139 hentry ** oldwords = words; | |
2140 | |
2141 setcminmax(&cmin, &cmax, word, len); | |
2142 | |
2143 strcpy(st, word); | |
2144 | |
2145 for (i = cmin; i < cmax; i++) { | |
2146 oldnumsyllable = numsyllable; | |
2147 oldwordnum = wordnum; | |
2148 checked_prefix = 0; | |
2149 | |
2150 // go to end of the UTF-8 character | |
2151 if (utf8) { | |
2152 for (; (st[i] & 0xc0) == 0x80; i++); | |
2153 if (i >= cmax) return 0; | |
2154 } | |
2155 | |
2156 words = oldwords; | |
2157 onlycpdrule = (words) ? 1 : 0; | |
2158 | |
2159 do { // onlycpdrule loop | |
2160 | |
2161 oldnumsyllable = numsyllable; | |
2162 oldwordnum = wordnum; | |
2163 checked_prefix = 0; | |
2164 | |
2165 ch = st[i]; | |
2166 st[i] = '\0'; | |
2167 sfx = NULL; | |
2168 | |
2169 // FIRST WORD | |
2170 | |
2171 affixed = 1; | |
2172 | |
2173 *presult = '\0'; | |
2174 if (partresult) mystrcat(presult, partresult, MAXLNLEN); | |
2175 | |
2176 rv = lookup(st); // perhaps without prefix | |
2177 | |
2178 // search homonym with compound flag | |
2179 while ((rv) && !hu_mov_rule && | |
2180 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || | |
2181 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, c
ompoundflag, rv->alen)) || | |
2182 (compoundbegin && !wordnum && !onlycpdrule && | |
2183 TESTAFF(rv->astr, compoundbegin, rv->alen)) || | |
2184 (compoundmiddle && wordnum && !words && !onlycpdrule && | |
2185 TESTAFF(rv->astr, compoundmiddle, rv->alen)) || | |
2186 (numdefcpd && onlycpdrule && | |
2187 ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hent
ry **) &rwords, 0)) || | |
2188 (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords
, 0)))) | |
2189 ))) { | |
2190 rv = rv->next_homonym; | |
2191 } | |
2192 | |
2193 if (rv) affixed = 0; | |
2194 | |
2195 if (rv) { | |
2196 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, s
t); | |
2197 if (!HENTRY_FIND(rv, MORPH_STEM)) { | |
2198 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STE
M, st); | |
2199 } | |
2200 // store the pointer of the hash entry | |
2201 // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTR
Y, rv); | |
2202 if (HENTRY_DATA(rv)) { | |
2203 sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA
2(rv)); | |
2204 } | |
2205 } | |
2206 | |
2207 if (!rv) { | |
2208 if (onlycpdrule) break; | |
2209 if (compoundflag && | |
2210 !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGI
N, compoundflag))) { | |
2211 if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, | |
2212 FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN
_CPD_BEGIN)) || | |
2213 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i
, 0, NULL, compoundflag)))) && !hu_mov_rule && | |
2214 sfx->getCont() && | |
2215 ((compoundforbidflag && TESTAFF(sfx->getCont(), compound
forbidflag, | |
2216 sfx->getContLen())) || (compoundend && | |
2217 TESTAFF(sfx->getCont(), compoundend, | |
2218 sfx->getContLen())))) { | |
2219 rv = NULL; | |
2220 } | |
2221 } | |
2222 | |
2223 if (rv || | |
2224 (((wordnum == 0) && compoundbegin && | |
2225 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, co
mpoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || | |
2226 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NUL
L, compoundbegin))) || // twofold suffix+compound | |
2227 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BE
GIN, compoundbegin)))) || | |
2228 ((wordnum > 0) && compoundmiddle && | |
2229 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, co
mpoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || | |
2230 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NUL
L, compoundmiddle))) || // twofold suffix+compound | |
2231 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BE
GIN, compoundmiddle))))) | |
2232 ) { | |
2233 // char * p = prefix_check_morph(st, i, 0, compound); | |
2234 char * p = NULL; | |
2235 if (compoundflag) p = affix_check_morph(st, i, compoundflag); | |
2236 if (!p || (*p == '\0')) { | |
2237 if (p) free(p); | |
2238 p = NULL; | |
2239 if ((wordnum == 0) && compoundbegin) { | |
2240 p = affix_check_morph(st, i, compoundbegin); | |
2241 } else if ((wordnum > 0) && compoundmiddle) { | |
2242 p = affix_check_morph(st, i, compoundmiddle);
| |
2243 } | |
2244 } | |
2245 if (p && (*p != '\0')) { | |
2246 sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, | |
2247 MORPH_PART, st, line_uniq_app(&p, MSEP_REC)); | |
2248 } | |
2249 if (p) free(p); | |
2250 checked_prefix = 1; | |
2251 } | |
2252 // else check forbiddenwords | |
2253 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || | |
2254 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || | |
2255 TESTAFF(rv->astr, needaffix, rv->alen))) { | |
2256 st[i] = ch; | |
2257 continue; | |
2258 } | |
2259 | |
2260 // check non_compound flag in suffix and prefix | |
2261 if ((rv) && !hu_mov_rule && | |
2262 ((pfx && pfx->getCont() && | |
2263 TESTAFF(pfx->getCont(), compoundforbidflag, | |
2264 pfx->getContLen())) || | |
2265 (sfx && sfx->getCont() && | |
2266 TESTAFF(sfx->getCont(), compoundforbidflag, | |
2267 sfx->getContLen())))) { | |
2268 continue; | |
2269 } | |
2270 | |
2271 // check compoundend flag in suffix and prefix | |
2272 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && | |
2273 ((pfx && pfx->getCont() && | |
2274 TESTAFF(pfx->getCont(), compoundend, | |
2275 pfx->getContLen())) || | |
2276 (sfx && sfx->getCont() && | |
2277 TESTAFF(sfx->getCont(), compoundend, | |
2278 sfx->getContLen())))) { | |
2279 continue; | |
2280 } | |
2281 | |
2282 // check compoundmiddle flag in suffix and prefix | |
2283 if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu
_mov_rule && | |
2284 ((pfx && pfx->getCont() && | |
2285 TESTAFF(pfx->getCont(), compoundmiddle, | |
2286 pfx->getContLen())) || | |
2287 (sfx && sfx->getCont() && | |
2288 TESTAFF(sfx->getCont(), compoundmiddle, | |
2289 sfx->getContLen())))) { | |
2290 rv = NULL; | |
2291 } | |
2292 | |
2293 // check forbiddenwords | |
2294 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) | |
2295 || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) continue; | |
2296 | |
2297 // increment word number, if the second root has a compoundroot flag | |
2298 if ((rv) && (compoundroot) && | |
2299 (TESTAFF(rv->astr, compoundroot, rv->alen))) { | |
2300 wordnum++; | |
2301 } | |
2302 | |
2303 // first word is acceptable in compound words? | |
2304 if (((rv) && | |
2305 ( checked_prefix || (words && words[wnum]) || | |
2306 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || | |
2307 ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbeg
in, rv->alen)) || | |
2308 ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmid
dle, rv->alen)) | |
2309 // LANG_hu section: spec. Hungarian rule | |
2310 || ((langnum == LANG_hu) && // hu_mov_rule | |
2311 hu_mov_rule && ( | |
2312 TESTAFF(rv->astr, 'F', rv->alen) || | |
2313 TESTAFF(rv->astr, 'G', rv->alen) || | |
2314 TESTAFF(rv->astr, 'H', rv->alen) | |
2315 ) | |
2316 ) | |
2317 // END of LANG_hu section | |
2318 ) | |
2319 && ! (( checkcompoundtriple && !words && // test triple letters | |
2320 (word[i-1]==word[i]) && ( | |
2321 ((i>1) && (word[i-1]==word[i-2])) || | |
2322 ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' | |
2323 ) | |
2324 ) || | |
2325 ( | |
2326 // test CHECKCOMPOUNDPATTERN | |
2327 numcheckcpd && !words && cpdpat_check(word, i, rv, NULL, affi
xed) | |
2328 ) || | |
2329 ( | |
2330 checkcompoundcase && !words && cpdcase_check(word, i) | |
2331 )) | |
2332 ) | |
2333 // LANG_hu section: spec. Hungarian rule | |
2334 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(s
t,i)) && | |
2335 (sfx && sfx->getCont() && ( | |
2336 TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getCo
ntLen()) || | |
2337 TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getCo
ntLen()) | |
2338 ) | |
2339 ) | |
2340 ) | |
2341 // END of LANG_hu section | |
2342 ) { | |
2343 | |
2344 // LANG_hu section: spec. Hungarian rule | |
2345 if (langnum == LANG_hu) { | |
2346 // calculate syllable number of the word | |
2347 numsyllable += get_syllable(st, i); | |
2348 | |
2349 // + 1 word, if syllable number of the prefix > 1 (hungarian con
vention) | |
2350 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) >
1)) wordnum++; | |
2351 } | |
2352 // END of LANG_hu section | |
2353 | |
2354 // NEXT WORD(S) | |
2355 rv_first = rv; | |
2356 rv = lookup((word+i)); // perhaps without prefix | |
2357 | |
2358 // search homonym with compound flag | |
2359 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || | |
2360 !((compoundflag && !words && TESTAFF(rv->astr, compoundf
lag, rv->alen)) || | |
2361 (compoundend && !words && TESTAFF(rv->astr, compounden
d, rv->alen)) || | |
2362 (numdefcpd && words && defcpd_check(&words, wnum + 1,
rv, NULL,1))))) { | |
2363 rv = rv->next_homonym; | |
2364 } | |
2365 | |
2366 if (rv && words && words[wnum + 1]) { | |
2367 mystrcat(*result, presult, MAXLNLEN); | |
2368 mystrcat(*result, " ", MAXLNLEN); | |
2369 mystrcat(*result, MORPH_PART, MAXLNLEN); | |
2370 mystrcat(*result, word+i, MAXLNLEN); | |
2371 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(*result, HENT
RY_DATA2(rv), MAXLNLEN); | |
2372 if (!HENTRY_FIND(rv, MORPH_STEM)) { | |
2373 mystrcat(*result, " ", MAXLNLEN); | |
2374 mystrcat(*result, MORPH_STEM, MAXLNLEN); | |
2375 mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); | |
2376 } | |
2377 // store the pointer of the hash entry | |
2378 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv
); | |
2379 if (!complexprefixes && HENTRY_DATA(rv)) { | |
2380 mystrcat(*result, " ", MAXLNLEN); | |
2381 mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); | |
2382 } | |
2383 mystrcat(*result, "\n", MAXLNLEN); | |
2384 ok = 1; | |
2385 return 0; | |
2386 } | |
2387 | |
2388 oldnumsyllable2 = numsyllable; | |
2389 oldwordnum2 = wordnum; | |
2390 | |
2391 // LANG_hu section: spec. Hungarian rule | |
2392 if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen
)) && !(TESTAFF(rv->astr, 'J', rv->alen))) { | |
2393 numsyllable--; | |
2394 } | |
2395 // END of LANG_hu section | |
2396 // increment word number, if the second root has a compoundroot flag | |
2397 if ((rv) && (compoundroot) && | |
2398 (TESTAFF(rv->astr, compoundroot, rv->alen))) { | |
2399 wordnum++; | |
2400 } | |
2401 | |
2402 // check forbiddenwords | |
2403 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen
) || | |
2404 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) { | |
2405 st[i] = ch; | |
2406 continue; | |
2407 } | |
2408 | |
2409 // second word is acceptable, as a root? | |
2410 // hungarian conventions: compounding is acceptable, | |
2411 // when compound forms consist of 2 words, or if more, | |
2412 // then the syllable number of root words must be 6, or lesser. | |
2413 if ((rv) && ( | |
2414 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)
) || | |
2415 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)) | |
2416 ) | |
2417 && ( | |
2418 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || | |
2419 ((cpdmaxsyllable!=0) && | |
2420 (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=c
pdmaxsyllable)) | |
2421 ) | |
2422 && ( | |
2423 (!checkcompounddup || (rv != rv_first)) | |
2424 ) | |
2425 ) | |
2426 { | |
2427 // bad compound word | |
2428 mystrcat(*result, presult, MAXLNLEN); | |
2429 mystrcat(*result, " ", MAXLNLEN); | |
2430 mystrcat(*result, MORPH_PART, MAXLNLEN); | |
2431 mystrcat(*result, word+i, MAXLNLEN); | |
2432 | |
2433 if (HENTRY_DATA(rv)) { | |
2434 if (complexprefixes) mystrcat(*result, HENTRY_DATA2(rv),
MAXLNLEN); | |
2435 if (! HENTRY_FIND(rv, MORPH_STEM)) { | |
2436 mystrcat(*result, " ", MAXLNLEN); | |
2437 mystrcat(*result, MORPH_STEM, MAXLNLEN); | |
2438 mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); | |
2439 } | |
2440 // store the pointer of the hash entry | |
2441 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENT
RY, rv); | |
2442 if (!complexprefixes) { | |
2443 mystrcat(*result, " ", MAXLNLEN); | |
2444 mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); | |
2445 } | |
2446 } | |
2447 mystrcat(*result, "\n", MAXLNLEN); | |
2448 ok = 1; | |
2449 } | |
2450 | |
2451 numsyllable = oldnumsyllable2 ; | |
2452 wordnum = oldwordnum2; | |
2453 | |
2454 // perhaps second word has prefix or/and suffix | |
2455 sfx = NULL; | |
2456 sfxflag = FLAG_NULL; | |
2457 | |
2458 if (compoundflag && !onlycpdrule) rv = affix_check((word+i),strlen(w
ord+i), compoundflag); else rv = NULL; | |
2459 | |
2460 if (!rv && compoundend && !onlycpdrule) { | |
2461 sfx = NULL; | |
2462 pfx = NULL; | |
2463 rv = affix_check((word+i),strlen(word+i), compoundend); | |
2464 } | |
2465 | |
2466 if (!rv && numdefcpd && words) { | |
2467 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); | |
2468 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1))
{ | |
2469 char * m = NULL; | |
2470 if (compoundflag) m = affix_check_morph((word+i),strlen(wo
rd+i), compoundflag); | |
2471 if ((!m || *m == '\0') && compoundend) { | |
2472 if (m) free(m); | |
2473 m = affix_check_morph((word+i),strlen(word+i), compo
undend); | |
2474 } | |
2475 mystrcat(*result, presult, MAXLNLEN); | |
2476 if (m || (*m != '\0')) { | |
2477 sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, | |
2478 MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); | |
2479 } | |
2480 if (m) free(m); | |
2481 mystrcat(*result, "\n", MAXLNLEN); | |
2482 ok = 1; | |
2483 } | |
2484 } | |
2485 | |
2486 // check non_compound flag in suffix and prefix | |
2487 if ((rv) && | |
2488 ((pfx && pfx->getCont() && | |
2489 TESTAFF(pfx->getCont(), compoundforbidflag, | |
2490 pfx->getContLen())) || | |
2491 (sfx && sfx->getCont() && | |
2492 TESTAFF(sfx->getCont(), compoundforbidflag, | |
2493 sfx->getContLen())))) { | |
2494 rv = NULL; | |
2495 } | |
2496 | |
2497 // check forbiddenwords | |
2498 if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen)
|| | |
2499 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) | |
2500 && (! TESTAFF(rv->astr, needaffix, rv->alen))) { | |
2501 st[i] = ch; | |
2502 continue; | |
2503 } | |
2504 | |
2505 if (langnum == LANG_hu) { | |
2506 // calculate syllable number of the word | |
2507 numsyllable += get_syllable(word + i, strlen(word + i)); | |
2508 | |
2509 // - affix syllable num. | |
2510 // XXX only second suffix (inflections, not derivations) | |
2511 if (sfxappnd) { | |
2512 char * tmp = myrevstrdup(sfxappnd); | |
2513 numsyllable -= get_syllable(tmp, strlen(tmp)); | |
2514 free(tmp); | |
2515 } | |
2516 | |
2517 // + 1 word, if syllable number of the prefix > 1 (hungarian con
vention) | |
2518 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) >
1)) wordnum++; | |
2519 | |
2520 // increment syllable num, if last word has a SYLLABLENUM flag | |
2521 // and the suffix is beginning `s' | |
2522 | |
2523 if (cpdsyllablenum) { | |
2524 switch (sfxflag) { | |
2525 case 'c': { numsyllable+=2; break; } | |
2526 case 'J': { numsyllable += 1; break; } | |
2527 case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen))
numsyllable += 1; break; } | |
2528 } | |
2529 } | |
2530 } | |
2531 | |
2532 // increment word number, if the second word has a compoundroot flag | |
2533 if ((rv) && (compoundroot) && | |
2534 (TESTAFF(rv->astr, compoundroot, rv->alen))) { | |
2535 wordnum++; | |
2536 } | |
2537 // second word is acceptable, as a word with prefix or/and suffix? | |
2538 // hungarian conventions: compounding is acceptable, | |
2539 // when compound forms consist 2 word, otherwise | |
2540 // the syllable number of root words is 6, or lesser. | |
2541 if ((rv) && | |
2542 ( | |
2543 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || | |
2544 ((cpdmaxsyllable!=0) && | |
2545 (numsyllable <= cpdmaxsyllable)) | |
2546 ) | |
2547 && ( | |
2548 (!checkcompounddup || (rv != rv_first)) | |
2549 )) { | |
2550 char * m = NULL; | |
2551 if (compoundflag) m = affix_check_morph((word+i),strlen(wo
rd+i), compoundflag); | |
2552 if ((!m || *m == '\0') && compoundend) { | |
2553 if (m) free(m); | |
2554 m = affix_check_morph((word+i),strlen(word+i), compo
undend); | |
2555 } | |
2556 mystrcat(*result, presult, MAXLNLEN); | |
2557 if (m && (*m != '\0')) { | |
2558 sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, | |
2559 MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); | |
2560 } | |
2561 if (m) free(m); | |
2562 sprintf(*result + strlen(*result), "%c", MSEP_REC); | |
2563 ok = 1; | |
2564 } | |
2565 | |
2566 numsyllable = oldnumsyllable2; | |
2567 wordnum = oldwordnum2; | |
2568 | |
2569 // perhaps second word is a compound word (recursive call) | |
2570 if ((wordnum < maxwordnum) && (ok == 0)) { | |
2571 compound_check_morph((word+i),strlen(word+i), wordnum+1,
| |
2572 numsyllable, maxwordnum, wnum + 1, words, 0, result
, presult); | |
2573 } else { | |
2574 rv=NULL; | |
2575 } | |
2576 } | |
2577 st[i] = ch; | |
2578 wordnum = oldwordnum; | |
2579 numsyllable = oldnumsyllable; | |
2580 | |
2581 } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule =
1)); // end of onlycpd loop | |
2582 | |
2583 } | |
2584 return 0; | |
2585 } | |
2586 | |
2587 // return 1 if s1 (reversed) is a leading subset of end of s2 | |
2588 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int
len) | |
2589 { | |
2590 while ((len > 0) && *s1 && (*s1 == *end_of_s2)) { | |
2591 s1++; | |
2592 end_of_s2--; | |
2593 len--; | |
2594 } | |
2595 return (*s1 == '\0'); | |
2596 } | |
2597 */ | |
2598 | |
2599 inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int le
n) | |
2600 { | |
2601 while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.')))
{ | |
2602 s1++; | |
2603 end_of_s2--; | |
2604 len--; | |
2605 } | |
2606 return (*s1 == '\0'); | |
2607 } | |
2608 | |
2609 // check word for suffixes | |
2610 | |
2611 struct hentry * AffixMgr::suffix_check (const char * word, int len, | |
2612 int sfxopts, PfxEntry * ppfx, char ** wlst, int maxSug, int * ns, | |
2613 const FLAG cclass, const FLAG needflag, char in_compound) | |
2614 { | |
2615 struct hentry * rv = NULL; | |
2616 PfxEntry* ep = ppfx; | |
2617 | |
2618 // first handle the special case of 0 length suffixes | |
2619 SfxEntry * se = sStart[0]; | |
2620 | |
2621 while (se) { | |
2622 if (!cclass || se->getCont()) { | |
2623 // suffixes are not allowed in beginning of compounds | |
2624 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass | |
2625 // except when signed with compoundpermitflag flag | |
2626 (se->getCont() && compoundpermitflag && | |
2627 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) &&
(!circumfix || | |
2628 // no circumfix flag in prefix and suffix | |
2629 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), | |
2630 circumfix, ep->getContLen())) && | |
2631 (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContL
en())))) || | |
2632 // circumfix flag in prefix AND suffix | |
2633 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), | |
2634 circumfix, ep->getContLen())) && | |
2635 (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen
()))))) && | |
2636 // fogemorpheme | |
2637 (in_compound || | |
2638 !(se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->
getContLen())))) && | |
2639 // needaffix on prefix or first suffix | |
2640 (cclass || | |
2641 !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getC
ontLen())) || | |
2642 (ppfx && !((ep->getCont()) && | |
2643 TESTAFF(ep->getCont(), needaffix, | |
2644 ep->getContLen()))) | |
2645 )) { | |
2646 rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (F
LAG) cclass, | |
2647 needflag, (in_compound ? 0 : onlyincompound)); | |
2648 if (rv) { | |
2649 sfx=se; // BUG: sfx not stateless | |
2650 return rv; | |
2651 } | |
2652 } | |
2653 } | |
2654 se = se->getNext(); | |
2655 } | |
2656 | |
2657 // now handle the general case | |
2658 if (len == 0) return NULL; // FULLSTRIP | |
2659 unsigned char sp= *((const unsigned char *)(word + len - 1)); | |
2660 SfxEntry * sptr = sStart[sp]; | |
2661 | |
2662 while (sptr) { | |
2663 if (isRevSubset(sptr->getKey(), word + len - 1, len) | |
2664 ) { | |
2665 // suffixes are not allowed in beginning of compounds | |
2666 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass | |
2667 // except when signed with compoundpermitflag flag | |
2668 (sptr->getCont() && compoundpermitflag && | |
2669 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen())))
&& (!circumfix || | |
2670 // no circumfix flag in prefix and suffix | |
2671 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), | |
2672 circumfix, ep->getContLen())) && | |
2673 (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->ge
tContLen())))) || | |
2674 // circumfix flag in prefix AND suffix | |
2675 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), | |
2676 circumfix, ep->getContLen())) && | |
2677 (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getC
ontLen()))))) && | |
2678 // fogemorpheme | |
2679 (in_compound || | |
2680 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
sptr->getContLen()))))) && | |
2681 // needaffix on prefix or first suffix | |
2682 (cclass || | |
2683 !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr-
>getContLen())) || | |
2684 (ppfx && !((ep->getCont()) && | |
2685 TESTAFF(ep->getCont(), needaffix, | |
2686 ep->getContLen()))) | |
2687 ) | |
2688 ) if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TEST
AFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) { | |
2689 rv = sptr->checkword(word,len, sfxopts, ppfx, wlst, | |
2690 maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompo
und)); | |
2691 if (rv) { | |
2692 sfx=sptr; // BUG: sfx not stateless | |
2693 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless | |
2694 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxap
pnd not stateless | |
2695 return rv; | |
2696 } | |
2697 } | |
2698 sptr = sptr->getNextEQ(); | |
2699 } else { | |
2700 sptr = sptr->getNextNE(); | |
2701 } | |
2702 } | |
2703 | |
2704 return NULL; | |
2705 } | |
2706 | |
2707 // check word for two-level suffixes | |
2708 | |
2709 struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len, | |
2710 int sfxopts, PfxEntry * ppfx, const FLAG needflag) | |
2711 { | |
2712 struct hentry * rv = NULL; | |
2713 | |
2714 // first handle the special case of 0 length suffixes | |
2715 SfxEntry * se = sStart[0]; | |
2716 while (se) { | |
2717 if (contclasses[se->getFlag()]) | |
2718 { | |
2719 rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag); | |
2720 if (rv) return rv; | |
2721 } | |
2722 se = se->getNext(); | |
2723 } | |
2724 | |
2725 // now handle the general case | |
2726 if (len == 0) return NULL; // FULLSTRIP | |
2727 unsigned char sp = *((const unsigned char *)(word + len - 1)); | |
2728 SfxEntry * sptr = sStart[sp]; | |
2729 | |
2730 while (sptr) { | |
2731 if (isRevSubset(sptr->getKey(), word + len - 1, len)) { | |
2732 if (contclasses[sptr->getFlag()]) | |
2733 { | |
2734 rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag); | |
2735 if (rv) { | |
2736 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless | |
2737 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxap
pnd not stateless | |
2738 return rv; | |
2739 } | |
2740 } | |
2741 sptr = sptr->getNextEQ(); | |
2742 } else { | |
2743 sptr = sptr->getNextNE(); | |
2744 } | |
2745 } | |
2746 | |
2747 return NULL; | |
2748 } | |
2749 | |
2750 char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, | |
2751 int sfxopts, PfxEntry * ppfx, const FLAG needflag) | |
2752 { | |
2753 char result[MAXLNLEN]; | |
2754 char result2[MAXLNLEN]; | |
2755 char result3[MAXLNLEN]; | |
2756 | |
2757 char * st; | |
2758 | |
2759 result[0] = '\0'; | |
2760 result2[0] = '\0'; | |
2761 result3[0] = '\0'; | |
2762 | |
2763 // first handle the special case of 0 length suffixes | |
2764 SfxEntry * se = sStart[0]; | |
2765 while (se) { | |
2766 if (contclasses[se->getFlag()]) | |
2767 { | |
2768 st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); | |
2769 if (st) { | |
2770 if (ppfx) { | |
2771 if (ppfx->getMorph()) { | |
2772 mystrcat(result, ppfx->getMorph(), MAXLNLEN); | |
2773 mystrcat(result, " ", MAXLNLEN); | |
2774 } else debugflag(result, ppfx->getFlag()); | |
2775 } | |
2776 mystrcat(result, st, MAXLNLEN); | |
2777 free(st); | |
2778 if (se->getMorph()) { | |
2779 mystrcat(result, " ", MAXLNLEN); | |
2780 mystrcat(result, se->getMorph(), MAXLNLEN); | |
2781 } else debugflag(result, se->getFlag()); | |
2782 mystrcat(result, "\n", MAXLNLEN); | |
2783 } | |
2784 } | |
2785 se = se->getNext(); | |
2786 } | |
2787 | |
2788 // now handle the general case | |
2789 if (len == 0) return NULL; // FULLSTRIP | |
2790 unsigned char sp = *((const unsigned char *)(word + len - 1)); | |
2791 SfxEntry * sptr = sStart[sp]; | |
2792 | |
2793 while (sptr) { | |
2794 if (isRevSubset(sptr->getKey(), word + len - 1, len)) { | |
2795 if (contclasses[sptr->getFlag()]) | |
2796 { | |
2797 st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag)
; | |
2798 if (st) { | |
2799 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless | |
2800 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxap
pnd not stateless | |
2801 strcpy(result2, st); | |
2802 free(st); | |
2803 | |
2804 result3[0] = '\0'; | |
2805 | |
2806 if (sptr->getMorph()) { | |
2807 mystrcat(result3, " ", MAXLNLEN); | |
2808 mystrcat(result3, sptr->getMorph(), MAXLNLEN); | |
2809 } else debugflag(result3, sptr->getFlag()); | |
2810 strlinecat(result2, result3); | |
2811 mystrcat(result2, "\n", MAXLNLEN); | |
2812 mystrcat(result, result2, MAXLNLEN); | |
2813 } | |
2814 } | |
2815 sptr = sptr->getNextEQ(); | |
2816 } else { | |
2817 sptr = sptr->getNextNE(); | |
2818 } | |
2819 } | |
2820 if (*result) return mystrdup(result); | |
2821 return NULL; | |
2822 } | |
2823 | |
2824 char * AffixMgr::suffix_check_morph(const char * word, int len, | |
2825 int sfxopts, PfxEntry * ppfx, const FLAG cclass, const FLAG needflag, cha
r in_compound) | |
2826 { | |
2827 char result[MAXLNLEN]; | |
2828 | |
2829 struct hentry * rv = NULL; | |
2830 | |
2831 result[0] = '\0'; | |
2832 | |
2833 PfxEntry* ep = ppfx; | |
2834 | |
2835 // first handle the special case of 0 length suffixes | |
2836 SfxEntry * se = sStart[0]; | |
2837 while (se) { | |
2838 if (!cclass || se->getCont()) { | |
2839 // suffixes are not allowed in beginning of compounds | |
2840 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass | |
2841 // except when signed with compoundpermitflag flag | |
2842 (se->getCont() && compoundpermitflag && | |
2843 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) &&
(!circumfix || | |
2844 // no circumfix flag in prefix and suffix | |
2845 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), | |
2846 circumfix, ep->getContLen())) && | |
2847 (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContL
en())))) || | |
2848 // circumfix flag in prefix AND suffix | |
2849 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), | |
2850 circumfix, ep->getContLen())) && | |
2851 (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen
()))))) && | |
2852 // fogemorpheme | |
2853 (in_compound || | |
2854 !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se-
>getContLen()))))) && | |
2855 // needaffix on prefix or first suffix | |
2856 (cclass || | |
2857 !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getC
ontLen())) || | |
2858 (ppfx && !((ep->getCont()) && | |
2859 TESTAFF(ep->getCont(), needaffix, | |
2860 ep->getContLen()))) | |
2861 ) | |
2862 )) | |
2863 rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, nee
dflag); | |
2864 while (rv) { | |
2865 if (ppfx) { | |
2866 if (ppfx->getMorph()) { | |
2867 mystrcat(result, ppfx->getMorph(), MAXLNLEN); | |
2868 mystrcat(result, " ", MAXLNLEN); | |
2869 } else debugflag(result, ppfx->getFlag()); | |
2870 } | |
2871 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA
2(rv), MAXLNLEN); | |
2872 if (! HENTRY_FIND(rv, MORPH_STEM)) { | |
2873 mystrcat(result, " ", MAXLNLEN); | |
2874 mystrcat(result, MORPH_STEM, MAXLNLEN); | |
2875 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); | |
2876 } | |
2877 // store the pointer of the hash entry | |
2878 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); | |
2879 | |
2880 if (!complexprefixes && HENTRY_DATA(rv)) { | |
2881 mystrcat(result, " ", MAXLNLEN);
| |
2882 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); | |
2883 } | |
2884 if (se->getMorph()) { | |
2885 mystrcat(result, " ", MAXLNLEN); | |
2886 mystrcat(result, se->getMorph(), MAXLNLEN); | |
2887 } else debugflag(result, se->getFlag()); | |
2888 mystrcat(result, "\n", MAXLNLEN); | |
2889 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); | |
2890 } | |
2891 } | |
2892 se = se->getNext(); | |
2893 } | |
2894 | |
2895 // now handle the general case | |
2896 if (len == 0) return NULL; // FULLSTRIP | |
2897 unsigned char sp = *((const unsigned char *)(word + len - 1)); | |
2898 SfxEntry * sptr = sStart[sp]; | |
2899 | |
2900 while (sptr) { | |
2901 if (isRevSubset(sptr->getKey(), word + len - 1, len) | |
2902 ) { | |
2903 // suffixes are not allowed in beginning of compounds | |
2904 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass | |
2905 // except when signed with compoundpermitflag flag | |
2906 (sptr->getCont() && compoundpermitflag && | |
2907 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen())))
&& (!circumfix || | |
2908 // no circumfix flag in prefix and suffix | |
2909 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), | |
2910 circumfix, ep->getContLen())) && | |
2911 (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->ge
tContLen())))) || | |
2912 // circumfix flag in prefix AND suffix | |
2913 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), | |
2914 circumfix, ep->getContLen())) && | |
2915 (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getC
ontLen()))))) && | |
2916 // fogemorpheme | |
2917 (in_compound || | |
2918 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
sptr->getContLen()))))) && | |
2919 // needaffix on first suffix | |
2920 (cclass || !(sptr->getCont() && | |
2921 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))) | |
2922 )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass,
needflag); | |
2923 while (rv) { | |
2924 if (ppfx) { | |
2925 if (ppfx->getMorph()) { | |
2926 mystrcat(result, ppfx->getMorph(), MAXLNLEN); | |
2927 mystrcat(result, " ", MAXLNLEN); | |
2928 } else debugflag(result, ppfx->getFlag()); | |
2929 } | |
2930 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HEN
TRY_DATA2(rv), MAXLNLEN); | |
2931 if (! HENTRY_FIND(rv, MORPH_STEM)) { | |
2932 mystrcat(result, " ", MAXLNLEN);
| |
2933 mystrcat(result, MORPH_STEM, MAXLNLEN); | |
2934 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); | |
2935 } | |
2936 // store the pointer of the hash entry | |
2937 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv
); | |
2938 | |
2939 if (!complexprefixes && HENTRY_DATA(rv)) { | |
2940 mystrcat(result, " ", MAXLNLEN);
| |
2941 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); | |
2942 } | |
2943 | |
2944 if (sptr->getMorph()) { | |
2945 mystrcat(result, " ", MAXLNLEN); | |
2946 mystrcat(result, sptr->getMorph(), MAXLNLEN); | |
2947 } else debugflag(result, sptr->getFlag()); | |
2948 mystrcat(result, "\n", MAXLNLEN); | |
2949 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag)
; | |
2950 } | |
2951 sptr = sptr->getNextEQ(); | |
2952 } else { | |
2953 sptr = sptr->getNextNE(); | |
2954 } | |
2955 } | |
2956 | |
2957 if (*result) return mystrdup(result); | |
2958 return NULL; | |
2959 } | |
2960 | |
2961 // check if word with affixes is correctly spelled | |
2962 struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG ne
edflag, char in_compound) | |
2963 { | |
2964 struct hentry * rv= NULL; | |
2965 | |
2966 // check all prefixes (also crossed with suffixes if allowed) | |
2967 rv = prefix_check(word, len, in_compound, needflag); | |
2968 if (rv) return rv; | |
2969 | |
2970 // if still not found check all suffixes | |
2971 rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in
_compound); | |
2972 | |
2973 if (havecontclass) { | |
2974 sfx = NULL; | |
2975 pfx = NULL; | |
2976 | |
2977 if (rv) return rv; | |
2978 // if still not found check all two-level suffixes | |
2979 rv = suffix_check_twosfx(word, len, 0, NULL, needflag); | |
2980 | |
2981 if (rv) return rv; | |
2982 // if still not found check all two-level suffixes | |
2983 rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag); | |
2984 } | |
2985 | |
2986 return rv; | |
2987 } | |
2988 | |
2989 // check if word with affixes is correctly spelled | |
2990 char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needfl
ag, char in_compound) | |
2991 { | |
2992 char result[MAXLNLEN]; | |
2993 char * st = NULL; | |
2994 | |
2995 *result = '\0'; | |
2996 | |
2997 // check all prefixes (also crossed with suffixes if allowed) | |
2998 st = prefix_check_morph(word, len, in_compound); | |
2999 if (st) { | |
3000 mystrcat(result, st, MAXLNLEN); | |
3001 free(st); | |
3002 } | |
3003 | |
3004 // if still not found check all suffixes | |
3005 st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound); | |
3006 if (st) { | |
3007 mystrcat(result, st, MAXLNLEN); | |
3008 free(st); | |
3009 } | |
3010 | |
3011 if (havecontclass) { | |
3012 sfx = NULL; | |
3013 pfx = NULL; | |
3014 // if still not found check all two-level suffixes | |
3015 st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); | |
3016 if (st) { | |
3017 mystrcat(result, st, MAXLNLEN); | |
3018 free(st); | |
3019 } | |
3020 | |
3021 // if still not found check all two-level suffixes | |
3022 st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); | |
3023 if (st) { | |
3024 mystrcat(result, st, MAXLNLEN); | |
3025 free(st); | |
3026 } | |
3027 } | |
3028 | |
3029 return mystrdup(result); | |
3030 } | |
3031 | |
3032 char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, | |
3033 unsigned short al, char * morph, char * targetmorph, int level) | |
3034 { | |
3035 // handle suffixes | |
3036 char * stemmorph; | |
3037 char * stemmorphcatpos; | |
3038 char mymorph[MAXLNLEN]; | |
3039 | |
3040 if (!morph) return NULL; | |
3041 | |
3042 // check substandard flag | |
3043 if (TESTAFF(ap, substandard, al)) return NULL; | |
3044 | |
3045 if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts); | |
3046 | |
3047 // int targetcount = get_sfxcount(targetmorph); | |
3048 | |
3049 // use input suffix fields, if exist | |
3050 if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { | |
3051 stemmorph = mymorph; | |
3052 strcpy(stemmorph, morph); | |
3053 mystrcat(stemmorph, " ", MAXLNLEN); | |
3054 stemmorphcatpos = stemmorph + strlen(stemmorph); | |
3055 } else { | |
3056 stemmorph = morph; | |
3057 stemmorphcatpos = NULL; | |
3058 } | |
3059 | |
3060 for (int i = 0; i < al; i++) { | |
3061 const unsigned char c = (unsigned char) (ap[i] & 0x00FF); | |
3062 SfxEntry * sptr = sFlag[c]; | |
3063 while (sptr) { | |
3064 if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContL
en() == 0) || | |
3065 // don't generate forms with substandard affixes | |
3066 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { | |
3067 | |
3068 if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph()); | |
3069 else stemmorph = (char *) sptr->getMorph(); | |
3070 | |
3071 int cmp = morphcmp(stemmorph, targetmorph); | |
3072 | |
3073 if (cmp == 0) { | |
3074 char * newword = sptr->add(ts, wl); | |
3075 if (newword) { | |
3076 hentry * check = pHMgr->lookup(newword); // XXX extra di
c | |
3077 if (!check || !check->astr || | |
3078 !(TESTAFF(check->astr, forbiddenword, check->alen) |
| | |
3079 TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))
) { | |
3080 return newword; | |
3081 } | |
3082 free(newword); | |
3083 } | |
3084 } | |
3085 | |
3086 // recursive call for secondary suffixes | |
3087 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && | |
3088 // (get_sfxcount(stemmorph) < targetcount) && | |
3089 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))
{ | |
3090 char * newword = sptr->add(ts, wl); | |
3091 if (newword) { | |
3092 char * newword2 = morphgen(newword, strlen(newword), spt
r->getCont(), | |
3093 sptr->getContLen(), stemmorph, targetmorph, 1); | |
3094 | |
3095 if (newword2) { | |
3096 free(newword); | |
3097 return newword2; | |
3098 } | |
3099 free(newword); | |
3100 newword = NULL; | |
3101 } | |
3102 } | |
3103 } | |
3104 sptr = sptr->getFlgNxt(); | |
3105 } | |
3106 } | |
3107 return NULL; | |
3108 } | |
3109 | |
3110 | |
3111 int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts
, | |
3112 int wl, const unsigned short * ap, unsigned short al, char * bad, int badl, | |
3113 char * phon) | |
3114 { | |
3115 int nh=0; | |
3116 // first add root word to list | |
3117 if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || | |
3118 (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { | |
3119 wlst[nh].word = mystrdup(ts); | |
3120 if (!wlst[nh].word) return 0; | |
3121 wlst[nh].allow = (1 == 0); | |
3122 wlst[nh].orig = NULL; | |
3123 nh++; | |
3124 // add special phonetic version | |
3125 if (phon && (nh < maxn)) { | |
3126 wlst[nh].word = mystrdup(phon); | |
3127 if (!wlst[nh].word) return nh - 1; | |
3128 wlst[nh].allow = (1 == 0); | |
3129 wlst[nh].orig = mystrdup(ts); | |
3130 if (!wlst[nh].orig) return nh - 1; | |
3131 nh++; | |
3132 } | |
3133 } | |
3134 | |
3135 // handle suffixes | |
3136 for (int i = 0; i < al; i++) { | |
3137 const unsigned char c = (unsigned char) (ap[i] & 0x00FF); | |
3138 SfxEntry * sptr = sFlag[c]; | |
3139 while (sptr) { | |
3140 if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr-
>getKeyLen()) && | |
3141 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))
) && | |
3142 // check needaffix flag | |
3143 !(sptr->getCont() && ((needaffix && | |
3144 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) |
| | |
3145 (circumfix && | |
3146 TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) |
| | |
3147 (onlyincompound && | |
3148 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen(
))))) | |
3149 ) { | |
3150 char * newword = sptr->add(ts, wl); | |
3151 if (newword) { | |
3152 if (nh < maxn) { | |
3153 wlst[nh].word = newword; | |
3154 wlst[nh].allow = sptr->allowCross(); | |
3155 wlst[nh].orig = NULL; | |
3156 nh++; | |
3157 // add special phonetic version | |
3158 if (phon && (nh < maxn)) { | |
3159 char st[MAXWORDUTF8LEN]; | |
3160 strcpy(st, phon); | |
3161 strcat(st, sptr->getKey()); | |
3162 reverseword(st + strlen(phon)); | |
3163 wlst[nh].word = mystrdup(st); | |
3164 if (!wlst[nh].word) return nh - 1; | |
3165 wlst[nh].allow = (1 == 0); | |
3166 wlst[nh].orig = mystrdup(newword); | |
3167 if (!wlst[nh].orig) return nh - 1; | |
3168 nh++; | |
3169 } | |
3170 } else { | |
3171 free(newword); | |
3172 } | |
3173 } | |
3174 } | |
3175 sptr = sptr->getFlgNxt(); | |
3176 } | |
3177 } | |
3178 | |
3179 int n = nh; | |
3180 | |
3181 // handle cross products of prefixes and suffixes | |
3182 for (int j=1;j<n ;j++) | |
3183 if (wlst[j].allow) { | |
3184 for (int k = 0; k < al; k++) { | |
3185 const unsigned char c = (unsigned char) (ap[k] & 0x00FF); | |
3186 PfxEntry * cptr = pFlag[c]; | |
3187 while (cptr) { | |
3188 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->
getKeyLen() || ((badl > cptr->getKeyLen()) && | |
3189 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))
) { | |
3190 int l1 = strlen(wlst[j].word); | |
3191 char * newword = cptr->add(wlst[j].word, l1); | |
3192 if (newword) { | |
3193 if (nh < maxn) { | |
3194 wlst[nh].word = newword; | |
3195 wlst[nh].allow = cptr->allowCross(); | |
3196 wlst[nh].orig = NULL; | |
3197 nh++; | |
3198 } else { | |
3199 free(newword); | |
3200 } | |
3201 } | |
3202 } | |
3203 cptr = cptr->getFlgNxt(); | |
3204 } | |
3205 } | |
3206 } | |
3207 | |
3208 | |
3209 // now handle pure prefixes | |
3210 for (int m = 0; m < al; m ++) { | |
3211 const unsigned char c = (unsigned char) (ap[m] & 0x00FF); | |
3212 PfxEntry * ptr = pFlag[c]; | |
3213 while (ptr) { | |
3214 if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->ge
tKeyLen()) && | |
3215 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && | |
3216 // check needaffix flag | |
3217 !(ptr->getCont() && ((needaffix && | |
3218 TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || | |
3219 (circumfix && | |
3220 TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
| |
3221 (onlyincompound && | |
3222 TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen())
))) | |
3223 ) { | |
3224 char * newword = ptr->add(ts, wl); | |
3225 if (newword) { | |
3226 if (nh < maxn) { | |
3227 wlst[nh].word = newword; | |
3228 wlst[nh].allow = ptr->allowCross(); | |
3229 wlst[nh].orig = NULL; | |
3230 nh++; | |
3231 } else { | |
3232 free(newword); | |
3233 } | |
3234 } | |
3235 } | |
3236 ptr = ptr->getFlgNxt(); | |
3237 } | |
3238 } | |
3239 | |
3240 return nh; | |
3241 } | |
3242 | |
3243 // return length of replacing table | |
3244 int AffixMgr::get_numrep() const | |
3245 { | |
3246 return numrep; | |
3247 } | |
3248 | |
3249 // return replacing table | |
3250 struct replentry * AffixMgr::get_reptable() const | |
3251 { | |
3252 if (! reptable ) return NULL; | |
3253 return reptable; | |
3254 } | |
3255 | |
3256 // return iconv table | |
3257 RepList * AffixMgr::get_iconvtable() const | |
3258 { | |
3259 if (! iconvtable ) return NULL; | |
3260 return iconvtable; | |
3261 } | |
3262 | |
3263 // return oconv table | |
3264 RepList * AffixMgr::get_oconvtable() const | |
3265 { | |
3266 if (! oconvtable ) return NULL; | |
3267 return oconvtable; | |
3268 } | |
3269 | |
3270 // return replacing table | |
3271 struct phonetable * AffixMgr::get_phonetable() const | |
3272 { | |
3273 if (! phone ) return NULL; | |
3274 return phone; | |
3275 } | |
3276 | |
3277 // return length of character map table | |
3278 int AffixMgr::get_nummap() const | |
3279 { | |
3280 return nummap; | |
3281 } | |
3282 | |
3283 // return character map table | |
3284 struct mapentry * AffixMgr::get_maptable() const | |
3285 { | |
3286 if (! maptable ) return NULL; | |
3287 return maptable; | |
3288 } | |
3289 | |
3290 // return length of word break table | |
3291 int AffixMgr::get_numbreak() const | |
3292 { | |
3293 return numbreak; | |
3294 } | |
3295 | |
3296 // return character map table | |
3297 char ** AffixMgr::get_breaktable() const | |
3298 { | |
3299 if (! breaktable ) return NULL; | |
3300 return breaktable; | |
3301 } | |
3302 | |
3303 // return text encoding of dictionary | |
3304 char * AffixMgr::get_encoding() | |
3305 { | |
3306 if (! encoding ) encoding = mystrdup(SPELL_ENCODING); | |
3307 return mystrdup(encoding); | |
3308 } | |
3309 | |
3310 // return text encoding of dictionary | |
3311 int AffixMgr::get_langnum() const | |
3312 { | |
3313 return langnum; | |
3314 } | |
3315 | |
3316 // return double prefix option | |
3317 int AffixMgr::get_complexprefixes() const | |
3318 { | |
3319 return complexprefixes; | |
3320 } | |
3321 | |
3322 // return FULLSTRIP option | |
3323 int AffixMgr::get_fullstrip() const | |
3324 { | |
3325 return fullstrip; | |
3326 } | |
3327 | |
3328 FLAG AffixMgr::get_keepcase() const | |
3329 { | |
3330 return keepcase; | |
3331 } | |
3332 | |
3333 FLAG AffixMgr::get_forceucase() const | |
3334 { | |
3335 return forceucase; | |
3336 } | |
3337 | |
3338 FLAG AffixMgr::get_warn() const | |
3339 { | |
3340 return warn; | |
3341 } | |
3342 | |
3343 int AffixMgr::get_forbidwarn() const | |
3344 { | |
3345 return forbidwarn; | |
3346 } | |
3347 | |
3348 int AffixMgr::get_checksharps() const | |
3349 { | |
3350 return checksharps; | |
3351 } | |
3352 | |
3353 char * AffixMgr::encode_flag(unsigned short aflag) const | |
3354 { | |
3355 return pHMgr->encode_flag(aflag); | |
3356 } | |
3357 | |
3358 | |
3359 // return the preferred ignore string for suggestions | |
3360 char * AffixMgr::get_ignore() const | |
3361 { | |
3362 if (!ignorechars) return NULL; | |
3363 return ignorechars; | |
3364 } | |
3365 | |
3366 // return the preferred ignore string for suggestions | |
3367 unsigned short * AffixMgr::get_ignore_utf16(int * len) const | |
3368 { | |
3369 *len = ignorechars_utf16_len; | |
3370 return ignorechars_utf16; | |
3371 } | |
3372 | |
3373 // return the keyboard string for suggestions | |
3374 char * AffixMgr::get_key_string() | |
3375 { | |
3376 if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING); | |
3377 return mystrdup(keystring); | |
3378 } | |
3379 | |
3380 // return the preferred try string for suggestions | |
3381 char * AffixMgr::get_try_string() const | |
3382 { | |
3383 if (! trystring ) return NULL; | |
3384 return mystrdup(trystring); | |
3385 } | |
3386 | |
3387 // return the preferred try string for suggestions | |
3388 const char * AffixMgr::get_wordchars() const | |
3389 { | |
3390 return wordchars; | |
3391 } | |
3392 | |
3393 unsigned short * AffixMgr::get_wordchars_utf16(int * len) const | |
3394 { | |
3395 *len = wordchars_utf16_len; | |
3396 return wordchars_utf16; | |
3397 } | |
3398 | |
3399 // is there compounding? | |
3400 int AffixMgr::get_compound() const | |
3401 { | |
3402 return compoundflag || compoundbegin || numdefcpd; | |
3403 } | |
3404 | |
3405 // return the compound words control flag | |
3406 FLAG AffixMgr::get_compoundflag() const | |
3407 { | |
3408 return compoundflag; | |
3409 } | |
3410 | |
3411 // return the forbidden words control flag | |
3412 FLAG AffixMgr::get_forbiddenword() const | |
3413 { | |
3414 return forbiddenword; | |
3415 } | |
3416 | |
3417 // return the forbidden words control flag | |
3418 FLAG AffixMgr::get_nosuggest() const | |
3419 { | |
3420 return nosuggest; | |
3421 } | |
3422 | |
3423 // return the forbidden words control flag | |
3424 FLAG AffixMgr::get_nongramsuggest() const | |
3425 { | |
3426 return nongramsuggest; | |
3427 } | |
3428 | |
3429 // return the forbidden words flag modify flag | |
3430 FLAG AffixMgr::get_needaffix() const | |
3431 { | |
3432 return needaffix; | |
3433 } | |
3434 | |
3435 // return the onlyincompound flag | |
3436 FLAG AffixMgr::get_onlyincompound() const | |
3437 { | |
3438 return onlyincompound; | |
3439 } | |
3440 | |
3441 // return the compound word signal flag | |
3442 FLAG AffixMgr::get_compoundroot() const | |
3443 { | |
3444 return compoundroot; | |
3445 } | |
3446 | |
3447 // return the compound begin signal flag | |
3448 FLAG AffixMgr::get_compoundbegin() const | |
3449 { | |
3450 return compoundbegin; | |
3451 } | |
3452 | |
3453 // return the value of checknum | |
3454 int AffixMgr::get_checknum() const | |
3455 { | |
3456 return checknum; | |
3457 } | |
3458 | |
3459 // return the value of prefix | |
3460 const char * AffixMgr::get_prefix() const | |
3461 { | |
3462 if (pfx) return pfx->getKey(); | |
3463 return NULL; | |
3464 } | |
3465 | |
3466 // return the value of suffix | |
3467 const char * AffixMgr::get_suffix() const | |
3468 { | |
3469 return sfxappnd; | |
3470 } | |
3471 | |
3472 // return the value of suffix | |
3473 const char * AffixMgr::get_version() const | |
3474 { | |
3475 return version; | |
3476 } | |
3477 | |
3478 // return lemma_present flag | |
3479 FLAG AffixMgr::get_lemma_present() const | |
3480 { | |
3481 return lemma_present; | |
3482 } | |
3483 | |
3484 // utility method to look up root words in hash table | |
3485 struct hentry * AffixMgr::lookup(const char * word) | |
3486 { | |
3487 int i; | |
3488 struct hentry * he = NULL; | |
3489 for (i = 0; i < *maxdic && !he; i++) { | |
3490 he = (alldic[i])->lookup(word); | |
3491 } | |
3492 return he; | |
3493 } | |
3494 | |
3495 // return the value of suffix | |
3496 int AffixMgr::have_contclass() const | |
3497 { | |
3498 return havecontclass; | |
3499 } | |
3500 | |
3501 // return utf8 | |
3502 int AffixMgr::get_utf8() const | |
3503 { | |
3504 return utf8; | |
3505 } | |
3506 | |
3507 int AffixMgr::get_maxngramsugs(void) const | |
3508 { | |
3509 return maxngramsugs; | |
3510 } | |
3511 | |
3512 int AffixMgr::get_maxcpdsugs(void) const | |
3513 { | |
3514 return maxcpdsugs; | |
3515 } | |
3516 | |
3517 int AffixMgr::get_maxdiff(void) const | |
3518 { | |
3519 return maxdiff; | |
3520 } | |
3521 | |
3522 int AffixMgr::get_onlymaxdiff(void) const | |
3523 { | |
3524 return onlymaxdiff; | |
3525 } | |
3526 | |
3527 // return nosplitsugs | |
3528 int AffixMgr::get_nosplitsugs(void) const | |
3529 { | |
3530 return nosplitsugs; | |
3531 } | |
3532 | |
3533 // return sugswithdots | |
3534 int AffixMgr::get_sugswithdots(void) const | |
3535 { | |
3536 return sugswithdots; | |
3537 } | |
3538 | |
3539 /* parse flag */ | |
3540 int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) { | |
3541 char * s = NULL; | |
3542 if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { | |
3543 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix
file parameter\n", af->getlinenum()); | |
3544 return 1; | |
3545 } | |
3546 if (parse_string(line, &s, af->getlinenum())) return 1; | |
3547 *out = pHMgr->decode_flag(s); | |
3548 free(s); | |
3549 return 0; | |
3550 } | |
3551 | |
3552 /* parse num */ | |
3553 int AffixMgr::parse_num(char * line, int * out, FileMgr * af) { | |
3554 char * s = NULL; | |
3555 if (*out != -1) { | |
3556 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix
file parameter\n", af->getlinenum()); | |
3557 return 1; | |
3558 } | |
3559 if (parse_string(line, &s, af->getlinenum())) return 1; | |
3560 *out = atoi(s); | |
3561 free(s); | |
3562 return 0; | |
3563 } | |
3564 | |
3565 /* parse in the max syllablecount of compound words and */ | |
3566 int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af) | |
3567 { | |
3568 char * tp = line; | |
3569 char * piece; | |
3570 int i = 0; | |
3571 int np = 0; | |
3572 w_char w[MAXWORDLEN]; | |
3573 piece = mystrsep(&tp, 0); | |
3574 while (piece) { | |
3575 if (*piece != '\0') { | |
3576 switch(i) { | |
3577 case 0: { np++; break; } | |
3578 case 1: { cpdmaxsyllable = atoi(piece); np++; break; } | |
3579 case 2: { | |
3580 if (!utf8) { | |
3581 cpdvowels = mystrdup(piece); | |
3582 } else { | |
3583 int n = u8_u16(w, MAXWORDLEN, piece); | |
3584 if (n > 0) { | |
3585 flag_qsort((unsigned short *) w, 0, n); | |
3586 cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char)); | |
3587 if (!cpdvowels_utf16) return 1; | |
3588 memcpy(cpdvowels_utf16, w, n * sizeof(w_char)); | |
3589 } | |
3590 cpdvowels_utf16_len = n; | |
3591 } | |
3592 np++; | |
3593 break; | |
3594 } | |
3595 default: break; | |
3596 } | |
3597 i++; | |
3598 } | |
3599 piece = mystrsep(&tp, 0); | |
3600 } | |
3601 if (np < 2) { | |
3602 HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable informa
tion\n", af->getlinenum()); | |
3603 return 1; | |
3604 } | |
3605 if (np == 2) cpdvowels = mystrdup("aeiouAEIOU"); | |
3606 return 0; | |
3607 } | |
3608 | |
3609 /* parse in the typical fault correcting table */ | |
3610 int AffixMgr::parse_reptable(char * line, FileMgr * af) | |
3611 { | |
3612 if (numrep != 0) { | |
3613 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
3614 return 1; | |
3615 } | |
3616 char * tp = line; | |
3617 char * piece; | |
3618 int i = 0; | |
3619 int np = 0; | |
3620 piece = mystrsep(&tp, 0); | |
3621 while (piece) { | |
3622 if (*piece != '\0') { | |
3623 switch(i) { | |
3624 case 0: { np++; break; } | |
3625 case 1: { | |
3626 numrep = atoi(piece); | |
3627 if (numrep < 1) { | |
3628 HUNSPELL_WARNING(stderr, "error: line %d: incorrect en
try number\n", af->getlinenum()); | |
3629 return 1; | |
3630 } | |
3631 reptable = (replentry *) malloc(numrep * sizeof(struct re
plentry)); | |
3632 if (!reptable) return 1; | |
3633 np++; | |
3634 break; | |
3635 } | |
3636 default: break; | |
3637 } | |
3638 i++; | |
3639 } | |
3640 piece = mystrsep(&tp, 0); | |
3641 } | |
3642 if (np != 2) { | |
3643 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum(
)); | |
3644 return 1; | |
3645 } | |
3646 | |
3647 /* now parse the numrep lines to read in the remainder of the table */ | |
3648 char * nl; | |
3649 for (int j=0; j < numrep; j++) { | |
3650 if ((nl = af->getline()) == NULL) return 1; | |
3651 mychomp(nl); | |
3652 tp = nl; | |
3653 i = 0; | |
3654 reptable[j].pattern = NULL; | |
3655 reptable[j].pattern2 = NULL; | |
3656 piece = mystrsep(&tp, 0); | |
3657 while (piece) { | |
3658 if (*piece != '\0') { | |
3659 switch(i) { | |
3660 case 0: { | |
3661 if (strncmp(piece,"REP",3) != 0) { | |
3662 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
3663 numrep = 0; | |
3664 return 1; | |
3665 } | |
3666 break; | |
3667 } | |
3668 case 1: { | |
3669 if (*piece == '^') reptable[j].start = true; else re
ptable[j].start = false; | |
3670 reptable[j].pattern = mystrrep(mystrdup(piece + int(
reptable[j].start)),"_"," "); | |
3671 int lr = strlen(reptable[j].pattern) - 1; | |
3672 if (reptable[j].pattern[lr] == '$') { | |
3673 reptable[j].end = true; | |
3674 reptable[j].pattern[lr] = '\0'; | |
3675 } else reptable[j].end = false; | |
3676 break; | |
3677 } | |
3678 case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_",
" "); break; } | |
3679 default: break; | |
3680 } | |
3681 i++; | |
3682 } | |
3683 piece = mystrsep(&tp, 0); | |
3684 } | |
3685 if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { | |
3686 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->
getlinenum()); | |
3687 numrep = 0; | |
3688 return 1; | |
3689 } | |
3690 } | |
3691 return 0; | |
3692 } | |
3693 | |
3694 /* parse in the typical fault correcting table */ | |
3695 int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const c
har * keyword) | |
3696 { | |
3697 if (*rl) { | |
3698 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
3699 return 1; | |
3700 } | |
3701 char * tp = line; | |
3702 char * piece; | |
3703 int i = 0; | |
3704 int np = 0; | |
3705 int numrl = 0; | |
3706 piece = mystrsep(&tp, 0); | |
3707 while (piece) { | |
3708 if (*piece != '\0') { | |
3709 switch(i) { | |
3710 case 0: { np++; break; } | |
3711 case 1: { | |
3712 numrl = atoi(piece); | |
3713 if (numrl < 1) { | |
3714 HUNSPELL_WARNING(stderr, "error: line %d: incorrect en
try number\n", af->getlinenum()); | |
3715 return 1; | |
3716 } | |
3717 *rl = new RepList(numrl); | |
3718 if (!*rl) return 1; | |
3719 np++; | |
3720 break; | |
3721 } | |
3722 default: break; | |
3723 } | |
3724 i++; | |
3725 } | |
3726 piece = mystrsep(&tp, 0); | |
3727 } | |
3728 if (np != 2) { | |
3729 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum(
)); | |
3730 return 1; | |
3731 } | |
3732 | |
3733 /* now parse the num lines to read in the remainder of the table */ | |
3734 char * nl; | |
3735 for (int j=0; j < numrl; j++) { | |
3736 if (!(nl = af->getline())) return 1; | |
3737 mychomp(nl); | |
3738 tp = nl; | |
3739 i = 0; | |
3740 char * pattern = NULL; | |
3741 char * pattern2 = NULL; | |
3742 piece = mystrsep(&tp, 0); | |
3743 while (piece) { | |
3744 if (*piece != '\0') { | |
3745 switch(i) { | |
3746 case 0: { | |
3747 if (strncmp(piece, keyword, strlen(keyword)) != 0)
{ | |
3748 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
3749 delete *rl; | |
3750 *rl = NULL; | |
3751 return 1; | |
3752 } | |
3753 break; | |
3754 } | |
3755 case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break;
} | |
3756 case 2: { | |
3757 pattern2 = mystrrep(mystrdup(piece),"_"," "); | |
3758 break; | |
3759 } | |
3760 default: break; | |
3761 } | |
3762 i++; | |
3763 } | |
3764 piece = mystrsep(&tp, 0); | |
3765 } | |
3766 if (!pattern || !pattern2) { | |
3767 if (pattern) | |
3768 free(pattern); | |
3769 if (pattern2) | |
3770 free(pattern2); | |
3771 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->g
etlinenum()); | |
3772 return 1; | |
3773 } | |
3774 (*rl)->add(pattern, pattern2); | |
3775 } | |
3776 return 0; | |
3777 } | |
3778 | |
3779 | |
3780 /* parse in the typical fault correcting table */ | |
3781 int AffixMgr::parse_phonetable(char * line, FileMgr * af) | |
3782 { | |
3783 if (phone) { | |
3784 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
3785 return 1; | |
3786 } | |
3787 char * tp = line; | |
3788 char * piece; | |
3789 int i = 0; | |
3790 int np = 0; | |
3791 piece = mystrsep(&tp, 0); | |
3792 while (piece) { | |
3793 if (*piece != '\0') { | |
3794 switch(i) { | |
3795 case 0: { np++; break; } | |
3796 case 1: { | |
3797 phone = (phonetable *) malloc(sizeof(struct phonetable)); | |
3798 if (!phone) return 1; | |
3799 phone->num = atoi(piece); | |
3800 phone->rules = NULL; | |
3801 phone->utf8 = (char) utf8; | |
3802 if (phone->num < 1) { | |
3803 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu
mber\n", af->getlinenum()); | |
3804 return 1; | |
3805 } | |
3806 phone->rules = (char * *) malloc(2 * (phone->num + 1) * s
izeof(char *)); | |
3807 if (!phone->rules) { | |
3808 free(phone); | |
3809 phone = NULL; | |
3810 return 1; | |
3811 } | |
3812 np++; | |
3813 break; | |
3814 } | |
3815 default: break; | |
3816 } | |
3817 i++; | |
3818 } | |
3819 piece = mystrsep(&tp, 0); | |
3820 } | |
3821 if (np != 2) { | |
3822 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum(
)); | |
3823 return 1; | |
3824 } | |
3825 | |
3826 /* now parse the phone->num lines to read in the remainder of the table */ | |
3827 char * nl; | |
3828 for (int j=0; j < phone->num; j++) { | |
3829 if (!(nl = af->getline())) return 1; | |
3830 mychomp(nl); | |
3831 tp = nl; | |
3832 i = 0; | |
3833 phone->rules[j * 2] = NULL; | |
3834 phone->rules[j * 2 + 1] = NULL; | |
3835 piece = mystrsep(&tp, 0); | |
3836 while (piece) { | |
3837 if (*piece != '\0') { | |
3838 switch(i) { | |
3839 case 0: { | |
3840 if (strncmp(piece,"PHONE",5) != 0) { | |
3841 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
3842 phone->num = 0; | |
3843 return 1; | |
3844 } | |
3845 break; | |
3846 } | |
3847 case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_","
"); break; } | |
3848 case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"
_",""); break; } | |
3849 default: break; | |
3850 } | |
3851 i++; | |
3852 } | |
3853 piece = mystrsep(&tp, 0); | |
3854 } | |
3855 if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { | |
3856 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->
getlinenum()); | |
3857 phone->num = 0; | |
3858 return 1; | |
3859 } | |
3860 } | |
3861 phone->rules[phone->num * 2] = mystrdup(""); | |
3862 phone->rules[phone->num * 2 + 1] = mystrdup(""); | |
3863 init_phonet_hash(*phone); | |
3864 return 0; | |
3865 } | |
3866 | |
3867 /* parse in the checkcompoundpattern table */ | |
3868 int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) | |
3869 { | |
3870 if (numcheckcpd != 0) { | |
3871 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
3872 return 1; | |
3873 } | |
3874 char * tp = line; | |
3875 char * piece; | |
3876 int i = 0; | |
3877 int np = 0; | |
3878 piece = mystrsep(&tp, 0); | |
3879 while (piece) { | |
3880 if (*piece != '\0') { | |
3881 switch(i) { | |
3882 case 0: { np++; break; } | |
3883 case 1: { | |
3884 numcheckcpd = atoi(piece); | |
3885 if (numcheckcpd < 1) { | |
3886 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu
mber\n", af->getlinenum()); | |
3887 return 1; | |
3888 } | |
3889 checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(
struct patentry)); | |
3890 if (!checkcpdtable) return 1; | |
3891 np++; | |
3892 break; | |
3893 } | |
3894 default: break; | |
3895 } | |
3896 i++; | |
3897 } | |
3898 piece = mystrsep(&tp, 0); | |
3899 } | |
3900 if (np != 2) { | |
3901 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum
()); | |
3902 return 1; | |
3903 } | |
3904 | |
3905 /* now parse the numcheckcpd lines to read in the remainder of the table */ | |
3906 char * nl; | |
3907 for (int j=0; j < numcheckcpd; j++) { | |
3908 if (!(nl = af->getline())) return 1; | |
3909 mychomp(nl); | |
3910 tp = nl; | |
3911 i = 0; | |
3912 checkcpdtable[j].pattern = NULL; | |
3913 checkcpdtable[j].pattern2 = NULL; | |
3914 checkcpdtable[j].pattern3 = NULL; | |
3915 checkcpdtable[j].cond = FLAG_NULL; | |
3916 checkcpdtable[j].cond2 = FLAG_NULL; | |
3917 piece = mystrsep(&tp, 0); | |
3918 while (piece) { | |
3919 if (*piece != '\0') { | |
3920 switch(i) { | |
3921 case 0: { | |
3922 if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0)
{ | |
3923 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
3924 numcheckcpd = 0; | |
3925 return 1; | |
3926 } | |
3927 break; | |
3928 } | |
3929 case 1: { | |
3930 checkcpdtable[j].pattern = mystrdup(piece); | |
3931 char * p = strchr(checkcpdtable[j].pattern, '/'); | |
3932 if (p) { | |
3933 *p = '\0'; | |
3934 checkcpdtable[j].cond = pHMgr->decode_flag(p + 1); | |
3935 } | |
3936 break; } | |
3937 case 2: { | |
3938 checkcpdtable[j].pattern2 = mystrdup(piece); | |
3939 char * p = strchr(checkcpdtable[j].pattern2, '/'); | |
3940 if (p) { | |
3941 *p = '\0'; | |
3942 checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1); | |
3943 } | |
3944 break; | |
3945 } | |
3946 case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplif
iedcpd = 1; break; } | |
3947 default: break; | |
3948 } | |
3949 i++; | |
3950 } | |
3951 piece = mystrsep(&tp, 0); | |
3952 } | |
3953 if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { | |
3954 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->
getlinenum()); | |
3955 numcheckcpd = 0; | |
3956 return 1; | |
3957 } | |
3958 } | |
3959 return 0; | |
3960 } | |
3961 | |
3962 /* parse in the compound rule table */ | |
3963 int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) | |
3964 { | |
3965 if (numdefcpd != 0) { | |
3966 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
3967 return 1; | |
3968 } | |
3969 char * tp = line; | |
3970 char * piece; | |
3971 int i = 0; | |
3972 int np = 0; | |
3973 piece = mystrsep(&tp, 0); | |
3974 while (piece) { | |
3975 if (*piece != '\0') { | |
3976 switch(i) { | |
3977 case 0: { np++; break; } | |
3978 case 1: { | |
3979 numdefcpd = atoi(piece); | |
3980 if (numdefcpd < 1) { | |
3981 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu
mber\n", af->getlinenum()); | |
3982 return 1; | |
3983 } | |
3984 defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(fla
gentry)); | |
3985 if (!defcpdtable) return 1; | |
3986 np++; | |
3987 break; | |
3988 } | |
3989 default: break; | |
3990 } | |
3991 i++; | |
3992 } | |
3993 piece = mystrsep(&tp, 0); | |
3994 } | |
3995 if (np != 2) { | |
3996 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum(
)); | |
3997 return 1; | |
3998 } | |
3999 | |
4000 /* now parse the numdefcpd lines to read in the remainder of the table */ | |
4001 char * nl; | |
4002 for (int j=0; j < numdefcpd; j++) { | |
4003 if (!(nl = af->getline())) return 1; | |
4004 mychomp(nl); | |
4005 tp = nl; | |
4006 i = 0; | |
4007 defcpdtable[j].def = NULL; | |
4008 piece = mystrsep(&tp, 0); | |
4009 while (piece) { | |
4010 if (*piece != '\0') { | |
4011 switch(i) { | |
4012 case 0: { | |
4013 if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { | |
4014 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
4015 numdefcpd = 0; | |
4016 return 1; | |
4017 } | |
4018 break; | |
4019 } | |
4020 case 1: { // handle parenthesized flags | |
4021 if (strchr(piece, '(')) { | |
4022 defcpdtable[j].def = (FLAG *) malloc(strlen(piec
e) * sizeof(FLAG)); | |
4023 defcpdtable[j].len = 0; | |
4024 int end = 0; | |
4025 FLAG * conv; | |
4026 while (!end) { | |
4027 char * par = piece + 1; | |
4028 while (*par != '(' && *par != ')' && *par !=
'\0') par++; | |
4029 if (*par == '\0') end = 1; else *par = '\0'; | |
4030 if (*piece == '(') piece++; | |
4031 if (*piece == '*' || *piece == '?') { | |
4032 defcpdtable[j].def[defcpdtable[j].len++]
= (FLAG) *piece; | |
4033 } else if (*piece != '\0') { | |
4034 int l = pHMgr->decode_flags(&conv, piece
, af); | |
4035 for (int k = 0; k < l; k++) defcpdtable[
j].def[defcpdtable[j].len++] = conv[k]; | |
4036 free(conv); | |
4037 } | |
4038 piece = par + 1; | |
4039 } | |
4040 } else { | |
4041 defcpdtable[j].len = pHMgr->decode_flags(&(defcp
dtable[j].def), piece, af); | |
4042 } | |
4043 break; | |
4044 } | |
4045 default: break; | |
4046 } | |
4047 i++; | |
4048 } | |
4049 piece = mystrsep(&tp, 0); | |
4050 } | |
4051 if (!defcpdtable[j].len) { | |
4052 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->
getlinenum()); | |
4053 numdefcpd = 0; | |
4054 return 1; | |
4055 } | |
4056 } | |
4057 return 0; | |
4058 } | |
4059 | |
4060 | |
4061 /* parse in the character map table */ | |
4062 int AffixMgr::parse_maptable(char * line, FileMgr * af) | |
4063 { | |
4064 if (nummap != 0) { | |
4065 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
4066 return 1; | |
4067 } | |
4068 char * tp = line; | |
4069 char * piece; | |
4070 int i = 0; | |
4071 int np = 0; | |
4072 piece = mystrsep(&tp, 0); | |
4073 while (piece) { | |
4074 if (*piece != '\0') { | |
4075 switch(i) { | |
4076 case 0: { np++; break; } | |
4077 case 1: { | |
4078 nummap = atoi(piece); | |
4079 if (nummap < 1) { | |
4080 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu
mber\n", af->getlinenum()); | |
4081 return 1; | |
4082 } | |
4083 maptable = (mapentry *) malloc(nummap * sizeof(struct map
entry)); | |
4084 if (!maptable) return 1; | |
4085 np++; | |
4086 break; | |
4087 } | |
4088 default: break; | |
4089 } | |
4090 i++; | |
4091 } | |
4092 piece = mystrsep(&tp, 0); | |
4093 } | |
4094 if (np != 2) { | |
4095 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum(
)); | |
4096 return 1; | |
4097 } | |
4098 | |
4099 /* now parse the nummap lines to read in the remainder of the table */ | |
4100 char * nl; | |
4101 for (int j=0; j < nummap; j++) { | |
4102 if (!(nl = af->getline())) return 1; | |
4103 mychomp(nl); | |
4104 tp = nl; | |
4105 i = 0; | |
4106 maptable[j].set = NULL; | |
4107 maptable[j].len = 0; | |
4108 piece = mystrsep(&tp, 0); | |
4109 while (piece) { | |
4110 if (*piece != '\0') { | |
4111 switch(i) { | |
4112 case 0: { | |
4113 if (strncmp(piece,"MAP",3) != 0) { | |
4114 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
4115 nummap = 0; | |
4116 return 1; | |
4117 } | |
4118 break; | |
4119 } | |
4120 case 1: { | |
4121 int setn = 0; | |
4122 maptable[j].len = strlen(piece); | |
4123 maptable[j].set = (char **) malloc(maptable[j].len *
sizeof(char*)); | |
4124 if (!maptable[j].set) return 1; | |
4125 for (int k = 0; k < maptable[j].len; k++) { | |
4126 int chl = 1; | |
4127 int chb = k; | |
4128 if (piece[k] == '(') { | |
4129 char * parpos = strchr(piece + k, ')'); | |
4130 if (parpos != NULL) { | |
4131 chb = k + 1; | |
4132 chl = (int)(parpos - piece) - k - 1; | |
4133 k = k + chl + 1; | |
4134 } | |
4135 } else { | |
4136 if (utf8 && (piece[k] & 0xc0) == 0xc0) { | |
4137 for (k++; utf8 && (piece[k] & 0xc0) == 0
x80; k++); | |
4138 chl = k - chb; | |
4139 k--; | |
4140 } | |
4141 } | |
4142 maptable[j].set[setn] = (char *) malloc(chl + 1)
; | |
4143 if (!maptable[j].set[setn]) return 1; | |
4144 strncpy(maptable[j].set[setn], piece + chb, chl)
; | |
4145 maptable[j].set[setn][chl] = '\0'; | |
4146 setn++; | |
4147 } | |
4148 maptable[j].len = setn; | |
4149 break; } | |
4150 default: break; | |
4151 } | |
4152 i++; | |
4153 } | |
4154 piece = mystrsep(&tp, 0); | |
4155 } | |
4156 if (!maptable[j].set || !maptable[j].len) { | |
4157 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->
getlinenum()); | |
4158 nummap = 0; | |
4159 return 1; | |
4160 } | |
4161 } | |
4162 return 0; | |
4163 } | |
4164 | |
4165 /* parse in the word breakpoint table */ | |
4166 int AffixMgr::parse_breaktable(char * line, FileMgr * af) | |
4167 { | |
4168 if (numbreak > -1) { | |
4169 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
4170 return 1; | |
4171 } | |
4172 char * tp = line; | |
4173 char * piece; | |
4174 int i = 0; | |
4175 int np = 0; | |
4176 piece = mystrsep(&tp, 0); | |
4177 while (piece) { | |
4178 if (*piece != '\0') { | |
4179 switch(i) { | |
4180 case 0: { np++; break; } | |
4181 case 1: { | |
4182 numbreak = atoi(piece); | |
4183 if (numbreak < 0) { | |
4184 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu
mber\n", af->getlinenum()); | |
4185 return 1; | |
4186 } | |
4187 if (numbreak == 0) return 0; | |
4188 breaktable = (char **) malloc(numbreak * sizeof(char *)); | |
4189 if (!breaktable) return 1; | |
4190 np++; | |
4191 break; | |
4192 } | |
4193 default: break; | |
4194 } | |
4195 i++; | |
4196 } | |
4197 piece = mystrsep(&tp, 0); | |
4198 } | |
4199 if (np != 2) { | |
4200 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum(
)); | |
4201 return 1; | |
4202 } | |
4203 | |
4204 /* now parse the numbreak lines to read in the remainder of the table */ | |
4205 char * nl; | |
4206 for (int j=0; j < numbreak; j++) { | |
4207 if (!(nl = af->getline())) return 1; | |
4208 mychomp(nl); | |
4209 tp = nl; | |
4210 i = 0; | |
4211 piece = mystrsep(&tp, 0); | |
4212 while (piece) { | |
4213 if (*piece != '\0') { | |
4214 switch(i) { | |
4215 case 0: { | |
4216 if (strncmp(piece,"BREAK",5) != 0) { | |
4217 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
4218 numbreak = 0; | |
4219 return 1; | |
4220 } | |
4221 break; | |
4222 } | |
4223 case 1: { | |
4224 breaktable[j] = mystrdup(piece); | |
4225 break; | |
4226 } | |
4227 default: break; | |
4228 } | |
4229 i++; | |
4230 } | |
4231 piece = mystrsep(&tp, 0); | |
4232 } | |
4233 if (!breaktable) { | |
4234 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->
getlinenum()); | |
4235 numbreak = 0; | |
4236 return 1; | |
4237 } | |
4238 } | |
4239 return 0; | |
4240 } | |
4241 | |
4242 void AffixMgr::reverse_condition(char * piece) { | |
4243 int neg = 0; | |
4244 for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { | |
4245 switch(*k) { | |
4246 case '[': { | |
4247 if (neg) *(k+1) = '['; else *k = ']'; | |
4248 break; | |
4249 } | |
4250 case ']': { | |
4251 *k = '['; | |
4252 if (neg) *(k+1) = '^'; | |
4253 neg = 0; | |
4254 break; | |
4255 } | |
4256 case '^': { | |
4257 if (*(k+1) == ']') neg = 1; else *(k+1) = *k; | |
4258 break; | |
4259 } | |
4260 default: { | |
4261 if (neg) *(k+1) = *k; | |
4262 } | |
4263 } | |
4264 } | |
4265 } | |
4266 | |
4267 int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf
lags) | |
4268 { | |
4269 int numents = 0; // number of affentry structures to parse | |
4270 | |
4271 unsigned short aflag = 0; // affix char identifier | |
4272 | |
4273 char ff=0; | |
4274 std::vector<affentry> affentries; | |
4275 | |
4276 char * tp = line; | |
4277 char * nl = line; | |
4278 char * piece; | |
4279 int i = 0; | |
4280 | |
4281 // checking lines with bad syntax | |
4282 #ifdef DEBUG | |
4283 int basefieldnum = 0; | |
4284 #endif | |
4285 | |
4286 // split affix header line into pieces | |
4287 | |
4288 int np = 0; | |
4289 | |
4290 piece = mystrsep(&tp, 0); | |
4291 while (piece) { | |
4292 if (*piece != '\0') { | |
4293 switch(i) { | |
4294 // piece 1 - is type of affix | |
4295 case 0: { np++; break; } | |
4296 | |
4297 // piece 2 - is affix char | |
4298 case 1: { | |
4299 np++; | |
4300 aflag = pHMgr->decode_flag(piece); | |
4301 #ifndef HUNSPELL_CHROME_CLIENT // We don't check for duplicates. | |
4302 if (((at == 'S') && (dupflags[aflag] & dupSFX)) || | |
4303 ((at == 'P') && (dupflags[aflag] & dupPFX))) { | |
4304 HUNSPELL_WARNING(stderr, "error: line %d: multiple defin
itions of an affix flag\n", | |
4305 af->getlinenum()); | |
4306 // return 1; XXX permissive mode for bad dictionaries | |
4307 } | |
4308 dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX); | |
4309 #endif | |
4310 break; | |
4311 } | |
4312 // piece 3 - is cross product indicator | |
4313 case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; } | |
4314 | |
4315 // piece 4 - is number of affentries | |
4316 case 3: { | |
4317 np++; | |
4318 numents = atoi(piece); | |
4319 if (numents == 0) { | |
4320 char * err = pHMgr->encode_flag(aflag); | |
4321 if (err) { | |
4322 HUNSPELL_WARNING(stderr, "error: line %d: bad en
try number\n", | |
4323 af->getlinenum()); | |
4324 free(err); | |
4325 } | |
4326 return 1; | |
4327 } | |
4328 affentries.resize(numents); | |
4329 affentries[0].opts = ff; | |
4330 if (utf8) affentries[0].opts += aeUTF8; | |
4331 if (pHMgr->is_aliasf()) affentries[0].opts += aeALIASF; | |
4332 if (pHMgr->is_aliasm()) affentries[0].opts += aeALIASM; | |
4333 affentries[0].aflag = aflag; | |
4334 } | |
4335 | |
4336 default: break; | |
4337 } | |
4338 i++; | |
4339 } | |
4340 piece = mystrsep(&tp, 0); | |
4341 } | |
4342 // check to make sure we parsed enough pieces | |
4343 if (np != 4) { | |
4344 char * err = pHMgr->encode_flag(aflag); | |
4345 if (err) { | |
4346 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getli
nenum()); | |
4347 free(err); | |
4348 } | |
4349 return 1; | |
4350 } | |
4351 | |
4352 // now parse numents affentries for this affix | |
4353 std::vector<affentry>::iterator start = affentries.begin(); | |
4354 std::vector<affentry>::iterator end = affentries.end(); | |
4355 for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { | |
4356 if ((nl = af->getline()) == NULL) return 1; | |
4357 mychomp(nl); | |
4358 tp = nl; | |
4359 i = 0; | |
4360 np = 0; | |
4361 | |
4362 // split line into pieces | |
4363 piece = mystrsep(&tp, 0); | |
4364 while (piece) { | |
4365 if (*piece != '\0') { | |
4366 switch(i) { | |
4367 // piece 1 - is type | |
4368 case 0: { | |
4369 np++; | |
4370 if (entry != start) entry->opts = start->opts & | |
4371 (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); | |
4372 break; | |
4373 } | |
4374 | |
4375 // piece 2 - is affix char | |
4376 case 1: { | |
4377 np++; | |
4378 if (pHMgr->decode_flag(piece) != aflag) { | |
4379 char * err = pHMgr->encode_flag(aflag); | |
4380 if (err) { | |
4381 HUNSPELL_WARNING(stderr, "error: line %d: affix
%s is corrupt\n", | |
4382 af->getlinenum(), err); | |
4383 free(err); | |
4384 } | |
4385 return 1; | |
4386 } | |
4387 | |
4388 if (entry != start) entry->aflag = start->aflag; | |
4389 break; | |
4390 } | |
4391 | |
4392 // piece 3 - is string to strip or 0 for null | |
4393 case 2: { | |
4394 np++; | |
4395 if (complexprefixes) { | |
4396 if (utf8) reverseword_utf(piece); else reverseword(p
iece); | |
4397 } | |
4398 entry->strip = mystrdup(piece); | |
4399 entry->stripl = (unsigned char) strlen(entry->strip); | |
4400 if (strcmp(entry->strip,"0") == 0) { | |
4401 free(entry->strip); | |
4402 entry->strip=mystrdup(""); | |
4403 entry->stripl = 0; | |
4404 } | |
4405 break; | |
4406 } | |
4407 | |
4408 // piece 4 - is affix string or 0 for null | |
4409 case 3: { | |
4410 char * dash; | |
4411 entry->morphcode = NULL; | |
4412 entry->contclass = NULL; | |
4413 entry->contclasslen = 0; | |
4414 np++; | |
4415 dash = strchr(piece, '/'); | |
4416 if (dash) { | |
4417 *dash = '\0'; | |
4418 | |
4419 if (ignorechars) { | |
4420 if (utf8) { | |
4421 remove_ignored_chars_utf(piece, ignorechars_utf1
6, ignorechars_utf16_len); | |
4422 } else { | |
4423 remove_ignored_chars(piece,ignorechars); | |
4424 } | |
4425 } | |
4426 | |
4427 if (complexprefixes) { | |
4428 if (utf8) reverseword_utf(piece); else reversewo
rd(piece); | |
4429 } | |
4430 entry->appnd = mystrdup(piece); | |
4431 | |
4432 if (pHMgr->is_aliasf()) { | |
4433 int index = atoi(dash + 1); | |
4434 entry->contclasslen = (unsigned short) pHMgr->ge
t_aliasf(index, &(entry->contclass), af); | |
4435 if (!entry->contclasslen) HUNSPELL_WARNING(stder
r, "error: bad affix flag alias: \"%s\"\n", dash+1); | |
4436 } else { | |
4437 entry->contclasslen = (unsigned short) pHMgr->de
code_flags(&(entry->contclass), dash + 1, af); | |
4438 flag_qsort(entry->contclass, 0, entry->contclass
len); | |
4439 } | |
4440 *dash = '/'; | |
4441 | |
4442 havecontclass = 1; | |
4443 for (unsigned short _i = 0; _i < entry->contclasslen
; _i++) { | |
4444 contclasses[(entry->contclass)[_i]] = 1; | |
4445 } | |
4446 } else { | |
4447 if (ignorechars) { | |
4448 if (utf8) { | |
4449 remove_ignored_chars_utf(piece, ignorechars_utf1
6, ignorechars_utf16_len); | |
4450 } else { | |
4451 remove_ignored_chars(piece,ignorechars); | |
4452 } | |
4453 } | |
4454 | |
4455 if (complexprefixes) { | |
4456 if (utf8) reverseword_utf(piece); else reversewo
rd(piece); | |
4457 } | |
4458 entry->appnd = mystrdup(piece); | |
4459 } | |
4460 | |
4461 entry->appndl = (unsigned char) strlen(entry->appnd); | |
4462 if (strcmp(entry->appnd,"0") == 0) { | |
4463 free(entry->appnd); | |
4464 entry->appnd=mystrdup(""); | |
4465 entry->appndl = 0; | |
4466 } | |
4467 break; | |
4468 } | |
4469 | |
4470 // piece 5 - is the conditions descriptions | |
4471 case 4: { | |
4472 np++; | |
4473 if (complexprefixes) { | |
4474 if (utf8) reverseword_utf(piece); else reverseword(p
iece); | |
4475 reverse_condition(piece); | |
4476 } | |
4477 if (entry->stripl && (strcmp(piece, ".") != 0) && | |
4478 redundant_condition(at, entry->strip, entry->stripl,
piece, af->getlinenum())) | |
4479 strcpy(piece, "."); | |
4480 if (at == 'S') { | |
4481 reverseword(piece); | |
4482 reverse_condition(piece); | |
4483 } | |
4484 if (encodeit(*entry, piece)) return 1; | |
4485 break; | |
4486 } | |
4487 | |
4488 case 5: { | |
4489 np++; | |
4490 if (pHMgr->is_aliasm()) { | |
4491 int index = atoi(piece); | |
4492 entry->morphcode = pHMgr->get_aliasm(index); | |
4493 } else { | |
4494 if (complexprefixes) { // XXX - fix me for morph. ge
n. | |
4495 if (utf8) reverseword_utf(piece); else reversewo
rd(piece); | |
4496 } | |
4497 // add the remaining of the line | |
4498 if (*tp) { | |
4499 *(tp - 1) = ' '; | |
4500 tp = tp + strlen(tp); | |
4501 } | |
4502 entry->morphcode = mystrdup(piece); | |
4503 if (!entry->morphcode) return 1; | |
4504 } | |
4505 break; | |
4506 } | |
4507 default: break; | |
4508 } | |
4509 i++; | |
4510 } | |
4511 piece = mystrsep(&tp, 0); | |
4512 } | |
4513 // check to make sure we parsed enough pieces | |
4514 if (np < 4) { | |
4515 char * err = pHMgr->encode_flag(aflag); | |
4516 if (err) { | |
4517 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n", | |
4518 af->getlinenum(), err); | |
4519 free(err); | |
4520 } | |
4521 return 1; | |
4522 } | |
4523 | |
4524 #ifdef DEBUG | |
4525 // detect unnecessary fields, excepting comments | |
4526 if (basefieldnum) { | |
4527 int fieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5
: 6); | |
4528 if (fieldnum != basefieldnum) | |
4529 HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af-
>getlinenum()); | |
4530 } else { | |
4531 basefieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5
: 6); | |
4532 } | |
4533 #endif | |
4534 } | |
4535 | |
4536 // now create SfxEntry or PfxEntry objects and use links to | |
4537 // build an ordered (sorted by affix string) list | |
4538 for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { | |
4539 if (at == 'P') { | |
4540 PfxEntry * pfxptr = new PfxEntry(this,&(*entry)); | |
4541 build_pfxtree(pfxptr); | |
4542 } else { | |
4543 SfxEntry * sfxptr = new SfxEntry(this,&(*entry)); | |
4544 build_sfxtree(sfxptr); | |
4545 } | |
4546 } | |
4547 return 0; | |
4548 } | |
4549 | |
4550 int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char
* cond, int linenum) { | |
4551 int condl = strlen(cond); | |
4552 int i; | |
4553 int j; | |
4554 int neg; | |
4555 int in; | |
4556 if (ft == 'P') { // prefix | |
4557 if (strncmp(strip, cond, condl) == 0) return 1; | |
4558 if (utf8) { | |
4559 } else { | |
4560 for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { | |
4561 if (cond[j] != '[') { | |
4562 if (cond[j] != strip[i]) { | |
4563 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping c
haracters and condition\n", linenum); | |
4564 return 0; | |
4565 } | |
4566 } else { | |
4567 neg = (cond[j+1] == '^') ? 1 : 0; | |
4568 in = 0; | |
4569 do { | |
4570 j++; | |
4571 if (strip[i] == cond[j]) in = 1; | |
4572 } while ((j < (condl - 1)) && (cond[j] != ']')); | |
4573 if (j == (condl - 1) && (cond[j] != ']')) { | |
4574 HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%
s\n", linenum, cond); | |
4575 return 0; | |
4576 } | |
4577 if ((!neg && !in) || (neg && in)) { | |
4578 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping c
haracters and condition\n", linenum); | |
4579 return 0; | |
4580 } | |
4581 } | |
4582 } | |
4583 if (j >= condl) return 1; | |
4584 } | |
4585 } else { // suffix | |
4586 if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1
; | |
4587 if (utf8) { | |
4588 } else { | |
4589 for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { | |
4590 if (cond[j] != ']') { | |
4591 if (cond[j] != strip[i]) { | |
4592 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping c
haracters and condition\n", linenum); | |
4593 return 0; | |
4594 } | |
4595 } else { | |
4596 in = 0; | |
4597 do { | |
4598 j--; | |
4599 if (strip[i] == cond[j]) in = 1; | |
4600 } while ((j > 0) && (cond[j] != '[')); | |
4601 if ((j == 0) && (cond[j] != '[')) { | |
4602 HUNSPELL_WARNING(stderr, "error: line: %d: missing ] in condition:\n
%s\n", linenum, cond); | |
4603 return 0; | |
4604 } | |
4605 neg = (cond[j+1] == '^') ? 1 : 0; | |
4606 if ((!neg && !in) || (neg && in)) { | |
4607 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping c
haracters and condition\n", linenum); | |
4608 return 0; | |
4609 } | |
4610 } | |
4611 } | |
4612 if (j < 0) return 1; | |
4613 } | |
4614 } | |
4615 return 0; | |
4616 } | |
OLD | NEW |