OLD | NEW |
| (Empty) |
1 #include "license.hunspell" | |
2 #include "license.myspell" | |
3 | |
4 #include <stdlib.h> | |
5 #include <string.h> | |
6 #include <stdio.h> | |
7 #include <ctype.h> | |
8 | |
9 #include "affentry.hxx" | |
10 #include "csutil.hxx" | |
11 | |
12 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) | |
13 { | |
14 // register affix manager | |
15 pmyMgr = pmgr; | |
16 | |
17 // set up its initial values | |
18 | |
19 aflag = dp->aflag; // flag | |
20 strip = dp->strip; // string to strip | |
21 appnd = dp->appnd; // string to append | |
22 stripl = dp->stripl; // length of strip string | |
23 appndl = dp->appndl; // length of append string | |
24 numconds = dp->numconds; // length of the condition | |
25 opts = dp->opts; // cross product flag | |
26 // then copy over all of the conditions | |
27 if (opts & aeLONGCOND) { | |
28 memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); | |
29 c.l.conds2 = dp->c.l.conds2; | |
30 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); | |
31 next = NULL; | |
32 nextne = NULL; | |
33 nexteq = NULL; | |
34 morphcode = dp->morphcode; | |
35 contclass = dp->contclass; | |
36 contclasslen = dp->contclasslen; | |
37 } | |
38 | |
39 | |
40 PfxEntry::~PfxEntry() | |
41 { | |
42 aflag = 0; | |
43 if (appnd) free(appnd); | |
44 if (strip) free(strip); | |
45 pmyMgr = NULL; | |
46 appnd = NULL; | |
47 strip = NULL; | |
48 if (opts & aeLONGCOND) free(c.l.conds2); | |
49 if (morphcode && !(opts & aeALIASM)) free(morphcode); | |
50 if (contclass && !(opts & aeALIASF)) free(contclass); | |
51 } | |
52 | |
53 // add prefix to this word assuming conditions hold | |
54 char * PfxEntry::add(const char * word, int len) | |
55 { | |
56 char tword[MAXWORDUTF8LEN + 4]; | |
57 | |
58 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && | |
59 (len >= numconds) && test_condition(word) && | |
60 (!stripl || (strncmp(word, strip, stripl) == 0)) && | |
61 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { | |
62 /* we have a match so add prefix */ | |
63 char * pp = tword; | |
64 if (appndl) { | |
65 strcpy(tword,appnd); | |
66 pp += appndl; | |
67 } | |
68 strcpy(pp, (word + stripl)); | |
69 return mystrdup(tword); | |
70 } | |
71 return NULL; | |
72 } | |
73 | |
74 inline char * PfxEntry::nextchar(char * p) { | |
75 if (p) { | |
76 p++; | |
77 if (opts & aeLONGCOND) { | |
78 // jump to the 2nd part of the condition | |
79 if (p == c.conds + MAXCONDLEN_1) return c.l.conds2; | |
80 // end of the MAXCONDLEN length condition | |
81 } else if (p == c.conds + MAXCONDLEN) return NULL; | |
82 return *p ? p : NULL; | |
83 } | |
84 return NULL; | |
85 } | |
86 | |
// Tests whether the beginning of `st` satisfies this entry's condition
// pattern. The pattern is read from c.conds one byte at a time through
// nextchar(), the input is consumed left to right.
// Returns 1 when numconds is 0 or the pattern is exhausted without a
// mismatch; returns 0 on a mismatch or when the input ends while pattern
// bytes remain.
87 inline int PfxEntry::test_condition(const char * st) | |
88 { | |
89 const char * pos = NULL; // group with pos input position | |
90 bool neg = false; // complementer | |
91 bool ingroup = false; // character in the group | |
92 if (numconds == 0) return 1; | |
93 char * p = c.conds; | |
94 while (1) { | |
95 switch (*p) { | |
96 case '\0': return 1; | |
// '[' opens a group: reset the group state and remember where the input
// stood so that a failed multibyte comparison can rewind to it.
97 case '[': { | |
98 neg = false; | |
99 ingroup = false; | |
100 p = nextchar(p); | |
101 pos = st; break; | |
102 } | |
// '^' marks the group as negated.
103 case '^': { p = nextchar(p); neg = true; break; } | |
// ']' closes a group: fail if a negated group matched or a plain group
// did not. If no member consumed the input character, step over it here
// (including UTF-8 continuation bytes 10xxxxxx when aeUTF8 is set).
104 case ']': { | |
105 if ((neg && ingroup) || (!neg && !ingroup)) return 0; | |
106 pos = NULL; | |
107 p = nextchar(p); | |
108 // skip the next character | |
109 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0)
== 0x80; st++); | |
110 if (*st == '\0' && p) return 0; // word <= condition | |
111 break; | |
112 } | |
// Outside a group '.' consumes one input character. Inside a group
// (pos != NULL) there is no break, so control falls through to the
// literal comparison in `default`.
113 case '.': if (!pos) { // dots are not metacharacters in groups: [.] | |
114 p = nextchar(p); | |
115 // skip the next character | |
116 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); | |
117 if (*st == '\0' && p) return 0; // word <= condition | |
118 break; | |
119 } | |
// Literal byte: compare with the current input byte.
120 default: { | |
121 if (*st == *p) { | |
122 st++; | |
123 p = nextchar(p); | |
// For a UTF-8 lead byte also compare the continuation bytes; on a
// mismatch inside a group rewind the input to `pos`, outside a group fail.
124 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte | |
125 while (p && (*p & 0xc0) == 0x80) { // character | |
126 if (*p != *st) { | |
127 if (!pos) return 0; | |
128 st = pos; | |
129 break; | |
130 } | |
131 p = nextchar(p); | |
132 st++; | |
133 } | |
// A group member matched: record it and skip the pattern forward to ']'.
134 if (pos && st != pos) { | |
135 ingroup = true; | |
136 while (p && *p != ']' && ((p = nextchar(p)) != NULL)
); | |
137 } | |
138 } else if (pos) { | |
139 ingroup = true; | |
140 while (p && *p != ']' && ((p = nextchar(p)) != NULL)); | |
141 } | |
// No match: inside a group try the next member, otherwise fail.
142 } else if (pos) { // group | |
143 p = nextchar(p); | |
144 } else return 0; | |
145 } | |
146 } | |
// nextchar() returned NULL: the pattern is exhausted.
147 if (!p) return 1; | |
148 } | |
149 } | |
150 | |
151 // check if this prefix entry matches | |
152 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound
, const FLAG needflag) | |
153 { | |
154 int tmpl; // length of tmpword | |
155 struct hentry * he; // hash entry of root word or NULL | |
156 char tmpword[MAXWORDUTF8LEN + 4]; | |
157 | |
158 // on entry prefix is 0 length or already matches the beginning of the word. | |
159 // So if the remaining root word has positive length | |
160 // and if there are enough chars in root word and added back strip chars | |
161 // to meet the number of characters conditions, then test it | |
162 | |
163 tmpl = len - appndl; | |
164 | |
165 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { | |
166 | |
167 // generate new root word by removing prefix and adding | |
168 // back any characters that would have been stripped | |
169 | |
170 if (stripl) strcpy (tmpword, strip); | |
171 strcpy ((tmpword + stripl), (word + appndl)); | |
172 | |
173 // now make sure all of the conditions on characters | |
174 // are met. Please see the appendix at the end of | |
175 // this file for more info on exactly what is being | |
176 // tested | |
177 | |
178 // if all conditions are met then check if resulting | |
179 // root word in the dictionary | |
180 | |
181 if (test_condition(tmpword)) { | |
182 tmpl += stripl; | |
183 if ((he = pmyMgr->lookup(tmpword)) != NULL) { | |
184 do { | |
185 if (TESTAFF(he->astr, aflag, he->alen) && | |
186 // forbid single prefixes with needaffix flag | |
187 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclassl
en) && | |
188 // needflag | |
189 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || | |
190 (contclass && TESTAFF(contclass, needflag, contclasslen
)))) | |
191 return he; | |
192 he = he->next_homonym; // check homonyms | |
193 } while (he); | |
194 } | |
195 | |
196 // prefix matched but no root word was found | |
197 // if aeXPRODUCT is allowed, try again but now | |
198 // ross checked combined with a suffix | |
199 | |
200 //if ((opts & aeXPRODUCT) && in_compound) { | |
201 if ((opts & aeXPRODUCT)) { | |
202 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NU
LL, | |
203 0, NULL, FLAG_NULL, needflag, in_compound); | |
204 if (he) return he; | |
205 } | |
206 } | |
207 } | |
208 return NULL; | |
209 } | |
210 | |
211 // check if this prefix entry matches | |
212 struct hentry * PfxEntry::check_twosfx(const char * word, int len, | |
213 char in_compound, const FLAG needflag) | |
214 { | |
215 int tmpl; // length of tmpword | |
216 struct hentry * he; // hash entry of root word or NULL | |
217 char tmpword[MAXWORDUTF8LEN + 4]; | |
218 | |
219 // on entry prefix is 0 length or already matches the beginning of the word. | |
220 // So if the remaining root word has positive length | |
221 // and if there are enough chars in root word and added back strip chars | |
222 // to meet the number of characters conditions, then test it | |
223 | |
224 tmpl = len - appndl; | |
225 | |
226 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && | |
227 (tmpl + stripl >= numconds)) { | |
228 | |
229 // generate new root word by removing prefix and adding | |
230 // back any characters that would have been stripped | |
231 | |
232 if (stripl) strcpy (tmpword, strip); | |
233 strcpy ((tmpword + stripl), (word + appndl)); | |
234 | |
235 // now make sure all of the conditions on characters | |
236 // are met. Please see the appendix at the end of | |
237 // this file for more info on exactly what is being | |
238 // tested | |
239 | |
240 // if all conditions are met then check if resulting | |
241 // root word in the dictionary | |
242 | |
243 if (test_condition(tmpword)) { | |
244 tmpl += stripl; | |
245 | |
246 // prefix matched but no root word was found | |
247 // if aeXPRODUCT is allowed, try again but now | |
248 // cross checked combined with a suffix | |
249 | |
250 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { | |
251 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, t
his, needflag); | |
252 if (he) return he; | |
253 } | |
254 } | |
255 } | |
256 return NULL; | |
257 } | |
258 | |
259 // check if this prefix entry matches | |
260 char * PfxEntry::check_twosfx_morph(const char * word, int len, | |
261 char in_compound, const FLAG needflag) | |
262 { | |
263 int tmpl; // length of tmpword | |
264 char tmpword[MAXWORDUTF8LEN + 4]; | |
265 | |
266 // on entry prefix is 0 length or already matches the beginning of the word. | |
267 // So if the remaining root word has positive length | |
268 // and if there are enough chars in root word and added back strip chars | |
269 // to meet the number of characters conditions, then test it | |
270 | |
271 tmpl = len - appndl; | |
272 | |
273 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && | |
274 (tmpl + stripl >= numconds)) { | |
275 | |
276 // generate new root word by removing prefix and adding | |
277 // back any characters that would have been stripped | |
278 | |
279 if (stripl) strcpy (tmpword, strip); | |
280 strcpy ((tmpword + stripl), (word + appndl)); | |
281 | |
282 // now make sure all of the conditions on characters | |
283 // are met. Please see the appendix at the end of | |
284 // this file for more info on exactly what is being | |
285 // tested | |
286 | |
287 // if all conditions are met then check if resulting | |
288 // root word in the dictionary | |
289 | |
290 if (test_condition(tmpword)) { | |
291 tmpl += stripl; | |
292 | |
293 // prefix matched but no root word was found | |
294 // if aeXPRODUCT is allowed, try again but now | |
295 // ross checked combined with a suffix | |
296 | |
297 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { | |
298 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl, | |
299 aeXPRODUCT, this, needflag); | |
300 } | |
301 } | |
302 } | |
303 return NULL; | |
304 } | |
305 | |
306 // check if this prefix entry matches | |
307 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const
FLAG needflag) | |
308 { | |
309 int tmpl; // length of tmpword | |
310 struct hentry * he; // hash entry of root word or NULL | |
311 char tmpword[MAXWORDUTF8LEN + 4]; | |
312 char result[MAXLNLEN]; | |
313 char * st; | |
314 | |
315 *result = '\0'; | |
316 | |
317 // on entry prefix is 0 length or already matches the beginning of the word. | |
318 // So if the remaining root word has positive length | |
319 // and if there are enough chars in root word and added back strip chars | |
320 // to meet the number of characters conditions, then test it | |
321 | |
322 tmpl = len - appndl; | |
323 | |
324 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && | |
325 (tmpl + stripl >= numconds)) { | |
326 | |
327 // generate new root word by removing prefix and adding | |
328 // back any characters that would have been stripped | |
329 | |
330 if (stripl) strcpy (tmpword, strip); | |
331 strcpy ((tmpword + stripl), (word + appndl)); | |
332 | |
333 // now make sure all of the conditions on characters | |
334 // are met. Please see the appendix at the end of | |
335 // this file for more info on exactly what is being | |
336 // tested | |
337 | |
338 // if all conditions are met then check if resulting | |
339 // root word in the dictionary | |
340 | |
341 if (test_condition(tmpword)) { | |
342 tmpl += stripl; | |
343 if ((he = pmyMgr->lookup(tmpword)) != NULL) { | |
344 do { | |
345 if (TESTAFF(he->astr, aflag, he->alen) && | |
346 // forbid single prefixes with needaffix flag | |
347 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclassl
en) && | |
348 // needflag | |
349 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || | |
350 (contclass && TESTAFF(contclass, needflag, contclasslen
)))) { | |
351 if (morphcode) { | |
352 mystrcat(result, " ", MAXLNLEN); | |
353 mystrcat(result, morphcode, MAXLNLEN); | |
354 } else mystrcat(result,getKey(), MAXLNLEN); | |
355 if (!HENTRY_FIND(he, MORPH_STEM)) { | |
356 mystrcat(result, " ", MAXLNLEN); | |
357 mystrcat(result, MORPH_STEM, MAXLNLEN); | |
358 mystrcat(result, HENTRY_WORD(he), MAXLNLEN); | |
359 } | |
360 // store the pointer of the hash entry | |
361 if (HENTRY_DATA(he)) { | |
362 mystrcat(result, " ", MAXLNLEN); | |
363 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN); | |
364 } else { | |
365 // return with debug information | |
366 char * flag = pmyMgr->encode_flag(getFlag()); | |
367 mystrcat(result, " ", MAXLNLEN); | |
368 mystrcat(result, MORPH_FLAG, MAXLNLEN); | |
369 mystrcat(result, flag, MAXLNLEN); | |
370 free(flag); | |
371 } | |
372 mystrcat(result, "\n", MAXLNLEN); | |
373 } | |
374 he = he->next_homonym; | |
375 } while (he); | |
376 } | |
377 | |
378 // prefix matched but no root word was found | |
379 // if aeXPRODUCT is allowed, try again but now | |
380 // ross checked combined with a suffix | |
381 | |
382 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { | |
383 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, th
is, | |
384 FLAG_NULL, needflag); | |
385 if (st) { | |
386 mystrcat(result, st, MAXLNLEN); | |
387 free(st); | |
388 } | |
389 } | |
390 } | |
391 } | |
392 | |
393 if (*result) return mystrdup(result); | |
394 return NULL; | |
395 } | |
396 | |
397 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) | |
398 { | |
399 // register affix manager | |
400 pmyMgr = pmgr; | |
401 | |
402 // set up its initial values | |
403 aflag = dp->aflag; // char flag | |
404 strip = dp->strip; // string to strip | |
405 appnd = dp->appnd; // string to append | |
406 stripl = dp->stripl; // length of strip string | |
407 appndl = dp->appndl; // length of append string | |
408 numconds = dp->numconds; // length of the condition | |
409 opts = dp->opts; // cross product flag | |
410 | |
411 // then copy over all of the conditions | |
412 if (opts & aeLONGCOND) { | |
413 memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); | |
414 c.l.conds2 = dp->c.l.conds2; | |
415 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); | |
416 next = NULL; | |
417 nextne = NULL; | |
418 nexteq = NULL; | |
419 rappnd = myrevstrdup(appnd); | |
420 morphcode = dp->morphcode; | |
421 contclass = dp->contclass; | |
422 contclasslen = dp->contclasslen; | |
423 } | |
424 | |
425 | |
426 SfxEntry::~SfxEntry() | |
427 { | |
428 aflag = 0; | |
429 if (appnd) free(appnd); | |
430 if (rappnd) free(rappnd); | |
431 if (strip) free(strip); | |
432 pmyMgr = NULL; | |
433 appnd = NULL; | |
434 strip = NULL; | |
435 if (opts & aeLONGCOND) free(c.l.conds2); | |
436 if (morphcode && !(opts & aeALIASM)) free(morphcode); | |
437 if (contclass && !(opts & aeALIASF)) free(contclass); | |
438 } | |
439 | |
440 // add suffix to this word assuming conditions hold | |
441 char * SfxEntry::add(const char * word, int len) | |
442 { | |
443 char tword[MAXWORDUTF8LEN + 4]; | |
444 | |
445 /* make sure all conditions match */ | |
446 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && | |
447 (len >= numconds) && test_condition(word + len, word) && | |
448 (!stripl || (strcmp(word + len - stripl, strip) == 0)) && | |
449 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { | |
450 /* we have a match so add suffix */ | |
451 strcpy(tword,word); | |
452 if (appndl) { | |
453 strcpy(tword + len - stripl, appnd); | |
454 } else { | |
455 *(tword + len - stripl) = '\0'; | |
456 } | |
457 return mystrdup(tword); | |
458 } | |
459 return NULL; | |
460 } | |
461 | |
462 inline char * SfxEntry::nextchar(char * p) { | |
463 if (p) { | |
464 p++; | |
465 if (opts & aeLONGCOND) { | |
466 // jump to the 2nd part of the condition | |
467 if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2; | |
468 // end of the MAXCONDLEN length condition | |
469 } else if (p == c.conds + MAXCONDLEN) return NULL; | |
470 return *p ? p : NULL; | |
471 } | |
472 return NULL; | |
473 } | |
474 | |
// Tests whether the end of a word satisfies this entry's condition pattern.
// `st` points just past the last byte to examine and `beg` at the first byte
// of the word; the input is walked backwards (st is decremented) while the
// pattern is read forwards from c.conds through nextchar().
// NOTE(review): this implies the stored suffix pattern is in reversed
// order - confirm against the code that fills c.conds.
// Returns 1 when numconds is 0, when the pattern is exhausted without a
// mismatch, or when the last condition (i == numconds) is matched inside a
// group; returns 0 on a mismatch or when the word runs out while pattern
// bytes remain.
475 inline int SfxEntry::test_condition(const char * st, const char * beg) | |
476 { | |
477 const char * pos = NULL; // group with pos input position | |
478 bool neg = false; // complementer | |
479 bool ingroup = false; // character in the group | |
480 if (numconds == 0) return 1; | |
481 char * p = c.conds; | |
// step back onto the last byte of the word
482 st--; | |
// 1-based index of the condition currently being tested
483 int i = 1; | |
484 while (1) { | |
485 switch (*p) { | |
486 case '\0': return 1; | |
// '[' opens a group and remembers the input position; '^' negates it.
487 case '[': { p = nextchar(p); pos = st; break; } | |
488 case '^': { p = nextchar(p); neg = true; break; } | |
// ']' closes a group: a plain group without a matching member fails.
// If no member consumed the input character, step back over it here
// (UTF-8 continuation bytes first when aeUTF8 is set), then reset the
// group state for the next condition.
489 case ']': { if (!neg && !ingroup) return 0; | |
490 i++; | |
491 // skip the next character | |
492 if (!ingroup) { | |
493 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x8
0; st--); | |
494 st--; | |
495 } | |
496 pos = NULL; | |
497 neg = false; | |
498 ingroup = false; | |
499 p = nextchar(p); | |
500 if (st < beg && p) return 0; // word <= condition | |
501 break; | |
502 } | |
// Outside a group '.' consumes one input character. Inside a group
// (pos != NULL) there is no break, so control falls through to the
// literal comparison in `default`.
503 case '.': if (!pos) { // dots are not metacharacters in groups: [.] | |
504 p = nextchar(p); | |
505 // skip the next character | |
506 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x8
0; st--); | |
507 if (st < beg) { // word <= condition | |
508 if (p) return 0; else return 1; | |
509 } | |
510 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 char
acter | |
511 st--; | |
512 if (st < beg) { // word <= condition | |
513 if (p) return 0; else return 1; | |
514 } | |
515 } | |
516 break; | |
517 } | |
// Literal byte: compare with the current input byte.
518 default: { | |
519 if (*st == *p) { | |
520 p = nextchar(p); | |
// Multibyte UTF-8 character: keep comparing bytes while moving backwards
// in the input; on a mismatch inside a group rewind to `pos`, outside a
// group fail.
521 if ((opts & aeUTF8) && (*st & 0x80)) { | |
522 st--; | |
523 while (p && (st >= beg)) { | |
524 if (*p != *st) { | |
525 if (!pos) return 0; | |
526 st = pos; | |
527 break; | |
528 } | |
529 // first byte of the UTF-8 multibyte character | |
530 if ((*p & 0xc0) != 0x80) break; | |
531 p = nextchar(p); | |
532 st--; | |
533 } | |
// A group member matched: a negated group fails, the last condition
// succeeds immediately, otherwise skip the pattern forward to ']'.
534 if (pos && st != pos) { | |
535 if (neg) return 0; | |
536 else if (i == numconds) return 1; | |
537 ingroup = true; | |
538 while (p && *p != ']' && ((p = nextchar(p)) != NULL)
); | |
539 st--; | |
540 } | |
541 if (p && *p != ']') p = nextchar(p); | |
// Single-byte member of a group matched: same handling as above.
542 } else if (pos) { | |
543 if (neg) return 0; | |
544 else if (i == numconds) return 1; | |
545 ingroup = true; | |
546 while (p && *p != ']' && ((p = nextchar(p)) != NULL)); | |
547 // if (p && *p != ']') p = nextchar(p); | |
548 st--; | |
549 } | |
// Plain literal outside a group: one condition done, one character back.
550 if (!pos) { | |
551 i++; | |
552 st--; | |
553 } | |
554 if (st < beg && p && *p != ']') return 0; // word <= conditi
on | |
// No match: inside a group try the next member, otherwise fail.
555 } else if (pos) { // group | |
556 p = nextchar(p); | |
557 } else return 0; | |
558 } | |
559 } | |
// nextchar() returned NULL: the pattern is exhausted.
560 if (!p) return 1; | |
561 } | |
562 } | |
563 | |
564 // see if this suffix is present in the word | |
565 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, | |
566 PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const
FLAG needflag, | |
567 const FLAG badflag) | |
568 { | |
569 int tmpl; // length of tmpword | |
570 struct hentry * he; // hash entry pointer | |
571 unsigned char * cp; | |
572 char tmpword[MAXWORDUTF8LEN + 4]; | |
573 PfxEntry* ep = ppfx; | |
574 | |
575 // if this suffix is being cross checked with a prefix | |
576 // but it does not support cross products skip it | |
577 | |
578 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0)) | |
579 return NULL; | |
580 | |
581 // upon entry suffix is 0 length or already matches the end of the word. | |
582 // So if the remaining root word has positive length | |
583 // and if there are enough chars in root word and added back strip chars | |
584 // to meet the number of characters conditions, then test it | |
585 | |
586 tmpl = len - appndl; | |
587 // the second condition is not enough for UTF-8 strings | |
588 // it checked in test_condition() | |
589 | |
590 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && | |
591 (tmpl + stripl >= numconds)) { | |
592 | |
593 // generate new root word by removing suffix and adding | |
594 // back any characters that would have been stripped or | |
595 // or null terminating the shorter string | |
596 | |
597 strcpy (tmpword, word); | |
598 cp = (unsigned char *)(tmpword + tmpl); | |
599 if (stripl) { | |
600 strcpy ((char *)cp, strip); | |
601 tmpl += stripl; | |
602 cp = (unsigned char *)(tmpword + tmpl); | |
603 } else *cp = '\0'; | |
604 | |
605 // now make sure all of the conditions on characters | |
606 // are met. Please see the appendix at the end of | |
607 // this file for more info on exactly what is being | |
608 // tested | |
609 | |
610 // if all conditions are met then check if resulting | |
611 // root word in the dictionary | |
612 | |
613 if (test_condition((char *) cp, (char *) tmpword)) { | |
614 | |
615 #ifdef SZOSZABLYA_POSSIBLE_ROOTS | |
616 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag); | |
617 #endif | |
618 if ((he = pmyMgr->lookup(tmpword)) != NULL) { | |
619 do { | |
620 // check conditional suffix (enabled by prefix) | |
621 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->ge
tCont() && | |
622 TESTAFF(ep->getCont(), aflag, ep->getContLen
()))) && | |
623 (((optflags & aeXPRODUCT) == 0) || | |
624 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) |
| | |
625 // enabled by prefix | |
626 ((contclass) && (ep && TESTAFF(contclass, ep->getFla
g(), contclasslen))) | |
627 ) && | |
628 // handle cont. class | |
629 ((!cclass) || | |
630 ((contclass) && TESTAFF(contclass, cclass, contc
lasslen)) | |
631 ) && | |
632 // check only in compound homonyms (bad flags) | |
633 (!badflag || !TESTAFF(he->astr, badflag, he->alen) | |
634 ) && | |
635 // handle required flag | |
636 ((!needflag) || | |
637 (TESTAFF(he->astr, needflag, he->alen) || | |
638 ((contclass) && TESTAFF(contclass, needflag, contc
lasslen))) | |
639 ) | |
640 ) return he; | |
641 he = he->next_homonym; // check homonyms | |
642 } while (he); | |
643 | |
644 // obsolote stemming code (used only by the | |
645 // experimental SuffixMgr:suggest_pos_stems) | |
646 // store resulting root in wlst | |
647 } else if (wlst && (*ns < maxSug)) { | |
648 int cwrd = 1; | |
649 for (int k=0; k < *ns; k++) | |
650 if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; | |
651 if (cwrd) { | |
652 wlst[*ns] = mystrdup(tmpword); | |
653 if (wlst[*ns] == NULL) { | |
654 for (int j=0; j<*ns; j++) free(wlst[j]); | |
655 *ns = -1; | |
656 return NULL; | |
657 } | |
658 (*ns)++; | |
659 } | |
660 } | |
661 } | |
662 } | |
663 return NULL; | |
664 } | |
665 | |
666 // see if two-level suffix is present in the word | |
667 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, | |
668 PfxEntry* ppfx, const FLAG needflag) | |
669 { | |
670 int tmpl; // length of tmpword | |
671 struct hentry * he; // hash entry pointer | |
672 unsigned char * cp; | |
673 char tmpword[MAXWORDUTF8LEN + 4]; | |
674 PfxEntry* ep = ppfx; | |
675 | |
676 | |
677 // if this suffix is being cross checked with a prefix | |
678 // but it does not support cross products skip it | |
679 | |
680 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) | |
681 return NULL; | |
682 | |
683 // upon entry suffix is 0 length or already matches the end of the word. | |
684 // So if the remaining root word has positive length | |
685 // and if there are enough chars in root word and added back strip chars | |
686 // to meet the number of characters conditions, then test it | |
687 | |
688 tmpl = len - appndl; | |
689 | |
690 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && | |
691 (tmpl + stripl >= numconds)) { | |
692 | |
693 // generate new root word by removing suffix and adding | |
694 // back any characters that would have been stripped or | |
695 // or null terminating the shorter string | |
696 | |
697 strcpy (tmpword, word); | |
698 cp = (unsigned char *)(tmpword + tmpl); | |
699 if (stripl) { | |
700 strcpy ((char *)cp, strip); | |
701 tmpl += stripl; | |
702 cp = (unsigned char *)(tmpword + tmpl); | |
703 } else *cp = '\0'; | |
704 | |
705 // now make sure all of the conditions on characters | |
706 // are met. Please see the appendix at the end of | |
707 // this file for more info on exactly what is being | |
708 // tested | |
709 | |
710 // if all conditions are met then recall suffix_check | |
711 | |
712 if (test_condition((char *) cp, (char *) tmpword)) { | |
713 if (ppfx) { | |
714 // handle conditional suffix | |
715 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contcla
sslen)) | |
716 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL,
0, NULL, (FLAG) aflag, needflag); | |
717 else | |
718 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx,
NULL, 0, NULL, (FLAG) aflag, needflag); | |
719 } else { | |
720 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, N
ULL, (FLAG) aflag, needflag); | |
721 } | |
722 if (he) return he; | |
723 } | |
724 } | |
725 return NULL; | |
726 } | |
727 | |
728 // see if two-level suffix is present in the word | |
729 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, | |
730 PfxEntry* ppfx, const FLAG needflag) | |
731 { | |
732 int tmpl; // length of tmpword | |
733 unsigned char * cp; | |
734 char tmpword[MAXWORDUTF8LEN + 4]; | |
735 PfxEntry* ep = ppfx; | |
736 char * st; | |
737 | |
738 char result[MAXLNLEN]; | |
739 | |
740 *result = '\0'; | |
741 | |
742 // if this suffix is being cross checked with a prefix | |
743 // but it does not support cross products skip it | |
744 | |
745 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) | |
746 return NULL; | |
747 | |
748 // upon entry suffix is 0 length or already matches the end of the word. | |
749 // So if the remaining root word has positive length | |
750 // and if there are enough chars in root word and added back strip chars | |
751 // to meet the number of characters conditions, then test it | |
752 | |
753 tmpl = len - appndl; | |
754 | |
755 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && | |
756 (tmpl + stripl >= numconds)) { | |
757 | |
758 // generate new root word by removing suffix and adding | |
759 // back any characters that would have been stripped or | |
760 // or null terminating the shorter string | |
761 | |
762 strcpy (tmpword, word); | |
763 cp = (unsigned char *)(tmpword + tmpl); | |
764 if (stripl) { | |
765 strcpy ((char *)cp, strip); | |
766 tmpl += stripl; | |
767 cp = (unsigned char *)(tmpword + tmpl); | |
768 } else *cp = '\0'; | |
769 | |
770 // now make sure all of the conditions on characters | |
771 // are met. Please see the appendix at the end of | |
772 // this file for more info on exactly what is being | |
773 // tested | |
774 | |
775 // if all conditions are met then recall suffix_check | |
776 | |
777 if (test_condition((char *) cp, (char *) tmpword)) { | |
778 if (ppfx) { | |
779 // handle conditional suffix | |
780 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contcla
sslen)) { | |
781 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL,
aflag, needflag); | |
782 if (st) { | |
783 if (ppfx->getMorph()) { | |
784 mystrcat(result, ppfx->getMorph(), MAXLNLEN); | |
785 mystrcat(result, " ", MAXLNLEN); | |
786 } | |
787 mystrcat(result,st, MAXLNLEN); | |
788 free(st); | |
789 mychomp(result); | |
790 } | |
791 } else { | |
792 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags,
ppfx, aflag, needflag); | |
793 if (st) { | |
794 mystrcat(result, st, MAXLNLEN); | |
795 free(st); | |
796 mychomp(result); | |
797 } | |
798 } | |
799 } else { | |
800 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL,
aflag, needflag); | |
801 if (st) { | |
802 mystrcat(result, st, MAXLNLEN); | |
803 free(st); | |
804 mychomp(result); | |
805 } | |
806 } | |
807 if (*result) return mystrdup(result); | |
808 } | |
809 } | |
810 return NULL; | |
811 } | |
812 | |
813 // get next homonym with same affix | |
814 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, Pfx
Entry* ppfx, | |
815 const FLAG cclass, const FLAG needflag) | |
816 { | |
817 PfxEntry* ep = ppfx; | |
818 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; | |
819 | |
820 while (he->next_homonym) { | |
821 he = he->next_homonym; | |
822 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTA
FF(ep->getCont(), aflag, ep->getContLen()))) && | |
823 ((optflags & aeXPRODUCT) == 0 || | |
824 TESTAFF(he->astr, eFlag, he->alen) || | |
825 // handle conditional suffix | |
826 ((contclass) && TESTAFF(contclass, eFlag, contclassl
en)) | |
827 ) && | |
828 // handle cont. class | |
829 ((!cclass) || | |
830 ((contclass) && TESTAFF(contclass, cclass, contc
lasslen)) | |
831 ) && | |
832 // handle required flag | |
833 ((!needflag) || | |
834 (TESTAFF(he->astr, needflag, he->alen) || | |
835 ((contclass) && TESTAFF(contclass, needflag, contc
lasslen))) | |
836 ) | |
837 ) return he; | |
838 } | |
839 return NULL; | |
840 } | |
841 | |
842 | |
843 #if 0 | |
844 | |
845 Appendix: Understanding Affix Code | |
846 | |
847 | |
848 An affix is either a prefix or a suffix attached to root words to make | |
849 other words. | |
850 | |
851 Basically a Prefix or a Suffix is set of AffEntry objects | |
852 which store information about the prefix or suffix along | |
853 with supporting routines to check if a word has a particular | |
854 prefix or suffix or a combination. | |
855 | |
856 The structure affentry is defined as follows: | |
857 | |
858 struct affentry | |
859 { | |
860 unsigned short aflag; // ID used to represent the affix | |
861 char * strip; // string to strip before adding affix | |
862 char * appnd; // the affix string to add | |
863 unsigned char stripl; // length of the strip string | |
864 unsigned char appndl; // length of the affix string | |
865 char numconds; // the number of conditions that must be met | |
866 char opts; // flag: aeXPRODUCT- combine both prefix and suffix | |
867 char conds[SETSIZE]; // array which encodes the conditions to be met | |
868 }; | |
869 | |
870 | |
871 Here is a suffix borrowed from the en_US.aff file. This file | |
872 is whitespace delimited. | |
873 | |
874 SFX D Y 4 | |
875 SFX D 0 e d | |
876 SFX D y ied [^aeiou]y | |
877 SFX D 0 ed [^ey] | |
878 SFX D 0 ed [aeiou]y | |
879 | |
880 This information can be interpreted as follows: | |
881 | |
882 The first line has 4 fields: | |
883 | |
884 Field | |
885 ----- | |
886 1 SFX - indicates this is a suffix | |
887 2 D - is the name of the character flag which represents this suffix | |
888 3 Y - indicates it can be combined with prefixes (cross product) | |
889 4 4 - indicates that sequence of 4 affentry structures are needed to | |
890 properly store the affix information | |
891 | |
892 The remaining lines describe the unique information for the 4 SfxEntry | |
893 objects that make up this affix. Each line can be interpreted | |
894 as follows: (note fields 1 and 2 are as a check against line 1 info) | |
895 | |
896 Field | |
897 ----- | |
898 1 SFX - indicates this is a suffix | |
899 2 D - is the name of the character flag for this affix | |
900 3 y - the string of chars to strip off before adding affix | |
901 (a 0 here indicates the NULL string) | |
902 4 ied - the string of affix characters to add | |
903 5 [^aeiou]y - the conditions which must be met before the affix | |
904 can be applied | |
905 | |
906 Field 5 is interesting. Since this is a suffix, field 5 tells us that | |
907 there are 2 conditions that must be met. The first condition is that | |
908 the next to the last character in the word must *NOT* be any of the | |
909 following "a", "e", "i", "o" or "u". The second condition is that | |
910 the last character of the word must end in "y". | |
911 | |
912 So how can we encode this information concisely and be able to | |
913 test for both conditions in a fast manner? The answer is found | |
914 by studying the wonderful ispell code of Geoff Kuenning, et al. | |
915 (now available under a normal BSD license). | |
916 | |
917 If we set up a conds array of 256 bytes indexed (0 to 255) and access it | |
918 using a character (cast to an unsigned char) of a string, we have 8 bits | |
919 of information we can store about that character. Specifically we | |
920 could use each bit to say if that character is allowed in any of the | |
921 last (or first for prefixes) 8 characters of the word. | |
922 | |
923 Basically, each character at one end of the word (up to the number | |
924 of conditions) is used to index into the conds array and the resulting | |
925 value found there says whether that character is valid for a | |
926 specific character position in the word. | |
927 | |
928 For prefixes, it does this by setting bit 0 if that char is valid | |
929 in the first position, bit 1 if valid in the second position, and so on. | |
930 | |
931 If a bit is not set, then that char is not valid for that position in the | |
932 word. | |
933 | |
934 If working with suffixes bit 0 is used for the character closest | |
935 to the front, bit 1 for the next character towards the end, ..., | |
936 with bit numconds-1 representing the last char at the end of the string. | |
937 | |
938 Note: since entries in the conds[] are 8 bits, only 8 conditions | |
939 (read that only 8 character positions) can be examined at one | |
940 end of a word (the beginning for prefixes and the end for suffixes). | |
941 | |
942 So to make this clearer, let's encode the conds array values for the | |
943 first two affentries for the suffix D described earlier. | |
944 | |
945 | |
946 For the first affentry: | |
947 numconds = 1 (only examine the last character) | |
948 | |
949 conds['e'] = (1 << 0) (the word must end in an E) | |
950 all others are all 0 | |
951 | |
952 For the second affentry: | |
953 numconds = 2 (only examine the last two characters) | |
954 | |
955 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed) | |
956 where X is all characters *but* a, e, i, o, or u | |
957 | |
958 | |
959 conds['y'] = (1 << 1) (the last char must be a y) | |
960 all other bits for all other entries in the conds array are zero | |
961 | |
962 | |
963 #endif | |
964 | |
OLD | NEW |