OLD | NEW |
| (Empty) |
1 #include "license.hunspell" | |
2 #include "license.myspell" | |
3 | |
4 #include <stdlib.h> | |
5 #include <string.h> | |
6 #include <stdio.h> | |
7 | |
8 #include "hunspell.hxx" | |
9 #include "hunspell.h" | |
10 #ifndef HUNSPELL_CHROME_CLIENT | |
11 #ifndef MOZILLA_CLIENT | |
12 # include "config.h" | |
13 #endif | |
14 #endif | |
15 #include "csutil.hxx" | |
16 | |
17 #ifdef HUNSPELL_CHROME_CLIENT | |
18 Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length) | |
19 #else | |
20 Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key) | |
21 #endif | |
22 { | |
23 encoding = NULL; | |
24 csconv = NULL; | |
25 utf8 = 0; | |
26 complexprefixes = 0; | |
27 #ifndef HUNSPELL_CHROME_CLIENT | |
28 affixpath = mystrdup(affpath); | |
29 #endif | |
30 maxdic = 0; | |
31 | |
32 #ifdef HUNSPELL_CHROME_CLIENT | |
33 bdict_reader = new hunspell::BDictReader; | |
34 bdict_reader->Init(bdict_data, bdict_length); | |
35 | |
36 pHMgr[0] = new HashMgr(bdict_reader); | |
37 if (pHMgr[0]) maxdic = 1; | |
38 | |
39 pAMgr = new AffixMgr(bdict_reader, pHMgr, &maxdic); | |
40 #else | |
41 /* first set up the hash manager */ | |
42 pHMgr[0] = new HashMgr(dpath, affpath, key); | |
43 if (pHMgr[0]) maxdic = 1; | |
44 | |
45 /* next set up the affix manager */ | |
46 /* it needs access to the hash manager lookup methods */ | |
47 pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key); | |
48 #endif | |
49 | |
50 /* get the preferred try string and the dictionary */ | |
51 /* encoding from the Affix Manager for that dictionary */ | |
52 char * try_string = pAMgr->get_try_string(); | |
53 encoding = pAMgr->get_encoding(); | |
54 langnum = pAMgr->get_langnum(); | |
55 utf8 = pAMgr->get_utf8(); | |
56 if (!utf8) | |
57 csconv = get_current_cs(encoding); | |
58 complexprefixes = pAMgr->get_complexprefixes(); | |
59 wordbreak = pAMgr->get_breaktable(); | |
60 | |
61 /* and finally set up the suggestion manager */ | |
62 #ifdef HUNSPELL_CHROME_CLIENT | |
63 pSMgr = new SuggestMgr(bdict_reader, try_string, MAXSUGGESTION, pAMgr); | |
64 #else | |
65 pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); | |
66 #endif | |
67 if (try_string) free(try_string); | |
68 } | |
69 | |
70 Hunspell::~Hunspell() | |
71 { | |
72 if (pSMgr) delete pSMgr; | |
73 if (pAMgr) delete pAMgr; | |
74 for (int i = 0; i < maxdic; i++) delete pHMgr[i]; | |
75 maxdic = 0; | |
76 pSMgr = NULL; | |
77 pAMgr = NULL; | |
78 #ifdef MOZILLA_CLIENT | |
79 delete [] csconv; | |
80 #endif | |
81 csconv= NULL; | |
82 if (encoding) free(encoding); | |
83 encoding = NULL; | |
84 #ifdef HUNSPELL_CHROME_CLIENT | |
85 if (bdict_reader) delete bdict_reader; | |
86 bdict_reader = NULL; | |
87 #else | |
88 if (affixpath) free(affixpath); | |
89 affixpath = NULL; | |
90 #endif | |
91 } | |
92 | |
93 #ifndef HUNSPELL_CHROME_CLIENT | |
94 // load extra dictionaries | |
95 int Hunspell::add_dic(const char * dpath, const char * key) { | |
96 if (maxdic == MAXDIC || !affixpath) return 1; | |
97 pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); | |
98 if (pHMgr[maxdic]) maxdic++; else return 1; | |
99 return 0; | |
100 } | |
101 #endif | |
102 | |
103 // make a copy of src at destination while removing all leading | |
104 // blanks and removing any trailing periods after recording | |
105 // their presence with the abbreviation flag | |
106 // also since already going through character by character, | |
107 // set the capitalization type | |
108 // return the length of the "cleaned" (and UTF-8 encoded) word | |
109 | |
110 int Hunspell::cleanword2(char * dest, const char * src, | |
111 w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) | |
112 { | |
113 unsigned char * p = (unsigned char *) dest; | |
114 const unsigned char * q = (const unsigned char * ) src; | |
115 | |
116 // first skip over any leading blanks | |
117 while ((*q != '\0') && (*q == ' ')) q++; | |
118 | |
119 // now strip off any trailing periods (recording their presence) | |
120 *pabbrev = 0; | |
121 int nl = strlen((const char *)q); | |
122 while ((nl > 0) && (*(q+nl-1)=='.')) { | |
123 nl--; | |
124 (*pabbrev)++; | |
125 } | |
126 | |
127 // if no characters are left it can't be capitalized | |
128 if (nl <= 0) { | |
129 *pcaptype = NOCAP; | |
130 *p = '\0'; | |
131 return 0; | |
132 } | |
133 | |
134 strncpy(dest, (char *) q, nl); | |
135 *(dest + nl) = '\0'; | |
136 nl = strlen(dest); | |
137 if (utf8) { | |
138 *nc = u8_u16(dest_utf, MAXWORDLEN, dest); | |
139 // don't check too long words | |
140 if (*nc >= MAXWORDLEN) return 0; | |
141 if (*nc == -1) { // big Unicode character (non BMP area) | |
142 *pcaptype = NOCAP; | |
143 return nl; | |
144 } | |
145 *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); | |
146 } else { | |
147 *pcaptype = get_captype(dest, nl, csconv); | |
148 *nc = nl; | |
149 } | |
150 return nl; | |
151 } | |
152 | |
153 int Hunspell::cleanword(char * dest, const char * src, | |
154 int * pcaptype, int * pabbrev) | |
155 { | |
156 unsigned char * p = (unsigned char *) dest; | |
157 const unsigned char * q = (const unsigned char * ) src; | |
158 int firstcap = 0; | |
159 | |
160 // first skip over any leading blanks | |
161 while ((*q != '\0') && (*q == ' ')) q++; | |
162 | |
163 // now strip off any trailing periods (recording their presence) | |
164 *pabbrev = 0; | |
165 int nl = strlen((const char *)q); | |
166 while ((nl > 0) && (*(q+nl-1)=='.')) { | |
167 nl--; | |
168 (*pabbrev)++; | |
169 } | |
170 | |
171 // if no characters are left it can't be capitalized | |
172 if (nl <= 0) { | |
173 *pcaptype = NOCAP; | |
174 *p = '\0'; | |
175 return 0; | |
176 } | |
177 | |
178 // now determine the capitalization type of the first nl letters | |
179 int ncap = 0; | |
180 int nneutral = 0; | |
181 int nc = 0; | |
182 | |
183 if (!utf8) { | |
184 while (nl > 0) { | |
185 nc++; | |
186 if (csconv[(*q)].ccase) ncap++; | |
187 if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; | |
188 *p++ = *q++; | |
189 nl--; | |
190 } | |
191 // remember to terminate the destination string | |
192 *p = '\0'; | |
193 firstcap = csconv[(unsigned char)(*dest)].ccase; | |
194 } else { | |
195 unsigned short idx; | |
196 w_char t[MAXWORDLEN]; | |
197 nc = u8_u16(t, MAXWORDLEN, src); | |
198 for (int i = 0; i < nc; i++) { | |
199 idx = (t[i].h << 8) + t[i].l; | |
200 unsigned short low = unicodetolower(idx, langnum); | |
201 if (idx != low) ncap++; | |
202 if (unicodetoupper(idx, langnum) == low) nneutral++; | |
203 } | |
204 u16_u8(dest, MAXWORDUTF8LEN, t, nc); | |
205 if (ncap) { | |
206 idx = (t[0].h << 8) + t[0].l; | |
207 firstcap = (idx != unicodetolower(idx, langnum)); | |
208 } | |
209 } | |
210 | |
211 // now finally set the captype | |
212 if (ncap == 0) { | |
213 *pcaptype = NOCAP; | |
214 } else if ((ncap == 1) && firstcap) { | |
215 *pcaptype = INITCAP; | |
216 } else if ((ncap == nc) || ((ncap + nneutral) == nc)){ | |
217 *pcaptype = ALLCAP; | |
218 } else if ((ncap > 1) && firstcap) { | |
219 *pcaptype = HUHINITCAP; | |
220 } else { | |
221 *pcaptype = HUHCAP; | |
222 } | |
223 return strlen(dest); | |
224 } | |
225 | |
226 void Hunspell::mkallcap(char * p) | |
227 { | |
228 if (utf8) { | |
229 w_char u[MAXWORDLEN]; | |
230 int nc = u8_u16(u, MAXWORDLEN, p); | |
231 unsigned short idx; | |
232 for (int i = 0; i < nc; i++) { | |
233 idx = (u[i].h << 8) + u[i].l; | |
234 if (idx != unicodetoupper(idx, langnum)) { | |
235 u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); | |
236 u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); | |
237 } | |
238 } | |
239 u16_u8(p, MAXWORDUTF8LEN, u, nc); | |
240 } else { | |
241 while (*p != '\0') { | |
242 *p = csconv[((unsigned char) *p)].cupper; | |
243 p++; | |
244 } | |
245 } | |
246 } | |
247 | |
248 int Hunspell::mkallcap2(char * p, w_char * u, int nc) | |
249 { | |
250 if (utf8) { | |
251 unsigned short idx; | |
252 for (int i = 0; i < nc; i++) { | |
253 idx = (u[i].h << 8) + u[i].l; | |
254 unsigned short up = unicodetoupper(idx, langnum); | |
255 if (idx != up) { | |
256 u[i].h = (unsigned char) (up >> 8); | |
257 u[i].l = (unsigned char) (up & 0x00FF); | |
258 } | |
259 } | |
260 u16_u8(p, MAXWORDUTF8LEN, u, nc); | |
261 return strlen(p); | |
262 } else { | |
263 while (*p != '\0') { | |
264 *p = csconv[((unsigned char) *p)].cupper; | |
265 p++; | |
266 } | |
267 } | |
268 return nc; | |
269 } | |
270 | |
271 | |
272 void Hunspell::mkallsmall(char * p) | |
273 { | |
274 while (*p != '\0') { | |
275 *p = csconv[((unsigned char) *p)].clower; | |
276 p++; | |
277 } | |
278 } | |
279 | |
280 int Hunspell::mkallsmall2(char * p, w_char * u, int nc) | |
281 { | |
282 if (utf8) { | |
283 unsigned short idx; | |
284 for (int i = 0; i < nc; i++) { | |
285 idx = (u[i].h << 8) + u[i].l; | |
286 unsigned short low = unicodetolower(idx, langnum); | |
287 if (idx != low) { | |
288 u[i].h = (unsigned char) (low >> 8); | |
289 u[i].l = (unsigned char) (low & 0x00FF); | |
290 } | |
291 } | |
292 u16_u8(p, MAXWORDUTF8LEN, u, nc); | |
293 return strlen(p); | |
294 } else { | |
295 while (*p != '\0') { | |
296 *p = csconv[((unsigned char) *p)].clower; | |
297 p++; | |
298 } | |
299 } | |
300 return nc; | |
301 } | |
302 | |
303 // convert UTF-8 sharp S codes to latin 1 | |
304 char * Hunspell::sharps_u8_l1(char * dest, char * source) { | |
305 char * p = dest; | |
306 *p = *source; | |
307 for (p++, source++; *(source - 1); p++, source++) { | |
308 *p = *source; | |
309 if (*source == '\x9F') *--p = '\xDF'; | |
310 } | |
311 return dest; | |
312 } | |
313 | |
314 // recursive search for right ss - sharp s permutations | |
315 hentry * Hunspell::spellsharps(char * base, char * pos, int n, | |
316 int repnum, char * tmp, int * info, char **root) { | |
317 pos = strstr(pos, "ss"); | |
318 if (pos && (n < MAXSHARPS)) { | |
319 *pos = '\xC3'; | |
320 *(pos + 1) = '\x9F'; | |
321 hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, ro
ot); | |
322 if (h) return h; | |
323 *pos = 's'; | |
324 *(pos + 1) = 's'; | |
325 h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root); | |
326 if (h) return h; | |
327 } else if (repnum > 0) { | |
328 if (utf8) return checkword(base, info, root); | |
329 return checkword(sharps_u8_l1(tmp, base), info, root); | |
330 } | |
331 return NULL; | |
332 } | |
333 | |
334 int Hunspell::is_keepcase(const hentry * rv) { | |
335 return pAMgr && rv->astr && pAMgr->get_keepcase() && | |
336 TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); | |
337 } | |
338 | |
339 /* insert a word to the beginning of the suggestion array and return ns */ | |
340 int Hunspell::insert_sug(char ***slst, char * word, int ns) { | |
341 char * dup = mystrdup(word); | |
342 if (!dup) return ns; | |
343 if (ns == MAXSUGGESTION) { | |
344 ns--; | |
345 free((*slst)[ns]); | |
346 } | |
347 for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; | |
348 (*slst)[0] = dup; | |
349 return ns + 1; | |
350 } | |
351 | |
352 int Hunspell::spell(const char * word, int * info, char ** root) | |
353 { | |
354 #ifdef HUNSPELL_CHROME_CLIENT | |
355 if (pHMgr[0]) pHMgr[0]->EmptyHentryCache(); | |
356 #endif | |
357 struct hentry * rv=NULL; | |
358 // need larger vector. For example, Turkish capital letter I converted a | |
359 // 2-byte UTF-8 character (dotless i) by mkallsmall. | |
360 char cw[MAXWORDUTF8LEN]; | |
361 char wspace[MAXWORDUTF8LEN]; | |
362 w_char unicw[MAXWORDLEN]; | |
363 // Hunspell supports XML input of the simplified API (see manual) | |
364 if (strcmp(word, SPELL_XML) == 0) return 1; | |
365 int nc = strlen(word); | |
366 int wl2 = 0; | |
367 if (utf8) { | |
368 if (nc >= MAXWORDUTF8LEN) return 0; | |
369 } else { | |
370 if (nc >= MAXWORDLEN) return 0; | |
371 } | |
372 int captype = 0; | |
373 int abbv = 0; | |
374 int wl = 0; | |
375 | |
376 // input conversion | |
377 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; | |
378 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &cap
type, &abbv); | |
379 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); | |
380 | |
381 int info2 = 0; | |
382 if (wl == 0 || maxdic == 0) return 1; | |
383 if (root) *root = NULL; | |
384 | |
385 // allow numbers with dots, dashes and commas (but forbid double separators: "
..", "--" etc.) | |
386 enum { NBEGIN, NNUM, NSEP }; | |
387 int nstate = NBEGIN; | |
388 int i; | |
389 | |
390 for (i = 0; (i < wl); i++) { | |
391 if ((cw[i] <= '9') && (cw[i] >= '0')) { | |
392 nstate = NNUM; | |
393 } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) { | |
394 if ((nstate == NSEP) || (i == 0)) break; | |
395 nstate = NSEP; | |
396 } else break; | |
397 } | |
398 if ((i == wl) && (nstate == NNUM)) return 1; | |
399 if (!info) info = &info2; else *info = 0; | |
400 | |
401 switch(captype) { | |
402 case HUHCAP: | |
403 case HUHINITCAP: | |
404 *info += SPELL_ORIGCAP; | |
405 case NOCAP: { | |
406 rv = checkword(cw, info, root); | |
407 if ((abbv) && !(rv)) { | |
408 memcpy(wspace,cw,wl); | |
409 *(wspace+wl) = '.'; | |
410 *(wspace+wl+1) = '\0'; | |
411 rv = checkword(wspace, info, root); | |
412 } | |
413 break; | |
414 } | |
415 case ALLCAP: { | |
416 *info += SPELL_ORIGCAP; | |
417 rv = checkword(cw, info, root); | |
418 if (rv) break; | |
419 if (abbv) { | |
420 memcpy(wspace,cw,wl); | |
421 *(wspace+wl) = '.'; | |
422 *(wspace+wl+1) = '\0'; | |
423 rv = checkword(wspace, info, root); | |
424 if (rv) break; | |
425 } | |
426 // Spec. prefix handling for Catalan, French, Italian: | |
427 // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). | |
428 if (pAMgr && strchr(cw, '\'')) { | |
429 wl = mkallsmall2(cw, unicw, nc); | |
430 //There are no really sane circumstances where this could fail, | |
431 //but anyway... | |
432 if (char * apostrophe = strchr(cw, '\'')) { | |
433 if (utf8) { | |
434 w_char tmpword[MAXWORDLEN]; | |
435 *apostrophe = '\0'; | |
436 wl2 = u8_u16(tmpword, MAXWORDLEN, cw); | |
437 *apostrophe = '\''; | |
438 if (wl2 < nc) { | |
439 mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2
- 1); | |
440 rv = checkword(cw, info, root); | |
441 if (rv) break; | |
442 } | |
443 } else { | |
444 mkinitcap2(apostrophe + 1, unicw, nc); | |
445 rv = checkword(cw, info, root); | |
446 if (rv) break; | |
447 } | |
448 } | |
449 mkinitcap2(cw, unicw, nc); | |
450 rv = checkword(cw, info, root); | |
451 if (rv) break; | |
452 } | |
453 if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { | |
454 char tmpword[MAXWORDUTF8LEN]; | |
455 wl = mkallsmall2(cw, unicw, nc); | |
456 memcpy(wspace,cw,(wl+1)); | |
457 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); | |
458 if (!rv) { | |
459 wl2 = mkinitcap2(cw, unicw, nc); | |
460 rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); | |
461 } | |
462 if ((abbv) && !(rv)) { | |
463 *(wspace+wl) = '.'; | |
464 *(wspace+wl+1) = '\0'; | |
465 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); | |
466 if (!rv) { | |
467 memcpy(wspace, cw, wl2); | |
468 *(wspace+wl2) = '.'; | |
469 *(wspace+wl2+1) = '\0'; | |
470 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, ro
ot); | |
471 } | |
472 } | |
473 if (rv) break; | |
474 } | |
475 } | |
476 case INITCAP: { | |
477 *info += SPELL_ORIGCAP; | |
478 wl = mkallsmall2(cw, unicw, nc); | |
479 memcpy(wspace,cw,(wl+1)); | |
480 wl2 = mkinitcap2(cw, unicw, nc); | |
481 if (captype == INITCAP) *info += SPELL_INITCAP; | |
482 rv = checkword(cw, info, root); | |
483 if (captype == INITCAP) *info -= SPELL_INITCAP; | |
484 // forbid bad capitalization | |
485 // (for example, ijs -> Ijs instead of IJs in Dutch) | |
486 // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) | |
487 if (*info & SPELL_FORBIDDEN) { | |
488 rv = NULL; | |
489 break; | |
490 } | |
491 if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; | |
492 if (rv) break; | |
493 | |
494 rv = checkword(wspace, info, root); | |
495 if (abbv && !rv) { | |
496 | |
497 *(wspace+wl) = '.'; | |
498 *(wspace+wl+1) = '\0'; | |
499 rv = checkword(wspace, info, root); | |
500 if (!rv) { | |
501 memcpy(wspace, cw, wl2); | |
502 *(wspace+wl2) = '.'; | |
503 *(wspace+wl2+1) = '\0'; | |
504 if (captype == INITCAP) *info += SPELL_INITCAP; | |
505 rv = checkword(wspace, info, root); | |
506 if (captype == INITCAP) *info -= SPELL_INITCAP; | |
507 if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; | |
508 break; | |
509 } | |
510 } | |
511 if (rv && is_keepcase(rv) && | |
512 ((captype == ALLCAP) || | |
513 // if CHECKSHARPS: KEEPCASE words with \xDF are allowed | |
514 // in INITCAP form, too. | |
515 !(pAMgr->get_checksharps() && | |
516 ((utf8 && strstr(wspace, "\xC3\x9F")) || | |
517 (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL; | |
518 break; | |
519 } | |
520 } | |
521 | |
522 if (rv) { | |
523 if (pAMgr && pAMgr->get_warn() && rv->astr && | |
524 TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { | |
525 *info += SPELL_WARN; | |
526 if (pAMgr->get_forbidwarn()) return 0; | |
527 return HUNSPELL_OK_WARN; | |
528 } | |
529 return HUNSPELL_OK; | |
530 } | |
531 | |
532 // recursive breaking at break points | |
533 if (wordbreak) { | |
534 char * s; | |
535 char r; | |
536 int nbr = 0; | |
537 wl = strlen(cw); | |
538 int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; | |
539 | |
540 // calculate break points for recursion limit | |
541 for (int j = 0; j < numbreak; j++) { | |
542 s = cw; | |
543 do { | |
544 s = (char *) strstr(s, wordbreak[j]); | |
545 if (s) { | |
546 nbr++; | |
547 s++; | |
548 } | |
549 } while (s); | |
550 } | |
551 if (nbr >= 10) return 0; | |
552 | |
553 // check boundary patterns (^begin and end$) | |
554 for (int j = 0; j < numbreak; j++) { | |
555 int plen = strlen(wordbreak[j]); | |
556 if (plen == 1 || plen > wl) continue; | |
557 if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 | |
558 && spell(cw + plen - 1)) return 1; | |
559 if (wordbreak[j][plen - 1] == '$' && | |
560 strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { | |
561 r = cw[wl - plen + 1]; | |
562 cw[wl - plen + 1] = '\0'; | |
563 if (spell(cw)) return 1; | |
564 cw[wl - plen + 1] = r; | |
565 } | |
566 } | |
567 | |
568 // other patterns | |
569 for (int j = 0; j < numbreak; j++) { | |
570 int plen = strlen(wordbreak[j]); | |
571 s=(char *) strstr(cw, wordbreak[j]); | |
572 if (s && (s > cw) && (s < cw + wl - plen)) { | |
573 if (!spell(s + plen)) continue; | |
574 r = *s; | |
575 *s = '\0'; | |
576 // examine 2 sides of the break point | |
577 if (spell(cw)) return 1; | |
578 *s = r; | |
579 | |
580 // LANG_hu: spec. dash rule | |
581 if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { | |
582 r = s[1]; | |
583 s[1] = '\0'; | |
584 if (spell(cw)) return 1; // check the first part with dash | |
585 s[1] = r; | |
586 } | |
587 // end of LANG speficic region | |
588 | |
589 } | |
590 } | |
591 } | |
592 | |
593 return 0; | |
594 } | |
595 | |
596 struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) | |
597 { | |
598 struct hentry * he = NULL; | |
599 int len, i; | |
600 char w2[MAXWORDUTF8LEN]; | |
601 const char * word; | |
602 | |
603 char * ignoredchars = pAMgr->get_ignore(); | |
604 if (ignoredchars != NULL) { | |
605 strcpy(w2, w); | |
606 if (utf8) { | |
607 int ignoredchars_utf16_len; | |
608 unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredch
ars_utf16_len); | |
609 remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len)
; | |
610 } else { | |
611 remove_ignored_chars(w2,ignoredchars); | |
612 } | |
613 word = w2; | |
614 } else word = w; | |
615 | |
616 len = strlen(word); | |
617 | |
618 if (!len) | |
619 return NULL; | |
620 | |
621 #ifdef HUNSPELL_CHROME_CLIENT | |
622 // We need to check if the word length is valid to make coverity (Event | |
623 // fixed_size_dest: Possible overrun of N byte fixed size buffer) happy. | |
624 if ((utf8 && strlen(word) >= MAXWORDUTF8LEN) || (!utf8 && strlen(word) >= MAXW
ORDLEN)) | |
625 return NULL; | |
626 #endif | |
627 | |
628 // word reversing wrapper for complex prefixes | |
629 if (complexprefixes) { | |
630 if (word != w2) { | |
631 strcpy(w2, word); | |
632 word = w2; | |
633 } | |
634 if (utf8) reverseword_utf(w2); else reverseword(w2); | |
635 } | |
636 | |
637 // look word in hash table | |
638 for (i = 0; (i < maxdic) && !he; i ++) { | |
639 he = (pHMgr[i])->lookup(word); | |
640 | |
641 // check forbidden and onlyincompound words | |
642 if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenwor
d(), he->alen)) { | |
643 if (info) *info += SPELL_FORBIDDEN; | |
644 // LANG_hu section: set dash information for suggestions | |
645 if (langnum == LANG_hu) { | |
646 if (pAMgr->get_compoundflag() && | |
647 TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { | |
648 if (info) *info += SPELL_COMPOUND; | |
649 } | |
650 } | |
651 return NULL; | |
652 } | |
653 | |
654 // he = next not needaffix, onlyincompound homonym or onlyupcase word | |
655 while (he && (he->astr) && | |
656 ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->al
en)) || | |
657 (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompou
nd(), he->alen)) || | |
658 (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he-
>alen)) | |
659 )) he = he->next_homonym; | |
660 } | |
661 | |
662 // check with affixes | |
663 if (!he && pAMgr) { | |
664 // try stripping off affixes */ | |
665 he = pAMgr->affix_check(word, len, 0); | |
666 | |
667 // check compound restriction and onlyupcase | |
668 if (he && he->astr && ( | |
669 (pAMgr->get_onlyincompound() && | |
670 TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || | |
671 (info && (*info & SPELL_INITCAP) && | |
672 TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { | |
673 he = NULL; | |
674 } | |
675 | |
676 if (he) { | |
677 if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(
), he->alen)) { | |
678 if (info) *info += SPELL_FORBIDDEN; | |
679 return NULL; | |
680 } | |
681 if (root) { | |
682 *root = mystrdup(he->word); | |
683 if (*root && complexprefixes) { | |
684 if (utf8) reverseword_utf(*root); else reverseword(*root); | |
685 } | |
686 } | |
687 // try check compound word | |
688 } else if (pAMgr->get_compound()) { | |
689 he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info); | |
690 // LANG_hu section: `moving rule' with last dash | |
691 if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) { | |
692 char * dup = mystrdup(word); | |
693 if (!dup) return NULL; | |
694 dup[len-1] = '\0'; | |
695 he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, i
nfo); | |
696 free(dup); | |
697 } | |
698 // end of LANG speficic region | |
699 if (he) { | |
700 if (root) { | |
701 *root = mystrdup(he->word); | |
702 if (*root && complexprefixes) { | |
703 if (utf8) reverseword_utf(*root); else reverseword(*root
); | |
704 } | |
705 } | |
706 if (info) *info += SPELL_COMPOUND; | |
707 } | |
708 } | |
709 | |
710 } | |
711 | |
712 return he; | |
713 } | |
714 | |
715 int Hunspell::suggest(char*** slst, const char * word) | |
716 { | |
717 #ifdef HUNSPELL_CHROME_CLIENT | |
718 if (pHMgr[0]) pHMgr[0]->EmptyHentryCache(); | |
719 #endif | |
720 int onlycmpdsug = 0; | |
721 char cw[MAXWORDUTF8LEN]; | |
722 char wspace[MAXWORDUTF8LEN]; | |
723 if (!pSMgr || maxdic == 0) return 0; | |
724 w_char unicw[MAXWORDLEN]; | |
725 *slst = NULL; | |
726 // process XML input of the simplified API (see manual) | |
727 if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { | |
728 return spellml(slst, word); | |
729 } | |
730 int nc = strlen(word); | |
731 if (utf8) { | |
732 if (nc >= MAXWORDUTF8LEN) return 0; | |
733 } else { | |
734 if (nc >= MAXWORDLEN) return 0; | |
735 } | |
736 int captype = 0; | |
737 int abbv = 0; | |
738 int wl = 0; | |
739 | |
740 // input conversion | |
741 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; | |
742 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &cap
type, &abbv); | |
743 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); | |
744 | |
745 if (wl == 0) return 0; | |
746 int ns = 0; | |
747 int capwords = 0; | |
748 | |
749 // check capitalized form for FORCEUCASE | |
750 if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { | |
751 int info = SPELL_ORIGCAP; | |
752 char ** wlst; | |
753 if (checkword(cw, &info, NULL)) { | |
754 if (*slst) { | |
755 wlst = *slst; | |
756 } else { | |
757 wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *)); | |
758 if (wlst == NULL) return -1; | |
759 *slst = wlst; | |
760 for (int i = 0; i < MAXSUGGESTION; i++) { | |
761 wlst[i] = NULL; | |
762 } | |
763 } | |
764 wlst[0] = mystrdup(cw); | |
765 mkinitcap(wlst[0]); | |
766 return 1; | |
767 } | |
768 } | |
769 | |
770 switch(captype) { | |
771 case NOCAP: { | |
772 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); | |
773 break; | |
774 } | |
775 | |
776 case INITCAP: { | |
777 capwords = 1; | |
778 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); | |
779 if (ns == -1) break; | |
780 memcpy(wspace,cw,(wl+1)); | |
781 mkallsmall2(wspace, unicw, nc); | |
782 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); | |
783 break; | |
784 } | |
785 case HUHINITCAP: | |
786 capwords = 1; | |
787 case HUHCAP: { | |
788 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); | |
789 if (ns != -1) { | |
790 int prevns; | |
791 // something.The -> something. The | |
792 char * dot = strchr(cw, '.'); | |
793 if (dot && (dot > cw)) { | |
794 int captype_; | |
795 if (utf8) { | |
796 w_char w_[MAXWORDLEN]; | |
797 int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); | |
798 captype_ = get_captype_utf8(w_, wl_, langnum); | |
799 } else captype_ = get_captype(dot+1, strlen(dot+1),
csconv); | |
800 if (captype_ == INITCAP) { | |
801 char * st = mystrdup(cw); | |
802 if (st) st = (char *) realloc(st, wl + 2); | |
803 if (st) { | |
804 st[(dot - cw) + 1] = ' '; | |
805 strcpy(st + (dot - cw) + 2, dot + 1); | |
806 ns = insert_sug(slst, st, ns); | |
807 free(st); | |
808 } | |
809 } | |
810 } | |
811 if (captype == HUHINITCAP) { | |
812 // TheOpenOffice.org -> The OpenOffice.org | |
813 memcpy(wspace,cw,(wl+1)); | |
814 mkinitsmall2(wspace, unicw, nc); | |
815 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); | |
816 } | |
817 memcpy(wspace,cw,(wl+1)); | |
818 mkallsmall2(wspace, unicw, nc); | |
819 if (spell(wspace)) ns = insert_sug(slst, wspace, ns); | |
820 prevns = ns; | |
821 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); | |
822 if (captype == HUHINITCAP) { | |
823 mkinitcap2(wspace, unicw, nc); | |
824 if (spell(wspace)) ns = insert_sug(slst, wspace, ns)
; | |
825 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); | |
826 } | |
827 // aNew -> "a New" (instead of "a new") | |
828 for (int j = prevns; j < ns; j++) { | |
829 char * space = strchr((*slst)[j],' '); | |
830 if (space) { | |
831 int slen = strlen(space + 1); | |
832 // different case after space (need capitalisati
on) | |
833 if ((slen < wl) && strcmp(cw + wl - slen, space
+ 1)) { | |
834 w_char w[MAXWORDLEN]; | |
835 int wc = 0; | |
836 char * r = (*slst)[j]; | |
837 if (utf8) wc = u8_u16(w, MAXWORDLEN, space +
1); | |
838 mkinitcap2(space + 1, w, wc); | |
839 // set as first suggestion | |
840 for (int k = j; k > 0; k--) (*slst)[k] = (*s
lst)[k - 1]; | |
841 (*slst)[0] = r; | |
842 } | |
843 } | |
844 } | |
845 } | |
846 break; | |
847 } | |
848 | |
849 case ALLCAP: { | |
850 memcpy(wspace, cw, (wl+1)); | |
851 mkallsmall2(wspace, unicw, nc); | |
852 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); | |
853 if (ns == -1) break; | |
854 if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) | |
855 ns = insert_sug(slst, wspace, ns); | |
856 mkinitcap2(wspace, unicw, nc); | |
857 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); | |
858 for (int j=0; j < ns; j++) { | |
859 mkallcap((*slst)[j]); | |
860 if (pAMgr && pAMgr->get_checksharps()) { | |
861 char * pos; | |
862 if (utf8) { | |
863 pos = strstr((*slst)[j], "\xC3\x9F"); | |
864 while (pos) { | |
865 *pos = 'S'; | |
866 *(pos+1) = 'S'; | |
867 pos = strstr(pos+2, "\xC3\x9F"); | |
868 } | |
869 } else { | |
870 pos = strchr((*slst)[j], '\xDF'); | |
871 while (pos) { | |
872 (*slst)[j] = (char *) realloc((*slst)[j], st
rlen((*slst)[j]) + 2); | |
873 mystrrep((*slst)[j], "\xDF", "SS"); | |
874 pos = strchr((*slst)[j], '\xDF'); | |
875 } | |
876 } | |
877 } | |
878 } | |
879 break; | |
880 } | |
881 } | |
882 | |
883 // LANG_hu section: replace '-' with ' ' in Hungarian | |
884 if (langnum == LANG_hu) { | |
885 for (int j=0; j < ns; j++) { | |
886 char * pos = strchr((*slst)[j],'-'); | |
887 if (pos) { | |
888 int info; | |
889 char w[MAXWORDUTF8LEN]; | |
890 *pos = '\0'; | |
891 strcpy(w, (*slst)[j]); | |
892 strcat(w, pos + 1); | |
893 spell(w, &info, NULL); | |
894 if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { | |
895 *pos = ' '; | |
896 } else *pos = '-'; | |
897 } | |
898 } | |
899 } | |
900 // END OF LANG_hu section | |
901 | |
902 // try ngram approach since found nothing or only compound words | |
903 if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (
*slst)) { | |
904 switch(captype) { | |
905 case NOCAP: { | |
906 ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); | |
907 break; | |
908 } | |
909 case HUHINITCAP: | |
910 capwords = 1; | |
911 case HUHCAP: { | |
912 memcpy(wspace,cw,(wl+1)); | |
913 mkallsmall2(wspace, unicw, nc); | |
914 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); | |
915 break; | |
916 } | |
917 case INITCAP: { | |
918 capwords = 1; | |
919 memcpy(wspace,cw,(wl+1)); | |
920 mkallsmall2(wspace, unicw, nc); | |
921 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); | |
922 break; | |
923 } | |
924 case ALLCAP: { | |
925 memcpy(wspace,cw,(wl+1)); | |
926 mkallsmall2(wspace, unicw, nc); | |
927 int oldns = ns; | |
928 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); | |
929 for (int j = oldns; j < ns; j++) | |
930 mkallcap((*slst)[j]); | |
931 break; | |
932 } | |
933 } | |
934 } | |
935 | |
936 // try dash suggestion (Afo-American -> Afro-American) | |
937 if (char * pos = strchr(cw, '-')) { | |
938 char * ppos = cw; | |
939 int nodashsug = 1; | |
940 char ** nlst = NULL; | |
941 int nn = 0; | |
942 int last = 0; | |
943 if (*slst) { | |
944 for (int j = 0; j < ns && nodashsug == 1; j++) { | |
945 if (strchr((*slst)[j], '-')) nodashsug = 0; | |
946 } | |
947 } | |
948 while (nodashsug && !last) { | |
949 if (*pos == '\0') last = 1; else *pos = '\0'; | |
950 if (!spell(ppos)) { | |
951 nn = suggest(&nlst, ppos); | |
952 for (int j = nn - 1; j >= 0; j--) { | |
953 strncpy(wspace, cw, ppos - cw); | |
954 strcpy(wspace + (ppos - cw), nlst[j]); | |
955 if (!last) { | |
956 strcat(wspace, "-"); | |
957 strcat(wspace, pos + 1); | |
958 } | |
959 ns = insert_sug(slst, wspace, ns); | |
960 free(nlst[j]); | |
961 } | |
962 if (nlst != NULL) free(nlst); | |
963 nodashsug = 0; | |
964 } | |
965 if (!last) { | |
966 *pos = '-'; | |
967 ppos = pos + 1; | |
968 pos = strchr(ppos, '-'); | |
969 } | |
970 if (!pos) pos = cw + strlen(cw); | |
971 } | |
972 } | |
973 | |
974 // word reversing wrapper for complex prefixes | |
975 if (complexprefixes) { | |
976 for (int j = 0; j < ns; j++) { | |
977 if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); | |
978 } | |
979 } | |
980 | |
981 // capitalize | |
982 if (capwords) for (int j=0; j < ns; j++) { | |
983 mkinitcap((*slst)[j]); | |
984 } | |
985 | |
986 // expand suggestions with dot(s) | |
987 if (abbv && pAMgr && pAMgr->get_sugswithdots()) { | |
988 for (int j = 0; j < ns; j++) { | |
989 (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); | |
990 strcat((*slst)[j], word + strlen(word) - abbv); | |
991 } | |
992 } | |
993 | |
994 // remove bad capitalized and forbidden forms | |
995 if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { | |
996 switch (captype) { | |
997 case INITCAP: | |
998 case ALLCAP: { | |
999 int l = 0; | |
1000 for (int j=0; j < ns; j++) { | |
1001 if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) { | |
1002 char s[MAXSWUTF8L]; | |
1003 w_char w[MAXSWL]; | |
1004 int len; | |
1005 if (utf8) { | |
1006 len = u8_u16(w, MAXSWL, (*slst)[j]); | |
1007 } else { | |
1008 strcpy(s, (*slst)[j]); | |
1009 len = strlen(s); | |
1010 } | |
1011 mkallsmall2(s, w, len); | |
1012 free((*slst)[j]); | |
1013 if (spell(s)) { | |
1014 (*slst)[l] = mystrdup(s); | |
1015 if ((*slst)[l]) l++; | |
1016 } else { | |
1017 mkinitcap2(s, w, len); | |
1018 if (spell(s)) { | |
1019 (*slst)[l] = mystrdup(s); | |
1020 if ((*slst)[l]) l++; | |
1021 } | |
1022 } | |
1023 } else { | |
1024 (*slst)[l] = (*slst)[j]; | |
1025 l++; | |
1026 } | |
1027 } | |
1028 ns = l; | |
1029 } | |
1030 } | |
1031 } | |
1032 | |
1033 // remove duplications | |
1034 int l = 0; | |
1035 for (int j = 0; j < ns; j++) { | |
1036 (*slst)[l] = (*slst)[j]; | |
1037 for (int k = 0; k < l; k++) { | |
1038 if (strcmp((*slst)[k], (*slst)[j]) == 0) { | |
1039 free((*slst)[j]); | |
1040 l--; | |
1041 break; | |
1042 } | |
1043 } | |
1044 l++; | |
1045 } | |
1046 ns = l; | |
1047 | |
1048 // output conversion | |
1049 rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; | |
1050 for (int j = 0; rl && j < ns; j++) { | |
1051 if (rl->conv((*slst)[j], wspace)) { | |
1052 free((*slst)[j]); | |
1053 (*slst)[j] = mystrdup(wspace); | |
1054 } | |
1055 } | |
1056 | |
1057 // if suggestions removed by nosuggest, onlyincompound parameters | |
1058 if (l == 0 && *slst) { | |
1059 free(*slst); | |
1060 *slst = NULL; | |
1061 } | |
1062 return l; | |
1063 } | |
1064 | |
1065 void Hunspell::free_list(char *** slst, int n) { | |
1066 freelist(slst, n); | |
1067 } | |
1068 | |
1069 char * Hunspell::get_dic_encoding() | |
1070 { | |
1071 return encoding; | |
1072 } | |
1073 | |
1074 #ifdef HUNSPELL_EXPERIMENTAL | |
1075 // XXX need UTF-8 support | |
1076 int Hunspell::suggest_auto(char*** slst, const char * word) | |
1077 { | |
1078 char cw[MAXWORDUTF8LEN]; | |
1079 char wspace[MAXWORDUTF8LEN]; | |
1080 if (!pSMgr || maxdic == 0) return 0; | |
1081 int wl = strlen(word); | |
1082 if (utf8) { | |
1083 if (wl >= MAXWORDUTF8LEN) return 0; | |
1084 } else { | |
1085 if (wl >= MAXWORDLEN) return 0; | |
1086 } | |
1087 int captype = 0; | |
1088 int abbv = 0; | |
1089 wl = cleanword(cw, word, &captype, &abbv); | |
1090 if (wl == 0) return 0; | |
1091 int ns = 0; | |
1092 *slst = NULL; // HU, nsug in pSMgr->suggest | |
1093 | |
1094 switch(captype) { | |
1095 case NOCAP: { | |
1096 ns = pSMgr->suggest_auto(slst, cw, ns); | |
1097 if (ns>0) break; | |
1098 break; | |
1099 } | |
1100 | |
1101 case INITCAP: { | |
1102 memcpy(wspace,cw,(wl+1)); | |
1103 mkallsmall(wspace); | |
1104 ns = pSMgr->suggest_auto(slst, wspace, ns); | |
1105 for (int j=0; j < ns; j++) | |
1106 mkinitcap((*slst)[j]); | |
1107 ns = pSMgr->suggest_auto(slst, cw, ns); | |
1108 break; | |
1109 | |
1110 } | |
1111 | |
1112 case HUHINITCAP: | |
1113 case HUHCAP: { | |
1114 ns = pSMgr->suggest_auto(slst, cw, ns); | |
1115 if (ns == 0) { | |
1116 memcpy(wspace,cw,(wl+1)); | |
1117 mkallsmall(wspace); | |
1118 ns = pSMgr->suggest_auto(slst, wspace, ns); | |
1119 } | |
1120 break; | |
1121 } | |
1122 | |
1123 case ALLCAP: { | |
1124 memcpy(wspace,cw,(wl+1)); | |
1125 mkallsmall(wspace); | |
1126 ns = pSMgr->suggest_auto(slst, wspace, ns); | |
1127 | |
1128 mkinitcap(wspace); | |
1129 ns = pSMgr->suggest_auto(slst, wspace, ns); | |
1130 | |
1131 for (int j=0; j < ns; j++) | |
1132 mkallcap((*slst)[j]); | |
1133 break; | |
1134 } | |
1135 } | |
1136 | |
1137 // word reversing wrapper for complex prefixes | |
1138 if (complexprefixes) { | |
1139 for (int j = 0; j < ns; j++) { | |
1140 if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); | |
1141 } | |
1142 } | |
1143 | |
1144 // expand suggestions with dot(s) | |
1145 if (abbv && pAMgr && pAMgr->get_sugswithdots()) { | |
1146 for (int j = 0; j < ns; j++) { | |
1147 (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); | |
1148 strcat((*slst)[j], word + strlen(word) - abbv); | |
1149 } | |
1150 } | |
1151 | |
1152 // LANG_hu section: replace '-' with ' ' in Hungarian | |
1153 if (langnum == LANG_hu) { | |
1154 for (int j=0; j < ns; j++) { | |
1155 char * pos = strchr((*slst)[j],'-'); | |
1156 if (pos) { | |
1157 int info; | |
1158 char w[MAXWORDUTF8LEN]; | |
1159 *pos = '\0'; | |
1160 strcpy(w, (*slst)[j]); | |
1161 strcat(w, pos + 1); | |
1162 spell(w, &info, NULL); | |
1163 if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { | |
1164 *pos = ' '; | |
1165 } else *pos = '-'; | |
1166 } | |
1167 } | |
1168 } | |
1169 // END OF LANG_hu section | |
1170 return ns; | |
1171 } | |
1172 #endif | |
1173 | |
1174 int Hunspell::stem(char*** slst, char ** desc, int n) | |
1175 { | |
1176 char result[MAXLNLEN]; | |
1177 char result2[MAXLNLEN]; | |
1178 *slst = NULL; | |
1179 if (n == 0) return 0; | |
1180 *result2 = '\0'; | |
1181 for (int i = 0; i < n; i++) { | |
1182 *result = '\0'; | |
1183 // add compound word parts (except the last one) | |
1184 char * s = (char *) desc[i]; | |
1185 char * part = strstr(s, MORPH_PART); | |
1186 if (part) { | |
1187 char * nextpart = strstr(part + 1, MORPH_PART); | |
1188 while (nextpart) { | |
1189 copy_field(result + strlen(result), part, MORPH_PART); | |
1190 part = nextpart; | |
1191 nextpart = strstr(part + 1, MORPH_PART); | |
1192 } | |
1193 s = part; | |
1194 } | |
1195 | |
1196 char **pl; | |
1197 char tok[MAXLNLEN]; | |
1198 strcpy(tok, s); | |
1199 char * alt = strstr(tok, " | "); | |
1200 while (alt) { | |
1201 alt[1] = MSEP_ALT; | |
1202 alt = strstr(alt, " | "); | |
1203 } | |
1204 int pln = line_tok(tok, &pl, MSEP_ALT); | |
1205 for (int k = 0; k < pln; k++) { | |
1206 // add derivational suffixes | |
1207 if (strstr(pl[k], MORPH_DERI_SFX)) { | |
1208 // remove inflectional suffixes | |
1209 char * is = strstr(pl[k], MORPH_INFL_SFX); | |
1210 if (is) *is = '\0'; | |
1211 char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); | |
1212 if (sg) { | |
1213 char ** gen; | |
1214 int genl = line_tok(sg, &gen, MSEP_REC); | |
1215 free(sg); | |
1216 for (int j = 0; j < genl; j++) { | |
1217 sprintf(result2 + strlen(result2), "%c%s%s", | |
1218 MSEP_REC, result, gen[j]); | |
1219 } | |
1220 freelist(&gen, genl); | |
1221 } | |
1222 } else { | |
1223 sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); | |
1224 if (strstr(pl[k], MORPH_SURF_PFX)) { | |
1225 copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); | |
1226 } | |
1227 copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); | |
1228 } | |
1229 } | |
1230 freelist(&pl, pln); | |
1231 } | |
1232 int sln = line_tok(result2, slst, MSEP_REC); | |
1233 return uniqlist(*slst, sln); | |
1234 | |
1235 } | |
1236 | |
1237 int Hunspell::stem(char*** slst, const char * word) | |
1238 { | |
1239 char ** pl; | |
1240 int pln = analyze(&pl, word); | |
1241 int pln2 = stem(slst, pl, pln); | |
1242 freelist(&pl, pln); | |
1243 return pln2; | |
1244 } | |
1245 | |
1246 #ifdef HUNSPELL_EXPERIMENTAL | |
1247 int Hunspell::suggest_pos_stems(char*** slst, const char * word) | |
1248 { | |
1249 char cw[MAXWORDUTF8LEN]; | |
1250 char wspace[MAXWORDUTF8LEN]; | |
1251 if (! pSMgr || maxdic == 0) return 0; | |
1252 int wl = strlen(word); | |
1253 if (utf8) { | |
1254 if (wl >= MAXWORDUTF8LEN) return 0; | |
1255 } else { | |
1256 if (wl >= MAXWORDLEN) return 0; | |
1257 } | |
1258 int captype = 0; | |
1259 int abbv = 0; | |
1260 wl = cleanword(cw, word, &captype, &abbv); | |
1261 if (wl == 0) return 0; | |
1262 | |
1263 int ns = 0; // ns=0 = normalized input | |
1264 | |
1265 *slst = NULL; // HU, nsug in pSMgr->suggest | |
1266 | |
1267 switch(captype) { | |
1268 case HUHCAP: | |
1269 case NOCAP: { | |
1270 ns = pSMgr->suggest_pos_stems(slst, cw, ns); | |
1271 | |
1272 if ((abbv) && (ns == 0)) { | |
1273 memcpy(wspace,cw,wl); | |
1274 *(wspace+wl) = '.'; | |
1275 *(wspace+wl+1) = '\0'; | |
1276 ns = pSMgr->suggest_pos_stems(slst, wspace, ns); | |
1277 } | |
1278 | |
1279 break; | |
1280 } | |
1281 | |
1282 case INITCAP: { | |
1283 | |
1284 ns = pSMgr->suggest_pos_stems(slst, cw, ns); | |
1285 | |
1286 if (ns == 0 || ((*slst)[0][0] == '#')) { | |
1287 memcpy(wspace,cw,(wl+1)); | |
1288 mkallsmall(wspace); | |
1289 ns = pSMgr->suggest_pos_stems(slst, wspace, ns); | |
1290 } | |
1291 | |
1292 break; | |
1293 | |
1294 } | |
1295 | |
1296 case ALLCAP: { | |
1297 ns = pSMgr->suggest_pos_stems(slst, cw, ns); | |
1298 if (ns != 0) break; | |
1299 | |
1300 memcpy(wspace,cw,(wl+1)); | |
1301 mkallsmall(wspace); | |
1302 ns = pSMgr->suggest_pos_stems(slst, wspace, ns); | |
1303 | |
1304 if (ns == 0) { | |
1305 mkinitcap(wspace); | |
1306 ns = pSMgr->suggest_pos_stems(slst, wspace, ns); | |
1307 } | |
1308 break; | |
1309 } | |
1310 } | |
1311 | |
1312 return ns; | |
1313 } | |
1314 #endif // END OF HUNSPELL_EXPERIMENTAL CODE | |
1315 | |
1316 const char * Hunspell::get_wordchars() | |
1317 { | |
1318 return pAMgr->get_wordchars(); | |
1319 } | |
1320 | |
1321 unsigned short * Hunspell::get_wordchars_utf16(int * len) | |
1322 { | |
1323 return pAMgr->get_wordchars_utf16(len); | |
1324 } | |
1325 | |
1326 void Hunspell::mkinitcap(char * p) | |
1327 { | |
1328 if (!utf8) { | |
1329 if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; | |
1330 } else { | |
1331 int len; | |
1332 w_char u[MAXWORDLEN]; | |
1333 len = u8_u16(u, MAXWORDLEN, p); | |
1334 unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); | |
1335 u[0].h = (unsigned char) (i >> 8); | |
1336 u[0].l = (unsigned char) (i & 0x00FF); | |
1337 u16_u8(p, MAXWORDUTF8LEN, u, len); | |
1338 } | |
1339 } | |
1340 | |
1341 int Hunspell::mkinitcap2(char * p, w_char * u, int nc) | |
1342 { | |
1343 if (!utf8) { | |
1344 if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; | |
1345 } else if (nc > 0) { | |
1346 unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); | |
1347 u[0].h = (unsigned char) (i >> 8); | |
1348 u[0].l = (unsigned char) (i & 0x00FF); | |
1349 u16_u8(p, MAXWORDUTF8LEN, u, nc); | |
1350 return strlen(p); | |
1351 } | |
1352 return nc; | |
1353 } | |
1354 | |
1355 int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) | |
1356 { | |
1357 if (!utf8) { | |
1358 if (*p != '\0') *p = csconv[((unsigned char)*p)].clower; | |
1359 } else if (nc > 0) { | |
1360 unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum); | |
1361 u[0].h = (unsigned char) (i >> 8); | |
1362 u[0].l = (unsigned char) (i & 0x00FF); | |
1363 u16_u8(p, MAXWORDUTF8LEN, u, nc); | |
1364 return strlen(p); | |
1365 } | |
1366 return nc; | |
1367 } | |
1368 | |
1369 int Hunspell::add(const char * word) | |
1370 { | |
1371 if (pHMgr[0]) return (pHMgr[0])->add(word); | |
1372 return 0; | |
1373 } | |
1374 | |
1375 int Hunspell::add_with_affix(const char * word, const char * example) | |
1376 { | |
1377 if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); | |
1378 return 0; | |
1379 } | |
1380 | |
1381 int Hunspell::remove(const char * word) | |
1382 { | |
1383 if (pHMgr[0]) return (pHMgr[0])->remove(word); | |
1384 return 0; | |
1385 } | |
1386 | |
1387 const char * Hunspell::get_version() | |
1388 { | |
1389 return pAMgr->get_version(); | |
1390 } | |
1391 | |
1392 struct cs_info * Hunspell::get_csconv() | |
1393 { | |
1394 return csconv; | |
1395 } | |
1396 | |
1397 void Hunspell::cat_result(char * result, char * st) | |
1398 { | |
1399 if (st) { | |
1400 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1401 mystrcat(result, st, MAXLNLEN); | |
1402 free(st); | |
1403 } | |
1404 } | |
1405 | |
1406 int Hunspell::analyze(char*** slst, const char * word) | |
1407 { | |
1408 char cw[MAXWORDUTF8LEN]; | |
1409 char wspace[MAXWORDUTF8LEN]; | |
1410 w_char unicw[MAXWORDLEN]; | |
1411 int wl2 = 0; | |
1412 *slst = NULL; | |
1413 if (! pSMgr || maxdic == 0) return 0; | |
1414 int nc = strlen(word); | |
1415 if (utf8) { | |
1416 if (nc >= MAXWORDUTF8LEN) return 0; | |
1417 } else { | |
1418 if (nc >= MAXWORDLEN) return 0; | |
1419 } | |
1420 int captype = 0; | |
1421 int abbv = 0; | |
1422 int wl = 0; | |
1423 | |
1424 // input conversion | |
1425 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; | |
1426 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &cap
type, &abbv); | |
1427 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); | |
1428 | |
1429 if (wl == 0) { | |
1430 if (abbv) { | |
1431 for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; | |
1432 cw[wl] = '\0'; | |
1433 abbv = 0; | |
1434 } else return 0; | |
1435 } | |
1436 | |
1437 char result[MAXLNLEN]; | |
1438 char * st = NULL; | |
1439 | |
1440 *result = '\0'; | |
1441 | |
1442 int n = 0; | |
1443 int n2 = 0; | |
1444 int n3 = 0; | |
1445 | |
1446 // test numbers | |
1447 // LANG_hu section: set dash information for suggestions | |
1448 if (langnum == LANG_hu) { | |
1449 while ((n < wl) && | |
1450 (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ','
)) && (n > 0)))) { | |
1451 n++; | |
1452 if ((cw[n] == '.') || (cw[n] == ',')) { | |
1453 if (((n2 == 0) && (n > 3)) || | |
1454 ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) br
eak; | |
1455 n2++; | |
1456 n3 = n; | |
1457 } | |
1458 } | |
1459 | |
1460 if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; | |
1461 if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n
, NULL, NULL))) { | |
1462 mystrcat(result, cw, MAXLNLEN); | |
1463 result[n - 1] = '\0'; | |
1464 if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); | |
1465 else { | |
1466 char sign = cw[n]; | |
1467 cw[n] = '\0'; | |
1468 cat_result(result, pSMgr->suggest_morph(cw + n - 1)); | |
1469 mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE | |
1470 cw[n] = sign; | |
1471 cat_result(result, pSMgr->suggest_morph(cw + n)); | |
1472 } | |
1473 return line_tok(result, slst, MSEP_REC); | |
1474 } | |
1475 } | |
1476 // END OF LANG_hu section | |
1477 | |
1478 switch(captype) { | |
1479 case HUHCAP: | |
1480 case HUHINITCAP: | |
1481 case NOCAP: { | |
1482 cat_result(result, pSMgr->suggest_morph(cw)); | |
1483 if (abbv) { | |
1484 memcpy(wspace,cw,wl); | |
1485 *(wspace+wl) = '.'; | |
1486 *(wspace+wl+1) = '\0'; | |
1487 cat_result(result, pSMgr->suggest_morph(wspace)); | |
1488 } | |
1489 break; | |
1490 } | |
1491 case INITCAP: { | |
1492 wl = mkallsmall2(cw, unicw, nc); | |
1493 memcpy(wspace,cw,(wl+1)); | |
1494 wl2 = mkinitcap2(cw, unicw, nc); | |
1495 cat_result(result, pSMgr->suggest_morph(wspace)); | |
1496 cat_result(result, pSMgr->suggest_morph(cw)); | |
1497 if (abbv) { | |
1498 *(wspace+wl) = '.'; | |
1499 *(wspace+wl+1) = '\0'; | |
1500 cat_result(result, pSMgr->suggest_morph(wspace)); | |
1501 | |
1502 memcpy(wspace, cw, wl2); | |
1503 *(wspace+wl2) = '.'; | |
1504 *(wspace+wl2+1) = '\0'; | |
1505 | |
1506 cat_result(result, pSMgr->suggest_morph(wspace)); | |
1507 } | |
1508 break; | |
1509 } | |
1510 case ALLCAP: { | |
1511 cat_result(result, pSMgr->suggest_morph(cw)); | |
1512 if (abbv) { | |
1513 memcpy(wspace,cw,wl); | |
1514 *(wspace+wl) = '.'; | |
1515 *(wspace+wl+1) = '\0'; | |
1516 cat_result(result, pSMgr->suggest_morph(cw)); | |
1517 } | |
1518 wl = mkallsmall2(cw, unicw, nc); | |
1519 memcpy(wspace,cw,(wl+1)); | |
1520 wl2 = mkinitcap2(cw, unicw, nc); | |
1521 | |
1522 cat_result(result, pSMgr->suggest_morph(wspace)); | |
1523 cat_result(result, pSMgr->suggest_morph(cw)); | |
1524 if (abbv) { | |
1525 *(wspace+wl) = '.'; | |
1526 *(wspace+wl+1) = '\0'; | |
1527 cat_result(result, pSMgr->suggest_morph(wspace)); | |
1528 | |
1529 memcpy(wspace, cw, wl2); | |
1530 *(wspace+wl2) = '.'; | |
1531 *(wspace+wl2+1) = '\0'; | |
1532 | |
1533 cat_result(result, pSMgr->suggest_morph(wspace)); | |
1534 } | |
1535 break; | |
1536 } | |
1537 } | |
1538 | |
1539 if (*result) { | |
1540 // word reversing wrapper for complex prefixes | |
1541 if (complexprefixes) { | |
1542 if (utf8) reverseword_utf(result); else reverseword(result); | |
1543 } | |
1544 return line_tok(result, slst, MSEP_REC); | |
1545 } | |
1546 | |
1547 // compound word with dash (HU) I18n | |
1548 char * dash = NULL; | |
1549 int nresult = 0; | |
1550 // LANG_hu section: set dash information for suggestions | |
1551 if (langnum == LANG_hu) dash = (char *) strchr(cw,'-'); | |
1552 if ((langnum == LANG_hu) && dash) { | |
1553 *dash='\0'; | |
1554 // examine 2 sides of the dash | |
1555 if (dash[1] == '\0') { // base word ending with dash | |
1556 if (spell(cw)) { | |
1557 char * p = pSMgr->suggest_morph(cw); | |
1558 if (p) { | |
1559 int ret = line_tok(p, slst, MSEP_REC); | |
1560 free(p); | |
1561 return ret; | |
1562 } | |
1563 | |
1564 } | |
1565 } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. | |
1566 if (spell(cw) && (spell("-e"))) { | |
1567 st = pSMgr->suggest_morph(cw); | |
1568 if (st) { | |
1569 mystrcat(result, st, MAXLNLEN); | |
1570 free(st); | |
1571 } | |
1572 mystrcat(result,"+", MAXLNLEN); // XXX spec. separator i
n MORPHCODE | |
1573 st = pSMgr->suggest_morph("-e"); | |
1574 if (st) { | |
1575 mystrcat(result, st, MAXLNLEN); | |
1576 free(st); | |
1577 } | |
1578 return line_tok(result, slst, MSEP_REC); | |
1579 } | |
1580 } else { | |
1581 // first word ending with dash: word- XXX ??? | |
1582 char r2 = *(dash + 1); | |
1583 dash[0]='-'; | |
1584 dash[1]='\0'; | |
1585 nresult = spell(cw); | |
1586 dash[1] = r2; | |
1587 dash[0]='\0'; | |
1588 if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || | |
1589 ((dash[1] > '0') && (dash[1] < '9')))) { | |
1590 st = pSMgr->suggest_morph(cw); | |
1591 if (st) { | |
1592 mystrcat(result, st, MAXLNLEN); | |
1593 free(st); | |
1594 mystrcat(result,"+", MAXLNLEN); // XXX spec. sep
arator in MORPHCODE | |
1595 } | |
1596 st = pSMgr->suggest_morph(dash+1); | |
1597 if (st) { | |
1598 mystrcat(result, st, MAXLNLEN); | |
1599 free(st); | |
1600 } | |
1601 return line_tok(result, slst, MSEP_REC); | |
1602 } | |
1603 } | |
1604 // affixed number in correct word | |
1605 if (nresult && (dash > cw) && (((*(dash-1)<='9') && | |
1606 (*(dash-1)>='0')) || (*(dash-1)=='.'))) { | |
1607 *dash='-'; | |
1608 n = 1; | |
1609 if (*(dash - n) == '.') n++; | |
1610 // search first not a number character to left from dash | |
1611 while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6))
{ | |
1612 n++; | |
1613 } | |
1614 if ((dash - n) < cw) n--; | |
1615 // numbers: valami1000000-hoz | |
1616 // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz, | |
1617 // 56-hoz, 6-hoz | |
1618 for(; n >= 1; n--) { | |
1619 if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash -
n, NULL, NULL)) { | |
1620 mystrcat(result, cw, MAXLNLEN); | |
1621 result[dash - cw - n] = '\0'; | |
1622 st = pSMgr->suggest_morph(dash - n); | |
1623 if (st) { | |
1624 mystrcat(result, st, MAXLNLEN); | |
1625 free(st); | |
1626 } | |
1627 return line_tok(result, slst, MSEP_REC); | |
1628 } | |
1629 } | |
1630 } | |
1631 } | |
1632 return 0; | |
1633 } | |
1634 | |
1635 int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) | |
1636 { | |
1637 *slst = NULL; | |
1638 if (!pSMgr || !pln) return 0; | |
1639 char **pl2; | |
1640 int pl2n = analyze(&pl2, word); | |
1641 int captype = 0; | |
1642 int abbv = 0; | |
1643 char cw[MAXWORDUTF8LEN]; | |
1644 cleanword(cw, word, &captype, &abbv); | |
1645 char result[MAXLNLEN]; | |
1646 *result = '\0'; | |
1647 | |
1648 for (int i = 0; i < pln; i++) { | |
1649 cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); | |
1650 } | |
1651 freelist(&pl2, pl2n); | |
1652 | |
1653 if (*result) { | |
1654 // allcap | |
1655 if (captype == ALLCAP) mkallcap(result); | |
1656 | |
1657 // line split | |
1658 int linenum = line_tok(result, slst, MSEP_REC); | |
1659 | |
1660 // capitalize | |
1661 if (captype == INITCAP || captype == HUHINITCAP) { | |
1662 for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); | |
1663 } | |
1664 | |
1665 // temporary filtering of prefix related errors (eg. | |
1666 // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") | |
1667 | |
1668 int r = 0; | |
1669 for (int j=0; j < linenum; j++) { | |
1670 if (!spell((*slst)[j])) { | |
1671 free((*slst)[j]); | |
1672 (*slst)[j] = NULL; | |
1673 } else { | |
1674 if (r < j) (*slst)[r] = (*slst)[j]; | |
1675 r++; | |
1676 } | |
1677 } | |
1678 if (r > 0) return r; | |
1679 free(*slst); | |
1680 *slst = NULL; | |
1681 } | |
1682 return 0; | |
1683 } | |
1684 | |
1685 int Hunspell::generate(char*** slst, const char * word, const char * pattern) | |
1686 { | |
1687 char **pl; | |
1688 int pln = analyze(&pl, pattern); | |
1689 int n = generate(slst, word, pl, pln); | |
1690 freelist(&pl, pln); | |
1691 return uniqlist(*slst, n); | |
1692 } | |
1693 | |
1694 // minimal XML parser functions | |
1695 int Hunspell::get_xml_par(char * dest, const char * par, int max) | |
1696 { | |
1697 char * d = dest; | |
1698 if (!par) return 0; | |
1699 char end = *par; | |
1700 char * dmax = dest + max; | |
1701 if (end == '>') end = '<'; | |
1702 else if (end != '\'' && end != '"') return 0; // bad XML | |
1703 for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par; | |
1704 *d = '\0'; | |
1705 mystrrep(dest, "<", "<"); | |
1706 mystrrep(dest, "&", "&"); | |
1707 return (int)(d - dest); | |
1708 } | |
1709 | |
1710 int Hunspell::get_langnum() const | |
1711 { | |
1712 return langnum; | |
1713 } | |
1714 | |
1715 // return the beginning of the element (attr == NULL) or the attribute | |
1716 const char * Hunspell::get_xml_pos(const char * s, const char * attr) | |
1717 { | |
1718 const char * end = strchr(s, '>'); | |
1719 const char * p = s; | |
1720 if (attr == NULL) return end; | |
1721 do { | |
1722 p = strstr(p, attr); | |
1723 if (!p || p >= end) return 0; | |
1724 } while (*(p-1) != ' ' && *(p-1) != '\n'); | |
1725 return p + strlen(attr); | |
1726 } | |
1727 | |
1728 int Hunspell::check_xml_par(const char * q, const char * attr, const char * valu
e) { | |
1729 char cw[MAXWORDUTF8LEN]; | |
1730 if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && | |
1731 strcmp(cw, value) == 0) return 1; | |
1732 return 0; | |
1733 } | |
1734 | |
1735 int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) { | |
1736 int n = 0; | |
1737 char * p; | |
1738 if (!list) return 0; | |
1739 for (p = list; ((p = strstr(p, tag)) != NULL); p++) n++; | |
1740 if (n == 0) return 0; | |
1741 *slst = (char **) malloc(sizeof(char *) * n); | |
1742 if (!*slst) return 0; | |
1743 for (p = list, n = 0; ((p = strstr(p, tag)) != NULL); p++, n++) { | |
1744 int l = strlen(p); | |
1745 (*slst)[n] = (char *) malloc(l + 1); | |
1746 if (!(*slst)[n]) return n; | |
1747 if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) { | |
1748 free((*slst)[n]); | |
1749 break; | |
1750 } | |
1751 } | |
1752 return n; | |
1753 } | |
1754 | |
1755 int Hunspell::spellml(char*** slst, const char * word) | |
1756 { | |
1757 char *q, *q2; | |
1758 char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; | |
1759 q = (char *) strstr(word, "<query"); | |
1760 if (!q) return 0; // bad XML input | |
1761 q2 = strchr(q, '>'); | |
1762 if (!q2) return 0; // bad XML input | |
1763 q2 = strstr(q2, "<word"); | |
1764 if (!q2) return 0; // bad XML input | |
1765 if (check_xml_par(q, "type=", "analyze")) { | |
1766 int n = 0, s = 0; | |
1767 if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) n = analyze(sls
t, cw); | |
1768 if (n == 0) return 0; | |
1769 // convert the result to <code><a>ana1</a><a>ana2</a></code> format | |
1770 for (int i = 0; i < n; i++) s+= strlen((*slst)[i]); | |
1771 char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->&
; | |
1772 if (!r) return 0; | |
1773 strcpy(r, "<code>"); | |
1774 for (int i = 0; i < n; i++) { | |
1775 int l = strlen(r); | |
1776 strcpy(r + l, "<a>"); | |
1777 strcpy(r + l + 3, (*slst)[i]); | |
1778 mystrrep(r + l + 3, "\t", " "); | |
1779 mystrrep(r + l + 3, "<", "<"); | |
1780 mystrrep(r + l + 3, "&", "&"); | |
1781 strcat(r, "</a>"); | |
1782 free((*slst)[i]); | |
1783 } | |
1784 strcat(r, "</code>"); | |
1785 (*slst)[0] = r; | |
1786 return 1; | |
1787 } else if (check_xml_par(q, "type=", "stem")) { | |
1788 if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) return stem(slst
, cw); | |
1789 } else if (check_xml_par(q, "type=", "generate")) { | |
1790 int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1); | |
1791 if (n == 0) return 0; | |
1792 char * q3 = strstr(q2 + 1, "<word"); | |
1793 if (q3) { | |
1794 if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) { | |
1795 return generate(slst, cw, cw2); | |
1796 } | |
1797 } else { | |
1798 if ((q2 = strstr(q2 + 1, "<code")) != NULL) { | |
1799 char ** slst2; | |
1800 if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>")) != 0) { | |
1801 int n2 = generate(slst, cw, slst2, n); | |
1802 freelist(&slst2, n); | |
1803 return uniqlist(*slst, n2); | |
1804 } | |
1805 freelist(&slst2, n); | |
1806 } | |
1807 } | |
1808 } | |
1809 return 0; | |
1810 } | |
1811 | |
1812 | |
1813 #ifdef HUNSPELL_EXPERIMENTAL | |
1814 // XXX need UTF-8 support | |
1815 char * Hunspell::morph_with_correction(const char * word) | |
1816 { | |
1817 char cw[MAXWORDUTF8LEN]; | |
1818 char wspace[MAXWORDUTF8LEN]; | |
1819 if (! pSMgr || maxdic == 0) return NULL; | |
1820 int wl = strlen(word); | |
1821 if (utf8) { | |
1822 if (wl >= MAXWORDUTF8LEN) return NULL; | |
1823 } else { | |
1824 if (wl >= MAXWORDLEN) return NULL; | |
1825 } | |
1826 int captype = 0; | |
1827 int abbv = 0; | |
1828 wl = cleanword(cw, word, &captype, &abbv); | |
1829 if (wl == 0) return NULL; | |
1830 | |
1831 char result[MAXLNLEN]; | |
1832 char * st = NULL; | |
1833 | |
1834 *result = '\0'; | |
1835 | |
1836 | |
1837 switch(captype) { | |
1838 case NOCAP: { | |
1839 st = pSMgr->suggest_morph_for_spelling_error(cw); | |
1840 if (st) { | |
1841 mystrcat(result, st, MAXLNLEN); | |
1842 free(st); | |
1843 } | |
1844 if (abbv) { | |
1845 memcpy(wspace,cw,wl); | |
1846 *(wspace+wl) = '.'; | |
1847 *(wspace+wl+1) = '\0'; | |
1848 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1849 if (st) { | |
1850 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1851 mystrcat(result, st, MAXLNLEN); | |
1852 free(st); | |
1853 } | |
1854 } | |
1855 break; | |
1856 } | |
1857 case INITCAP: { | |
1858 memcpy(wspace,cw,(wl+1)); | |
1859 mkallsmall(wspace); | |
1860 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1861 if (st) { | |
1862 mystrcat(result, st, MAXLNLEN); | |
1863 free(st); | |
1864 } | |
1865 st = pSMgr->suggest_morph_for_spelling_error(cw); | |
1866 if (st) { | |
1867 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1868 mystrcat(result, st, MAXLNLEN); | |
1869 free(st); | |
1870 } | |
1871 if (abbv) { | |
1872 memcpy(wspace,cw,wl); | |
1873 *(wspace+wl) = '.'; | |
1874 *(wspace+wl+1) = '\0'; | |
1875 mkallsmall(wspace); | |
1876 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1877 if (st) { | |
1878 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1879 mystrcat(result, st, MAXLNLEN); | |
1880 free(st); | |
1881 } | |
1882 mkinitcap(wspace); | |
1883 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1884 if (st) { | |
1885 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1886 mystrcat(result, st, MAXLNLEN); | |
1887 free(st); | |
1888 } | |
1889 } | |
1890 break; | |
1891 } | |
1892 case HUHCAP: { | |
1893 st = pSMgr->suggest_morph_for_spelling_error(cw); | |
1894 if (st) { | |
1895 mystrcat(result, st, MAXLNLEN); | |
1896 free(st); | |
1897 } | |
1898 memcpy(wspace,cw,(wl+1)); | |
1899 mkallsmall(wspace); | |
1900 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1901 if (st) { | |
1902 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1903 mystrcat(result, st, MAXLNLEN); | |
1904 free(st); | |
1905 } | |
1906 break; | |
1907 } | |
1908 case ALLCAP: { | |
1909 memcpy(wspace,cw,(wl+1)); | |
1910 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1911 if (st) { | |
1912 mystrcat(result, st, MAXLNLEN); | |
1913 free(st); | |
1914 } | |
1915 mkallsmall(wspace); | |
1916 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1917 if (st) { | |
1918 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1919 mystrcat(result, st, MAXLNLEN); | |
1920 free(st); | |
1921 } | |
1922 mkinitcap(wspace); | |
1923 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1924 if (st) { | |
1925 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1926 mystrcat(result, st, MAXLNLEN); | |
1927 free(st); | |
1928 } | |
1929 if (abbv) { | |
1930 memcpy(wspace,cw,(wl+1)); | |
1931 *(wspace+wl) = '.'; | |
1932 *(wspace+wl+1) = '\0'; | |
1933 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1934 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1935 if (st) { | |
1936 mystrcat(result, st, MAXLNLEN); | |
1937 free(st); | |
1938 } | |
1939 mkallsmall(wspace); | |
1940 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1941 if (st) { | |
1942 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1943 mystrcat(result, st, MAXLNLEN); | |
1944 free(st); | |
1945 } | |
1946 mkinitcap(wspace); | |
1947 st = pSMgr->suggest_morph_for_spelling_error(wspace); | |
1948 if (st) { | |
1949 if (*result) mystrcat(result, "\n", MAXLNLEN); | |
1950 mystrcat(result, st, MAXLNLEN); | |
1951 free(st); | |
1952 } | |
1953 } | |
1954 break; | |
1955 } | |
1956 } | |
1957 | |
1958 if (*result) return mystrdup(result); | |
1959 return NULL; | |
1960 } | |
1961 | |
1962 #endif // END OF HUNSPELL_EXPERIMENTAL CODE | |
1963 | |
1964 Hunhandle *Hunspell_create(const char * affpath, const char * dpath) | |
1965 { | |
1966 #ifdef HUNSPELL_CHROME_CLIENT | |
1967 return NULL; | |
1968 #else | |
1969 return (Hunhandle*)(new Hunspell(affpath, dpath)); | |
1970 #endif | |
1971 } | |
1972 | |
1973 Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, | |
1974 const char * key) | |
1975 { | |
1976 #ifdef HUNSPELL_CHROME_CLIENT | |
1977 return NULL; | |
1978 #else | |
1979 return (Hunhandle*)(new Hunspell(affpath, dpath, key)); | |
1980 #endif | |
1981 } | |
1982 | |
1983 void Hunspell_destroy(Hunhandle *pHunspell) | |
1984 { | |
1985 delete (Hunspell*)(pHunspell); | |
1986 } | |
1987 | |
1988 int Hunspell_spell(Hunhandle *pHunspell, const char *word) | |
1989 { | |
1990 return ((Hunspell*)pHunspell)->spell(word); | |
1991 } | |
1992 | |
1993 char *Hunspell_get_dic_encoding(Hunhandle *pHunspell) | |
1994 { | |
1995 return ((Hunspell*)pHunspell)->get_dic_encoding(); | |
1996 } | |
1997 | |
1998 int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word) | |
1999 { | |
2000 return ((Hunspell*)pHunspell)->suggest(slst, word); | |
2001 } | |
2002 | |
2003 int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) | |
2004 { | |
2005 return ((Hunspell*)pHunspell)->analyze(slst, word); | |
2006 } | |
2007 | |
2008 int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) | |
2009 { | |
2010 return ((Hunspell*)pHunspell)->stem(slst, word); | |
2011 } | |
2012 | |
2013 int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n) | |
2014 { | |
2015 return ((Hunspell*)pHunspell)->stem(slst, desc, n); | |
2016 } | |
2017 | |
2018 int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, | |
2019 const char * word2) | |
2020 { | |
2021 return ((Hunspell*)pHunspell)->generate(slst, word, word2); | |
2022 } | |
2023 | |
2024 int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, | |
2025 char** desc, int n) | |
2026 { | |
2027 return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); | |
2028 } | |
2029 | |
2030 /* functions for run-time modification of the dictionary */ | |
2031 | |
2032 /* add word to the run-time dictionary */ | |
2033 | |
2034 int Hunspell_add(Hunhandle *pHunspell, const char * word) { | |
2035 return ((Hunspell*)pHunspell)->add(word); | |
2036 } | |
2037 | |
2038 /* add word to the run-time dictionary with affix flags of | |
2039 * the example (a dictionary word): Hunspell will recognize | |
2040 * affixed forms of the new word, too. | |
2041 */ | |
2042 | |
2043 int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, | |
2044 const char * example) { | |
2045 return ((Hunspell*)pHunspell)->add_with_affix(word, example); | |
2046 } | |
2047 | |
2048 /* remove word from the run-time dictionary */ | |
2049 | |
2050 int Hunspell_remove(Hunhandle *pHunspell, const char * word) { | |
2051 return ((Hunspell*)pHunspell)->remove(word); | |
2052 } | |
2053 | |
2054 void Hunspell_free_list(Hunhandle *, char *** slst, int n) { | |
2055 freelist(slst, n); | |
2056 } | |
OLD | NEW |