OLD | NEW |
| (Empty) |
1 #include "license.hunspell" | |
2 #include "license.myspell" | |
3 | |
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
8 | |
9 #include "hashmgr.hxx" | |
10 #include "csutil.hxx" | |
11 #include "atypes.hxx" | |
12 | |
13 // build a hash table from a munched word list | |
14 | |
15 #ifdef HUNSPELL_CHROME_CLIENT | |
16 HashMgr::HashMgr(hunspell::BDictReader* reader) | |
17 { | |
18 bdict_reader = reader; | |
19 #else | |
20 HashMgr::HashMgr(const char * tpath, const char * apath, const char * key) | |
21 { | |
22 #endif | |
23 tablesize = 0; | |
24 tableptr = NULL; | |
25 flag_mode = FLAG_CHAR; | |
26 complexprefixes = 0; | |
27 utf8 = 0; | |
28 langnum = 0; | |
29 lang = NULL; | |
30 enc = NULL; | |
31 csconv = 0; | |
32 ignorechars = NULL; | |
33 ignorechars_utf16 = NULL; | |
34 ignorechars_utf16_len = 0; | |
35 numaliasf = 0; | |
36 aliasf = NULL; | |
37 numaliasm = 0; | |
38 aliasm = NULL; | |
39 forbiddenword = FORBIDDENWORD; // forbidden word signing flag | |
40 #ifdef HUNSPELL_CHROME_CLIENT | |
41 // No tables to load, just the AF lines. | |
42 load_config(NULL, NULL); | |
43 int ec = LoadAFLines(); | |
44 #else | |
45 load_config(apath, key); | |
46 int ec = load_tables(tpath, key); | |
47 #endif | |
48 if (ec) { | |
49 /* error condition - what should we do here */ | |
50 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec); | |
51 if (tableptr) { | |
52 free(tableptr); | |
53 tableptr = NULL; | |
54 } | |
55 tablesize = 0; | |
56 } | |
57 } | |
58 | |
59 | |
60 HashMgr::~HashMgr() | |
61 { | |
62 if (tableptr) { | |
63 // now pass through hash table freeing up everything | |
64 // go through column by column of the table | |
65 for (int i=0; i < tablesize; i++) { | |
66 struct hentry * pt = tableptr[i]; | |
67 struct hentry * nt = NULL; | |
68 while(pt) { | |
69 nt = pt->next; | |
70 if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))
) free(pt->astr); | |
71 free(pt); | |
72 pt = nt; | |
73 } | |
74 } | |
75 free(tableptr); | |
76 } | |
77 tablesize = 0; | |
78 | |
79 if (aliasf) { | |
80 for (int j = 0; j < (numaliasf); j++) free(aliasf[j]); | |
81 free(aliasf); | |
82 aliasf = NULL; | |
83 if (aliasflen) { | |
84 free(aliasflen); | |
85 aliasflen = NULL; | |
86 } | |
87 } | |
88 if (aliasm) { | |
89 for (int j = 0; j < (numaliasm); j++) free(aliasm[j]); | |
90 free(aliasm); | |
91 aliasm = NULL; | |
92 } | |
93 | |
94 #ifndef OPENOFFICEORG | |
95 #ifndef MOZILLA_CLIENT | |
96 if (utf8) free_utf_tbl(); | |
97 #endif | |
98 #endif | |
99 | |
100 if (enc) free(enc); | |
101 if (lang) free(lang); | |
102 | |
103 if (ignorechars) free(ignorechars); | |
104 if (ignorechars_utf16) free(ignorechars_utf16); | |
105 | |
106 #ifdef HUNSPELL_CHROME_CLIENT | |
107 EmptyHentryCache(); | |
108 for (std::vector<std::string*>::iterator it = pointer_to_strings_.begin(); | |
109 it != pointer_to_strings_.end(); ++it) { | |
110 delete *it; | |
111 } | |
112 #endif | |
113 #ifdef MOZILLA_CLIENT | |
114 delete [] csconv; | |
115 #endif | |
116 } | |
117 | |
#ifdef HUNSPELL_CHROME_CLIENT
// Deletes every entry held in hentry_cache, following each entry's
// next_homonym chain, and then clears the cache container.
void HashMgr::EmptyHentryCache() {
  HEntryCache::iterator slot = hentry_cache.begin();
  while (slot != hentry_cache.end()) {
    // Fetch the successor before the node is deleted.
    for (hentry* node = slot->second; node != NULL; ) {
      hentry* following = node->next_homonym;
      DeleteHashEntry(node);
      node = following;
    }
    ++slot;
  }
  hentry_cache.clear();
}
#endif
134 | |
135 // lookup a root word in the hashtable | |
136 | |
137 struct hentry * HashMgr::lookup(const char *word) const | |
138 { | |
139 #ifdef HUNSPELL_CHROME_CLIENT | |
140 int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD]; | |
141 int affix_count = bdict_reader->FindWord(word, affix_ids); | |
142 if (affix_count == 0) { // look for custom added word | |
143 std::map<base::StringPiece, int>::const_iterator iter = | |
144 custom_word_to_affix_id_map_.find(word); | |
145 if (iter != custom_word_to_affix_id_map_.end()) { | |
146 affix_count = 1; | |
147 affix_ids[0] = iter->second; | |
148 } | |
149 } | |
150 | |
151 static const int kMaxWordLen = 128; | |
152 static char word_buf[kMaxWordLen]; | |
153 // To take account of null-termination, we use upto 127. | |
154 strncpy(word_buf, word, kMaxWordLen - 1); | |
155 | |
156 return AffixIDsToHentry(word_buf, affix_ids, affix_count); | |
157 #else | |
158 struct hentry * dp; | |
159 if (tableptr) { | |
160 dp = tableptr[hash(word)]; | |
161 if (!dp) return NULL; | |
162 for ( ; dp != NULL; dp = dp->next) { | |
163 if (strcmp(word, dp->word) == 0) return dp; | |
164 } | |
165 } | |
166 return NULL; | |
167 #endif | |
168 } | |
169 | |
170 // add a word to the hash table (private) | |
171 int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, | |
172 int al, const char * desc, bool onlyupcase) | |
173 { | |
174 #ifndef HUNSPELL_CHROME_CLIENT | |
175 bool upcasehomonym = false; | |
176 int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0; | |
177 // variable-length hash record with word and optional fields | |
178 struct hentry* hp = | |
179 (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl); | |
180 if (!hp) return 1; | |
181 char * hpw = hp->word; | |
182 strcpy(hpw, word); | |
183 if (ignorechars != NULL) { | |
184 if (utf8) { | |
185 remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len); | |
186 } else { | |
187 remove_ignored_chars(hpw, ignorechars); | |
188 } | |
189 } | |
190 if (complexprefixes) { | |
191 if (utf8) reverseword_utf(hpw); else reverseword(hpw); | |
192 } | |
193 | |
194 int i = hash(hpw); | |
195 | |
196 hp->blen = (unsigned char) wbl; | |
197 hp->clen = (unsigned char) wcl; | |
198 hp->alen = (short) al; | |
199 hp->astr = aff; | |
200 hp->next = NULL; | |
201 hp->next_homonym = NULL; | |
202 | |
203 // store the description string or its pointer | |
204 if (desc) { | |
205 hp->var = H_OPT; | |
206 if (aliasm) { | |
207 hp->var += H_OPT_ALIASM; | |
208 store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc))); | |
209 } else { | |
210 strcpy(hpw + wbl + 1, desc); | |
211 if (complexprefixes) { | |
212 if (utf8) reverseword_utf(HENTRY_DATA(hp)); | |
213 else reverseword(HENTRY_DATA(hp)); | |
214 } | |
215 } | |
216 if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON; | |
217 } else hp->var = 0; | |
218 | |
219 struct hentry * dp = tableptr[i]; | |
220 if (!dp) { | |
221 tableptr[i] = hp; | |
222 return 0; | |
223 } | |
224 while (dp->next != NULL) { | |
225 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) { | |
226 // remove hidden onlyupcase homonym | |
227 if (!onlyupcase) { | |
228 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { | |
229 free(dp->astr); | |
230 dp->astr = hp->astr; | |
231 dp->alen = hp->alen; | |
232 free(hp); | |
233 return 0; | |
234 } else { | |
235 dp->next_homonym = hp; | |
236 } | |
237 } else { | |
238 upcasehomonym = true; | |
239 } | |
240 } | |
241 dp=dp->next; | |
242 } | |
243 if (strcmp(hp->word, dp->word) == 0) { | |
244 // remove hidden onlyupcase homonym | |
245 if (!onlyupcase) { | |
246 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { | |
247 free(dp->astr); | |
248 dp->astr = hp->astr; | |
249 dp->alen = hp->alen; | |
250 free(hp); | |
251 return 0; | |
252 } else { | |
253 dp->next_homonym = hp; | |
254 } | |
255 } else { | |
256 upcasehomonym = true; | |
257 } | |
258 } | |
259 if (!upcasehomonym) { | |
260 dp->next = hp; | |
261 } else { | |
262 // remove hidden onlyupcase homonym | |
263 if (hp->astr) free(hp->astr); | |
264 free(hp); | |
265 } | |
266 #else | |
267 std::map<base::StringPiece, int>::iterator iter = | |
268 custom_word_to_affix_id_map_.find(word); | |
269 if(iter == custom_word_to_affix_id_map_.end()) { // word needs to be added | |
270 std::string* new_string_word = new std::string(word); | |
271 pointer_to_strings_.push_back(new_string_word); | |
272 base::StringPiece sp(*(new_string_word)); | |
273 custom_word_to_affix_id_map_[sp] = 0; // no affixes for custom words | |
274 return 1; | |
275 } | |
276 #endif | |
277 return 0; | |
278 } | |
279 | |
280 int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl, | |
281 unsigned short * flags, int al, char * dp, int captype) | |
282 { | |
283 // add inner capitalized forms to handle the following allcap forms: | |
284 // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG | |
285 // Allcaps with suffixes: CIA's -> CIA'S | |
286 if (((captype == HUHCAP) || (captype == HUHINITCAP) || | |
287 ((captype == ALLCAP) && (flags != NULL))) && | |
288 !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) { | |
289 unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned s
hort) * (al+1)); | |
290 if (!flags2) return 1; | |
291 if (al) memcpy(flags2, flags, al * sizeof(unsigned short)); | |
292 flags2[al] = ONLYUPCASEFLAG; | |
293 if (utf8) { | |
294 char st[BUFSIZE]; | |
295 w_char w[BUFSIZE]; | |
296 int wlen = u8_u16(w, BUFSIZE, word); | |
297 mkallsmall_utf(w, wlen, langnum); | |
298 mkallcap_utf(w, 1, langnum); | |
299 u16_u8(st, BUFSIZE, w, wlen); | |
300 return add_word(st,wbl,wcl,flags2,al+1,dp, true); | |
301 } else { | |
302 mkallsmall(word, csconv); | |
303 mkinitcap(word, csconv); | |
304 return add_word(word,wbl,wcl,flags2,al+1,dp, true); | |
305 } | |
306 } | |
307 return 0; | |
308 } | |
309 | |
310 // detect captype and modify word length for UTF-8 encoding | |
311 int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) { | |
312 int len; | |
313 if (utf8) { | |
314 w_char dest_utf[BUFSIZE]; | |
315 len = u8_u16(dest_utf, BUFSIZE, word); | |
316 *captype = get_captype_utf8(dest_utf, len, langnum); | |
317 } else { | |
318 len = wbl; | |
319 *captype = get_captype((char *) word, len, csconv); | |
320 } | |
321 return len; | |
322 } | |
323 | |
324 // remove word (personal dictionary function for standalone applications) | |
325 int HashMgr::remove(const char * word) | |
326 { | |
327 #ifdef HUNSPELL_CHROME_CLIENT | |
328 std::map<base::StringPiece, int>::iterator iter = | |
329 custom_word_to_affix_id_map_.find(word); | |
330 if (iter != custom_word_to_affix_id_map_.end()) | |
331 custom_word_to_affix_id_map_.erase(iter); | |
332 #else | |
333 struct hentry * dp = lookup(word); | |
334 while (dp) { | |
335 if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { | |
336 unsigned short * flags = | |
337 (unsigned short *) malloc(sizeof(short) * (dp->alen + 1)); | |
338 if (!flags) return 1; | |
339 for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i]; | |
340 flags[dp->alen] = forbiddenword; | |
341 dp->astr = flags; | |
342 dp->alen++; | |
343 flag_qsort(flags, 0, dp->alen); | |
344 } | |
345 dp = dp->next_homonym; | |
346 } | |
347 #endif | |
348 return 0; | |
349 } | |
350 | |
351 /* remove forbidden flag to add a personal word to the hash */ | |
352 int HashMgr::remove_forbidden_flag(const char * word) { | |
353 struct hentry * dp = lookup(word); | |
354 if (!dp) return 1; | |
355 while (dp) { | |
356 if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { | |
357 if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal
dic. | |
358 else { | |
359 unsigned short * flags2 = | |
360 (unsigned short *) malloc(sizeof(short) * (dp->alen - 1)); | |
361 if (!flags2) return 1; | |
362 int i, j = 0; | |
363 for (i = 0; i < dp->alen; i++) { | |
364 if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i]; | |
365 } | |
366 dp->alen--; | |
367 dp->astr = flags2; // XXX allowed forbidden words | |
368 } | |
369 } | |
370 dp = dp->next_homonym; | |
371 } | |
372 return 0; | |
373 } | |
374 | |
375 // add a custom dic. word to the hash table (public) | |
376 int HashMgr::add(const char * word) | |
377 { | |
378 unsigned short * flags = NULL; | |
379 int al = 0; | |
380 if (remove_forbidden_flag(word)) { | |
381 int captype; | |
382 int wbl = strlen(word); | |
383 int wcl = get_clen_and_captype(word, wbl, &captype); | |
384 add_word(word, wbl, wcl, flags, al, NULL, false); | |
385 return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, N
ULL, captype); | |
386 } | |
387 return 0; | |
388 } | |
389 | |
390 int HashMgr::add_with_affix(const char * word, const char * example) | |
391 { | |
392 // detect captype and modify word length for UTF-8 encoding | |
393 struct hentry * dp = lookup(example); | |
394 remove_forbidden_flag(word); | |
395 if (dp && dp->astr) { | |
396 int captype; | |
397 int wbl = strlen(word); | |
398 int wcl = get_clen_and_captype(word, wbl, &captype); | |
399 if (aliasf) { | |
400 add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); | |
401 } else { | |
402 unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeo
f(short)); | |
403 if (flags) { | |
404 memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(shor
t)); | |
405 add_word(word, wbl, wcl, flags, dp->alen, NULL, false); | |
406 } else return 1; | |
407 } | |
408 return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp
->alen, NULL, captype); | |
409 } | |
410 return 1; | |
411 } | |
412 | |
413 // walk the hash table entry by entry - null at end | |
414 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); | |
415 struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const | |
416 { | |
417 #ifdef HUNSPELL_CHROME_CLIENT | |
418 // Return NULL if dictionary is not valid. | |
419 if (!bdict_reader->IsValid()) | |
420 return NULL; | |
421 | |
422 // This function is only ever called by one place and not nested. We can | |
423 // therefore keep static state between calls and use |col| as a "reset" flag | |
424 // to avoid changing the API. It is set to -1 for the first call. | |
425 // Allocate the iterator on the heap to prevent an exit time destructor. | |
426 static hunspell::WordIterator& word_iterator = | |
427 *new hunspell::WordIterator(bdict_reader->GetAllWordIterator()); | |
428 if (col < 0) { | |
429 col = 1; | |
430 word_iterator = bdict_reader->GetAllWordIterator(); | |
431 } | |
432 | |
433 int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD]; | |
434 static const int kMaxWordLen = 128; | |
435 static char word[kMaxWordLen]; | |
436 int affix_count = word_iterator.Advance(word, kMaxWordLen, affix_ids); | |
437 if (affix_count == 0) | |
438 return NULL; | |
439 short word_len = static_cast<short>(strlen(word)); | |
440 | |
441 // Since hunspell 1.2.8, an hentry struct becomes a variable-length struct, | |
442 // i.e. a struct which uses its array 'word[1]' as a variable-length array. | |
443 // As noted above, this function is not nested. So, we just use a static | |
444 // struct which consists of an hentry and a char[kMaxWordLen], and initialize | |
445 // the static struct and return it for now. | |
446 // No need to create linked lists for the extra affixes. | |
447 static struct { | |
448 hentry entry; | |
449 char word[kMaxWordLen]; | |
450 } hash_entry; | |
451 | |
452 return InitHashEntry(&hash_entry.entry, sizeof(hash_entry), | |
453 &word[0], word_len, affix_ids[0]); | |
454 #else | |
455 if (hp && hp->next != NULL) return hp->next; | |
456 for (col++; col < tablesize; col++) { | |
457 if (tableptr[col]) return tableptr[col]; | |
458 } | |
459 // null at end and reset to start | |
460 col = -1; | |
461 return NULL; | |
462 #endif | |
463 } | |
464 | |
465 // load a munched word list and build a hash table on the fly | |
466 int HashMgr::load_tables(const char * tpath, const char * key) | |
467 { | |
468 #ifndef HUNSPELL_CHROME_CLIENT | |
469 int al; | |
470 char * ap; | |
471 char * dp; | |
472 char * dp2; | |
473 unsigned short * flags; | |
474 char * ts; | |
475 | |
476 // open dictionary file | |
477 FileMgr * dict = new FileMgr(tpath, key); | |
478 if (dict == NULL) return 1; | |
479 | |
480 // first read the first line of file to get hash table size */ | |
481 if ((ts = dict->getline()) == NULL) { | |
482 HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath); | |
483 delete dict; | |
484 return 2; | |
485 } | |
486 mychomp(ts); | |
487 | |
488 /* remove byte order mark */ | |
489 if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) { | |
490 memmove(ts, ts+3, strlen(ts+3)+1); | |
491 // warning: dic file begins with byte order mark: possible incompatibility w
ith old Hunspell versions | |
492 } | |
493 | |
494 tablesize = atoi(ts); | |
495 if (tablesize == 0) { | |
496 HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the di
c file\n"); | |
497 delete dict; | |
498 return 4; | |
499 } | |
500 tablesize = tablesize + 5 + USERWORD; | |
501 if ((tablesize %2) == 0) tablesize++; | |
502 | |
503 // allocate the hash table | |
504 tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *)); | |
505 if (! tableptr) { | |
506 delete dict; | |
507 return 3; | |
508 } | |
509 for (int i=0; i<tablesize; i++) tableptr[i] = NULL; | |
510 | |
511 // loop through all words on much list and add to hash | |
512 // table and create word and affix strings | |
513 | |
514 while ((ts = dict->getline()) != NULL) { | |
515 mychomp(ts); | |
516 // split each line into word and morphological description | |
517 dp = ts; | |
518 while ((dp = strchr(dp, ':')) != NULL) { | |
519 if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) { | |
520 for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--); | |
521 if (dp < ts) { // missing word | |
522 dp = NULL; | |
523 } else { | |
524 *(dp + 1) = '\0'; | |
525 dp = dp + 2; | |
526 } | |
527 break; | |
528 } | |
529 dp++; | |
530 } | |
531 | |
532 // tabulator is the old morphological field separator | |
533 dp2 = strchr(ts, '\t'); | |
534 if (dp2 && (!dp || dp2 < dp)) { | |
535 *dp2 = '\0'; | |
536 dp = dp2 + 1; | |
537 } | |
538 | |
539 // split each line into word and affix char strings | |
540 // "\/" signs slash in words (not affix separator) | |
541 // "/" at beginning of the line is word character (not affix separator) | |
542 ap = strchr(ts,'/'); | |
543 while (ap) { | |
544 if (ap == ts) { | |
545 ap++; | |
546 continue; | |
547 } else if (*(ap - 1) != '\\') break; | |
548 // replace "\/" with "/" | |
549 for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++); | |
550 ap = strchr(ap,'/'); | |
551 } | |
552 | |
553 if (ap) { | |
554 *ap = '\0'; | |
555 if (aliasf) { | |
556 int index = atoi(ap + 1); | |
557 al = get_aliasf(index, &flags, dict); | |
558 if (!al) { | |
559 HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n",
dict->getlinenum()); | |
560 *ap = '\0'; | |
561 } | |
562 } else { | |
563 al = decode_flags(&flags, ap + 1, dict); | |
564 if (al == -1) { | |
565 HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); | |
566 delete dict; | |
567 return 6; | |
568 } | |
569 flag_qsort(flags, 0, al); | |
570 } | |
571 } else { | |
572 al = 0; | |
573 ap = NULL; | |
574 flags = NULL; | |
575 } | |
576 | |
577 int captype; | |
578 int wbl = strlen(ts); | |
579 int wcl = get_clen_and_captype(ts, wbl, &captype); | |
580 // add the word and its index plus its capitalized form optionally | |
581 if (add_word(ts,wbl,wcl,flags,al,dp, false) || | |
582 add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) { | |
583 delete dict; | |
584 return 5; | |
585 } | |
586 } | |
587 | |
588 delete dict; | |
589 #endif | |
590 return 0; | |
591 } | |
592 | |
593 // the hash function is a simple load and rotate | |
594 // algorithm borrowed | |
595 | |
596 int HashMgr::hash(const char * word) const | |
597 { | |
598 #ifdef HUNSPELL_CHROME_CLIENT | |
599 return 0; | |
600 #else | |
601 long hv = 0; | |
602 for (int i=0; i < 4 && *word != 0; i++) | |
603 hv = (hv << 8) | (*word++); | |
604 while (*word != 0) { | |
605 ROTATE(hv,ROTATE_LEN); | |
606 hv ^= (*word++); | |
607 } | |
608 return (unsigned long) hv % tablesize; | |
609 #endif | |
610 } | |
611 | |
612 int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af)
{ | |
613 int len; | |
614 if (*flags == '\0') { | |
615 *result = NULL; | |
616 return 0; | |
617 } | |
618 switch (flag_mode) { | |
619 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) | |
620 len = strlen(flags); | |
621 if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector
\n", af->getlinenum()); | |
622 len /= 2; | |
623 *result = (unsigned short *) malloc(len * sizeof(short)); | |
624 if (!*result) return -1; | |
625 for (int i = 0; i < len; i++) { | |
626 (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned sh
ort) flags[i * 2 + 1]; | |
627 } | |
628 break; | |
629 } | |
630 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 452
1 23 233) | |
631 int i; | |
632 len = 1; | |
633 char * src = flags; | |
634 unsigned short * dest; | |
635 char * p; | |
636 for (p = flags; *p; p++) { | |
637 if (*p == ',') len++; | |
638 } | |
639 *result = (unsigned short *) malloc(len * sizeof(short)); | |
640 if (!*result) return -1; | |
641 dest = *result; | |
642 for (p = flags; *p; p++) { | |
643 if (*p == ',') { | |
644 i = atoi(src); | |
645 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: fla
g id %d is too large (max: %d)\n", | |
646 af->getlinenum(), i, DEFAULTFLAGS - 1); | |
647 *dest = (unsigned short) i; | |
648 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong
flag id\n", af->getlinenum()); | |
649 src = p + 1; | |
650 dest++; | |
651 } | |
652 } | |
653 i = atoi(src); | |
654 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id
%d is too large (max: %d)\n", | |
655 af->getlinenum(), i, DEFAULTFLAGS - 1); | |
656 *dest = (unsigned short) i; | |
657 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong fla
g id\n", af->getlinenum()); | |
658 break; | |
659 } | |
660 case FLAG_UNI: { // UTF-8 characters | |
661 w_char w[BUFSIZE/2]; | |
662 len = u8_u16(w, BUFSIZE/2, flags); | |
663 *result = (unsigned short *) malloc(len * sizeof(short)); | |
664 if (!*result) return -1; | |
665 memcpy(*result, w, len * sizeof(short)); | |
666 break; | |
667 } | |
668 default: { // Ispell's one-character flags (erfg -> e r f g) | |
669 unsigned short * dest; | |
670 len = strlen(flags); | |
671 *result = (unsigned short *) malloc(len * sizeof(short)); | |
672 if (!*result) return -1; | |
673 dest = *result; | |
674 for (unsigned char * p = (unsigned char *) flags; *p; p++) { | |
675 *dest = (unsigned short) *p; | |
676 dest++; | |
677 } | |
678 } | |
679 } | |
680 return len; | |
681 } | |
682 | |
683 unsigned short HashMgr::decode_flag(const char * f) { | |
684 unsigned short s = 0; | |
685 int i; | |
686 switch (flag_mode) { | |
687 case FLAG_LONG: | |
688 s = ((unsigned short) f[0] << 8) + (unsigned short) f[1]; | |
689 break; | |
690 case FLAG_NUM: | |
691 i = atoi(f); | |
692 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is to
o large (max: %d)\n", i, DEFAULTFLAGS - 1); | |
693 s = (unsigned short) i; | |
694 break; | |
695 case FLAG_UNI: | |
696 u8_u16((w_char *) &s, 1, f); | |
697 break; | |
698 default: | |
699 s = (unsigned short) *((unsigned char *)f); | |
700 } | |
701 if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); | |
702 return s; | |
703 } | |
704 | |
705 char * HashMgr::encode_flag(unsigned short f) { | |
706 unsigned char ch[10]; | |
707 if (f==0) return mystrdup("(NULL)"); | |
708 if (flag_mode == FLAG_LONG) { | |
709 ch[0] = (unsigned char) (f >> 8); | |
710 ch[1] = (unsigned char) (f - ((f >> 8) << 8)); | |
711 ch[2] = '\0'; | |
712 } else if (flag_mode == FLAG_NUM) { | |
713 sprintf((char *) ch, "%d", f); | |
714 } else if (flag_mode == FLAG_UNI) { | |
715 u16_u8((char *) &ch, 10, (w_char *) &f, 1); | |
716 } else { | |
717 ch[0] = (unsigned char) (f); | |
718 ch[1] = '\0'; | |
719 } | |
720 return mystrdup((char *) ch); | |
721 } | |
722 | |
723 // read in aff file and set flag mode | |
724 int HashMgr::load_config(const char * affpath, const char * key) | |
725 { | |
726 char * line; // io buffers | |
727 int firstline = 1; | |
728 | |
729 // open the affix file | |
730 #ifdef HUNSPELL_CHROME_CLIENT | |
731 hunspell::LineIterator iterator = bdict_reader->GetOtherLineIterator(); | |
732 FileMgr * afflst = new FileMgr(&iterator); | |
733 #else | |
734 FileMgr * afflst = new FileMgr(affpath, key); | |
735 #endif | |
736 if (!afflst) { | |
737 HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n
",affpath); | |
738 return 1; | |
739 } | |
740 | |
741 // read in each line ignoring any that do not | |
742 // start with a known line type indicator | |
743 | |
744 while ((line = afflst->getline()) != NULL) { | |
745 mychomp(line); | |
746 | |
747 /* remove byte order mark */ | |
748 if (firstline) { | |
749 firstline = 0; | |
750 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(l
ine+3)+1); | |
751 } | |
752 | |
753 /* parse in the try string */ | |
754 if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { | |
755 if (flag_mode != FLAG_CHAR) { | |
756 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions o
f the FLAG affix file parameter\n", afflst->getlinenum()); | |
757 } | |
758 if (strstr(line, "long")) flag_mode = FLAG_LONG; | |
759 if (strstr(line, "num")) flag_mode = FLAG_NUM; | |
760 if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI; | |
761 if (flag_mode == FLAG_CHAR) { | |
762 HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `lon
g' or `UTF-8' parameter\n", afflst->getlinenum()); | |
763 } | |
764 } | |
765 if (strncmp(line,"FORBIDDENWORD",13) == 0) { | |
766 char * st = NULL; | |
767 if (parse_string(line, &st, afflst->getlinenum())) { | |
768 delete afflst; | |
769 return 1; | |
770 } | |
771 forbiddenword = decode_flag(st); | |
772 free(st); | |
773 } | |
774 if (strncmp(line, "SET", 3) == 0) { | |
775 if (parse_string(line, &enc, afflst->getlinenum())) { | |
776 delete afflst; | |
777 return 1; | |
778 } | |
779 if (strcmp(enc, "UTF-8") == 0) { | |
780 utf8 = 1; | |
781 #ifndef OPENOFFICEORG | |
782 #ifndef MOZILLA_CLIENT | |
783 initialize_utf_tbl(); | |
784 #endif | |
785 #endif | |
786 } else csconv = get_current_cs(enc); | |
787 } | |
788 if (strncmp(line, "LANG", 4) == 0) { | |
789 if (parse_string(line, &lang, afflst->getlinenum())) { | |
790 delete afflst; | |
791 return 1; | |
792 } | |
793 langnum = get_lang_num(lang); | |
794 } | |
795 | |
796 /* parse in the ignored characters (for example, Arabic optional diacriti
cs characters */ | |
797 if (strncmp(line,"IGNORE",6) == 0) { | |
798 if (parse_array(line, &ignorechars, &ignorechars_utf16, | |
799 &ignorechars_utf16_len, utf8, afflst->getlinenum())) { | |
800 delete afflst; | |
801 return 1; | |
802 } | |
803 } | |
804 | |
805 if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) { | |
806 if (parse_aliasf(line, afflst)) { | |
807 delete afflst; | |
808 return 1; | |
809 } | |
810 } | |
811 | |
812 if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) { | |
813 if (parse_aliasm(line, afflst)) { | |
814 delete afflst; | |
815 return 1; | |
816 } | |
817 } | |
818 | |
819 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; | |
820 if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && iss
pace(line[3])) break; | |
821 } | |
822 if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING); | |
823 delete afflst; | |
824 return 0; | |
825 } | |
826 | |
827 /* parse in the ALIAS table */ | |
828 int HashMgr::parse_aliasf(char * line, FileMgr * af) | |
829 { | |
830 if (numaliasf != 0) { | |
831 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
832 return 1; | |
833 } | |
834 char * tp = line; | |
835 char * piece; | |
836 int i = 0; | |
837 int np = 0; | |
838 piece = mystrsep(&tp, 0); | |
839 while (piece) { | |
840 if (*piece != '\0') { | |
841 switch(i) { | |
842 case 0: { np++; break; } | |
843 case 1: { | |
844 numaliasf = atoi(piece); | |
845 if (numaliasf < 1) { | |
846 numaliasf = 0; | |
847 aliasf = NULL; | |
848 aliasflen = NULL; | |
849 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu
mber\n", af->getlinenum()); | |
850 return 1; | |
851 } | |
852 aliasf = (unsigned short **) malloc(numaliasf * sizeof(un
signed short *)); | |
853 aliasflen = (unsigned short *) malloc(numaliasf * sizeof(
short)); | |
854 if (!aliasf || !aliasflen) { | |
855 numaliasf = 0; | |
856 if (aliasf) free(aliasf); | |
857 if (aliasflen) free(aliasflen); | |
858 aliasf = NULL; | |
859 aliasflen = NULL; | |
860 return 1; | |
861 } | |
862 np++; | |
863 break; | |
864 } | |
865 default: break; | |
866 } | |
867 i++; | |
868 } | |
869 piece = mystrsep(&tp, 0); | |
870 } | |
871 if (np != 2) { | |
872 numaliasf = 0; | |
873 free(aliasf); | |
874 free(aliasflen); | |
875 aliasf = NULL; | |
876 aliasflen = NULL; | |
877 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum(
)); | |
878 return 1; | |
879 } | |
880 | |
881 /* now parse the numaliasf lines to read in the remainder of the table */ | |
882 char * nl; | |
883 for (int j=0; j < numaliasf; j++) { | |
884 if ((nl = af->getline()) == NULL) return 1; | |
885 mychomp(nl); | |
886 tp = nl; | |
887 i = 0; | |
888 aliasf[j] = NULL; | |
889 aliasflen[j] = 0; | |
890 piece = mystrsep(&tp, 0); | |
891 while (piece) { | |
892 if (*piece != '\0') { | |
893 switch(i) { | |
894 case 0: { | |
895 if (strncmp(piece,"AF",2) != 0) { | |
896 numaliasf = 0; | |
897 free(aliasf); | |
898 free(aliasflen); | |
899 aliasf = NULL; | |
900 aliasflen = NULL; | |
901 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
902 return 1; | |
903 } | |
904 break; | |
905 } | |
906 case 1: { | |
907 aliasflen[j] = (unsigned short) decode_flags(&(alias
f[j]), piece, af); | |
908 flag_qsort(aliasf[j], 0, aliasflen[j]); | |
909 break; | |
910 } | |
911 default: break; | |
912 } | |
913 i++; | |
914 } | |
915 piece = mystrsep(&tp, 0); | |
916 } | |
917 if (!aliasf[j]) { | |
918 free(aliasf); | |
919 free(aliasflen); | |
920 aliasf = NULL; | |
921 aliasflen = NULL; | |
922 numaliasf = 0; | |
923 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->
getlinenum()); | |
924 return 1; | |
925 } | |
926 } | |
927 return 0; | |
928 } | |
929 | |
930 #ifdef HUNSPELL_CHROME_CLIENT | |
931 int HashMgr::LoadAFLines() | |
932 { | |
933 utf8 = 1; // We always use UTF-8. | |
934 | |
935 // Read in all the AF lines which tell us the rules for each affix group ID. | |
936 hunspell::LineIterator iterator = bdict_reader->GetAfLineIterator(); | |
937 FileMgr afflst(&iterator); | |
938 while (char* line = afflst.getline()) { | |
939 int rv = parse_aliasf(line, &afflst); | |
940 if (rv) | |
941 return rv; | |
942 } | |
943 | |
944 return 0; | |
945 } | |
946 | |
947 hentry* HashMgr::InitHashEntry(hentry* entry, | |
948 size_t item_size, | |
949 const char* word, | |
950 int word_length, | |
951 int affix_index) const { | |
952 // Return if the given buffer doesn't have enough space for a hentry struct | |
953 // or the given word is too long. | |
954 // Our BDICT cannot handle words longer than (128 - 1) bytes. So, it is | |
955 // better to return an error if the given word is too long and prevent | |
956 // an unexpected result caused by a long word. | |
957 const int kMaxWordLen = 128; | |
958 if (item_size < sizeof(hentry) + word_length + 1 || | |
959 word_length >= kMaxWordLen) | |
960 return NULL; | |
961 | |
962 // Initialize a hentry struct with the given parameters, and | |
963 // append the given string at the end of this hentry struct. | |
964 memset(entry, 0, item_size); | |
965 FileMgr af(NULL); | |
966 entry->alen = static_cast<short>( | |
967 const_cast<HashMgr*>(this)->get_aliasf(affix_index, &entry->astr, &af)); | |
968 entry->blen = static_cast<unsigned char>(word_length); | |
969 memcpy(&entry->word, word, word_length); | |
970 | |
971 return entry; | |
972 } | |
973 | |
974 hentry* HashMgr::CreateHashEntry(const char* word, | |
975 int word_length, | |
976 int affix_index) const { | |
977 // Return if the given word is too long. | |
978 // (See the comment in HashMgr::InitHashEntry().) | |
979 const int kMaxWordLen = 128; | |
980 if (word_length >= kMaxWordLen) | |
981 return NULL; | |
982 | |
983 const size_t kEntrySize = sizeof(hentry) + word_length + 1; | |
984 struct hentry* entry = reinterpret_cast<hentry*>(malloc(kEntrySize)); | |
985 if (entry) | |
986 InitHashEntry(entry, kEntrySize, word, word_length, affix_index); | |
987 | |
988 return entry; | |
989 } | |
990 | |
991 void HashMgr::DeleteHashEntry(hentry* entry) const { | |
992 free(entry); | |
993 } | |
994 | |
995 hentry* HashMgr::AffixIDsToHentry(char* word, | |
996 int* affix_ids, | |
997 int affix_count) const | |
998 { | |
999 if (affix_count == 0) | |
1000 return NULL; | |
1001 | |
1002 HEntryCache& cache = const_cast<HashMgr*>(this)->hentry_cache; | |
1003 std::string std_word(word); | |
1004 HEntryCache::iterator found = cache.find(std_word); | |
1005 if (found != cache.end()) { | |
1006 // We must return an existing hentry for the same word if we've previously | |
1007 // handed one out. Hunspell will compare pointers in some cases to see if | |
1008 // two words it has found are the same. | |
1009 return found->second; | |
1010 } | |
1011 | |
1012 short word_len = static_cast<short>(strlen(word)); | |
1013 | |
1014 // We can get a number of prefixes per word. There will normally be only one, | |
1015 // but if not, there will be a linked list of "hentry"s for the "homonym"s | |
1016 // for the word. | |
1017 struct hentry* first_he = NULL; | |
1018 struct hentry* prev_he = NULL; // For making linked list. | |
1019 for (int i = 0; i < affix_count; i++) { | |
1020 struct hentry* he = CreateHashEntry(word, word_len, affix_ids[i]); | |
1021 if (!he) | |
1022 break; | |
1023 if (i == 0) | |
1024 first_he = he; | |
1025 if (prev_he) | |
1026 prev_he->next_homonym = he; | |
1027 prev_he = he; | |
1028 } | |
1029 | |
1030 cache[std_word] = first_he; // Save this word in the cache for later. | |
1031 return first_he; | |
1032 } | |
1033 | |
1034 hentry* HashMgr::GetHentryFromHEntryCache(char* word) { | |
1035 HEntryCache& cache = const_cast<HashMgr*>(this)->hentry_cache; | |
1036 std::string std_word(word); | |
1037 HEntryCache::iterator found = cache.find(std_word); | |
1038 if (found != cache.end()) | |
1039 return found->second; | |
1040 else | |
1041 return NULL; | |
1042 } | |
1043 #endif | |
1044 | |
1045 int HashMgr::is_aliasf() { | |
1046 return (aliasf != NULL); | |
1047 } | |
1048 | |
1049 int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) { | |
1050 if ((index > 0) && (index <= numaliasf)) { | |
1051 *fvec = aliasf[index - 1]; | |
1052 return aliasflen[index - 1]; | |
1053 } | |
1054 HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->g
etlinenum(), index); | |
1055 *fvec = NULL; | |
1056 return 0; | |
1057 } | |
1058 | |
1059 /* parse morph alias definitions */ | |
1060 int HashMgr::parse_aliasm(char * line, FileMgr * af) | |
1061 { | |
1062 if (numaliasm != 0) { | |
1063 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a
f->getlinenum()); | |
1064 return 1; | |
1065 } | |
1066 char * tp = line; | |
1067 char * piece; | |
1068 int i = 0; | |
1069 int np = 0; | |
1070 piece = mystrsep(&tp, 0); | |
1071 while (piece) { | |
1072 if (*piece != '\0') { | |
1073 switch(i) { | |
1074 case 0: { np++; break; } | |
1075 case 1: { | |
1076 numaliasm = atoi(piece); | |
1077 if (numaliasm < 1) { | |
1078 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu
mber\n", af->getlinenum()); | |
1079 return 1; | |
1080 } | |
1081 aliasm = (char **) malloc(numaliasm * sizeof(char *)); | |
1082 if (!aliasm) { | |
1083 numaliasm = 0; | |
1084 return 1; | |
1085 } | |
1086 np++; | |
1087 break; | |
1088 } | |
1089 default: break; | |
1090 } | |
1091 i++; | |
1092 } | |
1093 piece = mystrsep(&tp, 0); | |
1094 } | |
1095 if (np != 2) { | |
1096 numaliasm = 0; | |
1097 free(aliasm); | |
1098 aliasm = NULL; | |
1099 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum(
)); | |
1100 return 1; | |
1101 } | |
1102 | |
1103 /* now parse the numaliasm lines to read in the remainder of the table */ | |
1104 char * nl = line; | |
1105 for (int j=0; j < numaliasm; j++) { | |
1106 if ((nl = af->getline()) == NULL) return 1; | |
1107 mychomp(nl); | |
1108 tp = nl; | |
1109 i = 0; | |
1110 aliasm[j] = NULL; | |
1111 piece = mystrsep(&tp, ' '); | |
1112 while (piece) { | |
1113 if (*piece != '\0') { | |
1114 switch(i) { | |
1115 case 0: { | |
1116 if (strncmp(piece,"AM",2) != 0) { | |
1117 HUNSPELL_WARNING(stderr, "error: line %d: table
is corrupt\n", af->getlinenum()); | |
1118 numaliasm = 0; | |
1119 free(aliasm); | |
1120 aliasm = NULL; | |
1121 return 1; | |
1122 } | |
1123 break; | |
1124 } | |
1125 case 1: { | |
1126 // add the remaining of the line | |
1127 if (*tp) { | |
1128 *(tp - 1) = ' '; | |
1129 tp = tp + strlen(tp); | |
1130 } | |
1131 if (complexprefixes) { | |
1132 if (utf8) reverseword_utf(piece); | |
1133 else reverseword(piece); | |
1134 } | |
1135 aliasm[j] = mystrdup(piece); | |
1136 if (!aliasm[j]) { | |
1137 numaliasm = 0; | |
1138 free(aliasm); | |
1139 aliasm = NULL; | |
1140 return 1; | |
1141 } | |
1142 break; } | |
1143 default: break; | |
1144 } | |
1145 i++; | |
1146 } | |
1147 piece = mystrsep(&tp, ' '); | |
1148 } | |
1149 if (!aliasm[j]) { | |
1150 numaliasm = 0; | |
1151 free(aliasm); | |
1152 aliasm = NULL; | |
1153 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->
getlinenum()); | |
1154 return 1; | |
1155 } | |
1156 } | |
1157 return 0; | |
1158 } | |
1159 | |
1160 int HashMgr::is_aliasm() { | |
1161 return (aliasm != NULL); | |
1162 } | |
1163 | |
1164 char * HashMgr::get_aliasm(int index) { | |
1165 if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1]; | |
1166 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); | |
1167 return NULL; | |
1168 } | |
OLD | NEW |