Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(71)

Side by Side Diff: third_party/hunspell_new/src/hunspell/hashmgr.cxx

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #include "license.hunspell"
2 #include "license.myspell"
3
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
7 #include <ctype.h>
8
9 #include "hashmgr.hxx"
10 #include "csutil.hxx"
11 #include "atypes.hxx"
12
13 // build a hash table from a munched word list
14
15 #ifdef HUNSPELL_CHROME_CLIENT
16 HashMgr::HashMgr(hunspell::BDictReader* reader)
17 {
18 bdict_reader = reader;
19 #else
20 HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
21 {
22 #endif
23 tablesize = 0;
24 tableptr = NULL;
25 flag_mode = FLAG_CHAR;
26 complexprefixes = 0;
27 utf8 = 0;
28 langnum = 0;
29 lang = NULL;
30 enc = NULL;
31 csconv = 0;
32 ignorechars = NULL;
33 ignorechars_utf16 = NULL;
34 ignorechars_utf16_len = 0;
35 numaliasf = 0;
36 aliasf = NULL;
37 numaliasm = 0;
38 aliasm = NULL;
39 forbiddenword = FORBIDDENWORD; // forbidden word signing flag
40 #ifdef HUNSPELL_CHROME_CLIENT
41 // No tables to load, just the AF lines.
42 load_config(NULL, NULL);
43 int ec = LoadAFLines();
44 #else
45 load_config(apath, key);
46 int ec = load_tables(tpath, key);
47 #endif
48 if (ec) {
49 /* error condition - what should we do here */
50 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
51 if (tableptr) {
52 free(tableptr);
53 tableptr = NULL;
54 }
55 tablesize = 0;
56 }
57 }
58
59
60 HashMgr::~HashMgr()
61 {
62 if (tableptr) {
63 // now pass through hash table freeing up everything
64 // go through column by column of the table
65 for (int i=0; i < tablesize; i++) {
66 struct hentry * pt = tableptr[i];
67 struct hentry * nt = NULL;
68 while(pt) {
69 nt = pt->next;
70 if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen)) ) free(pt->astr);
71 free(pt);
72 pt = nt;
73 }
74 }
75 free(tableptr);
76 }
77 tablesize = 0;
78
79 if (aliasf) {
80 for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
81 free(aliasf);
82 aliasf = NULL;
83 if (aliasflen) {
84 free(aliasflen);
85 aliasflen = NULL;
86 }
87 }
88 if (aliasm) {
89 for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
90 free(aliasm);
91 aliasm = NULL;
92 }
93
94 #ifndef OPENOFFICEORG
95 #ifndef MOZILLA_CLIENT
96 if (utf8) free_utf_tbl();
97 #endif
98 #endif
99
100 if (enc) free(enc);
101 if (lang) free(lang);
102
103 if (ignorechars) free(ignorechars);
104 if (ignorechars_utf16) free(ignorechars_utf16);
105
106 #ifdef HUNSPELL_CHROME_CLIENT
107 EmptyHentryCache();
108 for (std::vector<std::string*>::iterator it = pointer_to_strings_.begin();
109 it != pointer_to_strings_.end(); ++it) {
110 delete *it;
111 }
112 #endif
113 #ifdef MOZILLA_CLIENT
114 delete [] csconv;
115 #endif
116 }
117
118 #ifdef HUNSPELL_CHROME_CLIENT
119 void HashMgr::EmptyHentryCache() {
120 // We need to delete each cache entry, and each additional one in the linked
121 // list of homonyms.
122 for (HEntryCache::iterator i = hentry_cache.begin();
123 i != hentry_cache.end(); ++i) {
124 hentry* cur = i->second;
125 while (cur) {
126 hentry* next = cur->next_homonym;
127 DeleteHashEntry(cur);
128 cur = next;
129 }
130 }
131 hentry_cache.clear();
132 }
133 #endif
134
135 // lookup a root word in the hashtable
136
137 struct hentry * HashMgr::lookup(const char *word) const
138 {
139 #ifdef HUNSPELL_CHROME_CLIENT
140 int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];
141 int affix_count = bdict_reader->FindWord(word, affix_ids);
142 if (affix_count == 0) { // look for custom added word
143 std::map<base::StringPiece, int>::const_iterator iter =
144 custom_word_to_affix_id_map_.find(word);
145 if (iter != custom_word_to_affix_id_map_.end()) {
146 affix_count = 1;
147 affix_ids[0] = iter->second;
148 }
149 }
150
151 static const int kMaxWordLen = 128;
152 static char word_buf[kMaxWordLen];
153 // To take account of null-termination, we use upto 127.
154 strncpy(word_buf, word, kMaxWordLen - 1);
155
156 return AffixIDsToHentry(word_buf, affix_ids, affix_count);
157 #else
158 struct hentry * dp;
159 if (tableptr) {
160 dp = tableptr[hash(word)];
161 if (!dp) return NULL;
162 for ( ; dp != NULL; dp = dp->next) {
163 if (strcmp(word, dp->word) == 0) return dp;
164 }
165 }
166 return NULL;
167 #endif
168 }
169
170 // add a word to the hash table (private)
171 int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff,
172 int al, const char * desc, bool onlyupcase)
173 {
174 #ifndef HUNSPELL_CHROME_CLIENT
175 bool upcasehomonym = false;
176 int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0;
177 // variable-length hash record with word and optional fields
178 struct hentry* hp =
179 (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl);
180 if (!hp) return 1;
181 char * hpw = hp->word;
182 strcpy(hpw, word);
183 if (ignorechars != NULL) {
184 if (utf8) {
185 remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len);
186 } else {
187 remove_ignored_chars(hpw, ignorechars);
188 }
189 }
190 if (complexprefixes) {
191 if (utf8) reverseword_utf(hpw); else reverseword(hpw);
192 }
193
194 int i = hash(hpw);
195
196 hp->blen = (unsigned char) wbl;
197 hp->clen = (unsigned char) wcl;
198 hp->alen = (short) al;
199 hp->astr = aff;
200 hp->next = NULL;
201 hp->next_homonym = NULL;
202
203 // store the description string or its pointer
204 if (desc) {
205 hp->var = H_OPT;
206 if (aliasm) {
207 hp->var += H_OPT_ALIASM;
208 store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc)));
209 } else {
210 strcpy(hpw + wbl + 1, desc);
211 if (complexprefixes) {
212 if (utf8) reverseword_utf(HENTRY_DATA(hp));
213 else reverseword(HENTRY_DATA(hp));
214 }
215 }
216 if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON;
217 } else hp->var = 0;
218
219 struct hentry * dp = tableptr[i];
220 if (!dp) {
221 tableptr[i] = hp;
222 return 0;
223 }
224 while (dp->next != NULL) {
225 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
226 // remove hidden onlyupcase homonym
227 if (!onlyupcase) {
228 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
229 free(dp->astr);
230 dp->astr = hp->astr;
231 dp->alen = hp->alen;
232 free(hp);
233 return 0;
234 } else {
235 dp->next_homonym = hp;
236 }
237 } else {
238 upcasehomonym = true;
239 }
240 }
241 dp=dp->next;
242 }
243 if (strcmp(hp->word, dp->word) == 0) {
244 // remove hidden onlyupcase homonym
245 if (!onlyupcase) {
246 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
247 free(dp->astr);
248 dp->astr = hp->astr;
249 dp->alen = hp->alen;
250 free(hp);
251 return 0;
252 } else {
253 dp->next_homonym = hp;
254 }
255 } else {
256 upcasehomonym = true;
257 }
258 }
259 if (!upcasehomonym) {
260 dp->next = hp;
261 } else {
262 // remove hidden onlyupcase homonym
263 if (hp->astr) free(hp->astr);
264 free(hp);
265 }
266 #else
267 std::map<base::StringPiece, int>::iterator iter =
268 custom_word_to_affix_id_map_.find(word);
269 if(iter == custom_word_to_affix_id_map_.end()) { // word needs to be added
270 std::string* new_string_word = new std::string(word);
271 pointer_to_strings_.push_back(new_string_word);
272 base::StringPiece sp(*(new_string_word));
273 custom_word_to_affix_id_map_[sp] = 0; // no affixes for custom words
274 return 1;
275 }
276 #endif
277 return 0;
278 }
279
280 int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl,
281 unsigned short * flags, int al, char * dp, int captype)
282 {
283 // add inner capitalized forms to handle the following allcap forms:
284 // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
285 // Allcaps with suffixes: CIA's -> CIA'S
286 if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
287 ((captype == ALLCAP) && (flags != NULL))) &&
288 !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) {
289 unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned s hort) * (al+1));
290 if (!flags2) return 1;
291 if (al) memcpy(flags2, flags, al * sizeof(unsigned short));
292 flags2[al] = ONLYUPCASEFLAG;
293 if (utf8) {
294 char st[BUFSIZE];
295 w_char w[BUFSIZE];
296 int wlen = u8_u16(w, BUFSIZE, word);
297 mkallsmall_utf(w, wlen, langnum);
298 mkallcap_utf(w, 1, langnum);
299 u16_u8(st, BUFSIZE, w, wlen);
300 return add_word(st,wbl,wcl,flags2,al+1,dp, true);
301 } else {
302 mkallsmall(word, csconv);
303 mkinitcap(word, csconv);
304 return add_word(word,wbl,wcl,flags2,al+1,dp, true);
305 }
306 }
307 return 0;
308 }
309
310 // detect captype and modify word length for UTF-8 encoding
311 int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) {
312 int len;
313 if (utf8) {
314 w_char dest_utf[BUFSIZE];
315 len = u8_u16(dest_utf, BUFSIZE, word);
316 *captype = get_captype_utf8(dest_utf, len, langnum);
317 } else {
318 len = wbl;
319 *captype = get_captype((char *) word, len, csconv);
320 }
321 return len;
322 }
323
324 // remove word (personal dictionary function for standalone applications)
325 int HashMgr::remove(const char * word)
326 {
327 #ifdef HUNSPELL_CHROME_CLIENT
328 std::map<base::StringPiece, int>::iterator iter =
329 custom_word_to_affix_id_map_.find(word);
330 if (iter != custom_word_to_affix_id_map_.end())
331 custom_word_to_affix_id_map_.erase(iter);
332 #else
333 struct hentry * dp = lookup(word);
334 while (dp) {
335 if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
336 unsigned short * flags =
337 (unsigned short *) malloc(sizeof(short) * (dp->alen + 1));
338 if (!flags) return 1;
339 for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i];
340 flags[dp->alen] = forbiddenword;
341 dp->astr = flags;
342 dp->alen++;
343 flag_qsort(flags, 0, dp->alen);
344 }
345 dp = dp->next_homonym;
346 }
347 #endif
348 return 0;
349 }
350
351 /* remove forbidden flag to add a personal word to the hash */
352 int HashMgr::remove_forbidden_flag(const char * word) {
353 struct hentry * dp = lookup(word);
354 if (!dp) return 1;
355 while (dp) {
356 if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
357 if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic.
358 else {
359 unsigned short * flags2 =
360 (unsigned short *) malloc(sizeof(short) * (dp->alen - 1));
361 if (!flags2) return 1;
362 int i, j = 0;
363 for (i = 0; i < dp->alen; i++) {
364 if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i];
365 }
366 dp->alen--;
367 dp->astr = flags2; // XXX allowed forbidden words
368 }
369 }
370 dp = dp->next_homonym;
371 }
372 return 0;
373 }
374
375 // add a custom dic. word to the hash table (public)
376 int HashMgr::add(const char * word)
377 {
378 unsigned short * flags = NULL;
379 int al = 0;
380 if (remove_forbidden_flag(word)) {
381 int captype;
382 int wbl = strlen(word);
383 int wcl = get_clen_and_captype(word, wbl, &captype);
384 add_word(word, wbl, wcl, flags, al, NULL, false);
385 return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, N ULL, captype);
386 }
387 return 0;
388 }
389
390 int HashMgr::add_with_affix(const char * word, const char * example)
391 {
392 // detect captype and modify word length for UTF-8 encoding
393 struct hentry * dp = lookup(example);
394 remove_forbidden_flag(word);
395 if (dp && dp->astr) {
396 int captype;
397 int wbl = strlen(word);
398 int wcl = get_clen_and_captype(word, wbl, &captype);
399 if (aliasf) {
400 add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false);
401 } else {
402 unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeo f(short));
403 if (flags) {
404 memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(shor t));
405 add_word(word, wbl, wcl, flags, dp->alen, NULL, false);
406 } else return 1;
407 }
408 return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp ->alen, NULL, captype);
409 }
410 return 1;
411 }
412
413 // walk the hash table entry by entry - null at end
414 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
415 struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
416 {
417 #ifdef HUNSPELL_CHROME_CLIENT
418 // Return NULL if dictionary is not valid.
419 if (!bdict_reader->IsValid())
420 return NULL;
421
422 // This function is only ever called by one place and not nested. We can
423 // therefore keep static state between calls and use |col| as a "reset" flag
424 // to avoid changing the API. It is set to -1 for the first call.
425 // Allocate the iterator on the heap to prevent an exit time destructor.
426 static hunspell::WordIterator& word_iterator =
427 *new hunspell::WordIterator(bdict_reader->GetAllWordIterator());
428 if (col < 0) {
429 col = 1;
430 word_iterator = bdict_reader->GetAllWordIterator();
431 }
432
433 int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];
434 static const int kMaxWordLen = 128;
435 static char word[kMaxWordLen];
436 int affix_count = word_iterator.Advance(word, kMaxWordLen, affix_ids);
437 if (affix_count == 0)
438 return NULL;
439 short word_len = static_cast<short>(strlen(word));
440
441 // Since hunspell 1.2.8, an hentry struct becomes a variable-length struct,
442 // i.e. a struct which uses its array 'word[1]' as a variable-length array.
443 // As noted above, this function is not nested. So, we just use a static
444 // struct which consists of an hentry and a char[kMaxWordLen], and initialize
445 // the static struct and return it for now.
446 // No need to create linked lists for the extra affixes.
447 static struct {
448 hentry entry;
449 char word[kMaxWordLen];
450 } hash_entry;
451
452 return InitHashEntry(&hash_entry.entry, sizeof(hash_entry),
453 &word[0], word_len, affix_ids[0]);
454 #else
455 if (hp && hp->next != NULL) return hp->next;
456 for (col++; col < tablesize; col++) {
457 if (tableptr[col]) return tableptr[col];
458 }
459 // null at end and reset to start
460 col = -1;
461 return NULL;
462 #endif
463 }
464
465 // load a munched word list and build a hash table on the fly
466 int HashMgr::load_tables(const char * tpath, const char * key)
467 {
468 #ifndef HUNSPELL_CHROME_CLIENT
469 int al;
470 char * ap;
471 char * dp;
472 char * dp2;
473 unsigned short * flags;
474 char * ts;
475
476 // open dictionary file
477 FileMgr * dict = new FileMgr(tpath, key);
478 if (dict == NULL) return 1;
479
480 // first read the first line of file to get hash table size */
481 if ((ts = dict->getline()) == NULL) {
482 HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath);
483 delete dict;
484 return 2;
485 }
486 mychomp(ts);
487
488 /* remove byte order mark */
489 if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) {
490 memmove(ts, ts+3, strlen(ts+3)+1);
491 // warning: dic file begins with byte order mark: possible incompatibility w ith old Hunspell versions
492 }
493
494 tablesize = atoi(ts);
495 if (tablesize == 0) {
496 HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the di c file\n");
497 delete dict;
498 return 4;
499 }
500 tablesize = tablesize + 5 + USERWORD;
501 if ((tablesize %2) == 0) tablesize++;
502
503 // allocate the hash table
504 tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *));
505 if (! tableptr) {
506 delete dict;
507 return 3;
508 }
509 for (int i=0; i<tablesize; i++) tableptr[i] = NULL;
510
511 // loop through all words on much list and add to hash
512 // table and create word and affix strings
513
514 while ((ts = dict->getline()) != NULL) {
515 mychomp(ts);
516 // split each line into word and morphological description
517 dp = ts;
518 while ((dp = strchr(dp, ':')) != NULL) {
519 if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) {
520 for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--);
521 if (dp < ts) { // missing word
522 dp = NULL;
523 } else {
524 *(dp + 1) = '\0';
525 dp = dp + 2;
526 }
527 break;
528 }
529 dp++;
530 }
531
532 // tabulator is the old morphological field separator
533 dp2 = strchr(ts, '\t');
534 if (dp2 && (!dp || dp2 < dp)) {
535 *dp2 = '\0';
536 dp = dp2 + 1;
537 }
538
539 // split each line into word and affix char strings
540 // "\/" signs slash in words (not affix separator)
541 // "/" at beginning of the line is word character (not affix separator)
542 ap = strchr(ts,'/');
543 while (ap) {
544 if (ap == ts) {
545 ap++;
546 continue;
547 } else if (*(ap - 1) != '\\') break;
548 // replace "\/" with "/"
549 for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
550 ap = strchr(ap,'/');
551 }
552
553 if (ap) {
554 *ap = '\0';
555 if (aliasf) {
556 int index = atoi(ap + 1);
557 al = get_aliasf(index, &flags, dict);
558 if (!al) {
559 HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum());
560 *ap = '\0';
561 }
562 } else {
563 al = decode_flags(&flags, ap + 1, dict);
564 if (al == -1) {
565 HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
566 delete dict;
567 return 6;
568 }
569 flag_qsort(flags, 0, al);
570 }
571 } else {
572 al = 0;
573 ap = NULL;
574 flags = NULL;
575 }
576
577 int captype;
578 int wbl = strlen(ts);
579 int wcl = get_clen_and_captype(ts, wbl, &captype);
580 // add the word and its index plus its capitalized form optionally
581 if (add_word(ts,wbl,wcl,flags,al,dp, false) ||
582 add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) {
583 delete dict;
584 return 5;
585 }
586 }
587
588 delete dict;
589 #endif
590 return 0;
591 }
592
593 // the hash function is a simple load and rotate
594 // algorithm borrowed
595
596 int HashMgr::hash(const char * word) const
597 {
598 #ifdef HUNSPELL_CHROME_CLIENT
599 return 0;
600 #else
601 long hv = 0;
602 for (int i=0; i < 4 && *word != 0; i++)
603 hv = (hv << 8) | (*word++);
604 while (*word != 0) {
605 ROTATE(hv,ROTATE_LEN);
606 hv ^= (*word++);
607 }
608 return (unsigned long) hv % tablesize;
609 #endif
610 }
611
612 int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) {
613 int len;
614 if (*flags == '\0') {
615 *result = NULL;
616 return 0;
617 }
618 switch (flag_mode) {
619 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
620 len = strlen(flags);
621 if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector \n", af->getlinenum());
622 len /= 2;
623 *result = (unsigned short *) malloc(len * sizeof(short));
624 if (!*result) return -1;
625 for (int i = 0; i < len; i++) {
626 (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned sh ort) flags[i * 2 + 1];
627 }
628 break;
629 }
630 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 452 1 23 233)
631 int i;
632 len = 1;
633 char * src = flags;
634 unsigned short * dest;
635 char * p;
636 for (p = flags; *p; p++) {
637 if (*p == ',') len++;
638 }
639 *result = (unsigned short *) malloc(len * sizeof(short));
640 if (!*result) return -1;
641 dest = *result;
642 for (p = flags; *p; p++) {
643 if (*p == ',') {
644 i = atoi(src);
645 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: fla g id %d is too large (max: %d)\n",
646 af->getlinenum(), i, DEFAULTFLAGS - 1);
647 *dest = (unsigned short) i;
648 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
649 src = p + 1;
650 dest++;
651 }
652 }
653 i = atoi(src);
654 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
655 af->getlinenum(), i, DEFAULTFLAGS - 1);
656 *dest = (unsigned short) i;
657 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong fla g id\n", af->getlinenum());
658 break;
659 }
660 case FLAG_UNI: { // UTF-8 characters
661 w_char w[BUFSIZE/2];
662 len = u8_u16(w, BUFSIZE/2, flags);
663 *result = (unsigned short *) malloc(len * sizeof(short));
664 if (!*result) return -1;
665 memcpy(*result, w, len * sizeof(short));
666 break;
667 }
668 default: { // Ispell's one-character flags (erfg -> e r f g)
669 unsigned short * dest;
670 len = strlen(flags);
671 *result = (unsigned short *) malloc(len * sizeof(short));
672 if (!*result) return -1;
673 dest = *result;
674 for (unsigned char * p = (unsigned char *) flags; *p; p++) {
675 *dest = (unsigned short) *p;
676 dest++;
677 }
678 }
679 }
680 return len;
681 }
682
683 unsigned short HashMgr::decode_flag(const char * f) {
684 unsigned short s = 0;
685 int i;
686 switch (flag_mode) {
687 case FLAG_LONG:
688 s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
689 break;
690 case FLAG_NUM:
691 i = atoi(f);
692 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is to o large (max: %d)\n", i, DEFAULTFLAGS - 1);
693 s = (unsigned short) i;
694 break;
695 case FLAG_UNI:
696 u8_u16((w_char *) &s, 1, f);
697 break;
698 default:
699 s = (unsigned short) *((unsigned char *)f);
700 }
701 if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
702 return s;
703 }
704
705 char * HashMgr::encode_flag(unsigned short f) {
706 unsigned char ch[10];
707 if (f==0) return mystrdup("(NULL)");
708 if (flag_mode == FLAG_LONG) {
709 ch[0] = (unsigned char) (f >> 8);
710 ch[1] = (unsigned char) (f - ((f >> 8) << 8));
711 ch[2] = '\0';
712 } else if (flag_mode == FLAG_NUM) {
713 sprintf((char *) ch, "%d", f);
714 } else if (flag_mode == FLAG_UNI) {
715 u16_u8((char *) &ch, 10, (w_char *) &f, 1);
716 } else {
717 ch[0] = (unsigned char) (f);
718 ch[1] = '\0';
719 }
720 return mystrdup((char *) ch);
721 }
722
723 // read in aff file and set flag mode
724 int HashMgr::load_config(const char * affpath, const char * key)
725 {
726 char * line; // io buffers
727 int firstline = 1;
728
729 // open the affix file
730 #ifdef HUNSPELL_CHROME_CLIENT
731 hunspell::LineIterator iterator = bdict_reader->GetOtherLineIterator();
732 FileMgr * afflst = new FileMgr(&iterator);
733 #else
734 FileMgr * afflst = new FileMgr(affpath, key);
735 #endif
736 if (!afflst) {
737 HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n ",affpath);
738 return 1;
739 }
740
741 // read in each line ignoring any that do not
742 // start with a known line type indicator
743
744 while ((line = afflst->getline()) != NULL) {
745 mychomp(line);
746
747 /* remove byte order mark */
748 if (firstline) {
749 firstline = 0;
750 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(l ine+3)+1);
751 }
752
753 /* parse in the try string */
754 if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
755 if (flag_mode != FLAG_CHAR) {
756 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions o f the FLAG affix file parameter\n", afflst->getlinenum());
757 }
758 if (strstr(line, "long")) flag_mode = FLAG_LONG;
759 if (strstr(line, "num")) flag_mode = FLAG_NUM;
760 if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
761 if (flag_mode == FLAG_CHAR) {
762 HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `lon g' or `UTF-8' parameter\n", afflst->getlinenum());
763 }
764 }
765 if (strncmp(line,"FORBIDDENWORD",13) == 0) {
766 char * st = NULL;
767 if (parse_string(line, &st, afflst->getlinenum())) {
768 delete afflst;
769 return 1;
770 }
771 forbiddenword = decode_flag(st);
772 free(st);
773 }
774 if (strncmp(line, "SET", 3) == 0) {
775 if (parse_string(line, &enc, afflst->getlinenum())) {
776 delete afflst;
777 return 1;
778 }
779 if (strcmp(enc, "UTF-8") == 0) {
780 utf8 = 1;
781 #ifndef OPENOFFICEORG
782 #ifndef MOZILLA_CLIENT
783 initialize_utf_tbl();
784 #endif
785 #endif
786 } else csconv = get_current_cs(enc);
787 }
788 if (strncmp(line, "LANG", 4) == 0) {
789 if (parse_string(line, &lang, afflst->getlinenum())) {
790 delete afflst;
791 return 1;
792 }
793 langnum = get_lang_num(lang);
794 }
795
796 /* parse in the ignored characters (for example, Arabic optional diacriti cs characters */
797 if (strncmp(line,"IGNORE",6) == 0) {
798 if (parse_array(line, &ignorechars, &ignorechars_utf16,
799 &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
800 delete afflst;
801 return 1;
802 }
803 }
804
805 if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
806 if (parse_aliasf(line, afflst)) {
807 delete afflst;
808 return 1;
809 }
810 }
811
812 if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
813 if (parse_aliasm(line, afflst)) {
814 delete afflst;
815 return 1;
816 }
817 }
818
819 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
820 if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && iss pace(line[3])) break;
821 }
822 if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING);
823 delete afflst;
824 return 0;
825 }
826
827 /* parse in the ALIAS table */
828 int HashMgr::parse_aliasf(char * line, FileMgr * af)
829 {
830 if (numaliasf != 0) {
831 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a f->getlinenum());
832 return 1;
833 }
834 char * tp = line;
835 char * piece;
836 int i = 0;
837 int np = 0;
838 piece = mystrsep(&tp, 0);
839 while (piece) {
840 if (*piece != '\0') {
841 switch(i) {
842 case 0: { np++; break; }
843 case 1: {
844 numaliasf = atoi(piece);
845 if (numaliasf < 1) {
846 numaliasf = 0;
847 aliasf = NULL;
848 aliasflen = NULL;
849 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu mber\n", af->getlinenum());
850 return 1;
851 }
852 aliasf = (unsigned short **) malloc(numaliasf * sizeof(un signed short *));
853 aliasflen = (unsigned short *) malloc(numaliasf * sizeof( short));
854 if (!aliasf || !aliasflen) {
855 numaliasf = 0;
856 if (aliasf) free(aliasf);
857 if (aliasflen) free(aliasflen);
858 aliasf = NULL;
859 aliasflen = NULL;
860 return 1;
861 }
862 np++;
863 break;
864 }
865 default: break;
866 }
867 i++;
868 }
869 piece = mystrsep(&tp, 0);
870 }
871 if (np != 2) {
872 numaliasf = 0;
873 free(aliasf);
874 free(aliasflen);
875 aliasf = NULL;
876 aliasflen = NULL;
877 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum( ));
878 return 1;
879 }
880
881 /* now parse the numaliasf lines to read in the remainder of the table */
882 char * nl;
883 for (int j=0; j < numaliasf; j++) {
884 if ((nl = af->getline()) == NULL) return 1;
885 mychomp(nl);
886 tp = nl;
887 i = 0;
888 aliasf[j] = NULL;
889 aliasflen[j] = 0;
890 piece = mystrsep(&tp, 0);
891 while (piece) {
892 if (*piece != '\0') {
893 switch(i) {
894 case 0: {
895 if (strncmp(piece,"AF",2) != 0) {
896 numaliasf = 0;
897 free(aliasf);
898 free(aliasflen);
899 aliasf = NULL;
900 aliasflen = NULL;
901 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
902 return 1;
903 }
904 break;
905 }
906 case 1: {
907 aliasflen[j] = (unsigned short) decode_flags(&(alias f[j]), piece, af);
908 flag_qsort(aliasf[j], 0, aliasflen[j]);
909 break;
910 }
911 default: break;
912 }
913 i++;
914 }
915 piece = mystrsep(&tp, 0);
916 }
917 if (!aliasf[j]) {
918 free(aliasf);
919 free(aliasflen);
920 aliasf = NULL;
921 aliasflen = NULL;
922 numaliasf = 0;
923 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af-> getlinenum());
924 return 1;
925 }
926 }
927 return 0;
928 }
929
930 #ifdef HUNSPELL_CHROME_CLIENT
931 int HashMgr::LoadAFLines()
932 {
933 utf8 = 1; // We always use UTF-8.
934
935 // Read in all the AF lines which tell us the rules for each affix group ID.
936 hunspell::LineIterator iterator = bdict_reader->GetAfLineIterator();
937 FileMgr afflst(&iterator);
938 while (char* line = afflst.getline()) {
939 int rv = parse_aliasf(line, &afflst);
940 if (rv)
941 return rv;
942 }
943
944 return 0;
945 }
946
947 hentry* HashMgr::InitHashEntry(hentry* entry,
948 size_t item_size,
949 const char* word,
950 int word_length,
951 int affix_index) const {
952 // Return if the given buffer doesn't have enough space for a hentry struct
953 // or the given word is too long.
954 // Our BDICT cannot handle words longer than (128 - 1) bytes. So, it is
955 // better to return an error if the given word is too long and prevent
956 // an unexpected result caused by a long word.
957 const int kMaxWordLen = 128;
958 if (item_size < sizeof(hentry) + word_length + 1 ||
959 word_length >= kMaxWordLen)
960 return NULL;
961
962 // Initialize a hentry struct with the given parameters, and
963 // append the given string at the end of this hentry struct.
964 memset(entry, 0, item_size);
965 FileMgr af(NULL);
966 entry->alen = static_cast<short>(
967 const_cast<HashMgr*>(this)->get_aliasf(affix_index, &entry->astr, &af));
968 entry->blen = static_cast<unsigned char>(word_length);
969 memcpy(&entry->word, word, word_length);
970
971 return entry;
972 }
973
974 hentry* HashMgr::CreateHashEntry(const char* word,
975 int word_length,
976 int affix_index) const {
977 // Return if the given word is too long.
978 // (See the comment in HashMgr::InitHashEntry().)
979 const int kMaxWordLen = 128;
980 if (word_length >= kMaxWordLen)
981 return NULL;
982
983 const size_t kEntrySize = sizeof(hentry) + word_length + 1;
984 struct hentry* entry = reinterpret_cast<hentry*>(malloc(kEntrySize));
985 if (entry)
986 InitHashEntry(entry, kEntrySize, word, word_length, affix_index);
987
988 return entry;
989 }
990
991 void HashMgr::DeleteHashEntry(hentry* entry) const {
992 free(entry);
993 }
994
995 hentry* HashMgr::AffixIDsToHentry(char* word,
996 int* affix_ids,
997 int affix_count) const
998 {
999 if (affix_count == 0)
1000 return NULL;
1001
1002 HEntryCache& cache = const_cast<HashMgr*>(this)->hentry_cache;
1003 std::string std_word(word);
1004 HEntryCache::iterator found = cache.find(std_word);
1005 if (found != cache.end()) {
1006 // We must return an existing hentry for the same word if we've previously
1007 // handed one out. Hunspell will compare pointers in some cases to see if
1008 // two words it has found are the same.
1009 return found->second;
1010 }
1011
1012 short word_len = static_cast<short>(strlen(word));
1013
1014 // We can get a number of prefixes per word. There will normally be only one,
1015 // but if not, there will be a linked list of "hentry"s for the "homonym"s
1016 // for the word.
1017 struct hentry* first_he = NULL;
1018 struct hentry* prev_he = NULL; // For making linked list.
1019 for (int i = 0; i < affix_count; i++) {
1020 struct hentry* he = CreateHashEntry(word, word_len, affix_ids[i]);
1021 if (!he)
1022 break;
1023 if (i == 0)
1024 first_he = he;
1025 if (prev_he)
1026 prev_he->next_homonym = he;
1027 prev_he = he;
1028 }
1029
1030 cache[std_word] = first_he; // Save this word in the cache for later.
1031 return first_he;
1032 }
1033
1034 hentry* HashMgr::GetHentryFromHEntryCache(char* word) {
1035 HEntryCache& cache = const_cast<HashMgr*>(this)->hentry_cache;
1036 std::string std_word(word);
1037 HEntryCache::iterator found = cache.find(std_word);
1038 if (found != cache.end())
1039 return found->second;
1040 else
1041 return NULL;
1042 }
1043 #endif
1044
1045 int HashMgr::is_aliasf() {
1046 return (aliasf != NULL);
1047 }
1048
1049 int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) {
1050 if ((index > 0) && (index <= numaliasf)) {
1051 *fvec = aliasf[index - 1];
1052 return aliasflen[index - 1];
1053 }
1054 HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->g etlinenum(), index);
1055 *fvec = NULL;
1056 return 0;
1057 }
1058
1059 /* parse morph alias definitions */
1060 int HashMgr::parse_aliasm(char * line, FileMgr * af)
1061 {
1062 if (numaliasm != 0) {
1063 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", a f->getlinenum());
1064 return 1;
1065 }
1066 char * tp = line;
1067 char * piece;
1068 int i = 0;
1069 int np = 0;
1070 piece = mystrsep(&tp, 0);
1071 while (piece) {
1072 if (*piece != '\0') {
1073 switch(i) {
1074 case 0: { np++; break; }
1075 case 1: {
1076 numaliasm = atoi(piece);
1077 if (numaliasm < 1) {
1078 HUNSPELL_WARNING(stderr, "error: line %d: bad entry nu mber\n", af->getlinenum());
1079 return 1;
1080 }
1081 aliasm = (char **) malloc(numaliasm * sizeof(char *));
1082 if (!aliasm) {
1083 numaliasm = 0;
1084 return 1;
1085 }
1086 np++;
1087 break;
1088 }
1089 default: break;
1090 }
1091 i++;
1092 }
1093 piece = mystrsep(&tp, 0);
1094 }
1095 if (np != 2) {
1096 numaliasm = 0;
1097 free(aliasm);
1098 aliasm = NULL;
1099 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum( ));
1100 return 1;
1101 }
1102
1103 /* now parse the numaliasm lines to read in the remainder of the table */
1104 char * nl = line;
1105 for (int j=0; j < numaliasm; j++) {
1106 if ((nl = af->getline()) == NULL) return 1;
1107 mychomp(nl);
1108 tp = nl;
1109 i = 0;
1110 aliasm[j] = NULL;
1111 piece = mystrsep(&tp, ' ');
1112 while (piece) {
1113 if (*piece != '\0') {
1114 switch(i) {
1115 case 0: {
1116 if (strncmp(piece,"AM",2) != 0) {
1117 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
1118 numaliasm = 0;
1119 free(aliasm);
1120 aliasm = NULL;
1121 return 1;
1122 }
1123 break;
1124 }
1125 case 1: {
1126 // add the remaining of the line
1127 if (*tp) {
1128 *(tp - 1) = ' ';
1129 tp = tp + strlen(tp);
1130 }
1131 if (complexprefixes) {
1132 if (utf8) reverseword_utf(piece);
1133 else reverseword(piece);
1134 }
1135 aliasm[j] = mystrdup(piece);
1136 if (!aliasm[j]) {
1137 numaliasm = 0;
1138 free(aliasm);
1139 aliasm = NULL;
1140 return 1;
1141 }
1142 break; }
1143 default: break;
1144 }
1145 i++;
1146 }
1147 piece = mystrsep(&tp, ' ');
1148 }
1149 if (!aliasm[j]) {
1150 numaliasm = 0;
1151 free(aliasm);
1152 aliasm = NULL;
1153 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af-> getlinenum());
1154 return 1;
1155 }
1156 }
1157 return 0;
1158 }
1159
1160 int HashMgr::is_aliasm() {
1161 return (aliasm != NULL);
1162 }
1163
1164 char * HashMgr::get_aliasm(int index) {
1165 if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
1166 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
1167 return NULL;
1168 }
OLDNEW
« no previous file with comments | « third_party/hunspell_new/src/hunspell/hashmgr.hxx ('k') | third_party/hunspell_new/src/hunspell/htypes.hxx » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698