Index: chrome/third_party/hunspell/src/hunspell/hunspell.cxx |
=================================================================== |
--- chrome/third_party/hunspell/src/hunspell/hunspell.cxx (revision 21721) |
+++ chrome/third_party/hunspell/src/hunspell/hunspell.cxx (working copy) |
@@ -6,16 +6,17 @@ |
#include <cstring> |
#include <cstdio> |
#else |
-#include <stdlib.h> |
+#include <stdlib.h> |
#include <string.h> |
-#include <stdio.h> |
+#include <stdio.h> |
#endif |
#include "hunspell.hxx" |
#include "hunspell.h" |
+#include "csutil.hxx" |
#ifndef MOZILLA_CLIENT |
-#ifndef W32 |
+#ifndef WIN32 |
using namespace std; |
#endif |
#endif |
@@ -23,27 +24,34 @@ |
#ifdef HUNSPELL_CHROME_CLIENT |
Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length) |
#else |
-Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle) |
+Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle, const char * key = NULL) |
#endif |
{ |
encoding = NULL; |
csconv = NULL; |
utf8 = 0; |
complexprefixes = 0; |
+#ifndef HUNSPELL_CHROME_CLIENT |
+ affixpath = mystrdup(affpath); |
+#endif |
+ maxdic = 0; |
#ifdef HUNSPELL_CHROME_CLIENT |
bdict_reader = new hunspell::BDictReader; |
bdict_reader->Init(bdict_data, bdict_length); |
- pHMgr = new HashMgr(bdict_reader); |
- pAMgr = new AffixMgr(bdict_reader, pHMgr); |
+ pHMgr[0] = new HashMgr(bdict_reader); |
+ if (pHMgr[0]) maxdic = 1; |
+ |
+ pAMgr = new AffixMgr(bdict_reader, pHMgr, &maxdic); |
#else |
/* first set up the hash manager */ |
- pHMgr = new HashMgr(dic_handle, aff_handle); |
+ pHMgr[0] = new HashMgr(dic_handle, aff_handle, key); |
+ if (pHMgr[0]) maxdic = 1; |
/* next set up the affix manager */ |
/* it needs access to the hash manager lookup methods */ |
- pAMgr = new AffixMgr(aff_handle, pHMgr); |
+ pAMgr = new AffixMgr(aff_handle, pHMgr, &maxdic, key); |
#endif |
/* get the preferred try string and the dictionary */ |
@@ -65,10 +73,13 @@ |
{ |
if (pSMgr) delete pSMgr; |
if (pAMgr) delete pAMgr; |
- if (pHMgr) delete pHMgr; |
+ for (int i = 0; i < maxdic; i++) delete pHMgr[i]; |
+ maxdic = 0; |
pSMgr = NULL; |
pAMgr = NULL; |
- pHMgr = NULL; |
+#ifdef MOZILLA_CLIENT |
+ free(csconv); |
+#endif |
csconv= NULL; |
if (encoding) free(encoding); |
encoding = NULL; |
@@ -76,27 +87,38 @@ |
#ifdef HUNSPELL_CHROME_CLIENT |
if (bdict_reader) delete bdict_reader; |
bdict_reader = NULL; |
+#else |
+ if (affixpath) free(affixpath); |
+ affixpath = NULL; |
#endif |
} |
+#ifndef HUNSPELL_CHROME_CLIENT |
+// load extra dictionaries |
+int Hunspell::add_dic(const char * dpath, const char * key) { |
+ if (maxdic == MAXDIC || !affixpath) return 1; |
+ pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); |
+ if (pHMgr[maxdic]) maxdic++; else return 1; |
+ return 0; |
+} |
+#endif |
// make a copy of src at destination while removing all leading |
// blanks and removing any trailing periods after recording |
// their presence with the abbreviation flag |
-// also since already going through character by character, |
+// also since already going through character by character, |
// set the capitalization type |
// return the length of the "cleaned" (and UTF-8 encoded) word |
-int Hunspell::cleanword2(char * dest, const char * src, |
+int Hunspell::cleanword2(char * dest, const char * src, |
w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) |
-{ |
+{ |
unsigned char * p = (unsigned char *) dest; |
const unsigned char * q = (const unsigned char * ) src; |
- int firstcap = 0; |
// first skip over any leading blanks |
while ((*q != '\0') && (*q == ' ')) q++; |
- |
+ |
// now strip off any trailing periods (recording their presence) |
*pabbrev = 0; |
int nl = strlen((const char *)q); |
@@ -104,80 +126,43 @@ |
nl--; |
(*pabbrev)++; |
} |
- |
+ |
// if no characters are left it can't be capitalized |
- if (nl <= 0) { |
+ if (nl <= 0) { |
*pcaptype = NOCAP; |
*p = '\0'; |
return 0; |
} |
- // now determine the capitalization type of the first nl letters |
- int ncap = 0; |
- int nneutral = 0; |
- *nc = 0; |
- |
- if (!utf8) { |
- while (nl > 0) { |
- (*nc)++; |
- if (csconv[(*q)].ccase) ncap++; |
- if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; |
- *p++ = *q++; |
- nl--; |
- } |
- // remember to terminate the destination string |
- *p = '\0'; |
- if (ncap) { |
- firstcap = csconv[(unsigned char)(*dest)].ccase; |
- } |
- } else { |
- unsigned short idx; |
- *nc = u8_u16(dest_utf, MAXWORDLEN, (const char *) q); |
+ strncpy(dest, (char *) q, nl); |
+ *(dest + nl) = '\0'; |
+ nl = strlen(dest); |
+ if (utf8) { |
+ *nc = u8_u16(dest_utf, MAXWORDLEN, dest); |
// don't check too long words |
if (*nc >= MAXWORDLEN) return 0; |
if (*nc == -1) { // big Unicode character (non BMP area) |
*pcaptype = NOCAP; |
- strcpy((char *) p, (char *) q); |
- return strlen(dest); |
+ return nl; |
} |
- *nc -= *pabbrev; |
- for (int i = 0; i < *nc; i++) { |
- idx = (dest_utf[i].h << 8) + dest_utf[i].l; |
- if (idx != unicodetolower(idx, langnum)) ncap++; |
- if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; |
- } |
- u16_u8(dest, MAXWORDUTF8LEN, dest_utf, *nc); |
- if (ncap) { |
- idx = (dest_utf[0].h << 8) + dest_utf[0].l; |
- firstcap = (idx != unicodetolower(idx, langnum)); |
- } |
- } |
- |
- // now finally set the captype |
- if (ncap == 0) { |
- *pcaptype = NOCAP; |
- } else if ((ncap == 1) && firstcap) { |
- *pcaptype = INITCAP; |
- } else if ((ncap == *nc) || ((ncap + nneutral) == *nc)) { |
- *pcaptype = ALLCAP; |
- } else if ((ncap > 1) && firstcap) { |
- *pcaptype = HUHINITCAP; |
+ *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); |
} else { |
- *pcaptype = HUHCAP; |
+ *pcaptype = get_captype(dest, nl, csconv); |
+ *nc = nl; |
} |
- return strlen(dest); |
-} |
+ return nl; |
+} |
-int Hunspell::cleanword(char * dest, const char * src, |
+int Hunspell::cleanword(char * dest, const char * src, |
int * pcaptype, int * pabbrev) |
-{ |
+{ |
unsigned char * p = (unsigned char *) dest; |
const unsigned char * q = (const unsigned char * ) src; |
int firstcap = 0; |
// first skip over any leading blanks |
while ((*q != '\0') && (*q == ' ')) q++; |
- |
+ |
// now strip off any trailing periods (recording their presence) |
*pabbrev = 0; |
int nl = strlen((const char *)q); |
@@ -185,9 +170,9 @@ |
nl--; |
(*pabbrev)++; |
} |
- |
+ |
// if no characters are left it can't be capitalized |
- if (nl <= 0) { |
+ if (nl <= 0) { |
*pcaptype = NOCAP; |
*p = '\0'; |
return 0; |
@@ -215,8 +200,9 @@ |
nc = u8_u16(t, MAXWORDLEN, src); |
for (int i = 0; i < nc; i++) { |
idx = (t[i].h << 8) + t[i].l; |
- if (idx != unicodetolower(idx, langnum)) ncap++; |
- if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; |
+ unsigned short low = unicodetolower(idx, langnum); |
+ if (idx != low) ncap++; |
+ if (unicodetoupper(idx, langnum) == low) nneutral++; |
} |
u16_u8(dest, MAXWORDUTF8LEN, t, nc); |
if (ncap) { |
@@ -238,8 +224,7 @@ |
*pcaptype = HUHCAP; |
} |
return strlen(dest); |
-} |
- |
+} |
void Hunspell::mkallcap(char * p) |
{ |
@@ -256,7 +241,7 @@ |
} |
u16_u8(p, MAXWORDUTF8LEN, u, nc); |
} else { |
- while (*p != '\0') { |
+ while (*p != '\0') { |
*p = csconv[((unsigned char) *p)].cupper; |
p++; |
} |
@@ -269,15 +254,16 @@ |
unsigned short idx; |
for (int i = 0; i < nc; i++) { |
idx = (u[i].h << 8) + u[i].l; |
- if (idx != unicodetoupper(idx, langnum)) { |
- u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); |
- u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); |
+ unsigned short up = unicodetoupper(idx, langnum); |
+ if (idx != up) { |
+ u[i].h = (unsigned char) (up >> 8); |
+ u[i].l = (unsigned char) (up & 0x00FF); |
} |
} |
u16_u8(p, MAXWORDUTF8LEN, u, nc); |
- return strlen(p); |
+ return strlen(p); |
} else { |
- while (*p != '\0') { |
+ while (*p != '\0') { |
*p = csconv[((unsigned char) *p)].cupper; |
p++; |
} |
@@ -288,7 +274,7 @@ |
void Hunspell::mkallsmall(char * p) |
{ |
- while (*p != '\0') { |
+ while (*p != '\0') { |
*p = csconv[((unsigned char) *p)].clower; |
p++; |
} |
@@ -300,15 +286,16 @@ |
unsigned short idx; |
for (int i = 0; i < nc; i++) { |
idx = (u[i].h << 8) + u[i].l; |
- if (idx != unicodetolower(idx, langnum)) { |
- u[i].h = (unsigned char) (unicodetolower(idx, langnum) >> 8); |
- u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF); |
+ unsigned short low = unicodetolower(idx, langnum); |
+ if (idx != low) { |
+ u[i].h = (unsigned char) (low >> 8); |
+ u[i].l = (unsigned char) (low & 0x00FF); |
} |
} |
u16_u8(p, MAXWORDUTF8LEN, u, nc); |
return strlen(p); |
} else { |
- while (*p != '\0') { |
+ while (*p != '\0') { |
*p = csconv[((unsigned char) *p)].clower; |
p++; |
} |
@@ -322,18 +309,18 @@ |
*p = *source; |
for (p++, source++; *(source - 1); p++, source++) { |
*p = *source; |
- if (*source == '\x9f') *--p = '\xdf'; |
+ if (*source == '\x9F') *--p = '\xDF'; |
} |
return dest; |
} |
-// recursive search for right ss-\xdf permutations |
+// recursive search for right ss - sharp s permutations |
hentry * Hunspell::spellsharps(char * base, char * pos, int n, |
int repnum, char * tmp, int * info, char **root) { |
pos = strstr(pos, "ss"); |
if (pos && (n < MAXSHARPS)) { |
- *pos = '\xc3'; |
- *(pos + 1) = '\x9f'; |
+ *pos = '\xC3'; |
+ *(pos + 1) = '\x9F'; |
hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root); |
if (h) return h; |
*pos = 's'; |
@@ -352,31 +339,32 @@ |
TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); |
} |
-/* check and insert a word to beginning of the suggestion array */ |
-int Hunspell::insert_sug(char ***slst, char * word, int *ns) { |
- if (spell(word)) { |
- if (*ns == MAXSUGGESTION) { |
- (*ns)--; |
- free((*slst)[*ns]); |
- } |
- for (int k = *ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; |
- (*slst)[0] = mystrdup(word); |
- (*ns)++; |
+/* insert a word to the beginning of the suggestion array and return ns */ |
+int Hunspell::insert_sug(char ***slst, char * word, int ns) { |
+ char * dup = mystrdup(word); |
+ if (!dup) return ns; |
+ if (ns == MAXSUGGESTION) { |
+ ns--; |
+ free((*slst)[ns]); |
} |
- return 0; |
+ for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; |
+ (*slst)[0] = dup; |
+ return ns + 1; |
} |
int Hunspell::spell(const char * word, int * info, char ** root) |
{ |
#ifdef HUNSPELL_CHROME_CLIENT |
- if (pHMgr) pHMgr->EmptyHentryCache(); |
+ if (pHMgr) pHMgr[0]->EmptyHentryCache(); |
#endif |
struct hentry * rv=NULL; |
// need larger vector. For example, Turkish capital letter I converted a |
// 2-byte UTF-8 character (dotless i) by mkallsmall. |
- char cw[MAXWORDUTF8LEN + 4]; |
- char wspace[MAXWORDUTF8LEN + 4]; |
- w_char unicw[MAXWORDLEN + 1]; |
+ char cw[MAXWORDUTF8LEN]; |
+ char wspace[MAXWORDUTF8LEN]; |
+ w_char unicw[MAXWORDLEN]; |
+ // Hunspell supports XML input of the simplified API (see manual) |
+ if (strcmp(word, SPELL_XML) == 0) return 1; |
int nc = strlen(word); |
int wl2 = 0; |
if (utf8) { |
@@ -386,14 +374,18 @@ |
} |
int captype = 0; |
int abbv = 0; |
- int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); |
+ int wl = 0; |
- if (wl == 0) return 1; |
+ // input conversion |
+ RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; |
+ if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); |
+ else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); |
- if (info) *info = 0; |
+ int info2 = 0; |
+ if (wl == 0 || maxdic == 0) return 1; |
if (root) *root = NULL; |
- // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.) |
+ // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.) |
enum { NBEGIN, NNUM, NSEP }; |
int nstate = NBEGIN; |
int i; |
@@ -407,173 +399,179 @@ |
} else break; |
} |
if ((i == wl) && (nstate == NNUM)) return 1; |
+ if (!info) info = &info2; else *info = 0; |
- // LANG_hu section: number(s) + (percent or degree) with suffixes |
- if (langnum == LANG_hu) { |
- if ((nstate == NNUM) && ((cw[i] == '%') || (cw[i] == '\xb0')) |
- && checkword(cw + i, info, root)) return 1; |
- } |
- // END of LANG_hu section |
- |
switch(captype) { |
- case HUHCAP: |
- case HUHINITCAP: |
- case NOCAP: { |
- rv = checkword(cw, info, root); |
- if ((abbv) && !(rv)) { |
- memcpy(wspace,cw,wl); |
- *(wspace+wl) = '.'; |
- *(wspace+wl+1) = '\0'; |
- rv = checkword(wspace, info, root); |
- } |
- break; |
- } |
+ case HUHCAP: |
+ case HUHINITCAP: |
+ case NOCAP: { |
+ rv = checkword(cw, info, root); |
+ if ((abbv) && !(rv)) { |
+ memcpy(wspace,cw,wl); |
+ *(wspace+wl) = '.'; |
+ *(wspace+wl+1) = '\0'; |
+ rv = checkword(wspace, info, root); |
+ } |
+ break; |
+ } |
case ALLCAP: { |
- rv = checkword(cw, info, root); |
- if (rv) break; |
- if (abbv) { |
- memcpy(wspace,cw,wl); |
- *(wspace+wl) = '.'; |
- *(wspace+wl+1) = '\0'; |
- rv = checkword(wspace, info, root); |
- if (rv) break; |
- } |
- if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { |
- char tmpword[MAXWORDUTF8LEN]; |
- wl = mkallsmall2(cw, unicw, nc); |
- memcpy(wspace,cw,(wl+1)); |
+ rv = checkword(cw, info, root); |
+ if (rv) break; |
+ if (abbv) { |
+ memcpy(wspace,cw,wl); |
+ *(wspace+wl) = '.'; |
+ *(wspace+wl+1) = '\0'; |
+ rv = checkword(wspace, info, root); |
+ if (rv) break; |
+ } |
+ // Spec. prefix handling for Catalan, French, Italian: |
+ // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). |
+ if (pAMgr && strchr(cw, '\'')) { |
+ wl = mkallsmall2(cw, unicw, nc); |
+ char * apostrophe = strchr(cw, '\''); |
+ if (utf8) { |
+ w_char tmpword[MAXWORDLEN]; |
+ *apostrophe = '\0'; |
+ wl2 = u8_u16(tmpword, MAXWORDLEN, cw); |
+ *apostrophe = '\''; |
+ if (wl2 < nc) { |
+ mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1); |
+ rv = checkword(cw, info, root); |
+ if (rv) break; |
+ } |
+ } else { |
+ mkinitcap2(apostrophe + 1, unicw, nc); |
+ rv = checkword(cw, info, root); |
+ if (rv) break; |
+ } |
+ mkinitcap2(cw, unicw, nc); |
+ rv = checkword(cw, info, root); |
+ if (rv) break; |
+ } |
+ if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { |
+ char tmpword[MAXWORDUTF8LEN]; |
+ wl = mkallsmall2(cw, unicw, nc); |
+ memcpy(wspace,cw,(wl+1)); |
+ rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); |
+ if (!rv) { |
+ wl2 = mkinitcap2(cw, unicw, nc); |
+ rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); |
+ } |
+ if ((abbv) && !(rv)) { |
+ *(wspace+wl) = '.'; |
+ *(wspace+wl+1) = '\0'; |
+ rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); |
+ if (!rv) { |
+ memcpy(wspace, cw, wl2); |
+ *(wspace+wl2) = '.'; |
+ *(wspace+wl2+1) = '\0'; |
rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); |
- if (!rv) { |
- wl2 = mkinitcap2(cw, unicw, nc); |
- rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); |
- } |
- if ((abbv) && !(rv)) { |
- *(wspace+wl) = '.'; |
- *(wspace+wl+1) = '\0'; |
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); |
- if (!rv) { |
- memcpy(wspace, cw, wl2); |
- *(wspace+wl2) = '.'; |
- *(wspace+wl2+1) = '\0'; |
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); |
- } |
- } |
- if (rv) break; |
} |
} |
- case INITCAP: { |
- wl = mkallsmall2(cw, unicw, nc); |
- memcpy(wspace,cw,(wl+1)); |
- rv = checkword(wspace, info, root); |
- if (!rv || (is_keepcase(rv) && !((captype == INITCAP) && |
- // if CHECKSHARPS: KEEPCASE words with \xdf are allowed |
- // in INITCAP form, too. |
- pAMgr->get_checksharps() && ((utf8 && strstr(wspace, "\xc3\x9f")) || |
- (!utf8 && strchr(wspace, '\xdf')))))) { |
- wl2 = mkinitcap2(cw, unicw, nc); |
- rv = checkword(cw, info, root); |
- if (rv && (captype == ALLCAP) && is_keepcase(rv)) rv = NULL; |
- } |
- if (abbv && !rv) { |
- *(wspace+wl) = '.'; |
- *(wspace+wl+1) = '\0'; |
- rv = checkword(wspace, info, root); |
- if (!rv || is_keepcase(rv)) { |
- memcpy(wspace, cw, wl2); |
- *(wspace+wl2) = '.'; |
- *(wspace+wl2+1) = '\0'; |
- rv = checkword(wspace, info, root); |
- if (rv && ((captype == ALLCAP) && is_keepcase(rv))) rv = NULL; |
- } |
- } |
- break; |
- } |
+ if (rv) break; |
+ } |
+ } |
+ case INITCAP: { |
+ wl = mkallsmall2(cw, unicw, nc); |
+ memcpy(wspace,cw,(wl+1)); |
+ wl2 = mkinitcap2(cw, unicw, nc); |
+ if (captype == INITCAP) *info += SPELL_INITCAP; |
+ rv = checkword(cw, info, root); |
+ if (captype == INITCAP) *info -= SPELL_INITCAP; |
+ // forbid bad capitalization |
+ // (for example, ijs -> Ijs instead of IJs in Dutch) |
+ // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) |
+ if (*info & SPELL_FORBIDDEN) { |
+ rv = NULL; |
+ break; |
+ } |
+ if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; |
+ if (rv) break; |
+ |
+ rv = checkword(wspace, info, root); |
+ if (abbv && !rv) { |
+ |
+ *(wspace+wl) = '.'; |
+ *(wspace+wl+1) = '\0'; |
+ rv = checkword(wspace, info, root); |
+ if (!rv) { |
+ memcpy(wspace, cw, wl2); |
+ *(wspace+wl2) = '.'; |
+ *(wspace+wl2+1) = '\0'; |
+ if (captype == INITCAP) *info += SPELL_INITCAP; |
+ rv = checkword(wspace, info, root); |
+ if (captype == INITCAP) *info -= SPELL_INITCAP; |
+ if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; |
+ break; |
+ } |
+ } |
+ if (rv && is_keepcase(rv) && |
+ ((captype == ALLCAP) || |
+ // if CHECKSHARPS: KEEPCASE words with \xDF are allowed |
+ // in INITCAP form, too. |
+ !(pAMgr->get_checksharps() && |
+ ((utf8 && strstr(wspace, "\xC3\x9F")) || |
+ (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL; |
+ break; |
+ } |
} |
- |
+ |
if (rv) return 1; |
- // recursive breaking at break points (not good for morphological analysis) |
+ // recursive breaking at break points |
if (wordbreak) { |
char * s; |
char r; |
- for (int j = 0; j < pAMgr->get_numbreak(); j++) { |
+ int corr = 0; |
+ wl = strlen(cw); |
+ int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; |
+ // check boundary patterns (^begin and end$) |
+ for (int j = 0; j < numbreak; j++) { |
+ int plen = strlen(wordbreak[j]); |
+ if (plen == 1 || plen > wl) continue; |
+ if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 |
+ && spell(cw + plen - 1)) return 1; |
+ if (wordbreak[j][plen - 1] == '$' && |
+ strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { |
+ r = cw[wl - plen + 1]; |
+ cw[wl - plen + 1] = '\0'; |
+ if (spell(cw)) return 1; |
+ cw[wl - plen + 1] = r; |
+ } |
+ } |
+ // other patterns |
+ for (int j = 0; j < numbreak; j++) { |
+ int result = 0; |
+ int plen = strlen(wordbreak[j]); |
s=(char *) strstr(cw, wordbreak[j]); |
- if (s) { |
+ if (s && (s > cw) && (s < cw + wl - plen)) { |
+ if (!spell(s + plen)) continue; |
r = *s; |
*s = '\0'; |
// examine 2 sides of the break point |
- if (spell(cw) && spell(s + strlen(wordbreak[j]))) { |
- *s = r; |
- return 1; |
- } |
+ if (spell(cw)) return 1; |
*s = r; |
+ |
+ // LANG_hu: spec. dash rule |
+ if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { |
+ r = s[1]; |
+ s[1] = '\0'; |
+ if (spell(cw)) return 1; // check the first part with dash |
+ s[1] = r; |
+ } |
+ // end of LANG specific region |
+ |
} |
} |
} |
- // LANG_hu: compoundings with dashes and n-dashes XXX deprecated! |
- if (langnum == LANG_hu) { |
- int n; |
- // compound word with dash (HU) I18n |
- char * dash; |
- int result = 0; |
- // n-dash |
- dash = (char *) strstr(cw,"\xe2\x80\x93"); |
- if (dash && !wordbreak) { |
- *dash = '\0'; |
- // examine 2 sides of the dash |
- if (spell(cw) && spell(dash + 3)) { |
- *dash = '\xe2'; |
- return 1; |
- } |
- *dash = '\xe2'; |
- } |
- dash = (char *) strchr(cw,'-'); |
- if (dash) { |
- *dash='\0'; |
- // examine 2 sides of the dash |
- if (dash[1] == '\0') { // base word ending with dash |
- if (spell(cw)) return 1; |
- } else { |
- // first word ending with dash: word- |
- char r2 = *(dash + 1); |
- dash[0]='-'; |
- dash[1]='\0'; |
- result = spell(cw); |
- dash[1] = r2; |
- dash[0]='\0'; |
- if (result && spell(dash+1) && ((strlen(dash+1) > 1) || (dash[1] == 'e') || |
- ((dash[1] > '0') && (dash[1] < '9')))) return 1; |
- } |
- // affixed number in correct word |
- if (result && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)>='.'))) { |
- *dash='-'; |
- n = 1; |
- if (*(dash - n) == '.') n++; |
- // search first not a number character to left from dash |
- while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { |
- n++; |
- } |
- if ((dash - n) < cw) n--; |
- // numbers: deprecated |
- for(; n >= 1; n--) { |
- if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && |
- checkword(dash - n, info, root)) return 1; |
- } |
- } |
- } |
- } |
return 0; |
} |
-//int Hunspell::spell(const char * word) { |
-// return spell(word, NULL, NULL); |
-//} |
- |
struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) |
{ |
struct hentry * he = NULL; |
- int len; |
+ int len, i; |
char w2[MAXWORDUTF8LEN]; |
const char * word; |
@@ -600,26 +598,29 @@ |
} |
// look word in hash table |
- if (pHMgr) he = pHMgr->lookup(word); |
+ for (i = 0; (i < maxdic) && !he; i ++) { |
+ he = (pHMgr[i])->lookup(word); |
// check forbidden and onlyincompound words |
if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { |
- info += SPELL_FORBIDDEN; |
+ if (info) *info += SPELL_FORBIDDEN; |
// LANG_hu section: set dash information for suggestions |
if (langnum == LANG_hu) { |
if (pAMgr->get_compoundflag() && |
TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { |
- info += SPELL_COMPOUND; |
+ if (info) *info += SPELL_COMPOUND; |
} |
} |
return NULL; |
} |
- // he = next not pseudoroot and not onlyincompound homonym or NULL |
+ // he = next not needaffix, onlyincompound homonym or onlyupcase word |
while (he && (he->astr) && |
- ((pAMgr->get_pseudoroot() && TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen)) || |
- (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) |
+ ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || |
+ (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || |
+ (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) |
)) he = he->next_homonym; |
+ } |
// check with affixes |
if (!he && pAMgr) { |
@@ -627,38 +628,42 @@ |
len = strlen(word); |
he = pAMgr->affix_check(word, len, 0); |
- // check compound restriction |
- if (he && he->astr && pAMgr->get_onlyincompound() && |
- TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) he = NULL; |
+ // check compound restriction and onlyupcase |
+ if (he && he->astr && ( |
+ (pAMgr->get_onlyincompound() && |
+ TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || |
+ (info && (*info & SPELL_INITCAP) && |
+ TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { |
+ he = NULL; |
+ } |
if (he) { |
if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { |
- info += SPELL_FORBIDDEN; |
+ if (info) *info += SPELL_FORBIDDEN; |
return NULL; |
} |
if (root) { |
- *root = mystrdup(he->word); |
- if (complexprefixes) { |
+ *root = mystrdup(&(he->word)); |
+ if (*root && complexprefixes) { |
if (utf8) reverseword_utf(*root); else reverseword(*root); |
} |
} |
// try check compound word |
} else if (pAMgr->get_compound()) { |
- he = pAMgr->compound_check(word, len, |
- 0,0,100,0,NULL,0,NULL,NULL,0); |
+ he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0); |
// LANG_hu section: `moving rule' with last dash |
- if ((!he) && (langnum == LANG_hu) && (word[len-1]=='-')) { |
+ if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) { |
char * dup = mystrdup(word); |
+ if (!dup) return NULL; |
dup[len-1] = '\0'; |
- he = pAMgr->compound_check(dup, len-1, |
- -5,0,100,0,NULL,1,NULL,NULL,0); |
+ he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0); |
free(dup); |
} |
- // end of LANG speficic region |
+ // end of LANG specific region |
if (he) { |
if (root) { |
- *root = mystrdup(he->word); |
- if (complexprefixes) { |
+ *root = mystrdup(&(he->word)); |
+ if (*root && complexprefixes) { |
if (utf8) reverseword_utf(*root); else reverseword(*root); |
} |
} |
@@ -674,12 +679,18 @@ |
int Hunspell::suggest(char*** slst, const char * word) |
{ |
#ifdef HUNSPELL_CHROME_CLIENT |
- if (pHMgr) pHMgr->EmptyHentryCache(); |
+ if (pHMgr) pHMgr[0]->EmptyHentryCache(); |
#endif |
- char cw[MAXWORDUTF8LEN + 4]; |
- char wspace[MAXWORDUTF8LEN + 4]; |
- if (! pSMgr) return 0; |
- w_char unicw[MAXWORDLEN + 1]; |
+ int onlycmpdsug = 0; |
+ char cw[MAXWORDUTF8LEN]; |
+ char wspace[MAXWORDUTF8LEN]; |
+ if (!pSMgr || maxdic == 0) return 0; |
+ w_char unicw[MAXWORDLEN]; |
+ *slst = NULL; |
+ // process XML input of the simplified API (see manual) |
+ if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { |
+ return spellml(slst, word); |
+ } |
int nc = strlen(word); |
if (utf8) { |
if (nc >= MAXWORDUTF8LEN) return 0; |
@@ -688,49 +699,73 @@ |
} |
int captype = 0; |
int abbv = 0; |
- int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); |
+ int wl = 0; |
+ |
+ // input conversion |
+ RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; |
+ if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); |
+ else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); |
+ |
if (wl == 0) return 0; |
int ns = 0; |
- *slst = NULL; |
int capwords = 0; |
- int ngramsugs = 0; |
switch(captype) { |
- case NOCAP: { |
- ns = pSMgr->suggest(slst, cw, ns); |
+ case NOCAP: { |
+ ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); |
break; |
} |
- case INITCAP: { |
+ case INITCAP: { |
capwords = 1; |
- ns = pSMgr->suggest(slst, cw, ns); |
+ ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); |
if (ns == -1) break; |
memcpy(wspace,cw,(wl+1)); |
mkallsmall2(wspace, unicw, nc); |
- ns = pSMgr->suggest(slst, wspace, ns); |
+ ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
break; |
} |
case HUHINITCAP: |
capwords = 1; |
- case HUHCAP: { |
- ns = pSMgr->suggest(slst, cw, ns); |
+ case HUHCAP: { |
+ ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); |
if (ns != -1) { |
int prevns; |
+ // something.The -> something. The |
+ char * dot = strchr(cw, '.'); |
+ if (dot && (dot > cw)) { |
+ int captype_; |
+ if (utf8) { |
+ w_char w_[MAXWORDLEN]; |
+ int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); |
+ captype_ = get_captype_utf8(w_, wl_, langnum); |
+ } else captype_ = get_captype(dot+1, strlen(dot+1), csconv); |
+ if (captype_ == INITCAP) { |
+ char * st = mystrdup(cw); |
+ if (st) st = (char *) realloc(st, wl + 2); |
+ if (st) { |
+ st[(dot - cw) + 1] = ' '; |
+ strcpy(st + (dot - cw) + 2, dot + 1); |
+ ns = insert_sug(slst, st, ns); |
+ free(st); |
+ } |
+ } |
+ } |
if (captype == HUHINITCAP) { |
// TheOpenOffice.org -> The OpenOffice.org |
memcpy(wspace,cw,(wl+1)); |
mkinitsmall2(wspace, unicw, nc); |
- ns = pSMgr->suggest(slst, wspace, ns); |
+ ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
} |
memcpy(wspace,cw,(wl+1)); |
mkallsmall2(wspace, unicw, nc); |
- insert_sug(slst, wspace, &ns); |
+ if (spell(wspace)) ns = insert_sug(slst, wspace, ns); |
prevns = ns; |
- ns = pSMgr->suggest(slst, wspace, ns); |
+ ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
if (captype == HUHINITCAP) { |
mkinitcap2(wspace, unicw, nc); |
- insert_sug(slst, wspace, &ns); |
- ns = pSMgr->suggest(slst, wspace, ns); |
+ if (spell(wspace)) ns = insert_sug(slst, wspace, ns); |
+ ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
} |
// aNew -> "a New" (instead of "a new") |
for (int j = prevns; j < ns; j++) { |
@@ -739,7 +774,7 @@ |
int slen = strlen(space + 1); |
// different case after space (need capitalisation) |
if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) { |
- w_char w[MAXWORDLEN + 1]; |
+ w_char w[MAXWORDLEN]; |
int wc = 0; |
char * r = (*slst)[j]; |
if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1); |
@@ -754,31 +789,32 @@ |
break; |
} |
- case ALLCAP: { |
+ case ALLCAP: { |
memcpy(wspace, cw, (wl+1)); |
mkallsmall2(wspace, unicw, nc); |
- ns = pSMgr->suggest(slst, wspace, ns); |
+ ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
if (ns == -1) break; |
- if (pAMgr && pAMgr->get_keepcase()) insert_sug(slst, wspace, &ns); |
+ if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) |
+ ns = insert_sug(slst, wspace, ns); |
mkinitcap2(wspace, unicw, nc); |
- ns = pSMgr->suggest(slst, wspace, ns); |
+ ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
for (int j=0; j < ns; j++) { |
mkallcap((*slst)[j]); |
if (pAMgr && pAMgr->get_checksharps()) { |
char * pos; |
if (utf8) { |
- pos = strstr((*slst)[j], "\xc3\x9f"); |
+ pos = strstr((*slst)[j], "\xC3\x9F"); |
while (pos) { |
*pos = 'S'; |
*(pos+1) = 'S'; |
- pos = strstr(pos+2, "\xc3\x9f"); |
+ pos = strstr(pos+2, "\xC3\x9F"); |
} |
} else { |
- pos = strchr((*slst)[j], '\xdf'); |
+ pos = strchr((*slst)[j], '\xDF'); |
while (pos) { |
(*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2); |
- mystrrep((*slst)[j], "\xdf", "SS"); |
- pos = strchr((*slst)[j], '\xdf'); |
+ mystrrep((*slst)[j], "\xDF", "SS"); |
+ pos = strchr((*slst)[j], '\xDF'); |
} |
} |
} |
@@ -807,37 +843,76 @@ |
// END OF LANG_hu section |
// try ngram approach since found nothing |
- if ((ns == 0) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { |
- ngramsugs = 1; |
+ if ((ns == 0 || onlycmpdsug) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { |
switch(captype) { |
case NOCAP: { |
- ns = pSMgr->ngsuggest(*slst, cw, pHMgr); |
+ ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); |
break; |
} |
+ case HUHINITCAP: |
+ capwords = 1; |
case HUHCAP: { |
memcpy(wspace,cw,(wl+1)); |
mkallsmall2(wspace, unicw, nc); |
- ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); |
- break; |
+ ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); |
+ break; |
} |
- case INITCAP: { |
+ case INITCAP: { |
capwords = 1; |
memcpy(wspace,cw,(wl+1)); |
mkallsmall2(wspace, unicw, nc); |
- ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); |
+ ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); |
break; |
} |
case ALLCAP: { |
memcpy(wspace,cw,(wl+1)); |
mkallsmall2(wspace, unicw, nc); |
- ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); |
- for (int j=0; j < ns; j++) |
+ int oldns = ns; |
+ ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); |
+ for (int j = oldns; j < ns; j++) |
mkallcap((*slst)[j]); |
break; |
} |
} |
} |
+ // try dash suggestion (Afo-American -> Afro-American) |
+ if (strchr(cw, '-')) { |
+ char * pos = strchr(cw, '-'); |
+ char * ppos = cw; |
+ int nodashsug = 1; |
+ char ** nlst = NULL; |
+ int nn = 0; |
+ int last = 0; |
+ for (int j = 0; j < ns && nodashsug == 1; j++) { |
+ if (strchr((*slst)[j], '-')) nodashsug = 0; |
+ } |
+ while (nodashsug && !last) { |
+ if (*pos == '\0') last = 1; else *pos = '\0'; |
+ if (!spell(ppos)) { |
+ nn = suggest(&nlst, ppos); |
+ for (int j = nn - 1; j >= 0; j--) { |
+ strncpy(wspace, cw, ppos - cw); |
+ strcpy(wspace + (ppos - cw), nlst[j]); |
+ if (!last) { |
+ strcat(wspace, "-"); |
+ strcat(wspace, pos + 1); |
+ } |
+ ns = insert_sug(slst, wspace, ns); |
+ free(nlst[j]); |
+ } |
+ if (nlst != NULL) free(nlst); |
+ nodashsug = 0; |
+ } |
+ if (!last) { |
+ *pos = '-'; |
+ ppos = pos + 1; |
+ pos = strchr(ppos, '-'); |
+ } |
+ if (!pos) pos = cw + strlen(cw); |
+ } |
+ } |
+ |
// word reversing wrapper for complex prefixes |
if (complexprefixes) { |
for (int j = 0; j < ns; j++) { |
@@ -858,14 +933,14 @@ |
} |
} |
- // suggest keepcase |
- if (pAMgr->get_keepcase()) { |
+ // remove bad capitalized and forbidden forms |
+ if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { |
switch (captype) { |
case INITCAP: |
case ALLCAP: { |
int l = 0; |
for (int j=0; j < ns; j++) { |
- if (!spell((*slst)[j])) { |
+ if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) { |
char s[MAXSWUTF8L]; |
w_char w[MAXSWL]; |
int len; |
@@ -876,21 +951,21 @@ |
len = strlen(s); |
} |
mkallsmall2(s, w, len); |
- free((*slst)[j]); |
+ free((*slst)[j]); |
if (spell(s)) { |
(*slst)[l] = mystrdup(s); |
- l++; |
+ if ((*slst)[l]) l++; |
} else { |
mkinitcap2(s, w, len); |
if (spell(s)) { |
(*slst)[l] = mystrdup(s); |
- l++; |
+ if ((*slst)[l]) l++; |
} |
} |
} else { |
(*slst)[l] = (*slst)[j]; |
l++; |
- } |
+ } |
} |
ns = l; |
} |
@@ -909,9 +984,28 @@ |
} |
l++; |
} |
+ |
+ // output conversion |
+ rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; |
+ for (int j = 0; rl && j < ns; j++) { |
+ if (rl->conv((*slst)[j], wspace)) { |
+ free((*slst)[j]); |
+ (*slst)[j] = mystrdup(wspace); |
+ } |
+ } |
+ |
+ // if suggestions removed by nosuggest, onlyincompound parameters |
+ if (l == 0 && *slst) { |
+ free(*slst); |
+ *slst = NULL; |
+ } |
return l; |
} |
+void Hunspell::free_list(char *** slst, int n) { |
+ freelist(slst, n); |
+} |
+ |
char * Hunspell::get_dic_encoding() |
{ |
return encoding; |
@@ -921,9 +1015,9 @@ |
// XXX need UTF-8 support |
int Hunspell::suggest_auto(char*** slst, const char * word) |
{ |
- char cw[MAXWORDUTF8LEN + 4]; |
- char wspace[MAXWORDUTF8LEN + 4]; |
- if (! pSMgr) return 0; |
+ char cw[MAXWORDUTF8LEN]; |
+ char wspace[MAXWORDUTF8LEN]; |
+ if (!pSMgr || maxdic == 0) return 0; |
int wl = strlen(word); |
if (utf8) { |
if (wl >= MAXWORDUTF8LEN) return 0; |
@@ -936,15 +1030,15 @@ |
if (wl == 0) return 0; |
int ns = 0; |
*slst = NULL; // HU, nsug in pSMgr->suggest |
- |
+ |
switch(captype) { |
- case NOCAP: { |
+ case NOCAP: { |
ns = pSMgr->suggest_auto(slst, cw, ns); |
if (ns>0) break; |
break; |
} |
- case INITCAP: { |
+ case INITCAP: { |
memcpy(wspace,cw,(wl+1)); |
mkallsmall(wspace); |
ns = pSMgr->suggest_auto(slst, wspace, ns); |
@@ -952,10 +1046,11 @@ |
mkinitcap((*slst)[j]); |
ns = pSMgr->suggest_auto(slst, cw, ns); |
break; |
- |
+ |
} |
- case HUHCAP: { |
+ case HUHINITCAP: |
+ case HUHCAP: { |
ns = pSMgr->suggest_auto(slst, cw, ns); |
if (ns == 0) { |
memcpy(wspace,cw,(wl+1)); |
@@ -965,7 +1060,7 @@ |
break; |
} |
- case ALLCAP: { |
+ case ALLCAP: { |
memcpy(wspace,cw,(wl+1)); |
mkallsmall(wspace); |
ns = pSMgr->suggest_auto(slst, wspace, ns); |
@@ -1011,103 +1106,89 @@ |
} |
} |
} |
- // END OF LANG_hu section |
+ // END OF LANG_hu section |
return ns; |
} |
+#endif |
-// XXX need UTF-8 support |
-int Hunspell::stem(char*** slst, const char * word) |
+int Hunspell::stem(char*** slst, char ** desc, int n) |
{ |
- char cw[MAXWORDUTF8LEN + 4]; |
- char wspace[MAXWORDUTF8LEN + 4]; |
- if (! pSMgr) return 0; |
- int wl = strlen(word); |
- if (utf8) { |
- if (wl >= MAXWORDUTF8LEN) return 0; |
- } else { |
- if (wl >= MAXWORDLEN) return 0; |
+ char result[MAXLNLEN]; |
+ char result2[MAXLNLEN]; |
+ *slst = NULL; |
+ if (n == 0) return 0; |
+ *result2 = '\0'; |
+ for (int i = 0; i < n; i++) { |
+ *result = '\0'; |
+ // add compound word parts (except the last one) |
+ char * s = (char *) desc[i]; |
+ char * part = strstr(s, MORPH_PART); |
+ if (part) { |
+ char * nextpart = strstr(part + 1, MORPH_PART); |
+ while (nextpart) { |
+ copy_field(result + strlen(result), part, MORPH_PART); |
+ part = nextpart; |
+ nextpart = strstr(part + 1, MORPH_PART); |
+ } |
+ s = part; |
+ } |
+ |
+ char **pl; |
+ char tok[MAXLNLEN]; |
+ strcpy(tok, s); |
+ char * alt = strstr(tok, " | "); |
+ while (alt) { |
+ alt[1] = MSEP_ALT; |
+ alt = strstr(alt, " | "); |
+ } |
+ int pln = line_tok(tok, &pl, MSEP_ALT); |
+ for (int k = 0; k < pln; k++) { |
+ // add derivational suffixes |
+ if (strstr(pl[k], MORPH_DERI_SFX)) { |
+ // remove inflectional suffixes |
+ char * is = strstr(pl[k], MORPH_INFL_SFX); |
+ if (is) *is = '\0'; |
+ char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); |
+ if (sg) { |
+ char ** gen; |
+ int genl = line_tok(sg, &gen, MSEP_REC); |
+ free(sg); |
+ for (int j = 0; j < genl; j++) { |
+ sprintf(result2 + strlen(result2), "%c%s%s", |
+ MSEP_REC, result, gen[j]); |
+ } |
+ freelist(&gen, genl); |
+ } |
+ } else { |
+ sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); |
+ if (strstr(pl[k], MORPH_SURF_PFX)) { |
+ copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); |
+ } |
+ copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); |
+ } |
+ } |
+ freelist(&pl, pln); |
} |
- int captype = 0; |
- int abbv = 0; |
- wl = cleanword(cw, word, &captype, &abbv); |
- if (wl == 0) return 0; |
- |
- int ns = 0; |
+ int sln = line_tok(result2, slst, MSEP_REC); |
+ return uniqlist(*slst, sln); |
- *slst = NULL; // HU, nsug in pSMgr->suggest |
- |
- switch(captype) { |
- case HUHCAP: |
- case NOCAP: { |
- ns = pSMgr->suggest_stems(slst, cw, ns); |
+} |
- if ((abbv) && (ns == 0)) { |
- memcpy(wspace,cw,wl); |
- *(wspace+wl) = '.'; |
- *(wspace+wl+1) = '\0'; |
- ns = pSMgr->suggest_stems(slst, wspace, ns); |
- } |
- |
- break; |
- } |
- |
- case INITCAP: { |
- |
- ns = pSMgr->suggest_stems(slst, cw, ns); |
- |
- if (ns == 0) { |
- memcpy(wspace,cw,(wl+1)); |
- mkallsmall(wspace); |
- ns = pSMgr->suggest_stems(slst, wspace, ns); |
- |
- } |
- |
- if ((abbv) && (ns == 0)) { |
- memcpy(wspace,cw,wl); |
- mkallsmall(wspace); |
- *(wspace+wl) = '.'; |
- *(wspace+wl+1) = '\0'; |
- ns = pSMgr->suggest_stems(slst, wspace, ns); |
- } |
- |
- break; |
- |
- } |
- |
- case ALLCAP: { |
- ns = pSMgr->suggest_stems(slst, cw, ns); |
- if (ns != 0) break; |
- |
- memcpy(wspace,cw,(wl+1)); |
- mkallsmall(wspace); |
- ns = pSMgr->suggest_stems(slst, wspace, ns); |
- |
- if (ns == 0) { |
- mkinitcap(wspace); |
- ns = pSMgr->suggest_stems(slst, wspace, ns); |
- } |
- |
- if ((abbv) && (ns == 0)) { |
- memcpy(wspace,cw,wl); |
- mkallsmall(wspace); |
- *(wspace+wl) = '.'; |
- *(wspace+wl+1) = '\0'; |
- ns = pSMgr->suggest_stems(slst, wspace, ns); |
- } |
- |
- |
- break; |
- } |
- } |
- |
- return ns; |
+int Hunspell::stem(char*** slst, const char * word) |
+{ |
+ char ** pl; |
+ int pln = analyze(&pl, word); |
+ int pln2 = stem(slst, pl, pln); |
+ freelist(&pl, pln); |
+ return pln2; |
} |
+#ifdef HUNSPELL_EXPERIMENTAL |
int Hunspell::suggest_pos_stems(char*** slst, const char * word) |
{ |
- char cw[MAXWORDUTF8LEN + 4]; |
- char wspace[MAXWORDUTF8LEN + 4]; |
- if (! pSMgr) return 0; |
+ char cw[MAXWORDUTF8LEN]; |
+ char wspace[MAXWORDUTF8LEN]; |
+ if (! pSMgr || maxdic == 0) return 0; |
int wl = strlen(word); |
if (utf8) { |
if (wl >= MAXWORDUTF8LEN) return 0; |
@@ -1118,14 +1199,14 @@ |
int abbv = 0; |
wl = cleanword(cw, word, &captype, &abbv); |
if (wl == 0) return 0; |
- |
+ |
int ns = 0; // ns=0 = normalized input |
*slst = NULL; // HU, nsug in pSMgr->suggest |
- |
+ |
switch(captype) { |
case HUHCAP: |
- case NOCAP: { |
+ case NOCAP: { |
ns = pSMgr->suggest_pos_stems(slst, cw, ns); |
if ((abbv) && (ns == 0)) { |
@@ -1138,7 +1219,7 @@ |
break; |
} |
- case INITCAP: { |
+ case INITCAP: { |
ns = pSMgr->suggest_pos_stems(slst, cw, ns); |
@@ -1147,15 +1228,15 @@ |
mkallsmall(wspace); |
ns = pSMgr->suggest_pos_stems(slst, wspace, ns); |
} |
- |
+ |
break; |
- |
+ |
} |
- case ALLCAP: { |
+ case ALLCAP: { |
ns = pSMgr->suggest_pos_stems(slst, cw, ns); |
if (ns != 0) break; |
- |
+ |
memcpy(wspace,cw,(wl+1)); |
mkallsmall(wspace); |
ns = pSMgr->suggest_pos_stems(slst, wspace, ns); |
@@ -1225,22 +1306,24 @@ |
return nc; |
} |
-int Hunspell::put_word(const char * word) |
+int Hunspell::add(const char * word) |
{ |
- if (pHMgr) { |
- return pHMgr->put_word(word, strlen(word), NULL); |
- } |
+ if (pHMgr[0]) return (pHMgr[0])->add(word); |
return 0; |
} |
-int Hunspell::put_word_pattern(const char * word, const char * pattern) |
+int Hunspell::add_with_affix(const char * word, const char * example) |
{ |
- if (pHMgr) { |
- return pHMgr->put_word_pattern(word, strlen(word), pattern); |
- } |
+ if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); |
return 0; |
} |
+int Hunspell::remove(const char * word) |
+{ |
+ if (pHMgr[0]) return (pHMgr[0])->remove(word); |
+ return 0; |
+} |
+ |
const char * Hunspell::get_version() |
{ |
return pAMgr->get_version(); |
@@ -1251,22 +1334,38 @@ |
return csconv; |
} |
-#ifdef HUNSPELL_EXPERIMENTAL |
-// XXX need UTF-8 support |
-char * Hunspell::morph(const char * word) |
+void Hunspell::cat_result(char * result, char * st) |
{ |
- char cw[MAXWORDUTF8LEN + 4]; |
- char wspace[MAXWORDUTF8LEN + 4]; |
- if (! pSMgr) return 0; |
- int wl = strlen(word); |
+ if (st) { |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
+ free(st); |
+ } |
+} |
+ |
+int Hunspell::analyze(char*** slst, const char * word) |
+{ |
+ char cw[MAXWORDUTF8LEN]; |
+ char wspace[MAXWORDUTF8LEN]; |
+ w_char unicw[MAXWORDLEN]; |
+ int wl2 = 0; |
+ *slst = NULL; |
+ if (! pSMgr || maxdic == 0) return 0; |
+ int nc = strlen(word); |
if (utf8) { |
- if (wl >= MAXWORDUTF8LEN) return 0; |
+ if (nc >= MAXWORDUTF8LEN) return 0; |
} else { |
- if (wl >= MAXWORDLEN) return 0; |
+ if (nc >= MAXWORDLEN) return 0; |
} |
int captype = 0; |
int abbv = 0; |
- wl = cleanword(cw, word, &captype, &abbv); |
+ int wl = 0; |
+ |
+ // input conversion |
+ RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; |
+ if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); |
+ else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); |
+ |
if (wl == 0) { |
if (abbv) { |
for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; |
@@ -1277,7 +1376,7 @@ |
char result[MAXLNLEN]; |
char * st = NULL; |
- |
+ |
*result = '\0'; |
int n = 0; |
@@ -1287,177 +1386,103 @@ |
// test numbers |
// LANG_hu section: set dash information for suggestions |
if (langnum == LANG_hu) { |
- while ((n < wl) && |
+ while ((n < wl) && |
(((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { |
n++; |
if ((cw[n] == '.') || (cw[n] == ',')) { |
- if (((n2 == 0) && (n > 3)) || |
+ if (((n2 == 0) && (n > 3)) || |
((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break; |
n2++; |
n3 = n; |
} |
} |
- if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return NULL; |
- if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xb0')) && checkword(cw+n, NULL, NULL))) { |
- strcat(result, cw); |
+ if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; |
+ if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) { |
+ mystrcat(result, cw, MAXLNLEN); |
result[n - 1] = '\0'; |
- if (n == wl) { |
- st = pSMgr->suggest_morph(cw + n - 1); |
- if (st) { |
- strcat(result, st); |
- free(st); |
- } |
- } else { |
+ if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); |
+ else { |
char sign = cw[n]; |
cw[n] = '\0'; |
- st = pSMgr->suggest_morph(cw + n - 1); |
- if (st) { |
- strcat(result, st); |
- free(st); |
- } |
- strcat(result, "+"); // XXX SPEC. MORPHCODE |
+ cat_result(result, pSMgr->suggest_morph(cw + n - 1)); |
+ mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE |
cw[n] = sign; |
- st = pSMgr->suggest_morph(cw + n); |
- if (st) { |
- strcat(result, st); |
- free(st); |
- } |
+ cat_result(result, pSMgr->suggest_morph(cw + n)); |
} |
- return mystrdup(result); |
+ return line_tok(result, slst, MSEP_REC); |
} |
} |
// END OF LANG_hu section |
- |
+ |
switch(captype) { |
- case NOCAP: { |
- st = pSMgr->suggest_morph(cw); |
- if (st) { |
- strcat(result, st); |
- free(st); |
- } |
- if (abbv) { |
- memcpy(wspace,cw,wl); |
+ case HUHCAP: |
+ case HUHINITCAP: |
+ case NOCAP: { |
+ cat_result(result, pSMgr->suggest_morph(cw)); |
+ if (abbv) { |
+ memcpy(wspace,cw,wl); |
+ *(wspace+wl) = '.'; |
+ *(wspace+wl+1) = '\0'; |
+ cat_result(result, pSMgr->suggest_morph(wspace)); |
+ } |
+ break; |
+ } |
+ case INITCAP: { |
+ wl = mkallsmall2(cw, unicw, nc); |
+ memcpy(wspace,cw,(wl+1)); |
+ wl2 = mkinitcap2(cw, unicw, nc); |
+ cat_result(result, pSMgr->suggest_morph(wspace)); |
+ cat_result(result, pSMgr->suggest_morph(cw)); |
+ if (abbv) { |
*(wspace+wl) = '.'; |
*(wspace+wl+1) = '\0'; |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
- } |
+ cat_result(result, pSMgr->suggest_morph(wspace)); |
+ |
+ memcpy(wspace, cw, wl2); |
+ *(wspace+wl2) = '.'; |
+ *(wspace+wl2+1) = '\0'; |
+ |
+ cat_result(result, pSMgr->suggest_morph(wspace)); |
} |
- break; |
+ break; |
} |
- case INITCAP: { |
- memcpy(wspace,cw,(wl+1)); |
- mkallsmall(wspace); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- strcat(result, st); |
- free(st); |
- } |
- st = pSMgr->suggest_morph(cw); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
- } |
- if (abbv) { |
- memcpy(wspace,cw,wl); |
+ case ALLCAP: { |
+ cat_result(result, pSMgr->suggest_morph(cw)); |
+ if (abbv) { |
+ memcpy(wspace,cw,wl); |
*(wspace+wl) = '.'; |
*(wspace+wl+1) = '\0'; |
- mkallsmall(wspace); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
- } |
- mkinitcap(wspace); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
- } |
+ cat_result(result, pSMgr->suggest_morph(cw)); |
} |
- break; |
- } |
- case HUHCAP: { |
- st = pSMgr->suggest_morph(cw); |
- if (st) { |
- strcat(result, st); |
- free(st); |
- } |
-#if 0 |
+ wl = mkallsmall2(cw, unicw, nc); |
memcpy(wspace,cw,(wl+1)); |
- mkallsmall(wspace); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
+ wl2 = mkinitcap2(cw, unicw, nc); |
+ |
+ cat_result(result, pSMgr->suggest_morph(wspace)); |
+ cat_result(result, pSMgr->suggest_morph(cw)); |
+ if (abbv) { |
+ *(wspace+wl) = '.'; |
+ *(wspace+wl+1) = '\0'; |
+ cat_result(result, pSMgr->suggest_morph(wspace)); |
+ |
+ memcpy(wspace, cw, wl2); |
+ *(wspace+wl2) = '.'; |
+ *(wspace+wl2+1) = '\0'; |
+ |
+ cat_result(result, pSMgr->suggest_morph(wspace)); |
} |
-#endif |
break; |
- } |
- case ALLCAP: { |
- memcpy(wspace,cw,(wl+1)); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- strcat(result, st); |
- free(st); |
- } |
- mkallsmall(wspace); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
- } |
- mkinitcap(wspace); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
- } |
- if (abbv) { |
- memcpy(wspace,cw,(wl+1)); |
- *(wspace+wl) = '.'; |
- *(wspace+wl+1) = '\0'; |
- if (*result) strcat(result, "\n"); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- strcat(result, st); |
- free(st); |
- } |
- mkallsmall(wspace); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
- } |
- mkinitcap(wspace); |
- st = pSMgr->suggest_morph(wspace); |
- if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
- free(st); |
- } |
- } |
- break; |
} |
} |
- if (result && (*result)) { |
+ if (*result) { |
// word reversing wrapper for complex prefixes |
if (complexprefixes) { |
if (utf8) reverseword_utf(result); else reverseword(result); |
} |
- return mystrdup(result); |
+ return line_tok(result, slst, MSEP_REC); |
+ |
} |
// compound word with dash (HU) I18n |
@@ -1466,24 +1491,24 @@ |
// LANG_hu section: set dash information for suggestions |
if (langnum == LANG_hu) dash = (char *) strchr(cw,'-'); |
if ((langnum == LANG_hu) && dash) { |
- *dash='\0'; |
+ *dash='\0'; |
// examine 2 sides of the dash |
if (dash[1] == '\0') { // base word ending with dash |
- if (spell(cw)) return pSMgr->suggest_morph(cw); |
+ if (spell(cw)) return line_tok(pSMgr->suggest_morph(cw), slst, MSEP_REC); |
} else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. |
if (spell(cw) && (spell("-e"))) { |
st = pSMgr->suggest_morph(cw); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- strcat(result,"+"); // XXX spec. separator in MORPHCODE |
+ mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE |
st = pSMgr->suggest_morph("-e"); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- return mystrdup(result); |
+ return line_tok(result, slst, MSEP_REC); |
} |
} else { |
// first word ending with dash: word- XXX ??? |
@@ -1495,22 +1520,22 @@ |
dash[0]='\0'; |
if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || |
((dash[1] > '0') && (dash[1] < '9')))) { |
- st = morph(cw); |
+ st = pSMgr->suggest_morph(cw); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
- strcat(result,"+"); // XXX spec. separator in MORPHCODE |
+ mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE |
} |
- st = morph(dash+1); |
+ st = pSMgr->suggest_morph(dash+1); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- return mystrdup(result); |
+ return line_tok(result, slst, MSEP_REC); |
} |
} |
// affixed number in correct word |
- if (nresult && (dash > cw) && (((*(dash-1)<='9') && |
+ if (nresult && (dash > cw) && (((*(dash-1)<='9') && |
(*(dash-1)>='0')) || (*(dash-1)=='.'))) { |
*dash='-'; |
n = 1; |
@@ -1525,195 +1550,338 @@ |
// 56-hoz, 6-hoz |
for(; n >= 1; n--) { |
if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) { |
- strcat(result, cw); |
+ mystrcat(result, cw, MAXLNLEN); |
result[dash - cw - n] = '\0'; |
st = pSMgr->suggest_morph(dash - n); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- return mystrdup(result); |
+ return line_tok(result, slst, MSEP_REC); |
} |
} |
} |
} |
- return NULL; |
+ return 0; |
} |
+int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) |
+{ |
+ *slst = NULL; |
+ if (!pSMgr || !pln) return 0; |
+ char **pl2; |
+ int pl2n = analyze(&pl2, word); |
+ int captype = 0; |
+ int abbv = 0; |
+ char cw[MAXWORDUTF8LEN]; |
+ cleanword(cw, word, &captype, &abbv); |
+ char result[MAXLNLEN]; |
+ *result = '\0'; |
+ |
+ for (int i = 0; i < pln; i++) { |
+ cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); |
+ } |
+ freelist(&pl2, pl2n); |
+ |
+ if (*result) { |
+ // allcap |
+ if (captype == ALLCAP) mkallcap(result); |
+ |
+ // line split |
+ int linenum = line_tok(result, slst, MSEP_REC); |
+ |
+ // capitalize |
+ if (captype == INITCAP || captype == HUHINITCAP) { |
+ for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); |
+ } |
+ |
+ // temporary filtering of prefix related errors (eg. |
+ // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") |
+ |
+ int r = 0; |
+ for (int j=0; j < linenum; j++) { |
+ if (!spell((*slst)[j])) { |
+ free((*slst)[j]); |
+ (*slst)[j] = NULL; |
+ } else { |
+ if (r < j) (*slst)[r] = (*slst)[j]; |
+ r++; |
+ } |
+ } |
+ if (r > 0) return r; |
+ free(*slst); |
+ *slst = NULL; |
+ } |
+ return 0; |
+} |
+ |
+int Hunspell::generate(char*** slst, const char * word, const char * pattern) |
+{ |
+ char **pl; |
+ int pln = analyze(&pl, pattern); |
+ int n = generate(slst, word, pl, pln); |
+ freelist(&pl, pln); |
+ return uniqlist(*slst, n); |
+} |
+ |
+// minimal XML parser functions |
+int Hunspell::get_xml_par(char * dest, const char * par, int max) |
+{ |
+ char * d = dest; |
+ if (!par) return 0; |
+ char end = *par; |
+ char * dmax = dest + max; |
+ if (end == '>') end = '<'; |
+ else if (end != '\'' && end != '"') return 0; // bad XML |
+ for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par; |
+ *d = '\0'; |
+ mystrrep(dest, "&lt;", "<"); |
+ mystrrep(dest, "&amp;", "&"); |
+ return d - dest; |
+} |
+ |
+// return the beginning of the element (attr == NULL) or the attribute |
+const char * Hunspell::get_xml_pos(const char * s, const char * attr) |
+{ |
+ const char * end = strchr(s, '>'); |
+ const char * p = s; |
+ if (attr == NULL) return end; |
+ do { |
+ p = strstr(p, attr); |
+ if (!p || p >= end) return 0; |
+ } while (*(p-1) != ' ' && *(p-1) != '\n'); |
+ return p + strlen(attr); |
+} |
+ |
+int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) { |
+ char cw[MAXWORDUTF8LEN]; |
+ if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && |
+ strcmp(cw, value) == 0) return 1; |
+ return 0; |
+} |
+ |
+int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) { |
+ int n = 0; |
+ char * p; |
+ if (!list) return 0; |
+ for (p = list; (p = strstr(p, tag)); p++) n++; |
+ if (n == 0) return 0; |
+ *slst = (char **) malloc(sizeof(char *) * n); |
+ if (!*slst) return 0; |
+ for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) { |
+ int l = strlen(p); |
+ (*slst)[n] = (char *) malloc(l); |
+ if (!(*slst)[n]) return (n > 0 ? n - 1 : 0); |
+ get_xml_par((*slst)[n], p + strlen(tag) - 1, l); |
+ } |
+ return n; |
+} |
+ |
+int Hunspell::spellml(char*** slst, const char * word) |
+{ |
+ char *q, *q2; |
+ char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; |
+ q = (char *) strstr(word, "<query"); |
+ if (!q) return 0; // bad XML input |
+ q2 = strchr(q, '>'); |
+ if (!q2) return 0; // bad XML input |
+ q2 = strstr(q2, "<word"); |
+ if (!q2) return 0; // bad XML input |
+ if (check_xml_par(q, "type=", "analyze")) { |
+ int n = 0, s = 0; |
+ if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) n = analyze(slst, cw); |
+ if (n == 0) return 0; |
+ // convert the result to <code><a>ana1</a><a>ana2</a></code> format |
+ for (int i = 0; i < n; i++) s+= strlen((*slst)[i]); |
+ char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->&amp; |
+ if (!r) return 0; |
+ strcpy(r, "<code>"); |
+ for (int i = 0; i < n; i++) { |
+ int l = strlen(r); |
+ strcpy(r + l, "<a>"); |
+ strcpy(r + l + 3, (*slst)[i]); |
+ mystrrep(r + l + 3, "\t", " "); |
+ mystrrep(r + l + 3, "<", "&lt;"); |
+ mystrrep(r + l + 3, "&", "&amp;"); |
+ strcat(r, "</a>"); |
+ free((*slst)[i]); |
+ } |
+ strcat(r, "</code>"); |
+ (*slst)[0] = r; |
+ return 1; |
+ } else if (check_xml_par(q, "type=", "stem")) { |
+ if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) return stem(slst, cw); |
+ } else if (check_xml_par(q, "type=", "generate")) { |
+ int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN); |
+ if (n == 0) return 0; |
+ char * q3 = strstr(q2 + 1, "<word"); |
+ if (q3) { |
+ if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN)) { |
+ return generate(slst, cw, cw2); |
+ } |
+ } else { |
+ char ** slst2; |
+ if ((q2 = strstr(q2 + 1, "<code")) && |
+ (n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) { |
+ int n2 = generate(slst, cw, slst2, n); |
+ freelist(&slst2, n); |
+ return uniqlist(*slst, n2); |
+ } |
+ } |
+ } |
+ return 0; |
+} |
+ |
+ |
+#ifdef HUNSPELL_EXPERIMENTAL |
// XXX need UTF-8 support |
char * Hunspell::morph_with_correction(const char * word) |
{ |
- char cw[MAXWORDUTF8LEN + 4]; |
- char wspace[MAXWORDUTF8LEN + 4]; |
- if (! pSMgr) return 0; |
+ char cw[MAXWORDUTF8LEN]; |
+ char wspace[MAXWORDUTF8LEN]; |
+ if (! pSMgr || maxdic == 0) return NULL; |
int wl = strlen(word); |
if (utf8) { |
- if (wl >= MAXWORDUTF8LEN) return 0; |
+ if (wl >= MAXWORDUTF8LEN) return NULL; |
} else { |
- if (wl >= MAXWORDLEN) return 0; |
+ if (wl >= MAXWORDLEN) return NULL; |
} |
int captype = 0; |
int abbv = 0; |
wl = cleanword(cw, word, &captype, &abbv); |
- if (wl == 0) return 0; |
+ if (wl == 0) return NULL; |
char result[MAXLNLEN]; |
char * st = NULL; |
- |
+ |
*result = '\0'; |
- |
- |
+ |
+ |
switch(captype) { |
- case NOCAP: { |
+ case NOCAP: { |
st = pSMgr->suggest_morph_for_spelling_error(cw); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- if (abbv) { |
- memcpy(wspace,cw,wl); |
+ if (abbv) { |
+ memcpy(wspace,cw,wl); |
*(wspace+wl) = '.'; |
*(wspace+wl+1) = '\0'; |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
} |
break; |
} |
- case INITCAP: { |
+ case INITCAP: { |
memcpy(wspace,cw,(wl+1)); |
mkallsmall(wspace); |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
- } |
- st = pSMgr->suggest_morph_for_spelling_error(cw); |
+ } |
+ st = pSMgr->suggest_morph_for_spelling_error(cw); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- if (abbv) { |
- memcpy(wspace,cw,wl); |
+ if (abbv) { |
+ memcpy(wspace,cw,wl); |
*(wspace+wl) = '.'; |
*(wspace+wl+1) = '\0'; |
mkallsmall(wspace); |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
- } |
+ } |
mkinitcap(wspace); |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
- } |
+ } |
} |
break; |
} |
- case HUHCAP: { |
+ case HUHCAP: { |
st = pSMgr->suggest_morph_for_spelling_error(cw); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
memcpy(wspace,cw,(wl+1)); |
mkallsmall(wspace); |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
- } |
+ } |
break; |
} |
- case ALLCAP: { |
+ case ALLCAP: { |
memcpy(wspace,cw,(wl+1)); |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- strcat(result, st); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
- } |
+ } |
mkallsmall(wspace); |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- mkinitcap(wspace); |
- st = pSMgr->suggest_morph_for_spelling_error(wspace); |
+ mkinitcap(wspace); |
+ st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- if (abbv) { |
+ if (abbv) { |
memcpy(wspace,cw,(wl+1)); |
*(wspace+wl) = '.'; |
*(wspace+wl+1) = '\0'; |
- if (*result) strcat(result, "\n"); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- strcat(result, st); |
- free(st); |
- } |
+ mystrcat(result, st, MAXLNLEN); |
+ free(st); |
+ } |
mkallsmall(wspace); |
st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- mkinitcap(wspace); |
- st = pSMgr->suggest_morph_for_spelling_error(wspace); |
+ mkinitcap(wspace); |
+ st = pSMgr->suggest_morph_for_spelling_error(wspace); |
if (st) { |
- if (*result) strcat(result, "\n"); |
- strcat(result, st); |
+ if (*result) mystrcat(result, "\n", MAXLNLEN); |
+ mystrcat(result, st, MAXLNLEN); |
free(st); |
} |
- } |
+ } |
break; |
} |
} |
- if (result) return mystrdup(result); |
+ if (*result) return mystrdup(result); |
return NULL; |
} |
-/* analyze word |
- * return line count |
- * XXX need a better data structure for morphological analysis */ |
-int Hunspell::analyze(char ***out, const char *word) { |
- int n = 0; |
- if (!word) return 0; |
- char * m = morph(word); |
- if(!m) return 0; |
- if (!out) return line_tok(m, out); |
- |
- // without memory allocation |
- /* BUG missing buffer size checking */ |
- int i, p; |
- for(p = 0, i = 0; m[i]; i++) { |
- if(m[i] == '\n' || !m[i+1]) { |
- n++; |
- strncpy((*out)[n++], m + p, i - p + 1); |
- if (m[i] == '\n') (*out)[n++][i - p] = '\0'; |
- if(!m[i+1]) break; |
- p = i + 1; |
- } |
- } |
- free(m); |
- return n; |
-} |
- |
#endif // END OF HUNSPELL_EXPERIMENTAL CODE |
Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) |
@@ -1725,6 +1893,17 @@ |
#endif |
} |
+ |
+Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, |
+ const char * key) |
+{ |
+#ifdef HUNSPELL_CHROME_CLIENT |
+ return NULL; |
+#else |
+ return (Hunhandle*)(new Hunspell(affpath, dpath, key)); |
+#endif |
+} |
+ |
void Hunspell_destroy(Hunhandle *pHunspell) |
{ |
delete (Hunspell*)(pHunspell); |
@@ -1745,3 +1924,57 @@ |
return ((Hunspell*)pHunspell)->suggest(slst, word); |
} |
+int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) |
+{ |
+ return ((Hunspell*)pHunspell)->analyze(slst, word); |
+} |
+ |
+int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) |
+{ |
+ return ((Hunspell*)pHunspell)->stem(slst, word); |
+} |
+ |
+int Hunspell_stem(Hunhandle *pHunspell, char*** slst, char** desc, int n) |
+{ |
+ return ((Hunspell*)pHunspell)->stem(slst, desc, n); |
+} |
+ |
+int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, |
+ const char * word2) |
+{ |
+ return ((Hunspell*)pHunspell)->generate(slst, word, word2); |
+} |
+ |
+int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, |
+ char** desc, int n) |
+{ |
+ return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); |
+} |
+ |
+ /* functions for run-time modification of the dictionary */ |
+ |
+ /* add word to the run-time dictionary */ |
+ |
+int Hunspell_add(Hunhandle *pHunspell, const char * word) { |
+ return ((Hunspell*)pHunspell)->add(word); |
+} |
+ |
+ /* add word to the run-time dictionary with affix flags of |
+ * the example (a dictionary word): Hunspell will recognize |
+ * affixed forms of the new word, too. |
+ */ |
+ |
+int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, |
+ const char * example) { |
+ return ((Hunspell*)pHunspell)->add_with_affix(word, example); |
+} |
+ |
+ /* remove word from the run-time dictionary */ |
+ |
+int Hunspell_remove(Hunhandle *pHunspell, const char * word) { |
+ return ((Hunspell*)pHunspell)->remove(word); |
+} |
+ |
+void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n) { |
+ freelist(slst, n); |
+} |
Property changes on: chrome\third_party\hunspell\src\hunspell\hunspell.cxx |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |