| Index: chrome/third_party/hunspell/src/hunspell/hunspell.cxx
|
| ===================================================================
|
| --- chrome/third_party/hunspell/src/hunspell/hunspell.cxx (revision 21721)
|
| +++ chrome/third_party/hunspell/src/hunspell/hunspell.cxx (working copy)
|
| @@ -6,16 +6,17 @@
|
| #include <cstring>
|
| #include <cstdio>
|
| #else
|
| -#include <stdlib.h>
|
| +#include <stdlib.h>
|
| #include <string.h>
|
| -#include <stdio.h>
|
| +#include <stdio.h>
|
| #endif
|
|
|
| #include "hunspell.hxx"
|
| #include "hunspell.h"
|
| +#include "csutil.hxx"
|
|
|
| #ifndef MOZILLA_CLIENT
|
| -#ifndef W32
|
| +#ifndef WIN32
|
| using namespace std;
|
| #endif
|
| #endif
|
| @@ -23,27 +24,34 @@
|
| #ifdef HUNSPELL_CHROME_CLIENT
|
| Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length)
|
| #else
|
| -Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle)
|
| +Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle, const char * key = NULL)
|
| #endif
|
| {
|
| encoding = NULL;
|
| csconv = NULL;
|
| utf8 = 0;
|
| complexprefixes = 0;
|
| +#ifndef HUNSPELL_CHROME_CLIENT
|
| + affixpath = mystrdup(affpath);
|
| +#endif
|
| + maxdic = 0;
|
|
|
| #ifdef HUNSPELL_CHROME_CLIENT
|
| bdict_reader = new hunspell::BDictReader;
|
| bdict_reader->Init(bdict_data, bdict_length);
|
|
|
| - pHMgr = new HashMgr(bdict_reader);
|
| - pAMgr = new AffixMgr(bdict_reader, pHMgr);
|
| + pHMgr[0] = new HashMgr(bdict_reader);
|
| + if (pHMgr[0]) maxdic = 1;
|
| +
|
| + pAMgr = new AffixMgr(bdict_reader, pHMgr, &maxdic);
|
| #else
|
| /* first set up the hash manager */
|
| - pHMgr = new HashMgr(dic_handle, aff_handle);
|
| + pHMgr[0] = new HashMgr(dic_handle, aff_handle, key);
|
| + if (pHMgr[0]) maxdic = 1;
|
|
|
| /* next set up the affix manager */
|
| /* it needs access to the hash manager lookup methods */
|
| - pAMgr = new AffixMgr(aff_handle, pHMgr);
|
| + pAMgr = new AffixMgr(aff_handle, pHMgr, &maxdic, key);
|
| #endif
|
|
|
| /* get the preferred try string and the dictionary */
|
| @@ -65,10 +73,13 @@
|
| {
|
| if (pSMgr) delete pSMgr;
|
| if (pAMgr) delete pAMgr;
|
| - if (pHMgr) delete pHMgr;
|
| + for (int i = 0; i < maxdic; i++) delete pHMgr[i];
|
| + maxdic = 0;
|
| pSMgr = NULL;
|
| pAMgr = NULL;
|
| - pHMgr = NULL;
|
| +#ifdef MOZILLA_CLIENT
|
| + free(csconv);
|
| +#endif
|
| csconv= NULL;
|
| if (encoding) free(encoding);
|
| encoding = NULL;
|
| @@ -76,27 +87,38 @@
|
| #ifdef HUNSPELL_CHROME_CLIENT
|
| if (bdict_reader) delete bdict_reader;
|
| bdict_reader = NULL;
|
| +#else
|
| + if (affixpath) free(affixpath);
|
| + affixpath = NULL;
|
| #endif
|
| }
|
|
|
| +#ifndef HUNSPELL_CHROME_CLIENT
|
| +// load extra dictionaries
|
| +int Hunspell::add_dic(const char * dpath, const char * key) {
|
| + if (maxdic == MAXDIC || !affixpath) return 1;
|
| + pHMgr[maxdic] = new HashMgr(dpath, affixpath, key);
|
| + if (pHMgr[maxdic]) maxdic++; else return 1;
|
| + return 0;
|
| +}
|
| +#endif
|
|
|
| // make a copy of src at destination while removing all leading
|
| // blanks and removing any trailing periods after recording
|
| // their presence with the abbreviation flag
|
| -// also since already going through character by character,
|
| +// also since already going through character by character,
|
| // set the capitalization type
|
| // return the length of the "cleaned" (and UTF-8 encoded) word
|
|
|
| -int Hunspell::cleanword2(char * dest, const char * src,
|
| +int Hunspell::cleanword2(char * dest, const char * src,
|
| w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev)
|
| -{
|
| +{
|
| unsigned char * p = (unsigned char *) dest;
|
| const unsigned char * q = (const unsigned char * ) src;
|
| - int firstcap = 0;
|
|
|
| // first skip over any leading blanks
|
| while ((*q != '\0') && (*q == ' ')) q++;
|
| -
|
| +
|
| // now strip off any trailing periods (recording their presence)
|
| *pabbrev = 0;
|
| int nl = strlen((const char *)q);
|
| @@ -104,80 +126,43 @@
|
| nl--;
|
| (*pabbrev)++;
|
| }
|
| -
|
| +
|
| // if no characters are left it can't be capitalized
|
| - if (nl <= 0) {
|
| + if (nl <= 0) {
|
| *pcaptype = NOCAP;
|
| *p = '\0';
|
| return 0;
|
| }
|
|
|
| - // now determine the capitalization type of the first nl letters
|
| - int ncap = 0;
|
| - int nneutral = 0;
|
| - *nc = 0;
|
| -
|
| - if (!utf8) {
|
| - while (nl > 0) {
|
| - (*nc)++;
|
| - if (csconv[(*q)].ccase) ncap++;
|
| - if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
|
| - *p++ = *q++;
|
| - nl--;
|
| - }
|
| - // remember to terminate the destination string
|
| - *p = '\0';
|
| - if (ncap) {
|
| - firstcap = csconv[(unsigned char)(*dest)].ccase;
|
| - }
|
| - } else {
|
| - unsigned short idx;
|
| - *nc = u8_u16(dest_utf, MAXWORDLEN, (const char *) q);
|
| + strncpy(dest, (char *) q, nl);
|
| + *(dest + nl) = '\0';
|
| + nl = strlen(dest);
|
| + if (utf8) {
|
| + *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
|
| // don't check too long words
|
| if (*nc >= MAXWORDLEN) return 0;
|
| if (*nc == -1) { // big Unicode character (non BMP area)
|
| *pcaptype = NOCAP;
|
| - strcpy((char *) p, (char *) q);
|
| - return strlen(dest);
|
| + return nl;
|
| }
|
| - *nc -= *pabbrev;
|
| - for (int i = 0; i < *nc; i++) {
|
| - idx = (dest_utf[i].h << 8) + dest_utf[i].l;
|
| - if (idx != unicodetolower(idx, langnum)) ncap++;
|
| - if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++;
|
| - }
|
| - u16_u8(dest, MAXWORDUTF8LEN, dest_utf, *nc);
|
| - if (ncap) {
|
| - idx = (dest_utf[0].h << 8) + dest_utf[0].l;
|
| - firstcap = (idx != unicodetolower(idx, langnum));
|
| - }
|
| - }
|
| -
|
| - // now finally set the captype
|
| - if (ncap == 0) {
|
| - *pcaptype = NOCAP;
|
| - } else if ((ncap == 1) && firstcap) {
|
| - *pcaptype = INITCAP;
|
| - } else if ((ncap == *nc) || ((ncap + nneutral) == *nc)) {
|
| - *pcaptype = ALLCAP;
|
| - } else if ((ncap > 1) && firstcap) {
|
| - *pcaptype = HUHINITCAP;
|
| + *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
|
| } else {
|
| - *pcaptype = HUHCAP;
|
| + *pcaptype = get_captype(dest, nl, csconv);
|
| + *nc = nl;
|
| }
|
| - return strlen(dest);
|
| -}
|
| + return nl;
|
| +}
|
|
|
| -int Hunspell::cleanword(char * dest, const char * src,
|
| +int Hunspell::cleanword(char * dest, const char * src,
|
| int * pcaptype, int * pabbrev)
|
| -{
|
| +{
|
| unsigned char * p = (unsigned char *) dest;
|
| const unsigned char * q = (const unsigned char * ) src;
|
| int firstcap = 0;
|
|
|
| // first skip over any leading blanks
|
| while ((*q != '\0') && (*q == ' ')) q++;
|
| -
|
| +
|
| // now strip off any trailing periods (recording their presence)
|
| *pabbrev = 0;
|
| int nl = strlen((const char *)q);
|
| @@ -185,9 +170,9 @@
|
| nl--;
|
| (*pabbrev)++;
|
| }
|
| -
|
| +
|
| // if no characters are left it can't be capitalized
|
| - if (nl <= 0) {
|
| + if (nl <= 0) {
|
| *pcaptype = NOCAP;
|
| *p = '\0';
|
| return 0;
|
| @@ -215,8 +200,9 @@
|
| nc = u8_u16(t, MAXWORDLEN, src);
|
| for (int i = 0; i < nc; i++) {
|
| idx = (t[i].h << 8) + t[i].l;
|
| - if (idx != unicodetolower(idx, langnum)) ncap++;
|
| - if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++;
|
| + unsigned short low = unicodetolower(idx, langnum);
|
| + if (idx != low) ncap++;
|
| + if (unicodetoupper(idx, langnum) == low) nneutral++;
|
| }
|
| u16_u8(dest, MAXWORDUTF8LEN, t, nc);
|
| if (ncap) {
|
| @@ -238,8 +224,7 @@
|
| *pcaptype = HUHCAP;
|
| }
|
| return strlen(dest);
|
| -}
|
| -
|
| +}
|
|
|
| void Hunspell::mkallcap(char * p)
|
| {
|
| @@ -256,7 +241,7 @@
|
| }
|
| u16_u8(p, MAXWORDUTF8LEN, u, nc);
|
| } else {
|
| - while (*p != '\0') {
|
| + while (*p != '\0') {
|
| *p = csconv[((unsigned char) *p)].cupper;
|
| p++;
|
| }
|
| @@ -269,15 +254,16 @@
|
| unsigned short idx;
|
| for (int i = 0; i < nc; i++) {
|
| idx = (u[i].h << 8) + u[i].l;
|
| - if (idx != unicodetoupper(idx, langnum)) {
|
| - u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8);
|
| - u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF);
|
| + unsigned short up = unicodetoupper(idx, langnum);
|
| + if (idx != up) {
|
| + u[i].h = (unsigned char) (up >> 8);
|
| + u[i].l = (unsigned char) (up & 0x00FF);
|
| }
|
| }
|
| u16_u8(p, MAXWORDUTF8LEN, u, nc);
|
| - return strlen(p);
|
| + return strlen(p);
|
| } else {
|
| - while (*p != '\0') {
|
| + while (*p != '\0') {
|
| *p = csconv[((unsigned char) *p)].cupper;
|
| p++;
|
| }
|
| @@ -288,7 +274,7 @@
|
|
|
| void Hunspell::mkallsmall(char * p)
|
| {
|
| - while (*p != '\0') {
|
| + while (*p != '\0') {
|
| *p = csconv[((unsigned char) *p)].clower;
|
| p++;
|
| }
|
| @@ -300,15 +286,16 @@
|
| unsigned short idx;
|
| for (int i = 0; i < nc; i++) {
|
| idx = (u[i].h << 8) + u[i].l;
|
| - if (idx != unicodetolower(idx, langnum)) {
|
| - u[i].h = (unsigned char) (unicodetolower(idx, langnum) >> 8);
|
| - u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF);
|
| + unsigned short low = unicodetolower(idx, langnum);
|
| + if (idx != low) {
|
| + u[i].h = (unsigned char) (low >> 8);
|
| + u[i].l = (unsigned char) (low & 0x00FF);
|
| }
|
| }
|
| u16_u8(p, MAXWORDUTF8LEN, u, nc);
|
| return strlen(p);
|
| } else {
|
| - while (*p != '\0') {
|
| + while (*p != '\0') {
|
| *p = csconv[((unsigned char) *p)].clower;
|
| p++;
|
| }
|
| @@ -322,18 +309,18 @@
|
| *p = *source;
|
| for (p++, source++; *(source - 1); p++, source++) {
|
| *p = *source;
|
| - if (*source == '\x9f') *--p = '\xdf';
|
| + if (*source == '\x9F') *--p = '\xDF';
|
| }
|
| return dest;
|
| }
|
|
|
| -// recursive search for right ss-\xdf permutations
|
| +// recursive search for right ss - sharp s permutations
|
| hentry * Hunspell::spellsharps(char * base, char * pos, int n,
|
| int repnum, char * tmp, int * info, char **root) {
|
| pos = strstr(pos, "ss");
|
| if (pos && (n < MAXSHARPS)) {
|
| - *pos = '\xc3';
|
| - *(pos + 1) = '\x9f';
|
| + *pos = '\xC3';
|
| + *(pos + 1) = '\x9F';
|
| hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root);
|
| if (h) return h;
|
| *pos = 's';
|
| @@ -352,31 +339,32 @@
|
| TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen);
|
| }
|
|
|
| -/* check and insert a word to beginning of the suggestion array */
|
| -int Hunspell::insert_sug(char ***slst, char * word, int *ns) {
|
| - if (spell(word)) {
|
| - if (*ns == MAXSUGGESTION) {
|
| - (*ns)--;
|
| - free((*slst)[*ns]);
|
| - }
|
| - for (int k = *ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
|
| - (*slst)[0] = mystrdup(word);
|
| - (*ns)++;
|
| +/* insert a word to the beginning of the suggestion array and return ns */
|
| +int Hunspell::insert_sug(char ***slst, char * word, int ns) {
|
| + char * dup = mystrdup(word);
|
| + if (!dup) return ns;
|
| + if (ns == MAXSUGGESTION) {
|
| + ns--;
|
| + free((*slst)[ns]);
|
| }
|
| - return 0;
|
| + for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
|
| + (*slst)[0] = dup;
|
| + return ns + 1;
|
| }
|
|
|
| int Hunspell::spell(const char * word, int * info, char ** root)
|
| {
|
| #ifdef HUNSPELL_CHROME_CLIENT
|
| - if (pHMgr) pHMgr->EmptyHentryCache();
|
| + if (pHMgr) pHMgr[0]->EmptyHentryCache();
|
| #endif
|
| struct hentry * rv=NULL;
|
| // need larger vector. For example, Turkish capital letter I converted a
|
| // 2-byte UTF-8 character (dotless i) by mkallsmall.
|
| - char cw[MAXWORDUTF8LEN + 4];
|
| - char wspace[MAXWORDUTF8LEN + 4];
|
| - w_char unicw[MAXWORDLEN + 1];
|
| + char cw[MAXWORDUTF8LEN];
|
| + char wspace[MAXWORDUTF8LEN];
|
| + w_char unicw[MAXWORDLEN];
|
| + // Hunspell supports XML input of the simplified API (see manual)
|
| + if (strcmp(word, SPELL_XML) == 0) return 1;
|
| int nc = strlen(word);
|
| int wl2 = 0;
|
| if (utf8) {
|
| @@ -386,14 +374,18 @@
|
| }
|
| int captype = 0;
|
| int abbv = 0;
|
| - int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
|
| + int wl = 0;
|
|
|
| - if (wl == 0) return 1;
|
| + // input conversion
|
| + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
|
| + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
|
| + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
|
|
|
| - if (info) *info = 0;
|
| + int info2 = 0;
|
| + if (wl == 0 || maxdic == 0) return 1;
|
| if (root) *root = NULL;
|
|
|
| - // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.)
|
| + // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
|
| enum { NBEGIN, NNUM, NSEP };
|
| int nstate = NBEGIN;
|
| int i;
|
| @@ -407,173 +399,179 @@
|
| } else break;
|
| }
|
| if ((i == wl) && (nstate == NNUM)) return 1;
|
| + if (!info) info = &info2; else *info = 0;
|
|
|
| - // LANG_hu section: number(s) + (percent or degree) with suffixes
|
| - if (langnum == LANG_hu) {
|
| - if ((nstate == NNUM) && ((cw[i] == '%') || (cw[i] == '\xb0'))
|
| - && checkword(cw + i, info, root)) return 1;
|
| - }
|
| - // END of LANG_hu section
|
| -
|
| switch(captype) {
|
| - case HUHCAP:
|
| - case HUHINITCAP:
|
| - case NOCAP: {
|
| - rv = checkword(cw, info, root);
|
| - if ((abbv) && !(rv)) {
|
| - memcpy(wspace,cw,wl);
|
| - *(wspace+wl) = '.';
|
| - *(wspace+wl+1) = '\0';
|
| - rv = checkword(wspace, info, root);
|
| - }
|
| - break;
|
| - }
|
| + case HUHCAP:
|
| + case HUHINITCAP:
|
| + case NOCAP: {
|
| + rv = checkword(cw, info, root);
|
| + if ((abbv) && !(rv)) {
|
| + memcpy(wspace,cw,wl);
|
| + *(wspace+wl) = '.';
|
| + *(wspace+wl+1) = '\0';
|
| + rv = checkword(wspace, info, root);
|
| + }
|
| + break;
|
| + }
|
| case ALLCAP: {
|
| - rv = checkword(cw, info, root);
|
| - if (rv) break;
|
| - if (abbv) {
|
| - memcpy(wspace,cw,wl);
|
| - *(wspace+wl) = '.';
|
| - *(wspace+wl+1) = '\0';
|
| - rv = checkword(wspace, info, root);
|
| - if (rv) break;
|
| - }
|
| - if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
|
| - char tmpword[MAXWORDUTF8LEN];
|
| - wl = mkallsmall2(cw, unicw, nc);
|
| - memcpy(wspace,cw,(wl+1));
|
| + rv = checkword(cw, info, root);
|
| + if (rv) break;
|
| + if (abbv) {
|
| + memcpy(wspace,cw,wl);
|
| + *(wspace+wl) = '.';
|
| + *(wspace+wl+1) = '\0';
|
| + rv = checkword(wspace, info, root);
|
| + if (rv) break;
|
| + }
|
| + // Spec. prefix handling for Catalan, French, Italian:
|
| + // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
|
| + if (pAMgr && strchr(cw, '\'')) {
|
| + wl = mkallsmall2(cw, unicw, nc);
|
| + char * apostrophe = strchr(cw, '\'');
|
| + if (utf8) {
|
| + w_char tmpword[MAXWORDLEN];
|
| + *apostrophe = '\0';
|
| + wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
|
| + *apostrophe = '\'';
|
| + if (wl2 < nc) {
|
| + mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
|
| + rv = checkword(cw, info, root);
|
| + if (rv) break;
|
| + }
|
| + } else {
|
| + mkinitcap2(apostrophe + 1, unicw, nc);
|
| + rv = checkword(cw, info, root);
|
| + if (rv) break;
|
| + }
|
| + mkinitcap2(cw, unicw, nc);
|
| + rv = checkword(cw, info, root);
|
| + if (rv) break;
|
| + }
|
| + if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
|
| + char tmpword[MAXWORDUTF8LEN];
|
| + wl = mkallsmall2(cw, unicw, nc);
|
| + memcpy(wspace,cw,(wl+1));
|
| + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
|
| + if (!rv) {
|
| + wl2 = mkinitcap2(cw, unicw, nc);
|
| + rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
|
| + }
|
| + if ((abbv) && !(rv)) {
|
| + *(wspace+wl) = '.';
|
| + *(wspace+wl+1) = '\0';
|
| + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
|
| + if (!rv) {
|
| + memcpy(wspace, cw, wl2);
|
| + *(wspace+wl2) = '.';
|
| + *(wspace+wl2+1) = '\0';
|
| rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
|
| - if (!rv) {
|
| - wl2 = mkinitcap2(cw, unicw, nc);
|
| - rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
|
| - }
|
| - if ((abbv) && !(rv)) {
|
| - *(wspace+wl) = '.';
|
| - *(wspace+wl+1) = '\0';
|
| - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
|
| - if (!rv) {
|
| - memcpy(wspace, cw, wl2);
|
| - *(wspace+wl2) = '.';
|
| - *(wspace+wl2+1) = '\0';
|
| - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
|
| - }
|
| - }
|
| - if (rv) break;
|
| }
|
| }
|
| - case INITCAP: {
|
| - wl = mkallsmall2(cw, unicw, nc);
|
| - memcpy(wspace,cw,(wl+1));
|
| - rv = checkword(wspace, info, root);
|
| - if (!rv || (is_keepcase(rv) && !((captype == INITCAP) &&
|
| - // if CHECKSHARPS: KEEPCASE words with \xdf are allowed
|
| - // in INITCAP form, too.
|
| - pAMgr->get_checksharps() && ((utf8 && strstr(wspace, "\xc3\x9f")) ||
|
| - (!utf8 && strchr(wspace, '\xdf')))))) {
|
| - wl2 = mkinitcap2(cw, unicw, nc);
|
| - rv = checkword(cw, info, root);
|
| - if (rv && (captype == ALLCAP) && is_keepcase(rv)) rv = NULL;
|
| - }
|
| - if (abbv && !rv) {
|
| - *(wspace+wl) = '.';
|
| - *(wspace+wl+1) = '\0';
|
| - rv = checkword(wspace, info, root);
|
| - if (!rv || is_keepcase(rv)) {
|
| - memcpy(wspace, cw, wl2);
|
| - *(wspace+wl2) = '.';
|
| - *(wspace+wl2+1) = '\0';
|
| - rv = checkword(wspace, info, root);
|
| - if (rv && ((captype == ALLCAP) && is_keepcase(rv))) rv = NULL;
|
| - }
|
| - }
|
| - break;
|
| - }
|
| + if (rv) break;
|
| + }
|
| + }
|
| + case INITCAP: {
|
| + wl = mkallsmall2(cw, unicw, nc);
|
| + memcpy(wspace,cw,(wl+1));
|
| + wl2 = mkinitcap2(cw, unicw, nc);
|
| + if (captype == INITCAP) *info += SPELL_INITCAP;
|
| + rv = checkword(cw, info, root);
|
| + if (captype == INITCAP) *info -= SPELL_INITCAP;
|
| + // forbid bad capitalization
|
| + // (for example, ijs -> Ijs instead of IJs in Dutch)
|
| + // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
|
| + if (*info & SPELL_FORBIDDEN) {
|
| + rv = NULL;
|
| + break;
|
| + }
|
| + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
|
| + if (rv) break;
|
| +
|
| + rv = checkword(wspace, info, root);
|
| + if (abbv && !rv) {
|
| +
|
| + *(wspace+wl) = '.';
|
| + *(wspace+wl+1) = '\0';
|
| + rv = checkword(wspace, info, root);
|
| + if (!rv) {
|
| + memcpy(wspace, cw, wl2);
|
| + *(wspace+wl2) = '.';
|
| + *(wspace+wl2+1) = '\0';
|
| + if (captype == INITCAP) *info += SPELL_INITCAP;
|
| + rv = checkword(wspace, info, root);
|
| + if (captype == INITCAP) *info -= SPELL_INITCAP;
|
| + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
|
| + break;
|
| + }
|
| + }
|
| + if (rv && is_keepcase(rv) &&
|
| + ((captype == ALLCAP) ||
|
| + // if CHECKSHARPS: KEEPCASE words with \xDF are allowed
|
| + // in INITCAP form, too.
|
| + !(pAMgr->get_checksharps() &&
|
| + ((utf8 && strstr(wspace, "\xC3\x9F")) ||
|
| + (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL;
|
| + break;
|
| + }
|
| }
|
| -
|
| +
|
| if (rv) return 1;
|
|
|
| - // recursive breaking at break points (not good for morphological analysis)
|
| + // recursive breaking at break points
|
| if (wordbreak) {
|
| char * s;
|
| char r;
|
| - for (int j = 0; j < pAMgr->get_numbreak(); j++) {
|
| + int corr = 0;
|
| + wl = strlen(cw);
|
| + int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;
|
| + // check boundary patterns (^begin and end$)
|
| + for (int j = 0; j < numbreak; j++) {
|
| + int plen = strlen(wordbreak[j]);
|
| + if (plen == 1 || plen > wl) continue;
|
| + if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0
|
| + && spell(cw + plen - 1)) return 1;
|
| + if (wordbreak[j][plen - 1] == '$' &&
|
| + strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
|
| + r = cw[wl - plen + 1];
|
| + cw[wl - plen + 1] = '\0';
|
| + if (spell(cw)) return 1;
|
| + cw[wl - plen + 1] = r;
|
| + }
|
| + }
|
| + // other patterns
|
| + for (int j = 0; j < numbreak; j++) {
|
| + int result = 0;
|
| + int plen = strlen(wordbreak[j]);
|
| s=(char *) strstr(cw, wordbreak[j]);
|
| - if (s) {
|
| + if (s && (s > cw) && (s < cw + wl - plen)) {
|
| + if (!spell(s + plen)) continue;
|
| r = *s;
|
| *s = '\0';
|
| // examine 2 sides of the break point
|
| - if (spell(cw) && spell(s + strlen(wordbreak[j]))) {
|
| - *s = r;
|
| - return 1;
|
| - }
|
| + if (spell(cw)) return 1;
|
| *s = r;
|
| +
|
| + // LANG_hu: spec. dash rule
|
| + if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
|
| + r = s[1];
|
| + s[1] = '\0';
|
| + if (spell(cw)) return 1; // check the first part with dash
|
| + s[1] = r;
|
| + }
|
| + // end of LANG specific region
|
| +
|
| }
|
| }
|
| }
|
|
|
| - // LANG_hu: compoundings with dashes and n-dashes XXX deprecated!
|
| - if (langnum == LANG_hu) {
|
| - int n;
|
| - // compound word with dash (HU) I18n
|
| - char * dash;
|
| - int result = 0;
|
| - // n-dash
|
| - dash = (char *) strstr(cw,"\xe2\x80\x93");
|
| - if (dash && !wordbreak) {
|
| - *dash = '\0';
|
| - // examine 2 sides of the dash
|
| - if (spell(cw) && spell(dash + 3)) {
|
| - *dash = '\xe2';
|
| - return 1;
|
| - }
|
| - *dash = '\xe2';
|
| - }
|
| - dash = (char *) strchr(cw,'-');
|
| - if (dash) {
|
| - *dash='\0';
|
| - // examine 2 sides of the dash
|
| - if (dash[1] == '\0') { // base word ending with dash
|
| - if (spell(cw)) return 1;
|
| - } else {
|
| - // first word ending with dash: word-
|
| - char r2 = *(dash + 1);
|
| - dash[0]='-';
|
| - dash[1]='\0';
|
| - result = spell(cw);
|
| - dash[1] = r2;
|
| - dash[0]='\0';
|
| - if (result && spell(dash+1) && ((strlen(dash+1) > 1) || (dash[1] == 'e') ||
|
| - ((dash[1] > '0') && (dash[1] < '9')))) return 1;
|
| - }
|
| - // affixed number in correct word
|
| - if (result && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)>='.'))) {
|
| - *dash='-';
|
| - n = 1;
|
| - if (*(dash - n) == '.') n++;
|
| - // search first not a number character to left from dash
|
| - while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) {
|
| - n++;
|
| - }
|
| - if ((dash - n) < cw) n--;
|
| - // numbers: deprecated
|
| - for(; n >= 1; n--) {
|
| - if ((*(dash - n) >= '0') && (*(dash - n) <= '9') &&
|
| - checkword(dash - n, info, root)) return 1;
|
| - }
|
| - }
|
| - }
|
| - }
|
| return 0;
|
| }
|
|
|
| -//int Hunspell::spell(const char * word) {
|
| -// return spell(word, NULL, NULL);
|
| -//}
|
| -
|
| struct hentry * Hunspell::checkword(const char * w, int * info, char ** root)
|
| {
|
| struct hentry * he = NULL;
|
| - int len;
|
| + int len, i;
|
| char w2[MAXWORDUTF8LEN];
|
| const char * word;
|
|
|
| @@ -600,26 +598,29 @@
|
| }
|
|
|
| // look word in hash table
|
| - if (pHMgr) he = pHMgr->lookup(word);
|
| + for (i = 0; (i < maxdic) && !he; i ++) {
|
| + he = (pHMgr[i])->lookup(word);
|
|
|
| // check forbidden and onlyincompound words
|
| if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
|
| - info += SPELL_FORBIDDEN;
|
| + if (info) *info += SPELL_FORBIDDEN;
|
| // LANG_hu section: set dash information for suggestions
|
| if (langnum == LANG_hu) {
|
| if (pAMgr->get_compoundflag() &&
|
| TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
|
| - info += SPELL_COMPOUND;
|
| + if (info) *info += SPELL_COMPOUND;
|
| }
|
| }
|
| return NULL;
|
| }
|
|
|
| - // he = next not pseudoroot and not onlyincompound homonym or NULL
|
| + // he = next homonym that is not needaffix, not onlyincompound and (when checking an INITCAP form) not onlyupcase, or NULL
|
| while (he && (he->astr) &&
|
| - ((pAMgr->get_pseudoroot() && TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen)) ||
|
| - (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen))
|
| + ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
|
| + (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
|
| + (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))
|
| )) he = he->next_homonym;
|
| + }
|
|
|
| // check with affixes
|
| if (!he && pAMgr) {
|
| @@ -627,38 +628,42 @@
|
| len = strlen(word);
|
| he = pAMgr->affix_check(word, len, 0);
|
|
|
| - // check compound restriction
|
| - if (he && he->astr && pAMgr->get_onlyincompound() &&
|
| - TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) he = NULL;
|
| + // check compound restriction and onlyupcase
|
| + if (he && he->astr && (
|
| + (pAMgr->get_onlyincompound() &&
|
| + TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
|
| + (info && (*info & SPELL_INITCAP) &&
|
| + TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
|
| + he = NULL;
|
| + }
|
|
|
| if (he) {
|
| if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
|
| - info += SPELL_FORBIDDEN;
|
| + if (info) *info += SPELL_FORBIDDEN;
|
| return NULL;
|
| }
|
| if (root) {
|
| - *root = mystrdup(he->word);
|
| - if (complexprefixes) {
|
| + *root = mystrdup(&(he->word));
|
| + if (*root && complexprefixes) {
|
| if (utf8) reverseword_utf(*root); else reverseword(*root);
|
| }
|
| }
|
| // try check compound word
|
| } else if (pAMgr->get_compound()) {
|
| - he = pAMgr->compound_check(word, len,
|
| - 0,0,100,0,NULL,0,NULL,NULL,0);
|
| + he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0);
|
| // LANG_hu section: `moving rule' with last dash
|
| - if ((!he) && (langnum == LANG_hu) && (word[len-1]=='-')) {
|
| + if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) {
|
| char * dup = mystrdup(word);
|
| + if (!dup) return NULL;
|
| dup[len-1] = '\0';
|
| - he = pAMgr->compound_check(dup, len-1,
|
| - -5,0,100,0,NULL,1,NULL,NULL,0);
|
| + he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0);
|
| free(dup);
|
| }
|
| - // end of LANG speficic region
|
| + // end of LANG specific region
|
| if (he) {
|
| if (root) {
|
| - *root = mystrdup(he->word);
|
| - if (complexprefixes) {
|
| + *root = mystrdup(&(he->word));
|
| + if (*root && complexprefixes) {
|
| if (utf8) reverseword_utf(*root); else reverseword(*root);
|
| }
|
| }
|
| @@ -674,12 +679,18 @@
|
| int Hunspell::suggest(char*** slst, const char * word)
|
| {
|
| #ifdef HUNSPELL_CHROME_CLIENT
|
| - if (pHMgr) pHMgr->EmptyHentryCache();
|
| + if (pHMgr) pHMgr[0]->EmptyHentryCache();
|
| #endif
|
| - char cw[MAXWORDUTF8LEN + 4];
|
| - char wspace[MAXWORDUTF8LEN + 4];
|
| - if (! pSMgr) return 0;
|
| - w_char unicw[MAXWORDLEN + 1];
|
| + int onlycmpdsug = 0;
|
| + char cw[MAXWORDUTF8LEN];
|
| + char wspace[MAXWORDUTF8LEN];
|
| + if (!pSMgr || maxdic == 0) return 0;
|
| + w_char unicw[MAXWORDLEN];
|
| + *slst = NULL;
|
| + // process XML input of the simplified API (see manual)
|
| + if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
|
| + return spellml(slst, word);
|
| + }
|
| int nc = strlen(word);
|
| if (utf8) {
|
| if (nc >= MAXWORDUTF8LEN) return 0;
|
| @@ -688,49 +699,73 @@
|
| }
|
| int captype = 0;
|
| int abbv = 0;
|
| - int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
|
| + int wl = 0;
|
| +
|
| + // input conversion
|
| + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
|
| + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
|
| + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
|
| +
|
| if (wl == 0) return 0;
|
| int ns = 0;
|
| - *slst = NULL;
|
| int capwords = 0;
|
| - int ngramsugs = 0;
|
|
|
| switch(captype) {
|
| - case NOCAP: {
|
| - ns = pSMgr->suggest(slst, cw, ns);
|
| + case NOCAP: {
|
| + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
|
| break;
|
| }
|
|
|
| - case INITCAP: {
|
| + case INITCAP: {
|
| capwords = 1;
|
| - ns = pSMgr->suggest(slst, cw, ns);
|
| + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
|
| if (ns == -1) break;
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall2(wspace, unicw, nc);
|
| - ns = pSMgr->suggest(slst, wspace, ns);
|
| + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
|
| break;
|
| }
|
| case HUHINITCAP:
|
| capwords = 1;
|
| - case HUHCAP: {
|
| - ns = pSMgr->suggest(slst, cw, ns);
|
| + case HUHCAP: {
|
| + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
|
| if (ns != -1) {
|
| int prevns;
|
| + // something.The -> something. The
|
| + char * dot = strchr(cw, '.');
|
| + if (dot && (dot > cw)) {
|
| + int captype_;
|
| + if (utf8) {
|
| + w_char w_[MAXWORDLEN];
|
| + int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
|
| + captype_ = get_captype_utf8(w_, wl_, langnum);
|
| + } else captype_ = get_captype(dot+1, strlen(dot+1), csconv);
|
| + if (captype_ == INITCAP) {
|
| + char * st = mystrdup(cw);
|
| + if (st) st = (char *) realloc(st, wl + 2);
|
| + if (st) {
|
| + st[(dot - cw) + 1] = ' ';
|
| + strcpy(st + (dot - cw) + 2, dot + 1);
|
| + ns = insert_sug(slst, st, ns);
|
| + free(st);
|
| + }
|
| + }
|
| + }
|
| if (captype == HUHINITCAP) {
|
| // TheOpenOffice.org -> The OpenOffice.org
|
| memcpy(wspace,cw,(wl+1));
|
| mkinitsmall2(wspace, unicw, nc);
|
| - ns = pSMgr->suggest(slst, wspace, ns);
|
| + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
|
| }
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall2(wspace, unicw, nc);
|
| - insert_sug(slst, wspace, &ns);
|
| + if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
|
| prevns = ns;
|
| - ns = pSMgr->suggest(slst, wspace, ns);
|
| + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
|
| if (captype == HUHINITCAP) {
|
| mkinitcap2(wspace, unicw, nc);
|
| - insert_sug(slst, wspace, &ns);
|
| - ns = pSMgr->suggest(slst, wspace, ns);
|
| + if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
|
| + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
|
| }
|
| // aNew -> "a New" (instead of "a new")
|
| for (int j = prevns; j < ns; j++) {
|
| @@ -739,7 +774,7 @@
|
| int slen = strlen(space + 1);
|
| // different case after space (need capitalisation)
|
| if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
|
| - w_char w[MAXWORDLEN + 1];
|
| + w_char w[MAXWORDLEN];
|
| int wc = 0;
|
| char * r = (*slst)[j];
|
| if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1);
|
| @@ -754,31 +789,32 @@
|
| break;
|
| }
|
|
|
| - case ALLCAP: {
|
| + case ALLCAP: {
|
| memcpy(wspace, cw, (wl+1));
|
| mkallsmall2(wspace, unicw, nc);
|
| - ns = pSMgr->suggest(slst, wspace, ns);
|
| + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
|
| if (ns == -1) break;
|
| - if (pAMgr && pAMgr->get_keepcase()) insert_sug(slst, wspace, &ns);
|
| + if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
|
| + ns = insert_sug(slst, wspace, ns);
|
| mkinitcap2(wspace, unicw, nc);
|
| - ns = pSMgr->suggest(slst, wspace, ns);
|
| + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
|
| for (int j=0; j < ns; j++) {
|
| mkallcap((*slst)[j]);
|
| if (pAMgr && pAMgr->get_checksharps()) {
|
| char * pos;
|
| if (utf8) {
|
| - pos = strstr((*slst)[j], "\xc3\x9f");
|
| + pos = strstr((*slst)[j], "\xC3\x9F");
|
| while (pos) {
|
| *pos = 'S';
|
| *(pos+1) = 'S';
|
| - pos = strstr(pos+2, "\xc3\x9f");
|
| + pos = strstr(pos+2, "\xC3\x9F");
|
| }
|
| } else {
|
| - pos = strchr((*slst)[j], '\xdf');
|
| + pos = strchr((*slst)[j], '\xDF');
|
| while (pos) {
|
| (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2);
|
| - mystrrep((*slst)[j], "\xdf", "SS");
|
| - pos = strchr((*slst)[j], '\xdf');
|
| + mystrrep((*slst)[j], "\xDF", "SS");
|
| + pos = strchr((*slst)[j], '\xDF');
|
| }
|
| }
|
| }
|
| @@ -807,37 +843,76 @@
|
| // END OF LANG_hu section
|
|
|
| // try ngram approach since found nothing
|
| - if ((ns == 0) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) {
|
| - ngramsugs = 1;
|
| + if ((ns == 0 || onlycmpdsug) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) {
|
| switch(captype) {
|
| case NOCAP: {
|
| - ns = pSMgr->ngsuggest(*slst, cw, pHMgr);
|
| + ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
|
| break;
|
| }
|
| + case HUHINITCAP:
|
| + capwords = 1;
|
| case HUHCAP: {
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall2(wspace, unicw, nc);
|
| - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr);
|
| - break;
|
| + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
|
| + break;
|
| }
|
| - case INITCAP: {
|
| + case INITCAP: {
|
| capwords = 1;
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall2(wspace, unicw, nc);
|
| - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr);
|
| + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
|
| break;
|
| }
|
| case ALLCAP: {
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall2(wspace, unicw, nc);
|
| - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr);
|
| - for (int j=0; j < ns; j++)
|
| + int oldns = ns;
|
| + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
|
| + for (int j = oldns; j < ns; j++)
|
| mkallcap((*slst)[j]);
|
| break;
|
| }
|
| }
|
| }
|
|
|
| + // try dash suggestion (Afo-American -> Afro-American)
|
| + if (strchr(cw, '-')) {
|
| + char * pos = strchr(cw, '-');
|
| + char * ppos = cw;
|
| + int nodashsug = 1;
|
| + char ** nlst = NULL;
|
| + int nn = 0;
|
| + int last = 0;
|
| + for (int j = 0; j < ns && nodashsug == 1; j++) {
|
| + if (strchr((*slst)[j], '-')) nodashsug = 0;
|
| + }
|
| + while (nodashsug && !last) {
|
| + if (*pos == '\0') last = 1; else *pos = '\0';
|
| + if (!spell(ppos)) {
|
| + nn = suggest(&nlst, ppos);
|
| + for (int j = nn - 1; j >= 0; j--) {
|
| + strncpy(wspace, cw, ppos - cw);
|
| + strcpy(wspace + (ppos - cw), nlst[j]);
|
| + if (!last) {
|
| + strcat(wspace, "-");
|
| + strcat(wspace, pos + 1);
|
| + }
|
| + ns = insert_sug(slst, wspace, ns);
|
| + free(nlst[j]);
|
| + }
|
| + if (nlst != NULL) free(nlst);
|
| + nodashsug = 0;
|
| + }
|
| + if (!last) {
|
| + *pos = '-';
|
| + ppos = pos + 1;
|
| + pos = strchr(ppos, '-');
|
| + }
|
| + if (!pos) pos = cw + strlen(cw);
|
| + }
|
| + }
|
| +
|
| // word reversing wrapper for complex prefixes
|
| if (complexprefixes) {
|
| for (int j = 0; j < ns; j++) {
|
| @@ -858,14 +933,14 @@
|
| }
|
| }
|
|
|
| - // suggest keepcase
|
| - if (pAMgr->get_keepcase()) {
|
| + // remove bad capitalized and forbidden forms
|
| + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
|
| switch (captype) {
|
| case INITCAP:
|
| case ALLCAP: {
|
| int l = 0;
|
| for (int j=0; j < ns; j++) {
|
| - if (!spell((*slst)[j])) {
|
| + if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) {
|
| char s[MAXSWUTF8L];
|
| w_char w[MAXSWL];
|
| int len;
|
| @@ -876,21 +951,21 @@
|
| len = strlen(s);
|
| }
|
| mkallsmall2(s, w, len);
|
| - free((*slst)[j]);
|
| + free((*slst)[j]);
|
| if (spell(s)) {
|
| (*slst)[l] = mystrdup(s);
|
| - l++;
|
| + if ((*slst)[l]) l++;
|
| } else {
|
| mkinitcap2(s, w, len);
|
| if (spell(s)) {
|
| (*slst)[l] = mystrdup(s);
|
| - l++;
|
| + if ((*slst)[l]) l++;
|
| }
|
| }
|
| } else {
|
| (*slst)[l] = (*slst)[j];
|
| l++;
|
| - }
|
| + }
|
| }
|
| ns = l;
|
| }
|
| @@ -909,9 +984,28 @@
|
| }
|
| l++;
|
| }
|
| +
|
| + // output conversion
|
| + rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
|
| + for (int j = 0; rl && j < ns; j++) {
|
| + if (rl->conv((*slst)[j], wspace)) {
|
| + free((*slst)[j]);
|
| + (*slst)[j] = mystrdup(wspace);
|
| + }
|
| + }
|
| +
|
| + // if suggestions removed by nosuggest, onlyincompound parameters
|
| + if (l == 0 && *slst) {
|
| + free(*slst);
|
| + *slst = NULL;
|
| + }
|
| return l;
|
| }
|
|
|
| +void Hunspell::free_list(char *** slst, int n) {
|
| + freelist(slst, n);
|
| +}
|
| +
|
| char * Hunspell::get_dic_encoding()
|
| {
|
| return encoding;
|
| @@ -921,9 +1015,9 @@
|
| // XXX need UTF-8 support
|
| int Hunspell::suggest_auto(char*** slst, const char * word)
|
| {
|
| - char cw[MAXWORDUTF8LEN + 4];
|
| - char wspace[MAXWORDUTF8LEN + 4];
|
| - if (! pSMgr) return 0;
|
| + char cw[MAXWORDUTF8LEN];
|
| + char wspace[MAXWORDUTF8LEN];
|
| + if (!pSMgr || maxdic == 0) return 0;
|
| int wl = strlen(word);
|
| if (utf8) {
|
| if (wl >= MAXWORDUTF8LEN) return 0;
|
| @@ -936,15 +1030,15 @@
|
| if (wl == 0) return 0;
|
| int ns = 0;
|
| *slst = NULL; // HU, nsug in pSMgr->suggest
|
| -
|
| +
|
| switch(captype) {
|
| - case NOCAP: {
|
| + case NOCAP: {
|
| ns = pSMgr->suggest_auto(slst, cw, ns);
|
| if (ns>0) break;
|
| break;
|
| }
|
|
|
| - case INITCAP: {
|
| + case INITCAP: {
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall(wspace);
|
| ns = pSMgr->suggest_auto(slst, wspace, ns);
|
| @@ -952,10 +1046,11 @@
|
| mkinitcap((*slst)[j]);
|
| ns = pSMgr->suggest_auto(slst, cw, ns);
|
| break;
|
| -
|
| +
|
| }
|
|
|
| - case HUHCAP: {
|
| + case HUHINITCAP:
|
| + case HUHCAP: {
|
| ns = pSMgr->suggest_auto(slst, cw, ns);
|
| if (ns == 0) {
|
| memcpy(wspace,cw,(wl+1));
|
| @@ -965,7 +1060,7 @@
|
| break;
|
| }
|
|
|
| - case ALLCAP: {
|
| + case ALLCAP: {
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall(wspace);
|
| ns = pSMgr->suggest_auto(slst, wspace, ns);
|
| @@ -1011,103 +1106,89 @@
|
| }
|
| }
|
| }
|
| - // END OF LANG_hu section
|
| + // END OF LANG_hu section
|
| return ns;
|
| }
|
| +#endif
|
|
|
| -// XXX need UTF-8 support
|
| -int Hunspell::stem(char*** slst, const char * word)
|
| +int Hunspell::stem(char*** slst, char ** desc, int n)
|
| {
|
| - char cw[MAXWORDUTF8LEN + 4];
|
| - char wspace[MAXWORDUTF8LEN + 4];
|
| - if (! pSMgr) return 0;
|
| - int wl = strlen(word);
|
| - if (utf8) {
|
| - if (wl >= MAXWORDUTF8LEN) return 0;
|
| - } else {
|
| - if (wl >= MAXWORDLEN) return 0;
|
| + char result[MAXLNLEN];
|
| + char result2[MAXLNLEN];
|
| + *slst = NULL;
|
| + if (n == 0) return 0;
|
| + *result2 = '\0';
|
| + for (int i = 0; i < n; i++) {
|
| + *result = '\0';
|
| + // add compound word parts (except the last one)
|
| + char * s = (char *) desc[i];
|
| + char * part = strstr(s, MORPH_PART);
|
| + if (part) {
|
| + char * nextpart = strstr(part + 1, MORPH_PART);
|
| + while (nextpart) {
|
| + copy_field(result + strlen(result), part, MORPH_PART);
|
| + part = nextpart;
|
| + nextpart = strstr(part + 1, MORPH_PART);
|
| + }
|
| + s = part;
|
| + }
|
| +
|
| + char **pl;
|
| + char tok[MAXLNLEN];
|
| + strcpy(tok, s);
|
| + char * alt = strstr(tok, " | ");
|
| + while (alt) {
|
| + alt[1] = MSEP_ALT;
|
| + alt = strstr(alt, " | ");
|
| + }
|
| + int pln = line_tok(tok, &pl, MSEP_ALT);
|
| + for (int k = 0; k < pln; k++) {
|
| + // add derivational suffixes
|
| + if (strstr(pl[k], MORPH_DERI_SFX)) {
|
| + // remove inflectional suffixes
|
| + char * is = strstr(pl[k], MORPH_INFL_SFX);
|
| + if (is) *is = '\0';
|
| + char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]);
|
| + if (sg) {
|
| + char ** gen;
|
| + int genl = line_tok(sg, &gen, MSEP_REC);
|
| + free(sg);
|
| + for (int j = 0; j < genl; j++) {
|
| + sprintf(result2 + strlen(result2), "%c%s%s",
|
| + MSEP_REC, result, gen[j]);
|
| + }
|
| + freelist(&gen, genl);
|
| + }
|
| + } else {
|
| + sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
|
| + if (strstr(pl[k], MORPH_SURF_PFX)) {
|
| + copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
|
| + }
|
| + copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
|
| + }
|
| + }
|
| + freelist(&pl, pln);
|
| }
|
| - int captype = 0;
|
| - int abbv = 0;
|
| - wl = cleanword(cw, word, &captype, &abbv);
|
| - if (wl == 0) return 0;
|
| -
|
| - int ns = 0;
|
| + int sln = line_tok(result2, slst, MSEP_REC);
|
| + return uniqlist(*slst, sln);
|
|
|
| - *slst = NULL; // HU, nsug in pSMgr->suggest
|
| -
|
| - switch(captype) {
|
| - case HUHCAP:
|
| - case NOCAP: {
|
| - ns = pSMgr->suggest_stems(slst, cw, ns);
|
| +}
|
|
|
| - if ((abbv) && (ns == 0)) {
|
| - memcpy(wspace,cw,wl);
|
| - *(wspace+wl) = '.';
|
| - *(wspace+wl+1) = '\0';
|
| - ns = pSMgr->suggest_stems(slst, wspace, ns);
|
| - }
|
| -
|
| - break;
|
| - }
|
| -
|
| - case INITCAP: {
|
| -
|
| - ns = pSMgr->suggest_stems(slst, cw, ns);
|
| -
|
| - if (ns == 0) {
|
| - memcpy(wspace,cw,(wl+1));
|
| - mkallsmall(wspace);
|
| - ns = pSMgr->suggest_stems(slst, wspace, ns);
|
| -
|
| - }
|
| -
|
| - if ((abbv) && (ns == 0)) {
|
| - memcpy(wspace,cw,wl);
|
| - mkallsmall(wspace);
|
| - *(wspace+wl) = '.';
|
| - *(wspace+wl+1) = '\0';
|
| - ns = pSMgr->suggest_stems(slst, wspace, ns);
|
| - }
|
| -
|
| - break;
|
| -
|
| - }
|
| -
|
| - case ALLCAP: {
|
| - ns = pSMgr->suggest_stems(slst, cw, ns);
|
| - if (ns != 0) break;
|
| -
|
| - memcpy(wspace,cw,(wl+1));
|
| - mkallsmall(wspace);
|
| - ns = pSMgr->suggest_stems(slst, wspace, ns);
|
| -
|
| - if (ns == 0) {
|
| - mkinitcap(wspace);
|
| - ns = pSMgr->suggest_stems(slst, wspace, ns);
|
| - }
|
| -
|
| - if ((abbv) && (ns == 0)) {
|
| - memcpy(wspace,cw,wl);
|
| - mkallsmall(wspace);
|
| - *(wspace+wl) = '.';
|
| - *(wspace+wl+1) = '\0';
|
| - ns = pSMgr->suggest_stems(slst, wspace, ns);
|
| - }
|
| -
|
| -
|
| - break;
|
| - }
|
| - }
|
| -
|
| - return ns;
|
| +int Hunspell::stem(char*** slst, const char * word)
|
| +{
|
| + char ** pl;
|
| + int pln = analyze(&pl, word);
|
| + int pln2 = stem(slst, pl, pln);
|
| + freelist(&pl, pln);
|
| + return pln2;
|
| }
|
|
|
| +#ifdef HUNSPELL_EXPERIMENTAL
|
| int Hunspell::suggest_pos_stems(char*** slst, const char * word)
|
| {
|
| - char cw[MAXWORDUTF8LEN + 4];
|
| - char wspace[MAXWORDUTF8LEN + 4];
|
| - if (! pSMgr) return 0;
|
| + char cw[MAXWORDUTF8LEN];
|
| + char wspace[MAXWORDUTF8LEN];
|
| + if (! pSMgr || maxdic == 0) return 0;
|
| int wl = strlen(word);
|
| if (utf8) {
|
| if (wl >= MAXWORDUTF8LEN) return 0;
|
| @@ -1118,14 +1199,14 @@
|
| int abbv = 0;
|
| wl = cleanword(cw, word, &captype, &abbv);
|
| if (wl == 0) return 0;
|
| -
|
| +
|
| int ns = 0; // ns=0 = normalized input
|
|
|
| *slst = NULL; // HU, nsug in pSMgr->suggest
|
| -
|
| +
|
| switch(captype) {
|
| case HUHCAP:
|
| - case NOCAP: {
|
| + case NOCAP: {
|
| ns = pSMgr->suggest_pos_stems(slst, cw, ns);
|
|
|
| if ((abbv) && (ns == 0)) {
|
| @@ -1138,7 +1219,7 @@
|
| break;
|
| }
|
|
|
| - case INITCAP: {
|
| + case INITCAP: {
|
|
|
| ns = pSMgr->suggest_pos_stems(slst, cw, ns);
|
|
|
| @@ -1147,15 +1228,15 @@
|
| mkallsmall(wspace);
|
| ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
|
| }
|
| -
|
| +
|
| break;
|
| -
|
| +
|
| }
|
|
|
| - case ALLCAP: {
|
| + case ALLCAP: {
|
| ns = pSMgr->suggest_pos_stems(slst, cw, ns);
|
| if (ns != 0) break;
|
| -
|
| +
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall(wspace);
|
| ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
|
| @@ -1225,22 +1306,24 @@
|
| return nc;
|
| }
|
|
|
| -int Hunspell::put_word(const char * word)
|
| +int Hunspell::add(const char * word)
|
| {
|
| - if (pHMgr) {
|
| - return pHMgr->put_word(word, strlen(word), NULL);
|
| - }
|
| + if (pHMgr[0]) return (pHMgr[0])->add(word);
|
| return 0;
|
| }
|
|
|
| -int Hunspell::put_word_pattern(const char * word, const char * pattern)
|
| +int Hunspell::add_with_affix(const char * word, const char * example)
|
| {
|
| - if (pHMgr) {
|
| - return pHMgr->put_word_pattern(word, strlen(word), pattern);
|
| - }
|
| + if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example);
|
| return 0;
|
| }
|
|
|
| +int Hunspell::remove(const char * word)
|
| +{
|
| + if (pHMgr[0]) return (pHMgr[0])->remove(word);
|
| + return 0;
|
| +}
|
| +
|
| const char * Hunspell::get_version()
|
| {
|
| return pAMgr->get_version();
|
| @@ -1251,22 +1334,38 @@
|
| return csconv;
|
| }
|
|
|
| -#ifdef HUNSPELL_EXPERIMENTAL
|
| -// XXX need UTF-8 support
|
| -char * Hunspell::morph(const char * word)
|
| +void Hunspell::cat_result(char * result, char * st)
|
| {
|
| - char cw[MAXWORDUTF8LEN + 4];
|
| - char wspace[MAXWORDUTF8LEN + 4];
|
| - if (! pSMgr) return 0;
|
| - int wl = strlen(word);
|
| + if (st) {
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| + free(st);
|
| + }
|
| +}
|
| +
|
| +int Hunspell::analyze(char*** slst, const char * word)
|
| +{
|
| + char cw[MAXWORDUTF8LEN];
|
| + char wspace[MAXWORDUTF8LEN];
|
| + w_char unicw[MAXWORDLEN];
|
| + int wl2 = 0;
|
| + *slst = NULL;
|
| + if (! pSMgr || maxdic == 0) return 0;
|
| + int nc = strlen(word);
|
| if (utf8) {
|
| - if (wl >= MAXWORDUTF8LEN) return 0;
|
| + if (nc >= MAXWORDUTF8LEN) return 0;
|
| } else {
|
| - if (wl >= MAXWORDLEN) return 0;
|
| + if (nc >= MAXWORDLEN) return 0;
|
| }
|
| int captype = 0;
|
| int abbv = 0;
|
| - wl = cleanword(cw, word, &captype, &abbv);
|
| + int wl = 0;
|
| +
|
| + // input conversion
|
| + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
|
| + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
|
| + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
|
| +
|
| if (wl == 0) {
|
| if (abbv) {
|
| for (wl = 0; wl < abbv; wl++) cw[wl] = '.';
|
| @@ -1277,7 +1376,7 @@
|
|
|
| char result[MAXLNLEN];
|
| char * st = NULL;
|
| -
|
| +
|
| *result = '\0';
|
|
|
| int n = 0;
|
| @@ -1287,177 +1386,103 @@
|
| // test numbers
|
| // LANG_hu section: set dash information for suggestions
|
| if (langnum == LANG_hu) {
|
| - while ((n < wl) &&
|
| + while ((n < wl) &&
|
| (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
|
| n++;
|
| if ((cw[n] == '.') || (cw[n] == ',')) {
|
| - if (((n2 == 0) && (n > 3)) ||
|
| + if (((n2 == 0) && (n > 3)) ||
|
| ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break;
|
| n2++;
|
| n3 = n;
|
| }
|
| }
|
|
|
| - if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return NULL;
|
| - if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xb0')) && checkword(cw+n, NULL, NULL))) {
|
| - strcat(result, cw);
|
| + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0;
|
| + if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) {
|
| + mystrcat(result, cw, MAXLNLEN);
|
| result[n - 1] = '\0';
|
| - if (n == wl) {
|
| - st = pSMgr->suggest_morph(cw + n - 1);
|
| - if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - } else {
|
| + if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1));
|
| + else {
|
| char sign = cw[n];
|
| cw[n] = '\0';
|
| - st = pSMgr->suggest_morph(cw + n - 1);
|
| - if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - strcat(result, "+"); // XXX SPEC. MORPHCODE
|
| + cat_result(result, pSMgr->suggest_morph(cw + n - 1));
|
| + mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
|
| cw[n] = sign;
|
| - st = pSMgr->suggest_morph(cw + n);
|
| - if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| + cat_result(result, pSMgr->suggest_morph(cw + n));
|
| }
|
| - return mystrdup(result);
|
| + return line_tok(result, slst, MSEP_REC);
|
| }
|
| }
|
| // END OF LANG_hu section
|
| -
|
| +
|
| switch(captype) {
|
| - case NOCAP: {
|
| - st = pSMgr->suggest_morph(cw);
|
| - if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - if (abbv) {
|
| - memcpy(wspace,cw,wl);
|
| + case HUHCAP:
|
| + case HUHINITCAP:
|
| + case NOCAP: {
|
| + cat_result(result, pSMgr->suggest_morph(cw));
|
| + if (abbv) {
|
| + memcpy(wspace,cw,wl);
|
| + *(wspace+wl) = '.';
|
| + *(wspace+wl+1) = '\0';
|
| + cat_result(result, pSMgr->suggest_morph(wspace));
|
| + }
|
| + break;
|
| + }
|
| + case INITCAP: {
|
| + wl = mkallsmall2(cw, unicw, nc);
|
| + memcpy(wspace,cw,(wl+1));
|
| + wl2 = mkinitcap2(cw, unicw, nc);
|
| + cat_result(result, pSMgr->suggest_morph(wspace));
|
| + cat_result(result, pSMgr->suggest_morph(cw));
|
| + if (abbv) {
|
| *(wspace+wl) = '.';
|
| *(wspace+wl+1) = '\0';
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| + cat_result(result, pSMgr->suggest_morph(wspace));
|
| +
|
| + memcpy(wspace, cw, wl2);
|
| + *(wspace+wl2) = '.';
|
| + *(wspace+wl2+1) = '\0';
|
| +
|
| + cat_result(result, pSMgr->suggest_morph(wspace));
|
| }
|
| - break;
|
| + break;
|
| }
|
| - case INITCAP: {
|
| - memcpy(wspace,cw,(wl+1));
|
| - mkallsmall(wspace);
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - st = pSMgr->suggest_morph(cw);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - if (abbv) {
|
| - memcpy(wspace,cw,wl);
|
| + case ALLCAP: {
|
| + cat_result(result, pSMgr->suggest_morph(cw));
|
| + if (abbv) {
|
| + memcpy(wspace,cw,wl);
|
| *(wspace+wl) = '.';
|
| *(wspace+wl+1) = '\0';
|
| - mkallsmall(wspace);
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - mkinitcap(wspace);
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| + cat_result(result, pSMgr->suggest_morph(cw));
|
| }
|
| - break;
|
| - }
|
| - case HUHCAP: {
|
| - st = pSMgr->suggest_morph(cw);
|
| - if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| -#if 0
|
| + wl = mkallsmall2(cw, unicw, nc);
|
| memcpy(wspace,cw,(wl+1));
|
| - mkallsmall(wspace);
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| + wl2 = mkinitcap2(cw, unicw, nc);
|
| +
|
| + cat_result(result, pSMgr->suggest_morph(wspace));
|
| + cat_result(result, pSMgr->suggest_morph(cw));
|
| + if (abbv) {
|
| + *(wspace+wl) = '.';
|
| + *(wspace+wl+1) = '\0';
|
| + cat_result(result, pSMgr->suggest_morph(wspace));
|
| +
|
| + memcpy(wspace, cw, wl2);
|
| + *(wspace+wl2) = '.';
|
| + *(wspace+wl2+1) = '\0';
|
| +
|
| + cat_result(result, pSMgr->suggest_morph(wspace));
|
| }
|
| -#endif
|
| break;
|
| - }
|
| - case ALLCAP: {
|
| - memcpy(wspace,cw,(wl+1));
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - mkallsmall(wspace);
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - mkinitcap(wspace);
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - if (abbv) {
|
| - memcpy(wspace,cw,(wl+1));
|
| - *(wspace+wl) = '.';
|
| - *(wspace+wl+1) = '\0';
|
| - if (*result) strcat(result, "\n");
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - mkallsmall(wspace);
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - mkinitcap(wspace);
|
| - st = pSMgr->suggest_morph(wspace);
|
| - if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| - }
|
| - break;
|
| }
|
| }
|
|
|
| - if (result && (*result)) {
|
| + if (*result) {
|
| // word reversing wrapper for complex prefixes
|
| if (complexprefixes) {
|
| if (utf8) reverseword_utf(result); else reverseword(result);
|
| }
|
| - return mystrdup(result);
|
| + return line_tok(result, slst, MSEP_REC);
|
| +
|
| }
|
|
|
| // compound word with dash (HU) I18n
|
| @@ -1466,24 +1491,24 @@
|
| // LANG_hu section: set dash information for suggestions
|
| if (langnum == LANG_hu) dash = (char *) strchr(cw,'-');
|
| if ((langnum == LANG_hu) && dash) {
|
| - *dash='\0';
|
| + *dash='\0';
|
| // examine 2 sides of the dash
|
| if (dash[1] == '\0') { // base word ending with dash
|
| - if (spell(cw)) return pSMgr->suggest_morph(cw);
|
| + if (spell(cw)) return line_tok(pSMgr->suggest_morph(cw), slst, MSEP_REC);
|
| } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
|
| if (spell(cw) && (spell("-e"))) {
|
| st = pSMgr->suggest_morph(cw);
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - strcat(result,"+"); // XXX spec. separator in MORPHCODE
|
| + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
|
| st = pSMgr->suggest_morph("-e");
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - return mystrdup(result);
|
| + return line_tok(result, slst, MSEP_REC);
|
| }
|
| } else {
|
| // first word ending with dash: word- XXX ???
|
| @@ -1495,22 +1520,22 @@
|
| dash[0]='\0';
|
| if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) ||
|
| ((dash[1] > '0') && (dash[1] < '9')))) {
|
| - st = morph(cw);
|
| + st = pSMgr->suggest_morph(cw);
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| - strcat(result,"+"); // XXX spec. separator in MORPHCODE
|
| + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
|
| }
|
| - st = morph(dash+1);
|
| + st = pSMgr->suggest_morph(dash+1);
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - return mystrdup(result);
|
| + return line_tok(result, slst, MSEP_REC);
|
| }
|
| }
|
| // affixed number in correct word
|
| - if (nresult && (dash > cw) && (((*(dash-1)<='9') &&
|
| + if (nresult && (dash > cw) && (((*(dash-1)<='9') &&
|
| (*(dash-1)>='0')) || (*(dash-1)=='.'))) {
|
| *dash='-';
|
| n = 1;
|
| @@ -1525,195 +1550,338 @@
|
| // 56-hoz, 6-hoz
|
| for(; n >= 1; n--) {
|
| if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) {
|
| - strcat(result, cw);
|
| + mystrcat(result, cw, MAXLNLEN);
|
| result[dash - cw - n] = '\0';
|
| st = pSMgr->suggest_morph(dash - n);
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - return mystrdup(result);
|
| + return line_tok(result, slst, MSEP_REC);
|
| }
|
| }
|
| }
|
| }
|
| - return NULL;
|
| + return 0;
|
| }
|
|
|
| +int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln)
|
| +{
|
| + *slst = NULL;
|
| + if (!pSMgr || !pln) return 0;
|
| + char **pl2;
|
| + int pl2n = analyze(&pl2, word);
|
| + int captype = 0;
|
| + int abbv = 0;
|
| + char cw[MAXWORDUTF8LEN];
|
| + cleanword(cw, word, &captype, &abbv);
|
| + char result[MAXLNLEN];
|
| + *result = '\0';
|
| +
|
| + for (int i = 0; i < pln; i++) {
|
| + cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i]));
|
| + }
|
| + freelist(&pl2, pl2n);
|
| +
|
| + if (*result) {
|
| + // allcap
|
| + if (captype == ALLCAP) mkallcap(result);
|
| +
|
| + // line split
|
| + int linenum = line_tok(result, slst, MSEP_REC);
|
| +
|
| + // capitalize
|
| + if (captype == INITCAP || captype == HUHINITCAP) {
|
| + for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]);
|
| + }
|
| +
|
| + // temporary filtering of prefix related errors (eg.
|
| + // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
|
| +
|
| + int r = 0;
|
| + for (int j=0; j < linenum; j++) {
|
| + if (!spell((*slst)[j])) {
|
| + free((*slst)[j]);
|
| + (*slst)[j] = NULL;
|
| + } else {
|
| + if (r < j) (*slst)[r] = (*slst)[j];
|
| + r++;
|
| + }
|
| + }
|
| + if (r > 0) return r;
|
| + free(*slst);
|
| + *slst = NULL;
|
| + }
|
| + return 0;
|
| +}
|
| +
|
| +int Hunspell::generate(char*** slst, const char * word, const char * pattern)
|
| +{
|
| + char **pl;
|
| + int pln = analyze(&pl, pattern);
|
| + int n = generate(slst, word, pl, pln);
|
| + freelist(&pl, pln);
|
| + return uniqlist(*slst, n);
|
| +}
|
| +
|
| +// minimal XML parser functions
|
| +int Hunspell::get_xml_par(char * dest, const char * par, int max)
|
| +{
|
| + char * d = dest;
|
| + if (!par) return 0;
|
| + char end = *par;
|
| + char * dmax = dest + max;
|
| + if (end == '>') end = '<';
|
| + else if (end != '\'' && end != '"') return 0; // bad XML
|
| + for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par;
|
| + *d = '\0';
|
| + mystrrep(dest, "&lt;", "<");
|
| + mystrrep(dest, "&amp;", "&");
|
| + return d - dest;
|
| +}
|
| +
|
| +// return the beginning of the element (attr == NULL) or the attribute
|
| +const char * Hunspell::get_xml_pos(const char * s, const char * attr)
|
| +{
|
| + const char * end = strchr(s, '>');
|
| + const char * p = s;
|
| + if (attr == NULL) return end;
|
| + do {
|
| + p = strstr(p, attr);
|
| + if (!p || p >= end) return 0;
|
| + } while (*(p-1) != ' ' && *(p-1) != '\n');
|
| + return p + strlen(attr);
|
| +}
|
| +
|
| +int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) {
|
| + char cw[MAXWORDUTF8LEN];
|
| + if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) &&
|
| + strcmp(cw, value) == 0) return 1;
|
| + return 0;
|
| +}
|
| +
|
| +int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) {
|
| + int n = 0;
|
| + char * p;
|
| + if (!list) return 0;
|
| + for (p = list; (p = strstr(p, tag)); p++) n++;
|
| + if (n == 0) return 0;
|
| + *slst = (char **) malloc(sizeof(char *) * n);
|
| + if (!*slst) return 0;
|
| + for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) {
|
| + int l = strlen(p);
|
| + (*slst)[n] = (char *) malloc(l);
|
| + if (!(*slst)[n]) return (n > 0 ? n - 1 : 0);
|
| + get_xml_par((*slst)[n], p + strlen(tag) - 1, l);
|
| + }
|
| + return n;
|
| +}
|
| +
|
| +int Hunspell::spellml(char*** slst, const char * word)
|
| +{
|
| + char *q, *q2;
|
| + char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN];
|
| + q = (char *) strstr(word, "<query");
|
| + if (!q) return 0; // bad XML input
|
| + q2 = strchr(q, '>');
|
| + if (!q2) return 0; // bad XML input
|
| + q2 = strstr(q2, "<word");
|
| + if (!q2) return 0; // bad XML input
|
| + if (check_xml_par(q, "type=", "analyze")) {
|
| + int n = 0, s = 0;
|
| + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) n = analyze(slst, cw);
|
| + if (n == 0) return 0;
|
| + // convert the result to <code><a>ana1</a><a>ana2</a></code> format
|
| + for (int i = 0; i < n; i++) s+= strlen((*slst)[i]);
|
| + char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->&amp;
|
| + if (!r) return 0;
|
| + strcpy(r, "<code>");
|
| + for (int i = 0; i < n; i++) {
|
| + int l = strlen(r);
|
| + strcpy(r + l, "<a>");
|
| + strcpy(r + l + 3, (*slst)[i]);
|
| + mystrrep(r + l + 3, "\t", " ");
|
| + mystrrep(r + l + 3, "<", "&lt;");
|
| + mystrrep(r + l + 3, "&", "&amp;");
|
| + strcat(r, "</a>");
|
| + free((*slst)[i]);
|
| + }
|
| + strcat(r, "</code>");
|
| + (*slst)[0] = r;
|
| + return 1;
|
| + } else if (check_xml_par(q, "type=", "stem")) {
|
| + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) return stem(slst, cw);
|
| + } else if (check_xml_par(q, "type=", "generate")) {
|
| + int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN);
|
| + if (n == 0) return 0;
|
| + char * q3 = strstr(q2 + 1, "<word");
|
| + if (q3) {
|
| + if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN)) {
|
| + return generate(slst, cw, cw2);
|
| + }
|
| + } else {
|
| + char ** slst2;
|
| + if ((q2 = strstr(q2 + 1, "<code")) &&
|
| + (n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) {
|
| + int n2 = generate(slst, cw, slst2, n);
|
| + freelist(&slst2, n);
|
| + return uniqlist(*slst, n2);
|
| + }
|
| + }
|
| + }
|
| + return 0;
|
| +}
|
| +
|
| +
|
| +#ifdef HUNSPELL_EXPERIMENTAL
|
| // XXX need UTF-8 support
|
| char * Hunspell::morph_with_correction(const char * word)
|
| {
|
| - char cw[MAXWORDUTF8LEN + 4];
|
| - char wspace[MAXWORDUTF8LEN + 4];
|
| - if (! pSMgr) return 0;
|
| + char cw[MAXWORDUTF8LEN];
|
| + char wspace[MAXWORDUTF8LEN];
|
| + if (! pSMgr || maxdic == 0) return NULL;
|
| int wl = strlen(word);
|
| if (utf8) {
|
| - if (wl >= MAXWORDUTF8LEN) return 0;
|
| + if (wl >= MAXWORDUTF8LEN) return NULL;
|
| } else {
|
| - if (wl >= MAXWORDLEN) return 0;
|
| + if (wl >= MAXWORDLEN) return NULL;
|
| }
|
| int captype = 0;
|
| int abbv = 0;
|
| wl = cleanword(cw, word, &captype, &abbv);
|
| - if (wl == 0) return 0;
|
| + if (wl == 0) return NULL;
|
|
|
| char result[MAXLNLEN];
|
| char * st = NULL;
|
| -
|
| +
|
| *result = '\0';
|
| -
|
| -
|
| +
|
| +
|
| switch(captype) {
|
| - case NOCAP: {
|
| + case NOCAP: {
|
| st = pSMgr->suggest_morph_for_spelling_error(cw);
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - if (abbv) {
|
| - memcpy(wspace,cw,wl);
|
| + if (abbv) {
|
| + memcpy(wspace,cw,wl);
|
| *(wspace+wl) = '.';
|
| *(wspace+wl+1) = '\0';
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| }
|
| break;
|
| }
|
| - case INITCAP: {
|
| + case INITCAP: {
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall(wspace);
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| - }
|
| - st = pSMgr->suggest_morph_for_spelling_error(cw);
|
| + }
|
| + st = pSMgr->suggest_morph_for_spelling_error(cw);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - if (abbv) {
|
| - memcpy(wspace,cw,wl);
|
| + if (abbv) {
|
| + memcpy(wspace,cw,wl);
|
| *(wspace+wl) = '.';
|
| *(wspace+wl+1) = '\0';
|
| mkallsmall(wspace);
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| - }
|
| + }
|
| mkinitcap(wspace);
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| - }
|
| + }
|
| }
|
| break;
|
| }
|
| - case HUHCAP: {
|
| + case HUHCAP: {
|
| st = pSMgr->suggest_morph_for_spelling_error(cw);
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| memcpy(wspace,cw,(wl+1));
|
| mkallsmall(wspace);
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| - }
|
| + }
|
| break;
|
| }
|
| - case ALLCAP: {
|
| + case ALLCAP: {
|
| memcpy(wspace,cw,(wl+1));
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - strcat(result, st);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| - }
|
| + }
|
| mkallsmall(wspace);
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - mkinitcap(wspace);
|
| - st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| + mkinitcap(wspace);
|
| + st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - if (abbv) {
|
| + if (abbv) {
|
| memcpy(wspace,cw,(wl+1));
|
| *(wspace+wl) = '.';
|
| *(wspace+wl+1) = '\0';
|
| - if (*result) strcat(result, "\n");
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - strcat(result, st);
|
| - free(st);
|
| - }
|
| + mystrcat(result, st, MAXLNLEN);
|
| + free(st);
|
| + }
|
| mkallsmall(wspace);
|
| st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - mkinitcap(wspace);
|
| - st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| + mkinitcap(wspace);
|
| + st = pSMgr->suggest_morph_for_spelling_error(wspace);
|
| if (st) {
|
| - if (*result) strcat(result, "\n");
|
| - strcat(result, st);
|
| + if (*result) mystrcat(result, "\n", MAXLNLEN);
|
| + mystrcat(result, st, MAXLNLEN);
|
| free(st);
|
| }
|
| - }
|
| + }
|
| break;
|
| }
|
| }
|
|
|
| - if (result) return mystrdup(result);
|
| + if (*result) return mystrdup(result);
|
| return NULL;
|
| }
|
|
|
| -/* analyze word
|
| - * return line count
|
| - * XXX need a better data structure for morphological analysis */
|
| -int Hunspell::analyze(char ***out, const char *word) {
|
| - int n = 0;
|
| - if (!word) return 0;
|
| - char * m = morph(word);
|
| - if(!m) return 0;
|
| - if (!out) return line_tok(m, out);
|
| -
|
| - // without memory allocation
|
| - /* BUG missing buffer size checking */
|
| - int i, p;
|
| - for(p = 0, i = 0; m[i]; i++) {
|
| - if(m[i] == '\n' || !m[i+1]) {
|
| - n++;
|
| - strncpy((*out)[n++], m + p, i - p + 1);
|
| - if (m[i] == '\n') (*out)[n++][i - p] = '\0';
|
| - if(!m[i+1]) break;
|
| - p = i + 1;
|
| - }
|
| - }
|
| - free(m);
|
| - return n;
|
| -}
|
| -
|
| #endif // END OF HUNSPELL_EXPERIMENTAL CODE
|
|
|
| Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle)
|
| @@ -1725,6 +1893,17 @@
|
| #endif
|
| }
|
|
|
| +
|
| +Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
|
| + const char * key)
|
| +{ /* C API: build a Hunspell object from (affpath, dpath, key) and return it as an opaque Hunhandle */
|
| +#ifdef HUNSPELL_CHROME_CLIENT
|
| + return NULL; // Chrome client build: this entry point always returns NULL
|
| +#else
|
| + return (Hunhandle*)(new Hunspell(affpath, dpath, key)); // NOTE(review): the non-Chrome constructor shown in this patch takes FILE* handles - confirm a path-based overload exists
|
| +#endif
|
| +}
|
| +
|
| void Hunspell_destroy(Hunhandle *pHunspell)
|
| {
|
| delete (Hunspell*)(pHunspell);
|
| @@ -1745,3 +1924,57 @@
|
| return ((Hunspell*)pHunspell)->suggest(slst, word);
|
| }
|
|
|
| +int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word)
|
| +{ /* C API wrapper around Hunspell::analyze(slst, word) */
|
| + return ((Hunspell*)pHunspell)->analyze(slst, word); // cast the opaque handle back to Hunspell and pass its int result through
|
| +}
|
| +
|
| +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word)
|
| +{ /* C API wrapper around Hunspell::stem(slst, word) */
|
| + return ((Hunspell*)pHunspell)->stem(slst, word); // cast the opaque handle back to Hunspell and pass its int result through
|
| +}
|
| +
|
| +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, char** desc, int n)
|
| +{ /* wrapper around Hunspell::stem(slst, desc, n); NOTE(review): same name as the (slst, word) variant in this file - only valid as a C++ overload, confirm neither is declared with C linkage */
|
| + return ((Hunspell*)pHunspell)->stem(slst, desc, n); // cast the opaque handle back to Hunspell and pass its int result through
|
| +}
|
| +
|
| +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
|
| + const char * word2)
|
| +{ /* C API wrapper around Hunspell::generate(slst, word, word2) */
|
| + return ((Hunspell*)pHunspell)->generate(slst, word, word2); // cast the opaque handle back to Hunspell and pass its int result through
|
| +}
|
| +
|
| +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
|
| + char** desc, int n)
|
| +{ /* wrapper around Hunspell::generate(slst, word, desc, n); NOTE(review): same name as the (slst, word, word2) variant in this file - only valid as a C++ overload, confirm neither is declared with C linkage */
|
| + return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); // cast the opaque handle back to Hunspell and pass its int result through
|
| +}
|
| +
|
| + /* functions for run-time modification of the dictionary */
|
| +
|
| + /* add word to the run-time dictionary; forwards to Hunspell::add(word) and returns its int result */
|
| +
|
| +int Hunspell_add(Hunhandle *pHunspell, const char * word) {
|
| + return ((Hunspell*)pHunspell)->add(word); // cast the opaque handle back to Hunspell before the call
|
| +}
|
| +
|
| + /* add word to the run-time dictionary, using the affix flags of
|
| + * example (which is a dictionary word), so that Hunspell will also
|
| + * recognize affixed forms of the new word; forwards to Hunspell::add_with_affix.
|
| + */
|
| +
|
| +int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word,
|
| + const char * example) {
|
| + return ((Hunspell*)pHunspell)->add_with_affix(word, example); // cast the opaque handle back to Hunspell and pass its int result through
|
| +}
|
| +
|
| + /* remove word from the run-time dictionary; forwards to Hunspell::remove(word) and returns its int result */
|
| +
|
| +int Hunspell_remove(Hunhandle *pHunspell, const char * word) {
|
| + return ((Hunspell*)pHunspell)->remove(word); // cast the opaque handle back to Hunspell before the call
|
| +}
|
| +
|
| +void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n) { /* C API: release a string list by handing (slst, n) to freelist() */
|
| + freelist(slst, n); // pHunspell is not used here; the handle is accepted but never dereferenced
|
| +}
|
|
|
| Property changes on: chrome\third_party\hunspell\src\hunspell\hunspell.cxx
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|