| Index: third_party/hunspell/src/hunspell/affentry.cxx
|
| diff --git a/third_party/hunspell/src/hunspell/affentry.cxx b/third_party/hunspell/src/hunspell/affentry.cxx
|
| index 0ff8b5df827f257f78c14c102bca79780480d5eb..70b468c0a45f46f193e88609639c4c8d133406c4 100644
|
| --- a/third_party/hunspell/src/hunspell/affentry.cxx
|
| +++ b/third_party/hunspell/src/hunspell/affentry.cxx
|
| @@ -1,5 +1,75 @@
|
| -#include "license.hunspell"
|
| -#include "license.myspell"
|
| +/* ***** BEGIN LICENSE BLOCK *****
|
| + * Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
| + *
|
| + * The contents of this file are subject to the Mozilla Public License Version
|
| + * 1.1 (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + * http://www.mozilla.org/MPL/
|
| + *
|
| + * Software distributed under the License is distributed on an "AS IS" basis,
|
| + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
| + * for the specific language governing rights and limitations under the
|
| + * License.
|
| + *
|
| + * The Original Code is Hunspell, based on MySpell.
|
| + *
|
| + * The Initial Developers of the Original Code are
|
| + * Kevin Hendricks (MySpell) and Németh László (Hunspell).
|
| + * Portions created by the Initial Developers are Copyright (C) 2002-2005
|
| + * the Initial Developers. All Rights Reserved.
|
| + *
|
| + * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
|
| + * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
|
| + * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
|
| + * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
|
| + * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
|
| + *
|
| + * Alternatively, the contents of this file may be used under the terms of
|
| + * either the GNU General Public License Version 2 or later (the "GPL"), or
|
| + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
| + * in which case the provisions of the GPL or the LGPL are applicable instead
|
| + * of those above. If you wish to allow use of your version of this file only
|
| + * under the terms of either the GPL or the LGPL, and not to allow others to
|
| + * use your version of this file under the terms of the MPL, indicate your
|
| + * decision by deleting the provisions above and replace them with the notice
|
| + * and other provisions required by the GPL or the LGPL. If you do not delete
|
| + * the provisions above, a recipient may use your version of this file under
|
| + * the terms of any one of the MPL, the GPL or the LGPL.
|
| + *
|
| + * ***** END LICENSE BLOCK ***** */
|
| +/*
|
| + * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
|
| + * And Contributors. All rights reserved.
|
| + *
|
| + * Redistribution and use in source and binary forms, with or without
|
| + * modification, are permitted provided that the following conditions
|
| + * are met:
|
| + *
|
| + * 1. Redistributions of source code must retain the above copyright
|
| + * notice, this list of conditions and the following disclaimer.
|
| + *
|
| + * 2. Redistributions in binary form must reproduce the above copyright
|
| + * notice, this list of conditions and the following disclaimer in the
|
| + * documentation and/or other materials provided with the distribution.
|
| + *
|
| + * 3. All modifications to the source code must be clearly marked as
|
| + * such. Binary redistributions based on modified source code
|
| + * must be clearly marked as modified versions in the documentation
|
| + * and/or other materials provided with the distribution.
|
| + *
|
| + * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
|
| + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
| + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
| + * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
| + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
| + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
| + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
| + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
| + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
| + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
| + * SUCH DAMAGE.
|
| + */
|
|
|
| #include <stdlib.h>
|
| #include <string.h>
|
| @@ -9,836 +79,792 @@
|
| #include "affentry.hxx"
|
| #include "csutil.hxx"
|
|
|
| -PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
|
| -{
|
| - // register affix manager
|
| - pmyMgr = pmgr;
|
| -
|
| - // set up its initial values
|
| -
|
| - aflag = dp->aflag; // flag
|
| - strip = dp->strip; // string to strip
|
| - appnd = dp->appnd; // string to append
|
| - stripl = dp->stripl; // length of strip string
|
| - appndl = dp->appndl; // length of append string
|
| - numconds = dp->numconds; // length of the condition
|
| - opts = dp->opts; // cross product flag
|
| - // then copy over all of the conditions
|
| - if (opts & aeLONGCOND) {
|
| - memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
|
| - c.l.conds2 = dp->c.l.conds2;
|
| - } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
|
| - next = NULL;
|
| - nextne = NULL;
|
| - nexteq = NULL;
|
| - morphcode = dp->morphcode;
|
| - contclass = dp->contclass;
|
| - contclasslen = dp->contclasslen;
|
| +AffEntry::~AffEntry() {
|
| + if (opts & aeLONGCOND)
|
| + free(c.l.conds2);
|
| + if (morphcode && !(opts & aeALIASM))
|
| + free(morphcode);
|
| + if (contclass && !(opts & aeALIASF))
|
| + free(contclass);
|
| }
|
|
|
| -
|
| -PfxEntry::~PfxEntry()
|
| -{
|
| - aflag = 0;
|
| - if (appnd) free(appnd);
|
| - if (strip) free(strip);
|
| - pmyMgr = NULL;
|
| - appnd = NULL;
|
| - strip = NULL;
|
| - if (opts & aeLONGCOND) free(c.l.conds2);
|
| - if (morphcode && !(opts & aeALIASM)) free(morphcode);
|
| - if (contclass && !(opts & aeALIASF)) free(contclass);
|
| +PfxEntry::PfxEntry(AffixMgr* pmgr)
|
| + // register affix manager
|
| + : pmyMgr(pmgr),
|
| + next(NULL),
|
| + nexteq(NULL),
|
| + nextne(NULL),
|
| + flgnxt(NULL) {
|
| }
|
|
|
| // add prefix to this word assuming conditions hold
|
| -char * PfxEntry::add(const char * word, int len)
|
| -{
|
| - char tword[MAXWORDUTF8LEN + 4];
|
| -
|
| - if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
|
| - (len >= numconds) && test_condition(word) &&
|
| - (!stripl || (strncmp(word, strip, stripl) == 0)) &&
|
| - ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
|
| +std::string PfxEntry::add(const char* word, size_t len) {
|
| + std::string result;
|
| + if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
|
| + (len >= numconds) && test_condition(word) &&
|
| + (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) {
|
| /* we have a match so add prefix */
|
| - char * pp = tword;
|
| - if (appndl) {
|
| - strcpy(tword,appnd);
|
| - pp += appndl;
|
| - }
|
| - strcpy(pp, (word + stripl));
|
| - return mystrdup(tword);
|
| - }
|
| - return NULL;
|
| + result.assign(appnd);
|
| + result.append(word + strip.size());
|
| + }
|
| + return result;
|
| }
|
|
|
| -inline char * PfxEntry::nextchar(char * p) {
|
| - if (p) {
|
| - p++;
|
| - if (opts & aeLONGCOND) {
|
| - // jump to the 2nd part of the condition
|
| - if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
|
| - // end of the MAXCONDLEN length condition
|
| - } else if (p == c.conds + MAXCONDLEN) return NULL;
|
| - return *p ? p : NULL;
|
| - }
|
| - return NULL;
|
| +inline char* PfxEntry::nextchar(char* p) {
|
| + if (p) {
|
| + p++;
|
| + if (opts & aeLONGCOND) {
|
| + // jump to the 2nd part of the condition
|
| + if (p == c.conds + MAXCONDLEN_1)
|
| + return c.l.conds2;
|
| + // end of the MAXCONDLEN length condition
|
| + } else if (p == c.conds + MAXCONDLEN)
|
| + return NULL;
|
| + return *p ? p : NULL;
|
| + }
|
| + return NULL;
|
| }
|
|
|
| -inline int PfxEntry::test_condition(const char * st)
|
| -{
|
| - const char * pos = NULL; // group with pos input position
|
| - bool neg = false; // complementer
|
| - bool ingroup = false; // character in the group
|
| - if (numconds == 0) return 1;
|
| - char * p = c.conds;
|
| - while (1) {
|
| - switch (*p) {
|
| - case '\0': return 1;
|
| - case '[': {
|
| - neg = false;
|
| - ingroup = false;
|
| - p = nextchar(p);
|
| - pos = st; break;
|
| - }
|
| - case '^': { p = nextchar(p); neg = true; break; }
|
| - case ']': {
|
| - if ((neg && ingroup) || (!neg && !ingroup)) return 0;
|
| - pos = NULL;
|
| - p = nextchar(p);
|
| - // skip the next character
|
| - if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
|
| - if (*st == '\0' && p) return 0; // word <= condition
|
| +inline int PfxEntry::test_condition(const char* st) {
|
| + const char* pos = NULL; // group with pos input position
|
| + bool neg = false; // complementer
|
| + bool ingroup = false; // character in the group
|
| + if (numconds == 0)
|
| + return 1;
|
| + char* p = c.conds;
|
| + while (1) {
|
| + switch (*p) {
|
| + case '\0':
|
| + return 1;
|
| + case '[': {
|
| + neg = false;
|
| + ingroup = false;
|
| + p = nextchar(p);
|
| + pos = st;
|
| + break;
|
| + }
|
| + case '^': {
|
| + p = nextchar(p);
|
| + neg = true;
|
| + break;
|
| + }
|
| + case ']': {
|
| + if ((neg && ingroup) || (!neg && !ingroup))
|
| + return 0;
|
| + pos = NULL;
|
| + p = nextchar(p);
|
| + // skip the next character
|
| + if (!ingroup && *st)
|
| + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
|
| + ;
|
| + if (*st == '\0' && p)
|
| + return 0; // word <= condition
|
| + break;
|
| + }
|
| + case '.':
|
| + if (!pos) { // dots are not metacharacters in groups: [.]
|
| + p = nextchar(p);
|
| + // skip the next character
|
| + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
|
| + ;
|
| + if (*st == '\0' && p)
|
| + return 0; // word <= condition
|
| + break;
|
| + }
|
| + /* FALLTHROUGH */
|
| + default: {
|
| + if (*st == *p) {
|
| + st++;
|
| + p = nextchar(p);
|
| + if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
|
| + while (p && (*p & 0xc0) == 0x80) { // character
|
| + if (*p != *st) {
|
| + if (!pos)
|
| + return 0;
|
| + st = pos;
|
| break;
|
| + }
|
| + p = nextchar(p);
|
| + st++;
|
| }
|
| - case '.': if (!pos) { // dots are not metacharacters in groups: [.]
|
| - p = nextchar(p);
|
| - // skip the next character
|
| - for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
|
| - if (*st == '\0' && p) return 0; // word <= condition
|
| - break;
|
| + if (pos && st != pos) {
|
| + ingroup = true;
|
| + while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
|
| + }
|
| }
|
| - default: {
|
| - if (*st == *p) {
|
| - st++;
|
| - p = nextchar(p);
|
| - if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
|
| - while (p && (*p & 0xc0) == 0x80) { // character
|
| - if (*p != *st) {
|
| - if (!pos) return 0;
|
| - st = pos;
|
| - break;
|
| - }
|
| - p = nextchar(p);
|
| - st++;
|
| - }
|
| - if (pos && st != pos) {
|
| - ingroup = true;
|
| - while (p && *p != ']' && ((p = nextchar(p)) != NULL));
|
| - }
|
| - } else if (pos) {
|
| - ingroup = true;
|
| - while (p && *p != ']' && ((p = nextchar(p)) != NULL));
|
| - }
|
| - } else if (pos) { // group
|
| - p = nextchar(p);
|
| - } else return 0;
|
| + } else if (pos) {
|
| + ingroup = true;
|
| + while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
|
| }
|
| + }
|
| + } else if (pos) { // group
|
| + p = nextchar(p);
|
| + } else
|
| + return 0;
|
| }
|
| - if (!p) return 1;
|
| }
|
| + if (!p)
|
| + return 1;
|
| + }
|
| }
|
|
|
| // check if this prefix entry matches
|
| -struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
|
| -{
|
| - int tmpl; // length of tmpword
|
| - struct hentry * he; // hash entry of root word or NULL
|
| - char tmpword[MAXWORDUTF8LEN + 4];
|
| -
|
| - // on entry prefix is 0 length or already matches the beginning of the word.
|
| - // So if the remaining root word has positive length
|
| - // and if there are enough chars in root word and added back strip chars
|
| - // to meet the number of characters conditions, then test it
|
| -
|
| - tmpl = len - appndl;
|
| -
|
| - if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
|
| -
|
| - // generate new root word by removing prefix and adding
|
| - // back any characters that would have been stripped
|
| -
|
| - if (stripl) strcpy (tmpword, strip);
|
| - strcpy ((tmpword + stripl), (word + appndl));
|
| -
|
| - // now make sure all of the conditions on characters
|
| - // are met. Please see the appendix at the end of
|
| - // this file for more info on exactly what is being
|
| - // tested
|
| -
|
| - // if all conditions are met then check if resulting
|
| - // root word in the dictionary
|
| -
|
| - if (test_condition(tmpword)) {
|
| - tmpl += stripl;
|
| - if ((he = pmyMgr->lookup(tmpword)) != NULL) {
|
| - do {
|
| - if (TESTAFF(he->astr, aflag, he->alen) &&
|
| - // forbid single prefixes with needaffix flag
|
| - ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
|
| - // needflag
|
| - ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
|
| - (contclass && TESTAFF(contclass, needflag, contclasslen))))
|
| - return he;
|
| - he = he->next_homonym; // check homonyms
|
| - } while (he);
|
| - }
|
| -
|
| - // prefix matched but no root word was found
|
| - // if aeXPRODUCT is allowed, try again but now
|
| - // ross checked combined with a suffix
|
| -
|
| - //if ((opts & aeXPRODUCT) && in_compound) {
|
| - if ((opts & aeXPRODUCT)) {
|
| - he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
|
| - 0, NULL, FLAG_NULL, needflag, in_compound);
|
| - if (he) return he;
|
| - }
|
| - }
|
| - }
|
| - return NULL;
|
| -}
|
| -
|
| -// check if this prefix entry matches
|
| -struct hentry * PfxEntry::check_twosfx(const char * word, int len,
|
| - char in_compound, const FLAG needflag)
|
| -{
|
| - int tmpl; // length of tmpword
|
| - struct hentry * he; // hash entry of root word or NULL
|
| - char tmpword[MAXWORDUTF8LEN + 4];
|
| -
|
| - // on entry prefix is 0 length or already matches the beginning of the word.
|
| - // So if the remaining root word has positive length
|
| - // and if there are enough chars in root word and added back strip chars
|
| - // to meet the number of characters conditions, then test it
|
| -
|
| - tmpl = len - appndl;
|
| -
|
| - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| - (tmpl + stripl >= numconds)) {
|
| -
|
| - // generate new root word by removing prefix and adding
|
| - // back any characters that would have been stripped
|
| -
|
| - if (stripl) strcpy (tmpword, strip);
|
| - strcpy ((tmpword + stripl), (word + appndl));
|
| -
|
| - // now make sure all of the conditions on characters
|
| - // are met. Please see the appendix at the end of
|
| - // this file for more info on exactly what is being
|
| - // tested
|
| -
|
| - // if all conditions are met then check if resulting
|
| - // root word in the dictionary
|
| -
|
| - if (test_condition(tmpword)) {
|
| - tmpl += stripl;
|
| +struct hentry* PfxEntry::checkword(const char* word,
|
| + int len,
|
| + char in_compound,
|
| + const FLAG needflag) {
|
| + struct hentry* he; // hash entry of root word or NULL
|
| +
|
| + // on entry prefix is 0 length or already matches the beginning of the word.
|
| + // So if the remaining root word has positive length
|
| + // and if there are enough chars in root word and added back strip chars
|
| + // to meet the number of characters conditions, then test it
|
| +
|
| + int tmpl = len - appnd.size(); // length of tmpword
|
| +
|
| + if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
|
| + // generate new root word by removing prefix and adding
|
| + // back any characters that would have been stripped
|
| +
|
| + std::string tmpword(strip);
|
| + tmpword.append(word + appnd.size());
|
| +
|
| + // now make sure all of the conditions on characters
|
| + // are met. Please see the appendix at the end of
|
| + // this file for more info on exactly what is being
|
| + // tested
|
| +
|
| + // if all conditions are met then check if resulting
|
| + // root word in the dictionary
|
| +
|
| + if (test_condition(tmpword.c_str())) {
|
| + tmpl += strip.size();
|
| + if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
|
| + do {
|
| + if (TESTAFF(he->astr, aflag, he->alen) &&
|
| + // forbid single prefixes with needaffix flag
|
| + !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
|
| + // needflag
|
| + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
|
| + (contclass && TESTAFF(contclass, needflag, contclasslen))))
|
| + return he;
|
| + he = he->next_homonym; // check homonyms
|
| + } while (he);
|
| + }
|
|
|
| - // prefix matched but no root word was found
|
| - // if aeXPRODUCT is allowed, try again but now
|
| - // cross checked combined with a suffix
|
| + // prefix matched but no root word was found
|
| + // if aeXPRODUCT is allowed, try again but now
|
| + // cross checked combined with a suffix
|
|
|
| - if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
| - he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
|
| - if (he) return he;
|
| - }
|
| - }
|
| - }
|
| - return NULL;
|
| + // if ((opts & aeXPRODUCT) && in_compound) {
|
| + if ((opts & aeXPRODUCT)) {
|
| + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this,
|
| + FLAG_NULL, needflag, in_compound);
|
| + if (he)
|
| + return he;
|
| + }
|
| + }
|
| + }
|
| + return NULL;
|
| }
|
|
|
| // check if this prefix entry matches
|
| -char * PfxEntry::check_twosfx_morph(const char * word, int len,
|
| - char in_compound, const FLAG needflag)
|
| -{
|
| - int tmpl; // length of tmpword
|
| - char tmpword[MAXWORDUTF8LEN + 4];
|
| -
|
| - // on entry prefix is 0 length or already matches the beginning of the word.
|
| - // So if the remaining root word has positive length
|
| - // and if there are enough chars in root word and added back strip chars
|
| - // to meet the number of characters conditions, then test it
|
| -
|
| - tmpl = len - appndl;
|
| -
|
| - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| - (tmpl + stripl >= numconds)) {
|
| -
|
| - // generate new root word by removing prefix and adding
|
| - // back any characters that would have been stripped
|
| -
|
| - if (stripl) strcpy (tmpword, strip);
|
| - strcpy ((tmpword + stripl), (word + appndl));
|
| -
|
| - // now make sure all of the conditions on characters
|
| - // are met. Please see the appendix at the end of
|
| - // this file for more info on exactly what is being
|
| - // tested
|
| -
|
| - // if all conditions are met then check if resulting
|
| - // root word in the dictionary
|
| -
|
| - if (test_condition(tmpword)) {
|
| - tmpl += stripl;
|
| -
|
| - // prefix matched but no root word was found
|
| - // if aeXPRODUCT is allowed, try again but now
|
| - // ross checked combined with a suffix
|
| +struct hentry* PfxEntry::check_twosfx(const char* word,
|
| + int len,
|
| + char in_compound,
|
| + const FLAG needflag) {
|
| + // on entry prefix is 0 length or already matches the beginning of the word.
|
| + // So if the remaining root word has positive length
|
| + // and if there are enough chars in root word and added back strip chars
|
| + // to meet the number of characters conditions, then test it
|
| +
|
| + int tmpl = len - appnd.size(); // length of tmpword
|
| +
|
| + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| + (tmpl + strip.size() >= numconds)) {
|
| + // generate new root word by removing prefix and adding
|
| + // back any characters that would have been stripped
|
| +
|
| + std::string tmpword(strip);
|
| + tmpword.append(word + appnd.size());
|
| +
|
| + // now make sure all of the conditions on characters
|
| + // are met. Please see the appendix at the end of
|
| + // this file for more info on exactly what is being
|
| + // tested
|
| +
|
| + // if all conditions are met then check if resulting
|
| + // root word in the dictionary
|
| +
|
| + if (test_condition(tmpword.c_str())) {
|
| + tmpl += strip.size();
|
| +
|
| + // prefix matched but no root word was found
|
| + // if aeXPRODUCT is allowed, try again but now
|
| + // cross checked combined with a suffix
|
| +
|
| + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
| + // hash entry of root word or NULL
|
| + struct hentry* he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this,
|
| + needflag);
|
| + if (he)
|
| + return he;
|
| + }
|
| + }
|
| + }
|
| + return NULL;
|
| +}
|
|
|
| - if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
| - return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
|
| - aeXPRODUCT, this, needflag);
|
| - }
|
| - }
|
| - }
|
| - return NULL;
|
| +// check if this prefix entry matches
|
| +std::string PfxEntry::check_twosfx_morph(const char* word,
|
| + int len,
|
| + char in_compound,
|
| + const FLAG needflag) {
|
| + std::string result;
|
| + // on entry prefix is 0 length or already matches the beginning of the word.
|
| + // So if the remaining root word has positive length
|
| + // and if there are enough chars in root word and added back strip chars
|
| + // to meet the number of characters conditions, then test it
|
| + int tmpl = len - appnd.size(); // length of tmpword
|
| +
|
| + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| + (tmpl + strip.size() >= numconds)) {
|
| + // generate new root word by removing prefix and adding
|
| + // back any characters that would have been stripped
|
| +
|
| + std::string tmpword(strip);
|
| + tmpword.append(word + appnd.size());
|
| +
|
| + // now make sure all of the conditions on characters
|
| + // are met. Please see the appendix at the end of
|
| + // this file for more info on exactly what is being
|
| + // tested
|
| +
|
| + // if all conditions are met then check if resulting
|
| + // root word in the dictionary
|
| +
|
| + if (test_condition(tmpword.c_str())) {
|
| + tmpl += strip.size();
|
| +
|
| + // prefix matched but no root word was found
|
| + // if aeXPRODUCT is allowed, try again but now
|
| + // cross checked combined with a suffix
|
| +
|
| + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
| + result = pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl,
|
| + aeXPRODUCT,
|
| + this, needflag);
|
| + }
|
| + }
|
| + }
|
| + return result;
|
| }
|
|
|
| // check if this prefix entry matches
|
| -char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
|
| -{
|
| - int tmpl; // length of tmpword
|
| - struct hentry * he; // hash entry of root word or NULL
|
| - char tmpword[MAXWORDUTF8LEN + 4];
|
| - char result[MAXLNLEN];
|
| - char * st;
|
| -
|
| - *result = '\0';
|
| -
|
| - // on entry prefix is 0 length or already matches the beginning of the word.
|
| - // So if the remaining root word has positive length
|
| - // and if there are enough chars in root word and added back strip chars
|
| - // to meet the number of characters conditions, then test it
|
| -
|
| - tmpl = len - appndl;
|
| -
|
| - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| - (tmpl + stripl >= numconds)) {
|
| -
|
| - // generate new root word by removing prefix and adding
|
| - // back any characters that would have been stripped
|
| -
|
| - if (stripl) strcpy (tmpword, strip);
|
| - strcpy ((tmpword + stripl), (word + appndl));
|
| -
|
| - // now make sure all of the conditions on characters
|
| - // are met. Please see the appendix at the end of
|
| - // this file for more info on exactly what is being
|
| - // tested
|
| -
|
| - // if all conditions are met then check if resulting
|
| - // root word in the dictionary
|
| -
|
| - if (test_condition(tmpword)) {
|
| - tmpl += stripl;
|
| - if ((he = pmyMgr->lookup(tmpword)) != NULL) {
|
| - do {
|
| - if (TESTAFF(he->astr, aflag, he->alen) &&
|
| - // forbid single prefixes with needaffix flag
|
| - ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
|
| - // needflag
|
| - ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
|
| - (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
|
| - if (morphcode) {
|
| - mystrcat(result, " ", MAXLNLEN);
|
| - mystrcat(result, morphcode, MAXLNLEN);
|
| - } else mystrcat(result,getKey(), MAXLNLEN);
|
| - if (!HENTRY_FIND(he, MORPH_STEM)) {
|
| - mystrcat(result, " ", MAXLNLEN);
|
| - mystrcat(result, MORPH_STEM, MAXLNLEN);
|
| - mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
|
| - }
|
| - // store the pointer of the hash entry
|
| - if (HENTRY_DATA(he)) {
|
| - mystrcat(result, " ", MAXLNLEN);
|
| - mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
|
| - } else {
|
| - // return with debug information
|
| - char * flag = pmyMgr->encode_flag(getFlag());
|
| - mystrcat(result, " ", MAXLNLEN);
|
| - mystrcat(result, MORPH_FLAG, MAXLNLEN);
|
| - mystrcat(result, flag, MAXLNLEN);
|
| - free(flag);
|
| - }
|
| - mystrcat(result, "\n", MAXLNLEN);
|
| - }
|
| - he = he->next_homonym;
|
| - } while (he);
|
| - }
|
| -
|
| - // prefix matched but no root word was found
|
| - // if aeXPRODUCT is allowed, try again but now
|
| - // ross checked combined with a suffix
|
| -
|
| - if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
| - st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
|
| - FLAG_NULL, needflag);
|
| - if (st) {
|
| - mystrcat(result, st, MAXLNLEN);
|
| - free(st);
|
| - }
|
| - }
|
| +std::string PfxEntry::check_morph(const char* word,
|
| + int len,
|
| + char in_compound,
|
| + const FLAG needflag) {
|
| + std::string result;
|
| +
|
| + // on entry prefix is 0 length or already matches the beginning of the word.
|
| + // So if the remaining root word has positive length
|
| + // and if there are enough chars in root word and added back strip chars
|
| + // to meet the number of characters conditions, then test it
|
| +
|
| + int tmpl = len - appnd.size(); // length of tmpword
|
| +
|
| + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| + (tmpl + strip.size() >= numconds)) {
|
| + // generate new root word by removing prefix and adding
|
| + // back any characters that would have been stripped
|
| +
|
| + std::string tmpword(strip);
|
| + tmpword.append(word + appnd.size());
|
| +
|
| + // now make sure all of the conditions on characters
|
| + // are met. Please see the appendix at the end of
|
| + // this file for more info on exactly what is being
|
| + // tested
|
| +
|
| + // if all conditions are met then check if resulting
|
| + // root word in the dictionary
|
| +
|
| + if (test_condition(tmpword.c_str())) {
|
| + tmpl += strip.size();
|
| + struct hentry* he; // hash entry of root word or NULL
|
| + if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
|
| + do {
|
| + if (TESTAFF(he->astr, aflag, he->alen) &&
|
| + // forbid single prefixes with needaffix flag
|
| + !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
|
| + // needflag
|
| + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
|
| + (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
|
| + if (morphcode) {
|
| + result.append(" ");
|
| + result.append(morphcode);
|
| + } else
|
| + result.append(getKey());
|
| + if (!HENTRY_FIND(he, MORPH_STEM)) {
|
| + result.append(" ");
|
| + result.append(MORPH_STEM);
|
| + result.append(HENTRY_WORD(he));
|
| }
|
| - }
|
| -
|
| - if (*result) return mystrdup(result);
|
| - return NULL;
|
| -}
|
| + // store the pointer of the hash entry
|
| + if (HENTRY_DATA(he)) {
|
| + result.append(" ");
|
| + result.append(HENTRY_DATA2(he));
|
| + } else {
|
| + // return with debug information
|
| + char* flag = pmyMgr->encode_flag(getFlag());
|
| + result.append(" ");
|
| + result.append(MORPH_FLAG);
|
| + result.append(flag);
|
| + free(flag);
|
| + }
|
| + result.append("\n");
|
| + }
|
| + he = he->next_homonym;
|
| + } while (he);
|
| + }
|
|
|
| -SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
|
| -{
|
| - // register affix manager
|
| - pmyMgr = pmgr;
|
| -
|
| - // set up its initial values
|
| - aflag = dp->aflag; // char flag
|
| - strip = dp->strip; // string to strip
|
| - appnd = dp->appnd; // string to append
|
| - stripl = dp->stripl; // length of strip string
|
| - appndl = dp->appndl; // length of append string
|
| - numconds = dp->numconds; // length of the condition
|
| - opts = dp->opts; // cross product flag
|
| -
|
| - // then copy over all of the conditions
|
| - if (opts & aeLONGCOND) {
|
| - memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
|
| - c.l.conds2 = dp->c.l.conds2;
|
| - } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
|
| - next = NULL;
|
| - nextne = NULL;
|
| - nexteq = NULL;
|
| - rappnd = myrevstrdup(appnd);
|
| - morphcode = dp->morphcode;
|
| - contclass = dp->contclass;
|
| - contclasslen = dp->contclasslen;
|
| -}
|
| + // prefix matched but no root word was found
|
| + // if aeXPRODUCT is allowed, try again but now
|
| + // cross checked combined with a suffix
|
|
|
| + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
| + std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this,
|
| + FLAG_NULL, needflag);
|
| + if (!st.empty()) {
|
| + result.append(st);
|
| + }
|
| + }
|
| + }
|
| + }
|
|
|
| -SfxEntry::~SfxEntry()
|
| -{
|
| - aflag = 0;
|
| - if (appnd) free(appnd);
|
| - if (rappnd) free(rappnd);
|
| - if (strip) free(strip);
|
| - pmyMgr = NULL;
|
| - appnd = NULL;
|
| - strip = NULL;
|
| - if (opts & aeLONGCOND) free(c.l.conds2);
|
| - if (morphcode && !(opts & aeALIASM)) free(morphcode);
|
| - if (contclass && !(opts & aeALIASF)) free(contclass);
|
| + return result;
|
| +}
|
| +
|
| +SfxEntry::SfxEntry(AffixMgr* pmgr)
|
| + : pmyMgr(pmgr) // register affix manager
|
| + ,
|
| + next(NULL),
|
| + nexteq(NULL),
|
| + nextne(NULL),
|
| + flgnxt(NULL),
|
| + l_morph(NULL),
|
| + r_morph(NULL),
|
| + eq_morph(NULL) {
|
| }
|
|
|
| // add suffix to this word assuming conditions hold
|
| -char * SfxEntry::add(const char * word, int len)
|
| -{
|
| - char tword[MAXWORDUTF8LEN + 4];
|
| -
|
| - /* make sure all conditions match */
|
| - if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
|
| - (len >= numconds) && test_condition(word + len, word) &&
|
| - (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
|
| - ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
|
| - /* we have a match so add suffix */
|
| - strcpy(tword,word);
|
| - if (appndl) {
|
| - strcpy(tword + len - stripl, appnd);
|
| - } else {
|
| - *(tword + len - stripl) = '\0';
|
| - }
|
| - return mystrdup(tword);
|
| - }
|
| - return NULL;
|
| +std::string SfxEntry::add(const char* word, size_t len) {
|
| + std::string result;
|
| + /* make sure all conditions match */
|
| + if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
|
| + (len >= numconds) && test_condition(word + len, word) &&
|
| + (!strip.size() ||
|
| + (strcmp(word + len - strip.size(), strip.c_str()) == 0))) {
|
| + result.assign(word);
|
| + /* we have a match so add suffix */
|
| + result.replace(len - strip.size(), std::string::npos, appnd);
|
| + }
|
| + return result;
|
| }
|
|
|
| -inline char * SfxEntry::nextchar(char * p) {
|
| - if (p) {
|
| - p++;
|
| - if (opts & aeLONGCOND) {
|
| - // jump to the 2nd part of the condition
|
| - if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
|
| - // end of the MAXCONDLEN length condition
|
| - } else if (p == c.conds + MAXCONDLEN) return NULL;
|
| - return *p ? p : NULL;
|
| - }
|
| - return NULL;
|
| +inline char* SfxEntry::nextchar(char* p) {
|
| + if (p) {
|
| + p++;
|
| + if (opts & aeLONGCOND) {
|
| + // jump to the 2nd part of the condition
|
| + if (p == c.l.conds1 + MAXCONDLEN_1)
|
| + return c.l.conds2;
|
| + // end of the MAXCONDLEN length condition
|
| + } else if (p == c.conds + MAXCONDLEN)
|
| + return NULL;
|
| + return *p ? p : NULL;
|
| + }
|
| + return NULL;
|
| }
|
|
|
| -inline int SfxEntry::test_condition(const char * st, const char * beg)
|
| -{
|
| - const char * pos = NULL; // group with pos input position
|
| - bool neg = false; // complementer
|
| - bool ingroup = false; // character in the group
|
| - if (numconds == 0) return 1;
|
| - char * p = c.conds;
|
| - st--;
|
| - int i = 1;
|
| - while (1) {
|
| - switch (*p) {
|
| - case '\0': return 1;
|
| - case '[': { p = nextchar(p); pos = st; break; }
|
| - case '^': { p = nextchar(p); neg = true; break; }
|
| - case ']': { if (!neg && !ingroup) return 0;
|
| - i++;
|
| - // skip the next character
|
| - if (!ingroup) {
|
| - for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
|
| - st--;
|
| - }
|
| - pos = NULL;
|
| - neg = false;
|
| - ingroup = false;
|
| - p = nextchar(p);
|
| - if (st < beg && p) return 0; // word <= condition
|
| - break;
|
| +inline int SfxEntry::test_condition(const char* st, const char* beg) {
|
| + const char* pos = NULL; // group with pos input position
|
| + bool neg = false; // complementer
|
| + bool ingroup = false; // character in the group
|
| + if (numconds == 0)
|
| + return 1;
|
| + char* p = c.conds;
|
| + st--;
|
| + int i = 1;
|
| + while (1) {
|
| + switch (*p) {
|
| + case '\0':
|
| + return 1;
|
| + case '[':
|
| + p = nextchar(p);
|
| + pos = st;
|
| + break;
|
| + case '^':
|
| + p = nextchar(p);
|
| + neg = true;
|
| + break;
|
| + case ']':
|
| + if (!neg && !ingroup)
|
| + return 0;
|
| + i++;
|
| + // skip the next character
|
| + if (!ingroup) {
|
| + for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--)
|
| + ;
|
| + st--;
|
| + }
|
| + pos = NULL;
|
| + neg = false;
|
| + ingroup = false;
|
| + p = nextchar(p);
|
| + if (st < beg && p)
|
| + return 0; // word <= condition
|
| + break;
|
| + case '.':
|
| + if (!pos) {
|
| + // dots are not metacharacters in groups: [.]
|
| + p = nextchar(p);
|
| + // skip the next character
|
| + for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80;
|
| + st--)
|
| + ;
|
| + if (st < beg) { // word <= condition
|
| + if (p)
|
| + return 0;
|
| + else
|
| + return 1;
|
| + }
|
| + if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
|
| + st--;
|
| + if (st < beg) { // word <= condition
|
| + if (p)
|
| + return 0;
|
| + else
|
| + return 1;
|
| }
|
| - case '.': if (!pos) { // dots are not metacharacters in groups: [.]
|
| - p = nextchar(p);
|
| - // skip the next character
|
| - for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
|
| - if (st < beg) { // word <= condition
|
| - if (p) return 0; else return 1;
|
| - }
|
| - if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
|
| - st--;
|
| - if (st < beg) { // word <= condition
|
| - if (p) return 0; else return 1;
|
| - }
|
| - }
|
| + }
|
| + break;
|
| + }
|
| + /* FALLTHROUGH */
|
| + default: {
|
| + if (*st == *p) {
|
| + p = nextchar(p);
|
| + if ((opts & aeUTF8) && (*st & 0x80)) {
|
| + st--;
|
| + while (p && (st >= beg)) {
|
| + if (*p != *st) {
|
| + if (!pos)
|
| + return 0;
|
| + st = pos;
|
| + break;
|
| + }
|
| + // first byte of the UTF-8 multibyte character
|
| + if ((*p & 0xc0) != 0x80)
|
| break;
|
| + p = nextchar(p);
|
| + st--;
|
| }
|
| - default: {
|
| - if (*st == *p) {
|
| - p = nextchar(p);
|
| - if ((opts & aeUTF8) && (*st & 0x80)) {
|
| - st--;
|
| - while (p && (st >= beg)) {
|
| - if (*p != *st) {
|
| - if (!pos) return 0;
|
| - st = pos;
|
| - break;
|
| - }
|
| - // first byte of the UTF-8 multibyte character
|
| - if ((*p & 0xc0) != 0x80) break;
|
| - p = nextchar(p);
|
| - st--;
|
| - }
|
| - if (pos && st != pos) {
|
| - if (neg) return 0;
|
| - else if (i == numconds) return 1;
|
| - ingroup = true;
|
| - while (p && *p != ']' && ((p = nextchar(p)) != NULL));
|
| - st--;
|
| - }
|
| - if (p && *p != ']') p = nextchar(p);
|
| - } else if (pos) {
|
| - if (neg) return 0;
|
| - else if (i == numconds) return 1;
|
| - ingroup = true;
|
| - while (p && *p != ']' && ((p = nextchar(p)) != NULL));
|
| -// if (p && *p != ']') p = nextchar(p);
|
| - st--;
|
| - }
|
| - if (!pos) {
|
| - i++;
|
| - st--;
|
| - }
|
| - if (st < beg && p && *p != ']') return 0; // word <= condition
|
| - } else if (pos) { // group
|
| - p = nextchar(p);
|
| - } else return 0;
|
| + if (pos && st != pos) {
|
| + if (neg)
|
| + return 0;
|
| + else if (i == numconds)
|
| + return 1;
|
| + ingroup = true;
|
| + while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
|
| + }
|
| + st--;
|
| + }
|
| + if (p && *p != ']')
|
| + p = nextchar(p);
|
| + } else if (pos) {
|
| + if (neg)
|
| + return 0;
|
| + else if (i == numconds)
|
| + return 1;
|
| + ingroup = true;
|
| + while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
|
| }
|
| + // if (p && *p != ']') p = nextchar(p);
|
| + st--;
|
| + }
|
| + if (!pos) {
|
| + i++;
|
| + st--;
|
| + }
|
| + if (st < beg && p && *p != ']')
|
| + return 0; // word <= condition
|
| + } else if (pos) { // group
|
| + p = nextchar(p);
|
| + } else
|
| + return 0;
|
| }
|
| - if (!p) return 1;
|
| }
|
| + if (!p)
|
| + return 1;
|
| + }
|
| }
|
|
|
| // see if this suffix is present in the word
|
| -struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
|
| - PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
|
| - const FLAG badflag)
|
| -{
|
| - int tmpl; // length of tmpword
|
| - struct hentry * he; // hash entry pointer
|
| - unsigned char * cp;
|
| - char tmpword[MAXWORDUTF8LEN + 4];
|
| - PfxEntry* ep = ppfx;
|
| -
|
| - // if this suffix is being cross checked with a prefix
|
| - // but it does not support cross products skip it
|
| -
|
| - if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
|
| - return NULL;
|
| -
|
| - // upon entry suffix is 0 length or already matches the end of the word.
|
| - // So if the remaining root word has positive length
|
| - // and if there are enough chars in root word and added back strip chars
|
| - // to meet the number of characters conditions, then test it
|
| +struct hentry* SfxEntry::checkword(const char* word,
|
| + int len,
|
| + int optflags,
|
| + PfxEntry* ppfx,
|
| + const FLAG cclass,
|
| + const FLAG needflag,
|
| + const FLAG badflag) {
|
| + struct hentry* he; // hash entry pointer
|
| + PfxEntry* ep = ppfx;
|
| +
|
| + // if this suffix is being cross checked with a prefix
|
| + // but it does not support cross products skip it
|
| +
|
| + if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
|
| + return NULL;
|
|
|
| - tmpl = len - appndl;
|
| - // the second condition is not enough for UTF-8 strings
|
| - // it checked in test_condition()
|
| + // upon entry suffix is 0 length or already matches the end of the word.
|
| + // So if the remaining root word has positive length
|
| + // and if there are enough chars in root word and added back strip chars
|
| + // to meet the number of characters conditions, then test it
|
|
|
| - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| - (tmpl + stripl >= numconds)) {
|
| + int tmpl = len - appnd.size(); // length of tmpword
|
| + // the second condition is not enough for UTF-8 strings
|
| +  // it is checked in test_condition()
|
|
|
| - // generate new root word by removing suffix and adding
|
| - // back any characters that would have been stripped or
|
| - // or null terminating the shorter string
|
| + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| + (tmpl + strip.size() >= numconds)) {
|
| + // generate new root word by removing suffix and adding
|
| + // back any characters that would have been stripped or
|
| +    // null terminating the shorter string
|
|
|
| - strcpy (tmpword, word);
|
| - cp = (unsigned char *)(tmpword + tmpl);
|
| - if (stripl) {
|
| - strcpy ((char *)cp, strip);
|
| - tmpl += stripl;
|
| - cp = (unsigned char *)(tmpword + tmpl);
|
| - } else *cp = '\0';
|
| + std::string tmpstring(word, tmpl);
|
| + if (strip.size()) {
|
| + tmpstring.append(strip);
|
| + }
|
|
|
| - // now make sure all of the conditions on characters
|
| - // are met. Please see the appendix at the end of
|
| - // this file for more info on exactly what is being
|
| - // tested
|
| + const char* tmpword = tmpstring.c_str();
|
| + const char* endword = tmpword + tmpstring.size();
|
|
|
| - // if all conditions are met then check if resulting
|
| - // root word in the dictionary
|
| + // now make sure all of the conditions on characters
|
| + // are met. Please see the appendix at the end of
|
| + // this file for more info on exactly what is being
|
| + // tested
|
|
|
| - if (test_condition((char *) cp, (char *) tmpword)) {
|
| + // if all conditions are met then check if resulting
|
| + // root word in the dictionary
|
|
|
| + if (test_condition(endword, tmpword)) {
|
| #ifdef SZOSZABLYA_POSSIBLE_ROOTS
|
| - fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
|
| + fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
|
| #endif
|
| - if ((he = pmyMgr->lookup(tmpword)) != NULL) {
|
| - do {
|
| - // check conditional suffix (enabled by prefix)
|
| - if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
|
| - TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
|
| - (((optflags & aeXPRODUCT) == 0) ||
|
| - (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
|
| - // enabled by prefix
|
| - ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
|
| - ) &&
|
| - // handle cont. class
|
| - ((!cclass) ||
|
| - ((contclass) && TESTAFF(contclass, cclass, contclasslen))
|
| - ) &&
|
| - // check only in compound homonyms (bad flags)
|
| - (!badflag || !TESTAFF(he->astr, badflag, he->alen)
|
| - ) &&
|
| - // handle required flag
|
| - ((!needflag) ||
|
| - (TESTAFF(he->astr, needflag, he->alen) ||
|
| - ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
|
| - )
|
| - ) return he;
|
| - he = he->next_homonym; // check homonyms
|
| - } while (he);
|
| -
|
| - // obsolote stemming code (used only by the
|
| - // experimental SuffixMgr:suggest_pos_stems)
|
| - // store resulting root in wlst
|
| - } else if (wlst && (*ns < maxSug)) {
|
| - int cwrd = 1;
|
| - for (int k=0; k < *ns; k++)
|
| - if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
|
| - if (cwrd) {
|
| - wlst[*ns] = mystrdup(tmpword);
|
| - if (wlst[*ns] == NULL) {
|
| - for (int j=0; j<*ns; j++) free(wlst[j]);
|
| - *ns = -1;
|
| - return NULL;
|
| - }
|
| - (*ns)++;
|
| - }
|
| - }
|
| - }
|
| + if ((he = pmyMgr->lookup(tmpword)) != NULL) {
|
| + do {
|
| + // check conditional suffix (enabled by prefix)
|
| + if ((TESTAFF(he->astr, aflag, he->alen) ||
|
| + (ep && ep->getCont() &&
|
| + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
|
| + (((optflags & aeXPRODUCT) == 0) ||
|
| + (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
|
| + // enabled by prefix
|
| + ((contclass) &&
|
| + (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) &&
|
| + // handle cont. class
|
| + ((!cclass) ||
|
| + ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
|
| + // check only in compound homonyms (bad flags)
|
| + (!badflag || !TESTAFF(he->astr, badflag, he->alen)) &&
|
| + // handle required flag
|
| + ((!needflag) ||
|
| + (TESTAFF(he->astr, needflag, he->alen) ||
|
| + ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
|
| + return he;
|
| + he = he->next_homonym; // check homonyms
|
| + } while (he);
|
| + }
|
| }
|
| - return NULL;
|
| + }
|
| + return NULL;
|
| }
|
|
|
| // see if two-level suffix is present in the word
|
| -struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
|
| - PfxEntry* ppfx, const FLAG needflag)
|
| -{
|
| - int tmpl; // length of tmpword
|
| - struct hentry * he; // hash entry pointer
|
| - unsigned char * cp;
|
| - char tmpword[MAXWORDUTF8LEN + 4];
|
| - PfxEntry* ep = ppfx;
|
| -
|
| -
|
| - // if this suffix is being cross checked with a prefix
|
| - // but it does not support cross products skip it
|
| -
|
| - if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
|
| - return NULL;
|
| -
|
| - // upon entry suffix is 0 length or already matches the end of the word.
|
| - // So if the remaining root word has positive length
|
| - // and if there are enough chars in root word and added back strip chars
|
| - // to meet the number of characters conditions, then test it
|
| -
|
| - tmpl = len - appndl;
|
| -
|
| - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| - (tmpl + stripl >= numconds)) {
|
| -
|
| - // generate new root word by removing suffix and adding
|
| - // back any characters that would have been stripped or
|
| - // or null terminating the shorter string
|
| -
|
| - strcpy (tmpword, word);
|
| - cp = (unsigned char *)(tmpword + tmpl);
|
| - if (stripl) {
|
| - strcpy ((char *)cp, strip);
|
| - tmpl += stripl;
|
| - cp = (unsigned char *)(tmpword + tmpl);
|
| - } else *cp = '\0';
|
| -
|
| - // now make sure all of the conditions on characters
|
| - // are met. Please see the appendix at the end of
|
| - // this file for more info on exactly what is being
|
| - // tested
|
| -
|
| - // if all conditions are met then recall suffix_check
|
| -
|
| - if (test_condition((char *) cp, (char *) tmpword)) {
|
| - if (ppfx) {
|
| - // handle conditional suffix
|
| - if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
|
| - he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
|
| - else
|
| - he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
|
| - } else {
|
| - he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
|
| - }
|
| - if (he) return he;
|
| - }
|
| - }
|
| +struct hentry* SfxEntry::check_twosfx(const char* word,
|
| + int len,
|
| + int optflags,
|
| + PfxEntry* ppfx,
|
| + const FLAG needflag) {
|
| + PfxEntry* ep = ppfx;
|
| +
|
| + // if this suffix is being cross checked with a prefix
|
| + // but it does not support cross products skip it
|
| +
|
| + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
|
| return NULL;
|
| +
|
| + // upon entry suffix is 0 length or already matches the end of the word.
|
| + // So if the remaining root word has positive length
|
| + // and if there are enough chars in root word and added back strip chars
|
| + // to meet the number of characters conditions, then test it
|
| +
|
| + int tmpl = len - appnd.size(); // length of tmpword
|
| +
|
| + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| + (tmpl + strip.size() >= numconds)) {
|
| + // generate new root word by removing suffix and adding
|
| + // back any characters that would have been stripped or
|
| +    // null terminating the shorter string
|
| +
|
| + std::string tmpword(word);
|
| + tmpword.resize(tmpl);
|
| + tmpword.append(strip);
|
| + tmpl += strip.size();
|
| +
|
| + const char* beg = tmpword.c_str();
|
| + const char* end = beg + tmpl;
|
| +
|
| + // now make sure all of the conditions on characters
|
| + // are met. Please see the appendix at the end of
|
| + // this file for more info on exactly what is being
|
| + // tested
|
| +
|
| + // if all conditions are met then recall suffix_check
|
| +
|
| + if (test_condition(end, beg)) {
|
| + struct hentry* he; // hash entry pointer
|
| + if (ppfx) {
|
| + // handle conditional suffix
|
| + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
|
| + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
|
| + (FLAG)aflag, needflag, IN_CPD_NOT);
|
| + else
|
| + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx,
|
| + (FLAG)aflag, needflag, IN_CPD_NOT);
|
| + } else {
|
| + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
|
| + (FLAG)aflag, needflag, IN_CPD_NOT);
|
| + }
|
| + if (he)
|
| + return he;
|
| + }
|
| + }
|
| + return NULL;
|
| }
|
|
|
| // see if two-level suffix is present in the word
|
| -char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
|
| - PfxEntry* ppfx, const FLAG needflag)
|
| -{
|
| - int tmpl; // length of tmpword
|
| - unsigned char * cp;
|
| - char tmpword[MAXWORDUTF8LEN + 4];
|
| - PfxEntry* ep = ppfx;
|
| - char * st;
|
| -
|
| - char result[MAXLNLEN];
|
| -
|
| - *result = '\0';
|
| -
|
| - // if this suffix is being cross checked with a prefix
|
| - // but it does not support cross products skip it
|
| -
|
| - if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
|
| - return NULL;
|
| -
|
| - // upon entry suffix is 0 length or already matches the end of the word.
|
| - // So if the remaining root word has positive length
|
| - // and if there are enough chars in root word and added back strip chars
|
| - // to meet the number of characters conditions, then test it
|
| -
|
| - tmpl = len - appndl;
|
| -
|
| - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| - (tmpl + stripl >= numconds)) {
|
| -
|
| - // generate new root word by removing suffix and adding
|
| - // back any characters that would have been stripped or
|
| - // or null terminating the shorter string
|
| -
|
| - strcpy (tmpword, word);
|
| - cp = (unsigned char *)(tmpword + tmpl);
|
| - if (stripl) {
|
| - strcpy ((char *)cp, strip);
|
| - tmpl += stripl;
|
| - cp = (unsigned char *)(tmpword + tmpl);
|
| - } else *cp = '\0';
|
| -
|
| - // now make sure all of the conditions on characters
|
| - // are met. Please see the appendix at the end of
|
| - // this file for more info on exactly what is being
|
| - // tested
|
| -
|
| - // if all conditions are met then recall suffix_check
|
| -
|
| - if (test_condition((char *) cp, (char *) tmpword)) {
|
| - if (ppfx) {
|
| - // handle conditional suffix
|
| - if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
|
| - st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
|
| - if (st) {
|
| - if (ppfx->getMorph()) {
|
| - mystrcat(result, ppfx->getMorph(), MAXLNLEN);
|
| - mystrcat(result, " ", MAXLNLEN);
|
| - }
|
| - mystrcat(result,st, MAXLNLEN);
|
| - free(st);
|
| - mychomp(result);
|
| - }
|
| - } else {
|
| - st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
|
| - if (st) {
|
| - mystrcat(result, st, MAXLNLEN);
|
| - free(st);
|
| - mychomp(result);
|
| - }
|
| - }
|
| - } else {
|
| - st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
|
| - if (st) {
|
| - mystrcat(result, st, MAXLNLEN);
|
| - free(st);
|
| - mychomp(result);
|
| - }
|
| - }
|
| - if (*result) return mystrdup(result);
|
| +std::string SfxEntry::check_twosfx_morph(const char* word,
|
| + int len,
|
| + int optflags,
|
| + PfxEntry* ppfx,
|
| + const FLAG needflag) {
|
| + PfxEntry* ep = ppfx;
|
| +
|
| + std::string result;
|
| +
|
| + // if this suffix is being cross checked with a prefix
|
| + // but it does not support cross products skip it
|
| +
|
| + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
|
| + return result;
|
| +
|
| + // upon entry suffix is 0 length or already matches the end of the word.
|
| + // So if the remaining root word has positive length
|
| + // and if there are enough chars in root word and added back strip chars
|
| + // to meet the number of characters conditions, then test it
|
| +
|
| + int tmpl = len - appnd.size(); // length of tmpword
|
| +
|
| + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
|
| + (tmpl + strip.size() >= numconds)) {
|
| + // generate new root word by removing suffix and adding
|
| + // back any characters that would have been stripped or
|
| +    // null terminating the shorter string
|
| +
|
| + std::string tmpword(word);
|
| + tmpword.resize(tmpl);
|
| + tmpword.append(strip);
|
| + tmpl += strip.size();
|
| +
|
| + const char* beg = tmpword.c_str();
|
| + const char* end = beg + tmpl;
|
| +
|
| + // now make sure all of the conditions on characters
|
| + // are met. Please see the appendix at the end of
|
| + // this file for more info on exactly what is being
|
| + // tested
|
| +
|
| +    // if all conditions are met then call suffix_check_morph
|
| +
|
| + if (test_condition(end, beg)) {
|
| + if (ppfx) {
|
| + // handle conditional suffix
|
| + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
|
| + std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
|
| + needflag);
|
| + if (!st.empty()) {
|
| + if (ppfx->getMorph()) {
|
| + result.append(ppfx->getMorph());
|
| + result.append(" ");
|
| }
|
| + result.append(st);
|
| + mychomp(result);
|
| + }
|
| + } else {
|
| + std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
|
| + needflag);
|
| + if (!st.empty()) {
|
| + result.append(st);
|
| + mychomp(result);
|
| + }
|
| + }
|
| + } else {
|
| + std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
|
| + if (!st.empty()) {
|
| + result.append(st);
|
| + mychomp(result);
|
| + }
|
| + }
|
| }
|
| - return NULL;
|
| + }
|
| + return result;
|
| }
|
|
|
| // get next homonym with same affix
|
| -struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
|
| - const FLAG cclass, const FLAG needflag)
|
| -{
|
| - PfxEntry* ep = ppfx;
|
| - FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
|
| -
|
| - while (he->next_homonym) {
|
| - he = he->next_homonym;
|
| - if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
|
| - ((optflags & aeXPRODUCT) == 0 ||
|
| - TESTAFF(he->astr, eFlag, he->alen) ||
|
| - // handle conditional suffix
|
| - ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
|
| - ) &&
|
| - // handle cont. class
|
| - ((!cclass) ||
|
| - ((contclass) && TESTAFF(contclass, cclass, contclasslen))
|
| - ) &&
|
| - // handle required flag
|
| - ((!needflag) ||
|
| - (TESTAFF(he->astr, needflag, he->alen) ||
|
| - ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
|
| - )
|
| - ) return he;
|
| - }
|
| - return NULL;
|
| +struct hentry* SfxEntry::get_next_homonym(struct hentry* he,
|
| + int optflags,
|
| + PfxEntry* ppfx,
|
| + const FLAG cclass,
|
| + const FLAG needflag) {
|
| + PfxEntry* ep = ppfx;
|
| + FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
|
| +
|
| + while (he->next_homonym) {
|
| + he = he->next_homonym;
|
| + if ((TESTAFF(he->astr, aflag, he->alen) ||
|
| + (ep && ep->getCont() &&
|
| + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
|
| + ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) ||
|
| + // handle conditional suffix
|
| + ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) &&
|
| + // handle cont. class
|
| + ((!cclass) ||
|
| + ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
|
| + // handle required flag
|
| + ((!needflag) ||
|
| + (TESTAFF(he->astr, needflag, he->alen) ||
|
| + ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
|
| + return he;
|
| + }
|
| + return NULL;
|
| }
|
|
|
| +void SfxEntry::initReverseWord() {
|
| + rappnd = appnd;
|
| + reverseword(rappnd);
|
| +}
|
|
|
| #if 0
|
|
|
| @@ -858,10 +884,8 @@ The structure affentry is defined as follows:
|
| struct affentry
|
| {
|
| unsigned short aflag; // ID used to represent the affix
|
| - char * strip; // string to strip before adding affix
|
| - char * appnd; // the affix string to add
|
| - unsigned char stripl; // length of the strip string
|
| - unsigned char appndl; // length of the affix string
|
| + std::string strip; // string to strip before adding affix
|
| + std::string appnd; // the affix string to add
|
| char numconds; // the number of conditions that must be met
|
| char opts; // flag: aeXPRODUCT- combine both prefix and suffix
|
| char conds[SETSIZE]; // array which encodes the conditions to be met
|
| @@ -959,6 +983,4 @@ first two affentries for the suffix D described earlier.
|
| conds['y'] = (1 << 1) (the last char must be a y)
|
| all other bits for all other entries in the conds array are zero
|
|
|
| -
|
| #endif
|
| -
|
|
|